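Xen 3.0.4 support patch for Linux 2.6.16.x as shipped with IPFire 2.x (src/patches/xen-3.0.4-2.6.16.x.patch). Summarizing the hunks that follow: it updates the netdevice locking documentation for the netif_tx_lock rename, adds the X86_XEN subarchitecture and related Kconfig options (SMP alternatives, SWIOTLB, the Xen PCI frontend), hooks the mach-xen subarch and the boot-xen vmlinuz/install rules into the i386 build, and introduces Xen-specific variants of the i386 ACPI boot, local APIC and CPU identification code (boot-xen.c, apic-xen.c, common-xen.c).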
1diff -Nur linux-2.6.16.33-noxen/Documentation/networking/netdevices.txt linux-2.6.16.33/Documentation/networking/netdevices.txt
2--- linux-2.6.16.33-noxen/Documentation/networking/netdevices.txt 2006-11-22 18:06:31.000000000 +0000
3+++ linux-2.6.16.33/Documentation/networking/netdevices.txt 2007-05-23 21:00:01.000000000 +0000
4@@ -42,9 +42,9 @@
5 Context: nominally process, but don't sleep inside an rwlock
6
7 dev->hard_start_xmit:
8- Synchronization: dev->xmit_lock spinlock.
9+ Synchronization: netif_tx_lock spinlock.
10 When the driver sets NETIF_F_LLTX in dev->features this will be
11- called without holding xmit_lock. In this case the driver
12+ called without holding netif_tx_lock. In this case the driver
13 has to lock by itself when needed. It is recommended to use a try lock
14 for this and return -1 when the spin lock fails.
15 The locking there should also properly protect against
16@@ -62,12 +62,12 @@
17 Only valid when NETIF_F_LLTX is set.
18
19 dev->tx_timeout:
20- Synchronization: dev->xmit_lock spinlock.
21+ Synchronization: netif_tx_lock spinlock.
22 Context: BHs disabled
23 Notes: netif_queue_stopped() is guaranteed true
24
25 dev->set_multicast_list:
26- Synchronization: dev->xmit_lock spinlock.
27+ Synchronization: netif_tx_lock spinlock.
28 Context: BHs disabled
29
30 dev->poll:
31diff -Nur linux-2.6.16.33-noxen/arch/i386/Kconfig linux-2.6.16.33/arch/i386/Kconfig
32--- linux-2.6.16.33-noxen/arch/i386/Kconfig 2006-11-22 18:06:31.000000000 +0000
33+++ linux-2.6.16.33/arch/i386/Kconfig 2007-01-08 15:00:45.000000000 +0000
34@@ -58,6 +58,15 @@
35 help
36 Choose this option if your computer is a standard PC or compatible.
37
38+config X86_XEN
39+ bool "Xen-compatible"
40+ select X86_UP_APIC if !SMP && XEN_PRIVILEGED_GUEST
41+ select X86_UP_IOAPIC if !SMP && XEN_PRIVILEGED_GUEST
42+ select SWIOTLB
43+ help
44+ Choose this option if you plan to run this kernel on top of the
45+ Xen Hypervisor.
46+
47 config X86_ELAN
48 bool "AMD Elan"
49 help
50@@ -159,6 +168,7 @@
51
52 config HPET_TIMER
53 bool "HPET Timer Support"
54+ depends on !X86_XEN
55 help
56 This enables the use of the HPET for the kernel's internal timer.
57 HPET is the next generation timer replacing legacy 8254s.
58@@ -202,6 +212,19 @@
59
60 If you don't know what to do here, say N.
61
62+config SMP_ALTERNATIVES
63+ bool "SMP alternatives support (EXPERIMENTAL)"
64+ depends on SMP && EXPERIMENTAL
65+ help
66+ Try to reduce the overhead of running an SMP kernel on a uniprocessor
67+ host slightly by replacing certain key instruction sequences
68+ according to whether we currently have more than one CPU available.
69+ This should provide a noticeable boost to performance when
70+ running SMP kernels on UP machines, and have negligible impact
71+ when running on a true SMP host.
72+
73+ If unsure, say N.
74+
75 config NR_CPUS
76 int "Maximum number of CPUs (2-255)"
77 range 2 255
78@@ -218,7 +241,7 @@
79
80 config SCHED_SMT
81 bool "SMT (Hyperthreading) scheduler support"
82- depends on SMP
83+ depends on SMP && !X86_XEN
84 default off
85 help
86 SMT scheduler support improves the CPU scheduler's decision making
87@@ -230,7 +253,7 @@
88
89 config X86_UP_APIC
90 bool "Local APIC support on uniprocessors"
91- depends on !SMP && !(X86_VISWS || X86_VOYAGER)
92+ depends on !SMP && !(X86_VISWS || X86_VOYAGER || XEN_UNPRIVILEGED_GUEST)
93 help
94 A local APIC (Advanced Programmable Interrupt Controller) is an
95 integrated interrupt controller in the CPU. If you have a single-CPU
96@@ -255,12 +278,12 @@
97
98 config X86_LOCAL_APIC
99 bool
100- depends on X86_UP_APIC || ((X86_VISWS || SMP) && !X86_VOYAGER)
101+ depends on X86_UP_APIC || ((X86_VISWS || SMP) && !(X86_VOYAGER || XEN_UNPRIVILEGED_GUEST))
102 default y
103
104 config X86_IO_APIC
105 bool
106- depends on X86_UP_IOAPIC || (SMP && !(X86_VISWS || X86_VOYAGER))
107+ depends on X86_UP_IOAPIC || (SMP && !(X86_VISWS || X86_VOYAGER || XEN_UNPRIVILEGED_GUEST))
108 default y
109
110 config X86_VISWS_APIC
111@@ -268,9 +291,14 @@
112 depends on X86_VISWS
113 default y
114
115+config X86_TSC
116+ bool
117+ depends on (MWINCHIP3D || MWINCHIP2 || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MGEODEGX1) && !X86_NUMAQ && !X86_XEN
118+ default y
119+
120 config X86_MCE
121 bool "Machine Check Exception"
122- depends on !X86_VOYAGER
123+ depends on !(X86_VOYAGER || X86_XEN)
124 ---help---
125 Machine Check Exception support allows the processor to notify the
126 kernel if it detects a problem (e.g. overheating, component failure).
127@@ -360,6 +388,7 @@
128
129 config MICROCODE
130 tristate "/dev/cpu/microcode - Intel IA32 CPU microcode support"
131+ depends on !XEN_UNPRIVILEGED_GUEST
132 ---help---
133 If you say Y here and also to "/dev file system support" in the
134 'File systems' section, you will be able to update the microcode on
135@@ -377,6 +406,7 @@
136
137 config X86_MSR
138 tristate "/dev/cpu/*/msr - Model-specific register support"
139+ depends on !X86_XEN
140 help
141 This device gives privileged processes access to the x86
142 Model-Specific Registers (MSRs). It is a character device with
143@@ -392,6 +422,10 @@
144 with major 203 and minors 0 to 31 for /dev/cpu/0/cpuid to
145 /dev/cpu/31/cpuid.
146
147+config SWIOTLB
148+ bool
149+ default n
150+
151 source "drivers/firmware/Kconfig"
152
153 choice
154@@ -560,7 +594,7 @@
155
156 config HIGHPTE
157 bool "Allocate 3rd-level pagetables from highmem"
158- depends on HIGHMEM4G || HIGHMEM64G
159+ depends on (HIGHMEM4G || HIGHMEM64G) && !X86_XEN
160 help
161 The VM uses one page table entry for each page of physical memory.
162 For systems with a lot of RAM, this can be wasteful of precious
163@@ -569,6 +603,7 @@
164
165 config MATH_EMULATION
166 bool "Math emulation"
167+ depends on !X86_XEN
168 ---help---
169 Linux can emulate a math coprocessor (used for floating point
170 operations) if you don't have one. 486DX and Pentium processors have
171@@ -594,6 +629,8 @@
172
173 config MTRR
174 bool "MTRR (Memory Type Range Register) support"
175+ depends on !XEN_UNPRIVILEGED_GUEST
176+ default y if X86_XEN
177 ---help---
178 On Intel P6 family processors (Pentium Pro, Pentium II and later)
179 the Memory Type Range Registers (MTRRs) may be used to control
180@@ -628,7 +665,7 @@
181
182 config EFI
183 bool "Boot from EFI support (EXPERIMENTAL)"
184- depends on ACPI
185+ depends on ACPI && !X86_XEN
186 default n
187 ---help---
188 This enables the the kernel to boot on EFI platforms using
189@@ -646,7 +683,7 @@
190
191 config IRQBALANCE
192 bool "Enable kernel irq balancing"
193- depends on SMP && X86_IO_APIC
194+ depends on SMP && X86_IO_APIC && !X86_XEN
195 default y
196 help
197 The default yes will allow the kernel to do irq load balancing.
198@@ -689,7 +726,7 @@
199
200 config KEXEC
201 bool "kexec system call (EXPERIMENTAL)"
202- depends on EXPERIMENTAL
203+ depends on EXPERIMENTAL && !XEN_UNPRIVILEGED_GUEST
204 help
205 kexec is a system call that implements the ability to shutdown your
206 current kernel, and to start another kernel. It is like a reboot
207@@ -743,6 +780,7 @@
208 config DOUBLEFAULT
209 default y
210 bool "Enable doublefault exception handler" if EMBEDDED
211+ depends on !X86_NO_TSS
212 help
213 This option allows trapping of rare doublefault exceptions that
214 would otherwise cause a system to silently reboot. Disabling this
215@@ -756,18 +794,20 @@
216 depends on HIGHMEM
217
218 menu "Power management options (ACPI, APM)"
219- depends on !X86_VOYAGER
220+ depends on !(X86_VOYAGER || XEN_UNPRIVILEGED_GUEST)
221
222+if !X86_XEN
223 source kernel/power/Kconfig
224+endif
225
226 source "drivers/acpi/Kconfig"
227
228 menu "APM (Advanced Power Management) BIOS Support"
229-depends on PM && !X86_VISWS
230+depends on PM && !(X86_VISWS || X86_XEN)
231
232 config APM
233 tristate "APM (Advanced Power Management) BIOS support"
234- depends on PM
235+ depends on PM && PM_LEGACY
236 ---help---
237 APM is a BIOS specification for saving power using several different
238 techniques. This is mostly useful for battery powered laptops with
239@@ -952,6 +992,7 @@
240
241 config PCI_GOBIOS
242 bool "BIOS"
243+ depends on !X86_XEN
244
245 config PCI_GOMMCONFIG
246 bool "MMConfig"
247@@ -959,6 +1000,13 @@
248 config PCI_GODIRECT
249 bool "Direct"
250
251+config PCI_GOXEN_FE
252+ bool "Xen PCI Frontend"
253+ depends on X86_XEN
254+ help
255+ The PCI device frontend driver allows the kernel to import arbitrary
256+ PCI devices from a PCI backend to support PCI driver domains.
257+
258 config PCI_GOANY
259 bool "Any"
260
261@@ -966,7 +1014,7 @@
262
263 config PCI_BIOS
264 bool
265- depends on !X86_VISWS && PCI && (PCI_GOBIOS || PCI_GOANY)
266+ depends on !(X86_VISWS || X86_XEN) && PCI && (PCI_GOBIOS || PCI_GOANY)
267 default y
268
269 config PCI_DIRECT
270@@ -979,6 +1027,18 @@
271 depends on PCI && ACPI && (PCI_GOMMCONFIG || PCI_GOANY)
272 default y
273
274+config XEN_PCIDEV_FRONTEND
275+ bool
276+ depends on PCI && X86_XEN && (PCI_GOXEN_FE || PCI_GOANY)
277+ default y
278+
279+config XEN_PCIDEV_FE_DEBUG
280+ bool "Xen PCI Frontend Debugging"
281+ depends on XEN_PCIDEV_FRONTEND
282+ default n
283+ help
284+ Enables some debug statements within the PCI Frontend.
285+
286 source "drivers/pci/pcie/Kconfig"
287
288 source "drivers/pci/Kconfig"
289@@ -989,7 +1049,7 @@
290
291 config ISA
292 bool "ISA support"
293- depends on !(X86_VOYAGER || X86_VISWS)
294+ depends on !(X86_VOYAGER || X86_VISWS || X86_XEN)
295 help
296 Find out whether you have ISA slots on your motherboard. ISA is the
297 name of a bus system, i.e. the way the CPU talks to the other stuff
298@@ -1016,7 +1076,7 @@
299 source "drivers/eisa/Kconfig"
300
301 config MCA
302- bool "MCA support" if !(X86_VISWS || X86_VOYAGER)
303+ bool "MCA support" if !(X86_VISWS || X86_VOYAGER || X86_XEN)
304 default y if X86_VOYAGER
305 help
306 MicroChannel Architecture is found in some IBM PS/2 machines and
307@@ -1078,6 +1138,8 @@
308
309 source "crypto/Kconfig"
310
311+source "drivers/xen/Kconfig"
312+
313 source "lib/Kconfig"
314
315 #
316@@ -1103,7 +1165,7 @@
317
318 config X86_HT
319 bool
320- depends on SMP && !(X86_VISWS || X86_VOYAGER)
321+ depends on SMP && !(X86_VISWS || X86_VOYAGER || X86_XEN)
322 default y
323
324 config X86_BIOS_REBOOT
325@@ -1116,6 +1178,16 @@
326 depends on X86_SMP || (X86_VOYAGER && SMP)
327 default y
328
329+config X86_NO_TSS
330+ bool
331+ depends on X86_XEN
332+ default y
333+
334+config X86_NO_IDT
335+ bool
336+ depends on X86_XEN
337+ default y
338+
339 config KTIME_SCALAR
340 bool
341 default y
342diff -Nur linux-2.6.16.33-noxen/arch/i386/Kconfig.cpu linux-2.6.16.33/arch/i386/Kconfig.cpu
343--- linux-2.6.16.33-noxen/arch/i386/Kconfig.cpu 2006-11-22 18:06:31.000000000 +0000
344+++ linux-2.6.16.33/arch/i386/Kconfig.cpu 2007-01-08 15:00:45.000000000 +0000
345@@ -251,7 +251,7 @@
346
347 config X86_F00F_BUG
348 bool
349- depends on M586MMX || M586TSC || M586 || M486 || M386
350+ depends on (M586MMX || M586TSC || M586 || M486 || M386) && !X86_NO_IDT
351 default y
352
353 config X86_WP_WORKS_OK
354diff -Nur linux-2.6.16.33-noxen/arch/i386/Makefile linux-2.6.16.33/arch/i386/Makefile
355--- linux-2.6.16.33-noxen/arch/i386/Makefile 2006-11-22 18:06:31.000000000 +0000
356+++ linux-2.6.16.33/arch/i386/Makefile 2007-01-08 15:00:45.000000000 +0000
357@@ -45,6 +45,11 @@
358
359 CFLAGS += $(cflags-y)
360
361+cppflags-$(CONFIG_XEN) += \
362+ -D__XEN_INTERFACE_VERSION__=$(CONFIG_XEN_INTERFACE_VERSION)
363+
364+CPPFLAGS += $(cppflags-y)
365+
366 # Default subarch .c files
367 mcore-y := mach-default
368
369@@ -68,6 +73,10 @@
370 mflags-$(CONFIG_X86_SUMMIT) := -Iinclude/asm-i386/mach-summit
371 mcore-$(CONFIG_X86_SUMMIT) := mach-default
372
373+# Xen subarch support
374+mflags-$(CONFIG_X86_XEN) := -Iinclude/asm-i386/mach-xen
375+mcore-$(CONFIG_X86_XEN) := mach-xen
376+
377 # generic subarchitecture
378 mflags-$(CONFIG_X86_GENERICARCH) := -Iinclude/asm-i386/mach-generic
379 mcore-$(CONFIG_X86_GENERICARCH) := mach-default
380@@ -102,6 +111,19 @@
381 .PHONY: zImage bzImage compressed zlilo bzlilo \
382 zdisk bzdisk fdimage fdimage144 fdimage288 install
383
384+ifdef CONFIG_XEN
385+CPPFLAGS := -Iinclude$(if $(KBUILD_SRC),2)/asm/mach-xen $(CPPFLAGS)
386+head-y := arch/i386/kernel/head-xen.o arch/i386/kernel/init_task-xen.o
387+boot := arch/i386/boot-xen
388+.PHONY: vmlinuz
389+all: vmlinuz
390+
391+vmlinuz: vmlinux
392+ $(Q)$(MAKE) $(build)=$(boot) $@
393+
394+install:
395+ $(Q)$(MAKE) $(build)=$(boot) XENGUEST=$(XENGUEST) $@
396+else
397 all: bzImage
398
399 # KBUILD_IMAGE specify target image being built
400@@ -124,6 +146,7 @@
401
402 install:
403 $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(KBUILD_IMAGE) install
404+endif
405
406 archclean:
407 $(Q)$(MAKE) $(clean)=arch/i386/boot
408@@ -139,3 +162,4 @@
409 endef
410
411 CLEAN_FILES += arch/$(ARCH)/boot/fdimage arch/$(ARCH)/boot/mtools.conf
412+CLEAN_FILES += vmlinuz vmlinux-stripped
413diff -Nur linux-2.6.16.33-noxen/arch/i386/boot-xen/Makefile linux-2.6.16.33/arch/i386/boot-xen/Makefile
414--- linux-2.6.16.33-noxen/arch/i386/boot-xen/Makefile 1970-01-01 00:00:00.000000000 +0000
415+++ linux-2.6.16.33/arch/i386/boot-xen/Makefile 2007-01-08 15:00:45.000000000 +0000
416@@ -0,0 +1,21 @@
417+
418+OBJCOPYFLAGS := -g --strip-unneeded
419+
420+vmlinuz: vmlinux-stripped FORCE
421+ $(call if_changed,gzip)
422+
423+vmlinux-stripped: vmlinux FORCE
424+ $(call if_changed,objcopy)
425+
426+INSTALL_ROOT := $(patsubst %/boot,%,$(INSTALL_PATH))
427+
428+XINSTALL_NAME ?= $(KERNELRELEASE)
429+install:
430+ mkdir -p $(INSTALL_ROOT)/boot
431+ ln -f -s vmlinuz-$(XINSTALL_NAME)$(INSTALL_SUFFIX) $(INSTALL_ROOT)/boot/vmlinuz-$(VERSION).$(PATCHLEVEL).$(SUBLEVEL)$(XENGUEST)$(INSTALL_SUFFIX)
432+ rm -f $(INSTALL_ROOT)/boot/vmlinuz-$(XINSTALL_NAME)$(INSTALL_SUFFIX)
433+ install -m0644 vmlinuz $(INSTALL_ROOT)/boot/vmlinuz-$(XINSTALL_NAME)$(INSTALL_SUFFIX)
434+ install -m0644 vmlinux $(INSTALL_ROOT)/boot/vmlinux-syms-$(XINSTALL_NAME)$(INSTALL_SUFFIX)
435+ install -m0664 .config $(INSTALL_ROOT)/boot/config-$(XINSTALL_NAME)$(INSTALL_SUFFIX)
436+ install -m0664 System.map $(INSTALL_ROOT)/boot/System.map-$(XINSTALL_NAME)$(INSTALL_SUFFIX)
437+ ln -f -s vmlinuz-$(XINSTALL_NAME)$(INSTALL_SUFFIX) $(INSTALL_ROOT)/boot/vmlinuz-$(VERSION).$(PATCHLEVEL)$(XENGUEST)$(INSTALL_SUFFIX)
438diff -Nur linux-2.6.16.33-noxen/arch/i386/kernel/Makefile linux-2.6.16.33/arch/i386/kernel/Makefile
439--- linux-2.6.16.33-noxen/arch/i386/kernel/Makefile 2006-11-22 18:06:31.000000000 +0000
440+++ linux-2.6.16.33/arch/i386/kernel/Makefile 2007-01-08 15:00:45.000000000 +0000
441@@ -37,11 +37,18 @@
442 obj-$(CONFIG_DOUBLEFAULT) += doublefault.o
443 obj-$(CONFIG_VM86) += vm86.o
444 obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
445+obj-$(CONFIG_SMP_ALTERNATIVES) += smpalts.o
446
447 EXTRA_AFLAGS := -traditional
448
449 obj-$(CONFIG_SCx200) += scx200.o
450
451+ifdef CONFIG_XEN
452+vsyscall_note := vsyscall-note-xen.o
453+else
454+vsyscall_note := vsyscall-note.o
455+endif
456+
457 # vsyscall.o contains the vsyscall DSO images as __initdata.
458 # We must build both images before we can assemble it.
459 # Note: kbuild does not track this dependency due to usage of .incbin
460@@ -62,7 +69,7 @@
461
462 $(obj)/vsyscall-int80.so $(obj)/vsyscall-sysenter.so: \
463 $(obj)/vsyscall-%.so: $(src)/vsyscall.lds \
464- $(obj)/vsyscall-%.o $(obj)/vsyscall-note.o FORCE
465+ $(obj)/vsyscall-%.o $(obj)/$(vsyscall_note) FORCE
466 $(call if_changed,syscall)
467
468 # We also create a special relocatable object that should mirror the symbol
469@@ -74,5 +81,17 @@
470
471 SYSCFLAGS_vsyscall-syms.o = -r
472 $(obj)/vsyscall-syms.o: $(src)/vsyscall.lds \
473- $(obj)/vsyscall-sysenter.o $(obj)/vsyscall-note.o FORCE
474+ $(obj)/vsyscall-sysenter.o $(obj)/$(vsyscall_note) FORCE
475 $(call if_changed,syscall)
476+
477+ifdef CONFIG_XEN
478+include $(srctree)/scripts/Makefile.xen
479+
480+obj-y += fixup.o
481+microcode-$(subst m,y,$(CONFIG_MICROCODE)) := microcode-xen.o
482+n-obj-xen := i8259.o timers/ reboot.o smpboot.o trampoline.o
483+
484+obj-y := $(call filterxen, $(obj-y), $(n-obj-xen))
485+obj-y := $(call cherrypickxen, $(obj-y))
486+extra-y := $(call cherrypickxen, $(extra-y))
487+endif
488diff -Nur linux-2.6.16.33-noxen/arch/i386/kernel/acpi/Makefile linux-2.6.16.33/arch/i386/kernel/acpi/Makefile
489--- linux-2.6.16.33-noxen/arch/i386/kernel/acpi/Makefile 2006-11-22 18:06:31.000000000 +0000
490+++ linux-2.6.16.33/arch/i386/kernel/acpi/Makefile 2007-01-08 15:00:45.000000000 +0000
491@@ -6,3 +6,7 @@
492 obj-y += cstate.o processor.o
493 endif
494
495+ifdef CONFIG_XEN
496+include $(srctree)/scripts/Makefile.xen
497+obj-y := $(call cherrypickxen, $(obj-y), $(src))
498+endif
499diff -Nur linux-2.6.16.33-noxen/arch/i386/kernel/acpi/boot-xen.c linux-2.6.16.33/arch/i386/kernel/acpi/boot-xen.c
500--- linux-2.6.16.33-noxen/arch/i386/kernel/acpi/boot-xen.c 1970-01-01 00:00:00.000000000 +0000
501+++ linux-2.6.16.33/arch/i386/kernel/acpi/boot-xen.c 2007-01-08 15:00:45.000000000 +0000
502@@ -0,0 +1,1161 @@
503+/*
504+ * boot.c - Architecture-Specific Low-Level ACPI Boot Support
505+ *
506+ * Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com>
507+ * Copyright (C) 2001 Jun Nakajima <jun.nakajima@intel.com>
508+ *
509+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
510+ *
511+ * This program is free software; you can redistribute it and/or modify
512+ * it under the terms of the GNU General Public License as published by
513+ * the Free Software Foundation; either version 2 of the License, or
514+ * (at your option) any later version.
515+ *
516+ * This program is distributed in the hope that it will be useful,
517+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
518+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
519+ * GNU General Public License for more details.
520+ *
521+ * You should have received a copy of the GNU General Public License
522+ * along with this program; if not, write to the Free Software
523+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
524+ *
525+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
526+ */
527+
528+#include <linux/init.h>
529+#include <linux/config.h>
530+#include <linux/acpi.h>
531+#include <linux/efi.h>
532+#include <linux/module.h>
533+#include <linux/dmi.h>
534+#include <linux/irq.h>
535+
536+#include <asm/pgtable.h>
537+#include <asm/io_apic.h>
538+#include <asm/apic.h>
539+#include <asm/io.h>
540+#include <asm/mpspec.h>
541+
542+#ifdef CONFIG_X86_64
543+
544+extern void __init clustered_apic_check(void);
545+
546+extern int gsi_irq_sharing(int gsi);
547+#include <asm/proto.h>
548+
549+static inline int acpi_madt_oem_check(char *oem_id, char *oem_table_id) { return 0; }
550+
551+
552+#else /* X86 */
553+
554+#ifdef CONFIG_X86_LOCAL_APIC
555+#include <mach_apic.h>
556+#include <mach_mpparse.h>
557+#endif /* CONFIG_X86_LOCAL_APIC */
558+
559+static inline int gsi_irq_sharing(int gsi) { return gsi; }
560+
561+#endif /* X86 */
562+
563+#define BAD_MADT_ENTRY(entry, end) ( \
564+ (!entry) || (unsigned long)entry + sizeof(*entry) > end || \
565+ ((acpi_table_entry_header *)entry)->length != sizeof(*entry))
566+
567+#define PREFIX "ACPI: "
568+
569+int acpi_noirq __initdata; /* skip ACPI IRQ initialization */
570+int acpi_pci_disabled __initdata; /* skip ACPI PCI scan and IRQ initialization */
571+int acpi_ht __initdata = 1; /* enable HT */
572+
573+int acpi_lapic;
574+int acpi_ioapic;
575+int acpi_strict;
576+EXPORT_SYMBOL(acpi_strict);
577+
578+acpi_interrupt_flags acpi_sci_flags __initdata;
579+int acpi_sci_override_gsi __initdata;
580+int acpi_skip_timer_override __initdata;
581+
582+#ifdef CONFIG_X86_LOCAL_APIC
583+static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE;
584+#endif
585+
586+#ifndef __HAVE_ARCH_CMPXCHG
587+#warning ACPI uses CMPXCHG, i486 and later hardware
588+#endif
589+
590+#define MAX_MADT_ENTRIES 256
591+u8 x86_acpiid_to_apicid[MAX_MADT_ENTRIES] =
592+ {[0 ... MAX_MADT_ENTRIES - 1] = 0xff };
593+EXPORT_SYMBOL(x86_acpiid_to_apicid);
594+
595+/* --------------------------------------------------------------------------
596+ Boot-time Configuration
597+ -------------------------------------------------------------------------- */
598+
599+/*
600+ * The default interrupt routing model is PIC (8259). This gets
601+ * overridden if IOAPICs are enumerated (below).
602+ */
603+enum acpi_irq_model_id acpi_irq_model = ACPI_IRQ_MODEL_PIC;
604+
605+#if defined(CONFIG_X86_64) && !defined(CONFIG_XEN)
606+
607+/* rely on all ACPI tables being in the direct mapping */
608+char *__acpi_map_table(unsigned long phys_addr, unsigned long size)
609+{
610+ if (!phys_addr || !size)
611+ return NULL;
612+
613+ if (phys_addr+size <= (end_pfn_map << PAGE_SHIFT) + PAGE_SIZE)
614+ return __va(phys_addr);
615+
616+ return NULL;
617+}
618+
619+#else
620+
621+/*
622+ * Temporarily use the virtual area starting from FIX_IO_APIC_BASE_END,
623+ * to map the target physical address. The problem is that set_fixmap()
624+ * provides a single page, and it is possible that the page is not
625+ * sufficient.
626+ * By using this area, we can map up to MAX_IO_APICS pages temporarily,
627+ * i.e. until the next __va_range() call.
628+ *
629+ * Important Safety Note: The fixed I/O APIC page numbers are *subtracted*
630+ * from the fixed base. That's why we start at FIX_IO_APIC_BASE_END and
631+ * count idx down while incrementing the phys address.
632+ */
633+char *__acpi_map_table(unsigned long phys, unsigned long size)
634+{
635+ unsigned long base, offset, mapped_size;
636+ int idx;
637+
638+#ifndef CONFIG_XEN
639+ if (phys + size < 8 * 1024 * 1024)
640+ return __va(phys);
641+#endif
642+
643+ offset = phys & (PAGE_SIZE - 1);
644+ mapped_size = PAGE_SIZE - offset;
645+ set_fixmap(FIX_ACPI_END, phys);
646+ base = fix_to_virt(FIX_ACPI_END);
647+
648+ /*
649+ * Most cases can be covered by the below.
650+ */
651+ idx = FIX_ACPI_END;
652+ while (mapped_size < size) {
653+ if (--idx < FIX_ACPI_BEGIN)
654+ return NULL; /* cannot handle this */
655+ phys += PAGE_SIZE;
656+ set_fixmap(idx, phys);
657+ mapped_size += PAGE_SIZE;
658+ }
659+
660+ return ((unsigned char *)base + offset);
661+}
662+#endif
663+
664+#ifdef CONFIG_PCI_MMCONFIG
665+/* The physical address of the MMCONFIG aperture. Set from ACPI tables. */
666+struct acpi_table_mcfg_config *pci_mmcfg_config;
667+int pci_mmcfg_config_num;
668+
669+int __init acpi_parse_mcfg(unsigned long phys_addr, unsigned long size)
670+{
671+ struct acpi_table_mcfg *mcfg;
672+ unsigned long i;
673+ int config_size;
674+
675+ if (!phys_addr || !size)
676+ return -EINVAL;
677+
678+ mcfg = (struct acpi_table_mcfg *)__acpi_map_table(phys_addr, size);
679+ if (!mcfg) {
680+ printk(KERN_WARNING PREFIX "Unable to map MCFG\n");
681+ return -ENODEV;
682+ }
683+
684+ /* how many config structures do we have */
685+ pci_mmcfg_config_num = 0;
686+ i = size - sizeof(struct acpi_table_mcfg);
687+ while (i >= sizeof(struct acpi_table_mcfg_config)) {
688+ ++pci_mmcfg_config_num;
689+ i -= sizeof(struct acpi_table_mcfg_config);
690+ };
691+ if (pci_mmcfg_config_num == 0) {
692+ printk(KERN_ERR PREFIX "MMCONFIG has no entries\n");
693+ return -ENODEV;
694+ }
695+
696+ config_size = pci_mmcfg_config_num * sizeof(*pci_mmcfg_config);
697+ pci_mmcfg_config = kmalloc(config_size, GFP_KERNEL);
698+ if (!pci_mmcfg_config) {
699+ printk(KERN_WARNING PREFIX
700+ "No memory for MCFG config tables\n");
701+ return -ENOMEM;
702+ }
703+
704+ memcpy(pci_mmcfg_config, &mcfg->config, config_size);
705+ for (i = 0; i < pci_mmcfg_config_num; ++i) {
706+ if (mcfg->config[i].base_reserved) {
707+ printk(KERN_ERR PREFIX
708+ "MMCONFIG not in low 4GB of memory\n");
709+ return -ENODEV;
710+ }
711+ }
712+
713+ return 0;
714+}
715+#endif /* CONFIG_PCI_MMCONFIG */
716+
717+#ifdef CONFIG_X86_LOCAL_APIC
718+static int __init acpi_parse_madt(unsigned long phys_addr, unsigned long size)
719+{
720+ struct acpi_table_madt *madt = NULL;
721+
722+ if (!phys_addr || !size)
723+ return -EINVAL;
724+
725+ madt = (struct acpi_table_madt *)__acpi_map_table(phys_addr, size);
726+ if (!madt) {
727+ printk(KERN_WARNING PREFIX "Unable to map MADT\n");
728+ return -ENODEV;
729+ }
730+
731+ if (madt->lapic_address) {
732+ acpi_lapic_addr = (u64) madt->lapic_address;
733+
734+ printk(KERN_DEBUG PREFIX "Local APIC address 0x%08x\n",
735+ madt->lapic_address);
736+ }
737+
738+ acpi_madt_oem_check(madt->header.oem_id, madt->header.oem_table_id);
739+
740+ return 0;
741+}
742+
743+static int __init
744+acpi_parse_lapic(acpi_table_entry_header * header, const unsigned long end)
745+{
746+ struct acpi_table_lapic *processor = NULL;
747+
748+ processor = (struct acpi_table_lapic *)header;
749+
750+ if (BAD_MADT_ENTRY(processor, end))
751+ return -EINVAL;
752+
753+ acpi_table_print_madt_entry(header);
754+
755+ /* Record local apic id only when enabled */
756+ if (processor->flags.enabled)
757+ x86_acpiid_to_apicid[processor->acpi_id] = processor->id;
758+
759+ /*
760+ * We need to register disabled CPU as well to permit
761+ * counting disabled CPUs. This allows us to size
762+ * cpus_possible_map more accurately, so that we need
763+ * not preallocate memory for all NR_CPUS
764+ * when we use CPU hotplug.
765+ */
766+ mp_register_lapic(processor->id, /* APIC ID */
767+ processor->flags.enabled); /* Enabled? */
768+
769+ return 0;
770+}
771+
772+static int __init
773+acpi_parse_lapic_addr_ovr(acpi_table_entry_header * header,
774+ const unsigned long end)
775+{
776+ struct acpi_table_lapic_addr_ovr *lapic_addr_ovr = NULL;
777+
778+ lapic_addr_ovr = (struct acpi_table_lapic_addr_ovr *)header;
779+
780+ if (BAD_MADT_ENTRY(lapic_addr_ovr, end))
781+ return -EINVAL;
782+
783+ acpi_lapic_addr = lapic_addr_ovr->address;
784+
785+ return 0;
786+}
787+
788+static int __init
789+acpi_parse_lapic_nmi(acpi_table_entry_header * header, const unsigned long end)
790+{
791+ struct acpi_table_lapic_nmi *lapic_nmi = NULL;
792+
793+ lapic_nmi = (struct acpi_table_lapic_nmi *)header;
794+
795+ if (BAD_MADT_ENTRY(lapic_nmi, end))
796+ return -EINVAL;
797+
798+ acpi_table_print_madt_entry(header);
799+
800+ if (lapic_nmi->lint != 1)
801+ printk(KERN_WARNING PREFIX "NMI not connected to LINT 1!\n");
802+
803+ return 0;
804+}
805+
806+#endif /*CONFIG_X86_LOCAL_APIC */
807+
808+#ifdef CONFIG_X86_IO_APIC
809+
810+static int __init
811+acpi_parse_ioapic(acpi_table_entry_header * header, const unsigned long end)
812+{
813+ struct acpi_table_ioapic *ioapic = NULL;
814+
815+ ioapic = (struct acpi_table_ioapic *)header;
816+
817+ if (BAD_MADT_ENTRY(ioapic, end))
818+ return -EINVAL;
819+
820+ acpi_table_print_madt_entry(header);
821+
822+ mp_register_ioapic(ioapic->id,
823+ ioapic->address, ioapic->global_irq_base);
824+
825+ return 0;
826+}
827+
828+/*
829+ * Parse Interrupt Source Override for the ACPI SCI
830+ */
831+static void acpi_sci_ioapic_setup(u32 gsi, u16 polarity, u16 trigger)
832+{
833+ if (trigger == 0) /* compatible SCI trigger is level */
834+ trigger = 3;
835+
836+ if (polarity == 0) /* compatible SCI polarity is low */
837+ polarity = 3;
838+
839+ /* Command-line over-ride via acpi_sci= */
840+ if (acpi_sci_flags.trigger)
841+ trigger = acpi_sci_flags.trigger;
842+
843+ if (acpi_sci_flags.polarity)
844+ polarity = acpi_sci_flags.polarity;
845+
846+ /*
847+ * mp_config_acpi_legacy_irqs() already setup IRQs < 16
848+ * If GSI is < 16, this will update its flags,
849+ * else it will create a new mp_irqs[] entry.
850+ */
851+ mp_override_legacy_irq(gsi, polarity, trigger, gsi);
852+
853+ /*
854+ * stash over-ride to indicate we've been here
855+ * and for later update of acpi_fadt
856+ */
857+ acpi_sci_override_gsi = gsi;
858+ return;
859+}
860+
861+static int __init
862+acpi_parse_int_src_ovr(acpi_table_entry_header * header,
863+ const unsigned long end)
864+{
865+ struct acpi_table_int_src_ovr *intsrc = NULL;
866+
867+ intsrc = (struct acpi_table_int_src_ovr *)header;
868+
869+ if (BAD_MADT_ENTRY(intsrc, end))
870+ return -EINVAL;
871+
872+ acpi_table_print_madt_entry(header);
873+
874+ if (intsrc->bus_irq == acpi_fadt.sci_int) {
875+ acpi_sci_ioapic_setup(intsrc->global_irq,
876+ intsrc->flags.polarity,
877+ intsrc->flags.trigger);
878+ return 0;
879+ }
880+
881+ if (acpi_skip_timer_override &&
882+ intsrc->bus_irq == 0 && intsrc->global_irq == 2) {
883+ printk(PREFIX "BIOS IRQ0 pin2 override ignored.\n");
884+ return 0;
885+ }
886+
887+ mp_override_legacy_irq(intsrc->bus_irq,
888+ intsrc->flags.polarity,
889+ intsrc->flags.trigger, intsrc->global_irq);
890+
891+ return 0;
892+}
893+
894+static int __init
895+acpi_parse_nmi_src(acpi_table_entry_header * header, const unsigned long end)
896+{
897+ struct acpi_table_nmi_src *nmi_src = NULL;
898+
899+ nmi_src = (struct acpi_table_nmi_src *)header;
900+
901+ if (BAD_MADT_ENTRY(nmi_src, end))
902+ return -EINVAL;
903+
904+ acpi_table_print_madt_entry(header);
905+
906+ /* TBD: Support nmi_src entries? */
907+
908+ return 0;
909+}
910+
911+#endif /* CONFIG_X86_IO_APIC */
912+
913+/*
914+ * acpi_pic_sci_set_trigger()
915+ *
916+ * use ELCR to set PIC-mode trigger type for SCI
917+ *
918+ * If a PIC-mode SCI is not recognized or gives spurious IRQ7's
919+ * it may require Edge Trigger -- use "acpi_sci=edge"
920+ *
921+ * Port 0x4d0-4d1 are ECLR1 and ECLR2, the Edge/Level Control Registers
922+ * for the 8259 PIC. bit[n] = 1 means irq[n] is Level, otherwise Edge.
923+ * ECLR1 is IRQ's 0-7 (IRQ 0, 1, 2 must be 0)
924+ * ECLR2 is IRQ's 8-15 (IRQ 8, 13 must be 0)
925+ */
926+
927+void __init acpi_pic_sci_set_trigger(unsigned int irq, u16 trigger)
928+{
929+ unsigned int mask = 1 << irq;
930+ unsigned int old, new;
931+
932+ /* Real old ELCR mask */
933+ old = inb(0x4d0) | (inb(0x4d1) << 8);
934+
935+ /*
936+ * If we use ACPI to set PCI irq's, then we should clear ELCR
937+ * since we will set it correctly as we enable the PCI irq
938+ * routing.
939+ */
940+ new = acpi_noirq ? old : 0;
941+
942+ /*
943+ * Update SCI information in the ELCR, it isn't in the PCI
944+ * routing tables..
945+ */
946+ switch (trigger) {
947+ case 1: /* Edge - clear */
948+ new &= ~mask;
949+ break;
950+ case 3: /* Level - set */
951+ new |= mask;
952+ break;
953+ }
954+
955+ if (old == new)
956+ return;
957+
958+ printk(PREFIX "setting ELCR to %04x (from %04x)\n", new, old);
959+ outb(new, 0x4d0);
960+ outb(new >> 8, 0x4d1);
961+}
962+
963+int acpi_gsi_to_irq(u32 gsi, unsigned int *irq)
964+{
965+#ifdef CONFIG_X86_IO_APIC
966+ if (use_pci_vector() && !platform_legacy_irq(gsi))
967+ *irq = IO_APIC_VECTOR(gsi);
968+ else
969+#endif
970+ *irq = gsi_irq_sharing(gsi);
971+ return 0;
972+}
973+
974+/*
975+ * success: return IRQ number (>=0)
976+ * failure: return < 0
977+ */
978+int acpi_register_gsi(u32 gsi, int triggering, int polarity)
979+{
980+ unsigned int irq;
981+ unsigned int plat_gsi = gsi;
982+
983+#ifdef CONFIG_PCI
984+ /*
985+ * Make sure all (legacy) PCI IRQs are set as level-triggered.
986+ */
987+ if (acpi_irq_model == ACPI_IRQ_MODEL_PIC) {
988+ extern void eisa_set_level_irq(unsigned int irq);
989+
990+ if (triggering == ACPI_LEVEL_SENSITIVE)
991+ eisa_set_level_irq(gsi);
992+ }
993+#endif
994+
995+#ifdef CONFIG_X86_IO_APIC
996+ if (acpi_irq_model == ACPI_IRQ_MODEL_IOAPIC) {
997+ plat_gsi = mp_register_gsi(gsi, triggering, polarity);
998+ }
999+#endif
1000+ acpi_gsi_to_irq(plat_gsi, &irq);
1001+ return irq;
1002+}
1003+
1004+EXPORT_SYMBOL(acpi_register_gsi);
1005+
1006+/*
1007+ * ACPI based hotplug support for CPU
1008+ */
1009+#ifdef CONFIG_ACPI_HOTPLUG_CPU
1010+int acpi_map_lsapic(acpi_handle handle, int *pcpu)
1011+{
1012+ /* TBD */
1013+ return -EINVAL;
1014+}
1015+
1016+EXPORT_SYMBOL(acpi_map_lsapic);
1017+
1018+int acpi_unmap_lsapic(int cpu)
1019+{
1020+ /* TBD */
1021+ return -EINVAL;
1022+}
1023+
1024+EXPORT_SYMBOL(acpi_unmap_lsapic);
1025+#endif /* CONFIG_ACPI_HOTPLUG_CPU */
1026+
1027+int acpi_register_ioapic(acpi_handle handle, u64 phys_addr, u32 gsi_base)
1028+{
1029+ /* TBD */
1030+ return -EINVAL;
1031+}
1032+
1033+EXPORT_SYMBOL(acpi_register_ioapic);
1034+
1035+int acpi_unregister_ioapic(acpi_handle handle, u32 gsi_base)
1036+{
1037+ /* TBD */
1038+ return -EINVAL;
1039+}
1040+
1041+EXPORT_SYMBOL(acpi_unregister_ioapic);
1042+
1043+static unsigned long __init
1044+acpi_scan_rsdp(unsigned long start, unsigned long length)
1045+{
1046+ unsigned long offset = 0;
1047+ unsigned long sig_len = sizeof("RSD PTR ") - 1;
1048+ unsigned long vstart = (unsigned long)isa_bus_to_virt(start);
1049+
1050+ /*
1051+ * Scan all 16-byte boundaries of the physical memory region for the
1052+ * RSDP signature.
1053+ */
1054+ for (offset = 0; offset < length; offset += 16) {
1055+ if (strncmp((char *)(vstart + offset), "RSD PTR ", sig_len))
1056+ continue;
1057+ return (start + offset);
1058+ }
1059+
1060+ return 0;
1061+}
1062+
1063+static int __init acpi_parse_sbf(unsigned long phys_addr, unsigned long size)
1064+{
1065+ struct acpi_table_sbf *sb;
1066+
1067+ if (!phys_addr || !size)
1068+ return -EINVAL;
1069+
1070+ sb = (struct acpi_table_sbf *)__acpi_map_table(phys_addr, size);
1071+ if (!sb) {
1072+ printk(KERN_WARNING PREFIX "Unable to map SBF\n");
1073+ return -ENODEV;
1074+ }
1075+
1076+ sbf_port = sb->sbf_cmos; /* Save CMOS port */
1077+
1078+ return 0;
1079+}
1080+
1081+#ifdef CONFIG_HPET_TIMER
1082+
1083+static int __init acpi_parse_hpet(unsigned long phys, unsigned long size)
1084+{
1085+ struct acpi_table_hpet *hpet_tbl;
1086+
1087+ if (!phys || !size)
1088+ return -EINVAL;
1089+
1090+ hpet_tbl = (struct acpi_table_hpet *)__acpi_map_table(phys, size);
1091+ if (!hpet_tbl) {
1092+ printk(KERN_WARNING PREFIX "Unable to map HPET\n");
1093+ return -ENODEV;
1094+ }
1095+
1096+ if (hpet_tbl->addr.space_id != ACPI_SPACE_MEM) {
1097+ printk(KERN_WARNING PREFIX "HPET timers must be located in "
1098+ "memory.\n");
1099+ return -1;
1100+ }
1101+#ifdef CONFIG_X86_64
1102+ vxtime.hpet_address = hpet_tbl->addr.addrl |
1103+ ((long)hpet_tbl->addr.addrh << 32);
1104+
1105+ printk(KERN_INFO PREFIX "HPET id: %#x base: %#lx\n",
1106+ hpet_tbl->id, vxtime.hpet_address);
1107+#else /* X86 */
1108+ {
1109+ extern unsigned long hpet_address;
1110+
1111+ hpet_address = hpet_tbl->addr.addrl;
1112+ printk(KERN_INFO PREFIX "HPET id: %#x base: %#lx\n",
1113+ hpet_tbl->id, hpet_address);
1114+ }
1115+#endif /* X86 */
1116+
1117+ return 0;
1118+}
1119+#else
1120+#define acpi_parse_hpet NULL
1121+#endif
1122+
1123+#ifdef CONFIG_X86_PM_TIMER
1124+extern u32 pmtmr_ioport;
1125+#endif
1126+
1127+static int __init acpi_parse_fadt(unsigned long phys, unsigned long size)
1128+{
1129+ struct fadt_descriptor_rev2 *fadt = NULL;
1130+
1131+ fadt = (struct fadt_descriptor_rev2 *)__acpi_map_table(phys, size);
1132+ if (!fadt) {
1133+ printk(KERN_WARNING PREFIX "Unable to map FADT\n");
1134+ return 0;
1135+ }
1136+ /* initialize sci_int early for INT_SRC_OVR MADT parsing */
1137+ acpi_fadt.sci_int = fadt->sci_int;
1138+
1139+ /* initialize rev and apic_phys_dest_mode for x86_64 genapic */
1140+ acpi_fadt.revision = fadt->revision;
1141+ acpi_fadt.force_apic_physical_destination_mode =
1142+ fadt->force_apic_physical_destination_mode;
1143+
1144+#if defined(CONFIG_X86_PM_TIMER) && !defined(CONFIG_XEN)
1145+ /* detect the location of the ACPI PM Timer */
1146+ if (fadt->revision >= FADT2_REVISION_ID) {
1147+ /* FADT rev. 2 */
1148+ if (fadt->xpm_tmr_blk.address_space_id !=
1149+ ACPI_ADR_SPACE_SYSTEM_IO)
1150+ return 0;
1151+
1152+ pmtmr_ioport = fadt->xpm_tmr_blk.address;
1153+ /*
1154+ * "X" fields are optional extensions to the original V1.0
1155+ * fields, so we must selectively expand V1.0 fields if the
1156+ * corresponding X field is zero.
1157+ */
1158+ if (!pmtmr_ioport)
1159+ pmtmr_ioport = fadt->V1_pm_tmr_blk;
1160+ } else {
1161+ /* FADT rev. 1 */
1162+ pmtmr_ioport = fadt->V1_pm_tmr_blk;
1163+ }
1164+ if (pmtmr_ioport)
1165+ printk(KERN_INFO PREFIX "PM-Timer IO Port: %#x\n",
1166+ pmtmr_ioport);
1167+#endif
1168+ return 0;
1169+}
1170+
1171+unsigned long __init acpi_find_rsdp(void)
1172+{
1173+ unsigned long rsdp_phys = 0;
1174+
1175+ if (efi_enabled) {
1176+ if (efi.acpi20)
1177+ return __pa(efi.acpi20);
1178+ else if (efi.acpi)
1179+ return __pa(efi.acpi);
1180+ }
1181+ /*
1182+ * Scan memory looking for the RSDP signature. First search EBDA (low
1183+ * memory) paragraphs and then search upper memory (E0000-FFFFF).
1184+ */
1185+ rsdp_phys = acpi_scan_rsdp(0, 0x400);
1186+ if (!rsdp_phys)
1187+ rsdp_phys = acpi_scan_rsdp(0xE0000, 0x20000);
1188+
1189+ return rsdp_phys;
1190+}
1191+
1192+#ifdef CONFIG_X86_LOCAL_APIC
1193+/*
1194+ * Parse LAPIC entries in MADT
1195+ * returns 0 on success, < 0 on error
1196+ */
1197+static int __init acpi_parse_madt_lapic_entries(void)
1198+{
1199+ int count;
1200+
1201+ /*
1202+ * Note that the LAPIC address is obtained from the MADT (32-bit value)
1203+ * and (optionally) overridden by a LAPIC_ADDR_OVR entry (64-bit value).
1204+ */
1205+
1206+ count =
1207+ acpi_table_parse_madt(ACPI_MADT_LAPIC_ADDR_OVR,
1208+ acpi_parse_lapic_addr_ovr, 0);
1209+ if (count < 0) {
1210+ printk(KERN_ERR PREFIX
1211+ "Error parsing LAPIC address override entry\n");
1212+ return count;
1213+ }
1214+
1215+ mp_register_lapic_address(acpi_lapic_addr);
1216+
1217+ count = acpi_table_parse_madt(ACPI_MADT_LAPIC, acpi_parse_lapic,
1218+ MAX_APICS);
1219+ if (!count) {
1220+ printk(KERN_ERR PREFIX "No LAPIC entries present\n");
1221+ /* TBD: Cleanup to allow fallback to MPS */
1222+ return -ENODEV;
1223+ } else if (count < 0) {
1224+ printk(KERN_ERR PREFIX "Error parsing LAPIC entry\n");
1225+ /* TBD: Cleanup to allow fallback to MPS */
1226+ return count;
1227+ }
1228+
1229+ count =
1230+ acpi_table_parse_madt(ACPI_MADT_LAPIC_NMI, acpi_parse_lapic_nmi, 0);
1231+ if (count < 0) {
1232+ printk(KERN_ERR PREFIX "Error parsing LAPIC NMI entry\n");
1233+ /* TBD: Cleanup to allow fallback to MPS */
1234+ return count;
1235+ }
1236+ return 0;
1237+}
1238+#endif /* CONFIG_X86_LOCAL_APIC */
1239+
1240+#ifdef CONFIG_X86_IO_APIC
1241+/*
1242+ * Parse IOAPIC related entries in MADT
1243+ * returns 0 on success, < 0 on error
1244+ */
1245+static int __init acpi_parse_madt_ioapic_entries(void)
1246+{
1247+ int count;
1248+
1249+ /*
1250+ * ACPI interpreter is required to complete interrupt setup,
1251+ * so if it is off, don't enumerate the io-apics with ACPI.
1252+ * If MPS is present, it will handle them,
1253+ * otherwise the system will stay in PIC mode
1254+ */
1255+ if (acpi_disabled || acpi_noirq) {
1256+ return -ENODEV;
1257+ }
1258+
1259+ /*
1260+ * if "noapic" boot option, don't look for IO-APICs
1261+ */
1262+ if (skip_ioapic_setup) {
1263+ printk(KERN_INFO PREFIX "Skipping IOAPIC probe "
1264+ "due to 'noapic' option.\n");
1265+ return -ENODEV;
1266+ }
1267+
1268+ count =
1269+ acpi_table_parse_madt(ACPI_MADT_IOAPIC, acpi_parse_ioapic,
1270+ MAX_IO_APICS);
1271+ if (!count) {
1272+ printk(KERN_ERR PREFIX "No IOAPIC entries present\n");
1273+ return -ENODEV;
1274+ } else if (count < 0) {
1275+ printk(KERN_ERR PREFIX "Error parsing IOAPIC entry\n");
1276+ return count;
1277+ }
1278+
1279+ count =
1280+ acpi_table_parse_madt(ACPI_MADT_INT_SRC_OVR, acpi_parse_int_src_ovr,
1281+ NR_IRQ_VECTORS);
1282+ if (count < 0) {
1283+ printk(KERN_ERR PREFIX
1284+ "Error parsing interrupt source overrides entry\n");
1285+ /* TBD: Cleanup to allow fallback to MPS */
1286+ return count;
1287+ }
1288+
1289+ /*
1290+ * If BIOS did not supply an INT_SRC_OVR for the SCI
1291+ * pretend we got one so we can set the SCI flags.
1292+ */
1293+ if (!acpi_sci_override_gsi)
1294+ acpi_sci_ioapic_setup(acpi_fadt.sci_int, 0, 0);
1295+
1296+ /* Fill in identity legacy mappings where no override */
1297+ mp_config_acpi_legacy_irqs();
1298+
1299+ count =
1300+ acpi_table_parse_madt(ACPI_MADT_NMI_SRC, acpi_parse_nmi_src,
1301+ NR_IRQ_VECTORS);
1302+ if (count < 0) {
1303+ printk(KERN_ERR PREFIX "Error parsing NMI SRC entry\n");
1304+ /* TBD: Cleanup to allow fallback to MPS */
1305+ return count;
1306+ }
1307+
1308+ return 0;
1309+}
1310+#else
1311+static inline int acpi_parse_madt_ioapic_entries(void)
1312+{
1313+ return -1;
1314+}
1315+#endif /* !CONFIG_X86_IO_APIC */
1316+
1317+static void __init acpi_process_madt(void)
1318+{
1319+#ifdef CONFIG_X86_LOCAL_APIC
1320+ int count, error;
1321+
1322+ count = acpi_table_parse(ACPI_APIC, acpi_parse_madt);
1323+ if (count >= 1) {
1324+
1325+ /*
1326+ * Parse MADT LAPIC entries
1327+ */
1328+ error = acpi_parse_madt_lapic_entries();
1329+ if (!error) {
1330+ acpi_lapic = 1;
1331+
1332+#ifdef CONFIG_X86_GENERICARCH
1333+ generic_bigsmp_probe();
1334+#endif
1335+ /*
1336+ * Parse MADT IO-APIC entries
1337+ */
1338+ error = acpi_parse_madt_ioapic_entries();
1339+ if (!error) {
1340+ acpi_irq_model = ACPI_IRQ_MODEL_IOAPIC;
1341+ acpi_irq_balance_set(NULL);
1342+ acpi_ioapic = 1;
1343+
1344+ smp_found_config = 1;
1345+ clustered_apic_check();
1346+ }
1347+ }
1348+ if (error == -EINVAL) {
1349+ /*
1350+ * Dell Precision Workstation 410, 610 come here.
1351+ */
1352+ printk(KERN_ERR PREFIX
1353+ "Invalid BIOS MADT, disabling ACPI\n");
1354+ disable_acpi();
1355+ }
1356+ }
1357+#endif
1358+ return;
1359+}
1360+
1361+extern int acpi_force;
1362+
1363+#ifdef __i386__
1364+
1365+static int __init disable_acpi_irq(struct dmi_system_id *d)
1366+{
1367+ if (!acpi_force) {
1368+ printk(KERN_NOTICE "%s detected: force use of acpi=noirq\n",
1369+ d->ident);
1370+ acpi_noirq_set();
1371+ }
1372+ return 0;
1373+}
1374+
1375+static int __init disable_acpi_pci(struct dmi_system_id *d)
1376+{
1377+ if (!acpi_force) {
1378+ printk(KERN_NOTICE "%s detected: force use of pci=noacpi\n",
1379+ d->ident);
1380+ acpi_disable_pci();
1381+ }
1382+ return 0;
1383+}
1384+
1385+static int __init dmi_disable_acpi(struct dmi_system_id *d)
1386+{
1387+ if (!acpi_force) {
1388+ printk(KERN_NOTICE "%s detected: acpi off\n", d->ident);
1389+ disable_acpi();
1390+ } else {
1391+ printk(KERN_NOTICE
1392+ "Warning: DMI blacklist says broken, but acpi forced\n");
1393+ }
1394+ return 0;
1395+}
1396+
1397+/*
1398+ * Limit ACPI to CPU enumeration for HT
1399+ */
1400+static int __init force_acpi_ht(struct dmi_system_id *d)
1401+{
1402+ if (!acpi_force) {
1403+ printk(KERN_NOTICE "%s detected: force use of acpi=ht\n",
1404+ d->ident);
1405+ disable_acpi();
1406+ acpi_ht = 1;
1407+ } else {
1408+ printk(KERN_NOTICE
1409+ "Warning: acpi=force overrules DMI blacklist: acpi=ht\n");
1410+ }
1411+ return 0;
1412+}
1413+
1414+/*
1415+ * If your system is blacklisted here, but you find that acpi=force
1416+ * works for you, please contact acpi-devel@sourceforge.net
1417+ */
1418+static struct dmi_system_id __initdata acpi_dmi_table[] = {
1419+ /*
1420+ * Boxes that need ACPI disabled
1421+ */
1422+ {
1423+ .callback = dmi_disable_acpi,
1424+ .ident = "IBM Thinkpad",
1425+ .matches = {
1426+ DMI_MATCH(DMI_BOARD_VENDOR, "IBM"),
1427+ DMI_MATCH(DMI_BOARD_NAME, "2629H1G"),
1428+ },
1429+ },
1430+
1431+ /*
1432+ * Boxes that need acpi=ht
1433+ */
1434+ {
1435+ .callback = force_acpi_ht,
1436+ .ident = "FSC Primergy T850",
1437+ .matches = {
1438+ DMI_MATCH(DMI_SYS_VENDOR, "FUJITSU SIEMENS"),
1439+ DMI_MATCH(DMI_PRODUCT_NAME, "PRIMERGY T850"),
1440+ },
1441+ },
1442+ {
1443+ .callback = force_acpi_ht,
1444+ .ident = "DELL GX240",
1445+ .matches = {
1446+ DMI_MATCH(DMI_BOARD_VENDOR, "Dell Computer Corporation"),
1447+ DMI_MATCH(DMI_BOARD_NAME, "OptiPlex GX240"),
1448+ },
1449+ },
1450+ {
1451+ .callback = force_acpi_ht,
1452+ .ident = "HP VISUALIZE NT Workstation",
1453+ .matches = {
1454+ DMI_MATCH(DMI_BOARD_VENDOR, "Hewlett-Packard"),
1455+ DMI_MATCH(DMI_PRODUCT_NAME, "HP VISUALIZE NT Workstation"),
1456+ },
1457+ },
1458+ {
1459+ .callback = force_acpi_ht,
1460+ .ident = "Compaq Workstation W8000",
1461+ .matches = {
1462+ DMI_MATCH(DMI_SYS_VENDOR, "Compaq"),
1463+ DMI_MATCH(DMI_PRODUCT_NAME, "Workstation W8000"),
1464+ },
1465+ },
1466+ {
1467+ .callback = force_acpi_ht,
1468+ .ident = "ASUS P4B266",
1469+ .matches = {
1470+ DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."),
1471+ DMI_MATCH(DMI_BOARD_NAME, "P4B266"),
1472+ },
1473+ },
1474+ {
1475+ .callback = force_acpi_ht,
1476+ .ident = "ASUS P2B-DS",
1477+ .matches = {
1478+ DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."),
1479+ DMI_MATCH(DMI_BOARD_NAME, "P2B-DS"),
1480+ },
1481+ },
1482+ {
1483+ .callback = force_acpi_ht,
1484+ .ident = "ASUS CUR-DLS",
1485+ .matches = {
1486+ DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."),
1487+ DMI_MATCH(DMI_BOARD_NAME, "CUR-DLS"),
1488+ },
1489+ },
1490+ {
1491+ .callback = force_acpi_ht,
1492+ .ident = "ABIT i440BX-W83977",
1493+ .matches = {
1494+ DMI_MATCH(DMI_BOARD_VENDOR, "ABIT <http://www.abit.com>"),
1495+ DMI_MATCH(DMI_BOARD_NAME, "i440BX-W83977 (BP6)"),
1496+ },
1497+ },
1498+ {
1499+ .callback = force_acpi_ht,
1500+ .ident = "IBM Bladecenter",
1501+ .matches = {
1502+ DMI_MATCH(DMI_BOARD_VENDOR, "IBM"),
1503+ DMI_MATCH(DMI_BOARD_NAME, "IBM eServer BladeCenter HS20"),
1504+ },
1505+ },
1506+ {
1507+ .callback = force_acpi_ht,
1508+ .ident = "IBM eServer xSeries 360",
1509+ .matches = {
1510+ DMI_MATCH(DMI_BOARD_VENDOR, "IBM"),
1511+ DMI_MATCH(DMI_BOARD_NAME, "eServer xSeries 360"),
1512+ },
1513+ },
1514+ {
1515+ .callback = force_acpi_ht,
1516+ .ident = "IBM eserver xSeries 330",
1517+ .matches = {
1518+ DMI_MATCH(DMI_BOARD_VENDOR, "IBM"),
1519+ DMI_MATCH(DMI_BOARD_NAME, "eserver xSeries 330"),
1520+ },
1521+ },
1522+ {
1523+ .callback = force_acpi_ht,
1524+ .ident = "IBM eserver xSeries 440",
1525+ .matches = {
1526+ DMI_MATCH(DMI_BOARD_VENDOR, "IBM"),
1527+ DMI_MATCH(DMI_PRODUCT_NAME, "eserver xSeries 440"),
1528+ },
1529+ },
1530+
1531+ /*
1532+ * Boxes that need ACPI PCI IRQ routing disabled
1533+ */
1534+ {
1535+ .callback = disable_acpi_irq,
1536+ .ident = "ASUS A7V",
1537+ .matches = {
1538+ DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC"),
1539+ DMI_MATCH(DMI_BOARD_NAME, "<A7V>"),
1540+ /* newer BIOS, Revision 1011, does work */
1541+ DMI_MATCH(DMI_BIOS_VERSION,
1542+ "ASUS A7V ACPI BIOS Revision 1007"),
1543+ },
1544+ },
1545+
1546+ /*
1547+ * Boxes that need ACPI PCI IRQ routing and PCI scan disabled
1548+ */
1549+ { /* _BBN 0 bug */
1550+ .callback = disable_acpi_pci,
1551+ .ident = "ASUS PR-DLS",
1552+ .matches = {
1553+ DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."),
1554+ DMI_MATCH(DMI_BOARD_NAME, "PR-DLS"),
1555+ DMI_MATCH(DMI_BIOS_VERSION,
1556+ "ASUS PR-DLS ACPI BIOS Revision 1010"),
1557+ DMI_MATCH(DMI_BIOS_DATE, "03/21/2003")
1558+ },
1559+ },
1560+ {
1561+ .callback = disable_acpi_pci,
1562+ .ident = "Acer TravelMate 36x Laptop",
1563+ .matches = {
1564+ DMI_MATCH(DMI_SYS_VENDOR, "Acer"),
1565+ DMI_MATCH(DMI_PRODUCT_NAME, "TravelMate 360"),
1566+ },
1567+ },
1568+ {}
1569+};
1570+
1571+#endif /* __i386__ */
1572+
1573+/*
1574+ * acpi_boot_table_init() and acpi_boot_init()
1575+ * called from setup_arch(), always.
1576+ * 1. checksums all tables
1577+ * 2. enumerates lapics
1578+ * 3. enumerates io-apics
1579+ *
1580+ * acpi_table_init() is separate to allow reading SRAT without
1581+ * other side effects.
1582+ *
1583+ * side effects of acpi_boot_init:
1584+ * acpi_lapic = 1 if LAPIC found
1585+ * acpi_ioapic = 1 if IOAPIC found
1586+ * if (acpi_lapic && acpi_ioapic) smp_found_config = 1;
1587+ * if acpi_blacklisted() acpi_disabled = 1;
1588+ * acpi_irq_model=...
1589+ * ...
1590+ *
1591+ * return value: (currently ignored)
1592+ * 0: success
1593+ * !0: failure
1594+ */
1595+
1596+int __init acpi_boot_table_init(void)
1597+{
1598+ int error;
1599+
1600+#ifdef __i386__
1601+ dmi_check_system(acpi_dmi_table);
1602+#endif
1603+
1604+ /*
1605+ * If acpi_disabled, bail out
1606+ * One exception: acpi=ht continues far enough to enumerate LAPICs
1607+ */
1608+ if (acpi_disabled && !acpi_ht)
1609+ return 1;
1610+
1611+ /*
1612+ * Initialize the ACPI boot-time table parser.
1613+ */
1614+ error = acpi_table_init();
1615+ if (error) {
1616+ disable_acpi();
1617+ return error;
1618+ }
1619+
1620+ acpi_table_parse(ACPI_BOOT, acpi_parse_sbf);
1621+
1622+ /*
1623+ * blacklist may disable ACPI entirely
1624+ */
1625+ error = acpi_blacklisted();
1626+ if (error) {
1627+ if (acpi_force) {
1628+ printk(KERN_WARNING PREFIX "acpi=force override\n");
1629+ } else {
1630+ printk(KERN_WARNING PREFIX "Disabling ACPI support\n");
1631+ disable_acpi();
1632+ return error;
1633+ }
1634+ }
1635+
1636+ return 0;
1637+}
1638+
1639+int __init acpi_boot_init(void)
1640+{
1641+ /*
1642+ * If acpi_disabled, bail out
1643+ * One exception: acpi=ht continues far enough to enumerate LAPICs
1644+ */
1645+ if (acpi_disabled && !acpi_ht)
1646+ return 1;
1647+
1648+ acpi_table_parse(ACPI_BOOT, acpi_parse_sbf);
1649+
1650+ /*
1651+ * set sci_int and PM timer address
1652+ */
1653+ acpi_table_parse(ACPI_FADT, acpi_parse_fadt);
1654+
1655+ /*
1656+ * Process the Multiple APIC Description Table (MADT), if present
1657+ */
1658+ acpi_process_madt();
1659+
1660+ acpi_table_parse(ACPI_HPET, acpi_parse_hpet);
1661+
1662+ return 0;
1663+}
1664diff -Nur linux-2.6.16.33-noxen/arch/i386/kernel/apic-xen.c linux-2.6.16.33/arch/i386/kernel/apic-xen.c
1665--- linux-2.6.16.33-noxen/arch/i386/kernel/apic-xen.c 1970-01-01 00:00:00.000000000 +0000
1666+++ linux-2.6.16.33/arch/i386/kernel/apic-xen.c 2007-01-08 15:00:45.000000000 +0000
1667@@ -0,0 +1,140 @@
1668+/*
1669+ * Local APIC handling, local APIC timers
1670+ *
1671+ * (c) 1999, 2000 Ingo Molnar <mingo@redhat.com>
1672+ *
1673+ * Fixes
1674+ * Maciej W. Rozycki : Bits for genuine 82489DX APICs;
1675+ * thanks to Eric Gilmore
1676+ * and Rolf G. Tews
1677+ * for testing these extensively.
1678+ * Maciej W. Rozycki : Various updates and fixes.
1679+ * Mikael Pettersson : Power Management for UP-APIC.
1680+ * Pavel Machek and
1681+ * Mikael Pettersson : PM converted to driver model.
1682+ */
1683+
1684+#include <linux/config.h>
1685+#include <linux/init.h>
1686+
1687+#include <linux/mm.h>
1688+#include <linux/delay.h>
1689+#include <linux/bootmem.h>
1690+#include <linux/smp_lock.h>
1691+#include <linux/interrupt.h>
1692+#include <linux/mc146818rtc.h>
1693+#include <linux/kernel_stat.h>
1694+#include <linux/sysdev.h>
1695+#include <linux/cpu.h>
1696+#include <linux/module.h>
1697+
1698+#include <asm/atomic.h>
1699+#include <asm/smp.h>
1700+#include <asm/mtrr.h>
1701+#include <asm/mpspec.h>
1702+#include <asm/desc.h>
1703+#include <asm/arch_hooks.h>
1704+#include <asm/hpet.h>
1705+#include <asm/i8253.h>
1706+
1707+#include <mach_apic.h>
1708+#include <mach_ipi.h>
1709+
1710+#include "io_ports.h"
1711+
1712+#ifndef CONFIG_XEN
1713+/*
1714+ * cpu_mask that denotes the CPUs that needs timer interrupt coming in as
1715+ * IPIs in place of local APIC timers
1716+ */
1717+static cpumask_t timer_bcast_ipi;
1718+#endif
1719+
1720+/*
1721+ * Knob to control our willingness to enable the local APIC.
1722+ */
1723+int enable_local_apic __initdata = 0; /* -1=force-disable, +1=force-enable */
1724+
1725+/*
1726+ * Debug level
1727+ */
1728+int apic_verbosity;
1729+
1730+/*
1731+ * 'what should we do if we get a hw irq event on an illegal vector'.
1732+ * each architecture has to answer this themselves.
1733+ */
1734+void ack_bad_irq(unsigned int irq)
1735+{
1736+ printk("unexpected IRQ trap at vector %02x\n", irq);
1737+ /*
1738+ * Currently unexpected vectors happen only on SMP and APIC.
1739+ * We _must_ ack these because every local APIC has only N
1740+ * irq slots per priority level, and a 'hanging, unacked' IRQ
1741+ * holds up an irq slot - in excessive cases (when multiple
1742+ * unexpected vectors occur) that might lock up the APIC
1743+ * completely.
1744+ * But only ack when the APIC is enabled -AK
1745+ */
1746+ if (cpu_has_apic)
1747+ ack_APIC_irq();
1748+}
1749+
1750+int get_physical_broadcast(void)
1751+{
1752+ return 0xff;
1753+}
1754+
1755+#ifndef CONFIG_XEN
1756+#ifndef CONFIG_SMP
1757+static void up_apic_timer_interrupt_call(struct pt_regs *regs)
1758+{
1759+ int cpu = smp_processor_id();
1760+
1761+ /*
1762+ * the NMI deadlock-detector uses this.
1763+ */
1764+ per_cpu(irq_stat, cpu).apic_timer_irqs++;
1765+
1766+ smp_local_timer_interrupt(regs);
1767+}
1768+#endif
1769+
1770+void smp_send_timer_broadcast_ipi(struct pt_regs *regs)
1771+{
1772+ cpumask_t mask;
1773+
1774+ cpus_and(mask, cpu_online_map, timer_bcast_ipi);
1775+ if (!cpus_empty(mask)) {
1776+#ifdef CONFIG_SMP
1777+ send_IPI_mask(mask, LOCAL_TIMER_VECTOR);
1778+#else
1779+ /*
1780+ * We can directly call the apic timer interrupt handler
1781+ * in UP case. Minus all irq related functions
1782+ */
1783+ up_apic_timer_interrupt_call(regs);
1784+#endif
1785+ }
1786+}
1787+#endif
1788+
1789+int setup_profiling_timer(unsigned int multiplier)
1790+{
1791+ return -EINVAL;
1792+}
1793+
1794+/*
1795+ * This initializes the IO-APIC and APIC hardware if this is
1796+ * a UP kernel.
1797+ */
1798+int __init APIC_init_uniprocessor (void)
1799+{
1800+#ifdef CONFIG_X86_IO_APIC
1801+ if (smp_found_config)
1802+ if (!skip_ioapic_setup && nr_ioapics)
1803+ setup_IO_APIC();
1804+#endif
1805+
1806+ return 0;
1807+}
1808diff -Nur linux-2.6.16.33-noxen/arch/i386/kernel/asm-offsets.c linux-2.6.16.33/arch/i386/kernel/asm-offsets.c
1809--- linux-2.6.16.33-noxen/arch/i386/kernel/asm-offsets.c 2006-11-22 18:06:31.000000000 +0000
1810+++ linux-2.6.16.33/arch/i386/kernel/asm-offsets.c 2007-01-08 15:00:45.000000000 +0000
1811@@ -13,6 +13,7 @@
1812 #include <asm/fixmap.h>
1813 #include <asm/processor.h>
1814 #include <asm/thread_info.h>
1815+#include <asm/elf.h>
1816
1817 #define DEFINE(sym, val) \
1818 asm volatile("\n->" #sym " %0 " #val : : "i" (val))
1819@@ -63,10 +64,15 @@
1820 OFFSET(pbe_orig_address, pbe, orig_address);
1821 OFFSET(pbe_next, pbe, next);
1822
1823+#ifndef CONFIG_X86_NO_TSS
1824 /* Offset from the sysenter stack to tss.esp0 */
1825- DEFINE(TSS_sysenter_esp0, offsetof(struct tss_struct, esp0) -
1826+ DEFINE(SYSENTER_stack_esp0, offsetof(struct tss_struct, esp0) -
1827 sizeof(struct tss_struct));
1828+#else
1829+ /* sysenter stack points directly to esp0 */
1830+ DEFINE(SYSENTER_stack_esp0, 0);
1831+#endif
1832
1833 DEFINE(PAGE_SIZE_asm, PAGE_SIZE);
1834- DEFINE(VSYSCALL_BASE, __fix_to_virt(FIX_VSYSCALL));
1835+ DEFINE(VSYSCALL_BASE, VSYSCALL_BASE);
1836 }
1837diff -Nur linux-2.6.16.33-noxen/arch/i386/kernel/cpu/Makefile linux-2.6.16.33/arch/i386/kernel/cpu/Makefile
1838--- linux-2.6.16.33-noxen/arch/i386/kernel/cpu/Makefile 2006-11-22 18:06:31.000000000 +0000
1839+++ linux-2.6.16.33/arch/i386/kernel/cpu/Makefile 2007-01-08 15:00:45.000000000 +0000
1840@@ -17,3 +17,8 @@
1841
1842 obj-$(CONFIG_MTRR) += mtrr/
1843 obj-$(CONFIG_CPU_FREQ) += cpufreq/
1844+
1845+ifdef CONFIG_XEN
1846+include $(srctree)/scripts/Makefile.xen
1847+obj-y := $(call cherrypickxen, $(obj-y), $(src))
1848+endif
1849diff -Nur linux-2.6.16.33-noxen/arch/i386/kernel/cpu/common-xen.c linux-2.6.16.33/arch/i386/kernel/cpu/common-xen.c
1850--- linux-2.6.16.33-noxen/arch/i386/kernel/cpu/common-xen.c 1970-01-01 00:00:00.000000000 +0000
1851+++ linux-2.6.16.33/arch/i386/kernel/cpu/common-xen.c 2007-01-08 15:00:45.000000000 +0000
1852@@ -0,0 +1,715 @@
1853+#include <linux/init.h>
1854+#include <linux/string.h>
1855+#include <linux/delay.h>
1856+#include <linux/smp.h>
1857+#include <linux/module.h>
1858+#include <linux/percpu.h>
1859+#include <linux/bootmem.h>
1860+#include <asm/semaphore.h>
1861+#include <asm/processor.h>
1862+#include <asm/i387.h>
1863+#include <asm/msr.h>
1864+#include <asm/io.h>
1865+#include <asm/mmu_context.h>
1866+#ifdef CONFIG_X86_LOCAL_APIC
1867+#include <asm/mpspec.h>
1868+#include <asm/apic.h>
1869+#include <mach_apic.h>
1870+#endif
1871+#include <asm/hypervisor.h>
1872+
1873+#include "cpu.h"
1874+
1875+DEFINE_PER_CPU(struct Xgt_desc_struct, cpu_gdt_descr);
1876+EXPORT_PER_CPU_SYMBOL(cpu_gdt_descr);
1877+
1878+#ifndef CONFIG_XEN
1879+DEFINE_PER_CPU(unsigned char, cpu_16bit_stack[CPU_16BIT_STACK_SIZE]);
1880+EXPORT_PER_CPU_SYMBOL(cpu_16bit_stack);
1881+#endif
1882+
1883+static int cachesize_override __devinitdata = -1;
1884+static int disable_x86_fxsr __devinitdata = 0;
1885+static int disable_x86_serial_nr __devinitdata = 1;
1886+
1887+struct cpu_dev * cpu_devs[X86_VENDOR_NUM] = {};
1888+
1889+extern int disable_pse;
1890+
1891+static void default_init(struct cpuinfo_x86 * c)
1892+{
1893+ /* Not much we can do here... */
1894+ /* Check if at least it has cpuid */
1895+ if (c->cpuid_level == -1) {
1896+ /* No cpuid. It must be an ancient CPU */
1897+ if (c->x86 == 4)
1898+ strcpy(c->x86_model_id, "486");
1899+ else if (c->x86 == 3)
1900+ strcpy(c->x86_model_id, "386");
1901+ }
1902+}
1903+
1904+static struct cpu_dev default_cpu = {
1905+ .c_init = default_init,
1906+ .c_vendor = "Unknown",
1907+};
1908+static struct cpu_dev * this_cpu = &default_cpu;
1909+
1910+static int __init cachesize_setup(char *str)
1911+{
1912+ get_option (&str, &cachesize_override);
1913+ return 1;
1914+}
1915+__setup("cachesize=", cachesize_setup);
1916+
1917+int __devinit get_model_name(struct cpuinfo_x86 *c)
1918+{
1919+ unsigned int *v;
1920+ char *p, *q;
1921+
1922+ if (cpuid_eax(0x80000000) < 0x80000004)
1923+ return 0;
1924+
1925+ v = (unsigned int *) c->x86_model_id;
1926+ cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]);
1927+ cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]);
1928+ cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]);
1929+ c->x86_model_id[48] = 0;
1930+
1931+ /* Intel chips right-justify this string for some dumb reason;
1932+ undo that brain damage */
1933+ p = q = &c->x86_model_id[0];
1934+ while ( *p == ' ' )
1935+ p++;
1936+ if ( p != q ) {
1937+ while ( *p )
1938+ *q++ = *p++;
1939+ while ( q <= &c->x86_model_id[48] )
1940+ *q++ = '\0'; /* Zero-pad the rest */
1941+ }
1942+
1943+ return 1;
1944+}
1945+
1946+
1947+void __devinit display_cacheinfo(struct cpuinfo_x86 *c)
1948+{
1949+ unsigned int n, dummy, ecx, edx, l2size;
1950+
1951+ n = cpuid_eax(0x80000000);
1952+
1953+ if (n >= 0x80000005) {
1954+ cpuid(0x80000005, &dummy, &dummy, &ecx, &edx);
1955+ printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n",
1956+ edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
1957+ c->x86_cache_size=(ecx>>24)+(edx>>24);
1958+ }
1959+
1960+	if (n < 0x80000006)	/* Some chips just have a large L1. */
1961+ return;
1962+
1963+ ecx = cpuid_ecx(0x80000006);
1964+ l2size = ecx >> 16;
1965+
1966+ /* do processor-specific cache resizing */
1967+ if (this_cpu->c_size_cache)
1968+ l2size = this_cpu->c_size_cache(c,l2size);
1969+
1970+ /* Allow user to override all this if necessary. */
1971+ if (cachesize_override != -1)
1972+ l2size = cachesize_override;
1973+
1974+ if ( l2size == 0 )
1975+ return; /* Again, no L2 cache is possible */
1976+
1977+ c->x86_cache_size = l2size;
1978+
1979+ printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n",
1980+ l2size, ecx & 0xFF);
1981+}
1982+
1983+/* Naming convention should be: <Name> [(<Codename>)] */
1984+/* This table is only used if init_<vendor>() below doesn't set the name; */
1985+/* in particular, if CPUID levels 0x80000002..4 are supported, this isn't used */
1986+
1987+/* Look up CPU names by table lookup. */
1988+static char __devinit *table_lookup_model(struct cpuinfo_x86 *c)
1989+{
1990+ struct cpu_model_info *info;
1991+
1992+ if ( c->x86_model >= 16 )
1993+ return NULL; /* Range check */
1994+
1995+ if (!this_cpu)
1996+ return NULL;
1997+
1998+ info = this_cpu->c_models;
1999+
2000+ while (info && info->family) {
2001+ if (info->family == c->x86)
2002+ return info->model_names[c->x86_model];
2003+ info++;
2004+ }
2005+ return NULL; /* Not found */
2006+}
2007+
2008+
2009+static void __devinit get_cpu_vendor(struct cpuinfo_x86 *c, int early)
2010+{
2011+ char *v = c->x86_vendor_id;
2012+ int i;
2013+ static int printed;
2014+
2015+ for (i = 0; i < X86_VENDOR_NUM; i++) {
2016+ if (cpu_devs[i]) {
2017+ if (!strcmp(v,cpu_devs[i]->c_ident[0]) ||
2018+ (cpu_devs[i]->c_ident[1] &&
2019+ !strcmp(v,cpu_devs[i]->c_ident[1]))) {
2020+ c->x86_vendor = i;
2021+ if (!early)
2022+ this_cpu = cpu_devs[i];
2023+ return;
2024+ }
2025+ }
2026+ }
2027+ if (!printed) {
2028+ printed++;
2029+ printk(KERN_ERR "CPU: Vendor unknown, using generic init.\n");
2030+ printk(KERN_ERR "CPU: Your system may be unstable.\n");
2031+ }
2032+ c->x86_vendor = X86_VENDOR_UNKNOWN;
2033+ this_cpu = &default_cpu;
2034+}
2035+
2036+
2037+static int __init x86_fxsr_setup(char * s)
2038+{
2039+ disable_x86_fxsr = 1;
2040+ return 1;
2041+}
2042+__setup("nofxsr", x86_fxsr_setup);
2043+
2044+
2045+/* Standard macro to see if a specific flag is changeable */
2046+static inline int flag_is_changeable_p(u32 flag)
2047+{
2048+ u32 f1, f2;
2049+
2050+ asm("pushfl\n\t"
2051+ "pushfl\n\t"
2052+ "popl %0\n\t"
2053+ "movl %0,%1\n\t"
2054+ "xorl %2,%0\n\t"
2055+ "pushl %0\n\t"
2056+ "popfl\n\t"
2057+ "pushfl\n\t"
2058+ "popl %0\n\t"
2059+ "popfl\n\t"
2060+ : "=&r" (f1), "=&r" (f2)
2061+ : "ir" (flag));
2062+
2063+ return ((f1^f2) & flag) != 0;
2064+}
2065+
2066+
2067+/* Probe for the CPUID instruction */
2068+static int __devinit have_cpuid_p(void)
2069+{
2070+ return flag_is_changeable_p(X86_EFLAGS_ID);
2071+}
2072+
2073+/* Do minimum CPU detection early.
2074+ Fields really needed: vendor, cpuid_level, family, model, mask, cache alignment.
2075+ The others are not touched to avoid unwanted side effects.
2076+
2077+ WARNING: this function is only called on the BP. Don't add code here
2078+ that is supposed to run on all CPUs. */
2079+static void __init early_cpu_detect(void)
2080+{
2081+ struct cpuinfo_x86 *c = &boot_cpu_data;
2082+
2083+ c->x86_cache_alignment = 32;
2084+
2085+ if (!have_cpuid_p())
2086+ return;
2087+
2088+ /* Get vendor name */
2089+ cpuid(0x00000000, &c->cpuid_level,
2090+ (int *)&c->x86_vendor_id[0],
2091+ (int *)&c->x86_vendor_id[8],
2092+ (int *)&c->x86_vendor_id[4]);
2093+
2094+ get_cpu_vendor(c, 1);
2095+
2096+ c->x86 = 4;
2097+ if (c->cpuid_level >= 0x00000001) {
2098+ u32 junk, tfms, cap0, misc;
2099+ cpuid(0x00000001, &tfms, &misc, &junk, &cap0);
2100+ c->x86 = (tfms >> 8) & 15;
2101+ c->x86_model = (tfms >> 4) & 15;
2102+ if (c->x86 == 0xf)
2103+ c->x86 += (tfms >> 20) & 0xff;
2104+ if (c->x86 >= 0x6)
2105+ c->x86_model += ((tfms >> 16) & 0xF) << 4;
2106+ c->x86_mask = tfms & 15;
2107+ if (cap0 & (1<<19))
2108+ c->x86_cache_alignment = ((misc >> 8) & 0xff) * 8;
2109+ }
2110+}
2111+
2112+void __devinit generic_identify(struct cpuinfo_x86 * c)
2113+{
2114+ u32 tfms, xlvl;
2115+ int junk;
2116+
2117+ if (have_cpuid_p()) {
2118+ /* Get vendor name */
2119+ cpuid(0x00000000, &c->cpuid_level,
2120+ (int *)&c->x86_vendor_id[0],
2121+ (int *)&c->x86_vendor_id[8],
2122+ (int *)&c->x86_vendor_id[4]);
2123+
2124+ get_cpu_vendor(c, 0);
2125+ /* Initialize the standard set of capabilities */
2126+ /* Note that the vendor-specific code below might override */
2127+
2128+ /* Intel-defined flags: level 0x00000001 */
2129+ if ( c->cpuid_level >= 0x00000001 ) {
2130+ u32 capability, excap;
2131+ cpuid(0x00000001, &tfms, &junk, &excap, &capability);
2132+ c->x86_capability[0] = capability;
2133+ c->x86_capability[4] = excap;
2134+ c->x86 = (tfms >> 8) & 15;
2135+ c->x86_model = (tfms >> 4) & 15;
2136+ if (c->x86 == 0xf)
2137+ c->x86 += (tfms >> 20) & 0xff;
2138+ if (c->x86 >= 0x6)
2139+ c->x86_model += ((tfms >> 16) & 0xF) << 4;
2140+ c->x86_mask = tfms & 15;
2141+ } else {
2142+ /* Have CPUID level 0 only - unheard of */
2143+ c->x86 = 4;
2144+ }
2145+
2146+ /* AMD-defined flags: level 0x80000001 */
2147+ xlvl = cpuid_eax(0x80000000);
2148+ if ( (xlvl & 0xffff0000) == 0x80000000 ) {
2149+ if ( xlvl >= 0x80000001 ) {
2150+ c->x86_capability[1] = cpuid_edx(0x80000001);
2151+ c->x86_capability[6] = cpuid_ecx(0x80000001);
2152+ }
2153+ if ( xlvl >= 0x80000004 )
2154+ get_model_name(c); /* Default name */
2155+ }
2156+ }
2157+
2158+ early_intel_workaround(c);
2159+
2160+#ifdef CONFIG_X86_HT
2161+ phys_proc_id[smp_processor_id()] = (cpuid_ebx(1) >> 24) & 0xff;
2162+#endif
2163+}
2164+
2165+static void __devinit squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
2166+{
2167+ if (cpu_has(c, X86_FEATURE_PN) && disable_x86_serial_nr ) {
2168+ /* Disable processor serial number */
2169+ unsigned long lo,hi;
2170+ rdmsr(MSR_IA32_BBL_CR_CTL,lo,hi);
2171+ lo |= 0x200000;
2172+ wrmsr(MSR_IA32_BBL_CR_CTL,lo,hi);
2173+ printk(KERN_NOTICE "CPU serial number disabled.\n");
2174+ clear_bit(X86_FEATURE_PN, c->x86_capability);
2175+
2176+ /* Disabling the serial number may affect the cpuid level */
2177+ c->cpuid_level = cpuid_eax(0);
2178+ }
2179+}
2180+
2181+static int __init x86_serial_nr_setup(char *s)
2182+{
2183+ disable_x86_serial_nr = 0;
2184+ return 1;
2185+}
2186+__setup("serialnumber", x86_serial_nr_setup);
2187+
2188+
2189+
2190+/*
2191+ * This does the hard work of actually picking apart the CPU stuff...
2192+ */
2193+void __devinit identify_cpu(struct cpuinfo_x86 *c)
2194+{
2195+ int i;
2196+
2197+ c->loops_per_jiffy = loops_per_jiffy;
2198+ c->x86_cache_size = -1;
2199+ c->x86_vendor = X86_VENDOR_UNKNOWN;
2200+ c->cpuid_level = -1; /* CPUID not detected */
2201+ c->x86_model = c->x86_mask = 0; /* So far unknown... */
2202+ c->x86_vendor_id[0] = '\0'; /* Unset */
2203+ c->x86_model_id[0] = '\0'; /* Unset */
2204+ c->x86_max_cores = 1;
2205+ memset(&c->x86_capability, 0, sizeof c->x86_capability);
2206+
2207+ if (!have_cpuid_p()) {
2208+ /* First of all, decide if this is a 486 or higher */
2209+ /* It's a 486 if we can modify the AC flag */
2210+ if ( flag_is_changeable_p(X86_EFLAGS_AC) )
2211+ c->x86 = 4;
2212+ else
2213+ c->x86 = 3;
2214+ }
2215+
2216+ generic_identify(c);
2217+
2218+ printk(KERN_DEBUG "CPU: After generic identify, caps:");
2219+ for (i = 0; i < NCAPINTS; i++)
2220+ printk(" %08lx", c->x86_capability[i]);
2221+ printk("\n");
2222+
2223+ if (this_cpu->c_identify) {
2224+ this_cpu->c_identify(c);
2225+
2226+ printk(KERN_DEBUG "CPU: After vendor identify, caps:");
2227+ for (i = 0; i < NCAPINTS; i++)
2228+ printk(" %08lx", c->x86_capability[i]);
2229+ printk("\n");
2230+ }
2231+
2232+ /*
2233+ * Vendor-specific initialization. In this section we
2234+ * canonicalize the feature flags, meaning if there are
2235+ * features a certain CPU supports which CPUID doesn't
2236+ * tell us, CPUID claiming incorrect flags, or other bugs,
2237+ * we handle them here.
2238+ *
2239+ * At the end of this section, c->x86_capability better
2240+ * indicate the features this CPU genuinely supports!
2241+ */
2242+ if (this_cpu->c_init)
2243+ this_cpu->c_init(c);
2244+
2245+ /* Disable the PN if appropriate */
2246+ squash_the_stupid_serial_number(c);
2247+
2248+ /*
2249+ * The vendor-specific functions might have changed features. Now
2250+ * we do "generic changes."
2251+ */
2252+
2253+ /* TSC disabled? */
2254+ if ( tsc_disable )
2255+ clear_bit(X86_FEATURE_TSC, c->x86_capability);
2256+
2257+ /* FXSR disabled? */
2258+ if (disable_x86_fxsr) {
2259+ clear_bit(X86_FEATURE_FXSR, c->x86_capability);
2260+ clear_bit(X86_FEATURE_XMM, c->x86_capability);
2261+ }
2262+
2263+ if (disable_pse)
2264+ clear_bit(X86_FEATURE_PSE, c->x86_capability);
2265+
2266+ /* If the model name is still unset, do table lookup. */
2267+ if ( !c->x86_model_id[0] ) {
2268+ char *p;
2269+ p = table_lookup_model(c);
2270+ if ( p )
2271+ strcpy(c->x86_model_id, p);
2272+ else
2273+ /* Last resort... */
2274+ sprintf(c->x86_model_id, "%02x/%02x",
2275+ c->x86_vendor, c->x86_model);
2276+ }
2277+
2278+ /* Now the feature flags better reflect actual CPU features! */
2279+
2280+ printk(KERN_DEBUG "CPU: After all inits, caps:");
2281+ for (i = 0; i < NCAPINTS; i++)
2282+ printk(" %08lx", c->x86_capability[i]);
2283+ printk("\n");
2284+
2285+ /*
2286+ * On SMP, boot_cpu_data holds the common feature set between
2287+ * all CPUs; so make sure that we indicate which features are
2288+ * common between the CPUs. The first time this routine gets
2289+ * executed, c == &boot_cpu_data.
2290+ */
2291+ if ( c != &boot_cpu_data ) {
2292+ /* AND the already accumulated flags with these */
2293+ for ( i = 0 ; i < NCAPINTS ; i++ )
2294+ boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
2295+ }
2296+
2297+ /* Init Machine Check Exception if available. */
2298+ mcheck_init(c);
2299+
2300+ if (c == &boot_cpu_data)
2301+ sysenter_setup();
2302+ enable_sep_cpu();
2303+
2304+ if (c == &boot_cpu_data)
2305+ mtrr_bp_init();
2306+ else
2307+ mtrr_ap_init();
2308+}
2309+
2310+#ifdef CONFIG_X86_HT
2311+void __devinit detect_ht(struct cpuinfo_x86 *c)
2312+{
2313+ u32 eax, ebx, ecx, edx;
2314+ int index_msb, core_bits;
2315+ int cpu = smp_processor_id();
2316+
2317+ cpuid(1, &eax, &ebx, &ecx, &edx);
2318+
2319+ c->apicid = phys_pkg_id((ebx >> 24) & 0xFF, 0);
2320+
2321+ if (!cpu_has(c, X86_FEATURE_HT) || cpu_has(c, X86_FEATURE_CMP_LEGACY))
2322+ return;
2323+
2324+ smp_num_siblings = (ebx & 0xff0000) >> 16;
2325+
2326+ if (smp_num_siblings == 1) {
2327+ printk(KERN_INFO "CPU: Hyper-Threading is disabled\n");
2328+ } else if (smp_num_siblings > 1 ) {
2329+
2330+ if (smp_num_siblings > NR_CPUS) {
2331+			printk(KERN_WARNING "CPU: Unsupported number of siblings %d", smp_num_siblings);
2332+ smp_num_siblings = 1;
2333+ return;
2334+ }
2335+
2336+ index_msb = get_count_order(smp_num_siblings);
2337+ phys_proc_id[cpu] = phys_pkg_id((ebx >> 24) & 0xFF, index_msb);
2338+
2339+ printk(KERN_INFO "CPU: Physical Processor ID: %d\n",
2340+ phys_proc_id[cpu]);
2341+
2342+ smp_num_siblings = smp_num_siblings / c->x86_max_cores;
2343+
2344+ index_msb = get_count_order(smp_num_siblings) ;
2345+
2346+ core_bits = get_count_order(c->x86_max_cores);
2347+
2348+ cpu_core_id[cpu] = phys_pkg_id((ebx >> 24) & 0xFF, index_msb) &
2349+ ((1 << core_bits) - 1);
2350+
2351+ if (c->x86_max_cores > 1)
2352+ printk(KERN_INFO "CPU: Processor Core ID: %d\n",
2353+ cpu_core_id[cpu]);
2354+ }
2355+}
2356+#endif
2357+
2358+void __devinit print_cpu_info(struct cpuinfo_x86 *c)
2359+{
2360+ char *vendor = NULL;
2361+
2362+ if (c->x86_vendor < X86_VENDOR_NUM)
2363+ vendor = this_cpu->c_vendor;
2364+ else if (c->cpuid_level >= 0)
2365+ vendor = c->x86_vendor_id;
2366+
2367+ if (vendor && strncmp(c->x86_model_id, vendor, strlen(vendor)))
2368+ printk("%s ", vendor);
2369+
2370+ if (!c->x86_model_id[0])
2371+ printk("%d86", c->x86);
2372+ else
2373+ printk("%s", c->x86_model_id);
2374+
2375+ if (c->x86_mask || c->cpuid_level >= 0)
2376+ printk(" stepping %02x\n", c->x86_mask);
2377+ else
2378+ printk("\n");
2379+}
2380+
2381+cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
2382+
2383+/* This is hacky. :)
2384+ * We're emulating future behavior.
2385+ * In the future, the cpu-specific init functions will be called implicitly
2386+ * via the magic of initcalls.
2387+ * They will insert themselves into the cpu_devs structure.
2388+ * Then, when cpu_init() is called, we can just iterate over that array.
2389+ */
2390+
2391+extern int intel_cpu_init(void);
2392+extern int cyrix_init_cpu(void);
2393+extern int nsc_init_cpu(void);
2394+extern int amd_init_cpu(void);
2395+extern int centaur_init_cpu(void);
2396+extern int transmeta_init_cpu(void);
2397+extern int rise_init_cpu(void);
2398+extern int nexgen_init_cpu(void);
2399+extern int umc_init_cpu(void);
2400+
2401+void __init early_cpu_init(void)
2402+{
2403+ intel_cpu_init();
2404+ cyrix_init_cpu();
2405+ nsc_init_cpu();
2406+ amd_init_cpu();
2407+ centaur_init_cpu();
2408+ transmeta_init_cpu();
2409+ rise_init_cpu();
2410+ nexgen_init_cpu();
2411+ umc_init_cpu();
2412+ early_cpu_detect();
2413+
2414+#ifdef CONFIG_DEBUG_PAGEALLOC
2415+ /* pse is not compatible with on-the-fly unmapping,
2416+ * disable it even if the cpus claim to support it.
2417+ */
2418+ clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability);
2419+ disable_pse = 1;
2420+#endif
2421+}
2422+
2423+void __cpuinit cpu_gdt_init(struct Xgt_desc_struct *gdt_descr)
2424+{
2425+ unsigned long frames[16];
2426+ unsigned long va;
2427+ int f;
2428+
2429+ for (va = gdt_descr->address, f = 0;
2430+ va < gdt_descr->address + gdt_descr->size;
2431+ va += PAGE_SIZE, f++) {
2432+ frames[f] = virt_to_mfn(va);
2433+ make_lowmem_page_readonly(
2434+ (void *)va, XENFEAT_writable_descriptor_tables);
2435+ }
2436+ if (HYPERVISOR_set_gdt(frames, gdt_descr->size / 8))
2437+ BUG();
2438+}
2439+
2440+/*
2441+ * cpu_init() initializes state that is per-CPU. Some data is already
2442+ * initialized (naturally) in the bootstrap process, such as the GDT
2443+ * and IDT. We reload them nevertheless, this function acts as a
2444+ * 'CPU state barrier', nothing should get across.
2445+ */
2446+void __cpuinit cpu_init(void)
2447+{
2448+ int cpu = smp_processor_id();
2449+#ifndef CONFIG_X86_NO_TSS
2450+ struct tss_struct * t = &per_cpu(init_tss, cpu);
2451+#endif
2452+ struct thread_struct *thread = &current->thread;
2453+ struct desc_struct *gdt;
2454+ struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
2455+
2456+ if (cpu_test_and_set(cpu, cpu_initialized)) {
2457+ printk(KERN_WARNING "CPU#%d already initialized!\n", cpu);
2458+ for (;;) local_irq_enable();
2459+ }
2460+ printk(KERN_INFO "Initializing CPU#%d\n", cpu);
2461+
2462+ if (cpu_has_vme || cpu_has_de)
2463+ clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
2464+ if (tsc_disable && cpu_has_tsc) {
2465+ printk(KERN_NOTICE "Disabling TSC...\n");
2466+ /**** FIX-HPA: DOES THIS REALLY BELONG HERE? ****/
2467+ clear_bit(X86_FEATURE_TSC, boot_cpu_data.x86_capability);
2468+ set_in_cr4(X86_CR4_TSD);
2469+ }
2470+
2471+#ifndef CONFIG_XEN
2472+ /*
2473+ * This is a horrible hack to allocate the GDT. The problem
2474+ * is that cpu_init() is called really early for the boot CPU
2475+ * (and hence needs bootmem) but much later for the secondary
2476+ * CPUs, when bootmem will have gone away
2477+ */
2478+ if (NODE_DATA(0)->bdata->node_bootmem_map) {
2479+ gdt = (struct desc_struct *)alloc_bootmem_pages(PAGE_SIZE);
2480+ /* alloc_bootmem_pages panics on failure, so no check */
2481+ memset(gdt, 0, PAGE_SIZE);
2482+ } else {
2483+ gdt = (struct desc_struct *)get_zeroed_page(GFP_KERNEL);
2484+ if (unlikely(!gdt)) {
2485+ printk(KERN_CRIT "CPU%d failed to allocate GDT\n", cpu);
2486+ for (;;)
2487+ local_irq_enable();
2488+ }
2489+ }
2490+
2491+ /*
2492+ * Initialize the per-CPU GDT with the boot GDT,
2493+ * and set up the GDT descriptor:
2494+ */
2495+ memcpy(gdt, cpu_gdt_table, GDT_SIZE);
2496+
2497+ /* Set up GDT entry for 16bit stack */
2498+ *(__u64 *)(&gdt[GDT_ENTRY_ESPFIX_SS]) |=
2499+ ((((__u64)stk16_off) << 16) & 0x000000ffffff0000ULL) |
2500+ ((((__u64)stk16_off) << 32) & 0xff00000000000000ULL) |
2501+ (CPU_16BIT_STACK_SIZE - 1);
2502+
2503+ cpu_gdt_descr->size = GDT_SIZE - 1;
2504+ cpu_gdt_descr->address = (unsigned long)gdt;
2505+#else
2506+ if (cpu == 0 && cpu_gdt_descr->address == 0) {
2507+ gdt = (struct desc_struct *)alloc_bootmem_pages(PAGE_SIZE);
2508+ /* alloc_bootmem_pages panics on failure, so no check */
2509+ memset(gdt, 0, PAGE_SIZE);
2510+
2511+ memcpy(gdt, cpu_gdt_table, GDT_SIZE);
2512+
2513+ cpu_gdt_descr->size = GDT_SIZE;
2514+ cpu_gdt_descr->address = (unsigned long)gdt;
2515+ }
2516+#endif
2517+
2518+ cpu_gdt_init(cpu_gdt_descr);
2519+
2520+ /*
2521+ * Set up and load the per-CPU TSS and LDT
2522+ */
2523+ atomic_inc(&init_mm.mm_count);
2524+ current->active_mm = &init_mm;
2525+ if (current->mm)
2526+ BUG();
2527+ enter_lazy_tlb(&init_mm, current);
2528+
2529+ load_esp0(t, thread);
2530+
2531+ load_LDT(&init_mm.context);
2532+
2533+#ifdef CONFIG_DOUBLEFAULT
2534+ /* Set up doublefault TSS pointer in the GDT */
2535+ __set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss);
2536+#endif
2537+
2538+ /* Clear %fs and %gs. */
2539+ asm volatile ("xorl %eax, %eax; movl %eax, %fs; movl %eax, %gs");
2540+
2541+ /* Clear all 6 debug registers: */
2542+ set_debugreg(0, 0);
2543+ set_debugreg(0, 1);
2544+ set_debugreg(0, 2);
2545+ set_debugreg(0, 3);
2546+ set_debugreg(0, 6);
2547+ set_debugreg(0, 7);
2548+
2549+ /*
2550+ * Force FPU initialization:
2551+ */
2552+ current_thread_info()->status = 0;
2553+ clear_used_math();
2554+ mxcsr_feature_mask_init();
2555+}
2556+
2557+#ifdef CONFIG_HOTPLUG_CPU
2558+void __devinit cpu_uninit(void)
2559+{
2560+ int cpu = raw_smp_processor_id();
2561+ cpu_clear(cpu, cpu_initialized);
2562+
2563+ /* lazy TLB state */
2564+ per_cpu(cpu_tlbstate, cpu).state = 0;
2565+ per_cpu(cpu_tlbstate, cpu).active_mm = &init_mm;
2566+}
2567+#endif
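The most Xen-specific part of common-xen.c is the GDT handling: a paravirtualized guest cannot simply execute lgdt on writable memory, so cpu_gdt_init() first makes every page backing the GDT read-only and then registers the table with the hypervisor by machine frame number. A condensed sketch of the two code paths -- the native half is an assumption based on 2.6.16's load_gdt(), not something this file contains:

#ifndef CONFIG_XEN
	load_gdt(cpu_gdt_descr);		/* plain lgdt on native hardware */
#else
	/* Xen only accepts descriptor tables the guest cannot write */
	make_lowmem_page_readonly((void *)va, XENFEAT_writable_descriptor_tables);
	frames[f] = virt_to_mfn(va);		/* pseudo-physical -> machine frame */
	if (HYPERVISOR_set_gdt(frames, gdt_descr->size / 8))
		BUG();				/* hypervisor rejected the GDT */
#endif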
2568diff -Nur linux-2.6.16.33-noxen/arch/i386/kernel/cpu/mtrr/Makefile linux-2.6.16.33/arch/i386/kernel/cpu/mtrr/Makefile
2569--- linux-2.6.16.33-noxen/arch/i386/kernel/cpu/mtrr/Makefile 2006-11-22 18:06:31.000000000 +0000
2570+++ linux-2.6.16.33/arch/i386/kernel/cpu/mtrr/Makefile 2007-01-08 15:00:45.000000000 +0000
2571@@ -3,3 +3,10 @@
2572 obj-y += cyrix.o
2573 obj-y += centaur.o
2574
2575+ifdef CONFIG_XEN
2576+include $(srctree)/scripts/Makefile.xen
2577+n-obj-xen := generic.o state.o amd.o cyrix.o centaur.o
2578+
2579+obj-y := $(call filterxen, $(obj-y), $(n-obj-xen))
2580+obj-y := $(call cherrypickxen, $(obj-y))
2581+endif
2582diff -Nur linux-2.6.16.33-noxen/arch/i386/kernel/cpu/mtrr/main-xen.c linux-2.6.16.33/arch/i386/kernel/cpu/mtrr/main-xen.c
2583--- linux-2.6.16.33-noxen/arch/i386/kernel/cpu/mtrr/main-xen.c 1970-01-01 00:00:00.000000000 +0000
2584+++ linux-2.6.16.33/arch/i386/kernel/cpu/mtrr/main-xen.c 2007-01-08 15:00:45.000000000 +0000
2585@@ -0,0 +1,196 @@
2586+#include <linux/init.h>
2587+#include <linux/proc_fs.h>
2588+#include <linux/ctype.h>
2589+#include <linux/module.h>
2590+#include <linux/seq_file.h>
2591+#include <asm/uaccess.h>
2592+
2593+#include <asm/mtrr.h>
2594+#include "mtrr.h"
2595+
2596+static DECLARE_MUTEX(mtrr_sem);
2597+
2598+void generic_get_mtrr(unsigned int reg, unsigned long *base,
2599+ unsigned int *size, mtrr_type * type)
2600+{
2601+ dom0_op_t op;
2602+
2603+ op.cmd = DOM0_READ_MEMTYPE;
2604+ op.u.read_memtype.reg = reg;
2605+ (void)HYPERVISOR_dom0_op(&op);
2606+
2607+ *size = op.u.read_memtype.nr_mfns;
2608+ *base = op.u.read_memtype.mfn;
2609+ *type = op.u.read_memtype.type;
2610+}
2611+
2612+struct mtrr_ops generic_mtrr_ops = {
2613+ .use_intel_if = 1,
2614+ .get = generic_get_mtrr,
2615+};
2616+
2617+struct mtrr_ops *mtrr_if = &generic_mtrr_ops;
2618+unsigned int num_var_ranges;
2619+unsigned int *usage_table;
2620+
2621+static void __init set_num_var_ranges(void)
2622+{
2623+ dom0_op_t op;
2624+
2625+ for (num_var_ranges = 0; ; num_var_ranges++) {
2626+ op.cmd = DOM0_READ_MEMTYPE;
2627+ op.u.read_memtype.reg = num_var_ranges;
2628+ if (HYPERVISOR_dom0_op(&op) != 0)
2629+ break;
2630+ }
2631+}
2632+
2633+static void __init init_table(void)
2634+{
2635+ int i, max;
2636+
2637+ max = num_var_ranges;
2638+ if ((usage_table = kmalloc(max * sizeof *usage_table, GFP_KERNEL))
2639+ == NULL) {
2640+ printk(KERN_ERR "mtrr: could not allocate\n");
2641+ return;
2642+ }
2643+ for (i = 0; i < max; i++)
2644+ usage_table[i] = 0;
2645+}
2646+
2647+int mtrr_add_page(unsigned long base, unsigned long size,
2648+ unsigned int type, char increment)
2649+{
2650+ int error;
2651+ dom0_op_t op;
2652+
2653+ down(&mtrr_sem);
2654+
2655+ op.cmd = DOM0_ADD_MEMTYPE;
2656+ op.u.add_memtype.mfn = base;
2657+ op.u.add_memtype.nr_mfns = size;
2658+ op.u.add_memtype.type = type;
2659+ error = HYPERVISOR_dom0_op(&op);
2660+ if (error) {
2661+ up(&mtrr_sem);
2662+ BUG_ON(error > 0);
2663+ return error;
2664+ }
2665+
2666+ if (increment)
2667+ ++usage_table[op.u.add_memtype.reg];
2668+
2669+ up(&mtrr_sem);
2670+
2671+ return op.u.add_memtype.reg;
2672+}
2673+
2674+static int mtrr_check(unsigned long base, unsigned long size)
2675+{
2676+ if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) {
2677+ printk(KERN_WARNING
2678+ "mtrr: size and base must be multiples of 4 kiB\n");
2679+ printk(KERN_DEBUG
2680+ "mtrr: size: 0x%lx base: 0x%lx\n", size, base);
2681+ dump_stack();
2682+ return -1;
2683+ }
2684+ return 0;
2685+}
2686+
2687+int
2688+mtrr_add(unsigned long base, unsigned long size, unsigned int type,
2689+ char increment)
2690+{
2691+ if (mtrr_check(base, size))
2692+ return -EINVAL;
2693+ return mtrr_add_page(base >> PAGE_SHIFT, size >> PAGE_SHIFT, type,
2694+ increment);
2695+}
2696+
2697+int mtrr_del_page(int reg, unsigned long base, unsigned long size)
2698+{
2699+ unsigned i;
2700+ mtrr_type ltype;
2701+ unsigned long lbase;
2702+ unsigned int lsize;
2703+ int error = -EINVAL;
2704+ dom0_op_t op;
2705+
2706+ down(&mtrr_sem);
2707+
2708+ if (reg < 0) {
2709+ /* Search for existing MTRR */
2710+ for (i = 0; i < num_var_ranges; ++i) {
2711+ mtrr_if->get(i, &lbase, &lsize, &ltype);
2712+ if (lbase == base && lsize == size) {
2713+ reg = i;
2714+ break;
2715+ }
2716+ }
2717+ if (reg < 0) {
2718+ printk(KERN_DEBUG "mtrr: no MTRR for %lx000,%lx000 found\n", base,
2719+ size);
2720+ goto out;
2721+ }
2722+ }
2723+ if (usage_table[reg] < 1) {
2724+ printk(KERN_WARNING "mtrr: reg: %d has count=0\n", reg);
2725+ goto out;
2726+ }
2727+ if (--usage_table[reg] < 1) {
2728+ op.cmd = DOM0_DEL_MEMTYPE;
2729+ op.u.del_memtype.handle = 0;
2730+ op.u.del_memtype.reg = reg;
2731+ error = HYPERVISOR_dom0_op(&op);
2732+ if (error) {
2733+ BUG_ON(error > 0);
2734+ goto out;
2735+ }
2736+ }
2737+ error = reg;
2738+ out:
2739+ up(&mtrr_sem);
2740+ return error;
2741+}
2742+
2743+int
2744+mtrr_del(int reg, unsigned long base, unsigned long size)
2745+{
2746+ if (mtrr_check(base, size))
2747+ return -EINVAL;
2748+ return mtrr_del_page(reg, base >> PAGE_SHIFT, size >> PAGE_SHIFT);
2749+}
2750+
2751+EXPORT_SYMBOL(mtrr_add);
2752+EXPORT_SYMBOL(mtrr_del);
2753+
2754+void __init mtrr_bp_init(void)
2755+{
2756+}
2757+
2758+void mtrr_ap_init(void)
2759+{
2760+}
2761+
2762+static int __init mtrr_init(void)
2763+{
2764+ struct cpuinfo_x86 *c = &boot_cpu_data;
2765+
2766+ if (!is_initial_xendomain())
2767+ return -ENODEV;
2768+
2769+ if ((!cpu_has(c, X86_FEATURE_MTRR)) &&
2770+ (!cpu_has(c, X86_FEATURE_K6_MTRR)) &&
2771+ (!cpu_has(c, X86_FEATURE_CYRIX_ARR)) &&
2772+ (!cpu_has(c, X86_FEATURE_CENTAUR_MCR)))
2773+ return -ENODEV;
2774+
2775+ set_num_var_ranges();
2776+ init_table();
2777+
2778+ return 0;
2779+}
2780+
2781+subsys_initcall(mtrr_init);
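main-xen.c keeps the usual mtrr_add()/mtrr_del() kernel API but forwards every operation to dom0_op hypercalls (DOM0_ADD_MEMTYPE, DOM0_DEL_MEMTYPE, DOM0_READ_MEMTYPE), since only Xen itself may program the MTRR MSRs; note that mtrr_init() also bails out with -ENODEV unless the kernel runs as the initial (dom0) domain. Callers are unchanged. A hedged usage sketch with hypothetical fb_base/fb_size variables:

/* e.g. a dom0 framebuffer driver marking its aperture write-combining */
int reg = mtrr_add(fb_base, fb_size, MTRR_TYPE_WRCOMB, 1);
if (reg < 0)
	printk(KERN_WARNING "fb: write-combining MTRR not set (%d)\n", reg);
/* ... and on teardown ... */
if (reg >= 0)
	mtrr_del(reg, fb_base, fb_size);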
2782diff -Nur linux-2.6.16.33-noxen/arch/i386/kernel/crash.c linux-2.6.16.33/arch/i386/kernel/crash.c
2783--- linux-2.6.16.33-noxen/arch/i386/kernel/crash.c 2006-11-22 18:06:31.000000000 +0000
2784+++ linux-2.6.16.33/arch/i386/kernel/crash.c 2007-01-08 15:00:45.000000000 +0000
2785@@ -90,6 +90,7 @@
2786 crash_save_this_cpu(regs, cpu);
2787 }
2788
2789+#ifndef CONFIG_XEN
2790 #ifdef CONFIG_SMP
2791 static atomic_t waiting_for_crash_ipi;
2792
2793@@ -158,6 +159,7 @@
2794 /* There are no cpus to shootdown */
2795 }
2796 #endif
2797+#endif /* CONFIG_XEN */
2798
2799 void machine_crash_shutdown(struct pt_regs *regs)
2800 {
2801@@ -174,10 +176,12 @@
2802
2803 /* Make a note of crashing cpu. Will be used in NMI callback.*/
2804 crashing_cpu = smp_processor_id();
2805+#ifndef CONFIG_XEN
2806 nmi_shootdown_cpus();
2807 lapic_shutdown();
2808 #if defined(CONFIG_X86_IO_APIC)
2809 disable_IO_APIC();
2810 #endif
2811+#endif /* CONFIG_XEN */
2812 crash_save_self(regs);
2813 }
2814diff -Nur linux-2.6.16.33-noxen/arch/i386/kernel/early_printk-xen.c linux-2.6.16.33/arch/i386/kernel/early_printk-xen.c
2815--- linux-2.6.16.33-noxen/arch/i386/kernel/early_printk-xen.c 1970-01-01 00:00:00.000000000 +0000
2816+++ linux-2.6.16.33/arch/i386/kernel/early_printk-xen.c 2007-01-08 15:00:45.000000000 +0000
2817@@ -0,0 +1,2 @@
2818+
2819+#include "../../x86_64/kernel/early_printk-xen.c"
2820diff -Nur linux-2.6.16.33-noxen/arch/i386/kernel/entry-xen.S linux-2.6.16.33/arch/i386/kernel/entry-xen.S
2821--- linux-2.6.16.33-noxen/arch/i386/kernel/entry-xen.S 1970-01-01 00:00:00.000000000 +0000
2822+++ linux-2.6.16.33/arch/i386/kernel/entry-xen.S 2007-01-08 15:00:45.000000000 +0000
2823@@ -0,0 +1,899 @@
2824+/*
2825+ * linux/arch/i386/entry.S
2826+ *
2827+ * Copyright (C) 1991, 1992 Linus Torvalds
2828+ */
2829+
2830+/*
2831+ * entry.S contains the system-call and fault low-level handling routines.
2832+ * This also contains the timer-interrupt handler, as well as all interrupts
2833+ * and faults that can result in a task-switch.
2834+ *
2835+ * NOTE: This code handles signal-recognition, which happens every time
2836+ * after a timer-interrupt and after each system call.
2837+ *
2838+ * I changed all the .align's to 4 (16 byte alignment), as that's faster
2839+ * on a 486.
2840+ *
2841+ * Stack layout in 'ret_from_system_call':
2842+ * ptrace needs to have all regs on the stack.
2843+ * if the order here is changed, it needs to be
2844+ * updated in fork.c:copy_process, signal.c:do_signal,
2845+ * ptrace.c and ptrace.h
2846+ *
2847+ * 0(%esp) - %ebx
2848+ * 4(%esp) - %ecx
2849+ * 8(%esp) - %edx
2850+ * C(%esp) - %esi
2851+ * 10(%esp) - %edi
2852+ * 14(%esp) - %ebp
2853+ * 18(%esp) - %eax
2854+ * 1C(%esp) - %ds
2855+ * 20(%esp) - %es
2856+ * 24(%esp) - orig_eax
2857+ * 28(%esp) - %eip
2858+ * 2C(%esp) - %cs
2859+ * 30(%esp) - %eflags
2860+ * 34(%esp) - %oldesp
2861+ * 38(%esp) - %oldss
2862+ *
2863+ * "current" is in register %ebx during any slow entries.
2864+ */
2865+
2866+#include <linux/config.h>
2867+#include <linux/linkage.h>
2868+#include <asm/thread_info.h>
2869+#include <asm/errno.h>
2870+#include <asm/segment.h>
2871+#include <asm/smp.h>
2872+#include <asm/page.h>
2873+#include <asm/desc.h>
2874+#include "irq_vectors.h"
2875+#include <xen/interface/xen.h>
2876+
2877+#define nr_syscalls ((syscall_table_size)/4)
2878+
2879+EBX = 0x00
2880+ECX = 0x04
2881+EDX = 0x08
2882+ESI = 0x0C
2883+EDI = 0x10
2884+EBP = 0x14
2885+EAX = 0x18
2886+DS = 0x1C
2887+ES = 0x20
2888+ORIG_EAX = 0x24
2889+EIP = 0x28
2890+CS = 0x2C
2891+EFLAGS = 0x30
2892+OLDESP = 0x34
2893+OLDSS = 0x38
2894+
2895+CF_MASK = 0x00000001
2896+TF_MASK = 0x00000100
2897+IF_MASK = 0x00000200
2898+DF_MASK = 0x00000400
2899+NT_MASK = 0x00004000
2900+VM_MASK = 0x00020000
2901+/* Pseudo-eflags. */
2902+NMI_MASK = 0x80000000
2903+
2904+#ifndef CONFIG_XEN
2905+#define DISABLE_INTERRUPTS cli
2906+#define ENABLE_INTERRUPTS sti
2907+#else
2908+/* Offsets into shared_info_t. */
2909+#define evtchn_upcall_pending /* 0 */
2910+#define evtchn_upcall_mask 1
2911+
2912+#define sizeof_vcpu_shift 6
2913+
2914+#ifdef CONFIG_SMP
2915+#define GET_VCPU_INFO movl TI_cpu(%ebp),%esi ; \
2916+ shl $sizeof_vcpu_shift,%esi ; \
2917+ addl HYPERVISOR_shared_info,%esi
2918+#else
2919+#define GET_VCPU_INFO movl HYPERVISOR_shared_info,%esi
2920+#endif
2921+
2922+#define __DISABLE_INTERRUPTS movb $1,evtchn_upcall_mask(%esi)
2923+#define __ENABLE_INTERRUPTS movb $0,evtchn_upcall_mask(%esi)
2924+#define DISABLE_INTERRUPTS GET_VCPU_INFO ; \
2925+ __DISABLE_INTERRUPTS
2926+#define ENABLE_INTERRUPTS GET_VCPU_INFO ; \
2927+ __ENABLE_INTERRUPTS
2928+#define __TEST_PENDING testb $0xFF,evtchn_upcall_pending(%esi)
2929+#endif
2930+
2931+#ifdef CONFIG_PREEMPT
2932+#define preempt_stop cli
2933+#else
2934+#define preempt_stop
2935+#define resume_kernel restore_nocheck
2936+#endif
2937+
2938+#define SAVE_ALL \
2939+ cld; \
2940+ pushl %es; \
2941+ pushl %ds; \
2942+ pushl %eax; \
2943+ pushl %ebp; \
2944+ pushl %edi; \
2945+ pushl %esi; \
2946+ pushl %edx; \
2947+ pushl %ecx; \
2948+ pushl %ebx; \
2949+ movl $(__USER_DS), %edx; \
2950+ movl %edx, %ds; \
2951+ movl %edx, %es;
2952+
2953+#define RESTORE_INT_REGS \
2954+ popl %ebx; \
2955+ popl %ecx; \
2956+ popl %edx; \
2957+ popl %esi; \
2958+ popl %edi; \
2959+ popl %ebp; \
2960+ popl %eax
2961+
2962+#define RESTORE_REGS \
2963+ RESTORE_INT_REGS; \
2964+1: popl %ds; \
2965+2: popl %es; \
2966+.section .fixup,"ax"; \
2967+3: movl $0,(%esp); \
2968+ jmp 1b; \
2969+4: movl $0,(%esp); \
2970+ jmp 2b; \
2971+.previous; \
2972+.section __ex_table,"a";\
2973+ .align 4; \
2974+ .long 1b,3b; \
2975+ .long 2b,4b; \
2976+.previous
2977+
2978+
2979+ENTRY(ret_from_fork)
2980+ pushl %eax
2981+ call schedule_tail
2982+ GET_THREAD_INFO(%ebp)
2983+ popl %eax
2984+ jmp syscall_exit
2985+
2986+/*
2987+ * Return to user mode is not as complex as all this looks,
2988+ * but we want the default path for a system call return to
2989+ * go as quickly as possible which is why some of this is
2990+ * less clear than it otherwise should be.
2991+ */
2992+
2993+ # userspace resumption stub bypassing syscall exit tracing
2994+ ALIGN
2995+ret_from_exception:
2996+ preempt_stop
2997+ret_from_intr:
2998+ GET_THREAD_INFO(%ebp)
2999+ movl EFLAGS(%esp), %eax # mix EFLAGS and CS
3000+ movb CS(%esp), %al
3001+ testl $(VM_MASK | 2), %eax
3002+ jz resume_kernel
3003+ENTRY(resume_userspace)
3004+ DISABLE_INTERRUPTS # make sure we don't miss an interrupt
3005+ # setting need_resched or sigpending
3006+ # between sampling and the iret
3007+ movl TI_flags(%ebp), %ecx
3008+ andl $_TIF_WORK_MASK, %ecx # is there any work to be done on
3009+ # int/exception return?
3010+ jne work_pending
3011+ jmp restore_all
3012+
3013+#ifdef CONFIG_PREEMPT
3014+ENTRY(resume_kernel)
3015+ cli
3016+ cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ?
3017+ jnz restore_nocheck
3018+need_resched:
3019+ movl TI_flags(%ebp), %ecx # need_resched set ?
3020+ testb $_TIF_NEED_RESCHED, %cl
3021+ jz restore_all
3022+ testl $IF_MASK,EFLAGS(%esp) # interrupts off (exception path) ?
3023+ jz restore_all
3024+ call preempt_schedule_irq
3025+ jmp need_resched
3026+#endif
3027+
3028+/* SYSENTER_RETURN points to after the "sysenter" instruction in
3029+   the vsyscall page. See vsyscall-sysenter.S, which defines the symbol. */
3030+
3031+ # sysenter call handler stub
3032+ENTRY(sysenter_entry)
3033+ movl SYSENTER_stack_esp0(%esp),%esp
3034+sysenter_past_esp:
3035+ sti
3036+ pushl $(__USER_DS)
3037+ pushl %ebp
3038+ pushfl
3039+ pushl $(__USER_CS)
3040+ pushl $SYSENTER_RETURN
3041+
3042+/*
3043+ * Load the potential sixth argument from user stack.
3044+ * Careful about security.
3045+ */
3046+ cmpl $__PAGE_OFFSET-3,%ebp
3047+ jae syscall_fault
3048+1: movl (%ebp),%ebp
3049+.section __ex_table,"a"
3050+ .align 4
3051+ .long 1b,syscall_fault
3052+.previous
3053+
3054+ pushl %eax
3055+ SAVE_ALL
3056+ GET_THREAD_INFO(%ebp)
3057+
3058+ /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
3059+ testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
3060+ jnz syscall_trace_entry
3061+ cmpl $(nr_syscalls), %eax
3062+ jae syscall_badsys
3063+ call *sys_call_table(,%eax,4)
3064+ movl %eax,EAX(%esp)
3065+ DISABLE_INTERRUPTS
3066+ movl TI_flags(%ebp), %ecx
3067+ testw $_TIF_ALLWORK_MASK, %cx
3068+ jne syscall_exit_work
3069+/* if something modifies registers it must also disable sysexit */
3070+ movl EIP(%esp), %edx
3071+ movl OLDESP(%esp), %ecx
3072+ xorl %ebp,%ebp
3073+#ifdef CONFIG_XEN
3074+ __ENABLE_INTERRUPTS
3075+sysexit_scrit: /**** START OF SYSEXIT CRITICAL REGION ****/
3076+ __TEST_PENDING
3077+ jnz 14f # process more events if necessary...
3078+ movl ESI(%esp), %esi
3079+ sysexit
3080+14: __DISABLE_INTERRUPTS
3081+sysexit_ecrit: /**** END OF SYSEXIT CRITICAL REGION ****/
3082+ push %esp
3083+ call evtchn_do_upcall
3084+ add $4,%esp
3085+ jmp ret_from_intr
3086+#else
3087+ sti
3088+ sysexit
3089+#endif /* !CONFIG_XEN */
3090+
3091+
3092+ # system call handler stub
3093+ENTRY(system_call)
3094+ pushl %eax # save orig_eax
3095+ SAVE_ALL
3096+ GET_THREAD_INFO(%ebp)
3097+ # system call tracing in operation / emulation
3098+ /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
3099+ testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
3100+ jnz syscall_trace_entry
3101+ cmpl $(nr_syscalls), %eax
3102+ jae syscall_badsys
3103+syscall_call:
3104+ call *sys_call_table(,%eax,4)
3105+ movl %eax,EAX(%esp) # store the return value
3106+syscall_exit:
3107+ DISABLE_INTERRUPTS # make sure we don't miss an interrupt
3108+ # setting need_resched or sigpending
3109+ # between sampling and the iret
3110+ movl TI_flags(%ebp), %ecx
3111+ testw $_TIF_ALLWORK_MASK, %cx # current->work
3112+ jne syscall_exit_work
3113+
3114+restore_all:
3115+#ifndef CONFIG_XEN
3116+ movl EFLAGS(%esp), %eax # mix EFLAGS, SS and CS
3117+ # Warning: OLDSS(%esp) contains the wrong/random values if we
3118+ # are returning to the kernel.
3119+ # See comments in process.c:copy_thread() for details.
3120+ movb OLDSS(%esp), %ah
3121+ movb CS(%esp), %al
3122+ andl $(VM_MASK | (4 << 8) | 3), %eax
3123+ cmpl $((4 << 8) | 3), %eax
3124+ je ldt_ss # returning to user-space with LDT SS
3125+restore_nocheck:
3126+#else
3127+restore_nocheck:
3128+ movl EFLAGS(%esp), %eax
3129+ testl $(VM_MASK|NMI_MASK), %eax
3130+ jnz hypervisor_iret
3131+ shr $9, %eax # EAX[0] == IRET_EFLAGS.IF
3132+ GET_VCPU_INFO
3133+ andb evtchn_upcall_mask(%esi),%al
3134+ andb $1,%al # EAX[0] == IRET_EFLAGS.IF & event_mask
3135+ jnz restore_all_enable_events # != 0 => enable event delivery
3136+#endif
3137+ RESTORE_REGS
3138+ addl $4, %esp
3139+1: iret
3140+.section .fixup,"ax"
3141+iret_exc:
3142+#ifndef CONFIG_XEN
3143+ sti
3144+#endif
3145+ pushl $0 # no error code
3146+ pushl $do_iret_error
3147+ jmp error_code
3148+.previous
3149+.section __ex_table,"a"
3150+ .align 4
3151+ .long 1b,iret_exc
3152+.previous
3153+
3154+#ifndef CONFIG_XEN
3155+ldt_ss:
3156+ larl OLDSS(%esp), %eax
3157+ jnz restore_nocheck
3158+ testl $0x00400000, %eax # returning to 32bit stack?
3159+	jnz restore_nocheck	# all right, normal return
3160+ /* If returning to userspace with 16bit stack,
3161+ * try to fix the higher word of ESP, as the CPU
3162+ * won't restore it.
3163+ * This is an "official" bug of all the x86-compatible
3164+ * CPUs, which we can try to work around to make
3165+ * dosemu and wine happy. */
3166+ subl $8, %esp # reserve space for switch16 pointer
3167+ cli
3168+ movl %esp, %eax
3169+ /* Set up the 16bit stack frame with switch32 pointer on top,
3170+ * and a switch16 pointer on top of the current frame. */
3171+ call setup_x86_bogus_stack
3172+ RESTORE_REGS
3173+ lss 20+4(%esp), %esp # switch to 16bit stack
3174+1: iret
3175+.section __ex_table,"a"
3176+ .align 4
3177+ .long 1b,iret_exc
3178+.previous
3179+#else
3180+hypervisor_iret:
3181+ andl $~NMI_MASK, EFLAGS(%esp)
3182+ RESTORE_REGS
3183+ addl $4, %esp
3184+ jmp hypercall_page + (__HYPERVISOR_iret * 32)
3185+#endif
3186+
3187+ # perform work that needs to be done immediately before resumption
3188+ ALIGN
3189+work_pending:
3190+ testb $_TIF_NEED_RESCHED, %cl
3191+ jz work_notifysig
3192+work_resched:
3193+ call schedule
3194+ DISABLE_INTERRUPTS # make sure we don't miss an interrupt
3195+ # setting need_resched or sigpending
3196+ # between sampling and the iret
3197+ movl TI_flags(%ebp), %ecx
3198+ andl $_TIF_WORK_MASK, %ecx # is there any work to be done other
3199+ # than syscall tracing?
3200+ jz restore_all
3201+ testb $_TIF_NEED_RESCHED, %cl
3202+ jnz work_resched
3203+
3204+work_notifysig: # deal with pending signals and
3205+ # notify-resume requests
3206+ testl $VM_MASK, EFLAGS(%esp)
3207+ movl %esp, %eax
3208+ jne work_notifysig_v86 # returning to kernel-space or
3209+ # vm86-space
3210+ xorl %edx, %edx
3211+ call do_notify_resume
3212+ jmp resume_userspace
3213+
3214+ ALIGN
3215+work_notifysig_v86:
3216+#ifdef CONFIG_VM86
3217+ pushl %ecx # save ti_flags for do_notify_resume
3218+ call save_v86_state # %eax contains pt_regs pointer
3219+ popl %ecx
3220+ movl %eax, %esp
3221+ xorl %edx, %edx
3222+ call do_notify_resume
3223+ jmp resume_userspace
3224+#endif
3225+
3226+ # perform syscall exit tracing
3227+ ALIGN
3228+syscall_trace_entry:
3229+ movl $-ENOSYS,EAX(%esp)
3230+ movl %esp, %eax
3231+ xorl %edx,%edx
3232+ call do_syscall_trace
3233+ cmpl $0, %eax
3234+ jne resume_userspace # ret != 0 -> running under PTRACE_SYSEMU,
3235+ # so must skip actual syscall
3236+ movl ORIG_EAX(%esp), %eax
3237+ cmpl $(nr_syscalls), %eax
3238+ jnae syscall_call
3239+ jmp syscall_exit
3240+
3241+ # perform syscall exit tracing
3242+ ALIGN
3243+syscall_exit_work:
3244+ testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP), %cl
3245+ jz work_pending
3246+ ENABLE_INTERRUPTS # could let do_syscall_trace() call
3247+ # schedule() instead
3248+ movl %esp, %eax
3249+ movl $1, %edx
3250+ call do_syscall_trace
3251+ jmp resume_userspace
3252+
3253+ ALIGN
3254+syscall_fault:
3255+ pushl %eax # save orig_eax
3256+ SAVE_ALL
3257+ GET_THREAD_INFO(%ebp)
3258+ movl $-EFAULT,EAX(%esp)
3259+ jmp resume_userspace
3260+
3261+ ALIGN
3262+syscall_badsys:
3263+ movl $-ENOSYS,EAX(%esp)
3264+ jmp resume_userspace
3265+
3266+#ifndef CONFIG_XEN
3267+#define FIXUP_ESPFIX_STACK \
3268+ movl %esp, %eax; \
3269+ /* switch to 32bit stack using the pointer on top of 16bit stack */ \
3270+ lss %ss:CPU_16BIT_STACK_SIZE-8, %esp; \
3271+ /* copy data from 16bit stack to 32bit stack */ \
3272+ call fixup_x86_bogus_stack; \
3273+ /* put ESP to the proper location */ \
3274+ movl %eax, %esp;
3275+#define UNWIND_ESPFIX_STACK \
3276+ pushl %eax; \
3277+ movl %ss, %eax; \
3278+ /* see if on 16bit stack */ \
3279+ cmpw $__ESPFIX_SS, %ax; \
3280+ jne 28f; \
3281+ movl $__KERNEL_DS, %edx; \
3282+ movl %edx, %ds; \
3283+ movl %edx, %es; \
3284+ /* switch to 32bit stack */ \
3285+ FIXUP_ESPFIX_STACK \
3286+28: popl %eax;
3287+
3288+/*
3289+ * Build the entry stubs and pointer table with
3290+ * some assembler magic.
3291+ */
3292+.data
3293+ENTRY(interrupt)
3294+.text
3295+
3296+vector=0
3297+ENTRY(irq_entries_start)
3298+.rept NR_IRQS
3299+ ALIGN
3300+1: pushl $~(vector)
3301+ jmp common_interrupt
3302+.data
3303+ .long 1b
3304+.text
3305+vector=vector+1
3306+.endr
3307+
3308+ ALIGN
3309+common_interrupt:
3310+ SAVE_ALL
3311+ movl %esp,%eax
3312+ call do_IRQ
3313+ jmp ret_from_intr
3314+
3315+#define BUILD_INTERRUPT(name, nr) \
3316+ENTRY(name) \
3317+ pushl $~(nr); \
3318+ SAVE_ALL \
3319+ movl %esp,%eax; \
3320+ call smp_/**/name; \
3321+ jmp ret_from_intr;
3322+
3323+/* The include is where all of the SMP etc. interrupts come from */
3324+#include "entry_arch.h"
3325+#else
3326+#define UNWIND_ESPFIX_STACK
3327+#endif
3328+
3329+ENTRY(divide_error)
3330+ pushl $0 # no error code
3331+ pushl $do_divide_error
3332+ ALIGN
3333+error_code:
3334+ pushl %ds
3335+ pushl %eax
3336+ xorl %eax, %eax
3337+ pushl %ebp
3338+ pushl %edi
3339+ pushl %esi
3340+ pushl %edx
3341+ decl %eax # eax = -1
3342+ pushl %ecx
3343+ pushl %ebx
3344+ cld
3345+ pushl %es
3346+ UNWIND_ESPFIX_STACK
3347+ popl %ecx
3348+ movl ES(%esp), %edi # get the function address
3349+ movl ORIG_EAX(%esp), %edx # get the error code
3350+ movl %eax, ORIG_EAX(%esp)
3351+ movl %ecx, ES(%esp)
3352+ movl $(__USER_DS), %ecx
3353+ movl %ecx, %ds
3354+ movl %ecx, %es
3355+ movl %esp,%eax # pt_regs pointer
3356+ call *%edi
3357+ jmp ret_from_exception
3358+
3359+#ifdef CONFIG_XEN
3360+# A note on the "critical region" in our callback handler.
3361+# We want to avoid stacking callback handlers due to events occurring
3362+# during handling of the last event. To do this, we keep events disabled
3363+# until we've done all processing. HOWEVER, we must enable events before
3364+# popping the stack frame (can't be done atomically) and so it would still
3365+# be possible to get enough handler activations to overflow the stack.
3366+# Although unlikely, bugs of that kind are hard to track down, so we'd
3367+# like to avoid the possibility.
3368+# So, on entry to the handler we detect whether we interrupted an
3369+# existing activation in its critical region -- if so, we pop the current
3370+# activation and restart the handler using the previous one.
3371+#
3372+# The sysexit critical region is slightly different. sysexit
3373+# atomically removes the entire stack frame. If we interrupt in the
3374+# critical region we know that the entire frame is present and correct
3375+# so we can simply throw away the new one.
3376+ENTRY(hypervisor_callback)
3377+ pushl %eax
3378+ SAVE_ALL
3379+ movl EIP(%esp),%eax
3380+ cmpl $scrit,%eax
3381+ jb 11f
3382+ cmpl $ecrit,%eax
3383+ jb critical_region_fixup
3384+ cmpl $sysexit_scrit,%eax
3385+ jb 11f
3386+ cmpl $sysexit_ecrit,%eax
3387+ ja 11f
3388+ addl $0x34,%esp # Remove cs...ebx from stack frame.
3389+11: push %esp
3390+ call evtchn_do_upcall
3391+ add $4,%esp
3392+ jmp ret_from_intr
3393+
3394+ ALIGN
3395+restore_all_enable_events:
3396+ __ENABLE_INTERRUPTS
3397+scrit: /**** START OF CRITICAL REGION ****/
3398+ __TEST_PENDING
3399+ jnz 14f # process more events if necessary...
3400+ RESTORE_REGS
3401+ addl $4, %esp
3402+1: iret
3403+.section __ex_table,"a"
3404+ .align 4
3405+ .long 1b,iret_exc
3406+.previous
3407+14: __DISABLE_INTERRUPTS
3408+ jmp 11b
3409+ecrit: /**** END OF CRITICAL REGION ****/
3410+# [How we do the fixup]. We want to merge the current stack frame with the
3411+# just-interrupted frame. How we do this depends on where in the critical
3412+# region the interrupted handler was executing, and so how many saved
3413+# registers are in each frame. We do this quickly using the lookup table
3414+# 'critical_fixup_table'. For each byte offset in the critical region, it
3415+# provides the number of bytes which have already been popped from the
3416+# interrupted stack frame.
3417+critical_region_fixup:
3418+ addl $critical_fixup_table-scrit,%eax
3419+ movzbl (%eax),%eax # %eax contains num bytes popped
3420+ cmpb $0xff,%al # 0xff => vcpu_info critical region
3421+ jne 15f
3422+ GET_THREAD_INFO(%ebp)
3423+ xorl %eax,%eax
3424+15: mov %esp,%esi
3425+ add %eax,%esi # %esi points at end of src region
3426+ mov %esp,%edi
3427+ add $0x34,%edi # %edi points at end of dst region
3428+ mov %eax,%ecx
3429+	shr $2,%ecx		# convert bytes to dwords for the copy loop
3430+ je 17f # skip loop if nothing to copy
3431+16: subl $4,%esi # pre-decrementing copy loop
3432+ subl $4,%edi
3433+ movl (%esi),%eax
3434+ movl %eax,(%edi)
3435+ loop 16b
3436+17: movl %edi,%esp # final %edi is top of merged stack
3437+ jmp 11b
3438+
3439+critical_fixup_table:
3440+ .byte 0xff,0xff,0xff # testb $0xff,(%esi) = __TEST_PENDING
3441+ .byte 0xff,0xff # jnz 14f
3442+ .byte 0x00 # pop %ebx
3443+ .byte 0x04 # pop %ecx
3444+ .byte 0x08 # pop %edx
3445+ .byte 0x0c # pop %esi
3446+ .byte 0x10 # pop %edi
3447+ .byte 0x14 # pop %ebp
3448+ .byte 0x18 # pop %eax
3449+ .byte 0x1c # pop %ds
3450+ .byte 0x20 # pop %es
3451+ .byte 0x24,0x24,0x24 # add $4,%esp
3452+ .byte 0x28 # iret
3453+ .byte 0xff,0xff,0xff,0xff # movb $1,1(%esi)
3454+ .byte 0x00,0x00 # jmp 11b
3455+
3456+# Hypervisor uses this for application faults while it executes.
3457+# We get here for two reasons:
3458+# 1. Fault while reloading DS, ES, FS or GS
3459+# 2. Fault while executing IRET
3460+# Category 1 we fix up by reattempting the load, and zeroing the segment
3461+# register if the load fails.
3462+# Category 2 we fix up by jumping to do_iret_error. We cannot use the
3463+# normal Linux return path in this case because if we use the IRET hypercall
3464+# to pop the stack frame we end up in an infinite loop of failsafe callbacks.
3465+# We distinguish between categories by maintaining a status value in EAX.
3466+ENTRY(failsafe_callback)
3467+ pushl %eax
3468+ movl $1,%eax
3469+1: mov 4(%esp),%ds
3470+2: mov 8(%esp),%es
3471+3: mov 12(%esp),%fs
3472+4: mov 16(%esp),%gs
3473+ testl %eax,%eax
3474+ popl %eax
3475+ jz 5f
3476+ addl $16,%esp # EAX != 0 => Category 2 (Bad IRET)
3477+ jmp iret_exc
3478+5: addl $16,%esp # EAX == 0 => Category 1 (Bad segment)
3479+ pushl $0
3480+ SAVE_ALL
3481+ jmp ret_from_exception
3482+.section .fixup,"ax"; \
3483+6: xorl %eax,%eax; \
3484+ movl %eax,4(%esp); \
3485+ jmp 1b; \
3486+7: xorl %eax,%eax; \
3487+ movl %eax,8(%esp); \
3488+ jmp 2b; \
3489+8: xorl %eax,%eax; \
3490+ movl %eax,12(%esp); \
3491+ jmp 3b; \
3492+9: xorl %eax,%eax; \
3493+ movl %eax,16(%esp); \
3494+ jmp 4b; \
3495+.previous; \
3496+.section __ex_table,"a"; \
3497+ .align 4; \
3498+ .long 1b,6b; \
3499+ .long 2b,7b; \
3500+ .long 3b,8b; \
3501+ .long 4b,9b; \
3502+.previous
3503+#endif
3504+
3505+ENTRY(coprocessor_error)
3506+ pushl $0
3507+ pushl $do_coprocessor_error
3508+ jmp error_code
3509+
3510+ENTRY(simd_coprocessor_error)
3511+ pushl $0
3512+ pushl $do_simd_coprocessor_error
3513+ jmp error_code
3514+
3515+ENTRY(device_not_available)
3516+ pushl $-1 # mark this as an int
3517+ SAVE_ALL
3518+#ifndef CONFIG_XEN
3519+ movl %cr0, %eax
3520+ testl $0x4, %eax # EM (math emulation bit)
3521+ je device_available_emulate
3522+ pushl $0 # temporary storage for ORIG_EIP
3523+ call math_emulate
3524+ addl $4, %esp
3525+ jmp ret_from_exception
3526+device_available_emulate:
3527+#endif
3528+ preempt_stop
3529+ call math_state_restore
3530+ jmp ret_from_exception
3531+
3532+#ifndef CONFIG_XEN
3533+/*
3534+ * Debug traps and NMI can happen at the one SYSENTER instruction
3535+ * that sets up the real kernel stack. Check here, since we can't
3536+ * allow the wrong stack to be used.
3537+ *
3538+ * "SYSENTER_stack_esp0+12" is because the NMI/debug handler will have
3539+ * already pushed 3 words if it hits on the sysenter instruction:
3540+ * eflags, cs and eip.
3541+ *
3542+ * We just load the right stack, and push the three (known) values
3543+ * by hand onto the new stack - while updating the return eip past
3544+ * the instruction that would have done it for sysenter.
3545+ */
3546+#define FIX_STACK(offset, ok, label) \
3547+ cmpw $__KERNEL_CS,4(%esp); \
3548+ jne ok; \
3549+label: \
3550+ movl SYSENTER_stack_esp0+offset(%esp),%esp; \
3551+ pushfl; \
3552+ pushl $__KERNEL_CS; \
3553+ pushl $sysenter_past_esp
3554+#endif /* CONFIG_XEN */
3555+
3556+KPROBE_ENTRY(debug)
3557+#ifndef CONFIG_XEN
3558+ cmpl $sysenter_entry,(%esp)
3559+ jne debug_stack_correct
3560+ FIX_STACK(12, debug_stack_correct, debug_esp_fix_insn)
3561+debug_stack_correct:
3562+#endif /* !CONFIG_XEN */
3563+ pushl $-1 # mark this as an int
3564+ SAVE_ALL
3565+ xorl %edx,%edx # error code 0
3566+ movl %esp,%eax # pt_regs pointer
3567+ call do_debug
3568+ jmp ret_from_exception
3569+ .previous .text
3570+
3571+#ifndef CONFIG_XEN
3572+/*
3573+ * NMI is doubly nasty. It can happen _while_ we're handling
3574+ * a debug fault, and the debug fault hasn't yet been able to
3575+ * clear up the stack. So we first check whether we got an
3576+ * NMI on the sysenter entry path, but after that we need to
3577+ * check whether we got an NMI on the debug path where the debug
3578+ * fault happened on the sysenter path.
3579+ */
3580+ENTRY(nmi)
3581+ pushl %eax
3582+ movl %ss, %eax
3583+ cmpw $__ESPFIX_SS, %ax
3584+ popl %eax
3585+ je nmi_16bit_stack
3586+ cmpl $sysenter_entry,(%esp)
3587+ je nmi_stack_fixup
3588+ pushl %eax
3589+ movl %esp,%eax
3590+ /* Do not access memory above the end of our stack page,
3591+ * it might not exist.
3592+ */
3593+ andl $(THREAD_SIZE-1),%eax
3594+ cmpl $(THREAD_SIZE-20),%eax
3595+ popl %eax
3596+ jae nmi_stack_correct
3597+ cmpl $sysenter_entry,12(%esp)
3598+ je nmi_debug_stack_check
3599+nmi_stack_correct:
3600+ pushl %eax
3601+ SAVE_ALL
3602+ xorl %edx,%edx # zero error code
3603+ movl %esp,%eax # pt_regs pointer
3604+ call do_nmi
3605+ jmp restore_all
3606+
3607+nmi_stack_fixup:
3608+ FIX_STACK(12,nmi_stack_correct, 1)
3609+ jmp nmi_stack_correct
3610+nmi_debug_stack_check:
3611+ cmpw $__KERNEL_CS,16(%esp)
3612+ jne nmi_stack_correct
3613+ cmpl $debug,(%esp)
3614+ jb nmi_stack_correct
3615+ cmpl $debug_esp_fix_insn,(%esp)
3616+ ja nmi_stack_correct
3617+ FIX_STACK(24,nmi_stack_correct, 1)
3618+ jmp nmi_stack_correct
3619+
3620+nmi_16bit_stack:
3621+ /* create the pointer to lss back */
3622+ pushl %ss
3623+ pushl %esp
3624+ movzwl %sp, %esp
3625+ addw $4, (%esp)
3626+ /* copy the iret frame of 12 bytes */
3627+ .rept 3
3628+ pushl 16(%esp)
3629+ .endr
3630+ pushl %eax
3631+ SAVE_ALL
3632+ FIXUP_ESPFIX_STACK # %eax == %esp
3633+ xorl %edx,%edx # zero error code
3634+ call do_nmi
3635+ RESTORE_REGS
3636+ lss 12+4(%esp), %esp # back to 16bit stack
3637+1: iret
3638+.section __ex_table,"a"
3639+ .align 4
3640+ .long 1b,iret_exc
3641+.previous
3642+#else
3643+ENTRY(nmi)
3644+ pushl %eax
3645+ SAVE_ALL
3646+ xorl %edx,%edx # zero error code
3647+ movl %esp,%eax # pt_regs pointer
3648+ call do_nmi
3649+ orl $NMI_MASK, EFLAGS(%esp)
3650+ jmp restore_all
3651+#endif
3652+
3653+KPROBE_ENTRY(int3)
3654+ pushl $-1 # mark this as an int
3655+ SAVE_ALL
3656+ xorl %edx,%edx # zero error code
3657+ movl %esp,%eax # pt_regs pointer
3658+ call do_int3
3659+ jmp ret_from_exception
3660+ .previous .text
3661+
3662+ENTRY(overflow)
3663+ pushl $0
3664+ pushl $do_overflow
3665+ jmp error_code
3666+
3667+ENTRY(bounds)
3668+ pushl $0
3669+ pushl $do_bounds
3670+ jmp error_code
3671+
3672+ENTRY(invalid_op)
3673+ pushl $0
3674+ pushl $do_invalid_op
3675+ jmp error_code
3676+
3677+ENTRY(coprocessor_segment_overrun)
3678+ pushl $0
3679+ pushl $do_coprocessor_segment_overrun
3680+ jmp error_code
3681+
3682+ENTRY(invalid_TSS)
3683+ pushl $do_invalid_TSS
3684+ jmp error_code
3685+
3686+ENTRY(segment_not_present)
3687+ pushl $do_segment_not_present
3688+ jmp error_code
3689+
3690+ENTRY(stack_segment)
3691+ pushl $do_stack_segment
3692+ jmp error_code
3693+
3694+KPROBE_ENTRY(general_protection)
3695+ pushl $do_general_protection
3696+ jmp error_code
3697+ .previous .text
3698+
3699+ENTRY(alignment_check)
3700+ pushl $do_alignment_check
3701+ jmp error_code
3702+
3703+KPROBE_ENTRY(page_fault)
3704+ pushl $do_page_fault
3705+ jmp error_code
3706+ .previous .text
3707+
3708+#ifdef CONFIG_X86_MCE
3709+ENTRY(machine_check)
3710+ pushl $0
3711+ pushl machine_check_vector
3712+ jmp error_code
3713+#endif
3714+
3715+ENTRY(fixup_4gb_segment)
3716+ pushl $do_fixup_4gb_segment
3717+ jmp error_code
3718+
3719+.section .rodata,"a"
3720+#include "syscall_table.S"
3721+
3722+syscall_table_size=(.-sys_call_table)
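A large share of entry-xen.S follows from one substitution: cli/sti become writes to the per-VCPU event-channel mask in the shared info page (the __DISABLE_INTERRUPTS / __ENABLE_INTERRUPTS / __TEST_PENDING macros near the top of the file), and the long comment before ENTRY(hypervisor_callback) explains why the unmask-then-iret window needs the critical_fixup_table. A C-level sketch of what those macros amount to, assuming the vcpu_info layout from xen/interface/xen.h; force_evtchn_callback() is the catch-up helper used elsewhere in the patch and is assumed here:

static inline void xen_cli(struct vcpu_info *v)
{
	v->evtchn_upcall_mask = 1;	/* mask event (interrupt) delivery */
	barrier();
}

static inline void xen_sti(struct vcpu_info *v)
{
	v->evtchn_upcall_mask = 0;	/* unmask ... */
	barrier();
	if (v->evtchn_upcall_pending)	/* ... then deliver anything that */
		force_evtchn_callback();/*     arrived while we were masked */
}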
3723diff -Nur linux-2.6.16.33-noxen/arch/i386/kernel/entry.S linux-2.6.16.33/arch/i386/kernel/entry.S
3724--- linux-2.6.16.33-noxen/arch/i386/kernel/entry.S 2006-11-22 18:06:31.000000000 +0000
3725+++ linux-2.6.16.33/arch/i386/kernel/entry.S 2007-05-23 21:00:01.000000000 +0000
3726@@ -177,7 +177,7 @@
3727
3728 # sysenter call handler stub
3729 ENTRY(sysenter_entry)
3730- movl TSS_sysenter_esp0(%esp),%esp
3731+ movl SYSENTER_stack_esp0(%esp),%esp
3732 sysenter_past_esp:
3733 sti
3734 pushl $(__USER_DS)
3735@@ -406,7 +406,7 @@
3736 ENTRY(irq_entries_start)
3737 .rept NR_IRQS
3738 ALIGN
3739-1: pushl $vector-256
3740+1: pushl $~(vector)
3741 jmp common_interrupt
3742 .data
3743 .long 1b
3744@@ -423,7 +423,7 @@
3745
3746 #define BUILD_INTERRUPT(name, nr) \
3747 ENTRY(name) \
3748- pushl $nr-256; \
3749+ pushl $~(nr); \
3750 SAVE_ALL \
3751 movl %esp,%eax; \
3752 call smp_/**/name; \
3753@@ -492,7 +492,7 @@
3754 * that sets up the real kernel stack. Check here, since we can't
3755 * allow the wrong stack to be used.
3756 *
3757- * "TSS_sysenter_esp0+12" is because the NMI/debug handler will have
3758+ * "SYSENTER_stack_esp0+12" is because the NMI/debug handler will have
3759 * already pushed 3 words if it hits on the sysenter instruction:
3760 * eflags, cs and eip.
3761 *
3762@@ -504,7 +504,7 @@
3763 cmpw $__KERNEL_CS,4(%esp); \
3764 jne ok; \
3765 label: \
3766- movl TSS_sysenter_esp0+offset(%esp),%esp; \
3767+ movl SYSENTER_stack_esp0+offset(%esp),%esp; \
3768 pushfl; \
3769 pushl $__KERNEL_CS; \
3770 pushl $sysenter_past_esp
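Besides the SYSENTER_stack_esp0 rename, these entry.S hunks change what the interrupt stubs push from $vector-256 to $~(vector). Both encodings keep orig_eax negative, so the signal/ptrace code can still tell an interrupt frame from a system call, but the complement form stays unambiguous regardless of the vector count and is trivially reversible. The consumer side lives in the irq-handling parts of this patch; presumably it recovers the vector like this:

/* sketch: inside do_IRQ(), with the stub having pushed ~vector as orig_eax */
int irq = ~regs->orig_eax;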
3771diff -Nur linux-2.6.16.33-noxen/arch/i386/kernel/fixup.c linux-2.6.16.33/arch/i386/kernel/fixup.c
3772--- linux-2.6.16.33-noxen/arch/i386/kernel/fixup.c 1970-01-01 00:00:00.000000000 +0000
3773+++ linux-2.6.16.33/arch/i386/kernel/fixup.c 2007-01-08 15:00:45.000000000 +0000
3774@@ -0,0 +1,89 @@
3775+/******************************************************************************
3776+ * fixup.c
3777+ *
3778+ * Binary-rewriting of certain IA32 instructions, on notification by Xen.
3779+ * Used to avoid repeated slow emulation of common instructions used by the
3780+ * user-space TLS (Thread-Local Storage) libraries.
3781+ *
3782+ * **** NOTE ****
3783+ * Issues with the binary rewriting have caused it to be removed. Instead
3784+ * we rely on Xen's emulator to boot the kernel, and then print a banner
3785+ * message recommending that the user disables /lib/tls.
3786+ *
3787+ * Copyright (c) 2004, K A Fraser
3788+ *
3789+ * This program is free software; you can redistribute it and/or modify
3790+ * it under the terms of the GNU General Public License as published by
3791+ * the Free Software Foundation; either version 2 of the License, or
3792+ * (at your option) any later version.
3793+ *
3794+ * This program is distributed in the hope that it will be useful,
3795+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
3796+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
3797+ * GNU General Public License for more details.
3798+ *
3799+ * You should have received a copy of the GNU General Public License
3800+ * along with this program; if not, write to the Free Software
3801+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
3802+ */
3803+
3804+#include <linux/config.h>
3805+#include <linux/init.h>
3806+#include <linux/sched.h>
3807+#include <linux/slab.h>
3808+#include <linux/kernel.h>
3809+#include <linux/delay.h>
3810+#include <linux/version.h>
3811+
3812+#define DP(_f, _args...) printk(KERN_ALERT " " _f "\n" , ## _args )
3813+
3814+fastcall void do_fixup_4gb_segment(struct pt_regs *regs, long error_code)
3815+{
3816+ static unsigned long printed = 0;
3817+ char info[100];
3818+ int i;
3819+
3820+ /* Ignore statically-linked init. */
3821+ if (current->tgid == 1)
3822+ return;
3823+
3824+ HYPERVISOR_vm_assist(
3825+ VMASST_CMD_disable, VMASST_TYPE_4gb_segments_notify);
3826+
3827+ if (test_and_set_bit(0, &printed))
3828+ return;
3829+
3830+ sprintf(info, "%s (pid=%d)", current->comm, current->tgid);
3831+
3832+ DP("");
3833+ DP("***************************************************************");
3834+ DP("***************************************************************");
3835+ DP("** WARNING: Currently emulating unsupported memory accesses **");
3836+ DP("** in /lib/tls glibc libraries. The emulation is **");
3837+ DP("** slow. To ensure full performance you should **");
3838+ DP("** install a 'xen-friendly' (nosegneg) version of **");
3839+ DP("** the library, or disable tls support by executing **");
3840+ DP("** the following as root: **");
3841+ DP("** mv /lib/tls /lib/tls.disabled **");
3842+ DP("** Offending process: %-38.38s **", info);
3843+ DP("***************************************************************");
3844+ DP("***************************************************************");
3845+ DP("");
3846+
3847+ for (i = 5; i > 0; i--) {
3848+ touch_softlockup_watchdog();
3849+ printk("Pausing... %d", i);
3850+ mdelay(1000);
3851+ printk("\b\b\b\b\b\b\b\b\b\b\b\b");
3852+ }
3853+
3854+ printk("Continuing...\n\n");
3855+}
3856+
3857+static int __init fixup_init(void)
3858+{
3859+ HYPERVISOR_vm_assist(
3860+ VMASST_CMD_enable, VMASST_TYPE_4gb_segments_notify);
3861+ return 0;
3862+}
3863+__initcall(fixup_init);
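
fixup_init() above enables the 4gb_segments_notify VM assist at boot; the first fault Xen reports then runs do_fixup_4gb_segment(), which switches the assist back off and prints the banner exactly once, guarded by test_and_set_bit() on a static flag. A small userspace sketch of that warn-once idiom, using C11 atomics in place of the kernel bitops (the names below are illustrative, not part of the patch):

#include <stdatomic.h>
#include <stdio.h>

/* Print an expensive warning at most once, however often (and from however
 * many threads) warn_once() is reached. */
static atomic_flag warned = ATOMIC_FLAG_INIT;

static void warn_once(const char *offender)
{
	if (atomic_flag_test_and_set(&warned))
		return;                         /* someone already printed it */
	fprintf(stderr, "WARNING: slow emulated segment access (offender: %s)\n",
	        offender);
}

int main(void)
{
	warn_once("process-a");
	warn_once("process-b");                 /* silently dropped */
	return 0;
}
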
3864diff -Nur linux-2.6.16.33-noxen/arch/i386/kernel/head-xen.S linux-2.6.16.33/arch/i386/kernel/head-xen.S
3865--- linux-2.6.16.33-noxen/arch/i386/kernel/head-xen.S 1970-01-01 00:00:00.000000000 +0000
3866+++ linux-2.6.16.33/arch/i386/kernel/head-xen.S 2007-01-08 15:00:45.000000000 +0000
3867@@ -0,0 +1,202 @@
3868+
3869+
3870+.text
3871+#include <linux/config.h>
3872+#include <linux/elfnote.h>
3873+#include <linux/threads.h>
3874+#include <linux/linkage.h>
3875+#include <asm/segment.h>
3876+#include <asm/page.h>
3877+#include <asm/thread_info.h>
3878+#include <asm/asm-offsets.h>
3879+#include <xen/interface/xen.h>
3880+#include <xen/interface/elfnote.h>
3881+
3882+/*
3883+ * References to members of the new_cpu_data structure.
3884+ */
3885+
3886+#define X86 new_cpu_data+CPUINFO_x86
3887+#define X86_VENDOR new_cpu_data+CPUINFO_x86_vendor
3888+#define X86_MODEL new_cpu_data+CPUINFO_x86_model
3889+#define X86_MASK new_cpu_data+CPUINFO_x86_mask
3890+#define X86_HARD_MATH new_cpu_data+CPUINFO_hard_math
3891+#define X86_CPUID new_cpu_data+CPUINFO_cpuid_level
3892+#define X86_CAPABILITY new_cpu_data+CPUINFO_x86_capability
3893+#define X86_VENDOR_ID new_cpu_data+CPUINFO_x86_vendor_id
3894+
3895+#define VIRT_ENTRY_OFFSET 0x0
3896+.org VIRT_ENTRY_OFFSET
3897+ENTRY(startup_32)
3898+ movl %esi,xen_start_info
3899+ cld
3900+
3901+ /* Set up the stack pointer */
3902+ movl $(init_thread_union+THREAD_SIZE),%esp
3903+
3904+ /* get vendor info */
3905+ xorl %eax,%eax # call CPUID with 0 -> return vendor ID
3906+ XEN_CPUID
3907+ movl %eax,X86_CPUID # save CPUID level
3908+ movl %ebx,X86_VENDOR_ID # lo 4 chars
3909+ movl %edx,X86_VENDOR_ID+4 # next 4 chars
3910+ movl %ecx,X86_VENDOR_ID+8 # last 4 chars
3911+
3912+ movl $1,%eax # Use the CPUID instruction to get CPU type
3913+ XEN_CPUID
3914+ movb %al,%cl # save reg for future use
3915+ andb $0x0f,%ah # mask processor family
3916+ movb %ah,X86
3917+ andb $0xf0,%al # mask model
3918+ shrb $4,%al
3919+ movb %al,X86_MODEL
3920+ andb $0x0f,%cl # mask mask revision
3921+ movb %cl,X86_MASK
3922+ movl %edx,X86_CAPABILITY
3923+
3924+ movb $1,X86_HARD_MATH
3925+
3926+ xorl %eax,%eax # Clear FS/GS and LDT
3927+ movl %eax,%fs
3928+ movl %eax,%gs
3929+ cld # gcc2 wants the direction flag cleared at all times
3930+
3931+ call start_kernel
3932+L6:
3933+ jmp L6 # main should never return here, but
3934+ # just in case, we know what happens.
3935+
3936+#define HYPERCALL_PAGE_OFFSET 0x1000
3937+.org HYPERCALL_PAGE_OFFSET
3938+ENTRY(hypercall_page)
3939+.skip 0x1000
3940+
3941+/*
3942+ * Real beginning of normal "text" segment
3943+ */
3944+ENTRY(stext)
3945+ENTRY(_stext)
3946+
3947+/*
3948+ * BSS section
3949+ */
3950+.section ".bss.page_aligned","w"
3951+ENTRY(empty_zero_page)
3952+ .fill 4096,1,0
3953+
3954+/*
3955+ * This starts the data section.
3956+ */
3957+.data
3958+
3959+/*
3960+ * The Global Descriptor Table contains 28 quadwords, per-CPU.
3961+ */
3962+ENTRY(cpu_gdt_table)
3963+ .quad 0x0000000000000000 /* NULL descriptor */
3964+ .quad 0x0000000000000000 /* 0x0b reserved */
3965+ .quad 0x0000000000000000 /* 0x13 reserved */
3966+ .quad 0x0000000000000000 /* 0x1b reserved */
3967+ .quad 0x0000000000000000 /* 0x20 unused */
3968+ .quad 0x0000000000000000 /* 0x28 unused */
3969+ .quad 0x0000000000000000 /* 0x33 TLS entry 1 */
3970+ .quad 0x0000000000000000 /* 0x3b TLS entry 2 */
3971+ .quad 0x0000000000000000 /* 0x43 TLS entry 3 */
3972+ .quad 0x0000000000000000 /* 0x4b reserved */
3973+ .quad 0x0000000000000000 /* 0x53 reserved */
3974+ .quad 0x0000000000000000 /* 0x5b reserved */
3975+
3976+ .quad 0x00cf9a000000ffff /* 0x60 kernel 4GB code at 0x00000000 */
3977+ .quad 0x00cf92000000ffff /* 0x68 kernel 4GB data at 0x00000000 */
3978+ .quad 0x00cffa000000ffff /* 0x73 user 4GB code at 0x00000000 */
3979+ .quad 0x00cff2000000ffff /* 0x7b user 4GB data at 0x00000000 */
3980+
3981+ .quad 0x0000000000000000 /* 0x80 TSS descriptor */
3982+ .quad 0x0000000000000000 /* 0x88 LDT descriptor */
3983+
3984+ /*
3985+ * Segments used for calling PnP BIOS have byte granularity.
3986+ * The code segments and data segments have fixed 64k limits,
3987+ * the transfer segment sizes are set at run time.
3988+ */
3989+ .quad 0x0000000000000000 /* 0x90 32-bit code */
3990+ .quad 0x0000000000000000 /* 0x98 16-bit code */
3991+ .quad 0x0000000000000000 /* 0xa0 16-bit data */
3992+ .quad 0x0000000000000000 /* 0xa8 16-bit data */
3993+ .quad 0x0000000000000000 /* 0xb0 16-bit data */
3994+
3995+ /*
3996+ * The APM segments have byte granularity and their bases
3997+ * are set at run time. All have 64k limits.
3998+ */
3999+ .quad 0x0000000000000000 /* 0xb8 APM CS code */
4000+ .quad 0x0000000000000000 /* 0xc0 APM CS 16 code (16 bit) */
4001+ .quad 0x0000000000000000 /* 0xc8 APM DS data */
4002+
4003+ .quad 0x0000000000000000 /* 0xd0 - ESPFIX 16-bit SS */
4004+ .quad 0x0000000000000000 /* 0xd8 - unused */
4005+ .quad 0x0000000000000000 /* 0xe0 - unused */
4006+ .quad 0x0000000000000000 /* 0xe8 - unused */
4007+ .quad 0x0000000000000000 /* 0xf0 - unused */
4008+ .quad 0x0000000000000000 /* 0xf8 - GDT entry 31: double-fault TSS */
4009+
4010+#ifdef CONFIG_XEN_COMPAT_030002
4011+/*
4012+ * __xen_guest information
4013+ */
4014+.macro utoa value
4015+ .if (\value) < 0 || (\value) >= 0x10
4016+ utoa (((\value)>>4)&0x0fffffff)
4017+ .endif
4018+ .if ((\value) & 0xf) < 10
4019+ .byte '0' + ((\value) & 0xf)
4020+ .else
4021+ .byte 'A' + ((\value) & 0xf) - 10
4022+ .endif
4023+.endm
4024+
4025+.section __xen_guest
4026+ .ascii "GUEST_OS=linux,GUEST_VER=2.6"
4027+ .ascii ",XEN_VER=xen-3.0"
4028+ .ascii ",VIRT_BASE=0x"
4029+ utoa __PAGE_OFFSET
4030+ .ascii ",ELF_PADDR_OFFSET=0x"
4031+ utoa __PAGE_OFFSET
4032+ .ascii ",VIRT_ENTRY=0x"
4033+ utoa (__PAGE_OFFSET + __PHYSICAL_START + VIRT_ENTRY_OFFSET)
4034+ .ascii ",HYPERCALL_PAGE=0x"
4035+ utoa ((__PHYSICAL_START+HYPERCALL_PAGE_OFFSET)>>PAGE_SHIFT)
4036+ .ascii ",FEATURES=writable_page_tables"
4037+ .ascii "|writable_descriptor_tables"
4038+ .ascii "|auto_translated_physmap"
4039+ .ascii "|pae_pgdir_above_4gb"
4040+ .ascii "|supervisor_mode_kernel"
4041+#ifdef CONFIG_X86_PAE
4042+ .ascii ",PAE=yes[extended-cr3]"
4043+#else
4044+ .ascii ",PAE=no"
4045+#endif
4046+ .ascii ",LOADER=generic"
4047+ .byte 0
4048+#endif /* CONFIG_XEN_COMPAT_030002 */
4049+
4050+
4051+ ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz, "linux")
4052+ ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz, "2.6")
4053+ ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz, "xen-3.0")
4054+ ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, .long, __PAGE_OFFSET)
4055+#ifdef CONFIG_XEN_COMPAT_030002
4056+ ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET, .long, __PAGE_OFFSET)
4057+#else
4058+ ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET, .long, 0)
4059+#endif /* !CONFIG_XEN_COMPAT_030002 */
4060+ ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .long, startup_32)
4061+ ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .long, hypercall_page)
4062+ ELFNOTE(Xen, XEN_ELFNOTE_HV_START_LOW, .long, HYPERVISOR_VIRT_START)
4063+ ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz, "writable_page_tables|writable_descriptor_tables|auto_translated_physmap|pae_pgdir_above_4gb|supervisor_mode_kernel")
4064+#ifdef CONFIG_X86_PAE
4065+ ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz, "yes")
4066+#else
4067+ ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz, "no")
4068+#endif
4069+ ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz, "generic")
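
The cpu_gdt_table entries earlier in this file are raw 64-bit segment descriptors; 0x00cf9a000000ffff, for example, is the flat ring-0 code segment (base 0, 4 GB limit, page granularity). A short standalone decoder for the four flat descriptors, a sketch for illustration rather than anything the patch itself uses:

#include <stdint.h>
#include <stdio.h>

/* Decode base, limit and access rights from an x86 segment descriptor,
 * following the layout used by the cpu_gdt_table entries above. */
static void decode(uint64_t d)
{
	uint32_t limit  = (uint32_t)(d & 0xffff) | (uint32_t)((d >> 32) & 0xf0000);
	uint32_t base   = (uint32_t)((d >> 16) & 0xffffff) |
	                  (uint32_t)(((d >> 56) & 0xff) << 24);
	unsigned access = (unsigned)((d >> 40) & 0xff);
	unsigned flags  = (unsigned)((d >> 52) & 0xf);

	if (flags & 0x8)                /* G=1: limit is counted in 4 KiB pages */
		limit = (limit << 12) | 0xfff;

	printf("%016llx: base=%08x limit=%08x dpl=%u %s\n",
	       (unsigned long long)d, (unsigned)base, (unsigned)limit,
	       (access >> 5) & 3, (access & 0x08) ? "code" : "data");
}

int main(void)
{
	decode(0x00cf9a000000ffffULL);  /* 0x60 kernel 4GB code */
	decode(0x00cf92000000ffffULL);  /* 0x68 kernel 4GB data */
	decode(0x00cffa000000ffffULL);  /* 0x73 user 4GB code   */
	decode(0x00cff2000000ffffULL);  /* 0x7b user 4GB data   */
	return 0;
}
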
4070diff -Nur linux-2.6.16.33-noxen/arch/i386/kernel/init_task-xen.c linux-2.6.16.33/arch/i386/kernel/init_task-xen.c
4071--- linux-2.6.16.33-noxen/arch/i386/kernel/init_task-xen.c 1970-01-01 00:00:00.000000000 +0000
4072+++ linux-2.6.16.33/arch/i386/kernel/init_task-xen.c 2007-01-08 15:00:45.000000000 +0000
4073@@ -0,0 +1,51 @@
4074+#include <linux/mm.h>
4075+#include <linux/module.h>
4076+#include <linux/sched.h>
4077+#include <linux/init.h>
4078+#include <linux/init_task.h>
4079+#include <linux/fs.h>
4080+#include <linux/mqueue.h>
4081+
4082+#include <asm/uaccess.h>
4083+#include <asm/pgtable.h>
4084+#include <asm/desc.h>
4085+
4086+static struct fs_struct init_fs = INIT_FS;
4087+static struct files_struct init_files = INIT_FILES;
4088+static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
4089+static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
4090+
4091+#define swapper_pg_dir ((pgd_t *)NULL)
4092+struct mm_struct init_mm = INIT_MM(init_mm);
4093+#undef swapper_pg_dir
4094+
4095+EXPORT_SYMBOL(init_mm);
4096+
4097+/*
4098+ * Initial thread structure.
4099+ *
4100+ * We need to make sure that this is THREAD_SIZE aligned due to the
4101+ * way process stacks are handled. This is done by having a special
4102+ * "init_task" linker map entry..
4103+ */
4104+union thread_union init_thread_union
4105+ __attribute__((__section__(".data.init_task"))) =
4106+ { INIT_THREAD_INFO(init_task) };
4107+
4108+/*
4109+ * Initial task structure.
4110+ *
4111+ * All other task structs will be allocated on slabs in fork.c
4112+ */
4113+struct task_struct init_task = INIT_TASK(init_task);
4114+
4115+EXPORT_SYMBOL(init_task);
4116+
4117+#ifndef CONFIG_X86_NO_TSS
4118+/*
4119+ * per-CPU TSS segments. Threads are completely 'soft' on Linux,
4120+ * no more per-task TSS's.
4121+ */
4122+DEFINE_PER_CPU(struct tss_struct, init_tss) ____cacheline_internodealigned_in_smp = INIT_TSS;
4123+#endif
4124+
4125diff -Nur linux-2.6.16.33-noxen/arch/i386/kernel/io_apic-xen.c linux-2.6.16.33/arch/i386/kernel/io_apic-xen.c
4126--- linux-2.6.16.33-noxen/arch/i386/kernel/io_apic-xen.c 1970-01-01 00:00:00.000000000 +0000
4127+++ linux-2.6.16.33/arch/i386/kernel/io_apic-xen.c 2007-01-08 15:00:45.000000000 +0000
4128@@ -0,0 +1,2748 @@
4129+/*
4130+ * Intel IO-APIC support for multi-Pentium hosts.
4131+ *
4132+ * Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar, Hajnalka Szabo
4133+ *
4134+ * Many thanks to Stig Venaas for trying out countless experimental
4135+ * patches and reporting/debugging problems patiently!
4136+ *
4137+ * (c) 1999, Multiple IO-APIC support, developed by
4138+ * Ken-ichi Yaku <yaku@css1.kbnes.nec.co.jp> and
4139+ * Hidemi Kishimoto <kisimoto@css1.kbnes.nec.co.jp>,
4140+ * further tested and cleaned up by Zach Brown <zab@redhat.com>
4141+ * and Ingo Molnar <mingo@redhat.com>
4142+ *
4143+ * Fixes
4144+ * Maciej W. Rozycki : Bits for genuine 82489DX APICs;
4145+ * thanks to Eric Gilmore
4146+ * and Rolf G. Tews
4147+ * for testing these extensively
4148+ * Paul Diefenbaugh : Added full ACPI support
4149+ */
4150+
4151+#include <linux/mm.h>
4152+#include <linux/interrupt.h>
4153+#include <linux/init.h>
4154+#include <linux/delay.h>
4155+#include <linux/sched.h>
4156+#include <linux/config.h>
4157+#include <linux/smp_lock.h>
4158+#include <linux/mc146818rtc.h>
4159+#include <linux/compiler.h>
4160+#include <linux/acpi.h>
4161+#include <linux/module.h>
4162+#include <linux/sysdev.h>
4163+
4164+#include <asm/io.h>
4165+#include <asm/smp.h>
4166+#include <asm/desc.h>
4167+#include <asm/timer.h>
4168+#include <asm/i8259.h>
4169+
4170+#include <mach_apic.h>
4171+
4172+#include "io_ports.h"
4173+
4174+#ifdef CONFIG_XEN
4175+
4176+#include <xen/interface/xen.h>
4177+#include <xen/interface/physdev.h>
4178+
4179+/* Fake i8259 */
4180+#define make_8259A_irq(_irq) (io_apic_irqs &= ~(1UL<<(_irq)))
4181+#define disable_8259A_irq(_irq) ((void)0)
4182+#define i8259A_irq_pending(_irq) (0)
4183+
4184+unsigned long io_apic_irqs;
4185+
4186+static inline unsigned int xen_io_apic_read(unsigned int apic, unsigned int reg)
4187+{
4188+ struct physdev_apic apic_op;
4189+ int ret;
4190+
4191+ apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr;
4192+ apic_op.reg = reg;
4193+ ret = HYPERVISOR_physdev_op(PHYSDEVOP_apic_read, &apic_op);
4194+ if (ret)
4195+ return ret;
4196+ return apic_op.value;
4197+}
4198+
4199+static inline void xen_io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)
4200+{
4201+ struct physdev_apic apic_op;
4202+
4203+ apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr;
4204+ apic_op.reg = reg;
4205+ apic_op.value = value;
4206+ HYPERVISOR_physdev_op(PHYSDEVOP_apic_write, &apic_op);
4207+}
4208+
4209+#define io_apic_read(a,r) xen_io_apic_read(a,r)
4210+#define io_apic_write(a,r,v) xen_io_apic_write(a,r,v)
4211+
4212+#endif /* CONFIG_XEN */
4213+
4214+int (*ioapic_renumber_irq)(int ioapic, int irq);
4215+atomic_t irq_mis_count;
4216+
4217+/* Where if anywhere is the i8259 connect in external int mode */
4218+static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
4219+
4220+static DEFINE_SPINLOCK(ioapic_lock);
4221+
4222+int timer_over_8254 __initdata = 1;
4223+
4224+/*
4225+ * Is the SiS APIC rmw bug present ?
4226+ * -1 = don't know, 0 = no, 1 = yes
4227+ */
4228+int sis_apic_bug = -1;
4229+
4230+/*
4231+ * # of IRQ routing registers
4232+ */
4233+int nr_ioapic_registers[MAX_IO_APICS];
4234+
4235+int disable_timer_pin_1 __initdata;
4236+
4237+/*
4238+ * Rough estimation of how many shared IRQs there are, can
4239+ * be changed anytime.
4240+ */
4241+#define MAX_PLUS_SHARED_IRQS NR_IRQS
4242+#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS)
4243+
4244+/*
4245+ * This is performance-critical, we want to do it O(1)
4246+ *
4247+ * the indexing order of this array favors 1:1 mappings
4248+ * between pins and IRQs.
4249+ */
4250+
4251+static struct irq_pin_list {
4252+ int apic, pin, next;
4253+} irq_2_pin[PIN_MAP_SIZE];
4254+
4255+int vector_irq[NR_VECTORS] __read_mostly = { [0 ... NR_VECTORS - 1] = -1};
4256+#ifdef CONFIG_PCI_MSI
4257+#define vector_to_irq(vector) \
4258+ (platform_legacy_irq(vector) ? vector : vector_irq[vector])
4259+#else
4260+#define vector_to_irq(vector) (vector)
4261+#endif
4262+
4263+/*
4264+ * The common case is 1:1 IRQ<->pin mappings. Sometimes there are
4265+ * shared ISA-space IRQs, so we have to support them. We are super
4266+ * fast in the common case, and fast for shared ISA-space IRQs.
4267+ */
4268+static void add_pin_to_irq(unsigned int irq, int apic, int pin)
4269+{
4270+ static int first_free_entry = NR_IRQS;
4271+ struct irq_pin_list *entry = irq_2_pin + irq;
4272+
4273+ while (entry->next)
4274+ entry = irq_2_pin + entry->next;
4275+
4276+ if (entry->pin != -1) {
4277+ entry->next = first_free_entry;
4278+ entry = irq_2_pin + entry->next;
4279+ if (++first_free_entry >= PIN_MAP_SIZE)
4280+ panic("io_apic.c: whoops");
4281+ }
4282+ entry->apic = apic;
4283+ entry->pin = pin;
4284+}
4285+
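
add_pin_to_irq() above chains multiple (apic, pin) pairs onto one IRQ without pointers: slot [irq] of irq_2_pin[] is the list head, overflow entries come from the slots past NR_IRQS, and the next field holds an array index (the real code panics if it runs out of slots). A cut-down, runnable sketch of the same structure; the sizes and demo values are made up:

#include <stdio.h>

#define NR_IRQS      8
#define PIN_MAP_SIZE (NR_IRQS * 2)

struct pin_entry { int apic, pin, next; };

static struct pin_entry irq_2_pin[PIN_MAP_SIZE];
static int first_free = NR_IRQS;

static void add_pin_to_irq(int irq, int apic, int pin)
{
	struct pin_entry *entry = &irq_2_pin[irq];

	while (entry->next)
		entry = &irq_2_pin[entry->next];

	if (entry->pin != -1) {                 /* head already used: chain a new slot */
		entry->next = first_free++;
		entry = &irq_2_pin[entry->next];
	}
	entry->apic = apic;
	entry->pin  = pin;
}

int main(void)
{
	for (int i = 0; i < PIN_MAP_SIZE; i++)
		irq_2_pin[i] = (struct pin_entry){ .pin = -1 };

	add_pin_to_irq(3, 0, 11);
	add_pin_to_irq(3, 1, 2);                /* IRQ 3 shared by a second pin */

	for (struct pin_entry *e = &irq_2_pin[3]; ; e = &irq_2_pin[e->next]) {
		printf("irq 3 -> apic %d pin %d\n", e->apic, e->pin);
		if (!e->next)
			break;
	}
	return 0;
}
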
4286+#ifdef CONFIG_XEN
4287+#define clear_IO_APIC() ((void)0)
4288+#else
4289+/*
4290+ * Reroute an IRQ to a different pin.
4291+ */
4292+static void __init replace_pin_at_irq(unsigned int irq,
4293+ int oldapic, int oldpin,
4294+ int newapic, int newpin)
4295+{
4296+ struct irq_pin_list *entry = irq_2_pin + irq;
4297+
4298+ while (1) {
4299+ if (entry->apic == oldapic && entry->pin == oldpin) {
4300+ entry->apic = newapic;
4301+ entry->pin = newpin;
4302+ }
4303+ if (!entry->next)
4304+ break;
4305+ entry = irq_2_pin + entry->next;
4306+ }
4307+}
4308+
4309+static void __modify_IO_APIC_irq (unsigned int irq, unsigned long enable, unsigned long disable)
4310+{
4311+ struct irq_pin_list *entry = irq_2_pin + irq;
4312+ unsigned int pin, reg;
4313+
4314+ for (;;) {
4315+ pin = entry->pin;
4316+ if (pin == -1)
4317+ break;
4318+ reg = io_apic_read(entry->apic, 0x10 + pin*2);
4319+ reg &= ~disable;
4320+ reg |= enable;
4321+ io_apic_modify(entry->apic, 0x10 + pin*2, reg);
4322+ if (!entry->next)
4323+ break;
4324+ entry = irq_2_pin + entry->next;
4325+ }
4326+}
4327+
4328+/* mask = 1 */
4329+static void __mask_IO_APIC_irq (unsigned int irq)
4330+{
4331+ __modify_IO_APIC_irq(irq, 0x00010000, 0);
4332+}
4333+
4334+/* mask = 0 */
4335+static void __unmask_IO_APIC_irq (unsigned int irq)
4336+{
4337+ __modify_IO_APIC_irq(irq, 0, 0x00010000);
4338+}
4339+
4340+/* mask = 1, trigger = 0 */
4341+static void __mask_and_edge_IO_APIC_irq (unsigned int irq)
4342+{
4343+ __modify_IO_APIC_irq(irq, 0x00010000, 0x00008000);
4344+}
4345+
4346+/* mask = 0, trigger = 1 */
4347+static void __unmask_and_level_IO_APIC_irq (unsigned int irq)
4348+{
4349+ __modify_IO_APIC_irq(irq, 0x00008000, 0x00010000);
4350+}
4351+
4352+static void mask_IO_APIC_irq (unsigned int irq)
4353+{
4354+ unsigned long flags;
4355+
4356+ spin_lock_irqsave(&ioapic_lock, flags);
4357+ __mask_IO_APIC_irq(irq);
4358+ spin_unlock_irqrestore(&ioapic_lock, flags);
4359+}
4360+
4361+static void unmask_IO_APIC_irq (unsigned int irq)
4362+{
4363+ unsigned long flags;
4364+
4365+ spin_lock_irqsave(&ioapic_lock, flags);
4366+ __unmask_IO_APIC_irq(irq);
4367+ spin_unlock_irqrestore(&ioapic_lock, flags);
4368+}
4369+
4370+static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
4371+{
4372+ struct IO_APIC_route_entry entry;
4373+ unsigned long flags;
4374+
4375+ /* Check delivery_mode to be sure we're not clearing an SMI pin */
4376+ spin_lock_irqsave(&ioapic_lock, flags);
4377+ *(((int*)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin);
4378+ *(((int*)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin);
4379+ spin_unlock_irqrestore(&ioapic_lock, flags);
4380+ if (entry.delivery_mode == dest_SMI)
4381+ return;
4382+
4383+ /*
4384+ * Disable it in the IO-APIC irq-routing table:
4385+ */
4386+ memset(&entry, 0, sizeof(entry));
4387+ entry.mask = 1;
4388+ spin_lock_irqsave(&ioapic_lock, flags);
4389+ io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry) + 0));
4390+ io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry) + 1));
4391+ spin_unlock_irqrestore(&ioapic_lock, flags);
4392+}
4393+
4394+static void clear_IO_APIC (void)
4395+{
4396+ int apic, pin;
4397+
4398+ for (apic = 0; apic < nr_ioapics; apic++)
4399+ for (pin = 0; pin < nr_ioapic_registers[apic]; pin++)
4400+ clear_IO_APIC_pin(apic, pin);
4401+}
4402+
4403+#ifdef CONFIG_SMP
4404+static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask)
4405+{
4406+ unsigned long flags;
4407+ int pin;
4408+ struct irq_pin_list *entry = irq_2_pin + irq;
4409+ unsigned int apicid_value;
4410+ cpumask_t tmp;
4411+
4412+ cpus_and(tmp, cpumask, cpu_online_map);
4413+ if (cpus_empty(tmp))
4414+ tmp = TARGET_CPUS;
4415+
4416+ cpus_and(cpumask, tmp, CPU_MASK_ALL);
4417+
4418+ apicid_value = cpu_mask_to_apicid(cpumask);
4419+ /* Prepare to do the io_apic_write */
4420+ apicid_value = apicid_value << 24;
4421+ spin_lock_irqsave(&ioapic_lock, flags);
4422+ for (;;) {
4423+ pin = entry->pin;
4424+ if (pin == -1)
4425+ break;
4426+ io_apic_write(entry->apic, 0x10 + 1 + pin*2, apicid_value);
4427+ if (!entry->next)
4428+ break;
4429+ entry = irq_2_pin + entry->next;
4430+ }
4431+ set_irq_info(irq, cpumask);
4432+ spin_unlock_irqrestore(&ioapic_lock, flags);
4433+}
4434+
4435+#if defined(CONFIG_IRQBALANCE)
4436+# include <asm/processor.h> /* kernel_thread() */
4437+# include <linux/kernel_stat.h> /* kstat */
4438+# include <linux/slab.h> /* kmalloc() */
4439+# include <linux/timer.h> /* time_after() */
4440+
4441+# ifdef CONFIG_BALANCED_IRQ_DEBUG
4442+# define TDprintk(x...) do { printk("<%ld:%s:%d>: ", jiffies, __FILE__, __LINE__); printk(x); } while (0)
4443+# define Dprintk(x...) do { TDprintk(x); } while (0)
4444+# else
4445+# define TDprintk(x...)
4446+# define Dprintk(x...)
4447+# endif
4448+
4449+
4450+#define IRQBALANCE_CHECK_ARCH -999
4451+static int irqbalance_disabled = IRQBALANCE_CHECK_ARCH;
4452+static int physical_balance = 0;
4453+
4454+static struct irq_cpu_info {
4455+ unsigned long * last_irq;
4456+ unsigned long * irq_delta;
4457+ unsigned long irq;
4458+} irq_cpu_data[NR_CPUS];
4459+
4460+#define CPU_IRQ(cpu) (irq_cpu_data[cpu].irq)
4461+#define LAST_CPU_IRQ(cpu,irq) (irq_cpu_data[cpu].last_irq[irq])
4462+#define IRQ_DELTA(cpu,irq) (irq_cpu_data[cpu].irq_delta[irq])
4463+
4464+#define IDLE_ENOUGH(cpu,now) \
4465+ (idle_cpu(cpu) && ((now) - per_cpu(irq_stat, (cpu)).idle_timestamp > 1))
4466+
4467+#define IRQ_ALLOWED(cpu, allowed_mask) cpu_isset(cpu, allowed_mask)
4468+
4469+#define CPU_TO_PACKAGEINDEX(i) (first_cpu(cpu_sibling_map[i]))
4470+
4471+#define MAX_BALANCED_IRQ_INTERVAL (5*HZ)
4472+#define MIN_BALANCED_IRQ_INTERVAL (HZ/2)
4473+#define BALANCED_IRQ_MORE_DELTA (HZ/10)
4474+#define BALANCED_IRQ_LESS_DELTA (HZ)
4475+
4476+static long balanced_irq_interval = MAX_BALANCED_IRQ_INTERVAL;
4477+
4478+static unsigned long move(int curr_cpu, cpumask_t allowed_mask,
4479+ unsigned long now, int direction)
4480+{
4481+ int search_idle = 1;
4482+ int cpu = curr_cpu;
4483+
4484+ goto inside;
4485+
4486+ do {
4487+ if (unlikely(cpu == curr_cpu))
4488+ search_idle = 0;
4489+inside:
4490+ if (direction == 1) {
4491+ cpu++;
4492+ if (cpu >= NR_CPUS)
4493+ cpu = 0;
4494+ } else {
4495+ cpu--;
4496+ if (cpu == -1)
4497+ cpu = NR_CPUS-1;
4498+ }
4499+ } while (!cpu_online(cpu) || !IRQ_ALLOWED(cpu,allowed_mask) ||
4500+ (search_idle && !IDLE_ENOUGH(cpu,now)));
4501+
4502+ return cpu;
4503+}
4504+
4505+static inline void balance_irq(int cpu, int irq)
4506+{
4507+ unsigned long now = jiffies;
4508+ cpumask_t allowed_mask;
4509+ unsigned int new_cpu;
4510+
4511+ if (irqbalance_disabled)
4512+ return;
4513+
4514+ cpus_and(allowed_mask, cpu_online_map, irq_affinity[irq]);
4515+ new_cpu = move(cpu, allowed_mask, now, 1);
4516+ if (cpu != new_cpu) {
4517+ set_pending_irq(irq, cpumask_of_cpu(new_cpu));
4518+ }
4519+}
4520+
4521+static inline void rotate_irqs_among_cpus(unsigned long useful_load_threshold)
4522+{
4523+ int i, j;
4524+ Dprintk("Rotating IRQs among CPUs.\n");
4525+ for (i = 0; i < NR_CPUS; i++) {
4526+ for (j = 0; cpu_online(i) && (j < NR_IRQS); j++) {
4527+ if (!irq_desc[j].action)
4528+ continue;
4529+ /* Is it a significant load ? */
4530+ if (IRQ_DELTA(CPU_TO_PACKAGEINDEX(i),j) <
4531+ useful_load_threshold)
4532+ continue;
4533+ balance_irq(i, j);
4534+ }
4535+ }
4536+ balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL,
4537+ balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);
4538+ return;
4539+}
4540+
4541+static void do_irq_balance(void)
4542+{
4543+ int i, j;
4544+ unsigned long max_cpu_irq = 0, min_cpu_irq = (~0);
4545+ unsigned long move_this_load = 0;
4546+ int max_loaded = 0, min_loaded = 0;
4547+ int load;
4548+ unsigned long useful_load_threshold = balanced_irq_interval + 10;
4549+ int selected_irq;
4550+ int tmp_loaded, first_attempt = 1;
4551+ unsigned long tmp_cpu_irq;
4552+ unsigned long imbalance = 0;
4553+ cpumask_t allowed_mask, target_cpu_mask, tmp;
4554+
4555+ for (i = 0; i < NR_CPUS; i++) {
4556+ int package_index;
4557+ CPU_IRQ(i) = 0;
4558+ if (!cpu_online(i))
4559+ continue;
4560+ package_index = CPU_TO_PACKAGEINDEX(i);
4561+ for (j = 0; j < NR_IRQS; j++) {
4562+ unsigned long value_now, delta;
4563+ /* Is this an active IRQ? */
4564+ if (!irq_desc[j].action)
4565+ continue;
4566+ if ( package_index == i )
4567+ IRQ_DELTA(package_index,j) = 0;
4568+ /* Determine the total count per processor per IRQ */
4569+ value_now = (unsigned long) kstat_cpu(i).irqs[j];
4570+
4571+ /* Determine the activity per processor per IRQ */
4572+ delta = value_now - LAST_CPU_IRQ(i,j);
4573+
4574+ /* Update last_cpu_irq[][] for the next time */
4575+ LAST_CPU_IRQ(i,j) = value_now;
4576+
4577+ /* Ignore IRQs whose rate is less than the clock */
4578+ if (delta < useful_load_threshold)
4579+ continue;
4580+ /* update the load for the processor or package total */
4581+ IRQ_DELTA(package_index,j) += delta;
4582+
4583+ /* Keep track of the higher numbered sibling as well */
4584+ if (i != package_index)
4585+ CPU_IRQ(i) += delta;
4586+ /*
4587+ * We have sibling A and sibling B in the package
4588+ *
4589+ * cpu_irq[A] = load for cpu A + load for cpu B
4590+ * cpu_irq[B] = load for cpu B
4591+ */
4592+ CPU_IRQ(package_index) += delta;
4593+ }
4594+ }
4595+ /* Find the least loaded processor package */
4596+ for (i = 0; i < NR_CPUS; i++) {
4597+ if (!cpu_online(i))
4598+ continue;
4599+ if (i != CPU_TO_PACKAGEINDEX(i))
4600+ continue;
4601+ if (min_cpu_irq > CPU_IRQ(i)) {
4602+ min_cpu_irq = CPU_IRQ(i);
4603+ min_loaded = i;
4604+ }
4605+ }
4606+ max_cpu_irq = ULONG_MAX;
4607+
4608+tryanothercpu:
4609+ /* Look for heaviest loaded processor.
4610+ * We may come back to get the next heaviest loaded processor.
4611+ * Skip processors with trivial loads.
4612+ */
4613+ tmp_cpu_irq = 0;
4614+ tmp_loaded = -1;
4615+ for (i = 0; i < NR_CPUS; i++) {
4616+ if (!cpu_online(i))
4617+ continue;
4618+ if (i != CPU_TO_PACKAGEINDEX(i))
4619+ continue;
4620+ if (max_cpu_irq <= CPU_IRQ(i))
4621+ continue;
4622+ if (tmp_cpu_irq < CPU_IRQ(i)) {
4623+ tmp_cpu_irq = CPU_IRQ(i);
4624+ tmp_loaded = i;
4625+ }
4626+ }
4627+
4628+ if (tmp_loaded == -1) {
4629+ /* With only a small number of heavy interrupt sources, some
4630+ * CPUs end up loaded too heavily. We use Ingo's original
4631+ * approach and rotate the IRQs among the CPUs.
4632+ */
4633+ if (!first_attempt && imbalance >= useful_load_threshold) {
4634+ rotate_irqs_among_cpus(useful_load_threshold);
4635+ return;
4636+ }
4637+ goto not_worth_the_effort;
4638+ }
4639+
4640+ first_attempt = 0; /* heaviest search */
4641+ max_cpu_irq = tmp_cpu_irq; /* load */
4642+ max_loaded = tmp_loaded; /* processor */
4643+ imbalance = (max_cpu_irq - min_cpu_irq) / 2;
4644+
4645+ Dprintk("max_loaded cpu = %d\n", max_loaded);
4646+ Dprintk("min_loaded cpu = %d\n", min_loaded);
4647+ Dprintk("max_cpu_irq load = %ld\n", max_cpu_irq);
4648+ Dprintk("min_cpu_irq load = %ld\n", min_cpu_irq);
4649+ Dprintk("load imbalance = %lu\n", imbalance);
4650+
4651+ /* if imbalance is less than approx 10% of max load, then
4652+ * we are seeing diminishing returns, so quit.
4653+ */
4654+ if (imbalance < (max_cpu_irq >> 3)) {
4655+ Dprintk("Imbalance too trivial\n");
4656+ goto not_worth_the_effort;
4657+ }
4658+
4659+tryanotherirq:
4660+ /* if we select an IRQ to move that can't go where we want, then
4661+ * see if there is another one to try.
4662+ */
4663+ move_this_load = 0;
4664+ selected_irq = -1;
4665+ for (j = 0; j < NR_IRQS; j++) {
4666+ /* Is this an active IRQ? */
4667+ if (!irq_desc[j].action)
4668+ continue;
4669+ if (imbalance <= IRQ_DELTA(max_loaded,j))
4670+ continue;
4671+ /* Try to find the IRQ that is closest to the imbalance
4672+ * without going over.
4673+ */
4674+ if (move_this_load < IRQ_DELTA(max_loaded,j)) {
4675+ move_this_load = IRQ_DELTA(max_loaded,j);
4676+ selected_irq = j;
4677+ }
4678+ }
4679+ if (selected_irq == -1) {
4680+ goto tryanothercpu;
4681+ }
4682+
4683+ imbalance = move_this_load;
4684+
4685+ /* For the physical_balance case, we accumulated both load
4686+ * values in one of the siblings' cpu_irq[],
4687+ * to use the same code for physical and logical processors
4688+ * as much as possible.
4689+ *
4690+ * NOTE: the cpu_irq[] array holds the sum of the load for
4691+ * sibling A and sibling B in the slot for the lowest numbered
4692+ * sibling (A), _AND_ the load for sibling B in the slot for
4693+ * the higher numbered sibling.
4694+ *
4695+ * We seek the least loaded sibling by making the comparison
4696+ * (A+B)/2 vs B
4697+ */
4698+ load = CPU_IRQ(min_loaded) >> 1;
4699+ for_each_cpu_mask(j, cpu_sibling_map[min_loaded]) {
4700+ if (load > CPU_IRQ(j)) {
4701+ /* This won't change cpu_sibling_map[min_loaded] */
4702+ load = CPU_IRQ(j);
4703+ min_loaded = j;
4704+ }
4705+ }
4706+
4707+ cpus_and(allowed_mask, cpu_online_map, irq_affinity[selected_irq]);
4708+ target_cpu_mask = cpumask_of_cpu(min_loaded);
4709+ cpus_and(tmp, target_cpu_mask, allowed_mask);
4710+
4711+ if (!cpus_empty(tmp)) {
4712+
4713+ Dprintk("irq = %d moved to cpu = %d\n",
4714+ selected_irq, min_loaded);
4715+ /* mark for change destination */
4716+ set_pending_irq(selected_irq, cpumask_of_cpu(min_loaded));
4717+
4718+ /* Since we made a change, come back sooner to
4719+ * check for more variation.
4720+ */
4721+ balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL,
4722+ balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);
4723+ return;
4724+ }
4725+ goto tryanotherirq;
4726+
4727+not_worth_the_effort:
4728+ /*
4729+ * if we did not find an IRQ to move, then adjust the time interval
4730+ * upward
4731+ */
4732+ balanced_irq_interval = min((long)MAX_BALANCED_IRQ_INTERVAL,
4733+ balanced_irq_interval + BALANCED_IRQ_MORE_DELTA);
4734+ Dprintk("IRQ worth rotating not found\n");
4735+ return;
4736+}
4737+
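
The core of do_irq_balance() above is a simple heuristic: find the most and least loaded CPU packages, call half their difference the imbalance, and move the IRQ on the busy CPU whose recent activity is the largest value still below that imbalance, so the move never overshoots. A toy model of just that selection step, with made-up load figures (the real code additionally honours irq_affinity masks and hyper-threaded sibling pairs before committing the move):

#include <stdio.h>

#define NCPU 4
#define NIRQ 8

int main(void)
{
	unsigned long cpu_load[NCPU]        = { 900, 120, 300, 150 };
	unsigned long irq_delta[NCPU][NIRQ] = {
		[0] = { 400, 250, 150, 60, 40, 0, 0, 0 },
	};

	int max_cpu = 0, min_cpu = 0;
	for (int i = 1; i < NCPU; i++) {
		if (cpu_load[i] > cpu_load[max_cpu]) max_cpu = i;
		if (cpu_load[i] < cpu_load[min_cpu]) min_cpu = i;
	}

	unsigned long imbalance = (cpu_load[max_cpu] - cpu_load[min_cpu]) / 2;

	int selected = -1;
	unsigned long best = 0;
	for (int j = 0; j < NIRQ; j++) {
		unsigned long d = irq_delta[max_cpu][j];
		if (d == 0 || d >= imbalance)   /* skip idle IRQs and moves that overshoot */
			continue;
		if (d > best) {
			best = d;
			selected = j;
		}
	}

	printf("imbalance=%lu -> move IRQ %d (delta %lu) from cpu%d to cpu%d\n",
	       imbalance, selected, best, max_cpu, min_cpu);
	return 0;
}
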
4738+static int balanced_irq(void *unused)
4739+{
4740+ int i;
4741+ unsigned long prev_balance_time = jiffies;
4742+ long time_remaining = balanced_irq_interval;
4743+
4744+ daemonize("kirqd");
4745+
4746+ /* push everything to CPU 0 to give us a starting point. */
4747+ for (i = 0 ; i < NR_IRQS ; i++) {
4748+ pending_irq_cpumask[i] = cpumask_of_cpu(0);
4749+ set_pending_irq(i, cpumask_of_cpu(0));
4750+ }
4751+
4752+ for ( ; ; ) {
4753+ time_remaining = schedule_timeout_interruptible(time_remaining);
4754+ try_to_freeze();
4755+ if (time_after(jiffies,
4756+ prev_balance_time+balanced_irq_interval)) {
4757+ preempt_disable();
4758+ do_irq_balance();
4759+ prev_balance_time = jiffies;
4760+ time_remaining = balanced_irq_interval;
4761+ preempt_enable();
4762+ }
4763+ }
4764+ return 0;
4765+}
4766+
4767+static int __init balanced_irq_init(void)
4768+{
4769+ int i;
4770+ struct cpuinfo_x86 *c;
4771+ cpumask_t tmp;
4772+
4773+ cpus_shift_right(tmp, cpu_online_map, 2);
4774+ c = &boot_cpu_data;
4775+ /* When not overridden on the command line, ask the subarchitecture. */
4776+ if (irqbalance_disabled == IRQBALANCE_CHECK_ARCH)
4777+ irqbalance_disabled = NO_BALANCE_IRQ;
4778+ if (irqbalance_disabled)
4779+ return 0;
4780+
4781+ /* disable irqbalance completely if there is only one processor online */
4782+ if (num_online_cpus() < 2) {
4783+ irqbalance_disabled = 1;
4784+ return 0;
4785+ }
4786+ /*
4787+ * Enable physical balance only if more than 1 physical processor
4788+ * is present
4789+ */
4790+ if (smp_num_siblings > 1 && !cpus_empty(tmp))
4791+ physical_balance = 1;
4792+
4793+ for (i = 0; i < NR_CPUS; i++) {
4794+ if (!cpu_online(i))
4795+ continue;
4796+ irq_cpu_data[i].irq_delta = kmalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
4797+ irq_cpu_data[i].last_irq = kmalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
4798+ if (irq_cpu_data[i].irq_delta == NULL || irq_cpu_data[i].last_irq == NULL) {
4799+ printk(KERN_ERR "balanced_irq_init: out of memory");
4800+ goto failed;
4801+ }
4802+ memset(irq_cpu_data[i].irq_delta,0,sizeof(unsigned long) * NR_IRQS);
4803+ memset(irq_cpu_data[i].last_irq,0,sizeof(unsigned long) * NR_IRQS);
4804+ }
4805+
4806+ printk(KERN_INFO "Starting balanced_irq\n");
4807+ if (kernel_thread(balanced_irq, NULL, CLONE_KERNEL) >= 0)
4808+ return 0;
4809+ else
4810+ printk(KERN_ERR "balanced_irq_init: failed to spawn balanced_irq");
4811+failed:
4812+ for (i = 0; i < NR_CPUS; i++) {
4813+ kfree(irq_cpu_data[i].irq_delta);
4814+ kfree(irq_cpu_data[i].last_irq);
4815+ }
4816+ return 0;
4817+}
4818+
4819+int __init irqbalance_disable(char *str)
4820+{
4821+ irqbalance_disabled = 1;
4822+ return 0;
4823+}
4824+
4825+__setup("noirqbalance", irqbalance_disable);
4826+
4827+late_initcall(balanced_irq_init);
4828+#endif /* CONFIG_IRQBALANCE */
4829+#endif /* CONFIG_SMP */
4830+#endif
4831+
4832+#ifndef CONFIG_SMP
4833+void fastcall send_IPI_self(int vector)
4834+{
4835+#ifndef CONFIG_XEN
4836+ unsigned int cfg;
4837+
4838+ /*
4839+ * Wait for idle.
4840+ */
4841+ apic_wait_icr_idle();
4842+ cfg = APIC_DM_FIXED | APIC_DEST_SELF | vector | APIC_DEST_LOGICAL;
4843+ /*
4844+ * Send the IPI. The write to APIC_ICR fires this off.
4845+ */
4846+ apic_write_around(APIC_ICR, cfg);
4847+#endif
4848+}
4849+#endif /* !CONFIG_SMP */
4850+
4851+
4852+/*
4853+ * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to
4854+ * specific CPU-side IRQs.
4855+ */
4856+
4857+#define MAX_PIRQS 8
4858+static int pirq_entries [MAX_PIRQS];
4859+static int pirqs_enabled;
4860+int skip_ioapic_setup;
4861+
4862+static int __init ioapic_setup(char *str)
4863+{
4864+ skip_ioapic_setup = 1;
4865+ return 1;
4866+}
4867+
4868+__setup("noapic", ioapic_setup);
4869+
4870+static int __init ioapic_pirq_setup(char *str)
4871+{
4872+ int i, max;
4873+ int ints[MAX_PIRQS+1];
4874+
4875+ get_options(str, ARRAY_SIZE(ints), ints);
4876+
4877+ for (i = 0; i < MAX_PIRQS; i++)
4878+ pirq_entries[i] = -1;
4879+
4880+ pirqs_enabled = 1;
4881+ apic_printk(APIC_VERBOSE, KERN_INFO
4882+ "PIRQ redirection, working around broken MP-BIOS.\n");
4883+ max = MAX_PIRQS;
4884+ if (ints[0] < MAX_PIRQS)
4885+ max = ints[0];
4886+
4887+ for (i = 0; i < max; i++) {
4888+ apic_printk(APIC_VERBOSE, KERN_DEBUG
4889+ "... PIRQ%d -> IRQ %d\n", i, ints[i+1]);
4890+ /*
4891+ * PIRQs are mapped upside down, usually.
4892+ */
4893+ pirq_entries[MAX_PIRQS-i-1] = ints[i+1];
4894+ }
4895+ return 1;
4896+}
4897+
4898+__setup("pirq=", ioapic_pirq_setup);
4899+
4900+/*
4901+ * Find the IRQ entry number of a certain pin.
4902+ */
4903+static int find_irq_entry(int apic, int pin, int type)
4904+{
4905+ int i;
4906+
4907+ for (i = 0; i < mp_irq_entries; i++)
4908+ if (mp_irqs[i].mpc_irqtype == type &&
4909+ (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mpc_apicid ||
4910+ mp_irqs[i].mpc_dstapic == MP_APIC_ALL) &&
4911+ mp_irqs[i].mpc_dstirq == pin)
4912+ return i;
4913+
4914+ return -1;
4915+}
4916+
4917+/*
4918+ * Find the pin to which IRQ[irq] (ISA) is connected
4919+ */
4920+static int __init find_isa_irq_pin(int irq, int type)
4921+{
4922+ int i;
4923+
4924+ for (i = 0; i < mp_irq_entries; i++) {
4925+ int lbus = mp_irqs[i].mpc_srcbus;
4926+
4927+ if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA ||
4928+ mp_bus_id_to_type[lbus] == MP_BUS_EISA ||
4929+ mp_bus_id_to_type[lbus] == MP_BUS_MCA ||
4930+ mp_bus_id_to_type[lbus] == MP_BUS_NEC98
4931+ ) &&
4932+ (mp_irqs[i].mpc_irqtype == type) &&
4933+ (mp_irqs[i].mpc_srcbusirq == irq))
4934+
4935+ return mp_irqs[i].mpc_dstirq;
4936+ }
4937+ return -1;
4938+}
4939+
4940+static int __init find_isa_irq_apic(int irq, int type)
4941+{
4942+ int i;
4943+
4944+ for (i = 0; i < mp_irq_entries; i++) {
4945+ int lbus = mp_irqs[i].mpc_srcbus;
4946+
4947+ if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA ||
4948+ mp_bus_id_to_type[lbus] == MP_BUS_EISA ||
4949+ mp_bus_id_to_type[lbus] == MP_BUS_MCA ||
4950+ mp_bus_id_to_type[lbus] == MP_BUS_NEC98
4951+ ) &&
4952+ (mp_irqs[i].mpc_irqtype == type) &&
4953+ (mp_irqs[i].mpc_srcbusirq == irq))
4954+ break;
4955+ }
4956+ if (i < mp_irq_entries) {
4957+ int apic;
4958+ for(apic = 0; apic < nr_ioapics; apic++) {
4959+ if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic)
4960+ return apic;
4961+ }
4962+ }
4963+
4964+ return -1;
4965+}
4966+
4967+/*
4968+ * Find a specific PCI IRQ entry.
4969+ * Not an __init, possibly needed by modules
4970+ */
4971+static int pin_2_irq(int idx, int apic, int pin);
4972+
4973+int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
4974+{
4975+ int apic, i, best_guess = -1;
4976+
4977+ apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, "
4978+ "slot:%d, pin:%d.\n", bus, slot, pin);
4979+ if (mp_bus_id_to_pci_bus[bus] == -1) {
4980+ printk(KERN_WARNING "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
4981+ return -1;
4982+ }
4983+ for (i = 0; i < mp_irq_entries; i++) {
4984+ int lbus = mp_irqs[i].mpc_srcbus;
4985+
4986+ for (apic = 0; apic < nr_ioapics; apic++)
4987+ if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic ||
4988+ mp_irqs[i].mpc_dstapic == MP_APIC_ALL)
4989+ break;
4990+
4991+ if ((mp_bus_id_to_type[lbus] == MP_BUS_PCI) &&
4992+ !mp_irqs[i].mpc_irqtype &&
4993+ (bus == lbus) &&
4994+ (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) {
4995+ int irq = pin_2_irq(i,apic,mp_irqs[i].mpc_dstirq);
4996+
4997+ if (!(apic || IO_APIC_IRQ(irq)))
4998+ continue;
4999+
5000+ if (pin == (mp_irqs[i].mpc_srcbusirq & 3))
5001+ return irq;
5002+ /*
5003+ * Use the first all-but-pin matching entry as a
5004+ * best-guess fuzzy result for broken mptables.
5005+ */
5006+ if (best_guess < 0)
5007+ best_guess = irq;
5008+ }
5009+ }
5010+ return best_guess;
5011+}
5012+EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector);
5013+
5014+/*
5015+ * This function is currently only a helper for the i386 SMP boot process, where
5016+ * we need to reprogram the ioredtbls to cater for the CPUs which have come online,
5017+ * so the mask in all cases should simply be TARGET_CPUS
5018+ */
5019+#ifdef CONFIG_SMP
5020+#ifndef CONFIG_XEN
5021+void __init setup_ioapic_dest(void)
5022+{
5023+ int pin, ioapic, irq, irq_entry;
5024+
5025+ if (skip_ioapic_setup == 1)
5026+ return;
5027+
5028+ for (ioapic = 0; ioapic < nr_ioapics; ioapic++) {
5029+ for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) {
5030+ irq_entry = find_irq_entry(ioapic, pin, mp_INT);
5031+ if (irq_entry == -1)
5032+ continue;
5033+ irq = pin_2_irq(irq_entry, ioapic, pin);
5034+ set_ioapic_affinity_irq(irq, TARGET_CPUS);
5035+ }
5036+
5037+ }
5038+}
5039+#endif /* !CONFIG_XEN */
5040+#endif
5041+
5042+/*
5043+ * EISA Edge/Level control register, ELCR
5044+ */
5045+static int EISA_ELCR(unsigned int irq)
5046+{
5047+ if (irq < 16) {
5048+ unsigned int port = 0x4d0 + (irq >> 3);
5049+ return (inb(port) >> (irq & 7)) & 1;
5050+ }
5051+ apic_printk(APIC_VERBOSE, KERN_INFO
5052+ "Broken MPtable reports ISA irq %d\n", irq);
5053+ return 0;
5054+}
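
EISA_ELCR() above reads the Edge/Level Control Register, which is split across two 8-bit I/O ports: 0x4d0 covers IRQs 0-7 and 0x4d1 covers IRQs 8-15, one bit per IRQ (1 = level triggered). The port and bit fall out of 0x4d0 + (irq >> 3) and irq & 7; IRQ 10, for example, lands on port 0x4d1, bit 2. The same arithmetic as a tiny standalone program, minus the actual inb():

#include <stdio.h>

int main(void)
{
	for (int irq = 0; irq < 16; irq++) {
		unsigned port = 0x4d0 + (irq >> 3);     /* 0x4d0 or 0x4d1 */
		unsigned bit  = irq & 7;                /* bit within that byte */
		printf("irq %2d -> port 0x%03x, bit %u\n", irq, port, bit);
	}
	return 0;
}
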
5055+
5056+/* EISA interrupts are always polarity zero and can be edge or level
5057+ * trigger depending on the ELCR value. If an interrupt is listed as
5058+ * EISA conforming in the MP table, that means its trigger type must
5059+ * be read in from the ELCR */
5060+
5061+#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mpc_srcbusirq))
5062+#define default_EISA_polarity(idx) (0)
5063+
5064+/* ISA interrupts are always polarity zero edge triggered,
5065+ * when listed as conforming in the MP table. */
5066+
5067+#define default_ISA_trigger(idx) (0)
5068+#define default_ISA_polarity(idx) (0)
5069+
5070+/* PCI interrupts are always polarity one level triggered,
5071+ * when listed as conforming in the MP table. */
5072+
5073+#define default_PCI_trigger(idx) (1)
5074+#define default_PCI_polarity(idx) (1)
5075+
5076+/* MCA interrupts are always polarity zero level triggered,
5077+ * when listed as conforming in the MP table. */
5078+
5079+#define default_MCA_trigger(idx) (1)
5080+#define default_MCA_polarity(idx) (0)
5081+
5082+/* NEC98 interrupts are always polarity zero edge triggered,
5083+ * when listed as conforming in the MP table. */
5084+
5085+#define default_NEC98_trigger(idx) (0)
5086+#define default_NEC98_polarity(idx) (0)
5087+
5088+static int __init MPBIOS_polarity(int idx)
5089+{
5090+ int bus = mp_irqs[idx].mpc_srcbus;
5091+ int polarity;
5092+
5093+ /*
5094+ * Determine IRQ line polarity (high active or low active):
5095+ */
5096+ switch (mp_irqs[idx].mpc_irqflag & 3)
5097+ {
5098+ case 0: /* conforms, ie. bus-type dependent polarity */
5099+ {
5100+ switch (mp_bus_id_to_type[bus])
5101+ {
5102+ case MP_BUS_ISA: /* ISA pin */
5103+ {
5104+ polarity = default_ISA_polarity(idx);
5105+ break;
5106+ }
5107+ case MP_BUS_EISA: /* EISA pin */
5108+ {
5109+ polarity = default_EISA_polarity(idx);
5110+ break;
5111+ }
5112+ case MP_BUS_PCI: /* PCI pin */
5113+ {
5114+ polarity = default_PCI_polarity(idx);
5115+ break;
5116+ }
5117+ case MP_BUS_MCA: /* MCA pin */
5118+ {
5119+ polarity = default_MCA_polarity(idx);
5120+ break;
5121+ }
5122+ case MP_BUS_NEC98: /* NEC 98 pin */
5123+ {
5124+ polarity = default_NEC98_polarity(idx);
5125+ break;
5126+ }
5127+ default:
5128+ {
5129+ printk(KERN_WARNING "broken BIOS!!\n");
5130+ polarity = 1;
5131+ break;
5132+ }
5133+ }
5134+ break;
5135+ }
5136+ case 1: /* high active */
5137+ {
5138+ polarity = 0;
5139+ break;
5140+ }
5141+ case 2: /* reserved */
5142+ {
5143+ printk(KERN_WARNING "broken BIOS!!\n");
5144+ polarity = 1;
5145+ break;
5146+ }
5147+ case 3: /* low active */
5148+ {
5149+ polarity = 1;
5150+ break;
5151+ }
5152+ default: /* invalid */
5153+ {
5154+ printk(KERN_WARNING "broken BIOS!!\n");
5155+ polarity = 1;
5156+ break;
5157+ }
5158+ }
5159+ return polarity;
5160+}
5161+
5162+static int MPBIOS_trigger(int idx)
5163+{
5164+ int bus = mp_irqs[idx].mpc_srcbus;
5165+ int trigger;
5166+
5167+ /*
5168+ * Determine IRQ trigger mode (edge or level sensitive):
5169+ */
5170+ switch ((mp_irqs[idx].mpc_irqflag>>2) & 3)
5171+ {
5172+ case 0: /* conforms, ie. bus-type dependent */
5173+ {
5174+ switch (mp_bus_id_to_type[bus])
5175+ {
5176+ case MP_BUS_ISA: /* ISA pin */
5177+ {
5178+ trigger = default_ISA_trigger(idx);
5179+ break;
5180+ }
5181+ case MP_BUS_EISA: /* EISA pin */
5182+ {
5183+ trigger = default_EISA_trigger(idx);
5184+ break;
5185+ }
5186+ case MP_BUS_PCI: /* PCI pin */
5187+ {
5188+ trigger = default_PCI_trigger(idx);
5189+ break;
5190+ }
5191+ case MP_BUS_MCA: /* MCA pin */
5192+ {
5193+ trigger = default_MCA_trigger(idx);
5194+ break;
5195+ }
5196+ case MP_BUS_NEC98: /* NEC 98 pin */
5197+ {
5198+ trigger = default_NEC98_trigger(idx);
5199+ break;
5200+ }
5201+ default:
5202+ {
5203+ printk(KERN_WARNING "broken BIOS!!\n");
5204+ trigger = 1;
5205+ break;
5206+ }
5207+ }
5208+ break;
5209+ }
5210+ case 1: /* edge */
5211+ {
5212+ trigger = 0;
5213+ break;
5214+ }
5215+ case 2: /* reserved */
5216+ {
5217+ printk(KERN_WARNING "broken BIOS!!\n");
5218+ trigger = 1;
5219+ break;
5220+ }
5221+ case 3: /* level */
5222+ {
5223+ trigger = 1;
5224+ break;
5225+ }
5226+ default: /* invalid */
5227+ {
5228+ printk(KERN_WARNING "broken BIOS!!\n");
5229+ trigger = 0;
5230+ break;
5231+ }
5232+ }
5233+ return trigger;
5234+}
5235+
5236+static inline int irq_polarity(int idx)
5237+{
5238+ return MPBIOS_polarity(idx);
5239+}
5240+
5241+static inline int irq_trigger(int idx)
5242+{
5243+ return MPBIOS_trigger(idx);
5244+}
5245+
5246+static int pin_2_irq(int idx, int apic, int pin)
5247+{
5248+ int irq, i;
5249+ int bus = mp_irqs[idx].mpc_srcbus;
5250+
5251+ /*
5252+ * Debugging check, we are in big trouble if this message pops up!
5253+ */
5254+ if (mp_irqs[idx].mpc_dstirq != pin)
5255+ printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n");
5256+
5257+ switch (mp_bus_id_to_type[bus])
5258+ {
5259+ case MP_BUS_ISA: /* ISA pin */
5260+ case MP_BUS_EISA:
5261+ case MP_BUS_MCA:
5262+ case MP_BUS_NEC98:
5263+ {
5264+ irq = mp_irqs[idx].mpc_srcbusirq;
5265+ break;
5266+ }
5267+ case MP_BUS_PCI: /* PCI pin */
5268+ {
5269+ /*
5270+ * PCI IRQs are mapped in order
5271+ */
5272+ i = irq = 0;
5273+ while (i < apic)
5274+ irq += nr_ioapic_registers[i++];
5275+ irq += pin;
5276+
5277+ /*
5278+ * For MPS mode, so far only needed by ES7000 platform
5279+ */
5280+ if (ioapic_renumber_irq)
5281+ irq = ioapic_renumber_irq(apic, irq);
5282+
5283+ break;
5284+ }
5285+ default:
5286+ {
5287+ printk(KERN_ERR "unknown bus type %d.\n",bus);
5288+ irq = 0;
5289+ break;
5290+ }
5291+ }
5292+
5293+ /*
5294+ * PCI IRQ command line redirection. Yes, limits are hardcoded.
5295+ */
5296+ if ((pin >= 16) && (pin <= 23)) {
5297+ if (pirq_entries[pin-16] != -1) {
5298+ if (!pirq_entries[pin-16]) {
5299+ apic_printk(APIC_VERBOSE, KERN_DEBUG
5300+ "disabling PIRQ%d\n", pin-16);
5301+ } else {
5302+ irq = pirq_entries[pin-16];
5303+ apic_printk(APIC_VERBOSE, KERN_DEBUG
5304+ "using PIRQ%d -> IRQ %d\n",
5305+ pin-16, irq);
5306+ }
5307+ }
5308+ }
5309+ return irq;
5310+}
5311+
5312+static inline int IO_APIC_irq_trigger(int irq)
5313+{
5314+ int apic, idx, pin;
5315+
5316+ for (apic = 0; apic < nr_ioapics; apic++) {
5317+ for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
5318+ idx = find_irq_entry(apic,pin,mp_INT);
5319+ if ((idx != -1) && (irq == pin_2_irq(idx,apic,pin)))
5320+ return irq_trigger(idx);
5321+ }
5322+ }
5323+ /*
5324+ * nonexistent IRQs are edge default
5325+ */
5326+ return 0;
5327+}
5328+
5329+/* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. */
5330+u8 irq_vector[NR_IRQ_VECTORS] __read_mostly; /* = { FIRST_DEVICE_VECTOR , 0 }; */
5331+
5332+int assign_irq_vector(int irq)
5333+{
5334+ struct physdev_irq irq_op;
5335+
5336+ BUG_ON(irq >= NR_IRQ_VECTORS);
5337+ if (irq != AUTO_ASSIGN && IO_APIC_VECTOR(irq) > 0)
5338+ return IO_APIC_VECTOR(irq);
5339+
5340+ irq_op.irq = irq;
5341+ if (HYPERVISOR_physdev_op(PHYSDEVOP_alloc_irq_vector, &irq_op))
5342+ return -ENOSPC;
5343+
5344+ vector_irq[irq_op.vector] = irq;
5345+ if (irq != AUTO_ASSIGN)
5346+ IO_APIC_VECTOR(irq) = irq_op.vector;
5347+
5348+ return irq_op.vector;
5349+}
5350+
5351+#ifndef CONFIG_XEN
5352+static struct hw_interrupt_type ioapic_level_type;
5353+static struct hw_interrupt_type ioapic_edge_type;
5354+
5355+#define IOAPIC_AUTO -1
5356+#define IOAPIC_EDGE 0
5357+#define IOAPIC_LEVEL 1
5358+
5359+static inline void ioapic_register_intr(int irq, int vector, unsigned long trigger)
5360+{
5361+ if (use_pci_vector() && !platform_legacy_irq(irq)) {
5362+ if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
5363+ trigger == IOAPIC_LEVEL)
5364+ irq_desc[vector].handler = &ioapic_level_type;
5365+ else
5366+ irq_desc[vector].handler = &ioapic_edge_type;
5367+ set_intr_gate(vector, interrupt[vector]);
5368+ } else {
5369+ if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
5370+ trigger == IOAPIC_LEVEL)
5371+ irq_desc[irq].handler = &ioapic_level_type;
5372+ else
5373+ irq_desc[irq].handler = &ioapic_edge_type;
5374+ set_intr_gate(vector, interrupt[irq]);
5375+ }
5376+}
5377+#else
5378+#define ioapic_register_intr(_irq,_vector,_trigger) ((void)0)
5379+#endif
5380+
5381+static void __init setup_IO_APIC_irqs(void)
5382+{
5383+ struct IO_APIC_route_entry entry;
5384+ int apic, pin, idx, irq, first_notcon = 1, vector;
5385+ unsigned long flags;
5386+
5387+ apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
5388+
5389+ for (apic = 0; apic < nr_ioapics; apic++) {
5390+ for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
5391+
5392+ /*
5393+ * add it to the IO-APIC irq-routing table:
5394+ */
5395+ memset(&entry,0,sizeof(entry));
5396+
5397+ entry.delivery_mode = INT_DELIVERY_MODE;
5398+ entry.dest_mode = INT_DEST_MODE;
5399+ entry.mask = 0; /* enable IRQ */
5400+ entry.dest.logical.logical_dest =
5401+ cpu_mask_to_apicid(TARGET_CPUS);
5402+
5403+ idx = find_irq_entry(apic,pin,mp_INT);
5404+ if (idx == -1) {
5405+ if (first_notcon) {
5406+ apic_printk(APIC_VERBOSE, KERN_DEBUG
5407+ " IO-APIC (apicid-pin) %d-%d",
5408+ mp_ioapics[apic].mpc_apicid,
5409+ pin);
5410+ first_notcon = 0;
5411+ } else
5412+ apic_printk(APIC_VERBOSE, ", %d-%d",
5413+ mp_ioapics[apic].mpc_apicid, pin);
5414+ continue;
5415+ }
5416+
5417+ entry.trigger = irq_trigger(idx);
5418+ entry.polarity = irq_polarity(idx);
5419+
5420+ if (irq_trigger(idx)) {
5421+ entry.trigger = 1;
5422+ entry.mask = 1;
5423+ }
5424+
5425+ irq = pin_2_irq(idx, apic, pin);
5426+ /*
5427+ * skip adding the timer int on secondary nodes, which causes
5428+ * a small but painful rift in the time-space continuum
5429+ */
5430+ if (multi_timer_check(apic, irq))
5431+ continue;
5432+ else
5433+ add_pin_to_irq(irq, apic, pin);
5434+
5435+ if (/*!apic &&*/ !IO_APIC_IRQ(irq))
5436+ continue;
5437+
5438+ if (IO_APIC_IRQ(irq)) {
5439+ vector = assign_irq_vector(irq);
5440+ entry.vector = vector;
5441+ ioapic_register_intr(irq, vector, IOAPIC_AUTO);
5442+
5443+ if (!apic && (irq < 16))
5444+ disable_8259A_irq(irq);
5445+ }
5446+ spin_lock_irqsave(&ioapic_lock, flags);
5447+ io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1));
5448+ io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0));
5449+ set_native_irq_info(irq, TARGET_CPUS);
5450+ spin_unlock_irqrestore(&ioapic_lock, flags);
5451+ }
5452+ }
5453+
5454+ if (!first_notcon)
5455+ apic_printk(APIC_VERBOSE, " not connected.\n");
5456+}
5457+
5458+/*
5459+ * Set up the 8259A-master output pin:
5460+ */
5461+#ifndef CONFIG_XEN
5462+static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, int vector)
5463+{
5464+ struct IO_APIC_route_entry entry;
5465+ unsigned long flags;
5466+
5467+ memset(&entry,0,sizeof(entry));
5468+
5469+ disable_8259A_irq(0);
5470+
5471+ /* mask LVT0 */
5472+ apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
5473+
5474+ /*
5475+ * We use logical delivery to get the timer IRQ
5476+ * to the first CPU.
5477+ */
5478+ entry.dest_mode = INT_DEST_MODE;
5479+ entry.mask = 0; /* unmask IRQ now */
5480+ entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
5481+ entry.delivery_mode = INT_DELIVERY_MODE;
5482+ entry.polarity = 0;
5483+ entry.trigger = 0;
5484+ entry.vector = vector;
5485+
5486+ /*
5487+ * The timer IRQ doesn't have to know that behind the
5488+ * scene we have a 8259A-master in AEOI mode ...
5489+ */
5490+ irq_desc[0].handler = &ioapic_edge_type;
5491+
5492+ /*
5493+ * Add it to the IO-APIC irq-routing table:
5494+ */
5495+ spin_lock_irqsave(&ioapic_lock, flags);
5496+ io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1));
5497+ io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0));
5498+ spin_unlock_irqrestore(&ioapic_lock, flags);
5499+
5500+ enable_8259A_irq(0);
5501+}
5502+
5503+static inline void UNEXPECTED_IO_APIC(void)
5504+{
5505+}
5506+
5507+void __init print_IO_APIC(void)
5508+{
5509+ int apic, i;
5510+ union IO_APIC_reg_00 reg_00;
5511+ union IO_APIC_reg_01 reg_01;
5512+ union IO_APIC_reg_02 reg_02;
5513+ union IO_APIC_reg_03 reg_03;
5514+ unsigned long flags;
5515+
5516+ if (apic_verbosity == APIC_QUIET)
5517+ return;
5518+
5519+ printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
5520+ for (i = 0; i < nr_ioapics; i++)
5521+ printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
5522+ mp_ioapics[i].mpc_apicid, nr_ioapic_registers[i]);
5523+
5524+ /*
5525+ * We are a bit conservative about what we expect. We have to
5526+ * know about every hardware change ASAP.
5527+ */
5528+ printk(KERN_INFO "testing the IO APIC.......................\n");
5529+
5530+ for (apic = 0; apic < nr_ioapics; apic++) {
5531+
5532+ spin_lock_irqsave(&ioapic_lock, flags);
5533+ reg_00.raw = io_apic_read(apic, 0);
5534+ reg_01.raw = io_apic_read(apic, 1);
5535+ if (reg_01.bits.version >= 0x10)
5536+ reg_02.raw = io_apic_read(apic, 2);
5537+ if (reg_01.bits.version >= 0x20)
5538+ reg_03.raw = io_apic_read(apic, 3);
5539+ spin_unlock_irqrestore(&ioapic_lock, flags);
5540+
5541+ printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid);
5542+ printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
5543+ printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID);
5544+ printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type);
5545+ printk(KERN_DEBUG "....... : LTS : %X\n", reg_00.bits.LTS);
5546+ if (reg_00.bits.ID >= get_physical_broadcast())
5547+ UNEXPECTED_IO_APIC();
5548+ if (reg_00.bits.__reserved_1 || reg_00.bits.__reserved_2)
5549+ UNEXPECTED_IO_APIC();
5550+
5551+ printk(KERN_DEBUG ".... register #01: %08X\n", reg_01.raw);
5552+ printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries);
5553+ if ( (reg_01.bits.entries != 0x0f) && /* older (Neptune) boards */
5554+ (reg_01.bits.entries != 0x17) && /* typical ISA+PCI boards */
5555+ (reg_01.bits.entries != 0x1b) && /* Compaq Proliant boards */
5556+ (reg_01.bits.entries != 0x1f) && /* dual Xeon boards */
5557+ (reg_01.bits.entries != 0x22) && /* bigger Xeon boards */
5558+ (reg_01.bits.entries != 0x2E) &&
5559+ (reg_01.bits.entries != 0x3F)
5560+ )
5561+ UNEXPECTED_IO_APIC();
5562+
5563+ printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ);
5564+ printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version);
5565+ if ( (reg_01.bits.version != 0x01) && /* 82489DX IO-APICs */
5566+ (reg_01.bits.version != 0x10) && /* oldest IO-APICs */
5567+ (reg_01.bits.version != 0x11) && /* Pentium/Pro IO-APICs */
5568+ (reg_01.bits.version != 0x13) && /* Xeon IO-APICs */
5569+ (reg_01.bits.version != 0x20) /* Intel P64H (82806 AA) */
5570+ )
5571+ UNEXPECTED_IO_APIC();
5572+ if (reg_01.bits.__reserved_1 || reg_01.bits.__reserved_2)
5573+ UNEXPECTED_IO_APIC();
5574+
5575+ /*
5576+ * Some Intel chipsets with IO APIC VERSION of 0x1? don't have reg_02,
5577+ * but the value of reg_02 is read as the previous read register
5578+ * value, so ignore it if reg_02 == reg_01.
5579+ */
5580+ if (reg_01.bits.version >= 0x10 && reg_02.raw != reg_01.raw) {
5581+ printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw);
5582+ printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration);
5583+ if (reg_02.bits.__reserved_1 || reg_02.bits.__reserved_2)
5584+ UNEXPECTED_IO_APIC();
5585+ }
5586+
5587+ /*
5588+ * Some Intel chipsets with IO APIC VERSION of 0x2? don't have reg_02
5589+ * or reg_03, but the value of reg_0[23] is read as the previous read
5590+ * register value, so ignore it if reg_03 == reg_0[12].
5591+ */
5592+ if (reg_01.bits.version >= 0x20 && reg_03.raw != reg_02.raw &&
5593+ reg_03.raw != reg_01.raw) {
5594+ printk(KERN_DEBUG ".... register #03: %08X\n", reg_03.raw);
5595+ printk(KERN_DEBUG "....... : Boot DT : %X\n", reg_03.bits.boot_DT);
5596+ if (reg_03.bits.__reserved_1)
5597+ UNEXPECTED_IO_APIC();
5598+ }
5599+
5600+ printk(KERN_DEBUG ".... IRQ redirection table:\n");
5601+
5602+ printk(KERN_DEBUG " NR Log Phy Mask Trig IRR Pol"
5603+ " Stat Dest Deli Vect: \n");
5604+
5605+ for (i = 0; i <= reg_01.bits.entries; i++) {
5606+ struct IO_APIC_route_entry entry;
5607+
5608+ spin_lock_irqsave(&ioapic_lock, flags);
5609+ *(((int *)&entry)+0) = io_apic_read(apic, 0x10+i*2);
5610+ *(((int *)&entry)+1) = io_apic_read(apic, 0x11+i*2);
5611+ spin_unlock_irqrestore(&ioapic_lock, flags);
5612+
5613+ printk(KERN_DEBUG " %02x %03X %02X ",
5614+ i,
5615+ entry.dest.logical.logical_dest,
5616+ entry.dest.physical.physical_dest
5617+ );
5618+
5619+ printk("%1d %1d %1d %1d %1d %1d %1d %02X\n",
5620+ entry.mask,
5621+ entry.trigger,
5622+ entry.irr,
5623+ entry.polarity,
5624+ entry.delivery_status,
5625+ entry.dest_mode,
5626+ entry.delivery_mode,
5627+ entry.vector
5628+ );
5629+ }
5630+ }
5631+ if (use_pci_vector())
5632+ printk(KERN_INFO "Using vector-based indexing\n");
5633+ printk(KERN_DEBUG "IRQ to pin mappings:\n");
5634+ for (i = 0; i < NR_IRQS; i++) {
5635+ struct irq_pin_list *entry = irq_2_pin + i;
5636+ if (entry->pin < 0)
5637+ continue;
5638+ if (use_pci_vector() && !platform_legacy_irq(i))
5639+ printk(KERN_DEBUG "IRQ%d ", IO_APIC_VECTOR(i));
5640+ else
5641+ printk(KERN_DEBUG "IRQ%d ", i);
5642+ for (;;) {
5643+ printk("-> %d:%d", entry->apic, entry->pin);
5644+ if (!entry->next)
5645+ break;
5646+ entry = irq_2_pin + entry->next;
5647+ }
5648+ printk("\n");
5649+ }
5650+
5651+ printk(KERN_INFO ".................................... done.\n");
5652+
5653+ return;
5654+}
5655+
5656+#if 0
5657+
5658+static void print_APIC_bitfield (int base)
5659+{
5660+ unsigned int v;
5661+ int i, j;
5662+
5663+ if (apic_verbosity == APIC_QUIET)
5664+ return;
5665+
5666+ printk(KERN_DEBUG "0123456789abcdef0123456789abcdef\n" KERN_DEBUG);
5667+ for (i = 0; i < 8; i++) {
5668+ v = apic_read(base + i*0x10);
5669+ for (j = 0; j < 32; j++) {
5670+ if (v & (1<<j))
5671+ printk("1");
5672+ else
5673+ printk("0");
5674+ }
5675+ printk("\n");
5676+ }
5677+}
5678+
5679+void /*__init*/ print_local_APIC(void * dummy)
5680+{
5681+ unsigned int v, ver, maxlvt;
5682+
5683+ if (apic_verbosity == APIC_QUIET)
5684+ return;
5685+
5686+ printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
5687+ smp_processor_id(), hard_smp_processor_id());
5688+ v = apic_read(APIC_ID);
5689+ printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, GET_APIC_ID(v));
5690+ v = apic_read(APIC_LVR);
5691+ printk(KERN_INFO "... APIC VERSION: %08x\n", v);
5692+ ver = GET_APIC_VERSION(v);
5693+ maxlvt = get_maxlvt();
5694+
5695+ v = apic_read(APIC_TASKPRI);
5696+ printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK);
5697+
5698+ if (APIC_INTEGRATED(ver)) { /* !82489DX */
5699+ v = apic_read(APIC_ARBPRI);
5700+ printk(KERN_DEBUG "... APIC ARBPRI: %08x (%02x)\n", v,
5701+ v & APIC_ARBPRI_MASK);
5702+ v = apic_read(APIC_PROCPRI);
5703+ printk(KERN_DEBUG "... APIC PROCPRI: %08x\n", v);
5704+ }
5705+
5706+ v = apic_read(APIC_EOI);
5707+ printk(KERN_DEBUG "... APIC EOI: %08x\n", v);
5708+ v = apic_read(APIC_RRR);
5709+ printk(KERN_DEBUG "... APIC RRR: %08x\n", v);
5710+ v = apic_read(APIC_LDR);
5711+ printk(KERN_DEBUG "... APIC LDR: %08x\n", v);
5712+ v = apic_read(APIC_DFR);
5713+ printk(KERN_DEBUG "... APIC DFR: %08x\n", v);
5714+ v = apic_read(APIC_SPIV);
5715+ printk(KERN_DEBUG "... APIC SPIV: %08x\n", v);
5716+
5717+ printk(KERN_DEBUG "... APIC ISR field:\n");
5718+ print_APIC_bitfield(APIC_ISR);
5719+ printk(KERN_DEBUG "... APIC TMR field:\n");
5720+ print_APIC_bitfield(APIC_TMR);
5721+ printk(KERN_DEBUG "... APIC IRR field:\n");
5722+ print_APIC_bitfield(APIC_IRR);
5723+
5724+ if (APIC_INTEGRATED(ver)) { /* !82489DX */
5725+ if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
5726+ apic_write(APIC_ESR, 0);
5727+ v = apic_read(APIC_ESR);
5728+ printk(KERN_DEBUG "... APIC ESR: %08x\n", v);
5729+ }
5730+
5731+ v = apic_read(APIC_ICR);
5732+ printk(KERN_DEBUG "... APIC ICR: %08x\n", v);
5733+ v = apic_read(APIC_ICR2);
5734+ printk(KERN_DEBUG "... APIC ICR2: %08x\n", v);
5735+
5736+ v = apic_read(APIC_LVTT);
5737+ printk(KERN_DEBUG "... APIC LVTT: %08x\n", v);
5738+
5739+ if (maxlvt > 3) { /* PC is LVT#4. */
5740+ v = apic_read(APIC_LVTPC);
5741+ printk(KERN_DEBUG "... APIC LVTPC: %08x\n", v);
5742+ }
5743+ v = apic_read(APIC_LVT0);
5744+ printk(KERN_DEBUG "... APIC LVT0: %08x\n", v);
5745+ v = apic_read(APIC_LVT1);
5746+ printk(KERN_DEBUG "... APIC LVT1: %08x\n", v);
5747+
5748+ if (maxlvt > 2) { /* ERR is LVT#3. */
5749+ v = apic_read(APIC_LVTERR);
5750+ printk(KERN_DEBUG "... APIC LVTERR: %08x\n", v);
5751+ }
5752+
5753+ v = apic_read(APIC_TMICT);
5754+ printk(KERN_DEBUG "... APIC TMICT: %08x\n", v);
5755+ v = apic_read(APIC_TMCCT);
5756+ printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v);
5757+ v = apic_read(APIC_TDCR);
5758+ printk(KERN_DEBUG "... APIC TDCR: %08x\n", v);
5759+ printk("\n");
5760+}
5761+
5762+void print_all_local_APICs (void)
5763+{
5764+ on_each_cpu(print_local_APIC, NULL, 1, 1);
5765+}
5766+
5767+void /*__init*/ print_PIC(void)
5768+{
5769+ unsigned int v;
5770+ unsigned long flags;
5771+
5772+ if (apic_verbosity == APIC_QUIET)
5773+ return;
5774+
5775+ printk(KERN_DEBUG "\nprinting PIC contents\n");
5776+
5777+ spin_lock_irqsave(&i8259A_lock, flags);
5778+
5779+ v = inb(0xa1) << 8 | inb(0x21);
5780+ printk(KERN_DEBUG "... PIC IMR: %04x\n", v);
5781+
5782+ v = inb(0xa0) << 8 | inb(0x20);
5783+ printk(KERN_DEBUG "... PIC IRR: %04x\n", v);
5784+
5785+ outb(0x0b,0xa0);
5786+ outb(0x0b,0x20);
5787+ v = inb(0xa0) << 8 | inb(0x20);
5788+ outb(0x0a,0xa0);
5789+ outb(0x0a,0x20);
5790+
5791+ spin_unlock_irqrestore(&i8259A_lock, flags);
5792+
5793+ printk(KERN_DEBUG "... PIC ISR: %04x\n", v);
5794+
5795+ v = inb(0x4d1) << 8 | inb(0x4d0);
5796+ printk(KERN_DEBUG "... PIC ELCR: %04x\n", v);
5797+}
5798+
5799+#endif /* 0 */
5800+
5801+#else
5802+void __init print_IO_APIC(void) { }
5803+#endif /* !CONFIG_XEN */
5804+
5805+static void __init enable_IO_APIC(void)
5806+{
5807+ union IO_APIC_reg_01 reg_01;
5808+ int i8259_apic, i8259_pin;
5809+ int i, apic;
5810+ unsigned long flags;
5811+
5812+ for (i = 0; i < PIN_MAP_SIZE; i++) {
5813+ irq_2_pin[i].pin = -1;
5814+ irq_2_pin[i].next = 0;
5815+ }
5816+ if (!pirqs_enabled)
5817+ for (i = 0; i < MAX_PIRQS; i++)
5818+ pirq_entries[i] = -1;
5819+
5820+ /*
5821+ * The number of IO-APIC IRQ registers (== #pins):
5822+ */
5823+ for (apic = 0; apic < nr_ioapics; apic++) {
5824+ spin_lock_irqsave(&ioapic_lock, flags);
5825+ reg_01.raw = io_apic_read(apic, 1);
5826+ spin_unlock_irqrestore(&ioapic_lock, flags);
5827+ nr_ioapic_registers[apic] = reg_01.bits.entries+1;
5828+ }
5829+ for(apic = 0; apic < nr_ioapics; apic++) {
5830+ int pin;
5831+ /* See if any of the pins is in ExtINT mode */
5832+ for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
5833+ struct IO_APIC_route_entry entry;
5834+ spin_lock_irqsave(&ioapic_lock, flags);
5835+ *(((int *)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin);
5836+ *(((int *)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin);
5837+ spin_unlock_irqrestore(&ioapic_lock, flags);
5838+
5839+
5840+ /* If the interrupt line is enabled and in ExtInt mode
5841+ * I have found the pin where the i8259 is connected.
5842+ */
5843+ if ((entry.mask == 0) && (entry.delivery_mode == dest_ExtINT)) {
5844+ ioapic_i8259.apic = apic;
5845+ ioapic_i8259.pin = pin;
5846+ goto found_i8259;
5847+ }
5848+ }
5849+ }
5850+ found_i8259:
5851+ /* Look to see if the MP table has reported the ExtINT */
5852+ /* If we could not find the appropriate pin by looking at the ioapic,
5853+ * the i8259 probably is not connected to the ioapic, but give the
5854+ * mptable a chance anyway.
5855+ */
5856+ i8259_pin = find_isa_irq_pin(0, mp_ExtINT);
5857+ i8259_apic = find_isa_irq_apic(0, mp_ExtINT);
5858+ /* Trust the MP table if nothing is setup in the hardware */
5859+ if ((ioapic_i8259.pin == -1) && (i8259_pin >= 0)) {
5860+ printk(KERN_WARNING "ExtINT not setup in hardware but reported by MP table\n");
5861+ ioapic_i8259.pin = i8259_pin;
5862+ ioapic_i8259.apic = i8259_apic;
5863+ }
5864+ /* Complain if the MP table and the hardware disagree */
5865+ if (((ioapic_i8259.apic != i8259_apic) || (ioapic_i8259.pin != i8259_pin)) &&
5866+ (i8259_pin >= 0) && (ioapic_i8259.pin >= 0))
5867+ {
5868+ printk(KERN_WARNING "ExtINT in hardware and MP table differ\n");
5869+ }
5870+
5871+ /*
5872+ * Do not trust the IO-APIC being empty at bootup
5873+ */
5874+ clear_IO_APIC();
5875+}
5876+
5877+/*
5878+ * Not an __init, needed by the reboot code
5879+ */
5880+void disable_IO_APIC(void)
5881+{
5882+ /*
5883+ * Clear the IO-APIC before rebooting:
5884+ */
5885+ clear_IO_APIC();
5886+
5887+#ifndef CONFIG_XEN
5888+ /*
5889+ * If the i8259 is routed through an IOAPIC,
5890+ * put that IOAPIC in virtual wire mode
5891+ * so legacy interrupts can be delivered.
5892+ */
5893+ if (ioapic_i8259.pin != -1) {
5894+ struct IO_APIC_route_entry entry;
5895+ unsigned long flags;
5896+
5897+ memset(&entry, 0, sizeof(entry));
5898+ entry.mask = 0; /* Enabled */
5899+ entry.trigger = 0; /* Edge */
5900+ entry.irr = 0;
5901+ entry.polarity = 0; /* High */
5902+ entry.delivery_status = 0;
5903+ entry.dest_mode = 0; /* Physical */
5904+ entry.delivery_mode = dest_ExtINT; /* ExtInt */
5905+ entry.vector = 0;
5906+ entry.dest.physical.physical_dest =
5907+ GET_APIC_ID(apic_read(APIC_ID));
5908+
5909+ /*
5910+ * Add it to the IO-APIC irq-routing table:
5911+ */
5912+ spin_lock_irqsave(&ioapic_lock, flags);
5913+ io_apic_write(ioapic_i8259.apic, 0x11+2*ioapic_i8259.pin,
5914+ *(((int *)&entry)+1));
5915+ io_apic_write(ioapic_i8259.apic, 0x10+2*ioapic_i8259.pin,
5916+ *(((int *)&entry)+0));
5917+ spin_unlock_irqrestore(&ioapic_lock, flags);
5918+ }
5919+ disconnect_bsp_APIC(ioapic_i8259.pin != -1);
5920+#endif
5921+}
5922+
5923+/*
5924+ * function to set the IO-APIC physical IDs based on the
5925+ * values stored in the MPC table.
5926+ *
5927+ * by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999
5928+ */
5929+
5930+#if !defined(CONFIG_XEN) && !defined(CONFIG_X86_NUMAQ)
5931+static void __init setup_ioapic_ids_from_mpc(void)
5932+{
5933+ union IO_APIC_reg_00 reg_00;
5934+ physid_mask_t phys_id_present_map;
5935+ int apic;
5936+ int i;
5937+ unsigned char old_id;
5938+ unsigned long flags;
5939+
5940+ /*
5941+ * Don't check I/O APIC IDs for xAPIC systems. They have
5942+ * no meaning without the serial APIC bus.
5943+ */
5944+ if (!(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && boot_cpu_data.x86 < 15))
5945+ return;
5946+ /*
5947+ * This is broken; anything with a real cpu count has to
5948+ * circumvent this idiocy regardless.
5949+ */
5950+ phys_id_present_map = ioapic_phys_id_map(phys_cpu_present_map);
5951+
5952+ /*
5953+ * Set the IOAPIC ID to the value stored in the MPC table.
5954+ */
5955+ for (apic = 0; apic < nr_ioapics; apic++) {
5956+
5957+ /* Read the register 0 value */
5958+ spin_lock_irqsave(&ioapic_lock, flags);
5959+ reg_00.raw = io_apic_read(apic, 0);
5960+ spin_unlock_irqrestore(&ioapic_lock, flags);
5961+
5962+ old_id = mp_ioapics[apic].mpc_apicid;
5963+
5964+ if (mp_ioapics[apic].mpc_apicid >= get_physical_broadcast()) {
5965+ printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n",
5966+ apic, mp_ioapics[apic].mpc_apicid);
5967+ printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
5968+ reg_00.bits.ID);
5969+ mp_ioapics[apic].mpc_apicid = reg_00.bits.ID;
5970+ }
5971+
5972+ /*
5973+ * Sanity check, is the ID really free? Every APIC in a
5974+ * system must have a unique ID or we get lots of nice
5975+ * 'stuck on smp_invalidate_needed IPI wait' messages.
5976+ */
5977+ if (check_apicid_used(phys_id_present_map,
5978+ mp_ioapics[apic].mpc_apicid)) {
5979+ printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n",
5980+ apic, mp_ioapics[apic].mpc_apicid);
5981+ for (i = 0; i < get_physical_broadcast(); i++)
5982+ if (!physid_isset(i, phys_id_present_map))
5983+ break;
5984+ if (i >= get_physical_broadcast())
5985+ panic("Max APIC ID exceeded!\n");
5986+ printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
5987+ i);
5988+ physid_set(i, phys_id_present_map);
5989+ mp_ioapics[apic].mpc_apicid = i;
5990+ } else {
5991+ physid_mask_t tmp;
5992+ tmp = apicid_to_cpu_present(mp_ioapics[apic].mpc_apicid);
5993+ apic_printk(APIC_VERBOSE, "Setting %d in the "
5994+ "phys_id_present_map\n",
5995+ mp_ioapics[apic].mpc_apicid);
5996+ physids_or(phys_id_present_map, phys_id_present_map, tmp);
5997+ }
5998+
5999+
6000+ /*
6001+ * We need to adjust the IRQ routing table
6002+ * if the ID changed.
6003+ */
6004+ if (old_id != mp_ioapics[apic].mpc_apicid)
6005+ for (i = 0; i < mp_irq_entries; i++)
6006+ if (mp_irqs[i].mpc_dstapic == old_id)
6007+ mp_irqs[i].mpc_dstapic
6008+ = mp_ioapics[apic].mpc_apicid;
6009+
6010+ /*
6011+ * Read the right value from the MPC table and
6012+ * write it into the ID register.
6013+ */
6014+ apic_printk(APIC_VERBOSE, KERN_INFO
6015+ "...changing IO-APIC physical APIC ID to %d ...",
6016+ mp_ioapics[apic].mpc_apicid);
6017+
6018+ reg_00.bits.ID = mp_ioapics[apic].mpc_apicid;
6019+ spin_lock_irqsave(&ioapic_lock, flags);
6020+ io_apic_write(apic, 0, reg_00.raw);
6021+ spin_unlock_irqrestore(&ioapic_lock, flags);
6022+
6023+ /*
6024+ * Sanity check
6025+ */
6026+ spin_lock_irqsave(&ioapic_lock, flags);
6027+ reg_00.raw = io_apic_read(apic, 0);
6028+ spin_unlock_irqrestore(&ioapic_lock, flags);
6029+ if (reg_00.bits.ID != mp_ioapics[apic].mpc_apicid)
6030+ printk("could not set ID!\n");
6031+ else
6032+ apic_printk(APIC_VERBOSE, " ok.\n");
6033+ }
6034+}
6035+#else
6036+static void __init setup_ioapic_ids_from_mpc(void) { }
6037+#endif
6038+
6039+#ifndef CONFIG_XEN
6040+/*
6041+ * There is a nasty bug in some older SMP boards: their mptable lies
6042+ * about the timer IRQ. We do the following to work around the situation:
6043+ *
6044+ * - timer IRQ defaults to IO-APIC IRQ
6045+ * - if this function detects that timer IRQs are defunct, then we fall
6046+ * back to ISA timer IRQs
6047+ */
6048+static int __init timer_irq_works(void)
6049+{
6050+ unsigned long t1 = jiffies;
6051+
6052+ local_irq_enable();
6053+ /* Let ten ticks pass... */
6054+ mdelay((10 * 1000) / HZ);
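+ /* With HZ == 250, for instance, the mdelay above waits 40 ms, i.e. about ten ticks. */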
6055+
6056+ /*
6057+ * Expect a few ticks at least, to be sure some possible
6058+ * glue logic does not lock up after the first one or two
6059+ * ticks in a non-ExtINT mode. Also the local APIC
6060+ * might have cached one ExtINT interrupt. Finally, at
6061+ * least one tick may be lost due to delays.
6062+ */
6063+ if (jiffies - t1 > 4)
6064+ return 1;
6065+
6066+ return 0;
6067+}
6068+
6069+/*
6070+ * In the SMP+IOAPIC case it might happen that an unspecified
6071+ * number of pending IRQ events is left unhandled. These cases are very rare,
6072+ * so we 'resend' these IRQs via IPIs to the same CPU. It's much
6073+ * better to do it this way, since then we do not have to be aware of
6074+ * 'pending' interrupts in the IRQ path, except at this point.
6075+ */
6076+/*
6077+ * Edge-triggered interrupts need to resend any interrupt
6078+ * that was delayed, but this is now handled in the
6079+ * device-independent code.
6080+ */
6081+
6082+/*
6083+ * Starting up an edge-triggered IO-APIC interrupt is
6084+ * nasty - we need to make sure that we get the edge.
6085+ * If it is already asserted for some reason, we need to
6086+ * return 1 to indicate that it was pending.
6087+ *
6088+ * This is not complete - we should be able to fake
6089+ * an edge even if it isn't on the 8259A...
6090+ */
6091+static unsigned int startup_edge_ioapic_irq(unsigned int irq)
6092+{
6093+ int was_pending = 0;
6094+ unsigned long flags;
6095+
6096+ spin_lock_irqsave(&ioapic_lock, flags);
6097+ if (irq < 16) {
6098+ disable_8259A_irq(irq);
6099+ if (i8259A_irq_pending(irq))
6100+ was_pending = 1;
6101+ }
6102+ __unmask_IO_APIC_irq(irq);
6103+ spin_unlock_irqrestore(&ioapic_lock, flags);
6104+
6105+ return was_pending;
6106+}
6107+
6108+/*
6109+ * Once we have recorded IRQ_PENDING already, we can mask the
6110+ * interrupt for real. This prevents IRQ storms from unhandled
6111+ * devices.
6112+ */
6113+static void ack_edge_ioapic_irq(unsigned int irq)
6114+{
6115+ move_irq(irq);
6116+ if ((irq_desc[irq].status & (IRQ_PENDING | IRQ_DISABLED))
6117+ == (IRQ_PENDING | IRQ_DISABLED))
6118+ mask_IO_APIC_irq(irq);
6119+ ack_APIC_irq();
6120+}
6121+
6122+/*
6123+ * Level triggered interrupts can just be masked,
6124+ * and shutting down and starting up the interrupt
6125+ * is the same as enabling and disabling them -- except
6126+ * that startup needs to return a "was pending" value.
6127+ *
6128+ * Level triggered interrupts are special because we
6129+ * do not touch any IO-APIC register while handling
6130+ * them. We ack the APIC in the end-IRQ handler, not
6131+ * in the start-IRQ-handler. Protection against reentrance
6132+ * from the same interrupt is still provided, both by the
6133+ * generic IRQ layer and by the fact that an unacked local
6134+ * APIC does not accept IRQs.
6135+ */
6136+static unsigned int startup_level_ioapic_irq (unsigned int irq)
6137+{
6138+ unmask_IO_APIC_irq(irq);
6139+
6140+ return 0; /* don't check for pending */
6141+}
6142+
6143+static void end_level_ioapic_irq (unsigned int irq)
6144+{
6145+ unsigned long v;
6146+ int i;
6147+
6148+ move_irq(irq);
6149+/*
6150+ * It appears there is an erratum which affects at least version 0x11
6151+ * of I/O APIC (that's the 82093AA and cores integrated into various
6152+ * chipsets). Under certain conditions a level-triggered interrupt is
6153+ * erroneously delivered as an edge-triggered one, but the respective IRR
6154+ * bit gets set nevertheless. As a result the I/O unit expects an EOI
6155+ * message but it will never arrive and further interrupts are blocked
6156+ * from the source. The exact reason is so far unknown, but the
6157+ * phenomenon was observed when two consecutive interrupt requests
6158+ * from a given source get delivered to the same CPU and the source is
6159+ * temporarily disabled in between.
6160+ *
6161+ * A workaround is to simulate an EOI message manually. We achieve it
6162+ * by setting the trigger mode to edge and then to level when the edge
6163+ * trigger mode gets detected in the TMR of a local APIC for a
6164+ * level-triggered interrupt. We mask the source for the time of the
6165+ * operation to prevent an edge-triggered interrupt escaping meanwhile.
6166+ * The idea is from Manfred Spraul. --macro
6167+ */
6168+ i = IO_APIC_VECTOR(irq);
6169+
6170+ v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1));
6171+
6172+ ack_APIC_irq();
6173+
6174+ if (!(v & (1 << (i & 0x1f)))) {
6175+ atomic_inc(&irq_mis_count);
6176+ spin_lock(&ioapic_lock);
6177+ __mask_and_edge_IO_APIC_irq(irq);
6178+ __unmask_and_level_IO_APIC_irq(irq);
6179+ spin_unlock(&ioapic_lock);
6180+ }
6181+}
6182+
6183+#ifdef CONFIG_PCI_MSI
6184+static unsigned int startup_edge_ioapic_vector(unsigned int vector)
6185+{
6186+ int irq = vector_to_irq(vector);
6187+
6188+ return startup_edge_ioapic_irq(irq);
6189+}
6190+
6191+static void ack_edge_ioapic_vector(unsigned int vector)
6192+{
6193+ int irq = vector_to_irq(vector);
6194+
6195+ move_native_irq(vector);
6196+ ack_edge_ioapic_irq(irq);
6197+}
6198+
6199+static unsigned int startup_level_ioapic_vector (unsigned int vector)
6200+{
6201+ int irq = vector_to_irq(vector);
6202+
6203+ return startup_level_ioapic_irq (irq);
6204+}
6205+
6206+static void end_level_ioapic_vector (unsigned int vector)
6207+{
6208+ int irq = vector_to_irq(vector);
6209+
6210+ move_native_irq(vector);
6211+ end_level_ioapic_irq(irq);
6212+}
6213+
6214+static void mask_IO_APIC_vector (unsigned int vector)
6215+{
6216+ int irq = vector_to_irq(vector);
6217+
6218+ mask_IO_APIC_irq(irq);
6219+}
6220+
6221+static void unmask_IO_APIC_vector (unsigned int vector)
6222+{
6223+ int irq = vector_to_irq(vector);
6224+
6225+ unmask_IO_APIC_irq(irq);
6226+}
6227+
6228+#ifdef CONFIG_SMP
6229+static void set_ioapic_affinity_vector (unsigned int vector,
6230+ cpumask_t cpu_mask)
6231+{
6232+ int irq = vector_to_irq(vector);
6233+
6234+ set_native_irq_info(vector, cpu_mask);
6235+ set_ioapic_affinity_irq(irq, cpu_mask);
6236+}
6237+#endif
6238+#endif
6239+
6240+/*
6241+ * Level and edge triggered IO-APIC interrupts need different handling,
6242+ * so we use two separate IRQ descriptors. Edge triggered IRQs can be
6243+ * handled with the level-triggered descriptor, but that one has slightly
6244+ * more overhead. Level-triggered interrupts cannot be handled with the
6245+ * edge-triggered handler, without risking IRQ storms and other ugly
6246+ * races.
6247+ */
6248+static struct hw_interrupt_type ioapic_edge_type __read_mostly = {
6249+ .typename = "IO-APIC-edge",
6250+ .startup = startup_edge_ioapic,
6251+ .shutdown = shutdown_edge_ioapic,
6252+ .enable = enable_edge_ioapic,
6253+ .disable = disable_edge_ioapic,
6254+ .ack = ack_edge_ioapic,
6255+ .end = end_edge_ioapic,
6256+#ifdef CONFIG_SMP
6257+ .set_affinity = set_ioapic_affinity,
6258+#endif
6259+};
6260+
6261+static struct hw_interrupt_type ioapic_level_type __read_mostly = {
6262+ .typename = "IO-APIC-level",
6263+ .startup = startup_level_ioapic,
6264+ .shutdown = shutdown_level_ioapic,
6265+ .enable = enable_level_ioapic,
6266+ .disable = disable_level_ioapic,
6267+ .ack = mask_and_ack_level_ioapic,
6268+ .end = end_level_ioapic,
6269+#ifdef CONFIG_SMP
6270+ .set_affinity = set_ioapic_affinity,
6271+#endif
6272+};
6273+#endif /* !CONFIG_XEN */
6274+
6275+static inline void init_IO_APIC_traps(void)
6276+{
6277+ int irq;
6278+
6279+ /*
6280+ * NOTE! The local APIC isn't very good at handling
6281+ * multiple interrupts at the same interrupt level.
6282+ * As the interrupt level is determined by taking the
6283+ * vector number and shifting that right by 4, we
6284+ * want to spread these out a bit so that they don't
6285+ * all fall in the same interrupt level.
6286+ *
6287+ * Also, we've got to be careful not to trash gate
6288+ * 0x80, because int 0x80 is hm, kind of importantish. ;)
6289+ */
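+ /*
+ * For example: vector 0x31 is handled at level 0x31 >> 4 == 3, while
+ * vectors 0x80-0x8f all share level 8 -- which is why gate 0x80 (the
+ * int 0x80 system-call gate) must be left alone here.
+ */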
6290+ for (irq = 0; irq < NR_IRQS ; irq++) {
6291+ int tmp = irq;
6292+ if (use_pci_vector()) {
6293+ if (!platform_legacy_irq(tmp))
6294+ if ((tmp = vector_to_irq(tmp)) == -1)
6295+ continue;
6296+ }
6297+ if (IO_APIC_IRQ(tmp) && !IO_APIC_VECTOR(tmp)) {
6298+ /*
6299+ * Hmm.. We don't have an entry for this,
6300+ * so default to an old-fashioned 8259
6301+ * interrupt if we can..
6302+ */
6303+ if (irq < 16)
6304+ make_8259A_irq(irq);
6305+#ifndef CONFIG_XEN
6306+ else
6307+ /* Strange. Oh, well.. */
6308+ irq_desc[irq].handler = &no_irq_type;
6309+#endif
6310+ }
6311+ }
6312+}
6313+
6314+#ifndef CONFIG_XEN
6315+static void enable_lapic_irq (unsigned int irq)
6316+{
6317+ unsigned long v;
6318+
6319+ v = apic_read(APIC_LVT0);
6320+ apic_write_around(APIC_LVT0, v & ~APIC_LVT_MASKED);
6321+}
6322+
6323+static void disable_lapic_irq (unsigned int irq)
6324+{
6325+ unsigned long v;
6326+
6327+ v = apic_read(APIC_LVT0);
6328+ apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED);
6329+}
6330+
6331+static void ack_lapic_irq (unsigned int irq)
6332+{
6333+ ack_APIC_irq();
6334+}
6335+
6336+static void end_lapic_irq (unsigned int i) { /* nothing */ }
6337+
6338+static struct hw_interrupt_type lapic_irq_type __read_mostly = {
6339+ .typename = "local-APIC-edge",
6340+ .startup = NULL, /* startup_irq() not used for IRQ0 */
6341+ .shutdown = NULL, /* shutdown_irq() not used for IRQ0 */
6342+ .enable = enable_lapic_irq,
6343+ .disable = disable_lapic_irq,
6344+ .ack = ack_lapic_irq,
6345+ .end = end_lapic_irq
6346+};
6347+
6348+static void setup_nmi (void)
6349+{
6350+ /*
6351+ * Dirty trick to enable the NMI watchdog ...
6352+ * We put the 8259A master into AEOI mode and
6353+ * unmask LVT0 on all local APICs as NMI.
6354+ *
6355+ * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire')
6356+ * is from Maciej W. Rozycki - so we do not have to EOI from
6357+ * the NMI handler or the timer interrupt.
6358+ */
6359+ apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ...");
6360+
6361+ on_each_cpu(enable_NMI_through_LVT0, NULL, 1, 1);
6362+
6363+ apic_printk(APIC_VERBOSE, " done.\n");
6364+}
6365+
6366+/*
6367+ * This looks a bit hackish, but it's about the only way of sending
6368+ * a few INTA cycles to 8259As and any associated glue logic. ICR does
6369+ * not support the ExtINT mode, unfortunately. We need to send these
6370+ * cycles as some i82489DX-based boards have glue logic that keeps the
6371+ * 8259A interrupt line asserted until INTA. --macro
6372+ */
6373+static inline void unlock_ExtINT_logic(void)
6374+{
6375+ int apic, pin, i;
6376+ struct IO_APIC_route_entry entry0, entry1;
6377+ unsigned char save_control, save_freq_select;
6378+ unsigned long flags;
6379+
6380+ pin = find_isa_irq_pin(8, mp_INT);
6381+ apic = find_isa_irq_apic(8, mp_INT);
6382+ if (pin == -1)
6383+ return;
6384+
6385+ spin_lock_irqsave(&ioapic_lock, flags);
6386+ *(((int *)&entry0) + 1) = io_apic_read(apic, 0x11 + 2 * pin);
6387+ *(((int *)&entry0) + 0) = io_apic_read(apic, 0x10 + 2 * pin);
6388+ spin_unlock_irqrestore(&ioapic_lock, flags);
6389+ clear_IO_APIC_pin(apic, pin);
6390+
6391+ memset(&entry1, 0, sizeof(entry1));
6392+
6393+ entry1.dest_mode = 0; /* physical delivery */
6394+ entry1.mask = 0; /* unmask IRQ now */
6395+ entry1.dest.physical.physical_dest = hard_smp_processor_id();
6396+ entry1.delivery_mode = dest_ExtINT;
6397+ entry1.polarity = entry0.polarity;
6398+ entry1.trigger = 0;
6399+ entry1.vector = 0;
6400+
6401+ spin_lock_irqsave(&ioapic_lock, flags);
6402+ io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry1) + 1));
6403+ io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry1) + 0));
6404+ spin_unlock_irqrestore(&ioapic_lock, flags);
6405+
6406+ save_control = CMOS_READ(RTC_CONTROL);
6407+ save_freq_select = CMOS_READ(RTC_FREQ_SELECT);
6408+ CMOS_WRITE((save_freq_select & ~RTC_RATE_SELECT) | 0x6,
6409+ RTC_FREQ_SELECT);
6410+ CMOS_WRITE(save_control | RTC_PIE, RTC_CONTROL);
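+ /*
+ * Assuming standard MC146818 behaviour: rate select 0x6 above gives a
+ * 1024 Hz periodic rate, and RTC_PIE enables the periodic interrupt.
+ */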
6411+
6412+ i = 100;
6413+ while (i-- > 0) {
6414+ mdelay(10);
6415+ if ((CMOS_READ(RTC_INTR_FLAGS) & RTC_PF) == RTC_PF)
6416+ i -= 10;
6417+ }
6418+
6419+ CMOS_WRITE(save_control, RTC_CONTROL);
6420+ CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT);
6421+ clear_IO_APIC_pin(apic, pin);
6422+
6423+ spin_lock_irqsave(&ioapic_lock, flags);
6424+ io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry0) + 1));
6425+ io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry0) + 0));
6426+ spin_unlock_irqrestore(&ioapic_lock, flags);
6427+}
6428+
6429+/*
6430+ * This code may look a bit paranoid, but it's supposed to cooperate with
6431+ * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ
6432+ * is so screwy. Thanks to Brian Perkins for testing/hacking this beast
6433+ * fanatically on his truly buggy board.
6434+ */
6435+static inline void check_timer(void)
6436+{
6437+ int apic1, pin1, apic2, pin2;
6438+ int vector;
6439+
6440+ /*
6441+ * get/set the timer IRQ vector:
6442+ */
6443+ disable_8259A_irq(0);
6444+ vector = assign_irq_vector(0);
6445+ set_intr_gate(vector, interrupt[0]);
6446+
6447+ /*
6448+ * Subtle: code in do_timer_interrupt() expects an AEOI
6449+ * mode for the 8259A whenever interrupts are routed
6450+ * through I/O APICs. Also IRQ0 has to be enabled in
6451+ * the 8259A which implies the virtual wire has to be
6452+ * disabled in the local APIC.
6453+ */
6454+ apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
6455+ init_8259A(1);
6456+ timer_ack = 1;
6457+ if (timer_over_8254 > 0)
6458+ enable_8259A_irq(0);
6459+
6460+ pin1 = find_isa_irq_pin(0, mp_INT);
6461+ apic1 = find_isa_irq_apic(0, mp_INT);
6462+ pin2 = ioapic_i8259.pin;
6463+ apic2 = ioapic_i8259.apic;
6464+
6465+ printk(KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n",
6466+ vector, apic1, pin1, apic2, pin2);
6467+
6468+ if (pin1 != -1) {
6469+ /*
6470+ * Ok, does IRQ0 through the IOAPIC work?
6471+ */
6472+ unmask_IO_APIC_irq(0);
6473+ if (timer_irq_works()) {
6474+ if (nmi_watchdog == NMI_IO_APIC) {
6475+ disable_8259A_irq(0);
6476+ setup_nmi();
6477+ enable_8259A_irq(0);
6478+ }
6479+ if (disable_timer_pin_1 > 0)
6480+ clear_IO_APIC_pin(0, pin1);
6481+ return;
6482+ }
6483+ clear_IO_APIC_pin(apic1, pin1);
6484+ printk(KERN_ERR "..MP-BIOS bug: 8254 timer not connected to "
6485+ "IO-APIC\n");
6486+ }
6487+
6488+ printk(KERN_INFO "...trying to set up timer (IRQ0) through the 8259A ... ");
6489+ if (pin2 != -1) {
6490+ printk("\n..... (found pin %d) ...", pin2);
6491+ /*
6492+ * legacy devices should be connected to IO APIC #0
6493+ */
6494+ setup_ExtINT_IRQ0_pin(apic2, pin2, vector);
6495+ if (timer_irq_works()) {
6496+ printk("works.\n");
6497+ if (pin1 != -1)
6498+ replace_pin_at_irq(0, apic1, pin1, apic2, pin2);
6499+ else
6500+ add_pin_to_irq(0, apic2, pin2);
6501+ if (nmi_watchdog == NMI_IO_APIC) {
6502+ setup_nmi();
6503+ }
6504+ return;
6505+ }
6506+ /*
6507+ * Cleanup, just in case ...
6508+ */
6509+ clear_IO_APIC_pin(apic2, pin2);
6510+ }
6511+ printk(" failed.\n");
6512+
6513+ if (nmi_watchdog == NMI_IO_APIC) {
6514+ printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n");
6515+ nmi_watchdog = 0;
6516+ }
6517+
6518+ printk(KERN_INFO "...trying to set up timer as Virtual Wire IRQ...");
6519+
6520+ disable_8259A_irq(0);
6521+ irq_desc[0].handler = &lapic_irq_type;
6522+ apic_write_around(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */
6523+ enable_8259A_irq(0);
6524+
6525+ if (timer_irq_works()) {
6526+ printk(" works.\n");
6527+ return;
6528+ }
6529+ apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector);
6530+ printk(" failed.\n");
6531+
6532+ printk(KERN_INFO "...trying to set up timer as ExtINT IRQ...");
6533+
6534+ timer_ack = 0;
6535+ init_8259A(0);
6536+ make_8259A_irq(0);
6537+ apic_write_around(APIC_LVT0, APIC_DM_EXTINT);
6538+
6539+ unlock_ExtINT_logic();
6540+
6541+ if (timer_irq_works()) {
6542+ printk(" works.\n");
6543+ return;
6544+ }
6545+ printk(" failed :(.\n");
6546+ panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a "
6547+ "report. Then try booting with the 'noapic' option");
6548+}
6549+#else
6550+#define check_timer() ((void)0)
6551+#endif
6552+
6553+/*
6554+ *
6555+ * IRQs that are handled by the PIC in the MPS IOAPIC case.
6556+ * - IRQ2 is the cascade IRQ, and cannot be an io-apic IRQ.
6557+ * Linux doesn't really care, as it's not actually used
6558+ * for any interrupt handling anyway.
6559+ */
6560+#define PIC_IRQS (1 << PIC_CASCADE_IR)
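+/* With PIC_CASCADE_IR == 2 this evaluates to 0x4, i.e. only IRQ2 is left to the PIC. */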
6561+
6562+void __init setup_IO_APIC(void)
6563+{
6564+ enable_IO_APIC();
6565+
6566+ if (acpi_ioapic)
6567+ io_apic_irqs = ~0; /* all IRQs go through IOAPIC */
6568+ else
6569+ io_apic_irqs = ~PIC_IRQS;
6570+
6571+ printk("ENABLING IO-APIC IRQs\n");
6572+
6573+ /*
6574+ * Set up IO-APIC IRQ routing.
6575+ */
6576+ if (!acpi_ioapic)
6577+ setup_ioapic_ids_from_mpc();
6578+#ifndef CONFIG_XEN
6579+ sync_Arb_IDs();
6580+#endif
6581+ setup_IO_APIC_irqs();
6582+ init_IO_APIC_traps();
6583+ check_timer();
6584+ if (!acpi_ioapic)
6585+ print_IO_APIC();
6586+}
6587+
6588+static int __init setup_disable_8254_timer(char *s)
6589+{
6590+ timer_over_8254 = -1;
6591+ return 1;
6592+}
6593+static int __init setup_enable_8254_timer(char *s)
6594+{
6595+ timer_over_8254 = 2;
6596+ return 1;
6597+}
6598+
6599+__setup("disable_8254_timer", setup_disable_8254_timer);
6600+__setup("enable_8254_timer", setup_enable_8254_timer);
6601+
6602+/*
6603+ * Called after all the initialization is done. If we didn't find any
6604+ * APIC bugs, then we can allow the modify fast path.
6605+ */
6606+
6607+static int __init io_apic_bug_finalize(void)
6608+{
6609+ if(sis_apic_bug == -1)
6610+ sis_apic_bug = 0;
6611+ if (is_initial_xendomain()) {
6612+ dom0_op_t op = { .cmd = DOM0_PLATFORM_QUIRK };
6613+ op.u.platform_quirk.quirk_id = sis_apic_bug ?
6614+ QUIRK_IOAPIC_BAD_REGSEL : QUIRK_IOAPIC_GOOD_REGSEL;
6615+ HYPERVISOR_dom0_op(&op);
6616+ }
6617+ return 0;
6618+}
6619+
6620+late_initcall(io_apic_bug_finalize);
6621+
6622+struct sysfs_ioapic_data {
6623+ struct sys_device dev;
6624+ struct IO_APIC_route_entry entry[0];
6625+};
6626+static struct sysfs_ioapic_data * mp_ioapic_data[MAX_IO_APICS];
6627+
6628+static int ioapic_suspend(struct sys_device *dev, pm_message_t state)
6629+{
6630+ struct IO_APIC_route_entry *entry;
6631+ struct sysfs_ioapic_data *data;
6632+ unsigned long flags;
6633+ int i;
6634+
6635+ data = container_of(dev, struct sysfs_ioapic_data, dev);
6636+ entry = data->entry;
6637+ spin_lock_irqsave(&ioapic_lock, flags);
6638+ for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) {
6639+ *(((int *)entry) + 1) = io_apic_read(dev->id, 0x11 + 2 * i);
6640+ *(((int *)entry) + 0) = io_apic_read(dev->id, 0x10 + 2 * i);
6641+ }
6642+ spin_unlock_irqrestore(&ioapic_lock, flags);
6643+
6644+ return 0;
6645+}
6646+
6647+static int ioapic_resume(struct sys_device *dev)
6648+{
6649+ struct IO_APIC_route_entry *entry;
6650+ struct sysfs_ioapic_data *data;
6651+ unsigned long flags;
6652+ union IO_APIC_reg_00 reg_00;
6653+ int i;
6654+
6655+ data = container_of(dev, struct sysfs_ioapic_data, dev);
6656+ entry = data->entry;
6657+
6658+ spin_lock_irqsave(&ioapic_lock, flags);
6659+ reg_00.raw = io_apic_read(dev->id, 0);
6660+ if (reg_00.bits.ID != mp_ioapics[dev->id].mpc_apicid) {
6661+ reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid;
6662+ io_apic_write(dev->id, 0, reg_00.raw);
6663+ }
6664+ for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) {
6665+ io_apic_write(dev->id, 0x11+2*i, *(((int *)entry)+1));
6666+ io_apic_write(dev->id, 0x10+2*i, *(((int *)entry)+0));
6667+ }
6668+ spin_unlock_irqrestore(&ioapic_lock, flags);
6669+
6670+ return 0;
6671+}
6672+
6673+static struct sysdev_class ioapic_sysdev_class = {
6674+ set_kset_name("ioapic"),
6675+ .suspend = ioapic_suspend,
6676+ .resume = ioapic_resume,
6677+};
6678+
6679+static int __init ioapic_init_sysfs(void)
6680+{
6681+ struct sys_device * dev;
6682+ int i, size, error = 0;
6683+
6684+ error = sysdev_class_register(&ioapic_sysdev_class);
6685+ if (error)
6686+ return error;
6687+
6688+ for (i = 0; i < nr_ioapics; i++ ) {
6689+ size = sizeof(struct sys_device) + nr_ioapic_registers[i]
6690+ * sizeof(struct IO_APIC_route_entry);
6691+ mp_ioapic_data[i] = kmalloc(size, GFP_KERNEL);
6692+ if (!mp_ioapic_data[i]) {
6693+ printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
6694+ continue;
6695+ }
6696+ memset(mp_ioapic_data[i], 0, size);
6697+ dev = &mp_ioapic_data[i]->dev;
6698+ dev->id = i;
6699+ dev->cls = &ioapic_sysdev_class;
6700+ error = sysdev_register(dev);
6701+ if (error) {
6702+ kfree(mp_ioapic_data[i]);
6703+ mp_ioapic_data[i] = NULL;
6704+ printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
6705+ continue;
6706+ }
6707+ }
6708+
6709+ return 0;
6710+}
6711+
6712+device_initcall(ioapic_init_sysfs);
6713+
6714+/* --------------------------------------------------------------------------
6715+ ACPI-based IOAPIC Configuration
6716+ -------------------------------------------------------------------------- */
6717+
6718+#ifdef CONFIG_ACPI
6719+
6720+int __init io_apic_get_unique_id (int ioapic, int apic_id)
6721+{
6722+#ifndef CONFIG_XEN
6723+ union IO_APIC_reg_00 reg_00;
6724+ static physid_mask_t apic_id_map = PHYSID_MASK_NONE;
6725+ physid_mask_t tmp;
6726+ unsigned long flags;
6727+ int i = 0;
6728+
6729+ /*
6730+ * The P4 platform supports up to 256 APIC IDs on two separate APIC
6731+ * buses (one for LAPICs, one for IOAPICs), where predecessors only
6732+ * supports up to 16 on one shared APIC bus.
6733+ * support up to 16 on one shared APIC bus.
6734+ * TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full
6735+ * advantage of new APIC bus architecture.
6736+ */
6737+
6738+ if (physids_empty(apic_id_map))
6739+ apic_id_map = ioapic_phys_id_map(phys_cpu_present_map);
6740+
6741+ spin_lock_irqsave(&ioapic_lock, flags);
6742+ reg_00.raw = io_apic_read(ioapic, 0);
6743+ spin_unlock_irqrestore(&ioapic_lock, flags);
6744+
6745+ if (apic_id >= get_physical_broadcast()) {
6746+ printk(KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying "
6747+ "%d\n", ioapic, apic_id, reg_00.bits.ID);
6748+ apic_id = reg_00.bits.ID;
6749+ }
6750+
6751+ /*
6752+ * Every APIC in a system must have a unique ID or we get lots of nice
6753+ * 'stuck on smp_invalidate_needed IPI wait' messages.
6754+ */
6755+ if (check_apicid_used(apic_id_map, apic_id)) {
6756+
6757+ for (i = 0; i < get_physical_broadcast(); i++) {
6758+ if (!check_apicid_used(apic_id_map, i))
6759+ break;
6760+ }
6761+
6762+ if (i == get_physical_broadcast())
6763+ panic("Max apic_id exceeded!\n");
6764+
6765+ printk(KERN_WARNING "IOAPIC[%d]: apic_id %d already used, "
6766+ "trying %d\n", ioapic, apic_id, i);
6767+
6768+ apic_id = i;
6769+ }
6770+
6771+ tmp = apicid_to_cpu_present(apic_id);
6772+ physids_or(apic_id_map, apic_id_map, tmp);
6773+
6774+ if (reg_00.bits.ID != apic_id) {
6775+ reg_00.bits.ID = apic_id;
6776+
6777+ spin_lock_irqsave(&ioapic_lock, flags);
6778+ io_apic_write(ioapic, 0, reg_00.raw);
6779+ reg_00.raw = io_apic_read(ioapic, 0);
6780+ spin_unlock_irqrestore(&ioapic_lock, flags);
6781+
6782+ /* Sanity check */
6783+ if (reg_00.bits.ID != apic_id) {
6784+ printk("IOAPIC[%d]: Unable to change apic_id!\n", ioapic);
6785+ return -1;
6786+ }
6787+ }
6788+
6789+ apic_printk(APIC_VERBOSE, KERN_INFO
6790+ "IOAPIC[%d]: Assigned apic_id %d\n", ioapic, apic_id);
6791+#endif /* !CONFIG_XEN */
6792+
6793+ return apic_id;
6794+}
6795+
6796+
6797+int __init io_apic_get_version (int ioapic)
6798+{
6799+ union IO_APIC_reg_01 reg_01;
6800+ unsigned long flags;
6801+
6802+ spin_lock_irqsave(&ioapic_lock, flags);
6803+ reg_01.raw = io_apic_read(ioapic, 1);
6804+ spin_unlock_irqrestore(&ioapic_lock, flags);
6805+
6806+ return reg_01.bits.version;
6807+}
6808+
6809+
6810+int __init io_apic_get_redir_entries (int ioapic)
6811+{
6812+ union IO_APIC_reg_01 reg_01;
6813+ unsigned long flags;
6814+
6815+ spin_lock_irqsave(&ioapic_lock, flags);
6816+ reg_01.raw = io_apic_read(ioapic, 1);
6817+ spin_unlock_irqrestore(&ioapic_lock, flags);
6818+
6819+ return reg_01.bits.entries;
6820+}
6821+
6822+
6823+int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int active_high_low)
6824+{
6825+ struct IO_APIC_route_entry entry;
6826+ unsigned long flags;
6827+
6828+ if (!IO_APIC_IRQ(irq)) {
6829+ printk(KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
6830+ ioapic);
6831+ return -EINVAL;
6832+ }
6833+
6834+ /*
6835+ * Generate a PCI IRQ routing entry and program the IOAPIC accordingly.
6836+ * Note that we mask (disable) IRQs now -- these get enabled when the
6837+ * corresponding device driver registers for this IRQ.
6838+ */
6839+
6840+ memset(&entry,0,sizeof(entry));
6841+
6842+ entry.delivery_mode = INT_DELIVERY_MODE;
6843+ entry.dest_mode = INT_DEST_MODE;
6844+ entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
6845+ entry.trigger = edge_level;
6846+ entry.polarity = active_high_low;
6847+ entry.mask = 1;
6848+
6849+ /*
6850+ * IRQs < 16 are already in the irq_2_pin[] map
6851+ */
6852+ if (irq >= 16)
6853+ add_pin_to_irq(irq, ioapic, pin);
6854+
6855+ entry.vector = assign_irq_vector(irq);
6856+
6857+ apic_printk(APIC_DEBUG, KERN_DEBUG "IOAPIC[%d]: Set PCI routing entry "
6858+ "(%d-%d -> 0x%x -> IRQ %d Mode:%i Active:%i)\n", ioapic,
6859+ mp_ioapics[ioapic].mpc_apicid, pin, entry.vector, irq,
6860+ edge_level, active_high_low);
6861+
6862+ ioapic_register_intr(irq, entry.vector, edge_level);
6863+
6864+ if (!ioapic && (irq < 16))
6865+ disable_8259A_irq(irq);
6866+
6867+ spin_lock_irqsave(&ioapic_lock, flags);
6868+ io_apic_write(ioapic, 0x11+2*pin, *(((int *)&entry)+1));
6869+ io_apic_write(ioapic, 0x10+2*pin, *(((int *)&entry)+0));
6870+ set_native_irq_info(use_pci_vector() ? entry.vector : irq, TARGET_CPUS);
6871+ spin_unlock_irqrestore(&ioapic_lock, flags);
6872+
6873+ return 0;
6874+}
6875+
6876+#endif /* CONFIG_ACPI */
6877diff -Nur linux-2.6.16.33-noxen/arch/i386/kernel/ioport-xen.c linux-2.6.16.33/arch/i386/kernel/ioport-xen.c
6878--- linux-2.6.16.33-noxen/arch/i386/kernel/ioport-xen.c 1970-01-01 00:00:00.000000000 +0000
6879+++ linux-2.6.16.33/arch/i386/kernel/ioport-xen.c 2007-01-08 15:00:45.000000000 +0000
6880@@ -0,0 +1,121 @@
6881+/*
6882+ * linux/arch/i386/kernel/ioport.c
6883+ *
6884+ * This contains the io-permission bitmap code - written by obz, with changes
6885+ * by Linus.
6886+ */
6887+
6888+#include <linux/sched.h>
6889+#include <linux/kernel.h>
6890+#include <linux/capability.h>
6891+#include <linux/errno.h>
6892+#include <linux/types.h>
6893+#include <linux/ioport.h>
6894+#include <linux/smp.h>
6895+#include <linux/smp_lock.h>
6896+#include <linux/stddef.h>
6897+#include <linux/slab.h>
6898+#include <linux/thread_info.h>
6899+#include <xen/interface/physdev.h>
6900+
6901+/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */
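+/*
+ * For example, set_bitmap(map, 3, 70, 1) sets bits 3..72 inclusive,
+ * covering the partial words at either end and any full words in between.
+ */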
6902+static void set_bitmap(unsigned long *bitmap, unsigned int base, unsigned int extent, int new_value)
6903+{
6904+ unsigned long mask;
6905+ unsigned long *bitmap_base = bitmap + (base / BITS_PER_LONG);
6906+ unsigned int low_index = base & (BITS_PER_LONG-1);
6907+ int length = low_index + extent;
6908+
6909+ if (low_index != 0) {
6910+ mask = (~0UL << low_index);
6911+ if (length < BITS_PER_LONG)
6912+ mask &= ~(~0UL << length);
6913+ if (new_value)
6914+ *bitmap_base++ |= mask;
6915+ else
6916+ *bitmap_base++ &= ~mask;
6917+ length -= BITS_PER_LONG;
6918+ }
6919+
6920+ mask = (new_value ? ~0UL : 0UL);
6921+ while (length >= BITS_PER_LONG) {
6922+ *bitmap_base++ = mask;
6923+ length -= BITS_PER_LONG;
6924+ }
6925+
6926+ if (length > 0) {
6927+ mask = ~(~0UL << length);
6928+ if (new_value)
6929+ *bitmap_base++ |= mask;
6930+ else
6931+ *bitmap_base++ &= ~mask;
6932+ }
6933+}
6934+
6935+
6936+/*
6937+ * this changes the io permissions bitmap in the current task.
6938+ */
6939+asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
6940+{
6941+ struct thread_struct * t = &current->thread;
6942+ unsigned long *bitmap;
6943+ struct physdev_set_iobitmap set_iobitmap;
6944+
6945+ if ((from + num <= from) || (from + num > IO_BITMAP_BITS))
6946+ return -EINVAL;
6947+ if (turn_on && !capable(CAP_SYS_RAWIO))
6948+ return -EPERM;
6949+
6950+ /*
6951+ * If it's the first ioperm() call in this thread's lifetime, set the
6952+ * IO bitmap up. ioperm() is much less timing critical than clone(),
6953+ * which is why we delay this operation until now:
6954+ */
6955+ if (!t->io_bitmap_ptr) {
6956+ bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
6957+ if (!bitmap)
6958+ return -ENOMEM;
6959+
6960+ memset(bitmap, 0xff, IO_BITMAP_BYTES);
6961+ t->io_bitmap_ptr = bitmap;
6962+
6963+ set_iobitmap.bitmap = (char *)bitmap;
6964+ set_iobitmap.nr_ports = IO_BITMAP_BITS;
6965+ HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap, &set_iobitmap);
6966+ }
6967+
6968+ set_bitmap(t->io_bitmap_ptr, from, num, !turn_on);
6969+
6970+ return 0;
6971+}
6972+
6973+/*
6974+ * sys_iopl has to be used when you want to access the IO ports
6975+ * beyond the 0x3ff range: to get the full 65536 ports bitmapped
6976+ * you'd need 8kB of bitmaps/process, which is a bit excessive.
6977+ *
6978+ * Here we just change the eflags value on the stack: we allow
6979+ * only the super-user to do it. This depends on the stack-layout
6980+ * on system-call entry - see also fork() and the signal handling
6981+ * code.
6982+ */
6983+
6984+asmlinkage long sys_iopl(unsigned long unused)
6985+{
6986+ volatile struct pt_regs * regs = (struct pt_regs *) &unused;
6987+ unsigned int level = regs->ebx;
6988+ struct thread_struct *t = &current->thread;
6989+ unsigned int old = (t->iopl >> 12) & 3;
6990+
6991+ if (level > 3)
6992+ return -EINVAL;
6993+ /* Trying to gain more privileges? */
6994+ if (level > old) {
6995+ if (!capable(CAP_SYS_RAWIO))
6996+ return -EPERM;
6997+ }
6998+ t->iopl = level << 12;
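+ /* IOPL occupies bits 12-13 of EFLAGS, hence the << 12 here and the >> 12 above. */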
6999+ set_iopl_mask(t->iopl);
7000+ return 0;
7001+}
7002diff -Nur linux-2.6.16.33-noxen/arch/i386/kernel/irq-xen.c linux-2.6.16.33/arch/i386/kernel/irq-xen.c
7003--- linux-2.6.16.33-noxen/arch/i386/kernel/irq-xen.c 1970-01-01 00:00:00.000000000 +0000
7004+++ linux-2.6.16.33/arch/i386/kernel/irq-xen.c 2007-01-08 15:00:45.000000000 +0000
7005@@ -0,0 +1,306 @@
7006+/*
7007+ * linux/arch/i386/kernel/irq.c
7008+ *
7009+ * Copyright (C) 1992, 1998 Linus Torvalds, Ingo Molnar
7010+ *
7011+ * This file contains the lowest level x86-specific interrupt
7012+ * entry, irq-stacks and irq statistics code. All the remaining
7013+ * irq logic is done by the generic kernel/irq/ code and
7014+ * by the x86-specific irq controller code. (e.g. i8259.c and
7015+ * io_apic.c.)
7016+ */
7017+
7018+#include <asm/uaccess.h>
7019+#include <linux/module.h>
7020+#include <linux/seq_file.h>
7021+#include <linux/interrupt.h>
7022+#include <linux/kernel_stat.h>
7023+#include <linux/notifier.h>
7024+#include <linux/cpu.h>
7025+#include <linux/delay.h>
7026+
7027+DEFINE_PER_CPU(irq_cpustat_t, irq_stat) ____cacheline_internodealigned_in_smp;
7028+EXPORT_PER_CPU_SYMBOL(irq_stat);
7029+
7030+#ifndef CONFIG_X86_LOCAL_APIC
7031+/*
7032+ * 'what should we do if we get a hw irq event on an illegal vector'.
7033+ * each architecture has to answer this themselves.
7034+ * Each architecture has to answer this itself.
7035+void ack_bad_irq(unsigned int irq)
7036+{
7037+ printk("unexpected IRQ trap at vector %02x\n", irq);
7038+}
7039+#endif
7040+
7041+#ifdef CONFIG_4KSTACKS
7042+/*
7043+ * per-CPU IRQ handling contexts (thread information and stack)
7044+ */
7045+union irq_ctx {
7046+ struct thread_info tinfo;
7047+ u32 stack[THREAD_SIZE/sizeof(u32)];
7048+};
7049+
7050+static union irq_ctx *hardirq_ctx[NR_CPUS];
7051+static union irq_ctx *softirq_ctx[NR_CPUS];
7052+#endif
7053+
7054+/*
7055+ * do_IRQ handles all normal device IRQ's (the special
7056+ * SMP cross-CPU interrupts have their own specific
7057+ * handlers).
7058+ */
7059+fastcall unsigned int do_IRQ(struct pt_regs *regs)
7060+{
7061+ /* high bit used in ret_from_ code */
7062+ int irq = ~regs->orig_eax;
7063+#ifdef CONFIG_4KSTACKS
7064+ union irq_ctx *curctx, *irqctx;
7065+ u32 *isp;
7066+#endif
7067+
7068+ irq_enter();
7069+#ifdef CONFIG_DEBUG_STACKOVERFLOW
7070+ /* Debugging check for stack overflow: is there less than 1KB free? */
7071+ {
7072+ long esp;
7073+
7074+ __asm__ __volatile__("andl %%esp,%0" :
7075+ "=r" (esp) : "0" (THREAD_SIZE - 1));
7076+ if (unlikely(esp < (sizeof(struct thread_info) + STACK_WARN))) {
7077+ printk("do_IRQ: stack overflow: %ld\n",
7078+ esp - sizeof(struct thread_info));
7079+ dump_stack();
7080+ }
7081+ }
7082+#endif
7083+
7084+#ifdef CONFIG_4KSTACKS
7085+
7086+ curctx = (union irq_ctx *) current_thread_info();
7087+ irqctx = hardirq_ctx[smp_processor_id()];
7088+
7089+ /*
7090+ * this is where we switch to the IRQ stack. However, if we are
7091+ * already using the IRQ stack (because we interrupted a hardirq
7092+ * handler) we can't do that and just have to keep using the
7093+ * current stack (which is the irq stack already after all)
7094+ */
7095+ if (curctx != irqctx) {
7096+ int arg1, arg2, ebx;
7097+
7098+ /* build the stack frame on the IRQ stack */
7099+ isp = (u32*) ((char*)irqctx + sizeof(*irqctx));
7100+ irqctx->tinfo.task = curctx->tinfo.task;
7101+ irqctx->tinfo.previous_esp = current_stack_pointer;
7102+
7103+ asm volatile(
7104+ " xchgl %%ebx,%%esp \n"
7105+ " call __do_IRQ \n"
7106+ " movl %%ebx,%%esp \n"
7107+ : "=a" (arg1), "=d" (arg2), "=b" (ebx)
7108+ : "0" (irq), "1" (regs), "2" (isp)
7109+ : "memory", "cc", "ecx"
7110+ );
7111+ } else
7112+#endif
7113+ __do_IRQ(irq, regs);
7114+
7115+ irq_exit();
7116+
7117+ return 1;
7118+}
7119+
7120+#ifdef CONFIG_4KSTACKS
7121+
7122+/*
7123+ * These should really be __section__(".bss.page_aligned") as well, but
7124+ * gcc 3.0 and earlier don't handle that correctly.
7125+ */
7126+static char softirq_stack[NR_CPUS * THREAD_SIZE]
7127+ __attribute__((__aligned__(THREAD_SIZE)));
7128+
7129+static char hardirq_stack[NR_CPUS * THREAD_SIZE]
7130+ __attribute__((__aligned__(THREAD_SIZE)));
7131+
7132+/*
7133+ * allocate per-cpu stacks for hardirq and for softirq processing
7134+ */
7135+void irq_ctx_init(int cpu)
7136+{
7137+ union irq_ctx *irqctx;
7138+
7139+ if (hardirq_ctx[cpu])
7140+ return;
7141+
7142+ irqctx = (union irq_ctx*) &hardirq_stack[cpu*THREAD_SIZE];
7143+ irqctx->tinfo.task = NULL;
7144+ irqctx->tinfo.exec_domain = NULL;
7145+ irqctx->tinfo.cpu = cpu;
7146+ irqctx->tinfo.preempt_count = HARDIRQ_OFFSET;
7147+ irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
7148+
7149+ hardirq_ctx[cpu] = irqctx;
7150+
7151+ irqctx = (union irq_ctx*) &softirq_stack[cpu*THREAD_SIZE];
7152+ irqctx->tinfo.task = NULL;
7153+ irqctx->tinfo.exec_domain = NULL;
7154+ irqctx->tinfo.cpu = cpu;
7155+ irqctx->tinfo.preempt_count = SOFTIRQ_OFFSET;
7156+ irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
7157+
7158+ softirq_ctx[cpu] = irqctx;
7159+
7160+ printk("CPU %u irqstacks, hard=%p soft=%p\n",
7161+ cpu,hardirq_ctx[cpu],softirq_ctx[cpu]);
7162+}
7163+
7164+void irq_ctx_exit(int cpu)
7165+{
7166+ hardirq_ctx[cpu] = NULL;
7167+}
7168+
7169+extern asmlinkage void __do_softirq(void);
7170+
7171+asmlinkage void do_softirq(void)
7172+{
7173+ unsigned long flags;
7174+ struct thread_info *curctx;
7175+ union irq_ctx *irqctx;
7176+ u32 *isp;
7177+
7178+ if (in_interrupt())
7179+ return;
7180+
7181+ local_irq_save(flags);
7182+
7183+ if (local_softirq_pending()) {
7184+ curctx = current_thread_info();
7185+ irqctx = softirq_ctx[smp_processor_id()];
7186+ irqctx->tinfo.task = curctx->task;
7187+ irqctx->tinfo.previous_esp = current_stack_pointer;
7188+
7189+ /* build the stack frame on the softirq stack */
7190+ isp = (u32*) ((char*)irqctx + sizeof(*irqctx));
7191+
7192+ asm volatile(
7193+ " xchgl %%ebx,%%esp \n"
7194+ " call __do_softirq \n"
7195+ " movl %%ebx,%%esp \n"
7196+ : "=b"(isp)
7197+ : "0"(isp)
7198+ : "memory", "cc", "edx", "ecx", "eax"
7199+ );
7200+ }
7201+
7202+ local_irq_restore(flags);
7203+}
7204+
7205+EXPORT_SYMBOL(do_softirq);
7206+#endif
7207+
7208+/*
7209+ * Interrupt statistics:
7210+ */
7211+
7212+atomic_t irq_err_count;
7213+
7214+/*
7215+ * /proc/interrupts printing:
7216+ */
7217+
7218+int show_interrupts(struct seq_file *p, void *v)
7219+{
7220+ int i = *(loff_t *) v, j;
7221+ struct irqaction * action;
7222+ unsigned long flags;
7223+
7224+ if (i == 0) {
7225+ seq_printf(p, " ");
7226+ for_each_online_cpu(j)
7227+ seq_printf(p, "CPU%d ",j);
7228+ seq_putc(p, '\n');
7229+ }
7230+
7231+ if (i < NR_IRQS) {
7232+ spin_lock_irqsave(&irq_desc[i].lock, flags);
7233+ action = irq_desc[i].action;
7234+ if (!action)
7235+ goto skip;
7236+ seq_printf(p, "%3d: ",i);
7237+#ifndef CONFIG_SMP
7238+ seq_printf(p, "%10u ", kstat_irqs(i));
7239+#else
7240+ for_each_online_cpu(j)
7241+ seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]);
7242+#endif
7243+ seq_printf(p, " %14s", irq_desc[i].handler->typename);
7244+ seq_printf(p, " %s", action->name);
7245+
7246+ for (action=action->next; action; action = action->next)
7247+ seq_printf(p, ", %s", action->name);
7248+
7249+ seq_putc(p, '\n');
7250+skip:
7251+ spin_unlock_irqrestore(&irq_desc[i].lock, flags);
7252+ } else if (i == NR_IRQS) {
7253+ seq_printf(p, "NMI: ");
7254+ for_each_online_cpu(j)
7255+ seq_printf(p, "%10u ", nmi_count(j));
7256+ seq_putc(p, '\n');
7257+#ifdef CONFIG_X86_LOCAL_APIC
7258+ seq_printf(p, "LOC: ");
7259+ for_each_online_cpu(j)
7260+ seq_printf(p, "%10u ",
7261+ per_cpu(irq_stat,j).apic_timer_irqs);
7262+ seq_putc(p, '\n');
7263+#endif
7264+ seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count));
7265+#if defined(CONFIG_X86_IO_APIC)
7266+ seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count));
7267+#endif
7268+ }
7269+ return 0;
7270+}
7271+
7272+#ifdef CONFIG_HOTPLUG_CPU
7273+
7274+void fixup_irqs(cpumask_t map)
7275+{
7276+ unsigned int irq;
7277+ static int warned;
7278+
7279+ for (irq = 0; irq < NR_IRQS; irq++) {
7280+ cpumask_t mask;
7281+ if (irq == 2)
7282+ continue;
7283+
7284+ cpus_and(mask, irq_affinity[irq], map);
7285+ if (any_online_cpu(mask) == NR_CPUS) {
7286+ /*printk("Breaking affinity for irq %i\n", irq);*/
7287+ mask = map;
7288+ }
7289+ if (irq_desc[irq].handler->set_affinity)
7290+ irq_desc[irq].handler->set_affinity(irq, mask);
7291+ else if (irq_desc[irq].action && !(warned++))
7292+ printk("Cannot set affinity for irq %i\n", irq);
7293+ }
7294+
7295+#if 0
7296+ barrier();
7297+ /* Ingo Molnar says: "after the IO-APIC masks have been redirected
7298+ [note the nop - the interrupt-enable boundary on x86 is two
7299+ instructions from sti] - to flush out pending hardirqs and
7300+ IPIs. After this point nothing is supposed to reach this CPU." */
7301+ __asm__ __volatile__("sti; nop; cli");
7302+ barrier();
7303+#else
7304+ /* That doesn't seem sufficient. Give it 1ms. */
7305+ local_irq_enable();
7306+ mdelay(1);
7307+ local_irq_disable();
7308+#endif
7309+}
7310+#endif
7311+
7312diff -Nur linux-2.6.16.33-noxen/arch/i386/kernel/irq.c linux-2.6.16.33/arch/i386/kernel/irq.c
7313--- linux-2.6.16.33-noxen/arch/i386/kernel/irq.c 2006-11-22 18:06:31.000000000 +0000
7314+++ linux-2.6.16.33/arch/i386/kernel/irq.c 2007-05-23 21:00:01.000000000 +0000
7315@@ -53,8 +53,8 @@
7316 */
7317 fastcall unsigned int do_IRQ(struct pt_regs *regs)
7318 {
7319- /* high bits used in ret_from_ code */
7320- int irq = regs->orig_eax & 0xff;
7321+ /* high bit used in ret_from_ code */
7322+ int irq = ~regs->orig_eax;
7323 #ifdef CONFIG_4KSTACKS
7324 union irq_ctx *curctx, *irqctx;
7325 u32 *isp;
7326diff -Nur linux-2.6.16.33-noxen/arch/i386/kernel/ldt-xen.c linux-2.6.16.33/arch/i386/kernel/ldt-xen.c
7327--- linux-2.6.16.33-noxen/arch/i386/kernel/ldt-xen.c 1970-01-01 00:00:00.000000000 +0000
7328+++ linux-2.6.16.33/arch/i386/kernel/ldt-xen.c 2007-01-08 15:00:45.000000000 +0000
7329@@ -0,0 +1,270 @@
7330+/*
7331+ * linux/kernel/ldt.c
7332+ *
7333+ * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds
7334+ * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
7335+ */
7336+
7337+#include <linux/errno.h>
7338+#include <linux/sched.h>
7339+#include <linux/string.h>
7340+#include <linux/mm.h>
7341+#include <linux/smp.h>
7342+#include <linux/smp_lock.h>
7343+#include <linux/vmalloc.h>
7344+#include <linux/slab.h>
7345+
7346+#include <asm/uaccess.h>
7347+#include <asm/system.h>
7348+#include <asm/ldt.h>
7349+#include <asm/desc.h>
7350+#include <asm/mmu_context.h>
7351+
7352+#ifdef CONFIG_SMP /* avoids "defined but not used" warning */
7353+static void flush_ldt(void *null)
7354+{
7355+ if (current->active_mm)
7356+ load_LDT(&current->active_mm->context);
7357+}
7358+#endif
7359+
7360+static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
7361+{
7362+ void *oldldt;
7363+ void *newldt;
7364+ int oldsize;
7365+
7366+ if (mincount <= pc->size)
7367+ return 0;
7368+ oldsize = pc->size;
7369+ mincount = (mincount+511)&(~511);
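+ /* Rounds up to a multiple of 512 entries, e.g. 1 -> 512, 513 -> 1024. */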
7370+ if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE)
7371+ newldt = vmalloc(mincount*LDT_ENTRY_SIZE);
7372+ else
7373+ newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL);
7374+
7375+ if (!newldt)
7376+ return -ENOMEM;
7377+
7378+ if (oldsize)
7379+ memcpy(newldt, pc->ldt, oldsize*LDT_ENTRY_SIZE);
7380+ oldldt = pc->ldt;
7381+ memset(newldt+oldsize*LDT_ENTRY_SIZE, 0, (mincount-oldsize)*LDT_ENTRY_SIZE);
7382+ pc->ldt = newldt;
7383+ wmb();
7384+ pc->size = mincount;
7385+ wmb();
7386+
7387+ if (reload) {
7388+#ifdef CONFIG_SMP
7389+ cpumask_t mask;
7390+ preempt_disable();
7391+#endif
7392+ make_pages_readonly(
7393+ pc->ldt,
7394+ (pc->size * LDT_ENTRY_SIZE) / PAGE_SIZE,
7395+ XENFEAT_writable_descriptor_tables);
7396+ load_LDT(pc);
7397+#ifdef CONFIG_SMP
7398+ mask = cpumask_of_cpu(smp_processor_id());
7399+ if (!cpus_equal(current->mm->cpu_vm_mask, mask))
7400+ smp_call_function(flush_ldt, NULL, 1, 1);
7401+ preempt_enable();
7402+#endif
7403+ }
7404+ if (oldsize) {
7405+ make_pages_writable(
7406+ oldldt,
7407+ (oldsize * LDT_ENTRY_SIZE) / PAGE_SIZE,
7408+ XENFEAT_writable_descriptor_tables);
7409+ if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE)
7410+ vfree(oldldt);
7411+ else
7412+ kfree(oldldt);
7413+ }
7414+ return 0;
7415+}
7416+
7417+static inline int copy_ldt(mm_context_t *new, mm_context_t *old)
7418+{
7419+ int err = alloc_ldt(new, old->size, 0);
7420+ if (err < 0)
7421+ return err;
7422+ memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE);
7423+ make_pages_readonly(
7424+ new->ldt,
7425+ (new->size * LDT_ENTRY_SIZE) / PAGE_SIZE,
7426+ XENFEAT_writable_descriptor_tables);
7427+ return 0;
7428+}
7429+
7430+/*
7431+ * We do not have to muck with descriptors here; that is
7432+ * done in switch_mm() as needed.
7433+ */
7434+int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
7435+{
7436+ struct mm_struct * old_mm;
7437+ int retval = 0;
7438+
7439+ init_MUTEX(&mm->context.sem);
7440+ mm->context.size = 0;
7441+ mm->context.has_foreign_mappings = 0;
7442+ old_mm = current->mm;
7443+ if (old_mm && old_mm->context.size > 0) {
7444+ down(&old_mm->context.sem);
7445+ retval = copy_ldt(&mm->context, &old_mm->context);
7446+ up(&old_mm->context.sem);
7447+ }
7448+ return retval;
7449+}
7450+
7451+/*
7452+ * No need to lock the MM as we are the last user
7453+ */
7454+void destroy_context(struct mm_struct *mm)
7455+{
7456+ if (mm->context.size) {
7457+ if (mm == current->active_mm)
7458+ clear_LDT();
7459+ make_pages_writable(
7460+ mm->context.ldt,
7461+ (mm->context.size * LDT_ENTRY_SIZE) / PAGE_SIZE,
7462+ XENFEAT_writable_descriptor_tables);
7463+ if (mm->context.size*LDT_ENTRY_SIZE > PAGE_SIZE)
7464+ vfree(mm->context.ldt);
7465+ else
7466+ kfree(mm->context.ldt);
7467+ mm->context.size = 0;
7468+ }
7469+}
7470+
7471+static int read_ldt(void __user * ptr, unsigned long bytecount)
7472+{
7473+ int err;
7474+ unsigned long size;
7475+ struct mm_struct * mm = current->mm;
7476+
7477+ if (!mm->context.size)
7478+ return 0;
7479+ if (bytecount > LDT_ENTRY_SIZE*LDT_ENTRIES)
7480+ bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES;
7481+
7482+ down(&mm->context.sem);
7483+ size = mm->context.size*LDT_ENTRY_SIZE;
7484+ if (size > bytecount)
7485+ size = bytecount;
7486+
7487+ err = 0;
7488+ if (copy_to_user(ptr, mm->context.ldt, size))
7489+ err = -EFAULT;
7490+ up(&mm->context.sem);
7491+ if (err < 0)
7492+ goto error_return;
7493+ if (size != bytecount) {
7494+ /* zero-fill the rest */
7495+ if (clear_user(ptr+size, bytecount-size) != 0) {
7496+ err = -EFAULT;
7497+ goto error_return;
7498+ }
7499+ }
7500+ return bytecount;
7501+error_return:
7502+ return err;
7503+}
7504+
7505+static int read_default_ldt(void __user * ptr, unsigned long bytecount)
7506+{
7507+ int err;
7508+ unsigned long size;
7509+ void *address;
7510+
7511+ err = 0;
7512+ address = &default_ldt[0];
7513+ size = 5*sizeof(struct desc_struct);
7514+ if (size > bytecount)
7515+ size = bytecount;
7516+
7517+ err = size;
7518+ if (copy_to_user(ptr, address, size))
7519+ err = -EFAULT;
7520+
7521+ return err;
7522+}
7523+
7524+static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode)
7525+{
7526+ struct mm_struct * mm = current->mm;
7527+ __u32 entry_1, entry_2;
7528+ int error;
7529+ struct user_desc ldt_info;
7530+
7531+ error = -EINVAL;
7532+ if (bytecount != sizeof(ldt_info))
7533+ goto out;
7534+ error = -EFAULT;
7535+ if (copy_from_user(&ldt_info, ptr, sizeof(ldt_info)))
7536+ goto out;
7537+
7538+ error = -EINVAL;
7539+ if (ldt_info.entry_number >= LDT_ENTRIES)
7540+ goto out;
7541+ if (ldt_info.contents == 3) {
7542+ if (oldmode)
7543+ goto out;
7544+ if (ldt_info.seg_not_present == 0)
7545+ goto out;
7546+ }
7547+
7548+ down(&mm->context.sem);
7549+ if (ldt_info.entry_number >= mm->context.size) {
7550+ error = alloc_ldt(&current->mm->context, ldt_info.entry_number+1, 1);
7551+ if (error < 0)
7552+ goto out_unlock;
7553+ }
7554+
7555+ /* Allow LDTs to be cleared by the user. */
7556+ if (ldt_info.base_addr == 0 && ldt_info.limit == 0) {
7557+ if (oldmode || LDT_empty(&ldt_info)) {
7558+ entry_1 = 0;
7559+ entry_2 = 0;
7560+ goto install;
7561+ }
7562+ }
7563+
7564+ entry_1 = LDT_entry_a(&ldt_info);
7565+ entry_2 = LDT_entry_b(&ldt_info);
7566+ if (oldmode)
7567+ entry_2 &= ~(1 << 20);
7568+
7569+ /* Install the new entry ... */
7570+install:
7571+ error = write_ldt_entry(mm->context.ldt, ldt_info.entry_number,
7572+ entry_1, entry_2);
7573+
7574+out_unlock:
7575+ up(&mm->context.sem);
7576+out:
7577+ return error;
7578+}
7579+
7580+asmlinkage int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount)
7581+{
7582+ int ret = -ENOSYS;
7583+
7584+ switch (func) {
7585+ case 0:
7586+ ret = read_ldt(ptr, bytecount);
7587+ break;
7588+ case 1:
7589+ ret = write_ldt(ptr, bytecount, 1);
7590+ break;
7591+ case 2:
7592+ ret = read_default_ldt(ptr, bytecount);
7593+ break;
7594+ case 0x11:
7595+ ret = write_ldt(ptr, bytecount, 0);
7596+ break;
7597+ }
7598+ return ret;
7599+}
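
The sys_modify_ldt() dispatch just above keeps the historical ABI: func 0 reads the LDT, 1 writes an entry in the old format, 2 reads the default LDT, and 0x11 writes an entry in the new format. The following user-space sketch is editorial illustration only, not part of the patch; it assumes a Linux/x86 host whose <asm/ldt.h> provides struct user_desc and the LDT size constants, and it exercises the "clear entry" path accepted by LDT_empty() in write_ldt() above.

#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <asm/ldt.h>            /* struct user_desc, LDT_ENTRIES, LDT_ENTRY_SIZE */

int main(void)
{
	struct user_desc desc;
	unsigned char buf[16 * LDT_ENTRY_SIZE];
	long n;

	/* func 0x11: write_ldt() with oldmode == 0; base == limit == 0 plus
	 * read_exec_only/seg_not_present selects the "clear this entry" path. */
	memset(&desc, 0, sizeof(desc));
	desc.entry_number    = 0;
	desc.read_exec_only  = 1;
	desc.seg_not_present = 1;
	if (syscall(SYS_modify_ldt, 0x11, &desc, sizeof(desc)) != 0)
		perror("modify_ldt(write)");

	/* func 0: read_ldt() copies the table and zero-fills the remainder. */
	n = syscall(SYS_modify_ldt, 0, buf, sizeof(buf));
	printf("read_ldt returned %ld bytes\n", n);
	return 0;
}
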
7600diff -Nur linux-2.6.16.33-noxen/arch/i386/kernel/machine_kexec.c linux-2.6.16.33/arch/i386/kernel/machine_kexec.c
7601--- linux-2.6.16.33-noxen/arch/i386/kernel/machine_kexec.c 2006-11-22 18:06:31.000000000 +0000
7602+++ linux-2.6.16.33/arch/i386/kernel/machine_kexec.c 2007-01-08 15:00:45.000000000 +0000
7603@@ -19,123 +19,52 @@
7604 #include <asm/desc.h>
7605 #include <asm/system.h>
7606
7607-#define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE)))
7608-
7609-#define L0_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
7610-#define L1_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
7611-#define L2_ATTR (_PAGE_PRESENT)
7612-
7613-#define LEVEL0_SIZE (1UL << 12UL)
7614-
7615-#ifndef CONFIG_X86_PAE
7616-#define LEVEL1_SIZE (1UL << 22UL)
7617-static u32 pgtable_level1[1024] PAGE_ALIGNED;
7618-
7619-static void identity_map_page(unsigned long address)
7620-{
7621- unsigned long level1_index, level2_index;
7622- u32 *pgtable_level2;
7623-
7624- /* Find the current page table */
7625- pgtable_level2 = __va(read_cr3());
7626-
7627- /* Find the indexes of the physical address to identity map */
7628- level1_index = (address % LEVEL1_SIZE)/LEVEL0_SIZE;
7629- level2_index = address / LEVEL1_SIZE;
7630-
7631- /* Identity map the page table entry */
7632- pgtable_level1[level1_index] = address | L0_ATTR;
7633- pgtable_level2[level2_index] = __pa(pgtable_level1) | L1_ATTR;
7634-
7635- /* Flush the tlb so the new mapping takes effect.
7636- * Global tlb entries are not flushed but that is not an issue.
7637- */
7638- load_cr3(pgtable_level2);
7639-}
7640-
7641-#else
7642-#define LEVEL1_SIZE (1UL << 21UL)
7643-#define LEVEL2_SIZE (1UL << 30UL)
7644-static u64 pgtable_level1[512] PAGE_ALIGNED;
7645-static u64 pgtable_level2[512] PAGE_ALIGNED;
7646-
7647-static void identity_map_page(unsigned long address)
7648-{
7649- unsigned long level1_index, level2_index, level3_index;
7650- u64 *pgtable_level3;
7651-
7652- /* Find the current page table */
7653- pgtable_level3 = __va(read_cr3());
7654+#ifdef CONFIG_XEN
7655+#include <xen/interface/kexec.h>
7656+#endif
7657
7658- /* Find the indexes of the physical address to identity map */
7659- level1_index = (address % LEVEL1_SIZE)/LEVEL0_SIZE;
7660- level2_index = (address % LEVEL2_SIZE)/LEVEL1_SIZE;
7661- level3_index = address / LEVEL2_SIZE;
7662-
7663- /* Identity map the page table entry */
7664- pgtable_level1[level1_index] = address | L0_ATTR;
7665- pgtable_level2[level2_index] = __pa(pgtable_level1) | L1_ATTR;
7666- set_64bit(&pgtable_level3[level3_index],
7667- __pa(pgtable_level2) | L2_ATTR);
7668-
7669- /* Flush the tlb so the new mapping takes effect.
7670- * Global tlb entries are not flushed but that is not an issue.
7671- */
7672- load_cr3(pgtable_level3);
7673-}
7674+#define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE)))
7675+static u32 kexec_pgd[1024] PAGE_ALIGNED;
7676+#ifdef CONFIG_X86_PAE
7677+static u32 kexec_pmd0[1024] PAGE_ALIGNED;
7678+static u32 kexec_pmd1[1024] PAGE_ALIGNED;
7679 #endif
7680+static u32 kexec_pte0[1024] PAGE_ALIGNED;
7681+static u32 kexec_pte1[1024] PAGE_ALIGNED;
7682
7683-static void set_idt(void *newidt, __u16 limit)
7684-{
7685- struct Xgt_desc_struct curidt;
7686+#ifdef CONFIG_XEN
7687
7688- /* ia32 supports unaliged loads & stores */
7689- curidt.size = limit;
7690- curidt.address = (unsigned long)newidt;
7691+#define __ma(x) (pfn_to_mfn(__pa((x)) >> PAGE_SHIFT) << PAGE_SHIFT)
7692
7693- load_idt(&curidt);
7694-};
7695+#if PAGES_NR > KEXEC_XEN_NO_PAGES
7696+#error PAGES_NR is greater than KEXEC_XEN_NO_PAGES - Xen support will break
7697+#endif
7698
7699+#if PA_CONTROL_PAGE != 0
7700+#error PA_CONTROL_PAGE is non zero - Xen support will break
7701+#endif
7702
7703-static void set_gdt(void *newgdt, __u16 limit)
7704+void machine_kexec_setup_load_arg(xen_kexec_image_t *xki, struct kimage *image)
7705 {
7706- struct Xgt_desc_struct curgdt;
7707+ void *control_page;
7708
7709- /* ia32 supports unaligned loads & stores */
7710- curgdt.size = limit;
7711- curgdt.address = (unsigned long)newgdt;
7712+ memset(xki->page_list, 0, sizeof(xki->page_list));
7713
7714- load_gdt(&curgdt);
7715-};
7716+ control_page = page_address(image->control_code_page);
7717+ memcpy(control_page, relocate_kernel, PAGE_SIZE);
7718
7719-static void load_segments(void)
7720-{
7721-#define __STR(X) #X
7722-#define STR(X) __STR(X)
7723+ xki->page_list[PA_CONTROL_PAGE] = __ma(control_page);
7724+ xki->page_list[PA_PGD] = __ma(kexec_pgd);
7725+#ifdef CONFIG_X86_PAE
7726+ xki->page_list[PA_PMD_0] = __ma(kexec_pmd0);
7727+ xki->page_list[PA_PMD_1] = __ma(kexec_pmd1);
7728+#endif
7729+ xki->page_list[PA_PTE_0] = __ma(kexec_pte0);
7730+ xki->page_list[PA_PTE_1] = __ma(kexec_pte1);
7731
7732- __asm__ __volatile__ (
7733- "\tljmp $"STR(__KERNEL_CS)",$1f\n"
7734- "\t1:\n"
7735- "\tmovl $"STR(__KERNEL_DS)",%%eax\n"
7736- "\tmovl %%eax,%%ds\n"
7737- "\tmovl %%eax,%%es\n"
7738- "\tmovl %%eax,%%fs\n"
7739- "\tmovl %%eax,%%gs\n"
7740- "\tmovl %%eax,%%ss\n"
7741- ::: "eax", "memory");
7742-#undef STR
7743-#undef __STR
7744 }
7745
7746-typedef asmlinkage NORET_TYPE void (*relocate_new_kernel_t)(
7747- unsigned long indirection_page,
7748- unsigned long reboot_code_buffer,
7749- unsigned long start_address,
7750- unsigned int has_pae) ATTRIB_NORET;
7751-
7752-const extern unsigned char relocate_new_kernel[];
7753-extern void relocate_new_kernel_end(void);
7754-const extern unsigned int relocate_new_kernel_size;
7755+#endif /* CONFIG_XEN */
7756
7757 /*
7758 * A architecture hook called to validate the
7759@@ -163,52 +92,38 @@
7760 {
7761 }
7762
7763+#ifndef CONFIG_XEN
7764 /*
7765 * Do not allocate memory (or fail in any way) in machine_kexec().
7766 * We are past the point of no return, committed to rebooting now.
7767 */
7768 NORET_TYPE void machine_kexec(struct kimage *image)
7769 {
7770- unsigned long page_list;
7771- unsigned long reboot_code_buffer;
7772-
7773- relocate_new_kernel_t rnk;
7774+ unsigned long page_list[PAGES_NR];
7775+ void *control_page;
7776
7777 /* Interrupts aren't acceptable while we reboot */
7778 local_irq_disable();
7779
7780- /* Compute some offsets */
7781- reboot_code_buffer = page_to_pfn(image->control_code_page)
7782- << PAGE_SHIFT;
7783- page_list = image->head;
7784-
7785- /* Set up an identity mapping for the reboot_code_buffer */
7786- identity_map_page(reboot_code_buffer);
7787-
7788- /* copy it out */
7789- memcpy((void *)reboot_code_buffer, relocate_new_kernel,
7790- relocate_new_kernel_size);
7791-
7792- /* The segment registers are funny things, they are
7793- * automatically loaded from a table, in memory wherever you
7794- * set them to a specific selector, but this table is never
7795- * accessed again you set the segment to a different selector.
7796- *
7797- * The more common model is are caches where the behide
7798- * the scenes work is done, but is also dropped at arbitrary
7799- * times.
7800- *
7801- * I take advantage of this here by force loading the
7802- * segments, before I zap the gdt with an invalid value.
7803- */
7804- load_segments();
7805- /* The gdt & idt are now invalid.
7806- * If you want to load them you must set up your own idt & gdt.
7807- */
7808- set_gdt(phys_to_virt(0),0);
7809- set_idt(phys_to_virt(0),0);
7810-
7811- /* now call it */
7812- rnk = (relocate_new_kernel_t) reboot_code_buffer;
7813- (*rnk)(page_list, reboot_code_buffer, image->start, cpu_has_pae);
7814+ control_page = page_address(image->control_code_page);
7815+ memcpy(control_page, relocate_kernel, PAGE_SIZE);
7816+
7817+ page_list[PA_CONTROL_PAGE] = __pa(control_page);
7818+ page_list[VA_CONTROL_PAGE] = (unsigned long)relocate_kernel;
7819+ page_list[PA_PGD] = __pa(kexec_pgd);
7820+ page_list[VA_PGD] = (unsigned long)kexec_pgd;
7821+#ifdef CONFIG_X86_PAE
7822+ page_list[PA_PMD_0] = __pa(kexec_pmd0);
7823+ page_list[VA_PMD_0] = (unsigned long)kexec_pmd0;
7824+ page_list[PA_PMD_1] = __pa(kexec_pmd1);
7825+ page_list[VA_PMD_1] = (unsigned long)kexec_pmd1;
7826+#endif
7827+ page_list[PA_PTE_0] = __pa(kexec_pte0);
7828+ page_list[VA_PTE_0] = (unsigned long)kexec_pte0;
7829+ page_list[PA_PTE_1] = __pa(kexec_pte1);
7830+ page_list[VA_PTE_1] = (unsigned long)kexec_pte1;
7831+
7832+ relocate_kernel((unsigned long)image->head, (unsigned long)page_list,
7833+ image->start, cpu_has_pae);
7834 }
7835+#endif
7836diff -Nur linux-2.6.16.33-noxen/arch/i386/kernel/machine_kexec.c~ linux-2.6.16.33/arch/i386/kernel/machine_kexec.c~
7837--- linux-2.6.16.33-noxen/arch/i386/kernel/machine_kexec.c~ 1970-01-01 00:00:00.000000000 +0000
7838+++ linux-2.6.16.33/arch/i386/kernel/machine_kexec.c~ 2007-05-23 21:00:01.000000000 +0000
7839@@ -0,0 +1,148 @@
7840+/*
7841+ * machine_kexec.c - handle transition of Linux booting another kernel
7842+ * Copyright (C) 2002-2005 Eric Biederman <ebiederm@xmission.com>
7843+ *
7844+ * This source code is licensed under the GNU General Public License,
7845+ * Version 2. See the file COPYING for more details.
7846+ */
7847+
7848+#include <linux/mm.h>
7849+#include <linux/kexec.h>
7850+#include <linux/delay.h>
7851+#include <asm/pgtable.h>
7852+#include <asm/pgalloc.h>
7853+#include <asm/tlbflush.h>
7854+#include <asm/mmu_context.h>
7855+#include <asm/io.h>
7856+#include <asm/apic.h>
7857+#include <asm/cpufeature.h>
7858+#include <asm/desc.h>
7859+#include <asm/system.h>
7860+
7861+#define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE)))
7862+static u32 kexec_pgd[1024] PAGE_ALIGNED;
7863+#ifdef CONFIG_X86_PAE
7864+static u32 kexec_pmd0[1024] PAGE_ALIGNED;
7865+static u32 kexec_pmd1[1024] PAGE_ALIGNED;
7866+#endif
7867+static u32 kexec_pte0[1024] PAGE_ALIGNED;
7868+static u32 kexec_pte1[1024] PAGE_ALIGNED;
7869+
7870+static void set_idt(void *newidt, __u16 limit)
7871+{
7872+ struct Xgt_desc_struct curidt;
7873+
7874+ /* ia32 supports unaligned loads & stores */
7875+ curidt.size = limit;
7876+ curidt.address = (unsigned long)newidt;
7877+
7878+ load_idt(&curidt);
7879+};
7880+
7881+
7882+static void set_gdt(void *newgdt, __u16 limit)
7883+{
7884+ struct Xgt_desc_struct curgdt;
7885+
7886+ /* ia32 supports unaligned loads & stores */
7887+ curgdt.size = limit;
7888+ curgdt.address = (unsigned long)newgdt;
7889+
7890+ load_gdt(&curgdt);
7891+};
7892+
7893+static void load_segments(void)
7894+{
7895+#define __STR(X) #X
7896+#define STR(X) __STR(X)
7897+
7898+ __asm__ __volatile__ (
7899+ "\tljmp $"STR(__KERNEL_CS)",$1f\n"
7900+ "\t1:\n"
7901+ "\tmovl $"STR(__KERNEL_DS)",%%eax\n"
7902+ "\tmovl %%eax,%%ds\n"
7903+ "\tmovl %%eax,%%es\n"
7904+ "\tmovl %%eax,%%fs\n"
7905+ "\tmovl %%eax,%%gs\n"
7906+ "\tmovl %%eax,%%ss\n"
7907+ ::: "eax", "memory");
7908+#undef STR
7909+#undef __STR
7910+}
7911+
7912+/*
7913+ * An architecture hook called to validate the
7914+ * proposed image and prepare the control pages
7915+ * as needed. The pages for KEXEC_CONTROL_CODE_SIZE
7916+ * have been allocated, but the segments have not yet
7917+ * been copied into the kernel.
7918+ *
7919+ * Do whatever setup is needed on the image and the
7920+ * reboot code buffer to allow us to avoid allocations
7921+ * later.
7922+ *
7923+ * Currently nothing.
7924+ */
7925+int machine_kexec_prepare(struct kimage *image)
7926+{
7927+ return 0;
7928+}
7929+
7930+/*
7931+ * Undo anything leftover by machine_kexec_prepare
7932+ * when an image is freed.
7933+ */
7934+void machine_kexec_cleanup(struct kimage *image)
7935+{
7936+}
7937+
7938+/*
7939+ * Do not allocate memory (or fail in any way) in machine_kexec().
7940+ * We are past the point of no return, committed to rebooting now.
7941+ */
7942+NORET_TYPE void machine_kexec(struct kimage *image)
7943+{
7944+ unsigned long page_list[PAGES_NR];
7945+ void *control_page;
7946+
7947+ /* Interrupts aren't acceptable while we reboot */
7948+ local_irq_disable();
7949+
7950+ control_page = page_address(image->control_code_page);
7951+ memcpy(control_page, relocate_kernel, PAGE_SIZE);
7952+
7953+ page_list[PA_CONTROL_PAGE] = __pa(control_page);
7954+ page_list[VA_CONTROL_PAGE] = (unsigned long)relocate_kernel;
7955+ page_list[PA_PGD] = __pa(kexec_pgd);
7956+ page_list[VA_PGD] = (unsigned long)kexec_pgd;
7957+#ifdef CONFIG_X86_PAE
7958+ page_list[PA_PMD_0] = __pa(kexec_pmd0);
7959+ page_list[VA_PMD_0] = (unsigned long)kexec_pmd0;
7960+ page_list[PA_PMD_1] = __pa(kexec_pmd1);
7961+ page_list[VA_PMD_1] = (unsigned long)kexec_pmd1;
7962+#endif
7963+ page_list[PA_PTE_0] = __pa(kexec_pte0);
7964+ page_list[VA_PTE_0] = (unsigned long)kexec_pte0;
7965+ page_list[PA_PTE_1] = __pa(kexec_pte1);
7966+ page_list[VA_PTE_1] = (unsigned long)kexec_pte1;
7967+
7968+ /* The segment registers are funny things, they have both a
7969+ * visible and an invisible part. Whenever the visible part is
7970+ * set to a specific selector, the invisible part is loaded
7971+ * from a table in memory. At no other time is the
7972+ * descriptor table in memory accessed.
7973+ *
7974+ * I take advantage of this here by force loading the
7975+ * segments, before I zap the gdt with an invalid value.
7976+ */
7977+ load_segments();
7978+ /* The gdt & idt are now invalid.
7979+ * If you want to load them you must set up your own idt & gdt.
7980+ */
7981+ set_gdt(phys_to_virt(0),0);
7982+ set_idt(phys_to_virt(0),0);
7983+
7984+ /* now call it */
7985+ relocate_kernel((unsigned long)image->head, (unsigned long)page_list,
7986+ image->start, cpu_has_pae);
7987+}
7988diff -Nur linux-2.6.16.33-noxen/arch/i386/kernel/microcode-xen.c linux-2.6.16.33/arch/i386/kernel/microcode-xen.c
7989--- linux-2.6.16.33-noxen/arch/i386/kernel/microcode-xen.c 1970-01-01 00:00:00.000000000 +0000
7990+++ linux-2.6.16.33/arch/i386/kernel/microcode-xen.c 2007-01-08 15:00:45.000000000 +0000
7991@@ -0,0 +1,159 @@
7992+/*
7993+ * Intel CPU Microcode Update Driver for Linux
7994+ *
7995+ * Copyright (C) 2000-2004 Tigran Aivazian
7996+ *
7997+ * This driver allows upgrading microcode on Intel processors
7998+ * belonging to IA-32 family - PentiumPro, Pentium II,
7999+ * Pentium III, Xeon, Pentium 4, etc.
8000+ *
8001+ * Reference: Section 8.10 of Volume III, Intel Pentium 4 Manual,
8002+ * Order Number 245472 or free download from:
8003+ *
8004+ * http://developer.intel.com/design/pentium4/manuals/245472.htm
8005+ *
8006+ * For more information, go to http://www.urbanmyth.org/microcode
8007+ *
8008+ * This program is free software; you can redistribute it and/or
8009+ * modify it under the terms of the GNU General Public License
8010+ * as published by the Free Software Foundation; either version
8011+ * 2 of the License, or (at your option) any later version.
8012+ */
8013+
8014+//#define DEBUG /* pr_debug */
8015+#include <linux/capability.h>
8016+#include <linux/kernel.h>
8017+#include <linux/init.h>
8018+#include <linux/sched.h>
8019+#include <linux/cpumask.h>
8020+#include <linux/module.h>
8021+#include <linux/slab.h>
8022+#include <linux/vmalloc.h>
8023+#include <linux/miscdevice.h>
8024+#include <linux/spinlock.h>
8025+#include <linux/mm.h>
8026+#include <linux/syscalls.h>
8027+
8028+#include <asm/msr.h>
8029+#include <asm/uaccess.h>
8030+#include <asm/processor.h>
8031+
8032+MODULE_DESCRIPTION("Intel CPU (IA-32) Microcode Update Driver");
8033+MODULE_AUTHOR("Tigran Aivazian <tigran@veritas.com>");
8034+MODULE_LICENSE("GPL");
8035+
8036+#define MICROCODE_VERSION "1.14-xen"
8037+
8038+#define DEFAULT_UCODE_DATASIZE (2000) /* 2000 bytes */
8039+#define MC_HEADER_SIZE (sizeof (microcode_header_t)) /* 48 bytes */
8040+#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE) /* 2048 bytes */
8041+
8042+/* no concurrent ->write()s are allowed on /dev/cpu/microcode */
8043+static DECLARE_MUTEX(microcode_sem);
8044+
8045+static int microcode_open (struct inode *unused1, struct file *unused2)
8046+{
8047+ return capable(CAP_SYS_RAWIO) ? 0 : -EPERM;
8048+}
8049+
8050+
8051+static int do_microcode_update (const void __user *ubuf, size_t len)
8052+{
8053+ int err;
8054+ void *kbuf;
8055+
8056+ kbuf = vmalloc(len);
8057+ if (!kbuf)
8058+ return -ENOMEM;
8059+
8060+ if (copy_from_user(kbuf, ubuf, len) == 0) {
8061+ dom0_op_t op;
8062+
8063+ op.cmd = DOM0_MICROCODE;
8064+ set_xen_guest_handle(op.u.microcode.data, kbuf);
8065+ op.u.microcode.length = len;
8066+ err = HYPERVISOR_dom0_op(&op);
8067+ } else
8068+ err = -EFAULT;
8069+
8070+ vfree(kbuf);
8071+
8072+ return err;
8073+}
8074+
8075+static ssize_t microcode_write (struct file *file, const char __user *buf, size_t len, loff_t *ppos)
8076+{
8077+ ssize_t ret;
8078+
8079+ if (len < DEFAULT_UCODE_TOTALSIZE) {
8080+ printk(KERN_ERR "microcode: not enough data\n");
8081+ return -EINVAL;
8082+ }
8083+
8084+ down(&microcode_sem);
8085+
8086+ ret = do_microcode_update(buf, len);
8087+ if (!ret)
8088+ ret = (ssize_t)len;
8089+
8090+ up(&microcode_sem);
8091+
8092+ return ret;
8093+}
8094+
8095+static int microcode_ioctl (struct inode *inode, struct file *file,
8096+ unsigned int cmd, unsigned long arg)
8097+{
8098+ switch (cmd) {
8099+ /*
8100+ * XXX: will be removed after microcode_ctl
8101+ * is updated to ignore failure of this ioctl()
8102+ */
8103+ case MICROCODE_IOCFREE:
8104+ return 0;
8105+ default:
8106+ return -EINVAL;
8107+ }
8108+ return -EINVAL;
8109+}
8110+
8111+static struct file_operations microcode_fops = {
8112+ .owner = THIS_MODULE,
8113+ .write = microcode_write,
8114+ .ioctl = microcode_ioctl,
8115+ .open = microcode_open,
8116+};
8117+
8118+static struct miscdevice microcode_dev = {
8119+ .minor = MICROCODE_MINOR,
8120+ .name = "microcode",
8121+ .devfs_name = "cpu/microcode",
8122+ .fops = &microcode_fops,
8123+};
8124+
8125+static int __init microcode_init (void)
8126+{
8127+ int error;
8128+
8129+ error = misc_register(&microcode_dev);
8130+ if (error) {
8131+ printk(KERN_ERR
8132+ "microcode: can't misc_register on minor=%d\n",
8133+ MICROCODE_MINOR);
8134+ return error;
8135+ }
8136+
8137+ printk(KERN_INFO
8138+ "IA-32 Microcode Update Driver: v" MICROCODE_VERSION " <tigran@veritas.com>\n");
8139+ return 0;
8140+}
8141+
8142+static void __exit microcode_exit (void)
8143+{
8144+ misc_deregister(&microcode_dev);
8145+ printk(KERN_INFO "IA-32 Microcode Update Driver v" MICROCODE_VERSION " unregistered\n");
8146+}
8147+
8148+module_init(microcode_init)
8149+module_exit(microcode_exit)
8150+MODULE_ALIAS_MISCDEV(MICROCODE_MINOR);
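
The microcode driver above exposes a single write() interface on /dev/cpu/microcode: the caller needs CAP_SYS_RAWIO, the whole update image must be written in one call, and anything shorter than DEFAULT_UCODE_TOTALSIZE (2048 bytes) is rejected with -EINVAL before the buffer is handed to the hypervisor via DOM0_MICROCODE. The sketch below is editorial illustration only, not part of the patch; the image path is hypothetical.

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <fcntl.h>

int main(void)
{
	const char *path = "/lib/firmware/microcode.dat";   /* hypothetical image */
	FILE *img = fopen(path, "rb");
	char *buf;
	long len;
	int fd;

	if (!img)
		return 1;
	fseek(img, 0, SEEK_END);
	len = ftell(img);
	rewind(img);
	buf = malloc(len);
	if (!buf || fread(buf, 1, len, img) != (size_t)len)
		return 1;
	fclose(img);

	/* One write of the full image; the driver vmallocs a copy and
	 * passes it to HYPERVISOR_dom0_op(DOM0_MICROCODE). */
	fd = open("/dev/cpu/microcode", O_WRONLY);
	if (fd < 0 || write(fd, buf, len) != len)
		perror("microcode update");
	if (fd >= 0)
		close(fd);
	free(buf);
	return 0;
}
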
8151diff -Nur linux-2.6.16.33-noxen/arch/i386/kernel/mpparse-xen.c linux-2.6.16.33/arch/i386/kernel/mpparse-xen.c
8152--- linux-2.6.16.33-noxen/arch/i386/kernel/mpparse-xen.c 1970-01-01 00:00:00.000000000 +0000
8153+++ linux-2.6.16.33/arch/i386/kernel/mpparse-xen.c 2007-01-08 15:00:45.000000000 +0000
8154@@ -0,0 +1,1188 @@
8155+/*
8156+ * Intel Multiprocessor Specification 1.1 and 1.4
8157+ * compliant MP-table parsing routines.
8158+ *
8159+ * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
8160+ * (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com>
8161+ *
8162+ * Fixes
8163+ * Erich Boleyn : MP v1.4 and additional changes.
8164+ * Alan Cox : Added EBDA scanning
8165+ * Ingo Molnar : various cleanups and rewrites
8166+ * Maciej W. Rozycki: Bits for default MP configurations
8167+ * Paul Diefenbaugh: Added full ACPI support
8168+ */
8169+
8170+#include <linux/mm.h>
8171+#include <linux/init.h>
8172+#include <linux/acpi.h>
8173+#include <linux/delay.h>
8174+#include <linux/config.h>
8175+#include <linux/bootmem.h>
8176+#include <linux/smp_lock.h>
8177+#include <linux/kernel_stat.h>
8178+#include <linux/mc146818rtc.h>
8179+#include <linux/bitops.h>
8180+
8181+#include <asm/smp.h>
8182+#include <asm/acpi.h>
8183+#include <asm/mtrr.h>
8184+#include <asm/mpspec.h>
8185+#include <asm/io_apic.h>
8186+
8187+#include <mach_apic.h>
8188+#include <mach_mpparse.h>
8189+#include <bios_ebda.h>
8190+
8191+/* Have we found an MP table */
8192+int smp_found_config;
8193+unsigned int __initdata maxcpus = NR_CPUS;
8194+
8195+#ifdef CONFIG_HOTPLUG_CPU
8196+#define CPU_HOTPLUG_ENABLED (1)
8197+#else
8198+#define CPU_HOTPLUG_ENABLED (0)
8199+#endif
8200+
8201+/*
8202+ * Various Linux-internal data structures created from the
8203+ * MP-table.
8204+ */
8205+int apic_version [MAX_APICS];
8206+int mp_bus_id_to_type [MAX_MP_BUSSES];
8207+int mp_bus_id_to_node [MAX_MP_BUSSES];
8208+int mp_bus_id_to_local [MAX_MP_BUSSES];
8209+int quad_local_to_mp_bus_id [NR_CPUS/4][4];
8210+int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 };
8211+static int mp_current_pci_id;
8212+
8213+/* I/O APIC entries */
8214+struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS];
8215+
8216+/* # of MP IRQ source entries */
8217+struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
8218+
8219+/* MP IRQ source entries */
8220+int mp_irq_entries;
8221+
8222+int nr_ioapics;
8223+
8224+int pic_mode;
8225+unsigned long mp_lapic_addr;
8226+
8227+unsigned int def_to_bigsmp = 0;
8228+
8229+/* Processor that is doing the boot up */
8230+unsigned int boot_cpu_physical_apicid = -1U;
8231+/* Internal processor count */
8232+static unsigned int __devinitdata num_processors;
8233+
8234+/* Bitmask of physically existing CPUs */
8235+physid_mask_t phys_cpu_present_map;
8236+
8237+u8 bios_cpu_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
8238+
8239+/*
8240+ * Intel MP BIOS table parsing routines:
8241+ */
8242+
8243+
8244+/*
8245+ * Checksum an MP configuration block.
8246+ */
8247+
8248+static int __init mpf_checksum(unsigned char *mp, int len)
8249+{
8250+ int sum = 0;
8251+
8252+ while (len--)
8253+ sum += *mp++;
8254+
8255+ return sum & 0xFF;
8256+}
8257+
8258+/*
8259+ * Have to match translation table entries to main table entries by counter
8260+ * hence the mpc_record variable .... can't see a less disgusting way of
8261+ * doing this ....
8262+ */
8263+
8264+static int mpc_record;
8265+static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY] __initdata;
8266+
8267+#ifdef CONFIG_X86_NUMAQ
8268+static int MP_valid_apicid(int apicid, int version)
8269+{
8270+ return hweight_long(apicid & 0xf) == 1 && (apicid >> 4) != 0xf;
8271+}
8272+#elif !defined(CONFIG_XEN)
8273+static int MP_valid_apicid(int apicid, int version)
8274+{
8275+ if (version >= 0x14)
8276+ return apicid < 0xff;
8277+ else
8278+ return apicid < 0xf;
8279+}
8280+#endif
8281+
8282+#ifndef CONFIG_XEN
8283+static void __devinit MP_processor_info (struct mpc_config_processor *m)
8284+{
8285+ int ver, apicid;
8286+ physid_mask_t phys_cpu;
8287+
8288+ if (!(m->mpc_cpuflag & CPU_ENABLED))
8289+ return;
8290+
8291+ apicid = mpc_apic_id(m, translation_table[mpc_record]);
8292+
8293+ if (m->mpc_featureflag&(1<<0))
8294+ Dprintk(" Floating point unit present.\n");
8295+ if (m->mpc_featureflag&(1<<7))
8296+ Dprintk(" Machine Exception supported.\n");
8297+ if (m->mpc_featureflag&(1<<8))
8298+ Dprintk(" 64 bit compare & exchange supported.\n");
8299+ if (m->mpc_featureflag&(1<<9))
8300+ Dprintk(" Internal APIC present.\n");
8301+ if (m->mpc_featureflag&(1<<11))
8302+ Dprintk(" SEP present.\n");
8303+ if (m->mpc_featureflag&(1<<12))
8304+ Dprintk(" MTRR present.\n");
8305+ if (m->mpc_featureflag&(1<<13))
8306+ Dprintk(" PGE present.\n");
8307+ if (m->mpc_featureflag&(1<<14))
8308+ Dprintk(" MCA present.\n");
8309+ if (m->mpc_featureflag&(1<<15))
8310+ Dprintk(" CMOV present.\n");
8311+ if (m->mpc_featureflag&(1<<16))
8312+ Dprintk(" PAT present.\n");
8313+ if (m->mpc_featureflag&(1<<17))
8314+ Dprintk(" PSE present.\n");
8315+ if (m->mpc_featureflag&(1<<18))
8316+ Dprintk(" PSN present.\n");
8317+ if (m->mpc_featureflag&(1<<19))
8318+ Dprintk(" Cache Line Flush Instruction present.\n");
8319+ /* 20 Reserved */
8320+ if (m->mpc_featureflag&(1<<21))
8321+ Dprintk(" Debug Trace and EMON Store present.\n");
8322+ if (m->mpc_featureflag&(1<<22))
8323+ Dprintk(" ACPI Thermal Throttle Registers present.\n");
8324+ if (m->mpc_featureflag&(1<<23))
8325+ Dprintk(" MMX present.\n");
8326+ if (m->mpc_featureflag&(1<<24))
8327+ Dprintk(" FXSR present.\n");
8328+ if (m->mpc_featureflag&(1<<25))
8329+ Dprintk(" XMM present.\n");
8330+ if (m->mpc_featureflag&(1<<26))
8331+ Dprintk(" Willamette New Instructions present.\n");
8332+ if (m->mpc_featureflag&(1<<27))
8333+ Dprintk(" Self Snoop present.\n");
8334+ if (m->mpc_featureflag&(1<<28))
8335+ Dprintk(" HT present.\n");
8336+ if (m->mpc_featureflag&(1<<29))
8337+ Dprintk(" Thermal Monitor present.\n");
8338+ /* 30, 31 Reserved */
8339+
8340+
8341+ if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
8342+ Dprintk(" Bootup CPU\n");
8343+ boot_cpu_physical_apicid = m->mpc_apicid;
8344+ }
8345+
8346+ ver = m->mpc_apicver;
8347+
8348+ if (!MP_valid_apicid(apicid, ver)) {
8349+ printk(KERN_WARNING "Processor #%d INVALID. (Max ID: %d).\n",
8350+ m->mpc_apicid, MAX_APICS);
8351+ return;
8352+ }
8353+
8354+ /*
8355+ * Validate version
8356+ */
8357+ if (ver == 0x0) {
8358+ printk(KERN_WARNING "BIOS bug, APIC version is 0 for CPU#%d! "
8359+ "fixing up to 0x10. (tell your hw vendor)\n",
8360+ m->mpc_apicid);
8361+ ver = 0x10;
8362+ }
8363+ apic_version[m->mpc_apicid] = ver;
8364+
8365+ phys_cpu = apicid_to_cpu_present(apicid);
8366+ physids_or(phys_cpu_present_map, phys_cpu_present_map, phys_cpu);
8367+
8368+ if (num_processors >= NR_CPUS) {
8369+ printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached."
8370+ " Processor ignored.\n", NR_CPUS);
8371+ return;
8372+ }
8373+
8374+ if (num_processors >= maxcpus) {
8375+ printk(KERN_WARNING "WARNING: maxcpus limit of %i reached."
8376+ " Processor ignored.\n", maxcpus);
8377+ return;
8378+ }
8379+
8380+ cpu_set(num_processors, cpu_possible_map);
8381+ num_processors++;
8382+
8383+ if (CPU_HOTPLUG_ENABLED || (num_processors > 8)) {
8384+ switch (boot_cpu_data.x86_vendor) {
8385+ case X86_VENDOR_INTEL:
8386+ if (!APIC_XAPIC(ver)) {
8387+ def_to_bigsmp = 0;
8388+ break;
8389+ }
8390+ /* If P4 and above fall through */
8391+ case X86_VENDOR_AMD:
8392+ def_to_bigsmp = 1;
8393+ }
8394+ }
8395+ bios_cpu_apicid[num_processors - 1] = m->mpc_apicid;
8396+}
8397+#else
8398+void __init MP_processor_info (struct mpc_config_processor *m)
8399+{
8400+ num_processors++;
8401+}
8402+#endif /* CONFIG_XEN */
8403+
8404+static void __init MP_bus_info (struct mpc_config_bus *m)
8405+{
8406+ char str[7];
8407+
8408+ memcpy(str, m->mpc_bustype, 6);
8409+ str[6] = 0;
8410+
8411+ mpc_oem_bus_info(m, str, translation_table[mpc_record]);
8412+
8413+ if (strncmp(str, BUSTYPE_ISA, sizeof(BUSTYPE_ISA)-1) == 0) {
8414+ mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA;
8415+ } else if (strncmp(str, BUSTYPE_EISA, sizeof(BUSTYPE_EISA)-1) == 0) {
8416+ mp_bus_id_to_type[m->mpc_busid] = MP_BUS_EISA;
8417+ } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI)-1) == 0) {
8418+ mpc_oem_pci_bus(m, translation_table[mpc_record]);
8419+ mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI;
8420+ mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id;
8421+ mp_current_pci_id++;
8422+ } else if (strncmp(str, BUSTYPE_MCA, sizeof(BUSTYPE_MCA)-1) == 0) {
8423+ mp_bus_id_to_type[m->mpc_busid] = MP_BUS_MCA;
8424+ } else if (strncmp(str, BUSTYPE_NEC98, sizeof(BUSTYPE_NEC98)-1) == 0) {
8425+ mp_bus_id_to_type[m->mpc_busid] = MP_BUS_NEC98;
8426+ } else {
8427+ printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str);
8428+ }
8429+}
8430+
8431+static void __init MP_ioapic_info (struct mpc_config_ioapic *m)
8432+{
8433+ if (!(m->mpc_flags & MPC_APIC_USABLE))
8434+ return;
8435+
8436+ printk(KERN_INFO "I/O APIC #%d Version %d at 0x%lX.\n",
8437+ m->mpc_apicid, m->mpc_apicver, m->mpc_apicaddr);
8438+ if (nr_ioapics >= MAX_IO_APICS) {
8439+ printk(KERN_CRIT "Max # of I/O APICs (%d) exceeded (found %d).\n",
8440+ MAX_IO_APICS, nr_ioapics);
8441+ panic("Recompile kernel with bigger MAX_IO_APICS!.\n");
8442+ }
8443+ if (!m->mpc_apicaddr) {
8444+ printk(KERN_ERR "WARNING: bogus zero I/O APIC address"
8445+ " found in MP table, skipping!\n");
8446+ return;
8447+ }
8448+ mp_ioapics[nr_ioapics] = *m;
8449+ nr_ioapics++;
8450+}
8451+
8452+static void __init MP_intsrc_info (struct mpc_config_intsrc *m)
8453+{
8454+ mp_irqs [mp_irq_entries] = *m;
8455+ Dprintk("Int: type %d, pol %d, trig %d, bus %d,"
8456+ " IRQ %02x, APIC ID %x, APIC INT %02x\n",
8457+ m->mpc_irqtype, m->mpc_irqflag & 3,
8458+ (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus,
8459+ m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq);
8460+ if (++mp_irq_entries == MAX_IRQ_SOURCES)
8461+ panic("Max # of irq sources exceeded!!\n");
8462+}
8463+
8464+static void __init MP_lintsrc_info (struct mpc_config_lintsrc *m)
8465+{
8466+ Dprintk("Lint: type %d, pol %d, trig %d, bus %d,"
8467+ " IRQ %02x, APIC ID %x, APIC LINT %02x\n",
8468+ m->mpc_irqtype, m->mpc_irqflag & 3,
8469+ (m->mpc_irqflag >> 2) &3, m->mpc_srcbusid,
8470+ m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint);
8471+ /*
8472+ * Well it seems all SMP boards in existence
8473+ * use ExtINT/LVT1 == LINT0 and
8474+ * NMI/LVT2 == LINT1 - the following check
8475+ * will show us if this assumption is false.
8476+ * Until then we do not have to add baggage.
8477+ */
8478+ if ((m->mpc_irqtype == mp_ExtINT) &&
8479+ (m->mpc_destapiclint != 0))
8480+ BUG();
8481+ if ((m->mpc_irqtype == mp_NMI) &&
8482+ (m->mpc_destapiclint != 1))
8483+ BUG();
8484+}
8485+
8486+#ifdef CONFIG_X86_NUMAQ
8487+static void __init MP_translation_info (struct mpc_config_translation *m)
8488+{
8489+ printk(KERN_INFO "Translation: record %d, type %d, quad %d, global %d, local %d\n", mpc_record, m->trans_type, m->trans_quad, m->trans_global, m->trans_local);
8490+
8491+ if (mpc_record >= MAX_MPC_ENTRY)
8492+ printk(KERN_ERR "MAX_MPC_ENTRY exceeded!\n");
8493+ else
8494+ translation_table[mpc_record] = m; /* stash this for later */
8495+ if (m->trans_quad < MAX_NUMNODES && !node_online(m->trans_quad))
8496+ node_set_online(m->trans_quad);
8497+}
8498+
8499+/*
8500+ * Read/parse the MPC oem tables
8501+ */
8502+
8503+static void __init smp_read_mpc_oem(struct mp_config_oemtable *oemtable, \
8504+ unsigned short oemsize)
8505+{
8506+ int count = sizeof (*oemtable); /* the header size */
8507+ unsigned char *oemptr = ((unsigned char *)oemtable)+count;
8508+
8509+ mpc_record = 0;
8510+ printk(KERN_INFO "Found an OEM MPC table at %8p - parsing it ... \n", oemtable);
8511+ if (memcmp(oemtable->oem_signature,MPC_OEM_SIGNATURE,4))
8512+ {
8513+ printk(KERN_WARNING "SMP mpc oemtable: bad signature [%c%c%c%c]!\n",
8514+ oemtable->oem_signature[0],
8515+ oemtable->oem_signature[1],
8516+ oemtable->oem_signature[2],
8517+ oemtable->oem_signature[3]);
8518+ return;
8519+ }
8520+ if (mpf_checksum((unsigned char *)oemtable,oemtable->oem_length))
8521+ {
8522+ printk(KERN_WARNING "SMP oem mptable: checksum error!\n");
8523+ return;
8524+ }
8525+ while (count < oemtable->oem_length) {
8526+ switch (*oemptr) {
8527+ case MP_TRANSLATION:
8528+ {
8529+ struct mpc_config_translation *m=
8530+ (struct mpc_config_translation *)oemptr;
8531+ MP_translation_info(m);
8532+ oemptr += sizeof(*m);
8533+ count += sizeof(*m);
8534+ ++mpc_record;
8535+ break;
8536+ }
8537+ default:
8538+ {
8539+ printk(KERN_WARNING "Unrecognised OEM table entry type! - %d\n", (int) *oemptr);
8540+ return;
8541+ }
8542+ }
8543+ }
8544+}
8545+
8546+static inline void mps_oem_check(struct mp_config_table *mpc, char *oem,
8547+ char *productid)
8548+{
8549+ if (strncmp(oem, "IBM NUMA", 8))
8550+ printk("Warning! May not be a NUMA-Q system!\n");
8551+ if (mpc->mpc_oemptr)
8552+ smp_read_mpc_oem((struct mp_config_oemtable *) mpc->mpc_oemptr,
8553+ mpc->mpc_oemsize);
8554+}
8555+#endif /* CONFIG_X86_NUMAQ */
8556+
8557+/*
8558+ * Read/parse the MPC
8559+ */
8560+
8561+static int __init smp_read_mpc(struct mp_config_table *mpc)
8562+{
8563+ char str[16];
8564+ char oem[10];
8565+ int count=sizeof(*mpc);
8566+ unsigned char *mpt=((unsigned char *)mpc)+count;
8567+
8568+ if (memcmp(mpc->mpc_signature,MPC_SIGNATURE,4)) {
8569+ printk(KERN_ERR "SMP mptable: bad signature [0x%x]!\n",
8570+ *(u32 *)mpc->mpc_signature);
8571+ return 0;
8572+ }
8573+ if (mpf_checksum((unsigned char *)mpc,mpc->mpc_length)) {
8574+ printk(KERN_ERR "SMP mptable: checksum error!\n");
8575+ return 0;
8576+ }
8577+ if (mpc->mpc_spec!=0x01 && mpc->mpc_spec!=0x04) {
8578+ printk(KERN_ERR "SMP mptable: bad table version (%d)!!\n",
8579+ mpc->mpc_spec);
8580+ return 0;
8581+ }
8582+ if (!mpc->mpc_lapic) {
8583+ printk(KERN_ERR "SMP mptable: null local APIC address!\n");
8584+ return 0;
8585+ }
8586+ memcpy(oem,mpc->mpc_oem,8);
8587+ oem[8]=0;
8588+ printk(KERN_INFO "OEM ID: %s ",oem);
8589+
8590+ memcpy(str,mpc->mpc_productid,12);
8591+ str[12]=0;
8592+ printk("Product ID: %s ",str);
8593+
8594+ mps_oem_check(mpc, oem, str);
8595+
8596+ printk("APIC at: 0x%lX\n",mpc->mpc_lapic);
8597+
8598+ /*
8599+ * Save the local APIC address (it might be non-default) -- but only
8600+ * if we're not using ACPI.
8601+ */
8602+ if (!acpi_lapic)
8603+ mp_lapic_addr = mpc->mpc_lapic;
8604+
8605+ /*
8606+ * Now process the configuration blocks.
8607+ */
8608+ mpc_record = 0;
8609+ while (count < mpc->mpc_length) {
8610+ switch(*mpt) {
8611+ case MP_PROCESSOR:
8612+ {
8613+ struct mpc_config_processor *m=
8614+ (struct mpc_config_processor *)mpt;
8615+ /* ACPI may have already provided this data */
8616+ if (!acpi_lapic)
8617+ MP_processor_info(m);
8618+ mpt += sizeof(*m);
8619+ count += sizeof(*m);
8620+ break;
8621+ }
8622+ case MP_BUS:
8623+ {
8624+ struct mpc_config_bus *m=
8625+ (struct mpc_config_bus *)mpt;
8626+ MP_bus_info(m);
8627+ mpt += sizeof(*m);
8628+ count += sizeof(*m);
8629+ break;
8630+ }
8631+ case MP_IOAPIC:
8632+ {
8633+ struct mpc_config_ioapic *m=
8634+ (struct mpc_config_ioapic *)mpt;
8635+ MP_ioapic_info(m);
8636+ mpt+=sizeof(*m);
8637+ count+=sizeof(*m);
8638+ break;
8639+ }
8640+ case MP_INTSRC:
8641+ {
8642+ struct mpc_config_intsrc *m=
8643+ (struct mpc_config_intsrc *)mpt;
8644+
8645+ MP_intsrc_info(m);
8646+ mpt+=sizeof(*m);
8647+ count+=sizeof(*m);
8648+ break;
8649+ }
8650+ case MP_LINTSRC:
8651+ {
8652+ struct mpc_config_lintsrc *m=
8653+ (struct mpc_config_lintsrc *)mpt;
8654+ MP_lintsrc_info(m);
8655+ mpt+=sizeof(*m);
8656+ count+=sizeof(*m);
8657+ break;
8658+ }
8659+ default:
8660+ {
8661+ count = mpc->mpc_length;
8662+ break;
8663+ }
8664+ }
8665+ ++mpc_record;
8666+ }
8667+ clustered_apic_check();
8668+ if (!num_processors)
8669+ printk(KERN_ERR "SMP mptable: no processors registered!\n");
8670+ return num_processors;
8671+}
8672+
8673+static int __init ELCR_trigger(unsigned int irq)
8674+{
8675+ unsigned int port;
8676+
8677+ port = 0x4d0 + (irq >> 3);
8678+ return (inb(port) >> (irq & 7)) & 1;
8679+}
8680+
8681+static void __init construct_default_ioirq_mptable(int mpc_default_type)
8682+{
8683+ struct mpc_config_intsrc intsrc;
8684+ int i;
8685+ int ELCR_fallback = 0;
8686+
8687+ intsrc.mpc_type = MP_INTSRC;
8688+ intsrc.mpc_irqflag = 0; /* conforming */
8689+ intsrc.mpc_srcbus = 0;
8690+ intsrc.mpc_dstapic = mp_ioapics[0].mpc_apicid;
8691+
8692+ intsrc.mpc_irqtype = mp_INT;
8693+
8694+ /*
8695+ * If true, we have an ISA/PCI system with no IRQ entries
8696+ * in the MP table. To prevent the PCI interrupts from being set up
8697+ * incorrectly, we try to use the ELCR. The sanity check to see if
8698+ * there is good ELCR data is very simple - IRQ0, 1, 2 and 13 can
8699+ * never be level sensitive, so we simply see if the ELCR agrees.
8700+ * If it does, we assume it's valid.
8701+ */
8702+ if (mpc_default_type == 5) {
8703+ printk(KERN_INFO "ISA/PCI bus type with no IRQ information... falling back to ELCR\n");
8704+
8705+ if (ELCR_trigger(0) || ELCR_trigger(1) || ELCR_trigger(2) || ELCR_trigger(13))
8706+ printk(KERN_WARNING "ELCR contains invalid data... not using ELCR\n");
8707+ else {
8708+ printk(KERN_INFO "Using ELCR to identify PCI interrupts\n");
8709+ ELCR_fallback = 1;
8710+ }
8711+ }
8712+
8713+ for (i = 0; i < 16; i++) {
8714+ switch (mpc_default_type) {
8715+ case 2:
8716+ if (i == 0 || i == 13)
8717+ continue; /* IRQ0 & IRQ13 not connected */
8718+ /* fall through */
8719+ default:
8720+ if (i == 2)
8721+ continue; /* IRQ2 is never connected */
8722+ }
8723+
8724+ if (ELCR_fallback) {
8725+ /*
8726+ * If the ELCR indicates a level-sensitive interrupt, we
8727+ * copy that information over to the MP table in the
8728+ * irqflag field (level sensitive, active high polarity).
8729+ */
8730+ if (ELCR_trigger(i))
8731+ intsrc.mpc_irqflag = 13;
8732+ else
8733+ intsrc.mpc_irqflag = 0;
8734+ }
8735+
8736+ intsrc.mpc_srcbusirq = i;
8737+ intsrc.mpc_dstirq = i ? i : 2; /* IRQ0 to INTIN2 */
8738+ MP_intsrc_info(&intsrc);
8739+ }
8740+
8741+ intsrc.mpc_irqtype = mp_ExtINT;
8742+ intsrc.mpc_srcbusirq = 0;
8743+ intsrc.mpc_dstirq = 0; /* 8259A to INTIN0 */
8744+ MP_intsrc_info(&intsrc);
8745+}
8746+
8747+static inline void __init construct_default_ISA_mptable(int mpc_default_type)
8748+{
8749+ struct mpc_config_processor processor;
8750+ struct mpc_config_bus bus;
8751+ struct mpc_config_ioapic ioapic;
8752+ struct mpc_config_lintsrc lintsrc;
8753+ int linttypes[2] = { mp_ExtINT, mp_NMI };
8754+ int i;
8755+
8756+ /*
8757+ * local APIC has default address
8758+ */
8759+ mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
8760+
8761+ /*
8762+ * 2 CPUs, numbered 0 & 1.
8763+ */
8764+ processor.mpc_type = MP_PROCESSOR;
8765+ /* Either an integrated APIC or a discrete 82489DX. */
8766+ processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
8767+ processor.mpc_cpuflag = CPU_ENABLED;
8768+ processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
8769+ (boot_cpu_data.x86_model << 4) |
8770+ boot_cpu_data.x86_mask;
8771+ processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
8772+ processor.mpc_reserved[0] = 0;
8773+ processor.mpc_reserved[1] = 0;
8774+ for (i = 0; i < 2; i++) {
8775+ processor.mpc_apicid = i;
8776+ MP_processor_info(&processor);
8777+ }
8778+
8779+ bus.mpc_type = MP_BUS;
8780+ bus.mpc_busid = 0;
8781+ switch (mpc_default_type) {
8782+ default:
8783+ printk("???\n");
8784+ printk(KERN_ERR "Unknown standard configuration %d\n",
8785+ mpc_default_type);
8786+ /* fall through */
8787+ case 1:
8788+ case 5:
8789+ memcpy(bus.mpc_bustype, "ISA ", 6);
8790+ break;
8791+ case 2:
8792+ case 6:
8793+ case 3:
8794+ memcpy(bus.mpc_bustype, "EISA ", 6);
8795+ break;
8796+ case 4:
8797+ case 7:
8798+ memcpy(bus.mpc_bustype, "MCA ", 6);
8799+ }
8800+ MP_bus_info(&bus);
8801+ if (mpc_default_type > 4) {
8802+ bus.mpc_busid = 1;
8803+ memcpy(bus.mpc_bustype, "PCI ", 6);
8804+ MP_bus_info(&bus);
8805+ }
8806+
8807+ ioapic.mpc_type = MP_IOAPIC;
8808+ ioapic.mpc_apicid = 2;
8809+ ioapic.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
8810+ ioapic.mpc_flags = MPC_APIC_USABLE;
8811+ ioapic.mpc_apicaddr = 0xFEC00000;
8812+ MP_ioapic_info(&ioapic);
8813+
8814+ /*
8815+ * We set up most of the low 16 IO-APIC pins according to MPS rules.
8816+ */
8817+ construct_default_ioirq_mptable(mpc_default_type);
8818+
8819+ lintsrc.mpc_type = MP_LINTSRC;
8820+ lintsrc.mpc_irqflag = 0; /* conforming */
8821+ lintsrc.mpc_srcbusid = 0;
8822+ lintsrc.mpc_srcbusirq = 0;
8823+ lintsrc.mpc_destapic = MP_APIC_ALL;
8824+ for (i = 0; i < 2; i++) {
8825+ lintsrc.mpc_irqtype = linttypes[i];
8826+ lintsrc.mpc_destapiclint = i;
8827+ MP_lintsrc_info(&lintsrc);
8828+ }
8829+}
8830+
8831+static struct intel_mp_floating *mpf_found;
8832+
8833+/*
8834+ * Scan the memory blocks for an SMP configuration block.
8835+ */
8836+void __init get_smp_config (void)
8837+{
8838+ struct intel_mp_floating *mpf = mpf_found;
8839+
8840+ /*
8841+ * ACPI supports both logical (e.g. Hyper-Threading) and physical
8842+ * processors, where MPS only supports physical.
8843+ */
8844+ if (acpi_lapic && acpi_ioapic) {
8845+ printk(KERN_INFO "Using ACPI (MADT) for SMP configuration information\n");
8846+ return;
8847+ }
8848+ else if (acpi_lapic)
8849+ printk(KERN_INFO "Using ACPI for processor (LAPIC) configuration information\n");
8850+
8851+ printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n", mpf->mpf_specification);
8852+ if (mpf->mpf_feature2 & (1<<7)) {
8853+ printk(KERN_INFO " IMCR and PIC compatibility mode.\n");
8854+ pic_mode = 1;
8855+ } else {
8856+ printk(KERN_INFO " Virtual Wire compatibility mode.\n");
8857+ pic_mode = 0;
8858+ }
8859+
8860+ /*
8861+ * Now see if we need to read further.
8862+ */
8863+ if (mpf->mpf_feature1 != 0) {
8864+
8865+ printk(KERN_INFO "Default MP configuration #%d\n", mpf->mpf_feature1);
8866+ construct_default_ISA_mptable(mpf->mpf_feature1);
8867+
8868+ } else if (mpf->mpf_physptr) {
8869+
8870+ /*
8871+ * Read the physical hardware table. Anything here will
8872+ * override the defaults.
8873+ */
8874+ if (!smp_read_mpc(isa_bus_to_virt(mpf->mpf_physptr))) {
8875+ smp_found_config = 0;
8876+ printk(KERN_ERR "BIOS bug, MP table errors detected!...\n");
8877+ printk(KERN_ERR "... disabling SMP support. (tell your hw vendor)\n");
8878+ return;
8879+ }
8880+ /*
8881+ * If there are no explicit MP IRQ entries, then we are
8882+ * broken. We set up most of the low 16 IO-APIC pins to
8883+ * ISA defaults and hope it will work.
8884+ */
8885+ if (!mp_irq_entries) {
8886+ struct mpc_config_bus bus;
8887+
8888+ printk(KERN_ERR "BIOS bug, no explicit IRQ entries, using default mptable. (tell your hw vendor)\n");
8889+
8890+ bus.mpc_type = MP_BUS;
8891+ bus.mpc_busid = 0;
8892+ memcpy(bus.mpc_bustype, "ISA ", 6);
8893+ MP_bus_info(&bus);
8894+
8895+ construct_default_ioirq_mptable(0);
8896+ }
8897+
8898+ } else
8899+ BUG();
8900+
8901+ printk(KERN_INFO "Processors: %d\n", num_processors);
8902+ /*
8903+ * Only use the first configuration found.
8904+ */
8905+}
8906+
8907+static int __init smp_scan_config (unsigned long base, unsigned long length)
8908+{
8909+ unsigned long *bp = isa_bus_to_virt(base);
8910+ struct intel_mp_floating *mpf;
8911+
8912+ Dprintk("Scan SMP from %p for %ld bytes.\n", bp,length);
8913+ if (sizeof(*mpf) != 16)
8914+ printk("Error: MPF size\n");
8915+
8916+ while (length > 0) {
8917+ mpf = (struct intel_mp_floating *)bp;
8918+ if ((*bp == SMP_MAGIC_IDENT) &&
8919+ (mpf->mpf_length == 1) &&
8920+ !mpf_checksum((unsigned char *)bp, 16) &&
8921+ ((mpf->mpf_specification == 1)
8922+ || (mpf->mpf_specification == 4)) ) {
8923+
8924+ smp_found_config = 1;
8925+#ifndef CONFIG_XEN
8926+ printk(KERN_INFO "found SMP MP-table at %08lx\n",
8927+ virt_to_phys(mpf));
8928+ reserve_bootmem(virt_to_phys(mpf), PAGE_SIZE);
8929+ if (mpf->mpf_physptr) {
8930+ /*
8931+ * We cannot access the MPC table to compute its
8932+ * size yet, as only a few megabytes from
8933+ * the bottom are mapped now.
8934+ * The PC-9800's MPC table is placed at the very end
8935+ * of physical memory, so simply reserving
8936+ * PAGE_SIZE from mpf->mpf_physptr yields BUG()
8937+ * in reserve_bootmem.
8938+ */
8939+ unsigned long size = PAGE_SIZE;
8940+ unsigned long end = max_low_pfn * PAGE_SIZE;
8941+ if (mpf->mpf_physptr + size > end)
8942+ size = end - mpf->mpf_physptr;
8943+ reserve_bootmem(mpf->mpf_physptr, size);
8944+ }
8945+#else
8946+ printk(KERN_INFO "found SMP MP-table at %08lx\n",
8947+ ((unsigned long)bp - (unsigned long)isa_bus_to_virt(base)) + base);
8948+#endif
8949+
8950+ mpf_found = mpf;
8951+ return 1;
8952+ }
8953+ bp += 4;
8954+ length -= 16;
8955+ }
8956+ return 0;
8957+}
8958+
8959+void __init find_smp_config (void)
8960+{
8961+#ifndef CONFIG_XEN
8962+ unsigned int address;
8963+#endif
8964+
8965+ /*
8966+ * FIXME: Linux assumes you have 640K of base ram..
8967+ * this continues the error...
8968+ *
8969+ * 1) Scan the bottom 1K for a signature
8970+ * 2) Scan the top 1K of base RAM
8971+ * 3) Scan the 64K of bios
8972+ */
8973+ if (smp_scan_config(0x0,0x400) ||
8974+ smp_scan_config(639*0x400,0x400) ||
8975+ smp_scan_config(0xF0000,0x10000))
8976+ return;
8977+ /*
8978+ * If it is an SMP machine we should know now, unless the
8979+ * configuration is in an EISA/MCA bus machine with an
8980+ * extended bios data area.
8981+ *
8982+ * there is a real-mode segmented pointer pointing to the
8983+ * 4K EBDA area at 0x40E, calculate and scan it here.
8984+ *
8985+ * NOTE! There are Linux loaders that will corrupt the EBDA
8986+ * area, and as such this kind of SMP config may be less
8987+ * trustworthy, simply because the SMP table may have been
8988+ * stomped on during early boot. These loaders are buggy and
8989+ * should be fixed.
8990+ *
8991+ * MP1.4 SPEC states to only scan first 1K of 4K EBDA.
8992+ */
8993+
8994+#ifndef CONFIG_XEN
8995+ address = get_bios_ebda();
8996+ if (address)
8997+ smp_scan_config(address, 0x400);
8998+#endif
8999+}
9000+
9001+/* --------------------------------------------------------------------------
9002+ ACPI-based MP Configuration
9003+ -------------------------------------------------------------------------- */
9004+
9005+#ifdef CONFIG_ACPI
9006+
9007+void __init mp_register_lapic_address (
9008+ u64 address)
9009+{
9010+#ifndef CONFIG_XEN
9011+ mp_lapic_addr = (unsigned long) address;
9012+
9013+ set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr);
9014+
9015+ if (boot_cpu_physical_apicid == -1U)
9016+ boot_cpu_physical_apicid = GET_APIC_ID(apic_read(APIC_ID));
9017+
9018+ Dprintk("Boot CPU = %d\n", boot_cpu_physical_apicid);
9019+#endif
9020+}
9021+
9022+
9023+void __devinit mp_register_lapic (
9024+ u8 id,
9025+ u8 enabled)
9026+{
9027+ struct mpc_config_processor processor;
9028+ int boot_cpu = 0;
9029+
9030+ if (MAX_APICS - id <= 0) {
9031+ printk(KERN_WARNING "Processor #%d invalid (max %d)\n",
9032+ id, MAX_APICS);
9033+ return;
9034+ }
9035+
9036+ if (id == boot_cpu_physical_apicid)
9037+ boot_cpu = 1;
9038+
9039+#ifndef CONFIG_XEN
9040+ processor.mpc_type = MP_PROCESSOR;
9041+ processor.mpc_apicid = id;
9042+ processor.mpc_apicver = GET_APIC_VERSION(apic_read(APIC_LVR));
9043+ processor.mpc_cpuflag = (enabled ? CPU_ENABLED : 0);
9044+ processor.mpc_cpuflag |= (boot_cpu ? CPU_BOOTPROCESSOR : 0);
9045+ processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
9046+ (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask;
9047+ processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
9048+ processor.mpc_reserved[0] = 0;
9049+ processor.mpc_reserved[1] = 0;
9050+#endif
9051+
9052+ MP_processor_info(&processor);
9053+}
9054+
9055+#ifdef CONFIG_X86_IO_APIC
9056+
9057+#define MP_ISA_BUS 0
9058+#define MP_MAX_IOAPIC_PIN 127
9059+
9060+static struct mp_ioapic_routing {
9061+ int apic_id;
9062+ int gsi_base;
9063+ int gsi_end;
9064+ u32 pin_programmed[4];
9065+} mp_ioapic_routing[MAX_IO_APICS];
9066+
9067+
9068+static int mp_find_ioapic (
9069+ int gsi)
9070+{
9071+ int i = 0;
9072+
9073+ /* Find the IOAPIC that manages this GSI. */
9074+ for (i = 0; i < nr_ioapics; i++) {
9075+ if ((gsi >= mp_ioapic_routing[i].gsi_base)
9076+ && (gsi <= mp_ioapic_routing[i].gsi_end))
9077+ return i;
9078+ }
9079+
9080+ printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi);
9081+
9082+ return -1;
9083+}
9084+
9085+
9086+void __init mp_register_ioapic (
9087+ u8 id,
9088+ u32 address,
9089+ u32 gsi_base)
9090+{
9091+ int idx = 0;
9092+ int tmpid;
9093+
9094+ if (nr_ioapics >= MAX_IO_APICS) {
9095+ printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded "
9096+ "(found %d)\n", MAX_IO_APICS, nr_ioapics);
9097+ panic("Recompile kernel with bigger MAX_IO_APICS!\n");
9098+ }
9099+ if (!address) {
9100+ printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address"
9101+ " found in MADT table, skipping!\n");
9102+ return;
9103+ }
9104+
9105+ idx = nr_ioapics++;
9106+
9107+ mp_ioapics[idx].mpc_type = MP_IOAPIC;
9108+ mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE;
9109+ mp_ioapics[idx].mpc_apicaddr = address;
9110+
9111+#ifndef CONFIG_XEN
9112+ set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
9113+#endif
9114+ if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && (boot_cpu_data.x86 < 15))
9115+ tmpid = io_apic_get_unique_id(idx, id);
9116+ else
9117+ tmpid = id;
9118+ if (tmpid == -1) {
9119+ nr_ioapics--;
9120+ return;
9121+ }
9122+ mp_ioapics[idx].mpc_apicid = tmpid;
9123+ mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx);
9124+
9125+ /*
9126+ * Build basic GSI lookup table to facilitate gsi->io_apic lookups
9127+ * and to prevent reprogramming of IOAPIC pins (PCI GSIs).
9128+ */
9129+ mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid;
9130+ mp_ioapic_routing[idx].gsi_base = gsi_base;
9131+ mp_ioapic_routing[idx].gsi_end = gsi_base +
9132+ io_apic_get_redir_entries(idx);
9133+
9134+ printk("IOAPIC[%d]: apic_id %d, version %d, address 0x%lx, "
9135+ "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid,
9136+ mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr,
9137+ mp_ioapic_routing[idx].gsi_base,
9138+ mp_ioapic_routing[idx].gsi_end);
9139+
9140+ return;
9141+}
9142+
9143+
9144+void __init mp_override_legacy_irq (
9145+ u8 bus_irq,
9146+ u8 polarity,
9147+ u8 trigger,
9148+ u32 gsi)
9149+{
9150+ struct mpc_config_intsrc intsrc;
9151+ int ioapic = -1;
9152+ int pin = -1;
9153+
9154+ /*
9155+ * Convert 'gsi' to 'ioapic.pin'.
9156+ */
9157+ ioapic = mp_find_ioapic(gsi);
9158+ if (ioapic < 0)
9159+ return;
9160+ pin = gsi - mp_ioapic_routing[ioapic].gsi_base;
9161+
9162+ /*
9163+ * TBD: This check is for faulty timer entries, where the override
9164+ * erroneously sets the trigger to level, resulting in a HUGE
9165+ * increase of timer interrupts!
9166+ */
9167+ if ((bus_irq == 0) && (trigger == 3))
9168+ trigger = 1;
9169+
9170+ intsrc.mpc_type = MP_INTSRC;
9171+ intsrc.mpc_irqtype = mp_INT;
9172+ intsrc.mpc_irqflag = (trigger << 2) | polarity;
9173+ intsrc.mpc_srcbus = MP_ISA_BUS;
9174+ intsrc.mpc_srcbusirq = bus_irq; /* IRQ */
9175+ intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; /* APIC ID */
9176+ intsrc.mpc_dstirq = pin; /* INTIN# */
9177+
9178+ Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, %d-%d\n",
9179+ intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3,
9180+ (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus,
9181+ intsrc.mpc_srcbusirq, intsrc.mpc_dstapic, intsrc.mpc_dstirq);
9182+
9183+ mp_irqs[mp_irq_entries] = intsrc;
9184+ if (++mp_irq_entries == MAX_IRQ_SOURCES)
9185+ panic("Max # of irq sources exceeded!\n");
9186+
9187+ return;
9188+}
9189+
9190+int es7000_plat;
9191+
9192+void __init mp_config_acpi_legacy_irqs (void)
9193+{
9194+ struct mpc_config_intsrc intsrc;
9195+ int i = 0;
9196+ int ioapic = -1;
9197+
9198+ /*
9199+ * Fabricate the legacy ISA bus (bus #31).
9200+ */
9201+ mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA;
9202+ Dprintk("Bus #%d is ISA\n", MP_ISA_BUS);
9203+
9204+ /*
9205+ * Older generations of ES7000 have no legacy identity mappings
9206+ */
9207+ if (es7000_plat == 1)
9208+ return;
9209+
9210+ /*
9211+ * Locate the IOAPIC that manages the ISA IRQs (0-15).
9212+ */
9213+ ioapic = mp_find_ioapic(0);
9214+ if (ioapic < 0)
9215+ return;
9216+
9217+ intsrc.mpc_type = MP_INTSRC;
9218+ intsrc.mpc_irqflag = 0; /* Conforming */
9219+ intsrc.mpc_srcbus = MP_ISA_BUS;
9220+ intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid;
9221+
9222+ /*
9223+ * Use the default configuration for the IRQs 0-15, unless
9224+ * overridden by (MADT) interrupt source override entries.
9225+ */
9226+ for (i = 0; i < 16; i++) {
9227+ int idx;
9228+
9229+ for (idx = 0; idx < mp_irq_entries; idx++) {
9230+ struct mpc_config_intsrc *irq = mp_irqs + idx;
9231+
9232+ /* Do we already have a mapping for this ISA IRQ? */
9233+ if (irq->mpc_srcbus == MP_ISA_BUS && irq->mpc_srcbusirq == i)
9234+ break;
9235+
9236+ /* Do we already have a mapping for this IOAPIC pin */
9237+ if ((irq->mpc_dstapic == intsrc.mpc_dstapic) &&
9238+ (irq->mpc_dstirq == i))
9239+ break;
9240+ }
9241+
9242+ if (idx != mp_irq_entries) {
9243+ printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i);
9244+ continue; /* IRQ already used */
9245+ }
9246+
9247+ intsrc.mpc_irqtype = mp_INT;
9248+ intsrc.mpc_srcbusirq = i; /* Identity mapped */
9249+ intsrc.mpc_dstirq = i;
9250+
9251+ Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, "
9252+ "%d-%d\n", intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3,
9253+ (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus,
9254+ intsrc.mpc_srcbusirq, intsrc.mpc_dstapic,
9255+ intsrc.mpc_dstirq);
9256+
9257+ mp_irqs[mp_irq_entries] = intsrc;
9258+ if (++mp_irq_entries == MAX_IRQ_SOURCES)
9259+ panic("Max # of irq sources exceeded!\n");
9260+ }
9261+}
9262+
9263+#define MAX_GSI_NUM 4096
9264+
9265+int mp_register_gsi (u32 gsi, int triggering, int polarity)
9266+{
9267+ int ioapic = -1;
9268+ int ioapic_pin = 0;
9269+ int idx, bit = 0;
9270+ static int pci_irq = 16;
9271+ /*
9272+ * Mapping between Global System Interrupts, which
9273+ * represent all possible interrupts, and IRQs
9274+ * assigned to actual devices.
9275+ */
9276+ static int gsi_to_irq[MAX_GSI_NUM];
9277+
9278+ /* Don't set up the ACPI SCI because it's already set up */
9279+ if (acpi_fadt.sci_int == gsi)
9280+ return gsi;
9281+
9282+ ioapic = mp_find_ioapic(gsi);
9283+ if (ioapic < 0) {
9284+ printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi);
9285+ return gsi;
9286+ }
9287+
9288+ ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_base;
9289+
9290+ if (ioapic_renumber_irq)
9291+ gsi = ioapic_renumber_irq(ioapic, gsi);
9292+
9293+ /*
9294+ * Avoid pin reprogramming. PRTs typically include entries
9295+ * with redundant pin->gsi mappings (but unique PCI devices);
9296+ * we only program the IOAPIC on the first.
9297+ */
9298+ bit = ioapic_pin % 32;
9299+ idx = (ioapic_pin < 32) ? 0 : (ioapic_pin / 32);
9300+ if (idx > 3) {
9301+ printk(KERN_ERR "Invalid reference to IOAPIC pin "
9302+ "%d-%d\n", mp_ioapic_routing[ioapic].apic_id,
9303+ ioapic_pin);
9304+ return gsi;
9305+ }
9306+ if ((1<<bit) & mp_ioapic_routing[ioapic].pin_programmed[idx]) {
9307+ Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n",
9308+ mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
9309+ return gsi_to_irq[gsi];
9310+ }
9311+
9312+ mp_ioapic_routing[ioapic].pin_programmed[idx] |= (1<<bit);
9313+
9314+ if (triggering == ACPI_LEVEL_SENSITIVE) {
9315+ /*
9316+ * For PCI devices assign IRQs in order, avoiding gaps
9317+ * due to unused I/O APIC pins.
9318+ */
9319+ int irq = gsi;
9320+ if (gsi < MAX_GSI_NUM) {
9321+ if (gsi > 15)
9322+ gsi = pci_irq++;
9323+ /*
9324+ * Don't assign IRQ used by ACPI SCI
9325+ */
9326+ if (gsi == acpi_fadt.sci_int)
9327+ gsi = pci_irq++;
9328+ gsi_to_irq[irq] = gsi;
9329+ } else {
9330+ printk(KERN_ERR "GSI %u is too high\n", gsi);
9331+ return gsi;
9332+ }
9333+ }
9334+
9335+ io_apic_set_pci_routing(ioapic, ioapic_pin, gsi,
9336+ triggering == ACPI_EDGE_SENSITIVE ? 0 : 1,
9337+ polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
9338+ return gsi;
9339+}
9340+
9341+#endif /* CONFIG_X86_IO_APIC */
9342+#endif /* CONFIG_ACPI */
9343diff -Nur linux-2.6.16.33-noxen/arch/i386/kernel/pci-dma-xen.c linux-2.6.16.33/arch/i386/kernel/pci-dma-xen.c
9344--- linux-2.6.16.33-noxen/arch/i386/kernel/pci-dma-xen.c 1970-01-01 00:00:00.000000000 +0000
9345+++ linux-2.6.16.33/arch/i386/kernel/pci-dma-xen.c 2007-01-08 15:00:45.000000000 +0000
9346@@ -0,0 +1,345 @@
9347+/*
9348+ * Dynamic DMA mapping support.
9349+ *
9350+ * On i386 there is no hardware dynamic DMA address translation,
9351+ * so consistent alloc/free are merely page allocation/freeing.
9352+ * The rest of the dynamic DMA mapping interface is implemented
9353+ * in asm/pci.h.
9354+ */
9355+
9356+#include <linux/types.h>
9357+#include <linux/mm.h>
9358+#include <linux/string.h>
9359+#include <linux/pci.h>
9360+#include <linux/module.h>
9361+#include <linux/version.h>
9362+#include <asm/io.h>
9363+#include <xen/balloon.h>
9364+#include <asm/swiotlb.h>
9365+#include <asm/tlbflush.h>
9366+#include <asm-i386/mach-xen/asm/swiotlb.h>
9367+#include <asm/bug.h>
9368+
9369+#ifdef __x86_64__
9370+int iommu_merge __read_mostly = 0;
9371+EXPORT_SYMBOL(iommu_merge);
9372+
9373+dma_addr_t bad_dma_address __read_mostly;
9374+EXPORT_SYMBOL(bad_dma_address);
9375+
9376+/* This tells the BIO block layer to assume merging. Default to off
9377+ because we cannot guarantee merging later. */
9378+int iommu_bio_merge __read_mostly = 0;
9379+EXPORT_SYMBOL(iommu_bio_merge);
9380+
9381+__init int iommu_setup(char *p)
9382+{
9383+ return 1;
9384+}
9385+#endif
9386+
9387+struct dma_coherent_mem {
9388+ void *virt_base;
9389+ u32 device_base;
9390+ int size;
9391+ int flags;
9392+ unsigned long *bitmap;
9393+};
9394+
9395+#define IOMMU_BUG_ON(test) \
9396+do { \
9397+ if (unlikely(test)) { \
9398+ printk(KERN_ALERT "Fatal DMA error! " \
9399+ "Please use 'swiotlb=force'\n"); \
9400+ BUG(); \
9401+ } \
9402+} while (0)
9403+
9404+int
9405+dma_map_sg(struct device *hwdev, struct scatterlist *sg, int nents,
9406+ enum dma_data_direction direction)
9407+{
9408+ int i, rc;
9409+
9410+ if (direction == DMA_NONE)
9411+ BUG();
9412+ WARN_ON(nents == 0 || sg[0].length == 0);
9413+
9414+ if (swiotlb) {
9415+ rc = swiotlb_map_sg(hwdev, sg, nents, direction);
9416+ } else {
9417+ for (i = 0; i < nents; i++ ) {
9418+ sg[i].dma_address =
9419+ page_to_bus(sg[i].page) + sg[i].offset;
9420+ sg[i].dma_length = sg[i].length;
9421+ BUG_ON(!sg[i].page);
9422+ IOMMU_BUG_ON(address_needs_mapping(
9423+ hwdev, sg[i].dma_address));
9424+ }
9425+ rc = nents;
9426+ }
9427+
9428+ flush_write_buffers();
9429+ return rc;
9430+}
9431+EXPORT_SYMBOL(dma_map_sg);
9432+
9433+void
9434+dma_unmap_sg(struct device *hwdev, struct scatterlist *sg, int nents,
9435+ enum dma_data_direction direction)
9436+{
9437+ BUG_ON(direction == DMA_NONE);
9438+ if (swiotlb)
9439+ swiotlb_unmap_sg(hwdev, sg, nents, direction);
9440+}
9441+EXPORT_SYMBOL(dma_unmap_sg);
9442+
9443+/*
9444+ * XXX This file is also used by xenLinux/ia64.
9445+ * "defined(__i386__) || defined (__x86_64__)" means "!defined(__ia64__)".
9446+ * This #if workaround should be removed once this file is merged back into
9447+ * i386's pci-dma or is moved to drivers/xen/core.
9448+ */
9449+#if defined(__i386__) || defined(__x86_64__)
9450+dma_addr_t
9451+dma_map_page(struct device *dev, struct page *page, unsigned long offset,
9452+ size_t size, enum dma_data_direction direction)
9453+{
9454+ dma_addr_t dma_addr;
9455+
9456+ BUG_ON(direction == DMA_NONE);
9457+
9458+ if (swiotlb) {
9459+ dma_addr = swiotlb_map_page(
9460+ dev, page, offset, size, direction);
9461+ } else {
9462+ dma_addr = page_to_bus(page) + offset;
9463+ IOMMU_BUG_ON(address_needs_mapping(dev, dma_addr));
9464+ }
9465+
9466+ return dma_addr;
9467+}
9468+EXPORT_SYMBOL(dma_map_page);
9469+
9470+void
9471+dma_unmap_page(struct device *dev, dma_addr_t dma_address, size_t size,
9472+ enum dma_data_direction direction)
9473+{
9474+ BUG_ON(direction == DMA_NONE);
9475+ if (swiotlb)
9476+ swiotlb_unmap_page(dev, dma_address, size, direction);
9477+}
9478+EXPORT_SYMBOL(dma_unmap_page);
9479+#endif /* defined(__i386__) || defined(__x86_64__) */
9480+
9481+int
9482+dma_mapping_error(dma_addr_t dma_addr)
9483+{
9484+ if (swiotlb)
9485+ return swiotlb_dma_mapping_error(dma_addr);
9486+ return 0;
9487+}
9488+EXPORT_SYMBOL(dma_mapping_error);
9489+
9490+int
9491+dma_supported(struct device *dev, u64 mask)
9492+{
9493+ if (swiotlb)
9494+ return swiotlb_dma_supported(dev, mask);
9495+ /*
9496+ * By default we'll BUG when an infeasible DMA is requested, and
9497+ * request swiotlb=force (see IOMMU_BUG_ON).
9498+ */
9499+ return 1;
9500+}
9501+EXPORT_SYMBOL(dma_supported);
9502+
9503+void *dma_alloc_coherent(struct device *dev, size_t size,
9504+ dma_addr_t *dma_handle, gfp_t gfp)
9505+{
9506+ void *ret;
9507+ struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
9508+ unsigned int order = get_order(size);
9509+ unsigned long vstart;
9510+ /* ignore region specifiers */
9511+ gfp &= ~(__GFP_DMA | __GFP_HIGHMEM);
9512+
9513+ if (mem) {
9514+ int page = bitmap_find_free_region(mem->bitmap, mem->size,
9515+ order);
9516+ if (page >= 0) {
9517+ *dma_handle = mem->device_base + (page << PAGE_SHIFT);
9518+ ret = mem->virt_base + (page << PAGE_SHIFT);
9519+ memset(ret, 0, size);
9520+ return ret;
9521+ }
9522+ if (mem->flags & DMA_MEMORY_EXCLUSIVE)
9523+ return NULL;
9524+ }
9525+
9526+ if (dev == NULL || (dev->coherent_dma_mask < 0xffffffff))
9527+ gfp |= GFP_DMA;
9528+
9529+ vstart = __get_free_pages(gfp, order);
9530+ ret = (void *)vstart;
9531+
9532+ if (ret != NULL) {
9533+ if (xen_create_contiguous_region(vstart, order,
9534+ dma_bits) != 0) {
9535+ free_pages(vstart, order);
9536+ return NULL;
9537+ }
9538+ memset(ret, 0, size);
9539+ *dma_handle = virt_to_bus(ret);
9540+ }
9541+ return ret;
9542+}
9543+EXPORT_SYMBOL(dma_alloc_coherent);
9544+
9545+void dma_free_coherent(struct device *dev, size_t size,
9546+ void *vaddr, dma_addr_t dma_handle)
9547+{
9548+ struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
9549+ int order = get_order(size);
9550+
9551+ if (mem && vaddr >= mem->virt_base && vaddr < (mem->virt_base + (mem->size << PAGE_SHIFT))) {
9552+ int page = (vaddr - mem->virt_base) >> PAGE_SHIFT;
9553+
9554+ bitmap_release_region(mem->bitmap, page, order);
9555+ } else {
9556+ xen_destroy_contiguous_region((unsigned long)vaddr, order);
9557+ free_pages((unsigned long)vaddr, order);
9558+ }
9559+}
9560+EXPORT_SYMBOL(dma_free_coherent);
9561+
9562+#ifdef ARCH_HAS_DMA_DECLARE_COHERENT_MEMORY
9563+int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr,
9564+ dma_addr_t device_addr, size_t size, int flags)
9565+{
9566+ void __iomem *mem_base;
9567+ int pages = size >> PAGE_SHIFT;
9568+ int bitmap_size = (pages + 31)/32;
9569+
9570+ if ((flags & (DMA_MEMORY_MAP | DMA_MEMORY_IO)) == 0)
9571+ goto out;
9572+ if (!size)
9573+ goto out;
9574+ if (dev->dma_mem)
9575+ goto out;
9576+
9577+ /* FIXME: this routine just ignores DMA_MEMORY_INCLUDES_CHILDREN */
9578+
9579+ mem_base = ioremap(bus_addr, size);
9580+ if (!mem_base)
9581+ goto out;
9582+
9583+ dev->dma_mem = kmalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL);
9584+ if (!dev->dma_mem)
9585+ goto out;
9586+ memset(dev->dma_mem, 0, sizeof(struct dma_coherent_mem));
9587+ dev->dma_mem->bitmap = kmalloc(bitmap_size, GFP_KERNEL);
9588+ if (!dev->dma_mem->bitmap)
9589+ goto free1_out;
9590+ memset(dev->dma_mem->bitmap, 0, bitmap_size);
9591+
9592+ dev->dma_mem->virt_base = mem_base;
9593+ dev->dma_mem->device_base = device_addr;
9594+ dev->dma_mem->size = pages;
9595+ dev->dma_mem->flags = flags;
9596+
9597+ if (flags & DMA_MEMORY_MAP)
9598+ return DMA_MEMORY_MAP;
9599+
9600+ return DMA_MEMORY_IO;
9601+
9602+ free1_out:
9603+ kfree(dev->dma_mem->bitmap);
9604+ out:
9605+ return 0;
9606+}
9607+EXPORT_SYMBOL(dma_declare_coherent_memory);
9608+
9609+void dma_release_declared_memory(struct device *dev)
9610+{
9611+ struct dma_coherent_mem *mem = dev->dma_mem;
9612+
9613+ if(!mem)
9614+ return;
9615+ dev->dma_mem = NULL;
9616+ iounmap(mem->virt_base);
9617+ kfree(mem->bitmap);
9618+ kfree(mem);
9619+}
9620+EXPORT_SYMBOL(dma_release_declared_memory);
9621+
9622+void *dma_mark_declared_memory_occupied(struct device *dev,
9623+ dma_addr_t device_addr, size_t size)
9624+{
9625+ struct dma_coherent_mem *mem = dev->dma_mem;
9626+ int pages = (size + (device_addr & ~PAGE_MASK) + PAGE_SIZE - 1) >> PAGE_SHIFT;
9627+ int pos, err;
9628+
9629+ if (!mem)
9630+ return ERR_PTR(-EINVAL);
9631+
9632+ pos = (device_addr - mem->device_base) >> PAGE_SHIFT;
9633+ err = bitmap_allocate_region(mem->bitmap, pos, get_order(pages));
9634+ if (err != 0)
9635+ return ERR_PTR(err);
9636+ return mem->virt_base + (pos << PAGE_SHIFT);
9637+}
9638+EXPORT_SYMBOL(dma_mark_declared_memory_occupied);
9639+#endif /* ARCH_HAS_DMA_DECLARE_COHERENT_MEMORY */
9640+
9641+dma_addr_t
9642+dma_map_single(struct device *dev, void *ptr, size_t size,
9643+ enum dma_data_direction direction)
9644+{
9645+ dma_addr_t dma;
9646+
9647+ if (direction == DMA_NONE)
9648+ BUG();
9649+ WARN_ON(size == 0);
9650+
9651+ if (swiotlb) {
9652+ dma = swiotlb_map_single(dev, ptr, size, direction);
9653+ } else {
9654+ dma = virt_to_bus(ptr);
9655+ IOMMU_BUG_ON(range_straddles_page_boundary(ptr, size));
9656+ IOMMU_BUG_ON(address_needs_mapping(dev, dma));
9657+ }
9658+
9659+ flush_write_buffers();
9660+ return dma;
9661+}
9662+EXPORT_SYMBOL(dma_map_single);
9663+
9664+void
9665+dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size,
9666+ enum dma_data_direction direction)
9667+{
9668+ if (direction == DMA_NONE)
9669+ BUG();
9670+ if (swiotlb)
9671+ swiotlb_unmap_single(dev, dma_addr, size, direction);
9672+}
9673+EXPORT_SYMBOL(dma_unmap_single);
9674+
9675+void
9676+dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size,
9677+ enum dma_data_direction direction)
9678+{
9679+ if (swiotlb)
9680+ swiotlb_sync_single_for_cpu(dev, dma_handle, size, direction);
9681+}
9682+EXPORT_SYMBOL(dma_sync_single_for_cpu);
9683+
9684+void
9685+dma_sync_single_for_device(struct device *dev, dma_addr_t dma_handle, size_t size,
9686+ enum dma_data_direction direction)
9687+{
9688+ if (swiotlb)
9689+ swiotlb_sync_single_for_device(dev, dma_handle, size, direction);
9690+}
9691+EXPORT_SYMBOL(dma_sync_single_for_device);
9692diff -Nur linux-2.6.16.33-noxen/arch/i386/kernel/process-xen.c linux-2.6.16.33/arch/i386/kernel/process-xen.c
9693--- linux-2.6.16.33-noxen/arch/i386/kernel/process-xen.c 1970-01-01 00:00:00.000000000 +0000
9694+++ linux-2.6.16.33/arch/i386/kernel/process-xen.c 2007-01-08 15:00:45.000000000 +0000
9695@@ -0,0 +1,821 @@
9696+/*
9697+ * linux/arch/i386/kernel/process.c
9698+ *
9699+ * Copyright (C) 1995 Linus Torvalds
9700+ *
9701+ * Pentium III FXSR, SSE support
9702+ * Gareth Hughes <gareth@valinux.com>, May 2000
9703+ */
9704+
9705+/*
9706+ * This file handles the architecture-dependent parts of process handling..
9707+ */
9708+
9709+#include <stdarg.h>
9710+
9711+#include <linux/cpu.h>
9712+#include <linux/errno.h>
9713+#include <linux/sched.h>
9714+#include <linux/fs.h>
9715+#include <linux/kernel.h>
9716+#include <linux/mm.h>
9717+#include <linux/elfcore.h>
9718+#include <linux/smp.h>
9719+#include <linux/smp_lock.h>
9720+#include <linux/stddef.h>
9721+#include <linux/slab.h>
9722+#include <linux/vmalloc.h>
9723+#include <linux/user.h>
9724+#include <linux/a.out.h>
9725+#include <linux/interrupt.h>
9726+#include <linux/config.h>
9727+#include <linux/utsname.h>
9728+#include <linux/delay.h>
9729+#include <linux/reboot.h>
9730+#include <linux/init.h>
9731+#include <linux/mc146818rtc.h>
9732+#include <linux/module.h>
9733+#include <linux/kallsyms.h>
9734+#include <linux/ptrace.h>
9735+#include <linux/random.h>
9736+#include <linux/kprobes.h>
9737+
9738+#include <asm/uaccess.h>
9739+#include <asm/pgtable.h>
9740+#include <asm/system.h>
9741+#include <asm/io.h>
9742+#include <asm/ldt.h>
9743+#include <asm/processor.h>
9744+#include <asm/i387.h>
9745+#include <asm/desc.h>
9746+#include <asm/vm86.h>
9747+#ifdef CONFIG_MATH_EMULATION
9748+#include <asm/math_emu.h>
9749+#endif
9750+
9751+#include <xen/interface/physdev.h>
9752+#include <xen/interface/vcpu.h>
9753+#include <xen/cpu_hotplug.h>
9754+
9755+#include <linux/err.h>
9756+
9757+#include <asm/tlbflush.h>
9758+#include <asm/cpu.h>
9759+
9760+#include <asm/tlbflush.h>
9761+#include <asm/cpu.h>
9762+
9763+asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
9764+
9765+static int hlt_counter;
9766+
9767+unsigned long boot_option_idle_override = 0;
9768+EXPORT_SYMBOL(boot_option_idle_override);
9769+
9770+/*
9771+ * Return saved PC of a blocked thread.
9772+ */
9773+unsigned long thread_saved_pc(struct task_struct *tsk)
9774+{
9775+ return ((unsigned long *)tsk->thread.esp)[3];
9776+}
9777+
9778+/*
9779+ * Power management idle function, if any..
9780+ */
9781+void (*pm_idle)(void);
9782+EXPORT_SYMBOL(pm_idle);
9783+static DEFINE_PER_CPU(unsigned int, cpu_idle_state);
9784+
9785+void disable_hlt(void)
9786+{
9787+ hlt_counter++;
9788+}
9789+
9790+EXPORT_SYMBOL(disable_hlt);
9791+
9792+void enable_hlt(void)
9793+{
9794+ hlt_counter--;
9795+}
9796+
9797+EXPORT_SYMBOL(enable_hlt);
9798+
9799+/* XXX XEN doesn't use default_idle(), poll_idle(). Use xen_idle() instead. */
9800+void xen_idle(void)
9801+{
9802+ local_irq_disable();
9803+
9804+ if (need_resched())
9805+ local_irq_enable();
9806+ else {
9807+ clear_thread_flag(TIF_POLLING_NRFLAG);
9808+ smp_mb__after_clear_bit();
9809+ safe_halt();
9810+ set_thread_flag(TIF_POLLING_NRFLAG);
9811+ }
9812+}
9813+#ifdef CONFIG_APM_MODULE
9814+EXPORT_SYMBOL(default_idle);
9815+#endif
9816+
9817+#ifdef CONFIG_HOTPLUG_CPU
9818+extern cpumask_t cpu_initialized;
9819+static inline void play_dead(void)
9820+{
9821+ idle_task_exit();
9822+ local_irq_disable();
9823+ cpu_clear(smp_processor_id(), cpu_initialized);
9824+ preempt_enable_no_resched();
9825+ HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL);
9826+ cpu_bringup();
9827+}
9828+#else
9829+static inline void play_dead(void)
9830+{
9831+ BUG();
9832+}
9833+#endif /* CONFIG_HOTPLUG_CPU */
9834+
9835+/*
9836+ * The idle thread. There's no useful work to be
9837+ * done, so just try to conserve power and have a
9838+ * low exit latency (ie sit in a loop waiting for
9839+ * somebody to say that they'd like to reschedule)
9840+ */
9841+void cpu_idle(void)
9842+{
9843+ int cpu = smp_processor_id();
9844+
9845+ set_thread_flag(TIF_POLLING_NRFLAG);
9846+
9847+ /* endless idle loop with no priority at all */
9848+ while (1) {
9849+ while (!need_resched()) {
9850+
9851+ if (__get_cpu_var(cpu_idle_state))
9852+ __get_cpu_var(cpu_idle_state) = 0;
9853+
9854+ rmb();
9855+
9856+ if (cpu_is_offline(cpu))
9857+ play_dead();
9858+
9859+ __get_cpu_var(irq_stat).idle_timestamp = jiffies;
9860+ xen_idle();
9861+ }
9862+ preempt_enable_no_resched();
9863+ schedule();
9864+ preempt_disable();
9865+ }
9866+}
9867+
9868+void cpu_idle_wait(void)
9869+{
9870+ unsigned int cpu, this_cpu = get_cpu();
9871+ cpumask_t map;
9872+
9873+ set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
9874+ put_cpu();
9875+
9876+ cpus_clear(map);
9877+ for_each_online_cpu(cpu) {
9878+ per_cpu(cpu_idle_state, cpu) = 1;
9879+ cpu_set(cpu, map);
9880+ }
9881+
9882+ __get_cpu_var(cpu_idle_state) = 0;
9883+
9884+ wmb();
9885+ do {
9886+ ssleep(1);
9887+ for_each_online_cpu(cpu) {
9888+ if (cpu_isset(cpu, map) && !per_cpu(cpu_idle_state, cpu))
9889+ cpu_clear(cpu, map);
9890+ }
9891+ cpus_and(map, map, cpu_online_map);
9892+ } while (!cpus_empty(map));
9893+}
9894+EXPORT_SYMBOL_GPL(cpu_idle_wait);
9895+
9896+/* XXX XEN doesn't use mwait_idle(), select_idle_routine(), idle_setup(). */
9897+/* Always use xen_idle() instead. */
9898+void __devinit select_idle_routine(const struct cpuinfo_x86 *c) {}
9899+
9900+void show_regs(struct pt_regs * regs)
9901+{
9902+ unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L;
9903+
9904+ printk("\n");
9905+ printk("Pid: %d, comm: %20s\n", current->pid, current->comm);
9906+ printk("EIP: %04x:[<%08lx>] CPU: %d\n",0xffff & regs->xcs,regs->eip, smp_processor_id());
9907+ print_symbol("EIP is at %s\n", regs->eip);
9908+
9909+ if (user_mode(regs))
9910+ printk(" ESP: %04x:%08lx",0xffff & regs->xss,regs->esp);
9911+ printk(" EFLAGS: %08lx %s (%s %.*s)\n",
9912+ regs->eflags, print_tainted(), system_utsname.release,
9913+ (int)strcspn(system_utsname.version, " "),
9914+ system_utsname.version);
9915+ printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n",
9916+ regs->eax,regs->ebx,regs->ecx,regs->edx);
9917+ printk("ESI: %08lx EDI: %08lx EBP: %08lx",
9918+ regs->esi, regs->edi, regs->ebp);
9919+ printk(" DS: %04x ES: %04x\n",
9920+ 0xffff & regs->xds,0xffff & regs->xes);
9921+
9922+ cr0 = read_cr0();
9923+ cr2 = read_cr2();
9924+ cr3 = read_cr3();
9925+ cr4 = read_cr4_safe();
9926+ printk("CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", cr0, cr2, cr3, cr4);
9927+ show_trace(NULL, &regs->esp);
9928+}
9929+
9930+/*
9931+ * This gets run with %ebx containing the
9932+ * function to call, and %edx containing
9933+ * the "args".
9934+ */
9935+extern void kernel_thread_helper(void);
9936+__asm__(".section .text\n"
9937+ ".align 4\n"
9938+ "kernel_thread_helper:\n\t"
9939+ "movl %edx,%eax\n\t"
9940+ "pushl %edx\n\t"
9941+ "call *%ebx\n\t"
9942+ "pushl %eax\n\t"
9943+ "call do_exit\n"
9944+ ".previous");
9945+
9946+/*
9947+ * Create a kernel thread
9948+ */
9949+int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
9950+{
9951+ struct pt_regs regs;
9952+
9953+ memset(&regs, 0, sizeof(regs));
9954+
9955+ regs.ebx = (unsigned long) fn;
9956+ regs.edx = (unsigned long) arg;
9957+
9958+ regs.xds = __USER_DS;
9959+ regs.xes = __USER_DS;
9960+ regs.orig_eax = -1;
9961+ regs.eip = (unsigned long) kernel_thread_helper;
9962+ regs.xcs = GET_KERNEL_CS();
9963+ regs.eflags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2;
9964+
9965+ /* Ok, create the new process.. */
9966+ return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, &regs, 0, NULL, NULL);
9967+}
9968+EXPORT_SYMBOL(kernel_thread);
9969+
9970+/*
9971+ * Free current thread data structures etc..
9972+ */
9973+void exit_thread(void)
9974+{
9975+ struct task_struct *tsk = current;
9976+ struct thread_struct *t = &tsk->thread;
9977+
9978+ /*
9979+ * Remove function-return probe instances associated with this task
9980+ * and put them back on the free list. Do not insert an exit probe for
9981+ * this function, it will be disabled by kprobe_flush_task if you do.
9982+ */
9983+ kprobe_flush_task(tsk);
9984+
9985+ /* The process may have allocated an io port bitmap... nuke it. */
9986+ if (unlikely(NULL != t->io_bitmap_ptr)) {
9987+ struct physdev_set_iobitmap set_iobitmap = { 0 };
9988+ HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap, &set_iobitmap);
9989+ kfree(t->io_bitmap_ptr);
9990+ t->io_bitmap_ptr = NULL;
9991+ }
9992+}
9993+
9994+void flush_thread(void)
9995+{
9996+ struct task_struct *tsk = current;
9997+
9998+ memset(tsk->thread.debugreg, 0, sizeof(unsigned long)*8);
9999+ memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
10000+ /*
10001+ * Forget coprocessor state..
10002+ */
10003+ clear_fpu(tsk);
10004+ clear_used_math();
10005+}
10006+
10007+void release_thread(struct task_struct *dead_task)
10008+{
10009+ BUG_ON(dead_task->mm);
10010+ release_vm86_irqs(dead_task);
10011+}
10012+
10013+/*
10014+ * This gets called before we allocate a new thread and copy
10015+ * the current task into it.
10016+ */
10017+void prepare_to_copy(struct task_struct *tsk)
10018+{
10019+ unlazy_fpu(tsk);
10020+}
10021+
10022+int copy_thread(int nr, unsigned long clone_flags, unsigned long esp,
10023+ unsigned long unused,
10024+ struct task_struct * p, struct pt_regs * regs)
10025+{
10026+ struct pt_regs * childregs;
10027+ struct task_struct *tsk;
10028+ int err;
10029+
10030+ childregs = task_pt_regs(p);
10031+ *childregs = *regs;
10032+ childregs->eax = 0;
10033+ childregs->esp = esp;
10034+
10035+ p->thread.esp = (unsigned long) childregs;
10036+ p->thread.esp0 = (unsigned long) (childregs+1);
10037+
10038+ p->thread.eip = (unsigned long) ret_from_fork;
10039+
10040+ savesegment(fs,p->thread.fs);
10041+ savesegment(gs,p->thread.gs);
10042+
10043+ tsk = current;
10044+ if (unlikely(NULL != tsk->thread.io_bitmap_ptr)) {
10045+ p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
10046+ if (!p->thread.io_bitmap_ptr) {
10047+ p->thread.io_bitmap_max = 0;
10048+ return -ENOMEM;
10049+ }
10050+ memcpy(p->thread.io_bitmap_ptr, tsk->thread.io_bitmap_ptr,
10051+ IO_BITMAP_BYTES);
10052+ }
10053+
10054+ /*
10055+ * Set a new TLS for the child thread?
10056+ */
10057+ if (clone_flags & CLONE_SETTLS) {
10058+ struct desc_struct *desc;
10059+ struct user_desc info;
10060+ int idx;
10061+
10062+ err = -EFAULT;
10063+ if (copy_from_user(&info, (void __user *)childregs->esi, sizeof(info)))
10064+ goto out;
10065+ err = -EINVAL;
10066+ if (LDT_empty(&info))
10067+ goto out;
10068+
10069+ idx = info.entry_number;
10070+ if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
10071+ goto out;
10072+
10073+ desc = p->thread.tls_array + idx - GDT_ENTRY_TLS_MIN;
10074+ desc->a = LDT_entry_a(&info);
10075+ desc->b = LDT_entry_b(&info);
10076+ }
10077+
10078+ p->thread.iopl = current->thread.iopl;
10079+
10080+ err = 0;
10081+ out:
10082+ if (err && p->thread.io_bitmap_ptr) {
10083+ kfree(p->thread.io_bitmap_ptr);
10084+ p->thread.io_bitmap_max = 0;
10085+ }
10086+ return err;
10087+}
10088+
10089+/*
10090+ * fill in the user structure for a core dump..
10091+ */
10092+void dump_thread(struct pt_regs * regs, struct user * dump)
10093+{
10094+ int i;
10095+
10096+/* changed the size calculations - should hopefully work better. lbt */
10097+ dump->magic = CMAGIC;
10098+ dump->start_code = 0;
10099+ dump->start_stack = regs->esp & ~(PAGE_SIZE - 1);
10100+ dump->u_tsize = ((unsigned long) current->mm->end_code) >> PAGE_SHIFT;
10101+ dump->u_dsize = ((unsigned long) (current->mm->brk + (PAGE_SIZE-1))) >> PAGE_SHIFT;
10102+ dump->u_dsize -= dump->u_tsize;
10103+ dump->u_ssize = 0;
10104+ for (i = 0; i < 8; i++)
10105+ dump->u_debugreg[i] = current->thread.debugreg[i];
10106+
10107+ if (dump->start_stack < TASK_SIZE)
10108+ dump->u_ssize = ((unsigned long) (TASK_SIZE - dump->start_stack)) >> PAGE_SHIFT;
10109+
10110+ dump->regs.ebx = regs->ebx;
10111+ dump->regs.ecx = regs->ecx;
10112+ dump->regs.edx = regs->edx;
10113+ dump->regs.esi = regs->esi;
10114+ dump->regs.edi = regs->edi;
10115+ dump->regs.ebp = regs->ebp;
10116+ dump->regs.eax = regs->eax;
10117+ dump->regs.ds = regs->xds;
10118+ dump->regs.es = regs->xes;
10119+ savesegment(fs,dump->regs.fs);
10120+ savesegment(gs,dump->regs.gs);
10121+ dump->regs.orig_eax = regs->orig_eax;
10122+ dump->regs.eip = regs->eip;
10123+ dump->regs.cs = regs->xcs;
10124+ dump->regs.eflags = regs->eflags;
10125+ dump->regs.esp = regs->esp;
10126+ dump->regs.ss = regs->xss;
10127+
10128+ dump->u_fpvalid = dump_fpu (regs, &dump->i387);
10129+}
10130+EXPORT_SYMBOL(dump_thread);
10131+
10132+/*
10133+ * Capture the user space registers if the task is not running (in user space)
10134+ */
10135+int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs)
10136+{
10137+ struct pt_regs ptregs = *task_pt_regs(tsk);
10138+ ptregs.xcs &= 0xffff;
10139+ ptregs.xds &= 0xffff;
10140+ ptregs.xes &= 0xffff;
10141+ ptregs.xss &= 0xffff;
10142+
10143+ elf_core_copy_regs(regs, &ptregs);
10144+
10145+ return 1;
10146+}
10147+
10148+/*
10149+ * This function selects if the context switch from prev to next
10150+ * has to tweak the TSC disable bit in the cr4.
10151+ */
10152+static inline void disable_tsc(struct task_struct *prev_p,
10153+ struct task_struct *next_p)
10154+{
10155+ struct thread_info *prev, *next;
10156+
10157+ /*
10158+ * gcc should eliminate the ->thread_info dereference if
10159+ * has_secure_computing returns 0 at compile time (SECCOMP=n).
10160+ */
10161+ prev = task_thread_info(prev_p);
10162+ next = task_thread_info(next_p);
10163+
10164+ if (has_secure_computing(prev) || has_secure_computing(next)) {
10165+ /* slow path here */
10166+ if (has_secure_computing(prev) &&
10167+ !has_secure_computing(next)) {
10168+ write_cr4(read_cr4() & ~X86_CR4_TSD);
10169+ } else if (!has_secure_computing(prev) &&
10170+ has_secure_computing(next))
10171+ write_cr4(read_cr4() | X86_CR4_TSD);
10172+ }
10173+}
10174+
10175+/*
10176+ * switch_to(x,y) should switch tasks from x to y.
10177+ *
10178+ * We fsave/fwait so that an exception goes off at the right time
10179+ * (as a call from the fsave or fwait in effect) rather than to
10180+ * the wrong process. Lazy FP saving no longer makes any sense
10181+ * with modern CPU's, and this simplifies a lot of things (SMP
10182+ * and UP become the same).
10183+ *
10184+ * NOTE! We used to use the x86 hardware context switching. The
10185+ * reason for not using it any more becomes apparent when you
10186+ * try to recover gracefully from saved state that is no longer
10187+ * valid (stale segment register values in particular). With the
10188+ * hardware task-switch, there is no way to fix up bad state in
10189+ * a reasonable manner.
10190+ *
10191+ * The fact that Intel documents the hardware task-switching to
10192+ * be slow is a fairly red herring - this code is not noticeably
10193+ * faster. However, there _is_ some room for improvement here,
10194+ * so the performance issues may eventually be a valid point.
10195+ * More important, however, is the fact that this allows us much
10196+ * more flexibility.
10197+ *
10198+ * The return value (in %eax) will be the "prev" task after
10199+ * the task-switch, and shows up in ret_from_fork in entry.S,
10200+ * for example.
10201+ */
10202+struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
10203+{
10204+ struct thread_struct *prev = &prev_p->thread,
10205+ *next = &next_p->thread;
10206+ int cpu = smp_processor_id();
10207+#ifndef CONFIG_X86_NO_TSS
10208+ struct tss_struct *tss = &per_cpu(init_tss, cpu);
10209+#endif
10210+ struct physdev_set_iopl iopl_op;
10211+ struct physdev_set_iobitmap iobmp_op;
10212+ multicall_entry_t _mcl[8], *mcl = _mcl;
10213+
10214+ /* XEN NOTE: FS/GS saved in switch_mm(), not here. */
10215+
10216+ /*
10217+ * This is basically '__unlazy_fpu', except that we queue a
10218+ * multicall to indicate FPU task switch, rather than
10219+ * synchronously trapping to Xen.
10220+ */
10221+ if (prev_p->thread_info->status & TS_USEDFPU) {
10222+ __save_init_fpu(prev_p); /* _not_ save_init_fpu() */
10223+ mcl->op = __HYPERVISOR_fpu_taskswitch;
10224+ mcl->args[0] = 1;
10225+ mcl++;
10226+ }
10227+#if 0 /* lazy fpu sanity check */
10228+ else BUG_ON(!(read_cr0() & 8));
10229+#endif
10230+
10231+ /*
10232+ * Reload esp0.
10233+ * This is load_esp0(tss, next) with a multicall.
10234+ */
10235+ mcl->op = __HYPERVISOR_stack_switch;
10236+ mcl->args[0] = __KERNEL_DS;
10237+ mcl->args[1] = next->esp0;
10238+ mcl++;
10239+
10240+ /*
10241+ * Load the per-thread Thread-Local Storage descriptor.
10242+ * This is load_TLS(next, cpu) with multicalls.
10243+ */
10244+#define C(i) do { \
10245+ if (unlikely(next->tls_array[i].a != prev->tls_array[i].a || \
10246+ next->tls_array[i].b != prev->tls_array[i].b)) { \
10247+ mcl->op = __HYPERVISOR_update_descriptor; \
10248+ *(u64 *)&mcl->args[0] = virt_to_machine( \
10249+ &get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i]);\
10250+ *(u64 *)&mcl->args[2] = *(u64 *)&next->tls_array[i]; \
10251+ mcl++; \
10252+ } \
10253+} while (0)
10254+ C(0); C(1); C(2);
10255+#undef C
10256+
10257+ if (unlikely(prev->iopl != next->iopl)) {
10258+ iopl_op.iopl = (next->iopl == 0) ? 1 : (next->iopl >> 12) & 3;
10259+ mcl->op = __HYPERVISOR_physdev_op;
10260+ mcl->args[0] = PHYSDEVOP_set_iopl;
10261+ mcl->args[1] = (unsigned long)&iopl_op;
10262+ mcl++;
10263+ }
10264+
10265+ if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr)) {
10266+ iobmp_op.bitmap = (char *)next->io_bitmap_ptr;
10267+ iobmp_op.nr_ports = next->io_bitmap_ptr ? IO_BITMAP_BITS : 0;
10268+ mcl->op = __HYPERVISOR_physdev_op;
10269+ mcl->args[0] = PHYSDEVOP_set_iobitmap;
10270+ mcl->args[1] = (unsigned long)&iobmp_op;
10271+ mcl++;
10272+ }
10273+
10274+ (void)HYPERVISOR_multicall(_mcl, mcl - _mcl);
10275+
10276+ /*
10277+ * Restore %fs and %gs if needed.
10278+ *
10279+ * Glibc normally makes %fs be zero, and %gs is one of
10280+ * the TLS segments.
10281+ */
10282+ if (unlikely(next->fs))
10283+ loadsegment(fs, next->fs);
10284+
10285+ if (next->gs)
10286+ loadsegment(gs, next->gs);
10287+
10288+ /*
10289+ * Now maybe reload the debug registers
10290+ */
10291+ if (unlikely(next->debugreg[7])) {
10292+ set_debugreg(next->debugreg[0], 0);
10293+ set_debugreg(next->debugreg[1], 1);
10294+ set_debugreg(next->debugreg[2], 2);
10295+ set_debugreg(next->debugreg[3], 3);
10296+ /* no 4 and 5 */
10297+ set_debugreg(next->debugreg[6], 6);
10298+ set_debugreg(next->debugreg[7], 7);
10299+ }
10300+
10301+ disable_tsc(prev_p, next_p);
10302+
10303+ return prev_p;
10304+}
10305+
10306+asmlinkage int sys_fork(struct pt_regs regs)
10307+{
10308+ return do_fork(SIGCHLD, regs.esp, &regs, 0, NULL, NULL);
10309+}
10310+
10311+asmlinkage int sys_clone(struct pt_regs regs)
10312+{
10313+ unsigned long clone_flags;
10314+ unsigned long newsp;
10315+ int __user *parent_tidptr, *child_tidptr;
10316+
10317+ clone_flags = regs.ebx;
10318+ newsp = regs.ecx;
10319+ parent_tidptr = (int __user *)regs.edx;
10320+ child_tidptr = (int __user *)regs.edi;
10321+ if (!newsp)
10322+ newsp = regs.esp;
10323+ return do_fork(clone_flags, newsp, &regs, 0, parent_tidptr, child_tidptr);
10324+}
10325+
10326+/*
10327+ * This is trivial, and on the face of it looks like it
10328+ * could equally well be done in user mode.
10329+ *
10330+ * Not so, for quite unobvious reasons - register pressure.
10331+ * In user mode vfork() cannot have a stack frame, and if
10332+ * done by calling the "clone()" system call directly, you
10333+ * do not have enough call-clobbered registers to hold all
10334+ * the information you need.
10335+ */
10336+asmlinkage int sys_vfork(struct pt_regs regs)
10337+{
10338+ return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.esp, &regs, 0, NULL, NULL);
10339+}
10340+
10341+/*
10342+ * sys_execve() executes a new program.
10343+ */
10344+asmlinkage int sys_execve(struct pt_regs regs)
10345+{
10346+ int error;
10347+ char * filename;
10348+
10349+ filename = getname((char __user *) regs.ebx);
10350+ error = PTR_ERR(filename);
10351+ if (IS_ERR(filename))
10352+ goto out;
10353+ error = do_execve(filename,
10354+ (char __user * __user *) regs.ecx,
10355+ (char __user * __user *) regs.edx,
10356+ &regs);
10357+ if (error == 0) {
10358+ task_lock(current);
10359+ current->ptrace &= ~PT_DTRACE;
10360+ task_unlock(current);
10361+ /* Make sure we don't return using sysenter.. */
10362+ set_thread_flag(TIF_IRET);
10363+ }
10364+ putname(filename);
10365+out:
10366+ return error;
10367+}
10368+
10369+#define top_esp (THREAD_SIZE - sizeof(unsigned long))
10370+#define top_ebp (THREAD_SIZE - 2*sizeof(unsigned long))
10371+
10372+unsigned long get_wchan(struct task_struct *p)
10373+{
10374+ unsigned long ebp, esp, eip;
10375+ unsigned long stack_page;
10376+ int count = 0;
10377+ if (!p || p == current || p->state == TASK_RUNNING)
10378+ return 0;
10379+ stack_page = (unsigned long)task_stack_page(p);
10380+ esp = p->thread.esp;
10381+ if (!stack_page || esp < stack_page || esp > top_esp+stack_page)
10382+ return 0;
10383+ /* include/asm-i386/system.h:switch_to() pushes ebp last. */
10384+ ebp = *(unsigned long *) esp;
10385+ do {
10386+ if (ebp < stack_page || ebp > top_ebp+stack_page)
10387+ return 0;
10388+ eip = *(unsigned long *) (ebp+4);
10389+ if (!in_sched_functions(eip))
10390+ return eip;
10391+ ebp = *(unsigned long *) ebp;
10392+ } while (count++ < 16);
10393+ return 0;
10394+}
10395+EXPORT_SYMBOL(get_wchan);
10396+
10397+/*
10398+ * sys_alloc_thread_area: get a yet unused TLS descriptor index.
10399+ */
10400+static int get_free_idx(void)
10401+{
10402+ struct thread_struct *t = &current->thread;
10403+ int idx;
10404+
10405+ for (idx = 0; idx < GDT_ENTRY_TLS_ENTRIES; idx++)
10406+ if (desc_empty(t->tls_array + idx))
10407+ return idx + GDT_ENTRY_TLS_MIN;
10408+ return -ESRCH;
10409+}
10410+
10411+/*
10412+ * Set a given TLS descriptor:
10413+ */
10414+asmlinkage int sys_set_thread_area(struct user_desc __user *u_info)
10415+{
10416+ struct thread_struct *t = &current->thread;
10417+ struct user_desc info;
10418+ struct desc_struct *desc;
10419+ int cpu, idx;
10420+
10421+ if (copy_from_user(&info, u_info, sizeof(info)))
10422+ return -EFAULT;
10423+ idx = info.entry_number;
10424+
10425+ /*
10426+ * index -1 means the kernel should try to find and
10427+ * allocate an empty descriptor:
10428+ */
10429+ if (idx == -1) {
10430+ idx = get_free_idx();
10431+ if (idx < 0)
10432+ return idx;
10433+ if (put_user(idx, &u_info->entry_number))
10434+ return -EFAULT;
10435+ }
10436+
10437+ if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
10438+ return -EINVAL;
10439+
10440+ desc = t->tls_array + idx - GDT_ENTRY_TLS_MIN;
10441+
10442+ /*
10443+ * We must not get preempted while modifying the TLS.
10444+ */
10445+ cpu = get_cpu();
10446+
10447+ if (LDT_empty(&info)) {
10448+ desc->a = 0;
10449+ desc->b = 0;
10450+ } else {
10451+ desc->a = LDT_entry_a(&info);
10452+ desc->b = LDT_entry_b(&info);
10453+ }
10454+ load_TLS(t, cpu);
10455+
10456+ put_cpu();
10457+
10458+ return 0;
10459+}
10460+
10461+/*
10462+ * Get the current Thread-Local Storage area:
10463+ */
10464+
10465+#define GET_BASE(desc) ( \
10466+ (((desc)->a >> 16) & 0x0000ffff) | \
10467+ (((desc)->b << 16) & 0x00ff0000) | \
10468+ ( (desc)->b & 0xff000000) )
10469+
10470+#define GET_LIMIT(desc) ( \
10471+ ((desc)->a & 0x0ffff) | \
10472+ ((desc)->b & 0xf0000) )
10473+
10474+#define GET_32BIT(desc) (((desc)->b >> 22) & 1)
10475+#define GET_CONTENTS(desc) (((desc)->b >> 10) & 3)
10476+#define GET_WRITABLE(desc) (((desc)->b >> 9) & 1)
10477+#define GET_LIMIT_PAGES(desc) (((desc)->b >> 23) & 1)
10478+#define GET_PRESENT(desc) (((desc)->b >> 15) & 1)
10479+#define GET_USEABLE(desc) (((desc)->b >> 20) & 1)
10480+
10481+asmlinkage int sys_get_thread_area(struct user_desc __user *u_info)
10482+{
10483+ struct user_desc info;
10484+ struct desc_struct *desc;
10485+ int idx;
10486+
10487+ if (get_user(idx, &u_info->entry_number))
10488+ return -EFAULT;
10489+ if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
10490+ return -EINVAL;
10491+
10492+ memset(&info, 0, sizeof(info));
10493+
10494+ desc = current->thread.tls_array + idx - GDT_ENTRY_TLS_MIN;
10495+
10496+ info.entry_number = idx;
10497+ info.base_addr = GET_BASE(desc);
10498+ info.limit = GET_LIMIT(desc);
10499+ info.seg_32bit = GET_32BIT(desc);
10500+ info.contents = GET_CONTENTS(desc);
10501+ info.read_exec_only = !GET_WRITABLE(desc);
10502+ info.limit_in_pages = GET_LIMIT_PAGES(desc);
10503+ info.seg_not_present = !GET_PRESENT(desc);
10504+ info.useable = GET_USEABLE(desc);
10505+
10506+ if (copy_to_user(u_info, &info, sizeof(info)))
10507+ return -EFAULT;
10508+ return 0;
10509+}
10510+
10511+unsigned long arch_align_stack(unsigned long sp)
10512+{
10513+ if (randomize_va_space)
10514+ sp -= get_random_int() % 8192;
10515+ return sp & ~0xf;
10516+}
10517diff -Nur linux-2.6.16.33-noxen/arch/i386/kernel/quirks-xen.c linux-2.6.16.33/arch/i386/kernel/quirks-xen.c
10518--- linux-2.6.16.33-noxen/arch/i386/kernel/quirks-xen.c 1970-01-01 00:00:00.000000000 +0000
10519+++ linux-2.6.16.33/arch/i386/kernel/quirks-xen.c 2007-01-08 15:00:45.000000000 +0000
10520@@ -0,0 +1,48 @@
10521+/*
10522+ * This file contains work-arounds for x86 and x86_64 platform bugs.
10523+ */
10524+#include <linux/config.h>
10525+#include <linux/pci.h>
10526+#include <linux/irq.h>
10527+
10528+#if defined(CONFIG_X86_IO_APIC) && (defined(CONFIG_SMP) || defined(CONFIG_XEN)) && defined(CONFIG_PCI)
10529+
10530+static void __devinit quirk_intel_irqbalance(struct pci_dev *dev)
10531+{
10532+ u8 config, rev;
10533+ u32 word;
10534+
10535+ /* BIOS may enable hardware IRQ balancing for
10536+	 * E7520/E7320/E7525 (revision ID 0x9 and below)
10537+ * based platforms.
10538+ * Disable SW irqbalance/affinity on those platforms.
10539+ */
10540+ pci_read_config_byte(dev, PCI_CLASS_REVISION, &rev);
10541+ if (rev > 0x9)
10542+ return;
10543+
10544+ printk(KERN_INFO "Intel E7520/7320/7525 detected.");
10545+
10546+ /* enable access to config space*/
10547+ pci_read_config_byte(dev, 0xf4, &config);
10548+ pci_write_config_byte(dev, 0xf4, config|0x2);
10549+
10550+ /* read xTPR register */
10551+ raw_pci_ops->read(0, 0, 0x40, 0x4c, 2, &word);
10552+
10553+ if (!(word & (1 << 13))) {
10554+ dom0_op_t op;
10555+ printk(KERN_INFO "Disabling irq balancing and affinity\n");
10556+ op.cmd = DOM0_PLATFORM_QUIRK;
10557+ op.u.platform_quirk.quirk_id = QUIRK_NOIRQBALANCING;
10558+ (void)HYPERVISOR_dom0_op(&op);
10559+ }
10560+
10561+ /* put back the original value for config space*/
10562+ if (!(config & 0x2))
10563+ pci_write_config_byte(dev, 0xf4, config);
10564+}
10565+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7320_MCH, quirk_intel_irqbalance);
10566+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7525_MCH, quirk_intel_irqbalance);
10567+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7520_MCH, quirk_intel_irqbalance);
10568+#endif
10569diff -Nur linux-2.6.16.33-noxen/arch/i386/kernel/relocate_kernel.S linux-2.6.16.33/arch/i386/kernel/relocate_kernel.S
10570--- linux-2.6.16.33-noxen/arch/i386/kernel/relocate_kernel.S 2006-11-22 18:06:31.000000000 +0000
10571+++ linux-2.6.16.33/arch/i386/kernel/relocate_kernel.S 2007-05-23 21:00:01.000000000 +0000
10572@@ -7,16 +7,138 @@
10573 */
10574
10575 #include <linux/linkage.h>
10576+#include <asm/page.h>
10577+#include <asm/kexec.h>
10578+
10579+/*
10580+ * Must be relocatable PIC code callable as a C function
10581+ */
10582+
10583+#define PTR(x) (x << 2)
10584+#define PAGE_ALIGNED (1 << PAGE_SHIFT)
10585+#define PAGE_ATTR 0x63 /* _PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY */
10586+#define PAE_PGD_ATTR 0x01 /* _PAGE_PRESENT */
10587+
10588+ .text
10589+ .align PAGE_ALIGNED
10590+ .globl relocate_kernel
10591+relocate_kernel:
10592+ movl 8(%esp), %ebp /* list of pages */
10593+
10594+#ifdef CONFIG_X86_PAE
10595+ /* map the control page at its virtual address */
10596+
10597+ movl PTR(VA_PGD)(%ebp), %edi
10598+ movl PTR(VA_CONTROL_PAGE)(%ebp), %eax
10599+ andl $0xc0000000, %eax
10600+ shrl $27, %eax
10601+ addl %edi, %eax
10602+
10603+ movl PTR(PA_PMD_0)(%ebp), %edx
10604+ orl $PAE_PGD_ATTR, %edx
10605+ movl %edx, (%eax)
10606+
10607+ movl PTR(VA_PMD_0)(%ebp), %edi
10608+ movl PTR(VA_CONTROL_PAGE)(%ebp), %eax
10609+ andl $0x3fe00000, %eax
10610+ shrl $18, %eax
10611+ addl %edi, %eax
10612+
10613+ movl PTR(PA_PTE_0)(%ebp), %edx
10614+ orl $PAGE_ATTR, %edx
10615+ movl %edx, (%eax)
10616+
10617+ movl PTR(VA_PTE_0)(%ebp), %edi
10618+ movl PTR(VA_CONTROL_PAGE)(%ebp), %eax
10619+ andl $0x001ff000, %eax
10620+ shrl $9, %eax
10621+ addl %edi, %eax
10622+
10623+ movl PTR(PA_CONTROL_PAGE)(%ebp), %edx
10624+ orl $PAGE_ATTR, %edx
10625+ movl %edx, (%eax)
10626+
10627+ /* identity map the control page at its physical address */
10628+
10629+ movl PTR(VA_PGD)(%ebp), %edi
10630+ movl PTR(PA_CONTROL_PAGE)(%ebp), %eax
10631+ andl $0xc0000000, %eax
10632+ shrl $27, %eax
10633+ addl %edi, %eax
10634+
10635+ movl PTR(PA_PMD_1)(%ebp), %edx
10636+ orl $PAE_PGD_ATTR, %edx
10637+ movl %edx, (%eax)
10638+
10639+ movl PTR(VA_PMD_1)(%ebp), %edi
10640+ movl PTR(PA_CONTROL_PAGE)(%ebp), %eax
10641+ andl $0x3fe00000, %eax
10642+ shrl $18, %eax
10643+ addl %edi, %eax
10644+
10645+ movl PTR(PA_PTE_1)(%ebp), %edx
10646+ orl $PAGE_ATTR, %edx
10647+ movl %edx, (%eax)
10648+
10649+ movl PTR(VA_PTE_1)(%ebp), %edi
10650+ movl PTR(PA_CONTROL_PAGE)(%ebp), %eax
10651+ andl $0x001ff000, %eax
10652+ shrl $9, %eax
10653+ addl %edi, %eax
10654+
10655+ movl PTR(PA_CONTROL_PAGE)(%ebp), %edx
10656+ orl $PAGE_ATTR, %edx
10657+ movl %edx, (%eax)
10658+#else
10659+ /* map the control page at its virtual address */
10660+
10661+ movl PTR(VA_PGD)(%ebp), %edi
10662+ movl PTR(VA_CONTROL_PAGE)(%ebp), %eax
10663+ andl $0xffc00000, %eax
10664+ shrl $20, %eax
10665+ addl %edi, %eax
10666+
10667+ movl PTR(PA_PTE_0)(%ebp), %edx
10668+ orl $PAGE_ATTR, %edx
10669+ movl %edx, (%eax)
10670+
10671+ movl PTR(VA_PTE_0)(%ebp), %edi
10672+ movl PTR(VA_CONTROL_PAGE)(%ebp), %eax
10673+ andl $0x003ff000, %eax
10674+ shrl $10, %eax
10675+ addl %edi, %eax
10676+
10677+ movl PTR(PA_CONTROL_PAGE)(%ebp), %edx
10678+ orl $PAGE_ATTR, %edx
10679+ movl %edx, (%eax)
10680+
10681+ /* identity map the control page at its physical address */
10682+
10683+ movl PTR(VA_PGD)(%ebp), %edi
10684+ movl PTR(PA_CONTROL_PAGE)(%ebp), %eax
10685+ andl $0xffc00000, %eax
10686+ shrl $20, %eax
10687+ addl %edi, %eax
10688+
10689+ movl PTR(PA_PTE_1)(%ebp), %edx
10690+ orl $PAGE_ATTR, %edx
10691+ movl %edx, (%eax)
10692+
10693+ movl PTR(VA_PTE_1)(%ebp), %edi
10694+ movl PTR(PA_CONTROL_PAGE)(%ebp), %eax
10695+ andl $0x003ff000, %eax
10696+ shrl $10, %eax
10697+ addl %edi, %eax
10698+
10699+ movl PTR(PA_CONTROL_PAGE)(%ebp), %edx
10700+ orl $PAGE_ATTR, %edx
10701+ movl %edx, (%eax)
10702+#endif
10703
10704- /*
10705- * Must be relocatable PIC code callable as a C function, that once
10706- * it starts can not use the previous processes stack.
10707- */
10708- .globl relocate_new_kernel
10709 relocate_new_kernel:
10710 /* read the arguments and say goodbye to the stack */
10711 movl 4(%esp), %ebx /* page_list */
10712- movl 8(%esp), %ebp /* reboot_code_buffer */
10713+ movl 8(%esp), %ebp /* list of pages */
10714 movl 12(%esp), %edx /* start address */
10715 movl 16(%esp), %ecx /* cpu_has_pae */
10716
10717@@ -24,11 +146,57 @@
10718 pushl $0
10719 popfl
10720
10721- /* set a new stack at the bottom of our page... */
10722- lea 4096(%ebp), %esp
10723+ /* get physical address of control page now */
10724+ /* this is impossible after page table switch */
10725+ movl PTR(PA_CONTROL_PAGE)(%ebp), %edi
10726
10727- /* store the parameters back on the stack */
10728- pushl %edx /* store the start address */
10729+ /* switch to new set of page tables */
10730+ movl PTR(PA_PGD)(%ebp), %eax
10731+ movl %eax, %cr3
10732+
10733+ /* setup idt */
10734+ movl %edi, %eax
10735+ addl $(idt_48 - relocate_kernel), %eax
10736+ lidtl (%eax)
10737+
10738+ /* setup gdt */
10739+ movl %edi, %eax
10740+ addl $(gdt - relocate_kernel), %eax
10741+ movl %edi, %esi
10742+ addl $((gdt_48 - relocate_kernel) + 2), %esi
10743+ movl %eax, (%esi)
10744+
10745+ movl %edi, %eax
10746+ addl $(gdt_48 - relocate_kernel), %eax
10747+ lgdtl (%eax)
10748+
10749+ /* setup data segment registers */
10750+ mov $(gdt_ds - gdt), %eax
10751+ mov %eax, %ds
10752+ mov %eax, %es
10753+ mov %eax, %fs
10754+ mov %eax, %gs
10755+ mov %eax, %ss
10756+
10757+ /* setup a new stack at the end of the physical control page */
10758+ lea 4096(%edi), %esp
10759+
10760+ /* load new code segment and jump to identity mapped page */
10761+ movl %edi, %esi
10762+ xorl %eax, %eax
10763+ pushl %eax
10764+ pushl %esi
10765+ pushl %eax
10766+ movl $(gdt_cs - gdt), %eax
10767+ pushl %eax
10768+ movl %edi, %eax
10769+ addl $(identity_mapped - relocate_kernel),%eax
10770+ pushl %eax
10771+ iretl
10772+
10773+identity_mapped:
10774+ /* store the start address on the stack */
10775+ pushl %edx
10776
10777 /* Set cr0 to a known state:
10778 * 31 0 == Paging disabled
10779@@ -113,8 +281,20 @@
10780 xorl %edi, %edi
10781 xorl %ebp, %ebp
10782 ret
10783-relocate_new_kernel_end:
10784
10785- .globl relocate_new_kernel_size
10786-relocate_new_kernel_size:
10787- .long relocate_new_kernel_end - relocate_new_kernel
10788+ .align 16
10789+gdt:
10790+ .quad 0x0000000000000000 /* NULL descriptor */
10791+gdt_cs:
10792+ .quad 0x00cf9a000000ffff /* kernel 4GB code at 0x00000000 */
10793+gdt_ds:
10794+ .quad 0x00cf92000000ffff /* kernel 4GB data at 0x00000000 */
10795+gdt_end:
10796+
10797+gdt_48:
10798+ .word gdt_end - gdt - 1 /* limit */
10799+ .long 0 /* base - filled in by code above */
10800+
10801+idt_48:
10802+ .word 0 /* limit */
10803+ .long 0 /* base */
10804diff -Nur linux-2.6.16.33-noxen/arch/i386/kernel/setup-xen.c linux-2.6.16.33/arch/i386/kernel/setup-xen.c
10805--- linux-2.6.16.33-noxen/arch/i386/kernel/setup-xen.c 1970-01-01 00:00:00.000000000 +0000
10806+++ linux-2.6.16.33/arch/i386/kernel/setup-xen.c 2007-01-08 15:00:45.000000000 +0000
10807@@ -0,0 +1,1892 @@
10808+/*
10809+ * linux/arch/i386/kernel/setup.c
10810+ *
10811+ * Copyright (C) 1995 Linus Torvalds
10812+ *
10813+ * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
10814+ *
10815+ * Memory region support
10816+ * David Parsons <orc@pell.chi.il.us>, July-August 1999
10817+ *
10818+ * Added E820 sanitization routine (removes overlapping memory regions);
10819+ * Brian Moyle <bmoyle@mvista.com>, February 2001
10820+ *
10821+ * Moved CPU detection code to cpu/${cpu}.c
10822+ * Patrick Mochel <mochel@osdl.org>, March 2002
10823+ *
10824+ * Provisions for empty E820 memory regions (reported by certain BIOSes).
10825+ * Alex Achenbach <xela@slit.de>, December 2002.
10826+ *
10827+ */
10828+
10829+/*
10830+ * This file handles the architecture-dependent parts of initialization
10831+ */
10832+
10833+#include <linux/config.h>
10834+#include <linux/sched.h>
10835+#include <linux/mm.h>
10836+#include <linux/mmzone.h>
10837+#include <linux/tty.h>
10838+#include <linux/ioport.h>
10839+#include <linux/acpi.h>
10840+#include <linux/apm_bios.h>
10841+#include <linux/initrd.h>
10842+#include <linux/bootmem.h>
10843+#include <linux/seq_file.h>
10844+#include <linux/console.h>
10845+#include <linux/mca.h>
10846+#include <linux/root_dev.h>
10847+#include <linux/highmem.h>
10848+#include <linux/module.h>
10849+#include <linux/efi.h>
10850+#include <linux/init.h>
10851+#include <linux/edd.h>
10852+#include <linux/nodemask.h>
10853+#include <linux/kernel.h>
10854+#include <linux/percpu.h>
10855+#include <linux/notifier.h>
10856+#include <linux/kexec.h>
10857+#include <linux/crash_dump.h>
10858+#include <linux/dmi.h>
10859+
10860+#include <video/edid.h>
10861+
10862+#include <asm/apic.h>
10863+#include <asm/e820.h>
10864+#include <asm/mpspec.h>
10865+#include <asm/setup.h>
10866+#include <asm/arch_hooks.h>
10867+#include <asm/sections.h>
10868+#include <asm/io_apic.h>
10869+#include <asm/ist.h>
10870+#include <asm/io.h>
10871+#include <asm/hypervisor.h>
10872+#include <xen/interface/physdev.h>
10873+#include <xen/interface/memory.h>
10874+#include <xen/features.h>
10875+#include <xen/xencons.h>
10876+#include "setup_arch_pre.h"
10877+#include <bios_ebda.h>
10878+
10879+#ifdef CONFIG_XEN
10880+#include <xen/interface/kexec.h>
10881+#endif
10882+
10883+/* Forward Declaration. */
10884+void __init find_max_pfn(void);
10885+
10886+static int xen_panic_event(struct notifier_block *, unsigned long, void *);
10887+static struct notifier_block xen_panic_block = {
10888+ xen_panic_event, NULL, 0 /* try to go last */
10889+};
10890+
10891+extern char hypercall_page[PAGE_SIZE];
10892+EXPORT_SYMBOL(hypercall_page);
10893+
10894+int disable_pse __devinitdata = 0;
10895+
10896+/*
10897+ * Machine setup..
10898+ */
10899+
10900+#ifdef CONFIG_EFI
10901+int efi_enabled = 0;
10902+EXPORT_SYMBOL(efi_enabled);
10903+#endif
10904+
10905+/* cpu data as detected by the assembly code in head.S */
10906+struct cpuinfo_x86 new_cpu_data __initdata = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
10907+/* common cpu data for all cpus */
10908+struct cpuinfo_x86 boot_cpu_data __read_mostly = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
10909+EXPORT_SYMBOL(boot_cpu_data);
10910+
10911+unsigned long mmu_cr4_features;
10912+
10913+#ifdef CONFIG_ACPI
10914+ int acpi_disabled = 0;
10915+#else
10916+ int acpi_disabled = 1;
10917+#endif
10918+EXPORT_SYMBOL(acpi_disabled);
10919+
10920+#ifdef CONFIG_ACPI
10921+int __initdata acpi_force = 0;
10922+extern acpi_interrupt_flags acpi_sci_flags;
10923+#endif
10924+
10925+/* for MCA, but anyone else can use it if they want */
10926+unsigned int machine_id;
10927+#ifdef CONFIG_MCA
10928+EXPORT_SYMBOL(machine_id);
10929+#endif
10930+unsigned int machine_submodel_id;
10931+unsigned int BIOS_revision;
10932+unsigned int mca_pentium_flag;
10933+
10934+/* For PCI or other memory-mapped resources */
10935+unsigned long pci_mem_start = 0x10000000;
10936+#ifdef CONFIG_PCI
10937+EXPORT_SYMBOL(pci_mem_start);
10938+#endif
10939+
10940+/* Boot loader ID as an integer, for the benefit of proc_dointvec */
10941+int bootloader_type;
10942+
10943+/* user-defined highmem size */
10944+static unsigned int highmem_pages = -1;
10945+
10946+/*
10947+ * Setup options
10948+ */
10949+struct drive_info_struct { char dummy[32]; } drive_info;
10950+#if defined(CONFIG_BLK_DEV_IDE) || defined(CONFIG_BLK_DEV_HD) || \
10951+ defined(CONFIG_BLK_DEV_IDE_MODULE) || defined(CONFIG_BLK_DEV_HD_MODULE)
10952+EXPORT_SYMBOL(drive_info);
10953+#endif
10954+struct screen_info screen_info;
10955+EXPORT_SYMBOL(screen_info);
10956+struct apm_info apm_info;
10957+EXPORT_SYMBOL(apm_info);
10958+struct sys_desc_table_struct {
10959+ unsigned short length;
10960+ unsigned char table[0];
10961+};
10962+struct edid_info edid_info;
10963+EXPORT_SYMBOL_GPL(edid_info);
10964+struct ist_info ist_info;
10965+#if defined(CONFIG_X86_SPEEDSTEP_SMI) || \
10966+ defined(CONFIG_X86_SPEEDSTEP_SMI_MODULE)
10967+EXPORT_SYMBOL(ist_info);
10968+#endif
10969+struct e820map e820;
10970+#ifdef CONFIG_XEN
10971+struct e820map machine_e820;
10972+#endif
10973+
10974+extern void early_cpu_init(void);
10975+extern void generic_apic_probe(char *);
10976+extern int root_mountflags;
10977+
10978+unsigned long saved_videomode;
10979+
10980+#define RAMDISK_IMAGE_START_MASK 0x07FF
10981+#define RAMDISK_PROMPT_FLAG 0x8000
10982+#define RAMDISK_LOAD_FLAG 0x4000
10983+
10984+static char command_line[COMMAND_LINE_SIZE];
10985+
10986+unsigned char __initdata boot_params[PARAM_SIZE];
10987+
10988+static struct resource data_resource = {
10989+ .name = "Kernel data",
10990+ .start = 0,
10991+ .end = 0,
10992+ .flags = IORESOURCE_BUSY | IORESOURCE_MEM
10993+};
10994+
10995+static struct resource code_resource = {
10996+ .name = "Kernel code",
10997+ .start = 0,
10998+ .end = 0,
10999+ .flags = IORESOURCE_BUSY | IORESOURCE_MEM
11000+};
11001+
11002+static struct resource system_rom_resource = {
11003+ .name = "System ROM",
11004+ .start = 0xf0000,
11005+ .end = 0xfffff,
11006+ .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
11007+};
11008+
11009+static struct resource extension_rom_resource = {
11010+ .name = "Extension ROM",
11011+ .start = 0xe0000,
11012+ .end = 0xeffff,
11013+ .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
11014+};
11015+
11016+static struct resource adapter_rom_resources[] = { {
11017+ .name = "Adapter ROM",
11018+ .start = 0xc8000,
11019+ .end = 0,
11020+ .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
11021+}, {
11022+ .name = "Adapter ROM",
11023+ .start = 0,
11024+ .end = 0,
11025+ .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
11026+}, {
11027+ .name = "Adapter ROM",
11028+ .start = 0,
11029+ .end = 0,
11030+ .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
11031+}, {
11032+ .name = "Adapter ROM",
11033+ .start = 0,
11034+ .end = 0,
11035+ .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
11036+}, {
11037+ .name = "Adapter ROM",
11038+ .start = 0,
11039+ .end = 0,
11040+ .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
11041+}, {
11042+ .name = "Adapter ROM",
11043+ .start = 0,
11044+ .end = 0,
11045+ .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
11046+} };
11047+
11048+#define ADAPTER_ROM_RESOURCES \
11049+ (sizeof adapter_rom_resources / sizeof adapter_rom_resources[0])
11050+
11051+static struct resource video_rom_resource = {
11052+ .name = "Video ROM",
11053+ .start = 0xc0000,
11054+ .end = 0xc7fff,
11055+ .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
11056+};
11057+
11058+static struct resource video_ram_resource = {
11059+ .name = "Video RAM area",
11060+ .start = 0xa0000,
11061+ .end = 0xbffff,
11062+ .flags = IORESOURCE_BUSY | IORESOURCE_MEM
11063+};
11064+
11065+static struct resource standard_io_resources[] = { {
11066+ .name = "dma1",
11067+ .start = 0x0000,
11068+ .end = 0x001f,
11069+ .flags = IORESOURCE_BUSY | IORESOURCE_IO
11070+}, {
11071+ .name = "pic1",
11072+ .start = 0x0020,
11073+ .end = 0x0021,
11074+ .flags = IORESOURCE_BUSY | IORESOURCE_IO
11075+}, {
11076+ .name = "timer0",
11077+ .start = 0x0040,
11078+ .end = 0x0043,
11079+ .flags = IORESOURCE_BUSY | IORESOURCE_IO
11080+}, {
11081+ .name = "timer1",
11082+ .start = 0x0050,
11083+ .end = 0x0053,
11084+ .flags = IORESOURCE_BUSY | IORESOURCE_IO
11085+}, {
11086+ .name = "keyboard",
11087+ .start = 0x0060,
11088+ .end = 0x006f,
11089+ .flags = IORESOURCE_BUSY | IORESOURCE_IO
11090+}, {
11091+ .name = "dma page reg",
11092+ .start = 0x0080,
11093+ .end = 0x008f,
11094+ .flags = IORESOURCE_BUSY | IORESOURCE_IO
11095+}, {
11096+ .name = "pic2",
11097+ .start = 0x00a0,
11098+ .end = 0x00a1,
11099+ .flags = IORESOURCE_BUSY | IORESOURCE_IO
11100+}, {
11101+ .name = "dma2",
11102+ .start = 0x00c0,
11103+ .end = 0x00df,
11104+ .flags = IORESOURCE_BUSY | IORESOURCE_IO
11105+}, {
11106+ .name = "fpu",
11107+ .start = 0x00f0,
11108+ .end = 0x00ff,
11109+ .flags = IORESOURCE_BUSY | IORESOURCE_IO
11110+} };
11111+
11112+#define STANDARD_IO_RESOURCES \
11113+ (sizeof standard_io_resources / sizeof standard_io_resources[0])
11114+
11115+#define romsignature(x) (*(unsigned short *)(x) == 0xaa55)
11116+
11117+static int __init romchecksum(unsigned char *rom, unsigned long length)
11118+{
11119+ unsigned char *p, sum = 0;
11120+
11121+ for (p = rom; p < rom + length; p++)
11122+ sum += *p;
11123+ return sum == 0;
11124+}
11125+
11126+static void __init probe_roms(void)
11127+{
11128+ unsigned long start, length, upper;
11129+ unsigned char *rom;
11130+ int i;
11131+
11132+#ifdef CONFIG_XEN
11133+ /* Nothing to do if not running in dom0. */
11134+ if (!is_initial_xendomain())
11135+ return;
11136+#endif
11137+
11138+ /* video rom */
11139+ upper = adapter_rom_resources[0].start;
11140+ for (start = video_rom_resource.start; start < upper; start += 2048) {
11141+ rom = isa_bus_to_virt(start);
11142+ if (!romsignature(rom))
11143+ continue;
11144+
11145+ video_rom_resource.start = start;
11146+
11147+ /* 0 < length <= 0x7f * 512, historically */
11148+ length = rom[2] * 512;
11149+
11150+ /* if checksum okay, trust length byte */
11151+ if (length && romchecksum(rom, length))
11152+ video_rom_resource.end = start + length - 1;
11153+
11154+ request_resource(&iomem_resource, &video_rom_resource);
11155+ break;
11156+ }
11157+
11158+ start = (video_rom_resource.end + 1 + 2047) & ~2047UL;
11159+ if (start < upper)
11160+ start = upper;
11161+
11162+ /* system rom */
11163+ request_resource(&iomem_resource, &system_rom_resource);
11164+ upper = system_rom_resource.start;
11165+
11166+ /* check for extension rom (ignore length byte!) */
11167+ rom = isa_bus_to_virt(extension_rom_resource.start);
11168+ if (romsignature(rom)) {
11169+ length = extension_rom_resource.end - extension_rom_resource.start + 1;
11170+ if (romchecksum(rom, length)) {
11171+ request_resource(&iomem_resource, &extension_rom_resource);
11172+ upper = extension_rom_resource.start;
11173+ }
11174+ }
11175+
11176+ /* check for adapter roms on 2k boundaries */
11177+ for (i = 0; i < ADAPTER_ROM_RESOURCES && start < upper; start += 2048) {
11178+ rom = isa_bus_to_virt(start);
11179+ if (!romsignature(rom))
11180+ continue;
11181+
11182+ /* 0 < length <= 0x7f * 512, historically */
11183+ length = rom[2] * 512;
11184+
11185+ /* but accept any length that fits if checksum okay */
11186+ if (!length || start + length > upper || !romchecksum(rom, length))
11187+ continue;
11188+
11189+ adapter_rom_resources[i].start = start;
11190+ adapter_rom_resources[i].end = start + length - 1;
11191+ request_resource(&iomem_resource, &adapter_rom_resources[i]);
11192+
11193+ start = adapter_rom_resources[i++].end & ~2047UL;
11194+ }
11195+}
11196+
11197+/*
11198+ * Point at the empty zero page to start with. We map the real shared_info
11199+ * page as soon as fixmap is up and running.
11200+ */
11201+shared_info_t *HYPERVISOR_shared_info = (shared_info_t *)empty_zero_page;
11202+EXPORT_SYMBOL(HYPERVISOR_shared_info);
11203+
11204+unsigned long *phys_to_machine_mapping;
11205+unsigned long *pfn_to_mfn_frame_list_list, *pfn_to_mfn_frame_list[16];
11206+EXPORT_SYMBOL(phys_to_machine_mapping);
11207+
11208+/* Raw start-of-day parameters from the hypervisor. */
11209+start_info_t *xen_start_info;
11210+EXPORT_SYMBOL(xen_start_info);
11211+
11212+static void __init add_memory_region(unsigned long long start,
11213+ unsigned long long size, int type)
11214+{
11215+ int x;
11216+
11217+ if (!efi_enabled) {
11218+ x = e820.nr_map;
11219+
11220+ if (x == E820MAX) {
11221+ printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
11222+ return;
11223+ }
11224+
11225+ e820.map[x].addr = start;
11226+ e820.map[x].size = size;
11227+ e820.map[x].type = type;
11228+ e820.nr_map++;
11229+ }
11230+} /* add_memory_region */
11231+
11232+static void __init limit_regions(unsigned long long size)
11233+{
11234+ unsigned long long current_addr = 0;
11235+ int i;
11236+
11237+ if (efi_enabled) {
11238+ efi_memory_desc_t *md;
11239+ void *p;
11240+
11241+ for (p = memmap.map, i = 0; p < memmap.map_end;
11242+ p += memmap.desc_size, i++) {
11243+ md = p;
11244+ current_addr = md->phys_addr + (md->num_pages << 12);
11245+ if (md->type == EFI_CONVENTIONAL_MEMORY) {
11246+ if (current_addr >= size) {
11247+ md->num_pages -=
11248+ (((current_addr-size) + PAGE_SIZE-1) >> PAGE_SHIFT);
11249+ memmap.nr_map = i + 1;
11250+ return;
11251+ }
11252+ }
11253+ }
11254+ }
11255+ for (i = 0; i < e820.nr_map; i++) {
11256+ current_addr = e820.map[i].addr + e820.map[i].size;
11257+ if (current_addr < size)
11258+ continue;
11259+
11260+ if (e820.map[i].type != E820_RAM)
11261+ continue;
11262+
11263+ if (e820.map[i].addr >= size) {
11264+ /*
11265+ * This region starts past the end of the
11266+ * requested size, skip it completely.
11267+ */
11268+ e820.nr_map = i;
11269+ } else {
11270+ e820.nr_map = i + 1;
11271+ e820.map[i].size -= current_addr - size;
11272+ }
11273+ return;
11274+ }
11275+#ifdef CONFIG_XEN
11276+ if (i==e820.nr_map && current_addr < size) {
11277+ /*
11278+ * The e820 map finished before our requested size so
11279+ * extend the final entry to the requested address.
11280+ */
11281+ --i;
11282+ if (e820.map[i].type == E820_RAM)
11283+ e820.map[i].size -= current_addr - size;
11284+ else
11285+ add_memory_region(current_addr, size - current_addr, E820_RAM);
11286+ }
11287+#endif
11288+}
11289+
11290+#define E820_DEBUG 1
11291+
11292+static void __init print_memory_map(char *who)
11293+{
11294+ int i;
11295+
11296+ for (i = 0; i < e820.nr_map; i++) {
11297+ printk(" %s: %016Lx - %016Lx ", who,
11298+ e820.map[i].addr,
11299+ e820.map[i].addr + e820.map[i].size);
11300+ switch (e820.map[i].type) {
11301+ case E820_RAM: printk("(usable)\n");
11302+ break;
11303+ case E820_RESERVED:
11304+ printk("(reserved)\n");
11305+ break;
11306+ case E820_ACPI:
11307+ printk("(ACPI data)\n");
11308+ break;
11309+ case E820_NVS:
11310+ printk("(ACPI NVS)\n");
11311+ break;
11312+ default: printk("type %lu\n", e820.map[i].type);
11313+ break;
11314+ }
11315+ }
11316+}
11317+
11318+/*
11319+ * Sanitize the BIOS e820 map.
11320+ *
11321+ * Some e820 responses include overlapping entries. The following
11322+ * replaces the original e820 map with a new one, removing overlaps.
11323+ *
11324+ */
11325+struct change_member {
11326+ struct e820entry *pbios; /* pointer to original bios entry */
11327+ unsigned long long addr; /* address for this change point */
11328+};
11329+static struct change_member change_point_list[2*E820MAX] __initdata;
11330+static struct change_member *change_point[2*E820MAX] __initdata;
11331+static struct e820entry *overlap_list[E820MAX] __initdata;
11332+static struct e820entry new_bios[E820MAX] __initdata;
11333+
11334+static int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
11335+{
11336+ struct change_member *change_tmp;
11337+ unsigned long current_type, last_type;
11338+ unsigned long long last_addr;
11339+ int chgidx, still_changing;
11340+ int overlap_entries;
11341+ int new_bios_entry;
11342+ int old_nr, new_nr, chg_nr;
11343+ int i;
11344+
11345+ /*
11346+ Visually we're performing the following (1,2,3,4 = memory types)...
11347+
11348+ Sample memory map (w/overlaps):
11349+ ____22__________________
11350+ ______________________4_
11351+ ____1111________________
11352+ _44_____________________
11353+ 11111111________________
11354+ ____________________33__
11355+ ___________44___________
11356+ __________33333_________
11357+ ______________22________
11358+ ___________________2222_
11359+ _________111111111______
11360+ _____________________11_
11361+ _________________4______
11362+
11363+ Sanitized equivalent (no overlap):
11364+ 1_______________________
11365+ _44_____________________
11366+ ___1____________________
11367+ ____22__________________
11368+ ______11________________
11369+ _________1______________
11370+ __________3_____________
11371+ ___________44___________
11372+ _____________33_________
11373+ _______________2________
11374+ ________________1_______
11375+ _________________4______
11376+ ___________________2____
11377+ ____________________33__
11378+ ______________________4_
11379+ */
11380+
11381+ /* if there's only one memory region, don't bother */
11382+ if (*pnr_map < 2)
11383+ return -1;
11384+
11385+ old_nr = *pnr_map;
11386+
11387+ /* bail out if we find any unreasonable addresses in bios map */
11388+ for (i=0; i<old_nr; i++)
11389+ if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr)
11390+ return -1;
11391+
11392+ /* create pointers for initial change-point information (for sorting) */
11393+ for (i=0; i < 2*old_nr; i++)
11394+ change_point[i] = &change_point_list[i];
11395+
11396+ /* record all known change-points (starting and ending addresses),
11397+ omitting those that are for empty memory regions */
11398+ chgidx = 0;
11399+ for (i=0; i < old_nr; i++) {
11400+ if (biosmap[i].size != 0) {
11401+ change_point[chgidx]->addr = biosmap[i].addr;
11402+ change_point[chgidx++]->pbios = &biosmap[i];
11403+ change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size;
11404+ change_point[chgidx++]->pbios = &biosmap[i];
11405+ }
11406+ }
11407+ chg_nr = chgidx; /* true number of change-points */
11408+
11409+ /* sort change-point list by memory addresses (low -> high) */
11410+ still_changing = 1;
11411+ while (still_changing) {
11412+ still_changing = 0;
11413+ for (i=1; i < chg_nr; i++) {
11414+ /* if <current_addr> > <last_addr>, swap */
11415+ /* or, if current=<start_addr> & last=<end_addr>, swap */
11416+ if ((change_point[i]->addr < change_point[i-1]->addr) ||
11417+ ((change_point[i]->addr == change_point[i-1]->addr) &&
11418+ (change_point[i]->addr == change_point[i]->pbios->addr) &&
11419+ (change_point[i-1]->addr != change_point[i-1]->pbios->addr))
11420+ )
11421+ {
11422+ change_tmp = change_point[i];
11423+ change_point[i] = change_point[i-1];
11424+ change_point[i-1] = change_tmp;
11425+ still_changing=1;
11426+ }
11427+ }
11428+ }
11429+
11430+ /* create a new bios memory map, removing overlaps */
11431+ overlap_entries=0; /* number of entries in the overlap table */
11432+ new_bios_entry=0; /* index for creating new bios map entries */
11433+ last_type = 0; /* start with undefined memory type */
11434+ last_addr = 0; /* start with 0 as last starting address */
11435+ /* loop through change-points, determining the effect on the new bios map */
11436+ for (chgidx=0; chgidx < chg_nr; chgidx++)
11437+ {
11438+ /* keep track of all overlapping bios entries */
11439+ if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr)
11440+ {
11441+ /* add map entry to overlap list (> 1 entry implies an overlap) */
11442+ overlap_list[overlap_entries++]=change_point[chgidx]->pbios;
11443+ }
11444+ else
11445+ {
11446+ /* remove entry from list (order independent, so swap with last) */
11447+ for (i=0; i<overlap_entries; i++)
11448+ {
11449+ if (overlap_list[i] == change_point[chgidx]->pbios)
11450+ overlap_list[i] = overlap_list[overlap_entries-1];
11451+ }
11452+ overlap_entries--;
11453+ }
11454+ /* if there are overlapping entries, decide which "type" to use */
11455+ /* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */
11456+ current_type = 0;
11457+ for (i=0; i<overlap_entries; i++)
11458+ if (overlap_list[i]->type > current_type)
11459+ current_type = overlap_list[i]->type;
11460+ /* continue building up new bios map based on this information */
11461+ if (current_type != last_type) {
11462+ if (last_type != 0) {
11463+ new_bios[new_bios_entry].size =
11464+ change_point[chgidx]->addr - last_addr;
11465+ /* move forward only if the new size was non-zero */
11466+ if (new_bios[new_bios_entry].size != 0)
11467+ if (++new_bios_entry >= E820MAX)
11468+ break; /* no more space left for new bios entries */
11469+ }
11470+ if (current_type != 0) {
11471+ new_bios[new_bios_entry].addr = change_point[chgidx]->addr;
11472+ new_bios[new_bios_entry].type = current_type;
11473+ last_addr=change_point[chgidx]->addr;
11474+ }
11475+ last_type = current_type;
11476+ }
11477+ }
11478+ new_nr = new_bios_entry; /* retain count for new bios entries */
11479+
11480+ /* copy new bios mapping into original location */
11481+ memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry));
11482+ *pnr_map = new_nr;
11483+
11484+ return 0;
11485+}
11486+
11487+/*
11488+ * Copy the BIOS e820 map into a safe place.
11489+ *
11490+ * Sanity-check it while we're at it..
11491+ *
11492+ * If we're lucky and live on a modern system, the setup code
11493+ * will have given us a memory map that we can use to properly
11494+ * set up memory. If we aren't, we'll fake a memory map.
11495+ *
11496+ * We check to see that the memory map contains at least 2 elements
11497+ * before we'll use it, because the detection code in setup.S may
11498+ * not be perfect and almost every PC known to man has two memory
11499+ * regions: one from 0 to 640k, and one from 1MB up. (The IBM
11500+ * ThinkPad 560X, for example, does not cooperate with the memory
11501+ * detection code.)
11502+ */
11503+static int __init copy_e820_map(struct e820entry * biosmap, int nr_map)
11504+{
11505+#ifndef CONFIG_XEN
11506+ /* Only one memory region (or negative)? Ignore it */
11507+ if (nr_map < 2)
11508+ return -1;
11509+#else
11510+ BUG_ON(nr_map < 1);
11511+#endif
11512+
11513+ do {
11514+ unsigned long long start = biosmap->addr;
11515+ unsigned long long size = biosmap->size;
11516+ unsigned long long end = start + size;
11517+ unsigned long type = biosmap->type;
11518+
11519+ /* Overflow in 64 bits? Ignore the memory map. */
11520+ if (start > end)
11521+ return -1;
11522+
11523+#ifndef CONFIG_XEN
11524+ /*
11525+ * Some BIOSes claim RAM in the 640k - 1M region.
11526+ * Not right. Fix it up.
11527+ */
11528+ if (type == E820_RAM) {
11529+ if (start < 0x100000ULL && end > 0xA0000ULL) {
11530+ if (start < 0xA0000ULL)
11531+ add_memory_region(start, 0xA0000ULL-start, type);
11532+ if (end <= 0x100000ULL)
11533+ continue;
11534+ start = 0x100000ULL;
11535+ size = end - start;
11536+ }
11537+ }
11538+#endif
11539+ add_memory_region(start, size, type);
11540+ } while (biosmap++,--nr_map);
11541+ return 0;
11542+}
11543+
11544+#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
11545+struct edd edd;
11546+#ifdef CONFIG_EDD_MODULE
11547+EXPORT_SYMBOL(edd);
11548+#endif
11549+/**
11550+ * copy_edd() - Copy the BIOS EDD information
11551+ * from boot_params into a safe place.
11552+ *
11553+ */
11554+static inline void copy_edd(void)
11555+{
11556+ memcpy(edd.mbr_signature, EDD_MBR_SIGNATURE, sizeof(edd.mbr_signature));
11557+ memcpy(edd.edd_info, EDD_BUF, sizeof(edd.edd_info));
11558+ edd.mbr_signature_nr = EDD_MBR_SIG_NR;
11559+ edd.edd_info_nr = EDD_NR;
11560+}
11561+#else
11562+static inline void copy_edd(void)
11563+{
11564+}
11565+#endif
11566+
11567+/*
11568+ * Do NOT EVER look at the BIOS memory size location.
11569+ * It does not work on many machines.
11570+ */
11571+#define LOWMEMSIZE() (0x9f000)
11572+
11573+static void __init parse_cmdline_early (char ** cmdline_p)
11574+{
11575+ char c = ' ', *to = command_line, *from = saved_command_line;
11576+ int len = 0, max_cmdline;
11577+ int userdef = 0;
11578+
11579+ if ((max_cmdline = MAX_GUEST_CMDLINE) > COMMAND_LINE_SIZE)
11580+ max_cmdline = COMMAND_LINE_SIZE;
11581+ memcpy(saved_command_line, xen_start_info->cmd_line, max_cmdline);
11582+ /* Save unparsed command line copy for /proc/cmdline */
11583+ saved_command_line[max_cmdline-1] = '\0';
11584+
11585+ for (;;) {
11586+ if (c != ' ')
11587+ goto next_char;
11588+ /*
11589+ * "mem=nopentium" disables the 4MB page tables.
11590+ * "mem=XXX[kKmM]" defines a memory region from HIGH_MEM
11591+ * to <mem>, overriding the bios size.
11592+ * "memmap=XXX[KkmM]@XXX[KkmM]" defines a memory region from
11593+ * <start> to <start>+<mem>, overriding the bios size.
11594+ *
11595+ * HPA tells me bootloaders need to parse mem=, so no new
11596+ * option should be mem= [also see Documentation/i386/boot.txt]
11597+ */
11598+ if (!memcmp(from, "mem=", 4)) {
11599+ if (to != command_line)
11600+ to--;
11601+ if (!memcmp(from+4, "nopentium", 9)) {
11602+ from += 9+4;
11603+ clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability);
11604+ disable_pse = 1;
11605+ } else {
11606+ /* If the user specifies memory size, we
11607+ * limit the BIOS-provided memory map to
11608+ * that size. exactmap can be used to specify
11609+ * the exact map. mem=number can be used to
11610+ * trim the existing memory map.
11611+ */
11612+ unsigned long long mem_size;
11613+
11614+ mem_size = memparse(from+4, &from);
11615+ limit_regions(mem_size);
11616+ userdef=1;
11617+ }
11618+ }
11619+
11620+ else if (!memcmp(from, "memmap=", 7)) {
11621+ if (to != command_line)
11622+ to--;
11623+ if (!memcmp(from+7, "exactmap", 8)) {
11624+#ifdef CONFIG_CRASH_DUMP
11625+ /* If we are doing a crash dump, we
11626+ * still need to know the real mem
11627+ * size before the original memory map is
11628+ * reset.
11629+ */
11630+ find_max_pfn();
11631+ saved_max_pfn = max_pfn;
11632+#endif
11633+ from += 8+7;
11634+ e820.nr_map = 0;
11635+ userdef = 1;
11636+ } else {
11637+ /* If the user specifies memory size, we
11638+ * limit the BIOS-provided memory map to
11639+ * that size. exactmap can be used to specify
11640+ * the exact map. mem=number can be used to
11641+ * trim the existing memory map.
11642+ */
11643+ unsigned long long start_at, mem_size;
11644+
11645+ mem_size = memparse(from+7, &from);
11646+ if (*from == '@') {
11647+ start_at = memparse(from+1, &from);
11648+ add_memory_region(start_at, mem_size, E820_RAM);
11649+ } else if (*from == '#') {
11650+ start_at = memparse(from+1, &from);
11651+ add_memory_region(start_at, mem_size, E820_ACPI);
11652+ } else if (*from == '$') {
11653+ start_at = memparse(from+1, &from);
11654+ add_memory_region(start_at, mem_size, E820_RESERVED);
11655+ } else {
11656+ limit_regions(mem_size);
11657+ userdef=1;
11658+ }
11659+ }
11660+ }
11661+
11662+ else if (!memcmp(from, "noexec=", 7))
11663+ noexec_setup(from + 7);
11664+
11665+
11666+#ifdef CONFIG_X86_MPPARSE
11667+ /*
11668+ * If the BIOS enumerates physical processors before logical,
11669+ * maxcpus=N at enumeration-time can be used to disable HT.
11670+ */
11671+ else if (!memcmp(from, "maxcpus=", 8)) {
11672+ extern unsigned int maxcpus;
11673+
11674+ maxcpus = simple_strtoul(from + 8, NULL, 0);
11675+ }
11676+#endif
11677+
11678+#ifdef CONFIG_ACPI
11679+ /* "acpi=off" disables both ACPI table parsing and interpreter */
11680+ else if (!memcmp(from, "acpi=off", 8)) {
11681+ disable_acpi();
11682+ }
11683+
11684+ /* acpi=force to over-ride black-list */
11685+ else if (!memcmp(from, "acpi=force", 10)) {
11686+ acpi_force = 1;
11687+ acpi_ht = 1;
11688+ acpi_disabled = 0;
11689+ }
11690+
11691+ /* acpi=strict disables out-of-spec workarounds */
11692+ else if (!memcmp(from, "acpi=strict", 11)) {
11693+ acpi_strict = 1;
11694+ }
11695+
11696+ /* Limit ACPI just to boot-time to enable HT */
11697+ else if (!memcmp(from, "acpi=ht", 7)) {
11698+ if (!acpi_force)
11699+ disable_acpi();
11700+ acpi_ht = 1;
11701+ }
11702+
11703+ /* "pci=noacpi" disables ACPI IRQ routing and PCI scan */
11704+ else if (!memcmp(from, "pci=noacpi", 10)) {
11705+ acpi_disable_pci();
11706+ }
11707+ /* "acpi=noirq" disables ACPI interrupt routing */
11708+ else if (!memcmp(from, "acpi=noirq", 10)) {
11709+ acpi_noirq_set();
11710+ }
11711+
11712+ else if (!memcmp(from, "acpi_sci=edge", 13))
11713+ acpi_sci_flags.trigger = 1;
11714+
11715+ else if (!memcmp(from, "acpi_sci=level", 14))
11716+ acpi_sci_flags.trigger = 3;
11717+
11718+ else if (!memcmp(from, "acpi_sci=high", 13))
11719+ acpi_sci_flags.polarity = 1;
11720+
11721+ else if (!memcmp(from, "acpi_sci=low", 12))
11722+ acpi_sci_flags.polarity = 3;
11723+
11724+#ifdef CONFIG_X86_IO_APIC
11725+ else if (!memcmp(from, "acpi_skip_timer_override", 24))
11726+ acpi_skip_timer_override = 1;
11727+
11728+ if (!memcmp(from, "disable_timer_pin_1", 19))
11729+ disable_timer_pin_1 = 1;
11730+ if (!memcmp(from, "enable_timer_pin_1", 18))
11731+ disable_timer_pin_1 = -1;
11732+
11733+ /* disable IO-APIC */
11734+ else if (!memcmp(from, "noapic", 6))
11735+ disable_ioapic_setup();
11736+#endif /* CONFIG_X86_IO_APIC */
11737+#endif /* CONFIG_ACPI */
11738+
11739+#ifdef CONFIG_X86_LOCAL_APIC
11740+ /* enable local APIC */
11741+ else if (!memcmp(from, "lapic", 5))
11742+ lapic_enable();
11743+
11744+ /* disable local APIC */
11745+ else if (!memcmp(from, "nolapic", 6))
11746+ lapic_disable();
11747+#endif /* CONFIG_X86_LOCAL_APIC */
11748+
11749+#ifdef CONFIG_KEXEC
11750+ /* crashkernel=size@addr specifies the location to reserve for
11751+ * a crash kernel. By reserving this memory we guarantee
11752+ * that Linux never sets it up as a DMA target.
11753+ * Useful for holding code to do something appropriate
11754+ * after a kernel panic.
11755+ */
11756+ else if (!memcmp(from, "crashkernel=", 12)) {
11757+#ifndef CONFIG_XEN
11758+ unsigned long size, base;
11759+ size = memparse(from+12, &from);
11760+ if (*from == '@') {
11761+ base = memparse(from+1, &from);
11762+ /* FIXME: Do I want a sanity check
11763+ * to validate the memory range?
11764+ */
11765+ crashk_res.start = base;
11766+ crashk_res.end = base + size - 1;
11767+ }
11768+#else
11769+ printk("Ignoring crashkernel command line, "
11770+ "parameter will be supplied by xen\n");
11771+#endif
11772+ }
11773+#endif
11774+#ifdef CONFIG_PROC_VMCORE
11775+ /* elfcorehdr= specifies the location of elf core header
11776+ * stored by the crashed kernel.
11777+ */
11778+ else if (!memcmp(from, "elfcorehdr=", 11))
11779+ elfcorehdr_addr = memparse(from+11, &from);
11780+#endif
11781+
11782+ /*
11783+ * highmem=size forces highmem to be exactly 'size' bytes.
11784+ * This works even on boxes that have no highmem otherwise.
11785+ * This also works to reduce highmem size on bigger boxes.
11786+ */
11787+ else if (!memcmp(from, "highmem=", 8))
11788+ highmem_pages = memparse(from+8, &from) >> PAGE_SHIFT;
11789+
11790+ /*
11791+ * vmalloc=size forces the vmalloc area to be exactly 'size'
11792+ * bytes. This can be used to increase (or decrease) the
11793+ * vmalloc area - the default is 128m.
11794+ */
11795+ else if (!memcmp(from, "vmalloc=", 8))
11796+ __VMALLOC_RESERVE = memparse(from+8, &from);
11797+
11798+ next_char:
11799+ c = *(from++);
11800+ if (!c)
11801+ break;
11802+ if (COMMAND_LINE_SIZE <= ++len)
11803+ break;
11804+ *(to++) = c;
11805+ }
11806+ *to = '\0';
11807+ *cmdline_p = command_line;
11808+ if (userdef) {
11809+ printk(KERN_INFO "user-defined physical RAM map:\n");
11810+ print_memory_map("user");
11811+ }
11812+}
11813+
11814+/*
11815+ * Callback for efi_memory_walk.
11816+ */
11817+static int __init
11818+efi_find_max_pfn(unsigned long start, unsigned long end, void *arg)
11819+{
11820+ unsigned long *max_pfn = arg, pfn;
11821+
11822+ if (start < end) {
11823+ pfn = PFN_UP(end -1);
11824+ if (pfn > *max_pfn)
11825+ *max_pfn = pfn;
11826+ }
11827+ return 0;
11828+}
11829+
11830+static int __init
11831+efi_memory_present_wrapper(unsigned long start, unsigned long end, void *arg)
11832+{
11833+ memory_present(0, start, end);
11834+ return 0;
11835+}
11836+
11837+/*
11838+ * Find the highest page frame number we have available
11839+ */
11840+void __init find_max_pfn(void)
11841+{
11842+ int i;
11843+
11844+ max_pfn = 0;
11845+ if (efi_enabled) {
11846+ efi_memmap_walk(efi_find_max_pfn, &max_pfn);
11847+ efi_memmap_walk(efi_memory_present_wrapper, NULL);
11848+ return;
11849+ }
11850+
11851+ for (i = 0; i < e820.nr_map; i++) {
11852+ unsigned long start, end;
11853+ /* RAM? */
11854+ if (e820.map[i].type != E820_RAM)
11855+ continue;
11856+ start = PFN_UP(e820.map[i].addr);
11857+ end = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
11858+ if (start >= end)
11859+ continue;
11860+ if (end > max_pfn)
11861+ max_pfn = end;
11862+ memory_present(0, start, end);
11863+ }
11864+}
11865+
11866+/*
11867+ * Determine low and high memory ranges:
11868+ */
11869+unsigned long __init find_max_low_pfn(void)
11870+{
11871+ unsigned long max_low_pfn;
11872+
11873+ max_low_pfn = max_pfn;
11874+ if (max_low_pfn > MAXMEM_PFN) {
11875+ if (highmem_pages == -1)
11876+ highmem_pages = max_pfn - MAXMEM_PFN;
11877+ if (highmem_pages + MAXMEM_PFN < max_pfn)
11878+ max_pfn = MAXMEM_PFN + highmem_pages;
11879+ if (highmem_pages + MAXMEM_PFN > max_pfn) {
11880+ printk("only %luMB highmem pages available, ignoring highmem size of %uMB.\n", pages_to_mb(max_pfn - MAXMEM_PFN), pages_to_mb(highmem_pages));
11881+ highmem_pages = 0;
11882+ }
11883+ max_low_pfn = MAXMEM_PFN;
11884+#ifndef CONFIG_HIGHMEM
11885+ /* Maximum memory usable is what is directly addressable */
11886+ printk(KERN_WARNING "Warning only %ldMB will be used.\n",
11887+ MAXMEM>>20);
11888+ if (max_pfn > MAX_NONPAE_PFN)
11889+ printk(KERN_WARNING "Use a PAE enabled kernel.\n");
11890+ else
11891+ printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n");
11892+ max_pfn = MAXMEM_PFN;
11893+#else /* !CONFIG_HIGHMEM */
11894+#ifndef CONFIG_X86_PAE
11895+ if (max_pfn > MAX_NONPAE_PFN) {
11896+ max_pfn = MAX_NONPAE_PFN;
11897+ printk(KERN_WARNING "Warning only 4GB will be used.\n");
11898+ printk(KERN_WARNING "Use a PAE enabled kernel.\n");
11899+ }
11900+#endif /* !CONFIG_X86_PAE */
11901+#endif /* !CONFIG_HIGHMEM */
11902+ } else {
11903+ if (highmem_pages == -1)
11904+ highmem_pages = 0;
11905+#ifdef CONFIG_HIGHMEM
11906+ if (highmem_pages >= max_pfn) {
11907+ printk(KERN_ERR "highmem size specified (%uMB) is bigger than pages available (%luMB)!.\n", pages_to_mb(highmem_pages), pages_to_mb(max_pfn));
11908+ highmem_pages = 0;
11909+ }
11910+ if (highmem_pages) {
11911+ if (max_low_pfn-highmem_pages < 64*1024*1024/PAGE_SIZE){
11912+ printk(KERN_ERR "highmem size %uMB results in smaller than 64MB lowmem, ignoring it.\n", pages_to_mb(highmem_pages));
11913+ highmem_pages = 0;
11914+ }
11915+ max_low_pfn -= highmem_pages;
11916+ }
11917+#else
11918+ if (highmem_pages)
11919+ printk(KERN_ERR "ignoring highmem size on non-highmem kernel!\n");
11920+#endif
11921+ }
11922+ return max_low_pfn;
11923+}
11924+
11925+/*
11926+ * Free all available memory for boot time allocation. Used
11927+ * as a callback function by efi_memory_walk()
11928+ */
11929+
11930+static int __init
11931+free_available_memory(unsigned long start, unsigned long end, void *arg)
11932+{
11933+ /* check max_low_pfn */
11934+ if (start >= ((max_low_pfn + 1) << PAGE_SHIFT))
11935+ return 0;
11936+ if (end >= ((max_low_pfn + 1) << PAGE_SHIFT))
11937+ end = (max_low_pfn + 1) << PAGE_SHIFT;
11938+ if (start < end)
11939+ free_bootmem(start, end - start);
11940+
11941+ return 0;
11942+}
11943+/*
11944+ * Register fully available low RAM pages with the bootmem allocator.
11945+ */
11946+static void __init register_bootmem_low_pages(unsigned long max_low_pfn)
11947+{
11948+ int i;
11949+
11950+ if (efi_enabled) {
11951+ efi_memmap_walk(free_available_memory, NULL);
11952+ return;
11953+ }
11954+ for (i = 0; i < e820.nr_map; i++) {
11955+ unsigned long curr_pfn, last_pfn, size;
11956+ /*
11957+ * Reserve usable low memory
11958+ */
11959+ if (e820.map[i].type != E820_RAM)
11960+ continue;
11961+ /*
11962+ * We are rounding up the start address of usable memory:
11963+ */
11964+ curr_pfn = PFN_UP(e820.map[i].addr);
11965+ if (curr_pfn >= max_low_pfn)
11966+ continue;
11967+ /*
11968+ * ... and at the end of the usable range downwards:
11969+ */
11970+ last_pfn = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
11971+
11972+#ifdef CONFIG_XEN
11973+ /*
11974+ * Truncate to the number of actual pages currently
11975+ * present.
11976+ */
11977+ if (last_pfn > xen_start_info->nr_pages)
11978+ last_pfn = xen_start_info->nr_pages;
11979+#endif
11980+
11981+ if (last_pfn > max_low_pfn)
11982+ last_pfn = max_low_pfn;
11983+
11984+ /*
11985+ * .. finally, did all the rounding and playing
11986+ * around just make the area go away?
11987+ */
11988+ if (last_pfn <= curr_pfn)
11989+ continue;
11990+
11991+ size = last_pfn - curr_pfn;
11992+ free_bootmem(PFN_PHYS(curr_pfn), PFN_PHYS(size));
11993+ }
11994+}
11995+
11996+#ifndef CONFIG_XEN
11997+/*
11998+ * workaround for Dell systems that neglect to reserve EBDA
11999+ */
12000+static void __init reserve_ebda_region(void)
12001+{
12002+ unsigned int addr;
12003+ addr = get_bios_ebda();
12004+ if (addr)
12005+ reserve_bootmem(addr, PAGE_SIZE);
12006+}
12007+#endif
12008+
12009+#ifndef CONFIG_NEED_MULTIPLE_NODES
12010+void __init setup_bootmem_allocator(void);
12011+static unsigned long __init setup_memory(void)
12012+{
12013+ /*
12014+ * partially used pages are not usable - thus
12015+ * we are rounding upwards:
12016+ */
12017+ min_low_pfn = PFN_UP(__pa(xen_start_info->pt_base)) +
12018+ xen_start_info->nr_pt_frames;
12019+
12020+ find_max_pfn();
12021+
12022+ max_low_pfn = find_max_low_pfn();
12023+
12024+#ifdef CONFIG_HIGHMEM
12025+ highstart_pfn = highend_pfn = max_pfn;
12026+ if (max_pfn > max_low_pfn) {
12027+ highstart_pfn = max_low_pfn;
12028+ }
12029+ printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
12030+ pages_to_mb(highend_pfn - highstart_pfn));
12031+#endif
12032+ printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
12033+ pages_to_mb(max_low_pfn));
12034+
12035+ setup_bootmem_allocator();
12036+
12037+ return max_low_pfn;
12038+}
12039+
12040+void __init zone_sizes_init(void)
12041+{
12042+ unsigned long zones_size[MAX_NR_ZONES] = {0, 0, 0};
12043+ unsigned int max_dma, low;
12044+
12045+ /*
12046+ * XEN: Our notion of "DMA memory" is fake when running over Xen.
12047+ * We simply put all RAM in the DMA zone so that those drivers which
12048+ * needlessly specify GFP_DMA do not get starved of RAM unnecessarily.
12049+ * Those drivers that *do* require lowmem are screwed anyway when
12050+ * running over Xen!
12051+ */
12052+ max_dma = max_low_pfn;
12053+ low = max_low_pfn;
12054+
12055+ if (low < max_dma)
12056+ zones_size[ZONE_DMA] = low;
12057+ else {
12058+ zones_size[ZONE_DMA] = max_dma;
12059+ zones_size[ZONE_NORMAL] = low - max_dma;
12060+#ifdef CONFIG_HIGHMEM
12061+ zones_size[ZONE_HIGHMEM] = highend_pfn - low;
12062+#endif
12063+ }
12064+ free_area_init(zones_size);
12065+}
12066+#else
12067+extern unsigned long __init setup_memory(void);
12068+extern void zone_sizes_init(void);
12069+#endif /* !CONFIG_NEED_MULTIPLE_NODES */
12070+
12071+void __init setup_bootmem_allocator(void)
12072+{
12073+ unsigned long bootmap_size;
12074+ /*
12075+ * Initialize the boot-time allocator (with low memory only):
12076+ */
12077+ bootmap_size = init_bootmem(min_low_pfn, max_low_pfn);
12078+
12079+ register_bootmem_low_pages(max_low_pfn);
12080+
12081+ /*
12082+ * Reserve the bootmem bitmap itself as well. We do this in two
12083+ * steps (first step was init_bootmem()) because this catches
12084+ * the (very unlikely) case of us accidentally initializing the
12085+ * bootmem allocator with an invalid RAM area.
12086+ */
12087+ reserve_bootmem(__PHYSICAL_START, (PFN_PHYS(min_low_pfn) +
12088+ bootmap_size + PAGE_SIZE-1) - (__PHYSICAL_START));
12089+
12090+#ifndef CONFIG_XEN
12091+ /*
12092+ * reserve physical page 0 - it's a special BIOS page on many boxes,
12093+ * enabling clean reboots, SMP operation, laptop functions.
12094+ */
12095+ reserve_bootmem(0, PAGE_SIZE);
12096+
12097+ /* reserve EBDA region, it's a 4K region */
12098+ reserve_ebda_region();
12099+
12100+ /* could be an AMD 768MPX chipset. Reserve a page before VGA to prevent
12101+ PCI prefetch into it (errata #56). Usually the page is reserved anyways,
12102+ unless you have no PS/2 mouse plugged in. */
12103+ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
12104+ boot_cpu_data.x86 == 6)
12105+ reserve_bootmem(0xa0000 - 4096, 4096);
12106+
12107+#ifdef CONFIG_SMP
12108+ /*
12109+ * But first pinch a few for the stack/trampoline stuff
12110+ * FIXME: Don't need the extra page at 4K, but need to fix
12111+ * trampoline before removing it. (see the GDT stuff)
12112+ */
12113+ reserve_bootmem(PAGE_SIZE, PAGE_SIZE);
12114+#endif
12115+#ifdef CONFIG_ACPI_SLEEP
12116+ /*
12117+ * Reserve low memory region for sleep support.
12118+ */
12119+ acpi_reserve_bootmem();
12120+#endif
12121+#endif /* !CONFIG_XEN */
12122+
12123+#ifdef CONFIG_BLK_DEV_INITRD
12124+ if (xen_start_info->mod_start) {
12125+ if (INITRD_START + INITRD_SIZE <= (max_low_pfn << PAGE_SHIFT)) {
12126+ /*reserve_bootmem(INITRD_START, INITRD_SIZE);*/
12127+ initrd_start = INITRD_START + PAGE_OFFSET;
12128+ initrd_end = initrd_start+INITRD_SIZE;
12129+ initrd_below_start_ok = 1;
12130+ }
12131+ else {
12132+ printk(KERN_ERR "initrd extends beyond end of memory "
12133+ "(0x%08lx > 0x%08lx)\ndisabling initrd\n",
12134+ INITRD_START + INITRD_SIZE,
12135+ max_low_pfn << PAGE_SHIFT);
12136+ initrd_start = 0;
12137+ }
12138+ }
12139+#endif
12140+#ifdef CONFIG_KEXEC
12141+#ifdef CONFIG_XEN
12142+ xen_machine_kexec_setup_resources();
12143+#else
12144+ if (crashk_res.start != crashk_res.end)
12145+ reserve_bootmem(crashk_res.start,
12146+ crashk_res.end - crashk_res.start + 1);
12147+#endif
12148+#endif
12149+
12150+ if (!xen_feature(XENFEAT_auto_translated_physmap))
12151+ phys_to_machine_mapping =
12152+ (unsigned long *)xen_start_info->mfn_list;
12153+}
12154+
12155+/*
12156+ * The node 0 pgdat is initialized before all of these because
12157+ * it's needed for bootmem. node>0 pgdats have their virtual
12158+ * space allocated before the pagetables are in place to access
12159+ * them, so they can't be cleared then.
12160+ *
12161+ * This should all compile down to nothing when NUMA is off.
12162+ */
12163+void __init remapped_pgdat_init(void)
12164+{
12165+ int nid;
12166+
12167+ for_each_online_node(nid) {
12168+ if (nid != 0)
12169+ memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
12170+ }
12171+}
12172+
12173+/*
12174+ * Request address space for all standard RAM and ROM resources
12175+ * and also for regions reported as reserved by the e820.
12176+ */
12177+static void __init
12178+legacy_init_iomem_resources(struct e820entry *e820, int nr_map,
12179+ struct resource *code_resource,
12180+ struct resource *data_resource)
12181+{
12182+ int i;
12183+
12184+ probe_roms();
12185+
12186+ for (i = 0; i < nr_map; i++) {
12187+ struct resource *res;
12188+ if (e820[i].addr + e820[i].size > 0x100000000ULL)
12189+ continue;
12190+ res = alloc_bootmem_low(sizeof(struct resource));
12191+ switch (e820[i].type) {
12192+ case E820_RAM: res->name = "System RAM"; break;
12193+ case E820_ACPI: res->name = "ACPI Tables"; break;
12194+ case E820_NVS: res->name = "ACPI Non-volatile Storage"; break;
12195+ default: res->name = "reserved";
12196+ }
12197+ res->start = e820[i].addr;
12198+ res->end = res->start + e820[i].size - 1;
12199+ res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
12200+ request_resource(&iomem_resource, res);
12201+ if (e820[i].type == E820_RAM) {
12202+ /*
12203+ * We don't know which RAM region contains kernel data,
12204+ * so we try it repeatedly and let the resource manager
12205+ * test it.
12206+ */
12207+#ifndef CONFIG_XEN
12208+ request_resource(res, code_resource);
12209+ request_resource(res, data_resource);
12210+#endif
12211+#ifdef CONFIG_KEXEC
12212+ if (crashk_res.start != crashk_res.end)
12213+ request_resource(res, &crashk_res);
12214+#ifdef CONFIG_XEN
12215+ xen_machine_kexec_register_resources(res);
12216+#endif
12217+#endif
12218+ }
12219+ }
12220+}
12221+
12222+/*
12223+ * Locate an unused range of the physical address space below 4G which
12224+ * can be used for PCI mappings.
12225+ */
12226+static void __init
12227+e820_setup_gap(struct e820entry *e820, int nr_map)
12228+{
12229+ unsigned long gapstart, gapsize, round;
12230+ unsigned long long last;
12231+ int i;
12232+
12233+ /*
12234+ * Search for the biggest gap in the low 32 bits of the e820
12235+ * memory space.
12236+ */
12237+ last = 0x100000000ull;
12238+ gapstart = 0x10000000;
12239+ gapsize = 0x400000;
12240+ i = nr_map;
12241+ while (--i >= 0) {
12242+ unsigned long long start = e820[i].addr;
12243+ unsigned long long end = start + e820[i].size;
12244+
12245+ /*
12246+ * Since "last" is at most 4GB, we know we'll
12247+ * fit in 32 bits if this condition is true
12248+ */
12249+ if (last > end) {
12250+ unsigned long gap = last - end;
12251+
12252+ if (gap > gapsize) {
12253+ gapsize = gap;
12254+ gapstart = end;
12255+ }
12256+ }
12257+ if (start < last)
12258+ last = start;
12259+ }
12260+
12261+ /*
12262+ * See how much we want to round up: start off with
12263+ * rounding to the next 1MB area.
12264+ */
12265+ round = 0x100000;
12266+ while ((gapsize >> 4) > round)
12267+ round += round;
12268+ /* Fun with two's complement */
12269+ pci_mem_start = (gapstart + round) & -round;
12270+
12271+ printk("Allocating PCI resources starting at %08lx (gap: %08lx:%08lx)\n",
12272+ pci_mem_start, gapstart, gapsize);
12273+}
12274+
12275+/*
12276+ * Request address space for all standard resources
12277+ */
12278+static void __init register_memory(void)
12279+{
12280+#ifdef CONFIG_XEN
12281+ struct xen_memory_map memmap;
12282+#endif
12283+ int i;
12284+
12285+ /* Nothing to do if not running in dom0. */
12286+ if (!is_initial_xendomain())
12287+ return;
12288+
12289+#ifdef CONFIG_XEN
12290+ memmap.nr_entries = E820MAX;
12291+ set_xen_guest_handle(memmap.buffer, machine_e820.map);
12292+
12293+ if (HYPERVISOR_memory_op(XENMEM_machine_memory_map, &memmap))
12294+ BUG();
12295+ machine_e820.nr_map = memmap.nr_entries;
12296+
12297+ legacy_init_iomem_resources(machine_e820.map, machine_e820.nr_map,
12298+ &code_resource, &data_resource);
12299+#else
12300+ if (efi_enabled)
12301+ efi_initialize_iomem_resources(&code_resource, &data_resource);
12302+ else
12303+ legacy_init_iomem_resources(e820.map, e820.nr_map,
12304+ &code_resource, &data_resource);
12305+#endif
12306+
12307+ /* EFI systems may still have VGA */
12308+ request_resource(&iomem_resource, &video_ram_resource);
12309+
12310+ /* request I/O space for devices used on all i[345]86 PCs */
12311+ for (i = 0; i < STANDARD_IO_RESOURCES; i++)
12312+ request_resource(&ioport_resource, &standard_io_resources[i]);
12313+
12314+#ifdef CONFIG_XEN
12315+ e820_setup_gap(machine_e820.map, machine_e820.nr_map);
12316+#else
12317+ e820_setup_gap(e820.map, e820.nr_map);
12318+#endif
12319+}
12320+
12321+/* Use inline assembly to define this because the nops are defined
12322+ as inline assembly strings in the include files and we cannot
12323+ get them easily into strings. */
12324+asm("\t.data\nintelnops: "
12325+ GENERIC_NOP1 GENERIC_NOP2 GENERIC_NOP3 GENERIC_NOP4 GENERIC_NOP5 GENERIC_NOP6
12326+ GENERIC_NOP7 GENERIC_NOP8);
12327+asm("\t.data\nk8nops: "
12328+ K8_NOP1 K8_NOP2 K8_NOP3 K8_NOP4 K8_NOP5 K8_NOP6
12329+ K8_NOP7 K8_NOP8);
12330+asm("\t.data\nk7nops: "
12331+ K7_NOP1 K7_NOP2 K7_NOP3 K7_NOP4 K7_NOP5 K7_NOP6
12332+ K7_NOP7 K7_NOP8);
12333+
12334+extern unsigned char intelnops[], k8nops[], k7nops[];
12335+static unsigned char *intel_nops[ASM_NOP_MAX+1] = {
12336+ NULL,
12337+ intelnops,
12338+ intelnops + 1,
12339+ intelnops + 1 + 2,
12340+ intelnops + 1 + 2 + 3,
12341+ intelnops + 1 + 2 + 3 + 4,
12342+ intelnops + 1 + 2 + 3 + 4 + 5,
12343+ intelnops + 1 + 2 + 3 + 4 + 5 + 6,
12344+ intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
12345+};
12346+static unsigned char *k8_nops[ASM_NOP_MAX+1] = {
12347+ NULL,
12348+ k8nops,
12349+ k8nops + 1,
12350+ k8nops + 1 + 2,
12351+ k8nops + 1 + 2 + 3,
12352+ k8nops + 1 + 2 + 3 + 4,
12353+ k8nops + 1 + 2 + 3 + 4 + 5,
12354+ k8nops + 1 + 2 + 3 + 4 + 5 + 6,
12355+ k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
12356+};
12357+static unsigned char *k7_nops[ASM_NOP_MAX+1] = {
12358+ NULL,
12359+ k7nops,
12360+ k7nops + 1,
12361+ k7nops + 1 + 2,
12362+ k7nops + 1 + 2 + 3,
12363+ k7nops + 1 + 2 + 3 + 4,
12364+ k7nops + 1 + 2 + 3 + 4 + 5,
12365+ k7nops + 1 + 2 + 3 + 4 + 5 + 6,
12366+ k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
12367+};
12368+static struct nop {
12369+ int cpuid;
12370+ unsigned char **noptable;
12371+} noptypes[] = {
12372+ { X86_FEATURE_K8, k8_nops },
12373+ { X86_FEATURE_K7, k7_nops },
12374+ { -1, NULL }
12375+};
12376+
12377+/* Replace instructions with better alternatives for this CPU type.
12378+
12379+ This runs before SMP is initialized to avoid SMP problems with
12380+ self-modifying code. This implies that asymmetric systems where
12381+ APs have fewer capabilities than the boot processor are not handled.
12382+ Tough. Make sure you disable such features by hand. */
12383+void apply_alternatives(void *start, void *end)
12384+{
12385+ struct alt_instr *a;
12386+ int diff, i, k;
12387+ unsigned char **noptable = intel_nops;
12388+ for (i = 0; noptypes[i].cpuid >= 0; i++) {
12389+ if (boot_cpu_has(noptypes[i].cpuid)) {
12390+ noptable = noptypes[i].noptable;
12391+ break;
12392+ }
12393+ }
12394+ for (a = start; (void *)a < end; a++) {
12395+ if (!boot_cpu_has(a->cpuid))
12396+ continue;
12397+ BUG_ON(a->replacementlen > a->instrlen);
12398+ memcpy(a->instr, a->replacement, a->replacementlen);
12399+ diff = a->instrlen - a->replacementlen;
12400+ /* Pad the rest with nops */
12401+ for (i = a->replacementlen; diff > 0; diff -= k, i += k) {
12402+ k = diff;
12403+ if (k > ASM_NOP_MAX)
12404+ k = ASM_NOP_MAX;
12405+ memcpy(a->instr + i, noptable[k], k);
12406+ }
12407+ }
12408+}
12409+
12410+void __init alternative_instructions(void)
12411+{
12412+ extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
12413+ apply_alternatives(__alt_instructions, __alt_instructions_end);
12414+}
12415+
12416+static char * __init machine_specific_memory_setup(void);
12417+
12418+#ifdef CONFIG_MCA
12419+static void set_mca_bus(int x)
12420+{
12421+ MCA_bus = x;
12422+}
12423+#else
12424+static void set_mca_bus(int x) { }
12425+#endif
12426+
12427+/*
12428+ * Determine if we were loaded by an EFI loader. If so, then we have also been
12429+ * passed the efi memmap, systab, etc., so we should use these data structures
12430+ * for initialization. Note, the efi init code path is determined by the
12431+ * global efi_enabled. This allows the same kernel image to be used on existing
12432+ * systems (with a traditional BIOS) as well as on EFI systems.
12433+ */
12434+void __init setup_arch(char **cmdline_p)
12435+{
12436+ int i, j, k, fpp;
12437+ struct physdev_set_iopl set_iopl;
12438+ unsigned long max_low_pfn;
12439+
12440+ /* Force a quick death if the kernel panics (not domain 0). */
12441+ extern int panic_timeout;
12442+ if (!panic_timeout && !is_initial_xendomain())
12443+ panic_timeout = 1;
12444+
12445+ /* Register a call for panic conditions. */
12446+ notifier_chain_register(&panic_notifier_list, &xen_panic_block);
12447+
12448+ HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments);
12449+ HYPERVISOR_vm_assist(VMASST_CMD_enable,
12450+ VMASST_TYPE_writable_pagetables);
12451+
12452+ memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
12453+ early_cpu_init();
12454+
12455+ /*
12456+ * FIXME: This isn't an official loader_type right
12457+ * now but does currently work with elilo.
12458+ * If we were configured as an EFI kernel, check to make
12459+ * sure that we were loaded correctly from elilo and that
12460+ * the system table is valid. If not, then initialize normally.
12461+ */
12462+#ifdef CONFIG_EFI
12463+ if ((LOADER_TYPE == 0x50) && EFI_SYSTAB)
12464+ efi_enabled = 1;
12465+#endif
12466+
12467+ /* This must be initialized to UNNAMED_MAJOR for ipconfig to work
12468+ properly. Setting ROOT_DEV to default to /dev/ram0 breaks initrd.
12469+ */
12470+ ROOT_DEV = MKDEV(UNNAMED_MAJOR,0);
12471+ drive_info = DRIVE_INFO;
12472+ screen_info = SCREEN_INFO;
12473+ edid_info = EDID_INFO;
12474+ apm_info.bios = APM_BIOS_INFO;
12475+ ist_info = IST_INFO;
12476+ saved_videomode = VIDEO_MODE;
12477+ if( SYS_DESC_TABLE.length != 0 ) {
12478+ set_mca_bus(SYS_DESC_TABLE.table[3] & 0x2);
12479+ machine_id = SYS_DESC_TABLE.table[0];
12480+ machine_submodel_id = SYS_DESC_TABLE.table[1];
12481+ BIOS_revision = SYS_DESC_TABLE.table[2];
12482+ }
12483+ bootloader_type = LOADER_TYPE;
12484+
12485+ if (is_initial_xendomain()) {
12486+ /* This is drawn from a dump from vgacon:startup in
12487+ * standard Linux. */
12488+ screen_info.orig_video_mode = 3;
12489+ screen_info.orig_video_isVGA = 1;
12490+ screen_info.orig_video_lines = 25;
12491+ screen_info.orig_video_cols = 80;
12492+ screen_info.orig_video_ega_bx = 3;
12493+ screen_info.orig_video_points = 16;
12494+ screen_info.orig_y = screen_info.orig_video_lines - 1;
12495+ if (xen_start_info->console.dom0.info_size >=
12496+ sizeof(struct dom0_vga_console_info)) {
12497+ const struct dom0_vga_console_info *info =
12498+ (struct dom0_vga_console_info *)(
12499+ (char *)xen_start_info +
12500+ xen_start_info->console.dom0.info_off);
12501+ dom0_init_screen_info(info);
12502+ }
12503+ xen_start_info->console.domU.mfn = 0;
12504+ xen_start_info->console.domU.evtchn = 0;
12505+ } else
12506+ screen_info.orig_video_isVGA = 0;
12507+
12508+#ifdef CONFIG_BLK_DEV_RAM
12509+ rd_image_start = RAMDISK_FLAGS & RAMDISK_IMAGE_START_MASK;
12510+ rd_prompt = ((RAMDISK_FLAGS & RAMDISK_PROMPT_FLAG) != 0);
12511+ rd_doload = ((RAMDISK_FLAGS & RAMDISK_LOAD_FLAG) != 0);
12512+#endif
12513+
12514+ setup_xen_features();
12515+
12516+ ARCH_SETUP
12517+ if (efi_enabled)
12518+ efi_init();
12519+ else {
12520+ printk(KERN_INFO "BIOS-provided physical RAM map:\n");
12521+ print_memory_map(machine_specific_memory_setup());
12522+ }
12523+
12524+ copy_edd();
12525+
12526+ if (!MOUNT_ROOT_RDONLY)
12527+ root_mountflags &= ~MS_RDONLY;
12528+ init_mm.start_code = (unsigned long) _text;
12529+ init_mm.end_code = (unsigned long) _etext;
12530+ init_mm.end_data = (unsigned long) _edata;
12531+ init_mm.brk = (PFN_UP(__pa(xen_start_info->pt_base)) +
12532+ xen_start_info->nr_pt_frames) << PAGE_SHIFT;
12533+
12534+ code_resource.start = virt_to_phys(_text);
12535+ code_resource.end = virt_to_phys(_etext)-1;
12536+ data_resource.start = virt_to_phys(_etext);
12537+ data_resource.end = virt_to_phys(_edata)-1;
12538+
12539+ parse_cmdline_early(cmdline_p);
12540+
12541+ max_low_pfn = setup_memory();
12542+
12543+ /*
12544+ * NOTE: before this point _nobody_ is allowed to allocate
12545+ * any memory using the bootmem allocator. Although the
12546+ * allocator is now initialised, only the first 8Mb of the kernel
12547+ * virtual address space has been mapped. All allocations before
12548+ * paging_init() has completed must use the alloc_bootmem_low_pages()
12549+ * variant (which allocates DMA'able memory) and care must be taken
12550+ * not to exceed the 8Mb limit.
12551+ */
12552+
12553+#ifdef CONFIG_SMP
12554+ smp_alloc_memory(); /* AP processor realmode stacks in low memory*/
12555+#endif
12556+ paging_init();
12557+ remapped_pgdat_init();
12558+ sparse_init();
12559+ zone_sizes_init();
12560+
12561+#ifdef CONFIG_X86_FIND_SMP_CONFIG
12562+ /*
12563+ * Find and reserve possible boot-time SMP configuration:
12564+ */
12565+ find_smp_config();
12566+#endif
12567+
12568+ /* Make sure we have a correctly sized P->M table. */
12569+ if (!xen_feature(XENFEAT_auto_translated_physmap)) {
12570+ phys_to_machine_mapping = alloc_bootmem_low_pages(
12571+ max_pfn * sizeof(unsigned long));
12572+ memset(phys_to_machine_mapping, ~0,
12573+ max_pfn * sizeof(unsigned long));
12574+ memcpy(phys_to_machine_mapping,
12575+ (unsigned long *)xen_start_info->mfn_list,
12576+ xen_start_info->nr_pages * sizeof(unsigned long));
12577+ free_bootmem(
12578+ __pa(xen_start_info->mfn_list),
12579+ PFN_PHYS(PFN_UP(xen_start_info->nr_pages *
12580+ sizeof(unsigned long))));
12581+
12582+ /*
12583+ * Initialise the list of the frames that specify the list of
12584+ * frames that make up the p2m table. Used by save/restore
12585+ */
12586+ pfn_to_mfn_frame_list_list = alloc_bootmem_low_pages(PAGE_SIZE);
12587+ HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
12588+ virt_to_mfn(pfn_to_mfn_frame_list_list);
12589+
12590+ fpp = PAGE_SIZE/sizeof(unsigned long);
12591+ for (i=0, j=0, k=-1; i< max_pfn; i+=fpp, j++) {
12592+ if ((j % fpp) == 0) {
12593+ k++;
12594+ BUG_ON(k>=16);
12595+ pfn_to_mfn_frame_list[k] =
12596+ alloc_bootmem_low_pages(PAGE_SIZE);
12597+ pfn_to_mfn_frame_list_list[k] =
12598+ virt_to_mfn(pfn_to_mfn_frame_list[k]);
12599+ j=0;
12600+ }
12601+ pfn_to_mfn_frame_list[k][j] =
12602+ virt_to_mfn(&phys_to_machine_mapping[i]);
12603+ }
12604+ HYPERVISOR_shared_info->arch.max_pfn = max_pfn;
12605+ }
12606+
12607+ /*
12608+ * NOTE: at this point the bootmem allocator is fully available.
12609+ */
12610+
12611+#ifdef CONFIG_EARLY_PRINTK
12612+ {
12613+ char *s = strstr(*cmdline_p, "earlyprintk=");
12614+ if (s) {
12615+ extern void setup_early_printk(char *);
12616+
12617+ setup_early_printk(strchr(s, '=') + 1);
12618+ printk("early console enabled\n");
12619+ }
12620+ }
12621+#endif
12622+
12623+ if (is_initial_xendomain())
12624+ dmi_scan_machine();
12625+
12626+#ifdef CONFIG_X86_GENERICARCH
12627+ generic_apic_probe(*cmdline_p);
12628+#endif
12629+ if (efi_enabled)
12630+ efi_map_memmap();
12631+
12632+ set_iopl.iopl = 1;
12633+ HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
12634+
12635+#ifdef CONFIG_X86_IO_APIC
12636+ check_acpi_pci(); /* Checks more than just ACPI actually */
12637+#endif
12638+
12639+#ifdef CONFIG_ACPI
12640+ if (!is_initial_xendomain()) {
12641+ printk(KERN_INFO "ACPI in unprivileged domain disabled\n");
12642+ acpi_disabled = 1;
12643+ acpi_ht = 0;
12644+ }
12645+
12646+ /*
12647+ * Parse the ACPI tables for possible boot-time SMP configuration.
12648+ */
12649+ acpi_boot_table_init();
12650+ acpi_boot_init();
12651+
12652+#if defined(CONFIG_SMP) && defined(CONFIG_X86_PC)
12653+ if (def_to_bigsmp)
12654+ printk(KERN_WARNING "More than 8 CPUs detected and "
12655+ "CONFIG_X86_PC cannot handle it.\nUse "
12656+ "CONFIG_X86_GENERICARCH or CONFIG_X86_BIGSMP.\n");
12657+#endif
12658+#endif
12659+#ifdef CONFIG_X86_LOCAL_APIC
12660+ if (smp_found_config)
12661+ get_smp_config();
12662+#endif
12663+
12664+ register_memory();
12665+
12666+ if (is_initial_xendomain()) {
12667+#ifdef CONFIG_VT
12668+#if defined(CONFIG_VGA_CONSOLE)
12669+ if (!efi_enabled ||
12670+ (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY))
12671+ conswitchp = &vga_con;
12672+#elif defined(CONFIG_DUMMY_CONSOLE)
12673+ conswitchp = &dummy_con;
12674+#endif
12675+#endif
12676+ } else {
12677+#if defined(CONFIG_VT) && defined(CONFIG_DUMMY_CONSOLE)
12678+ conswitchp = &dummy_con;
12679+#endif
12680+ }
12681+ xencons_early_setup();
12682+}
12683+
12684+static int
12685+xen_panic_event(struct notifier_block *this, unsigned long event, void *ptr)
12686+{
12687+ HYPERVISOR_shutdown(SHUTDOWN_crash);
12688+ /* we're never actually going to get here... */
12689+ return NOTIFY_DONE;
12690+}
12691+
12692+#include "setup_arch_post.h"
12693+/*
12694+ * Local Variables:
12695+ * mode:c
12696+ * c-file-style:"k&r"
12697+ * c-basic-offset:8
12698+ * End:
12699+ */
12700diff -Nur linux-2.6.16.33-noxen/arch/i386/kernel/smp-xen.c linux-2.6.16.33/arch/i386/kernel/smp-xen.c
12701--- linux-2.6.16.33-noxen/arch/i386/kernel/smp-xen.c 1970-01-01 00:00:00.000000000 +0000
12702+++ linux-2.6.16.33/arch/i386/kernel/smp-xen.c 2007-01-08 15:00:45.000000000 +0000
12703@@ -0,0 +1,618 @@
12704+/*
12705+ * Intel SMP support routines.
12706+ *
12707+ * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
12708+ * (c) 1998-99, 2000 Ingo Molnar <mingo@redhat.com>
12709+ *
12710+ * This code is released under the GNU General Public License version 2 or
12711+ * later.
12712+ */
12713+
12714+#include <linux/init.h>
12715+
12716+#include <linux/mm.h>
12717+#include <linux/delay.h>
12718+#include <linux/spinlock.h>
12719+#include <linux/smp_lock.h>
12720+#include <linux/kernel_stat.h>
12721+#include <linux/mc146818rtc.h>
12722+#include <linux/cache.h>
12723+#include <linux/interrupt.h>
12724+#include <linux/cpu.h>
12725+#include <linux/module.h>
12726+
12727+#include <asm/mtrr.h>
12728+#include <asm/tlbflush.h>
12729+#if 0
12730+#include <mach_apic.h>
12731+#endif
12732+#include <xen/evtchn.h>
12733+
12734+/*
12735+ * Some notes on x86 processor bugs affecting SMP operation:
12736+ *
12737+ * Pentium, Pentium Pro, II, III (and all CPUs) have bugs.
12738+ * The Linux implications for SMP are handled as follows:
12739+ *
12740+ * Pentium III / [Xeon]
12741+ * None of the E1AP-E3AP errata are visible to the user.
12742+ *
12743+ * E1AP. see PII A1AP
12744+ * E2AP. see PII A2AP
12745+ * E3AP. see PII A3AP
12746+ *
12747+ * Pentium II / [Xeon]
12748+ * None of the A1AP-A3AP errata are visible to the user.
12749+ *
12750+ * A1AP. see PPro 1AP
12751+ * A2AP. see PPro 2AP
12752+ * A3AP. see PPro 7AP
12753+ *
12754+ * Pentium Pro
12755+ * None of 1AP-9AP errata are visible to the normal user,
12756+ * except occasional delivery of 'spurious interrupt' as trap #15.
12757+ * This is very rare and a non-problem.
12758+ *
12759+ * 1AP. Linux maps APIC as non-cacheable
12760+ * 2AP. worked around in hardware
12761+ * 3AP. fixed in C0 and above steppings microcode update.
12762+ * Linux does not use excessive STARTUP_IPIs.
12763+ * 4AP. worked around in hardware
12764+ * 5AP. symmetric IO mode (normal Linux operation) not affected.
12765+ * 'noapic' mode has vector 0xf filled out properly.
12766+ * 6AP. 'noapic' mode might be affected - fixed in later steppings
12767+ * 7AP. We do not assume writes to the LVT deasserting IRQs
12768+ * 8AP. We do not enable low power mode (deep sleep) during MP bootup
12769+ * 9AP. We do not use mixed mode
12770+ *
12771+ * Pentium
12772+ * There is a marginal case where REP MOVS on 100MHz SMP
12773+ * machines with B stepping processors can fail. XXX should provide
12774+ * an L1cache=Writethrough or L1cache=off option.
12775+ *
12776+ * B stepping CPUs may hang. There are hardware workarounds
12777+ * for this. We warn about it in case your board doesn't have the
12778+ * workarounds. Basically that's so I can tell anyone with a B stepping
12779+ * CPU and SMP problems "tough".
12780+ *
12781+ * Specific items [From Pentium Processor Specification Update]
12782+ *
12783+ * 1AP. Linux doesn't use remote read
12784+ * 2AP. Linux doesn't trust APIC errors
12785+ * 3AP. We work around this
12786+ * 4AP. Linux never generated 3 interrupts of the same priority
12787+ * to cause a lost local interrupt.
12788+ * 5AP. Remote read is never used
12789+ * 6AP. not affected - worked around in hardware
12790+ * 7AP. not affected - worked around in hardware
12791+ * 8AP. worked around in hardware - we get explicit CS errors if not
12792+ * 9AP. only 'noapic' mode affected. Might generate spurious
12793+ * interrupts, we log only the first one and count the
12794+ * rest silently.
12795+ * 10AP. not affected - worked around in hardware
12796+ * 11AP. Linux reads the APIC between writes to avoid this, as per
12797+ * the documentation. Make sure you preserve this as it affects
12798+ * the C stepping chips too.
12799+ * 12AP. not affected - worked around in hardware
12800+ * 13AP. not affected - worked around in hardware
12801+ * 14AP. we always deassert INIT during bootup
12802+ * 15AP. not affected - worked around in hardware
12803+ * 16AP. not affected - worked around in hardware
12804+ * 17AP. not affected - worked around in hardware
12805+ * 18AP. not affected - worked around in hardware
12806+ * 19AP. not affected - worked around in BIOS
12807+ *
12808+ * If this sounds worrying, believe me, these bugs are either ___RARE___,
12809+ * or are signal timing bugs worked around in hardware, and there's
12810+ * essentially nothing of note with C stepping upwards.
12811+ */
12812+
12813+DEFINE_PER_CPU(struct tlb_state, cpu_tlbstate) ____cacheline_aligned = { &init_mm, 0, };
12814+
12815+/*
12816+ * the following functions deal with sending IPIs between CPUs.
12817+ *
12818+ * We use 'broadcast', CPU->CPU IPIs and self-IPIs too.
12819+ */
12820+
12821+static inline int __prepare_ICR (unsigned int shortcut, int vector)
12822+{
12823+ return APIC_DM_FIXED | shortcut | vector | APIC_DEST_LOGICAL;
12824+}
12825+
12826+static inline int __prepare_ICR2 (unsigned int mask)
12827+{
12828+ return SET_APIC_DEST_FIELD(mask);
12829+}
12830+
12831+DECLARE_PER_CPU(int, ipi_to_irq[NR_IPIS]);
12832+
12833+static inline void __send_IPI_one(unsigned int cpu, int vector)
12834+{
12835+ int irq = per_cpu(ipi_to_irq, cpu)[vector];
12836+ BUG_ON(irq < 0);
12837+ notify_remote_via_irq(irq);
12838+}
12839+
12840+void __send_IPI_shortcut(unsigned int shortcut, int vector)
12841+{
12842+ int cpu;
12843+
12844+ switch (shortcut) {
12845+ case APIC_DEST_SELF:
12846+ __send_IPI_one(smp_processor_id(), vector);
12847+ break;
12848+ case APIC_DEST_ALLBUT:
12849+ for (cpu = 0; cpu < NR_CPUS; ++cpu) {
12850+ if (cpu == smp_processor_id())
12851+ continue;
12852+ if (cpu_isset(cpu, cpu_online_map)) {
12853+ __send_IPI_one(cpu, vector);
12854+ }
12855+ }
12856+ break;
12857+ default:
12858+ printk("XXXXXX __send_IPI_shortcut %08x vector %d\n", shortcut,
12859+ vector);
12860+ break;
12861+ }
12862+}
12863+
12864+void fastcall send_IPI_self(int vector)
12865+{
12866+ __send_IPI_shortcut(APIC_DEST_SELF, vector);
12867+}
12868+
12869+/*
12870+ * This is only used on smaller machines.
12871+ */
12872+void send_IPI_mask_bitmask(cpumask_t mask, int vector)
12873+{
12874+ unsigned long flags;
12875+ unsigned int cpu;
12876+
12877+ local_irq_save(flags);
12878+ WARN_ON(cpus_addr(mask)[0] & ~cpus_addr(cpu_online_map)[0]);
12879+
12880+ for (cpu = 0; cpu < NR_CPUS; ++cpu) {
12881+ if (cpu_isset(cpu, mask)) {
12882+ __send_IPI_one(cpu, vector);
12883+ }
12884+ }
12885+
12886+ local_irq_restore(flags);
12887+}
12888+
12889+void send_IPI_mask_sequence(cpumask_t mask, int vector)
12890+{
12891+
12892+ send_IPI_mask_bitmask(mask, vector);
12893+}
12894+
12895+#include <mach_ipi.h> /* must come after the send_IPI functions above for inlining */
12896+
12897+#if 0 /* XEN */
12898+/*
12899+ * Smarter SMP flushing macros.
12900+ * c/o Linus Torvalds.
12901+ *
12902+ * These mean you can really definitely utterly forget about
12903+ * writing to user space from interrupts. (It's not allowed anyway.)
12904+ *
12905+ * Optimizations Manfred Spraul <manfred@colorfullife.com>
12906+ */
12907+
12908+static cpumask_t flush_cpumask;
12909+static struct mm_struct * flush_mm;
12910+static unsigned long flush_va;
12911+static DEFINE_SPINLOCK(tlbstate_lock);
12912+#define FLUSH_ALL 0xffffffff
12913+
12914+/*
12915+ * We cannot call mmdrop() because we are in interrupt context,
12916+ * instead update mm->cpu_vm_mask.
12917+ *
12918+ * We need to reload %cr3 since the page tables may be going
12919+ * away from under us..
12920+ */
12921+static inline void leave_mm (unsigned long cpu)
12922+{
12923+ if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK)
12924+ BUG();
12925+ cpu_clear(cpu, per_cpu(cpu_tlbstate, cpu).active_mm->cpu_vm_mask);
12926+ load_cr3(swapper_pg_dir);
12927+}
12928+
12929+/*
12930+ *
12931+ * The flush IPI assumes that a thread switch happens in this order:
12932+ * [cpu0: the cpu that switches]
12933+ * 1) switch_mm() either 1a) or 1b)
12934+ * 1a) thread switch to a different mm
12935+ * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask);
12936+ * Stop ipi delivery for the old mm. This is not synchronized with
12937+ * the other cpus, but smp_invalidate_interrupt ignore flush ipis
12938+ * the other cpus, but smp_invalidate_interrupt ignores flush ipis
12939+ * for the wrong mm, and in the worst case we perform a superfluous
12940+ * 1a2) set cpu_tlbstate to TLBSTATE_OK
12941+ * Now the smp_invalidate_interrupt won't call leave_mm if cpu0
12942+ * was in lazy tlb mode.
12943+ * 1a3) update cpu_tlbstate[].active_mm
12944+ * Now cpu0 accepts tlb flushes for the new mm.
12945+ * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask);
12946+ * Now the other cpus will send tlb flush ipis.
12947+ * 1a5) change cr3.
12948+ * 1b) thread switch without mm change
12949+ * cpu_tlbstate[].active_mm is correct, cpu0 already handles
12950+ * flush ipis.
12951+ * 1b1) set cpu_tlbstate to TLBSTATE_OK
12952+ * 1b2) test_and_set the cpu bit in cpu_vm_mask.
12953+ * Atomically set the bit [other cpus will start sending flush ipis],
12954+ * and test the bit.
12955+ * 1b3) if the bit was 0: leave_mm was called, flush the tlb.
12956+ * 2) switch %%esp, ie current
12957+ *
12958+ * The interrupt must handle 2 special cases:
12959+ * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm.
12960+ * - the cpu performs speculative tlb reads, i.e. even if the cpu only
12961+ * runs in kernel space, the cpu could load tlb entries for user space
12962+ * pages.
12963+ *
12964+ * The good news is that cpu_tlbstate is local to each cpu, no
12965+ * write/read ordering problems.
12966+ */
12967+
12968+/*
12969+ * TLB flush IPI:
12970+ *
12971+ * 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
12972+ * 2) Leave the mm if we are in the lazy tlb mode.
12973+ */
12974+
12975+irqreturn_t smp_invalidate_interrupt(int irq, void *dev_id,
12976+ struct pt_regs *regs)
12977+{
12978+ unsigned long cpu;
12979+
12980+ cpu = get_cpu();
12981+
12982+ if (!cpu_isset(cpu, flush_cpumask))
12983+ goto out;
12984+ /*
12985+ * This was a BUG() but until someone can quote me the
12986+ * line from the intel manual that guarantees an IPI to
12987+ * multiple CPUs is retried _only_ on the erroring CPUs,
12988+ * it's staying as a return
12989+ *
12990+ * BUG();
12991+ */
12992+
12993+ if (flush_mm == per_cpu(cpu_tlbstate, cpu).active_mm) {
12994+ if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) {
12995+ if (flush_va == FLUSH_ALL)
12996+ local_flush_tlb();
12997+ else
12998+ __flush_tlb_one(flush_va);
12999+ } else
13000+ leave_mm(cpu);
13001+ }
13002+ smp_mb__before_clear_bit();
13003+ cpu_clear(cpu, flush_cpumask);
13004+ smp_mb__after_clear_bit();
13005+out:
13006+ put_cpu_no_resched();
13007+
13008+ return IRQ_HANDLED;
13009+}
13010+
13011+static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm,
13012+ unsigned long va)
13013+{
13014+ /*
13015+ * A couple of (to be removed) sanity checks:
13016+ *
13017+ * - current CPU must not be in mask
13018+ * - mask must exist :)
13019+ */
13020+ BUG_ON(cpus_empty(cpumask));
13021+ BUG_ON(cpu_isset(smp_processor_id(), cpumask));
13022+ BUG_ON(!mm);
13023+
13024+ /* If a CPU which we ran on has gone down, OK. */
13025+ cpus_and(cpumask, cpumask, cpu_online_map);
13026+ if (cpus_empty(cpumask))
13027+ return;
13028+
13029+ /*
13030+ * I'm not happy about this global shared spinlock in the
13031+ * MM hot path, but we'll see how contended it is.
13032+ * Temporarily this turns IRQs off, so that lockups are
13033+ * detected by the NMI watchdog.
13034+ */
13035+ spin_lock(&tlbstate_lock);
13036+
13037+ flush_mm = mm;
13038+ flush_va = va;
13039+#if NR_CPUS <= BITS_PER_LONG
13040+ atomic_set_mask(cpumask, &flush_cpumask);
13041+#else
13042+ {
13043+ int k;
13044+ unsigned long *flush_mask = (unsigned long *)&flush_cpumask;
13045+ unsigned long *cpu_mask = (unsigned long *)&cpumask;
13046+ for (k = 0; k < BITS_TO_LONGS(NR_CPUS); ++k)
13047+ atomic_set_mask(cpu_mask[k], &flush_mask[k]);
13048+ }
13049+#endif
13050+ /*
13051+ * We have to send the IPI only to
13052+ * CPUs affected.
13053+ */
13054+ send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR);
13055+
13056+ while (!cpus_empty(flush_cpumask))
13057+ /* nothing. lockup detection does not belong here */
13058+ mb();
13059+
13060+ flush_mm = NULL;
13061+ flush_va = 0;
13062+ spin_unlock(&tlbstate_lock);
13063+}
13064+
13065+void flush_tlb_current_task(void)
13066+{
13067+ struct mm_struct *mm = current->mm;
13068+ cpumask_t cpu_mask;
13069+
13070+ preempt_disable();
13071+ cpu_mask = mm->cpu_vm_mask;
13072+ cpu_clear(smp_processor_id(), cpu_mask);
13073+
13074+ local_flush_tlb();
13075+ if (!cpus_empty(cpu_mask))
13076+ flush_tlb_others(cpu_mask, mm, FLUSH_ALL);
13077+ preempt_enable();
13078+}
13079+
13080+void flush_tlb_mm (struct mm_struct * mm)
13081+{
13082+ cpumask_t cpu_mask;
13083+
13084+ preempt_disable();
13085+ cpu_mask = mm->cpu_vm_mask;
13086+ cpu_clear(smp_processor_id(), cpu_mask);
13087+
13088+ if (current->active_mm == mm) {
13089+ if (current->mm)
13090+ local_flush_tlb();
13091+ else
13092+ leave_mm(smp_processor_id());
13093+ }
13094+ if (!cpus_empty(cpu_mask))
13095+ flush_tlb_others(cpu_mask, mm, FLUSH_ALL);
13096+
13097+ preempt_enable();
13098+}
13099+
13100+void flush_tlb_page(struct vm_area_struct * vma, unsigned long va)
13101+{
13102+ struct mm_struct *mm = vma->vm_mm;
13103+ cpumask_t cpu_mask;
13104+
13105+ preempt_disable();
13106+ cpu_mask = mm->cpu_vm_mask;
13107+ cpu_clear(smp_processor_id(), cpu_mask);
13108+
13109+ if (current->active_mm == mm) {
13110+ if(current->mm)
13111+ __flush_tlb_one(va);
13112+ else
13113+ leave_mm(smp_processor_id());
13114+ }
13115+
13116+ if (!cpus_empty(cpu_mask))
13117+ flush_tlb_others(cpu_mask, mm, va);
13118+
13119+ preempt_enable();
13120+}
13121+EXPORT_SYMBOL(flush_tlb_page);
13122+
13123+static void do_flush_tlb_all(void* info)
13124+{
13125+ unsigned long cpu = smp_processor_id();
13126+
13127+ __flush_tlb_all();
13128+ if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_LAZY)
13129+ leave_mm(cpu);
13130+}
13131+
13132+void flush_tlb_all(void)
13133+{
13134+ on_each_cpu(do_flush_tlb_all, NULL, 1, 1);
13135+}
13136+
13137+#else
13138+
13139+irqreturn_t smp_invalidate_interrupt(int irq, void *dev_id,
13140+ struct pt_regs *regs)
13141+{ return 0; }
13142+void flush_tlb_current_task(void)
13143+{ xen_tlb_flush_mask(&current->mm->cpu_vm_mask); }
13144+void flush_tlb_mm(struct mm_struct * mm)
13145+{ xen_tlb_flush_mask(&mm->cpu_vm_mask); }
13146+void flush_tlb_page(struct vm_area_struct *vma, unsigned long va)
13147+{ xen_invlpg_mask(&vma->vm_mm->cpu_vm_mask, va); }
13148+EXPORT_SYMBOL(flush_tlb_page);
13149+void flush_tlb_all(void)
13150+{ xen_tlb_flush_all(); }
13151+
13152+#endif /* XEN */
13153+
13154+/*
13155+ * this function sends a 'reschedule' IPI to another CPU.
13156+ * it goes straight through and wastes no time serializing
13157+ * anything. Worst case is that we lose a reschedule ...
13158+ */
13159+void smp_send_reschedule(int cpu)
13160+{
13161+ WARN_ON(cpu_is_offline(cpu));
13162+ send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR);
13163+}
13164+
13165+/*
13166+ * Structure and data for smp_call_function(). This is designed to minimise
13167+ * static memory requirements. It also looks cleaner.
13168+ */
13169+static DEFINE_SPINLOCK(call_lock);
13170+
13171+struct call_data_struct {
13172+ void (*func) (void *info);
13173+ void *info;
13174+ atomic_t started;
13175+ atomic_t finished;
13176+ int wait;
13177+};
13178+
13179+void lock_ipi_call_lock(void)
13180+{
13181+ spin_lock_irq(&call_lock);
13182+}
13183+
13184+void unlock_ipi_call_lock(void)
13185+{
13186+ spin_unlock_irq(&call_lock);
13187+}
13188+
13189+static struct call_data_struct * call_data;
13190+
13191+/*
13192+ * this function sends a 'generic call function' IPI to all other CPUs
13193+ * in the system.
13194+ */
13195+
13196+int smp_call_function (void (*func) (void *info), void *info, int nonatomic,
13197+ int wait)
13198+/*
13199+ * [SUMMARY] Run a function on all other CPUs.
13200+ * <func> The function to run. This must be fast and non-blocking.
13201+ * <info> An arbitrary pointer to pass to the function.
13202+ * <nonatomic> currently unused.
13203+ * <wait> If true, wait (atomically) until function has completed on other CPUs.
13204+ * [RETURNS] 0 on success, else a negative status code. Does not return until
13205+ * remote CPUs are nearly ready to execute <<func>> or have already executed it.
13206+ *
13207+ * You must not call this function with disabled interrupts or from a
13208+ * hardware interrupt handler or from a bottom half handler.
13209+ */
13210+{
13211+ struct call_data_struct data;
13212+ int cpus;
13213+
13214+ /* Holding any lock stops cpus from going down. */
13215+ spin_lock(&call_lock);
13216+ cpus = num_online_cpus() - 1;
13217+ if (!cpus) {
13218+ spin_unlock(&call_lock);
13219+ return 0;
13220+ }
13221+
13222+ /* Can deadlock when called with interrupts disabled */
13223+ WARN_ON(irqs_disabled());
13224+
13225+ data.func = func;
13226+ data.info = info;
13227+ atomic_set(&data.started, 0);
13228+ data.wait = wait;
13229+ if (wait)
13230+ atomic_set(&data.finished, 0);
13231+
13232+ call_data = &data;
13233+ mb();
13234+
13235+ /* Send a message to all other CPUs and wait for them to respond */
13236+ send_IPI_allbutself(CALL_FUNCTION_VECTOR);
13237+
13238+ /* Wait for response */
13239+ while (atomic_read(&data.started) != cpus)
13240+ barrier();
13241+
13242+ if (wait)
13243+ while (atomic_read(&data.finished) != cpus)
13244+ barrier();
13245+ spin_unlock(&call_lock);
13246+
13247+ return 0;
13248+}
13249+EXPORT_SYMBOL(smp_call_function);
13250+
13251+static void stop_this_cpu (void * dummy)
13252+{
13253+ /*
13254+ * Remove this CPU:
13255+ */
13256+ cpu_clear(smp_processor_id(), cpu_online_map);
13257+ local_irq_disable();
13258+#if 0
13259+ disable_local_APIC();
13260+#endif
13261+ if (cpu_data[smp_processor_id()].hlt_works_ok)
13262+ for(;;) halt();
13263+ for (;;);
13264+}
13265+
13266+/*
13267+ * this function calls the 'stop' function on all other CPUs in the system.
13268+ */
13269+
13270+void smp_send_stop(void)
13271+{
13272+ smp_call_function(stop_this_cpu, NULL, 1, 0);
13273+
13274+ local_irq_disable();
13275+#if 0
13276+ disable_local_APIC();
13277+#endif
13278+ local_irq_enable();
13279+}
13280+
13281+/*
13282+ * Reschedule call back. Nothing to do,
13283+ * all the work is done automatically when
13284+ * we return from the interrupt.
13285+ */
13286+irqreturn_t smp_reschedule_interrupt(int irq, void *dev_id,
13287+ struct pt_regs *regs)
13288+{
13289+
13290+ return IRQ_HANDLED;
13291+}
13292+
13293+#include <linux/kallsyms.h>
13294+irqreturn_t smp_call_function_interrupt(int irq, void *dev_id,
13295+ struct pt_regs *regs)
13296+{
13297+ void (*func) (void *info) = call_data->func;
13298+ void *info = call_data->info;
13299+ int wait = call_data->wait;
13300+
13301+ /*
13302+ * Notify initiating CPU that I've grabbed the data and am
13303+ * about to execute the function
13304+ */
13305+ mb();
13306+ atomic_inc(&call_data->started);
13307+ /*
13308+ * At this point the info structure may be out of scope unless wait==1
13309+ */
13310+ irq_enter();
13311+ (*func)(info);
13312+ irq_exit();
13313+
13314+ if (wait) {
13315+ mb();
13316+ atomic_inc(&call_data->finished);
13317+ }
13318+
13319+ return IRQ_HANDLED;
13320+}
13321+
13322diff -Nur linux-2.6.16.33-noxen/arch/i386/kernel/smpalts.c linux-2.6.16.33/arch/i386/kernel/smpalts.c
13323--- linux-2.6.16.33-noxen/arch/i386/kernel/smpalts.c 1970-01-01 00:00:00.000000000 +0000
13324+++ linux-2.6.16.33/arch/i386/kernel/smpalts.c 2007-01-08 15:00:45.000000000 +0000
13325@@ -0,0 +1,85 @@
13326+#include <linux/kernel.h>
13327+#include <asm/system.h>
13328+#include <asm/smp_alt.h>
13329+#include <asm/processor.h>
13330+#include <asm/string.h>
13331+
13332+struct smp_replacement_record {
13333+ unsigned char targ_size;
13334+ unsigned char smp1_size;
13335+ unsigned char smp2_size;
13336+ unsigned char up_size;
13337+ unsigned char feature;
13338+ unsigned char data[0];
13339+};
13340+
13341+struct smp_alternative_record {
13342+ void *targ_start;
13343+ struct smp_replacement_record *repl;
13344+};
13345+
13346+extern struct smp_alternative_record __start_smp_alternatives_table,
13347+ __stop_smp_alternatives_table;
13348+extern unsigned long __init_begin, __init_end;
13349+
13350+void prepare_for_smp(void)
13351+{
13352+ struct smp_alternative_record *r;
13353+ printk(KERN_INFO "Enabling SMP...\n");
13354+ for (r = &__start_smp_alternatives_table;
13355+ r != &__stop_smp_alternatives_table;
13356+ r++) {
13357+ BUG_ON(r->repl->targ_size < r->repl->smp1_size);
13358+ BUG_ON(r->repl->targ_size < r->repl->smp2_size);
13359+ BUG_ON(r->repl->targ_size < r->repl->up_size);
13360+ if (system_state == SYSTEM_RUNNING &&
13361+ r->targ_start >= (void *)&__init_begin &&
13362+ r->targ_start < (void *)&__init_end)
13363+ continue;
13364+ if (r->repl->feature != (unsigned char)-1 &&
13365+ boot_cpu_has(r->repl->feature)) {
13366+ memcpy(r->targ_start,
13367+ r->repl->data + r->repl->smp1_size,
13368+ r->repl->smp2_size);
13369+ memset(r->targ_start + r->repl->smp2_size,
13370+ 0x90,
13371+ r->repl->targ_size - r->repl->smp2_size);
13372+ } else {
13373+ memcpy(r->targ_start,
13374+ r->repl->data,
13375+ r->repl->smp1_size);
13376+ memset(r->targ_start + r->repl->smp1_size,
13377+ 0x90,
13378+ r->repl->targ_size - r->repl->smp1_size);
13379+ }
13380+ }
13381+ /* Paranoia */
13382+ asm volatile ("jmp 1f\n1:");
13383+ mb();
13384+}
13385+
13386+void unprepare_for_smp(void)
13387+{
13388+ struct smp_alternative_record *r;
13389+ printk(KERN_INFO "Disabling SMP...\n");
13390+ for (r = &__start_smp_alternatives_table;
13391+ r != &__stop_smp_alternatives_table;
13392+ r++) {
13393+ BUG_ON(r->repl->targ_size < r->repl->smp1_size);
13394+ BUG_ON(r->repl->targ_size < r->repl->smp2_size);
13395+ BUG_ON(r->repl->targ_size < r->repl->up_size);
13396+ if (system_state == SYSTEM_RUNNING &&
13397+ r->targ_start >= (void *)&__init_begin &&
13398+ r->targ_start < (void *)&__init_end)
13399+ continue;
13400+ memcpy(r->targ_start,
13401+ r->repl->data + r->repl->smp1_size + r->repl->smp2_size,
13402+ r->repl->up_size);
13403+ memset(r->targ_start + r->repl->up_size,
13404+ 0x90,
13405+ r->repl->targ_size - r->repl->up_size);
13406+ }
13407+ /* Paranoia */
13408+ asm volatile ("jmp 1f\n1:");
13409+ mb();
13410+}
13411diff -Nur linux-2.6.16.33-noxen/arch/i386/kernel/smpboot.c linux-2.6.16.33/arch/i386/kernel/smpboot.c
13412--- linux-2.6.16.33-noxen/arch/i386/kernel/smpboot.c 2006-11-22 18:06:31.000000000 +0000
13413+++ linux-2.6.16.33/arch/i386/kernel/smpboot.c 2007-01-08 15:00:45.000000000 +0000
13414@@ -1218,6 +1218,11 @@
13415 if (max_cpus <= cpucount+1)
13416 continue;
13417
13418+#ifdef CONFIG_SMP_ALTERNATIVES
13419+ if (kicked == 1)
13420+ prepare_for_smp();
13421+#endif
13422+
13423 if (((cpu = alloc_cpu_id()) <= 0) || do_boot_cpu(apicid, cpu))
13424 printk("CPU #%d not responding - cannot use it.\n",
13425 apicid);
13426@@ -1396,6 +1401,11 @@
13427 return -EIO;
13428 }
13429
13430+#ifdef CONFIG_SMP_ALTERNATIVES
13431+ if (num_online_cpus() == 1)
13432+ prepare_for_smp();
13433+#endif
13434+
13435 local_irq_enable();
13436 per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
13437 /* Unleash the CPU! */
13438diff -Nur linux-2.6.16.33-noxen/arch/i386/kernel/swiotlb.c linux-2.6.16.33/arch/i386/kernel/swiotlb.c
13439--- linux-2.6.16.33-noxen/arch/i386/kernel/swiotlb.c 1970-01-01 00:00:00.000000000 +0000
13440+++ linux-2.6.16.33/arch/i386/kernel/swiotlb.c 2007-01-08 15:00:45.000000000 +0000
13441@@ -0,0 +1,683 @@
13442+/*
13443+ * Dynamic DMA mapping support.
13444+ *
13445+ * This implementation is a fallback for platforms that do not support
13446+ * I/O TLBs (aka DMA address translation hardware).
13447+ * Copyright (C) 2000 Asit Mallick <Asit.K.Mallick@intel.com>
13448+ * Copyright (C) 2000 Goutham Rao <goutham.rao@intel.com>
13449+ * Copyright (C) 2000, 2003 Hewlett-Packard Co
13450+ * David Mosberger-Tang <davidm@hpl.hp.com>
13451+ * Copyright (C) 2005 Keir Fraser <keir@xensource.com>
13452+ */
13453+
13454+#include <linux/cache.h>
13455+#include <linux/mm.h>
13456+#include <linux/module.h>
13457+#include <linux/pci.h>
13458+#include <linux/spinlock.h>
13459+#include <linux/string.h>
13460+#include <linux/types.h>
13461+#include <linux/ctype.h>
13462+#include <linux/init.h>
13463+#include <linux/bootmem.h>
13464+#include <linux/highmem.h>
13465+#include <asm/io.h>
13466+#include <asm/pci.h>
13467+#include <asm/dma.h>
13468+#include <asm/uaccess.h>
13469+#include <xen/interface/memory.h>
13470+
13471+int swiotlb;
13472+EXPORT_SYMBOL(swiotlb);
13473+
13474+#define OFFSET(val,align) ((unsigned long)((val) & ( (align) - 1)))
13475+
13476+#define SG_ENT_PHYS_ADDRESS(sg) (page_to_bus((sg)->page) + (sg)->offset)
13477+
13478+/*
13479+ * Maximum allowable number of contiguous slabs to map,
13480+ * must be a power of 2. What is the appropriate value ?
13481+ * The complexity of {map,unmap}_single is linearly dependent on this value.
13482+ */
13483+#define IO_TLB_SEGSIZE 128
13484+
13485+/*
13486+ * log of the size of each IO TLB slab. The number of slabs is command line
13487+ * controllable.
13488+ */
13489+#define IO_TLB_SHIFT 11
13490+
13491+/* Width of DMA addresses. 30 bits is a b44 limitation. */
13492+#define DEFAULT_DMA_BITS 30
13493+
13494+static int swiotlb_force;
13495+static char *iotlb_virt_start;
13496+static unsigned long iotlb_nslabs;
13497+
13498+/*
13499+ * Used to do a quick range check in swiotlb_unmap_single and
13500+ * swiotlb_sync_single_*, to see if the memory was in fact allocated by this
13501+ * API.
13502+ */
13503+static unsigned long iotlb_pfn_start, iotlb_pfn_end;
13504+
13505+/* Does the given dma address reside within the swiotlb aperture? */
13506+static inline int in_swiotlb_aperture(dma_addr_t dev_addr)
13507+{
13508+ unsigned long pfn = mfn_to_local_pfn(dev_addr >> PAGE_SHIFT);
13509+ return (pfn_valid(pfn)
13510+ && (pfn >= iotlb_pfn_start)
13511+ && (pfn < iotlb_pfn_end));
13512+}
13513+
13514+/*
13515+ * When the IOMMU overflows we return a fallback buffer. This sets the size.
13516+ */
13517+static unsigned long io_tlb_overflow = 32*1024;
13518+
13519+void *io_tlb_overflow_buffer;
13520+
13521+/*
13522+ * This is a free list describing the number of free entries available from
13523+ * each index
13524+ */
13525+static unsigned int *io_tlb_list;
13526+static unsigned int io_tlb_index;
13527+
13528+/*
13529+ * We need to save away the original address corresponding to a mapped entry
13530+ * for the sync operations.
13531+ */
13532+static struct phys_addr {
13533+ struct page *page;
13534+ unsigned int offset;
13535+} *io_tlb_orig_addr;
13536+
13537+/*
13538+ * Protect the above data structures in the map and unmap calls
13539+ */
13540+static DEFINE_SPINLOCK(io_tlb_lock);
13541+
13542+unsigned int dma_bits = DEFAULT_DMA_BITS;
13543+static int __init
13544+setup_dma_bits(char *str)
13545+{
13546+ dma_bits = simple_strtoul(str, NULL, 0);
13547+ return 0;
13548+}
13549+__setup("dma_bits=", setup_dma_bits);
13550+
13551+static int __init
13552+setup_io_tlb_npages(char *str)
13553+{
13554+ /* Unlike ia64, the size is aperture in megabytes, not 'slabs'! */
13555+ if (isdigit(*str)) {
13556+ iotlb_nslabs = simple_strtoul(str, &str, 0) <<
13557+ (20 - IO_TLB_SHIFT);
13558+ iotlb_nslabs = ALIGN(iotlb_nslabs, IO_TLB_SEGSIZE);
13559+ /* Round up to power of two (xen_create_contiguous_region). */
13560+ while (iotlb_nslabs & (iotlb_nslabs-1))
13561+ iotlb_nslabs += iotlb_nslabs & ~(iotlb_nslabs-1);
13562+ }
13563+ if (*str == ',')
13564+ ++str;
13565+ /*
13566+ * NB. 'force' enables the swiotlb, but doesn't force its use for
13567+ * every DMA like it does on native Linux. 'off' forcibly disables
13568+ * use of the swiotlb.
13569+ */
13570+ if (!strcmp(str, "force"))
13571+ swiotlb_force = 1;
13572+ else if (!strcmp(str, "off"))
13573+ swiotlb_force = -1;
13574+ return 1;
13575+}
13576+__setup("swiotlb=", setup_io_tlb_npages);
13577+/* make io_tlb_overflow tunable too? */
13578+
13579+/*
13580+ * Statically reserve bounce buffer space and initialize bounce buffer data
13581+ * structures for the software IO TLB used to implement the PCI DMA API.
13582+ */
13583+void
13584+swiotlb_init_with_default_size (size_t default_size)
13585+{
13586+ unsigned long i, bytes;
13587+
13588+ if (!iotlb_nslabs) {
13589+ iotlb_nslabs = (default_size >> IO_TLB_SHIFT);
13590+ iotlb_nslabs = ALIGN(iotlb_nslabs, IO_TLB_SEGSIZE);
13591+ /* Round up to power of two (xen_create_contiguous_region). */
13592+ while (iotlb_nslabs & (iotlb_nslabs-1))
13593+ iotlb_nslabs += iotlb_nslabs & ~(iotlb_nslabs-1);
13594+ }
13595+
13596+ bytes = iotlb_nslabs * (1UL << IO_TLB_SHIFT);
13597+
13598+ /*
13599+ * Get IO TLB memory from the low pages
13600+ */
13601+ iotlb_virt_start = alloc_bootmem_low_pages(bytes);
13602+ if (!iotlb_virt_start)
13603+ panic("Cannot allocate SWIOTLB buffer!\n"
13604+ "Use dom0_mem Xen boot parameter to reserve\n"
13605+ "some DMA memory (e.g., dom0_mem=-128M).\n");
13606+
13607+ for (i = 0; i < iotlb_nslabs; i += IO_TLB_SEGSIZE) {
13608+ int rc = xen_create_contiguous_region(
13609+ (unsigned long)iotlb_virt_start + (i << IO_TLB_SHIFT),
13610+ get_order(IO_TLB_SEGSIZE << IO_TLB_SHIFT),
13611+ dma_bits);
13612+ BUG_ON(rc);
13613+ }
13614+
13615+ /*
13616+ * Allocate and initialize the free list array. This array is used
13617+ * to find contiguous free memory regions of size up to IO_TLB_SEGSIZE.
13618+ */
13619+ io_tlb_list = alloc_bootmem(iotlb_nslabs * sizeof(int));
13620+ for (i = 0; i < iotlb_nslabs; i++)
13621+ io_tlb_list[i] = IO_TLB_SEGSIZE - OFFSET(i, IO_TLB_SEGSIZE);
13622+ io_tlb_index = 0;
13623+ io_tlb_orig_addr = alloc_bootmem(
13624+ iotlb_nslabs * sizeof(*io_tlb_orig_addr));
13625+
13626+ /*
13627+ * Get the overflow emergency buffer
13628+ */
13629+ io_tlb_overflow_buffer = alloc_bootmem_low(io_tlb_overflow);
13630+
13631+ iotlb_pfn_start = __pa(iotlb_virt_start) >> PAGE_SHIFT;
13632+ iotlb_pfn_end = iotlb_pfn_start + (bytes >> PAGE_SHIFT);
13633+
13634+ printk(KERN_INFO "Software IO TLB enabled: \n"
13635+ " Aperture: %lu megabytes\n"
13636+ " Kernel range: 0x%016lx - 0x%016lx\n"
13637+ " Address size: %u bits\n",
13638+ bytes >> 20,
13639+ (unsigned long)iotlb_virt_start,
13640+ (unsigned long)iotlb_virt_start + bytes,
13641+ dma_bits);
13642+}
13643+
13644+void
13645+swiotlb_init(void)
13646+{
13647+ long ram_end;
13648+ size_t defsz = 64 * (1 << 20); /* 64MB default size */
13649+
13650+ if (swiotlb_force == 1) {
13651+ swiotlb = 1;
13652+ } else if ((swiotlb_force != -1) &&
13653+ is_running_on_xen() &&
13654+ is_initial_xendomain()) {
13655+ /* Domain 0 always has a swiotlb. */
13656+ ram_end = HYPERVISOR_memory_op(XENMEM_maximum_ram_page, NULL);
13657+ if (ram_end <= 0x7ffff)
13658+ defsz = 2 * (1 << 20); /* 2MB on systems with <2GB. */
13659+ swiotlb = 1;
13660+ }
13661+
13662+ if (swiotlb)
13663+ swiotlb_init_with_default_size(defsz);
13664+ else
13665+ printk(KERN_INFO "Software IO TLB disabled\n");
13666+}
13667+
13668+/*
13669+ * We use __copy_to_user_inatomic to transfer to the host buffer because the
13670+ * buffer may be mapped read-only (e.g, in blkback driver) but lower-level
13671+ * drivers map the buffer for DMA_BIDIRECTIONAL access. This causes an
13672+ * unnecessary copy from the aperture to the host buffer, and a page fault.
13673+ */
13674+static void
13675+__sync_single(struct phys_addr buffer, char *dma_addr, size_t size, int dir)
13676+{
13677+ if (PageHighMem(buffer.page)) {
13678+ size_t len, bytes;
13679+ char *dev, *host, *kmp;
13680+ len = size;
13681+ while (len != 0) {
13682+ if (((bytes = len) + buffer.offset) > PAGE_SIZE)
13683+ bytes = PAGE_SIZE - buffer.offset;
13684+ kmp = kmap_atomic(buffer.page, KM_SWIOTLB);
13685+ dev = dma_addr + size - len;
13686+ host = kmp + buffer.offset;
13687+ if (dir == DMA_FROM_DEVICE) {
13688+ if (__copy_to_user_inatomic(host, dev, bytes))
13689+ /* inaccessible */;
13690+ } else
13691+ memcpy(dev, host, bytes);
13692+ kunmap_atomic(kmp, KM_SWIOTLB);
13693+ len -= bytes;
13694+ buffer.page++;
13695+ buffer.offset = 0;
13696+ }
13697+ } else {
13698+ char *host = (char *)phys_to_virt(
13699+ page_to_pseudophys(buffer.page)) + buffer.offset;
13700+ if (dir == DMA_FROM_DEVICE) {
13701+ if (__copy_to_user_inatomic(host, dma_addr, size))
13702+ /* inaccessible */;
13703+ } else if (dir == DMA_TO_DEVICE)
13704+ memcpy(dma_addr, host, size);
13705+ }
13706+}
13707+
13708+/*
13709+ * Allocates bounce buffer and returns its kernel virtual address.
13710+ */
13711+static void *
13712+map_single(struct device *hwdev, struct phys_addr buffer, size_t size, int dir)
13713+{
13714+ unsigned long flags;
13715+ char *dma_addr;
13716+ unsigned int nslots, stride, index, wrap;
13717+ int i;
13718+
13719+ /*
13720+ * For mappings greater than a page, we limit the stride (and
13721+ * hence alignment) to a page size.
13722+ */
13723+ nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
13724+ if (size > PAGE_SIZE)
13725+ stride = (1 << (PAGE_SHIFT - IO_TLB_SHIFT));
13726+ else
13727+ stride = 1;
13728+
13729+ BUG_ON(!nslots);
13730+
13731+ /*
13732+ * Find suitable number of IO TLB entries size that will fit this
13733+ * request and allocate a buffer from that IO TLB pool.
13734+ */
13735+ spin_lock_irqsave(&io_tlb_lock, flags);
13736+ {
13737+ wrap = index = ALIGN(io_tlb_index, stride);
13738+
13739+ if (index >= iotlb_nslabs)
13740+ wrap = index = 0;
13741+
13742+ do {
13743+ /*
13744+ * If we find a slot that indicates we have 'nslots'
13745+ * number of contiguous buffers, we allocate the
13746+ * buffers from that slot and mark the entries as '0'
13747+ * indicating unavailable.
13748+ */
13749+ if (io_tlb_list[index] >= nslots) {
13750+ int count = 0;
13751+
13752+ for (i = index; i < (int)(index + nslots); i++)
13753+ io_tlb_list[i] = 0;
13754+ for (i = index - 1;
13755+ (OFFSET(i, IO_TLB_SEGSIZE) !=
13756+ IO_TLB_SEGSIZE -1) && io_tlb_list[i];
13757+ i--)
13758+ io_tlb_list[i] = ++count;
13759+ dma_addr = iotlb_virt_start +
13760+ (index << IO_TLB_SHIFT);
13761+
13762+ /*
13763+ * Update the indices to avoid searching in
13764+ * the next round.
13765+ */
13766+ io_tlb_index =
13767+ ((index + nslots) < iotlb_nslabs
13768+ ? (index + nslots) : 0);
13769+
13770+ goto found;
13771+ }
13772+ index += stride;
13773+ if (index >= iotlb_nslabs)
13774+ index = 0;
13775+ } while (index != wrap);
13776+
13777+ spin_unlock_irqrestore(&io_tlb_lock, flags);
13778+ return NULL;
13779+ }
13780+ found:
13781+ spin_unlock_irqrestore(&io_tlb_lock, flags);
13782+
13783+ /*
13784+ * Save away the mapping from the original address to the DMA address.
13785+ * This is needed when we sync the memory. Then we sync the buffer if
13786+ * needed.
13787+ */
13788+ io_tlb_orig_addr[index] = buffer;
13789+ if ((dir == DMA_TO_DEVICE) || (dir == DMA_BIDIRECTIONAL))
13790+ __sync_single(buffer, dma_addr, size, DMA_TO_DEVICE);
13791+
13792+ return dma_addr;
13793+}
13794+
13795+/*
13796+ * dma_addr is the kernel virtual address of the bounce buffer to unmap.
13797+ */
13798+static void
13799+unmap_single(struct device *hwdev, char *dma_addr, size_t size, int dir)
13800+{
13801+ unsigned long flags;
13802+ int i, count, nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
13803+ int index = (dma_addr - iotlb_virt_start) >> IO_TLB_SHIFT;
13804+ struct phys_addr buffer = io_tlb_orig_addr[index];
13805+
13806+ /*
13807+ * First, sync the memory before unmapping the entry
13808+ */
13809+ if ((dir == DMA_FROM_DEVICE) || (dir == DMA_BIDIRECTIONAL))
13810+ __sync_single(buffer, dma_addr, size, DMA_FROM_DEVICE);
13811+
13812+ /*
13813+ * Return the buffer to the free list by setting the corresponding
13814+ * entries to indicate the number of contiguous entries available.
13815+ * While returning the entries to the free list, we merge the entries
13816+ * with slots below and above the pool being returned.
13817+ */
13818+ spin_lock_irqsave(&io_tlb_lock, flags);
13819+ {
13820+ count = ((index + nslots) < ALIGN(index + 1, IO_TLB_SEGSIZE) ?
13821+ io_tlb_list[index + nslots] : 0);
13822+ /*
13823+ * Step 1: return the slots to the free list, merging the
13824+ * slots with superseding slots
13825+ */
13826+ for (i = index + nslots - 1; i >= index; i--)
13827+ io_tlb_list[i] = ++count;
13828+ /*
13829+ * Step 2: merge the returned slots with the preceding slots,
13830+ * if available (non zero)
13831+ */
13832+ for (i = index - 1;
13833+ (OFFSET(i, IO_TLB_SEGSIZE) !=
13834+ IO_TLB_SEGSIZE -1) && io_tlb_list[i];
13835+ i--)
13836+ io_tlb_list[i] = ++count;
13837+ }
13838+ spin_unlock_irqrestore(&io_tlb_lock, flags);
13839+}
13840+
13841+static void
13842+sync_single(struct device *hwdev, char *dma_addr, size_t size, int dir)
13843+{
13844+ int index = (dma_addr - iotlb_virt_start) >> IO_TLB_SHIFT;
13845+ struct phys_addr buffer = io_tlb_orig_addr[index];
13846+ BUG_ON((dir != DMA_FROM_DEVICE) && (dir != DMA_TO_DEVICE));
13847+ __sync_single(buffer, dma_addr, size, dir);
13848+}
13849+
13850+static void
13851+swiotlb_full(struct device *dev, size_t size, int dir, int do_panic)
13852+{
13853+ /*
13854+ * Ran out of IOMMU space for this operation. This is very bad.
13855+ * Unfortunately the drivers cannot handle this operation properly,
13856+ * unless they check for pci_dma_mapping_error (most don't).
13857+ * When the mapping is small enough return a static buffer to limit
13858+ * the damage, or panic when the transfer is too big.
13859+ */
13860+ printk(KERN_ERR "PCI-DMA: Out of SW-IOMMU space for %lu bytes at "
13861+ "device %s\n", (unsigned long)size, dev ? dev->bus_id : "?");
13862+
13863+ if (size > io_tlb_overflow && do_panic) {
13864+ if (dir == PCI_DMA_FROMDEVICE || dir == PCI_DMA_BIDIRECTIONAL)
13865+ panic("PCI-DMA: Memory would be corrupted\n");
13866+ if (dir == PCI_DMA_TODEVICE || dir == PCI_DMA_BIDIRECTIONAL)
13867+ panic("PCI-DMA: Random memory would be DMAed\n");
13868+ }
13869+}
13870+
13871+/*
13872+ * Map a single buffer of the indicated size for DMA in streaming mode. The
13873+ * PCI address to use is returned.
13874+ *
13875+ * Once the device is given the dma address, the device owns this memory until
13876+ * either swiotlb_unmap_single or swiotlb_dma_sync_single is performed.
13877+ */
13878+dma_addr_t
13879+swiotlb_map_single(struct device *hwdev, void *ptr, size_t size, int dir)
13880+{
13881+ dma_addr_t dev_addr = virt_to_bus(ptr);
13882+ void *map;
13883+ struct phys_addr buffer;
13884+
13885+ BUG_ON(dir == DMA_NONE);
13886+
13887+ /*
13888+ * If the pointer passed in happens to be in the device's DMA window,
13889+ * we can safely return the device addr and not worry about bounce
13890+ * buffering it.
13891+ */
13892+ if (!range_straddles_page_boundary(ptr, size) &&
13893+ !address_needs_mapping(hwdev, dev_addr))
13894+ return dev_addr;
13895+
13896+ /*
13897+ * Oh well, have to allocate and map a bounce buffer.
13898+ */
13899+ buffer.page = virt_to_page(ptr);
13900+ buffer.offset = (unsigned long)ptr & ~PAGE_MASK;
13901+ map = map_single(hwdev, buffer, size, dir);
13902+ if (!map) {
13903+ swiotlb_full(hwdev, size, dir, 1);
13904+ map = io_tlb_overflow_buffer;
13905+ }
13906+
13907+ dev_addr = virt_to_bus(map);
13908+ return dev_addr;
13909+}
13910+
13911+/*
13912+ * Unmap a single streaming mode DMA translation. The dma_addr and size must
13913+ * match what was provided for in a previous swiotlb_map_single call. All
13914+ * other usages are undefined.
13915+ *
13916+ * After this call, reads by the cpu to the buffer are guaranteed to see
13917+ * whatever the device wrote there.
13918+ */
13919+void
13920+swiotlb_unmap_single(struct device *hwdev, dma_addr_t dev_addr, size_t size,
13921+ int dir)
13922+{
13923+ BUG_ON(dir == DMA_NONE);
13924+ if (in_swiotlb_aperture(dev_addr))
13925+ unmap_single(hwdev, bus_to_virt(dev_addr), size, dir);
13926+}
13927+
13928+/*
13929+ * Make physical memory consistent for a single streaming mode DMA translation
13930+ * after a transfer.
13931+ *
13932+ * If you perform a swiotlb_map_single() but wish to interrogate the buffer
13933+ * using the cpu, yet do not wish to teardown the PCI dma mapping, you must
13934+ * call this function before doing so. At the next point you give the PCI dma
13935+ * address back to the card, you must first perform a
13936+ * swiotlb_dma_sync_for_device, and then the device again owns the buffer
13937+ */
13938+void
13939+swiotlb_sync_single_for_cpu(struct device *hwdev, dma_addr_t dev_addr,
13940+ size_t size, int dir)
13941+{
13942+ BUG_ON(dir == DMA_NONE);
13943+ if (in_swiotlb_aperture(dev_addr))
13944+ sync_single(hwdev, bus_to_virt(dev_addr), size, dir);
13945+}
13946+
13947+void
13948+swiotlb_sync_single_for_device(struct device *hwdev, dma_addr_t dev_addr,
13949+ size_t size, int dir)
13950+{
13951+ BUG_ON(dir == DMA_NONE);
13952+ if (in_swiotlb_aperture(dev_addr))
13953+ sync_single(hwdev, bus_to_virt(dev_addr), size, dir);
13954+}
13955+
13956+/*
13957+ * Map a set of buffers described by scatterlist in streaming mode for DMA.
13958+ * This is the scatter-gather version of the above swiotlb_map_single
13959+ * interface. Here the scatter gather list elements are each tagged with the
13960+ * appropriate dma address and length. They are obtained via
13961+ * sg_dma_{address,length}(SG).
13962+ *
13963+ * NOTE: An implementation may be able to use a smaller number of
13964+ * DMA address/length pairs than there are SG table elements.
13965+ * (for example via virtual mapping capabilities)
13966+ * The routine returns the number of addr/length pairs actually
13967+ * used, at most nents.
13968+ *
13969+ * Device ownership issues as mentioned above for swiotlb_map_single are the
13970+ * same here.
13971+ */
13972+int
13973+swiotlb_map_sg(struct device *hwdev, struct scatterlist *sg, int nelems,
13974+ int dir)
13975+{
13976+ struct phys_addr buffer;
13977+ dma_addr_t dev_addr;
13978+ char *map;
13979+ int i;
13980+
13981+ BUG_ON(dir == DMA_NONE);
13982+
13983+ for (i = 0; i < nelems; i++, sg++) {
13984+ dev_addr = SG_ENT_PHYS_ADDRESS(sg);
13985+ if (address_needs_mapping(hwdev, dev_addr)) {
13986+ buffer.page = sg->page;
13987+ buffer.offset = sg->offset;
13988+ map = map_single(hwdev, buffer, sg->length, dir);
13989+ if (!map) {
13990+ /* Don't panic here, we expect map_sg users
13991+ to do proper error handling. */
13992+ swiotlb_full(hwdev, sg->length, dir, 0);
13993+ swiotlb_unmap_sg(hwdev, sg - i, i, dir);
13994+ sg[0].dma_length = 0;
13995+ return 0;
13996+ }
13997+ sg->dma_address = (dma_addr_t)virt_to_bus(map);
13998+ } else
13999+ sg->dma_address = dev_addr;
14000+ sg->dma_length = sg->length;
14001+ }
14002+ return nelems;
14003+}
14004+
14005+/*
14006+ * Unmap a set of streaming mode DMA translations. Again, cpu read rules
14007+ * concerning calls here are the same as for swiotlb_unmap_single() above.
14008+ */
14009+void
14010+swiotlb_unmap_sg(struct device *hwdev, struct scatterlist *sg, int nelems,
14011+ int dir)
14012+{
14013+ int i;
14014+
14015+ BUG_ON(dir == DMA_NONE);
14016+
14017+ for (i = 0; i < nelems; i++, sg++)
14018+ if (sg->dma_address != SG_ENT_PHYS_ADDRESS(sg))
14019+ unmap_single(hwdev,
14020+ (void *)bus_to_virt(sg->dma_address),
14021+ sg->dma_length, dir);
14022+}
14023+
14024+/*
14025+ * Make physical memory consistent for a set of streaming mode DMA translations
14026+ * after a transfer.
14027+ *
14028+ * The same as swiotlb_sync_single_* but for a scatter-gather list, same rules
14029+ * and usage.
14030+ */
14031+void
14032+swiotlb_sync_sg_for_cpu(struct device *hwdev, struct scatterlist *sg,
14033+ int nelems, int dir)
14034+{
14035+ int i;
14036+
14037+ BUG_ON(dir == DMA_NONE);
14038+
14039+ for (i = 0; i < nelems; i++, sg++)
14040+ if (sg->dma_address != SG_ENT_PHYS_ADDRESS(sg))
14041+ sync_single(hwdev,
14042+ (void *)bus_to_virt(sg->dma_address),
14043+ sg->dma_length, dir);
14044+}
14045+
14046+void
14047+swiotlb_sync_sg_for_device(struct device *hwdev, struct scatterlist *sg,
14048+ int nelems, int dir)
14049+{
14050+ int i;
14051+
14052+ BUG_ON(dir == DMA_NONE);
14053+
14054+ for (i = 0; i < nelems; i++, sg++)
14055+ if (sg->dma_address != SG_ENT_PHYS_ADDRESS(sg))
14056+ sync_single(hwdev,
14057+ (void *)bus_to_virt(sg->dma_address),
14058+ sg->dma_length, dir);
14059+}
14060+
14061+dma_addr_t
14062+swiotlb_map_page(struct device *hwdev, struct page *page,
14063+ unsigned long offset, size_t size,
14064+ enum dma_data_direction direction)
14065+{
14066+ struct phys_addr buffer;
14067+ dma_addr_t dev_addr;
14068+ char *map;
14069+
14070+ dev_addr = page_to_bus(page) + offset;
14071+ if (address_needs_mapping(hwdev, dev_addr)) {
14072+ buffer.page = page;
14073+ buffer.offset = offset;
14074+ map = map_single(hwdev, buffer, size, direction);
14075+ if (!map) {
14076+ swiotlb_full(hwdev, size, direction, 1);
14077+ map = io_tlb_overflow_buffer;
14078+ }
14079+ dev_addr = (dma_addr_t)virt_to_bus(map);
14080+ }
14081+
14082+ return dev_addr;
14083+}
14084+
14085+void
14086+swiotlb_unmap_page(struct device *hwdev, dma_addr_t dma_address,
14087+ size_t size, enum dma_data_direction direction)
14088+{
14089+ BUG_ON(direction == DMA_NONE);
14090+ if (in_swiotlb_aperture(dma_address))
14091+ unmap_single(hwdev, bus_to_virt(dma_address), size, direction);
14092+}
14093+
14094+int
14095+swiotlb_dma_mapping_error(dma_addr_t dma_addr)
14096+{
14097+ return (dma_addr == virt_to_bus(io_tlb_overflow_buffer));
14098+}
14099+
14100+/*
14101+ * Return whether the given PCI device DMA address mask can be supported
14102+ * properly. For example, if your device can only drive the low 24-bits
14103+ * during PCI bus mastering, then you would pass 0x00ffffff as the mask to
14104+ * this function.
14105+ */
14106+int
14107+swiotlb_dma_supported (struct device *hwdev, u64 mask)
14108+{
14109+ return (mask >= ((1UL << dma_bits) - 1));
14110+}
14111+
14112+EXPORT_SYMBOL(swiotlb_init);
14113+EXPORT_SYMBOL(swiotlb_map_single);
14114+EXPORT_SYMBOL(swiotlb_unmap_single);
14115+EXPORT_SYMBOL(swiotlb_map_sg);
14116+EXPORT_SYMBOL(swiotlb_unmap_sg);
14117+EXPORT_SYMBOL(swiotlb_sync_single_for_cpu);
14118+EXPORT_SYMBOL(swiotlb_sync_single_for_device);
14119+EXPORT_SYMBOL(swiotlb_sync_sg_for_cpu);
14120+EXPORT_SYMBOL(swiotlb_sync_sg_for_device);
14121+EXPORT_SYMBOL(swiotlb_map_page);
14122+EXPORT_SYMBOL(swiotlb_unmap_page);
14123+EXPORT_SYMBOL(swiotlb_dma_mapping_error);
14124+EXPORT_SYMBOL(swiotlb_dma_supported);
14125diff -Nur linux-2.6.16.33-noxen/arch/i386/kernel/sysenter.c linux-2.6.16.33/arch/i386/kernel/sysenter.c
14126--- linux-2.6.16.33-noxen/arch/i386/kernel/sysenter.c 2006-11-22 18:06:31.000000000 +0000
14127+++ linux-2.6.16.33/arch/i386/kernel/sysenter.c 2007-01-08 15:00:45.000000000 +0000
14128@@ -13,16 +13,22 @@
14129 #include <linux/gfp.h>
14130 #include <linux/string.h>
14131 #include <linux/elf.h>
14132+#include <linux/mm.h>
14133
14134 #include <asm/cpufeature.h>
14135 #include <asm/msr.h>
14136 #include <asm/pgtable.h>
14137 #include <asm/unistd.h>
14138
14139+#ifdef CONFIG_XEN
14140+#include <xen/interface/callback.h>
14141+#endif
14142+
14143 extern asmlinkage void sysenter_entry(void);
14144
14145 void enable_sep_cpu(void)
14146 {
14147+#ifndef CONFIG_X86_NO_TSS
14148 int cpu = get_cpu();
14149 struct tss_struct *tss = &per_cpu(init_tss, cpu);
14150
14151@@ -37,6 +43,7 @@
14152 wrmsr(MSR_IA32_SYSENTER_ESP, tss->esp1, 0);
14153 wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long) sysenter_entry, 0);
14154 put_cpu();
14155+#endif
14156 }
14157
14158 /*
14159@@ -45,23 +52,100 @@
14160 */
14161 extern const char vsyscall_int80_start, vsyscall_int80_end;
14162 extern const char vsyscall_sysenter_start, vsyscall_sysenter_end;
14163+static void *syscall_page;
14164
14165 int __init sysenter_setup(void)
14166 {
14167- void *page = (void *)get_zeroed_page(GFP_ATOMIC);
14168+ syscall_page = (void *)get_zeroed_page(GFP_ATOMIC);
14169
14170- __set_fixmap(FIX_VSYSCALL, __pa(page), PAGE_READONLY_EXEC);
14171+#ifdef CONFIG_XEN
14172+ if (boot_cpu_has(X86_FEATURE_SEP)) {
14173+ static struct callback_register __initdata sysenter = {
14174+ .type = CALLBACKTYPE_sysenter,
14175+ .address = { __KERNEL_CS, (unsigned long)sysenter_entry },
14176+ };
14177
14178- if (!boot_cpu_has(X86_FEATURE_SEP)) {
14179- memcpy(page,
14180- &vsyscall_int80_start,
14181- &vsyscall_int80_end - &vsyscall_int80_start);
14182+ if (HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter) < 0)
14183+ clear_bit(X86_FEATURE_SEP, boot_cpu_data.x86_capability);
14184+ }
14185+#endif
14186+
14187+ if (boot_cpu_has(X86_FEATURE_SEP)) {
14188+ memcpy(syscall_page,
14189+ &vsyscall_sysenter_start,
14190+ &vsyscall_sysenter_end - &vsyscall_sysenter_start);
14191 return 0;
14192 }
14193
14194- memcpy(page,
14195- &vsyscall_sysenter_start,
14196- &vsyscall_sysenter_end - &vsyscall_sysenter_start);
14197+ memcpy(syscall_page,
14198+ &vsyscall_int80_start,
14199+ &vsyscall_int80_end - &vsyscall_int80_start);
14200+
14201+ return 0;
14202+}
14203+
14204+static struct page*
14205+syscall_nopage(struct vm_area_struct *vma, unsigned long adr, int *type)
14206+{
14207+ struct page *p = virt_to_page(adr - vma->vm_start + syscall_page);
14208+ get_page(p);
14209+ return p;
14210+}
14211+
14212+/* Prevent VMA merging */
14213+static void syscall_vma_close(struct vm_area_struct *vma)
14214+{
14215+}
14216+
14217+static struct vm_operations_struct syscall_vm_ops = {
14218+ .close = syscall_vma_close,
14219+ .nopage = syscall_nopage,
14220+};
14221
14222+/* Setup a VMA at program startup for the vsyscall page */
14223+int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack)
14224+{
14225+ struct vm_area_struct *vma;
14226+ struct mm_struct *mm = current->mm;
14227+ int ret;
14228+
14229+ vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
14230+ if (!vma)
14231+ return -ENOMEM;
14232+
14233+ memset(vma, 0, sizeof(struct vm_area_struct));
14234+ /* Could randomize here */
14235+ vma->vm_start = VSYSCALL_BASE;
14236+ vma->vm_end = VSYSCALL_BASE + PAGE_SIZE;
14237+ /* MAYWRITE to allow gdb to COW and set breakpoints */
14238+ vma->vm_flags = VM_READ|VM_EXEC|VM_MAYREAD|VM_MAYEXEC|VM_MAYWRITE;
14239+ vma->vm_flags |= mm->def_flags;
14240+ vma->vm_page_prot = protection_map[vma->vm_flags & 7];
14241+ vma->vm_ops = &syscall_vm_ops;
14242+ vma->vm_mm = mm;
14243+
14244+ down_write(&mm->mmap_sem);
14245+ if ((ret = insert_vm_struct(mm, vma))) {
14246+ up_write(&mm->mmap_sem);
14247+ kmem_cache_free(vm_area_cachep, vma);
14248+ return ret;
14249+ }
14250+ mm->total_vm++;
14251+ up_write(&mm->mmap_sem);
14252+ return 0;
14253+}
14254+
14255+struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
14256+{
14257+ return NULL;
14258+}
14259+
14260+int in_gate_area(struct task_struct *task, unsigned long addr)
14261+{
14262+ return 0;
14263+}
14264+
14265+int in_gate_area_no_task(unsigned long addr)
14266+{
14267 return 0;
14268 }
14269diff -Nur linux-2.6.16.33-noxen/arch/i386/kernel/time-xen.c linux-2.6.16.33/arch/i386/kernel/time-xen.c
14270--- linux-2.6.16.33-noxen/arch/i386/kernel/time-xen.c 1970-01-01 00:00:00.000000000 +0000
14271+++ linux-2.6.16.33/arch/i386/kernel/time-xen.c 2007-01-08 15:00:45.000000000 +0000
14272@@ -0,0 +1,1121 @@
14273+/*
14274+ * linux/arch/i386/kernel/time.c
14275+ *
14276+ * Copyright (C) 1991, 1992, 1995 Linus Torvalds
14277+ *
14278+ * This file contains the PC-specific time handling details:
14279+ * reading the RTC at bootup, etc..
14280+ * 1994-07-02 Alan Modra
14281+ * fixed set_rtc_mmss, fixed time.year for >= 2000, new mktime
14282+ * 1995-03-26 Markus Kuhn
14283+ * fixed 500 ms bug at call to set_rtc_mmss, fixed DS12887
14284+ * precision CMOS clock update
14285+ * 1996-05-03 Ingo Molnar
14286+ * fixed time warps in do_[slow|fast]_gettimeoffset()
14287+ * 1997-09-10 Updated NTP code according to technical memorandum Jan '96
14288+ * "A Kernel Model for Precision Timekeeping" by Dave Mills
14289+ * 1998-09-05 (Various)
14290+ * More robust do_fast_gettimeoffset() algorithm implemented
14291+ * (works with APM, Cyrix 6x86MX and Centaur C6),
14292+ * monotonic gettimeofday() with fast_get_timeoffset(),
14293+ * drift-proof precision TSC calibration on boot
14294+ * (C. Scott Ananian <cananian@alumni.princeton.edu>, Andrew D.
14295+ * Balsa <andrebalsa@altern.org>, Philip Gladstone <philip@raptor.com>;
14296+ * ported from 2.0.35 Jumbo-9 by Michael Krause <m.krause@tu-harburg.de>).
14297+ * 1998-12-16 Andrea Arcangeli
14298+ * Fixed Jumbo-9 code in 2.1.131: do_gettimeofday was missing 1 jiffy
14299+ * because was not accounting lost_ticks.
14300+ * 1998-12-24 Copyright (C) 1998 Andrea Arcangeli
14301+ * Fixed a xtime SMP race (we need the xtime_lock rw spinlock to
14302+ * serialize accesses to xtime/lost_ticks).
14303+ */
14304+
14305+#include <linux/errno.h>
14306+#include <linux/sched.h>
14307+#include <linux/kernel.h>
14308+#include <linux/param.h>
14309+#include <linux/string.h>
14310+#include <linux/mm.h>
14311+#include <linux/interrupt.h>
14312+#include <linux/time.h>
14313+#include <linux/delay.h>
14314+#include <linux/init.h>
14315+#include <linux/smp.h>
14316+#include <linux/module.h>
14317+#include <linux/sysdev.h>
14318+#include <linux/bcd.h>
14319+#include <linux/efi.h>
14320+#include <linux/mca.h>
14321+#include <linux/sysctl.h>
14322+#include <linux/percpu.h>
14323+#include <linux/kernel_stat.h>
14324+#include <linux/posix-timers.h>
14325+
14326+#include <asm/io.h>
14327+#include <asm/smp.h>
14328+#include <asm/irq.h>
14329+#include <asm/msr.h>
14330+#include <asm/delay.h>
14331+#include <asm/mpspec.h>
14332+#include <asm/uaccess.h>
14333+#include <asm/processor.h>
14334+#include <asm/timer.h>
14335+#include <asm/sections.h>
14336+
14337+#include "mach_time.h"
14338+
14339+#include <linux/timex.h>
14340+#include <linux/config.h>
14341+
14342+#include <asm/hpet.h>
14343+
14344+#include <asm/arch_hooks.h>
14345+
14346+#include <xen/evtchn.h>
14347+#include <xen/interface/vcpu.h>
14348+
14349+#if defined (__i386__)
14350+#include <asm/i8259.h>
14351+#endif
14352+
14353+int pit_latch_buggy; /* extern */
14354+
14355+#if defined(__x86_64__)
14356+unsigned long vxtime_hz = PIT_TICK_RATE;
14357+struct vxtime_data __vxtime __section_vxtime; /* for vsyscalls */
14358+volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES;
14359+unsigned long __wall_jiffies __section_wall_jiffies = INITIAL_JIFFIES;
14360+struct timespec __xtime __section_xtime;
14361+struct timezone __sys_tz __section_sys_tz;
14362+#endif
14363+
14364+unsigned int cpu_khz; /* Detected as we calibrate the TSC */
14365+EXPORT_SYMBOL(cpu_khz);
14366+
14367+extern unsigned long wall_jiffies;
14368+
14369+DEFINE_SPINLOCK(rtc_lock);
14370+EXPORT_SYMBOL(rtc_lock);
14371+
14372+#if defined (__i386__)
14373+#include <asm/i8253.h>
14374+#endif
14375+
14376+DEFINE_SPINLOCK(i8253_lock);
14377+EXPORT_SYMBOL(i8253_lock);
14378+
14379+extern struct init_timer_opts timer_tsc_init;
14380+extern struct timer_opts timer_tsc;
14381+#define timer_none timer_tsc
14382+struct timer_opts *cur_timer __read_mostly = &timer_tsc;
14383+
14384+/* These are periodically updated in shared_info, and then copied here. */
14385+struct shadow_time_info {
14386+ u64 tsc_timestamp; /* TSC at last update of time vals. */
14387+ u64 system_timestamp; /* Time, in nanosecs, since boot. */
14388+ u32 tsc_to_nsec_mul;
14389+ u32 tsc_to_usec_mul;
14390+ int tsc_shift;
14391+ u32 version;
14392+};
14393+static DEFINE_PER_CPU(struct shadow_time_info, shadow_time);
14394+static struct timespec shadow_tv;
14395+static u32 shadow_tv_version;
14396+
14397+/* Keep track of last time we did processing/updating of jiffies and xtime. */
14398+static u64 processed_system_time; /* System time (ns) at last processing. */
14399+static DEFINE_PER_CPU(u64, processed_system_time);
14400+
14401+/* How much CPU time was spent blocked and how much was 'stolen'? */
14402+static DEFINE_PER_CPU(u64, processed_stolen_time);
14403+static DEFINE_PER_CPU(u64, processed_blocked_time);
14404+
14405+/* Current runstate of each CPU (updated automatically by the hypervisor). */
14406+static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate);
14407+
14408+/* Must be signed, as it's compared with s64 quantities which can be -ve. */
14409+#define NS_PER_TICK (1000000000LL/HZ)
14410+
14411+static inline void __normalize_time(time_t *sec, s64 *nsec)
14412+{
14413+ while (*nsec >= NSEC_PER_SEC) {
14414+ (*nsec) -= NSEC_PER_SEC;
14415+ (*sec)++;
14416+ }
14417+ while (*nsec < 0) {
14418+ (*nsec) += NSEC_PER_SEC;
14419+ (*sec)--;
14420+ }
14421+}
14422+
14423+/* Does this guest OS track Xen time, or set its wall clock independently? */
14424+static int independent_wallclock = 0;
14425+static int __init __independent_wallclock(char *str)
14426+{
14427+ independent_wallclock = 1;
14428+ return 1;
14429+}
14430+__setup("independent_wallclock", __independent_wallclock);
14431+
14432+/* Permitted clock jitter, in nsecs, beyond which a warning will be printed. */
14433+static unsigned long permitted_clock_jitter = 10000000UL; /* 10ms */
14434+static int __init __permitted_clock_jitter(char *str)
14435+{
14436+ permitted_clock_jitter = simple_strtoul(str, NULL, 0);
14437+ return 1;
14438+}
14439+__setup("permitted_clock_jitter=", __permitted_clock_jitter);
14440+
14441+int tsc_disable __devinitdata = 0;
14442+
14443+static void delay_tsc(unsigned long loops)
14444+{
14445+ unsigned long bclock, now;
14446+
14447+ rdtscl(bclock);
14448+ do {
14449+ rep_nop();
14450+ rdtscl(now);
14451+ } while ((now - bclock) < loops);
14452+}
14453+
14454+struct timer_opts timer_tsc = {
14455+ .name = "tsc",
14456+ .delay = delay_tsc,
14457+};
14458+
14459+/*
14460+ * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
14461+ * yielding a 64-bit result.
14462+ */
14463+static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
14464+{
14465+ u64 product;
14466+#ifdef __i386__
14467+ u32 tmp1, tmp2;
14468+#endif
14469+
14470+ if (shift < 0)
14471+ delta >>= -shift;
14472+ else
14473+ delta <<= shift;
14474+
14475+#ifdef __i386__
14476+ __asm__ (
14477+ "mul %5 ; "
14478+ "mov %4,%%eax ; "
14479+ "mov %%edx,%4 ; "
14480+ "mul %5 ; "
14481+ "xor %5,%5 ; "
14482+ "add %4,%%eax ; "
14483+ "adc %5,%%edx ; "
14484+ : "=A" (product), "=r" (tmp1), "=r" (tmp2)
14485+ : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
14486+#else
14487+ __asm__ (
14488+ "mul %%rdx ; shrd $32,%%rdx,%%rax"
14489+ : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
14490+#endif
14491+
14492+ return product;
14493+}
14494+
14495+#if defined (__i386__)
14496+int read_current_timer(unsigned long *timer_val)
14497+{
14498+ rdtscl(*timer_val);
14499+ return 0;
14500+}
14501+#endif
14502+
14503+void init_cpu_khz(void)
14504+{
14505+ u64 __cpu_khz = 1000000ULL << 32;
14506+ struct vcpu_time_info *info;
14507+ info = &HYPERVISOR_shared_info->vcpu_info[0].time;
14508+ do_div(__cpu_khz, info->tsc_to_system_mul);
14509+ if (info->tsc_shift < 0)
14510+ cpu_khz = __cpu_khz << -info->tsc_shift;
14511+ else
14512+ cpu_khz = __cpu_khz >> info->tsc_shift;
14513+}
14514+
14515+static u64 get_nsec_offset(struct shadow_time_info *shadow)
14516+{
14517+ u64 now, delta;
14518+ rdtscll(now);
14519+ delta = now - shadow->tsc_timestamp;
14520+ return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift);
14521+}
14522+
14523+static unsigned long get_usec_offset(struct shadow_time_info *shadow)
14524+{
14525+ u64 now, delta;
14526+ rdtscll(now);
14527+ delta = now - shadow->tsc_timestamp;
14528+ return scale_delta(delta, shadow->tsc_to_usec_mul, shadow->tsc_shift);
14529+}
14530+
14531+static void __update_wallclock(time_t sec, long nsec)
14532+{
14533+ long wtm_nsec, xtime_nsec;
14534+ time_t wtm_sec, xtime_sec;
14535+ u64 tmp, wc_nsec;
14536+
14537+ /* Adjust wall-clock time base based on wall_jiffies ticks. */
14538+ wc_nsec = processed_system_time;
14539+ wc_nsec += sec * (u64)NSEC_PER_SEC;
14540+ wc_nsec += nsec;
14541+ wc_nsec -= (jiffies - wall_jiffies) * (u64)NS_PER_TICK;
14542+
14543+ /* Split wallclock base into seconds and nanoseconds. */
14544+ tmp = wc_nsec;
14545+ xtime_nsec = do_div(tmp, 1000000000);
14546+ xtime_sec = (time_t)tmp;
14547+
14548+ wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - xtime_sec);
14549+ wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - xtime_nsec);
14550+
14551+ set_normalized_timespec(&xtime, xtime_sec, xtime_nsec);
14552+ set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
14553+
14554+ ntp_clear();
14555+}
14556+
14557+static void update_wallclock(void)
14558+{
14559+ shared_info_t *s = HYPERVISOR_shared_info;
14560+
14561+ do {
14562+ shadow_tv_version = s->wc_version;
14563+ rmb();
14564+ shadow_tv.tv_sec = s->wc_sec;
14565+ shadow_tv.tv_nsec = s->wc_nsec;
14566+ rmb();
14567+ } while ((s->wc_version & 1) | (shadow_tv_version ^ s->wc_version));
14568+
14569+ if (!independent_wallclock)
14570+ __update_wallclock(shadow_tv.tv_sec, shadow_tv.tv_nsec);
14571+}
14572+
14573+/*
14574+ * Reads a consistent set of time-base values from Xen, into a shadow data
14575+ * area.
14576+ */
14577+static void get_time_values_from_xen(void)
14578+{
14579+ shared_info_t *s = HYPERVISOR_shared_info;
14580+ struct vcpu_time_info *src;
14581+ struct shadow_time_info *dst;
14582+
14583+ src = &s->vcpu_info[smp_processor_id()].time;
14584+ dst = &per_cpu(shadow_time, smp_processor_id());
14585+
14586+ do {
14587+ dst->version = src->version;
14588+ rmb();
14589+ dst->tsc_timestamp = src->tsc_timestamp;
14590+ dst->system_timestamp = src->system_time;
14591+ dst->tsc_to_nsec_mul = src->tsc_to_system_mul;
14592+ dst->tsc_shift = src->tsc_shift;
14593+ rmb();
14594+ } while ((src->version & 1) | (dst->version ^ src->version));
14595+
14596+ dst->tsc_to_usec_mul = dst->tsc_to_nsec_mul / 1000;
14597+}
14598+
14599+static inline int time_values_up_to_date(int cpu)
14600+{
14601+ struct vcpu_time_info *src;
14602+ struct shadow_time_info *dst;
14603+
14604+ src = &HYPERVISOR_shared_info->vcpu_info[cpu].time;
14605+ dst = &per_cpu(shadow_time, cpu);
14606+
14607+ rmb();
14608+ return (dst->version == src->version);
14609+}
14610+
14611+/*
14612+ * This is a special lock that is owned by the CPU and holds the index
14613+ * register we are working with. It is required for NMI access to the
14614+ * CMOS/RTC registers. See include/asm-i386/mc146818rtc.h for details.
14615+ */
14616+volatile unsigned long cmos_lock = 0;
14617+EXPORT_SYMBOL(cmos_lock);
14618+
14619+/* Routines for accessing the CMOS RAM/RTC. */
14620+unsigned char rtc_cmos_read(unsigned char addr)
14621+{
14622+ unsigned char val;
14623+ lock_cmos_prefix(addr);
14624+ outb_p(addr, RTC_PORT(0));
14625+ val = inb_p(RTC_PORT(1));
14626+ lock_cmos_suffix(addr);
14627+ return val;
14628+}
14629+EXPORT_SYMBOL(rtc_cmos_read);
14630+
14631+void rtc_cmos_write(unsigned char val, unsigned char addr)
14632+{
14633+ lock_cmos_prefix(addr);
14634+ outb_p(addr, RTC_PORT(0));
14635+ outb_p(val, RTC_PORT(1));
14636+ lock_cmos_suffix(addr);
14637+}
14638+EXPORT_SYMBOL(rtc_cmos_write);
14639+
14640+/*
14641+ * This version of gettimeofday has microsecond resolution
14642+ * and better than microsecond precision on fast x86 machines with TSC.
14643+ */
14644+void do_gettimeofday(struct timeval *tv)
14645+{
14646+ unsigned long seq;
14647+ unsigned long usec, sec;
14648+ unsigned long max_ntp_tick;
14649+ s64 nsec;
14650+ unsigned int cpu;
14651+ struct shadow_time_info *shadow;
14652+ u32 local_time_version;
14653+
14654+ cpu = get_cpu();
14655+ shadow = &per_cpu(shadow_time, cpu);
14656+
14657+ do {
14658+ unsigned long lost;
14659+
14660+ local_time_version = shadow->version;
14661+ seq = read_seqbegin(&xtime_lock);
14662+
14663+ usec = get_usec_offset(shadow);
14664+ lost = jiffies - wall_jiffies;
14665+
14666+ /*
14667+ * If time_adjust is negative then NTP is slowing the clock
14668+ * so make sure not to go into next possible interval.
14669+ * Better to lose some accuracy than have time go backwards..
14670+ */
14671+ if (unlikely(time_adjust < 0)) {
14672+ max_ntp_tick = (USEC_PER_SEC / HZ) - tickadj;
14673+ usec = min(usec, max_ntp_tick);
14674+
14675+ if (lost)
14676+ usec += lost * max_ntp_tick;
14677+ }
14678+ else if (unlikely(lost))
14679+ usec += lost * (USEC_PER_SEC / HZ);
14680+
14681+ sec = xtime.tv_sec;
14682+ usec += (xtime.tv_nsec / NSEC_PER_USEC);
14683+
14684+ nsec = shadow->system_timestamp - processed_system_time;
14685+ __normalize_time(&sec, &nsec);
14686+ usec += (long)nsec / NSEC_PER_USEC;
14687+
14688+ if (unlikely(!time_values_up_to_date(cpu))) {
14689+ /*
14690+ * We may have blocked for a long time,
14691+ * rendering our calculations invalid
14692+ * (e.g. the time delta may have
14693+ * overflowed). Detect that and recalculate
14694+ * with fresh values.
14695+ */
14696+ get_time_values_from_xen();
14697+ continue;
14698+ }
14699+ } while (read_seqretry(&xtime_lock, seq) ||
14700+ (local_time_version != shadow->version));
14701+
14702+ put_cpu();
14703+
14704+ while (usec >= USEC_PER_SEC) {
14705+ usec -= USEC_PER_SEC;
14706+ sec++;
14707+ }
14708+
14709+ tv->tv_sec = sec;
14710+ tv->tv_usec = usec;
14711+}
14712+
14713+EXPORT_SYMBOL(do_gettimeofday);
14714+
14715+int do_settimeofday(struct timespec *tv)
14716+{
14717+ time_t sec;
14718+ s64 nsec;
14719+ unsigned int cpu;
14720+ struct shadow_time_info *shadow;
14721+ dom0_op_t op;
14722+
14723+ if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC)
14724+ return -EINVAL;
14725+
14726+ cpu = get_cpu();
14727+ shadow = &per_cpu(shadow_time, cpu);
14728+
14729+ write_seqlock_irq(&xtime_lock);
14730+
14731+ /*
14732+ * Ensure we don't get blocked for a long time so that our time delta
14733+ * overflows. If that were to happen then our shadow time values would
14734+ * be stale, so we can retry with fresh ones.
14735+ */
14736+ for (;;) {
14737+ nsec = tv->tv_nsec - get_nsec_offset(shadow);
14738+ if (time_values_up_to_date(cpu))
14739+ break;
14740+ get_time_values_from_xen();
14741+ }
14742+ sec = tv->tv_sec;
14743+ __normalize_time(&sec, &nsec);
14744+
14745+ if (is_initial_xendomain() && !independent_wallclock) {
14746+ op.cmd = DOM0_SETTIME;
14747+ op.u.settime.secs = sec;
14748+ op.u.settime.nsecs = nsec;
14749+ op.u.settime.system_time = shadow->system_timestamp;
14750+ HYPERVISOR_dom0_op(&op);
14751+ update_wallclock();
14752+ } else if (independent_wallclock) {
14753+ nsec -= shadow->system_timestamp;
14754+ __normalize_time(&sec, &nsec);
14755+ __update_wallclock(sec, nsec);
14756+ }
14757+
14758+ write_sequnlock_irq(&xtime_lock);
14759+
14760+ put_cpu();
14761+
14762+ clock_was_set();
14763+ return 0;
14764+}
14765+
14766+EXPORT_SYMBOL(do_settimeofday);
14767+
14768+static void sync_xen_wallclock(unsigned long dummy);
14769+static DEFINE_TIMER(sync_xen_wallclock_timer, sync_xen_wallclock, 0, 0);
14770+static void sync_xen_wallclock(unsigned long dummy)
14771+{
14772+ time_t sec;
14773+ s64 nsec;
14774+ dom0_op_t op;
14775+
14776+ if (!ntp_synced() || independent_wallclock || !is_initial_xendomain())
14777+ return;
14778+
14779+ write_seqlock_irq(&xtime_lock);
14780+
14781+ sec = xtime.tv_sec;
14782+ nsec = xtime.tv_nsec + ((jiffies - wall_jiffies) * (u64)NS_PER_TICK);
14783+ __normalize_time(&sec, &nsec);
14784+
14785+ op.cmd = DOM0_SETTIME;
14786+ op.u.settime.secs = sec;
14787+ op.u.settime.nsecs = nsec;
14788+ op.u.settime.system_time = processed_system_time;
14789+ HYPERVISOR_dom0_op(&op);
14790+
14791+ update_wallclock();
14792+
14793+ write_sequnlock_irq(&xtime_lock);
14794+
14795+ /* Once per minute. */
14796+ mod_timer(&sync_xen_wallclock_timer, jiffies + 60*HZ);
14797+}
14798+
14799+static int set_rtc_mmss(unsigned long nowtime)
14800+{
14801+ int retval;
14802+
14803+ WARN_ON(irqs_disabled());
14804+
14805+ if (independent_wallclock || !is_initial_xendomain())
14806+ return 0;
14807+
14808+ /* gets recalled with irq locally disabled */
14809+ spin_lock_irq(&rtc_lock);
14810+ if (efi_enabled)
14811+ retval = efi_set_rtc_mmss(nowtime);
14812+ else
14813+ retval = mach_set_rtc_mmss(nowtime);
14814+ spin_unlock_irq(&rtc_lock);
14815+
14816+ return retval;
14817+}
14818+
14819+/* monotonic_clock(): returns # of nanoseconds passed since time_init()
14820+ * Note: This function is required to return accurate
14821+ * time even in the absence of multiple timer ticks.
14822+ */
14823+unsigned long long monotonic_clock(void)
14824+{
14825+ int cpu = get_cpu();
14826+ struct shadow_time_info *shadow = &per_cpu(shadow_time, cpu);
14827+ u64 time;
14828+ u32 local_time_version;
14829+
14830+ do {
14831+ local_time_version = shadow->version;
14832+ barrier();
14833+ time = shadow->system_timestamp + get_nsec_offset(shadow);
14834+ if (!time_values_up_to_date(cpu))
14835+ get_time_values_from_xen();
14836+ barrier();
14837+ } while (local_time_version != shadow->version);
14838+
14839+ put_cpu();
14840+
14841+ return time;
14842+}
14843+EXPORT_SYMBOL(monotonic_clock);
14844+
14845+unsigned long long sched_clock(void)
14846+{
14847+ return monotonic_clock();
14848+}
14849+
14850+#if defined(CONFIG_SMP) && defined(CONFIG_FRAME_POINTER)
14851+unsigned long profile_pc(struct pt_regs *regs)
14852+{
14853+ unsigned long pc = instruction_pointer(regs);
14854+
14855+#ifdef __x86_64__
14856+ /* Assume the lock function has either no stack frame or only a single word.
14857+ This checks if the address on the stack looks like a kernel text address.
14858+ There is a small window for false hits, but in that case the tick
14859+ is just accounted to the spinlock function.
14860+ Better would be to write these functions in assembler again
14861+ and check exactly. */
14862+ if (in_lock_functions(pc)) {
14863+ char *v = *(char **)regs->rsp;
14864+ if ((v >= _stext && v <= _etext) ||
14865+ (v >= _sinittext && v <= _einittext) ||
14866+ (v >= (char *)MODULES_VADDR && v <= (char *)MODULES_END))
14867+ return (unsigned long)v;
14868+ return ((unsigned long *)regs->rsp)[1];
14869+ }
14870+#else
14871+ if (in_lock_functions(pc))
14872+ return *(unsigned long *)(regs->ebp + 4);
14873+#endif
14874+
14875+ return pc;
14876+}
14877+EXPORT_SYMBOL(profile_pc);
14878+#endif
14879+
14880+irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
14881+{
14882+ s64 delta, delta_cpu, stolen, blocked;
14883+ u64 sched_time;
14884+ int i, cpu = smp_processor_id();
14885+ struct shadow_time_info *shadow = &per_cpu(shadow_time, cpu);
14886+ struct vcpu_runstate_info *runstate = &per_cpu(runstate, cpu);
14887+
14888+ write_seqlock(&xtime_lock);
14889+
14890+ do {
14891+ get_time_values_from_xen();
14892+
14893+ /* Obtain a consistent snapshot of elapsed wallclock cycles. */
14894+ delta = delta_cpu =
14895+ shadow->system_timestamp + get_nsec_offset(shadow);
14896+ delta -= processed_system_time;
14897+ delta_cpu -= per_cpu(processed_system_time, cpu);
14898+
14899+ /*
14900+ * Obtain a consistent snapshot of stolen/blocked cycles. We
14901+ * can use state_entry_time to detect if we get preempted here.
14902+ */
14903+ do {
14904+ sched_time = runstate->state_entry_time;
14905+ barrier();
14906+ stolen = runstate->time[RUNSTATE_runnable] +
14907+ runstate->time[RUNSTATE_offline] -
14908+ per_cpu(processed_stolen_time, cpu);
14909+ blocked = runstate->time[RUNSTATE_blocked] -
14910+ per_cpu(processed_blocked_time, cpu);
14911+ barrier();
14912+ } while (sched_time != runstate->state_entry_time);
14913+ } while (!time_values_up_to_date(cpu));
14914+
14915+ if ((unlikely(delta < -(s64)permitted_clock_jitter) ||
14916+ unlikely(delta_cpu < -(s64)permitted_clock_jitter))
14917+ && printk_ratelimit()) {
14918+ printk("Timer ISR/%d: Time went backwards: "
14919+ "delta=%lld delta_cpu=%lld shadow=%lld "
14920+ "off=%lld processed=%lld cpu_processed=%lld\n",
14921+ cpu, delta, delta_cpu, shadow->system_timestamp,
14922+ (s64)get_nsec_offset(shadow),
14923+ processed_system_time,
14924+ per_cpu(processed_system_time, cpu));
14925+ for (i = 0; i < num_online_cpus(); i++)
14926+ printk(" %d: %lld\n", i,
14927+ per_cpu(processed_system_time, i));
14928+ }
14929+
14930+ /* System-wide jiffy work. */
14931+ while (delta >= NS_PER_TICK) {
14932+ delta -= NS_PER_TICK;
14933+ processed_system_time += NS_PER_TICK;
14934+ do_timer(regs);
14935+ }
14936+
14937+ if (shadow_tv_version != HYPERVISOR_shared_info->wc_version) {
14938+ update_wallclock();
14939+ clock_was_set();
14940+ }
14941+
14942+ write_sequnlock(&xtime_lock);
14943+
14944+ /*
14945+ * Account stolen ticks.
14946+ * HACK: Passing NULL to account_steal_time()
14947+ * ensures that the ticks are accounted as stolen.
14948+ */
14949+ if ((stolen > 0) && (delta_cpu > 0)) {
14950+ delta_cpu -= stolen;
14951+ if (unlikely(delta_cpu < 0))
14952+ stolen += delta_cpu; /* clamp local-time progress */
14953+ do_div(stolen, NS_PER_TICK);
14954+ per_cpu(processed_stolen_time, cpu) += stolen * NS_PER_TICK;
14955+ per_cpu(processed_system_time, cpu) += stolen * NS_PER_TICK;
14956+ account_steal_time(NULL, (cputime_t)stolen);
14957+ }
14958+
14959+ /*
14960+ * Account blocked ticks.
14961+ * HACK: Passing idle_task to account_steal_time()
14962+ * ensures that the ticks are accounted as idle/wait.
14963+ */
14964+ if ((blocked > 0) && (delta_cpu > 0)) {
14965+ delta_cpu -= blocked;
14966+ if (unlikely(delta_cpu < 0))
14967+ blocked += delta_cpu; /* clamp local-time progress */
14968+ do_div(blocked, NS_PER_TICK);
14969+ per_cpu(processed_blocked_time, cpu) += blocked * NS_PER_TICK;
14970+ per_cpu(processed_system_time, cpu) += blocked * NS_PER_TICK;
14971+ account_steal_time(idle_task(cpu), (cputime_t)blocked);
14972+ }
14973+
14974+ /* Account user/system ticks. */
14975+ if (delta_cpu > 0) {
14976+ do_div(delta_cpu, NS_PER_TICK);
14977+ per_cpu(processed_system_time, cpu) += delta_cpu * NS_PER_TICK;
14978+ if (user_mode(regs))
14979+ account_user_time(current, (cputime_t)delta_cpu);
14980+ else
14981+ account_system_time(current, HARDIRQ_OFFSET,
14982+ (cputime_t)delta_cpu);
14983+ }
14984+
14985+ /* Offlined for more than a few seconds? Avoid lockup warnings. */
14986+ if (stolen > 5*HZ)
14987+ touch_softlockup_watchdog();
14988+
14989+ /* Local timer processing (see update_process_times()). */
14990+ run_local_timers();
14991+ if (rcu_pending(cpu))
14992+ rcu_check_callbacks(cpu, user_mode(regs));
14993+ scheduler_tick();
14994+ run_posix_cpu_timers(current);
14995+ profile_tick(CPU_PROFILING, regs);
14996+
14997+ return IRQ_HANDLED;
14998+}
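One note on the accounting arithmetic above: do_div() divides the 64-bit nanosecond delta in place by NS_PER_TICK, so only whole ticks are charged to the processed_stolen_time / processed_blocked_time counters and the fractional remainder is simply left to be picked up by a later interrupt. A standalone sketch of that rounding (illustrative; HZ=1000 is assumed purely for the example):

#include <stdint.h>
#include <stdio.h>

#define NS_PER_TICK_DEMO 1000000ULL	/* assumes HZ=1000 for illustration only */

int main(void)
{
	uint64_t stolen_ns = 3500000;			/* 3.5 ticks worth of stolen time */
	uint64_t ticks = stolen_ns / NS_PER_TICK_DEMO;	/* do_div() would leave 3 here */
	uint64_t carry = stolen_ns - ticks * NS_PER_TICK_DEMO;

	/* Only whole ticks advance the processed_* counters; the 0.5-tick
	 * remainder stays unaccounted and is charged on a later interrupt. */
	printf("charge %llu ticks now, carry %llu ns forward\n",
	       (unsigned long long)ticks, (unsigned long long)carry);
	return 0;
}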
14999+
15000+static void init_missing_ticks_accounting(int cpu)
15001+{
15002+ struct vcpu_register_runstate_memory_area area;
15003+ struct vcpu_runstate_info *runstate = &per_cpu(runstate, cpu);
15004+
15005+ memset(runstate, 0, sizeof(*runstate));
15006+
15007+ area.addr.v = runstate;
15008+ HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area, cpu, &area);
15009+
15010+ per_cpu(processed_blocked_time, cpu) =
15011+ runstate->time[RUNSTATE_blocked];
15012+ per_cpu(processed_stolen_time, cpu) =
15013+ runstate->time[RUNSTATE_runnable] +
15014+ runstate->time[RUNSTATE_offline];
15015+}
15016+
15017+/* not static: needed by APM */
15018+unsigned long get_cmos_time(void)
15019+{
15020+ unsigned long retval;
15021+
15022+ spin_lock(&rtc_lock);
15023+
15024+ if (efi_enabled)
15025+ retval = efi_get_time();
15026+ else
15027+ retval = mach_get_cmos_time();
15028+
15029+ spin_unlock(&rtc_lock);
15030+
15031+ return retval;
15032+}
15033+EXPORT_SYMBOL(get_cmos_time);
15034+
15035+static void sync_cmos_clock(unsigned long dummy);
15036+
15037+static DEFINE_TIMER(sync_cmos_timer, sync_cmos_clock, 0, 0);
15038+
15039+static void sync_cmos_clock(unsigned long dummy)
15040+{
15041+ struct timeval now, next;
15042+ int fail = 1;
15043+
15044+ /*
15045+ * If we have an externally synchronized Linux clock, then update
15046+ * CMOS clock accordingly every ~11 minutes. Set_rtc_mmss() has to be
15047+ * called as close as possible to 500 ms before the new second starts.
15048+ * This code is run on a timer. If the clock is set, that timer
15049+ * may not expire at the correct time. Thus, we adjust...
15050+ */
15051+ if (!ntp_synced())
15052+ /*
15053+ * Not synced, exit, do not restart a timer (if one is
15054+ * running, let it run out).
15055+ */
15056+ return;
15057+
15058+ do_gettimeofday(&now);
15059+ if (now.tv_usec >= USEC_AFTER - ((unsigned) TICK_SIZE) / 2 &&
15060+ now.tv_usec <= USEC_BEFORE + ((unsigned) TICK_SIZE) / 2)
15061+ fail = set_rtc_mmss(now.tv_sec);
15062+
15063+ next.tv_usec = USEC_AFTER - now.tv_usec;
15064+ if (next.tv_usec <= 0)
15065+ next.tv_usec += USEC_PER_SEC;
15066+
15067+ if (!fail)
15068+ next.tv_sec = 659;
15069+ else
15070+ next.tv_sec = 0;
15071+
15072+ if (next.tv_usec >= USEC_PER_SEC) {
15073+ next.tv_sec++;
15074+ next.tv_usec -= USEC_PER_SEC;
15075+ }
15076+ mod_timer(&sync_cmos_timer, jiffies + timeval_to_jiffies(&next));
15077+}
15078+
15079+void notify_arch_cmos_timer(void)
15080+{
15081+ mod_timer(&sync_cmos_timer, jiffies + 1);
15082+ mod_timer(&sync_xen_wallclock_timer, jiffies + 1);
15083+}
15084+
15085+static long clock_cmos_diff, sleep_start;
15086+
15087+static struct timer_opts *last_timer;
15088+static int timer_suspend(struct sys_device *dev, pm_message_t state)
15089+{
15090+ /*
15091+ * Estimate time zone so that set_time can update the clock
15092+ */
15093+ clock_cmos_diff = -get_cmos_time();
15094+ clock_cmos_diff += get_seconds();
15095+ sleep_start = get_cmos_time();
15096+ last_timer = cur_timer;
15097+ cur_timer = &timer_none;
15098+ if (last_timer->suspend)
15099+ last_timer->suspend(state);
15100+ return 0;
15101+}
15102+
15103+static int timer_resume(struct sys_device *dev)
15104+{
15105+ unsigned long flags;
15106+ unsigned long sec;
15107+ unsigned long sleep_length;
15108+
15109+#ifdef CONFIG_HPET_TIMER
15110+ if (is_hpet_enabled())
15111+ hpet_reenable();
15112+#endif
15113+ sec = get_cmos_time() + clock_cmos_diff;
15114+ sleep_length = (get_cmos_time() - sleep_start) * HZ;
15115+ write_seqlock_irqsave(&xtime_lock, flags);
15116+ xtime.tv_sec = sec;
15117+ xtime.tv_nsec = 0;
15118+ jiffies_64 += sleep_length;
15119+ wall_jiffies += sleep_length;
15120+ write_sequnlock_irqrestore(&xtime_lock, flags);
15121+ if (last_timer->resume)
15122+ last_timer->resume();
15123+ cur_timer = last_timer;
15124+ last_timer = NULL;
15125+ touch_softlockup_watchdog();
15126+ return 0;
15127+}
15128+
15129+static struct sysdev_class timer_sysclass = {
15130+ .resume = timer_resume,
15131+ .suspend = timer_suspend,
15132+ set_kset_name("timer"),
15133+};
15134+
15135+
15136+/* XXX this driverfs stuff should probably go elsewhere later -john */
15137+static struct sys_device device_timer = {
15138+ .id = 0,
15139+ .cls = &timer_sysclass,
15140+};
15141+
15142+static int time_init_device(void)
15143+{
15144+ int error = sysdev_class_register(&timer_sysclass);
15145+ if (!error)
15146+ error = sysdev_register(&device_timer);
15147+ return error;
15148+}
15149+
15150+device_initcall(time_init_device);
15151+
15152+#ifdef CONFIG_HPET_TIMER
15153+extern void (*late_time_init)(void);
15154+/* Duplicate of time_init() below, with hpet_enable part added */
15155+static void __init hpet_time_init(void)
15156+{
15157+ xtime.tv_sec = get_cmos_time();
15158+ xtime.tv_nsec = (INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ);
15159+ set_normalized_timespec(&wall_to_monotonic,
15160+ -xtime.tv_sec, -xtime.tv_nsec);
15161+
15162+ if ((hpet_enable() >= 0) && hpet_use_timer) {
15163+ printk("Using HPET for base-timer\n");
15164+ }
15165+
15166+ cur_timer = select_timer();
15167+ printk(KERN_INFO "Using %s for high-res timesource\n",cur_timer->name);
15168+
15169+ time_init_hook();
15170+}
15171+#endif
15172+
15173+/* Dynamically-mapped IRQ. */
15174+DEFINE_PER_CPU(int, timer_irq);
15175+
15176+extern void (*late_time_init)(void);
15177+static void setup_cpu0_timer_irq(void)
15178+{
15179+ per_cpu(timer_irq, 0) =
15180+ bind_virq_to_irqhandler(
15181+ VIRQ_TIMER,
15182+ 0,
15183+ timer_interrupt,
15184+ SA_INTERRUPT,
15185+ "timer0",
15186+ NULL);
15187+ BUG_ON(per_cpu(timer_irq, 0) < 0);
15188+}
15189+
15190+void __init time_init(void)
15191+{
15192+#ifdef CONFIG_HPET_TIMER
15193+ if (is_hpet_capable()) {
15194+ /*
15195+ * HPET initialization needs to do memory-mapped io. So, let
15196+ * us do a late initialization after mem_init().
15197+ */
15198+ late_time_init = hpet_time_init;
15199+ return;
15200+ }
15201+#endif
15202+ get_time_values_from_xen();
15203+
15204+ processed_system_time = per_cpu(shadow_time, 0).system_timestamp;
15205+ per_cpu(processed_system_time, 0) = processed_system_time;
15206+ init_missing_ticks_accounting(0);
15207+
15208+ update_wallclock();
15209+
15210+ init_cpu_khz();
15211+ printk(KERN_INFO "Xen reported: %u.%03u MHz processor.\n",
15212+ cpu_khz / 1000, cpu_khz % 1000);
15213+
15214+#if defined(__x86_64__)
15215+ vxtime.mode = VXTIME_TSC;
15216+ vxtime.quot = (1000000L << 32) / vxtime_hz;
15217+ vxtime.tsc_quot = (1000L << 32) / cpu_khz;
15218+ sync_core();
15219+ rdtscll(vxtime.last_tsc);
15220+#endif
15221+
15222+ /* Cannot request_irq() until kmem is initialised. */
15223+ late_time_init = setup_cpu0_timer_irq;
15224+}
15225+
15226+/* Convert jiffies to system time. */
15227+u64 jiffies_to_st(unsigned long j)
15228+{
15229+ unsigned long seq;
15230+ long delta;
15231+ u64 st;
15232+
15233+ do {
15234+ seq = read_seqbegin(&xtime_lock);
15235+ delta = j - jiffies;
15236+ if (delta < 1) {
15237+ /* Triggers in some wrap-around cases, but that's okay:
15238+ * we just end up with a shorter timeout. */
15239+ st = processed_system_time + NS_PER_TICK;
15240+ } else if (((unsigned long)delta >> (BITS_PER_LONG-3)) != 0) {
15241+ /* Very long timeout means there is no pending timer.
15242+ * We indicate this to Xen by passing zero timeout. */
15243+ st = 0;
15244+ } else {
15245+ st = processed_system_time + delta * (u64)NS_PER_TICK;
15246+ }
15247+ } while (read_seqretry(&xtime_lock, seq));
15248+
15249+ return st;
15250+}
15251+EXPORT_SYMBOL(jiffies_to_st);
15252+
15253+/*
15254+ * stop_hz_timer / start_hz_timer - enter/exit 'tickless mode' on an idle cpu
15255+ * These functions are based on implementations from arch/s390/kernel/time.c
15256+ */
15257+static void stop_hz_timer(void)
15258+{
15259+ unsigned int cpu = smp_processor_id();
15260+ unsigned long j;
15261+
15262+ cpu_set(cpu, nohz_cpu_mask);
15263+
15264+ /* See matching smp_mb in rcu_start_batch in rcupdate.c. These mbs */
15265+ /* ensure that if __rcu_pending (nested in rcu_needs_cpu) fetches a */
15266+ /* value of rcp->cur that matches rdp->quiescbatch and allows us to */
15267+ /* stop the hz timer then the cpumasks created for subsequent values */
15268+ /* of cur in rcu_start_batch are guaranteed to pick up the updated */
15269+ /* nohz_cpu_mask and so will not depend on this cpu. */
15270+
15271+ smp_mb();
15272+
15273+ /* Leave ourselves in tick mode if rcu or softirq or timer pending. */
15274+ if (rcu_needs_cpu(cpu) || local_softirq_pending() ||
15275+ (j = next_timer_interrupt(), time_before_eq(j, jiffies))) {
15276+ cpu_clear(cpu, nohz_cpu_mask);
15277+ j = jiffies + 1;
15278+ }
15279+
15280+ if (HYPERVISOR_set_timer_op(jiffies_to_st(j)) != 0)
15281+ BUG();
15282+}
15283+
15284+static void start_hz_timer(void)
15285+{
15286+ cpu_clear(smp_processor_id(), nohz_cpu_mask);
15287+}
15288+
15289+void safe_halt(void)
15290+{
15291+ stop_hz_timer();
15292+ /* Blocking includes an implicit local_irq_enable(). */
15293+ HYPERVISOR_block();
15294+ start_hz_timer();
15295+}
15296+EXPORT_SYMBOL(safe_halt);
15297+
15298+void halt(void)
15299+{
15300+ if (irqs_disabled())
15301+ HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL);
15302+}
15303+EXPORT_SYMBOL(halt);
15304+
15305+/* No locking required. We are only CPU running, and interrupts are off. */
15306+void time_resume(void)
15307+{
15308+ init_cpu_khz();
15309+
15310+ get_time_values_from_xen();
15311+
15312+ processed_system_time = per_cpu(shadow_time, 0).system_timestamp;
15313+ per_cpu(processed_system_time, 0) = processed_system_time;
15314+ init_missing_ticks_accounting(0);
15315+
15316+ update_wallclock();
15317+}
15318+
15319+#ifdef CONFIG_SMP
15320+static char timer_name[NR_CPUS][15];
15321+
15322+int local_setup_timer(unsigned int cpu)
15323+{
15324+ int seq, irq;
15325+
15326+ BUG_ON(cpu == 0);
15327+
15328+ do {
15329+ seq = read_seqbegin(&xtime_lock);
15330+ /* Use cpu0 timestamp: cpu's shadow is not initialised yet. */
15331+ per_cpu(processed_system_time, cpu) =
15332+ per_cpu(shadow_time, 0).system_timestamp;
15333+ init_missing_ticks_accounting(cpu);
15334+ } while (read_seqretry(&xtime_lock, seq));
15335+
15336+ sprintf(timer_name[cpu], "timer%d", cpu);
15337+ irq = bind_virq_to_irqhandler(VIRQ_TIMER,
15338+ cpu,
15339+ timer_interrupt,
15340+ SA_INTERRUPT,
15341+ timer_name[cpu],
15342+ NULL);
15343+ if (irq < 0)
15344+ return irq;
15345+ per_cpu(timer_irq, cpu) = irq;
15346+
15347+ return 0;
15348+}
15349+
15350+void local_teardown_timer(unsigned int cpu)
15351+{
15352+ BUG_ON(cpu == 0);
15353+ unbind_from_irqhandler(per_cpu(timer_irq, cpu), NULL);
15354+}
15355+#endif
15356+
15357+/*
15358+ * /proc/sys/xen: This really belongs in another file. It can stay here for
15359+ * now however.
15360+ */
15361+static ctl_table xen_subtable[] = {
15362+ {
15363+ .ctl_name = 1,
15364+ .procname = "independent_wallclock",
15365+ .data = &independent_wallclock,
15366+ .maxlen = sizeof(independent_wallclock),
15367+ .mode = 0644,
15368+ .proc_handler = proc_dointvec
15369+ },
15370+ {
15371+ .ctl_name = 2,
15372+ .procname = "permitted_clock_jitter",
15373+ .data = &permitted_clock_jitter,
15374+ .maxlen = sizeof(permitted_clock_jitter),
15375+ .mode = 0644,
15376+ .proc_handler = proc_doulongvec_minmax
15377+ },
15378+ { 0 }
15379+};
15380+static ctl_table xen_table[] = {
15381+ {
15382+ .ctl_name = 123,
15383+ .procname = "xen",
15384+ .mode = 0555,
15385+ .child = xen_subtable},
15386+ { 0 }
15387+};
15388+static int __init xen_sysctl_init(void)
15389+{
15390+ (void)register_sysctl_table(xen_table, 0);
15391+ return 0;
15392+}
15393+__initcall(xen_sysctl_init);
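For completeness: the sysctl table just added shows up as writable files under /proc/sys/xen/ on a kernel carrying this patch. A hedged userspace sketch of flipping the wallclock knob (minimal error handling, illustration only):

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/xen/independent_wallclock", "w");

	if (!f) {
		perror("independent_wallclock");
		return 1;
	}
	fputs("1\n", f);	/* let this domain track its own wallclock */
	return fclose(f) ? 1 : 0;
}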
15394diff -Nur linux-2.6.16.33-noxen/arch/i386/kernel/traps-xen.c linux-2.6.16.33/arch/i386/kernel/traps-xen.c
15395--- linux-2.6.16.33-noxen/arch/i386/kernel/traps-xen.c 1970-01-01 00:00:00.000000000 +0000
15396+++ linux-2.6.16.33/arch/i386/kernel/traps-xen.c 2007-01-08 15:00:45.000000000 +0000
15397@@ -0,0 +1,1094 @@
15398+/*
15399+ * linux/arch/i386/traps.c
15400+ *
15401+ * Copyright (C) 1991, 1992 Linus Torvalds
15402+ *
15403+ * Pentium III FXSR, SSE support
15404+ * Gareth Hughes <gareth@valinux.com>, May 2000
15405+ */
15406+
15407+/*
15408+ * 'Traps.c' handles hardware traps and faults after we have saved some
15409+ * state in 'asm.s'.
15410+ */
15411+#include <linux/config.h>
15412+#include <linux/sched.h>
15413+#include <linux/kernel.h>
15414+#include <linux/string.h>
15415+#include <linux/errno.h>
15416+#include <linux/timer.h>
15417+#include <linux/mm.h>
15418+#include <linux/init.h>
15419+#include <linux/delay.h>
15420+#include <linux/spinlock.h>
15421+#include <linux/interrupt.h>
15422+#include <linux/highmem.h>
15423+#include <linux/kallsyms.h>
15424+#include <linux/ptrace.h>
15425+#include <linux/utsname.h>
15426+#include <linux/kprobes.h>
15427+#include <linux/kexec.h>
15428+
15429+#ifdef CONFIG_EISA
15430+#include <linux/ioport.h>
15431+#include <linux/eisa.h>
15432+#endif
15433+
15434+#ifdef CONFIG_MCA
15435+#include <linux/mca.h>
15436+#endif
15437+
15438+#include <asm/processor.h>
15439+#include <asm/system.h>
15440+#include <asm/uaccess.h>
15441+#include <asm/io.h>
15442+#include <asm/atomic.h>
15443+#include <asm/debugreg.h>
15444+#include <asm/desc.h>
15445+#include <asm/i387.h>
15446+#include <asm/nmi.h>
15447+
15448+#include <asm/smp.h>
15449+#include <asm/arch_hooks.h>
15450+#include <asm/kdebug.h>
15451+
15452+#include <linux/module.h>
15453+
15454+#include "mach_traps.h"
15455+
15456+asmlinkage int system_call(void);
15457+
15458+struct desc_struct default_ldt[] = { { 0, 0 }, { 0, 0 }, { 0, 0 },
15459+ { 0, 0 }, { 0, 0 } };
15460+
15461+/* Do we ignore FPU interrupts ? */
15462+char ignore_fpu_irq = 0;
15463+
15464+#ifndef CONFIG_X86_NO_IDT
15465+/*
15466+ * The IDT has to be page-aligned to simplify the Pentium
15467+ * F0 0F bug workaround.. We have a special link segment
15468+ * for this.
15469+ */
15470+struct desc_struct idt_table[256] __attribute__((__section__(".data.idt"))) = { {0, 0}, };
15471+#endif
15472+
15473+asmlinkage void divide_error(void);
15474+asmlinkage void debug(void);
15475+asmlinkage void nmi(void);
15476+asmlinkage void int3(void);
15477+asmlinkage void overflow(void);
15478+asmlinkage void bounds(void);
15479+asmlinkage void invalid_op(void);
15480+asmlinkage void device_not_available(void);
15481+asmlinkage void coprocessor_segment_overrun(void);
15482+asmlinkage void invalid_TSS(void);
15483+asmlinkage void segment_not_present(void);
15484+asmlinkage void stack_segment(void);
15485+asmlinkage void general_protection(void);
15486+asmlinkage void page_fault(void);
15487+asmlinkage void coprocessor_error(void);
15488+asmlinkage void simd_coprocessor_error(void);
15489+asmlinkage void alignment_check(void);
15490+#ifndef CONFIG_XEN
15491+asmlinkage void spurious_interrupt_bug(void);
15492+#else
15493+asmlinkage void fixup_4gb_segment(void);
15494+#endif
15495+asmlinkage void machine_check(void);
15496+
15497+static int kstack_depth_to_print = 24;
15498+struct notifier_block *i386die_chain;
15499+static DEFINE_SPINLOCK(die_notifier_lock);
15500+
15501+int register_die_notifier(struct notifier_block *nb)
15502+{
15503+ int err = 0;
15504+ unsigned long flags;
15505+ spin_lock_irqsave(&die_notifier_lock, flags);
15506+ err = notifier_chain_register(&i386die_chain, nb);
15507+ spin_unlock_irqrestore(&die_notifier_lock, flags);
15508+ return err;
15509+}
15510+EXPORT_SYMBOL(register_die_notifier);
15511+
15512+static inline int valid_stack_ptr(struct thread_info *tinfo, void *p)
15513+{
15514+ return p > (void *)tinfo &&
15515+ p < (void *)tinfo + THREAD_SIZE - 3;
15516+}
15517+
15518+static void print_addr_and_symbol(unsigned long addr, char *log_lvl)
15519+{
15520+ printk(log_lvl);
15521+ printk(" [<%08lx>] ", addr);
15522+ print_symbol("%s", addr);
15523+ printk("\n");
15524+}
15525+
15526+static inline unsigned long print_context_stack(struct thread_info *tinfo,
15527+ unsigned long *stack, unsigned long ebp,
15528+ char *log_lvl)
15529+{
15530+ unsigned long addr;
15531+
15532+#ifdef CONFIG_FRAME_POINTER
15533+ while (valid_stack_ptr(tinfo, (void *)ebp)) {
15534+ addr = *(unsigned long *)(ebp + 4);
15535+ print_addr_and_symbol(addr, log_lvl);
15536+ ebp = *(unsigned long *)ebp;
15537+ }
15538+#else
15539+ while (valid_stack_ptr(tinfo, stack)) {
15540+ addr = *stack++;
15541+ if (__kernel_text_address(addr))
15542+ print_addr_and_symbol(addr, log_lvl);
15543+ }
15544+#endif
15545+ return ebp;
15546+}
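For readers unfamiliar with the CONFIG_FRAME_POINTER walk above: it assumes the standard i386 prologue (push %ebp; mov %esp,%ebp), which produces the frame layout sketched in the comment below.

/*
 *   [ebp + 4]  return address in the caller  -> handed to print_addr_and_symbol()
 *   [ebp + 0]  caller's saved ebp            -> next link in the frame chain
 */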
15547+
15548+static void show_trace_log_lvl(struct task_struct *task,
15549+ unsigned long *stack, char *log_lvl)
15550+{
15551+ unsigned long ebp;
15552+
15553+ if (!task)
15554+ task = current;
15555+
15556+ if (task == current) {
15557+ /* Grab ebp right from our regs */
15558+ asm ("movl %%ebp, %0" : "=r" (ebp) : );
15559+ } else {
15560+ /* ebp is the last reg pushed by switch_to */
15561+ ebp = *(unsigned long *) task->thread.esp;
15562+ }
15563+
15564+ while (1) {
15565+ struct thread_info *context;
15566+ context = (struct thread_info *)
15567+ ((unsigned long)stack & (~(THREAD_SIZE - 1)));
15568+ ebp = print_context_stack(context, stack, ebp, log_lvl);
15569+ stack = (unsigned long*)context->previous_esp;
15570+ if (!stack)
15571+ break;
15572+ printk(log_lvl);
15573+ printk(" =======================\n");
15574+ }
15575+}
15576+
15577+void show_trace(struct task_struct *task, unsigned long * stack)
15578+{
15579+ show_trace_log_lvl(task, stack, "");
15580+}
15581+
15582+static void show_stack_log_lvl(struct task_struct *task, unsigned long *esp,
15583+ char *log_lvl)
15584+{
15585+ unsigned long *stack;
15586+ int i;
15587+
15588+ if (esp == NULL) {
15589+ if (task)
15590+ esp = (unsigned long*)task->thread.esp;
15591+ else
15592+ esp = (unsigned long *)&esp;
15593+ }
15594+
15595+ stack = esp;
15596+ printk(log_lvl);
15597+ for(i = 0; i < kstack_depth_to_print; i++) {
15598+ if (kstack_end(stack))
15599+ break;
15600+ if (i && ((i % 8) == 0)) {
15601+ printk("\n");
15602+ printk(log_lvl);
15603+ printk(" ");
15604+ }
15605+ printk("%08lx ", *stack++);
15606+ }
15607+ printk("\n");
15608+ printk(log_lvl);
15609+ printk("Call Trace:\n");
15610+ show_trace_log_lvl(task, esp, log_lvl);
15611+}
15612+
15613+void show_stack(struct task_struct *task, unsigned long *esp)
15614+{
15615+ show_stack_log_lvl(task, esp, "");
15616+}
15617+
15618+/*
15619+ * The architecture-independent dump_stack generator
15620+ */
15621+void dump_stack(void)
15622+{
15623+ unsigned long stack;
15624+
15625+ show_trace(current, &stack);
15626+}
15627+
15628+EXPORT_SYMBOL(dump_stack);
15629+
15630+void show_registers(struct pt_regs *regs)
15631+{
15632+ int i;
15633+ int in_kernel = 1;
15634+ unsigned long esp;
15635+ unsigned short ss;
15636+
15637+ esp = (unsigned long) (&regs->esp);
15638+ savesegment(ss, ss);
15639+ if (user_mode(regs)) {
15640+ in_kernel = 0;
15641+ esp = regs->esp;
15642+ ss = regs->xss & 0xffff;
15643+ }
15644+ print_modules();
15645+ printk(KERN_EMERG "CPU: %d\nEIP: %04x:[<%08lx>] %s VLI\n"
15646+ "EFLAGS: %08lx (%s %.*s) \n",
15647+ smp_processor_id(), 0xffff & regs->xcs, regs->eip,
15648+ print_tainted(), regs->eflags, system_utsname.release,
15649+ (int)strcspn(system_utsname.version, " "),
15650+ system_utsname.version);
15651+ print_symbol(KERN_EMERG "EIP is at %s\n", regs->eip);
15652+ printk(KERN_EMERG "eax: %08lx ebx: %08lx ecx: %08lx edx: %08lx\n",
15653+ regs->eax, regs->ebx, regs->ecx, regs->edx);
15654+ printk(KERN_EMERG "esi: %08lx edi: %08lx ebp: %08lx esp: %08lx\n",
15655+ regs->esi, regs->edi, regs->ebp, esp);
15656+ printk(KERN_EMERG "ds: %04x es: %04x ss: %04x\n",
15657+ regs->xds & 0xffff, regs->xes & 0xffff, ss);
15658+ printk(KERN_EMERG "Process %s (pid: %d, threadinfo=%p task=%p)",
15659+ current->comm, current->pid, current_thread_info(), current);
15660+ /*
15661+ * When in-kernel, we also print out the stack and code at the
15662+ * time of the fault..
15663+ */
15664+ if (in_kernel) {
15665+ u8 __user *eip;
15666+
15667+ printk("\n" KERN_EMERG "Stack: ");
15668+ show_stack_log_lvl(NULL, (unsigned long *)esp, KERN_EMERG);
15669+
15670+ printk(KERN_EMERG "Code: ");
15671+
15672+ eip = (u8 __user *)regs->eip - 43;
15673+ for (i = 0; i < 64; i++, eip++) {
15674+ unsigned char c;
15675+
15676+ if (eip < (u8 __user *)PAGE_OFFSET || __get_user(c, eip)) {
15677+ printk(" Bad EIP value.");
15678+ break;
15679+ }
15680+ if (eip == (u8 __user *)regs->eip)
15681+ printk("<%02x> ", c);
15682+ else
15683+ printk("%02x ", c);
15684+ }
15685+ }
15686+ printk("\n");
15687+}
15688+
15689+static void handle_BUG(struct pt_regs *regs)
15690+{
15691+ unsigned short ud2;
15692+ unsigned short line;
15693+ char *file;
15694+ char c;
15695+ unsigned long eip;
15696+
15697+ eip = regs->eip;
15698+
15699+ if (eip < PAGE_OFFSET)
15700+ goto no_bug;
15701+ if (__get_user(ud2, (unsigned short __user *)eip))
15702+ goto no_bug;
15703+ if (ud2 != 0x0b0f)
15704+ goto no_bug;
15705+ if (__get_user(line, (unsigned short __user *)(eip + 2)))
15706+ goto bug;
15707+ if (__get_user(file, (char * __user *)(eip + 4)) ||
15708+ (unsigned long)file < PAGE_OFFSET || __get_user(c, file))
15709+ file = "<bad filename>";
15710+
15711+ printk(KERN_EMERG "------------[ cut here ]------------\n");
15712+ printk(KERN_EMERG "kernel BUG at %s:%d!\n", file, line);
15713+
15714+no_bug:
15715+ return;
15716+
15717+ /* Here we know it was a BUG but file-n-line is unavailable */
15718+bug:
15719+ printk(KERN_EMERG "Kernel BUG\n");
15720+}
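handle_BUG() above decodes the record that the i386 BUG() macro lays down when verbose BUG reporting is enabled. Roughly — this is a sketch of the layout the decoder expects, not the exact header text — the macro emits:

#define BUG_SKETCH()								\
	__asm__ __volatile__("ud2\n"		/* bytes 0x0f 0x0b, read back as 0x0b0f */	\
			     "\t.word %c0\n"	/* line number, fetched from eip + 2 */		\
			     "\t.long %c1\n"	/* file-name pointer, fetched from eip + 4 */	\
			     : : "i" (__LINE__), "i" (__FILE__))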
15721+
15722+/* This is gone through when something in the kernel
15723+ * has done something bad and is about to be terminated.
15724+*/
15725+void die(const char * str, struct pt_regs * regs, long err)
15726+{
15727+ static struct {
15728+ spinlock_t lock;
15729+ u32 lock_owner;
15730+ int lock_owner_depth;
15731+ } die = {
15732+ .lock = SPIN_LOCK_UNLOCKED,
15733+ .lock_owner = -1,
15734+ .lock_owner_depth = 0
15735+ };
15736+ static int die_counter;
15737+ unsigned long flags;
15738+
15739+ if (die.lock_owner != raw_smp_processor_id()) {
15740+ console_verbose();
15741+ spin_lock_irqsave(&die.lock, flags);
15742+ die.lock_owner = smp_processor_id();
15743+ die.lock_owner_depth = 0;
15744+ bust_spinlocks(1);
15745+ }
15746+ else
15747+ local_save_flags(flags);
15748+
15749+ if (++die.lock_owner_depth < 3) {
15750+ int nl = 0;
15751+ handle_BUG(regs);
15752+ printk(KERN_EMERG "%s: %04lx [#%d]\n", str, err & 0xffff, ++die_counter);
15753+#ifdef CONFIG_PREEMPT
15754+ printk(KERN_EMERG "PREEMPT ");
15755+ nl = 1;
15756+#endif
15757+#ifdef CONFIG_SMP
15758+ if (!nl)
15759+ printk(KERN_EMERG);
15760+ printk("SMP ");
15761+ nl = 1;
15762+#endif
15763+#ifdef CONFIG_DEBUG_PAGEALLOC
15764+ if (!nl)
15765+ printk(KERN_EMERG);
15766+ printk("DEBUG_PAGEALLOC");
15767+ nl = 1;
15768+#endif
15769+ if (nl)
15770+ printk("\n");
15771+ notify_die(DIE_OOPS, (char *)str, regs, err, 255, SIGSEGV);
15772+ show_registers(regs);
15773+ } else
15774+ printk(KERN_EMERG "Recursive die() failure, output suppressed\n");
15775+
15776+ bust_spinlocks(0);
15777+ die.lock_owner = -1;
15778+ spin_unlock_irqrestore(&die.lock, flags);
15779+
15780+ if (kexec_should_crash(current))
15781+ crash_kexec(regs);
15782+
15783+ if (in_interrupt())
15784+ panic("Fatal exception in interrupt");
15785+
15786+ if (panic_on_oops) {
15787+ printk(KERN_EMERG "Fatal exception: panic in 5 seconds\n");
15788+ ssleep(5);
15789+ panic("Fatal exception");
15790+ }
15791+ do_exit(SIGSEGV);
15792+}
15793+
15794+static inline void die_if_kernel(const char * str, struct pt_regs * regs, long err)
15795+{
15796+ if (!user_mode_vm(regs))
15797+ die(str, regs, err);
15798+}
15799+
15800+static void __kprobes do_trap(int trapnr, int signr, char *str, int vm86,
15801+ struct pt_regs * regs, long error_code,
15802+ siginfo_t *info)
15803+{
15804+ struct task_struct *tsk = current;
15805+ tsk->thread.error_code = error_code;
15806+ tsk->thread.trap_no = trapnr;
15807+
15808+ if (regs->eflags & VM_MASK) {
15809+ if (vm86)
15810+ goto vm86_trap;
15811+ goto trap_signal;
15812+ }
15813+
15814+ if (!user_mode(regs))
15815+ goto kernel_trap;
15816+
15817+ trap_signal: {
15818+ if (info)
15819+ force_sig_info(signr, info, tsk);
15820+ else
15821+ force_sig(signr, tsk);
15822+ return;
15823+ }
15824+
15825+ kernel_trap: {
15826+ if (!fixup_exception(regs))
15827+ die(str, regs, error_code);
15828+ return;
15829+ }
15830+
15831+ vm86_trap: {
15832+ int ret = handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, trapnr);
15833+ if (ret) goto trap_signal;
15834+ return;
15835+ }
15836+}
15837+
15838+#define DO_ERROR(trapnr, signr, str, name) \
15839+fastcall void do_##name(struct pt_regs * regs, long error_code) \
15840+{ \
15841+ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
15842+ == NOTIFY_STOP) \
15843+ return; \
15844+ do_trap(trapnr, signr, str, 0, regs, error_code, NULL); \
15845+}
15846+
15847+#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
15848+fastcall void do_##name(struct pt_regs * regs, long error_code) \
15849+{ \
15850+ siginfo_t info; \
15851+ info.si_signo = signr; \
15852+ info.si_errno = 0; \
15853+ info.si_code = sicode; \
15854+ info.si_addr = (void __user *)siaddr; \
15855+ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
15856+ == NOTIFY_STOP) \
15857+ return; \
15858+ do_trap(trapnr, signr, str, 0, regs, error_code, &info); \
15859+}
15860+
15861+#define DO_VM86_ERROR(trapnr, signr, str, name) \
15862+fastcall void do_##name(struct pt_regs * regs, long error_code) \
15863+{ \
15864+ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
15865+ == NOTIFY_STOP) \
15866+ return; \
15867+ do_trap(trapnr, signr, str, 1, regs, error_code, NULL); \
15868+}
15869+
15870+#define DO_VM86_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
15871+fastcall void do_##name(struct pt_regs * regs, long error_code) \
15872+{ \
15873+ siginfo_t info; \
15874+ info.si_signo = signr; \
15875+ info.si_errno = 0; \
15876+ info.si_code = sicode; \
15877+ info.si_addr = (void __user *)siaddr; \
15878+ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
15879+ == NOTIFY_STOP) \
15880+ return; \
15881+ do_trap(trapnr, signr, str, 1, regs, error_code, &info); \
15882+}
15883+
15884+DO_VM86_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->eip)
15885+#ifndef CONFIG_KPROBES
15886+DO_VM86_ERROR( 3, SIGTRAP, "int3", int3)
15887+#endif
15888+DO_VM86_ERROR( 4, SIGSEGV, "overflow", overflow)
15889+DO_VM86_ERROR( 5, SIGSEGV, "bounds", bounds)
15890+DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->eip)
15891+DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
15892+DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
15893+DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
15894+DO_ERROR(12, SIGBUS, "stack segment", stack_segment)
15895+DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0)
15896+DO_ERROR_INFO(32, SIGSEGV, "iret exception", iret_error, ILL_BADSTK, 0)
15897+
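For reference, each entry above is the corresponding macro stamped out once per trap; e.g. DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) expands to a handler equivalent to:

fastcall void do_invalid_TSS(struct pt_regs * regs, long error_code)
{
	if (notify_die(DIE_TRAP, "invalid TSS", regs, error_code, 10, SIGSEGV)
			== NOTIFY_STOP)
		return;
	do_trap(10, SIGSEGV, "invalid TSS", 0, regs, error_code, NULL);
}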
15898+fastcall void __kprobes do_general_protection(struct pt_regs * regs,
15899+ long error_code)
15900+{
15901+ current->thread.error_code = error_code;
15902+ current->thread.trap_no = 13;
15903+
15904+ if (regs->eflags & VM_MASK)
15905+ goto gp_in_vm86;
15906+
15907+ if (!user_mode(regs))
15908+ goto gp_in_kernel;
15909+
15910+ current->thread.error_code = error_code;
15911+ current->thread.trap_no = 13;
15912+ force_sig(SIGSEGV, current);
15913+ return;
15914+
15915+gp_in_vm86:
15916+ local_irq_enable();
15917+ handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code);
15918+ return;
15919+
15920+gp_in_kernel:
15921+ if (!fixup_exception(regs)) {
15922+ if (notify_die(DIE_GPF, "general protection fault", regs,
15923+ error_code, 13, SIGSEGV) == NOTIFY_STOP)
15924+ return;
15925+ die("general protection fault", regs, error_code);
15926+ }
15927+}
15928+
15929+static void mem_parity_error(unsigned char reason, struct pt_regs * regs)
15930+{
15931+ printk(KERN_EMERG "Uhhuh. NMI received. Dazed and confused, but trying "
15932+ "to continue\n");
15933+ printk(KERN_EMERG "You probably have a hardware problem with your RAM "
15934+ "chips\n");
15935+
15936+ /* Clear and disable the memory parity error line. */
15937+ clear_mem_error(reason);
15938+}
15939+
15940+static void io_check_error(unsigned char reason, struct pt_regs * regs)
15941+{
15942+ printk(KERN_EMERG "NMI: IOCK error (debug interrupt?)\n");
15943+ show_registers(regs);
15944+
15945+ /* Re-enable the IOCK line, wait for a few seconds */
15946+ clear_io_check_error(reason);
15947+}
15948+
15949+static void unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
15950+{
15951+#ifdef CONFIG_MCA
15952+ /* Might actually be able to figure out what the guilty party
15953+ * is. */
15954+ if( MCA_bus ) {
15955+ mca_handle_nmi();
15956+ return;
15957+ }
15958+#endif
15959+ printk("Uhhuh. NMI received for unknown reason %02x on CPU %d.\n",
15960+ reason, smp_processor_id());
15961+ printk("Dazed and confused, but trying to continue\n");
15962+ printk("Do you have a strange power saving mode enabled?\n");
15963+}
15964+
15965+static DEFINE_SPINLOCK(nmi_print_lock);
15966+
15967+void die_nmi (struct pt_regs *regs, const char *msg)
15968+{
15969+ if (notify_die(DIE_NMIWATCHDOG, msg, regs, 0, 0, SIGINT) ==
15970+ NOTIFY_STOP)
15971+ return;
15972+
15973+ spin_lock(&nmi_print_lock);
15974+ /*
15975+	 * We are in trouble anyway, let's at least try
15976+ * to get a message out.
15977+ */
15978+ bust_spinlocks(1);
15979+ printk(KERN_EMERG "%s", msg);
15980+ printk(" on CPU%d, eip %08lx, registers:\n",
15981+ smp_processor_id(), regs->eip);
15982+ show_registers(regs);
15983+ printk(KERN_EMERG "console shuts up ...\n");
15984+ console_silent();
15985+ spin_unlock(&nmi_print_lock);
15986+ bust_spinlocks(0);
15987+
15988+ /* If we are in kernel we are probably nested up pretty bad
15989+	 * and might as well get out now while we still can.
15990+ */
15991+ if (!user_mode(regs)) {
15992+ current->thread.trap_no = 2;
15993+ crash_kexec(regs);
15994+ }
15995+
15996+ do_exit(SIGSEGV);
15997+}
15998+
15999+static void default_do_nmi(struct pt_regs * regs)
16000+{
16001+ unsigned char reason = 0;
16002+
16003+ /* Only the BSP gets external NMIs from the system. */
16004+ if (!smp_processor_id())
16005+ reason = get_nmi_reason();
16006+
16007+ if (!(reason & 0xc0)) {
16008+ if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 0, SIGINT)
16009+ == NOTIFY_STOP)
16010+ return;
16011+#ifdef CONFIG_X86_LOCAL_APIC
16012+ /*
16013+ * Ok, so this is none of the documented NMI sources,
16014+ * so it must be the NMI watchdog.
16015+ */
16016+ if (nmi_watchdog) {
16017+ nmi_watchdog_tick(regs);
16018+ return;
16019+ }
16020+#endif
16021+ unknown_nmi_error(reason, regs);
16022+ return;
16023+ }
16024+ if (notify_die(DIE_NMI, "nmi", regs, reason, 0, SIGINT) == NOTIFY_STOP)
16025+ return;
16026+ if (reason & 0x80)
16027+ mem_parity_error(reason, regs);
16028+ if (reason & 0x40)
16029+ io_check_error(reason, regs);
16030+ /*
16031+ * Reassert NMI in case it became active meanwhile
16032+ * as it's edge-triggered.
16033+ */
16034+ reassert_nmi();
16035+}
16036+
16037+static int dummy_nmi_callback(struct pt_regs * regs, int cpu)
16038+{
16039+ return 0;
16040+}
16041+
16042+static nmi_callback_t nmi_callback = dummy_nmi_callback;
16043+
16044+fastcall void do_nmi(struct pt_regs * regs, long error_code)
16045+{
16046+ int cpu;
16047+
16048+ nmi_enter();
16049+
16050+ cpu = smp_processor_id();
16051+
16052+ ++nmi_count(cpu);
16053+
16054+ if (!rcu_dereference(nmi_callback)(regs, cpu))
16055+ default_do_nmi(regs);
16056+
16057+ nmi_exit();
16058+}
16059+
16060+void set_nmi_callback(nmi_callback_t callback)
16061+{
16062+ rcu_assign_pointer(nmi_callback, callback);
16063+}
16064+EXPORT_SYMBOL_GPL(set_nmi_callback);
16065+
16066+void unset_nmi_callback(void)
16067+{
16068+ nmi_callback = dummy_nmi_callback;
16069+}
16070+EXPORT_SYMBOL_GPL(unset_nmi_callback);
16071+
16072+#ifdef CONFIG_KPROBES
16073+fastcall void __kprobes do_int3(struct pt_regs *regs, long error_code)
16074+{
16075+ if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP)
16076+ == NOTIFY_STOP)
16077+ return;
16078+ /* This is an interrupt gate, because kprobes wants interrupts
16079+ disabled. Normal trap handlers don't. */
16080+ restore_interrupts(regs);
16081+ do_trap(3, SIGTRAP, "int3", 1, regs, error_code, NULL);
16082+}
16083+#endif
16084+
16085+/*
16086+ * Our handling of the processor debug registers is non-trivial.
16087+ * We do not clear them on entry and exit from the kernel. Therefore
16088+ * it is possible to get a watchpoint trap here from inside the kernel.
16089+ * However, the code in ./ptrace.c has ensured that the user can
16090+ * only set watchpoints on userspace addresses. Therefore the in-kernel
16091+ * watchpoint trap can only occur in code which is reading/writing
16092+ * from user space. Such code must not hold kernel locks (since it
16093+ * can equally take a page fault), therefore it is safe to call
16094+ * force_sig_info even though that claims and releases locks.
16095+ *
16096+ * Code in ./signal.c ensures that the debug control register
16097+ * is restored before we deliver any signal, and therefore that
16098+ * user code runs with the correct debug control register even though
16099+ * we clear it here.
16100+ *
16101+ * Being careful here means that we don't have to be as careful in a
16102+ * lot of more complicated places (task switching can be a bit lazy
16103+ * about restoring all the debug state, and ptrace doesn't have to
16104+ * find every occurrence of the TF bit that could be saved away even
16105+ * by user code)
16106+ */
16107+fastcall void __kprobes do_debug(struct pt_regs * regs, long error_code)
16108+{
16109+ unsigned int condition;
16110+ struct task_struct *tsk = current;
16111+
16112+ get_debugreg(condition, 6);
16113+
16114+ if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
16115+ SIGTRAP) == NOTIFY_STOP)
16116+ return;
16117+ /* It's safe to allow irq's after DR6 has been saved */
16118+ if (regs->eflags & X86_EFLAGS_IF)
16119+ local_irq_enable();
16120+
16121+ /* Mask out spurious debug traps due to lazy DR7 setting */
16122+ if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) {
16123+ if (!tsk->thread.debugreg[7])
16124+ goto clear_dr7;
16125+ }
16126+
16127+ if (regs->eflags & VM_MASK)
16128+ goto debug_vm86;
16129+
16130+ /* Save debug status register where ptrace can see it */
16131+ tsk->thread.debugreg[6] = condition;
16132+
16133+ /*
16134+ * Single-stepping through TF: make sure we ignore any events in
16135+ * kernel space (but re-enable TF when returning to user mode).
16136+ */
16137+ if (condition & DR_STEP) {
16138+ /*
16139+ * We already checked v86 mode above, so we can
16140+ * check for kernel mode by just checking the CPL
16141+ * of CS.
16142+ */
16143+ if (!user_mode(regs))
16144+ goto clear_TF_reenable;
16145+ }
16146+
16147+ /* Ok, finally something we can handle */
16148+ send_sigtrap(tsk, regs, error_code);
16149+
16150+ /* Disable additional traps. They'll be re-enabled when
16151+ * the signal is delivered.
16152+ */
16153+clear_dr7:
16154+ set_debugreg(0, 7);
16155+ return;
16156+
16157+debug_vm86:
16158+ handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1);
16159+ return;
16160+
16161+clear_TF_reenable:
16162+ set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
16163+ regs->eflags &= ~TF_MASK;
16164+ return;
16165+}
16166+
16167+/*
16168+ * Note that we play around with the 'TS' bit in an attempt to get
16169+ * the correct behaviour even in the presence of the asynchronous
16170+ * IRQ13 behaviour
16171+ */
16172+void math_error(void __user *eip)
16173+{
16174+ struct task_struct * task;
16175+ siginfo_t info;
16176+ unsigned short cwd, swd;
16177+
16178+ /*
16179+ * Save the info for the exception handler and clear the error.
16180+ */
16181+ task = current;
16182+ save_init_fpu(task);
16183+ task->thread.trap_no = 16;
16184+ task->thread.error_code = 0;
16185+ info.si_signo = SIGFPE;
16186+ info.si_errno = 0;
16187+ info.si_code = __SI_FAULT;
16188+ info.si_addr = eip;
16189+ /*
16190+ * (~cwd & swd) will mask out exceptions that are not set to unmasked
16191+ * status. 0x3f is the exception bits in these regs, 0x200 is the
16192+ * C1 reg you need in case of a stack fault, 0x040 is the stack
16193+ * fault bit. We should only be taking one exception at a time,
16194+ * so if this combination doesn't produce any single exception,
16195+	 * then we have a bad program that isn't synchronizing its FPU usage
16196+ * and it will suffer the consequences since we won't be able to
16197+ * fully reproduce the context of the exception
16198+ */
16199+ cwd = get_fpu_cwd(task);
16200+ swd = get_fpu_swd(task);
16201+ switch (swd & ~cwd & 0x3f) {
16202+ case 0x000: /* No unmasked exception */
16203+ return;
16204+ default: /* Multiple exceptions */
16205+ break;
16206+ case 0x001: /* Invalid Op */
16207+ /*
16208+ * swd & 0x240 == 0x040: Stack Underflow
16209+ * swd & 0x240 == 0x240: Stack Overflow
16210+ * User must clear the SF bit (0x40) if set
16211+ */
16212+ info.si_code = FPE_FLTINV;
16213+ break;
16214+ case 0x002: /* Denormalize */
16215+ case 0x010: /* Underflow */
16216+ info.si_code = FPE_FLTUND;
16217+ break;
16218+ case 0x004: /* Zero Divide */
16219+ info.si_code = FPE_FLTDIV;
16220+ break;
16221+ case 0x008: /* Overflow */
16222+ info.si_code = FPE_FLTOVF;
16223+ break;
16224+ case 0x020: /* Precision */
16225+ info.si_code = FPE_FLTRES;
16226+ break;
16227+ }
16228+ force_sig_info(SIGFPE, &info, task);
16229+}
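A worked example of the masking above: the x87 default control word (0x037f) masks every exception, so unmasking only zero-divide (clearing bit 2) and then dividing by zero leaves exactly one bit set in swd & ~cwd & 0x3f, which the switch maps to FPE_FLTDIV. A standalone illustration:

#include <stdio.h>

int main(void)
{
	unsigned short cwd = 0x037b;	/* default 0x037f with ZM (bit 2) cleared, i.e. unmasked */
	unsigned short swd = 0x0004;	/* ZE set: a zero-divide was raised */
	unsigned short pending = swd & ~cwd & 0x3f;

	printf("pending = %#x\n", pending);	/* 0x4 -> case 0x004 -> FPE_FLTDIV */
	return 0;
}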
16230+
16231+fastcall void do_coprocessor_error(struct pt_regs * regs, long error_code)
16232+{
16233+ ignore_fpu_irq = 1;
16234+ math_error((void __user *)regs->eip);
16235+}
16236+
16237+static void simd_math_error(void __user *eip)
16238+{
16239+ struct task_struct * task;
16240+ siginfo_t info;
16241+ unsigned short mxcsr;
16242+
16243+ /*
16244+ * Save the info for the exception handler and clear the error.
16245+ */
16246+ task = current;
16247+ save_init_fpu(task);
16248+ task->thread.trap_no = 19;
16249+ task->thread.error_code = 0;
16250+ info.si_signo = SIGFPE;
16251+ info.si_errno = 0;
16252+ info.si_code = __SI_FAULT;
16253+ info.si_addr = eip;
16254+ /*
16255+ * The SIMD FPU exceptions are handled a little differently, as there
16256+ * is only a single status/control register. Thus, to determine which
16257+ * unmasked exception was caught we must mask the exception mask bits
16258+ * at 0x1f80, and then use these to mask the exception bits at 0x3f.
16259+ */
16260+ mxcsr = get_fpu_mxcsr(task);
16261+ switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) {
16262+ case 0x000:
16263+ default:
16264+ break;
16265+ case 0x001: /* Invalid Op */
16266+ info.si_code = FPE_FLTINV;
16267+ break;
16268+ case 0x002: /* Denormalize */
16269+ case 0x010: /* Underflow */
16270+ info.si_code = FPE_FLTUND;
16271+ break;
16272+ case 0x004: /* Zero Divide */
16273+ info.si_code = FPE_FLTDIV;
16274+ break;
16275+ case 0x008: /* Overflow */
16276+ info.si_code = FPE_FLTOVF;
16277+ break;
16278+ case 0x020: /* Precision */
16279+ info.si_code = FPE_FLTRES;
16280+ break;
16281+ }
16282+ force_sig_info(SIGFPE, &info, task);
16283+}
16284+
16285+fastcall void do_simd_coprocessor_error(struct pt_regs * regs,
16286+ long error_code)
16287+{
16288+ if (cpu_has_xmm) {
16289+ /* Handle SIMD FPU exceptions on PIII+ processors. */
16290+ ignore_fpu_irq = 1;
16291+ simd_math_error((void __user *)regs->eip);
16292+ } else {
16293+ /*
16294+ * Handle strange cache flush from user space exception
16295+ * in all other cases. This is undocumented behaviour.
16296+ */
16297+ if (regs->eflags & VM_MASK) {
16298+ handle_vm86_fault((struct kernel_vm86_regs *)regs,
16299+ error_code);
16300+ return;
16301+ }
16302+ current->thread.trap_no = 19;
16303+ current->thread.error_code = error_code;
16304+ die_if_kernel("cache flush denied", regs, error_code);
16305+ force_sig(SIGSEGV, current);
16306+ }
16307+}
16308+
16309+#ifndef CONFIG_XEN
16310+fastcall void do_spurious_interrupt_bug(struct pt_regs * regs,
16311+ long error_code)
16312+{
16313+#if 0
16314+ /* No need to warn about this any longer. */
16315+ printk("Ignoring P6 Local APIC Spurious Interrupt Bug...\n");
16316+#endif
16317+}
16318+
16319+fastcall void setup_x86_bogus_stack(unsigned char * stk)
16320+{
16321+ unsigned long *switch16_ptr, *switch32_ptr;
16322+ struct pt_regs *regs;
16323+ unsigned long stack_top, stack_bot;
16324+ unsigned short iret_frame16_off;
16325+ int cpu = smp_processor_id();
16326+ /* reserve the space on 32bit stack for the magic switch16 pointer */
16327+ memmove(stk, stk + 8, sizeof(struct pt_regs));
16328+ switch16_ptr = (unsigned long *)(stk + sizeof(struct pt_regs));
16329+ regs = (struct pt_regs *)stk;
16330+ /* now the switch32 on 16bit stack */
16331+ stack_bot = (unsigned long)&per_cpu(cpu_16bit_stack, cpu);
16332+ stack_top = stack_bot + CPU_16BIT_STACK_SIZE;
16333+ switch32_ptr = (unsigned long *)(stack_top - 8);
16334+ iret_frame16_off = CPU_16BIT_STACK_SIZE - 8 - 20;
16335+ /* copy iret frame on 16bit stack */
16336+ memcpy((void *)(stack_bot + iret_frame16_off), &regs->eip, 20);
16337+ /* fill in the switch pointers */
16338+ switch16_ptr[0] = (regs->esp & 0xffff0000) | iret_frame16_off;
16339+ switch16_ptr[1] = __ESPFIX_SS;
16340+ switch32_ptr[0] = (unsigned long)stk + sizeof(struct pt_regs) +
16341+ 8 - CPU_16BIT_STACK_SIZE;
16342+ switch32_ptr[1] = __KERNEL_DS;
16343+}
16344+
16345+fastcall unsigned char * fixup_x86_bogus_stack(unsigned short sp)
16346+{
16347+ unsigned long *switch32_ptr;
16348+ unsigned char *stack16, *stack32;
16349+ unsigned long stack_top, stack_bot;
16350+ int len;
16351+ int cpu = smp_processor_id();
16352+ stack_bot = (unsigned long)&per_cpu(cpu_16bit_stack, cpu);
16353+ stack_top = stack_bot + CPU_16BIT_STACK_SIZE;
16354+ switch32_ptr = (unsigned long *)(stack_top - 8);
16355+ /* copy the data from 16bit stack to 32bit stack */
16356+ len = CPU_16BIT_STACK_SIZE - 8 - sp;
16357+ stack16 = (unsigned char *)(stack_bot + sp);
16358+ stack32 = (unsigned char *)
16359+ (switch32_ptr[0] + CPU_16BIT_STACK_SIZE - 8 - len);
16360+ memcpy(stack32, stack16, len);
16361+ return stack32;
16362+}
16363+#endif
16364+
16365+/*
16366+ * 'math_state_restore()' saves the current math information in the
16367+ * old math state array, and gets the new ones from the current task
16368+ *
16369+ * Careful.. There are problems with IBM-designed IRQ13 behaviour.
16370+ * Don't touch unless you *really* know how it works.
16371+ *
16372+ * Must be called with kernel preemption disabled (in this case,
16373+ * local interrupts are disabled at the call-site in entry.S).
16374+ */
16375+asmlinkage void math_state_restore(struct pt_regs regs)
16376+{
16377+ struct thread_info *thread = current_thread_info();
16378+ struct task_struct *tsk = thread->task;
16379+
16380+ /* NB. 'clts' is done for us by Xen during virtual trap. */
16381+ if (!tsk_used_math(tsk))
16382+ init_fpu(tsk);
16383+ restore_fpu(tsk);
16384+ thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */
16385+}
16386+
16387+#ifndef CONFIG_MATH_EMULATION
16388+
16389+asmlinkage void math_emulate(long arg)
16390+{
16391+ printk(KERN_EMERG "math-emulation not enabled and no coprocessor found.\n");
16392+ printk(KERN_EMERG "killing %s.\n",current->comm);
16393+ force_sig(SIGFPE,current);
16394+ schedule();
16395+}
16396+
16397+#endif /* CONFIG_MATH_EMULATION */
16398+
16399+#ifdef CONFIG_X86_F00F_BUG
16400+void __init trap_init_f00f_bug(void)
16401+{
16402+ __set_fixmap(FIX_F00F_IDT, __pa(&idt_table), PAGE_KERNEL_RO);
16403+
16404+ /*
16405+ * Update the IDT descriptor and reload the IDT so that
16406+ * it uses the read-only mapped virtual address.
16407+ */
16408+ idt_descr.address = fix_to_virt(FIX_F00F_IDT);
16409+ load_idt(&idt_descr);
16410+}
16411+#endif
16412+
16413+
16414+/*
16415+ * NB. All these are "trap gates" (i.e. events_mask isn't set) except
16416+ * for those that specify <dpl>|4 in the second field.
16417+ */
16418+static trap_info_t trap_table[] = {
16419+ { 0, 0, __KERNEL_CS, (unsigned long)divide_error },
16420+ { 1, 0|4, __KERNEL_CS, (unsigned long)debug },
16421+ { 3, 3|4, __KERNEL_CS, (unsigned long)int3 },
16422+ { 4, 3, __KERNEL_CS, (unsigned long)overflow },
16423+ { 5, 0, __KERNEL_CS, (unsigned long)bounds },
16424+ { 6, 0, __KERNEL_CS, (unsigned long)invalid_op },
16425+ { 7, 0|4, __KERNEL_CS, (unsigned long)device_not_available },
16426+ { 9, 0, __KERNEL_CS, (unsigned long)coprocessor_segment_overrun },
16427+ { 10, 0, __KERNEL_CS, (unsigned long)invalid_TSS },
16428+ { 11, 0, __KERNEL_CS, (unsigned long)segment_not_present },
16429+ { 12, 0, __KERNEL_CS, (unsigned long)stack_segment },
16430+ { 13, 0, __KERNEL_CS, (unsigned long)general_protection },
16431+ { 14, 0|4, __KERNEL_CS, (unsigned long)page_fault },
16432+ { 15, 0, __KERNEL_CS, (unsigned long)fixup_4gb_segment },
16433+ { 16, 0, __KERNEL_CS, (unsigned long)coprocessor_error },
16434+ { 17, 0, __KERNEL_CS, (unsigned long)alignment_check },
16435+#ifdef CONFIG_X86_MCE
16436+ { 18, 0, __KERNEL_CS, (unsigned long)machine_check },
16437+#endif
16438+ { 19, 0, __KERNEL_CS, (unsigned long)simd_coprocessor_error },
16439+ { SYSCALL_VECTOR, 3, __KERNEL_CS, (unsigned long)system_call },
16440+ { 0, 0, 0, 0 }
16441+};
16442+
16443+void __init trap_init(void)
16444+{
16445+ HYPERVISOR_set_trap_table(trap_table);
16446+
16447+ if (cpu_has_fxsr) {
16448+ /*
16449+ * Verify that the FXSAVE/FXRSTOR data will be 16-byte aligned.
16450+ * Generates a compile-time "error: zero width for bit-field" if
16451+ * the alignment is wrong.
16452+ */
16453+ struct fxsrAlignAssert {
16454+ int _:!(offsetof(struct task_struct,
16455+ thread.i387.fxsave) & 15);
16456+ };
16457+
16458+ printk(KERN_INFO "Enabling fast FPU save and restore... ");
16459+ set_in_cr4(X86_CR4_OSFXSR);
16460+ printk("done.\n");
16461+ }
16462+ if (cpu_has_xmm) {
16463+ printk(KERN_INFO "Enabling unmasked SIMD FPU exception "
16464+ "support... ");
16465+ set_in_cr4(X86_CR4_OSXMMEXCPT);
16466+ printk("done.\n");
16467+ }
16468+
16469+ /*
16470+ * Should be a barrier for any external CPU state.
16471+ */
16472+ cpu_init();
16473+}
16474+
16475+void smp_trap_init(trap_info_t *trap_ctxt)
16476+{
16477+ trap_info_t *t = trap_table;
16478+
16479+ for (t = trap_table; t->address; t++) {
16480+ trap_ctxt[t->vector].flags = t->flags;
16481+ trap_ctxt[t->vector].cs = t->cs;
16482+ trap_ctxt[t->vector].address = t->address;
16483+ }
16484+}
16485+
16486+static int __init kstack_setup(char *s)
16487+{
16488+ kstack_depth_to_print = simple_strtoul(s, NULL, 0);
16489+ return 0;
16490+}
16491+__setup("kstack=", kstack_setup);
16492diff -Nur linux-2.6.16.33-noxen/arch/i386/kernel/traps.c linux-2.6.16.33/arch/i386/kernel/traps.c
16493--- linux-2.6.16.33-noxen/arch/i386/kernel/traps.c 2006-11-22 18:06:31.000000000 +0000
16494+++ linux-2.6.16.33/arch/i386/kernel/traps.c 2007-01-08 15:00:45.000000000 +0000
16495@@ -567,18 +567,11 @@
16496
16497 static void io_check_error(unsigned char reason, struct pt_regs * regs)
16498 {
16499- unsigned long i;
16500-
16501 printk(KERN_EMERG "NMI: IOCK error (debug interrupt?)\n");
16502 show_registers(regs);
16503
16504 /* Re-enable the IOCK line, wait for a few seconds */
16505- reason = (reason & 0xf) | 8;
16506- outb(reason, 0x61);
16507- i = 2000;
16508- while (--i) udelay(1000);
16509- reason &= ~8;
16510- outb(reason, 0x61);
16511+ clear_io_check_error(reason);
16512 }
16513
16514 static void unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
16515diff -Nur linux-2.6.16.33-noxen/arch/i386/kernel/vm86.c linux-2.6.16.33/arch/i386/kernel/vm86.c
16516--- linux-2.6.16.33-noxen/arch/i386/kernel/vm86.c 2006-11-22 18:06:31.000000000 +0000
16517+++ linux-2.6.16.33/arch/i386/kernel/vm86.c 2007-01-08 15:00:45.000000000 +0000
16518@@ -98,7 +98,9 @@
16519 struct pt_regs * FASTCALL(save_v86_state(struct kernel_vm86_regs * regs));
16520 struct pt_regs * fastcall save_v86_state(struct kernel_vm86_regs * regs)
16521 {
16522+#ifndef CONFIG_X86_NO_TSS
16523 struct tss_struct *tss;
16524+#endif
16525 struct pt_regs *ret;
16526 unsigned long tmp;
16527
16528@@ -123,12 +125,16 @@
16529 do_exit(SIGSEGV);
16530 }
16531
16532+#ifndef CONFIG_X86_NO_TSS
16533 tss = &per_cpu(init_tss, get_cpu());
16534+#endif
16535 current->thread.esp0 = current->thread.saved_esp0;
16536 current->thread.sysenter_cs = __KERNEL_CS;
16537 load_esp0(tss, &current->thread);
16538 current->thread.saved_esp0 = 0;
16539+#ifndef CONFIG_X86_NO_TSS
16540 put_cpu();
16541+#endif
16542
16543 loadsegment(fs, current->thread.saved_fs);
16544 loadsegment(gs, current->thread.saved_gs);
16545@@ -252,7 +258,9 @@
16546
16547 static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk)
16548 {
16549+#ifndef CONFIG_X86_NO_TSS
16550 struct tss_struct *tss;
16551+#endif
16552 long eax;
16553 /*
16554 * make sure the vm86() system call doesn't try to do anything silly
16555@@ -297,12 +305,16 @@
16556 savesegment(fs, tsk->thread.saved_fs);
16557 savesegment(gs, tsk->thread.saved_gs);
16558
16559+#ifndef CONFIG_X86_NO_TSS
16560 tss = &per_cpu(init_tss, get_cpu());
16561+#endif
16562 tsk->thread.esp0 = (unsigned long) &info->VM86_TSS_ESP0;
16563 if (cpu_has_sep)
16564 tsk->thread.sysenter_cs = 0;
16565 load_esp0(tss, &tsk->thread);
16566+#ifndef CONFIG_X86_NO_TSS
16567 put_cpu();
16568+#endif
16569
16570 tsk->thread.screen_bitmap = info->screen_bitmap;
16571 if (info->flags & VM86_SCREEN_BITMAP)
16572diff -Nur linux-2.6.16.33-noxen/arch/i386/kernel/vmlinux.lds.S linux-2.6.16.33/arch/i386/kernel/vmlinux.lds.S
16573--- linux-2.6.16.33-noxen/arch/i386/kernel/vmlinux.lds.S 2006-11-22 18:06:31.000000000 +0000
16574+++ linux-2.6.16.33/arch/i386/kernel/vmlinux.lds.S 2007-01-08 15:00:45.000000000 +0000
16575@@ -12,6 +12,12 @@
16576 OUTPUT_ARCH(i386)
16577 ENTRY(phys_startup_32)
16578 jiffies = jiffies_64;
16579+
16580+PHDRS {
16581+ text PT_LOAD FLAGS(5); /* R_E */
16582+ data PT_LOAD FLAGS(7); /* RWE */
16583+ note PT_NOTE FLAGS(4); /* R__ */
16584+}
16585 SECTIONS
16586 {
16587 . = __KERNEL_START;
16588@@ -25,7 +31,7 @@
16589 KPROBES_TEXT
16590 *(.fixup)
16591 *(.gnu.warning)
16592- } = 0x9090
16593+ } :text = 0x9090
16594
16595 _etext = .; /* End of text section */
16596
16597@@ -34,13 +40,20 @@
16598 __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { *(__ex_table) }
16599 __stop___ex_table = .;
16600
16601+ . = ALIGN(16);
16602+ __start_smp_alternatives_table = .;
16603+ __smp_alternatives : AT(ADDR(__smp_alternatives) - LOAD_OFFSET) { *(__smp_alternatives) }
16604+ __stop_smp_alternatives_table = .;
16605+
16606+ __smp_replacements : AT(ADDR(__smp_replacements) - LOAD_OFFSET) { *(__smp_replacements) }
16607+
16608 RODATA
16609
16610 /* writeable */
16611 .data : AT(ADDR(.data) - LOAD_OFFSET) { /* Data */
16612 *(.data)
16613 CONSTRUCTORS
16614- }
16615+ } :data
16616
16617 . = ALIGN(4096);
16618 __nosave_begin = .;
16619@@ -147,4 +160,6 @@
16620 STABS_DEBUG
16621
16622 DWARF_DEBUG
16623+
16624+ NOTES
16625 }
16626diff -Nur linux-2.6.16.33-noxen/arch/i386/kernel/vsyscall-note-xen.S linux-2.6.16.33/arch/i386/kernel/vsyscall-note-xen.S
16627--- linux-2.6.16.33-noxen/arch/i386/kernel/vsyscall-note-xen.S 1970-01-01 00:00:00.000000000 +0000
16628+++ linux-2.6.16.33/arch/i386/kernel/vsyscall-note-xen.S 2007-01-08 15:00:45.000000000 +0000
16629@@ -0,0 +1,32 @@
16630+/*
16631+ * This supplies .note.* sections to go into the PT_NOTE inside the vDSO text.
16632+ * Here we can supply some information useful to userland.
16633+ * First we get the vanilla i386 note that supplies the kernel version info.
16634+ */
16635+
16636+#include "vsyscall-note.S"
16637+
16638+/*
16639+ * Now we add a special note telling glibc's dynamic linker a fake hardware
16640+ * flavor that it will use to choose the search path for libraries in the
16641+ * same way it uses real hardware capabilities like "mmx".
16642+ * We supply "nosegneg" as the fake capability, to indicate that we
16643+ * do not like negative offsets in instructions using segment overrides,
16644+ * since we implement those inefficiently. This makes it possible to
16645+ * install libraries optimized to avoid those access patterns in someplace
16646+ * like /lib/i686/tls/nosegneg. Note that an /etc/ld.so.conf.d/ file
16647+ * corresponding to the bits here is needed to make ldconfig work right.
16648+ * It should contain:
16649+ * hwcap 0 nosegneg
16650+ * to match the mapping of bit to name that we give here.
16651+ */
16652+#define NOTE_KERNELCAP_BEGIN(ncaps, mask) \
16653+ ASM_ELF_NOTE_BEGIN(".note.kernelcap", "a", "GNU", 2) \
16654+ .long ncaps, mask
16655+#define NOTE_KERNELCAP(bit, name) \
16656+ .byte bit; .asciz name
16657+#define NOTE_KERNELCAP_END ASM_ELF_NOTE_END
16658+
16659+NOTE_KERNELCAP_BEGIN(1, 1)
16660+NOTE_KERNELCAP(1, "nosegneg") /* Change 1 back to 0 when glibc is fixed! */
16661+NOTE_KERNELCAP_END
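As the note above says, ldconfig only honours the fake "nosegneg" capability if a matching hwcap hint is installed. A minimal example of such a file, using only the contents given in the comment (the file name is illustrative):

    # /etc/ld.so.conf.d/nosegneg.conf  (name chosen for illustration)
    hwcap 0 nosegneg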
16662diff -Nur linux-2.6.16.33-noxen/arch/i386/mach-xen/Makefile linux-2.6.16.33/arch/i386/mach-xen/Makefile
16663--- linux-2.6.16.33-noxen/arch/i386/mach-xen/Makefile 1970-01-01 00:00:00.000000000 +0000
16664+++ linux-2.6.16.33/arch/i386/mach-xen/Makefile 2007-01-08 15:00:45.000000000 +0000
16665@@ -0,0 +1,5 @@
16666+#
16667+# Makefile for the linux kernel.
16668+#
16669+
16670+obj-y := setup.o
16671diff -Nur linux-2.6.16.33-noxen/arch/i386/mach-xen/setup.c linux-2.6.16.33/arch/i386/mach-xen/setup.c
16672--- linux-2.6.16.33-noxen/arch/i386/mach-xen/setup.c 1970-01-01 00:00:00.000000000 +0000
16673+++ linux-2.6.16.33/arch/i386/mach-xen/setup.c 2007-01-08 15:00:45.000000000 +0000
16674@@ -0,0 +1,37 @@
16675+/*
16676+ * Machine specific setup for the generic (Xen) machine
16677+ */
16678+
16679+#include <linux/config.h>
16680+#include <linux/smp.h>
16681+#include <linux/init.h>
16682+#include <linux/interrupt.h>
16683+#include <asm/acpi.h>
16684+#include <asm/arch_hooks.h>
16685+
16686+#ifdef CONFIG_HOTPLUG_CPU
16687+#define DEFAULT_SEND_IPI (1)
16688+#else
16689+#define DEFAULT_SEND_IPI (0)
16690+#endif
16691+
16692+int no_broadcast=DEFAULT_SEND_IPI;
16693+
16694+static __init int no_ipi_broadcast(char *str)
16695+{
16696+ get_option(&str, &no_broadcast);
16697+ printk ("Using %s mode\n", no_broadcast ? "No IPI Broadcast" :
16698+ "IPI Broadcast");
16699+ return 1;
16700+}
16701+
16702+__setup("no_ipi_broadcast", no_ipi_broadcast);
16703+
16704+static int __init print_ipi_mode(void)
16705+{
16706+ printk ("Using IPI %s mode\n", no_broadcast ? "No-Shortcut" :
16707+ "Shortcut");
16708+ return 0;
16709+}
16710+
16711+late_initcall(print_ipi_mode);
16712diff -Nur linux-2.6.16.33-noxen/arch/i386/mm/Makefile linux-2.6.16.33/arch/i386/mm/Makefile
16713--- linux-2.6.16.33-noxen/arch/i386/mm/Makefile 2006-11-22 18:06:31.000000000 +0000
16714+++ linux-2.6.16.33/arch/i386/mm/Makefile 2007-01-08 15:00:45.000000000 +0000
16715@@ -8,3 +8,11 @@
16716 obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
16717 obj-$(CONFIG_HIGHMEM) += highmem.o
16718 obj-$(CONFIG_BOOT_IOREMAP) += boot_ioremap.o
16719+
16720+ifdef CONFIG_XEN
16721+include $(srctree)/scripts/Makefile.xen
16722+
16723+obj-y += hypervisor.o
16724+
16725+obj-y := $(call cherrypickxen, $(obj-y))
16726+endif
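cherrypickxen itself is defined in scripts/Makefile.xen, which is not part of this hunk; judging from the *-xen.c sources added below, it presumably swaps in the Xen-specific object wherever a -xen variant of a file exists, roughly:

    # illustrative effect only, not the actual definition:
    #   fault.o   -> fault-xen.o    (arch/i386/mm/fault-xen.c)
    #   highmem.o -> highmem-xen.o  (arch/i386/mm/highmem-xen.c)
    #   init.o    -> init-xen.o     (arch/i386/mm/init-xen.c)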
16727diff -Nur linux-2.6.16.33-noxen/arch/i386/mm/fault-xen.c linux-2.6.16.33/arch/i386/mm/fault-xen.c
16728--- linux-2.6.16.33-noxen/arch/i386/mm/fault-xen.c 1970-01-01 00:00:00.000000000 +0000
16729+++ linux-2.6.16.33/arch/i386/mm/fault-xen.c 2007-01-08 15:00:45.000000000 +0000
16730@@ -0,0 +1,662 @@
16731+/*
16732+ * linux/arch/i386/mm/fault.c
16733+ *
16734+ * Copyright (C) 1995 Linus Torvalds
16735+ */
16736+
16737+#include <linux/signal.h>
16738+#include <linux/sched.h>
16739+#include <linux/kernel.h>
16740+#include <linux/errno.h>
16741+#include <linux/string.h>
16742+#include <linux/types.h>
16743+#include <linux/ptrace.h>
16744+#include <linux/mman.h>
16745+#include <linux/mm.h>
16746+#include <linux/smp.h>
16747+#include <linux/smp_lock.h>
16748+#include <linux/interrupt.h>
16749+#include <linux/init.h>
16750+#include <linux/tty.h>
16751+#include <linux/vt_kern.h> /* For unblank_screen() */
16752+#include <linux/highmem.h>
16753+#include <linux/module.h>
16754+#include <linux/kprobes.h>
16755+
16756+#include <asm/system.h>
16757+#include <asm/uaccess.h>
16758+#include <asm/desc.h>
16759+#include <asm/kdebug.h>
16760+
16761+extern void die(const char *,struct pt_regs *,long);
16762+
16763+/*
16764+ * Unlock any spinlocks which will prevent us from getting the
16765+ * message out
16766+ */
16767+void bust_spinlocks(int yes)
16768+{
16769+ int loglevel_save = console_loglevel;
16770+
16771+ if (yes) {
16772+ oops_in_progress = 1;
16773+ return;
16774+ }
16775+#ifdef CONFIG_VT
16776+ unblank_screen();
16777+#endif
16778+ oops_in_progress = 0;
16779+ /*
16780+ * OK, the message is on the console. Now we call printk()
16781+ * without oops_in_progress set so that printk will give klogd
16782+ * a poke. Hold onto your hats...
16783+ */
16784+ console_loglevel = 15; /* NMI oopser may have shut the console up */
16785+ printk(" ");
16786+ console_loglevel = loglevel_save;
16787+}
16788+
16789+/*
16790+ * Return EIP plus the CS segment base. The segment limit is also
16791+ * adjusted, clamped to the kernel/user address space (whichever is
16792+ * appropriate), and returned in *eip_limit.
16793+ *
16794+ * The segment is checked, because it might have been changed by another
16795+ * task between the original faulting instruction and here.
16796+ *
16797+ * If CS is no longer a valid code segment, or if EIP is beyond the
16798+ * limit, or if it is a kernel address when CS is not a kernel segment,
16799+ * then the returned value will be greater than *eip_limit.
16800+ *
16801+ * This is slow, but is very rarely executed.
16802+ */
16803+static inline unsigned long get_segment_eip(struct pt_regs *regs,
16804+ unsigned long *eip_limit)
16805+{
16806+ unsigned long eip = regs->eip;
16807+ unsigned seg = regs->xcs & 0xffff;
16808+ u32 seg_ar, seg_limit, base, *desc;
16809+
16810+ /* The standard kernel/user address space limit. */
16811+ *eip_limit = (seg & 2) ? USER_DS.seg : KERNEL_DS.seg;
16812+
16813+ /* Unlikely, but must come before segment checks. */
16814+ if (unlikely((regs->eflags & VM_MASK) != 0))
16815+ return eip + (seg << 4);
16816+
16817+ /* By far the most common cases. */
16818+ if (likely(seg == __USER_CS || seg == GET_KERNEL_CS()))
16819+ return eip;
16820+
16821+ /* Check that the segment exists, is within the current LDT/GDT size,
16822+ that kernel/user (ring 0..3) has the appropriate privilege,
16823+ that it's a code segment, and get the limit. */
16824+ __asm__ ("larl %3,%0; lsll %3,%1"
16825+ : "=&r" (seg_ar), "=r" (seg_limit) : "0" (0), "rm" (seg));
16826+ if ((~seg_ar & 0x9800) || eip > seg_limit) {
16827+ *eip_limit = 0;
16828+ return 1; /* So that returned eip > *eip_limit. */
16829+ }
16830+
16831+ /* Get the GDT/LDT descriptor base.
16832+ When you look for races in this code remember that
16833+ LDT and other horrors are only used in user space. */
16834+ if (seg & (1<<2)) {
16835+ /* Must lock the LDT while reading it. */
16836+ down(&current->mm->context.sem);
16837+ desc = current->mm->context.ldt;
16838+ desc = (void *)desc + (seg & ~7);
16839+ } else {
16840+ /* Must disable preemption while reading the GDT. */
16841+ desc = (u32 *)get_cpu_gdt_table(get_cpu());
16842+ desc = (void *)desc + (seg & ~7);
16843+ }
16844+
16845+ /* Decode the code segment base from the descriptor */
16846+ base = get_desc_base((unsigned long *)desc);
16847+
16848+ if (seg & (1<<2)) {
16849+ up(&current->mm->context.sem);
16850+ } else
16851+ put_cpu();
16852+
16853+ /* Adjust EIP and segment limit, and clamp at the kernel limit.
16854+ It's legitimate for segments to wrap at 0xffffffff. */
16855+ seg_limit += base;
16856+ if (seg_limit < *eip_limit && seg_limit >= base)
16857+ *eip_limit = seg_limit;
16858+ return eip + base;
16859+}
16860+
16861+/*
16862+ * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
16863+ * Check that here and ignore it.
16864+ */
16865+static int __is_prefetch(struct pt_regs *regs, unsigned long addr)
16866+{
16867+ unsigned long limit;
16868+ unsigned long instr = get_segment_eip (regs, &limit);
16869+ int scan_more = 1;
16870+ int prefetch = 0;
16871+ int i;
16872+
16873+ for (i = 0; scan_more && i < 15; i++) {
16874+ unsigned char opcode;
16875+ unsigned char instr_hi;
16876+ unsigned char instr_lo;
16877+
16878+ if (instr > limit)
16879+ break;
16880+ if (__get_user(opcode, (unsigned char __user *) instr))
16881+ break;
16882+
16883+ instr_hi = opcode & 0xf0;
16884+ instr_lo = opcode & 0x0f;
16885+ instr++;
16886+
16887+ switch (instr_hi) {
16888+ case 0x20:
16889+ case 0x30:
16890+ /* Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes. */
16891+ scan_more = ((instr_lo & 7) == 0x6);
16892+ break;
16893+
16894+ case 0x60:
16895+ /* 0x64 thru 0x67 are valid prefixes in all modes. */
16896+ scan_more = (instr_lo & 0xC) == 0x4;
16897+ break;
16898+ case 0xF0:
16899+ /* 0xF0, 0xF2, and 0xF3 are valid prefixes */
16900+ scan_more = !instr_lo || (instr_lo>>1) == 1;
16901+ break;
16902+ case 0x00:
16903+ /* Prefetch instruction is 0x0F0D or 0x0F18 */
16904+ scan_more = 0;
16905+ if (instr > limit)
16906+ break;
16907+ if (__get_user(opcode, (unsigned char __user *) instr))
16908+ break;
16909+ prefetch = (instr_lo == 0xF) &&
16910+ (opcode == 0x0D || opcode == 0x18);
16911+ break;
16912+ default:
16913+ scan_more = 0;
16914+ break;
16915+ }
16916+ }
16917+ return prefetch;
16918+}
16919+
16920+static inline int is_prefetch(struct pt_regs *regs, unsigned long addr,
16921+ unsigned long error_code)
16922+{
16923+ if (unlikely(boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
16924+ boot_cpu_data.x86 >= 6)) {
16925+ /* Catch an obscure case of prefetch inside an NX page. */
16926+ if (nx_enabled && (error_code & 16))
16927+ return 0;
16928+ return __is_prefetch(regs, addr);
16929+ }
16930+ return 0;
16931+}
16932+
16933+static noinline void force_sig_info_fault(int si_signo, int si_code,
16934+ unsigned long address, struct task_struct *tsk)
16935+{
16936+ siginfo_t info;
16937+
16938+ info.si_signo = si_signo;
16939+ info.si_errno = 0;
16940+ info.si_code = si_code;
16941+ info.si_addr = (void __user *)address;
16942+ force_sig_info(si_signo, &info, tsk);
16943+}
16944+
16945+fastcall void do_invalid_op(struct pt_regs *, unsigned long);
16946+
16947+#ifdef CONFIG_X86_PAE
16948+static void dump_fault_path(unsigned long address)
16949+{
16950+ unsigned long *p, page;
16951+ unsigned long mfn;
16952+
16953+ page = read_cr3();
16954+ p = (unsigned long *)__va(page);
16955+ p += (address >> 30) * 2;
16956+ printk(KERN_ALERT "%08lx -> *pde = %08lx:%08lx\n", page, p[1], p[0]);
16957+ if (p[0] & 1) {
16958+ mfn = (p[0] >> PAGE_SHIFT) | ((p[1] & 0x7) << 20);
16959+ page = mfn_to_pfn(mfn) << PAGE_SHIFT;
16960+ p = (unsigned long *)__va(page);
16961+ address &= 0x3fffffff;
16962+ p += (address >> 21) * 2;
16963+ printk(KERN_ALERT "%08lx -> *pme = %08lx:%08lx\n",
16964+ page, p[1], p[0]);
16965+#ifndef CONFIG_HIGHPTE
16966+ if (p[0] & 1) {
16967+ mfn = (p[0] >> PAGE_SHIFT) | ((p[1] & 0x7) << 20);
16968+ page = mfn_to_pfn(mfn) << PAGE_SHIFT;
16969+ p = (unsigned long *) __va(page);
16970+ address &= 0x001fffff;
16971+ p += (address >> 12) * 2;
16972+ printk(KERN_ALERT "%08lx -> *pte = %08lx:%08lx\n",
16973+ page, p[1], p[0]);
16974+ }
16975+#endif
16976+ }
16977+}
16978+#else
16979+static void dump_fault_path(unsigned long address)
16980+{
16981+ unsigned long page;
16982+
16983+ page = read_cr3();
16984+ page = ((unsigned long *) __va(page))[address >> 22];
16985+ printk(KERN_ALERT "*pde = ma %08lx pa %08lx\n", page,
16986+ machine_to_phys(page));
16987+ /*
16988+ * We must not directly access the pte in the highpte
16989+ * case, since the page table might be allocated in highmem.
16990+ * And let's rather not kmap-atomic the pte, just in case
16991+ * it's allocated already.
16992+ */
16993+#ifndef CONFIG_HIGHPTE
16994+ if (page & 1) {
16995+ page &= PAGE_MASK;
16996+ address &= 0x003ff000;
16997+ page = machine_to_phys(page);
16998+ page = ((unsigned long *) __va(page))[address >> PAGE_SHIFT];
16999+ printk(KERN_ALERT "*pte = ma %08lx pa %08lx\n", page,
17000+ machine_to_phys(page));
17001+ }
17002+#endif
17003+}
17004+#endif
17005+
17006+static int spurious_fault(struct pt_regs *regs,
17007+ unsigned long address,
17008+ unsigned long error_code)
17009+{
17010+ pgd_t *pgd;
17011+ pud_t *pud;
17012+ pmd_t *pmd;
17013+ pte_t *pte;
17014+
17015+ /* Reserved-bit violation or user access to kernel space? */
17016+ if (error_code & 0x0c)
17017+ return 0;
17018+
17019+ pgd = init_mm.pgd + pgd_index(address);
17020+ if (!pgd_present(*pgd))
17021+ return 0;
17022+
17023+ pud = pud_offset(pgd, address);
17024+ if (!pud_present(*pud))
17025+ return 0;
17026+
17027+ pmd = pmd_offset(pud, address);
17028+ if (!pmd_present(*pmd))
17029+ return 0;
17030+
17031+ pte = pte_offset_kernel(pmd, address);
17032+ if (!pte_present(*pte))
17033+ return 0;
17034+ if ((error_code & 0x02) && !pte_write(*pte))
17035+ return 0;
17036+#ifdef CONFIG_X86_PAE
17037+ if ((error_code & 0x10) && (pte_val(*pte) & _PAGE_NX))
17038+ return 0;
17039+#endif
17040+
17041+ return 1;
17042+}
17043+
17044+/*
17045+ * This routine handles page faults. It determines the address,
17046+ * and the problem, and then passes it off to one of the appropriate
17047+ * routines.
17048+ *
17049+ * error_code:
17050+ * bit 0 == 0 means no page found, 1 means protection fault
17051+ * bit 1 == 0 means read, 1 means write
17052+ * bit 2 == 0 means kernel, 1 means user-mode
17053+ */
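A worked decoding of those three bits may help (the mask names are ours, purely for illustration):

    /* error_code bit meanings restated as masks (illustrative names) */
    #define PF_PROT   0x1   /* 0: page not present   1: protection fault */
    #define PF_WRITE  0x2   /* 0: read access        1: write access     */
    #define PF_USER   0x4   /* 0: kernel mode        1: user mode        */

    /* e.g. error_code == 6 (PF_WRITE|PF_USER) is a user-mode write to a
     * page that is not present; error_code == 1 is a kernel-mode read
     * that hit a protection fault. */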
17054+fastcall void __kprobes do_page_fault(struct pt_regs *regs,
17055+ unsigned long error_code)
17056+{
17057+ struct task_struct *tsk;
17058+ struct mm_struct *mm;
17059+ struct vm_area_struct * vma;
17060+ unsigned long address;
17061+ int write, si_code;
17062+
17063+ /* get the address */
17064+ address = read_cr2();
17065+
17066+ /* Set the "privileged fault" bit to something sane. */
17067+ error_code &= ~4;
17068+ error_code |= (regs->xcs & 2) << 1;
17069+ if (regs->eflags & X86_EFLAGS_VM)
17070+ error_code |= 4;
17071+
17072+ if (notify_die(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
17073+ SIGSEGV) == NOTIFY_STOP)
17074+ return;
17075+ /* It's safe to allow irq's after cr2 has been saved */
17076+ if (regs->eflags & (X86_EFLAGS_IF|VM_MASK))
17077+ local_irq_enable();
17078+
17079+ tsk = current;
17080+
17081+ si_code = SEGV_MAPERR;
17082+
17083+ /*
17084+ * We fault-in kernel-space virtual memory on-demand. The
17085+ * 'reference' page table is init_mm.pgd.
17086+ *
17087+ * NOTE! We MUST NOT take any locks for this case. We may
17088+ * be in an interrupt or a critical region, and should
17089+ * only copy the information from the master page table,
17090+ * nothing more.
17091+ *
17092+ * This verifies that the fault happens in kernel space
17093+ * (error_code & 4) == 0, and that the fault was not a
17094+ * protection error (error_code & 1) == 0.
17095+ */
17096+ if (unlikely(address >= TASK_SIZE)) {
17097+#ifdef CONFIG_XEN
17098+ /* Faults in hypervisor area can never be patched up. */
17099+ if (address >= hypervisor_virt_start)
17100+ goto bad_area_nosemaphore;
17101+#endif
17102+ if (!(error_code & 5))
17103+ goto vmalloc_fault;
17104+ /* Can take a spurious fault if mapping changes R/O -> R/W. */
17105+ if (spurious_fault(regs, address, error_code))
17106+ return;
17107+ /*
17108+ * Don't take the mm semaphore here. If we fixup a prefetch
17109+ * fault we could otherwise deadlock.
17110+ */
17111+ goto bad_area_nosemaphore;
17112+ }
17113+
17114+ mm = tsk->mm;
17115+
17116+ /*
17117+ * If we're in an interrupt, have no user context or are running in an
17118+ * atomic region then we must not take the fault..
17119+ */
17120+ if (in_atomic() || !mm)
17121+ goto bad_area_nosemaphore;
17122+
17123+ /* When running in the kernel we expect faults to occur only to
17124+ * addresses in user space. All other faults represent errors in the
17125+ * kernel and should generate an OOPS. Unfortunately, in the case of an
17126+ * erroneous fault occurring in a code path which already holds mmap_sem
17127+ * we will deadlock attempting to validate the fault against the
17128+ * address space. Luckily the kernel only validly references user
17129+ * space from well defined areas of code, which are listed in the
17130+ * exceptions table.
17131+ *
17132+ * As the vast majority of faults will be valid we will only perform
17133+ * the source reference check when there is a possibility of a deadlock.
17134+ * Attempt to lock the address space, if we cannot we then validate the
17135+ * source. If this is invalid we can skip the address space check,
17136+ * thus avoiding the deadlock.
17137+ */
17138+ if (!down_read_trylock(&mm->mmap_sem)) {
17139+ if ((error_code & 4) == 0 &&
17140+ !search_exception_tables(regs->eip))
17141+ goto bad_area_nosemaphore;
17142+ down_read(&mm->mmap_sem);
17143+ }
17144+
17145+ vma = find_vma(mm, address);
17146+ if (!vma)
17147+ goto bad_area;
17148+ if (vma->vm_start <= address)
17149+ goto good_area;
17150+ if (!(vma->vm_flags & VM_GROWSDOWN))
17151+ goto bad_area;
17152+ if (error_code & 4) {
17153+ /*
17154+ * accessing the stack below %esp is always a bug.
17155+ * The "+ 32" is there due to some instructions (like
17156+ * pusha) doing post-decrement on the stack and that
17157+ * doesn't show up until later..
17158+ */
17159+ if (address + 32 < regs->esp)
17160+ goto bad_area;
17161+ }
17162+ if (expand_stack(vma, address))
17163+ goto bad_area;
17164+/*
17165+ * Ok, we have a good vm_area for this memory access, so
17166+ * we can handle it..
17167+ */
17168+good_area:
17169+ si_code = SEGV_ACCERR;
17170+ write = 0;
17171+ switch (error_code & 3) {
17172+ default: /* 3: write, present */
17173+#ifdef TEST_VERIFY_AREA
17174+ if (regs->cs == GET_KERNEL_CS())
17175+ printk("WP fault at %08lx\n", regs->eip);
17176+#endif
17177+ /* fall through */
17178+ case 2: /* write, not present */
17179+ if (!(vma->vm_flags & VM_WRITE))
17180+ goto bad_area;
17181+ write++;
17182+ break;
17183+ case 1: /* read, present */
17184+ goto bad_area;
17185+ case 0: /* read, not present */
17186+ if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
17187+ goto bad_area;
17188+ }
17189+
17190+ survive:
17191+ /*
17192+ * If for any reason at all we couldn't handle the fault,
17193+ * make sure we exit gracefully rather than endlessly redo
17194+ * the fault.
17195+ */
17196+ switch (handle_mm_fault(mm, vma, address, write)) {
17197+ case VM_FAULT_MINOR:
17198+ tsk->min_flt++;
17199+ break;
17200+ case VM_FAULT_MAJOR:
17201+ tsk->maj_flt++;
17202+ break;
17203+ case VM_FAULT_SIGBUS:
17204+ goto do_sigbus;
17205+ case VM_FAULT_OOM:
17206+ goto out_of_memory;
17207+ default:
17208+ BUG();
17209+ }
17210+
17211+ /*
17212+ * Did it hit the DOS screen memory VA from vm86 mode?
17213+ */
17214+ if (regs->eflags & VM_MASK) {
17215+ unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT;
17216+ if (bit < 32)
17217+ tsk->thread.screen_bitmap |= 1 << bit;
17218+ }
17219+ up_read(&mm->mmap_sem);
17220+ return;
17221+
17222+/*
17223+ * Something tried to access memory that isn't in our memory map..
17224+ * Fix it, but check if it's kernel or user first..
17225+ */
17226+bad_area:
17227+ up_read(&mm->mmap_sem);
17228+
17229+bad_area_nosemaphore:
17230+ /* User mode accesses just cause a SIGSEGV */
17231+ if (error_code & 4) {
17232+ /*
17233+ * Valid to do another page fault here because this one came
17234+ * from user space.
17235+ */
17236+ if (is_prefetch(regs, address, error_code))
17237+ return;
17238+
17239+ tsk->thread.cr2 = address;
17240+ /* Kernel addresses are always protection faults */
17241+ tsk->thread.error_code = error_code | (address >= TASK_SIZE);
17242+ tsk->thread.trap_no = 14;
17243+ force_sig_info_fault(SIGSEGV, si_code, address, tsk);
17244+ return;
17245+ }
17246+
17247+#ifdef CONFIG_X86_F00F_BUG
17248+ /*
17249+ * Pentium F0 0F C7 C8 bug workaround.
17250+ */
17251+ if (boot_cpu_data.f00f_bug) {
17252+ unsigned long nr;
17253+
17254+ nr = (address - idt_descr.address) >> 3;
17255+
17256+ if (nr == 6) {
17257+ do_invalid_op(regs, 0);
17258+ return;
17259+ }
17260+ }
17261+#endif
17262+
17263+no_context:
17264+ /* Are we prepared to handle this kernel fault? */
17265+ if (fixup_exception(regs))
17266+ return;
17267+
17268+ /*
17269+ * Valid to do another page fault here, because if this fault
17270+ * had been triggered by is_prefetch fixup_exception would have
17271+ * handled it.
17272+ */
17273+ if (is_prefetch(regs, address, error_code))
17274+ return;
17275+
17276+/*
17277+ * Oops. The kernel tried to access some bad page. We'll have to
17278+ * terminate things with extreme prejudice.
17279+ */
17280+
17281+ bust_spinlocks(1);
17282+
17283+#ifdef CONFIG_X86_PAE
17284+ if (error_code & 16) {
17285+ pte_t *pte = lookup_address(address);
17286+
17287+ if (pte && pte_present(*pte) && !pte_exec_kernel(*pte))
17288+ printk(KERN_CRIT "kernel tried to execute NX-protected page - exploit attempt? (uid: %d)\n", current->uid);
17289+ }
17290+#endif
17291+ if (address < PAGE_SIZE)
17292+ printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference");
17293+ else
17294+ printk(KERN_ALERT "Unable to handle kernel paging request");
17295+ printk(" at virtual address %08lx\n",address);
17296+ printk(KERN_ALERT " printing eip:\n");
17297+ printk("%08lx\n", regs->eip);
17298+ dump_fault_path(address);
17299+ tsk->thread.cr2 = address;
17300+ tsk->thread.trap_no = 14;
17301+ tsk->thread.error_code = error_code;
17302+ die("Oops", regs, error_code);
17303+ bust_spinlocks(0);
17304+ do_exit(SIGKILL);
17305+
17306+/*
17307+ * We ran out of memory, or some other thing happened to us that made
17308+ * us unable to handle the page fault gracefully.
17309+ */
17310+out_of_memory:
17311+ up_read(&mm->mmap_sem);
17312+ if (tsk->pid == 1) {
17313+ yield();
17314+ down_read(&mm->mmap_sem);
17315+ goto survive;
17316+ }
17317+ printk("VM: killing process %s\n", tsk->comm);
17318+ if (error_code & 4)
17319+ do_exit(SIGKILL);
17320+ goto no_context;
17321+
17322+do_sigbus:
17323+ up_read(&mm->mmap_sem);
17324+
17325+ /* Kernel mode? Handle exceptions or die */
17326+ if (!(error_code & 4))
17327+ goto no_context;
17328+
17329+ /* User space => ok to do another page fault */
17330+ if (is_prefetch(regs, address, error_code))
17331+ return;
17332+
17333+ tsk->thread.cr2 = address;
17334+ tsk->thread.error_code = error_code;
17335+ tsk->thread.trap_no = 14;
17336+ force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
17337+ return;
17338+
17339+vmalloc_fault:
17340+ {
17341+ /*
17342+ * Synchronize this task's top level page-table
17343+ * with the 'reference' page table.
17344+ *
17345+ * Do _not_ use "tsk" here. We might be inside
17346+ * an interrupt in the middle of a task switch..
17347+ */
17348+ int index = pgd_index(address);
17349+ unsigned long pgd_paddr;
17350+ pgd_t *pgd, *pgd_k;
17351+ pud_t *pud, *pud_k;
17352+ pmd_t *pmd, *pmd_k;
17353+ pte_t *pte_k;
17354+
17355+ pgd_paddr = read_cr3();
17356+ pgd = index + (pgd_t *)__va(pgd_paddr);
17357+ pgd_k = init_mm.pgd + index;
17358+
17359+ if (!pgd_present(*pgd_k))
17360+ goto no_context;
17361+
17362+ /*
17363+ * set_pgd(pgd, *pgd_k); here would be useless on PAE
17364+ * and redundant with the set_pmd() on non-PAE. As would
17365+ * set_pud.
17366+ */
17367+
17368+ pud = pud_offset(pgd, address);
17369+ pud_k = pud_offset(pgd_k, address);
17370+ if (!pud_present(*pud_k))
17371+ goto no_context;
17372+
17373+ pmd = pmd_offset(pud, address);
17374+ pmd_k = pmd_offset(pud_k, address);
17375+ if (!pmd_present(*pmd_k))
17376+ goto no_context;
17377+#ifndef CONFIG_XEN
17378+ set_pmd(pmd, *pmd_k);
17379+#else
17380+ /*
17381+ * When running on Xen we must launder *pmd_k through
17382+ * pmd_val() to ensure that _PAGE_PRESENT is correctly set.
17383+ */
17384+ set_pmd(pmd, __pmd(pmd_val(*pmd_k)));
17385+#endif
17386+
17387+ pte_k = pte_offset_kernel(pmd_k, address);
17388+ if (!pte_present(*pte_k))
17389+ goto no_context;
17390+ return;
17391+ }
17392+}
17393diff -Nur linux-2.6.16.33-noxen/arch/i386/mm/highmem-xen.c linux-2.6.16.33/arch/i386/mm/highmem-xen.c
17394--- linux-2.6.16.33-noxen/arch/i386/mm/highmem-xen.c 1970-01-01 00:00:00.000000000 +0000
17395+++ linux-2.6.16.33/arch/i386/mm/highmem-xen.c 2007-01-08 15:00:45.000000000 +0000
17396@@ -0,0 +1,133 @@
17397+#include <linux/highmem.h>
17398+#include <linux/module.h>
17399+
17400+void *kmap(struct page *page)
17401+{
17402+ might_sleep();
17403+ if (!PageHighMem(page))
17404+ return page_address(page);
17405+ return kmap_high(page);
17406+}
17407+
17408+void kunmap(struct page *page)
17409+{
17410+ if (in_interrupt())
17411+ BUG();
17412+ if (!PageHighMem(page))
17413+ return;
17414+ kunmap_high(page);
17415+}
17416+
17417+/*
17418+ * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because
17419+ * no global lock is needed and because the kmap code must perform a global TLB
17420+ * invalidation when the kmap pool wraps.
17421+ *
17422+ * However, when holding an atomic kmap it is not legal to sleep, so atomic
17423+ * kmaps are appropriate for short, tight code paths only.
17424+ */
17425+static void *__kmap_atomic(struct page *page, enum km_type type, pgprot_t prot)
17426+{
17427+ enum fixed_addresses idx;
17428+ unsigned long vaddr;
17429+
17430+ /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */
17431+ inc_preempt_count();
17432+ if (!PageHighMem(page))
17433+ return page_address(page);
17434+
17435+ idx = type + KM_TYPE_NR*smp_processor_id();
17436+ vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
17437+#ifdef CONFIG_DEBUG_HIGHMEM
17438+ if (!pte_none(*(kmap_pte-idx)))
17439+ BUG();
17440+#endif
17441+ set_pte_at_sync(&init_mm, vaddr, kmap_pte-idx, mk_pte(page, prot));
17442+
17443+ return (void*) vaddr;
17444+}
17445+
17446+void *kmap_atomic(struct page *page, enum km_type type)
17447+{
17448+ return __kmap_atomic(page, type, kmap_prot);
17449+}
17450+
17451+/* Same as kmap_atomic but with PAGE_KERNEL_RO page protection. */
17452+void *kmap_atomic_pte(struct page *page, enum km_type type)
17453+{
17454+ return __kmap_atomic(page, type, PAGE_KERNEL_RO);
17455+}
17456+
17457+void kunmap_atomic(void *kvaddr, enum km_type type)
17458+{
17459+#if defined(CONFIG_DEBUG_HIGHMEM) || defined(CONFIG_XEN)
17460+ unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK;
17461+ enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id();
17462+
17463+ if (vaddr < FIXADDR_START) { // FIXME
17464+ dec_preempt_count();
17465+ preempt_check_resched();
17466+ return;
17467+ }
17468+#endif
17469+
17470+#if defined(CONFIG_DEBUG_HIGHMEM)
17471+ if (vaddr != __fix_to_virt(FIX_KMAP_BEGIN+idx))
17472+ BUG();
17473+
17474+ /*
17475+ * force other mappings to Oops if they try to access
17476+ * this pte without first remapping it
17477+ */
17478+ pte_clear(&init_mm, vaddr, kmap_pte-idx);
17479+ __flush_tlb_one(vaddr);
17480+#elif defined(CONFIG_XEN)
17481+ /*
17482+ * We must ensure there are no dangling pagetable references when
17483+ * returning memory to Xen (decrease_reservation).
17484+ * XXX TODO: We could make this faster by only zapping when
17485+ * kmap_flush_unused is called but that is trickier and more invasive.
17486+ */
17487+ pte_clear(&init_mm, vaddr, kmap_pte-idx);
17488+#endif
17489+
17490+ dec_preempt_count();
17491+ preempt_check_resched();
17492+}
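Since the comment above stresses that code holding an atomic kmap must not sleep, a short usage sketch may help; it mirrors the classic copy_highpage() pattern and is our illustration, not code from the patch:

    #include <linux/highmem.h>
    #include <linux/string.h>

    /* Copy one (possibly highmem) page without sleeping. */
    static void copy_page_example(struct page *dst, struct page *src)
    {
            char *s = kmap_atomic(src, KM_USER0);
            char *d = kmap_atomic(dst, KM_USER1);

            memcpy(d, s, PAGE_SIZE);   /* no sleeping between map and unmap */

            kunmap_atomic(d, KM_USER1);
            kunmap_atomic(s, KM_USER0);
    }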
17493+
17494+/* This is the same as kmap_atomic() but can map memory that doesn't
17495+ * have a struct page associated with it.
17496+ */
17497+void *kmap_atomic_pfn(unsigned long pfn, enum km_type type)
17498+{
17499+ enum fixed_addresses idx;
17500+ unsigned long vaddr;
17501+
17502+ inc_preempt_count();
17503+
17504+ idx = type + KM_TYPE_NR*smp_processor_id();
17505+ vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
17506+ set_pte(kmap_pte-idx, pfn_pte(pfn, kmap_prot));
17507+ __flush_tlb_one(vaddr);
17508+
17509+ return (void*) vaddr;
17510+}
17511+
17512+struct page *kmap_atomic_to_page(void *ptr)
17513+{
17514+ unsigned long idx, vaddr = (unsigned long)ptr;
17515+ pte_t *pte;
17516+
17517+ if (vaddr < FIXADDR_START)
17518+ return virt_to_page(ptr);
17519+
17520+ idx = virt_to_fix(vaddr);
17521+ pte = kmap_pte - (idx - FIX_KMAP_BEGIN);
17522+ return pte_page(*pte);
17523+}
17524+
17525+EXPORT_SYMBOL(kmap);
17526+EXPORT_SYMBOL(kunmap);
17527+EXPORT_SYMBOL(kmap_atomic);
17528+EXPORT_SYMBOL(kunmap_atomic);
17529+EXPORT_SYMBOL(kmap_atomic_to_page);
17530diff -Nur linux-2.6.16.33-noxen/arch/i386/mm/hypervisor.c linux-2.6.16.33/arch/i386/mm/hypervisor.c
17531--- linux-2.6.16.33-noxen/arch/i386/mm/hypervisor.c 1970-01-01 00:00:00.000000000 +0000
17532+++ linux-2.6.16.33/arch/i386/mm/hypervisor.c 2007-01-08 15:00:45.000000000 +0000
17533@@ -0,0 +1,450 @@
17534+/******************************************************************************
17535+ * mm/hypervisor.c
17536+ *
17537+ * Update page tables via the hypervisor.
17538+ *
17539+ * Copyright (c) 2002-2004, K A Fraser
17540+ *
17541+ * This program is free software; you can redistribute it and/or
17542+ * modify it under the terms of the GNU General Public License version 2
17543+ * as published by the Free Software Foundation; or, when distributed
17544+ * separately from the Linux kernel or incorporated into other
17545+ * software packages, subject to the following license:
17546+ *
17547+ * Permission is hereby granted, free of charge, to any person obtaining a copy
17548+ * of this source file (the "Software"), to deal in the Software without
17549+ * restriction, including without limitation the rights to use, copy, modify,
17550+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
17551+ * and to permit persons to whom the Software is furnished to do so, subject to
17552+ * the following conditions:
17553+ *
17554+ * The above copyright notice and this permission notice shall be included in
17555+ * all copies or substantial portions of the Software.
17556+ *
17557+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17558+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17559+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17560+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17561+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
17562+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
17563+ * IN THE SOFTWARE.
17564+ */
17565+
17566+#include <linux/config.h>
17567+#include <linux/sched.h>
17568+#include <linux/mm.h>
17569+#include <linux/vmalloc.h>
17570+#include <asm/page.h>
17571+#include <asm/pgtable.h>
17572+#include <asm/hypervisor.h>
17573+#include <xen/balloon.h>
17574+#include <xen/features.h>
17575+#include <xen/interface/memory.h>
17576+#include <linux/module.h>
17577+#include <linux/percpu.h>
17578+#include <asm/tlbflush.h>
17579+
17580+#ifdef CONFIG_X86_64
17581+#define pmd_val_ma(v) (v).pmd
17582+#else
17583+#ifdef CONFIG_X86_PAE
17584+# define pmd_val_ma(v) ((v).pmd)
17585+# define pud_val_ma(v) ((v).pgd.pgd)
17586+#else
17587+# define pmd_val_ma(v) ((v).pud.pgd.pgd)
17588+#endif
17589+#endif
17590+
17591+void xen_l1_entry_update(pte_t *ptr, pte_t val)
17592+{
17593+ mmu_update_t u;
17594+ u.ptr = virt_to_machine(ptr);
17595+ u.val = pte_val_ma(val);
17596+ BUG_ON(HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0);
17597+}
17598+
17599+void xen_l2_entry_update(pmd_t *ptr, pmd_t val)
17600+{
17601+ mmu_update_t u;
17602+ u.ptr = virt_to_machine(ptr);
17603+ u.val = pmd_val_ma(val);
17604+ BUG_ON(HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0);
17605+}
17606+
17607+#ifdef CONFIG_X86_PAE
17608+void xen_l3_entry_update(pud_t *ptr, pud_t val)
17609+{
17610+ mmu_update_t u;
17611+ u.ptr = virt_to_machine(ptr);
17612+ u.val = pud_val_ma(val);
17613+ BUG_ON(HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0);
17614+}
17615+#endif
17616+
17617+#ifdef CONFIG_X86_64
17618+void xen_l3_entry_update(pud_t *ptr, pud_t val)
17619+{
17620+ mmu_update_t u;
17621+ u.ptr = virt_to_machine(ptr);
17622+ u.val = val.pud;
17623+ BUG_ON(HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0);
17624+}
17625+
17626+void xen_l4_entry_update(pgd_t *ptr, pgd_t val)
17627+{
17628+ mmu_update_t u;
17629+ u.ptr = virt_to_machine(ptr);
17630+ u.val = val.pgd;
17631+ BUG_ON(HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0);
17632+}
17633+#endif /* CONFIG_X86_64 */
17634+
17635+void xen_pt_switch(unsigned long ptr)
17636+{
17637+ struct mmuext_op op;
17638+ op.cmd = MMUEXT_NEW_BASEPTR;
17639+ op.arg1.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT);
17640+ BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
17641+}
17642+
17643+void xen_new_user_pt(unsigned long ptr)
17644+{
17645+ struct mmuext_op op;
17646+ op.cmd = MMUEXT_NEW_USER_BASEPTR;
17647+ op.arg1.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT);
17648+ BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
17649+}
17650+
17651+void xen_tlb_flush(void)
17652+{
17653+ struct mmuext_op op;
17654+ op.cmd = MMUEXT_TLB_FLUSH_LOCAL;
17655+ BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
17656+}
17657+EXPORT_SYMBOL(xen_tlb_flush);
17658+
17659+void xen_invlpg(unsigned long ptr)
17660+{
17661+ struct mmuext_op op;
17662+ op.cmd = MMUEXT_INVLPG_LOCAL;
17663+ op.arg1.linear_addr = ptr & PAGE_MASK;
17664+ BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
17665+}
17666+EXPORT_SYMBOL(xen_invlpg);
17667+
17668+#ifdef CONFIG_SMP
17669+
17670+void xen_tlb_flush_all(void)
17671+{
17672+ struct mmuext_op op;
17673+ op.cmd = MMUEXT_TLB_FLUSH_ALL;
17674+ BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
17675+}
17676+
17677+void xen_tlb_flush_mask(cpumask_t *mask)
17678+{
17679+ struct mmuext_op op;
17680+ if ( cpus_empty(*mask) )
17681+ return;
17682+ op.cmd = MMUEXT_TLB_FLUSH_MULTI;
17683+ op.arg2.vcpumask = mask->bits;
17684+ BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
17685+}
17686+
17687+void xen_invlpg_all(unsigned long ptr)
17688+{
17689+ struct mmuext_op op;
17690+ op.cmd = MMUEXT_INVLPG_ALL;
17691+ op.arg1.linear_addr = ptr & PAGE_MASK;
17692+ BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
17693+}
17694+
17695+void xen_invlpg_mask(cpumask_t *mask, unsigned long ptr)
17696+{
17697+ struct mmuext_op op;
17698+ if ( cpus_empty(*mask) )
17699+ return;
17700+ op.cmd = MMUEXT_INVLPG_MULTI;
17701+ op.arg1.linear_addr = ptr & PAGE_MASK;
17702+ op.arg2.vcpumask = mask->bits;
17703+ BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
17704+}
17705+
17706+#endif /* CONFIG_SMP */
17707+
17708+void xen_pgd_pin(unsigned long ptr)
17709+{
17710+ struct mmuext_op op;
17711+#ifdef CONFIG_X86_64
17712+ op.cmd = MMUEXT_PIN_L4_TABLE;
17713+#elif defined(CONFIG_X86_PAE)
17714+ op.cmd = MMUEXT_PIN_L3_TABLE;
17715+#else
17716+ op.cmd = MMUEXT_PIN_L2_TABLE;
17717+#endif
17718+ op.arg1.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT);
17719+ BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
17720+}
17721+
17722+void xen_pgd_unpin(unsigned long ptr)
17723+{
17724+ struct mmuext_op op;
17725+ op.cmd = MMUEXT_UNPIN_TABLE;
17726+ op.arg1.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT);
17727+ BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
17728+}
17729+
17730+void xen_set_ldt(unsigned long ptr, unsigned long len)
17731+{
17732+ struct mmuext_op op;
17733+ op.cmd = MMUEXT_SET_LDT;
17734+ op.arg1.linear_addr = ptr;
17735+ op.arg2.nr_ents = len;
17736+ BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
17737+}
17738+
17739+/*
17740+ * The bitmap is indexed by page number. If a bit is set, the page is part
17741+ * of a xen_create_contiguous_region() area of memory.
17742+ */
17743+unsigned long *contiguous_bitmap;
17744+
17745+static void contiguous_bitmap_set(
17746+ unsigned long first_page, unsigned long nr_pages)
17747+{
17748+ unsigned long start_off, end_off, curr_idx, end_idx;
17749+
17750+ curr_idx = first_page / BITS_PER_LONG;
17751+ start_off = first_page & (BITS_PER_LONG-1);
17752+ end_idx = (first_page + nr_pages) / BITS_PER_LONG;
17753+ end_off = (first_page + nr_pages) & (BITS_PER_LONG-1);
17754+
17755+ if (curr_idx == end_idx) {
17756+ contiguous_bitmap[curr_idx] |=
17757+ ((1UL<<end_off)-1) & -(1UL<<start_off);
17758+ } else {
17759+ contiguous_bitmap[curr_idx] |= -(1UL<<start_off);
17760+ while ( ++curr_idx < end_idx )
17761+ contiguous_bitmap[curr_idx] = ~0UL;
17762+ contiguous_bitmap[curr_idx] |= (1UL<<end_off)-1;
17763+ }
17764+}
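The mask arithmetic above is easiest to see with concrete numbers; a worked example for the single-word branch (the values are ours):

    /* first_page = 3, nr_pages = 4, BITS_PER_LONG = 64:
     *   start_off = 3, end_off = 7, curr_idx == end_idx == 0
     *   (1UL<<7)-1  = 0x7f        (bits 0..6)
     *   -(1UL<<3)   = ...fffffff8 (bits 3 and up)
     *   ANDed       = 0x78        -> bits 3..6, i.e. pages 3,4,5,6
     */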
17765+
17766+static void contiguous_bitmap_clear(
17767+ unsigned long first_page, unsigned long nr_pages)
17768+{
17769+ unsigned long start_off, end_off, curr_idx, end_idx;
17770+
17771+ curr_idx = first_page / BITS_PER_LONG;
17772+ start_off = first_page & (BITS_PER_LONG-1);
17773+ end_idx = (first_page + nr_pages) / BITS_PER_LONG;
17774+ end_off = (first_page + nr_pages) & (BITS_PER_LONG-1);
17775+
17776+ if (curr_idx == end_idx) {
17777+ contiguous_bitmap[curr_idx] &=
17778+ -(1UL<<end_off) | ((1UL<<start_off)-1);
17779+ } else {
17780+ contiguous_bitmap[curr_idx] &= (1UL<<start_off)-1;
17781+ while ( ++curr_idx != end_idx )
17782+ contiguous_bitmap[curr_idx] = 0;
17783+ contiguous_bitmap[curr_idx] &= -(1UL<<end_off);
17784+ }
17785+}
17786+
17787+/* Protected by balloon_lock. */
17788+#define MAX_CONTIG_ORDER 9 /* 2MB */
17789+static unsigned long discontig_frames[1<<MAX_CONTIG_ORDER];
17790+static multicall_entry_t cr_mcl[1<<MAX_CONTIG_ORDER];
17791+
17792+/* Ensure multi-page extents are contiguous in machine memory. */
17793+int xen_create_contiguous_region(
17794+ unsigned long vstart, unsigned int order, unsigned int address_bits)
17795+{
17796+ unsigned long *in_frames = discontig_frames, out_frame;
17797+ unsigned long frame, i, flags;
17798+ long rc;
17799+ int success;
17800+ struct xen_memory_exchange exchange = {
17801+ .in = {
17802+ .nr_extents = 1UL << order,
17803+ .extent_order = 0,
17804+ .domid = DOMID_SELF
17805+ },
17806+ .out = {
17807+ .nr_extents = 1,
17808+ .extent_order = order,
17809+ .address_bits = address_bits,
17810+ .domid = DOMID_SELF
17811+ }
17812+ };
17813+
17814+ /*
17815+ * Currently an auto-translated guest will not perform I/O, nor will
17816+ * it require PAE page directories below 4GB. Therefore any calls to
17817+ * this function are redundant and can be ignored.
17818+ */
17819+ if (xen_feature(XENFEAT_auto_translated_physmap))
17820+ return 0;
17821+
17822+ if (unlikely(order > MAX_CONTIG_ORDER))
17823+ return -ENOMEM;
17824+
17825+ set_xen_guest_handle(exchange.in.extent_start, in_frames);
17826+ set_xen_guest_handle(exchange.out.extent_start, &out_frame);
17827+
17828+ scrub_pages(vstart, 1 << order);
17829+
17830+ balloon_lock(flags);
17831+
17832+ /* 1. Zap current PTEs, remembering MFNs. */
17833+ for (i = 0; i < (1UL<<order); i++) {
17834+ in_frames[i] = pfn_to_mfn((__pa(vstart) >> PAGE_SHIFT) + i);
17835+ MULTI_update_va_mapping(cr_mcl + i, vstart + (i*PAGE_SIZE),
17836+ __pte_ma(0), 0);
17837+ set_phys_to_machine((__pa(vstart)>>PAGE_SHIFT)+i,
17838+ INVALID_P2M_ENTRY);
17839+ }
17840+ if (HYPERVISOR_multicall(cr_mcl, i))
17841+ BUG();
17842+
17843+ /* 2. Get a new contiguous memory extent. */
17844+ out_frame = __pa(vstart) >> PAGE_SHIFT;
17845+ rc = HYPERVISOR_memory_op(XENMEM_exchange, &exchange);
17846+ success = (exchange.nr_exchanged == (1UL << order));
17847+ BUG_ON(!success && ((exchange.nr_exchanged != 0) || (rc == 0)));
17848+ BUG_ON(success && (rc != 0));
17849+#ifdef CONFIG_XEN_COMPAT_030002
17850+ if (unlikely(rc == -ENOSYS)) {
17851+ /* Compatibility when XENMEM_exchange is unsupported. */
17852+ if (HYPERVISOR_memory_op(XENMEM_decrease_reservation,
17853+ &exchange.in) != (1UL << order))
17854+ BUG();
17855+ success = (HYPERVISOR_memory_op(XENMEM_populate_physmap,
17856+ &exchange.out) == 1);
17857+ if (!success) {
17858+ /* Couldn't get special memory: fall back to normal. */
17859+ for (i = 0; i < (1UL<<order); i++)
17860+ in_frames[i] = (__pa(vstart)>>PAGE_SHIFT) + i;
17861+ if (HYPERVISOR_memory_op(XENMEM_populate_physmap,
17862+ &exchange.in) != (1UL<<order))
17863+ BUG();
17864+ }
17865+ }
17866+#endif
17867+
17868+ /* 3. Map the new extent in place of old pages. */
17869+ for (i = 0; i < (1UL<<order); i++) {
17870+ frame = success ? (out_frame + i) : in_frames[i];
17871+ MULTI_update_va_mapping(cr_mcl + i, vstart + (i*PAGE_SIZE),
17872+ pfn_pte_ma(frame, PAGE_KERNEL), 0);
17873+ set_phys_to_machine((__pa(vstart)>>PAGE_SHIFT)+i, frame);
17874+ }
17875+
17876+ cr_mcl[i - 1].args[MULTI_UVMFLAGS_INDEX] = order
17877+ ? UVMF_TLB_FLUSH|UVMF_ALL
17878+ : UVMF_INVLPG|UVMF_ALL;
17879+ if (HYPERVISOR_multicall(cr_mcl, i))
17880+ BUG();
17881+
17882+ if (success)
17883+ contiguous_bitmap_set(__pa(vstart) >> PAGE_SHIFT,
17884+ 1UL << order);
17885+
17886+ balloon_unlock(flags);
17887+
17888+ return success ? 0 : -ENOMEM;
17889+}
17890+
17891+void xen_destroy_contiguous_region(unsigned long vstart, unsigned int order)
17892+{
17893+ unsigned long *out_frames = discontig_frames, in_frame;
17894+ unsigned long frame, i, flags;
17895+ long rc;
17896+ int success;
17897+ struct xen_memory_exchange exchange = {
17898+ .in = {
17899+ .nr_extents = 1,
17900+ .extent_order = order,
17901+ .domid = DOMID_SELF
17902+ },
17903+ .out = {
17904+ .nr_extents = 1UL << order,
17905+ .extent_order = 0,
17906+ .domid = DOMID_SELF
17907+ }
17908+ };
17909+
17910+ if (xen_feature(XENFEAT_auto_translated_physmap) ||
17911+ !test_bit(__pa(vstart) >> PAGE_SHIFT, contiguous_bitmap))
17912+ return;
17913+
17914+ if (unlikely(order > MAX_CONTIG_ORDER))
17915+ return;
17916+
17917+ set_xen_guest_handle(exchange.in.extent_start, &in_frame);
17918+ set_xen_guest_handle(exchange.out.extent_start, out_frames);
17919+
17920+ scrub_pages(vstart, 1 << order);
17921+
17922+ balloon_lock(flags);
17923+
17924+ contiguous_bitmap_clear(__pa(vstart) >> PAGE_SHIFT, 1UL << order);
17925+
17926+ /* 1. Find start MFN of contiguous extent. */
17927+ in_frame = pfn_to_mfn(__pa(vstart) >> PAGE_SHIFT);
17928+
17929+ /* 2. Zap current PTEs. */
17930+ for (i = 0; i < (1UL<<order); i++) {
17931+ MULTI_update_va_mapping(cr_mcl + i, vstart + (i*PAGE_SIZE),
17932+ __pte_ma(0), 0);
17933+ set_phys_to_machine((__pa(vstart)>>PAGE_SHIFT)+i,
17934+ INVALID_P2M_ENTRY);
17935+ out_frames[i] = (__pa(vstart) >> PAGE_SHIFT) + i;
17936+ }
17937+ if (HYPERVISOR_multicall(cr_mcl, i))
17938+ BUG();
17939+
17940+ /* 3. Do the exchange for non-contiguous MFNs. */
17941+ rc = HYPERVISOR_memory_op(XENMEM_exchange, &exchange);
17942+ success = (exchange.nr_exchanged == 1);
17943+ BUG_ON(!success && ((exchange.nr_exchanged != 0) || (rc == 0)));
17944+ BUG_ON(success && (rc != 0));
17945+#ifdef CONFIG_XEN_COMPAT_030002
17946+ if (unlikely(rc == -ENOSYS)) {
17947+ /* Compatibility when XENMEM_exchange is unsupported. */
17948+ if (HYPERVISOR_memory_op(XENMEM_decrease_reservation,
17949+ &exchange.in) != 1)
17950+ BUG();
17951+ if (HYPERVISOR_memory_op(XENMEM_populate_physmap,
17952+ &exchange.out) != (1UL << order))
17953+ BUG();
17954+ success = 1;
17955+ }
17956+#endif
17957+
17958+ /* 4. Map new pages in place of old pages. */
17959+ for (i = 0; i < (1UL<<order); i++) {
17960+ frame = success ? out_frames[i] : (in_frame + i);
17961+ MULTI_update_va_mapping(cr_mcl + i, vstart + (i*PAGE_SIZE),
17962+ pfn_pte_ma(frame, PAGE_KERNEL), 0);
17963+ set_phys_to_machine((__pa(vstart)>>PAGE_SHIFT)+i, frame);
17964+ }
17965+
17966+ cr_mcl[i - 1].args[MULTI_UVMFLAGS_INDEX] = order
17967+ ? UVMF_TLB_FLUSH|UVMF_ALL
17968+ : UVMF_INVLPG|UVMF_ALL;
17969+ if (HYPERVISOR_multicall(cr_mcl, i))
17970+ BUG();
17971+
17972+ balloon_unlock(flags);
17973+}
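A hedged usage sketch for the two functions above, in the style of their swiotlb/DMA callers: allocate a virtually contiguous buffer, exchange it for a machine-contiguous extent addressable with 32 bits, and hand it back afterwards. The function name and values are our illustration, not code from the patch:

    #include <linux/gfp.h>
    #include <linux/errno.h>
    #include <asm/hypervisor.h>

    static int dma_buffer_example(void)
    {
            unsigned long va = __get_free_pages(GFP_KERNEL, 2);   /* 4 pages */

            if (!va)
                    return -ENOMEM;
            if (xen_create_contiguous_region(va, 2, 32) != 0) {
                    free_pages(va, 2);
                    return -ENOMEM;
            }

            /* ... buffer is now machine-contiguous; use it for DMA ... */

            xen_destroy_contiguous_region(va, 2);
            free_pages(va, 2);
            return 0;
    }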
17974+
17975+#ifdef __i386__
17976+int write_ldt_entry(void *ldt, int entry, __u32 entry_a, __u32 entry_b)
17977+{
17978+ __u32 *lp = (__u32 *)((char *)ldt + entry * 8);
17979+ maddr_t mach_lp = arbitrary_virt_to_machine(lp);
17980+ return HYPERVISOR_update_descriptor(
17981+ mach_lp, (u64)entry_a | ((u64)entry_b<<32));
17982+}
17983+#endif
17984diff -Nur linux-2.6.16.33-noxen/arch/i386/mm/init-xen.c linux-2.6.16.33/arch/i386/mm/init-xen.c
17985--- linux-2.6.16.33-noxen/arch/i386/mm/init-xen.c 1970-01-01 00:00:00.000000000 +0000
17986+++ linux-2.6.16.33/arch/i386/mm/init-xen.c 2007-01-08 15:00:45.000000000 +0000
17987@@ -0,0 +1,849 @@
17988+/*
17989+ * linux/arch/i386/mm/init.c
17990+ *
17991+ * Copyright (C) 1995 Linus Torvalds
17992+ *
17993+ * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
17994+ */
17995+
17996+#include <linux/config.h>
17997+#include <linux/module.h>
17998+#include <linux/signal.h>
17999+#include <linux/sched.h>
18000+#include <linux/kernel.h>
18001+#include <linux/errno.h>
18002+#include <linux/string.h>
18003+#include <linux/types.h>
18004+#include <linux/ptrace.h>
18005+#include <linux/mman.h>
18006+#include <linux/mm.h>
18007+#include <linux/hugetlb.h>
18008+#include <linux/swap.h>
18009+#include <linux/smp.h>
18010+#include <linux/init.h>
18011+#include <linux/highmem.h>
18012+#include <linux/pagemap.h>
18013+#include <linux/bootmem.h>
18014+#include <linux/slab.h>
18015+#include <linux/proc_fs.h>
18016+#include <linux/efi.h>
18017+#include <linux/memory_hotplug.h>
18018+#include <linux/initrd.h>
18019+#include <linux/dma-mapping.h>
18020+#include <linux/scatterlist.h>
18021+
18022+#include <asm/processor.h>
18023+#include <asm/system.h>
18024+#include <asm/uaccess.h>
18025+#include <asm/pgtable.h>
18026+#include <asm/dma.h>
18027+#include <asm/fixmap.h>
18028+#include <asm/e820.h>
18029+#include <asm/apic.h>
18030+#include <asm/tlb.h>
18031+#include <asm/tlbflush.h>
18032+#include <asm/sections.h>
18033+#include <asm/hypervisor.h>
18034+#include <asm/swiotlb.h>
18035+
18036+extern unsigned long *contiguous_bitmap;
18037+
18038+unsigned int __VMALLOC_RESERVE = 128 << 20;
18039+
18040+DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
18041+unsigned long highstart_pfn, highend_pfn;
18042+
18043+static int noinline do_test_wp_bit(void);
18044+
18045+/*
18046+ * Creates a middle page table and puts a pointer to it in the
18047+ * given global directory entry. This only returns the gd entry
18048+ * in non-PAE compilation mode, since the middle layer is folded.
18049+ */
18050+static pmd_t * __init one_md_table_init(pgd_t *pgd)
18051+{
18052+ pud_t *pud;
18053+ pmd_t *pmd_table;
18054+
18055+#ifdef CONFIG_X86_PAE
18056+ pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE);
18057+ make_lowmem_page_readonly(pmd_table, XENFEAT_writable_page_tables);
18058+ set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
18059+ pud = pud_offset(pgd, 0);
18060+ if (pmd_table != pmd_offset(pud, 0))
18061+ BUG();
18062+#else
18063+ pud = pud_offset(pgd, 0);
18064+ pmd_table = pmd_offset(pud, 0);
18065+#endif
18066+
18067+ return pmd_table;
18068+}
18069+
18070+/*
18071+ * Create a page table and place a pointer to it in a middle page
18072+ * directory entry.
18073+ */
18074+static pte_t * __init one_page_table_init(pmd_t *pmd)
18075+{
18076+ if (pmd_none(*pmd)) {
18077+ pte_t *page_table = (pte_t *) alloc_bootmem_low_pages(PAGE_SIZE);
18078+ make_lowmem_page_readonly(page_table,
18079+ XENFEAT_writable_page_tables);
18080+ set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE));
18081+ if (page_table != pte_offset_kernel(pmd, 0))
18082+ BUG();
18083+
18084+ return page_table;
18085+ }
18086+
18087+ return pte_offset_kernel(pmd, 0);
18088+}
18089+
18090+/*
18091+ * This function initializes a certain range of kernel virtual memory
18092+ * with new bootmem page tables, wherever page tables are missing in
18093+ * the given range.
18094+ */
18095+
18096+/*
18097+ * NOTE: The pagetables are allocated contiguously in physical memory,
18098+ * so we can cache the location of the first one and move around without
18099+ * checking the pgd every time.
18100+ */
18101+static void __init page_table_range_init (unsigned long start, unsigned long end, pgd_t *pgd_base)
18102+{
18103+ pgd_t *pgd;
18104+ pud_t *pud;
18105+ pmd_t *pmd;
18106+ int pgd_idx, pmd_idx;
18107+ unsigned long vaddr;
18108+
18109+ vaddr = start;
18110+ pgd_idx = pgd_index(vaddr);
18111+ pmd_idx = pmd_index(vaddr);
18112+ pgd = pgd_base + pgd_idx;
18113+
18114+ for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd++, pgd_idx++) {
18115+ if (pgd_none(*pgd))
18116+ one_md_table_init(pgd);
18117+ pud = pud_offset(pgd, vaddr);
18118+ pmd = pmd_offset(pud, vaddr);
18119+ for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end); pmd++, pmd_idx++) {
18120+ if (vaddr < hypervisor_virt_start && pmd_none(*pmd))
18121+ one_page_table_init(pmd);
18122+
18123+ vaddr += PMD_SIZE;
18124+ }
18125+ pmd_idx = 0;
18126+ }
18127+}
18128+
18129+static inline int is_kernel_text(unsigned long addr)
18130+{
18131+ if (addr >= PAGE_OFFSET && addr <= (unsigned long)__init_end)
18132+ return 1;
18133+ return 0;
18134+}
18135+
18136+/*
18137+ * This maps the physical memory to kernel virtual address space, a total
18138+ * of max_low_pfn pages, by creating page tables starting from address
18139+ * PAGE_OFFSET.
18140+ */
18141+static void __init kernel_physical_mapping_init(pgd_t *pgd_base)
18142+{
18143+ unsigned long pfn;
18144+ pgd_t *pgd;
18145+ pmd_t *pmd;
18146+ pte_t *pte;
18147+ int pgd_idx, pmd_idx, pte_ofs;
18148+
18149+ unsigned long max_ram_pfn = xen_start_info->nr_pages;
18150+ if (max_ram_pfn > max_low_pfn)
18151+ max_ram_pfn = max_low_pfn;
18152+
18153+ pgd_idx = pgd_index(PAGE_OFFSET);
18154+ pgd = pgd_base + pgd_idx;
18155+ pfn = 0;
18156+ pmd_idx = pmd_index(PAGE_OFFSET);
18157+ pte_ofs = pte_index(PAGE_OFFSET);
18158+
18159+ for (; pgd_idx < PTRS_PER_PGD; pgd++, pgd_idx++) {
18160+#ifdef CONFIG_XEN
18161+ /*
18162+ * Native Linux does not have PAE paging enabled yet at this
18163+ * point. When running as a Xen domain we are in PAE
18164+ * mode already, so we can't simply hook in an empty
18165+ * pmd. That would kill the mappings we are currently
18166+ * using ...
18167+ */
18168+ pmd = pmd_offset(pud_offset(pgd, PAGE_OFFSET), PAGE_OFFSET);
18169+#else
18170+ pmd = one_md_table_init(pgd);
18171+#endif
18172+ if (pfn >= max_low_pfn)
18173+ continue;
18174+ pmd += pmd_idx;
18175+ for (; pmd_idx < PTRS_PER_PMD && pfn < max_low_pfn; pmd++, pmd_idx++) {
18176+ unsigned int address = pfn * PAGE_SIZE + PAGE_OFFSET;
18177+ if (address >= hypervisor_virt_start)
18178+ continue;
18179+
18180+ /* Map with big pages if possible, otherwise create normal page tables. */
18181+ if (cpu_has_pse) {
18182+ unsigned int address2 = (pfn + PTRS_PER_PTE - 1) * PAGE_SIZE + PAGE_OFFSET + PAGE_SIZE-1;
18183+
18184+ if (is_kernel_text(address) || is_kernel_text(address2))
18185+ set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE_EXEC));
18186+ else
18187+ set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE));
18188+ pfn += PTRS_PER_PTE;
18189+ } else {
18190+ pte = one_page_table_init(pmd);
18191+
18192+ pte += pte_ofs;
18193+ for (; pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn; pte++, pfn++, pte_ofs++) {
18194+ /* XEN: Only map initial RAM allocation. */
18195+ if ((pfn >= max_ram_pfn) || pte_present(*pte))
18196+ continue;
18197+ if (is_kernel_text(address))
18198+ set_pte(pte, pfn_pte(pfn, PAGE_KERNEL_EXEC));
18199+ else
18200+ set_pte(pte, pfn_pte(pfn, PAGE_KERNEL));
18201+ }
18202+ pte_ofs = 0;
18203+ }
18204+ }
18205+ pmd_idx = 0;
18206+ }
18207+}
18208+
18209+#ifndef CONFIG_XEN
18210+
18211+static inline int page_kills_ppro(unsigned long pagenr)
18212+{
18213+ if (pagenr >= 0x70000 && pagenr <= 0x7003F)
18214+ return 1;
18215+ return 0;
18216+}
18217+
18218+#else
18219+
18220+#define page_kills_ppro(p) 0
18221+
18222+#endif
18223+
18224+extern int is_available_memory(efi_memory_desc_t *);
18225+
18226+int page_is_ram(unsigned long pagenr)
18227+{
18228+ int i;
18229+ unsigned long addr, end;
18230+
18231+ if (efi_enabled) {
18232+ efi_memory_desc_t *md;
18233+ void *p;
18234+
18235+ for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
18236+ md = p;
18237+ if (!is_available_memory(md))
18238+ continue;
18239+ addr = (md->phys_addr+PAGE_SIZE-1) >> PAGE_SHIFT;
18240+ end = (md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT)) >> PAGE_SHIFT;
18241+
18242+ if ((pagenr >= addr) && (pagenr < end))
18243+ return 1;
18244+ }
18245+ return 0;
18246+ }
18247+
18248+ for (i = 0; i < e820.nr_map; i++) {
18249+
18250+ if (e820.map[i].type != E820_RAM) /* not usable memory */
18251+ continue;
18252+ /*
18253+ * !!!FIXME!!! Some BIOSen report areas as RAM that
18254+ * are not. Notably the 640->1Mb area. We need a sanity
18255+ * check here.
18256+ */
18257+ addr = (e820.map[i].addr+PAGE_SIZE-1) >> PAGE_SHIFT;
18258+ end = (e820.map[i].addr+e820.map[i].size) >> PAGE_SHIFT;
18259+ if ((pagenr >= addr) && (pagenr < end))
18260+ return 1;
18261+ }
18262+ return 0;
18263+}
18264+
18265+#ifdef CONFIG_HIGHMEM
18266+pte_t *kmap_pte;
18267+pgprot_t kmap_prot;
18268+
18269+#define kmap_get_fixmap_pte(vaddr) \
18270+ pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr), vaddr), (vaddr)), (vaddr))
18271+
18272+static void __init kmap_init(void)
18273+{
18274+ unsigned long kmap_vstart;
18275+
18276+ /* cache the first kmap pte */
18277+ kmap_vstart = __fix_to_virt(FIX_KMAP_BEGIN);
18278+ kmap_pte = kmap_get_fixmap_pte(kmap_vstart);
18279+
18280+ kmap_prot = PAGE_KERNEL;
18281+}
18282+
18283+static void __init permanent_kmaps_init(pgd_t *pgd_base)
18284+{
18285+ pgd_t *pgd;
18286+ pud_t *pud;
18287+ pmd_t *pmd;
18288+ pte_t *pte;
18289+ unsigned long vaddr;
18290+
18291+ vaddr = PKMAP_BASE;
18292+ page_table_range_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, pgd_base);
18293+
18294+ pgd = swapper_pg_dir + pgd_index(vaddr);
18295+ pud = pud_offset(pgd, vaddr);
18296+ pmd = pmd_offset(pud, vaddr);
18297+ pte = pte_offset_kernel(pmd, vaddr);
18298+ pkmap_page_table = pte;
18299+}
18300+
18301+static void __meminit free_new_highpage(struct page *page, int pfn)
18302+{
18303+ set_page_count(page, 1);
18304+ if (pfn < xen_start_info->nr_pages)
18305+ __free_page(page);
18306+ totalhigh_pages++;
18307+}
18308+
18309+void __init add_one_highpage_init(struct page *page, int pfn, int bad_ppro)
18310+{
18311+ if (page_is_ram(pfn) && !(bad_ppro && page_kills_ppro(pfn))) {
18312+ ClearPageReserved(page);
18313+ free_new_highpage(page, pfn);
18314+ } else
18315+ SetPageReserved(page);
18316+}
18317+
18318+static int add_one_highpage_hotplug(struct page *page, unsigned long pfn)
18319+{
18320+ free_new_highpage(page, pfn);
18321+ totalram_pages++;
18322+#ifdef CONFIG_FLATMEM
18323+ max_mapnr = max(pfn, max_mapnr);
18324+#endif
18325+ num_physpages++;
18326+ return 0;
18327+}
18328+
18329+/*
18330+ * Not currently handling the NUMA case.
18331+ * Assuming single node and all memory that
18332+ * has been added dynamically that would be
18333+ * onlined here is in HIGHMEM
18334+ */
18335+void online_page(struct page *page)
18336+{
18337+ ClearPageReserved(page);
18338+ add_one_highpage_hotplug(page, page_to_pfn(page));
18339+}
18340+
18341+
18342+#ifdef CONFIG_NUMA
18343+extern void set_highmem_pages_init(int);
18344+#else
18345+static void __init set_highmem_pages_init(int bad_ppro)
18346+{
18347+ int pfn;
18348+ for (pfn = highstart_pfn; pfn < highend_pfn; pfn++)
18349+ add_one_highpage_init(pfn_to_page(pfn), pfn, bad_ppro);
18350+ totalram_pages += totalhigh_pages;
18351+}
18352+#endif /* CONFIG_NUMA */
18353+
18354+#else
18355+#define kmap_init() do { } while (0)
18356+#define permanent_kmaps_init(pgd_base) do { } while (0)
18357+#define set_highmem_pages_init(bad_ppro) do { } while (0)
18358+#endif /* CONFIG_HIGHMEM */
18359+
18360+unsigned long long __PAGE_KERNEL = _PAGE_KERNEL;
18361+EXPORT_SYMBOL(__PAGE_KERNEL);
18362+unsigned long long __PAGE_KERNEL_EXEC = _PAGE_KERNEL_EXEC;
18363+
18364+#ifdef CONFIG_NUMA
18365+extern void __init remap_numa_kva(void);
18366+#else
18367+#define remap_numa_kva() do {} while (0)
18368+#endif
18369+
18370+pgd_t *swapper_pg_dir;
18371+
18372+static void __init pagetable_init (void)
18373+{
18374+ unsigned long vaddr;
18375+ pgd_t *pgd_base = (pgd_t *)xen_start_info->pt_base;
18376+
18377+ swapper_pg_dir = pgd_base;
18378+ init_mm.pgd = pgd_base;
18379+
18380+ /* Enable PSE if available */
18381+ if (cpu_has_pse) {
18382+ set_in_cr4(X86_CR4_PSE);
18383+ }
18384+
18385+ /* Enable PGE if available */
18386+ if (cpu_has_pge) {
18387+ set_in_cr4(X86_CR4_PGE);
18388+ __PAGE_KERNEL |= _PAGE_GLOBAL;
18389+ __PAGE_KERNEL_EXEC |= _PAGE_GLOBAL;
18390+ }
18391+
18392+ kernel_physical_mapping_init(pgd_base);
18393+ remap_numa_kva();
18394+
18395+ /*
18396+ * Fixed mappings, only the page table structure has to be
18397+ * created - mappings will be set by set_fixmap():
18398+ */
18399+ vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK;
18400+ page_table_range_init(vaddr, hypervisor_virt_start, pgd_base);
18401+
18402+ permanent_kmaps_init(pgd_base);
18403+}
18404+
18405+#ifdef CONFIG_SOFTWARE_SUSPEND
18406+/*
18407+ * Swap suspend & friends need this for resume because things like the intel-agp
18408+ * driver might have split up a kernel 4MB mapping.
18409+ */
18410+char __nosavedata swsusp_pg_dir[PAGE_SIZE]
18411+ __attribute__ ((aligned (PAGE_SIZE)));
18412+
18413+static inline void save_pg_dir(void)
18414+{
18415+ memcpy(swsusp_pg_dir, swapper_pg_dir, PAGE_SIZE);
18416+}
18417+#else
18418+static inline void save_pg_dir(void)
18419+{
18420+}
18421+#endif
18422+
18423+void zap_low_mappings (void)
18424+{
18425+ int i;
18426+
18427+ save_pg_dir();
18428+
18429+ /*
18430+ * Zap initial low-memory mappings.
18431+ *
18432+ * Note that "pgd_clear()" doesn't do it for
18433+ * us, because pgd_clear() is a no-op on i386.
18434+ */
18435+ for (i = 0; i < USER_PTRS_PER_PGD; i++)
18436+#if defined(CONFIG_X86_PAE) && !defined(CONFIG_XEN)
18437+ set_pgd(swapper_pg_dir+i, __pgd(1 + __pa(empty_zero_page)));
18438+#else
18439+ set_pgd(swapper_pg_dir+i, __pgd(0));
18440+#endif
18441+ flush_tlb_all();
18442+}
18443+
18444+static int disable_nx __initdata = 0;
18445+u64 __supported_pte_mask __read_mostly = ~_PAGE_NX;
18446+EXPORT_SYMBOL(__supported_pte_mask);
18447+
18448+/*
18449+ * noexec = on|off
18450+ *
18451+ * Control non executable mappings.
18452+ *
18453+ * on Enable
18454+ * off Disable
18455+ */
18456+void __init noexec_setup(const char *str)
18457+{
18458+ if (!strncmp(str, "on",2) && cpu_has_nx) {
18459+ __supported_pte_mask |= _PAGE_NX;
18460+ disable_nx = 0;
18461+ } else if (!strncmp(str,"off",3)) {
18462+ disable_nx = 1;
18463+ __supported_pte_mask &= ~_PAGE_NX;
18464+ }
18465+}
18466+
18467+int nx_enabled = 0;
18468+#ifdef CONFIG_X86_PAE
18469+
18470+static void __init set_nx(void)
18471+{
18472+ unsigned int v[4], l, h;
18473+
18474+ if (cpu_has_pae && (cpuid_eax(0x80000000) > 0x80000001)) {
18475+ cpuid(0x80000001, &v[0], &v[1], &v[2], &v[3]);
18476+ if ((v[3] & (1 << 20)) && !disable_nx) {
18477+ rdmsr(MSR_EFER, l, h);
18478+ l |= EFER_NX;
18479+ wrmsr(MSR_EFER, l, h);
18480+ nx_enabled = 1;
18481+ __supported_pte_mask |= _PAGE_NX;
18482+ }
18483+ }
18484+}
18485+
18486+/*
18487+ * Enables/disables executability of a given kernel page and
18488+ * returns the previous setting.
18489+ */
18490+int __init set_kernel_exec(unsigned long vaddr, int enable)
18491+{
18492+ pte_t *pte;
18493+ int ret = 1;
18494+
18495+ if (!nx_enabled)
18496+ goto out;
18497+
18498+ pte = lookup_address(vaddr);
18499+ BUG_ON(!pte);
18500+
18501+ if (!pte_exec_kernel(*pte))
18502+ ret = 0;
18503+
18504+ if (enable)
18505+ pte->pte_high &= ~(1 << (_PAGE_BIT_NX - 32));
18506+ else
18507+ pte->pte_high |= 1 << (_PAGE_BIT_NX - 32);
18508+ __flush_tlb_all();
18509+out:
18510+ return ret;
18511+}
18512+
18513+#endif
18514+
18515+/*
18516+ * paging_init() sets up the page tables - note that the first 8MB are
18517+ * already mapped by head.S.
18518+ *
18519+ * This routine also unmaps the page at virtual kernel address 0, so
18520+ * that we can trap those pesky NULL-reference errors in the kernel.
18521+ */
18522+void __init paging_init(void)
18523+{
18524+ int i;
18525+
18526+#ifdef CONFIG_X86_PAE
18527+ set_nx();
18528+ if (nx_enabled)
18529+ printk("NX (Execute Disable) protection: active\n");
18530+#endif
18531+
18532+ pagetable_init();
18533+
18534+#if defined(CONFIG_X86_PAE) && !defined(CONFIG_XEN)
18535+ /*
18536+ * We will bail out later - printk doesn't work right now so
18537+ * the user would just see a hanging kernel.
18538+ * When running as a Xen domain, we are already in PAE mode at
18539+ * this point.
18540+ */
18541+ if (cpu_has_pae)
18542+ set_in_cr4(X86_CR4_PAE);
18543+#endif
18544+ __flush_tlb_all();
18545+
18546+ kmap_init();
18547+
18548+ /* Switch to the real shared_info page, and clear the
18549+ * dummy page. */
18550+ set_fixmap(FIX_SHARED_INFO, xen_start_info->shared_info);
18551+ HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO);
18552+ memset(empty_zero_page, 0, sizeof(empty_zero_page));
18553+
18554+ /* Setup mapping of lower 1st MB */
18555+ for (i = 0; i < NR_FIX_ISAMAPS; i++)
18556+ if (is_initial_xendomain())
18557+ set_fixmap(FIX_ISAMAP_BEGIN - i, i * PAGE_SIZE);
18558+ else
18559+ __set_fixmap(FIX_ISAMAP_BEGIN - i,
18560+ virt_to_machine(empty_zero_page),
18561+ PAGE_KERNEL_RO);
18562+}
18563+
18564+/*
18565+ * Test if the WP bit works in supervisor mode. It isn't supported on 386's
18566+ * and also on some strange 486's (NexGen etc.). All 586+'s are OK. This
18567+ * used to involve black magic jumps to work around some nasty CPU bugs,
18568+ * but fortunately the switch to using exceptions got rid of all that.
18569+ */
18570+
18571+static void __init test_wp_bit(void)
18572+{
18573+ printk("Checking if this processor honours the WP bit even in supervisor mode... ");
18574+
18575+ /* Any page-aligned address will do, the test is non-destructive */
18576+ __set_fixmap(FIX_WP_TEST, __pa(&swapper_pg_dir), PAGE_READONLY);
18577+ boot_cpu_data.wp_works_ok = do_test_wp_bit();
18578+ clear_fixmap(FIX_WP_TEST);
18579+
18580+ if (!boot_cpu_data.wp_works_ok) {
18581+ printk("No.\n");
18582+#ifdef CONFIG_X86_WP_WORKS_OK
18583+ panic("This kernel doesn't support CPU's with broken WP. Recompile it for a 386!");
18584+#endif
18585+ } else {
18586+ printk("Ok.\n");
18587+ }
18588+}
18589+
18590+static void __init set_max_mapnr_init(void)
18591+{
18592+#ifdef CONFIG_HIGHMEM
18593+ num_physpages = highend_pfn;
18594+#else
18595+ num_physpages = max_low_pfn;
18596+#endif
18597+#ifdef CONFIG_FLATMEM
18598+ max_mapnr = num_physpages;
18599+#endif
18600+}
18601+
18602+static struct kcore_list kcore_mem, kcore_vmalloc;
18603+
18604+void __init mem_init(void)
18605+{
18606+ extern int ppro_with_ram_bug(void);
18607+ int codesize, reservedpages, datasize, initsize;
18608+ int tmp;
18609+ int bad_ppro;
18610+ unsigned long pfn;
18611+
18612+ contiguous_bitmap = alloc_bootmem_low_pages(
18613+ (max_low_pfn + 2*BITS_PER_LONG) >> 3);
18614+ BUG_ON(!contiguous_bitmap);
18615+ memset(contiguous_bitmap, 0, (max_low_pfn + 2*BITS_PER_LONG) >> 3);
18616+
18617+#if defined(CONFIG_SWIOTLB)
18618+ swiotlb_init();
18619+#endif
18620+
18621+#ifdef CONFIG_FLATMEM
18622+ if (!mem_map)
18623+ BUG();
18624+#endif
18625+
18626+ bad_ppro = ppro_with_ram_bug();
18627+
18628+#ifdef CONFIG_HIGHMEM
18629+ /* check that fixmap and pkmap do not overlap */
18630+ if (PKMAP_BASE+LAST_PKMAP*PAGE_SIZE >= FIXADDR_START) {
18631+ printk(KERN_ERR "fixmap and kmap areas overlap - this will crash\n");
18632+ printk(KERN_ERR "pkstart: %lxh pkend: %lxh fixstart %lxh\n",
18633+ PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE, FIXADDR_START);
18634+ BUG();
18635+ }
18636+#endif
18637+
18638+ set_max_mapnr_init();
18639+
18640+#ifdef CONFIG_HIGHMEM
18641+ high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
18642+#else
18643+ high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
18644+#endif
18645+ printk("vmalloc area: %lx-%lx, maxmem %lx\n",
18646+ VMALLOC_START,VMALLOC_END,MAXMEM);
18647+ BUG_ON(VMALLOC_START > VMALLOC_END);
18648+
18649+ /* this will put all low memory onto the freelists */
18650+ totalram_pages += free_all_bootmem();
18651+ /* XEN: init and count low-mem pages outside initial allocation. */
18652+ for (pfn = xen_start_info->nr_pages; pfn < max_low_pfn; pfn++) {
18653+ ClearPageReserved(pfn_to_page(pfn));
18654+ set_page_count(pfn_to_page(pfn), 1);
18655+ totalram_pages++;
18656+ }
18657+
18658+ reservedpages = 0;
18659+ for (tmp = 0; tmp < max_low_pfn; tmp++)
18660+ /*
18661+ * Only count reserved RAM pages
18662+ */
18663+ if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp)))
18664+ reservedpages++;
18665+
18666+ set_highmem_pages_init(bad_ppro);
18667+
18668+ codesize = (unsigned long) &_etext - (unsigned long) &_text;
18669+ datasize = (unsigned long) &_edata - (unsigned long) &_etext;
18670+ initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin;
18671+
18672+ kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
18673+ kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
18674+ VMALLOC_END-VMALLOC_START);
18675+
18676+ printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, %dk reserved, %dk data, %dk init, %ldk highmem)\n",
18677+ (unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
18678+ num_physpages << (PAGE_SHIFT-10),
18679+ codesize >> 10,
18680+ reservedpages << (PAGE_SHIFT-10),
18681+ datasize >> 10,
18682+ initsize >> 10,
18683+ (unsigned long) (totalhigh_pages << (PAGE_SHIFT-10))
18684+ );
18685+
18686+#ifdef CONFIG_X86_PAE
18687+ if (!cpu_has_pae)
18688+ panic("cannot execute a PAE-enabled kernel on a PAE-less CPU!");
18689+#endif
18690+ if (boot_cpu_data.wp_works_ok < 0)
18691+ test_wp_bit();
18692+
18693+ /*
18694+ * Subtle. SMP is doing its boot stuff late (because it has to
18695+ * fork idle threads) - but it also needs low mappings for the
18696+ * protected-mode entry to work. We zap these entries only after
18697+ * the WP-bit has been tested.
18698+ */
18699+#ifndef CONFIG_SMP
18700+ zap_low_mappings();
18701+#endif
18702+
18703+ set_bit(PG_pinned, &virt_to_page(init_mm.pgd)->flags);
18704+}
18705+
18706+/*
18707+ * this is for the non-NUMA, single node SMP system case.
18708+ * Specifically, in the case of x86, we will always add
18709+ * memory to the highmem for now.
18710+ */
18711+#ifndef CONFIG_NEED_MULTIPLE_NODES
18712+int add_memory(u64 start, u64 size)
18713+{
18714+ struct pglist_data *pgdata = &contig_page_data;
18715+ struct zone *zone = pgdata->node_zones + MAX_NR_ZONES-1;
18716+ unsigned long start_pfn = start >> PAGE_SHIFT;
18717+ unsigned long nr_pages = size >> PAGE_SHIFT;
18718+
18719+ return __add_pages(zone, start_pfn, nr_pages);
18720+}
18721+
18722+int remove_memory(u64 start, u64 size)
18723+{
18724+ return -EINVAL;
18725+}
18726+#endif
18727+
18728+kmem_cache_t *pgd_cache;
18729+kmem_cache_t *pmd_cache;
18730+
18731+void __init pgtable_cache_init(void)
18732+{
18733+ if (PTRS_PER_PMD > 1) {
18734+ pmd_cache = kmem_cache_create("pmd",
18735+ PTRS_PER_PMD*sizeof(pmd_t),
18736+ PTRS_PER_PMD*sizeof(pmd_t),
18737+ 0,
18738+ pmd_ctor,
18739+ NULL);
18740+ if (!pmd_cache)
18741+ panic("pgtable_cache_init(): cannot create pmd cache");
18742+ }
18743+ pgd_cache = kmem_cache_create("pgd",
18744+#ifndef CONFIG_XEN
18745+ PTRS_PER_PGD*sizeof(pgd_t),
18746+ PTRS_PER_PGD*sizeof(pgd_t),
18747+#else
18748+ PAGE_SIZE,
18749+ PAGE_SIZE,
18750+#endif
18751+ 0,
18752+ pgd_ctor,
18753+ PTRS_PER_PMD == 1 ? pgd_dtor : NULL);
18754+ if (!pgd_cache)
18755+ panic("pgtable_cache_init(): Cannot create pgd cache");
18756+}
18757+
18758+/*
18759+ * This function cannot be __init, since exceptions don't work in that
18760+ * section. Put this after the callers, so that it cannot be inlined.
18761+ */
18762+static int noinline do_test_wp_bit(void)
18763+{
18764+ char tmp_reg;
18765+ int flag;
18766+
18767+ __asm__ __volatile__(
18768+ " movb %0,%1 \n"
18769+ "1: movb %1,%0 \n"
18770+ " xorl %2,%2 \n"
18771+ "2: \n"
18772+ ".section __ex_table,\"a\"\n"
18773+ " .align 4 \n"
18774+ " .long 1b,2b \n"
18775+ ".previous \n"
18776+ :"=m" (*(char *)fix_to_virt(FIX_WP_TEST)),
18777+ "=q" (tmp_reg),
18778+ "=r" (flag)
18779+ :"2" (1)
18780+ :"memory");
18781+
18782+ return flag;
18783+}
18784+
18785+void free_initmem(void)
18786+{
18787+ unsigned long addr;
18788+
18789+ addr = (unsigned long)(&__init_begin);
18790+ for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) {
18791+ ClearPageReserved(virt_to_page(addr));
18792+ set_page_count(virt_to_page(addr), 1);
18793+ memset((void *)addr, 0xcc, PAGE_SIZE);
18794+ free_page(addr);
18795+ totalram_pages++;
18796+ }
18797+ printk (KERN_INFO "Freeing unused kernel memory: %dk freed\n", (__init_end - __init_begin) >> 10);
18798+}
18799+
18800+#ifdef CONFIG_DEBUG_RODATA
18801+
18802+extern char __start_rodata, __end_rodata;
18803+void mark_rodata_ro(void)
18804+{
18805+ unsigned long addr = (unsigned long)&__start_rodata;
18806+
18807+ for (; addr < (unsigned long)&__end_rodata; addr += PAGE_SIZE)
18808+ change_page_attr(virt_to_page(addr), 1, PAGE_KERNEL_RO);
18809+
18810+ printk ("Write protecting the kernel read-only data: %luk\n",
18811+ (unsigned long)(&__end_rodata - &__start_rodata) >> 10);
18812+
18813+ /*
18814+ * change_page_attr() requires a global_flush_tlb() call after it.
18815+ * We do this after the printk so that if something went wrong in the
18816+ * change, the printk gets out at least to give a better debug hint
18817+ * of who is the culprit.
18818+ */
18819+ global_flush_tlb();
18820+}
18821+#endif
18822+
18823+
18824+#ifdef CONFIG_BLK_DEV_INITRD
18825+void free_initrd_mem(unsigned long start, unsigned long end)
18826+{
18827+ if (start < end)
18828+ printk (KERN_INFO "Freeing initrd memory: %ldk freed\n", (end - start) >> 10);
18829+ for (; start < end; start += PAGE_SIZE) {
18830+ ClearPageReserved(virt_to_page(start));
18831+ set_page_count(virt_to_page(start), 1);
18832+ free_page(start);
18833+ totalram_pages++;
18834+ }
18835+}
18836+#endif
18837diff -Nur linux-2.6.16.33-noxen/arch/i386/mm/ioremap-xen.c linux-2.6.16.33/arch/i386/mm/ioremap-xen.c
18838--- linux-2.6.16.33-noxen/arch/i386/mm/ioremap-xen.c 1970-01-01 00:00:00.000000000 +0000
18839+++ linux-2.6.16.33/arch/i386/mm/ioremap-xen.c 2007-01-08 15:00:45.000000000 +0000
18840@@ -0,0 +1,447 @@
18841+/*
18842+ * arch/i386/mm/ioremap.c
18843+ *
18844+ * Re-map IO memory to kernel address space so that we can access it.
18845+ * This is needed for high PCI addresses that aren't mapped in the
18846+ * 640k-1MB IO memory area on PC's
18847+ *
18848+ * (C) Copyright 1995 1996 Linus Torvalds
18849+ */
18850+
18851+#include <linux/vmalloc.h>
18852+#include <linux/init.h>
18853+#include <linux/slab.h>
18854+#include <linux/module.h>
18855+#include <asm/io.h>
18856+#include <asm/fixmap.h>
18857+#include <asm/cacheflush.h>
18858+#include <asm/tlbflush.h>
18859+#include <asm/pgtable.h>
18860+#include <asm/pgalloc.h>
18861+
18862+#define ISA_START_ADDRESS 0x0
18863+#define ISA_END_ADDRESS 0x100000
18864+
18865+static int direct_remap_area_pte_fn(pte_t *pte,
18866+ struct page *pmd_page,
18867+ unsigned long address,
18868+ void *data)
18869+{
18870+ mmu_update_t **v = (mmu_update_t **)data;
18871+
18872+ BUG_ON(!pte_none(*pte));
18873+
18874+ (*v)->ptr = ((u64)pfn_to_mfn(page_to_pfn(pmd_page)) <<
18875+ PAGE_SHIFT) | ((unsigned long)pte & ~PAGE_MASK);
18876+ (*v)++;
18877+
18878+ return 0;
18879+}
18880+
18881+static int __direct_remap_pfn_range(struct mm_struct *mm,
18882+ unsigned long address,
18883+ unsigned long mfn,
18884+ unsigned long size,
18885+ pgprot_t prot,
18886+ domid_t domid)
18887+{
18888+ int rc;
18889+ unsigned long i, start_address;
18890+ mmu_update_t *u, *v, *w;
18891+
18892+ u = v = w = (mmu_update_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT);
18893+ if (u == NULL)
18894+ return -ENOMEM;
18895+
18896+ start_address = address;
18897+
18898+ flush_cache_all();
18899+
18900+ for (i = 0; i < size; i += PAGE_SIZE) {
18901+ if ((v - u) == (PAGE_SIZE / sizeof(mmu_update_t))) {
18902+ /* Flush a full batch after filling in the PTE ptrs. */
18903+ rc = apply_to_page_range(mm, start_address,
18904+ address - start_address,
18905+ direct_remap_area_pte_fn, &w);
18906+ if (rc)
18907+ goto out;
18908+ rc = -EFAULT;
18909+ if (HYPERVISOR_mmu_update(u, v - u, NULL, domid) < 0)
18910+ goto out;
18911+ v = w = u;
18912+ start_address = address;
18913+ }
18914+
18915+ /*
18916+ * Fill in the machine address: PTE ptr is done later by
18917+ * __direct_remap_area_pages().
18918+ */
18919+ v->val = pte_val_ma(pfn_pte_ma(mfn, prot));
18920+
18921+ mfn++;
18922+ address += PAGE_SIZE;
18923+ v++;
18924+ }
18925+
18926+ if (v != u) {
18927+ /* Final batch. */
18928+ rc = apply_to_page_range(mm, start_address,
18929+ address - start_address,
18930+ direct_remap_area_pte_fn, &w);
18931+ if (rc)
18932+ goto out;
18933+ rc = -EFAULT;
18934+ if (unlikely(HYPERVISOR_mmu_update(u, v - u, NULL, domid) < 0))
18935+ goto out;
18936+ }
18937+
18938+ rc = 0;
18939+
18940+ out:
18941+ flush_tlb_all();
18942+
18943+ free_page((unsigned long)u);
18944+
18945+ return rc;
18946+}
18947+
18948+int direct_remap_pfn_range(struct vm_area_struct *vma,
18949+ unsigned long address,
18950+ unsigned long mfn,
18951+ unsigned long size,
18952+ pgprot_t prot,
18953+ domid_t domid)
18954+{
18955+ if (xen_feature(XENFEAT_auto_translated_physmap))
18956+ return remap_pfn_range(vma, address, mfn, size, prot);
18957+
18958+ if (domid == DOMID_SELF)
18959+ return -EINVAL;
18960+
18961+ vma->vm_flags |= VM_IO | VM_RESERVED;
18962+
18963+ vma->vm_mm->context.has_foreign_mappings = 1;
18964+
18965+ return __direct_remap_pfn_range(
18966+ vma->vm_mm, address, mfn, size, prot, domid);
18967+}
18968+EXPORT_SYMBOL(direct_remap_pfn_range);
18969+
18970+int direct_kernel_remap_pfn_range(unsigned long address,
18971+ unsigned long mfn,
18972+ unsigned long size,
18973+ pgprot_t prot,
18974+ domid_t domid)
18975+{
18976+ return __direct_remap_pfn_range(
18977+ &init_mm, address, mfn, size, prot, domid);
18978+}
18979+EXPORT_SYMBOL(direct_kernel_remap_pfn_range);
18980+
18981+static int lookup_pte_fn(
18982+ pte_t *pte, struct page *pmd_page, unsigned long addr, void *data)
18983+{
18984+ uint64_t *ptep = (uint64_t *)data;
18985+ if (ptep)
18986+ *ptep = ((uint64_t)pfn_to_mfn(page_to_pfn(pmd_page)) <<
18987+ PAGE_SHIFT) | ((unsigned long)pte & ~PAGE_MASK);
18988+ return 0;
18989+}
18990+
18991+int create_lookup_pte_addr(struct mm_struct *mm,
18992+ unsigned long address,
18993+ uint64_t *ptep)
18994+{
18995+ return apply_to_page_range(mm, address, PAGE_SIZE,
18996+ lookup_pte_fn, ptep);
18997+}
18998+
18999+EXPORT_SYMBOL(create_lookup_pte_addr);
19000+
19001+static int noop_fn(
19002+ pte_t *pte, struct page *pmd_page, unsigned long addr, void *data)
19003+{
19004+ return 0;
19005+}
19006+
19007+int touch_pte_range(struct mm_struct *mm,
19008+ unsigned long address,
19009+ unsigned long size)
19010+{
19011+ return apply_to_page_range(mm, address, size, noop_fn, NULL);
19012+}
19013+
19014+EXPORT_SYMBOL(touch_pte_range);
19015+
19016+/*
19017+ * Does @address reside within a non-highmem page that is local to this virtual
19018+ * machine (i.e., not an I/O page, nor a memory page belonging to another VM).
19019+ * See the comment that accompanies mfn_to_local_pfn() in page.h to understand
19020+ * why this works.
19021+ */
19022+static inline int is_local_lowmem(unsigned long address)
19023+{
19024+ extern unsigned long max_low_pfn;
19025+ return (mfn_to_local_pfn(address >> PAGE_SHIFT) < max_low_pfn);
19026+}
19027+
19028+/*
19029+ * Generic mapping function (not visible outside):
19030+ */
19031+
19032+/*
19033+ * Remap an arbitrary physical address space into the kernel virtual
19034+ * address space. Needed when the kernel wants to access high addresses
19035+ * directly.
19036+ *
19037+ * NOTE! We need to allow non-page-aligned mappings too: we will obviously
19038+ * have to convert them into an offset in a page-aligned mapping, but the
19039+ * caller shouldn't need to know that small detail.
19040+ */
19041+void __iomem * __ioremap(unsigned long phys_addr, unsigned long size, unsigned long flags)
19042+{
19043+ void __iomem * addr;
19044+ struct vm_struct * area;
19045+ unsigned long offset, last_addr;
19046+ domid_t domid = DOMID_IO;
19047+
19048+ /* Don't allow wraparound or zero size */
19049+ last_addr = phys_addr + size - 1;
19050+ if (!size || last_addr < phys_addr)
19051+ return NULL;
19052+
19053+ /*
19054+ * Don't remap the low PCI/ISA area, it's always mapped..
19055+ */
19056+ if (is_initial_xendomain() &&
19057+ phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS)
19058+ return (void __iomem *) isa_bus_to_virt(phys_addr);
19059+
19060+ /*
19061+ * Don't allow anybody to remap normal RAM that we're using..
19062+ */
19063+ if (is_local_lowmem(phys_addr)) {
19064+ char *t_addr, *t_end;
19065+ struct page *page;
19066+
19067+ t_addr = bus_to_virt(phys_addr);
19068+ t_end = t_addr + (size - 1);
19069+
19070+ for(page = virt_to_page(t_addr); page <= virt_to_page(t_end); page++)
19071+ if(!PageReserved(page))
19072+ return NULL;
19073+
19074+ domid = DOMID_SELF;
19075+ }
19076+
19077+ /*
19078+ * Mappings have to be page-aligned
19079+ */
19080+ offset = phys_addr & ~PAGE_MASK;
19081+ phys_addr &= PAGE_MASK;
19082+ size = PAGE_ALIGN(last_addr+1) - phys_addr;
19083+
19084+ /*
19085+ * Ok, go for it..
19086+ */
19087+ area = get_vm_area(size, VM_IOREMAP | (flags << 20));
19088+ if (!area)
19089+ return NULL;
19090+ area->phys_addr = phys_addr;
19091+ addr = (void __iomem *) area->addr;
19092+ flags |= _KERNPG_TABLE;
19093+ if (__direct_remap_pfn_range(&init_mm, (unsigned long)addr,
19094+ phys_addr>>PAGE_SHIFT,
19095+ size, __pgprot(flags), domid)) {
19096+ vunmap((void __force *) addr);
19097+ return NULL;
19098+ }
19099+ return (void __iomem *) (offset + (char __iomem *)addr);
19100+}
19101+EXPORT_SYMBOL(__ioremap);
19102+
19103+/**
19104+ * ioremap_nocache - map bus memory into CPU space
19105+ * @offset: bus address of the memory
19106+ * @size: size of the resource to map
19107+ *
19108+ * ioremap_nocache performs a platform specific sequence of operations to
19109+ * make bus memory CPU accessible via the readb/readw/readl/writeb/
19110+ * writew/writel functions and the other mmio helpers. The returned
19111+ * address is not guaranteed to be usable directly as a virtual
19112+ * address.
19113+ *
19114+ * This version of ioremap ensures that the memory is marked uncachable
19115+ * on the CPU as well as honouring existing caching rules from things like
19116+ * the PCI bus. Note that there are other caches and buffers on many
19117+ * busses. In particular, driver authors should read up on PCI writes.
19118+ *
19119+ * It's useful if some control registers are in such an area and
19120+ * write combining or read caching is not desirable:
19121+ *
19122+ * Must be freed with iounmap.
19123+ */
19124+
19125+void __iomem *ioremap_nocache (unsigned long phys_addr, unsigned long size)
19126+{
19127+ unsigned long last_addr;
19128+ void __iomem *p = __ioremap(phys_addr, size, _PAGE_PCD);
19129+ if (!p)
19130+ return p;
19131+
19132+ /* Guaranteed to be > phys_addr, as per __ioremap() */
19133+ last_addr = phys_addr + size - 1;
19134+
19135+ if (is_local_lowmem(last_addr)) {
19136+ struct page *ppage = virt_to_page(bus_to_virt(phys_addr));
19137+ unsigned long npages;
19138+
19139+ phys_addr &= PAGE_MASK;
19140+
19141+ /* This might overflow and become zero.. */
19142+ last_addr = PAGE_ALIGN(last_addr);
19143+
19144+ /* .. but that's ok, because modulo-2**n arithmetic will make
19145+ * the page-aligned "last - first" come out right.
19146+ */
19147+ npages = (last_addr - phys_addr) >> PAGE_SHIFT;
19148+
19149+ if (change_page_attr(ppage, npages, PAGE_KERNEL_NOCACHE) < 0) {
19150+ iounmap(p);
19151+ p = NULL;
19152+ }
19153+ global_flush_tlb();
19154+ }
19155+
19156+ return p;
19157+}
19158+EXPORT_SYMBOL(ioremap_nocache);
19159+
19160+/**
19161+ * iounmap - Free an IO remapping
19162+ * @addr: virtual address from ioremap_*
19163+ *
19164+ * Caller must ensure there is only one unmapping for the same pointer.
19165+ */
19166+void iounmap(volatile void __iomem *addr)
19167+{
19168+ struct vm_struct *p, *o;
19169+
19170+ if ((void __force *)addr <= high_memory)
19171+ return;
19172+
19173+ /*
19174+ * __ioremap special-cases the PCI/ISA range by not instantiating a
19175+ * vm_area and by simply returning an address into the kernel mapping
19176+ * of ISA space. So handle that here.
19177+ */
19178+ if ((unsigned long) addr >= fix_to_virt(FIX_ISAMAP_BEGIN))
19179+ return;
19180+
19181+ addr = (volatile void __iomem *)(PAGE_MASK & (unsigned long __force)addr);
19182+
19183+ /* Use the vm area unlocked, assuming the caller
19184+ ensures there isn't another iounmap for the same address
19185+ in parallel. Reuse of the virtual address is prevented by
19186+ leaving it in the global lists until we're done with it.
19187+ cpa takes care of the direct mappings. */
19188+ read_lock(&vmlist_lock);
19189+ for (p = vmlist; p; p = p->next) {
19190+ if (p->addr == addr)
19191+ break;
19192+ }
19193+ read_unlock(&vmlist_lock);
19194+
19195+ if (!p) {
19196+ printk("iounmap: bad address %p\n", addr);
19197+ dump_stack();
19198+ return;
19199+ }
19200+
19201+ /* Reset the direct mapping. Can block */
19202+ if ((p->flags >> 20) && is_local_lowmem(p->phys_addr)) {
19203+ /* p->size includes the guard page, but cpa doesn't like that */
19204+ change_page_attr(virt_to_page(bus_to_virt(p->phys_addr)),
19205+ (p->size - PAGE_SIZE) >> PAGE_SHIFT,
19206+ PAGE_KERNEL);
19207+ global_flush_tlb();
19208+ }
19209+
19210+ /* Finally remove it */
19211+ o = remove_vm_area((void *)addr);
19212+ BUG_ON(p != o || o == NULL);
19213+ kfree(p);
19214+}
19215+EXPORT_SYMBOL(iounmap);
19216+
19217+#ifdef __i386__
19218+
19219+void __init *bt_ioremap(unsigned long phys_addr, unsigned long size)
19220+{
19221+ unsigned long offset, last_addr;
19222+ unsigned int nrpages;
19223+ enum fixed_addresses idx;
19224+
19225+ /* Don't allow wraparound or zero size */
19226+ last_addr = phys_addr + size - 1;
19227+ if (!size || last_addr < phys_addr)
19228+ return NULL;
19229+
19230+ /*
19231+ * Don't remap the low PCI/ISA area, it's always mapped..
19232+ */
19233+ if (is_initial_xendomain() &&
19234+ phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS)
19235+ return isa_bus_to_virt(phys_addr);
19236+
19237+ /*
19238+ * Mappings have to be page-aligned
19239+ */
19240+ offset = phys_addr & ~PAGE_MASK;
19241+ phys_addr &= PAGE_MASK;
19242+ size = PAGE_ALIGN(last_addr) - phys_addr;
19243+
19244+ /*
19245+ * Mappings have to fit in the FIX_BTMAP area.
19246+ */
19247+ nrpages = size >> PAGE_SHIFT;
19248+ if (nrpages > NR_FIX_BTMAPS)
19249+ return NULL;
19250+
19251+ /*
19252+ * Ok, go for it..
19253+ */
19254+ idx = FIX_BTMAP_BEGIN;
19255+ while (nrpages > 0) {
19256+ set_fixmap(idx, phys_addr);
19257+ phys_addr += PAGE_SIZE;
19258+ --idx;
19259+ --nrpages;
19260+ }
19261+ return (void*) (offset + fix_to_virt(FIX_BTMAP_BEGIN));
19262+}
19263+
19264+void __init bt_iounmap(void *addr, unsigned long size)
19265+{
19266+ unsigned long virt_addr;
19267+ unsigned long offset;
19268+ unsigned int nrpages;
19269+ enum fixed_addresses idx;
19270+
19271+ virt_addr = (unsigned long)addr;
19272+ if (virt_addr < fix_to_virt(FIX_BTMAP_BEGIN))
19273+ return;
19274+ if (virt_addr >= fix_to_virt(FIX_ISAMAP_BEGIN))
19275+ return;
19276+ offset = virt_addr & ~PAGE_MASK;
19277+ nrpages = PAGE_ALIGN(offset + size - 1) >> PAGE_SHIFT;
19278+
19279+ idx = FIX_BTMAP_BEGIN;
19280+ while (nrpages > 0) {
19281+ clear_fixmap(idx);
19282+ --idx;
19283+ --nrpages;
19284+ }
19285+}
19286+
19287+#endif /* __i386__ */
19288diff -Nur linux-2.6.16.33-noxen/arch/i386/mm/pageattr.c linux-2.6.16.33/arch/i386/mm/pageattr.c
19289--- linux-2.6.16.33-noxen/arch/i386/mm/pageattr.c 2006-11-22 18:06:31.000000000 +0000
19290+++ linux-2.6.16.33/arch/i386/mm/pageattr.c 2007-05-23 21:00:01.000000000 +0000
19291@@ -78,7 +78,7 @@
19292 unsigned long flags;
19293
19294 set_pte_atomic(kpte, pte); /* change init_mm */
19295- if (PTRS_PER_PMD > 1)
19296+ if (HAVE_SHARED_KERNEL_PMD)
19297 return;
19298
19299 spin_lock_irqsave(&pgd_lock, flags);
19300diff -Nur linux-2.6.16.33-noxen/arch/i386/mm/pgtable-xen.c linux-2.6.16.33/arch/i386/mm/pgtable-xen.c
19301--- linux-2.6.16.33-noxen/arch/i386/mm/pgtable-xen.c 1970-01-01 00:00:00.000000000 +0000
19302+++ linux-2.6.16.33/arch/i386/mm/pgtable-xen.c 2007-01-08 15:00:45.000000000 +0000
19303@@ -0,0 +1,707 @@
19304+/*
19305+ * linux/arch/i386/mm/pgtable.c
19306+ */
19307+
19308+#include <linux/config.h>
19309+#include <linux/sched.h>
19310+#include <linux/kernel.h>
19311+#include <linux/errno.h>
19312+#include <linux/mm.h>
19313+#include <linux/swap.h>
19314+#include <linux/smp.h>
19315+#include <linux/highmem.h>
19316+#include <linux/slab.h>
19317+#include <linux/pagemap.h>
19318+#include <linux/spinlock.h>
19319+#include <linux/module.h>
19320+
19321+#include <asm/system.h>
19322+#include <asm/pgtable.h>
19323+#include <asm/pgalloc.h>
19324+#include <asm/fixmap.h>
19325+#include <asm/e820.h>
19326+#include <asm/tlb.h>
19327+#include <asm/tlbflush.h>
19328+#include <asm/io.h>
19329+#include <asm/mmu_context.h>
19330+
19331+#include <xen/features.h>
19332+#include <xen/foreign_page.h>
19333+#include <asm/hypervisor.h>
19334+
19335+static void pgd_test_and_unpin(pgd_t *pgd);
19336+
19337+void show_mem(void)
19338+{
19339+ int total = 0, reserved = 0;
19340+ int shared = 0, cached = 0;
19341+ int highmem = 0;
19342+ struct page *page;
19343+ pg_data_t *pgdat;
19344+ unsigned long i;
19345+ struct page_state ps;
19346+ unsigned long flags;
19347+
19348+ printk(KERN_INFO "Mem-info:\n");
19349+ show_free_areas();
19350+ printk(KERN_INFO "Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
19351+ for_each_pgdat(pgdat) {
19352+ pgdat_resize_lock(pgdat, &flags);
19353+ for (i = 0; i < pgdat->node_spanned_pages; ++i) {
19354+ page = pgdat_page_nr(pgdat, i);
19355+ total++;
19356+ if (PageHighMem(page))
19357+ highmem++;
19358+ if (PageReserved(page))
19359+ reserved++;
19360+ else if (PageSwapCache(page))
19361+ cached++;
19362+ else if (page_count(page))
19363+ shared += page_count(page) - 1;
19364+ }
19365+ pgdat_resize_unlock(pgdat, &flags);
19366+ }
19367+ printk(KERN_INFO "%d pages of RAM\n", total);
19368+ printk(KERN_INFO "%d pages of HIGHMEM\n", highmem);
19369+ printk(KERN_INFO "%d reserved pages\n", reserved);
19370+ printk(KERN_INFO "%d pages shared\n", shared);
19371+ printk(KERN_INFO "%d pages swap cached\n", cached);
19372+
19373+ get_page_state(&ps);
19374+ printk(KERN_INFO "%lu pages dirty\n", ps.nr_dirty);
19375+ printk(KERN_INFO "%lu pages writeback\n", ps.nr_writeback);
19376+ printk(KERN_INFO "%lu pages mapped\n", ps.nr_mapped);
19377+ printk(KERN_INFO "%lu pages slab\n", ps.nr_slab);
19378+ printk(KERN_INFO "%lu pages pagetables\n", ps.nr_page_table_pages);
19379+}
19380+
19381+/*
19382+ * Associate a virtual page frame with a given physical page frame
19383+ * and protection flags for that frame.
19384+ */
19385+static void set_pte_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
19386+{
19387+ pgd_t *pgd;
19388+ pud_t *pud;
19389+ pmd_t *pmd;
19390+ pte_t *pte;
19391+
19392+ pgd = swapper_pg_dir + pgd_index(vaddr);
19393+ if (pgd_none(*pgd)) {
19394+ BUG();
19395+ return;
19396+ }
19397+ pud = pud_offset(pgd, vaddr);
19398+ if (pud_none(*pud)) {
19399+ BUG();
19400+ return;
19401+ }
19402+ pmd = pmd_offset(pud, vaddr);
19403+ if (pmd_none(*pmd)) {
19404+ BUG();
19405+ return;
19406+ }
19407+ pte = pte_offset_kernel(pmd, vaddr);
19408+ if (pgprot_val(flags))
19409+ /* <pfn,flags> stored as-is, to permit clearing entries */
19410+ set_pte(pte, pfn_pte(pfn, flags));
19411+ else
19412+ pte_clear(&init_mm, vaddr, pte);
19413+
19414+ /*
19415+ * It's enough to flush this one mapping.
19416+ * (PGE mappings get flushed as well)
19417+ */
19418+ __flush_tlb_one(vaddr);
19419+}
19420+
19421+/*
19422+ * Associate a virtual page frame with a given physical page frame
19423+ * and protection flags for that frame.
19424+ */
19425+static void set_pte_pfn_ma(unsigned long vaddr, unsigned long pfn,
19426+ pgprot_t flags)
19427+{
19428+ pgd_t *pgd;
19429+ pud_t *pud;
19430+ pmd_t *pmd;
19431+ pte_t *pte;
19432+
19433+ pgd = swapper_pg_dir + pgd_index(vaddr);
19434+ if (pgd_none(*pgd)) {
19435+ BUG();
19436+ return;
19437+ }
19438+ pud = pud_offset(pgd, vaddr);
19439+ if (pud_none(*pud)) {
19440+ BUG();
19441+ return;
19442+ }
19443+ pmd = pmd_offset(pud, vaddr);
19444+ if (pmd_none(*pmd)) {
19445+ BUG();
19446+ return;
19447+ }
19448+ pte = pte_offset_kernel(pmd, vaddr);
19449+ if (pgprot_val(flags))
19450+ /* <pfn,flags> stored as-is, to permit clearing entries */
19451+ set_pte(pte, pfn_pte_ma(pfn, flags));
19452+ else
19453+ pte_clear(&init_mm, vaddr, pte);
19454+
19455+ /*
19456+ * It's enough to flush this one mapping.
19457+ * (PGE mappings get flushed as well)
19458+ */
19459+ __flush_tlb_one(vaddr);
19460+}
19461+
19462+/*
19463+ * Associate a large virtual page frame with a given physical page frame
19464+ * and protection flags for that frame. pfn is for the base of the page,
19465+ * vaddr is what the page gets mapped to - both must be properly aligned.
19466+ * The pmd must already be instantiated. Assumes PAE mode.
19467+ */
19468+void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
19469+{
19470+ pgd_t *pgd;
19471+ pud_t *pud;
19472+ pmd_t *pmd;
19473+
19474+ if (vaddr & (PMD_SIZE-1)) { /* vaddr is misaligned */
19475+ printk(KERN_WARNING "set_pmd_pfn: vaddr misaligned\n");
19476+ return; /* BUG(); */
19477+ }
19478+ if (pfn & (PTRS_PER_PTE-1)) { /* pfn is misaligned */
19479+ printk(KERN_WARNING "set_pmd_pfn: pfn misaligned\n");
19480+ return; /* BUG(); */
19481+ }
19482+ pgd = swapper_pg_dir + pgd_index(vaddr);
19483+ if (pgd_none(*pgd)) {
19484+ printk(KERN_WARNING "set_pmd_pfn: pgd_none\n");
19485+ return; /* BUG(); */
19486+ }
19487+ pud = pud_offset(pgd, vaddr);
19488+ pmd = pmd_offset(pud, vaddr);
19489+ set_pmd(pmd, pfn_pmd(pfn, flags));
19490+ /*
19491+ * It's enough to flush this one mapping.
19492+ * (PGE mappings get flushed as well)
19493+ */
19494+ __flush_tlb_one(vaddr);
19495+}
19496+
19497+static int nr_fixmaps = 0;
19498+unsigned long hypervisor_virt_start = HYPERVISOR_VIRT_START;
19499+unsigned long __FIXADDR_TOP = (HYPERVISOR_VIRT_START - 2 * PAGE_SIZE);
19500+EXPORT_SYMBOL(__FIXADDR_TOP);
19501+
19502+void __init set_fixaddr_top()
19503+{
19504+ BUG_ON(nr_fixmaps > 0);
19505+ __FIXADDR_TOP = hypervisor_virt_start - 2 * PAGE_SIZE;
19506+}
19507+
19508+void __set_fixmap (enum fixed_addresses idx, maddr_t phys, pgprot_t flags)
19509+{
19510+ unsigned long address = __fix_to_virt(idx);
19511+
19512+ if (idx >= __end_of_fixed_addresses) {
19513+ BUG();
19514+ return;
19515+ }
19516+ switch (idx) {
19517+ case FIX_WP_TEST:
19518+#ifdef CONFIG_X86_F00F_BUG
19519+ case FIX_F00F_IDT:
19520+#endif
19521+ set_pte_pfn(address, phys >> PAGE_SHIFT, flags);
19522+ break;
19523+ default:
19524+ set_pte_pfn_ma(address, phys >> PAGE_SHIFT, flags);
19525+ break;
19526+ }
19527+ nr_fixmaps++;
19528+}
19529+
19530+pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
19531+{
19532+ pte_t *pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO);
19533+ if (pte)
19534+ make_lowmem_page_readonly(pte, XENFEAT_writable_page_tables);
19535+ return pte;
19536+}
19537+
19538+struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
19539+{
19540+ struct page *pte;
19541+
19542+#ifdef CONFIG_HIGHPTE
19543+ pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0);
19544+#else
19545+ pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
19546+ if (pte) {
19547+ SetPageForeign(pte, pte_free);
19548+ set_page_count(pte, 1);
19549+ }
19550+#endif
19551+ return pte;
19552+}
19553+
19554+void pte_free(struct page *pte)
19555+{
19556+ unsigned long va = (unsigned long)__va(page_to_pfn(pte)<<PAGE_SHIFT);
19557+
19558+ if (!pte_write(*virt_to_ptep(va)))
19559+ BUG_ON(HYPERVISOR_update_va_mapping(
19560+ va, pfn_pte(page_to_pfn(pte), PAGE_KERNEL), 0));
19561+
19562+ ClearPageForeign(pte);
19563+ set_page_count(pte, 1);
19564+
19565+ __free_page(pte);
19566+}
19567+
19568+void pmd_ctor(void *pmd, kmem_cache_t *cache, unsigned long flags)
19569+{
19570+ memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t));
19571+}
19572+
19573+/*
19574+ * List of all pgd's needed for non-PAE so it can invalidate entries
19575+ * in both cached and uncached pgd's; not needed for PAE since the
19576+ * kernel pmd is shared. If PAE were not to share the pmd a similar
19577+ * tactic would be needed. This is essentially codepath-based locking
19578+ * against pageattr.c; it is the unique case in which a valid change
19579+ * of kernel pagetables can't be lazily synchronized by vmalloc faults.
19580+ * vmalloc faults work because attached pagetables are never freed.
19581+ * The locking scheme was chosen on the basis of manfred's
19582+ * recommendations and having no core impact whatsoever.
19583+ * -- wli
19584+ */
19585+DEFINE_SPINLOCK(pgd_lock);
19586+struct page *pgd_list;
19587+
19588+static inline void pgd_list_add(pgd_t *pgd)
19589+{
19590+ struct page *page = virt_to_page(pgd);
19591+ page->index = (unsigned long)pgd_list;
19592+ if (pgd_list)
19593+ set_page_private(pgd_list, (unsigned long)&page->index);
19594+ pgd_list = page;
19595+ set_page_private(page, (unsigned long)&pgd_list);
19596+}
19597+
19598+static inline void pgd_list_del(pgd_t *pgd)
19599+{
19600+ struct page *next, **pprev, *page = virt_to_page(pgd);
19601+ next = (struct page *)page->index;
19602+ pprev = (struct page **)page_private(page);
19603+ *pprev = next;
19604+ if (next)
19605+ set_page_private(next, (unsigned long)pprev);
19606+}
19607+
19608+void pgd_ctor(void *pgd, kmem_cache_t *cache, unsigned long unused)
19609+{
19610+ unsigned long flags;
19611+
19612+ if (PTRS_PER_PMD > 1) {
19613+ if (HAVE_SHARED_KERNEL_PMD)
19614+ clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD,
19615+ swapper_pg_dir + USER_PTRS_PER_PGD,
19616+ KERNEL_PGD_PTRS);
19617+ } else {
19618+ spin_lock_irqsave(&pgd_lock, flags);
19619+ clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD,
19620+ swapper_pg_dir + USER_PTRS_PER_PGD,
19621+ KERNEL_PGD_PTRS);
19622+ memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
19623+ pgd_list_add(pgd);
19624+ spin_unlock_irqrestore(&pgd_lock, flags);
19625+ }
19626+}
19627+
19628+/* never called when PTRS_PER_PMD > 1 */
19629+void pgd_dtor(void *pgd, kmem_cache_t *cache, unsigned long unused)
19630+{
19631+ unsigned long flags; /* can be called from interrupt context */
19632+
19633+ spin_lock_irqsave(&pgd_lock, flags);
19634+ pgd_list_del(pgd);
19635+ spin_unlock_irqrestore(&pgd_lock, flags);
19636+
19637+ pgd_test_and_unpin(pgd);
19638+}
19639+
19640+pgd_t *pgd_alloc(struct mm_struct *mm)
19641+{
19642+ int i;
19643+ pgd_t *pgd = kmem_cache_alloc(pgd_cache, GFP_KERNEL);
19644+ pmd_t **pmd;
19645+ unsigned long flags;
19646+
19647+ pgd_test_and_unpin(pgd);
19648+
19649+ if (PTRS_PER_PMD == 1 || !pgd)
19650+ return pgd;
19651+
19652+ if (HAVE_SHARED_KERNEL_PMD) {
19653+ for (i = 0; i < USER_PTRS_PER_PGD; ++i) {
19654+ pmd_t *pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL);
19655+ if (!pmd)
19656+ goto out_oom;
19657+ set_pgd(&pgd[i], __pgd(1 + __pa(pmd)));
19658+ }
19659+ return pgd;
19660+ }
19661+
19662+ /*
19663+ * We can race save/restore (if we sleep during a GFP_KERNEL memory
19664+ * allocation). We therefore store virtual addresses of pmds as they
19665+ * do not change across save/restore, and poke the machine addresses
19666+ * into the pgdir under the pgd_lock.
19667+ */
19668+ pmd = kmalloc(PTRS_PER_PGD * sizeof(pmd_t *), GFP_KERNEL);
19669+ if (!pmd) {
19670+ kmem_cache_free(pgd_cache, pgd);
19671+ return NULL;
19672+ }
19673+
19674+ /* Allocate pmds, remember virtual addresses. */
19675+ for (i = 0; i < PTRS_PER_PGD; ++i) {
19676+ pmd[i] = kmem_cache_alloc(pmd_cache, GFP_KERNEL);
19677+ if (!pmd[i])
19678+ goto out_oom;
19679+ }
19680+
19681+ spin_lock_irqsave(&pgd_lock, flags);
19682+
19683+ /* Protect against save/restore: move below 4GB under pgd_lock. */
19684+ if (!xen_feature(XENFEAT_pae_pgdir_above_4gb)) {
19685+ int rc = xen_create_contiguous_region(
19686+ (unsigned long)pgd, 0, 32);
19687+ if (rc) {
19688+ spin_unlock_irqrestore(&pgd_lock, flags);
19689+ goto out_oom;
19690+ }
19691+ }
19692+
19693+ /* Copy kernel pmd contents and write-protect the new pmds. */
19694+ for (i = USER_PTRS_PER_PGD; i < PTRS_PER_PGD; i++) {
19695+ unsigned long v = (unsigned long)i << PGDIR_SHIFT;
19696+ pgd_t *kpgd = pgd_offset_k(v);
19697+ pud_t *kpud = pud_offset(kpgd, v);
19698+ pmd_t *kpmd = pmd_offset(kpud, v);
19699+ memcpy(pmd[i], kpmd, PAGE_SIZE);
19700+ make_lowmem_page_readonly(
19701+ pmd[i], XENFEAT_writable_page_tables);
19702+ }
19703+
19704+ /* It is safe to poke machine addresses of pmds under the pgd_lock. */
19705+ for (i = 0; i < PTRS_PER_PGD; i++)
19706+ set_pgd(&pgd[i], __pgd(1 + __pa(pmd[i])));
19707+
19708+ /* Ensure this pgd gets picked up and pinned on save/restore. */
19709+ pgd_list_add(pgd);
19710+
19711+ spin_unlock_irqrestore(&pgd_lock, flags);
19712+
19713+ kfree(pmd);
19714+
19715+ return pgd;
19716+
19717+out_oom:
19718+ if (HAVE_SHARED_KERNEL_PMD) {
19719+ for (i--; i >= 0; i--)
19720+ kmem_cache_free(pmd_cache,
19721+ (void *)__va(pgd_val(pgd[i])-1));
19722+ } else {
19723+ for (i--; i >= 0; i--)
19724+ kmem_cache_free(pmd_cache, pmd[i]);
19725+ kfree(pmd);
19726+ }
19727+ kmem_cache_free(pgd_cache, pgd);
19728+ return NULL;
19729+}
19730+
19731+void pgd_free(pgd_t *pgd)
19732+{
19733+ int i;
19734+
19735+ /*
19736+ * After this the pgd should not be pinned for the duration of this
19737+ * function's execution. We should never sleep and thus never race:
19738+ * 1. User pmds will not become write-protected under our feet due
19739+ * to a concurrent mm_pin_all().
19740+ * 2. The machine addresses in PGD entries will not become invalid
19741+ * due to a concurrent save/restore.
19742+ */
19743+ pgd_test_and_unpin(pgd);
19744+
19745+ /* in the PAE case user pgd entries are overwritten before usage */
19746+ if (PTRS_PER_PMD > 1) {
19747+ for (i = 0; i < USER_PTRS_PER_PGD; ++i) {
19748+ pmd_t *pmd = (void *)__va(pgd_val(pgd[i])-1);
19749+ kmem_cache_free(pmd_cache, pmd);
19750+ }
19751+
19752+ if (!HAVE_SHARED_KERNEL_PMD) {
19753+ unsigned long flags;
19754+ spin_lock_irqsave(&pgd_lock, flags);
19755+ pgd_list_del(pgd);
19756+ spin_unlock_irqrestore(&pgd_lock, flags);
19757+
19758+ for (i = USER_PTRS_PER_PGD; i < PTRS_PER_PGD; i++) {
19759+ pmd_t *pmd = (void *)__va(pgd_val(pgd[i])-1);
19760+ make_lowmem_page_writable(
19761+ pmd, XENFEAT_writable_page_tables);
19762+ memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t));
19763+ kmem_cache_free(pmd_cache, pmd);
19764+ }
19765+
19766+ if (!xen_feature(XENFEAT_pae_pgdir_above_4gb))
19767+ xen_destroy_contiguous_region(
19768+ (unsigned long)pgd, 0);
19769+ }
19770+ }
19771+
19772+ /* in the non-PAE case, free_pgtables() clears user pgd entries */
19773+ kmem_cache_free(pgd_cache, pgd);
19774+}
19775+
19776+void make_lowmem_page_readonly(void *va, unsigned int feature)
19777+{
19778+ pte_t *pte;
19779+ int rc;
19780+
19781+ if (xen_feature(feature))
19782+ return;
19783+
19784+ pte = virt_to_ptep(va);
19785+ rc = HYPERVISOR_update_va_mapping(
19786+ (unsigned long)va, pte_wrprotect(*pte), 0);
19787+ BUG_ON(rc);
19788+}
19789+
19790+void make_lowmem_page_writable(void *va, unsigned int feature)
19791+{
19792+ pte_t *pte;
19793+ int rc;
19794+
19795+ if (xen_feature(feature))
19796+ return;
19797+
19798+ pte = virt_to_ptep(va);
19799+ rc = HYPERVISOR_update_va_mapping(
19800+ (unsigned long)va, pte_mkwrite(*pte), 0);
19801+ BUG_ON(rc);
19802+}
19803+
19804+void make_page_readonly(void *va, unsigned int feature)
19805+{
19806+ pte_t *pte;
19807+ int rc;
19808+
19809+ if (xen_feature(feature))
19810+ return;
19811+
19812+ pte = virt_to_ptep(va);
19813+ rc = HYPERVISOR_update_va_mapping(
19814+ (unsigned long)va, pte_wrprotect(*pte), 0);
19815+ if (rc) /* fallback? */
19816+ xen_l1_entry_update(pte, pte_wrprotect(*pte));
19817+ if ((unsigned long)va >= (unsigned long)high_memory) {
19818+ unsigned long pfn = pte_pfn(*pte);
19819+#ifdef CONFIG_HIGHMEM
19820+ if (pfn >= highstart_pfn)
19821+ kmap_flush_unused(); /* flush stale writable kmaps */
19822+ else
19823+#endif
19824+ make_lowmem_page_readonly(
19825+ phys_to_virt(pfn << PAGE_SHIFT), feature);
19826+ }
19827+}
19828+
19829+void make_page_writable(void *va, unsigned int feature)
19830+{
19831+ pte_t *pte;
19832+ int rc;
19833+
19834+ if (xen_feature(feature))
19835+ return;
19836+
19837+ pte = virt_to_ptep(va);
19838+ rc = HYPERVISOR_update_va_mapping(
19839+ (unsigned long)va, pte_mkwrite(*pte), 0);
19840+ if (rc) /* fallback? */
19841+ xen_l1_entry_update(pte, pte_mkwrite(*pte));
19842+ if ((unsigned long)va >= (unsigned long)high_memory) {
19843+ unsigned long pfn = pte_pfn(*pte);
19844+#ifdef CONFIG_HIGHMEM
19845+ if (pfn < highstart_pfn)
19846+#endif
19847+ make_lowmem_page_writable(
19848+ phys_to_virt(pfn << PAGE_SHIFT), feature);
19849+ }
19850+}
19851+
19852+void make_pages_readonly(void *va, unsigned int nr, unsigned int feature)
19853+{
19854+ if (xen_feature(feature))
19855+ return;
19856+
19857+ while (nr-- != 0) {
19858+ make_page_readonly(va, feature);
19859+ va = (void *)((unsigned long)va + PAGE_SIZE);
19860+ }
19861+}
19862+
19863+void make_pages_writable(void *va, unsigned int nr, unsigned int feature)
19864+{
19865+ if (xen_feature(feature))
19866+ return;
19867+
19868+ while (nr-- != 0) {
19869+ make_page_writable(va, feature);
19870+ va = (void *)((unsigned long)va + PAGE_SIZE);
19871+ }
19872+}
19873+
19874+static inline void pgd_walk_set_prot(void *pt, pgprot_t flags)
19875+{
19876+ struct page *page = virt_to_page(pt);
19877+ unsigned long pfn = page_to_pfn(page);
19878+
19879+ if (PageHighMem(page))
19880+ return;
19881+ BUG_ON(HYPERVISOR_update_va_mapping(
19882+ (unsigned long)__va(pfn << PAGE_SHIFT),
19883+ pfn_pte(pfn, flags), 0));
19884+}
19885+
19886+static void pgd_walk(pgd_t *pgd_base, pgprot_t flags)
19887+{
19888+ pgd_t *pgd = pgd_base;
19889+ pud_t *pud;
19890+ pmd_t *pmd;
19891+ pte_t *pte;
19892+ int g, u, m;
19893+
19894+ if (xen_feature(XENFEAT_auto_translated_physmap))
19895+ return;
19896+
19897+ for (g = 0; g < USER_PTRS_PER_PGD; g++, pgd++) {
19898+ if (pgd_none(*pgd))
19899+ continue;
19900+ pud = pud_offset(pgd, 0);
19901+ if (PTRS_PER_PUD > 1) /* not folded */
19902+ pgd_walk_set_prot(pud,flags);
19903+ for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
19904+ if (pud_none(*pud))
19905+ continue;
19906+ pmd = pmd_offset(pud, 0);
19907+ if (PTRS_PER_PMD > 1) /* not folded */
19908+ pgd_walk_set_prot(pmd,flags);
19909+ for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
19910+ if (pmd_none(*pmd))
19911+ continue;
19912+ pte = pte_offset_kernel(pmd,0);
19913+ pgd_walk_set_prot(pte,flags);
19914+ }
19915+ }
19916+ }
19917+
19918+ BUG_ON(HYPERVISOR_update_va_mapping(
19919+ (unsigned long)pgd_base,
19920+ pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags),
19921+ UVMF_TLB_FLUSH));
19922+}
19923+
19924+static void __pgd_pin(pgd_t *pgd)
19925+{
19926+ pgd_walk(pgd, PAGE_KERNEL_RO);
19927+ xen_pgd_pin(__pa(pgd));
19928+ set_bit(PG_pinned, &virt_to_page(pgd)->flags);
19929+}
19930+
19931+static void __pgd_unpin(pgd_t *pgd)
19932+{
19933+ xen_pgd_unpin(__pa(pgd));
19934+ pgd_walk(pgd, PAGE_KERNEL);
19935+ clear_bit(PG_pinned, &virt_to_page(pgd)->flags);
19936+}
19937+
19938+static void pgd_test_and_unpin(pgd_t *pgd)
19939+{
19940+ if (test_bit(PG_pinned, &virt_to_page(pgd)->flags))
19941+ __pgd_unpin(pgd);
19942+}
19943+
19944+void mm_pin(struct mm_struct *mm)
19945+{
19946+ if (xen_feature(XENFEAT_writable_page_tables))
19947+ return;
19948+ spin_lock(&mm->page_table_lock);
19949+ __pgd_pin(mm->pgd);
19950+ spin_unlock(&mm->page_table_lock);
19951+}
19952+
19953+void mm_unpin(struct mm_struct *mm)
19954+{
19955+ if (xen_feature(XENFEAT_writable_page_tables))
19956+ return;
19957+ spin_lock(&mm->page_table_lock);
19958+ __pgd_unpin(mm->pgd);
19959+ spin_unlock(&mm->page_table_lock);
19960+}
19961+
19962+void mm_pin_all(void)
19963+{
19964+ struct page *page;
19965+
19966+ /* Only pgds on the pgd_list please: none hidden in the slab cache. */
19967+ kmem_cache_shrink(pgd_cache);
19968+
19969+ if (xen_feature(XENFEAT_writable_page_tables))
19970+ return;
19971+
19972+ for (page = pgd_list; page; page = (struct page *)page->index) {
19973+ if (!test_bit(PG_pinned, &page->flags))
19974+ __pgd_pin((pgd_t *)page_address(page));
19975+ }
19976+}
19977+
19978+void _arch_dup_mmap(struct mm_struct *mm)
19979+{
19980+ if (!test_bit(PG_pinned, &virt_to_page(mm->pgd)->flags))
19981+ mm_pin(mm);
19982+}
19983+
19984+void _arch_exit_mmap(struct mm_struct *mm)
19985+{
19986+ struct task_struct *tsk = current;
19987+
19988+ task_lock(tsk);
19989+
19990+ /*
19991+ * We aggressively remove defunct pgd from cr3. We execute unmap_vmas()
19992+ * *much* faster this way, as no tlb flushes means bigger wrpt batches.
19993+ */
19994+ if (tsk->active_mm == mm) {
19995+ tsk->active_mm = &init_mm;
19996+ atomic_inc(&init_mm.mm_count);
19997+
19998+ switch_mm(mm, &init_mm, tsk);
19999+
20000+ atomic_dec(&mm->mm_count);
20001+ BUG_ON(atomic_read(&mm->mm_count) == 0);
20002+ }
20003+
20004+ task_unlock(tsk);
20005+
20006+ if (test_bit(PG_pinned, &virt_to_page(mm->pgd)->flags) &&
20007+ (atomic_read(&mm->mm_count) == 1) &&
20008+ !mm->context.has_foreign_mappings)
20009+ mm_unpin(mm);
20010+}
20011diff -Nur linux-2.6.16.33-noxen/arch/i386/mm/pgtable.c linux-2.6.16.33/arch/i386/mm/pgtable.c
20012--- linux-2.6.16.33-noxen/arch/i386/mm/pgtable.c 2006-11-22 18:06:31.000000000 +0000
20013+++ linux-2.6.16.33/arch/i386/mm/pgtable.c 2007-01-08 15:00:45.000000000 +0000
20014@@ -13,6 +13,7 @@
20015 #include <linux/slab.h>
20016 #include <linux/pagemap.h>
20017 #include <linux/spinlock.h>
20018+#include <linux/module.h>
20019
20020 #include <asm/system.h>
20021 #include <asm/pgtable.h>
20022@@ -138,6 +139,10 @@
20023 __flush_tlb_one(vaddr);
20024 }
20025
20026+static int nr_fixmaps = 0;
20027+unsigned long __FIXADDR_TOP = 0xfffff000;
20028+EXPORT_SYMBOL(__FIXADDR_TOP);
20029+
20030 void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t flags)
20031 {
20032 unsigned long address = __fix_to_virt(idx);
20033@@ -147,6 +152,13 @@
20034 return;
20035 }
20036 set_pte_pfn(address, phys >> PAGE_SHIFT, flags);
20037+ nr_fixmaps++;
20038+}
20039+
20040+void set_fixaddr_top(unsigned long top)
20041+{
20042+ BUG_ON(nr_fixmaps > 0);
20043+ __FIXADDR_TOP = top - PAGE_SIZE;
20044 }
20045
20046 pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
20047diff -Nur linux-2.6.16.33-noxen/arch/i386/oprofile/Makefile linux-2.6.16.33/arch/i386/oprofile/Makefile
20048--- linux-2.6.16.33-noxen/arch/i386/oprofile/Makefile 2006-11-22 18:06:31.000000000 +0000
20049+++ linux-2.6.16.33/arch/i386/oprofile/Makefile 2007-01-08 15:00:45.000000000 +0000
20050@@ -6,7 +6,14 @@
20051 oprofilefs.o oprofile_stats.o \
20052 timer_int.o )
20053
20054+ifdef CONFIG_XEN
20055+XENOPROF_COMMON_OBJS = $(addprefix ../../../drivers/xen/xenoprof/, \
20056+ xenoprofile.o)
20057+oprofile-y := $(DRIVER_OBJS) \
20058+ $(XENOPROF_COMMON_OBJS) xenoprof.o
20059+else
20060 oprofile-y := $(DRIVER_OBJS) init.o backtrace.o
20061 oprofile-$(CONFIG_X86_LOCAL_APIC) += nmi_int.o op_model_athlon.o \
20062 op_model_ppro.o op_model_p4.o
20063 oprofile-$(CONFIG_X86_IO_APIC) += nmi_timer_int.o
20064+endif
20065diff -Nur linux-2.6.16.33-noxen/arch/i386/oprofile/xenoprof.c linux-2.6.16.33/arch/i386/oprofile/xenoprof.c
20066--- linux-2.6.16.33-noxen/arch/i386/oprofile/xenoprof.c 1970-01-01 00:00:00.000000000 +0000
20067+++ linux-2.6.16.33/arch/i386/oprofile/xenoprof.c 2007-01-08 15:00:45.000000000 +0000
20068@@ -0,0 +1,179 @@
20069+/**
20070+ * @file xenoprof.c
20071+ *
20072+ * @remark Copyright 2002 OProfile authors
20073+ * @remark Read the file COPYING
20074+ *
20075+ * @author John Levon <levon@movementarian.org>
20076+ *
20077+ * Modified by Aravind Menon and Jose Renato Santos for Xen
20078+ * These modifications are:
20079+ * Copyright (C) 2005 Hewlett-Packard Co.
20080+ *
20081+ * x86-specific part
20082+ * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp>
20083+ * VA Linux Systems Japan K.K.
20084+ */
20085+
20086+#include <linux/init.h>
20087+#include <linux/oprofile.h>
20088+#include <linux/sched.h>
20089+#include <asm/pgtable.h>
20090+
20091+#include <xen/driver_util.h>
20092+#include <xen/interface/xen.h>
20093+#include <xen/interface/xenoprof.h>
20094+#include <xen/xenoprof.h>
20095+#include "op_counter.h"
20096+
20097+static unsigned int num_events = 0;
20098+
20099+void __init xenoprof_arch_init_counter(struct xenoprof_init *init)
20100+{
20101+ num_events = init->num_events;
20102+ /* just in case - make sure we do not overflow event list
20103+ (i.e. counter_config list) */
20104+ if (num_events > OP_MAX_COUNTER) {
20105+ num_events = OP_MAX_COUNTER;
20106+ init->num_events = num_events;
20107+ }
20108+}
20109+
20110+void xenoprof_arch_counter(void)
20111+{
20112+ int i;
20113+ struct xenoprof_counter counter;
20114+
20115+ for (i=0; i<num_events; i++) {
20116+ counter.ind = i;
20117+ counter.count = (uint64_t)counter_config[i].count;
20118+ counter.enabled = (uint32_t)counter_config[i].enabled;
20119+ counter.event = (uint32_t)counter_config[i].event;
20120+ counter.kernel = (uint32_t)counter_config[i].kernel;
20121+ counter.user = (uint32_t)counter_config[i].user;
20122+ counter.unit_mask = (uint64_t)counter_config[i].unit_mask;
20123+ HYPERVISOR_xenoprof_op(XENOPROF_counter,
20124+ &counter);
20125+ }
20126+}
20127+
20128+void xenoprof_arch_start(void)
20129+{
20130+ /* nothing */
20131+}
20132+
20133+void xenoprof_arch_stop(void)
20134+{
20135+ /* nothing */
20136+}
20137+
20138+void xenoprof_arch_unmap_shared_buffer(struct xenoprof_shared_buffer * sbuf)
20139+{
20140+ if (sbuf->buffer) {
20141+ vunmap(sbuf->buffer);
20142+ sbuf->buffer = NULL;
20143+ }
20144+}
20145+
20146+int xenoprof_arch_map_shared_buffer(struct xenoprof_get_buffer * get_buffer,
20147+ struct xenoprof_shared_buffer * sbuf)
20148+{
20149+ int npages, ret;
20150+ struct vm_struct *area;
20151+
20152+ sbuf->buffer = NULL;
20153+ if ( (ret = HYPERVISOR_xenoprof_op(XENOPROF_get_buffer, get_buffer)) )
20154+ return ret;
20155+
20156+ npages = (get_buffer->bufsize * get_buffer->nbuf - 1) / PAGE_SIZE + 1;
20157+
20158+ area = alloc_vm_area(npages * PAGE_SIZE);
20159+ if (area == NULL)
20160+ return -ENOMEM;
20161+
20162+ if ( (ret = direct_kernel_remap_pfn_range(
20163+ (unsigned long)area->addr,
20164+ get_buffer->buf_gmaddr >> PAGE_SHIFT,
20165+ npages * PAGE_SIZE, __pgprot(_KERNPG_TABLE),
20166+ DOMID_SELF)) ) {
20167+ vunmap(area->addr);
20168+ return ret;
20169+ }
20170+
20171+ sbuf->buffer = area->addr;
20172+ return ret;
20173+}
20174+
20175+int xenoprof_arch_set_passive(struct xenoprof_passive * pdomain,
20176+ struct xenoprof_shared_buffer * sbuf)
20177+{
20178+ int ret;
20179+ int npages;
20180+ struct vm_struct *area;
20181+ pgprot_t prot = __pgprot(_KERNPG_TABLE);
20182+
20183+ sbuf->buffer = NULL;
20184+ ret = HYPERVISOR_xenoprof_op(XENOPROF_set_passive, pdomain);
20185+ if (ret)
20186+ goto out;
20187+
20188+ npages = (pdomain->bufsize * pdomain->nbuf - 1) / PAGE_SIZE + 1;
20189+
20190+ area = alloc_vm_area(npages * PAGE_SIZE);
20191+ if (area == NULL) {
20192+ ret = -ENOMEM;
20193+ goto out;
20194+ }
20195+
20196+ ret = direct_kernel_remap_pfn_range(
20197+ (unsigned long)area->addr,
20198+ pdomain->buf_gmaddr >> PAGE_SHIFT,
20199+ npages * PAGE_SIZE, prot, DOMID_SELF);
20200+ if (ret) {
20201+ vunmap(area->addr);
20202+ goto out;
20203+ }
20204+ sbuf->buffer = area->addr;
20205+
20206+out:
20207+ return ret;
20208+}
20209+
20210+struct op_counter_config counter_config[OP_MAX_COUNTER];
20211+
20212+int xenoprof_create_files(struct super_block * sb, struct dentry * root)
20213+{
20214+ unsigned int i;
20215+
20216+ for (i = 0; i < num_events; ++i) {
20217+ struct dentry * dir;
20218+ char buf[2];
20219+
20220+ snprintf(buf, 2, "%d", i);
20221+ dir = oprofilefs_mkdir(sb, root, buf);
20222+ oprofilefs_create_ulong(sb, dir, "enabled",
20223+ &counter_config[i].enabled);
20224+ oprofilefs_create_ulong(sb, dir, "event",
20225+ &counter_config[i].event);
20226+ oprofilefs_create_ulong(sb, dir, "count",
20227+ &counter_config[i].count);
20228+ oprofilefs_create_ulong(sb, dir, "unit_mask",
20229+ &counter_config[i].unit_mask);
20230+ oprofilefs_create_ulong(sb, dir, "kernel",
20231+ &counter_config[i].kernel);
20232+ oprofilefs_create_ulong(sb, dir, "user",
20233+ &counter_config[i].user);
20234+ }
20235+
20236+ return 0;
20237+}
20238+
20239+int __init oprofile_arch_init(struct oprofile_operations * ops)
20240+{
20241+ return xenoprofile_init(ops);
20242+}
20243+
20244+void oprofile_arch_exit(void)
20245+{
20246+ xenoprofile_exit();
20247+}
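
xenoprof_arch_map_shared_buffer() above sizes its vmap area with npages = (bufsize * nbuf - 1) / PAGE_SIZE + 1, i.e. the total sample-buffer size rounded up to whole pages, before backing it with direct_kernel_remap_pfn_range(). The stand-alone C sketch below only illustrates that rounding; PAGE_SIZE is fixed at 4096 and the buffer sizes are invented for the example:

    #include <assert.h>
    #include <stdio.h>

    #define PAGE_SIZE 4096UL   /* assumed page size for this example */

    /* same rounding as the patch: smallest page count covering `bytes` (bytes > 0) */
    static unsigned long pages_needed(unsigned long bytes)
    {
            return (bytes - 1) / PAGE_SIZE + 1;
    }

    int main(void)
    {
            /* made-up workload: 32 sample buffers of 16 KiB each */
            unsigned long bufsize = 16384, nbuf = 32;
            unsigned long npages = pages_needed(bufsize * nbuf);

            assert(npages * PAGE_SIZE >= bufsize * nbuf);       /* range fully covered */
            assert((npages - 1) * PAGE_SIZE < bufsize * nbuf);  /* and not one page more */
            printf("%lu bytes -> %lu pages\n", bufsize * nbuf, npages);
            return 0;
    }

Rounding up guarantees that the mapping covers the whole guest buffer range returned by the hypervisor; in the exact-fit case (as here) it simply yields bufsize * nbuf / PAGE_SIZE.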
20248diff -Nur linux-2.6.16.33-noxen/arch/i386/pci/Makefile linux-2.6.16.33/arch/i386/pci/Makefile
20249--- linux-2.6.16.33-noxen/arch/i386/pci/Makefile 2006-11-22 18:06:31.000000000 +0000
20250+++ linux-2.6.16.33/arch/i386/pci/Makefile 2007-01-08 15:00:45.000000000 +0000
20251@@ -4,6 +4,10 @@
20252 obj-$(CONFIG_PCI_MMCONFIG) += mmconfig.o direct.o
20253 obj-$(CONFIG_PCI_DIRECT) += direct.o
20254
20255+# pcifront should be after pcbios.o, mmconfig.o, and direct.o as it should only
20256+# take over if direct access to the PCI bus is unavailable
20257+obj-$(CONFIG_XEN_PCIDEV_FRONTEND) += pcifront.o
20258+
20259 pci-y := fixup.o
20260 pci-$(CONFIG_ACPI) += acpi.o
20261 pci-y += legacy.o irq.o
20262@@ -12,3 +16,8 @@
20263 pci-$(CONFIG_X86_NUMAQ) := numa.o irq.o
20264
20265 obj-y += $(pci-y) common.o
20266+
20267+ifdef CONFIG_XEN
20268+include $(srctree)/scripts/Makefile.xen
20269+obj-y := $(call cherrypickxen, $(obj-y))
20270+endif
20271diff -Nur linux-2.6.16.33-noxen/arch/i386/pci/irq-xen.c linux-2.6.16.33/arch/i386/pci/irq-xen.c
20272--- linux-2.6.16.33-noxen/arch/i386/pci/irq-xen.c 1970-01-01 00:00:00.000000000 +0000
20273+++ linux-2.6.16.33/arch/i386/pci/irq-xen.c 2007-01-08 15:00:45.000000000 +0000
20274@@ -0,0 +1,1204 @@
20275+/*
20276+ * Low-Level PCI Support for PC -- Routing of Interrupts
20277+ *
20278+ * (c) 1999--2000 Martin Mares <mj@ucw.cz>
20279+ */
20280+
20281+#include <linux/config.h>
20282+#include <linux/types.h>
20283+#include <linux/kernel.h>
20284+#include <linux/pci.h>
20285+#include <linux/init.h>
20286+#include <linux/slab.h>
20287+#include <linux/interrupt.h>
20288+#include <linux/dmi.h>
20289+#include <asm/io.h>
20290+#include <asm/smp.h>
20291+#include <asm/io_apic.h>
20292+#include <linux/irq.h>
20293+#include <linux/acpi.h>
20294+
20295+#include "pci.h"
20296+
20297+#define PIRQ_SIGNATURE (('$' << 0) + ('P' << 8) + ('I' << 16) + ('R' << 24))
20298+#define PIRQ_VERSION 0x0100
20299+
20300+static int broken_hp_bios_irq9;
20301+static int acer_tm360_irqrouting;
20302+
20303+static struct irq_routing_table *pirq_table;
20304+
20305+static int pirq_enable_irq(struct pci_dev *dev);
20306+
20307+/*
20308+ * Never use: 0, 1, 2 (timer, keyboard, and cascade)
20309+ * Avoid using: 13, 14 and 15 (FP error and IDE).
20310+ * Penalize: 3, 4, 6, 7, 12 (known ISA uses: serial, floppy, parallel and mouse)
20311+ */
20312+unsigned int pcibios_irq_mask = 0xfff8;
20313+
20314+static int pirq_penalty[16] = {
20315+ 1000000, 1000000, 1000000, 1000, 1000, 0, 1000, 1000,
20316+ 0, 0, 0, 0, 1000, 100000, 100000, 100000
20317+};
20318+
20319+struct irq_router {
20320+ char *name;
20321+ u16 vendor, device;
20322+ int (*get)(struct pci_dev *router, struct pci_dev *dev, int pirq);
20323+ int (*set)(struct pci_dev *router, struct pci_dev *dev, int pirq, int new);
20324+};
20325+
20326+struct irq_router_handler {
20327+ u16 vendor;
20328+ int (*probe)(struct irq_router *r, struct pci_dev *router, u16 device);
20329+};
20330+
20331+int (*pcibios_enable_irq)(struct pci_dev *dev) = NULL;
20332+void (*pcibios_disable_irq)(struct pci_dev *dev) = NULL;
20333+
20334+/*
20335+ * Check passed address for the PCI IRQ Routing Table signature
20336+ * and perform checksum verification.
20337+ */
20338+
20339+static inline struct irq_routing_table * pirq_check_routing_table(u8 *addr)
20340+{
20341+ struct irq_routing_table *rt;
20342+ int i;
20343+ u8 sum;
20344+
20345+ rt = (struct irq_routing_table *) addr;
20346+ if (rt->signature != PIRQ_SIGNATURE ||
20347+ rt->version != PIRQ_VERSION ||
20348+ rt->size % 16 ||
20349+ rt->size < sizeof(struct irq_routing_table))
20350+ return NULL;
20351+ sum = 0;
20352+ for (i=0; i < rt->size; i++)
20353+ sum += addr[i];
20354+ if (!sum) {
20355+ DBG(KERN_DEBUG "PCI: Interrupt Routing Table found at 0x%p\n", rt);
20356+ return rt;
20357+ }
20358+ return NULL;
20359+}
20360+
20361+
20362+
20363+/*
20364+ * Search 0xf0000 -- 0xfffff for the PCI IRQ Routing Table.
20365+ */
20366+
20367+static struct irq_routing_table * __init pirq_find_routing_table(void)
20368+{
20369+ u8 *addr;
20370+ struct irq_routing_table *rt;
20371+
20372+#ifdef CONFIG_XEN
20373+ if (!is_initial_xendomain())
20374+ return NULL;
20375+#endif
20376+ if (pirq_table_addr) {
20377+ rt = pirq_check_routing_table((u8 *) isa_bus_to_virt(pirq_table_addr));
20378+ if (rt)
20379+ return rt;
20380+ printk(KERN_WARNING "PCI: PIRQ table NOT found at pirqaddr\n");
20381+ }
20382+ for(addr = (u8 *) isa_bus_to_virt(0xf0000); addr < (u8 *) isa_bus_to_virt(0x100000); addr += 16) {
20383+ rt = pirq_check_routing_table(addr);
20384+ if (rt)
20385+ return rt;
20386+ }
20387+
20388+ return NULL;
20389+}
20390+
20391+/*
20392+ * If we have an IRQ routing table, use it to search for peer host
20393+ * bridges. It's a gross hack, but since there is no other known
20394+ * way to get a list of buses, we have to go this way.
20395+ */
20396+
20397+static void __init pirq_peer_trick(void)
20398+{
20399+ struct irq_routing_table *rt = pirq_table;
20400+ u8 busmap[256];
20401+ int i;
20402+ struct irq_info *e;
20403+
20404+ memset(busmap, 0, sizeof(busmap));
20405+ for(i=0; i < (rt->size - sizeof(struct irq_routing_table)) / sizeof(struct irq_info); i++) {
20406+ e = &rt->slots[i];
20407+#ifdef DEBUG
20408+ {
20409+ int j;
20410+ DBG(KERN_DEBUG "%02x:%02x slot=%02x", e->bus, e->devfn/8, e->slot);
20411+ for(j=0; j<4; j++)
20412+ DBG(" %d:%02x/%04x", j, e->irq[j].link, e->irq[j].bitmap);
20413+ DBG("\n");
20414+ }
20415+#endif
20416+ busmap[e->bus] = 1;
20417+ }
20418+ for(i = 1; i < 256; i++) {
20419+ if (!busmap[i] || pci_find_bus(0, i))
20420+ continue;
20421+ if (pci_scan_bus(i, &pci_root_ops, NULL))
20422+ printk(KERN_INFO "PCI: Discovered primary peer bus %02x [IRQ]\n", i);
20423+ }
20424+ pcibios_last_bus = -1;
20425+}
20426+
20427+/*
20428+ * Code for querying and setting of IRQ routes on various interrupt routers.
20429+ */
20430+
20431+void eisa_set_level_irq(unsigned int irq)
20432+{
20433+ unsigned char mask = 1 << (irq & 7);
20434+ unsigned int port = 0x4d0 + (irq >> 3);
20435+ unsigned char val;
20436+ static u16 eisa_irq_mask;
20437+
20438+ if (irq >= 16 || (1 << irq) & eisa_irq_mask)
20439+ return;
20440+
20441+ eisa_irq_mask |= (1 << irq);
20442+ printk(KERN_DEBUG "PCI: setting IRQ %u as level-triggered\n", irq);
20443+ val = inb(port);
20444+ if (!(val & mask)) {
20445+ DBG(KERN_DEBUG " -> edge");
20446+ outb(val | mask, port);
20447+ }
20448+}
20449+
20450+/*
20451+ * Common IRQ routing practice: nybbles in config space,
20452+ * offset by some magic constant.
20453+ */
20454+static unsigned int read_config_nybble(struct pci_dev *router, unsigned offset, unsigned nr)
20455+{
20456+ u8 x;
20457+ unsigned reg = offset + (nr >> 1);
20458+
20459+ pci_read_config_byte(router, reg, &x);
20460+ return (nr & 1) ? (x >> 4) : (x & 0xf);
20461+}
20462+
20463+static void write_config_nybble(struct pci_dev *router, unsigned offset, unsigned nr, unsigned int val)
20464+{
20465+ u8 x;
20466+ unsigned reg = offset + (nr >> 1);
20467+
20468+ pci_read_config_byte(router, reg, &x);
20469+ x = (nr & 1) ? ((x & 0x0f) | (val << 4)) : ((x & 0xf0) | val);
20470+ pci_write_config_byte(router, reg, x);
20471+}
20472+
20473+/*
20474+ * ALI pirq entries are damn ugly, and completely undocumented.
20475+ * This has been figured out from pirq tables, and it's not a pretty
20476+ * picture.
20477+ */
20478+static int pirq_ali_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
20479+{
20480+ static unsigned char irqmap[16] = { 0, 9, 3, 10, 4, 5, 7, 6, 1, 11, 0, 12, 0, 14, 0, 15 };
20481+
20482+ return irqmap[read_config_nybble(router, 0x48, pirq-1)];
20483+}
20484+
20485+static int pirq_ali_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
20486+{
20487+ static unsigned char irqmap[16] = { 0, 8, 0, 2, 4, 5, 7, 6, 0, 1, 3, 9, 11, 0, 13, 15 };
20488+ unsigned int val = irqmap[irq];
20489+
20490+ if (val) {
20491+ write_config_nybble(router, 0x48, pirq-1, val);
20492+ return 1;
20493+ }
20494+ return 0;
20495+}
20496+
20497+/*
20498+ * The Intel PIIX4 pirq rules are fairly simple: "pirq" is
20499+ * just a pointer to the config space.
20500+ */
20501+static int pirq_piix_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
20502+{
20503+ u8 x;
20504+
20505+ pci_read_config_byte(router, pirq, &x);
20506+ return (x < 16) ? x : 0;
20507+}
20508+
20509+static int pirq_piix_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
20510+{
20511+ pci_write_config_byte(router, pirq, irq);
20512+ return 1;
20513+}
20514+
20515+/*
20516+ * The VIA pirq rules are nibble-based, like ALI,
20517+ * but without the ugly irq number munging.
20518+ * However, PIRQD is in the upper instead of lower 4 bits.
20519+ */
20520+static int pirq_via_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
20521+{
20522+ return read_config_nybble(router, 0x55, pirq == 4 ? 5 : pirq);
20523+}
20524+
20525+static int pirq_via_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
20526+{
20527+ write_config_nybble(router, 0x55, pirq == 4 ? 5 : pirq, irq);
20528+ return 1;
20529+}
20530+
20531+/*
20532+ * The VIA pirq rules are nibble-based, like ALI,
20533+ * but without the ugly irq number munging.
20534+ * However, for the 82C586, the nibble map is different.
20535+ */
20536+static int pirq_via586_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
20537+{
20538+ static unsigned int pirqmap[4] = { 3, 2, 5, 1 };
20539+ return read_config_nybble(router, 0x55, pirqmap[pirq-1]);
20540+}
20541+
20542+static int pirq_via586_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
20543+{
20544+ static unsigned int pirqmap[4] = { 3, 2, 5, 1 };
20545+ write_config_nybble(router, 0x55, pirqmap[pirq-1], irq);
20546+ return 1;
20547+}
20548+
20549+/*
20550+ * ITE 8330G pirq rules are nibble-based
20551+ * FIXME: pirqmap may be { 1, 0, 3, 2 },
20552+ * 2+3 are both mapped to irq 9 on my system
20553+ */
20554+static int pirq_ite_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
20555+{
20556+ static unsigned char pirqmap[4] = { 1, 0, 2, 3 };
20557+ return read_config_nybble(router,0x43, pirqmap[pirq-1]);
20558+}
20559+
20560+static int pirq_ite_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
20561+{
20562+ static unsigned char pirqmap[4] = { 1, 0, 2, 3 };
20563+ write_config_nybble(router, 0x43, pirqmap[pirq-1], irq);
20564+ return 1;
20565+}
20566+
20567+/*
20568+ * OPTI: high four bits are nibble pointer..
20569+ * I wonder what the low bits do?
20570+ */
20571+static int pirq_opti_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
20572+{
20573+ return read_config_nybble(router, 0xb8, pirq >> 4);
20574+}
20575+
20576+static int pirq_opti_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
20577+{
20578+ write_config_nybble(router, 0xb8, pirq >> 4, irq);
20579+ return 1;
20580+}
20581+
20582+/*
20583+ * Cyrix: nibble offset 0x5C
20584+ * 0x5C bits 7:4 is INTB bits 3:0 is INTA
20585+ * 0x5D bits 7:4 is INTD bits 3:0 is INTC
20586+ */
20587+static int pirq_cyrix_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
20588+{
20589+ return read_config_nybble(router, 0x5C, (pirq-1)^1);
20590+}
20591+
20592+static int pirq_cyrix_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
20593+{
20594+ write_config_nybble(router, 0x5C, (pirq-1)^1, irq);
20595+ return 1;
20596+}
20597+
20598+/*
20599+ * PIRQ routing for SiS 85C503 router used in several SiS chipsets.
20600+ * We have to deal with the following issues here:
20601+ * - vendors have different ideas about the meaning of link values
20602+ * - some onboard devices (integrated in the chipset) have special
20603+ * links and are thus routed differently (i.e. not via PCI INTA-INTD)
20604+ * - different revision of the router have a different layout for
20605+ * the routing registers, particularly for the onchip devices
20606+ *
20607+ * For all routing registers the common thing is we have one byte
20608+ * per routeable link which is defined as:
20609+ * bit 7 IRQ mapping enabled (0) or disabled (1)
20610+ * bits [6:4] reserved (sometimes used for onchip devices)
20611+ * bits [3:0] IRQ to map to
20612+ * allowed: 3-7, 9-12, 14-15
20613+ * reserved: 0, 1, 2, 8, 13
20614+ *
20615+ * The config-space registers located at 0x41/0x42/0x43/0x44 are
20616+ * always used to route the normal PCI INT A/B/C/D respectively.
20617+ * Apparently there are systems implementing PCI routing table using
20618+ * link values 0x01-0x04 and others using 0x41-0x44 for PCI INTA..D.
20619+ * We try our best to handle both link mappings.
20620+ *
20621+ * Currently (2003-05-21) it appears most SiS chipsets follow the
20622+ * definition of routing registers from the SiS-5595 southbridge.
20623+ * According to the SiS 5595 datasheets the revision id's of the
20624+ * router (ISA-bridge) should be 0x01 or 0xb0.
20625+ *
20626+ * Furthermore we've also seen lspci dumps with revision 0x00 and 0xb1.
20627+ * Looks like these are used in a number of SiS 5xx/6xx/7xx chipsets.
20628+ * They seem to work with the current routing code. However there is
20629+ * some concern because of the two USB-OHCI HCs (original SiS 5595
20630+ * had only one). YMMV.
20631+ *
20632+ * Onchip routing for router rev-id 0x01/0xb0 and probably 0x00/0xb1:
20633+ *
20634+ * 0x61: IDEIRQ:
20635+ * bits [6:5] must be written 01
20636+ * bit 4 channel-select primary (0), secondary (1)
20637+ *
20638+ * 0x62: USBIRQ:
20639+ * bit 6 OHCI function disabled (0), enabled (1)
20640+ *
20641+ * 0x6a: ACPI/SCI IRQ: bits 4-6 reserved
20642+ *
20643+ * 0x7e: Data Acq. Module IRQ - bits 4-6 reserved
20644+ *
20645+ * We support USBIRQ (in addition to INTA-INTD) and keep the
20646+ * IDE, ACPI and DAQ routing untouched as set by the BIOS.
20647+ *
20648+ * Currently the only reported exception is the new SiS 65x chipset
20649+ * which includes the SiS 69x southbridge. Here we have the 85C503
20650+ * router revision 0x04 and there are changes in the register layout
20651+ * mostly related to the different USB HCs with USB 2.0 support.
20652+ *
20653+ * Onchip routing for router rev-id 0x04 (trial-and-error observation)
20654+ *
20655+ * 0x60/0x61/0x62/0x63: 1xEHCI and 3xOHCI (companion) USB-HCs
20656+ * bit 6-4 are probably unused, not like 5595
20657+ */
20658+
20659+#define PIRQ_SIS_IRQ_MASK 0x0f
20660+#define PIRQ_SIS_IRQ_DISABLE 0x80
20661+#define PIRQ_SIS_USB_ENABLE 0x40
20662+
20663+static int pirq_sis_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
20664+{
20665+ u8 x;
20666+ int reg;
20667+
20668+ reg = pirq;
20669+ if (reg >= 0x01 && reg <= 0x04)
20670+ reg += 0x40;
20671+ pci_read_config_byte(router, reg, &x);
20672+ return (x & PIRQ_SIS_IRQ_DISABLE) ? 0 : (x & PIRQ_SIS_IRQ_MASK);
20673+}
20674+
20675+static int pirq_sis_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
20676+{
20677+ u8 x;
20678+ int reg;
20679+
20680+ reg = pirq;
20681+ if (reg >= 0x01 && reg <= 0x04)
20682+ reg += 0x40;
20683+ pci_read_config_byte(router, reg, &x);
20684+ x &= ~(PIRQ_SIS_IRQ_MASK | PIRQ_SIS_IRQ_DISABLE);
20685+ x |= irq ? irq: PIRQ_SIS_IRQ_DISABLE;
20686+ pci_write_config_byte(router, reg, x);
20687+ return 1;
20688+}
20689+
20690+
20691+/*
20692+ * VLSI: nibble offset 0x74 - educated guess due to routing table and
20693+ * config space of VLSI 82C534 PCI-bridge/router (1004:0102)
20694+ * Tested on HP OmniBook 800 covering PIRQ 1, 2, 4, 8 for onboard
20695+ * devices, PIRQ 3 for non-pci(!) soundchip and (untested) PIRQ 6
20696+ * for the busbridge to the docking station.
20697+ */
20698+
20699+static int pirq_vlsi_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
20700+{
20701+ if (pirq > 8) {
20702+ printk(KERN_INFO "VLSI router pirq escape (%d)\n", pirq);
20703+ return 0;
20704+ }
20705+ return read_config_nybble(router, 0x74, pirq-1);
20706+}
20707+
20708+static int pirq_vlsi_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
20709+{
20710+ if (pirq > 8) {
20711+ printk(KERN_INFO "VLSI router pirq escape (%d)\n", pirq);
20712+ return 0;
20713+ }
20714+ write_config_nybble(router, 0x74, pirq-1, irq);
20715+ return 1;
20716+}
20717+
20718+/*
20719+ * ServerWorks: PCI interrupts mapped to system IRQ lines through Index
20720+ * and Redirect I/O registers (0x0c00 and 0x0c01). The Index register
20721+ * format is (PCIIRQ## | 0x10), e.g.: PCIIRQ10=0x1a. The Redirect
20722+ * register is a straight binary coding of desired PIC IRQ (low nibble).
20723+ *
20724+ * The 'link' value in the PIRQ table is already in the correct format
20725+ * for the Index register. There are some special index values:
20726+ * 0x00 for ACPI (SCI), 0x01 for USB, 0x02 for IDE0, 0x04 for IDE1,
20727+ * and 0x03 for SMBus.
20728+ */
20729+static int pirq_serverworks_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
20730+{
20731+ outb_p(pirq, 0xc00);
20732+ return inb(0xc01) & 0xf;
20733+}
20734+
20735+static int pirq_serverworks_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
20736+{
20737+ outb_p(pirq, 0xc00);
20738+ outb_p(irq, 0xc01);
20739+ return 1;
20740+}
20741+
20742+/* Support for AMD756 PCI IRQ Routing
20743+ * Jhon H. Caicedo <jhcaiced@osso.org.co>
20744+ * Jun/21/2001 0.2.0 Release, fixed to use "nybble" functions... (jhcaiced)
20745+ * Jun/19/2001 Alpha Release 0.1.0 (jhcaiced)
20746+ * The AMD756 pirq rules are nibble-based
20747+ * offset 0x56 0-3 PIRQA 4-7 PIRQB
20748+ * offset 0x57 0-3 PIRQC 4-7 PIRQD
20749+ */
20750+static int pirq_amd756_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
20751+{
20752+ u8 irq;
20753+ irq = 0;
20754+ if (pirq <= 4)
20755+ {
20756+ irq = read_config_nybble(router, 0x56, pirq - 1);
20757+ }
20758+ printk(KERN_INFO "AMD756: dev %04x:%04x, router pirq : %d get irq : %2d\n",
20759+ dev->vendor, dev->device, pirq, irq);
20760+ return irq;
20761+}
20762+
20763+static int pirq_amd756_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
20764+{
20765+ printk(KERN_INFO "AMD756: dev %04x:%04x, router pirq : %d SET irq : %2d\n",
20766+ dev->vendor, dev->device, pirq, irq);
20767+ if (pirq <= 4)
20768+ {
20769+ write_config_nybble(router, 0x56, pirq - 1, irq);
20770+ }
20771+ return 1;
20772+}
20773+
20774+#ifdef CONFIG_PCI_BIOS
20775+
20776+static int pirq_bios_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
20777+{
20778+ struct pci_dev *bridge;
20779+ int pin = pci_get_interrupt_pin(dev, &bridge);
20780+ return pcibios_set_irq_routing(bridge, pin, irq);
20781+}
20782+
20783+#endif
20784+
20785+static __init int intel_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
20786+{
20787+ static struct pci_device_id pirq_440gx[] = {
20788+ { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82443GX_0) },
20789+ { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82443GX_2) },
20790+ { },
20791+ };
20792+
20793+ /* 440GX has a proprietary PIRQ router -- don't use it */
20794+ if (pci_dev_present(pirq_440gx))
20795+ return 0;
20796+
20797+ switch(device)
20798+ {
20799+ case PCI_DEVICE_ID_INTEL_82371FB_0:
20800+ case PCI_DEVICE_ID_INTEL_82371SB_0:
20801+ case PCI_DEVICE_ID_INTEL_82371AB_0:
20802+ case PCI_DEVICE_ID_INTEL_82371MX:
20803+ case PCI_DEVICE_ID_INTEL_82443MX_0:
20804+ case PCI_DEVICE_ID_INTEL_82801AA_0:
20805+ case PCI_DEVICE_ID_INTEL_82801AB_0:
20806+ case PCI_DEVICE_ID_INTEL_82801BA_0:
20807+ case PCI_DEVICE_ID_INTEL_82801BA_10:
20808+ case PCI_DEVICE_ID_INTEL_82801CA_0:
20809+ case PCI_DEVICE_ID_INTEL_82801CA_12:
20810+ case PCI_DEVICE_ID_INTEL_82801DB_0:
20811+ case PCI_DEVICE_ID_INTEL_82801E_0:
20812+ case PCI_DEVICE_ID_INTEL_82801EB_0:
20813+ case PCI_DEVICE_ID_INTEL_ESB_1:
20814+ case PCI_DEVICE_ID_INTEL_ICH6_0:
20815+ case PCI_DEVICE_ID_INTEL_ICH6_1:
20816+ case PCI_DEVICE_ID_INTEL_ICH7_0:
20817+ case PCI_DEVICE_ID_INTEL_ICH7_1:
20818+ case PCI_DEVICE_ID_INTEL_ICH7_30:
20819+ case PCI_DEVICE_ID_INTEL_ICH7_31:
20820+ case PCI_DEVICE_ID_INTEL_ESB2_0:
20821+ case PCI_DEVICE_ID_INTEL_ICH8_0:
20822+ case PCI_DEVICE_ID_INTEL_ICH8_1:
20823+ case PCI_DEVICE_ID_INTEL_ICH8_2:
20824+ case PCI_DEVICE_ID_INTEL_ICH8_3:
20825+ case PCI_DEVICE_ID_INTEL_ICH8_4:
20826+ r->name = "PIIX/ICH";
20827+ r->get = pirq_piix_get;
20828+ r->set = pirq_piix_set;
20829+ return 1;
20830+ }
20831+ return 0;
20832+}
20833+
20834+static __init int via_router_probe(struct irq_router *r,
20835+ struct pci_dev *router, u16 device)
20836+{
20837+ /* FIXME: We should move some of the quirk fixup stuff here */
20838+
20839+ /*
20840+ * work arounds for some buggy BIOSes
20841+ */
20842+ if (device == PCI_DEVICE_ID_VIA_82C586_0) {
20843+ switch(router->device) {
20844+ case PCI_DEVICE_ID_VIA_82C686:
20845+ /*
20846+ * Asus k7m bios wrongly reports 82C686A
20847+ * as 586-compatible
20848+ */
20849+ device = PCI_DEVICE_ID_VIA_82C686;
20850+ break;
20851+ case PCI_DEVICE_ID_VIA_8235:
20852+ /**
20853+ * Asus a7v-x bios wrongly reports 8235
20854+ * as 586-compatible
20855+ */
20856+ device = PCI_DEVICE_ID_VIA_8235;
20857+ break;
20858+ }
20859+ }
20860+
20861+ switch(device) {
20862+ case PCI_DEVICE_ID_VIA_82C586_0:
20863+ r->name = "VIA";
20864+ r->get = pirq_via586_get;
20865+ r->set = pirq_via586_set;
20866+ return 1;
20867+ case PCI_DEVICE_ID_VIA_82C596:
20868+ case PCI_DEVICE_ID_VIA_82C686:
20869+ case PCI_DEVICE_ID_VIA_8231:
20870+ case PCI_DEVICE_ID_VIA_8235:
20871+ /* FIXME: add new ones for 8233/5 */
20872+ r->name = "VIA";
20873+ r->get = pirq_via_get;
20874+ r->set = pirq_via_set;
20875+ return 1;
20876+ }
20877+ return 0;
20878+}
20879+
20880+static __init int vlsi_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
20881+{
20882+ switch(device)
20883+ {
20884+ case PCI_DEVICE_ID_VLSI_82C534:
20885+ r->name = "VLSI 82C534";
20886+ r->get = pirq_vlsi_get;
20887+ r->set = pirq_vlsi_set;
20888+ return 1;
20889+ }
20890+ return 0;
20891+}
20892+
20893+
20894+static __init int serverworks_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
20895+{
20896+ switch(device)
20897+ {
20898+ case PCI_DEVICE_ID_SERVERWORKS_OSB4:
20899+ case PCI_DEVICE_ID_SERVERWORKS_CSB5:
20900+ r->name = "ServerWorks";
20901+ r->get = pirq_serverworks_get;
20902+ r->set = pirq_serverworks_set;
20903+ return 1;
20904+ }
20905+ return 0;
20906+}
20907+
20908+static __init int sis_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
20909+{
20910+ if (device != PCI_DEVICE_ID_SI_503)
20911+ return 0;
20912+
20913+ r->name = "SIS";
20914+ r->get = pirq_sis_get;
20915+ r->set = pirq_sis_set;
20916+ return 1;
20917+}
20918+
20919+static __init int cyrix_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
20920+{
20921+ switch(device)
20922+ {
20923+ case PCI_DEVICE_ID_CYRIX_5520:
20924+ r->name = "NatSemi";
20925+ r->get = pirq_cyrix_get;
20926+ r->set = pirq_cyrix_set;
20927+ return 1;
20928+ }
20929+ return 0;
20930+}
20931+
20932+static __init int opti_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
20933+{
20934+ switch(device)
20935+ {
20936+ case PCI_DEVICE_ID_OPTI_82C700:
20937+ r->name = "OPTI";
20938+ r->get = pirq_opti_get;
20939+ r->set = pirq_opti_set;
20940+ return 1;
20941+ }
20942+ return 0;
20943+}
20944+
20945+static __init int ite_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
20946+{
20947+ switch(device)
20948+ {
20949+ case PCI_DEVICE_ID_ITE_IT8330G_0:
20950+ r->name = "ITE";
20951+ r->get = pirq_ite_get;
20952+ r->set = pirq_ite_set;
20953+ return 1;
20954+ }
20955+ return 0;
20956+}
20957+
20958+static __init int ali_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
20959+{
20960+ switch(device)
20961+ {
20962+ case PCI_DEVICE_ID_AL_M1533:
20963+ case PCI_DEVICE_ID_AL_M1563:
20964+ printk(KERN_DEBUG "PCI: Using ALI IRQ Router\n");
20965+ r->name = "ALI";
20966+ r->get = pirq_ali_get;
20967+ r->set = pirq_ali_set;
20968+ return 1;
20969+ }
20970+ return 0;
20971+}
20972+
20973+static __init int amd_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
20974+{
20975+ switch(device)
20976+ {
20977+ case PCI_DEVICE_ID_AMD_VIPER_740B:
20978+ r->name = "AMD756";
20979+ break;
20980+ case PCI_DEVICE_ID_AMD_VIPER_7413:
20981+ r->name = "AMD766";
20982+ break;
20983+ case PCI_DEVICE_ID_AMD_VIPER_7443:
20984+ r->name = "AMD768";
20985+ break;
20986+ default:
20987+ return 0;
20988+ }
20989+ r->get = pirq_amd756_get;
20990+ r->set = pirq_amd756_set;
20991+ return 1;
20992+}
20993+
20994+static __initdata struct irq_router_handler pirq_routers[] = {
20995+ { PCI_VENDOR_ID_INTEL, intel_router_probe },
20996+ { PCI_VENDOR_ID_AL, ali_router_probe },
20997+ { PCI_VENDOR_ID_ITE, ite_router_probe },
20998+ { PCI_VENDOR_ID_VIA, via_router_probe },
20999+ { PCI_VENDOR_ID_OPTI, opti_router_probe },
21000+ { PCI_VENDOR_ID_SI, sis_router_probe },
21001+ { PCI_VENDOR_ID_CYRIX, cyrix_router_probe },
21002+ { PCI_VENDOR_ID_VLSI, vlsi_router_probe },
21003+ { PCI_VENDOR_ID_SERVERWORKS, serverworks_router_probe },
21004+ { PCI_VENDOR_ID_AMD, amd_router_probe },
21005+ /* Someone with docs needs to add the ATI Radeon IGP */
21006+ { 0, NULL }
21007+};
21008+static struct irq_router pirq_router;
21009+static struct pci_dev *pirq_router_dev;
21010+
21011+
21012+/*
21013+ * FIXME: should we have an option to say "generic for
21014+ * chipset" ?
21015+ */
21016+
21017+static void __init pirq_find_router(struct irq_router *r)
21018+{
21019+ struct irq_routing_table *rt = pirq_table;
21020+ struct irq_router_handler *h;
21021+
21022+#ifdef CONFIG_PCI_BIOS
21023+ if (!rt->signature) {
21024+ printk(KERN_INFO "PCI: Using BIOS for IRQ routing\n");
21025+ r->set = pirq_bios_set;
21026+ r->name = "BIOS";
21027+ return;
21028+ }
21029+#endif
21030+
21031+ /* Default unless a driver reloads it */
21032+ r->name = "default";
21033+ r->get = NULL;
21034+ r->set = NULL;
21035+
21036+ DBG(KERN_DEBUG "PCI: Attempting to find IRQ router for %04x:%04x\n",
21037+ rt->rtr_vendor, rt->rtr_device);
21038+
21039+ pirq_router_dev = pci_find_slot(rt->rtr_bus, rt->rtr_devfn);
21040+ if (!pirq_router_dev) {
21041+ DBG(KERN_DEBUG "PCI: Interrupt router not found at "
21042+ "%02x:%02x\n", rt->rtr_bus, rt->rtr_devfn);
21043+ return;
21044+ }
21045+
21046+ for( h = pirq_routers; h->vendor; h++) {
21047+ /* First look for a router match */
21048+ if (rt->rtr_vendor == h->vendor && h->probe(r, pirq_router_dev, rt->rtr_device))
21049+ break;
21050+ /* Fall back to a device match */
21051+ if (pirq_router_dev->vendor == h->vendor && h->probe(r, pirq_router_dev, pirq_router_dev->device))
21052+ break;
21053+ }
21054+ printk(KERN_INFO "PCI: Using IRQ router %s [%04x/%04x] at %s\n",
21055+ pirq_router.name,
21056+ pirq_router_dev->vendor,
21057+ pirq_router_dev->device,
21058+ pci_name(pirq_router_dev));
21059+}
21060+
21061+static struct irq_info *pirq_get_info(struct pci_dev *dev)
21062+{
21063+ struct irq_routing_table *rt = pirq_table;
21064+ int entries = (rt->size - sizeof(struct irq_routing_table)) / sizeof(struct irq_info);
21065+ struct irq_info *info;
21066+
21067+ for (info = rt->slots; entries--; info++)
21068+ if (info->bus == dev->bus->number && PCI_SLOT(info->devfn) == PCI_SLOT(dev->devfn))
21069+ return info;
21070+ return NULL;
21071+}
21072+
21073+static int pcibios_lookup_irq(struct pci_dev *dev, int assign)
21074+{
21075+ u8 pin;
21076+ struct irq_info *info;
21077+ int i, pirq, newirq;
21078+ int irq = 0;
21079+ u32 mask;
21080+ struct irq_router *r = &pirq_router;
21081+ struct pci_dev *dev2 = NULL;
21082+ char *msg = NULL;
21083+
21084+ /* Find IRQ pin */
21085+ pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
21086+ if (!pin) {
21087+ DBG(KERN_DEBUG " -> no interrupt pin\n");
21088+ return 0;
21089+ }
21090+ pin = pin - 1;
21091+
21092+ /* Find IRQ routing entry */
21093+
21094+ if (!pirq_table)
21095+ return 0;
21096+
21097+ DBG(KERN_DEBUG "IRQ for %s[%c]", pci_name(dev), 'A' + pin);
21098+ info = pirq_get_info(dev);
21099+ if (!info) {
21100+ DBG(" -> not found in routing table\n" KERN_DEBUG);
21101+ return 0;
21102+ }
21103+ pirq = info->irq[pin].link;
21104+ mask = info->irq[pin].bitmap;
21105+ if (!pirq) {
21106+ DBG(" -> not routed\n" KERN_DEBUG);
21107+ return 0;
21108+ }
21109+ DBG(" -> PIRQ %02x, mask %04x, excl %04x", pirq, mask, pirq_table->exclusive_irqs);
21110+ mask &= pcibios_irq_mask;
21111+
21112+ /* Work around broken HP Pavilion Notebooks which assign USB to
21113+ IRQ 9 even though it is actually wired to IRQ 11 */
21114+
21115+ if (broken_hp_bios_irq9 && pirq == 0x59 && dev->irq == 9) {
21116+ dev->irq = 11;
21117+ pci_write_config_byte(dev, PCI_INTERRUPT_LINE, 11);
21118+ r->set(pirq_router_dev, dev, pirq, 11);
21119+ }
21120+
21121+ /* same for Acer Travelmate 360, but with CB and irq 11 -> 10 */
21122+ if (acer_tm360_irqrouting && dev->irq == 11 && dev->vendor == PCI_VENDOR_ID_O2) {
21123+ pirq = 0x68;
21124+ mask = 0x400;
21125+ dev->irq = r->get(pirq_router_dev, dev, pirq);
21126+ pci_write_config_byte(dev, PCI_INTERRUPT_LINE, dev->irq);
21127+ }
21128+
21129+ /*
21130+ * Find the best IRQ to assign: use the one
21131+ * reported by the device if possible.
21132+ */
21133+ newirq = dev->irq;
21134+ if (newirq && !((1 << newirq) & mask)) {
21135+ if ( pci_probe & PCI_USE_PIRQ_MASK) newirq = 0;
21136+ else printk("\n" KERN_WARNING
21137+ "PCI: IRQ %i for device %s doesn't match PIRQ mask "
21138+ "- try pci=usepirqmask\n" KERN_DEBUG, newirq,
21139+ pci_name(dev));
21140+ }
21141+ if (!newirq && assign) {
21142+ for (i = 0; i < 16; i++) {
21143+ if (!(mask & (1 << i)))
21144+ continue;
21145+ if (pirq_penalty[i] < pirq_penalty[newirq] && can_request_irq(i, SA_SHIRQ))
21146+ newirq = i;
21147+ }
21148+ }
21149+ DBG(" -> newirq=%d", newirq);
21150+
21151+ /* Check if it is hardcoded */
21152+ if ((pirq & 0xf0) == 0xf0) {
21153+ irq = pirq & 0xf;
21154+ DBG(" -> hardcoded IRQ %d\n", irq);
21155+ msg = "Hardcoded";
21156+ } else if ( r->get && (irq = r->get(pirq_router_dev, dev, pirq)) && \
21157+ ((!(pci_probe & PCI_USE_PIRQ_MASK)) || ((1 << irq) & mask)) ) {
21158+ DBG(" -> got IRQ %d\n", irq);
21159+ msg = "Found";
21160+ } else if (newirq && r->set && (dev->class >> 8) != PCI_CLASS_DISPLAY_VGA) {
21161+ DBG(" -> assigning IRQ %d", newirq);
21162+ if (r->set(pirq_router_dev, dev, pirq, newirq)) {
21163+ eisa_set_level_irq(newirq);
21164+ DBG(" ... OK\n");
21165+ msg = "Assigned";
21166+ irq = newirq;
21167+ }
21168+ }
21169+
21170+ if (!irq) {
21171+ DBG(" ... failed\n");
21172+ if (newirq && mask == (1 << newirq)) {
21173+ msg = "Guessed";
21174+ irq = newirq;
21175+ } else
21176+ return 0;
21177+ }
21178+ printk(KERN_INFO "PCI: %s IRQ %d for device %s\n", msg, irq, pci_name(dev));
21179+
21180+ /* Update IRQ for all devices with the same pirq value */
21181+ while ((dev2 = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev2)) != NULL) {
21182+ pci_read_config_byte(dev2, PCI_INTERRUPT_PIN, &pin);
21183+ if (!pin)
21184+ continue;
21185+ pin--;
21186+ info = pirq_get_info(dev2);
21187+ if (!info)
21188+ continue;
21189+ if (info->irq[pin].link == pirq) {
21190+ /* We refuse to override the dev->irq information. Give a warning! */
21191+ if ( dev2->irq && dev2->irq != irq && \
21192+ (!(pci_probe & PCI_USE_PIRQ_MASK) || \
21193+ ((1 << dev2->irq) & mask)) ) {
21194+#ifndef CONFIG_PCI_MSI
21195+ printk(KERN_INFO "IRQ routing conflict for %s, have irq %d, want irq %d\n",
21196+ pci_name(dev2), dev2->irq, irq);
21197+#endif
21198+ continue;
21199+ }
21200+ dev2->irq = irq;
21201+ pirq_penalty[irq]++;
21202+ if (dev != dev2)
21203+ printk(KERN_INFO "PCI: Sharing IRQ %d with %s\n", irq, pci_name(dev2));
21204+ }
21205+ }
21206+ return 1;
21207+}
21208+
21209+static void __init pcibios_fixup_irqs(void)
21210+{
21211+ struct pci_dev *dev = NULL;
21212+ u8 pin;
21213+
21214+ DBG(KERN_DEBUG "PCI: IRQ fixup\n");
21215+ while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
21216+ /*
21217+ * If the BIOS has set an out of range IRQ number, just ignore it.
21218+ * Also keep track of which IRQ's are already in use.
21219+ */
21220+ if (dev->irq >= 16) {
21221+ DBG(KERN_DEBUG "%s: ignoring bogus IRQ %d\n", pci_name(dev), dev->irq);
21222+ dev->irq = 0;
21223+ }
21224+ /* If the IRQ is already assigned to a PCI device, ignore its ISA use penalty */
21225+ if (pirq_penalty[dev->irq] >= 100 && pirq_penalty[dev->irq] < 100000)
21226+ pirq_penalty[dev->irq] = 0;
21227+ pirq_penalty[dev->irq]++;
21228+ }
21229+
21230+ dev = NULL;
21231+ while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
21232+ pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
21233+#ifdef CONFIG_X86_IO_APIC
21234+ /*
21235+ * Recalculate IRQ numbers if we use the I/O APIC.
21236+ */
21237+ if (io_apic_assign_pci_irqs)
21238+ {
21239+ int irq;
21240+
21241+ if (pin) {
21242+ pin--; /* interrupt pins are numbered starting from 1 */
21243+ irq = IO_APIC_get_PCI_irq_vector(dev->bus->number, PCI_SLOT(dev->devfn), pin);
21244+ /*
21245+ * Busses behind bridges are typically not listed in the MP-table.
21246+ * In this case we have to look up the IRQ based on the parent bus,
21247+ * parent slot, and pin number. The SMP code detects such bridged
21248+ * busses itself so we should get into this branch reliably.
21249+ */
21250+ if (irq < 0 && dev->bus->parent) { /* go back to the bridge */
21251+ struct pci_dev * bridge = dev->bus->self;
21252+
21253+ pin = (pin + PCI_SLOT(dev->devfn)) % 4;
21254+ irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number,
21255+ PCI_SLOT(bridge->devfn), pin);
21256+ if (irq >= 0)
21257+ printk(KERN_WARNING "PCI: using PPB %s[%c] to get irq %d\n",
21258+ pci_name(bridge), 'A' + pin, irq);
21259+ }
21260+ if (irq >= 0) {
21261+ if (use_pci_vector() &&
21262+ !platform_legacy_irq(irq))
21263+ irq = IO_APIC_VECTOR(irq);
21264+
21265+ printk(KERN_INFO "PCI->APIC IRQ transform: %s[%c] -> IRQ %d\n",
21266+ pci_name(dev), 'A' + pin, irq);
21267+ dev->irq = irq;
21268+ }
21269+ }
21270+ }
21271+#endif
21272+ /*
21273+ * Still no IRQ? Try to lookup one...
21274+ */
21275+ if (pin && !dev->irq)
21276+ pcibios_lookup_irq(dev, 0);
21277+ }
21278+}
21279+
21280+/*
21281+ * Work around broken HP Pavilion Notebooks which assign USB to
21282+ * IRQ 9 even though it is actually wired to IRQ 11
21283+ */
21284+static int __init fix_broken_hp_bios_irq9(struct dmi_system_id *d)
21285+{
21286+ if (!broken_hp_bios_irq9) {
21287+ broken_hp_bios_irq9 = 1;
21288+ printk(KERN_INFO "%s detected - fixing broken IRQ routing\n", d->ident);
21289+ }
21290+ return 0;
21291+}
21292+
21293+/*
21294+ * Work around broken Acer TravelMate 360 Notebooks which assign
21295+ * Cardbus to IRQ 11 even though it is actually wired to IRQ 10
21296+ */
21297+static int __init fix_acer_tm360_irqrouting(struct dmi_system_id *d)
21298+{
21299+ if (!acer_tm360_irqrouting) {
21300+ acer_tm360_irqrouting = 1;
21301+ printk(KERN_INFO "%s detected - fixing broken IRQ routing\n", d->ident);
21302+ }
21303+ return 0;
21304+}
21305+
21306+static struct dmi_system_id __initdata pciirq_dmi_table[] = {
21307+ {
21308+ .callback = fix_broken_hp_bios_irq9,
21309+ .ident = "HP Pavilion N5400 Series Laptop",
21310+ .matches = {
21311+ DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
21312+ DMI_MATCH(DMI_BIOS_VERSION, "GE.M1.03"),
21313+ DMI_MATCH(DMI_PRODUCT_VERSION, "HP Pavilion Notebook Model GE"),
21314+ DMI_MATCH(DMI_BOARD_VERSION, "OmniBook N32N-736"),
21315+ },
21316+ },
21317+ {
21318+ .callback = fix_acer_tm360_irqrouting,
21319+ .ident = "Acer TravelMate 36x Laptop",
21320+ .matches = {
21321+ DMI_MATCH(DMI_SYS_VENDOR, "Acer"),
21322+ DMI_MATCH(DMI_PRODUCT_NAME, "TravelMate 360"),
21323+ },
21324+ },
21325+ { }
21326+};
21327+
21328+static int __init pcibios_irq_init(void)
21329+{
21330+ DBG(KERN_DEBUG "PCI: IRQ init\n");
21331+
21332+ if (pcibios_enable_irq || raw_pci_ops == NULL)
21333+ return 0;
21334+
21335+ dmi_check_system(pciirq_dmi_table);
21336+
21337+ pirq_table = pirq_find_routing_table();
21338+
21339+#ifdef CONFIG_PCI_BIOS
21340+ if (!pirq_table && (pci_probe & PCI_BIOS_IRQ_SCAN))
21341+ pirq_table = pcibios_get_irq_routing_table();
21342+#endif
21343+ if (pirq_table) {
21344+ pirq_peer_trick();
21345+ pirq_find_router(&pirq_router);
21346+ if (pirq_table->exclusive_irqs) {
21347+ int i;
21348+ for (i=0; i<16; i++)
21349+ if (!(pirq_table->exclusive_irqs & (1 << i)))
21350+ pirq_penalty[i] += 100;
21351+ }
21352+ /* If we're using the I/O APIC, avoid using the PCI IRQ routing table */
21353+ if (io_apic_assign_pci_irqs)
21354+ pirq_table = NULL;
21355+ }
21356+
21357+ pcibios_enable_irq = pirq_enable_irq;
21358+
21359+ pcibios_fixup_irqs();
21360+ return 0;
21361+}
21362+
21363+subsys_initcall(pcibios_irq_init);
21364+
21365+
21366+static void pirq_penalize_isa_irq(int irq, int active)
21367+{
21368+ /*
21369+ * If any ISAPnP device reports an IRQ in its list of possible
21370+ * IRQ's, we try to avoid assigning it to PCI devices.
21371+ */
21372+ if (irq < 16) {
21373+ if (active)
21374+ pirq_penalty[irq] += 1000;
21375+ else
21376+ pirq_penalty[irq] += 100;
21377+ }
21378+}
21379+
21380+void pcibios_penalize_isa_irq(int irq, int active)
21381+{
21382+#ifdef CONFIG_ACPI
21383+ if (!acpi_noirq)
21384+ acpi_penalize_isa_irq(irq, active);
21385+ else
21386+#endif
21387+ pirq_penalize_isa_irq(irq, active);
21388+}
21389+
21390+static int pirq_enable_irq(struct pci_dev *dev)
21391+{
21392+ u8 pin;
21393+ struct pci_dev *temp_dev;
21394+
21395+ pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
21396+ if (pin && !pcibios_lookup_irq(dev, 1) && !dev->irq) {
21397+ char *msg = "";
21398+
21399+ pin--; /* interrupt pins are numbered starting from 1 */
21400+
21401+ if (io_apic_assign_pci_irqs) {
21402+ int irq;
21403+
21404+ irq = IO_APIC_get_PCI_irq_vector(dev->bus->number, PCI_SLOT(dev->devfn), pin);
21405+ /*
21406+ * Busses behind bridges are typically not listed in the MP-table.
21407+ * In this case we have to look up the IRQ based on the parent bus,
21408+ * parent slot, and pin number. The SMP code detects such bridged
21409+ * busses itself so we should get into this branch reliably.
21410+ */
21411+ temp_dev = dev;
21412+ while (irq < 0 && dev->bus->parent) { /* go back to the bridge */
21413+ struct pci_dev * bridge = dev->bus->self;
21414+
21415+ pin = (pin + PCI_SLOT(dev->devfn)) % 4;
21416+ irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number,
21417+ PCI_SLOT(bridge->devfn), pin);
21418+ if (irq >= 0)
21419+ printk(KERN_WARNING "PCI: using PPB %s[%c] to get irq %d\n",
21420+ pci_name(bridge), 'A' + pin, irq);
21421+ dev = bridge;
21422+ }
21423+ dev = temp_dev;
21424+ if (irq >= 0) {
21425+#ifdef CONFIG_PCI_MSI
21426+ if (!platform_legacy_irq(irq))
21427+ irq = IO_APIC_VECTOR(irq);
21428+#endif
21429+ printk(KERN_INFO "PCI->APIC IRQ transform: %s[%c] -> IRQ %d\n",
21430+ pci_name(dev), 'A' + pin, irq);
21431+ dev->irq = irq;
21432+ return 0;
21433+ } else
21434+ msg = " Probably buggy MP table.";
21435+ } else if (pci_probe & PCI_BIOS_IRQ_SCAN)
21436+ msg = "";
21437+ else
21438+ msg = " Please try using pci=biosirq.";
21439+
21440+ /* With IDE legacy devices the IRQ lookup failure is not a problem.. */
21441+ if (dev->class >> 8 == PCI_CLASS_STORAGE_IDE && !(dev->class & 0x5))
21442+ return 0;
21443+
21444+ printk(KERN_WARNING "PCI: No IRQ known for interrupt pin %c of device %s.%s\n",
21445+ 'A' + pin, pci_name(dev), msg);
21446+ }
21447+ return 0;
21448+}
21449+
21450+int pci_vector_resources(int last, int nr_released)
21451+{
21452+ int count = nr_released;
21453+
21454+ int next = last;
21455+ int offset = (last % 8);
21456+
21457+ while (next < FIRST_SYSTEM_VECTOR) {
21458+ next += 8;
21459+#ifdef CONFIG_X86_64
21460+ if (next == IA32_SYSCALL_VECTOR)
21461+ continue;
21462+#else
21463+ if (next == SYSCALL_VECTOR)
21464+ continue;
21465+#endif
21466+ count++;
21467+ if (next >= FIRST_SYSTEM_VECTOR) {
21468+ if (offset%8) {
21469+ next = FIRST_DEVICE_VECTOR + offset;
21470+ offset++;
21471+ continue;
21472+ }
21473+ count--;
21474+ }
21475+ }
21476+
21477+ return count;
21478+}
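
Most router back-ends in irq-xen.c above funnel through read_config_nybble()/write_config_nybble(): each byte of the router's configuration space holds two 4-bit IRQ values, so a link number nr selects the byte at offset + (nr >> 1) and the high or low nibble via (nr & 1). A minimal stand-alone sketch of the same packing, using a plain byte array instead of real PCI config space (offset 0x55 and the IRQ numbers are made up for the example):

    #include <assert.h>
    #include <stdint.h>

    static uint8_t cfg[256];   /* fake config space instead of pci_{read,write}_config_byte() */

    static unsigned int read_nybble(unsigned int offset, unsigned int nr)
    {
            uint8_t x = cfg[offset + (nr >> 1)];
            return (nr & 1) ? (x >> 4) : (x & 0xf);
    }

    static void write_nybble(unsigned int offset, unsigned int nr, unsigned int val)
    {
            uint8_t *p = &cfg[offset + (nr >> 1)];
            *p = (nr & 1) ? ((*p & 0x0f) | (val << 4)) : ((*p & 0xf0) | val);
    }

    int main(void)
    {
            /* two hypothetical links sharing the byte at offset 0x55 */
            write_nybble(0x55, 0, 11);
            write_nybble(0x55, 1, 10);
            assert(read_nybble(0x55, 0) == 11);
            assert(read_nybble(0x55, 1) == 10);
            assert(cfg[0x55] == ((10 << 4) | 11));   /* two 4-bit IRQ values in one byte */
            return 0;
    }

Each *_router_probe() entry in pirq_routers then only has to supply the right register offset and link-to-nibble mapping for its chipset.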
21479diff -Nur linux-2.6.16.33-noxen/arch/i386/pci/mmconfig.c linux-2.6.16.33/arch/i386/pci/mmconfig.c
21480--- linux-2.6.16.33-noxen/arch/i386/pci/mmconfig.c 2006-11-22 18:06:31.000000000 +0000
21481+++ linux-2.6.16.33/arch/i386/pci/mmconfig.c 2007-05-23 21:00:01.000000000 +0000
21482@@ -12,14 +12,22 @@
21483 #include <linux/pci.h>
21484 #include <linux/init.h>
21485 #include <linux/acpi.h>
21486+#include <asm/e820.h>
21487 #include "pci.h"
21488
21489+/* aperture is up to 256MB but BIOS may reserve less */
21490+#define MMCONFIG_APER_MIN (2 * 1024*1024)
21491+#define MMCONFIG_APER_MAX (256 * 1024*1024)
21492+
21493+/* Assume systems with more busses have correct MCFG */
21494+#define MAX_CHECK_BUS 16
21495+
21496 #define mmcfg_virt_addr ((void __iomem *) fix_to_virt(FIX_PCIE_MCFG))
21497
21498 /* The base address of the last MMCONFIG device accessed */
21499 static u32 mmcfg_last_accessed_device;
21500
21501-static DECLARE_BITMAP(fallback_slots, 32);
21502+static DECLARE_BITMAP(fallback_slots, MAX_CHECK_BUS*32);
21503
21504 /*
21505 * Functions for accessing PCI configuration space with MMCONFIG accesses
21506@@ -29,8 +37,8 @@
21507 int cfg_num = -1;
21508 struct acpi_table_mcfg_config *cfg;
21509
21510- if (seg == 0 && bus == 0 &&
21511- test_bit(PCI_SLOT(devfn), fallback_slots))
21512+ if (seg == 0 && bus < MAX_CHECK_BUS &&
21513+ test_bit(PCI_SLOT(devfn) + 32*bus, fallback_slots))
21514 return 0;
21515
21516 while (1) {
21517@@ -74,8 +82,10 @@
21518 unsigned long flags;
21519 u32 base;
21520
21521- if (!value || (bus > 255) || (devfn > 255) || (reg > 4095))
21522+ if ((bus > 255) || (devfn > 255) || (reg > 4095)) {
21523+ *value = -1;
21524 return -EINVAL;
21525+ }
21526
21527 base = get_base_addr(seg, bus, devfn);
21528 if (!base)
21529@@ -146,30 +156,66 @@
21530 Normally this can be expressed in the MCFG by not listing them
21531 and assigning suitable _SEGs, but this isn't implemented in some BIOS.
21532 Instead try to discover all devices on bus 0 that are unreachable using MM
21533- and fallback for them.
21534- We only do this for bus 0/seg 0 */
21535+ and fallback for them. */
21536 static __init void unreachable_devices(void)
21537 {
21538- int i;
21539+ int i, k;
21540 unsigned long flags;
21541
21542- for (i = 0; i < 32; i++) {
21543- u32 val1;
21544- u32 addr;
21545+ for (k = 0; k < MAX_CHECK_BUS; k++) {
21546+ for (i = 0; i < 32; i++) {
21547+ u32 val1;
21548+ u32 addr;
21549+
21550+ pci_conf1_read(0, k, PCI_DEVFN(i, 0), 0, 4, &val1);
21551+ if (val1 == 0xffffffff)
21552+ continue;
21553+
21554+ /* Locking probably not needed, but safer */
21555+ spin_lock_irqsave(&pci_config_lock, flags);
21556+ addr = get_base_addr(0, k, PCI_DEVFN(i, 0));
21557+ if (addr != 0)
21558+ pci_exp_set_dev_base(addr, k, PCI_DEVFN(i, 0));
21559+ if (addr == 0 ||
21560+ readl((u32 __iomem *)mmcfg_virt_addr) != val1) {
21561+ set_bit(i, fallback_slots);
21562+ printk(KERN_NOTICE
21563+ "PCI: No mmconfig possible on %x:%x\n", k, i);
21564+ }
21565+ spin_unlock_irqrestore(&pci_config_lock, flags);
21566+ }
21567+ }
21568+}
21569
21570- pci_conf1_read(0, 0, PCI_DEVFN(i, 0), 0, 4, &val1);
21571- if (val1 == 0xffffffff)
21572+/* NB. Ripped from arch/i386/kernel/setup.c for this Xen bugfix patch. */
21573+#ifdef CONFIG_XEN
21574+extern struct e820map machine_e820;
21575+#define e820 machine_e820
21576+#endif
21577+static int __init
21578+e820_all_mapped(unsigned long s, unsigned long e, unsigned type)
21579+{
21580+ u64 start = s;
21581+ u64 end = e;
21582+ int i;
21583+ for (i = 0; i < e820.nr_map; i++) {
21584+ struct e820entry *ei = &e820.map[i];
21585+ if (type && ei->type != type)
21586 continue;
21587-
21588- /* Locking probably not needed, but safer */
21589- spin_lock_irqsave(&pci_config_lock, flags);
21590- addr = get_base_addr(0, 0, PCI_DEVFN(i, 0));
21591- if (addr != 0)
21592- pci_exp_set_dev_base(addr, 0, PCI_DEVFN(i, 0));
21593- if (addr == 0 || readl((u32 __iomem *)mmcfg_virt_addr) != val1)
21594- set_bit(i, fallback_slots);
21595- spin_unlock_irqrestore(&pci_config_lock, flags);
21596+ /* does this region overlap (at least in part) the range left to check? */
21597+ if (ei->addr >= end || ei->addr + ei->size <= start)
21598+ continue;
21599+ /* if the region is at the beginning of <start,end> we move
21600+ * start to the end of the region since it's ok until there
21601+ */
21602+ if (ei->addr <= start)
21603+ start = ei->addr + ei->size;
21604+ /* if start is now at or beyond end, we're done, full
21605+ * coverage */
21606+ if (start >= end)
21607+ return 1; /* we're done */
21608 }
21609+ return 0;
21610 }
21611
21612 static int __init pci_mmcfg_init(void)
21613@@ -183,6 +229,15 @@
21614 (pci_mmcfg_config[0].base_address == 0))
21615 goto out;
21616
21617+ if (!e820_all_mapped(pci_mmcfg_config[0].base_address,
21618+ pci_mmcfg_config[0].base_address + MMCONFIG_APER_MIN,
21619+ E820_RESERVED)) {
21620+ printk(KERN_ERR "PCI: BIOS Bug: MCFG area at %x is not E820-reserved\n",
21621+ pci_mmcfg_config[0].base_address);
21622+ printk(KERN_ERR "PCI: Not using MMCONFIG.\n");
21623+ goto out;
21624+ }
21625+
21626 printk(KERN_INFO "PCI: Using MMCONFIG\n");
21627 raw_pci_ops = &pci_mmcfg;
21628 pci_probe = (pci_probe & ~PCI_PROBE_MASK) | PCI_PROBE_MMCONF;
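
The e820_all_mapped() helper introduced in this hunk answers whether [start, end) is completely covered by E820 regions of a given type: it walks the (already sanitized, i.e. sorted and non-overlapping) map and advances start past every matching region that begins at or before it, succeeding once start reaches end. A user-space sketch of the same walk, with a made-up two-entry region table standing in for the kernel's e820 map (type 2 plays the role of E820_RESERVED here):

    #include <assert.h>
    #include <stddef.h>
    #include <stdint.h>

    struct region { uint64_t addr, size; int type; };

    /* toy map: two reserved regions with a hole between them */
    static const struct region map[] = {
            { 0xe0000000ULL, 0x04000000ULL, 2 },
            { 0xe8000000ULL, 0x08000000ULL, 2 },
    };

    static int all_mapped(uint64_t start, uint64_t end, int type)
    {
            for (size_t i = 0; i < sizeof(map) / sizeof(map[0]); i++) {
                    const struct region *r = &map[i];

                    if (r->type != type)
                            continue;
                    if (r->addr >= end || r->addr + r->size <= start)
                            continue;                       /* no overlap with what is left */
                    if (r->addr <= start)
                            start = r->addr + r->size;      /* covered up to here */
                    if (start >= end)
                            return 1;                       /* full coverage */
            }
            return 0;                                       /* a gap remained */
    }

    int main(void)
    {
            assert(all_mapped(0xe0000000ULL, 0xe0200000ULL, 2));   /* inside the first region */
            assert(!all_mapped(0xe0000000ULL, 0xe9000000ULL, 2));  /* spans the hole */
            return 0;
    }

pci_mmcfg_init() uses this check to refuse MMCONFIG when the BIOS did not reserve at least MMCONFIG_APER_MIN of the MCFG aperture in the E820 map.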
21629diff -Nur linux-2.6.16.33-noxen/arch/i386/pci/pcifront.c linux-2.6.16.33/arch/i386/pci/pcifront.c
21630--- linux-2.6.16.33-noxen/arch/i386/pci/pcifront.c 1970-01-01 00:00:00.000000000 +0000
21631+++ linux-2.6.16.33/arch/i386/pci/pcifront.c 2007-01-08 15:00:45.000000000 +0000
21632@@ -0,0 +1,55 @@
21633+/*
21634+ * PCI Frontend Stub - puts some "dummy" functions in to the Linux x86 PCI core
21635+ * to support the Xen PCI Frontend's operation
21636+ *
21637+ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
21638+ */
21639+#include <linux/module.h>
21640+#include <linux/init.h>
21641+#include <linux/pci.h>
21642+#include <asm/acpi.h>
21643+#include "pci.h"
21644+
21645+static int pcifront_enable_irq(struct pci_dev *dev)
21646+{
21647+ u8 irq;
21648+ pci_read_config_byte(dev, PCI_INTERRUPT_LINE, &irq);
21649+ dev->irq = irq;
21650+
21651+ return 0;
21652+}
21653+
21654+extern u8 pci_cache_line_size;
21655+
21656+static int __init pcifront_x86_stub_init(void)
21657+{
21658+ struct cpuinfo_x86 *c = &boot_cpu_data;
21659+
21660+ /* Only install our method if we haven't found real hardware already */
21661+ if (raw_pci_ops)
21662+ return 0;
21663+
21664+ printk(KERN_INFO "PCI: setting up Xen PCI frontend stub\n");
21665+
21666+ /* Copied from arch/i386/pci/common.c */
21667+ pci_cache_line_size = 32 >> 2;
21668+ if (c->x86 >= 6 && c->x86_vendor == X86_VENDOR_AMD)
21669+ pci_cache_line_size = 64 >> 2; /* K7 & K8 */
21670+ else if (c->x86 > 6 && c->x86_vendor == X86_VENDOR_INTEL)
21671+ pci_cache_line_size = 128 >> 2; /* P4 */
21672+
21673+ /* On x86, we need to disable the normal IRQ routing table and
21674+ * just ask the backend
21675+ */
21676+ pcibios_enable_irq = pcifront_enable_irq;
21677+ pcibios_disable_irq = NULL;
21678+
21679+#ifdef CONFIG_ACPI
21680+ /* Keep ACPI out of the picture */
21681+ acpi_noirq = 1;
21682+#endif
21683+
21684+ return 0;
21685+}
21686+
21687+arch_initcall(pcifront_x86_stub_init);
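
The Makefile ordering comment earlier (pcifront.o linked after pcbios.o, mmconfig.o and direct.o) and the raw_pci_ops test in pcifront_x86_stub_init() express the same "first successful probe wins" rule: the Xen frontend stub only installs itself if no real config-space access method registered first. A hedged, self-contained sketch of that pattern — the function and variable names below are invented for illustration, not taken from the kernel:

    #include <stdio.h>

    struct pci_raw_ops { const char *name; };

    static struct pci_raw_ops *raw_ops;        /* stands in for raw_pci_ops */

    static struct pci_raw_ops direct_ops = { "direct"   };
    static struct pci_raw_ops front_ops  = { "pcifront" };

    /* earlier initcall: direct config-space access */
    static void direct_probe(void)
    {
            raw_ops = &direct_ops;     /* pretend the hardware probe succeeded */
    }

    /* later initcall: Xen PCI frontend stub */
    static void pcifront_probe(void)
    {
            if (raw_ops)               /* real hardware access already found */
                    return;
            raw_ops = &front_ops;      /* otherwise route everything via the backend */
    }

    int main(void)
    {
            direct_probe();
            pcifront_probe();
            printf("using %s\n", raw_ops->name);   /* prints "using direct" */
            return 0;
    }

In a domU without direct PCI access the first probe would leave raw_ops unset in this sketch, and the frontend would take over, mirroring what the stub above does when raw_pci_ops is still NULL.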
21688diff -Nur linux-2.6.16.33-noxen/arch/i386/power/Makefile linux-2.6.16.33/arch/i386/power/Makefile
21689--- linux-2.6.16.33-noxen/arch/i386/power/Makefile 2006-11-22 18:06:31.000000000 +0000
21690+++ linux-2.6.16.33/arch/i386/power/Makefile 2007-01-08 15:00:45.000000000 +0000
21691@@ -1,2 +1,4 @@
21692-obj-$(CONFIG_PM) += cpu.o
21693+obj-$(CONFIG_PM_LEGACY) += cpu.o
21694+obj-$(CONFIG_SOFTWARE_SUSPEND) += cpu.o
21695+obj-$(CONFIG_ACPI_SLEEP) += cpu.o
21696 obj-$(CONFIG_SOFTWARE_SUSPEND) += swsusp.o
21697diff -Nur linux-2.6.16.33-noxen/arch/ia64/Kconfig linux-2.6.16.33/arch/ia64/Kconfig
21698--- linux-2.6.16.33-noxen/arch/ia64/Kconfig 2006-11-22 18:06:31.000000000 +0000
21699+++ linux-2.6.16.33/arch/ia64/Kconfig 2007-01-08 15:00:45.000000000 +0000
21700@@ -50,6 +50,34 @@
21701 bool
21702 default y
21703
21704+config XEN
21705+ bool "Xen hypervisor support"
21706+ default y
21707+ help
21708+ Enable Xen hypervisor support. The resulting kernel runs
21709+ both as a guest OS on Xen and natively on hardware.
21710+
21711+config XEN_IA64_VDSO_PARAVIRT
21712+ bool
21713+ depends on XEN && !ITANIUM
21714+ default y
21715+ help
21716+ vDSO paravirtualization
21717+
21718+config XEN_IA64_EXPOSE_P2M
21719+ bool "Xen/IA64 exposure p2m table"
21720+ depends on XEN
21721+ default y
21722+ help
21723+ expose the p2m table from Xen to the guest
21724+
21725+config XEN_IA64_EXPOSE_P2M_USE_DTR
21726+ bool "Xen/IA64 map p2m table with dtr"
21727+ depends on XEN_IA64_EXPOSE_P2M
21728+ default y
21729+ help
21730+ use dtr to map the exposed p2m table
21731+
21732 config SCHED_NO_NO_OMIT_FRAME_POINTER
21733 bool
21734 default y
21735@@ -413,6 +441,21 @@
21736 bool
21737 default PCI
21738
21739+config XEN_PCIDEV_FRONTEND
21740+ bool "Xen PCI Frontend"
21741+ depends on PCI && XEN
21742+ default y
21743+ help
21744+ The PCI device frontend driver allows the kernel to import arbitrary
21745+ PCI devices from a PCI backend to support PCI driver domains.
21746+
21747+config XEN_PCIDEV_FE_DEBUG
21748+ bool "Xen PCI Frontend Debugging"
21749+ depends on XEN_PCIDEV_FRONTEND
21750+ default n
21751+ help
21752+ Enables some debug statements within the PCI Frontend.
21753+
21754 source "drivers/pci/Kconfig"
21755
21756 source "drivers/pci/hotplug/Kconfig"
21757@@ -470,3 +513,32 @@
21758 source "security/Kconfig"
21759
21760 source "crypto/Kconfig"
21761+
21762+#
21763+# override default values of drivers/xen/Kconfig
21764+#
21765+if XEN
21766+config XEN_UTIL
21767+ default n
21768+
21769+config HAVE_ARCH_ALLOC_SKB
21770+ default y
21771+
21772+config HAVE_ARCH_DEV_ALLOC_SKB
21773+ default y
21774+
21775+config XEN_BALLOON
21776+ default y
21777+
21778+config XEN_SKBUFF
21779+ default y
21780+ depends on NET
21781+
21782+config XEN_REBOOT
21783+ default y
21784+
21785+config XEN_SMPBOOT
21786+ default n
21787+endif
21788+
21789+source "drivers/xen/Kconfig"
21790diff -Nur linux-2.6.16.33-noxen/arch/ia64/Makefile linux-2.6.16.33/arch/ia64/Makefile
21791--- linux-2.6.16.33-noxen/arch/ia64/Makefile 2006-11-22 18:06:31.000000000 +0000
21792+++ linux-2.6.16.33/arch/ia64/Makefile 2007-01-08 15:00:45.000000000 +0000
21793@@ -42,6 +42,12 @@
21794 endif
21795
21796 CFLAGS += $(cflags-y)
21797+
21798+cppflags-$(CONFIG_XEN) += \
21799+ -D__XEN_INTERFACE_VERSION__=$(CONFIG_XEN_INTERFACE_VERSION)
21800+
21801+CPPFLAGS += $(cppflags-y)
21802+
21803 head-y := arch/ia64/kernel/head.o arch/ia64/kernel/init_task.o
21804
21805 libs-y += arch/ia64/lib/
21806@@ -52,9 +58,15 @@
21807 core-$(CONFIG_IA64_HP_ZX1) += arch/ia64/dig/
21808 core-$(CONFIG_IA64_HP_ZX1_SWIOTLB) += arch/ia64/dig/
21809 core-$(CONFIG_IA64_SGI_SN2) += arch/ia64/sn/
21810+core-$(CONFIG_XEN) += arch/ia64/xen/
21811
21812 drivers-$(CONFIG_PCI) += arch/ia64/pci/
21813+ifneq ($(CONFIG_XEN),y)
21814 drivers-$(CONFIG_IA64_HP_SIM) += arch/ia64/hp/sim/
21815+endif
21816+ifneq ($(CONFIG_IA64_GENERIC),y)
21817+drivers-$(CONFIG_XEN) += arch/ia64/hp/sim/
21818+endif
21819 drivers-$(CONFIG_IA64_HP_ZX1) += arch/ia64/hp/common/ arch/ia64/hp/zx1/
21820 drivers-$(CONFIG_IA64_HP_ZX1_SWIOTLB) += arch/ia64/hp/common/ arch/ia64/hp/zx1/
21821 drivers-$(CONFIG_IA64_GENERIC) += arch/ia64/hp/common/ arch/ia64/hp/zx1/ arch/ia64/hp/sim/ arch/ia64/sn/
21822@@ -68,6 +80,8 @@
21823
21824 compressed: vmlinux.gz
21825
21826+vmlinuz: vmlinux.gz
21827+
21828 vmlinux.gz: vmlinux
21829 $(Q)$(MAKE) $(build)=$(boot) $@
21830
21831@@ -82,8 +96,8 @@
21832 boot: lib/lib.a vmlinux
21833 $(Q)$(MAKE) $(build)=$(boot) $@
21834
21835-install: vmlinux.gz
21836- sh $(srctree)/arch/ia64/install.sh $(KERNELRELEASE) $< System.map "$(INSTALL_PATH)"
21837+install:
21838+ -yes | sh $(srctree)/arch/ia64/install.sh $(KERNELRELEASE) vmlinux.gz System.map "$(INSTALL_PATH)"
21839
21840 define archhelp
21841 echo '* compressed - Build compressed kernel image'
21842diff -Nur linux-2.6.16.33-noxen/arch/ia64/dig/setup.c linux-2.6.16.33/arch/ia64/dig/setup.c
21843--- linux-2.6.16.33-noxen/arch/ia64/dig/setup.c 2006-11-22 18:06:31.000000000 +0000
21844+++ linux-2.6.16.33/arch/ia64/dig/setup.c 2007-01-08 15:00:45.000000000 +0000
21845@@ -25,6 +25,8 @@
21846 #include <asm/machvec.h>
21847 #include <asm/system.h>
21848
21849+#include <xen/xencons.h>
21850+
21851 void __init
21852 dig_setup (char **cmdline_p)
21853 {
21854@@ -68,6 +70,21 @@
21855 screen_info.orig_video_mode = 3; /* XXX fake */
21856 screen_info.orig_video_isVGA = 1; /* XXX fake */
21857 screen_info.orig_video_ega_bx = 3; /* XXX fake */
21858+#ifdef CONFIG_XEN
21859+ if (!is_running_on_xen() || !is_initial_xendomain())
21860+ return;
21861+
21862+ if (xen_start_info->console.dom0.info_size >=
21863+ sizeof(struct dom0_vga_console_info)) {
21864+ const struct dom0_vga_console_info *info =
21865+ (struct dom0_vga_console_info *)(
21866+ (char *)xen_start_info +
21867+ xen_start_info->console.dom0.info_off);
21868+ dom0_init_screen_info(info);
21869+ }
21870+ xen_start_info->console.domU.mfn = 0;
21871+ xen_start_info->console.domU.evtchn = 0;
21872+#endif
21873 }
21874
21875 void __init
21876diff -Nur linux-2.6.16.33-noxen/arch/ia64/hp/sim/Makefile linux-2.6.16.33/arch/ia64/hp/sim/Makefile
21877--- linux-2.6.16.33-noxen/arch/ia64/hp/sim/Makefile 2006-11-22 18:06:31.000000000 +0000
21878+++ linux-2.6.16.33/arch/ia64/hp/sim/Makefile 2007-01-08 15:00:45.000000000 +0000
21879@@ -14,3 +14,5 @@
21880 obj-$(CONFIG_HP_SIMSERIAL) += simserial.o
21881 obj-$(CONFIG_HP_SIMSERIAL_CONSOLE) += hpsim_console.o
21882 obj-$(CONFIG_HP_SIMSCSI) += simscsi.o
21883+obj-$(CONFIG_XEN) += simserial.o
21884+obj-$(CONFIG_XEN) += hpsim_console.o
21885diff -Nur linux-2.6.16.33-noxen/arch/ia64/kernel/Makefile linux-2.6.16.33/arch/ia64/kernel/Makefile
21886--- linux-2.6.16.33-noxen/arch/ia64/kernel/Makefile 2006-11-22 18:06:31.000000000 +0000
21887+++ linux-2.6.16.33/arch/ia64/kernel/Makefile 2007-01-08 15:00:45.000000000 +0000
21888@@ -44,7 +44,8 @@
21889 quiet_cmd_gate = GATE $@
21890 cmd_gate = $(CC) -nostdlib $(GATECFLAGS_$(@F)) -Wl,-T,$(filter-out FORCE,$^) -o $@
21891
21892-GATECFLAGS_gate.so = -shared -s -Wl,-soname=linux-gate.so.1
21893+GATECFLAGS_gate.so = -shared -s -Wl,-soname=linux-gate.so.1 \
21894+ $(call ld-option, -Wl$(comma)--hash-style=sysv)
21895 $(obj)/gate.so: $(obj)/gate.lds $(obj)/gate.o FORCE
21896 $(call if_changed,gate)
21897
21898diff -Nur linux-2.6.16.33-noxen/arch/ia64/kernel/asm-offsets.c linux-2.6.16.33/arch/ia64/kernel/asm-offsets.c
21899--- linux-2.6.16.33-noxen/arch/ia64/kernel/asm-offsets.c 2006-11-22 18:06:31.000000000 +0000
21900+++ linux-2.6.16.33/arch/ia64/kernel/asm-offsets.c 2007-01-08 15:00:45.000000000 +0000
21901@@ -261,4 +261,28 @@
21902 DEFINE(IA64_TIME_SOURCE_MMIO64, TIME_SOURCE_MMIO64);
21903 DEFINE(IA64_TIME_SOURCE_MMIO32, TIME_SOURCE_MMIO32);
21904 DEFINE(IA64_TIMESPEC_TV_NSEC_OFFSET, offsetof (struct timespec, tv_nsec));
21905+
21906+#ifdef CONFIG_XEN
21907+ BLANK();
21908+
21909+#define DEFINE_MAPPED_REG_OFS(sym, field) \
21910+ DEFINE(sym, (XMAPPEDREGS_OFS + offsetof(mapped_regs_t, field)))
21911+
21912+ DEFINE_MAPPED_REG_OFS(XSI_PSR_I_ADDR_OFS, interrupt_mask_addr);
21913+ DEFINE_MAPPED_REG_OFS(XSI_IPSR_OFS, ipsr);
21914+ DEFINE_MAPPED_REG_OFS(XSI_IIP_OFS, iip);
21915+ DEFINE_MAPPED_REG_OFS(XSI_IFS_OFS, ifs);
21916+ DEFINE_MAPPED_REG_OFS(XSI_PRECOVER_IFS_OFS, precover_ifs);
21917+ DEFINE_MAPPED_REG_OFS(XSI_ISR_OFS, isr);
21918+ DEFINE_MAPPED_REG_OFS(XSI_IFA_OFS, ifa);
21919+ DEFINE_MAPPED_REG_OFS(XSI_IIPA_OFS, iipa);
21920+ DEFINE_MAPPED_REG_OFS(XSI_IIM_OFS, iim);
21921+ DEFINE_MAPPED_REG_OFS(XSI_IHA_OFS, iha);
21922+ DEFINE_MAPPED_REG_OFS(XSI_ITIR_OFS, itir);
21923+ DEFINE_MAPPED_REG_OFS(XSI_PSR_IC_OFS, interrupt_collection_enabled);
21924+ DEFINE_MAPPED_REG_OFS(XSI_INCOMPL_REGFR_OFS, incomplete_regframe);
21925+ DEFINE_MAPPED_REG_OFS(XSI_BANKNUM_OFS, banknum);
21926+ DEFINE_MAPPED_REG_OFS(XSI_BANK0_R16_OFS, bank0_regs[0]);
21927+ DEFINE_MAPPED_REG_OFS(XSI_BANK1_R16_OFS, bank1_regs[0]);
21928+#endif /* CONFIG_XEN */
21929 }
21930diff -Nur linux-2.6.16.33-noxen/arch/ia64/kernel/entry.S linux-2.6.16.33/arch/ia64/kernel/entry.S
21931--- linux-2.6.16.33-noxen/arch/ia64/kernel/entry.S 2006-11-22 18:06:31.000000000 +0000
21932+++ linux-2.6.16.33/arch/ia64/kernel/entry.S 2007-01-08 15:00:45.000000000 +0000
21933@@ -181,7 +181,7 @@
21934 * called. The code starting at .map relies on this. The rest of the code
21935 * doesn't care about the interrupt masking status.
21936 */
21937-GLOBAL_ENTRY(ia64_switch_to)
21938+GLOBAL_ENTRY(__ia64_switch_to)
21939 .prologue
21940 alloc r16=ar.pfs,1,0,0,0
21941 DO_SAVE_SWITCH_STACK
21942@@ -235,7 +235,7 @@
21943 ;;
21944 srlz.d
21945 br.cond.sptk .done
21946-END(ia64_switch_to)
21947+END(__ia64_switch_to)
21948
21949 /*
21950 * Note that interrupts are enabled during save_switch_stack and load_switch_stack. This
21951@@ -376,7 +376,7 @@
21952 * - b7 holds address to return to
21953 * - must not touch r8-r11
21954 */
21955-ENTRY(load_switch_stack)
21956+GLOBAL_ENTRY(load_switch_stack)
21957 .prologue
21958 .altrp b7
21959
21960@@ -511,7 +511,7 @@
21961 * because some system calls (such as ia64_execve) directly
21962 * manipulate ar.pfs.
21963 */
21964-GLOBAL_ENTRY(ia64_trace_syscall)
21965+GLOBAL_ENTRY(__ia64_trace_syscall)
21966 PT_REGS_UNWIND_INFO(0)
21967 /*
21968 * We need to preserve the scratch registers f6-f11 in case the system
21969@@ -583,7 +583,7 @@
21970 (p6) mov r10=-1
21971 (p6) mov r8=r9
21972 br.cond.sptk .strace_save_retval
21973-END(ia64_trace_syscall)
21974+END(__ia64_trace_syscall)
21975
21976 /*
21977 * When traced and returning from sigreturn, we invoke syscall_trace but then
21978@@ -602,7 +602,7 @@
21979 .ret4: br.cond.sptk ia64_leave_kernel
21980 END(ia64_strace_leave_kernel)
21981
21982-GLOBAL_ENTRY(ia64_ret_from_clone)
21983+GLOBAL_ENTRY(__ia64_ret_from_clone)
21984 PT_REGS_UNWIND_INFO(0)
21985 { /*
21986 * Some versions of gas generate bad unwind info if the first instruction of a
21987@@ -628,7 +628,7 @@
21988 cmp.ne p6,p0=r2,r0
21989 (p6) br.cond.spnt .strace_check_retval
21990 ;; // added stop bits to prevent r8 dependency
21991-END(ia64_ret_from_clone)
21992+END(__ia64_ret_from_clone)
21993 // fall through
21994 GLOBAL_ENTRY(ia64_ret_from_syscall)
21995 PT_REGS_UNWIND_INFO(0)
21996@@ -636,8 +636,11 @@
21997 adds r2=PT(R8)+16,sp // r2 = &pt_regs.r8
21998 mov r10=r0 // clear error indication in r10
21999 (p7) br.cond.spnt handle_syscall_error // handle potential syscall failure
22000+ ;;
22001+ // don't fall through, ia64_leave_syscall may be #define'd
22002+ br.cond.sptk.few ia64_leave_syscall
22003+ ;;
22004 END(ia64_ret_from_syscall)
22005- // fall through
22006 /*
22007 * ia64_leave_syscall(): Same as ia64_leave_kernel, except that it doesn't
22008 * need to switch to bank 0 and doesn't restore the scratch registers.
22009@@ -682,7 +685,7 @@
22010 * ar.csd: cleared
22011 * ar.ssd: cleared
22012 */
22013-ENTRY(ia64_leave_syscall)
22014+GLOBAL_ENTRY(__ia64_leave_syscall)
22015 PT_REGS_UNWIND_INFO(0)
22016 /*
22017 * work.need_resched etc. mustn't get changed by this CPU before it returns to
22018@@ -790,7 +793,7 @@
22019 mov.m ar.ssd=r0 // M2 clear ar.ssd
22020 mov f11=f0 // F clear f11
22021 br.cond.sptk.many rbs_switch // B
22022-END(ia64_leave_syscall)
22023+END(__ia64_leave_syscall)
22024
22025 #ifdef CONFIG_IA32_SUPPORT
22026 GLOBAL_ENTRY(ia64_ret_from_ia32_execve)
22027@@ -802,10 +805,13 @@
22028 st8.spill [r2]=r8 // store return value in slot for r8 and set unat bit
22029 .mem.offset 8,0
22030 st8.spill [r3]=r0 // clear error indication in slot for r10 and set unat bit
22031+ ;;
22032+ // don't fall through, ia64_leave_kernel may be #define'd
22033+ br.cond.sptk.few ia64_leave_kernel
22034+ ;;
22035 END(ia64_ret_from_ia32_execve)
22036- // fall through
22037 #endif /* CONFIG_IA32_SUPPORT */
22038-GLOBAL_ENTRY(ia64_leave_kernel)
22039+GLOBAL_ENTRY(__ia64_leave_kernel)
22040 PT_REGS_UNWIND_INFO(0)
22041 /*
22042 * work.need_resched etc. mustn't get changed by this CPU before it returns to
22043@@ -1150,7 +1156,7 @@
22044 ld8 r10=[r3]
22045 br.cond.sptk.many .work_processed_syscall // re-check
22046
22047-END(ia64_leave_kernel)
22048+END(__ia64_leave_kernel)
22049
22050 ENTRY(handle_syscall_error)
22051 /*
22052@@ -1190,7 +1196,7 @@
22053 * be set up by the caller. We declare 8 input registers so the system call
22054 * args get preserved, in case we need to restart a system call.
22055 */
22056-ENTRY(notify_resume_user)
22057+GLOBAL_ENTRY(notify_resume_user)
22058 .prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(8)
22059 alloc loc1=ar.pfs,8,2,3,0 // preserve all eight input regs in case of syscall restart!
22060 mov r9=ar.unat
22061@@ -1278,7 +1284,7 @@
22062 adds sp=16,sp
22063 ;;
22064 ld8 r9=[sp] // load new ar.unat
22065- mov.sptk b7=r8,ia64_leave_kernel
22066+ mov.sptk b7=r8,__ia64_leave_kernel
22067 ;;
22068 mov ar.unat=r9
22069 br.many b7
22070diff -Nur linux-2.6.16.33-noxen/arch/ia64/kernel/gate.S linux-2.6.16.33/arch/ia64/kernel/gate.S
22071--- linux-2.6.16.33-noxen/arch/ia64/kernel/gate.S 2006-11-22 18:06:31.000000000 +0000
22072+++ linux-2.6.16.33/arch/ia64/kernel/gate.S 2007-01-08 15:00:45.000000000 +0000
22073@@ -14,6 +14,9 @@
22074 #include <asm/sigcontext.h>
22075 #include <asm/system.h>
22076 #include <asm/unistd.h>
22077+#ifdef CONFIG_XEN_IA64_VDSO_PARAVIRT
22078+# include <asm/privop.h>
22079+#endif
22080
22081 /*
22082 * We can't easily refer to symbols inside the kernel. To avoid full runtime relocation,
22083@@ -33,6 +36,52 @@
22084 [1:](pr)brl.cond.sptk 0; \
22085 .xdata4 ".data.patch.brl_fsys_bubble_down", 1b-.
22086
22087+#ifdef CONFIG_XEN_IA64_VDSO_PARAVIRT
22088+ // The page in which hyperprivop lives must be pinned by ITR.
22089+	// However the vDSO area isn't pinned, so issuing a hyperprivop
22090+	// from a vDSO page causes the trouble that Kevin pointed out:
22091+	// after clearing vpsr.ic, the vcpu is preempted and the itlb
22092+	// is flushed. When the vcpu gets the cpu again, a tlb miss fault
22093+	// occurs, but it results in a nested dtlb fault because vpsr.ic is off.
22094+ // To avoid such a situation, we jump into the kernel text area
22095+ // which is pinned, and then issue hyperprivop and return back
22096+ // to vDSO page.
22097+ // This is Dan Magenheimer's idea.
22098+
22099+ // Currently is_running_on_xen() is defined as running_on_xen.
22100+	// If is_running_on_xen() ever becomes a real function, this must
22101+	// be updated accordingly.
22102+ .section ".data.patch.running_on_xen", "a"
22103+ .previous
22104+#define LOAD_RUNNING_ON_XEN(reg) \
22105+[1:] movl reg=0; \
22106+ .xdata4 ".data.patch.running_on_xen", 1b-.
22107+
22108+ .section ".data.patch.brl_xen_rsm_be_i", "a"
22109+ .previous
22110+#define BRL_COND_XEN_RSM_BE_I(pr) \
22111+[1:](pr)brl.cond.sptk 0; \
22112+ .xdata4 ".data.patch.brl_xen_rsm_be_i", 1b-.
22113+
22114+ .section ".data.patch.brl_xen_get_psr", "a"
22115+ .previous
22116+#define BRL_COND_XEN_GET_PSR(pr) \
22117+[1:](pr)brl.cond.sptk 0; \
22118+ .xdata4 ".data.patch.brl_xen_get_psr", 1b-.
22119+
22120+ .section ".data.patch.brl_xen_ssm_i_0", "a"
22121+ .previous
22122+#define BRL_COND_XEN_SSM_I_0(pr) \
22123+[1:](pr)brl.cond.sptk 0; \
22124+ .xdata4 ".data.patch.brl_xen_ssm_i_0", 1b-.
22125+
22126+ .section ".data.patch.brl_xen_ssm_i_1", "a"
22127+ .previous
22128+#define BRL_COND_XEN_SSM_I_1(pr) \
22129+[1:](pr)brl.cond.sptk 0; \
22130+ .xdata4 ".data.patch.brl_xen_ssm_i_1", 1b-.
22131+#endif
22132+
22133 GLOBAL_ENTRY(__kernel_syscall_via_break)
22134 .prologue
22135 .altrp b6
22136@@ -77,7 +126,42 @@
22137 epc // B causes split-issue
22138 }
22139 ;;
22140+#ifdef CONFIG_XEN_IA64_VDSO_PARAVIRT
22141+ // r20 = 1
22142+ // r22 = &vcpu->vcpu_info->evtchn_upcall_mask
22143+ // r23 = &vpsr.ic
22144+ // r24 = &vcpu->vcpu_info->evtchn_upcall_pending
22145+ // r25 = tmp
22146+ // r28 = &running_on_xen
22147+ // r30 = running_on_xen
22148+ // r31 = tmp
22149+ // p11 = tmp
22150+ // p12 = running_on_xen
22151+ // p13 = !running_on_xen
22152+ // p14 = tmp
22153+ // p15 = tmp
22154+#define isXen p12
22155+#define isRaw p13
22156+ LOAD_RUNNING_ON_XEN(r28)
22157+ movl r22=XSI_PSR_I_ADDR
22158+ ;;
22159+ ld8 r22=[r22]
22160+ ;;
22161+ movl r23=XSI_PSR_IC
22162+ adds r24=-1,r22
22163+ mov r20=1
22164+ ;;
22165+ ld4 r30=[r28]
22166+ ;;
22167+ cmp.ne isXen,isRaw=r0,r30
22168+ ;;
22169+(isRaw) rsm psr.be | psr.i
22170+ BRL_COND_XEN_RSM_BE_I(isXen)
22171+ .global .vdso_rsm_be_i_ret
22172+.vdso_rsm_be_i_ret:
22173+#else
22174 rsm psr.be | psr.i // M2 (5 cyc to srlz.d)
22175+#endif
22176 LOAD_FSYSCALL_TABLE(r14) // X
22177 ;;
22178 mov r16=IA64_KR(CURRENT) // M2 (12 cyc)
22179@@ -85,7 +169,14 @@
22180 mov r19=NR_syscalls-1 // A
22181 ;;
22182 lfetch [r18] // M0|1
22183+#ifdef CONFIG_XEN_IA64_VDSO_PARAVIRT
22184+(isRaw) mov r29=psr
22185+ BRL_COND_XEN_GET_PSR(isXen)
22186+ .global .vdso_get_psr_ret
22187+.vdso_get_psr_ret:
22188+#else
22189 mov r29=psr // M2 (12 cyc)
22190+#endif
22191 // If r17 is a NaT, p6 will be zero
22192 cmp.geu p6,p7=r19,r17 // A (sysnr > 0 && sysnr < 1024+NR_syscalls)?
22193 ;;
22194@@ -99,9 +190,21 @@
22195 ;;
22196 nop.m 0
22197 (p6) tbit.z.unc p8,p0=r18,0 // I0 (dual-issues with "mov b7=r18"!)
22198+#ifdef CONFIG_XEN_IA64_VDSO_PARAVIRT
22199+ ;;
22200+ // p14 = running_on_xen && p8
22201+ // p15 = !running_on_xen && p8
22202+(p8) cmp.ne.unc p14,p15=r0,r30
22203+ ;;
22204+(p15) ssm psr.i
22205+ BRL_COND_XEN_SSM_I_0(p14)
22206+ .global .vdso_ssm_i_0_ret
22207+.vdso_ssm_i_0_ret:
22208+#else
22209 nop.i 0
22210 ;;
22211 (p8) ssm psr.i
22212+#endif
22213 (p6) mov b7=r18 // I0
22214 (p8) br.dptk.many b7 // B
22215
22216@@ -122,9 +225,21 @@
22217 #else
22218 BRL_COND_FSYS_BUBBLE_DOWN(p6)
22219 #endif
22220+#ifdef CONFIG_XEN_IA64_VDSO_PARAVIRT
22221+(isRaw) ssm psr.i
22222+ BRL_COND_XEN_SSM_I_1(isXen)
22223+ .global .vdso_ssm_i_1_ret
22224+.vdso_ssm_i_1_ret:
22225+#else
22226 ssm psr.i
22227+#endif
22228 mov r10=-1
22229 (p10) mov r8=EINVAL
22230+#ifdef CONFIG_XEN_IA64_VDSO_PARAVIRT
22231+ dv_serialize_data // shut up gas warning.
22232+ // we know xen_hyper_ssm_i_0 or xen_hyper_ssm_i_1
22233+ // doesn't change p9 and p10
22234+#endif
22235 (p9) mov r8=ENOSYS
22236 FSYS_RETURN
22237 END(__kernel_syscall_via_epc)
22238diff -Nur linux-2.6.16.33-noxen/arch/ia64/kernel/gate.lds.S linux-2.6.16.33/arch/ia64/kernel/gate.lds.S
22239--- linux-2.6.16.33-noxen/arch/ia64/kernel/gate.lds.S 2006-11-22 18:06:31.000000000 +0000
22240+++ linux-2.6.16.33/arch/ia64/kernel/gate.lds.S 2007-01-08 15:00:45.000000000 +0000
22241@@ -13,6 +13,7 @@
22242 . = GATE_ADDR + SIZEOF_HEADERS;
22243
22244 .hash : { *(.hash) } :readable
22245+ .gnu.hash : { *(.gnu.hash) }
22246 .dynsym : { *(.dynsym) }
22247 .dynstr : { *(.dynstr) }
22248 .gnu.version : { *(.gnu.version) }
22249@@ -43,6 +44,28 @@
22250 __start_gate_brl_fsys_bubble_down_patchlist = .;
22251 *(.data.patch.brl_fsys_bubble_down)
22252 __end_gate_brl_fsys_bubble_down_patchlist = .;
22253+
22254+#ifdef CONFIG_XEN_IA64_VDSO_PARAVIRT
22255+ __start_gate_running_on_xen_patchlist = .;
22256+ *(.data.patch.running_on_xen)
22257+ __end_gate_running_on_xen_patchlist = .;
22258+
22259+ __start_gate_brl_xen_rsm_be_i_patchlist = .;
22260+ *(.data.patch.brl_xen_rsm_be_i)
22261+ __end_gate_brl_xen_rsm_be_i_patchlist = .;
22262+
22263+ __start_gate_brl_xen_get_psr_patchlist = .;
22264+ *(.data.patch.brl_xen_get_psr)
22265+ __end_gate_brl_xen_get_psr_patchlist = .;
22266+
22267+ __start_gate_brl_xen_ssm_i_0_patchlist = .;
22268+ *(.data.patch.brl_xen_ssm_i_0)
22269+ __end_gate_brl_xen_ssm_i_0_patchlist = .;
22270+
22271+ __start_gate_brl_xen_ssm_i_1_patchlist = .;
22272+ *(.data.patch.brl_xen_ssm_i_1)
22273+ __end_gate_brl_xen_ssm_i_1_patchlist = .;
22274+#endif
22275 } :readable
22276 .IA_64.unwind_info : { *(.IA_64.unwind_info*) }
22277 .IA_64.unwind : { *(.IA_64.unwind*) } :readable :unwind
22278diff -Nur linux-2.6.16.33-noxen/arch/ia64/kernel/head.S linux-2.6.16.33/arch/ia64/kernel/head.S
22279--- linux-2.6.16.33-noxen/arch/ia64/kernel/head.S 2006-11-22 18:06:31.000000000 +0000
22280+++ linux-2.6.16.33/arch/ia64/kernel/head.S 2007-01-08 15:00:45.000000000 +0000
22281@@ -363,6 +363,12 @@
22282 ;;
22283 (isBP) st8 [r2]=r28 // save the address of the boot param area passed by the bootloader
22284
22285+#ifdef CONFIG_XEN
22286+ // Note: isBP is used by the subprogram.
22287+ br.call.sptk.many rp=early_xen_setup
22288+ ;;
22289+#endif
22290+
22291 #ifdef CONFIG_SMP
22292 (isAP) br.call.sptk.many rp=start_secondary
22293 .ret0:
22294diff -Nur linux-2.6.16.33-noxen/arch/ia64/kernel/iosapic.c linux-2.6.16.33/arch/ia64/kernel/iosapic.c
22295--- linux-2.6.16.33-noxen/arch/ia64/kernel/iosapic.c 2006-11-22 18:06:31.000000000 +0000
22296+++ linux-2.6.16.33/arch/ia64/kernel/iosapic.c 2007-01-08 15:00:45.000000000 +0000
22297@@ -140,6 +140,75 @@
22298 static int iosapic_kmalloc_ok;
22299 static LIST_HEAD(free_rte_list);
22300
22301+#ifdef CONFIG_XEN
22302+#include <xen/interface/xen.h>
22303+#include <xen/interface/physdev.h>
22304+#include <asm/hypervisor.h>
22305+static inline unsigned int xen_iosapic_read(char __iomem *iosapic, unsigned int reg)
22306+{
22307+ struct physdev_apic apic_op;
22308+ int ret;
22309+
22310+ apic_op.apic_physbase = (unsigned long)iosapic -
22311+ __IA64_UNCACHED_OFFSET;
22312+ apic_op.reg = reg;
22313+ ret = HYPERVISOR_physdev_op(PHYSDEVOP_apic_read, &apic_op);
22314+ if (ret)
22315+ return ret;
22316+ return apic_op.value;
22317+}
22318+
22319+static inline void xen_iosapic_write(char __iomem *iosapic, unsigned int reg, u32 val)
22320+{
22321+ struct physdev_apic apic_op;
22322+
22323+ apic_op.apic_physbase = (unsigned long)iosapic -
22324+ __IA64_UNCACHED_OFFSET;
22325+ apic_op.reg = reg;
22326+ apic_op.value = val;
22327+ HYPERVISOR_physdev_op(PHYSDEVOP_apic_write, &apic_op);
22328+}
22329+
22330+static inline unsigned int iosapic_read(char __iomem *iosapic, unsigned int reg)
22331+{
22332+ if (!is_running_on_xen()) {
22333+ writel(reg, iosapic + IOSAPIC_REG_SELECT);
22334+ return readl(iosapic + IOSAPIC_WINDOW);
22335+ } else
22336+ return xen_iosapic_read(iosapic, reg);
22337+}
22338+
22339+static inline void iosapic_write(char __iomem *iosapic, unsigned int reg, u32 val)
22340+{
22341+ if (!is_running_on_xen()) {
22342+ writel(reg, iosapic + IOSAPIC_REG_SELECT);
22343+ writel(val, iosapic + IOSAPIC_WINDOW);
22344+ } else
22345+ xen_iosapic_write(iosapic, reg, val);
22346+}
22347+
22348+int xen_assign_irq_vector(int irq)
22349+{
22350+ struct physdev_irq irq_op;
22351+
22352+ irq_op.irq = irq;
22353+ if (HYPERVISOR_physdev_op(PHYSDEVOP_alloc_irq_vector, &irq_op))
22354+ return -ENOSPC;
22355+
22356+ return irq_op.vector;
22357+}
22358+
22359+void xen_free_irq_vector(int vector)
22360+{
22361+ struct physdev_irq irq_op;
22362+
22363+ irq_op.vector = vector;
22364+ if (HYPERVISOR_physdev_op(PHYSDEVOP_free_irq_vector, &irq_op))
22365+		printk(KERN_WARNING "%s: xen_free_irq_vector failed, vector=%d\n",
22366+ __FUNCTION__, vector);
22367+}
22368+#endif /* XEN */
22369+
22370 /*
22371 * Find an IOSAPIC associated with a GSI
22372 */
22373@@ -611,6 +680,9 @@
22374 iosapic_intr_info[vector].dmode = delivery;
22375 iosapic_intr_info[vector].trigger = trigger;
22376
22377+ if (is_running_on_xen())
22378+ return 0;
22379+
22380 if (trigger == IOSAPIC_EDGE)
22381 irq_type = &irq_type_iosapic_edge;
22382 else
22383@@ -953,6 +1025,9 @@
22384 }
22385
22386 pcat_compat = system_pcat_compat;
22387+ if (is_running_on_xen())
22388+ return;
22389+
22390 if (pcat_compat) {
22391 /*
22392 * Disable the compatibility mode interrupts (8259 style), needs IN/OUT support
22393diff -Nur linux-2.6.16.33-noxen/arch/ia64/kernel/irq_ia64.c linux-2.6.16.33/arch/ia64/kernel/irq_ia64.c
22394--- linux-2.6.16.33-noxen/arch/ia64/kernel/irq_ia64.c 2006-11-22 18:06:31.000000000 +0000
22395+++ linux-2.6.16.33/arch/ia64/kernel/irq_ia64.c 2007-01-08 15:00:45.000000000 +0000
22396@@ -31,6 +31,9 @@
22397 #include <linux/smp_lock.h>
22398 #include <linux/threads.h>
22399 #include <linux/bitops.h>
22400+#ifdef CONFIG_XEN
22401+#include <linux/cpu.h>
22402+#endif
22403
22404 #include <asm/delay.h>
22405 #include <asm/intrinsics.h>
22406@@ -66,6 +69,13 @@
22407 assign_irq_vector (int irq)
22408 {
22409 int pos, vector;
22410+
22411+#ifdef CONFIG_XEN
22412+ if (is_running_on_xen()) {
22413+ extern int xen_assign_irq_vector(int);
22414+ return xen_assign_irq_vector(irq);
22415+ }
22416+#endif
22417 again:
22418 pos = find_first_zero_bit(ia64_vector_mask, IA64_NUM_DEVICE_VECTORS);
22419 vector = IA64_FIRST_DEVICE_VECTOR + pos;
22420@@ -84,6 +94,13 @@
22421 if (vector < IA64_FIRST_DEVICE_VECTOR || vector > IA64_LAST_DEVICE_VECTOR)
22422 return;
22423
22424+#ifdef CONFIG_XEN
22425+ if (is_running_on_xen()) {
22426+ extern void xen_free_irq_vector(int);
22427+ xen_free_irq_vector(vector);
22428+ return;
22429+ }
22430+#endif
22431 pos = vector - IA64_FIRST_DEVICE_VECTOR;
22432 if (!test_and_clear_bit(pos, ia64_vector_mask))
22433 printk(KERN_WARNING "%s: double free!\n", __FUNCTION__);
22434@@ -224,12 +241,264 @@
22435 };
22436 #endif
22437
22438+#ifdef CONFIG_XEN
22439+#include <xen/evtchn.h>
22440+#include <xen/interface/callback.h>
22441+
22442+static DEFINE_PER_CPU(int, timer_irq) = -1;
22443+static DEFINE_PER_CPU(int, ipi_irq) = -1;
22444+static DEFINE_PER_CPU(int, resched_irq) = -1;
22445+static DEFINE_PER_CPU(int, cmc_irq) = -1;
22446+static DEFINE_PER_CPU(int, cmcp_irq) = -1;
22447+static DEFINE_PER_CPU(int, cpep_irq) = -1;
22448+static char timer_name[NR_CPUS][15];
22449+static char ipi_name[NR_CPUS][15];
22450+static char resched_name[NR_CPUS][15];
22451+static char cmc_name[NR_CPUS][15];
22452+static char cmcp_name[NR_CPUS][15];
22453+static char cpep_name[NR_CPUS][15];
22454+
22455+struct saved_irq {
22456+ unsigned int irq;
22457+ struct irqaction *action;
22458+};
22459+/* 16 should be a fairly optimistic value, since only a few percpu irqs
22460+ * are registered early.
22461+ */
22462+#define MAX_LATE_IRQ 16
22463+static struct saved_irq saved_percpu_irqs[MAX_LATE_IRQ];
22464+static unsigned short late_irq_cnt = 0;
22465+static unsigned short saved_irq_cnt = 0;
22466+static int xen_slab_ready = 0;
22467+
22468+#ifdef CONFIG_SMP
22469+/* Dummy stub. Though we could check RESCHEDULE_VECTOR before __do_IRQ,
22470+ * that ends up issuing several memory accesses on percpu data and
22471+ * thus adds unnecessary traffic to other paths.
22472+ */
22473+static irqreturn_t
22474+handle_reschedule(int irq, void *dev_id, struct pt_regs *regs)
22475+{
22476+
22477+ return IRQ_HANDLED;
22478+}
22479+
22480+static struct irqaction resched_irqaction = {
22481+ .handler = handle_reschedule,
22482+ .flags = SA_INTERRUPT,
22483+ .name = "RESCHED"
22484+};
22485+#endif
22486+
22487+/*
22488+ * This is the xen version of percpu irq registration, which needs to
22489+ * bind to the xen-specific evtchn sub-system. One trick here is that the
22490+ * xen evtchn binding interface depends on kmalloc, because the related
22491+ * port needs to be freed at device/cpu down. So we cache the
22492+ * registrations on the BSP before the slab is ready and then deal
22493+ * with them at a later point. All other instances happening after the
22494+ * slab is ready are hooked to the xen evtchn immediately.
22495+ *
22496+ * FIXME: MCA is not supported so far, and thus the "nomca" boot param
22497+ * is required.
22498+ */
22499+static void
22500+xen_register_percpu_irq (unsigned int irq, struct irqaction *action, int save)
22501+{
22502+ unsigned int cpu = smp_processor_id();
22503+ int ret = 0;
22504+
22505+ if (xen_slab_ready) {
22506+ switch (irq) {
22507+ case IA64_TIMER_VECTOR:
22508+ sprintf(timer_name[cpu], "%s%d", action->name, cpu);
22509+ ret = bind_virq_to_irqhandler(VIRQ_ITC, cpu,
22510+ action->handler, action->flags,
22511+ timer_name[cpu], action->dev_id);
22512+ per_cpu(timer_irq,cpu) = ret;
22513+ printk(KERN_INFO "register VIRQ_ITC (%s) to xen irq (%d)\n", timer_name[cpu], ret);
22514+ break;
22515+ case IA64_IPI_RESCHEDULE:
22516+ sprintf(resched_name[cpu], "%s%d", action->name, cpu);
22517+ ret = bind_ipi_to_irqhandler(RESCHEDULE_VECTOR, cpu,
22518+ action->handler, action->flags,
22519+ resched_name[cpu], action->dev_id);
22520+ per_cpu(resched_irq,cpu) = ret;
22521+ printk(KERN_INFO "register RESCHEDULE_VECTOR (%s) to xen irq (%d)\n", resched_name[cpu], ret);
22522+ break;
22523+ case IA64_IPI_VECTOR:
22524+ sprintf(ipi_name[cpu], "%s%d", action->name, cpu);
22525+ ret = bind_ipi_to_irqhandler(IPI_VECTOR, cpu,
22526+ action->handler, action->flags,
22527+ ipi_name[cpu], action->dev_id);
22528+ per_cpu(ipi_irq,cpu) = ret;
22529+ printk(KERN_INFO "register IPI_VECTOR (%s) to xen irq (%d)\n", ipi_name[cpu], ret);
22530+ break;
22531+ case IA64_SPURIOUS_INT_VECTOR:
22532+ break;
22533+ case IA64_CMC_VECTOR:
22534+ sprintf(cmc_name[cpu], "%s%d", action->name, cpu);
22535+ ret = bind_virq_to_irqhandler(VIRQ_MCA_CMC, cpu,
22536+ action->handler,
22537+ action->flags,
22538+ cmc_name[cpu],
22539+ action->dev_id);
22540+ per_cpu(cmc_irq,cpu) = ret;
22541+ printk(KERN_INFO "register VIRQ_MCA_CMC (%s) to xen "
22542+ "irq (%d)\n", cmc_name[cpu], ret);
22543+ break;
22544+ case IA64_CMCP_VECTOR:
22545+ sprintf(cmcp_name[cpu], "%s%d", action->name, cpu);
22546+ ret = bind_ipi_to_irqhandler(CMCP_VECTOR, cpu,
22547+ action->handler,
22548+ action->flags,
22549+ cmcp_name[cpu],
22550+ action->dev_id);
22551+ per_cpu(cmcp_irq,cpu) = ret;
22552+ printk(KERN_INFO "register CMCP_VECTOR (%s) to xen "
22553+ "irq (%d)\n", cmcp_name[cpu], ret);
22554+ break;
22555+ case IA64_CPEP_VECTOR:
22556+ sprintf(cpep_name[cpu], "%s%d", action->name, cpu);
22557+ ret = bind_ipi_to_irqhandler(CPEP_VECTOR, cpu,
22558+ action->handler,
22559+ action->flags,
22560+ cpep_name[cpu],
22561+ action->dev_id);
22562+ per_cpu(cpep_irq,cpu) = ret;
22563+ printk(KERN_INFO "register CPEP_VECTOR (%s) to xen "
22564+ "irq (%d)\n", cpep_name[cpu], ret);
22565+ break;
22566+ case IA64_CPE_VECTOR:
22567+ printk(KERN_WARNING "register IA64_CPE_VECTOR "
22568+ "IGNORED\n");
22569+ break;
22570+ default:
22571+ printk(KERN_WARNING "Percpu irq %d is unsupported by xen!\n", irq);
22572+ break;
22573+ }
22574+ BUG_ON(ret < 0);
22575+ }
22576+
22577+ /* For BSP, we cache registered percpu irqs, and then re-walk
22578+ * them when initializing APs
22579+ */
22580+ if (!cpu && save) {
22581+ BUG_ON(saved_irq_cnt == MAX_LATE_IRQ);
22582+ saved_percpu_irqs[saved_irq_cnt].irq = irq;
22583+ saved_percpu_irqs[saved_irq_cnt].action = action;
22584+ saved_irq_cnt++;
22585+ if (!xen_slab_ready)
22586+ late_irq_cnt++;
22587+ }
22588+}
22589+
22590+static void
22591+xen_bind_early_percpu_irq (void)
22592+{
22593+ int i;
22594+
22595+ xen_slab_ready = 1;
22596+	/* There's no race when accessing this cached array, since only
22597+	 * the BSP will go through this step, shortly after boot.
22598+ */
22599+ for (i = 0; i < late_irq_cnt; i++)
22600+ xen_register_percpu_irq(saved_percpu_irqs[i].irq,
22601+ saved_percpu_irqs[i].action, 0);
22602+}
22603+
22604+/* FIXME: There's no obvious point at which to check whether the slab is
22605+ * ready, so a hack is used here by utilizing a late time hook.
22606+ */
22607+extern void (*late_time_init)(void);
22608+extern char xen_event_callback;
22609+extern void xen_init_IRQ(void);
22610+
22611+#ifdef CONFIG_HOTPLUG_CPU
22612+static int __devinit
22613+unbind_evtchn_callback(struct notifier_block *nfb,
22614+ unsigned long action, void *hcpu)
22615+{
22616+ unsigned int cpu = (unsigned long)hcpu;
22617+
22618+ if (action == CPU_DEAD) {
22619+ /* Unregister evtchn. */
22620+ if (per_cpu(cpep_irq,cpu) >= 0) {
22621+ unbind_from_irqhandler(per_cpu(cpep_irq, cpu), NULL);
22622+ per_cpu(cpep_irq, cpu) = -1;
22623+ }
22624+ if (per_cpu(cmcp_irq,cpu) >= 0) {
22625+ unbind_from_irqhandler(per_cpu(cmcp_irq, cpu), NULL);
22626+ per_cpu(cmcp_irq, cpu) = -1;
22627+ }
22628+ if (per_cpu(cmc_irq,cpu) >= 0) {
22629+ unbind_from_irqhandler(per_cpu(cmc_irq, cpu), NULL);
22630+ per_cpu(cmc_irq, cpu) = -1;
22631+ }
22632+ if (per_cpu(ipi_irq,cpu) >= 0) {
22633+ unbind_from_irqhandler (per_cpu(ipi_irq, cpu), NULL);
22634+ per_cpu(ipi_irq, cpu) = -1;
22635+ }
22636+ if (per_cpu(resched_irq,cpu) >= 0) {
22637+ unbind_from_irqhandler (per_cpu(resched_irq, cpu),
22638+ NULL);
22639+ per_cpu(resched_irq, cpu) = -1;
22640+ }
22641+ if (per_cpu(timer_irq,cpu) >= 0) {
22642+ unbind_from_irqhandler (per_cpu(timer_irq, cpu), NULL);
22643+ per_cpu(timer_irq, cpu) = -1;
22644+ }
22645+ }
22646+ return NOTIFY_OK;
22647+}
22648+
22649+static struct notifier_block unbind_evtchn_notifier = {
22650+ .notifier_call = unbind_evtchn_callback,
22651+ .priority = 0
22652+};
22653+#endif
22654+
22655+DECLARE_PER_CPU(int, ipi_to_irq[NR_IPIS]);
22656+void xen_smp_intr_init(void)
22657+{
22658+#ifdef CONFIG_SMP
22659+ unsigned int cpu = smp_processor_id();
22660+ unsigned int i = 0;
22661+ struct callback_register event = {
22662+ .type = CALLBACKTYPE_event,
22663+ .address = (unsigned long)&xen_event_callback,
22664+ };
22665+
22666+ if (cpu == 0) {
22667+ /* Initialization was already done for boot cpu. */
22668+#ifdef CONFIG_HOTPLUG_CPU
22669+ /* Register the notifier only once. */
22670+ register_cpu_notifier(&unbind_evtchn_notifier);
22671+#endif
22672+ return;
22673+ }
22674+
22675+	/* This should be piggybacked when setting up the vcpu guest context */
22676+ BUG_ON(HYPERVISOR_callback_op(CALLBACKOP_register, &event));
22677+
22678+ for (i = 0; i < saved_irq_cnt; i++)
22679+ xen_register_percpu_irq(saved_percpu_irqs[i].irq,
22680+ saved_percpu_irqs[i].action, 0);
22681+#endif /* CONFIG_SMP */
22682+}
22683+#endif /* CONFIG_XEN */
22684+
22685 void
22686 register_percpu_irq (ia64_vector vec, struct irqaction *action)
22687 {
22688 irq_desc_t *desc;
22689 unsigned int irq;
22690
22691+#ifdef CONFIG_XEN
22692+ if (is_running_on_xen())
22693+ return xen_register_percpu_irq(vec, action, 1);
22694+#endif
22695+
22696 for (irq = 0; irq < NR_IRQS; ++irq)
22697 if (irq_to_vector(irq) == vec) {
22698 desc = irq_descp(irq);
22699@@ -243,6 +512,21 @@
22700 void __init
22701 init_IRQ (void)
22702 {
22703+#ifdef CONFIG_XEN
22704+ /* Maybe put into platform_irq_init later */
22705+ if (is_running_on_xen()) {
22706+ struct callback_register event = {
22707+ .type = CALLBACKTYPE_event,
22708+ .address = (unsigned long)&xen_event_callback,
22709+ };
22710+ xen_init_IRQ();
22711+ BUG_ON(HYPERVISOR_callback_op(CALLBACKOP_register, &event));
22712+ late_time_init = xen_bind_early_percpu_irq;
22713+#ifdef CONFIG_SMP
22714+ register_percpu_irq(IA64_IPI_RESCHEDULE, &resched_irqaction);
22715+#endif /* CONFIG_SMP */
22716+ }
22717+#endif /* CONFIG_XEN */
22718 register_percpu_irq(IA64_SPURIOUS_INT_VECTOR, NULL);
22719 #ifdef CONFIG_SMP
22720 register_percpu_irq(IA64_IPI_VECTOR, &ipi_irqaction);
22721@@ -260,6 +544,45 @@
22722 unsigned long ipi_data;
22723 unsigned long phys_cpu_id;
22724
22725+#ifdef CONFIG_XEN
22726+ if (is_running_on_xen()) {
22727+ int irq = -1;
22728+
22729+#ifdef CONFIG_SMP
22730+ /* TODO: we need to call vcpu_up here */
22731+ if (unlikely(vector == ap_wakeup_vector)) {
22732+ extern void xen_send_ipi (int cpu, int vec);
22733+ xen_send_ipi (cpu, vector);
22734+ //vcpu_prepare_and_up(cpu);
22735+ return;
22736+ }
22737+#endif
22738+
22739+ switch(vector) {
22740+ case IA64_IPI_VECTOR:
22741+ irq = per_cpu(ipi_to_irq, cpu)[IPI_VECTOR];
22742+ break;
22743+ case IA64_IPI_RESCHEDULE:
22744+ irq = per_cpu(ipi_to_irq, cpu)[RESCHEDULE_VECTOR];
22745+ break;
22746+ case IA64_CMCP_VECTOR:
22747+ irq = per_cpu(ipi_to_irq, cpu)[CMCP_VECTOR];
22748+ break;
22749+ case IA64_CPEP_VECTOR:
22750+ irq = per_cpu(ipi_to_irq, cpu)[CPEP_VECTOR];
22751+ break;
22752+ default:
22753+ printk(KERN_WARNING"Unsupported IPI type 0x%x\n", vector);
22754+ irq = 0;
22755+ break;
22756+ }
22757+
22758+ BUG_ON(irq < 0);
22759+ notify_remote_via_irq(irq);
22760+ return;
22761+ }
22762+#endif /* CONFIG_XEN */
22763+
22764 #ifdef CONFIG_SMP
22765 phys_cpu_id = cpu_physical_id(cpu);
22766 #else
22767diff -Nur linux-2.6.16.33-noxen/arch/ia64/kernel/pal.S linux-2.6.16.33/arch/ia64/kernel/pal.S
22768--- linux-2.6.16.33-noxen/arch/ia64/kernel/pal.S 2006-11-22 18:06:31.000000000 +0000
22769+++ linux-2.6.16.33/arch/ia64/kernel/pal.S 2007-01-08 15:00:45.000000000 +0000
22770@@ -16,6 +16,7 @@
22771 #include <asm/processor.h>
22772
22773 .data
22774+ .globl pal_entry_point
22775 pal_entry_point:
22776 data8 ia64_pal_default_handler
22777 .text
22778@@ -53,7 +54,7 @@
22779 * in4 1 ==> clear psr.ic, 0 ==> don't clear psr.ic
22780 *
22781 */
22782-GLOBAL_ENTRY(ia64_pal_call_static)
22783+GLOBAL_ENTRY(__ia64_pal_call_static)
22784 .prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(5)
22785 alloc loc1 = ar.pfs,5,5,0,0
22786 movl loc2 = pal_entry_point
22787@@ -90,7 +91,7 @@
22788 ;;
22789 srlz.d // seralize restoration of psr.l
22790 br.ret.sptk.many b0
22791-END(ia64_pal_call_static)
22792+END(__ia64_pal_call_static)
22793
22794 /*
22795 * Make a PAL call using the stacked registers calling convention.
22796diff -Nur linux-2.6.16.33-noxen/arch/ia64/kernel/patch.c linux-2.6.16.33/arch/ia64/kernel/patch.c
22797--- linux-2.6.16.33-noxen/arch/ia64/kernel/patch.c 2006-11-22 18:06:31.000000000 +0000
22798+++ linux-2.6.16.33/arch/ia64/kernel/patch.c 2007-01-08 15:00:45.000000000 +0000
22799@@ -184,6 +184,73 @@
22800 ia64_srlz_i();
22801 }
22802
22803+#ifdef CONFIG_XEN_IA64_VDSO_PARAVIRT
22804+extern char __start_gate_running_on_xen_patchlist[];
22805+extern char __end_gate_running_on_xen_patchlist[];
22806+
22807+void
22808+patch_running_on_xen(unsigned long start, unsigned long end)
22809+{
22810+ extern int running_on_xen;
22811+ s32 *offp = (s32 *)start;
22812+ u64 ip;
22813+
22814+ while (offp < (s32 *)end) {
22815+ ip = (u64)ia64_imva((char *)offp + *offp);
22816+ ia64_patch_imm64(ip, (u64)&running_on_xen);
22817+ ia64_fc((void *)ip);
22818+ ++offp;
22819+ }
22820+ ia64_sync_i();
22821+ ia64_srlz_i();
22822+}
22823+
22824+static void
22825+patch_brl_symaddr(unsigned long start, unsigned long end,
22826+ unsigned long symaddr)
22827+{
22828+ s32 *offp = (s32 *)start;
22829+ u64 ip;
22830+
22831+ while (offp < (s32 *)end) {
22832+ ip = (u64)offp + *offp;
22833+ ia64_patch_imm60((u64)ia64_imva((void *)ip),
22834+ (u64)(symaddr - (ip & -16)) / 16);
22835+ ia64_fc((void *)ip);
22836+ ++offp;
22837+ }
22838+ ia64_sync_i();
22839+ ia64_srlz_i();
22840+}
22841+
22842+#define EXTERN_PATCHLIST(name) \
22843+ extern char __start_gate_brl_##name##_patchlist[]; \
22844+ extern char __end_gate_brl_##name##_patchlist[]; \
22845+ extern char name[]
22846+
22847+#define PATCH_BRL_SYMADDR(name) \
22848+ patch_brl_symaddr((unsigned long)__start_gate_brl_##name##_patchlist, \
22849+ (unsigned long)__end_gate_brl_##name##_patchlist, \
22850+ (unsigned long)name)
22851+
22852+static void
22853+patch_brl_in_vdso(void)
22854+{
22855+ EXTERN_PATCHLIST(xen_rsm_be_i);
22856+ EXTERN_PATCHLIST(xen_get_psr);
22857+ EXTERN_PATCHLIST(xen_ssm_i_0);
22858+ EXTERN_PATCHLIST(xen_ssm_i_1);
22859+
22860+ PATCH_BRL_SYMADDR(xen_rsm_be_i);
22861+ PATCH_BRL_SYMADDR(xen_get_psr);
22862+ PATCH_BRL_SYMADDR(xen_ssm_i_0);
22863+ PATCH_BRL_SYMADDR(xen_ssm_i_1);
22864+}
22865+#else
22866+#define patch_running_on_xen(start, end) do { } while (0)
22867+#define patch_brl_in_vdso() do { } while (0)
22868+#endif
22869+
22870 void
22871 ia64_patch_gate (void)
22872 {
22873@@ -192,6 +259,10 @@
22874
22875 patch_fsyscall_table(START(fsyscall), END(fsyscall));
22876 patch_brl_fsys_bubble_down(START(brl_fsys_bubble_down), END(brl_fsys_bubble_down));
22877+#ifdef CONFIG_XEN
22878+ patch_running_on_xen(START(running_on_xen), END(running_on_xen));
22879+ patch_brl_in_vdso();
22880+#endif
22881 ia64_patch_vtop(START(vtop), END(vtop));
22882 ia64_patch_mckinley_e9(START(mckinley_e9), END(mckinley_e9));
22883 }
22884diff -Nur linux-2.6.16.33-noxen/arch/ia64/kernel/perfmon.c linux-2.6.16.33/arch/ia64/kernel/perfmon.c
22885--- linux-2.6.16.33-noxen/arch/ia64/kernel/perfmon.c 2006-11-22 18:06:31.000000000 +0000
22886+++ linux-2.6.16.33/arch/ia64/kernel/perfmon.c 2007-01-08 15:00:45.000000000 +0000
22887@@ -53,6 +53,28 @@
22888 #include <asm/delay.h>
22889
22890 #ifdef CONFIG_PERFMON
22891+#ifdef CONFIG_XEN
22892+//#include <xen/xenoprof.h>
22893+#include <xen/interface/xenoprof.h>
22894+
22895+static int xenoprof_is_primary = 0;
22896+#define init_xenoprof_primary(is_primary) (xenoprof_is_primary = (is_primary))
22897+#define is_xenoprof_primary() (xenoprof_is_primary)
22898+#define XEN_NOT_SUPPORTED_YET \
22899+ do { \
22900+ if (is_running_on_xen()) { \
22901+ printk("%s is not supported yet under xen.\n", \
22902+ __func__); \
22903+ return -ENOSYS; \
22904+ } \
22905+ } while (0)
22906+#else
22907+#define init_xenoprof_primary(is_primary) do { } while (0)
22908+#define is_xenoprof_primary() (0)
22909+#define XEN_NOT_SUPPORTED_YET do { } while (0)
22910+#define HYPERVISOR_perfmon_op(cmd, arg, count) do { } while (0)
22911+#endif
22912+
22913 /*
22914 * perfmon context state
22915 */
22916@@ -1515,6 +1537,7 @@
22917 ssize_t ret;
22918 unsigned long flags;
22919 DECLARE_WAITQUEUE(wait, current);
22920+ XEN_NOT_SUPPORTED_YET;
22921 if (PFM_IS_FILE(filp) == 0) {
22922 printk(KERN_ERR "perfmon: pfm_poll: bad magic [%d]\n", current->pid);
22923 return -EINVAL;
22924@@ -2113,6 +2136,15 @@
22925 */
22926 if (free_possible) pfm_context_free(ctx);
22927
22928+ if (is_running_on_xen()) {
22929+ if (is_xenoprof_primary()) {
22930+ int ret = HYPERVISOR_perfmon_op(PFM_DESTROY_CONTEXT,
22931+ NULL, 0);
22932+ if (ret)
22933+ printk("%s:%d PFM_DESTROY_CONTEXT hypercall "
22934+ "failed\n", __func__, __LINE__);
22935+ }
22936+ }
22937 return 0;
22938 }
22939
22940@@ -2736,6 +2768,23 @@
22941 */
22942 pfm_reset_pmu_state(ctx);
22943
22944+ if (is_running_on_xen()) {
22945+ /*
22946+ * kludge to get xenoprof.is_primary.
22947+		 * XENOPROF_init/ia64 is a nop, so it is safe to call it here.
22948+ */
22949+ struct xenoprof_init init;
22950+ ret = HYPERVISOR_xenoprof_op(XENOPROF_init, &init);
22951+ if (ret)
22952+ goto buffer_error;
22953+ init_xenoprof_primary(init.is_primary);
22954+
22955+ if (is_xenoprof_primary()) {
22956+ ret = HYPERVISOR_perfmon_op(PFM_CREATE_CONTEXT, arg, 0);
22957+ if (ret)
22958+ goto buffer_error;
22959+ }
22960+ }
22961 return 0;
22962
22963 buffer_error:
22964@@ -2872,6 +2921,12 @@
22965 pfm_reg_check_t wr_func;
22966 #define PFM_CHECK_PMC_PM(x, y, z) ((x)->ctx_fl_system ^ PMC_PM(y, z))
22967
22968+ if (is_running_on_xen()) {
22969+ if (is_xenoprof_primary())
22970+ return HYPERVISOR_perfmon_op(PFM_WRITE_PMCS,
22971+ arg, count);
22972+ return 0;
22973+ }
22974 state = ctx->ctx_state;
22975 is_loaded = state == PFM_CTX_LOADED ? 1 : 0;
22976 is_system = ctx->ctx_fl_system;
22977@@ -3112,6 +3167,12 @@
22978 int ret = -EINVAL;
22979 pfm_reg_check_t wr_func;
22980
22981+ if (is_running_on_xen()) {
22982+ if (is_xenoprof_primary())
22983+ return HYPERVISOR_perfmon_op(PFM_WRITE_PMDS,
22984+ arg, count);
22985+ return 0;
22986+ }
22987
22988 state = ctx->ctx_state;
22989 is_loaded = state == PFM_CTX_LOADED ? 1 : 0;
22990@@ -3309,6 +3370,7 @@
22991 int is_loaded, is_system, is_counting, expert_mode;
22992 int ret = -EINVAL;
22993 pfm_reg_check_t rd_func;
22994+ XEN_NOT_SUPPORTED_YET;
22995
22996 /*
22997 * access is possible when loaded only for
22998@@ -3560,6 +3622,7 @@
22999 pfm_ovfl_ctrl_t rst_ctrl;
23000 int state, is_system;
23001 int ret = 0;
23002+ XEN_NOT_SUPPORTED_YET;
23003
23004 state = ctx->ctx_state;
23005 fmt = ctx->ctx_buf_fmt;
23006@@ -3709,6 +3772,7 @@
23007 pfm_debug(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs)
23008 {
23009 unsigned int m = *(unsigned int *)arg;
23010+ XEN_NOT_SUPPORTED_YET;
23011
23012 pfm_sysctl.debug = m == 0 ? 0 : 1;
23013
23014@@ -3979,6 +4043,8 @@
23015 {
23016 pfarg_features_t *req = (pfarg_features_t *)arg;
23017
23018+ if (is_running_on_xen())
23019+ return HYPERVISOR_perfmon_op(PFM_GET_FEATURES, &arg, 0);
23020 req->ft_version = PFM_VERSION;
23021 return 0;
23022 }
23023@@ -3990,6 +4056,12 @@
23024 struct task_struct *task = PFM_CTX_TASK(ctx);
23025 int state, is_system;
23026
23027+ if (is_running_on_xen()) {
23028+ if (is_xenoprof_primary())
23029+ return HYPERVISOR_perfmon_op(PFM_STOP, NULL, 0);
23030+ return 0;
23031+ }
23032+
23033 state = ctx->ctx_state;
23034 is_system = ctx->ctx_fl_system;
23035
23036@@ -4078,6 +4150,11 @@
23037 struct pt_regs *tregs;
23038 int state, is_system;
23039
23040+ if (is_running_on_xen()) {
23041+ if (is_xenoprof_primary())
23042+ return HYPERVISOR_perfmon_op(PFM_START, NULL, 0);
23043+ return 0;
23044+ }
23045 state = ctx->ctx_state;
23046 is_system = ctx->ctx_fl_system;
23047
23048@@ -4160,6 +4237,7 @@
23049 unsigned int cnum;
23050 int i;
23051 int ret = -EINVAL;
23052+ XEN_NOT_SUPPORTED_YET;
23053
23054 for (i = 0; i < count; i++, req++) {
23055
23056@@ -4218,6 +4296,11 @@
23057 int ret = 0;
23058 int state, is_system, set_dbregs = 0;
23059
23060+ if (is_running_on_xen()) {
23061+ if (is_xenoprof_primary())
23062+ return HYPERVISOR_perfmon_op(PFM_LOAD_CONTEXT, arg, 0);
23063+ return 0;
23064+ }
23065 state = ctx->ctx_state;
23066 is_system = ctx->ctx_fl_system;
23067 /*
23068@@ -4466,6 +4549,12 @@
23069 int prev_state, is_system;
23070 int ret;
23071
23072+ if (is_running_on_xen()) {
23073+ if (is_xenoprof_primary())
23074+ return HYPERVISOR_perfmon_op(PFM_UNLOAD_CONTEXT,
23075+ NULL, 0);
23076+ return 0;
23077+ }
23078 DPRINT(("ctx_state=%d task [%d]\n", ctx->ctx_state, task ? task->pid : -1));
23079
23080 prev_state = ctx->ctx_state;
23081diff -Nur linux-2.6.16.33-noxen/arch/ia64/kernel/setup.c linux-2.6.16.33/arch/ia64/kernel/setup.c
23082--- linux-2.6.16.33-noxen/arch/ia64/kernel/setup.c 2006-11-22 18:06:31.000000000 +0000
23083+++ linux-2.6.16.33/arch/ia64/kernel/setup.c 2007-01-08 15:00:45.000000000 +0000
23084@@ -61,6 +61,11 @@
23085 #include <asm/system.h>
23086 #include <asm/unistd.h>
23087 #include <asm/system.h>
23088+#ifdef CONFIG_XEN
23089+#include <asm/hypervisor.h>
23090+#include <asm/xen/xencomm.h>
23091+#endif
23092+#include <linux/dma-mapping.h>
23093
23094 #if defined(CONFIG_SMP) && (IA64_CPU_SIZE > PAGE_SIZE)
23095 # error "struct cpuinfo_ia64 too big!"
23096@@ -71,6 +76,20 @@
23097 EXPORT_SYMBOL(__per_cpu_offset);
23098 #endif
23099
23100+#ifdef CONFIG_XEN
23101+static int
23102+xen_panic_event(struct notifier_block *this, unsigned long event, void *ptr)
23103+{
23104+ HYPERVISOR_shutdown(SHUTDOWN_crash);
23105+ /* we're never actually going to get here... */
23106+ return NOTIFY_DONE;
23107+}
23108+
23109+static struct notifier_block xen_panic_block = {
23110+ xen_panic_event, NULL, 0 /* try to go last */
23111+};
23112+#endif
23113+
23114 extern void ia64_setup_printk_clock(void);
23115
23116 DEFINE_PER_CPU(struct cpuinfo_ia64, cpu_info);
23117@@ -243,6 +262,14 @@
23118 rsvd_region[n].end = (unsigned long) ia64_imva(_end);
23119 n++;
23120
23121+#ifdef CONFIG_XEN
23122+ if (is_running_on_xen()) {
23123+ rsvd_region[n].start = (unsigned long)__va((HYPERVISOR_shared_info->arch.start_info_pfn << PAGE_SHIFT));
23124+ rsvd_region[n].end = rsvd_region[n].start + PAGE_SIZE;
23125+ n++;
23126+ }
23127+#endif
23128+
23129 #ifdef CONFIG_BLK_DEV_INITRD
23130 if (ia64_boot_param->initrd_start) {
23131 rsvd_region[n].start = (unsigned long)__va(ia64_boot_param->initrd_start);
23132@@ -260,6 +287,7 @@
23133 n++;
23134
23135 num_rsvd_regions = n;
23136+ BUG_ON(IA64_MAX_RSVD_REGIONS + 1 < n);
23137
23138 sort_regions(rsvd_region, num_rsvd_regions);
23139 }
23140@@ -333,6 +361,16 @@
23141 {
23142 int earlycons = 0;
23143
23144+#ifdef CONFIG_XEN
23145+#ifndef CONFIG_IA64_HP_SIM
23146+ if (is_running_on_xen()) {
23147+ extern struct console hpsim_cons;
23148+ hpsim_cons.flags |= CON_BOOT;
23149+ register_console(&hpsim_cons);
23150+ earlycons++;
23151+ }
23152+#endif
23153+#endif
23154 #ifdef CONFIG_SERIAL_SGI_L1_CONSOLE
23155 {
23156 extern int sn_serial_console_early_setup(void);
23157@@ -394,6 +432,17 @@
23158 {
23159 unw_init();
23160
23161+#ifdef CONFIG_XEN
23162+ if (is_running_on_xen()) {
23163+ /* Must be done before any hypercall. */
23164+ xencomm_init();
23165+
23166+ setup_xen_features();
23167+ /* Register a call for panic conditions. */
23168+ notifier_chain_register(&panic_notifier_list, &xen_panic_block);
23169+ }
23170+#endif
23171+
23172 ia64_patch_vtop((u64) __start___vtop_patchlist, (u64) __end___vtop_patchlist);
23173
23174 *cmdline_p = __va(ia64_boot_param->command_line);
23175@@ -490,7 +539,26 @@
23176 conswitchp = &vga_con;
23177 # endif
23178 }
23179+#ifdef CONFIG_XEN
23180+ if (is_running_on_xen()) {
23181+ shared_info_t *s = HYPERVISOR_shared_info;
23182+
23183+ xen_start_info = __va(s->arch.start_info_pfn << PAGE_SHIFT);
23184+
23185+ printk("Running on Xen! start_info_pfn=0x%lx nr_pages=%ld "
23186+ "flags=0x%x\n", s->arch.start_info_pfn,
23187+ xen_start_info->nr_pages, xen_start_info->flags);
23188+
23189+ if (!is_initial_xendomain()) {
23190+#if !defined(CONFIG_VT) || !defined(CONFIG_DUMMY_CONSOLE)
23191+ conswitchp = NULL;
23192+#endif
23193+ }
23194+ }
23195+ xencons_early_setup();
23196 #endif
23197+#endif
23198+
23199
23200 /* enable IA-64 Machine Check Abort Handling unless disabled */
23201 if (!strstr(saved_command_line, "nomca"))
23202@@ -498,6 +566,9 @@
23203
23204 platform_setup(cmdline_p);
23205 paging_init();
23206+#ifdef CONFIG_XEN
23207+ contiguous_bitmap_init(max_pfn);
23208+#endif
23209 }
23210
23211 /*
23212@@ -882,6 +953,15 @@
23213 /* size of physical stacked register partition plus 8 bytes: */
23214 __get_cpu_var(ia64_phys_stacked_size_p8) = num_phys_stacked*8 + 8;
23215 platform_cpu_init();
23216+
23217+#ifdef CONFIG_XEN
23218+	/* Needs to be moved into platform_cpu_init later */
23219+ if (is_running_on_xen()) {
23220+ extern void xen_smp_intr_init(void);
23221+ xen_smp_intr_init();
23222+ }
23223+#endif
23224+
23225 pm_idle = default_idle;
23226 }
23227
23228diff -Nur linux-2.6.16.33-noxen/arch/ia64/oprofile/Makefile linux-2.6.16.33/arch/ia64/oprofile/Makefile
23229--- linux-2.6.16.33-noxen/arch/ia64/oprofile/Makefile 2006-11-22 18:06:31.000000000 +0000
23230+++ linux-2.6.16.33/arch/ia64/oprofile/Makefile 2007-01-08 15:00:45.000000000 +0000
23231@@ -8,3 +8,7 @@
23232
23233 oprofile-y := $(DRIVER_OBJS) init.o backtrace.o
23234 oprofile-$(CONFIG_PERFMON) += perfmon.o
23235+ifeq ($(CONFIG_XEN), y)
23236+oprofile-$(CONFIG_PERFMON) += xenoprof.o \
23237+ ../../../drivers/xen/xenoprof/xenoprofile.o
23238+endif
23239diff -Nur linux-2.6.16.33-noxen/arch/ia64/oprofile/init.c linux-2.6.16.33/arch/ia64/oprofile/init.c
23240--- linux-2.6.16.33-noxen/arch/ia64/oprofile/init.c 2006-11-22 18:06:31.000000000 +0000
23241+++ linux-2.6.16.33/arch/ia64/oprofile/init.c 2007-01-08 15:00:45.000000000 +0000
23242@@ -11,6 +11,7 @@
23243 #include <linux/oprofile.h>
23244 #include <linux/init.h>
23245 #include <linux/errno.h>
23246+#include "oprofile_perfmon.h"
23247
23248 extern int perfmon_init(struct oprofile_operations * ops);
23249 extern void perfmon_exit(void);
23250@@ -20,6 +21,13 @@
23251 {
23252 int ret = -ENODEV;
23253
23254+ if (is_running_on_xen()) {
23255+ ret = xen_perfmon_init();
23256+ if (ret)
23257+ return ret;
23258+ return xenoprofile_init(ops);
23259+ }
23260+
23261 #ifdef CONFIG_PERFMON
23262 /* perfmon_init() can fail, but we have no way to report it */
23263 ret = perfmon_init(ops);
23264@@ -32,6 +40,12 @@
23265
23266 void oprofile_arch_exit(void)
23267 {
23268+ if (is_running_on_xen()) {
23269+ xenoprofile_exit();
23270+ xen_perfmon_exit();
23271+ return;
23272+ }
23273+
23274 #ifdef CONFIG_PERFMON
23275 perfmon_exit();
23276 #endif
23277diff -Nur linux-2.6.16.33-noxen/arch/ia64/oprofile/oprofile_perfmon.h linux-2.6.16.33/arch/ia64/oprofile/oprofile_perfmon.h
23278--- linux-2.6.16.33-noxen/arch/ia64/oprofile/oprofile_perfmon.h 1970-01-01 00:00:00.000000000 +0000
23279+++ linux-2.6.16.33/arch/ia64/oprofile/oprofile_perfmon.h 2007-01-08 15:00:45.000000000 +0000
23280@@ -0,0 +1,30 @@
23281+#ifndef OPROFILE_PERFMON_H
23282+#define OPROFILE_PERFMON_H
23283+
23284+#include <linux/config.h>
23285+
23286+#ifdef CONFIG_PERFMON
23287+int __perfmon_init(void);
23288+void __perfmon_exit(void);
23289+int perfmon_start(void);
23290+void perfmon_stop(void);
23291+#else
23292+#define __perfmon_init() (-ENOSYS)
23293+#define __perfmon_exit() do {} while (0)
23294+#endif /* CONFIG_PERFMON */
23295+
23296+#ifdef CONFIG_XEN
23297+#define STATIC_IF_NO_XEN /* nothing */
23298+#define xen_perfmon_init() __perfmon_init()
23299+#define xen_perfmon_exit() __perfmon_exit()
23300+extern int xenoprofile_init(struct oprofile_operations * ops);
23301+extern void xenoprofile_exit(void);
23302+#else
23303+#define STATIC_IF_NO_XEN static
23304+#define xen_perfmon_init() (-ENOSYS)
23305+#define xen_perfmon_exit() do {} while (0)
23306+#define xenoprofile_init() (-ENOSYS)
23307+#define xenoprofile_exit() do {} while (0)
23308+#endif /* CONFIG_XEN */
23309+
23310+#endif /* OPROFILE_PERFMON_H */
23311diff -Nur linux-2.6.16.33-noxen/arch/ia64/oprofile/perfmon.c linux-2.6.16.33/arch/ia64/oprofile/perfmon.c
23312--- linux-2.6.16.33-noxen/arch/ia64/oprofile/perfmon.c 2006-11-22 18:06:31.000000000 +0000
23313+++ linux-2.6.16.33/arch/ia64/oprofile/perfmon.c 2007-01-08 15:00:45.000000000 +0000
23314@@ -14,6 +14,7 @@
23315 #include <asm/perfmon.h>
23316 #include <asm/ptrace.h>
23317 #include <asm/errno.h>
23318+#include "oprofile_perfmon.h"
23319
23320 static int allow_ints;
23321
23322@@ -34,14 +35,16 @@
23323 }
23324
23325
23326-static int perfmon_start(void)
23327+STATIC_IF_NO_XEN
23328+int perfmon_start(void)
23329 {
23330 allow_ints = 1;
23331 return 0;
23332 }
23333
23334
23335-static void perfmon_stop(void)
23336+STATIC_IF_NO_XEN
23337+void perfmon_stop(void)
23338 {
23339 allow_ints = 0;
23340 }
23341@@ -76,16 +79,35 @@
23342
23343 static int using_perfmon;
23344
23345-int perfmon_init(struct oprofile_operations * ops)
23346+STATIC_IF_NO_XEN
23347+int __perfmon_init(void)
23348 {
23349 int ret = pfm_register_buffer_fmt(&oprofile_fmt);
23350 if (ret)
23351 return -ENODEV;
23352
23353+ using_perfmon = 1;
23354+ return 0;
23355+}
23356+
23357+STATIC_IF_NO_XEN
23358+void __perfmon_exit(void)
23359+{
23360+ if (!using_perfmon)
23361+ return;
23362+
23363+ pfm_unregister_buffer_fmt(oprofile_fmt.fmt_uuid);
23364+}
23365+
23366+int perfmon_init(struct oprofile_operations * ops)
23367+{
23368+ int ret = __perfmon_init();
23369+ if (ret)
23370+ return -ENODEV;
23371+
23372 ops->cpu_type = get_cpu_type();
23373 ops->start = perfmon_start;
23374 ops->stop = perfmon_stop;
23375- using_perfmon = 1;
23376 printk(KERN_INFO "oprofile: using perfmon.\n");
23377 return 0;
23378 }
23379@@ -93,8 +115,5 @@
23380
23381 void perfmon_exit(void)
23382 {
23383- if (!using_perfmon)
23384- return;
23385-
23386- pfm_unregister_buffer_fmt(oprofile_fmt.fmt_uuid);
23387+ __perfmon_exit();
23388 }
23389diff -Nur linux-2.6.16.33-noxen/arch/ia64/oprofile/xenoprof.c linux-2.6.16.33/arch/ia64/oprofile/xenoprof.c
23390--- linux-2.6.16.33-noxen/arch/ia64/oprofile/xenoprof.c 1970-01-01 00:00:00.000000000 +0000
23391+++ linux-2.6.16.33/arch/ia64/oprofile/xenoprof.c 2007-01-08 15:00:45.000000000 +0000
23392@@ -0,0 +1,142 @@
23393+/******************************************************************************
23394+ * xenoprof ia64 specific part
23395+ *
23396+ * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp>
23397+ * VA Linux Systems Japan K.K.
23398+ *
23399+ * This program is free software; you can redistribute it and/or modify
23400+ * it under the terms of the GNU General Public License as published by
23401+ * the Free Software Foundation; either version 2 of the License, or
23402+ * (at your option) any later version.
23403+ *
23404+ * This program is distributed in the hope that it will be useful,
23405+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
23406+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23407+ * GNU General Public License for more details.
23408+ *
23409+ * You should have received a copy of the GNU General Public License
23410+ * along with this program; if not, write to the Free Software
23411+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
23412+ *
23413+ */
23414+#include <linux/init.h>
23415+#include <linux/oprofile.h>
23416+#include <linux/ioport.h>
23417+
23418+#include <xen/driver_util.h>
23419+#include <xen/interface/xen.h>
23420+#include <xen/interface/xenoprof.h>
23421+#include <xen/xenoprof.h>
23422+
23423+#include "oprofile_perfmon.h"
23424+
23425+void __init xenoprof_arch_init_counter(struct xenoprof_init *init)
23426+{
23427+ init->num_events = 0; /* perfmon manages. */
23428+}
23429+
23430+void xenoprof_arch_counter(void)
23431+{
23432+ /* nothing. perfmon does. */
23433+}
23434+
23435+void xenoprof_arch_start(void)
23436+{
23437+ perfmon_start();
23438+}
23439+
23440+void xenoprof_arch_stop(void)
23441+{
23442+ perfmon_stop();
23443+}
23444+
23445+/* XXX move them to an appropriate header file. */
23446+struct resource* xen_ia64_allocate_resource(unsigned long size);
23447+void xen_ia64_release_resource(struct resource* res);
23448+void xen_ia64_unmap_resource(struct resource* res);
23449+
23450+struct resource*
23451+xenoprof_ia64_allocate_resource(int32_t max_samples)
23452+{
23453+ unsigned long bufsize;
23454+
23455+ /* XXX add hypercall to get bufsize? */
23456+ /* this value is taken from alloc_xenoprof_struct(). */
23457+#if 0
23458+ bufsize = NR_CPUS * (sizeof(struct xenoprof_buf) +
23459+ (max_samples - 1) * sizeof(struct event_log));
23460+ bufsize = PAGE_ALIGN(bufsize) + PAGE_SIZE;
23461+#else
23462+#define MAX_OPROF_SHARED_PAGES 32
23463+ bufsize = (MAX_OPROF_SHARED_PAGES + 1) * PAGE_SIZE;
23464+#endif
23465+ return xen_ia64_allocate_resource(bufsize);
23466+}
23467+
23468+void xenoprof_arch_unmap_shared_buffer(struct xenoprof_shared_buffer* sbuf)
23469+{
23470+ if (sbuf->buffer) {
23471+ xen_ia64_unmap_resource(sbuf->arch.res);
23472+ sbuf->buffer = NULL;
23473+ sbuf->arch.res = NULL;
23474+ }
23475+}
23476+
23477+int xenoprof_arch_map_shared_buffer(struct xenoprof_get_buffer* get_buffer,
23478+ struct xenoprof_shared_buffer* sbuf)
23479+{
23480+ int ret;
23481+ struct resource* res;
23482+
23483+ sbuf->buffer = NULL;
23484+ sbuf->arch.res = NULL;
23485+
23486+ res = xenoprof_ia64_allocate_resource(get_buffer->max_samples);
23487+ if (IS_ERR(res))
23488+ return PTR_ERR(res);
23489+
23490+ get_buffer->buf_gmaddr = res->start;
23491+
23492+ ret = HYPERVISOR_xenoprof_op(XENOPROF_get_buffer, get_buffer);
23493+ if (ret) {
23494+ xen_ia64_release_resource(res);
23495+ return ret;
23496+ }
23497+
23498+ BUG_ON((res->end - res->start + 1) <
23499+ get_buffer->bufsize * get_buffer->nbuf);
23500+
23501+ sbuf->buffer = __va(res->start);
23502+ sbuf->arch.res = res;
23503+
23504+ return ret;
23505+}
23506+
23507+int xenoprof_arch_set_passive(struct xenoprof_passive* pdomain,
23508+ struct xenoprof_shared_buffer* sbuf)
23509+{
23510+ int ret;
23511+ struct resource* res;
23512+
23513+ sbuf->buffer = NULL;
23514+ sbuf->arch.res = NULL;
23515+
23516+ res = xenoprof_ia64_allocate_resource(pdomain->max_samples);
23517+ if (IS_ERR(res))
23518+ return PTR_ERR(res);
23519+
23520+ pdomain->buf_gmaddr = res->start;
23521+
23522+ ret = HYPERVISOR_xenoprof_op(XENOPROF_set_passive, pdomain);
23523+ if (ret) {
23524+ xen_ia64_release_resource(res);
23525+ return ret;
23526+ }
23527+
23528+ BUG_ON((res->end - res->start + 1) < pdomain->bufsize * pdomain->nbuf);
23529+
23530+ sbuf->buffer = __va(res->start);
23531+ sbuf->arch.res = res;
23532+
23533+ return ret;
23534+}
23535diff -Nur linux-2.6.16.33-noxen/arch/ia64/xen/Makefile linux-2.6.16.33/arch/ia64/xen/Makefile
23536--- linux-2.6.16.33-noxen/arch/ia64/xen/Makefile 1970-01-01 00:00:00.000000000 +0000
23537+++ linux-2.6.16.33/arch/ia64/xen/Makefile 2007-01-08 15:00:45.000000000 +0000
23538@@ -0,0 +1,9 @@
23539+#
23540+# Makefile for Xen components
23541+#
23542+
23543+obj-y := hypercall.o xenivt.o xenentry.o xensetup.o xenpal.o xenhpski.o \
23544+ hypervisor.o pci-dma-xen.o util.o xencomm.o xcom_hcall.o \
23545+ xcom_mini.o xcom_privcmd.o mem.o
23546+
23547+pci-dma-xen-y := ../../i386/kernel/pci-dma-xen.o
23548diff -Nur linux-2.6.16.33-noxen/arch/ia64/xen/hypercall.S linux-2.6.16.33/arch/ia64/xen/hypercall.S
23549--- linux-2.6.16.33-noxen/arch/ia64/xen/hypercall.S 1970-01-01 00:00:00.000000000 +0000
23550+++ linux-2.6.16.33/arch/ia64/xen/hypercall.S 2007-01-08 15:00:45.000000000 +0000
23551@@ -0,0 +1,412 @@
23552+/*
23553+ * Support routines for Xen hypercalls
23554+ *
23555+ * Copyright (C) 2005 Dan Magenheimer <dan.magenheimer@hp.com>
23556+ */
23557+
23558+#include <linux/config.h>
23559+#include <asm/processor.h>
23560+#include <asm/asmmacro.h>
23561+
23562+/* To clear vpsr.ic, vpsr.i needs to be cleared first */
23563+#define XEN_CLEAR_PSR_IC \
23564+ mov r14=1; \
23565+ movl r15=XSI_PSR_I_ADDR; \
23566+ movl r2=XSI_PSR_IC; \
23567+ ;; \
23568+ ld8 r15=[r15]; \
23569+ ld4 r3=[r2]; \
23570+ ;; \
23571+ ld1 r16=[r15]; \
23572+ ;; \
23573+ st1 [r15]=r14; \
23574+ st4 [r2]=r0; \
23575+ ;;
23576+
23577+/* First restore vpsr.ic, and then vpsr.i */
23578+#define XEN_RESTORE_PSR_IC \
23579+ st4 [r2]=r3; \
23580+ st1 [r15]=r16; \
23581+ ;;
23582+
23583+GLOBAL_ENTRY(xen_get_ivr)
23584+ movl r8=running_on_xen;;
23585+ ld4 r8=[r8];;
23586+ cmp.eq p7,p0=r8,r0;;
23587+(p7) mov r8=cr.ivr;;
23588+(p7) br.ret.sptk.many rp
23589+ ;;
23590+ XEN_CLEAR_PSR_IC
23591+ ;;
23592+ XEN_HYPER_GET_IVR
23593+ ;;
23594+ XEN_RESTORE_PSR_IC
23595+ ;;
23596+ br.ret.sptk.many rp
23597+ ;;
23598+END(xen_get_ivr)
23599+
23600+GLOBAL_ENTRY(xen_get_tpr)
23601+ movl r8=running_on_xen;;
23602+ ld4 r8=[r8];;
23603+ cmp.eq p7,p0=r8,r0;;
23604+(p7) mov r8=cr.tpr;;
23605+(p7) br.ret.sptk.many rp
23606+ ;;
23607+ XEN_CLEAR_PSR_IC
23608+ ;;
23609+ XEN_HYPER_GET_TPR
23610+ ;;
23611+ XEN_RESTORE_PSR_IC
23612+ ;;
23613+ br.ret.sptk.many rp
23614+ ;;
23615+END(xen_get_tpr)
23616+
23617+GLOBAL_ENTRY(xen_set_tpr)
23618+ movl r8=running_on_xen;;
23619+ ld4 r8=[r8];;
23620+ cmp.eq p7,p0=r8,r0;;
23621+(p7) mov cr.tpr=r32;;
23622+(p7) br.ret.sptk.many rp
23623+ ;;
23624+ mov r8=r32
23625+ ;;
23626+ XEN_CLEAR_PSR_IC
23627+ ;;
23628+ XEN_HYPER_SET_TPR
23629+ ;;
23630+ XEN_RESTORE_PSR_IC
23631+ ;;
23632+ br.ret.sptk.many rp
23633+ ;;
23634+END(xen_set_tpr)
23635+
23636+GLOBAL_ENTRY(xen_eoi)
23637+ movl r8=running_on_xen;;
23638+ ld4 r8=[r8];;
23639+ cmp.eq p7,p0=r8,r0;;
23640+(p7) mov cr.eoi=r0;;
23641+(p7) br.ret.sptk.many rp
23642+ ;;
23643+ mov r8=r32
23644+ ;;
23645+ XEN_CLEAR_PSR_IC
23646+ ;;
23647+ XEN_HYPER_EOI
23648+ ;;
23649+ XEN_RESTORE_PSR_IC
23650+ ;;
23651+ br.ret.sptk.many rp
23652+ ;;
23653+END(xen_eoi)
23654+
23655+GLOBAL_ENTRY(xen_thash)
23656+ movl r8=running_on_xen;;
23657+ ld4 r8=[r8];;
23658+ cmp.eq p7,p0=r8,r0;;
23659+(p7) thash r8=r32;;
23660+(p7) br.ret.sptk.many rp
23661+ ;;
23662+ mov r8=r32
23663+ ;;
23664+ XEN_CLEAR_PSR_IC
23665+ ;;
23666+ XEN_HYPER_THASH
23667+ ;;
23668+ XEN_RESTORE_PSR_IC
23669+ ;;
23670+ br.ret.sptk.many rp
23671+ ;;
23672+END(xen_thash)
23673+
23674+GLOBAL_ENTRY(xen_set_itm)
23675+ movl r8=running_on_xen;;
23676+ ld4 r8=[r8];;
23677+ cmp.eq p7,p0=r8,r0;;
23678+(p7) mov cr.itm=r32;;
23679+(p7) br.ret.sptk.many rp
23680+ ;;
23681+ mov r8=r32
23682+ ;;
23683+ XEN_CLEAR_PSR_IC
23684+ ;;
23685+ XEN_HYPER_SET_ITM
23686+ ;;
23687+ XEN_RESTORE_PSR_IC
23688+ ;;
23689+ br.ret.sptk.many rp
23690+ ;;
23691+END(xen_set_itm)
23692+
23693+GLOBAL_ENTRY(xen_ptcga)
23694+ movl r8=running_on_xen;;
23695+ ld4 r8=[r8];;
23696+ cmp.eq p7,p0=r8,r0;;
23697+(p7) ptc.ga r32,r33;;
23698+(p7) br.ret.sptk.many rp
23699+ ;;
23700+ mov r8=r32
23701+ mov r9=r33
23702+ ;;
23703+ XEN_CLEAR_PSR_IC
23704+ ;;
23705+ XEN_HYPER_PTC_GA
23706+ ;;
23707+ XEN_RESTORE_PSR_IC
23708+ ;;
23709+ br.ret.sptk.many rp
23710+ ;;
23711+END(xen_ptcga)
23712+
23713+GLOBAL_ENTRY(xen_get_rr)
23714+ movl r8=running_on_xen;;
23715+ ld4 r8=[r8];;
23716+ cmp.eq p7,p0=r8,r0;;
23717+(p7) mov r8=rr[r32];;
23718+(p7) br.ret.sptk.many rp
23719+ ;;
23720+ mov r8=r32
23721+ ;;
23722+ XEN_CLEAR_PSR_IC
23723+ ;;
23724+ XEN_HYPER_GET_RR
23725+ ;;
23726+ XEN_RESTORE_PSR_IC
23727+ ;;
23728+ br.ret.sptk.many rp
23729+ ;;
23730+END(xen_get_rr)
23731+
23732+GLOBAL_ENTRY(xen_set_rr)
23733+ movl r8=running_on_xen;;
23734+ ld4 r8=[r8];;
23735+ cmp.eq p7,p0=r8,r0;;
23736+(p7) mov rr[r32]=r33;;
23737+(p7) br.ret.sptk.many rp
23738+ ;;
23739+ mov r8=r32
23740+ mov r9=r33
23741+ ;;
23742+ XEN_CLEAR_PSR_IC
23743+ ;;
23744+ XEN_HYPER_SET_RR
23745+ ;;
23746+ XEN_RESTORE_PSR_IC
23747+ ;;
23748+ br.ret.sptk.many rp
23749+ ;;
23750+END(xen_set_rr)
23751+
23752+GLOBAL_ENTRY(xen_set_kr)
23753+ movl r8=running_on_xen;;
23754+ ld4 r8=[r8];;
23755+ cmp.ne p7,p0=r8,r0;;
23756+(p7) br.cond.spnt.few 1f;
23757+ ;;
23758+ cmp.eq p7,p0=r8,r0
23759+ adds r8=-1,r8;;
23760+(p7) mov ar0=r9
23761+(p7) br.ret.sptk.many rp;;
23762+ cmp.eq p7,p0=r8,r0
23763+ adds r8=-1,r8;;
23764+(p7) mov ar1=r9
23765+(p7) br.ret.sptk.many rp;;
23766+ cmp.eq p7,p0=r8,r0
23767+ adds r8=-1,r8;;
23768+(p7) mov ar2=r9
23769+(p7) br.ret.sptk.many rp;;
23770+ cmp.eq p7,p0=r8,r0
23771+ adds r8=-1,r8;;
23772+(p7) mov ar3=r9
23773+(p7) br.ret.sptk.many rp;;
23774+ cmp.eq p7,p0=r8,r0
23775+ adds r8=-1,r8;;
23776+(p7) mov ar4=r9
23777+(p7) br.ret.sptk.many rp;;
23778+ cmp.eq p7,p0=r8,r0
23779+ adds r8=-1,r8;;
23780+(p7) mov ar5=r9
23781+(p7) br.ret.sptk.many rp;;
23782+ cmp.eq p7,p0=r8,r0
23783+ adds r8=-1,r8;;
23784+(p7) mov ar6=r9
23785+(p7) br.ret.sptk.many rp;;
23786+ cmp.eq p7,p0=r8,r0
23787+ adds r8=-1,r8;;
23788+(p7) mov ar7=r9
23789+(p7) br.ret.sptk.many rp;;
23790+
23791+1: mov r8=r32
23792+ mov r9=r33
23793+ ;;
23794+ XEN_CLEAR_PSR_IC
23795+ ;;
23796+ XEN_HYPER_SET_KR
23797+ ;;
23798+ XEN_RESTORE_PSR_IC
23799+ ;;
23800+ br.ret.sptk.many rp
23801+END(xen_set_kr)
23802+
23803+GLOBAL_ENTRY(xen_fc)
23804+ movl r8=running_on_xen;;
23805+ ld4 r8=[r8];;
23806+ cmp.eq p7,p0=r8,r0;;
23807+(p7) fc r32;;
23808+(p7) br.ret.sptk.many rp
23809+ ;;
23810+ mov r8=r32
23811+ ;;
23812+ XEN_CLEAR_PSR_IC
23813+ ;;
23814+ XEN_HYPER_FC
23815+ ;;
23816+ XEN_RESTORE_PSR_IC
23817+ ;;
23818+ br.ret.sptk.many rp
23819+END(xen_fc)
23820+
23821+GLOBAL_ENTRY(xen_get_cpuid)
23822+ movl r8=running_on_xen;;
23823+ ld4 r8=[r8];;
23824+ cmp.eq p7,p0=r8,r0;;
23825+(p7) mov r8=cpuid[r32];;
23826+(p7) br.ret.sptk.many rp
23827+ ;;
23828+ mov r8=r32
23829+ ;;
23830+ XEN_CLEAR_PSR_IC
23831+ ;;
23832+ XEN_HYPER_GET_CPUID
23833+ ;;
23834+ XEN_RESTORE_PSR_IC
23835+ ;;
23836+ br.ret.sptk.many rp
23837+END(xen_get_cpuid)
23838+
23839+GLOBAL_ENTRY(xen_get_pmd)
23840+ movl r8=running_on_xen;;
23841+ ld4 r8=[r8];;
23842+ cmp.eq p7,p0=r8,r0;;
23843+(p7) mov r8=pmd[r32];;
23844+(p7) br.ret.sptk.many rp
23845+ ;;
23846+ mov r8=r32
23847+ ;;
23848+ XEN_CLEAR_PSR_IC
23849+ ;;
23850+ XEN_HYPER_GET_PMD
23851+ ;;
23852+ XEN_RESTORE_PSR_IC
23853+ ;;
23854+ br.ret.sptk.many rp
23855+END(xen_get_pmd)
23856+
23857+#ifdef CONFIG_IA32_SUPPORT
23858+GLOBAL_ENTRY(xen_get_eflag)
23859+ movl r8=running_on_xen;;
23860+ ld4 r8=[r8];;
23861+ cmp.eq p7,p0=r8,r0;;
23862+(p7) mov r8=ar24;;
23863+(p7) br.ret.sptk.many rp
23864+ ;;
23865+ mov r8=r32
23866+ ;;
23867+ XEN_CLEAR_PSR_IC
23868+ ;;
23869+ XEN_HYPER_GET_EFLAG
23870+ ;;
23871+ XEN_RESTORE_PSR_IC
23872+ ;;
23873+ br.ret.sptk.many rp
23874+END(xen_get_eflag)
23875+
23876+// some bits aren't set if pl!=0, see SDM vol1 3.1.8
23877+GLOBAL_ENTRY(xen_set_eflag)
23878+ movl r8=running_on_xen;;
23879+ ld4 r8=[r8];;
23880+ cmp.eq p7,p0=r8,r0;;
23881+(p7) mov ar24=r32
23882+(p7) br.ret.sptk.many rp
23883+ ;;
23884+ mov r8=r32
23885+ ;;
23886+ XEN_CLEAR_PSR_IC
23887+ ;;
23888+ XEN_HYPER_SET_EFLAG
23889+ ;;
23890+ XEN_RESTORE_PSR_IC
23891+ ;;
23892+ br.ret.sptk.many rp
23893+END(xen_set_eflag)
23894+#endif
23895+
23896+GLOBAL_ENTRY(xen_send_ipi)
23897+ mov r14=r32
23898+ mov r15=r33
23899+ mov r2=0x400
23900+ break 0x1000
23901+ ;;
23902+ br.ret.sptk.many rp
23903+ ;;
23904+END(xen_send_ipi)
23905+
23906+#ifdef CONFIG_XEN_IA64_VDSO_PARAVIRT
23907+// These routines are specialized for the vdso.
23908+// In fsys mode, call and ret can't be used.
23909+GLOBAL_ENTRY(xen_rsm_be_i)
23910+ st1 [r22]=r20
23911+ st4 [r23]=r0
23912+ XEN_HYPER_RSM_BE
23913+ st4 [r23]=r20
23914+ brl.cond.sptk .vdso_rsm_be_i_ret
23915+ ;;
23916+END(xen_rsm_be_i)
23917+
23918+GLOBAL_ENTRY(xen_get_psr)
23919+ mov r31=r8
23920+ mov r25=IA64_PSR_IC
23921+ st4 [r23]=r0
23922+ XEN_HYPER_GET_PSR
23923+ ;;
23924+ st4 [r23]=r20
23925+ or r29=r8,r25 // vpsr.ic was cleared for hyperprivop
23926+ mov r8=r31
23927+ brl.cond.sptk .vdso_get_psr_ret
23928+ ;;
23929+END(xen_get_psr)
23930+
23931+ // see xen_ssm_i() in privop.h
23932+ // r22 = &vcpu->vcpu_info->evtchn_upcall_mask
23933+ // r23 = &vpsr.ic
23934+ // r24 = &vcpu->vcpu_info->evtchn_upcall_pending
23935+ // r25 = tmp
23936+ // r31 = tmp
23937+ // p11 = tmp
23938+ // p14 = tmp
23939+#define XEN_SET_PSR_I \
23940+ ld1 r31=[r22]; \
23941+ ld1 r25=[r24]; \
23942+ ;; \
23943+ st1 [r22]=r0; \
23944+ cmp.ne.unc p14,p0=r0,r31; \
23945+ ;; \
23946+(p14) cmp.ne.unc p11,p0=r0,r25; \
23947+ ;; \
23948+(p11) st1 [r22]=r20; \
23949+(p11) st4 [r23]=r0; \
23950+(p11) XEN_HYPER_SSM_I;
23951+
23952+GLOBAL_ENTRY(xen_ssm_i_0)
23953+ XEN_SET_PSR_I
23954+ brl.cond.sptk .vdso_ssm_i_0_ret
23955+ ;;
23956+END(xen_ssm_i_0)
23957+
23958+GLOBAL_ENTRY(xen_ssm_i_1)
23959+ XEN_SET_PSR_I
23960+ brl.cond.sptk .vdso_ssm_i_1_ret
23961+ ;;
23962+END(xen_ssm_i_1)
23963+#endif
23964diff -Nur linux-2.6.16.33-noxen/arch/ia64/xen/hypervisor.c linux-2.6.16.33/arch/ia64/xen/hypervisor.c
23965--- linux-2.6.16.33-noxen/arch/ia64/xen/hypervisor.c 1970-01-01 00:00:00.000000000 +0000
23966+++ linux-2.6.16.33/arch/ia64/xen/hypervisor.c 2007-01-08 15:00:45.000000000 +0000
23967@@ -0,0 +1,1104 @@
23968+/******************************************************************************
23969+ * arch/ia64/xen/hypervisor.c
23970+ *
23971+ * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp>
23972+ * VA Linux Systems Japan K.K.
23973+ *
23974+ * This program is free software; you can redistribute it and/or modify
23975+ * it under the terms of the GNU General Public License as published by
23976+ * the Free Software Foundation; either version 2 of the License, or
23977+ * (at your option) any later version.
23978+ *
23979+ * This program is distributed in the hope that it will be useful,
23980+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
23981+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23982+ * GNU General Public License for more details.
23983+ *
23984+ * You should have received a copy of the GNU General Public License
23985+ * along with this program; if not, write to the Free Software
23986+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
23987+ *
23988+ */
23989+
23990+//#include <linux/kernel.h>
23991+#include <linux/spinlock.h>
23992+#include <linux/bootmem.h>
23993+#include <linux/module.h>
23994+#include <linux/vmalloc.h>
23995+#include <asm/page.h>
23996+#include <asm/hypervisor.h>
23997+#include <asm/hypercall.h>
23998+#include <xen/interface/memory.h>
23999+#include <xen/balloon.h>
24000+
24001+shared_info_t *HYPERVISOR_shared_info = (shared_info_t *)XSI_BASE;
24002+EXPORT_SYMBOL(HYPERVISOR_shared_info);
24003+
24004+start_info_t *xen_start_info;
24005+EXPORT_SYMBOL(xen_start_info);
24006+
24007+int running_on_xen;
24008+EXPORT_SYMBOL(running_on_xen);
24009+
24010+#ifdef CONFIG_XEN_IA64_EXPOSE_P2M
24011+static int p2m_expose_init(void);
24012+#else
24013+#define p2m_expose_init() (-ENOSYS)
24014+#endif
24015+
24016+//XXX same as i386, x86_64 contiguous_bitmap_set(), contiguous_bitmap_clear()
24017+// move those to lib/contiguous_bitmap?
24018+//XXX discontigmem/sparsemem
24019+
24020+/*
24021+ * Bitmap is indexed by page number. If bit is set, the page is part of a
24022+ * xen_create_contiguous_region() area of memory.
24023+ */
24024+unsigned long *contiguous_bitmap;
24025+
24026+void
24027+contiguous_bitmap_init(unsigned long end_pfn)
24028+{
24029+ unsigned long size = (end_pfn + 2 * BITS_PER_LONG) >> 3;
24030+ contiguous_bitmap = alloc_bootmem_low_pages(size);
24031+ BUG_ON(!contiguous_bitmap);
24032+ memset(contiguous_bitmap, 0, size);
24033+}
24034+
24035+#if 0
24036+int
24037+contiguous_bitmap_test(void* p)
24038+{
24039+ return test_bit(__pa(p) >> PAGE_SHIFT, contiguous_bitmap);
24040+}
24041+#endif
24042+
24043+static void contiguous_bitmap_set(
24044+ unsigned long first_page, unsigned long nr_pages)
24045+{
24046+ unsigned long start_off, end_off, curr_idx, end_idx;
24047+
24048+ curr_idx = first_page / BITS_PER_LONG;
24049+ start_off = first_page & (BITS_PER_LONG-1);
24050+ end_idx = (first_page + nr_pages) / BITS_PER_LONG;
24051+ end_off = (first_page + nr_pages) & (BITS_PER_LONG-1);
24052+
24053+ if (curr_idx == end_idx) {
24054+ contiguous_bitmap[curr_idx] |=
24055+ ((1UL<<end_off)-1) & -(1UL<<start_off);
24056+ } else {
24057+ contiguous_bitmap[curr_idx] |= -(1UL<<start_off);
24058+ while ( ++curr_idx < end_idx )
24059+ contiguous_bitmap[curr_idx] = ~0UL;
24060+ contiguous_bitmap[curr_idx] |= (1UL<<end_off)-1;
24061+ }
24062+}
24063+
24064+static void contiguous_bitmap_clear(
24065+ unsigned long first_page, unsigned long nr_pages)
24066+{
24067+ unsigned long start_off, end_off, curr_idx, end_idx;
24068+
24069+ curr_idx = first_page / BITS_PER_LONG;
24070+ start_off = first_page & (BITS_PER_LONG-1);
24071+ end_idx = (first_page + nr_pages) / BITS_PER_LONG;
24072+ end_off = (first_page + nr_pages) & (BITS_PER_LONG-1);
24073+
24074+ if (curr_idx == end_idx) {
24075+ contiguous_bitmap[curr_idx] &=
24076+ -(1UL<<end_off) | ((1UL<<start_off)-1);
24077+ } else {
24078+ contiguous_bitmap[curr_idx] &= (1UL<<start_off)-1;
24079+ while ( ++curr_idx != end_idx )
24080+ contiguous_bitmap[curr_idx] = 0;
24081+ contiguous_bitmap[curr_idx] &= -(1UL<<end_off);
24082+ }
24083+}
24084+
24085+// __xen_create_contiguous_region(), __xen_destroy_contiguous_region()
24086+// are based on i386 xen_create_contiguous_region(),
24087+// xen_destroy_contiguous_region()
24088+
24089+/* Protected by balloon_lock. */
24090+#define MAX_CONTIG_ORDER 7
24091+static unsigned long discontig_frames[1<<MAX_CONTIG_ORDER];
24092+
24093+/* Ensure multi-page extents are contiguous in machine memory. */
24094+int
24095+__xen_create_contiguous_region(unsigned long vstart,
24096+ unsigned int order, unsigned int address_bits)
24097+{
24098+ unsigned long error = 0;
24099+ unsigned long gphys = __pa(vstart);
24100+ unsigned long start_gpfn = gphys >> PAGE_SHIFT;
24101+ unsigned long num_gpfn = 1 << order;
24102+ unsigned long i;
24103+ unsigned long flags;
24104+
24105+ unsigned long *in_frames = discontig_frames, out_frame;
24106+ int success;
24107+ struct xen_memory_exchange exchange = {
24108+ .in = {
24109+ .nr_extents = num_gpfn,
24110+ .extent_order = 0,
24111+ .domid = DOMID_SELF
24112+ },
24113+ .out = {
24114+ .nr_extents = 1,
24115+ .extent_order = order,
24116+ .address_bits = address_bits,
24117+ .domid = DOMID_SELF
24118+ },
24119+ .nr_exchanged = 0
24120+ };
24121+
24122+ if (unlikely(order > MAX_CONTIG_ORDER))
24123+ return -ENOMEM;
24124+
24125+ set_xen_guest_handle(exchange.in.extent_start, in_frames);
24126+ set_xen_guest_handle(exchange.out.extent_start, &out_frame);
24127+
24128+ scrub_pages(vstart, num_gpfn);
24129+
24130+ balloon_lock(flags);
24131+
24132+ /* Get a new contiguous memory extent. */
24133+ for (i = 0; i < num_gpfn; i++) {
24134+ in_frames[i] = start_gpfn + i;
24135+ }
24136+ out_frame = start_gpfn;
24137+ error = HYPERVISOR_memory_op(XENMEM_exchange, &exchange);
24138+ success = (exchange.nr_exchanged == num_gpfn);
24139+ BUG_ON(!success && ((exchange.nr_exchanged != 0) || (error == 0)));
24140+ BUG_ON(success && (error != 0));
24141+ if (unlikely(error == -ENOSYS)) {
24142+ /* Compatibility when XENMEM_exchange is unsupported. */
24143+ error = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
24144+ &exchange.in);
24145+ BUG_ON(error != num_gpfn);
24146+ error = HYPERVISOR_memory_op(XENMEM_populate_physmap,
24147+ &exchange.out);
24148+ if (error != 1) {
24149+ /* Couldn't get special memory: fall back to normal. */
24150+ for (i = 0; i < num_gpfn; i++) {
24151+ in_frames[i] = start_gpfn + i;
24152+ }
24153+ error = HYPERVISOR_memory_op(XENMEM_populate_physmap,
24154+ &exchange.in);
24155+ BUG_ON(error != num_gpfn);
24156+ success = 0;
24157+ } else
24158+ success = 1;
24159+ }
24160+ if (success)
24161+ contiguous_bitmap_set(start_gpfn, num_gpfn);
24162+#if 0
24163+ if (success) {
24164+ unsigned long mfn;
24165+ unsigned long mfn_prev = ~0UL;
24166+ for (i = 0; i < num_gpfn; i++) {
24167+ mfn = pfn_to_mfn_for_dma(start_gpfn + i);
24168+ if (mfn_prev != ~0UL && mfn != mfn_prev + 1) {
24169+ xprintk("\n");
24170+ xprintk("%s:%d order %d "
24171+ "start 0x%lx bus 0x%lx "
24172+ "machine 0x%lx\n",
24173+ __func__, __LINE__, order,
24174+ vstart, virt_to_bus((void*)vstart),
24175+ phys_to_machine_for_dma(gphys));
24176+ xprintk("mfn: ");
24177+ for (i = 0; i < num_gpfn; i++) {
24178+ mfn = pfn_to_mfn_for_dma(
24179+ start_gpfn + i);
24180+ xprintk("0x%lx ", mfn);
24181+ }
24182+ xprintk("\n");
24183+ break;
24184+ }
24185+ mfn_prev = mfn;
24186+ }
24187+ }
24188+#endif
24189+ balloon_unlock(flags);
24190+ return success? 0: -ENOMEM;
24191+}
24192+
24193+void
24194+__xen_destroy_contiguous_region(unsigned long vstart, unsigned int order)
24195+{
24196+ unsigned long flags;
24197+ unsigned long error = 0;
24198+ unsigned long start_gpfn = __pa(vstart) >> PAGE_SHIFT;
24199+ unsigned long num_gpfn = 1UL << order;
24200+ unsigned long i;
24201+
24202+ unsigned long *out_frames = discontig_frames, in_frame;
24203+ int success;
24204+ struct xen_memory_exchange exchange = {
24205+ .in = {
24206+ .nr_extents = 1,
24207+ .extent_order = order,
24208+ .domid = DOMID_SELF
24209+ },
24210+ .out = {
24211+ .nr_extents = num_gpfn,
24212+ .extent_order = 0,
24213+ .address_bits = 0,
24214+ .domid = DOMID_SELF
24215+ },
24216+ .nr_exchanged = 0
24217+ };
24218+
24219+
24220+ if (!test_bit(start_gpfn, contiguous_bitmap))
24221+ return;
24222+
24223+ if (unlikely(order > MAX_CONTIG_ORDER))
24224+ return;
24225+
24226+ set_xen_guest_handle(exchange.in.extent_start, &in_frame);
24227+ set_xen_guest_handle(exchange.out.extent_start, out_frames);
24228+
24229+ scrub_pages(vstart, num_gpfn);
24230+
24231+ balloon_lock(flags);
24232+
24233+ contiguous_bitmap_clear(start_gpfn, num_gpfn);
24234+
24235+ /* Do the exchange for non-contiguous MFNs. */
24236+ in_frame = start_gpfn;
24237+ for (i = 0; i < num_gpfn; i++) {
24238+ out_frames[i] = start_gpfn + i;
24239+ }
24240+ error = HYPERVISOR_memory_op(XENMEM_exchange, &exchange);
24241+ success = (exchange.nr_exchanged == 1);
24242+ BUG_ON(!success && ((exchange.nr_exchanged != 0) || (error == 0)));
24243+ BUG_ON(success && (error != 0));
24244+ if (unlikely(error == -ENOSYS)) {
24245+ /* Compatibility when XENMEM_exchange is unsupported. */
24246+ error = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
24247+ &exchange.in);
24248+ BUG_ON(error != 1);
24249+
24250+ error = HYPERVISOR_memory_op(XENMEM_populate_physmap,
24251+ &exchange.out);
24252+ BUG_ON(error != num_gpfn);
24253+ }
24254+ balloon_unlock(flags);
24255+}
24256+
24257+
24258+///////////////////////////////////////////////////////////////////////////
24259+// grant table hack
24260+// cmd: GNTTABOP_xxx
24261+
24262+#include <linux/mm.h>
24263+#include <xen/interface/xen.h>
24264+#include <xen/gnttab.h>
24265+
24266+static void
24267+gnttab_map_grant_ref_pre(struct gnttab_map_grant_ref *uop)
24268+{
24269+ uint32_t flags;
24270+
24271+ flags = uop->flags;
24272+
24273+ if (flags & GNTMAP_host_map) {
24274+ if (flags & GNTMAP_application_map) {
24275+ xprintd("GNTMAP_application_map is not supported yet: flags 0x%x\n", flags);
24276+ BUG();
24277+ }
24278+ if (flags & GNTMAP_contains_pte) {
24279+ xprintd("GNTMAP_contains_pte is not supported yet flags 0x%x\n", flags);
24280+ BUG();
24281+ }
24282+ } else if (flags & GNTMAP_device_map) {
24283+ xprintd("GNTMAP_device_map is not supported yet 0x%x\n", flags);
24284+ BUG();//XXX not yet. actually this flag is not used.
24285+ } else {
24286+ BUG();
24287+ }
24288+}
24289+
24290+int
24291+HYPERVISOR_grant_table_op(unsigned int cmd, void *uop, unsigned int count)
24292+{
24293+ if (cmd == GNTTABOP_map_grant_ref) {
24294+ unsigned int i;
24295+ for (i = 0; i < count; i++) {
24296+ gnttab_map_grant_ref_pre(
24297+ (struct gnttab_map_grant_ref*)uop + i);
24298+ }
24299+ }
24300+ return xencomm_mini_hypercall_grant_table_op(cmd, uop, count);
24301+}
24302+EXPORT_SYMBOL(HYPERVISOR_grant_table_op);
24303+
24304+///////////////////////////////////////////////////////////////////////////
24305+// PageForeign(), SetPageForeign(), ClearPageForeign()
24306+
24307+struct address_space xen_ia64_foreign_dummy_mapping;
24308+EXPORT_SYMBOL(xen_ia64_foreign_dummy_mapping);
24309+
24310+///////////////////////////////////////////////////////////////////////////
24311+// foreign mapping
24312+#include <linux/efi.h>
24313+#include <asm/meminit.h> // for IA64_GRANULE_SIZE, GRANULEROUND{UP,DOWN}()
24314+
24315+static unsigned long privcmd_resource_min = 0;
24316+// Xen/ia64 can currently handle pseudo-physical addresses of up to
24317+// (PAGE_SHIFT * 3) bits.
24318+static unsigned long privcmd_resource_max = GRANULEROUNDDOWN((1UL << (PAGE_SHIFT * 3)) - 1);
24319+static unsigned long privcmd_resource_align = IA64_GRANULE_SIZE;
24320+
24321+static unsigned long
24322+md_end_addr(const efi_memory_desc_t *md)
24323+{
24324+ return md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT);
24325+}
24326+
24327+#define XEN_IA64_PRIVCMD_LEAST_GAP_SIZE (1024 * 1024 * 1024UL)
24328+static int
24329+xen_ia64_privcmd_check_size(unsigned long start, unsigned long end)
24330+{
24331+ return (start < end &&
24332+ (end - start) > XEN_IA64_PRIVCMD_LEAST_GAP_SIZE);
24333+}
24334+
24335+static int __init
24336+xen_ia64_privcmd_init(void)
24337+{
24338+ void *efi_map_start, *efi_map_end, *p;
24339+ u64 efi_desc_size;
24340+ efi_memory_desc_t *md;
24341+ unsigned long tmp_min;
24342+ unsigned long tmp_max;
24343+ unsigned long gap_size;
24344+ unsigned long prev_end;
24345+
24346+ if (!is_running_on_xen())
24347+ return -1;
24348+
24349+ efi_map_start = __va(ia64_boot_param->efi_memmap);
24350+ efi_map_end = efi_map_start + ia64_boot_param->efi_memmap_size;
24351+ efi_desc_size = ia64_boot_param->efi_memdesc_size;
24352+
24353+	// first, check the highest address already in use
24354+ for (p = efi_map_start; p < efi_map_end; p += efi_desc_size) {
24355+ // nothing
24356+ }
24357+ md = p - efi_desc_size;
24358+ privcmd_resource_min = GRANULEROUNDUP(md_end_addr(md));
24359+ if (xen_ia64_privcmd_check_size(privcmd_resource_min,
24360+ privcmd_resource_max)) {
24361+ goto out;
24362+ }
24363+
24364+	// the highest address in use is too large; try to find the largest gap.
24365+ tmp_min = privcmd_resource_max;
24366+ tmp_max = 0;
24367+ gap_size = 0;
24368+ prev_end = 0;
24369+ for (p = efi_map_start;
24370+ p < efi_map_end - efi_desc_size;
24371+ p += efi_desc_size) {
24372+ unsigned long end;
24373+ efi_memory_desc_t* next;
24374+ unsigned long next_start;
24375+
24376+ md = p;
24377+ end = md_end_addr(md);
24378+ if (end > privcmd_resource_max) {
24379+ break;
24380+ }
24381+ if (end < prev_end) {
24382+			// Work around an issue where Xen may pass
24383+			// incompletely sorted memory descriptors like
24384+			// [x, x + length]
24385+			// [x, x]
24386+			// when this order should be reversed;
24387+			// skip such out-of-order entries.
24388+ continue;
24389+ }
24390+ next = p + efi_desc_size;
24391+ next_start = next->phys_addr;
24392+ if (next_start > privcmd_resource_max) {
24393+ next_start = privcmd_resource_max;
24394+ }
24395+ if (end < next_start && gap_size < (next_start - end)) {
24396+ tmp_min = end;
24397+ tmp_max = next_start;
24398+ gap_size = tmp_max - tmp_min;
24399+ }
24400+ prev_end = end;
24401+ }
24402+
24403+ privcmd_resource_min = GRANULEROUNDUP(tmp_min);
24404+ if (xen_ia64_privcmd_check_size(privcmd_resource_min, tmp_max)) {
24405+ privcmd_resource_max = tmp_max;
24406+ goto out;
24407+ }
24408+
24409+ privcmd_resource_min = tmp_min;
24410+ privcmd_resource_max = tmp_max;
24411+ if (!xen_ia64_privcmd_check_size(privcmd_resource_min,
24412+ privcmd_resource_max)) {
24413+		// No large enough gap was found.
24414+		// Go ahead anyway with a warning, hoping that no large region
24415+		// will be requested.
24416+		printk(KERN_WARNING "xen privcmd: no large enough region found for privcmd mmap.\n");
24417+ }
24418+
24419+out:
24420+ printk(KERN_INFO "xen privcmd uses pseudo physical addr range [0x%lx, 0x%lx] (%ldMB)\n",
24421+ privcmd_resource_min, privcmd_resource_max,
24422+ (privcmd_resource_max - privcmd_resource_min) >> 20);
24423+ BUG_ON(privcmd_resource_min >= privcmd_resource_max);
24424+
24425+ // XXX this should be somewhere appropriate
24426+ (void)p2m_expose_init();
24427+
24428+ return 0;
24429+}
24430+late_initcall(xen_ia64_privcmd_init);
24431+
24432+struct xen_ia64_privcmd_entry {
24433+ atomic_t map_count;
24434+#define INVALID_GPFN (~0UL)
24435+ unsigned long gpfn;
24436+};
24437+
24438+struct xen_ia64_privcmd_range {
24439+ atomic_t ref_count;
24440+ unsigned long pgoff; // in PAGE_SIZE
24441+ struct resource* res;
24442+
24443+ unsigned long num_entries;
24444+ struct xen_ia64_privcmd_entry entries[0];
24445+};
24446+
24447+struct xen_ia64_privcmd_vma {
24448+ int is_privcmd_mmapped;
24449+ struct xen_ia64_privcmd_range* range;
24450+
24451+ unsigned long num_entries;
24452+ struct xen_ia64_privcmd_entry* entries;
24453+};
24454+
24455+static void
24456+xen_ia64_privcmd_init_entry(struct xen_ia64_privcmd_entry* entry)
24457+{
24458+ atomic_set(&entry->map_count, 0);
24459+ entry->gpfn = INVALID_GPFN;
24460+}
24461+
24462+static int
24463+xen_ia64_privcmd_entry_mmap(struct vm_area_struct* vma,
24464+ unsigned long addr,
24465+ struct xen_ia64_privcmd_range* privcmd_range,
24466+ int i,
24467+ unsigned long gmfn,
24468+ pgprot_t prot,
24469+ domid_t domid)
24470+{
24471+ int error = 0;
24472+ struct xen_ia64_privcmd_entry* entry = &privcmd_range->entries[i];
24473+ unsigned long gpfn;
24474+ unsigned long flags;
24475+
24476+ if ((addr & ~PAGE_MASK) != 0 || gmfn == INVALID_MFN) {
24477+ error = -EINVAL;
24478+ goto out;
24479+ }
24480+
24481+ if (entry->gpfn != INVALID_GPFN) {
24482+ error = -EBUSY;
24483+ goto out;
24484+ }
24485+ gpfn = (privcmd_range->res->start >> PAGE_SHIFT) + i;
24486+
24487+ flags = ASSIGN_writable;
24488+ if (pgprot_val(prot) == PROT_READ) {
24489+ flags = ASSIGN_readonly;
24490+ }
24491+ error = HYPERVISOR_add_physmap_with_gmfn(gpfn, gmfn, flags, domid);
24492+ if (error != 0) {
24493+ goto out;
24494+ }
24495+
24496+ prot = vma->vm_page_prot;
24497+ error = remap_pfn_range(vma, addr, gpfn, 1 << PAGE_SHIFT, prot);
24498+ if (error != 0) {
24499+ error = HYPERVISOR_zap_physmap(gpfn, 0);
24500+ if (error) {
24501+ BUG();//XXX
24502+ }
24503+ } else {
24504+ atomic_inc(&entry->map_count);
24505+ entry->gpfn = gpfn;
24506+ }
24507+
24508+out:
24509+ return error;
24510+}
24511+
24512+static void
24513+xen_ia64_privcmd_entry_munmap(struct xen_ia64_privcmd_range* privcmd_range,
24514+ int i)
24515+{
24516+ struct xen_ia64_privcmd_entry* entry = &privcmd_range->entries[i];
24517+ unsigned long gpfn = entry->gpfn;
24518+ //gpfn = (privcmd_range->res->start >> PAGE_SHIFT) +
24519+ // (vma->vm_pgoff - privcmd_range->pgoff);
24520+ int error;
24521+
24522+ error = HYPERVISOR_zap_physmap(gpfn, 0);
24523+ if (error) {
24524+ BUG();//XXX
24525+ }
24526+ entry->gpfn = INVALID_GPFN;
24527+}
24528+
24529+static void
24530+xen_ia64_privcmd_entry_open(struct xen_ia64_privcmd_range* privcmd_range,
24531+ int i)
24532+{
24533+ struct xen_ia64_privcmd_entry* entry = &privcmd_range->entries[i];
24534+ if (entry->gpfn != INVALID_GPFN) {
24535+ atomic_inc(&entry->map_count);
24536+ } else {
24537+ BUG_ON(atomic_read(&entry->map_count) != 0);
24538+ }
24539+}
24540+
24541+static void
24542+xen_ia64_privcmd_entry_close(struct xen_ia64_privcmd_range* privcmd_range,
24543+ int i)
24544+{
24545+ struct xen_ia64_privcmd_entry* entry = &privcmd_range->entries[i];
24546+ if (entry->gpfn != INVALID_GPFN &&
24547+ atomic_dec_and_test(&entry->map_count)) {
24548+ xen_ia64_privcmd_entry_munmap(privcmd_range, i);
24549+ }
24550+}
24551+
24552+static void xen_ia64_privcmd_vma_open(struct vm_area_struct* vma);
24553+static void xen_ia64_privcmd_vma_close(struct vm_area_struct* vma);
24554+
24555+struct vm_operations_struct xen_ia64_privcmd_vm_ops = {
24556+ .open = &xen_ia64_privcmd_vma_open,
24557+ .close = &xen_ia64_privcmd_vma_close,
24558+};
24559+
24560+static void
24561+__xen_ia64_privcmd_vma_open(struct vm_area_struct* vma,
24562+ struct xen_ia64_privcmd_vma* privcmd_vma,
24563+ struct xen_ia64_privcmd_range* privcmd_range)
24564+{
24565+ unsigned long entry_offset = vma->vm_pgoff - privcmd_range->pgoff;
24566+ unsigned long num_entries = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
24567+ unsigned long i;
24568+
24569+ BUG_ON(entry_offset < 0);
24570+ BUG_ON(entry_offset + num_entries > privcmd_range->num_entries);
24571+
24572+ privcmd_vma->range = privcmd_range;
24573+ privcmd_vma->num_entries = num_entries;
24574+ privcmd_vma->entries = &privcmd_range->entries[entry_offset];
24575+ vma->vm_private_data = privcmd_vma;
24576+ for (i = 0; i < privcmd_vma->num_entries; i++) {
24577+ xen_ia64_privcmd_entry_open(privcmd_range, entry_offset + i);
24578+ }
24579+
24580+ vma->vm_private_data = privcmd_vma;
24581+ vma->vm_ops = &xen_ia64_privcmd_vm_ops;
24582+}
24583+
24584+static void
24585+xen_ia64_privcmd_vma_open(struct vm_area_struct* vma)
24586+{
24587+ struct xen_ia64_privcmd_vma* old_privcmd_vma = (struct xen_ia64_privcmd_vma*)vma->vm_private_data;
24588+ struct xen_ia64_privcmd_vma* privcmd_vma = (struct xen_ia64_privcmd_vma*)vma->vm_private_data;
24589+ struct xen_ia64_privcmd_range* privcmd_range = privcmd_vma->range;
24590+
24591+ atomic_inc(&privcmd_range->ref_count);
24592+ // vm_op->open() can't fail.
24593+ privcmd_vma = kmalloc(sizeof(*privcmd_vma), GFP_KERNEL | __GFP_NOFAIL);
24594+ // copy original value if necessary
24595+ privcmd_vma->is_privcmd_mmapped = old_privcmd_vma->is_privcmd_mmapped;
24596+
24597+ __xen_ia64_privcmd_vma_open(vma, privcmd_vma, privcmd_range);
24598+}
24599+
24600+static void
24601+xen_ia64_privcmd_vma_close(struct vm_area_struct* vma)
24602+{
24603+ struct xen_ia64_privcmd_vma* privcmd_vma =
24604+ (struct xen_ia64_privcmd_vma*)vma->vm_private_data;
24605+ struct xen_ia64_privcmd_range* privcmd_range = privcmd_vma->range;
24606+ unsigned long entry_offset = vma->vm_pgoff - privcmd_range->pgoff;
24607+ unsigned long i;
24608+
24609+ for (i = 0; i < privcmd_vma->num_entries; i++) {
24610+ xen_ia64_privcmd_entry_close(privcmd_range, entry_offset + i);
24611+ }
24612+ vma->vm_private_data = NULL;
24613+ kfree(privcmd_vma);
24614+
24615+ if (atomic_dec_and_test(&privcmd_range->ref_count)) {
24616+#if 1
24617+ for (i = 0; i < privcmd_range->num_entries; i++) {
24618+ struct xen_ia64_privcmd_entry* entry =
24619+ &privcmd_range->entries[i];
24620+ BUG_ON(atomic_read(&entry->map_count) != 0);
24621+ BUG_ON(entry->gpfn != INVALID_GPFN);
24622+ }
24623+#endif
24624+ release_resource(privcmd_range->res);
24625+ kfree(privcmd_range->res);
24626+ vfree(privcmd_range);
24627+ }
24628+}
24629+
24630+int
24631+privcmd_enforce_singleshot_mapping(struct vm_area_struct *vma)
24632+{
24633+ struct xen_ia64_privcmd_vma* privcmd_vma =
24634+ (struct xen_ia64_privcmd_vma *)vma->vm_private_data;
24635+ return (xchg(&privcmd_vma->is_privcmd_mmapped, 1) == 0);
24636+}
24637+
24638+int
24639+privcmd_mmap(struct file * file, struct vm_area_struct * vma)
24640+{
24641+ int error;
24642+ unsigned long size = vma->vm_end - vma->vm_start;
24643+ unsigned long num_entries = size >> PAGE_SHIFT;
24644+ struct xen_ia64_privcmd_range* privcmd_range = NULL;
24645+ struct xen_ia64_privcmd_vma* privcmd_vma = NULL;
24646+ struct resource* res = NULL;
24647+ unsigned long i;
24648+ BUG_ON(!is_running_on_xen());
24649+
24650+ BUG_ON(file->private_data != NULL);
24651+
24652+ error = -ENOMEM;
24653+ privcmd_range =
24654+ vmalloc(sizeof(*privcmd_range) +
24655+ sizeof(privcmd_range->entries[0]) * num_entries);
24656+ if (privcmd_range == NULL) {
24657+ goto out_enomem0;
24658+ }
24659+ privcmd_vma = kmalloc(sizeof(*privcmd_vma), GFP_KERNEL);
24660+ if (privcmd_vma == NULL) {
24661+ goto out_enomem1;
24662+ }
24663+ privcmd_vma->is_privcmd_mmapped = 0;
24664+
24665+ res = kzalloc(sizeof(*res), GFP_KERNEL);
24666+ if (res == NULL) {
24667+ goto out_enomem1;
24668+ }
24669+ res->name = "Xen privcmd mmap";
24670+ error = allocate_resource(&iomem_resource, res, size,
24671+ privcmd_resource_min, privcmd_resource_max,
24672+ privcmd_resource_align, NULL, NULL);
24673+ if (error) {
24674+ goto out_enomem1;
24675+ }
24676+ privcmd_range->res = res;
24677+
24678+ /* DONTCOPY is essential for Xen as copy_page_range is broken. */
24679+ vma->vm_flags |= VM_RESERVED | VM_IO | VM_DONTCOPY | VM_PFNMAP;
24680+
24681+ atomic_set(&privcmd_range->ref_count, 1);
24682+ privcmd_range->pgoff = vma->vm_pgoff;
24683+ privcmd_range->num_entries = num_entries;
24684+ for (i = 0; i < privcmd_range->num_entries; i++) {
24685+ xen_ia64_privcmd_init_entry(&privcmd_range->entries[i]);
24686+ }
24687+
24688+ __xen_ia64_privcmd_vma_open(vma, privcmd_vma, privcmd_range);
24689+ return 0;
24690+
24691+out_enomem1:
24692+ kfree(res);
24693+ kfree(privcmd_vma);
24694+out_enomem0:
24695+ vfree(privcmd_range);
24696+ return error;
24697+}
24698+
24699+int
24700+direct_remap_pfn_range(struct vm_area_struct *vma,
24701+ unsigned long address, // process virtual address
24702+ unsigned long gmfn, // gmfn, gmfn + 1, ... gmfn + size/PAGE_SIZE
24703+ unsigned long size,
24704+ pgprot_t prot,
24705+ domid_t domid) // target domain
24706+{
24707+ struct xen_ia64_privcmd_vma* privcmd_vma =
24708+ (struct xen_ia64_privcmd_vma*)vma->vm_private_data;
24709+ struct xen_ia64_privcmd_range* privcmd_range = privcmd_vma->range;
24710+ unsigned long entry_offset = vma->vm_pgoff - privcmd_range->pgoff;
24711+
24712+ unsigned long i;
24713+ unsigned long offset;
24714+ int error = 0;
24715+ BUG_ON(!is_running_on_xen());
24716+
24717+#if 0
24718+ if (prot != vm->vm_page_prot) {
24719+ return -EINVAL;
24720+ }
24721+#endif
24722+
24723+ i = (address - vma->vm_start) >> PAGE_SHIFT;
24724+ for (offset = 0; offset < size; offset += PAGE_SIZE) {
24725+ error = xen_ia64_privcmd_entry_mmap(vma, (address + offset) & PAGE_MASK, privcmd_range, entry_offset + i, gmfn, prot, domid);
24726+ if (error != 0) {
24727+ break;
24728+ }
24729+
24730+ i++;
24731+ gmfn++;
24732+ }
24733+
24734+ return error;
24735+}
24736+
24737+
24738+/* Called after suspend, to resume time. */
24739+void
24740+time_resume(void)
24741+{
24742+ extern void ia64_cpu_local_tick(void);
24743+
24744+ /* Just trigger a tick. */
24745+ ia64_cpu_local_tick();
24746+}
24747+
24748+///////////////////////////////////////////////////////////////////////////
24749+// expose p2m table
24750+#ifdef CONFIG_XEN_IA64_EXPOSE_P2M
24751+#include <linux/cpu.h>
24752+#include <asm/uaccess.h>
24753+
24754+int p2m_initialized __read_mostly = 0;
24755+
24756+unsigned long p2m_min_low_pfn __read_mostly;
24757+unsigned long p2m_max_low_pfn __read_mostly;
24758+unsigned long p2m_convert_min_pfn __read_mostly;
24759+unsigned long p2m_convert_max_pfn __read_mostly;
24760+
24761+static struct resource p2m_resource = {
24762+ .name = "Xen p2m table",
24763+ .flags = IORESOURCE_MEM,
24764+};
24765+static unsigned long p2m_assign_start_pfn __read_mostly;
24766+static unsigned long p2m_assign_end_pfn __read_mostly;
24767+volatile const pte_t* p2m_pte __read_mostly;
24768+
24769+#define GRANULE_PFN	PTRS_PER_PTE
24770+static unsigned long p2m_granule_pfn __read_mostly = GRANULE_PFN;
24771+
24772+#define ROUNDDOWN(x, y) ((x) & ~((y) - 1))
24773+#define ROUNDUP(x, y) (((x) + (y) - 1) & ~((y) - 1))
24774+
24775+#define P2M_PREFIX "Xen p2m: "
24776+
24777+static int xen_ia64_p2m_expose __read_mostly = 1;
24778+module_param(xen_ia64_p2m_expose, int, 0);
24779+MODULE_PARM_DESC(xen_ia64_p2m_expose,
24780+ "enable/disable xen/ia64 p2m exposure optimization\n");
24781+
24782+#ifdef CONFIG_XEN_IA64_EXPOSE_P2M_USE_DTR
24783+static int xen_ia64_p2m_expose_use_dtr __read_mostly = 1;
24784+module_param(xen_ia64_p2m_expose_use_dtr, int, 0);
24785+MODULE_PARM_DESC(xen_ia64_p2m_expose_use_dtr,
24786+ "use/unuse dtr to map exposed p2m table\n");
24787+
24788+static const int p2m_page_shifts[] = {
24789+ _PAGE_SIZE_4K,
24790+ _PAGE_SIZE_8K,
24791+ _PAGE_SIZE_16K,
24792+ _PAGE_SIZE_64K,
24793+ _PAGE_SIZE_256K,
24794+ _PAGE_SIZE_1M,
24795+ _PAGE_SIZE_4M,
24796+ _PAGE_SIZE_16M,
24797+ _PAGE_SIZE_64M,
24798+ _PAGE_SIZE_256M,
24799+};
24800+
24801+struct p2m_itr_arg {
24802+ unsigned long vaddr;
24803+ unsigned long pteval;
24804+ unsigned long log_page_size;
24805+};
24806+static struct p2m_itr_arg p2m_itr_arg __read_mostly;
24807+
24808+// This should be in asm-ia64/kregs.h
24809+#define IA64_TR_P2M_TABLE 3
24810+
24811+static void
24812+p2m_itr(void* info)
24813+{
24814+ struct p2m_itr_arg* arg = (struct p2m_itr_arg*)info;
24815+ ia64_itr(0x2, IA64_TR_P2M_TABLE,
24816+ arg->vaddr, arg->pteval, arg->log_page_size);
24817+ ia64_srlz_d();
24818+}
24819+
24820+static int
24821+p2m_expose_dtr_call(struct notifier_block *self,
24822+ unsigned long event, void* ptr)
24823+{
24824+ unsigned int cpu = (unsigned int)(long)ptr;
24825+ if (event != CPU_ONLINE)
24826+ return 0;
24827+	if (p2m_initialized && xen_ia64_p2m_expose_use_dtr)
24828+ smp_call_function_single(cpu, &p2m_itr, &p2m_itr_arg, 1, 1);
24829+ return 0;
24830+}
24831+
24832+static struct notifier_block p2m_expose_dtr_hotplug_notifier = {
24833+ .notifier_call = p2m_expose_dtr_call,
24834+ .next = NULL,
24835+ .priority = 0
24836+};
24837+#endif
24838+
24839+static int
24840+p2m_expose_init(void)
24841+{
24842+ unsigned long num_pfn;
24843+ unsigned long size = 0;
24844+ unsigned long p2m_size = 0;
24845+ unsigned long align = ~0UL;
24846+ int error = 0;
24847+#ifdef CONFIG_XEN_IA64_EXPOSE_P2M_USE_DTR
24848+ int i;
24849+ unsigned long page_size;
24850+ unsigned long log_page_size = 0;
24851+#endif
24852+
24853+ if (!xen_ia64_p2m_expose)
24854+ return -ENOSYS;
24855+ if (p2m_initialized)
24856+ return 0;
24857+
24858+#ifdef CONFIG_XEN_IA64_EXPOSE_P2M_USE_DTR
24859+ error = register_cpu_notifier(&p2m_expose_dtr_hotplug_notifier);
24860+ if (error < 0)
24861+ return error;
24862+#endif
24863+
24864+ lock_cpu_hotplug();
24865+ if (p2m_initialized)
24866+ goto out;
24867+
24868+#ifdef CONFIG_DISCONTIGMEM
24869+ p2m_min_low_pfn = min_low_pfn;
24870+ p2m_max_low_pfn = max_low_pfn;
24871+#else
24872+ p2m_min_low_pfn = 0;
24873+ p2m_max_low_pfn = max_pfn;
24874+#endif
24875+
24876+#ifdef CONFIG_XEN_IA64_EXPOSE_P2M_USE_DTR
24877+ if (xen_ia64_p2m_expose_use_dtr) {
24878+ unsigned long granule_pfn = 0;
24879+ p2m_size = p2m_max_low_pfn - p2m_min_low_pfn;
24880+ for (i = 0;
24881+ i < sizeof(p2m_page_shifts)/sizeof(p2m_page_shifts[0]);
24882+ i++) {
24883+ log_page_size = p2m_page_shifts[i];
24884+ page_size = 1UL << log_page_size;
24885+ if (page_size < p2m_size)
24886+ continue;
24887+
24888+ granule_pfn = max(page_size >> PAGE_SHIFT,
24889+ p2m_granule_pfn);
24890+ p2m_convert_min_pfn = ROUNDDOWN(p2m_min_low_pfn,
24891+ granule_pfn);
24892+ p2m_convert_max_pfn = ROUNDUP(p2m_max_low_pfn,
24893+ granule_pfn);
24894+ num_pfn = p2m_convert_max_pfn - p2m_convert_min_pfn;
24895+ size = num_pfn << PAGE_SHIFT;
24896+ p2m_size = num_pfn / PTRS_PER_PTE;
24897+ p2m_size = ROUNDUP(p2m_size, granule_pfn << PAGE_SHIFT);
24898+ if (p2m_size == page_size)
24899+ break;
24900+ }
24901+ if (p2m_size != page_size) {
24902+ printk(KERN_ERR "p2m_size != page_size\n");
24903+ error = -EINVAL;
24904+ goto out;
24905+ }
24906+ align = max(privcmd_resource_align, granule_pfn << PAGE_SHIFT);
24907+ } else
24908+#endif
24909+ {
24910+ BUG_ON(p2m_granule_pfn & (p2m_granule_pfn - 1));
24911+ p2m_convert_min_pfn = ROUNDDOWN(p2m_min_low_pfn,
24912+ p2m_granule_pfn);
24913+ p2m_convert_max_pfn = ROUNDUP(p2m_max_low_pfn, p2m_granule_pfn);
24914+ num_pfn = p2m_convert_max_pfn - p2m_convert_min_pfn;
24915+ size = num_pfn << PAGE_SHIFT;
24916+ p2m_size = num_pfn / PTRS_PER_PTE;
24917+ p2m_size = ROUNDUP(p2m_size, p2m_granule_pfn << PAGE_SHIFT);
24918+ align = max(privcmd_resource_align,
24919+ p2m_granule_pfn << PAGE_SHIFT);
24920+ }
24921+
24922+ // use privcmd region
24923+ error = allocate_resource(&iomem_resource, &p2m_resource, p2m_size,
24924+ privcmd_resource_min, privcmd_resource_max,
24925+ align, NULL, NULL);
24926+ if (error) {
24927+ printk(KERN_ERR P2M_PREFIX
24928+ "can't allocate region for p2m exposure "
24929+ "[0x%016lx, 0x%016lx) 0x%016lx\n",
24930+ p2m_convert_min_pfn, p2m_convert_max_pfn, p2m_size);
24931+ goto out;
24932+ }
24933+
24934+ p2m_assign_start_pfn = p2m_resource.start >> PAGE_SHIFT;
24935+ p2m_assign_end_pfn = p2m_resource.end >> PAGE_SHIFT;
24936+
24937+ error = HYPERVISOR_expose_p2m(p2m_convert_min_pfn,
24938+ p2m_assign_start_pfn,
24939+ size, p2m_granule_pfn);
24940+ if (error) {
24941+		printk(KERN_ERR P2M_PREFIX "expose p2m hypercall failed: %d\n",
24942+ error);
24943+ printk(KERN_ERR P2M_PREFIX "conv 0x%016lx assign 0x%016lx "
24944+ "size 0x%016lx granule 0x%016lx\n",
24945+ p2m_convert_min_pfn, p2m_assign_start_pfn,
24946+		       size, p2m_granule_pfn);
24947+ release_resource(&p2m_resource);
24948+ goto out;
24949+ }
24950+ p2m_pte = (volatile const pte_t*)pfn_to_kaddr(p2m_assign_start_pfn);
24951+#ifdef CONFIG_XEN_IA64_EXPOSE_P2M_USE_DTR
24952+ if (xen_ia64_p2m_expose_use_dtr) {
24953+ p2m_itr_arg.vaddr = (unsigned long)__va(p2m_assign_start_pfn
24954+ << PAGE_SHIFT);
24955+ p2m_itr_arg.pteval = pte_val(pfn_pte(p2m_assign_start_pfn,
24956+ PAGE_KERNEL));
24957+ p2m_itr_arg.log_page_size = log_page_size;
24958+ smp_mb();
24959+ smp_call_function(&p2m_itr, &p2m_itr_arg, 1, 1);
24960+ p2m_itr(&p2m_itr_arg);
24961+ }
24962+#endif
24963+ smp_mb();
24964+ p2m_initialized = 1;
24965+ printk(P2M_PREFIX "assign p2m table of [0x%016lx, 0x%016lx)\n",
24966+ p2m_convert_min_pfn << PAGE_SHIFT,
24967+ p2m_convert_max_pfn << PAGE_SHIFT);
24968+ printk(P2M_PREFIX "to [0x%016lx, 0x%016lx) (%ld KBytes)\n",
24969+ p2m_assign_start_pfn << PAGE_SHIFT,
24970+ p2m_assign_end_pfn << PAGE_SHIFT,
24971+ p2m_size / 1024);
24972+out:
24973+ unlock_cpu_hotplug();
24974+ return error;
24975+}
24976+
24977+#ifdef notyet
24978+void
24979+p2m_expose_cleanup(void)
24980+{
24981+ BUG_ON(!p2m_initialized);
24982+#ifdef CONFIG_XEN_IA64_EXPOSE_P2M_USE_DTR
24983+ unregister_cpu_notifier(&p2m_expose_dtr_hotplug_notifier);
24984+#endif
24985+ release_resource(&p2m_resource);
24986+}
24987+#endif
24988+
24989+//XXX inlinize?
24990+unsigned long
24991+p2m_phystomach(unsigned long gpfn)
24992+{
24993+ volatile const pte_t* pte;
24994+ unsigned long mfn;
24995+ unsigned long pteval;
24996+
24997+ if (!p2m_initialized ||
24998+ gpfn < p2m_min_low_pfn || gpfn > p2m_max_low_pfn
24999+ /* || !pfn_valid(gpfn) */)
25000+ return INVALID_MFN;
25001+ pte = p2m_pte + (gpfn - p2m_convert_min_pfn);
25002+
25003+ mfn = INVALID_MFN;
25004+ if (likely(__get_user(pteval, (unsigned long __user *)pte) == 0 &&
25005+ pte_present(__pte(pteval)) &&
25006+ pte_pfn(__pte(pteval)) != (INVALID_MFN >> PAGE_SHIFT)))
25007+ mfn = (pteval & _PFN_MASK) >> PAGE_SHIFT;
25008+
25009+ return mfn;
25010+}
25011+
25012+EXPORT_SYMBOL_GPL(p2m_initialized);
25013+EXPORT_SYMBOL_GPL(p2m_min_low_pfn);
25014+EXPORT_SYMBOL_GPL(p2m_max_low_pfn);
25015+EXPORT_SYMBOL_GPL(p2m_convert_min_pfn);
25016+EXPORT_SYMBOL_GPL(p2m_convert_max_pfn);
25017+EXPORT_SYMBOL_GPL(p2m_pte);
25018+EXPORT_SYMBOL_GPL(p2m_phystomach);
25019+#endif
25020+
25021+///////////////////////////////////////////////////////////////////////////
25022+// for xenoprof
25023+
25024+struct resource*
25025+xen_ia64_allocate_resource(unsigned long size)
25026+{
25027+ struct resource* res;
25028+ int error;
25029+
25030+ res = kmalloc(sizeof(*res), GFP_KERNEL);
25031+ if (res == NULL)
25032+ return ERR_PTR(-ENOMEM);
25033+
25034+ res->name = "Xen";
25035+ res->flags = IORESOURCE_MEM;
25036+ error = allocate_resource(&iomem_resource, res, PAGE_ALIGN(size),
25037+ privcmd_resource_min, privcmd_resource_max,
25038+ IA64_GRANULE_SIZE, NULL, NULL);
25039+ if (error) {
25040+ kfree(res);
25041+ return ERR_PTR(error);
25042+ }
25043+ return res;
25044+}
25045+EXPORT_SYMBOL_GPL(xen_ia64_allocate_resource);
25046+
25047+void
25048+xen_ia64_release_resource(struct resource* res)
25049+{
25050+ release_resource(res);
25051+ kfree(res);
25052+}
25053+EXPORT_SYMBOL_GPL(xen_ia64_release_resource);
25054+
25055+void
25056+xen_ia64_unmap_resource(struct resource* res)
25057+{
25058+ unsigned long gpfn = res->start >> PAGE_SHIFT;
25059+ unsigned long nr_pages = (res->end - res->start) >> PAGE_SHIFT;
25060+ unsigned long i;
25061+
25062+ for (i = 0; i < nr_pages; i++) {
25063+ int error = HYPERVISOR_zap_physmap(gpfn + i, 0);
25064+ if (error)
25065+ printk(KERN_ERR
25066+			       "%s:%d zap_physmap failed %d gpfn %lx\n",
25067+ __func__, __LINE__, error, gpfn + i);
25068+ }
25069+ xen_ia64_release_resource(res);
25070+}
25071+EXPORT_SYMBOL_GPL(xen_ia64_unmap_resource);
25072diff -Nur linux-2.6.16.33-noxen/arch/ia64/xen/mem.c linux-2.6.16.33/arch/ia64/xen/mem.c
25073--- linux-2.6.16.33-noxen/arch/ia64/xen/mem.c 1970-01-01 00:00:00.000000000 +0000
25074+++ linux-2.6.16.33/arch/ia64/xen/mem.c 2007-01-08 15:00:45.000000000 +0000
25075@@ -0,0 +1,76 @@
25076+/*
25077+ * Originally from linux/drivers/char/mem.c
25078+ *
25079+ * Copyright (C) 1991, 1992 Linus Torvalds
25080+ *
25081+ * Added devfs support.
25082+ * Jan-11-1998, C. Scott Ananian <cananian@alumni.princeton.edu>
25083+ * Shared /dev/zero mmaping support, Feb 2000, Kanoj Sarcar <kanoj@sgi.com>
25084+ */
25085+/*
25086+ * taken from
25087+ * linux/drivers/char/mem.c and linux-2.6-xen-sparse/drivers/xen/char/mem.c.
25088+ * adjusted for IA64 and made transparent.
25089+ * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp>
25090+ * VA Linux Systems Japan K.K.
25091+ */
25092+
25093+#include <linux/config.h>
25094+#include <linux/mm.h>
25095+#include <linux/efi.h>
25096+
25097+/*
25098+ * Architectures vary in how they handle caching for addresses
25099+ * outside of main memory.
25100+ *
25101+ */
25102+static inline int uncached_access(struct file *file, unsigned long addr)
25103+{
25104+ /*
25105+ * On ia64, we ignore O_SYNC because we cannot tolerate memory attribute aliases.
25106+ */
25107+ return !(efi_mem_attributes(addr) & EFI_MEMORY_WB);
25108+}
25109+
25110+int xen_mmap_mem(struct file * file, struct vm_area_struct * vma)
25111+{
25112+ unsigned long addr = vma->vm_pgoff << PAGE_SHIFT;
25113+ size_t size = vma->vm_end - vma->vm_start;
25114+
25115+
25116+#if 0
25117+ /*
25118+ *XXX FIXME: linux-2.6.16.29, linux-2.6.17
25119+ * valid_mmap_phys_addr_range() in linux/arch/ia64/kernel/efi.c
25120+ * fails checks.
25121+	 * linux-2.6.18.1's always returns 1.
25122+	 * Its comment says:
25123+ *
25124+ * MMIO regions are often missing from the EFI memory map.
25125+ * We must allow mmap of them for programs like X, so we
25126+ * currently can't do any useful validation.
25127+ */
25128+ if (!valid_mmap_phys_addr_range(addr, &size))
25129+ return -EINVAL;
25130+ if (size < vma->vm_end - vma->vm_start)
25131+ return -EINVAL;
25132+#endif
25133+
25134+ if (is_running_on_xen()) {
25135+ unsigned long offset = HYPERVISOR_ioremap(addr, size);
25136+ if (IS_ERR_VALUE(offset))
25137+ return offset;
25138+ }
25139+
25140+ if (uncached_access(file, vma->vm_pgoff << PAGE_SHIFT))
25141+ vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
25142+
25143+ /* Remap-pfn-range will mark the range VM_IO and VM_RESERVED */
25144+ if (remap_pfn_range(vma,
25145+ vma->vm_start,
25146+ vma->vm_pgoff,
25147+ size,
25148+ vma->vm_page_prot))
25149+ return -EAGAIN;
25150+ return 0;
25151+}
25152diff -Nur linux-2.6.16.33-noxen/arch/ia64/xen/util.c linux-2.6.16.33/arch/ia64/xen/util.c
25153--- linux-2.6.16.33-noxen/arch/ia64/xen/util.c 1970-01-01 00:00:00.000000000 +0000
25154+++ linux-2.6.16.33/arch/ia64/xen/util.c 2007-01-08 15:00:45.000000000 +0000
25155@@ -0,0 +1,118 @@
25156+/******************************************************************************
25157+ * arch/ia64/xen/util.c
25158+ * This file is the ia64 counterpart of drivers/xen/util.c
25159+ *
25160+ * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp>
25161+ * VA Linux Systems Japan K.K.
25162+ *
25163+ * This program is free software; you can redistribute it and/or modify
25164+ * it under the terms of the GNU General Public License as published by
25165+ * the Free Software Foundation; either version 2 of the License, or
25166+ * (at your option) any later version.
25167+ *
25168+ * This program is distributed in the hope that it will be useful,
25169+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
25170+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
25171+ * GNU General Public License for more details.
25172+ *
25173+ * You should have received a copy of the GNU General Public License
25174+ * along with this program; if not, write to the Free Software
25175+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
25176+ *
25177+ */
25178+
25179+#include <linux/config.h>
25180+#include <linux/mm.h>
25181+#include <linux/module.h>
25182+#include <linux/slab.h>
25183+#include <linux/vmalloc.h>
25184+#include <asm/uaccess.h>
25185+#include <xen/driver_util.h>
25186+#include <xen/interface/memory.h>
25187+#include <asm/hypercall.h>
25188+
25189+struct vm_struct *alloc_vm_area(unsigned long size)
25190+{
25191+ int order;
25192+ unsigned long virt;
25193+ unsigned long nr_pages;
25194+ struct vm_struct* area;
25195+
25196+ order = get_order(size);
25197+ virt = __get_free_pages(GFP_KERNEL, order);
25198+ if (virt == 0) {
25199+ goto err0;
25200+ }
25201+ nr_pages = 1 << order;
25202+ scrub_pages(virt, nr_pages);
25203+
25204+ area = kmalloc(sizeof(*area), GFP_KERNEL);
25205+ if (area == NULL) {
25206+ goto err1;
25207+ }
25208+
25209+ area->flags = VM_IOREMAP;//XXX
25210+ area->addr = (void*)virt;
25211+ area->size = size;
25212+ area->pages = NULL; //XXX
25213+ area->nr_pages = nr_pages;
25214+ area->phys_addr = 0; /* xenbus_map_ring_valloc uses this field! */
25215+
25216+ return area;
25217+
25218+err1:
25219+ free_pages(virt, order);
25220+err0:
25221+ return NULL;
25222+
25223+}
25224+EXPORT_SYMBOL_GPL(alloc_vm_area);
25225+
25226+void free_vm_area(struct vm_struct *area)
25227+{
25228+ unsigned int order = get_order(area->size);
25229+ unsigned long i;
25230+ unsigned long phys_addr = __pa(area->addr);
25231+
25232+	// This area is used for foreign page mapping,
25233+	// so the underlying machine pages may not be assigned.
25234+ for (i = 0; i < (1 << order); i++) {
25235+ unsigned long ret;
25236+ unsigned long gpfn = (phys_addr >> PAGE_SHIFT) + i;
25237+ struct xen_memory_reservation reservation = {
25238+ .nr_extents = 1,
25239+ .address_bits = 0,
25240+ .extent_order = 0,
25241+ .domid = DOMID_SELF
25242+ };
25243+ set_xen_guest_handle(reservation.extent_start, &gpfn);
25244+ ret = HYPERVISOR_memory_op(XENMEM_populate_physmap,
25245+ &reservation);
25246+ BUG_ON(ret != 1);
25247+ }
25248+ free_pages((unsigned long)area->addr, order);
25249+ kfree(area);
25250+}
25251+EXPORT_SYMBOL_GPL(free_vm_area);
25252+
25253+void lock_vm_area(struct vm_struct *area)
25254+{
25255+ // nothing
25256+}
25257+EXPORT_SYMBOL_GPL(lock_vm_area);
25258+
25259+void unlock_vm_area(struct vm_struct *area)
25260+{
25261+ // nothing
25262+}
25263+EXPORT_SYMBOL_GPL(unlock_vm_area);
25264+
25265+/*
25266+ * Local variables:
25267+ * c-file-style: "linux"
25268+ * indent-tabs-mode: t
25269+ * c-indent-level: 8
25270+ * c-basic-offset: 8
25271+ * tab-width: 8
25272+ * End:
25273+ */
25274diff -Nur linux-2.6.16.33-noxen/arch/ia64/xen/xcom_hcall.c linux-2.6.16.33/arch/ia64/xen/xcom_hcall.c
25275--- linux-2.6.16.33-noxen/arch/ia64/xen/xcom_hcall.c 1970-01-01 00:00:00.000000000 +0000
25276+++ linux-2.6.16.33/arch/ia64/xen/xcom_hcall.c 2007-01-08 15:00:45.000000000 +0000
25277@@ -0,0 +1,365 @@
25278+/*
25279+ * This program is free software; you can redistribute it and/or modify
25280+ * it under the terms of the GNU General Public License as published by
25281+ * the Free Software Foundation; either version 2 of the License, or
25282+ * (at your option) any later version.
25283+ *
25284+ * This program is distributed in the hope that it will be useful,
25285+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
25286+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
25287+ * GNU General Public License for more details.
25288+ *
25289+ * You should have received a copy of the GNU General Public License
25290+ * along with this program; if not, write to the Free Software
25291+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
25292+ *
25293+ * Tristan Gingold <tristan.gingold@bull.net>
25294+ */
25295+#include <linux/types.h>
25296+#include <linux/errno.h>
25297+#include <linux/kernel.h>
25298+#include <linux/gfp.h>
25299+#include <linux/module.h>
25300+#include <xen/interface/xen.h>
25301+#include <xen/interface/dom0_ops.h>
25302+#include <xen/interface/memory.h>
25303+#include <xen/interface/xencomm.h>
25304+#include <xen/interface/version.h>
25305+#include <xen/interface/sched.h>
25306+#include <xen/interface/event_channel.h>
25307+#include <xen/interface/physdev.h>
25308+#include <xen/interface/grant_table.h>
25309+#include <xen/interface/callback.h>
25310+#include <xen/interface/acm_ops.h>
25311+#include <xen/interface/hvm/params.h>
25312+#include <xen/interface/xenoprof.h>
25313+#include <asm/hypercall.h>
25314+#include <asm/page.h>
25315+#include <asm/uaccess.h>
25316+#include <asm/xen/xencomm.h>
25317+#include <asm/perfmon.h>
25318+
25319+/* Xencomm notes:
25320+ * This file defines hypercalls to be used by xencomm. The hypercalls simply
25321+ * create inline descriptors for pointers and then call the raw arch hypercall
25322+ * xencomm_arch_hypercall_XXX
25323+ *
25324+ * If the arch wants to directly use these hypercalls, simply define macros
25325+ * in asm/hypercall.h, eg:
25326+ * #define HYPERVISOR_sched_op xencomm_hypercall_sched_op
25327+ *
25328+ * The arch may also define HYPERVISOR_xxx as a function and do more operations
25329+ * before/after doing the hypercall.
25330+ *
25331+ * Note: because only inline descriptors are created, these functions must only
25332+ * be called with in-kernel memory parameters.
25333+ */
25334+
25335+int
25336+xencomm_hypercall_console_io(int cmd, int count, char *str)
25337+{
25338+ return xencomm_arch_hypercall_console_io
25339+ (cmd, count, xencomm_create_inline(str));
25340+}
25341+
25342+int
25343+xencomm_hypercall_event_channel_op(int cmd, void *op)
25344+{
25345+ return xencomm_arch_hypercall_event_channel_op
25346+ (cmd, xencomm_create_inline(op));
25347+}
25348+
25349+int
25350+xencomm_hypercall_xen_version(int cmd, void *arg)
25351+{
25352+ switch (cmd) {
25353+ case XENVER_version:
25354+ case XENVER_extraversion:
25355+ case XENVER_compile_info:
25356+ case XENVER_capabilities:
25357+ case XENVER_changeset:
25358+ case XENVER_platform_parameters:
25359+ case XENVER_pagesize:
25360+ case XENVER_get_features:
25361+ break;
25362+ default:
25363+ printk("%s: unknown version cmd %d\n", __func__, cmd);
25364+ return -ENOSYS;
25365+ }
25366+
25367+ return xencomm_arch_hypercall_xen_version
25368+ (cmd, xencomm_create_inline(arg));
25369+}
25370+
25371+int
25372+xencomm_hypercall_physdev_op(int cmd, void *op)
25373+{
25374+ return xencomm_arch_hypercall_physdev_op
25375+ (cmd, xencomm_create_inline(op));
25376+}
25377+
25378+static void *
25379+xencommize_grant_table_op(unsigned int cmd, void *op, unsigned int count)
25380+{
25381+ switch (cmd) {
25382+ case GNTTABOP_map_grant_ref:
25383+ case GNTTABOP_unmap_grant_ref:
25384+ break;
25385+ case GNTTABOP_setup_table:
25386+ {
25387+ struct gnttab_setup_table *setup = op;
25388+ struct xencomm_handle *frame_list;
25389+
25390+ frame_list = xencomm_create_inline
25391+ (xen_guest_handle(setup->frame_list));
25392+
25393+ set_xen_guest_handle(setup->frame_list, (void *)frame_list);
25394+ break;
25395+ }
25396+ case GNTTABOP_dump_table:
25397+ case GNTTABOP_transfer:
25398+ case GNTTABOP_copy:
25399+ break;
25400+ default:
25401+ printk("%s: unknown grant table op %d\n", __func__, cmd);
25402+ BUG();
25403+ }
25404+
25405+ return xencomm_create_inline(op);
25406+}
25407+
25408+int
25409+xencomm_hypercall_grant_table_op(unsigned int cmd, void *op, unsigned int count)
25410+{
25411+	void *desc = xencommize_grant_table_op(cmd, op, count);
25412+
25413+ return xencomm_arch_hypercall_grant_table_op(cmd, desc, count);
25414+}
25415+
25416+int
25417+xencomm_hypercall_sched_op(int cmd, void *arg)
25418+{
25419+ switch (cmd) {
25420+ case SCHEDOP_yield:
25421+ case SCHEDOP_block:
25422+ case SCHEDOP_shutdown:
25423+ case SCHEDOP_remote_shutdown:
25424+ break;
25425+ case SCHEDOP_poll:
25426+ {
25427+ sched_poll_t *poll = arg;
25428+ struct xencomm_handle *ports;
25429+
25430+ ports = xencomm_create_inline(xen_guest_handle(poll->ports));
25431+
25432+ set_xen_guest_handle(poll->ports, (void *)ports);
25433+ break;
25434+ }
25435+ default:
25436+ printk("%s: unknown sched op %d\n", __func__, cmd);
25437+ return -ENOSYS;
25438+ }
25439+
25440+ return xencomm_arch_hypercall_sched_op(cmd, xencomm_create_inline(arg));
25441+}
25442+
25443+int
25444+xencomm_hypercall_multicall(void *call_list, int nr_calls)
25445+{
25446+ int i;
25447+ multicall_entry_t *mce;
25448+
25449+ for (i = 0; i < nr_calls; i++) {
25450+ mce = (multicall_entry_t *)call_list + i;
25451+
25452+ switch (mce->op) {
25453+ case __HYPERVISOR_update_va_mapping:
25454+ case __HYPERVISOR_mmu_update:
25455+ /* No-op on ia64. */
25456+ break;
25457+ case __HYPERVISOR_grant_table_op:
25458+ mce->args[1] = (unsigned long)xencommize_grant_table_op
25459+ (mce->args[0], (void *)mce->args[1],
25460+ mce->args[2]);
25461+ break;
25462+ case __HYPERVISOR_memory_op:
25463+ default:
25464+			printk("%s: unhandled multicall entry op %lu\n",
25465+ __func__, mce->op);
25466+ return -ENOSYS;
25467+ }
25468+ }
25469+
25470+ return xencomm_arch_hypercall_multicall
25471+ (xencomm_create_inline(call_list), nr_calls);
25472+}
25473+
25474+int
25475+xencomm_hypercall_callback_op(int cmd, void *arg)
25476+{
25477+ switch (cmd)
25478+ {
25479+ case CALLBACKOP_register:
25480+ case CALLBACKOP_unregister:
25481+ break;
25482+ default:
25483+ printk("%s: unknown callback op %d\n", __func__, cmd);
25484+ return -ENOSYS;
25485+ }
25486+
25487+ return xencomm_arch_hypercall_callback_op
25488+ (cmd, xencomm_create_inline(arg));
25489+}
25490+
25491+static void
25492+xencommize_memory_reservation (xen_memory_reservation_t *mop)
25493+{
25494+ struct xencomm_handle *desc;
25495+
25496+ desc = xencomm_create_inline(xen_guest_handle(mop->extent_start));
25497+ set_xen_guest_handle(mop->extent_start, (void *)desc);
25498+}
25499+
25500+int
25501+xencomm_hypercall_memory_op(unsigned int cmd, void *arg)
25502+{
25503+ XEN_GUEST_HANDLE(xen_pfn_t) extent_start_va[2];
25504+ xen_memory_reservation_t *xmr = NULL, *xme_in = NULL, *xme_out = NULL;
25505+ int rc;
25506+
25507+ switch (cmd) {
25508+ case XENMEM_increase_reservation:
25509+ case XENMEM_decrease_reservation:
25510+ case XENMEM_populate_physmap:
25511+ xmr = (xen_memory_reservation_t *)arg;
25512+ xen_guest_handle(extent_start_va[0]) =
25513+ xen_guest_handle(xmr->extent_start);
25514+ xencommize_memory_reservation((xen_memory_reservation_t *)arg);
25515+ break;
25516+
25517+ case XENMEM_maximum_ram_page:
25518+ break;
25519+
25520+ case XENMEM_exchange:
25521+ xme_in = &((xen_memory_exchange_t *)arg)->in;
25522+ xme_out = &((xen_memory_exchange_t *)arg)->out;
25523+ xen_guest_handle(extent_start_va[0]) =
25524+ xen_guest_handle(xme_in->extent_start);
25525+ xen_guest_handle(extent_start_va[1]) =
25526+ xen_guest_handle(xme_out->extent_start);
25527+ xencommize_memory_reservation
25528+ (&((xen_memory_exchange_t *)arg)->in);
25529+ xencommize_memory_reservation
25530+ (&((xen_memory_exchange_t *)arg)->out);
25531+ break;
25532+
25533+ default:
25534+ printk("%s: unknown memory op %d\n", __func__, cmd);
25535+ return -ENOSYS;
25536+ }
25537+
25538+ rc = xencomm_arch_hypercall_memory_op(cmd, xencomm_create_inline(arg));
25539+
25540+ switch (cmd) {
25541+ case XENMEM_increase_reservation:
25542+ case XENMEM_decrease_reservation:
25543+ case XENMEM_populate_physmap:
25544+ xen_guest_handle(xmr->extent_start) =
25545+ xen_guest_handle(extent_start_va[0]);
25546+ break;
25547+
25548+ case XENMEM_exchange:
25549+ xen_guest_handle(xme_in->extent_start) =
25550+ xen_guest_handle(extent_start_va[0]);
25551+ xen_guest_handle(xme_out->extent_start) =
25552+ xen_guest_handle(extent_start_va[1]);
25553+ break;
25554+ }
25555+
25556+ return rc;
25557+}
25558+
25559+unsigned long
25560+xencomm_hypercall_hvm_op(int cmd, void *arg)
25561+{
25562+ switch (cmd) {
25563+ case HVMOP_set_param:
25564+ case HVMOP_get_param:
25565+ break;
25566+ default:
25567+ printk("%s: unknown hvm op %d\n", __func__, cmd);
25568+ return -ENOSYS;
25569+ }
25570+
25571+ return xencomm_arch_hypercall_hvm_op(cmd, xencomm_create_inline(arg));
25572+}
25573+
25574+int
25575+xencomm_hypercall_suspend(unsigned long srec)
25576+{
25577+ struct sched_shutdown arg;
25578+
25579+ arg.reason = SHUTDOWN_suspend;
25580+
25581+ return xencomm_arch_hypercall_suspend(xencomm_create_inline(&arg));
25582+}
25583+
25584+int
25585+xencomm_hypercall_xenoprof_op(int op, void *arg)
25586+{
25587+ switch (op) {
25588+ case XENOPROF_init:
25589+ case XENOPROF_set_active:
25590+ case XENOPROF_set_passive:
25591+ case XENOPROF_counter:
25592+ case XENOPROF_get_buffer:
25593+ break;
25594+
25595+ case XENOPROF_reset_active_list:
25596+ case XENOPROF_reset_passive_list:
25597+ case XENOPROF_reserve_counters:
25598+ case XENOPROF_setup_events:
25599+ case XENOPROF_enable_virq:
25600+ case XENOPROF_start:
25601+ case XENOPROF_stop:
25602+ case XENOPROF_disable_virq:
25603+ case XENOPROF_release_counters:
25604+ case XENOPROF_shutdown:
25605+ return xencomm_arch_hypercall_xenoprof_op(op, arg);
25606+ break;
25607+
25608+ default:
25609+ printk("%s: op %d isn't supported\n", __func__, op);
25610+ return -ENOSYS;
25611+ }
25612+ return xencomm_arch_hypercall_xenoprof_op(op,
25613+ xencomm_create_inline(arg));
25614+}
25615+
25616+int
25617+xencomm_hypercall_perfmon_op(unsigned long cmd, void* arg, unsigned long count)
25618+{
25619+ switch (cmd) {
25620+ case PFM_GET_FEATURES:
25621+ case PFM_CREATE_CONTEXT:
25622+ case PFM_WRITE_PMCS:
25623+ case PFM_WRITE_PMDS:
25624+ case PFM_LOAD_CONTEXT:
25625+ break;
25626+
25627+ case PFM_DESTROY_CONTEXT:
25628+ case PFM_UNLOAD_CONTEXT:
25629+ case PFM_START:
25630+ case PFM_STOP:
25631+ return xencomm_arch_hypercall_perfmon_op(cmd, arg, count);
25632+
25633+ default:
25634+ printk("%s:%d cmd %ld isn't supported\n",
25635+ __func__,__LINE__, cmd);
25636+ BUG();
25637+ }
25638+
25639+ return xencomm_arch_hypercall_perfmon_op(cmd,
25640+ xencomm_create_inline(arg),
25641+ count);
25642+}
25643diff -Nur linux-2.6.16.33-noxen/arch/ia64/xen/xcom_mini.c linux-2.6.16.33/arch/ia64/xen/xcom_mini.c
25644--- linux-2.6.16.33-noxen/arch/ia64/xen/xcom_mini.c 1970-01-01 00:00:00.000000000 +0000
25645+++ linux-2.6.16.33/arch/ia64/xen/xcom_mini.c 2007-01-08 15:00:45.000000000 +0000
25646@@ -0,0 +1,417 @@
25647+/*
25648+ * This program is free software; you can redistribute it and/or modify
25649+ * it under the terms of the GNU General Public License as published by
25650+ * the Free Software Foundation; either version 2 of the License, or
25651+ * (at your option) any later version.
25652+ *
25653+ * This program is distributed in the hope that it will be useful,
25654+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
25655+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
25656+ * GNU General Public License for more details.
25657+ *
25658+ * You should have received a copy of the GNU General Public License
25659+ * along with this program; if not, write to the Free Software
25660+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
25661+ *
25662+ * Tristan Gingold <tristan.gingold@bull.net>
25663+ */
25664+#include <linux/types.h>
25665+#include <linux/errno.h>
25666+#include <linux/kernel.h>
25667+#include <linux/module.h>
25668+#include <xen/interface/xen.h>
25669+#include <xen/interface/dom0_ops.h>
25670+#include <xen/interface/memory.h>
25671+#include <xen/interface/xencomm.h>
25672+#include <xen/interface/version.h>
25673+#include <xen/interface/event_channel.h>
25674+#include <xen/interface/physdev.h>
25675+#include <xen/interface/grant_table.h>
25676+#include <xen/interface/hvm/params.h>
25677+#include <xen/interface/xenoprof.h>
25678+#ifdef CONFIG_VMX_GUEST
25679+#include <asm/hypervisor.h>
25680+#else
25681+#include <asm/hypercall.h>
25682+#endif
25683+#include <asm/xen/xencomm.h>
25684+#include <asm/perfmon.h>
25685+
25686+int
25687+xencomm_mini_hypercall_event_channel_op(int cmd, void *op)
25688+{
25689+ struct xencomm_mini xc_area[2];
25690+ int nbr_area = 2;
25691+ struct xencomm_handle *desc;
25692+ int rc;
25693+
25694+ rc = xencomm_create_mini(xc_area, &nbr_area,
25695+ op, sizeof(evtchn_op_t), &desc);
25696+ if (rc)
25697+ return rc;
25698+
25699+ return xencomm_arch_hypercall_event_channel_op(cmd, desc);
25700+}
25701+EXPORT_SYMBOL(xencomm_mini_hypercall_event_channel_op);
25702+
25703+static int
25704+xencommize_mini_grant_table_op(struct xencomm_mini *xc_area, int *nbr_area,
25705+ unsigned int cmd, void *op, unsigned int count,
25706+ struct xencomm_handle **desc)
25707+{
25708+ struct xencomm_handle *desc1;
25709+ unsigned int argsize;
25710+ int rc;
25711+
25712+ switch (cmd) {
25713+ case GNTTABOP_map_grant_ref:
25714+ argsize = sizeof(struct gnttab_map_grant_ref);
25715+ break;
25716+ case GNTTABOP_unmap_grant_ref:
25717+ argsize = sizeof(struct gnttab_unmap_grant_ref);
25718+ break;
25719+ case GNTTABOP_setup_table:
25720+ {
25721+ struct gnttab_setup_table *setup = op;
25722+
25723+ argsize = sizeof(*setup);
25724+
25725+ if (count != 1)
25726+ return -EINVAL;
25727+ rc = xencomm_create_mini
25728+ (xc_area, nbr_area,
25729+ xen_guest_handle(setup->frame_list),
25730+ setup->nr_frames
25731+ * sizeof(*xen_guest_handle(setup->frame_list)),
25732+ &desc1);
25733+ if (rc)
25734+ return rc;
25735+ set_xen_guest_handle(setup->frame_list, (void *)desc1);
25736+ break;
25737+ }
25738+ case GNTTABOP_dump_table:
25739+ argsize = sizeof(struct gnttab_dump_table);
25740+ break;
25741+ case GNTTABOP_transfer:
25742+ argsize = sizeof(struct gnttab_transfer);
25743+ break;
25744+ case GNTTABOP_copy:
25745+ argsize = sizeof(struct gnttab_copy);
25746+ break;
25747+ default:
25748+ printk("%s: unknown mini grant table op %d\n", __func__, cmd);
25749+ BUG();
25750+ }
25751+
25752+ rc = xencomm_create_mini(xc_area, nbr_area, op, count * argsize, desc);
25753+ if (rc)
25754+ return rc;
25755+
25756+ return 0;
25757+}
25758+
25759+int
25760+xencomm_mini_hypercall_grant_table_op(unsigned int cmd, void *op,
25761+ unsigned int count)
25762+{
25763+ int rc;
25764+ struct xencomm_handle *desc;
25765+ int nbr_area = 2;
25766+ struct xencomm_mini xc_area[2];
25767+
25768+ rc = xencommize_mini_grant_table_op(xc_area, &nbr_area,
25769+ cmd, op, count, &desc);
25770+ if (rc)
25771+ return rc;
25772+
25773+ return xencomm_arch_hypercall_grant_table_op(cmd, desc, count);
25774+}
25775+EXPORT_SYMBOL(xencomm_mini_hypercall_grant_table_op);
25776+
25777+int
25778+xencomm_mini_hypercall_multicall(void *call_list, int nr_calls)
25779+{
25780+ int i;
25781+ multicall_entry_t *mce;
25782+ int nbr_area = 2 + nr_calls * 3;
25783+ struct xencomm_mini xc_area[nbr_area];
25784+ struct xencomm_handle *desc;
25785+ int rc;
25786+
25787+ for (i = 0; i < nr_calls; i++) {
25788+ mce = (multicall_entry_t *)call_list + i;
25789+
25790+ switch (mce->op) {
25791+ case __HYPERVISOR_update_va_mapping:
25792+ case __HYPERVISOR_mmu_update:
25793+ /* No-op on ia64. */
25794+ break;
25795+ case __HYPERVISOR_grant_table_op:
25796+ rc = xencommize_mini_grant_table_op
25797+ (xc_area, &nbr_area,
25798+ mce->args[0], (void *)mce->args[1],
25799+ mce->args[2], &desc);
25800+ if (rc)
25801+ return rc;
25802+ mce->args[1] = (unsigned long)desc;
25803+ break;
25804+ case __HYPERVISOR_memory_op:
25805+ default:
25806+ printk("%s: unhandled multicall op entry op %lu\n",
25807+ __func__, mce->op);
25808+ return -ENOSYS;
25809+ }
25810+ }
25811+
25812+ rc = xencomm_create_mini(xc_area, &nbr_area, call_list,
25813+ nr_calls * sizeof(multicall_entry_t), &desc);
25814+ if (rc)
25815+ return rc;
25816+
25817+ return xencomm_arch_hypercall_multicall(desc, nr_calls);
25818+}
25819+EXPORT_SYMBOL(xencomm_mini_hypercall_multicall);
25820+
25821+static int
25822+xencommize_mini_memory_reservation(struct xencomm_mini *area, int *nbr_area,
25823+ xen_memory_reservation_t *mop)
25824+{
25825+ struct xencomm_handle *desc;
25826+ int rc;
25827+
25828+ rc = xencomm_create_mini
25829+ (area, nbr_area,
25830+ xen_guest_handle(mop->extent_start),
25831+ mop->nr_extents
25832+ * sizeof(*xen_guest_handle(mop->extent_start)),
25833+ &desc);
25834+ if (rc)
25835+ return rc;
25836+
25837+ set_xen_guest_handle(mop->extent_start, (void *)desc);
25838+
25839+ return 0;
25840+}
25841+
25842+int
25843+xencomm_mini_hypercall_memory_op(unsigned int cmd, void *arg)
25844+{
25845+ int nbr_area = 4;
25846+ struct xencomm_mini xc_area[4];
25847+ struct xencomm_handle *desc;
25848+ int rc;
25849+ unsigned int argsize;
25850+
25851+ switch (cmd) {
25852+ case XENMEM_increase_reservation:
25853+ case XENMEM_decrease_reservation:
25854+ case XENMEM_populate_physmap:
25855+ argsize = sizeof(xen_memory_reservation_t);
25856+ rc = xencommize_mini_memory_reservation
25857+ (xc_area, &nbr_area, (xen_memory_reservation_t *)arg);
25858+ if (rc)
25859+ return rc;
25860+ break;
25861+
25862+ case XENMEM_maximum_ram_page:
25863+ argsize = 0;
25864+ break;
25865+
25866+ case XENMEM_exchange:
25867+ argsize = sizeof(xen_memory_exchange_t);
25868+ rc = xencommize_mini_memory_reservation
25869+ (xc_area, &nbr_area,
25870+ &((xen_memory_exchange_t *)arg)->in);
25871+ if (rc)
25872+ return rc;
25873+ rc = xencommize_mini_memory_reservation
25874+ (xc_area, &nbr_area,
25875+ &((xen_memory_exchange_t *)arg)->out);
25876+ if (rc)
25877+ return rc;
25878+ break;
25879+
25880+ case XENMEM_add_to_physmap:
25881+ argsize = sizeof (xen_add_to_physmap_t);
25882+ break;
25883+
25884+ default:
25885+ printk("%s: unknown mini memory op %d\n", __func__, cmd);
25886+ return -ENOSYS;
25887+ }
25888+
25889+ rc = xencomm_create_mini(xc_area, &nbr_area, arg, argsize, &desc);
25890+ if (rc)
25891+ return rc;
25892+
25893+ return xencomm_arch_hypercall_memory_op(cmd, desc);
25894+}
25895+EXPORT_SYMBOL(xencomm_mini_hypercall_memory_op);
25896+
25897+unsigned long
25898+xencomm_mini_hypercall_hvm_op(int cmd, void *arg)
25899+{
25900+ struct xencomm_handle *desc;
25901+ int nbr_area = 2;
25902+ struct xencomm_mini xc_area[2];
25903+ unsigned int argsize;
25904+ int rc;
25905+
25906+ switch (cmd) {
25907+ case HVMOP_get_param:
25908+ case HVMOP_set_param:
25909+ argsize = sizeof(xen_hvm_param_t);
25910+ break;
25911+ default:
25912+ printk("%s: unknown HVMOP %d\n", __func__, cmd);
25913+ return -EINVAL;
25914+ }
25915+
25916+ rc = xencomm_create_mini(xc_area, &nbr_area, arg, argsize, &desc);
25917+ if (rc)
25918+ return rc;
25919+
25920+ return xencomm_arch_hypercall_hvm_op(cmd, desc);
25921+}
25922+EXPORT_SYMBOL(xencomm_mini_hypercall_hvm_op);
25923+
25924+int
25925+xencomm_mini_hypercall_xen_version(int cmd, void *arg)
25926+{
25927+ struct xencomm_handle *desc;
25928+ int nbr_area = 2;
25929+ struct xencomm_mini xc_area[2];
25930+ unsigned int argsize;
25931+ int rc;
25932+
25933+ switch (cmd) {
25934+ case XENVER_version:
25935+ /* do not actually pass an argument */
25936+ return xencomm_arch_hypercall_xen_version(cmd, 0);
25937+ case XENVER_extraversion:
25938+ argsize = sizeof(xen_extraversion_t);
25939+ break;
25940+ case XENVER_compile_info:
25941+ argsize = sizeof(xen_compile_info_t);
25942+ break;
25943+ case XENVER_capabilities:
25944+ argsize = sizeof(xen_capabilities_info_t);
25945+ break;
25946+ case XENVER_changeset:
25947+ argsize = sizeof(xen_changeset_info_t);
25948+ break;
25949+ case XENVER_platform_parameters:
25950+ argsize = sizeof(xen_platform_parameters_t);
25951+ break;
25952+ case XENVER_pagesize:
25953+ argsize = (arg == NULL) ? 0 : sizeof(void *);
25954+ break;
25955+ case XENVER_get_features:
25956+ argsize = (arg == NULL) ? 0 : sizeof(xen_feature_info_t);
25957+ break;
25958+
25959+ default:
25960+ printk("%s: unknown version op %d\n", __func__, cmd);
25961+ return -ENOSYS;
25962+ }
25963+
25964+ rc = xencomm_create_mini(xc_area, &nbr_area, arg, argsize, &desc);
25965+ if (rc)
25966+ return rc;
25967+
25968+ return xencomm_arch_hypercall_xen_version(cmd, desc);
25969+}
25970+EXPORT_SYMBOL(xencomm_mini_hypercall_xen_version);
25971+
25972+int
25973+xencomm_mini_hypercall_xenoprof_op(int op, void *arg)
25974+{
25975+ unsigned int argsize;
25976+ struct xencomm_mini xc_area[2];
25977+ int nbr_area = 2;
25978+ struct xencomm_handle *desc;
25979+ int rc;
25980+
25981+ switch (op) {
25982+ case XENOPROF_init:
25983+ argsize = sizeof(xenoprof_init_t);
25984+ break;
25985+ case XENOPROF_set_active:
25986+ argsize = sizeof(domid_t);
25987+ break;
25988+ case XENOPROF_set_passive:
25989+ argsize = sizeof(xenoprof_passive_t);
25990+ break;
25991+ case XENOPROF_counter:
25992+ argsize = sizeof(xenoprof_counter_t);
25993+ break;
25994+ case XENOPROF_get_buffer:
25995+ argsize = sizeof(xenoprof_get_buffer_t);
25996+ break;
25997+
25998+ case XENOPROF_reset_active_list:
25999+ case XENOPROF_reset_passive_list:
26000+ case XENOPROF_reserve_counters:
26001+ case XENOPROF_setup_events:
26002+ case XENOPROF_enable_virq:
26003+ case XENOPROF_start:
26004+ case XENOPROF_stop:
26005+ case XENOPROF_disable_virq:
26006+ case XENOPROF_release_counters:
26007+ case XENOPROF_shutdown:
26008+ return xencomm_arch_hypercall_xenoprof_op(op, arg);
26009+
26010+ default:
26011+ printk("%s: op %d isn't supported\n", __func__, op);
26012+ return -ENOSYS;
26013+ }
26014+ rc = xencomm_create_mini(xc_area, &nbr_area, arg, argsize, &desc);
26015+ if (rc)
26016+ return rc;
26017+ return xencomm_arch_hypercall_xenoprof_op(op, desc);
26018+}
26019+EXPORT_SYMBOL_GPL(xencomm_mini_hypercall_xenoprof_op);
26020+
26021+int
26022+xencomm_mini_hypercall_perfmon_op(unsigned long cmd, void* arg,
26023+ unsigned long count)
26024+{
26025+ unsigned int argsize;
26026+ struct xencomm_mini xc_area[2];
26027+ int nbr_area = 2;
26028+ struct xencomm_handle *desc;
26029+ int rc;
26030+
26031+ switch (cmd) {
26032+ case PFM_GET_FEATURES:
26033+ argsize = sizeof(pfarg_features_t);
26034+ break;
26035+ case PFM_CREATE_CONTEXT:
26036+ argsize = sizeof(pfarg_context_t);
26037+ break;
26038+ case PFM_LOAD_CONTEXT:
26039+ argsize = sizeof(pfarg_load_t);
26040+ break;
26041+ case PFM_WRITE_PMCS:
26042+ case PFM_WRITE_PMDS:
26043+ argsize = sizeof(pfarg_reg_t) * count;
26044+ break;
26045+
26046+ case PFM_DESTROY_CONTEXT:
26047+ case PFM_UNLOAD_CONTEXT:
26048+ case PFM_START:
26049+ case PFM_STOP:
26050+ return xencomm_arch_hypercall_perfmon_op(cmd, arg, count);
26051+
26052+ default:
26053+ printk("%s:%d cmd %ld isn't supported\n",
26054+ __func__, __LINE__, cmd);
26055+ BUG();
26056+ }
26057+
26058+ rc = xencomm_create_mini(xc_area, &nbr_area, arg, argsize, &desc);
26059+ if (rc)
26060+ return rc;
26061+ return xencomm_arch_hypercall_perfmon_op(cmd, desc, count);
26062+}
26063+EXPORT_SYMBOL_GPL(xencomm_mini_hypercall_perfmon_op);
26064diff -Nur linux-2.6.16.33-noxen/arch/ia64/xen/xcom_privcmd.c linux-2.6.16.33/arch/ia64/xen/xcom_privcmd.c
26065--- linux-2.6.16.33-noxen/arch/ia64/xen/xcom_privcmd.c 1970-01-01 00:00:00.000000000 +0000
26066+++ linux-2.6.16.33/arch/ia64/xen/xcom_privcmd.c 2007-01-08 15:00:45.000000000 +0000
26067@@ -0,0 +1,663 @@
26068+/*
26069+ * This program is free software; you can redistribute it and/or modify
26070+ * it under the terms of the GNU General Public License as published by
26071+ * the Free Software Foundation; either version 2 of the License, or
26072+ * (at your option) any later version.
26073+ *
26074+ * This program is distributed in the hope that it will be useful,
26075+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
26076+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
26077+ * GNU General Public License for more details.
26078+ *
26079+ * You should have received a copy of the GNU General Public License
26080+ * along with this program; if not, write to the Free Software
26081+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
26082+ *
26083+ * Authors: Hollis Blanchard <hollisb@us.ibm.com>
26084+ * Tristan Gingold <tristan.gingold@bull.net>
26085+ */
26086+#include <linux/types.h>
26087+#include <linux/errno.h>
26088+#include <linux/kernel.h>
26089+#include <linux/gfp.h>
26090+#include <linux/module.h>
26091+#include <xen/interface/xen.h>
26092+#include <xen/interface/dom0_ops.h>
26093+#define __XEN__
26094+#include <xen/interface/domctl.h>
26095+#include <xen/interface/sysctl.h>
26096+#include <xen/interface/memory.h>
26097+#include <xen/interface/version.h>
26098+#include <xen/interface/event_channel.h>
26099+#include <xen/interface/acm_ops.h>
26100+#include <xen/interface/hvm/params.h>
26101+#include <xen/public/privcmd.h>
26102+#include <asm/hypercall.h>
26103+#include <asm/page.h>
26104+#include <asm/uaccess.h>
26105+#include <asm/xen/xencomm.h>
26106+
26107+#define ROUND_DIV(v,s) (((v) + (s) - 1) / (s))
26108+
26109+static int
26110+xencomm_privcmd_dom0_op(privcmd_hypercall_t *hypercall)
26111+{
26112+ dom0_op_t kern_op;
26113+ dom0_op_t __user *user_op = (dom0_op_t __user *)hypercall->arg[0];
26114+ struct xencomm_handle *op_desc;
26115+ struct xencomm_handle *desc = NULL;
26116+ int ret = 0;
26117+
26118+ if (copy_from_user(&kern_op, user_op, sizeof(dom0_op_t)))
26119+ return -EFAULT;
26120+
26121+ if (kern_op.interface_version != DOM0_INTERFACE_VERSION)
26122+ return -EACCES;
26123+
26124+ op_desc = xencomm_create_inline(&kern_op);
26125+
26126+ switch (kern_op.cmd) {
26127+ default:
26128+ printk("%s: unknown dom0 cmd %d\n", __func__, kern_op.cmd);
26129+ return -ENOSYS;
26130+ }
26131+
26132+ if (ret) {
26133+ /* error mapping the nested pointer */
26134+ return ret;
26135+ }
26136+
26137+ ret = xencomm_arch_hypercall_dom0_op(op_desc);
26138+
26139+ /* FIXME: should we restore the handle? */
26140+ if (copy_to_user(user_op, &kern_op, sizeof(dom0_op_t)))
26141+ ret = -EFAULT;
26142+
26143+ if (desc)
26144+ xencomm_free(desc);
26145+ return ret;
26146+}
26147+
26148+/*
26149+ * Temporarily disable the NUMA PHYSINFO code until the rest of the
26150+ * changes are upstream.
26151+ */
26152+#undef IA64_NUMA_PHYSINFO
26153+
26154+static int
26155+xencomm_privcmd_sysctl(privcmd_hypercall_t *hypercall)
26156+{
26157+ xen_sysctl_t kern_op;
26158+ xen_sysctl_t __user *user_op;
26159+ struct xencomm_handle *op_desc;
26160+ struct xencomm_handle *desc = NULL;
26161+ struct xencomm_handle *desc1 = NULL;
26162+ int ret = 0;
26163+
26164+ user_op = (xen_sysctl_t __user *)hypercall->arg[0];
26165+
26166+ if (copy_from_user(&kern_op, user_op, sizeof(xen_sysctl_t)))
26167+ return -EFAULT;
26168+
26169+ if (kern_op.interface_version != XEN_SYSCTL_INTERFACE_VERSION)
26170+ return -EACCES;
26171+
26172+ op_desc = xencomm_create_inline(&kern_op);
26173+
26174+ switch (kern_op.cmd) {
26175+ case XEN_SYSCTL_readconsole:
26176+ ret = xencomm_create(
26177+ xen_guest_handle(kern_op.u.readconsole.buffer),
26178+ kern_op.u.readconsole.count,
26179+ &desc, GFP_KERNEL);
26180+ set_xen_guest_handle(kern_op.u.readconsole.buffer,
26181+ (void *)desc);
26182+ break;
26183+ case XEN_SYSCTL_tbuf_op:
26184+#ifndef IA64_NUMA_PHYSINFO
26185+ case XEN_SYSCTL_physinfo:
26186+#endif
26187+ case XEN_SYSCTL_sched_id:
26188+ break;
26189+ case XEN_SYSCTL_perfc_op:
26190+ {
26191+ struct xencomm_handle *tmp_desc;
26192+ xen_sysctl_t tmp_op = {
26193+ .cmd = XEN_SYSCTL_perfc_op,
26194+ .interface_version = XEN_SYSCTL_INTERFACE_VERSION,
26195+ .u.perfc_op = {
26196+ .cmd = XEN_SYSCTL_PERFCOP_query,
26197+ // .desc.p = NULL,
26198+ // .val.p = NULL,
26199+ },
26200+ };
26201+
26202+ if (xen_guest_handle(kern_op.u.perfc_op.desc) == NULL) {
26203+ if (xen_guest_handle(kern_op.u.perfc_op.val) != NULL)
26204+ return -EINVAL;
26205+ break;
26206+ }
26207+
26208+ /* query the buffer size for xencomm */
26209+ tmp_desc = xencomm_create_inline(&tmp_op);
26210+ ret = xencomm_arch_hypercall_sysctl(tmp_desc);
26211+ if (ret)
26212+ return ret;
26213+
26214+ ret = xencomm_create(xen_guest_handle(kern_op.u.perfc_op.desc),
26215+ tmp_op.u.perfc_op.nr_counters *
26216+ sizeof(xen_sysctl_perfc_desc_t),
26217+ &desc, GFP_KERNEL);
26218+ if (ret)
26219+ return ret;
26220+
26221+ set_xen_guest_handle(kern_op.u.perfc_op.desc, (void *)desc);
26222+
26223+ ret = xencomm_create(xen_guest_handle(kern_op.u.perfc_op.val),
26224+ tmp_op.u.perfc_op.nr_vals *
26225+ sizeof(xen_sysctl_perfc_val_t),
26226+ &desc1, GFP_KERNEL);
26227+ if (ret)
26228+ xencomm_free(desc);
26229+
26230+ set_xen_guest_handle(kern_op.u.perfc_op.val, (void *)desc1);
26231+ break;
26232+ }
26233+ case XEN_SYSCTL_getdomaininfolist:
26234+ ret = xencomm_create(
26235+ xen_guest_handle(kern_op.u.getdomaininfolist.buffer),
26236+ kern_op.u.getdomaininfolist.max_domains *
26237+ sizeof(xen_domctl_getdomaininfo_t),
26238+ &desc, GFP_KERNEL);
26239+ set_xen_guest_handle(kern_op.u.getdomaininfolist.buffer,
26240+ (void *)desc);
26241+ break;
26242+#ifdef IA64_NUMA_PHYSINFO
26243+ case XEN_SYSCTL_physinfo:
26244+ ret = xencomm_create(
26245+ xen_guest_handle(kern_op.u.physinfo.memory_chunks),
26246+ PUBLIC_MAXCHUNKS * sizeof(node_data_t),
26247+ &desc, GFP_KERNEL);
26248+ if (ret)
26249+ return ret;
26250+ set_xen_guest_handle(kern_op.u.physinfo.memory_chunks,
26251+ (void *)desc);
26252+
26253+ ret = xencomm_create(
26254+ xen_guest_handle(kern_op.u.physinfo.cpu_to_node),
26255+ PUBLIC_MAX_NUMNODES * sizeof(u64),
26256+ &desc1, GFP_KERNEL);
26257+ if (ret)
26258+ xencomm_free(desc);
26259+ set_xen_guest_handle(kern_op.u.physinfo.cpu_to_node,
26260+ (void *)desc1);
26261+ break;
26262+#endif
26263+ default:
26264+ printk("%s: unknown sysctl cmd %d\n", __func__, kern_op.cmd);
26265+ return -ENOSYS;
26266+ }
26267+
26268+ if (ret) {
26269+ /* error mapping the nested pointer */
26270+ return ret;
26271+ }
26272+
26273+ ret = xencomm_arch_hypercall_sysctl(op_desc);
26274+
26275+ /* FIXME: should we restore the handles? */
26276+ if (copy_to_user(user_op, &kern_op, sizeof(xen_sysctl_t)))
26277+ ret = -EFAULT;
26278+
26279+ if (desc)
26280+ xencomm_free(desc);
26281+ if (desc1)
26282+ xencomm_free(desc1);
26283+ return ret;
26284+}
26285+
26286+static int
26287+xencomm_privcmd_domctl(privcmd_hypercall_t *hypercall)
26288+{
26289+ xen_domctl_t kern_op;
26290+ xen_domctl_t __user *user_op;
26291+ struct xencomm_handle *op_desc;
26292+ struct xencomm_handle *desc = NULL;
26293+ int ret = 0;
26294+
26295+ user_op = (xen_domctl_t __user *)hypercall->arg[0];
26296+
26297+ if (copy_from_user(&kern_op, user_op, sizeof(xen_domctl_t)))
26298+ return -EFAULT;
26299+
26300+ if (kern_op.interface_version != XEN_DOMCTL_INTERFACE_VERSION)
26301+ return -EACCES;
26302+
26303+ op_desc = xencomm_create_inline(&kern_op);
26304+
26305+ switch (kern_op.cmd) {
26306+ case XEN_DOMCTL_createdomain:
26307+ case XEN_DOMCTL_destroydomain:
26308+ case XEN_DOMCTL_pausedomain:
26309+ case XEN_DOMCTL_unpausedomain:
26310+ case XEN_DOMCTL_getdomaininfo:
26311+ break;
26312+ case XEN_DOMCTL_getmemlist:
26313+ {
26314+ unsigned long nr_pages = kern_op.u.getmemlist.max_pfns;
26315+
26316+ ret = xencomm_create(
26317+ xen_guest_handle(kern_op.u.getmemlist.buffer),
26318+ nr_pages * sizeof(unsigned long),
26319+ &desc, GFP_KERNEL);
26320+ set_xen_guest_handle(kern_op.u.getmemlist.buffer,
26321+ (void *)desc);
26322+ break;
26323+ }
26324+ case XEN_DOMCTL_getpageframeinfo:
26325+ break;
26326+ case XEN_DOMCTL_getpageframeinfo2:
26327+ ret = xencomm_create(
26328+ xen_guest_handle(kern_op.u.getpageframeinfo2.array),
26329+ kern_op.u.getpageframeinfo2.num,
26330+ &desc, GFP_KERNEL);
26331+ set_xen_guest_handle(kern_op.u.getpageframeinfo2.array,
26332+ (void *)desc);
26333+ break;
26334+ case XEN_DOMCTL_shadow_op:
26335+ ret = xencomm_create(
26336+ xen_guest_handle(kern_op.u.shadow_op.dirty_bitmap),
26337+ ROUND_DIV(kern_op.u.shadow_op.pages, 8),
26338+ &desc, GFP_KERNEL);
26339+ set_xen_guest_handle(kern_op.u.shadow_op.dirty_bitmap,
26340+ (void *)desc);
26341+ break;
26342+ case XEN_DOMCTL_max_mem:
26343+ break;
26344+ case XEN_DOMCTL_setvcpucontext:
26345+ case XEN_DOMCTL_getvcpucontext:
26346+ ret = xencomm_create(
26347+ xen_guest_handle(kern_op.u.vcpucontext.ctxt),
26348+ sizeof(vcpu_guest_context_t),
26349+ &desc, GFP_KERNEL);
26350+ set_xen_guest_handle(kern_op.u.vcpucontext.ctxt, (void *)desc);
26351+ break;
26352+ case XEN_DOMCTL_getvcpuinfo:
26353+ break;
26354+ case XEN_DOMCTL_setvcpuaffinity:
26355+ case XEN_DOMCTL_getvcpuaffinity:
26356+ ret = xencomm_create(
26357+ xen_guest_handle(kern_op.u.vcpuaffinity.cpumap.bitmap),
26358+ ROUND_DIV(kern_op.u.vcpuaffinity.cpumap.nr_cpus, 8),
26359+ &desc, GFP_KERNEL);
26360+ set_xen_guest_handle(kern_op.u.vcpuaffinity.cpumap.bitmap,
26361+ (void *)desc);
26362+ break;
26363+ case XEN_DOMCTL_max_vcpus:
26364+ case XEN_DOMCTL_scheduler_op:
26365+ case XEN_DOMCTL_setdomainhandle:
26366+ case XEN_DOMCTL_setdebugging:
26367+ case XEN_DOMCTL_irq_permission:
26368+ case XEN_DOMCTL_iomem_permission:
26369+ case XEN_DOMCTL_ioport_permission:
26370+ case XEN_DOMCTL_hypercall_init:
26371+ case XEN_DOMCTL_arch_setup:
26372+ case XEN_DOMCTL_settimeoffset:
26373+ break;
26374+ default:
26375+ printk("%s: unknown domctl cmd %d\n", __func__, kern_op.cmd);
26376+ return -ENOSYS;
26377+ }
26378+
26379+ if (ret) {
26380+ /* error mapping the nested pointer */
26381+ return ret;
26382+ }
26383+
26384+ ret = xencomm_arch_hypercall_domctl (op_desc);
26385+
26386+ /* FIXME: should we restore the handle? */
26387+ if (copy_to_user(user_op, &kern_op, sizeof(xen_domctl_t)))
26388+ ret = -EFAULT;
26389+
26390+ if (desc)
26391+ xencomm_free(desc);
26392+ return ret;
26393+}
26394+
26395+static int
26396+xencomm_privcmd_acm_op(privcmd_hypercall_t *hypercall)
26397+{
26398+ int cmd = hypercall->arg[0];
26399+ void __user *arg = (void __user *)hypercall->arg[1];
26400+ struct xencomm_handle *op_desc;
26401+ struct xencomm_handle *desc = NULL;
26402+ int ret;
26403+
26404+ switch (cmd) {
26405+ case ACMOP_getssid:
26406+ {
26407+ struct acm_getssid kern_arg;
26408+
26409+ if (copy_from_user(&kern_arg, arg, sizeof (kern_arg)))
26410+ return -EFAULT;
26411+
26412+ op_desc = xencomm_create_inline(&kern_arg);
26413+
26414+ ret = xencomm_create(xen_guest_handle(kern_arg.ssidbuf),
26415+ kern_arg.ssidbuf_size, &desc, GFP_KERNEL);
26416+ if (ret)
26417+ return ret;
26418+
26419+ set_xen_guest_handle(kern_arg.ssidbuf, (void *)desc);
26420+
26421+ ret = xencomm_arch_hypercall_acm_op(cmd, op_desc);
26422+
26423+ xencomm_free(desc);
26424+
26425+ if (copy_to_user(arg, &kern_arg, sizeof (kern_arg)))
26426+ return -EFAULT;
26427+
26428+ return ret;
26429+ }
26430+ default:
26431+ printk("%s: unknown acm_op cmd %d\n", __func__, cmd);
26432+ return -ENOSYS;
26433+ }
26434+
26435+ return ret;
26436+}
26437+
26438+static int
26439+xencomm_privcmd_memory_op(privcmd_hypercall_t *hypercall)
26440+{
26441+ const unsigned long cmd = hypercall->arg[0];
26442+ int ret = 0;
26443+
26444+ switch (cmd) {
26445+ case XENMEM_increase_reservation:
26446+ case XENMEM_decrease_reservation:
26447+ case XENMEM_populate_physmap:
26448+ {
26449+ xen_memory_reservation_t kern_op;
26450+ xen_memory_reservation_t __user *user_op;
26451+ struct xencomm_handle *desc = NULL;
26452+ struct xencomm_handle *desc_op;
26453+
26454+ user_op = (xen_memory_reservation_t __user *)hypercall->arg[1];
26455+ if (copy_from_user(&kern_op, user_op,
26456+ sizeof(xen_memory_reservation_t)))
26457+ return -EFAULT;
26458+ desc_op = xencomm_create_inline(&kern_op);
26459+
26460+ if (xen_guest_handle(kern_op.extent_start)) {
26461+ void * addr;
26462+
26463+ addr = xen_guest_handle(kern_op.extent_start);
26464+ ret = xencomm_create
26465+ (addr,
26466+ kern_op.nr_extents *
26467+ sizeof(*xen_guest_handle
26468+ (kern_op.extent_start)),
26469+ &desc, GFP_KERNEL);
26470+ if (ret)
26471+ return ret;
26472+ set_xen_guest_handle(kern_op.extent_start,
26473+ (void *)desc);
26474+ }
26475+
26476+ ret = xencomm_arch_hypercall_memory_op(cmd, desc_op);
26477+
26478+ if (desc)
26479+ xencomm_free(desc);
26480+
26481+ if (ret != 0)
26482+ return ret;
26483+
26484+ if (copy_to_user(user_op, &kern_op,
26485+ sizeof(xen_memory_reservation_t)))
26486+ return -EFAULT;
26487+
26488+ return ret;
26489+ }
26490+ case XENMEM_translate_gpfn_list:
26491+ {
26492+ xen_translate_gpfn_list_t kern_op;
26493+ xen_translate_gpfn_list_t __user *user_op;
26494+ struct xencomm_handle *desc_gpfn = NULL;
26495+ struct xencomm_handle *desc_mfn = NULL;
26496+ struct xencomm_handle *desc_op;
26497+ void *addr;
26498+
26499+ user_op = (xen_translate_gpfn_list_t __user *)
26500+ hypercall->arg[1];
26501+ if (copy_from_user(&kern_op, user_op,
26502+ sizeof(xen_translate_gpfn_list_t)))
26503+ return -EFAULT;
26504+ desc_op = xencomm_create_inline(&kern_op);
26505+
26506+ if (kern_op.nr_gpfns) {
26507+ /* gpfn_list. */
26508+ addr = xen_guest_handle(kern_op.gpfn_list);
26509+
26510+ ret = xencomm_create(addr, kern_op.nr_gpfns *
26511+ sizeof(*xen_guest_handle
26512+ (kern_op.gpfn_list)),
26513+ &desc_gpfn, GFP_KERNEL);
26514+ if (ret)
26515+ return ret;
26516+ set_xen_guest_handle(kern_op.gpfn_list,
26517+ (void *)desc_gpfn);
26518+
26519+ /* mfn_list. */
26520+ addr = xen_guest_handle(kern_op.mfn_list);
26521+
26522+ ret = xencomm_create(addr, kern_op.nr_gpfns *
26523+ sizeof(*xen_guest_handle
26524+ (kern_op.mfn_list)),
26525+ &desc_mfn, GFP_KERNEL);
26526+ if (ret)
26527+ return ret;
26528+ set_xen_guest_handle(kern_op.mfn_list,
26529+ (void *)desc_mfn);
26530+ }
26531+
26532+ ret = xencomm_arch_hypercall_memory_op(cmd, desc_op);
26533+
26534+ if (desc_gpfn)
26535+ xencomm_free(desc_gpfn);
26536+
26537+ if (desc_mfn)
26538+ xencomm_free(desc_mfn);
26539+
26540+ if (ret != 0)
26541+ return ret;
26542+
26543+ return ret;
26544+ }
26545+ default:
26546+ printk("%s: unknown memory op %lu\n", __func__, cmd);
26547+ ret = -ENOSYS;
26548+ }
26549+ return ret;
26550+}
26551+
26552+static int
26553+xencomm_privcmd_xen_version(privcmd_hypercall_t *hypercall)
26554+{
26555+ int cmd = hypercall->arg[0];
26556+ void __user *arg = (void __user *)hypercall->arg[1];
26557+ struct xencomm_handle *desc;
26558+ size_t argsize;
26559+ int rc;
26560+
26561+ switch (cmd) {
26562+ case XENVER_version:
26563+ /* do not actually pass an argument */
26564+ return xencomm_arch_hypercall_xen_version(cmd, 0);
26565+ case XENVER_extraversion:
26566+ argsize = sizeof(xen_extraversion_t);
26567+ break;
26568+ case XENVER_compile_info:
26569+ argsize = sizeof(xen_compile_info_t);
26570+ break;
26571+ case XENVER_capabilities:
26572+ argsize = sizeof(xen_capabilities_info_t);
26573+ break;
26574+ case XENVER_changeset:
26575+ argsize = sizeof(xen_changeset_info_t);
26576+ break;
26577+ case XENVER_platform_parameters:
26578+ argsize = sizeof(xen_platform_parameters_t);
26579+ break;
26580+ case XENVER_pagesize:
26581+ argsize = (arg == NULL) ? 0 : sizeof(void *);
26582+ break;
26583+ case XENVER_get_features:
26584+ argsize = (arg == NULL) ? 0 : sizeof(xen_feature_info_t);
26585+ break;
26586+
26587+ default:
26588+ printk("%s: unknown version op %d\n", __func__, cmd);
26589+ return -ENOSYS;
26590+ }
26591+
26592+ rc = xencomm_create(arg, argsize, &desc, GFP_KERNEL);
26593+ if (rc)
26594+ return rc;
26595+
26596+ rc = xencomm_arch_hypercall_xen_version(cmd, desc);
26597+
26598+ xencomm_free(desc);
26599+
26600+ return rc;
26601+}
26602+
26603+static int
26604+xencomm_privcmd_event_channel_op(privcmd_hypercall_t *hypercall)
26605+{
26606+ int cmd = hypercall->arg[0];
26607+ struct xencomm_handle *desc;
26608+ unsigned int argsize;
26609+ int ret;
26610+
26611+ switch (cmd) {
26612+ case EVTCHNOP_alloc_unbound:
26613+ argsize = sizeof(evtchn_alloc_unbound_t);
26614+ break;
26615+
26616+ case EVTCHNOP_status:
26617+ argsize = sizeof(evtchn_status_t);
26618+ break;
26619+
26620+ default:
26621+ printk("%s: unknown EVTCHNOP %d\n", __func__, cmd);
26622+ return -EINVAL;
26623+ }
26624+
26625+ ret = xencomm_create((void *)hypercall->arg[1], argsize,
26626+ &desc, GFP_KERNEL);
26627+ if (ret)
26628+ return ret;
26629+
26630+ ret = xencomm_arch_hypercall_event_channel_op(cmd, desc);
26631+
26632+ xencomm_free(desc);
26633+ return ret;
26634+}
26635+
26636+static int
26637+xencomm_privcmd_hvm_op(privcmd_hypercall_t *hypercall)
26638+{
26639+ int cmd = hypercall->arg[0];
26640+ struct xencomm_handle *desc;
26641+ unsigned int argsize;
26642+ int ret;
26643+
26644+ switch (cmd) {
26645+ case HVMOP_get_param:
26646+ case HVMOP_set_param:
26647+ argsize = sizeof(xen_hvm_param_t);
26648+ break;
26649+ case HVMOP_set_pci_intx_level:
26650+ argsize = sizeof(xen_hvm_set_pci_intx_level_t);
26651+ break;
26652+ case HVMOP_set_isa_irq_level:
26653+ argsize = sizeof(xen_hvm_set_isa_irq_level_t);
26654+ break;
26655+ case HVMOP_set_pci_link_route:
26656+ argsize = sizeof(xen_hvm_set_pci_link_route_t);
26657+ break;
26658+
26659+ default:
26660+ printk("%s: unknown HVMOP %d\n", __func__, cmd);
26661+ return -EINVAL;
26662+ }
26663+
26664+ ret = xencomm_create((void *)hypercall->arg[1], argsize,
26665+ &desc, GFP_KERNEL);
26666+ if (ret)
26667+ return ret;
26668+
26669+ ret = xencomm_arch_hypercall_hvm_op(cmd, desc);
26670+
26671+ xencomm_free(desc);
26672+ return ret;
26673+}
26674+
26675+static int
26676+xencomm_privcmd_sched_op(privcmd_hypercall_t *hypercall)
26677+{
26678+ int cmd = hypercall->arg[0];
26679+ struct xencomm_handle *desc;
26680+ unsigned int argsize;
26681+ int ret;
26682+
26683+ switch (cmd) {
26684+ case SCHEDOP_remote_shutdown:
26685+ argsize = sizeof(sched_remote_shutdown_t);
26686+ break;
26687+ default:
26688+ printk("%s: unknown SCHEDOP %d\n", __func__, cmd);
26689+ return -EINVAL;
26690+ }
26691+
26692+ ret = xencomm_create((void *)hypercall->arg[1], argsize,
26693+ &desc, GFP_KERNEL);
26694+ if (ret)
26695+ return ret;
26696+
26697+ ret = xencomm_arch_hypercall_sched_op(cmd, desc);
26698+
26699+ xencomm_free(desc);
26700+ return ret;
26701+}
26702+
26703+int
26704+privcmd_hypercall(privcmd_hypercall_t *hypercall)
26705+{
26706+ switch (hypercall->op) {
26707+ case __HYPERVISOR_dom0_op:
26708+ return xencomm_privcmd_dom0_op(hypercall);
26709+ case __HYPERVISOR_domctl:
26710+ return xencomm_privcmd_domctl(hypercall);
26711+ case __HYPERVISOR_sysctl:
26712+ return xencomm_privcmd_sysctl(hypercall);
26713+ case __HYPERVISOR_acm_op:
26714+ return xencomm_privcmd_acm_op(hypercall);
26715+ case __HYPERVISOR_xen_version:
26716+ return xencomm_privcmd_xen_version(hypercall);
26717+ case __HYPERVISOR_memory_op:
26718+ return xencomm_privcmd_memory_op(hypercall);
26719+ case __HYPERVISOR_event_channel_op:
26720+ return xencomm_privcmd_event_channel_op(hypercall);
26721+ case __HYPERVISOR_hvm_op:
26722+ return xencomm_privcmd_hvm_op(hypercall);
26723+ case __HYPERVISOR_sched_op:
26724+ return xencomm_privcmd_sched_op(hypercall);
26725+ default:
26726+ printk("%s: unknown hcall (%ld)\n", __func__, hypercall->op);
26727+ return -ENOSYS;
26728+ }
26729+}
26730+
26731diff -Nur linux-2.6.16.33-noxen/arch/ia64/xen/xencomm.c linux-2.6.16.33/arch/ia64/xen/xencomm.c
26732--- linux-2.6.16.33-noxen/arch/ia64/xen/xencomm.c 1970-01-01 00:00:00.000000000 +0000
26733+++ linux-2.6.16.33/arch/ia64/xen/xencomm.c 2007-01-08 15:00:45.000000000 +0000
26734@@ -0,0 +1,263 @@
26735+/*
26736+ * Copyright (C) 2006 Hollis Blanchard <hollisb@us.ibm.com>, IBM Corporation
26737+ *
26738+ * This program is free software; you can redistribute it and/or modify
26739+ * it under the terms of the GNU General Public License as published by
26740+ * the Free Software Foundation; either version 2 of the License, or
26741+ * (at your option) any later version.
26742+ *
26743+ * This program is distributed in the hope that it will be useful,
26744+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
26745+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
26746+ * GNU General Public License for more details.
26747+ *
26748+ * You should have received a copy of the GNU General Public License
26749+ * along with this program; if not, write to the Free Software
26750+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
26751+ */
26752+
26753+#include <linux/gfp.h>
26754+#include <linux/mm.h>
26755+#include <xen/interface/xen.h>
26756+#include <asm/page.h>
26757+
26758+#ifdef HAVE_XEN_PLATFORM_COMPAT_H
26759+#include <xen/platform-compat.h>
26760+#endif
26761+
26762+#include <asm/xen/xencomm.h>
26763+
26764+static int xencomm_debug = 0;
26765+
26766+static unsigned long kernel_start_pa;
26767+
26768+void
26769+xencomm_init (void)
26770+{
26771+ kernel_start_pa = KERNEL_START - ia64_tpa(KERNEL_START);
26772+}
26773+
26774+/* Translate virtual address to physical address. */
26775+unsigned long
26776+xencomm_vaddr_to_paddr(unsigned long vaddr)
26777+{
26778+#ifndef CONFIG_VMX_GUEST
26779+ struct page *page;
26780+ struct vm_area_struct *vma;
26781+#endif
26782+
26783+ if (vaddr == 0)
26784+ return 0;
26785+
26786+#ifdef __ia64__
26787+ if (REGION_NUMBER(vaddr) == 5) {
26788+ pgd_t *pgd;
26789+ pud_t *pud;
26790+ pmd_t *pmd;
26791+ pte_t *ptep;
26792+
26793+ /* On ia64, TASK_SIZE refers to current. It is not initialized
26794+ during boot.
26795+ Furthermore the kernel is relocatable and __pa() doesn't
26796+ work on addresses. */
26797+ if (vaddr >= KERNEL_START
26798+ && vaddr < (KERNEL_START + KERNEL_TR_PAGE_SIZE)) {
26799+ return vaddr - kernel_start_pa;
26800+ }
26801+
26802+ /* In kernel area -- virtually mapped. */
26803+ pgd = pgd_offset_k(vaddr);
26804+ if (pgd_none(*pgd) || pgd_bad(*pgd))
26805+ return ~0UL;
26806+
26807+ pud = pud_offset(pgd, vaddr);
26808+ if (pud_none(*pud) || pud_bad(*pud))
26809+ return ~0UL;
26810+
26811+ pmd = pmd_offset(pud, vaddr);
26812+ if (pmd_none(*pmd) || pmd_bad(*pmd))
26813+ return ~0UL;
26814+
26815+ ptep = pte_offset_kernel(pmd, vaddr);
26816+ if (!ptep)
26817+ return ~0UL;
26818+
26819+ return (pte_val(*ptep) & _PFN_MASK) | (vaddr & ~PAGE_MASK);
26820+ }
26821+#endif
26822+
26823+ if (vaddr > TASK_SIZE) {
26824+ /* kernel address */
26825+ return __pa(vaddr);
26826+ }
26827+
26828+
26829+#ifdef CONFIG_VMX_GUEST
26830+ /* No privcmd within vmx guest. */
26831+ return ~0UL;
26832+#else
26833+ /* XXX double-check (lack of) locking */
26834+ vma = find_extend_vma(current->mm, vaddr);
26835+ if (!vma)
26836+ return ~0UL;
26837+
26838+ /* We assume the page is modified. */
26839+ page = follow_page(vma, vaddr, FOLL_WRITE | FOLL_TOUCH);
26840+ if (!page)
26841+ return ~0UL;
26842+
26843+ return (page_to_pfn(page) << PAGE_SHIFT) | (vaddr & ~PAGE_MASK);
26844+#endif
26845+}
26846+
26847+static int
26848+xencomm_init_desc(struct xencomm_desc *desc, void *buffer, unsigned long bytes)
26849+{
26850+ unsigned long recorded = 0;
26851+ int i = 0;
26852+
26853+ BUG_ON((buffer == NULL) && (bytes > 0));
26854+
26855+ /* record the physical pages used */
26856+ if (buffer == NULL)
26857+ desc->nr_addrs = 0;
26858+
26859+ while ((recorded < bytes) && (i < desc->nr_addrs)) {
26860+ unsigned long vaddr = (unsigned long)buffer + recorded;
26861+ unsigned long paddr;
26862+ int offset;
26863+ int chunksz;
26864+
26865+ offset = vaddr % PAGE_SIZE; /* handle partial pages */
26866+ chunksz = min(PAGE_SIZE - offset, bytes - recorded);
26867+
26868+ paddr = xencomm_vaddr_to_paddr(vaddr);
26869+ if (paddr == ~0UL) {
26870+ printk("%s: couldn't translate vaddr %lx\n",
26871+ __func__, vaddr);
26872+ return -EINVAL;
26873+ }
26874+
26875+ desc->address[i++] = paddr;
26876+ recorded += chunksz;
26877+ }
26878+
26879+ if (recorded < bytes) {
26880+ printk("%s: could only translate %ld of %ld bytes\n",
26881+ __func__, recorded, bytes);
26882+ return -ENOSPC;
26883+ }
26884+
26885+ /* mark remaining addresses invalid (just for safety) */
26886+ while (i < desc->nr_addrs)
26887+ desc->address[i++] = XENCOMM_INVALID;
26888+
26889+ desc->magic = XENCOMM_MAGIC;
26890+
26891+ return 0;
26892+}
26893+
26894+static struct xencomm_desc *
26895+xencomm_alloc(gfp_t gfp_mask)
26896+{
26897+ struct xencomm_desc *desc;
26898+
26899+ desc = (struct xencomm_desc *)__get_free_page(gfp_mask);
26900+ if (desc == NULL)
26901+ panic("%s: page allocation failed\n", __func__);
26902+
26903+ desc->nr_addrs = (PAGE_SIZE - sizeof(struct xencomm_desc)) /
26904+ sizeof(*desc->address);
26905+
26906+ return desc;
26907+}
26908+
26909+void
26910+xencomm_free(struct xencomm_handle *desc)
26911+{
26912+ if (desc)
26913+ free_page((unsigned long)__va(desc));
26914+}
26915+
26916+int
26917+xencomm_create(void *buffer, unsigned long bytes,
26918+ struct xencomm_handle **ret, gfp_t gfp_mask)
26919+{
26920+ struct xencomm_desc *desc;
26921+ struct xencomm_handle *handle;
26922+ int rc;
26923+
26924+ if (xencomm_debug)
26925+ printk("%s: %p[%ld]\n", __func__, buffer, bytes);
26926+
26927+ if (buffer == NULL || bytes == 0) {
26928+ *ret = (struct xencomm_handle *)NULL;
26929+ return 0;
26930+ }
26931+
26932+ desc = xencomm_alloc(gfp_mask);
26933+ if (!desc) {
26934+ printk("%s failure\n", "xencomm_alloc");
26935+ return -ENOMEM;
26936+ }
26937+ handle = (struct xencomm_handle *)__pa(desc);
26938+
26939+ rc = xencomm_init_desc(desc, buffer, bytes);
26940+ if (rc) {
26941+ printk("%s failure: %d\n", "xencomm_init_desc", rc);
26942+ xencomm_free(handle);
26943+ return rc;
26944+ }
26945+
26946+ *ret = handle;
26947+ return 0;
26948+}
26949+
26950+/* "mini" routines, for stack-based communications: */
26951+
26952+static void *
26953+xencomm_alloc_mini(struct xencomm_mini *area, int *nbr_area)
26954+{
26955+ unsigned long base;
26956+ unsigned int pageoffset;
26957+
26958+ while (*nbr_area >= 0) {
26959+ /* Allocate an area. */
26960+ (*nbr_area)--;
26961+
26962+ base = (unsigned long)(area + *nbr_area);
26963+ pageoffset = base % PAGE_SIZE;
26964+
26965+ /* If the area does not cross a page, use it. */
26966+ if ((PAGE_SIZE - pageoffset) >= sizeof(struct xencomm_mini))
26967+ return &area[*nbr_area];
26968+ }
26969+ /* No more area. */
26970+ return NULL;
26971+}
26972+
26973+int
26974+xencomm_create_mini(struct xencomm_mini *area, int *nbr_area,
26975+ void *buffer, unsigned long bytes,
26976+ struct xencomm_handle **ret)
26977+{
26978+ struct xencomm_desc *desc;
26979+ int rc;
26980+ unsigned long res;
26981+
26982+ desc = xencomm_alloc_mini(area, nbr_area);
26983+ if (!desc)
26984+ return -ENOMEM;
26985+ desc->nr_addrs = XENCOMM_MINI_ADDRS;
26986+
26987+ rc = xencomm_init_desc(desc, buffer, bytes);
26988+ if (rc)
26989+ return rc;
26990+
26991+ res = xencomm_vaddr_to_paddr((unsigned long)desc);
26992+ if (res == ~0UL)
26993+ return -EINVAL;
26994+
26995+ *ret = (struct xencomm_handle*)res;
26996+ return 0;
26997+}
26998diff -Nur linux-2.6.16.33-noxen/arch/ia64/xen/xenentry.S linux-2.6.16.33/arch/ia64/xen/xenentry.S
26999--- linux-2.6.16.33-noxen/arch/ia64/xen/xenentry.S 1970-01-01 00:00:00.000000000 +0000
27000+++ linux-2.6.16.33/arch/ia64/xen/xenentry.S 2007-01-08 15:00:45.000000000 +0000
27001@@ -0,0 +1,924 @@
27002+/*
27003+ * ia64/xen/entry.S
27004+ *
27005+ * Alternate kernel routines for Xen. Heavily leveraged from
27006+ * ia64/kernel/entry.S
27007+ *
27008+ * Copyright (C) 2005 Hewlett-Packard Co
27009+ * Dan Magenheimer <dan.magenheimer@hp.com>
27010+ */
27011+
27012+#include <linux/config.h>
27013+
27014+#include <asm/asmmacro.h>
27015+#include <asm/cache.h>
27016+#include <asm/errno.h>
27017+#include <asm/kregs.h>
27018+#include <asm/asm-offsets.h>
27019+#include <asm/pgtable.h>
27020+#include <asm/percpu.h>
27021+#include <asm/processor.h>
27022+#include <asm/thread_info.h>
27023+#include <asm/unistd.h>
27024+
27025+#ifdef CONFIG_XEN
27026+#include "xenminstate.h"
27027+#else
27028+#include "minstate.h"
27029+#endif
27030+
27031+/*
27032+ * prev_task <- ia64_switch_to(struct task_struct *next)
27033+ * With Ingo's new scheduler, interrupts are disabled when this routine gets
27034+ * called. The code starting at .map relies on this. The rest of the code
27035+ * doesn't care about the interrupt masking status.
27036+ */
27037+#ifdef CONFIG_XEN
27038+GLOBAL_ENTRY(xen_switch_to)
27039+ .prologue
27040+ alloc r16=ar.pfs,1,0,0,0
27041+ movl r22=running_on_xen;;
27042+ ld4 r22=[r22];;
27043+ cmp.eq p7,p0=r22,r0
27044+(p7) br.cond.sptk.many __ia64_switch_to;;
27045+#else
27046+GLOBAL_ENTRY(ia64_switch_to)
27047+ .prologue
27048+ alloc r16=ar.pfs,1,0,0,0
27049+#endif
27050+ DO_SAVE_SWITCH_STACK
27051+ .body
27052+
27053+ adds r22=IA64_TASK_THREAD_KSP_OFFSET,r13
27054+ movl r25=init_task
27055+ mov r27=IA64_KR(CURRENT_STACK)
27056+ adds r21=IA64_TASK_THREAD_KSP_OFFSET,in0
27057+ dep r20=0,in0,61,3 // physical address of "next"
27058+ ;;
27059+ st8 [r22]=sp // save kernel stack pointer of old task
27060+ shr.u r26=r20,IA64_GRANULE_SHIFT
27061+ cmp.eq p7,p6=r25,in0
27062+ ;;
27063+#ifdef CONFIG_XEN
27064+ movl r8=XSI_PSR_IC
27065+ ;;
27066+ st4 [r8]=r0 // force psr.ic off for hyperprivop(s)
27067+ ;;
27068+#endif
27069+ /*
27070+ * If we've already mapped this task's page, we can skip doing it again.
27071+ */
27072+(p6) cmp.eq p7,p6=r26,r27
27073+(p6) br.cond.dpnt .map
27074+ ;;
27075+.done:
27076+#ifdef CONFIG_XEN
27077+ // psr.ic already off
27078+ // update "current" application register
27079+ mov r8=IA64_KR_CURRENT
27080+ mov r9=in0;;
27081+ XEN_HYPER_SET_KR
27082+ ld8 sp=[r21] // load kernel stack pointer of new task
27083+ movl r27=XSI_PSR_IC
27084+ mov r8=1
27085+ ;;
27086+ st4 [r27]=r8 // psr.ic back on
27087+#else
27088+ ld8 sp=[r21] // load kernel stack pointer of new task
27089+ mov IA64_KR(CURRENT)=in0 // update "current" application register
27090+#endif
27091+ mov r8=r13 // return pointer to previously running task
27092+ mov r13=in0 // set "current" pointer
27093+ ;;
27094+ DO_LOAD_SWITCH_STACK
27095+
27096+#ifdef CONFIG_SMP
27097+ sync.i // ensure "fc"s done by this CPU are visible on other CPUs
27098+#endif
27099+ br.ret.sptk.many rp // boogie on out in new context
27100+
27101+.map:
27102+#ifdef CONFIG_XEN
27103+ // psr.ic already off
27104+#else
27105+ rsm psr.ic // interrupts (psr.i) are already disabled here
27106+#endif
27107+ movl r25=PAGE_KERNEL
27108+ ;;
27109+ srlz.d
27110+ or r23=r25,r20 // construct PA | page properties
27111+ mov r25=IA64_GRANULE_SHIFT<<2
27112+ ;;
27113+#ifdef CONFIG_XEN
27114+ movl r8=XSI_ITIR
27115+ ;;
27116+ st8 [r8]=r25
27117+ ;;
27118+ movl r8=XSI_IFA
27119+ ;;
27120+ st8 [r8]=in0 // VA of next task...
27121+ ;;
27122+ mov r25=IA64_TR_CURRENT_STACK
27123+ // remember last page we mapped...
27124+ mov r8=IA64_KR_CURRENT_STACK
27125+ mov r9=r26;;
27126+ XEN_HYPER_SET_KR;;
27127+#else
27128+ mov cr.itir=r25
27129+ mov cr.ifa=in0 // VA of next task...
27130+ ;;
27131+ mov r25=IA64_TR_CURRENT_STACK
27132+ mov IA64_KR(CURRENT_STACK)=r26 // remember last page we mapped...
27133+#endif
27134+ ;;
27135+ itr.d dtr[r25]=r23 // wire in new mapping...
27136+#ifndef CONFIG_XEN
27137+ ssm psr.ic // reenable the psr.ic bit
27138+ ;;
27139+ srlz.d
27140+#endif
27141+ br.cond.sptk .done
27142+#ifdef CONFIG_XEN
27143+END(xen_switch_to)
27144+#else
27145+END(ia64_switch_to)
27146+#endif
27147+
27148+ /*
27149+ * Invoke a system call, but do some tracing before and after the call.
27150+ * We MUST preserve the current register frame throughout this routine
27151+ * because some system calls (such as ia64_execve) directly
27152+ * manipulate ar.pfs.
27153+ */
27154+#ifdef CONFIG_XEN
27155+GLOBAL_ENTRY(xen_trace_syscall)
27156+ PT_REGS_UNWIND_INFO(0)
27157+ movl r16=running_on_xen;;
27158+ ld4 r16=[r16];;
27159+ cmp.eq p7,p0=r16,r0
27160+(p7) br.cond.sptk.many __ia64_trace_syscall;;
27161+#else
27162+GLOBAL_ENTRY(ia64_trace_syscall)
27163+ PT_REGS_UNWIND_INFO(0)
27164+#endif
27165+ /*
27166+ * We need to preserve the scratch registers f6-f11 in case the system
27167+ * call is sigreturn.
27168+ */
27169+ adds r16=PT(F6)+16,sp
27170+ adds r17=PT(F7)+16,sp
27171+ ;;
27172+ stf.spill [r16]=f6,32
27173+ stf.spill [r17]=f7,32
27174+ ;;
27175+ stf.spill [r16]=f8,32
27176+ stf.spill [r17]=f9,32
27177+ ;;
27178+ stf.spill [r16]=f10
27179+ stf.spill [r17]=f11
27180+ br.call.sptk.many rp=syscall_trace_enter // give parent a chance to catch syscall args
27181+ adds r16=PT(F6)+16,sp
27182+ adds r17=PT(F7)+16,sp
27183+ ;;
27184+ ldf.fill f6=[r16],32
27185+ ldf.fill f7=[r17],32
27186+ ;;
27187+ ldf.fill f8=[r16],32
27188+ ldf.fill f9=[r17],32
27189+ ;;
27190+ ldf.fill f10=[r16]
27191+ ldf.fill f11=[r17]
27192+ // the syscall number may have changed, so re-load it and re-calculate the
27193+ // syscall entry-point:
27194+ adds r15=PT(R15)+16,sp // r15 = &pt_regs.r15 (syscall #)
27195+ ;;
27196+ ld8 r15=[r15]
27197+ mov r3=NR_syscalls - 1
27198+ ;;
27199+ adds r15=-1024,r15
27200+ movl r16=sys_call_table
27201+ ;;
27202+ shladd r20=r15,3,r16 // r20 = sys_call_table + 8*(syscall-1024)
27203+ cmp.leu p6,p7=r15,r3
27204+ ;;
27205+(p6) ld8 r20=[r20] // load address of syscall entry point
27206+(p7) movl r20=sys_ni_syscall
27207+ ;;
27208+ mov b6=r20
27209+ br.call.sptk.many rp=b6 // do the syscall
27210+.strace_check_retval:
27211+ cmp.lt p6,p0=r8,r0 // syscall failed?
27212+ adds r2=PT(R8)+16,sp // r2 = &pt_regs.r8
27213+ adds r3=PT(R10)+16,sp // r3 = &pt_regs.r10
27214+ mov r10=0
27215+(p6) br.cond.sptk strace_error // syscall failed ->
27216+ ;; // avoid RAW on r10
27217+.strace_save_retval:
27218+.mem.offset 0,0; st8.spill [r2]=r8 // store return value in slot for r8
27219+.mem.offset 8,0; st8.spill [r3]=r10 // clear error indication in slot for r10
27220+ br.call.sptk.many rp=syscall_trace_leave // give parent a chance to catch return value
27221+.ret3:
27222+(pUStk) cmp.eq.unc p6,p0=r0,r0 // p6 <- pUStk
27223+ br.cond.sptk .work_pending_syscall_end
27224+
27225+strace_error:
27226+ ld8 r3=[r2] // load pt_regs.r8
27227+ sub r9=0,r8 // negate return value to get errno value
27228+ ;;
27229+ cmp.ne p6,p0=r3,r0 // is pt_regs.r8!=0?
27230+ adds r3=16,r2 // r3=&pt_regs.r10
27231+ ;;
27232+(p6) mov r10=-1
27233+(p6) mov r8=r9
27234+ br.cond.sptk .strace_save_retval
27235+#ifdef CONFIG_XEN
27236+END(xen_trace_syscall)
27237+#else
27238+END(ia64_trace_syscall)
27239+#endif
27240+
27241+#ifdef CONFIG_XEN
27242+GLOBAL_ENTRY(xen_ret_from_clone)
27243+ PT_REGS_UNWIND_INFO(0)
27244+ movl r16=running_on_xen;;
27245+ ld4 r16=[r16];;
27246+ cmp.eq p7,p0=r16,r0
27247+(p7) br.cond.sptk.many __ia64_ret_from_clone;;
27248+#else
27249+GLOBAL_ENTRY(ia64_ret_from_clone)
27250+ PT_REGS_UNWIND_INFO(0)
27251+#endif
27252+{ /*
27253+ * Some versions of gas generate bad unwind info if the first instruction of a
27254+ * procedure doesn't go into the first slot of a bundle. This is a workaround.
27255+ */
27256+ nop.m 0
27257+ nop.i 0
27258+ /*
27259+ * We need to call schedule_tail() to complete the scheduling process.
27260+ * Called by ia64_switch_to() after do_fork()->copy_thread(). r8 contains the
27261+ * address of the previously executing task.
27262+ */
27263+ br.call.sptk.many rp=ia64_invoke_schedule_tail
27264+}
27265+.ret8:
27266+ adds r2=TI_FLAGS+IA64_TASK_SIZE,r13
27267+ ;;
27268+ ld4 r2=[r2]
27269+ ;;
27270+ mov r8=0
27271+ and r2=_TIF_SYSCALL_TRACEAUDIT,r2
27272+ ;;
27273+ cmp.ne p6,p0=r2,r0
27274+(p6) br.cond.spnt .strace_check_retval
27275+ ;; // added stop bits to prevent r8 dependency
27276+#ifdef CONFIG_XEN
27277+ br.cond.sptk ia64_ret_from_syscall
27278+END(xen_ret_from_clone)
27279+#else
27280+END(ia64_ret_from_clone)
27281+#endif
27282+/*
27283+ * ia64_leave_syscall(): Same as ia64_leave_kernel, except that it doesn't
27284+ * need to switch to bank 0 and doesn't restore the scratch registers.
27285+ * To avoid leaking kernel bits, the scratch registers are set to
27286+ * the following known-to-be-safe values:
27287+ *
27288+ * r1: restored (global pointer)
27289+ * r2: cleared
27290+ * r3: 1 (when returning to user-level)
27291+ * r8-r11: restored (syscall return value(s))
27292+ * r12: restored (user-level stack pointer)
27293+ * r13: restored (user-level thread pointer)
27294+ * r14: set to __kernel_syscall_via_epc
27295+ * r15: restored (syscall #)
27296+ * r16-r17: cleared
27297+ * r18: user-level b6
27298+ * r19: cleared
27299+ * r20: user-level ar.fpsr
27300+ * r21: user-level b0
27301+ * r22: cleared
27302+ * r23: user-level ar.bspstore
27303+ * r24: user-level ar.rnat
27304+ * r25: user-level ar.unat
27305+ * r26: user-level ar.pfs
27306+ * r27: user-level ar.rsc
27307+ * r28: user-level ip
27308+ * r29: user-level psr
27309+ * r30: user-level cfm
27310+ * r31: user-level pr
27311+ * f6-f11: cleared
27312+ * pr: restored (user-level pr)
27313+ * b0: restored (user-level rp)
27314+ * b6: restored
27315+ * b7: set to __kernel_syscall_via_epc
27316+ * ar.unat: restored (user-level ar.unat)
27317+ * ar.pfs: restored (user-level ar.pfs)
27318+ * ar.rsc: restored (user-level ar.rsc)
27319+ * ar.rnat: restored (user-level ar.rnat)
27320+ * ar.bspstore: restored (user-level ar.bspstore)
27321+ * ar.fpsr: restored (user-level ar.fpsr)
27322+ * ar.ccv: cleared
27323+ * ar.csd: cleared
27324+ * ar.ssd: cleared
27325+ */
27326+#ifdef CONFIG_XEN
27327+GLOBAL_ENTRY(xen_leave_syscall)
27328+ PT_REGS_UNWIND_INFO(0)
27329+ movl r22=running_on_xen;;
27330+ ld4 r22=[r22];;
27331+ cmp.eq p7,p0=r22,r0
27332+(p7) br.cond.sptk.many __ia64_leave_syscall;;
27333+#else
27334+ENTRY(ia64_leave_syscall)
27335+ PT_REGS_UNWIND_INFO(0)
27336+#endif
27337+ /*
27338+ * work.need_resched etc. mustn't get changed by this CPU before it returns to
27339+ * user- or fsys-mode, hence we disable interrupts early on.
27340+ *
27341+ * p6 controls whether current_thread_info()->flags needs to be checked for
27342+ * extra work. We always check for extra work when returning to user-level.
27343+ * With CONFIG_PREEMPT, we also check for extra work when the preempt_count
27344+ * is 0. After extra work processing has been completed, execution
27345+ * resumes at .work_processed_syscall with p6 set to 1 if the extra-work-check
27346+ * needs to be redone.
27347+ */
27348+#ifdef CONFIG_PREEMPT
27349+ rsm psr.i // disable interrupts
27350+ cmp.eq pLvSys,p0=r0,r0 // pLvSys=1: leave from syscall
27351+(pKStk) adds r20=TI_PRE_COUNT+IA64_TASK_SIZE,r13
27352+ ;;
27353+ .pred.rel.mutex pUStk,pKStk
27354+(pKStk) ld4 r21=[r20] // r21 <- preempt_count
27355+(pUStk) mov r21=0 // r21 <- 0
27356+ ;;
27357+ cmp.eq p6,p0=r21,r0 // p6 <- pUStk || (preempt_count == 0)
27358+#else /* !CONFIG_PREEMPT */
27359+#ifdef CONFIG_XEN
27360+ movl r2=XSI_PSR_I_ADDR
27361+ mov r18=1
27362+ ;;
27363+ ld8 r2=[r2]
27364+ ;;
27365+(pUStk) st1 [r2]=r18
27366+#else
27367+(pUStk) rsm psr.i
27368+#endif
27369+ cmp.eq pLvSys,p0=r0,r0 // pLvSys=1: leave from syscall
27370+(pUStk) cmp.eq.unc p6,p0=r0,r0 // p6 <- pUStk
27371+#endif
27372+.work_processed_syscall:
27373+ adds r2=PT(LOADRS)+16,r12
27374+ adds r3=PT(AR_BSPSTORE)+16,r12
27375+ adds r18=TI_FLAGS+IA64_TASK_SIZE,r13
27376+ ;;
27377+(p6) ld4 r31=[r18] // load current_thread_info()->flags
27378+ ld8 r19=[r2],PT(B6)-PT(LOADRS) // load ar.rsc value for "loadrs"
27379+ nop.i 0
27380+ ;;
27381+ mov r16=ar.bsp // M2 get existing backing store pointer
27382+ ld8 r18=[r2],PT(R9)-PT(B6) // load b6
27383+(p6) and r15=TIF_WORK_MASK,r31 // any work other than TIF_SYSCALL_TRACE?
27384+ ;;
27385+ ld8 r23=[r3],PT(R11)-PT(AR_BSPSTORE) // load ar.bspstore (may be garbage)
27386+(p6) cmp4.ne.unc p6,p0=r15, r0 // any special work pending?
27387+(p6) br.cond.spnt .work_pending_syscall
27388+ ;;
27389+ // start restoring the state saved on the kernel stack (struct pt_regs):
27390+ ld8 r9=[r2],PT(CR_IPSR)-PT(R9)
27391+ ld8 r11=[r3],PT(CR_IIP)-PT(R11)
27392+(pNonSys) break 0 // bug check: we shouldn't be here if pNonSys is TRUE!
27393+ ;;
27394+ invala // M0|1 invalidate ALAT
27395+#ifdef CONFIG_XEN
27396+ movl r28=XSI_PSR_I_ADDR
27397+ movl r29=XSI_PSR_IC
27398+ ;;
27399+ ld8 r28=[r28]
27400+ mov r30=1
27401+ ;;
27402+ st1 [r28]=r30
27403+ st4 [r29]=r0 // note: clears both vpsr.i and vpsr.ic!
27404+ ;;
27405+#else
27406+ rsm psr.i | psr.ic // M2 turn off interrupts and interruption collection
27407+#endif
27408+ cmp.eq p9,p0=r0,r0 // A set p9 to indicate that we should restore cr.ifs
27409+
27410+ ld8 r29=[r2],16 // M0|1 load cr.ipsr
27411+ ld8 r28=[r3],16 // M0|1 load cr.iip
27412+ mov r22=r0 // A clear r22
27413+ ;;
27414+ ld8 r30=[r2],16 // M0|1 load cr.ifs
27415+ ld8 r25=[r3],16 // M0|1 load ar.unat
27416+(pUStk) add r14=IA64_TASK_THREAD_ON_USTACK_OFFSET,r13
27417+ ;;
27418+ ld8 r26=[r2],PT(B0)-PT(AR_PFS) // M0|1 load ar.pfs
27419+(pKStk) mov r22=psr // M2 read PSR now that interrupts are disabled
27420+ nop 0
27421+ ;;
27422+ ld8 r21=[r2],PT(AR_RNAT)-PT(B0) // M0|1 load b0
27423+ ld8 r27=[r3],PT(PR)-PT(AR_RSC) // M0|1 load ar.rsc
27424+ mov f6=f0 // F clear f6
27425+ ;;
27426+ ld8 r24=[r2],PT(AR_FPSR)-PT(AR_RNAT) // M0|1 load ar.rnat (may be garbage)
27427+ ld8 r31=[r3],PT(R1)-PT(PR) // M0|1 load predicates
27428+ mov f7=f0 // F clear f7
27429+ ;;
27430+ ld8 r20=[r2],PT(R12)-PT(AR_FPSR) // M0|1 load ar.fpsr
27431+ ld8.fill r1=[r3],16 // M0|1 load r1
27432+(pUStk) mov r17=1 // A
27433+ ;;
27434+(pUStk) st1 [r14]=r17 // M2|3
27435+ ld8.fill r13=[r3],16 // M0|1
27436+ mov f8=f0 // F clear f8
27437+ ;;
27438+ ld8.fill r12=[r2] // M0|1 restore r12 (sp)
27439+ ld8.fill r15=[r3] // M0|1 restore r15
27440+ mov b6=r18 // I0 restore b6
27441+
27442+ addl r17=THIS_CPU(ia64_phys_stacked_size_p8),r0 // A
27443+ mov f9=f0 // F clear f9
27444+(pKStk) br.cond.dpnt.many skip_rbs_switch // B
27445+
27446+ srlz.d // M0 ensure interruption collection is off (for cover)
27447+ shr.u r18=r19,16 // I0|1 get byte size of existing "dirty" partition
27448+#ifdef CONFIG_XEN
27449+ XEN_HYPER_COVER;
27450+#else
27451+ cover // B add current frame into dirty partition & set cr.ifs
27452+#endif
27453+ ;;
27454+(pUStk) ld4 r17=[r17] // M0|1 r17 = cpu_data->phys_stacked_size_p8
27455+ mov r19=ar.bsp // M2 get new backing store pointer
27456+ mov f10=f0 // F clear f10
27457+
27458+ nop.m 0
27459+ movl r14=__kernel_syscall_via_epc // X
27460+ ;;
27461+ mov.m ar.csd=r0 // M2 clear ar.csd
27462+ mov.m ar.ccv=r0 // M2 clear ar.ccv
27463+ mov b7=r14 // I0 clear b7 (hint with __kernel_syscall_via_epc)
27464+
27465+ mov.m ar.ssd=r0 // M2 clear ar.ssd
27466+ mov f11=f0 // F clear f11
27467+ br.cond.sptk.many rbs_switch // B
27468+#ifdef CONFIG_XEN
27469+END(xen_leave_syscall)
27470+#else
27471+END(ia64_leave_syscall)
27472+#endif
27473+
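The comment near the top of the leave-syscall path above (and repeated for the leave-kernel path below) describes the p6 "extra work" test in prose. A minimal C-style sketch of that decision, assuming invented names (extra_work_pending, returning_to_user, preempt_enabled) and a stand-in TIF_WORK_MASK value rather than the kernel's:

    /* Sketch only: models the p6 predicate computed on the exit paths. */
    #define TIF_WORK_MASK 0x1fUL    /* stand-in value, not the kernel's */

    static int extra_work_pending(int returning_to_user, int preempt_enabled,
                                  int preempt_count, unsigned long ti_flags)
    {
            int check = returning_to_user                           /* pUStk: always check */
                        || (preempt_enabled && preempt_count == 0); /* pKStk + CONFIG_PREEMPT */
            return check && (ti_flags & TIF_WORK_MASK) != 0;
    }

When the check fires, control branches to .work_pending_syscall / .work_pending and later re-enters at the .work_processed_* label, exactly as the comment states.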
27474+#ifdef CONFIG_XEN
27475+GLOBAL_ENTRY(xen_leave_kernel)
27476+ PT_REGS_UNWIND_INFO(0)
27477+ movl r22=running_on_xen;;
27478+ ld4 r22=[r22];;
27479+ cmp.eq p7,p0=r22,r0
27480+(p7) br.cond.sptk.many __ia64_leave_kernel;;
27481+#else
27482+GLOBAL_ENTRY(ia64_leave_kernel)
27483+ PT_REGS_UNWIND_INFO(0)
27484+#endif
27485+ /*
27486+ * work.need_resched etc. mustn't get changed by this CPU before it returns to
27487+ * user- or fsys-mode, hence we disable interrupts early on.
27488+ *
27489+ * p6 controls whether current_thread_info()->flags needs to be checked for
27490+ * extra work. We always check for extra work when returning to user-level.
27491+ * With CONFIG_PREEMPT, we also check for extra work when the preempt_count
27492+ * is 0. After extra work processing has been completed, execution
27493+ * resumes at .work_processed_kernel with p6 set to 1 if the extra-work-check
27494+ * needs to be redone.
27495+ */
27496+#ifdef CONFIG_PREEMPT
27497+ rsm psr.i // disable interrupts
27498+ cmp.eq p0,pLvSys=r0,r0 // pLvSys=0: leave from kernel
27499+(pKStk) adds r20=TI_PRE_COUNT+IA64_TASK_SIZE,r13
27500+ ;;
27501+ .pred.rel.mutex pUStk,pKStk
27502+(pKStk) ld4 r21=[r20] // r21 <- preempt_count
27503+(pUStk) mov r21=0 // r21 <- 0
27504+ ;;
27505+ cmp.eq p6,p0=r21,r0 // p6 <- pUStk || (preempt_count == 0)
27506+#else
27507+#ifdef CONFIG_XEN
27508+(pUStk) movl r17=XSI_PSR_I_ADDR
27509+(pUStk) mov r31=1
27510+ ;;
27511+(pUStk) ld8 r17=[r17]
27512+ ;;
27513+(pUStk) st1 [r17]=r31
27514+ ;;
27515+#else
27516+(pUStk) rsm psr.i
27517+#endif
27518+ cmp.eq p0,pLvSys=r0,r0 // pLvSys=0: leave from kernel
27519+(pUStk) cmp.eq.unc p6,p0=r0,r0 // p6 <- pUStk
27520+#endif
27521+.work_processed_kernel:
27522+ adds r17=TI_FLAGS+IA64_TASK_SIZE,r13
27523+ ;;
27524+(p6) ld4 r31=[r17] // load current_thread_info()->flags
27525+ adds r21=PT(PR)+16,r12
27526+ ;;
27527+
27528+ lfetch [r21],PT(CR_IPSR)-PT(PR)
27529+ adds r2=PT(B6)+16,r12
27530+ adds r3=PT(R16)+16,r12
27531+ ;;
27532+ lfetch [r21]
27533+ ld8 r28=[r2],8 // load b6
27534+ adds r29=PT(R24)+16,r12
27535+
27536+ ld8.fill r16=[r3],PT(AR_CSD)-PT(R16)
27537+ adds r30=PT(AR_CCV)+16,r12
27538+(p6) and r19=TIF_WORK_MASK,r31 // any work other than TIF_SYSCALL_TRACE?
27539+ ;;
27540+ ld8.fill r24=[r29]
27541+ ld8 r15=[r30] // load ar.ccv
27542+(p6) cmp4.ne.unc p6,p0=r19, r0 // any special work pending?
27543+ ;;
27544+ ld8 r29=[r2],16 // load b7
27545+ ld8 r30=[r3],16 // load ar.csd
27546+(p6) br.cond.spnt .work_pending
27547+ ;;
27548+ ld8 r31=[r2],16 // load ar.ssd
27549+ ld8.fill r8=[r3],16
27550+ ;;
27551+ ld8.fill r9=[r2],16
27552+ ld8.fill r10=[r3],PT(R17)-PT(R10)
27553+ ;;
27554+ ld8.fill r11=[r2],PT(R18)-PT(R11)
27555+ ld8.fill r17=[r3],16
27556+ ;;
27557+ ld8.fill r18=[r2],16
27558+ ld8.fill r19=[r3],16
27559+ ;;
27560+ ld8.fill r20=[r2],16
27561+ ld8.fill r21=[r3],16
27562+ mov ar.csd=r30
27563+ mov ar.ssd=r31
27564+ ;;
27565+#ifdef CONFIG_XEN
27566+ movl r23=XSI_PSR_I_ADDR
27567+ movl r22=XSI_PSR_IC
27568+ ;;
27569+ ld8 r23=[r23]
27570+ mov r25=1
27571+ ;;
27572+ st1 [r23]=r25
27573+ st4 [r22]=r0 // note: clears both vpsr.i and vpsr.ic!
27574+ ;;
27575+#else
27576+ rsm psr.i | psr.ic // initiate turning off of interrupt and interruption collection
27577+#endif
27578+ invala // invalidate ALAT
27579+ ;;
27580+ ld8.fill r22=[r2],24
27581+ ld8.fill r23=[r3],24
27582+ mov b6=r28
27583+ ;;
27584+ ld8.fill r25=[r2],16
27585+ ld8.fill r26=[r3],16
27586+ mov b7=r29
27587+ ;;
27588+ ld8.fill r27=[r2],16
27589+ ld8.fill r28=[r3],16
27590+ ;;
27591+ ld8.fill r29=[r2],16
27592+ ld8.fill r30=[r3],24
27593+ ;;
27594+ ld8.fill r31=[r2],PT(F9)-PT(R31)
27595+ adds r3=PT(F10)-PT(F6),r3
27596+ ;;
27597+ ldf.fill f9=[r2],PT(F6)-PT(F9)
27598+ ldf.fill f10=[r3],PT(F8)-PT(F10)
27599+ ;;
27600+ ldf.fill f6=[r2],PT(F7)-PT(F6)
27601+ ;;
27602+ ldf.fill f7=[r2],PT(F11)-PT(F7)
27603+ ldf.fill f8=[r3],32
27604+ ;;
27605+ srlz.d // ensure that inter. collection is off (VHPT is don't care, since text is pinned)
27606+ mov ar.ccv=r15
27607+ ;;
27608+ ldf.fill f11=[r2]
27609+#ifdef CONFIG_XEN
27610+ ;;
27611+ // r16-r31 all now hold bank1 values
27612+ movl r2=XSI_BANK1_R16
27613+ movl r3=XSI_BANK1_R16+8
27614+ ;;
27615+.mem.offset 0,0; st8.spill [r2]=r16,16
27616+.mem.offset 8,0; st8.spill [r3]=r17,16
27617+ ;;
27618+.mem.offset 0,0; st8.spill [r2]=r18,16
27619+.mem.offset 8,0; st8.spill [r3]=r19,16
27620+ ;;
27621+.mem.offset 0,0; st8.spill [r2]=r20,16
27622+.mem.offset 8,0; st8.spill [r3]=r21,16
27623+ ;;
27624+.mem.offset 0,0; st8.spill [r2]=r22,16
27625+.mem.offset 8,0; st8.spill [r3]=r23,16
27626+ ;;
27627+.mem.offset 0,0; st8.spill [r2]=r24,16
27628+.mem.offset 8,0; st8.spill [r3]=r25,16
27629+ ;;
27630+.mem.offset 0,0; st8.spill [r2]=r26,16
27631+.mem.offset 8,0; st8.spill [r3]=r27,16
27632+ ;;
27633+.mem.offset 0,0; st8.spill [r2]=r28,16
27634+.mem.offset 8,0; st8.spill [r3]=r29,16
27635+ ;;
27636+.mem.offset 0,0; st8.spill [r2]=r30,16
27637+.mem.offset 8,0; st8.spill [r3]=r31,16
27638+ ;;
27639+ movl r2=XSI_BANKNUM;;
27640+ st4 [r2]=r0;
27641+#else
27642+ bsw.0 // switch back to bank 0 (no stop bit required beforehand...)
27643+#endif
27644+ ;;
27645+(pUStk) mov r18=IA64_KR(CURRENT)// M2 (12 cycle read latency)
27646+ adds r16=PT(CR_IPSR)+16,r12
27647+ adds r17=PT(CR_IIP)+16,r12
27648+
27649+(pKStk) mov r22=psr // M2 read PSR now that interrupts are disabled
27650+ nop.i 0
27651+ nop.i 0
27652+ ;;
27653+ ld8 r29=[r16],16 // load cr.ipsr
27654+ ld8 r28=[r17],16 // load cr.iip
27655+ ;;
27656+ ld8 r30=[r16],16 // load cr.ifs
27657+ ld8 r25=[r17],16 // load ar.unat
27658+ ;;
27659+ ld8 r26=[r16],16 // load ar.pfs
27660+ ld8 r27=[r17],16 // load ar.rsc
27661+ cmp.eq p9,p0=r0,r0 // set p9 to indicate that we should restore cr.ifs
27662+ ;;
27663+ ld8 r24=[r16],16 // load ar.rnat (may be garbage)
27664+ ld8 r23=[r17],16 // load ar.bspstore (may be garbage)
27665+ ;;
27666+ ld8 r31=[r16],16 // load predicates
27667+ ld8 r21=[r17],16 // load b0
27668+ ;;
27669+ ld8 r19=[r16],16 // load ar.rsc value for "loadrs"
27670+ ld8.fill r1=[r17],16 // load r1
27671+ ;;
27672+ ld8.fill r12=[r16],16
27673+ ld8.fill r13=[r17],16
27674+(pUStk) adds r18=IA64_TASK_THREAD_ON_USTACK_OFFSET,r18
27675+ ;;
27676+ ld8 r20=[r16],16 // ar.fpsr
27677+ ld8.fill r15=[r17],16
27678+ ;;
27679+ ld8.fill r14=[r16],16
27680+ ld8.fill r2=[r17]
27681+(pUStk) mov r17=1
27682+ ;;
27683+ ld8.fill r3=[r16]
27684+(pUStk) st1 [r18]=r17 // restore current->thread.on_ustack
27685+ shr.u r18=r19,16 // get byte size of existing "dirty" partition
27686+ ;;
27687+ mov r16=ar.bsp // get existing backing store pointer
27688+ addl r17=THIS_CPU(ia64_phys_stacked_size_p8),r0
27689+ ;;
27690+ ld4 r17=[r17] // r17 = cpu_data->phys_stacked_size_p8
27691+(pKStk) br.cond.dpnt skip_rbs_switch
27692+
27693+ /*
27694+ * Restore user backing store.
27695+ *
27696+ * NOTE: alloc, loadrs, and cover can't be predicated.
27697+ */
27698+(pNonSys) br.cond.dpnt dont_preserve_current_frame
27699+
27700+#ifdef CONFIG_XEN
27701+ XEN_HYPER_COVER;
27702+#else
27703+ cover // add current frame into dirty partition and set cr.ifs
27704+#endif
27705+ ;;
27706+ mov r19=ar.bsp // get new backing store pointer
27707+rbs_switch:
27708+ sub r16=r16,r18 // krbs = old bsp - size of dirty partition
27709+ cmp.ne p9,p0=r0,r0 // clear p9 to skip restore of cr.ifs
27710+ ;;
27711+ sub r19=r19,r16 // calculate total byte size of dirty partition
27712+ add r18=64,r18 // don't force in0-in7 into memory...
27713+ ;;
27714+ shl r19=r19,16 // shift size of dirty partition into loadrs position
27715+ ;;
27716+dont_preserve_current_frame:
27717+ /*
27718+ * To prevent leaking bits between the kernel and user-space,
27719+ * we must clear the stacked registers in the "invalid" partition here.
27720+ * Not pretty, but at least it's fast (3.34 registers/cycle on Itanium,
27721+ * 5 registers/cycle on McKinley).
27722+ */
27723+# define pRecurse p6
27724+# define pReturn p7
27725+#ifdef CONFIG_ITANIUM
27726+# define Nregs 10
27727+#else
27728+# define Nregs 14
27729+#endif
27730+ alloc loc0=ar.pfs,2,Nregs-2,2,0
27731+ shr.u loc1=r18,9 // RNaTslots <= floor(dirtySize / (64*8))
27732+ sub r17=r17,r18 // r17 = (physStackedSize + 8) - dirtySize
27733+ ;;
27734+ mov ar.rsc=r19 // load ar.rsc to be used for "loadrs"
27735+ shladd in0=loc1,3,r17
27736+ mov in1=0
27737+ ;;
27738+ TEXT_ALIGN(32)
27739+rse_clear_invalid:
27740+#ifdef CONFIG_ITANIUM
27741+ // cycle 0
27742+ { .mii
27743+ alloc loc0=ar.pfs,2,Nregs-2,2,0
27744+ cmp.lt pRecurse,p0=Nregs*8,in0 // if more than Nregs regs left to clear, (re)curse
27745+ add out0=-Nregs*8,in0
27746+}{ .mfb
27747+ add out1=1,in1 // increment recursion count
27748+ nop.f 0
27749+ nop.b 0 // can't do br.call here because of alloc (WAW on CFM)
27750+ ;;
27751+}{ .mfi // cycle 1
27752+ mov loc1=0
27753+ nop.f 0
27754+ mov loc2=0
27755+}{ .mib
27756+ mov loc3=0
27757+ mov loc4=0
27758+(pRecurse) br.call.sptk.many b0=rse_clear_invalid
27759+
27760+}{ .mfi // cycle 2
27761+ mov loc5=0
27762+ nop.f 0
27763+ cmp.ne pReturn,p0=r0,in1 // if recursion count != 0, we need to do a br.ret
27764+}{ .mib
27765+ mov loc6=0
27766+ mov loc7=0
27767+(pReturn) br.ret.sptk.many b0
27768+}
27769+#else /* !CONFIG_ITANIUM */
27770+ alloc loc0=ar.pfs,2,Nregs-2,2,0
27771+ cmp.lt pRecurse,p0=Nregs*8,in0 // if more than Nregs regs left to clear, (re)curse
27772+ add out0=-Nregs*8,in0
27773+ add out1=1,in1 // increment recursion count
27774+ mov loc1=0
27775+ mov loc2=0
27776+ ;;
27777+ mov loc3=0
27778+ mov loc4=0
27779+ mov loc5=0
27780+ mov loc6=0
27781+ mov loc7=0
27782+(pRecurse) br.call.dptk.few b0=rse_clear_invalid
27783+ ;;
27784+ mov loc8=0
27785+ mov loc9=0
27786+ cmp.ne pReturn,p0=r0,in1 // if recursion count != 0, we need to do a br.ret
27787+ mov loc10=0
27788+ mov loc11=0
27789+(pReturn) br.ret.dptk.many b0
27790+#endif /* !CONFIG_ITANIUM */
27791+# undef pRecurse
27792+# undef pReturn
27793+ ;;
27794+ alloc r17=ar.pfs,0,0,0,0 // drop current register frame
27795+ ;;
27796+ loadrs
27797+ ;;
27798+skip_rbs_switch:
27799+ mov ar.unat=r25 // M2
27800+(pKStk) extr.u r22=r22,21,1 // I0 extract current value of psr.pp from r22
27801+(pLvSys)mov r19=r0 // A clear r19 for leave_syscall, no-op otherwise
27802+ ;;
27803+(pUStk) mov ar.bspstore=r23 // M2
27804+(pKStk) dep r29=r22,r29,21,1 // I0 update ipsr.pp with psr.pp
27805+(pLvSys)mov r16=r0 // A clear r16 for leave_syscall, no-op otherwise
27806+ ;;
27807+#ifdef CONFIG_XEN
27808+ movl r25=XSI_IPSR
27809+ ;;
27810+ st8[r25]=r29,XSI_IFS_OFS-XSI_IPSR_OFS
27811+ ;;
27812+#else
27813+ mov cr.ipsr=r29 // M2
27814+#endif
27815+ mov ar.pfs=r26 // I0
27816+(pLvSys)mov r17=r0 // A clear r17 for leave_syscall, no-op otherwise
27817+
27818+#ifdef CONFIG_XEN
27819+(p9) st8 [r25]=r30
27820+ ;;
27821+ adds r25=XSI_IIP_OFS-XSI_IFS_OFS,r25
27822+ ;;
27823+#else
27824+(p9) mov cr.ifs=r30 // M2
27825+#endif
27826+ mov b0=r21 // I0
27827+(pLvSys)mov r18=r0 // A clear r18 for leave_syscall, no-op otherwise
27828+
27829+ mov ar.fpsr=r20 // M2
27830+#ifdef CONFIG_XEN
27831+ st8 [r25]=r28
27832+#else
27833+ mov cr.iip=r28 // M2
27834+#endif
27835+ nop 0
27836+ ;;
27837+(pUStk) mov ar.rnat=r24 // M2 must happen with RSE in lazy mode
27838+ nop 0
27839+(pLvSys)mov r2=r0
27840+
27841+ mov ar.rsc=r27 // M2
27842+ mov pr=r31,-1 // I0
27843+#ifdef CONFIG_XEN
27844+ ;;
27845+ XEN_HYPER_RFI;
27846+#else
27847+ rfi // B
27848+#endif
27849+
27850+ /*
27851+ * On entry:
27852+ * r20 = &current->thread_info->pre_count (if CONFIG_PREEMPT)
27853+ * r31 = current->thread_info->flags
27854+ * On exit:
27855+ * p6 = TRUE if work-pending-check needs to be redone
27856+ */
27857+.work_pending_syscall:
27858+ add r2=-8,r2
27859+ add r3=-8,r3
27860+ ;;
27861+ st8 [r2]=r8
27862+ st8 [r3]=r10
27863+.work_pending:
27864+ tbit.nz p6,p0=r31,TIF_SIGDELAYED // signal delayed from MCA/INIT/NMI/PMI context?
27865+(p6) br.cond.sptk.few .sigdelayed
27866+ ;;
27867+ tbit.z p6,p0=r31,TIF_NEED_RESCHED // current_thread_info()->need_resched==0?
27868+(p6) br.cond.sptk.few .notify
27869+#ifdef CONFIG_PREEMPT
27870+(pKStk) dep r21=-1,r0,PREEMPT_ACTIVE_BIT,1
27871+ ;;
27872+(pKStk) st4 [r20]=r21
27873+ ssm psr.i // enable interrupts
27874+#endif
27875+ br.call.spnt.many rp=schedule
27876+.ret9: cmp.eq p6,p0=r0,r0 // p6 <- 1
27877+#ifdef CONFIG_XEN
27878+ movl r2=XSI_PSR_I_ADDR
27879+ mov r20=1
27880+ ;;
27881+ ld8 r2=[r2]
27882+ ;;
27883+ st1 [r2]=r20
27884+#else
27885+ rsm psr.i // disable interrupts
27886+#endif
27887+ ;;
27888+#ifdef CONFIG_PREEMPT
27889+(pKStk) adds r20=TI_PRE_COUNT+IA64_TASK_SIZE,r13
27890+ ;;
27891+(pKStk) st4 [r20]=r0 // preempt_count() <- 0
27892+#endif
27893+(pLvSys)br.cond.sptk.few .work_pending_syscall_end
27894+ br.cond.sptk.many .work_processed_kernel // re-check
27895+
27896+.notify:
27897+(pUStk) br.call.spnt.many rp=notify_resume_user
27898+.ret10: cmp.ne p6,p0=r0,r0 // p6 <- 0
27899+(pLvSys)br.cond.sptk.few .work_pending_syscall_end
27900+ br.cond.sptk.many .work_processed_kernel // don't re-check
27901+
27902+// There is a delayed signal that was detected in MCA/INIT/NMI/PMI context where
27903+// it could not be delivered. Deliver it now. The signal might be for us and
27904+// may set TIF_SIGPENDING, so redrive ia64_leave_* after processing the delayed
27905+// signal.
27906+
27907+.sigdelayed:
27908+ br.call.sptk.many rp=do_sigdelayed
27909+ cmp.eq p6,p0=r0,r0 // p6 <- 1, always re-check
27910+(pLvSys)br.cond.sptk.few .work_pending_syscall_end
27911+ br.cond.sptk.many .work_processed_kernel // re-check
27912+
27913+.work_pending_syscall_end:
27914+ adds r2=PT(R8)+16,r12
27915+ adds r3=PT(R10)+16,r12
27916+ ;;
27917+ ld8 r8=[r2]
27918+ ld8 r10=[r3]
27919+ br.cond.sptk.many .work_processed_syscall // re-check
27920+
27921+#ifdef CONFIG_XEN
27922+END(xen_leave_kernel)
27923+#else
27924+END(ia64_leave_kernel)
27925+#endif
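Throughout both exit routines the CONFIG_XEN branches replace privileged operations (rsm psr.i, mov cr.ipsr=..., rfi) with ordinary loads and stores into the mapped shared-info area addressed through the XSI_* constants, followed by a hypercall (XEN_HYPER_RFI) for the final return. A rough C analogy of that pattern, with an invented struct layout standing in for the real XSI_* offsets:

    /* Invented layout, for illustration only; the real fields sit at the
     * XSI_IPSR / XSI_IFS / XSI_IIP / XSI_PSR_IC / XSI_PSR_I_ADDR offsets. */
    struct xen_shared_state {
            unsigned long  ipsr, ifs, iip;  /* virtual cr.ipsr / cr.ifs / cr.iip    */
            unsigned char *psr_i_addr;      /* *psr_i_addr = 1 means events masked  */
            unsigned int   psr_ic;          /* virtual psr.ic (0 = collection off)  */
    };

    static void xen_prepare_return(struct xen_shared_state *xsi,
                                   unsigned long ipsr, unsigned long ifs,
                                   unsigned long iip, int restore_ifs)
    {
            *xsi->psr_i_addr = 1;           /* instead of "rsm psr.i"           */
            xsi->psr_ic = 0;                /* instead of clearing psr.ic       */
            xsi->ipsr = ipsr;               /* instead of "mov cr.ipsr=r29"     */
            if (restore_ifs)
                    xsi->ifs = ifs;         /* instead of "(p9) mov cr.ifs=r30" */
            xsi->iip = iip;                 /* instead of "mov cr.iip=r28"      */
            /* the actual return is then performed via XEN_HYPER_RFI */
    }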
27926diff -Nur linux-2.6.16.33-noxen/arch/ia64/xen/xenhpski.c linux-2.6.16.33/arch/ia64/xen/xenhpski.c
27927--- linux-2.6.16.33-noxen/arch/ia64/xen/xenhpski.c 1970-01-01 00:00:00.000000000 +0000
27928+++ linux-2.6.16.33/arch/ia64/xen/xenhpski.c 2007-01-08 15:00:45.000000000 +0000
27929@@ -0,0 +1,19 @@
27930+
27931+extern unsigned long xen_get_cpuid(int);
27932+
27933+int
27934+running_on_sim(void)
27935+{
27936+ int i;
27937+ long cpuid[6];
27938+
27939+ for (i = 0; i < 5; ++i)
27940+ cpuid[i] = xen_get_cpuid(i);
27941+ if ((cpuid[0] & 0xff) != 'H') return 0;
27942+ if ((cpuid[3] & 0xff) != 0x4) return 0;
27943+ if (((cpuid[3] >> 8) & 0xff) != 0x0) return 0;
27944+ if (((cpuid[3] >> 16) & 0xff) != 0x0) return 0;
27945+ if (((cpuid[3] >> 24) & 0x7) != 0x7) return 0;
27946+ return 1;
27947+}
27948+
27949diff -Nur linux-2.6.16.33-noxen/arch/ia64/xen/xenivt.S linux-2.6.16.33/arch/ia64/xen/xenivt.S
27950--- linux-2.6.16.33-noxen/arch/ia64/xen/xenivt.S 1970-01-01 00:00:00.000000000 +0000
27951+++ linux-2.6.16.33/arch/ia64/xen/xenivt.S 2007-01-08 15:00:45.000000000 +0000
27952@@ -0,0 +1,2180 @@
27953+/*
27954+ * arch/ia64/xen/ivt.S
27955+ *
27956+ * Copyright (C) 2005 Hewlett-Packard Co
27957+ * Dan Magenheimer <dan.magenheimer@hp.com>
27958+ */
27959+/*
27960+ * This file defines the interruption vector table used by the CPU.
27961+ * It does not include one entry per possible cause of interruption.
27962+ *
27963+ * The first 20 entries of the table contain 64 bundles each while the
27964+ * remaining 48 entries contain only 16 bundles each.
27965+ *
27966+ * The 64 bundles are used to allow inlining the whole handler for critical
27967+ * interruptions like TLB misses.
27968+ *
27969+ * For each entry, the comment is as follows:
27970+ *
27971+ * // 0x1c00 Entry 7 (size 64 bundles) Data Key Miss (12,51)
27972+ * entry offset ----/ / / / /
27973+ * entry number ---------/ / / /
27974+ * size of the entry -------------/ / /
27975+ * vector name -------------------------------------/ /
27976+ * interruptions triggering this vector ----------------------/
27977+ *
27978+ * The table is 32KB in size and must be aligned on 32KB boundary.
27979+ * (The CPU ignores the 15 lower bits of the address)
27980+ *
27981+ * Table is based upon EAS2.6 (Oct 1999)
27982+ */
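The 32KB figure follows from the layout above: IA-64 bundles are 16 bytes, so the first 20 entries take 20 * 64 * 16 = 20480 bytes and the remaining 48 entries take 48 * 16 * 16 = 12288 bytes, 32768 bytes in total, which is why the table must be aligned on a 32KB boundary.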
27983+
27984+#include <linux/config.h>
27985+
27986+#include <asm/asmmacro.h>
27987+#include <asm/break.h>
27988+#include <asm/ia32.h>
27989+#include <asm/kregs.h>
27990+#include <asm/asm-offsets.h>
27991+#include <asm/pgtable.h>
27992+#include <asm/processor.h>
27993+#include <asm/ptrace.h>
27994+#include <asm/system.h>
27995+#include <asm/thread_info.h>
27996+#include <asm/unistd.h>
27997+#include <asm/errno.h>
27998+
27999+#ifdef CONFIG_XEN
28000+#define ia64_ivt xen_ivt
28001+#endif
28002+
28003+#if 1
28004+# define PSR_DEFAULT_BITS psr.ac
28005+#else
28006+# define PSR_DEFAULT_BITS 0
28007+#endif
28008+
28009+#if 0
28010+ /*
28011+ * This lets you track the last eight faults that occurred on the CPU. Make sure ar.k2 isn't
28012+ * needed for something else before enabling this...
28013+ */
28014+# define DBG_FAULT(i) mov r16=ar.k2;; shl r16=r16,8;; add r16=(i),r16;;mov ar.k2=r16
28015+#else
28016+# define DBG_FAULT(i)
28017+#endif
28018+
28019+#define MINSTATE_VIRT /* needed by minstate.h */
28020+#include "xenminstate.h"
28021+
28022+#define FAULT(n) \
28023+ mov r31=pr; \
28024+ mov r19=n;; /* prepare to save predicates */ \
28025+ br.sptk.many dispatch_to_fault_handler
28026+
28027+ .section .text.ivt,"ax"
28028+
28029+ .align 32768 // align on 32KB boundary
28030+ .global ia64_ivt
28031+ia64_ivt:
28032+/////////////////////////////////////////////////////////////////////////////////////////
28033+// 0x0000 Entry 0 (size 64 bundles) VHPT Translation (8,20,47)
28034+ENTRY(vhpt_miss)
28035+ DBG_FAULT(0)
28036+ /*
28037+ * The VHPT vector is invoked when the TLB entry for the virtual page table
28038+ * is missing. This happens only as a result of a previous
28039+ * (the "original") TLB miss, which may either be caused by an instruction
28040+ * fetch or a data access (or non-access).
28041+ *
28042+ * What we do here is normal TLB miss handling for the _original_ miss,
28043+ * followed by inserting the TLB entry for the virtual page table page
28044+ * that the VHPT walker was attempting to access. The latter gets
28045+ * inserted as long as page table entry above pte level have valid
28046+ * mappings for the faulting address. The TLB entry for the original
28047+ * miss gets inserted only if the pte entry indicates that the page is
28048+ * present.
28049+ *
28050+ * do_page_fault gets invoked in the following cases:
28051+ * - the faulting virtual address uses unimplemented address bits
28052+ * - the faulting virtual address has no valid page table mapping
28053+ */
28054+#ifdef CONFIG_XEN
28055+ movl r16=XSI_IFA
28056+ ;;
28057+ ld8 r16=[r16]
28058+#ifdef CONFIG_HUGETLB_PAGE
28059+ movl r18=PAGE_SHIFT
28060+ movl r25=XSI_ITIR
28061+ ;;
28062+ ld8 r25=[r25]
28063+#endif
28064+ ;;
28065+#else
28066+ mov r16=cr.ifa // get address that caused the TLB miss
28067+#ifdef CONFIG_HUGETLB_PAGE
28068+ movl r18=PAGE_SHIFT
28069+ mov r25=cr.itir
28070+#endif
28071+#endif
28072+ ;;
28073+#ifdef CONFIG_XEN
28074+ XEN_HYPER_RSM_PSR_DT;
28075+#else
28076+ rsm psr.dt // use physical addressing for data
28077+#endif
28078+ mov r31=pr // save the predicate registers
28079+ mov r19=IA64_KR(PT_BASE) // get page table base address
28080+ shl r21=r16,3 // shift bit 60 into sign bit
28081+ shr.u r17=r16,61 // get the region number into r17
28082+ ;;
28083+ shr.u r22=r21,3
28084+#ifdef CONFIG_HUGETLB_PAGE
28085+ extr.u r26=r25,2,6
28086+ ;;
28087+ cmp.ne p8,p0=r18,r26
28088+ sub r27=r26,r18
28089+ ;;
28090+(p8) dep r25=r18,r25,2,6
28091+(p8) shr r22=r22,r27
28092+#endif
28093+ ;;
28094+ cmp.eq p6,p7=5,r17 // is IFA pointing into region 5?
28095+ shr.u r18=r22,PGDIR_SHIFT // get bottom portion of pgd index bit
28096+ ;;
28097+(p7) dep r17=r17,r19,(PAGE_SHIFT-3),3 // put region number bits in place
28098+
28099+ srlz.d
28100+ LOAD_PHYSICAL(p6, r19, swapper_pg_dir) // region 5 is rooted at swapper_pg_dir
28101+
28102+ .pred.rel "mutex", p6, p7
28103+(p6) shr.u r21=r21,PGDIR_SHIFT+PAGE_SHIFT
28104+(p7) shr.u r21=r21,PGDIR_SHIFT+PAGE_SHIFT-3
28105+ ;;
28106+(p6) dep r17=r18,r19,3,(PAGE_SHIFT-3) // r17=pgd_offset for region 5
28107+(p7) dep r17=r18,r17,3,(PAGE_SHIFT-6) // r17=pgd_offset for region[0-4]
28108+ cmp.eq p7,p6=0,r21 // unused address bits all zeroes?
28109+#ifdef CONFIG_PGTABLE_4
28110+ shr.u r28=r22,PUD_SHIFT // shift pud index into position
28111+#else
28112+ shr.u r18=r22,PMD_SHIFT // shift pmd index into position
28113+#endif
28114+ ;;
28115+ ld8 r17=[r17] // get *pgd (may be 0)
28116+ ;;
28117+(p7) cmp.eq p6,p7=r17,r0 // was pgd_present(*pgd) == NULL?
28118+#ifdef CONFIG_PGTABLE_4
28119+ dep r28=r28,r17,3,(PAGE_SHIFT-3) // r28=pud_offset(pgd,addr)
28120+ ;;
28121+ shr.u r18=r22,PMD_SHIFT // shift pmd index into position
28122+(p7) ld8 r29=[r28] // get *pud (may be 0)
28123+ ;;
28124+(p7) cmp.eq.or.andcm p6,p7=r29,r0 // was pud_present(*pud) == NULL?
28125+ dep r17=r18,r29,3,(PAGE_SHIFT-3) // r17=pmd_offset(pud,addr)
28126+#else
28127+ dep r17=r18,r17,3,(PAGE_SHIFT-3) // r17=pmd_offset(pgd,addr)
28128+#endif
28129+ ;;
28130+(p7) ld8 r20=[r17] // get *pmd (may be 0)
28131+ shr.u r19=r22,PAGE_SHIFT // shift pte index into position
28132+ ;;
28133+(p7) cmp.eq.or.andcm p6,p7=r20,r0 // was pmd_present(*pmd) == NULL?
28134+ dep r21=r19,r20,3,(PAGE_SHIFT-3) // r21=pte_offset(pmd,addr)
28135+ ;;
28136+(p7) ld8 r18=[r21] // read *pte
28137+#ifdef CONFIG_XEN
28138+ movl r19=XSI_ISR
28139+ ;;
28140+ ld8 r19=[r19]
28141+#else
28142+ mov r19=cr.isr // cr.isr bit 32 tells us if this is an insn miss
28143+#endif
28144+ ;;
28145+(p7) tbit.z p6,p7=r18,_PAGE_P_BIT // page present bit cleared?
28146+#ifdef CONFIG_XEN
28147+ movl r22=XSI_IHA
28148+ ;;
28149+ ld8 r22=[r22]
28150+#else
28151+ mov r22=cr.iha // get the VHPT address that caused the TLB miss
28152+#endif
28153+ ;; // avoid RAW on p7
28154+(p7) tbit.nz.unc p10,p11=r19,32 // is it an instruction TLB miss?
28155+ dep r23=0,r20,0,PAGE_SHIFT // clear low bits to get page address
28156+ ;;
28157+#ifdef CONFIG_XEN
28158+ mov r24=r8
28159+ mov r8=r18
28160+ ;;
28161+(p10) XEN_HYPER_ITC_I
28162+ ;;
28163+(p11) XEN_HYPER_ITC_D
28164+ ;;
28165+ mov r8=r24
28166+ ;;
28167+#else
28168+(p10) itc.i r18 // insert the instruction TLB entry
28169+(p11) itc.d r18 // insert the data TLB entry
28170+#endif
28171+(p6) br.cond.spnt.many page_fault // handle bad address/page not present (page fault)
28172+#ifdef CONFIG_XEN
28173+ movl r24=XSI_IFA
28174+ ;;
28175+ st8 [r24]=r22
28176+ ;;
28177+#else
28178+ mov cr.ifa=r22
28179+#endif
28180+
28181+#ifdef CONFIG_HUGETLB_PAGE
28182+(p8) mov cr.itir=r25 // change to default page-size for VHPT
28183+#endif
28184+
28185+ /*
28186+ * Now compute and insert the TLB entry for the virtual page table. We never
28187+ * execute in a page table page so there is no need to set the exception deferral
28188+ * bit.
28189+ */
28190+ adds r24=__DIRTY_BITS_NO_ED|_PAGE_PL_0|_PAGE_AR_RW,r23
28191+ ;;
28192+#ifdef CONFIG_XEN
28193+(p7) mov r25=r8
28194+(p7) mov r8=r24
28195+ ;;
28196+(p7) XEN_HYPER_ITC_D
28197+ ;;
28198+(p7) mov r8=r25
28199+ ;;
28200+#else
28201+(p7) itc.d r24
28202+#endif
28203+ ;;
28204+#ifdef CONFIG_SMP
28205+ /*
28206+ * Tell the assembler's dependency-violation checker that the above "itc" instructions
28207+ * cannot possibly affect the following loads:
28208+ */
28209+ dv_serialize_data
28210+
28211+ /*
28212+ * Re-check pagetable entry. If they changed, we may have received a ptc.g
28213+ * between reading the pagetable and the "itc". If so, flush the entry we
28214+ * inserted and retry. At this point, we have:
28215+ *
28216+ * r28 = equivalent of pud_offset(pgd, ifa)
28217+ * r17 = equivalent of pmd_offset(pud, ifa)
28218+ * r21 = equivalent of pte_offset(pmd, ifa)
28219+ *
28220+ * r29 = *pud
28221+ * r20 = *pmd
28222+ * r18 = *pte
28223+ */
28224+ ld8 r25=[r21] // read *pte again
28225+ ld8 r26=[r17] // read *pmd again
28226+#ifdef CONFIG_PGTABLE_4
28227+ ld8 r19=[r28] // read *pud again
28228+#endif
28229+ cmp.ne p6,p7=r0,r0
28230+ ;;
28231+ cmp.ne.or.andcm p6,p7=r26,r20 // did *pmd change
28232+#ifdef CONFIG_PGTABLE_4
28233+ cmp.ne.or.andcm p6,p7=r19,r29 // did *pud change
28234+#endif
28235+ mov r27=PAGE_SHIFT<<2
28236+ ;;
28237+(p6) ptc.l r22,r27 // purge PTE page translation
28238+(p7) cmp.ne.or.andcm p6,p7=r25,r18 // did *pte change
28239+ ;;
28240+(p6) ptc.l r16,r27 // purge translation
28241+#endif
28242+
28243+ mov pr=r31,-1 // restore predicate registers
28244+#ifdef CONFIG_XEN
28245+ XEN_HYPER_RFI
28246+ dv_serialize_data
28247+#else
28248+ rfi
28249+#endif
28250+END(vhpt_miss)
28251+
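The vhpt_miss comment above describes a page-table walk performed with psr.dt off (physical addressing). A hedged C sketch of that walk, taking the shift values as parameters instead of the kernel's PGDIR_SHIFT/PMD_SHIFT/PAGE_SHIFT macros and omitting the region-5 special case and the optional pud level (CONFIG_PGTABLE_4):

    /* Sketch only: table entries are used as directly dereferenceable pointers
     * because the real handler runs with data translation disabled. */
    static unsigned long *walk_for_vhpt_miss(unsigned long *pgd_base,
                                             unsigned long ifa,
                                             int pgdir_shift, int pmd_shift,
                                             int page_shift)
    {
            unsigned long idx_mask = (1UL << (page_shift - 3)) - 1; /* 8-byte slots */
            unsigned long *pgd, *pmd, *pte;

            pgd = pgd_base + ((ifa >> pgdir_shift) & idx_mask);
            if (*pgd == 0)
                    return 0;               /* -> page_fault                        */
            pmd = (unsigned long *)*pgd + ((ifa >> pmd_shift) & idx_mask);
            if (*pmd == 0)
                    return 0;               /* -> page_fault                        */
            pte = (unsigned long *)*pmd + ((ifa >> page_shift) & idx_mask);
            return pte;                     /* caller checks _PAGE_P, then itc.i/itc.d */
    }

The second half of the handler then builds and inserts a translation for the virtual page-table page itself (the __DIRTY_BITS_NO_ED|_PAGE_PL_0|_PAGE_AR_RW step).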
28252+ .org ia64_ivt+0x400
28253+/////////////////////////////////////////////////////////////////////////////////////////
28254+// 0x0400 Entry 1 (size 64 bundles) ITLB (21)
28255+ENTRY(itlb_miss)
28256+ DBG_FAULT(1)
28257+ /*
28258+ * The ITLB handler accesses the PTE via the virtually mapped linear
28259+ * page table. If a nested TLB miss occurs, we switch into physical
28260+ * mode, walk the page table, and then re-execute the PTE read and
28261+ * go on normally after that.
28262+ */
28263+#ifdef CONFIG_XEN
28264+ movl r16=XSI_IFA
28265+ ;;
28266+ ld8 r16=[r16]
28267+#else
28268+ mov r16=cr.ifa // get virtual address
28269+#endif
28270+ mov r29=b0 // save b0
28271+ mov r31=pr // save predicates
28272+.itlb_fault:
28273+#ifdef CONFIG_XEN
28274+ movl r17=XSI_IHA
28275+ ;;
28276+ ld8 r17=[r17] // get virtual address of L3 PTE
28277+#else
28278+ mov r17=cr.iha // get virtual address of PTE
28279+#endif
28280+ movl r30=1f // load nested fault continuation point
28281+ ;;
28282+1: ld8 r18=[r17] // read *pte
28283+ ;;
28284+ mov b0=r29
28285+ tbit.z p6,p0=r18,_PAGE_P_BIT // page present bit cleared?
28286+(p6) br.cond.spnt page_fault
28287+ ;;
28288+#ifdef CONFIG_XEN
28289+ mov r19=r8
28290+ mov r8=r18
28291+ ;;
28292+ XEN_HYPER_ITC_I
28293+ ;;
28294+ mov r8=r19
28295+#else
28296+ itc.i r18
28297+#endif
28298+ ;;
28299+#ifdef CONFIG_SMP
28300+ /*
28301+ * Tell the assembler's dependency-violation checker that the above "itc" instructions
28302+ * cannot possibly affect the following loads:
28303+ */
28304+ dv_serialize_data
28305+
28306+ ld8 r19=[r17] // read *pte again and see if same
28307+ mov r20=PAGE_SHIFT<<2 // setup page size for purge
28308+ ;;
28309+ cmp.ne p7,p0=r18,r19
28310+ ;;
28311+(p7) ptc.l r16,r20
28312+#endif
28313+ mov pr=r31,-1
28314+#ifdef CONFIG_XEN
28315+ XEN_HYPER_RFI
28316+ dv_serialize_data
28317+#else
28318+ rfi
28319+#endif
28320+END(itlb_miss)
28321+
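Under CONFIG_SMP, itlb_miss above and dtlb_miss below both re-read the PTE after the itc and purge the freshly inserted translation if another CPU's ptc.g changed the entry in between. A hedged C rendering of that pattern; insert_tlb() and purge_tlb() are stand-ins for itc.i/itc.d and ptc.l:

    /* Stand-in helpers, for illustration only. */
    extern void insert_tlb(unsigned long vaddr, unsigned long pte);
    extern void purge_tlb(unsigned long vaddr);

    static void install_pte_smp_safe(volatile unsigned long *ptep, unsigned long vaddr)
    {
            unsigned long pte = *ptep;

            if (!(pte & 1))                 /* _PAGE_P_BIT clear                 */
                    return;                 /* -> page_fault in the real handler */
            insert_tlb(vaddr, pte);         /* itc.i / itc.d                     */
            if (*ptep != pte)               /* raced with a ptc.g?               */
                    purge_tlb(vaddr);       /* ptc.l: drop the stale insertion   */
    }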
28322+ .org ia64_ivt+0x0800
28323+/////////////////////////////////////////////////////////////////////////////////////////
28324+// 0x0800 Entry 2 (size 64 bundles) DTLB (9,48)
28325+ENTRY(dtlb_miss)
28326+ DBG_FAULT(2)
28327+ /*
28328+ * The DTLB handler accesses the PTE via the virtually mapped linear
28329+ * page table. If a nested TLB miss occurs, we switch into physical
28330+ * mode, walk the page table, and then re-execute the PTE read and
28331+ * go on normally after that.
28332+ */
28333+#ifdef CONFIG_XEN
28334+ movl r16=XSI_IFA
28335+ ;;
28336+ ld8 r16=[r16]
28337+#else
28338+ mov r16=cr.ifa // get virtual address
28339+#endif
28340+ mov r29=b0 // save b0
28341+ mov r31=pr // save predicates
28342+dtlb_fault:
28343+#ifdef CONFIG_XEN
28344+ movl r17=XSI_IHA
28345+ ;;
28346+ ld8 r17=[r17] // get virtual address of L3 PTE
28347+#else
28348+ mov r17=cr.iha // get virtual address of PTE
28349+#endif
28350+ movl r30=1f // load nested fault continuation point
28351+ ;;
28352+1: ld8 r18=[r17] // read *pte
28353+ ;;
28354+ mov b0=r29
28355+ tbit.z p6,p0=r18,_PAGE_P_BIT // page present bit cleared?
28356+(p6) br.cond.spnt page_fault
28357+ ;;
28358+#ifdef CONFIG_XEN
28359+ mov r19=r8
28360+ mov r8=r18
28361+ ;;
28362+ XEN_HYPER_ITC_D
28363+ ;;
28364+ mov r8=r19
28365+ ;;
28366+#else
28367+ itc.d r18
28368+#endif
28369+ ;;
28370+#ifdef CONFIG_SMP
28371+ /*
28372+ * Tell the assembler's dependency-violation checker that the above "itc" instructions
28373+ * cannot possibly affect the following loads:
28374+ */
28375+ dv_serialize_data
28376+
28377+ ld8 r19=[r17] // read *pte again and see if same
28378+ mov r20=PAGE_SHIFT<<2 // setup page size for purge
28379+ ;;
28380+ cmp.ne p7,p0=r18,r19
28381+ ;;
28382+(p7) ptc.l r16,r20
28383+#endif
28384+ mov pr=r31,-1
28385+#ifdef CONFIG_XEN
28386+ XEN_HYPER_RFI
28387+ dv_serialize_data
28388+#else
28389+ rfi
28390+#endif
28391+END(dtlb_miss)
28392+
28393+ .org ia64_ivt+0x0c00
28394+/////////////////////////////////////////////////////////////////////////////////////////
28395+// 0x0c00 Entry 3 (size 64 bundles) Alt ITLB (19)
28396+ENTRY(alt_itlb_miss)
28397+ DBG_FAULT(3)
28398+#ifdef CONFIG_XEN
28399+ movl r31=XSI_IPSR
28400+ ;;
28401+ ld8 r21=[r31],XSI_IFA_OFS-XSI_IPSR_OFS // get ipsr, point to ifa
28402+ movl r17=PAGE_KERNEL
28403+ ;;
28404+ ld8 r16=[r31] // get ifa
28405+#else
28406+ mov r16=cr.ifa // get address that caused the TLB miss
28407+ movl r17=PAGE_KERNEL
28408+ mov r21=cr.ipsr
28409+#endif
28410+ movl r19=(((1 << IA64_MAX_PHYS_BITS) - 1) & ~0xfff)
28411+ mov r31=pr
28412+ ;;
28413+#ifdef CONFIG_DISABLE_VHPT
28414+ shr.u r22=r16,61 // get the region number into r22
28415+ ;;
28416+ cmp.gt p8,p0=6,r22 // user mode
28417+ ;;
28418+#ifndef CONFIG_XEN
28419+(p8) thash r17=r16
28420+ ;;
28421+(p8) mov cr.iha=r17
28422+#endif
28423+(p8) mov r29=b0 // save b0
28424+(p8) br.cond.dptk .itlb_fault
28425+#endif
28426+ extr.u r23=r21,IA64_PSR_CPL0_BIT,2 // extract psr.cpl
28427+ and r19=r19,r16 // clear ed, reserved bits, and PTE control bits
28428+ shr.u r18=r16,57 // move address bit 61 to bit 4
28429+ ;;
28430+ andcm r18=0x10,r18 // bit 4=~address-bit(61)
28431+ cmp.ne p8,p0=r0,r23 // psr.cpl != 0?
28432+ or r19=r17,r19 // insert PTE control bits into r19
28433+ ;;
28434+ or r19=r19,r18 // set bit 4 (uncached) if the access was to region 6
28435+(p8) br.cond.spnt page_fault
28436+ ;;
28437+#ifdef CONFIG_XEN
28438+ mov r18=r8
28439+ mov r8=r19
28440+ ;;
28441+ XEN_HYPER_ITC_I
28442+ ;;
28443+ mov r8=r18
28444+ ;;
28445+ mov pr=r31,-1
28446+ ;;
28447+ XEN_HYPER_RFI;
28448+#else
28449+ itc.i r19 // insert the TLB entry
28450+ mov pr=r31,-1
28451+ rfi
28452+#endif
28453+END(alt_itlb_miss)
28454+
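alt_itlb_miss above and alt_dtlb_miss below synthesize an identity mapping on the fly instead of walking the page table: the faulting address is masked down to the physical-address bits, the PAGE_KERNEL control bits are OR-ed in, and the uncacheable attribute is forced when address bit 61 is clear (the region-6 case mentioned in the comments). A hedged sketch of that computation, with the psr.cpl and speculation checks omitted:

    /* Sketch only; ia64_max_phys_bits and page_kernel_bits stand in for the
     * kernel's IA64_MAX_PHYS_BITS and PAGE_KERNEL constants. */
    static unsigned long make_identity_pte(unsigned long ifa,
                                           int ia64_max_phys_bits,
                                           unsigned long page_kernel_bits)
    {
            unsigned long mask = ((1UL << ia64_max_phys_bits) - 1) & ~0xfffUL;
            unsigned long pte  = (ifa & mask) | page_kernel_bits;

            if (!(ifa & (1UL << 61)))       /* andcm r18=0x10,r18 in the asm    */
                    pte |= 0x10;            /* memory attribute: uncacheable    */
            return pte;
    }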
28455+ .org ia64_ivt+0x1000
28456+/////////////////////////////////////////////////////////////////////////////////////////
28457+// 0x1000 Entry 4 (size 64 bundles) Alt DTLB (7,46)
28458+ENTRY(alt_dtlb_miss)
28459+ DBG_FAULT(4)
28460+#ifdef CONFIG_XEN
28461+ movl r31=XSI_IPSR
28462+ ;;
28463+ ld8 r21=[r31],XSI_ISR_OFS-XSI_IPSR_OFS // get ipsr, point to isr
28464+ movl r17=PAGE_KERNEL
28465+ ;;
28466+ ld8 r20=[r31],XSI_IFA_OFS-XSI_ISR_OFS // get isr, point to ifa
28467+ movl r19=(((1 << IA64_MAX_PHYS_BITS) - 1) & ~0xfff)
28468+ ;;
28469+ ld8 r16=[r31] // get ifa
28470+#else
28471+ mov r16=cr.ifa // get address that caused the TLB miss
28472+ movl r17=PAGE_KERNEL
28473+ mov r20=cr.isr
28474+ movl r19=(((1 << IA64_MAX_PHYS_BITS) - 1) & ~0xfff)
28475+ mov r21=cr.ipsr
28476+#endif
28477+ mov r31=pr
28478+ ;;
28479+#ifdef CONFIG_DISABLE_VHPT
28480+ shr.u r22=r16,61 // get the region number into r22
28481+ ;;
28482+ cmp.gt p8,p0=6,r22 // access to region 0-5
28483+ ;;
28484+#ifndef CONFIG_XEN
28485+(p8) thash r17=r16
28486+ ;;
28487+(p8) mov cr.iha=r17
28488+#endif
28489+(p8) mov r29=b0 // save b0
28490+(p8) br.cond.dptk dtlb_fault
28491+#endif
28492+ extr.u r23=r21,IA64_PSR_CPL0_BIT,2 // extract psr.cpl
28493+ and r22=IA64_ISR_CODE_MASK,r20 // get the isr.code field
28494+ tbit.nz p6,p7=r20,IA64_ISR_SP_BIT // is speculation bit on?
28495+ shr.u r18=r16,57 // move address bit 61 to bit 4
28496+ and r19=r19,r16 // clear ed, reserved bits, and PTE control bits
28497+ tbit.nz p9,p0=r20,IA64_ISR_NA_BIT // is non-access bit on?
28498+ ;;
28499+ andcm r18=0x10,r18 // bit 4=~address-bit(61)
28500+ cmp.ne p8,p0=r0,r23
28501+(p9) cmp.eq.or.andcm p6,p7=IA64_ISR_CODE_LFETCH,r22 // check isr.code field
28502+(p8) br.cond.spnt page_fault
28503+
28504+ dep r21=-1,r21,IA64_PSR_ED_BIT,1
28505+ or r19=r19,r17 // insert PTE control bits into r19
28506+ ;;
28507+ or r19=r19,r18 // set bit 4 (uncached) if the access was to region 6
28508+(p6) mov cr.ipsr=r21
28509+ ;;
28510+#ifdef CONFIG_XEN
28511+(p7) mov r18=r8
28512+(p7) mov r8=r19
28513+ ;;
28514+(p7) XEN_HYPER_ITC_D
28515+ ;;
28516+(p7) mov r8=r18
28517+ ;;
28518+ mov pr=r31,-1
28519+ ;;
28520+ XEN_HYPER_RFI;
28521+#else
28522+(p7) itc.d r19 // insert the TLB entry
28523+ mov pr=r31,-1
28524+ rfi
28525+#endif
28526+END(alt_dtlb_miss)
28527+
28528+ .org ia64_ivt+0x1400
28529+/////////////////////////////////////////////////////////////////////////////////////////
28530+// 0x1400 Entry 5 (size 64 bundles) Data nested TLB (6,45)
28531+ENTRY(nested_dtlb_miss)
28532+ /*
28533+ * In the absence of kernel bugs, we get here when the virtually mapped linear
28534+ * page table is accessed non-speculatively (e.g., in the Dirty-bit, Instruction
28535+ * Access-bit, or Data Access-bit faults). If the DTLB entry for the virtual page
28536+ * table is missing, a nested TLB miss fault is triggered and control is
28537+ * transferred to this point. When this happens, we lookup the pte for the
28538+ * faulting address by walking the page table in physical mode and return to the
28539+ * continuation point passed in register r30 (or call page_fault if the address is
28540+ * not mapped).
28541+ *
28542+ * Input: r16: faulting address
28543+ * r29: saved b0
28544+ * r30: continuation address
28545+ * r31: saved pr
28546+ *
28547+ * Output: r17: physical address of PTE of faulting address
28548+ * r29: saved b0
28549+ * r30: continuation address
28550+ * r31: saved pr
28551+ *
28552+ * Clobbered: b0, r18, r19, r21, r22, psr.dt (cleared)
28553+ */
28554+#ifdef CONFIG_XEN
28555+ XEN_HYPER_RSM_PSR_DT;
28556+#else
28557+ rsm psr.dt // switch to using physical data addressing
28558+#endif
28559+ mov r19=IA64_KR(PT_BASE) // get the page table base address
28560+ shl r21=r16,3 // shift bit 60 into sign bit
28561+#ifdef CONFIG_XEN
28562+ movl r18=XSI_ITIR
28563+ ;;
28564+ ld8 r18=[r18]
28565+#else
28566+ mov r18=cr.itir
28567+#endif
28568+ ;;
28569+ shr.u r17=r16,61 // get the region number into r17
28570+ extr.u r18=r18,2,6 // get the faulting page size
28571+ ;;
28572+ cmp.eq p6,p7=5,r17 // is faulting address in region 5?
28573+ add r22=-PAGE_SHIFT,r18 // adjustment for hugetlb address
28574+ add r18=PGDIR_SHIFT-PAGE_SHIFT,r18
28575+ ;;
28576+ shr.u r22=r16,r22
28577+ shr.u r18=r16,r18
28578+(p7) dep r17=r17,r19,(PAGE_SHIFT-3),3 // put region number bits in place
28579+
28580+ srlz.d
28581+ LOAD_PHYSICAL(p6, r19, swapper_pg_dir) // region 5 is rooted at swapper_pg_dir
28582+
28583+ .pred.rel "mutex", p6, p7
28584+(p6) shr.u r21=r21,PGDIR_SHIFT+PAGE_SHIFT
28585+(p7) shr.u r21=r21,PGDIR_SHIFT+PAGE_SHIFT-3
28586+ ;;
28587+(p6) dep r17=r18,r19,3,(PAGE_SHIFT-3) // r17=pgd_offset for region 5
28588+(p7) dep r17=r18,r17,3,(PAGE_SHIFT-6) // r17=pgd_offset for region[0-4]
28589+ cmp.eq p7,p6=0,r21 // unused address bits all zeroes?
28590+#ifdef CONFIG_PGTABLE_4
28591+ shr.u r18=r22,PUD_SHIFT // shift pud index into position
28592+#else
28593+ shr.u r18=r22,PMD_SHIFT // shift pmd index into position
28594+#endif
28595+ ;;
28596+ ld8 r17=[r17] // get *pgd (may be 0)
28597+ ;;
28598+(p7) cmp.eq p6,p7=r17,r0 // was pgd_present(*pgd) == NULL?
28599+ dep r17=r18,r17,3,(PAGE_SHIFT-3) // r17=p[u|m]d_offset(pgd,addr)
28600+ ;;
28601+#ifdef CONFIG_PGTABLE_4
28602+(p7) ld8 r17=[r17] // get *pud (may be 0)
28603+ shr.u r18=r22,PMD_SHIFT // shift pmd index into position
28604+ ;;
28605+(p7) cmp.eq.or.andcm p6,p7=r17,r0 // was pud_present(*pud) == NULL?
28606+ dep r17=r18,r17,3,(PAGE_SHIFT-3) // r17=pmd_offset(pud,addr)
28607+ ;;
28608+#endif
28609+(p7) ld8 r17=[r17] // get *pmd (may be 0)
28610+ shr.u r19=r22,PAGE_SHIFT // shift pte index into position
28611+ ;;
28612+(p7) cmp.eq.or.andcm p6,p7=r17,r0 // was pmd_present(*pmd) == NULL?
28613+ dep r17=r19,r17,3,(PAGE_SHIFT-3) // r17=pte_offset(pmd,addr);
28614+(p6) br.cond.spnt page_fault
28615+ mov b0=r30
28616+ br.sptk.many b0 // return to continuation point
28617+END(nested_dtlb_miss)
28618+
28619+ .org ia64_ivt+0x1800
28620+/////////////////////////////////////////////////////////////////////////////////////////
28621+// 0x1800 Entry 6 (size 64 bundles) Instruction Key Miss (24)
28622+ENTRY(ikey_miss)
28623+ DBG_FAULT(6)
28624+ FAULT(6)
28625+END(ikey_miss)
28626+
28627+ //-----------------------------------------------------------------------------------
28628+ // call do_page_fault (predicates are in r31, psr.dt may be off, r16 is faulting address)
28629+ENTRY(page_fault)
28630+#ifdef CONFIG_XEN
28631+ XEN_HYPER_SSM_PSR_DT
28632+#else
28633+ ssm psr.dt
28634+ ;;
28635+ srlz.i
28636+#endif
28637+ ;;
28638+ SAVE_MIN_WITH_COVER
28639+ alloc r15=ar.pfs,0,0,3,0
28640+#ifdef CONFIG_XEN
28641+ movl r3=XSI_ISR
28642+ ;;
28643+ ld8 out1=[r3],XSI_IFA_OFS-XSI_ISR_OFS // get vcr.isr, point to ifa
28644+ ;;
28645+ ld8 out0=[r3] // get vcr.ifa
28646+ mov r14=1
28647+ ;;
28648+ add r3=XSI_PSR_IC_OFS-XSI_IFA_OFS, r3 // point to vpsr.ic
28649+ ;;
28650+ st4 [r3]=r14 // vpsr.ic = 1
28651+ adds r3=8,r2 // set up second base pointer
28652+ ;;
28653+#else
28654+ mov out0=cr.ifa
28655+ mov out1=cr.isr
28656+ adds r3=8,r2 // set up second base pointer
28657+ ;;
28658+ ssm psr.ic | PSR_DEFAULT_BITS
28659+ ;;
28660+ srlz.i // guarantee that interruption collection is on
28661+ ;;
28662+#endif
28663+#ifdef CONFIG_XEN
28664+ br.cond.sptk.many xen_page_fault
28665+ ;;
28666+done_xen_page_fault:
28667+#endif
28668+(p15) ssm psr.i // restore psr.i
28669+ movl r14=ia64_leave_kernel
28670+ ;;
28671+ SAVE_REST
28672+ mov rp=r14
28673+ ;;
28674+ adds out2=16,r12 // out2 = pointer to pt_regs
28675+ br.call.sptk.many b6=ia64_do_page_fault // ignore return address
28676+END(page_fault)
28677+
28678+ .org ia64_ivt+0x1c00
28679+/////////////////////////////////////////////////////////////////////////////////////////
28680+// 0x1c00 Entry 7 (size 64 bundles) Data Key Miss (12,51)
28681+ENTRY(dkey_miss)
28682+ DBG_FAULT(7)
28683+ FAULT(7)
28684+#ifdef CONFIG_XEN
28685+ // Leaving this code inline above results in an IVT section overflow
28686+ // There is no particular reason for this code to be here...
28687+xen_page_fault:
28688+(p15) movl r3=XSI_PSR_I_ADDR
28689+ ;;
28690+(p15) ld8 r3=[r3]
28691+ ;;
28692+(p15) st1 [r3]=r0,-1 // if (p15) vpsr.i = 1
28693+ mov r14=r0
28694+ ;;
28695+(p15) ld1 r14=[r3] // if (pending_events)
28696+ adds r3=8,r2 // re-set up second base pointer
28697+ ;;
28698+(p15) cmp.ne p15,p0=r14,r0
28699+ ;;
28700+ br.cond.sptk.many done_xen_page_fault
28701+ ;;
28702+#endif
28703+END(dkey_miss)
28704+
28705+ .org ia64_ivt+0x2000
28706+/////////////////////////////////////////////////////////////////////////////////////////
28707+// 0x2000 Entry 8 (size 64 bundles) Dirty-bit (54)
28708+ENTRY(dirty_bit)
28709+ DBG_FAULT(8)
28710+ /*
28711+ * What we do here is to simply turn on the dirty bit in the PTE. We need to
28712+ * update both the page-table and the TLB entry. To efficiently access the PTE,
28713+ * we address it through the virtual page table. Most likely, the TLB entry for
28714+ * the relevant virtual page table page is still present in the TLB so we can
28715+ * normally do this without additional TLB misses. In case the necessary virtual
28716+ * page table TLB entry isn't present, we take a nested TLB miss hit where we look
28717+ * up the physical address of the L3 PTE and then continue at label 1 below.
28718+ */
28719+#ifdef CONFIG_XEN
28720+ movl r16=XSI_IFA
28721+ ;;
28722+ ld8 r16=[r16]
28723+ ;;
28724+#else
28725+ mov r16=cr.ifa // get the address that caused the fault
28726+#endif
28727+ movl r30=1f // load continuation point in case of nested fault
28728+ ;;
28729+#ifdef CONFIG_XEN
28730+ mov r18=r8;
28731+ mov r8=r16;
28732+ XEN_HYPER_THASH;;
28733+ mov r17=r8;
28734+ mov r8=r18;;
28735+#else
28736+ thash r17=r16 // compute virtual address of L3 PTE
28737+#endif
28738+ mov r29=b0 // save b0 in case of nested fault
28739+ mov r31=pr // save pr
28740+#ifdef CONFIG_SMP
28741+ mov r28=ar.ccv // save ar.ccv
28742+ ;;
28743+1: ld8 r18=[r17]
28744+ ;; // avoid RAW on r18
28745+ mov ar.ccv=r18 // set compare value for cmpxchg
28746+ or r25=_PAGE_D|_PAGE_A,r18 // set the dirty and accessed bits
28747+ tbit.z p7,p6 = r18,_PAGE_P_BIT // Check present bit
28748+ ;;
28749+(p6) cmpxchg8.acq r26=[r17],r25,ar.ccv // Only update if page is present
28750+ mov r24=PAGE_SHIFT<<2
28751+ ;;
28752+(p6) cmp.eq p6,p7=r26,r18 // Only compare if page is present
28753+ ;;
28754+#ifdef CONFIG_XEN
28755+(p6) mov r18=r8
28756+(p6) mov r8=r25
28757+ ;;
28758+(p6) XEN_HYPER_ITC_D
28759+ ;;
28760+(p6) mov r8=r18
28761+#else
28762+(p6) itc.d r25 // install updated PTE
28763+#endif
28764+ ;;
28765+ /*
28766+ * Tell the assembler's dependency-violation checker that the above "itc" instructions
28767+ * cannot possibly affect the following loads:
28768+ */
28769+ dv_serialize_data
28770+
28771+ ld8 r18=[r17] // read PTE again
28772+ ;;
28773+ cmp.eq p6,p7=r18,r25 // is it same as the newly installed
28774+ ;;
28775+(p7) ptc.l r16,r24
28776+ mov b0=r29 // restore b0
28777+ mov ar.ccv=r28
28778+#else
28779+ ;;
28780+1: ld8 r18=[r17]
28781+ ;; // avoid RAW on r18
28782+ or r18=_PAGE_D|_PAGE_A,r18 // set the dirty and accessed bits
28783+ mov b0=r29 // restore b0
28784+ ;;
28785+ st8 [r17]=r18 // store back updated PTE
28786+ itc.d r18 // install updated PTE
28787+#endif
28788+ mov pr=r31,-1 // restore pr
28789+#ifdef CONFIG_XEN
28790+ XEN_HYPER_RFI
28791+ dv_serialize_data
28792+#else
28793+ rfi
28794+#endif
28795+END(dirty_bit)
28796+
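Under CONFIG_SMP the dirty-bit handler above updates the PTE with a compare-and-exchange so a concurrent change cannot be silently overwritten, installs the translation, and purges it again if the PTE moved once more. A hedged C analogy using the kernel's cmpxchg(); insert_tlb() and purge_tlb() are stand-ins for itc.d and ptc.l as before:

    static void set_dirty_and_accessed(volatile unsigned long *ptep, unsigned long vaddr)
    {
            unsigned long old = *ptep;
            unsigned long new = old | _PAGE_D | _PAGE_A;

            if (!(old & 1))                          /* _PAGE_P_BIT clear      */
                    return;                          /* page no longer present */
            if (cmpxchg(ptep, old, new) == old)
                    insert_tlb(vaddr, new);          /* itc.d with updated PTE */
            if (*ptep != new)                        /* changed yet again?     */
                    purge_tlb(vaddr);                /* ptc.l                  */
    }

The iaccess_bit and daccess_bit handlers below follow the same pattern, OR-ing in only _PAGE_A.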
28797+ .org ia64_ivt+0x2400
28798+/////////////////////////////////////////////////////////////////////////////////////////
28799+// 0x2400 Entry 9 (size 64 bundles) Instruction Access-bit (27)
28800+ENTRY(iaccess_bit)
28801+ DBG_FAULT(9)
28802+ // Like Entry 8, except for instruction access
28803+#ifdef CONFIG_XEN
28804+ movl r16=XSI_IFA
28805+ ;;
28806+ ld8 r16=[r16]
28807+ ;;
28808+#else
28809+ mov r16=cr.ifa // get the address that caused the fault
28810+#endif
28811+ movl r30=1f // load continuation point in case of nested fault
28812+ mov r31=pr // save predicates
28813+#ifdef CONFIG_ITANIUM
28814+ /*
28815+ * Erratum 10 (IFA may contain incorrect address) has "NoFix" status.
28816+ */
28817+ mov r17=cr.ipsr
28818+ ;;
28819+ mov r18=cr.iip
28820+ tbit.z p6,p0=r17,IA64_PSR_IS_BIT // IA64 instruction set?
28821+ ;;
28822+(p6) mov r16=r18 // if so, use cr.iip instead of cr.ifa
28823+#endif /* CONFIG_ITANIUM */
28824+ ;;
28825+#ifdef CONFIG_XEN
28826+ mov r18=r8;
28827+ mov r8=r16;
28828+ XEN_HYPER_THASH;;
28829+ mov r17=r8;
28830+ mov r8=r18;;
28831+#else
28832+ thash r17=r16 // compute virtual address of L3 PTE
28833+#endif
28834+ mov r29=b0 // save b0 in case of nested fault
28835+#ifdef CONFIG_SMP
28836+ mov r28=ar.ccv // save ar.ccv
28837+ ;;
28838+1: ld8 r18=[r17]
28839+ ;;
28840+ mov ar.ccv=r18 // set compare value for cmpxchg
28841+ or r25=_PAGE_A,r18 // set the accessed bit
28842+ tbit.z p7,p6 = r18,_PAGE_P_BIT // Check present bit
28843+ ;;
28844+(p6) cmpxchg8.acq r26=[r17],r25,ar.ccv // Only if page present
28845+ mov r24=PAGE_SHIFT<<2
28846+ ;;
28847+(p6) cmp.eq p6,p7=r26,r18 // Only if page present
28848+ ;;
28849+#ifdef CONFIG_XEN
28850+ mov r26=r8
28851+ mov r8=r25
28852+ ;;
28853+(p6) XEN_HYPER_ITC_I
28854+ ;;
28855+ mov r8=r26
28856+ ;;
28857+#else
28858+(p6) itc.i r25 // install updated PTE
28859+#endif
28860+ ;;
28861+ /*
28862+ * Tell the assembler's dependency-violation checker that the above "itc" instructions
28863+ * cannot possibly affect the following loads:
28864+ */
28865+ dv_serialize_data
28866+
28867+ ld8 r18=[r17] // read PTE again
28868+ ;;
28869+ cmp.eq p6,p7=r18,r25 // is it same as the newly installed
28870+ ;;
28871+(p7) ptc.l r16,r24
28872+ mov b0=r29 // restore b0
28873+ mov ar.ccv=r28
28874+#else /* !CONFIG_SMP */
28875+ ;;
28876+1: ld8 r18=[r17]
28877+ ;;
28878+ or r18=_PAGE_A,r18 // set the accessed bit
28879+ mov b0=r29 // restore b0
28880+ ;;
28881+ st8 [r17]=r18 // store back updated PTE
28882+ itc.i r18 // install updated PTE
28883+#endif /* !CONFIG_SMP */
28884+ mov pr=r31,-1
28885+#ifdef CONFIG_XEN
28886+ XEN_HYPER_RFI
28887+ dv_serialize_data
28888+#else
28889+ rfi
28890+#endif
28891+END(iaccess_bit)
28892+
28893+ .org ia64_ivt+0x2800
28894+/////////////////////////////////////////////////////////////////////////////////////////
28895+// 0x2800 Entry 10 (size 64 bundles) Data Access-bit (15,55)
28896+ENTRY(daccess_bit)
28897+ DBG_FAULT(10)
28898+ // Like Entry 8, except for data access
28899+#ifdef CONFIG_XEN
28900+ movl r16=XSI_IFA
28901+ ;;
28902+ ld8 r16=[r16]
28903+ ;;
28904+#else
28905+ mov r16=cr.ifa // get the address that caused the fault
28906+#endif
28907+ movl r30=1f // load continuation point in case of nested fault
28908+ ;;
28909+#ifdef CONFIG_XEN
28910+ mov r18=r8
28911+ mov r8=r16
28912+ XEN_HYPER_THASH
28913+ ;;
28914+ mov r17=r8
28915+ mov r8=r18
28916+ ;;
28917+#else
28918+ thash r17=r16 // compute virtual address of L3 PTE
28919+#endif
28920+ mov r31=pr
28921+ mov r29=b0 // save b0 in case of nested fault
28922+#ifdef CONFIG_SMP
28923+ mov r28=ar.ccv // save ar.ccv
28924+ ;;
28925+1: ld8 r18=[r17]
28926+ ;; // avoid RAW on r18
28927+ mov ar.ccv=r18 // set compare value for cmpxchg
28928+ or r25=_PAGE_A,r18 // set the accessed bit
28929+ tbit.z p7,p6 = r18,_PAGE_P_BIT // Check present bit
28930+ ;;
28931+(p6) cmpxchg8.acq r26=[r17],r25,ar.ccv // Only if page is present
28932+ mov r24=PAGE_SHIFT<<2
28933+ ;;
28934+(p6) cmp.eq p6,p7=r26,r18 // Only if page is present
28935+ ;;
28936+#ifdef CONFIG_XEN
28937+ mov r26=r8
28938+ mov r8=r25
28939+ ;;
28940+(p6) XEN_HYPER_ITC_D
28941+ ;;
28942+ mov r8=r26
28943+ ;;
28944+#else
28945+(p6) itc.d r25 // install updated PTE
28946+#endif
28947+ /*
28948+ * Tell the assembler's dependency-violation checker that the above "itc" instructions
28949+ * cannot possibly affect the following loads:
28950+ */
28951+ dv_serialize_data
28952+ ;;
28953+ ld8 r18=[r17] // read PTE again
28954+ ;;
28955+ cmp.eq p6,p7=r18,r25 // is it same as the newly installed
28956+ ;;
28957+(p7) ptc.l r16,r24
28958+ mov ar.ccv=r28
28959+#else
28960+ ;;
28961+1: ld8 r18=[r17]
28962+ ;; // avoid RAW on r18
28963+ or r18=_PAGE_A,r18 // set the accessed bit
28964+ ;;
28965+ st8 [r17]=r18 // store back updated PTE
28966+ itc.d r18 // install updated PTE
28967+#endif
28968+ mov b0=r29 // restore b0
28969+ mov pr=r31,-1
28970+#ifdef CONFIG_XEN
28971+ XEN_HYPER_RFI
28972+ dv_serialize_data
28973+#else
28974+ rfi
28975+#endif
28976+END(daccess_bit)
28977+
28978+ .org ia64_ivt+0x2c00
28979+/////////////////////////////////////////////////////////////////////////////////////////
28980+// 0x2c00 Entry 11 (size 64 bundles) Break instruction (33)
28981+ENTRY(break_fault)
28982+ /*
28983+ * The streamlined system call entry/exit paths only save/restore the initial part
28984+ * of pt_regs. This implies that the callers of system-calls must adhere to the
28985+ * normal procedure calling conventions.
28986+ *
28987+ * Registers to be saved & restored:
28988+ * CR registers: cr.ipsr, cr.iip, cr.ifs
28989+ * AR registers: ar.unat, ar.pfs, ar.rsc, ar.rnat, ar.bspstore, ar.fpsr
28990+ * others: pr, b0, b6, loadrs, r1, r11, r12, r13, r15
28991+ * Registers to be restored only:
28992+ * r8-r11: output value from the system call.
28993+ *
28994+ * During system call exit, scratch registers (including r15) are modified/cleared
28995+ * to prevent leaking bits from kernel to user level.
28996+ */
28997+ DBG_FAULT(11)
28998+ mov.m r16=IA64_KR(CURRENT) // M2 r16 <- current task (12 cyc)
28999+#ifdef CONFIG_XEN
29000+ movl r22=XSI_IPSR
29001+ ;;
29002+ ld8 r29=[r22],XSI_IIM_OFS-XSI_IPSR_OFS // get ipsr, point to iip
29003+#else
29004+ mov r29=cr.ipsr // M2 (12 cyc)
29005+#endif
29006+ mov r31=pr // I0 (2 cyc)
29007+
29008+#ifdef CONFIG_XEN
29009+ ;;
29010+ ld8 r17=[r22],XSI_IIP_OFS-XSI_IIM_OFS
29011+#else
29012+ mov r17=cr.iim // M2 (2 cyc)
29013+#endif
29014+ mov.m r27=ar.rsc // M2 (12 cyc)
29015+ mov r18=__IA64_BREAK_SYSCALL // A
29016+
29017+ mov.m ar.rsc=0 // M2
29018+ mov.m r21=ar.fpsr // M2 (12 cyc)
29019+ mov r19=b6 // I0 (2 cyc)
29020+ ;;
29021+ mov.m r23=ar.bspstore // M2 (12 cyc)
29022+ mov.m r24=ar.rnat // M2 (5 cyc)
29023+ mov.i r26=ar.pfs // I0 (2 cyc)
29024+
29025+ invala // M0|1
29026+ nop.m 0 // M
29027+ mov r20=r1 // A save r1
29028+
29029+ nop.m 0
29030+ movl r30=sys_call_table // X
29031+
29032+#ifdef CONFIG_XEN
29033+ ld8 r28=[r22]
29034+#else
29035+ mov r28=cr.iip // M2 (2 cyc)
29036+#endif
29037+ cmp.eq p0,p7=r18,r17 // I0 is this a system call?
29038+(p7) br.cond.spnt non_syscall // B no ->
29039+ //
29040+ // From this point on, we are definitely on the syscall-path
29041+ // and we can use (non-banked) scratch registers.
29042+ //
29043+///////////////////////////////////////////////////////////////////////
29044+ mov r1=r16 // A move task-pointer to "addl"-addressable reg
29045+ mov r2=r16 // A setup r2 for ia64_syscall_setup
29046+ add r9=TI_FLAGS+IA64_TASK_SIZE,r16 // A r9 = &current_thread_info()->flags
29047+
29048+ adds r16=IA64_TASK_THREAD_ON_USTACK_OFFSET,r16
29049+ adds r15=-1024,r15 // A subtract 1024 from syscall number
29050+ mov r3=NR_syscalls - 1
29051+ ;;
29052+ ld1.bias r17=[r16] // M0|1 r17 = current->thread.on_ustack flag
29053+ ld4 r9=[r9] // M0|1 r9 = current_thread_info()->flags
29054+ extr.u r8=r29,41,2 // I0 extract ei field from cr.ipsr
29055+
29056+ shladd r30=r15,3,r30 // A r30 = sys_call_table + 8*(syscall-1024)
29057+ addl r22=IA64_RBS_OFFSET,r1 // A compute base of RBS
29058+ cmp.leu p6,p7=r15,r3 // A syscall number in range?
29059+ ;;
29060+
29061+ lfetch.fault.excl.nt1 [r22] // M0|1 prefetch RBS
29062+(p6) ld8 r30=[r30] // M0|1 load address of syscall entry point
29063+ tnat.nz.or p7,p0=r15 // I0 is syscall nr a NaT?
29064+
29065+ mov.m ar.bspstore=r22 // M2 switch to kernel RBS
29066+ cmp.eq p8,p9=2,r8 // A isr.ei==2?
29067+ ;;
29068+
29069+(p8) mov r8=0 // A clear ei to 0
29070+(p7) movl r30=sys_ni_syscall // X
29071+
29072+(p8) adds r28=16,r28 // A switch cr.iip to next bundle
29073+(p9) adds r8=1,r8 // A increment ei to next slot
29074+ nop.i 0
29075+ ;;
29076+
29077+ mov.m r25=ar.unat // M2 (5 cyc)
29078+ dep r29=r8,r29,41,2 // I0 insert new ei into cr.ipsr
29079+ adds r15=1024,r15 // A restore original syscall number
29080+ //
29081+ // If any of the above loads miss in L1D, we'll stall here until
29082+ // the data arrives.
29083+ //
29084+///////////////////////////////////////////////////////////////////////
29085+ st1 [r16]=r0 // M2|3 clear current->thread.on_ustack flag
29086+ mov b6=r30 // I0 setup syscall handler branch reg early
29087+ cmp.eq pKStk,pUStk=r0,r17 // A were we on kernel stacks already?
29088+
29089+ and r9=_TIF_SYSCALL_TRACEAUDIT,r9 // A mask trace or audit
29090+ mov r18=ar.bsp // M2 (12 cyc)
29091+(pKStk) br.cond.spnt .break_fixup // B we're already in kernel-mode -- fix up RBS
29092+ ;;
29093+.back_from_break_fixup:
29094+(pUStk) addl r1=IA64_STK_OFFSET-IA64_PT_REGS_SIZE,r1 // A compute base of memory stack
29095+ cmp.eq p14,p0=r9,r0 // A are syscalls being traced/audited?
29096+ br.call.sptk.many b7=ia64_syscall_setup // B
29097+1:
29098+ mov ar.rsc=0x3 // M2 set eager mode, pl 0, LE, loadrs=0
29099+ nop 0
29100+#ifdef CONFIG_XEN
29101+ mov r2=b0; br.call.sptk b0=xen_bsw1;; mov b0=r2;;
29102+#else
29103+ bsw.1 // B (6 cyc) regs are saved, switch to bank 1
29104+#endif
29105+ ;;
29106+
29107+#ifdef CONFIG_XEN
29108+ movl r16=XSI_PSR_IC
29109+ mov r3=1
29110+ ;;
29111+ st4 [r16]=r3,XSI_PSR_I_ADDR_OFS-XSI_PSR_IC_OFS // vpsr.ic = 1
29112+#else
29113+ ssm psr.ic | PSR_DEFAULT_BITS // M2 now it's safe to re-enable intr.-collection
29114+#endif
29115+ movl r3=ia64_ret_from_syscall // X
29116+ ;;
29117+
29118+ srlz.i // M0 ensure interruption collection is on
29119+ mov rp=r3 // I0 set the real return addr
29120+(p10) br.cond.spnt.many ia64_ret_from_syscall // B return if bad call-frame or r15 is a NaT
29121+
29122+#ifdef CONFIG_XEN
29123+(p15) ld8 r16=[r16] // vpsr.i
29124+ ;;
29125+(p15) st1 [r16]=r0,-1 // if (p15) vpsr.i = 1
29126+ mov r2=r0
29127+ ;;
29128+(p15) ld1 r2=[r16] // if (pending_events)
29129+ ;;
29130+ cmp.ne p6,p0=r2,r0
29131+ ;;
29132+(p6) ssm psr.i // do a real ssm psr.i
29133+#else
29134+(p15) ssm psr.i // M2 restore psr.i
29135+#endif
29136+(p14) br.call.sptk.many b6=b6 // B invoke syscall handler (ignore return addr)
29137+ br.cond.spnt.many ia64_trace_syscall // B do syscall-tracing thingamagic
29138+ // NOT REACHED
29139+///////////////////////////////////////////////////////////////////////
29140+ // On entry, we optimistically assumed that we're coming from user-space.
29141+ // For the rare cases where a system-call is done from within the kernel,
29142+ // we fix things up at this point:
29143+.break_fixup:
29144+ add r1=-IA64_PT_REGS_SIZE,sp // A allocate space for pt_regs structure
29145+ mov ar.rnat=r24 // M2 restore kernel's AR.RNAT
29146+ ;;
29147+ mov ar.bspstore=r23 // M2 restore kernel's AR.BSPSTORE
29148+ br.cond.sptk .back_from_break_fixup
29149+END(break_fault)
29150+
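The break_fault fast path above reduces to: subtract 1024 from the requested syscall number, bounds-check it against NR_syscalls, and branch through sys_call_table, falling back to sys_ni_syscall for out-of-range or NaT numbers. A hedged sketch with argument passing and the tracing/audit path omitted:

    /* Simplified signatures; the real table entries take up to 8 arguments. */
    extern long (*sys_call_table[])(void);
    extern long sys_ni_syscall(void);

    static long dispatch_syscall(unsigned long r15, unsigned long nr_syscalls)
    {
            unsigned long nr = r15 - 1024;           /* adds r15=-1024,r15    */

            if (nr > nr_syscalls - 1)                /* cmp.leu p6,p7=r15,r3  */
                    return sys_ni_syscall();
            return sys_call_table[nr]();             /* ld8 r30=[r30]; br b6  */
    }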
29151+ .org ia64_ivt+0x3000
29152+/////////////////////////////////////////////////////////////////////////////////////////
29153+// 0x3000 Entry 12 (size 64 bundles) External Interrupt (4)
29154+ENTRY(interrupt)
29155+ DBG_FAULT(12)
29156+ mov r31=pr // prepare to save predicates
29157+ ;;
29158+ SAVE_MIN_WITH_COVER // uses r31; defines r2 and r3
29159+#ifdef CONFIG_XEN
29160+ movl r3=XSI_PSR_IC
29161+ mov r14=1
29162+ ;;
29163+ st4 [r3]=r14
29164+#else
29165+ ssm psr.ic | PSR_DEFAULT_BITS
29166+#endif
29167+ ;;
29168+ adds r3=8,r2 // set up second base pointer for SAVE_REST
29169+ srlz.i // ensure everybody knows psr.ic is back on
29170+ ;;
29171+ SAVE_REST
29172+ ;;
29173+ alloc r14=ar.pfs,0,0,2,0 // must be first in an insn group
29174+#ifdef CONFIG_XEN
29175+ ;;
29176+ br.call.sptk.many rp=xen_get_ivr
29177+ ;;
29178+ mov out0=r8 // pass cr.ivr as first arg
29179+#else
29180+ mov out0=cr.ivr // pass cr.ivr as first arg
29181+#endif
29182+ add out1=16,sp // pass pointer to pt_regs as second arg
29183+ ;;
29184+ srlz.d // make sure we see the effect of cr.ivr
29185+ movl r14=ia64_leave_kernel
29186+ ;;
29187+ mov rp=r14
29188+ br.call.sptk.many b6=ia64_handle_irq
29189+END(interrupt)
29190+
29191+ .org ia64_ivt+0x3400
29192+/////////////////////////////////////////////////////////////////////////////////////////
29193+// 0x3400 Entry 13 (size 64 bundles) Reserved
29194+ DBG_FAULT(13)
29195+ FAULT(13)
29196+
29197+ .org ia64_ivt+0x3800
29198+/////////////////////////////////////////////////////////////////////////////////////////
29199+// 0x3800 Entry 14 (size 64 bundles) Reserved
29200+ DBG_FAULT(14)
29201+ FAULT(14)
29202+
29203+ /*
29204+ * There is no particular reason for this code to be here, other than that
29205+ * there happens to be space here that would go unused otherwise. If this
29206+ * fault ever gets "unreserved", simply move the following code to a more
29207+ * suitable spot...
29208+ *
29209+ * ia64_syscall_setup() is a separate subroutine so that it can
29210+ * allocate stacked registers so it can safely demine any
29211+ * potential NaT values from the input registers.
29212+ *
29213+ * On entry:
29214+ * - executing on bank 0 or bank 1 register set (doesn't matter)
29215+ * - r1: stack pointer
29216+ * - r2: current task pointer
29217+ * - r3: preserved
29218+ * - r11: original contents (saved ar.pfs to be saved)
29219+ * - r12: original contents (sp to be saved)
29220+ * - r13: original contents (tp to be saved)
29221+ * - r15: original contents (syscall # to be saved)
29222+ * - r18: saved bsp (after switching to kernel stack)
29223+ * - r19: saved b6
29224+ * - r20: saved r1 (gp)
29225+ * - r21: saved ar.fpsr
29226+ * - r22: kernel's register backing store base (krbs_base)
29227+ * - r23: saved ar.bspstore
29228+ * - r24: saved ar.rnat
29229+ * - r25: saved ar.unat
29230+ * - r26: saved ar.pfs
29231+ * - r27: saved ar.rsc
29232+ * - r28: saved cr.iip
29233+ * - r29: saved cr.ipsr
29234+ * - r31: saved pr
29235+ * - b0: original contents (to be saved)
29236+ * On exit:
29237+ * - p10: TRUE if syscall is invoked with more than 8 out
29238+ * registers or r15's Nat is true
29239+ * - r1: kernel's gp
29240+ * - r3: preserved (same as on entry)
29241+ * - r8: -EINVAL if p10 is true
29242+ * - r12: points to kernel stack
29243+ * - r13: points to current task
29244+ * - r14: preserved (same as on entry)
29245+ * - p13: preserved
29246+ * - p15: TRUE if interrupts need to be re-enabled
29247+ * - ar.fpsr: set to kernel settings
29248+ * - b6: preserved (same as on entry)
29249+ */
29250+#ifndef CONFIG_XEN
29251+GLOBAL_ENTRY(ia64_syscall_setup)
29252+#if PT(B6) != 0
29253+# error This code assumes that b6 is the first field in pt_regs.
29254+#endif
29255+ st8 [r1]=r19 // save b6
29256+ add r16=PT(CR_IPSR),r1 // initialize first base pointer
29257+ add r17=PT(R11),r1 // initialize second base pointer
29258+ ;;
29259+ alloc r19=ar.pfs,8,0,0,0 // ensure in0-in7 are writable
29260+ st8 [r16]=r29,PT(AR_PFS)-PT(CR_IPSR) // save cr.ipsr
29261+ tnat.nz p8,p0=in0
29262+
29263+ st8.spill [r17]=r11,PT(CR_IIP)-PT(R11) // save r11
29264+ tnat.nz p9,p0=in1
29265+(pKStk) mov r18=r0 // make sure r18 isn't NaT
29266+ ;;
29267+
29268+ st8 [r16]=r26,PT(CR_IFS)-PT(AR_PFS) // save ar.pfs
29269+ st8 [r17]=r28,PT(AR_UNAT)-PT(CR_IIP) // save cr.iip
29270+ mov r28=b0 // save b0 (2 cyc)
29271+ ;;
29272+
29273+ st8 [r17]=r25,PT(AR_RSC)-PT(AR_UNAT) // save ar.unat
29274+ dep r19=0,r19,38,26 // clear all bits but 0..37 [I0]
29275+(p8) mov in0=-1
29276+ ;;
29277+
29278+ st8 [r16]=r19,PT(AR_RNAT)-PT(CR_IFS) // store ar.pfs.pfm in cr.ifs
29279+ extr.u r11=r19,7,7 // I0 // get sol of ar.pfs
29280+ and r8=0x7f,r19 // A // get sof of ar.pfs
29281+
29282+ st8 [r17]=r27,PT(AR_BSPSTORE)-PT(AR_RSC)// save ar.rsc
29283+ tbit.nz p15,p0=r29,IA64_PSR_I_BIT // I0
29284+(p9) mov in1=-1
29285+ ;;
29286+
29287+(pUStk) sub r18=r18,r22 // r18=RSE.ndirty*8
29288+ tnat.nz p10,p0=in2
29289+ add r11=8,r11
29290+ ;;
29291+(pKStk) adds r16=PT(PR)-PT(AR_RNAT),r16 // skip over ar_rnat field
29292+(pKStk) adds r17=PT(B0)-PT(AR_BSPSTORE),r17 // skip over ar_bspstore field
29293+ tnat.nz p11,p0=in3
29294+ ;;
29295+(p10) mov in2=-1
29296+ tnat.nz p12,p0=in4 // [I0]
29297+(p11) mov in3=-1
29298+ ;;
29299+(pUStk) st8 [r16]=r24,PT(PR)-PT(AR_RNAT) // save ar.rnat
29300+(pUStk) st8 [r17]=r23,PT(B0)-PT(AR_BSPSTORE) // save ar.bspstore
29301+ shl r18=r18,16 // compute ar.rsc to be used for "loadrs"
29302+ ;;
29303+ st8 [r16]=r31,PT(LOADRS)-PT(PR) // save predicates
29304+ st8 [r17]=r28,PT(R1)-PT(B0) // save b0
29305+ tnat.nz p13,p0=in5 // [I0]
29306+ ;;
29307+ st8 [r16]=r18,PT(R12)-PT(LOADRS) // save ar.rsc value for "loadrs"
29308+ st8.spill [r17]=r20,PT(R13)-PT(R1) // save original r1
29309+(p12) mov in4=-1
29310+ ;;
29311+
29312+.mem.offset 0,0; st8.spill [r16]=r12,PT(AR_FPSR)-PT(R12) // save r12
29313+.mem.offset 8,0; st8.spill [r17]=r13,PT(R15)-PT(R13) // save r13
29314+(p13) mov in5=-1
29315+ ;;
29316+ st8 [r16]=r21,PT(R8)-PT(AR_FPSR) // save ar.fpsr
29317+ tnat.nz p13,p0=in6
29318+ cmp.lt p10,p9=r11,r8 // frame size can't be more than local+8
29319+ ;;
29320+ mov r8=1
29321+(p9) tnat.nz p10,p0=r15
29322+ adds r12=-16,r1 // switch to kernel memory stack (with 16 bytes of scratch)
29323+
29324+ st8.spill [r17]=r15 // save r15
29325+ tnat.nz p8,p0=in7
29326+ nop.i 0
29327+
29328+ mov r13=r2 // establish `current'
29329+ movl r1=__gp // establish kernel global pointer
29330+ ;;
29331+ st8 [r16]=r8 // ensure pt_regs.r8 != 0 (see handle_syscall_error)
29332+(p13) mov in6=-1
29333+(p8) mov in7=-1
29334+
29335+ cmp.eq pSys,pNonSys=r0,r0 // set pSys=1, pNonSys=0
29336+ movl r17=FPSR_DEFAULT
29337+ ;;
29338+ mov.m ar.fpsr=r17 // set ar.fpsr to kernel default value
29339+(p10) mov r8=-EINVAL
29340+ br.ret.sptk.many b7
29341+END(ia64_syscall_setup)
29342+#endif
29343+
29344+ .org ia64_ivt+0x3c00
29345+/////////////////////////////////////////////////////////////////////////////////////////
29346+// 0x3c00 Entry 15 (size 64 bundles) Reserved
29347+ DBG_FAULT(15)
29348+ FAULT(15)
29349+
29350+ /*
29351+ * Squatting in this space ...
29352+ *
29353+ * This special case dispatcher for illegal operation faults allows preserved
29354+ * registers to be modified through a callback function (asm only) that is handed
29355+ * back from the fault handler in r8. Up to three arguments can be passed to the
29356+ * callback function by returning an aggregate with the callback as its first
29357+ * element, followed by the arguments.
29358+ */
29359+ENTRY(dispatch_illegal_op_fault)
29360+ .prologue
29361+ .body
29362+ SAVE_MIN_WITH_COVER
29363+ ssm psr.ic | PSR_DEFAULT_BITS
29364+ ;;
29365+ srlz.i // guarantee that interruption collection is on
29366+ ;;
29367+(p15) ssm psr.i // restore psr.i
29368+ adds r3=8,r2 // set up second base pointer for SAVE_REST
29369+ ;;
29370+ alloc r14=ar.pfs,0,0,1,0 // must be first in insn group
29371+ mov out0=ar.ec
29372+ ;;
29373+ SAVE_REST
29374+ PT_REGS_UNWIND_INFO(0)
29375+ ;;
29376+ br.call.sptk.many rp=ia64_illegal_op_fault
29377+.ret0: ;;
29378+ alloc r14=ar.pfs,0,0,3,0 // must be first in insn group
29379+ mov out0=r9
29380+ mov out1=r10
29381+ mov out2=r11
29382+ movl r15=ia64_leave_kernel
29383+ ;;
29384+ mov rp=r15
29385+ mov b6=r8
29386+ ;;
29387+ cmp.ne p6,p0=0,r8
29388+(p6) br.call.dpnt.many b6=b6 // call returns to ia64_leave_kernel
29389+ br.sptk.many ia64_leave_kernel
29390+END(dispatch_illegal_op_fault)
29391+
29392+ .org ia64_ivt+0x4000
29393+/////////////////////////////////////////////////////////////////////////////////////////
29394+// 0x4000 Entry 16 (size 64 bundles) Reserved
29395+ DBG_FAULT(16)
29396+ FAULT(16)
29397+
29398+ .org ia64_ivt+0x4400
29399+/////////////////////////////////////////////////////////////////////////////////////////
29400+// 0x4400 Entry 17 (size 64 bundles) Reserved
29401+ DBG_FAULT(17)
29402+ FAULT(17)
29403+
29404+ENTRY(non_syscall)
29405+ mov ar.rsc=r27 // restore ar.rsc before SAVE_MIN_WITH_COVER
29406+ ;;
29407+ SAVE_MIN_WITH_COVER
29408+
29409+ // There is no particular reason for this code to be here, other than that
29410+ // there happens to be space here that would go unused otherwise. If this
29411+ // fault ever gets "unreserved", simply move the following code to a more
29412+ // suitable spot...
29413+
29414+ alloc r14=ar.pfs,0,0,2,0
29415+ mov out0=cr.iim
29416+ add out1=16,sp
29417+ adds r3=8,r2 // set up second base pointer for SAVE_REST
29418+
29419+ ssm psr.ic | PSR_DEFAULT_BITS
29420+ ;;
29421+ srlz.i // guarantee that interruption collection is on
29422+ ;;
29423+(p15) ssm psr.i // restore psr.i
29424+ movl r15=ia64_leave_kernel
29425+ ;;
29426+ SAVE_REST
29427+ mov rp=r15
29428+ ;;
29429+ br.call.sptk.many b6=ia64_bad_break // avoid WAW on CFM and ignore return addr
29430+END(non_syscall)
29431+
29432+ .org ia64_ivt+0x4800
29433+/////////////////////////////////////////////////////////////////////////////////////////
29434+// 0x4800 Entry 18 (size 64 bundles) Reserved
29435+ DBG_FAULT(18)
29436+ FAULT(18)
29437+
29438+ /*
29439+ * There is no particular reason for this code to be here, other than that
29440+ * there happens to be space here that would go unused otherwise. If this
29441+ * fault ever gets "unreserved", simply move the following code to a more
29442+ * suitable spot...
29443+ */
29444+
29445+ENTRY(dispatch_unaligned_handler)
29446+ SAVE_MIN_WITH_COVER
29447+ ;;
29448+ alloc r14=ar.pfs,0,0,2,0 // now it's safe (must be first in insn group!)
29449+ mov out0=cr.ifa
29450+ adds out1=16,sp
29451+
29452+ ssm psr.ic | PSR_DEFAULT_BITS
29453+ ;;
29454+ srlz.i // guarantee that interruption collection is on
29455+ ;;
29456+(p15) ssm psr.i // restore psr.i
29457+ adds r3=8,r2 // set up second base pointer
29458+ ;;
29459+ SAVE_REST
29460+ movl r14=ia64_leave_kernel
29461+ ;;
29462+ mov rp=r14
29463+ br.sptk.many ia64_prepare_handle_unaligned
29464+END(dispatch_unaligned_handler)
29465+
29466+ .org ia64_ivt+0x4c00
29467+/////////////////////////////////////////////////////////////////////////////////////////
29468+// 0x4c00 Entry 19 (size 64 bundles) Reserved
29469+ DBG_FAULT(19)
29470+ FAULT(19)
29471+
29472+ /*
29473+ * There is no particular reason for this code to be here, other than that
29474+ * there happens to be space here that would go unused otherwise. If this
29475+ * fault ever gets "unreserved", simply move the following code to a more
29476+ * suitable spot...
29477+ */
29478+
29479+ENTRY(dispatch_to_fault_handler)
29480+ /*
29481+ * Input:
29482+ * psr.ic: off
29483+ * r19: fault vector number (e.g., 24 for General Exception)
29484+ * r31: contains saved predicates (pr)
29485+ */
29486+ SAVE_MIN_WITH_COVER_R19
29487+ alloc r14=ar.pfs,0,0,5,0
29488+ mov out0=r15
29489+#ifdef CONFIG_XEN
29490+ movl out1=XSI_ISR
29491+ ;;
29492+ adds out2=XSI_IFA-XSI_ISR,out1
29493+ adds out3=XSI_IIM-XSI_ISR,out1
29494+ adds out4=XSI_ITIR-XSI_ISR,out1
29495+ ;;
29496+ ld8 out1=[out1]
29497+ ld8 out2=[out2]
29498+ ld8 out3=[out3] // (was [out4]: cr.iim would be dropped and itir passed twice)
29499+ ld8 out4=[out4]
29500+ ;;
29501+#else
29502+ mov out1=cr.isr
29503+ mov out2=cr.ifa
29504+ mov out3=cr.iim
29505+ mov out4=cr.itir
29506+ ;;
29507+#endif
29508+ ssm psr.ic | PSR_DEFAULT_BITS
29509+ ;;
29510+ srlz.i // guarantee that interruption collection is on
29511+ ;;
29512+(p15) ssm psr.i // restore psr.i
29513+ adds r3=8,r2 // set up second base pointer for SAVE_REST
29514+ ;;
29515+ SAVE_REST
29516+ movl r14=ia64_leave_kernel
29517+ ;;
29518+ mov rp=r14
29519+ br.call.sptk.many b6=ia64_fault
29520+END(dispatch_to_fault_handler)
29521+
29522+//
29523+// --- End of long entries, Beginning of short entries
29524+//
29525+
29526+ .org ia64_ivt+0x5000
29527+/////////////////////////////////////////////////////////////////////////////////////////
29528+// 0x5000 Entry 20 (size 16 bundles) Page Not Present (10,22,49)
29529+ENTRY(page_not_present)
29530+ DBG_FAULT(20)
29531+ mov r16=cr.ifa
29532+ rsm psr.dt
29533+ /*
29534+ * The Linux page fault handler doesn't expect non-present pages to be in
29535+ * the TLB. Flush the existing entry now, so we meet that expectation.
29536+ */
29537+ mov r17=PAGE_SHIFT<<2
29538+ ;;
29539+ ptc.l r16,r17
29540+ ;;
29541+ mov r31=pr
29542+ srlz.d
29543+ br.sptk.many page_fault
29544+END(page_not_present)
29545+
29546+ .org ia64_ivt+0x5100
29547+/////////////////////////////////////////////////////////////////////////////////////////
29548+// 0x5100 Entry 21 (size 16 bundles) Key Permission (13,25,52)
29549+ENTRY(key_permission)
29550+ DBG_FAULT(21)
29551+ mov r16=cr.ifa
29552+ rsm psr.dt
29553+ mov r31=pr
29554+ ;;
29555+ srlz.d
29556+ br.sptk.many page_fault
29557+END(key_permission)
29558+
29559+ .org ia64_ivt+0x5200
29560+/////////////////////////////////////////////////////////////////////////////////////////
29561+// 0x5200 Entry 22 (size 16 bundles) Instruction Access Rights (26)
29562+ENTRY(iaccess_rights)
29563+ DBG_FAULT(22)
29564+ mov r16=cr.ifa
29565+ rsm psr.dt
29566+ mov r31=pr
29567+ ;;
29568+ srlz.d
29569+ br.sptk.many page_fault
29570+END(iaccess_rights)
29571+
29572+ .org ia64_ivt+0x5300
29573+/////////////////////////////////////////////////////////////////////////////////////////
29574+// 0x5300 Entry 23 (size 16 bundles) Data Access Rights (14,53)
29575+ENTRY(daccess_rights)
29576+ DBG_FAULT(23)
29577+#ifdef CONFIG_XEN
29578+ movl r16=XSI_IFA
29579+ ;;
29580+ ld8 r16=[r16]
29581+ ;;
29582+ XEN_HYPER_RSM_PSR_DT
29583+#else
29584+ mov r16=cr.ifa
29585+ rsm psr.dt
29586+#endif
29587+ mov r31=pr
29588+ ;;
29589+ srlz.d
29590+ br.sptk.many page_fault
29591+END(daccess_rights)
29592+
29593+ .org ia64_ivt+0x5400
29594+/////////////////////////////////////////////////////////////////////////////////////////
29595+// 0x5400 Entry 24 (size 16 bundles) General Exception (5,32,34,36,38,39)
29596+ENTRY(general_exception)
29597+ DBG_FAULT(24)
29598+ mov r16=cr.isr
29599+ mov r31=pr
29600+ ;;
29601+ cmp4.eq p6,p0=0,r16
29602+(p6) br.sptk.many dispatch_illegal_op_fault
29603+ ;;
29604+ mov r19=24 // fault number
29605+ br.sptk.many dispatch_to_fault_handler
29606+END(general_exception)
29607+
29608+ .org ia64_ivt+0x5500
29609+/////////////////////////////////////////////////////////////////////////////////////////
29610+// 0x5500 Entry 25 (size 16 bundles) Disabled FP-Register (35)
29611+ENTRY(disabled_fp_reg)
29612+ DBG_FAULT(25)
29613+ rsm psr.dfh // ensure we can access fph
29614+ ;;
29615+ srlz.d
29616+ mov r31=pr
29617+ mov r19=25
29618+ br.sptk.many dispatch_to_fault_handler
29619+END(disabled_fp_reg)
29620+
29621+ .org ia64_ivt+0x5600
29622+/////////////////////////////////////////////////////////////////////////////////////////
29623+// 0x5600 Entry 26 (size 16 bundles) Nat Consumption (11,23,37,50)
29624+ENTRY(nat_consumption)
29625+ DBG_FAULT(26)
29626+
29627+ mov r16=cr.ipsr
29628+ mov r17=cr.isr
29629+ mov r31=pr // save PR
29630+ ;;
29631+ and r18=0xf,r17 // r18 = cr.isr.code{3:0}
29632+ tbit.z p6,p0=r17,IA64_ISR_NA_BIT
29633+ ;;
29634+ cmp.ne.or p6,p0=IA64_ISR_CODE_LFETCH,r18
29635+ dep r16=-1,r16,IA64_PSR_ED_BIT,1
29636+(p6) br.cond.spnt 1f // branch if (cr.isr.na == 0 || cr.isr.code{3:0} != LFETCH)
29637+ ;;
29638+ mov cr.ipsr=r16 // set cr.ipsr.ed
29639+ mov pr=r31,-1
29640+ ;;
29641+ rfi
29642+
29643+1: mov pr=r31,-1
29644+ ;;
29645+ FAULT(26)
29646+END(nat_consumption)
29647+
29648+ .org ia64_ivt+0x5700
29649+/////////////////////////////////////////////////////////////////////////////////////////
29650+// 0x5700 Entry 27 (size 16 bundles) Speculation (40)
29651+ENTRY(speculation_vector)
29652+ DBG_FAULT(27)
29653+ /*
29654+ * A [f]chk.[as] instruction needs to take the branch to the recovery code but
29655+ * this part of the architecture is not implemented in hardware on some CPUs, such
29656+ * as Itanium. Thus, in general we need to emulate the behavior. IIM contains
29657+ * the relative target (not yet sign extended). So after sign extending it we
29658+ * simply add it to IIP. We also need to reset the EI field of the IPSR to zero,
29659+ * i.e., the slot to restart into.
29660+ *
29661+ * cr.iim contains zero_ext(imm21)
29662+ */
29663+ mov r18=cr.iim
29664+ ;;
29665+ mov r17=cr.iip
29666+ shl r18=r18,43 // put sign bit in position (43=64-21)
29667+ ;;
29668+
29669+ mov r16=cr.ipsr
29670+ shr r18=r18,39 // sign extend (39=43-4)
29671+ ;;
29672+
29673+ add r17=r17,r18 // now add the offset
29674+ ;;
29675+ mov cr.iip=r17
29676+ dep r16=0,r16,41,2 // clear EI
29677+ ;;
29678+
29679+ mov cr.ipsr=r16
29680+ ;;
29681+
29682+#ifdef CONFIG_XEN
29683+ XEN_HYPER_RFI;
29684+#else
29685+ rfi // and go back
29686+#endif
29687+END(speculation_vector)
29688+
29689+ .org ia64_ivt+0x5800
29690+/////////////////////////////////////////////////////////////////////////////////////////
29691+// 0x5800 Entry 28 (size 16 bundles) Reserved
29692+ DBG_FAULT(28)
29693+ FAULT(28)
29694+
29695+ .org ia64_ivt+0x5900
29696+/////////////////////////////////////////////////////////////////////////////////////////
29697+// 0x5900 Entry 29 (size 16 bundles) Debug (16,28,56)
29698+ENTRY(debug_vector)
29699+ DBG_FAULT(29)
29700+ FAULT(29)
29701+END(debug_vector)
29702+
29703+ .org ia64_ivt+0x5a00
29704+/////////////////////////////////////////////////////////////////////////////////////////
29705+// 0x5a00 Entry 30 (size 16 bundles) Unaligned Reference (57)
29706+ENTRY(unaligned_access)
29707+ DBG_FAULT(30)
29708+ mov r31=pr // prepare to save predicates
29709+ ;;
29710+ br.sptk.many dispatch_unaligned_handler
29711+END(unaligned_access)
29712+
29713+ .org ia64_ivt+0x5b00
29714+/////////////////////////////////////////////////////////////////////////////////////////
29715+// 0x5b00 Entry 31 (size 16 bundles) Unsupported Data Reference (57)
29716+ENTRY(unsupported_data_reference)
29717+ DBG_FAULT(31)
29718+ FAULT(31)
29719+END(unsupported_data_reference)
29720+
29721+ .org ia64_ivt+0x5c00
29722+/////////////////////////////////////////////////////////////////////////////////////////
29723+// 0x5c00 Entry 32 (size 16 bundles) Floating-Point Fault (64)
29724+ENTRY(floating_point_fault)
29725+ DBG_FAULT(32)
29726+ FAULT(32)
29727+END(floating_point_fault)
29728+
29729+ .org ia64_ivt+0x5d00
29730+/////////////////////////////////////////////////////////////////////////////////////////
29731+// 0x5d00 Entry 33 (size 16 bundles) Floating Point Trap (66)
29732+ENTRY(floating_point_trap)
29733+ DBG_FAULT(33)
29734+ FAULT(33)
29735+END(floating_point_trap)
29736+
29737+ .org ia64_ivt+0x5e00
29738+/////////////////////////////////////////////////////////////////////////////////////////
29739+// 0x5e00 Entry 34 (size 16 bundles) Lower Privilege Transfer Trap (66)
29740+ENTRY(lower_privilege_trap)
29741+ DBG_FAULT(34)
29742+ FAULT(34)
29743+END(lower_privilege_trap)
29744+
29745+ .org ia64_ivt+0x5f00
29746+/////////////////////////////////////////////////////////////////////////////////////////
29747+// 0x5f00 Entry 35 (size 16 bundles) Taken Branch Trap (68)
29748+ENTRY(taken_branch_trap)
29749+ DBG_FAULT(35)
29750+ FAULT(35)
29751+END(taken_branch_trap)
29752+
29753+ .org ia64_ivt+0x6000
29754+/////////////////////////////////////////////////////////////////////////////////////////
29755+// 0x6000 Entry 36 (size 16 bundles) Single Step Trap (69)
29756+ENTRY(single_step_trap)
29757+ DBG_FAULT(36)
29758+ FAULT(36)
29759+END(single_step_trap)
29760+
29761+ .org ia64_ivt+0x6100
29762+/////////////////////////////////////////////////////////////////////////////////////////
29763+// 0x6100 Entry 37 (size 16 bundles) Reserved
29764+ DBG_FAULT(37)
29765+ FAULT(37)
29766+
29767+ .org ia64_ivt+0x6200
29768+/////////////////////////////////////////////////////////////////////////////////////////
29769+// 0x6200 Entry 38 (size 16 bundles) Reserved
29770+ DBG_FAULT(38)
29771+ FAULT(38)
29772+
29773+ .org ia64_ivt+0x6300
29774+/////////////////////////////////////////////////////////////////////////////////////////
29775+// 0x6300 Entry 39 (size 16 bundles) Reserved
29776+ DBG_FAULT(39)
29777+ FAULT(39)
29778+
29779+ .org ia64_ivt+0x6400
29780+/////////////////////////////////////////////////////////////////////////////////////////
29781+// 0x6400 Entry 40 (size 16 bundles) Reserved
29782+ DBG_FAULT(40)
29783+ FAULT(40)
29784+
29785+ .org ia64_ivt+0x6500
29786+/////////////////////////////////////////////////////////////////////////////////////////
29787+// 0x6500 Entry 41 (size 16 bundles) Reserved
29788+ DBG_FAULT(41)
29789+ FAULT(41)
29790+
29791+ .org ia64_ivt+0x6600
29792+/////////////////////////////////////////////////////////////////////////////////////////
29793+// 0x6600 Entry 42 (size 16 bundles) Reserved
29794+ DBG_FAULT(42)
29795+ FAULT(42)
29796+
29797+ .org ia64_ivt+0x6700
29798+/////////////////////////////////////////////////////////////////////////////////////////
29799+// 0x6700 Entry 43 (size 16 bundles) Reserved
29800+ DBG_FAULT(43)
29801+ FAULT(43)
29802+
29803+ .org ia64_ivt+0x6800
29804+/////////////////////////////////////////////////////////////////////////////////////////
29805+// 0x6800 Entry 44 (size 16 bundles) Reserved
29806+ DBG_FAULT(44)
29807+ FAULT(44)
29808+
29809+ .org ia64_ivt+0x6900
29810+/////////////////////////////////////////////////////////////////////////////////////////
29811+// 0x6900 Entry 45 (size 16 bundles) IA-32 Exception (17,18,29,41,42,43,44,58,60,61,62,72,73,75,76,77)
29812+ENTRY(ia32_exception)
29813+ DBG_FAULT(45)
29814+ FAULT(45)
29815+END(ia32_exception)
29816+
29817+ .org ia64_ivt+0x6a00
29818+/////////////////////////////////////////////////////////////////////////////////////////
29819+// 0x6a00 Entry 46 (size 16 bundles) IA-32 Intercept (30,31,59,70,71)
29820+ENTRY(ia32_intercept)
29821+ DBG_FAULT(46)
29822+#ifdef CONFIG_IA32_SUPPORT
29823+ mov r31=pr
29824+ mov r16=cr.isr
29825+ ;;
29826+ extr.u r17=r16,16,8 // get ISR.code
29827+ mov r18=ar.eflag
29828+ mov r19=cr.iim // old eflag value
29829+ ;;
29830+ cmp.ne p6,p0=2,r17
29831+(p6) br.cond.spnt 1f // not a system flag fault
29832+ xor r16=r18,r19
29833+ ;;
29834+ extr.u r17=r16,18,1 // get the eflags.ac bit
29835+ ;;
29836+ cmp.eq p6,p0=0,r17
29837+(p6) br.cond.spnt 1f // eflags.ac bit didn't change
29838+ ;;
29839+ mov pr=r31,-1 // restore predicate registers
29840+#ifdef CONFIG_XEN
29841+ XEN_HYPER_RFI;
29842+#else
29843+ rfi
29844+#endif
29845+
29846+1:
29847+#endif // CONFIG_IA32_SUPPORT
29848+ FAULT(46)
29849+END(ia32_intercept)
29850+
29851+ .org ia64_ivt+0x6b00
29852+/////////////////////////////////////////////////////////////////////////////////////////
29853+// 0x6b00 Entry 47 (size 16 bundles) IA-32 Interrupt (74)
29854+ENTRY(ia32_interrupt)
29855+ DBG_FAULT(47)
29856+#ifdef CONFIG_IA32_SUPPORT
29857+ mov r31=pr
29858+ br.sptk.many dispatch_to_ia32_handler
29859+#else
29860+ FAULT(47)
29861+#endif
29862+END(ia32_interrupt)
29863+
29864+ .org ia64_ivt+0x6c00
29865+/////////////////////////////////////////////////////////////////////////////////////////
29866+// 0x6c00 Entry 48 (size 16 bundles) Reserved
29867+ DBG_FAULT(48)
29868+ FAULT(48)
29869+
29870+ .org ia64_ivt+0x6d00
29871+/////////////////////////////////////////////////////////////////////////////////////////
29872+// 0x6d00 Entry 49 (size 16 bundles) Reserved
29873+ DBG_FAULT(49)
29874+ FAULT(49)
29875+
29876+ .org ia64_ivt+0x6e00
29877+/////////////////////////////////////////////////////////////////////////////////////////
29878+// 0x6e00 Entry 50 (size 16 bundles) Reserved
29879+ DBG_FAULT(50)
29880+ FAULT(50)
29881+
29882+ .org ia64_ivt+0x6f00
29883+/////////////////////////////////////////////////////////////////////////////////////////
29884+// 0x6f00 Entry 51 (size 16 bundles) Reserved
29885+ DBG_FAULT(51)
29886+ FAULT(51)
29887+
29888+ .org ia64_ivt+0x7000
29889+/////////////////////////////////////////////////////////////////////////////////////////
29890+// 0x7000 Entry 52 (size 16 bundles) Reserved
29891+ DBG_FAULT(52)
29892+ FAULT(52)
29893+
29894+ .org ia64_ivt+0x7100
29895+/////////////////////////////////////////////////////////////////////////////////////////
29896+// 0x7100 Entry 53 (size 16 bundles) Reserved
29897+ DBG_FAULT(53)
29898+ FAULT(53)
29899+
29900+ .org ia64_ivt+0x7200
29901+/////////////////////////////////////////////////////////////////////////////////////////
29902+// 0x7200 Entry 54 (size 16 bundles) Reserved
29903+ DBG_FAULT(54)
29904+ FAULT(54)
29905+
29906+ .org ia64_ivt+0x7300
29907+/////////////////////////////////////////////////////////////////////////////////////////
29908+// 0x7300 Entry 55 (size 16 bundles) Reserved
29909+ DBG_FAULT(55)
29910+ FAULT(55)
29911+
29912+ .org ia64_ivt+0x7400
29913+/////////////////////////////////////////////////////////////////////////////////////////
29914+// 0x7400 Entry 56 (size 16 bundles) Reserved
29915+ DBG_FAULT(56)
29916+ FAULT(56)
29917+
29918+ .org ia64_ivt+0x7500
29919+/////////////////////////////////////////////////////////////////////////////////////////
29920+// 0x7500 Entry 57 (size 16 bundles) Reserved
29921+ DBG_FAULT(57)
29922+ FAULT(57)
29923+
29924+ .org ia64_ivt+0x7600
29925+/////////////////////////////////////////////////////////////////////////////////////////
29926+// 0x7600 Entry 58 (size 16 bundles) Reserved
29927+ DBG_FAULT(58)
29928+ FAULT(58)
29929+
29930+ .org ia64_ivt+0x7700
29931+/////////////////////////////////////////////////////////////////////////////////////////
29932+// 0x7700 Entry 59 (size 16 bundles) Reserved
29933+ DBG_FAULT(59)
29934+ FAULT(59)
29935+
29936+ .org ia64_ivt+0x7800
29937+/////////////////////////////////////////////////////////////////////////////////////////
29938+// 0x7800 Entry 60 (size 16 bundles) Reserved
29939+ DBG_FAULT(60)
29940+ FAULT(60)
29941+
29942+ .org ia64_ivt+0x7900
29943+/////////////////////////////////////////////////////////////////////////////////////////
29944+// 0x7900 Entry 61 (size 16 bundles) Reserved
29945+ DBG_FAULT(61)
29946+ FAULT(61)
29947+
29948+ .org ia64_ivt+0x7a00
29949+/////////////////////////////////////////////////////////////////////////////////////////
29950+// 0x7a00 Entry 62 (size 16 bundles) Reserved
29951+ DBG_FAULT(62)
29952+ FAULT(62)
29953+
29954+ .org ia64_ivt+0x7b00
29955+/////////////////////////////////////////////////////////////////////////////////////////
29956+// 0x7b00 Entry 63 (size 16 bundles) Reserved
29957+ DBG_FAULT(63)
29958+ FAULT(63)
29959+
29960+ .org ia64_ivt+0x7c00
29961+/////////////////////////////////////////////////////////////////////////////////////////
29962+// 0x7c00 Entry 64 (size 16 bundles) Reserved
29963+ DBG_FAULT(64)
29964+ FAULT(64)
29965+
29966+ .org ia64_ivt+0x7d00
29967+/////////////////////////////////////////////////////////////////////////////////////////
29968+// 0x7d00 Entry 65 (size 16 bundles) Reserved
29969+ DBG_FAULT(65)
29970+ FAULT(65)
29971+
29972+ .org ia64_ivt+0x7e00
29973+/////////////////////////////////////////////////////////////////////////////////////////
29974+// 0x7e00 Entry 66 (size 16 bundles) Reserved
29975+ DBG_FAULT(66)
29976+ FAULT(66)
29977+
29978+#ifdef CONFIG_XEN
29979+ /*
29980+ * There is no particular reason for this code to be here, other than that
29981+ * there happens to be space here that would go unused otherwise. If this
29982+ * fault ever gets "unreserved", simply move the following code to a more
29983+ * suitable spot...
29984+ */
29985+
29986+GLOBAL_ENTRY(xen_bsw1)
29987+ /* FIXME: THIS CODE IS NOT NaT SAFE! */
29988+ movl r30=XSI_BANKNUM;
29989+ mov r31=1;;
29990+ st4 [r30]=r31;
29991+ movl r30=XSI_BANK1_R16;
29992+ movl r31=XSI_BANK1_R16+8;;
29993+ ld8 r16=[r30],16; ld8 r17=[r31],16;;
29994+ ld8 r18=[r30],16; ld8 r19=[r31],16;;
29995+ ld8 r20=[r30],16; ld8 r21=[r31],16;;
29996+ ld8 r22=[r30],16; ld8 r23=[r31],16;;
29997+ ld8 r24=[r30],16; ld8 r25=[r31],16;;
29998+ ld8 r26=[r30],16; ld8 r27=[r31],16;;
29999+ ld8 r28=[r30],16; ld8 r29=[r31],16;;
30000+ ld8 r30=[r30]; ld8 r31=[r31];;
30001+ br.ret.sptk.many b0
30002+END(xen_bsw1)
30003+#endif
30004+
30005+ .org ia64_ivt+0x7f00
30006+/////////////////////////////////////////////////////////////////////////////////////////
30007+// 0x7f00 Entry 67 (size 16 bundles) Reserved
30008+ DBG_FAULT(67)
30009+ FAULT(67)
30010+
30011+#ifdef CONFIG_IA32_SUPPORT
30012+
30013+ /*
30014+ * There is no particular reason for this code to be here, other than that
30015+ * there happens to be space here that would go unused otherwise. If this
30016+ * fault ever gets "unreserved", simply move the following code to a more
30017+ * suitable spot...
30018+ */
30019+
30020+ // IA32 interrupt entry point
30021+
30022+ENTRY(dispatch_to_ia32_handler)
30023+ SAVE_MIN
30024+ ;;
30025+ mov r14=cr.isr
30026+ ssm psr.ic | PSR_DEFAULT_BITS
30027+ ;;
30028+ srlz.i // guarantee that interruption collection is on
30029+ ;;
30030+(p15) ssm psr.i
30031+ adds r3=8,r2 // Base pointer for SAVE_REST
30032+ ;;
30033+ SAVE_REST
30034+ ;;
30035+ mov r15=0x80
30036+ shr r14=r14,16 // Get interrupt number
30037+ ;;
30038+ cmp.ne p6,p0=r14,r15
30039+(p6) br.call.dpnt.many b6=non_ia32_syscall
30040+
30041+ adds r14=IA64_PT_REGS_R8_OFFSET + 16,sp // 16 byte hole per SW conventions
30042+ adds r15=IA64_PT_REGS_R1_OFFSET + 16,sp
30043+ ;;
30044+ cmp.eq pSys,pNonSys=r0,r0 // set pSys=1, pNonSys=0
30045+ ld8 r8=[r14] // get r8
30046+ ;;
30047+ st8 [r15]=r8 // save original EAX in r1 (IA32 procs don't use the GP)
30048+ ;;
30049+ alloc r15=ar.pfs,0,0,6,0 // must be first in an insn group
30050+ ;;
30051+ ld4 r8=[r14],8 // r8 == eax (syscall number)
30052+ mov r15=IA32_NR_syscalls
30053+ ;;
30054+ cmp.ltu.unc p6,p7=r8,r15
30055+ ld4 out1=[r14],8 // r9 == ecx
30056+ ;;
30057+ ld4 out2=[r14],8 // r10 == edx
30058+ ;;
30059+ ld4 out0=[r14] // r11 == ebx
30060+ adds r14=(IA64_PT_REGS_R13_OFFSET) + 16,sp
30061+ ;;
30062+ ld4 out5=[r14],PT(R14)-PT(R13) // r13 == ebp
30063+ ;;
30064+ ld4 out3=[r14],PT(R15)-PT(R14) // r14 == esi
30065+ adds r2=TI_FLAGS+IA64_TASK_SIZE,r13
30066+ ;;
30067+ ld4 out4=[r14] // r15 == edi
30068+ movl r16=ia32_syscall_table
30069+ ;;
30070+(p6) shladd r16=r8,3,r16 // force ni_syscall if not valid syscall number
30071+ ld4 r2=[r2] // r2 = current_thread_info()->flags
30072+ ;;
30073+ ld8 r16=[r16]
30074+ and r2=_TIF_SYSCALL_TRACEAUDIT,r2 // mask trace or audit
30075+ ;;
30076+ mov b6=r16
30077+ movl r15=ia32_ret_from_syscall
30078+ cmp.eq p8,p0=r2,r0
30079+ ;;
30080+ mov rp=r15
30081+(p8) br.call.sptk.many b6=b6
30082+ br.cond.sptk ia32_trace_syscall
30083+
30084+non_ia32_syscall:
30085+ alloc r15=ar.pfs,0,0,2,0
30086+ mov out0=r14 // interrupt #
30087+ add out1=16,sp // pointer to pt_regs
30088+ ;; // avoid WAW on CFM
30089+ br.call.sptk.many rp=ia32_bad_interrupt
30090+.ret1: movl r15=ia64_leave_kernel
30091+ ;;
30092+ mov rp=r15
30093+ br.ret.sptk.many rp
30094+END(dispatch_to_ia32_handler)
30095+#endif /* CONFIG_IA32_SUPPORT */
30096+
30097+#ifdef CONFIG_XEN
30098+ .section .text,"ax"
30099+GLOBAL_ENTRY(xen_event_callback)
30100+ mov r31=pr // prepare to save predicates
30101+ ;;
30102+ SAVE_MIN_WITH_COVER // uses r31; defines r2 and r3
30103+ ;;
30104+ movl r3=XSI_PSR_IC
30105+ mov r14=1
30106+ ;;
30107+ st4 [r3]=r14
30108+ ;;
30109+ adds r3=8,r2 // set up second base pointer for SAVE_REST
30110+ srlz.i // ensure everybody knows psr.ic is back on
30111+ ;;
30112+ SAVE_REST
30113+ ;;
30114+1:
30115+ alloc r14=ar.pfs,0,0,1,0 // must be first in an insn group
30116+ add out0=16,sp // pass pointer to pt_regs as first arg
30117+ ;;
30118+ br.call.sptk.many b0=evtchn_do_upcall
30119+ ;;
30120+ movl r20=XSI_PSR_I_ADDR
30121+ ;;
30122+ ld8 r20=[r20]
30123+ ;;
30124+ adds r20=-1,r20 // vcpu_info->evtchn_upcall_pending
30125+ ;;
30126+ ld1 r20=[r20]
30127+ ;;
30128+ cmp.ne p6,p0=r20,r0 // if there are pending events,
30129+ (p6) br.spnt.few 1b // call evtchn_do_upcall again.
30130+ br.sptk.many ia64_leave_kernel
30131+END(xen_event_callback)
30132+#endif
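xen_event_callback above keeps calling evtchn_do_upcall until the pending flag it re-reads from the vcpu_info area is clear, so an event that arrives while an earlier one is being handled is not lost. The loop restated as a self-contained C sketch; the struct is a stand-in for the real vcpu_info, evtchn_do_upcall is the same function the assembly calls:

struct vcpu_info_sketch {
	unsigned char evtchn_upcall_pending;
};

struct pt_regs;						/* opaque for the sketch */
extern void evtchn_do_upcall(struct pt_regs *regs);	/* same call the assembly makes */

static void xen_event_callback_sketch(struct vcpu_info_sketch *v, struct pt_regs *regs)
{
	do {
		evtchn_do_upcall(regs);			/* br.call.sptk.many b0=evtchn_do_upcall */
	} while (v->evtchn_upcall_pending);		/* "(p6) br.spnt.few 1b" */
}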
30133diff -Nur linux-2.6.16.33-noxen/arch/ia64/xen/xenminstate.h linux-2.6.16.33/arch/ia64/xen/xenminstate.h
30134--- linux-2.6.16.33-noxen/arch/ia64/xen/xenminstate.h 1970-01-01 00:00:00.000000000 +0000
30135+++ linux-2.6.16.33/arch/ia64/xen/xenminstate.h 2007-01-08 15:00:45.000000000 +0000
30136@@ -0,0 +1,369 @@
30137+#include <linux/config.h>
30138+
30139+#include <asm/cache.h>
30140+
30141+#ifdef CONFIG_XEN
30142+#include "../kernel/entry.h"
30143+#else
30144+#include "entry.h"
30145+#endif
30146+
30147+/*
30148+ * For ivt.s we want to access the stack virtually so we don't have to disable translation
30149+ * on interrupts.
30150+ *
30151+ * On entry:
30152+ * r1: pointer to current task (ar.k6)
30153+ */
30154+#define MINSTATE_START_SAVE_MIN_VIRT \
30155+(pUStk) mov ar.rsc=0; /* set enforced lazy mode, pl 0, little-endian, loadrs=0 */ \
30156+ ;; \
30157+(pUStk) mov.m r24=ar.rnat; \
30158+(pUStk) addl r22=IA64_RBS_OFFSET,r1; /* compute base of RBS */ \
30159+(pKStk) mov r1=sp; /* get sp */ \
30160+ ;; \
30161+(pUStk) lfetch.fault.excl.nt1 [r22]; \
30162+(pUStk) addl r1=IA64_STK_OFFSET-IA64_PT_REGS_SIZE,r1; /* compute base of memory stack */ \
30163+(pUStk) mov r23=ar.bspstore; /* save ar.bspstore */ \
30164+ ;; \
30165+(pUStk) mov ar.bspstore=r22; /* switch to kernel RBS */ \
30166+(pKStk) addl r1=-IA64_PT_REGS_SIZE,r1; /* if in kernel mode, use sp (r12) */ \
30167+ ;; \
30168+(pUStk) mov r18=ar.bsp; \
30169+(pUStk) mov ar.rsc=0x3; /* set eager mode, pl 0, little-endian, loadrs=0 */ \
30170+
30171+#define MINSTATE_END_SAVE_MIN_VIRT \
30172+ bsw.1; /* switch back to bank 1 (must be last in insn group) */ \
30173+ ;;
30174+
30175+/*
30176+ * For mca_asm.S we want to access the stack physically since the state is saved before we
30177+ * go virtual and don't want to destroy the iip or ipsr.
30178+ */
30179+#define MINSTATE_START_SAVE_MIN_PHYS \
30180+(pKStk) mov r3=IA64_KR(PER_CPU_DATA);; \
30181+(pKStk) addl r3=THIS_CPU(ia64_mca_data),r3;; \
30182+(pKStk) ld8 r3 = [r3];; \
30183+(pKStk) addl r3=IA64_MCA_CPU_INIT_STACK_OFFSET,r3;; \
30184+(pKStk) addl sp=IA64_STK_OFFSET-IA64_PT_REGS_SIZE,r3; \
30185+(pUStk) mov ar.rsc=0; /* set enforced lazy mode, pl 0, little-endian, loadrs=0 */ \
30186+(pUStk) addl r22=IA64_RBS_OFFSET,r1; /* compute base of register backing store */ \
30187+ ;; \
30188+(pUStk) mov r24=ar.rnat; \
30189+(pUStk) addl r1=IA64_STK_OFFSET-IA64_PT_REGS_SIZE,r1; /* compute base of memory stack */ \
30190+(pUStk) mov r23=ar.bspstore; /* save ar.bspstore */ \
30191+(pUStk) dep r22=-1,r22,61,3; /* compute kernel virtual addr of RBS */ \
30192+ ;; \
30193+(pKStk) addl r1=-IA64_PT_REGS_SIZE,r1; /* if in kernel mode, use sp (r12) */ \
30194+(pUStk) mov ar.bspstore=r22; /* switch to kernel RBS */ \
30195+ ;; \
30196+(pUStk) mov r18=ar.bsp; \
30197+(pUStk) mov ar.rsc=0x3; /* set eager mode, pl 0, little-endian, loadrs=0 */ \
30198+
30199+#define MINSTATE_END_SAVE_MIN_PHYS \
30200+ dep r12=-1,r12,61,3; /* make sp a kernel virtual address */ \
30201+ ;;
30202+
30203+#ifdef MINSTATE_VIRT
30204+# define MINSTATE_GET_CURRENT(reg) mov reg=IA64_KR(CURRENT)
30205+# define MINSTATE_START_SAVE_MIN MINSTATE_START_SAVE_MIN_VIRT
30206+# define MINSTATE_END_SAVE_MIN MINSTATE_END_SAVE_MIN_VIRT
30207+#endif
30208+
30209+#ifdef MINSTATE_PHYS
30210+# define MINSTATE_GET_CURRENT(reg) mov reg=IA64_KR(CURRENT);; tpa reg=reg
30211+# define MINSTATE_START_SAVE_MIN MINSTATE_START_SAVE_MIN_PHYS
30212+# define MINSTATE_END_SAVE_MIN MINSTATE_END_SAVE_MIN_PHYS
30213+#endif
30214+
30215+/*
30216+ * DO_SAVE_MIN switches to the kernel stacks (if necessary) and saves
30217+ * the minimum state necessary that allows us to turn psr.ic back
30218+ * on.
30219+ *
30220+ * Assumed state upon entry:
30221+ * psr.ic: off
30222+ * r31: contains saved predicates (pr)
30223+ *
30224+ * Upon exit, the state is as follows:
30225+ * psr.ic: off
30226+ * r2 = points to &pt_regs.r16
30227+ * r8 = contents of ar.ccv
30228+ * r9 = contents of ar.csd
30229+ * r10 = contents of ar.ssd
30230+ * r11 = FPSR_DEFAULT
30231+ * r12 = kernel sp (kernel virtual address)
30232+ * r13 = points to current task_struct (kernel virtual address)
30233+ * p15 = TRUE if psr.i is set in cr.ipsr
30234+ * predicate registers (other than p2, p3, and p15), b6, r3, r14, r15:
30235+ * preserved
30236+ * CONFIG_XEN note: p6/p7 are not preserved
30237+ *
30238+ * Note that psr.ic is NOT turned on by this macro. This is so that
30239+ * we can pass interruption state as arguments to a handler.
30240+ */
30241+#ifdef CONFIG_XEN
30242+#define DO_SAVE_MIN(COVER,SAVE_IFS,EXTRA) \
30243+ MINSTATE_GET_CURRENT(r16); /* M (or M;;I) */ \
30244+ mov r27=ar.rsc; /* M */ \
30245+ mov r20=r1; /* A */ \
30246+ mov r25=ar.unat; /* M */ \
30247+ /* mov r29=cr.ipsr; /* M */ \
30248+ movl r29=XSI_IPSR;; \
30249+ ld8 r29=[r29];; \
30250+ mov r26=ar.pfs; /* I */ \
30251+ /* mov r28=cr.iip; /* M */ \
30252+ movl r28=XSI_IIP;; \
30253+ ld8 r28=[r28];; \
30254+ mov r21=ar.fpsr; /* M */ \
30255+ COVER; /* B;; (or nothing) */ \
30256+ ;; \
30257+ adds r16=IA64_TASK_THREAD_ON_USTACK_OFFSET,r16; \
30258+ ;; \
30259+ ld1 r17=[r16]; /* load current->thread.on_ustack flag */ \
30260+ st1 [r16]=r0; /* clear current->thread.on_ustack flag */ \
30261+ adds r1=-IA64_TASK_THREAD_ON_USTACK_OFFSET,r16 \
30262+ /* switch from user to kernel RBS: */ \
30263+ ;; \
30264+ invala; /* M */ \
30265+ /* SAVE_IFS; /* see xen special handling below */ \
30266+ cmp.eq pKStk,pUStk=r0,r17; /* are we in kernel mode already? */ \
30267+ ;; \
30268+ MINSTATE_START_SAVE_MIN \
30269+ adds r17=2*L1_CACHE_BYTES,r1; /* really: biggest cache-line size */ \
30270+ adds r16=PT(CR_IPSR),r1; \
30271+ ;; \
30272+ lfetch.fault.excl.nt1 [r17],L1_CACHE_BYTES; \
30273+ st8 [r16]=r29; /* save cr.ipsr */ \
30274+ ;; \
30275+ lfetch.fault.excl.nt1 [r17]; \
30276+ tbit.nz p15,p0=r29,IA64_PSR_I_BIT; \
30277+ mov r29=b0 \
30278+ ;; \
30279+ adds r16=PT(R8),r1; /* initialize first base pointer */ \
30280+ adds r17=PT(R9),r1; /* initialize second base pointer */ \
30281+(pKStk) mov r18=r0; /* make sure r18 isn't NaT */ \
30282+ ;; \
30283+.mem.offset 0,0; st8.spill [r16]=r8,16; \
30284+.mem.offset 8,0; st8.spill [r17]=r9,16; \
30285+ ;; \
30286+.mem.offset 0,0; st8.spill [r16]=r10,24; \
30287+.mem.offset 8,0; st8.spill [r17]=r11,24; \
30288+ ;; \
30289+ /* xen special handling for possibly lazy cover */ \
30290+ movl r8=XSI_INCOMPL_REGFR; \
30291+ ;; \
30292+ ld4 r30=[r8]; \
30293+ ;; \
30294+ /* set XSI_INCOMPL_REGFR 0 */ \
30295+ st4 [r8]=r0; \
30296+ cmp.eq p6,p7=r30,r0; \
30297+ ;; /* not sure if this stop bit is necessary */ \
30298+(p6) adds r8=XSI_PRECOVER_IFS-XSI_INCOMPL_REGFR,r8; \
30299+(p7) adds r8=XSI_IFS-XSI_INCOMPL_REGFR,r8; \
30300+ ;; \
30301+ ld8 r30=[r8]; \
30302+ ;; \
30303+ st8 [r16]=r28,16; /* save cr.iip */ \
30304+ st8 [r17]=r30,16; /* save cr.ifs */ \
30305+(pUStk) sub r18=r18,r22; /* r18=RSE.ndirty*8 */ \
30306+ mov r8=ar.ccv; \
30307+ mov r9=ar.csd; \
30308+ mov r10=ar.ssd; \
30309+ movl r11=FPSR_DEFAULT; /* L-unit */ \
30310+ ;; \
30311+ st8 [r16]=r25,16; /* save ar.unat */ \
30312+ st8 [r17]=r26,16; /* save ar.pfs */ \
30313+ shl r18=r18,16; /* compute ar.rsc to be used for "loadrs" */ \
30314+ ;; \
30315+ st8 [r16]=r27,16; /* save ar.rsc */ \
30316+(pUStk) st8 [r17]=r24,16; /* save ar.rnat */ \
30317+(pKStk) adds r17=16,r17; /* skip over ar_rnat field */ \
30318+ ;; /* avoid RAW on r16 & r17 */ \
30319+(pUStk) st8 [r16]=r23,16; /* save ar.bspstore */ \
30320+ st8 [r17]=r31,16; /* save predicates */ \
30321+(pKStk) adds r16=16,r16; /* skip over ar_bspstore field */ \
30322+ ;; \
30323+ st8 [r16]=r29,16; /* save b0 */ \
30324+ st8 [r17]=r18,16; /* save ar.rsc value for "loadrs" */ \
30325+ cmp.eq pNonSys,pSys=r0,r0 /* initialize pSys=0, pNonSys=1 */ \
30326+ ;; \
30327+.mem.offset 0,0; st8.spill [r16]=r20,16; /* save original r1 */ \
30328+.mem.offset 8,0; st8.spill [r17]=r12,16; \
30329+ adds r12=-16,r1; /* switch to kernel memory stack (with 16 bytes of scratch) */ \
30330+ ;; \
30331+.mem.offset 0,0; st8.spill [r16]=r13,16; \
30332+.mem.offset 8,0; st8.spill [r17]=r21,16; /* save ar.fpsr */ \
30333+ mov r13=IA64_KR(CURRENT); /* establish `current' */ \
30334+ ;; \
30335+.mem.offset 0,0; st8.spill [r16]=r15,16; \
30336+.mem.offset 8,0; st8.spill [r17]=r14,16; \
30337+ ;; \
30338+.mem.offset 0,0; st8.spill [r16]=r2,16; \
30339+.mem.offset 8,0; st8.spill [r17]=r3,16; \
30340+ ;; \
30341+ EXTRA; \
30342+ mov r2=b0; br.call.sptk b0=xen_bsw1;; mov b0=r2; \
30343+ adds r2=IA64_PT_REGS_R16_OFFSET,r1; \
30344+ ;; \
30345+ movl r1=__gp; /* establish kernel global pointer */ \
30346+ ;; \
30347+ /* MINSTATE_END_SAVE_MIN */
30348+#else
30349+#define DO_SAVE_MIN(COVER,SAVE_IFS,EXTRA) \
30350+ MINSTATE_GET_CURRENT(r16); /* M (or M;;I) */ \
30351+ mov r27=ar.rsc; /* M */ \
30352+ mov r20=r1; /* A */ \
30353+ mov r25=ar.unat; /* M */ \
30354+ mov r29=cr.ipsr; /* M */ \
30355+ mov r26=ar.pfs; /* I */ \
30356+ mov r28=cr.iip; /* M */ \
30357+ mov r21=ar.fpsr; /* M */ \
30358+ COVER; /* B;; (or nothing) */ \
30359+ ;; \
30360+ adds r16=IA64_TASK_THREAD_ON_USTACK_OFFSET,r16; \
30361+ ;; \
30362+ ld1 r17=[r16]; /* load current->thread.on_ustack flag */ \
30363+ st1 [r16]=r0; /* clear current->thread.on_ustack flag */ \
30364+ adds r1=-IA64_TASK_THREAD_ON_USTACK_OFFSET,r16 \
30365+ /* switch from user to kernel RBS: */ \
30366+ ;; \
30367+ invala; /* M */ \
30368+ SAVE_IFS; \
30369+ cmp.eq pKStk,pUStk=r0,r17; /* are we in kernel mode already? */ \
30370+ ;; \
30371+ MINSTATE_START_SAVE_MIN \
30372+ adds r17=2*L1_CACHE_BYTES,r1; /* really: biggest cache-line size */ \
30373+ adds r16=PT(CR_IPSR),r1; \
30374+ ;; \
30375+ lfetch.fault.excl.nt1 [r17],L1_CACHE_BYTES; \
30376+ st8 [r16]=r29; /* save cr.ipsr */ \
30377+ ;; \
30378+ lfetch.fault.excl.nt1 [r17]; \
30379+ tbit.nz p15,p0=r29,IA64_PSR_I_BIT; \
30380+ mov r29=b0 \
30381+ ;; \
30382+ adds r16=PT(R8),r1; /* initialize first base pointer */ \
30383+ adds r17=PT(R9),r1; /* initialize second base pointer */ \
30384+(pKStk) mov r18=r0; /* make sure r18 isn't NaT */ \
30385+ ;; \
30386+.mem.offset 0,0; st8.spill [r16]=r8,16; \
30387+.mem.offset 8,0; st8.spill [r17]=r9,16; \
30388+ ;; \
30389+.mem.offset 0,0; st8.spill [r16]=r10,24; \
30390+.mem.offset 8,0; st8.spill [r17]=r11,24; \
30391+ ;; \
30392+ st8 [r16]=r28,16; /* save cr.iip */ \
30393+ st8 [r17]=r30,16; /* save cr.ifs */ \
30394+(pUStk) sub r18=r18,r22; /* r18=RSE.ndirty*8 */ \
30395+ mov r8=ar.ccv; \
30396+ mov r9=ar.csd; \
30397+ mov r10=ar.ssd; \
30398+ movl r11=FPSR_DEFAULT; /* L-unit */ \
30399+ ;; \
30400+ st8 [r16]=r25,16; /* save ar.unat */ \
30401+ st8 [r17]=r26,16; /* save ar.pfs */ \
30402+ shl r18=r18,16; /* compute ar.rsc to be used for "loadrs" */ \
30403+ ;; \
30404+ st8 [r16]=r27,16; /* save ar.rsc */ \
30405+(pUStk) st8 [r17]=r24,16; /* save ar.rnat */ \
30406+(pKStk) adds r17=16,r17; /* skip over ar_rnat field */ \
30407+ ;; /* avoid RAW on r16 & r17 */ \
30408+(pUStk) st8 [r16]=r23,16; /* save ar.bspstore */ \
30409+ st8 [r17]=r31,16; /* save predicates */ \
30410+(pKStk) adds r16=16,r16; /* skip over ar_bspstore field */ \
30411+ ;; \
30412+ st8 [r16]=r29,16; /* save b0 */ \
30413+ st8 [r17]=r18,16; /* save ar.rsc value for "loadrs" */ \
30414+ cmp.eq pNonSys,pSys=r0,r0 /* initialize pSys=0, pNonSys=1 */ \
30415+ ;; \
30416+.mem.offset 0,0; st8.spill [r16]=r20,16; /* save original r1 */ \
30417+.mem.offset 8,0; st8.spill [r17]=r12,16; \
30418+ adds r12=-16,r1; /* switch to kernel memory stack (with 16 bytes of scratch) */ \
30419+ ;; \
30420+.mem.offset 0,0; st8.spill [r16]=r13,16; \
30421+.mem.offset 8,0; st8.spill [r17]=r21,16; /* save ar.fpsr */ \
30422+ mov r13=IA64_KR(CURRENT); /* establish `current' */ \
30423+ ;; \
30424+.mem.offset 0,0; st8.spill [r16]=r15,16; \
30425+.mem.offset 8,0; st8.spill [r17]=r14,16; \
30426+ ;; \
30427+.mem.offset 0,0; st8.spill [r16]=r2,16; \
30428+.mem.offset 8,0; st8.spill [r17]=r3,16; \
30429+ adds r2=IA64_PT_REGS_R16_OFFSET,r1; \
30430+ ;; \
30431+ EXTRA; \
30432+ movl r1=__gp; /* establish kernel global pointer */ \
30433+ ;; \
30434+ MINSTATE_END_SAVE_MIN
30435+#endif
30436+
30437+/*
30438+ * SAVE_REST saves the remainder of pt_regs (with psr.ic on).
30439+ *
30440+ * Assumed state upon entry:
30441+ * psr.ic: on
30442+ * r2: points to &pt_regs.r16
30443+ * r3: points to &pt_regs.r17
30444+ * r8: contents of ar.ccv
30445+ * r9: contents of ar.csd
30446+ * r10: contents of ar.ssd
30447+ * r11: FPSR_DEFAULT
30448+ *
30449+ * Registers r14 and r15 are guaranteed not to be touched by SAVE_REST.
30450+ */
30451+#define SAVE_REST \
30452+.mem.offset 0,0; st8.spill [r2]=r16,16; \
30453+.mem.offset 8,0; st8.spill [r3]=r17,16; \
30454+ ;; \
30455+.mem.offset 0,0; st8.spill [r2]=r18,16; \
30456+.mem.offset 8,0; st8.spill [r3]=r19,16; \
30457+ ;; \
30458+.mem.offset 0,0; st8.spill [r2]=r20,16; \
30459+.mem.offset 8,0; st8.spill [r3]=r21,16; \
30460+ mov r18=b6; \
30461+ ;; \
30462+.mem.offset 0,0; st8.spill [r2]=r22,16; \
30463+.mem.offset 8,0; st8.spill [r3]=r23,16; \
30464+ mov r19=b7; \
30465+ ;; \
30466+.mem.offset 0,0; st8.spill [r2]=r24,16; \
30467+.mem.offset 8,0; st8.spill [r3]=r25,16; \
30468+ ;; \
30469+.mem.offset 0,0; st8.spill [r2]=r26,16; \
30470+.mem.offset 8,0; st8.spill [r3]=r27,16; \
30471+ ;; \
30472+.mem.offset 0,0; st8.spill [r2]=r28,16; \
30473+.mem.offset 8,0; st8.spill [r3]=r29,16; \
30474+ ;; \
30475+.mem.offset 0,0; st8.spill [r2]=r30,16; \
30476+.mem.offset 8,0; st8.spill [r3]=r31,32; \
30477+ ;; \
30478+ mov ar.fpsr=r11; /* M-unit */ \
30479+ st8 [r2]=r8,8; /* ar.ccv */ \
30480+ adds r24=PT(B6)-PT(F7),r3; \
30481+ ;; \
30482+ stf.spill [r2]=f6,32; \
30483+ stf.spill [r3]=f7,32; \
30484+ ;; \
30485+ stf.spill [r2]=f8,32; \
30486+ stf.spill [r3]=f9,32; \
30487+ ;; \
30488+ stf.spill [r2]=f10; \
30489+ stf.spill [r3]=f11; \
30490+ adds r25=PT(B7)-PT(F11),r3; \
30491+ ;; \
30492+ st8 [r24]=r18,16; /* b6 */ \
30493+ st8 [r25]=r19,16; /* b7 */ \
30494+ ;; \
30495+ st8 [r24]=r9; /* ar.csd */ \
30496+ st8 [r25]=r10; /* ar.ssd */ \
30497+ ;;
30498+
30499+#define SAVE_MIN_WITH_COVER DO_SAVE_MIN(cover, mov r30=cr.ifs,)
30500+#define SAVE_MIN_WITH_COVER_R19 DO_SAVE_MIN(cover, mov r30=cr.ifs, mov r15=r19)
30501+#ifdef CONFIG_XEN
30502+#define SAVE_MIN break 0;; /* FIXME: non-cover version only for ia32 support? */
30503+#else
30504+#define SAVE_MIN DO_SAVE_MIN( , mov r30=r0, )
30505+#endif
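The Xen variant of DO_SAVE_MIN above cannot issue the privileged "mov r29=cr.ipsr" / "mov r28=cr.iip" (the commented-out lines show what it replaces), so it loads the interrupted state from the hypervisor-shared "XSI" area instead. A hedged C sketch of that substitution; XSI_IPSR is the address the macro uses (defined by this patch's Xen headers), while the helper and the pointer cast are illustrative only:

#include <asm/intrinsics.h>	/* ia64_getreg() */
#include <asm/ia64regs.h>	/* _IA64_REG_CR_IPSR */

static inline unsigned long read_interrupted_ipsr_sketch(void)
{
#ifdef CONFIG_XEN
	/* paravirtualized: ld8 from the shared-info area, as in the macro */
	return *(volatile unsigned long *)XSI_IPSR;
#else
	/* native: privileged read of the control register */
	return ia64_getreg(_IA64_REG_CR_IPSR);
#endif
}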
30506diff -Nur linux-2.6.16.33-noxen/arch/ia64/xen/xenpal.S linux-2.6.16.33/arch/ia64/xen/xenpal.S
30507--- linux-2.6.16.33-noxen/arch/ia64/xen/xenpal.S 1970-01-01 00:00:00.000000000 +0000
30508+++ linux-2.6.16.33/arch/ia64/xen/xenpal.S 2007-01-08 15:00:45.000000000 +0000
30509@@ -0,0 +1,76 @@
30510+/*
30511+ * ia64/xen/xenpal.S
30512+ *
30513+ * Alternate PAL routines for Xen. Heavily leveraged from
30514+ * ia64/kernel/pal.S
30515+ *
30516+ * Copyright (C) 2005 Hewlett-Packard Co
30517+ * Dan Magenheimer <dan.magenheimer@hp.com>
30518+ */
30519+
30520+#include <asm/asmmacro.h>
30521+#include <asm/processor.h>
30522+
30523+GLOBAL_ENTRY(xen_pal_call_static)
30524+ .prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(5)
30525+ alloc loc1 = ar.pfs,5,5,0,0
30526+#ifdef CONFIG_XEN
30527+ movl r22=running_on_xen;;
30528+ ld4 r22=[r22];;
30529+ cmp.eq p7,p0=r22,r0
30530+(p7) br.cond.spnt.many __ia64_pal_call_static;;
30531+#endif
30532+ movl loc2 = pal_entry_point
30533+1: {
30534+ mov r28 = in0
30535+ mov r29 = in1
30536+ mov r8 = ip
30537+ }
30538+ ;;
30539+ ld8 loc2 = [loc2] // loc2 <- entry point
30540+ tbit.nz p6,p7 = in4, 0
30541+ adds r8 = 1f-1b,r8
30542+ mov loc4=ar.rsc // save RSE configuration
30543+ ;;
30544+ mov ar.rsc=0 // put RSE in enforced lazy, LE mode
30545+ mov loc3 = psr
30546+ mov loc0 = rp
30547+ .body
30548+ mov r30 = in2
30549+
30550+#ifdef CONFIG_XEN
30551+ // this is low priority for paravirtualization, but is called
30552+ // from the idle loop so confuses privop counting
30553+ movl r31=XSI_PSR_IC
30554+ ;;
30555+(p6) st4 [r31]=r0
30556+ ;;
30557+(p7) adds r31=XSI_PSR_I_ADDR_OFS-XSI_PSR_IC_OFS,r31
30558+(p7) mov r22=1
30559+ ;;
30560+(p7) ld8 r31=[r31]
30561+ ;;
30562+(p7) st1 [r31]=r22
30563+ ;;
30564+ mov r31 = in3
30565+ mov b7 = loc2
30566+ ;;
30567+#else
30568+(p6) rsm psr.i | psr.ic
30569+ mov r31 = in3
30570+ mov b7 = loc2
30571+
30572+(p7) rsm psr.i
30573+ ;;
30574+(p6) srlz.i
30575+#endif
30576+ mov rp = r8
30577+ br.cond.sptk.many b7
30578+1: mov psr.l = loc3
30579+ mov ar.rsc = loc4 // restore RSE configuration
30580+ mov ar.pfs = loc1
30581+ mov rp = loc0
30582+ ;;
30583+ srlz.d // serialize restoration of psr.l
30584+ br.ret.sptk.many b0
30585+END(xen_pal_call_static)
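Before entering PAL, xen_pal_call_static cannot execute the privileged "rsm psr.i | psr.ic", so the CONFIG_XEN block above either clears the virtual psr.ic word (predicate p6) or sets the virtual interrupt-mask byte reached through XSI_PSR_I_ADDR (predicate p7). The same decision as a hedged C sketch; XSI_PSR_IC and XSI_PSR_I_ADDR are the addresses the assembly uses (from this patch's Xen headers), the helper itself is illustrative:

static void mask_for_pal_call_sketch(int need_psr_ic_off)
{
	if (need_psr_ic_off) {					/* p6: native code does rsm psr.i | psr.ic */
		*(volatile unsigned int *)XSI_PSR_IC = 0;	/* st4: vpsr.ic = 0 */
	} else {						/* p7: native code does rsm psr.i only */
		unsigned char *mask = *(unsigned char **)XSI_PSR_I_ADDR;
		*mask = 1;					/* st1: mask virtual interrupts */
	}
}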
30586diff -Nur linux-2.6.16.33-noxen/arch/ia64/xen/xensetup.S linux-2.6.16.33/arch/ia64/xen/xensetup.S
30587--- linux-2.6.16.33-noxen/arch/ia64/xen/xensetup.S 1970-01-01 00:00:00.000000000 +0000
30588+++ linux-2.6.16.33/arch/ia64/xen/xensetup.S 2007-01-08 15:00:45.000000000 +0000
30589@@ -0,0 +1,54 @@
30590+/*
30591+ * Support routines for Xen
30592+ *
30593+ * Copyright (C) 2005 Dan Magenheimer <dan.magenheimer@hp.com>
30594+ */
30595+
30596+#include <linux/config.h>
30597+#include <asm/processor.h>
30598+#include <asm/asmmacro.h>
30599+
30600+#define isBP p3 // are we the Bootstrap Processor?
30601+
30602+ .text
30603+GLOBAL_ENTRY(early_xen_setup)
30604+ mov r8=ar.rsc // Initialized in head.S
30605+(isBP) movl r9=running_on_xen;;
30606+ extr.u r8=r8,2,2;; // Extract pl fields
30607+ cmp.eq p7,p0=r8,r0 // p7: !running on xen
30608+ mov r8=1 // booleanize.
30609+(p7) br.ret.sptk.many rp;;
30610+(isBP) st4 [r9]=r8
30611+ movl r10=xen_ivt;;
30612+
30613+ mov cr.iva=r10
30614+
30615+ /* Set xsi base. */
30616+#define FW_HYPERCALL_SET_SHARED_INFO_VA 0x600
30617+(isBP) mov r2=FW_HYPERCALL_SET_SHARED_INFO_VA
30618+(isBP) movl r28=XSI_BASE;;
30619+(isBP) break 0x1000;;
30620+
30621+ br.ret.sptk.many rp
30622+ ;;
30623+END(early_xen_setup)
30624+
30625+#include <xen/interface/xen.h>
30626+
30627+/* Stub for suspend.
30628+ Just force the stacked registers to be written in memory. */
30629+GLOBAL_ENTRY(xencomm_arch_hypercall_suspend)
30630+ mov r15=r32
30631+ ;;
30632+ alloc r20=ar.pfs,0,0,0,0
30633+ mov r2=__HYPERVISOR_sched_op
30634+ ;;
30635+ /* We don't want to deal with RSE. */
30636+ flushrs
30637+ mov r14=2 // SCHEDOP_shutdown
30638+ ;;
30639+ break 0x1000
30640+ ;;
30641+ mov ar.pfs=r20
30642+ br.ret.sptk.many b0
30643+END(xencomm_arch_hypercall_suspend)
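The suspend stub above shows the ia64 Xen hypercall convention this patch relies on: the hypercall number goes in r2, arguments in r14/r15, and "break 0x1000" traps into the hypervisor, with the result returned in r8 by convention. A hedged sketch of the same call written with GCC register variables; the in-tree hypercall wrappers may use a different but equivalent inline-asm idiom, and the literal 2 is the SCHEDOP_shutdown value the assembly loads into r14:

#include <xen/interface/xen.h>		/* __HYPERVISOR_sched_op */

static inline long sched_op_hypercall_sketch(unsigned long cmd, unsigned long arg)
{
	register unsigned long __hcall asm("r2")  = __HYPERVISOR_sched_op;
	register unsigned long __arg1  asm("r14") = cmd;	/* e.g. 2 = SCHEDOP_shutdown */
	register unsigned long __arg2  asm("r15") = arg;
	register long          __res   asm("r8");

	asm volatile ("break 0x1000"
		      : "=r" (__res)
		      : "r" (__hcall), "r" (__arg1), "r" (__arg2)
		      : "memory");
	return __res;
}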
30644diff -Nur linux-2.6.16.33-noxen/arch/powerpc/kernel/machine_kexec_32.c linux-2.6.16.33/arch/powerpc/kernel/machine_kexec_32.c
30645--- linux-2.6.16.33-noxen/arch/powerpc/kernel/machine_kexec_32.c 2006-11-22 18:06:31.000000000 +0000
30646+++ linux-2.6.16.33/arch/powerpc/kernel/machine_kexec_32.c 2007-05-23 21:00:01.000000000 +0000
30647@@ -30,8 +30,8 @@
30648 */
30649 void default_machine_kexec(struct kimage *image)
30650 {
30651- const extern unsigned char relocate_new_kernel[];
30652- const extern unsigned int relocate_new_kernel_size;
30653+ extern const unsigned char relocate_new_kernel[];
30654+ extern const unsigned int relocate_new_kernel_size;
30655 unsigned long page_list;
30656 unsigned long reboot_code_buffer, reboot_code_buffer_phys;
30657 relocate_new_kernel_t rnk;
30658diff -Nur linux-2.6.16.33-noxen/arch/ppc/kernel/machine_kexec.c linux-2.6.16.33/arch/ppc/kernel/machine_kexec.c
30659--- linux-2.6.16.33-noxen/arch/ppc/kernel/machine_kexec.c 2006-11-22 18:06:31.000000000 +0000
30660+++ linux-2.6.16.33/arch/ppc/kernel/machine_kexec.c 2007-05-23 21:00:01.000000000 +0000
30661@@ -25,8 +25,8 @@
30662 unsigned long reboot_code_buffer,
30663 unsigned long start_address) ATTRIB_NORET;
30664
30665-const extern unsigned char relocate_new_kernel[];
30666-const extern unsigned int relocate_new_kernel_size;
30667+extern const unsigned char relocate_new_kernel[];
30668+extern const unsigned int relocate_new_kernel_size;
30669
30670 void machine_shutdown(void)
30671 {
30672diff -Nur linux-2.6.16.33-noxen/arch/s390/kernel/machine_kexec.c linux-2.6.16.33/arch/s390/kernel/machine_kexec.c
30673--- linux-2.6.16.33-noxen/arch/s390/kernel/machine_kexec.c 2006-11-22 18:06:31.000000000 +0000
30674+++ linux-2.6.16.33/arch/s390/kernel/machine_kexec.c 2007-05-23 21:00:01.000000000 +0000
30675@@ -27,8 +27,8 @@
30676
30677 typedef void (*relocate_kernel_t) (kimage_entry_t *, unsigned long);
30678
30679-const extern unsigned char relocate_kernel[];
30680-const extern unsigned long long relocate_kernel_len;
30681+extern const unsigned char relocate_kernel[];
30682+extern const unsigned long long relocate_kernel_len;
30683
30684 int
30685 machine_kexec_prepare(struct kimage *image)
30686diff -Nur linux-2.6.16.33-noxen/arch/sh/kernel/machine_kexec.c linux-2.6.16.33/arch/sh/kernel/machine_kexec.c
30687--- linux-2.6.16.33-noxen/arch/sh/kernel/machine_kexec.c 2006-11-22 18:06:31.000000000 +0000
30688+++ linux-2.6.16.33/arch/sh/kernel/machine_kexec.c 2007-05-23 21:00:01.000000000 +0000
30689@@ -25,8 +25,8 @@
30690 unsigned long start_address,
30691 unsigned long vbr_reg) ATTRIB_NORET;
30692
30693-const extern unsigned char relocate_new_kernel[];
30694-const extern unsigned int relocate_new_kernel_size;
30695+extern const unsigned char relocate_new_kernel[];
30696+extern const unsigned int relocate_new_kernel_size;
30697 extern void *gdb_vbr_vector;
30698
30699 /*
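The four kexec hunks above are the same one-line cleanup: "extern" is a storage-class specifier, and C deprecates writing it after a type qualifier, so newer GCCs warn about "const extern" (for instance via -Wold-style-declaration, which -Wextra enables). Both orderings declare the same object, shown here as a small C illustration:

/* accepted, but provokes a warning along the lines of
 * "'extern' is not at beginning of declaration" */
const extern unsigned char relocate_new_kernel[];

/* preferred: storage-class specifier first */
extern const unsigned char relocate_new_kernel[];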
30700diff -Nur linux-2.6.16.33-noxen/arch/um/kernel/physmem.c linux-2.6.16.33/arch/um/kernel/physmem.c
30701--- linux-2.6.16.33-noxen/arch/um/kernel/physmem.c 2006-11-22 18:06:31.000000000 +0000
30702+++ linux-2.6.16.33/arch/um/kernel/physmem.c 2007-01-08 15:00:45.000000000 +0000
30703@@ -225,7 +225,7 @@
30704 EXPORT_SYMBOL(physmem_remove_mapping);
30705 EXPORT_SYMBOL(physmem_subst_mapping);
30706
30707-void arch_free_page(struct page *page, int order)
30708+int arch_free_page(struct page *page, int order)
30709 {
30710 void *virt;
30711 int i;
30712@@ -234,6 +234,8 @@
30713 virt = __va(page_to_phys(page + i));
30714 physmem_remove_mapping(virt);
30715 }
30716+
30717+ return 0;
30718 }
30719
30720 int is_remapped(void *virt)
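The UML hunk above only adapts the arch_free_page() hook to the new prototype: elsewhere in this patch (not shown here) the page allocator is changed so that a non-zero return means "the architecture already dealt with this page", which Xen uses for pages handed back to the hypervisor; UML keeps its existing behaviour and reports 0, i.e. "not handled". A hedged sketch of the caller-side convention the prototype change implies; the caller below is illustrative, not the actual page_alloc.c hunk:

struct page;				/* opaque for the sketch */
extern int arch_free_page(struct page *page, int order);

static void free_pages_sketch(struct page *page, int order)
{
	if (arch_free_page(page, order))
		return;			/* non-zero: the arch hook consumed the page */

	/* ... otherwise fall through to the normal buddy-allocator free ... */
}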
30721diff -Nur linux-2.6.16.33-noxen/arch/x86_64/Kconfig linux-2.6.16.33/arch/x86_64/Kconfig
30722--- linux-2.6.16.33-noxen/arch/x86_64/Kconfig 2006-11-22 18:06:31.000000000 +0000
30723+++ linux-2.6.16.33/arch/x86_64/Kconfig 2007-01-08 15:00:45.000000000 +0000
30724@@ -119,6 +119,22 @@
30725
30726 endchoice
30727
30728+config X86_64_XEN
30729+ bool "Enable Xen compatible kernel"
30730+ select SWIOTLB
30731+ help
30732+ This option will compile a kernel compatible with the Xen hypervisor.
30733+
30734+config X86_NO_TSS
30735+ bool
30736+ depends on X86_64_XEN
30737+ default y
30738+
30739+config X86_NO_IDT
30740+ bool
30741+ depends on X86_64_XEN
30742+ default y
30743+
30744 #
30745 # Define implied options from the CPU selection here
30746 #
30747@@ -134,6 +150,7 @@
30748
30749 config X86_TSC
30750 bool
30751+ depends on !X86_64_XEN
30752 default y
30753
30754 config X86_GOOD_APIC
30755@@ -176,7 +193,7 @@
30756
30757 config X86_HT
30758 bool
30759- depends on SMP && !MK8
30760+ depends on SMP && !MK8 && !X86_64_XEN
30761 default y
30762
30763 config MATH_EMULATION
30764@@ -190,14 +207,22 @@
30765
30766 config X86_IO_APIC
30767 bool
30768+ depends !XEN_UNPRIVILEGED_GUEST
30769 default y
30770
30771+config X86_XEN_GENAPIC
30772+ bool
30773+ depends X86_64_XEN
30774+ default XEN_PRIVILEGED_GUEST || SMP
30775+
30776 config X86_LOCAL_APIC
30777 bool
30778+ depends !XEN_UNPRIVILEGED_GUEST
30779 default y
30780
30781 config MTRR
30782 bool "MTRR (Memory Type Range Register) support"
30783+ depends on !XEN_UNPRIVILEGED_GUEST
30784 ---help---
30785 On Intel P6 family processors (Pentium Pro, Pentium II and later)
30786 the Memory Type Range Registers (MTRRs) may be used to control
30787@@ -238,7 +263,7 @@
30788
30789 config SCHED_SMT
30790 bool "SMT (Hyperthreading) scheduler support"
30791- depends on SMP
30792+ depends on SMP && !X86_64_XEN
30793 default n
30794 help
30795 SMT scheduler support improves the CPU scheduler's decision making
30796@@ -250,7 +275,7 @@
30797
30798 config NUMA
30799 bool "Non Uniform Memory Access (NUMA) Support"
30800- depends on SMP
30801+ depends on SMP && !X86_64_XEN
30802 help
30803 Enable NUMA (Non Uniform Memory Access) support. The kernel
30804 will try to allocate memory used by a CPU on the local memory
30805@@ -305,7 +330,7 @@
30806
30807 config ARCH_SPARSEMEM_ENABLE
30808 def_bool y
30809- depends on (NUMA || EXPERIMENTAL)
30810+ depends on (NUMA || EXPERIMENTAL) && !X86_64_XEN
30811
30812 config ARCH_MEMORY_PROBE
30813 def_bool y
30814@@ -325,6 +350,7 @@
30815 int "Maximum number of CPUs (2-256)"
30816 range 2 256
30817 depends on SMP
30818+ default "16" if X86_64_XEN
30819 default "8"
30820 help
30821 This allows you to specify the maximum number of CPUs which this
30822@@ -347,6 +373,7 @@
30823
30824 config HPET_TIMER
30825 bool
30826+ depends on !X86_64_XEN
30827 default y
30828 help
30829 Use the IA-PC HPET (High Precision Event Timer) to manage
30830@@ -364,7 +391,7 @@
30831 bool "K8 GART IOMMU support"
30832 default y
30833 select SWIOTLB
30834- depends on PCI
30835+ depends on PCI && !X86_64_XEN
30836 help
30837 Support the IOMMU. Needed to run systems with more than 3GB of memory
30838 properly with 32-bit PCI devices that do not support DAC (Double Address
30839@@ -382,6 +409,7 @@
30840
30841 config X86_MCE
30842 bool "Machine check support" if EMBEDDED
30843+ depends on !X86_64_XEN
30844 default y
30845 help
30846 Include a machine check error handler to report hardware errors.
30847@@ -407,7 +435,7 @@
30848
30849 config KEXEC
30850 bool "kexec system call (EXPERIMENTAL)"
30851- depends on EXPERIMENTAL
30852+ depends on EXPERIMENTAL && !XEN_UNPRIVILEGED_GUEST
30853 help
30854 kexec is a system call that implements the ability to shutdown your
30855 current kernel, and to start another kernel. It is like a reboot
30856@@ -490,8 +518,11 @@
30857 default y
30858
30859 menu "Power management options"
30860+ depends on !XEN_UNPRIVILEGED_GUEST
30861
30862+if !X86_64_XEN
30863 source kernel/power/Kconfig
30864+endif
30865
30866 source "drivers/acpi/Kconfig"
30867
30868@@ -514,6 +545,21 @@
30869 bool "Support mmconfig PCI config space access"
30870 depends on PCI && ACPI
30871
30872+config XEN_PCIDEV_FRONTEND
30873+ bool "Xen PCI Frontend"
30874+ depends on PCI && X86_64_XEN
30875+ default y
30876+ help
30877+ The PCI device frontend driver allows the kernel to import arbitrary
30878+ PCI devices from a PCI backend to support PCI driver domains.
30879+
30880+config XEN_PCIDEV_FE_DEBUG
30881+ bool "Xen PCI Frontend Debugging"
30882+ depends on XEN_PCIDEV_FRONTEND
30883+ default n
30884+ help
30885+ Enables some debug statements within the PCI Frontend.
30886+
30887 config UNORDERED_IO
30888 bool "Unordered IO mapping access"
30889 depends on EXPERIMENTAL
30890@@ -594,4 +640,6 @@
30891
30892 source "crypto/Kconfig"
30893
30894+source "drivers/xen/Kconfig"
30895+
30896 source "lib/Kconfig"
30897diff -Nur linux-2.6.16.33-noxen/arch/x86_64/Makefile linux-2.6.16.33/arch/x86_64/Makefile
30898--- linux-2.6.16.33-noxen/arch/x86_64/Makefile 2006-11-22 18:06:31.000000000 +0000
30899+++ linux-2.6.16.33/arch/x86_64/Makefile 2007-01-08 15:00:45.000000000 +0000
30900@@ -31,6 +31,10 @@
30901 cflags-$(CONFIG_MPSC) += $(call cc-option,-march=nocona)
30902 CFLAGS += $(cflags-y)
30903
30904+cppflags-$(CONFIG_XEN) += \
30905+ -D__XEN_INTERFACE_VERSION__=$(CONFIG_XEN_INTERFACE_VERSION)
30906+CPPFLAGS += $(cppflags-y)
30907+
30908 CFLAGS += -m64
30909 CFLAGS += -mno-red-zone
30910 CFLAGS += -mcmodel=kernel
30911@@ -70,6 +74,21 @@
30912 .PHONY: bzImage bzlilo install archmrproper \
30913 fdimage fdimage144 fdimage288 archclean
30914
30915+ifdef CONFIG_XEN
30916+CPPFLAGS := -Iinclude$(if $(KBUILD_SRC),2)/asm/mach-xen $(CPPFLAGS)
30917+head-y := arch/x86_64/kernel/head-xen.o arch/x86_64/kernel/head64-xen.o arch/x86_64/kernel/init_task.o
30918+LDFLAGS_vmlinux := -e _start
30919+boot := arch/i386/boot-xen
30920+.PHONY: vmlinuz
30921+#Default target when executing "make"
30922+all: vmlinuz
30923+
30924+vmlinuz: vmlinux
30925+ $(Q)$(MAKE) $(build)=$(boot) $@
30926+
30927+install:
30928+ $(Q)$(MAKE) $(build)=$(boot) XENGUEST=$(XENGUEST) $@
30929+else
30930 #Default target when executing "make"
30931 all: bzImage
30932
30933@@ -90,6 +109,7 @@
30934
30935 install:
30936 $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(BOOTIMAGE) $@
30937+endif
30938
30939 archclean:
30940 $(Q)$(MAKE) $(clean)=$(boot)
30941diff -Nur linux-2.6.16.33-noxen/arch/x86_64/ia32/Makefile linux-2.6.16.33/arch/x86_64/ia32/Makefile
30942--- linux-2.6.16.33-noxen/arch/x86_64/ia32/Makefile 2006-11-22 18:06:31.000000000 +0000
30943+++ linux-2.6.16.33/arch/x86_64/ia32/Makefile 2007-01-08 15:00:45.000000000 +0000
30944@@ -23,9 +23,25 @@
30945 -Wl,-soname=linux-gate.so.1 -o $@ \
30946 -Wl,-T,$(filter-out FORCE,$^)
30947
30948+$(obj)/vsyscall-int80.so \
30949 $(obj)/vsyscall-sysenter.so $(obj)/vsyscall-syscall.so: \
30950 $(obj)/vsyscall-%.so: $(src)/vsyscall.lds $(obj)/vsyscall-%.o FORCE
30951 $(call if_changed,syscall)
30952
30953-AFLAGS_vsyscall-sysenter.o = -m32 -Wa,-32
30954-AFLAGS_vsyscall-syscall.o = -m32 -Wa,-32
30955+AFLAGS_vsyscall-sysenter.o = -m32 -Wa,-32 -Iarch/i386/kernel
30956+AFLAGS_vsyscall-syscall.o = -m32 -Wa,-32 -Iarch/i386/kernel
30957+
30958+ifdef CONFIG_XEN
30959+AFLAGS_vsyscall-int80.o = -m32 -Wa,-32 -Iarch/i386/kernel
30960+CFLAGS_syscall32-xen.o += -DUSE_INT80
30961+AFLAGS_syscall32_syscall-xen.o += -DUSE_INT80
30962+
30963+$(obj)/syscall32_syscall-xen.o: \
30964+ $(foreach F,int80 sysenter syscall,$(obj)/vsyscall-$F.so)
30965+
30966+targets := $(foreach F,int80 sysenter syscall,vsyscall-$F.o vsyscall-$F.so)
30967+
30968+include $(srctree)/scripts/Makefile.xen
30969+
30970+obj-y := $(call cherrypickxen, $(obj-y))
30971+endif
30972diff -Nur linux-2.6.16.33-noxen/arch/x86_64/ia32/ia32entry-xen.S linux-2.6.16.33/arch/x86_64/ia32/ia32entry-xen.S
30973--- linux-2.6.16.33-noxen/arch/x86_64/ia32/ia32entry-xen.S 1970-01-01 00:00:00.000000000 +0000
30974+++ linux-2.6.16.33/arch/x86_64/ia32/ia32entry-xen.S 2007-01-08 15:00:45.000000000 +0000
30975@@ -0,0 +1,721 @@
30976+/*
30977+ * Compatibility mode system call entry point for x86-64.
30978+ *
30979+ * Copyright 2000-2002 Andi Kleen, SuSE Labs.
30980+ */
30981+
30982+#include <asm/dwarf2.h>
30983+#include <asm/calling.h>
30984+#include <asm/asm-offsets.h>
30985+#include <asm/current.h>
30986+#include <asm/errno.h>
30987+#include <asm/ia32_unistd.h>
30988+#include <asm/thread_info.h>
30989+#include <asm/segment.h>
30990+#include <asm/vsyscall32.h>
30991+#include <linux/linkage.h>
30992+
30993+#define __XEN_X86_64 1
30994+
30995+ .macro IA32_ARG_FIXUP noebp=0
30996+ movl %edi,%r8d
30997+ .if \noebp
30998+ .else
30999+ movl %ebp,%r9d
31000+ .endif
31001+ xchg %ecx,%esi
31002+ movl %ebx,%edi
31003+ movl %edx,%edx /* zero extension */
31004+ .endm
31005+
31006+ /* clobbers %eax */
31007+ .macro CLEAR_RREGS
31008+ xorl %eax,%eax
31009+ movq %rax,R11(%rsp)
31010+ movq %rax,R10(%rsp)
31011+ movq %rax,R9(%rsp)
31012+ movq %rax,R8(%rsp)
31013+ .endm
31014+
31015+#if defined (__XEN_X86_64)
31016+#include "../kernel/xen_entry.S"
31017+
31018+#define __swapgs
31019+#define __cli
31020+#define __sti
31021+#else
31022+/*
31023+ * Use the native instructions
31024+ */
31025+#define __swapgs swapgs
31026+#define __cli cli
31027+#define __sti sti
31028+#endif
31029+
31030+ .macro CFI_STARTPROC32 simple
31031+ CFI_STARTPROC \simple
31032+ CFI_UNDEFINED r8
31033+ CFI_UNDEFINED r9
31034+ CFI_UNDEFINED r10
31035+ CFI_UNDEFINED r11
31036+ CFI_UNDEFINED r12
31037+ CFI_UNDEFINED r13
31038+ CFI_UNDEFINED r14
31039+ CFI_UNDEFINED r15
31040+ .endm
31041+
31042+/*
31043+ * 32bit SYSENTER instruction entry.
31044+ *
31045+ * Arguments:
31046+ * %eax System call number.
31047+ * %ebx Arg1
31048+ * %ecx Arg2
31049+ * %edx Arg3
31050+ * %esi Arg4
31051+ * %edi Arg5
31052+ * %ebp user stack
31053+ * 0(%ebp) Arg6
31054+ *
31055+ * Interrupts off.
31056+ *
31057+ * This is purely a fast path. For anything complicated we use the int 0x80
31058+ * path below. Set up a complete hardware stack frame to share code
31059+ * with the int 0x80 path.
31060+ */
31061+ENTRY(ia32_sysenter_target)
31062+ CFI_STARTPROC32 simple
31063+ CFI_DEF_CFA rsp,0
31064+ CFI_REGISTER rsp,rbp
31065+ __swapgs
31066+ movq %gs:pda_kernelstack, %rsp
31067+ addq $(PDA_STACKOFFSET),%rsp
31068+ XEN_UNBLOCK_EVENTS(%r11)
31069+ __sti
31070+ movl %ebp,%ebp /* zero extension */
31071+ pushq $__USER32_DS
31072+ CFI_ADJUST_CFA_OFFSET 8
31073+ /*CFI_REL_OFFSET ss,0*/
31074+ pushq %rbp
31075+ CFI_ADJUST_CFA_OFFSET 8
31076+ CFI_REL_OFFSET rsp,0
31077+ pushfq
31078+ CFI_ADJUST_CFA_OFFSET 8
31079+ /*CFI_REL_OFFSET rflags,0*/
31080+ movl $VSYSCALL32_SYSEXIT, %r10d
31081+ CFI_REGISTER rip,r10
31082+ pushq $__USER32_CS
31083+ CFI_ADJUST_CFA_OFFSET 8
31084+ /*CFI_REL_OFFSET cs,0*/
31085+ movl %eax, %eax
31086+ pushq %r10
31087+ CFI_ADJUST_CFA_OFFSET 8
31088+ CFI_REL_OFFSET rip,0
31089+ pushq %rax
31090+ CFI_ADJUST_CFA_OFFSET 8
31091+ cld
31092+ SAVE_ARGS 0,0,1
31093+ /* no need to do an access_ok check here because rbp has been
31094+ 32bit zero extended */
31095+1: movl (%rbp),%r9d
31096+ .section __ex_table,"a"
31097+ .quad 1b,ia32_badarg
31098+ .previous
31099+ GET_THREAD_INFO(%r10)
31100+ orl $TS_COMPAT,threadinfo_status(%r10)
31101+ testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10)
31102+ CFI_REMEMBER_STATE
31103+ jnz sysenter_tracesys
31104+sysenter_do_call:
31105+ cmpl $(IA32_NR_syscalls),%eax
31106+ jae ia32_badsys
31107+ IA32_ARG_FIXUP 1
31108+ call *ia32_sys_call_table(,%rax,8)
31109+ movq %rax,RAX-ARGOFFSET(%rsp)
31110+ GET_THREAD_INFO(%r10)
31111+ XEN_BLOCK_EVENTS(%r11)
31112+ __cli
31113+ testl $_TIF_ALLWORK_MASK,threadinfo_flags(%r10)
31114+ jnz int_ret_from_sys_call
31115+ andl $~TS_COMPAT,threadinfo_status(%r10)
31116+	/* clear IF, so that popfq doesn't enable interrupts early */
31117+ andl $~0x200,EFLAGS-R11(%rsp)
31118+ RESTORE_ARGS 1,24,1,1,1,1
31119+ popfq
31120+ CFI_ADJUST_CFA_OFFSET -8
31121+ /*CFI_RESTORE rflags*/
31122+ popq %rcx /* User %esp */
31123+ CFI_ADJUST_CFA_OFFSET -8
31124+ CFI_REGISTER rsp,rcx
31125+ movl $VSYSCALL32_SYSEXIT,%edx /* User %eip */
31126+ CFI_REGISTER rip,rdx
31127+ __swapgs
31128+ XEN_UNBLOCK_EVENTS(%r11)
31129+ __sti /* sti only takes effect after the next instruction */
31130+ /* sysexit */
31131+ .byte 0xf, 0x35 /* TBD */
31132+
31133+sysenter_tracesys:
31134+ CFI_RESTORE_STATE
31135+ SAVE_REST
31136+ CLEAR_RREGS
31137+ movq $-ENOSYS,RAX(%rsp) /* really needed? */
31138+ movq %rsp,%rdi /* &pt_regs -> arg1 */
31139+ call syscall_trace_enter
31140+ LOAD_ARGS ARGOFFSET /* reload args from stack in case ptrace changed it */
31141+ RESTORE_REST
31142+ movl %ebp, %ebp
31143+ /* no need to do an access_ok check here because rbp has been
31144+ 32bit zero extended */
31145+1: movl (%rbp),%r9d
31146+ .section __ex_table,"a"
31147+ .quad 1b,ia32_badarg
31148+ .previous
31149+ jmp sysenter_do_call
31150+ CFI_ENDPROC
31151+
31152+/*
31153+ * 32bit SYSCALL instruction entry.
31154+ *
31155+ * Arguments:
31156+ * %eax System call number.
31157+ * %ebx Arg1
31158+ * %ecx return EIP
31159+ * %edx Arg3
31160+ * %esi Arg4
31161+ * %edi Arg5
31162+ * %ebp Arg2 [note: not saved in the stack frame, should not be touched]
31163+ * %esp user stack
31164+ * 0(%esp) Arg6
31165+ *
31166+ * Interrupts off.
31167+ *
31168+ * This is purely a fast path. For anything complicated we use the int 0x80
31169+ * path below. Set up a complete hardware stack frame to share code
31170+ * with the int 0x80 path.
31171+ */
31172+ENTRY(ia32_cstar_target)
31173+ CFI_STARTPROC32 simple
31174+ CFI_DEF_CFA rsp,0
31175+ CFI_REGISTER rip,rcx
31176+ /*CFI_REGISTER rflags,r11*/
31177+ __swapgs
31178+ movl %esp,%r8d
31179+ CFI_REGISTER rsp,r8
31180+ movq %gs:pda_kernelstack,%rsp
31181+ XEN_UNBLOCK_EVENTS(%r11)
31182+ __sti
31183+ SAVE_ARGS 8,1,1
31184+ movl %eax,%eax /* zero extension */
31185+ movq %rax,ORIG_RAX-ARGOFFSET(%rsp)
31186+ movq %rcx,RIP-ARGOFFSET(%rsp)
31187+ CFI_REL_OFFSET rip,RIP-ARGOFFSET
31188+ movq %rbp,RCX-ARGOFFSET(%rsp) /* this lies slightly to ptrace */
31189+ movl %ebp,%ecx
31190+ movq $__USER32_CS,CS-ARGOFFSET(%rsp)
31191+ movq $__USER32_DS,SS-ARGOFFSET(%rsp)
31192+ movq %r11,EFLAGS-ARGOFFSET(%rsp)
31193+ /*CFI_REL_OFFSET rflags,EFLAGS-ARGOFFSET*/
31194+ movq %r8,RSP-ARGOFFSET(%rsp)
31195+ CFI_REL_OFFSET rsp,RSP-ARGOFFSET
31196+ /* no need to do an access_ok check here because r8 has been
31197+ 32bit zero extended */
31198+ /* hardware stack frame is complete now */
31199+1: movl (%r8),%r9d
31200+ .section __ex_table,"a"
31201+ .quad 1b,ia32_badarg
31202+ .previous
31203+ GET_THREAD_INFO(%r10)
31204+ orl $TS_COMPAT,threadinfo_status(%r10)
31205+ testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10)
31206+ CFI_REMEMBER_STATE
31207+ jnz cstar_tracesys
31208+cstar_do_call:
31209+ cmpl $IA32_NR_syscalls,%eax
31210+ jae ia32_badsys
31211+ IA32_ARG_FIXUP 1
31212+ call *ia32_sys_call_table(,%rax,8)
31213+ movq %rax,RAX-ARGOFFSET(%rsp)
31214+ GET_THREAD_INFO(%r10)
31215+ XEN_BLOCK_EVENTS(%r11)
31216+ __cli
31217+ testl $_TIF_ALLWORK_MASK,threadinfo_flags(%r10)
31218+ jnz int_ret_from_sys_call
31219+ andl $~TS_COMPAT,threadinfo_status(%r10)
31220+ RESTORE_ARGS 1,-ARG_SKIP,1,1,1
31221+ movl RIP-ARGOFFSET(%rsp),%ecx
31222+ CFI_REGISTER rip,rcx
31223+ movl EFLAGS-ARGOFFSET(%rsp),%r11d
31224+ /*CFI_REGISTER rflags,r11*/
31225+ movl RSP-ARGOFFSET(%rsp),%esp
31226+ CFI_RESTORE rsp
31227+ __swapgs
31228+ sysretl /* TBD */
31229+
31230+cstar_tracesys:
31231+ CFI_RESTORE_STATE
31232+ SAVE_REST
31233+ CLEAR_RREGS
31234+ movq $-ENOSYS,RAX(%rsp) /* really needed? */
31235+ movq %rsp,%rdi /* &pt_regs -> arg1 */
31236+ call syscall_trace_enter
31237+ LOAD_ARGS ARGOFFSET /* reload args from stack in case ptrace changed it */
31238+ RESTORE_REST
31239+ movl RSP-ARGOFFSET(%rsp), %r8d
31240+ /* no need to do an access_ok check here because r8 has been
31241+ 32bit zero extended */
31242+1: movl (%r8),%r9d
31243+ .section __ex_table,"a"
31244+ .quad 1b,ia32_badarg
31245+ .previous
31246+ jmp cstar_do_call
31247+
31248+ia32_badarg:
31249+ movq $-EFAULT,%rax
31250+ jmp ia32_sysret
31251+ CFI_ENDPROC
31252+
31253+/*
31254+ * Emulated IA32 system calls via int 0x80.
31255+ *
31256+ * Arguments:
31257+ * %eax System call number.
31258+ * %ebx Arg1
31259+ * %ecx Arg2
31260+ * %edx Arg3
31261+ * %esi Arg4
31262+ * %edi Arg5
31263+ * %ebp Arg6 [note: not saved in the stack frame, should not be touched]
31264+ *
31265+ * Notes:
31266+ * Uses the same stack frame as the x86-64 version.
31267+ * All registers except %eax must be saved (but ptrace may violate that)
31268+ * Arguments are zero extended. For system calls that want sign extension and
31269+ * take long arguments a wrapper is needed. Most calls can just be called
31270+ * directly.
31271+ * Assumes it is only called from user space and entered with interrupts off.
31272+ */
31273+
31274+ENTRY(ia32_syscall)
31275+ CFI_STARTPROC simple
31276+ CFI_DEF_CFA rsp,SS+8-RIP
31277+ /*CFI_REL_OFFSET ss,SS-RIP*/
31278+ CFI_REL_OFFSET rsp,RSP-RIP
31279+ /*CFI_REL_OFFSET rflags,EFLAGS-RIP*/
31280+ /*CFI_REL_OFFSET cs,CS-RIP*/
31281+ CFI_REL_OFFSET rip,RIP-RIP
31282+ __swapgs
31283+ XEN_UNBLOCK_EVENTS(%r11)
31284+ __sti
31285+ movq (%rsp),%rcx
31286+ movq 8(%rsp),%r11
31287+ addq $0x10,%rsp /* skip rcx and r11 */
31288+ movl %eax,%eax
31289+ pushq %rax
31290+ CFI_ADJUST_CFA_OFFSET 8
31291+ cld
31292+/* 1: jmp 1b */
31293+	/* note the registers are not zero extended to the stack frame.
31294+ this could be a problem. */
31295+ SAVE_ARGS 0,0,1
31296+ GET_THREAD_INFO(%r10)
31297+ orl $TS_COMPAT,threadinfo_status(%r10)
31298+ testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10)
31299+ jnz ia32_tracesys
31300+ia32_do_syscall:
31301+ cmpl $(IA32_NR_syscalls),%eax
31302+ jae ia32_badsys
31303+ IA32_ARG_FIXUP
31304+ call *ia32_sys_call_table(,%rax,8) # xxx: rip relative
31305+ia32_sysret:
31306+ movq %rax,RAX-ARGOFFSET(%rsp)
31307+ jmp int_ret_from_sys_call
31308+
31309+ia32_tracesys:
31310+ SAVE_REST
31311+ movq $-ENOSYS,RAX(%rsp) /* really needed? */
31312+ movq %rsp,%rdi /* &pt_regs -> arg1 */
31313+ call syscall_trace_enter
31314+ LOAD_ARGS ARGOFFSET /* reload args from stack in case ptrace changed it */
31315+ RESTORE_REST
31316+ jmp ia32_do_syscall
31317+
31318+ia32_badsys:
31319+ movq $0,ORIG_RAX-ARGOFFSET(%rsp)
31320+ movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
31321+ jmp int_ret_from_sys_call
31322+
31323+ni_syscall:
31324+ movq %rax,%rdi
31325+ jmp sys32_ni_syscall
31326+
31327+quiet_ni_syscall:
31328+ movq $-ENOSYS,%rax
31329+ ret
31330+ CFI_ENDPROC
31331+
31332+ .macro PTREGSCALL label, func, arg
31333+ .globl \label
31334+\label:
31335+ leaq \func(%rip),%rax
31336+ leaq -ARGOFFSET+8(%rsp),\arg /* 8 for return address */
31337+ jmp ia32_ptregs_common
31338+ .endm
31339+
31340+ CFI_STARTPROC32
31341+
31342+ PTREGSCALL stub32_rt_sigreturn, sys32_rt_sigreturn, %rdi
31343+ PTREGSCALL stub32_sigreturn, sys32_sigreturn, %rdi
31344+ PTREGSCALL stub32_sigaltstack, sys32_sigaltstack, %rdx
31345+ PTREGSCALL stub32_sigsuspend, sys32_sigsuspend, %rcx
31346+ PTREGSCALL stub32_execve, sys32_execve, %rcx
31347+ PTREGSCALL stub32_fork, sys_fork, %rdi
31348+ PTREGSCALL stub32_clone, sys32_clone, %rdx
31349+ PTREGSCALL stub32_vfork, sys_vfork, %rdi
31350+ PTREGSCALL stub32_iopl, sys_iopl, %rsi
31351+ PTREGSCALL stub32_rt_sigsuspend, sys_rt_sigsuspend, %rdx
31352+
31353+ENTRY(ia32_ptregs_common)
31354+ popq %r11
31355+ CFI_ENDPROC
31356+ CFI_STARTPROC32 simple
31357+ CFI_DEF_CFA rsp,SS+8-ARGOFFSET
31358+ CFI_REL_OFFSET rax,RAX-ARGOFFSET
31359+ CFI_REL_OFFSET rcx,RCX-ARGOFFSET
31360+ CFI_REL_OFFSET rdx,RDX-ARGOFFSET
31361+ CFI_REL_OFFSET rsi,RSI-ARGOFFSET
31362+ CFI_REL_OFFSET rdi,RDI-ARGOFFSET
31363+ CFI_REL_OFFSET rip,RIP-ARGOFFSET
31364+/* CFI_REL_OFFSET cs,CS-ARGOFFSET*/
31365+/* CFI_REL_OFFSET rflags,EFLAGS-ARGOFFSET*/
31366+ CFI_REL_OFFSET rsp,RSP-ARGOFFSET
31367+/* CFI_REL_OFFSET ss,SS-ARGOFFSET*/
31368+ SAVE_REST
31369+ call *%rax
31370+ RESTORE_REST
31371+ jmp ia32_sysret /* misbalances the return cache */
31372+ CFI_ENDPROC
31373+
31374+ .section .rodata,"a"
31375+ .align 8
31376+ .globl ia32_sys_call_table
31377+ia32_sys_call_table:
31378+ .quad sys_restart_syscall
31379+ .quad sys_exit
31380+ .quad stub32_fork
31381+ .quad sys_read
31382+ .quad sys_write
31383+ .quad compat_sys_open /* 5 */
31384+ .quad sys_close
31385+ .quad sys32_waitpid
31386+ .quad sys_creat
31387+ .quad sys_link
31388+ .quad sys_unlink /* 10 */
31389+ .quad stub32_execve
31390+ .quad sys_chdir
31391+ .quad compat_sys_time
31392+ .quad sys_mknod
31393+ .quad sys_chmod /* 15 */
31394+ .quad sys_lchown16
31395+ .quad quiet_ni_syscall /* old break syscall holder */
31396+ .quad sys_stat
31397+ .quad sys32_lseek
31398+ .quad sys_getpid /* 20 */
31399+ .quad compat_sys_mount /* mount */
31400+ .quad sys_oldumount /* old_umount */
31401+ .quad sys_setuid16
31402+ .quad sys_getuid16
31403+ .quad compat_sys_stime /* stime */ /* 25 */
31404+ .quad sys32_ptrace /* ptrace */
31405+ .quad sys_alarm
31406+ .quad sys_fstat /* (old)fstat */
31407+ .quad sys_pause
31408+ .quad compat_sys_utime /* 30 */
31409+ .quad quiet_ni_syscall /* old stty syscall holder */
31410+ .quad quiet_ni_syscall /* old gtty syscall holder */
31411+ .quad sys_access
31412+ .quad sys_nice
31413+ .quad quiet_ni_syscall /* 35 */ /* old ftime syscall holder */
31414+ .quad sys_sync
31415+ .quad sys32_kill
31416+ .quad sys_rename
31417+ .quad sys_mkdir
31418+ .quad sys_rmdir /* 40 */
31419+ .quad sys_dup
31420+ .quad sys32_pipe
31421+ .quad compat_sys_times
31422+ .quad quiet_ni_syscall /* old prof syscall holder */
31423+ .quad sys_brk /* 45 */
31424+ .quad sys_setgid16
31425+ .quad sys_getgid16
31426+ .quad sys_signal
31427+ .quad sys_geteuid16
31428+ .quad sys_getegid16 /* 50 */
31429+ .quad sys_acct
31430+ .quad sys_umount /* new_umount */
31431+ .quad quiet_ni_syscall /* old lock syscall holder */
31432+ .quad compat_sys_ioctl
31433+ .quad compat_sys_fcntl64 /* 55 */
31434+ .quad quiet_ni_syscall /* old mpx syscall holder */
31435+ .quad sys_setpgid
31436+ .quad quiet_ni_syscall /* old ulimit syscall holder */
31437+ .quad sys32_olduname
31438+ .quad sys_umask /* 60 */
31439+ .quad sys_chroot
31440+ .quad sys32_ustat
31441+ .quad sys_dup2
31442+ .quad sys_getppid
31443+ .quad sys_getpgrp /* 65 */
31444+ .quad sys_setsid
31445+ .quad sys32_sigaction
31446+ .quad sys_sgetmask
31447+ .quad sys_ssetmask
31448+ .quad sys_setreuid16 /* 70 */
31449+ .quad sys_setregid16
31450+ .quad stub32_sigsuspend
31451+ .quad compat_sys_sigpending
31452+ .quad sys_sethostname
31453+ .quad compat_sys_setrlimit /* 75 */
31454+ .quad compat_sys_old_getrlimit /* old_getrlimit */
31455+ .quad compat_sys_getrusage
31456+ .quad sys32_gettimeofday
31457+ .quad sys32_settimeofday
31458+ .quad sys_getgroups16 /* 80 */
31459+ .quad sys_setgroups16
31460+ .quad sys32_old_select
31461+ .quad sys_symlink
31462+ .quad sys_lstat
31463+ .quad sys_readlink /* 85 */
31464+#ifdef CONFIG_IA32_AOUT
31465+ .quad sys_uselib
31466+#else
31467+ .quad quiet_ni_syscall
31468+#endif
31469+ .quad sys_swapon
31470+ .quad sys_reboot
31471+ .quad compat_sys_old_readdir
31472+ .quad sys32_mmap /* 90 */
31473+ .quad sys_munmap
31474+ .quad sys_truncate
31475+ .quad sys_ftruncate
31476+ .quad sys_fchmod
31477+ .quad sys_fchown16 /* 95 */
31478+ .quad sys_getpriority
31479+ .quad sys_setpriority
31480+ .quad quiet_ni_syscall /* old profil syscall holder */
31481+ .quad compat_sys_statfs
31482+ .quad compat_sys_fstatfs /* 100 */
31483+ .quad sys_ioperm
31484+ .quad compat_sys_socketcall
31485+ .quad sys_syslog
31486+ .quad compat_sys_setitimer
31487+ .quad compat_sys_getitimer /* 105 */
31488+ .quad compat_sys_newstat
31489+ .quad compat_sys_newlstat
31490+ .quad compat_sys_newfstat
31491+ .quad sys32_uname
31492+ .quad stub32_iopl /* 110 */
31493+ .quad sys_vhangup
31494+ .quad quiet_ni_syscall /* old "idle" system call */
31495+ .quad sys32_vm86_warning /* vm86old */
31496+ .quad compat_sys_wait4
31497+ .quad sys_swapoff /* 115 */
31498+ .quad sys32_sysinfo
31499+ .quad sys32_ipc
31500+ .quad sys_fsync
31501+ .quad stub32_sigreturn
31502+ .quad stub32_clone /* 120 */
31503+ .quad sys_setdomainname
31504+ .quad sys_uname
31505+ .quad sys_modify_ldt
31506+ .quad sys32_adjtimex
31507+ .quad sys32_mprotect /* 125 */
31508+ .quad compat_sys_sigprocmask
31509+ .quad quiet_ni_syscall /* create_module */
31510+ .quad sys_init_module
31511+ .quad sys_delete_module
31512+ .quad quiet_ni_syscall /* 130 get_kernel_syms */
31513+ .quad sys_quotactl
31514+ .quad sys_getpgid
31515+ .quad sys_fchdir
31516+ .quad quiet_ni_syscall /* bdflush */
31517+ .quad sys_sysfs /* 135 */
31518+ .quad sys_personality
31519+ .quad quiet_ni_syscall /* for afs_syscall */
31520+ .quad sys_setfsuid16
31521+ .quad sys_setfsgid16
31522+ .quad sys_llseek /* 140 */
31523+ .quad compat_sys_getdents
31524+ .quad compat_sys_select
31525+ .quad sys_flock
31526+ .quad sys_msync
31527+ .quad compat_sys_readv /* 145 */
31528+ .quad compat_sys_writev
31529+ .quad sys_getsid
31530+ .quad sys_fdatasync
31531+ .quad sys32_sysctl /* sysctl */
31532+ .quad sys_mlock /* 150 */
31533+ .quad sys_munlock
31534+ .quad sys_mlockall
31535+ .quad sys_munlockall
31536+ .quad sys_sched_setparam
31537+ .quad sys_sched_getparam /* 155 */
31538+ .quad sys_sched_setscheduler
31539+ .quad sys_sched_getscheduler
31540+ .quad sys_sched_yield
31541+ .quad sys_sched_get_priority_max
31542+ .quad sys_sched_get_priority_min /* 160 */
31543+ .quad sys_sched_rr_get_interval
31544+ .quad compat_sys_nanosleep
31545+ .quad sys_mremap
31546+ .quad sys_setresuid16
31547+ .quad sys_getresuid16 /* 165 */
31548+ .quad sys32_vm86_warning /* vm86 */
31549+ .quad quiet_ni_syscall /* query_module */
31550+ .quad sys_poll
31551+ .quad compat_sys_nfsservctl
31552+ .quad sys_setresgid16 /* 170 */
31553+ .quad sys_getresgid16
31554+ .quad sys_prctl
31555+ .quad stub32_rt_sigreturn
31556+ .quad sys32_rt_sigaction
31557+ .quad sys32_rt_sigprocmask /* 175 */
31558+ .quad sys32_rt_sigpending
31559+ .quad compat_sys_rt_sigtimedwait
31560+ .quad sys32_rt_sigqueueinfo
31561+ .quad stub32_rt_sigsuspend
31562+ .quad sys32_pread /* 180 */
31563+ .quad sys32_pwrite
31564+ .quad sys_chown16
31565+ .quad sys_getcwd
31566+ .quad sys_capget
31567+ .quad sys_capset
31568+ .quad stub32_sigaltstack
31569+ .quad sys32_sendfile
31570+ .quad quiet_ni_syscall /* streams1 */
31571+ .quad quiet_ni_syscall /* streams2 */
31572+ .quad stub32_vfork /* 190 */
31573+ .quad compat_sys_getrlimit
31574+ .quad sys32_mmap2
31575+ .quad sys32_truncate64
31576+ .quad sys32_ftruncate64
31577+ .quad sys32_stat64 /* 195 */
31578+ .quad sys32_lstat64
31579+ .quad sys32_fstat64
31580+ .quad sys_lchown
31581+ .quad sys_getuid
31582+ .quad sys_getgid /* 200 */
31583+ .quad sys_geteuid
31584+ .quad sys_getegid
31585+ .quad sys_setreuid
31586+ .quad sys_setregid
31587+ .quad sys_getgroups /* 205 */
31588+ .quad sys_setgroups
31589+ .quad sys_fchown
31590+ .quad sys_setresuid
31591+ .quad sys_getresuid
31592+ .quad sys_setresgid /* 210 */
31593+ .quad sys_getresgid
31594+ .quad sys_chown
31595+ .quad sys_setuid
31596+ .quad sys_setgid
31597+ .quad sys_setfsuid /* 215 */
31598+ .quad sys_setfsgid
31599+ .quad sys_pivot_root
31600+ .quad sys_mincore
31601+ .quad sys_madvise
31602+ .quad compat_sys_getdents64 /* 220 getdents64 */
31603+ .quad compat_sys_fcntl64
31604+ .quad quiet_ni_syscall /* tux */
31605+ .quad quiet_ni_syscall /* security */
31606+ .quad sys_gettid
31607+ .quad sys_readahead /* 225 */
31608+ .quad sys_setxattr
31609+ .quad sys_lsetxattr
31610+ .quad sys_fsetxattr
31611+ .quad sys_getxattr
31612+ .quad sys_lgetxattr /* 230 */
31613+ .quad sys_fgetxattr
31614+ .quad sys_listxattr
31615+ .quad sys_llistxattr
31616+ .quad sys_flistxattr
31617+ .quad sys_removexattr /* 235 */
31618+ .quad sys_lremovexattr
31619+ .quad sys_fremovexattr
31620+ .quad sys_tkill
31621+ .quad sys_sendfile64
31622+ .quad compat_sys_futex /* 240 */
31623+ .quad compat_sys_sched_setaffinity
31624+ .quad compat_sys_sched_getaffinity
31625+ .quad sys32_set_thread_area
31626+ .quad sys32_get_thread_area
31627+ .quad compat_sys_io_setup /* 245 */
31628+ .quad sys_io_destroy
31629+ .quad compat_sys_io_getevents
31630+ .quad compat_sys_io_submit
31631+ .quad sys_io_cancel
31632+ .quad sys_fadvise64 /* 250 */
31633+ .quad quiet_ni_syscall /* free_huge_pages */
31634+ .quad sys_exit_group
31635+ .quad sys32_lookup_dcookie
31636+ .quad sys_epoll_create
31637+ .quad sys_epoll_ctl /* 255 */
31638+ .quad sys_epoll_wait
31639+ .quad sys_remap_file_pages
31640+ .quad sys_set_tid_address
31641+ .quad compat_sys_timer_create
31642+ .quad compat_sys_timer_settime /* 260 */
31643+ .quad compat_sys_timer_gettime
31644+ .quad sys_timer_getoverrun
31645+ .quad sys_timer_delete
31646+ .quad compat_sys_clock_settime
31647+ .quad compat_sys_clock_gettime /* 265 */
31648+ .quad compat_sys_clock_getres
31649+ .quad compat_sys_clock_nanosleep
31650+ .quad compat_sys_statfs64
31651+ .quad compat_sys_fstatfs64
31652+ .quad sys_tgkill /* 270 */
31653+ .quad compat_sys_utimes
31654+ .quad sys32_fadvise64_64
31655+ .quad quiet_ni_syscall /* sys_vserver */
31656+ .quad sys_mbind
31657+ .quad compat_sys_get_mempolicy /* 275 */
31658+ .quad sys_set_mempolicy
31659+ .quad compat_sys_mq_open
31660+ .quad sys_mq_unlink
31661+ .quad compat_sys_mq_timedsend
31662+ .quad compat_sys_mq_timedreceive /* 280 */
31663+ .quad compat_sys_mq_notify
31664+ .quad compat_sys_mq_getsetattr
31665+ .quad compat_sys_kexec_load /* reserved for kexec */
31666+ .quad compat_sys_waitid
31667+ .quad quiet_ni_syscall /* 285: sys_altroot */
31668+ .quad sys_add_key
31669+ .quad sys_request_key
31670+ .quad sys_keyctl
31671+ .quad sys_ioprio_set
31672+ .quad sys_ioprio_get /* 290 */
31673+ .quad sys_inotify_init
31674+ .quad sys_inotify_add_watch
31675+ .quad sys_inotify_rm_watch
31676+ .quad sys_migrate_pages
31677+ .quad compat_sys_openat /* 295 */
31678+ .quad sys_mkdirat
31679+ .quad sys_mknodat
31680+ .quad sys_fchownat
31681+ .quad compat_sys_futimesat
31682+ .quad sys32_fstatat /* 300 */
31683+ .quad sys_unlinkat
31684+ .quad sys_renameat
31685+ .quad sys_linkat
31686+ .quad sys_symlinkat
31687+ .quad sys_readlinkat /* 305 */
31688+ .quad sys_fchmodat
31689+ .quad sys_faccessat
31690+ .quad sys_ni_syscall /* pselect6 for now */
31691+ .quad sys_ni_syscall /* ppoll for now */
31692+ .quad sys_unshare /* 310 */
31693+ia32_syscall_end:
31694+ .rept IA32_NR_syscalls-(ia32_syscall_end-ia32_sys_call_table)/8
31695+ .quad ni_syscall
31696+ .endr
31697diff -Nur linux-2.6.16.33-noxen/arch/x86_64/ia32/syscall32-xen.c linux-2.6.16.33/arch/x86_64/ia32/syscall32-xen.c
31698--- linux-2.6.16.33-noxen/arch/x86_64/ia32/syscall32-xen.c 1970-01-01 00:00:00.000000000 +0000
31699+++ linux-2.6.16.33/arch/x86_64/ia32/syscall32-xen.c 2007-01-08 15:00:45.000000000 +0000
31700@@ -0,0 +1,128 @@
31701+/* Copyright 2002,2003 Andi Kleen, SuSE Labs */
31702+
31703+/* vsyscall handling for 32bit processes. Map a stub page into the process
31704+   address space on demand because 32bit code cannot reach the kernel's fixmaps */
31705+
31706+#include <linux/mm.h>
31707+#include <linux/string.h>
31708+#include <linux/kernel.h>
31709+#include <linux/gfp.h>
31710+#include <linux/init.h>
31711+#include <linux/stringify.h>
31712+#include <linux/security.h>
31713+#include <asm/proto.h>
31714+#include <asm/tlbflush.h>
31715+#include <asm/ia32_unistd.h>
31716+
31717+#ifdef USE_INT80
31718+extern unsigned char syscall32_int80[], syscall32_int80_end[];
31719+#endif
31720+extern unsigned char syscall32_syscall[], syscall32_syscall_end[];
31721+extern unsigned char syscall32_sysenter[], syscall32_sysenter_end[];
31722+extern int sysctl_vsyscall32;
31723+
31724+char *syscall32_page;
31725+#ifndef USE_INT80
31726+static int use_sysenter = -1;
31727+#endif
31728+
31729+static struct page *
31730+syscall32_nopage(struct vm_area_struct *vma, unsigned long adr, int *type)
31731+{
31732+ struct page *p = virt_to_page(adr - vma->vm_start + syscall32_page);
31733+ get_page(p);
31734+ return p;
31735+}
31736+
31737+/* Prevent VMA merging */
31738+static void syscall32_vma_close(struct vm_area_struct *vma)
31739+{
31740+}
31741+
31742+static struct vm_operations_struct syscall32_vm_ops = {
31743+ .close = syscall32_vma_close,
31744+ .nopage = syscall32_nopage,
31745+};
31746+
31747+struct linux_binprm;
31748+
31749+/* Setup a VMA at program startup for the vsyscall page */
31750+int syscall32_setup_pages(struct linux_binprm *bprm, int exstack)
31751+{
31752+ int npages = (VSYSCALL32_END - VSYSCALL32_BASE) >> PAGE_SHIFT;
31753+ struct vm_area_struct *vma;
31754+ struct mm_struct *mm = current->mm;
31755+ int ret;
31756+
31757+ vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
31758+ if (!vma)
31759+ return -ENOMEM;
31760+
31761+ memset(vma, 0, sizeof(struct vm_area_struct));
31762+ /* Could randomize here */
31763+ vma->vm_start = VSYSCALL32_BASE;
31764+ vma->vm_end = VSYSCALL32_END;
31765+ /* MAYWRITE to allow gdb to COW and set breakpoints */
31766+ vma->vm_flags = VM_READ|VM_EXEC|VM_MAYREAD|VM_MAYEXEC|VM_MAYWRITE;
31767+ vma->vm_flags |= mm->def_flags;
31768+ vma->vm_page_prot = protection_map[vma->vm_flags & 7];
31769+ vma->vm_ops = &syscall32_vm_ops;
31770+ vma->vm_mm = mm;
31771+
31772+ down_write(&mm->mmap_sem);
31773+ if ((ret = insert_vm_struct(mm, vma))) {
31774+ up_write(&mm->mmap_sem);
31775+ kmem_cache_free(vm_area_cachep, vma);
31776+ return ret;
31777+ }
31778+ mm->total_vm += npages;
31779+ up_write(&mm->mmap_sem);
31780+ return 0;
31781+}
31782+
31783+static int __init init_syscall32(void)
31784+{
31785+ syscall32_page = (void *)get_zeroed_page(GFP_KERNEL);
31786+ if (!syscall32_page)
31787+ panic("Cannot allocate syscall32 page");
31788+
31789+#ifdef USE_INT80
31790+ /*
31791+ * At this point we use int 0x80.
31792+ */
31793+ memcpy(syscall32_page, syscall32_int80,
31794+ syscall32_int80_end - syscall32_int80);
31795+#else
31796+ if (use_sysenter > 0) {
31797+ memcpy(syscall32_page, syscall32_sysenter,
31798+ syscall32_sysenter_end - syscall32_sysenter);
31799+ } else {
31800+ memcpy(syscall32_page, syscall32_syscall,
31801+ syscall32_syscall_end - syscall32_syscall);
31802+ }
31803+#endif
31804+ return 0;
31805+}
31806+
31807+/*
31808+ * This must be done early in case we have an initrd containing 32-bit
31809+ * binaries (e.g., hotplug). This could be pushed upstream to arch/x86_64.
31810+ */
31811+core_initcall(init_syscall32);
31812+
31813+/* May not be __init: called during resume */
31814+void syscall32_cpu_init(void)
31815+{
31816+#ifndef USE_INT80
31817+ if (use_sysenter < 0)
31818+ use_sysenter = (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL);
31819+
31820+ /* Load these always in case some future AMD CPU supports
31821+ SYSENTER from compat mode too. */
31822+ checking_wrmsrl(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
31823+ checking_wrmsrl(MSR_IA32_SYSENTER_ESP, 0ULL);
31824+ checking_wrmsrl(MSR_IA32_SYSENTER_EIP, (u64)ia32_sysenter_target);
31825+
31826+ wrmsrl(MSR_CSTAR, ia32_cstar_target);
31827+#endif
31828+}
31829diff -Nur linux-2.6.16.33-noxen/arch/x86_64/ia32/syscall32_syscall-xen.S linux-2.6.16.33/arch/x86_64/ia32/syscall32_syscall-xen.S
31830--- linux-2.6.16.33-noxen/arch/x86_64/ia32/syscall32_syscall-xen.S 1970-01-01 00:00:00.000000000 +0000
31831+++ linux-2.6.16.33/arch/x86_64/ia32/syscall32_syscall-xen.S 2007-01-08 15:00:45.000000000 +0000
31832@@ -0,0 +1,28 @@
31833+/* 32bit VDSOs mapped into user space. */
31834+
31835+ .section ".init.data","aw"
31836+
31837+#ifdef USE_INT80
31838+
31839+ .globl syscall32_int80
31840+ .globl syscall32_int80_end
31841+
31842+syscall32_int80:
31843+ .incbin "arch/x86_64/ia32/vsyscall-int80.so"
31844+syscall32_int80_end:
31845+
31846+#endif
31847+
31848+ .globl syscall32_syscall
31849+ .globl syscall32_syscall_end
31850+
31851+syscall32_syscall:
31852+ .incbin "arch/x86_64/ia32/vsyscall-syscall.so"
31853+syscall32_syscall_end:
31854+
31855+ .globl syscall32_sysenter
31856+ .globl syscall32_sysenter_end
31857+
31858+syscall32_sysenter:
31859+ .incbin "arch/x86_64/ia32/vsyscall-sysenter.so"
31860+syscall32_sysenter_end:
31861diff -Nur linux-2.6.16.33-noxen/arch/x86_64/ia32/vsyscall-int80.S linux-2.6.16.33/arch/x86_64/ia32/vsyscall-int80.S
31862--- linux-2.6.16.33-noxen/arch/x86_64/ia32/vsyscall-int80.S 1970-01-01 00:00:00.000000000 +0000
31863+++ linux-2.6.16.33/arch/x86_64/ia32/vsyscall-int80.S 2007-01-08 15:00:45.000000000 +0000
31864@@ -0,0 +1,58 @@
31865+/*
31866+ * Code for the vsyscall page. This version uses the old int $0x80 method.
31867+ *
31868+ * NOTE:
31869+ * 1) __kernel_vsyscall _must_ be first in this page.
31870+ * 2) there are alignment constraints on this stub, see vsyscall-sigreturn.S
31871+ * for details.
31872+ */
31873+#include <asm/ia32_unistd.h>
31874+#include <asm/asm-offsets.h>
31875+
31876+ .code32
31877+ .text
31878+ .section .text.vsyscall,"ax"
31879+ .globl __kernel_vsyscall
31880+ .type __kernel_vsyscall,@function
31881+__kernel_vsyscall:
31882+.LSTART_vsyscall:
31883+ int $0x80
31884+ ret
31885+.LEND_vsyscall:
31886+ .size __kernel_vsyscall,.-.LSTART_vsyscall
31887+ .previous
31888+
31889+ .section .eh_frame,"a",@progbits
31890+.LSTARTFRAME:
31891+ .long .LENDCIE-.LSTARTCIE
31892+.LSTARTCIE:
31893+ .long 0 /* CIE ID */
31894+ .byte 1 /* Version number */
31895+ .string "zR" /* NUL-terminated augmentation string */
31896+ .uleb128 1 /* Code alignment factor */
31897+ .sleb128 -4 /* Data alignment factor */
31898+ .byte 8 /* Return address register column */
31899+ .uleb128 1 /* Augmentation value length */
31900+ .byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */
31901+ .byte 0x0c /* DW_CFA_def_cfa */
31902+ .uleb128 4
31903+ .uleb128 4
31904+ .byte 0x88 /* DW_CFA_offset, column 0x8 */
31905+ .uleb128 1
31906+ .align 4
31907+.LENDCIE:
31908+
31909+ .long .LENDFDE1-.LSTARTFDE1 /* Length FDE */
31910+.LSTARTFDE1:
31911+ .long .LSTARTFDE1-.LSTARTFRAME /* CIE pointer */
31912+ .long .LSTART_vsyscall-. /* PC-relative start address */
31913+ .long .LEND_vsyscall-.LSTART_vsyscall
31914+ .uleb128 0 /* Augmentation length */
31915+ .align 4
31916+.LENDFDE1:
31917+
31918+/*
31919+ * Get the common code for the sigreturn entry points.
31920+ */
31921+#define SYSCALL_ENTER_KERNEL int $0x80
31922+#include "vsyscall-sigreturn.S"
31923diff -Nur linux-2.6.16.33-noxen/arch/x86_64/ia32/vsyscall-sigreturn.S linux-2.6.16.33/arch/x86_64/ia32/vsyscall-sigreturn.S
31924--- linux-2.6.16.33-noxen/arch/x86_64/ia32/vsyscall-sigreturn.S 2006-11-22 18:06:31.000000000 +0000
31925+++ linux-2.6.16.33/arch/x86_64/ia32/vsyscall-sigreturn.S 2007-01-08 15:00:45.000000000 +0000
31926@@ -120,5 +120,5 @@
31927 .align 4
31928 .LENDFDE3:
31929
31930-#include "../../i386/kernel/vsyscall-note.S"
31931+#include <vsyscall-note.S>
31932
31933diff -Nur linux-2.6.16.33-noxen/arch/x86_64/kernel/Makefile linux-2.6.16.33/arch/x86_64/kernel/Makefile
31934--- linux-2.6.16.33-noxen/arch/x86_64/kernel/Makefile 2006-11-22 18:06:31.000000000 +0000
31935+++ linux-2.6.16.33/arch/x86_64/kernel/Makefile 2007-01-08 15:00:45.000000000 +0000
31936@@ -20,11 +20,13 @@
31937 obj-$(CONFIG_X86_CPUID) += cpuid.o
31938 obj-$(CONFIG_SMP) += smp.o smpboot.o trampoline.o
31939 obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o
31940+obj-$(CONFIG_X86_XEN_GENAPIC) += genapic.o genapic_xen.o
31941 obj-$(CONFIG_X86_IO_APIC) += io_apic.o mpparse.o \
31942 genapic.o genapic_cluster.o genapic_flat.o
31943 obj-$(CONFIG_KEXEC) += machine_kexec.o relocate_kernel.o crash.o
31944 obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
31945-obj-$(CONFIG_PM) += suspend.o
31946+obj-$(CONFIG_SOFTWARE_SUSPEND) += suspend.o
31947+obj-$(CONFIG_ACPI_SLEEP) += suspend.o
31948 obj-$(CONFIG_SOFTWARE_SUSPEND) += suspend_asm.o
31949 obj-$(CONFIG_CPU_FREQ) += cpufreq/
31950 obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
31951@@ -51,3 +53,17 @@
31952 msr-$(subst m,y,$(CONFIG_X86_MSR)) += ../../i386/kernel/msr.o
31953 dmi_scan-y += ../../i386/kernel/dmi_scan.o
31954
31955+ifdef CONFIG_XEN
31956+time-y += ../../i386/kernel/time-xen.o
31957+pci-dma-y += ../../i386/kernel/pci-dma-xen.o
31958+microcode-$(subst m,y,$(CONFIG_MICROCODE)) := ../../i386/kernel/microcode-xen.o
31959+quirks-y := ../../i386/kernel/quirks-xen.o
31960+
31961+n-obj-xen := i8259.o reboot.o i8237.o smpboot.o trampoline.o
31962+
31963+include $(srctree)/scripts/Makefile.xen
31964+
31965+obj-y := $(call filterxen, $(obj-y), $(n-obj-xen))
31966+obj-y := $(call cherrypickxen, $(obj-y))
31967+extra-y := $(call cherrypickxen, $(extra-y))
31968+endif
31969diff -Nur linux-2.6.16.33-noxen/arch/x86_64/kernel/acpi/Makefile linux-2.6.16.33/arch/x86_64/kernel/acpi/Makefile
31970--- linux-2.6.16.33-noxen/arch/x86_64/kernel/acpi/Makefile 2006-11-22 18:06:31.000000000 +0000
31971+++ linux-2.6.16.33/arch/x86_64/kernel/acpi/Makefile 2007-01-08 15:00:45.000000000 +0000
31972@@ -7,3 +7,4 @@
31973 processor-y := ../../../i386/kernel/acpi/processor.o ../../../i386/kernel/acpi/cstate.o
31974 endif
31975
31976+boot-$(CONFIG_XEN) := ../../../i386/kernel/acpi/boot-xen.o
31977diff -Nur linux-2.6.16.33-noxen/arch/x86_64/kernel/apic-xen.c linux-2.6.16.33/arch/x86_64/kernel/apic-xen.c
31978--- linux-2.6.16.33-noxen/arch/x86_64/kernel/apic-xen.c 1970-01-01 00:00:00.000000000 +0000
31979+++ linux-2.6.16.33/arch/x86_64/kernel/apic-xen.c 2007-01-08 15:00:45.000000000 +0000
31980@@ -0,0 +1,198 @@
31981+/*
31982+ * Local APIC handling, local APIC timers
31983+ *
31984+ * (c) 1999, 2000 Ingo Molnar <mingo@redhat.com>
31985+ *
31986+ * Fixes
31987+ * Maciej W. Rozycki : Bits for genuine 82489DX APICs;
31988+ * thanks to Eric Gilmore
31989+ * and Rolf G. Tews
31990+ * for testing these extensively.
31991+ * Maciej W. Rozycki : Various updates and fixes.
31992+ * Mikael Pettersson : Power Management for UP-APIC.
31993+ * Pavel Machek and
31994+ * Mikael Pettersson : PM converted to driver model.
31995+ */
31996+
31997+#include <linux/config.h>
31998+#include <linux/init.h>
31999+
32000+#include <linux/mm.h>
32001+#include <linux/delay.h>
32002+#include <linux/bootmem.h>
32003+#include <linux/smp_lock.h>
32004+#include <linux/interrupt.h>
32005+#include <linux/mc146818rtc.h>
32006+#include <linux/kernel_stat.h>
32007+#include <linux/sysdev.h>
32008+#include <linux/module.h>
32009+
32010+#include <asm/atomic.h>
32011+#include <asm/smp.h>
32012+#include <asm/mtrr.h>
32013+#include <asm/mpspec.h>
32014+#include <asm/desc.h>
32015+#include <asm/arch_hooks.h>
32016+#include <asm/hpet.h>
32017+#include <asm/idle.h>
32018+
32019+int apic_verbosity;
32020+
32021+/*
32022+ * 'what should we do if we get a hw irq event on an illegal vector'.
32023+ * each architecture has to answer this themselves.
32024+ * each architecture has to answer this itself.
32025+void ack_bad_irq(unsigned int irq)
32026+{
32027+ printk("unexpected IRQ trap at vector %02x\n", irq);
32028+ /*
32029+ * Currently unexpected vectors happen only on SMP and APIC.
32030+ * We _must_ ack these because every local APIC has only N
32031+ * irq slots per priority level, and a 'hanging, unacked' IRQ
32032+ * holds up an irq slot - in excessive cases (when multiple
32033+ * unexpected vectors occur) that might lock up the APIC
32034+ * completely.
32035+ * But don't ack when the APIC is disabled. -AK
32036+ */
32037+ if (!disable_apic)
32038+ ack_APIC_irq();
32039+}
32040+
32041+int setup_profiling_timer(unsigned int multiplier)
32042+{
32043+ return -EINVAL;
32044+}
32045+
32046+void smp_local_timer_interrupt(struct pt_regs *regs)
32047+{
32048+ profile_tick(CPU_PROFILING, regs);
32049+#ifndef CONFIG_XEN
32050+#ifdef CONFIG_SMP
32051+ update_process_times(user_mode(regs));
32052+#endif
32053+#endif
32054+ /*
32055+ * We take the 'long' return path, and there every subsystem
32056+ * grabs the appropriate locks (kernel lock/ irq lock).
32057+ *
32058+ * we might want to decouple profiling from the 'long path',
32059+ * and do the profiling totally in assembly.
32060+ *
32061+ * Currently this isn't too much of an issue (performance wise),
32062+ * we can take more than 100K local irqs per second on a 100 MHz P5.
32063+ */
32064+}
32065+
32066+/*
32067+ * Local APIC timer interrupt. This is the most natural way for doing
32068+ * local interrupts, but local timer interrupts can be emulated by
32069+ * broadcast interrupts too. [in case the hw doesn't support APIC timers]
32070+ *
32071+ * [ if a single-CPU system runs an SMP kernel then we call the local
32072+ * interrupt as well. Thus we cannot inline the local irq ... ]
32073+ */
32074+void smp_apic_timer_interrupt(struct pt_regs *regs)
32075+{
32076+ /*
32077+ * the NMI deadlock-detector uses this.
32078+ */
32079+ add_pda(apic_timer_irqs, 1);
32080+
32081+ /*
32082+ * NOTE! We'd better ACK the irq immediately,
32083+ * because timer handling can be slow.
32084+ */
32085+ ack_APIC_irq();
32086+ /*
32087+ * update_process_times() expects us to have done irq_enter().
32088+	 * Besides, if we don't, timer interrupts ignore the global
32089+ * interrupt lock, which is the WrongThing (tm) to do.
32090+ */
32091+ exit_idle();
32092+ irq_enter();
32093+ smp_local_timer_interrupt(regs);
32094+ irq_exit();
32095+}
32096+
32097+/*
32098+ * This interrupt should _never_ happen with our APIC/SMP architecture
32099+ */
32100+asmlinkage void smp_spurious_interrupt(void)
32101+{
32102+ unsigned int v;
32103+ exit_idle();
32104+ irq_enter();
32105+ /*
32106+ * Check if this really is a spurious interrupt and ACK it
32107+ * if it is a vectored one. Just in case...
32108+ * Spurious interrupts should not be ACKed.
32109+ */
32110+ v = apic_read(APIC_ISR + ((SPURIOUS_APIC_VECTOR & ~0x1f) >> 1));
32111+ if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f)))
32112+ ack_APIC_irq();
32113+
32114+#if 0
32115+ static unsigned long last_warning;
32116+ static unsigned long skipped;
32117+
32118+ /* see sw-dev-man vol 3, chapter 7.4.13.5 */
32119+ if (time_before(last_warning+30*HZ,jiffies)) {
32120+ printk(KERN_INFO "spurious APIC interrupt on CPU#%d, %ld skipped.\n",
32121+ smp_processor_id(), skipped);
32122+ last_warning = jiffies;
32123+ skipped = 0;
32124+ } else {
32125+ skipped++;
32126+ }
32127+#endif
32128+ irq_exit();
32129+}
32130+
32131+/*
32132+ * This interrupt should never happen with our APIC/SMP architecture
32133+ */
32134+
32135+asmlinkage void smp_error_interrupt(void)
32136+{
32137+ unsigned int v, v1;
32138+
32139+ exit_idle();
32140+ irq_enter();
32141+ /* First tickle the hardware, only then report what went on. -- REW */
32142+ v = apic_read(APIC_ESR);
32143+ apic_write(APIC_ESR, 0);
32144+ v1 = apic_read(APIC_ESR);
32145+ ack_APIC_irq();
32146+ atomic_inc(&irq_err_count);
32147+
32148+ /* Here is what the APIC error bits mean:
32149+ 0: Send CS error
32150+ 1: Receive CS error
32151+ 2: Send accept error
32152+ 3: Receive accept error
32153+ 4: Reserved
32154+ 5: Send illegal vector
32155+ 6: Received illegal vector
32156+ 7: Illegal register address
32157+ */
32158+ printk (KERN_DEBUG "APIC error on CPU%d: %02x(%02x)\n",
32159+ smp_processor_id(), v , v1);
32160+ irq_exit();
32161+}
32162+
32163+int disable_apic;
32164+
32165+/*
32166+ * This initializes the IO-APIC and APIC hardware if this is
32167+ * a UP kernel.
32168+ */
32169+int __init APIC_init_uniprocessor (void)
32170+{
32171+#ifdef CONFIG_X86_IO_APIC
32172+ if (smp_found_config)
32173+ if (!skip_ioapic_setup && nr_ioapics)
32174+ setup_IO_APIC();
32175+#endif
32176+
32177+ return 0;
32178+}
32179diff -Nur linux-2.6.16.33-noxen/arch/x86_64/kernel/asm-offsets.c linux-2.6.16.33/arch/x86_64/kernel/asm-offsets.c
32180--- linux-2.6.16.33-noxen/arch/x86_64/kernel/asm-offsets.c 2006-11-22 18:06:31.000000000 +0000
32181+++ linux-2.6.16.33/arch/x86_64/kernel/asm-offsets.c 2007-01-08 15:00:45.000000000 +0000
32182@@ -66,7 +66,9 @@
32183 DEFINE(pbe_address, offsetof(struct pbe, address));
32184 DEFINE(pbe_orig_address, offsetof(struct pbe, orig_address));
32185 DEFINE(pbe_next, offsetof(struct pbe, next));
32186+#ifndef CONFIG_X86_NO_TSS
32187 BLANK();
32188 DEFINE(TSS_ist, offsetof(struct tss_struct, ist));
32189+#endif
32190 return 0;
32191 }
32192diff -Nur linux-2.6.16.33-noxen/arch/x86_64/kernel/crash.c linux-2.6.16.33/arch/x86_64/kernel/crash.c
32193--- linux-2.6.16.33-noxen/arch/x86_64/kernel/crash.c 2006-11-22 18:06:31.000000000 +0000
32194+++ linux-2.6.16.33/arch/x86_64/kernel/crash.c 2007-01-08 15:00:45.000000000 +0000
32195@@ -92,6 +92,7 @@
32196 crash_save_this_cpu(regs, cpu);
32197 }
32198
32199+#ifndef CONFIG_XEN
32200 #ifdef CONFIG_SMP
32201 static atomic_t waiting_for_crash_ipi;
32202
32203@@ -156,6 +157,7 @@
32204 /* There are no cpus to shootdown */
32205 }
32206 #endif
32207+#endif /* CONFIG_XEN */
32208
32209 void machine_crash_shutdown(struct pt_regs *regs)
32210 {
32211@@ -173,6 +175,8 @@
32212
32213 /* Make a note of crashing cpu. Will be used in NMI callback.*/
32214 crashing_cpu = smp_processor_id();
32215+
32216+#ifndef CONFIG_XEN
32217 nmi_shootdown_cpus();
32218
32219 if(cpu_has_apic)
32220@@ -181,6 +185,6 @@
32221 #if defined(CONFIG_X86_IO_APIC)
32222 disable_IO_APIC();
32223 #endif
32224-
32225+#endif /* CONFIG_XEN */
32226 crash_save_self(regs);
32227 }
32228diff -Nur linux-2.6.16.33-noxen/arch/x86_64/kernel/e820-xen.c linux-2.6.16.33/arch/x86_64/kernel/e820-xen.c
32229--- linux-2.6.16.33-noxen/arch/x86_64/kernel/e820-xen.c 1970-01-01 00:00:00.000000000 +0000
32230+++ linux-2.6.16.33/arch/x86_64/kernel/e820-xen.c 2007-01-08 15:00:45.000000000 +0000
32231@@ -0,0 +1,719 @@
32232+/*
32233+ * Handle the memory map.
32234+ * The functions here do the job until bootmem takes over.
32235+ * $Id: e820.c,v 1.4 2002/09/19 19:25:32 ak Exp $
32236+ *
32237+ * Getting sanitize_e820_map() in sync with i386 version by applying change:
32238+ * - Provisions for empty E820 memory regions (reported by certain BIOSes).
32239+ * Alex Achenbach <xela@slit.de>, December 2002.
32240+ * Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
32241+ *
32242+ */
32243+#include <linux/config.h>
32244+#include <linux/kernel.h>
32245+#include <linux/types.h>
32246+#include <linux/init.h>
32247+#include <linux/bootmem.h>
32248+#include <linux/ioport.h>
32249+#include <linux/string.h>
32250+#include <linux/kexec.h>
32251+#include <linux/module.h>
32252+
32253+#include <asm/page.h>
32254+#include <asm/e820.h>
32255+#include <asm/proto.h>
32256+#include <asm/bootsetup.h>
32257+#include <asm/sections.h>
32258+#include <xen/interface/memory.h>
32259+
32260+/*
32261+ * PFN of last memory page.
32262+ */
32263+unsigned long end_pfn;
32264+EXPORT_SYMBOL(end_pfn);
32265+
32266+/*
32267+ * end_pfn only includes RAM, while end_pfn_map includes all e820 entries.
32268+ * The direct mapping extends to end_pfn_map, so that we can directly access
32269+ * apertures, ACPI and other tables without having to play with fixmaps.
32270+ */
32271+unsigned long end_pfn_map;
32272+
32273+/*
32274+ * Last pfn which the user wants to use.
32275+ */
32276+unsigned long end_user_pfn = MAXMEM>>PAGE_SHIFT;
32277+
32278+extern struct resource code_resource, data_resource;
32279+
32280+/* Check for some hardcoded bad areas that early boot is not allowed to touch */
32281+static inline int bad_addr(unsigned long *addrp, unsigned long size)
32282+{
32283+ unsigned long addr = *addrp, last = addr + size;
32284+
32285+#ifndef CONFIG_XEN
32286+ /* various gunk below that needed for SMP startup */
32287+ if (addr < 0x8000) {
32288+ *addrp = 0x8000;
32289+ return 1;
32290+ }
32291+
32292+ /* direct mapping tables of the kernel */
32293+ if (last >= table_start<<PAGE_SHIFT && addr < table_end<<PAGE_SHIFT) {
32294+ *addrp = table_end << PAGE_SHIFT;
32295+ return 1;
32296+ }
32297+
32298+ /* initrd */
32299+#ifdef CONFIG_BLK_DEV_INITRD
32300+ if (LOADER_TYPE && INITRD_START && last >= INITRD_START &&
32301+ addr < INITRD_START+INITRD_SIZE) {
32302+ *addrp = INITRD_START + INITRD_SIZE;
32303+ return 1;
32304+ }
32305+#endif
32306+ /* kernel code + 640k memory hole (later should not be needed, but
32307+ be paranoid for now) */
32308+ if (last >= 640*1024 && addr < __pa_symbol(&_end)) {
32309+ *addrp = __pa_symbol(&_end);
32310+ return 1;
32311+ }
32312+ /* XXX ramdisk image here? */
32313+#else
32314+ if (last < (table_end<<PAGE_SHIFT)) {
32315+ *addrp = table_end << PAGE_SHIFT;
32316+ return 1;
32317+ }
32318+#endif
32319+ return 0;
32320+}
32321+
32322+#ifndef CONFIG_XEN
32323+int __init e820_mapped(unsigned long start, unsigned long end, unsigned type)
32324+{
32325+ int i;
32326+ for (i = 0; i < e820.nr_map; i++) {
32327+ struct e820entry *ei = &e820.map[i];
32328+ if (type && ei->type != type)
32329+ continue;
32330+ if (ei->addr >= end || ei->addr + ei->size <= start)
32331+ continue;
32332+ return 1;
32333+ }
32334+ return 0;
32335+}
32336+#endif
32337+
32338+/*
32339+ * Find a free area in a specific range.
32340+ */
32341+unsigned long __init find_e820_area(unsigned long start, unsigned long end, unsigned size)
32342+{
32343+ int i;
32344+ for (i = 0; i < e820.nr_map; i++) {
32345+ struct e820entry *ei = &e820.map[i];
32346+ unsigned long addr = ei->addr, last;
32347+ if (ei->type != E820_RAM)
32348+ continue;
32349+ if (addr < start)
32350+ addr = start;
32351+ if (addr > ei->addr + ei->size)
32352+ continue;
32353+ while (bad_addr(&addr, size) && addr+size < ei->addr + ei->size)
32354+ ;
32355+ last = addr + size;
32356+ if (last > ei->addr + ei->size)
32357+ continue;
32358+ if (last > end)
32359+ continue;
32360+ return addr;
32361+ }
32362+ return -1UL;
32363+}
32364+
32365+/*
32366+ * Free bootmem based on the e820 table for a node.
32367+ */
32368+void __init e820_bootmem_free(pg_data_t *pgdat, unsigned long start,unsigned long end)
32369+{
32370+ int i;
32371+ for (i = 0; i < e820.nr_map; i++) {
32372+ struct e820entry *ei = &e820.map[i];
32373+ unsigned long last, addr;
32374+
32375+ if (ei->type != E820_RAM ||
32376+ ei->addr+ei->size <= start ||
32377+ ei->addr >= end)
32378+ continue;
32379+
32380+ addr = round_up(ei->addr, PAGE_SIZE);
32381+ if (addr < start)
32382+ addr = start;
32383+
32384+ last = round_down(ei->addr + ei->size, PAGE_SIZE);
32385+ if (last >= end)
32386+ last = end;
32387+
32388+ if (last > addr && last-addr >= PAGE_SIZE)
32389+ free_bootmem_node(pgdat, addr, last-addr);
32390+ }
32391+}
32392+
32393+/*
32394+ * Find the highest page frame number we have available
32395+ */
32396+unsigned long __init e820_end_of_ram(void)
32397+{
32398+ int i;
32399+ unsigned long end_pfn = 0;
32400+
32401+ for (i = 0; i < e820.nr_map; i++) {
32402+ struct e820entry *ei = &e820.map[i];
32403+ unsigned long start, end;
32404+
32405+ start = round_up(ei->addr, PAGE_SIZE);
32406+ end = round_down(ei->addr + ei->size, PAGE_SIZE);
32407+ if (start >= end)
32408+ continue;
32409+ if (ei->type == E820_RAM) {
32410+ if (end > end_pfn<<PAGE_SHIFT)
32411+ end_pfn = end>>PAGE_SHIFT;
32412+ } else {
32413+ if (end > end_pfn_map<<PAGE_SHIFT)
32414+ end_pfn_map = end>>PAGE_SHIFT;
32415+ }
32416+ }
32417+
32418+ if (end_pfn > end_pfn_map)
32419+ end_pfn_map = end_pfn;
32420+ if (end_pfn_map > MAXMEM>>PAGE_SHIFT)
32421+ end_pfn_map = MAXMEM>>PAGE_SHIFT;
32422+ if (end_pfn > end_user_pfn)
32423+ end_pfn = end_user_pfn;
32424+ if (end_pfn > end_pfn_map)
32425+ end_pfn = end_pfn_map;
32426+
32427+ return end_pfn;
32428+}
32429+
32430+/*
32431+ * Compute how much memory is missing in a range.
32432+ * Unlike the other functions in this file the arguments are in page numbers.
32433+ */
32434+unsigned long __init
32435+e820_hole_size(unsigned long start_pfn, unsigned long end_pfn)
32436+{
32437+ unsigned long ram = 0;
32438+ unsigned long start = start_pfn << PAGE_SHIFT;
32439+ unsigned long end = end_pfn << PAGE_SHIFT;
32440+ int i;
32441+ for (i = 0; i < e820.nr_map; i++) {
32442+ struct e820entry *ei = &e820.map[i];
32443+ unsigned long last, addr;
32444+
32445+ if (ei->type != E820_RAM ||
32446+ ei->addr+ei->size <= start ||
32447+ ei->addr >= end)
32448+ continue;
32449+
32450+ addr = round_up(ei->addr, PAGE_SIZE);
32451+ if (addr < start)
32452+ addr = start;
32453+
32454+ last = round_down(ei->addr + ei->size, PAGE_SIZE);
32455+ if (last >= end)
32456+ last = end;
32457+
32458+ if (last > addr)
32459+ ram += last - addr;
32460+ }
32461+ return ((end - start) - ram) >> PAGE_SHIFT;
32462+}
32463+
32464+/*
32465+ * Mark e820 reserved areas as busy for the resource manager.
32466+ */
32467+void __init e820_reserve_resources(struct e820entry *e820, int nr_map)
32468+{
32469+ int i;
32470+ for (i = 0; i < nr_map; i++) {
32471+ struct resource *res;
32472+ res = alloc_bootmem_low(sizeof(struct resource));
32473+ switch (e820[i].type) {
32474+ case E820_RAM: res->name = "System RAM"; break;
32475+ case E820_ACPI: res->name = "ACPI Tables"; break;
32476+ case E820_NVS: res->name = "ACPI Non-volatile Storage"; break;
32477+ default: res->name = "reserved";
32478+ }
32479+ res->start = e820[i].addr;
32480+ res->end = res->start + e820[i].size - 1;
32481+ res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
32482+ request_resource(&iomem_resource, res);
32483+ if (e820[i].type == E820_RAM) {
32484+ /*
32485+ * We don't know which RAM region contains kernel data,
32486+ * so we try it repeatedly and let the resource manager
32487+ * test it.
32488+ */
32489+#ifndef CONFIG_XEN
32490+ request_resource(res, &code_resource);
32491+ request_resource(res, &data_resource);
32492+#endif
32493+#ifdef CONFIG_KEXEC
32494+ if (crashk_res.start != crashk_res.end)
32495+ request_resource(res, &crashk_res);
32496+#ifdef CONFIG_XEN
32497+ xen_machine_kexec_register_resources(res);
32498+#endif
32499+#endif
32500+ }
32501+ }
32502+}
32503+
32504+/*
32505+ * Add a memory region to the kernel e820 map.
32506+ */
32507+void __init add_memory_region(unsigned long start, unsigned long size, int type)
32508+{
32509+ int x = e820.nr_map;
32510+
32511+ if (x == E820MAX) {
32512+ printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
32513+ return;
32514+ }
32515+
32516+ e820.map[x].addr = start;
32517+ e820.map[x].size = size;
32518+ e820.map[x].type = type;
32519+ e820.nr_map++;
32520+}
32521+
32522+void __init e820_print_map(char *who)
32523+{
32524+ int i;
32525+
32526+ for (i = 0; i < e820.nr_map; i++) {
32527+ printk(" %s: %016Lx - %016Lx ", who,
32528+ (unsigned long long) e820.map[i].addr,
32529+ (unsigned long long) (e820.map[i].addr + e820.map[i].size));
32530+ switch (e820.map[i].type) {
32531+ case E820_RAM: printk("(usable)\n");
32532+ break;
32533+ case E820_RESERVED:
32534+ printk("(reserved)\n");
32535+ break;
32536+ case E820_ACPI:
32537+ printk("(ACPI data)\n");
32538+ break;
32539+ case E820_NVS:
32540+ printk("(ACPI NVS)\n");
32541+ break;
32542+ default: printk("type %u\n", e820.map[i].type);
32543+ break;
32544+ }
32545+ }
32546+}
32547+
32548+/*
32549+ * Sanitize the BIOS e820 map.
32550+ *
32551+ * Some e820 responses include overlapping entries. The following
32552+ * replaces the original e820 map with a new one, removing overlaps.
32553+ *
32554+ */
32555+static int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
32556+{
32557+ struct change_member {
32558+ struct e820entry *pbios; /* pointer to original bios entry */
32559+ unsigned long long addr; /* address for this change point */
32560+ };
32561+ static struct change_member change_point_list[2*E820MAX] __initdata;
32562+ static struct change_member *change_point[2*E820MAX] __initdata;
32563+ static struct e820entry *overlap_list[E820MAX] __initdata;
32564+ static struct e820entry new_bios[E820MAX] __initdata;
32565+ struct change_member *change_tmp;
32566+ unsigned long current_type, last_type;
32567+ unsigned long long last_addr;
32568+ int chgidx, still_changing;
32569+ int overlap_entries;
32570+ int new_bios_entry;
32571+ int old_nr, new_nr, chg_nr;
32572+ int i;
32573+
32574+ /*
32575+ Visually we're performing the following (1,2,3,4 = memory types)...
32576+
32577+ Sample memory map (w/overlaps):
32578+ ____22__________________
32579+ ______________________4_
32580+ ____1111________________
32581+ _44_____________________
32582+ 11111111________________
32583+ ____________________33__
32584+ ___________44___________
32585+ __________33333_________
32586+ ______________22________
32587+ ___________________2222_
32588+ _________111111111______
32589+ _____________________11_
32590+ _________________4______
32591+
32592+ Sanitized equivalent (no overlap):
32593+ 1_______________________
32594+ _44_____________________
32595+ ___1____________________
32596+ ____22__________________
32597+ ______11________________
32598+ _________1______________
32599+ __________3_____________
32600+ ___________44___________
32601+ _____________33_________
32602+ _______________2________
32603+ ________________1_______
32604+ _________________4______
32605+ ___________________2____
32606+ ____________________33__
32607+ ______________________4_
32608+ */
32609+
32610+ /* if there's only one memory region, don't bother */
32611+ if (*pnr_map < 2)
32612+ return -1;
32613+
32614+ old_nr = *pnr_map;
32615+
32616+ /* bail out if we find any unreasonable addresses in bios map */
32617+ for (i=0; i<old_nr; i++)
32618+ if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr)
32619+ return -1;
32620+
32621+ /* create pointers for initial change-point information (for sorting) */
32622+ for (i=0; i < 2*old_nr; i++)
32623+ change_point[i] = &change_point_list[i];
32624+
32625+ /* record all known change-points (starting and ending addresses),
32626+ omitting those that are for empty memory regions */
32627+ chgidx = 0;
32628+ for (i=0; i < old_nr; i++) {
32629+ if (biosmap[i].size != 0) {
32630+ change_point[chgidx]->addr = biosmap[i].addr;
32631+ change_point[chgidx++]->pbios = &biosmap[i];
32632+ change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size;
32633+ change_point[chgidx++]->pbios = &biosmap[i];
32634+ }
32635+ }
32636+ chg_nr = chgidx;
32637+
32638+ /* sort change-point list by memory addresses (low -> high) */
32639+ still_changing = 1;
32640+ while (still_changing) {
32641+ still_changing = 0;
32642+ for (i=1; i < chg_nr; i++) {
32643+ /* if <current_addr> > <last_addr>, swap */
32644+ /* or, if current=<start_addr> & last=<end_addr>, swap */
32645+ if ((change_point[i]->addr < change_point[i-1]->addr) ||
32646+ ((change_point[i]->addr == change_point[i-1]->addr) &&
32647+ (change_point[i]->addr == change_point[i]->pbios->addr) &&
32648+ (change_point[i-1]->addr != change_point[i-1]->pbios->addr))
32649+ )
32650+ {
32651+ change_tmp = change_point[i];
32652+ change_point[i] = change_point[i-1];
32653+ change_point[i-1] = change_tmp;
32654+ still_changing=1;
32655+ }
32656+ }
32657+ }
32658+
32659+ /* create a new bios memory map, removing overlaps */
32660+ overlap_entries=0; /* number of entries in the overlap table */
32661+ new_bios_entry=0; /* index for creating new bios map entries */
32662+ last_type = 0; /* start with undefined memory type */
32663+ last_addr = 0; /* start with 0 as last starting address */
32664+ /* loop through change-points, determining the effect on the new bios map */
32665+ for (chgidx=0; chgidx < chg_nr; chgidx++)
32666+ {
32667+ /* keep track of all overlapping bios entries */
32668+ if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr)
32669+ {
32670+ /* add map entry to overlap list (> 1 entry implies an overlap) */
32671+ overlap_list[overlap_entries++]=change_point[chgidx]->pbios;
32672+ }
32673+ else
32674+ {
32675+ /* remove entry from list (order independent, so swap with last) */
32676+ for (i=0; i<overlap_entries; i++)
32677+ {
32678+ if (overlap_list[i] == change_point[chgidx]->pbios)
32679+ overlap_list[i] = overlap_list[overlap_entries-1];
32680+ }
32681+ overlap_entries--;
32682+ }
32683+ /* if there are overlapping entries, decide which "type" to use */
32684+ /* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */
32685+ current_type = 0;
32686+ for (i=0; i<overlap_entries; i++)
32687+ if (overlap_list[i]->type > current_type)
32688+ current_type = overlap_list[i]->type;
32689+ /* continue building up new bios map based on this information */
32690+ if (current_type != last_type) {
32691+ if (last_type != 0) {
32692+ new_bios[new_bios_entry].size =
32693+ change_point[chgidx]->addr - last_addr;
32694+ /* move forward only if the new size was non-zero */
32695+ if (new_bios[new_bios_entry].size != 0)
32696+ if (++new_bios_entry >= E820MAX)
32697+ break; /* no more space left for new bios entries */
32698+ }
32699+ if (current_type != 0) {
32700+ new_bios[new_bios_entry].addr = change_point[chgidx]->addr;
32701+ new_bios[new_bios_entry].type = current_type;
32702+ last_addr=change_point[chgidx]->addr;
32703+ }
32704+ last_type = current_type;
32705+ }
32706+ }
32707+ new_nr = new_bios_entry; /* retain count for new bios entries */
32708+
32709+ /* copy new bios mapping into original location */
32710+ memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry));
32711+ *pnr_map = new_nr;
32712+
32713+ return 0;
32714+}
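A minimal userspace sketch (hypothetical, not part of this patch; struct region and type_at are illustration-only names) of the precedence rule the routine above enforces: wherever entries overlap, the highest-numbered type wins. The real code additionally merges adjacent spans of equal type.

#include <stdio.h>

struct region { unsigned long long start, end; int type; };	/* end is exclusive */

/* Highest type of any region covering address a (0 = not covered). */
static int type_at(const struct region *r, int n, unsigned long long a)
{
	int i, t = 0;

	for (i = 0; i < n; i++)
		if (a >= r[i].start && a < r[i].end && r[i].type > t)
			t = r[i].type;
	return t;
}

int main(void)
{
	/* 1 = usable RAM, 2 = reserved; the inputs overlap at [0x50000, 0x60000) */
	struct region in[] = {
		{ 0x00000, 0x60000, 1 },
		{ 0x50000, 0x80000, 2 },
	};
	/* every start and end address, i.e. the change points, already sorted */
	unsigned long long cuts[] = { 0x00000, 0x50000, 0x60000, 0x80000 };
	int i;

	for (i = 0; i < 3; i++)
		printf("%#llx - %#llx  type %d\n", cuts[i], cuts[i + 1],
		       type_at(in, 2, cuts[i]));
	return 0;
}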
32715+
32716+/*
32717+ * Copy the BIOS e820 map into a safe place.
32718+ *
32719+ * Sanity-check it while we're at it..
32720+ *
32721+ * If we're lucky and live on a modern system, the setup code
32722+ * will have given us a memory map that we can use to properly
32723+ * set up memory. If we aren't, we'll fake a memory map.
32724+ *
32725+ * We check to see that the memory map contains at least 2 elements
32726+ * before we'll use it, because the detection code in setup.S may
32727+ * not be perfect and most every PC known to man has two memory
32728+ * regions: one from 0 to 640k, and one from 1mb up. (The IBM
32729+ * thinkpad 560x, for example, does not cooperate with the memory
32730+ * detection code.)
32731+ */
32732+static int __init copy_e820_map(struct e820entry * biosmap, int nr_map)
32733+{
32734+#ifndef CONFIG_XEN
32735+ /* Only one memory region (or negative)? Ignore it */
32736+ if (nr_map < 2)
32737+ return -1;
32738+#else
32739+ BUG_ON(nr_map < 1);
32740+#endif
32741+
32742+ do {
32743+ unsigned long start = biosmap->addr;
32744+ unsigned long size = biosmap->size;
32745+ unsigned long end = start + size;
32746+ unsigned long type = biosmap->type;
32747+
32748+ /* Overflow in 64 bits? Ignore the memory map. */
32749+ if (start > end)
32750+ return -1;
32751+
32752+#ifndef CONFIG_XEN
32753+ /*
32754+ * Some BIOSes claim RAM in the 640k - 1M region.
32755+ * Not right. Fix it up.
32756+ *
32757+ * This should be removed on Hammer which is supposed to not
32758+ * have non e820 covered ISA mappings there, but I had some strange
32759+ * problems so it stays for now. -AK
32760+ */
32761+ if (type == E820_RAM) {
32762+ if (start < 0x100000ULL && end > 0xA0000ULL) {
32763+ if (start < 0xA0000ULL)
32764+ add_memory_region(start, 0xA0000ULL-start, type);
32765+ if (end <= 0x100000ULL)
32766+ continue;
32767+ start = 0x100000ULL;
32768+ size = end - start;
32769+ }
32770+ }
32771+#endif
32772+
32773+ add_memory_region(start, size, type);
32774+ } while (biosmap++,--nr_map);
32775+ return 0;
32776+}
32777+
32778+#ifndef CONFIG_XEN
32779+void __init setup_memory_region(void)
32780+{
32781+ char *who = "BIOS-e820";
32782+
32783+ /*
32784+ * Try to copy the BIOS-supplied E820-map.
32785+ *
32786+ * Otherwise fake a memory map; one section from 0k->640k,
32787+ * the next section from 1mb->appropriate_mem_k
32788+ */
32789+ sanitize_e820_map(E820_MAP, &E820_MAP_NR);
32790+ if (copy_e820_map(E820_MAP, E820_MAP_NR) < 0) {
32791+ unsigned long mem_size;
32792+
32793+ /* compare results from other methods and take the greater */
32794+ if (ALT_MEM_K < EXT_MEM_K) {
32795+ mem_size = EXT_MEM_K;
32796+ who = "BIOS-88";
32797+ } else {
32798+ mem_size = ALT_MEM_K;
32799+ who = "BIOS-e801";
32800+ }
32801+
32802+ e820.nr_map = 0;
32803+ add_memory_region(0, LOWMEMSIZE(), E820_RAM);
32804+ add_memory_region(HIGH_MEMORY, mem_size << 10, E820_RAM);
32805+ }
32806+ printk(KERN_INFO "BIOS-provided physical RAM map:\n");
32807+ e820_print_map(who);
32808+}
32809+
32810+#else /* CONFIG_XEN */
32811+
32812+void __init setup_memory_region(void)
32813+{
32814+ int rc;
32815+ struct xen_memory_map memmap;
32816+ /*
32817+ * This is rather large for a stack variable but this early in
32818+ * the boot process we know we have plenty of slack space.
32819+ */
32820+ struct e820entry map[E820MAX];
32821+
32822+ memmap.nr_entries = E820MAX;
32823+ set_xen_guest_handle(memmap.buffer, map);
32824+
32825+ rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap);
32826+ if ( rc == -ENOSYS ) {
32827+ memmap.nr_entries = 1;
32828+ map[0].addr = 0ULL;
32829+ map[0].size = xen_start_info->nr_pages << PAGE_SHIFT;
32830+ /* 8MB slack (to balance backend allocations). */
32831+ map[0].size += 8 << 20;
32832+ map[0].type = E820_RAM;
32833+ rc = 0;
32834+ }
32835+ BUG_ON(rc);
32836+
32837+ sanitize_e820_map(map, (char *)&memmap.nr_entries);
32838+
32839+ BUG_ON(copy_e820_map(map, (char)memmap.nr_entries) < 0);
32840+
32841+ printk(KERN_INFO "BIOS-provided physical RAM map:\n");
32842+ e820_print_map("Xen");
32843+}
32844+#endif
32845+
32846+void __init parse_memopt(char *p, char **from)
32847+{
32848+ int i;
32849+ unsigned long current_end;
32850+ unsigned long end;
32851+
32852+ end_user_pfn = memparse(p, from);
32853+ end_user_pfn >>= PAGE_SHIFT;
32854+
32855+ end = end_user_pfn<<PAGE_SHIFT;
32856+ i = e820.nr_map-1;
32857+ current_end = e820.map[i].addr + e820.map[i].size;
32858+
32859+ if (current_end < end) {
32860+ /*
32861+ * The e820 map ends before our requested size so
32862+ * extend the final entry to the requested address.
32863+ */
32864+ if (e820.map[i].type == E820_RAM)
32865+ e820.map[i].size = end - e820.map[i].addr;
32866+ else
32867+ add_memory_region(current_end, end - current_end, E820_RAM);
32868+ }
32869+}
32870+
32871+void __init parse_memmapopt(char *p, char **from)
32872+{
32873+ unsigned long long start_at, mem_size;
32874+
32875+ mem_size = memparse(p, from);
32876+ p = *from;
32877+ if (*p == '@') {
32878+ start_at = memparse(p+1, from);
32879+ add_memory_region(start_at, mem_size, E820_RAM);
32880+ } else if (*p == '#') {
32881+ start_at = memparse(p+1, from);
32882+ add_memory_region(start_at, mem_size, E820_ACPI);
32883+ } else if (*p == '$') {
32884+ start_at = memparse(p+1, from);
32885+ add_memory_region(start_at, mem_size, E820_RESERVED);
32886+ } else {
32887+ end_user_pfn = (mem_size >> PAGE_SHIFT);
32888+ }
32889+ p = *from;
32890+}
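The three separators handled above follow the memmap= boot-option convention: size@addr adds RAM, size#addr marks ACPI data, size$addr reserves the range, and a bare size caps memory like mem=. A hypothetical userspace sketch of the size parsing, with parse_size standing in for the kernel's memparse():

#include <stdio.h>
#include <stdlib.h>

static unsigned long long parse_size(const char *s, char **end)
{
	unsigned long long v = strtoull(s, end, 0);

	switch (**end) {		/* K/M/G suffixes, as memparse() accepts */
	case 'G': case 'g':
		v <<= 10;		/* fall through */
	case 'M': case 'm':
		v <<= 10;		/* fall through */
	case 'K': case 'k':
		v <<= 10;
		(*end)++;
		break;
	}
	return v;
}

int main(void)
{
	const char *arg = "64M@0x100000000";	/* as in memmap=64M@0x100000000 */
	char *p;
	unsigned long long size = parse_size(arg, &p);

	if (*p == '@')
		printf("add RAM region: start %#llx size %#llx\n",
		       strtoull(p + 1, NULL, 0), size);
	return 0;
}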
32891+
32892+unsigned long pci_mem_start = 0xaeedbabe;
32893+
32894+/*
32895+ * Search for the biggest gap in the low 32 bits of the e820
32896+ * memory space. We pass this space to PCI to assign MMIO resources
32897+ * for hotplug or unconfigured devices.
32898+ * Hopefully the BIOS left enough space.
32899+ */
32900+__init void e820_setup_gap(struct e820entry *e820, int nr_map)
32901+{
32902+ unsigned long gapstart, gapsize, round;
32903+ unsigned long last;
32904+ int i;
32905+ int found = 0;
32906+
32907+ last = 0x100000000ull;
32908+ gapstart = 0x10000000;
32909+ gapsize = 0x400000;
32910+ i = nr_map;
32911+ while (--i >= 0) {
32912+ unsigned long long start = e820[i].addr;
32913+ unsigned long long end = start + e820[i].size;
32914+
32915+ /*
32916+ * Since "last" is at most 4GB, we know we'll
32917+ * fit in 32 bits if this condition is true
32918+ */
32919+ if (last > end) {
32920+ unsigned long gap = last - end;
32921+
32922+ if (gap > gapsize) {
32923+ gapsize = gap;
32924+ gapstart = end;
32925+ found = 1;
32926+ }
32927+ }
32928+ if (start < last)
32929+ last = start;
32930+ }
32931+
32932+ if (!found) {
32933+ gapstart = (end_pfn << PAGE_SHIFT) + 1024*1024;
32934+ printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit address range\n"
32935+ KERN_ERR "PCI: Unassigned devices with 32bit resource registers may break!\n");
32936+ }
32937+
32938+ /*
32939+ * See how much we want to round up: start off with
32940+ * rounding to the next 1MB area.
32941+ */
32942+ round = 0x100000;
32943+ while ((gapsize >> 4) > round)
32944+ round += round;
32945+ /* Fun with two's complement */
32946+ pci_mem_start = (gapstart + round) & -round;
32947+
32948+ printk(KERN_INFO "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n",
32949+ pci_mem_start, gapstart, gapsize);
32950+}
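A hypothetical userspace sketch (toy map and names assumed for illustration) of the same scan: walk the ranges from the top down, keep the largest hole below 4GB, then round the window start the way the code above does.

#include <stdio.h>

struct range { unsigned long long start, end; };	/* end exclusive */

int main(void)
{
	/* toy memory map, sorted low -> high */
	struct range map[] = { { 0x0, 0xa0000 }, { 0x100000, 0xc0000000ULL } };
	unsigned long long last = 0x100000000ULL;	/* scan down from 4GB */
	unsigned long long gapstart = 0x10000000, gapsize = 0x400000, round;
	int i;

	for (i = 1; i >= 0; i--) {
		if (last > map[i].end && last - map[i].end > gapsize) {
			gapsize = last - map[i].end;
			gapstart = map[i].end;
		}
		if (map[i].start < last)
			last = map[i].start;
	}

	/* round up: at least 1MB, growing to roughly 1/16th of the gap */
	round = 0x100000;
	while ((gapsize >> 4) > round)
		round += round;
	printf("PCI window at %#llx (gap %#llx:%#llx)\n",
	       (gapstart + round) & -round, gapstart, gapsize);
	return 0;
}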
32951diff -Nur linux-2.6.16.33-noxen/arch/x86_64/kernel/early_printk-xen.c linux-2.6.16.33/arch/x86_64/kernel/early_printk-xen.c
32952--- linux-2.6.16.33-noxen/arch/x86_64/kernel/early_printk-xen.c 1970-01-01 00:00:00.000000000 +0000
32953+++ linux-2.6.16.33/arch/x86_64/kernel/early_printk-xen.c 2007-01-08 15:00:45.000000000 +0000
32954@@ -0,0 +1,306 @@
32955+#include <linux/config.h>
32956+#include <linux/console.h>
32957+#include <linux/kernel.h>
32958+#include <linux/init.h>
32959+#include <linux/string.h>
32960+#include <linux/tty.h>
32961+#include <asm/io.h>
32962+#include <asm/processor.h>
32963+#include <asm/fcntl.h>
32964+
32965+/* Simple VGA output */
32966+
32967+#ifdef __i386__
32968+#include <asm/setup.h>
32969+#define VGABASE (__ISA_IO_base + 0xb8000)
32970+#else
32971+#include <asm/bootsetup.h>
32972+#define VGABASE ((void __iomem *)0xffffffff800b8000UL)
32973+#endif
32974+
32975+#define MAX_YPOS max_ypos
32976+#define MAX_XPOS max_xpos
32977+
32978+static int max_ypos = 25, max_xpos = 80;
32979+
32980+#ifndef CONFIG_XEN
32981+static int current_ypos = 1, current_xpos = 0;
32982+
32983+static void early_vga_write(struct console *con, const char *str, unsigned n)
32984+{
32985+ char c;
32986+ int i, k, j;
32987+
32988+ while ((c = *str++) != '\0' && n-- > 0) {
32989+ if (current_ypos >= MAX_YPOS) {
32990+ /* scroll 1 line up */
32991+ for (k = 1, j = 0; k < MAX_YPOS; k++, j++) {
32992+ for (i = 0; i < MAX_XPOS; i++) {
32993+ writew(readw(VGABASE + 2*(MAX_XPOS*k + i)),
32994+ VGABASE + 2*(MAX_XPOS*j + i));
32995+ }
32996+ }
32997+ for (i = 0; i < MAX_XPOS; i++)
32998+ writew(0x720, VGABASE + 2*(MAX_XPOS*j + i));
32999+ current_ypos = MAX_YPOS-1;
33000+ }
33001+ if (c == '\n') {
33002+ current_xpos = 0;
33003+ current_ypos++;
33004+ } else if (c != '\r') {
33005+ writew(((0x7 << 8) | (unsigned short) c),
33006+ VGABASE + 2*(MAX_XPOS*current_ypos +
33007+ current_xpos++));
33008+ if (current_xpos >= MAX_XPOS) {
33009+ current_xpos = 0;
33010+ current_ypos++;
33011+ }
33012+ }
33013+ }
33014+}
33015+
33016+static struct console early_vga_console = {
33017+ .name = "earlyvga",
33018+ .write = early_vga_write,
33019+ .flags = CON_PRINTBUFFER,
33020+ .index = -1,
33021+};
33022+
33023+/* Serial functions loosely based on a similar package from Klaus P. Gerlicher */
33024+
33025+static int early_serial_base = 0x3f8; /* ttyS0 */
33026+
33027+#define XMTRDY 0x20
33028+
33029+#define DLAB 0x80
33030+
33031+#define TXR 0 /* Transmit register (WRITE) */
33032+#define RXR 0 /* Receive register (READ) */
33033+#define IER 1 /* Interrupt Enable */
33034+#define IIR 2 /* Interrupt ID */
33035+#define FCR 2 /* FIFO control */
33036+#define LCR 3 /* Line control */
33037+#define MCR 4 /* Modem control */
33038+#define LSR 5 /* Line Status */
33039+#define MSR 6 /* Modem Status */
33040+#define DLL 0 /* Divisor Latch Low */
33041+#define DLH 1 /* Divisor latch High */
33042+
33043+static int early_serial_putc(unsigned char ch)
33044+{
33045+ unsigned timeout = 0xffff;
33046+ while ((inb(early_serial_base + LSR) & XMTRDY) == 0 && --timeout)
33047+ cpu_relax();
33048+ outb(ch, early_serial_base + TXR);
33049+ return timeout ? 0 : -1;
33050+}
33051+
33052+static void early_serial_write(struct console *con, const char *s, unsigned n)
33053+{
33054+ while (*s && n-- > 0) {
33055+ early_serial_putc(*s);
33056+ if (*s == '\n')
33057+ early_serial_putc('\r');
33058+ s++;
33059+ }
33060+}
33061+
33062+#define DEFAULT_BAUD 9600
33063+
33064+static __init void early_serial_init(char *s)
33065+{
33066+ unsigned char c;
33067+ unsigned divisor;
33068+ unsigned baud = DEFAULT_BAUD;
33069+ char *e;
33070+
33071+ if (*s == ',')
33072+ ++s;
33073+
33074+ if (*s) {
33075+ unsigned port;
33076+ if (!strncmp(s,"0x",2)) {
33077+ early_serial_base = simple_strtoul(s, &e, 16);
33078+ } else {
33079+ static int bases[] = { 0x3f8, 0x2f8 };
33080+
33081+ if (!strncmp(s,"ttyS",4))
33082+ s += 4;
33083+ port = simple_strtoul(s, &e, 10);
33084+ if (port > 1 || s == e)
33085+ port = 0;
33086+ early_serial_base = bases[port];
33087+ }
33088+ s += strcspn(s, ",");
33089+ if (*s == ',')
33090+ s++;
33091+ }
33092+
33093+ outb(0x3, early_serial_base + LCR); /* 8n1 */
33094+ outb(0, early_serial_base + IER); /* no interrupt */
33095+ outb(0, early_serial_base + FCR); /* no fifo */
33096+ outb(0x3, early_serial_base + MCR); /* DTR + RTS */
33097+
33098+ if (*s) {
33099+ baud = simple_strtoul(s, &e, 0);
33100+ if (baud == 0 || s == e)
33101+ baud = DEFAULT_BAUD;
33102+ }
33103+
33104+ divisor = 115200 / baud;
33105+ c = inb(early_serial_base + LCR);
33106+ outb(c | DLAB, early_serial_base + LCR);
33107+ outb(divisor & 0xff, early_serial_base + DLL);
33108+ outb((divisor >> 8) & 0xff, early_serial_base + DLH);
33109+ outb(c & ~DLAB, early_serial_base + LCR);
33110+}
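For reference, the divisor latch value programmed through DLL/DLH above is simply 115200 divided by the requested baud rate; a trivial standalone check (illustration only):

#include <stdio.h>

int main(void)
{
	unsigned bauds[] = { 115200, 57600, 38400, 9600 };
	unsigned i;

	for (i = 0; i < sizeof(bauds) / sizeof(bauds[0]); i++)
		printf("baud %6u -> divisor %u\n", bauds[i], 115200 / bauds[i]);
	return 0;
}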
33111+
33112+#else /* CONFIG_XEN */
33113+
33114+#undef SCREEN_INFO
33115+#define SCREEN_INFO screen_info
33116+extern struct screen_info screen_info;
33117+
33118+static void
33119+early_serial_write(struct console *con, const char *s, unsigned count)
33120+{
33121+ int n;
33122+
33123+ while (count > 0) {
33124+ n = HYPERVISOR_console_io(CONSOLEIO_write, count, (char *)s);
33125+ if (n <= 0)
33126+ break;
33127+ count -= n;
33128+ s += n;
33129+ }
33130+}
33131+
33132+static __init void early_serial_init(char *s)
33133+{
33134+}
33135+
33136+/*
33137+ * No early VGA console on Xen, as we do not have convenient ISA-space
33138+ * mappings. Someone should fix this for domain 0. For now, use fake serial.
33139+ */
33140+#define early_vga_console early_serial_console
33141+
33142+#endif
33143+
33144+static struct console early_serial_console = {
33145+ .name = "earlyser",
33146+ .write = early_serial_write,
33147+ .flags = CON_PRINTBUFFER,
33148+ .index = -1,
33149+};
33150+
33151+/* Console interface to a host file on AMD's SimNow! */
33152+
33153+static int simnow_fd;
33154+
33155+enum {
33156+ MAGIC1 = 0xBACCD00A,
33157+ MAGIC2 = 0xCA110000,
33158+ XOPEN = 5,
33159+ XWRITE = 4,
33160+};
33161+
33162+static noinline long simnow(long cmd, long a, long b, long c)
33163+{
33164+ long ret;
33165+ asm volatile("cpuid" :
33166+ "=a" (ret) :
33167+ "b" (a), "c" (b), "d" (c), "0" (MAGIC1), "D" (cmd + MAGIC2));
33168+ return ret;
33169+}
33170+
33171+void __init simnow_init(char *str)
33172+{
33173+ char *fn = "klog";
33174+ if (*str == '=')
33175+ fn = ++str;
33176+ /* error ignored */
33177+ simnow_fd = simnow(XOPEN, (unsigned long)fn, O_WRONLY|O_APPEND|O_CREAT, 0644);
33178+}
33179+
33180+static void simnow_write(struct console *con, const char *s, unsigned n)
33181+{
33182+ simnow(XWRITE, simnow_fd, (unsigned long)s, n);
33183+}
33184+
33185+static struct console simnow_console = {
33186+ .name = "simnow",
33187+ .write = simnow_write,
33188+ .flags = CON_PRINTBUFFER,
33189+ .index = -1,
33190+};
33191+
33192+/* Direct interface for emergencies */
33193+struct console *early_console = &early_vga_console;
33194+static int early_console_initialized = 0;
33195+
33196+void early_printk(const char *fmt, ...)
33197+{
33198+ char buf[512];
33199+ int n;
33200+ va_list ap;
33201+
33202+ va_start(ap,fmt);
33203+ n = vscnprintf(buf,512,fmt,ap);
33204+ early_console->write(early_console,buf,n);
33205+ va_end(ap);
33206+}
33207+
33208+static int __initdata keep_early;
33209+
33210+int __init setup_early_printk(char *opt)
33211+{
33212+ char *space;
33213+ char buf[256];
33214+
33215+ if (early_console_initialized)
33216+ return -1;
33217+
33218+ strlcpy(buf,opt,sizeof(buf));
33219+ space = strchr(buf, ' ');
33220+ if (space)
33221+ *space = 0;
33222+
33223+ if (strstr(buf,"keep"))
33224+ keep_early = 1;
33225+
33226+ if (!strncmp(buf, "serial", 6)) {
33227+ early_serial_init(buf + 6);
33228+ early_console = &early_serial_console;
33229+ } else if (!strncmp(buf, "ttyS", 4)) {
33230+ early_serial_init(buf);
33231+ early_console = &early_serial_console;
33232+ } else if (!strncmp(buf, "vga", 3)
33233+ && SCREEN_INFO.orig_video_isVGA == 1) {
33234+ max_xpos = SCREEN_INFO.orig_video_cols;
33235+ max_ypos = SCREEN_INFO.orig_video_lines;
33236+ early_console = &early_vga_console;
33237+ } else if (!strncmp(buf, "simnow", 6)) {
33238+ simnow_init(buf + 6);
33239+ early_console = &simnow_console;
33240+ keep_early = 1;
33241+ }
33242+ early_console_initialized = 1;
33243+ register_console(early_console);
33244+ return 0;
33245+}
33246+
33247+void __init disable_early_printk(void)
33248+{
33249+ if (!early_console_initialized || !early_console)
33250+ return;
33251+ if (!keep_early) {
33252+ printk("disabling early console\n");
33253+ unregister_console(early_console);
33254+ early_console_initialized = 0;
33255+ } else {
33256+ printk("keeping early console\n");
33257+ }
33258+}
33259+
33260+__setup("earlyprintk=", setup_early_printk);
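As parsed by setup_early_printk() above, the boot option takes forms such as earlyprintk=vga, earlyprintk=ttyS0,115200 or earlyprintk=serial,ttyS0,115200,keep; the trailing ,keep leaves the early console registered instead of letting disable_early_printk() unregister it once the real console is up.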
33261diff -Nur linux-2.6.16.33-noxen/arch/x86_64/kernel/entry-xen.S linux-2.6.16.33/arch/x86_64/kernel/entry-xen.S
33262--- linux-2.6.16.33-noxen/arch/x86_64/kernel/entry-xen.S 1970-01-01 00:00:00.000000000 +0000
33263+++ linux-2.6.16.33/arch/x86_64/kernel/entry-xen.S 2007-01-08 15:00:45.000000000 +0000
33264@@ -0,0 +1,1136 @@
33265+/*
33266+ * linux/arch/x86_64/entry.S
33267+ *
33268+ * Copyright (C) 1991, 1992 Linus Torvalds
33269+ * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs
33270+ * Copyright (C) 2000 Pavel Machek <pavel@suse.cz>
33271+ *
33272+ * $Id$
33273+ *
33274+ * Jun Nakajima <jun.nakajima@intel.com>
33275+ * Asit Mallick <asit.k.mallick@intel.com>
33276+ * Modified for Xen
33277+ */
33278+
33279+/*
33280+ * entry.S contains the system-call and fault low-level handling routines.
33281+ *
33282+ * NOTE: This code handles signal-recognition, which happens every time
33283+ * after an interrupt and after each system call.
33284+ *
33285+ * Normal syscalls and interrupts don't save a full stack frame, this is
33286+ * only done for syscall tracing, signals or fork/exec et.al.
33287+ *
33288+ * A note on terminology:
33289+ * - top of stack: Architecture defined interrupt frame from SS to RIP
33290+ * at the top of the kernel process stack.
33291+ * - partial stack frame: partially saved registers up to R11.
33292+ * - full stack frame: Like partial stack frame, but all registers saved.
33293+ *
33294+ * TODO:
33295+ * - schedule it carefully for the final hardware.
33296+ */
33297+
33298+#define ASSEMBLY 1
33299+#include <linux/config.h>
33300+#ifdef CONFIG_DEBUG_INFO
33301+#undef CONFIG_DEBUG_INFO
33302+#endif
33303+#include <linux/linkage.h>
33304+#include <asm/segment.h>
33305+#include <asm/smp.h>
33306+#include <asm/cache.h>
33307+#include <asm/errno.h>
33308+#include <asm/dwarf2.h>
33309+#include <asm/calling.h>
33310+#include <asm/asm-offsets.h>
33311+#include <asm/msr.h>
33312+#include <asm/unistd.h>
33313+#include <asm/thread_info.h>
33314+#include <asm/hw_irq.h>
33315+#include <asm/page.h>
33316+#include <asm/errno.h>
33317+#include <xen/interface/arch-x86_64.h>
33318+#include <xen/interface/features.h>
33319+
33320+#include "irq_vectors.h"
33321+
33322+#include "xen_entry.S"
33323+
33324+ .code64
33325+
33326+#ifndef CONFIG_PREEMPT
33327+#define retint_kernel retint_restore_args
33328+#endif
33329+
33330+NMI_MASK = 0x80000000
33331+
33332+/*
33333+ * C code is not supposed to know about undefined top of stack. Every time
33334+ * a C function with a pt_regs argument is called from the SYSCALL based
33335+ * fast path FIXUP_TOP_OF_STACK is needed.
33336+ * RESTORE_TOP_OF_STACK syncs the syscall state after any possible ptregs
33337+ * manipulation.
33338+ */
33339+
33340+ /* %rsp:at FRAMEEND */
33341+ .macro FIXUP_TOP_OF_STACK tmp
33342+ movq $__USER_CS,CS(%rsp)
33343+ movq $-1,RCX(%rsp)
33344+ .endm
33345+
33346+ .macro RESTORE_TOP_OF_STACK tmp,offset=0
33347+ .endm
33348+
33349+ .macro FAKE_STACK_FRAME child_rip
33350+ /* push in order ss, rsp, eflags, cs, rip */
33351+ xorl %eax, %eax
33352+ pushq %rax /* ss */
33353+ CFI_ADJUST_CFA_OFFSET 8
33354+ /*CFI_REL_OFFSET ss,0*/
33355+ pushq %rax /* rsp */
33356+ CFI_ADJUST_CFA_OFFSET 8
33357+ CFI_REL_OFFSET rsp,0
33358+ pushq $(1<<9) /* eflags - interrupts on */
33359+ CFI_ADJUST_CFA_OFFSET 8
33360+ /*CFI_REL_OFFSET rflags,0*/
33361+ pushq $__KERNEL_CS /* cs */
33362+ CFI_ADJUST_CFA_OFFSET 8
33363+ /*CFI_REL_OFFSET cs,0*/
33364+ pushq \child_rip /* rip */
33365+ CFI_ADJUST_CFA_OFFSET 8
33366+ CFI_REL_OFFSET rip,0
33367+ pushq %rax /* orig rax */
33368+ CFI_ADJUST_CFA_OFFSET 8
33369+ .endm
33370+
33371+ .macro UNFAKE_STACK_FRAME
33372+ addq $8*6, %rsp
33373+ CFI_ADJUST_CFA_OFFSET -(6*8)
33374+ .endm
33375+
33376+ .macro CFI_DEFAULT_STACK start=1
33377+ .if \start
33378+ CFI_STARTPROC simple
33379+ CFI_DEF_CFA rsp,SS+8
33380+ .else
33381+ CFI_DEF_CFA_OFFSET SS+8
33382+ .endif
33383+ CFI_REL_OFFSET r15,R15
33384+ CFI_REL_OFFSET r14,R14
33385+ CFI_REL_OFFSET r13,R13
33386+ CFI_REL_OFFSET r12,R12
33387+ CFI_REL_OFFSET rbp,RBP
33388+ CFI_REL_OFFSET rbx,RBX
33389+ CFI_REL_OFFSET r11,R11
33390+ CFI_REL_OFFSET r10,R10
33391+ CFI_REL_OFFSET r9,R9
33392+ CFI_REL_OFFSET r8,R8
33393+ CFI_REL_OFFSET rax,RAX
33394+ CFI_REL_OFFSET rcx,RCX
33395+ CFI_REL_OFFSET rdx,RDX
33396+ CFI_REL_OFFSET rsi,RSI
33397+ CFI_REL_OFFSET rdi,RDI
33398+ CFI_REL_OFFSET rip,RIP
33399+ /*CFI_REL_OFFSET cs,CS*/
33400+ /*CFI_REL_OFFSET rflags,EFLAGS*/
33401+ CFI_REL_OFFSET rsp,RSP
33402+ /*CFI_REL_OFFSET ss,SS*/
33403+ .endm
33404+
33405+ /*
33406+ * Must be consistent with the definition in arch-x86_64.h:
33407+ * struct iret_context {
33408+ * u64 rax, r11, rcx, flags, rip, cs, rflags, rsp, ss;
33409+ * };
33410+ * #define VGCF_IN_SYSCALL (1<<8)
33411+ */
33412+ .macro HYPERVISOR_IRET flag
33413+ testb $3,1*8(%rsp)
33414+ jnz 2f
33415+ testl $NMI_MASK,2*8(%rsp)
33416+ jnz 2f
33417+
33418+ testb $1,(xen_features+XENFEAT_supervisor_mode_kernel)
33419+ jnz 1f
33420+
33421+ /* Direct iret to kernel space. Correct CS and SS. */
33422+ orb $3,1*8(%rsp)
33423+ orb $3,4*8(%rsp)
33424+1: iretq
33425+
33426+2: /* Slow iret via hypervisor. */
33427+ andl $~NMI_MASK, 16(%rsp)
33428+ pushq $\flag
33429+ jmp hypercall_page + (__HYPERVISOR_iret * 32)
33430+ .endm
33431+
33432+ .macro SWITCH_TO_KERNEL ssoff,adjust=0
33433+ jc 1f
33434+ orb $1,\ssoff-\adjust+4(%rsp)
33435+1:
33436+ .endm
33437+
33438+/*
33439+ * A newly forked process directly context switches into this.
33440+ */
33441+/* rdi: prev */
33442+ENTRY(ret_from_fork)
33443+ CFI_DEFAULT_STACK
33444+ call schedule_tail
33445+ GET_THREAD_INFO(%rcx)
33446+ testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%rcx)
33447+ jnz rff_trace
33448+rff_action:
33449+ RESTORE_REST
33450+ testl $3,CS-ARGOFFSET(%rsp) # from kernel_thread?
33451+ je int_ret_from_sys_call
33452+ testl $_TIF_IA32,threadinfo_flags(%rcx)
33453+ jnz int_ret_from_sys_call
33454+ RESTORE_TOP_OF_STACK %rdi,ARGOFFSET
33455+ jmp ret_from_sys_call
33456+rff_trace:
33457+ movq %rsp,%rdi
33458+ call syscall_trace_leave
33459+ GET_THREAD_INFO(%rcx)
33460+ jmp rff_action
33461+ CFI_ENDPROC
33462+
33463+/*
33464+ * System call entry. Up to 6 arguments in registers are supported.
33465+ *
33466+ * SYSCALL does not save anything on the stack and does not change the
33467+ * stack pointer.
33468+ */
33469+
33470+/*
33471+ * Register setup:
33472+ * rax system call number
33473+ * rdi arg0
33474+ * rcx return address for syscall/sysret, C arg3
33475+ * rsi arg1
33476+ * rdx arg2
33477+ * r10 arg3 (--> moved to rcx for C)
33478+ * r8 arg4
33479+ * r9 arg5
33480+ * r11 eflags for syscall/sysret, temporary for C
33481+ * r12-r15,rbp,rbx saved by C code, not touched.
33482+ *
33483+ * Interrupts are off on entry.
33484+ * Only called from user space.
33485+ *
33486+ * XXX if we had a free scratch register we could save the RSP into the stack frame
33487+ * and report it properly in ps. Unfortunately we haven't.
33488+ *
33489+ * When the user can change the frames, always force IRET. That is because
33490+ * it deals with non-canonical addresses better. SYSRET has trouble
33491+ * with them due to bugs in both AMD and Intel CPUs.
33492+ */
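A hypothetical userspace illustration of the register convention listed above (raw_syscall3 is an assumed name, not something this patch provides): the syscall number travels in rax, the first three arguments in rdi/rsi/rdx, and rcx/r11 are clobbered by the SYSCALL instruction itself, which is why they need special handling in the entry code below.

static long raw_syscall3(long nr, long a1, long a2, long a3)
{
	long ret;

	asm volatile("syscall"
		     : "=a" (ret)
		     : "0" (nr), "D" (a1), "S" (a2), "d" (a3)
		     : "rcx", "r11", "memory");
	return ret;
}

int main(void)
{
	static const char msg[] = "hello from raw SYSCALL\n";

	return raw_syscall3(1, 1, (long)msg, sizeof(msg) - 1) < 0;	/* 1 == __NR_write */
}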
33493+
33494+ENTRY(system_call)
33495+ CFI_STARTPROC simple
33496+ CFI_DEF_CFA rsp,0
33497+ CFI_REGISTER rip,rcx
33498+ /*CFI_REGISTER rflags,r11*/
33499+ SAVE_ARGS -8,0
33500+ movq %rax,ORIG_RAX-ARGOFFSET(%rsp)
33501+ XEN_UNBLOCK_EVENTS(%r11)
33502+ GET_THREAD_INFO(%rcx)
33503+ testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%rcx)
33504+ CFI_REMEMBER_STATE
33505+ jnz tracesys
33506+ cmpq $__NR_syscall_max,%rax
33507+ ja badsys
33508+ movq %r10,%rcx
33509+ call *sys_call_table(,%rax,8) # XXX: rip relative
33510+ movq %rax,RAX-ARGOFFSET(%rsp)
33511+/*
33512+ * Syscall return path ending with SYSRET (fast path)
33513+ * Has incomplete stack frame and undefined top of stack.
33514+ */
33515+ .globl ret_from_sys_call
33516+ret_from_sys_call:
33517+ movl $_TIF_ALLWORK_MASK,%edi
33518+ /* edi: flagmask */
33519+sysret_check:
33520+ GET_THREAD_INFO(%rcx)
33521+ XEN_BLOCK_EVENTS(%rsi)
33522+ movl threadinfo_flags(%rcx),%edx
33523+ andl %edi,%edx
33524+ CFI_REMEMBER_STATE
33525+ jnz sysret_careful
33526+ XEN_UNBLOCK_EVENTS(%rsi)
33527+ CFI_REGISTER rip,rcx
33528+ RESTORE_ARGS 0,8,0
33529+ /*CFI_REGISTER rflags,r11*/
33530+ HYPERVISOR_IRET VGCF_IN_SYSCALL
33531+
33532+ /* Handle reschedules */
33533+ /* edx: work, edi: workmask */
33534+sysret_careful:
33535+ CFI_RESTORE_STATE
33536+ bt $TIF_NEED_RESCHED,%edx
33537+ jnc sysret_signal
33538+ XEN_UNBLOCK_EVENTS(%rsi)
33539+ pushq %rdi
33540+ CFI_ADJUST_CFA_OFFSET 8
33541+ call schedule
33542+ popq %rdi
33543+ CFI_ADJUST_CFA_OFFSET -8
33544+ jmp sysret_check
33545+
33546+ /* Handle a signal */
33547+sysret_signal:
33548+/* sti */
33549+ XEN_UNBLOCK_EVENTS(%rsi)
33550+ testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx
33551+ jz 1f
33552+
33553+ /* Really a signal */
33554+ /* edx: work flags (arg3) */
33555+ leaq do_notify_resume(%rip),%rax
33556+ leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1
33557+ xorl %esi,%esi # oldset -> arg2
33558+ call ptregscall_common
33559+1: movl $_TIF_NEED_RESCHED,%edi
33560+ /* Use IRET because user could have changed frame. This
33561+ works because ptregscall_common has called FIXUP_TOP_OF_STACK. */
33562+ XEN_BLOCK_EVENTS(%rsi)
33563+ jmp int_with_check
33564+
33565+badsys:
33566+ movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
33567+ jmp ret_from_sys_call
33568+
33569+ /* Do syscall tracing */
33570+tracesys:
33571+ CFI_RESTORE_STATE
33572+ SAVE_REST
33573+ movq $-ENOSYS,RAX(%rsp)
33574+ FIXUP_TOP_OF_STACK %rdi
33575+ movq %rsp,%rdi
33576+ call syscall_trace_enter
33577+ LOAD_ARGS ARGOFFSET /* reload args from stack in case ptrace changed it */
33578+ RESTORE_REST
33579+ cmpq $__NR_syscall_max,%rax
33580+ ja 1f
33581+ movq %r10,%rcx /* fixup for C */
33582+ call *sys_call_table(,%rax,8)
33583+1: movq %rax,RAX-ARGOFFSET(%rsp)
33584+ /* Use IRET because user could have changed frame */
33585+ jmp int_ret_from_sys_call
33586+ CFI_ENDPROC
33587+
33588+/*
33589+ * Syscall return path ending with IRET.
33590+ * Has correct top of stack, but partial stack frame.
33591+ */
33592+ENTRY(int_ret_from_sys_call)
33593+ CFI_STARTPROC simple
33594+ CFI_DEF_CFA rsp,SS+8-ARGOFFSET
33595+ /*CFI_REL_OFFSET ss,SS-ARGOFFSET*/
33596+ CFI_REL_OFFSET rsp,RSP-ARGOFFSET
33597+ /*CFI_REL_OFFSET rflags,EFLAGS-ARGOFFSET*/
33598+ /*CFI_REL_OFFSET cs,CS-ARGOFFSET*/
33599+ CFI_REL_OFFSET rip,RIP-ARGOFFSET
33600+ CFI_REL_OFFSET rdx,RDX-ARGOFFSET
33601+ CFI_REL_OFFSET rcx,RCX-ARGOFFSET
33602+ CFI_REL_OFFSET rax,RAX-ARGOFFSET
33603+ CFI_REL_OFFSET rdi,RDI-ARGOFFSET
33604+ CFI_REL_OFFSET rsi,RSI-ARGOFFSET
33605+ CFI_REL_OFFSET r8,R8-ARGOFFSET
33606+ CFI_REL_OFFSET r9,R9-ARGOFFSET
33607+ CFI_REL_OFFSET r10,R10-ARGOFFSET
33608+ CFI_REL_OFFSET r11,R11-ARGOFFSET
33609+ XEN_BLOCK_EVENTS(%rsi)
33610+ testb $3,CS-ARGOFFSET(%rsp)
33611+ jnz 1f
33612+ /* Need to set the proper %ss (not NULL) for ring 3 iretq */
33613+ movl $__KERNEL_DS,SS-ARGOFFSET(%rsp)
33614+ jmp retint_restore_args # return from ring3 kernel
33615+1:
33616+ movl $_TIF_ALLWORK_MASK,%edi
33617+ /* edi: mask to check */
33618+int_with_check:
33619+ GET_THREAD_INFO(%rcx)
33620+ movl threadinfo_flags(%rcx),%edx
33621+ andl %edi,%edx
33622+ jnz int_careful
33623+ andl $~TS_COMPAT,threadinfo_status(%rcx)
33624+ jmp retint_restore_args
33625+
33626+ /* Either reschedule or signal or syscall exit tracking needed. */
33627+ /* First do a reschedule test. */
33628+ /* edx: work, edi: workmask */
33629+int_careful:
33630+ bt $TIF_NEED_RESCHED,%edx
33631+ jnc int_very_careful
33632+/* sti */
33633+ XEN_UNBLOCK_EVENTS(%rsi)
33634+ pushq %rdi
33635+ CFI_ADJUST_CFA_OFFSET 8
33636+ call schedule
33637+ popq %rdi
33638+ CFI_ADJUST_CFA_OFFSET -8
33639+ XEN_BLOCK_EVENTS(%rsi)
33640+ jmp int_with_check
33641+
33642+ /* handle signals and tracing -- both require a full stack frame */
33643+int_very_careful:
33644+/* sti */
33645+ XEN_UNBLOCK_EVENTS(%rsi)
33646+ SAVE_REST
33647+ /* Check for syscall exit trace */
33648+ testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edx
33649+ jz int_signal
33650+ pushq %rdi
33651+ CFI_ADJUST_CFA_OFFSET 8
33652+ leaq 8(%rsp),%rdi # &ptregs -> arg1
33653+ call syscall_trace_leave
33654+ popq %rdi
33655+ CFI_ADJUST_CFA_OFFSET -8
33656+ andl $~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edi
33657+ XEN_BLOCK_EVENTS(%rsi)
33658+ jmp int_restore_rest
33659+
33660+int_signal:
33661+ testl $(_TIF_NOTIFY_RESUME|_TIF_SIGPENDING|_TIF_SINGLESTEP),%edx
33662+ jz 1f
33663+ movq %rsp,%rdi # &ptregs -> arg1
33664+ xorl %esi,%esi # oldset -> arg2
33665+ call do_notify_resume
33666+1: movl $_TIF_NEED_RESCHED,%edi
33667+int_restore_rest:
33668+ RESTORE_REST
33669+ XEN_BLOCK_EVENTS(%rsi)
33670+ jmp int_with_check
33671+ CFI_ENDPROC
33672+
33673+/*
33674+ * Certain special system calls need to save a full stack frame.
33675+ */
33676+
33677+ .macro PTREGSCALL label,func,arg
33678+ .globl \label
33679+\label:
33680+ leaq \func(%rip),%rax
33681+ leaq -ARGOFFSET+8(%rsp),\arg /* 8 for return address */
33682+ jmp ptregscall_common
33683+ .endm
33684+
33685+ CFI_STARTPROC
33686+
33687+ PTREGSCALL stub_clone, sys_clone, %r8
33688+ PTREGSCALL stub_fork, sys_fork, %rdi
33689+ PTREGSCALL stub_vfork, sys_vfork, %rdi
33690+ PTREGSCALL stub_rt_sigsuspend, sys_rt_sigsuspend, %rdx
33691+ PTREGSCALL stub_sigaltstack, sys_sigaltstack, %rdx
33692+ PTREGSCALL stub_iopl, sys_iopl, %rsi
33693+
33694+ENTRY(ptregscall_common)
33695+ popq %r11
33696+ CFI_ADJUST_CFA_OFFSET -8
33697+ CFI_REGISTER rip, r11
33698+ SAVE_REST
33699+ movq %r11, %r15
33700+ CFI_REGISTER rip, r15
33701+ FIXUP_TOP_OF_STACK %r11
33702+ call *%rax
33703+ RESTORE_TOP_OF_STACK %r11
33704+ movq %r15, %r11
33705+ CFI_REGISTER rip, r11
33706+ RESTORE_REST
33707+ pushq %r11
33708+ CFI_ADJUST_CFA_OFFSET 8
33709+ CFI_REL_OFFSET rip, 0
33710+ ret
33711+ CFI_ENDPROC
33712+
33713+ENTRY(stub_execve)
33714+ CFI_STARTPROC
33715+ popq %r11
33716+ CFI_ADJUST_CFA_OFFSET -8
33717+ CFI_REGISTER rip, r11
33718+ SAVE_REST
33719+ FIXUP_TOP_OF_STACK %r11
33720+ call sys_execve
33721+ RESTORE_TOP_OF_STACK %r11
33722+ movq %rax,RAX(%rsp)
33723+ RESTORE_REST
33724+ jmp int_ret_from_sys_call
33725+ CFI_ENDPROC
33726+
33727+/*
33728+ * sigreturn is special because it needs to restore all registers on return.
33729+ * This cannot be done with SYSRET, so use the IRET return path instead.
33730+ */
33731+ENTRY(stub_rt_sigreturn)
33732+ CFI_STARTPROC
33733+ addq $8, %rsp
33734+ CFI_ADJUST_CFA_OFFSET -8
33735+ SAVE_REST
33736+ movq %rsp,%rdi
33737+ FIXUP_TOP_OF_STACK %r11
33738+ call sys_rt_sigreturn
33739+ movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer
33740+ RESTORE_REST
33741+ jmp int_ret_from_sys_call
33742+ CFI_ENDPROC
33743+
33744+/*
33745+ * initial frame state for interrupts and exceptions
33746+ */
33747+ .macro _frame ref
33748+ CFI_STARTPROC simple
33749+ CFI_DEF_CFA rsp,SS+8-\ref
33750+ /*CFI_REL_OFFSET ss,SS-\ref*/
33751+ CFI_REL_OFFSET rsp,RSP-\ref
33752+ /*CFI_REL_OFFSET rflags,EFLAGS-\ref*/
33753+ /*CFI_REL_OFFSET cs,CS-\ref*/
33754+ CFI_REL_OFFSET rip,RIP-\ref
33755+ .endm
33756+
33757+/* initial frame state for interrupts (and exceptions without error code) */
33758+#define INTR_FRAME _frame RIP
33759+/* initial frame state for exceptions with error code (and interrupts with
33760+ vector already pushed) */
33761+#define XCPT_FRAME _frame ORIG_RAX
33762+
33763+/*
33764+ * Interrupt exit.
33765+ *
33766+ */
33767+
33768+retint_check:
33769+ movl threadinfo_flags(%rcx),%edx
33770+ andl %edi,%edx
33771+ CFI_REMEMBER_STATE
33772+ jnz retint_careful
33773+retint_restore_args:
33774+ movl EFLAGS-REST_SKIP(%rsp), %eax
33775+ shr $9, %eax # EAX[0] == IRET_EFLAGS.IF
33776+ XEN_GET_VCPU_INFO(%rsi)
33777+ andb evtchn_upcall_mask(%rsi),%al
33778+ andb $1,%al # EAX[0] == IRET_EFLAGS.IF & event_mask
33779+ jnz restore_all_enable_events # != 0 => enable event delivery
33780+ XEN_PUT_VCPU_INFO(%rsi)
33781+
33782+ RESTORE_ARGS 0,8,0
33783+ HYPERVISOR_IRET 0
33784+
33785+ /* edi: workmask, edx: work */
33786+retint_careful:
33787+ CFI_RESTORE_STATE
33788+ bt $TIF_NEED_RESCHED,%edx
33789+ jnc retint_signal
33790+ XEN_UNBLOCK_EVENTS(%rsi)
33791+/* sti */
33792+ pushq %rdi
33793+ CFI_ADJUST_CFA_OFFSET 8
33794+ call schedule
33795+ popq %rdi
33796+ CFI_ADJUST_CFA_OFFSET -8
33797+ GET_THREAD_INFO(%rcx)
33798+ XEN_BLOCK_EVENTS(%rsi)
33799+/* cli */
33800+ jmp retint_check
33801+
33802+retint_signal:
33803+ testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx
33804+ jz retint_restore_args
33805+ XEN_UNBLOCK_EVENTS(%rsi)
33806+ SAVE_REST
33807+ movq $-1,ORIG_RAX(%rsp)
33808+ xorl %esi,%esi # oldset
33809+ movq %rsp,%rdi # &pt_regs
33810+ call do_notify_resume
33811+ RESTORE_REST
33812+ XEN_BLOCK_EVENTS(%rsi)
33813+ movl $_TIF_NEED_RESCHED,%edi
33814+ GET_THREAD_INFO(%rcx)
33815+ jmp retint_check
33816+
33817+#ifdef CONFIG_PREEMPT
33818+ /* Returning to kernel space. Check if we need preemption */
33819+ /* rcx: threadinfo. interrupts off. */
33820+ .p2align
33821+retint_kernel:
33822+ cmpl $0,threadinfo_preempt_count(%rcx)
33823+ jnz retint_restore_args
33824+ bt $TIF_NEED_RESCHED,threadinfo_flags(%rcx)
33825+ jnc retint_restore_args
33826+ bt $9,EFLAGS-ARGOFFSET(%rsp) /* interrupts off? */
33827+ jnc retint_restore_args
33828+ call preempt_schedule_irq
33829+ jmp retint_kernel /* check again */
33830+#endif
33831+ CFI_ENDPROC
33832+
33833+/*
33834+ * APIC interrupts.
33835+ */
33836+ .macro apicinterrupt num,func
33837+ INTR_FRAME
33838+ pushq $~(\num)
33839+ CFI_ADJUST_CFA_OFFSET 8
33840+ interrupt \func
33841+ jmp error_entry
33842+ CFI_ENDPROC
33843+ .endm
33844+
33845+#ifndef CONFIG_XEN
33846+ENTRY(thermal_interrupt)
33847+ apicinterrupt THERMAL_APIC_VECTOR,smp_thermal_interrupt
33848+
33849+ENTRY(threshold_interrupt)
33850+ apicinterrupt THRESHOLD_APIC_VECTOR,mce_threshold_interrupt
33851+
33852+#ifdef CONFIG_SMP
33853+ENTRY(reschedule_interrupt)
33854+ apicinterrupt RESCHEDULE_VECTOR,smp_reschedule_interrupt
33855+
33856+ .macro INVALIDATE_ENTRY num
33857+ENTRY(invalidate_interrupt\num)
33858+ apicinterrupt INVALIDATE_TLB_VECTOR_START+\num,smp_invalidate_interrupt
33859+ .endm
33860+
33861+ INVALIDATE_ENTRY 0
33862+ INVALIDATE_ENTRY 1
33863+ INVALIDATE_ENTRY 2
33864+ INVALIDATE_ENTRY 3
33865+ INVALIDATE_ENTRY 4
33866+ INVALIDATE_ENTRY 5
33867+ INVALIDATE_ENTRY 6
33868+ INVALIDATE_ENTRY 7
33869+
33870+ENTRY(call_function_interrupt)
33871+ apicinterrupt CALL_FUNCTION_VECTOR,smp_call_function_interrupt
33872+#endif
33873+
33874+#ifdef CONFIG_X86_LOCAL_APIC
33875+ENTRY(apic_timer_interrupt)
33876+ apicinterrupt LOCAL_TIMER_VECTOR,smp_apic_timer_interrupt
33877+
33878+ENTRY(error_interrupt)
33879+ apicinterrupt ERROR_APIC_VECTOR,smp_error_interrupt
33880+
33881+ENTRY(spurious_interrupt)
33882+ apicinterrupt SPURIOUS_APIC_VECTOR,smp_spurious_interrupt
33883+#endif
33884+#endif /* !CONFIG_XEN */
33885+
33886+/*
33887+ * Exception entry points.
33888+ */
33889+ .macro zeroentry sym
33890+ INTR_FRAME
33891+ movq (%rsp),%rcx
33892+ movq 8(%rsp),%r11
33893+ addq $0x10,%rsp /* skip rcx and r11 */
33894+ pushq $0 /* push error code/oldrax */
33895+ CFI_ADJUST_CFA_OFFSET 8
33896+ pushq %rax /* push real oldrax to the rdi slot */
33897+ CFI_ADJUST_CFA_OFFSET 8
33898+ leaq \sym(%rip),%rax
33899+ jmp error_entry
33900+ CFI_ENDPROC
33901+ .endm
33902+
33903+ .macro errorentry sym
33904+ XCPT_FRAME
33905+ movq (%rsp),%rcx
33906+ movq 8(%rsp),%r11
33907+ addq $0x10,%rsp /* rsp points to the error code */
33908+ pushq %rax
33909+ CFI_ADJUST_CFA_OFFSET 8
33910+ leaq \sym(%rip),%rax
33911+ jmp error_entry
33912+ CFI_ENDPROC
33913+ .endm
33914+
33915+#if 0 /* not XEN */
33916+ /* error code is on the stack already */
33917+ /* handle NMI like exceptions that can happen everywhere */
33918+ .macro paranoidentry sym, ist=0
33919+ movq (%rsp),%rcx
33920+ movq 8(%rsp),%r11
33921+ addq $0x10,%rsp /* skip rcx and r11 */
33922+ SAVE_ALL
33923+ cld
33924+#if 0 /* not XEN */
33925+ movl $1,%ebx
33926+ movl $MSR_GS_BASE,%ecx
33927+ rdmsr
33928+ testl %edx,%edx
33929+ js 1f
33930+ swapgs
33931+ xorl %ebx,%ebx
33932+1:
33933+#endif
33934+ .if \ist
33935+ movq %gs:pda_data_offset, %rbp
33936+ .endif
33937+ movq %rsp,%rdi
33938+ movq ORIG_RAX(%rsp),%rsi
33939+ movq $-1,ORIG_RAX(%rsp)
33940+ .if \ist
33941+ subq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp)
33942+ .endif
33943+ call \sym
33944+ .if \ist
33945+ addq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp)
33946+ .endif
33947+/* cli */
33948+ XEN_BLOCK_EVENTS(%rsi)
33949+ .endm
33950+#endif
33951+
33952+/*
33953+ * Exception entry point. This expects an error code/orig_rax on the stack
33954+ * and the exception handler in %rax.
33955+ */
33956+ENTRY(error_entry)
33957+ _frame RDI
33958+ /* rdi slot contains rax, oldrax contains error code */
33959+ cld
33960+ subq $14*8,%rsp
33961+ CFI_ADJUST_CFA_OFFSET (14*8)
33962+ movq %rsi,13*8(%rsp)
33963+ CFI_REL_OFFSET rsi,RSI
33964+ movq 14*8(%rsp),%rsi /* load rax from rdi slot */
33965+ movq %rdx,12*8(%rsp)
33966+ CFI_REL_OFFSET rdx,RDX
33967+ movq %rcx,11*8(%rsp)
33968+ CFI_REL_OFFSET rcx,RCX
33969+ movq %rsi,10*8(%rsp) /* store rax */
33970+ CFI_REL_OFFSET rax,RAX
33971+ movq %r8, 9*8(%rsp)
33972+ CFI_REL_OFFSET r8,R8
33973+ movq %r9, 8*8(%rsp)
33974+ CFI_REL_OFFSET r9,R9
33975+ movq %r10,7*8(%rsp)
33976+ CFI_REL_OFFSET r10,R10
33977+ movq %r11,6*8(%rsp)
33978+ CFI_REL_OFFSET r11,R11
33979+ movq %rbx,5*8(%rsp)
33980+ CFI_REL_OFFSET rbx,RBX
33981+ movq %rbp,4*8(%rsp)
33982+ CFI_REL_OFFSET rbp,RBP
33983+ movq %r12,3*8(%rsp)
33984+ CFI_REL_OFFSET r12,R12
33985+ movq %r13,2*8(%rsp)
33986+ CFI_REL_OFFSET r13,R13
33987+ movq %r14,1*8(%rsp)
33988+ CFI_REL_OFFSET r14,R14
33989+ movq %r15,(%rsp)
33990+ CFI_REL_OFFSET r15,R15
33991+#if 0
33992+ cmpl $__KERNEL_CS,CS(%rsp)
33993+ je error_kernelspace
33994+#endif
33995+error_call_handler:
33996+ movq %rdi, RDI(%rsp)
33997+ movq %rsp,%rdi
33998+ movq ORIG_RAX(%rsp),%rsi # get error code
33999+ movq $-1,ORIG_RAX(%rsp)
34000+ call *%rax
34001+error_exit:
34002+ RESTORE_REST
34003+/* cli */
34004+ XEN_BLOCK_EVENTS(%rsi)
34005+ GET_THREAD_INFO(%rcx)
34006+ testb $3,CS-ARGOFFSET(%rsp)
34007+ jz retint_kernel
34008+ movl threadinfo_flags(%rcx),%edx
34009+ movl $_TIF_WORK_MASK,%edi
34010+ andl %edi,%edx
34011+ jnz retint_careful
34012+ jmp retint_restore_args
34013+
34014+error_kernelspace:
34015+ /*
34016+ * We need to re-write the logic here because we don't do iretq to
34017+ * to return to user mode. It's still possible that we get trap/fault
34018+ * in the kernel (when accessing buffers pointed to by system calls,
34019+ * for example).
34020+ *
34021+ */
34022+#if 0
34023+ incl %ebx
34024+ /* There are two places in the kernel that can potentially fault with
34025+ usergs. Handle them here. The exception handlers after
34026+ iret run with kernel gs again, so don't set the user space flag.
34027+ B stepping K8s sometimes report a truncated RIP for IRET
34028+ exceptions returning to compat mode. Check for these here too. */
34029+ leaq iret_label(%rip),%rbp
34030+ cmpq %rbp,RIP(%rsp)
34031+ je error_swapgs
34032+ movl %ebp,%ebp /* zero extend */
34033+ cmpq %rbp,RIP(%rsp)
34034+ je error_swapgs
34035+ cmpq $gs_change,RIP(%rsp)
34036+ je error_swapgs
34037+ jmp error_sti
34038+#endif
34039+
34040+ENTRY(hypervisor_callback)
34041+ zeroentry do_hypervisor_callback
34042+
34043+/*
34044+ * Copied from arch/xen/i386/kernel/entry.S
34045+ */
34046+# A note on the "critical region" in our callback handler.
34047+# We want to avoid stacking callback handlers due to events occurring
34048+# during handling of the last event. To do this, we keep events disabled
34049+# until we've done all processing. HOWEVER, we must enable events before
34050+# popping the stack frame (can't be done atomically) and so it would still
34051+# be possible to get enough handler activations to overflow the stack.
34052+# Although unlikely, bugs of that kind are hard to track down, so we'd
34053+# like to avoid the possibility.
34054+# So, on entry to the handler we detect whether we interrupted an
34055+# existing activation in its critical region -- if so, we pop the current
34056+# activation and restart the handler using the previous one.
34057+ENTRY(do_hypervisor_callback) # do_hypervisor_callback(struct *pt_regs)
34058+# Since we don't modify %rdi, evtchn_do_upcall(struct pt_regs *) will
34059+# see the correct pointer to the pt_regs
34060+ movq %rdi, %rsp # we don't return, adjust the stack frame
34061+11: movq %gs:pda_irqstackptr,%rax
34062+ incl %gs:pda_irqcount
34063+ cmovzq %rax,%rsp
34064+ pushq %rdi
34065+ call evtchn_do_upcall
34066+ popq %rsp
34067+ decl %gs:pda_irqcount
34068+ jmp error_exit
34069+
34070+#ifdef CONFIG_X86_LOCAL_APIC
34071+KPROBE_ENTRY(nmi)
34072+ zeroentry do_nmi_callback
34073+ENTRY(do_nmi_callback)
34074+ addq $8, %rsp
34075+ call do_nmi
34076+ orl $NMI_MASK,EFLAGS(%rsp)
34077+ RESTORE_REST
34078+ XEN_BLOCK_EVENTS(%rsi)
34079+ GET_THREAD_INFO(%rcx)
34080+ jmp retint_restore_args
34081+ .previous .text
34082+#endif
34083+
34084+ ALIGN
34085+restore_all_enable_events:
34086+ XEN_UNBLOCK_EVENTS(%rsi) # %rsi is already set up...
34087+
34088+scrit: /**** START OF CRITICAL REGION ****/
34089+ XEN_TEST_PENDING(%rsi)
34090+ jnz 14f # process more events if necessary...
34091+ XEN_PUT_VCPU_INFO(%rsi)
34092+ RESTORE_ARGS 0,8,0
34093+ HYPERVISOR_IRET 0
34094+
34095+14: XEN_LOCKED_BLOCK_EVENTS(%rsi)
34096+ XEN_PUT_VCPU_INFO(%rsi)
34097+ SAVE_REST
34098+ movq %rsp,%rdi # set the argument again
34099+ jmp 11b
34100+ecrit: /**** END OF CRITICAL REGION ****/
34101+# At this point, unlike on x86-32, we don't do the fixup, to keep the
34102+# code simple; the stack frame is also more complex on x86-64.
34103+# When the kernel is interrupted in the critical section, the kernel
34104+# will do IRET in that case, and everything will be restored at that point,
34105+# i.e. it just resumes from the next instruction interrupted with the same context.
34106+
34107+# Hypervisor uses this for application faults while it executes.
34108+# We get here for two reasons:
34109+# 1. Fault while reloading DS, ES, FS or GS
34110+# 2. Fault while executing IRET
34111+# Category 1 we do not need to fix up as Xen has already reloaded all segment
34112+# registers that could be reloaded and zeroed the others.
34113+# Category 2 we fix up by killing the current process. We cannot use the
34114+# normal Linux return path in this case because if we use the IRET hypercall
34115+# to pop the stack frame we end up in an infinite loop of failsafe callbacks.
34116+# We distinguish between categories by comparing each saved segment register
34117+# with its current contents: any discrepancy means we are in category 1.
34118+ENTRY(failsafe_callback)
34119+ movw %ds,%cx
34120+ cmpw %cx,0x10(%rsp)
34121+ jne 1f
34122+ movw %es,%cx
34123+ cmpw %cx,0x18(%rsp)
34124+ jne 1f
34125+ movw %fs,%cx
34126+ cmpw %cx,0x20(%rsp)
34127+ jne 1f
34128+ movw %gs,%cx
34129+ cmpw %cx,0x28(%rsp)
34130+ jne 1f
34131+ /* All segments match their saved values => Category 2 (Bad IRET). */
34132+ movq (%rsp),%rcx
34133+ movq 8(%rsp),%r11
34134+ addq $0x30,%rsp
34135+ movq $-9999,%rdi /* better code? */
34136+ jmp do_exit
34137+1: /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */
34138+ movq (%rsp),%rcx
34139+ movq 8(%rsp),%r11
34140+ addq $0x30,%rsp
34141+ pushq $0
34142+ SAVE_ALL
34143+ jmp error_exit
34144+#if 0
34145+ .section __ex_table,"a"
34146+ .align 8
34147+ .quad gs_change,bad_gs
34148+ .previous
34149+ .section .fixup,"ax"
34150+ /* running with kernelgs */
34151+bad_gs:
34152+/* swapgs */ /* switch back to user gs */
34153+ xorl %eax,%eax
34154+ movl %eax,%gs
34155+ jmp 2b
34156+ .previous
34157+#endif
34158+
34159+/*
34160+ * Create a kernel thread.
34161+ *
34162+ * C extern interface:
34163+ * extern long kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
34164+ *
34165+ * asm input arguments:
34166+ * rdi: fn, rsi: arg, rdx: flags
34167+ */
34168+ENTRY(kernel_thread)
34169+ CFI_STARTPROC
34170+ FAKE_STACK_FRAME $child_rip
34171+ SAVE_ALL
34172+
34173+ # rdi: flags, rsi: usp, rdx: will be &pt_regs
34174+ movq %rdx,%rdi
34175+ orq kernel_thread_flags(%rip),%rdi
34176+ movq $-1, %rsi
34177+ movq %rsp, %rdx
34178+
34179+ xorl %r8d,%r8d
34180+ xorl %r9d,%r9d
34181+
34182+ # clone now
34183+ call do_fork
34184+ movq %rax,RAX(%rsp)
34185+ xorl %edi,%edi
34186+
34187+ /*
34188+ * It isn't worth checking for a reschedule here,
34189+ * so internally to the x86_64 port you can rely on kernel_thread()
34190+ * not to reschedule the child before returning; this avoids the need
34191+ * for hacks, for example to fork off the per-CPU idle tasks.
34192+ * [Hopefully no generic code relies on the reschedule -AK]
34193+ */
34194+ RESTORE_ALL
34195+ UNFAKE_STACK_FRAME
34196+ ret
34197+ CFI_ENDPROC
34198+
34199+
34200+child_rip:
34201+ /*
34202+ * Here we are in the child and the registers are set as they were
34203+ * at kernel_thread() invocation in the parent.
34204+ */
34205+ movq %rdi, %rax
34206+ movq %rsi, %rdi
34207+ call *%rax
34208+ # exit
34209+ xorl %edi, %edi
34210+ call do_exit
34211+
34212+/*
34213+ * execve(). This function needs to use IRET, not SYSRET, to set up all state properly.
34214+ *
34215+ * C extern interface:
34216+ * extern long execve(char *name, char **argv, char **envp)
34217+ *
34218+ * asm input arguments:
34219+ * rdi: name, rsi: argv, rdx: envp
34220+ *
34221+ * We want to fallback into:
34222+ * extern long sys_execve(char *name, char **argv,char **envp, struct pt_regs regs)
34223+ *
34224+ * do_sys_execve asm fallback arguments:
34225+ * rdi: name, rsi: argv, rdx: envp, fake frame on the stack
34226+ */
34227+ENTRY(execve)
34228+ CFI_STARTPROC
34229+ FAKE_STACK_FRAME $0
34230+ SAVE_ALL
34231+ call sys_execve
34232+ movq %rax, RAX(%rsp)
34233+ RESTORE_REST
34234+ testq %rax,%rax
34235+ jne 1f
34236+ jmp int_ret_from_sys_call
34237+1: RESTORE_ARGS
34238+ UNFAKE_STACK_FRAME
34239+ ret
34240+ CFI_ENDPROC
34241+
34242+KPROBE_ENTRY(page_fault)
34243+ errorentry do_page_fault
34244+ .previous .text
34245+
34246+ENTRY(coprocessor_error)
34247+ zeroentry do_coprocessor_error
34248+
34249+ENTRY(simd_coprocessor_error)
34250+ zeroentry do_simd_coprocessor_error
34251+
34252+ENTRY(device_not_available)
34253+ zeroentry math_state_restore
34254+
34255+ /* runs on exception stack */
34256+KPROBE_ENTRY(debug)
34257+ INTR_FRAME
34258+/* pushq $0
34259+ CFI_ADJUST_CFA_OFFSET 8 */
34260+ zeroentry do_debug
34261+/* jmp paranoid_exit */
34262+ CFI_ENDPROC
34263+ .previous .text
34264+
34265+#if 0
34266+ /* runs on exception stack */
34267+KPROBE_ENTRY(nmi)
34268+ INTR_FRAME
34269+ pushq $-1
34270+ CFI_ADJUST_CFA_OFFSET 8
34271+ paranoidentry do_nmi
34272+ /*
34273+ * "Paranoid" exit path from exception stack.
34274+ * Paranoid because this is used by NMIs and cannot take
34275+ * any kernel state for granted.
34276+ * We don't do kernel preemption checks here, because only
34277+ * NMI should be common and it does not enable IRQs and
34278+ * cannot get reschedule ticks.
34279+ */
34280+ /* ebx: no swapgs flag */
34281+paranoid_exit:
34282+ testl %ebx,%ebx /* swapgs needed? */
34283+ jnz paranoid_restore
34284+ testl $3,CS(%rsp)
34285+ jnz paranoid_userspace
34286+paranoid_swapgs:
34287+ swapgs
34288+paranoid_restore:
34289+ RESTORE_ALL 8
34290+ iretq
34291+paranoid_userspace:
34292+ GET_THREAD_INFO(%rcx)
34293+ movl threadinfo_flags(%rcx),%ebx
34294+ andl $_TIF_WORK_MASK,%ebx
34295+ jz paranoid_swapgs
34296+ movq %rsp,%rdi /* &pt_regs */
34297+ call sync_regs
34298+ movq %rax,%rsp /* switch stack for scheduling */
34299+ testl $_TIF_NEED_RESCHED,%ebx
34300+ jnz paranoid_schedule
34301+ movl %ebx,%edx /* arg3: thread flags */
34302+ sti
34303+ xorl %esi,%esi /* arg2: oldset */
34304+ movq %rsp,%rdi /* arg1: &pt_regs */
34305+ call do_notify_resume
34306+ cli
34307+ jmp paranoid_userspace
34308+paranoid_schedule:
34309+ sti
34310+ call schedule
34311+ cli
34312+ jmp paranoid_userspace
34313+ CFI_ENDPROC
34314+ .previous .text
34315+#endif
34316+
34317+KPROBE_ENTRY(int3)
34318+ INTR_FRAME
34319+/* pushq $0
34320+ CFI_ADJUST_CFA_OFFSET 8 */
34321+ zeroentry do_int3
34322+/* jmp paranoid_exit */
34323+ CFI_ENDPROC
34324+ .previous .text
34325+
34326+ENTRY(overflow)
34327+ zeroentry do_overflow
34328+
34329+ENTRY(bounds)
34330+ zeroentry do_bounds
34331+
34332+ENTRY(invalid_op)
34333+ zeroentry do_invalid_op
34334+
34335+ENTRY(coprocessor_segment_overrun)
34336+ zeroentry do_coprocessor_segment_overrun
34337+
34338+ENTRY(reserved)
34339+ zeroentry do_reserved
34340+
34341+#if 0
34342+ /* runs on exception stack */
34343+ENTRY(double_fault)
34344+ XCPT_FRAME
34345+ paranoidentry do_double_fault
34346+ jmp paranoid_exit
34347+ CFI_ENDPROC
34348+#endif
34349+
34350+ENTRY(invalid_TSS)
34351+ errorentry do_invalid_TSS
34352+
34353+ENTRY(segment_not_present)
34354+ errorentry do_segment_not_present
34355+
34356+ /* runs on exception stack */
34357+ENTRY(stack_segment)
34358+ XCPT_FRAME
34359+ errorentry do_stack_segment
34360+ CFI_ENDPROC
34361+
34362+KPROBE_ENTRY(general_protection)
34363+ errorentry do_general_protection
34364+ .previous .text
34365+
34366+ENTRY(alignment_check)
34367+ errorentry do_alignment_check
34368+
34369+ENTRY(divide_error)
34370+ zeroentry do_divide_error
34371+
34372+ENTRY(spurious_interrupt_bug)
34373+ zeroentry do_spurious_interrupt_bug
34374+
34375+#ifdef CONFIG_X86_MCE
34376+ /* runs on exception stack */
34377+ENTRY(machine_check)
34378+ INTR_FRAME
34379+ pushq $0
34380+ CFI_ADJUST_CFA_OFFSET 8
34381+ paranoidentry do_machine_check
34382+ jmp paranoid_exit
34383+ CFI_ENDPROC
34384+#endif
34385+
34386+ENTRY(call_softirq)
34387+ CFI_STARTPROC
34388+ movq %gs:pda_irqstackptr,%rax
34389+ movq %rsp,%rdx
34390+ CFI_DEF_CFA_REGISTER rdx
34391+ incl %gs:pda_irqcount
34392+ cmove %rax,%rsp
34393+ pushq %rdx
34394+ /*todo CFI_DEF_CFA_EXPRESSION ...*/
34395+ call __do_softirq
34396+ popq %rsp
34397+ CFI_DEF_CFA_REGISTER rsp
34398+ decl %gs:pda_irqcount
34399+ ret
34400+ CFI_ENDPROC
34401diff -Nur linux-2.6.16.33-noxen/arch/x86_64/kernel/entry.S linux-2.6.16.33/arch/x86_64/kernel/entry.S
34402--- linux-2.6.16.33-noxen/arch/x86_64/kernel/entry.S 2006-11-22 18:06:31.000000000 +0000
34403+++ linux-2.6.16.33/arch/x86_64/kernel/entry.S 2007-05-23 21:00:01.000000000 +0000
34404@@ -596,7 +596,7 @@
34405 */
34406 .macro apicinterrupt num,func
34407 INTR_FRAME
34408- pushq $\num-256
34409+ pushq $~(\num)
34410 CFI_ADJUST_CFA_OFFSET 8
34411 interrupt \func
34412 jmp ret_from_intr
34413diff -Nur linux-2.6.16.33-noxen/arch/x86_64/kernel/genapic-xen.c linux-2.6.16.33/arch/x86_64/kernel/genapic-xen.c
34414--- linux-2.6.16.33-noxen/arch/x86_64/kernel/genapic-xen.c 1970-01-01 00:00:00.000000000 +0000
34415+++ linux-2.6.16.33/arch/x86_64/kernel/genapic-xen.c 2007-01-08 15:00:45.000000000 +0000
34416@@ -0,0 +1,144 @@
34417+/*
34418+ * Copyright 2004 James Cleverdon, IBM.
34419+ * Subject to the GNU Public License, v.2
34420+ *
34421+ * Generic APIC sub-arch probe layer.
34422+ *
34423+ * Hacked for x86-64 by James Cleverdon from i386 architecture code by
34424+ * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and
34425+ * James Cleverdon.
34426+ */
34427+#include <linux/config.h>
34428+#include <linux/threads.h>
34429+#include <linux/cpumask.h>
34430+#include <linux/string.h>
34431+#include <linux/kernel.h>
34432+#include <linux/ctype.h>
34433+#include <linux/init.h>
34434+#include <linux/module.h>
34435+
34436+#include <asm/smp.h>
34437+#include <asm/ipi.h>
34438+
34439+#if defined(CONFIG_ACPI)
34440+#include <acpi/acpi_bus.h>
34441+#endif
34442+
34443+/* which logical CPU number maps to which CPU (physical APIC ID) */
34444+u8 x86_cpu_to_apicid[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICID };
34445+EXPORT_SYMBOL(x86_cpu_to_apicid);
34446+u8 x86_cpu_to_log_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
34447+
34448+extern struct genapic apic_cluster;
34449+extern struct genapic apic_flat;
34450+extern struct genapic apic_physflat;
34451+
34452+#ifndef CONFIG_XEN
34453+struct genapic *genapic = &apic_flat;
34454+#else
34455+extern struct genapic apic_xen;
34456+struct genapic *genapic = &apic_xen;
34457+#endif
34458+
34459+
34460+/*
34461+ * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode.
34462+ */
34463+void __init clustered_apic_check(void)
34464+{
34465+#ifndef CONFIG_XEN
34466+ long i;
34467+ u8 clusters, max_cluster;
34468+ u8 id;
34469+ u8 cluster_cnt[NUM_APIC_CLUSTERS];
34470+ int max_apic = 0;
34471+
34472+#if defined(CONFIG_ACPI)
34473+ /*
34474+ * Some x86_64 machines use physical APIC mode regardless of how many
34475+ * procs/clusters are present (x86_64 ES7000 is an example).
34476+ */
34477+ if (acpi_fadt.revision > FADT2_REVISION_ID)
34478+ if (acpi_fadt.force_apic_physical_destination_mode) {
34479+ genapic = &apic_cluster;
34480+ goto print;
34481+ }
34482+#endif
34483+
34484+ memset(cluster_cnt, 0, sizeof(cluster_cnt));
34485+ for (i = 0; i < NR_CPUS; i++) {
34486+ id = bios_cpu_apicid[i];
34487+ if (id == BAD_APICID)
34488+ continue;
34489+ if (id > max_apic)
34490+ max_apic = id;
34491+ cluster_cnt[APIC_CLUSTERID(id)]++;
34492+ }
34493+
34494+ /* Don't use clustered mode on AMD platforms. */
34495+ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
34496+ genapic = &apic_physflat;
34497+#ifndef CONFIG_HOTPLUG_CPU
34498+ /* In the CPU hotplug case we cannot use broadcast mode
34499+ because that opens a race when a CPU is removed.
34500+ Stay at physflat mode in this case.
34501+ It is bad to do this unconditionally though. Once
34502+ we have ACPI platform support for CPU hotplug
34503+ we should detect hotplug capability from ACPI tables and
34504+ only do this when really needed. -AK */
34505+ if (max_apic <= 8)
34506+ genapic = &apic_flat;
34507+#endif
34508+ goto print;
34509+ }
34510+
34511+ clusters = 0;
34512+ max_cluster = 0;
34513+
34514+ for (i = 0; i < NUM_APIC_CLUSTERS; i++) {
34515+ if (cluster_cnt[i] > 0) {
34516+ ++clusters;
34517+ if (cluster_cnt[i] > max_cluster)
34518+ max_cluster = cluster_cnt[i];
34519+ }
34520+ }
34521+
34522+ /*
34523+ * If we have clusters <= 1 and CPUs <= 8 in cluster 0, then flat mode,
34524+ * else if max_cluster <= 4 and cluster_cnt[15] == 0, clustered logical
34525+ * else physical mode.
34526+ * (We don't use lowest priority delivery + HW APIC IRQ steering, so
34527+ * can ignore the clustered logical case and go straight to physical.)
34528+ */
34529+ if (clusters <= 1 && max_cluster <= 8 && cluster_cnt[0] == max_cluster) {
34530+#ifdef CONFIG_HOTPLUG_CPU
34531+ /* Don't use APIC shortcuts in CPU hotplug to avoid races */
34532+ genapic = &apic_physflat;
34533+#else
34534+ genapic = &apic_flat;
34535+#endif
34536+ } else
34537+ genapic = &apic_cluster;
34538+
34539+print:
34540+#else
34541+ /* hardcode to xen apic functions */
34542+ genapic = &apic_xen;
34543+#endif
34544+ printk(KERN_INFO "Setting APIC routing to %s\n", genapic->name);
34545+}
34546+
34547+/* Same for both flat and clustered. */
34548+
34549+#ifdef CONFIG_XEN
34550+extern void xen_send_IPI_shortcut(unsigned int shortcut, int vector, unsigned int dest);
34551+#endif
34552+
34553+void send_IPI_self(int vector)
34554+{
34555+#ifndef CONFIG_XEN
34556+ __send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL);
34557+#else
34558+ xen_send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL);
34559+#endif
34560+}
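The file above wires the Xen guest into the generic-APIC layer: genapic is a table of function pointers, and clustered_apic_check() simply points it at apic_xen when CONFIG_XEN is set instead of choosing between flat, physflat and cluster modes. Callers elsewhere in the kernel go through that table, roughly as in this hypothetical sketch (the wrapper name is illustrative only):

	/* Sketch: IPI requests are dispatched through the selected genapic,
	 * so the same call reaches xen_send_IPI_all() in a Xen guest and the
	 * flat/cluster implementation on bare metal. */
	static inline void example_send_IPI_all(int vector)
	{
		genapic->send_IPI_all(vector);
	}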
34561diff -Nur linux-2.6.16.33-noxen/arch/x86_64/kernel/genapic_xen.c linux-2.6.16.33/arch/x86_64/kernel/genapic_xen.c
34562--- linux-2.6.16.33-noxen/arch/x86_64/kernel/genapic_xen.c 1970-01-01 00:00:00.000000000 +0000
34563+++ linux-2.6.16.33/arch/x86_64/kernel/genapic_xen.c 2007-01-08 15:00:45.000000000 +0000
34564@@ -0,0 +1,162 @@
34565+/*
34566+ * Copyright 2004 James Cleverdon, IBM.
34567+ * Subject to the GNU Public License, v.2
34568+ *
34569+ * Xen APIC subarch code. Maximum 8 CPUs, logical delivery.
34570+ *
34571+ * Hacked for x86-64 by James Cleverdon from i386 architecture code by
34572+ * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and
34573+ * James Cleverdon.
34574+ *
34575+ * Hacked to pieces for Xen by Chris Wright.
34576+ */
34577+#include <linux/config.h>
34578+#include <linux/threads.h>
34579+#include <linux/cpumask.h>
34580+#include <linux/string.h>
34581+#include <linux/kernel.h>
34582+#include <linux/ctype.h>
34583+#include <linux/init.h>
34584+#ifdef CONFIG_XEN_PRIVILEGED_GUEST
34585+#include <asm/smp.h>
34586+#include <asm/ipi.h>
34587+#else
34588+#include <asm/apic.h>
34589+#include <asm/apicdef.h>
34590+#include <asm/genapic.h>
34591+#endif
34592+#include <xen/evtchn.h>
34593+
34594+DECLARE_PER_CPU(int, ipi_to_irq[NR_IPIS]);
34595+
34596+static inline void __send_IPI_one(unsigned int cpu, int vector)
34597+{
34598+ int irq = per_cpu(ipi_to_irq, cpu)[vector];
34599+ BUG_ON(irq < 0);
34600+ notify_remote_via_irq(irq);
34601+}
34602+
34603+void xen_send_IPI_shortcut(unsigned int shortcut, int vector, unsigned int dest)
34604+{
34605+ int cpu;
34606+
34607+ switch (shortcut) {
34608+ case APIC_DEST_SELF:
34609+ __send_IPI_one(smp_processor_id(), vector);
34610+ break;
34611+ case APIC_DEST_ALLBUT:
34612+ for (cpu = 0; cpu < NR_CPUS; ++cpu) {
34613+ if (cpu == smp_processor_id())
34614+ continue;
34615+ if (cpu_isset(cpu, cpu_online_map)) {
34616+ __send_IPI_one(cpu, vector);
34617+ }
34618+ }
34619+ break;
34620+ case APIC_DEST_ALLINC:
34621+ for (cpu = 0; cpu < NR_CPUS; ++cpu) {
34622+ if (cpu_isset(cpu, cpu_online_map)) {
34623+ __send_IPI_one(cpu, vector);
34624+ }
34625+ }
34626+ break;
34627+ default:
34628+ printk("XXXXXX __send_IPI_shortcut %08x vector %d\n", shortcut,
34629+ vector);
34630+ break;
34631+ }
34632+}
34633+
34634+static cpumask_t xen_target_cpus(void)
34635+{
34636+ return cpu_online_map;
34637+}
34638+
34639+/*
34640+ * Set up the logical destination ID.
34641+ * Do nothing, not called now.
34642+ */
34643+static void xen_init_apic_ldr(void)
34644+{
34645+ Dprintk("%s\n", __FUNCTION__);
34646+ return;
34647+}
34648+
34649+static void xen_send_IPI_allbutself(int vector)
34650+{
34651+ /*
34652+ * if there are no other CPUs in the system then
34653+ * we get an APIC send error if we try to broadcast.
34654+ * thus we have to avoid sending IPIs in this case.
34655+ */
34656+ Dprintk("%s\n", __FUNCTION__);
34657+ if (num_online_cpus() > 1)
34658+ xen_send_IPI_shortcut(APIC_DEST_ALLBUT, vector, APIC_DEST_LOGICAL);
34659+}
34660+
34661+static void xen_send_IPI_all(int vector)
34662+{
34663+ Dprintk("%s\n", __FUNCTION__);
34664+ xen_send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL);
34665+}
34666+
34667+static void xen_send_IPI_mask(cpumask_t cpumask, int vector)
34668+{
34669+ unsigned long mask = cpus_addr(cpumask)[0];
34670+ unsigned int cpu;
34671+ unsigned long flags;
34672+
34673+ Dprintk("%s\n", __FUNCTION__);
34674+ local_irq_save(flags);
34675+ WARN_ON(mask & ~cpus_addr(cpu_online_map)[0]);
34676+
34677+ for (cpu = 0; cpu < NR_CPUS; ++cpu) {
34678+ if (cpu_isset(cpu, cpumask)) {
34679+ __send_IPI_one(cpu, vector);
34680+ }
34681+ }
34682+ local_irq_restore(flags);
34683+}
34684+
34685+#ifdef CONFIG_XEN_PRIVILEGED_GUEST
34686+static int xen_apic_id_registered(void)
34687+{
34688+ /* better be set */
34689+ Dprintk("%s\n", __FUNCTION__);
34690+ return physid_isset(smp_processor_id(), phys_cpu_present_map);
34691+}
34692+#endif
34693+
34694+static unsigned int xen_cpu_mask_to_apicid(cpumask_t cpumask)
34695+{
34696+ Dprintk("%s\n", __FUNCTION__);
34697+ return cpus_addr(cpumask)[0] & APIC_ALL_CPUS;
34698+}
34699+
34700+static unsigned int phys_pkg_id(int index_msb)
34701+{
34702+ u32 ebx;
34703+
34704+ Dprintk("%s\n", __FUNCTION__);
34705+ ebx = cpuid_ebx(1);
34706+ return ((ebx >> 24) & 0xFF) >> index_msb;
34707+}
34708+
34709+struct genapic apic_xen = {
34710+ .name = "xen",
34711+#ifdef CONFIG_XEN_PRIVILEGED_GUEST
34712+ .int_delivery_mode = dest_LowestPrio,
34713+#endif
34714+ .int_dest_mode = (APIC_DEST_LOGICAL != 0),
34715+ .int_delivery_dest = APIC_DEST_LOGICAL | APIC_DM_LOWEST,
34716+ .target_cpus = xen_target_cpus,
34717+#ifdef CONFIG_XEN_PRIVILEGED_GUEST
34718+ .apic_id_registered = xen_apic_id_registered,
34719+#endif
34720+ .init_apic_ldr = xen_init_apic_ldr,
34721+ .send_IPI_all = xen_send_IPI_all,
34722+ .send_IPI_allbutself = xen_send_IPI_allbutself,
34723+ .send_IPI_mask = xen_send_IPI_mask,
34724+ .cpu_mask_to_apicid = xen_cpu_mask_to_apicid,
34725+ .phys_pkg_id = phys_pkg_id,
34726+};
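In apic_xen above, "IPIs" are really event-channel notifications: __send_IPI_one() looks up the per-CPU ipi_to_irq binding for the vector and notifies that channel, and xen_cpu_mask_to_apicid() reuses the low bits of the CPU bitmap as the logical destination, consistent with the 8-CPU, logical-delivery limit noted in the header comment. A small sketch of that mask arithmetic (values are illustrative, and APIC_ALL_CPUS is assumed to cover the low 8 bits):

	/* Sketch: a cpumask containing CPUs 0 and 2 has low word 0b101, so the
	 * "logical APIC ID" returned by xen_cpu_mask_to_apicid() would be 5. */
	unsigned long mask_word = (1UL << 0) | (1UL << 2);	/* cpus_addr(mask)[0] */
	unsigned int logical_dest = mask_word & 0xff;		/* == 5 */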
34727diff -Nur linux-2.6.16.33-noxen/arch/x86_64/kernel/head-xen.S linux-2.6.16.33/arch/x86_64/kernel/head-xen.S
34728--- linux-2.6.16.33-noxen/arch/x86_64/kernel/head-xen.S 1970-01-01 00:00:00.000000000 +0000
34729+++ linux-2.6.16.33/arch/x86_64/kernel/head-xen.S 2007-01-08 15:00:45.000000000 +0000
34730@@ -0,0 +1,183 @@
34731+/*
34732+ * linux/arch/x86_64/kernel/head.S -- start in 32bit and switch to 64bit
34733+ *
34734+ * Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
34735+ * Copyright (C) 2000 Pavel Machek <pavel@suse.cz>
34736+ * Copyright (C) 2000 Karsten Keil <kkeil@suse.de>
34737+ * Copyright (C) 2001,2002 Andi Kleen <ak@suse.de>
34738+ *
34739+ * $Id: head.S,v 1.49 2002/03/19 17:39:25 ak Exp $
34740+ *
34741+ * Jun Nakajima <jun.nakajima@intel.com>
34742+ * Modified for Xen
34743+ */
34744+
34745+
34746+#include <linux/linkage.h>
34747+#include <linux/threads.h>
34748+#include <linux/init.h>
34749+#include <linux/elfnote.h>
34750+#include <asm/desc.h>
34751+#include <asm/segment.h>
34752+#include <asm/page.h>
34753+#include <asm/msr.h>
34754+#include <asm/cache.h>
34755+
34756+#include <xen/interface/elfnote.h>
34757+
34758+ .text
34759+ .code64
34760+#define VIRT_ENTRY_OFFSET 0x0
34761+.org VIRT_ENTRY_OFFSET
34762+ .globl startup_64
34763+startup_64:
34764+ENTRY(_start)
34765+ movq $(init_thread_union+THREAD_SIZE-8),%rsp
34766+ /* zero EFLAGS after setting rsp */
34767+ pushq $0
34768+ popfq
34769+
34770+ /* rsi is pointer to startup info structure.
34771+ pass it to C */
34772+ movq %rsi,%rdi
34773+ jmp x86_64_start_kernel
34774+
34775+ENTRY(stext)
34776+ENTRY(_stext)
34777+
34778+ $page = 0
34779+#define NEXT_PAGE(name) \
34780+ $page = $page + 1; \
34781+ .org $page * 0x1000; \
34782+ phys_/**/name = $page * 0x1000 + __PHYSICAL_START; \
34783+ENTRY(name)
34784+
34785+NEXT_PAGE(init_level4_pgt)
34786+ /* This gets initialized in x86_64_start_kernel */
34787+ .fill 512,8,0
34788+
34789+ /*
34790+ * We update two pgd entries to make kernel and user pgd consistent
34791+ * at pgd_populate(). It can be used for kernel modules. So we place
34792+ * this page here for those cases to avoid memory corruption.
34793+ * We also use this page to establish the initial mapping for
34794+ * vsyscall area.
34795+ */
34796+NEXT_PAGE(init_level4_user_pgt)
34797+ .fill 512,8,0
34798+
34799+NEXT_PAGE(level3_kernel_pgt)
34800+ .fill 512,8,0
34801+
34802+ /*
34803+ * This is used for vsyscall area mapping as we have a different
34804+ * level4 page table for user.
34805+ */
34806+NEXT_PAGE(level3_user_pgt)
34807+ .fill 512,8,0
34808+
34809+NEXT_PAGE(level2_kernel_pgt)
34810+ .fill 512,8,0
34811+
34812+NEXT_PAGE(empty_zero_page)
34813+ .skip PAGE_SIZE
34814+
34815+NEXT_PAGE(hypercall_page)
34816+ .fill 512,8,0
34817+
34818+#undef NEXT_PAGE
34819+
34820+ .data
34821+
34822+ .align 16
34823+ .globl cpu_gdt_descr
34824+cpu_gdt_descr:
34825+ .word gdt_end-cpu_gdt_table
34826+gdt:
34827+ .quad cpu_gdt_table
34828+#ifdef CONFIG_SMP
34829+ .rept NR_CPUS-1
34830+ .word 0
34831+ .quad 0
34832+ .endr
34833+#endif
34834+
34835+/* We need valid kernel segments for data and code in long mode too
34836+ * IRET will check the segment types kkeil 2000/10/28
34837+ * Also sysret mandates a special GDT layout
34838+ */
34839+
34840+ .section .data.page_aligned, "aw"
34841+ .align PAGE_SIZE
34842+
34843+/* The TLS descriptors are currently at a different place compared to i386.
34844+ Hopefully nobody expects them at a fixed place (Wine?) */
34845+
34846+ENTRY(cpu_gdt_table)
34847+ .quad 0x0000000000000000 /* NULL descriptor */
34848+ .quad 0x0 /* unused */
34849+ .quad 0x00af9a000000ffff /* __KERNEL_CS */
34850+ .quad 0x00cf92000000ffff /* __KERNEL_DS */
34851+ .quad 0x00cffa000000ffff /* __USER32_CS */
34852+ .quad 0x00cff2000000ffff /* __USER_DS, __USER32_DS */
34853+ .quad 0x00affa000000ffff /* __USER_CS */
34854+ .quad 0x00cf9a000000ffff /* __KERNEL32_CS */
34855+ .quad 0,0 /* TSS */
34856+ .quad 0,0 /* LDT */
34857+ .quad 0,0,0 /* three TLS descriptors */
34858+ .quad 0 /* unused */
34859+gdt_end:
34860+ /* asm/segment.h:GDT_ENTRIES must match this */
34861+ /* This should be a multiple of the cache line size */
34862+ /* GDTs of other CPUs are now dynamically allocated */
34863+
34864+ /* zero the remaining page */
34865+ .fill PAGE_SIZE / 8 - GDT_ENTRIES,8,0
34866+
34867+#ifdef CONFIG_XEN_COMPAT_030002
34868+/*
34869+ * __xen_guest information
34870+ */
34871+.macro utoh value
34872+ .if (\value) < 0 || (\value) >= 0x10
34873+ utoh (((\value)>>4)&0x0fffffffffffffff)
34874+ .endif
34875+ .if ((\value) & 0xf) < 10
34876+ .byte '0' + ((\value) & 0xf)
34877+ .else
34878+ .byte 'A' + ((\value) & 0xf) - 10
34879+ .endif
34880+.endm
34881+
34882+.section __xen_guest
34883+ .ascii "GUEST_OS=linux,GUEST_VER=2.6"
34884+ .ascii ",XEN_VER=xen-3.0"
34885+ .ascii ",VIRT_BASE=0x"
34886+ utoh __START_KERNEL_map
34887+ .ascii ",ELF_PADDR_OFFSET=0x"
34888+ utoh __START_KERNEL_map
34889+ .ascii ",VIRT_ENTRY=0x"
34890+ utoh (__START_KERNEL_map + __PHYSICAL_START + VIRT_ENTRY_OFFSET)
34891+ .ascii ",HYPERCALL_PAGE=0x"
34892+ utoh (phys_hypercall_page >> PAGE_SHIFT)
34893+ .ascii ",FEATURES=writable_page_tables"
34894+ .ascii "|writable_descriptor_tables"
34895+ .ascii "|auto_translated_physmap"
34896+ .ascii "|supervisor_mode_kernel"
34897+ .ascii ",LOADER=generic"
34898+ .byte 0
34899+#endif /* CONFIG_XEN_COMPAT_030002 */
34900+
34901+ ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz, "linux")
34902+ ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz, "2.6")
34903+ ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz, "xen-3.0")
34904+ ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, .quad, __START_KERNEL_map)
34905+#ifdef CONFIG_XEN_COMPAT_030002
34906+ ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET, .quad, __START_KERNEL_map)
34907+#else
34908+ ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET, .quad, 0)
34909+#endif /* !CONFIG_XEN_COMPAT_030002 */
34910+ ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .quad, startup_64)
34911+ ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .quad, hypercall_page)
34912+ ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz, "writable_page_tables|writable_descriptor_tables|auto_translated_physmap|pae_pgdir_above_4gb|supervisor_mode_kernel")
34913+ ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz, "generic")
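The utoh macro above turns an assemble-time constant into ASCII hex for the legacy __xen_guest section, recursing on the high nibbles before emitting the low one. A C rendering of the same recursion (a hypothetical helper, only to illustrate the macro):

	/* Sketch: emit the hex digits of v, most significant nibble first,
	 * e.g. 0x1A3 produces '1', 'A', '3'. */
	static void utoh(unsigned long v, char **out)
	{
		if (v >= 0x10)
			utoh(v >> 4, out);
		*(*out)++ = "0123456789ABCDEF"[v & 0xf];
	}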
34914diff -Nur linux-2.6.16.33-noxen/arch/x86_64/kernel/head64-xen.c linux-2.6.16.33/arch/x86_64/kernel/head64-xen.c
34915--- linux-2.6.16.33-noxen/arch/x86_64/kernel/head64-xen.c 1970-01-01 00:00:00.000000000 +0000
34916+++ linux-2.6.16.33/arch/x86_64/kernel/head64-xen.c 2007-01-08 15:00:45.000000000 +0000
34917@@ -0,0 +1,159 @@
34918+/*
34919+ * linux/arch/x86_64/kernel/head64.c -- prepare to run common code
34920+ *
34921+ * Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
34922+ *
34923+ * $Id: head64.c,v 1.22 2001/07/06 14:28:20 ak Exp $
34924+ *
34925+ * Jun Nakajima <jun.nakajima@intel.com>
34926+ * Modified for Xen.
34927+ */
34928+
34929+#include <linux/init.h>
34930+#include <linux/linkage.h>
34931+#include <linux/types.h>
34932+#include <linux/kernel.h>
34933+#include <linux/string.h>
34934+#include <linux/percpu.h>
34935+#include <linux/module.h>
34936+
34937+#include <asm/processor.h>
34938+#include <asm/proto.h>
34939+#include <asm/smp.h>
34940+#include <asm/bootsetup.h>
34941+#include <asm/setup.h>
34942+#include <asm/desc.h>
34943+#include <asm/pgtable.h>
34944+#include <asm/sections.h>
34945+
34946+unsigned long start_pfn;
34947+
34948+/* Don't add a printk in there. printk relies on the PDA which is not initialized
34949+ yet. */
34950+#if 0
34951+static void __init clear_bss(void)
34952+{
34953+ memset(__bss_start, 0,
34954+ (unsigned long) __bss_stop - (unsigned long) __bss_start);
34955+}
34956+#endif
34957+
34958+#define NEW_CL_POINTER 0x228 /* Relative to real mode data */
34959+#define OLD_CL_MAGIC_ADDR 0x90020
34960+#define OLD_CL_MAGIC 0xA33F
34961+#define OLD_CL_BASE_ADDR 0x90000
34962+#define OLD_CL_OFFSET 0x90022
34963+
34964+extern char saved_command_line[];
34965+
34966+static void __init copy_bootdata(char *real_mode_data)
34967+{
34968+#ifndef CONFIG_XEN
34969+ int new_data;
34970+ char * command_line;
34971+
34972+ memcpy(x86_boot_params, real_mode_data, BOOT_PARAM_SIZE);
34973+ new_data = *(int *) (x86_boot_params + NEW_CL_POINTER);
34974+ if (!new_data) {
34975+ if (OLD_CL_MAGIC != * (u16 *) OLD_CL_MAGIC_ADDR) {
34976+ printk("so old bootloader that it does not support commandline?!\n");
34977+ return;
34978+ }
34979+ new_data = OLD_CL_BASE_ADDR + * (u16 *) OLD_CL_OFFSET;
34980+ printk("old bootloader convention, maybe loadlin?\n");
34981+ }
34982+ command_line = (char *) ((u64)(new_data));
34983+ memcpy(saved_command_line, command_line, COMMAND_LINE_SIZE);
34984+#else
34985+ int max_cmdline;
34986+
34987+ if ((max_cmdline = MAX_GUEST_CMDLINE) > COMMAND_LINE_SIZE)
34988+ max_cmdline = COMMAND_LINE_SIZE;
34989+ memcpy(saved_command_line, xen_start_info->cmd_line, max_cmdline);
34990+ saved_command_line[max_cmdline-1] = '\0';
34991+#endif
34992+ printk("Bootdata ok (command line is %s)\n", saved_command_line);
34993+}
34994+
34995+static void __init setup_boot_cpu_data(void)
34996+{
34997+ unsigned int dummy, eax;
34998+
34999+ /* get vendor info */
35000+ cpuid(0, (unsigned int *)&boot_cpu_data.cpuid_level,
35001+ (unsigned int *)&boot_cpu_data.x86_vendor_id[0],
35002+ (unsigned int *)&boot_cpu_data.x86_vendor_id[8],
35003+ (unsigned int *)&boot_cpu_data.x86_vendor_id[4]);
35004+
35005+ /* get cpu type */
35006+ cpuid(1, &eax, &dummy, &dummy,
35007+ (unsigned int *) &boot_cpu_data.x86_capability);
35008+ boot_cpu_data.x86 = (eax >> 8) & 0xf;
35009+ boot_cpu_data.x86_model = (eax >> 4) & 0xf;
35010+ boot_cpu_data.x86_mask = eax & 0xf;
35011+}
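setup_boot_cpu_data() above splits EAX from CPUID leaf 1 with plain shifts; this early path is deliberately simple and leaves extended family/model handling to the later CPU identification code. A worked example of the decode (the EAX value is illustrative):

	/* Sketch: for eax = 0x00000f4a the fields above decode to
	 * family 0xf, model 0x4, stepping 0xa. */
	unsigned int eax      = 0x00000f4a;
	unsigned int family   = (eax >> 8) & 0xf;	/* 0xf */
	unsigned int model    = (eax >> 4) & 0xf;	/* 0x4 */
	unsigned int stepping = eax & 0xf;		/* 0xa */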
35012+
35013+#include <xen/interface/memory.h>
35014+unsigned long *machine_to_phys_mapping;
35015+EXPORT_SYMBOL(machine_to_phys_mapping);
35016+unsigned int machine_to_phys_order;
35017+EXPORT_SYMBOL(machine_to_phys_order);
35018+
35019+void __init x86_64_start_kernel(char * real_mode_data)
35020+{
35021+ struct xen_machphys_mapping mapping;
35022+ unsigned long machine_to_phys_nr_ents;
35023+ char *s;
35024+ int i;
35025+
35026+ xen_start_info = (struct start_info *)real_mode_data;
35027+ if (!xen_feature(XENFEAT_auto_translated_physmap)) {
35028+ phys_to_machine_mapping =
35029+ (unsigned long *)xen_start_info->mfn_list;
35030+ start_pfn = (__pa(xen_start_info->pt_base) >> PAGE_SHIFT) +
35031+ xen_start_info->nr_pt_frames;
35032+ }
35033+
35034+
35035+ machine_to_phys_mapping = (unsigned long *)MACH2PHYS_VIRT_START;
35036+ machine_to_phys_nr_ents = MACH2PHYS_NR_ENTRIES;
35037+ if (HYPERVISOR_memory_op(XENMEM_machphys_mapping, &mapping) == 0) {
35038+ machine_to_phys_mapping = (unsigned long *)mapping.v_start;
35039+ machine_to_phys_nr_ents = mapping.max_mfn + 1;
35040+ }
35041+ while ((1UL << machine_to_phys_order) < machine_to_phys_nr_ents )
35042+ machine_to_phys_order++;
35043+
35044+#if 0
35045+ for (i = 0; i < 256; i++)
35046+ set_intr_gate(i, early_idt_handler);
35047+ asm volatile("lidt %0" :: "m" (idt_descr));
35048+#endif
35049+
35050+ for (i = 0; i < NR_CPUS; i++)
35051+ cpu_pda(i) = &boot_cpu_pda[i];
35052+
35053+ pda_init(0);
35054+ copy_bootdata(real_mode_data);
35055+#ifdef CONFIG_SMP
35056+ cpu_set(0, cpu_online_map);
35057+#endif
35058+ s = strstr(saved_command_line, "earlyprintk=");
35059+ if (s != NULL)
35060+ setup_early_printk(strchr(s, '=') + 1);
35061+#ifdef CONFIG_NUMA
35062+ s = strstr(saved_command_line, "numa=");
35063+ if (s != NULL)
35064+ numa_setup(s+5);
35065+#endif
35066+#ifdef CONFIG_X86_IO_APIC
35067+ if (strstr(saved_command_line, "disableapic"))
35068+ disable_apic = 1;
35069+#endif
35070+ /* You need early console to see that */
35071+ if (__pa_symbol(&_end) >= KERNEL_TEXT_SIZE)
35072+ panic("Kernel too big for kernel mapping\n");
35073+
35074+ setup_boot_cpu_data();
35075+ start_kernel();
35076+}
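x86_64_start_kernel() above sizes the machine-to-physical table by growing machine_to_phys_order until 1 << order covers the entry count, i.e. order = ceil(log2(nr_ents)). The same computation as a standalone sketch:

	/* Sketch mirroring the while loop above: nr_ents = 1 << 20 gives 20,
	 * nr_ents = (1 << 20) + 1 gives 21. */
	static unsigned int m2p_order(unsigned long nr_ents)
	{
		unsigned int order = 0;
		while ((1UL << order) < nr_ents)
			order++;
		return order;
	}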
35077diff -Nur linux-2.6.16.33-noxen/arch/x86_64/kernel/init_task.c linux-2.6.16.33/arch/x86_64/kernel/init_task.c
35078--- linux-2.6.16.33-noxen/arch/x86_64/kernel/init_task.c 2006-11-22 18:06:31.000000000 +0000
35079+++ linux-2.6.16.33/arch/x86_64/kernel/init_task.c 2007-01-08 15:00:45.000000000 +0000
35080@@ -37,6 +37,8 @@
35081 struct task_struct init_task = INIT_TASK(init_task);
35082
35083 EXPORT_SYMBOL(init_task);
35084+
35085+#ifndef CONFIG_X86_NO_TSS
35086 /*
35087 * per-CPU TSS segments. Threads are completely 'soft' on Linux,
35088 * no more per-task TSS's. The TSS size is kept cacheline-aligned
35089@@ -45,5 +47,6 @@
35090 * on exact cacheline boundaries, to eliminate cacheline ping-pong.
35091 */
35092 DEFINE_PER_CPU(struct tss_struct, init_tss) ____cacheline_internodealigned_in_smp = INIT_TSS;
35093+#endif
35094
35095 #define ALIGN_TO_4K __attribute__((section(".data.init_task")))
35096diff -Nur linux-2.6.16.33-noxen/arch/x86_64/kernel/io_apic-xen.c linux-2.6.16.33/arch/x86_64/kernel/io_apic-xen.c
35097--- linux-2.6.16.33-noxen/arch/x86_64/kernel/io_apic-xen.c 1970-01-01 00:00:00.000000000 +0000
35098+++ linux-2.6.16.33/arch/x86_64/kernel/io_apic-xen.c 2007-01-08 15:00:45.000000000 +0000
35099@@ -0,0 +1,2226 @@
35100+/*
35101+ * Intel IO-APIC support for multi-Pentium hosts.
35102+ *
35103+ * Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar, Hajnalka Szabo
35104+ *
35105+ * Many thanks to Stig Venaas for trying out countless experimental
35106+ * patches and reporting/debugging problems patiently!
35107+ *
35108+ * (c) 1999, Multiple IO-APIC support, developed by
35109+ * Ken-ichi Yaku <yaku@css1.kbnes.nec.co.jp> and
35110+ * Hidemi Kishimoto <kisimoto@css1.kbnes.nec.co.jp>,
35111+ * further tested and cleaned up by Zach Brown <zab@redhat.com>
35112+ * and Ingo Molnar <mingo@redhat.com>
35113+ *
35114+ * Fixes
35115+ * Maciej W. Rozycki : Bits for genuine 82489DX APICs;
35116+ * thanks to Eric Gilmore
35117+ * and Rolf G. Tews
35118+ * for testing these extensively
35119+ * Paul Diefenbaugh : Added full ACPI support
35120+ */
35121+
35122+#include <linux/mm.h>
35123+#include <linux/interrupt.h>
35124+#include <linux/init.h>
35125+#include <linux/delay.h>
35126+#include <linux/sched.h>
35127+#include <linux/config.h>
35128+#include <linux/smp_lock.h>
35129+#include <linux/mc146818rtc.h>
35130+#include <linux/acpi.h>
35131+#include <linux/sysdev.h>
35132+#ifdef CONFIG_ACPI
35133+#include <acpi/acpi_bus.h>
35134+#endif
35135+
35136+#include <asm/io.h>
35137+#include <asm/smp.h>
35138+#include <asm/desc.h>
35139+#include <asm/proto.h>
35140+#include <asm/mach_apic.h>
35141+#include <asm/acpi.h>
35142+#include <asm/dma.h>
35143+
35144+#define __apicdebuginit __init
35145+
35146+int sis_apic_bug; /* not actually supported, dummy for compile */
35147+
35148+static int no_timer_check;
35149+
35150+int disable_timer_pin_1 __initdata;
35151+
35152+#ifndef CONFIG_XEN
35153+int timer_over_8254 __initdata = 1;
35154+
35155+/* Where if anywhere is the i8259 connect in external int mode */
35156+static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
35157+#endif
35158+
35159+static DEFINE_SPINLOCK(ioapic_lock);
35160+
35161+/*
35162+ * # of IRQ routing registers
35163+ */
35164+int nr_ioapic_registers[MAX_IO_APICS];
35165+
35166+/*
35167+ * Rough estimation of how many shared IRQs there are, can
35168+ * be changed anytime.
35169+ */
35170+#define MAX_PLUS_SHARED_IRQS NR_IRQ_VECTORS
35171+#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS)
35172+
35173+/*
35174+ * This is performance-critical, we want to do it O(1)
35175+ *
35176+ * the indexing order of this array favors 1:1 mappings
35177+ * between pins and IRQs.
35178+ */
35179+
35180+static struct irq_pin_list {
35181+ short apic, pin, next;
35182+} irq_2_pin[PIN_MAP_SIZE];
35183+
35184+int vector_irq[NR_VECTORS] __read_mostly = { [0 ... NR_VECTORS - 1] = -1};
35185+#ifdef CONFIG_PCI_MSI
35186+#define vector_to_irq(vector) \
35187+ (platform_legacy_irq(vector) ? vector : vector_irq[vector])
35188+#else
35189+#define vector_to_irq(vector) (vector)
35190+#endif
35191+
35192+#ifdef CONFIG_XEN
35193+
35194+#include <xen/interface/xen.h>
35195+#include <xen/interface/physdev.h>
35196+
35197+/* Fake i8259 */
35198+#define make_8259A_irq(_irq) (io_apic_irqs &= ~(1UL<<(_irq)))
35199+#define disable_8259A_irq(_irq) ((void)0)
35200+#define i8259A_irq_pending(_irq) (0)
35201+
35202+unsigned long io_apic_irqs;
35203+
35204+static inline unsigned int xen_io_apic_read(unsigned int apic, unsigned int reg)
35205+{
35206+ struct physdev_apic apic_op;
35207+ int ret;
35208+
35209+ apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr;
35210+ apic_op.reg = reg;
35211+ ret = HYPERVISOR_physdev_op(PHYSDEVOP_apic_read, &apic_op);
35212+ if (ret)
35213+ return ret;
35214+ return apic_op.value;
35215+}
35216+
35217+static inline void xen_io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)
35218+{
35219+ struct physdev_apic apic_op;
35220+
35221+ apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr;
35222+ apic_op.reg = reg;
35223+ apic_op.value = value;
35224+ HYPERVISOR_physdev_op(PHYSDEVOP_apic_write, &apic_op);
35225+}
35226+
35227+#define io_apic_read(a,r) xen_io_apic_read(a,r)
35228+#define io_apic_write(a,r,v) xen_io_apic_write(a,r,v)
35229+
35230+#define clear_IO_APIC() ((void)0)
35231+
35232+#else
35233+
35234+#ifdef CONFIG_SMP
35235+static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
35236+{
35237+ unsigned long flags;
35238+ unsigned int dest;
35239+ cpumask_t tmp;
35240+
35241+ cpus_and(tmp, mask, cpu_online_map);
35242+ if (cpus_empty(tmp))
35243+ tmp = TARGET_CPUS;
35244+
35245+ cpus_and(mask, tmp, CPU_MASK_ALL);
35246+
35247+ dest = cpu_mask_to_apicid(mask);
35248+
35249+ /*
35250+ * Only the high 8 bits are valid.
35251+ */
35252+ dest = SET_APIC_LOGICAL_ID(dest);
35253+
35254+ spin_lock_irqsave(&ioapic_lock, flags);
35255+ __DO_ACTION(1, = dest, )
35256+ set_irq_info(irq, mask);
35257+ spin_unlock_irqrestore(&ioapic_lock, flags);
35258+}
35259+#endif
35260+
35261+#endif /* !CONFIG_XEN */
35262+
35263+/*
35264+ * The common case is 1:1 IRQ<->pin mappings. Sometimes there are
35265+ * shared ISA-space IRQs, so we have to support them. We are super
35266+ * fast in the common case, and fast for shared ISA-space IRQs.
35267+ */
35268+static void add_pin_to_irq(unsigned int irq, int apic, int pin)
35269+{
35270+ static int first_free_entry = NR_IRQS;
35271+ struct irq_pin_list *entry = irq_2_pin + irq;
35272+
35273+ BUG_ON(irq >= NR_IRQS);
35274+ while (entry->next)
35275+ entry = irq_2_pin + entry->next;
35276+
35277+ if (entry->pin != -1) {
35278+ entry->next = first_free_entry;
35279+ entry = irq_2_pin + entry->next;
35280+ if (++first_free_entry >= PIN_MAP_SIZE)
35281+ panic("io_apic.c: ran out of irq_2_pin entries!");
35282+ }
35283+ entry->apic = apic;
35284+ entry->pin = pin;
35285+}
35286+
35287+#ifndef CONFIG_XEN
35288+#define __DO_ACTION(R, ACTION, FINAL) \
35289+ \
35290+{ \
35291+ int pin; \
35292+ struct irq_pin_list *entry = irq_2_pin + irq; \
35293+ \
35294+ BUG_ON(irq >= NR_IRQS); \
35295+ for (;;) { \
35296+ unsigned int reg; \
35297+ pin = entry->pin; \
35298+ if (pin == -1) \
35299+ break; \
35300+ reg = io_apic_read(entry->apic, 0x10 + R + pin*2); \
35301+ reg ACTION; \
35302+ io_apic_modify(entry->apic, reg); \
35303+ if (!entry->next) \
35304+ break; \
35305+ entry = irq_2_pin + entry->next; \
35306+ } \
35307+ FINAL; \
35308+}
35309+
35310+#define DO_ACTION(name,R,ACTION, FINAL) \
35311+ \
35312+ static void name##_IO_APIC_irq (unsigned int irq) \
35313+ __DO_ACTION(R, ACTION, FINAL)
35314+
35315+DO_ACTION( __mask, 0, |= 0x00010000, io_apic_sync(entry->apic) )
35316+ /* mask = 1 */
35317+DO_ACTION( __unmask, 0, &= 0xfffeffff, )
35318+ /* mask = 0 */
35319+
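The two DO_ACTION() invocations above expand, via __DO_ACTION, into __mask_IO_APIC_irq() and __unmask_IO_APIC_irq(). For the mask case the generated body is roughly equivalent to this open-coded sketch (same file context assumed):

	/* Sketch of the expansion: walk the irq_2_pin chain for this IRQ and
	 * set bit 16 (the mask bit) in the low dword of each redirection
	 * entry, then sync the last IO-APIC touched. */
	static void mask_sketch(unsigned int irq)
	{
		struct irq_pin_list *entry = irq_2_pin + irq;

		for (;;) {
			unsigned int reg;
			if (entry->pin == -1)
				break;
			reg = io_apic_read(entry->apic, 0x10 + entry->pin * 2);
			io_apic_modify(entry->apic, reg | 0x00010000);
			if (!entry->next)
				break;
			entry = irq_2_pin + entry->next;
		}
		io_apic_sync(entry->apic);
	}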
35320+static void mask_IO_APIC_irq (unsigned int irq)
35321+{
35322+ unsigned long flags;
35323+
35324+ spin_lock_irqsave(&ioapic_lock, flags);
35325+ __mask_IO_APIC_irq(irq);
35326+ spin_unlock_irqrestore(&ioapic_lock, flags);
35327+}
35328+
35329+static void unmask_IO_APIC_irq (unsigned int irq)
35330+{
35331+ unsigned long flags;
35332+
35333+ spin_lock_irqsave(&ioapic_lock, flags);
35334+ __unmask_IO_APIC_irq(irq);
35335+ spin_unlock_irqrestore(&ioapic_lock, flags);
35336+}
35337+
35338+static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
35339+{
35340+ struct IO_APIC_route_entry entry;
35341+ unsigned long flags;
35342+
35343+ /* Check delivery_mode to be sure we're not clearing an SMI pin */
35344+ spin_lock_irqsave(&ioapic_lock, flags);
35345+ *(((int*)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin);
35346+ *(((int*)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin);
35347+ spin_unlock_irqrestore(&ioapic_lock, flags);
35348+ if (entry.delivery_mode == dest_SMI)
35349+ return;
35350+ /*
35351+ * Disable it in the IO-APIC irq-routing table:
35352+ */
35353+ memset(&entry, 0, sizeof(entry));
35354+ entry.mask = 1;
35355+ spin_lock_irqsave(&ioapic_lock, flags);
35356+ io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry) + 0));
35357+ io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry) + 1));
35358+ spin_unlock_irqrestore(&ioapic_lock, flags);
35359+}
35360+
35361+static void clear_IO_APIC (void)
35362+{
35363+ int apic, pin;
35364+
35365+ for (apic = 0; apic < nr_ioapics; apic++)
35366+ for (pin = 0; pin < nr_ioapic_registers[apic]; pin++)
35367+ clear_IO_APIC_pin(apic, pin);
35368+}
35369+
35370+#endif /* !CONFIG_XEN */
35371+
35372+static u8 gsi_2_irq[NR_IRQ_VECTORS] = { [0 ... NR_IRQ_VECTORS-1] = 0xFF };
35373+
35374+/*
35375+ * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to
35376+ * specific CPU-side IRQs.
35377+ */
35378+
35379+#define MAX_PIRQS 8
35380+static int pirq_entries [MAX_PIRQS];
35381+static int pirqs_enabled;
35382+int skip_ioapic_setup;
35383+int ioapic_force;
35384+
35385+/* dummy parsing: see setup.c */
35386+
35387+static int __init disable_ioapic_setup(char *str)
35388+{
35389+ skip_ioapic_setup = 1;
35390+ return 1;
35391+}
35392+
35393+static int __init enable_ioapic_setup(char *str)
35394+{
35395+ ioapic_force = 1;
35396+ skip_ioapic_setup = 0;
35397+ return 1;
35398+}
35399+
35400+__setup("noapic", disable_ioapic_setup);
35401+__setup("apic", enable_ioapic_setup);
35402+
35403+#ifndef CONFIG_XEN
35404+static int __init setup_disable_8254_timer(char *s)
35405+{
35406+ timer_over_8254 = -1;
35407+ return 1;
35408+}
35409+static int __init setup_enable_8254_timer(char *s)
35410+{
35411+ timer_over_8254 = 2;
35412+ return 1;
35413+}
35414+
35415+__setup("disable_8254_timer", setup_disable_8254_timer);
35416+__setup("enable_8254_timer", setup_enable_8254_timer);
35417+#endif /* !CONFIG_XEN */
35418+
35419+#include <asm/pci-direct.h>
35420+#include <linux/pci_ids.h>
35421+#include <linux/pci.h>
35422+
35423+/* Temporary Hack. Nvidia and VIA boards currently only work with IO-APIC
35424+ off. Check for an Nvidia or VIA PCI bridge and turn it off.
35425+ Use pci direct infrastructure because this runs before the PCI subsystem.
35426+
35427+ Can be overwritten with "apic"
35428+
35429+ And another hack to disable the IOMMU on VIA chipsets.
35430+
35431+ ... and others. Really should move this somewhere else.
35432+
35433+ Kludge-O-Rama. */
35434+void __init check_ioapic(void)
35435+{
35436+ int num,slot,func;
35437+ /* Poor man's PCI discovery */
35438+ for (num = 0; num < 32; num++) {
35439+ for (slot = 0; slot < 32; slot++) {
35440+ for (func = 0; func < 8; func++) {
35441+ u32 class;
35442+ u32 vendor;
35443+ u8 type;
35444+ class = read_pci_config(num,slot,func,
35445+ PCI_CLASS_REVISION);
35446+ if (class == 0xffffffff)
35447+ break;
35448+
35449+ if ((class >> 16) != PCI_CLASS_BRIDGE_PCI)
35450+ continue;
35451+
35452+ vendor = read_pci_config(num, slot, func,
35453+ PCI_VENDOR_ID);
35454+ vendor &= 0xffff;
35455+ switch (vendor) {
35456+ case PCI_VENDOR_ID_VIA:
35457+#ifdef CONFIG_GART_IOMMU
35458+ if ((end_pfn > MAX_DMA32_PFN ||
35459+ force_iommu) &&
35460+ !iommu_aperture_allowed) {
35461+ printk(KERN_INFO
35462+ "Looks like a VIA chipset. Disabling IOMMU. Overwrite with \"iommu=allowed\"\n");
35463+ iommu_aperture_disabled = 1;
35464+ }
35465+#endif
35466+ return;
35467+ case PCI_VENDOR_ID_NVIDIA:
35468+#ifdef CONFIG_ACPI
35469+ /* All timer overrides on Nvidia
35470+ seem to be wrong. Skip them. */
35471+ acpi_skip_timer_override = 1;
35472+ printk(KERN_INFO
35473+ "Nvidia board detected. Ignoring ACPI timer override.\n");
35474+#endif
35475+ /* RED-PEN skip them on mptables too? */
35476+ return;
35477+ case PCI_VENDOR_ID_ATI:
35478+
35479+ /* This should be actually default, but
35480+ for 2.6.16 let's do it for ATI only where
35481+ it's really needed. */
35482+#ifndef CONFIG_XEN
35483+ if (timer_over_8254 == 1) {
35484+ timer_over_8254 = 0;
35485+ printk(KERN_INFO
35486+ "ATI board detected. Disabling timer routing over 8254.\n");
35487+ }
35488+#endif
35489+ return;
35490+ }
35491+
35492+
35493+ /* No multi-function device? */
35494+ type = read_pci_config_byte(num,slot,func,
35495+ PCI_HEADER_TYPE);
35496+ if (!(type & 0x80))
35497+ break;
35498+ }
35499+ }
35500+ }
35501+}
35502+
35503+static int __init ioapic_pirq_setup(char *str)
35504+{
35505+ int i, max;
35506+ int ints[MAX_PIRQS+1];
35507+
35508+ get_options(str, ARRAY_SIZE(ints), ints);
35509+
35510+ for (i = 0; i < MAX_PIRQS; i++)
35511+ pirq_entries[i] = -1;
35512+
35513+ pirqs_enabled = 1;
35514+ apic_printk(APIC_VERBOSE, "PIRQ redirection, working around broken MP-BIOS.\n");
35515+ max = MAX_PIRQS;
35516+ if (ints[0] < MAX_PIRQS)
35517+ max = ints[0];
35518+
35519+ for (i = 0; i < max; i++) {
35520+ apic_printk(APIC_VERBOSE, "... PIRQ%d -> IRQ %d\n", i, ints[i+1]);
35521+ /*
35522+ * PIRQs are mapped upside down, usually.
35523+ */
35524+ pirq_entries[MAX_PIRQS-i-1] = ints[i+1];
35525+ }
35526+ return 1;
35527+}
35528+
35529+__setup("pirq=", ioapic_pirq_setup);
35530+
35531+/*
35532+ * Find the IRQ entry number of a certain pin.
35533+ */
35534+static int find_irq_entry(int apic, int pin, int type)
35535+{
35536+ int i;
35537+
35538+ for (i = 0; i < mp_irq_entries; i++)
35539+ if (mp_irqs[i].mpc_irqtype == type &&
35540+ (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mpc_apicid ||
35541+ mp_irqs[i].mpc_dstapic == MP_APIC_ALL) &&
35542+ mp_irqs[i].mpc_dstirq == pin)
35543+ return i;
35544+
35545+ return -1;
35546+}
35547+
35548+#ifndef CONFIG_XEN
35549+/*
35550+ * Find the pin to which IRQ[irq] (ISA) is connected
35551+ */
35552+static int __init find_isa_irq_pin(int irq, int type)
35553+{
35554+ int i;
35555+
35556+ for (i = 0; i < mp_irq_entries; i++) {
35557+ int lbus = mp_irqs[i].mpc_srcbus;
35558+
35559+ if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA ||
35560+ mp_bus_id_to_type[lbus] == MP_BUS_EISA ||
35561+ mp_bus_id_to_type[lbus] == MP_BUS_MCA) &&
35562+ (mp_irqs[i].mpc_irqtype == type) &&
35563+ (mp_irqs[i].mpc_srcbusirq == irq))
35564+
35565+ return mp_irqs[i].mpc_dstirq;
35566+ }
35567+ return -1;
35568+}
35569+
35570+static int __init find_isa_irq_apic(int irq, int type)
35571+{
35572+ int i;
35573+
35574+ for (i = 0; i < mp_irq_entries; i++) {
35575+ int lbus = mp_irqs[i].mpc_srcbus;
35576+
35577+ if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA ||
35578+ mp_bus_id_to_type[lbus] == MP_BUS_EISA ||
35579+ mp_bus_id_to_type[lbus] == MP_BUS_MCA) &&
35580+ (mp_irqs[i].mpc_irqtype == type) &&
35581+ (mp_irqs[i].mpc_srcbusirq == irq))
35582+ break;
35583+ }
35584+ if (i < mp_irq_entries) {
35585+ int apic;
35586+ for(apic = 0; apic < nr_ioapics; apic++) {
35587+ if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic)
35588+ return apic;
35589+ }
35590+ }
35591+
35592+ return -1;
35593+}
35594+#endif
35595+
35596+/*
35597+ * Find a specific PCI IRQ entry.
35598+ * Not an __init, possibly needed by modules
35599+ */
35600+static int pin_2_irq(int idx, int apic, int pin);
35601+
35602+int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
35603+{
35604+ int apic, i, best_guess = -1;
35605+
35606+ apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n",
35607+ bus, slot, pin);
35608+ if (mp_bus_id_to_pci_bus[bus] == -1) {
35609+ apic_printk(APIC_VERBOSE, "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
35610+ return -1;
35611+ }
35612+ for (i = 0; i < mp_irq_entries; i++) {
35613+ int lbus = mp_irqs[i].mpc_srcbus;
35614+
35615+ for (apic = 0; apic < nr_ioapics; apic++)
35616+ if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic ||
35617+ mp_irqs[i].mpc_dstapic == MP_APIC_ALL)
35618+ break;
35619+
35620+ if ((mp_bus_id_to_type[lbus] == MP_BUS_PCI) &&
35621+ !mp_irqs[i].mpc_irqtype &&
35622+ (bus == lbus) &&
35623+ (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) {
35624+ int irq = pin_2_irq(i,apic,mp_irqs[i].mpc_dstirq);
35625+
35626+ if (!(apic || IO_APIC_IRQ(irq)))
35627+ continue;
35628+
35629+ if (pin == (mp_irqs[i].mpc_srcbusirq & 3))
35630+ return irq;
35631+ /*
35632+ * Use the first all-but-pin matching entry as a
35633+ * best-guess fuzzy result for broken mptables.
35634+ */
35635+ if (best_guess < 0)
35636+ best_guess = irq;
35637+ }
35638+ }
35639+ BUG_ON(best_guess >= NR_IRQS);
35640+ return best_guess;
35641+}
35642+
35643+/*
35644+ * EISA Edge/Level control register, ELCR
35645+ */
35646+static int EISA_ELCR(unsigned int irq)
35647+{
35648+ if (irq < 16) {
35649+ unsigned int port = 0x4d0 + (irq >> 3);
35650+ return (inb(port) >> (irq & 7)) & 1;
35651+ }
35652+ apic_printk(APIC_VERBOSE, "Broken MPtable reports ISA irq %d\n", irq);
35653+ return 0;
35654+}
35655+
35656+/* EISA interrupts are always polarity zero and can be edge or level
35657+ * trigger depending on the ELCR value. If an interrupt is listed as
35658+ * EISA conforming in the MP table, that means its trigger type must
35659+ * be read in from the ELCR */
35660+
35661+#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mpc_srcbusirq))
35662+#define default_EISA_polarity(idx) (0)
35663+
35664+/* ISA interrupts are always polarity zero edge triggered,
35665+ * when listed as conforming in the MP table. */
35666+
35667+#define default_ISA_trigger(idx) (0)
35668+#define default_ISA_polarity(idx) (0)
35669+
35670+/* PCI interrupts are always polarity one level triggered,
35671+ * when listed as conforming in the MP table. */
35672+
35673+#define default_PCI_trigger(idx) (1)
35674+#define default_PCI_polarity(idx) (1)
35675+
35676+/* MCA interrupts are always polarity zero level triggered,
35677+ * when listed as conforming in the MP table. */
35678+
35679+#define default_MCA_trigger(idx) (1)
35680+#define default_MCA_polarity(idx) (0)
35681+
35682+static int __init MPBIOS_polarity(int idx)
35683+{
35684+ int bus = mp_irqs[idx].mpc_srcbus;
35685+ int polarity;
35686+
35687+ /*
35688+ * Determine IRQ line polarity (high active or low active):
35689+ */
35690+ switch (mp_irqs[idx].mpc_irqflag & 3)
35691+ {
35692+ case 0: /* conforms, ie. bus-type dependent polarity */
35693+ {
35694+ switch (mp_bus_id_to_type[bus])
35695+ {
35696+ case MP_BUS_ISA: /* ISA pin */
35697+ {
35698+ polarity = default_ISA_polarity(idx);
35699+ break;
35700+ }
35701+ case MP_BUS_EISA: /* EISA pin */
35702+ {
35703+ polarity = default_EISA_polarity(idx);
35704+ break;
35705+ }
35706+ case MP_BUS_PCI: /* PCI pin */
35707+ {
35708+ polarity = default_PCI_polarity(idx);
35709+ break;
35710+ }
35711+ case MP_BUS_MCA: /* MCA pin */
35712+ {
35713+ polarity = default_MCA_polarity(idx);
35714+ break;
35715+ }
35716+ default:
35717+ {
35718+ printk(KERN_WARNING "broken BIOS!!\n");
35719+ polarity = 1;
35720+ break;
35721+ }
35722+ }
35723+ break;
35724+ }
35725+ case 1: /* high active */
35726+ {
35727+ polarity = 0;
35728+ break;
35729+ }
35730+ case 2: /* reserved */
35731+ {
35732+ printk(KERN_WARNING "broken BIOS!!\n");
35733+ polarity = 1;
35734+ break;
35735+ }
35736+ case 3: /* low active */
35737+ {
35738+ polarity = 1;
35739+ break;
35740+ }
35741+ default: /* invalid */
35742+ {
35743+ printk(KERN_WARNING "broken BIOS!!\n");
35744+ polarity = 1;
35745+ break;
35746+ }
35747+ }
35748+ return polarity;
35749+}
35750+
35751+static int MPBIOS_trigger(int idx)
35752+{
35753+ int bus = mp_irqs[idx].mpc_srcbus;
35754+ int trigger;
35755+
35756+ /*
35757+ * Determine IRQ trigger mode (edge or level sensitive):
35758+ */
35759+ switch ((mp_irqs[idx].mpc_irqflag>>2) & 3)
35760+ {
35761+ case 0: /* conforms, ie. bus-type dependent */
35762+ {
35763+ switch (mp_bus_id_to_type[bus])
35764+ {
35765+ case MP_BUS_ISA: /* ISA pin */
35766+ {
35767+ trigger = default_ISA_trigger(idx);
35768+ break;
35769+ }
35770+ case MP_BUS_EISA: /* EISA pin */
35771+ {
35772+ trigger = default_EISA_trigger(idx);
35773+ break;
35774+ }
35775+ case MP_BUS_PCI: /* PCI pin */
35776+ {
35777+ trigger = default_PCI_trigger(idx);
35778+ break;
35779+ }
35780+ case MP_BUS_MCA: /* MCA pin */
35781+ {
35782+ trigger = default_MCA_trigger(idx);
35783+ break;
35784+ }
35785+ default:
35786+ {
35787+ printk(KERN_WARNING "broken BIOS!!\n");
35788+ trigger = 1;
35789+ break;
35790+ }
35791+ }
35792+ break;
35793+ }
35794+ case 1: /* edge */
35795+ {
35796+ trigger = 0;
35797+ break;
35798+ }
35799+ case 2: /* reserved */
35800+ {
35801+ printk(KERN_WARNING "broken BIOS!!\n");
35802+ trigger = 1;
35803+ break;
35804+ }
35805+ case 3: /* level */
35806+ {
35807+ trigger = 1;
35808+ break;
35809+ }
35810+ default: /* invalid */
35811+ {
35812+ printk(KERN_WARNING "broken BIOS!!\n");
35813+ trigger = 0;
35814+ break;
35815+ }
35816+ }
35817+ return trigger;
35818+}
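MPBIOS_polarity() and MPBIOS_trigger() above decode the same MP-table flag word: bits 1:0 carry the polarity and bits 3:2 the trigger mode, with 0 meaning "conforms to the bus default" (hence the per-bus-type defaults). A compact sketch of the decode (the flag value is illustrative):

	/* Sketch: irqflag = 0x000f means polarity 3 (low active) and
	 * trigger 3 (level), matching the case labels above. */
	unsigned short irqflag = 0x000f;
	int polarity_bits = irqflag & 3;	/* 0 conform, 1 high, 2 resvd, 3 low */
	int trigger_bits  = (irqflag >> 2) & 3;	/* 0 conform, 1 edge, 2 resvd, 3 level */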
35819+
35820+static inline int irq_polarity(int idx)
35821+{
35822+ return MPBIOS_polarity(idx);
35823+}
35824+
35825+static inline int irq_trigger(int idx)
35826+{
35827+ return MPBIOS_trigger(idx);
35828+}
35829+
35830+static int next_irq = 16;
35831+
35832+/*
35833+ * gsi_irq_sharing -- Name overload! "irq" can be either a legacy IRQ
35834+ * in the range 0-15, a linux IRQ in the range 0-223, or a GSI number
35835+ * from ACPI, which can reach 800 in large boxen.
35836+ *
35837+ * Compact the sparse GSI space into a sequential IRQ series and reuse
35838+ * vectors if possible.
35839+ */
35840+int gsi_irq_sharing(int gsi)
35841+{
35842+ int i, tries, vector;
35843+
35844+ BUG_ON(gsi >= NR_IRQ_VECTORS);
35845+
35846+ if (platform_legacy_irq(gsi))
35847+ return gsi;
35848+
35849+ if (gsi_2_irq[gsi] != 0xFF)
35850+ return (int)gsi_2_irq[gsi];
35851+
35852+ tries = NR_IRQS;
35853+ try_again:
35854+ vector = assign_irq_vector(gsi);
35855+
35856+ /*
35857+ * Sharing vectors means sharing IRQs, so scan irq_vectors for previous
35858+ * use of vector and if found, return that IRQ. However, we never want
35859+ * to share legacy IRQs, which usually have a different trigger mode
35860+ * than PCI.
35861+ */
35862+ for (i = 0; i < NR_IRQS; i++)
35863+ if (IO_APIC_VECTOR(i) == vector)
35864+ break;
35865+ if (platform_legacy_irq(i)) {
35866+ if (--tries >= 0) {
35867+ IO_APIC_VECTOR(i) = 0;
35868+ goto try_again;
35869+ }
35870+ panic("gsi_irq_sharing: didn't find an IRQ using vector 0x%02X for GSI %d", vector, gsi);
35871+ }
35872+ if (i < NR_IRQS) {
35873+ gsi_2_irq[gsi] = i;
35874+ printk(KERN_INFO "GSI %d sharing vector 0x%02X and IRQ %d\n",
35875+ gsi, vector, i);
35876+ return i;
35877+ }
35878+
35879+ i = next_irq++;
35880+ BUG_ON(i >= NR_IRQS);
35881+ gsi_2_irq[gsi] = i;
35882+ IO_APIC_VECTOR(i) = vector;
35883+ printk(KERN_INFO "GSI %d assigned vector 0x%02X and IRQ %d\n",
35884+ gsi, vector, i);
35885+ return i;
35886+}
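gsi_irq_sharing() above compacts the sparse ACPI GSI space: legacy GSIs 0-15 map to themselves, a GSI seen before is answered from the gsi_2_irq[] cache, and a new GSI either piggybacks on an existing IRQ whose vector it can share or takes the next free IRQ starting at next_irq (16). A usage sketch with illustrative values:

	/* Sketch: the first lookup of a high GSI allocates an IRQ, later
	 * lookups hit the cache, and legacy GSIs pass straight through. */
	int a = gsi_irq_sharing(40);	/* e.g. 16 on first use */
	int b = gsi_irq_sharing(40);	/* same value, from gsi_2_irq[] */
	int c = gsi_irq_sharing(5);	/* legacy GSI: returns 5 */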
35887+
35888+static int pin_2_irq(int idx, int apic, int pin)
35889+{
35890+ int irq, i;
35891+ int bus = mp_irqs[idx].mpc_srcbus;
35892+
35893+ /*
35894+ * Debugging check, we are in big trouble if this message pops up!
35895+ */
35896+ if (mp_irqs[idx].mpc_dstirq != pin)
35897+ printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n");
35898+
35899+ switch (mp_bus_id_to_type[bus])
35900+ {
35901+ case MP_BUS_ISA: /* ISA pin */
35902+ case MP_BUS_EISA:
35903+ case MP_BUS_MCA:
35904+ {
35905+ irq = mp_irqs[idx].mpc_srcbusirq;
35906+ break;
35907+ }
35908+ case MP_BUS_PCI: /* PCI pin */
35909+ {
35910+ /*
35911+ * PCI IRQs are mapped in order
35912+ */
35913+ i = irq = 0;
35914+ while (i < apic)
35915+ irq += nr_ioapic_registers[i++];
35916+ irq += pin;
35917+ irq = gsi_irq_sharing(irq);
35918+ break;
35919+ }
35920+ default:
35921+ {
35922+ printk(KERN_ERR "unknown bus type %d.\n",bus);
35923+ irq = 0;
35924+ break;
35925+ }
35926+ }
35927+ BUG_ON(irq >= NR_IRQS);
35928+
35929+ /*
35930+ * PCI IRQ command line redirection. Yes, limits are hardcoded.
35931+ */
35932+ if ((pin >= 16) && (pin <= 23)) {
35933+ if (pirq_entries[pin-16] != -1) {
35934+ if (!pirq_entries[pin-16]) {
35935+ apic_printk(APIC_VERBOSE, "disabling PIRQ%d\n", pin-16);
35936+ } else {
35937+ irq = pirq_entries[pin-16];
35938+ apic_printk(APIC_VERBOSE, "using PIRQ%d -> IRQ %d\n",
35939+ pin-16, irq);
35940+ }
35941+ }
35942+ }
35943+ BUG_ON(irq >= NR_IRQS);
35944+ return irq;
35945+}
35946+
35947+static inline int IO_APIC_irq_trigger(int irq)
35948+{
35949+ int apic, idx, pin;
35950+
35951+ for (apic = 0; apic < nr_ioapics; apic++) {
35952+ for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
35953+ idx = find_irq_entry(apic,pin,mp_INT);
35954+ if ((idx != -1) && (irq == pin_2_irq(idx,apic,pin)))
35955+ return irq_trigger(idx);
35956+ }
35957+ }
35958+ /*
35959+ * nonexistent IRQs are edge default
35960+ */
35961+ return 0;
35962+}
35963+
35964+/* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. */
35965+u8 irq_vector[NR_IRQ_VECTORS] __read_mostly;
35966+
35967+int assign_irq_vector(int irq)
35968+{
35969+ struct physdev_irq irq_op;
35970+
35971+ BUG_ON(irq != AUTO_ASSIGN && (unsigned)irq >= NR_IRQ_VECTORS);
35972+ if (irq != AUTO_ASSIGN && IO_APIC_VECTOR(irq) > 0)
35973+ return IO_APIC_VECTOR(irq);
35974+
35975+ irq_op.irq = irq;
35976+ if (HYPERVISOR_physdev_op(PHYSDEVOP_alloc_irq_vector, &irq_op))
35977+ return -ENOSPC;
35978+
35979+ vector_irq[irq_op.vector] = irq;
35980+ if (irq != AUTO_ASSIGN)
35981+ IO_APIC_VECTOR(irq) = irq_op.vector;
35982+
35983+ return irq_op.vector;
35984+}
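Under Xen, assign_irq_vector() above delegates vector allocation to the hypervisor: the first call for an IRQ issues PHYSDEVOP_alloc_irq_vector, records the result in vector_irq[] and IO_APIC_VECTOR(irq), and later calls return the cached vector without another hypercall. A usage sketch with illustrative values:

	/* Sketch: repeated calls for the same IRQ are cheap after the first;
	 * a negative return (-ENOSPC) means the hypervisor had no free vector. */
	int vec  = assign_irq_vector(9);	/* hypercall, e.g. returns 0x31 */
	int same = assign_irq_vector(9);	/* cached: same value, no hypercall */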
35985+
35986+extern void (*interrupt[NR_IRQS])(void);
35987+#ifndef CONFIG_XEN
35988+static struct hw_interrupt_type ioapic_level_type;
35989+static struct hw_interrupt_type ioapic_edge_type;
35990+
35991+#define IOAPIC_AUTO -1
35992+#define IOAPIC_EDGE 0
35993+#define IOAPIC_LEVEL 1
35994+
35995+static inline void ioapic_register_intr(int irq, int vector, unsigned long trigger)
35996+{
35997+ if (use_pci_vector() && !platform_legacy_irq(irq)) {
35998+ if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
35999+ trigger == IOAPIC_LEVEL)
36000+ irq_desc[vector].handler = &ioapic_level_type;
36001+ else
36002+ irq_desc[vector].handler = &ioapic_edge_type;
36003+ set_intr_gate(vector, interrupt[vector]);
36004+ } else {
36005+ if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
36006+ trigger == IOAPIC_LEVEL)
36007+ irq_desc[irq].handler = &ioapic_level_type;
36008+ else
36009+ irq_desc[irq].handler = &ioapic_edge_type;
36010+ set_intr_gate(vector, interrupt[irq]);
36011+ }
36012+}
36013+#else
36014+#define ioapic_register_intr(_irq,_vector,_trigger) ((void)0)
36015+#endif /* !CONFIG_XEN */
36016+
36017+static void __init setup_IO_APIC_irqs(void)
36018+{
36019+ struct IO_APIC_route_entry entry;
36020+ int apic, pin, idx, irq, first_notcon = 1, vector;
36021+ unsigned long flags;
36022+
36023+ apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
36024+
36025+ for (apic = 0; apic < nr_ioapics; apic++) {
36026+ for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
36027+
36028+ /*
36029+ * add it to the IO-APIC irq-routing table:
36030+ */
36031+ memset(&entry,0,sizeof(entry));
36032+
36033+ entry.delivery_mode = INT_DELIVERY_MODE;
36034+ entry.dest_mode = INT_DEST_MODE;
36035+ entry.mask = 0; /* enable IRQ */
36036+ entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
36037+
36038+ idx = find_irq_entry(apic,pin,mp_INT);
36039+ if (idx == -1) {
36040+ if (first_notcon) {
36041+ apic_printk(APIC_VERBOSE, KERN_DEBUG " IO-APIC (apicid-pin) %d-%d", mp_ioapics[apic].mpc_apicid, pin);
36042+ first_notcon = 0;
36043+ } else
36044+ apic_printk(APIC_VERBOSE, ", %d-%d", mp_ioapics[apic].mpc_apicid, pin);
36045+ continue;
36046+ }
36047+
36048+ entry.trigger = irq_trigger(idx);
36049+ entry.polarity = irq_polarity(idx);
36050+
36051+ if (irq_trigger(idx)) {
36052+ entry.trigger = 1;
36053+ entry.mask = 1;
36054+ entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
36055+ }
36056+
36057+ irq = pin_2_irq(idx, apic, pin);
36058+ add_pin_to_irq(irq, apic, pin);
36059+
36060+ if (/* !apic && */ !IO_APIC_IRQ(irq))
36061+ continue;
36062+
36063+ if (IO_APIC_IRQ(irq)) {
36064+ vector = assign_irq_vector(irq);
36065+ entry.vector = vector;
36066+
36067+ ioapic_register_intr(irq, vector, IOAPIC_AUTO);
36068+ if (!apic && (irq < 16))
36069+ disable_8259A_irq(irq);
36070+ }
36071+ spin_lock_irqsave(&ioapic_lock, flags);
36072+ io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1));
36073+ io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0));
36074+ set_native_irq_info(irq, TARGET_CPUS);
36075+ spin_unlock_irqrestore(&ioapic_lock, flags);
36076+ }
36077+ }
36078+
36079+ if (!first_notcon)
36080+ apic_printk(APIC_VERBOSE," not connected.\n");
36081+}
36082+
36083+#ifndef CONFIG_XEN
36084+/*
36085+ * Set up the 8259A-master output pin as broadcast to all
36086+ * CPUs.
36087+ */
36088+static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, int vector)
36089+{
36090+ struct IO_APIC_route_entry entry;
36091+ unsigned long flags;
36092+
36093+ memset(&entry,0,sizeof(entry));
36094+
36095+ disable_8259A_irq(0);
36096+
36097+ /* mask LVT0 */
36098+ apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
36099+
36100+ /*
36101+ * We use logical delivery to get the timer IRQ
36102+ * to the first CPU.
36103+ */
36104+ entry.dest_mode = INT_DEST_MODE;
36105+ entry.mask = 0; /* unmask IRQ now */
36106+ entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
36107+ entry.delivery_mode = INT_DELIVERY_MODE;
36108+ entry.polarity = 0;
36109+ entry.trigger = 0;
36110+ entry.vector = vector;
36111+
36112+ /*
36113+ * The timer IRQ doesn't have to know that behind the
36114+ * scene we have a 8259A-master in AEOI mode ...
36115+ */
36116+ irq_desc[0].handler = &ioapic_edge_type;
36117+
36118+ /*
36119+ * Add it to the IO-APIC irq-routing table:
36120+ */
36121+ spin_lock_irqsave(&ioapic_lock, flags);
36122+ io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1));
36123+ io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0));
36124+ spin_unlock_irqrestore(&ioapic_lock, flags);
36125+
36126+ enable_8259A_irq(0);
36127+}
36128+
36129+void __init UNEXPECTED_IO_APIC(void)
36130+{
36131+}
36132+
36133+void __apicdebuginit print_IO_APIC(void)
36134+{
36135+ int apic, i;
36136+ union IO_APIC_reg_00 reg_00;
36137+ union IO_APIC_reg_01 reg_01;
36138+ union IO_APIC_reg_02 reg_02;
36139+ unsigned long flags;
36140+
36141+ if (apic_verbosity == APIC_QUIET)
36142+ return;
36143+
36144+ printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
36145+ for (i = 0; i < nr_ioapics; i++)
36146+ printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
36147+ mp_ioapics[i].mpc_apicid, nr_ioapic_registers[i]);
36148+
36149+ /*
36150+ * We are a bit conservative about what we expect. We have to
36151+ * know about every hardware change ASAP.
36152+ */
36153+ printk(KERN_INFO "testing the IO APIC.......................\n");
36154+
36155+ for (apic = 0; apic < nr_ioapics; apic++) {
36156+
36157+ spin_lock_irqsave(&ioapic_lock, flags);
36158+ reg_00.raw = io_apic_read(apic, 0);
36159+ reg_01.raw = io_apic_read(apic, 1);
36160+ if (reg_01.bits.version >= 0x10)
36161+ reg_02.raw = io_apic_read(apic, 2);
36162+ spin_unlock_irqrestore(&ioapic_lock, flags);
36163+
36164+ printk("\n");
36165+ printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid);
36166+ printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
36167+ printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID);
36168+ if (reg_00.bits.__reserved_1 || reg_00.bits.__reserved_2)
36169+ UNEXPECTED_IO_APIC();
36170+
36171+ printk(KERN_DEBUG ".... register #01: %08X\n", *(int *)&reg_01);
36172+ printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries);
36173+ if ( (reg_01.bits.entries != 0x0f) && /* older (Neptune) boards */
36174+ (reg_01.bits.entries != 0x17) && /* typical ISA+PCI boards */
36175+ (reg_01.bits.entries != 0x1b) && /* Compaq Proliant boards */
36176+ (reg_01.bits.entries != 0x1f) && /* dual Xeon boards */
36177+ (reg_01.bits.entries != 0x22) && /* bigger Xeon boards */
36178+ (reg_01.bits.entries != 0x2E) &&
36179+ (reg_01.bits.entries != 0x3F) &&
36180+ (reg_01.bits.entries != 0x03)
36181+ )
36182+ UNEXPECTED_IO_APIC();
36183+
36184+ printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ);
36185+ printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version);
36186+ if ( (reg_01.bits.version != 0x01) && /* 82489DX IO-APICs */
36187+ (reg_01.bits.version != 0x02) && /* 82801BA IO-APICs (ICH2) */
36188+ (reg_01.bits.version != 0x10) && /* oldest IO-APICs */
36189+ (reg_01.bits.version != 0x11) && /* Pentium/Pro IO-APICs */
36190+ (reg_01.bits.version != 0x13) && /* Xeon IO-APICs */
36191+ (reg_01.bits.version != 0x20) /* Intel P64H (82806 AA) */
36192+ )
36193+ UNEXPECTED_IO_APIC();
36194+ if (reg_01.bits.__reserved_1 || reg_01.bits.__reserved_2)
36195+ UNEXPECTED_IO_APIC();
36196+
36197+ if (reg_01.bits.version >= 0x10) {
36198+ printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw);
36199+ printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration);
36200+ if (reg_02.bits.__reserved_1 || reg_02.bits.__reserved_2)
36201+ UNEXPECTED_IO_APIC();
36202+ }
36203+
36204+ printk(KERN_DEBUG ".... IRQ redirection table:\n");
36205+
36206+ printk(KERN_DEBUG " NR Log Phy Mask Trig IRR Pol"
36207+ " Stat Dest Deli Vect: \n");
36208+
36209+ for (i = 0; i <= reg_01.bits.entries; i++) {
36210+ struct IO_APIC_route_entry entry;
36211+
36212+ spin_lock_irqsave(&ioapic_lock, flags);
36213+ *(((int *)&entry)+0) = io_apic_read(apic, 0x10+i*2);
36214+ *(((int *)&entry)+1) = io_apic_read(apic, 0x11+i*2);
36215+ spin_unlock_irqrestore(&ioapic_lock, flags);
36216+
36217+ printk(KERN_DEBUG " %02x %03X %02X ",
36218+ i,
36219+ entry.dest.logical.logical_dest,
36220+ entry.dest.physical.physical_dest
36221+ );
36222+
36223+ printk("%1d %1d %1d %1d %1d %1d %1d %02X\n",
36224+ entry.mask,
36225+ entry.trigger,
36226+ entry.irr,
36227+ entry.polarity,
36228+ entry.delivery_status,
36229+ entry.dest_mode,
36230+ entry.delivery_mode,
36231+ entry.vector
36232+ );
36233+ }
36234+ }
36235+ if (use_pci_vector())
36236+ printk(KERN_INFO "Using vector-based indexing\n");
36237+ printk(KERN_DEBUG "IRQ to pin mappings:\n");
36238+ for (i = 0; i < NR_IRQS; i++) {
36239+ struct irq_pin_list *entry = irq_2_pin + i;
36240+ if (entry->pin < 0)
36241+ continue;
36242+ if (use_pci_vector() && !platform_legacy_irq(i))
36243+ printk(KERN_DEBUG "IRQ%d ", IO_APIC_VECTOR(i));
36244+ else
36245+ printk(KERN_DEBUG "IRQ%d ", i);
36246+ for (;;) {
36247+ printk("-> %d:%d", entry->apic, entry->pin);
36248+ if (!entry->next)
36249+ break;
36250+ entry = irq_2_pin + entry->next;
36251+ }
36252+ printk("\n");
36253+ }
36254+
36255+ printk(KERN_INFO ".................................... done.\n");
36256+
36257+ return;
36258+}
36259+
36260+#if 0
36261+
36262+static __apicdebuginit void print_APIC_bitfield (int base)
36263+{
36264+ unsigned int v;
36265+ int i, j;
36266+
36267+ if (apic_verbosity == APIC_QUIET)
36268+ return;
36269+
36270+ printk(KERN_DEBUG "0123456789abcdef0123456789abcdef\n" KERN_DEBUG);
36271+ for (i = 0; i < 8; i++) {
36272+ v = apic_read(base + i*0x10);
36273+ for (j = 0; j < 32; j++) {
36274+ if (v & (1<<j))
36275+ printk("1");
36276+ else
36277+ printk("0");
36278+ }
36279+ printk("\n");
36280+ }
36281+}
36282+
36283+void __apicdebuginit print_local_APIC(void * dummy)
36284+{
36285+ unsigned int v, ver, maxlvt;
36286+
36287+ if (apic_verbosity == APIC_QUIET)
36288+ return;
36289+
36290+ printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
36291+ smp_processor_id(), hard_smp_processor_id());
36292+ v = apic_read(APIC_ID);
36293+ printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, GET_APIC_ID(v));
36294+ v = apic_read(APIC_LVR);
36295+ printk(KERN_INFO "... APIC VERSION: %08x\n", v);
36296+ ver = GET_APIC_VERSION(v);
36297+ maxlvt = get_maxlvt();
36298+
36299+ v = apic_read(APIC_TASKPRI);
36300+ printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK);
36301+
36302+ v = apic_read(APIC_ARBPRI);
36303+ printk(KERN_DEBUG "... APIC ARBPRI: %08x (%02x)\n", v,
36304+ v & APIC_ARBPRI_MASK);
36305+ v = apic_read(APIC_PROCPRI);
36306+ printk(KERN_DEBUG "... APIC PROCPRI: %08x\n", v);
36307+
36308+ v = apic_read(APIC_EOI);
36309+ printk(KERN_DEBUG "... APIC EOI: %08x\n", v);
36310+ v = apic_read(APIC_RRR);
36311+ printk(KERN_DEBUG "... APIC RRR: %08x\n", v);
36312+ v = apic_read(APIC_LDR);
36313+ printk(KERN_DEBUG "... APIC LDR: %08x\n", v);
36314+ v = apic_read(APIC_DFR);
36315+ printk(KERN_DEBUG "... APIC DFR: %08x\n", v);
36316+ v = apic_read(APIC_SPIV);
36317+ printk(KERN_DEBUG "... APIC SPIV: %08x\n", v);
36318+
36319+ printk(KERN_DEBUG "... APIC ISR field:\n");
36320+ print_APIC_bitfield(APIC_ISR);
36321+ printk(KERN_DEBUG "... APIC TMR field:\n");
36322+ print_APIC_bitfield(APIC_TMR);
36323+ printk(KERN_DEBUG "... APIC IRR field:\n");
36324+ print_APIC_bitfield(APIC_IRR);
36325+
36326+ v = apic_read(APIC_ESR);
36327+ printk(KERN_DEBUG "... APIC ESR: %08x\n", v);
36328+
36329+ v = apic_read(APIC_ICR);
36330+ printk(KERN_DEBUG "... APIC ICR: %08x\n", v);
36331+ v = apic_read(APIC_ICR2);
36332+ printk(KERN_DEBUG "... APIC ICR2: %08x\n", v);
36333+
36334+ v = apic_read(APIC_LVTT);
36335+ printk(KERN_DEBUG "... APIC LVTT: %08x\n", v);
36336+
36337+ if (maxlvt > 3) { /* PC is LVT#4. */
36338+ v = apic_read(APIC_LVTPC);
36339+ printk(KERN_DEBUG "... APIC LVTPC: %08x\n", v);
36340+ }
36341+ v = apic_read(APIC_LVT0);
36342+ printk(KERN_DEBUG "... APIC LVT0: %08x\n", v);
36343+ v = apic_read(APIC_LVT1);
36344+ printk(KERN_DEBUG "... APIC LVT1: %08x\n", v);
36345+
36346+ if (maxlvt > 2) { /* ERR is LVT#3. */
36347+ v = apic_read(APIC_LVTERR);
36348+ printk(KERN_DEBUG "... APIC LVTERR: %08x\n", v);
36349+ }
36350+
36351+ v = apic_read(APIC_TMICT);
36352+ printk(KERN_DEBUG "... APIC TMICT: %08x\n", v);
36353+ v = apic_read(APIC_TMCCT);
36354+ printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v);
36355+ v = apic_read(APIC_TDCR);
36356+ printk(KERN_DEBUG "... APIC TDCR: %08x\n", v);
36357+ printk("\n");
36358+}
36359+
36360+void print_all_local_APICs (void)
36361+{
36362+ on_each_cpu(print_local_APIC, NULL, 1, 1);
36363+}
36364+
36365+void __apicdebuginit print_PIC(void)
36366+{
36367+ unsigned int v;
36368+ unsigned long flags;
36369+
36370+ if (apic_verbosity == APIC_QUIET)
36371+ return;
36372+
36373+ printk(KERN_DEBUG "\nprinting PIC contents\n");
36374+
36375+ spin_lock_irqsave(&i8259A_lock, flags);
36376+
36377+ v = inb(0xa1) << 8 | inb(0x21);
36378+ printk(KERN_DEBUG "... PIC IMR: %04x\n", v);
36379+
36380+ v = inb(0xa0) << 8 | inb(0x20);
36381+ printk(KERN_DEBUG "... PIC IRR: %04x\n", v);
36382+
36383+ outb(0x0b,0xa0);
36384+ outb(0x0b,0x20);
36385+ v = inb(0xa0) << 8 | inb(0x20);
36386+ outb(0x0a,0xa0);
36387+ outb(0x0a,0x20);
36388+
36389+ spin_unlock_irqrestore(&i8259A_lock, flags);
36390+
36391+ printk(KERN_DEBUG "... PIC ISR: %04x\n", v);
36392+
36393+ v = inb(0x4d1) << 8 | inb(0x4d0);
36394+ printk(KERN_DEBUG "... PIC ELCR: %04x\n", v);
36395+}
36396+
36397+#endif /* 0 */
36398+
36399+#else
36400+void __init print_IO_APIC(void) { }
36401+#endif /* !CONFIG_XEN */
36402+
36403+static void __init enable_IO_APIC(void)
36404+{
36405+ union IO_APIC_reg_01 reg_01;
36406+#ifndef CONFIG_XEN
36407+ int i8259_apic, i8259_pin;
36408+#endif
36409+ int i, apic;
36410+ unsigned long flags;
36411+
36412+ for (i = 0; i < PIN_MAP_SIZE; i++) {
36413+ irq_2_pin[i].pin = -1;
36414+ irq_2_pin[i].next = 0;
36415+ }
36416+ if (!pirqs_enabled)
36417+ for (i = 0; i < MAX_PIRQS; i++)
36418+ pirq_entries[i] = -1;
36419+
36420+ /*
36421+ * The number of IO-APIC IRQ registers (== #pins):
36422+ */
36423+ for (apic = 0; apic < nr_ioapics; apic++) {
36424+ spin_lock_irqsave(&ioapic_lock, flags);
36425+ reg_01.raw = io_apic_read(apic, 1);
36426+ spin_unlock_irqrestore(&ioapic_lock, flags);
36427+ nr_ioapic_registers[apic] = reg_01.bits.entries+1;
36428+ }
36429+#ifndef CONFIG_XEN
36430+ for(apic = 0; apic < nr_ioapics; apic++) {
36431+ int pin;
36432+ /* See if any of the pins is in ExtINT mode */
36433+ for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
36434+ struct IO_APIC_route_entry entry;
36435+ spin_lock_irqsave(&ioapic_lock, flags);
36436+ *(((int *)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin);
36437+ *(((int *)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin);
36438+ spin_unlock_irqrestore(&ioapic_lock, flags);
36439+
36440+
36441+ /* If the interrupt line is enabled and in ExtInt mode
36442+ * I have found the pin where the i8259 is connected.
36443+ */
36444+ if ((entry.mask == 0) && (entry.delivery_mode == dest_ExtINT)) {
36445+ ioapic_i8259.apic = apic;
36446+ ioapic_i8259.pin = pin;
36447+ goto found_i8259;
36448+ }
36449+ }
36450+ }
36451+ found_i8259:
36452+ /* Look to see whether the MP table has reported the ExtINT */
36453+ i8259_pin = find_isa_irq_pin(0, mp_ExtINT);
36454+ i8259_apic = find_isa_irq_apic(0, mp_ExtINT);
36455+ /* Trust the MP table if nothing is setup in the hardware */
36456+ if ((ioapic_i8259.pin == -1) && (i8259_pin >= 0)) {
36457+ printk(KERN_WARNING "ExtINT not setup in hardware but reported by MP table\n");
36458+ ioapic_i8259.pin = i8259_pin;
36459+ ioapic_i8259.apic = i8259_apic;
36460+ }
36461+ /* Complain if the MP table and the hardware disagree */
36462+ if (((ioapic_i8259.apic != i8259_apic) || (ioapic_i8259.pin != i8259_pin)) &&
36463+ (i8259_pin >= 0) && (ioapic_i8259.pin >= 0))
36464+ {
36465+ printk(KERN_WARNING "ExtINT in hardware and MP table differ\n");
36466+ }
36467+#endif
36468+
36469+ /*
36470+ * Do not trust the IO-APIC being empty at bootup
36471+ */
36472+ clear_IO_APIC();
36473+}
36474+
36475+/*
36476+ * Not an __init, needed by the reboot code
36477+ */
36478+void disable_IO_APIC(void)
36479+{
36480+ /*
36481+ * Clear the IO-APIC before rebooting:
36482+ */
36483+ clear_IO_APIC();
36484+
36485+#ifndef CONFIG_XEN
36486+ /*
36487+ * If the i8259 is routed through an IOAPIC
36488+ * Put that IOAPIC in virtual wire mode
36489+ * so legacy interrupts can be delivered.
36490+ */
36491+ if (ioapic_i8259.pin != -1) {
36492+ struct IO_APIC_route_entry entry;
36493+ unsigned long flags;
36494+
36495+ memset(&entry, 0, sizeof(entry));
36496+ entry.mask = 0; /* Enabled */
36497+ entry.trigger = 0; /* Edge */
36498+ entry.irr = 0;
36499+ entry.polarity = 0; /* High */
36500+ entry.delivery_status = 0;
36501+ entry.dest_mode = 0; /* Physical */
36502+ entry.delivery_mode = dest_ExtINT; /* ExtInt */
36503+ entry.vector = 0;
36504+ entry.dest.physical.physical_dest =
36505+ GET_APIC_ID(apic_read(APIC_ID));
36506+
36507+ /*
36508+ * Add it to the IO-APIC irq-routing table:
36509+ */
36510+ spin_lock_irqsave(&ioapic_lock, flags);
36511+ io_apic_write(ioapic_i8259.apic, 0x11+2*ioapic_i8259.pin,
36512+ *(((int *)&entry)+1));
36513+ io_apic_write(ioapic_i8259.apic, 0x10+2*ioapic_i8259.pin,
36514+ *(((int *)&entry)+0));
36515+ spin_unlock_irqrestore(&ioapic_lock, flags);
36516+ }
36517+
36518+ disconnect_bsp_APIC(ioapic_i8259.pin != -1);
36519+#endif
36520+}
36521+
36522+/*
36523+ * function to set the IO-APIC physical IDs based on the
36524+ * values stored in the MPC table.
36525+ *
36526+ * by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999
36527+ */
36528+
36529+#ifndef CONFIG_XEN
36530+static void __init setup_ioapic_ids_from_mpc (void)
36531+{
36532+ union IO_APIC_reg_00 reg_00;
36533+ int apic;
36534+ int i;
36535+ unsigned char old_id;
36536+ unsigned long flags;
36537+
36538+ /*
36539+ * Set the IOAPIC ID to the value stored in the MPC table.
36540+ */
36541+ for (apic = 0; apic < nr_ioapics; apic++) {
36542+
36543+ /* Read the register 0 value */
36544+ spin_lock_irqsave(&ioapic_lock, flags);
36545+ reg_00.raw = io_apic_read(apic, 0);
36546+ spin_unlock_irqrestore(&ioapic_lock, flags);
36547+
36548+ old_id = mp_ioapics[apic].mpc_apicid;
36549+
36550+
36551+ printk(KERN_INFO "Using IO-APIC %d\n", mp_ioapics[apic].mpc_apicid);
36552+
36553+
36554+ /*
36555+ * We need to adjust the IRQ routing table
36556+ * if the ID changed.
36557+ */
36558+ if (old_id != mp_ioapics[apic].mpc_apicid)
36559+ for (i = 0; i < mp_irq_entries; i++)
36560+ if (mp_irqs[i].mpc_dstapic == old_id)
36561+ mp_irqs[i].mpc_dstapic
36562+ = mp_ioapics[apic].mpc_apicid;
36563+
36564+ /*
36565+ * Read the right value from the MPC table and
36566+ * write it into the ID register.
36567+ */
36568+ apic_printk(APIC_VERBOSE,KERN_INFO "...changing IO-APIC physical APIC ID to %d ...",
36569+ mp_ioapics[apic].mpc_apicid);
36570+
36571+ reg_00.bits.ID = mp_ioapics[apic].mpc_apicid;
36572+ spin_lock_irqsave(&ioapic_lock, flags);
36573+ io_apic_write(apic, 0, reg_00.raw);
36574+ spin_unlock_irqrestore(&ioapic_lock, flags);
36575+
36576+ /*
36577+ * Sanity check
36578+ */
36579+ spin_lock_irqsave(&ioapic_lock, flags);
36580+ reg_00.raw = io_apic_read(apic, 0);
36581+ spin_unlock_irqrestore(&ioapic_lock, flags);
36582+ if (reg_00.bits.ID != mp_ioapics[apic].mpc_apicid)
36583+ printk("could not set ID!\n");
36584+ else
36585+ apic_printk(APIC_VERBOSE," ok.\n");
36586+ }
36587+}
36588+#else
36589+static void __init setup_ioapic_ids_from_mpc(void) { }
36590+#endif
36591+
36592+/*
36593+ * There is a nasty bug in some older SMP boards, their mptable lies
36594+ * about the timer IRQ. We do the following to work around the situation:
36595+ *
36596+ * - timer IRQ defaults to IO-APIC IRQ
36597+ * - if this function detects that timer IRQs are defunct, then we fall
36598+ * back to ISA timer IRQs
36599+ */
36600+#ifndef CONFIG_XEN
36601+static int __init timer_irq_works(void)
36602+{
36603+ unsigned long t1 = jiffies;
36604+
36605+ local_irq_enable();
36606+ /* Let ten ticks pass... */
36607+ mdelay((10 * 1000) / HZ);
36608+
36609+ /*
36610+ * Expect a few ticks at least, to be sure some possible
36611+ * glue logic does not lock up after one or two first
36612+ * ticks in a non-ExtINT mode. Also the local APIC
36613+ * might have cached one ExtINT interrupt. Finally, at
36614+ * least one tick may be lost due to delays.
36615+ */
36616+
36617+ /* jiffies wrap? */
36618+ if (jiffies - t1 > 4)
36619+ return 1;
36620+ return 0;
36621+}
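
As a quick check of the arithmetic in timer_irq_works() above, here is a tiny standalone sketch (not part of the patch) showing how long the mdelay((10 * 1000) / HZ) busy-wait lasts for a few common HZ values; in every case it is roughly ten timer ticks, which is why more than four elapsed jiffies is accepted as "the timer works".

#include <stdio.h>

int main(void)
{
	const int hz_values[] = { 100, 250, 1000 };
	int i;

	for (i = 0; i < 3; i++)
		printf("HZ=%4d -> mdelay(%d) ms for ~10 ticks\n",
		       hz_values[i], (10 * 1000) / hz_values[i]);
	return 0;
}
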
36622+
36623+/*
36624+ * In the SMP+IOAPIC case it might happen that there are an unspecified
36625+ * number of pending IRQ events unhandled. These cases are very rare,
36626+ * so we 'resend' these IRQs via IPIs, to the same CPU. It's much
36627+ * better to do it this way because then we do not have to be aware of
36628+ * 'pending' interrupts in the IRQ path, except at this point.
36629+ */
36630+/*
36631+ * Edge triggered needs to resend any interrupt
36632+ * that was delayed but this is now handled in the device
36633+ * independent code.
36634+ */
36635+
36636+/*
36637+ * Starting up an edge-triggered IO-APIC interrupt is
36638+ * nasty - we need to make sure that we get the edge.
36639+ * If it is already asserted for some reason, we need to
36640+ * return 1 to indicate that it was pending.
36641+ *
36642+ * This is not complete - we should be able to fake
36643+ * an edge even if it isn't on the 8259A...
36644+ */
36645+
36646+static unsigned int startup_edge_ioapic_irq(unsigned int irq)
36647+{
36648+ int was_pending = 0;
36649+ unsigned long flags;
36650+
36651+ spin_lock_irqsave(&ioapic_lock, flags);
36652+ if (irq < 16) {
36653+ disable_8259A_irq(irq);
36654+ if (i8259A_irq_pending(irq))
36655+ was_pending = 1;
36656+ }
36657+ __unmask_IO_APIC_irq(irq);
36658+ spin_unlock_irqrestore(&ioapic_lock, flags);
36659+
36660+ return was_pending;
36661+}
36662+
36663+/*
36664+ * Once we have recorded IRQ_PENDING already, we can mask the
36665+ * interrupt for real. This prevents IRQ storms from unhandled
36666+ * devices.
36667+ */
36668+static void ack_edge_ioapic_irq(unsigned int irq)
36669+{
36670+ move_irq(irq);
36671+ if ((irq_desc[irq].status & (IRQ_PENDING | IRQ_DISABLED))
36672+ == (IRQ_PENDING | IRQ_DISABLED))
36673+ mask_IO_APIC_irq(irq);
36674+ ack_APIC_irq();
36675+}
36676+
36677+/*
36678+ * Level triggered interrupts can just be masked,
36679+ * and shutting down and starting up the interrupt
36680+ * is the same as enabling and disabling them -- except
36681+ * with a startup need to return a "was pending" value.
36682+ *
36683+ * Level triggered interrupts are special because we
36684+ * do not touch any IO-APIC register while handling
36685+ * them. We ack the APIC in the end-IRQ handler, not
36686+ * in the start-IRQ-handler. Protection against reentrance
36687+ * from the same interrupt is still provided, both by the
36688+ * generic IRQ layer and by the fact that an unacked local
36689+ * APIC does not accept IRQs.
36690+ */
36691+static unsigned int startup_level_ioapic_irq (unsigned int irq)
36692+{
36693+ unmask_IO_APIC_irq(irq);
36694+
36695+ return 0; /* don't check for pending */
36696+}
36697+
36698+static void end_level_ioapic_irq (unsigned int irq)
36699+{
36700+ move_irq(irq);
36701+ ack_APIC_irq();
36702+}
36703+
36704+#ifdef CONFIG_PCI_MSI
36705+static unsigned int startup_edge_ioapic_vector(unsigned int vector)
36706+{
36707+ int irq = vector_to_irq(vector);
36708+
36709+ return startup_edge_ioapic_irq(irq);
36710+}
36711+
36712+static void ack_edge_ioapic_vector(unsigned int vector)
36713+{
36714+ int irq = vector_to_irq(vector);
36715+
36716+ move_native_irq(vector);
36717+ ack_edge_ioapic_irq(irq);
36718+}
36719+
36720+static unsigned int startup_level_ioapic_vector (unsigned int vector)
36721+{
36722+ int irq = vector_to_irq(vector);
36723+
36724+ return startup_level_ioapic_irq (irq);
36725+}
36726+
36727+static void end_level_ioapic_vector (unsigned int vector)
36728+{
36729+ int irq = vector_to_irq(vector);
36730+
36731+ move_native_irq(vector);
36732+ end_level_ioapic_irq(irq);
36733+}
36734+
36735+static void mask_IO_APIC_vector (unsigned int vector)
36736+{
36737+ int irq = vector_to_irq(vector);
36738+
36739+ mask_IO_APIC_irq(irq);
36740+}
36741+
36742+static void unmask_IO_APIC_vector (unsigned int vector)
36743+{
36744+ int irq = vector_to_irq(vector);
36745+
36746+ unmask_IO_APIC_irq(irq);
36747+}
36748+
36749+#ifdef CONFIG_SMP
36750+static void set_ioapic_affinity_vector (unsigned int vector,
36751+ cpumask_t cpu_mask)
36752+{
36753+ int irq = vector_to_irq(vector);
36754+
36755+ set_native_irq_info(vector, cpu_mask);
36756+ set_ioapic_affinity_irq(irq, cpu_mask);
36757+}
36758+#endif // CONFIG_SMP
36759+#endif // CONFIG_PCI_MSI
36760+
36761+/*
36762+ * Level and edge triggered IO-APIC interrupts need different handling,
36763+ * so we use two separate IRQ descriptors. Edge triggered IRQs can be
36764+ * handled with the level-triggered descriptor, but that one has slightly
36765+ * more overhead. Level-triggered interrupts cannot be handled with the
36766+ * edge-triggered handler, without risking IRQ storms and other ugly
36767+ * races.
36768+ */
36769+
36770+static struct hw_interrupt_type ioapic_edge_type __read_mostly = {
36771+ .typename = "IO-APIC-edge",
36772+ .startup = startup_edge_ioapic,
36773+ .shutdown = shutdown_edge_ioapic,
36774+ .enable = enable_edge_ioapic,
36775+ .disable = disable_edge_ioapic,
36776+ .ack = ack_edge_ioapic,
36777+ .end = end_edge_ioapic,
36778+#ifdef CONFIG_SMP
36779+ .set_affinity = set_ioapic_affinity,
36780+#endif
36781+};
36782+
36783+static struct hw_interrupt_type ioapic_level_type __read_mostly = {
36784+ .typename = "IO-APIC-level",
36785+ .startup = startup_level_ioapic,
36786+ .shutdown = shutdown_level_ioapic,
36787+ .enable = enable_level_ioapic,
36788+ .disable = disable_level_ioapic,
36789+ .ack = mask_and_ack_level_ioapic,
36790+ .end = end_level_ioapic,
36791+#ifdef CONFIG_SMP
36792+ .set_affinity = set_ioapic_affinity,
36793+#endif
36794+};
36795+#endif /* !CONFIG_XEN */
36796+
36797+static inline void init_IO_APIC_traps(void)
36798+{
36799+ int irq;
36800+
36801+ /*
36802+ * NOTE! The local APIC isn't very good at handling
36803+ * multiple interrupts at the same interrupt level.
36804+ * As the interrupt level is determined by taking the
36805+ * vector number and shifting that right by 4, we
36806+ * want to spread these out a bit so that they don't
36807+ * all fall in the same interrupt level.
36808+ *
36809+ * Also, we've got to be careful not to trash gate
36810+ * 0x80, because int 0x80 is hm, kind of importantish. ;)
36811+ */
36812+ for (irq = 0; irq < NR_IRQS ; irq++) {
36813+ int tmp = irq;
36814+ if (use_pci_vector()) {
36815+ if (!platform_legacy_irq(tmp))
36816+ if ((tmp = vector_to_irq(tmp)) == -1)
36817+ continue;
36818+ }
36819+ if (IO_APIC_IRQ(tmp) && !IO_APIC_VECTOR(tmp)) {
36820+ /*
36821+ * Hmm.. We don't have an entry for this,
36822+ * so default to an old-fashioned 8259
36823+ * interrupt if we can..
36824+ */
36825+ if (irq < 16)
36826+ make_8259A_irq(irq);
36827+#ifndef CONFIG_XEN
36828+ else
36829+ /* Strange. Oh, well.. */
36830+ irq_desc[irq].handler = &no_irq_type;
36831+#endif
36832+ }
36833+ }
36834+}
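
The comment in init_IO_APIC_traps() notes that the local APIC derives an interrupt's priority level from the vector shifted right by four. A minimal standalone sketch (illustrative only, not kernel code) of that mapping:

#include <stdio.h>

static unsigned int vector_to_level(unsigned int vector)
{
	return vector >> 4;	/* the shift the comment above refers to */
}

int main(void)
{
	printf("vector 0x31 -> level %u\n", vector_to_level(0x31)); /* 3 */
	printf("vector 0x39 -> level %u\n", vector_to_level(0x39)); /* 3 */
	printf("vector 0x80 -> level %u\n", vector_to_level(0x80)); /* 8, the syscall gate */
	return 0;
}
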
36835+
36836+#ifndef CONFIG_XEN
36837+static void enable_lapic_irq (unsigned int irq)
36838+{
36839+ unsigned long v;
36840+
36841+ v = apic_read(APIC_LVT0);
36842+ apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED);
36843+}
36844+
36845+static void disable_lapic_irq (unsigned int irq)
36846+{
36847+ unsigned long v;
36848+
36849+ v = apic_read(APIC_LVT0);
36850+ apic_write(APIC_LVT0, v | APIC_LVT_MASKED);
36851+}
36852+
36853+static void ack_lapic_irq (unsigned int irq)
36854+{
36855+ ack_APIC_irq();
36856+}
36857+
36858+static void end_lapic_irq (unsigned int i) { /* nothing */ }
36859+
36860+static struct hw_interrupt_type lapic_irq_type __read_mostly = {
36861+ .typename = "local-APIC-edge",
36862+ .startup = NULL, /* startup_irq() not used for IRQ0 */
36863+ .shutdown = NULL, /* shutdown_irq() not used for IRQ0 */
36864+ .enable = enable_lapic_irq,
36865+ .disable = disable_lapic_irq,
36866+ .ack = ack_lapic_irq,
36867+ .end = end_lapic_irq,
36868+};
36869+
36870+static void setup_nmi (void)
36871+{
36872+ /*
36873+ * Dirty trick to enable the NMI watchdog ...
36874+ * We put the 8259A master into AEOI mode and
36875+ * unmask on all local APICs LVT0 as NMI.
36876+ *
36877+ * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire')
36878+ * is from Maciej W. Rozycki - so we do not have to EOI from
36879+ * the NMI handler or the timer interrupt.
36880+ */
36881+ printk(KERN_INFO "activating NMI Watchdog ...");
36882+
36883+ enable_NMI_through_LVT0(NULL);
36884+
36885+ printk(" done.\n");
36886+}
36887+
36888+/*
36889+ * This looks a bit hackish but it's about the only way of sending
36890+ * a few INTA cycles to 8259As and any associated glue logic. ICR does
36891+ * not support the ExtINT mode, unfortunately. We need to send these
36892+ * cycles as some i82489DX-based boards have glue logic that keeps the
36893+ * 8259A interrupt line asserted until INTA. --macro
36894+ */
36895+static inline void unlock_ExtINT_logic(void)
36896+{
36897+ int apic, pin, i;
36898+ struct IO_APIC_route_entry entry0, entry1;
36899+ unsigned char save_control, save_freq_select;
36900+ unsigned long flags;
36901+
36902+ pin = find_isa_irq_pin(8, mp_INT);
36903+ apic = find_isa_irq_apic(8, mp_INT);
36904+ if (pin == -1)
36905+ return;
36906+
36907+ spin_lock_irqsave(&ioapic_lock, flags);
36908+ *(((int *)&entry0) + 1) = io_apic_read(apic, 0x11 + 2 * pin);
36909+ *(((int *)&entry0) + 0) = io_apic_read(apic, 0x10 + 2 * pin);
36910+ spin_unlock_irqrestore(&ioapic_lock, flags);
36911+ clear_IO_APIC_pin(apic, pin);
36912+
36913+ memset(&entry1, 0, sizeof(entry1));
36914+
36915+ entry1.dest_mode = 0; /* physical delivery */
36916+ entry1.mask = 0; /* unmask IRQ now */
36917+ entry1.dest.physical.physical_dest = hard_smp_processor_id();
36918+ entry1.delivery_mode = dest_ExtINT;
36919+ entry1.polarity = entry0.polarity;
36920+ entry1.trigger = 0;
36921+ entry1.vector = 0;
36922+
36923+ spin_lock_irqsave(&ioapic_lock, flags);
36924+ io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry1) + 1));
36925+ io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry1) + 0));
36926+ spin_unlock_irqrestore(&ioapic_lock, flags);
36927+
36928+ save_control = CMOS_READ(RTC_CONTROL);
36929+ save_freq_select = CMOS_READ(RTC_FREQ_SELECT);
36930+ CMOS_WRITE((save_freq_select & ~RTC_RATE_SELECT) | 0x6,
36931+ RTC_FREQ_SELECT);
36932+ CMOS_WRITE(save_control | RTC_PIE, RTC_CONTROL);
36933+
36934+ i = 100;
36935+ while (i-- > 0) {
36936+ mdelay(10);
36937+ if ((CMOS_READ(RTC_INTR_FLAGS) & RTC_PF) == RTC_PF)
36938+ i -= 10;
36939+ }
36940+
36941+ CMOS_WRITE(save_control, RTC_CONTROL);
36942+ CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT);
36943+ clear_IO_APIC_pin(apic, pin);
36944+
36945+ spin_lock_irqsave(&ioapic_lock, flags);
36946+ io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry0) + 1));
36947+ io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry0) + 0));
36948+ spin_unlock_irqrestore(&ioapic_lock, flags);
36949+}
36950+
36951+/*
36952+ * This code may look a bit paranoid, but it's supposed to cooperate with
36953+ * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ
36954+ * is so screwy. Thanks to Brian Perkins for testing/hacking this beast
36955+ * fanatically on his truly buggy board.
36956+ *
36957+ * FIXME: really need to revamp this for modern platforms only.
36958+ */
36959+static inline void check_timer(void)
36960+{
36961+ int apic1, pin1, apic2, pin2;
36962+ int vector;
36963+
36964+ /*
36965+ * get/set the timer IRQ vector:
36966+ */
36967+ disable_8259A_irq(0);
36968+ vector = assign_irq_vector(0);
36969+ set_intr_gate(vector, interrupt[0]);
36970+
36971+ /*
36972+ * Subtle, code in do_timer_interrupt() expects an AEOI
36973+ * mode for the 8259A whenever interrupts are routed
36974+ * through I/O APICs. Also IRQ0 has to be enabled in
36975+ * the 8259A which implies the virtual wire has to be
36976+ * disabled in the local APIC.
36977+ */
36978+ apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
36979+ init_8259A(1);
36980+ if (timer_over_8254 > 0)
36981+ enable_8259A_irq(0);
36982+
36983+ pin1 = find_isa_irq_pin(0, mp_INT);
36984+ apic1 = find_isa_irq_apic(0, mp_INT);
36985+ pin2 = ioapic_i8259.pin;
36986+ apic2 = ioapic_i8259.apic;
36987+
36988+ apic_printk(APIC_VERBOSE,KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n",
36989+ vector, apic1, pin1, apic2, pin2);
36990+
36991+ if (pin1 != -1) {
36992+ /*
36993+ * Ok, does IRQ0 through the IOAPIC work?
36994+ */
36995+ unmask_IO_APIC_irq(0);
36996+ if (!no_timer_check && timer_irq_works()) {
36997+ nmi_watchdog_default();
36998+ if (nmi_watchdog == NMI_IO_APIC) {
36999+ disable_8259A_irq(0);
37000+ setup_nmi();
37001+ enable_8259A_irq(0);
37002+ }
37003+ if (disable_timer_pin_1 > 0)
37004+ clear_IO_APIC_pin(0, pin1);
37005+ return;
37006+ }
37007+ clear_IO_APIC_pin(apic1, pin1);
37008+ apic_printk(APIC_QUIET,KERN_ERR "..MP-BIOS bug: 8254 timer not "
37009+ "connected to IO-APIC\n");
37010+ }
37011+
37012+ apic_printk(APIC_VERBOSE,KERN_INFO "...trying to set up timer (IRQ0) "
37013+ "through the 8259A ... ");
37014+ if (pin2 != -1) {
37015+ apic_printk(APIC_VERBOSE,"\n..... (found apic %d pin %d) ...",
37016+ apic2, pin2);
37017+ /*
37018+ * legacy devices should be connected to IO APIC #0
37019+ */
37020+ setup_ExtINT_IRQ0_pin(apic2, pin2, vector);
37021+ if (timer_irq_works()) {
37022+ printk("works.\n");
37023+ nmi_watchdog_default();
37024+ if (nmi_watchdog == NMI_IO_APIC) {
37025+ setup_nmi();
37026+ }
37027+ return;
37028+ }
37029+ /*
37030+ * Cleanup, just in case ...
37031+ */
37032+ clear_IO_APIC_pin(apic2, pin2);
37033+ }
37034+ printk(" failed.\n");
37035+
37036+ if (nmi_watchdog == NMI_IO_APIC) {
37037+ printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n");
37038+ nmi_watchdog = 0;
37039+ }
37040+
37041+ apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as Virtual Wire IRQ...");
37042+
37043+ disable_8259A_irq(0);
37044+ irq_desc[0].handler = &lapic_irq_type;
37045+ apic_write(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */
37046+ enable_8259A_irq(0);
37047+
37048+ if (timer_irq_works()) {
37049+ apic_printk(APIC_QUIET, " works.\n");
37050+ return;
37051+ }
37052+ apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector);
37053+ apic_printk(APIC_VERBOSE," failed.\n");
37054+
37055+ apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as ExtINT IRQ...");
37056+
37057+ init_8259A(0);
37058+ make_8259A_irq(0);
37059+ apic_write(APIC_LVT0, APIC_DM_EXTINT);
37060+
37061+ unlock_ExtINT_logic();
37062+
37063+ if (timer_irq_works()) {
37064+ apic_printk(APIC_VERBOSE," works.\n");
37065+ return;
37066+ }
37067+ apic_printk(APIC_VERBOSE," failed :(.\n");
37068+ panic("IO-APIC + timer doesn't work! Try using the 'noapic' kernel parameter\n");
37069+}
37070+#else
37071+#define check_timer() ((void)0)
37072+#endif /* !CONFIG_XEN */
37073+
37074+static int __init notimercheck(char *s)
37075+{
37076+ no_timer_check = 1;
37077+ return 1;
37078+}
37079+__setup("no_timer_check", notimercheck);
37080+
37081+/*
37082+ *
37083+ * IRQ's that are handled by the PIC in the MPS IOAPIC case.
37084+ * - IRQ2 is the cascade IRQ, and cannot be an io-apic IRQ.
37085+ * Linux doesn't really care, as it's not actually used
37086+ * for any interrupt handling anyway.
37087+ */
37088+#define PIC_IRQS (1<<2)
37089+
37090+void __init setup_IO_APIC(void)
37091+{
37092+ enable_IO_APIC();
37093+
37094+ if (acpi_ioapic)
37095+ io_apic_irqs = ~0; /* all IRQs go through IOAPIC */
37096+ else
37097+ io_apic_irqs = ~PIC_IRQS;
37098+
37099+ apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n");
37100+
37101+ /*
37102+ * Set up the IO-APIC IRQ routing table.
37103+ */
37104+ if (!acpi_ioapic)
37105+ setup_ioapic_ids_from_mpc();
37106+#ifndef CONFIG_XEN
37107+ sync_Arb_IDs();
37108+#endif /* !CONFIG_XEN */
37109+ setup_IO_APIC_irqs();
37110+ init_IO_APIC_traps();
37111+ check_timer();
37112+ if (!acpi_ioapic)
37113+ print_IO_APIC();
37114+}
37115+
37116+struct sysfs_ioapic_data {
37117+ struct sys_device dev;
37118+ struct IO_APIC_route_entry entry[0];
37119+};
37120+static struct sysfs_ioapic_data * mp_ioapic_data[MAX_IO_APICS];
37121+
37122+static int ioapic_suspend(struct sys_device *dev, pm_message_t state)
37123+{
37124+ struct IO_APIC_route_entry *entry;
37125+ struct sysfs_ioapic_data *data;
37126+ unsigned long flags;
37127+ int i;
37128+
37129+ data = container_of(dev, struct sysfs_ioapic_data, dev);
37130+ entry = data->entry;
37131+ spin_lock_irqsave(&ioapic_lock, flags);
37132+ for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) {
37133+ *(((int *)entry) + 1) = io_apic_read(dev->id, 0x11 + 2 * i);
37134+ *(((int *)entry) + 0) = io_apic_read(dev->id, 0x10 + 2 * i);
37135+ }
37136+ spin_unlock_irqrestore(&ioapic_lock, flags);
37137+
37138+ return 0;
37139+}
37140+
37141+static int ioapic_resume(struct sys_device *dev)
37142+{
37143+ struct IO_APIC_route_entry *entry;
37144+ struct sysfs_ioapic_data *data;
37145+ unsigned long flags;
37146+ union IO_APIC_reg_00 reg_00;
37147+ int i;
37148+
37149+ data = container_of(dev, struct sysfs_ioapic_data, dev);
37150+ entry = data->entry;
37151+
37152+ spin_lock_irqsave(&ioapic_lock, flags);
37153+ reg_00.raw = io_apic_read(dev->id, 0);
37154+ if (reg_00.bits.ID != mp_ioapics[dev->id].mpc_apicid) {
37155+ reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid;
37156+ io_apic_write(dev->id, 0, reg_00.raw);
37157+ }
37158+ for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) {
37159+ io_apic_write(dev->id, 0x11+2*i, *(((int *)entry)+1));
37160+ io_apic_write(dev->id, 0x10+2*i, *(((int *)entry)+0));
37161+ }
37162+ spin_unlock_irqrestore(&ioapic_lock, flags);
37163+
37164+ return 0;
37165+}
37166+
37167+static struct sysdev_class ioapic_sysdev_class = {
37168+ set_kset_name("ioapic"),
37169+ .suspend = ioapic_suspend,
37170+ .resume = ioapic_resume,
37171+};
37172+
37173+static int __init ioapic_init_sysfs(void)
37174+{
37175+ struct sys_device * dev;
37176+ int i, size, error = 0;
37177+
37178+ error = sysdev_class_register(&ioapic_sysdev_class);
37179+ if (error)
37180+ return error;
37181+
37182+ for (i = 0; i < nr_ioapics; i++ ) {
37183+ size = sizeof(struct sys_device) + nr_ioapic_registers[i]
37184+ * sizeof(struct IO_APIC_route_entry);
37185+ mp_ioapic_data[i] = kmalloc(size, GFP_KERNEL);
37186+ if (!mp_ioapic_data[i]) {
37187+ printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
37188+ continue;
37189+ }
37190+ memset(mp_ioapic_data[i], 0, size);
37191+ dev = &mp_ioapic_data[i]->dev;
37192+ dev->id = i;
37193+ dev->cls = &ioapic_sysdev_class;
37194+ error = sysdev_register(dev);
37195+ if (error) {
37196+ kfree(mp_ioapic_data[i]);
37197+ mp_ioapic_data[i] = NULL;
37198+ printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
37199+ continue;
37200+ }
37201+ }
37202+
37203+ return 0;
37204+}
37205+
37206+device_initcall(ioapic_init_sysfs);
37207+
37208+/* --------------------------------------------------------------------------
37209+ ACPI-based IOAPIC Configuration
37210+ -------------------------------------------------------------------------- */
37211+
37212+#ifdef CONFIG_ACPI
37213+
37214+#define IO_APIC_MAX_ID 0xFE
37215+
37216+int __init io_apic_get_version (int ioapic)
37217+{
37218+ union IO_APIC_reg_01 reg_01;
37219+ unsigned long flags;
37220+
37221+ spin_lock_irqsave(&ioapic_lock, flags);
37222+ reg_01.raw = io_apic_read(ioapic, 1);
37223+ spin_unlock_irqrestore(&ioapic_lock, flags);
37224+
37225+ return reg_01.bits.version;
37226+}
37227+
37228+
37229+int __init io_apic_get_redir_entries (int ioapic)
37230+{
37231+ union IO_APIC_reg_01 reg_01;
37232+ unsigned long flags;
37233+
37234+ spin_lock_irqsave(&ioapic_lock, flags);
37235+ reg_01.raw = io_apic_read(ioapic, 1);
37236+ spin_unlock_irqrestore(&ioapic_lock, flags);
37237+
37238+ return reg_01.bits.entries;
37239+}
37240+
37241+
37242+int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int active_high_low)
37243+{
37244+ struct IO_APIC_route_entry entry;
37245+ unsigned long flags;
37246+
37247+ if (!IO_APIC_IRQ(irq)) {
37248+ apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
37249+ ioapic);
37250+ return -EINVAL;
37251+ }
37252+
37253+ /*
37254+ * Generate a PCI IRQ routing entry and program the IOAPIC accordingly.
37255+ * Note that we mask (disable) IRQs now -- these get enabled when the
37256+ * corresponding device driver registers for this IRQ.
37257+ */
37258+
37259+ memset(&entry,0,sizeof(entry));
37260+
37261+ entry.delivery_mode = INT_DELIVERY_MODE;
37262+ entry.dest_mode = INT_DEST_MODE;
37263+ entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
37264+ entry.trigger = edge_level;
37265+ entry.polarity = active_high_low;
37266+ entry.mask = 1; /* Disabled (masked) */
37267+
37268+ irq = gsi_irq_sharing(irq);
37269+ /*
37270+ * IRQs < 16 are already in the irq_2_pin[] map
37271+ */
37272+ if (irq >= 16)
37273+ add_pin_to_irq(irq, ioapic, pin);
37274+
37275+ entry.vector = assign_irq_vector(irq);
37276+
37277+ apic_printk(APIC_VERBOSE,KERN_DEBUG "IOAPIC[%d]: Set PCI routing entry (%d-%d -> 0x%x -> "
37278+ "IRQ %d Mode:%i Active:%i)\n", ioapic,
37279+ mp_ioapics[ioapic].mpc_apicid, pin, entry.vector, irq,
37280+ edge_level, active_high_low);
37281+
37282+ ioapic_register_intr(irq, entry.vector, edge_level);
37283+
37284+ if (!ioapic && (irq < 16))
37285+ disable_8259A_irq(irq);
37286+
37287+ spin_lock_irqsave(&ioapic_lock, flags);
37288+ io_apic_write(ioapic, 0x11+2*pin, *(((int *)&entry)+1));
37289+ io_apic_write(ioapic, 0x10+2*pin, *(((int *)&entry)+0));
37290+ set_native_irq_info(use_pci_vector() ? entry.vector : irq, TARGET_CPUS);
37291+ spin_unlock_irqrestore(&ioapic_lock, flags);
37292+
37293+ return 0;
37294+}
37295+
37296+#endif /* CONFIG_ACPI */
37297+
37298+
37299+#ifndef CONFIG_XEN
37300+/*
37301+ * This function currently is only a helper for the i386 smp boot process where
37302+ * we need to reprogram the ioredtbls to cater for the cpus which have come online
37303+ * so mask in all cases should simply be TARGET_CPUS
37304+ */
37305+#ifdef CONFIG_SMP
37306+void __init setup_ioapic_dest(void)
37307+{
37308+ int pin, ioapic, irq, irq_entry;
37309+
37310+ if (skip_ioapic_setup == 1)
37311+ return;
37312+
37313+ for (ioapic = 0; ioapic < nr_ioapics; ioapic++) {
37314+ for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) {
37315+ irq_entry = find_irq_entry(ioapic, pin, mp_INT);
37316+ if (irq_entry == -1)
37317+ continue;
37318+ irq = pin_2_irq(irq_entry, ioapic, pin);
37319+ set_ioapic_affinity_irq(irq, TARGET_CPUS);
37320+ }
37321+
37322+ }
37323+}
37324+#endif
37325+#endif /* !CONFIG_XEN */
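
Throughout the io_apic code above, a 64-bit struct IO_APIC_route_entry is moved through the 32-bit IO-APIC register window as two accesses at indices 0x10+2*pin (low word) and 0x11+2*pin (high word). A minimal user-space sketch of that stitching, with a hypothetical register buffer standing in for io_apic_read():

#include <stdint.h>
#include <stdio.h>

/* Hypothetical stand-in for one pin's redirection registers. */
static uint32_t fake_regs[2] = { 0x0000a031, 0x01000000 };

static uint32_t fake_io_apic_read(unsigned int reg)
{
	return fake_regs[reg & 1];	/* 0x10+2*pin -> low word, 0x11+2*pin -> high word */
}

int main(void)
{
	unsigned int pin = 0;
	uint64_t entry;

	entry  = (uint64_t)fake_io_apic_read(0x10 + 2 * pin);
	entry |= (uint64_t)fake_io_apic_read(0x11 + 2 * pin) << 32;

	printf("vector = 0x%02x\n", (unsigned int)(entry & 0xff));      /* bits 0-7   */
	printf("masked = %u\n",     (unsigned int)((entry >> 16) & 1)); /* bit 16     */
	printf("dest   = 0x%02x\n", (unsigned int)(entry >> 56));       /* bits 56-63 */
	return 0;
}
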
37326diff -Nur linux-2.6.16.33-noxen/arch/x86_64/kernel/ioport-xen.c linux-2.6.16.33/arch/x86_64/kernel/ioport-xen.c
37327--- linux-2.6.16.33-noxen/arch/x86_64/kernel/ioport-xen.c 1970-01-01 00:00:00.000000000 +0000
37328+++ linux-2.6.16.33/arch/x86_64/kernel/ioport-xen.c 2007-01-08 15:00:45.000000000 +0000
37329@@ -0,0 +1,99 @@
37330+/*
37331+ * linux/arch/x86_64/kernel/ioport.c
37332+ *
37333+ * This contains the io-permission bitmap code - written by obz, with changes
37334+ * by Linus.
37335+ */
37336+
37337+#include <linux/sched.h>
37338+#include <linux/kernel.h>
37339+#include <linux/capability.h>
37340+#include <linux/errno.h>
37341+#include <linux/types.h>
37342+#include <linux/ioport.h>
37343+#include <linux/mm.h>
37344+#include <linux/smp.h>
37345+#include <linux/smp_lock.h>
37346+#include <linux/stddef.h>
37347+#include <linux/slab.h>
37348+#include <linux/thread_info.h>
37349+#include <xen/interface/physdev.h>
37350+
37351+/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */
37352+static void set_bitmap(unsigned long *bitmap, unsigned int base, unsigned int extent, int new_value)
37353+{
37354+ int i;
37355+
37356+ if (new_value)
37357+ for (i = base; i < base + extent; i++)
37358+ __set_bit(i, bitmap);
37359+ else
37360+ for (i = base; i < base + extent; i++)
37361+ clear_bit(i, bitmap);
37362+}
37363+
37364+/*
37365+ * this changes the io permissions bitmap in the current task.
37366+ */
37367+asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
37368+{
37369+ struct thread_struct * t = &current->thread;
37370+ unsigned long *bitmap;
37371+ struct physdev_set_iobitmap set_iobitmap;
37372+
37373+ if ((from + num <= from) || (from + num > IO_BITMAP_BITS))
37374+ return -EINVAL;
37375+ if (turn_on && !capable(CAP_SYS_RAWIO))
37376+ return -EPERM;
37377+
37378+ /*
37379+ * If it's the first ioperm() call in this thread's lifetime, set the
37380+ * IO bitmap up. ioperm() is much less timing critical than clone(),
37381+ * which is why we delay this operation until now:
37382+ */
37383+ if (!t->io_bitmap_ptr) {
37384+ bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
37385+ if (!bitmap)
37386+ return -ENOMEM;
37387+
37388+ memset(bitmap, 0xff, IO_BITMAP_BYTES);
37389+ t->io_bitmap_ptr = bitmap;
37390+
37391+ set_iobitmap.bitmap = (char *)bitmap;
37392+ set_iobitmap.nr_ports = IO_BITMAP_BITS;
37393+ HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap, &set_iobitmap);
37394+ }
37395+
37396+ set_bitmap(t->io_bitmap_ptr, from, num, !turn_on);
37397+
37398+ return 0;
37399+}
37400+
37401+/*
37402+ * sys_iopl has to be used when you want to access the IO ports
37403+ * beyond the 0x3ff range: to get the full 65536 ports bitmapped
37404+ * you'd need 8kB of bitmaps/process, which is a bit excessive.
37405+ *
37406+ */
37407+
37408+asmlinkage long sys_iopl(unsigned int new_iopl, struct pt_regs *regs)
37409+{
37410+ unsigned int old_iopl = current->thread.iopl;
37411+ struct physdev_set_iopl set_iopl;
37412+
37413+ if (new_iopl > 3)
37414+ return -EINVAL;
37415+
37416+ /* Need "raw I/O" privileges for direct port access. */
37417+ if ((new_iopl > old_iopl) && !capable(CAP_SYS_RAWIO))
37418+ return -EPERM;
37419+
37420+ /* Change our version of the privilege levels. */
37421+ current->thread.iopl = new_iopl;
37422+
37423+ /* Force the change at ring 0. */
37424+ set_iopl.iopl = (new_iopl == 0) ? 1 : new_iopl;
37425+ HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
37426+
37427+ return 0;
37428+}
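
For context, a minimal user-space sketch of the two system calls implemented above (needs CAP_SYS_RAWIO and an x86 kernel; the parallel-port base 0x378 is just an illustrative choice):

#include <stdio.h>
#include <sys/io.h>		/* ioperm(), iopl(), outb() on x86 glibc */

int main(void)
{
	/* Grant this process access to three ports starting at 0x378. */
	if (ioperm(0x378, 3, 1) < 0) {
		perror("ioperm");
		return 1;
	}
	outb(0xff, 0x378);	/* drive all data lines high */

	/* iopl(3) instead raises the privilege level for the whole port range. */
	if (iopl(3) < 0)
		perror("iopl");
	return 0;
}
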
37429diff -Nur linux-2.6.16.33-noxen/arch/x86_64/kernel/irq-xen.c linux-2.6.16.33/arch/x86_64/kernel/irq-xen.c
37430--- linux-2.6.16.33-noxen/arch/x86_64/kernel/irq-xen.c 1970-01-01 00:00:00.000000000 +0000
37431+++ linux-2.6.16.33/arch/x86_64/kernel/irq-xen.c 2007-01-08 15:00:45.000000000 +0000
37432@@ -0,0 +1,168 @@
37433+/*
37434+ * linux/arch/x86_64/kernel/irq.c
37435+ *
37436+ * Copyright (C) 1992, 1998 Linus Torvalds, Ingo Molnar
37437+ *
37438+ * This file contains the lowest level x86_64-specific interrupt
37439+ * entry and irq statistics code. All the remaining irq logic is
37440+ * done by the generic kernel/irq/ code and in the
37441+ * x86_64-specific irq controller code. (e.g. i8259.c and
37442+ * io_apic.c.)
37443+ */
37444+
37445+#include <linux/kernel_stat.h>
37446+#include <linux/interrupt.h>
37447+#include <linux/seq_file.h>
37448+#include <linux/module.h>
37449+#include <linux/delay.h>
37450+#include <asm/uaccess.h>
37451+#include <asm/io_apic.h>
37452+#include <asm/idle.h>
37453+
37454+atomic_t irq_err_count;
37455+#ifdef CONFIG_X86_IO_APIC
37456+#ifdef APIC_MISMATCH_DEBUG
37457+atomic_t irq_mis_count;
37458+#endif
37459+#endif
37460+
37461+/*
37462+ * Generic, controller-independent functions:
37463+ */
37464+
37465+int show_interrupts(struct seq_file *p, void *v)
37466+{
37467+ int i = *(loff_t *) v, j;
37468+ struct irqaction * action;
37469+ unsigned long flags;
37470+
37471+ if (i == 0) {
37472+ seq_printf(p, " ");
37473+ for (j=0; j<NR_CPUS; j++)
37474+ if (cpu_online(j))
37475+ seq_printf(p, "CPU%d ",j);
37476+ seq_putc(p, '\n');
37477+ }
37478+
37479+ if (i < NR_IRQS) {
37480+ spin_lock_irqsave(&irq_desc[i].lock, flags);
37481+ action = irq_desc[i].action;
37482+ if (!action)
37483+ goto skip;
37484+ seq_printf(p, "%3d: ",i);
37485+#ifndef CONFIG_SMP
37486+ seq_printf(p, "%10u ", kstat_irqs(i));
37487+#else
37488+ for (j=0; j<NR_CPUS; j++)
37489+ if (cpu_online(j))
37490+ seq_printf(p, "%10u ",
37491+ kstat_cpu(j).irqs[i]);
37492+#endif
37493+ seq_printf(p, " %14s", irq_desc[i].handler->typename);
37494+
37495+ seq_printf(p, " %s", action->name);
37496+ for (action=action->next; action; action = action->next)
37497+ seq_printf(p, ", %s", action->name);
37498+ seq_putc(p, '\n');
37499+skip:
37500+ spin_unlock_irqrestore(&irq_desc[i].lock, flags);
37501+ } else if (i == NR_IRQS) {
37502+ seq_printf(p, "NMI: ");
37503+ for (j = 0; j < NR_CPUS; j++)
37504+ if (cpu_online(j))
37505+ seq_printf(p, "%10u ", cpu_pda(j)->__nmi_count);
37506+ seq_putc(p, '\n');
37507+#ifdef CONFIG_X86_LOCAL_APIC
37508+ seq_printf(p, "LOC: ");
37509+ for (j = 0; j < NR_CPUS; j++)
37510+ if (cpu_online(j))
37511+ seq_printf(p, "%10u ", cpu_pda(j)->apic_timer_irqs);
37512+ seq_putc(p, '\n');
37513+#endif
37514+ seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count));
37515+#ifdef CONFIG_X86_IO_APIC
37516+#ifdef APIC_MISMATCH_DEBUG
37517+ seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count));
37518+#endif
37519+#endif
37520+ }
37521+ return 0;
37522+}
37523+
37524+/*
37525+ * do_IRQ handles all normal device IRQ's (the special
37526+ * SMP cross-CPU interrupts have their own specific
37527+ * handlers).
37528+ */
37529+asmlinkage unsigned int do_IRQ(struct pt_regs *regs)
37530+{
37531+ /* high bit used in ret_from_ code */
37532+ unsigned irq = ~regs->orig_rax;
37533+
37534+ exit_idle();
37535+ irq_enter();
37536+
37537+ __do_IRQ(irq, regs);
37538+ irq_exit();
37539+
37540+ return 1;
37541+}
37542+
37543+#ifdef CONFIG_HOTPLUG_CPU
37544+void fixup_irqs(cpumask_t map)
37545+{
37546+ unsigned int irq;
37547+ static int warned;
37548+
37549+ for (irq = 0; irq < NR_IRQS; irq++) {
37550+ cpumask_t mask;
37551+ if (irq == 2)
37552+ continue;
37553+
37554+ cpus_and(mask, irq_affinity[irq], map);
37555+ if (any_online_cpu(mask) == NR_CPUS) {
37556+ printk("Breaking affinity for irq %i\n", irq);
37557+ mask = map;
37558+ }
37559+ if (irq_desc[irq].handler->set_affinity)
37560+ irq_desc[irq].handler->set_affinity(irq, mask);
37561+ else if (irq_desc[irq].action && !(warned++))
37562+ printk("Cannot set affinity for irq %i\n", irq);
37563+ }
37564+
37565+ /* That doesn't seem sufficient. Give it 1ms. */
37566+ local_irq_enable();
37567+ mdelay(1);
37568+ local_irq_disable();
37569+}
37570+#endif
37571+
37572+extern void call_softirq(void);
37573+
37574+asmlinkage void do_softirq(void)
37575+{
37576+ __u32 pending;
37577+ unsigned long flags;
37578+
37579+ if (in_interrupt())
37580+ return;
37581+
37582+ local_irq_save(flags);
37583+ pending = local_softirq_pending();
37584+ /* Switch to interrupt stack */
37585+ if (pending)
37586+ call_softirq();
37587+ local_irq_restore(flags);
37588+}
37589+EXPORT_SYMBOL(do_softirq);
37590+
37591+#ifndef CONFIG_X86_LOCAL_APIC
37592+/*
37593+ * 'what should we do if we get a hw irq event on an illegal vector'.
37594+ * Each architecture has to answer this itself.
37595+ */
37596+void ack_bad_irq(unsigned int irq)
37597+{
37598+ printk("unexpected IRQ trap at vector %02x\n", irq);
37599+}
37600+#endif
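
The `irq = ~regs->orig_rax` line above relies on the interrupt entry code saving the vector as its bitwise complement, so orig_rax is negative for hardware interrupts and cannot be confused with a syscall number. A small standalone sketch of that round-trip:

#include <assert.h>
#include <stdint.h>

int main(void)
{
	uint64_t vector   = 0x31;		/* what the IDT stub was entered with */
	uint64_t orig_rax = ~vector;		/* what the entry code saves in pt_regs */
	unsigned int irq  = ~orig_rax;		/* what do_IRQ() recovers */

	assert((int64_t)orig_rax < 0);		/* clearly not a syscall number */
	assert(irq == vector);
	return 0;
}
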
37601diff -Nur linux-2.6.16.33-noxen/arch/x86_64/kernel/irq.c linux-2.6.16.33/arch/x86_64/kernel/irq.c
37602--- linux-2.6.16.33-noxen/arch/x86_64/kernel/irq.c 2006-11-22 18:06:31.000000000 +0000
37603+++ linux-2.6.16.33/arch/x86_64/kernel/irq.c 2007-05-23 21:00:01.000000000 +0000
37604@@ -96,8 +96,8 @@
37605 */
37606 asmlinkage unsigned int do_IRQ(struct pt_regs *regs)
37607 {
37608- /* high bits used in ret_from_ code */
37609- unsigned irq = regs->orig_rax & 0xff;
37610+ /* high bit used in ret_from_ code */
37611+ unsigned irq = ~regs->orig_rax;
37612
37613 exit_idle();
37614 irq_enter();
37615diff -Nur linux-2.6.16.33-noxen/arch/x86_64/kernel/ldt-xen.c linux-2.6.16.33/arch/x86_64/kernel/ldt-xen.c
37616--- linux-2.6.16.33-noxen/arch/x86_64/kernel/ldt-xen.c 1970-01-01 00:00:00.000000000 +0000
37617+++ linux-2.6.16.33/arch/x86_64/kernel/ldt-xen.c 2007-01-08 15:00:45.000000000 +0000
37618@@ -0,0 +1,282 @@
37619+/*
37620+ * linux/arch/x86_64/kernel/ldt.c
37621+ *
37622+ * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds
37623+ * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
37624+ * Copyright (C) 2002 Andi Kleen
37625+ *
37626+ * This handles calls from both 32bit and 64bit mode.
37627+ */
37628+
37629+#include <linux/errno.h>
37630+#include <linux/sched.h>
37631+#include <linux/string.h>
37632+#include <linux/mm.h>
37633+#include <linux/smp.h>
37634+#include <linux/smp_lock.h>
37635+#include <linux/vmalloc.h>
37636+#include <linux/slab.h>
37637+
37638+#include <asm/uaccess.h>
37639+#include <asm/system.h>
37640+#include <asm/ldt.h>
37641+#include <asm/desc.h>
37642+#include <asm/proto.h>
37643+#include <asm/pgalloc.h>
37644+
37645+#ifdef CONFIG_SMP /* avoids "defined but not used" warning */
37646+static void flush_ldt(void *null)
37647+{
37648+ if (current->active_mm)
37649+ load_LDT(&current->active_mm->context);
37650+}
37651+#endif
37652+
37653+static int alloc_ldt(mm_context_t *pc, unsigned mincount, int reload)
37654+{
37655+ void *oldldt;
37656+ void *newldt;
37657+ unsigned oldsize;
37658+
37659+ if (mincount <= (unsigned)pc->size)
37660+ return 0;
37661+ oldsize = pc->size;
37662+ mincount = (mincount+511)&(~511);
37663+ if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE)
37664+ newldt = vmalloc(mincount*LDT_ENTRY_SIZE);
37665+ else
37666+ newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL);
37667+
37668+ if (!newldt)
37669+ return -ENOMEM;
37670+
37671+ if (oldsize)
37672+ memcpy(newldt, pc->ldt, oldsize*LDT_ENTRY_SIZE);
37673+ oldldt = pc->ldt;
37674+ memset(newldt+oldsize*LDT_ENTRY_SIZE, 0, (mincount-oldsize)*LDT_ENTRY_SIZE);
37675+ wmb();
37676+ pc->ldt = newldt;
37677+ wmb();
37678+ pc->size = mincount;
37679+ wmb();
37680+ if (reload) {
37681+#ifdef CONFIG_SMP
37682+ cpumask_t mask;
37683+
37684+ preempt_disable();
37685+#endif
37686+ make_pages_readonly(
37687+ pc->ldt,
37688+ (pc->size * LDT_ENTRY_SIZE) / PAGE_SIZE,
37689+ XENFEAT_writable_descriptor_tables);
37690+ load_LDT(pc);
37691+#ifdef CONFIG_SMP
37692+ mask = cpumask_of_cpu(smp_processor_id());
37693+ if (!cpus_equal(current->mm->cpu_vm_mask, mask))
37694+ smp_call_function(flush_ldt, NULL, 1, 1);
37695+ preempt_enable();
37696+#endif
37697+ }
37698+ if (oldsize) {
37699+ make_pages_writable(
37700+ oldldt,
37701+ (oldsize * LDT_ENTRY_SIZE) / PAGE_SIZE,
37702+ XENFEAT_writable_descriptor_tables);
37703+ if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE)
37704+ vfree(oldldt);
37705+ else
37706+ kfree(oldldt);
37707+ }
37708+ return 0;
37709+}
37710+
37711+static inline int copy_ldt(mm_context_t *new, mm_context_t *old)
37712+{
37713+ int err = alloc_ldt(new, old->size, 0);
37714+ if (err < 0)
37715+ return err;
37716+ memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE);
37717+ make_pages_readonly(
37718+ new->ldt,
37719+ (new->size * LDT_ENTRY_SIZE) / PAGE_SIZE,
37720+ XENFEAT_writable_descriptor_tables);
37721+ return 0;
37722+}
37723+
37724+/*
37725+ * we do not have to muck with descriptors here, that is
37726+ * done in switch_mm() as needed.
37727+ */
37728+int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
37729+{
37730+ struct mm_struct * old_mm;
37731+ int retval = 0;
37732+
37733+ memset(&mm->context, 0, sizeof(mm->context));
37734+ init_MUTEX(&mm->context.sem);
37735+ old_mm = current->mm;
37736+ if (old_mm && old_mm->context.size > 0) {
37737+ down(&old_mm->context.sem);
37738+ retval = copy_ldt(&mm->context, &old_mm->context);
37739+ up(&old_mm->context.sem);
37740+ }
37741+ if (retval == 0) {
37742+ spin_lock(&mm_unpinned_lock);
37743+ list_add(&mm->context.unpinned, &mm_unpinned);
37744+ spin_unlock(&mm_unpinned_lock);
37745+ }
37746+ return retval;
37747+}
37748+
37749+/*
37750+ *
37751+ * Don't touch the LDT register - we're already in the next thread.
37752+ */
37753+void destroy_context(struct mm_struct *mm)
37754+{
37755+ if (mm->context.size) {
37756+ if (mm == current->active_mm)
37757+ clear_LDT();
37758+ make_pages_writable(
37759+ mm->context.ldt,
37760+ (mm->context.size * LDT_ENTRY_SIZE) / PAGE_SIZE,
37761+ XENFEAT_writable_descriptor_tables);
37762+ if (mm->context.size*LDT_ENTRY_SIZE > PAGE_SIZE)
37763+ vfree(mm->context.ldt);
37764+ else
37765+ kfree(mm->context.ldt);
37766+ mm->context.size = 0;
37767+ }
37768+ if (!mm->context.pinned) {
37769+ spin_lock(&mm_unpinned_lock);
37770+ list_del(&mm->context.unpinned);
37771+ spin_unlock(&mm_unpinned_lock);
37772+ }
37773+}
37774+
37775+static int read_ldt(void __user * ptr, unsigned long bytecount)
37776+{
37777+ int err;
37778+ unsigned long size;
37779+ struct mm_struct * mm = current->mm;
37780+
37781+ if (!mm->context.size)
37782+ return 0;
37783+ if (bytecount > LDT_ENTRY_SIZE*LDT_ENTRIES)
37784+ bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES;
37785+
37786+ down(&mm->context.sem);
37787+ size = mm->context.size*LDT_ENTRY_SIZE;
37788+ if (size > bytecount)
37789+ size = bytecount;
37790+
37791+ err = 0;
37792+ if (copy_to_user(ptr, mm->context.ldt, size))
37793+ err = -EFAULT;
37794+ up(&mm->context.sem);
37795+ if (err < 0)
37796+ goto error_return;
37797+ if (size != bytecount) {
37798+ /* zero-fill the rest */
37799+ if (clear_user(ptr+size, bytecount-size) != 0) {
37800+ err = -EFAULT;
37801+ goto error_return;
37802+ }
37803+ }
37804+ return bytecount;
37805+error_return:
37806+ return err;
37807+}
37808+
37809+static int read_default_ldt(void __user * ptr, unsigned long bytecount)
37810+{
37811+ /* Arbitrary number */
37812+ /* x86-64 default LDT is all zeros */
37813+ if (bytecount > 128)
37814+ bytecount = 128;
37815+ if (clear_user(ptr, bytecount))
37816+ return -EFAULT;
37817+ return bytecount;
37818+}
37819+
37820+static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode)
37821+{
37822+ struct task_struct *me = current;
37823+ struct mm_struct * mm = me->mm;
37824+ __u32 entry_1, entry_2, *lp;
37825+ unsigned long mach_lp;
37826+ int error;
37827+ struct user_desc ldt_info;
37828+
37829+ error = -EINVAL;
37830+
37831+ if (bytecount != sizeof(ldt_info))
37832+ goto out;
37833+ error = -EFAULT;
37834+ if (copy_from_user(&ldt_info, ptr, bytecount))
37835+ goto out;
37836+
37837+ error = -EINVAL;
37838+ if (ldt_info.entry_number >= LDT_ENTRIES)
37839+ goto out;
37840+ if (ldt_info.contents == 3) {
37841+ if (oldmode)
37842+ goto out;
37843+ if (ldt_info.seg_not_present == 0)
37844+ goto out;
37845+ }
37846+
37847+ down(&mm->context.sem);
37848+ if (ldt_info.entry_number >= (unsigned)mm->context.size) {
37849+ error = alloc_ldt(&current->mm->context, ldt_info.entry_number+1, 1);
37850+ if (error < 0)
37851+ goto out_unlock;
37852+ }
37853+
37854+ lp = (__u32 *) ((ldt_info.entry_number << 3) + (char *) mm->context.ldt);
37855+ mach_lp = arbitrary_virt_to_machine(lp);
37856+
37857+ /* Allow LDTs to be cleared by the user. */
37858+ if (ldt_info.base_addr == 0 && ldt_info.limit == 0) {
37859+ if (oldmode || LDT_empty(&ldt_info)) {
37860+ entry_1 = 0;
37861+ entry_2 = 0;
37862+ goto install;
37863+ }
37864+ }
37865+
37866+ entry_1 = LDT_entry_a(&ldt_info);
37867+ entry_2 = LDT_entry_b(&ldt_info);
37868+ if (oldmode)
37869+ entry_2 &= ~(1 << 20);
37870+
37871+ /* Install the new entry ... */
37872+install:
37873+ error = HYPERVISOR_update_descriptor(mach_lp, (unsigned long)((entry_1 | (unsigned long) entry_2 << 32)));
37874+
37875+out_unlock:
37876+ up(&mm->context.sem);
37877+out:
37878+ return error;
37879+}
37880+
37881+asmlinkage int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount)
37882+{
37883+ int ret = -ENOSYS;
37884+
37885+ switch (func) {
37886+ case 0:
37887+ ret = read_ldt(ptr, bytecount);
37888+ break;
37889+ case 1:
37890+ ret = write_ldt(ptr, bytecount, 1);
37891+ break;
37892+ case 2:
37893+ ret = read_default_ldt(ptr, bytecount);
37894+ break;
37895+ case 0x11:
37896+ ret = write_ldt(ptr, bytecount, 0);
37897+ break;
37898+ }
37899+ return ret;
37900+}
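
A minimal user-space sketch of driving sys_modify_ldt() as wired up above: func 1 installs a small 32-bit data segment, func 0 reads the table back (there is no glibc wrapper, so it goes through syscall(2)):

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <asm/ldt.h>		/* struct user_desc, LDT_ENTRY_SIZE */

int main(void)
{
	struct user_desc desc;
	unsigned char buf[LDT_ENTRY_SIZE * 4];
	long n;

	memset(&desc, 0, sizeof(desc));
	desc.entry_number   = 0;
	desc.base_addr      = 0;
	desc.limit          = 0xfffff;
	desc.seg_32bit      = 1;
	desc.limit_in_pages = 1;

	if (syscall(SYS_modify_ldt, 1, &desc, sizeof(desc)) != 0) {	/* write_ldt() */
		perror("modify_ldt");
		return 1;
	}
	n = syscall(SYS_modify_ldt, 0, buf, sizeof(buf));		/* read_ldt() */
	printf("read back %ld bytes of LDT\n", n);
	return 0;
}
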
37901diff -Nur linux-2.6.16.33-noxen/arch/x86_64/kernel/machine_kexec.c linux-2.6.16.33/arch/x86_64/kernel/machine_kexec.c
37902--- linux-2.6.16.33-noxen/arch/x86_64/kernel/machine_kexec.c 2006-11-22 18:06:31.000000000 +0000
37903+++ linux-2.6.16.33/arch/x86_64/kernel/machine_kexec.c 2007-01-08 15:00:45.000000000 +0000
37904@@ -15,6 +15,113 @@
37905 #include <asm/mmu_context.h>
37906 #include <asm/io.h>
37907
37908+#define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE)))
37909+static u64 kexec_pgd[512] PAGE_ALIGNED;
37910+static u64 kexec_pud0[512] PAGE_ALIGNED;
37911+static u64 kexec_pmd0[512] PAGE_ALIGNED;
37912+static u64 kexec_pte0[512] PAGE_ALIGNED;
37913+static u64 kexec_pud1[512] PAGE_ALIGNED;
37914+static u64 kexec_pmd1[512] PAGE_ALIGNED;
37915+static u64 kexec_pte1[512] PAGE_ALIGNED;
37916+
37917+#ifdef CONFIG_XEN
37918+
37919+/* In the case of Xen, override hypervisor functions to be able to create
37920+ * a regular identity mapping page table...
37921+ */
37922+
37923+#include <xen/interface/kexec.h>
37924+#include <xen/interface/memory.h>
37925+
37926+#define x__pmd(x) ((pmd_t) { (x) } )
37927+#define x__pud(x) ((pud_t) { (x) } )
37928+#define x__pgd(x) ((pgd_t) { (x) } )
37929+
37930+#define x_pmd_val(x) ((x).pmd)
37931+#define x_pud_val(x) ((x).pud)
37932+#define x_pgd_val(x) ((x).pgd)
37933+
37934+static inline void x_set_pmd(pmd_t *dst, pmd_t val)
37935+{
37936+ x_pmd_val(*dst) = x_pmd_val(val);
37937+}
37938+
37939+static inline void x_set_pud(pud_t *dst, pud_t val)
37940+{
37941+ x_pud_val(*dst) = phys_to_machine(x_pud_val(val));
37942+}
37943+
37944+static inline void x_pud_clear (pud_t *pud)
37945+{
37946+ x_pud_val(*pud) = 0;
37947+}
37948+
37949+static inline void x_set_pgd(pgd_t *dst, pgd_t val)
37950+{
37951+ x_pgd_val(*dst) = phys_to_machine(x_pgd_val(val));
37952+}
37953+
37954+static inline void x_pgd_clear (pgd_t * pgd)
37955+{
37956+ x_pgd_val(*pgd) = 0;
37957+}
37958+
37959+#define X__PAGE_KERNEL_LARGE_EXEC \
37960+ _PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_PSE
37961+#define X_KERNPG_TABLE _PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY
37962+
37963+#define __ma(x) (pfn_to_mfn(__pa((x)) >> PAGE_SHIFT) << PAGE_SHIFT)
37964+
37965+#if PAGES_NR > KEXEC_XEN_NO_PAGES
37966+#error PAGES_NR is greater than KEXEC_XEN_NO_PAGES - Xen support will break
37967+#endif
37968+
37969+#if PA_CONTROL_PAGE != 0
37970+#error PA_CONTROL_PAGE is non zero - Xen support will break
37971+#endif
37972+
37973+void machine_kexec_setup_load_arg(xen_kexec_image_t *xki, struct kimage *image)
37974+{
37975+ void *control_page;
37976+ void *table_page;
37977+
37978+ memset(xki->page_list, 0, sizeof(xki->page_list));
37979+
37980+ control_page = page_address(image->control_code_page) + PAGE_SIZE;
37981+ memcpy(control_page, relocate_kernel, PAGE_SIZE);
37982+
37983+ table_page = page_address(image->control_code_page);
37984+
37985+ xki->page_list[PA_CONTROL_PAGE] = __ma(control_page);
37986+ xki->page_list[PA_TABLE_PAGE] = __ma(table_page);
37987+
37988+ xki->page_list[PA_PGD] = __ma(kexec_pgd);
37989+ xki->page_list[PA_PUD_0] = __ma(kexec_pud0);
37990+ xki->page_list[PA_PUD_1] = __ma(kexec_pud1);
37991+ xki->page_list[PA_PMD_0] = __ma(kexec_pmd0);
37992+ xki->page_list[PA_PMD_1] = __ma(kexec_pmd1);
37993+ xki->page_list[PA_PTE_0] = __ma(kexec_pte0);
37994+ xki->page_list[PA_PTE_1] = __ma(kexec_pte1);
37995+}
37996+
37997+#else /* CONFIG_XEN */
37998+
37999+#define x__pmd(x) __pmd(x)
38000+#define x__pud(x) __pud(x)
38001+#define x__pgd(x) __pgd(x)
38002+
38003+#define x_set_pmd(x, y) set_pmd(x, y)
38004+#define x_set_pud(x, y) set_pud(x, y)
38005+#define x_set_pgd(x, y) set_pgd(x, y)
38006+
38007+#define x_pud_clear(x) pud_clear(x)
38008+#define x_pgd_clear(x) pgd_clear(x)
38009+
38010+#define X__PAGE_KERNEL_LARGE_EXEC __PAGE_KERNEL_LARGE_EXEC
38011+#define X_KERNPG_TABLE _KERNPG_TABLE
38012+
38013+#endif /* CONFIG_XEN */
38014+
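
The __ma() macro above converts a kernel-virtual address to a machine address by running its pseudo-physical frame number through Xen's phys-to-machine table. An illustrative sketch with a made-up three-entry p2m table (names and values are hypothetical):

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 12

/* Made-up phys-to-machine table: pseudo-physical pfn -> machine mfn. */
static const uint64_t p2m[] = { 0x4200, 0x4201, 0x9000 };

static uint64_t pfn_to_mfn_sketch(uint64_t pfn)
{
	return p2m[pfn];
}

/* Mirrors __ma(): page-aligned pseudo-physical address -> machine address. */
static uint64_t ma_sketch(uint64_t pseudo_phys)
{
	return pfn_to_mfn_sketch(pseudo_phys >> PAGE_SHIFT) << PAGE_SHIFT;
}

int main(void)
{
	printf("pseudo-phys 0x2000 -> machine 0x%llx\n",
	       (unsigned long long)ma_sketch(0x2000));	/* pfn 2 maps to mfn 0x9000 */
	return 0;
}
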
38015 static void init_level2_page(pmd_t *level2p, unsigned long addr)
38016 {
38017 unsigned long end_addr;
38018@@ -22,7 +129,7 @@
38019 addr &= PAGE_MASK;
38020 end_addr = addr + PUD_SIZE;
38021 while (addr < end_addr) {
38022- set_pmd(level2p++, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC));
38023+ x_set_pmd(level2p++, x__pmd(addr | X__PAGE_KERNEL_LARGE_EXEC));
38024 addr += PMD_SIZE;
38025 }
38026 }
38027@@ -47,12 +154,12 @@
38028 }
38029 level2p = (pmd_t *)page_address(page);
38030 init_level2_page(level2p, addr);
38031- set_pud(level3p++, __pud(__pa(level2p) | _KERNPG_TABLE));
38032+ x_set_pud(level3p++, x__pud(__pa(level2p) | X_KERNPG_TABLE));
38033 addr += PUD_SIZE;
38034 }
38035 /* clear the unused entries */
38036 while (addr < end_addr) {
38037- pud_clear(level3p++);
38038+ x_pud_clear(level3p++);
38039 addr += PUD_SIZE;
38040 }
38041 out:
38042@@ -83,12 +190,12 @@
38043 if (result) {
38044 goto out;
38045 }
38046- set_pgd(level4p++, __pgd(__pa(level3p) | _KERNPG_TABLE));
38047+ x_set_pgd(level4p++, x__pgd(__pa(level3p) | X_KERNPG_TABLE));
38048 addr += PGDIR_SIZE;
38049 }
38050 /* clear the unused entries */
38051 while (addr < end_addr) {
38052- pgd_clear(level4p++);
38053+ x_pgd_clear(level4p++);
38054 addr += PGDIR_SIZE;
38055 }
38056 out:
38057@@ -99,77 +206,29 @@
38058 static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
38059 {
38060 pgd_t *level4p;
38061- level4p = (pgd_t *)__va(start_pgtable);
38062- return init_level4_page(image, level4p, 0, end_pfn << PAGE_SHIFT);
38063-}
38064-
38065-static void set_idt(void *newidt, u16 limit)
38066-{
38067- struct desc_ptr curidt;
38068+ unsigned long x_end_pfn = end_pfn;
38069
38070- /* x86-64 supports unaliged loads & stores */
38071- curidt.size = limit;
38072- curidt.address = (unsigned long)newidt;
38073+#ifdef CONFIG_XEN
38074+ x_end_pfn = HYPERVISOR_memory_op(XENMEM_maximum_ram_page, NULL);
38075+#endif
38076
38077- __asm__ __volatile__ (
38078- "lidtq %0\n"
38079- : : "m" (curidt)
38080- );
38081-};
38082-
38083-
38084-static void set_gdt(void *newgdt, u16 limit)
38085-{
38086- struct desc_ptr curgdt;
38087-
38088- /* x86-64 supports unaligned loads & stores */
38089- curgdt.size = limit;
38090- curgdt.address = (unsigned long)newgdt;
38091-
38092- __asm__ __volatile__ (
38093- "lgdtq %0\n"
38094- : : "m" (curgdt)
38095- );
38096-};
38097-
38098-static void load_segments(void)
38099-{
38100- __asm__ __volatile__ (
38101- "\tmovl %0,%%ds\n"
38102- "\tmovl %0,%%es\n"
38103- "\tmovl %0,%%ss\n"
38104- "\tmovl %0,%%fs\n"
38105- "\tmovl %0,%%gs\n"
38106- : : "a" (__KERNEL_DS) : "memory"
38107- );
38108+ level4p = (pgd_t *)__va(start_pgtable);
38109+ return init_level4_page(image, level4p, 0, x_end_pfn << PAGE_SHIFT);
38110 }
38111
38112-typedef NORET_TYPE void (*relocate_new_kernel_t)(unsigned long indirection_page,
38113- unsigned long control_code_buffer,
38114- unsigned long start_address,
38115- unsigned long pgtable) ATTRIB_NORET;
38116-
38117-const extern unsigned char relocate_new_kernel[];
38118-const extern unsigned long relocate_new_kernel_size;
38119-
38120 int machine_kexec_prepare(struct kimage *image)
38121 {
38122- unsigned long start_pgtable, control_code_buffer;
38123+ unsigned long start_pgtable;
38124 int result;
38125
38126 /* Calculate the offsets */
38127 start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT;
38128- control_code_buffer = start_pgtable + PAGE_SIZE;
38129
38130 /* Setup the identity mapped 64bit page table */
38131 result = init_pgtable(image, start_pgtable);
38132 if (result)
38133 return result;
38134
38135- /* Place the code in the reboot code buffer */
38136- memcpy(__va(control_code_buffer), relocate_new_kernel,
38137- relocate_new_kernel_size);
38138-
38139 return 0;
38140 }
38141
38142@@ -178,54 +237,43 @@
38143 return;
38144 }
38145
38146+#ifndef CONFIG_XEN
38147 /*
38148 * Do not allocate memory (or fail in any way) in machine_kexec().
38149 * We are past the point of no return, committed to rebooting now.
38150 */
38151 NORET_TYPE void machine_kexec(struct kimage *image)
38152 {
38153- unsigned long page_list;
38154- unsigned long control_code_buffer;
38155- unsigned long start_pgtable;
38156- relocate_new_kernel_t rnk;
38157+ unsigned long page_list[PAGES_NR];
38158+ void *control_page;
38159
38160 /* Interrupts aren't acceptable while we reboot */
38161 local_irq_disable();
38162
38163- /* Calculate the offsets */
38164- page_list = image->head;
38165- start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT;
38166- control_code_buffer = start_pgtable + PAGE_SIZE;
38167+ control_page = page_address(image->control_code_page) + PAGE_SIZE;
38168+ memcpy(control_page, relocate_kernel, PAGE_SIZE);
38169+
38170+ page_list[PA_CONTROL_PAGE] = __pa(control_page);
38171+ page_list[VA_CONTROL_PAGE] = (unsigned long)relocate_kernel;
38172+ page_list[PA_PGD] = __pa(kexec_pgd);
38173+ page_list[VA_PGD] = (unsigned long)kexec_pgd;
38174+ page_list[PA_PUD_0] = __pa(kexec_pud0);
38175+ page_list[VA_PUD_0] = (unsigned long)kexec_pud0;
38176+ page_list[PA_PMD_0] = __pa(kexec_pmd0);
38177+ page_list[VA_PMD_0] = (unsigned long)kexec_pmd0;
38178+ page_list[PA_PTE_0] = __pa(kexec_pte0);
38179+ page_list[VA_PTE_0] = (unsigned long)kexec_pte0;
38180+ page_list[PA_PUD_1] = __pa(kexec_pud1);
38181+ page_list[VA_PUD_1] = (unsigned long)kexec_pud1;
38182+ page_list[PA_PMD_1] = __pa(kexec_pmd1);
38183+ page_list[VA_PMD_1] = (unsigned long)kexec_pmd1;
38184+ page_list[PA_PTE_1] = __pa(kexec_pte1);
38185+ page_list[VA_PTE_1] = (unsigned long)kexec_pte1;
38186+
38187+ page_list[PA_TABLE_PAGE] =
38188+ (unsigned long)__pa(page_address(image->control_code_page));
38189
38190- /* Set the low half of the page table to my identity mapped
38191- * page table for kexec. Leave the high half pointing at the
38192- * kernel pages. Don't bother to flush the global pages
38193- * as that will happen when I fully switch to my identity mapped
38194- * page table anyway.
38195- */
38196- memcpy(__va(read_cr3()), __va(start_pgtable), PAGE_SIZE/2);
38197- __flush_tlb();
38198-
38199-
38200- /* The segment registers are funny things, they are
38201- * automatically loaded from a table, in memory wherever you
38202- * set them to a specific selector, but this table is never
38203- * accessed again unless you set the segment to a different selector.
38204- *
38205- * The more common model are caches where the behide
38206- * the scenes work is done, but is also dropped at arbitrary
38207- * times.
38208- *
38209- * I take advantage of this here by force loading the
38210- * segments, before I zap the gdt with an invalid value.
38211- */
38212- load_segments();
38213- /* The gdt & idt are now invalid.
38214- * If you want to load them you must set up your own idt & gdt.
38215- */
38216- set_gdt(phys_to_virt(0),0);
38217- set_idt(phys_to_virt(0),0);
38218- /* now call it */
38219- rnk = (relocate_new_kernel_t) control_code_buffer;
38220- (*rnk)(page_list, control_code_buffer, image->start, start_pgtable);
38221+ relocate_kernel((unsigned long)image->head, (unsigned long)page_list,
38222+ image->start);
38223 }
38224+#endif
38225diff -Nur linux-2.6.16.33-noxen/arch/x86_64/kernel/machine_kexec.c~ linux-2.6.16.33/arch/x86_64/kernel/machine_kexec.c~
38226--- linux-2.6.16.33-noxen/arch/x86_64/kernel/machine_kexec.c~ 1970-01-01 00:00:00.000000000 +0000
38227+++ linux-2.6.16.33/arch/x86_64/kernel/machine_kexec.c~ 2007-05-23 21:00:01.000000000 +0000
38228@@ -0,0 +1,228 @@
38229+/*
38230+ * machine_kexec.c - handle transition of Linux booting another kernel
38231+ * Copyright (C) 2002-2005 Eric Biederman <ebiederm@xmission.com>
38232+ *
38233+ * This source code is licensed under the GNU General Public License,
38234+ * Version 2. See the file COPYING for more details.
38235+ */
38236+
38237+#include <linux/mm.h>
38238+#include <linux/kexec.h>
38239+#include <linux/string.h>
38240+#include <linux/reboot.h>
38241+#include <asm/pgtable.h>
38242+#include <asm/tlbflush.h>
38243+#include <asm/mmu_context.h>
38244+#include <asm/io.h>
38245+
38246+static void init_level2_page(pmd_t *level2p, unsigned long addr)
38247+{
38248+ unsigned long end_addr;
38249+
38250+ addr &= PAGE_MASK;
38251+ end_addr = addr + PUD_SIZE;
38252+ while (addr < end_addr) {
38253+ set_pmd(level2p++, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC));
38254+ addr += PMD_SIZE;
38255+ }
38256+}
38257+
38258+static int init_level3_page(struct kimage *image, pud_t *level3p,
38259+ unsigned long addr, unsigned long last_addr)
38260+{
38261+ unsigned long end_addr;
38262+ int result;
38263+
38264+ result = 0;
38265+ addr &= PAGE_MASK;
38266+ end_addr = addr + PGDIR_SIZE;
38267+ while ((addr < last_addr) && (addr < end_addr)) {
38268+ struct page *page;
38269+ pmd_t *level2p;
38270+
38271+ page = kimage_alloc_control_pages(image, 0);
38272+ if (!page) {
38273+ result = -ENOMEM;
38274+ goto out;
38275+ }
38276+ level2p = (pmd_t *)page_address(page);
38277+ init_level2_page(level2p, addr);
38278+ set_pud(level3p++, __pud(__pa(level2p) | _KERNPG_TABLE));
38279+ addr += PUD_SIZE;
38280+ }
38281+ /* clear the unused entries */
38282+ while (addr < end_addr) {
38283+ pud_clear(level3p++);
38284+ addr += PUD_SIZE;
38285+ }
38286+out:
38287+ return result;
38288+}
38289+
38290+
38291+static int init_level4_page(struct kimage *image, pgd_t *level4p,
38292+ unsigned long addr, unsigned long last_addr)
38293+{
38294+ unsigned long end_addr;
38295+ int result;
38296+
38297+ result = 0;
38298+ addr &= PAGE_MASK;
38299+ end_addr = addr + (PTRS_PER_PGD * PGDIR_SIZE);
38300+ while ((addr < last_addr) && (addr < end_addr)) {
38301+ struct page *page;
38302+ pud_t *level3p;
38303+
38304+ page = kimage_alloc_control_pages(image, 0);
38305+ if (!page) {
38306+ result = -ENOMEM;
38307+ goto out;
38308+ }
38309+ level3p = (pud_t *)page_address(page);
38310+ result = init_level3_page(image, level3p, addr, last_addr);
38311+ if (result) {
38312+ goto out;
38313+ }
38314+ set_pgd(level4p++, __pgd(__pa(level3p) | _KERNPG_TABLE));
38315+ addr += PGDIR_SIZE;
38316+ }
38317+ /* clear the unused entries */
38318+ while (addr < end_addr) {
38319+ pgd_clear(level4p++);
38320+ addr += PGDIR_SIZE;
38321+ }
38322+out:
38323+ return result;
38324+}
38325+
38326+
38327+static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
38328+{
38329+ pgd_t *level4p;
38330+ level4p = (pgd_t *)__va(start_pgtable);
38331+ return init_level4_page(image, level4p, 0, end_pfn << PAGE_SHIFT);
38332+}
38333+
38334+static void set_idt(void *newidt, u16 limit)
38335+{
38336+ struct desc_ptr curidt;
38337+
38338+	/* x86-64 supports unaligned loads & stores */
38339+ curidt.size = limit;
38340+ curidt.address = (unsigned long)newidt;
38341+
38342+ __asm__ __volatile__ (
38343+ "lidtq %0\n"
38344+ : : "m" (curidt)
38345+ );
38346+};
38347+
38348+
38349+static void set_gdt(void *newgdt, u16 limit)
38350+{
38351+ struct desc_ptr curgdt;
38352+
38353+ /* x86-64 supports unaligned loads & stores */
38354+ curgdt.size = limit;
38355+ curgdt.address = (unsigned long)newgdt;
38356+
38357+ __asm__ __volatile__ (
38358+ "lgdtq %0\n"
38359+ : : "m" (curgdt)
38360+ );
38361+};
38362+
38363+static void load_segments(void)
38364+{
38365+ __asm__ __volatile__ (
38366+ "\tmovl %0,%%ds\n"
38367+ "\tmovl %0,%%es\n"
38368+ "\tmovl %0,%%ss\n"
38369+ "\tmovl %0,%%fs\n"
38370+ "\tmovl %0,%%gs\n"
38371+ : : "a" (__KERNEL_DS) : "memory"
38372+ );
38373+}
38374+
38375+typedef NORET_TYPE void (*relocate_new_kernel_t)(unsigned long indirection_page,
38376+ unsigned long control_code_buffer,
38377+ unsigned long start_address,
38378+ unsigned long pgtable) ATTRIB_NORET;
38379+
38380+extern const unsigned char relocate_new_kernel[];
38381+extern const unsigned long relocate_new_kernel_size;
38382+
38383+int machine_kexec_prepare(struct kimage *image)
38384+{
38385+ unsigned long start_pgtable, control_code_buffer;
38386+ int result;
38387+
38388+ /* Calculate the offsets */
38389+ start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT;
38390+ control_code_buffer = start_pgtable + PAGE_SIZE;
38391+
38392+ /* Setup the identity mapped 64bit page table */
38393+ result = init_pgtable(image, start_pgtable);
38394+ if (result)
38395+ return result;
38396+
38397+ /* Place the code in the reboot code buffer */
38398+ memcpy(__va(control_code_buffer), relocate_new_kernel,
38399+ relocate_new_kernel_size);
38400+
38401+ return 0;
38402+}
38403+
38404+void machine_kexec_cleanup(struct kimage *image)
38405+{
38406+ return;
38407+}
38408+
38409+/*
38410+ * Do not allocate memory (or fail in any way) in machine_kexec().
38411+ * We are past the point of no return, committed to rebooting now.
38412+ */
38413+NORET_TYPE void machine_kexec(struct kimage *image)
38414+{
38415+ unsigned long page_list;
38416+ unsigned long control_code_buffer;
38417+ unsigned long start_pgtable;
38418+ relocate_new_kernel_t rnk;
38419+
38420+ /* Interrupts aren't acceptable while we reboot */
38421+ local_irq_disable();
38422+
38423+ /* Calculate the offsets */
38424+ page_list = image->head;
38425+ start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT;
38426+ control_code_buffer = start_pgtable + PAGE_SIZE;
38427+
38428+ /* Set the low half of the page table to my identity mapped
38429+ * page table for kexec. Leave the high half pointing at the
38430+ * kernel pages. Don't bother to flush the global pages
38431+ * as that will happen when I fully switch to my identity mapped
38432+ * page table anyway.
38433+ */
38434+ memcpy(__va(read_cr3()), __va(start_pgtable), PAGE_SIZE/2);
38435+ __flush_tlb();
38436+
38437+
38438+ /* The segment registers are funny things, they have both a
38439+ * visible and an invisible part. Whenever the visible part is
38440+ * set to a specific selector, the invisible part is loaded
38441+ * from a table in memory. At no other time is the
38442+ * descriptor table in memory accessed.
38443+ *
38444+ * I take advantage of this here by force loading the
38445+ * segments, before I zap the gdt with an invalid value.
38446+ */
38447+ load_segments();
38448+ /* The gdt & idt are now invalid.
38449+ * If you want to load them you must set up your own idt & gdt.
38450+ */
38451+ set_gdt(phys_to_virt(0),0);
38452+ set_idt(phys_to_virt(0),0);
38453+ /* now call it */
38454+ rnk = (relocate_new_kernel_t) control_code_buffer;
38455+ (*rnk)(page_list, control_code_buffer, image->start, start_pgtable);
38456+}
38457diff -Nur linux-2.6.16.33-noxen/arch/x86_64/kernel/mpparse-xen.c linux-2.6.16.33/arch/x86_64/kernel/mpparse-xen.c
38458--- linux-2.6.16.33-noxen/arch/x86_64/kernel/mpparse-xen.c 1970-01-01 00:00:00.000000000 +0000
38459+++ linux-2.6.16.33/arch/x86_64/kernel/mpparse-xen.c 2007-01-08 15:00:45.000000000 +0000
38460@@ -0,0 +1,1005 @@
38461+/*
38462+ * Intel Multiprocessor Specification 1.1 and 1.4
38463+ * compliant MP-table parsing routines.
38464+ *
38465+ * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
38466+ * (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com>
38467+ *
38468+ * Fixes
38469+ * Erich Boleyn : MP v1.4 and additional changes.
38470+ * Alan Cox : Added EBDA scanning
38471+ * Ingo Molnar : various cleanups and rewrites
38472+ * Maciej W. Rozycki: Bits for default MP configurations
38473+ * Paul Diefenbaugh: Added full ACPI support
38474+ */
38475+
38476+#include <linux/mm.h>
38477+#include <linux/init.h>
38478+#include <linux/delay.h>
38479+#include <linux/config.h>
38480+#include <linux/bootmem.h>
38481+#include <linux/smp_lock.h>
38482+#include <linux/kernel_stat.h>
38483+#include <linux/mc146818rtc.h>
38484+#include <linux/acpi.h>
38485+#include <linux/module.h>
38486+
38487+#include <asm/smp.h>
38488+#include <asm/mtrr.h>
38489+#include <asm/mpspec.h>
38490+#include <asm/pgalloc.h>
38491+#include <asm/io_apic.h>
38492+#include <asm/proto.h>
38493+#include <asm/acpi.h>
38494+
38495+/* Have we found an MP table */
38496+int smp_found_config;
38497+unsigned int __initdata maxcpus = NR_CPUS;
38498+
38499+int acpi_found_madt;
38500+
38501+/*
38502+ * Various Linux-internal data structures created from the
38503+ * MP-table.
38504+ */
38505+unsigned char apic_version [MAX_APICS];
38506+unsigned char mp_bus_id_to_type [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 };
38507+int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 };
38508+
38509+static int mp_current_pci_id = 0;
38510+/* I/O APIC entries */
38511+struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS];
38512+
38513+/* # of MP IRQ source entries */
38514+struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
38515+
38516+/* MP IRQ source entries */
38517+int mp_irq_entries;
38518+
38519+int nr_ioapics;
38520+int pic_mode;
38521+unsigned long mp_lapic_addr = 0;
38522+
38523+
38524+
38525+/* Processor that is doing the boot up */
38526+unsigned int boot_cpu_id = -1U;
38527+/* Internal processor count */
38528+unsigned int num_processors __initdata = 0;
38529+
38530+unsigned disabled_cpus __initdata;
38531+
38532+/* Bitmask of physically existing CPUs */
38533+physid_mask_t phys_cpu_present_map = PHYSID_MASK_NONE;
38534+
38535+/* ACPI MADT entry parsing functions */
38536+#ifdef CONFIG_ACPI
38537+extern struct acpi_boot_flags acpi_boot;
38538+#ifdef CONFIG_X86_LOCAL_APIC
38539+extern int acpi_parse_lapic (acpi_table_entry_header *header);
38540+extern int acpi_parse_lapic_addr_ovr (acpi_table_entry_header *header);
38541+extern int acpi_parse_lapic_nmi (acpi_table_entry_header *header);
38542+#endif /*CONFIG_X86_LOCAL_APIC*/
38543+#ifdef CONFIG_X86_IO_APIC
38544+extern int acpi_parse_ioapic (acpi_table_entry_header *header);
38545+#endif /*CONFIG_X86_IO_APIC*/
38546+#endif /*CONFIG_ACPI*/
38547+
38548+u8 bios_cpu_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
38549+
38550+
38551+/*
38552+ * Intel MP BIOS table parsing routines:
38553+ */
38554+
38555+/*
38556+ * Checksum an MP configuration block.
38557+ */
38558+
38559+static int __init mpf_checksum(unsigned char *mp, int len)
38560+{
38561+ int sum = 0;
38562+
38563+ while (len--)
38564+ sum += *mp++;
38565+
38566+ return sum & 0xFF;
38567+}
38568+
38569+#ifndef CONFIG_XEN
38570+static void __init MP_processor_info (struct mpc_config_processor *m)
38571+{
38572+ int cpu;
38573+ unsigned char ver;
38574+ static int found_bsp=0;
38575+
38576+ if (!(m->mpc_cpuflag & CPU_ENABLED)) {
38577+ disabled_cpus++;
38578+ return;
38579+ }
38580+
38581+ printk(KERN_INFO "Processor #%d %d:%d APIC version %d\n",
38582+ m->mpc_apicid,
38583+ (m->mpc_cpufeature & CPU_FAMILY_MASK)>>8,
38584+ (m->mpc_cpufeature & CPU_MODEL_MASK)>>4,
38585+ m->mpc_apicver);
38586+
38587+ if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
38588+ Dprintk(" Bootup CPU\n");
38589+ boot_cpu_id = m->mpc_apicid;
38590+ }
38591+ if (num_processors >= NR_CPUS) {
38592+ printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached."
38593+ " Processor ignored.\n", NR_CPUS);
38594+ return;
38595+ }
38596+
38597+ cpu = num_processors++;
38598+
38599+#if MAX_APICS < 255
38600+ if ((int)m->mpc_apicid > MAX_APICS) {
38601+ printk(KERN_ERR "Processor #%d INVALID. (Max ID: %d).\n",
38602+ m->mpc_apicid, MAX_APICS);
38603+ return;
38604+ }
38605+#endif
38606+ ver = m->mpc_apicver;
38607+
38608+ physid_set(m->mpc_apicid, phys_cpu_present_map);
38609+ /*
38610+ * Validate version
38611+ */
38612+ if (ver == 0x0) {
38613+ printk(KERN_ERR "BIOS bug, APIC version is 0 for CPU#%d! fixing up to 0x10. (tell your hw vendor)\n", m->mpc_apicid);
38614+ ver = 0x10;
38615+ }
38616+ apic_version[m->mpc_apicid] = ver;
38617+ if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
38618+ /*
38619+ * bios_cpu_apicid is required to have processors listed
38620+ * in same order as logical cpu numbers. Hence the first
38621+ * entry is BSP, and so on.
38622+ */
38623+ cpu = 0;
38624+
38625+ bios_cpu_apicid[0] = m->mpc_apicid;
38626+ x86_cpu_to_apicid[0] = m->mpc_apicid;
38627+ found_bsp = 1;
38628+ } else
38629+ cpu = num_processors - found_bsp;
38630+ bios_cpu_apicid[cpu] = m->mpc_apicid;
38631+ x86_cpu_to_apicid[cpu] = m->mpc_apicid;
38632+
38633+ cpu_set(cpu, cpu_possible_map);
38634+ cpu_set(cpu, cpu_present_map);
38635+}
38636+#else
38637+void __init MP_processor_info (struct mpc_config_processor *m)
38638+{
38639+ num_processors++;
38640+}
38641+#endif /* CONFIG_XEN */
38642+
38643+static void __init MP_bus_info (struct mpc_config_bus *m)
38644+{
38645+ char str[7];
38646+
38647+ memcpy(str, m->mpc_bustype, 6);
38648+ str[6] = 0;
38649+ Dprintk("Bus #%d is %s\n", m->mpc_busid, str);
38650+
38651+ if (strncmp(str, "ISA", 3) == 0) {
38652+ mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA;
38653+ } else if (strncmp(str, "EISA", 4) == 0) {
38654+ mp_bus_id_to_type[m->mpc_busid] = MP_BUS_EISA;
38655+ } else if (strncmp(str, "PCI", 3) == 0) {
38656+ mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI;
38657+ mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id;
38658+ mp_current_pci_id++;
38659+ } else if (strncmp(str, "MCA", 3) == 0) {
38660+ mp_bus_id_to_type[m->mpc_busid] = MP_BUS_MCA;
38661+ } else {
38662+ printk(KERN_ERR "Unknown bustype %s\n", str);
38663+ }
38664+}
38665+
38666+static void __init MP_ioapic_info (struct mpc_config_ioapic *m)
38667+{
38668+ if (!(m->mpc_flags & MPC_APIC_USABLE))
38669+ return;
38670+
38671+ printk("I/O APIC #%d Version %d at 0x%X.\n",
38672+ m->mpc_apicid, m->mpc_apicver, m->mpc_apicaddr);
38673+ if (nr_ioapics >= MAX_IO_APICS) {
38674+ printk(KERN_ERR "Max # of I/O APICs (%d) exceeded (found %d).\n",
38675+ MAX_IO_APICS, nr_ioapics);
38676+ panic("Recompile kernel with bigger MAX_IO_APICS!.\n");
38677+ }
38678+ if (!m->mpc_apicaddr) {
38679+ printk(KERN_ERR "WARNING: bogus zero I/O APIC address"
38680+ " found in MP table, skipping!\n");
38681+ return;
38682+ }
38683+ mp_ioapics[nr_ioapics] = *m;
38684+ nr_ioapics++;
38685+}
38686+
38687+static void __init MP_intsrc_info (struct mpc_config_intsrc *m)
38688+{
38689+ mp_irqs [mp_irq_entries] = *m;
38690+ Dprintk("Int: type %d, pol %d, trig %d, bus %d,"
38691+ " IRQ %02x, APIC ID %x, APIC INT %02x\n",
38692+ m->mpc_irqtype, m->mpc_irqflag & 3,
38693+ (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus,
38694+ m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq);
38695+ if (++mp_irq_entries >= MAX_IRQ_SOURCES)
38696+ panic("Max # of irq sources exceeded!!\n");
38697+}
38698+
38699+static void __init MP_lintsrc_info (struct mpc_config_lintsrc *m)
38700+{
38701+ Dprintk("Lint: type %d, pol %d, trig %d, bus %d,"
38702+ " IRQ %02x, APIC ID %x, APIC LINT %02x\n",
38703+ m->mpc_irqtype, m->mpc_irqflag & 3,
38704+ (m->mpc_irqflag >> 2) &3, m->mpc_srcbusid,
38705+ m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint);
38706+ /*
38707+ * Well it seems all SMP boards in existence
38708+ * use ExtINT/LVT1 == LINT0 and
38709+ * NMI/LVT2 == LINT1 - the following check
38710+ * will show us if this assumption is false.
38711+ * Until then we do not have to add baggage.
38712+ */
38713+ if ((m->mpc_irqtype == mp_ExtINT) &&
38714+ (m->mpc_destapiclint != 0))
38715+ BUG();
38716+ if ((m->mpc_irqtype == mp_NMI) &&
38717+ (m->mpc_destapiclint != 1))
38718+ BUG();
38719+}
38720+
38721+/*
38722+ * Read/parse the MPC
38723+ */
38724+
38725+static int __init smp_read_mpc(struct mp_config_table *mpc)
38726+{
38727+ char str[16];
38728+ int count=sizeof(*mpc);
38729+ unsigned char *mpt=((unsigned char *)mpc)+count;
38730+
38731+ if (memcmp(mpc->mpc_signature,MPC_SIGNATURE,4)) {
38732+ printk("SMP mptable: bad signature [%c%c%c%c]!\n",
38733+ mpc->mpc_signature[0],
38734+ mpc->mpc_signature[1],
38735+ mpc->mpc_signature[2],
38736+ mpc->mpc_signature[3]);
38737+ return 0;
38738+ }
38739+ if (mpf_checksum((unsigned char *)mpc,mpc->mpc_length)) {
38740+ printk("SMP mptable: checksum error!\n");
38741+ return 0;
38742+ }
38743+ if (mpc->mpc_spec!=0x01 && mpc->mpc_spec!=0x04) {
38744+ printk(KERN_ERR "SMP mptable: bad table version (%d)!!\n",
38745+ mpc->mpc_spec);
38746+ return 0;
38747+ }
38748+ if (!mpc->mpc_lapic) {
38749+ printk(KERN_ERR "SMP mptable: null local APIC address!\n");
38750+ return 0;
38751+ }
38752+ memcpy(str,mpc->mpc_oem,8);
38753+ str[8]=0;
38754+ printk(KERN_INFO "OEM ID: %s ",str);
38755+
38756+ memcpy(str,mpc->mpc_productid,12);
38757+ str[12]=0;
38758+ printk("Product ID: %s ",str);
38759+
38760+ printk("APIC at: 0x%X\n",mpc->mpc_lapic);
38761+
38762+ /* save the local APIC address, it might be non-default */
38763+ if (!acpi_lapic)
38764+ mp_lapic_addr = mpc->mpc_lapic;
38765+
38766+ /*
38767+ * Now process the configuration blocks.
38768+ */
38769+ while (count < mpc->mpc_length) {
38770+ switch(*mpt) {
38771+ case MP_PROCESSOR:
38772+ {
38773+ struct mpc_config_processor *m=
38774+ (struct mpc_config_processor *)mpt;
38775+ if (!acpi_lapic)
38776+ MP_processor_info(m);
38777+ mpt += sizeof(*m);
38778+ count += sizeof(*m);
38779+ break;
38780+ }
38781+ case MP_BUS:
38782+ {
38783+ struct mpc_config_bus *m=
38784+ (struct mpc_config_bus *)mpt;
38785+ MP_bus_info(m);
38786+ mpt += sizeof(*m);
38787+ count += sizeof(*m);
38788+ break;
38789+ }
38790+ case MP_IOAPIC:
38791+ {
38792+ struct mpc_config_ioapic *m=
38793+ (struct mpc_config_ioapic *)mpt;
38794+ MP_ioapic_info(m);
38795+ mpt+=sizeof(*m);
38796+ count+=sizeof(*m);
38797+ break;
38798+ }
38799+ case MP_INTSRC:
38800+ {
38801+ struct mpc_config_intsrc *m=
38802+ (struct mpc_config_intsrc *)mpt;
38803+
38804+ MP_intsrc_info(m);
38805+ mpt+=sizeof(*m);
38806+ count+=sizeof(*m);
38807+ break;
38808+ }
38809+ case MP_LINTSRC:
38810+ {
38811+ struct mpc_config_lintsrc *m=
38812+ (struct mpc_config_lintsrc *)mpt;
38813+ MP_lintsrc_info(m);
38814+ mpt+=sizeof(*m);
38815+ count+=sizeof(*m);
38816+ break;
38817+ }
38818+ }
38819+ }
38820+ clustered_apic_check();
38821+ if (!num_processors)
38822+ printk(KERN_ERR "SMP mptable: no processors registered!\n");
38823+ return num_processors;
38824+}
38825+
38826+static int __init ELCR_trigger(unsigned int irq)
38827+{
38828+ unsigned int port;
38829+
38830+ port = 0x4d0 + (irq >> 3);
38831+ return (inb(port) >> (irq & 7)) & 1;
38832+}
38833+
38834+static void __init construct_default_ioirq_mptable(int mpc_default_type)
38835+{
38836+ struct mpc_config_intsrc intsrc;
38837+ int i;
38838+ int ELCR_fallback = 0;
38839+
38840+ intsrc.mpc_type = MP_INTSRC;
38841+ intsrc.mpc_irqflag = 0; /* conforming */
38842+ intsrc.mpc_srcbus = 0;
38843+ intsrc.mpc_dstapic = mp_ioapics[0].mpc_apicid;
38844+
38845+ intsrc.mpc_irqtype = mp_INT;
38846+
38847+ /*
38848+ * If true, we have an ISA/PCI system with no IRQ entries
38849+ * in the MP table. To prevent the PCI interrupts from being set up
38850+ * incorrectly, we try to use the ELCR. The sanity check to see if
38851+ * there is good ELCR data is very simple - IRQ0, 1, 2 and 13 can
38852+ * never be level sensitive, so we simply see if the ELCR agrees.
38853+ * If it does, we assume it's valid.
38854+ */
38855+ if (mpc_default_type == 5) {
38856+ printk(KERN_INFO "ISA/PCI bus type with no IRQ information... falling back to ELCR\n");
38857+
38858+ if (ELCR_trigger(0) || ELCR_trigger(1) || ELCR_trigger(2) || ELCR_trigger(13))
38859+ printk(KERN_ERR "ELCR contains invalid data... not using ELCR\n");
38860+ else {
38861+ printk(KERN_INFO "Using ELCR to identify PCI interrupts\n");
38862+ ELCR_fallback = 1;
38863+ }
38864+ }
38865+
38866+ for (i = 0; i < 16; i++) {
38867+ switch (mpc_default_type) {
38868+ case 2:
38869+ if (i == 0 || i == 13)
38870+ continue; /* IRQ0 & IRQ13 not connected */
38871+ /* fall through */
38872+ default:
38873+ if (i == 2)
38874+ continue; /* IRQ2 is never connected */
38875+ }
38876+
38877+ if (ELCR_fallback) {
38878+ /*
38879+ * If the ELCR indicates a level-sensitive interrupt, we
38880+ * copy that information over to the MP table in the
38881+ * irqflag field (level sensitive, active high polarity).
38882+ */
38883+ if (ELCR_trigger(i))
38884+ intsrc.mpc_irqflag = 13;
38885+ else
38886+ intsrc.mpc_irqflag = 0;
38887+ }
38888+
38889+ intsrc.mpc_srcbusirq = i;
38890+ intsrc.mpc_dstirq = i ? i : 2; /* IRQ0 to INTIN2 */
38891+ MP_intsrc_info(&intsrc);
38892+ }
38893+
38894+ intsrc.mpc_irqtype = mp_ExtINT;
38895+ intsrc.mpc_srcbusirq = 0;
38896+ intsrc.mpc_dstirq = 0; /* 8259A to INTIN0 */
38897+ MP_intsrc_info(&intsrc);
38898+}
38899+
38900+static inline void __init construct_default_ISA_mptable(int mpc_default_type)
38901+{
38902+ struct mpc_config_processor processor;
38903+ struct mpc_config_bus bus;
38904+ struct mpc_config_ioapic ioapic;
38905+ struct mpc_config_lintsrc lintsrc;
38906+ int linttypes[2] = { mp_ExtINT, mp_NMI };
38907+ int i;
38908+
38909+ /*
38910+ * local APIC has default address
38911+ */
38912+ mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
38913+
38914+ /*
38915+ * 2 CPUs, numbered 0 & 1.
38916+ */
38917+ processor.mpc_type = MP_PROCESSOR;
38918+ /* Either an integrated APIC or a discrete 82489DX. */
38919+ processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
38920+ processor.mpc_cpuflag = CPU_ENABLED;
38921+ processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
38922+ (boot_cpu_data.x86_model << 4) |
38923+ boot_cpu_data.x86_mask;
38924+ processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
38925+ processor.mpc_reserved[0] = 0;
38926+ processor.mpc_reserved[1] = 0;
38927+ for (i = 0; i < 2; i++) {
38928+ processor.mpc_apicid = i;
38929+ MP_processor_info(&processor);
38930+ }
38931+
38932+ bus.mpc_type = MP_BUS;
38933+ bus.mpc_busid = 0;
38934+ switch (mpc_default_type) {
38935+ default:
38936+ printk(KERN_ERR "???\nUnknown standard configuration %d\n",
38937+ mpc_default_type);
38938+ /* fall through */
38939+ case 1:
38940+ case 5:
38941+ memcpy(bus.mpc_bustype, "ISA ", 6);
38942+ break;
38943+ case 2:
38944+ case 6:
38945+ case 3:
38946+ memcpy(bus.mpc_bustype, "EISA ", 6);
38947+ break;
38948+ case 4:
38949+ case 7:
38950+ memcpy(bus.mpc_bustype, "MCA ", 6);
38951+ }
38952+ MP_bus_info(&bus);
38953+ if (mpc_default_type > 4) {
38954+ bus.mpc_busid = 1;
38955+ memcpy(bus.mpc_bustype, "PCI ", 6);
38956+ MP_bus_info(&bus);
38957+ }
38958+
38959+ ioapic.mpc_type = MP_IOAPIC;
38960+ ioapic.mpc_apicid = 2;
38961+ ioapic.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
38962+ ioapic.mpc_flags = MPC_APIC_USABLE;
38963+ ioapic.mpc_apicaddr = 0xFEC00000;
38964+ MP_ioapic_info(&ioapic);
38965+
38966+ /*
38967+ * We set up most of the low 16 IO-APIC pins according to MPS rules.
38968+ */
38969+ construct_default_ioirq_mptable(mpc_default_type);
38970+
38971+ lintsrc.mpc_type = MP_LINTSRC;
38972+ lintsrc.mpc_irqflag = 0; /* conforming */
38973+ lintsrc.mpc_srcbusid = 0;
38974+ lintsrc.mpc_srcbusirq = 0;
38975+ lintsrc.mpc_destapic = MP_APIC_ALL;
38976+ for (i = 0; i < 2; i++) {
38977+ lintsrc.mpc_irqtype = linttypes[i];
38978+ lintsrc.mpc_destapiclint = i;
38979+ MP_lintsrc_info(&lintsrc);
38980+ }
38981+}
38982+
38983+static struct intel_mp_floating *mpf_found;
38984+
38985+/*
38986+ * Scan the memory blocks for an SMP configuration block.
38987+ */
38988+void __init get_smp_config (void)
38989+{
38990+ struct intel_mp_floating *mpf = mpf_found;
38991+
38992+ /*
38993+ * ACPI supports both logical (e.g. Hyper-Threading) and physical
38994+ * processors, where MPS only supports physical.
38995+ */
38996+ if (acpi_lapic && acpi_ioapic) {
38997+ printk(KERN_INFO "Using ACPI (MADT) for SMP configuration information\n");
38998+ return;
38999+ }
39000+ else if (acpi_lapic)
39001+ printk(KERN_INFO "Using ACPI for processor (LAPIC) configuration information\n");
39002+
39003+ printk("Intel MultiProcessor Specification v1.%d\n", mpf->mpf_specification);
39004+ if (mpf->mpf_feature2 & (1<<7)) {
39005+ printk(KERN_INFO " IMCR and PIC compatibility mode.\n");
39006+ pic_mode = 1;
39007+ } else {
39008+ printk(KERN_INFO " Virtual Wire compatibility mode.\n");
39009+ pic_mode = 0;
39010+ }
39011+
39012+ /*
39013+ * Now see if we need to read further.
39014+ */
39015+ if (mpf->mpf_feature1 != 0) {
39016+
39017+ printk(KERN_INFO "Default MP configuration #%d\n", mpf->mpf_feature1);
39018+ construct_default_ISA_mptable(mpf->mpf_feature1);
39019+
39020+ } else if (mpf->mpf_physptr) {
39021+
39022+ /*
39023+ * Read the physical hardware table. Anything here will
39024+ * override the defaults.
39025+ */
39026+ if (!smp_read_mpc(isa_bus_to_virt(mpf->mpf_physptr))) {
39027+ smp_found_config = 0;
39028+ printk(KERN_ERR "BIOS bug, MP table errors detected!...\n");
39029+ printk(KERN_ERR "... disabling SMP support. (tell your hw vendor)\n");
39030+ return;
39031+ }
39032+ /*
39033+ * If there are no explicit MP IRQ entries, then we are
39034+ * broken. We set up most of the low 16 IO-APIC pins to
39035+ * ISA defaults and hope it will work.
39036+ */
39037+ if (!mp_irq_entries) {
39038+ struct mpc_config_bus bus;
39039+
39040+ printk(KERN_ERR "BIOS bug, no explicit IRQ entries, using default mptable. (tell your hw vendor)\n");
39041+
39042+ bus.mpc_type = MP_BUS;
39043+ bus.mpc_busid = 0;
39044+ memcpy(bus.mpc_bustype, "ISA ", 6);
39045+ MP_bus_info(&bus);
39046+
39047+ construct_default_ioirq_mptable(0);
39048+ }
39049+
39050+ } else
39051+ BUG();
39052+
39053+ printk(KERN_INFO "Processors: %d\n", num_processors);
39054+ /*
39055+ * Only use the first configuration found.
39056+ */
39057+}
39058+
39059+static int __init smp_scan_config (unsigned long base, unsigned long length)
39060+{
39061+ extern void __bad_mpf_size(void);
39062+ unsigned int *bp = isa_bus_to_virt(base);
39063+ struct intel_mp_floating *mpf;
39064+
39065+ Dprintk("Scan SMP from %p for %ld bytes.\n", bp,length);
39066+ if (sizeof(*mpf) != 16)
39067+ __bad_mpf_size();
39068+
39069+ while (length > 0) {
39070+ mpf = (struct intel_mp_floating *)bp;
39071+ if ((*bp == SMP_MAGIC_IDENT) &&
39072+ (mpf->mpf_length == 1) &&
39073+ !mpf_checksum((unsigned char *)bp, 16) &&
39074+ ((mpf->mpf_specification == 1)
39075+ || (mpf->mpf_specification == 4)) ) {
39076+
39077+ smp_found_config = 1;
39078+ mpf_found = mpf;
39079+ return 1;
39080+ }
39081+ bp += 4;
39082+ length -= 16;
39083+ }
39084+ return 0;
39085+}
39086+
39087+void __init find_intel_smp (void)
39088+{
39089+ unsigned int address;
39090+
39091+ /*
39092+ * FIXME: Linux assumes you have 640K of base ram..
39093+ * this continues the error...
39094+ *
39095+ * 1) Scan the bottom 1K for a signature
39096+ * 2) Scan the top 1K of base RAM
39097+ * 3) Scan the 64K of bios
39098+ */
39099+ if (smp_scan_config(0x0,0x400) ||
39100+ smp_scan_config(639*0x400,0x400) ||
39101+ smp_scan_config(0xF0000,0x10000))
39102+ return;
39103+ /*
39104+ * If it is an SMP machine we should know now, unless the
39105+ * configuration is in an EISA/MCA bus machine with an
39106+ * extended bios data area.
39107+ *
39108+ * there is a real-mode segmented pointer pointing to the
39109+ * 4K EBDA area at 0x40E, calculate and scan it here.
39110+ *
39111+ * NOTE! There are Linux loaders that will corrupt the EBDA
39112+ * area, and as such this kind of SMP config may be less
39113+ * trustworthy, simply because the SMP table may have been
39114+ * stomped on during early boot. These loaders are buggy and
39115+ * should be fixed.
39116+ */
39117+
39118+ address = *(unsigned short *)phys_to_virt(0x40E);
39119+ address <<= 4;
39120+ if (smp_scan_config(address, 0x1000))
39121+ return;
39122+
39123+ /* If we have come this far, we did not find an MP table */
39124+ printk(KERN_INFO "No mptable found.\n");
39125+}
39126+
39127+/*
39128+ * - Intel MP Configuration Table
39129+ */
39130+void __init find_smp_config (void)
39131+{
39132+#ifdef CONFIG_X86_LOCAL_APIC
39133+ find_intel_smp();
39134+#endif
39135+}
39136+
39137+
39138+/* --------------------------------------------------------------------------
39139+ ACPI-based MP Configuration
39140+ -------------------------------------------------------------------------- */
39141+
39142+#ifdef CONFIG_ACPI
39143+
39144+void __init mp_register_lapic_address (
39145+ u64 address)
39146+{
39147+#ifndef CONFIG_XEN
39148+ mp_lapic_addr = (unsigned long) address;
39149+
39150+ set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr);
39151+
39152+ if (boot_cpu_id == -1U)
39153+ boot_cpu_id = GET_APIC_ID(apic_read(APIC_ID));
39154+
39155+ Dprintk("Boot CPU = %d\n", boot_cpu_physical_apicid);
39156+#endif
39157+}
39158+
39159+
39160+void __init mp_register_lapic (
39161+ u8 id,
39162+ u8 enabled)
39163+{
39164+ struct mpc_config_processor processor;
39165+ int boot_cpu = 0;
39166+
39167+ if (id >= MAX_APICS) {
39168+ printk(KERN_WARNING "Processor #%d invalid (max %d)\n",
39169+ id, MAX_APICS);
39170+ return;
39171+ }
39172+
39173+ if (id == boot_cpu_physical_apicid)
39174+ boot_cpu = 1;
39175+
39176+#ifndef CONFIG_XEN
39177+ processor.mpc_type = MP_PROCESSOR;
39178+ processor.mpc_apicid = id;
39179+ processor.mpc_apicver = GET_APIC_VERSION(apic_read(APIC_LVR));
39180+ processor.mpc_cpuflag = (enabled ? CPU_ENABLED : 0);
39181+ processor.mpc_cpuflag |= (boot_cpu ? CPU_BOOTPROCESSOR : 0);
39182+ processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
39183+ (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask;
39184+ processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
39185+ processor.mpc_reserved[0] = 0;
39186+ processor.mpc_reserved[1] = 0;
39187+#endif
39188+
39189+ MP_processor_info(&processor);
39190+}
39191+
39192+#ifdef CONFIG_X86_IO_APIC
39193+
39194+#define MP_ISA_BUS 0
39195+#define MP_MAX_IOAPIC_PIN 127
39196+
39197+static struct mp_ioapic_routing {
39198+ int apic_id;
39199+ int gsi_start;
39200+ int gsi_end;
39201+ u32 pin_programmed[4];
39202+} mp_ioapic_routing[MAX_IO_APICS];
39203+
39204+
39205+static int mp_find_ioapic (
39206+ int gsi)
39207+{
39208+ int i = 0;
39209+
39210+ /* Find the IOAPIC that manages this GSI. */
39211+ for (i = 0; i < nr_ioapics; i++) {
39212+ if ((gsi >= mp_ioapic_routing[i].gsi_start)
39213+ && (gsi <= mp_ioapic_routing[i].gsi_end))
39214+ return i;
39215+ }
39216+
39217+ printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi);
39218+
39219+ return -1;
39220+}
39221+
39222+
39223+void __init mp_register_ioapic (
39224+ u8 id,
39225+ u32 address,
39226+ u32 gsi_base)
39227+{
39228+ int idx = 0;
39229+
39230+ if (nr_ioapics >= MAX_IO_APICS) {
39231+ printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded "
39232+ "(found %d)\n", MAX_IO_APICS, nr_ioapics);
39233+ panic("Recompile kernel with bigger MAX_IO_APICS!\n");
39234+ }
39235+ if (!address) {
39236+ printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address"
39237+ " found in MADT table, skipping!\n");
39238+ return;
39239+ }
39240+
39241+ idx = nr_ioapics++;
39242+
39243+ mp_ioapics[idx].mpc_type = MP_IOAPIC;
39244+ mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE;
39245+ mp_ioapics[idx].mpc_apicaddr = address;
39246+
39247+#ifndef CONFIG_XEN
39248+ set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
39249+#endif
39250+ mp_ioapics[idx].mpc_apicid = id;
39251+ mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx);
39252+
39253+ /*
39254+ * Build basic IRQ lookup table to facilitate gsi->io_apic lookups
39255+ * and to prevent reprogramming of IOAPIC pins (PCI IRQs).
39256+ */
39257+ mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid;
39258+ mp_ioapic_routing[idx].gsi_start = gsi_base;
39259+ mp_ioapic_routing[idx].gsi_end = gsi_base +
39260+ io_apic_get_redir_entries(idx);
39261+
39262+ printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, "
39263+ "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid,
39264+ mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr,
39265+ mp_ioapic_routing[idx].gsi_start,
39266+ mp_ioapic_routing[idx].gsi_end);
39267+
39268+ return;
39269+}
39270+
39271+
39272+void __init mp_override_legacy_irq (
39273+ u8 bus_irq,
39274+ u8 polarity,
39275+ u8 trigger,
39276+ u32 gsi)
39277+{
39278+ struct mpc_config_intsrc intsrc;
39279+ int ioapic = -1;
39280+ int pin = -1;
39281+
39282+ /*
39283+ * Convert 'gsi' to 'ioapic.pin'.
39284+ */
39285+ ioapic = mp_find_ioapic(gsi);
39286+ if (ioapic < 0)
39287+ return;
39288+ pin = gsi - mp_ioapic_routing[ioapic].gsi_start;
39289+
39290+ /*
39291+ * TBD: This check is for faulty timer entries, where the override
39292+ * erroneously sets the trigger to level, resulting in a HUGE
39293+ * increase of timer interrupts!
39294+ */
39295+ if ((bus_irq == 0) && (trigger == 3))
39296+ trigger = 1;
39297+
39298+ intsrc.mpc_type = MP_INTSRC;
39299+ intsrc.mpc_irqtype = mp_INT;
39300+ intsrc.mpc_irqflag = (trigger << 2) | polarity;
39301+ intsrc.mpc_srcbus = MP_ISA_BUS;
39302+ intsrc.mpc_srcbusirq = bus_irq; /* IRQ */
39303+ intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; /* APIC ID */
39304+ intsrc.mpc_dstirq = pin; /* INTIN# */
39305+
39306+ Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, %d-%d\n",
39307+ intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3,
39308+ (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus,
39309+ intsrc.mpc_srcbusirq, intsrc.mpc_dstapic, intsrc.mpc_dstirq);
39310+
39311+ mp_irqs[mp_irq_entries] = intsrc;
39312+ if (++mp_irq_entries == MAX_IRQ_SOURCES)
39313+ panic("Max # of irq sources exceeded!\n");
39314+
39315+ return;
39316+}
39317+
39318+
39319+void __init mp_config_acpi_legacy_irqs (void)
39320+{
39321+ struct mpc_config_intsrc intsrc;
39322+ int i = 0;
39323+ int ioapic = -1;
39324+
39325+ /*
39326+ * Fabricate the legacy ISA bus (bus #31).
39327+ */
39328+ mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA;
39329+ Dprintk("Bus #%d is ISA\n", MP_ISA_BUS);
39330+
39331+ /*
39332+ * Locate the IOAPIC that manages the ISA IRQs (0-15).
39333+ */
39334+ ioapic = mp_find_ioapic(0);
39335+ if (ioapic < 0)
39336+ return;
39337+
39338+ intsrc.mpc_type = MP_INTSRC;
39339+ intsrc.mpc_irqflag = 0; /* Conforming */
39340+ intsrc.mpc_srcbus = MP_ISA_BUS;
39341+ intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid;
39342+
39343+ /*
39344+ * Use the default configuration for the IRQs 0-15. Unless
39345+ * overridden by (MADT) interrupt source override entries.
39346+ */
39347+ for (i = 0; i < 16; i++) {
39348+ int idx;
39349+
39350+ for (idx = 0; idx < mp_irq_entries; idx++) {
39351+ struct mpc_config_intsrc *irq = mp_irqs + idx;
39352+
39353+ /* Do we already have a mapping for this ISA IRQ? */
39354+ if (irq->mpc_srcbus == MP_ISA_BUS && irq->mpc_srcbusirq == i)
39355+ break;
39356+
39357+ /* Do we already have a mapping for this IOAPIC pin */
39358+ if ((irq->mpc_dstapic == intsrc.mpc_dstapic) &&
39359+ (irq->mpc_dstirq == i))
39360+ break;
39361+ }
39362+
39363+ if (idx != mp_irq_entries) {
39364+ printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i);
39365+ continue; /* IRQ already used */
39366+ }
39367+
39368+ intsrc.mpc_irqtype = mp_INT;
39369+ intsrc.mpc_srcbusirq = i; /* Identity mapped */
39370+ intsrc.mpc_dstirq = i;
39371+
39372+ Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, "
39373+ "%d-%d\n", intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3,
39374+ (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus,
39375+ intsrc.mpc_srcbusirq, intsrc.mpc_dstapic,
39376+ intsrc.mpc_dstirq);
39377+
39378+ mp_irqs[mp_irq_entries] = intsrc;
39379+ if (++mp_irq_entries == MAX_IRQ_SOURCES)
39380+ panic("Max # of irq sources exceeded!\n");
39381+ }
39382+
39383+ return;
39384+}
39385+
39386+#define MAX_GSI_NUM 4096
39387+
39388+int mp_register_gsi(u32 gsi, int triggering, int polarity)
39389+{
39390+ int ioapic = -1;
39391+ int ioapic_pin = 0;
39392+ int idx, bit = 0;
39393+ static int pci_irq = 16;
39394+ /*
39395+ * Mapping between Global System Interrupts, which
39396+ * represent all possible interrupts, to the IRQs
39397+ * assigned to actual devices.
39398+ */
39399+ static int gsi_to_irq[MAX_GSI_NUM];
39400+
39401+ if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC)
39402+ return gsi;
39403+
39404+ /* Don't set up the ACPI SCI because it's already set up */
39405+ if (acpi_fadt.sci_int == gsi)
39406+ return gsi;
39407+
39408+ ioapic = mp_find_ioapic(gsi);
39409+ if (ioapic < 0) {
39410+ printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi);
39411+ return gsi;
39412+ }
39413+
39414+ ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_start;
39415+
39416+ /*
39417+ * Avoid pin reprogramming. PRTs typically include entries
39418+ * with redundant pin->gsi mappings (but unique PCI devices);
39419+ * we only program the IOAPIC on the first.
39420+ */
39421+ bit = ioapic_pin % 32;
39422+ idx = (ioapic_pin < 32) ? 0 : (ioapic_pin / 32);
39423+ if (idx > 3) {
39424+ printk(KERN_ERR "Invalid reference to IOAPIC pin "
39425+ "%d-%d\n", mp_ioapic_routing[ioapic].apic_id,
39426+ ioapic_pin);
39427+ return gsi;
39428+ }
39429+ if ((1<<bit) & mp_ioapic_routing[ioapic].pin_programmed[idx]) {
39430+ Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n",
39431+ mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
39432+ return gsi_to_irq[gsi];
39433+ }
39434+
39435+ mp_ioapic_routing[ioapic].pin_programmed[idx] |= (1<<bit);
39436+
39437+ if (triggering == ACPI_LEVEL_SENSITIVE) {
39438+ /*
39439+ * For PCI devices assign IRQs in order, avoiding gaps
39440+ * due to unused I/O APIC pins.
39441+ */
39442+ int irq = gsi;
39443+ if (gsi < MAX_GSI_NUM) {
39444+ if (gsi > 15)
39445+ gsi = pci_irq++;
39446+ /*
39447+ * Don't assign IRQ used by ACPI SCI
39448+ */
39449+ if (gsi == acpi_fadt.sci_int)
39450+ gsi = pci_irq++;
39451+ gsi_to_irq[irq] = gsi;
39452+ } else {
39453+ printk(KERN_ERR "GSI %u is too high\n", gsi);
39454+ return gsi;
39455+ }
39456+ }
39457+
39458+ io_apic_set_pci_routing(ioapic, ioapic_pin, gsi,
39459+ triggering == ACPI_EDGE_SENSITIVE ? 0 : 1,
39460+ polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
39461+ return gsi;
39462+}
39463+
39464+#endif /*CONFIG_X86_IO_APIC*/
39465+#endif /*CONFIG_ACPI*/
39466diff -Nur linux-2.6.16.33-noxen/arch/x86_64/kernel/pci-swiotlb-xen.c linux-2.6.16.33/arch/x86_64/kernel/pci-swiotlb-xen.c
39467--- linux-2.6.16.33-noxen/arch/x86_64/kernel/pci-swiotlb-xen.c 1970-01-01 00:00:00.000000000 +0000
39468+++ linux-2.6.16.33/arch/x86_64/kernel/pci-swiotlb-xen.c 2007-01-08 15:00:45.000000000 +0000
39469@@ -0,0 +1,54 @@
39470+/* Glue code to lib/swiotlb.c */
39471+
39472+#include <linux/pci.h>
39473+#include <linux/cache.h>
39474+#include <linux/module.h>
39475+#include <asm/dma-mapping.h>
39476+#include <asm/proto.h>
39477+#include <asm/swiotlb.h>
39478+#include <asm/dma.h>
39479+
39480+#if 0
39481+int swiotlb __read_mostly;
39482+EXPORT_SYMBOL(swiotlb);
39483+#endif
39484+
39485+struct dma_mapping_ops swiotlb_dma_ops = {
39486+#if 0
39487+ .mapping_error = swiotlb_dma_mapping_error,
39488+ .alloc_coherent = swiotlb_alloc_coherent,
39489+ .free_coherent = swiotlb_free_coherent,
39490+ .map_single = swiotlb_map_single,
39491+ .unmap_single = swiotlb_unmap_single,
39492+ .sync_single_for_cpu = swiotlb_sync_single_for_cpu,
39493+ .sync_single_for_device = swiotlb_sync_single_for_device,
39494+ .sync_single_range_for_cpu = swiotlb_sync_single_range_for_cpu,
39495+ .sync_single_range_for_device = swiotlb_sync_single_range_for_device,
39496+ .sync_sg_for_cpu = swiotlb_sync_sg_for_cpu,
39497+ .sync_sg_for_device = swiotlb_sync_sg_for_device,
39498+ .map_sg = swiotlb_map_sg,
39499+ .unmap_sg = swiotlb_unmap_sg,
39500+ .dma_supported = NULL,
39501+#endif
39502+};
39503+
39504+void pci_swiotlb_init(void)
39505+{
39506+#if 0
39507+ /* don't initialize swiotlb if iommu=off (no_iommu=1) */
39508+ if (!iommu_aperture && !no_iommu &&
39509+ (end_pfn > MAX_DMA32_PFN || force_iommu))
39510+ swiotlb = 1;
39511+ if (swiotlb) {
39512+ printk(KERN_INFO "PCI-DMA: Using software bounce buffering for IO (SWIOTLB)\n");
39513+ swiotlb_init();
39514+ dma_ops = &swiotlb_dma_ops;
39515+ }
39516+#else
39517+ swiotlb_init();
39518+ if (swiotlb) {
39519+ printk(KERN_INFO "PCI-DMA: Using software bounce buffering for IO (SWIOTLB)\n");
39520+ dma_ops = &swiotlb_dma_ops;
39521+ }
39522+#endif
39523+}
39524diff -Nur linux-2.6.16.33-noxen/arch/x86_64/kernel/process-xen.c linux-2.6.16.33/arch/x86_64/kernel/process-xen.c
39525--- linux-2.6.16.33-noxen/arch/x86_64/kernel/process-xen.c 1970-01-01 00:00:00.000000000 +0000
39526+++ linux-2.6.16.33/arch/x86_64/kernel/process-xen.c 2007-01-08 15:00:45.000000000 +0000
39527@@ -0,0 +1,802 @@
39528+/*
39529+ * linux/arch/x86-64/kernel/process.c
39530+ *
39531+ * Copyright (C) 1995 Linus Torvalds
39532+ *
39533+ * Pentium III FXSR, SSE support
39534+ * Gareth Hughes <gareth@valinux.com>, May 2000
39535+ *
39536+ * X86-64 port
39537+ * Andi Kleen.
39538+ *
39539+ * CPU hotplug support - ashok.raj@intel.com
39540+ * $Id: process.c,v 1.38 2002/01/15 10:08:03 ak Exp $
39541+ *
39542+ * Jun Nakajima <jun.nakajima@intel.com>
39543+ * Modified for Xen
39544+ */
39545+
39546+/*
39547+ * This file handles the architecture-dependent parts of process handling..
39548+ */
39549+
39550+#include <stdarg.h>
39551+
39552+#include <linux/cpu.h>
39553+#include <linux/errno.h>
39554+#include <linux/sched.h>
39555+#include <linux/kernel.h>
39556+#include <linux/mm.h>
39557+#include <linux/elfcore.h>
39558+#include <linux/smp.h>
39559+#include <linux/slab.h>
39560+#include <linux/user.h>
39561+#include <linux/module.h>
39562+#include <linux/a.out.h>
39563+#include <linux/interrupt.h>
39564+#include <linux/delay.h>
39565+#include <linux/ptrace.h>
39566+#include <linux/utsname.h>
39567+#include <linux/random.h>
39568+#include <linux/kprobes.h>
39569+#include <linux/notifier.h>
39570+
39571+#include <asm/uaccess.h>
39572+#include <asm/pgtable.h>
39573+#include <asm/system.h>
39574+#include <asm/io.h>
39575+#include <asm/processor.h>
39576+#include <asm/i387.h>
39577+#include <asm/mmu_context.h>
39578+#include <asm/pda.h>
39579+#include <asm/prctl.h>
39580+#include <asm/kdebug.h>
39581+#include <xen/interface/dom0_ops.h>
39582+#include <xen/interface/physdev.h>
39583+#include <xen/interface/vcpu.h>
39584+#include <asm/desc.h>
39585+#include <asm/proto.h>
39586+#include <asm/hardirq.h>
39587+#include <asm/ia32.h>
39588+#include <asm/idle.h>
39589+
39590+#include <xen/cpu_hotplug.h>
39591+
39592+asmlinkage extern void ret_from_fork(void);
39593+
39594+unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;
39595+
39596+unsigned long boot_option_idle_override = 0;
39597+EXPORT_SYMBOL(boot_option_idle_override);
39598+
39599+/*
39600+ * Powermanagement idle function, if any..
39601+ */
39602+void (*pm_idle)(void);
39603+static DEFINE_PER_CPU(unsigned int, cpu_idle_state);
39604+
39605+static struct notifier_block *idle_notifier;
39606+static DEFINE_SPINLOCK(idle_notifier_lock);
39607+
39608+void idle_notifier_register(struct notifier_block *n)
39609+{
39610+ unsigned long flags;
39611+ spin_lock_irqsave(&idle_notifier_lock, flags);
39612+ notifier_chain_register(&idle_notifier, n);
39613+ spin_unlock_irqrestore(&idle_notifier_lock, flags);
39614+}
39615+EXPORT_SYMBOL_GPL(idle_notifier_register);
39616+
39617+void idle_notifier_unregister(struct notifier_block *n)
39618+{
39619+ unsigned long flags;
39620+ spin_lock_irqsave(&idle_notifier_lock, flags);
39621+ notifier_chain_unregister(&idle_notifier, n);
39622+ spin_unlock_irqrestore(&idle_notifier_lock, flags);
39623+}
39624+EXPORT_SYMBOL(idle_notifier_unregister);
39625+
39626+enum idle_state { CPU_IDLE, CPU_NOT_IDLE };
39627+static DEFINE_PER_CPU(enum idle_state, idle_state) = CPU_NOT_IDLE;
39628+
39629+void enter_idle(void)
39630+{
39631+ __get_cpu_var(idle_state) = CPU_IDLE;
39632+ notifier_call_chain(&idle_notifier, IDLE_START, NULL);
39633+}
39634+
39635+static void __exit_idle(void)
39636+{
39637+ __get_cpu_var(idle_state) = CPU_NOT_IDLE;
39638+ notifier_call_chain(&idle_notifier, IDLE_END, NULL);
39639+}
39640+
39641+/* Called from interrupts to signify idle end */
39642+void exit_idle(void)
39643+{
39644+ if (current->pid | read_pda(irqcount))
39645+ return;
39646+ __exit_idle();
39647+}
39648+
39649+/* XXX XEN doesn't use default_idle(), poll_idle(). Use xen_idle() instead. */
39650+void xen_idle(void)
39651+{
39652+ local_irq_disable();
39653+
39654+ if (need_resched())
39655+ local_irq_enable();
39656+ else {
39657+ clear_thread_flag(TIF_POLLING_NRFLAG);
39658+ smp_mb__after_clear_bit();
39659+ safe_halt();
39660+ set_thread_flag(TIF_POLLING_NRFLAG);
39661+ }
39662+}
39663+
39664+#ifdef CONFIG_HOTPLUG_CPU
39665+static inline void play_dead(void)
39666+{
39667+ idle_task_exit();
39668+ local_irq_disable();
39669+ cpu_clear(smp_processor_id(), cpu_initialized);
39670+ preempt_enable_no_resched();
39671+ HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL);
39672+ cpu_bringup();
39673+}
39674+#else
39675+static inline void play_dead(void)
39676+{
39677+ BUG();
39678+}
39679+#endif /* CONFIG_HOTPLUG_CPU */
39680+
39681+/*
39682+ * The idle thread. There's no useful work to be
39683+ * done, so just try to conserve power and have a
39684+ * low exit latency (ie sit in a loop waiting for
39685+ * somebody to say that they'd like to reschedule)
39686+ */
39687+void cpu_idle (void)
39688+{
39689+ set_thread_flag(TIF_POLLING_NRFLAG);
39690+
39691+ /* endless idle loop with no priority at all */
39692+ while (1) {
39693+ while (!need_resched()) {
39694+ if (__get_cpu_var(cpu_idle_state))
39695+ __get_cpu_var(cpu_idle_state) = 0;
39696+ rmb();
39697+
39698+ if (cpu_is_offline(smp_processor_id()))
39699+ play_dead();
39700+ enter_idle();
39701+ xen_idle();
39702+ __exit_idle();
39703+ }
39704+
39705+ preempt_enable_no_resched();
39706+ schedule();
39707+ preempt_disable();
39708+ }
39709+}
39710+
39711+void cpu_idle_wait(void)
39712+{
39713+ unsigned int cpu, this_cpu = get_cpu();
39714+ cpumask_t map;
39715+
39716+ set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
39717+ put_cpu();
39718+
39719+ cpus_clear(map);
39720+ for_each_online_cpu(cpu) {
39721+ per_cpu(cpu_idle_state, cpu) = 1;
39722+ cpu_set(cpu, map);
39723+ }
39724+
39725+ __get_cpu_var(cpu_idle_state) = 0;
39726+
39727+ wmb();
39728+ do {
39729+ ssleep(1);
39730+ for_each_online_cpu(cpu) {
39731+ if (cpu_isset(cpu, map) &&
39732+ !per_cpu(cpu_idle_state, cpu))
39733+ cpu_clear(cpu, map);
39734+ }
39735+ cpus_and(map, map, cpu_online_map);
39736+ } while (!cpus_empty(map));
39737+}
39738+EXPORT_SYMBOL_GPL(cpu_idle_wait);
39739+
39740+/* XXX XEN doesn't use mwait_idle(), select_idle_routine(), idle_setup(). */
39741+/* Always use xen_idle() instead. */
39742+void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) {}
39743+
39744+/* Prints also some state that isn't saved in the pt_regs */
39745+void __show_regs(struct pt_regs * regs)
39746+{
39747+ unsigned long fs, gs, shadowgs;
39748+ unsigned int fsindex,gsindex;
39749+ unsigned int ds,cs,es;
39750+
39751+ printk("\n");
39752+ print_modules();
39753+ printk("Pid: %d, comm: %.20s %s %s %.*s\n",
39754+ current->pid, current->comm, print_tainted(),
39755+ system_utsname.release,
39756+ (int)strcspn(system_utsname.version, " "),
39757+ system_utsname.version);
39758+ printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip);
39759+ printk_address(regs->rip);
39760+ printk("\nRSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->rsp,
39761+ regs->eflags);
39762+ printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
39763+ regs->rax, regs->rbx, regs->rcx);
39764+ printk("RDX: %016lx RSI: %016lx RDI: %016lx\n",
39765+ regs->rdx, regs->rsi, regs->rdi);
39766+ printk("RBP: %016lx R08: %016lx R09: %016lx\n",
39767+ regs->rbp, regs->r8, regs->r9);
39768+ printk("R10: %016lx R11: %016lx R12: %016lx\n",
39769+ regs->r10, regs->r11, regs->r12);
39770+ printk("R13: %016lx R14: %016lx R15: %016lx\n",
39771+ regs->r13, regs->r14, regs->r15);
39772+
39773+ asm("mov %%ds,%0" : "=r" (ds));
39774+ asm("mov %%cs,%0" : "=r" (cs));
39775+ asm("mov %%es,%0" : "=r" (es));
39776+ asm("mov %%fs,%0" : "=r" (fsindex));
39777+ asm("mov %%gs,%0" : "=r" (gsindex));
39778+
39779+ rdmsrl(MSR_FS_BASE, fs);
39780+ rdmsrl(MSR_GS_BASE, gs);
39781+ rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
39782+
39783+ printk("FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
39784+ fs,fsindex,gs,gsindex,shadowgs);
39785+ printk("CS: %04x DS: %04x ES: %04x\n", cs, ds, es);
39786+
39787+}
39788+
39789+void show_regs(struct pt_regs *regs)
39790+{
39791+ printk("CPU %d:", smp_processor_id());
39792+ __show_regs(regs);
39793+ show_trace(&regs->rsp);
39794+}
39795+
39796+/*
39797+ * Free current thread data structures etc..
39798+ */
39799+void exit_thread(void)
39800+{
39801+ struct task_struct *me = current;
39802+ struct thread_struct *t = &me->thread;
39803+
39804+ /*
39805+ * Remove function-return probe instances associated with this task
39806+ * and put them back on the free list. Do not insert an exit probe for
39807+ * this function, it will be disabled by kprobe_flush_task if you do.
39808+ */
39809+ kprobe_flush_task(me);
39810+
39811+ if (me->thread.io_bitmap_ptr) {
39812+#ifndef CONFIG_X86_NO_TSS
39813+ struct tss_struct *tss = &per_cpu(init_tss, get_cpu());
39814+#endif
39815+#ifdef CONFIG_XEN
39816+ struct physdev_set_iobitmap iobmp_op = { 0 };
39817+#endif
39818+
39819+ kfree(t->io_bitmap_ptr);
39820+ t->io_bitmap_ptr = NULL;
39821+ /*
39822+ * Careful, clear this in the TSS too:
39823+ */
39824+#ifndef CONFIG_X86_NO_TSS
39825+ memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
39826+ put_cpu();
39827+#endif
39828+#ifdef CONFIG_XEN
39829+ HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap, &iobmp_op);
39830+#endif
39831+ t->io_bitmap_max = 0;
39832+ }
39833+}
39834+
39835+void load_gs_index(unsigned gs)
39836+{
39837+ HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, gs);
39838+}
39839+
39840+void flush_thread(void)
39841+{
39842+ struct task_struct *tsk = current;
39843+ struct thread_info *t = current_thread_info();
39844+
39845+ if (t->flags & _TIF_ABI_PENDING)
39846+ t->flags ^= (_TIF_ABI_PENDING | _TIF_IA32);
39847+
39848+ tsk->thread.debugreg0 = 0;
39849+ tsk->thread.debugreg1 = 0;
39850+ tsk->thread.debugreg2 = 0;
39851+ tsk->thread.debugreg3 = 0;
39852+ tsk->thread.debugreg6 = 0;
39853+ tsk->thread.debugreg7 = 0;
39854+ memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
39855+ /*
39856+ * Forget coprocessor state..
39857+ */
39858+ clear_fpu(tsk);
39859+ clear_used_math();
39860+}
39861+
39862+void release_thread(struct task_struct *dead_task)
39863+{
39864+ if (dead_task->mm) {
39865+ if (dead_task->mm->context.size) {
39866+ printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
39867+ dead_task->comm,
39868+ dead_task->mm->context.ldt,
39869+ dead_task->mm->context.size);
39870+ BUG();
39871+ }
39872+ }
39873+}
39874+
39875+static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
39876+{
39877+ struct user_desc ud = {
39878+ .base_addr = addr,
39879+ .limit = 0xfffff,
39880+ .seg_32bit = 1,
39881+ .limit_in_pages = 1,
39882+ .useable = 1,
39883+ };
39884+ struct n_desc_struct *desc = (void *)t->thread.tls_array;
39885+ desc += tls;
39886+ desc->a = LDT_entry_a(&ud);
39887+ desc->b = LDT_entry_b(&ud);
39888+}
39889+
39890+static inline u32 read_32bit_tls(struct task_struct *t, int tls)
39891+{
39892+ struct desc_struct *desc = (void *)t->thread.tls_array;
39893+ desc += tls;
39894+ return desc->base0 |
39895+ (((u32)desc->base1) << 16) |
39896+ (((u32)desc->base2) << 24);
39897+}
39898+
39899+/*
39900+ * This gets called before we allocate a new thread and copy
39901+ * the current task into it.
39902+ */
39903+void prepare_to_copy(struct task_struct *tsk)
39904+{
39905+ unlazy_fpu(tsk);
39906+}
39907+
39908+int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp,
39909+ unsigned long unused,
39910+ struct task_struct * p, struct pt_regs * regs)
39911+{
39912+ int err;
39913+ struct pt_regs * childregs;
39914+ struct task_struct *me = current;
39915+
39916+ childregs = ((struct pt_regs *)
39917+ (THREAD_SIZE + task_stack_page(p))) - 1;
39918+ *childregs = *regs;
39919+
39920+ childregs->rax = 0;
39921+ childregs->rsp = rsp;
39922+ if (rsp == ~0UL)
39923+ childregs->rsp = (unsigned long)childregs;
39924+
39925+ p->thread.rsp = (unsigned long) childregs;
39926+ p->thread.rsp0 = (unsigned long) (childregs+1);
39927+ p->thread.userrsp = me->thread.userrsp;
39928+
39929+ set_tsk_thread_flag(p, TIF_FORK);
39930+
39931+ p->thread.fs = me->thread.fs;
39932+ p->thread.gs = me->thread.gs;
39933+
39934+ asm("mov %%gs,%0" : "=m" (p->thread.gsindex));
39935+ asm("mov %%fs,%0" : "=m" (p->thread.fsindex));
39936+ asm("mov %%es,%0" : "=m" (p->thread.es));
39937+ asm("mov %%ds,%0" : "=m" (p->thread.ds));
39938+
39939+ if (unlikely(me->thread.io_bitmap_ptr != NULL)) {
39940+ p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
39941+ if (!p->thread.io_bitmap_ptr) {
39942+ p->thread.io_bitmap_max = 0;
39943+ return -ENOMEM;
39944+ }
39945+ memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
39946+ IO_BITMAP_BYTES);
39947+ }
39948+
39949+ /*
39950+ * Set a new TLS for the child thread?
39951+ */
39952+ if (clone_flags & CLONE_SETTLS) {
39953+#ifdef CONFIG_IA32_EMULATION
39954+ if (test_thread_flag(TIF_IA32))
39955+ err = ia32_child_tls(p, childregs);
39956+ else
39957+#endif
39958+ err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
39959+ if (err)
39960+ goto out;
39961+ }
39962+ p->thread.iopl = current->thread.iopl;
39963+
39964+ err = 0;
39965+out:
39966+ if (err && p->thread.io_bitmap_ptr) {
39967+ kfree(p->thread.io_bitmap_ptr);
39968+ p->thread.io_bitmap_max = 0;
39969+ }
39970+ return err;
39971+}
39972+
39973+static inline void __save_init_fpu( struct task_struct *tsk )
39974+{
39975+ asm volatile( "rex64 ; fxsave %0 ; fnclex"
39976+ : "=m" (tsk->thread.i387.fxsave));
39977+ tsk->thread_info->status &= ~TS_USEDFPU;
39978+}
39979+
39980+/*
39981+ * switch_to(x,y) should switch tasks from x to y.
39982+ *
39983+ * This could still be optimized:
39984+ * - fold all the options into a flag word and test it with a single test.
39985+ * - could test fs/gs bitsliced
39986+ *
39987+ * Kprobes not supported here. Set the probe on schedule instead.
39988+ */
39989+__kprobes struct task_struct *
39990+__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
39991+{
39992+ struct thread_struct *prev = &prev_p->thread,
39993+ *next = &next_p->thread;
39994+ int cpu = smp_processor_id();
39995+#ifndef CONFIG_X86_NO_TSS
39996+ struct tss_struct *tss = &per_cpu(init_tss, cpu);
39997+#endif
39998+ struct physdev_set_iopl iopl_op;
39999+ struct physdev_set_iobitmap iobmp_op;
40000+ multicall_entry_t _mcl[8], *mcl = _mcl;
40001+
40002+ /*
40003+ * This is basically '__unlazy_fpu', except that we queue a
40004+ * multicall to indicate FPU task switch, rather than
40005+ * synchronously trapping to Xen.
40006+ * This must be here to ensure both math_state_restore() and
40007+ * kernel_fpu_begin() work consistently.
40008+ * The AMD workaround requires it to be after DS reload, or
40009+ * after DS has been cleared, which we do in __prepare_arch_switch.
40010+ */
40011+ if (prev_p->thread_info->status & TS_USEDFPU) {
40012+ __save_init_fpu(prev_p); /* _not_ save_init_fpu() */
40013+ mcl->op = __HYPERVISOR_fpu_taskswitch;
40014+ mcl->args[0] = 1;
40015+ mcl++;
40016+ }
40017+
40018+ /*
40019+ * Reload esp0, LDT and the page table pointer:
40020+ */
40021+ mcl->op = __HYPERVISOR_stack_switch;
40022+ mcl->args[0] = __KERNEL_DS;
40023+ mcl->args[1] = next->rsp0;
40024+ mcl++;
40025+
40026+ /*
40027+ * Load the per-thread Thread-Local Storage descriptor.
40028+ * This is load_TLS(next, cpu) with multicalls.
40029+ */
40030+#define C(i) do { \
40031+ if (unlikely(next->tls_array[i] != prev->tls_array[i])) { \
40032+ mcl->op = __HYPERVISOR_update_descriptor; \
40033+ mcl->args[0] = virt_to_machine( \
40034+ &cpu_gdt(cpu)[GDT_ENTRY_TLS_MIN + i]); \
40035+ mcl->args[1] = next->tls_array[i]; \
40036+ mcl++; \
40037+ } \
40038+} while (0)
40039+ C(0); C(1); C(2);
40040+#undef C
40041+
40042+ if (unlikely(prev->iopl != next->iopl)) {
40043+ iopl_op.iopl = (next->iopl == 0) ? 1 : next->iopl;
40044+ mcl->op = __HYPERVISOR_physdev_op;
40045+ mcl->args[0] = PHYSDEVOP_set_iopl;
40046+ mcl->args[1] = (unsigned long)&iopl_op;
40047+ mcl++;
40048+ }
40049+
40050+ if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr)) {
40051+ iobmp_op.bitmap = (char *)next->io_bitmap_ptr;
40052+ iobmp_op.nr_ports = next->io_bitmap_ptr ? IO_BITMAP_BITS : 0;
40053+ mcl->op = __HYPERVISOR_physdev_op;
40054+ mcl->args[0] = PHYSDEVOP_set_iobitmap;
40055+ mcl->args[1] = (unsigned long)&iobmp_op;
40056+ mcl++;
40057+ }
40058+
40059+ (void)HYPERVISOR_multicall(_mcl, mcl - _mcl);
40060+ /*
40061+ * Switch DS and ES.
40062+ * This won't pick up thread selector changes, but I guess that is ok.
40063+ */
40064+ if (unlikely(next->es))
40065+ loadsegment(es, next->es);
40066+
40067+ if (unlikely(next->ds))
40068+ loadsegment(ds, next->ds);
40069+
40070+ /*
40071+ * Switch FS and GS.
40072+ */
40073+ if (unlikely(next->fsindex))
40074+ loadsegment(fs, next->fsindex);
40075+
40076+ if (next->fs)
40077+ HYPERVISOR_set_segment_base(SEGBASE_FS, next->fs);
40078+
40079+ if (unlikely(next->gsindex))
40080+ load_gs_index(next->gsindex);
40081+
40082+ if (next->gs)
40083+ HYPERVISOR_set_segment_base(SEGBASE_GS_USER, next->gs);
40084+
40085+ /*
40086+ * Switch the PDA context.
40087+ */
40088+ prev->userrsp = read_pda(oldrsp);
40089+ write_pda(oldrsp, next->userrsp);
40090+ write_pda(pcurrent, next_p);
40091+ write_pda(kernelstack,
40092+ task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET);
40093+
40094+ /*
40095+ * Now maybe reload the debug registers
40096+ */
40097+ if (unlikely(next->debugreg7)) {
40098+ set_debugreg(next->debugreg0, 0);
40099+ set_debugreg(next->debugreg1, 1);
40100+ set_debugreg(next->debugreg2, 2);
40101+ set_debugreg(next->debugreg3, 3);
40102+ /* no 4 and 5 */
40103+ set_debugreg(next->debugreg6, 6);
40104+ set_debugreg(next->debugreg7, 7);
40105+ }
40106+
40107+ return prev_p;
40108+}
40109+
40110+/*
40111+ * sys_execve() executes a new program.
40112+ */
40113+asmlinkage
40114+long sys_execve(char __user *name, char __user * __user *argv,
40115+ char __user * __user *envp, struct pt_regs regs)
40116+{
40117+ long error;
40118+ char * filename;
40119+
40120+ filename = getname(name);
40121+ error = PTR_ERR(filename);
40122+ if (IS_ERR(filename))
40123+ return error;
40124+ error = do_execve(filename, argv, envp, &regs);
40125+ if (error == 0) {
40126+ task_lock(current);
40127+ current->ptrace &= ~PT_DTRACE;
40128+ task_unlock(current);
40129+ }
40130+ putname(filename);
40131+ return error;
40132+}
40133+
40134+void set_personality_64bit(void)
40135+{
40136+ /* inherit personality from parent */
40137+
40138+ /* Make sure to be in 64bit mode */
40139+ clear_thread_flag(TIF_IA32);
40140+
40141+ /* TBD: overwrites user setup. Should have two bits.
40142+ But 64bit processes have always behaved this way,
40143+ so it's not too bad. The main problem is just that
40144+ 32bit children are affected again. */
40145+ current->personality &= ~READ_IMPLIES_EXEC;
40146+}
40147+
40148+asmlinkage long sys_fork(struct pt_regs *regs)
40149+{
40150+ return do_fork(SIGCHLD, regs->rsp, regs, 0, NULL, NULL);
40151+}
40152+
40153+asmlinkage long
40154+sys_clone(unsigned long clone_flags, unsigned long newsp,
40155+ void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
40156+{
40157+ if (!newsp)
40158+ newsp = regs->rsp;
40159+ return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
40160+}
40161+
40162+/*
40163+ * This is trivial, and on the face of it looks like it
40164+ * could equally well be done in user mode.
40165+ *
40166+ * Not so, for quite unobvious reasons - register pressure.
40167+ * In user mode vfork() cannot have a stack frame, and if
40168+ * done by calling the "clone()" system call directly, you
40169+ * do not have enough call-clobbered registers to hold all
40170+ * the information you need.
40171+ */
40172+asmlinkage long sys_vfork(struct pt_regs *regs)
40173+{
40174+ return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->rsp, regs, 0,
40175+ NULL, NULL);
40176+}
40177+
40178+unsigned long get_wchan(struct task_struct *p)
40179+{
40180+ unsigned long stack;
40181+ u64 fp,rip;
40182+ int count = 0;
40183+
40184+ if (!p || p == current || p->state==TASK_RUNNING)
40185+ return 0;
40186+ stack = (unsigned long)task_stack_page(p);
40187+ if (p->thread.rsp < stack || p->thread.rsp > stack+THREAD_SIZE)
40188+ return 0;
40189+ fp = *(u64 *)(p->thread.rsp);
40190+ do {
40191+ if (fp < (unsigned long)stack ||
40192+ fp > (unsigned long)stack+THREAD_SIZE)
40193+ return 0;
40194+ rip = *(u64 *)(fp+8);
40195+ if (!in_sched_functions(rip))
40196+ return rip;
40197+ fp = *(u64 *)fp;
40198+ } while (count++ < 16);
40199+ return 0;
40200+}
40201+
40202+long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
40203+{
40204+ int ret = 0;
40205+ int doit = task == current;
40206+ int cpu;
40207+
40208+ switch (code) {
40209+ case ARCH_SET_GS:
40210+ if (addr >= TASK_SIZE_OF(task))
40211+ return -EPERM;
40212+ cpu = get_cpu();
40213+ /* handle small bases via the GDT because that's faster to
40214+ switch. */
40215+ if (addr <= 0xffffffff) {
40216+ set_32bit_tls(task, GS_TLS, addr);
40217+ if (doit) {
40218+ load_TLS(&task->thread, cpu);
40219+ load_gs_index(GS_TLS_SEL);
40220+ }
40221+ task->thread.gsindex = GS_TLS_SEL;
40222+ task->thread.gs = 0;
40223+ } else {
40224+ task->thread.gsindex = 0;
40225+ task->thread.gs = addr;
40226+ if (doit) {
40227+ load_gs_index(0);
40228+ ret = HYPERVISOR_set_segment_base(
40229+ SEGBASE_GS_USER, addr);
40230+ }
40231+ }
40232+ put_cpu();
40233+ break;
40234+ case ARCH_SET_FS:
40235+ /* Not strictly needed for fs, but do it for symmetry
40236+ with gs */
40237+ if (addr >= TASK_SIZE_OF(task))
40238+ return -EPERM;
40239+ cpu = get_cpu();
40240+ /* handle small bases via the GDT because that's faster to
40241+ switch. */
40242+ if (addr <= 0xffffffff) {
40243+ set_32bit_tls(task, FS_TLS, addr);
40244+ if (doit) {
40245+ load_TLS(&task->thread, cpu);
40246+ asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL));
40247+ }
40248+ task->thread.fsindex = FS_TLS_SEL;
40249+ task->thread.fs = 0;
40250+ } else {
40251+ task->thread.fsindex = 0;
40252+ task->thread.fs = addr;
40253+ if (doit) {
40254+ /* set the selector to 0 to not confuse
40255+ __switch_to */
40256+ asm volatile("movl %0,%%fs" :: "r" (0));
40257+ ret = HYPERVISOR_set_segment_base(SEGBASE_FS,
40258+ addr);
40259+ }
40260+ }
40261+ put_cpu();
40262+ break;
40263+ case ARCH_GET_FS: {
40264+ unsigned long base;
40265+ if (task->thread.fsindex == FS_TLS_SEL)
40266+ base = read_32bit_tls(task, FS_TLS);
40267+ else if (doit)
40268+ rdmsrl(MSR_FS_BASE, base);
40269+ else
40270+ base = task->thread.fs;
40271+ ret = put_user(base, (unsigned long __user *)addr);
40272+ break;
40273+ }
40274+ case ARCH_GET_GS: {
40275+ unsigned long base;
40276+ if (task->thread.gsindex == GS_TLS_SEL)
40277+ base = read_32bit_tls(task, GS_TLS);
40278+ else if (doit)
40279+ rdmsrl(MSR_KERNEL_GS_BASE, base);
40280+ else
40281+ base = task->thread.gs;
40282+ ret = put_user(base, (unsigned long __user *)addr);
40283+ break;
40284+ }
40285+
40286+ default:
40287+ ret = -EINVAL;
40288+ break;
40289+ }
40290+
40291+ return ret;
40292+}
40293+
40294+long sys_arch_prctl(int code, unsigned long addr)
40295+{
40296+ return do_arch_prctl(current, code, addr);
40297+}
40298+
40299+/*
40300+ * Capture the user space registers if the task is not running (in user space)
40301+ */
40302+int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs)
40303+{
40304+ struct pt_regs *pp, ptregs;
40305+
40306+ pp = task_pt_regs(tsk);
40307+
40308+ ptregs = *pp;
40309+ ptregs.cs &= 0xffff;
40310+ ptregs.ss &= 0xffff;
40311+
40312+ elf_core_copy_regs(regs, &ptregs);
40313+
40314+ boot_option_idle_override = 1;
40315+ return 1;
40316+}
40317+
40318+unsigned long arch_align_stack(unsigned long sp)
40319+{
40320+ if (randomize_va_space)
40321+ sp -= get_random_int() % 8192;
40322+ return sp & ~0xf;
40323+}
40324+
40325+#ifndef CONFIG_SMP
40326+void _restore_vcpu(void)
40327+{
40328+}
40329+#endif
40330diff -Nur linux-2.6.16.33-noxen/arch/x86_64/kernel/relocate_kernel.S linux-2.6.16.33/arch/x86_64/kernel/relocate_kernel.S
40331--- linux-2.6.16.33-noxen/arch/x86_64/kernel/relocate_kernel.S 2006-11-22 18:06:31.000000000 +0000
40332+++ linux-2.6.16.33/arch/x86_64/kernel/relocate_kernel.S 2007-05-23 21:00:01.000000000 +0000
40333@@ -7,31 +7,195 @@
40334 */
40335
40336 #include <linux/linkage.h>
40337+#include <asm/page.h>
40338+#include <asm/kexec.h>
40339
40340- /*
40341- * Must be relocatable PIC code callable as a C function, that once
40342- * it starts can not use the previous processes stack.
40343- */
40344- .globl relocate_new_kernel
40345+/*
40346+ * Must be relocatable PIC code callable as a C function
40347+ */
40348+
40349+#define PTR(x) (x << 3)
40350+#define PAGE_ALIGNED (1 << PAGE_SHIFT)
40351+#define PAGE_ATTR 0x63 /* _PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY */
40352+
40353+ .text
40354+ .align PAGE_ALIGNED
40355 .code64
40356+ .globl relocate_kernel
40357+relocate_kernel:
40358+ /* %rdi indirection_page
40359+ * %rsi page_list
40360+ * %rdx start address
40361+ */
40362+
40363+ /* map the control page at its virtual address */
40364+
40365+ movq $0x0000ff8000000000, %r10 /* mask */
40366+ mov $(39 - 3), %cl /* bits to shift */
40367+ movq PTR(VA_CONTROL_PAGE)(%rsi), %r11 /* address to map */
40368+
40369+ movq %r11, %r9
40370+ andq %r10, %r9
40371+ shrq %cl, %r9
40372+
40373+ movq PTR(VA_PGD)(%rsi), %r8
40374+ addq %r8, %r9
40375+ movq PTR(PA_PUD_0)(%rsi), %r8
40376+ orq $PAGE_ATTR, %r8
40377+ movq %r8, (%r9)
40378+
40379+ shrq $9, %r10
40380+ sub $9, %cl
40381+
40382+ movq %r11, %r9
40383+ andq %r10, %r9
40384+ shrq %cl, %r9
40385+
40386+ movq PTR(VA_PUD_0)(%rsi), %r8
40387+ addq %r8, %r9
40388+ movq PTR(PA_PMD_0)(%rsi), %r8
40389+ orq $PAGE_ATTR, %r8
40390+ movq %r8, (%r9)
40391+
40392+ shrq $9, %r10
40393+ sub $9, %cl
40394+
40395+ movq %r11, %r9
40396+ andq %r10, %r9
40397+ shrq %cl, %r9
40398+
40399+ movq PTR(VA_PMD_0)(%rsi), %r8
40400+ addq %r8, %r9
40401+ movq PTR(PA_PTE_0)(%rsi), %r8
40402+ orq $PAGE_ATTR, %r8
40403+ movq %r8, (%r9)
40404+
40405+ shrq $9, %r10
40406+ sub $9, %cl
40407+
40408+ movq %r11, %r9
40409+ andq %r10, %r9
40410+ shrq %cl, %r9
40411+
40412+ movq PTR(VA_PTE_0)(%rsi), %r8
40413+ addq %r8, %r9
40414+ movq PTR(PA_CONTROL_PAGE)(%rsi), %r8
40415+ orq $PAGE_ATTR, %r8
40416+ movq %r8, (%r9)
40417+
40418+ /* identity map the control page at its physical address */
40419+
40420+ movq $0x0000ff8000000000, %r10 /* mask */
40421+ mov $(39 - 3), %cl /* bits to shift */
40422+ movq PTR(PA_CONTROL_PAGE)(%rsi), %r11 /* address to map */
40423+
40424+ movq %r11, %r9
40425+ andq %r10, %r9
40426+ shrq %cl, %r9
40427+
40428+ movq PTR(VA_PGD)(%rsi), %r8
40429+ addq %r8, %r9
40430+ movq PTR(PA_PUD_1)(%rsi), %r8
40431+ orq $PAGE_ATTR, %r8
40432+ movq %r8, (%r9)
40433+
40434+ shrq $9, %r10
40435+ sub $9, %cl
40436+
40437+ movq %r11, %r9
40438+ andq %r10, %r9
40439+ shrq %cl, %r9
40440+
40441+ movq PTR(VA_PUD_1)(%rsi), %r8
40442+ addq %r8, %r9
40443+ movq PTR(PA_PMD_1)(%rsi), %r8
40444+ orq $PAGE_ATTR, %r8
40445+ movq %r8, (%r9)
40446+
40447+ shrq $9, %r10
40448+ sub $9, %cl
40449+
40450+ movq %r11, %r9
40451+ andq %r10, %r9
40452+ shrq %cl, %r9
40453+
40454+ movq PTR(VA_PMD_1)(%rsi), %r8
40455+ addq %r8, %r9
40456+ movq PTR(PA_PTE_1)(%rsi), %r8
40457+ orq $PAGE_ATTR, %r8
40458+ movq %r8, (%r9)
40459+
40460+ shrq $9, %r10
40461+ sub $9, %cl
40462+
40463+ movq %r11, %r9
40464+ andq %r10, %r9
40465+ shrq %cl, %r9
40466+
40467+ movq PTR(VA_PTE_1)(%rsi), %r8
40468+ addq %r8, %r9
40469+ movq PTR(PA_CONTROL_PAGE)(%rsi), %r8
40470+ orq $PAGE_ATTR, %r8
40471+ movq %r8, (%r9)
40472+
40473 relocate_new_kernel:
40474- /* %rdi page_list
40475- * %rsi reboot_code_buffer
40476+ /* %rdi indirection_page
40477+ * %rsi page_list
40478 * %rdx start address
40479- * %rcx page_table
40480- * %r8 arg5
40481- * %r9 arg6
40482 */
40483
40484 /* zero out flags, and disable interrupts */
40485 pushq $0
40486 popfq
40487
40488- /* set a new stack at the bottom of our page... */
40489- lea 4096(%rsi), %rsp
40490-
40491- /* store the parameters back on the stack */
40492- pushq %rdx /* store the start address */
40493+ /* get physical address of control page now */
40494+ /* this is impossible after page table switch */
40495+ movq PTR(PA_CONTROL_PAGE)(%rsi), %r8
40496+
40497+ /* get physical address of page table now too */
40498+ movq PTR(PA_TABLE_PAGE)(%rsi), %rcx
40499+
40500+ /* switch to new set of page tables */
40501+ movq PTR(PA_PGD)(%rsi), %r9
40502+ movq %r9, %cr3
40503+
40504+ /* setup idt */
40505+ movq %r8, %rax
40506+ addq $(idt_80 - relocate_kernel), %rax
40507+ lidtq (%rax)
40508+
40509+ /* setup gdt */
40510+ movq %r8, %rax
40511+ addq $(gdt - relocate_kernel), %rax
40512+ movq %r8, %r9
40513+ addq $((gdt_80 - relocate_kernel) + 2), %r9
40514+ movq %rax, (%r9)
40515+
40516+ movq %r8, %rax
40517+ addq $(gdt_80 - relocate_kernel), %rax
40518+ lgdtq (%rax)
40519+
40520+ /* setup data segment registers */
40521+ xorl %eax, %eax
40522+ movl %eax, %ds
40523+ movl %eax, %es
40524+ movl %eax, %fs
40525+ movl %eax, %gs
40526+ movl %eax, %ss
40527+
40528+ /* setup a new stack at the end of the physical control page */
40529+ lea 4096(%r8), %rsp
40530+
40531+ /* load new code segment and jump to identity mapped page */
40532+ movq %r8, %rax
40533+ addq $(identity_mapped - relocate_kernel), %rax
40534+ pushq $(gdt_cs - gdt)
40535+ pushq %rax
40536+ lretq
40537+
40538+identity_mapped:
40539+ /* store the start address on the stack */
40540+ pushq %rdx
40541
40542 /* Set cr0 to a known state:
40543 * 31 1 == Paging enabled
40544@@ -134,10 +298,19 @@
40545 xorq %r13, %r13
40546 xorq %r14, %r14
40547 xorq %r15, %r15
40548-
40549 ret
40550-relocate_new_kernel_end:
40551
40552- .globl relocate_new_kernel_size
40553-relocate_new_kernel_size:
40554- .quad relocate_new_kernel_end - relocate_new_kernel
40555+ .align 16
40556+gdt:
40557+ .quad 0x0000000000000000 /* NULL descriptor */
40558+gdt_cs:
40559+ .quad 0x00af9a000000ffff
40560+gdt_end:
40561+
40562+gdt_80:
40563+ .word gdt_end - gdt - 1 /* limit */
40564+ .quad 0 /* base - filled in by code above */
40565+
40566+idt_80:
40567+ .word 0 /* limit */
40568+ .quad 0 /* base */
40569diff -Nur linux-2.6.16.33-noxen/arch/x86_64/kernel/setup-xen.c linux-2.6.16.33/arch/x86_64/kernel/setup-xen.c
40570--- linux-2.6.16.33-noxen/arch/x86_64/kernel/setup-xen.c 1970-01-01 00:00:00.000000000 +0000
40571+++ linux-2.6.16.33/arch/x86_64/kernel/setup-xen.c 2007-01-08 15:00:45.000000000 +0000
40572@@ -0,0 +1,1672 @@
40573+/*
40574+ * linux/arch/x86-64/kernel/setup.c
40575+ *
40576+ * Copyright (C) 1995 Linus Torvalds
40577+ *
40578+ * Nov 2001 Dave Jones <davej@suse.de>
40579+ * Forked from i386 setup code.
40580+ *
40581+ * $Id$
40582+ */
40583+
40584+/*
40585+ * This file handles the architecture-dependent parts of initialization
40586+ */
40587+
40588+#include <linux/errno.h>
40589+#include <linux/sched.h>
40590+#include <linux/kernel.h>
40591+#include <linux/mm.h>
40592+#include <linux/stddef.h>
40593+#include <linux/unistd.h>
40594+#include <linux/ptrace.h>
40595+#include <linux/slab.h>
40596+#include <linux/user.h>
40597+#include <linux/a.out.h>
40598+#include <linux/tty.h>
40599+#include <linux/ioport.h>
40600+#include <linux/delay.h>
40601+#include <linux/config.h>
40602+#include <linux/init.h>
40603+#include <linux/initrd.h>
40604+#include <linux/highmem.h>
40605+#include <linux/bootmem.h>
40606+#include <linux/module.h>
40607+#include <asm/processor.h>
40608+#include <linux/console.h>
40609+#include <linux/seq_file.h>
40610+#include <linux/crash_dump.h>
40611+#include <linux/root_dev.h>
40612+#include <linux/pci.h>
40613+#include <linux/acpi.h>
40614+#include <linux/kallsyms.h>
40615+#include <linux/edd.h>
40616+#include <linux/mmzone.h>
40617+#include <linux/kexec.h>
40618+#include <linux/cpufreq.h>
40619+#include <linux/dmi.h>
40620+#include <linux/dma-mapping.h>
40621+
40622+#include <asm/mtrr.h>
40623+#include <asm/uaccess.h>
40624+#include <asm/system.h>
40625+#include <asm/io.h>
40626+#include <asm/smp.h>
40627+#include <asm/msr.h>
40628+#include <asm/desc.h>
40629+#include <video/edid.h>
40630+#include <asm/e820.h>
40631+#include <asm/dma.h>
40632+#include <asm/mpspec.h>
40633+#include <asm/mmu_context.h>
40634+#include <asm/bootsetup.h>
40635+#include <asm/proto.h>
40636+#include <asm/setup.h>
40637+#include <asm/mach_apic.h>
40638+#include <asm/numa.h>
40639+#include <asm/swiotlb.h>
40640+#include <asm/sections.h>
40641+#include <asm/gart-mapping.h>
40642+#ifdef CONFIG_XEN
40643+#include <linux/percpu.h>
40644+#include <xen/interface/physdev.h>
40645+#include "setup_arch_pre.h"
40646+#include <asm/hypervisor.h>
40647+#include <xen/interface/nmi.h>
40648+#include <xen/features.h>
40649+#include <xen/xencons.h>
40650+#define PFN_UP(x) (((x) + PAGE_SIZE-1) >> PAGE_SHIFT)
40651+#define PFN_PHYS(x) ((x) << PAGE_SHIFT)
40652+#include <asm/mach-xen/setup_arch_post.h>
40653+#include <xen/interface/memory.h>
40654+
40655+#ifdef CONFIG_XEN
40656+#include <xen/interface/kexec.h>
40657+#endif
40658+
40659+extern unsigned long start_pfn;
40660+extern struct edid_info edid_info;
40661+
40662+shared_info_t *HYPERVISOR_shared_info = (shared_info_t *)empty_zero_page;
40663+EXPORT_SYMBOL(HYPERVISOR_shared_info);
40664+
40665+extern char hypercall_page[PAGE_SIZE];
40666+EXPORT_SYMBOL(hypercall_page);
40667+
40668+/* Allows setting of maximum possible memory size */
40669+unsigned long xen_override_max_pfn;
40670+
40671+static int xen_panic_event(struct notifier_block *, unsigned long, void *);
40672+static struct notifier_block xen_panic_block = {
40673+ xen_panic_event, NULL, 0 /* try to go last */
40674+};
40675+
40676+unsigned long *phys_to_machine_mapping;
40677+unsigned long *pfn_to_mfn_frame_list_list, *pfn_to_mfn_frame_list[512];
40678+
40679+EXPORT_SYMBOL(phys_to_machine_mapping);
40680+
40681+DEFINE_PER_CPU(multicall_entry_t, multicall_list[8]);
40682+DEFINE_PER_CPU(int, nr_multicall_ents);
40683+
40684+/* Raw start-of-day parameters from the hypervisor. */
40685+start_info_t *xen_start_info;
40686+EXPORT_SYMBOL(xen_start_info);
40687+#endif
40688+
40689+/*
40690+ * Machine setup..
40691+ */
40692+
40693+struct cpuinfo_x86 boot_cpu_data __read_mostly;
40694+
40695+unsigned long mmu_cr4_features;
40696+
40697+int acpi_disabled;
40698+EXPORT_SYMBOL(acpi_disabled);
40699+#ifdef CONFIG_ACPI
40700+extern int __initdata acpi_ht;
40701+extern acpi_interrupt_flags acpi_sci_flags;
40702+int __initdata acpi_force = 0;
40703+#endif
40704+
40705+int acpi_numa __initdata;
40706+
40707+/* Boot loader ID as an integer, for the benefit of proc_dointvec */
40708+int bootloader_type;
40709+
40710+unsigned long saved_video_mode;
40711+
40712+/*
40713+ * Setup options
40714+ */
40715+struct screen_info screen_info;
40716+struct sys_desc_table_struct {
40717+ unsigned short length;
40718+ unsigned char table[0];
40719+};
40720+
40721+struct edid_info edid_info;
40722+struct e820map e820;
40723+#ifdef CONFIG_XEN
40724+struct e820map machine_e820;
40725+#endif
40726+
40727+extern int root_mountflags;
40728+
40729+char command_line[COMMAND_LINE_SIZE];
40730+
40731+struct resource standard_io_resources[] = {
40732+ { .name = "dma1", .start = 0x00, .end = 0x1f,
40733+ .flags = IORESOURCE_BUSY | IORESOURCE_IO },
40734+ { .name = "pic1", .start = 0x20, .end = 0x21,
40735+ .flags = IORESOURCE_BUSY | IORESOURCE_IO },
40736+ { .name = "timer0", .start = 0x40, .end = 0x43,
40737+ .flags = IORESOURCE_BUSY | IORESOURCE_IO },
40738+ { .name = "timer1", .start = 0x50, .end = 0x53,
40739+ .flags = IORESOURCE_BUSY | IORESOURCE_IO },
40740+ { .name = "keyboard", .start = 0x60, .end = 0x6f,
40741+ .flags = IORESOURCE_BUSY | IORESOURCE_IO },
40742+ { .name = "dma page reg", .start = 0x80, .end = 0x8f,
40743+ .flags = IORESOURCE_BUSY | IORESOURCE_IO },
40744+ { .name = "pic2", .start = 0xa0, .end = 0xa1,
40745+ .flags = IORESOURCE_BUSY | IORESOURCE_IO },
40746+ { .name = "dma2", .start = 0xc0, .end = 0xdf,
40747+ .flags = IORESOURCE_BUSY | IORESOURCE_IO },
40748+ { .name = "fpu", .start = 0xf0, .end = 0xff,
40749+ .flags = IORESOURCE_BUSY | IORESOURCE_IO }
40750+};
40751+
40752+#define STANDARD_IO_RESOURCES \
40753+ (sizeof standard_io_resources / sizeof standard_io_resources[0])
40754+
40755+#define IORESOURCE_RAM (IORESOURCE_BUSY | IORESOURCE_MEM)
40756+
40757+struct resource data_resource = {
40758+ .name = "Kernel data",
40759+ .start = 0,
40760+ .end = 0,
40761+ .flags = IORESOURCE_RAM,
40762+};
40763+struct resource code_resource = {
40764+ .name = "Kernel code",
40765+ .start = 0,
40766+ .end = 0,
40767+ .flags = IORESOURCE_RAM,
40768+};
40769+
40770+#define IORESOURCE_ROM (IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM)
40771+
40772+static struct resource system_rom_resource = {
40773+ .name = "System ROM",
40774+ .start = 0xf0000,
40775+ .end = 0xfffff,
40776+ .flags = IORESOURCE_ROM,
40777+};
40778+
40779+static struct resource extension_rom_resource = {
40780+ .name = "Extension ROM",
40781+ .start = 0xe0000,
40782+ .end = 0xeffff,
40783+ .flags = IORESOURCE_ROM,
40784+};
40785+
40786+static struct resource adapter_rom_resources[] = {
40787+ { .name = "Adapter ROM", .start = 0xc8000, .end = 0,
40788+ .flags = IORESOURCE_ROM },
40789+ { .name = "Adapter ROM", .start = 0, .end = 0,
40790+ .flags = IORESOURCE_ROM },
40791+ { .name = "Adapter ROM", .start = 0, .end = 0,
40792+ .flags = IORESOURCE_ROM },
40793+ { .name = "Adapter ROM", .start = 0, .end = 0,
40794+ .flags = IORESOURCE_ROM },
40795+ { .name = "Adapter ROM", .start = 0, .end = 0,
40796+ .flags = IORESOURCE_ROM },
40797+ { .name = "Adapter ROM", .start = 0, .end = 0,
40798+ .flags = IORESOURCE_ROM }
40799+};
40800+
40801+#define ADAPTER_ROM_RESOURCES \
40802+ (sizeof adapter_rom_resources / sizeof adapter_rom_resources[0])
40803+
40804+static struct resource video_rom_resource = {
40805+ .name = "Video ROM",
40806+ .start = 0xc0000,
40807+ .end = 0xc7fff,
40808+ .flags = IORESOURCE_ROM,
40809+};
40810+
40811+static struct resource video_ram_resource = {
40812+ .name = "Video RAM area",
40813+ .start = 0xa0000,
40814+ .end = 0xbffff,
40815+ .flags = IORESOURCE_RAM,
40816+};
40817+
40818+#define romsignature(x) (*(unsigned short *)(x) == 0xaa55)
40819+
40820+static int __init romchecksum(unsigned char *rom, unsigned long length)
40821+{
40822+ unsigned char *p, sum = 0;
40823+
40824+ for (p = rom; p < rom + length; p++)
40825+ sum += *p;
40826+ return sum == 0;
40827+}
40828+
40829+static void __init probe_roms(void)
40830+{
40831+ unsigned long start, length, upper;
40832+ unsigned char *rom;
40833+ int i;
40834+
40835+#ifdef CONFIG_XEN
40836+ /* Nothing to do if not running in dom0. */
40837+ if (!is_initial_xendomain())
40838+ return;
40839+#endif
40840+
40841+ /* video rom */
40842+ upper = adapter_rom_resources[0].start;
40843+ for (start = video_rom_resource.start; start < upper; start += 2048) {
40844+ rom = isa_bus_to_virt(start);
40845+ if (!romsignature(rom))
40846+ continue;
40847+
40848+ video_rom_resource.start = start;
40849+
40850+ /* 0 < length <= 0x7f * 512, historically */
40851+ length = rom[2] * 512;
40852+
40853+ /* if checksum okay, trust length byte */
40854+ if (length && romchecksum(rom, length))
40855+ video_rom_resource.end = start + length - 1;
40856+
40857+ request_resource(&iomem_resource, &video_rom_resource);
40858+ break;
40859+ }
40860+
40861+ start = (video_rom_resource.end + 1 + 2047) & ~2047UL;
40862+ if (start < upper)
40863+ start = upper;
40864+
40865+ /* system rom */
40866+ request_resource(&iomem_resource, &system_rom_resource);
40867+ upper = system_rom_resource.start;
40868+
40869+ /* check for extension rom (ignore length byte!) */
40870+ rom = isa_bus_to_virt(extension_rom_resource.start);
40871+ if (romsignature(rom)) {
40872+ length = extension_rom_resource.end - extension_rom_resource.start + 1;
40873+ if (romchecksum(rom, length)) {
40874+ request_resource(&iomem_resource, &extension_rom_resource);
40875+ upper = extension_rom_resource.start;
40876+ }
40877+ }
40878+
40879+ /* check for adapter roms on 2k boundaries */
40880+ for (i = 0; i < ADAPTER_ROM_RESOURCES && start < upper; start += 2048) {
40881+ rom = isa_bus_to_virt(start);
40882+ if (!romsignature(rom))
40883+ continue;
40884+
40885+ /* 0 < length <= 0x7f * 512, historically */
40886+ length = rom[2] * 512;
40887+
40888+ /* but accept any length that fits if checksum okay */
40889+ if (!length || start + length > upper || !romchecksum(rom, length))
40890+ continue;
40891+
40892+ adapter_rom_resources[i].start = start;
40893+ adapter_rom_resources[i].end = start + length - 1;
40894+ request_resource(&iomem_resource, &adapter_rom_resources[i]);
40895+
40896+ start = adapter_rom_resources[i++].end & ~2047UL;
40897+ }
40898+}
40899+
40900+static __init void parse_cmdline_early (char ** cmdline_p)
40901+{
40902+ char c = ' ', *to = command_line, *from = COMMAND_LINE;
40903+ int len = 0;
40904+ int userdef = 0;
40905+
40906+ for (;;) {
40907+ if (c != ' ')
40908+ goto next_char;
40909+
40910+#ifdef CONFIG_SMP
40911+ /*
40912+ * If the BIOS enumerates physical processors before logical,
40913+ * maxcpus=N at enumeration-time can be used to disable HT.
40914+ */
40915+ else if (!memcmp(from, "maxcpus=", 8)) {
40916+ extern unsigned int maxcpus;
40917+
40918+ maxcpus = simple_strtoul(from + 8, NULL, 0);
40919+ }
40920+#endif
40921+#ifdef CONFIG_ACPI
40922+ /* "acpi=off" disables both ACPI table parsing and interpreter init */
40923+ if (!memcmp(from, "acpi=off", 8))
40924+ disable_acpi();
40925+
40926+ if (!memcmp(from, "acpi=force", 10)) {
40927+ /* add later when we do DMI horrors: */
40928+ acpi_force = 1;
40929+ acpi_disabled = 0;
40930+ }
40931+
40932+ /* acpi=ht just means: do ACPI MADT parsing
40933+ at bootup, but don't enable the full ACPI interpreter */
40934+ if (!memcmp(from, "acpi=ht", 7)) {
40935+ if (!acpi_force)
40936+ disable_acpi();
40937+ acpi_ht = 1;
40938+ }
40939+ else if (!memcmp(from, "pci=noacpi", 10))
40940+ acpi_disable_pci();
40941+ else if (!memcmp(from, "acpi=noirq", 10))
40942+ acpi_noirq_set();
40943+
40944+ else if (!memcmp(from, "acpi_sci=edge", 13))
40945+ acpi_sci_flags.trigger = 1;
40946+ else if (!memcmp(from, "acpi_sci=level", 14))
40947+ acpi_sci_flags.trigger = 3;
40948+ else if (!memcmp(from, "acpi_sci=high", 13))
40949+ acpi_sci_flags.polarity = 1;
40950+ else if (!memcmp(from, "acpi_sci=low", 12))
40951+ acpi_sci_flags.polarity = 3;
40952+
40953+ /* acpi=strict disables out-of-spec workarounds */
40954+ else if (!memcmp(from, "acpi=strict", 11)) {
40955+ acpi_strict = 1;
40956+ }
40957+#ifdef CONFIG_X86_IO_APIC
40958+ else if (!memcmp(from, "acpi_skip_timer_override", 24))
40959+ acpi_skip_timer_override = 1;
40960+#endif
40961+#endif
40962+
40963+#ifndef CONFIG_XEN
40964+ if (!memcmp(from, "nolapic", 7) ||
40965+ !memcmp(from, "disableapic", 11))
40966+ disable_apic = 1;
40967+
40968+ /* Don't confuse with noapictimer */
40969+ if (!memcmp(from, "noapic", 6) &&
40970+ (from[6] == ' ' || from[6] == 0))
40971+ skip_ioapic_setup = 1;
40972+
40973+ /* Make sure to not confuse with apic= */
40974+ if (!memcmp(from, "apic", 4) &&
40975+ (from[4] == ' ' || from[4] == 0)) {
40976+ skip_ioapic_setup = 0;
40977+ ioapic_force = 1;
40978+ }
40979+#endif
40980+
40981+ if (!memcmp(from, "mem=", 4))
40982+ parse_memopt(from+4, &from);
40983+
40984+ if (!memcmp(from, "memmap=", 7)) {
40985+ /* exactmap option is for user defined memory */
40986+ if (!memcmp(from+7, "exactmap", 8)) {
40987+#ifdef CONFIG_CRASH_DUMP
40988+ /* If we are doing a crash dump, we
40989+ * still need to know the real mem
40990+ * size before original memory map is
40991+ * reset.
40992+ */
40993+ saved_max_pfn = e820_end_of_ram();
40994+#endif
40995+ from += 8+7;
40996+ end_pfn_map = 0;
40997+ e820.nr_map = 0;
40998+ userdef = 1;
40999+ }
41000+ else {
41001+ parse_memmapopt(from+7, &from);
41002+ userdef = 1;
41003+ }
41004+ }
41005+
41006+#ifdef CONFIG_NUMA
41007+ if (!memcmp(from, "numa=", 5))
41008+ numa_setup(from+5);
41009+#endif
41010+
41011+ if (!memcmp(from,"iommu=",6)) {
41012+ iommu_setup(from+6);
41013+ }
41014+
41015+ if (!memcmp(from,"oops=panic", 10))
41016+ panic_on_oops = 1;
41017+
41018+ if (!memcmp(from, "noexec=", 7))
41019+ nonx_setup(from + 7);
41020+
41021+#ifdef CONFIG_KEXEC
41022+ /* crashkernel=size@addr specifies the location to reserve for
41023+ * a crash kernel. By reserving this memory we guarantee
41024+ * that linux never sets it up as a DMA target.
41025+ * Useful for holding code to do something appropriate
41026+ * after a kernel panic.
41027+ */
41028+ else if (!memcmp(from, "crashkernel=", 12)) {
41029+#ifndef CONFIG_XEN
41030+ unsigned long size, base;
41031+ size = memparse(from+12, &from);
41032+ if (*from == '@') {
41033+ base = memparse(from+1, &from);
41034+ /* FIXME: Do I want a sanity check
41035+ * to validate the memory range?
41036+ */
41037+ crashk_res.start = base;
41038+ crashk_res.end = base + size - 1;
41039+ }
41040+#else
41041+ printk("Ignoring crashkernel command line, "
41042+ "parameter will be supplied by xen\n");
41043+#endif
41044+ }
41045+#endif
41046+
41047+#ifdef CONFIG_PROC_VMCORE
41048+ /* elfcorehdr= specifies the location of elf core header
41049+ * stored by the crashed kernel. This option will be passed
41050+ * by kexec loader to the capture kernel.
41051+ */
41052+ else if(!memcmp(from, "elfcorehdr=", 11))
41053+ elfcorehdr_addr = memparse(from+11, &from);
41054+#endif
41055+
41056+#if defined(CONFIG_HOTPLUG_CPU) && !defined(CONFIG_XEN)
41057+ else if (!memcmp(from, "additional_cpus=", 16))
41058+ setup_additional_cpus(from+16);
41059+#endif
41060+
41061+ next_char:
41062+ c = *(from++);
41063+ if (!c)
41064+ break;
41065+ if (COMMAND_LINE_SIZE <= ++len)
41066+ break;
41067+ *(to++) = c;
41068+ }
41069+ if (userdef) {
41070+ printk(KERN_INFO "user-defined physical RAM map:\n");
41071+ e820_print_map("user");
41072+ }
41073+ *to = '\0';
41074+ *cmdline_p = command_line;
41075+}
41076+
41077+#ifndef CONFIG_NUMA
41078+static void __init
41079+contig_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
41080+{
41081+ unsigned long bootmap_size, bootmap;
41082+
41083+ bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT;
41084+ bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size);
41085+ if (bootmap == -1L)
41086+ panic("Cannot find bootmem map of size %ld\n",bootmap_size);
41087+ bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, end_pfn);
41088+#ifdef CONFIG_XEN
41089+ e820_bootmem_free(NODE_DATA(0), 0, xen_start_info->nr_pages<<PAGE_SHIFT);
41090+#else
41091+ e820_bootmem_free(NODE_DATA(0), 0, end_pfn << PAGE_SHIFT);
41092+#endif
41093+ reserve_bootmem(bootmap, bootmap_size);
41094+}
41095+#endif
41096+
41097+/* Use inline assembly to define this because the nops are defined
41098+ as inline assembly strings in the include files and we cannot
41099+ get them easily into strings. */
41100+asm("\t.data\nk8nops: "
41101+ K8_NOP1 K8_NOP2 K8_NOP3 K8_NOP4 K8_NOP5 K8_NOP6
41102+ K8_NOP7 K8_NOP8);
41103+
41104+extern unsigned char k8nops[];
41105+static unsigned char *k8_nops[ASM_NOP_MAX+1] = {
41106+ NULL,
41107+ k8nops,
41108+ k8nops + 1,
41109+ k8nops + 1 + 2,
41110+ k8nops + 1 + 2 + 3,
41111+ k8nops + 1 + 2 + 3 + 4,
41112+ k8nops + 1 + 2 + 3 + 4 + 5,
41113+ k8nops + 1 + 2 + 3 + 4 + 5 + 6,
41114+ k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
41115+};
41116+
41117+extern char __vsyscall_0;
41118+
41119+/* Replace instructions with better alternatives for this CPU type.
41120+
41121+ This runs before SMP is initialized to avoid SMP problems with
41122+ self-modifying code. This implies that asymmetric systems where
41123+ APs have fewer capabilities than the boot processor are not handled.
41124+ In this case boot with "noreplacement". */
41125+void apply_alternatives(void *start, void *end)
41126+{
41127+ struct alt_instr *a;
41128+ int diff, i, k;
41129+ for (a = start; (void *)a < end; a++) {
41130+ u8 *instr;
41131+
41132+ if (!boot_cpu_has(a->cpuid))
41133+ continue;
41134+
41135+ BUG_ON(a->replacementlen > a->instrlen);
41136+ instr = a->instr;
41137+ /* vsyscall code is not mapped yet. resolve it manually. */
41138+ if (instr >= (u8 *)VSYSCALL_START && instr < (u8*)VSYSCALL_END)
41139+ instr -= VSYSCALL_START - (unsigned long)&__vsyscall_0;
41140+ __inline_memcpy(instr, a->replacement, a->replacementlen);
41141+ diff = a->instrlen - a->replacementlen;
41142+
41143+ /* Pad the rest with nops */
41144+ for (i = a->replacementlen; diff > 0; diff -= k, i += k) {
41145+ k = diff;
41146+ if (k > ASM_NOP_MAX)
41147+ k = ASM_NOP_MAX;
41148+ __inline_memcpy(instr + i, k8_nops[k], k);
41149+ }
41150+ }
41151+}
41152+
41153+static int no_replacement __initdata = 0;
41154+
41155+void __init alternative_instructions(void)
41156+{
41157+ extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
41158+ if (no_replacement)
41159+ return;
41160+ apply_alternatives(__alt_instructions, __alt_instructions_end);
41161+}
41162+
41163+static int __init noreplacement_setup(char *s)
41164+{
41165+ no_replacement = 1;
41166+ return 0;
41167+}
41168+
41169+__setup("noreplacement", noreplacement_setup);
41170+
41171+#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
41172+struct edd edd;
41173+#ifdef CONFIG_EDD_MODULE
41174+EXPORT_SYMBOL(edd);
41175+#endif
41176+/**
41177+ * copy_edd() - Copy the BIOS EDD information
41178+ * from boot_params into a safe place.
41179+ *
41180+ */
41181+static inline void copy_edd(void)
41182+{
41183+ memcpy(edd.mbr_signature, EDD_MBR_SIGNATURE, sizeof(edd.mbr_signature));
41184+ memcpy(edd.edd_info, EDD_BUF, sizeof(edd.edd_info));
41185+ edd.mbr_signature_nr = EDD_MBR_SIG_NR;
41186+ edd.edd_info_nr = EDD_NR;
41187+}
41188+#else
41189+static inline void copy_edd(void)
41190+{
41191+}
41192+#endif
41193+
41194+#ifndef CONFIG_XEN
41195+#define EBDA_ADDR_POINTER 0x40E
41196+static void __init reserve_ebda_region(void)
41197+{
41198+ unsigned int addr;
41199+ /**
41200+ * there is a real-mode segmented pointer pointing to the
41201+ * 4K EBDA area at 0x40E
41202+ */
41203+ addr = *(unsigned short *)phys_to_virt(EBDA_ADDR_POINTER);
41204+ addr <<= 4;
41205+ if (addr)
41206+ reserve_bootmem_generic(addr, PAGE_SIZE);
41207+}
41208+#endif
41209+
41210+void __init setup_arch(char **cmdline_p)
41211+{
41212+ unsigned long kernel_end;
41213+ struct xen_memory_map memmap;
41214+
41215+#ifdef CONFIG_XEN
41216+ /* Register a call for panic conditions. */
41217+ notifier_chain_register(&panic_notifier_list, &xen_panic_block);
41218+
41219+ ROOT_DEV = MKDEV(RAMDISK_MAJOR,0);
41220+ kernel_end = 0; /* dummy */
41221+ screen_info = SCREEN_INFO;
41222+
41223+ if (is_initial_xendomain()) {
41224+ /* This is drawn from a dump from vgacon:startup in
41225+ * standard Linux. */
41226+ screen_info.orig_video_mode = 3;
41227+ screen_info.orig_video_isVGA = 1;
41228+ screen_info.orig_video_lines = 25;
41229+ screen_info.orig_video_cols = 80;
41230+ screen_info.orig_video_ega_bx = 3;
41231+ screen_info.orig_video_points = 16;
41232+ screen_info.orig_y = screen_info.orig_video_lines - 1;
41233+ if (xen_start_info->console.dom0.info_size >=
41234+ sizeof(struct dom0_vga_console_info)) {
41235+ const struct dom0_vga_console_info *info =
41236+ (struct dom0_vga_console_info *)(
41237+ (char *)xen_start_info +
41238+ xen_start_info->console.dom0.info_off);
41239+ dom0_init_screen_info(info);
41240+ }
41241+ xen_start_info->console.domU.mfn = 0;
41242+ xen_start_info->console.domU.evtchn = 0;
41243+ } else
41244+ screen_info.orig_video_isVGA = 0;
41245+
41246+ edid_info = EDID_INFO;
41247+ saved_video_mode = SAVED_VIDEO_MODE;
41248+ bootloader_type = LOADER_TYPE;
41249+
41250+#ifdef CONFIG_BLK_DEV_RAM
41251+ rd_image_start = RAMDISK_FLAGS & RAMDISK_IMAGE_START_MASK;
41252+ rd_prompt = ((RAMDISK_FLAGS & RAMDISK_PROMPT_FLAG) != 0);
41253+ rd_doload = ((RAMDISK_FLAGS & RAMDISK_LOAD_FLAG) != 0);
41254+
41255+
41256+#endif
41257+
41258+ setup_xen_features();
41259+
41260+ HYPERVISOR_vm_assist(VMASST_CMD_enable,
41261+ VMASST_TYPE_writable_pagetables);
41262+
41263+ ARCH_SETUP
41264+#else
41265+ ROOT_DEV = old_decode_dev(ORIG_ROOT_DEV);
41266+ screen_info = SCREEN_INFO;
41267+ edid_info = EDID_INFO;
41268+ saved_video_mode = SAVED_VIDEO_MODE;
41269+ bootloader_type = LOADER_TYPE;
41270+
41271+#ifdef CONFIG_BLK_DEV_RAM
41272+ rd_image_start = RAMDISK_FLAGS & RAMDISK_IMAGE_START_MASK;
41273+ rd_prompt = ((RAMDISK_FLAGS & RAMDISK_PROMPT_FLAG) != 0);
41274+ rd_doload = ((RAMDISK_FLAGS & RAMDISK_LOAD_FLAG) != 0);
41275+#endif
41276+#endif /* !CONFIG_XEN */
41277+ setup_memory_region();
41278+ copy_edd();
41279+
41280+ if (!MOUNT_ROOT_RDONLY)
41281+ root_mountflags &= ~MS_RDONLY;
41282+ init_mm.start_code = (unsigned long) &_text;
41283+ init_mm.end_code = (unsigned long) &_etext;
41284+ init_mm.end_data = (unsigned long) &_edata;
41285+ init_mm.brk = (unsigned long) &_end;
41286+
41287+ code_resource.start = virt_to_phys(&_text);
41288+ code_resource.end = virt_to_phys(&_etext)-1;
41289+ data_resource.start = virt_to_phys(&_etext);
41290+ data_resource.end = virt_to_phys(&_edata)-1;
41291+
41292+ parse_cmdline_early(cmdline_p);
41293+
41294+ early_identify_cpu(&boot_cpu_data);
41295+
41296+ /*
41297+ * partially used pages are not usable - thus
41298+ * we are rounding upwards:
41299+ */
41300+ end_pfn = e820_end_of_ram();
41301+
41302+ check_efer();
41303+
41304+ init_memory_mapping(0, (end_pfn_map << PAGE_SHIFT));
41305+
41306+#ifdef CONFIG_ACPI_NUMA
41307+ /*
41308+ * Parse SRAT to discover nodes.
41309+ */
41310+ acpi_numa_init();
41311+#endif
41312+
41313+#ifdef CONFIG_NUMA
41314+ numa_initmem_init(0, end_pfn);
41315+#else
41316+ contig_initmem_init(0, end_pfn);
41317+#endif
41318+
41319+ /* Reserve direct mapping */
41320+ reserve_bootmem_generic(table_start << PAGE_SHIFT,
41321+ (table_end - table_start) << PAGE_SHIFT);
41322+
41323+ /* reserve kernel */
41324+ kernel_end = round_up(__pa_symbol(&_end),PAGE_SIZE);
41325+ reserve_bootmem_generic(HIGH_MEMORY, kernel_end - HIGH_MEMORY);
41326+
41327+#ifdef CONFIG_XEN
41328+ /* reserve physmap, start info and initial page tables */
41329+ reserve_bootmem(kernel_end, (table_start<<PAGE_SHIFT)-kernel_end);
41330+#else
41331+ /*
41332+ * reserve physical page 0 - it's a special BIOS page on many boxes,
41333+ * enabling clean reboots, SMP operation, laptop functions.
41334+ */
41335+ reserve_bootmem_generic(0, PAGE_SIZE);
41336+
41337+ /* reserve ebda region */
41338+ reserve_ebda_region();
41339+#endif
41340+
41341+#ifdef CONFIG_SMP
41342+ /*
41343+ * But first pinch a few for the stack/trampoline stuff
41344+ * FIXME: Don't need the extra page at 4K, but need to fix
41345+ * trampoline before removing it. (see the GDT stuff)
41346+ */
41347+ reserve_bootmem_generic(PAGE_SIZE, PAGE_SIZE);
41348+
41349+ /* Reserve SMP trampoline */
41350+ reserve_bootmem_generic(SMP_TRAMPOLINE_BASE, PAGE_SIZE);
41351+#endif
41352+
41353+#ifdef CONFIG_ACPI_SLEEP
41354+ /*
41355+ * Reserve low memory region for sleep support.
41356+ */
41357+ acpi_reserve_bootmem();
41358+#endif
41359+#ifdef CONFIG_XEN
41360+#ifdef CONFIG_BLK_DEV_INITRD
41361+ if (xen_start_info->mod_start) {
41362+ if (INITRD_START + INITRD_SIZE <= (end_pfn << PAGE_SHIFT)) {
41363+ /*reserve_bootmem_generic(INITRD_START, INITRD_SIZE);*/
41364+ initrd_start = INITRD_START + PAGE_OFFSET;
41365+ initrd_end = initrd_start+INITRD_SIZE;
41366+ initrd_below_start_ok = 1;
41367+ } else {
41368+ printk(KERN_ERR "initrd extends beyond end of memory "
41369+ "(0x%08lx > 0x%08lx)\ndisabling initrd\n",
41370+ (unsigned long)(INITRD_START + INITRD_SIZE),
41371+ (unsigned long)(end_pfn << PAGE_SHIFT));
41372+ initrd_start = 0;
41373+ }
41374+ }
41375+#endif
41376+#else /* CONFIG_XEN */
41377+#ifdef CONFIG_BLK_DEV_INITRD
41378+ if (LOADER_TYPE && INITRD_START) {
41379+ if (INITRD_START + INITRD_SIZE <= (end_pfn << PAGE_SHIFT)) {
41380+ reserve_bootmem_generic(INITRD_START, INITRD_SIZE);
41381+ initrd_start =
41382+ INITRD_START ? INITRD_START + PAGE_OFFSET : 0;
41383+ initrd_end = initrd_start+INITRD_SIZE;
41384+ }
41385+ else {
41386+ printk(KERN_ERR "initrd extends beyond end of memory "
41387+ "(0x%08lx > 0x%08lx)\ndisabling initrd\n",
41388+ (unsigned long)(INITRD_START + INITRD_SIZE),
41389+ (unsigned long)(end_pfn << PAGE_SHIFT));
41390+ initrd_start = 0;
41391+ }
41392+ }
41393+#endif
41394+#endif /* !CONFIG_XEN */
41395+#ifdef CONFIG_KEXEC
41396+#ifdef CONFIG_XEN
41397+ xen_machine_kexec_setup_resources();
41398+#else
41399+ if (crashk_res.start != crashk_res.end) {
41400+ reserve_bootmem(crashk_res.start,
41401+ crashk_res.end - crashk_res.start + 1);
41402+ }
41403+#endif
41404+#endif
41405+
41406+ paging_init();
41407+#ifdef CONFIG_X86_LOCAL_APIC
41408+ /*
41409+ * Find and reserve possible boot-time SMP configuration:
41410+ */
41411+ find_smp_config();
41412+#endif
41413+#ifdef CONFIG_XEN
41414+ {
41415+ int i, j, k, fpp;
41416+
41417+ if (!xen_feature(XENFEAT_auto_translated_physmap)) {
41418+ /* Make sure we have a large enough P->M table. */
41419+ phys_to_machine_mapping = alloc_bootmem_pages(
41420+ end_pfn * sizeof(unsigned long));
41421+ memset(phys_to_machine_mapping, ~0,
41422+ end_pfn * sizeof(unsigned long));
41423+ memcpy(phys_to_machine_mapping,
41424+ (unsigned long *)xen_start_info->mfn_list,
41425+ xen_start_info->nr_pages * sizeof(unsigned long));
41426+ free_bootmem(
41427+ __pa(xen_start_info->mfn_list),
41428+ PFN_PHYS(PFN_UP(xen_start_info->nr_pages *
41429+ sizeof(unsigned long))));
41430+
41431+ /*
41432+ * Initialise the list of the frames that specify the
41433+ * list of frames that make up the p2m table. Used by
41434+ * save/restore.
41435+ */
41436+ pfn_to_mfn_frame_list_list = alloc_bootmem_pages(PAGE_SIZE);
41437+ HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
41438+ virt_to_mfn(pfn_to_mfn_frame_list_list);
41439+
41440+ fpp = PAGE_SIZE/sizeof(unsigned long);
41441+ for (i=0, j=0, k=-1; i< end_pfn; i+=fpp, j++) {
41442+ if ((j % fpp) == 0) {
41443+ k++;
41444+ BUG_ON(k>=fpp);
41445+ pfn_to_mfn_frame_list[k] =
41446+ alloc_bootmem_pages(PAGE_SIZE);
41447+ pfn_to_mfn_frame_list_list[k] =
41448+ virt_to_mfn(pfn_to_mfn_frame_list[k]);
41449+ j=0;
41450+ }
41451+ pfn_to_mfn_frame_list[k][j] =
41452+ virt_to_mfn(&phys_to_machine_mapping[i]);
41453+ }
41454+ HYPERVISOR_shared_info->arch.max_pfn = end_pfn;
41455+ }
41456+
41457+ }
41458+
41459+ if (!is_initial_xendomain()) {
41460+ acpi_disabled = 1;
41461+#ifdef CONFIG_ACPI
41462+ acpi_ht = 0;
41463+#endif
41464+ }
41465+#endif
41466+
41467+#ifndef CONFIG_XEN
41468+ check_ioapic();
41469+#endif
41470+
41471+ zap_low_mappings(0);
41472+
41473+#ifdef CONFIG_ACPI
41474+ /*
41475+ * Initialize the ACPI boot-time table parser (gets the RSDP and SDT).
41476+ * Call this early for SRAT node setup.
41477+ */
41478+ acpi_boot_table_init();
41479+
41480+ /*
41481+ * Read APIC and some other early information from ACPI tables.
41482+ */
41483+ acpi_boot_init();
41484+#endif
41485+
41486+ init_cpu_to_node();
41487+
41488+#ifdef CONFIG_X86_LOCAL_APIC
41489+ /*
41490+ * get boot-time SMP configuration:
41491+ */
41492+ if (smp_found_config)
41493+ get_smp_config();
41494+#ifndef CONFIG_XEN
41495+ init_apic_mappings();
41496+#endif
41497+#endif
41498+#if defined(CONFIG_XEN) && defined(CONFIG_SMP) && !defined(CONFIG_HOTPLUG_CPU)
41499+ prefill_possible_map();
41500+#endif
41501+
41502+ /*
41503+ * Request address space for all standard RAM and ROM resources
41504+ * and also for regions reported as reserved by the e820.
41505+ */
41506+ probe_roms();
41507+#ifdef CONFIG_XEN
41508+ if (is_initial_xendomain()) {
41509+ memmap.nr_entries = E820MAX;
41510+ set_xen_guest_handle(memmap.buffer, machine_e820.map);
41511+
41512+ if (HYPERVISOR_memory_op(XENMEM_machine_memory_map, &memmap))
41513+ BUG();
41514+ machine_e820.nr_map = memmap.nr_entries;
41515+
41516+ e820_reserve_resources(machine_e820.map, machine_e820.nr_map);
41517+ }
41518+#else
41519+ e820_reserve_resources(e820.map, e820.nr_map);
41520+#endif
41521+
41522+ request_resource(&iomem_resource, &video_ram_resource);
41523+
41524+ {
41525+ unsigned i;
41526+ /* request I/O space for devices used on all i[345]86 PCs */
41527+ for (i = 0; i < STANDARD_IO_RESOURCES; i++)
41528+ request_resource(&ioport_resource, &standard_io_resources[i]);
41529+ }
41530+
41531+#ifdef CONFIG_XEN
41532+ if (is_initial_xendomain())
41533+ e820_setup_gap(machine_e820.map, machine_e820.nr_map);
41534+#else
41535+ e820_setup_gap(e820.map, e820.nr_map);
41536+#endif
41537+
41538+#ifdef CONFIG_GART_IOMMU
41539+ iommu_hole_init();
41540+#endif
41541+
41542+#ifdef CONFIG_XEN
41543+ {
41544+ struct physdev_set_iopl set_iopl;
41545+
41546+ set_iopl.iopl = 1;
41547+ HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
41548+
41549+ if (is_initial_xendomain()) {
41550+#ifdef CONFIG_VT
41551+#if defined(CONFIG_VGA_CONSOLE)
41552+ conswitchp = &vga_con;
41553+#elif defined(CONFIG_DUMMY_CONSOLE)
41554+ conswitchp = &dummy_con;
41555+#endif
41556+#endif
41557+ } else {
41558+#if defined(CONFIG_VT) && defined(CONFIG_DUMMY_CONSOLE)
41559+ conswitchp = &dummy_con;
41560+#endif
41561+ }
41562+ }
41563+ xencons_early_setup();
41564+#else /* CONFIG_XEN */
41565+
41566+#ifdef CONFIG_VT
41567+#if defined(CONFIG_VGA_CONSOLE)
41568+ conswitchp = &vga_con;
41569+#elif defined(CONFIG_DUMMY_CONSOLE)
41570+ conswitchp = &dummy_con;
41571+#endif
41572+#endif
41573+
41574+#endif /* !CONFIG_XEN */
41575+}
41576+
41577+#ifdef CONFIG_XEN
41578+static int
41579+xen_panic_event(struct notifier_block *this, unsigned long event, void *ptr)
41580+{
41581+ HYPERVISOR_shutdown(SHUTDOWN_crash);
41582+ /* we're never actually going to get here... */
41583+ return NOTIFY_DONE;
41584+}
41585+#endif /* CONFIG_XEN */
41586+
41587+
41588+static int __cpuinit get_model_name(struct cpuinfo_x86 *c)
41589+{
41590+ unsigned int *v;
41591+
41592+ if (c->extended_cpuid_level < 0x80000004)
41593+ return 0;
41594+
41595+ v = (unsigned int *) c->x86_model_id;
41596+ cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]);
41597+ cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]);
41598+ cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]);
41599+ c->x86_model_id[48] = 0;
41600+ return 1;
41601+}
41602+
41603+
41604+static void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
41605+{
41606+ unsigned int n, dummy, eax, ebx, ecx, edx;
41607+
41608+ n = c->extended_cpuid_level;
41609+
41610+ if (n >= 0x80000005) {
41611+ cpuid(0x80000005, &dummy, &ebx, &ecx, &edx);
41612+ printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n",
41613+ edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
41614+ c->x86_cache_size=(ecx>>24)+(edx>>24);
41615+ /* On K8 L1 TLB is inclusive, so don't count it */
41616+ c->x86_tlbsize = 0;
41617+ }
41618+
41619+ if (n >= 0x80000006) {
41620+ cpuid(0x80000006, &dummy, &ebx, &ecx, &edx);
41621+ ecx = cpuid_ecx(0x80000006);
41622+ c->x86_cache_size = ecx >> 16;
41623+ c->x86_tlbsize += ((ebx >> 16) & 0xfff) + (ebx & 0xfff);
41624+
41625+ printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n",
41626+ c->x86_cache_size, ecx & 0xFF);
41627+ }
41628+
41629+ if (n >= 0x80000007)
41630+ cpuid(0x80000007, &dummy, &dummy, &dummy, &c->x86_power);
41631+ if (n >= 0x80000008) {
41632+ cpuid(0x80000008, &eax, &dummy, &dummy, &dummy);
41633+ c->x86_virt_bits = (eax >> 8) & 0xff;
41634+ c->x86_phys_bits = eax & 0xff;
41635+ }
41636+}
41637+
41638+#ifdef CONFIG_NUMA
41639+static int nearby_node(int apicid)
41640+{
41641+ int i;
41642+ for (i = apicid - 1; i >= 0; i--) {
41643+ int node = apicid_to_node[i];
41644+ if (node != NUMA_NO_NODE && node_online(node))
41645+ return node;
41646+ }
41647+ for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) {
41648+ int node = apicid_to_node[i];
41649+ if (node != NUMA_NO_NODE && node_online(node))
41650+ return node;
41651+ }
41652+ return first_node(node_online_map); /* Shouldn't happen */
41653+}
41654+#endif
41655+
41656+/*
41657+ * On an AMD dual core setup the lower bits of the APIC id distinguish the cores.
41658+ * Assumes number of cores is a power of two.
41659+ */
41660+static void __init amd_detect_cmp(struct cpuinfo_x86 *c)
41661+{
41662+#ifdef CONFIG_SMP
41663+ int cpu = smp_processor_id();
41664+ unsigned bits;
41665+#ifdef CONFIG_NUMA
41666+ int node = 0;
41667+ unsigned apicid = phys_proc_id[cpu];
41668+#endif
41669+
41670+ bits = 0;
41671+ while ((1 << bits) < c->x86_max_cores)
41672+ bits++;
41673+
41674+ /* Low order bits define the core id (index of core in socket) */
41675+ cpu_core_id[cpu] = phys_proc_id[cpu] & ((1 << bits)-1);
41676+ /* Convert the APIC ID into the socket ID */
41677+ phys_proc_id[cpu] >>= bits;
41678+
41679+#ifdef CONFIG_NUMA
41680+ node = phys_proc_id[cpu];
41681+ if (apicid_to_node[apicid] != NUMA_NO_NODE)
41682+ node = apicid_to_node[apicid];
41683+ if (!node_online(node)) {
41684+ /* Two possibilities here:
41685+ - The CPU is missing memory and no node was created.
41686+ In that case try picking one from a nearby CPU
41687+ - The APIC IDs differ from the HyperTransport node IDs
41688+ which the K8 northbridge parsing fills in.
41689+ Assume they are all increased by a constant offset,
41690+ but in the same order as the HT nodeids.
41691+ If that doesn't result in a usable node fall back to the
41692+ path for the previous case. */
41693+ int ht_nodeid = apicid - (phys_proc_id[0] << bits);
41694+ if (ht_nodeid >= 0 &&
41695+ apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
41696+ node = apicid_to_node[ht_nodeid];
41697+ /* Pick a nearby node */
41698+ if (!node_online(node))
41699+ node = nearby_node(apicid);
41700+ }
41701+ numa_set_node(cpu, node);
41702+
41703+ printk(KERN_INFO "CPU %d(%d) -> Node %d -> Core %d\n",
41704+ cpu, c->x86_max_cores, node, cpu_core_id[cpu]);
41705+#endif
41706+#endif
41707+}
41708+
41709+static int __init init_amd(struct cpuinfo_x86 *c)
41710+{
41711+ int r;
41712+ unsigned level;
41713+
41714+#ifdef CONFIG_SMP
41715+ unsigned long value;
41716+
41717+ /*
41718+ * Disable TLB flush filter by setting HWCR.FFDIS on K8
41719+ * bit 6 of msr C001_0015
41720+ *
41721+ * Errata 63 for SH-B3 steppings
41722+ * Errata 122 for all steppings (F+ have it disabled by default)
41723+ */
41724+ if (c->x86 == 15) {
41725+ rdmsrl(MSR_K8_HWCR, value);
41726+ value |= 1 << 6;
41727+ wrmsrl(MSR_K8_HWCR, value);
41728+ }
41729+#endif
41730+
41731+ /* Bit 31 in normal CPUID used for nonstandard 3DNow ID;
41732+ 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway */
41733+ clear_bit(0*32+31, &c->x86_capability);
41734+
41735+ /* On C+ stepping K8 rep microcode works well for copy/memset */
41736+ level = cpuid_eax(1);
41737+ if (c->x86 == 15 && ((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58))
41738+ set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability);
41739+
41740+ /* Enable workaround for FXSAVE leak */
41741+ if (c->x86 >= 6)
41742+ set_bit(X86_FEATURE_FXSAVE_LEAK, &c->x86_capability);
41743+
41744+ r = get_model_name(c);
41745+ if (!r) {
41746+ switch (c->x86) {
41747+ case 15:
41748+ /* Should distinguish Models here, but this is only
41749+ a fallback anyways. */
41750+ strcpy(c->x86_model_id, "Hammer");
41751+ break;
41752+ }
41753+ }
41754+ display_cacheinfo(c);
41755+
41756+ /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */
41757+ if (c->x86_power & (1<<8))
41758+ set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability);
41759+
41760+ if (c->extended_cpuid_level >= 0x80000008) {
41761+ c->x86_max_cores = (cpuid_ecx(0x80000008) & 0xff) + 1;
41762+ if (c->x86_max_cores & (c->x86_max_cores - 1))
41763+ c->x86_max_cores = 1;
41764+
41765+ amd_detect_cmp(c);
41766+ }
41767+
41768+ return r;
41769+}
41770+
41771+static void __cpuinit detect_ht(struct cpuinfo_x86 *c)
41772+{
41773+#ifdef CONFIG_SMP
41774+ u32 eax, ebx, ecx, edx;
41775+ int index_msb, core_bits;
41776+ int cpu = smp_processor_id();
41777+
41778+ cpuid(1, &eax, &ebx, &ecx, &edx);
41779+
41780+ c->apicid = phys_pkg_id(0);
41781+
41782+ if (!cpu_has(c, X86_FEATURE_HT) || cpu_has(c, X86_FEATURE_CMP_LEGACY))
41783+ return;
41784+
41785+ smp_num_siblings = (ebx & 0xff0000) >> 16;
41786+
41787+ if (smp_num_siblings == 1) {
41788+ printk(KERN_INFO "CPU: Hyper-Threading is disabled\n");
41789+ } else if (smp_num_siblings > 1 ) {
41790+
41791+ if (smp_num_siblings > NR_CPUS) {
41792+ printk(KERN_WARNING "CPU: Unsupported number of the siblings %d", smp_num_siblings);
41793+ smp_num_siblings = 1;
41794+ return;
41795+ }
41796+
41797+ index_msb = get_count_order(smp_num_siblings);
41798+ phys_proc_id[cpu] = phys_pkg_id(index_msb);
41799+
41800+ printk(KERN_INFO "CPU: Physical Processor ID: %d\n",
41801+ phys_proc_id[cpu]);
41802+
41803+ smp_num_siblings = smp_num_siblings / c->x86_max_cores;
41804+
41805+ index_msb = get_count_order(smp_num_siblings) ;
41806+
41807+ core_bits = get_count_order(c->x86_max_cores);
41808+
41809+ cpu_core_id[cpu] = phys_pkg_id(index_msb) &
41810+ ((1 << core_bits) - 1);
41811+
41812+ if (c->x86_max_cores > 1)
41813+ printk(KERN_INFO "CPU: Processor Core ID: %d\n",
41814+ cpu_core_id[cpu]);
41815+ }
41816+#endif
41817+}
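
The core/sibling bookkeeping above carves the initial APIC ID into bit fields: the low bits select the hyper-thread within a core, the next bits select the core within the package, and the remaining high bits identify the package. As a hedged, simplified illustration (user-space C; the widths, the sample APIC ID and count_order() are made-up stand-ins for the kernel helpers, not part of this patch):

    /* Hedged sketch of the APIC ID split performed by detect_ht() above.
     * The widths and the sample APIC ID are assumed values, and
     * count_order() stands in for the kernel's get_count_order(). */
    #include <stdio.h>

    static unsigned int count_order(unsigned int n)
    {
            unsigned int order = 0;

            while ((1u << order) < n)
                    order++;
            return order;
    }

    int main(void)
    {
            unsigned int apicid    = 0x13; /* assumed initial APIC ID */
            unsigned int threads   = 2;    /* SMT siblings per core   */
            unsigned int cores     = 4;    /* cores per package       */
            unsigned int smt_bits  = count_order(threads);
            unsigned int core_bits = count_order(cores);

            unsigned int thread_id = apicid & ((1u << smt_bits) - 1);
            unsigned int core_id   = (apicid >> smt_bits) & ((1u << core_bits) - 1);
            unsigned int pkg_id    = apicid >> (smt_bits + core_bits);

            printf("thread %u, core %u, package %u\n", thread_id, core_id, pkg_id);
            return 0;
    }
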
41818+
41819+/*
41820+ * find out the number of processor cores on the die
41821+ */
41822+static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c)
41823+{
41824+ unsigned int eax;
41825+
41826+ if (c->cpuid_level < 4)
41827+ return 1;
41828+
41829+ __asm__("cpuid"
41830+ : "=a" (eax)
41831+ : "0" (4), "c" (0)
41832+ : "bx", "dx");
41833+
41834+ if (eax & 0x1f)
41835+ return ((eax >> 26) + 1);
41836+ else
41837+ return 1;
41838+}
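
intel_num_cpu_cores() reads CPUID leaf 4 (sub-leaf 0): if the cache-type field in EAX bits 4:0 is non-zero, bits 31:26 hold the maximum number of addressable core IDs in the package minus one. A hedged user-space sketch of the same decoding (GCC/Clang <cpuid.h>; it assumes the CPU's maximum basic leaf is at least 4, which the kernel code checks via cpuid_level):

    /* Sketch of the leaf 4 decoding used above; user-space only. */
    #include <stdio.h>
    #include <cpuid.h>

    int main(void)
    {
            unsigned int eax, ebx, ecx, edx;
            unsigned int cores = 1;

            /* Assumes max basic CPUID leaf >= 4, as checked by cpuid_level above. */
            __cpuid_count(4, 0, eax, ebx, ecx, edx);
            if (eax & 0x1f)                 /* cache type != null: leaf 4 is valid */
                    cores = (eax >> 26) + 1;
            printf("cores reported by leaf 4: %u\n", cores);
            return 0;
    }
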
41839+
41840+static void srat_detect_node(void)
41841+{
41842+#ifdef CONFIG_NUMA
41843+ unsigned node;
41844+ int cpu = smp_processor_id();
41845+
41846+ /* For now, don't do the funky fallback heuristics
41847+ that the AMD version employs. */
41848+ node = apicid_to_node[hard_smp_processor_id()];
41849+ if (node == NUMA_NO_NODE)
41850+ node = 0;
41851+ numa_set_node(cpu, node);
41852+
41853+ if (acpi_numa > 0)
41854+ printk(KERN_INFO "CPU %d -> Node %d\n", cpu, node);
41855+#endif
41856+}
41857+
41858+static void __cpuinit init_intel(struct cpuinfo_x86 *c)
41859+{
41860+ /* Cache sizes */
41861+ unsigned n;
41862+
41863+ init_intel_cacheinfo(c);
41864+ n = c->extended_cpuid_level;
41865+ if (n >= 0x80000008) {
41866+ unsigned eax = cpuid_eax(0x80000008);
41867+ c->x86_virt_bits = (eax >> 8) & 0xff;
41868+ c->x86_phys_bits = eax & 0xff;
41869+ /* CPUID workaround for Intel 0F34 CPU */
41870+ if (c->x86_vendor == X86_VENDOR_INTEL &&
41871+ c->x86 == 0xF && c->x86_model == 0x3 &&
41872+ c->x86_mask == 0x4)
41873+ c->x86_phys_bits = 36;
41874+ }
41875+
41876+ if (c->x86 == 15)
41877+ c->x86_cache_alignment = c->x86_clflush_size * 2;
41878+ if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
41879+ (c->x86 == 0x6 && c->x86_model >= 0x0e))
41880+ set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability);
41881+ set_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
41882+ c->x86_max_cores = intel_num_cpu_cores(c);
41883+
41884+ srat_detect_node();
41885+}
41886+
41887+static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
41888+{
41889+ char *v = c->x86_vendor_id;
41890+
41891+ if (!strcmp(v, "AuthenticAMD"))
41892+ c->x86_vendor = X86_VENDOR_AMD;
41893+ else if (!strcmp(v, "GenuineIntel"))
41894+ c->x86_vendor = X86_VENDOR_INTEL;
41895+ else
41896+ c->x86_vendor = X86_VENDOR_UNKNOWN;
41897+}
41898+
41899+struct cpu_model_info {
41900+ int vendor;
41901+ int family;
41902+ char *model_names[16];
41903+};
41904+
41905+/* Do some early cpuid on the boot CPU to get the parameters that are
41906+ needed before check_bugs. Everything advanced is in identify_cpu
41907+ below. */
41908+void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
41909+{
41910+ u32 tfms;
41911+
41912+ c->loops_per_jiffy = loops_per_jiffy;
41913+ c->x86_cache_size = -1;
41914+ c->x86_vendor = X86_VENDOR_UNKNOWN;
41915+ c->x86_model = c->x86_mask = 0; /* So far unknown... */
41916+ c->x86_vendor_id[0] = '\0'; /* Unset */
41917+ c->x86_model_id[0] = '\0'; /* Unset */
41918+ c->x86_clflush_size = 64;
41919+ c->x86_cache_alignment = c->x86_clflush_size;
41920+ c->x86_max_cores = 1;
41921+ c->extended_cpuid_level = 0;
41922+ memset(&c->x86_capability, 0, sizeof c->x86_capability);
41923+
41924+ /* Get vendor name */
41925+ cpuid(0x00000000, (unsigned int *)&c->cpuid_level,
41926+ (unsigned int *)&c->x86_vendor_id[0],
41927+ (unsigned int *)&c->x86_vendor_id[8],
41928+ (unsigned int *)&c->x86_vendor_id[4]);
41929+
41930+ get_cpu_vendor(c);
41931+
41932+ /* Initialize the standard set of capabilities */
41933+ /* Note that the vendor-specific code below might override */
41934+
41935+ /* Intel-defined flags: level 0x00000001 */
41936+ if (c->cpuid_level >= 0x00000001) {
41937+ __u32 misc;
41938+ cpuid(0x00000001, &tfms, &misc, &c->x86_capability[4],
41939+ &c->x86_capability[0]);
41940+ c->x86 = (tfms >> 8) & 0xf;
41941+ c->x86_model = (tfms >> 4) & 0xf;
41942+ c->x86_mask = tfms & 0xf;
41943+ if (c->x86 == 0xf)
41944+ c->x86 += (tfms >> 20) & 0xff;
41945+ if (c->x86 >= 0x6)
41946+ c->x86_model += ((tfms >> 16) & 0xF) << 4;
41947+ if (c->x86_capability[0] & (1<<19))
41948+ c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
41949+ } else {
41950+ /* Have CPUID level 0 only - unheard of */
41951+ c->x86 = 4;
41952+ }
41953+
41954+#ifdef CONFIG_SMP
41955+ phys_proc_id[smp_processor_id()] = (cpuid_ebx(1) >> 24) & 0xff;
41956+#endif
41957+}
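
Note the register order used when the vendor string is read above: CPUID leaf 0 returns the 12-byte vendor ID in EBX, EDX, ECX, which is why the code stores EBX at x86_vendor_id[0], EDX at [4] and ECX at [8]. A hedged user-space sketch of the same assembly (GCC/Clang on x86, using the compiler-provided <cpuid.h>; not part of this patch):

    /* Rebuild the vendor string the same way early_identify_cpu() scatters
     * EBX/EDX/ECX into x86_vendor_id[0], [4] and [8]. User-space only. */
    #include <stdio.h>
    #include <string.h>
    #include <cpuid.h>

    int main(void)
    {
            unsigned int eax, ebx, ecx, edx;
            char vendor[13] = { 0 };

            if (!__get_cpuid(0, &eax, &ebx, &ecx, &edx))
                    return 1;

            memcpy(vendor + 0, &ebx, 4);    /* x86_vendor_id[0..3] */
            memcpy(vendor + 4, &edx, 4);    /* x86_vendor_id[4..7] */
            memcpy(vendor + 8, &ecx, 4);    /* x86_vendor_id[8..11] */

            printf("vendor: %s, max basic leaf: %u\n", vendor, eax);
            return 0;
    }
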
41958+
41959+/*
41960+ * This does the hard work of actually picking apart the CPU stuff...
41961+ */
41962+void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
41963+{
41964+ int i;
41965+ u32 xlvl;
41966+
41967+ early_identify_cpu(c);
41968+
41969+ /* AMD-defined flags: level 0x80000001 */
41970+ xlvl = cpuid_eax(0x80000000);
41971+ c->extended_cpuid_level = xlvl;
41972+ if ((xlvl & 0xffff0000) == 0x80000000) {
41973+ if (xlvl >= 0x80000001) {
41974+ c->x86_capability[1] = cpuid_edx(0x80000001);
41975+ c->x86_capability[6] = cpuid_ecx(0x80000001);
41976+ }
41977+ if (xlvl >= 0x80000004)
41978+ get_model_name(c); /* Default name */
41979+ }
41980+
41981+ /* Transmeta-defined flags: level 0x80860001 */
41982+ xlvl = cpuid_eax(0x80860000);
41983+ if ((xlvl & 0xffff0000) == 0x80860000) {
41984+ /* Don't set x86_cpuid_level here for now to not confuse. */
41985+ if (xlvl >= 0x80860001)
41986+ c->x86_capability[2] = cpuid_edx(0x80860001);
41987+ }
41988+
41989+ /*
41990+ * Vendor-specific initialization. In this section we
41991+ * canonicalize the feature flags, meaning if there are
41992+ * features a certain CPU supports which CPUID doesn't
41993+ * tell us, CPUID claiming incorrect flags, or other bugs,
41994+ * we handle them here.
41995+ *
41996+ * At the end of this section, c->x86_capability better
41997+ * indicate the features this CPU genuinely supports!
41998+ */
41999+ switch (c->x86_vendor) {
42000+ case X86_VENDOR_AMD:
42001+ init_amd(c);
42002+ break;
42003+
42004+ case X86_VENDOR_INTEL:
42005+ init_intel(c);
42006+ break;
42007+
42008+ case X86_VENDOR_UNKNOWN:
42009+ default:
42010+ display_cacheinfo(c);
42011+ break;
42012+ }
42013+
42014+ select_idle_routine(c);
42015+ detect_ht(c);
42016+
42017+ /*
42018+ * On SMP, boot_cpu_data holds the common feature set between
42019+ * all CPUs; so make sure that we indicate which features are
42020+ * common between the CPUs. The first time this routine gets
42021+ * executed, c == &boot_cpu_data.
42022+ */
42023+ if (c != &boot_cpu_data) {
42024+ /* AND the already accumulated flags with these */
42025+ for (i = 0 ; i < NCAPINTS ; i++)
42026+ boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
42027+ }
42028+
42029+#ifdef CONFIG_X86_MCE
42030+ mcheck_init(c);
42031+#endif
42032+ if (c == &boot_cpu_data)
42033+ mtrr_bp_init();
42034+ else
42035+ mtrr_ap_init();
42036+#ifdef CONFIG_NUMA
42037+ numa_add_cpu(smp_processor_id());
42038+#endif
42039+}
42040+
42041+
42042+void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
42043+{
42044+ if (c->x86_model_id[0])
42045+ printk("%s", c->x86_model_id);
42046+
42047+ if (c->x86_mask || c->cpuid_level >= 0)
42048+ printk(" stepping %02x\n", c->x86_mask);
42049+ else
42050+ printk("\n");
42051+}
42052+
42053+/*
42054+ * Get CPU information for use by the procfs.
42055+ */
42056+
42057+static int show_cpuinfo(struct seq_file *m, void *v)
42058+{
42059+ struct cpuinfo_x86 *c = v;
42060+
42061+ /*
42062+ * These flag bits must match the definitions in <asm/cpufeature.h>.
42063+ * NULL means this bit is undefined or reserved; either way it doesn't
42064+ * have meaning as far as Linux is concerned. Note that it's important
42065+ * to realize there is a difference between this table and CPUID -- if
42066+ * applications want to get the raw CPUID data, they should access
42067+ * /dev/cpu/<cpu_nr>/cpuid instead.
42068+ */
42069+ static char *x86_cap_flags[] = {
42070+ /* Intel-defined */
42071+ "fpu", "vme", "de", "pse", "tsc", "msr", "pae", "mce",
42072+ "cx8", "apic", NULL, "sep", "mtrr", "pge", "mca", "cmov",
42073+ "pat", "pse36", "pn", "clflush", NULL, "dts", "acpi", "mmx",
42074+ "fxsr", "sse", "sse2", "ss", "ht", "tm", "ia64", NULL,
42075+
42076+ /* AMD-defined */
42077+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
42078+ NULL, NULL, NULL, "syscall", NULL, NULL, NULL, NULL,
42079+ NULL, NULL, NULL, NULL, "nx", NULL, "mmxext", NULL,
42080+ NULL, "fxsr_opt", "rdtscp", NULL, NULL, "lm", "3dnowext", "3dnow",
42081+
42082+ /* Transmeta-defined */
42083+ "recovery", "longrun", NULL, "lrti", NULL, NULL, NULL, NULL,
42084+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
42085+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
42086+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
42087+
42088+ /* Other (Linux-defined) */
42089+ "cxmmx", NULL, "cyrix_arr", "centaur_mcr", NULL,
42090+ "constant_tsc", NULL, NULL,
42091+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
42092+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
42093+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
42094+
42095+ /* Intel-defined (#2) */
42096+ "pni", NULL, NULL, "monitor", "ds_cpl", "vmx", NULL, "est",
42097+ "tm2", NULL, "cid", NULL, NULL, "cx16", "xtpr", NULL,
42098+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
42099+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
42100+
42101+ /* VIA/Cyrix/Centaur-defined */
42102+ NULL, NULL, "rng", "rng_en", NULL, NULL, "ace", "ace_en",
42103+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
42104+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
42105+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
42106+
42107+ /* AMD-defined (#2) */
42108+ "lahf_lm", "cmp_legacy", "svm", NULL, "cr8_legacy", NULL, NULL, NULL,
42109+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
42110+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
42111+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
42112+ };
42113+ static char *x86_power_flags[] = {
42114+ "ts", /* temperature sensor */
42115+ "fid", /* frequency id control */
42116+ "vid", /* voltage id control */
42117+ "ttp", /* thermal trip */
42118+ "tm",
42119+ "stc",
42120+ NULL,
42121+ /* nothing */ /* constant_tsc - moved to flags */
42122+ };
42123+
42124+
42125+#ifdef CONFIG_SMP
42126+ if (!cpu_online(c-cpu_data))
42127+ return 0;
42128+#endif
42129+
42130+ seq_printf(m,"processor\t: %u\n"
42131+ "vendor_id\t: %s\n"
42132+ "cpu family\t: %d\n"
42133+ "model\t\t: %d\n"
42134+ "model name\t: %s\n",
42135+ (unsigned)(c-cpu_data),
42136+ c->x86_vendor_id[0] ? c->x86_vendor_id : "unknown",
42137+ c->x86,
42138+ (int)c->x86_model,
42139+ c->x86_model_id[0] ? c->x86_model_id : "unknown");
42140+
42141+ if (c->x86_mask || c->cpuid_level >= 0)
42142+ seq_printf(m, "stepping\t: %d\n", c->x86_mask);
42143+ else
42144+ seq_printf(m, "stepping\t: unknown\n");
42145+
42146+ if (cpu_has(c,X86_FEATURE_TSC)) {
42147+ unsigned int freq = cpufreq_quick_get((unsigned)(c-cpu_data));
42148+ if (!freq)
42149+ freq = cpu_khz;
42150+ seq_printf(m, "cpu MHz\t\t: %u.%03u\n",
42151+ freq / 1000, (freq % 1000));
42152+ }
42153+
42154+ /* Cache size */
42155+ if (c->x86_cache_size >= 0)
42156+ seq_printf(m, "cache size\t: %d KB\n", c->x86_cache_size);
42157+
42158+#ifdef CONFIG_SMP
42159+ if (smp_num_siblings * c->x86_max_cores > 1) {
42160+ int cpu = c - cpu_data;
42161+ seq_printf(m, "physical id\t: %d\n", phys_proc_id[cpu]);
42162+ seq_printf(m, "siblings\t: %d\n", cpus_weight(cpu_core_map[cpu]));
42163+ seq_printf(m, "core id\t\t: %d\n", cpu_core_id[cpu]);
42164+ seq_printf(m, "cpu cores\t: %d\n", c->booted_cores);
42165+ }
42166+#endif
42167+
42168+ seq_printf(m,
42169+ "fpu\t\t: yes\n"
42170+ "fpu_exception\t: yes\n"
42171+ "cpuid level\t: %d\n"
42172+ "wp\t\t: yes\n"
42173+ "flags\t\t:",
42174+ c->cpuid_level);
42175+
42176+ {
42177+ int i;
42178+ for ( i = 0 ; i < 32*NCAPINTS ; i++ )
42179+ if ( test_bit(i, &c->x86_capability) &&
42180+ x86_cap_flags[i] != NULL )
42181+ seq_printf(m, " %s", x86_cap_flags[i]);
42182+ }
42183+
42184+ seq_printf(m, "\nbogomips\t: %lu.%02lu\n",
42185+ c->loops_per_jiffy/(500000/HZ),
42186+ (c->loops_per_jiffy/(5000/HZ)) % 100);
42187+
42188+ if (c->x86_tlbsize > 0)
42189+ seq_printf(m, "TLB size\t: %d 4K pages\n", c->x86_tlbsize);
42190+ seq_printf(m, "clflush size\t: %d\n", c->x86_clflush_size);
42191+ seq_printf(m, "cache_alignment\t: %d\n", c->x86_cache_alignment);
42192+
42193+ seq_printf(m, "address sizes\t: %u bits physical, %u bits virtual\n",
42194+ c->x86_phys_bits, c->x86_virt_bits);
42195+
42196+ seq_printf(m, "power management:");
42197+ {
42198+ unsigned i;
42199+ for (i = 0; i < 32; i++)
42200+ if (c->x86_power & (1 << i)) {
42201+ if (i < ARRAY_SIZE(x86_power_flags) &&
42202+ x86_power_flags[i])
42203+ seq_printf(m, "%s%s",
42204+ x86_power_flags[i][0]?" ":"",
42205+ x86_power_flags[i]);
42206+ else
42207+ seq_printf(m, " [%d]", i);
42208+ }
42209+ }
42210+
42211+ seq_printf(m, "\n\n");
42212+
42213+ return 0;
42214+}
42215+
42216+static void *c_start(struct seq_file *m, loff_t *pos)
42217+{
42218+ return *pos < NR_CPUS ? cpu_data + *pos : NULL;
42219+}
42220+
42221+static void *c_next(struct seq_file *m, void *v, loff_t *pos)
42222+{
42223+ ++*pos;
42224+ return c_start(m, pos);
42225+}
42226+
42227+static void c_stop(struct seq_file *m, void *v)
42228+{
42229+}
42230+
42231+struct seq_operations cpuinfo_op = {
42232+ .start =c_start,
42233+ .next = c_next,
42234+ .stop = c_stop,
42235+ .show = show_cpuinfo,
42236+};
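
The four callbacks above follow the standard seq_file iterator pattern: ->start() positions the cursor, ->next() advances it, ->show() prints one record, and ->stop() cleans up. A hedged, minimal sketch of the same wiring for a hypothetical three-element array (the demo_* names and data are inventions; only the pattern mirrors cpuinfo_op above):

    /* Minimal seq_file iterator sketch mirroring the cpuinfo_op wiring. */
    #include <linux/kernel.h>
    #include <linux/seq_file.h>

    static int demo_items[3] = { 10, 20, 30 };

    static void *demo_start(struct seq_file *m, loff_t *pos)
    {
            return *pos < 3 ? &demo_items[*pos] : NULL;
    }

    static void *demo_next(struct seq_file *m, void *v, loff_t *pos)
    {
            ++*pos;
            return demo_start(m, pos);
    }

    static void demo_stop(struct seq_file *m, void *v)
    {
    }

    static int demo_show(struct seq_file *m, void *v)
    {
            seq_printf(m, "item\t: %d\n", *(int *)v);
            return 0;
    }

    struct seq_operations demo_op = {
            .start = demo_start,
            .next  = demo_next,
            .stop  = demo_stop,
            .show  = demo_show,
    };
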
42237+
42238+static int __init run_dmi_scan(void)
42239+{
42240+ dmi_scan_machine();
42241+ return 0;
42242+}
42243+core_initcall(run_dmi_scan);
42244+
42245diff -Nur linux-2.6.16.33-noxen/arch/x86_64/kernel/setup64-xen.c linux-2.6.16.33/arch/x86_64/kernel/setup64-xen.c
42246--- linux-2.6.16.33-noxen/arch/x86_64/kernel/setup64-xen.c 1970-01-01 00:00:00.000000000 +0000
42247+++ linux-2.6.16.33/arch/x86_64/kernel/setup64-xen.c 2007-01-08 15:00:45.000000000 +0000
42248@@ -0,0 +1,360 @@
42249+/*
42250+ * X86-64 specific CPU setup.
42251+ * Copyright (C) 1995 Linus Torvalds
42252+ * Copyright 2001, 2002, 2003 SuSE Labs / Andi Kleen.
42253+ * See setup.c for older changelog.
42254+ * $Id: setup64.c,v 1.12 2002/03/21 10:09:17 ak Exp $
42255+ *
42256+ * Jun Nakajima <jun.nakajima@intel.com>
42257+ * Modified for Xen
42258+ *
42259+ */
42260+#include <linux/config.h>
42261+#include <linux/init.h>
42262+#include <linux/kernel.h>
42263+#include <linux/sched.h>
42264+#include <linux/string.h>
42265+#include <linux/bootmem.h>
42266+#include <linux/bitops.h>
42267+#include <linux/module.h>
42268+#include <asm/bootsetup.h>
42269+#include <asm/pda.h>
42270+#include <asm/pgtable.h>
42271+#include <asm/processor.h>
42272+#include <asm/desc.h>
42273+#include <asm/atomic.h>
42274+#include <asm/mmu_context.h>
42275+#include <asm/smp.h>
42276+#include <asm/i387.h>
42277+#include <asm/percpu.h>
42278+#include <asm/proto.h>
42279+#include <asm/sections.h>
42280+#ifdef CONFIG_XEN
42281+#include <asm/hypervisor.h>
42282+#endif
42283+
42284+char x86_boot_params[BOOT_PARAM_SIZE] __initdata = {0,};
42285+
42286+cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
42287+
42288+struct x8664_pda *_cpu_pda[NR_CPUS] __read_mostly;
42289+struct x8664_pda boot_cpu_pda[NR_CPUS] __cacheline_aligned;
42290+
42291+#ifndef CONFIG_X86_NO_IDT
42292+struct desc_ptr idt_descr = { 256 * 16, (unsigned long) idt_table };
42293+#endif
42294+
42295+char boot_cpu_stack[IRQSTACKSIZE] __attribute__((section(".bss.page_aligned")));
42296+
42297+unsigned long __supported_pte_mask __read_mostly = ~0UL;
42298+static int do_not_nx __cpuinitdata = 0;
42299+
42300+/* noexec=on|off
42301+Control non-executable mappings for 64bit processes.
42302+
42303+on Enable (default)
42304+off Disable
42305+*/
42306+int __init nonx_setup(char *str)
42307+{
42308+ if (!strncmp(str, "on", 2)) {
42309+ __supported_pte_mask |= _PAGE_NX;
42310+ do_not_nx = 0;
42311+ } else if (!strncmp(str, "off", 3)) {
42312+ do_not_nx = 1;
42313+ __supported_pte_mask &= ~_PAGE_NX;
42314+ }
42315+ return 0;
42316+}
42317+__setup("noexec=", nonx_setup); /* parsed early actually */
42318+
42319+int force_personality32 = READ_IMPLIES_EXEC;
42320+
42321+/* noexec32=on|off
42322+Control non-executable heap for 32bit processes.
42323+To control the stack too, use noexec=off.
42324+
42325+on PROT_READ does not imply PROT_EXEC for 32bit processes
42326+off PROT_READ implies PROT_EXEC (default)
42327+*/
42328+static int __init nonx32_setup(char *str)
42329+{
42330+ if (!strcmp(str, "on"))
42331+ force_personality32 &= ~READ_IMPLIES_EXEC;
42332+ else if (!strcmp(str, "off"))
42333+ force_personality32 |= READ_IMPLIES_EXEC;
42334+ return 0;
42335+}
42336+__setup("noexec32=", nonx32_setup);
42337+
42338+/*
42339+ * Great future plan:
42340+ * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data.
42341+ * Always point %gs to its beginning
42342+ */
42343+void __init setup_per_cpu_areas(void)
42344+{
42345+ int i;
42346+ unsigned long size;
42347+
42348+#ifdef CONFIG_HOTPLUG_CPU
42349+ prefill_possible_map();
42350+#endif
42351+
42352+ /* Copy section for each CPU (we discard the original) */
42353+ size = ALIGN(__per_cpu_end - __per_cpu_start, SMP_CACHE_BYTES);
42354+#ifdef CONFIG_MODULES
42355+ if (size < PERCPU_ENOUGH_ROOM)
42356+ size = PERCPU_ENOUGH_ROOM;
42357+#endif
42358+
42359+ for_each_cpu_mask (i, cpu_possible_map) {
42360+ char *ptr;
42361+
42362+ if (!NODE_DATA(cpu_to_node(i))) {
42363+ printk("cpu with no node %d, num_online_nodes %d\n",
42364+ i, num_online_nodes());
42365+ ptr = alloc_bootmem(size);
42366+ } else {
42367+ ptr = alloc_bootmem_node(NODE_DATA(cpu_to_node(i)), size);
42368+ }
42369+ if (!ptr)
42370+ panic("Cannot allocate cpu data for CPU %d\n", i);
42371+ cpu_pda(i)->data_offset = ptr - __per_cpu_start;
42372+ memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
42373+ }
42374+}
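
setup_per_cpu_areas() gives every possible CPU its own copy of the per-CPU template section and records, in data_offset, the distance from the template to that copy; per-CPU accesses later add this offset to the template address. A hedged user-space illustration of that offset scheme (hypothetical layout, plain malloc instead of the bootmem allocators used above):

    /* data_offset sketch: each CPU's copy is reached by adding a per-CPU
     * offset to the address of the shared template, as in the loop above. */
    #include <stdio.h>
    #include <stdint.h>
    #include <stdlib.h>
    #include <string.h>

    #define NCPUS 2

    int main(void)
    {
            char template_area[32] = "per-cpu template";   /* __per_cpu_start.. */
            intptr_t data_offset[NCPUS];
            char *copy[NCPUS];

            for (int i = 0; i < NCPUS; i++) {
                    copy[i] = malloc(sizeof(template_area));
                    memcpy(copy[i], template_area, sizeof(template_area));
                    data_offset[i] = (intptr_t)copy[i] - (intptr_t)template_area;
            }

            /* per_cpu(var, cpu) is roughly &var + data_offset[cpu] */
            for (int i = 0; i < NCPUS; i++) {
                    char *p = (char *)((intptr_t)template_area + data_offset[i]);
                    printf("cpu%d copy at %p: %s\n", i, (void *)p, p);
            }

            for (int i = 0; i < NCPUS; i++)
                    free(copy[i]);
            return 0;
    }
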
42375+
42376+#ifdef CONFIG_XEN
42377+static void switch_pt(void)
42378+{
42379+ xen_pt_switch(__pa(init_level4_pgt));
42380+ xen_new_user_pt(__pa(init_level4_user_pgt));
42381+}
42382+
42383+void __cpuinit cpu_gdt_init(struct desc_ptr *gdt_descr)
42384+{
42385+ unsigned long frames[16];
42386+ unsigned long va;
42387+ int f;
42388+
42389+ for (va = gdt_descr->address, f = 0;
42390+ va < gdt_descr->address + gdt_descr->size;
42391+ va += PAGE_SIZE, f++) {
42392+ frames[f] = virt_to_mfn(va);
42393+ make_page_readonly(
42394+ (void *)va, XENFEAT_writable_descriptor_tables);
42395+ }
42396+ if (HYPERVISOR_set_gdt(frames, gdt_descr->size /
42397+ sizeof (struct desc_struct)))
42398+ BUG();
42399+}
42400+#else
42401+static void switch_pt(void)
42402+{
42403+ asm volatile("movq %0,%%cr3" :: "r" (__pa_symbol(&init_level4_pgt)));
42404+}
42405+
42406+void __init cpu_gdt_init(struct desc_ptr *gdt_descr)
42407+{
42408+ asm volatile("lgdt %0" :: "m" (*gdt_descr));
42409+ asm volatile("lidt %0" :: "m" (idt_descr));
42410+}
42411+#endif
42412+
42413+void pda_init(int cpu)
42414+{
42415+ struct x8664_pda *pda = cpu_pda(cpu);
42416+
42417+ /* Set up data that may be needed in __get_free_pages early */
42418+ asm volatile("movl %0,%%fs ; movl %0,%%gs" :: "r" (0));
42419+#ifndef CONFIG_XEN
42420+ wrmsrl(MSR_GS_BASE, pda);
42421+#else
42422+ HYPERVISOR_set_segment_base(SEGBASE_GS_KERNEL, (unsigned long)pda);
42423+#endif
42424+ pda->cpunumber = cpu;
42425+ pda->irqcount = -1;
42426+ pda->kernelstack =
42427+ (unsigned long)stack_thread_info() - PDA_STACKOFFSET + THREAD_SIZE;
42428+ pda->active_mm = &init_mm;
42429+ pda->mmu_state = 0;
42430+
42431+ if (cpu == 0) {
42432+#ifdef CONFIG_XEN
42433+ xen_init_pt();
42434+#endif
42435+ /* others are initialized in smpboot.c */
42436+ pda->pcurrent = &init_task;
42437+ pda->irqstackptr = boot_cpu_stack;
42438+ } else {
42439+ pda->irqstackptr = (char *)
42440+ __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER);
42441+ if (!pda->irqstackptr)
42442+ panic("cannot allocate irqstack for cpu %d", cpu);
42443+ }
42444+
42445+ switch_pt();
42446+
42447+ pda->irqstackptr += IRQSTACKSIZE-64;
42448+}
42449+
42450+#ifndef CONFIG_X86_NO_TSS
42451+char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]
42452+__attribute__((section(".bss.page_aligned")));
42453+#endif
42454+
42455+/* May not be marked __init: used by software suspend */
42456+void syscall_init(void)
42457+{
42458+#ifndef CONFIG_XEN
42459+ /*
42460+ * LSTAR and STAR live in a somewhat strange symbiosis.
42461+ * They both write to the same internal register. STAR allows setting CS/DS,
42462+ * but only a 32bit target. LSTAR sets the 64bit rip.
42463+ */
42464+ wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32);
42465+ wrmsrl(MSR_LSTAR, system_call);
42466+
42467+ /* Flags to clear on syscall */
42468+ wrmsrl(MSR_SYSCALL_MASK, EF_TF|EF_DF|EF_IE|0x3000);
42469+#endif
42470+#ifdef CONFIG_IA32_EMULATION
42471+ syscall32_cpu_init ();
42472+#endif
42473+}
42474+
42475+void __cpuinit check_efer(void)
42476+{
42477+ unsigned long efer;
42478+
42479+ rdmsrl(MSR_EFER, efer);
42480+ if (!(efer & EFER_NX) || do_not_nx) {
42481+ __supported_pte_mask &= ~_PAGE_NX;
42482+ }
42483+}
42484+
42485+/*
42486+ * cpu_init() initializes state that is per-CPU. Some data is already
42487+ * initialized (naturally) in the bootstrap process, such as the GDT
42488+ * and IDT. We reload them nevertheless; this function acts as a
42489+ * 'CPU state barrier': nothing should get across.
42490+ * A lot of state is already set up in PDA init.
42491+ */
42492+void __cpuinit cpu_init (void)
42493+{
42494+ int cpu = stack_smp_processor_id();
42495+#ifndef CONFIG_X86_NO_TSS
42496+ struct tss_struct *t = &per_cpu(init_tss, cpu);
42497+ unsigned long v;
42498+ char *estacks = NULL;
42499+ unsigned i;
42500+#endif
42501+ struct task_struct *me;
42502+
42503+ /* CPU 0 is initialised in head64.c */
42504+ if (cpu != 0) {
42505+ pda_init(cpu);
42506+ zap_low_mappings(cpu);
42507+ }
42508+#ifndef CONFIG_X86_NO_TSS
42509+ else
42510+ estacks = boot_exception_stacks;
42511+#endif
42512+
42513+ me = current;
42514+
42515+ if (cpu_test_and_set(cpu, cpu_initialized))
42516+ panic("CPU#%d already initialized!\n", cpu);
42517+
42518+ printk("Initializing CPU#%d\n", cpu);
42519+
42520+ clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
42521+
42522+ /*
42523+ * Initialize the per-CPU GDT with the boot GDT,
42524+ * and set up the GDT descriptor:
42525+ */
42526+#ifndef CONFIG_XEN
42527+ if (cpu)
42528+ memcpy(cpu_gdt(cpu), cpu_gdt_table, GDT_SIZE);
42529+#endif
42530+
42531+ cpu_gdt_descr[cpu].size = GDT_SIZE;
42532+ cpu_gdt_init(&cpu_gdt_descr[cpu]);
42533+
42534+ memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8);
42535+ syscall_init();
42536+
42537+ wrmsrl(MSR_FS_BASE, 0);
42538+ wrmsrl(MSR_KERNEL_GS_BASE, 0);
42539+ barrier();
42540+
42541+ check_efer();
42542+
42543+#ifndef CONFIG_X86_NO_TSS
42544+ /*
42545+ * set up and load the per-CPU TSS
42546+ */
42547+ for (v = 0; v < N_EXCEPTION_STACKS; v++) {
42548+ if (cpu) {
42549+ static const unsigned int order[N_EXCEPTION_STACKS] = {
42550+ [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER,
42551+ [DEBUG_STACK - 1] = DEBUG_STACK_ORDER
42552+ };
42553+
42554+ estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]);
42555+ if (!estacks)
42556+ panic("Cannot allocate exception stack %ld %d\n",
42557+ v, cpu);
42558+ }
42559+ switch (v + 1) {
42560+#if DEBUG_STKSZ > EXCEPTION_STKSZ
42561+ case DEBUG_STACK:
42562+ cpu_pda[cpu].debugstack = (unsigned long)estacks;
42563+ estacks += DEBUG_STKSZ;
42564+ break;
42565+#endif
42566+ default:
42567+ estacks += EXCEPTION_STKSZ;
42568+ break;
42569+ }
42570+ t->ist[v] = (unsigned long)estacks;
42571+ }
42572+
42573+ t->io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
42574+ /*
42575+ * <= is required because the CPU will access up to
42576+ * 8 bits beyond the end of the IO permission bitmap.
42577+ */
42578+ for (i = 0; i <= IO_BITMAP_LONGS; i++)
42579+ t->io_bitmap[i] = ~0UL;
42580+#endif
42581+
42582+ atomic_inc(&init_mm.mm_count);
42583+ me->active_mm = &init_mm;
42584+ if (me->mm)
42585+ BUG();
42586+ enter_lazy_tlb(&init_mm, me);
42587+
42588+#ifndef CONFIG_X86_NO_TSS
42589+ set_tss_desc(cpu, t);
42590+#endif
42591+#ifndef CONFIG_XEN
42592+ load_TR_desc();
42593+#endif
42594+ load_LDT(&init_mm.context);
42595+
42596+ /*
42597+ * Clear all 6 debug registers:
42598+ */
42599+
42600+ set_debug(0UL, 0);
42601+ set_debug(0UL, 1);
42602+ set_debug(0UL, 2);
42603+ set_debug(0UL, 3);
42604+ set_debug(0UL, 6);
42605+ set_debug(0UL, 7);
42606+
42607+ fpu_init();
42608+}
42609diff -Nur linux-2.6.16.33-noxen/arch/x86_64/kernel/smp-xen.c linux-2.6.16.33/arch/x86_64/kernel/smp-xen.c
42610--- linux-2.6.16.33-noxen/arch/x86_64/kernel/smp-xen.c 1970-01-01 00:00:00.000000000 +0000
42611+++ linux-2.6.16.33/arch/x86_64/kernel/smp-xen.c 2007-01-08 15:00:45.000000000 +0000
42612@@ -0,0 +1,596 @@
42613+/*
42614+ * Intel SMP support routines.
42615+ *
42616+ * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
42617+ * (c) 1998-99, 2000 Ingo Molnar <mingo@redhat.com>
42618+ * (c) 2002,2003 Andi Kleen, SuSE Labs.
42619+ *
42620+ * This code is released under the GNU General Public License version 2 or
42621+ * later.
42622+ */
42623+
42624+#include <linux/init.h>
42625+
42626+#include <linux/mm.h>
42627+#include <linux/delay.h>
42628+#include <linux/spinlock.h>
42629+#include <linux/smp_lock.h>
42630+#include <linux/smp.h>
42631+#include <linux/kernel_stat.h>
42632+#include <linux/mc146818rtc.h>
42633+#include <linux/interrupt.h>
42634+
42635+#include <asm/mtrr.h>
42636+#include <asm/pgalloc.h>
42637+#include <asm/tlbflush.h>
42638+#include <asm/mach_apic.h>
42639+#include <asm/mmu_context.h>
42640+#include <asm/proto.h>
42641+#include <asm/apicdef.h>
42642+#include <asm/idle.h>
42643+#ifdef CONFIG_XEN
42644+#include <xen/evtchn.h>
42645+#endif
42646+
42647+#ifndef CONFIG_XEN
42648+/*
42649+ * Smarter SMP flushing macros.
42650+ * c/o Linus Torvalds.
42651+ *
42652+ * These mean you can really definitely utterly forget about
42653+ * writing to user space from interrupts. (It's not allowed anyway.)
42654+ *
42655+ * Optimizations Manfred Spraul <manfred@colorfullife.com>
42656+ *
42657+ * More scalable flush, from Andi Kleen
42658+ *
42659+ * To avoid global state use 8 different call vectors.
42660+ * Each CPU uses a specific vector to trigger flushes on other
42661+ * CPUs. Depending on the received vector the target CPUs look into
42662+ * the right per cpu variable for the flush data.
42663+ *
42664+ * With more than 8 CPUs they are hashed to the 8 available
42665+ * vectors. The limited global vector space forces us to this right now.
42666+ * In future when interrupts are split into per CPU domains this could be
42667+ * fixed, at the cost of triggering multiple IPIs in some cases.
42668+ */
42669+
42670+union smp_flush_state {
42671+ struct {
42672+ cpumask_t flush_cpumask;
42673+ struct mm_struct *flush_mm;
42674+ unsigned long flush_va;
42675+#define FLUSH_ALL -1ULL
42676+ spinlock_t tlbstate_lock;
42677+ };
42678+ char pad[SMP_CACHE_BYTES];
42679+} ____cacheline_aligned;
42680+
42681+/* State is put into the per CPU data section, but padded
42682+ to a full cache line because other CPUs can access it and we don't
42683+ want false sharing in the per cpu data segment. */
42684+static DEFINE_PER_CPU(union smp_flush_state, flush_state);
42685+#endif
42686+
42687+/*
42688+ * We cannot call mmdrop() because we are in interrupt context,
42689+ * instead update mm->cpu_vm_mask.
42690+ */
42691+static inline void leave_mm(unsigned long cpu)
42692+{
42693+ if (read_pda(mmu_state) == TLBSTATE_OK)
42694+ BUG();
42695+ clear_bit(cpu, &read_pda(active_mm)->cpu_vm_mask);
42696+ load_cr3(swapper_pg_dir);
42697+}
42698+
42699+#ifndef CONFIG_XEN
42700+/*
42701+ *
42702+ * The flush IPI assumes that a thread switch happens in this order:
42703+ * [cpu0: the cpu that switches]
42704+ * 1) switch_mm() either 1a) or 1b)
42705+ * 1a) thread switch to a different mm
42706+ * 1a1) clear_bit(cpu, &old_mm->cpu_vm_mask);
42707+ * Stop ipi delivery for the old mm. This is not synchronized with
42708+ * the other cpus, but smp_invalidate_interrupt ignore flush ipis
42709+ * for the wrong mm, and in the worst case we perform a superfluous
42710+ * tlb flush.
42711+ * 1a2) set cpu mmu_state to TLBSTATE_OK
42712+ * Now the smp_invalidate_interrupt won't call leave_mm if cpu0
42713+ * was in lazy tlb mode.
42714+ * 1a3) update cpu active_mm
42715+ * Now cpu0 accepts tlb flushes for the new mm.
42716+ * 1a4) set_bit(cpu, &new_mm->cpu_vm_mask);
42717+ * Now the other cpus will send tlb flush ipis.
42718+ * 1a4) change cr3.
42719+ * 1b) thread switch without mm change
42720+ * cpu active_mm is correct, cpu0 already handles
42721+ * flush ipis.
42722+ * 1b1) set cpu mmu_state to TLBSTATE_OK
42723+ * 1b2) test_and_set the cpu bit in cpu_vm_mask.
42724+ * Atomically set the bit [other cpus will start sending flush ipis],
42725+ * and test the bit.
42726+ * 1b3) if the bit was 0: leave_mm was called, flush the tlb.
42727+ * 2) switch %%esp, ie current
42728+ *
42729+ * The interrupt must handle 2 special cases:
42730+ * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm.
42731+ * - the cpu performs speculative tlb reads, i.e. even if the cpu only
42732+ * runs in kernel space, the cpu could load tlb entries for user space
42733+ * pages.
42734+ *
42735+ * The good news is that cpu mmu_state is local to each cpu, no
42736+ * write/read ordering problems.
42737+ */
42738+
42739+/*
42740+ * TLB flush IPI:
42741+ *
42742+ * 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
42743+ * 2) Leave the mm if we are in the lazy tlb mode.
42744+ *
42745+ * Interrupts are disabled.
42746+ */
42747+
42748+asmlinkage void smp_invalidate_interrupt(struct pt_regs *regs)
42749+{
42750+ int cpu;
42751+ int sender;
42752+ union smp_flush_state *f;
42753+
42754+ cpu = smp_processor_id();
42755+ /*
42756+ * orig_rax contains the interrupt vector - 256.
42757+ * Use that to determine where the sender put the data.
42758+ */
42759+ sender = regs->orig_rax + 256 - INVALIDATE_TLB_VECTOR_START;
42760+ f = &per_cpu(flush_state, sender);
42761+
42762+ if (!cpu_isset(cpu, f->flush_cpumask))
42763+ goto out;
42764+ /*
42765+ * This was a BUG(), but until someone can quote me the
42766+ * line from the Intel manual that guarantees an IPI to
42767+ * multiple CPUs is retried _only_ on the erroring CPUs,
42768+ * it's staying as a return.
42769+ *
42770+ * BUG();
42771+ */
42772+
42773+ if (f->flush_mm == read_pda(active_mm)) {
42774+ if (read_pda(mmu_state) == TLBSTATE_OK) {
42775+ if (f->flush_va == FLUSH_ALL)
42776+ local_flush_tlb();
42777+ else
42778+ __flush_tlb_one(f->flush_va);
42779+ } else
42780+ leave_mm(cpu);
42781+ }
42782+out:
42783+ ack_APIC_irq();
42784+ cpu_clear(cpu, f->flush_cpumask);
42785+}
42786+
42787+static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm,
42788+ unsigned long va)
42789+{
42790+ int sender;
42791+ union smp_flush_state *f;
42792+
42793+ /* Caller has disabled preemption */
42794+ sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS;
42795+ f = &per_cpu(flush_state, sender);
42796+
42797+ /* Could avoid this lock when
42798+ num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is
42799+ probably not worth checking this for a cache-hot lock. */
42800+ spin_lock(&f->tlbstate_lock);
42801+
42802+ f->flush_mm = mm;
42803+ f->flush_va = va;
42804+ cpus_or(f->flush_cpumask, cpumask, f->flush_cpumask);
42805+
42806+ /*
42807+ * We have to send the IPI only to
42808+ * CPUs affected.
42809+ */
42810+ send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR_START + sender);
42811+
42812+ while (!cpus_empty(f->flush_cpumask))
42813+ cpu_relax();
42814+
42815+ f->flush_mm = NULL;
42816+ f->flush_va = 0;
42817+ spin_unlock(&f->tlbstate_lock);
42818+}
42819+
42820+int __cpuinit init_smp_flush(void)
42821+{
42822+ int i;
42823+ for_each_cpu_mask(i, cpu_possible_map) {
42824+ spin_lock_init(&per_cpu(flush_state.tlbstate_lock, i));
42825+ }
42826+ return 0;
42827+}
42828+
42829+core_initcall(init_smp_flush);
42830+
42831+void flush_tlb_current_task(void)
42832+{
42833+ struct mm_struct *mm = current->mm;
42834+ cpumask_t cpu_mask;
42835+
42836+ preempt_disable();
42837+ cpu_mask = mm->cpu_vm_mask;
42838+ cpu_clear(smp_processor_id(), cpu_mask);
42839+
42840+ local_flush_tlb();
42841+ if (!cpus_empty(cpu_mask))
42842+ flush_tlb_others(cpu_mask, mm, FLUSH_ALL);
42843+ preempt_enable();
42844+}
42845+
42846+void flush_tlb_mm (struct mm_struct * mm)
42847+{
42848+ cpumask_t cpu_mask;
42849+
42850+ preempt_disable();
42851+ cpu_mask = mm->cpu_vm_mask;
42852+ cpu_clear(smp_processor_id(), cpu_mask);
42853+
42854+ if (current->active_mm == mm) {
42855+ if (current->mm)
42856+ local_flush_tlb();
42857+ else
42858+ leave_mm(smp_processor_id());
42859+ }
42860+ if (!cpus_empty(cpu_mask))
42861+ flush_tlb_others(cpu_mask, mm, FLUSH_ALL);
42862+
42863+ preempt_enable();
42864+}
42865+
42866+void flush_tlb_page(struct vm_area_struct * vma, unsigned long va)
42867+{
42868+ struct mm_struct *mm = vma->vm_mm;
42869+ cpumask_t cpu_mask;
42870+
42871+ preempt_disable();
42872+ cpu_mask = mm->cpu_vm_mask;
42873+ cpu_clear(smp_processor_id(), cpu_mask);
42874+
42875+ if (current->active_mm == mm) {
42876+ if(current->mm)
42877+ __flush_tlb_one(va);
42878+ else
42879+ leave_mm(smp_processor_id());
42880+ }
42881+
42882+ if (!cpus_empty(cpu_mask))
42883+ flush_tlb_others(cpu_mask, mm, va);
42884+
42885+ preempt_enable();
42886+}
42887+
42888+static void do_flush_tlb_all(void* info)
42889+{
42890+ unsigned long cpu = smp_processor_id();
42891+
42892+ __flush_tlb_all();
42893+ if (read_pda(mmu_state) == TLBSTATE_LAZY)
42894+ leave_mm(cpu);
42895+}
42896+
42897+void flush_tlb_all(void)
42898+{
42899+ on_each_cpu(do_flush_tlb_all, NULL, 1, 1);
42900+}
42901+#else
42902+asmlinkage void smp_invalidate_interrupt (void)
42903+{ return; }
42904+void flush_tlb_current_task(void)
42905+{ xen_tlb_flush_mask(&current->mm->cpu_vm_mask); }
42906+void flush_tlb_mm (struct mm_struct * mm)
42907+{ xen_tlb_flush_mask(&mm->cpu_vm_mask); }
42908+void flush_tlb_page(struct vm_area_struct * vma, unsigned long va)
42909+{ xen_invlpg_mask(&vma->vm_mm->cpu_vm_mask, va); }
42910+void flush_tlb_all(void)
42911+{ xen_tlb_flush_all(); }
42912+#endif /* Xen */
42913+
42914+/*
42915+ * this function sends a 'reschedule' IPI to another CPU.
42916+ * it goes straight through and wastes no time serializing
42917+ * anything. Worst case is that we lose a reschedule ...
42918+ */
42919+
42920+void smp_send_reschedule(int cpu)
42921+{
42922+ send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR);
42923+}
42924+
42925+/*
42926+ * Structure and data for smp_call_function(). This is designed to minimise
42927+ * static memory requirements. It also looks cleaner.
42928+ */
42929+static DEFINE_SPINLOCK(call_lock);
42930+
42931+struct call_data_struct {
42932+ void (*func) (void *info);
42933+ void *info;
42934+ atomic_t started;
42935+ atomic_t finished;
42936+ int wait;
42937+};
42938+
42939+static struct call_data_struct * call_data;
42940+
42941+void lock_ipi_call_lock(void)
42942+{
42943+ spin_lock_irq(&call_lock);
42944+}
42945+
42946+void unlock_ipi_call_lock(void)
42947+{
42948+ spin_unlock_irq(&call_lock);
42949+}
42950+
42951+/*
42952+ * this function sends a 'generic call function' IPI to one other CPU
42953+ * in the system.
42954+ *
42955+ * cpu is a standard Linux logical CPU number.
42956+ */
42957+static void
42958+__smp_call_function_single(int cpu, void (*func) (void *info), void *info,
42959+ int nonatomic, int wait)
42960+{
42961+ struct call_data_struct data;
42962+ int cpus = 1;
42963+
42964+ data.func = func;
42965+ data.info = info;
42966+ atomic_set(&data.started, 0);
42967+ data.wait = wait;
42968+ if (wait)
42969+ atomic_set(&data.finished, 0);
42970+
42971+ call_data = &data;
42972+ wmb();
42973+ /* Send a message to all other CPUs and wait for them to respond */
42974+ send_IPI_mask(cpumask_of_cpu(cpu), CALL_FUNCTION_VECTOR);
42975+
42976+ /* Wait for response */
42977+ while (atomic_read(&data.started) != cpus)
42978+ cpu_relax();
42979+
42980+ if (!wait)
42981+ return;
42982+
42983+ while (atomic_read(&data.finished) != cpus)
42984+ cpu_relax();
42985+}
42986+
42987+/*
42988+ * smp_call_function_single - Run a function on another CPU
42989+ * @func: The function to run. This must be fast and non-blocking.
42990+ * @info: An arbitrary pointer to pass to the function.
42991+ * @nonatomic: Currently unused.
42992+ * @wait: If true, wait until function has completed on other CPUs.
42993+ *
42994+ * Returns 0 on success, else a negative status code.
42995+ *
42996+ * Does not return until the remote CPU is nearly ready to execute <func>,
42997+ * or has already executed it.
42998+ */
42999+
43000+int smp_call_function_single (int cpu, void (*func) (void *info), void *info,
43001+ int nonatomic, int wait)
43002+{
43003+ /* prevent preemption and reschedule on another processor */
43004+ int me = get_cpu();
43005+ if (cpu == me) {
43006+ WARN_ON(1);
43007+ put_cpu();
43008+ return -EBUSY;
43009+ }
43010+ spin_lock_bh(&call_lock);
43011+ __smp_call_function_single(cpu, func, info, nonatomic, wait);
43012+ spin_unlock_bh(&call_lock);
43013+ put_cpu();
43014+ return 0;
43015+}
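
A hedged usage sketch for the smp_call_function_single() variant defined above: the callback, the counter and the target CPU number are hypothetical, and the call is assumed to be made from a context where the target CPU is online and interrupts are enabled.

    /* Run a fast, non-blocking callback on one other CPU and wait for it. */
    #include <linux/kernel.h>
    #include <linux/smp.h>
    #include <asm/atomic.h>

    static atomic_t hits = ATOMIC_INIT(0);

    static void bump_counter(void *info)
    {
            atomic_inc((atomic_t *)info);
    }

    static void demo_remote_call(void)
    {
            int target = 1;         /* assumed: an online CPU other than ours */

            if (smp_call_function_single(target, bump_counter, &hits,
                                         0 /* nonatomic, unused */,
                                         1 /* wait for completion */) == 0)
                    printk(KERN_INFO "remote hits: %d\n", atomic_read(&hits));
    }
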
43016+
43017+/*
43018+ * this function sends a 'generic call function' IPI to all other CPUs
43019+ * in the system.
43020+ */
43021+static void __smp_call_function (void (*func) (void *info), void *info,
43022+ int nonatomic, int wait)
43023+{
43024+ struct call_data_struct data;
43025+ int cpus = num_online_cpus()-1;
43026+
43027+ if (!cpus)
43028+ return;
43029+
43030+ data.func = func;
43031+ data.info = info;
43032+ atomic_set(&data.started, 0);
43033+ data.wait = wait;
43034+ if (wait)
43035+ atomic_set(&data.finished, 0);
43036+
43037+ call_data = &data;
43038+ wmb();
43039+ /* Send a message to all other CPUs and wait for them to respond */
43040+ send_IPI_allbutself(CALL_FUNCTION_VECTOR);
43041+
43042+ /* Wait for response */
43043+ while (atomic_read(&data.started) != cpus)
43044+#ifndef CONFIG_XEN
43045+ cpu_relax();
43046+#else
43047+ barrier();
43048+#endif
43049+
43050+ if (!wait)
43051+ return;
43052+
43053+ while (atomic_read(&data.finished) != cpus)
43054+#ifndef CONFIG_XEN
43055+ cpu_relax();
43056+#else
43057+ barrier();
43058+#endif
43059+}
43060+
43061+/*
43062+ * smp_call_function - run a function on all other CPUs.
43063+ * @func: The function to run. This must be fast and non-blocking.
43064+ * @info: An arbitrary pointer to pass to the function.
43065+ * @nonatomic: currently unused.
43066+ * @wait: If true, wait (atomically) until function has completed on other
43067+ * CPUs.
43068+ *
43069+ * Returns 0 on success, else a negative status code. Does not return until
43070+ * remote CPUs are nearly ready to execute func, or have already executed it.
43071+ *
43072+ * You must not call this function with disabled interrupts or from a
43073+ * hardware interrupt handler or from a bottom half handler.
43074+ * Actually there are a few legal cases, like panic.
43075+ */
43076+int smp_call_function (void (*func) (void *info), void *info, int nonatomic,
43077+ int wait)
43078+{
43079+ spin_lock(&call_lock);
43080+ __smp_call_function(func,info,nonatomic,wait);
43081+ spin_unlock(&call_lock);
43082+ return 0;
43083+}
43084+
43085+void smp_stop_cpu(void)
43086+{
43087+ unsigned long flags;
43088+ /*
43089+ * Remove this CPU:
43090+ */
43091+ cpu_clear(smp_processor_id(), cpu_online_map);
43092+ local_irq_save(flags);
43093+#ifndef CONFIG_XEN
43094+ disable_local_APIC();
43095+#endif
43096+ local_irq_restore(flags);
43097+}
43098+
43099+static void smp_really_stop_cpu(void *dummy)
43100+{
43101+ smp_stop_cpu();
43102+ for (;;)
43103+ halt();
43104+}
43105+
43106+void smp_send_stop(void)
43107+{
43108+ int nolock = 0;
43109+#ifndef CONFIG_XEN
43110+ if (reboot_force)
43111+ return;
43112+#endif
43113+ /* Don't deadlock on the call lock in panic */
43114+ if (!spin_trylock(&call_lock)) {
43115+ /* ignore locking because we have panicked anyway */
43116+ nolock = 1;
43117+ }
43118+ __smp_call_function(smp_really_stop_cpu, NULL, 0, 0);
43119+ if (!nolock)
43120+ spin_unlock(&call_lock);
43121+
43122+ local_irq_disable();
43123+#ifndef CONFIG_XEN
43124+ disable_local_APIC();
43125+#endif
43126+ local_irq_enable();
43127+}
43128+
43129+/*
43130+ * Reschedule call back. Nothing to do,
43131+ * all the work is done automatically when
43132+ * we return from the interrupt.
43133+ */
43134+#ifndef CONFIG_XEN
43135+asmlinkage void smp_reschedule_interrupt(void)
43136+#else
43137+asmlinkage irqreturn_t smp_reschedule_interrupt(void)
43138+#endif
43139+{
43140+#ifndef CONFIG_XEN
43141+ ack_APIC_irq();
43142+#else
43143+ return IRQ_HANDLED;
43144+#endif
43145+}
43146+
43147+#ifndef CONFIG_XEN
43148+asmlinkage void smp_call_function_interrupt(void)
43149+#else
43150+asmlinkage irqreturn_t smp_call_function_interrupt(void)
43151+#endif
43152+{
43153+ void (*func) (void *info) = call_data->func;
43154+ void *info = call_data->info;
43155+ int wait = call_data->wait;
43156+
43157+#ifndef CONFIG_XEN
43158+ ack_APIC_irq();
43159+#endif
43160+ /*
43161+ * Notify initiating CPU that I've grabbed the data and am
43162+ * about to execute the function
43163+ */
43164+ mb();
43165+ atomic_inc(&call_data->started);
43166+ /*
43167+ * At this point the info structure may be out of scope unless wait==1
43168+ */
43169+ exit_idle();
43170+ irq_enter();
43171+ (*func)(info);
43172+ irq_exit();
43173+ if (wait) {
43174+ mb();
43175+ atomic_inc(&call_data->finished);
43176+ }
43177+#ifdef CONFIG_XEN
43178+ return IRQ_HANDLED;
43179+#endif
43180+}
43181+
43182+int safe_smp_processor_id(void)
43183+{
43184+#ifdef CONFIG_XEN
43185+ return smp_processor_id();
43186+#else
43187+ int apicid, i;
43188+
43189+ if (disable_apic)
43190+ return 0;
43191+
43192+ apicid = hard_smp_processor_id();
43193+ if (x86_cpu_to_apicid[apicid] == apicid)
43194+ return apicid;
43195+
43196+ for (i = 0; i < NR_CPUS; ++i) {
43197+ if (x86_cpu_to_apicid[i] == apicid)
43198+ return i;
43199+ }
43200+
43201+ /* No entries in x86_cpu_to_apicid? Either no MPS|ACPI,
43202+ * or called too early. Either way, we must be CPU 0. */
43203+ if (x86_cpu_to_apicid[0] == BAD_APICID)
43204+ return 0;
43205+
43206+ return 0; /* Should not happen */
43207+#endif
43208+}
43209diff -Nur linux-2.6.16.33-noxen/arch/x86_64/kernel/smp.c linux-2.6.16.33/arch/x86_64/kernel/smp.c
43210--- linux-2.6.16.33-noxen/arch/x86_64/kernel/smp.c 2006-11-22 18:06:31.000000000 +0000
43211+++ linux-2.6.16.33/arch/x86_64/kernel/smp.c 2007-05-23 21:00:01.000000000 +0000
43212@@ -135,10 +135,10 @@
43213
43214 cpu = smp_processor_id();
43215 /*
43216- * orig_rax contains the interrupt vector - 256.
43217+ * orig_rax contains the negated interrupt vector.
43218 * Use that to determine where the sender put the data.
43219 */
43220- sender = regs->orig_rax + 256 - INVALIDATE_TLB_VECTOR_START;
43221+ sender = ~regs->orig_rax - INVALIDATE_TLB_VECTOR_START;
43222 f = &per_cpu(flush_state, sender);
43223
43224 if (!cpu_isset(cpu, f->flush_cpumask))
43225diff -Nur linux-2.6.16.33-noxen/arch/x86_64/kernel/traps-xen.c linux-2.6.16.33/arch/x86_64/kernel/traps-xen.c
43226--- linux-2.6.16.33-noxen/arch/x86_64/kernel/traps-xen.c 1970-01-01 00:00:00.000000000 +0000
43227+++ linux-2.6.16.33/arch/x86_64/kernel/traps-xen.c 2007-01-08 15:00:45.000000000 +0000
43228@@ -0,0 +1,1028 @@
43229+/*
43230+ * linux/arch/x86-64/traps.c
43231+ *
43232+ * Copyright (C) 1991, 1992 Linus Torvalds
43233+ * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
43234+ *
43235+ * Pentium III FXSR, SSE support
43236+ * Gareth Hughes <gareth@valinux.com>, May 2000
43237+ *
43238+ * $Id: traps.c,v 1.36 2002/03/24 11:09:10 ak Exp $
43239+ */
43240+
43241+/*
43242+ * 'Traps.c' handles hardware traps and faults after we have saved some
43243+ * state in 'entry.S'.
43244+ */
43245+#include <linux/config.h>
43246+#include <linux/sched.h>
43247+#include <linux/kernel.h>
43248+#include <linux/string.h>
43249+#include <linux/errno.h>
43250+#include <linux/ptrace.h>
43251+#include <linux/timer.h>
43252+#include <linux/mm.h>
43253+#include <linux/init.h>
43254+#include <linux/delay.h>
43255+#include <linux/spinlock.h>
43256+#include <linux/interrupt.h>
43257+#include <linux/module.h>
43258+#include <linux/moduleparam.h>
43259+#include <linux/nmi.h>
43260+#include <linux/kprobes.h>
43261+#include <linux/kexec.h>
43262+
43263+#include <asm/system.h>
43264+#include <asm/uaccess.h>
43265+#include <asm/io.h>
43266+#include <asm/atomic.h>
43267+#include <asm/debugreg.h>
43268+#include <asm/desc.h>
43269+#include <asm/i387.h>
43270+#include <asm/kdebug.h>
43271+#include <asm/processor.h>
43272+
43273+#include <asm/smp.h>
43274+#include <asm/pgalloc.h>
43275+#include <asm/pda.h>
43276+#include <asm/proto.h>
43277+#include <asm/nmi.h>
43278+
43279+#ifndef CONFIG_X86_NO_IDT
43280+extern struct gate_struct idt_table[256];
43281+#endif
43282+
43283+asmlinkage void divide_error(void);
43284+asmlinkage void debug(void);
43285+asmlinkage void nmi(void);
43286+asmlinkage void int3(void);
43287+asmlinkage void overflow(void);
43288+asmlinkage void bounds(void);
43289+asmlinkage void invalid_op(void);
43290+asmlinkage void device_not_available(void);
43291+asmlinkage void double_fault(void);
43292+asmlinkage void coprocessor_segment_overrun(void);
43293+asmlinkage void invalid_TSS(void);
43294+asmlinkage void segment_not_present(void);
43295+asmlinkage void stack_segment(void);
43296+asmlinkage void general_protection(void);
43297+asmlinkage void page_fault(void);
43298+asmlinkage void coprocessor_error(void);
43299+asmlinkage void simd_coprocessor_error(void);
43300+asmlinkage void reserved(void);
43301+asmlinkage void alignment_check(void);
43302+asmlinkage void machine_check(void);
43303+asmlinkage void spurious_interrupt_bug(void);
43304+
43305+struct notifier_block *die_chain;
43306+static DEFINE_SPINLOCK(die_notifier_lock);
43307+
43308+int register_die_notifier(struct notifier_block *nb)
43309+{
43310+ int err = 0;
43311+ unsigned long flags;
43312+ spin_lock_irqsave(&die_notifier_lock, flags);
43313+ err = notifier_chain_register(&die_chain, nb);
43314+ spin_unlock_irqrestore(&die_notifier_lock, flags);
43315+ return err;
43316+}
43317+
43318+static inline void conditional_sti(struct pt_regs *regs)
43319+{
43320+ if (regs->eflags & X86_EFLAGS_IF)
43321+ local_irq_enable();
43322+}
43323+
43324+static inline void preempt_conditional_sti(struct pt_regs *regs)
43325+{
43326+ preempt_disable();
43327+ if (regs->eflags & X86_EFLAGS_IF)
43328+ local_irq_enable();
43329+}
43330+
43331+static inline void preempt_conditional_cli(struct pt_regs *regs)
43332+{
43333+ if (regs->eflags & X86_EFLAGS_IF)
43334+ local_irq_disable();
43335+ preempt_enable_no_resched();
43336+}
43337+
43338+static int kstack_depth_to_print = 10;
43339+
43340+#ifdef CONFIG_KALLSYMS
43341+#include <linux/kallsyms.h>
43342+int printk_address(unsigned long address)
43343+{
43344+ unsigned long offset = 0, symsize;
43345+ const char *symname;
43346+ char *modname;
43347+ char *delim = ":";
43348+ char namebuf[128];
43349+
43350+ symname = kallsyms_lookup(address, &symsize, &offset, &modname, namebuf);
43351+ if (!symname)
43352+ return printk("[<%016lx>]", address);
43353+ if (!modname)
43354+ modname = delim = "";
43355+ return printk("<%016lx>{%s%s%s%s%+ld}",
43356+ address,delim,modname,delim,symname,offset);
43357+}
43358+#else
43359+int printk_address(unsigned long address)
43360+{
43361+ return printk("[<%016lx>]", address);
43362+}
43363+#endif
43364+
43365+static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
43366+ unsigned *usedp, const char **idp)
43367+{
43368+#ifndef CONFIG_X86_NO_TSS
43369+ static char ids[][8] = {
43370+ [DEBUG_STACK - 1] = "#DB",
43371+ [NMI_STACK - 1] = "NMI",
43372+ [DOUBLEFAULT_STACK - 1] = "#DF",
43373+ [STACKFAULT_STACK - 1] = "#SS",
43374+ [MCE_STACK - 1] = "#MC",
43375+#if DEBUG_STKSZ > EXCEPTION_STKSZ
43376+ [N_EXCEPTION_STACKS ... N_EXCEPTION_STACKS + DEBUG_STKSZ / EXCEPTION_STKSZ - 2] = "#DB[?]"
43377+#endif
43378+ };
43379+ unsigned k;
43380+
43381+ for (k = 0; k < N_EXCEPTION_STACKS; k++) {
43382+ unsigned long end;
43383+
43384+ switch (k + 1) {
43385+#if DEBUG_STKSZ > EXCEPTION_STKSZ
43386+ case DEBUG_STACK:
43387+ end = cpu_pda(cpu)->debugstack + DEBUG_STKSZ;
43388+ break;
43389+#endif
43390+ default:
43391+ end = per_cpu(init_tss, cpu).ist[k];
43392+ break;
43393+ }
43394+ if (stack >= end)
43395+ continue;
43396+ if (stack >= end - EXCEPTION_STKSZ) {
43397+ if (*usedp & (1U << k))
43398+ break;
43399+ *usedp |= 1U << k;
43400+ *idp = ids[k];
43401+ return (unsigned long *)end;
43402+ }
43403+#if DEBUG_STKSZ > EXCEPTION_STKSZ
43404+ if (k == DEBUG_STACK - 1 && stack >= end - DEBUG_STKSZ) {
43405+ unsigned j = N_EXCEPTION_STACKS - 1;
43406+
43407+ do {
43408+ ++j;
43409+ end -= EXCEPTION_STKSZ;
43410+ ids[j][4] = '1' + (j - N_EXCEPTION_STACKS);
43411+ } while (stack < end - EXCEPTION_STKSZ);
43412+ if (*usedp & (1U << j))
43413+ break;
43414+ *usedp |= 1U << j;
43415+ *idp = ids[j];
43416+ return (unsigned long *)end;
43417+ }
43418+#endif
43419+ }
43420+#endif
43421+ return NULL;
43422+}
43423+
43424+/*
43425+ * x86-64 can have up to three kernel stacks:
43426+ * process stack
43427+ * interrupt stack
43428+ * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack
43429+ */
43430+
43431+void show_trace(unsigned long *stack)
43432+{
43433+ const unsigned cpu = safe_smp_processor_id();
43434+ unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr;
43435+ int i;
43436+ unsigned used = 0;
43437+
43438+ printk("\nCall Trace:");
43439+
43440+#define HANDLE_STACK(cond) \
43441+ do while (cond) { \
43442+ unsigned long addr = *stack++; \
43443+ if (kernel_text_address(addr)) { \
43444+ if (i > 50) { \
43445+ printk("\n "); \
43446+ i = 0; \
43447+ } \
43448+ else \
43449+ i += printk(" "); \
43450+ /* \
43451+ * If the address is either in the text segment of the \
43452+ * kernel, or in the region which contains vmalloc'ed \
43453+ * memory, it *may* be the address of a calling \
43454+ * routine; if so, print it so that someone tracing \
43455+ * down the cause of the crash will be able to figure \
43456+ * out the call path that was taken. \
43457+ */ \
43458+ i += printk_address(addr); \
43459+ } \
43460+ } while (0)
43461+
43462+ for(i = 11; ; ) {
43463+ const char *id;
43464+ unsigned long *estack_end;
43465+ estack_end = in_exception_stack(cpu, (unsigned long)stack,
43466+ &used, &id);
43467+
43468+ if (estack_end) {
43469+ i += printk(" <%s>", id);
43470+ HANDLE_STACK (stack < estack_end);
43471+ i += printk(" <EOE>");
43472+ stack = (unsigned long *) estack_end[-2];
43473+ continue;
43474+ }
43475+ if (irqstack_end) {
43476+ unsigned long *irqstack;
43477+ irqstack = irqstack_end -
43478+ (IRQSTACKSIZE - 64) / sizeof(*irqstack);
43479+
43480+ if (stack >= irqstack && stack < irqstack_end) {
43481+ i += printk(" <IRQ>");
43482+ HANDLE_STACK (stack < irqstack_end);
43483+ stack = (unsigned long *) (irqstack_end[-1]);
43484+ irqstack_end = NULL;
43485+ i += printk(" <EOI>");
43486+ continue;
43487+ }
43488+ }
43489+ break;
43490+ }
43491+
43492+ HANDLE_STACK (((long) stack & (THREAD_SIZE-1)) != 0);
43493+#undef HANDLE_STACK
43494+ printk("\n");
43495+}
43496+
43497+void show_stack(struct task_struct *tsk, unsigned long * rsp)
43498+{
43499+ unsigned long *stack;
43500+ int i;
43501+ const int cpu = safe_smp_processor_id();
43502+ unsigned long *irqstack_end = (unsigned long *) (cpu_pda(cpu)->irqstackptr);
43503+ unsigned long *irqstack = (unsigned long *) (cpu_pda(cpu)->irqstackptr - IRQSTACKSIZE);
43504+
43505+ // debugging aid: "show_stack(NULL, NULL);" prints the
43506+ // back trace for this cpu.
43507+
43508+ if (rsp == NULL) {
43509+ if (tsk)
43510+ rsp = (unsigned long *)tsk->thread.rsp;
43511+ else
43512+ rsp = (unsigned long *)&rsp;
43513+ }
43514+
43515+ stack = rsp;
43516+ for(i=0; i < kstack_depth_to_print; i++) {
43517+ if (stack >= irqstack && stack <= irqstack_end) {
43518+ if (stack == irqstack_end) {
43519+ stack = (unsigned long *) (irqstack_end[-1]);
43520+ printk(" <EOI> ");
43521+ }
43522+ } else {
43523+ if (((long) stack & (THREAD_SIZE-1)) == 0)
43524+ break;
43525+ }
43526+ if (i && ((i % 4) == 0))
43527+ printk("\n ");
43528+ printk("%016lx ", *stack++);
43529+ touch_nmi_watchdog();
43530+ }
43531+ show_trace((unsigned long *)rsp);
43532+}
43533+
43534+/*
43535+ * The architecture-independent dump_stack generator
43536+ */
43537+void dump_stack(void)
43538+{
43539+ unsigned long dummy;
43540+ show_trace(&dummy);
43541+}
43542+
43543+EXPORT_SYMBOL(dump_stack);
43544+
43545+void show_registers(struct pt_regs *regs)
43546+{
43547+ int i;
43548+ int in_kernel = !user_mode(regs);
43549+ unsigned long rsp;
43550+ const int cpu = safe_smp_processor_id();
43551+ struct task_struct *cur = cpu_pda(cpu)->pcurrent;
43552+
43553+ rsp = regs->rsp;
43554+
43555+ printk("CPU %d ", cpu);
43556+ __show_regs(regs);
43557+ printk("Process %s (pid: %d, threadinfo %p, task %p)\n",
43558+ cur->comm, cur->pid, task_thread_info(cur), cur);
43559+
43560+ /*
43561+ * When in-kernel, we also print out the stack and code at the
43562+ * time of the fault..
43563+ */
43564+ if (in_kernel) {
43565+
43566+ printk("Stack: ");
43567+ show_stack(NULL, (unsigned long*)rsp);
43568+
43569+ printk("\nCode: ");
43570+ if(regs->rip < PAGE_OFFSET)
43571+ goto bad;
43572+
43573+ for(i=0;i<20;i++)
43574+ {
43575+ unsigned char c;
43576+ if(__get_user(c, &((unsigned char*)regs->rip)[i])) {
43577+bad:
43578+ printk(" Bad RIP value.");
43579+ break;
43580+ }
43581+ printk("%02x ", c);
43582+ }
43583+ }
43584+ printk("\n");
43585+}
43586+
43587+void handle_BUG(struct pt_regs *regs)
43588+{
43589+ struct bug_frame f;
43590+ long len;
43591+ const char *prefix = "";
43592+
43593+ if (user_mode(regs))
43594+ return;
43595+ if (__copy_from_user(&f, (const void __user *) regs->rip,
43596+ sizeof(struct bug_frame)))
43597+ return;
43598+ if (f.filename >= 0 ||
43599+ f.ud2[0] != 0x0f || f.ud2[1] != 0x0b)
43600+ return;
43601+ len = __strnlen_user((char *)(long)f.filename, PATH_MAX) - 1;
43602+ if (len < 0 || len >= PATH_MAX)
43603+ f.filename = (int)(long)"unmapped filename";
43604+ else if (len > 50) {
43605+ f.filename += len - 50;
43606+ prefix = "...";
43607+ }
43608+ printk("----------- [cut here ] --------- [please bite here ] ---------\n");
43609+ printk(KERN_ALERT "Kernel BUG at %s%.50s:%d\n", prefix, (char *)(long)f.filename, f.line);
43610+}
43611+
43612+#ifdef CONFIG_BUG
43613+void out_of_line_bug(void)
43614+{
43615+ BUG();
43616+}
43617+#endif
43618+
43619+static DEFINE_SPINLOCK(die_lock);
43620+static int die_owner = -1;
43621+
43622+unsigned __kprobes long oops_begin(void)
43623+{
43624+ int cpu = safe_smp_processor_id();
43625+ unsigned long flags;
43626+
43627+ /* racy, but better than risking deadlock. */
43628+ local_irq_save(flags);
43629+ if (!spin_trylock(&die_lock)) {
43630+ if (cpu == die_owner)
43631+ /* nested oops. should stop eventually */;
43632+ else
43633+ spin_lock(&die_lock);
43634+ }
43635+ die_owner = cpu;
43636+ console_verbose();
43637+ bust_spinlocks(1);
43638+ return flags;
43639+}
43640+
43641+void __kprobes oops_end(unsigned long flags)
43642+{
43643+ die_owner = -1;
43644+ bust_spinlocks(0);
43645+ spin_unlock_irqrestore(&die_lock, flags);
43646+ if (panic_on_oops)
43647+ panic("Oops");
43648+}
43649+
43650+void __kprobes __die(const char * str, struct pt_regs * regs, long err)
43651+{
43652+ static int die_counter;
43653+ printk(KERN_EMERG "%s: %04lx [%u] ", str, err & 0xffff,++die_counter);
43654+#ifdef CONFIG_PREEMPT
43655+ printk("PREEMPT ");
43656+#endif
43657+#ifdef CONFIG_SMP
43658+ printk("SMP ");
43659+#endif
43660+#ifdef CONFIG_DEBUG_PAGEALLOC
43661+ printk("DEBUG_PAGEALLOC");
43662+#endif
43663+ printk("\n");
43664+ notify_die(DIE_OOPS, str, regs, err, current->thread.trap_no, SIGSEGV);
43665+ show_registers(regs);
43666+ /* Executive summary in case the oops scrolled away */
43667+ printk(KERN_ALERT "RIP ");
43668+ printk_address(regs->rip);
43669+ printk(" RSP <%016lx>\n", regs->rsp);
43670+ if (kexec_should_crash(current))
43671+ crash_kexec(regs);
43672+}
43673+
43674+void die(const char * str, struct pt_regs * regs, long err)
43675+{
43676+ unsigned long flags = oops_begin();
43677+
43678+ handle_BUG(regs);
43679+ __die(str, regs, err);
43680+ oops_end(flags);
43681+ do_exit(SIGSEGV);
43682+}
43683+
43684+#ifdef CONFIG_X86_LOCAL_APIC
43685+void __kprobes die_nmi(char *str, struct pt_regs *regs)
43686+{
43687+ unsigned long flags = oops_begin();
43688+
43689+ /*
43690+	 * We are in trouble anyway, let's at least try
43691+ * to get a message out.
43692+ */
43693+ printk(str, safe_smp_processor_id());
43694+ show_registers(regs);
43695+ if (kexec_should_crash(current))
43696+ crash_kexec(regs);
43697+ if (panic_on_timeout || panic_on_oops)
43698+ panic("nmi watchdog");
43699+ printk("console shuts up ...\n");
43700+ oops_end(flags);
43701+ do_exit(SIGSEGV);
43702+}
43703+#endif
43704+
43705+static void __kprobes do_trap(int trapnr, int signr, char *str,
43706+ struct pt_regs * regs, long error_code,
43707+ siginfo_t *info)
43708+{
43709+ struct task_struct *tsk = current;
43710+
43711+ conditional_sti(regs);
43712+
43713+ tsk->thread.error_code = error_code;
43714+ tsk->thread.trap_no = trapnr;
43715+
43716+ if (user_mode(regs)) {
43717+ if (exception_trace && unhandled_signal(tsk, signr))
43718+ printk(KERN_INFO
43719+ "%s[%d] trap %s rip:%lx rsp:%lx error:%lx\n",
43720+ tsk->comm, tsk->pid, str,
43721+ regs->rip,regs->rsp,error_code);
43722+
43723+ if (info)
43724+ force_sig_info(signr, info, tsk);
43725+ else
43726+ force_sig(signr, tsk);
43727+ return;
43728+ }
43729+
43730+
43731+ /* kernel trap */
43732+ {
43733+ const struct exception_table_entry *fixup;
43734+ fixup = search_exception_tables(regs->rip);
43735+ if (fixup) {
43736+ regs->rip = fixup->fixup;
43737+ } else
43738+ die(str, regs, error_code);
43739+ return;
43740+ }
43741+}
43742+
43743+#define DO_ERROR(trapnr, signr, str, name) \
43744+asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
43745+{ \
43746+ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
43747+ == NOTIFY_STOP) \
43748+ return; \
43749+ do_trap(trapnr, signr, str, regs, error_code, NULL); \
43750+}
43751+
43752+#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
43753+asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
43754+{ \
43755+ siginfo_t info; \
43756+ info.si_signo = signr; \
43757+ info.si_errno = 0; \
43758+ info.si_code = sicode; \
43759+ info.si_addr = (void __user *)siaddr; \
43760+ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
43761+ == NOTIFY_STOP) \
43762+ return; \
43763+ do_trap(trapnr, signr, str, regs, error_code, &info); \
43764+}
43765+
43766+DO_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->rip)
43767+DO_ERROR( 4, SIGSEGV, "overflow", overflow)
43768+DO_ERROR( 5, SIGSEGV, "bounds", bounds)
43769+DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->rip)
43770+DO_ERROR( 7, SIGSEGV, "device not available", device_not_available)
43771+DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
43772+DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
43773+DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
43774+DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0)
43775+DO_ERROR(18, SIGSEGV, "reserved", reserved)
43776+DO_ERROR(12, SIGBUS, "stack segment", stack_segment)
43777+
43778+asmlinkage void do_double_fault(struct pt_regs * regs, long error_code)
43779+{
43780+ static const char str[] = "double fault";
43781+ struct task_struct *tsk = current;
43782+
43783+	/* Return not checked because a double fault cannot be ignored */
43784+ notify_die(DIE_TRAP, str, regs, error_code, 8, SIGSEGV);
43785+
43786+ tsk->thread.error_code = error_code;
43787+ tsk->thread.trap_no = 8;
43788+
43789+ /* This is always a kernel trap and never fixable (and thus must
43790+ never return). */
43791+ for (;;)
43792+ die(str, regs, error_code);
43793+}
43794+
43795+asmlinkage void __kprobes do_general_protection(struct pt_regs * regs,
43796+ long error_code)
43797+{
43798+ struct task_struct *tsk = current;
43799+
43800+ conditional_sti(regs);
43801+
43802+ tsk->thread.error_code = error_code;
43803+ tsk->thread.trap_no = 13;
43804+
43805+ if (user_mode(regs)) {
43806+ if (exception_trace && unhandled_signal(tsk, SIGSEGV))
43807+ printk(KERN_INFO
43808+ "%s[%d] general protection rip:%lx rsp:%lx error:%lx\n",
43809+ tsk->comm, tsk->pid,
43810+ regs->rip,regs->rsp,error_code);
43811+
43812+ force_sig(SIGSEGV, tsk);
43813+ return;
43814+ }
43815+
43816+ /* kernel gp */
43817+ {
43818+ const struct exception_table_entry *fixup;
43819+ fixup = search_exception_tables(regs->rip);
43820+ if (fixup) {
43821+ regs->rip = fixup->fixup;
43822+ return;
43823+ }
43824+ if (notify_die(DIE_GPF, "general protection fault", regs,
43825+ error_code, 13, SIGSEGV) == NOTIFY_STOP)
43826+ return;
43827+ die("general protection fault", regs, error_code);
43828+ }
43829+}
43830+
43831+static __kprobes void
43832+mem_parity_error(unsigned char reason, struct pt_regs * regs)
43833+{
43834+ printk("Uhhuh. NMI received. Dazed and confused, but trying to continue\n");
43835+ printk("You probably have a hardware problem with your RAM chips\n");
43836+
43837+#if 0 /* XEN */
43838+ /* Clear and disable the memory parity error line. */
43839+ reason = (reason & 0xf) | 4;
43840+ outb(reason, 0x61);
43841+#endif /* XEN */
43842+}
43843+
43844+static __kprobes void
43845+io_check_error(unsigned char reason, struct pt_regs * regs)
43846+{
43847+ printk("NMI: IOCK error (debug interrupt?)\n");
43848+ show_registers(regs);
43849+
43850+#if 0 /* XEN */
43851+ /* Re-enable the IOCK line, wait for a few seconds */
43852+ reason = (reason & 0xf) | 8;
43853+ outb(reason, 0x61);
43854+ mdelay(2000);
43855+ reason &= ~8;
43856+ outb(reason, 0x61);
43857+#endif /* XEN */
43858+}
43859+
43860+static __kprobes void
43861+unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
43862+{ printk("Uhhuh. NMI received for unknown reason %02x.\n", reason);
43863+ printk("Dazed and confused, but trying to continue\n");
43864+ printk("Do you have a strange power saving mode enabled?\n");
43865+}
43866+
43867+/* Runs on IST stack. This code must keep interrupts off all the time.
43868+ Nested NMIs are prevented by the CPU. */
43869+asmlinkage __kprobes void default_do_nmi(struct pt_regs *regs)
43870+{
43871+ unsigned char reason = 0;
43872+ int cpu;
43873+
43874+ cpu = smp_processor_id();
43875+
43876+ /* Only the BSP gets external NMIs from the system. */
43877+ if (!cpu)
43878+ reason = get_nmi_reason();
43879+
43880+ if (!(reason & 0xc0)) {
43881+ if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT)
43882+ == NOTIFY_STOP)
43883+ return;
43884+#ifdef CONFIG_X86_LOCAL_APIC
43885+ /*
43886+ * Ok, so this is none of the documented NMI sources,
43887+ * so it must be the NMI watchdog.
43888+ */
43889+ if (nmi_watchdog > 0) {
43890+ nmi_watchdog_tick(regs,reason);
43891+ return;
43892+ }
43893+#endif
43894+ unknown_nmi_error(reason, regs);
43895+ return;
43896+ }
43897+ if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
43898+ return;
43899+
43900+ /* AK: following checks seem to be broken on modern chipsets. FIXME */
43901+
43902+ if (reason & 0x80)
43903+ mem_parity_error(reason, regs);
43904+ if (reason & 0x40)
43905+ io_check_error(reason, regs);
43906+}
43907+
43908+/* runs on IST stack. */
43909+asmlinkage void __kprobes do_int3(struct pt_regs * regs, long error_code)
43910+{
43911+ if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) == NOTIFY_STOP) {
43912+ return;
43913+ }
43914+ do_trap(3, SIGTRAP, "int3", regs, error_code, NULL);
43915+ return;
43916+}
43917+
43918+/* Help handler running on IST stack to switch back to user stack
43919+ for scheduling or signal handling. The actual stack switch is done in
43920+ entry.S */
43921+asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs)
43922+{
43923+ struct pt_regs *regs = eregs;
43924+ /* Did already sync */
43925+ if (eregs == (struct pt_regs *)eregs->rsp)
43926+ ;
43927+ /* Exception from user space */
43928+ else if (user_mode(eregs))
43929+ regs = task_pt_regs(current);
43930+ /* Exception from kernel and interrupts are enabled. Move to
43931+ kernel process stack. */
43932+ else if (eregs->eflags & X86_EFLAGS_IF)
43933+ regs = (struct pt_regs *)(eregs->rsp -= sizeof(struct pt_regs));
43934+ if (eregs != regs)
43935+ *regs = *eregs;
43936+ return regs;
43937+}
43938+
43939+/* runs on IST stack. */
43940+asmlinkage void __kprobes do_debug(struct pt_regs * regs,
43941+ unsigned long error_code)
43942+{
43943+ unsigned long condition;
43944+ struct task_struct *tsk = current;
43945+ siginfo_t info;
43946+
43947+ get_debugreg(condition, 6);
43948+
43949+ if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
43950+ SIGTRAP) == NOTIFY_STOP)
43951+ return;
43952+
43953+ preempt_conditional_sti(regs);
43954+
43955+ /* Mask out spurious debug traps due to lazy DR7 setting */
43956+ if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) {
43957+ if (!tsk->thread.debugreg7) {
43958+ goto clear_dr7;
43959+ }
43960+ }
43961+
43962+ tsk->thread.debugreg6 = condition;
43963+
43964+ /* Mask out spurious TF errors due to lazy TF clearing */
43965+ if (condition & DR_STEP) {
43966+ /*
43967+ * The TF error should be masked out only if the current
43968+ * process is not traced and if the TRAP flag has been set
43969+ * previously by a tracing process (condition detected by
43970+ * the PT_DTRACE flag); remember that the i386 TRAP flag
43971+ * can be modified by the process itself in user mode,
43972+ * allowing programs to debug themselves without the ptrace()
43973+ * interface.
43974+ */
43975+ if (!user_mode(regs))
43976+ goto clear_TF_reenable;
43977+ /*
43978+ * Was the TF flag set by a debugger? If so, clear it now,
43979+ * so that register information is correct.
43980+ */
43981+ if (tsk->ptrace & PT_DTRACE) {
43982+ regs->eflags &= ~TF_MASK;
43983+ tsk->ptrace &= ~PT_DTRACE;
43984+ }
43985+ }
43986+
43987+ /* Ok, finally something we can handle */
43988+ tsk->thread.trap_no = 1;
43989+ tsk->thread.error_code = error_code;
43990+ info.si_signo = SIGTRAP;
43991+ info.si_errno = 0;
43992+ info.si_code = TRAP_BRKPT;
43993+ info.si_addr = user_mode(regs) ? (void __user *)regs->rip : NULL;
43994+ force_sig_info(SIGTRAP, &info, tsk);
43995+
43996+clear_dr7:
43997+ set_debugreg(0UL, 7);
43998+ preempt_conditional_cli(regs);
43999+ return;
44000+
44001+clear_TF_reenable:
44002+ set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
44003+ regs->eflags &= ~TF_MASK;
44004+ preempt_conditional_cli(regs);
44005+}
44006+
44007+static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr)
44008+{
44009+ const struct exception_table_entry *fixup;
44010+ fixup = search_exception_tables(regs->rip);
44011+ if (fixup) {
44012+ regs->rip = fixup->fixup;
44013+ return 1;
44014+ }
44015+ notify_die(DIE_GPF, str, regs, 0, trapnr, SIGFPE);
44016+ /* Illegal floating point operation in the kernel */
44017+ current->thread.trap_no = trapnr;
44018+ die(str, regs, 0);
44019+ return 0;
44020+}
44021+
44022+/*
44023+ * Note that we play around with the 'TS' bit in an attempt to get
44024+ * the correct behaviour even in the presence of the asynchronous
44025+ * IRQ13 behaviour
44026+ */
44027+asmlinkage void do_coprocessor_error(struct pt_regs *regs)
44028+{
44029+ void __user *rip = (void __user *)(regs->rip);
44030+ struct task_struct * task;
44031+ siginfo_t info;
44032+ unsigned short cwd, swd;
44033+
44034+ conditional_sti(regs);
44035+ if (!user_mode(regs) &&
44036+ kernel_math_error(regs, "kernel x87 math error", 16))
44037+ return;
44038+
44039+ /*
44040+ * Save the info for the exception handler and clear the error.
44041+ */
44042+ task = current;
44043+ save_init_fpu(task);
44044+ task->thread.trap_no = 16;
44045+ task->thread.error_code = 0;
44046+ info.si_signo = SIGFPE;
44047+ info.si_errno = 0;
44048+ info.si_code = __SI_FAULT;
44049+ info.si_addr = rip;
44050+ /*
44051+ * (~cwd & swd) will mask out exceptions that are not set to unmasked
44052+ * status. 0x3f is the exception bits in these regs, 0x200 is the
44053+ * C1 reg you need in case of a stack fault, 0x040 is the stack
44054+ * fault bit. We should only be taking one exception at a time,
44055+ * so if this combination doesn't produce any single exception,
44056+ * then we have a bad program that isn't synchronizing its FPU usage
44057+ * and it will suffer the consequences since we won't be able to
44058+ * fully reproduce the context of the exception
44059+ */
44060+ cwd = get_fpu_cwd(task);
44061+ swd = get_fpu_swd(task);
44062+ switch (swd & ~cwd & 0x3f) {
44063+ case 0x000:
44064+ default:
44065+ break;
44066+ case 0x001: /* Invalid Op */
44067+ /*
44068+ * swd & 0x240 == 0x040: Stack Underflow
44069+ * swd & 0x240 == 0x240: Stack Overflow
44070+ * User must clear the SF bit (0x40) if set
44071+ */
44072+ info.si_code = FPE_FLTINV;
44073+ break;
44074+ case 0x002: /* Denormalize */
44075+ case 0x010: /* Underflow */
44076+ info.si_code = FPE_FLTUND;
44077+ break;
44078+ case 0x004: /* Zero Divide */
44079+ info.si_code = FPE_FLTDIV;
44080+ break;
44081+ case 0x008: /* Overflow */
44082+ info.si_code = FPE_FLTOVF;
44083+ break;
44084+ case 0x020: /* Precision */
44085+ info.si_code = FPE_FLTRES;
44086+ break;
44087+ }
44088+ force_sig_info(SIGFPE, &info, task);
44089+}
44090+
44091+asmlinkage void bad_intr(void)
44092+{
44093+ printk("bad interrupt");
44094+}
44095+
44096+asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs)
44097+{
44098+ void __user *rip = (void __user *)(regs->rip);
44099+ struct task_struct * task;
44100+ siginfo_t info;
44101+ unsigned short mxcsr;
44102+
44103+ conditional_sti(regs);
44104+ if (!user_mode(regs) &&
44105+ kernel_math_error(regs, "kernel simd math error", 19))
44106+ return;
44107+
44108+ /*
44109+ * Save the info for the exception handler and clear the error.
44110+ */
44111+ task = current;
44112+ save_init_fpu(task);
44113+ task->thread.trap_no = 19;
44114+ task->thread.error_code = 0;
44115+ info.si_signo = SIGFPE;
44116+ info.si_errno = 0;
44117+ info.si_code = __SI_FAULT;
44118+ info.si_addr = rip;
44119+ /*
44120+ * The SIMD FPU exceptions are handled a little differently, as there
44121+ * is only a single status/control register. Thus, to determine which
44122+ * unmasked exception was caught we must mask the exception mask bits
44123+ * at 0x1f80, and then use these to mask the exception bits at 0x3f.
44124+ */
44125+ mxcsr = get_fpu_mxcsr(task);
44126+ switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) {
44127+ case 0x000:
44128+ default:
44129+ break;
44130+ case 0x001: /* Invalid Op */
44131+ info.si_code = FPE_FLTINV;
44132+ break;
44133+ case 0x002: /* Denormalize */
44134+ case 0x010: /* Underflow */
44135+ info.si_code = FPE_FLTUND;
44136+ break;
44137+ case 0x004: /* Zero Divide */
44138+ info.si_code = FPE_FLTDIV;
44139+ break;
44140+ case 0x008: /* Overflow */
44141+ info.si_code = FPE_FLTOVF;
44142+ break;
44143+ case 0x020: /* Precision */
44144+ info.si_code = FPE_FLTRES;
44145+ break;
44146+ }
44147+ force_sig_info(SIGFPE, &info, task);
44148+}
44149+
44150+asmlinkage void do_spurious_interrupt_bug(struct pt_regs * regs)
44151+{
44152+}
44153+
44154+#if 0
44155+asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void)
44156+{
44157+}
44158+#endif
44159+
44160+asmlinkage void __attribute__((weak)) mce_threshold_interrupt(void)
44161+{
44162+}
44163+
44164+/*
44165+ * 'math_state_restore()' saves the current math information in the
44166+ * old math state array, and gets the new ones from the current task
44167+ *
44168+ * Careful.. There are problems with IBM-designed IRQ13 behaviour.
44169+ * Don't touch unless you *really* know how it works.
44170+ */
44171+asmlinkage void math_state_restore(void)
44172+{
44173+ struct task_struct *me = current;
44174+ /* clts(); */ /* 'clts' is done for us by Xen during virtual trap. */
44175+
44176+ if (!used_math())
44177+ init_fpu(me);
44178+ restore_fpu_checking(&me->thread.i387.fxsave);
44179+ task_thread_info(me)->status |= TS_USEDFPU;
44180+}
44181+
44182+
44183+/*
44184+ * NB. All these are "interrupt gates" (i.e. events_mask is set) because we
44185+ * specify <dpl>|4 in the second field.
44186+ */
44187+static trap_info_t trap_table[] = {
44188+ { 0, 0|4, __KERNEL_CS, (unsigned long)divide_error },
44189+ { 1, 0|4, __KERNEL_CS, (unsigned long)debug },
44190+ { 3, 3|4, __KERNEL_CS, (unsigned long)int3 },
44191+ { 4, 3|4, __KERNEL_CS, (unsigned long)overflow },
44192+ { 5, 0|4, __KERNEL_CS, (unsigned long)bounds },
44193+ { 6, 0|4, __KERNEL_CS, (unsigned long)invalid_op },
44194+ { 7, 0|4, __KERNEL_CS, (unsigned long)device_not_available },
44195+ { 9, 0|4, __KERNEL_CS, (unsigned long)coprocessor_segment_overrun},
44196+ { 10, 0|4, __KERNEL_CS, (unsigned long)invalid_TSS },
44197+ { 11, 0|4, __KERNEL_CS, (unsigned long)segment_not_present },
44198+ { 12, 0|4, __KERNEL_CS, (unsigned long)stack_segment },
44199+ { 13, 0|4, __KERNEL_CS, (unsigned long)general_protection },
44200+ { 14, 0|4, __KERNEL_CS, (unsigned long)page_fault },
44201+ { 15, 0|4, __KERNEL_CS, (unsigned long)spurious_interrupt_bug },
44202+ { 16, 0|4, __KERNEL_CS, (unsigned long)coprocessor_error },
44203+ { 17, 0|4, __KERNEL_CS, (unsigned long)alignment_check },
44204+#ifdef CONFIG_X86_MCE
44205+ { 18, 0|4, __KERNEL_CS, (unsigned long)machine_check },
44206+#endif
44207+ { 19, 0|4, __KERNEL_CS, (unsigned long)simd_coprocessor_error },
44208+#ifdef CONFIG_IA32_EMULATION
44209+ { IA32_SYSCALL_VECTOR, 3|4, __KERNEL_CS, (unsigned long)ia32_syscall},
44210+#endif
44211+ { 0, 0, 0, 0 }
44212+};
44213+
44214+void __init trap_init(void)
44215+{
44216+ int ret;
44217+
44218+ ret = HYPERVISOR_set_trap_table(trap_table);
44219+
44220+ if (ret)
44221+		printk("HYPERVISOR_set_trap_table failed: error %d\n",
44222+ ret);
44223+
44224+ /*
44225+ * Should be a barrier for any external CPU state.
44226+ */
44227+ cpu_init();
44228+}
44229+
44230+void smp_trap_init(trap_info_t *trap_ctxt)
44231+{
44232+ trap_info_t *t = trap_table;
44233+
44234+ for (t = trap_table; t->address; t++) {
44235+ trap_ctxt[t->vector].flags = t->flags;
44236+ trap_ctxt[t->vector].cs = t->cs;
44237+ trap_ctxt[t->vector].address = t->address;
44238+ }
44239+}
44240+
44241+
44242+/* Actual parsing is done early in setup.c. */
44243+static int __init oops_dummy(char *s)
44244+{
44245+ panic_on_oops = 1;
44246+ return -1;
44247+}
44248+__setup("oops=", oops_dummy);
44249+
44250+static int __init kstack_setup(char *s)
44251+{
44252+ kstack_depth_to_print = simple_strtoul(s,NULL,0);
44253+ return 0;
44254+}
44255+__setup("kstack=", kstack_setup);
44256+
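The trap_init() hunk above is the heart of the paravirtualised exception setup: instead of loading an IDT, the guest hands Xen a zero-terminated table of vector/flags/selector/handler tuples. A minimal illustrative sketch of the same registration pattern follows (not part of the patch; it assumes the trap_info_t layout and the HYPERVISOR_set_trap_table() hypercall shown above, and my_divide_error/my_int3 are hypothetical handler entry points):

/* Sketch only: registering two example vectors with the hypervisor. */
static trap_info_t example_traps[] = {
	{ 0, 0|4, __KERNEL_CS, (unsigned long)my_divide_error },	/* #DE, kernel only */
	{ 3, 3|4, __KERNEL_CS, (unsigned long)my_int3 },		/* #BP, reachable from user (dpl 3) */
	{ 0, 0, 0, 0 }							/* zero terminator */
};

static void __init example_trap_init(void)
{
	int rc = HYPERVISOR_set_trap_table(example_traps);
	if (rc)
		printk(KERN_ERR "example set_trap_table failed: %d\n", rc);
}

The "|4" in the flags field is what makes each entry an interrupt gate (events masked on entry), exactly as the comment above the real trap_table notes.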
44257diff -Nur linux-2.6.16.33-noxen/arch/x86_64/kernel/vmlinux.lds.S linux-2.6.16.33/arch/x86_64/kernel/vmlinux.lds.S
44258--- linux-2.6.16.33-noxen/arch/x86_64/kernel/vmlinux.lds.S 2006-11-22 18:06:31.000000000 +0000
44259+++ linux-2.6.16.33/arch/x86_64/kernel/vmlinux.lds.S 2007-05-23 21:00:01.000000000 +0000
44260@@ -14,6 +14,13 @@
44261 OUTPUT_ARCH(i386:x86-64)
44262 ENTRY(phys_startup_64)
44263 jiffies_64 = jiffies;
44264+PHDRS {
44265+ text PT_LOAD FLAGS(5); /* R_E */
44266+ data PT_LOAD FLAGS(7); /* RWE */
44267+ user PT_LOAD FLAGS(7); /* RWE */
44268+ data.init PT_LOAD FLAGS(7); /* RWE */
44269+ note PT_NOTE FLAGS(4); /* R__ */
44270+}
44271 SECTIONS
44272 {
44273 . = __START_KERNEL;
44274@@ -26,7 +33,7 @@
44275 KPROBES_TEXT
44276 *(.fixup)
44277 *(.gnu.warning)
44278- } = 0x9090
44279+ } :text = 0x9090
44280 /* out-of-line lock text */
44281 .text.lock : AT(ADDR(.text.lock) - LOAD_OFFSET) { *(.text.lock) }
44282
44283@@ -43,17 +50,10 @@
44284 .data : AT(ADDR(.data) - LOAD_OFFSET) {
44285 *(.data)
44286 CONSTRUCTORS
44287- }
44288+ } :data
44289
44290 _edata = .; /* End of data section */
44291
44292- __bss_start = .; /* BSS */
44293- .bss : AT(ADDR(.bss) - LOAD_OFFSET) {
44294- *(.bss.page_aligned)
44295- *(.bss)
44296- }
44297- __bss_stop = .;
44298-
44299 . = ALIGN(PAGE_SIZE);
44300 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
44301 .data.cacheline_aligned : AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) {
44302@@ -75,7 +75,7 @@
44303 #define VVIRT(x) (ADDR(x) - VVIRT_OFFSET)
44304
44305 . = VSYSCALL_ADDR;
44306- .vsyscall_0 : AT(VSYSCALL_PHYS_ADDR) { *(.vsyscall_0) }
44307+ .vsyscall_0 : AT(VSYSCALL_PHYS_ADDR) { *(.vsyscall_0) } :user
44308 __vsyscall_0 = VSYSCALL_VIRT_ADDR;
44309
44310 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
44311@@ -118,7 +118,7 @@
44312 . = ALIGN(8192); /* init_task */
44313 .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) {
44314 *(.data.init_task)
44315- }
44316+ }:data.init
44317
44318 . = ALIGN(4096);
44319 .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) {
44320@@ -188,6 +188,14 @@
44321 . = ALIGN(4096);
44322 __nosave_end = .;
44323
44324+ __bss_start = .; /* BSS */
44325+ . = ALIGN(4096);
44326+ .bss : AT(ADDR(.bss) - LOAD_OFFSET) {
44327+ *(.bss.page_aligned)
44328+ *(.bss)
44329+ }
44330+ __bss_stop = .;
44331+
44332 _end = . ;
44333
44334 /* Sections to be discarded */
44335@@ -201,4 +209,6 @@
44336 STABS_DEBUG
44337
44338 DWARF_DEBUG
44339+
44340+ NOTES
44341 }
44342diff -Nur linux-2.6.16.33-noxen/arch/x86_64/kernel/vmlinux.lds.S~ linux-2.6.16.33/arch/x86_64/kernel/vmlinux.lds.S~
44343--- linux-2.6.16.33-noxen/arch/x86_64/kernel/vmlinux.lds.S~ 1970-01-01 00:00:00.000000000 +0000
44344+++ linux-2.6.16.33/arch/x86_64/kernel/vmlinux.lds.S~ 2007-05-23 21:00:01.000000000 +0000
44345@@ -0,0 +1,213 @@
44346+/* ld script to make x86-64 Linux kernel
44347+ * Written by Martin Mares <mj@atrey.karlin.mff.cuni.cz>;
44348+ */
44349+
44350+#define LOAD_OFFSET __START_KERNEL_map
44351+
44352+#include <asm-generic/vmlinux.lds.h>
44353+#include <asm/page.h>
44354+#include <linux/config.h>
44355+
44356+#undef i386 /* in case the preprocessor is a 32bit one */
44357+
44358+OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64", "elf64-x86-64")
44359+OUTPUT_ARCH(i386:x86-64)
44360+ENTRY(phys_startup_64)
44361+jiffies_64 = jiffies;
44362+PHDRS {
44363+ text PT_LOAD FLAGS(5); /* R_E */
44364+ data PT_LOAD FLAGS(7); /* RWE */
44365+ user PT_LOAD FLAGS(7); /* RWE */
44366+ note PT_NOTE FLAGS(4); /* R__ */
44367+}
44368+SECTIONS
44369+{
44370+ . = __START_KERNEL;
44371+ phys_startup_64 = startup_64 - LOAD_OFFSET;
44372+ _text = .; /* Text and read-only data */
44373+ .text : AT(ADDR(.text) - LOAD_OFFSET) {
44374+ *(.text)
44375+ SCHED_TEXT
44376+ LOCK_TEXT
44377+ KPROBES_TEXT
44378+ *(.fixup)
44379+ *(.gnu.warning)
44380+ } :text = 0x9090
44381+ /* out-of-line lock text */
44382+ .text.lock : AT(ADDR(.text.lock) - LOAD_OFFSET) { *(.text.lock) }
44383+
44384+ _etext = .; /* End of text section */
44385+
44386+ . = ALIGN(16); /* Exception table */
44387+ __start___ex_table = .;
44388+ __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { *(__ex_table) }
44389+ __stop___ex_table = .;
44390+
44391+ RODATA
44392+
44393+ /* Data */
44394+ .data : AT(ADDR(.data) - LOAD_OFFSET) {
44395+ *(.data)
44396+ CONSTRUCTORS
44397+ } :data
44398+
44399+ _edata = .; /* End of data section */
44400+
44401+ . = ALIGN(PAGE_SIZE);
44402+ . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
44403+ .data.cacheline_aligned : AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) {
44404+ *(.data.cacheline_aligned)
44405+ }
44406+ . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
44407+ .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) {
44408+ *(.data.read_mostly)
44409+ }
44410+
44411+#define VSYSCALL_ADDR (-10*1024*1024)
44412+#define VSYSCALL_PHYS_ADDR ((LOADADDR(.data.read_mostly) + SIZEOF(.data.read_mostly) + 4095) & ~(4095))
44413+#define VSYSCALL_VIRT_ADDR ((ADDR(.data.read_mostly) + SIZEOF(.data.read_mostly) + 4095) & ~(4095))
44414+
44415+#define VLOAD_OFFSET (VSYSCALL_ADDR - VSYSCALL_PHYS_ADDR)
44416+#define VLOAD(x) (ADDR(x) - VLOAD_OFFSET)
44417+
44418+#define VVIRT_OFFSET (VSYSCALL_ADDR - VSYSCALL_VIRT_ADDR)
44419+#define VVIRT(x) (ADDR(x) - VVIRT_OFFSET)
44420+
44421+ . = VSYSCALL_ADDR;
44422+ .vsyscall_0 : AT(VSYSCALL_PHYS_ADDR) { *(.vsyscall_0) } :user
44423+ __vsyscall_0 = VSYSCALL_VIRT_ADDR;
44424+
44425+ . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
44426+ .xtime_lock : AT(VLOAD(.xtime_lock)) { *(.xtime_lock) }
44427+ xtime_lock = VVIRT(.xtime_lock);
44428+
44429+ .vxtime : AT(VLOAD(.vxtime)) { *(.vxtime) }
44430+ vxtime = VVIRT(.vxtime);
44431+
44432+ .wall_jiffies : AT(VLOAD(.wall_jiffies)) { *(.wall_jiffies) }
44433+ wall_jiffies = VVIRT(.wall_jiffies);
44434+
44435+ .sys_tz : AT(VLOAD(.sys_tz)) { *(.sys_tz) }
44436+ sys_tz = VVIRT(.sys_tz);
44437+
44438+ .sysctl_vsyscall : AT(VLOAD(.sysctl_vsyscall)) { *(.sysctl_vsyscall) }
44439+ sysctl_vsyscall = VVIRT(.sysctl_vsyscall);
44440+
44441+ .xtime : AT(VLOAD(.xtime)) { *(.xtime) }
44442+ xtime = VVIRT(.xtime);
44443+
44444+ . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
44445+ .jiffies : AT(VLOAD(.jiffies)) { *(.jiffies) }
44446+ jiffies = VVIRT(.jiffies);
44447+
44448+ .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1)) { *(.vsyscall_1) }
44449+ .vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2)) { *(.vsyscall_2) }
44450+ .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3)) { *(.vsyscall_3) }
44451+
44452+ . = VSYSCALL_VIRT_ADDR + 4096;
44453+
44454+#undef VSYSCALL_ADDR
44455+#undef VSYSCALL_PHYS_ADDR
44456+#undef VSYSCALL_VIRT_ADDR
44457+#undef VLOAD_OFFSET
44458+#undef VLOAD
44459+#undef VVIRT_OFFSET
44460+#undef VVIRT
44461+
44462+ . = ALIGN(8192); /* init_task */
44463+ .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) {
44464+ *(.data.init_task)
44465+ } :data
44466+
44467+ . = ALIGN(4096);
44468+ .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) {
44469+ *(.data.page_aligned)
44470+ }
44471+
44472+ . = ALIGN(4096); /* Init code and data */
44473+ __init_begin = .;
44474+ .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) {
44475+ _sinittext = .;
44476+ *(.init.text)
44477+ _einittext = .;
44478+ }
44479+ __initdata_begin = .;
44480+ .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { *(.init.data) }
44481+ __initdata_end = .;
44482+ . = ALIGN(16);
44483+ __setup_start = .;
44484+ .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) { *(.init.setup) }
44485+ __setup_end = .;
44486+ __initcall_start = .;
44487+ .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) {
44488+ *(.initcall1.init)
44489+ *(.initcall2.init)
44490+ *(.initcall3.init)
44491+ *(.initcall4.init)
44492+ *(.initcall5.init)
44493+ *(.initcall6.init)
44494+ *(.initcall7.init)
44495+ }
44496+ __initcall_end = .;
44497+ __con_initcall_start = .;
44498+ .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) {
44499+ *(.con_initcall.init)
44500+ }
44501+ __con_initcall_end = .;
44502+ SECURITY_INIT
44503+ . = ALIGN(8);
44504+ __alt_instructions = .;
44505+ .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) {
44506+ *(.altinstructions)
44507+ }
44508+ __alt_instructions_end = .;
44509+ .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) {
44510+ *(.altinstr_replacement)
44511+ }
44512+ /* .exit.text is discard at runtime, not link time, to deal with references
44513+ from .altinstructions and .eh_frame */
44514+ .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { *(.exit.text) }
44515+ .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { *(.exit.data) }
44516+ . = ALIGN(4096);
44517+ __initramfs_start = .;
44518+ .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { *(.init.ramfs) }
44519+ __initramfs_end = .;
44520+ /* temporary here to work around NR_CPUS. If you see this comment in 2.6.17+
44521+ complain */
44522+ . = ALIGN(4096);
44523+ __init_end = .;
44524+ . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
44525+ __per_cpu_start = .;
44526+ .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) { *(.data.percpu) }
44527+ __per_cpu_end = .;
44528+
44529+ . = ALIGN(4096);
44530+ __nosave_begin = .;
44531+ .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { *(.data.nosave) }
44532+ . = ALIGN(4096);
44533+ __nosave_end = .;
44534+
44535+ __bss_start = .; /* BSS */
44536+ . = ALIGN(4096);
44537+ .bss : AT(ADDR(.bss) - LOAD_OFFSET) {
44538+ *(.bss.page_aligned)
44539+ *(.bss)
44540+ }
44541+ __bss_stop = .;
44542+
44543+ _end = . ;
44544+
44545+ /* Sections to be discarded */
44546+ /DISCARD/ : {
44547+ *(.exitcall.exit)
44548+#ifndef CONFIG_UNWIND_INFO
44549+ *(.eh_frame)
44550+#endif
44551+ }
44552+
44553+ STABS_DEBUG
44554+
44555+ DWARF_DEBUG
44556+
44557+ NOTES
44558+}
44559diff -Nur linux-2.6.16.33-noxen/arch/x86_64/kernel/vsyscall-xen.c linux-2.6.16.33/arch/x86_64/kernel/vsyscall-xen.c
44560--- linux-2.6.16.33-noxen/arch/x86_64/kernel/vsyscall-xen.c 1970-01-01 00:00:00.000000000 +0000
44561+++ linux-2.6.16.33/arch/x86_64/kernel/vsyscall-xen.c 2007-01-08 15:00:45.000000000 +0000
44562@@ -0,0 +1,239 @@
44563+/*
44564+ * linux/arch/x86_64/kernel/vsyscall.c
44565+ *
44566+ * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE
44567+ * Copyright 2003 Andi Kleen, SuSE Labs.
44568+ *
44569+ * Thanks to hpa@transmeta.com for some useful hint.
44570+ * Special thanks to Ingo Molnar for his early experience with
44571+ * a different vsyscall implementation for Linux/IA32 and for the name.
44572+ *
44573+ * vsyscall 1 is located at -10Mbyte, vsyscall 2 is located
44574+ * at virtual address -10Mbyte+1024bytes etc... There are at max 4
44575+ * vsyscalls. One vsyscall can reserve more than 1 slot to avoid
44576+ * jumping out of line if necessary. We cannot add more with this
44577+ * mechanism because older kernels won't return -ENOSYS.
44578+ * If we want more than four we need a vDSO.
44579+ *
44580+ * Note: the concept clashes with user mode linux. If you use UML and
44581+ * want per guest time just set the kernel.vsyscall64 sysctl to 0.
44582+ */
44583+
44584+#include <linux/time.h>
44585+#include <linux/init.h>
44586+#include <linux/kernel.h>
44587+#include <linux/timer.h>
44588+#include <linux/seqlock.h>
44589+#include <linux/jiffies.h>
44590+#include <linux/sysctl.h>
44591+
44592+#include <asm/vsyscall.h>
44593+#include <asm/pgtable.h>
44594+#include <asm/page.h>
44595+#include <asm/fixmap.h>
44596+#include <asm/errno.h>
44597+#include <asm/io.h>
44598+
44599+#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr)))
44600+
44601+int __sysctl_vsyscall __section_sysctl_vsyscall = 1;
44602+seqlock_t __xtime_lock __section_xtime_lock = SEQLOCK_UNLOCKED;
44603+
44604+#include <asm/unistd.h>
44605+
44606+static __always_inline void timeval_normalize(struct timeval * tv)
44607+{
44608+ time_t __sec;
44609+
44610+ __sec = tv->tv_usec / 1000000;
44611+ if (__sec) {
44612+ tv->tv_usec %= 1000000;
44613+ tv->tv_sec += __sec;
44614+ }
44615+}
44616+
44617+static __always_inline void do_vgettimeofday(struct timeval * tv)
44618+{
44619+ long sequence, t;
44620+ unsigned long sec, usec;
44621+
44622+ do {
44623+ sequence = read_seqbegin(&__xtime_lock);
44624+
44625+ sec = __xtime.tv_sec;
44626+ usec = (__xtime.tv_nsec / 1000) +
44627+ (__jiffies - __wall_jiffies) * (1000000 / HZ);
44628+
44629+ if (__vxtime.mode != VXTIME_HPET) {
44630+ t = get_cycles_sync();
44631+ if (t < __vxtime.last_tsc)
44632+ t = __vxtime.last_tsc;
44633+ usec += ((t - __vxtime.last_tsc) *
44634+ __vxtime.tsc_quot) >> 32;
44635+ /* See comment in x86_64 do_gettimeofday. */
44636+ } else {
44637+ usec += ((readl((void *)fix_to_virt(VSYSCALL_HPET) + 0xf0) -
44638+ __vxtime.last) * __vxtime.quot) >> 32;
44639+ }
44640+ } while (read_seqretry(&__xtime_lock, sequence));
44641+
44642+ tv->tv_sec = sec + usec / 1000000;
44643+ tv->tv_usec = usec % 1000000;
44644+}
44645+
44646+/* RED-PEN may want to readd seq locking, but then the variable should be write-once. */
44647+static __always_inline void do_get_tz(struct timezone * tz)
44648+{
44649+ *tz = __sys_tz;
44650+}
44651+
44652+static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz)
44653+{
44654+ int ret;
44655+ asm volatile("vsysc2: syscall"
44656+ : "=a" (ret)
44657+ : "0" (__NR_gettimeofday),"D" (tv),"S" (tz) : __syscall_clobber );
44658+ return ret;
44659+}
44660+
44661+static __always_inline long time_syscall(long *t)
44662+{
44663+ long secs;
44664+ asm volatile("vsysc1: syscall"
44665+ : "=a" (secs)
44666+ : "0" (__NR_time),"D" (t) : __syscall_clobber);
44667+ return secs;
44668+}
44669+
44670+int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz)
44671+{
44672+ if (unlikely(!__sysctl_vsyscall))
44673+ return gettimeofday(tv,tz);
44674+ if (tv)
44675+ do_vgettimeofday(tv);
44676+ if (tz)
44677+ do_get_tz(tz);
44678+ return 0;
44679+}
44680+
44681+/* This will break when the xtime seconds get inaccurate, but that is
44682+ * unlikely */
44683+time_t __vsyscall(1) vtime(time_t *t)
44684+{
44685+ if (unlikely(!__sysctl_vsyscall))
44686+ return time_syscall(t);
44687+ else if (t)
44688+ *t = __xtime.tv_sec;
44689+ return __xtime.tv_sec;
44690+}
44691+
44692+long __vsyscall(2) venosys_0(void)
44693+{
44694+ return -ENOSYS;
44695+}
44696+
44697+long __vsyscall(3) venosys_1(void)
44698+{
44699+ return -ENOSYS;
44700+}
44701+
44702+#ifdef CONFIG_SYSCTL
44703+
44704+#define SYSCALL 0x050f
44705+#define NOP2 0x9090
44706+
44707+/*
44708+ * NOP out syscall in vsyscall page when not needed.
44709+ */
44710+static int vsyscall_sysctl_change(ctl_table *ctl, int write, struct file * filp,
44711+ void __user *buffer, size_t *lenp, loff_t *ppos)
44712+{
44713+ extern u16 vsysc1, vsysc2;
44714+ u16 *map1, *map2;
44715+ int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
44716+ if (!write)
44717+ return ret;
44718+ /* gcc has some trouble with __va(__pa()), so just do it this
44719+ way. */
44720+ map1 = ioremap(__pa_symbol(&vsysc1), 2);
44721+ if (!map1)
44722+ return -ENOMEM;
44723+ map2 = ioremap(__pa_symbol(&vsysc2), 2);
44724+ if (!map2) {
44725+ ret = -ENOMEM;
44726+ goto out;
44727+ }
44728+ if (!sysctl_vsyscall) {
44729+ *map1 = SYSCALL;
44730+ *map2 = SYSCALL;
44731+ } else {
44732+ *map1 = NOP2;
44733+ *map2 = NOP2;
44734+ }
44735+ iounmap(map2);
44736+out:
44737+ iounmap(map1);
44738+ return ret;
44739+}
44740+
44741+static int vsyscall_sysctl_nostrat(ctl_table *t, int __user *name, int nlen,
44742+ void __user *oldval, size_t __user *oldlenp,
44743+ void __user *newval, size_t newlen,
44744+ void **context)
44745+{
44746+ return -ENOSYS;
44747+}
44748+
44749+static ctl_table kernel_table2[] = {
44750+ { .ctl_name = 99, .procname = "vsyscall64",
44751+ .data = &sysctl_vsyscall, .maxlen = sizeof(int), .mode = 0644,
44752+ .strategy = vsyscall_sysctl_nostrat,
44753+ .proc_handler = vsyscall_sysctl_change },
44754+ { 0, }
44755+};
44756+
44757+static ctl_table kernel_root_table2[] = {
44758+ { .ctl_name = CTL_KERN, .procname = "kernel", .mode = 0555,
44759+ .child = kernel_table2 },
44760+ { 0 },
44761+};
44762+
44763+#endif
44764+
44765+static void __init map_vsyscall(void)
44766+{
44767+ extern char __vsyscall_0;
44768+ unsigned long physaddr_page0 = __pa_symbol(&__vsyscall_0);
44769+
44770+ __set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_page0, PAGE_KERNEL_VSYSCALL);
44771+}
44772+
44773+#ifdef CONFIG_XEN
44774+static void __init map_vsyscall_user(void)
44775+{
44776+ extern void __set_fixmap_user(enum fixed_addresses, unsigned long, pgprot_t);
44777+ extern char __vsyscall_0;
44778+ unsigned long physaddr_page0 = __pa_symbol(&__vsyscall_0);
44779+
44780+ __set_fixmap_user(VSYSCALL_FIRST_PAGE, physaddr_page0, PAGE_KERNEL_VSYSCALL);
44781+}
44782+#endif
44783+
44784+static int __init vsyscall_init(void)
44785+{
44786+ BUG_ON(((unsigned long) &vgettimeofday !=
44787+ VSYSCALL_ADDR(__NR_vgettimeofday)));
44788+ BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime));
44789+ BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE)));
44790+ map_vsyscall();
44791+#ifdef CONFIG_XEN
44792+ map_vsyscall_user();
44793+	sysctl_vsyscall = 0; /* disable vgettimeofday() */
44794+#endif
44795+#ifdef CONFIG_SYSCTL
44796+ register_sysctl_table(kernel_root_table2, 0);
44797+#endif
44798+ return 0;
44799+}
44800+
44801+__initcall(vsyscall_init);
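For context on the file above: the vsyscall page is linked at the fixed address -10 MB and exposes one 1024-byte slot per entry point, so userspace reaches vgettimeofday()/vtime() by calling those addresses directly. Under Xen the page is additionally mapped into the user fixmap and sysctl_vsyscall is forced to 0, so both entry points fall back to the real syscalls. A hedged userspace sketch of the calling convention (illustrative only; the VSYSCALL_ADDR arithmetic mirrors the layout described in the header comment, and the typedefs are assumptions):

#include <sys/time.h>
#include <time.h>

#define VSYSCALL_START		(-10UL << 20)		/* 0xffffffffff600000, i.e. -10 MB */
#define VSYSCALL_SIZE		1024			/* one slot per vsyscall */
#define VSYSCALL_ADDR(nr)	(VSYSCALL_START + (nr) * VSYSCALL_SIZE)

typedef int    (*vgettimeofday_fn)(struct timeval *, struct timezone *);
typedef time_t (*vtime_fn)(time_t *);

int example(void)
{
	struct timeval tv;
	vgettimeofday_fn vgtod = (vgettimeofday_fn)VSYSCALL_ADDR(0);	/* slot 0 */
	vtime_fn vtime = (vtime_fn)VSYSCALL_ADDR(1);			/* slot 1 */

	vgtod(&tv, NULL);		/* fast gettimeofday, or syscall fallback on Xen */
	return (int)vtime(NULL);	/* seconds since the epoch */
}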
44802diff -Nur linux-2.6.16.33-noxen/arch/x86_64/kernel/x8664_ksyms-xen.c linux-2.6.16.33/arch/x86_64/kernel/x8664_ksyms-xen.c
44803--- linux-2.6.16.33-noxen/arch/x86_64/kernel/x8664_ksyms-xen.c 1970-01-01 00:00:00.000000000 +0000
44804+++ linux-2.6.16.33/arch/x86_64/kernel/x8664_ksyms-xen.c 2007-01-08 15:00:45.000000000 +0000
44805@@ -0,0 +1,163 @@
44806+#include <linux/config.h>
44807+#include <linux/module.h>
44808+#include <linux/smp.h>
44809+#include <linux/user.h>
44810+#include <linux/sched.h>
44811+#include <linux/in6.h>
44812+#include <linux/interrupt.h>
44813+#include <linux/smp_lock.h>
44814+#include <linux/pm.h>
44815+#include <linux/pci.h>
44816+#include <linux/apm_bios.h>
44817+#include <linux/kernel.h>
44818+#include <linux/string.h>
44819+#include <linux/syscalls.h>
44820+#include <linux/tty.h>
44821+
44822+#include <asm/semaphore.h>
44823+#include <asm/processor.h>
44824+#include <asm/i387.h>
44825+#include <asm/uaccess.h>
44826+#include <asm/checksum.h>
44827+#include <asm/io.h>
44828+#include <asm/delay.h>
44829+#include <asm/irq.h>
44830+#include <asm/mmx.h>
44831+#include <asm/desc.h>
44832+#include <asm/pgtable.h>
44833+#include <asm/pgalloc.h>
44834+#include <asm/nmi.h>
44835+#include <asm/kdebug.h>
44836+#include <asm/unistd.h>
44837+#include <asm/tlbflush.h>
44838+#include <asm/kdebug.h>
44839+
44840+#ifdef CONFIG_SMP
44841+extern void __write_lock_failed(rwlock_t *rw);
44842+extern void __read_lock_failed(rwlock_t *rw);
44843+#endif
44844+
44845+/* platform dependent support */
44846+EXPORT_SYMBOL(boot_cpu_data);
44847+//EXPORT_SYMBOL(dump_fpu);
44848+EXPORT_SYMBOL(kernel_thread);
44849+EXPORT_SYMBOL(pm_idle);
44850+EXPORT_SYMBOL(pm_power_off);
44851+
44852+EXPORT_SYMBOL(__down_failed);
44853+EXPORT_SYMBOL(__down_failed_interruptible);
44854+EXPORT_SYMBOL(__down_failed_trylock);
44855+EXPORT_SYMBOL(__up_wakeup);
44856+/* Networking helper routines. */
44857+EXPORT_SYMBOL(csum_partial_copy_nocheck);
44858+EXPORT_SYMBOL(ip_compute_csum);
44859+/* Delay loops */
44860+EXPORT_SYMBOL(__udelay);
44861+EXPORT_SYMBOL(__ndelay);
44862+EXPORT_SYMBOL(__delay);
44863+EXPORT_SYMBOL(__const_udelay);
44864+
44865+EXPORT_SYMBOL(__get_user_1);
44866+EXPORT_SYMBOL(__get_user_2);
44867+EXPORT_SYMBOL(__get_user_4);
44868+EXPORT_SYMBOL(__get_user_8);
44869+EXPORT_SYMBOL(__put_user_1);
44870+EXPORT_SYMBOL(__put_user_2);
44871+EXPORT_SYMBOL(__put_user_4);
44872+EXPORT_SYMBOL(__put_user_8);
44873+
44874+EXPORT_SYMBOL(strncpy_from_user);
44875+EXPORT_SYMBOL(__strncpy_from_user);
44876+EXPORT_SYMBOL(clear_user);
44877+EXPORT_SYMBOL(__clear_user);
44878+EXPORT_SYMBOL(copy_user_generic);
44879+EXPORT_SYMBOL(copy_from_user);
44880+EXPORT_SYMBOL(copy_to_user);
44881+EXPORT_SYMBOL(copy_in_user);
44882+EXPORT_SYMBOL(strnlen_user);
44883+
44884+#ifdef CONFIG_PCI
44885+EXPORT_SYMBOL(pci_mem_start);
44886+#endif
44887+
44888+EXPORT_SYMBOL(copy_page);
44889+EXPORT_SYMBOL(clear_page);
44890+
44891+EXPORT_SYMBOL(_cpu_pda);
44892+#ifdef CONFIG_SMP
44893+EXPORT_SYMBOL(__write_lock_failed);
44894+EXPORT_SYMBOL(__read_lock_failed);
44895+
44896+EXPORT_SYMBOL(smp_call_function);
44897+#endif
44898+
44899+#ifdef CONFIG_VT
44900+EXPORT_SYMBOL(screen_info);
44901+#endif
44902+
44903+EXPORT_SYMBOL(get_wchan);
44904+
44905+#ifdef CONFIG_X86_LOCAL_APIC
44906+EXPORT_SYMBOL_GPL(set_nmi_callback);
44907+EXPORT_SYMBOL_GPL(unset_nmi_callback);
44908+#endif
44909+
44910+/* Export string functions. We normally rely on gcc builtin for most of these,
44911+ but gcc sometimes decides not to inline them. */
44912+#undef memcpy
44913+#undef memset
44914+#undef memmove
44915+#undef strlen
44916+
44917+extern void * memset(void *,int,__kernel_size_t);
44918+extern size_t strlen(const char *);
44919+extern void * memmove(void * dest,const void *src,size_t count);
44920+extern void * memcpy(void *,const void *,__kernel_size_t);
44921+extern void * __memcpy(void *,const void *,__kernel_size_t);
44922+
44923+EXPORT_SYMBOL(memset);
44924+EXPORT_SYMBOL(strlen);
44925+EXPORT_SYMBOL(memmove);
44926+EXPORT_SYMBOL(memcpy);
44927+EXPORT_SYMBOL(__memcpy);
44928+
44929+#ifdef CONFIG_RWSEM_XCHGADD_ALGORITHM
44930+/* prototypes are wrong, these are assembly with custom calling functions */
44931+extern void rwsem_down_read_failed_thunk(void);
44932+extern void rwsem_wake_thunk(void);
44933+extern void rwsem_downgrade_thunk(void);
44934+extern void rwsem_down_write_failed_thunk(void);
44935+EXPORT_SYMBOL(rwsem_down_read_failed_thunk);
44936+EXPORT_SYMBOL(rwsem_wake_thunk);
44937+EXPORT_SYMBOL(rwsem_downgrade_thunk);
44938+EXPORT_SYMBOL(rwsem_down_write_failed_thunk);
44939+#endif
44940+
44941+EXPORT_SYMBOL(empty_zero_page);
44942+
44943+EXPORT_SYMBOL(die_chain);
44944+EXPORT_SYMBOL(register_die_notifier);
44945+
44946+#ifdef CONFIG_SMP
44947+EXPORT_SYMBOL(cpu_sibling_map);
44948+EXPORT_SYMBOL(smp_num_siblings);
44949+#endif
44950+
44951+extern void do_softirq_thunk(void);
44952+EXPORT_SYMBOL(do_softirq_thunk);
44953+
44954+#ifdef CONFIG_BUG
44955+EXPORT_SYMBOL(out_of_line_bug);
44956+#endif
44957+
44958+EXPORT_SYMBOL(init_level4_pgt);
44959+
44960+extern unsigned long __supported_pte_mask;
44961+EXPORT_SYMBOL(__supported_pte_mask);
44962+
44963+#ifdef CONFIG_SMP
44964+EXPORT_SYMBOL(flush_tlb_page);
44965+#endif
44966+
44967+EXPORT_SYMBOL(load_gs_index);
44968+
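The export list above defines nothing itself; it only makes the named symbols visible to loadable modules. A throwaway sketch of a module consuming two of these exports (illustrative only; the module name is made up):

#include <linux/module.h>
#include <linux/gfp.h>
#include <asm/page.h>

static int __init ksyms_demo_init(void)
{
	unsigned long src = __get_free_page(GFP_KERNEL);
	unsigned long dst = __get_free_page(GFP_KERNEL);

	if (!src || !dst) {
		free_page(src);
		free_page(dst);
		return -ENOMEM;
	}
	clear_page((void *)src);		/* resolved via EXPORT_SYMBOL(clear_page) */
	copy_page((void *)dst, (void *)src);	/* resolved via EXPORT_SYMBOL(copy_page) */
	free_page(src);
	free_page(dst);
	return 0;
}

static void __exit ksyms_demo_exit(void)
{
}

module_init(ksyms_demo_init);
module_exit(ksyms_demo_exit);
MODULE_LICENSE("GPL");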
44969diff -Nur linux-2.6.16.33-noxen/arch/x86_64/kernel/xen_entry.S linux-2.6.16.33/arch/x86_64/kernel/xen_entry.S
44970--- linux-2.6.16.33-noxen/arch/x86_64/kernel/xen_entry.S 1970-01-01 00:00:00.000000000 +0000
44971+++ linux-2.6.16.33/arch/x86_64/kernel/xen_entry.S 2007-01-08 15:00:45.000000000 +0000
44972@@ -0,0 +1,40 @@
44973+/*
44974+ * Copied from arch/xen/i386/kernel/entry.S
44975+ */
44976+/* Offsets into shared_info_t. */
44977+#define evtchn_upcall_pending /* 0 */
44978+#define evtchn_upcall_mask 1
44979+
44980+#define sizeof_vcpu_shift 6
44981+
44982+#ifdef CONFIG_SMP
44983+//#define preempt_disable(reg) incl threadinfo_preempt_count(reg)
44984+//#define preempt_enable(reg) decl threadinfo_preempt_count(reg)
44985+#define preempt_disable(reg)
44986+#define preempt_enable(reg)
44987+#define XEN_GET_VCPU_INFO(reg) preempt_disable(%rbp) ; \
44988+ movq %gs:pda_cpunumber,reg ; \
44989+ shl $32, reg ; \
44990+ shr $32-sizeof_vcpu_shift,reg ; \
44991+ addq HYPERVISOR_shared_info,reg
44992+#define XEN_PUT_VCPU_INFO(reg) preempt_enable(%rbp) ; \
44993+#define XEN_PUT_VCPU_INFO_fixup .byte 0xff,0xff,0xff
44994+#else
44995+#define XEN_GET_VCPU_INFO(reg) movq HYPERVISOR_shared_info,reg
44996+#define XEN_PUT_VCPU_INFO(reg)
44997+#define XEN_PUT_VCPU_INFO_fixup
44998+#endif
44999+
45000+#define XEN_LOCKED_BLOCK_EVENTS(reg) movb $1,evtchn_upcall_mask(reg)
45001+#define XEN_LOCKED_UNBLOCK_EVENTS(reg) movb $0,evtchn_upcall_mask(reg)
45002+#define XEN_BLOCK_EVENTS(reg) XEN_GET_VCPU_INFO(reg) ; \
45003+ XEN_LOCKED_BLOCK_EVENTS(reg) ; \
45004+ XEN_PUT_VCPU_INFO(reg)
45005+#define XEN_UNBLOCK_EVENTS(reg) XEN_GET_VCPU_INFO(reg) ; \
45006+ XEN_LOCKED_UNBLOCK_EVENTS(reg) ; \
45007+ XEN_PUT_VCPU_INFO(reg)
45008+#define XEN_TEST_PENDING(reg) testb $0xFF,evtchn_upcall_pending(reg)
45009+
45010+VGCF_IN_SYSCALL = (1<<8)
45011+
45012+
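The assembly macros above implement the paravirtual equivalent of cli/sti: event (interrupt) delivery is masked by writing the per-VCPU evtchn_upcall_mask byte in the shared info page. The same idea in C, as a rough sketch (the vcpu_info layout and HYPERVISOR_shared_info pointer match the code elsewhere in this patch; force_evtchn_callback() is named here as an assumption, and the barrier placement is simplified):

static inline void example_block_events(void)
{
	vcpu_info_t *v = &HYPERVISOR_shared_info->vcpu_info[smp_processor_id()];

	v->evtchn_upcall_mask = 1;		/* like cli(): no upcalls delivered */
	barrier();
}

static inline void example_unblock_events(void)
{
	vcpu_info_t *v = &HYPERVISOR_shared_info->vcpu_info[smp_processor_id()];

	barrier();
	v->evtchn_upcall_mask = 0;		/* like sti(): upcalls may arrive again */
	if (v->evtchn_upcall_pending)		/* something fired while we were masked */
		force_evtchn_callback();	/* poke the hypervisor to deliver it now */
}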
45013diff -Nur linux-2.6.16.33-noxen/arch/x86_64/mm/Makefile linux-2.6.16.33/arch/x86_64/mm/Makefile
45014--- linux-2.6.16.33-noxen/arch/x86_64/mm/Makefile 2006-11-22 18:06:31.000000000 +0000
45015+++ linux-2.6.16.33/arch/x86_64/mm/Makefile 2007-01-08 15:00:45.000000000 +0000
45016@@ -9,3 +9,13 @@
45017 obj-$(CONFIG_ACPI_NUMA) += srat.o
45018
45019 hugetlbpage-y = ../../i386/mm/hugetlbpage.o
45020+
45021+ifdef CONFIG_XEN
45022+include $(srctree)/scripts/Makefile.xen
45023+
45024+ioremap-y += ../../i386/mm/ioremap-xen.o
45025+hypervisor-y += ../../i386/mm/hypervisor.o
45026+obj-y += hypervisor.o
45027+
45028+obj-y := $(call cherrypickxen, $(obj-y))
45029+endif
45030diff -Nur linux-2.6.16.33-noxen/arch/x86_64/mm/fault-xen.c linux-2.6.16.33/arch/x86_64/mm/fault-xen.c
45031--- linux-2.6.16.33-noxen/arch/x86_64/mm/fault-xen.c 1970-01-01 00:00:00.000000000 +0000
45032+++ linux-2.6.16.33/arch/x86_64/mm/fault-xen.c 2007-01-08 15:00:45.000000000 +0000
45033@@ -0,0 +1,641 @@
45034+/*
45035+ * linux/arch/x86-64/mm/fault.c
45036+ *
45037+ * Copyright (C) 1995 Linus Torvalds
45038+ * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs.
45039+ */
45040+
45041+#include <linux/config.h>
45042+#include <linux/signal.h>
45043+#include <linux/sched.h>
45044+#include <linux/kernel.h>
45045+#include <linux/errno.h>
45046+#include <linux/string.h>
45047+#include <linux/types.h>
45048+#include <linux/ptrace.h>
45049+#include <linux/mman.h>
45050+#include <linux/mm.h>
45051+#include <linux/smp.h>
45052+#include <linux/smp_lock.h>
45053+#include <linux/interrupt.h>
45054+#include <linux/init.h>
45055+#include <linux/tty.h>
45056+#include <linux/vt_kern.h> /* For unblank_screen() */
45057+#include <linux/compiler.h>
45058+#include <linux/module.h>
45059+#include <linux/kprobes.h>
45060+
45061+#include <asm/system.h>
45062+#include <asm/uaccess.h>
45063+#include <asm/pgalloc.h>
45064+#include <asm/smp.h>
45065+#include <asm/tlbflush.h>
45066+#include <asm/proto.h>
45067+#include <asm/kdebug.h>
45068+#include <asm-generic/sections.h>
45069+
45070+/* Page fault error code bits */
45071+#define PF_PROT (1<<0) /* or no page found */
45072+#define PF_WRITE (1<<1)
45073+#define PF_USER (1<<2)
45074+#define PF_RSVD (1<<3)
45075+#define PF_INSTR (1<<4)
45076+
45077+void bust_spinlocks(int yes)
45078+{
45079+ int loglevel_save = console_loglevel;
45080+ if (yes) {
45081+ oops_in_progress = 1;
45082+ } else {
45083+#ifdef CONFIG_VT
45084+ unblank_screen();
45085+#endif
45086+ oops_in_progress = 0;
45087+ /*
45088+ * OK, the message is on the console. Now we call printk()
45089+ * without oops_in_progress set so that printk will give klogd
45090+ * a poke. Hold onto your hats...
45091+ */
45092+ console_loglevel = 15; /* NMI oopser may have shut the console up */
45093+ printk(" ");
45094+ console_loglevel = loglevel_save;
45095+ }
45096+}
45097+
45098+/* Sometimes the CPU reports invalid exceptions on prefetch.
45099+ Check that here and ignore.
45100+ Opcode checker based on code by Richard Brunner */
45101+static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr,
45102+ unsigned long error_code)
45103+{
45104+ unsigned char *instr;
45105+ int scan_more = 1;
45106+ int prefetch = 0;
45107+ unsigned char *max_instr;
45108+
45109+	/* If it was an exec fault, ignore it */
45110+ if (error_code & PF_INSTR)
45111+ return 0;
45112+
45113+ instr = (unsigned char *)convert_rip_to_linear(current, regs);
45114+ max_instr = instr + 15;
45115+
45116+ if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
45117+ return 0;
45118+
45119+ while (scan_more && instr < max_instr) {
45120+ unsigned char opcode;
45121+ unsigned char instr_hi;
45122+ unsigned char instr_lo;
45123+
45124+ if (__get_user(opcode, instr))
45125+ break;
45126+
45127+ instr_hi = opcode & 0xf0;
45128+ instr_lo = opcode & 0x0f;
45129+ instr++;
45130+
45131+ switch (instr_hi) {
45132+ case 0x20:
45133+ case 0x30:
45134+ /* Values 0x26,0x2E,0x36,0x3E are valid x86
45135+ prefixes. In long mode, the CPU will signal
45136+ invalid opcode if some of these prefixes are
45137+ present so we will never get here anyway */
45138+ scan_more = ((instr_lo & 7) == 0x6);
45139+ break;
45140+
45141+ case 0x40:
45142+ /* In AMD64 long mode, 0x40 to 0x4F are valid REX prefixes
45143+ Need to figure out under what instruction mode the
45144+ instruction was issued ... */
45145+ /* Could check the LDT for lm, but for now it's good
45146+ enough to assume that long mode only uses well known
45147+ segments or kernel. */
45148+ scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS);
45149+ break;
45150+
45151+ case 0x60:
45152+ /* 0x64 thru 0x67 are valid prefixes in all modes. */
45153+ scan_more = (instr_lo & 0xC) == 0x4;
45154+ break;
45155+ case 0xF0:
45156+ /* 0xF0, 0xF2, and 0xF3 are valid prefixes in all modes. */
45157+ scan_more = !instr_lo || (instr_lo>>1) == 1;
45158+ break;
45159+ case 0x00:
45160+ /* Prefetch instruction is 0x0F0D or 0x0F18 */
45161+ scan_more = 0;
45162+ if (__get_user(opcode, instr))
45163+ break;
45164+ prefetch = (instr_lo == 0xF) &&
45165+ (opcode == 0x0D || opcode == 0x18);
45166+ break;
45167+ default:
45168+ scan_more = 0;
45169+ break;
45170+ }
45171+ }
45172+ return prefetch;
45173+}
45174+
45175+static int bad_address(void *p)
45176+{
45177+ unsigned long dummy;
45178+ return __get_user(dummy, (unsigned long *)p);
45179+}
45180+
45181+void dump_pagetable(unsigned long address)
45182+{
45183+ pgd_t *pgd;
45184+ pud_t *pud;
45185+ pmd_t *pmd;
45186+ pte_t *pte;
45187+
45188+ pgd = __va(read_cr3() & PHYSICAL_PAGE_MASK);
45189+ pgd += pgd_index(address);
45190+ if (bad_address(pgd)) goto bad;
45191+ printk("PGD %lx ", pgd_val(*pgd));
45192+ if (!pgd_present(*pgd)) goto ret;
45193+
45194+ pud = __pud_offset_k((pud_t *)pgd_page(*pgd), address);
45195+ if (bad_address(pud)) goto bad;
45196+ printk("PUD %lx ", pud_val(*pud));
45197+ if (!pud_present(*pud)) goto ret;
45198+
45199+ pmd = pmd_offset(pud, address);
45200+ if (bad_address(pmd)) goto bad;
45201+ printk("PMD %lx ", pmd_val(*pmd));
45202+ if (!pmd_present(*pmd)) goto ret;
45203+
45204+ pte = pte_offset_kernel(pmd, address);
45205+ if (bad_address(pte)) goto bad;
45206+ printk("PTE %lx", pte_val(*pte));
45207+ret:
45208+ printk("\n");
45209+ return;
45210+bad:
45211+ printk("BAD\n");
45212+}
45213+
45214+static const char errata93_warning[] =
45215+KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
45216+KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
45217+KERN_ERR "******* Please consider a BIOS update.\n"
45218+KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";
45219+
45220+/* Workaround for K8 erratum #93 & buggy BIOS.
45221+ BIOS SMM functions are required to use a specific workaround
45222+ to avoid corruption of the 64bit RIP register on C stepping K8.
45223+ A lot of BIOS that didn't get tested properly miss this.
45224+ The OS sees this as a page fault with the upper 32bits of RIP cleared.
45225+ Try to work around it here.
45226+ Note we only handle faults in kernel here. */
45227+
45228+static int is_errata93(struct pt_regs *regs, unsigned long address)
45229+{
45230+ static int warned;
45231+ if (address != regs->rip)
45232+ return 0;
45233+ if ((address >> 32) != 0)
45234+ return 0;
45235+ address |= 0xffffffffUL << 32;
45236+ if ((address >= (u64)_stext && address <= (u64)_etext) ||
45237+ (address >= MODULES_VADDR && address <= MODULES_END)) {
45238+ if (!warned) {
45239+ printk(errata93_warning);
45240+ warned = 1;
45241+ }
45242+ regs->rip = address;
45243+ return 1;
45244+ }
45245+ return 0;
45246+}
45247+
45248+int unhandled_signal(struct task_struct *tsk, int sig)
45249+{
45250+ if (tsk->pid == 1)
45251+ return 1;
45252+ if (tsk->ptrace & PT_PTRACED)
45253+ return 0;
45254+ return (tsk->sighand->action[sig-1].sa.sa_handler == SIG_IGN) ||
45255+ (tsk->sighand->action[sig-1].sa.sa_handler == SIG_DFL);
45256+}
45257+
45258+static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
45259+ unsigned long error_code)
45260+{
45261+ unsigned long flags = oops_begin();
45262+ struct task_struct *tsk;
45263+
45264+ printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
45265+ current->comm, address);
45266+ dump_pagetable(address);
45267+ tsk = current;
45268+ tsk->thread.cr2 = address;
45269+ tsk->thread.trap_no = 14;
45270+ tsk->thread.error_code = error_code;
45271+ __die("Bad pagetable", regs, error_code);
45272+ oops_end(flags);
45273+ do_exit(SIGKILL);
45274+}
45275+
45276+/*
45277+ * Handle a fault on the vmalloc area
45278+ *
45279+ * This assumes no large pages in there.
45280+ */
45281+static int vmalloc_fault(unsigned long address)
45282+{
45283+ pgd_t *pgd, *pgd_ref;
45284+ pud_t *pud, *pud_ref;
45285+ pmd_t *pmd, *pmd_ref;
45286+ pte_t *pte, *pte_ref;
45287+
45288+ /* Copy kernel mappings over when needed. This can also
45289+	   happen within a race in page table update. In the latter
45290+ case just flush. */
45291+
45292+ /* On Xen the line below does not always work. Needs investigating! */
45293+ /*pgd = pgd_offset(current->mm ?: &init_mm, address);*/
45294+ pgd = __va(read_cr3() & PHYSICAL_PAGE_MASK);
45295+ pgd += pgd_index(address);
45296+ pgd_ref = pgd_offset_k(address);
45297+ if (pgd_none(*pgd_ref))
45298+ return -1;
45299+ if (pgd_none(*pgd))
45300+ set_pgd(pgd, *pgd_ref);
45301+
45302+ /* Below here mismatches are bugs because these lower tables
45303+ are shared */
45304+
45305+ pud = pud_offset(pgd, address);
45306+ pud_ref = pud_offset(pgd_ref, address);
45307+ if (pud_none(*pud_ref))
45308+ return -1;
45309+ if (pud_none(*pud) || pud_page(*pud) != pud_page(*pud_ref))
45310+ BUG();
45311+ pmd = pmd_offset(pud, address);
45312+ pmd_ref = pmd_offset(pud_ref, address);
45313+ if (pmd_none(*pmd_ref))
45314+ return -1;
45315+ if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
45316+ BUG();
45317+ pte_ref = pte_offset_kernel(pmd_ref, address);
45318+ if (!pte_present(*pte_ref))
45319+ return -1;
45320+ pte = pte_offset_kernel(pmd, address);
45321+ /* Don't use pte_page here, because the mappings can point
45322+ outside mem_map, and the NUMA hash lookup cannot handle
45323+ that. */
45324+ if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
45325+ BUG();
45326+ return 0;
45327+}
45328+
45329+int page_fault_trace = 0;
45330+int exception_trace = 1;
45331+
45332+
45333+#define MEM_VERBOSE 1
45334+
45335+#ifdef MEM_VERBOSE
45336+#define MEM_LOG(_f, _a...) \
45337+ printk("fault.c:[%d]-> " _f "\n", \
45338+ __LINE__ , ## _a )
45339+#else
45340+#define MEM_LOG(_f, _a...) ((void)0)
45341+#endif
45342+
45343+static int spurious_fault(struct pt_regs *regs,
45344+ unsigned long address,
45345+ unsigned long error_code)
45346+{
45347+ pgd_t *pgd;
45348+ pud_t *pud;
45349+ pmd_t *pmd;
45350+ pte_t *pte;
45351+
45352+#ifdef CONFIG_XEN
45353+ /* Faults in hypervisor area are never spurious. */
45354+ if ((address >= HYPERVISOR_VIRT_START) &&
45355+ (address < HYPERVISOR_VIRT_END))
45356+ return 0;
45357+#endif
45358+
45359+ /* Reserved-bit violation or user access to kernel space? */
45360+ if (error_code & (PF_RSVD|PF_USER))
45361+ return 0;
45362+
45363+ pgd = init_mm.pgd + pgd_index(address);
45364+ if (!pgd_present(*pgd))
45365+ return 0;
45366+
45367+ pud = pud_offset(pgd, address);
45368+ if (!pud_present(*pud))
45369+ return 0;
45370+
45371+ pmd = pmd_offset(pud, address);
45372+ if (!pmd_present(*pmd))
45373+ return 0;
45374+
45375+ pte = pte_offset_kernel(pmd, address);
45376+ if (!pte_present(*pte))
45377+ return 0;
45378+ if ((error_code & PF_WRITE) && !pte_write(*pte))
45379+ return 0;
45380+ if ((error_code & PF_INSTR) && (pte_val(*pte) & _PAGE_NX))
45381+ return 0;
45382+
45383+ return 1;
45384+}
45385+
45386+/*
45387+ * This routine handles page faults. It determines the address,
45388+ * and the problem, and then passes it off to one of the appropriate
45389+ * routines.
45390+ */
45391+asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
45392+ unsigned long error_code)
45393+{
45394+ struct task_struct *tsk;
45395+ struct mm_struct *mm;
45396+ struct vm_area_struct * vma;
45397+ unsigned long address;
45398+ const struct exception_table_entry *fixup;
45399+ int write;
45400+ unsigned long flags;
45401+ siginfo_t info;
45402+
45403+ if (!user_mode(regs))
45404+ error_code &= ~PF_USER; /* means kernel */
45405+
45406+ /* get the address */
45407+ address = HYPERVISOR_shared_info->vcpu_info[
45408+ smp_processor_id()].arch.cr2;
45409+ if (notify_die(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
45410+ SIGSEGV) == NOTIFY_STOP)
45411+ return;
45412+
45413+ if (likely(regs->eflags & X86_EFLAGS_IF))
45414+ local_irq_enable();
45415+
45416+ if (unlikely(page_fault_trace))
45417+ printk("pagefault rip:%lx rsp:%lx cs:%lu ss:%lu address %lx error %lx\n",
45418+ regs->rip,regs->rsp,regs->cs,regs->ss,address,error_code);
45419+
45420+ tsk = current;
45421+ mm = tsk->mm;
45422+ info.si_code = SEGV_MAPERR;
45423+
45424+
45425+ /*
45426+ * We fault-in kernel-space virtual memory on-demand. The
45427+ * 'reference' page table is init_mm.pgd.
45428+ *
45429+ * NOTE! We MUST NOT take any locks for this case. We may
45430+ * be in an interrupt or a critical region, and should
45431+ * only copy the information from the master page table,
45432+ * nothing more.
45433+ *
45434+ * This verifies that the fault happens in kernel space
45435+ * (error_code & 4) == 0, and that the fault was not a
45436+ * protection error (error_code & 9) == 0.
45437+ */
45438+ if (unlikely(address >= TASK_SIZE64)) {
45439+ /*
45440+ * Don't check for the module range here: its PML4
45441+ * is always initialized because it's shared with the main
45442+ * kernel text. Only vmalloc may need PML4 syncups.
45443+ */
45444+ if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
45445+ ((address >= VMALLOC_START && address < VMALLOC_END))) {
45446+ if (vmalloc_fault(address) < 0)
45447+ goto bad_area_nosemaphore;
45448+ return;
45449+ }
45450+ /* Can take a spurious fault if mapping changes R/O -> R/W. */
45451+ if (spurious_fault(regs, address, error_code))
45452+ return;
45453+ /*
45454+ * Don't take the mm semaphore here. If we fixup a prefetch
45455+ * fault we could otherwise deadlock.
45456+ */
45457+ goto bad_area_nosemaphore;
45458+ }
45459+
45460+ if (unlikely(error_code & PF_RSVD))
45461+ pgtable_bad(address, regs, error_code);
45462+
45463+ /*
45464+ * If we're in an interrupt or have no user
45465+ * context, we must not take the fault..
45466+ */
45467+ if (unlikely(in_atomic() || !mm))
45468+ goto bad_area_nosemaphore;
45469+
45470+ again:
45471+ /* When running in the kernel we expect faults to occur only to
45472+ * addresses in user space. All other faults represent errors in the
45473+ * kernel and should generate an OOPS. Unfortunately, in the case of an
45474+ * erroneous fault occurring in a code path which already holds mmap_sem
45475+ * we will deadlock attempting to validate the fault against the
45476+ * address space. Luckily the kernel only validly references user
45477+ * space from well defined areas of code, which are listed in the
45478+ * exceptions table.
45479+ *
45480+ * As the vast majority of faults will be valid we will only perform
45481+ * the source reference check when there is a possibility of a deadlock.
45482+ * Attempt to lock the address space, if we cannot we then validate the
45483+ * source. If this is invalid we can skip the address space check,
45484+ * thus avoiding the deadlock.
45485+ */
45486+ if (!down_read_trylock(&mm->mmap_sem)) {
45487+ if ((error_code & PF_USER) == 0 &&
45488+ !search_exception_tables(regs->rip))
45489+ goto bad_area_nosemaphore;
45490+ down_read(&mm->mmap_sem);
45491+ }
45492+
45493+ vma = find_vma(mm, address);
45494+ if (!vma)
45495+ goto bad_area;
45496+ if (likely(vma->vm_start <= address))
45497+ goto good_area;
45498+ if (!(vma->vm_flags & VM_GROWSDOWN))
45499+ goto bad_area;
45500+ if (error_code & 4) {
45501+ // XXX: align red zone size with ABI
45502+ if (address + 128 < regs->rsp)
45503+ goto bad_area;
45504+ }
45505+ if (expand_stack(vma, address))
45506+ goto bad_area;
45507+/*
45508+ * Ok, we have a good vm_area for this memory access, so
45509+ * we can handle it..
45510+ */
45511+good_area:
45512+ info.si_code = SEGV_ACCERR;
45513+ write = 0;
45514+ switch (error_code & (PF_PROT|PF_WRITE)) {
45515+ default: /* 3: write, present */
45516+ /* fall through */
45517+ case PF_WRITE: /* write, not present */
45518+ if (!(vma->vm_flags & VM_WRITE))
45519+ goto bad_area;
45520+ write++;
45521+ break;
45522+ case PF_PROT: /* read, present */
45523+ goto bad_area;
45524+ case 0: /* read, not present */
45525+ if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
45526+ goto bad_area;
45527+ }
45528+
45529+ /*
45530+ * If for any reason at all we couldn't handle the fault,
45531+ * make sure we exit gracefully rather than endlessly redo
45532+ * the fault.
45533+ */
45534+ switch (handle_mm_fault(mm, vma, address, write)) {
45535+ case VM_FAULT_MINOR:
45536+ tsk->min_flt++;
45537+ break;
45538+ case VM_FAULT_MAJOR:
45539+ tsk->maj_flt++;
45540+ break;
45541+ case VM_FAULT_SIGBUS:
45542+ goto do_sigbus;
45543+ default:
45544+ goto out_of_memory;
45545+ }
45546+
45547+ up_read(&mm->mmap_sem);
45548+ return;
45549+
45550+/*
45551+ * Something tried to access memory that isn't in our memory map..
45552+ * Fix it, but check if it's kernel or user first..
45553+ */
45554+bad_area:
45555+ up_read(&mm->mmap_sem);
45556+
45557+bad_area_nosemaphore:
45558+ /* User mode accesses just cause a SIGSEGV */
45559+ if (error_code & PF_USER) {
45560+ if (is_prefetch(regs, address, error_code))
45561+ return;
45562+
45563+ /* Work around K8 erratum #100: K8 in compat mode
45564+ occasionally jumps to illegal addresses >4GB. We
45565+ catch this here in the page fault handler because
45566+ these addresses are not reachable. Just detect this
45567+ case and return. Any code segment in LDT is
45568+ compatibility mode. */
45569+ if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) &&
45570+ (address >> 32))
45571+ return;
45572+
45573+ if (exception_trace && unhandled_signal(tsk, SIGSEGV)) {
45574+ printk(
45575+ "%s%s[%d]: segfault at %016lx rip %016lx rsp %016lx error %lx\n",
45576+ tsk->pid > 1 ? KERN_INFO : KERN_EMERG,
45577+ tsk->comm, tsk->pid, address, regs->rip,
45578+ regs->rsp, error_code);
45579+ }
45580+
45581+ tsk->thread.cr2 = address;
45582+ /* Kernel addresses are always protection faults */
45583+ tsk->thread.error_code = error_code | (address >= TASK_SIZE);
45584+ tsk->thread.trap_no = 14;
45585+ info.si_signo = SIGSEGV;
45586+ info.si_errno = 0;
45587+ /* info.si_code has been set above */
45588+ info.si_addr = (void __user *)address;
45589+ force_sig_info(SIGSEGV, &info, tsk);
45590+ return;
45591+ }
45592+
45593+no_context:
45594+
45595+ /* Are we prepared to handle this kernel fault? */
45596+ fixup = search_exception_tables(regs->rip);
45597+ if (fixup) {
45598+ regs->rip = fixup->fixup;
45599+ return;
45600+ }
45601+
45602+ /*
45603+ * Hall of shame of CPU/BIOS bugs.
45604+ */
45605+
45606+ if (is_prefetch(regs, address, error_code))
45607+ return;
45608+
45609+ if (is_errata93(regs, address))
45610+ return;
45611+
45612+/*
45613+ * Oops. The kernel tried to access some bad page. We'll have to
45614+ * terminate things with extreme prejudice.
45615+ */
45616+
45617+ flags = oops_begin();
45618+
45619+ if (address < PAGE_SIZE)
45620+ printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference");
45621+ else
45622+ printk(KERN_ALERT "Unable to handle kernel paging request");
45623+ printk(" at %016lx RIP: \n" KERN_ALERT,address);
45624+ printk_address(regs->rip);
45625+ printk("\n");
45626+ dump_pagetable(address);
45627+ tsk->thread.cr2 = address;
45628+ tsk->thread.trap_no = 14;
45629+ tsk->thread.error_code = error_code;
45630+ __die("Oops", regs, error_code);
45631+ /* Executive summary in case the body of the oops scrolled away */
45632+ printk(KERN_EMERG "CR2: %016lx\n", address);
45633+ oops_end(flags);
45634+ do_exit(SIGKILL);
45635+
45636+/*
45637+ * We ran out of memory, or some other thing happened to us that made
45638+ * us unable to handle the page fault gracefully.
45639+ */
45640+out_of_memory:
45641+ up_read(&mm->mmap_sem);
45642+ if (current->pid == 1) {
45643+ yield();
45644+ goto again;
45645+ }
45646+ printk("VM: killing process %s\n", tsk->comm);
45647+ if (error_code & 4)
45648+ do_exit(SIGKILL);
45649+ goto no_context;
45650+
45651+do_sigbus:
45652+ up_read(&mm->mmap_sem);
45653+
45654+ /* Kernel mode? Handle exceptions or die */
45655+ if (!(error_code & PF_USER))
45656+ goto no_context;
45657+
45658+ tsk->thread.cr2 = address;
45659+ tsk->thread.error_code = error_code;
45660+ tsk->thread.trap_no = 14;
45661+ info.si_signo = SIGBUS;
45662+ info.si_errno = 0;
45663+ info.si_code = BUS_ADRERR;
45664+ info.si_addr = (void __user *)address;
45665+ force_sig_info(SIGBUS, &info, tsk);
45666+ return;
45667+}
45668+
45669+static int __init enable_pagefaulttrace(char *str)
45670+{
45671+ page_fault_trace = 1;
45672+ return 0;
45673+}
45674+__setup("pagefaulttrace", enable_pagefaulttrace);
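
As a rough sketch only (not part of the patch): do_page_fault() above branches on the hardware page-fault error code, and the PF_* masks it tests mirror the architectural bit layout (bit 0 protection, bit 1 write, bit 2 user, bit 3 reserved bit, bit 4 instruction fetch). The small user-space C program below exercises that decoding; the constant names follow the fault.c convention, the values are the architectural bit positions, and nothing here is taken verbatim from the kernel sources.

/*
 * Illustrative sketch of the x86 page-fault error-code bits decoded by
 * do_page_fault() above. Not part of the patch; PF_* values are the
 * architectural bit positions.
 */
#include <stdio.h>

#define PF_PROT  (1UL << 0)   /* fault on a present page (protection) */
#define PF_WRITE (1UL << 1)   /* write access */
#define PF_USER  (1UL << 2)   /* fault taken in user mode */
#define PF_RSVD  (1UL << 3)   /* reserved bit set in a paging entry */
#define PF_INSTR (1UL << 4)   /* instruction fetch (NX) */

static void decode_error_code(unsigned long ec)
{
	printf("%s %s access to a %s page%s\n",
	       (ec & PF_USER)  ? "user"  : "kernel",
	       (ec & PF_WRITE) ? "write" : (ec & PF_INSTR) ? "exec" : "read",
	       (ec & PF_PROT)  ? "present" : "not-present",
	       (ec & PF_RSVD)  ? ", reserved bit set" : "");
}

int main(void)
{
	decode_error_code(PF_USER | PF_WRITE);           /* demand/COW-style fault */
	decode_error_code(PF_PROT | PF_WRITE);           /* kernel write to R/O page */
	decode_error_code(PF_USER | PF_INSTR | PF_PROT); /* NX violation */
	return 0;
}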
45675diff -Nur linux-2.6.16.33-noxen/arch/x86_64/mm/init-xen.c linux-2.6.16.33/arch/x86_64/mm/init-xen.c
45676--- linux-2.6.16.33-noxen/arch/x86_64/mm/init-xen.c 1970-01-01 00:00:00.000000000 +0000
45677+++ linux-2.6.16.33/arch/x86_64/mm/init-xen.c 2007-01-08 15:00:45.000000000 +0000
45678@@ -0,0 +1,1162 @@
45679+/*
45680+ * linux/arch/x86_64/mm/init.c
45681+ *
45682+ * Copyright (C) 1995 Linus Torvalds
45683+ * Copyright (C) 2000 Pavel Machek <pavel@suse.cz>
45684+ * Copyright (C) 2002,2003 Andi Kleen <ak@suse.de>
45685+ *
45686+ * Jun Nakajima <jun.nakajima@intel.com>
45687+ * Modified for Xen.
45688+ */
45689+
45690+#include <linux/config.h>
45691+#include <linux/signal.h>
45692+#include <linux/sched.h>
45693+#include <linux/kernel.h>
45694+#include <linux/errno.h>
45695+#include <linux/string.h>
45696+#include <linux/types.h>
45697+#include <linux/ptrace.h>
45698+#include <linux/mman.h>
45699+#include <linux/mm.h>
45700+#include <linux/swap.h>
45701+#include <linux/smp.h>
45702+#include <linux/init.h>
45703+#include <linux/pagemap.h>
45704+#include <linux/bootmem.h>
45705+#include <linux/proc_fs.h>
45706+#include <linux/pci.h>
45707+#include <linux/dma-mapping.h>
45708+#include <linux/module.h>
45709+#include <linux/memory_hotplug.h>
45710+
45711+#include <asm/processor.h>
45712+#include <asm/system.h>
45713+#include <asm/uaccess.h>
45714+#include <asm/pgtable.h>
45715+#include <asm/pgalloc.h>
45716+#include <asm/dma.h>
45717+#include <asm/fixmap.h>
45718+#include <asm/e820.h>
45719+#include <asm/apic.h>
45720+#include <asm/tlb.h>
45721+#include <asm/mmu_context.h>
45722+#include <asm/proto.h>
45723+#include <asm/smp.h>
45724+#include <asm/sections.h>
45725+#include <asm/dma-mapping.h>
45726+#include <asm/swiotlb.h>
45727+
45728+#include <xen/features.h>
45729+
45730+#ifndef Dprintk
45731+#define Dprintk(x...)
45732+#endif
45733+
45734+struct dma_mapping_ops* dma_ops;
45735+EXPORT_SYMBOL(dma_ops);
45736+
45737+#ifdef CONFIG_XEN_COMPAT_030002
45738+unsigned int __kernel_page_user;
45739+EXPORT_SYMBOL(__kernel_page_user);
45740+#endif
45741+
45742+extern unsigned long *contiguous_bitmap;
45743+
45744+static unsigned long dma_reserve __initdata;
45745+
45746+DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
45747+extern unsigned long start_pfn;
45748+
45749+/*
45750+ * Use this until direct mapping is established, i.e. before __va() is
45751+ * available in init_memory_mapping().
45752+ */
45753+
45754+#define addr_to_page(addr, page) \
45755+ (addr) &= PHYSICAL_PAGE_MASK; \
45756+ (page) = ((unsigned long *) ((unsigned long) \
45757+ (((mfn_to_pfn((addr) >> PAGE_SHIFT)) << PAGE_SHIFT) + \
45758+ __START_KERNEL_map)))
45759+
45760+static void early_make_page_readonly(void *va, unsigned int feature)
45761+{
45762+ unsigned long addr, _va = (unsigned long)va;
45763+ pte_t pte, *ptep;
45764+ unsigned long *page = (unsigned long *) init_level4_pgt;
45765+
45766+ if (xen_feature(feature))
45767+ return;
45768+
45769+ addr = (unsigned long) page[pgd_index(_va)];
45770+ addr_to_page(addr, page);
45771+
45772+ addr = page[pud_index(_va)];
45773+ addr_to_page(addr, page);
45774+
45775+ addr = page[pmd_index(_va)];
45776+ addr_to_page(addr, page);
45777+
45778+ ptep = (pte_t *) &page[pte_index(_va)];
45779+
45780+ pte.pte = ptep->pte & ~_PAGE_RW;
45781+ if (HYPERVISOR_update_va_mapping(_va, pte, 0))
45782+ BUG();
45783+}
45784+
45785+void make_page_readonly(void *va, unsigned int feature)
45786+{
45787+ pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t pte, *ptep;
45788+ unsigned long addr = (unsigned long) va;
45789+
45790+ if (xen_feature(feature))
45791+ return;
45792+
45793+ pgd = pgd_offset_k(addr);
45794+ pud = pud_offset(pgd, addr);
45795+ pmd = pmd_offset(pud, addr);
45796+ ptep = pte_offset_kernel(pmd, addr);
45797+
45798+ pte.pte = ptep->pte & ~_PAGE_RW;
45799+ if (HYPERVISOR_update_va_mapping(addr, pte, 0))
45800+ xen_l1_entry_update(ptep, pte); /* fallback */
45801+
45802+ if ((addr >= VMALLOC_START) && (addr < VMALLOC_END))
45803+ make_page_readonly(__va(pte_pfn(pte) << PAGE_SHIFT), feature);
45804+}
45805+
45806+void make_page_writable(void *va, unsigned int feature)
45807+{
45808+ pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t pte, *ptep;
45809+ unsigned long addr = (unsigned long) va;
45810+
45811+ if (xen_feature(feature))
45812+ return;
45813+
45814+ pgd = pgd_offset_k(addr);
45815+ pud = pud_offset(pgd, addr);
45816+ pmd = pmd_offset(pud, addr);
45817+ ptep = pte_offset_kernel(pmd, addr);
45818+
45819+ pte.pte = ptep->pte | _PAGE_RW;
45820+ if (HYPERVISOR_update_va_mapping(addr, pte, 0))
45821+ xen_l1_entry_update(ptep, pte); /* fallback */
45822+
45823+ if ((addr >= VMALLOC_START) && (addr < VMALLOC_END))
45824+ make_page_writable(__va(pte_pfn(pte) << PAGE_SHIFT), feature);
45825+}
45826+
45827+void make_pages_readonly(void *va, unsigned nr, unsigned int feature)
45828+{
45829+ if (xen_feature(feature))
45830+ return;
45831+
45832+ while (nr-- != 0) {
45833+ make_page_readonly(va, feature);
45834+ va = (void*)((unsigned long)va + PAGE_SIZE);
45835+ }
45836+}
45837+
45838+void make_pages_writable(void *va, unsigned nr, unsigned int feature)
45839+{
45840+ if (xen_feature(feature))
45841+ return;
45842+
45843+ while (nr-- != 0) {
45844+ make_page_writable(va, feature);
45845+ va = (void*)((unsigned long)va + PAGE_SIZE);
45846+ }
45847+}
45848+
45849+/*
45850+ * NOTE: pagetable_init allocates all the fixmap pagetables contiguously in
45851+ * physical space so we can cache the place of the first one and move
45852+ * around without checking the pgd every time.
45853+ */
45854+
45855+void show_mem(void)
45856+{
45857+ long i, total = 0, reserved = 0;
45858+ long shared = 0, cached = 0;
45859+ pg_data_t *pgdat;
45860+ struct page *page;
45861+
45862+ printk(KERN_INFO "Mem-info:\n");
45863+ show_free_areas();
45864+ printk(KERN_INFO "Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
45865+
45866+ for_each_pgdat(pgdat) {
45867+ for (i = 0; i < pgdat->node_spanned_pages; ++i) {
45868+ page = pfn_to_page(pgdat->node_start_pfn + i);
45869+ total++;
45870+ if (PageReserved(page))
45871+ reserved++;
45872+ else if (PageSwapCache(page))
45873+ cached++;
45874+ else if (page_count(page))
45875+ shared += page_count(page) - 1;
45876+ }
45877+ }
45878+ printk(KERN_INFO "%lu pages of RAM\n", total);
45879+ printk(KERN_INFO "%lu reserved pages\n",reserved);
45880+ printk(KERN_INFO "%lu pages shared\n",shared);
45881+ printk(KERN_INFO "%lu pages swap cached\n",cached);
45882+}
45883+
45884+/* References to section boundaries */
45885+
45886+int after_bootmem;
45887+
45888+static void *spp_getpage(void)
45889+{
45890+ void *ptr;
45891+ if (after_bootmem)
45892+ ptr = (void *) get_zeroed_page(GFP_ATOMIC);
45893+ else
45894+ ptr = alloc_bootmem_pages(PAGE_SIZE);
45895+ if (!ptr || ((unsigned long)ptr & ~PAGE_MASK))
45896+ panic("set_pte_phys: cannot allocate page data %s\n", after_bootmem?"after bootmem":"");
45897+
45898+ Dprintk("spp_getpage %p\n", ptr);
45899+ return ptr;
45900+}
45901+
45902+#define pgd_offset_u(address) (pgd_t *)(init_level4_user_pgt + pgd_index(address))
45903+
45904+static inline pud_t *pud_offset_u(unsigned long address)
45905+{
45906+ pud_t *pud = level3_user_pgt;
45907+
45908+ return pud + pud_index(address);
45909+}
45910+
45911+static void set_pte_phys(unsigned long vaddr,
45912+ unsigned long phys, pgprot_t prot, int user_mode)
45913+{
45914+ pgd_t *pgd;
45915+ pud_t *pud;
45916+ pmd_t *pmd;
45917+ pte_t *pte, new_pte;
45918+
45919+ Dprintk("set_pte_phys %lx to %lx\n", vaddr, phys);
45920+
45921+ pgd = (user_mode ? pgd_offset_u(vaddr) : pgd_offset_k(vaddr));
45922+ if (pgd_none(*pgd)) {
45923+ printk("PGD FIXMAP MISSING, it should be setup in head.S!\n");
45924+ return;
45925+ }
45926+ pud = (user_mode ? pud_offset_u(vaddr) : pud_offset(pgd, vaddr));
45927+ if (pud_none(*pud)) {
45928+ pmd = (pmd_t *) spp_getpage();
45929+ make_page_readonly(pmd, XENFEAT_writable_page_tables);
45930+ set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
45931+ if (pmd != pmd_offset(pud, 0)) {
45932+ printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pud,0));
45933+ return;
45934+ }
45935+ }
45936+ pmd = pmd_offset(pud, vaddr);
45937+ if (pmd_none(*pmd)) {
45938+ pte = (pte_t *) spp_getpage();
45939+ make_page_readonly(pte, XENFEAT_writable_page_tables);
45940+ set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
45941+ if (pte != pte_offset_kernel(pmd, 0)) {
45942+ printk("PAGETABLE BUG #02!\n");
45943+ return;
45944+ }
45945+ }
45946+ if (pgprot_val(prot))
45947+ new_pte = pfn_pte(phys >> PAGE_SHIFT, prot);
45948+ else
45949+ new_pte = __pte(0);
45950+
45951+ pte = pte_offset_kernel(pmd, vaddr);
45952+ if (!pte_none(*pte) &&
45953+ pte_val(*pte) != (pte_val(new_pte) & __supported_pte_mask))
45954+ pte_ERROR(*pte);
45955+ set_pte(pte, new_pte);
45956+
45957+ /*
45958+ * It's enough to flush this one mapping.
45959+ * (PGE mappings get flushed as well)
45960+ */
45961+ __flush_tlb_one(vaddr);
45962+}
45963+
45964+static void set_pte_phys_ma(unsigned long vaddr,
45965+ unsigned long phys, pgprot_t prot)
45966+{
45967+ pgd_t *pgd;
45968+ pud_t *pud;
45969+ pmd_t *pmd;
45970+ pte_t *pte, new_pte;
45971+
45972+ Dprintk("set_pte_phys %lx to %lx\n", vaddr, phys);
45973+
45974+ pgd = pgd_offset_k(vaddr);
45975+ if (pgd_none(*pgd)) {
45976+ printk("PGD FIXMAP MISSING, it should be setup in head.S!\n");
45977+ return;
45978+ }
45979+ pud = pud_offset(pgd, vaddr);
45980+ if (pud_none(*pud)) {
45981+
45982+ pmd = (pmd_t *) spp_getpage();
45983+ make_page_readonly(pmd, XENFEAT_writable_page_tables);
45984+
45985+ set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
45986+
45987+ if (pmd != pmd_offset(pud, 0)) {
45988+ printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pud,0));
45989+ return;
45990+ }
45991+ }
45992+ pmd = pmd_offset(pud, vaddr);
45993+
45994+ if (pmd_none(*pmd)) {
45995+ pte = (pte_t *) spp_getpage();
45996+ make_page_readonly(pte, XENFEAT_writable_page_tables);
45997+
45998+ set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
45999+ if (pte != pte_offset_kernel(pmd, 0)) {
46000+ printk("PAGETABLE BUG #02!\n");
46001+ return;
46002+ }
46003+ }
46004+
46005+ new_pte = pfn_pte_ma(phys >> PAGE_SHIFT, prot);
46006+ pte = pte_offset_kernel(pmd, vaddr);
46007+
46008+ /*
46009+ * Note that the pte page is already RO, thus we want to use
46010+ * xen_l1_entry_update(), not set_pte().
46011+ */
46012+ xen_l1_entry_update(pte,
46013+ pfn_pte_ma(phys >> PAGE_SHIFT, prot));
46014+
46015+ /*
46016+ * It's enough to flush this one mapping.
46017+ * (PGE mappings get flushed as well)
46018+ */
46019+ __flush_tlb_one(vaddr);
46020+}
46021+
46022+#define SET_FIXMAP_KERNEL 0
46023+#define SET_FIXMAP_USER 1
46024+
46025+/* NOTE: this is meant to be run only at boot */
46026+void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
46027+{
46028+ unsigned long address = __fix_to_virt(idx);
46029+
46030+ if (idx >= __end_of_fixed_addresses) {
46031+ printk("Invalid __set_fixmap\n");
46032+ return;
46033+ }
46034+ switch (idx) {
46035+ case VSYSCALL_FIRST_PAGE:
46036+ set_pte_phys(address, phys, prot, SET_FIXMAP_KERNEL);
46037+ break;
46038+ default:
46039+ set_pte_phys_ma(address, phys, prot);
46040+ break;
46041+ }
46042+}
46043+
46044+/*
46045+ * At this point it only supports vsyscall area.
46046+ */
46047+void __set_fixmap_user (enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
46048+{
46049+ unsigned long address = __fix_to_virt(idx);
46050+
46051+ if (idx >= __end_of_fixed_addresses) {
46052+ printk("Invalid __set_fixmap\n");
46053+ return;
46054+ }
46055+
46056+ set_pte_phys(address, phys, prot, SET_FIXMAP_USER);
46057+}
46058+
46059+unsigned long __initdata table_start, table_end;
46060+
46061+unsigned long get_machine_pfn(unsigned long addr)
46062+{
46063+ pud_t* pud = pud_offset_k(NULL, addr);
46064+ pmd_t* pmd = pmd_offset(pud, addr);
46065+ pte_t *pte = pte_offset_kernel(pmd, addr);
46066+
46067+ return pte_mfn(*pte);
46068+}
46069+
46070+static __meminit void *alloc_static_page(unsigned long *phys)
46071+{
46072+ unsigned long va = (start_pfn << PAGE_SHIFT) + __START_KERNEL_map;
46073+
46074+ if (after_bootmem) {
46075+ void *adr = (void *)get_zeroed_page(GFP_ATOMIC);
46076+
46077+ *phys = __pa(adr);
46078+ return adr;
46079+ }
46080+
46081+ *phys = start_pfn << PAGE_SHIFT;
46082+ start_pfn++;
46083+ memset((void *)va, 0, PAGE_SIZE);
46084+ return (void *)va;
46085+}
46086+
46087+#define PTE_SIZE PAGE_SIZE
46088+
46089+static inline void __set_pte(pte_t *dst, pte_t val)
46090+{
46091+ *dst = val;
46092+}
46093+
46094+static inline int make_readonly(unsigned long paddr)
46095+{
46096+ int readonly = 0;
46097+
46098+ /* Make new page tables read-only. */
46099+ if (!xen_feature(XENFEAT_writable_page_tables)
46100+ && (paddr >= (table_start << PAGE_SHIFT))
46101+ && (paddr < (table_end << PAGE_SHIFT)))
46102+ readonly = 1;
46103+ /* Make old page tables read-only. */
46104+ if (!xen_feature(XENFEAT_writable_page_tables)
46105+ && (paddr >= (xen_start_info->pt_base - __START_KERNEL_map))
46106+ && (paddr < (start_pfn << PAGE_SHIFT)))
46107+ readonly = 1;
46108+
46109+ /*
46110+ * No need for writable mapping of kernel image. This also ensures that
46111+ * page and descriptor tables embedded inside don't have writable
46112+ * mappings.
46113+ */
46114+ if ((paddr >= __pa_symbol(&_text)) && (paddr < __pa_symbol(&_end)))
46115+ readonly = 1;
46116+
46117+ return readonly;
46118+}
46119+
46120+static void __meminit
46121+phys_pmd_init(pmd_t *pmd, unsigned long address, unsigned long end)
46122+{
46123+ int i, k;
46124+
46125+ for (i = 0; i < PTRS_PER_PMD; pmd++, i++) {
46126+ unsigned long pte_phys;
46127+ pte_t *pte, *pte_save;
46128+
46129+ if (address >= end) {
46130+ for (; i < PTRS_PER_PMD; i++, pmd++)
46131+ set_pmd(pmd, __pmd(0));
46132+ break;
46133+ }
46134+ pte = alloc_static_page(&pte_phys);
46135+ pte_save = pte;
46136+ for (k = 0; k < PTRS_PER_PTE; pte++, k++, address += PTE_SIZE) {
46137+ if ((address >= end) ||
46138+ ((address >> PAGE_SHIFT) >=
46139+ xen_start_info->nr_pages)) {
46140+ __set_pte(pte, __pte(0));
46141+ continue;
46142+ }
46143+ if (make_readonly(address)) {
46144+ __set_pte(pte,
46145+ __pte(address | (_KERNPG_TABLE & ~_PAGE_RW)));
46146+ continue;
46147+ }
46148+ __set_pte(pte, __pte(address | _KERNPG_TABLE));
46149+ }
46150+ pte = pte_save;
46151+ early_make_page_readonly(pte, XENFEAT_writable_page_tables);
46152+ set_pmd(pmd, __pmd(pte_phys | _KERNPG_TABLE));
46153+ }
46154+}
46155+
46156+static void __meminit
46157+phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end)
46158+{
46159+ pmd_t *pmd = pmd_offset(pud, (unsigned long)__va(address));
46160+
46161+ if (pmd_none(*pmd)) {
46162+ spin_lock(&init_mm.page_table_lock);
46163+ phys_pmd_init(pmd, address, end);
46164+ spin_unlock(&init_mm.page_table_lock);
46165+ __flush_tlb_all();
46166+ }
46167+}
46168+
46169+static void __meminit phys_pud_init(pud_t *pud, unsigned long address, unsigned long end)
46170+{
46171+ long i = pud_index(address);
46172+
46173+ pud = pud + i;
46174+
46175+ if (after_bootmem && pud_val(*pud)) {
46176+ phys_pmd_update(pud, address, end);
46177+ return;
46178+ }
46179+
46180+ for (; i < PTRS_PER_PUD; pud++, i++) {
46181+ unsigned long paddr, pmd_phys;
46182+ pmd_t *pmd;
46183+
46184+ paddr = (address & PGDIR_MASK) + i*PUD_SIZE;
46185+ if (paddr >= end)
46186+ break;
46187+
46188+ pmd = alloc_static_page(&pmd_phys);
46189+ early_make_page_readonly(pmd, XENFEAT_writable_page_tables);
46190+ spin_lock(&init_mm.page_table_lock);
46191+ set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE));
46192+ phys_pmd_init(pmd, paddr, end);
46193+ spin_unlock(&init_mm.page_table_lock);
46194+ }
46195+ __flush_tlb();
46196+}
46197+
46198+void __init xen_init_pt(void)
46199+{
46200+ unsigned long addr, *page;
46201+
46202+ memset((void *)init_level4_pgt, 0, PAGE_SIZE);
46203+ memset((void *)level3_kernel_pgt, 0, PAGE_SIZE);
46204+ memset((void *)level2_kernel_pgt, 0, PAGE_SIZE);
46205+
46206+ /* Find the initial pte page that was built for us. */
46207+ page = (unsigned long *)xen_start_info->pt_base;
46208+ addr = page[pgd_index(__START_KERNEL_map)];
46209+ addr_to_page(addr, page);
46210+ addr = page[pud_index(__START_KERNEL_map)];
46211+ addr_to_page(addr, page);
46212+
46213+#ifdef CONFIG_XEN_COMPAT_030002
46214+ /* On Xen 3.0.2 and older we may need to explicitly specify _PAGE_USER
46215+ in kernel PTEs. We check that here. */
46216+ if (HYPERVISOR_xen_version(XENVER_version, NULL) <= 0x30000) {
46217+ unsigned long *pg;
46218+ pte_t pte;
46219+
46220+ /* Mess with the initial mapping of page 0. It's not needed. */
46221+ BUILD_BUG_ON(__START_KERNEL <= __START_KERNEL_map);
46222+ addr = page[pmd_index(__START_KERNEL_map)];
46223+ addr_to_page(addr, pg);
46224+ pte.pte = pg[pte_index(__START_KERNEL_map)];
46225+ BUG_ON(!(pte.pte & _PAGE_PRESENT));
46226+
46227+ /* If _PAGE_USER isn't set, we obviously do not need it. */
46228+ if (pte.pte & _PAGE_USER) {
46229+ /* _PAGE_USER is needed, but is it set implicitly? */
46230+ pte.pte &= ~_PAGE_USER;
46231+ if ((HYPERVISOR_update_va_mapping(__START_KERNEL_map,
46232+ pte, 0) != 0) ||
46233+ !(pg[pte_index(__START_KERNEL_map)] & _PAGE_USER))
46234+ /* We need to explicitly specify _PAGE_USER. */
46235+ __kernel_page_user = _PAGE_USER;
46236+ }
46237+ }
46238+#endif
46239+
46240+ /* Construct mapping of initial pte page in our own directories. */
46241+ init_level4_pgt[pgd_index(__START_KERNEL_map)] =
46242+ mk_kernel_pgd(__pa_symbol(level3_kernel_pgt));
46243+ level3_kernel_pgt[pud_index(__START_KERNEL_map)] =
46244+ __pud(__pa_symbol(level2_kernel_pgt) |
46245+ _KERNPG_TABLE);
46246+ memcpy((void *)level2_kernel_pgt, page, PAGE_SIZE);
46247+
46248+ early_make_page_readonly(init_level4_pgt,
46249+ XENFEAT_writable_page_tables);
46250+ early_make_page_readonly(init_level4_user_pgt,
46251+ XENFEAT_writable_page_tables);
46252+ early_make_page_readonly(level3_kernel_pgt,
46253+ XENFEAT_writable_page_tables);
46254+ early_make_page_readonly(level3_user_pgt,
46255+ XENFEAT_writable_page_tables);
46256+ early_make_page_readonly(level2_kernel_pgt,
46257+ XENFEAT_writable_page_tables);
46258+
46259+ xen_pgd_pin(__pa_symbol(init_level4_pgt));
46260+ xen_pgd_pin(__pa_symbol(init_level4_user_pgt));
46261+
46262+ set_pgd((pgd_t *)(init_level4_user_pgt + 511),
46263+ mk_kernel_pgd(__pa_symbol(level3_user_pgt)));
46264+}
46265+
46266+void __init extend_init_mapping(unsigned long tables_space)
46267+{
46268+ unsigned long va = __START_KERNEL_map;
46269+ unsigned long phys, addr, *pte_page;
46270+ pmd_t *pmd;
46271+ pte_t *pte, new_pte;
46272+ unsigned long *page = (unsigned long *)init_level4_pgt;
46273+
46274+ addr = page[pgd_index(va)];
46275+ addr_to_page(addr, page);
46276+ addr = page[pud_index(va)];
46277+ addr_to_page(addr, page);
46278+
46279+ /* Kill mapping of low 1MB. */
46280+ while (va < (unsigned long)&_text) {
46281+ HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0);
46282+ va += PAGE_SIZE;
46283+ }
46284+
46285+ /* Ensure init mappings cover kernel text/data and initial tables. */
46286+ while (va < (__START_KERNEL_map
46287+ + (start_pfn << PAGE_SHIFT)
46288+ + tables_space)) {
46289+ pmd = (pmd_t *)&page[pmd_index(va)];
46290+ if (pmd_none(*pmd)) {
46291+ pte_page = alloc_static_page(&phys);
46292+ early_make_page_readonly(
46293+ pte_page, XENFEAT_writable_page_tables);
46294+ set_pmd(pmd, __pmd(phys | _KERNPG_TABLE));
46295+ } else {
46296+ addr = page[pmd_index(va)];
46297+ addr_to_page(addr, pte_page);
46298+ }
46299+ pte = (pte_t *)&pte_page[pte_index(va)];
46300+ if (pte_none(*pte)) {
46301+ new_pte = pfn_pte(
46302+ (va - __START_KERNEL_map) >> PAGE_SHIFT,
46303+ __pgprot(_KERNPG_TABLE));
46304+ xen_l1_entry_update(pte, new_pte);
46305+ }
46306+ va += PAGE_SIZE;
46307+ }
46308+
46309+ /* Finally, blow away any spurious initial mappings. */
46310+ while (1) {
46311+ pmd = (pmd_t *)&page[pmd_index(va)];
46312+ if (pmd_none(*pmd))
46313+ break;
46314+ HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0);
46315+ va += PAGE_SIZE;
46316+ }
46317+}
46318+
46319+static void __init find_early_table_space(unsigned long end)
46320+{
46321+ unsigned long puds, pmds, ptes, tables;
46322+
46323+ puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
46324+ pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
46325+ ptes = (end + PTE_SIZE - 1) >> PAGE_SHIFT;
46326+
46327+ tables = round_up(puds * 8, PAGE_SIZE) +
46328+ round_up(pmds * 8, PAGE_SIZE) +
46329+ round_up(ptes * 8, PAGE_SIZE);
46330+
46331+ extend_init_mapping(tables);
46332+
46333+ table_start = start_pfn;
46334+ table_end = table_start + (tables>>PAGE_SHIFT);
46335+
46336+ early_printk("kernel direct mapping tables up to %lx @ %lx-%lx\n",
46337+ end, table_start << PAGE_SHIFT, table_end << PAGE_SHIFT);
46338+}
46339+
46340+/* Setup the direct mapping of the physical memory at PAGE_OFFSET.
46341+ This runs before bootmem is initialized and gets pages directly from the
46342+ physical memory. To access them they are temporarily mapped. */
46343+void __meminit init_memory_mapping(unsigned long start, unsigned long end)
46344+{
46345+ unsigned long next;
46346+
46347+ Dprintk("init_memory_mapping\n");
46348+
46349+ /*
46350+ * Find space for the kernel direct mapping tables.
46351+ * Later we should allocate these tables in the local node of the memory
46352+ * mapped. Unfortunately this is done currently before the nodes are
46353+ * discovered.
46354+ */
46355+ if (!after_bootmem)
46356+ find_early_table_space(end);
46357+
46358+ start = (unsigned long)__va(start);
46359+ end = (unsigned long)__va(end);
46360+
46361+ for (; start < end; start = next) {
46362+ unsigned long pud_phys;
46363+ pgd_t *pgd = pgd_offset_k(start);
46364+ pud_t *pud;
46365+
46366+ if (after_bootmem) {
46367+ pud = pud_offset_k(pgd, __PAGE_OFFSET);
46368+ make_page_readonly(pud, XENFEAT_writable_page_tables);
46369+ pud_phys = __pa(pud);
46370+ } else {
46371+ pud = alloc_static_page(&pud_phys);
46372+ early_make_page_readonly(pud, XENFEAT_writable_page_tables);
46373+ }
46374+ next = start + PGDIR_SIZE;
46375+ if (next > end)
46376+ next = end;
46377+ phys_pud_init(pud, __pa(start), __pa(next));
46378+ if (!after_bootmem)
46379+ set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys));
46380+ }
46381+
46382+ if (!after_bootmem) {
46383+ BUG_ON(start_pfn != table_end);
46384+
46385+ /* Re-vector virtual addresses pointing into the initial
46386+ mapping to the just-established permanent ones. */
46387+ xen_start_info = __va(__pa(xen_start_info));
46388+ xen_start_info->pt_base = (unsigned long)
46389+ __va(__pa(xen_start_info->pt_base));
46390+ if (!xen_feature(XENFEAT_auto_translated_physmap)) {
46391+ phys_to_machine_mapping =
46392+ __va(__pa(xen_start_info->mfn_list));
46393+ xen_start_info->mfn_list = (unsigned long)
46394+ phys_to_machine_mapping;
46395+ }
46396+ if (xen_start_info->mod_start)
46397+ xen_start_info->mod_start = (unsigned long)
46398+ __va(__pa(xen_start_info->mod_start));
46399+
46400+ /* Destroy the Xen-created mappings beyond the kernel image as
46401+ * well as the temporary mappings created above. Prevents
46402+ * overlap with modules area (if init mapping is very big).
46403+ */
46404+ start = PAGE_ALIGN((unsigned long)_end);
46405+ end = __START_KERNEL_map + (table_end << PAGE_SHIFT);
46406+ for (; start < end; start += PAGE_SIZE)
46407+ WARN_ON(HYPERVISOR_update_va_mapping(
46408+ start, __pte_ma(0), 0));
46409+ }
46410+
46411+ __flush_tlb_all();
46412+}
46413+
46414+void __cpuinit zap_low_mappings(int cpu)
46415+{
46416+ /* this is not required for Xen */
46417+#if 0
46418+ swap_low_mappings();
46419+#endif
46420+}
46421+
46422+/* Compute zone sizes for the DMA and DMA32 zones in a node. */
46423+__init void
46424+size_zones(unsigned long *z, unsigned long *h,
46425+ unsigned long start_pfn, unsigned long end_pfn)
46426+{
46427+ int i;
46428+#ifndef CONFIG_XEN
46429+ unsigned long w;
46430+#endif
46431+
46432+ for (i = 0; i < MAX_NR_ZONES; i++)
46433+ z[i] = 0;
46434+
46435+#ifndef CONFIG_XEN
46436+ if (start_pfn < MAX_DMA_PFN)
46437+ z[ZONE_DMA] = MAX_DMA_PFN - start_pfn;
46438+ if (start_pfn < MAX_DMA32_PFN) {
46439+ unsigned long dma32_pfn = MAX_DMA32_PFN;
46440+ if (dma32_pfn > end_pfn)
46441+ dma32_pfn = end_pfn;
46442+ z[ZONE_DMA32] = dma32_pfn - start_pfn;
46443+ }
46444+ z[ZONE_NORMAL] = end_pfn - start_pfn;
46445+
46446+ /* Remove lower zones from higher ones. */
46447+ w = 0;
46448+ for (i = 0; i < MAX_NR_ZONES; i++) {
46449+ if (z[i])
46450+ z[i] -= w;
46451+ w += z[i];
46452+ }
46453+
46454+ /* Compute holes */
46455+ w = start_pfn;
46456+ for (i = 0; i < MAX_NR_ZONES; i++) {
46457+ unsigned long s = w;
46458+ w += z[i];
46459+ h[i] = e820_hole_size(s, w);
46460+ }
46461+
46462+ /* Add the space needed for mem_map to the holes too. */
46463+ for (i = 0; i < MAX_NR_ZONES; i++)
46464+ h[i] += (z[i] * sizeof(struct page)) / PAGE_SIZE;
46465+
46466+ /* The 16MB DMA zone has the kernel and other misc mappings.
46467+ Account them too */
46468+ if (h[ZONE_DMA]) {
46469+ h[ZONE_DMA] += dma_reserve;
46470+ if (h[ZONE_DMA] >= z[ZONE_DMA]) {
46471+ printk(KERN_WARNING
46472+ "Kernel too large and filling up ZONE_DMA?\n");
46473+ h[ZONE_DMA] = z[ZONE_DMA];
46474+ }
46475+ }
46476+#else
46477+ z[ZONE_DMA] = end_pfn;
46478+ for (i = 0; i < MAX_NR_ZONES; i++)
46479+ h[i] = 0;
46480+#endif
46481+}
46482+
46483+#ifndef CONFIG_NUMA
46484+void __init paging_init(void)
46485+{
46486+ unsigned long zones[MAX_NR_ZONES], holes[MAX_NR_ZONES];
46487+ int i;
46488+
46489+ memory_present(0, 0, end_pfn);
46490+ sparse_init();
46491+ size_zones(zones, holes, 0, end_pfn);
46492+ free_area_init_node(0, NODE_DATA(0), zones,
46493+ __pa(PAGE_OFFSET) >> PAGE_SHIFT, holes);
46494+
46495+ /* Switch to the real shared_info page, and clear the
46496+ * dummy page. */
46497+ set_fixmap(FIX_SHARED_INFO, xen_start_info->shared_info);
46498+ HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO);
46499+ memset(empty_zero_page, 0, sizeof(empty_zero_page));
46500+
46501+ init_mm.context.pinned = 1;
46502+
46503+ /* Setup mapping of lower 1st MB */
46504+ for (i = 0; i < NR_FIX_ISAMAPS; i++)
46505+ if (is_initial_xendomain())
46506+ set_fixmap(FIX_ISAMAP_BEGIN - i, i * PAGE_SIZE);
46507+ else
46508+ __set_fixmap(FIX_ISAMAP_BEGIN - i,
46509+ virt_to_mfn(empty_zero_page) << PAGE_SHIFT,
46510+ PAGE_KERNEL_RO);
46511+}
46512+#endif
46513+
46514+/* Unmap a kernel mapping if it exists. This is useful to avoid prefetches
46515+ from the CPU leading to inconsistent cache lines. address and size
46516+ must be aligned to 2MB boundaries.
46517+ Does nothing when the mapping doesn't exist. */
46518+void __init clear_kernel_mapping(unsigned long address, unsigned long size)
46519+{
46520+ unsigned long end = address + size;
46521+
46522+ BUG_ON(address & ~LARGE_PAGE_MASK);
46523+ BUG_ON(size & ~LARGE_PAGE_MASK);
46524+
46525+ for (; address < end; address += LARGE_PAGE_SIZE) {
46526+ pgd_t *pgd = pgd_offset_k(address);
46527+ pud_t *pud;
46528+ pmd_t *pmd;
46529+ if (pgd_none(*pgd))
46530+ continue;
46531+ pud = pud_offset(pgd, address);
46532+ if (pud_none(*pud))
46533+ continue;
46534+ pmd = pmd_offset(pud, address);
46535+ if (!pmd || pmd_none(*pmd))
46536+ continue;
46537+ if (0 == (pmd_val(*pmd) & _PAGE_PSE)) {
46538+ /* Could handle this, but it should not happen currently. */
46539+ printk(KERN_ERR
46540+ "clear_kernel_mapping: mapping has been split. will leak memory\n");
46541+ pmd_ERROR(*pmd);
46542+ }
46543+ set_pmd(pmd, __pmd(0));
46544+ }
46545+ __flush_tlb_all();
46546+}
46547+
46548+/*
46549+ * Memory hotplug specific functions
46550+ * These are only for non-NUMA machines right now.
46551+ */
46552+#ifdef CONFIG_MEMORY_HOTPLUG
46553+
46554+void online_page(struct page *page)
46555+{
46556+ ClearPageReserved(page);
46557+ set_page_count(page, 1);
46558+ __free_page(page);
46559+ totalram_pages++;
46560+ num_physpages++;
46561+}
46562+
46563+int add_memory(u64 start, u64 size)
46564+{
46565+ struct pglist_data *pgdat = NODE_DATA(0);
46566+ struct zone *zone = pgdat->node_zones + MAX_NR_ZONES-2;
46567+ unsigned long start_pfn = start >> PAGE_SHIFT;
46568+ unsigned long nr_pages = size >> PAGE_SHIFT;
46569+ int ret;
46570+
46571+ ret = __add_pages(zone, start_pfn, nr_pages);
46572+ if (ret)
46573+ goto error;
46574+
46575+ init_memory_mapping(start, (start + size -1));
46576+
46577+ return ret;
46578+error:
46579+ printk("%s: Problem encountered in __add_pages!\n", __func__);
46580+ return ret;
46581+}
46582+EXPORT_SYMBOL_GPL(add_memory);
46583+
46584+int remove_memory(u64 start, u64 size)
46585+{
46586+ return -EINVAL;
46587+}
46588+EXPORT_SYMBOL_GPL(remove_memory);
46589+
46590+#endif
46591+
46592+static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules,
46593+ kcore_vsyscall;
46594+
46595+void __init mem_init(void)
46596+{
46597+ long codesize, reservedpages, datasize, initsize;
46598+ unsigned long pfn;
46599+
46600+ contiguous_bitmap = alloc_bootmem_low_pages(
46601+ (end_pfn + 2*BITS_PER_LONG) >> 3);
46602+ BUG_ON(!contiguous_bitmap);
46603+ memset(contiguous_bitmap, 0, (end_pfn + 2*BITS_PER_LONG) >> 3);
46604+
46605+#if defined(CONFIG_SWIOTLB)
46606+ pci_swiotlb_init();
46607+#endif
46608+ no_iommu_init();
46609+
46610+ /* How many end-of-memory variables you have, grandma! */
46611+ max_low_pfn = end_pfn;
46612+ max_pfn = end_pfn;
46613+ num_physpages = end_pfn;
46614+ high_memory = (void *) __va(end_pfn * PAGE_SIZE);
46615+
46616+ /* clear the zero-page */
46617+ memset(empty_zero_page, 0, PAGE_SIZE);
46618+
46619+ reservedpages = 0;
46620+
46621+ /* this will put all low memory onto the freelists */
46622+#ifdef CONFIG_NUMA
46623+ totalram_pages = numa_free_all_bootmem();
46624+#else
46625+ totalram_pages = free_all_bootmem();
46626+#endif
46627+ /* XEN: init and count pages outside initial allocation. */
46628+ for (pfn = xen_start_info->nr_pages; pfn < max_pfn; pfn++) {
46629+ ClearPageReserved(pfn_to_page(pfn));
46630+ set_page_count(pfn_to_page(pfn), 1);
46631+ totalram_pages++;
46632+ }
46633+ reservedpages = end_pfn - totalram_pages - e820_hole_size(0, end_pfn);
46634+
46635+ after_bootmem = 1;
46636+
46637+ codesize = (unsigned long) &_etext - (unsigned long) &_text;
46638+ datasize = (unsigned long) &_edata - (unsigned long) &_etext;
46639+ initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin;
46640+
46641+ /* Register memory areas for /proc/kcore */
46642+ kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
46643+ kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
46644+ VMALLOC_END-VMALLOC_START);
46645+ kclist_add(&kcore_kernel, &_stext, _end - _stext);
46646+ kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_LEN);
46647+ kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START,
46648+ VSYSCALL_END - VSYSCALL_START);
46649+
46650+ printk("Memory: %luk/%luk available (%ldk kernel code, %ldk reserved, %ldk data, %ldk init)\n",
46651+ (unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
46652+ end_pfn << (PAGE_SHIFT-10),
46653+ codesize >> 10,
46654+ reservedpages << (PAGE_SHIFT-10),
46655+ datasize >> 10,
46656+ initsize >> 10);
46657+
46658+#ifndef CONFIG_XEN
46659+#ifdef CONFIG_SMP
46660+ /*
46661+ * Sync boot_level4_pgt mappings with the init_level4_pgt
46662+ * except for the low identity mappings which are already zapped
46663+ * in init_level4_pgt. This sync-up is essential for AP's bringup
46664+ */
46665+ memcpy(boot_level4_pgt+1, init_level4_pgt+1, (PTRS_PER_PGD-1)*sizeof(pgd_t));
46666+#endif
46667+#endif
46668+}
46669+
46670+void free_initmem(void)
46671+{
46672+#ifdef __DO_LATER__
46673+ /*
46674+ * Some pages can be pinned, but some are not. Unpinning such pages
46675+ * triggers BUG().
46676+ */
46677+ unsigned long addr;
46678+
46679+ addr = (unsigned long)(&__init_begin);
46680+ for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) {
46681+ ClearPageReserved(virt_to_page(addr));
46682+ set_page_count(virt_to_page(addr), 1);
46683+ memset((void *)(addr & ~(PAGE_SIZE-1)), 0xcc, PAGE_SIZE);
46684+ make_page_writable(
46685+ __va(__pa(addr)), XENFEAT_writable_page_tables);
46686+ /*
46687+ * Make pages from __PAGE_OFFSET address as well
46688+ */
46689+ make_page_writable(
46690+ (void *)addr, XENFEAT_writable_page_tables);
46691+ free_page(addr);
46692+ totalram_pages++;
46693+ }
46694+ memset(__initdata_begin, 0xba, __initdata_end - __initdata_begin);
46695+ printk ("Freeing unused kernel memory: %luk freed\n", (__init_end - __init_begin) >> 10);
46696+#endif
46697+}
46698+
46699+#ifdef CONFIG_DEBUG_RODATA
46700+
46701+extern char __start_rodata, __end_rodata;
46702+void mark_rodata_ro(void)
46703+{
46704+ unsigned long addr = (unsigned long)&__start_rodata;
46705+
46706+ for (; addr < (unsigned long)&__end_rodata; addr += PAGE_SIZE)
46707+ change_page_attr_addr(addr, 1, PAGE_KERNEL_RO);
46708+
46709+ printk ("Write protecting the kernel read-only data: %luk\n",
46710+ (&__end_rodata - &__start_rodata) >> 10);
46711+
46712+ /*
46713+ * change_page_attr_addr() requires a global_flush_tlb() call after it.
46714+ * We do this after the printk so that if something went wrong in the
46715+ * change, the printk gets out at least to give a better debug hint
46716+ * of who is the culprit.
46717+ */
46718+ global_flush_tlb();
46719+}
46720+#endif
46721+
46722+#ifdef CONFIG_BLK_DEV_INITRD
46723+void free_initrd_mem(unsigned long start, unsigned long end)
46724+{
46725+ if (start >= end)
46726+ return;
46727+ printk ("Freeing initrd memory: %ldk freed\n", (end - start) >> 10);
46728+ for (; start < end; start += PAGE_SIZE) {
46729+ ClearPageReserved(virt_to_page(start));
46730+ set_page_count(virt_to_page(start), 1);
46731+ free_page(start);
46732+ totalram_pages++;
46733+ }
46734+}
46735+#endif
46736+
46737+void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
46738+{
46739+ /* Should check here against the e820 map to avoid double free */
46740+#ifdef CONFIG_NUMA
46741+ int nid = phys_to_nid(phys);
46742+ reserve_bootmem_node(NODE_DATA(nid), phys, len);
46743+#else
46744+ reserve_bootmem(phys, len);
46745+#endif
46746+ if (phys+len <= MAX_DMA_PFN*PAGE_SIZE)
46747+ dma_reserve += len / PAGE_SIZE;
46748+}
46749+
46750+int kern_addr_valid(unsigned long addr)
46751+{
46752+ unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
46753+ pgd_t *pgd;
46754+ pud_t *pud;
46755+ pmd_t *pmd;
46756+ pte_t *pte;
46757+
46758+ if (above != 0 && above != -1UL)
46759+ return 0;
46760+
46761+ pgd = pgd_offset_k(addr);
46762+ if (pgd_none(*pgd))
46763+ return 0;
46764+
46765+ pud = pud_offset_k(pgd, addr);
46766+ if (pud_none(*pud))
46767+ return 0;
46768+
46769+ pmd = pmd_offset(pud, addr);
46770+ if (pmd_none(*pmd))
46771+ return 0;
46772+ if (pmd_large(*pmd))
46773+ return pfn_valid(pmd_pfn(*pmd));
46774+
46775+ pte = pte_offset_kernel(pmd, addr);
46776+ if (pte_none(*pte))
46777+ return 0;
46778+ return pfn_valid(pte_pfn(*pte));
46779+}
46780+
46781+#ifdef CONFIG_SYSCTL
46782+#include <linux/sysctl.h>
46783+
46784+extern int exception_trace, page_fault_trace;
46785+
46786+static ctl_table debug_table2[] = {
46787+ { 99, "exception-trace", &exception_trace, sizeof(int), 0644, NULL,
46788+ proc_dointvec },
46789+ { 0, }
46790+};
46791+
46792+static ctl_table debug_root_table2[] = {
46793+ { .ctl_name = CTL_DEBUG, .procname = "debug", .mode = 0555,
46794+ .child = debug_table2 },
46795+ { 0 },
46796+};
46797+
46798+static __init int x8664_sysctl_init(void)
46799+{
46800+ register_sysctl_table(debug_root_table2, 1);
46801+ return 0;
46802+}
46803+__initcall(x8664_sysctl_init);
46804+#endif
46805+
46806+/* A pseudo VMA to allow ptrace access for the vsyscall page. This only
46807+ covers the 64bit vsyscall page now. 32bit has a real VMA now and does
46808+ not need special handling anymore. */
46809+
46810+static struct vm_area_struct gate_vma = {
46811+ .vm_start = VSYSCALL_START,
46812+ .vm_end = VSYSCALL_END,
46813+ .vm_page_prot = PAGE_READONLY
46814+};
46815+
46816+struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
46817+{
46818+#ifdef CONFIG_IA32_EMULATION
46819+ if (test_tsk_thread_flag(tsk, TIF_IA32))
46820+ return NULL;
46821+#endif
46822+ return &gate_vma;
46823+}
46824+
46825+int in_gate_area(struct task_struct *task, unsigned long addr)
46826+{
46827+ struct vm_area_struct *vma = get_gate_vma(task);
46828+ if (!vma)
46829+ return 0;
46830+ return (addr >= vma->vm_start) && (addr < vma->vm_end);
46831+}
46832+
46833+/* Use this when you have no reliable task/vma, typically from interrupt
46834+ * context. It is less reliable than using the task's vma and may give
46835+ * false positives.
46836+ */
46837+int in_gate_area_no_task(unsigned long addr)
46838+{
46839+ return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END);
46840+}
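
A minimal sketch (not part of the patch) of the arithmetic in find_early_table_space() above: the direct mapping needs one 8-byte entry per PUD, PMD and PTE covering `end` bytes, with each level rounded up to whole pages. The shift values below are the usual 4-level x86-64 ones (PAGE_SHIFT 12, PMD_SHIFT 21, PUD_SHIFT 30) and are assumptions of this standalone example, not copied from the kernel headers.

/*
 * Standalone illustration of the early page-table sizing used by
 * find_early_table_space() in init-xen.c above.
 */
#include <stdio.h>

#define PAGE_SHIFT 12
#define PMD_SHIFT  21
#define PUD_SHIFT  30
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

static unsigned long round_up(unsigned long x, unsigned long align)
{
	return (x + align - 1) & ~(align - 1);	/* align must be a power of two */
}

static unsigned long early_table_space(unsigned long end)
{
	unsigned long puds = (end + (1UL << PUD_SHIFT) - 1) >> PUD_SHIFT;
	unsigned long pmds = (end + (1UL << PMD_SHIFT) - 1) >> PMD_SHIFT;
	unsigned long ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;

	/* 8 bytes per entry, each level padded to a page boundary */
	return round_up(puds * 8, PAGE_SIZE) +
	       round_up(pmds * 8, PAGE_SIZE) +
	       round_up(ptes * 8, PAGE_SIZE);
}

int main(void)
{
	unsigned long end = 1UL << 30;	/* direct-map 1 GiB of physical memory */
	unsigned long tables = early_table_space(end);

	printf("tables needed: %lu KiB (%lu pages)\n",
	       tables >> 10, tables >> PAGE_SHIFT);
	return 0;
}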
46841diff -Nur linux-2.6.16.33-noxen/arch/x86_64/mm/pageattr-xen.c linux-2.6.16.33/arch/x86_64/mm/pageattr-xen.c
46842--- linux-2.6.16.33-noxen/arch/x86_64/mm/pageattr-xen.c 1970-01-01 00:00:00.000000000 +0000
46843+++ linux-2.6.16.33/arch/x86_64/mm/pageattr-xen.c 2007-01-08 15:00:45.000000000 +0000
46844@@ -0,0 +1,408 @@
46845+/*
46846+ * Copyright 2002 Andi Kleen, SuSE Labs.
46847+ * Thanks to Ben LaHaise for precious feedback.
46848+ */
46849+
46850+#include <linux/config.h>
46851+#include <linux/mm.h>
46852+#include <linux/sched.h>
46853+#include <linux/highmem.h>
46854+#include <linux/module.h>
46855+#include <linux/slab.h>
46856+#include <asm/uaccess.h>
46857+#include <asm/processor.h>
46858+#include <asm/tlbflush.h>
46859+#include <asm/io.h>
46860+
46861+#ifdef CONFIG_XEN
46862+#include <asm/pgalloc.h>
46863+#include <asm/mmu_context.h>
46864+
46865+LIST_HEAD(mm_unpinned);
46866+DEFINE_SPINLOCK(mm_unpinned_lock);
46867+
46868+static inline void mm_walk_set_prot(void *pt, pgprot_t flags)
46869+{
46870+ struct page *page = virt_to_page(pt);
46871+ unsigned long pfn = page_to_pfn(page);
46872+
46873+ BUG_ON(HYPERVISOR_update_va_mapping(
46874+ (unsigned long)__va(pfn << PAGE_SHIFT),
46875+ pfn_pte(pfn, flags), 0));
46876+}
46877+
46878+static void mm_walk(struct mm_struct *mm, pgprot_t flags)
46879+{
46880+ pgd_t *pgd;
46881+ pud_t *pud;
46882+ pmd_t *pmd;
46883+ pte_t *pte;
46884+ int g,u,m;
46885+
46886+ pgd = mm->pgd;
46887+ /*
46888+ * Cannot iterate up to USER_PTRS_PER_PGD as these pagetables may not
46889+ * be the 'current' task's pagetables (e.g., current may be 32-bit,
46890+ * but the pagetables may be for a 64-bit task).
46891+ * Subtracting 1 from TASK_SIZE64 means the loop limit is correct
46892+ * regardless of whether TASK_SIZE64 is a multiple of PGDIR_SIZE.
46893+ */
46894+ for (g = 0; g <= ((TASK_SIZE64-1) / PGDIR_SIZE); g++, pgd++) {
46895+ if (pgd_none(*pgd))
46896+ continue;
46897+ pud = pud_offset(pgd, 0);
46898+ if (PTRS_PER_PUD > 1) /* not folded */
46899+ mm_walk_set_prot(pud,flags);
46900+ for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
46901+ if (pud_none(*pud))
46902+ continue;
46903+ pmd = pmd_offset(pud, 0);
46904+ if (PTRS_PER_PMD > 1) /* not folded */
46905+ mm_walk_set_prot(pmd,flags);
46906+ for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
46907+ if (pmd_none(*pmd))
46908+ continue;
46909+ pte = pte_offset_kernel(pmd,0);
46910+ mm_walk_set_prot(pte,flags);
46911+ }
46912+ }
46913+ }
46914+}
46915+
46916+void mm_pin(struct mm_struct *mm)
46917+{
46918+ if (xen_feature(XENFEAT_writable_page_tables))
46919+ return;
46920+
46921+ spin_lock(&mm->page_table_lock);
46922+
46923+ mm_walk(mm, PAGE_KERNEL_RO);
46924+ BUG_ON(HYPERVISOR_update_va_mapping(
46925+ (unsigned long)mm->pgd,
46926+ pfn_pte(virt_to_phys(mm->pgd)>>PAGE_SHIFT, PAGE_KERNEL_RO),
46927+ UVMF_TLB_FLUSH));
46928+ BUG_ON(HYPERVISOR_update_va_mapping(
46929+ (unsigned long)__user_pgd(mm->pgd),
46930+ pfn_pte(virt_to_phys(__user_pgd(mm->pgd))>>PAGE_SHIFT, PAGE_KERNEL_RO),
46931+ UVMF_TLB_FLUSH));
46932+ xen_pgd_pin(__pa(mm->pgd)); /* kernel */
46933+ xen_pgd_pin(__pa(__user_pgd(mm->pgd))); /* user */
46934+ mm->context.pinned = 1;
46935+ spin_lock(&mm_unpinned_lock);
46936+ list_del(&mm->context.unpinned);
46937+ spin_unlock(&mm_unpinned_lock);
46938+
46939+ spin_unlock(&mm->page_table_lock);
46940+}
46941+
46942+void mm_unpin(struct mm_struct *mm)
46943+{
46944+ if (xen_feature(XENFEAT_writable_page_tables))
46945+ return;
46946+
46947+ spin_lock(&mm->page_table_lock);
46948+
46949+ xen_pgd_unpin(__pa(mm->pgd));
46950+ xen_pgd_unpin(__pa(__user_pgd(mm->pgd)));
46951+ BUG_ON(HYPERVISOR_update_va_mapping(
46952+ (unsigned long)mm->pgd,
46953+ pfn_pte(virt_to_phys(mm->pgd)>>PAGE_SHIFT, PAGE_KERNEL), 0));
46954+ BUG_ON(HYPERVISOR_update_va_mapping(
46955+ (unsigned long)__user_pgd(mm->pgd),
46956+ pfn_pte(virt_to_phys(__user_pgd(mm->pgd))>>PAGE_SHIFT, PAGE_KERNEL), 0));
46957+ mm_walk(mm, PAGE_KERNEL);
46958+ xen_tlb_flush();
46959+ mm->context.pinned = 0;
46960+ spin_lock(&mm_unpinned_lock);
46961+ list_add(&mm->context.unpinned, &mm_unpinned);
46962+ spin_unlock(&mm_unpinned_lock);
46963+
46964+ spin_unlock(&mm->page_table_lock);
46965+}
46966+
46967+void mm_pin_all(void)
46968+{
46969+ if (xen_feature(XENFEAT_writable_page_tables))
46970+ return;
46971+
46972+ while (!list_empty(&mm_unpinned))
46973+ mm_pin(list_entry(mm_unpinned.next, struct mm_struct,
46974+ context.unpinned));
46975+}
46976+
46977+void _arch_dup_mmap(struct mm_struct *mm)
46978+{
46979+ if (!mm->context.pinned)
46980+ mm_pin(mm);
46981+}
46982+
46983+void _arch_exit_mmap(struct mm_struct *mm)
46984+{
46985+ struct task_struct *tsk = current;
46986+
46987+ task_lock(tsk);
46988+
46989+ /*
46990+ * We aggressively remove defunct pgd from cr3. We execute unmap_vmas()
46991+ * *much* faster this way, as no tlb flushes means bigger wrpt batches.
46992+ */
46993+ if ( tsk->active_mm == mm )
46994+ {
46995+ tsk->active_mm = &init_mm;
46996+ atomic_inc(&init_mm.mm_count);
46997+
46998+ switch_mm(mm, &init_mm, tsk);
46999+
47000+ atomic_dec(&mm->mm_count);
47001+ BUG_ON(atomic_read(&mm->mm_count) == 0);
47002+ }
47003+
47004+ task_unlock(tsk);
47005+
47006+ if ( mm->context.pinned && (atomic_read(&mm->mm_count) == 1) &&
47007+ !mm->context.has_foreign_mappings )
47008+ mm_unpin(mm);
47009+}
47010+
47011+void pte_free(struct page *pte)
47012+{
47013+ unsigned long va = (unsigned long)__va(page_to_pfn(pte)<<PAGE_SHIFT);
47014+
47015+ if (!pte_write(*virt_to_ptep(va)))
47016+ BUG_ON(HYPERVISOR_update_va_mapping(
47017+ va, pfn_pte(page_to_pfn(pte), PAGE_KERNEL), 0));
47018+ __free_page(pte);
47019+}
47020+#endif /* CONFIG_XEN */
47021+
47022+static inline pte_t *lookup_address(unsigned long address)
47023+{
47024+ pgd_t *pgd = pgd_offset_k(address);
47025+ pud_t *pud;
47026+ pmd_t *pmd;
47027+ pte_t *pte;
47028+ if (pgd_none(*pgd))
47029+ return NULL;
47030+ pud = pud_offset(pgd, address);
47031+ if (!pud_present(*pud))
47032+ return NULL;
47033+ pmd = pmd_offset(pud, address);
47034+ if (!pmd_present(*pmd))
47035+ return NULL;
47036+ if (pmd_large(*pmd))
47037+ return (pte_t *)pmd;
47038+ pte = pte_offset_kernel(pmd, address);
47039+ if (pte && !pte_present(*pte))
47040+ pte = NULL;
47041+ return pte;
47042+}
47043+
47044+static struct page *split_large_page(unsigned long address, pgprot_t prot,
47045+ pgprot_t ref_prot)
47046+{
47047+ int i;
47048+ unsigned long addr;
47049+ struct page *base = alloc_pages(GFP_KERNEL, 0);
47050+ pte_t *pbase;
47051+ if (!base)
47052+ return NULL;
47053+ address = __pa(address);
47054+ addr = address & LARGE_PAGE_MASK;
47055+ pbase = (pte_t *)page_address(base);
47056+ for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) {
47057+ pbase[i] = pfn_pte(addr >> PAGE_SHIFT,
47058+ addr == address ? prot : ref_prot);
47059+ }
47060+ return base;
47061+}
47062+
47063+
47064+static void flush_kernel_map(void *address)
47065+{
47066+ if (0 && address && cpu_has_clflush) {
47067+ /* is this worth it? */
47068+ int i;
47069+ for (i = 0; i < PAGE_SIZE; i += boot_cpu_data.x86_clflush_size)
47070+ asm volatile("clflush (%0)" :: "r" (address + i));
47071+ } else
47072+ asm volatile("wbinvd":::"memory");
47073+ if (address)
47074+ __flush_tlb_one(address);
47075+ else
47076+ __flush_tlb_all();
47077+}
47078+
47079+
47080+static inline void flush_map(unsigned long address)
47081+{
47082+ on_each_cpu(flush_kernel_map, (void *)address, 1, 1);
47083+}
47084+
47085+struct deferred_page {
47086+ struct deferred_page *next;
47087+ struct page *fpage;
47088+ unsigned long address;
47089+};
47090+static struct deferred_page *df_list; /* protected by init_mm.mmap_sem */
47091+
47092+static inline void save_page(unsigned long address, struct page *fpage)
47093+{
47094+ struct deferred_page *df;
47095+ df = kmalloc(sizeof(struct deferred_page), GFP_KERNEL);
47096+ if (!df) {
47097+ flush_map(address);
47098+ __free_page(fpage);
47099+ } else {
47100+ df->next = df_list;
47101+ df->fpage = fpage;
47102+ df->address = address;
47103+ df_list = df;
47104+ }
47105+}
47106+
47107+/*
47108+ * No more special protections in this 2/4MB area - revert to a
47109+ * large page again.
47110+ */
47111+static void revert_page(unsigned long address, pgprot_t ref_prot)
47112+{
47113+ pgd_t *pgd;
47114+ pud_t *pud;
47115+ pmd_t *pmd;
47116+ pte_t large_pte;
47117+
47118+ pgd = pgd_offset_k(address);
47119+ BUG_ON(pgd_none(*pgd));
47120+ pud = pud_offset(pgd,address);
47121+ BUG_ON(pud_none(*pud));
47122+ pmd = pmd_offset(pud, address);
47123+ BUG_ON(pmd_val(*pmd) & _PAGE_PSE);
47124+ pgprot_val(ref_prot) |= _PAGE_PSE;
47125+ large_pte = mk_pte_phys(__pa(address) & LARGE_PAGE_MASK, ref_prot);
47126+ set_pte((pte_t *)pmd, large_pte);
47127+}
47128+
47129+static int
47130+__change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot,
47131+ pgprot_t ref_prot)
47132+{
47133+ pte_t *kpte;
47134+ struct page *kpte_page;
47135+ unsigned kpte_flags;
47136+ pgprot_t ref_prot2;
47137+ kpte = lookup_address(address);
47138+ if (!kpte) return 0;
47139+ kpte_page = virt_to_page(((unsigned long)kpte) & PAGE_MASK);
47140+ kpte_flags = pte_val(*kpte);
47141+ if (pgprot_val(prot) != pgprot_val(ref_prot)) {
47142+ if ((kpte_flags & _PAGE_PSE) == 0) {
47143+ set_pte(kpte, pfn_pte(pfn, prot));
47144+ } else {
47145+ /*
47146+ * split_large_page will take the reference for this change_page_attr
47147+ * on the split page.
47148+ */
47149+
47150+ struct page *split;
47151+ ref_prot2 = __pgprot(pgprot_val(pte_pgprot(*lookup_address(address))) & ~(1<<_PAGE_BIT_PSE));
47152+
47153+ split = split_large_page(address, prot, ref_prot2);
47154+ if (!split)
47155+ return -ENOMEM;
47156+ set_pte(kpte,mk_pte(split, ref_prot2));
47157+ kpte_page = split;
47158+ }
47159+ get_page(kpte_page);
47160+ } else if ((kpte_flags & _PAGE_PSE) == 0) {
47161+ set_pte(kpte, pfn_pte(pfn, ref_prot));
47162+ __put_page(kpte_page);
47163+ } else
47164+ BUG();
47165+
47166+ /* on x86-64 the direct mapping set at boot is not using 4k pages */
47167+ /*
47168+ * ..., but the XEN guest kernels (currently) do:
47169+ * If the pte was reserved, it means it was created at boot
47170+ * time (not via split_large_page) and in turn we must not
47171+ * replace it with a large page.
47172+ */
47173+#ifndef CONFIG_XEN
47174+ BUG_ON(PageReserved(kpte_page));
47175+#else
47176+ if (!PageReserved(kpte_page))
47177+#endif
47178+ switch (page_count(kpte_page)) {
47179+ case 1:
47180+ save_page(address, kpte_page);
47181+ revert_page(address, ref_prot);
47182+ break;
47183+ case 0:
47184+ BUG(); /* memleak and failed 2M page regeneration */
47185+ }
47186+ return 0;
47187+}
47188+
47189+/*
47190+ * Change the page attributes of a page in the linear mapping.
47191+ *
47192+ * This should be used when a page is mapped with a different caching policy
47193+ * than write-back somewhere - some CPUs do not like it when mappings with
47194+ * different caching policies exist. This changes the page attributes of the
47195+ * in kernel linear mapping too.
47196+ *
47197+ * The caller needs to ensure that there are no conflicting mappings elsewhere.
47198+ * This function only deals with the kernel linear map.
47199+ *
47200+ * Caller must call global_flush_tlb() after this.
47201+ */
47202+int change_page_attr_addr(unsigned long address, int numpages, pgprot_t prot)
47203+{
47204+ int err = 0;
47205+ int i;
47206+
47207+ down_write(&init_mm.mmap_sem);
47208+ for (i = 0; i < numpages; i++, address += PAGE_SIZE) {
47209+ unsigned long pfn = __pa(address) >> PAGE_SHIFT;
47210+
47211+ err = __change_page_attr(address, pfn, prot, PAGE_KERNEL);
47212+ if (err)
47213+ break;
47214+ /* Handle kernel mapping too which aliases part of the
47215+ * lowmem */
47216+ if (__pa(address) < KERNEL_TEXT_SIZE) {
47217+ unsigned long addr2;
47218+ pgprot_t prot2 = prot;
47219+ addr2 = __START_KERNEL_map + __pa(address);
47220+ pgprot_val(prot2) &= ~_PAGE_NX;
47221+ err = __change_page_attr(addr2, pfn, prot2, PAGE_KERNEL_EXEC);
47222+ }
47223+ }
47224+ up_write(&init_mm.mmap_sem);
47225+ return err;
47226+}
47227+
47228+/* Don't call this for MMIO areas that may not have a mem_map entry */
47229+int change_page_attr(struct page *page, int numpages, pgprot_t prot)
47230+{
47231+ unsigned long addr = (unsigned long)page_address(page);
47232+ return change_page_attr_addr(addr, numpages, prot);
47233+}
47234+
47235+void global_flush_tlb(void)
47236+{
47237+ struct deferred_page *df, *next_df;
47238+
47239+ down_read(&init_mm.mmap_sem);
47240+ df = xchg(&df_list, NULL);
47241+ up_read(&init_mm.mmap_sem);
47242+ flush_map((df && !df->next) ? df->address : 0);
47243+ for (; df; df = next_df) {
47244+ next_df = df->next;
47245+ if (df->fpage)
47246+ __free_page(df->fpage);
47247+ kfree(df);
47248+ }
47249+}
47250+
47251+EXPORT_SYMBOL(change_page_attr);
47252+EXPORT_SYMBOL(global_flush_tlb);
47253diff -Nur linux-2.6.16.33-noxen/arch/x86_64/oprofile/Makefile linux-2.6.16.33/arch/x86_64/oprofile/Makefile
47254--- linux-2.6.16.33-noxen/arch/x86_64/oprofile/Makefile 2006-11-22 18:06:31.000000000 +0000
47255+++ linux-2.6.16.33/arch/x86_64/oprofile/Makefile 2007-01-08 15:00:45.000000000 +0000
47256@@ -11,9 +11,15 @@
47257 oprofilefs.o oprofile_stats.o \
47258 timer_int.o )
47259
47260+ifdef CONFIG_XEN
47261+XENOPROF_COMMON_OBJS = $(addprefix ../../../drivers/xen/xenoprof/, \
47262+ xenoprofile.o)
47263+OPROFILE-y := xenoprof.o
47264+else
47265 OPROFILE-y := init.o backtrace.o
47266 OPROFILE-$(CONFIG_X86_LOCAL_APIC) += nmi_int.o op_model_athlon.o op_model_p4.o \
47267 op_model_ppro.o
47268 OPROFILE-$(CONFIG_X86_IO_APIC) += nmi_timer_int.o
47269-
47270-oprofile-y = $(DRIVER_OBJS) $(addprefix ../../i386/oprofile/, $(OPROFILE-y))
47271+endif
47272+oprofile-y = $(DRIVER_OBJS) $(XENOPROF_COMMON_OBJS) \
47273+ $(addprefix ../../i386/oprofile/, $(OPROFILE-y))
47274diff -Nur linux-2.6.16.33-noxen/arch/x86_64/pci/Makefile linux-2.6.16.33/arch/x86_64/pci/Makefile
47275--- linux-2.6.16.33-noxen/arch/x86_64/pci/Makefile 2006-11-22 18:06:31.000000000 +0000
47276+++ linux-2.6.16.33/arch/x86_64/pci/Makefile 2007-01-08 15:00:45.000000000 +0000
47277@@ -15,10 +15,22 @@
47278
47279 obj-$(CONFIG_NUMA) += k8-bus.o
47280
47281+# pcifront should be after mmconfig.o and direct.o as it should only
47282+# take over if direct access to the PCI bus is unavailable
47283+obj-$(CONFIG_XEN_PCIDEV_FRONTEND) += pcifront.o
47284+
47285 direct-y += ../../i386/pci/direct.o
47286 acpi-y += ../../i386/pci/acpi.o
47287+pcifront-y += ../../i386/pci/pcifront.o
47288 legacy-y += ../../i386/pci/legacy.o
47289 irq-y += ../../i386/pci/irq.o
47290 common-y += ../../i386/pci/common.o
47291 fixup-y += ../../i386/pci/fixup.o
47292 i386-y += ../../i386/pci/i386.o
47293+
47294+ifdef CONFIG_XEN
47295+irq-y := ../../i386/pci/irq-xen.o
47296+include $(srctree)/scripts/Makefile.xen
47297+
47298+obj-y := $(call cherrypickxen, $(obj-y))
47299+endif
47300diff -Nur linux-2.6.16.33-noxen/arch/x86_64/pci/mmconfig.c linux-2.6.16.33/arch/x86_64/pci/mmconfig.c
47301--- linux-2.6.16.33-noxen/arch/x86_64/pci/mmconfig.c 2006-11-22 18:06:31.000000000 +0000
47302+++ linux-2.6.16.33/arch/x86_64/pci/mmconfig.c 2007-05-23 21:00:01.000000000 +0000
47303@@ -9,11 +9,19 @@
47304 #include <linux/init.h>
47305 #include <linux/acpi.h>
47306 #include <linux/bitmap.h>
47307+#include <asm/e820.h>
47308+
47309 #include "pci.h"
47310
47311-#define MMCONFIG_APER_SIZE (256*1024*1024)
47312+/* aperture is up to 256MB but BIOS may reserve less */
47313+#define MMCONFIG_APER_MIN (2 * 1024*1024)
47314+#define MMCONFIG_APER_MAX (256 * 1024*1024)
47315+
47316+/* Verify the first 16 busses. We assume that systems with more busses
47317+ get MCFG right. */
47318+#define MAX_CHECK_BUS 16
47319
47320-static DECLARE_BITMAP(fallback_slots, 32);
47321+static DECLARE_BITMAP(fallback_slots, 32*MAX_CHECK_BUS);
47322
47323 /* Static virtual mapping of the MMCONFIG aperture */
47324 struct mmcfg_virt {
47325@@ -55,7 +63,8 @@
47326 static char __iomem *pci_dev_base(unsigned int seg, unsigned int bus, unsigned int devfn)
47327 {
47328 char __iomem *addr;
47329- if (seg == 0 && bus == 0 && test_bit(PCI_SLOT(devfn), &fallback_slots))
47330+ if (seg == 0 && bus < MAX_CHECK_BUS &&
47331+ test_bit(32*bus + PCI_SLOT(devfn), fallback_slots))
47332 return NULL;
47333 addr = get_virt(seg, bus);
47334 if (!addr)
47335@@ -69,8 +78,10 @@
47336 char __iomem *addr;
47337
47338 /* Why do we have this when nobody checks it. How about a BUG()!? -AK */
47339- if (unlikely(!value || (bus > 255) || (devfn > 255) || (reg > 4095)))
47340+ if (unlikely((bus > 255) || (devfn > 255) || (reg > 4095))) {
47341+ *value = -1;
47342 return -EINVAL;
47343+ }
47344
47345 addr = pci_dev_base(seg, bus, devfn);
47346 if (!addr)
47347@@ -129,23 +140,56 @@
47348 Normally this can be expressed in the MCFG by not listing them
47349 and assigning suitable _SEGs, but this isn't implemented in some BIOS.
47350 Instead try to discover all devices on bus 0 that are unreachable using MM
47351- and fallback for them.
47352- We only do this for bus 0/seg 0 */
47353+ and fallback for them. */
47354 static __init void unreachable_devices(void)
47355 {
47356- int i;
47357- for (i = 0; i < 32; i++) {
47358- u32 val1;
47359- char __iomem *addr;
47360+ int i, k;
47361+ /* Use the max bus number from ACPI here? */
47362+ for (k = 0; k < MAX_CHECK_BUS; k++) {
47363+ for (i = 0; i < 32; i++) {
47364+ u32 val1;
47365+ char __iomem *addr;
47366+
47367+ pci_conf1_read(0, k, PCI_DEVFN(i,0), 0, 4, &val1);
47368+ if (val1 == 0xffffffff)
47369+ continue;
47370+ addr = pci_dev_base(0, k, PCI_DEVFN(i, 0));
47371+ if (addr == NULL|| readl(addr) != val1) {
47372+ set_bit(i + 32*k, fallback_slots);
47373+ printk(KERN_NOTICE
47374+ "PCI: No mmconfig possible on device %x:%x\n",
47375+ k, i);
47376+ }
47377+ }
47378+ }
47379+}
47380
47381- pci_conf1_read(0, 0, PCI_DEVFN(i,0), 0, 4, &val1);
47382- if (val1 == 0xffffffff)
47383+/* NB. Ripped from arch/x86_64/kernel/e820.c for this Xen bugfix patch. */
47384+#ifdef CONFIG_XEN
47385+extern struct e820map machine_e820;
47386+#define e820 machine_e820
47387+#endif
47388+static int __init e820_all_mapped(unsigned long start, unsigned long end, unsigned type)
47389+{
47390+ int i;
47391+ for (i = 0; i < e820.nr_map; i++) {
47392+ struct e820entry *ei = &e820.map[i];
47393+ if (type && ei->type != type)
47394 continue;
47395- addr = pci_dev_base(0, 0, PCI_DEVFN(i, 0));
47396- if (addr == NULL|| readl(addr) != val1) {
47397- set_bit(i, &fallback_slots);
47398- }
47399+ /* is the region (part) in overlap with the current region ?*/
47400+ if (ei->addr >= end || ei->addr + ei->size <= start)
47401+ continue;
47402+
47403+ /* if the region is at the beginning of <start,end> we move
47404+ * start to the end of the region since it's ok until there
47405+ */
47406+ if (ei->addr <= start)
47407+ start = ei->addr + ei->size;
47408+ /* if start is now at or beyond end, we're done, full coverage */
47409+ if (start >= end)
47410+ return 1; /* we're done */
47411 }
47412+ return 0;
47413 }
47414
47415 static int __init pci_mmcfg_init(void)
47416@@ -161,6 +205,15 @@
47417 (pci_mmcfg_config[0].base_address == 0))
47418 return 0;
47419
47420+ if (!e820_all_mapped(pci_mmcfg_config[0].base_address,
47421+ pci_mmcfg_config[0].base_address + MMCONFIG_APER_MIN,
47422+ E820_RESERVED)) {
47423+ printk(KERN_ERR "PCI: BIOS Bug: MCFG area at %x is not E820-reserved\n",
47424+ pci_mmcfg_config[0].base_address);
47425+ printk(KERN_ERR "PCI: Not using MMCONFIG.\n");
47426+ return 0;
47427+ }
47428+
47429 /* RED-PEN i386 doesn't do _nocache right now */
47430 pci_mmcfg_virt = kmalloc(sizeof(*pci_mmcfg_virt) * pci_mmcfg_config_num, GFP_KERNEL);
47431 if (pci_mmcfg_virt == NULL) {
47432@@ -169,7 +222,8 @@
47433 }
47434 for (i = 0; i < pci_mmcfg_config_num; ++i) {
47435 pci_mmcfg_virt[i].cfg = &pci_mmcfg_config[i];
47436- pci_mmcfg_virt[i].virt = ioremap_nocache(pci_mmcfg_config[i].base_address, MMCONFIG_APER_SIZE);
47437+ pci_mmcfg_virt[i].virt = ioremap_nocache(pci_mmcfg_config[i].base_address,
47438+ MMCONFIG_APER_MAX);
47439 if (!pci_mmcfg_virt[i].virt) {
47440 printk("PCI: Cannot map mmconfig aperture for segment %d\n",
47441 pci_mmcfg_config[i].pci_segment_group_number);
47442diff -Nur linux-2.6.16.33-noxen/drivers/Makefile linux-2.6.16.33/drivers/Makefile
47443--- linux-2.6.16.33-noxen/drivers/Makefile 2006-11-22 18:06:31.000000000 +0000
47444+++ linux-2.6.16.33/drivers/Makefile 2007-01-08 15:00:45.000000000 +0000
47445@@ -34,6 +34,7 @@
47446 obj-$(CONFIG_NUBUS) += nubus/
47447 obj-$(CONFIG_ATM) += atm/
47448 obj-$(CONFIG_PPC_PMAC) += macintosh/
47449+obj-$(CONFIG_XEN) += xen/
47450 obj-$(CONFIG_IDE) += ide/
47451 obj-$(CONFIG_FC4) += fc4/
47452 obj-$(CONFIG_SCSI) += scsi/
47453diff -Nur linux-2.6.16.33-noxen/drivers/acpi/Kconfig linux-2.6.16.33/drivers/acpi/Kconfig
47454--- linux-2.6.16.33-noxen/drivers/acpi/Kconfig 2006-11-22 18:06:31.000000000 +0000
47455+++ linux-2.6.16.33/drivers/acpi/Kconfig 2007-01-08 15:00:45.000000000 +0000
47456@@ -46,7 +46,7 @@
47457
47458 config ACPI_SLEEP
47459 bool "Sleep States"
47460- depends on X86 && (!SMP || SUSPEND_SMP)
47461+ depends on X86 && (!SMP || SUSPEND_SMP) && !XEN
47462 depends on PM
47463 default y
47464 ---help---
47465@@ -287,6 +287,7 @@
47466 config X86_PM_TIMER
47467 bool "Power Management Timer Support" if EMBEDDED
47468 depends on X86
47469+ depends on !XEN
47470 default y
47471 help
47472 The Power Management Timer is available on all ACPI-capable,
47473diff -Nur linux-2.6.16.33-noxen/drivers/acpi/tables.c linux-2.6.16.33/drivers/acpi/tables.c
47474--- linux-2.6.16.33-noxen/drivers/acpi/tables.c 2006-11-22 18:06:31.000000000 +0000
47475+++ linux-2.6.16.33/drivers/acpi/tables.c 2007-01-08 15:00:45.000000000 +0000
47476@@ -572,6 +572,11 @@
47477 *
47478 * result: sdt_entry[] is initialized
47479 */
47480+#if defined(CONFIG_X86_XEN) || defined(CONFIG_X86_64_XEN)
47481+#define acpi_rsdp_phys_to_va(rsdp_phys) isa_bus_to_virt(rsdp_phys)
47482+#else
47483+#define acpi_rsdp_phys_to_va(rsdp_phys) __va(rsdp_phys)
47484+#endif
47485
47486 int __init acpi_table_init(void)
47487 {
47488@@ -587,7 +592,7 @@
47489 return -ENODEV;
47490 }
47491
47492- rsdp = (struct acpi_table_rsdp *)__va(rsdp_phys);
47493+ rsdp = (struct acpi_table_rsdp *)acpi_rsdp_phys_to_va(rsdp_phys);
47494 if (!rsdp) {
47495 printk(KERN_WARNING PREFIX "Unable to map RSDP\n");
47496 return -ENODEV;
47497diff -Nur linux-2.6.16.33-noxen/drivers/base/bus.c linux-2.6.16.33/drivers/base/bus.c
47498--- linux-2.6.16.33-noxen/drivers/base/bus.c 2006-11-22 18:06:31.000000000 +0000
47499+++ linux-2.6.16.33/drivers/base/bus.c 2007-05-23 21:00:01.000000000 +0000
47500@@ -188,6 +188,11 @@
47501 up(&dev->sem);
47502 if (dev->parent)
47503 up(&dev->parent->sem);
47504+
47505+ if (err > 0) /* success */
47506+ err = count;
47507+ else if (err == 0) /* driver didn't accept device */
47508+ err = -ENODEV;
47509 }
47510 put_device(dev);
47511 put_bus(bus);
47512diff -Nur linux-2.6.16.33-noxen/drivers/block/aoe/aoenet.c linux-2.6.16.33/drivers/block/aoe/aoenet.c
47513--- linux-2.6.16.33-noxen/drivers/block/aoe/aoenet.c 2006-11-22 18:06:31.000000000 +0000
47514+++ linux-2.6.16.33/drivers/block/aoe/aoenet.c 2007-05-23 21:00:01.000000000 +0000
47515@@ -95,9 +95,8 @@
47516 static struct sk_buff *
47517 skb_check(struct sk_buff *skb)
47518 {
47519- if (skb_is_nonlinear(skb))
47520 if ((skb = skb_share_check(skb, GFP_ATOMIC)))
47521- if (skb_linearize(skb, GFP_ATOMIC) < 0) {
47522+ if (skb_linearize(skb)) {
47523 dev_kfree_skb(skb);
47524 return NULL;
47525 }
47526diff -Nur linux-2.6.16.33-noxen/drivers/char/mem.c linux-2.6.16.33/drivers/char/mem.c
47527--- linux-2.6.16.33-noxen/drivers/char/mem.c 2006-11-22 18:06:31.000000000 +0000
47528+++ linux-2.6.16.33/drivers/char/mem.c 2007-01-08 15:00:45.000000000 +0000
47529@@ -108,6 +108,7 @@
47530 }
47531 #endif
47532
47533+#ifndef ARCH_HAS_DEV_MEM
47534 /*
47535 * This funcion reads the *physical* memory. The f_pos points directly to the
47536 * memory location.
47537@@ -232,6 +233,7 @@
47538 *ppos += written;
47539 return written;
47540 }
47541+#endif
47542
47543 #ifndef __HAVE_PHYS_MEM_ACCESS_PROT
47544 static pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
47545@@ -773,6 +775,7 @@
47546 #define open_kmem open_mem
47547 #define open_oldmem open_mem
47548
47549+#ifndef ARCH_HAS_DEV_MEM
47550 static struct file_operations mem_fops = {
47551 .llseek = memory_lseek,
47552 .read = read_mem,
47553@@ -780,6 +783,9 @@
47554 .mmap = mmap_mem,
47555 .open = open_mem,
47556 };
47557+#else
47558+extern struct file_operations mem_fops;
47559+#endif
47560
47561 static struct file_operations kmem_fops = {
47562 .llseek = memory_lseek,
47563diff -Nur linux-2.6.16.33-noxen/drivers/char/tpm/Kconfig linux-2.6.16.33/drivers/char/tpm/Kconfig
47564--- linux-2.6.16.33-noxen/drivers/char/tpm/Kconfig 2006-11-22 18:06:31.000000000 +0000
47565+++ linux-2.6.16.33/drivers/char/tpm/Kconfig 2007-01-08 15:00:45.000000000 +0000
47566@@ -20,9 +20,18 @@
47567 Note: For more TPM drivers enable CONFIG_PNP, CONFIG_ACPI
47568 and CONFIG_PNPACPI.
47569
47570+config TCG_TIS
47571+ tristate "TPM Interface Specification 1.2 Interface"
47572+ depends on TCG_TPM
47573+ ---help---
47574+ If you have a TPM security chip that is compliant with the
47575+ TCG TIS 1.2 TPM specification say Yes and it will be accessible
47576+ from within Linux. To compile this driver as a module, choose
47577+ M here; the module will be called tpm_tis.
47578+
47579 config TCG_NSC
47580 tristate "National Semiconductor TPM Interface"
47581- depends on TCG_TPM
47582+ depends on TCG_TPM && PNPACPI
47583 ---help---
47584 If you have a TPM security chip from National Semicondutor
47585 say Yes and it will be accessible from within Linux. To
47586@@ -49,5 +58,13 @@
47587 Further information on this driver and the supported hardware
47588 can be found at http://www.prosec.rub.de/tpm
47589
47590-endmenu
47591+config TCG_XEN
47592+ tristate "XEN TPM Interface"
47593+ depends on TCG_TPM && XEN
47594+ ---help---
47595+ If you want to make TPM support available to a Xen user domain,
47596+ say Yes and it will be accessible from within Linux.
47597+ To compile this driver as a module, choose M here; the module
47598+ will be called tpm_xenu.
47599
47600+endmenu
47601diff -Nur linux-2.6.16.33-noxen/drivers/char/tpm/Makefile linux-2.6.16.33/drivers/char/tpm/Makefile
47602--- linux-2.6.16.33-noxen/drivers/char/tpm/Makefile 2006-11-22 18:06:31.000000000 +0000
47603+++ linux-2.6.16.33/drivers/char/tpm/Makefile 2007-01-08 15:00:45.000000000 +0000
47604@@ -5,6 +5,9 @@
47605 ifdef CONFIG_ACPI
47606 obj-$(CONFIG_TCG_TPM) += tpm_bios.o
47607 endif
47608+obj-$(CONFIG_TCG_TIS) += tpm_tis.o
47609 obj-$(CONFIG_TCG_NSC) += tpm_nsc.o
47610 obj-$(CONFIG_TCG_ATMEL) += tpm_atmel.o
47611 obj-$(CONFIG_TCG_INFINEON) += tpm_infineon.o
47612+obj-$(CONFIG_TCG_XEN) += tpm_xenu.o
47613+tpm_xenu-y = tpm_xen.o tpm_vtpm.o
47614diff -Nur linux-2.6.16.33-noxen/drivers/char/tpm/tpm.c linux-2.6.16.33/drivers/char/tpm/tpm.c
47615--- linux-2.6.16.33-noxen/drivers/char/tpm/tpm.c 2006-11-22 18:06:31.000000000 +0000
47616+++ linux-2.6.16.33/drivers/char/tpm/tpm.c 2007-01-08 15:00:45.000000000 +0000
47617@@ -30,14 +30,295 @@
47618
47619 enum tpm_const {
47620 TPM_MINOR = 224, /* officially assigned */
47621+#ifndef CONFIG_XEN
47622 TPM_BUFSIZE = 2048,
47623+#endif
47624 TPM_NUM_DEVICES = 256,
47625- TPM_NUM_MASK_ENTRIES = TPM_NUM_DEVICES / (8 * sizeof(int))
47626 };
47627
47628+enum tpm_duration {
47629+ TPM_SHORT = 0,
47630+ TPM_MEDIUM = 1,
47631+ TPM_LONG = 2,
47632+ TPM_UNDEFINED,
47633+};
47634+
47635+#define TPM_MAX_ORDINAL 243
47636+#define TPM_MAX_PROTECTED_ORDINAL 12
47637+#define TPM_PROTECTED_ORDINAL_MASK 0xFF
47638+
47639 static LIST_HEAD(tpm_chip_list);
47640 static DEFINE_SPINLOCK(driver_lock);
47641-static int dev_mask[TPM_NUM_MASK_ENTRIES];
47642+static DECLARE_BITMAP(dev_mask, TPM_NUM_DEVICES);
47643+
47644+/*
47645+ * Array with one entry per ordinal defining the maximum amount
47646+ * of time the chip could take to return the result. The ordinal
47647+ * designation of short, medium or long is defined in a table in
47648+ * TCG Specification TPM Main Part 2 TPM Structures Section 17. The
47649+ * values of the SHORT, MEDIUM, and LONG durations are retrieved
47650+ * from the chip during initialization with a call to tpm_get_timeouts.
47651+ */
47652+static const u8 tpm_protected_ordinal_duration[TPM_MAX_PROTECTED_ORDINAL] = {
47653+ TPM_UNDEFINED, /* 0 */
47654+ TPM_UNDEFINED,
47655+ TPM_UNDEFINED,
47656+ TPM_UNDEFINED,
47657+ TPM_UNDEFINED,
47658+ TPM_UNDEFINED, /* 5 */
47659+ TPM_UNDEFINED,
47660+ TPM_UNDEFINED,
47661+ TPM_UNDEFINED,
47662+ TPM_UNDEFINED,
47663+ TPM_SHORT, /* 10 */
47664+ TPM_SHORT,
47665+};
47666+
47667+static const u8 tpm_ordinal_duration[TPM_MAX_ORDINAL] = {
47668+ TPM_UNDEFINED, /* 0 */
47669+ TPM_UNDEFINED,
47670+ TPM_UNDEFINED,
47671+ TPM_UNDEFINED,
47672+ TPM_UNDEFINED,
47673+ TPM_UNDEFINED, /* 5 */
47674+ TPM_UNDEFINED,
47675+ TPM_UNDEFINED,
47676+ TPM_UNDEFINED,
47677+ TPM_UNDEFINED,
47678+ TPM_SHORT, /* 10 */
47679+ TPM_SHORT,
47680+ TPM_MEDIUM,
47681+ TPM_LONG,
47682+ TPM_LONG,
47683+ TPM_MEDIUM, /* 15 */
47684+ TPM_SHORT,
47685+ TPM_SHORT,
47686+ TPM_MEDIUM,
47687+ TPM_LONG,
47688+ TPM_SHORT, /* 20 */
47689+ TPM_SHORT,
47690+ TPM_MEDIUM,
47691+ TPM_MEDIUM,
47692+ TPM_MEDIUM,
47693+ TPM_SHORT, /* 25 */
47694+ TPM_SHORT,
47695+ TPM_MEDIUM,
47696+ TPM_SHORT,
47697+ TPM_SHORT,
47698+ TPM_MEDIUM, /* 30 */
47699+ TPM_LONG,
47700+ TPM_MEDIUM,
47701+ TPM_SHORT,
47702+ TPM_SHORT,
47703+ TPM_SHORT, /* 35 */
47704+ TPM_MEDIUM,
47705+ TPM_MEDIUM,
47706+ TPM_UNDEFINED,
47707+ TPM_UNDEFINED,
47708+ TPM_MEDIUM, /* 40 */
47709+ TPM_LONG,
47710+ TPM_MEDIUM,
47711+ TPM_SHORT,
47712+ TPM_SHORT,
47713+ TPM_SHORT, /* 45 */
47714+ TPM_SHORT,
47715+ TPM_SHORT,
47716+ TPM_SHORT,
47717+ TPM_LONG,
47718+ TPM_MEDIUM, /* 50 */
47719+ TPM_MEDIUM,
47720+ TPM_UNDEFINED,
47721+ TPM_UNDEFINED,
47722+ TPM_UNDEFINED,
47723+ TPM_UNDEFINED, /* 55 */
47724+ TPM_UNDEFINED,
47725+ TPM_UNDEFINED,
47726+ TPM_UNDEFINED,
47727+ TPM_UNDEFINED,
47728+ TPM_MEDIUM, /* 60 */
47729+ TPM_MEDIUM,
47730+ TPM_MEDIUM,
47731+ TPM_SHORT,
47732+ TPM_SHORT,
47733+ TPM_MEDIUM, /* 65 */
47734+ TPM_UNDEFINED,
47735+ TPM_UNDEFINED,
47736+ TPM_UNDEFINED,
47737+ TPM_UNDEFINED,
47738+ TPM_SHORT, /* 70 */
47739+ TPM_SHORT,
47740+ TPM_UNDEFINED,
47741+ TPM_UNDEFINED,
47742+ TPM_UNDEFINED,
47743+ TPM_UNDEFINED, /* 75 */
47744+ TPM_UNDEFINED,
47745+ TPM_UNDEFINED,
47746+ TPM_UNDEFINED,
47747+ TPM_UNDEFINED,
47748+ TPM_LONG, /* 80 */
47749+ TPM_UNDEFINED,
47750+ TPM_MEDIUM,
47751+ TPM_LONG,
47752+ TPM_SHORT,
47753+ TPM_UNDEFINED, /* 85 */
47754+ TPM_UNDEFINED,
47755+ TPM_UNDEFINED,
47756+ TPM_UNDEFINED,
47757+ TPM_UNDEFINED,
47758+ TPM_SHORT, /* 90 */
47759+ TPM_SHORT,
47760+ TPM_SHORT,
47761+ TPM_SHORT,
47762+ TPM_SHORT,
47763+ TPM_UNDEFINED, /* 95 */
47764+ TPM_UNDEFINED,
47765+ TPM_UNDEFINED,
47766+ TPM_UNDEFINED,
47767+ TPM_UNDEFINED,
47768+ TPM_MEDIUM, /* 100 */
47769+ TPM_SHORT,
47770+ TPM_SHORT,
47771+ TPM_UNDEFINED,
47772+ TPM_UNDEFINED,
47773+ TPM_UNDEFINED, /* 105 */
47774+ TPM_UNDEFINED,
47775+ TPM_UNDEFINED,
47776+ TPM_UNDEFINED,
47777+ TPM_UNDEFINED,
47778+ TPM_SHORT, /* 110 */
47779+ TPM_SHORT,
47780+ TPM_SHORT,
47781+ TPM_SHORT,
47782+ TPM_SHORT,
47783+ TPM_SHORT, /* 115 */
47784+ TPM_SHORT,
47785+ TPM_SHORT,
47786+ TPM_UNDEFINED,
47787+ TPM_UNDEFINED,
47788+ TPM_LONG, /* 120 */
47789+ TPM_LONG,
47790+ TPM_MEDIUM,
47791+ TPM_UNDEFINED,
47792+ TPM_SHORT,
47793+ TPM_SHORT, /* 125 */
47794+ TPM_SHORT,
47795+ TPM_LONG,
47796+ TPM_SHORT,
47797+ TPM_SHORT,
47798+ TPM_SHORT, /* 130 */
47799+ TPM_MEDIUM,
47800+ TPM_UNDEFINED,
47801+ TPM_SHORT,
47802+ TPM_MEDIUM,
47803+ TPM_UNDEFINED, /* 135 */
47804+ TPM_UNDEFINED,
47805+ TPM_UNDEFINED,
47806+ TPM_UNDEFINED,
47807+ TPM_UNDEFINED,
47808+ TPM_SHORT, /* 140 */
47809+ TPM_SHORT,
47810+ TPM_UNDEFINED,
47811+ TPM_UNDEFINED,
47812+ TPM_UNDEFINED,
47813+ TPM_UNDEFINED, /* 145 */
47814+ TPM_UNDEFINED,
47815+ TPM_UNDEFINED,
47816+ TPM_UNDEFINED,
47817+ TPM_UNDEFINED,
47818+ TPM_SHORT, /* 150 */
47819+ TPM_MEDIUM,
47820+ TPM_MEDIUM,
47821+ TPM_SHORT,
47822+ TPM_SHORT,
47823+ TPM_UNDEFINED, /* 155 */
47824+ TPM_UNDEFINED,
47825+ TPM_UNDEFINED,
47826+ TPM_UNDEFINED,
47827+ TPM_UNDEFINED,
47828+ TPM_SHORT, /* 160 */
47829+ TPM_SHORT,
47830+ TPM_SHORT,
47831+ TPM_SHORT,
47832+ TPM_UNDEFINED,
47833+ TPM_UNDEFINED, /* 165 */
47834+ TPM_UNDEFINED,
47835+ TPM_UNDEFINED,
47836+ TPM_UNDEFINED,
47837+ TPM_UNDEFINED,
47838+ TPM_LONG, /* 170 */
47839+ TPM_UNDEFINED,
47840+ TPM_UNDEFINED,
47841+ TPM_UNDEFINED,
47842+ TPM_UNDEFINED,
47843+ TPM_UNDEFINED, /* 175 */
47844+ TPM_UNDEFINED,
47845+ TPM_UNDEFINED,
47846+ TPM_UNDEFINED,
47847+ TPM_UNDEFINED,
47848+ TPM_MEDIUM, /* 180 */
47849+ TPM_SHORT,
47850+ TPM_MEDIUM,
47851+ TPM_MEDIUM,
47852+ TPM_MEDIUM,
47853+ TPM_MEDIUM, /* 185 */
47854+ TPM_SHORT,
47855+ TPM_UNDEFINED,
47856+ TPM_UNDEFINED,
47857+ TPM_UNDEFINED,
47858+ TPM_UNDEFINED, /* 190 */
47859+ TPM_UNDEFINED,
47860+ TPM_UNDEFINED,
47861+ TPM_UNDEFINED,
47862+ TPM_UNDEFINED,
47863+ TPM_UNDEFINED, /* 195 */
47864+ TPM_UNDEFINED,
47865+ TPM_UNDEFINED,
47866+ TPM_UNDEFINED,
47867+ TPM_UNDEFINED,
47868+ TPM_SHORT, /* 200 */
47869+ TPM_UNDEFINED,
47870+ TPM_UNDEFINED,
47871+ TPM_UNDEFINED,
47872+ TPM_SHORT,
47873+ TPM_SHORT, /* 205 */
47874+ TPM_SHORT,
47875+ TPM_SHORT,
47876+ TPM_SHORT,
47877+ TPM_SHORT,
47878+ TPM_MEDIUM, /* 210 */
47879+ TPM_UNDEFINED,
47880+ TPM_MEDIUM,
47881+ TPM_MEDIUM,
47882+ TPM_MEDIUM,
47883+ TPM_UNDEFINED, /* 215 */
47884+ TPM_MEDIUM,
47885+ TPM_UNDEFINED,
47886+ TPM_UNDEFINED,
47887+ TPM_SHORT,
47888+ TPM_SHORT, /* 220 */
47889+ TPM_SHORT,
47890+ TPM_SHORT,
47891+ TPM_SHORT,
47892+ TPM_SHORT,
47893+ TPM_UNDEFINED, /* 225 */
47894+ TPM_UNDEFINED,
47895+ TPM_UNDEFINED,
47896+ TPM_UNDEFINED,
47897+ TPM_UNDEFINED,
47898+ TPM_SHORT, /* 230 */
47899+ TPM_LONG,
47900+ TPM_MEDIUM,
47901+ TPM_UNDEFINED,
47902+ TPM_UNDEFINED,
47903+ TPM_UNDEFINED, /* 235 */
47904+ TPM_UNDEFINED,
47905+ TPM_UNDEFINED,
47906+ TPM_UNDEFINED,
47907+ TPM_UNDEFINED,
47908+ TPM_SHORT, /* 240 */
47909+ TPM_UNDEFINED,
47910+ TPM_MEDIUM,
47911+};
47912
47913 static void user_reader_timeout(unsigned long ptr)
47914 {
47915@@ -46,28 +327,58 @@
47916 schedule_work(&chip->work);
47917 }
47918
47919-static void timeout_work(void * ptr)
47920+static void timeout_work(void *ptr)
47921 {
47922 struct tpm_chip *chip = ptr;
47923
47924 down(&chip->buffer_mutex);
47925 atomic_set(&chip->data_pending, 0);
47926+#ifndef CONFIG_XEN
47927 memset(chip->data_buffer, 0, TPM_BUFSIZE);
47928+#else
47929+ memset(chip->data_buffer, 0, get_chip_buffersize(chip));
47930+#endif
47931 up(&chip->buffer_mutex);
47932 }
47933
47934 /*
47935+ * Returns max number of jiffies to wait
47936+ */
47937+unsigned long tpm_calc_ordinal_duration(struct tpm_chip *chip,
47938+ u32 ordinal)
47939+{
47940+ int duration_idx = TPM_UNDEFINED;
47941+ int duration = 0;
47942+
47943+ if (ordinal < TPM_MAX_ORDINAL)
47944+ duration_idx = tpm_ordinal_duration[ordinal];
47945+ else if ((ordinal & TPM_PROTECTED_ORDINAL_MASK) <
47946+ TPM_MAX_PROTECTED_ORDINAL)
47947+ duration_idx =
47948+ tpm_protected_ordinal_duration[ordinal &
47949+ TPM_PROTECTED_ORDINAL_MASK];
47950+
47951+ if (duration_idx != TPM_UNDEFINED)
47952+ duration = chip->vendor.duration[duration_idx];
47953+ if (duration <= 0)
47954+ return 2 * 60 * HZ;
47955+ else
47956+ return duration;
47957+}
47958+EXPORT_SYMBOL_GPL(tpm_calc_ordinal_duration);
47959+
47960+/*
47961 * Internal kernel interface to transmit TPM commands
47962 */
47963 static ssize_t tpm_transmit(struct tpm_chip *chip, const char *buf,
47964 size_t bufsiz)
47965 {
47966 ssize_t rc;
47967- u32 count;
47968+ u32 count, ordinal;
47969 unsigned long stop;
47970
47971 count = be32_to_cpu(*((__be32 *) (buf + 2)));
47972-
47973+ ordinal = be32_to_cpu(*((__be32 *) (buf + 6)));
47974 if (count == 0)
47975 return -ENODATA;
47976 if (count > bufsiz) {
47977@@ -78,21 +389,23 @@
47978
47979 down(&chip->tpm_mutex);
47980
47981- if ((rc = chip->vendor->send(chip, (u8 *) buf, count)) < 0) {
47982+ if ((rc = chip->vendor.send(chip, (u8 *) buf, count)) < 0) {
47983 dev_err(chip->dev,
47984 "tpm_transmit: tpm_send: error %zd\n", rc);
47985 goto out;
47986 }
47987
47988- stop = jiffies + 2 * 60 * HZ;
47989+ if (chip->vendor.irq)
47990+ goto out_recv;
47991+
47992+ stop = jiffies + tpm_calc_ordinal_duration(chip, ordinal);
47993 do {
47994- u8 status = chip->vendor->status(chip);
47995- if ((status & chip->vendor->req_complete_mask) ==
47996- chip->vendor->req_complete_val) {
47997+ u8 status = chip->vendor.status(chip);
47998+ if ((status & chip->vendor.req_complete_mask) ==
47999+ chip->vendor.req_complete_val)
48000 goto out_recv;
48001- }
48002
48003- if ((status == chip->vendor->req_canceled)) {
48004+ if ((status == chip->vendor.req_canceled)) {
48005 dev_err(chip->dev, "Operation Canceled\n");
48006 rc = -ECANCELED;
48007 goto out;
48008@@ -102,14 +415,13 @@
48009 rmb();
48010 } while (time_before(jiffies, stop));
48011
48012-
48013- chip->vendor->cancel(chip);
48014+ chip->vendor.cancel(chip);
48015 dev_err(chip->dev, "Operation Timed out\n");
48016 rc = -ETIME;
48017 goto out;
48018
48019 out_recv:
48020- rc = chip->vendor->recv(chip, (u8 *) buf, bufsiz);
48021+ rc = chip->vendor.recv(chip, (u8 *) buf, bufsiz);
48022 if (rc < 0)
48023 dev_err(chip->dev,
48024 "tpm_transmit: tpm_recv: error %zd\n", rc);
48025@@ -119,17 +431,247 @@
48026 }
48027
48028 #define TPM_DIGEST_SIZE 20
48029-#define CAP_PCR_RESULT_SIZE 18
48030-static const u8 cap_pcr[] = {
48031+#define TPM_ERROR_SIZE 10
48032+#define TPM_RET_CODE_IDX 6
48033+#define TPM_GET_CAP_RET_SIZE_IDX 10
48034+#define TPM_GET_CAP_RET_UINT32_1_IDX 14
48035+#define TPM_GET_CAP_RET_UINT32_2_IDX 18
48036+#define TPM_GET_CAP_RET_UINT32_3_IDX 22
48037+#define TPM_GET_CAP_RET_UINT32_4_IDX 26
48038+#define TPM_GET_CAP_PERM_DISABLE_IDX 16
48039+#define TPM_GET_CAP_PERM_INACTIVE_IDX 18
48040+#define TPM_GET_CAP_RET_BOOL_1_IDX 14
48041+#define TPM_GET_CAP_TEMP_INACTIVE_IDX 16
48042+
48043+#define TPM_CAP_IDX 13
48044+#define TPM_CAP_SUBCAP_IDX 21
48045+
48046+enum tpm_capabilities {
48047+ TPM_CAP_FLAG = 4,
48048+ TPM_CAP_PROP = 5,
48049+};
48050+
48051+enum tpm_sub_capabilities {
48052+ TPM_CAP_PROP_PCR = 0x1,
48053+ TPM_CAP_PROP_MANUFACTURER = 0x3,
48054+ TPM_CAP_FLAG_PERM = 0x8,
48055+ TPM_CAP_FLAG_VOL = 0x9,
48056+ TPM_CAP_PROP_OWNER = 0x11,
48057+ TPM_CAP_PROP_TIS_TIMEOUT = 0x15,
48058+ TPM_CAP_PROP_TIS_DURATION = 0x20,
48059+};
48060+
48061+/*
48062+ * This is a semi generic GetCapability command for use
48063+ * with the capability type TPM_CAP_PROP or TPM_CAP_FLAG
48064+ * and their associated sub_capabilities.
48065+ */
48066+
48067+static const u8 tpm_cap[] = {
48068 0, 193, /* TPM_TAG_RQU_COMMAND */
48069 0, 0, 0, 22, /* length */
48070 0, 0, 0, 101, /* TPM_ORD_GetCapability */
48071- 0, 0, 0, 5,
48072- 0, 0, 0, 4,
48073- 0, 0, 1, 1
48074+ 0, 0, 0, 0, /* TPM_CAP_<TYPE> */
48075+ 0, 0, 0, 4, /* TPM_CAP_SUB_<TYPE> size */
48076+ 0, 0, 1, 0 /* TPM_CAP_SUB_<TYPE> */
48077 };
48078
48079-#define READ_PCR_RESULT_SIZE 30
48080+static ssize_t transmit_cmd(struct tpm_chip *chip, u8 *data, int len,
48081+ char *desc)
48082+{
48083+ int err;
48084+
48085+ len = tpm_transmit(chip, data, len);
48086+ if (len < 0)
48087+ return len;
48088+ if (len == TPM_ERROR_SIZE) {
48089+ err = be32_to_cpu(*((__be32 *) (data + TPM_RET_CODE_IDX)));
48090+ dev_dbg(chip->dev, "A TPM error (%d) occurred %s\n", err, desc);
48091+ return err;
48092+ }
48093+ return 0;
48094+}
48095+
48096+void tpm_gen_interrupt(struct tpm_chip *chip)
48097+{
48098+ u8 data[max_t(int, ARRAY_SIZE(tpm_cap), 30)];
48099+ ssize_t rc;
48100+
48101+ memcpy(data, tpm_cap, sizeof(tpm_cap));
48102+ data[TPM_CAP_IDX] = TPM_CAP_PROP;
48103+ data[TPM_CAP_SUBCAP_IDX] = TPM_CAP_PROP_TIS_TIMEOUT;
48104+
48105+ rc = transmit_cmd(chip, data, sizeof(data),
48106+ "attempting to determine the timeouts");
48107+}
48108+EXPORT_SYMBOL_GPL(tpm_gen_interrupt);
48109+
48110+void tpm_get_timeouts(struct tpm_chip *chip)
48111+{
48112+ u8 data[max_t(int, ARRAY_SIZE(tpm_cap), 30)];
48113+ ssize_t rc;
48114+ u32 timeout;
48115+
48116+ memcpy(data, tpm_cap, sizeof(tpm_cap));
48117+ data[TPM_CAP_IDX] = TPM_CAP_PROP;
48118+ data[TPM_CAP_SUBCAP_IDX] = TPM_CAP_PROP_TIS_TIMEOUT;
48119+
48120+ rc = transmit_cmd(chip, data, sizeof(data),
48121+ "attempting to determine the timeouts");
48122+ if (rc)
48123+ goto duration;
48124+
48125+ if (be32_to_cpu(*((__be32 *) (data + TPM_GET_CAP_RET_SIZE_IDX)))
48126+ != 4 * sizeof(u32))
48127+ goto duration;
48128+
48129+ /* Don't overwrite default if value is 0 */
48130+ timeout =
48131+ be32_to_cpu(*((__be32 *) (data + TPM_GET_CAP_RET_UINT32_1_IDX)));
48132+ if (timeout)
48133+ chip->vendor.timeout_a = msecs_to_jiffies(timeout);
48134+ timeout =
48135+ be32_to_cpu(*((__be32 *) (data + TPM_GET_CAP_RET_UINT32_2_IDX)));
48136+ if (timeout)
48137+ chip->vendor.timeout_b = msecs_to_jiffies(timeout);
48138+ timeout =
48139+ be32_to_cpu(*((__be32 *) (data + TPM_GET_CAP_RET_UINT32_3_IDX)));
48140+ if (timeout)
48141+ chip->vendor.timeout_c = msecs_to_jiffies(timeout);
48142+ timeout =
48143+ be32_to_cpu(*((__be32 *) (data + TPM_GET_CAP_RET_UINT32_4_IDX)));
48144+ if (timeout)
48145+ chip->vendor.timeout_d = msecs_to_jiffies(timeout);
48146+
48147+duration:
48148+ memcpy(data, tpm_cap, sizeof(tpm_cap));
48149+ data[TPM_CAP_IDX] = TPM_CAP_PROP;
48150+ data[TPM_CAP_SUBCAP_IDX] = TPM_CAP_PROP_TIS_DURATION;
48151+
48152+ rc = transmit_cmd(chip, data, sizeof(data),
48153+ "attempting to determine the durations");
48154+ if (rc)
48155+ return;
48156+
48157+ if (be32_to_cpu(*((__be32 *) (data + TPM_GET_CAP_RET_SIZE_IDX)))
48158+ != 3 * sizeof(u32))
48159+ return;
48160+
48161+ chip->vendor.duration[TPM_SHORT] =
48162+ msecs_to_jiffies(be32_to_cpu
48163+ (*((__be32 *) (data +
48164+ TPM_GET_CAP_RET_UINT32_1_IDX))));
48165+ chip->vendor.duration[TPM_MEDIUM] =
48166+ msecs_to_jiffies(be32_to_cpu
48167+ (*((__be32 *) (data +
48168+ TPM_GET_CAP_RET_UINT32_2_IDX))));
48169+ chip->vendor.duration[TPM_LONG] =
48170+ msecs_to_jiffies(be32_to_cpu
48171+ (*((__be32 *) (data +
48172+ TPM_GET_CAP_RET_UINT32_3_IDX))));
48173+}
48174+EXPORT_SYMBOL_GPL(tpm_get_timeouts);
48175+
48176+void tpm_continue_selftest(struct tpm_chip *chip)
48177+{
48178+ u8 data[] = {
48179+ 0, 193, /* TPM_TAG_RQU_COMMAND */
48180+ 0, 0, 0, 10, /* length */
48181+ 0, 0, 0, 83, /* TPM_ORD_ContinueSelfTest */
48182+ };
48183+
48184+ tpm_transmit(chip, data, sizeof(data));
48185+}
48186+EXPORT_SYMBOL_GPL(tpm_continue_selftest);
48187+
48188+ssize_t tpm_show_enabled(struct device * dev, struct device_attribute * attr,
48189+ char *buf)
48190+{
48191+ u8 data[max_t(int, ARRAY_SIZE(tpm_cap), 35)];
48192+ ssize_t rc;
48193+
48194+ struct tpm_chip *chip = dev_get_drvdata(dev);
48195+ if (chip == NULL)
48196+ return -ENODEV;
48197+
48198+ memcpy(data, tpm_cap, sizeof(tpm_cap));
48199+ data[TPM_CAP_IDX] = TPM_CAP_FLAG;
48200+ data[TPM_CAP_SUBCAP_IDX] = TPM_CAP_FLAG_PERM;
48201+
48202+ rc = transmit_cmd(chip, data, sizeof(data),
48203+ "attempting to determine the permanent state");
48204+ if (rc)
48205+ return 0;
48206+ return sprintf(buf, "%d\n", !data[TPM_GET_CAP_PERM_DISABLE_IDX]);
48207+}
48208+EXPORT_SYMBOL_GPL(tpm_show_enabled);
48209+
48210+ssize_t tpm_show_active(struct device * dev, struct device_attribute * attr,
48211+ char *buf)
48212+{
48213+ u8 data[max_t(int, ARRAY_SIZE(tpm_cap), 35)];
48214+ ssize_t rc;
48215+
48216+ struct tpm_chip *chip = dev_get_drvdata(dev);
48217+ if (chip == NULL)
48218+ return -ENODEV;
48219+
48220+ memcpy(data, tpm_cap, sizeof(tpm_cap));
48221+ data[TPM_CAP_IDX] = TPM_CAP_FLAG;
48222+ data[TPM_CAP_SUBCAP_IDX] = TPM_CAP_FLAG_PERM;
48223+
48224+ rc = transmit_cmd(chip, data, sizeof(data),
48225+ "attempting to determine the permanent state");
48226+ if (rc)
48227+ return 0;
48228+ return sprintf(buf, "%d\n", !data[TPM_GET_CAP_PERM_INACTIVE_IDX]);
48229+}
48230+EXPORT_SYMBOL_GPL(tpm_show_active);
48231+
48232+ssize_t tpm_show_owned(struct device * dev, struct device_attribute * attr,
48233+ char *buf)
48234+{
48235+ u8 data[sizeof(tpm_cap)];
48236+ ssize_t rc;
48237+
48238+ struct tpm_chip *chip = dev_get_drvdata(dev);
48239+ if (chip == NULL)
48240+ return -ENODEV;
48241+
48242+ memcpy(data, tpm_cap, sizeof(tpm_cap));
48243+ data[TPM_CAP_IDX] = TPM_CAP_PROP;
48244+ data[TPM_CAP_SUBCAP_IDX] = TPM_CAP_PROP_OWNER;
48245+
48246+ rc = transmit_cmd(chip, data, sizeof(data),
48247+ "attempting to determine the owner state");
48248+ if (rc)
48249+ return 0;
48250+ return sprintf(buf, "%d\n", data[TPM_GET_CAP_RET_BOOL_1_IDX]);
48251+}
48252+EXPORT_SYMBOL_GPL(tpm_show_owned);
48253+
48254+ssize_t tpm_show_temp_deactivated(struct device * dev,
48255+ struct device_attribute * attr, char *buf)
48256+{
48257+ u8 data[sizeof(tpm_cap)];
48258+ ssize_t rc;
48259+
48260+ struct tpm_chip *chip = dev_get_drvdata(dev);
48261+ if (chip == NULL)
48262+ return -ENODEV;
48263+
48264+ memcpy(data, tpm_cap, sizeof(tpm_cap));
48265+ data[TPM_CAP_IDX] = TPM_CAP_FLAG;
48266+ data[TPM_CAP_SUBCAP_IDX] = TPM_CAP_FLAG_VOL;
48267+
48268+ rc = transmit_cmd(chip, data, sizeof(data),
48269+ "attempting to determine the temporary state");
48270+ if (rc)
48271+ return 0;
48272+ return sprintf(buf, "%d\n", data[TPM_GET_CAP_TEMP_INACTIVE_IDX]);
48273+}
48274+EXPORT_SYMBOL_GPL(tpm_show_temp_deactivated);
48275+
48276 static const u8 pcrread[] = {
48277 0, 193, /* TPM_TAG_RQU_COMMAND */
48278 0, 0, 0, 14, /* length */
48279@@ -140,8 +682,8 @@
48280 ssize_t tpm_show_pcrs(struct device *dev, struct device_attribute *attr,
48281 char *buf)
48282 {
48283- u8 data[READ_PCR_RESULT_SIZE];
48284- ssize_t len;
48285+ u8 data[max_t(int, max(ARRAY_SIZE(tpm_cap), ARRAY_SIZE(pcrread)), 30)];
48286+ ssize_t rc;
48287 int i, j, num_pcrs;
48288 __be32 index;
48289 char *str = buf;
48290@@ -150,29 +692,24 @@
48291 if (chip == NULL)
48292 return -ENODEV;
48293
48294- memcpy(data, cap_pcr, sizeof(cap_pcr));
48295- if ((len = tpm_transmit(chip, data, sizeof(data)))
48296- < CAP_PCR_RESULT_SIZE) {
48297- dev_dbg(chip->dev, "A TPM error (%d) occurred "
48298- "attempting to determine the number of PCRS\n",
48299- be32_to_cpu(*((__be32 *) (data + 6))));
48300+ memcpy(data, tpm_cap, sizeof(tpm_cap));
48301+ data[TPM_CAP_IDX] = TPM_CAP_PROP;
48302+ data[TPM_CAP_SUBCAP_IDX] = TPM_CAP_PROP_PCR;
48303+
48304+ rc = transmit_cmd(chip, data, sizeof(data),
48305+ "attempting to determine the number of PCRS");
48306+ if (rc)
48307 return 0;
48308- }
48309
48310 num_pcrs = be32_to_cpu(*((__be32 *) (data + 14)));
48311-
48312 for (i = 0; i < num_pcrs; i++) {
48313 memcpy(data, pcrread, sizeof(pcrread));
48314 index = cpu_to_be32(i);
48315 memcpy(data + 10, &index, 4);
48316- if ((len = tpm_transmit(chip, data, sizeof(data)))
48317- < READ_PCR_RESULT_SIZE){
48318- dev_dbg(chip->dev, "A TPM error (%d) occurred"
48319- " attempting to read PCR %d of %d\n",
48320- be32_to_cpu(*((__be32 *) (data + 6))),
48321- i, num_pcrs);
48322+ rc = transmit_cmd(chip, data, sizeof(data),
48323+ "attempting to read a PCR");
48324+ if (rc)
48325 goto out;
48326- }
48327 str += sprintf(str, "PCR-%02d: ", i);
48328 for (j = 0; j < TPM_DIGEST_SIZE; j++)
48329 str += sprintf(str, "%02X ", *(data + 10 + j));
48330@@ -194,7 +731,7 @@
48331 char *buf)
48332 {
48333 u8 *data;
48334- ssize_t len;
48335+ ssize_t err;
48336 int i, rc;
48337 char *str = buf;
48338
48339@@ -208,14 +745,10 @@
48340
48341 memcpy(data, readpubek, sizeof(readpubek));
48342
48343- if ((len = tpm_transmit(chip, data, READ_PUBEK_RESULT_SIZE)) <
48344- READ_PUBEK_RESULT_SIZE) {
48345- dev_dbg(chip->dev, "A TPM error (%d) occurred "
48346- "attempting to read the PUBEK\n",
48347- be32_to_cpu(*((__be32 *) (data + 6))));
48348- rc = 0;
48349+ err = transmit_cmd(chip, data, READ_PUBEK_RESULT_SIZE,
48350+ "attempting to read the PUBEK");
48351+ if (err)
48352 goto out;
48353- }
48354
48355 /*
48356 ignore header 10 bytes
48357@@ -245,36 +778,68 @@
48358 if ((i + 1) % 16 == 0)
48359 str += sprintf(str, "\n");
48360 }
48361- rc = str - buf;
48362 out:
48363+ rc = str - buf;
48364 kfree(data);
48365 return rc;
48366 }
48367 EXPORT_SYMBOL_GPL(tpm_show_pubek);
48368
48369-#define CAP_VER_RESULT_SIZE 18
48370+#define CAP_VERSION_1_1 6
48371+#define CAP_VERSION_1_2 0x1A
48372+#define CAP_VERSION_IDX 13
48373 static const u8 cap_version[] = {
48374 0, 193, /* TPM_TAG_RQU_COMMAND */
48375 0, 0, 0, 18, /* length */
48376 0, 0, 0, 101, /* TPM_ORD_GetCapability */
48377- 0, 0, 0, 6,
48378+ 0, 0, 0, 0,
48379 0, 0, 0, 0
48380 };
48381
48382-#define CAP_MANUFACTURER_RESULT_SIZE 18
48383-static const u8 cap_manufacturer[] = {
48384- 0, 193, /* TPM_TAG_RQU_COMMAND */
48385- 0, 0, 0, 22, /* length */
48386- 0, 0, 0, 101, /* TPM_ORD_GetCapability */
48387- 0, 0, 0, 5,
48388- 0, 0, 0, 4,
48389- 0, 0, 1, 3
48390-};
48391-
48392 ssize_t tpm_show_caps(struct device *dev, struct device_attribute *attr,
48393 char *buf)
48394 {
48395- u8 data[sizeof(cap_manufacturer)];
48396+ u8 data[max_t(int, max(ARRAY_SIZE(tpm_cap), ARRAY_SIZE(cap_version)), 30)];
48397+ ssize_t rc;
48398+ char *str = buf;
48399+
48400+ struct tpm_chip *chip = dev_get_drvdata(dev);
48401+ if (chip == NULL)
48402+ return -ENODEV;
48403+
48404+ memcpy(data, tpm_cap, sizeof(tpm_cap));
48405+ data[TPM_CAP_IDX] = TPM_CAP_PROP;
48406+ data[TPM_CAP_SUBCAP_IDX] = TPM_CAP_PROP_MANUFACTURER;
48407+
48408+ rc = transmit_cmd(chip, data, sizeof(data),
48409+ "attempting to determine the manufacturer");
48410+ if (rc)
48411+ return 0;
48412+
48413+ str += sprintf(str, "Manufacturer: 0x%x\n",
48414+ be32_to_cpu(*((__be32 *) (data + TPM_GET_CAP_RET_UINT32_1_IDX))));
48415+
48416+ memcpy(data, cap_version, sizeof(cap_version));
48417+ data[CAP_VERSION_IDX] = CAP_VERSION_1_1;
48418+ rc = transmit_cmd(chip, data, sizeof(data),
48419+ "attempting to determine the 1.1 version");
48420+ if (rc)
48421+ goto out;
48422+
48423+ str += sprintf(str,
48424+ "TCG version: %d.%d\nFirmware version: %d.%d\n",
48425+ (int) data[14], (int) data[15], (int) data[16],
48426+ (int) data[17]);
48427+
48428+out:
48429+ return str - buf;
48430+}
48431+EXPORT_SYMBOL_GPL(tpm_show_caps);
48432+
48433+ssize_t tpm_show_caps_1_2(struct device * dev,
48434+ struct device_attribute * attr, char *buf)
48435+{
48436+ u8 data[max_t(int, max(ARRAY_SIZE(tpm_cap), ARRAY_SIZE(cap_version)), 30)];
48437 ssize_t len;
48438 char *str = buf;
48439
48440@@ -282,29 +847,40 @@
48441 if (chip == NULL)
48442 return -ENODEV;
48443
48444- memcpy(data, cap_manufacturer, sizeof(cap_manufacturer));
48445+ memcpy(data, tpm_cap, sizeof(tpm_cap));
48446+ data[TPM_CAP_IDX] = TPM_CAP_PROP;
48447+ data[TPM_CAP_SUBCAP_IDX] = TPM_CAP_PROP_MANUFACTURER;
48448
48449- if ((len = tpm_transmit(chip, data, sizeof(data))) <
48450- CAP_MANUFACTURER_RESULT_SIZE)
48451- return len;
48452+ if ((len = tpm_transmit(chip, data, sizeof(data))) <=
48453+ TPM_ERROR_SIZE) {
48454+ dev_dbg(chip->dev, "A TPM error (%d) occurred "
48455+ "attempting to determine the manufacturer\n",
48456+ be32_to_cpu(*((__be32 *) (data + TPM_RET_CODE_IDX))));
48457+ return 0;
48458+ }
48459
48460 str += sprintf(str, "Manufacturer: 0x%x\n",
48461- be32_to_cpu(*((__be32 *) (data + 14))));
48462+ be32_to_cpu(*((__be32 *) (data + TPM_GET_CAP_RET_UINT32_1_IDX))));
48463
48464 memcpy(data, cap_version, sizeof(cap_version));
48465+ data[CAP_VERSION_IDX] = CAP_VERSION_1_2;
48466
48467- if ((len = tpm_transmit(chip, data, sizeof(data))) <
48468- CAP_VER_RESULT_SIZE)
48469- return len;
48470-
48471- str +=
48472- sprintf(str, "TCG version: %d.%d\nFirmware version: %d.%d\n",
48473- (int) data[14], (int) data[15], (int) data[16],
48474- (int) data[17]);
48475+ if ((len = tpm_transmit(chip, data, sizeof(data))) <=
48476+ TPM_ERROR_SIZE) {
48477+ dev_err(chip->dev, "A TPM error (%d) occurred "
48478+ "attempting to determine the 1.2 version\n",
48479+ be32_to_cpu(*((__be32 *) (data + TPM_RET_CODE_IDX))));
48480+ goto out;
48481+ }
48482+ str += sprintf(str,
48483+ "TCG version: %d.%d\nFirmware version: %d.%d\n",
48484+ (int) data[16], (int) data[17], (int) data[18],
48485+ (int) data[19]);
48486
48487+out:
48488 return str - buf;
48489 }
48490-EXPORT_SYMBOL_GPL(tpm_show_caps);
48491+EXPORT_SYMBOL_GPL(tpm_show_caps_1_2);
48492
48493 ssize_t tpm_store_cancel(struct device *dev, struct device_attribute *attr,
48494 const char *buf, size_t count)
48495@@ -313,7 +889,7 @@
48496 if (chip == NULL)
48497 return 0;
48498
48499- chip->vendor->cancel(chip);
48500+ chip->vendor.cancel(chip);
48501 return count;
48502 }
48503 EXPORT_SYMBOL_GPL(tpm_store_cancel);
48504@@ -329,7 +905,7 @@
48505 spin_lock(&driver_lock);
48506
48507 list_for_each_entry(pos, &tpm_chip_list, list) {
48508- if (pos->vendor->miscdev.minor == minor) {
48509+ if (pos->vendor.miscdev.minor == minor) {
48510 chip = pos;
48511 break;
48512 }
48513@@ -351,7 +927,12 @@
48514
48515 spin_unlock(&driver_lock);
48516
48517+#ifndef CONFIG_XEN
48518 chip->data_buffer = kmalloc(TPM_BUFSIZE * sizeof(u8), GFP_KERNEL);
48519+#else
48520+ chip->data_buffer = kmalloc(get_chip_buffersize(chip) * sizeof(u8),
48521+ GFP_KERNEL);
48522+#endif
48523 if (chip->data_buffer == NULL) {
48524 chip->num_opens--;
48525 put_device(chip->dev);
48526@@ -387,7 +968,7 @@
48527 EXPORT_SYMBOL_GPL(tpm_release);
48528
48529 ssize_t tpm_write(struct file *file, const char __user *buf,
48530- size_t size, loff_t * off)
48531+ size_t size, loff_t *off)
48532 {
48533 struct tpm_chip *chip = file->private_data;
48534 int in_size = size, out_size;
48535@@ -399,8 +980,13 @@
48536
48537 down(&chip->buffer_mutex);
48538
48539+#ifndef CONFIG_XEN
48540 if (in_size > TPM_BUFSIZE)
48541 in_size = TPM_BUFSIZE;
48542+#else
48543+ if (in_size > get_chip_buffersize(chip))
48544+ in_size = get_chip_buffersize(chip);
48545+#endif
48546
48547 if (copy_from_user
48548 (chip->data_buffer, (void __user *) buf, in_size)) {
48549@@ -409,9 +995,17 @@
48550 }
48551
48552 /* atomic tpm command send and result receive */
48553+#ifndef CONFIG_XEN
48554 out_size = tpm_transmit(chip, chip->data_buffer, TPM_BUFSIZE);
48555+#else
48556+ out_size = tpm_transmit(chip, chip->data_buffer,
48557+ get_chip_buffersize(chip));
48558+#endif
48559
48560 atomic_set(&chip->data_pending, out_size);
48561+#ifdef CONFIG_XEN
48562+ atomic_set(&chip->data_position, 0);
48563+#endif
48564 up(&chip->buffer_mutex);
48565
48566 /* Set a timeout by which the reader must come claim the result */
48567@@ -419,29 +1013,59 @@
48568
48569 return in_size;
48570 }
48571-
48572 EXPORT_SYMBOL_GPL(tpm_write);
48573
48574-ssize_t tpm_read(struct file * file, char __user *buf,
48575- size_t size, loff_t * off)
48576+ssize_t tpm_read(struct file *file, char __user *buf,
48577+ size_t size, loff_t *off)
48578 {
48579 struct tpm_chip *chip = file->private_data;
48580 int ret_size;
48581+#ifdef CONFIG_XEN
48582+ int pos, pending = 0;
48583+#endif
48584
48585+#ifndef CONFIG_XEN
48586 del_singleshot_timer_sync(&chip->user_read_timer);
48587 flush_scheduled_work();
48588+#endif
48589 ret_size = atomic_read(&chip->data_pending);
48590+#ifndef CONFIG_XEN
48591 atomic_set(&chip->data_pending, 0);
48592+#endif
48593 if (ret_size > 0) { /* relay data */
48594 if (size < ret_size)
48595 ret_size = size;
48596
48597+#ifdef CONFIG_XEN
48598+ pos = atomic_read(&chip->data_position);
48599+#endif
48600 down(&chip->buffer_mutex);
48601+#ifndef CONFIG_XEN
48602 if (copy_to_user(buf, chip->data_buffer, ret_size))
48603+#else
48604+ if (copy_to_user(buf, &chip->data_buffer[pos], ret_size)) {
48605+#endif
48606 ret_size = -EFAULT;
48607+#ifdef CONFIG_XEN
48608+ } else {
48609+ pending = atomic_read(&chip->data_pending) - ret_size;
48610+ if ( pending ) {
48611+ atomic_set(&chip->data_pending, pending);
48612+ atomic_set(&chip->data_position,
48613+ pos+ret_size);
48614+ }
48615+ }
48616+#endif
48617 up(&chip->buffer_mutex);
48618 }
48619
48620+#ifdef CONFIG_XEN
48621+ if ( ret_size <= 0 || pending == 0 ) {
48622+ atomic_set(&chip->data_pending, 0);
48623+ del_singleshot_timer_sync(&chip->user_read_timer);
48624+ flush_scheduled_work();
48625+ }
48626+#endif
48627 return ret_size;
48628 }
48629 EXPORT_SYMBOL_GPL(tpm_read);
48630@@ -462,14 +1086,13 @@
48631 spin_unlock(&driver_lock);
48632
48633 dev_set_drvdata(dev, NULL);
48634- misc_deregister(&chip->vendor->miscdev);
48635- kfree(chip->vendor->miscdev.name);
48636+ misc_deregister(&chip->vendor.miscdev);
48637+ kfree(chip->vendor.miscdev.name);
48638
48639- sysfs_remove_group(&dev->kobj, chip->vendor->attr_group);
48640+ sysfs_remove_group(&dev->kobj, chip->vendor.attr_group);
48641 tpm_bios_log_teardown(chip->bios_dir);
48642
48643- dev_mask[chip->dev_num / TPM_NUM_MASK_ENTRIES ] &=
48644- ~(1 << (chip->dev_num % TPM_NUM_MASK_ENTRIES));
48645+ clear_bit(chip->dev_num, dev_mask);
48646
48647 kfree(chip);
48648
48649@@ -520,18 +1143,18 @@
48650 * upon errant exit from this function specific probe function should call
48651 * pci_disable_device
48652 */
48653-int tpm_register_hardware(struct device *dev, struct tpm_vendor_specific *entry)
48654+struct tpm_chip *tpm_register_hardware(struct device *dev, const struct tpm_vendor_specific
48655+ *entry)
48656 {
48657 #define DEVNAME_SIZE 7
48658
48659 char *devname;
48660 struct tpm_chip *chip;
48661- int i, j;
48662
48663 /* Driver specific per-device data */
48664 chip = kzalloc(sizeof(*chip), GFP_KERNEL);
48665 if (chip == NULL)
48666- return -ENOMEM;
48667+ return NULL;
48668
48669 init_MUTEX(&chip->buffer_mutex);
48670 init_MUTEX(&chip->tpm_mutex);
48671@@ -543,45 +1166,37 @@
48672 chip->user_read_timer.function = user_reader_timeout;
48673 chip->user_read_timer.data = (unsigned long) chip;
48674
48675- chip->vendor = entry;
48676+ memcpy(&chip->vendor, entry, sizeof(struct tpm_vendor_specific));
48677
48678- chip->dev_num = -1;
48679-
48680- for (i = 0; i < TPM_NUM_MASK_ENTRIES; i++)
48681- for (j = 0; j < 8 * sizeof(int); j++)
48682- if ((dev_mask[i] & (1 << j)) == 0) {
48683- chip->dev_num =
48684- i * TPM_NUM_MASK_ENTRIES + j;
48685- dev_mask[i] |= 1 << j;
48686- goto dev_num_search_complete;
48687- }
48688+ chip->dev_num = find_first_zero_bit(dev_mask, TPM_NUM_DEVICES);
48689
48690-dev_num_search_complete:
48691- if (chip->dev_num < 0) {
48692+ if (chip->dev_num >= TPM_NUM_DEVICES) {
48693 dev_err(dev, "No available tpm device numbers\n");
48694 kfree(chip);
48695- return -ENODEV;
48696+ return NULL;
48697 } else if (chip->dev_num == 0)
48698- chip->vendor->miscdev.minor = TPM_MINOR;
48699+ chip->vendor.miscdev.minor = TPM_MINOR;
48700 else
48701- chip->vendor->miscdev.minor = MISC_DYNAMIC_MINOR;
48702+ chip->vendor.miscdev.minor = MISC_DYNAMIC_MINOR;
48703+
48704+ set_bit(chip->dev_num, dev_mask);
48705
48706 devname = kmalloc(DEVNAME_SIZE, GFP_KERNEL);
48707 scnprintf(devname, DEVNAME_SIZE, "%s%d", "tpm", chip->dev_num);
48708- chip->vendor->miscdev.name = devname;
48709+ chip->vendor.miscdev.name = devname;
48710
48711- chip->vendor->miscdev.dev = dev;
48712+ chip->vendor.miscdev.dev = dev;
48713 chip->dev = get_device(dev);
48714
48715- if (misc_register(&chip->vendor->miscdev)) {
48716+ if (misc_register(&chip->vendor.miscdev)) {
48717 dev_err(chip->dev,
48718 "unable to misc_register %s, minor %d\n",
48719- chip->vendor->miscdev.name,
48720- chip->vendor->miscdev.minor);
48721+ chip->vendor.miscdev.name,
48722+ chip->vendor.miscdev.minor);
48723 put_device(dev);
48724+ clear_bit(chip->dev_num, dev_mask);
48725 kfree(chip);
48726- dev_mask[i] &= !(1 << j);
48727- return -ENODEV;
48728+ return NULL;
48729 }
48730
48731 spin_lock(&driver_lock);
48732@@ -592,11 +1207,11 @@
48733
48734 spin_unlock(&driver_lock);
48735
48736- sysfs_create_group(&dev->kobj, chip->vendor->attr_group);
48737+ sysfs_create_group(&dev->kobj, chip->vendor.attr_group);
48738
48739 chip->bios_dir = tpm_bios_log_setup(devname);
48740
48741- return 0;
48742+ return chip;
48743 }
48744 EXPORT_SYMBOL_GPL(tpm_register_hardware);
48745
48746diff -Nur linux-2.6.16.33-noxen/drivers/char/tpm/tpm.h linux-2.6.16.33/drivers/char/tpm/tpm.h
48747--- linux-2.6.16.33-noxen/drivers/char/tpm/tpm.h 2006-11-22 18:06:31.000000000 +0000
48748+++ linux-2.6.16.33/drivers/char/tpm/tpm.h 2007-01-08 15:00:45.000000000 +0000
48749@@ -24,6 +24,14 @@
48750 #include <linux/fs.h>
48751 #include <linux/miscdevice.h>
48752 #include <linux/platform_device.h>
48753+#include <linux/io.h>
48754+
48755+#ifdef CONFIG_XEN
48756+enum tpm_bufsize {
48757+ TPM_MIN_BUFFERSIZE = 2048,
48758+ TPM_MAX_BUFFERSIZE = 64 * 1024,
48759+};
48760+#endif
48761
48762 enum tpm_timeout {
48763 TPM_TIMEOUT = 5, /* msecs */
48764@@ -41,18 +49,33 @@
48765 char *);
48766 extern ssize_t tpm_show_caps(struct device *, struct device_attribute *attr,
48767 char *);
48768+extern ssize_t tpm_show_caps_1_2(struct device *, struct device_attribute *attr,
48769+ char *);
48770 extern ssize_t tpm_store_cancel(struct device *, struct device_attribute *attr,
48771 const char *, size_t);
48772+extern ssize_t tpm_show_enabled(struct device *, struct device_attribute *attr,
48773+ char *);
48774+extern ssize_t tpm_show_active(struct device *, struct device_attribute *attr,
48775+ char *);
48776+extern ssize_t tpm_show_owned(struct device *, struct device_attribute *attr,
48777+ char *);
48778+extern ssize_t tpm_show_temp_deactivated(struct device *,
48779+ struct device_attribute *attr, char *);
48780
48781 struct tpm_chip;
48782
48783 struct tpm_vendor_specific {
48784- u8 req_complete_mask;
48785- u8 req_complete_val;
48786- u8 req_canceled;
48787+ const u8 req_complete_mask;
48788+ const u8 req_complete_val;
48789+ const u8 req_canceled;
48790+#ifdef CONFIG_XEN
48791+ u32 buffersize;
48792+#endif
48793 void __iomem *iobase; /* ioremapped address */
48794 unsigned long base; /* TPM base address */
48795
48796+ int irq;
48797+
48798 int region_size;
48799 int have_region;
48800
48801@@ -62,6 +85,13 @@
48802 u8 (*status) (struct tpm_chip *);
48803 struct miscdevice miscdev;
48804 struct attribute_group *attr_group;
48805+ struct list_head list;
48806+ int locality;
48807+ unsigned long timeout_a, timeout_b, timeout_c, timeout_d; /* jiffies */
48808+ unsigned long duration[3]; /* jiffies */
48809+
48810+ wait_queue_head_t read_queue;
48811+ wait_queue_head_t int_queue;
48812 };
48813
48814 struct tpm_chip {
48815@@ -74,19 +104,27 @@
48816 /* Data passed to and from the tpm via the read/write calls */
48817 u8 *data_buffer;
48818 atomic_t data_pending;
48819+#ifdef CONFIG_XEN
48820+ atomic_t data_position;
48821+#endif
48822 struct semaphore buffer_mutex;
48823
48824 struct timer_list user_read_timer; /* user needs to claim result */
48825 struct work_struct work;
48826 struct semaphore tpm_mutex; /* tpm is processing */
48827
48828- struct tpm_vendor_specific *vendor;
48829+ struct tpm_vendor_specific vendor;
48830
48831 struct dentry **bios_dir;
48832
48833 struct list_head list;
48834+#ifdef CONFIG_XEN
48835+ void *priv;
48836+#endif
48837 };
48838
48839+#define to_tpm_chip(n) container_of(n, struct tpm_chip, vendor)
48840+
48841 static inline int tpm_read_index(int base, int index)
48842 {
48843 outb(index, base);
48844@@ -99,8 +137,35 @@
48845 outb(value & 0xFF, base+1);
48846 }
48847
48848-extern int tpm_register_hardware(struct device *,
48849- struct tpm_vendor_specific *);
48850+#ifdef CONFIG_XEN
48851+static inline u32 get_chip_buffersize(struct tpm_chip *chip)
48852+{
48853+ u32 size = chip->vendor.buffersize;
48854+ if (size > TPM_MAX_BUFFERSIZE) {
48855+ return TPM_MAX_BUFFERSIZE;
48856+ } else if (size < TPM_MIN_BUFFERSIZE) {
48857+ return TPM_MIN_BUFFERSIZE;
48858+ }
48859+ return size;
48860+}
48861+
48862+static inline void *chip_get_private(const struct tpm_chip *chip)
48863+{
48864+ return chip->priv;
48865+}
48866+
48867+static inline void chip_set_private(struct tpm_chip *chip, void *priv)
48868+{
48869+ chip->priv = priv;
48870+}
48871+#endif
48872+
48873+extern void tpm_get_timeouts(struct tpm_chip *);
48874+extern void tpm_gen_interrupt(struct tpm_chip *);
48875+extern void tpm_continue_selftest(struct tpm_chip *);
48876+extern unsigned long tpm_calc_ordinal_duration(struct tpm_chip *, u32);
48877+extern struct tpm_chip* tpm_register_hardware(struct device *,
48878+ const struct tpm_vendor_specific *);
48879 extern int tpm_open(struct inode *, struct file *);
48880 extern int tpm_release(struct inode *, struct file *);
48881 extern ssize_t tpm_write(struct file *, const char __user *, size_t,
48882@@ -114,7 +179,7 @@
48883 extern struct dentry ** tpm_bios_log_setup(char *);
48884 extern void tpm_bios_log_teardown(struct dentry **);
48885 #else
48886-static inline struct dentry* tpm_bios_log_setup(char *name)
48887+static inline struct dentry ** tpm_bios_log_setup(char *name)
48888 {
48889 return NULL;
48890 }
48891diff -Nur linux-2.6.16.33-noxen/drivers/char/tpm/tpm_atmel.c linux-2.6.16.33/drivers/char/tpm/tpm_atmel.c
48892--- linux-2.6.16.33-noxen/drivers/char/tpm/tpm_atmel.c 2006-11-22 18:06:31.000000000 +0000
48893+++ linux-2.6.16.33/drivers/char/tpm/tpm_atmel.c 2007-05-23 21:00:01.000000000 +0000
48894@@ -47,12 +47,12 @@
48895 return -EIO;
48896
48897 for (i = 0; i < 6; i++) {
48898- status = ioread8(chip->vendor->iobase + 1);
48899+ status = ioread8(chip->vendor.iobase + 1);
48900 if ((status & ATML_STATUS_DATA_AVAIL) == 0) {
48901 dev_err(chip->dev, "error reading header\n");
48902 return -EIO;
48903 }
48904- *buf++ = ioread8(chip->vendor->iobase);
48905+ *buf++ = ioread8(chip->vendor.iobase);
48906 }
48907
48908 /* size of the data received */
48909@@ -63,7 +63,7 @@
48910 dev_err(chip->dev,
48911 "Recv size(%d) less than available space\n", size);
48912 for (; i < size; i++) { /* clear the waiting data anyway */
48913- status = ioread8(chip->vendor->iobase + 1);
48914+ status = ioread8(chip->vendor.iobase + 1);
48915 if ((status & ATML_STATUS_DATA_AVAIL) == 0) {
48916 dev_err(chip->dev, "error reading data\n");
48917 return -EIO;
48918@@ -74,16 +74,16 @@
48919
48920 /* read all the data available */
48921 for (; i < size; i++) {
48922- status = ioread8(chip->vendor->iobase + 1);
48923+ status = ioread8(chip->vendor.iobase + 1);
48924 if ((status & ATML_STATUS_DATA_AVAIL) == 0) {
48925 dev_err(chip->dev, "error reading data\n");
48926 return -EIO;
48927 }
48928- *buf++ = ioread8(chip->vendor->iobase);
48929+ *buf++ = ioread8(chip->vendor.iobase);
48930 }
48931
48932 /* make sure data available is gone */
48933- status = ioread8(chip->vendor->iobase + 1);
48934+ status = ioread8(chip->vendor.iobase + 1);
48935
48936 if (status & ATML_STATUS_DATA_AVAIL) {
48937 dev_err(chip->dev, "data available is stuck\n");
48938@@ -100,7 +100,7 @@
48939 dev_dbg(chip->dev, "tpm_atml_send:\n");
48940 for (i = 0; i < count; i++) {
48941 dev_dbg(chip->dev, "%d 0x%x(%d)\n", i, buf[i], buf[i]);
48942- iowrite8(buf[i], chip->vendor->iobase);
48943+ iowrite8(buf[i], chip->vendor.iobase);
48944 }
48945
48946 return count;
48947@@ -108,12 +108,12 @@
48948
48949 static void tpm_atml_cancel(struct tpm_chip *chip)
48950 {
48951- iowrite8(ATML_STATUS_ABORT, chip->vendor->iobase + 1);
48952+ iowrite8(ATML_STATUS_ABORT, chip->vendor.iobase + 1);
48953 }
48954
48955 static u8 tpm_atml_status(struct tpm_chip *chip)
48956 {
48957- return ioread8(chip->vendor->iobase + 1);
48958+ return ioread8(chip->vendor.iobase + 1);
48959 }
48960
48961 static struct file_operations atmel_ops = {
48962@@ -140,7 +140,7 @@
48963
48964 static struct attribute_group atmel_attr_grp = { .attrs = atmel_attrs };
48965
48966-static struct tpm_vendor_specific tpm_atmel = {
48967+static const struct tpm_vendor_specific tpm_atmel = {
48968 .recv = tpm_atml_recv,
48969 .send = tpm_atml_send,
48970 .cancel = tpm_atml_cancel,
48971@@ -159,10 +159,10 @@
48972 struct tpm_chip *chip = dev_get_drvdata(&pdev->dev);
48973
48974 if (chip) {
48975- if (chip->vendor->have_region)
48976- atmel_release_region(chip->vendor->base,
48977- chip->vendor->region_size);
48978- atmel_put_base_addr(chip->vendor);
48979+ if (chip->vendor.have_region)
48980+ atmel_release_region(chip->vendor.base,
48981+ chip->vendor.region_size);
48982+ atmel_put_base_addr(chip->vendor.iobase);
48983 tpm_remove_hardware(chip->dev);
48984 platform_device_unregister(pdev);
48985 }
48986@@ -179,18 +179,22 @@
48987 static int __init init_atmel(void)
48988 {
48989 int rc = 0;
48990+ void __iomem *iobase = NULL;
48991+ int have_region, region_size;
48992+ unsigned long base;
48993+ struct tpm_chip *chip;
48994
48995 driver_register(&atml_drv);
48996
48997- if ((tpm_atmel.iobase = atmel_get_base_addr(&tpm_atmel)) == NULL) {
48998+ if ((iobase = atmel_get_base_addr(&base, &region_size)) == NULL) {
48999 rc = -ENODEV;
49000 goto err_unreg_drv;
49001 }
49002
49003- tpm_atmel.have_region =
49004+ have_region =
49005 (atmel_request_region
49006- (tpm_atmel.base, tpm_atmel.region_size,
49007- "tpm_atmel0") == NULL) ? 0 : 1;
49008+ (tpm_atmel.base, region_size, "tpm_atmel0") == NULL) ? 0 : 1;
49009+
49010
49011 if (IS_ERR
49012 (pdev =
49013@@ -199,17 +203,25 @@
49014 goto err_rel_reg;
49015 }
49016
49017- if ((rc = tpm_register_hardware(&pdev->dev, &tpm_atmel)) < 0)
49018+ if (!(chip = tpm_register_hardware(&pdev->dev, &tpm_atmel))) {
49019+ rc = -ENODEV;
49020 goto err_unreg_dev;
49021+ }
49022+
49023+ chip->vendor.iobase = iobase;
49024+ chip->vendor.base = base;
49025+ chip->vendor.have_region = have_region;
49026+ chip->vendor.region_size = region_size;
49027+
49028 return 0;
49029
49030 err_unreg_dev:
49031 platform_device_unregister(pdev);
49032 err_rel_reg:
49033- atmel_put_base_addr(&tpm_atmel);
49034- if (tpm_atmel.have_region)
49035- atmel_release_region(tpm_atmel.base,
49036- tpm_atmel.region_size);
49037+ atmel_put_base_addr(iobase);
49038+ if (have_region)
49039+ atmel_release_region(base,
49040+ region_size);
49041 err_unreg_drv:
49042 driver_unregister(&atml_drv);
49043 return rc;
49044diff -Nur linux-2.6.16.33-noxen/drivers/char/tpm/tpm_atmel.h linux-2.6.16.33/drivers/char/tpm/tpm_atmel.h
49045--- linux-2.6.16.33-noxen/drivers/char/tpm/tpm_atmel.h 2006-11-22 18:06:31.000000000 +0000
49046+++ linux-2.6.16.33/drivers/char/tpm/tpm_atmel.h 2007-05-23 21:00:01.000000000 +0000
49047@@ -28,13 +28,12 @@
49048 #define atmel_request_region request_mem_region
49049 #define atmel_release_region release_mem_region
49050
49051-static inline void atmel_put_base_addr(struct tpm_vendor_specific
49052- *vendor)
49053+static inline void atmel_put_base_addr(void __iomem *iobase)
49054 {
49055- iounmap(vendor->iobase);
49056+ iounmap(iobase);
49057 }
49058
49059-static void __iomem * atmel_get_base_addr(struct tpm_vendor_specific *vendor)
49060+static void __iomem * atmel_get_base_addr(unsigned long *base, int *region_size)
49061 {
49062 struct device_node *dn;
49063 unsigned long address, size;
49064@@ -71,9 +70,9 @@
49065 else
49066 size = reg[naddrc];
49067
49068- vendor->base = address;
49069- vendor->region_size = size;
49070- return ioremap(vendor->base, vendor->region_size);
49071+ *base = address;
49072+ *region_size = size;
49073+ return ioremap(*base, *region_size);
49074 }
49075 #else
49076 #define atmel_getb(chip, offset) inb(chip->vendor->base + offset)
49077@@ -106,14 +105,12 @@
49078 return 0;
49079 }
49080
49081-static inline void atmel_put_base_addr(struct tpm_vendor_specific
49082- *vendor)
49083+static inline void atmel_put_base_addr(void __iomem *iobase)
49084 {
49085 }
49086
49087 /* Determine where to talk to device */
49088-static void __iomem * atmel_get_base_addr(struct tpm_vendor_specific
49089- *vendor)
49090+static void __iomem * atmel_get_base_addr(unsigned long *base, int *region_size)
49091 {
49092 int lo, hi;
49093
49094@@ -123,9 +120,9 @@
49095 lo = tpm_read_index(TPM_ADDR, TPM_ATMEL_BASE_ADDR_LO);
49096 hi = tpm_read_index(TPM_ADDR, TPM_ATMEL_BASE_ADDR_HI);
49097
49098- vendor->base = (hi << 8) | lo;
49099- vendor->region_size = 2;
49100+ *base = (hi << 8) | lo;
49101+ *region_size = 2;
49102
49103- return ioport_map(vendor->base, vendor->region_size);
49104+ return ioport_map(*base, *region_size);
49105 }
49106 #endif
49107diff -Nur linux-2.6.16.33-noxen/drivers/char/tpm/tpm_bios.c linux-2.6.16.33/drivers/char/tpm/tpm_bios.c
49108--- linux-2.6.16.33-noxen/drivers/char/tpm/tpm_bios.c 2006-11-22 18:06:31.000000000 +0000
49109+++ linux-2.6.16.33/drivers/char/tpm/tpm_bios.c 2007-05-23 21:00:01.000000000 +0000
49110@@ -29,6 +29,11 @@
49111 #define MAX_TEXT_EVENT 1000 /* Max event string length */
49112 #define ACPI_TCPA_SIG "TCPA" /* 0x41504354 /'TCPA' */
49113
49114+enum bios_platform_class {
49115+ BIOS_CLIENT = 0x00,
49116+ BIOS_SERVER = 0x01,
49117+};
49118+
49119 struct tpm_bios_log {
49120 void *bios_event_log;
49121 void *bios_event_log_end;
49122@@ -36,9 +41,18 @@
49123
49124 struct acpi_tcpa {
49125 struct acpi_table_header hdr;
49126- u16 reserved;
49127- u32 log_max_len __attribute__ ((packed));
49128- u32 log_start_addr __attribute__ ((packed));
49129+ u16 platform_class;
49130+ union {
49131+ struct client_hdr {
49132+ u32 log_max_len __attribute__ ((packed));
49133+ u64 log_start_addr __attribute__ ((packed));
49134+ } client;
49135+ struct server_hdr {
49136+ u16 reserved;
49137+ u64 log_max_len __attribute__ ((packed));
49138+ u64 log_start_addr __attribute__ ((packed));
49139+ } server;
49140+ };
49141 };
49142
49143 struct tcpa_event {
49144@@ -91,6 +105,12 @@
49145 "Non-Host Info"
49146 };
49147
49148+struct tcpa_pc_event {
49149+ u32 event_id;
49150+ u32 event_size;
49151+ u8 event_data[0];
49152+};
49153+
49154 enum tcpa_pc_event_ids {
49155 SMBIOS = 1,
49156 BIS_CERT,
49157@@ -100,14 +120,15 @@
49158 NVRAM,
49159 OPTION_ROM_EXEC,
49160 OPTION_ROM_CONFIG,
49161- OPTION_ROM_MICROCODE,
49162+ OPTION_ROM_MICROCODE = 10,
49163 S_CRTM_VERSION,
49164 S_CRTM_CONTENTS,
49165 POST_CONTENTS,
49166+ HOST_TABLE_OF_DEVICES,
49167 };
49168
49169 static const char* tcpa_pc_event_id_strings[] = {
49170- ""
49171+ "",
49172 "SMBIOS",
49173 "BIS Certificate",
49174 "POST BIOS ",
49175@@ -116,10 +137,12 @@
49176 "NVRAM",
49177 "Option ROM",
49178 "Option ROM config",
49179- "Option ROM microcode",
49180+ "",
49181+ "Option ROM microcode ",
49182 "S-CRTM Version",
49183- "S-CRTM Contents",
49184- "S-CRTM POST Contents",
49185+ "S-CRTM Contents ",
49186+ "POST Contents ",
49187+ "Table of Devices",
49188 };
49189
49190 /* returns pointer to start of pos. entry of tcg log */
49191@@ -191,7 +214,7 @@
49192 const char *name = "";
49193 char data[40] = "";
49194 int i, n_len = 0, d_len = 0;
49195- u32 event_id;
49196+ struct tcpa_pc_event *pc_event;
49197
49198 switch(event->event_type) {
49199 case PREBOOT:
49200@@ -220,31 +243,32 @@
49201 }
49202 break;
49203 case EVENT_TAG:
49204- event_id = be32_to_cpu(*((u32 *)event_entry));
49205+ pc_event = (struct tcpa_pc_event *)event_entry;
49206
49207 /* ToDo Row data -> Base64 */
49208
49209- switch (event_id) {
49210+ switch (pc_event->event_id) {
49211 case SMBIOS:
49212 case BIS_CERT:
49213 case CMOS:
49214 case NVRAM:
49215 case OPTION_ROM_EXEC:
49216 case OPTION_ROM_CONFIG:
49217- case OPTION_ROM_MICROCODE:
49218 case S_CRTM_VERSION:
49219- case S_CRTM_CONTENTS:
49220- case POST_CONTENTS:
49221- name = tcpa_pc_event_id_strings[event_id];
49222+ name = tcpa_pc_event_id_strings[pc_event->event_id];
49223 n_len = strlen(name);
49224 break;
49225+ /* hash data */
49226 case POST_BIOS_ROM:
49227 case ESCD:
49228- name = tcpa_pc_event_id_strings[event_id];
49229+ case OPTION_ROM_MICROCODE:
49230+ case S_CRTM_CONTENTS:
49231+ case POST_CONTENTS:
49232+ name = tcpa_pc_event_id_strings[pc_event->event_id];
49233 n_len = strlen(name);
49234 for (i = 0; i < 20; i++)
49235- d_len += sprintf(data, "%02x",
49236- event_entry[8 + i]);
49237+ d_len += sprintf(&data[2*i], "%02x",
49238+ pc_event->event_data[i]);
49239 break;
49240 default:
49241 break;
49242@@ -260,52 +284,13 @@
49243
49244 static int tpm_binary_bios_measurements_show(struct seq_file *m, void *v)
49245 {
49246+ struct tcpa_event *event = v;
49247+ char *data = v;
49248+ int i;
49249
49250- char *eventname;
49251- char data[4];
49252- u32 help;
49253- int i, len;
49254- struct tcpa_event *event = (struct tcpa_event *) v;
49255- unsigned char *event_entry =
49256- (unsigned char *) (v + sizeof(struct tcpa_event));
49257-
49258- eventname = kmalloc(MAX_TEXT_EVENT, GFP_KERNEL);
49259- if (!eventname) {
49260- printk(KERN_ERR "%s: ERROR - No Memory for event name\n ",
49261- __func__);
49262- return -ENOMEM;
49263- }
49264-
49265- /* 1st: PCR used is in little-endian format (4 bytes) */
49266- help = le32_to_cpu(event->pcr_index);
49267- memcpy(data, &help, 4);
49268- for (i = 0; i < 4; i++)
49269- seq_putc(m, data[i]);
49270-
49271- /* 2nd: SHA1 (20 bytes) */
49272- for (i = 0; i < 20; i++)
49273- seq_putc(m, event->pcr_value[i]);
49274-
49275- /* 3rd: event type identifier (4 bytes) */
49276- help = le32_to_cpu(event->event_type);
49277- memcpy(data, &help, 4);
49278- for (i = 0; i < 4; i++)
49279+ for (i = 0; i < sizeof(struct tcpa_event) + event->event_size; i++)
49280 seq_putc(m, data[i]);
49281
49282- len = 0;
49283-
49284- len += get_event_name(eventname, event, event_entry);
49285-
49286- /* 4th: filename <= 255 + \'0' delimiter */
49287- if (len > TCG_EVENT_NAME_LEN_MAX)
49288- len = TCG_EVENT_NAME_LEN_MAX;
49289-
49290- for (i = 0; i < len; i++)
49291- seq_putc(m, eventname[i]);
49292-
49293- /* 5th: delimiter */
49294- seq_putc(m, '\0');
49295-
49296 return 0;
49297 }
49298
49299@@ -353,6 +338,7 @@
49300 /* 4th: eventname <= max + \'0' delimiter */
49301 seq_printf(m, " %s\n", eventname);
49302
49303+ kfree(eventname);
49304 return 0;
49305 }
49306
49307@@ -376,6 +362,7 @@
49308 struct acpi_tcpa *buff;
49309 acpi_status status;
49310 struct acpi_table_header *virt;
49311+ u64 len, start;
49312
49313 if (log->bios_event_log != NULL) {
49314 printk(KERN_ERR
49315@@ -396,27 +383,37 @@
49316 return -EIO;
49317 }
49318
49319- if (buff->log_max_len == 0) {
49320+ switch(buff->platform_class) {
49321+ case BIOS_SERVER:
49322+ len = buff->server.log_max_len;
49323+ start = buff->server.log_start_addr;
49324+ break;
49325+ case BIOS_CLIENT:
49326+ default:
49327+ len = buff->client.log_max_len;
49328+ start = buff->client.log_start_addr;
49329+ break;
49330+ }
49331+ if (!len) {
49332 printk(KERN_ERR "%s: ERROR - TCPA log area empty\n", __func__);
49333 return -EIO;
49334 }
49335
49336 /* malloc EventLog space */
49337- log->bios_event_log = kmalloc(buff->log_max_len, GFP_KERNEL);
49338+ log->bios_event_log = kmalloc(len, GFP_KERNEL);
49339 if (!log->bios_event_log) {
49340- printk
49341- ("%s: ERROR - Not enough Memory for BIOS measurements\n",
49342- __func__);
49343+ printk("%s: ERROR - Not enough Memory for BIOS measurements\n",
49344+ __func__);
49345 return -ENOMEM;
49346 }
49347
49348- log->bios_event_log_end = log->bios_event_log + buff->log_max_len;
49349+ log->bios_event_log_end = log->bios_event_log + len;
49350
49351- acpi_os_map_memory(buff->log_start_addr, buff->log_max_len, (void *) &virt);
49352+ acpi_os_map_memory(start, len, (void *) &virt);
49353
49354- memcpy(log->bios_event_log, virt, buff->log_max_len);
49355+ memcpy(log->bios_event_log, virt, len);
49356
49357- acpi_os_unmap_memory(virt, buff->log_max_len);
49358+ acpi_os_unmap_memory(virt, len);
49359 return 0;
49360 }
49361
49362diff -Nur linux-2.6.16.33-noxen/drivers/char/tpm/tpm_infineon.c linux-2.6.16.33/drivers/char/tpm/tpm_infineon.c
49363--- linux-2.6.16.33-noxen/drivers/char/tpm/tpm_infineon.c 2006-11-22 18:06:31.000000000 +0000
49364+++ linux-2.6.16.33/drivers/char/tpm/tpm_infineon.c 2007-05-23 21:00:01.000000000 +0000
49365@@ -15,6 +15,7 @@
49366 * License.
49367 */
49368
49369+#include <linux/init.h>
49370 #include <linux/pnp.h>
49371 #include "tpm.h"
49372
49373@@ -104,7 +105,7 @@
49374
49375 if (clear_wrfifo) {
49376 for (i = 0; i < 4096; i++) {
49377- status = inb(chip->vendor->base + WRFIFO);
49378+ status = inb(chip->vendor.base + WRFIFO);
49379 if (status == 0xff) {
49380 if (check == 5)
49381 break;
49382@@ -124,8 +125,8 @@
49383 */
49384 i = 0;
49385 do {
49386- status = inb(chip->vendor->base + RDFIFO);
49387- status = inb(chip->vendor->base + STAT);
49388+ status = inb(chip->vendor.base + RDFIFO);
49389+ status = inb(chip->vendor.base + STAT);
49390 i++;
49391 if (i == TPM_MAX_TRIES)
49392 return -EIO;
49393@@ -138,7 +139,7 @@
49394 int status;
49395 int i;
49396 for (i = 0; i < TPM_MAX_TRIES; i++) {
49397- status = inb(chip->vendor->base + STAT);
49398+ status = inb(chip->vendor.base + STAT);
49399 /* check the status-register if wait_for_bit is set */
49400 if (status & 1 << wait_for_bit)
49401 break;
49402@@ -157,7 +158,7 @@
49403 static void wait_and_send(struct tpm_chip *chip, u8 sendbyte)
49404 {
49405 wait(chip, STAT_XFE);
49406- outb(sendbyte, chip->vendor->base + WRFIFO);
49407+ outb(sendbyte, chip->vendor.base + WRFIFO);
49408 }
49409
49410 /* Note: WTX means Waiting-Time-Extension. Whenever the TPM needs more
49411@@ -204,7 +205,7 @@
49412 ret = wait(chip, STAT_RDA);
49413 if (ret)
49414 return -EIO;
49415- buf[i] = inb(chip->vendor->base + RDFIFO);
49416+ buf[i] = inb(chip->vendor.base + RDFIFO);
49417 }
49418
49419 if (buf[0] != TPM_VL_VER) {
49420@@ -219,7 +220,7 @@
49421
49422 for (i = 0; i < size; i++) {
49423 wait(chip, STAT_RDA);
49424- buf[i] = inb(chip->vendor->base + RDFIFO);
49425+ buf[i] = inb(chip->vendor.base + RDFIFO);
49426 }
49427
49428 if ((size == 0x6D00) && (buf[1] == 0x80)) {
49429@@ -268,7 +269,7 @@
49430 u8 count_high, count_low, count_4, count_3, count_2, count_1;
49431
49432 /* Disabling Reset, LP and IRQC */
49433- outb(RESET_LP_IRQC_DISABLE, chip->vendor->base + CMD);
49434+ outb(RESET_LP_IRQC_DISABLE, chip->vendor.base + CMD);
49435
49436 ret = empty_fifo(chip, 1);
49437 if (ret) {
49438@@ -319,7 +320,7 @@
49439
49440 static u8 tpm_inf_status(struct tpm_chip *chip)
49441 {
49442- return inb(chip->vendor->base + STAT);
49443+ return inb(chip->vendor.base + STAT);
49444 }
49445
49446 static DEVICE_ATTR(pubek, S_IRUGO, tpm_show_pubek, NULL);
49447@@ -346,7 +347,7 @@
49448 .release = tpm_release,
49449 };
49450
49451-static struct tpm_vendor_specific tpm_inf = {
49452+static const struct tpm_vendor_specific tpm_inf = {
49453 .recv = tpm_inf_recv,
49454 .send = tpm_inf_send,
49455 .cancel = tpm_inf_cancel,
49456@@ -375,6 +376,7 @@
49457 int version[2];
49458 int productid[2];
49459 char chipname[20];
49460+ struct tpm_chip *chip;
49461
49462 /* read IO-ports through PnP */
49463 if (pnp_port_valid(dev, 0) && pnp_port_valid(dev, 1) &&
49464@@ -395,14 +397,13 @@
49465 goto err_last;
49466 }
49467 /* publish my base address and request region */
49468- tpm_inf.base = TPM_INF_BASE;
49469 if (request_region
49470- (tpm_inf.base, TPM_INF_PORT_LEN, "tpm_infineon0") == NULL) {
49471+ (TPM_INF_BASE, TPM_INF_PORT_LEN, "tpm_infineon0") == NULL) {
49472 rc = -EINVAL;
49473 goto err_last;
49474 }
49475- if (request_region(TPM_INF_ADDR, TPM_INF_ADDR_LEN,
49476- "tpm_infineon0") == NULL) {
49477+ if (request_region
49478+ (TPM_INF_ADDR, TPM_INF_ADDR_LEN, "tpm_infineon0") == NULL) {
49479 rc = -EINVAL;
49480 goto err_last;
49481 }
49482@@ -442,9 +443,9 @@
49483
49484 /* configure TPM with IO-ports */
49485 outb(IOLIMH, TPM_INF_ADDR);
49486- outb(((tpm_inf.base >> 8) & 0xff), TPM_INF_DATA);
49487+ outb(((TPM_INF_BASE >> 8) & 0xff), TPM_INF_DATA);
49488 outb(IOLIML, TPM_INF_ADDR);
49489- outb((tpm_inf.base & 0xff), TPM_INF_DATA);
49490+ outb((TPM_INF_BASE & 0xff), TPM_INF_DATA);
49491
49492 /* control if IO-ports are set correctly */
49493 outb(IOLIMH, TPM_INF_ADDR);
49494@@ -452,10 +453,10 @@
49495 outb(IOLIML, TPM_INF_ADDR);
49496 iol = inb(TPM_INF_DATA);
49497
49498- if ((ioh << 8 | iol) != tpm_inf.base) {
49499+ if ((ioh << 8 | iol) != TPM_INF_BASE) {
49500 dev_err(&dev->dev,
49501- "Could not set IO-ports to 0x%lx\n",
49502- tpm_inf.base);
49503+ "Could not set IO-ports to 0x%x\n",
49504+ TPM_INF_BASE);
49505 rc = -EIO;
49506 goto err_release_region;
49507 }
49508@@ -466,15 +467,15 @@
49509 outb(DISABLE_REGISTER_PAIR, TPM_INF_ADDR);
49510
49511 /* disable RESET, LP and IRQC */
49512- outb(RESET_LP_IRQC_DISABLE, tpm_inf.base + CMD);
49513+ outb(RESET_LP_IRQC_DISABLE, TPM_INF_BASE + CMD);
49514
49515 /* Finally, we're done, print some infos */
49516 dev_info(&dev->dev, "TPM found: "
49517 "config base 0x%x, "
49518 "io base 0x%x, "
49519- "chip version %02x%02x, "
49520- "vendor id %x%x (Infineon), "
49521- "product id %02x%02x"
49522+ "chip version 0x%02x%02x, "
49523+ "vendor id 0x%x%x (Infineon), "
49524+ "product id 0x%02x%02x"
49525 "%s\n",
49526 TPM_INF_ADDR,
49527 TPM_INF_BASE,
49528@@ -482,11 +483,10 @@
49529 vendorid[0], vendorid[1],
49530 productid[0], productid[1], chipname);
49531
49532- rc = tpm_register_hardware(&dev->dev, &tpm_inf);
49533- if (rc < 0) {
49534- rc = -ENODEV;
49535+ if (!(chip = tpm_register_hardware(&dev->dev, &tpm_inf))) {
49536 goto err_release_region;
49537 }
49538+ chip->vendor.base = TPM_INF_BASE;
49539 return 0;
49540 } else {
49541 rc = -ENODEV;
49542@@ -494,7 +494,7 @@
49543 }
49544
49545 err_release_region:
49546- release_region(tpm_inf.base, TPM_INF_PORT_LEN);
49547+ release_region(TPM_INF_BASE, TPM_INF_PORT_LEN);
49548 release_region(TPM_INF_ADDR, TPM_INF_ADDR_LEN);
49549
49550 err_last:
49551@@ -506,7 +506,8 @@
49552 struct tpm_chip *chip = pnp_get_drvdata(dev);
49553
49554 if (chip) {
49555- release_region(chip->vendor->base, TPM_INF_PORT_LEN);
49556+ release_region(TPM_INF_BASE, TPM_INF_PORT_LEN);
49557+ release_region(TPM_INF_ADDR, TPM_INF_ADDR_LEN);
49558 tpm_remove_hardware(chip->dev);
49559 }
49560 }
49561@@ -520,7 +521,7 @@
49562 },
49563 .id_table = tpm_pnp_tbl,
49564 .probe = tpm_inf_pnp_probe,
49565- .remove = tpm_inf_pnp_remove,
49566+ .remove = __devexit_p(tpm_inf_pnp_remove),
49567 };
49568
49569 static int __init init_inf(void)
49570@@ -538,5 +539,5 @@
49571
49572 MODULE_AUTHOR("Marcel Selhorst <selhorst@crypto.rub.de>");
49573 MODULE_DESCRIPTION("Driver for Infineon TPM SLD 9630 TT 1.1 / SLB 9635 TT 1.2");
49574-MODULE_VERSION("1.7");
49575+MODULE_VERSION("1.8");
49576 MODULE_LICENSE("GPL");
49577diff -Nur linux-2.6.16.33-noxen/drivers/char/tpm/tpm_nsc.c linux-2.6.16.33/drivers/char/tpm/tpm_nsc.c
49578--- linux-2.6.16.33-noxen/drivers/char/tpm/tpm_nsc.c 2006-11-22 18:06:31.000000000 +0000
49579+++ linux-2.6.16.33/drivers/char/tpm/tpm_nsc.c 2007-05-23 21:00:01.000000000 +0000
49580@@ -71,7 +71,7 @@
49581 unsigned long stop;
49582
49583 /* status immediately available check */
49584- *data = inb(chip->vendor->base + NSC_STATUS);
49585+ *data = inb(chip->vendor.base + NSC_STATUS);
49586 if ((*data & mask) == val)
49587 return 0;
49588
49589@@ -79,7 +79,7 @@
49590 stop = jiffies + 10 * HZ;
49591 do {
49592 msleep(TPM_TIMEOUT);
49593- *data = inb(chip->vendor->base + 1);
49594+ *data = inb(chip->vendor.base + 1);
49595 if ((*data & mask) == val)
49596 return 0;
49597 }
49598@@ -94,9 +94,9 @@
49599 unsigned long stop;
49600
49601 /* status immediately available check */
49602- status = inb(chip->vendor->base + NSC_STATUS);
49603+ status = inb(chip->vendor.base + NSC_STATUS);
49604 if (status & NSC_STATUS_OBF)
49605- status = inb(chip->vendor->base + NSC_DATA);
49606+ status = inb(chip->vendor.base + NSC_DATA);
49607 if (status & NSC_STATUS_RDY)
49608 return 0;
49609
49610@@ -104,9 +104,9 @@
49611 stop = jiffies + 100;
49612 do {
49613 msleep(TPM_TIMEOUT);
49614- status = inb(chip->vendor->base + NSC_STATUS);
49615+ status = inb(chip->vendor.base + NSC_STATUS);
49616 if (status & NSC_STATUS_OBF)
49617- status = inb(chip->vendor->base + NSC_DATA);
49618+ status = inb(chip->vendor.base + NSC_DATA);
49619 if (status & NSC_STATUS_RDY)
49620 return 0;
49621 }
49622@@ -132,7 +132,7 @@
49623 return -EIO;
49624 }
49625 if ((data =
49626- inb(chip->vendor->base + NSC_DATA)) != NSC_COMMAND_NORMAL) {
49627+ inb(chip->vendor.base + NSC_DATA)) != NSC_COMMAND_NORMAL) {
49628 dev_err(chip->dev, "not in normal mode (0x%x)\n",
49629 data);
49630 return -EIO;
49631@@ -148,7 +148,7 @@
49632 }
49633 if (data & NSC_STATUS_F0)
49634 break;
49635- *p = inb(chip->vendor->base + NSC_DATA);
49636+ *p = inb(chip->vendor.base + NSC_DATA);
49637 }
49638
49639 if ((data & NSC_STATUS_F0) == 0 &&
49640@@ -156,7 +156,7 @@
49641 dev_err(chip->dev, "F0 not set\n");
49642 return -EIO;
49643 }
49644- if ((data = inb(chip->vendor->base + NSC_DATA)) != NSC_COMMAND_EOC) {
49645+ if ((data = inb(chip->vendor.base + NSC_DATA)) != NSC_COMMAND_EOC) {
49646 dev_err(chip->dev,
49647 "expected end of command(0x%x)\n", data);
49648 return -EIO;
49649@@ -182,7 +182,7 @@
49650 * fix it. Not sure why this is needed, we followed the flow
49651 * chart in the manual to the letter.
49652 */
49653- outb(NSC_COMMAND_CANCEL, chip->vendor->base + NSC_COMMAND);
49654+ outb(NSC_COMMAND_CANCEL, chip->vendor.base + NSC_COMMAND);
49655
49656 if (nsc_wait_for_ready(chip) != 0)
49657 return -EIO;
49658@@ -192,7 +192,7 @@
49659 return -EIO;
49660 }
49661
49662- outb(NSC_COMMAND_NORMAL, chip->vendor->base + NSC_COMMAND);
49663+ outb(NSC_COMMAND_NORMAL, chip->vendor.base + NSC_COMMAND);
49664 if (wait_for_stat(chip, NSC_STATUS_IBR, NSC_STATUS_IBR, &data) < 0) {
49665 dev_err(chip->dev, "IBR timeout\n");
49666 return -EIO;
49667@@ -204,26 +204,26 @@
49668 "IBF timeout (while writing data)\n");
49669 return -EIO;
49670 }
49671- outb(buf[i], chip->vendor->base + NSC_DATA);
49672+ outb(buf[i], chip->vendor.base + NSC_DATA);
49673 }
49674
49675 if (wait_for_stat(chip, NSC_STATUS_IBF, 0, &data) < 0) {
49676 dev_err(chip->dev, "IBF timeout\n");
49677 return -EIO;
49678 }
49679- outb(NSC_COMMAND_EOC, chip->vendor->base + NSC_COMMAND);
49680+ outb(NSC_COMMAND_EOC, chip->vendor.base + NSC_COMMAND);
49681
49682 return count;
49683 }
49684
49685 static void tpm_nsc_cancel(struct tpm_chip *chip)
49686 {
49687- outb(NSC_COMMAND_CANCEL, chip->vendor->base + NSC_COMMAND);
49688+ outb(NSC_COMMAND_CANCEL, chip->vendor.base + NSC_COMMAND);
49689 }
49690
49691 static u8 tpm_nsc_status(struct tpm_chip *chip)
49692 {
49693- return inb(chip->vendor->base + NSC_STATUS);
49694+ return inb(chip->vendor.base + NSC_STATUS);
49695 }
49696
49697 static struct file_operations nsc_ops = {
49698@@ -250,7 +250,7 @@
49699
49700 static struct attribute_group nsc_attr_grp = { .attrs = nsc_attrs };
49701
49702-static struct tpm_vendor_specific tpm_nsc = {
49703+static const struct tpm_vendor_specific tpm_nsc = {
49704 .recv = tpm_nsc_recv,
49705 .send = tpm_nsc_send,
49706 .cancel = tpm_nsc_cancel,
49707@@ -268,7 +268,7 @@
49708 {
49709 struct tpm_chip *chip = dev_get_drvdata(dev);
49710 if ( chip ) {
49711- release_region(chip->vendor->base, 2);
49712+ release_region(chip->vendor.base, 2);
49713 tpm_remove_hardware(chip->dev);
49714 }
49715 }
49716@@ -286,7 +286,8 @@
49717 int rc = 0;
49718 int lo, hi;
49719 int nscAddrBase = TPM_ADDR;
49720-
49721+ struct tpm_chip *chip;
49722+ unsigned long base;
49723
49724 /* verify that it is a National part (SID) */
49725 if (tpm_read_index(TPM_ADDR, NSC_SID_INDEX) != 0xEF) {
49726@@ -300,7 +301,7 @@
49727
49728 hi = tpm_read_index(nscAddrBase, TPM_NSC_BASE0_HI);
49729 lo = tpm_read_index(nscAddrBase, TPM_NSC_BASE0_LO);
49730- tpm_nsc.base = (hi<<8) | lo;
49731+ base = (hi<<8) | lo;
49732
49733 /* enable the DPM module */
49734 tpm_write_index(nscAddrBase, NSC_LDC_INDEX, 0x01);
49735@@ -320,13 +321,15 @@
49736 if ((rc = platform_device_register(pdev)) < 0)
49737 goto err_free_dev;
49738
49739- if (request_region(tpm_nsc.base, 2, "tpm_nsc0") == NULL ) {
49740+ if (request_region(base, 2, "tpm_nsc0") == NULL ) {
49741 rc = -EBUSY;
49742 goto err_unreg_dev;
49743 }
49744
49745- if ((rc = tpm_register_hardware(&pdev->dev, &tpm_nsc)) < 0)
49746+ if (!(chip = tpm_register_hardware(&pdev->dev, &tpm_nsc))) {
49747+ rc = -ENODEV;
49748 goto err_rel_reg;
49749+ }
49750
49751 dev_dbg(&pdev->dev, "NSC TPM detected\n");
49752 dev_dbg(&pdev->dev,
49753@@ -361,10 +364,12 @@
49754 "NSC TPM revision %d\n",
49755 tpm_read_index(nscAddrBase, 0x27) & 0x1F);
49756
49757+ chip->vendor.base = base;
49758+
49759 return 0;
49760
49761 err_rel_reg:
49762- release_region(tpm_nsc.base, 2);
49763+ release_region(base, 2);
49764 err_unreg_dev:
49765 platform_device_unregister(pdev);
49766 err_free_dev:
49767diff -Nur linux-2.6.16.33-noxen/drivers/char/tpm/tpm_tis.c linux-2.6.16.33/drivers/char/tpm/tpm_tis.c
49768--- linux-2.6.16.33-noxen/drivers/char/tpm/tpm_tis.c 1970-01-01 00:00:00.000000000 +0000
49769+++ linux-2.6.16.33/drivers/char/tpm/tpm_tis.c 2007-05-23 21:00:01.000000000 +0000
49770@@ -0,0 +1,665 @@
49771+/*
49772+ * Copyright (C) 2005, 2006 IBM Corporation
49773+ *
49774+ * Authors:
49775+ * Leendert van Doorn <leendert@watson.ibm.com>
49776+ * Kylene Hall <kjhall@us.ibm.com>
49777+ *
49778+ * Device driver for TCG/TCPA TPM (trusted platform module).
49779+ * Specifications at www.trustedcomputinggroup.org
49780+ *
49781+ * This device driver implements the TPM interface as defined in
49782+ * the TCG TPM Interface Spec version 1.2, revision 1.0.
49783+ *
49784+ * This program is free software; you can redistribute it and/or
49785+ * modify it under the terms of the GNU General Public License as
49786+ * published by the Free Software Foundation, version 2 of the
49787+ * License.
49788+ */
49789+#include <linux/init.h>
49790+#include <linux/module.h>
49791+#include <linux/moduleparam.h>
49792+#include <linux/pnp.h>
49793+#include <linux/interrupt.h>
49794+#include <linux/wait.h>
49795+#include "tpm.h"
49796+
49797+#define TPM_HEADER_SIZE 10
49798+
49799+enum tis_access {
49800+ TPM_ACCESS_VALID = 0x80,
49801+ TPM_ACCESS_ACTIVE_LOCALITY = 0x20,
49802+ TPM_ACCESS_REQUEST_PENDING = 0x04,
49803+ TPM_ACCESS_REQUEST_USE = 0x02,
49804+};
49805+
49806+enum tis_status {
49807+ TPM_STS_VALID = 0x80,
49808+ TPM_STS_COMMAND_READY = 0x40,
49809+ TPM_STS_GO = 0x20,
49810+ TPM_STS_DATA_AVAIL = 0x10,
49811+ TPM_STS_DATA_EXPECT = 0x08,
49812+};
49813+
49814+enum tis_int_flags {
49815+ TPM_GLOBAL_INT_ENABLE = 0x80000000,
49816+ TPM_INTF_BURST_COUNT_STATIC = 0x100,
49817+ TPM_INTF_CMD_READY_INT = 0x080,
49818+ TPM_INTF_INT_EDGE_FALLING = 0x040,
49819+ TPM_INTF_INT_EDGE_RISING = 0x020,
49820+ TPM_INTF_INT_LEVEL_LOW = 0x010,
49821+ TPM_INTF_INT_LEVEL_HIGH = 0x008,
49822+ TPM_INTF_LOCALITY_CHANGE_INT = 0x004,
49823+ TPM_INTF_STS_VALID_INT = 0x002,
49824+ TPM_INTF_DATA_AVAIL_INT = 0x001,
49825+};
49826+
49827+enum tis_defaults {
49828+ TIS_MEM_BASE = 0xFED40000,
49829+ TIS_MEM_LEN = 0x5000,
49830+ TIS_SHORT_TIMEOUT = 750, /* ms */
49831+ TIS_LONG_TIMEOUT = 2000, /* 2 sec */
49832+};
49833+
49834+#define TPM_ACCESS(l) (0x0000 | ((l) << 12))
49835+#define TPM_INT_ENABLE(l) (0x0008 | ((l) << 12))
49836+#define TPM_INT_VECTOR(l) (0x000C | ((l) << 12))
49837+#define TPM_INT_STATUS(l) (0x0010 | ((l) << 12))
49838+#define TPM_INTF_CAPS(l) (0x0014 | ((l) << 12))
49839+#define TPM_STS(l) (0x0018 | ((l) << 12))
49840+#define TPM_DATA_FIFO(l) (0x0024 | ((l) << 12))
49841+
49842+#define TPM_DID_VID(l) (0x0F00 | ((l) << 12))
49843+#define TPM_RID(l) (0x0F04 | ((l) << 12))
49844+
49845+static LIST_HEAD(tis_chips);
49846+static DEFINE_SPINLOCK(tis_lock);
49847+
49848+static int check_locality(struct tpm_chip *chip, int l)
49849+{
49850+ if ((ioread8(chip->vendor.iobase + TPM_ACCESS(l)) &
49851+ (TPM_ACCESS_ACTIVE_LOCALITY | TPM_ACCESS_VALID)) ==
49852+ (TPM_ACCESS_ACTIVE_LOCALITY | TPM_ACCESS_VALID))
49853+ return chip->vendor.locality = l;
49854+
49855+ return -1;
49856+}
49857+
49858+static void release_locality(struct tpm_chip *chip, int l, int force)
49859+{
49860+ if (force || (ioread8(chip->vendor.iobase + TPM_ACCESS(l)) &
49861+ (TPM_ACCESS_REQUEST_PENDING | TPM_ACCESS_VALID)) ==
49862+ (TPM_ACCESS_REQUEST_PENDING | TPM_ACCESS_VALID))
49863+ iowrite8(TPM_ACCESS_ACTIVE_LOCALITY,
49864+ chip->vendor.iobase + TPM_ACCESS(l));
49865+}
49866+
49867+static int request_locality(struct tpm_chip *chip, int l)
49868+{
49869+ unsigned long stop;
49870+ long rc;
49871+
49872+ if (check_locality(chip, l) >= 0)
49873+ return l;
49874+
49875+ iowrite8(TPM_ACCESS_REQUEST_USE,
49876+ chip->vendor.iobase + TPM_ACCESS(l));
49877+
49878+ if (chip->vendor.irq) {
49879+ rc = wait_event_interruptible_timeout(chip->vendor.int_queue,
49880+ (check_locality
49881+ (chip, l) >= 0),
49882+ chip->vendor.timeout_a);
49883+ if (rc > 0)
49884+ return l;
49885+
49886+ } else {
49887+ /* wait for burstcount */
49888+ stop = jiffies + chip->vendor.timeout_a;
49889+ do {
49890+ if (check_locality(chip, l) >= 0)
49891+ return l;
49892+ msleep(TPM_TIMEOUT);
49893+ }
49894+ while (time_before(jiffies, stop));
49895+ }
49896+ return -1;
49897+}
49898+
49899+static u8 tpm_tis_status(struct tpm_chip *chip)
49900+{
49901+ return ioread8(chip->vendor.iobase +
49902+ TPM_STS(chip->vendor.locality));
49903+}
49904+
49905+static void tpm_tis_ready(struct tpm_chip *chip)
49906+{
49907+ /* this causes the current command to be aborted */
49908+ iowrite8(TPM_STS_COMMAND_READY,
49909+ chip->vendor.iobase + TPM_STS(chip->vendor.locality));
49910+}
49911+
49912+static int get_burstcount(struct tpm_chip *chip)
49913+{
49914+ unsigned long stop;
49915+ int burstcnt;
49916+
49917+ /* wait for burstcount */
49918+ /* which timeout value, spec has 2 answers (c & d) */
49919+ stop = jiffies + chip->vendor.timeout_d;
49920+ do {
49921+ burstcnt = ioread8(chip->vendor.iobase +
49922+ TPM_STS(chip->vendor.locality) + 1);
49923+ burstcnt += ioread8(chip->vendor.iobase +
49924+ TPM_STS(chip->vendor.locality) +
49925+ 2) << 8;
49926+ if (burstcnt)
49927+ return burstcnt;
49928+ msleep(TPM_TIMEOUT);
49929+ } while (time_before(jiffies, stop));
49930+ return -EBUSY;
49931+}
49932+
49933+static int wait_for_stat(struct tpm_chip *chip, u8 mask, unsigned long timeout,
49934+ wait_queue_head_t *queue)
49935+{
49936+ unsigned long stop;
49937+ long rc;
49938+ u8 status;
49939+
49940+ /* check current status */
49941+ status = tpm_tis_status(chip);
49942+ if ((status & mask) == mask)
49943+ return 0;
49944+
49945+ if (chip->vendor.irq) {
49946+ rc = wait_event_interruptible_timeout(*queue,
49947+ ((tpm_tis_status
49948+ (chip) & mask) ==
49949+ mask), timeout);
49950+ if (rc > 0)
49951+ return 0;
49952+ } else {
49953+ stop = jiffies + timeout;
49954+ do {
49955+ msleep(TPM_TIMEOUT);
49956+ status = tpm_tis_status(chip);
49957+ if ((status & mask) == mask)
49958+ return 0;
49959+ } while (time_before(jiffies, stop));
49960+ }
49961+ return -ETIME;
49962+}
49963+
49964+static int recv_data(struct tpm_chip *chip, u8 *buf, size_t count)
49965+{
49966+ int size = 0, burstcnt;
49967+ while (size < count &&
49968+ wait_for_stat(chip,
49969+ TPM_STS_DATA_AVAIL | TPM_STS_VALID,
49970+ chip->vendor.timeout_c,
49971+ &chip->vendor.read_queue)
49972+ == 0) {
49973+ burstcnt = get_burstcount(chip);
49974+ for (; burstcnt > 0 && size < count; burstcnt--)
49975+ buf[size++] = ioread8(chip->vendor.iobase +
49976+ TPM_DATA_FIFO(chip->vendor.
49977+ locality));
49978+ }
49979+ return size;
49980+}
49981+
49982+static int tpm_tis_recv(struct tpm_chip *chip, u8 *buf, size_t count)
49983+{
49984+ int size = 0;
49985+ int expected, status;
49986+
49987+ if (count < TPM_HEADER_SIZE) {
49988+ size = -EIO;
49989+ goto out;
49990+ }
49991+
49992+ /* read first 10 bytes, including tag, paramsize, and result */
49993+ if ((size =
49994+ recv_data(chip, buf, TPM_HEADER_SIZE)) < TPM_HEADER_SIZE) {
49995+ dev_err(chip->dev, "Unable to read header\n");
49996+ goto out;
49997+ }
49998+
49999+ expected = be32_to_cpu(*(__be32 *) (buf + 2));
50000+ if (expected > count) {
50001+ size = -EIO;
50002+ goto out;
50003+ }
50004+
50005+ if ((size +=
50006+ recv_data(chip, &buf[TPM_HEADER_SIZE],
50007+ expected - TPM_HEADER_SIZE)) < expected) {
50008+ dev_err(chip->dev, "Unable to read remainder of result\n");
50009+ size = -ETIME;
50010+ goto out;
50011+ }
50012+
50013+ wait_for_stat(chip, TPM_STS_VALID, chip->vendor.timeout_c,
50014+ &chip->vendor.int_queue);
50015+ status = tpm_tis_status(chip);
50016+ if (status & TPM_STS_DATA_AVAIL) { /* retry? */
50017+ dev_err(chip->dev, "Error left over data\n");
50018+ size = -EIO;
50019+ goto out;
50020+ }
50021+
50022+out:
50023+ tpm_tis_ready(chip);
50024+ release_locality(chip, chip->vendor.locality, 0);
50025+ return size;
50026+}
50027+
50028+/*
50029+ * If interrupts are used (signaled by an irq set in the vendor structure)
50030+ * tpm.c can skip polling for the data to be available as the interrupt is
50031+ * waited for here
50032+ */
50033+static int tpm_tis_send(struct tpm_chip *chip, u8 *buf, size_t len)
50034+{
50035+ int rc, status, burstcnt;
50036+ size_t count = 0;
50037+ u32 ordinal;
50038+
50039+ if (request_locality(chip, 0) < 0)
50040+ return -EBUSY;
50041+
50042+ status = tpm_tis_status(chip);
50043+ if ((status & TPM_STS_COMMAND_READY) == 0) {
50044+ tpm_tis_ready(chip);
50045+ if (wait_for_stat
50046+ (chip, TPM_STS_COMMAND_READY, chip->vendor.timeout_b,
50047+ &chip->vendor.int_queue) < 0) {
50048+ rc = -ETIME;
50049+ goto out_err;
50050+ }
50051+ }
50052+
50053+ while (count < len - 1) {
50054+ burstcnt = get_burstcount(chip);
50055+ for (; burstcnt > 0 && count < len - 1; burstcnt--) {
50056+ iowrite8(buf[count], chip->vendor.iobase +
50057+ TPM_DATA_FIFO(chip->vendor.locality));
50058+ count++;
50059+ }
50060+
50061+ wait_for_stat(chip, TPM_STS_VALID, chip->vendor.timeout_c,
50062+ &chip->vendor.int_queue);
50063+ status = tpm_tis_status(chip);
50064+ if ((status & TPM_STS_DATA_EXPECT) == 0) {
50065+ rc = -EIO;
50066+ goto out_err;
50067+ }
50068+ }
50069+
50070+ /* write last byte */
50071+ iowrite8(buf[count],
50072+ chip->vendor.iobase +
50073+ TPM_DATA_FIFO(chip->vendor.locality));
50074+ wait_for_stat(chip, TPM_STS_VALID, chip->vendor.timeout_c,
50075+ &chip->vendor.int_queue);
50076+ status = tpm_tis_status(chip);
50077+ if ((status & TPM_STS_DATA_EXPECT) != 0) {
50078+ rc = -EIO;
50079+ goto out_err;
50080+ }
50081+
50082+ /* go and do it */
50083+ iowrite8(TPM_STS_GO,
50084+ chip->vendor.iobase + TPM_STS(chip->vendor.locality));
50085+
50086+ if (chip->vendor.irq) {
50087+ ordinal = be32_to_cpu(*((__be32 *) (buf + 6)));
50088+ if (wait_for_stat
50089+ (chip, TPM_STS_DATA_AVAIL | TPM_STS_VALID,
50090+ tpm_calc_ordinal_duration(chip, ordinal),
50091+ &chip->vendor.read_queue) < 0) {
50092+ rc = -ETIME;
50093+ goto out_err;
50094+ }
50095+ }
50096+ return len;
50097+out_err:
50098+ tpm_tis_ready(chip);
50099+ release_locality(chip, chip->vendor.locality, 0);
50100+ return rc;
50101+}
50102+
50103+static struct file_operations tis_ops = {
50104+ .owner = THIS_MODULE,
50105+ .llseek = no_llseek,
50106+ .open = tpm_open,
50107+ .read = tpm_read,
50108+ .write = tpm_write,
50109+ .release = tpm_release,
50110+};
50111+
50112+static DEVICE_ATTR(pubek, S_IRUGO, tpm_show_pubek, NULL);
50113+static DEVICE_ATTR(pcrs, S_IRUGO, tpm_show_pcrs, NULL);
50114+static DEVICE_ATTR(enabled, S_IRUGO, tpm_show_enabled, NULL);
50115+static DEVICE_ATTR(active, S_IRUGO, tpm_show_active, NULL);
50116+static DEVICE_ATTR(owned, S_IRUGO, tpm_show_owned, NULL);
50117+static DEVICE_ATTR(temp_deactivated, S_IRUGO, tpm_show_temp_deactivated,
50118+ NULL);
50119+static DEVICE_ATTR(caps, S_IRUGO, tpm_show_caps_1_2, NULL);
50120+static DEVICE_ATTR(cancel, S_IWUSR | S_IWGRP, NULL, tpm_store_cancel);
50121+
50122+static struct attribute *tis_attrs[] = {
50123+ &dev_attr_pubek.attr,
50124+ &dev_attr_pcrs.attr,
50125+ &dev_attr_enabled.attr,
50126+ &dev_attr_active.attr,
50127+ &dev_attr_owned.attr,
50128+ &dev_attr_temp_deactivated.attr,
50129+ &dev_attr_caps.attr,
50130+ &dev_attr_cancel.attr, NULL,
50131+};
50132+
50133+static struct attribute_group tis_attr_grp = {
50134+ .attrs = tis_attrs
50135+};
50136+
50137+static struct tpm_vendor_specific tpm_tis = {
50138+ .status = tpm_tis_status,
50139+ .recv = tpm_tis_recv,
50140+ .send = tpm_tis_send,
50141+ .cancel = tpm_tis_ready,
50142+ .req_complete_mask = TPM_STS_DATA_AVAIL | TPM_STS_VALID,
50143+ .req_complete_val = TPM_STS_DATA_AVAIL | TPM_STS_VALID,
50144+ .req_canceled = TPM_STS_COMMAND_READY,
50145+ .attr_group = &tis_attr_grp,
50146+ .miscdev = {
50147+ .fops = &tis_ops,},
50148+};
50149+
50150+static irqreturn_t tis_int_probe(int irq, void *dev_id, struct pt_regs *regs)
50151+{
50152+ struct tpm_chip *chip = (struct tpm_chip *) dev_id;
50153+ u32 interrupt;
50154+
50155+ interrupt = ioread32(chip->vendor.iobase +
50156+ TPM_INT_STATUS(chip->vendor.locality));
50157+
50158+ if (interrupt == 0)
50159+ return IRQ_NONE;
50160+
50161+ chip->vendor.irq = irq;
50162+
50163+ /* Clear interrupts handled with TPM_EOI */
50164+ iowrite32(interrupt,
50165+ chip->vendor.iobase +
50166+ TPM_INT_STATUS(chip->vendor.locality));
50167+ return IRQ_HANDLED;
50168+}
50169+
50170+static irqreturn_t tis_int_handler(int irq, void *dev_id, struct pt_regs *regs)
50171+{
50172+ struct tpm_chip *chip = (struct tpm_chip *) dev_id;
50173+ u32 interrupt;
50174+ int i;
50175+
50176+ interrupt = ioread32(chip->vendor.iobase +
50177+ TPM_INT_STATUS(chip->vendor.locality));
50178+
50179+ if (interrupt == 0)
50180+ return IRQ_NONE;
50181+
50182+ if (interrupt & TPM_INTF_DATA_AVAIL_INT)
50183+ wake_up_interruptible(&chip->vendor.read_queue);
50184+ if (interrupt & TPM_INTF_LOCALITY_CHANGE_INT)
50185+ for (i = 0; i < 5; i++)
50186+ if (check_locality(chip, i) >= 0)
50187+ break;
50188+ if (interrupt &
50189+ (TPM_INTF_LOCALITY_CHANGE_INT | TPM_INTF_STS_VALID_INT |
50190+ TPM_INTF_CMD_READY_INT))
50191+ wake_up_interruptible(&chip->vendor.int_queue);
50192+
50193+ /* Clear interrupts handled with TPM_EOI */
50194+ iowrite32(interrupt,
50195+ chip->vendor.iobase +
50196+ TPM_INT_STATUS(chip->vendor.locality));
50197+ return IRQ_HANDLED;
50198+}
50199+
50200+static int interrupts = 1;
50201+module_param(interrupts, bool, 0444);
50202+MODULE_PARM_DESC(interrupts, "Enable interrupts");
50203+
50204+static int __devinit tpm_tis_pnp_init(struct pnp_dev *pnp_dev,
50205+ const struct pnp_device_id *pnp_id)
50206+{
50207+ u32 vendor, intfcaps, intmask;
50208+ int rc, i;
50209+ unsigned long start, len;
50210+ struct tpm_chip *chip;
50211+
50212+ start = pnp_mem_start(pnp_dev, 0);
50213+ len = pnp_mem_len(pnp_dev, 0);
50214+
50215+ if (!start)
50216+ start = TIS_MEM_BASE;
50217+ if (!len)
50218+ len = TIS_MEM_LEN;
50219+
50220+ if (!(chip = tpm_register_hardware(&pnp_dev->dev, &tpm_tis)))
50221+ return -ENODEV;
50222+
50223+ chip->vendor.iobase = ioremap(start, len);
50224+ if (!chip->vendor.iobase) {
50225+ rc = -EIO;
50226+ goto out_err;
50227+ }
50228+
50229+ vendor = ioread32(chip->vendor.iobase + TPM_DID_VID(0));
50230+
50231+ /* Default timeouts */
50232+ chip->vendor.timeout_a = msecs_to_jiffies(TIS_SHORT_TIMEOUT);
50233+ chip->vendor.timeout_b = msecs_to_jiffies(TIS_LONG_TIMEOUT);
50234+ chip->vendor.timeout_c = msecs_to_jiffies(TIS_SHORT_TIMEOUT);
50235+ chip->vendor.timeout_d = msecs_to_jiffies(TIS_SHORT_TIMEOUT);
50236+
50237+ dev_info(&pnp_dev->dev,
50238+ "1.2 TPM (device-id 0x%X, rev-id %d)\n",
50239+ vendor >> 16, ioread8(chip->vendor.iobase + TPM_RID(0)));
50240+
50241+ /* Figure out the capabilities */
50242+ intfcaps =
50243+ ioread32(chip->vendor.iobase +
50244+ TPM_INTF_CAPS(chip->vendor.locality));
50245+ dev_dbg(&pnp_dev->dev, "TPM interface capabilities (0x%x):\n",
50246+ intfcaps);
50247+ if (intfcaps & TPM_INTF_BURST_COUNT_STATIC)
50248+ dev_dbg(&pnp_dev->dev, "\tBurst Count Static\n");
50249+ if (intfcaps & TPM_INTF_CMD_READY_INT)
50250+ dev_dbg(&pnp_dev->dev, "\tCommand Ready Int Support\n");
50251+ if (intfcaps & TPM_INTF_INT_EDGE_FALLING)
50252+ dev_dbg(&pnp_dev->dev, "\tInterrupt Edge Falling\n");
50253+ if (intfcaps & TPM_INTF_INT_EDGE_RISING)
50254+ dev_dbg(&pnp_dev->dev, "\tInterrupt Edge Rising\n");
50255+ if (intfcaps & TPM_INTF_INT_LEVEL_LOW)
50256+ dev_dbg(&pnp_dev->dev, "\tInterrupt Level Low\n");
50257+ if (intfcaps & TPM_INTF_INT_LEVEL_HIGH)
50258+ dev_dbg(&pnp_dev->dev, "\tInterrupt Level High\n");
50259+ if (intfcaps & TPM_INTF_LOCALITY_CHANGE_INT)
50260+ dev_dbg(&pnp_dev->dev, "\tLocality Change Int Support\n");
50261+ if (intfcaps & TPM_INTF_STS_VALID_INT)
50262+ dev_dbg(&pnp_dev->dev, "\tSts Valid Int Support\n");
50263+ if (intfcaps & TPM_INTF_DATA_AVAIL_INT)
50264+ dev_dbg(&pnp_dev->dev, "\tData Avail Int Support\n");
50265+
50266+ if (request_locality(chip, 0) != 0) {
50267+ rc = -ENODEV;
50268+ goto out_err;
50269+ }
50270+
50271+ /* INTERRUPT Setup */
50272+ init_waitqueue_head(&chip->vendor.read_queue);
50273+ init_waitqueue_head(&chip->vendor.int_queue);
50274+
50275+ intmask =
50276+ ioread32(chip->vendor.iobase +
50277+ TPM_INT_ENABLE(chip->vendor.locality));
50278+
50279+ intmask |= TPM_INTF_CMD_READY_INT
50280+ | TPM_INTF_LOCALITY_CHANGE_INT | TPM_INTF_DATA_AVAIL_INT
50281+ | TPM_INTF_STS_VALID_INT;
50282+
50283+ iowrite32(intmask,
50284+ chip->vendor.iobase +
50285+ TPM_INT_ENABLE(chip->vendor.locality));
50286+ if (interrupts) {
50287+ chip->vendor.irq =
50288+ ioread8(chip->vendor.iobase +
50289+ TPM_INT_VECTOR(chip->vendor.locality));
50290+
50291+ for (i = 3; i < 16 && chip->vendor.irq == 0; i++) {
50292+ iowrite8(i, chip->vendor.iobase +
50293+ TPM_INT_VECTOR(chip->vendor.locality));
50294+ if (request_irq
50295+ (i, tis_int_probe, SA_SHIRQ,
50296+ chip->vendor.miscdev.name, chip) != 0) {
50297+ dev_info(chip->dev,
50298+ "Unable to request irq: %d for probe\n",
50299+ i);
50300+ continue;
50301+ }
50302+
50303+ /* Clear all existing */
50304+ iowrite32(ioread32
50305+ (chip->vendor.iobase +
50306+ TPM_INT_STATUS(chip->vendor.locality)),
50307+ chip->vendor.iobase +
50308+ TPM_INT_STATUS(chip->vendor.locality));
50309+
50310+ /* Turn on */
50311+ iowrite32(intmask | TPM_GLOBAL_INT_ENABLE,
50312+ chip->vendor.iobase +
50313+ TPM_INT_ENABLE(chip->vendor.locality));
50314+
50315+ /* Generate Interrupts */
50316+ tpm_gen_interrupt(chip);
50317+
50318+ /* Turn off */
50319+ iowrite32(intmask,
50320+ chip->vendor.iobase +
50321+ TPM_INT_ENABLE(chip->vendor.locality));
50322+ free_irq(i, chip);
50323+ }
50324+ }
50325+ if (chip->vendor.irq) {
50326+ iowrite8(chip->vendor.irq,
50327+ chip->vendor.iobase +
50328+ TPM_INT_VECTOR(chip->vendor.locality));
50329+ if (request_irq
50330+ (chip->vendor.irq, tis_int_handler, SA_SHIRQ,
50331+ chip->vendor.miscdev.name, chip) != 0) {
50332+ dev_info(chip->dev,
50333+ "Unable to request irq: %d for use\n",
50334+ chip->vendor.irq);
50335+ chip->vendor.irq = 0;
50336+ } else {
50337+ /* Clear all existing */
50338+ iowrite32(ioread32
50339+ (chip->vendor.iobase +
50340+ TPM_INT_STATUS(chip->vendor.locality)),
50341+ chip->vendor.iobase +
50342+ TPM_INT_STATUS(chip->vendor.locality));
50343+
50344+ /* Turn on */
50345+ iowrite32(intmask | TPM_GLOBAL_INT_ENABLE,
50346+ chip->vendor.iobase +
50347+ TPM_INT_ENABLE(chip->vendor.locality));
50348+ }
50349+ }
50350+
50351+ INIT_LIST_HEAD(&chip->vendor.list);
50352+ spin_lock(&tis_lock);
50353+ list_add(&chip->vendor.list, &tis_chips);
50354+ spin_unlock(&tis_lock);
50355+
50356+ tpm_get_timeouts(chip);
50357+ tpm_continue_selftest(chip);
50358+
50359+ return 0;
50360+out_err:
50361+ if (chip->vendor.iobase)
50362+ iounmap(chip->vendor.iobase);
50363+ tpm_remove_hardware(chip->dev);
50364+ return rc;
50365+}
50366+
50367+static int tpm_tis_pnp_suspend(struct pnp_dev *dev, pm_message_t msg)
50368+{
50369+ return tpm_pm_suspend(&dev->dev, msg);
50370+}
50371+
50372+static int tpm_tis_pnp_resume(struct pnp_dev *dev)
50373+{
50374+ return tpm_pm_resume(&dev->dev);
50375+}
50376+
50377+static struct pnp_device_id tpm_pnp_tbl[] __devinitdata = {
50378+ {"PNP0C31", 0}, /* TPM */
50379+ {"ATM1200", 0}, /* Atmel */
50380+ {"IFX0102", 0}, /* Infineon */
50381+ {"BCM0101", 0}, /* Broadcom */
50382+ {"NSC1200", 0}, /* National */
50383+ /* Add new here */
50384+ {"", 0}, /* User Specified */
50385+ {"", 0} /* Terminator */
50386+};
50387+
50388+static struct pnp_driver tis_pnp_driver = {
50389+ .name = "tpm_tis",
50390+ .id_table = tpm_pnp_tbl,
50391+ .probe = tpm_tis_pnp_init,
50392+ .suspend = tpm_tis_pnp_suspend,
50393+ .resume = tpm_tis_pnp_resume,
50394+};
50395+
50396+#define TIS_HID_USR_IDX sizeof(tpm_pnp_tbl)/sizeof(struct pnp_device_id) -2
50397+module_param_string(hid, tpm_pnp_tbl[TIS_HID_USR_IDX].id,
50398+ sizeof(tpm_pnp_tbl[TIS_HID_USR_IDX].id), 0444);
50399+MODULE_PARM_DESC(hid, "Set additional specific HID for this driver to probe");
50400+
50401+static int __init init_tis(void)
50402+{
50403+ return pnp_register_driver(&tis_pnp_driver);
50404+}
50405+
50406+static void __exit cleanup_tis(void)
50407+{
50408+ struct tpm_vendor_specific *i, *j;
50409+ struct tpm_chip *chip;
50410+ spin_lock(&tis_lock);
50411+ list_for_each_entry_safe(i, j, &tis_chips, list) {
50412+ chip = to_tpm_chip(i);
50413+ iowrite32(~TPM_GLOBAL_INT_ENABLE &
50414+ ioread32(chip->vendor.iobase +
50415+ TPM_INT_ENABLE(chip->vendor.
50416+ locality)),
50417+ chip->vendor.iobase +
50418+ TPM_INT_ENABLE(chip->vendor.locality));
50419+ release_locality(chip, chip->vendor.locality, 1);
50420+ if (chip->vendor.irq)
50421+ free_irq(chip->vendor.irq, chip);
50422+ iounmap(i->iobase);
50423+ list_del(&i->list);
50424+ tpm_remove_hardware(chip->dev);
50425+ }
50426+ spin_unlock(&tis_lock);
50427+ pnp_unregister_driver(&tis_pnp_driver);
50428+}
50429+
50430+module_init(init_tis);
50431+module_exit(cleanup_tis);
50432+MODULE_AUTHOR("Leendert van Doorn (leendert@watson.ibm.com)");
50433+MODULE_DESCRIPTION("TPM Driver");
50434+MODULE_VERSION("2.0");
50435+MODULE_LICENSE("GPL");
50436diff -Nur linux-2.6.16.33-noxen/drivers/char/tpm/tpm_vtpm.c linux-2.6.16.33/drivers/char/tpm/tpm_vtpm.c
50437--- linux-2.6.16.33-noxen/drivers/char/tpm/tpm_vtpm.c 1970-01-01 00:00:00.000000000 +0000
50438+++ linux-2.6.16.33/drivers/char/tpm/tpm_vtpm.c 2007-01-08 15:00:45.000000000 +0000
50439@@ -0,0 +1,547 @@
50440+/*
50441+ * Copyright (C) 2006 IBM Corporation
50442+ *
50443+ * Authors:
50444+ * Stefan Berger <stefanb@us.ibm.com>
50445+ *
50446+ * Generic device driver part for device drivers in a virtualized
50447+ * environment.
50448+ *
50449+ * This program is free software; you can redistribute it and/or
50450+ * modify it under the terms of the GNU General Public License as
50451+ * published by the Free Software Foundation, version 2 of the
50452+ * License.
50453+ *
50454+ */
50455+
50456+#include <asm/uaccess.h>
50457+#include <linux/list.h>
50458+#include <linux/device.h>
50459+#include <linux/interrupt.h>
50460+#include <linux/platform_device.h>
50461+#include "tpm.h"
50462+#include "tpm_vtpm.h"
50463+
50464+/* read status bits */
50465+enum {
50466+ STATUS_BUSY = 0x01,
50467+ STATUS_DATA_AVAIL = 0x02,
50468+ STATUS_READY = 0x04
50469+};
50470+
50471+struct transmission {
50472+ struct list_head next;
50473+
50474+ unsigned char *request;
50475+ size_t request_len;
50476+ size_t request_buflen;
50477+
50478+ unsigned char *response;
50479+ size_t response_len;
50480+ size_t response_buflen;
50481+
50482+ unsigned int flags;
50483+};
50484+
50485+enum {
50486+ TRANSMISSION_FLAG_WAS_QUEUED = 0x1
50487+};
50488+
50489+
50490+enum {
50491+ DATAEX_FLAG_QUEUED_ONLY = 0x1
50492+};
50493+
50494+
50495+/* local variables */
50496+
50497+/* local function prototypes */
50498+static int _vtpm_send_queued(struct tpm_chip *chip);
50499+
50500+
50501+/* =============================================================
50502+ * Some utility functions
50503+ * =============================================================
50504+ */
50505+static void vtpm_state_init(struct vtpm_state *vtpms)
50506+{
50507+ vtpms->current_request = NULL;
50508+ spin_lock_init(&vtpms->req_list_lock);
50509+ init_waitqueue_head(&vtpms->req_wait_queue);
50510+ INIT_LIST_HEAD(&vtpms->queued_requests);
50511+
50512+ vtpms->current_response = NULL;
50513+ spin_lock_init(&vtpms->resp_list_lock);
50514+ init_waitqueue_head(&vtpms->resp_wait_queue);
50515+
50516+ vtpms->disconnect_time = jiffies;
50517+}
50518+
50519+
50520+static inline struct transmission *transmission_alloc(void)
50521+{
50522+ return kzalloc(sizeof(struct transmission), GFP_ATOMIC);
50523+}
50524+
50525+static unsigned char *
50526+transmission_set_req_buffer(struct transmission *t,
50527+ unsigned char *buffer, size_t len)
50528+{
50529+ if (t->request_buflen < len) {
50530+ kfree(t->request);
50531+ t->request = kmalloc(len, GFP_KERNEL);
50532+ if (!t->request) {
50533+ t->request_buflen = 0;
50534+ return NULL;
50535+ }
50536+ t->request_buflen = len;
50537+ }
50538+
50539+ memcpy(t->request, buffer, len);
50540+ t->request_len = len;
50541+
50542+ return t->request;
50543+}
50544+
50545+static unsigned char *
50546+transmission_set_res_buffer(struct transmission *t,
50547+ const unsigned char *buffer, size_t len)
50548+{
50549+ if (t->response_buflen < len) {
50550+ kfree(t->response);
50551+ t->response = kmalloc(len, GFP_ATOMIC);
50552+ if (!t->response) {
50553+ t->response_buflen = 0;
50554+ return NULL;
50555+ }
50556+ t->response_buflen = len;
50557+ }
50558+
50559+ memcpy(t->response, buffer, len);
50560+ t->response_len = len;
50561+
50562+ return t->response;
50563+}
50564+
50565+static inline void transmission_free(struct transmission *t)
50566+{
50567+ kfree(t->request);
50568+ kfree(t->response);
50569+ kfree(t);
50570+}
50571+
50572+/* =============================================================
50573+ * Interface with the lower layer driver
50574+ * =============================================================
50575+ */
50576+/*
50577+ * Lower layer uses this function to make a response available.
50578+ */
50579+int vtpm_vd_recv(const struct tpm_chip *chip,
50580+ const unsigned char *buffer, size_t count,
50581+ void *ptr)
50582+{
50583+ unsigned long flags;
50584+ int ret_size = 0;
50585+ struct transmission *t;
50586+ struct vtpm_state *vtpms;
50587+
50588+ vtpms = (struct vtpm_state *)chip_get_private(chip);
50589+
50590+ /*
50591+ * The list with requests must contain one request
50592+ * only and the element there must be the one that
50593+ * was passed to me from the front-end.
50594+ */
50595+ spin_lock_irqsave(&vtpms->resp_list_lock, flags);
50596+ if (vtpms->current_request != ptr) {
50597+ spin_unlock_irqrestore(&vtpms->resp_list_lock, flags);
50598+ return 0;
50599+ }
50600+
50601+ if ((t = vtpms->current_request)) {
50602+ transmission_free(t);
50603+ vtpms->current_request = NULL;
50604+ }
50605+
50606+ t = transmission_alloc();
50607+ if (t) {
50608+ if (!transmission_set_res_buffer(t, buffer, count)) {
50609+ transmission_free(t);
50610+ spin_unlock_irqrestore(&vtpms->resp_list_lock, flags);
50611+ return -ENOMEM;
50612+ }
50613+ ret_size = count;
50614+ vtpms->current_response = t;
50615+ wake_up_interruptible(&vtpms->resp_wait_queue);
50616+ }
50617+ spin_unlock_irqrestore(&vtpms->resp_list_lock, flags);
50618+
50619+ return ret_size;
50620+}
50621+
50622+
50623+/*
50624+ * Lower layer indicates its status (connected/disconnected)
50625+ */
50626+void vtpm_vd_status(const struct tpm_chip *chip, u8 vd_status)
50627+{
50628+ struct vtpm_state *vtpms;
50629+
50630+ vtpms = (struct vtpm_state *)chip_get_private(chip);
50631+
50632+ vtpms->vd_status = vd_status;
50633+ if ((vtpms->vd_status & TPM_VD_STATUS_CONNECTED) == 0) {
50634+ vtpms->disconnect_time = jiffies;
50635+ }
50636+}
50637+
50638+/* =============================================================
50639+ * Interface with the generic TPM driver
50640+ * =============================================================
50641+ */
50642+static int vtpm_recv(struct tpm_chip *chip, u8 *buf, size_t count)
50643+{
50644+ int rc = 0;
50645+ unsigned long flags;
50646+ struct vtpm_state *vtpms;
50647+
50648+ vtpms = (struct vtpm_state *)chip_get_private(chip);
50649+
50650+ /*
50651+ * Check if the previous operation only queued the command
50652+ * In this case there won't be a response, so I just
50653+ * return from here and reset that flag. In any other
50654+ * case I should receive a response from the back-end.
50655+ */
50656+ spin_lock_irqsave(&vtpms->resp_list_lock, flags);
50657+ if ((vtpms->flags & DATAEX_FLAG_QUEUED_ONLY) != 0) {
50658+ vtpms->flags &= ~DATAEX_FLAG_QUEUED_ONLY;
50659+ spin_unlock_irqrestore(&vtpms->resp_list_lock, flags);
50660+ /*
50661+ * The first few commands (measurements) must be
50662+ * queued since it might not be possible to talk to the
50663+ * TPM, yet.
50664+ * Return a response of up to 30 '0's.
50665+ */
50666+
50667+ count = min_t(size_t, count, 30);
50668+ memset(buf, 0x0, count);
50669+ return count;
50670+ }
50671+ /*
50672+ * Check whether something is in the responselist and if
50673+ * there's nothing in the list wait for something to appear.
50674+ */
50675+
50676+ if (!vtpms->current_response) {
50677+ spin_unlock_irqrestore(&vtpms->resp_list_lock, flags);
50678+ interruptible_sleep_on_timeout(&vtpms->resp_wait_queue,
50679+ 1000);
50680+ spin_lock_irqsave(&vtpms->resp_list_lock ,flags);
50681+ }
50682+
50683+ if (vtpms->current_response) {
50684+ struct transmission *t = vtpms->current_response;
50685+ vtpms->current_response = NULL;
50686+ rc = min(count, t->response_len);
50687+ memcpy(buf, t->response, rc);
50688+ transmission_free(t);
50689+ }
50690+
50691+ spin_unlock_irqrestore(&vtpms->resp_list_lock, flags);
50692+ return rc;
50693+}
50694+
50695+static int vtpm_send(struct tpm_chip *chip, u8 *buf, size_t count)
50696+{
50697+ int rc = 0;
50698+ unsigned long flags;
50699+ struct transmission *t = transmission_alloc();
50700+ struct vtpm_state *vtpms;
50701+
50702+ vtpms = (struct vtpm_state *)chip_get_private(chip);
50703+
50704+ if (!t)
50705+ return -ENOMEM;
50706+ /*
50707+ * If there's a current request, it must be the
50708+ * previous request that has timed out.
50709+ */
50710+ spin_lock_irqsave(&vtpms->req_list_lock, flags);
50711+ if (vtpms->current_request != NULL) {
50712+ printk("WARNING: Sending although there is a request outstanding.\n"
50713+ " Previous request must have timed out.\n");
50714+ transmission_free(vtpms->current_request);
50715+ vtpms->current_request = NULL;
50716+ }
50717+ spin_unlock_irqrestore(&vtpms->req_list_lock, flags);
50718+
50719+ /*
50720+ * Queue the packet if the driver below is not
50721+ * ready, yet, or there is any packet already
50722+ * in the queue.
50723+ * If the driver below is ready, unqueue all
50724+ * packets first before sending our current
50725+ * packet.
50726+ * For each unqueued packet, except for the
50727+ * last (=current) packet, call the function
50728+ * tpm_xen_recv to wait for the response to come
50729+ * back.
50730+ */
50731+ if ((vtpms->vd_status & TPM_VD_STATUS_CONNECTED) == 0) {
50732+ if (time_after(jiffies,
50733+ vtpms->disconnect_time + HZ * 10)) {
50734+ rc = -ENOENT;
50735+ } else {
50736+ goto queue_it;
50737+ }
50738+ } else {
50739+ /*
50740+ * Send all queued packets.
50741+ */
50742+ if (_vtpm_send_queued(chip) == 0) {
50743+
50744+ vtpms->current_request = t;
50745+
50746+ rc = vtpm_vd_send(vtpms->tpm_private,
50747+ buf,
50748+ count,
50749+ t);
50750+ /*
50751+ * The generic TPM driver will call
50752+ * the function to receive the response.
50753+ */
50754+ if (rc < 0) {
50755+ vtpms->current_request = NULL;
50756+ goto queue_it;
50757+ }
50758+ } else {
50759+queue_it:
50760+ if (!transmission_set_req_buffer(t, buf, count)) {
50761+ transmission_free(t);
50762+ rc = -ENOMEM;
50763+ goto exit;
50764+ }
50765+ /*
50766+ * An error occurred. Don't event try
50767+ * to send the current request. Just
50768+ * queue it.
50769+ */
50770+ spin_lock_irqsave(&vtpms->req_list_lock, flags);
50771+ vtpms->flags |= DATAEX_FLAG_QUEUED_ONLY;
50772+ list_add_tail(&t->next, &vtpms->queued_requests);
50773+ spin_unlock_irqrestore(&vtpms->req_list_lock, flags);
50774+ }
50775+ }
50776+
50777+exit:
50778+ return rc;
50779+}
50780+
50781+
50782+/*
50783+ * Send all queued requests.
50784+ */
50785+static int _vtpm_send_queued(struct tpm_chip *chip)
50786+{
50787+ int rc;
50788+ int error = 0;
50789+ long flags;
50790+ unsigned char buffer[1];
50791+ struct vtpm_state *vtpms;
50792+ vtpms = (struct vtpm_state *)chip_get_private(chip);
50793+
50794+ spin_lock_irqsave(&vtpms->req_list_lock, flags);
50795+
50796+ while (!list_empty(&vtpms->queued_requests)) {
50797+ /*
50798+ * Need to dequeue them.
50799+ * Read the result into a dummy buffer.
50800+ */
50801+ struct transmission *qt = (struct transmission *)
50802+ vtpms->queued_requests.next;
50803+ list_del(&qt->next);
50804+ vtpms->current_request = qt;
50805+ spin_unlock_irqrestore(&vtpms->req_list_lock, flags);
50806+
50807+ rc = vtpm_vd_send(vtpms->tpm_private,
50808+ qt->request,
50809+ qt->request_len,
50810+ qt);
50811+
50812+ if (rc < 0) {
50813+ spin_lock_irqsave(&vtpms->req_list_lock, flags);
50814+ if ((qt = vtpms->current_request) != NULL) {
50815+ /*
50816+ * requeue it at the beginning
50817+ * of the list
50818+ */
50819+ list_add(&qt->next,
50820+ &vtpms->queued_requests);
50821+ }
50822+ vtpms->current_request = NULL;
50823+ error = 1;
50824+ break;
50825+ }
50826+ /*
50827+ * After this point qt is not valid anymore!
50828+ * It is freed when the front-end is delivering
50829+ * the data by calling tpm_recv
50830+ */
50831+ /*
50832+ * Receive response into provided dummy buffer
50833+ */
50834+ rc = vtpm_recv(chip, buffer, sizeof(buffer));
50835+ spin_lock_irqsave(&vtpms->req_list_lock, flags);
50836+ }
50837+
50838+ spin_unlock_irqrestore(&vtpms->req_list_lock, flags);
50839+
50840+ return error;
50841+}
50842+
50843+static void vtpm_cancel(struct tpm_chip *chip)
50844+{
50845+ unsigned long flags;
50846+ struct vtpm_state *vtpms = (struct vtpm_state *)chip_get_private(chip);
50847+
50848+ spin_lock_irqsave(&vtpms->resp_list_lock,flags);
50849+
50850+ if (!vtpms->current_response && vtpms->current_request) {
50851+ spin_unlock_irqrestore(&vtpms->resp_list_lock, flags);
50852+ interruptible_sleep_on(&vtpms->resp_wait_queue);
50853+ spin_lock_irqsave(&vtpms->resp_list_lock,flags);
50854+ }
50855+
50856+ if (vtpms->current_response) {
50857+ struct transmission *t = vtpms->current_response;
50858+ vtpms->current_response = NULL;
50859+ transmission_free(t);
50860+ }
50861+
50862+ spin_unlock_irqrestore(&vtpms->resp_list_lock,flags);
50863+}
50864+
50865+static u8 vtpm_status(struct tpm_chip *chip)
50866+{
50867+ u8 rc = 0;
50868+ unsigned long flags;
50869+ struct vtpm_state *vtpms;
50870+
50871+ vtpms = (struct vtpm_state *)chip_get_private(chip);
50872+
50873+ spin_lock_irqsave(&vtpms->resp_list_lock, flags);
50874+ /*
50875+ * Data are available if:
50876+ * - there's a current response
50877+ * - the last packet was queued only (this is fake, but necessary to
50878+ * get the generic TPM layer to call the receive function.)
50879+ */
50880+ if (vtpms->current_response ||
50881+ 0 != (vtpms->flags & DATAEX_FLAG_QUEUED_ONLY)) {
50882+ rc = STATUS_DATA_AVAIL;
50883+ } else if (!vtpms->current_response && !vtpms->current_request) {
50884+ rc = STATUS_READY;
50885+ }
50886+
50887+ spin_unlock_irqrestore(&vtpms->resp_list_lock, flags);
50888+ return rc;
50889+}
50890+
50891+static struct file_operations vtpm_ops = {
50892+ .owner = THIS_MODULE,
50893+ .llseek = no_llseek,
50894+ .open = tpm_open,
50895+ .read = tpm_read,
50896+ .write = tpm_write,
50897+ .release = tpm_release,
50898+};
50899+
50900+static DEVICE_ATTR(pubek, S_IRUGO, tpm_show_pubek, NULL);
50901+static DEVICE_ATTR(pcrs, S_IRUGO, tpm_show_pcrs, NULL);
50902+static DEVICE_ATTR(enabled, S_IRUGO, tpm_show_enabled, NULL);
50903+static DEVICE_ATTR(active, S_IRUGO, tpm_show_active, NULL);
50904+static DEVICE_ATTR(owned, S_IRUGO, tpm_show_owned, NULL);
50905+static DEVICE_ATTR(temp_deactivated, S_IRUGO, tpm_show_temp_deactivated,
50906+ NULL);
50907+static DEVICE_ATTR(caps, S_IRUGO, tpm_show_caps, NULL);
50908+static DEVICE_ATTR(cancel, S_IWUSR | S_IWGRP, NULL, tpm_store_cancel);
50909+
50910+static struct attribute *vtpm_attrs[] = {
50911+ &dev_attr_pubek.attr,
50912+ &dev_attr_pcrs.attr,
50913+ &dev_attr_enabled.attr,
50914+ &dev_attr_active.attr,
50915+ &dev_attr_owned.attr,
50916+ &dev_attr_temp_deactivated.attr,
50917+ &dev_attr_caps.attr,
50918+ &dev_attr_cancel.attr,
50919+ NULL,
50920+};
50921+
50922+static struct attribute_group vtpm_attr_grp = { .attrs = vtpm_attrs };
50923+
50924+#define TPM_LONG_TIMEOUT (10 * 60 * HZ)
50925+
50926+static struct tpm_vendor_specific tpm_vtpm = {
50927+ .recv = vtpm_recv,
50928+ .send = vtpm_send,
50929+ .cancel = vtpm_cancel,
50930+ .status = vtpm_status,
50931+ .req_complete_mask = STATUS_BUSY | STATUS_DATA_AVAIL,
50932+ .req_complete_val = STATUS_DATA_AVAIL,
50933+ .req_canceled = STATUS_READY,
50934+ .attr_group = &vtpm_attr_grp,
50935+ .miscdev = {
50936+ .fops = &vtpm_ops,
50937+ },
50938+ .duration = {
50939+ TPM_LONG_TIMEOUT,
50940+ TPM_LONG_TIMEOUT,
50941+ TPM_LONG_TIMEOUT,
50942+ },
50943+};
50944+
50945+struct tpm_chip *init_vtpm(struct device *dev,
50946+ struct tpm_virtual_device *tvd,
50947+ struct tpm_private *tp)
50948+{
50949+ long rc;
50950+ struct tpm_chip *chip;
50951+ struct vtpm_state *vtpms;
50952+
50953+ vtpms = kzalloc(sizeof(struct vtpm_state), GFP_KERNEL);
50954+ if (!vtpms)
50955+ return ERR_PTR(-ENOMEM);
50956+
50957+ vtpm_state_init(vtpms);
50958+ vtpms->tpmvd = tvd;
50959+ vtpms->tpm_private = tp;
50960+
50961+ if (tvd)
50962+ tpm_vtpm.buffersize = tvd->max_tx_size;
50963+
50964+ chip = tpm_register_hardware(dev, &tpm_vtpm);
50965+ if (!chip) {
50966+ rc = -ENODEV;
50967+ goto err_free_mem;
50968+ }
50969+
50970+ chip_set_private(chip, vtpms);
50971+
50972+ return chip;
50973+
50974+err_free_mem:
50975+ kfree(vtpms);
50976+
50977+ return ERR_PTR(rc);
50978+}
50979+
50980+void cleanup_vtpm(struct device *dev)
50981+{
50982+ struct tpm_chip *chip = dev_get_drvdata(dev);
50983+ struct vtpm_state *vtpms = (struct vtpm_state*)chip_get_private(chip);
50984+ tpm_remove_hardware(dev);
50985+ kfree(vtpms);
50986+}
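
A note on the hooks registered in tpm_vtpm above: the generic TPM layer is expected to poll the .status callback and compare the result against req_complete_mask/req_complete_val, with req_canceled signalling a cancel. A minimal sketch of that polling contract follows; it is illustrative only, the function name is invented here, and everything not defined in the file above (msleep, the jiffies helpers, the errno values) is assumed from standard kernel headers.

/* Illustrative sketch, not part of the patch: how a caller is expected to
 * drive the hooks registered in tpm_vtpm.  Only STATUS_*, tpm_vtpm and
 * vtpm_status() come from the file above; the rest is assumed. */
#include <linux/errno.h>
#include <linux/jiffies.h>
#include <linux/delay.h>

static int vtpm_poll_for_result_sketch(struct tpm_chip *chip)
{
	const struct tpm_vendor_specific *v = &tpm_vtpm;
	unsigned long stop = jiffies + v->duration[0];	/* TPM_LONG_TIMEOUT */

	do {
		u8 status = v->status(chip);		/* calls vtpm_status() */

		/* STATUS_DATA_AVAIL (including the "queued only" fake case)
		 * satisfies req_complete_mask/req_complete_val */
		if ((status & v->req_complete_mask) == v->req_complete_val)
			return 0;
		if (status == v->req_canceled)		/* back to STATUS_READY */
			return -ECANCELED;
		msleep(10);
	} while (time_before(jiffies, stop));

	return -ETIME;
}
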
50987diff -Nur linux-2.6.16.33-noxen/drivers/char/tpm/tpm_vtpm.h linux-2.6.16.33/drivers/char/tpm/tpm_vtpm.h
50988--- linux-2.6.16.33-noxen/drivers/char/tpm/tpm_vtpm.h 1970-01-01 00:00:00.000000000 +0000
50989+++ linux-2.6.16.33/drivers/char/tpm/tpm_vtpm.h 2007-01-08 15:00:45.000000000 +0000
50990@@ -0,0 +1,68 @@
50991+#ifndef TPM_VTPM_H
50992+#define TPM_VTPM_H
50993+
50994+struct tpm_chip;
50995+struct tpm_private;
50996+
50997+struct tpm_virtual_device {
50998+ /*
50999+ * This field indicates the maximum size the driver can
51000+ * transfer in one chunk. It is filled in by the front-end
51001+ * driver and should be propagated to the generic tpm driver
51002+ * for allocation of buffers.
51003+ */
51004+ unsigned int max_tx_size;
51005+};
51006+
51007+struct vtpm_state {
51008+ struct transmission *current_request;
51009+ spinlock_t req_list_lock;
51010+ wait_queue_head_t req_wait_queue;
51011+
51012+ struct list_head queued_requests;
51013+
51014+ struct transmission *current_response;
51015+ spinlock_t resp_list_lock;
51016+ wait_queue_head_t resp_wait_queue; // processes waiting for responses
51017+
51018+ u8 vd_status;
51019+ u8 flags;
51020+
51021+ unsigned long disconnect_time;
51022+
51023+ struct tpm_virtual_device *tpmvd;
51024+
51025+ /*
51026+ * The following is a private structure of the underlying
51027+ * driver. It is passed as parameter in the send function.
51028+ */
51029+ struct tpm_private *tpm_private;
51030+};
51031+
51032+
51033+enum vdev_status {
51034+ TPM_VD_STATUS_DISCONNECTED = 0x0,
51035+ TPM_VD_STATUS_CONNECTED = 0x1
51036+};
51037+
51038+/* this function is called from tpm_vtpm.c */
51039+int vtpm_vd_send(struct tpm_private * tp,
51040+ const u8 * buf, size_t count, void *ptr);
51041+
51042+/* these functions are offered by tpm_vtpm.c */
51043+struct tpm_chip *init_vtpm(struct device *,
51044+ struct tpm_virtual_device *,
51045+ struct tpm_private *);
51046+void cleanup_vtpm(struct device *);
51047+int vtpm_vd_recv(const struct tpm_chip* chip,
51048+ const unsigned char *buffer, size_t count, void *ptr);
51049+void vtpm_vd_status(const struct tpm_chip *, u8 status);
51050+
51051+static inline struct tpm_private *tpm_private_from_dev(struct device *dev)
51052+{
51053+ struct tpm_chip *chip = dev_get_drvdata(dev);
51054+ struct vtpm_state *vtpms = chip_get_private(chip);
51055+ return vtpms->tpm_private;
51056+}
51057+
51058+#endif
51059diff -Nur linux-2.6.16.33-noxen/drivers/char/tpm/tpm_xen.c linux-2.6.16.33/drivers/char/tpm/tpm_xen.c
51060--- linux-2.6.16.33-noxen/drivers/char/tpm/tpm_xen.c 1970-01-01 00:00:00.000000000 +0000
51061+++ linux-2.6.16.33/drivers/char/tpm/tpm_xen.c 2007-01-08 15:00:45.000000000 +0000
51062@@ -0,0 +1,760 @@
51063+/*
51064+ * Copyright (c) 2005, IBM Corporation
51065+ *
51066+ * Author: Stefan Berger, stefanb@us.ibm.com
51067+ * Grant table support: Mahadevan Gomathisankaran
51068+ *
51069+ * This code has been derived from drivers/xen/netfront/netfront.c
51070+ *
51071+ * Copyright (c) 2002-2004, K A Fraser
51072+ *
51073+ * This program is free software; you can redistribute it and/or
51074+ * modify it under the terms of the GNU General Public License version 2
51075+ * as published by the Free Software Foundation; or, when distributed
51076+ * separately from the Linux kernel or incorporated into other
51077+ * software packages, subject to the following license:
51078+ *
51079+ * Permission is hereby granted, free of charge, to any person obtaining a copy
51080+ * of this source file (the "Software"), to deal in the Software without
51081+ * restriction, including without limitation the rights to use, copy, modify,
51082+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
51083+ * and to permit persons to whom the Software is furnished to do so, subject to
51084+ * the following conditions:
51085+ *
51086+ * The above copyright notice and this permission notice shall be included in
51087+ * all copies or substantial portions of the Software.
51088+ *
51089+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
51090+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
51091+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
51092+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
51093+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
51094+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
51095+ * IN THE SOFTWARE.
51096+ */
51097+
51098+#include <linux/errno.h>
51099+#include <linux/err.h>
51100+#include <linux/interrupt.h>
51101+#include <linux/mutex.h>
51102+#include <asm/uaccess.h>
51103+#include <xen/evtchn.h>
51104+#include <xen/interface/grant_table.h>
51105+#include <xen/interface/io/tpmif.h>
51106+#include <xen/gnttab.h>
51107+#include <xen/xenbus.h>
51108+#include "tpm.h"
51109+#include "tpm_vtpm.h"
51110+
51111+#undef DEBUG
51112+
51113+/* local structures */
51114+struct tpm_private {
51115+ struct tpm_chip *chip;
51116+
51117+ tpmif_tx_interface_t *tx;
51118+ atomic_t refcnt;
51119+ unsigned int evtchn;
51120+ unsigned int irq;
51121+ u8 is_connected;
51122+ u8 is_suspended;
51123+
51124+ spinlock_t tx_lock;
51125+
51126+ struct tx_buffer *tx_buffers[TPMIF_TX_RING_SIZE];
51127+
51128+ atomic_t tx_busy;
51129+ void *tx_remember;
51130+
51131+ domid_t backend_id;
51132+ wait_queue_head_t wait_q;
51133+
51134+ struct xenbus_device *dev;
51135+ int ring_ref;
51136+};
51137+
51138+struct tx_buffer {
51139+ unsigned int size; // available space in data
51140+ unsigned int len; // used space in data
51141+ unsigned char *data; // pointer to a page
51142+};
51143+
51144+
51145+/* locally visible variables */
51146+static grant_ref_t gref_head;
51147+static struct tpm_private *my_priv;
51148+
51149+/* local function prototypes */
51150+static irqreturn_t tpmif_int(int irq,
51151+ void *tpm_priv,
51152+ struct pt_regs *ptregs);
51153+static void tpmif_rx_action(unsigned long unused);
51154+static int tpmif_connect(struct xenbus_device *dev,
51155+ struct tpm_private *tp,
51156+ domid_t domid);
51157+static DECLARE_TASKLET(tpmif_rx_tasklet, tpmif_rx_action, 0);
51158+static int tpmif_allocate_tx_buffers(struct tpm_private *tp);
51159+static void tpmif_free_tx_buffers(struct tpm_private *tp);
51160+static void tpmif_set_connected_state(struct tpm_private *tp,
51161+ u8 newstate);
51162+static int tpm_xmit(struct tpm_private *tp,
51163+ const u8 * buf, size_t count, int userbuffer,
51164+ void *remember);
51165+static void destroy_tpmring(struct tpm_private *tp);
51166+void __exit tpmif_exit(void);
51167+
51168+#define DPRINTK(fmt, args...) \
51169+ pr_debug("xen_tpm_fr (%s:%d) " fmt, __FUNCTION__, __LINE__, ##args)
51170+#define IPRINTK(fmt, args...) \
51171+ printk(KERN_INFO "xen_tpm_fr: " fmt, ##args)
51172+#define WPRINTK(fmt, args...) \
51173+ printk(KERN_WARNING "xen_tpm_fr: " fmt, ##args)
51174+
51175+#define GRANT_INVALID_REF 0
51176+
51177+
51178+static inline int
51179+tx_buffer_copy(struct tx_buffer *txb, const u8 * src, int len,
51180+ int isuserbuffer)
51181+{
51182+ int copied = len;
51183+
51184+ if (len > txb->size) {
51185+ copied = txb->size;
51186+ }
51187+ if (isuserbuffer) {
51188+ if (copy_from_user(txb->data, src, copied))
51189+ return -EFAULT;
51190+ } else {
51191+ memcpy(txb->data, src, copied);
51192+ }
51193+ txb->len = len;
51194+ return copied;
51195+}
51196+
51197+static inline struct tx_buffer *tx_buffer_alloc(void)
51198+{
51199+ struct tx_buffer *txb = kzalloc(sizeof (struct tx_buffer),
51200+ GFP_KERNEL);
51201+
51202+ if (txb) {
51203+ txb->len = 0;
51204+ txb->size = PAGE_SIZE;
51205+ txb->data = (unsigned char *)__get_free_page(GFP_KERNEL);
51206+ if (txb->data == NULL) {
51207+ kfree(txb);
51208+ txb = NULL;
51209+ }
51210+ }
51211+ return txb;
51212+}
51213+
51214+
51215+static inline void tx_buffer_free(struct tx_buffer *txb)
51216+{
51217+ if (txb) {
51218+ free_page((long)txb->data);
51219+ kfree(txb);
51220+ }
51221+}
51222+
51223+/**************************************************************
51224+ Utility function for the tpm_private structure
51225+**************************************************************/
51226+static inline void tpm_private_init(struct tpm_private *tp)
51227+{
51228+ spin_lock_init(&tp->tx_lock);
51229+ init_waitqueue_head(&tp->wait_q);
51230+ atomic_set(&tp->refcnt, 1);
51231+}
51232+
51233+static inline void tpm_private_put(void)
51234+{
51235+ if ( atomic_dec_and_test(&my_priv->refcnt)) {
51236+ tpmif_free_tx_buffers(my_priv);
51237+ kfree(my_priv);
51238+ my_priv = NULL;
51239+ }
51240+}
51241+
51242+static struct tpm_private *tpm_private_get(void)
51243+{
51244+ int err;
51245+ if (!my_priv) {
51246+ my_priv = kzalloc(sizeof(struct tpm_private), GFP_KERNEL);
51247+ if (my_priv) {
51248+ tpm_private_init(my_priv);
51249+ err = tpmif_allocate_tx_buffers(my_priv);
51250+ if (err < 0) {
51251+ tpm_private_put();
51252+ }
51253+ }
51254+ } else {
51255+ atomic_inc(&my_priv->refcnt);
51256+ }
51257+ return my_priv;
51258+}
51259+
51260+/**************************************************************
51261+
51262+ The interface to let the tpm plugin register its callback
51263+ function and send data to another partition using this module
51264+
51265+**************************************************************/
51266+
51267+static DEFINE_MUTEX(suspend_lock);
51268+/*
51269+ * Send data via this module by calling this function
51270+ */
51271+int vtpm_vd_send(struct tpm_private *tp,
51272+ const u8 * buf, size_t count, void *ptr)
51273+{
51274+ int sent;
51275+
51276+ mutex_lock(&suspend_lock);
51277+ sent = tpm_xmit(tp, buf, count, 0, ptr);
51278+ mutex_unlock(&suspend_lock);
51279+
51280+ return sent;
51281+}
51282+
51283+/**************************************************************
51284+ XENBUS support code
51285+**************************************************************/
51286+
51287+static int setup_tpmring(struct xenbus_device *dev,
51288+ struct tpm_private *tp)
51289+{
51290+ tpmif_tx_interface_t *sring;
51291+ int err;
51292+
51293+ tp->ring_ref = GRANT_INVALID_REF;
51294+
51295+ sring = (void *)__get_free_page(GFP_KERNEL);
51296+ if (!sring) {
51297+ xenbus_dev_fatal(dev, -ENOMEM, "allocating shared ring");
51298+ return -ENOMEM;
51299+ }
51300+ tp->tx = sring;
51301+
51302+ err = xenbus_grant_ring(dev, virt_to_mfn(tp->tx));
51303+ if (err < 0) {
51304+ free_page((unsigned long)sring);
51305+ tp->tx = NULL;
51306+ xenbus_dev_fatal(dev, err, "allocating grant reference");
51307+ goto fail;
51308+ }
51309+ tp->ring_ref = err;
51310+
51311+ err = tpmif_connect(dev, tp, dev->otherend_id);
51312+ if (err)
51313+ goto fail;
51314+
51315+ return 0;
51316+fail:
51317+ destroy_tpmring(tp);
51318+ return err;
51319+}
51320+
51321+
51322+static void destroy_tpmring(struct tpm_private *tp)
51323+{
51324+ tpmif_set_connected_state(tp, 0);
51325+
51326+ if (tp->ring_ref != GRANT_INVALID_REF) {
51327+ gnttab_end_foreign_access(tp->ring_ref, 0,
51328+ (unsigned long)tp->tx);
51329+ tp->ring_ref = GRANT_INVALID_REF;
51330+ tp->tx = NULL;
51331+ }
51332+
51333+ if (tp->irq)
51334+ unbind_from_irqhandler(tp->irq, tp);
51335+
51336+ tp->evtchn = tp->irq = 0;
51337+}
51338+
51339+
51340+static int talk_to_backend(struct xenbus_device *dev,
51341+ struct tpm_private *tp)
51342+{
51343+ const char *message = NULL;
51344+ int err;
51345+ struct xenbus_transaction xbt;
51346+
51347+ err = setup_tpmring(dev, tp);
51348+ if (err) {
51349+ xenbus_dev_fatal(dev, err, "setting up ring");
51350+ goto out;
51351+ }
51352+
51353+again:
51354+ err = xenbus_transaction_start(&xbt);
51355+ if (err) {
51356+ xenbus_dev_fatal(dev, err, "starting transaction");
51357+ goto destroy_tpmring;
51358+ }
51359+
51360+ err = xenbus_printf(xbt, dev->nodename,
51361+ "ring-ref","%u", tp->ring_ref);
51362+ if (err) {
51363+ message = "writing ring-ref";
51364+ goto abort_transaction;
51365+ }
51366+
51367+ err = xenbus_printf(xbt, dev->nodename,
51368+ "event-channel", "%u", tp->evtchn);
51369+ if (err) {
51370+ message = "writing event-channel";
51371+ goto abort_transaction;
51372+ }
51373+
51374+ err = xenbus_transaction_end(xbt, 0);
51375+ if (err == -EAGAIN)
51376+ goto again;
51377+ if (err) {
51378+ xenbus_dev_fatal(dev, err, "completing transaction");
51379+ goto destroy_tpmring;
51380+ }
51381+
51382+ xenbus_switch_state(dev, XenbusStateConnected);
51383+
51384+ return 0;
51385+
51386+abort_transaction:
51387+ xenbus_transaction_end(xbt, 1);
51388+ if (message)
51389+ xenbus_dev_error(dev, err, "%s", message);
51390+destroy_tpmring:
51391+ destroy_tpmring(tp);
51392+out:
51393+ return err;
51394+}
51395+
51396+/**
51397+ * Callback received when the backend's state changes.
51398+ */
51399+static void backend_changed(struct xenbus_device *dev,
51400+ enum xenbus_state backend_state)
51401+{
51402+ struct tpm_private *tp = tpm_private_from_dev(&dev->dev);
51403+ DPRINTK("\n");
51404+
51405+ switch (backend_state) {
51406+ case XenbusStateInitialising:
51407+ case XenbusStateInitWait:
51408+ case XenbusStateInitialised:
51409+ case XenbusStateUnknown:
51410+ break;
51411+
51412+ case XenbusStateConnected:
51413+ tpmif_set_connected_state(tp, 1);
51414+ break;
51415+
51416+ case XenbusStateClosing:
51417+ tpmif_set_connected_state(tp, 0);
51418+ xenbus_frontend_closed(dev);
51419+ break;
51420+
51421+ case XenbusStateClosed:
51422+ tpmif_set_connected_state(tp, 0);
51423+ if (tp->is_suspended == 0)
51424+ device_unregister(&dev->dev);
51425+ xenbus_frontend_closed(dev);
51426+ break;
51427+ }
51428+}
51429+
51430+struct tpm_virtual_device tvd = {
51431+ .max_tx_size = PAGE_SIZE * TPMIF_TX_RING_SIZE,
51432+};
51433+
51434+static int tpmfront_probe(struct xenbus_device *dev,
51435+ const struct xenbus_device_id *id)
51436+{
51437+ int err;
51438+ int handle;
51439+ struct tpm_private *tp = tpm_private_get();
51440+
51441+ if (!tp)
51442+ return -ENOMEM;
51443+
51444+ tp->chip = init_vtpm(&dev->dev, &tvd, tp);
51445+
51446+ if (IS_ERR(tp->chip)) {
51447+ return PTR_ERR(tp->chip);
51448+ }
51449+
51450+ err = xenbus_scanf(XBT_NIL, dev->nodename,
51451+ "handle", "%i", &handle);
51452+ if (XENBUS_EXIST_ERR(err))
51453+ return err;
51454+
51455+ if (err < 0) {
51456+ xenbus_dev_fatal(dev,err,"reading virtual-device");
51457+ return err;
51458+ }
51459+
51460+ tp->dev = dev;
51461+
51462+ err = talk_to_backend(dev, tp);
51463+ if (err) {
51464+ tpm_private_put();
51465+ return err;
51466+ }
51467+ return 0;
51468+}
51469+
51470+
51471+static int tpmfront_remove(struct xenbus_device *dev)
51472+{
51473+ struct tpm_private *tp = tpm_private_from_dev(&dev->dev);
51474+ destroy_tpmring(tp);
51475+ cleanup_vtpm(&dev->dev);
51476+ return 0;
51477+}
51478+
51479+static int tpmfront_suspend(struct xenbus_device *dev)
51480+{
51481+ struct tpm_private *tp = tpm_private_from_dev(&dev->dev);
51482+ u32 ctr;
51483+ /* lock, so no app can send */
51484+ mutex_lock(&suspend_lock);
51485+ tp->is_suspended = 1;
51486+
51487+ for (ctr = 0; atomic_read(&tp->tx_busy) && ctr <= 300; ctr++) {
51488+ if ((ctr % 10) == 0)
51489+ printk("TPM-FE [INFO]: Waiting for outstanding "
51490+ "request.\n");
51491+ /*
51492+ * Wait for a request to be responded to.
51493+ */
51494+ interruptible_sleep_on_timeout(&tp->wait_q, 100);
51495+ }
51496+ xenbus_switch_state(dev, XenbusStateClosing);
51497+
51498+ if (atomic_read(&tp->tx_busy)) {
51499+ /*
51500+ * A temporary work-around.
51501+ */
51502+		printk("TPM-FE [WARNING]: Resetting busy flag.\n");
51503+ atomic_set(&tp->tx_busy, 0);
51504+ }
51505+
51506+ return 0;
51507+}
51508+
51509+static int tpmfront_resume(struct xenbus_device *dev)
51510+{
51511+ struct tpm_private *tp = tpm_private_from_dev(&dev->dev);
51512+ destroy_tpmring(tp);
51513+ return talk_to_backend(dev, tp);
51514+}
51515+
51516+static int tpmif_connect(struct xenbus_device *dev,
51517+ struct tpm_private *tp,
51518+ domid_t domid)
51519+{
51520+ int err;
51521+
51522+ tp->backend_id = domid;
51523+
51524+ err = xenbus_alloc_evtchn(dev, &tp->evtchn);
51525+ if (err)
51526+ return err;
51527+
51528+ err = bind_evtchn_to_irqhandler(tp->evtchn,
51529+ tpmif_int, SA_SAMPLE_RANDOM, "tpmif",
51530+ tp);
51531+ if (err <= 0) {
51532+ WPRINTK("bind_evtchn_to_irqhandler failed (err=%d)\n", err);
51533+ return err;
51534+ }
51535+
51536+ tp->irq = err;
51537+ return 0;
51538+}
51539+
51540+static struct xenbus_device_id tpmfront_ids[] = {
51541+ { "vtpm" },
51542+ { "" }
51543+};
51544+
51545+static struct xenbus_driver tpmfront = {
51546+ .name = "vtpm",
51547+ .owner = THIS_MODULE,
51548+ .ids = tpmfront_ids,
51549+ .probe = tpmfront_probe,
51550+ .remove = tpmfront_remove,
51551+ .resume = tpmfront_resume,
51552+ .otherend_changed = backend_changed,
51553+ .suspend = tpmfront_suspend,
51554+};
51555+
51556+static void __init init_tpm_xenbus(void)
51557+{
51558+ xenbus_register_frontend(&tpmfront);
51559+}
51560+
51561+static void __exit exit_tpm_xenbus(void)
51562+{
51563+ xenbus_unregister_driver(&tpmfront);
51564+}
51565+
51566+static int tpmif_allocate_tx_buffers(struct tpm_private *tp)
51567+{
51568+ unsigned int i;
51569+
51570+ for (i = 0; i < TPMIF_TX_RING_SIZE; i++) {
51571+ tp->tx_buffers[i] = tx_buffer_alloc();
51572+ if (!tp->tx_buffers[i]) {
51573+ tpmif_free_tx_buffers(tp);
51574+ return -ENOMEM;
51575+ }
51576+ }
51577+ return 0;
51578+}
51579+
51580+static void tpmif_free_tx_buffers(struct tpm_private *tp)
51581+{
51582+ unsigned int i;
51583+
51584+ for (i = 0; i < TPMIF_TX_RING_SIZE; i++) {
51585+ tx_buffer_free(tp->tx_buffers[i]);
51586+ }
51587+}
51588+
51589+static void tpmif_rx_action(unsigned long priv)
51590+{
51591+ struct tpm_private *tp = (struct tpm_private *)priv;
51592+
51593+ int i = 0;
51594+ unsigned int received;
51595+ unsigned int offset = 0;
51596+ u8 *buffer;
51597+ tpmif_tx_request_t *tx;
51598+ tx = &tp->tx->ring[i].req;
51599+
51600+ atomic_set(&tp->tx_busy, 0);
51601+ wake_up_interruptible(&tp->wait_q);
51602+
51603+ received = tx->size;
51604+
51605+ buffer = kmalloc(received, GFP_ATOMIC);
51606+ if (NULL == buffer) {
51607+ goto exit;
51608+ }
51609+
51610+ for (i = 0; i < TPMIF_TX_RING_SIZE && offset < received; i++) {
51611+ struct tx_buffer *txb = tp->tx_buffers[i];
51612+ tpmif_tx_request_t *tx;
51613+ unsigned int tocopy;
51614+
51615+ tx = &tp->tx->ring[i].req;
51616+ tocopy = tx->size;
51617+ if (tocopy > PAGE_SIZE) {
51618+ tocopy = PAGE_SIZE;
51619+ }
51620+
51621+ memcpy(&buffer[offset], txb->data, tocopy);
51622+
51623+ gnttab_release_grant_reference(&gref_head, tx->ref);
51624+
51625+ offset += tocopy;
51626+ }
51627+
51628+ vtpm_vd_recv(tp->chip, buffer, received, tp->tx_remember);
51629+ kfree(buffer);
51630+
51631+exit:
51632+
51633+ return;
51634+}
51635+
51636+
51637+static irqreturn_t tpmif_int(int irq, void *tpm_priv, struct pt_regs *ptregs)
51638+{
51639+ struct tpm_private *tp = tpm_priv;
51640+ unsigned long flags;
51641+
51642+ spin_lock_irqsave(&tp->tx_lock, flags);
51643+ tpmif_rx_tasklet.data = (unsigned long)tp;
51644+ tasklet_schedule(&tpmif_rx_tasklet);
51645+ spin_unlock_irqrestore(&tp->tx_lock, flags);
51646+
51647+ return IRQ_HANDLED;
51648+}
51649+
51650+
51651+static int tpm_xmit(struct tpm_private *tp,
51652+ const u8 * buf, size_t count, int isuserbuffer,
51653+ void *remember)
51654+{
51655+ tpmif_tx_request_t *tx;
51656+ TPMIF_RING_IDX i;
51657+ unsigned int offset = 0;
51658+
51659+ spin_lock_irq(&tp->tx_lock);
51660+
51661+ if (unlikely(atomic_read(&tp->tx_busy))) {
51662+ printk("tpm_xmit: There's an outstanding request/response "
51663+ "on the way!\n");
51664+ spin_unlock_irq(&tp->tx_lock);
51665+ return -EBUSY;
51666+ }
51667+
51668+ if (tp->is_connected != 1) {
51669+ spin_unlock_irq(&tp->tx_lock);
51670+ return -EIO;
51671+ }
51672+
51673+ for (i = 0; count > 0 && i < TPMIF_TX_RING_SIZE; i++) {
51674+ struct tx_buffer *txb = tp->tx_buffers[i];
51675+ int copied;
51676+
51677+ if (NULL == txb) {
51678+			DPRINTK("txb (i=%d) is NULL. buffers initialized?\n"
51679+ "Not transmitting anything!\n", i);
51680+ spin_unlock_irq(&tp->tx_lock);
51681+ return -EFAULT;
51682+ }
51683+ copied = tx_buffer_copy(txb, &buf[offset], count,
51684+ isuserbuffer);
51685+ if (copied < 0) {
51686+ /* An error occurred */
51687+ spin_unlock_irq(&tp->tx_lock);
51688+ return copied;
51689+ }
51690+ count -= copied;
51691+ offset += copied;
51692+
51693+ tx = &tp->tx->ring[i].req;
51694+
51695+ tx->addr = virt_to_machine(txb->data);
51696+ tx->size = txb->len;
51697+
51698+ DPRINTK("First 4 characters sent by TPM-FE are 0x%02x 0x%02x 0x%02x 0x%02x\n",
51699+ txb->data[0],txb->data[1],txb->data[2],txb->data[3]);
51700+
51701+ /* get the granttable reference for this page */
51702+ tx->ref = gnttab_claim_grant_reference(&gref_head);
51703+
51704+ if (-ENOSPC == tx->ref) {
51705+ spin_unlock_irq(&tp->tx_lock);
51706+ DPRINTK(" Grant table claim reference failed in func:%s line:%d file:%s\n", __FUNCTION__, __LINE__, __FILE__);
51707+ return -ENOSPC;
51708+ }
51709+ gnttab_grant_foreign_access_ref( tx->ref,
51710+ tp->backend_id,
51711+ (tx->addr >> PAGE_SHIFT),
51712+ 0 /*RW*/);
51713+ wmb();
51714+ }
51715+
51716+ atomic_set(&tp->tx_busy, 1);
51717+ tp->tx_remember = remember;
51718+
51719+ mb();
51720+
51721+ DPRINTK("Notifying backend via event channel %d\n",
51722+ tp->evtchn);
51723+
51724+ notify_remote_via_irq(tp->irq);
51725+
51726+ spin_unlock_irq(&tp->tx_lock);
51727+ return offset;
51728+}
51729+
51730+
51731+static void tpmif_notify_upperlayer(struct tpm_private *tp)
51732+{
51733+ /*
51734+ * Notify upper layer about the state of the connection
51735+ * to the BE.
51736+ */
51737+ if (tp->is_connected) {
51738+ vtpm_vd_status(tp->chip, TPM_VD_STATUS_CONNECTED);
51739+ } else {
51740+ vtpm_vd_status(tp->chip, TPM_VD_STATUS_DISCONNECTED);
51741+ }
51742+}
51743+
51744+
51745+static void tpmif_set_connected_state(struct tpm_private *tp, u8 is_connected)
51746+{
51747+ /*
51748+ * Don't notify upper layer if we are in suspend mode and
51749+	 * should disconnect - assumption is that we will resume.
51750+ * The mutex keeps apps from sending.
51751+ */
51752+ if (is_connected == 0 && tp->is_suspended == 1) {
51753+ return;
51754+ }
51755+
51756+ /*
51757+ * Unlock the mutex if we are connected again
51758+ * after being suspended - now resuming.
51759+ * This also removes the suspend state.
51760+ */
51761+ if (is_connected == 1 && tp->is_suspended == 1) {
51762+ tp->is_suspended = 0;
51763+ /* unlock, so apps can resume sending */
51764+ mutex_unlock(&suspend_lock);
51765+ }
51766+
51767+ if (is_connected != tp->is_connected) {
51768+ tp->is_connected = is_connected;
51769+ tpmif_notify_upperlayer(tp);
51770+ }
51771+}
51772+
51773+
51774+
51775+/* =================================================================
51776+ * Initialization function.
51777+ * =================================================================
51778+ */
51779+
51780+
51781+static int __init tpmif_init(void)
51782+{
51783+ long rc = 0;
51784+ struct tpm_private *tp;
51785+
51786+ if (is_initial_xendomain())
51787+ return -EPERM;
51788+
51789+ tp = tpm_private_get();
51790+ if (!tp) {
51791+ rc = -ENOMEM;
51792+ goto failexit;
51793+ }
51794+
51795+ IPRINTK("Initialising the vTPM driver.\n");
51796+ if ( gnttab_alloc_grant_references ( TPMIF_TX_RING_SIZE,
51797+ &gref_head ) < 0) {
51798+ rc = -EFAULT;
51799+ goto gnttab_alloc_failed;
51800+ }
51801+
51802+ init_tpm_xenbus();
51803+ return 0;
51804+
51805+gnttab_alloc_failed:
51806+ tpm_private_put();
51807+failexit:
51808+
51809+ return (int)rc;
51810+}
51811+
51812+
51813+void __exit tpmif_exit(void)
51814+{
51815+ exit_tpm_xenbus();
51816+ tpm_private_put();
51817+ gnttab_free_grant_references(gref_head);
51818+}
51819+
51820+module_init(tpmif_init);
51821+
51822+MODULE_LICENSE("Dual BSD/GPL");
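
For orientation, the handshake performed by talk_to_backend() above amounts to publishing two ordinary xenstore keys under the front-end's device node, which the vtpm backend reads in order to map the shared ring and bind the event channel. The path prefix shown below is an assumption based on the usual xenbus naming of dev->nodename; the patch itself only writes the relative keys "ring-ref" and "event-channel":

	/local/domain/<guest-domid>/device/vtpm/<handle>/ring-ref      = "<grant ref from setup_tpmring()>"
	/local/domain/<guest-domid>/device/vtpm/<handle>/event-channel = "<port from xenbus_alloc_evtchn()>"
	/local/domain/<guest-domid>/device/vtpm/<handle>/state         = "4"  (XenbusStateConnected, via xenbus_switch_state())
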
51823diff -Nur linux-2.6.16.33-noxen/drivers/char/tty_io.c linux-2.6.16.33/drivers/char/tty_io.c
51824--- linux-2.6.16.33-noxen/drivers/char/tty_io.c 2006-11-22 18:06:31.000000000 +0000
51825+++ linux-2.6.16.33/drivers/char/tty_io.c 2007-01-08 15:00:45.000000000 +0000
51826@@ -132,6 +132,8 @@
51827 vt.c for deeply disgusting hack reasons */
51828 DECLARE_MUTEX(tty_sem);
51829
51830+int console_use_vt = 1;
51831+
51832 #ifdef CONFIG_UNIX98_PTYS
51833 extern struct tty_driver *ptm_driver; /* Unix98 pty masters; for /dev/ptmx */
51834 extern int pty_limit; /* Config limit on Unix98 ptys */
51835@@ -2054,7 +2056,7 @@
51836 goto got_driver;
51837 }
51838 #ifdef CONFIG_VT
51839- if (device == MKDEV(TTY_MAJOR,0)) {
51840+ if (console_use_vt && (device == MKDEV(TTY_MAJOR,0))) {
51841 extern struct tty_driver *console_driver;
51842 driver = console_driver;
51843 index = fg_console;
51844@@ -3245,6 +3247,8 @@
51845 #endif
51846
51847 #ifdef CONFIG_VT
51848+ if (!console_use_vt)
51849+ goto out_vt;
51850 cdev_init(&vc0_cdev, &console_fops);
51851 if (cdev_add(&vc0_cdev, MKDEV(TTY_MAJOR, 0), 1) ||
51852 register_chrdev_region(MKDEV(TTY_MAJOR, 0), 1, "/dev/vc/0") < 0)
51853@@ -3253,6 +3257,7 @@
51854 class_device_create(tty_class, NULL, MKDEV(TTY_MAJOR, 0), NULL, "tty0");
51855
51856 vty_init();
51857+ out_vt:
51858 #endif
51859 return 0;
51860 }
51861diff -Nur linux-2.6.16.33-noxen/drivers/firmware/Kconfig linux-2.6.16.33/drivers/firmware/Kconfig
51862--- linux-2.6.16.33-noxen/drivers/firmware/Kconfig 2006-11-22 18:06:31.000000000 +0000
51863+++ linux-2.6.16.33/drivers/firmware/Kconfig 2007-01-08 15:00:45.000000000 +0000
51864@@ -8,7 +8,7 @@
51865 config EDD
51866 tristate "BIOS Enhanced Disk Drive calls determine boot disk (EXPERIMENTAL)"
51867 depends on EXPERIMENTAL
51868- depends on !IA64
51869+ depends on !IA64 && !XEN
51870 help
51871 Say Y or M here if you want to enable BIOS Enhanced Disk Drive
51872 Services real mode BIOS calls to determine which disk
51873diff -Nur linux-2.6.16.33-noxen/drivers/ide/ide-lib.c linux-2.6.16.33/drivers/ide/ide-lib.c
51874--- linux-2.6.16.33-noxen/drivers/ide/ide-lib.c 2006-11-22 18:06:31.000000000 +0000
51875+++ linux-2.6.16.33/drivers/ide/ide-lib.c 2007-05-23 21:00:01.000000000 +0000
51876@@ -410,10 +410,10 @@
51877 {
51878 u64 addr = BLK_BOUNCE_HIGH; /* dma64_addr_t */
51879
51880- if (!PCI_DMA_BUS_IS_PHYS) {
51881- addr = BLK_BOUNCE_ANY;
51882- } else if (on && drive->media == ide_disk) {
51883- if (HWIF(drive)->pci_dev)
51884+ if (on && drive->media == ide_disk) {
51885+ if (!PCI_DMA_BUS_IS_PHYS)
51886+ addr = BLK_BOUNCE_ANY;
51887+ else if (HWIF(drive)->pci_dev)
51888 addr = HWIF(drive)->pci_dev->dma_mask;
51889 }
51890
51891diff -Nur linux-2.6.16.33-noxen/drivers/infiniband/ulp/ipoib/ipoib_multicast.c linux-2.6.16.33/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
51892--- linux-2.6.16.33-noxen/drivers/infiniband/ulp/ipoib/ipoib_multicast.c 2006-11-22 18:06:31.000000000 +0000
51893+++ linux-2.6.16.33/drivers/infiniband/ulp/ipoib/ipoib_multicast.c 2007-05-23 21:00:01.000000000 +0000
51894@@ -821,7 +821,8 @@
51895
51896 ipoib_mcast_stop_thread(dev, 0);
51897
51898- spin_lock_irqsave(&dev->xmit_lock, flags);
51899+ local_irq_save(flags);
51900+ netif_tx_lock(dev);
51901 spin_lock(&priv->lock);
51902
51903 /*
51904@@ -896,7 +897,8 @@
51905 }
51906
51907 spin_unlock(&priv->lock);
51908- spin_unlock_irqrestore(&dev->xmit_lock, flags);
51909+ netif_tx_unlock(dev);
51910+ local_irq_restore(flags);
51911
51912 /* We have to cancel outside of the spinlock */
51913 list_for_each_entry_safe(mcast, tmcast, &remove_list, list) {
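
The ipoib change above, and the driver and documentation changes that follow, replace direct use of dev->xmit_lock with the netif_tx_lock*() helpers; because those helpers have no irqsave variant, ipoib takes local_irq_save() explicitly before the plain lock. As a rough sketch of what the wrappers expand to in kernels of this generation (the _xmit_lock/xmit_lock_owner field names are recalled from contemporary include/linux/netdevice.h, not taken from this hunk):

/* Sketch only -- approximate contemporary definitions, not added by this hunk. */
static inline void netif_tx_lock(struct net_device *dev)
{
	spin_lock(&dev->_xmit_lock);
	dev->xmit_lock_owner = smp_processor_id();
}

static inline void netif_tx_lock_bh(struct net_device *dev)
{
	spin_lock_bh(&dev->_xmit_lock);
	dev->xmit_lock_owner = smp_processor_id();
}

static inline void netif_tx_unlock(struct net_device *dev)
{
	dev->xmit_lock_owner = -1;
	spin_unlock(&dev->_xmit_lock);
}

static inline void netif_tx_unlock_bh(struct net_device *dev)
{
	dev->xmit_lock_owner = -1;
	spin_unlock_bh(&dev->_xmit_lock);
}
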
51914diff -Nur linux-2.6.16.33-noxen/drivers/media/dvb/dvb-core/dvb_net.c linux-2.6.16.33/drivers/media/dvb/dvb-core/dvb_net.c
51915--- linux-2.6.16.33-noxen/drivers/media/dvb/dvb-core/dvb_net.c 2006-11-22 18:06:31.000000000 +0000
51916+++ linux-2.6.16.33/drivers/media/dvb/dvb-core/dvb_net.c 2007-05-23 21:00:01.000000000 +0000
51917@@ -1053,7 +1053,7 @@
51918
51919 dvb_net_feed_stop(dev);
51920 priv->rx_mode = RX_MODE_UNI;
51921- spin_lock_bh(&dev->xmit_lock);
51922+ netif_tx_lock_bh(dev);
51923
51924 if (dev->flags & IFF_PROMISC) {
51925 dprintk("%s: promiscuous mode\n", dev->name);
51926@@ -1078,7 +1078,7 @@
51927 }
51928 }
51929
51930- spin_unlock_bh(&dev->xmit_lock);
51931+ netif_tx_unlock_bh(dev);
51932 dvb_net_feed_start(dev);
51933 }
51934
51935diff -Nur linux-2.6.16.33-noxen/drivers/net/8139cp.c linux-2.6.16.33/drivers/net/8139cp.c
51936--- linux-2.6.16.33-noxen/drivers/net/8139cp.c 2006-11-22 18:06:31.000000000 +0000
51937+++ linux-2.6.16.33/drivers/net/8139cp.c 2007-05-23 21:00:01.000000000 +0000
51938@@ -794,7 +794,7 @@
51939 entry = cp->tx_head;
51940 eor = (entry == (CP_TX_RING_SIZE - 1)) ? RingEnd : 0;
51941 if (dev->features & NETIF_F_TSO)
51942- mss = skb_shinfo(skb)->tso_size;
51943+ mss = skb_shinfo(skb)->gso_size;
51944
51945 if (skb_shinfo(skb)->nr_frags == 0) {
51946 struct cp_desc *txd = &cp->tx_ring[entry];
51947diff -Nur linux-2.6.16.33-noxen/drivers/net/bnx2.c linux-2.6.16.33/drivers/net/bnx2.c
51948--- linux-2.6.16.33-noxen/drivers/net/bnx2.c 2006-11-22 18:06:31.000000000 +0000
51949+++ linux-2.6.16.33/drivers/net/bnx2.c 2007-05-23 21:00:01.000000000 +0000
51950@@ -1593,7 +1593,7 @@
51951 skb = tx_buf->skb;
51952 #ifdef BCM_TSO
51953 /* partial BD completions possible with TSO packets */
51954- if (skb_shinfo(skb)->tso_size) {
51955+ if (skb_is_gso(skb)) {
51956 u16 last_idx, last_ring_idx;
51957
51958 last_idx = sw_cons +
51959@@ -1948,7 +1948,7 @@
51960 return 1;
51961 }
51962
51963-/* Called with rtnl_lock from vlan functions and also dev->xmit_lock
51964+/* Called with rtnl_lock from vlan functions and also netif_tx_lock
51965 * from set_multicast.
51966 */
51967 static void
51968@@ -4403,7 +4403,7 @@
51969 }
51970 #endif
51971
51972-/* Called with dev->xmit_lock.
51973+/* Called with netif_tx_lock.
51974 * hard_start_xmit is pseudo-lockless - a lock is only required when
51975 * the tx queue is full. This way, we get the benefit of lockless
51976 * operations most of the time without the complexities to handle
51977@@ -4441,7 +4441,7 @@
51978 (TX_BD_FLAGS_VLAN_TAG | (vlan_tx_tag_get(skb) << 16));
51979 }
51980 #ifdef BCM_TSO
51981- if ((mss = skb_shinfo(skb)->tso_size) &&
51982+ if ((mss = skb_shinfo(skb)->gso_size) &&
51983 (skb->len > (bp->dev->mtu + ETH_HLEN))) {
51984 u32 tcp_opt_len, ip_tcp_len;
51985
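
bnx2 above, and several drivers below, stop testing skb_shinfo(skb)->tso_size directly and instead use the renamed gso_size field together with the skb_is_gso() helper. The helper is just a readability wrapper; its assumed definition, taken from include/linux/skbuff.h of the same kernel generation rather than from this hunk, is:

/* Assumed definition of the helper the converted drivers rely on. */
static inline int skb_is_gso(const struct sk_buff *skb)
{
	return skb_shinfo(skb)->gso_size;
}
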
51986diff -Nur linux-2.6.16.33-noxen/drivers/net/bonding/bond_main.c linux-2.6.16.33/drivers/net/bonding/bond_main.c
51987--- linux-2.6.16.33-noxen/drivers/net/bonding/bond_main.c 2006-11-22 18:06:31.000000000 +0000
51988+++ linux-2.6.16.33/drivers/net/bonding/bond_main.c 2007-05-23 21:00:01.000000000 +0000
51989@@ -1145,8 +1145,7 @@
51990 }
51991
51992 #define BOND_INTERSECT_FEATURES \
51993- (NETIF_F_SG|NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM|\
51994- NETIF_F_TSO|NETIF_F_UFO)
51995+ (NETIF_F_SG | NETIF_F_ALL_CSUM | NETIF_F_TSO | NETIF_F_UFO)
51996
51997 /*
51998 * Compute the common dev->feature set available to all slaves. Some
51999@@ -1164,9 +1163,7 @@
52000 features &= (slave->dev->features & BOND_INTERSECT_FEATURES);
52001
52002 if ((features & NETIF_F_SG) &&
52003- !(features & (NETIF_F_IP_CSUM |
52004- NETIF_F_NO_CSUM |
52005- NETIF_F_HW_CSUM)))
52006+ !(features & NETIF_F_ALL_CSUM))
52007 features &= ~NETIF_F_SG;
52008
52009 /*
52010@@ -4147,7 +4144,7 @@
52011 */
52012 bond_dev->features |= NETIF_F_VLAN_CHALLENGED;
52013
52014- /* don't acquire bond device's xmit_lock when
52015+ /* don't acquire bond device's netif_tx_lock when
52016 * transmitting */
52017 bond_dev->features |= NETIF_F_LLTX;
52018
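
The bonding hunk above folds the explicit NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM triple into NETIF_F_ALL_CSUM. A sketch of the shorthand this relies on, assumed from the netdevice.h of the same kernel generation (it is not defined in this hunk):

/* Assumed feature-flag shorthands used by the bonding change above. */
#define NETIF_F_GEN_CSUM	(NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)
#define NETIF_F_ALL_CSUM	(NETIF_F_IP_CSUM | NETIF_F_GEN_CSUM)
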
52019diff -Nur linux-2.6.16.33-noxen/drivers/net/chelsio/sge.c linux-2.6.16.33/drivers/net/chelsio/sge.c
52020--- linux-2.6.16.33-noxen/drivers/net/chelsio/sge.c 2006-11-22 18:06:31.000000000 +0000
52021+++ linux-2.6.16.33/drivers/net/chelsio/sge.c 2007-05-23 21:00:01.000000000 +0000
52022@@ -1419,7 +1419,7 @@
52023 struct cpl_tx_pkt *cpl;
52024
52025 #ifdef NETIF_F_TSO
52026- if (skb_shinfo(skb)->tso_size) {
52027+ if (skb_is_gso(skb)) {
52028 int eth_type;
52029 struct cpl_tx_pkt_lso *hdr;
52030
52031@@ -1434,7 +1434,7 @@
52032 hdr->ip_hdr_words = skb->nh.iph->ihl;
52033 hdr->tcp_hdr_words = skb->h.th->doff;
52034 hdr->eth_type_mss = htons(MK_ETH_TYPE_MSS(eth_type,
52035- skb_shinfo(skb)->tso_size));
52036+ skb_shinfo(skb)->gso_size));
52037 hdr->len = htonl(skb->len - sizeof(*hdr));
52038 cpl = (struct cpl_tx_pkt *)hdr;
52039 sge->stats.tx_lso_pkts++;
52040diff -Nur linux-2.6.16.33-noxen/drivers/net/e1000/e1000_main.c linux-2.6.16.33/drivers/net/e1000/e1000_main.c
52041--- linux-2.6.16.33-noxen/drivers/net/e1000/e1000_main.c 2006-11-22 18:06:31.000000000 +0000
52042+++ linux-2.6.16.33/drivers/net/e1000/e1000_main.c 2007-05-23 21:00:01.000000000 +0000
52043@@ -2526,7 +2526,7 @@
52044 uint8_t ipcss, ipcso, tucss, tucso, hdr_len;
52045 int err;
52046
52047- if (skb_shinfo(skb)->tso_size) {
52048+ if (skb_is_gso(skb)) {
52049 if (skb_header_cloned(skb)) {
52050 err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
52051 if (err)
52052@@ -2534,7 +2534,7 @@
52053 }
52054
52055 hdr_len = ((skb->h.raw - skb->data) + (skb->h.th->doff << 2));
52056- mss = skb_shinfo(skb)->tso_size;
52057+ mss = skb_shinfo(skb)->gso_size;
52058 if (skb->protocol == ntohs(ETH_P_IP)) {
52059 skb->nh.iph->tot_len = 0;
52060 skb->nh.iph->check = 0;
52061@@ -2651,7 +2651,7 @@
52062 * tso gets written back prematurely before the data is fully
52063 * DMAd to the controller */
52064 if (!skb->data_len && tx_ring->last_tx_tso &&
52065- !skb_shinfo(skb)->tso_size) {
52066+ !skb_is_gso(skb)) {
52067 tx_ring->last_tx_tso = 0;
52068 size -= 4;
52069 }
52070@@ -2893,7 +2893,7 @@
52071 }
52072
52073 #ifdef NETIF_F_TSO
52074- mss = skb_shinfo(skb)->tso_size;
52075+ mss = skb_shinfo(skb)->gso_size;
52076 /* The controller does a simple calculation to
52077 * make sure there is enough room in the FIFO before
52078 * initiating the DMA for each buffer. The calc is:
52079@@ -2934,8 +2934,7 @@
52080
52081 #ifdef NETIF_F_TSO
52082 /* Controller Erratum workaround */
52083- if (!skb->data_len && tx_ring->last_tx_tso &&
52084- !skb_shinfo(skb)->tso_size)
52085+ if (!skb->data_len && tx_ring->last_tx_tso && !skb_is_gso(skb))
52086 count++;
52087 #endif
52088
52089diff -Nur linux-2.6.16.33-noxen/drivers/net/forcedeth.c linux-2.6.16.33/drivers/net/forcedeth.c
52090--- linux-2.6.16.33-noxen/drivers/net/forcedeth.c 2006-11-22 18:06:31.000000000 +0000
52091+++ linux-2.6.16.33/drivers/net/forcedeth.c 2007-05-23 21:00:01.000000000 +0000
52092@@ -482,9 +482,9 @@
52093 * critical parts:
52094 * - rx is (pseudo-) lockless: it relies on the single-threading provided
52095 * by the arch code for interrupts.
52096- * - tx setup is lockless: it relies on dev->xmit_lock. Actual submission
52097+ * - tx setup is lockless: it relies on netif_tx_lock. Actual submission
52098 * needs dev->priv->lock :-(
52099- * - set_multicast_list: preparation lockless, relies on dev->xmit_lock.
52100+ * - set_multicast_list: preparation lockless, relies on netif_tx_lock.
52101 */
52102
52103 /* in dev: base, irq */
52104@@ -1016,7 +1016,7 @@
52105
52106 /*
52107 * nv_start_xmit: dev->hard_start_xmit function
52108- * Called with dev->xmit_lock held.
52109+ * Called with netif_tx_lock held.
52110 */
52111 static int nv_start_xmit(struct sk_buff *skb, struct net_device *dev)
52112 {
52113@@ -1105,8 +1105,8 @@
52114 np->tx_skbuff[nr] = skb;
52115
52116 #ifdef NETIF_F_TSO
52117- if (skb_shinfo(skb)->tso_size)
52118- tx_flags_extra = NV_TX2_TSO | (skb_shinfo(skb)->tso_size << NV_TX2_TSO_SHIFT);
52119+ if (skb_is_gso(skb))
52120+ tx_flags_extra = NV_TX2_TSO | (skb_shinfo(skb)->gso_size << NV_TX2_TSO_SHIFT);
52121 else
52122 #endif
52123 tx_flags_extra = (skb->ip_summed == CHECKSUM_HW ? (NV_TX2_CHECKSUM_L3|NV_TX2_CHECKSUM_L4) : 0);
52124@@ -1203,7 +1203,7 @@
52125
52126 /*
52127 * nv_tx_timeout: dev->tx_timeout function
52128- * Called with dev->xmit_lock held.
52129+ * Called with netif_tx_lock held.
52130 */
52131 static void nv_tx_timeout(struct net_device *dev)
52132 {
52133@@ -1524,7 +1524,7 @@
52134 * Changing the MTU is a rare event, it shouldn't matter.
52135 */
52136 disable_irq(dev->irq);
52137- spin_lock_bh(&dev->xmit_lock);
52138+ netif_tx_lock_bh(dev);
52139 spin_lock(&np->lock);
52140 /* stop engines */
52141 nv_stop_rx(dev);
52142@@ -1559,7 +1559,7 @@
52143 nv_start_rx(dev);
52144 nv_start_tx(dev);
52145 spin_unlock(&np->lock);
52146- spin_unlock_bh(&dev->xmit_lock);
52147+ netif_tx_unlock_bh(dev);
52148 enable_irq(dev->irq);
52149 }
52150 return 0;
52151@@ -1594,7 +1594,7 @@
52152 memcpy(dev->dev_addr, macaddr->sa_data, ETH_ALEN);
52153
52154 if (netif_running(dev)) {
52155- spin_lock_bh(&dev->xmit_lock);
52156+ netif_tx_lock_bh(dev);
52157 spin_lock_irq(&np->lock);
52158
52159 /* stop rx engine */
52160@@ -1606,7 +1606,7 @@
52161 /* restart rx engine */
52162 nv_start_rx(dev);
52163 spin_unlock_irq(&np->lock);
52164- spin_unlock_bh(&dev->xmit_lock);
52165+ netif_tx_unlock_bh(dev);
52166 } else {
52167 nv_copy_mac_to_hw(dev);
52168 }
52169@@ -1615,7 +1615,7 @@
52170
52171 /*
52172 * nv_set_multicast: dev->set_multicast function
52173- * Called with dev->xmit_lock held.
52174+ * Called with netif_tx_lock held.
52175 */
52176 static void nv_set_multicast(struct net_device *dev)
52177 {
52178diff -Nur linux-2.6.16.33-noxen/drivers/net/hamradio/6pack.c linux-2.6.16.33/drivers/net/hamradio/6pack.c
52179--- linux-2.6.16.33-noxen/drivers/net/hamradio/6pack.c 2006-11-22 18:06:31.000000000 +0000
52180+++ linux-2.6.16.33/drivers/net/hamradio/6pack.c 2007-05-23 21:00:01.000000000 +0000
52181@@ -308,9 +308,9 @@
52182 {
52183 struct sockaddr_ax25 *sa = addr;
52184
52185- spin_lock_irq(&dev->xmit_lock);
52186+ netif_tx_lock_bh(dev);
52187 memcpy(dev->dev_addr, &sa->sax25_call, AX25_ADDR_LEN);
52188- spin_unlock_irq(&dev->xmit_lock);
52189+ netif_tx_unlock_bh(dev);
52190
52191 return 0;
52192 }
52193@@ -767,9 +767,9 @@
52194 break;
52195 }
52196
52197- spin_lock_irq(&dev->xmit_lock);
52198+ netif_tx_lock_bh(dev);
52199 memcpy(dev->dev_addr, &addr, AX25_ADDR_LEN);
52200- spin_unlock_irq(&dev->xmit_lock);
52201+ netif_tx_unlock_bh(dev);
52202
52203 err = 0;
52204 break;
52205diff -Nur linux-2.6.16.33-noxen/drivers/net/hamradio/mkiss.c linux-2.6.16.33/drivers/net/hamradio/mkiss.c
52206--- linux-2.6.16.33-noxen/drivers/net/hamradio/mkiss.c 2006-11-22 18:06:31.000000000 +0000
52207+++ linux-2.6.16.33/drivers/net/hamradio/mkiss.c 2007-05-23 21:00:01.000000000 +0000
52208@@ -357,9 +357,9 @@
52209 {
52210 struct sockaddr_ax25 *sa = addr;
52211
52212- spin_lock_irq(&dev->xmit_lock);
52213+ netif_tx_lock_bh(dev);
52214 memcpy(dev->dev_addr, &sa->sax25_call, AX25_ADDR_LEN);
52215- spin_unlock_irq(&dev->xmit_lock);
52216+ netif_tx_unlock_bh(dev);
52217
52218 return 0;
52219 }
52220@@ -886,9 +886,9 @@
52221 break;
52222 }
52223
52224- spin_lock_irq(&dev->xmit_lock);
52225+ netif_tx_lock_bh(dev);
52226 memcpy(dev->dev_addr, addr, AX25_ADDR_LEN);
52227- spin_unlock_irq(&dev->xmit_lock);
52228+ netif_tx_unlock_bh(dev);
52229
52230 err = 0;
52231 break;
52232diff -Nur linux-2.6.16.33-noxen/drivers/net/ifb.c linux-2.6.16.33/drivers/net/ifb.c
52233--- linux-2.6.16.33-noxen/drivers/net/ifb.c 2006-11-22 18:06:31.000000000 +0000
52234+++ linux-2.6.16.33/drivers/net/ifb.c 2007-05-23 21:00:01.000000000 +0000
52235@@ -76,13 +76,13 @@
52236 dp->st_task_enter++;
52237 if ((skb = skb_peek(&dp->tq)) == NULL) {
52238 dp->st_txq_refl_try++;
52239- if (spin_trylock(&_dev->xmit_lock)) {
52240+ if (netif_tx_trylock(_dev)) {
52241 dp->st_rxq_enter++;
52242 while ((skb = skb_dequeue(&dp->rq)) != NULL) {
52243 skb_queue_tail(&dp->tq, skb);
52244 dp->st_rx2tx_tran++;
52245 }
52246- spin_unlock(&_dev->xmit_lock);
52247+ netif_tx_unlock(_dev);
52248 } else {
52249 /* reschedule */
52250 dp->st_rxq_notenter++;
52251@@ -110,7 +110,7 @@
52252 }
52253 }
52254
52255- if (spin_trylock(&_dev->xmit_lock)) {
52256+ if (netif_tx_trylock(_dev)) {
52257 dp->st_rxq_check++;
52258 if ((skb = skb_peek(&dp->rq)) == NULL) {
52259 dp->tasklet_pending = 0;
52260@@ -118,10 +118,10 @@
52261 netif_wake_queue(_dev);
52262 } else {
52263 dp->st_rxq_rsch++;
52264- spin_unlock(&_dev->xmit_lock);
52265+ netif_tx_unlock(_dev);
52266 goto resched;
52267 }
52268- spin_unlock(&_dev->xmit_lock);
52269+ netif_tx_unlock(_dev);
52270 } else {
52271 resched:
52272 dp->tasklet_pending = 1;
52273diff -Nur linux-2.6.16.33-noxen/drivers/net/irda/vlsi_ir.c linux-2.6.16.33/drivers/net/irda/vlsi_ir.c
52274--- linux-2.6.16.33-noxen/drivers/net/irda/vlsi_ir.c 2006-11-22 18:06:31.000000000 +0000
52275+++ linux-2.6.16.33/drivers/net/irda/vlsi_ir.c 2007-05-23 21:00:01.000000000 +0000
52276@@ -959,7 +959,7 @@
52277 || (now.tv_sec==ready.tv_sec && now.tv_usec>=ready.tv_usec))
52278 break;
52279 udelay(100);
52280- /* must not sleep here - we are called under xmit_lock! */
52281+ /* must not sleep here - called under netif_tx_lock! */
52282 }
52283 }
52284
52285diff -Nur linux-2.6.16.33-noxen/drivers/net/ixgb/ixgb_main.c linux-2.6.16.33/drivers/net/ixgb/ixgb_main.c
52286--- linux-2.6.16.33-noxen/drivers/net/ixgb/ixgb_main.c 2006-11-22 18:06:31.000000000 +0000
52287+++ linux-2.6.16.33/drivers/net/ixgb/ixgb_main.c 2007-05-23 21:00:01.000000000 +0000
52288@@ -1163,7 +1163,7 @@
52289 uint16_t ipcse, tucse, mss;
52290 int err;
52291
52292- if(likely(skb_shinfo(skb)->tso_size)) {
52293+ if (likely(skb_is_gso(skb))) {
52294 if (skb_header_cloned(skb)) {
52295 err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
52296 if (err)
52297@@ -1171,7 +1171,7 @@
52298 }
52299
52300 hdr_len = ((skb->h.raw - skb->data) + (skb->h.th->doff << 2));
52301- mss = skb_shinfo(skb)->tso_size;
52302+ mss = skb_shinfo(skb)->gso_size;
52303 skb->nh.iph->tot_len = 0;
52304 skb->nh.iph->check = 0;
52305 skb->h.th->check = ~csum_tcpudp_magic(skb->nh.iph->saddr,
52306diff -Nur linux-2.6.16.33-noxen/drivers/net/loopback.c linux-2.6.16.33/drivers/net/loopback.c
52307--- linux-2.6.16.33-noxen/drivers/net/loopback.c 2006-11-22 18:06:31.000000000 +0000
52308+++ linux-2.6.16.33/drivers/net/loopback.c 2007-05-23 21:00:01.000000000 +0000
52309@@ -74,7 +74,7 @@
52310 struct iphdr *iph = skb->nh.iph;
52311 struct tcphdr *th = (struct tcphdr*)(skb->nh.raw + (iph->ihl * 4));
52312 unsigned int doffset = (iph->ihl + th->doff) * 4;
52313- unsigned int mtu = skb_shinfo(skb)->tso_size + doffset;
52314+ unsigned int mtu = skb_shinfo(skb)->gso_size + doffset;
52315 unsigned int offset = 0;
52316 u32 seq = ntohl(th->seq);
52317 u16 id = ntohs(iph->id);
52318@@ -139,7 +139,7 @@
52319 #endif
52320
52321 #ifdef LOOPBACK_TSO
52322- if (skb_shinfo(skb)->tso_size) {
52323+ if (skb_is_gso(skb)) {
52324 BUG_ON(skb->protocol != htons(ETH_P_IP));
52325 BUG_ON(skb->nh.iph->protocol != IPPROTO_TCP);
52326
52327diff -Nur linux-2.6.16.33-noxen/drivers/net/mv643xx_eth.c linux-2.6.16.33/drivers/net/mv643xx_eth.c
52328--- linux-2.6.16.33-noxen/drivers/net/mv643xx_eth.c 2006-11-22 18:06:31.000000000 +0000
52329+++ linux-2.6.16.33/drivers/net/mv643xx_eth.c 2007-05-23 21:00:01.000000000 +0000
52330@@ -1107,7 +1107,7 @@
52331
52332 #ifdef MV643XX_CHECKSUM_OFFLOAD_TX
52333 if (has_tiny_unaligned_frags(skb)) {
52334- if ((skb_linearize(skb, GFP_ATOMIC) != 0)) {
52335+ if (__skb_linearize(skb)) {
52336 stats->tx_dropped++;
52337 printk(KERN_DEBUG "%s: failed to linearize tiny "
52338 "unaligned fragment\n", dev->name);
52339diff -Nur linux-2.6.16.33-noxen/drivers/net/natsemi.c linux-2.6.16.33/drivers/net/natsemi.c
52340--- linux-2.6.16.33-noxen/drivers/net/natsemi.c 2006-11-22 18:06:31.000000000 +0000
52341+++ linux-2.6.16.33/drivers/net/natsemi.c 2007-05-23 21:00:01.000000000 +0000
52342@@ -323,12 +323,12 @@
52343 The rx process only runs in the interrupt handler. Access from outside
52344 the interrupt handler is only permitted after disable_irq().
52345
52346-The rx process usually runs under the dev->xmit_lock. If np->intr_tx_reap
52347+The rx process usually runs under the netif_tx_lock. If np->intr_tx_reap
52348 is set, then access is permitted under spin_lock_irq(&np->lock).
52349
52350 Thus configuration functions that want to access everything must call
52351 disable_irq(dev->irq);
52352- spin_lock_bh(dev->xmit_lock);
52353+ netif_tx_lock_bh(dev);
52354 spin_lock_irq(&np->lock);
52355
52356 IV. Notes
52357diff -Nur linux-2.6.16.33-noxen/drivers/net/r8169.c linux-2.6.16.33/drivers/net/r8169.c
52358--- linux-2.6.16.33-noxen/drivers/net/r8169.c 2006-11-22 18:06:31.000000000 +0000
52359+++ linux-2.6.16.33/drivers/net/r8169.c 2007-05-23 21:00:01.000000000 +0000
52360@@ -2171,7 +2171,7 @@
52361 static inline u32 rtl8169_tso_csum(struct sk_buff *skb, struct net_device *dev)
52362 {
52363 if (dev->features & NETIF_F_TSO) {
52364- u32 mss = skb_shinfo(skb)->tso_size;
52365+ u32 mss = skb_shinfo(skb)->gso_size;
52366
52367 if (mss)
52368 return LargeSend | ((mss & MSSMask) << MSSShift);
52369diff -Nur linux-2.6.16.33-noxen/drivers/net/s2io.c linux-2.6.16.33/drivers/net/s2io.c
52370--- linux-2.6.16.33-noxen/drivers/net/s2io.c 2006-11-22 18:06:31.000000000 +0000
52371+++ linux-2.6.16.33/drivers/net/s2io.c 2007-05-23 21:00:01.000000000 +0000
52372@@ -3522,8 +3522,8 @@
52373 txdp->Control_1 = 0;
52374 txdp->Control_2 = 0;
52375 #ifdef NETIF_F_TSO
52376- mss = skb_shinfo(skb)->tso_size;
52377- if (mss) {
52378+ mss = skb_shinfo(skb)->gso_size;
52379+ if (skb_shinfo(skb)->gso_type == SKB_GSO_TCPV4) {
52380 txdp->Control_1 |= TXD_TCP_LSO_EN;
52381 txdp->Control_1 |= TXD_TCP_LSO_MSS(mss);
52382 }
52383@@ -3543,10 +3543,10 @@
52384 }
52385
52386 frg_len = skb->len - skb->data_len;
52387- if (skb_shinfo(skb)->ufo_size) {
52388+ if (skb_shinfo(skb)->gso_type == SKB_GSO_UDPV4) {
52389 int ufo_size;
52390
52391- ufo_size = skb_shinfo(skb)->ufo_size;
52392+ ufo_size = skb_shinfo(skb)->gso_size;
52393 ufo_size &= ~7;
52394 txdp->Control_1 |= TXD_UFO_EN;
52395 txdp->Control_1 |= TXD_UFO_MSS(ufo_size);
52396@@ -3572,7 +3572,7 @@
52397 txdp->Host_Control = (unsigned long) skb;
52398 txdp->Control_1 |= TXD_BUFFER0_SIZE(frg_len);
52399
52400- if (skb_shinfo(skb)->ufo_size)
52401+ if (skb_shinfo(skb)->gso_type == SKB_GSO_UDPV4)
52402 txdp->Control_1 |= TXD_UFO_EN;
52403
52404 frg_cnt = skb_shinfo(skb)->nr_frags;
52405@@ -3587,12 +3587,12 @@
52406 (sp->pdev, frag->page, frag->page_offset,
52407 frag->size, PCI_DMA_TODEVICE);
52408 txdp->Control_1 = TXD_BUFFER0_SIZE(frag->size);
52409- if (skb_shinfo(skb)->ufo_size)
52410+ if (skb_shinfo(skb)->gso_type == SKB_GSO_UDPV4)
52411 txdp->Control_1 |= TXD_UFO_EN;
52412 }
52413 txdp->Control_1 |= TXD_GATHER_CODE_LAST;
52414
52415- if (skb_shinfo(skb)->ufo_size)
52416+ if (skb_shinfo(skb)->gso_type == SKB_GSO_UDPV4)
52417 frg_cnt++; /* as Txd0 was used for inband header */
52418
52419 tx_fifo = mac_control->tx_FIFO_start[queue];
52420@@ -3606,7 +3606,7 @@
52421 if (mss)
52422 val64 |= TX_FIFO_SPECIAL_FUNC;
52423 #endif
52424- if (skb_shinfo(skb)->ufo_size)
52425+ if (skb_shinfo(skb)->gso_type == SKB_GSO_UDPV4)
52426 val64 |= TX_FIFO_SPECIAL_FUNC;
52427 writeq(val64, &tx_fifo->List_Control);
52428
52429diff -Nur linux-2.6.16.33-noxen/drivers/net/sky2.c linux-2.6.16.33/drivers/net/sky2.c
52430--- linux-2.6.16.33-noxen/drivers/net/sky2.c 2006-11-22 18:06:31.000000000 +0000
52431+++ linux-2.6.16.33/drivers/net/sky2.c 2007-05-23 21:00:01.000000000 +0000
52432@@ -1141,7 +1141,7 @@
52433 count = sizeof(dma_addr_t) / sizeof(u32);
52434 count += skb_shinfo(skb)->nr_frags * count;
52435
52436- if (skb_shinfo(skb)->tso_size)
52437+ if (skb_is_gso(skb))
52438 ++count;
52439
52440 if (skb->ip_summed == CHECKSUM_HW)
52441@@ -1213,7 +1213,7 @@
52442 }
52443
52444 /* Check for TCP Segmentation Offload */
52445- mss = skb_shinfo(skb)->tso_size;
52446+ mss = skb_shinfo(skb)->gso_size;
52447 if (mss != 0) {
52448 /* just drop the packet if non-linear expansion fails */
52449 if (skb_header_cloned(skb) &&
52450diff -Nur linux-2.6.16.33-noxen/drivers/net/sky2.c~ linux-2.6.16.33/drivers/net/sky2.c~
52451--- linux-2.6.16.33-noxen/drivers/net/sky2.c~ 1970-01-01 00:00:00.000000000 +0000
52452+++ linux-2.6.16.33/drivers/net/sky2.c~ 2007-05-23 21:00:01.000000000 +0000
52453@@ -0,0 +1,3425 @@
52454+/*
52455+ * New driver for Marvell Yukon 2 chipset.
52456+ * Based on earlier sk98lin, and skge driver.
52457+ *
52458+ * This driver intentionally does not support all the features
52459+ * of the original driver such as link fail-over and link management because
52460+ * those should be done at higher levels.
52461+ *
52462+ * Copyright (C) 2005 Stephen Hemminger <shemminger@osdl.org>
52463+ *
52464+ * This program is free software; you can redistribute it and/or modify
52465+ * it under the terms of the GNU General Public License as published by
52466+ * the Free Software Foundation; either version 2 of the License, or
52467+ * (at your option) any later version.
52468+ *
52469+ * This program is distributed in the hope that it will be useful,
52470+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
52471+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
52472+ * GNU General Public License for more details.
52473+ *
52474+ * You should have received a copy of the GNU General Public License
52475+ * along with this program; if not, write to the Free Software
52476+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
52477+ */
52478+
52479+#include <linux/config.h>
52480+#include <linux/crc32.h>
52481+#include <linux/kernel.h>
52482+#include <linux/version.h>
52483+#include <linux/module.h>
52484+#include <linux/netdevice.h>
52485+#include <linux/dma-mapping.h>
52486+#include <linux/etherdevice.h>
52487+#include <linux/ethtool.h>
52488+#include <linux/pci.h>
52489+#include <linux/ip.h>
52490+#include <linux/tcp.h>
52491+#include <linux/in.h>
52492+#include <linux/delay.h>
52493+#include <linux/workqueue.h>
52494+#include <linux/if_vlan.h>
52495+#include <linux/prefetch.h>
52496+#include <linux/mii.h>
52497+
52498+#include <asm/irq.h>
52499+
52500+#if defined(CONFIG_VLAN_8021Q) || defined(CONFIG_VLAN_8021Q_MODULE)
52501+#define SKY2_VLAN_TAG_USED 1
52502+#endif
52503+
52504+#include "sky2.h"
52505+
52506+#define DRV_NAME "sky2"
52507+#define DRV_VERSION "0.15"
52508+#define PFX DRV_NAME " "
52509+
52510+/*
52511+ * The Yukon II chipset takes 64 bit command blocks (called list elements)
52512+ * that are organized into three (receive, transmit, status) different rings
52513+ * similar to Tigon3. A transmit can require several elements;
52514+ * a receive requires one (or two if using 64 bit dma).
52515+ */
52516+
52517+#define is_ec_a1(hw) \
52518+ unlikely((hw)->chip_id == CHIP_ID_YUKON_EC && \
52519+ (hw)->chip_rev == CHIP_REV_YU_EC_A1)
52520+
52521+#define RX_LE_SIZE 512
52522+#define RX_LE_BYTES (RX_LE_SIZE*sizeof(struct sky2_rx_le))
52523+#define RX_MAX_PENDING (RX_LE_SIZE/2 - 2)
52524+#define RX_DEF_PENDING RX_MAX_PENDING
52525+#define RX_SKB_ALIGN 8
52526+
52527+#define TX_RING_SIZE 512
52528+#define TX_DEF_PENDING (TX_RING_SIZE - 1)
52529+#define TX_MIN_PENDING 64
52530+#define MAX_SKB_TX_LE (4 + (sizeof(dma_addr_t)/sizeof(u32))*MAX_SKB_FRAGS)
52531+
52532+#define STATUS_RING_SIZE 2048 /* 2 ports * (TX + 2*RX) */
52533+#define STATUS_LE_BYTES (STATUS_RING_SIZE*sizeof(struct sky2_status_le))
52534+#define ETH_JUMBO_MTU 9000
52535+#define TX_WATCHDOG (5 * HZ)
52536+#define NAPI_WEIGHT 64
52537+#define PHY_RETRIES 1000
52538+
52539+static const u32 default_msg =
52540+ NETIF_MSG_DRV | NETIF_MSG_PROBE | NETIF_MSG_LINK
52541+ | NETIF_MSG_TIMER | NETIF_MSG_TX_ERR | NETIF_MSG_RX_ERR
52542+ | NETIF_MSG_IFUP | NETIF_MSG_IFDOWN;
52543+
52544+static int debug = -1; /* defaults above */
52545+module_param(debug, int, 0);
52546+MODULE_PARM_DESC(debug, "Debug level (0=none,...,16=all)");
52547+
52548+static int copybreak __read_mostly = 256;
52549+module_param(copybreak, int, 0);
52550+MODULE_PARM_DESC(copybreak, "Receive copy threshold");
52551+
52552+static const struct pci_device_id sky2_id_table[] = {
52553+ { PCI_DEVICE(PCI_VENDOR_ID_SYSKONNECT, 0x9000) },
52554+ { PCI_DEVICE(PCI_VENDOR_ID_SYSKONNECT, 0x9E00) },
52555+ { PCI_DEVICE(PCI_VENDOR_ID_DLINK, 0x4b00) },
52556+ { PCI_DEVICE(PCI_VENDOR_ID_DLINK, 0x4b01) },
52557+ { PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x4340) },
52558+ { PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x4341) },
52559+ { PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x4342) },
52560+ { PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x4343) },
52561+ { PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x4344) },
52562+ { PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x4345) },
52563+ { PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x4346) },
52564+ { PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x4347) },
52565+ { PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x4350) },
52566+ { PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x4351) },
52567+ { PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x4352) },
52568+ { PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x4360) },
52569+ { PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x4361) },
52570+ { PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x4362) },
52571+ { PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x4363) },
52572+ { 0 }
52573+};
52574+
52575+MODULE_DEVICE_TABLE(pci, sky2_id_table);
52576+
52577+/* Avoid conditionals by using array */
52578+static const unsigned txqaddr[] = { Q_XA1, Q_XA2 };
52579+static const unsigned rxqaddr[] = { Q_R1, Q_R2 };
52580+
52581+/* This driver supports yukon2 chipset only */
52582+static const char *yukon2_name[] = {
52583+ "XL", /* 0xb3 */
52584+ "EC Ultra", /* 0xb4 */
52585+ "UNKNOWN", /* 0xb5 */
52586+ "EC", /* 0xb6 */
52587+ "FE", /* 0xb7 */
52588+};
52589+
52590+/* Access to external PHY */
52591+static int gm_phy_write(struct sky2_hw *hw, unsigned port, u16 reg, u16 val)
52592+{
52593+ int i;
52594+
52595+ gma_write16(hw, port, GM_SMI_DATA, val);
52596+ gma_write16(hw, port, GM_SMI_CTRL,
52597+ GM_SMI_CT_PHY_AD(PHY_ADDR_MARV) | GM_SMI_CT_REG_AD(reg));
52598+
52599+ for (i = 0; i < PHY_RETRIES; i++) {
52600+ if (!(gma_read16(hw, port, GM_SMI_CTRL) & GM_SMI_CT_BUSY))
52601+ return 0;
52602+ udelay(1);
52603+ }
52604+
52605+ printk(KERN_WARNING PFX "%s: phy write timeout\n", hw->dev[port]->name);
52606+ return -ETIMEDOUT;
52607+}
52608+
52609+static int __gm_phy_read(struct sky2_hw *hw, unsigned port, u16 reg, u16 *val)
52610+{
52611+ int i;
52612+
52613+ gma_write16(hw, port, GM_SMI_CTRL, GM_SMI_CT_PHY_AD(PHY_ADDR_MARV)
52614+ | GM_SMI_CT_REG_AD(reg) | GM_SMI_CT_OP_RD);
52615+
52616+ for (i = 0; i < PHY_RETRIES; i++) {
52617+ if (gma_read16(hw, port, GM_SMI_CTRL) & GM_SMI_CT_RD_VAL) {
52618+ *val = gma_read16(hw, port, GM_SMI_DATA);
52619+ return 0;
52620+ }
52621+
52622+ udelay(1);
52623+ }
52624+
52625+ return -ETIMEDOUT;
52626+}
52627+
52628+static u16 gm_phy_read(struct sky2_hw *hw, unsigned port, u16 reg)
52629+{
52630+ u16 v;
52631+
52632+ if (__gm_phy_read(hw, port, reg, &v) != 0)
52633+ printk(KERN_WARNING PFX "%s: phy read timeout\n", hw->dev[port]->name);
52634+ return v;
52635+}
52636+
52637+static int sky2_set_power_state(struct sky2_hw *hw, pci_power_t state)
52638+{
52639+ u16 power_control;
52640+ u32 reg1;
52641+ int vaux;
52642+ int ret = 0;
52643+
52644+ pr_debug("sky2_set_power_state %d\n", state);
52645+ sky2_write8(hw, B2_TST_CTRL1, TST_CFG_WRITE_ON);
52646+
52647+ power_control = sky2_pci_read16(hw, hw->pm_cap + PCI_PM_PMC);
52648+ vaux = (sky2_read16(hw, B0_CTST) & Y2_VAUX_AVAIL) &&
52649+ (power_control & PCI_PM_CAP_PME_D3cold);
52650+
52651+ power_control = sky2_pci_read16(hw, hw->pm_cap + PCI_PM_CTRL);
52652+
52653+ power_control |= PCI_PM_CTRL_PME_STATUS;
52654+ power_control &= ~(PCI_PM_CTRL_STATE_MASK);
52655+
52656+ switch (state) {
52657+ case PCI_D0:
52658+ /* switch power to VCC (WA for VAUX problem) */
52659+ sky2_write8(hw, B0_POWER_CTRL,
52660+ PC_VAUX_ENA | PC_VCC_ENA | PC_VAUX_OFF | PC_VCC_ON);
52661+
52662+		/* disable Core Clock Division */
52663+ sky2_write32(hw, B2_Y2_CLK_CTRL, Y2_CLK_DIV_DIS);
52664+
52665+ if (hw->chip_id == CHIP_ID_YUKON_XL && hw->chip_rev > 1)
52666+ /* enable bits are inverted */
52667+ sky2_write8(hw, B2_Y2_CLK_GATE,
52668+ Y2_PCI_CLK_LNK1_DIS | Y2_COR_CLK_LNK1_DIS |
52669+ Y2_CLK_GAT_LNK1_DIS | Y2_PCI_CLK_LNK2_DIS |
52670+ Y2_COR_CLK_LNK2_DIS | Y2_CLK_GAT_LNK2_DIS);
52671+ else
52672+ sky2_write8(hw, B2_Y2_CLK_GATE, 0);
52673+
52674+ /* Turn off phy power saving */
52675+ reg1 = sky2_pci_read32(hw, PCI_DEV_REG1);
52676+ reg1 &= ~(PCI_Y2_PHY1_POWD | PCI_Y2_PHY2_POWD);
52677+
52678+		/* this XL revision appears to have these power bits reversed */
52679+ if (hw->chip_id == CHIP_ID_YUKON_XL && hw->chip_rev > 1) {
52680+ reg1 |= PCI_Y2_PHY1_COMA;
52681+ if (hw->ports > 1)
52682+ reg1 |= PCI_Y2_PHY2_COMA;
52683+ }
52684+
52685+ if (hw->chip_id == CHIP_ID_YUKON_EC_U) {
52686+ sky2_pci_write32(hw, PCI_DEV_REG3, 0);
52687+ reg1 = sky2_pci_read32(hw, PCI_DEV_REG4);
52688+ reg1 &= P_ASPM_CONTROL_MSK;
52689+ sky2_pci_write32(hw, PCI_DEV_REG4, reg1);
52690+ sky2_pci_write32(hw, PCI_DEV_REG5, 0);
52691+ }
52692+
52693+ sky2_pci_write32(hw, PCI_DEV_REG1, reg1);
52694+
52695+ break;
52696+
52697+ case PCI_D3hot:
52698+ case PCI_D3cold:
52699+ /* Turn on phy power saving */
52700+ reg1 = sky2_pci_read32(hw, PCI_DEV_REG1);
52701+ if (hw->chip_id == CHIP_ID_YUKON_XL && hw->chip_rev > 1)
52702+ reg1 &= ~(PCI_Y2_PHY1_POWD | PCI_Y2_PHY2_POWD);
52703+ else
52704+ reg1 |= (PCI_Y2_PHY1_POWD | PCI_Y2_PHY2_POWD);
52705+ sky2_pci_write32(hw, PCI_DEV_REG1, reg1);
52706+
52707+ if (hw->chip_id == CHIP_ID_YUKON_XL && hw->chip_rev > 1)
52708+ sky2_write8(hw, B2_Y2_CLK_GATE, 0);
52709+ else
52710+ /* enable bits are inverted */
52711+ sky2_write8(hw, B2_Y2_CLK_GATE,
52712+ Y2_PCI_CLK_LNK1_DIS | Y2_COR_CLK_LNK1_DIS |
52713+ Y2_CLK_GAT_LNK1_DIS | Y2_PCI_CLK_LNK2_DIS |
52714+ Y2_COR_CLK_LNK2_DIS | Y2_CLK_GAT_LNK2_DIS);
52715+
52716+ /* switch power to VAUX */
52717+ if (vaux && state != PCI_D3cold)
52718+ sky2_write8(hw, B0_POWER_CTRL,
52719+ (PC_VAUX_ENA | PC_VCC_ENA |
52720+ PC_VAUX_ON | PC_VCC_OFF));
52721+ break;
52722+ default:
52723+ printk(KERN_ERR PFX "Unknown power state %d\n", state);
52724+ ret = -1;
52725+ }
52726+
52727+ sky2_pci_write16(hw, hw->pm_cap + PCI_PM_CTRL, power_control);
52728+ sky2_write8(hw, B2_TST_CTRL1, TST_CFG_WRITE_OFF);
52729+ return ret;
52730+}
52731+
52732+static void sky2_phy_reset(struct sky2_hw *hw, unsigned port)
52733+{
52734+ u16 reg;
52735+
52736+ /* disable all GMAC IRQ's */
52737+ sky2_write8(hw, SK_REG(port, GMAC_IRQ_MSK), 0);
52738+ /* disable PHY IRQs */
52739+ gm_phy_write(hw, port, PHY_MARV_INT_MASK, 0);
52740+
52741+ gma_write16(hw, port, GM_MC_ADDR_H1, 0); /* clear MC hash */
52742+ gma_write16(hw, port, GM_MC_ADDR_H2, 0);
52743+ gma_write16(hw, port, GM_MC_ADDR_H3, 0);
52744+ gma_write16(hw, port, GM_MC_ADDR_H4, 0);
52745+
52746+ reg = gma_read16(hw, port, GM_RX_CTRL);
52747+ reg |= GM_RXCR_UCF_ENA | GM_RXCR_MCF_ENA;
52748+ gma_write16(hw, port, GM_RX_CTRL, reg);
52749+}
52750+
52751+static void sky2_phy_init(struct sky2_hw *hw, unsigned port)
52752+{
52753+ struct sky2_port *sky2 = netdev_priv(hw->dev[port]);
52754+ u16 ctrl, ct1000, adv, pg, ledctrl, ledover;
52755+
52756+ if (sky2->autoneg == AUTONEG_ENABLE && hw->chip_id != CHIP_ID_YUKON_XL) {
52757+ u16 ectrl = gm_phy_read(hw, port, PHY_MARV_EXT_CTRL);
52758+
52759+ ectrl &= ~(PHY_M_EC_M_DSC_MSK | PHY_M_EC_S_DSC_MSK |
52760+ PHY_M_EC_MAC_S_MSK);
52761+ ectrl |= PHY_M_EC_MAC_S(MAC_TX_CLK_25_MHZ);
52762+
52763+ if (hw->chip_id == CHIP_ID_YUKON_EC)
52764+ ectrl |= PHY_M_EC_DSC_2(2) | PHY_M_EC_DOWN_S_ENA;
52765+ else
52766+ ectrl |= PHY_M_EC_M_DSC(2) | PHY_M_EC_S_DSC(3);
52767+
52768+ gm_phy_write(hw, port, PHY_MARV_EXT_CTRL, ectrl);
52769+ }
52770+
52771+ ctrl = gm_phy_read(hw, port, PHY_MARV_PHY_CTRL);
52772+ if (sky2_is_copper(hw)) {
52773+ if (hw->chip_id == CHIP_ID_YUKON_FE) {
52774+ /* enable automatic crossover */
52775+ ctrl |= PHY_M_PC_MDI_XMODE(PHY_M_PC_ENA_AUTO) >> 1;
52776+ } else {
52777+ /* disable energy detect */
52778+ ctrl &= ~PHY_M_PC_EN_DET_MSK;
52779+
52780+ /* enable automatic crossover */
52781+ ctrl |= PHY_M_PC_MDI_XMODE(PHY_M_PC_ENA_AUTO);
52782+
52783+ if (sky2->autoneg == AUTONEG_ENABLE &&
52784+ hw->chip_id == CHIP_ID_YUKON_XL) {
52785+ ctrl &= ~PHY_M_PC_DSC_MSK;
52786+ ctrl |= PHY_M_PC_DSC(2) | PHY_M_PC_DOWN_S_ENA;
52787+ }
52788+ }
52789+ } else {
52790+ /* workaround for deviation #4.88 (CRC errors) */
52791+ /* disable Automatic Crossover */
52792+
52793+ ctrl &= ~PHY_M_PC_MDIX_MSK;
52794+ }
52795+
52796+ gm_phy_write(hw, port, PHY_MARV_PHY_CTRL, ctrl);
52797+
52798+ /* special setup for PHY 88E1112 Fiber */
52799+ if (hw->chip_id == CHIP_ID_YUKON_XL && !sky2_is_copper(hw)) {
52800+ pg = gm_phy_read(hw, port, PHY_MARV_EXT_ADR);
52801+
52802+ /* Fiber: select 1000BASE-X only mode MAC Specific Ctrl Reg. */
52803+ gm_phy_write(hw, port, PHY_MARV_EXT_ADR, 2);
52804+ ctrl = gm_phy_read(hw, port, PHY_MARV_PHY_CTRL);
52805+ ctrl &= ~PHY_M_MAC_MD_MSK;
52806+ ctrl |= PHY_M_MAC_MODE_SEL(PHY_M_MAC_MD_1000BX);
52807+ gm_phy_write(hw, port, PHY_MARV_PHY_CTRL, ctrl);
52808+
52809+ if (hw->pmd_type == 'P') {
52810+ /* select page 1 to access Fiber registers */
52811+ gm_phy_write(hw, port, PHY_MARV_EXT_ADR, 1);
52812+
52813+ /* for SFP-module set SIGDET polarity to low */
52814+ ctrl = gm_phy_read(hw, port, PHY_MARV_PHY_CTRL);
52815+ ctrl |= PHY_M_FIB_SIGD_POL;
52816+ gm_phy_write(hw, port, PHY_MARV_CTRL, ctrl);
52817+ }
52818+
52819+ gm_phy_write(hw, port, PHY_MARV_EXT_ADR, pg);
52820+ }
52821+
52822+ ctrl = gm_phy_read(hw, port, PHY_MARV_CTRL);
52823+ if (sky2->autoneg == AUTONEG_DISABLE)
52824+ ctrl &= ~PHY_CT_ANE;
52825+ else
52826+ ctrl |= PHY_CT_ANE;
52827+
52828+ ctrl |= PHY_CT_RESET;
52829+ gm_phy_write(hw, port, PHY_MARV_CTRL, ctrl);
52830+
52831+ ctrl = 0;
52832+ ct1000 = 0;
52833+ adv = PHY_AN_CSMA;
52834+
52835+ if (sky2->autoneg == AUTONEG_ENABLE) {
52836+ if (sky2_is_copper(hw)) {
52837+ if (sky2->advertising & ADVERTISED_1000baseT_Full)
52838+ ct1000 |= PHY_M_1000C_AFD;
52839+ if (sky2->advertising & ADVERTISED_1000baseT_Half)
52840+ ct1000 |= PHY_M_1000C_AHD;
52841+ if (sky2->advertising & ADVERTISED_100baseT_Full)
52842+ adv |= PHY_M_AN_100_FD;
52843+ if (sky2->advertising & ADVERTISED_100baseT_Half)
52844+ adv |= PHY_M_AN_100_HD;
52845+ if (sky2->advertising & ADVERTISED_10baseT_Full)
52846+ adv |= PHY_M_AN_10_FD;
52847+ if (sky2->advertising & ADVERTISED_10baseT_Half)
52848+ adv |= PHY_M_AN_10_HD;
52849+ } else { /* special defines for FIBER (88E1040S only) */
52850+ if (sky2->advertising & ADVERTISED_1000baseT_Full)
52851+ adv |= PHY_M_AN_1000X_AFD;
52852+ if (sky2->advertising & ADVERTISED_1000baseT_Half)
52853+ adv |= PHY_M_AN_1000X_AHD;
52854+ }
52855+
52856+ /* Set Flow-control capabilities */
52857+ if (sky2->tx_pause && sky2->rx_pause)
52858+ adv |= PHY_AN_PAUSE_CAP; /* symmetric */
52859+ else if (sky2->rx_pause && !sky2->tx_pause)
52860+ adv |= PHY_AN_PAUSE_ASYM | PHY_AN_PAUSE_CAP;
52861+ else if (!sky2->rx_pause && sky2->tx_pause)
52862+ adv |= PHY_AN_PAUSE_ASYM; /* local */
52863+
52864+ /* Restart Auto-negotiation */
52865+ ctrl |= PHY_CT_ANE | PHY_CT_RE_CFG;
52866+ } else {
52867+ /* forced speed/duplex settings */
52868+ ct1000 = PHY_M_1000C_MSE;
52869+
52870+ if (sky2->duplex == DUPLEX_FULL)
52871+ ctrl |= PHY_CT_DUP_MD;
52872+
52873+ switch (sky2->speed) {
52874+ case SPEED_1000:
52875+ ctrl |= PHY_CT_SP1000;
52876+ break;
52877+ case SPEED_100:
52878+ ctrl |= PHY_CT_SP100;
52879+ break;
52880+ }
52881+
52882+ ctrl |= PHY_CT_RESET;
52883+ }
52884+
52885+ if (hw->chip_id != CHIP_ID_YUKON_FE)
52886+ gm_phy_write(hw, port, PHY_MARV_1000T_CTRL, ct1000);
52887+
52888+ gm_phy_write(hw, port, PHY_MARV_AUNE_ADV, adv);
52889+ gm_phy_write(hw, port, PHY_MARV_CTRL, ctrl);
52890+
52891+ /* Setup Phy LED's */
52892+ ledctrl = PHY_M_LED_PULS_DUR(PULS_170MS);
52893+ ledover = 0;
52894+
52895+ switch (hw->chip_id) {
52896+ case CHIP_ID_YUKON_FE:
52897+ /* on 88E3082 these bits are at 11..9 (shifted left) */
52898+ ledctrl |= PHY_M_LED_BLINK_RT(BLINK_84MS) << 1;
52899+
52900+ ctrl = gm_phy_read(hw, port, PHY_MARV_FE_LED_PAR);
52901+
52902+ /* delete ACT LED control bits */
52903+ ctrl &= ~PHY_M_FELP_LED1_MSK;
52904+ /* change ACT LED control to blink mode */
52905+ ctrl |= PHY_M_FELP_LED1_CTRL(LED_PAR_CTRL_ACT_BL);
52906+ gm_phy_write(hw, port, PHY_MARV_FE_LED_PAR, ctrl);
52907+ break;
52908+
52909+ case CHIP_ID_YUKON_XL:
52910+ pg = gm_phy_read(hw, port, PHY_MARV_EXT_ADR);
52911+
52912+ /* select page 3 to access LED control register */
52913+ gm_phy_write(hw, port, PHY_MARV_EXT_ADR, 3);
52914+
52915+ /* set LED Function Control register */
52916+ gm_phy_write(hw, port, PHY_MARV_PHY_CTRL, (PHY_M_LEDC_LOS_CTRL(1) | /* LINK/ACT */
52917+ PHY_M_LEDC_INIT_CTRL(7) | /* 10 Mbps */
52918+ PHY_M_LEDC_STA1_CTRL(7) | /* 100 Mbps */
52919+ PHY_M_LEDC_STA0_CTRL(7))); /* 1000 Mbps */
52920+
52921+ /* set Polarity Control register */
52922+ gm_phy_write(hw, port, PHY_MARV_PHY_STAT,
52923+ (PHY_M_POLC_LS1_P_MIX(4) |
52924+ PHY_M_POLC_IS0_P_MIX(4) |
52925+ PHY_M_POLC_LOS_CTRL(2) |
52926+ PHY_M_POLC_INIT_CTRL(2) |
52927+ PHY_M_POLC_STA1_CTRL(2) |
52928+ PHY_M_POLC_STA0_CTRL(2)));
52929+
52930+ /* restore page register */
52931+ gm_phy_write(hw, port, PHY_MARV_EXT_ADR, pg);
52932+ break;
52933+
52934+ default:
52935+ /* set Tx LED (LED_TX) to blink mode on Rx OR Tx activity */
52936+ ledctrl |= PHY_M_LED_BLINK_RT(BLINK_84MS) | PHY_M_LEDC_TX_CTRL;
52937+ /* turn off the Rx LED (LED_RX) */
52938+ ledover |= PHY_M_LED_MO_RX(MO_LED_OFF);
52939+ }
52940+
52941+ if (hw->chip_id == CHIP_ID_YUKON_EC_U && hw->chip_rev >= 2) {
52942+ /* apply fixes in PHY AFE */
52943+ gm_phy_write(hw, port, 22, 255);
52944+ /* increase differential signal amplitude in 10BASE-T */
52945+ gm_phy_write(hw, port, 24, 0xaa99);
52946+ gm_phy_write(hw, port, 23, 0x2011);
52947+
52948+ /* fix for IEEE A/B Symmetry failure in 1000BASE-T */
52949+ gm_phy_write(hw, port, 24, 0xa204);
52950+ gm_phy_write(hw, port, 23, 0x2002);
52951+
52952+ /* set page register to 0 */
52953+ gm_phy_write(hw, port, 22, 0);
52954+ } else {
52955+ gm_phy_write(hw, port, PHY_MARV_LED_CTRL, ledctrl);
52956+
52957+ if (sky2->autoneg == AUTONEG_DISABLE || sky2->speed == SPEED_100) {
52958+ /* turn on 100 Mbps LED (LED_LINK100) */
52959+ ledover |= PHY_M_LED_MO_100(MO_LED_ON);
52960+ }
52961+
52962+ if (ledover)
52963+ gm_phy_write(hw, port, PHY_MARV_LED_OVER, ledover);
52964+
52965+ }
52966+ /* Enable phy interrupt on auto-negotiation complete (or link up) */
52967+ if (sky2->autoneg == AUTONEG_ENABLE)
52968+ gm_phy_write(hw, port, PHY_MARV_INT_MASK, PHY_M_IS_AN_COMPL);
52969+ else
52970+ gm_phy_write(hw, port, PHY_MARV_INT_MASK, PHY_M_DEF_MSK);
52971+}
52972+
52973+/* Force a renegotiation */
52974+static void sky2_phy_reinit(struct sky2_port *sky2)
52975+{
52976+ down(&sky2->phy_sema);
52977+ sky2_phy_init(sky2->hw, sky2->port);
52978+ up(&sky2->phy_sema);
52979+}
52980+
52981+static void sky2_mac_init(struct sky2_hw *hw, unsigned port)
52982+{
52983+ struct sky2_port *sky2 = netdev_priv(hw->dev[port]);
52984+ u16 reg;
52985+ int i;
52986+ const u8 *addr = hw->dev[port]->dev_addr;
52987+
52988+ sky2_write32(hw, SK_REG(port, GPHY_CTRL), GPC_RST_SET);
52989+ sky2_write32(hw, SK_REG(port, GPHY_CTRL), GPC_RST_CLR|GPC_ENA_PAUSE);
52990+
52991+ sky2_write8(hw, SK_REG(port, GMAC_CTRL), GMC_RST_CLR);
52992+
52993+ if (hw->chip_id == CHIP_ID_YUKON_XL && hw->chip_rev == 0 && port == 1) {
52994+ /* WA DEV_472 -- looks like crossed wires on port 2 */
52995+ /* clear GMAC 1 Control reset */
52996+ sky2_write8(hw, SK_REG(0, GMAC_CTRL), GMC_RST_CLR);
52997+ do {
52998+ sky2_write8(hw, SK_REG(1, GMAC_CTRL), GMC_RST_SET);
52999+ sky2_write8(hw, SK_REG(1, GMAC_CTRL), GMC_RST_CLR);
53000+ } while (gm_phy_read(hw, 1, PHY_MARV_ID0) != PHY_MARV_ID0_VAL ||
53001+ gm_phy_read(hw, 1, PHY_MARV_ID1) != PHY_MARV_ID1_Y2 ||
53002+ gm_phy_read(hw, 1, PHY_MARV_INT_MASK) != 0);
53003+ }
53004+
53005+ if (sky2->autoneg == AUTONEG_DISABLE) {
53006+ reg = gma_read16(hw, port, GM_GP_CTRL);
53007+ reg |= GM_GPCR_AU_ALL_DIS;
53008+ gma_write16(hw, port, GM_GP_CTRL, reg);
53009+ gma_read16(hw, port, GM_GP_CTRL);
53010+
53011+ switch (sky2->speed) {
53012+ case SPEED_1000:
53013+ reg &= ~GM_GPCR_SPEED_100;
53014+ reg |= GM_GPCR_SPEED_1000;
53015+ break;
53016+ case SPEED_100:
53017+ reg &= ~GM_GPCR_SPEED_1000;
53018+ reg |= GM_GPCR_SPEED_100;
53019+ break;
53020+ case SPEED_10:
53021+ reg &= ~(GM_GPCR_SPEED_1000 | GM_GPCR_SPEED_100);
53022+ break;
53023+ }
53024+
53025+ if (sky2->duplex == DUPLEX_FULL)
53026+ reg |= GM_GPCR_DUP_FULL;
53027+ } else
53028+ reg = GM_GPCR_SPEED_1000 | GM_GPCR_SPEED_100 | GM_GPCR_DUP_FULL;
53029+
53030+ if (!sky2->tx_pause && !sky2->rx_pause) {
53031+ sky2_write32(hw, SK_REG(port, GMAC_CTRL), GMC_PAUSE_OFF);
53032+ reg |=
53033+ GM_GPCR_FC_TX_DIS | GM_GPCR_FC_RX_DIS | GM_GPCR_AU_FCT_DIS;
53034+ } else if (sky2->tx_pause && !sky2->rx_pause) {
53035+ /* disable Rx flow-control */
53036+ reg |= GM_GPCR_FC_RX_DIS | GM_GPCR_AU_FCT_DIS;
53037+ }
53038+
53039+ gma_write16(hw, port, GM_GP_CTRL, reg);
53040+
53041+ sky2_read16(hw, SK_REG(port, GMAC_IRQ_SRC));
53042+
53043+ down(&sky2->phy_sema);
53044+ sky2_phy_init(hw, port);
53045+ up(&sky2->phy_sema);
53046+
53047+ /* MIB clear */
53048+ reg = gma_read16(hw, port, GM_PHY_ADDR);
53049+ gma_write16(hw, port, GM_PHY_ADDR, reg | GM_PAR_MIB_CLR);
53050+
53051+ for (i = GM_MIB_CNT_BASE; i <= GM_MIB_CNT_END; i += 4)
53052+ gma_read16(hw, port, i);
53053+ gma_write16(hw, port, GM_PHY_ADDR, reg);
53054+
53055+ /* transmit control */
53056+ gma_write16(hw, port, GM_TX_CTRL, TX_COL_THR(TX_COL_DEF));
53057+
53058+ /* receive control reg: unicast + multicast + no FCS */
53059+ gma_write16(hw, port, GM_RX_CTRL,
53060+ GM_RXCR_UCF_ENA | GM_RXCR_CRC_DIS | GM_RXCR_MCF_ENA);
53061+
53062+ /* transmit flow control */
53063+ gma_write16(hw, port, GM_TX_FLOW_CTRL, 0xffff);
53064+
53065+ /* transmit parameter */
53066+ gma_write16(hw, port, GM_TX_PARAM,
53067+ TX_JAM_LEN_VAL(TX_JAM_LEN_DEF) |
53068+ TX_JAM_IPG_VAL(TX_JAM_IPG_DEF) |
53069+ TX_IPG_JAM_DATA(TX_IPG_JAM_DEF) |
53070+ TX_BACK_OFF_LIM(TX_BOF_LIM_DEF));
53071+
53072+ /* serial mode register */
53073+ reg = DATA_BLIND_VAL(DATA_BLIND_DEF) |
53074+ GM_SMOD_VLAN_ENA | IPG_DATA_VAL(IPG_DATA_DEF);
53075+
53076+ if (hw->dev[port]->mtu > ETH_DATA_LEN)
53077+ reg |= GM_SMOD_JUMBO_ENA;
53078+
53079+ gma_write16(hw, port, GM_SERIAL_MODE, reg);
53080+
53081+ /* virtual address for data */
53082+ gma_set_addr(hw, port, GM_SRC_ADDR_2L, addr);
53083+
53084+ /* physical address: used for pause frames */
53085+ gma_set_addr(hw, port, GM_SRC_ADDR_1L, addr);
53086+
53087+ /* ignore counter overflows */
53088+ gma_write16(hw, port, GM_TX_IRQ_MSK, 0);
53089+ gma_write16(hw, port, GM_RX_IRQ_MSK, 0);
53090+ gma_write16(hw, port, GM_TR_IRQ_MSK, 0);
53091+
53092+ /* Configure Rx MAC FIFO */
53093+ sky2_write8(hw, SK_REG(port, RX_GMF_CTRL_T), GMF_RST_CLR);
53094+ sky2_write32(hw, SK_REG(port, RX_GMF_CTRL_T),
53095+ GMF_OPER_ON | GMF_RX_F_FL_ON);
53096+
53097+ /* Flush Rx MAC FIFO on any flow control or error */
53098+ sky2_write16(hw, SK_REG(port, RX_GMF_FL_MSK), GMR_FS_ANY_ERR);
53099+
53100+ /* Set threshold to 0xa (64 bytes)
53101+ * ASF disabled so no need to do WA dev #4.30
53102+ */
53103+ sky2_write16(hw, SK_REG(port, RX_GMF_FL_THR), RX_GMF_FL_THR_DEF);
53104+
53105+ /* Configure Tx MAC FIFO */
53106+ sky2_write8(hw, SK_REG(port, TX_GMF_CTRL_T), GMF_RST_CLR);
53107+ sky2_write16(hw, SK_REG(port, TX_GMF_CTRL_T), GMF_OPER_ON);
53108+
53109+ if (hw->chip_id == CHIP_ID_YUKON_EC_U) {
53110+ sky2_write8(hw, SK_REG(port, RX_GMF_LP_THR), 768/8);
53111+ sky2_write8(hw, SK_REG(port, RX_GMF_UP_THR), 1024/8);
53112+ if (hw->dev[port]->mtu > ETH_DATA_LEN) {
53113+ /* set Tx GMAC FIFO Almost Empty Threshold */
53114+ sky2_write32(hw, SK_REG(port, TX_GMF_AE_THR), 0x180);
53115+ /* Disable Store & Forward mode for TX */
53116+ sky2_write32(hw, SK_REG(port, TX_GMF_CTRL_T), TX_STFW_DIS);
53117+ }
53118+ }
53119+
53120+}
53121+
53122+/* Assign RAM buffer allocation.
53123+ * start and end are in units of 4K bytes;
53124+ * the ram registers are in units of 64-bit words.
53125+ */
53126+static void sky2_ramset(struct sky2_hw *hw, u16 q, u8 startk, u8 endk)
53127+{
53128+ u32 start, end;
53129+
53130+ start = startk * 4096/8;
53131+ end = (endk * 4096/8) - 1;
53132+
53133+ sky2_write8(hw, RB_ADDR(q, RB_CTRL), RB_RST_CLR);
53134+ sky2_write32(hw, RB_ADDR(q, RB_START), start);
53135+ sky2_write32(hw, RB_ADDR(q, RB_END), end);
53136+ sky2_write32(hw, RB_ADDR(q, RB_WP), start);
53137+ sky2_write32(hw, RB_ADDR(q, RB_RP), start);
53138+
53139+ if (q == Q_R1 || q == Q_R2) {
53140+ u32 space = (endk - startk) * 4096/8;
53141+ u32 tp = space - space/4;
53142+
53143+		/* On receive queues, set the thresholds:
53144+		 * give the receiver priority when > 3/4 full,
53145+		 * send pause when down to 2K
53146+ */
53147+ sky2_write32(hw, RB_ADDR(q, RB_RX_UTHP), tp);
53148+ sky2_write32(hw, RB_ADDR(q, RB_RX_LTHP), space/2);
53149+
53150+ tp = space - 2048/8;
53151+ sky2_write32(hw, RB_ADDR(q, RB_RX_UTPP), tp);
53152+ sky2_write32(hw, RB_ADDR(q, RB_RX_LTPP), space/4);
53153+ } else {
53154+		/* Enable store & forward on Tx queues because
53155+		 * the Tx FIFO is only 1K on Yukon
53156+ */
53157+ sky2_write8(hw, RB_ADDR(q, RB_CTRL), RB_ENA_STFWD);
53158+ }
53159+
53160+ sky2_write8(hw, RB_ADDR(q, RB_CTRL), RB_ENA_OP_MD);
53161+ sky2_read8(hw, RB_ADDR(q, RB_CTRL));
53162+}
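/*
 * Rough worked example of the unit conversion (illustrative only):
 * sky2_ramset(hw, Q_R1, 0, 4) covers a 16 KiB receive buffer, i.e.
 *   start = 0 * 4096/8 = 0 and end = 4 * 4096/8 - 1 = 2047 (64-bit words),
 *   space = 2048 words, so RB_RX_UTHP = 2048 - 512 = 1536 and
 *   RB_RX_LTHP = 1024 (receiver priority above 3/4 full), while
 *   RB_RX_UTPP = 2048 - 256 = 1792 and RB_RX_LTPP = 512
 *   (pause sent when roughly 2 KiB of space remains).
 */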
53163+
53164+/* Setup Bus Memory Interface */
53165+static void sky2_qset(struct sky2_hw *hw, u16 q)
53166+{
53167+ sky2_write32(hw, Q_ADDR(q, Q_CSR), BMU_CLR_RESET);
53168+ sky2_write32(hw, Q_ADDR(q, Q_CSR), BMU_OPER_INIT);
53169+ sky2_write32(hw, Q_ADDR(q, Q_CSR), BMU_FIFO_OP_ON);
53170+ sky2_write32(hw, Q_ADDR(q, Q_WM), BMU_WM_DEFAULT);
53171+}
53172+
53173+/* Setup prefetch unit registers. This is the interface between
53174+ * hardware and driver list elements
53175+ */
53176+static void sky2_prefetch_init(struct sky2_hw *hw, u32 qaddr,
53177+ u64 addr, u32 last)
53178+{
53179+ sky2_write32(hw, Y2_QADDR(qaddr, PREF_UNIT_CTRL), PREF_UNIT_RST_SET);
53180+ sky2_write32(hw, Y2_QADDR(qaddr, PREF_UNIT_CTRL), PREF_UNIT_RST_CLR);
53181+ sky2_write32(hw, Y2_QADDR(qaddr, PREF_UNIT_ADDR_HI), addr >> 32);
53182+ sky2_write32(hw, Y2_QADDR(qaddr, PREF_UNIT_ADDR_LO), (u32) addr);
53183+ sky2_write16(hw, Y2_QADDR(qaddr, PREF_UNIT_LAST_IDX), last);
53184+ sky2_write32(hw, Y2_QADDR(qaddr, PREF_UNIT_CTRL), PREF_UNIT_OP_ON);
53185+
53186+ sky2_read32(hw, Y2_QADDR(qaddr, PREF_UNIT_CTRL));
53187+}
53188+
53189+static inline struct sky2_tx_le *get_tx_le(struct sky2_port *sky2)
53190+{
53191+ struct sky2_tx_le *le = sky2->tx_le + sky2->tx_prod;
53192+
53193+ sky2->tx_prod = (sky2->tx_prod + 1) % TX_RING_SIZE;
53194+ return le;
53195+}
53196+
53197+/*
53198+ * This is workaround code taken from the SysKonnect sk98lin driver
53199+ * to deal with a chip bug on Yukon EC rev 0 in the wraparound case.
53200+ */
53201+static void sky2_put_idx(struct sky2_hw *hw, unsigned q,
53202+ u16 idx, u16 *last, u16 size)
53203+{
53204+ wmb();
53205+ if (is_ec_a1(hw) && idx < *last) {
53206+ u16 hwget = sky2_read16(hw, Y2_QADDR(q, PREF_UNIT_GET_IDX));
53207+
53208+ if (hwget == 0) {
53209+ /* Start prefetching again */
53210+ sky2_write8(hw, Y2_QADDR(q, PREF_UNIT_FIFO_WM), 0xe0);
53211+ goto setnew;
53212+ }
53213+
53214+ if (hwget == size - 1) {
53215+ /* set watermark to one list element */
53216+ sky2_write8(hw, Y2_QADDR(q, PREF_UNIT_FIFO_WM), 8);
53217+
53218+ /* set put index to first list element */
53219+ sky2_write16(hw, Y2_QADDR(q, PREF_UNIT_PUT_IDX), 0);
53220+ } else /* have hardware go to end of list */
53221+ sky2_write16(hw, Y2_QADDR(q, PREF_UNIT_PUT_IDX),
53222+ size - 1);
53223+ } else {
53224+setnew:
53225+ sky2_write16(hw, Y2_QADDR(q, PREF_UNIT_PUT_IDX), idx);
53226+ }
53227+ *last = idx;
53228+ mmiowb();
53229+}
53230+
53231+
53232+static inline struct sky2_rx_le *sky2_next_rx(struct sky2_port *sky2)
53233+{
53234+ struct sky2_rx_le *le = sky2->rx_le + sky2->rx_put;
53235+ sky2->rx_put = (sky2->rx_put + 1) % RX_LE_SIZE;
53236+ return le;
53237+}
53238+
53239+/* Return high part of DMA address (could be 32 or 64 bit) */
53240+static inline u32 high32(dma_addr_t a)
53241+{
53242+ return sizeof(a) > sizeof(u32) ? (a >> 16) >> 16 : 0;
53243+}
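/*
 * Illustrative example: for a 64-bit dma_addr_t of 0x0000001234567890,
 * high32() returns 0x00000012; with a 32-bit dma_addr_t it always
 * returns 0 (the double ">> 16" avoids an undefined 32-bit shift), so
 * no OP_ADDR64 element ever needs to be generated.
 */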
53244+
53245+/* Build description to hardware about buffer */
53246+static void sky2_rx_add(struct sky2_port *sky2, dma_addr_t map)
53247+{
53248+ struct sky2_rx_le *le;
53249+ u32 hi = high32(map);
53250+ u16 len = sky2->rx_bufsize;
53251+
53252+ if (sky2->rx_addr64 != hi) {
53253+ le = sky2_next_rx(sky2);
53254+ le->addr = cpu_to_le32(hi);
53255+ le->ctrl = 0;
53256+ le->opcode = OP_ADDR64 | HW_OWNER;
53257+ sky2->rx_addr64 = high32(map + len);
53258+ }
53259+
53260+ le = sky2_next_rx(sky2);
53261+ le->addr = cpu_to_le32((u32) map);
53262+ le->length = cpu_to_le16(len);
53263+ le->ctrl = 0;
53264+ le->opcode = OP_PACKET | HW_OWNER;
53265+}
53266+
53267+
53268+/* Tell chip where to start receive checksum.
53269+ * The hardware actually has two checksum start registers; set both to the
53270+ * same value to avoid possible byte order problems.
53271+ */
53272+static void rx_set_checksum(struct sky2_port *sky2)
53273+{
53274+ struct sky2_rx_le *le;
53275+
53276+ le = sky2_next_rx(sky2);
53277+ le->addr = (ETH_HLEN << 16) | ETH_HLEN;
53278+ le->ctrl = 0;
53279+ le->opcode = OP_TCPSTART | HW_OWNER;
53280+
53281+ sky2_write32(sky2->hw,
53282+ Q_ADDR(rxqaddr[sky2->port], Q_CSR),
53283+ sky2->rx_csum ? BMU_ENA_RX_CHKSUM : BMU_DIS_RX_CHKSUM);
53284+
53285+}
53286+
53287+/*
53288+ * The RX Stop command will not work for Yukon-2 if the BMU does not
53289+ * reach the end of packet and since we can't make sure that we have
53290+ * incoming data, we must reset the BMU while it is not doing a DMA
53291+ * transfer. Since it is possible that the RX path is still active,
53292+ * the RX RAM buffer will be stopped first, so any possible incoming
53293+ * data will not trigger a DMA. After the RAM buffer is stopped, the
53294+ * BMU is polled until any DMA in progress is ended and only then it
53295+ * BMU is polled until any DMA in progress has ended, and only then
53296+ * is it reset.
53297+static void sky2_rx_stop(struct sky2_port *sky2)
53298+{
53299+ struct sky2_hw *hw = sky2->hw;
53300+ unsigned rxq = rxqaddr[sky2->port];
53301+ int i;
53302+
53303+ /* disable the RAM Buffer receive queue */
53304+ sky2_write8(hw, RB_ADDR(rxq, RB_CTRL), RB_DIS_OP_MD);
53305+
53306+ for (i = 0; i < 0xffff; i++)
53307+ if (sky2_read8(hw, RB_ADDR(rxq, Q_RSL))
53308+ == sky2_read8(hw, RB_ADDR(rxq, Q_RL)))
53309+ goto stopped;
53310+
53311+ printk(KERN_WARNING PFX "%s: receiver stop failed\n",
53312+ sky2->netdev->name);
53313+stopped:
53314+ sky2_write32(hw, Q_ADDR(rxq, Q_CSR), BMU_RST_SET | BMU_FIFO_RST);
53315+
53316+ /* reset the Rx prefetch unit */
53317+ sky2_write32(hw, Y2_QADDR(rxq, PREF_UNIT_CTRL), PREF_UNIT_RST_SET);
53318+}
53319+
53320+/* Clean out receive buffer area, assumes receiver hardware stopped */
53321+static void sky2_rx_clean(struct sky2_port *sky2)
53322+{
53323+ unsigned i;
53324+
53325+ memset(sky2->rx_le, 0, RX_LE_BYTES);
53326+ for (i = 0; i < sky2->rx_pending; i++) {
53327+ struct ring_info *re = sky2->rx_ring + i;
53328+
53329+ if (re->skb) {
53330+ pci_unmap_single(sky2->hw->pdev,
53331+ re->mapaddr, sky2->rx_bufsize,
53332+ PCI_DMA_FROMDEVICE);
53333+ kfree_skb(re->skb);
53334+ re->skb = NULL;
53335+ }
53336+ }
53337+}
53338+
53339+/* Basic MII support */
53340+static int sky2_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
53341+{
53342+ struct mii_ioctl_data *data = if_mii(ifr);
53343+ struct sky2_port *sky2 = netdev_priv(dev);
53344+ struct sky2_hw *hw = sky2->hw;
53345+ int err = -EOPNOTSUPP;
53346+
53347+ if (!netif_running(dev))
53348+ return -ENODEV; /* Phy still in reset */
53349+
53350+ switch(cmd) {
53351+ case SIOCGMIIPHY:
53352+ data->phy_id = PHY_ADDR_MARV;
53353+
53354+ /* fallthru */
53355+ case SIOCGMIIREG: {
53356+ u16 val = 0;
53357+
53358+ down(&sky2->phy_sema);
53359+ err = __gm_phy_read(hw, sky2->port, data->reg_num & 0x1f, &val);
53360+ up(&sky2->phy_sema);
53361+
53362+ data->val_out = val;
53363+ break;
53364+ }
53365+
53366+ case SIOCSMIIREG:
53367+ if (!capable(CAP_NET_ADMIN))
53368+ return -EPERM;
53369+
53370+ down(&sky2->phy_sema);
53371+ err = gm_phy_write(hw, sky2->port, data->reg_num & 0x1f,
53372+ data->val_in);
53373+ up(&sky2->phy_sema);
53374+ break;
53375+ }
53376+ return err;
53377+}
53378+
53379+#ifdef SKY2_VLAN_TAG_USED
53380+static void sky2_vlan_rx_register(struct net_device *dev, struct vlan_group *grp)
53381+{
53382+ struct sky2_port *sky2 = netdev_priv(dev);
53383+ struct sky2_hw *hw = sky2->hw;
53384+ u16 port = sky2->port;
53385+
53386+ spin_lock_bh(&sky2->tx_lock);
53387+
53388+ sky2_write32(hw, SK_REG(port, RX_GMF_CTRL_T), RX_VLAN_STRIP_ON);
53389+ sky2_write32(hw, SK_REG(port, TX_GMF_CTRL_T), TX_VLAN_TAG_ON);
53390+ sky2->vlgrp = grp;
53391+
53392+ spin_unlock_bh(&sky2->tx_lock);
53393+}
53394+
53395+static void sky2_vlan_rx_kill_vid(struct net_device *dev, unsigned short vid)
53396+{
53397+ struct sky2_port *sky2 = netdev_priv(dev);
53398+ struct sky2_hw *hw = sky2->hw;
53399+ u16 port = sky2->port;
53400+
53401+ spin_lock_bh(&sky2->tx_lock);
53402+
53403+ sky2_write32(hw, SK_REG(port, RX_GMF_CTRL_T), RX_VLAN_STRIP_OFF);
53404+ sky2_write32(hw, SK_REG(port, TX_GMF_CTRL_T), TX_VLAN_TAG_OFF);
53405+ if (sky2->vlgrp)
53406+ sky2->vlgrp->vlan_devices[vid] = NULL;
53407+
53408+ spin_unlock_bh(&sky2->tx_lock);
53409+}
53410+#endif
53411+
53412+/*
53413+ * It appears the hardware has a bug in the FIFO logic that
53414+ * causes it to hang if the FIFO gets overrun and the receive buffer
53415+ * is not aligned. Also dev_alloc_skb() won't align properly if slab
53416+ * debugging is enabled.
53417+ */
53418+static inline struct sk_buff *sky2_alloc_skb(unsigned int size, gfp_t gfp_mask)
53419+{
53420+ struct sk_buff *skb;
53421+
53422+ skb = __dev_alloc_skb(size + RX_SKB_ALIGN, gfp_mask);
53423+ if (likely(skb)) {
53424+ unsigned long p = (unsigned long) skb->data;
53425+ skb_reserve(skb,
53426+ ((p + RX_SKB_ALIGN - 1) & ~(RX_SKB_ALIGN - 1)) - p);
53427+ }
53428+
53429+ return skb;
53430+}
53431+
53432+/*
53433+ * Allocate and set up the receiver buffer pool.
53434+ * In the case of 64 bit dma, there are 2X as many list elements
53435+ * available as ring entries,
53436+ * and we need to reserve one list element so we don't wrap around.
53437+ */
53438+static int sky2_rx_start(struct sky2_port *sky2)
53439+{
53440+ struct sky2_hw *hw = sky2->hw;
53441+ unsigned rxq = rxqaddr[sky2->port];
53442+ int i;
53443+
53444+ sky2->rx_put = sky2->rx_next = 0;
53445+ sky2_qset(hw, rxq);
53446+
53447+ if (hw->chip_id == CHIP_ID_YUKON_EC_U && hw->chip_rev >= 2) {
53448+ /* MAC Rx RAM Read is controlled by hardware */
53449+ sky2_write32(hw, Q_ADDR(rxq, Q_F), F_M_RX_RAM_DIS);
53450+ }
53451+
53452+ sky2_prefetch_init(hw, rxq, sky2->rx_le_map, RX_LE_SIZE - 1);
53453+
53454+ rx_set_checksum(sky2);
53455+ for (i = 0; i < sky2->rx_pending; i++) {
53456+ struct ring_info *re = sky2->rx_ring + i;
53457+
53458+ re->skb = sky2_alloc_skb(sky2->rx_bufsize, GFP_KERNEL);
53459+ if (!re->skb)
53460+ goto nomem;
53461+
53462+ re->mapaddr = pci_map_single(hw->pdev, re->skb->data,
53463+ sky2->rx_bufsize, PCI_DMA_FROMDEVICE);
53464+ sky2_rx_add(sky2, re->mapaddr);
53465+ }
53466+
53467+ /* Truncate oversize frames */
53468+ sky2_write16(hw, SK_REG(sky2->port, RX_GMF_TR_THR), sky2->rx_bufsize - 8);
53469+ sky2_write32(hw, SK_REG(sky2->port, RX_GMF_CTRL_T), RX_TRUNC_ON);
53470+
53471+ /* Tell chip about available buffers */
53472+ sky2_write16(hw, Y2_QADDR(rxq, PREF_UNIT_PUT_IDX), sky2->rx_put);
53473+ sky2->rx_last_put = sky2_read16(hw, Y2_QADDR(rxq, PREF_UNIT_PUT_IDX));
53474+ return 0;
53475+nomem:
53476+ sky2_rx_clean(sky2);
53477+ return -ENOMEM;
53478+}
53479+
53480+/* Bring up network interface. */
53481+static int sky2_up(struct net_device *dev)
53482+{
53483+ struct sky2_port *sky2 = netdev_priv(dev);
53484+ struct sky2_hw *hw = sky2->hw;
53485+ unsigned port = sky2->port;
53486+ u32 ramsize, rxspace;
53487+ int err = -ENOMEM;
53488+
53489+ if (netif_msg_ifup(sky2))
53490+ printk(KERN_INFO PFX "%s: enabling interface\n", dev->name);
53491+
53492+ /* must be power of 2 */
53493+ sky2->tx_le = pci_alloc_consistent(hw->pdev,
53494+ TX_RING_SIZE *
53495+ sizeof(struct sky2_tx_le),
53496+ &sky2->tx_le_map);
53497+ if (!sky2->tx_le)
53498+ goto err_out;
53499+
53500+ sky2->tx_ring = kcalloc(TX_RING_SIZE, sizeof(struct tx_ring_info),
53501+ GFP_KERNEL);
53502+ if (!sky2->tx_ring)
53503+ goto err_out;
53504+ sky2->tx_prod = sky2->tx_cons = 0;
53505+
53506+ sky2->rx_le = pci_alloc_consistent(hw->pdev, RX_LE_BYTES,
53507+ &sky2->rx_le_map);
53508+ if (!sky2->rx_le)
53509+ goto err_out;
53510+ memset(sky2->rx_le, 0, RX_LE_BYTES);
53511+
53512+ sky2->rx_ring = kcalloc(sky2->rx_pending, sizeof(struct ring_info),
53513+ GFP_KERNEL);
53514+ if (!sky2->rx_ring)
53515+ goto err_out;
53516+
53517+ sky2_mac_init(hw, port);
53518+
53519+ /* Determine available ram buffer space (in 4K blocks).
53520+ * Note: not sure about the FE setting below yet
53521+ */
53522+ if (hw->chip_id == CHIP_ID_YUKON_FE)
53523+ ramsize = 4;
53524+ else
53525+ ramsize = sky2_read8(hw, B2_E_0);
53526+
53527+ /* Give transmitter one third (rounded up) */
53528+ rxspace = ramsize - (ramsize + 2) / 3;
53529+
53530+ sky2_ramset(hw, rxqaddr[port], 0, rxspace);
53531+ sky2_ramset(hw, txqaddr[port], rxspace, ramsize);
53532+
53533+ /* Make sure SyncQ is disabled */
53534+ sky2_write8(hw, RB_ADDR(port == 0 ? Q_XS1 : Q_XS2, RB_CTRL),
53535+ RB_RST_SET);
53536+
53537+ sky2_qset(hw, txqaddr[port]);
53538+
53539+ /* Set almost empty threshold */
53540+ if (hw->chip_id == CHIP_ID_YUKON_EC_U && hw->chip_rev == 1)
53541+ sky2_write16(hw, Q_ADDR(txqaddr[port], Q_AL), 0x1a0);
53542+
53543+ sky2_prefetch_init(hw, txqaddr[port], sky2->tx_le_map,
53544+ TX_RING_SIZE - 1);
53545+
53546+ err = sky2_rx_start(sky2);
53547+ if (err)
53548+ goto err_out;
53549+
53550+ /* Enable interrupts from phy/mac for port */
53551+ spin_lock_irq(&hw->hw_lock);
53552+ hw->intr_mask |= (port == 0) ? Y2_IS_PORT_1 : Y2_IS_PORT_2;
53553+ sky2_write32(hw, B0_IMSK, hw->intr_mask);
53554+ spin_unlock_irq(&hw->hw_lock);
53555+ return 0;
53556+
53557+err_out:
53558+ if (sky2->rx_le) {
53559+ pci_free_consistent(hw->pdev, RX_LE_BYTES,
53560+ sky2->rx_le, sky2->rx_le_map);
53561+ sky2->rx_le = NULL;
53562+ }
53563+ if (sky2->tx_le) {
53564+ pci_free_consistent(hw->pdev,
53565+ TX_RING_SIZE * sizeof(struct sky2_tx_le),
53566+ sky2->tx_le, sky2->tx_le_map);
53567+ sky2->tx_le = NULL;
53568+ }
53569+ kfree(sky2->tx_ring);
53570+ kfree(sky2->rx_ring);
53571+
53572+ sky2->tx_ring = NULL;
53573+ sky2->rx_ring = NULL;
53574+ return err;
53575+}
53576+
53577+/* Modular subtraction in ring */
53578+static inline int tx_dist(unsigned tail, unsigned head)
53579+{
53580+ return (head - tail) % TX_RING_SIZE;
53581+}
53582+
53583+/* Number of list elements available for next tx */
53584+static inline int tx_avail(const struct sky2_port *sky2)
53585+{
53586+ return sky2->tx_pending - tx_dist(sky2->tx_cons, sky2->tx_prod);
53587+}
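/*
 * Worked example (illustrative only): with TX_RING_SIZE == 512,
 * tx_cons == 500 and tx_prod == 10, tx_dist() yields
 * (10 - 500) % 512 == 22 (the unsigned wrap-around is harmless because
 * the ring size is a power of two), so 22 elements are in flight and
 * tx_avail() == sky2->tx_pending - 22.
 */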
53588+
53589+/* Estimate of number of transmit list elements required */
53590+static unsigned tx_le_req(const struct sk_buff *skb)
53591+{
53592+ unsigned count;
53593+
53594+ count = sizeof(dma_addr_t) / sizeof(u32);
53595+ count += skb_shinfo(skb)->nr_frags * count;
53596+
53597+ if (skb_shinfo(skb)->gso_size)
53598+ ++count;
53599+
53600+ if (skb->ip_summed == CHECKSUM_HW)
53601+ ++count;
53602+
53603+ return count;
53604+}
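/*
 * Worked example (illustrative only), assuming sizeof(dma_addr_t) == 8:
 * an skb with three page fragments that uses TSO and hardware checksum
 * needs count = 2 + 3*2 = 8, plus 1 for gso_size and 1 for CHECKSUM_HW,
 * i.e. an estimate of 10 transmit list elements.
 */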
53605+
53606+/*
53607+ * Put one packet in ring for transmit.
53608+ * A single packet can generate multiple list elements, and
53609+ * the number of ring elements will probably be less than the number
53610+ * of list elements used.
53611+ *
53612+ * No BH disabling for tx_lock here (like tg3)
53613+ */
53614+static int sky2_xmit_frame(struct sk_buff *skb, struct net_device *dev)
53615+{
53616+ struct sky2_port *sky2 = netdev_priv(dev);
53617+ struct sky2_hw *hw = sky2->hw;
53618+ struct sky2_tx_le *le = NULL;
53619+ struct tx_ring_info *re;
53620+ unsigned i, len;
53621+ int avail;
53622+ dma_addr_t mapping;
53623+ u32 addr64;
53624+ u16 mss;
53625+ u8 ctrl;
53626+
53627+ /* No BH disabling for tx_lock here. We are running in BH disabled
53628+ * context and TX reclaim runs via poll inside of a software
53629+ * interrupt, and no related locks in IRQ processing.
53630+ */
53631+ if (!spin_trylock(&sky2->tx_lock))
53632+ return NETDEV_TX_LOCKED;
53633+
53634+ if (unlikely(tx_avail(sky2) < tx_le_req(skb))) {
53635+ /* There is a known but harmless race with lockless tx
53636+ * and netif_stop_queue.
53637+ */
53638+ if (!netif_queue_stopped(dev)) {
53639+ netif_stop_queue(dev);
53640+ if (net_ratelimit())
53641+ printk(KERN_WARNING PFX "%s: ring full when queue awake!\n",
53642+ dev->name);
53643+ }
53644+ spin_unlock(&sky2->tx_lock);
53645+
53646+ return NETDEV_TX_BUSY;
53647+ }
53648+
53649+ if (unlikely(netif_msg_tx_queued(sky2)))
53650+ printk(KERN_DEBUG "%s: tx queued, slot %u, len %d\n",
53651+ dev->name, sky2->tx_prod, skb->len);
53652+
53653+ len = skb_headlen(skb);
53654+ mapping = pci_map_single(hw->pdev, skb->data, len, PCI_DMA_TODEVICE);
53655+ addr64 = high32(mapping);
53656+
53657+ re = sky2->tx_ring + sky2->tx_prod;
53658+
53659+ /* Send high bits if changed or crosses boundary */
53660+ if (addr64 != sky2->tx_addr64 || high32(mapping + len) != sky2->tx_addr64) {
53661+ le = get_tx_le(sky2);
53662+ le->tx.addr = cpu_to_le32(addr64);
53663+ le->ctrl = 0;
53664+ le->opcode = OP_ADDR64 | HW_OWNER;
53665+ sky2->tx_addr64 = high32(mapping + len);
53666+ }
53667+
53668+ /* Check for TCP Segmentation Offload */
53669+ mss = skb_shinfo(skb)->gso_size;
53670+ if (mss != 0) {
53671+ /* just drop the packet if non-linear expansion fails */
53672+ if (skb_header_cloned(skb) &&
53673+ pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) {
53674+ dev_kfree_skb_any(skb);
53675+ goto out_unlock;
53676+ }
53677+
53678+ mss += ((skb->h.th->doff - 5) * 4); /* TCP options */
53679+ mss += (skb->nh.iph->ihl * 4) + sizeof(struct tcphdr);
53680+ mss += ETH_HLEN;
53681+ }
53682+
53683+ if (mss != sky2->tx_last_mss) {
53684+ le = get_tx_le(sky2);
53685+ le->tx.tso.size = cpu_to_le16(mss);
53686+ le->tx.tso.rsvd = 0;
53687+ le->opcode = OP_LRGLEN | HW_OWNER;
53688+ le->ctrl = 0;
53689+ sky2->tx_last_mss = mss;
53690+ }
53691+
53692+ ctrl = 0;
53693+#ifdef SKY2_VLAN_TAG_USED
53694+ /* Add VLAN tag, can piggyback on LRGLEN or ADDR64 */
53695+ if (sky2->vlgrp && vlan_tx_tag_present(skb)) {
53696+ if (!le) {
53697+ le = get_tx_le(sky2);
53698+ le->tx.addr = 0;
53699+ le->opcode = OP_VLAN|HW_OWNER;
53700+ le->ctrl = 0;
53701+ } else
53702+ le->opcode |= OP_VLAN;
53703+ le->length = cpu_to_be16(vlan_tx_tag_get(skb));
53704+ ctrl |= INS_VLAN;
53705+ }
53706+#endif
53707+
53708+ /* Handle TCP checksum offload */
53709+ if (skb->ip_summed == CHECKSUM_HW) {
53710+ u16 hdr = skb->h.raw - skb->data;
53711+ u16 offset = hdr + skb->csum;
53712+
53713+ ctrl = CALSUM | WR_SUM | INIT_SUM | LOCK_SUM;
53714+ if (skb->nh.iph->protocol == IPPROTO_UDP)
53715+ ctrl |= UDPTCP;
53716+
53717+ le = get_tx_le(sky2);
53718+ le->tx.csum.start = cpu_to_le16(hdr);
53719+ le->tx.csum.offset = cpu_to_le16(offset);
53720+ le->length = 0; /* initial checksum value */
53721+ le->ctrl = 1; /* one packet */
53722+ le->opcode = OP_TCPLISW | HW_OWNER;
53723+ }
53724+
53725+ le = get_tx_le(sky2);
53726+ le->tx.addr = cpu_to_le32((u32) mapping);
53727+ le->length = cpu_to_le16(len);
53728+ le->ctrl = ctrl;
53729+ le->opcode = mss ? (OP_LARGESEND | HW_OWNER) : (OP_PACKET | HW_OWNER);
53730+
53731+ /* Record the transmit mapping info */
53732+ re->skb = skb;
53733+ pci_unmap_addr_set(re, mapaddr, mapping);
53734+
53735+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
53736+ skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
53737+ struct tx_ring_info *fre;
53738+
53739+ mapping = pci_map_page(hw->pdev, frag->page, frag->page_offset,
53740+ frag->size, PCI_DMA_TODEVICE);
53741+ addr64 = high32(mapping);
53742+ if (addr64 != sky2->tx_addr64) {
53743+ le = get_tx_le(sky2);
53744+ le->tx.addr = cpu_to_le32(addr64);
53745+ le->ctrl = 0;
53746+ le->opcode = OP_ADDR64 | HW_OWNER;
53747+ sky2->tx_addr64 = addr64;
53748+ }
53749+
53750+ le = get_tx_le(sky2);
53751+ le->tx.addr = cpu_to_le32((u32) mapping);
53752+ le->length = cpu_to_le16(frag->size);
53753+ le->ctrl = ctrl;
53754+ le->opcode = OP_BUFFER | HW_OWNER;
53755+
53756+ fre = sky2->tx_ring
53757+ + ((re - sky2->tx_ring) + i + 1) % TX_RING_SIZE;
53758+ pci_unmap_addr_set(fre, mapaddr, mapping);
53759+ }
53760+
53761+ re->idx = sky2->tx_prod;
53762+ le->ctrl |= EOP;
53763+
53764+ avail = tx_avail(sky2);
53765+ if (mss != 0 || avail < TX_MIN_PENDING) {
53766+ le->ctrl |= FRC_STAT;
53767+ if (avail <= MAX_SKB_TX_LE)
53768+ netif_stop_queue(dev);
53769+ }
53770+
53771+ sky2_put_idx(hw, txqaddr[sky2->port], sky2->tx_prod,
53772+ &sky2->tx_last_put, TX_RING_SIZE);
53773+
53774+out_unlock:
53775+ spin_unlock(&sky2->tx_lock);
53776+
53777+ dev->trans_start = jiffies;
53778+ return NETDEV_TX_OK;
53779+}
53780+
53781+/*
53782+ * Free ring elements starting at tx_cons until "done"
53783+ *
53784+ * NB: the hardware will tell us about partial completion of multi-part
53785+ * buffers; these are deferred until completion.
53786+ */
53787+static void sky2_tx_complete(struct sky2_port *sky2, u16 done)
53788+{
53789+ struct net_device *dev = sky2->netdev;
53790+ struct pci_dev *pdev = sky2->hw->pdev;
53791+ u16 nxt, put;
53792+ unsigned i;
53793+
53794+ BUG_ON(done >= TX_RING_SIZE);
53795+
53796+ if (unlikely(netif_msg_tx_done(sky2)))
53797+ printk(KERN_DEBUG "%s: tx done, up to %u\n",
53798+ dev->name, done);
53799+
53800+ for (put = sky2->tx_cons; put != done; put = nxt) {
53801+ struct tx_ring_info *re = sky2->tx_ring + put;
53802+ struct sk_buff *skb = re->skb;
53803+
53804+ nxt = re->idx;
53805+ BUG_ON(nxt >= TX_RING_SIZE);
53806+ prefetch(sky2->tx_ring + nxt);
53807+
53808+ /* Check for partial status */
53809+ if (tx_dist(put, done) < tx_dist(put, nxt))
53810+ break;
53811+
53812+ skb = re->skb;
53813+ pci_unmap_single(pdev, pci_unmap_addr(re, mapaddr),
53814+ skb_headlen(skb), PCI_DMA_TODEVICE);
53815+
53816+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
53817+ struct tx_ring_info *fre;
53818+ fre = sky2->tx_ring + (put + i + 1) % TX_RING_SIZE;
53819+ pci_unmap_page(pdev, pci_unmap_addr(fre, mapaddr),
53820+ skb_shinfo(skb)->frags[i].size,
53821+ PCI_DMA_TODEVICE);
53822+ }
53823+
53824+ dev_kfree_skb_any(skb);
53825+ }
53826+
53827+ sky2->tx_cons = put;
53828+ if (netif_queue_stopped(dev) && tx_avail(sky2) > MAX_SKB_TX_LE)
53829+ netif_wake_queue(dev);
53830+}
53831+
53832+/* Cleanup all untransmitted buffers, assume transmitter not running */
53833+static void sky2_tx_clean(struct sky2_port *sky2)
53834+{
53835+ spin_lock_bh(&sky2->tx_lock);
53836+ sky2_tx_complete(sky2, sky2->tx_prod);
53837+ spin_unlock_bh(&sky2->tx_lock);
53838+}
53839+
53840+/* Network shutdown */
53841+static int sky2_down(struct net_device *dev)
53842+{
53843+ struct sky2_port *sky2 = netdev_priv(dev);
53844+ struct sky2_hw *hw = sky2->hw;
53845+ unsigned port = sky2->port;
53846+ u16 ctrl;
53847+
53848+ /* Never really got started! */
53849+ if (!sky2->tx_le)
53850+ return 0;
53851+
53852+ if (netif_msg_ifdown(sky2))
53853+ printk(KERN_INFO PFX "%s: disabling interface\n", dev->name);
53854+
53855+ /* Stop more packets from being queued */
53856+ netif_stop_queue(dev);
53857+
53858+ /* Disable port IRQ */
53859+ spin_lock_irq(&hw->hw_lock);
53860+ hw->intr_mask &= ~((sky2->port == 0) ? Y2_IS_IRQ_PHY1 : Y2_IS_IRQ_PHY2);
53861+ sky2_write32(hw, B0_IMSK, hw->intr_mask);
53862+ spin_unlock_irq(&hw->hw_lock);
53863+
53864+ flush_scheduled_work();
53865+
53866+ sky2_phy_reset(hw, port);
53867+
53868+ /* Stop transmitter */
53869+ sky2_write32(hw, Q_ADDR(txqaddr[port], Q_CSR), BMU_STOP);
53870+ sky2_read32(hw, Q_ADDR(txqaddr[port], Q_CSR));
53871+
53872+ sky2_write32(hw, RB_ADDR(txqaddr[port], RB_CTRL),
53873+ RB_RST_SET | RB_DIS_OP_MD);
53874+
53875+ ctrl = gma_read16(hw, port, GM_GP_CTRL);
53876+ ctrl &= ~(GM_GPCR_TX_ENA | GM_GPCR_RX_ENA);
53877+ gma_write16(hw, port, GM_GP_CTRL, ctrl);
53878+
53879+ sky2_write8(hw, SK_REG(port, GPHY_CTRL), GPC_RST_SET);
53880+
53881+ /* Workaround shared GMAC reset */
53882+ if (!(hw->chip_id == CHIP_ID_YUKON_XL && hw->chip_rev == 0
53883+ && port == 0 && hw->dev[1] && netif_running(hw->dev[1])))
53884+ sky2_write8(hw, SK_REG(port, GMAC_CTRL), GMC_RST_SET);
53885+
53886+ /* Disable Force Sync bit and Enable Alloc bit */
53887+ sky2_write8(hw, SK_REG(port, TXA_CTRL),
53888+ TXA_DIS_FSYNC | TXA_DIS_ALLOC | TXA_STOP_RC);
53889+
53890+ /* Stop Interval Timer and Limit Counter of Tx Arbiter */
53891+ sky2_write32(hw, SK_REG(port, TXA_ITI_INI), 0L);
53892+ sky2_write32(hw, SK_REG(port, TXA_LIM_INI), 0L);
53893+
53894+ /* Reset the PCI FIFO of the async Tx queue */
53895+ sky2_write32(hw, Q_ADDR(txqaddr[port], Q_CSR),
53896+ BMU_RST_SET | BMU_FIFO_RST);
53897+
53898+ /* Reset the Tx prefetch units */
53899+ sky2_write32(hw, Y2_QADDR(txqaddr[port], PREF_UNIT_CTRL),
53900+ PREF_UNIT_RST_SET);
53901+
53902+ sky2_write32(hw, RB_ADDR(txqaddr[port], RB_CTRL), RB_RST_SET);
53903+
53904+ sky2_rx_stop(sky2);
53905+
53906+ sky2_write8(hw, SK_REG(port, RX_GMF_CTRL_T), GMF_RST_SET);
53907+ sky2_write8(hw, SK_REG(port, TX_GMF_CTRL_T), GMF_RST_SET);
53908+
53909+ /* turn off LED's */
53910+ sky2_write16(hw, B0_Y2LED, LED_STAT_OFF);
53911+
53912+ synchronize_irq(hw->pdev->irq);
53913+
53914+ sky2_tx_clean(sky2);
53915+ sky2_rx_clean(sky2);
53916+
53917+ pci_free_consistent(hw->pdev, RX_LE_BYTES,
53918+ sky2->rx_le, sky2->rx_le_map);
53919+ kfree(sky2->rx_ring);
53920+
53921+ pci_free_consistent(hw->pdev,
53922+ TX_RING_SIZE * sizeof(struct sky2_tx_le),
53923+ sky2->tx_le, sky2->tx_le_map);
53924+ kfree(sky2->tx_ring);
53925+
53926+ sky2->tx_le = NULL;
53927+ sky2->rx_le = NULL;
53928+
53929+ sky2->rx_ring = NULL;
53930+ sky2->tx_ring = NULL;
53931+
53932+ return 0;
53933+}
53934+
53935+static u16 sky2_phy_speed(const struct sky2_hw *hw, u16 aux)
53936+{
53937+ if (!sky2_is_copper(hw))
53938+ return SPEED_1000;
53939+
53940+ if (hw->chip_id == CHIP_ID_YUKON_FE)
53941+ return (aux & PHY_M_PS_SPEED_100) ? SPEED_100 : SPEED_10;
53942+
53943+ switch (aux & PHY_M_PS_SPEED_MSK) {
53944+ case PHY_M_PS_SPEED_1000:
53945+ return SPEED_1000;
53946+ case PHY_M_PS_SPEED_100:
53947+ return SPEED_100;
53948+ default:
53949+ return SPEED_10;
53950+ }
53951+}
53952+
53953+static void sky2_link_up(struct sky2_port *sky2)
53954+{
53955+ struct sky2_hw *hw = sky2->hw;
53956+ unsigned port = sky2->port;
53957+ u16 reg;
53958+
53959+ /* Enable Transmit FIFO Underrun */
53960+ sky2_write8(hw, SK_REG(port, GMAC_IRQ_MSK), GMAC_DEF_MSK);
53961+
53962+ reg = gma_read16(hw, port, GM_GP_CTRL);
53963+ if (sky2->autoneg == AUTONEG_DISABLE) {
53964+ reg |= GM_GPCR_AU_ALL_DIS;
53965+
53966+ /* Is write/read necessary? Copied from sky2_mac_init */
53967+ gma_write16(hw, port, GM_GP_CTRL, reg);
53968+ gma_read16(hw, port, GM_GP_CTRL);
53969+
53970+ switch (sky2->speed) {
53971+ case SPEED_1000:
53972+ reg &= ~GM_GPCR_SPEED_100;
53973+ reg |= GM_GPCR_SPEED_1000;
53974+ break;
53975+ case SPEED_100:
53976+ reg &= ~GM_GPCR_SPEED_1000;
53977+ reg |= GM_GPCR_SPEED_100;
53978+ break;
53979+ case SPEED_10:
53980+ reg &= ~(GM_GPCR_SPEED_1000 | GM_GPCR_SPEED_100);
53981+ break;
53982+ }
53983+ } else
53984+ reg &= ~GM_GPCR_AU_ALL_DIS;
53985+
53986+ if (sky2->duplex == DUPLEX_FULL || sky2->autoneg == AUTONEG_ENABLE)
53987+ reg |= GM_GPCR_DUP_FULL;
53988+
53989+ /* enable Rx/Tx */
53990+ reg |= GM_GPCR_RX_ENA | GM_GPCR_TX_ENA;
53991+ gma_write16(hw, port, GM_GP_CTRL, reg);
53992+ gma_read16(hw, port, GM_GP_CTRL);
53993+
53994+ gm_phy_write(hw, port, PHY_MARV_INT_MASK, PHY_M_DEF_MSK);
53995+
53996+ netif_carrier_on(sky2->netdev);
53997+ netif_wake_queue(sky2->netdev);
53998+
53999+ /* Turn on link LED */
54000+ sky2_write8(hw, SK_REG(port, LNK_LED_REG),
54001+ LINKLED_ON | LINKLED_BLINK_OFF | LINKLED_LINKSYNC_OFF);
54002+
54003+ if (hw->chip_id == CHIP_ID_YUKON_XL) {
54004+ u16 pg = gm_phy_read(hw, port, PHY_MARV_EXT_ADR);
54005+
54006+ gm_phy_write(hw, port, PHY_MARV_EXT_ADR, 3);
54007+ gm_phy_write(hw, port, PHY_MARV_PHY_CTRL, PHY_M_LEDC_LOS_CTRL(1) | /* LINK/ACT */
54008+ PHY_M_LEDC_INIT_CTRL(sky2->speed ==
54009+ SPEED_10 ? 7 : 0) |
54010+ PHY_M_LEDC_STA1_CTRL(sky2->speed ==
54011+ SPEED_100 ? 7 : 0) |
54012+ PHY_M_LEDC_STA0_CTRL(sky2->speed ==
54013+ SPEED_1000 ? 7 : 0));
54014+ gm_phy_write(hw, port, PHY_MARV_EXT_ADR, pg);
54015+ }
54016+
54017+ if (netif_msg_link(sky2))
54018+ printk(KERN_INFO PFX
54019+ "%s: Link is up at %d Mbps, %s duplex, flow control %s\n",
54020+ sky2->netdev->name, sky2->speed,
54021+ sky2->duplex == DUPLEX_FULL ? "full" : "half",
54022+ (sky2->tx_pause && sky2->rx_pause) ? "both" :
54023+ sky2->tx_pause ? "tx" : sky2->rx_pause ? "rx" : "none");
54024+}
54025+
54026+static void sky2_link_down(struct sky2_port *sky2)
54027+{
54028+ struct sky2_hw *hw = sky2->hw;
54029+ unsigned port = sky2->port;
54030+ u16 reg;
54031+
54032+ gm_phy_write(hw, port, PHY_MARV_INT_MASK, 0);
54033+
54034+ reg = gma_read16(hw, port, GM_GP_CTRL);
54035+ reg &= ~(GM_GPCR_RX_ENA | GM_GPCR_TX_ENA);
54036+ gma_write16(hw, port, GM_GP_CTRL, reg);
54037+ gma_read16(hw, port, GM_GP_CTRL); /* PCI post */
54038+
54039+ if (sky2->rx_pause && !sky2->tx_pause) {
54040+ /* restore Asymmetric Pause bit */
54041+ gm_phy_write(hw, port, PHY_MARV_AUNE_ADV,
54042+ gm_phy_read(hw, port, PHY_MARV_AUNE_ADV)
54043+ | PHY_M_AN_ASP);
54044+ }
54045+
54046+ netif_carrier_off(sky2->netdev);
54047+ netif_stop_queue(sky2->netdev);
54048+
54049+ /* Turn on link LED */
54050+ sky2_write8(hw, SK_REG(port, LNK_LED_REG), LINKLED_OFF);
54051+
54052+ if (netif_msg_link(sky2))
54053+ printk(KERN_INFO PFX "%s: Link is down.\n", sky2->netdev->name);
54054+ sky2_phy_init(hw, port);
54055+}
54056+
54057+static int sky2_autoneg_done(struct sky2_port *sky2, u16 aux)
54058+{
54059+ struct sky2_hw *hw = sky2->hw;
54060+ unsigned port = sky2->port;
54061+ u16 lpa;
54062+
54063+ lpa = gm_phy_read(hw, port, PHY_MARV_AUNE_LP);
54064+
54065+ if (lpa & PHY_M_AN_RF) {
54066+ printk(KERN_ERR PFX "%s: remote fault", sky2->netdev->name);
54067+ return -1;
54068+ }
54069+
54070+ if (hw->chip_id != CHIP_ID_YUKON_FE &&
54071+ gm_phy_read(hw, port, PHY_MARV_1000T_STAT) & PHY_B_1000S_MSF) {
54072+ printk(KERN_ERR PFX "%s: master/slave fault",
54073+ sky2->netdev->name);
54074+ return -1;
54075+ }
54076+
54077+ if (!(aux & PHY_M_PS_SPDUP_RES)) {
54078+ printk(KERN_ERR PFX "%s: speed/duplex mismatch",
54079+ sky2->netdev->name);
54080+ return -1;
54081+ }
54082+
54083+ sky2->duplex = (aux & PHY_M_PS_FULL_DUP) ? DUPLEX_FULL : DUPLEX_HALF;
54084+
54085+ sky2->speed = sky2_phy_speed(hw, aux);
54086+
54087+ /* Pause bits are offset (9..8) */
54088+ if (hw->chip_id == CHIP_ID_YUKON_XL)
54089+ aux >>= 6;
54090+
54091+ sky2->rx_pause = (aux & PHY_M_PS_RX_P_EN) != 0;
54092+ sky2->tx_pause = (aux & PHY_M_PS_TX_P_EN) != 0;
54093+
54094+ if ((sky2->tx_pause || sky2->rx_pause)
54095+ && !(sky2->speed < SPEED_1000 && sky2->duplex == DUPLEX_HALF))
54096+ sky2_write8(hw, SK_REG(port, GMAC_CTRL), GMC_PAUSE_ON);
54097+ else
54098+ sky2_write8(hw, SK_REG(port, GMAC_CTRL), GMC_PAUSE_OFF);
54099+
54100+ return 0;
54101+}
54102+
54103+/*
54104+ * Interrupts from the PHY are handled outside of interrupt context
54105+ * because accessing phy registers requires a spin wait, which might
54106+ * cause excess interrupt latency.
54107+ */
54108+static void sky2_phy_task(void *arg)
54109+{
54110+ struct sky2_port *sky2 = arg;
54111+ struct sky2_hw *hw = sky2->hw;
54112+ u16 istatus, phystat;
54113+
54114+ down(&sky2->phy_sema);
54115+ istatus = gm_phy_read(hw, sky2->port, PHY_MARV_INT_STAT);
54116+ phystat = gm_phy_read(hw, sky2->port, PHY_MARV_PHY_STAT);
54117+
54118+ if (netif_msg_intr(sky2))
54119+ printk(KERN_INFO PFX "%s: phy interrupt status 0x%x 0x%x\n",
54120+ sky2->netdev->name, istatus, phystat);
54121+
54122+ if (istatus & PHY_M_IS_AN_COMPL) {
54123+ if (sky2_autoneg_done(sky2, phystat) == 0)
54124+ sky2_link_up(sky2);
54125+ goto out;
54126+ }
54127+
54128+ if (istatus & PHY_M_IS_LSP_CHANGE)
54129+ sky2->speed = sky2_phy_speed(hw, phystat);
54130+
54131+ if (istatus & PHY_M_IS_DUP_CHANGE)
54132+ sky2->duplex =
54133+ (phystat & PHY_M_PS_FULL_DUP) ? DUPLEX_FULL : DUPLEX_HALF;
54134+
54135+ if (istatus & PHY_M_IS_LST_CHANGE) {
54136+ if (phystat & PHY_M_PS_LINK_UP)
54137+ sky2_link_up(sky2);
54138+ else
54139+ sky2_link_down(sky2);
54140+ }
54141+out:
54142+ up(&sky2->phy_sema);
54143+
54144+ spin_lock_irq(&hw->hw_lock);
54145+ hw->intr_mask |= (sky2->port == 0) ? Y2_IS_IRQ_PHY1 : Y2_IS_IRQ_PHY2;
54146+ sky2_write32(hw, B0_IMSK, hw->intr_mask);
54147+ spin_unlock_irq(&hw->hw_lock);
54148+}
54149+
54150+
54151+/* Transmit timeout is only called if we are running, carrier is up
54152+ * and the tx queue is full (stopped).
54153+ */
54154+static void sky2_tx_timeout(struct net_device *dev)
54155+{
54156+ struct sky2_port *sky2 = netdev_priv(dev);
54157+ struct sky2_hw *hw = sky2->hw;
54158+ unsigned txq = txqaddr[sky2->port];
54159+ u16 ridx;
54160+
54161+	/* Maybe we just missed a status interrupt */
54162+ spin_lock(&sky2->tx_lock);
54163+ ridx = sky2_read16(hw,
54164+ sky2->port == 0 ? STAT_TXA1_RIDX : STAT_TXA2_RIDX);
54165+ sky2_tx_complete(sky2, ridx);
54166+ spin_unlock(&sky2->tx_lock);
54167+
54168+ if (!netif_queue_stopped(dev)) {
54169+ if (net_ratelimit())
54170+ pr_info(PFX "transmit interrupt missed? recovered\n");
54171+ return;
54172+ }
54173+
54174+ if (netif_msg_timer(sky2))
54175+ printk(KERN_ERR PFX "%s: tx timeout\n", dev->name);
54176+
54177+ sky2_write32(hw, Q_ADDR(txq, Q_CSR), BMU_STOP);
54178+ sky2_write32(hw, Y2_QADDR(txq, PREF_UNIT_CTRL), PREF_UNIT_RST_SET);
54179+
54180+ sky2_tx_clean(sky2);
54181+
54182+ sky2_qset(hw, txq);
54183+ sky2_prefetch_init(hw, txq, sky2->tx_le_map, TX_RING_SIZE - 1);
54184+}
54185+
54186+
54187+#define roundup(x, y) ((((x)+((y)-1))/(y))*(y))
54188+/* Want the receive buffer size to be a multiple of 64 bits
54189+ * and to include room for the vlan tag and truncation
54190+ */
54191+static inline unsigned sky2_buf_size(int mtu)
54192+{
54193+ return roundup(mtu + ETH_HLEN + VLAN_HLEN, 8) + 8;
54194+}
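/*
 * Worked example (illustrative only): for the default MTU of 1500,
 * 1500 + ETH_HLEN (14) + VLAN_HLEN (4) = 1518, rounded up to a multiple
 * of 8 gives 1520, plus 8 bytes of truncation slack = 1528 bytes per
 * receive buffer.
 */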
54195+
54196+static int sky2_change_mtu(struct net_device *dev, int new_mtu)
54197+{
54198+ struct sky2_port *sky2 = netdev_priv(dev);
54199+ struct sky2_hw *hw = sky2->hw;
54200+ int err;
54201+ u16 ctl, mode;
54202+
54203+ if (new_mtu < ETH_ZLEN || new_mtu > ETH_JUMBO_MTU)
54204+ return -EINVAL;
54205+
54206+ if (hw->chip_id == CHIP_ID_YUKON_EC_U && new_mtu > ETH_DATA_LEN)
54207+ return -EINVAL;
54208+
54209+ if (!netif_running(dev)) {
54210+ dev->mtu = new_mtu;
54211+ return 0;
54212+ }
54213+
54214+ sky2_write32(hw, B0_IMSK, 0);
54215+
54216+ dev->trans_start = jiffies; /* prevent tx timeout */
54217+ netif_stop_queue(dev);
54218+ netif_poll_disable(hw->dev[0]);
54219+
54220+ ctl = gma_read16(hw, sky2->port, GM_GP_CTRL);
54221+ gma_write16(hw, sky2->port, GM_GP_CTRL, ctl & ~GM_GPCR_RX_ENA);
54222+ sky2_rx_stop(sky2);
54223+ sky2_rx_clean(sky2);
54224+
54225+ dev->mtu = new_mtu;
54226+ sky2->rx_bufsize = sky2_buf_size(new_mtu);
54227+ mode = DATA_BLIND_VAL(DATA_BLIND_DEF) |
54228+ GM_SMOD_VLAN_ENA | IPG_DATA_VAL(IPG_DATA_DEF);
54229+
54230+ if (dev->mtu > ETH_DATA_LEN)
54231+ mode |= GM_SMOD_JUMBO_ENA;
54232+
54233+ gma_write16(hw, sky2->port, GM_SERIAL_MODE, mode);
54234+
54235+ sky2_write8(hw, RB_ADDR(rxqaddr[sky2->port], RB_CTRL), RB_ENA_OP_MD);
54236+
54237+ err = sky2_rx_start(sky2);
54238+ sky2_write32(hw, B0_IMSK, hw->intr_mask);
54239+
54240+ if (err)
54241+ dev_close(dev);
54242+ else {
54243+ gma_write16(hw, sky2->port, GM_GP_CTRL, ctl);
54244+
54245+ netif_poll_enable(hw->dev[0]);
54246+ netif_wake_queue(dev);
54247+ }
54248+
54249+ return err;
54250+}
54251+
54252+/*
54253+ * Receive one packet.
54254+ * For small packets or errors, just reuse existing skb.
54255+ * For larger packets, get new buffer.
54256+ */
54257+static struct sk_buff *sky2_receive(struct sky2_port *sky2,
54258+ u16 length, u32 status)
54259+{
54260+ struct ring_info *re = sky2->rx_ring + sky2->rx_next;
54261+ struct sk_buff *skb = NULL;
54262+
54263+ if (unlikely(netif_msg_rx_status(sky2)))
54264+ printk(KERN_DEBUG PFX "%s: rx slot %u status 0x%x len %d\n",
54265+ sky2->netdev->name, sky2->rx_next, status, length);
54266+
54267+ sky2->rx_next = (sky2->rx_next + 1) % sky2->rx_pending;
54268+ prefetch(sky2->rx_ring + sky2->rx_next);
54269+
54270+ if (status & GMR_FS_ANY_ERR)
54271+ goto error;
54272+
54273+ if (!(status & GMR_FS_RX_OK))
54274+ goto resubmit;
54275+
54276+ if (length > sky2->netdev->mtu + ETH_HLEN)
54277+ goto oversize;
54278+
54279+ if (length < copybreak) {
54280+ skb = dev_alloc_skb(length + 2);
54281+ if (!skb)
54282+ goto resubmit;
54283+
54284+ skb_reserve(skb, 2);
54285+ pci_dma_sync_single_for_cpu(sky2->hw->pdev, re->mapaddr,
54286+ length, PCI_DMA_FROMDEVICE);
54287+ memcpy(skb->data, re->skb->data, length);
54288+ skb->ip_summed = re->skb->ip_summed;
54289+ skb->csum = re->skb->csum;
54290+ pci_dma_sync_single_for_device(sky2->hw->pdev, re->mapaddr,
54291+ length, PCI_DMA_FROMDEVICE);
54292+ } else {
54293+ struct sk_buff *nskb;
54294+
54295+ nskb = sky2_alloc_skb(sky2->rx_bufsize, GFP_ATOMIC);
54296+ if (!nskb)
54297+ goto resubmit;
54298+
54299+ skb = re->skb;
54300+ re->skb = nskb;
54301+ pci_unmap_single(sky2->hw->pdev, re->mapaddr,
54302+ sky2->rx_bufsize, PCI_DMA_FROMDEVICE);
54303+ prefetch(skb->data);
54304+
54305+ re->mapaddr = pci_map_single(sky2->hw->pdev, nskb->data,
54306+ sky2->rx_bufsize, PCI_DMA_FROMDEVICE);
54307+ }
54308+
54309+ skb_put(skb, length);
54310+resubmit:
54311+ re->skb->ip_summed = CHECKSUM_NONE;
54312+ sky2_rx_add(sky2, re->mapaddr);
54313+
54314+ /* Tell receiver about new buffers. */
54315+ sky2_put_idx(sky2->hw, rxqaddr[sky2->port], sky2->rx_put,
54316+ &sky2->rx_last_put, RX_LE_SIZE);
54317+
54318+ return skb;
54319+
54320+oversize:
54321+ ++sky2->net_stats.rx_over_errors;
54322+ goto resubmit;
54323+
54324+error:
54325+ ++sky2->net_stats.rx_errors;
54326+
54327+ if (netif_msg_rx_err(sky2) && net_ratelimit())
54328+ printk(KERN_INFO PFX "%s: rx error, status 0x%x length %d\n",
54329+ sky2->netdev->name, status, length);
54330+
54331+ if (status & (GMR_FS_LONG_ERR | GMR_FS_UN_SIZE))
54332+ sky2->net_stats.rx_length_errors++;
54333+ if (status & GMR_FS_FRAGMENT)
54334+ sky2->net_stats.rx_frame_errors++;
54335+ if (status & GMR_FS_CRC_ERR)
54336+ sky2->net_stats.rx_crc_errors++;
54337+ if (status & GMR_FS_RX_FF_OV)
54338+ sky2->net_stats.rx_fifo_errors++;
54339+
54340+ goto resubmit;
54341+}
54342+
54343+/*
54344+ * Check for transmit complete
54345+ */
54346+#define TX_NO_STATUS 0xffff
54347+
54348+static void sky2_tx_check(struct sky2_hw *hw, int port, u16 last)
54349+{
54350+ if (last != TX_NO_STATUS) {
54351+ struct net_device *dev = hw->dev[port];
54352+ if (dev && netif_running(dev)) {
54353+ struct sky2_port *sky2 = netdev_priv(dev);
54354+
54355+ spin_lock(&sky2->tx_lock);
54356+ sky2_tx_complete(sky2, last);
54357+ spin_unlock(&sky2->tx_lock);
54358+ }
54359+ }
54360+}
54361+
54362+/*
54363+ * Both ports share the same status interrupt; therefore there is only
54364+ * one poll routine.
54365+ */
54366+static int sky2_poll(struct net_device *dev0, int *budget)
54367+{
54368+ struct sky2_hw *hw = ((struct sky2_port *) netdev_priv(dev0))->hw;
54369+ unsigned int to_do = min(dev0->quota, *budget);
54370+ unsigned int work_done = 0;
54371+ u16 hwidx;
54372+ u16 tx_done[2] = { TX_NO_STATUS, TX_NO_STATUS };
54373+
54374+ sky2_write32(hw, STAT_CTRL, SC_STAT_CLR_IRQ);
54375+
54376+ /*
54377+ * Kick the STAT_LEV_TIMER_CTRL timer.
54378+ * This fixes my hangs on Yukon-EC (0xb6) rev 1.
54379+ * The if clause is there to start the timer only if it has been
54380+ * configured correctly and not been disabled via ethtool.
54381+ */
54382+ if (sky2_read8(hw, STAT_LEV_TIMER_CTRL) == TIM_START) {
54383+ sky2_write8(hw, STAT_LEV_TIMER_CTRL, TIM_STOP);
54384+ sky2_write8(hw, STAT_LEV_TIMER_CTRL, TIM_START);
54385+ }
54386+
54387+ hwidx = sky2_read16(hw, STAT_PUT_IDX);
54388+ BUG_ON(hwidx >= STATUS_RING_SIZE);
54389+ rmb();
54390+
54391+ while (hwidx != hw->st_idx) {
54392+ struct sky2_status_le *le = hw->st_le + hw->st_idx;
54393+ struct net_device *dev;
54394+ struct sky2_port *sky2;
54395+ struct sk_buff *skb;
54396+ u32 status;
54397+ u16 length;
54398+
54399+ le = hw->st_le + hw->st_idx;
54400+ hw->st_idx = (hw->st_idx + 1) % STATUS_RING_SIZE;
54401+ prefetch(hw->st_le + hw->st_idx);
54402+
54403+ BUG_ON(le->link >= 2);
54404+ dev = hw->dev[le->link];
54405+ if (dev == NULL || !netif_running(dev))
54406+ continue;
54407+
54408+ sky2 = netdev_priv(dev);
54409+ status = le32_to_cpu(le->status);
54410+ length = le16_to_cpu(le->length);
54411+
54412+ switch (le->opcode & ~HW_OWNER) {
54413+ case OP_RXSTAT:
54414+ skb = sky2_receive(sky2, length, status);
54415+ if (!skb)
54416+ break;
54417+
54418+ skb->dev = dev;
54419+ skb->protocol = eth_type_trans(skb, dev);
54420+ dev->last_rx = jiffies;
54421+
54422+#ifdef SKY2_VLAN_TAG_USED
54423+ if (sky2->vlgrp && (status & GMR_FS_VLAN)) {
54424+ vlan_hwaccel_receive_skb(skb,
54425+ sky2->vlgrp,
54426+ be16_to_cpu(sky2->rx_tag));
54427+ } else
54428+#endif
54429+ netif_receive_skb(skb);
54430+
54431+ if (++work_done >= to_do)
54432+ goto exit_loop;
54433+ break;
54434+
54435+#ifdef SKY2_VLAN_TAG_USED
54436+ case OP_RXVLAN:
54437+ sky2->rx_tag = length;
54438+ break;
54439+
54440+ case OP_RXCHKSVLAN:
54441+ sky2->rx_tag = length;
54442+ /* fall through */
54443+#endif
54444+ case OP_RXCHKS:
54445+ skb = sky2->rx_ring[sky2->rx_next].skb;
54446+ skb->ip_summed = CHECKSUM_HW;
54447+ skb->csum = le16_to_cpu(status);
54448+ break;
54449+
54450+ case OP_TXINDEXLE:
54451+ /* TX index reports status for both ports */
54452+ tx_done[0] = status & 0xffff;
54453+ tx_done[1] = ((status >> 24) & 0xff)
54454+ | (u16)(length & 0xf) << 8;
54455+ break;
54456+
54457+ default:
54458+ if (net_ratelimit())
54459+ printk(KERN_WARNING PFX
54460+ "unknown status opcode 0x%x\n", le->opcode);
54461+ break;
54462+ }
54463+ }
54464+
54465+exit_loop:
54466+ sky2_tx_check(hw, 0, tx_done[0]);
54467+ sky2_tx_check(hw, 1, tx_done[1]);
54468+
54469+ if (sky2_read8(hw, STAT_TX_TIMER_CTRL) == TIM_START) {
54470+ sky2_write8(hw, STAT_TX_TIMER_CTRL, TIM_STOP);
54471+ sky2_write8(hw, STAT_TX_TIMER_CTRL, TIM_START);
54472+ }
54473+
54474+ if (likely(work_done < to_do)) {
54475+ spin_lock_irq(&hw->hw_lock);
54476+ __netif_rx_complete(dev0);
54477+
54478+ hw->intr_mask |= Y2_IS_STAT_BMU;
54479+ sky2_write32(hw, B0_IMSK, hw->intr_mask);
54480+ spin_unlock_irq(&hw->hw_lock);
54481+
54482+ return 0;
54483+ } else {
54484+ *budget -= work_done;
54485+ dev0->quota -= work_done;
54486+ return 1;
54487+ }
54488+}
54489+
54490+static void sky2_hw_error(struct sky2_hw *hw, unsigned port, u32 status)
54491+{
54492+ struct net_device *dev = hw->dev[port];
54493+
54494+ if (net_ratelimit())
54495+ printk(KERN_INFO PFX "%s: hw error interrupt status 0x%x\n",
54496+ dev->name, status);
54497+
54498+ if (status & Y2_IS_PAR_RD1) {
54499+ if (net_ratelimit())
54500+ printk(KERN_ERR PFX "%s: ram data read parity error\n",
54501+ dev->name);
54502+ /* Clear IRQ */
54503+ sky2_write16(hw, RAM_BUFFER(port, B3_RI_CTRL), RI_CLR_RD_PERR);
54504+ }
54505+
54506+ if (status & Y2_IS_PAR_WR1) {
54507+ if (net_ratelimit())
54508+ printk(KERN_ERR PFX "%s: ram data write parity error\n",
54509+ dev->name);
54510+
54511+ sky2_write16(hw, RAM_BUFFER(port, B3_RI_CTRL), RI_CLR_WR_PERR);
54512+ }
54513+
54514+ if (status & Y2_IS_PAR_MAC1) {
54515+ if (net_ratelimit())
54516+ printk(KERN_ERR PFX "%s: MAC parity error\n", dev->name);
54517+ sky2_write8(hw, SK_REG(port, TX_GMF_CTRL_T), GMF_CLI_TX_PE);
54518+ }
54519+
54520+ if (status & Y2_IS_PAR_RX1) {
54521+ if (net_ratelimit())
54522+ printk(KERN_ERR PFX "%s: RX parity error\n", dev->name);
54523+ sky2_write32(hw, Q_ADDR(rxqaddr[port], Q_CSR), BMU_CLR_IRQ_PAR);
54524+ }
54525+
54526+ if (status & Y2_IS_TCP_TXA1) {
54527+ if (net_ratelimit())
54528+ printk(KERN_ERR PFX "%s: TCP segmentation error\n",
54529+ dev->name);
54530+ sky2_write32(hw, Q_ADDR(txqaddr[port], Q_CSR), BMU_CLR_IRQ_TCP);
54531+ }
54532+}
54533+
54534+static void sky2_hw_intr(struct sky2_hw *hw)
54535+{
54536+ u32 status = sky2_read32(hw, B0_HWE_ISRC);
54537+
54538+ if (status & Y2_IS_TIST_OV)
54539+ sky2_write8(hw, GMAC_TI_ST_CTRL, GMT_ST_CLR_IRQ);
54540+
54541+ if (status & (Y2_IS_MST_ERR | Y2_IS_IRQ_STAT)) {
54542+ u16 pci_err;
54543+
54544+ pci_err = sky2_pci_read16(hw, PCI_STATUS);
54545+ if (net_ratelimit())
54546+ printk(KERN_ERR PFX "%s: pci hw error (0x%x)\n",
54547+ pci_name(hw->pdev), pci_err);
54548+
54549+ sky2_write8(hw, B2_TST_CTRL1, TST_CFG_WRITE_ON);
54550+ sky2_pci_write16(hw, PCI_STATUS,
54551+ pci_err | PCI_STATUS_ERROR_BITS);
54552+ sky2_write8(hw, B2_TST_CTRL1, TST_CFG_WRITE_OFF);
54553+ }
54554+
54555+ if (status & Y2_IS_PCI_EXP) {
54556+ /* PCI-Express uncorrectable Error occurred */
54557+ u32 pex_err;
54558+
54559+ pex_err = sky2_pci_read32(hw, PEX_UNC_ERR_STAT);
54560+
54561+ if (net_ratelimit())
54562+ printk(KERN_ERR PFX "%s: pci express error (0x%x)\n",
54563+ pci_name(hw->pdev), pex_err);
54564+
54565+ /* clear the interrupt */
54566+ sky2_write32(hw, B2_TST_CTRL1, TST_CFG_WRITE_ON);
54567+ sky2_pci_write32(hw, PEX_UNC_ERR_STAT,
54568+ 0xffffffffUL);
54569+ sky2_write32(hw, B2_TST_CTRL1, TST_CFG_WRITE_OFF);
54570+
54571+ if (pex_err & PEX_FATAL_ERRORS) {
54572+ u32 hwmsk = sky2_read32(hw, B0_HWE_IMSK);
54573+ hwmsk &= ~Y2_IS_PCI_EXP;
54574+ sky2_write32(hw, B0_HWE_IMSK, hwmsk);
54575+ }
54576+ }
54577+
54578+ if (status & Y2_HWE_L1_MASK)
54579+ sky2_hw_error(hw, 0, status);
54580+ status >>= 8;
54581+ if (status & Y2_HWE_L1_MASK)
54582+ sky2_hw_error(hw, 1, status);
54583+}
54584+
54585+static void sky2_mac_intr(struct sky2_hw *hw, unsigned port)
54586+{
54587+ struct net_device *dev = hw->dev[port];
54588+ struct sky2_port *sky2 = netdev_priv(dev);
54589+ u8 status = sky2_read8(hw, SK_REG(port, GMAC_IRQ_SRC));
54590+
54591+ if (netif_msg_intr(sky2))
54592+ printk(KERN_INFO PFX "%s: mac interrupt status 0x%x\n",
54593+ dev->name, status);
54594+
54595+ if (status & GM_IS_RX_FF_OR) {
54596+ ++sky2->net_stats.rx_fifo_errors;
54597+ sky2_write8(hw, SK_REG(port, RX_GMF_CTRL_T), GMF_CLI_RX_FO);
54598+ }
54599+
54600+ if (status & GM_IS_TX_FF_UR) {
54601+ ++sky2->net_stats.tx_fifo_errors;
54602+ sky2_write8(hw, SK_REG(port, TX_GMF_CTRL_T), GMF_CLI_TX_FU);
54603+ }
54604+}
54605+
54606+static void sky2_phy_intr(struct sky2_hw *hw, unsigned port)
54607+{
54608+ struct net_device *dev = hw->dev[port];
54609+ struct sky2_port *sky2 = netdev_priv(dev);
54610+
54611+ hw->intr_mask &= ~(port == 0 ? Y2_IS_IRQ_PHY1 : Y2_IS_IRQ_PHY2);
54612+ sky2_write32(hw, B0_IMSK, hw->intr_mask);
54613+
54614+ schedule_work(&sky2->phy_task);
54615+}
54616+
54617+static irqreturn_t sky2_intr(int irq, void *dev_id, struct pt_regs *regs)
54618+{
54619+ struct sky2_hw *hw = dev_id;
54620+ struct net_device *dev0 = hw->dev[0];
54621+ u32 status;
54622+
54623+ status = sky2_read32(hw, B0_Y2_SP_ISRC2);
54624+ if (status == 0 || status == ~0)
54625+ return IRQ_NONE;
54626+
54627+ spin_lock(&hw->hw_lock);
54628+ if (status & Y2_IS_HW_ERR)
54629+ sky2_hw_intr(hw);
54630+
54631+ /* Do NAPI for Rx and Tx status */
54632+ if (status & Y2_IS_STAT_BMU) {
54633+ hw->intr_mask &= ~Y2_IS_STAT_BMU;
54634+ sky2_write32(hw, B0_IMSK, hw->intr_mask);
54635+
54636+ if (likely(__netif_rx_schedule_prep(dev0))) {
54637+ prefetch(&hw->st_le[hw->st_idx]);
54638+ __netif_rx_schedule(dev0);
54639+ }
54640+ }
54641+
54642+ if (status & Y2_IS_IRQ_PHY1)
54643+ sky2_phy_intr(hw, 0);
54644+
54645+ if (status & Y2_IS_IRQ_PHY2)
54646+ sky2_phy_intr(hw, 1);
54647+
54648+ if (status & Y2_IS_IRQ_MAC1)
54649+ sky2_mac_intr(hw, 0);
54650+
54651+ if (status & Y2_IS_IRQ_MAC2)
54652+ sky2_mac_intr(hw, 1);
54653+
54654+ sky2_write32(hw, B0_Y2_SP_ICR, 2);
54655+
54656+ spin_unlock(&hw->hw_lock);
54657+
54658+ return IRQ_HANDLED;
54659+}
54660+
54661+#ifdef CONFIG_NET_POLL_CONTROLLER
54662+static void sky2_netpoll(struct net_device *dev)
54663+{
54664+ struct sky2_port *sky2 = netdev_priv(dev);
54665+
54666+ sky2_intr(sky2->hw->pdev->irq, sky2->hw, NULL);
54667+}
54668+#endif
54669+
54670+/* Chip internal frequency for clock calculations */
54671+static inline u32 sky2_mhz(const struct sky2_hw *hw)
54672+{
54673+ switch (hw->chip_id) {
54674+ case CHIP_ID_YUKON_EC:
54675+ case CHIP_ID_YUKON_EC_U:
54676+ return 125; /* 125 MHz */
54677+ case CHIP_ID_YUKON_FE:
54678+ return 100; /* 100 MHz */
54679+ default: /* YUKON_XL */
54680+ return 156; /* 156 MHz */
54681+ }
54682+}
54683+
54684+static inline u32 sky2_us2clk(const struct sky2_hw *hw, u32 us)
54685+{
54686+ return sky2_mhz(hw) * us;
54687+}
54688+
54689+static inline u32 sky2_clk2us(const struct sky2_hw *hw, u32 clk)
54690+{
54691+ return clk / sky2_mhz(hw);
54692+}
54693+
54694+
54695+static int sky2_reset(struct sky2_hw *hw)
54696+{
54697+ u16 status;
54698+ u8 t8;
54699+ int i;
54700+
54701+ sky2_write8(hw, B0_CTST, CS_RST_CLR);
54702+
54703+ hw->chip_id = sky2_read8(hw, B2_CHIP_ID);
54704+ if (hw->chip_id < CHIP_ID_YUKON_XL || hw->chip_id > CHIP_ID_YUKON_FE) {
54705+ printk(KERN_ERR PFX "%s: unsupported chip type 0x%x\n",
54706+ pci_name(hw->pdev), hw->chip_id);
54707+ return -EOPNOTSUPP;
54708+ }
54709+
54710+ /* disable ASF */
54711+ if (hw->chip_id <= CHIP_ID_YUKON_EC) {
54712+ sky2_write8(hw, B28_Y2_ASF_STAT_CMD, Y2_ASF_RESET);
54713+ sky2_write16(hw, B0_CTST, Y2_ASF_DISABLE);
54714+ }
54715+
54716+ /* do a SW reset */
54717+ sky2_write8(hw, B0_CTST, CS_RST_SET);
54718+ sky2_write8(hw, B0_CTST, CS_RST_CLR);
54719+
54720+ /* clear PCI errors, if any */
54721+ status = sky2_pci_read16(hw, PCI_STATUS);
54722+
54723+ sky2_write8(hw, B2_TST_CTRL1, TST_CFG_WRITE_ON);
54724+ sky2_pci_write16(hw, PCI_STATUS, status | PCI_STATUS_ERROR_BITS);
54725+
54726+
54727+ sky2_write8(hw, B0_CTST, CS_MRST_CLR);
54728+
54729+ /* clear any PEX errors */
54730+ if (pci_find_capability(hw->pdev, PCI_CAP_ID_EXP))
54731+ sky2_pci_write32(hw, PEX_UNC_ERR_STAT, 0xffffffffUL);
54732+
54733+
54734+ hw->pmd_type = sky2_read8(hw, B2_PMD_TYP);
54735+ hw->ports = 1;
54736+ t8 = sky2_read8(hw, B2_Y2_HW_RES);
54737+ if ((t8 & CFG_DUAL_MAC_MSK) == CFG_DUAL_MAC_MSK) {
54738+ if (!(sky2_read8(hw, B2_Y2_CLK_GATE) & Y2_STATUS_LNK2_INAC))
54739+ ++hw->ports;
54740+ }
54741+ hw->chip_rev = (sky2_read8(hw, B2_MAC_CFG) & CFG_CHIP_R_MSK) >> 4;
54742+
54743+ sky2_set_power_state(hw, PCI_D0);
54744+
54745+ for (i = 0; i < hw->ports; i++) {
54746+ sky2_write8(hw, SK_REG(i, GMAC_LINK_CTRL), GMLC_RST_SET);
54747+ sky2_write8(hw, SK_REG(i, GMAC_LINK_CTRL), GMLC_RST_CLR);
54748+ }
54749+
54750+ sky2_write8(hw, B2_TST_CTRL1, TST_CFG_WRITE_OFF);
54751+
54752+ /* Clear I2C IRQ noise */
54753+ sky2_write32(hw, B2_I2C_IRQ, 1);
54754+
54755+ /* turn off hardware timer (unused) */
54756+ sky2_write8(hw, B2_TI_CTRL, TIM_STOP);
54757+ sky2_write8(hw, B2_TI_CTRL, TIM_CLR_IRQ);
54758+
54759+ sky2_write8(hw, B0_Y2LED, LED_STAT_ON);
54760+
54761+ /* Turn off descriptor polling */
54762+ sky2_write32(hw, B28_DPT_CTRL, DPT_STOP);
54763+
54764+ /* Turn off receive timestamp */
54765+ sky2_write8(hw, GMAC_TI_ST_CTRL, GMT_ST_STOP);
54766+ sky2_write8(hw, GMAC_TI_ST_CTRL, GMT_ST_CLR_IRQ);
54767+
54768+ /* enable the Tx Arbiters */
54769+ for (i = 0; i < hw->ports; i++)
54770+ sky2_write8(hw, SK_REG(i, TXA_CTRL), TXA_ENA_ARB);
54771+
54772+ /* Initialize ram interface */
54773+ for (i = 0; i < hw->ports; i++) {
54774+ sky2_write8(hw, RAM_BUFFER(i, B3_RI_CTRL), RI_RST_CLR);
54775+
54776+ sky2_write8(hw, RAM_BUFFER(i, B3_RI_WTO_R1), SK_RI_TO_53);
54777+ sky2_write8(hw, RAM_BUFFER(i, B3_RI_WTO_XA1), SK_RI_TO_53);
54778+ sky2_write8(hw, RAM_BUFFER(i, B3_RI_WTO_XS1), SK_RI_TO_53);
54779+ sky2_write8(hw, RAM_BUFFER(i, B3_RI_RTO_R1), SK_RI_TO_53);
54780+ sky2_write8(hw, RAM_BUFFER(i, B3_RI_RTO_XA1), SK_RI_TO_53);
54781+ sky2_write8(hw, RAM_BUFFER(i, B3_RI_RTO_XS1), SK_RI_TO_53);
54782+ sky2_write8(hw, RAM_BUFFER(i, B3_RI_WTO_R2), SK_RI_TO_53);
54783+ sky2_write8(hw, RAM_BUFFER(i, B3_RI_WTO_XA2), SK_RI_TO_53);
54784+ sky2_write8(hw, RAM_BUFFER(i, B3_RI_WTO_XS2), SK_RI_TO_53);
54785+ sky2_write8(hw, RAM_BUFFER(i, B3_RI_RTO_R2), SK_RI_TO_53);
54786+ sky2_write8(hw, RAM_BUFFER(i, B3_RI_RTO_XA2), SK_RI_TO_53);
54787+ sky2_write8(hw, RAM_BUFFER(i, B3_RI_RTO_XS2), SK_RI_TO_53);
54788+ }
54789+
54790+ sky2_write32(hw, B0_HWE_IMSK, Y2_HWE_ALL_MASK);
54791+
54792+ for (i = 0; i < hw->ports; i++)
54793+ sky2_phy_reset(hw, i);
54794+
54795+ memset(hw->st_le, 0, STATUS_LE_BYTES);
54796+ hw->st_idx = 0;
54797+
54798+ sky2_write32(hw, STAT_CTRL, SC_STAT_RST_SET);
54799+ sky2_write32(hw, STAT_CTRL, SC_STAT_RST_CLR);
54800+
54801+ sky2_write32(hw, STAT_LIST_ADDR_LO, hw->st_dma);
54802+ sky2_write32(hw, STAT_LIST_ADDR_HI, (u64) hw->st_dma >> 32);
54803+
54804+ /* Set the list last index */
54805+ sky2_write16(hw, STAT_LAST_IDX, STATUS_RING_SIZE - 1);
54806+
54807+ /* These status setup values are copied from SysKonnect's driver */
54808+ if (is_ec_a1(hw)) {
54809+ /* WA for dev. #4.3 */
54810+ sky2_write16(hw, STAT_TX_IDX_TH, 0xfff); /* Tx Threshold */
54811+
54812+ /* set Status-FIFO watermark */
54813+ sky2_write8(hw, STAT_FIFO_WM, 0x21); /* WA for dev. #4.18 */
54814+
54815+ /* set Status-FIFO ISR watermark */
54816+ sky2_write8(hw, STAT_FIFO_ISR_WM, 0x07); /* WA for dev. #4.18 */
54817+ sky2_write32(hw, STAT_TX_TIMER_INI, sky2_us2clk(hw, 10000));
54818+ } else {
54819+ sky2_write16(hw, STAT_TX_IDX_TH, 10);
54820+ sky2_write8(hw, STAT_FIFO_WM, 16);
54821+
54822+ /* set Status-FIFO ISR watermark */
54823+ if (hw->chip_id == CHIP_ID_YUKON_XL && hw->chip_rev == 0)
54824+ sky2_write8(hw, STAT_FIFO_ISR_WM, 4);
54825+ else
54826+ sky2_write8(hw, STAT_FIFO_ISR_WM, 16);
54827+
54828+ sky2_write32(hw, STAT_TX_TIMER_INI, sky2_us2clk(hw, 1000));
54829+ sky2_write32(hw, STAT_ISR_TIMER_INI, sky2_us2clk(hw, 7));
54830+ }
54831+
54832+ /* enable status unit */
54833+ sky2_write32(hw, STAT_CTRL, SC_STAT_OP_ON);
54834+
54835+ sky2_write8(hw, STAT_TX_TIMER_CTRL, TIM_START);
54836+ sky2_write8(hw, STAT_LEV_TIMER_CTRL, TIM_START);
54837+ sky2_write8(hw, STAT_ISR_TIMER_CTRL, TIM_START);
54838+
54839+ return 0;
54840+}
54841+
54842+static u32 sky2_supported_modes(const struct sky2_hw *hw)
54843+{
54844+ if (sky2_is_copper(hw)) {
54845+ u32 modes = SUPPORTED_10baseT_Half
54846+ | SUPPORTED_10baseT_Full
54847+ | SUPPORTED_100baseT_Half
54848+ | SUPPORTED_100baseT_Full
54849+ | SUPPORTED_Autoneg | SUPPORTED_TP;
54850+
54851+ if (hw->chip_id != CHIP_ID_YUKON_FE)
54852+ modes |= SUPPORTED_1000baseT_Half
54853+ | SUPPORTED_1000baseT_Full;
54854+ return modes;
54855+ } else
54856+ return SUPPORTED_1000baseT_Half
54857+ | SUPPORTED_1000baseT_Full
54858+ | SUPPORTED_Autoneg
54859+ | SUPPORTED_FIBRE;
54860+}
54861+
54862+static int sky2_get_settings(struct net_device *dev, struct ethtool_cmd *ecmd)
54863+{
54864+ struct sky2_port *sky2 = netdev_priv(dev);
54865+ struct sky2_hw *hw = sky2->hw;
54866+
54867+ ecmd->transceiver = XCVR_INTERNAL;
54868+ ecmd->supported = sky2_supported_modes(hw);
54869+ ecmd->phy_address = PHY_ADDR_MARV;
54870+ if (sky2_is_copper(hw)) {
54871+ ecmd->supported = SUPPORTED_10baseT_Half
54872+ | SUPPORTED_10baseT_Full
54873+ | SUPPORTED_100baseT_Half
54874+ | SUPPORTED_100baseT_Full
54875+ | SUPPORTED_1000baseT_Half
54876+ | SUPPORTED_1000baseT_Full
54877+ | SUPPORTED_Autoneg | SUPPORTED_TP;
54878+ ecmd->port = PORT_TP;
54879+ ecmd->speed = sky2->speed;
54880+ } else {
54881+ ecmd->speed = SPEED_1000;
54882+ ecmd->port = PORT_FIBRE;
54883+ }
54884+
54885+ ecmd->advertising = sky2->advertising;
54886+ ecmd->autoneg = sky2->autoneg;
54887+ ecmd->duplex = sky2->duplex;
54888+ return 0;
54889+}
54890+
54891+static int sky2_set_settings(struct net_device *dev, struct ethtool_cmd *ecmd)
54892+{
54893+ struct sky2_port *sky2 = netdev_priv(dev);
54894+ const struct sky2_hw *hw = sky2->hw;
54895+ u32 supported = sky2_supported_modes(hw);
54896+
54897+ if (ecmd->autoneg == AUTONEG_ENABLE) {
54898+ ecmd->advertising = supported;
54899+ sky2->duplex = -1;
54900+ sky2->speed = -1;
54901+ } else {
54902+ u32 setting;
54903+
54904+ switch (ecmd->speed) {
54905+ case SPEED_1000:
54906+ if (ecmd->duplex == DUPLEX_FULL)
54907+ setting = SUPPORTED_1000baseT_Full;
54908+ else if (ecmd->duplex == DUPLEX_HALF)
54909+ setting = SUPPORTED_1000baseT_Half;
54910+ else
54911+ return -EINVAL;
54912+ break;
54913+ case SPEED_100:
54914+ if (ecmd->duplex == DUPLEX_FULL)
54915+ setting = SUPPORTED_100baseT_Full;
54916+ else if (ecmd->duplex == DUPLEX_HALF)
54917+ setting = SUPPORTED_100baseT_Half;
54918+ else
54919+ return -EINVAL;
54920+ break;
54921+
54922+ case SPEED_10:
54923+ if (ecmd->duplex == DUPLEX_FULL)
54924+ setting = SUPPORTED_10baseT_Full;
54925+ else if (ecmd->duplex == DUPLEX_HALF)
54926+ setting = SUPPORTED_10baseT_Half;
54927+ else
54928+ return -EINVAL;
54929+ break;
54930+ default:
54931+ return -EINVAL;
54932+ }
54933+
54934+ if ((setting & supported) == 0)
54935+ return -EINVAL;
54936+
54937+ sky2->speed = ecmd->speed;
54938+ sky2->duplex = ecmd->duplex;
54939+ }
54940+
54941+ sky2->autoneg = ecmd->autoneg;
54942+ sky2->advertising = ecmd->advertising;
54943+
54944+ if (netif_running(dev))
54945+ sky2_phy_reinit(sky2);
54946+
54947+ return 0;
54948+}
54949+
54950+static void sky2_get_drvinfo(struct net_device *dev,
54951+ struct ethtool_drvinfo *info)
54952+{
54953+ struct sky2_port *sky2 = netdev_priv(dev);
54954+
54955+ strcpy(info->driver, DRV_NAME);
54956+ strcpy(info->version, DRV_VERSION);
54957+ strcpy(info->fw_version, "N/A");
54958+ strcpy(info->bus_info, pci_name(sky2->hw->pdev));
54959+}
54960+
54961+static const struct sky2_stat {
54962+ char name[ETH_GSTRING_LEN];
54963+ u16 offset;
54964+} sky2_stats[] = {
54965+ { "tx_bytes", GM_TXO_OK_HI },
54966+ { "rx_bytes", GM_RXO_OK_HI },
54967+ { "tx_broadcast", GM_TXF_BC_OK },
54968+ { "rx_broadcast", GM_RXF_BC_OK },
54969+ { "tx_multicast", GM_TXF_MC_OK },
54970+ { "rx_multicast", GM_RXF_MC_OK },
54971+ { "tx_unicast", GM_TXF_UC_OK },
54972+ { "rx_unicast", GM_RXF_UC_OK },
54973+ { "tx_mac_pause", GM_TXF_MPAUSE },
54974+ { "rx_mac_pause", GM_RXF_MPAUSE },
54975+ { "collisions", GM_TXF_SNG_COL },
54976+ { "late_collision",GM_TXF_LAT_COL },
54977+ { "aborted", GM_TXF_ABO_COL },
54978+ { "multi_collisions", GM_TXF_MUL_COL },
54979+ { "fifo_underrun", GM_TXE_FIFO_UR },
54980+ { "fifo_overflow", GM_RXE_FIFO_OV },
54981+ { "rx_toolong", GM_RXF_LNG_ERR },
54982+ { "rx_jabber", GM_RXF_JAB_PKT },
54983+ { "rx_runt", GM_RXE_FRAG },
54984+ { "rx_too_long", GM_RXF_LNG_ERR },
54985+ { "rx_fcs_error", GM_RXF_FCS_ERR },
54986+};
54987+
54988+static u32 sky2_get_rx_csum(struct net_device *dev)
54989+{
54990+ struct sky2_port *sky2 = netdev_priv(dev);
54991+
54992+ return sky2->rx_csum;
54993+}
54994+
54995+static int sky2_set_rx_csum(struct net_device *dev, u32 data)
54996+{
54997+ struct sky2_port *sky2 = netdev_priv(dev);
54998+
54999+ sky2->rx_csum = data;
55000+
55001+ sky2_write32(sky2->hw, Q_ADDR(rxqaddr[sky2->port], Q_CSR),
55002+ data ? BMU_ENA_RX_CHKSUM : BMU_DIS_RX_CHKSUM);
55003+
55004+ return 0;
55005+}
55006+
55007+static u32 sky2_get_msglevel(struct net_device *netdev)
55008+{
55009+ struct sky2_port *sky2 = netdev_priv(netdev);
55010+ return sky2->msg_enable;
55011+}
55012+
55013+static int sky2_nway_reset(struct net_device *dev)
55014+{
55015+ struct sky2_port *sky2 = netdev_priv(dev);
55016+
55017+ if (sky2->autoneg != AUTONEG_ENABLE)
55018+ return -EINVAL;
55019+
55020+ sky2_phy_reinit(sky2);
55021+
55022+ return 0;
55023+}
55024+
55025+static void sky2_phy_stats(struct sky2_port *sky2, u64 * data, unsigned count)
55026+{
55027+ struct sky2_hw *hw = sky2->hw;
55028+ unsigned port = sky2->port;
55029+ int i;
55030+
55031+ data[0] = (u64) gma_read32(hw, port, GM_TXO_OK_HI) << 32
55032+ | (u64) gma_read32(hw, port, GM_TXO_OK_LO);
55033+ data[1] = (u64) gma_read32(hw, port, GM_RXO_OK_HI) << 32
55034+ | (u64) gma_read32(hw, port, GM_RXO_OK_LO);
55035+
55036+ for (i = 2; i < count; i++)
55037+ data[i] = (u64) gma_read32(hw, port, sky2_stats[i].offset);
55038+}
55039+
55040+static void sky2_set_msglevel(struct net_device *netdev, u32 value)
55041+{
55042+ struct sky2_port *sky2 = netdev_priv(netdev);
55043+ sky2->msg_enable = value;
55044+}
55045+
55046+static int sky2_get_stats_count(struct net_device *dev)
55047+{
55048+ return ARRAY_SIZE(sky2_stats);
55049+}
55050+
55051+static void sky2_get_ethtool_stats(struct net_device *dev,
55052+ struct ethtool_stats *stats, u64 * data)
55053+{
55054+ struct sky2_port *sky2 = netdev_priv(dev);
55055+
55056+ sky2_phy_stats(sky2, data, ARRAY_SIZE(sky2_stats));
55057+}
55058+
55059+static void sky2_get_strings(struct net_device *dev, u32 stringset, u8 * data)
55060+{
55061+ int i;
55062+
55063+ switch (stringset) {
55064+ case ETH_SS_STATS:
55065+ for (i = 0; i < ARRAY_SIZE(sky2_stats); i++)
55066+ memcpy(data + i * ETH_GSTRING_LEN,
55067+ sky2_stats[i].name, ETH_GSTRING_LEN);
55068+ break;
55069+ }
55070+}
55071+
55072+/* Use hardware MIB variables for critical path statistics and
55073+ * transmit feedback not reported at interrupt.
55074+ * Other errors are accounted for in interrupt handler.
55075+ */
55076+static struct net_device_stats *sky2_get_stats(struct net_device *dev)
55077+{
55078+ struct sky2_port *sky2 = netdev_priv(dev);
55079+ u64 data[13];
55080+
55081+ sky2_phy_stats(sky2, data, ARRAY_SIZE(data));
55082+
55083+ sky2->net_stats.tx_bytes = data[0];
55084+ sky2->net_stats.rx_bytes = data[1];
55085+ sky2->net_stats.tx_packets = data[2] + data[4] + data[6];
55086+ sky2->net_stats.rx_packets = data[3] + data[5] + data[7];
55087+ sky2->net_stats.multicast = data[5] + data[7];
55088+ sky2->net_stats.collisions = data[10];
55089+ sky2->net_stats.tx_aborted_errors = data[12];
55090+
55091+ return &sky2->net_stats;
55092+}
55093+
55094+static int sky2_set_mac_address(struct net_device *dev, void *p)
55095+{
55096+ struct sky2_port *sky2 = netdev_priv(dev);
55097+ struct sky2_hw *hw = sky2->hw;
55098+ unsigned port = sky2->port;
55099+ const struct sockaddr *addr = p;
55100+
55101+ if (!is_valid_ether_addr(addr->sa_data))
55102+ return -EADDRNOTAVAIL;
55103+
55104+ memcpy(dev->dev_addr, addr->sa_data, ETH_ALEN);
55105+ memcpy_toio(hw->regs + B2_MAC_1 + port * 8,
55106+ dev->dev_addr, ETH_ALEN);
55107+ memcpy_toio(hw->regs + B2_MAC_2 + port * 8,
55108+ dev->dev_addr, ETH_ALEN);
55109+
55110+ /* virtual address for data */
55111+ gma_set_addr(hw, port, GM_SRC_ADDR_2L, dev->dev_addr);
55112+
55113+ /* physical address: used for pause frames */
55114+ gma_set_addr(hw, port, GM_SRC_ADDR_1L, dev->dev_addr);
55115+
55116+ return 0;
55117+}
55118+
55119+static void sky2_set_multicast(struct net_device *dev)
55120+{
55121+ struct sky2_port *sky2 = netdev_priv(dev);
55122+ struct sky2_hw *hw = sky2->hw;
55123+ unsigned port = sky2->port;
55124+ struct dev_mc_list *list = dev->mc_list;
55125+ u16 reg;
55126+ u8 filter[8];
55127+
55128+ memset(filter, 0, sizeof(filter));
55129+
55130+ reg = gma_read16(hw, port, GM_RX_CTRL);
55131+ reg |= GM_RXCR_UCF_ENA;
55132+
55133+ if (dev->flags & IFF_PROMISC) /* promiscuous */
55134+ reg &= ~(GM_RXCR_UCF_ENA | GM_RXCR_MCF_ENA);
55135+ else if ((dev->flags & IFF_ALLMULTI) || dev->mc_count > 16) /* all multicast */
55136+ memset(filter, 0xff, sizeof(filter));
55137+ else if (dev->mc_count == 0) /* no multicast */
55138+ reg &= ~GM_RXCR_MCF_ENA;
55139+ else {
55140+ int i;
55141+ reg |= GM_RXCR_MCF_ENA;
55142+
55143+ for (i = 0; list && i < dev->mc_count; i++, list = list->next) {
55144+ u32 bit = ether_crc(ETH_ALEN, list->dmi_addr) & 0x3f;
55145+ filter[bit / 8] |= 1 << (bit % 8);
55146+ }
55147+ }
55148+
55149+ gma_write16(hw, port, GM_MC_ADDR_H1,
55150+ (u16) filter[0] | ((u16) filter[1] << 8));
55151+ gma_write16(hw, port, GM_MC_ADDR_H2,
55152+ (u16) filter[2] | ((u16) filter[3] << 8));
55153+ gma_write16(hw, port, GM_MC_ADDR_H3,
55154+ (u16) filter[4] | ((u16) filter[5] << 8));
55155+ gma_write16(hw, port, GM_MC_ADDR_H4,
55156+ (u16) filter[6] | ((u16) filter[7] << 8));
55157+
55158+ gma_write16(hw, port, GM_RX_CTRL, reg);
55159+}
55160+
55161+/* Can have one global because blinking is controlled by
55162+ * ethtool and that is always under the RTNL mutex
55163+ */
55164+static void sky2_led(struct sky2_hw *hw, unsigned port, int on)
55165+{
55166+ u16 pg;
55167+
55168+ switch (hw->chip_id) {
55169+ case CHIP_ID_YUKON_XL:
55170+ pg = gm_phy_read(hw, port, PHY_MARV_EXT_ADR);
55171+ gm_phy_write(hw, port, PHY_MARV_EXT_ADR, 3);
55172+ gm_phy_write(hw, port, PHY_MARV_PHY_CTRL,
55173+ on ? (PHY_M_LEDC_LOS_CTRL(1) |
55174+ PHY_M_LEDC_INIT_CTRL(7) |
55175+ PHY_M_LEDC_STA1_CTRL(7) |
55176+ PHY_M_LEDC_STA0_CTRL(7))
55177+ : 0);
55178+
55179+ gm_phy_write(hw, port, PHY_MARV_EXT_ADR, pg);
55180+ break;
55181+
55182+ default:
55183+ gm_phy_write(hw, port, PHY_MARV_LED_CTRL, 0);
55184+ gm_phy_write(hw, port, PHY_MARV_LED_OVER,
55185+ on ? PHY_M_LED_MO_DUP(MO_LED_ON) |
55186+ PHY_M_LED_MO_10(MO_LED_ON) |
55187+ PHY_M_LED_MO_100(MO_LED_ON) |
55188+ PHY_M_LED_MO_1000(MO_LED_ON) |
55189+ PHY_M_LED_MO_RX(MO_LED_ON)
55190+ : PHY_M_LED_MO_DUP(MO_LED_OFF) |
55191+ PHY_M_LED_MO_10(MO_LED_OFF) |
55192+ PHY_M_LED_MO_100(MO_LED_OFF) |
55193+ PHY_M_LED_MO_1000(MO_LED_OFF) |
55194+ PHY_M_LED_MO_RX(MO_LED_OFF));
55195+
55196+ }
55197+}
55198+
55199+/* blink LEDs for finding board */
55200+static int sky2_phys_id(struct net_device *dev, u32 data)
55201+{
55202+ struct sky2_port *sky2 = netdev_priv(dev);
55203+ struct sky2_hw *hw = sky2->hw;
55204+ unsigned port = sky2->port;
55205+ u16 ledctrl, ledover = 0;
55206+ long ms;
55207+ int interrupted;
55208+ int onoff = 1;
55209+
55210+ if (!data || data > (u32) (MAX_SCHEDULE_TIMEOUT / HZ))
55211+ ms = jiffies_to_msecs(MAX_SCHEDULE_TIMEOUT);
55212+ else
55213+ ms = data * 1000;
55214+
55215+ /* save initial values */
55216+ down(&sky2->phy_sema);
55217+ if (hw->chip_id == CHIP_ID_YUKON_XL) {
55218+ u16 pg = gm_phy_read(hw, port, PHY_MARV_EXT_ADR);
55219+ gm_phy_write(hw, port, PHY_MARV_EXT_ADR, 3);
55220+ ledctrl = gm_phy_read(hw, port, PHY_MARV_PHY_CTRL);
55221+ gm_phy_write(hw, port, PHY_MARV_EXT_ADR, pg);
55222+ } else {
55223+ ledctrl = gm_phy_read(hw, port, PHY_MARV_LED_CTRL);
55224+ ledover = gm_phy_read(hw, port, PHY_MARV_LED_OVER);
55225+ }
55226+
55227+ interrupted = 0;
55228+ while (!interrupted && ms > 0) {
55229+ sky2_led(hw, port, onoff);
55230+ onoff = !onoff;
55231+
55232+ up(&sky2->phy_sema);
55233+ interrupted = msleep_interruptible(250);
55234+ down(&sky2->phy_sema);
55235+
55236+ ms -= 250;
55237+ }
55238+
55239+ /* resume regularly scheduled programming */
55240+ if (hw->chip_id == CHIP_ID_YUKON_XL) {
55241+ u16 pg = gm_phy_read(hw, port, PHY_MARV_EXT_ADR);
55242+ gm_phy_write(hw, port, PHY_MARV_EXT_ADR, 3);
55243+ gm_phy_write(hw, port, PHY_MARV_PHY_CTRL, ledctrl);
55244+ gm_phy_write(hw, port, PHY_MARV_EXT_ADR, pg);
55245+ } else {
55246+ gm_phy_write(hw, port, PHY_MARV_LED_CTRL, ledctrl);
55247+ gm_phy_write(hw, port, PHY_MARV_LED_OVER, ledover);
55248+ }
55249+ up(&sky2->phy_sema);
55250+
55251+ return 0;
55252+}
55253+
55254+static void sky2_get_pauseparam(struct net_device *dev,
55255+ struct ethtool_pauseparam *ecmd)
55256+{
55257+ struct sky2_port *sky2 = netdev_priv(dev);
55258+
55259+ ecmd->tx_pause = sky2->tx_pause;
55260+ ecmd->rx_pause = sky2->rx_pause;
55261+ ecmd->autoneg = sky2->autoneg;
55262+}
55263+
55264+static int sky2_set_pauseparam(struct net_device *dev,
55265+ struct ethtool_pauseparam *ecmd)
55266+{
55267+ struct sky2_port *sky2 = netdev_priv(dev);
55268+ int err = 0;
55269+
55270+ sky2->autoneg = ecmd->autoneg;
55271+ sky2->tx_pause = ecmd->tx_pause != 0;
55272+ sky2->rx_pause = ecmd->rx_pause != 0;
55273+
55274+ sky2_phy_reinit(sky2);
55275+
55276+ return err;
55277+}
55278+
55279+#ifdef CONFIG_PM
55280+static void sky2_get_wol(struct net_device *dev, struct ethtool_wolinfo *wol)
55281+{
55282+ struct sky2_port *sky2 = netdev_priv(dev);
55283+
55284+ wol->supported = WAKE_MAGIC;
55285+ wol->wolopts = sky2->wol ? WAKE_MAGIC : 0;
55286+}
55287+
55288+static int sky2_set_wol(struct net_device *dev, struct ethtool_wolinfo *wol)
55289+{
55290+ struct sky2_port *sky2 = netdev_priv(dev);
55291+ struct sky2_hw *hw = sky2->hw;
55292+
55293+ if (wol->wolopts != WAKE_MAGIC && wol->wolopts != 0)
55294+ return -EOPNOTSUPP;
55295+
55296+ sky2->wol = wol->wolopts == WAKE_MAGIC;
55297+
55298+ if (sky2->wol) {
55299+ memcpy_toio(hw->regs + WOL_MAC_ADDR, dev->dev_addr, ETH_ALEN);
55300+
55301+ sky2_write16(hw, WOL_CTRL_STAT,
55302+ WOL_CTL_ENA_PME_ON_MAGIC_PKT |
55303+ WOL_CTL_ENA_MAGIC_PKT_UNIT);
55304+ } else
55305+ sky2_write16(hw, WOL_CTRL_STAT, WOL_CTL_DEFAULT);
55306+
55307+ return 0;
55308+}
55309+#endif
55310+
55311+static int sky2_get_coalesce(struct net_device *dev,
55312+ struct ethtool_coalesce *ecmd)
55313+{
55314+ struct sky2_port *sky2 = netdev_priv(dev);
55315+ struct sky2_hw *hw = sky2->hw;
55316+
55317+ if (sky2_read8(hw, STAT_TX_TIMER_CTRL) == TIM_STOP)
55318+ ecmd->tx_coalesce_usecs = 0;
55319+ else {
55320+ u32 clks = sky2_read32(hw, STAT_TX_TIMER_INI);
55321+ ecmd->tx_coalesce_usecs = sky2_clk2us(hw, clks);
55322+ }
55323+ ecmd->tx_max_coalesced_frames = sky2_read16(hw, STAT_TX_IDX_TH);
55324+
55325+ if (sky2_read8(hw, STAT_LEV_TIMER_CTRL) == TIM_STOP)
55326+ ecmd->rx_coalesce_usecs = 0;
55327+ else {
55328+ u32 clks = sky2_read32(hw, STAT_LEV_TIMER_INI);
55329+ ecmd->rx_coalesce_usecs = sky2_clk2us(hw, clks);
55330+ }
55331+ ecmd->rx_max_coalesced_frames = sky2_read8(hw, STAT_FIFO_WM);
55332+
55333+ if (sky2_read8(hw, STAT_ISR_TIMER_CTRL) == TIM_STOP)
55334+ ecmd->rx_coalesce_usecs_irq = 0;
55335+ else {
55336+ u32 clks = sky2_read32(hw, STAT_ISR_TIMER_INI);
55337+ ecmd->rx_coalesce_usecs_irq = sky2_clk2us(hw, clks);
55338+ }
55339+
55340+ ecmd->rx_max_coalesced_frames_irq = sky2_read8(hw, STAT_FIFO_ISR_WM);
55341+
55342+ return 0;
55343+}
55344+
55345+/* Note: this affects both ports */
55346+static int sky2_set_coalesce(struct net_device *dev,
55347+ struct ethtool_coalesce *ecmd)
55348+{
55349+ struct sky2_port *sky2 = netdev_priv(dev);
55350+ struct sky2_hw *hw = sky2->hw;
55351+ const u32 tmin = sky2_clk2us(hw, 1);
55352+ const u32 tmax = 5000;
55353+
55354+ if (ecmd->tx_coalesce_usecs != 0 &&
55355+ (ecmd->tx_coalesce_usecs < tmin || ecmd->tx_coalesce_usecs > tmax))
55356+ return -EINVAL;
55357+
55358+ if (ecmd->rx_coalesce_usecs != 0 &&
55359+ (ecmd->rx_coalesce_usecs < tmin || ecmd->rx_coalesce_usecs > tmax))
55360+ return -EINVAL;
55361+
55362+ if (ecmd->rx_coalesce_usecs_irq != 0 &&
55363+ (ecmd->rx_coalesce_usecs_irq < tmin || ecmd->rx_coalesce_usecs_irq > tmax))
55364+ return -EINVAL;
55365+
55366+ if (ecmd->tx_max_coalesced_frames >= TX_RING_SIZE-1)
55367+ return -EINVAL;
55368+ if (ecmd->rx_max_coalesced_frames > RX_MAX_PENDING)
55369+ return -EINVAL;
55370+ if (ecmd->rx_max_coalesced_frames_irq > RX_MAX_PENDING)
55371+ return -EINVAL;
55372+
55373+ if (ecmd->tx_coalesce_usecs == 0)
55374+ sky2_write8(hw, STAT_TX_TIMER_CTRL, TIM_STOP);
55375+ else {
55376+ sky2_write32(hw, STAT_TX_TIMER_INI,
55377+ sky2_us2clk(hw, ecmd->tx_coalesce_usecs));
55378+ sky2_write8(hw, STAT_TX_TIMER_CTRL, TIM_START);
55379+ }
55380+ sky2_write16(hw, STAT_TX_IDX_TH, ecmd->tx_max_coalesced_frames);
55381+
55382+ if (ecmd->rx_coalesce_usecs == 0)
55383+ sky2_write8(hw, STAT_LEV_TIMER_CTRL, TIM_STOP);
55384+ else {
55385+ sky2_write32(hw, STAT_LEV_TIMER_INI,
55386+ sky2_us2clk(hw, ecmd->rx_coalesce_usecs));
55387+ sky2_write8(hw, STAT_LEV_TIMER_CTRL, TIM_START);
55388+ }
55389+ sky2_write8(hw, STAT_FIFO_WM, ecmd->rx_max_coalesced_frames);
55390+
55391+ if (ecmd->rx_coalesce_usecs_irq == 0)
55392+ sky2_write8(hw, STAT_ISR_TIMER_CTRL, TIM_STOP);
55393+ else {
55394+ sky2_write32(hw, STAT_ISR_TIMER_INI,
55395+ sky2_us2clk(hw, ecmd->rx_coalesce_usecs_irq));
55396+ sky2_write8(hw, STAT_ISR_TIMER_CTRL, TIM_START);
55397+ }
55398+ sky2_write8(hw, STAT_FIFO_ISR_WM, ecmd->rx_max_coalesced_frames_irq);
55399+ return 0;
55400+}
55401+
55402+static void sky2_get_ringparam(struct net_device *dev,
55403+ struct ethtool_ringparam *ering)
55404+{
55405+ struct sky2_port *sky2 = netdev_priv(dev);
55406+
55407+ ering->rx_max_pending = RX_MAX_PENDING;
55408+ ering->rx_mini_max_pending = 0;
55409+ ering->rx_jumbo_max_pending = 0;
55410+ ering->tx_max_pending = TX_RING_SIZE - 1;
55411+
55412+ ering->rx_pending = sky2->rx_pending;
55413+ ering->rx_mini_pending = 0;
55414+ ering->rx_jumbo_pending = 0;
55415+ ering->tx_pending = sky2->tx_pending;
55416+}
55417+
55418+static int sky2_set_ringparam(struct net_device *dev,
55419+ struct ethtool_ringparam *ering)
55420+{
55421+ struct sky2_port *sky2 = netdev_priv(dev);
55422+ int err = 0;
55423+
55424+ if (ering->rx_pending > RX_MAX_PENDING ||
55425+ ering->rx_pending < 8 ||
55426+ ering->tx_pending < MAX_SKB_TX_LE ||
55427+ ering->tx_pending > TX_RING_SIZE - 1)
55428+ return -EINVAL;
55429+
55430+ if (netif_running(dev))
55431+ sky2_down(dev);
55432+
55433+ sky2->rx_pending = ering->rx_pending;
55434+ sky2->tx_pending = ering->tx_pending;
55435+
55436+ if (netif_running(dev)) {
55437+ err = sky2_up(dev);
55438+ if (err)
55439+ dev_close(dev);
55440+ else
55441+ sky2_set_multicast(dev);
55442+ }
55443+
55444+ return err;
55445+}
55446+
55447+static int sky2_get_regs_len(struct net_device *dev)
55448+{
55449+ return 0x4000;
55450+}
55451+
55452+/*
55453+ * Returns copy of control register region
55454+ * Note: access to the RAM address register set will cause timeouts.
55455+ */
55456+static void sky2_get_regs(struct net_device *dev, struct ethtool_regs *regs,
55457+ void *p)
55458+{
55459+ const struct sky2_port *sky2 = netdev_priv(dev);
55460+ const void __iomem *io = sky2->hw->regs;
55461+
55462+ BUG_ON(regs->len < B3_RI_WTO_R1);
55463+ regs->version = 1;
55464+ memset(p, 0, regs->len);
55465+
55466+ memcpy_fromio(p, io, B3_RAM_ADDR);
55467+
55468+ memcpy_fromio(p + B3_RI_WTO_R1,
55469+ io + B3_RI_WTO_R1,
55470+ regs->len - B3_RI_WTO_R1);
55471+}
55472+
55473+static struct ethtool_ops sky2_ethtool_ops = {
55474+ .get_settings = sky2_get_settings,
55475+ .set_settings = sky2_set_settings,
55476+ .get_drvinfo = sky2_get_drvinfo,
55477+ .get_msglevel = sky2_get_msglevel,
55478+ .set_msglevel = sky2_set_msglevel,
55479+ .nway_reset = sky2_nway_reset,
55480+ .get_regs_len = sky2_get_regs_len,
55481+ .get_regs = sky2_get_regs,
55482+ .get_link = ethtool_op_get_link,
55483+ .get_sg = ethtool_op_get_sg,
55484+ .set_sg = ethtool_op_set_sg,
55485+ .get_tx_csum = ethtool_op_get_tx_csum,
55486+ .set_tx_csum = ethtool_op_set_tx_csum,
55487+ .get_tso = ethtool_op_get_tso,
55488+ .set_tso = ethtool_op_set_tso,
55489+ .get_rx_csum = sky2_get_rx_csum,
55490+ .set_rx_csum = sky2_set_rx_csum,
55491+ .get_strings = sky2_get_strings,
55492+ .get_coalesce = sky2_get_coalesce,
55493+ .set_coalesce = sky2_set_coalesce,
55494+ .get_ringparam = sky2_get_ringparam,
55495+ .set_ringparam = sky2_set_ringparam,
55496+ .get_pauseparam = sky2_get_pauseparam,
55497+ .set_pauseparam = sky2_set_pauseparam,
55498+#ifdef CONFIG_PM
55499+ .get_wol = sky2_get_wol,
55500+ .set_wol = sky2_set_wol,
55501+#endif
55502+ .phys_id = sky2_phys_id,
55503+ .get_stats_count = sky2_get_stats_count,
55504+ .get_ethtool_stats = sky2_get_ethtool_stats,
55505+ .get_perm_addr = ethtool_op_get_perm_addr,
55506+};
55507+
55508+/* Initialize network device */
55509+static __devinit struct net_device *sky2_init_netdev(struct sky2_hw *hw,
55510+ unsigned port, int highmem)
55511+{
55512+ struct sky2_port *sky2;
55513+ struct net_device *dev = alloc_etherdev(sizeof(*sky2));
55514+
55515+ if (!dev) {
55516+ printk(KERN_ERR "sky2 etherdev alloc failed");
55517+ return NULL;
55518+ }
55519+
55520+ SET_MODULE_OWNER(dev);
55521+ SET_NETDEV_DEV(dev, &hw->pdev->dev);
55522+ dev->irq = hw->pdev->irq;
55523+ dev->open = sky2_up;
55524+ dev->stop = sky2_down;
55525+ dev->do_ioctl = sky2_ioctl;
55526+ dev->hard_start_xmit = sky2_xmit_frame;
55527+ dev->get_stats = sky2_get_stats;
55528+ dev->set_multicast_list = sky2_set_multicast;
55529+ dev->set_mac_address = sky2_set_mac_address;
55530+ dev->change_mtu = sky2_change_mtu;
55531+ SET_ETHTOOL_OPS(dev, &sky2_ethtool_ops);
55532+ dev->tx_timeout = sky2_tx_timeout;
55533+ dev->watchdog_timeo = TX_WATCHDOG;
55534+ if (port == 0)
55535+ dev->poll = sky2_poll;
55536+ dev->weight = NAPI_WEIGHT;
55537+#ifdef CONFIG_NET_POLL_CONTROLLER
55538+ dev->poll_controller = sky2_netpoll;
55539+#endif
55540+
55541+ sky2 = netdev_priv(dev);
55542+ sky2->netdev = dev;
55543+ sky2->hw = hw;
55544+ sky2->msg_enable = netif_msg_init(debug, default_msg);
55545+
55546+ spin_lock_init(&sky2->tx_lock);
55547+ /* Auto speed and flow control */
55548+ sky2->autoneg = AUTONEG_ENABLE;
55549+ sky2->tx_pause = 1;
55550+ sky2->rx_pause = 1;
55551+ sky2->duplex = -1;
55552+ sky2->speed = -1;
55553+ sky2->advertising = sky2_supported_modes(hw);
55554+
55555+ /* Receive checksum disabled for Yukon XL
55556+ * because of observed problems with incorrect
55557+ * values when multiple packets are received in one interrupt
55558+ */
55559+ sky2->rx_csum = (hw->chip_id != CHIP_ID_YUKON_XL);
55560+
55561+ INIT_WORK(&sky2->phy_task, sky2_phy_task, sky2);
55562+ init_MUTEX(&sky2->phy_sema);
55563+ sky2->tx_pending = TX_DEF_PENDING;
55564+ sky2->rx_pending = is_ec_a1(hw) ? 8 : RX_DEF_PENDING;
55565+ sky2->rx_bufsize = sky2_buf_size(ETH_DATA_LEN);
55566+
55567+ hw->dev[port] = dev;
55568+
55569+ sky2->port = port;
55570+
55571+ dev->features |= NETIF_F_LLTX;
55572+ if (hw->chip_id != CHIP_ID_YUKON_EC_U)
55573+ dev->features |= NETIF_F_TSO;
55574+ if (highmem)
55575+ dev->features |= NETIF_F_HIGHDMA;
55576+ dev->features |= NETIF_F_IP_CSUM | NETIF_F_SG;
55577+
55578+#ifdef SKY2_VLAN_TAG_USED
55579+ dev->features |= NETIF_F_HW_VLAN_TX | NETIF_F_HW_VLAN_RX;
55580+ dev->vlan_rx_register = sky2_vlan_rx_register;
55581+ dev->vlan_rx_kill_vid = sky2_vlan_rx_kill_vid;
55582+#endif
55583+
55584+ /* read the mac address */
55585+ memcpy_fromio(dev->dev_addr, hw->regs + B2_MAC_1 + port * 8, ETH_ALEN);
55586+ memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
55587+
55588+ /* device is off until link detection */
55589+ netif_carrier_off(dev);
55590+ netif_stop_queue(dev);
55591+
55592+ return dev;
55593+}
55594+
55595+static void __devinit sky2_show_addr(struct net_device *dev)
55596+{
55597+ const struct sky2_port *sky2 = netdev_priv(dev);
55598+
55599+ if (netif_msg_probe(sky2))
55600+ printk(KERN_INFO PFX "%s: addr %02x:%02x:%02x:%02x:%02x:%02x\n",
55601+ dev->name,
55602+ dev->dev_addr[0], dev->dev_addr[1], dev->dev_addr[2],
55603+ dev->dev_addr[3], dev->dev_addr[4], dev->dev_addr[5]);
55604+}
55605+
55606+static int __devinit sky2_probe(struct pci_dev *pdev,
55607+ const struct pci_device_id *ent)
55608+{
55609+ struct net_device *dev, *dev1 = NULL;
55610+ struct sky2_hw *hw;
55611+ int err, pm_cap, using_dac = 0;
55612+
55613+ err = pci_enable_device(pdev);
55614+ if (err) {
55615+ printk(KERN_ERR PFX "%s cannot enable PCI device\n",
55616+ pci_name(pdev));
55617+ goto err_out;
55618+ }
55619+
55620+ err = pci_request_regions(pdev, DRV_NAME);
55621+ if (err) {
55622+ printk(KERN_ERR PFX "%s cannot obtain PCI resources\n",
55623+ pci_name(pdev));
55624+ goto err_out;
55625+ }
55626+
55627+ pci_set_master(pdev);
55628+
55629+ /* Find power-management capability. */
55630+ pm_cap = pci_find_capability(pdev, PCI_CAP_ID_PM);
55631+ if (pm_cap == 0) {
55632+ printk(KERN_ERR PFX "Cannot find PowerManagement capability, "
55633+ "aborting.\n");
55634+ err = -EIO;
55635+ goto err_out_free_regions;
55636+ }
55637+
55638+ if (sizeof(dma_addr_t) > sizeof(u32) &&
55639+ !(err = pci_set_dma_mask(pdev, DMA_64BIT_MASK))) {
55640+ using_dac = 1;
55641+ err = pci_set_consistent_dma_mask(pdev, DMA_64BIT_MASK);
55642+ if (err < 0) {
55643+ printk(KERN_ERR PFX "%s unable to obtain 64 bit DMA "
55644+ "for consistent allocations\n", pci_name(pdev));
55645+ goto err_out_free_regions;
55646+ }
55647+
55648+ } else {
55649+ err = pci_set_dma_mask(pdev, DMA_32BIT_MASK);
55650+ if (err) {
55651+ printk(KERN_ERR PFX "%s no usable DMA configuration\n",
55652+ pci_name(pdev));
55653+ goto err_out_free_regions;
55654+ }
55655+ }
55656+
55657+ err = -ENOMEM;
55658+ hw = kzalloc(sizeof(*hw), GFP_KERNEL);
55659+ if (!hw) {
55660+ printk(KERN_ERR PFX "%s: cannot allocate hardware struct\n",
55661+ pci_name(pdev));
55662+ goto err_out_free_regions;
55663+ }
55664+
55665+ hw->pdev = pdev;
55666+
55667+ hw->regs = ioremap_nocache(pci_resource_start(pdev, 0), 0x4000);
55668+ if (!hw->regs) {
55669+ printk(KERN_ERR PFX "%s: cannot map device registers\n",
55670+ pci_name(pdev));
55671+ goto err_out_free_hw;
55672+ }
55673+ hw->pm_cap = pm_cap;
55674+ spin_lock_init(&hw->hw_lock);
55675+
55676+#ifdef __BIG_ENDIAN
55677+ /* byte swap descriptors in hardware */
55678+ {
55679+ u32 reg;
55680+
55681+ reg = sky2_pci_read32(hw, PCI_DEV_REG2);
55682+ reg |= PCI_REV_DESC;
55683+ sky2_pci_write32(hw, PCI_DEV_REG2, reg);
55684+ }
55685+#endif
55686+
55687+ /* ring for status responses */
55688+ hw->st_le = pci_alloc_consistent(hw->pdev, STATUS_LE_BYTES,
55689+ &hw->st_dma);
55690+ if (!hw->st_le)
55691+ goto err_out_iounmap;
55692+
55693+ err = sky2_reset(hw);
55694+ if (err)
55695+ goto err_out_iounmap;
55696+
55697+ printk(KERN_INFO PFX "v%s addr 0x%lx irq %d Yukon-%s (0x%x) rev %d\n",
55698+ DRV_VERSION, pci_resource_start(pdev, 0), pdev->irq,
55699+ yukon2_name[hw->chip_id - CHIP_ID_YUKON_XL],
55700+ hw->chip_id, hw->chip_rev);
55701+
55702+ dev = sky2_init_netdev(hw, 0, using_dac);
55703+ if (!dev)
55704+ goto err_out_free_pci;
55705+
55706+ err = register_netdev(dev);
55707+ if (err) {
55708+ printk(KERN_ERR PFX "%s: cannot register net device\n",
55709+ pci_name(pdev));
55710+ goto err_out_free_netdev;
55711+ }
55712+
55713+ sky2_show_addr(dev);
55714+
55715+ if (hw->ports > 1 && (dev1 = sky2_init_netdev(hw, 1, using_dac))) {
55716+ if (register_netdev(dev1) == 0)
55717+ sky2_show_addr(dev1);
55718+ else {
55719+ /* Failure to register second port need not be fatal */
55720+ printk(KERN_WARNING PFX
55721+ "register of second port failed\n");
55722+ hw->dev[1] = NULL;
55723+ free_netdev(dev1);
55724+ }
55725+ }
55726+
55727+ err = request_irq(pdev->irq, sky2_intr, SA_SHIRQ, DRV_NAME, hw);
55728+ if (err) {
55729+ printk(KERN_ERR PFX "%s: cannot assign irq %d\n",
55730+ pci_name(pdev), pdev->irq);
55731+ goto err_out_unregister;
55732+ }
55733+
55734+ hw->intr_mask = Y2_IS_BASE;
55735+ sky2_write32(hw, B0_IMSK, hw->intr_mask);
55736+
55737+ pci_set_drvdata(pdev, hw);
55738+
55739+ return 0;
55740+
55741+err_out_unregister:
55742+ if (dev1) {
55743+ unregister_netdev(dev1);
55744+ free_netdev(dev1);
55745+ }
55746+ unregister_netdev(dev);
55747+err_out_free_netdev:
55748+ free_netdev(dev);
55749+err_out_free_pci:
55750+ sky2_write8(hw, B0_CTST, CS_RST_SET);
55751+ pci_free_consistent(hw->pdev, STATUS_LE_BYTES, hw->st_le, hw->st_dma);
55752+err_out_iounmap:
55753+ iounmap(hw->regs);
55754+err_out_free_hw:
55755+ kfree(hw);
55756+err_out_free_regions:
55757+ pci_release_regions(pdev);
55758+ pci_disable_device(pdev);
55759+err_out:
55760+ return err;
55761+}
55762+
55763+static void __devexit sky2_remove(struct pci_dev *pdev)
55764+{
55765+ struct sky2_hw *hw = pci_get_drvdata(pdev);
55766+ struct net_device *dev0, *dev1;
55767+
55768+ if (!hw)
55769+ return;
55770+
55771+ dev0 = hw->dev[0];
55772+ dev1 = hw->dev[1];
55773+ if (dev1)
55774+ unregister_netdev(dev1);
55775+ unregister_netdev(dev0);
55776+
55777+ sky2_write32(hw, B0_IMSK, 0);
55778+ sky2_set_power_state(hw, PCI_D3hot);
55779+ sky2_write16(hw, B0_Y2LED, LED_STAT_OFF);
55780+ sky2_write8(hw, B0_CTST, CS_RST_SET);
55781+ sky2_read8(hw, B0_CTST);
55782+
55783+ free_irq(pdev->irq, hw);
55784+ pci_free_consistent(pdev, STATUS_LE_BYTES, hw->st_le, hw->st_dma);
55785+ pci_release_regions(pdev);
55786+ pci_disable_device(pdev);
55787+
55788+ if (dev1)
55789+ free_netdev(dev1);
55790+ free_netdev(dev0);
55791+ iounmap(hw->regs);
55792+ kfree(hw);
55793+
55794+ pci_set_drvdata(pdev, NULL);
55795+}
55796+
55797+#ifdef CONFIG_PM
55798+static int sky2_suspend(struct pci_dev *pdev, pm_message_t state)
55799+{
55800+ struct sky2_hw *hw = pci_get_drvdata(pdev);
55801+ int i;
55802+
55803+ for (i = 0; i < 2; i++) {
55804+ struct net_device *dev = hw->dev[i];
55805+
55806+ if (dev) {
55807+ if (!netif_running(dev))
55808+ continue;
55809+
55810+ sky2_down(dev);
55811+ netif_device_detach(dev);
55812+ }
55813+ }
55814+
55815+ return sky2_set_power_state(hw, pci_choose_state(pdev, state));
55816+}
55817+
55818+static int sky2_resume(struct pci_dev *pdev)
55819+{
55820+ struct sky2_hw *hw = pci_get_drvdata(pdev);
55821+ int i, err;
55822+
55823+ pci_restore_state(pdev);
55824+ pci_enable_wake(pdev, PCI_D0, 0);
55825+ err = sky2_set_power_state(hw, PCI_D0);
55826+ if (err)
55827+ goto out;
55828+
55829+ err = sky2_reset(hw);
55830+ if (err)
55831+ goto out;
55832+
55833+ for (i = 0; i < 2; i++) {
55834+ struct net_device *dev = hw->dev[i];
55835+ if (dev && netif_running(dev)) {
55836+ netif_device_attach(dev);
55837+ err = sky2_up(dev);
55838+ if (err) {
55839+ printk(KERN_ERR PFX "%s: could not up: %d\n",
55840+ dev->name, err);
55841+ dev_close(dev);
55842+ break;
55843+ }
55844+ }
55845+ }
55846+out:
55847+ return err;
55848+}
55849+#endif
55850+
55851+static struct pci_driver sky2_driver = {
55852+ .name = DRV_NAME,
55853+ .id_table = sky2_id_table,
55854+ .probe = sky2_probe,
55855+ .remove = __devexit_p(sky2_remove),
55856+#ifdef CONFIG_PM
55857+ .suspend = sky2_suspend,
55858+ .resume = sky2_resume,
55859+#endif
55860+};
55861+
55862+static int __init sky2_init_module(void)
55863+{
55864+ return pci_register_driver(&sky2_driver);
55865+}
55866+
55867+static void __exit sky2_cleanup_module(void)
55868+{
55869+ pci_unregister_driver(&sky2_driver);
55870+}
55871+
55872+module_init(sky2_init_module);
55873+module_exit(sky2_cleanup_module);
55874+
55875+MODULE_DESCRIPTION("Marvell Yukon 2 Gigabit Ethernet driver");
55876+MODULE_AUTHOR("Stephen Hemminger <shemminger@osdl.org>");
55877+MODULE_LICENSE("GPL");
55878+MODULE_VERSION(DRV_VERSION);
55879diff -Nur linux-2.6.16.33-noxen/drivers/net/tg3.c linux-2.6.16.33/drivers/net/tg3.c
55880--- linux-2.6.16.33-noxen/drivers/net/tg3.c 2006-11-22 18:06:31.000000000 +0000
55881+++ linux-2.6.16.33/drivers/net/tg3.c 2007-05-23 21:00:01.000000000 +0000
55882@@ -3664,7 +3664,7 @@
55883 #if TG3_TSO_SUPPORT != 0
55884 mss = 0;
55885 if (skb->len > (tp->dev->mtu + ETH_HLEN) &&
55886- (mss = skb_shinfo(skb)->tso_size) != 0) {
55887+ (mss = skb_shinfo(skb)->gso_size) != 0) {
55888 int tcp_opt_len, ip_tcp_len;
55889
55890 if (skb_header_cloned(skb) &&
55891diff -Nur linux-2.6.16.33-noxen/drivers/net/tulip/winbond-840.c linux-2.6.16.33/drivers/net/tulip/winbond-840.c
55892--- linux-2.6.16.33-noxen/drivers/net/tulip/winbond-840.c 2006-11-22 18:06:31.000000000 +0000
55893+++ linux-2.6.16.33/drivers/net/tulip/winbond-840.c 2007-05-23 21:00:01.000000000 +0000
55894@@ -1605,11 +1605,11 @@
55895 * - get_stats:
55896 * spin_lock_irq(np->lock), doesn't touch hw if not present
55897 * - hard_start_xmit:
55898- * netif_stop_queue + spin_unlock_wait(&dev->xmit_lock);
55899+ * synchronize_irq + netif_tx_disable;
55900 * - tx_timeout:
55901- * netif_device_detach + spin_unlock_wait(&dev->xmit_lock);
55902+ * netif_device_detach + netif_tx_disable;
55903 * - set_multicast_list
55904- * netif_device_detach + spin_unlock_wait(&dev->xmit_lock);
55905+ * netif_device_detach + netif_tx_disable;
55906 * - interrupt handler
55907 * doesn't touch hw if not present, synchronize_irq waits for
55908 * running instances of the interrupt handler.
55909@@ -1635,11 +1635,10 @@
55910 netif_device_detach(dev);
55911 update_csr6(dev, 0);
55912 iowrite32(0, ioaddr + IntrEnable);
55913- netif_stop_queue(dev);
55914 spin_unlock_irq(&np->lock);
55915
55916- spin_unlock_wait(&dev->xmit_lock);
55917 synchronize_irq(dev->irq);
55918+ netif_tx_disable(dev);
55919
55920 np->stats.rx_missed_errors += ioread32(ioaddr + RxMissed) & 0xffff;
55921
55922diff -Nur linux-2.6.16.33-noxen/drivers/net/typhoon.c linux-2.6.16.33/drivers/net/typhoon.c
55923--- linux-2.6.16.33-noxen/drivers/net/typhoon.c 2006-11-22 18:06:31.000000000 +0000
55924+++ linux-2.6.16.33/drivers/net/typhoon.c 2007-05-23 21:00:01.000000000 +0000
55925@@ -340,7 +340,7 @@
55926 #endif
55927
55928 #if defined(NETIF_F_TSO)
55929-#define skb_tso_size(x) (skb_shinfo(x)->tso_size)
55930+#define skb_tso_size(x) (skb_shinfo(x)->gso_size)
55931 #define TSO_NUM_DESCRIPTORS 2
55932 #define TSO_OFFLOAD_ON TYPHOON_OFFLOAD_TCP_SEGMENT
55933 #else
55934@@ -805,7 +805,7 @@
55935 * If problems develop with TSO, check this first.
55936 */
55937 numDesc = skb_shinfo(skb)->nr_frags + 1;
55938- if(skb_tso_size(skb))
55939+ if (skb_is_gso(skb))
55940 numDesc++;
55941
55942 /* When checking for free space in the ring, we need to also
55943@@ -845,7 +845,7 @@
55944 TYPHOON_TX_PF_VLAN_TAG_SHIFT);
55945 }
55946
55947- if(skb_tso_size(skb)) {
55948+ if (skb_is_gso(skb)) {
55949 first_txd->processFlags |= TYPHOON_TX_PF_TCP_SEGMENT;
55950 first_txd->numDesc++;
55951
55952diff -Nur linux-2.6.16.33-noxen/drivers/net/via-velocity.c linux-2.6.16.33/drivers/net/via-velocity.c
55953--- linux-2.6.16.33-noxen/drivers/net/via-velocity.c 2006-11-22 18:06:31.000000000 +0000
55954+++ linux-2.6.16.33/drivers/net/via-velocity.c 2007-05-23 21:00:01.000000000 +0000
55955@@ -1905,6 +1905,13 @@
55956
55957 int pktlen = skb->len;
55958
55959+#ifdef VELOCITY_ZERO_COPY_SUPPORT
55960+ if (skb_shinfo(skb)->nr_frags > 6 && __skb_linearize(skb)) {
55961+ kfree_skb(skb);
55962+ return 0;
55963+ }
55964+#endif
55965+
55966 spin_lock_irqsave(&vptr->lock, flags);
55967
55968 index = vptr->td_curr[qnum];
55969@@ -1920,8 +1927,6 @@
55970 */
55971 if (pktlen < ETH_ZLEN) {
55972 /* Cannot occur until ZC support */
55973- if(skb_linearize(skb, GFP_ATOMIC))
55974- return 0;
55975 pktlen = ETH_ZLEN;
55976 memcpy(tdinfo->buf, skb->data, skb->len);
55977 memset(tdinfo->buf + skb->len, 0, ETH_ZLEN - skb->len);
55978@@ -1939,7 +1944,6 @@
55979 int nfrags = skb_shinfo(skb)->nr_frags;
55980 tdinfo->skb = skb;
55981 if (nfrags > 6) {
55982- skb_linearize(skb, GFP_ATOMIC);
55983 memcpy(tdinfo->buf, skb->data, skb->len);
55984 tdinfo->skb_dma[0] = tdinfo->buf_dma;
55985 td_ptr->tdesc0.pktsize =
55986diff -Nur linux-2.6.16.33-noxen/drivers/net/wireless/orinoco.c linux-2.6.16.33/drivers/net/wireless/orinoco.c
55987--- linux-2.6.16.33-noxen/drivers/net/wireless/orinoco.c 2006-11-22 18:06:31.000000000 +0000
55988+++ linux-2.6.16.33/drivers/net/wireless/orinoco.c 2007-05-23 21:00:01.000000000 +0000
55989@@ -1835,7 +1835,9 @@
55990 /* Set promiscuity / multicast*/
55991 priv->promiscuous = 0;
55992 priv->mc_count = 0;
55993- __orinoco_set_multicast_list(dev); /* FIXME: what about the xmit_lock */
55994+
55995+ /* FIXME: what about netif_tx_lock */
55996+ __orinoco_set_multicast_list(dev);
55997
55998 return 0;
55999 }
56000diff -Nur linux-2.6.16.33-noxen/drivers/oprofile/buffer_sync.c linux-2.6.16.33/drivers/oprofile/buffer_sync.c
56001--- linux-2.6.16.33-noxen/drivers/oprofile/buffer_sync.c 2006-11-22 18:06:31.000000000 +0000
56002+++ linux-2.6.16.33/drivers/oprofile/buffer_sync.c 2007-05-23 21:00:01.000000000 +0000
56003@@ -6,6 +6,10 @@
56004 *
56005 * @author John Levon <levon@movementarian.org>
56006 *
56007+ * Modified by Aravind Menon for Xen
56008+ * These modifications are:
56009+ * Copyright (C) 2005 Hewlett-Packard Co.
56010+ *
56011 * This is the core of the buffer management. Each
56012 * CPU buffer is processed and entered into the
56013 * global event buffer. Such processing is necessary
56014@@ -38,6 +42,7 @@
56015 static DEFINE_SPINLOCK(task_mortuary);
56016 static void process_task_mortuary(void);
56017
56018+static int cpu_current_domain[NR_CPUS];
56019
56020 /* Take ownership of the task struct and place it on the
56021 * list for processing. Only after two full buffer syncs
56022@@ -146,6 +151,11 @@
56023 int sync_start(void)
56024 {
56025 int err;
56026+ int i;
56027+
56028+ for (i = 0; i < NR_CPUS; i++) {
56029+ cpu_current_domain[i] = COORDINATOR_DOMAIN;
56030+ }
56031
56032 start_cpu_work();
56033
56034@@ -275,15 +285,31 @@
56035 last_cookie = INVALID_COOKIE;
56036 }
56037
56038-static void add_kernel_ctx_switch(unsigned int in_kernel)
56039+static void add_cpu_mode_switch(unsigned int cpu_mode)
56040 {
56041 add_event_entry(ESCAPE_CODE);
56042- if (in_kernel)
56043- add_event_entry(KERNEL_ENTER_SWITCH_CODE);
56044- else
56045- add_event_entry(KERNEL_EXIT_SWITCH_CODE);
56046+ switch (cpu_mode) {
56047+ case CPU_MODE_USER:
56048+ add_event_entry(USER_ENTER_SWITCH_CODE);
56049+ break;
56050+ case CPU_MODE_KERNEL:
56051+ add_event_entry(KERNEL_ENTER_SWITCH_CODE);
56052+ break;
56053+ case CPU_MODE_XEN:
56054+ add_event_entry(XEN_ENTER_SWITCH_CODE);
56055+ break;
56056+ default:
56057+ break;
56058+ }
56059 }
56060-
56061+
56062+static void add_domain_switch(unsigned long domain_id)
56063+{
56064+ add_event_entry(ESCAPE_CODE);
56065+ add_event_entry(DOMAIN_SWITCH_CODE);
56066+ add_event_entry(domain_id);
56067+}
56068+
56069 static void
56070 add_user_ctx_switch(struct task_struct const * task, unsigned long cookie)
56071 {
56072@@ -348,9 +374,9 @@
56073 * for later lookup from userspace.
56074 */
56075 static int
56076-add_sample(struct mm_struct * mm, struct op_sample * s, int in_kernel)
56077+add_sample(struct mm_struct * mm, struct op_sample * s, int cpu_mode)
56078 {
56079- if (in_kernel) {
56080+ if (cpu_mode >= CPU_MODE_KERNEL) {
56081 add_sample_entry(s->eip, s->event);
56082 return 1;
56083 } else if (mm) {
56084@@ -496,15 +522,21 @@
56085 struct mm_struct *mm = NULL;
56086 struct task_struct * new;
56087 unsigned long cookie = 0;
56088- int in_kernel = 1;
56089+ int cpu_mode = 1;
56090 unsigned int i;
56091 sync_buffer_state state = sb_buffer_start;
56092 unsigned long available;
56093+ int domain_switch = 0;
56094
56095 down(&buffer_sem);
56096
56097 add_cpu_switch(cpu);
56098
56099+ /* We need to assign the first samples in this CPU buffer to the
56100+ same domain that we were processing at the last sync_buffer */
56101+ if (cpu_current_domain[cpu] != COORDINATOR_DOMAIN) {
56102+ add_domain_switch(cpu_current_domain[cpu]);
56103+ }
56104 /* Remember, only we can modify tail_pos */
56105
56106 available = get_slots(cpu_buf);
56107@@ -512,16 +544,18 @@
56108 for (i = 0; i < available; ++i) {
56109 struct op_sample * s = &cpu_buf->buffer[cpu_buf->tail_pos];
56110
56111- if (is_code(s->eip)) {
56112- if (s->event <= CPU_IS_KERNEL) {
56113- /* kernel/userspace switch */
56114- in_kernel = s->event;
56115+ if (is_code(s->eip) && !domain_switch) {
56116+ if (s->event <= CPU_MODE_XEN) {
56117+ /* xen/kernel/userspace switch */
56118+ cpu_mode = s->event;
56119 if (state == sb_buffer_start)
56120 state = sb_sample_start;
56121- add_kernel_ctx_switch(s->event);
56122+ add_cpu_mode_switch(s->event);
56123 } else if (s->event == CPU_TRACE_BEGIN) {
56124 state = sb_bt_start;
56125 add_trace_begin();
56126+ } else if (s->event == CPU_DOMAIN_SWITCH) {
56127+ domain_switch = 1;
56128 } else {
56129 struct mm_struct * oldmm = mm;
56130
56131@@ -535,11 +569,21 @@
56132 add_user_ctx_switch(new, cookie);
56133 }
56134 } else {
56135- if (state >= sb_bt_start &&
56136- !add_sample(mm, s, in_kernel)) {
56137- if (state == sb_bt_start) {
56138- state = sb_bt_ignore;
56139- atomic_inc(&oprofile_stats.bt_lost_no_mapping);
56140+ if (domain_switch) {
56141+ cpu_current_domain[cpu] = s->eip;
56142+ add_domain_switch(s->eip);
56143+ domain_switch = 0;
56144+ } else {
56145+ if (cpu_current_domain[cpu] !=
56146+ COORDINATOR_DOMAIN) {
56147+ add_sample_entry(s->eip, s->event);
56148+ }
56149+ else if (state >= sb_bt_start &&
56150+ !add_sample(mm, s, cpu_mode)) {
56151+ if (state == sb_bt_start) {
56152+ state = sb_bt_ignore;
56153+ atomic_inc(&oprofile_stats.bt_lost_no_mapping);
56154+ }
56155 }
56156 }
56157 }
56158@@ -548,6 +592,11 @@
56159 }
56160 release_mm(mm);
56161
56162+ /* We reset domain to COORDINATOR at each CPU switch */
56163+ if (cpu_current_domain[cpu] != COORDINATOR_DOMAIN) {
56164+ add_domain_switch(COORDINATOR_DOMAIN);
56165+ }
56166+
56167 mark_done(cpu);
56168
56169 up(&buffer_sem);
56170diff -Nur linux-2.6.16.33-noxen/drivers/oprofile/cpu_buffer.c linux-2.6.16.33/drivers/oprofile/cpu_buffer.c
56171--- linux-2.6.16.33-noxen/drivers/oprofile/cpu_buffer.c 2006-11-22 18:06:31.000000000 +0000
56172+++ linux-2.6.16.33/drivers/oprofile/cpu_buffer.c 2007-05-23 21:00:01.000000000 +0000
56173@@ -6,6 +6,10 @@
56174 *
56175 * @author John Levon <levon@movementarian.org>
56176 *
56177+ * Modified by Aravind Menon for Xen
56178+ * These modifications are:
56179+ * Copyright (C) 2005 Hewlett-Packard Co.
56180+ *
56181 * Each CPU has a local buffer that stores PC value/event
56182 * pairs. We also log context switches when we notice them.
56183 * Eventually each CPU's buffer is processed into the global
56184@@ -34,6 +38,8 @@
56185 #define DEFAULT_TIMER_EXPIRE (HZ / 10)
56186 static int work_enabled;
56187
56188+static int32_t current_domain = COORDINATOR_DOMAIN;
56189+
56190 void free_cpu_buffers(void)
56191 {
56192 int i;
56193@@ -58,7 +64,7 @@
56194 goto fail;
56195
56196 b->last_task = NULL;
56197- b->last_is_kernel = -1;
56198+ b->last_cpu_mode = -1;
56199 b->tracing = 0;
56200 b->buffer_size = buffer_size;
56201 b->tail_pos = 0;
56202@@ -114,7 +120,7 @@
56203 * collected will populate the buffer with proper
56204 * values to initialize the buffer
56205 */
56206- cpu_buf->last_is_kernel = -1;
56207+ cpu_buf->last_cpu_mode = -1;
56208 cpu_buf->last_task = NULL;
56209 }
56210
56211@@ -164,13 +170,13 @@
56212 * because of the head/tail separation of the writer and reader
56213 * of the CPU buffer.
56214 *
56215- * is_kernel is needed because on some architectures you cannot
56216+ * cpu_mode is needed because on some architectures you cannot
56217 * tell if you are in kernel or user space simply by looking at
56218- * pc. We tag this in the buffer by generating kernel enter/exit
56219- * events whenever is_kernel changes
56220+ * pc. We tag this in the buffer by generating kernel/user (and xen)
56221+ * enter events whenever cpu_mode changes
56222 */
56223 static int log_sample(struct oprofile_cpu_buffer * cpu_buf, unsigned long pc,
56224- int is_kernel, unsigned long event)
56225+ int cpu_mode, unsigned long event)
56226 {
56227 struct task_struct * task;
56228
56229@@ -181,18 +187,18 @@
56230 return 0;
56231 }
56232
56233- is_kernel = !!is_kernel;
56234-
56235 task = current;
56236
56237 /* notice a switch from user->kernel or vice versa */
56238- if (cpu_buf->last_is_kernel != is_kernel) {
56239- cpu_buf->last_is_kernel = is_kernel;
56240- add_code(cpu_buf, is_kernel);
56241+ if (cpu_buf->last_cpu_mode != cpu_mode) {
56242+ cpu_buf->last_cpu_mode = cpu_mode;
56243+ add_code(cpu_buf, cpu_mode);
56244 }
56245-
56246+
56247 /* notice a task switch */
56248- if (cpu_buf->last_task != task) {
56249+ /* if not processing other domain samples */
56250+ if ((cpu_buf->last_task != task) &&
56251+ (current_domain == COORDINATOR_DOMAIN)) {
56252 cpu_buf->last_task = task;
56253 add_code(cpu_buf, (unsigned long)task);
56254 }
56255@@ -269,6 +275,25 @@
56256 add_sample(cpu_buf, pc, 0);
56257 }
56258
56259+int oprofile_add_domain_switch(int32_t domain_id)
56260+{
56261+ struct oprofile_cpu_buffer * cpu_buf = &cpu_buffer[smp_processor_id()];
56262+
56263+ /* should have space for switching into and out of domain
56264+ (2 slots each) plus one sample and one cpu mode switch */
56265+ if (((nr_available_slots(cpu_buf) < 6) &&
56266+ (domain_id != COORDINATOR_DOMAIN)) ||
56267+ (nr_available_slots(cpu_buf) < 2))
56268+ return 0;
56269+
56270+ add_code(cpu_buf, CPU_DOMAIN_SWITCH);
56271+ add_sample(cpu_buf, domain_id, 0);
56272+
56273+ current_domain = domain_id;
56274+
56275+ return 1;
56276+}
56277+
56278 /*
56279 * This serves to avoid cpu buffer overflow, and makes sure
56280 * the task mortuary progresses
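
 The 6-slot reservation in oprofile_add_domain_switch() above follows directly from the record sizes used elsewhere in this file. A minimal breakdown, inferring the per-record slot costs from the comment in that function (illustrative only, not part of the patch):

     /*
      * Worst case when switching into a passive domain:
      *   switch-in record (CPU_DOMAIN_SWITCH code + domain id)   2 slots
      *   eventual switch-out record back to the coordinator      2 slots
      *   at least one sample                                     1 slot
      *   one cpu-mode switch record                              1 slot
      *                                                   total   6 slots
      * Switching back to COORDINATOR_DOMAIN only needs the 2-slot
      * switch record itself, hence the separate "< 2" test.
      */
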
56281diff -Nur linux-2.6.16.33-noxen/drivers/oprofile/cpu_buffer.h linux-2.6.16.33/drivers/oprofile/cpu_buffer.h
56282--- linux-2.6.16.33-noxen/drivers/oprofile/cpu_buffer.h 2006-11-22 18:06:31.000000000 +0000
56283+++ linux-2.6.16.33/drivers/oprofile/cpu_buffer.h 2007-05-23 21:00:01.000000000 +0000
56284@@ -36,7 +36,7 @@
56285 volatile unsigned long tail_pos;
56286 unsigned long buffer_size;
56287 struct task_struct * last_task;
56288- int last_is_kernel;
56289+ int last_cpu_mode;
56290 int tracing;
56291 struct op_sample * buffer;
56292 unsigned long sample_received;
56293@@ -51,7 +51,10 @@
56294 void cpu_buffer_reset(struct oprofile_cpu_buffer * cpu_buf);
56295
56296 /* transient events for the CPU buffer -> event buffer */
56297-#define CPU_IS_KERNEL 1
56298-#define CPU_TRACE_BEGIN 2
56299+#define CPU_MODE_USER 0
56300+#define CPU_MODE_KERNEL 1
56301+#define CPU_MODE_XEN 2
56302+#define CPU_TRACE_BEGIN 3
56303+#define CPU_DOMAIN_SWITCH 4
56304
56305 #endif /* OPROFILE_CPU_BUFFER_H */
56306diff -Nur linux-2.6.16.33-noxen/drivers/oprofile/event_buffer.h linux-2.6.16.33/drivers/oprofile/event_buffer.h
56307--- linux-2.6.16.33-noxen/drivers/oprofile/event_buffer.h 2006-11-22 18:06:31.000000000 +0000
56308+++ linux-2.6.16.33/drivers/oprofile/event_buffer.h 2007-05-23 21:00:01.000000000 +0000
56309@@ -29,15 +29,20 @@
56310 #define CPU_SWITCH_CODE 2
56311 #define COOKIE_SWITCH_CODE 3
56312 #define KERNEL_ENTER_SWITCH_CODE 4
56313-#define KERNEL_EXIT_SWITCH_CODE 5
56314+#define USER_ENTER_SWITCH_CODE 5
56315 #define MODULE_LOADED_CODE 6
56316 #define CTX_TGID_CODE 7
56317 #define TRACE_BEGIN_CODE 8
56318 #define TRACE_END_CODE 9
56319+#define XEN_ENTER_SWITCH_CODE 10
56320+#define DOMAIN_SWITCH_CODE 11
56321
56322 #define INVALID_COOKIE ~0UL
56323 #define NO_COOKIE 0UL
56324
56325+/* Constant used to refer to coordinator domain (Xen) */
56326+#define COORDINATOR_DOMAIN -1
56327+
56328 /* add data to the event buffer */
56329 void add_event_entry(unsigned long data);
56330
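
 The new codes extend the event-buffer stream that buffer_sync.c produces via add_event_entry(). A hedged sketch of how a consumer of that stream might dispatch on them; this is illustrative only and is not the oprofile daemon's actual parser. It assumes the usual layout in which mode/domain switches are framed by ESCAPE_CODE (as in add_cpu_mode_switch()/add_domain_switch() above) and ordinary samples are pc/event pairs; the macros come from event_buffer.h:

     #include <stddef.h>

     /* Only the codes touched by this patch are handled; a real parser
      * also deals with CPU_SWITCH_CODE, COOKIE_SWITCH_CODE, CTX_TGID_CODE,
      * TRACE_BEGIN_CODE, and so on. */
     static void decode(const unsigned long *buf, size_t n)
     {
             size_t i = 0;

             while (i + 1 < n) {
                     if (buf[i] == ESCAPE_CODE) {
                             switch (buf[i + 1]) {
                             case KERNEL_ENTER_SWITCH_CODE: /* samples now kernel mode */
                             case USER_ENTER_SWITCH_CODE:   /* samples now user mode */
                             case XEN_ENTER_SWITCH_CODE:    /* samples now Xen mode */
                                     i += 2;
                                     break;
                             case DOMAIN_SWITCH_CODE:
                                     /* buf[i + 2] is the domain id (or
                                      * COORDINATOR_DOMAIN when returning) */
                                     i += 3;
                                     break;
                             default:
                                     i += 2;
                                     break;
                             }
                     } else {
                             /* ordinary pc/event sample pair */
                             i += 2;
                     }
             }
     }
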
56331diff -Nur linux-2.6.16.33-noxen/drivers/oprofile/oprof.c linux-2.6.16.33/drivers/oprofile/oprof.c
56332--- linux-2.6.16.33-noxen/drivers/oprofile/oprof.c 2006-11-22 18:06:31.000000000 +0000
56333+++ linux-2.6.16.33/drivers/oprofile/oprof.c 2007-05-23 21:00:01.000000000 +0000
56334@@ -5,6 +5,10 @@
56335 * @remark Read the file COPYING
56336 *
56337 * @author John Levon <levon@movementarian.org>
56338+ *
56339+ * Modified by Aravind Menon for Xen
56340+ * These modifications are:
56341+ * Copyright (C) 2005 Hewlett-Packard Co.
56342 */
56343
56344 #include <linux/kernel.h>
56345@@ -19,7 +23,7 @@
56346 #include "cpu_buffer.h"
56347 #include "buffer_sync.h"
56348 #include "oprofile_stats.h"
56349-
56350+
56351 struct oprofile_operations oprofile_ops;
56352
56353 unsigned long oprofile_started;
56354@@ -33,6 +37,32 @@
56355 */
56356 static int timer = 0;
56357
56358+int oprofile_set_active(int active_domains[], unsigned int adomains)
56359+{
56360+ int err;
56361+
56362+ if (!oprofile_ops.set_active)
56363+ return -EINVAL;
56364+
56365+ down(&start_sem);
56366+ err = oprofile_ops.set_active(active_domains, adomains);
56367+ up(&start_sem);
56368+ return err;
56369+}
56370+
56371+int oprofile_set_passive(int passive_domains[], unsigned int pdomains)
56372+{
56373+ int err;
56374+
56375+ if (!oprofile_ops.set_passive)
56376+ return -EINVAL;
56377+
56378+ down(&start_sem);
56379+ err = oprofile_ops.set_passive(passive_domains, pdomains);
56380+ up(&start_sem);
56381+ return err;
56382+}
56383+
56384 int oprofile_setup(void)
56385 {
56386 int err;
56387diff -Nur linux-2.6.16.33-noxen/drivers/oprofile/oprof.h linux-2.6.16.33/drivers/oprofile/oprof.h
56388--- linux-2.6.16.33-noxen/drivers/oprofile/oprof.h 2006-11-22 18:06:31.000000000 +0000
56389+++ linux-2.6.16.33/drivers/oprofile/oprof.h 2007-05-23 21:00:01.000000000 +0000
56390@@ -35,5 +35,8 @@
56391 void oprofile_timer_init(struct oprofile_operations * ops);
56392
56393 int oprofile_set_backtrace(unsigned long depth);
56394+
56395+int oprofile_set_active(int active_domains[], unsigned int adomains);
56396+int oprofile_set_passive(int passive_domains[], unsigned int pdomains);
56397
56398 #endif /* OPROF_H */
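
 oprofile_set_active()/oprofile_set_passive() in oprof.c only forward to whatever the architecture backend installed in oprofile_ops; the xenoprof backend that actually implements these hooks is added elsewhere in this patch. A minimal sketch of how a backend would wire them up, with the member signatures inferred from the calls in oprof.c; the function names here are placeholders, not symbols defined by this hunk:

     /* needs <linux/oprofile.h> for struct oprofile_operations */
     static int my_set_active(int active_domains[], unsigned int adomains);
     static int my_set_passive(int passive_domains[], unsigned int pdomains);

     int my_arch_init(struct oprofile_operations *ops)
     {
             ops->set_active  = my_set_active;   /* called under start_sem */
             ops->set_passive = my_set_passive;
             return 0;
     }
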
56399diff -Nur linux-2.6.16.33-noxen/drivers/oprofile/oprofile_files.c linux-2.6.16.33/drivers/oprofile/oprofile_files.c
56400--- linux-2.6.16.33-noxen/drivers/oprofile/oprofile_files.c 2006-11-22 18:06:31.000000000 +0000
56401+++ linux-2.6.16.33/drivers/oprofile/oprofile_files.c 2007-05-23 21:00:01.000000000 +0000
56402@@ -5,15 +5,21 @@
56403 * @remark Read the file COPYING
56404 *
56405 * @author John Levon <levon@movementarian.org>
56406+ *
56407+ * Modified by Aravind Menon for Xen
56408+ * These modifications are:
56409+ * Copyright (C) 2005 Hewlett-Packard Co.
56410 */
56411
56412 #include <linux/fs.h>
56413 #include <linux/oprofile.h>
56414+#include <asm/uaccess.h>
56415+#include <linux/ctype.h>
56416
56417 #include "event_buffer.h"
56418 #include "oprofile_stats.h"
56419 #include "oprof.h"
56420-
56421+
56422 unsigned long fs_buffer_size = 131072;
56423 unsigned long fs_cpu_buffer_size = 8192;
56424 unsigned long fs_buffer_watershed = 32768; /* FIXME: tune */
56425@@ -117,11 +123,202 @@
56426 static struct file_operations dump_fops = {
56427 .write = dump_write,
56428 };
56429-
56430+
56431+#define TMPBUFSIZE 512
56432+
56433+static unsigned int adomains = 0;
56434+static int active_domains[MAX_OPROF_DOMAINS + 1];
56435+static DEFINE_MUTEX(adom_mutex);
56436+
56437+static ssize_t adomain_write(struct file * file, char const __user * buf,
56438+ size_t count, loff_t * offset)
56439+{
56440+ char *tmpbuf;
56441+ char *startp, *endp;
56442+ int i;
56443+ unsigned long val;
56444+ ssize_t retval = count;
56445+
56446+ if (*offset)
56447+ return -EINVAL;
56448+ if (count > TMPBUFSIZE - 1)
56449+ return -EINVAL;
56450+
56451+ if (!(tmpbuf = kmalloc(TMPBUFSIZE, GFP_KERNEL)))
56452+ return -ENOMEM;
56453+
56454+ if (copy_from_user(tmpbuf, buf, count)) {
56455+ kfree(tmpbuf);
56456+ return -EFAULT;
56457+ }
56458+ tmpbuf[count] = 0;
56459+
56460+ mutex_lock(&adom_mutex);
56461+
56462+ startp = tmpbuf;
56463+ /* Parse one more than MAX_OPROF_DOMAINS, for easy error checking */
56464+ for (i = 0; i <= MAX_OPROF_DOMAINS; i++) {
56465+ val = simple_strtoul(startp, &endp, 0);
56466+ if (endp == startp)
56467+ break;
56468+ while (ispunct(*endp) || isspace(*endp))
56469+ endp++;
56470+ active_domains[i] = val;
56471+ if (active_domains[i] != val)
56472+ /* Overflow, force error below */
56473+ i = MAX_OPROF_DOMAINS + 1;
56474+ startp = endp;
56475+ }
56476+ /* Force error on trailing junk */
56477+ adomains = *startp ? MAX_OPROF_DOMAINS + 1 : i;
56478+
56479+ kfree(tmpbuf);
56480+
56481+ if (adomains > MAX_OPROF_DOMAINS
56482+ || oprofile_set_active(active_domains, adomains)) {
56483+ adomains = 0;
56484+ retval = -EINVAL;
56485+ }
56486+
56487+ mutex_unlock(&adom_mutex);
56488+ return retval;
56489+}
56490+
56491+static ssize_t adomain_read(struct file * file, char __user * buf,
56492+ size_t count, loff_t * offset)
56493+{
56494+ char * tmpbuf;
56495+ size_t len;
56496+ int i;
56497+ ssize_t retval;
56498+
56499+ if (!(tmpbuf = kmalloc(TMPBUFSIZE, GFP_KERNEL)))
56500+ return -ENOMEM;
56501+
56502+ mutex_lock(&adom_mutex);
56503+
56504+ len = 0;
56505+ for (i = 0; i < adomains; i++)
56506+ len += snprintf(tmpbuf + len,
56507+ len < TMPBUFSIZE ? TMPBUFSIZE - len : 0,
56508+ "%u ", active_domains[i]);
56509+ WARN_ON(len > TMPBUFSIZE);
56510+ if (len != 0 && len <= TMPBUFSIZE)
56511+ tmpbuf[len-1] = '\n';
56512+
56513+ mutex_unlock(&adom_mutex);
56514+
56515+ retval = simple_read_from_buffer(buf, count, offset, tmpbuf, len);
56516+
56517+ kfree(tmpbuf);
56518+ return retval;
56519+}
56520+
56521+
56522+static struct file_operations active_domain_ops = {
56523+ .read = adomain_read,
56524+ .write = adomain_write,
56525+};
56526+
56527+static unsigned int pdomains = 0;
56528+static int passive_domains[MAX_OPROF_DOMAINS + 1]; /* +1: parse loop may store one extra value while detecting overflow */
56529+static DEFINE_MUTEX(pdom_mutex);
56530+
56531+static ssize_t pdomain_write(struct file * file, char const __user * buf,
56532+ size_t count, loff_t * offset)
56533+{
56534+ char *tmpbuf;
56535+ char *startp, *endp;
56536+ int i;
56537+ unsigned long val;
56538+ ssize_t retval = count;
56539+
56540+ if (*offset)
56541+ return -EINVAL;
56542+ if (count > TMPBUFSIZE - 1)
56543+ return -EINVAL;
56544+
56545+ if (!(tmpbuf = kmalloc(TMPBUFSIZE, GFP_KERNEL)))
56546+ return -ENOMEM;
56547+
56548+ if (copy_from_user(tmpbuf, buf, count)) {
56549+ kfree(tmpbuf);
56550+ return -EFAULT;
56551+ }
56552+ tmpbuf[count] = 0;
56553+
56554+ mutex_lock(&pdom_mutex);
56555+
56556+ startp = tmpbuf;
56557+ /* Parse one more than MAX_OPROF_DOMAINS, for easy error checking */
56558+ for (i = 0; i <= MAX_OPROF_DOMAINS; i++) {
56559+ val = simple_strtoul(startp, &endp, 0);
56560+ if (endp == startp)
56561+ break;
56562+ while (ispunct(*endp) || isspace(*endp))
56563+ endp++;
56564+ passive_domains[i] = val;
56565+ if (passive_domains[i] != val)
56566+ /* Overflow, force error below */
56567+ i = MAX_OPROF_DOMAINS + 1;
56568+ startp = endp;
56569+ }
56570+ /* Force error on trailing junk */
56571+ pdomains = *startp ? MAX_OPROF_DOMAINS + 1 : i;
56572+
56573+ kfree(tmpbuf);
56574+
56575+ if (pdomains > MAX_OPROF_DOMAINS
56576+ || oprofile_set_passive(passive_domains, pdomains)) {
56577+ pdomains = 0;
56578+ retval = -EINVAL;
56579+ }
56580+
56581+ mutex_unlock(&pdom_mutex);
56582+ return retval;
56583+}
56584+
56585+static ssize_t pdomain_read(struct file * file, char __user * buf,
56586+ size_t count, loff_t * offset)
56587+{
56588+ char * tmpbuf;
56589+ size_t len;
56590+ int i;
56591+ ssize_t retval;
56592+
56593+ if (!(tmpbuf = kmalloc(TMPBUFSIZE, GFP_KERNEL)))
56594+ return -ENOMEM;
56595+
56596+ mutex_lock(&pdom_mutex);
56597+
56598+ len = 0;
56599+ for (i = 0; i < pdomains; i++)
56600+ len += snprintf(tmpbuf + len,
56601+ len < TMPBUFSIZE ? TMPBUFSIZE - len : 0,
56602+ "%u ", passive_domains[i]);
56603+ WARN_ON(len > TMPBUFSIZE);
56604+ if (len != 0 && len <= TMPBUFSIZE)
56605+ tmpbuf[len-1] = '\n';
56606+
56607+ mutex_unlock(&pdom_mutex);
56608+
56609+ retval = simple_read_from_buffer(buf, count, offset, tmpbuf, len);
56610+
56611+ kfree(tmpbuf);
56612+ return retval;
56613+}
56614+
56615+static struct file_operations passive_domain_ops = {
56616+ .read = pdomain_read,
56617+ .write = pdomain_write,
56618+};
56619+
56620 void oprofile_create_files(struct super_block * sb, struct dentry * root)
56621 {
56622 oprofilefs_create_file(sb, root, "enable", &enable_fops);
56623 oprofilefs_create_file_perm(sb, root, "dump", &dump_fops, 0666);
56624+ oprofilefs_create_file(sb, root, "active_domains", &active_domain_ops);
56625+ oprofilefs_create_file(sb, root, "passive_domains", &passive_domain_ops);
56626 oprofilefs_create_file(sb, root, "buffer", &event_buffer_fops);
56627 oprofilefs_create_ulong(sb, root, "buffer_size", &fs_buffer_size);
56628 oprofilefs_create_ulong(sb, root, "buffer_watershed", &fs_buffer_watershed);
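
 With the two new files registered above, domain selection happens entirely through oprofilefs. A small userspace sketch, assuming oprofilefs is mounted at its usual /dev/oprofile location; per the adomain_write()/pdomain_write() parsers, the list is numbers separated by whitespace or punctuation, and EINVAL is returned for too many domains or trailing junk:

     #include <fcntl.h>
     #include <stdio.h>
     #include <string.h>
     #include <unistd.h>

     /* Mark the given domains as active, e.g. set_active_domains("0 1 2"). */
     static int set_active_domains(const char *list)
     {
             int fd = open("/dev/oprofile/active_domains", O_WRONLY);

             if (fd < 0) {
                     perror("open");
                     return -1;
             }
             if (write(fd, list, strlen(list)) < 0) {
                     perror("write");
                     close(fd);
                     return -1;
             }
             close(fd);
             return 0;
     }

     int main(void)
     {
             return set_active_domains("0 1 2");
     }
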
56629diff -Nur linux-2.6.16.33-noxen/drivers/pci/Kconfig linux-2.6.16.33/drivers/pci/Kconfig
56630--- linux-2.6.16.33-noxen/drivers/pci/Kconfig 2006-11-22 18:06:31.000000000 +0000
56631+++ linux-2.6.16.33/drivers/pci/Kconfig 2007-01-08 15:00:45.000000000 +0000
56632@@ -5,6 +5,7 @@
56633 bool "Message Signaled Interrupts (MSI and MSI-X)"
56634 depends on PCI
56635 depends on (X86_LOCAL_APIC && X86_IO_APIC) || IA64
56636+ depends on !XEN
56637 help
56638 This allows device drivers to enable MSI (Message Signaled
56639 Interrupts). Message Signaled Interrupts enable a device to
56640diff -Nur linux-2.6.16.33-noxen/drivers/s390/net/qeth_eddp.c linux-2.6.16.33/drivers/s390/net/qeth_eddp.c
56641--- linux-2.6.16.33-noxen/drivers/s390/net/qeth_eddp.c 2006-11-22 18:06:31.000000000 +0000
56642+++ linux-2.6.16.33/drivers/s390/net/qeth_eddp.c 2007-05-23 21:00:01.000000000 +0000
56643@@ -421,7 +421,7 @@
56644 }
56645 tcph = eddp->skb->h.th;
56646 while (eddp->skb_offset < eddp->skb->len) {
56647- data_len = min((int)skb_shinfo(eddp->skb)->tso_size,
56648+ data_len = min((int)skb_shinfo(eddp->skb)->gso_size,
56649 (int)(eddp->skb->len - eddp->skb_offset));
56650 /* prepare qdio hdr */
56651 if (eddp->qh.hdr.l2.id == QETH_HEADER_TYPE_LAYER2){
56652@@ -516,20 +516,20 @@
56653
56654 QETH_DBF_TEXT(trace, 5, "eddpcanp");
56655 /* can we put multiple skbs in one page? */
56656- skbs_per_page = PAGE_SIZE / (skb_shinfo(skb)->tso_size + hdr_len);
56657+ skbs_per_page = PAGE_SIZE / (skb_shinfo(skb)->gso_size + hdr_len);
56658 if (skbs_per_page > 1){
56659- ctx->num_pages = (skb_shinfo(skb)->tso_segs + 1) /
56660+ ctx->num_pages = (skb_shinfo(skb)->gso_segs + 1) /
56661 skbs_per_page + 1;
56662 ctx->elements_per_skb = 1;
56663 } else {
56664 /* no -> how many elements per skb? */
56665- ctx->elements_per_skb = (skb_shinfo(skb)->tso_size + hdr_len +
56666+ ctx->elements_per_skb = (skb_shinfo(skb)->gso_size + hdr_len +
56667 PAGE_SIZE) >> PAGE_SHIFT;
56668 ctx->num_pages = ctx->elements_per_skb *
56669- (skb_shinfo(skb)->tso_segs + 1);
56670+ (skb_shinfo(skb)->gso_segs + 1);
56671 }
56672 ctx->num_elements = ctx->elements_per_skb *
56673- (skb_shinfo(skb)->tso_segs + 1);
56674+ (skb_shinfo(skb)->gso_segs + 1);
56675 }
56676
56677 static inline struct qeth_eddp_context *
56678diff -Nur linux-2.6.16.33-noxen/drivers/s390/net/qeth_main.c linux-2.6.16.33/drivers/s390/net/qeth_main.c
56679--- linux-2.6.16.33-noxen/drivers/s390/net/qeth_main.c 2006-11-22 18:06:31.000000000 +0000
56680+++ linux-2.6.16.33/drivers/s390/net/qeth_main.c 2007-05-23 21:00:01.000000000 +0000
56681@@ -4454,7 +4454,7 @@
56682 queue = card->qdio.out_qs
56683 [qeth_get_priority_queue(card, skb, ipv, cast_type)];
56684
56685- if (skb_shinfo(skb)->tso_size)
56686+ if (skb_is_gso(skb))
56687 large_send = card->options.large_send;
56688
56689 /*are we able to do TSO ? If so ,prepare and send it from here */
56690@@ -4501,8 +4501,7 @@
56691 card->stats.tx_packets++;
56692 card->stats.tx_bytes += skb->len;
56693 #ifdef CONFIG_QETH_PERF_STATS
56694- if (skb_shinfo(skb)->tso_size &&
56695- !(large_send == QETH_LARGE_SEND_NO)) {
56696+ if (skb_is_gso(skb) && !(large_send == QETH_LARGE_SEND_NO)) {
56697 card->perf_stats.large_send_bytes += skb->len;
56698 card->perf_stats.large_send_cnt++;
56699 }
56700diff -Nur linux-2.6.16.33-noxen/drivers/s390/net/qeth_tso.h linux-2.6.16.33/drivers/s390/net/qeth_tso.h
56701--- linux-2.6.16.33-noxen/drivers/s390/net/qeth_tso.h 2006-11-22 18:06:31.000000000 +0000
56702+++ linux-2.6.16.33/drivers/s390/net/qeth_tso.h 2007-05-23 21:00:01.000000000 +0000
56703@@ -51,7 +51,7 @@
56704 hdr->ext.hdr_version = 1;
56705 hdr->ext.hdr_len = 28;
56706 /*insert non-fix values */
56707- hdr->ext.mss = skb_shinfo(skb)->tso_size;
56708+ hdr->ext.mss = skb_shinfo(skb)->gso_size;
56709 hdr->ext.dg_hdr_len = (__u16)(iph->ihl*4 + tcph->doff*4);
56710 hdr->ext.payload_len = (__u16)(skb->len - hdr->ext.dg_hdr_len -
56711 sizeof(struct qeth_hdr_tso));
56712diff -Nur linux-2.6.16.33-noxen/drivers/serial/Kconfig linux-2.6.16.33/drivers/serial/Kconfig
56713--- linux-2.6.16.33-noxen/drivers/serial/Kconfig 2006-11-22 18:06:31.000000000 +0000
56714+++ linux-2.6.16.33/drivers/serial/Kconfig 2007-01-08 15:00:45.000000000 +0000
56715@@ -11,6 +11,7 @@
56716 config SERIAL_8250
56717 tristate "8250/16550 and compatible serial support"
56718 depends on (BROKEN || !SPARC)
56719+ depends on !XEN_DISABLE_SERIAL
56720 select SERIAL_CORE
56721 ---help---
56722 This selects whether you want to include the driver for the standard
56723diff -Nur linux-2.6.16.33-noxen/drivers/video/Kconfig linux-2.6.16.33/drivers/video/Kconfig
56724--- linux-2.6.16.33-noxen/drivers/video/Kconfig 2006-11-22 18:06:31.000000000 +0000
56725+++ linux-2.6.16.33/drivers/video/Kconfig 2007-01-08 15:00:45.000000000 +0000
56726@@ -495,7 +495,7 @@
56727
56728 config VIDEO_SELECT
56729 bool
56730- depends on (FB = y) && X86
56731+ depends on (FB = y) && X86 && !XEN
56732 default y
56733
56734 config FB_SGIVW
56735diff -Nur linux-2.6.16.33-noxen/drivers/xen/Kconfig linux-2.6.16.33/drivers/xen/Kconfig
56736--- linux-2.6.16.33-noxen/drivers/xen/Kconfig 1970-01-01 00:00:00.000000000 +0000
56737+++ linux-2.6.16.33/drivers/xen/Kconfig 2007-01-08 15:00:45.000000000 +0000
56738@@ -0,0 +1,283 @@
56739+#
56740+# This Kconfig describes Xen options
56741+#
56742+
56743+mainmenu "Xen Configuration"
56744+
56745+config XEN
56746+ bool
56747+ default y if X86_XEN || X86_64_XEN
56748+ help
56749+ This is the Linux Xen port.
56750+
56751+if XEN
56752+config XEN_INTERFACE_VERSION
56753+ hex
56754+ default 0x00030203
56755+
56756+menu "XEN"
56757+
56758+config XEN_PRIVILEGED_GUEST
56759+ bool "Privileged Guest (domain 0)"
56760+ depends XEN
56761+ default n
56762+ help
56763+ Support for privileged operation (domain 0)
56764+
56765+config XEN_UNPRIVILEGED_GUEST
56766+ bool
56767+ default !XEN_PRIVILEGED_GUEST
56768+
56769+config XEN_PRIVCMD
56770+ bool
56771+ depends on PROC_FS
56772+ default y
56773+
56774+config XEN_XENBUS_DEV
56775+ bool
56776+ depends on PROC_FS
56777+ default y
56778+
56779+config XEN_BACKEND
56780+ tristate "Backend driver support"
56781+ default y
56782+ help
56783+ Support for backend device drivers that provide I/O services
56784+ to other virtual machines.
56785+
56786+config XEN_BLKDEV_BACKEND
56787+ tristate "Block-device backend driver"
56788+ depends on XEN_BACKEND
56789+ default y
56790+ help
56791+ The block-device backend driver allows the kernel to export its
56792+ block devices to other guests via a high-performance shared-memory
56793+ interface.
56794+
56795+config XEN_BLKDEV_TAP
56796+ tristate "Block-device tap backend driver"
56797+ depends on XEN_BACKEND
56798+ default XEN_PRIVILEGED_GUEST
56799+ help
56800+ The block tap driver is an alternative to the block back driver
56801+ and allows VM block requests to be redirected to userspace through
56802+ a device interface. The tap allows user-space development of
56803+ high-performance block backends, where disk images may be implemented
56804+ as files, in memory, or on other hosts across the network. This
56805+ driver can safely coexist with the existing blockback driver.
56806+
56807+config XEN_NETDEV_BACKEND
56808+ tristate "Network-device backend driver"
56809+ depends on XEN_BACKEND && NET
56810+ default y
56811+ help
56812+ The network-device backend driver allows the kernel to export its
56813+ network devices to other guests via a high-performance shared-memory
56814+ interface.
56815+
56816+config XEN_NETDEV_PIPELINED_TRANSMITTER
56817+ bool "Pipelined transmitter (DANGEROUS)"
56818+ depends on XEN_NETDEV_BACKEND
56819+ default n
56820+ help
56821+ If the net backend is a dumb domain, such as a transparent Ethernet
56822+ bridge with no local IP interface, it is safe to say Y here to get
56823+ slightly lower network overhead.
56824+ If the backend has a local IP interface; or may be doing smart things
56825+ like reassembling packets to perform firewall filtering; or if you
56826+ are unsure; or if you experience network hangs when this option is
56827+ enabled; then you must say N here.
56828+
56829+config XEN_NETDEV_LOOPBACK
56830+ tristate "Network-device loopback driver"
56831+ depends on XEN_NETDEV_BACKEND
56832+ default y
56833+ help
56834+ A two-interface loopback device to emulate a local netfront-netback
56835+ connection.
56836+
56837+config XEN_PCIDEV_BACKEND
56838+ tristate "PCI-device backend driver"
56839+ depends on PCI && XEN_BACKEND
56840+ default XEN_PRIVILEGED_GUEST
56841+ help
56842+ The PCI device backend driver allows the kernel to export arbitrary
56843+ PCI devices to other guests. If you select this to be a module, you
56844+ will need to make sure no other driver has bound to the device(s)
56845+ you want to make visible to other guests.
56846+
56847+choice
56848+ prompt "PCI Backend Mode"
56849+ depends on XEN_PCIDEV_BACKEND
56850+ default XEN_PCIDEV_BACKEND_VPCI
56851+
56852+config XEN_PCIDEV_BACKEND_VPCI
56853+ bool "Virtual PCI"
56854+ ---help---
56855+ This PCI Backend hides the true PCI topology and makes the frontend
56856+ think there is a single PCI bus with only the exported devices on it.
56857+ For example, a device at 03:05.0 will be re-assigned to 00:00.0. A
56858+ second device at 02:1a.1 will be re-assigned to 00:01.1.
56859+
56860+config XEN_PCIDEV_BACKEND_PASS
56861+ bool "Passthrough"
56862+ ---help---
56863+ This PCI Backend provides a real view of the PCI topology to the
56864+ frontend (for example, a device at 06:01.b will still appear at
56865+ 06:01.b to the frontend). This is similar to how Xen 2.0.x exposed
56866+ PCI devices to its driver domains. This may be required for drivers
56867+ which depend on finding their hardware in certain bus/slot
56868+ locations.
56869+
56870+config XEN_PCIDEV_BACKEND_SLOT
56871+ bool "Slot"
56872+ ---help---
56873+ This PCI Backend hides the true PCI topology and makes the frontend
56874+ think there is a single PCI bus with only the exported devices on it.
56875+ Contrary to the virtual PCI backend, a function becomes a new slot.
56876+ For example, a device at 03:05.2 will be re-assigned to 00:00.0. A
56877+ second device at 02:1a.1 will be re-assigned to 00:01.0.
56878+
56879+endchoice
56880+
56881+config XEN_PCIDEV_BE_DEBUG
56882+ bool "PCI Backend Debugging"
56883+ depends on XEN_PCIDEV_BACKEND
56884+ default n
56885+
56886+config XEN_TPMDEV_BACKEND
56887+ tristate "TPM-device backend driver"
56888+ depends on XEN_BACKEND
56889+ default n
56890+ help
56891+ The TPM-device backend driver
56892+
56893+config XEN_BLKDEV_FRONTEND
56894+ tristate "Block-device frontend driver"
56895+ depends on XEN
56896+ default y
56897+ help
56898+ The block-device frontend driver allows the kernel to access block
56899+ devices mounted within another guest OS. Unless you are building a
56900+ dedicated device-driver domain or your master control domain
56901+ (domain 0), you almost certainly want to say Y here.
56902+
56903+config XEN_NETDEV_FRONTEND
56904+ tristate "Network-device frontend driver"
56905+ depends on XEN && NET
56906+ default y
56907+ help
56908+ The network-device frontend driver allows the kernel to access
56909+ network interfaces within another guest OS. Unless you are building a
56910+ dedicated device-driver domain or your master control domain
56911+ (domain 0), you almost certainly want to say Y here.
56912+
56913+config XEN_FRAMEBUFFER
56914+ tristate "Framebuffer-device frontend driver"
56915+ depends on XEN && FB
56916+ select FB_CFB_FILLRECT
56917+ select FB_CFB_COPYAREA
56918+ select FB_CFB_IMAGEBLIT
56919+ default y
56920+ help
56921+ The framebuffer-device frontend driver allows the kernel to create a
56922+ virtual framebuffer. This framebuffer can be viewed in another
56923+ domain. Unless this domain has access to a real video card, you
56924+ probably want to say Y here.
56925+
56926+config XEN_KEYBOARD
56927+ tristate "Keyboard-device frontend driver"
56928+ depends on XEN && XEN_FRAMEBUFFER && INPUT
56929+ default y
56930+ help
56931+ The keyboard-device frontend driver allows the kernel to create a
56932+ virtual keyboard. This keyboard can then be driven by another
56933+ domain. If you've said Y to CONFIG_XEN_FRAMEBUFFER, you probably
56934+ want to say Y here.
56935+
56936+config XEN_SCRUB_PAGES
56937+ bool "Scrub memory before freeing it to Xen"
56938+ default y
56939+ help
56940+ Erase memory contents before freeing it back to Xen's global
56941+ pool. This ensures that any secrets contained within that
56942+ memory (e.g., private keys) cannot be found by other guests that
56943+ may be running on the machine. Most people will want to say Y here.
56944+ If security is not a concern then you may increase performance by
56945+ saying N.
56946+
56947+config XEN_DISABLE_SERIAL
56948+ bool "Disable serial port drivers"
56949+ default y
56950+ help
56951+ Disable serial port drivers, allowing the Xen console driver
56952+ to provide a serial console at ttyS0.
56953+
56954+config XEN_SYSFS
56955+ tristate "Export Xen attributes in sysfs"
56956+ depends on SYSFS
56957+ default y
56958+ help
56959+ Xen hypervisor attributes will show up under /sys/hypervisor/.
56960+
56961+choice
56962+ prompt "Xen version compatibility"
56963+ default XEN_COMPAT_030002_AND_LATER
56964+
56965+ config XEN_COMPAT_030002_AND_LATER
56966+ bool "3.0.2 and later"
56967+
56968+ config XEN_COMPAT_LATEST_ONLY
56969+ bool "no compatibility code"
56970+
56971+endchoice
56972+
56973+config XEN_COMPAT_030002
56974+ bool
56975+ default XEN_COMPAT_030002_AND_LATER
56976+
56977+endmenu
56978+
56979+config HAVE_ARCH_ALLOC_SKB
56980+ bool
56981+ default y
56982+
56983+config HAVE_ARCH_DEV_ALLOC_SKB
56984+ bool
56985+ default y
56986+
56987+config HAVE_IRQ_IGNORE_UNHANDLED
56988+ bool
56989+ default y
56990+
56991+config NO_IDLE_HZ
56992+ bool
56993+ default y
56994+
56995+config XEN_UTIL
56996+ bool
56997+ default y
56998+
56999+config XEN_BALLOON
57000+ bool
57001+ default y
57002+
57003+config XEN_DEVMEM
57004+ bool
57005+ default y
57006+
57007+config XEN_SKBUFF
57008+ bool
57009+ default y
57010+ depends on NET
57011+
57012+config XEN_REBOOT
57013+ bool
57014+ default y
57015+
57016+config XEN_SMPBOOT
57017+ bool
57018+ default y
57019+ depends on SMP
57020+
57021+endif
57022diff -Nur linux-2.6.16.33-noxen/drivers/xen/Makefile linux-2.6.16.33/drivers/xen/Makefile
57023--- linux-2.6.16.33-noxen/drivers/xen/Makefile 1970-01-01 00:00:00.000000000 +0000
57024+++ linux-2.6.16.33/drivers/xen/Makefile 2007-01-08 15:00:45.000000000 +0000
57025@@ -0,0 +1,19 @@
57026+obj-y += core/
57027+obj-y += console/
57028+obj-y += evtchn/
57029+obj-y += privcmd/
57030+obj-y += xenbus/
57031+
57032+obj-$(CONFIG_XEN_UTIL) += util.o
57033+obj-$(CONFIG_XEN_BALLOON) += balloon/
57034+obj-$(CONFIG_XEN_DEVMEM) += char/
57035+obj-$(CONFIG_XEN_BLKDEV_BACKEND) += blkback/
57036+obj-$(CONFIG_XEN_BLKDEV_TAP) += blktap/
57037+obj-$(CONFIG_XEN_NETDEV_BACKEND) += netback/
57038+obj-$(CONFIG_XEN_TPMDEV_BACKEND) += tpmback/
57039+obj-$(CONFIG_XEN_BLKDEV_FRONTEND) += blkfront/
57040+obj-$(CONFIG_XEN_NETDEV_FRONTEND) += netfront/
57041+obj-$(CONFIG_XEN_PCIDEV_BACKEND) += pciback/
57042+obj-$(CONFIG_XEN_PCIDEV_FRONTEND) += pcifront/
57043+obj-$(CONFIG_XEN_FRAMEBUFFER) += fbfront/
57044+obj-$(CONFIG_XEN_KEYBOARD) += fbfront/
57045diff -Nur linux-2.6.16.33-noxen/drivers/xen/balloon/Makefile linux-2.6.16.33/drivers/xen/balloon/Makefile
57046--- linux-2.6.16.33-noxen/drivers/xen/balloon/Makefile 1970-01-01 00:00:00.000000000 +0000
57047+++ linux-2.6.16.33/drivers/xen/balloon/Makefile 2007-01-08 15:00:45.000000000 +0000
57048@@ -0,0 +1,2 @@
57049+
57050+obj-y := balloon.o sysfs.o
57051diff -Nur linux-2.6.16.33-noxen/drivers/xen/balloon/balloon.c linux-2.6.16.33/drivers/xen/balloon/balloon.c
57052--- linux-2.6.16.33-noxen/drivers/xen/balloon/balloon.c 1970-01-01 00:00:00.000000000 +0000
57053+++ linux-2.6.16.33/drivers/xen/balloon/balloon.c 2007-01-08 15:00:45.000000000 +0000
57054@@ -0,0 +1,625 @@
57055+/******************************************************************************
57056+ * balloon.c
57057+ *
57058+ * Xen balloon driver - enables returning/claiming memory to/from Xen.
57059+ *
57060+ * Copyright (c) 2003, B Dragovic
57061+ * Copyright (c) 2003-2004, M Williamson, K Fraser
57062+ * Copyright (c) 2005 Dan M. Smith, IBM Corporation
57063+ *
57064+ * This program is free software; you can redistribute it and/or
57065+ * modify it under the terms of the GNU General Public License version 2
57066+ * as published by the Free Software Foundation; or, when distributed
57067+ * separately from the Linux kernel or incorporated into other
57068+ * software packages, subject to the following license:
57069+ *
57070+ * Permission is hereby granted, free of charge, to any person obtaining a copy
57071+ * of this source file (the "Software"), to deal in the Software without
57072+ * restriction, including without limitation the rights to use, copy, modify,
57073+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
57074+ * and to permit persons to whom the Software is furnished to do so, subject to
57075+ * the following conditions:
57076+ *
57077+ * The above copyright notice and this permission notice shall be included in
57078+ * all copies or substantial portions of the Software.
57079+ *
57080+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
57081+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
57082+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
57083+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
57084+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
57085+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
57086+ * IN THE SOFTWARE.
57087+ */
57088+
57089+#include <linux/config.h>
57090+#include <linux/kernel.h>
57091+#include <linux/module.h>
57092+#include <linux/sched.h>
57093+#include <linux/errno.h>
57094+#include <linux/mm.h>
57095+#include <linux/mman.h>
57096+#include <linux/smp_lock.h>
57097+#include <linux/pagemap.h>
57098+#include <linux/bootmem.h>
57099+#include <linux/highmem.h>
57100+#include <linux/vmalloc.h>
57101+#include <xen/xen_proc.h>
57102+#include <asm/hypervisor.h>
57103+#include <xen/balloon.h>
57104+#include <xen/interface/memory.h>
57105+#include <asm/pgalloc.h>
57106+#include <asm/pgtable.h>
57107+#include <asm/uaccess.h>
57108+#include <asm/tlb.h>
57109+#include <linux/list.h>
57110+#include <xen/xenbus.h>
57111+#include "common.h"
57112+
57113+#ifdef CONFIG_PROC_FS
57114+static struct proc_dir_entry *balloon_pde;
57115+#endif
57116+
57117+static DECLARE_MUTEX(balloon_mutex);
57118+
57119+/*
57120+ * Protects atomic reservation decrease/increase against concurrent increases.
57121+ * Also protects non-atomic updates of current_pages and driver_pages, and
57122+ * balloon lists.
57123+ */
57124+DEFINE_SPINLOCK(balloon_lock);
57125+
57126+struct balloon_stats balloon_stats;
57127+
57128+/* We increase/decrease in batches which fit in a page */
57129+static unsigned long frame_list[PAGE_SIZE / sizeof(unsigned long)];
57130+
57131+/* VM /proc information for memory */
57132+extern unsigned long totalram_pages;
57133+
57134+/* List of ballooned pages, threaded through the mem_map array. */
57135+static LIST_HEAD(ballooned_pages);
57136+
57137+/* Main work function, always executed in process context. */
57138+static void balloon_process(void *unused);
57139+static DECLARE_WORK(balloon_worker, balloon_process, NULL);
57140+static struct timer_list balloon_timer;
57141+
57142+/* When ballooning out (allocating memory to return to Xen) we don't really
57143+ want the kernel to try too hard since that can trigger the oom killer. */
57144+#define GFP_BALLOON \
57145+ (GFP_HIGHUSER | __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC)
57146+
57147+#define PAGE_TO_LIST(p) (&(p)->lru)
57148+#define LIST_TO_PAGE(l) list_entry((l), struct page, lru)
57149+#define UNLIST_PAGE(p) \
57150+ do { \
57151+ list_del(PAGE_TO_LIST(p)); \
57152+ PAGE_TO_LIST(p)->next = NULL; \
57153+ PAGE_TO_LIST(p)->prev = NULL; \
57154+ } while(0)
57155+
57156+#define IPRINTK(fmt, args...) \
57157+ printk(KERN_INFO "xen_mem: " fmt, ##args)
57158+#define WPRINTK(fmt, args...) \
57159+ printk(KERN_WARNING "xen_mem: " fmt, ##args)
57160+
57161+/* balloon_append: add the given page to the balloon. */
57162+static void balloon_append(struct page *page)
57163+{
57164+ /* Lowmem is re-populated first, so highmem pages go at list tail. */
57165+ if (PageHighMem(page)) {
57166+ list_add_tail(PAGE_TO_LIST(page), &ballooned_pages);
57167+ bs.balloon_high++;
57168+ } else {
57169+ list_add(PAGE_TO_LIST(page), &ballooned_pages);
57170+ bs.balloon_low++;
57171+ }
57172+}
57173+
57174+/* balloon_retrieve: rescue a page from the balloon, if it is not empty. */
57175+static struct page *balloon_retrieve(void)
57176+{
57177+ struct page *page;
57178+
57179+ if (list_empty(&ballooned_pages))
57180+ return NULL;
57181+
57182+ page = LIST_TO_PAGE(ballooned_pages.next);
57183+ UNLIST_PAGE(page);
57184+
57185+ if (PageHighMem(page))
57186+ bs.balloon_high--;
57187+ else
57188+ bs.balloon_low--;
57189+
57190+ return page;
57191+}
57192+
57193+static struct page *balloon_first_page(void)
57194+{
57195+ if (list_empty(&ballooned_pages))
57196+ return NULL;
57197+ return LIST_TO_PAGE(ballooned_pages.next);
57198+}
57199+
57200+static struct page *balloon_next_page(struct page *page)
57201+{
57202+ struct list_head *next = PAGE_TO_LIST(page)->next;
57203+ if (next == &ballooned_pages)
57204+ return NULL;
57205+ return LIST_TO_PAGE(next);
57206+}
57207+
57208+static void balloon_alarm(unsigned long unused)
57209+{
57210+ schedule_work(&balloon_worker);
57211+}
57212+
57213+static unsigned long current_target(void)
57214+{
57215+ unsigned long target = min(bs.target_pages, bs.hard_limit);
57216+ if (target > (bs.current_pages + bs.balloon_low + bs.balloon_high))
57217+ target = bs.current_pages + bs.balloon_low + bs.balloon_high;
57218+ return target;
57219+}
57220+
57221+static int increase_reservation(unsigned long nr_pages)
57222+{
57223+ unsigned long pfn, i, flags;
57224+ struct page *page;
57225+ long rc;
57226+ struct xen_memory_reservation reservation = {
57227+ .address_bits = 0,
57228+ .extent_order = 0,
57229+ .domid = DOMID_SELF
57230+ };
57231+
57232+ if (nr_pages > ARRAY_SIZE(frame_list))
57233+ nr_pages = ARRAY_SIZE(frame_list);
57234+
57235+ balloon_lock(flags);
57236+
57237+ page = balloon_first_page();
57238+ for (i = 0; i < nr_pages; i++) {
57239+ BUG_ON(page == NULL);
57240+ frame_list[i] = page_to_pfn(page);
57241+ page = balloon_next_page(page);
57242+ }
57243+
57244+ set_xen_guest_handle(reservation.extent_start, frame_list);
57245+ reservation.nr_extents = nr_pages;
57246+ rc = HYPERVISOR_memory_op(
57247+ XENMEM_populate_physmap, &reservation);
57248+ if (rc < nr_pages) {
57249+ if (rc > 0) {
57250+ int ret;
57251+
57252+ /* We hit the Xen hard limit: reprobe. */
57253+ reservation.nr_extents = rc;
57254+ ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
57255+ &reservation);
57256+ BUG_ON(ret != rc);
57257+ }
57258+ if (rc >= 0)
57259+ bs.hard_limit = (bs.current_pages + rc -
57260+ bs.driver_pages);
57261+ goto out;
57262+ }
57263+
57264+ for (i = 0; i < nr_pages; i++) {
57265+ page = balloon_retrieve();
57266+ BUG_ON(page == NULL);
57267+
57268+ pfn = page_to_pfn(page);
57269+ BUG_ON(!xen_feature(XENFEAT_auto_translated_physmap) &&
57270+ phys_to_machine_mapping_valid(pfn));
57271+
57272+ set_phys_to_machine(pfn, frame_list[i]);
57273+
57274+ /* Link back into the page tables if not highmem. */
57275+ if (pfn < max_low_pfn) {
57276+ int ret;
57277+ ret = HYPERVISOR_update_va_mapping(
57278+ (unsigned long)__va(pfn << PAGE_SHIFT),
57279+ pfn_pte_ma(frame_list[i], PAGE_KERNEL),
57280+ 0);
57281+ BUG_ON(ret);
57282+ }
57283+
57284+ /* Relinquish the page back to the allocator. */
57285+ ClearPageReserved(page);
57286+ set_page_count(page, 1);
57287+ __free_page(page);
57288+ }
57289+
57290+ bs.current_pages += nr_pages;
57291+ totalram_pages = bs.current_pages;
57292+
57293+ out:
57294+ balloon_unlock(flags);
57295+
57296+ return 0;
57297+}
57298+
57299+static int decrease_reservation(unsigned long nr_pages)
57300+{
57301+ unsigned long pfn, i, flags;
57302+ struct page *page;
57303+ void *v;
57304+ int need_sleep = 0;
57305+ int ret;
57306+ struct xen_memory_reservation reservation = {
57307+ .address_bits = 0,
57308+ .extent_order = 0,
57309+ .domid = DOMID_SELF
57310+ };
57311+
57312+ if (nr_pages > ARRAY_SIZE(frame_list))
57313+ nr_pages = ARRAY_SIZE(frame_list);
57314+
57315+ for (i = 0; i < nr_pages; i++) {
57316+ if ((page = alloc_page(GFP_BALLOON)) == NULL) {
57317+ nr_pages = i;
57318+ need_sleep = 1;
57319+ break;
57320+ }
57321+
57322+ pfn = page_to_pfn(page);
57323+ frame_list[i] = pfn_to_mfn(pfn);
57324+
57325+ if (!PageHighMem(page)) {
57326+ v = phys_to_virt(pfn << PAGE_SHIFT);
57327+ scrub_pages(v, 1);
57328+ ret = HYPERVISOR_update_va_mapping(
57329+ (unsigned long)v, __pte_ma(0), 0);
57330+ BUG_ON(ret);
57331+ }
57332+#ifdef CONFIG_XEN_SCRUB_PAGES
57333+ else {
57334+ v = kmap(page);
57335+ scrub_pages(v, 1);
57336+ kunmap(page);
57337+ }
57338+#endif
57339+ }
57340+
57341+ /* Ensure that ballooned highmem pages don't have kmaps. */
57342+ kmap_flush_unused();
57343+ flush_tlb_all();
57344+
57345+ balloon_lock(flags);
57346+
57347+ /* No more mappings: invalidate P2M and add to balloon. */
57348+ for (i = 0; i < nr_pages; i++) {
57349+ pfn = mfn_to_pfn(frame_list[i]);
57350+ set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
57351+ balloon_append(pfn_to_page(pfn));
57352+ }
57353+
57354+ set_xen_guest_handle(reservation.extent_start, frame_list);
57355+ reservation.nr_extents = nr_pages;
57356+ ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation);
57357+ BUG_ON(ret != nr_pages);
57358+
57359+ bs.current_pages -= nr_pages;
57360+ totalram_pages = bs.current_pages;
57361+
57362+ balloon_unlock(flags);
57363+
57364+ return need_sleep;
57365+}
57366+
57367+/*
57368+ * We avoid multiple worker processes conflicting via the balloon mutex.
57369+ * We may of course race updates of the target counts (which are protected
57370+ * by the balloon lock), or with changes to the Xen hard limit, but we will
57371+ * recover from these in time.
57372+ */
57373+static void balloon_process(void *unused)
57374+{
57375+ int need_sleep = 0;
57376+ long credit;
57377+
57378+ down(&balloon_mutex);
57379+
57380+ do {
57381+ credit = current_target() - bs.current_pages;
57382+ if (credit > 0)
57383+ need_sleep = (increase_reservation(credit) != 0);
57384+ if (credit < 0)
57385+ need_sleep = (decrease_reservation(-credit) != 0);
57386+
57387+#ifndef CONFIG_PREEMPT
57388+ if (need_resched())
57389+ schedule();
57390+#endif
57391+ } while ((credit != 0) && !need_sleep);
57392+
57393+ /* Schedule more work if there is some still to be done. */
57394+ if (current_target() != bs.current_pages)
57395+ mod_timer(&balloon_timer, jiffies + HZ);
57396+
57397+ up(&balloon_mutex);
57398+}
57399+
57400+/* Resets the Xen limit, sets new target, and kicks off processing. */
57401+void balloon_set_new_target(unsigned long target)
57402+{
57403+ /* No need for lock. Not read-modify-write updates. */
57404+ bs.hard_limit = ~0UL;
57405+ bs.target_pages = target;
57406+ schedule_work(&balloon_worker);
57407+}
57408+
57409+static struct xenbus_watch target_watch =
57410+{
57411+ .node = "memory/target"
57412+};
57413+
57414+/* React to a change in the target key */
57415+static void watch_target(struct xenbus_watch *watch,
57416+ const char **vec, unsigned int len)
57417+{
57418+ unsigned long long new_target;
57419+ int err;
57420+
57421+ err = xenbus_scanf(XBT_NIL, "memory", "target", "%llu", &new_target);
57422+ if (err != 1) {
57423+ /* This is ok (for domain0 at least) - so just return */
57424+ return;
57425+ }
57426+
57427+ /* The given memory/target value is in KiB, so it needs converting to
57428+ * pages. PAGE_SHIFT converts bytes to pages, hence PAGE_SHIFT - 10.
57429+ */
57430+ balloon_set_new_target(new_target >> (PAGE_SHIFT - 10));
57431+}
57432+
57433+static int balloon_init_watcher(struct notifier_block *notifier,
57434+ unsigned long event,
57435+ void *data)
57436+{
57437+ int err;
57438+
57439+ err = register_xenbus_watch(&target_watch);
57440+ if (err)
57441+ printk(KERN_ERR "Failed to set balloon watcher\n");
57442+
57443+ return NOTIFY_DONE;
57444+}
57445+
57446+#ifdef CONFIG_PROC_FS
57447+static int balloon_write(struct file *file, const char __user *buffer,
57448+ unsigned long count, void *data)
57449+{
57450+ char memstring[64], *endchar;
57451+ unsigned long long target_bytes;
57452+
57453+ if (!capable(CAP_SYS_ADMIN))
57454+ return -EPERM;
57455+
57456+ if (count <= 1)
57457+ return -EBADMSG; /* runt */
57458+ if (count > sizeof(memstring))
57459+ return -EFBIG; /* too long */
57460+
57461+ if (copy_from_user(memstring, buffer, count))
57462+ return -EFAULT;
57463+ memstring[sizeof(memstring)-1] = '\0';
57464+
57465+ target_bytes = memparse(memstring, &endchar);
57466+ balloon_set_new_target(target_bytes >> PAGE_SHIFT);
57467+
57468+ return count;
57469+}
57470+
57471+static int balloon_read(char *page, char **start, off_t off,
57472+ int count, int *eof, void *data)
57473+{
57474+ int len;
57475+
57476+ len = sprintf(
57477+ page,
57478+ "Current allocation: %8lu kB\n"
57479+ "Requested target: %8lu kB\n"
57480+ "Low-mem balloon: %8lu kB\n"
57481+ "High-mem balloon: %8lu kB\n"
57482+ "Driver pages: %8lu kB\n"
57483+ "Xen hard limit: ",
57484+ PAGES2KB(bs.current_pages), PAGES2KB(bs.target_pages),
57485+ PAGES2KB(bs.balloon_low), PAGES2KB(bs.balloon_high),
57486+ PAGES2KB(bs.driver_pages));
57487+
57488+ if (bs.hard_limit != ~0UL)
57489+ len += sprintf(page + len, "%8lu kB\n",
57490+ PAGES2KB(bs.hard_limit));
57491+ else
57492+ len += sprintf(page + len, " ??? kB\n");
57493+
57494+ *eof = 1;
57495+ return len;
57496+}
57497+#endif
57498+
57499+static struct notifier_block xenstore_notifier;
57500+
57501+static int __init balloon_init(void)
57502+{
57503+ unsigned long pfn;
57504+ struct page *page;
57505+
57506+ if (!is_running_on_xen())
57507+ return -ENODEV;
57508+
57509+ IPRINTK("Initialising balloon driver.\n");
57510+
57511+ bs.current_pages = min(xen_start_info->nr_pages, max_pfn);
57512+ totalram_pages = bs.current_pages;
57513+ bs.target_pages = bs.current_pages;
57514+ bs.balloon_low = 0;
57515+ bs.balloon_high = 0;
57516+ bs.driver_pages = 0UL;
57517+ bs.hard_limit = ~0UL;
57518+
57519+ init_timer(&balloon_timer);
57520+ balloon_timer.data = 0;
57521+ balloon_timer.function = balloon_alarm;
57522+
57523+#ifdef CONFIG_PROC_FS
57524+ if ((balloon_pde = create_xen_proc_entry("balloon", 0644)) == NULL) {
57525+ WPRINTK("Unable to create /proc/xen/balloon.\n");
57526+ return -1;
57527+ }
57528+
57529+ balloon_pde->read_proc = balloon_read;
57530+ balloon_pde->write_proc = balloon_write;
57531+#endif
57532+ balloon_sysfs_init();
57533+
57534+ /* Initialise the balloon with excess memory space. */
57535+ for (pfn = xen_start_info->nr_pages; pfn < max_pfn; pfn++) {
57536+ page = pfn_to_page(pfn);
57537+ if (!PageReserved(page))
57538+ balloon_append(page);
57539+ }
57540+
57541+ target_watch.callback = watch_target;
57542+ xenstore_notifier.notifier_call = balloon_init_watcher;
57543+
57544+ register_xenstore_notifier(&xenstore_notifier);
57545+
57546+ return 0;
57547+}
57548+
57549+subsys_initcall(balloon_init);
57550+
57551+void balloon_update_driver_allowance(long delta)
57552+{
57553+ unsigned long flags;
57554+
57555+ balloon_lock(flags);
57556+ bs.driver_pages += delta;
57557+ balloon_unlock(flags);
57558+}
57559+
57560+static int dealloc_pte_fn(
57561+ pte_t *pte, struct page *pmd_page, unsigned long addr, void *data)
57562+{
57563+ unsigned long mfn = pte_mfn(*pte);
57564+ int ret;
57565+ struct xen_memory_reservation reservation = {
57566+ .nr_extents = 1,
57567+ .extent_order = 0,
57568+ .domid = DOMID_SELF
57569+ };
57570+ set_xen_guest_handle(reservation.extent_start, &mfn);
57571+ set_pte_at(&init_mm, addr, pte, __pte_ma(0));
57572+ set_phys_to_machine(__pa(addr) >> PAGE_SHIFT, INVALID_P2M_ENTRY);
57573+ ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation);
57574+ BUG_ON(ret != 1);
57575+ return 0;
57576+}
57577+
57578+struct page **alloc_empty_pages_and_pagevec(int nr_pages)
57579+{
57580+ unsigned long vaddr, flags;
57581+ struct page *page, **pagevec;
57582+ int i, ret;
57583+
57584+ pagevec = kmalloc(sizeof(page) * nr_pages, GFP_KERNEL);
57585+ if (pagevec == NULL)
57586+ return NULL;
57587+
57588+ for (i = 0; i < nr_pages; i++) {
57589+ page = pagevec[i] = alloc_page(GFP_KERNEL);
57590+ if (page == NULL)
57591+ goto err;
57592+
57593+ vaddr = (unsigned long)page_address(page);
57594+
57595+ scrub_pages(vaddr, 1);
57596+
57597+ balloon_lock(flags);
57598+
57599+ if (xen_feature(XENFEAT_auto_translated_physmap)) {
57600+ unsigned long gmfn = page_to_pfn(page);
57601+ struct xen_memory_reservation reservation = {
57602+ .nr_extents = 1,
57603+ .extent_order = 0,
57604+ .domid = DOMID_SELF
57605+ };
57606+ set_xen_guest_handle(reservation.extent_start, &gmfn);
57607+ ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
57608+ &reservation);
57609+ if (ret == 1)
57610+ ret = 0; /* success */
57611+ } else {
57612+ ret = apply_to_page_range(&init_mm, vaddr, PAGE_SIZE,
57613+ dealloc_pte_fn, NULL);
57614+ }
57615+
57616+ if (ret != 0) {
57617+ balloon_unlock(flags);
57618+ __free_page(page);
57619+ goto err;
57620+ }
57621+
57622+ totalram_pages = --bs.current_pages;
57623+
57624+ balloon_unlock(flags);
57625+ }
57626+
57627+ out:
57628+ schedule_work(&balloon_worker);
57629+ flush_tlb_all();
57630+ return pagevec;
57631+
57632+ err:
57633+ balloon_lock(flags);
57634+ while (--i >= 0)
57635+ balloon_append(pagevec[i]);
57636+ balloon_unlock(flags);
57637+ kfree(pagevec);
57638+ pagevec = NULL;
57639+ goto out;
57640+}
57641+
57642+void free_empty_pages_and_pagevec(struct page **pagevec, int nr_pages)
57643+{
57644+ unsigned long flags;
57645+ int i;
57646+
57647+ if (pagevec == NULL)
57648+ return;
57649+
57650+ balloon_lock(flags);
57651+ for (i = 0; i < nr_pages; i++) {
57652+ BUG_ON(page_count(pagevec[i]) != 1);
57653+ balloon_append(pagevec[i]);
57654+ }
57655+ balloon_unlock(flags);
57656+
57657+ kfree(pagevec);
57658+
57659+ schedule_work(&balloon_worker);
57660+}
57661+
57662+void balloon_release_driver_page(struct page *page)
57663+{
57664+ unsigned long flags;
57665+
57666+ balloon_lock(flags);
57667+ balloon_append(page);
57668+ bs.driver_pages--;
57669+ balloon_unlock(flags);
57670+
57671+ schedule_work(&balloon_worker);
57672+}
57673+
57674+EXPORT_SYMBOL_GPL(balloon_update_driver_allowance);
57675+EXPORT_SYMBOL_GPL(alloc_empty_pages_and_pagevec);
57676+EXPORT_SYMBOL_GPL(free_empty_pages_and_pagevec);
57677+EXPORT_SYMBOL_GPL(balloon_release_driver_page);
57678+
57679+MODULE_LICENSE("Dual BSD/GPL");
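
 The target plumbing above mixes three units: xenstore's memory/target key is in KiB, /proc/xen/balloon takes a byte string via memparse(), and balloon_set_new_target() wants pages. A short worked sketch of the conversions; the example numbers assume the common 4 KiB page size, and EXAMPLE_PAGE_SHIFT stands in for the kernel's PAGE_SHIFT:

     #define EXAMPLE_PAGE_SHIFT 12   /* 4 KiB pages; the kernel uses PAGE_SHIFT */

     /* KiB -> pages, as in watch_target(): shift by PAGE_SHIFT - 10,
      * since 1 KiB = 1 << 10 bytes and 1 page = 1 << PAGE_SHIFT bytes. */
     static unsigned long kib_to_pages(unsigned long long kib)
     {
             return (unsigned long)(kib >> (EXAMPLE_PAGE_SHIFT - 10));
     }

     /* Bytes -> pages, as in balloon_write() after memparse(). */
     static unsigned long bytes_to_pages(unsigned long long bytes)
     {
             return (unsigned long)(bytes >> EXAMPLE_PAGE_SHIFT);
     }

     /* Example with 4 KiB pages:
      *   524288 KiB   (512 MiB)  >> 2   == 131072 pages
      *   536870912 B  (512 MiB)  >> 12  == 131072 pages
      */
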
57680diff -Nur linux-2.6.16.33-noxen/drivers/xen/balloon/common.h linux-2.6.16.33/drivers/xen/balloon/common.h
57681--- linux-2.6.16.33-noxen/drivers/xen/balloon/common.h 1970-01-01 00:00:00.000000000 +0000
57682+++ linux-2.6.16.33/drivers/xen/balloon/common.h 2007-01-08 15:00:45.000000000 +0000
57683@@ -0,0 +1,58 @@
57684+/******************************************************************************
57685+ * balloon/common.h
57686+ *
57687+ * This program is free software; you can redistribute it and/or
57688+ * modify it under the terms of the GNU General Public License version 2
57689+ * as published by the Free Software Foundation; or, when distributed
57690+ * separately from the Linux kernel or incorporated into other
57691+ * software packages, subject to the following license:
57692+ *
57693+ * Permission is hereby granted, free of charge, to any person obtaining a copy
57694+ * of this source file (the "Software"), to deal in the Software without
57695+ * restriction, including without limitation the rights to use, copy, modify,
57696+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
57697+ * and to permit persons to whom the Software is furnished to do so, subject to
57698+ * the following conditions:
57699+ *
57700+ * The above copyright notice and this permission notice shall be included in
57701+ * all copies or substantial portions of the Software.
57702+ *
57703+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
57704+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
57705+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
57706+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
57707+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
57708+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
57709+ * IN THE SOFTWARE.
57710+ */
57711+
57712+#ifndef __XEN_BALLOON_COMMON_H__
57713+#define __XEN_BALLOON_COMMON_H__
57714+
57715+#define PAGES2KB(_p) ((_p)<<(PAGE_SHIFT-10))
57716+
57717+struct balloon_stats {
57718+ /* We aim for 'current allocation' == 'target allocation'. */
57719+ unsigned long current_pages;
57720+ unsigned long target_pages;
57721+ /* We may hit the hard limit in Xen. If we do then we remember it. */
57722+ unsigned long hard_limit;
57723+ /*
57724+ * Drivers may alter the memory reservation independently, but they
57725+ * must inform the balloon driver so we avoid hitting the hard limit.
57726+ */
57727+ unsigned long driver_pages;
57728+ /* Number of pages in high- and low-memory balloons. */
57729+ unsigned long balloon_low;
57730+ unsigned long balloon_high;
57731+};
57732+
57733+extern struct balloon_stats balloon_stats;
57734+#define bs balloon_stats
57735+
57736+int balloon_sysfs_init(void);
57737+void balloon_sysfs_exit(void);
57738+
57739+void balloon_set_new_target(unsigned long target);
57740+
57741+#endif /* __XEN_BALLOON_COMMON_H__ */
57742diff -Nur linux-2.6.16.33-noxen/drivers/xen/balloon/sysfs.c linux-2.6.16.33/drivers/xen/balloon/sysfs.c
57743--- linux-2.6.16.33-noxen/drivers/xen/balloon/sysfs.c 1970-01-01 00:00:00.000000000 +0000
57744+++ linux-2.6.16.33/drivers/xen/balloon/sysfs.c 2007-01-08 15:00:45.000000000 +0000
57745@@ -0,0 +1,165 @@
57746+/******************************************************************************
57747+ * balloon/sysfs.c
57748+ *
57749+ * Xen balloon driver - sysfs interfaces.
57750+ *
57751+ * This program is free software; you can redistribute it and/or
57752+ * modify it under the terms of the GNU General Public License version 2
57753+ * as published by the Free Software Foundation; or, when distributed
57754+ * separately from the Linux kernel or incorporated into other
57755+ * software packages, subject to the following license:
57756+ *
57757+ * Permission is hereby granted, free of charge, to any person obtaining a copy
57758+ * of this source file (the "Software"), to deal in the Software without
57759+ * restriction, including without limitation the rights to use, copy, modify,
57760+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
57761+ * and to permit persons to whom the Software is furnished to do so, subject to
57762+ * the following conditions:
57763+ *
57764+ * The above copyright notice and this permission notice shall be included in
57765+ * all copies or substantial portions of the Software.
57766+ *
57767+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
57768+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
57769+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
57770+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
57771+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
57772+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
57773+ * IN THE SOFTWARE.
57774+ */
57775+
57776+#include <linux/config.h>
57777+#include <linux/capability.h>
57778+#include <linux/stat.h>
57779+#include <linux/sysdev.h>
57780+#include "common.h"
57781+
57782+#define BALLOON_CLASS_NAME "memory"
57783+
57784+#define BALLOON_SHOW(name, format, args...) \
57785+ static ssize_t show_##name(struct sys_device *dev, \
57786+ char *buf) \
57787+ { \
57788+ return sprintf(buf, format, ##args); \
57789+ } \
57790+ static SYSDEV_ATTR(name, S_IRUGO, show_##name, NULL)
57791+
57792+BALLOON_SHOW(current_kb, "%lu\n", PAGES2KB(bs.current_pages));
57793+BALLOON_SHOW(low_kb, "%lu\n", PAGES2KB(bs.balloon_low));
57794+BALLOON_SHOW(high_kb, "%lu\n", PAGES2KB(bs.balloon_high));
57795+BALLOON_SHOW(hard_limit_kb,
57796+ (bs.hard_limit!=~0UL) ? "%lu\n" : "???\n",
57797+ (bs.hard_limit!=~0UL) ? PAGES2KB(bs.hard_limit) : 0);
57798+BALLOON_SHOW(driver_kb, "%lu\n", PAGES2KB(bs.driver_pages));
57799+
57800+static ssize_t show_target_kb(struct sys_device *dev, char *buf)
57801+{
57802+ return sprintf(buf, "%lu\n", PAGES2KB(bs.target_pages));
57803+}
57804+
57805+static ssize_t store_target_kb(struct sys_device *dev,
57806+ const char *buf,
57807+ size_t count)
57808+{
57809+ char memstring[64], *endchar;
57810+ unsigned long long target_bytes;
57811+
57812+ if (!capable(CAP_SYS_ADMIN))
57813+ return -EPERM;
57814+
57815+ if (count <= 1)
57816+ return -EBADMSG; /* runt */
57817+ if (count > sizeof(memstring))
57818+ return -EFBIG; /* too long */
57819+ strcpy(memstring, buf);
57820+
57821+ target_bytes = memparse(memstring, &endchar);
57822+ balloon_set_new_target(target_bytes >> PAGE_SHIFT);
57823+
57824+ return count;
57825+}
57826+
57827+static SYSDEV_ATTR(target_kb, S_IRUGO | S_IWUSR,
57828+ show_target_kb, store_target_kb);
57829+
57830+static struct sysdev_attribute *balloon_attrs[] = {
57831+ &attr_target_kb,
57832+};
57833+
57834+static struct attribute *balloon_info_attrs[] = {
57835+ &attr_current_kb.attr,
57836+ &attr_low_kb.attr,
57837+ &attr_high_kb.attr,
57838+ &attr_hard_limit_kb.attr,
57839+ &attr_driver_kb.attr,
57840+ NULL
57841+};
57842+
57843+static struct attribute_group balloon_info_group = {
57844+ .name = "info",
57845+ .attrs = balloon_info_attrs,
57846+};
57847+
57848+static struct sysdev_class balloon_sysdev_class = {
57849+ set_kset_name(BALLOON_CLASS_NAME),
57850+};
57851+
57852+static struct sys_device balloon_sysdev;
57853+
57854+static int register_balloon(struct sys_device *sysdev)
57855+{
57856+ int i, error;
57857+
57858+ error = sysdev_class_register(&balloon_sysdev_class);
57859+ if (error)
57860+ return error;
57861+
57862+ sysdev->id = 0;
57863+ sysdev->cls = &balloon_sysdev_class;
57864+
57865+ error = sysdev_register(sysdev);
57866+ if (error) {
57867+ sysdev_class_unregister(&balloon_sysdev_class);
57868+ return error;
57869+ }
57870+
57871+ for (i = 0; i < ARRAY_SIZE(balloon_attrs); i++) {
57872+ error = sysdev_create_file(sysdev, balloon_attrs[i]);
57873+ if (error)
57874+ goto fail;
57875+ }
57876+
57877+ error = sysfs_create_group(&sysdev->kobj, &balloon_info_group);
57878+ if (error)
57879+ goto fail;
57880+
57881+ return 0;
57882+
57883+ fail:
57884+ while (--i >= 0)
57885+ sysdev_remove_file(sysdev, balloon_attrs[i]);
57886+ sysdev_unregister(sysdev);
57887+ sysdev_class_unregister(&balloon_sysdev_class);
57888+ return error;
57889+}
57890+
57891+static void unregister_balloon(struct sys_device *sysdev)
57892+{
57893+ int i;
57894+
57895+ sysfs_remove_group(&sysdev->kobj, &balloon_info_group);
57896+ for (i = 0; i < ARRAY_SIZE(balloon_attrs); i++)
57897+ sysdev_remove_file(sysdev, balloon_attrs[i]);
57898+ sysdev_unregister(sysdev);
57899+ sysdev_class_unregister(&balloon_sysdev_class);
57900+}
57901+
57902+int balloon_sysfs_init(void)
57903+{
57904+ return register_balloon(&balloon_sysdev);
57905+}
57906+
57907+void balloon_sysfs_exit(void)
57908+{
57909+ unregister_balloon(&balloon_sysdev);
57910+}
57911diff -Nur linux-2.6.16.33-noxen/drivers/xen/blkback/Makefile linux-2.6.16.33/drivers/xen/blkback/Makefile
57912--- linux-2.6.16.33-noxen/drivers/xen/blkback/Makefile 1970-01-01 00:00:00.000000000 +0000
57913+++ linux-2.6.16.33/drivers/xen/blkback/Makefile 2007-01-08 15:00:45.000000000 +0000
57914@@ -0,0 +1,3 @@
57915+obj-$(CONFIG_XEN_BLKDEV_BACKEND) := blkbk.o
57916+
57917+blkbk-y := blkback.o xenbus.o interface.o vbd.o
57918diff -Nur linux-2.6.16.33-noxen/drivers/xen/blkback/blkback.c linux-2.6.16.33/drivers/xen/blkback/blkback.c
57919--- linux-2.6.16.33-noxen/drivers/xen/blkback/blkback.c 1970-01-01 00:00:00.000000000 +0000
57920+++ linux-2.6.16.33/drivers/xen/blkback/blkback.c 2007-01-08 15:00:45.000000000 +0000
57921@@ -0,0 +1,580 @@
57922+/******************************************************************************
57923+ * arch/xen/drivers/blkif/backend/main.c
57924+ *
57925+ * Back-end of the driver for virtual block devices. This portion of the
57926+ * driver exports a 'unified' block-device interface that can be accessed
57927+ * by any operating system that implements a compatible front end. A
57928+ * reference front-end implementation can be found in:
57929+ * arch/xen/drivers/blkif/frontend
57930+ *
57931+ * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
57932+ * Copyright (c) 2005, Christopher Clark
57933+ *
57934+ * This program is free software; you can redistribute it and/or
57935+ * modify it under the terms of the GNU General Public License version 2
57936+ * as published by the Free Software Foundation; or, when distributed
57937+ * separately from the Linux kernel or incorporated into other
57938+ * software packages, subject to the following license:
57939+ *
57940+ * Permission is hereby granted, free of charge, to any person obtaining a copy
57941+ * of this source file (the "Software"), to deal in the Software without
57942+ * restriction, including without limitation the rights to use, copy, modify,
57943+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
57944+ * and to permit persons to whom the Software is furnished to do so, subject to
57945+ * the following conditions:
57946+ *
57947+ * The above copyright notice and this permission notice shall be included in
57948+ * all copies or substantial portions of the Software.
57949+ *
57950+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
57951+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
57952+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
57953+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
57954+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
57955+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
57956+ * IN THE SOFTWARE.
57957+ */
57958+
57959+#include <linux/spinlock.h>
57960+#include <linux/kthread.h>
57961+#include <linux/list.h>
57962+#include <xen/balloon.h>
57963+#include <asm/hypervisor.h>
57964+#include "common.h"
57965+
57966+/*
57967+ * These are rather arbitrary. They are fairly large because adjacent requests
57968+ * pulled from a communication ring are quite likely to end up being part of
57969+ * the same scatter/gather request at the disc.
57970+ *
57971+ * ** TRY INCREASING 'blkif_reqs' IF WRITE SPEEDS SEEM TOO LOW **
57972+ *
57973+ * This will increase the chances of being able to write whole tracks.
57974+ * 64 should be enough to keep us competitive with Linux.
57975+ */
57976+static int blkif_reqs = 64;
57977+module_param_named(reqs, blkif_reqs, int, 0);
57978+MODULE_PARM_DESC(reqs, "Number of blkback requests to allocate");
57979+
57980+/* Run-time switchable: /sys/module/blkback/parameters/ */
57981+static unsigned int log_stats = 0;
57982+static unsigned int debug_lvl = 0;
57983+module_param(log_stats, int, 0644);
57984+module_param(debug_lvl, int, 0644);
57985+
57986+/*
57987+ * Each outstanding request that we've passed to the lower device layers has a
57988+ * 'pending_req' allocated to it. Each buffer_head that completes decrements
57989+ * the pendcnt towards zero. When it hits zero, the specified domain has a
57990+ * response queued for it, with the saved 'id' passed back.
57991+ */
57992+typedef struct {
57993+ blkif_t *blkif;
57994+ unsigned long id;
57995+ int nr_pages;
57996+ atomic_t pendcnt;
57997+ unsigned short operation;
57998+ int status;
57999+ struct list_head free_list;
58000+} pending_req_t;
58001+
58002+static pending_req_t *pending_reqs;
58003+static struct list_head pending_free;
58004+static DEFINE_SPINLOCK(pending_free_lock);
58005+static DECLARE_WAIT_QUEUE_HEAD(pending_free_wq);
58006+
58007+#define BLKBACK_INVALID_HANDLE (~0)
58008+
58009+static struct page **pending_pages;
58010+static grant_handle_t *pending_grant_handles;
58011+
58012+static inline int vaddr_pagenr(pending_req_t *req, int seg)
58013+{
58014+ return (req - pending_reqs) * BLKIF_MAX_SEGMENTS_PER_REQUEST + seg;
58015+}
58016+
58017+static inline unsigned long vaddr(pending_req_t *req, int seg)
58018+{
58019+ unsigned long pfn = page_to_pfn(pending_pages[vaddr_pagenr(req, seg)]);
58020+ return (unsigned long)pfn_to_kaddr(pfn);
58021+}
58022+
58023+#define pending_handle(_req, _seg) \
58024+ (pending_grant_handles[vaddr_pagenr(_req, _seg)])
58025+
58026+
58027+static int do_block_io_op(blkif_t *blkif);
58028+static void dispatch_rw_block_io(blkif_t *blkif,
58029+ blkif_request_t *req,
58030+ pending_req_t *pending_req);
58031+static void make_response(blkif_t *blkif, unsigned long id,
58032+ unsigned short op, int st);
58033+
58034+/******************************************************************
58035+ * misc small helpers
58036+ */
58037+static pending_req_t* alloc_req(void)
58038+{
58039+ pending_req_t *req = NULL;
58040+ unsigned long flags;
58041+
58042+ spin_lock_irqsave(&pending_free_lock, flags);
58043+ if (!list_empty(&pending_free)) {
58044+ req = list_entry(pending_free.next, pending_req_t, free_list);
58045+ list_del(&req->free_list);
58046+ }
58047+ spin_unlock_irqrestore(&pending_free_lock, flags);
58048+ return req;
58049+}
58050+
58051+static void free_req(pending_req_t *req)
58052+{
58053+ unsigned long flags;
58054+ int was_empty;
58055+
58056+ spin_lock_irqsave(&pending_free_lock, flags);
58057+ was_empty = list_empty(&pending_free);
58058+ list_add(&req->free_list, &pending_free);
58059+ spin_unlock_irqrestore(&pending_free_lock, flags);
58060+ if (was_empty)
58061+ wake_up(&pending_free_wq);
58062+}
58063+
58064+static void unplug_queue(blkif_t *blkif)
58065+{
58066+ if (blkif->plug == NULL)
58067+ return;
58068+ if (blkif->plug->unplug_fn)
58069+ blkif->plug->unplug_fn(blkif->plug);
58070+ blk_put_queue(blkif->plug);
58071+ blkif->plug = NULL;
58072+}
58073+
58074+static void plug_queue(blkif_t *blkif, struct bio *bio)
58075+{
58076+ request_queue_t *q = bdev_get_queue(bio->bi_bdev);
58077+
58078+ if (q == blkif->plug)
58079+ return;
58080+ unplug_queue(blkif);
58081+ blk_get_queue(q);
58082+ blkif->plug = q;
58083+}
58084+
58085+static void fast_flush_area(pending_req_t *req)
58086+{
58087+ struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
58088+ unsigned int i, invcount = 0;
58089+ grant_handle_t handle;
58090+ int ret;
58091+
58092+ for (i = 0; i < req->nr_pages; i++) {
58093+ handle = pending_handle(req, i);
58094+ if (handle == BLKBACK_INVALID_HANDLE)
58095+ continue;
58096+ gnttab_set_unmap_op(&unmap[i], vaddr(req, i), GNTMAP_host_map,
58097+ handle);
58098+ pending_handle(req, i) = BLKBACK_INVALID_HANDLE;
58099+ invcount++;
58100+ }
58101+
58102+ ret = HYPERVISOR_grant_table_op(
58103+ GNTTABOP_unmap_grant_ref, unmap, invcount);
58104+ BUG_ON(ret);
58105+}
58106+
58107+/******************************************************************
58108+ * SCHEDULER FUNCTIONS
58109+ */
58110+
58111+static void print_stats(blkif_t *blkif)
58112+{
58113+ printk(KERN_DEBUG "%s: oo %3d | rd %4d | wr %4d | br %4d\n",
58114+ current->comm, blkif->st_oo_req,
58115+ blkif->st_rd_req, blkif->st_wr_req, blkif->st_br_req);
58116+ blkif->st_print = jiffies + msecs_to_jiffies(10 * 1000);
58117+ blkif->st_rd_req = 0;
58118+ blkif->st_wr_req = 0;
58119+ blkif->st_oo_req = 0;
58120+}
58121+
58122+int blkif_schedule(void *arg)
58123+{
58124+ blkif_t *blkif = arg;
58125+
58126+ blkif_get(blkif);
58127+
58128+ if (debug_lvl)
58129+ printk(KERN_DEBUG "%s: started\n", current->comm);
58130+
58131+ while (!kthread_should_stop()) {
58132+ wait_event_interruptible(
58133+ blkif->wq,
58134+ blkif->waiting_reqs || kthread_should_stop());
58135+ wait_event_interruptible(
58136+ pending_free_wq,
58137+ !list_empty(&pending_free) || kthread_should_stop());
58138+
58139+ blkif->waiting_reqs = 0;
58140+ smp_mb(); /* clear flag *before* checking for work */
58141+
58142+ if (do_block_io_op(blkif))
58143+ blkif->waiting_reqs = 1;
58144+ unplug_queue(blkif);
58145+
58146+ if (log_stats && time_after(jiffies, blkif->st_print))
58147+ print_stats(blkif);
58148+ }
58149+
58150+ if (log_stats)
58151+ print_stats(blkif);
58152+ if (debug_lvl)
58153+ printk(KERN_DEBUG "%s: exiting\n", current->comm);
58154+
58155+ blkif->xenblkd = NULL;
58156+ blkif_put(blkif);
58157+
58158+ return 0;
58159+}
58160+
58161+/******************************************************************
58162+ * COMPLETION CALLBACK -- Called as bh->b_end_io()
58163+ */
58164+
58165+static void __end_block_io_op(pending_req_t *pending_req, int error)
58166+{
58167+ /* An error fails the entire request. */
58168+ if ((pending_req->operation == BLKIF_OP_WRITE_BARRIER) &&
58169+ (error == -EOPNOTSUPP)) {
58170+ DPRINTK("blkback: write barrier op failed, not supported\n");
58171+ blkback_barrier(XBT_NIL, pending_req->blkif->be, 0);
58172+ pending_req->status = BLKIF_RSP_EOPNOTSUPP;
58173+ } else if (error) {
58174+ DPRINTK("Buffer not up-to-date at end of operation, "
58175+ "error=%d\n", error);
58176+ pending_req->status = BLKIF_RSP_ERROR;
58177+ }
58178+
58179+ if (atomic_dec_and_test(&pending_req->pendcnt)) {
58180+ fast_flush_area(pending_req);
58181+ make_response(pending_req->blkif, pending_req->id,
58182+ pending_req->operation, pending_req->status);
58183+ blkif_put(pending_req->blkif);
58184+ free_req(pending_req);
58185+ }
58186+}
58187+
58188+static int end_block_io_op(struct bio *bio, unsigned int done, int error)
58189+{
58190+ if (bio->bi_size != 0)
58191+ return 1;
58192+ __end_block_io_op(bio->bi_private, error);
58193+ bio_put(bio);
58194+ return error;
58195+}
58196+
58197+
58198+/******************************************************************************
58199+ * NOTIFICATION FROM GUEST OS.
58200+ */
58201+
58202+static void blkif_notify_work(blkif_t *blkif)
58203+{
58204+ blkif->waiting_reqs = 1;
58205+ wake_up(&blkif->wq);
58206+}
58207+
58208+irqreturn_t blkif_be_int(int irq, void *dev_id, struct pt_regs *regs)
58209+{
58210+ blkif_notify_work(dev_id);
58211+ return IRQ_HANDLED;
58212+}
58213+
58214+
58215+
58216+/******************************************************************
58217+ * DOWNWARD CALLS -- These interface with the block-device layer proper.
58218+ */
58219+
58220+static int do_block_io_op(blkif_t *blkif)
58221+{
58222+ blkif_back_ring_t *blk_ring = &blkif->blk_ring;
58223+ blkif_request_t req;
58224+ pending_req_t *pending_req;
58225+ RING_IDX rc, rp;
58226+ int more_to_do = 0;
58227+
58228+ rc = blk_ring->req_cons;
58229+ rp = blk_ring->sring->req_prod;
58230+ rmb(); /* Ensure we see queued requests up to 'rp'. */
58231+
58232+ while ((rc != rp) && !RING_REQUEST_CONS_OVERFLOW(blk_ring, rc)) {
58233+
58234+ pending_req = alloc_req();
58235+ if (NULL == pending_req) {
58236+ blkif->st_oo_req++;
58237+ more_to_do = 1;
58238+ break;
58239+ }
58240+
58241+ memcpy(&req, RING_GET_REQUEST(blk_ring, rc), sizeof(req));
58242+ blk_ring->req_cons = ++rc; /* before make_response() */
58243+
58244+ switch (req.operation) {
58245+ case BLKIF_OP_READ:
58246+ blkif->st_rd_req++;
58247+ dispatch_rw_block_io(blkif, &req, pending_req);
58248+ break;
58249+ case BLKIF_OP_WRITE_BARRIER:
58250+ blkif->st_br_req++;
58251+ /* fall through */
58252+ case BLKIF_OP_WRITE:
58253+ blkif->st_wr_req++;
58254+ dispatch_rw_block_io(blkif, &req, pending_req);
58255+ break;
58256+ default:
58257+ DPRINTK("error: unknown block io operation [%d]\n",
58258+ req.operation);
58259+ make_response(blkif, req.id, req.operation,
58260+ BLKIF_RSP_ERROR);
58261+ free_req(pending_req);
58262+ break;
58263+ }
58264+ }
58265+ return more_to_do;
58266+}
58267+
58268+static void dispatch_rw_block_io(blkif_t *blkif,
58269+ blkif_request_t *req,
58270+ pending_req_t *pending_req)
58271+{
58272+ extern void ll_rw_block(int rw, int nr, struct buffer_head * bhs[]);
58273+ struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST];
58274+ struct phys_req preq;
58275+ struct {
58276+ unsigned long buf; unsigned int nsec;
58277+ } seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
58278+ unsigned int nseg;
58279+ struct bio *bio = NULL, *biolist[BLKIF_MAX_SEGMENTS_PER_REQUEST];
58280+ int ret, i, nbio = 0;
58281+ int operation;
58282+
58283+ switch (req->operation) {
58284+ case BLKIF_OP_READ:
58285+ operation = READ;
58286+ break;
58287+ case BLKIF_OP_WRITE:
58288+ operation = WRITE;
58289+ break;
58290+ case BLKIF_OP_WRITE_BARRIER:
58291+ operation = WRITE_BARRIER;
58292+ break;
58293+ default:
58294+ operation = 0; /* make gcc happy */
58295+ BUG();
58296+ }
58297+
58298+ /* Check that number of segments is sane. */
58299+ nseg = req->nr_segments;
58300+ if (unlikely(nseg == 0) ||
58301+ unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) {
58302+ DPRINTK("Bad number of segments in request (%d)\n", nseg);
58303+ goto fail_response;
58304+ }
58305+
58306+ preq.dev = req->handle;
58307+ preq.sector_number = req->sector_number;
58308+ preq.nr_sects = 0;
58309+
58310+ pending_req->blkif = blkif;
58311+ pending_req->id = req->id;
58312+ pending_req->operation = req->operation;
58313+ pending_req->status = BLKIF_RSP_OKAY;
58314+ pending_req->nr_pages = nseg;
58315+
58316+ for (i = 0; i < nseg; i++) {
58317+ uint32_t flags;
58318+
58319+ seg[i].nsec = req->seg[i].last_sect -
58320+ req->seg[i].first_sect + 1;
58321+
58322+ if ((req->seg[i].last_sect >= (PAGE_SIZE >> 9)) ||
58323+ (req->seg[i].last_sect < req->seg[i].first_sect))
58324+ goto fail_response;
58325+ preq.nr_sects += seg[i].nsec;
58326+
58327+ flags = GNTMAP_host_map;
58328+ if (operation != READ)
58329+ flags |= GNTMAP_readonly;
58330+ gnttab_set_map_op(&map[i], vaddr(pending_req, i), flags,
58331+ req->seg[i].gref, blkif->domid);
58332+ }
58333+
58334+ ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, map, nseg);
58335+ BUG_ON(ret);
58336+
58337+ for (i = 0; i < nseg; i++) {
58338+ if (unlikely(map[i].status != 0)) {
58339+ DPRINTK("invalid buffer -- could not remap it\n");
58340+ map[i].handle = BLKBACK_INVALID_HANDLE;
58341+ ret |= 1;
58342+ }
58343+
58344+ pending_handle(pending_req, i) = map[i].handle;
58345+
58346+ if (ret)
58347+ continue;
58348+
58349+ set_phys_to_machine(__pa(vaddr(
58350+ pending_req, i)) >> PAGE_SHIFT,
58351+ FOREIGN_FRAME(map[i].dev_bus_addr >> PAGE_SHIFT));
58352+ seg[i].buf = map[i].dev_bus_addr |
58353+ (req->seg[i].first_sect << 9);
58354+ }
58355+
58356+ if (ret)
58357+ goto fail_flush;
58358+
58359+ if (vbd_translate(&preq, blkif, operation) != 0) {
58360+ DPRINTK("access denied: %s of [%llu,%llu] on dev=%04x\n",
58361+ operation == READ ? "read" : "write",
58362+ preq.sector_number,
58363+ preq.sector_number + preq.nr_sects, preq.dev);
58364+ goto fail_flush;
58365+ }
58366+
58367+ for (i = 0; i < nseg; i++) {
58368+ if (((int)preq.sector_number|(int)seg[i].nsec) &
58369+ ((bdev_hardsect_size(preq.bdev) >> 9) - 1)) {
58370+ DPRINTK("Misaligned I/O request from domain %d",
58371+ blkif->domid);
58372+ goto fail_put_bio;
58373+ }
58374+
58375+ while ((bio == NULL) ||
58376+ (bio_add_page(bio,
58377+ virt_to_page(vaddr(pending_req, i)),
58378+ seg[i].nsec << 9,
58379+ seg[i].buf & ~PAGE_MASK) == 0)) {
58380+ bio = biolist[nbio++] = bio_alloc(GFP_KERNEL, nseg-i);
58381+ if (unlikely(bio == NULL))
58382+ goto fail_put_bio;
58383+
58384+ bio->bi_bdev = preq.bdev;
58385+ bio->bi_private = pending_req;
58386+ bio->bi_end_io = end_block_io_op;
58387+ bio->bi_sector = preq.sector_number;
58388+ }
58389+
58390+ preq.sector_number += seg[i].nsec;
58391+ }
58392+
58393+ plug_queue(blkif, bio);
58394+ atomic_set(&pending_req->pendcnt, nbio);
58395+ blkif_get(blkif);
58396+
58397+ for (i = 0; i < nbio; i++)
58398+ submit_bio(operation, biolist[i]);
58399+
58400+ return;
58401+
58402+ fail_put_bio:
58403+ for (i = 0; i < (nbio-1); i++)
58404+ bio_put(biolist[i]);
58405+ fail_flush:
58406+ fast_flush_area(pending_req);
58407+ fail_response:
58408+ make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR);
58409+ free_req(pending_req);
58410+}
58411+
58412+
58413+
58414+/******************************************************************
58415+ * MISCELLANEOUS SETUP / TEARDOWN / DEBUGGING
58416+ */
58417+
58418+
58419+static void make_response(blkif_t *blkif, unsigned long id,
58420+ unsigned short op, int st)
58421+{
58422+ blkif_response_t *resp;
58423+ unsigned long flags;
58424+ blkif_back_ring_t *blk_ring = &blkif->blk_ring;
58425+ int more_to_do = 0;
58426+ int notify;
58427+
58428+ spin_lock_irqsave(&blkif->blk_ring_lock, flags);
58429+
58430+ /* Place on the response ring for the relevant domain. */
58431+ resp = RING_GET_RESPONSE(blk_ring, blk_ring->rsp_prod_pvt);
58432+ resp->id = id;
58433+ resp->operation = op;
58434+ resp->status = st;
58435+ blk_ring->rsp_prod_pvt++;
58436+ RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(blk_ring, notify);
58437+
58438+ if (blk_ring->rsp_prod_pvt == blk_ring->req_cons) {
58439+ /*
58440+ * Tail check for pending requests. Allows frontend to avoid
58441+ * notifications if requests are already in flight (lower
58442+ * overheads and promotes batching).
58443+ */
58444+ RING_FINAL_CHECK_FOR_REQUESTS(blk_ring, more_to_do);
58445+
58446+ } else if (RING_HAS_UNCONSUMED_REQUESTS(blk_ring)) {
58447+ more_to_do = 1;
58448+
58449+ }
58450+ spin_unlock_irqrestore(&blkif->blk_ring_lock, flags);
58451+
58452+ if (more_to_do)
58453+ blkif_notify_work(blkif);
58454+ if (notify)
58455+ notify_remote_via_irq(blkif->irq);
58456+}
58457+
58458+static int __init blkif_init(void)
58459+{
58460+ int i, mmap_pages;
58461+
58462+ if (!is_running_on_xen())
58463+ return -ENODEV;
58464+
58465+ mmap_pages = blkif_reqs * BLKIF_MAX_SEGMENTS_PER_REQUEST;
58466+
58467+ pending_reqs = kmalloc(sizeof(pending_reqs[0]) *
58468+ blkif_reqs, GFP_KERNEL);
58469+ pending_grant_handles = kmalloc(sizeof(pending_grant_handles[0]) *
58470+ mmap_pages, GFP_KERNEL);
58471+ pending_pages = alloc_empty_pages_and_pagevec(mmap_pages);
58472+
58473+ if (!pending_reqs || !pending_grant_handles || !pending_pages)
58474+ goto out_of_memory;
58475+
58476+ for (i = 0; i < mmap_pages; i++)
58477+ pending_grant_handles[i] = BLKBACK_INVALID_HANDLE;
58478+
58479+ blkif_interface_init();
58480+
58481+ memset(pending_reqs, 0, sizeof(pending_reqs));
58482+ INIT_LIST_HEAD(&pending_free);
58483+
58484+ for (i = 0; i < blkif_reqs; i++)
58485+ list_add_tail(&pending_reqs[i].free_list, &pending_free);
58486+
58487+ blkif_xenbus_init();
58488+
58489+ return 0;
58490+
58491+ out_of_memory:
58492+ kfree(pending_reqs);
58493+ kfree(pending_grant_handles);
58494+ free_empty_pages_and_pagevec(pending_pages, mmap_pages);
58495+ printk("%s: out of memory\n", __FUNCTION__);
58496+ return -ENOMEM;
58497+}
58498+
58499+module_init(blkif_init);
58500+
58501+MODULE_LICENSE("Dual BSD/GPL");
58502diff -Nur linux-2.6.16.33-noxen/drivers/xen/blkback/common.h linux-2.6.16.33/drivers/xen/blkback/common.h
58503--- linux-2.6.16.33-noxen/drivers/xen/blkback/common.h 1970-01-01 00:00:00.000000000 +0000
58504+++ linux-2.6.16.33/drivers/xen/blkback/common.h 2007-01-08 15:00:45.000000000 +0000
58505@@ -0,0 +1,139 @@
58506+/*
58507+ * This program is free software; you can redistribute it and/or
58508+ * modify it under the terms of the GNU General Public License version 2
58509+ * as published by the Free Software Foundation; or, when distributed
58510+ * separately from the Linux kernel or incorporated into other
58511+ * software packages, subject to the following license:
58512+ *
58513+ * Permission is hereby granted, free of charge, to any person obtaining a copy
58514+ * of this source file (the "Software"), to deal in the Software without
58515+ * restriction, including without limitation the rights to use, copy, modify,
58516+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
58517+ * and to permit persons to whom the Software is furnished to do so, subject to
58518+ * the following conditions:
58519+ *
58520+ * The above copyright notice and this permission notice shall be included in
58521+ * all copies or substantial portions of the Software.
58522+ *
58523+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
58524+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
58525+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
58526+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
58527+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
58528+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
58529+ * IN THE SOFTWARE.
58530+ */
58531+
58532+#ifndef __BLKIF__BACKEND__COMMON_H__
58533+#define __BLKIF__BACKEND__COMMON_H__
58534+
58535+#include <linux/config.h>
58536+#include <linux/version.h>
58537+#include <linux/module.h>
58538+#include <linux/interrupt.h>
58539+#include <linux/slab.h>
58540+#include <linux/blkdev.h>
58541+#include <linux/vmalloc.h>
58542+#include <linux/wait.h>
58543+#include <asm/io.h>
58544+#include <asm/setup.h>
58545+#include <asm/pgalloc.h>
58546+#include <xen/evtchn.h>
58547+#include <asm/hypervisor.h>
58548+#include <xen/interface/io/blkif.h>
58549+#include <xen/interface/io/ring.h>
58550+#include <xen/gnttab.h>
58551+#include <xen/driver_util.h>
58552+#include <xen/xenbus.h>
58553+
58554+#define DPRINTK(_f, _a...) \
58555+ pr_debug("(file=%s, line=%d) " _f, \
58556+ __FILE__ , __LINE__ , ## _a )
58557+
58558+struct vbd {
58559+ blkif_vdev_t handle; /* what the domain refers to this vbd as */
58560+ unsigned char readonly; /* Non-zero -> read-only */
58561+ unsigned char type; /* VDISK_xxx */
58562+ u32 pdevice; /* phys device that this vbd maps to */
58563+ struct block_device *bdev;
58564+};
58565+
58566+struct backend_info;
58567+
58568+typedef struct blkif_st {
58569+ /* Unique identifier for this interface. */
58570+ domid_t domid;
58571+ unsigned int handle;
58572+ /* Physical parameters of the comms window. */
58573+ unsigned int evtchn;
58574+ unsigned int irq;
58575+ /* Comms information. */
58576+ blkif_back_ring_t blk_ring;
58577+ struct vm_struct *blk_ring_area;
58578+ /* The VBD attached to this interface. */
58579+ struct vbd vbd;
58580+ /* Back pointer to the backend_info. */
58581+ struct backend_info *be;
58582+ /* Private fields. */
58583+ spinlock_t blk_ring_lock;
58584+ atomic_t refcnt;
58585+
58586+ wait_queue_head_t wq;
58587+ struct task_struct *xenblkd;
58588+ unsigned int waiting_reqs;
58589+ request_queue_t *plug;
58590+
58591+ /* statistics */
58592+ unsigned long st_print;
58593+ int st_rd_req;
58594+ int st_wr_req;
58595+ int st_oo_req;
58596+ int st_br_req;
58597+
58598+ wait_queue_head_t waiting_to_free;
58599+
58600+ grant_handle_t shmem_handle;
58601+ grant_ref_t shmem_ref;
58602+} blkif_t;
58603+
58604+blkif_t *blkif_alloc(domid_t domid);
58605+void blkif_disconnect(blkif_t *blkif);
58606+void blkif_free(blkif_t *blkif);
58607+int blkif_map(blkif_t *blkif, unsigned long shared_page, unsigned int evtchn);
58608+
58609+#define blkif_get(_b) (atomic_inc(&(_b)->refcnt))
58610+#define blkif_put(_b) \
58611+ do { \
58612+ if (atomic_dec_and_test(&(_b)->refcnt)) \
58613+ wake_up(&(_b)->waiting_to_free);\
58614+ } while (0)
58615+
58616+/* Create a vbd. */
58617+int vbd_create(blkif_t *blkif, blkif_vdev_t vdevice, unsigned major,
58618+ unsigned minor, int readonly);
58619+void vbd_free(struct vbd *vbd);
58620+
58621+unsigned long long vbd_size(struct vbd *vbd);
58622+unsigned int vbd_info(struct vbd *vbd);
58623+unsigned long vbd_secsize(struct vbd *vbd);
58624+
58625+struct phys_req {
58626+ unsigned short dev;
58627+ unsigned short nr_sects;
58628+ struct block_device *bdev;
58629+ blkif_sector_t sector_number;
58630+};
58631+
58632+int vbd_translate(struct phys_req *req, blkif_t *blkif, int operation);
58633+
58634+void blkif_interface_init(void);
58635+
58636+void blkif_xenbus_init(void);
58637+
58638+irqreturn_t blkif_be_int(int irq, void *dev_id, struct pt_regs *regs);
58639+int blkif_schedule(void *arg);
58640+
58641+int blkback_barrier(struct xenbus_transaction xbt,
58642+ struct backend_info *be, int state);
58643+
58644+#endif /* __BLKIF__BACKEND__COMMON_H__ */
58645diff -Nur linux-2.6.16.33-noxen/drivers/xen/blkback/interface.c linux-2.6.16.33/drivers/xen/blkback/interface.c
58646--- linux-2.6.16.33-noxen/drivers/xen/blkback/interface.c 1970-01-01 00:00:00.000000000 +0000
58647+++ linux-2.6.16.33/drivers/xen/blkback/interface.c 2007-01-08 15:00:45.000000000 +0000
58648@@ -0,0 +1,171 @@
58649+/******************************************************************************
58650+ * arch/xen/drivers/blkif/backend/interface.c
58651+ *
58652+ * Block-device interface management.
58653+ *
58654+ * Copyright (c) 2004, Keir Fraser
58655+ *
58656+ * This program is free software; you can redistribute it and/or
58657+ * modify it under the terms of the GNU General Public License version 2
58658+ * as published by the Free Software Foundation; or, when distributed
58659+ * separately from the Linux kernel or incorporated into other
58660+ * software packages, subject to the following license:
58661+ *
58662+ * Permission is hereby granted, free of charge, to any person obtaining a copy
58663+ * of this source file (the "Software"), to deal in the Software without
58664+ * restriction, including without limitation the rights to use, copy, modify,
58665+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
58666+ * and to permit persons to whom the Software is furnished to do so, subject to
58667+ * the following conditions:
58668+ *
58669+ * The above copyright notice and this permission notice shall be included in
58670+ * all copies or substantial portions of the Software.
58671+ *
58672+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
58673+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
58674+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
58675+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
58676+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
58677+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
58678+ * IN THE SOFTWARE.
58679+ */
58680+
58681+#include "common.h"
58682+#include <xen/evtchn.h>
58683+#include <linux/kthread.h>
58684+
58685+static kmem_cache_t *blkif_cachep;
58686+
58687+blkif_t *blkif_alloc(domid_t domid)
58688+{
58689+ blkif_t *blkif;
58690+
58691+ blkif = kmem_cache_alloc(blkif_cachep, GFP_KERNEL);
58692+ if (!blkif)
58693+ return ERR_PTR(-ENOMEM);
58694+
58695+ memset(blkif, 0, sizeof(*blkif));
58696+ blkif->domid = domid;
58697+ spin_lock_init(&blkif->blk_ring_lock);
58698+ atomic_set(&blkif->refcnt, 1);
58699+ init_waitqueue_head(&blkif->wq);
58700+ blkif->st_print = jiffies;
58701+ init_waitqueue_head(&blkif->waiting_to_free);
58702+
58703+ return blkif;
58704+}
58705+
58706+static int map_frontend_page(blkif_t *blkif, unsigned long shared_page)
58707+{
58708+ struct gnttab_map_grant_ref op;
58709+ int ret;
58710+
58711+ gnttab_set_map_op(&op, (unsigned long)blkif->blk_ring_area->addr,
58712+ GNTMAP_host_map, shared_page, blkif->domid);
58713+
58714+ lock_vm_area(blkif->blk_ring_area);
58715+ ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1);
58716+ unlock_vm_area(blkif->blk_ring_area);
58717+ BUG_ON(ret);
58718+
58719+ if (op.status) {
58720+ DPRINTK(" Grant table operation failure !\n");
58721+ return op.status;
58722+ }
58723+
58724+ blkif->shmem_ref = shared_page;
58725+ blkif->shmem_handle = op.handle;
58726+
58727+ return 0;
58728+}
58729+
58730+static void unmap_frontend_page(blkif_t *blkif)
58731+{
58732+ struct gnttab_unmap_grant_ref op;
58733+ int ret;
58734+
58735+ gnttab_set_unmap_op(&op, (unsigned long)blkif->blk_ring_area->addr,
58736+ GNTMAP_host_map, blkif->shmem_handle);
58737+
58738+ lock_vm_area(blkif->blk_ring_area);
58739+ ret = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1);
58740+ unlock_vm_area(blkif->blk_ring_area);
58741+ BUG_ON(ret);
58742+}
58743+
58744+int blkif_map(blkif_t *blkif, unsigned long shared_page, unsigned int evtchn)
58745+{
58746+ blkif_sring_t *sring;
58747+ int err;
58748+ struct evtchn_bind_interdomain bind_interdomain;
58749+
58750+ /* Already connected through? */
58751+ if (blkif->irq)
58752+ return 0;
58753+
58754+ if ( (blkif->blk_ring_area = alloc_vm_area(PAGE_SIZE)) == NULL )
58755+ return -ENOMEM;
58756+
58757+ err = map_frontend_page(blkif, shared_page);
58758+ if (err) {
58759+ free_vm_area(blkif->blk_ring_area);
58760+ return err;
58761+ }
58762+
58763+ bind_interdomain.remote_dom = blkif->domid;
58764+ bind_interdomain.remote_port = evtchn;
58765+
58766+ err = HYPERVISOR_event_channel_op(EVTCHNOP_bind_interdomain,
58767+ &bind_interdomain);
58768+ if (err) {
58769+ unmap_frontend_page(blkif);
58770+ free_vm_area(blkif->blk_ring_area);
58771+ return err;
58772+ }
58773+
58774+ blkif->evtchn = bind_interdomain.local_port;
58775+
58776+ sring = (blkif_sring_t *)blkif->blk_ring_area->addr;
58777+ BACK_RING_INIT(&blkif->blk_ring, sring, PAGE_SIZE);
58778+
58779+ blkif->irq = bind_evtchn_to_irqhandler(
58780+ blkif->evtchn, blkif_be_int, 0, "blkif-backend", blkif);
58781+
58782+ return 0;
58783+}
58784+
58785+void blkif_disconnect(blkif_t *blkif)
58786+{
58787+ if (blkif->xenblkd) {
58788+ kthread_stop(blkif->xenblkd);
58789+ blkif->xenblkd = NULL;
58790+ }
58791+
58792+ atomic_dec(&blkif->refcnt);
58793+ wait_event(blkif->waiting_to_free, atomic_read(&blkif->refcnt) == 0);
58794+ atomic_inc(&blkif->refcnt);
58795+
58796+ if (blkif->irq) {
58797+ unbind_from_irqhandler(blkif->irq, blkif);
58798+ blkif->irq = 0;
58799+ }
58800+
58801+ if (blkif->blk_ring.sring) {
58802+ unmap_frontend_page(blkif);
58803+ free_vm_area(blkif->blk_ring_area);
58804+ blkif->blk_ring.sring = NULL;
58805+ }
58806+}
58807+
58808+void blkif_free(blkif_t *blkif)
58809+{
58810+ if (!atomic_dec_and_test(&blkif->refcnt))
58811+ BUG();
58812+ kmem_cache_free(blkif_cachep, blkif);
58813+}
58814+
58815+void __init blkif_interface_init(void)
58816+{
58817+ blkif_cachep = kmem_cache_create("blkif_cache", sizeof(blkif_t),
58818+ 0, 0, NULL, NULL);
58819+}
58820diff -Nur linux-2.6.16.33-noxen/drivers/xen/blkback/vbd.c linux-2.6.16.33/drivers/xen/blkback/vbd.c
58821--- linux-2.6.16.33-noxen/drivers/xen/blkback/vbd.c 1970-01-01 00:00:00.000000000 +0000
58822+++ linux-2.6.16.33/drivers/xen/blkback/vbd.c 2007-01-08 15:00:45.000000000 +0000
58823@@ -0,0 +1,118 @@
58824+/******************************************************************************
58825+ * blkback/vbd.c
58826+ *
58827+ * Routines for managing virtual block devices (VBDs).
58828+ *
58829+ * Copyright (c) 2003-2005, Keir Fraser & Steve Hand
58830+ *
58831+ * This program is free software; you can redistribute it and/or
58832+ * modify it under the terms of the GNU General Public License version 2
58833+ * as published by the Free Software Foundation; or, when distributed
58834+ * separately from the Linux kernel or incorporated into other
58835+ * software packages, subject to the following license:
58836+ *
58837+ * Permission is hereby granted, free of charge, to any person obtaining a copy
58838+ * of this source file (the "Software"), to deal in the Software without
58839+ * restriction, including without limitation the rights to use, copy, modify,
58840+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
58841+ * and to permit persons to whom the Software is furnished to do so, subject to
58842+ * the following conditions:
58843+ *
58844+ * The above copyright notice and this permission notice shall be included in
58845+ * all copies or substantial portions of the Software.
58846+ *
58847+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
58848+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
58849+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
58850+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
58851+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
58852+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
58853+ * IN THE SOFTWARE.
58854+ */
58855+
58856+#include "common.h"
58857+
58858+#define vbd_sz(_v) ((_v)->bdev->bd_part ? \
58859+ (_v)->bdev->bd_part->nr_sects : (_v)->bdev->bd_disk->capacity)
58860+
58861+unsigned long long vbd_size(struct vbd *vbd)
58862+{
58863+ return vbd_sz(vbd);
58864+}
58865+
58866+unsigned int vbd_info(struct vbd *vbd)
58867+{
58868+ return vbd->type | (vbd->readonly?VDISK_READONLY:0);
58869+}
58870+
58871+unsigned long vbd_secsize(struct vbd *vbd)
58872+{
58873+ return bdev_hardsect_size(vbd->bdev);
58874+}
58875+
58876+int vbd_create(blkif_t *blkif, blkif_vdev_t handle, unsigned major,
58877+ unsigned minor, int readonly)
58878+{
58879+ struct vbd *vbd;
58880+ struct block_device *bdev;
58881+
58882+ vbd = &blkif->vbd;
58883+ vbd->handle = handle;
58884+ vbd->readonly = readonly;
58885+ vbd->type = 0;
58886+
58887+ vbd->pdevice = MKDEV(major, minor);
58888+
58889+ bdev = open_by_devnum(vbd->pdevice,
58890+ vbd->readonly ? FMODE_READ : FMODE_WRITE);
58891+
58892+ if (IS_ERR(bdev)) {
58893+ DPRINTK("vbd_creat: device %08x could not be opened.\n",
58894+ vbd->pdevice);
58895+ return -ENOENT;
58896+ }
58897+
58898+ vbd->bdev = bdev;
58899+
58900+ if (vbd->bdev->bd_disk == NULL) {
58901+ DPRINTK("vbd_creat: device %08x doesn't exist.\n",
58902+ vbd->pdevice);
58903+ vbd_free(vbd);
58904+ return -ENOENT;
58905+ }
58906+
58907+ if (vbd->bdev->bd_disk->flags & GENHD_FL_CD)
58908+ vbd->type |= VDISK_CDROM;
58909+ if (vbd->bdev->bd_disk->flags & GENHD_FL_REMOVABLE)
58910+ vbd->type |= VDISK_REMOVABLE;
58911+
58912+ DPRINTK("Successful creation of handle=%04x (dom=%u)\n",
58913+ handle, blkif->domid);
58914+ return 0;
58915+}
58916+
58917+void vbd_free(struct vbd *vbd)
58918+{
58919+ if (vbd->bdev)
58920+ blkdev_put(vbd->bdev);
58921+ vbd->bdev = NULL;
58922+}
58923+
58924+int vbd_translate(struct phys_req *req, blkif_t *blkif, int operation)
58925+{
58926+ struct vbd *vbd = &blkif->vbd;
58927+ int rc = -EACCES;
58928+
58929+ if ((operation != READ) && vbd->readonly)
58930+ goto out;
58931+
58932+ if (unlikely((req->sector_number + req->nr_sects) > vbd_sz(vbd)))
58933+ goto out;
58934+
58935+ req->dev = vbd->pdevice;
58936+ req->bdev = vbd->bdev;
58937+ rc = 0;
58938+
58939+ out:
58940+ return rc;
58941+}
58942diff -Nur linux-2.6.16.33-noxen/drivers/xen/blkback/xenbus.c linux-2.6.16.33/drivers/xen/blkback/xenbus.c
58943--- linux-2.6.16.33-noxen/drivers/xen/blkback/xenbus.c 1970-01-01 00:00:00.000000000 +0000
58944+++ linux-2.6.16.33/drivers/xen/blkback/xenbus.c 2007-01-08 15:00:45.000000000 +0000
58945@@ -0,0 +1,485 @@
58946+/* Xenbus code for blkif backend
58947+ Copyright (C) 2005 Rusty Russell <rusty@rustcorp.com.au>
58948+ Copyright (C) 2005 XenSource Ltd
58949+
58950+ This program is free software; you can redistribute it and/or modify
58951+ it under the terms of the GNU General Public License as published by
58952+ the Free Software Foundation; either version 2 of the License, or
58953+ (at your option) any later version.
58954+
58955+ This program is distributed in the hope that it will be useful,
58956+ but WITHOUT ANY WARRANTY; without even the implied warranty of
58957+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
58958+ GNU General Public License for more details.
58959+
58960+ You should have received a copy of the GNU General Public License
58961+ along with this program; if not, write to the Free Software
58962+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
58963+*/
58964+
58965+#include <stdarg.h>
58966+#include <linux/module.h>
58967+#include <linux/kthread.h>
58968+#include "common.h"
58969+
58970+#undef DPRINTK
58971+#define DPRINTK(fmt, args...) \
58972+ pr_debug("blkback/xenbus (%s:%d) " fmt ".\n", \
58973+ __FUNCTION__, __LINE__, ##args)
58974+
58975+struct backend_info
58976+{
58977+ struct xenbus_device *dev;
58978+ blkif_t *blkif;
58979+ struct xenbus_watch backend_watch;
58980+ unsigned major;
58981+ unsigned minor;
58982+ char *mode;
58983+};
58984+
58985+static void connect(struct backend_info *);
58986+static int connect_ring(struct backend_info *);
58987+static void backend_changed(struct xenbus_watch *, const char **,
58988+ unsigned int);
58989+
58990+static void update_blkif_status(blkif_t *blkif)
58991+{
58992+ int err;
58993+
58994+ /* Not ready to connect? */
58995+ if (!blkif->irq || !blkif->vbd.bdev)
58996+ return;
58997+
58998+ /* Already connected? */
58999+ if (blkif->be->dev->state == XenbusStateConnected)
59000+ return;
59001+
59002+ /* Attempt to connect: exit if we fail to. */
59003+ connect(blkif->be);
59004+ if (blkif->be->dev->state != XenbusStateConnected)
59005+ return;
59006+
59007+ blkif->xenblkd = kthread_run(blkif_schedule, blkif,
59008+ "xvd %d %02x:%02x",
59009+ blkif->domid,
59010+ blkif->be->major, blkif->be->minor);
59011+ if (IS_ERR(blkif->xenblkd)) {
59012+ err = PTR_ERR(blkif->xenblkd);
59013+ blkif->xenblkd = NULL;
59014+ xenbus_dev_error(blkif->be->dev, err, "start xenblkd");
59015+ }
59016+}
59017+
59018+
59019+/****************************************************************
59020+ * sysfs interface for VBD I/O requests
59021+ */
59022+
59023+#define VBD_SHOW(name, format, args...) \
59024+ static ssize_t show_##name(struct device *_dev, \
59025+ struct device_attribute *attr, \
59026+ char *buf) \
59027+ { \
59028+ struct xenbus_device *dev = to_xenbus_device(_dev); \
59029+ struct backend_info *be = dev->dev.driver_data; \
59030+ \
59031+ return sprintf(buf, format, ##args); \
59032+ } \
59033+ DEVICE_ATTR(name, S_IRUGO, show_##name, NULL)
59034+
59035+VBD_SHOW(oo_req, "%d\n", be->blkif->st_oo_req);
59036+VBD_SHOW(rd_req, "%d\n", be->blkif->st_rd_req);
59037+VBD_SHOW(wr_req, "%d\n", be->blkif->st_wr_req);
59038+VBD_SHOW(br_req, "%d\n", be->blkif->st_br_req);
59039+
59040+static struct attribute *vbdstat_attrs[] = {
59041+ &dev_attr_oo_req.attr,
59042+ &dev_attr_rd_req.attr,
59043+ &dev_attr_wr_req.attr,
59044+ &dev_attr_br_req.attr,
59045+ NULL
59046+};
59047+
59048+static struct attribute_group vbdstat_group = {
59049+ .name = "statistics",
59050+ .attrs = vbdstat_attrs,
59051+};
59052+
59053+VBD_SHOW(physical_device, "%x:%x\n", be->major, be->minor);
59054+VBD_SHOW(mode, "%s\n", be->mode);
59055+
59056+int xenvbd_sysfs_addif(struct xenbus_device *dev)
59057+{
59058+ int error;
59059+
59060+ error = device_create_file(&dev->dev, &dev_attr_physical_device);
59061+ if (error)
59062+ goto fail1;
59063+
59064+ error = device_create_file(&dev->dev, &dev_attr_mode);
59065+ if (error)
59066+ goto fail2;
59067+
59068+ error = sysfs_create_group(&dev->dev.kobj, &vbdstat_group);
59069+ if (error)
59070+ goto fail3;
59071+
59072+ return 0;
59073+
59074+fail3: sysfs_remove_group(&dev->dev.kobj, &vbdstat_group);
59075+fail2: device_remove_file(&dev->dev, &dev_attr_mode);
59076+fail1: device_remove_file(&dev->dev, &dev_attr_physical_device);
59077+ return error;
59078+}
59079+
59080+void xenvbd_sysfs_delif(struct xenbus_device *dev)
59081+{
59082+ sysfs_remove_group(&dev->dev.kobj, &vbdstat_group);
59083+ device_remove_file(&dev->dev, &dev_attr_mode);
59084+ device_remove_file(&dev->dev, &dev_attr_physical_device);
59085+}
59086+
59087+static int blkback_remove(struct xenbus_device *dev)
59088+{
59089+ struct backend_info *be = dev->dev.driver_data;
59090+
59091+ DPRINTK("");
59092+
59093+ if (be->backend_watch.node) {
59094+ unregister_xenbus_watch(&be->backend_watch);
59095+ kfree(be->backend_watch.node);
59096+ be->backend_watch.node = NULL;
59097+ }
59098+
59099+ if (be->blkif) {
59100+ blkif_disconnect(be->blkif);
59101+ vbd_free(&be->blkif->vbd);
59102+ blkif_free(be->blkif);
59103+ be->blkif = NULL;
59104+ }
59105+
59106+ if (be->major || be->minor)
59107+ xenvbd_sysfs_delif(dev);
59108+
59109+ kfree(be);
59110+ dev->dev.driver_data = NULL;
59111+ return 0;
59112+}
59113+
59114+int blkback_barrier(struct xenbus_transaction xbt,
59115+ struct backend_info *be, int state)
59116+{
59117+ struct xenbus_device *dev = be->dev;
59118+ int err;
59119+
59120+ err = xenbus_printf(xbt, dev->nodename, "feature-barrier",
59121+ "%d", state);
59122+ if (err)
59123+ xenbus_dev_fatal(dev, err, "writing feature-barrier");
59124+
59125+ return err;
59126+}
59127+
59128+/**
59129+ * Entry point to this code when a new device is created. Allocate the basic
59130+ * structures, and watch the store waiting for the hotplug scripts to tell us
59131+ * the device's physical major and minor numbers. Switch to InitWait.
59132+ */
59133+static int blkback_probe(struct xenbus_device *dev,
59134+ const struct xenbus_device_id *id)
59135+{
59136+ int err;
59137+ struct backend_info *be = kzalloc(sizeof(struct backend_info),
59138+ GFP_KERNEL);
59139+ if (!be) {
59140+ xenbus_dev_fatal(dev, -ENOMEM,
59141+ "allocating backend structure");
59142+ return -ENOMEM;
59143+ }
59144+ be->dev = dev;
59145+ dev->dev.driver_data = be;
59146+
59147+ be->blkif = blkif_alloc(dev->otherend_id);
59148+ if (IS_ERR(be->blkif)) {
59149+ err = PTR_ERR(be->blkif);
59150+ be->blkif = NULL;
59151+ xenbus_dev_fatal(dev, err, "creating block interface");
59152+ goto fail;
59153+ }
59154+
59155+ /* setup back pointer */
59156+ be->blkif->be = be;
59157+
59158+ err = xenbus_watch_path2(dev, dev->nodename, "physical-device",
59159+ &be->backend_watch, backend_changed);
59160+ if (err)
59161+ goto fail;
59162+
59163+ err = xenbus_switch_state(dev, XenbusStateInitWait);
59164+ if (err)
59165+ goto fail;
59166+
59167+ return 0;
59168+
59169+fail:
59170+ DPRINTK("failed");
59171+ blkback_remove(dev);
59172+ return err;
59173+}
59174+
59175+
59176+/**
59177+ * Callback received when the hotplug scripts have placed the physical-device
59178+ * node. Read it and the mode node, and create a vbd. If the frontend is
59179+ * ready, connect.
59180+ */
59181+static void backend_changed(struct xenbus_watch *watch,
59182+ const char **vec, unsigned int len)
59183+{
59184+ int err;
59185+ unsigned major;
59186+ unsigned minor;
59187+ struct backend_info *be
59188+ = container_of(watch, struct backend_info, backend_watch);
59189+ struct xenbus_device *dev = be->dev;
59190+
59191+ DPRINTK("");
59192+
59193+ err = xenbus_scanf(XBT_NIL, dev->nodename, "physical-device", "%x:%x",
59194+ &major, &minor);
59195+ if (XENBUS_EXIST_ERR(err)) {
59196+ /* Since this watch will fire once immediately after it is
59197+ registered, we expect this. Ignore it, and wait for the
59198+ hotplug scripts. */
59199+ return;
59200+ }
59201+ if (err != 2) {
59202+ xenbus_dev_fatal(dev, err, "reading physical-device");
59203+ return;
59204+ }
59205+
59206+ if ((be->major || be->minor) &&
59207+ ((be->major != major) || (be->minor != minor))) {
59208+ printk(KERN_WARNING
59209+ "blkback: changing physical device (from %x:%x to "
59210+ "%x:%x) not supported.\n", be->major, be->minor,
59211+ major, minor);
59212+ return;
59213+ }
59214+
59215+ be->mode = xenbus_read(XBT_NIL, dev->nodename, "mode", NULL);
59216+ if (IS_ERR(be->mode)) {
59217+ err = PTR_ERR(be->mode);
59218+ be->mode = NULL;
59219+ xenbus_dev_fatal(dev, err, "reading mode");
59220+ return;
59221+ }
59222+
59223+ if (be->major == 0 && be->minor == 0) {
59224+ /* Front end dir is a number, which is used as the handle. */
59225+
59226+ char *p = strrchr(dev->otherend, '/') + 1;
59227+ long handle = simple_strtoul(p, NULL, 0);
59228+
59229+ be->major = major;
59230+ be->minor = minor;
59231+
59232+ err = vbd_create(be->blkif, handle, major, minor,
59233+ (NULL == strchr(be->mode, 'w')));
59234+ if (err) {
59235+ be->major = be->minor = 0;
59236+ xenbus_dev_fatal(dev, err, "creating vbd structure");
59237+ return;
59238+ }
59239+
59240+ err = xenvbd_sysfs_addif(dev);
59241+ if (err) {
59242+ vbd_free(&be->blkif->vbd);
59243+ be->major = be->minor = 0;
59244+ xenbus_dev_fatal(dev, err, "creating sysfs entries");
59245+ return;
59246+ }
59247+
59248+ /* We're potentially connected now */
59249+ update_blkif_status(be->blkif);
59250+ }
59251+}
59252+
59253+
59254+/**
59255+ * Callback received when the frontend's state changes.
59256+ */
59257+static void frontend_changed(struct xenbus_device *dev,
59258+ enum xenbus_state frontend_state)
59259+{
59260+ struct backend_info *be = dev->dev.driver_data;
59261+ int err;
59262+
59263+ DPRINTK("%s", xenbus_strstate(frontend_state));
59264+
59265+ switch (frontend_state) {
59266+ case XenbusStateInitialising:
59267+ if (dev->state == XenbusStateClosed) {
59268+ printk("%s: %s: prepare for reconnect\n",
59269+ __FUNCTION__, dev->nodename);
59270+ xenbus_switch_state(dev, XenbusStateInitWait);
59271+ }
59272+ break;
59273+
59274+ case XenbusStateInitialised:
59275+ case XenbusStateConnected:
59276+ /* Ensure we connect even when two watches fire in
59277+		   close succession and we miss the intermediate value
59278+ of frontend_state. */
59279+ if (dev->state == XenbusStateConnected)
59280+ break;
59281+
59282+ err = connect_ring(be);
59283+ if (err)
59284+ break;
59285+ update_blkif_status(be->blkif);
59286+ break;
59287+
59288+ case XenbusStateClosing:
59289+ blkif_disconnect(be->blkif);
59290+ xenbus_switch_state(dev, XenbusStateClosing);
59291+ break;
59292+
59293+ case XenbusStateClosed:
59294+ xenbus_switch_state(dev, XenbusStateClosed);
59295+ if (xenbus_dev_is_online(dev))
59296+ break;
59297+ /* fall through if not online */
59298+ case XenbusStateUnknown:
59299+ device_unregister(&dev->dev);
59300+ break;
59301+
59302+ default:
59303+ xenbus_dev_fatal(dev, -EINVAL, "saw state %d at frontend",
59304+ frontend_state);
59305+ break;
59306+ }
59307+}
59308+
59309+
59310+/* ** Connection ** */
59311+
59312+
59313+/**
59314+ * Write the physical details regarding the block device to the store, and
59315+ * switch to Connected state.
59316+ */
59317+static void connect(struct backend_info *be)
59318+{
59319+ struct xenbus_transaction xbt;
59320+ int err;
59321+ struct xenbus_device *dev = be->dev;
59322+
59323+ DPRINTK("%s", dev->otherend);
59324+
59325+ /* Supply the information about the device the frontend needs */
59326+again:
59327+ err = xenbus_transaction_start(&xbt);
59328+ if (err) {
59329+ xenbus_dev_fatal(dev, err, "starting transaction");
59330+ return;
59331+ }
59332+
59333+ err = blkback_barrier(xbt, be, 1);
59334+ if (err)
59335+ goto abort;
59336+
59337+ err = xenbus_printf(xbt, dev->nodename, "sectors", "%llu",
59338+ vbd_size(&be->blkif->vbd));
59339+ if (err) {
59340+ xenbus_dev_fatal(dev, err, "writing %s/sectors",
59341+ dev->nodename);
59342+ goto abort;
59343+ }
59344+
59345+ /* FIXME: use a typename instead */
59346+ err = xenbus_printf(xbt, dev->nodename, "info", "%u",
59347+ vbd_info(&be->blkif->vbd));
59348+ if (err) {
59349+ xenbus_dev_fatal(dev, err, "writing %s/info",
59350+ dev->nodename);
59351+ goto abort;
59352+ }
59353+ err = xenbus_printf(xbt, dev->nodename, "sector-size", "%lu",
59354+ vbd_secsize(&be->blkif->vbd));
59355+ if (err) {
59356+ xenbus_dev_fatal(dev, err, "writing %s/sector-size",
59357+ dev->nodename);
59358+ goto abort;
59359+ }
59360+
59361+ err = xenbus_transaction_end(xbt, 0);
59362+ if (err == -EAGAIN)
59363+ goto again;
59364+ if (err)
59365+ xenbus_dev_fatal(dev, err, "ending transaction");
59366+
59367+ err = xenbus_switch_state(dev, XenbusStateConnected);
59368+ if (err)
59369+ xenbus_dev_fatal(dev, err, "switching to Connected state",
59370+ dev->nodename);
59371+
59372+ return;
59373+ abort:
59374+ xenbus_transaction_end(xbt, 1);
59375+}
59376+
59377+
59378+static int connect_ring(struct backend_info *be)
59379+{
59380+ struct xenbus_device *dev = be->dev;
59381+ unsigned long ring_ref;
59382+ unsigned int evtchn;
59383+ int err;
59384+
59385+ DPRINTK("%s", dev->otherend);
59386+
59387+ err = xenbus_gather(XBT_NIL, dev->otherend, "ring-ref", "%lu", &ring_ref,
59388+ "event-channel", "%u", &evtchn, NULL);
59389+ if (err) {
59390+ xenbus_dev_fatal(dev, err,
59391+ "reading %s/ring-ref and event-channel",
59392+ dev->otherend);
59393+ return err;
59394+ }
59395+
59396+ /* Map the shared frame, irq etc. */
59397+ err = blkif_map(be->blkif, ring_ref, evtchn);
59398+ if (err) {
59399+ xenbus_dev_fatal(dev, err, "mapping ring-ref %lu port %u",
59400+ ring_ref, evtchn);
59401+ return err;
59402+ }
59403+
59404+ return 0;
59405+}
59406+
59407+
59408+/* ** Driver Registration ** */
59409+
59410+
59411+static struct xenbus_device_id blkback_ids[] = {
59412+ { "vbd" },
59413+ { "" }
59414+};
59415+
59416+
59417+static struct xenbus_driver blkback = {
59418+ .name = "vbd",
59419+ .owner = THIS_MODULE,
59420+ .ids = blkback_ids,
59421+ .probe = blkback_probe,
59422+ .remove = blkback_remove,
59423+ .otherend_changed = frontend_changed
59424+};
59425+
59426+
59427+void blkif_xenbus_init(void)
59428+{
59429+ xenbus_register_backend(&blkback);
59430+}
59431diff -Nur linux-2.6.16.33-noxen/drivers/xen/blkfront/Makefile linux-2.6.16.33/drivers/xen/blkfront/Makefile
59432--- linux-2.6.16.33-noxen/drivers/xen/blkfront/Makefile 1970-01-01 00:00:00.000000000 +0000
59433+++ linux-2.6.16.33/drivers/xen/blkfront/Makefile 2007-01-08 15:00:45.000000000 +0000
59434@@ -0,0 +1,5 @@
59435+
59436+obj-$(CONFIG_XEN_BLKDEV_FRONTEND) := xenblk.o
59437+
59438+xenblk-objs := blkfront.o vbd.o
59439+
59440diff -Nur linux-2.6.16.33-noxen/drivers/xen/blkfront/blkfront.c linux-2.6.16.33/drivers/xen/blkfront/blkfront.c
59441--- linux-2.6.16.33-noxen/drivers/xen/blkfront/blkfront.c 1970-01-01 00:00:00.000000000 +0000
59442+++ linux-2.6.16.33/drivers/xen/blkfront/blkfront.c 2007-01-08 15:00:45.000000000 +0000
59443@@ -0,0 +1,891 @@
59444+/******************************************************************************
59445+ * blkfront.c
59446+ *
59447+ * XenLinux virtual block-device driver.
59448+ *
59449+ * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
59450+ * Modifications by Mark A. Williamson are (c) Intel Research Cambridge
59451+ * Copyright (c) 2004, Christian Limpach
59452+ * Copyright (c) 2004, Andrew Warfield
59453+ * Copyright (c) 2005, Christopher Clark
59454+ * Copyright (c) 2005, XenSource Ltd
59455+ *
59456+ * This program is free software; you can redistribute it and/or
59457+ * modify it under the terms of the GNU General Public License version 2
59458+ * as published by the Free Software Foundation; or, when distributed
59459+ * separately from the Linux kernel or incorporated into other
59460+ * software packages, subject to the following license:
59461+ *
59462+ * Permission is hereby granted, free of charge, to any person obtaining a copy
59463+ * of this source file (the "Software"), to deal in the Software without
59464+ * restriction, including without limitation the rights to use, copy, modify,
59465+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
59466+ * and to permit persons to whom the Software is furnished to do so, subject to
59467+ * the following conditions:
59468+ *
59469+ * The above copyright notice and this permission notice shall be included in
59470+ * all copies or substantial portions of the Software.
59471+ *
59472+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
59473+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
59474+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
59475+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
59476+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
59477+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
59478+ * IN THE SOFTWARE.
59479+ */
59480+
59481+#include <linux/version.h>
59482+#include "block.h"
59483+#include <linux/cdrom.h>
59484+#include <linux/sched.h>
59485+#include <linux/interrupt.h>
59486+#include <scsi/scsi.h>
59487+#include <xen/evtchn.h>
59488+#include <xen/xenbus.h>
59489+#include <xen/interface/grant_table.h>
59490+#include <xen/gnttab.h>
59491+#include <asm/hypervisor.h>
59492+#include <asm/maddr.h>
59493+
59494+#ifdef HAVE_XEN_PLATFORM_COMPAT_H
59495+#include <xen/platform-compat.h>
59496+#endif
59497+
59498+#define BLKIF_STATE_DISCONNECTED 0
59499+#define BLKIF_STATE_CONNECTED 1
59500+#define BLKIF_STATE_SUSPENDED 2
59501+
59502+#define MAXIMUM_OUTSTANDING_BLOCK_REQS \
59503+ (BLKIF_MAX_SEGMENTS_PER_REQUEST * BLK_RING_SIZE)
59504+#define GRANT_INVALID_REF 0
59505+
59506+static void connect(struct blkfront_info *);
59507+static void blkfront_closing(struct xenbus_device *);
59508+static int blkfront_remove(struct xenbus_device *);
59509+static int talk_to_backend(struct xenbus_device *, struct blkfront_info *);
59510+static int setup_blkring(struct xenbus_device *, struct blkfront_info *);
59511+
59512+static void kick_pending_request_queues(struct blkfront_info *);
59513+
59514+static irqreturn_t blkif_int(int irq, void *dev_id, struct pt_regs *ptregs);
59515+static void blkif_restart_queue(void *arg);
59516+static void blkif_recover(struct blkfront_info *);
59517+static void blkif_completion(struct blk_shadow *);
59518+static void blkif_free(struct blkfront_info *, int);
59519+
59520+
59521+/**
59522+ * Entry point to this code when a new device is created. Allocate the basic
59523+ * structures and the ring buffer for communication with the backend, and
59524+ * inform the backend of the appropriate details for those. Switch to
59525+ * Initialised state.
59526+ */
59527+static int blkfront_probe(struct xenbus_device *dev,
59528+ const struct xenbus_device_id *id)
59529+{
59530+ int err, vdevice, i;
59531+ struct blkfront_info *info;
59532+
59533+ /* FIXME: Use dynamic device id if this is not set. */
59534+ err = xenbus_scanf(XBT_NIL, dev->nodename,
59535+ "virtual-device", "%i", &vdevice);
59536+ if (err != 1) {
59537+ xenbus_dev_fatal(dev, err, "reading virtual-device");
59538+ return err;
59539+ }
59540+
59541+ info = kzalloc(sizeof(*info), GFP_KERNEL);
59542+ if (!info) {
59543+ xenbus_dev_fatal(dev, -ENOMEM, "allocating info structure");
59544+ return -ENOMEM;
59545+ }
59546+
59547+ info->xbdev = dev;
59548+ info->vdevice = vdevice;
59549+ info->connected = BLKIF_STATE_DISCONNECTED;
59550+ INIT_WORK(&info->work, blkif_restart_queue, (void *)info);
59551+
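+	/* Thread the shadow entries into a free list: each unused entry's req.id names the next free slot, and the last entry holds a sentinel. */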
59552+ for (i = 0; i < BLK_RING_SIZE; i++)
59553+ info->shadow[i].req.id = i+1;
59554+ info->shadow[BLK_RING_SIZE-1].req.id = 0x0fffffff;
59555+
59556+ /* Front end dir is a number, which is used as the id. */
59557+ info->handle = simple_strtoul(strrchr(dev->nodename,'/')+1, NULL, 0);
59558+ dev->dev.driver_data = info;
59559+
59560+ err = talk_to_backend(dev, info);
59561+ if (err) {
59562+ kfree(info);
59563+ dev->dev.driver_data = NULL;
59564+ return err;
59565+ }
59566+
59567+ return 0;
59568+}
59569+
59570+
59571+/**
59572+ * We are reconnecting to the backend, due to a suspend/resume, or a backend
59573+ * driver restart. We tear down our blkif structure and recreate it, but
59574+ * leave the device-layer structures intact so that this is transparent to the
59575+ * rest of the kernel.
59576+ */
59577+static int blkfront_resume(struct xenbus_device *dev)
59578+{
59579+ struct blkfront_info *info = dev->dev.driver_data;
59580+ int err;
59581+
59582+ DPRINTK("blkfront_resume: %s\n", dev->nodename);
59583+
59584+ blkif_free(info, info->connected == BLKIF_STATE_CONNECTED);
59585+
59586+ err = talk_to_backend(dev, info);
59587+ if (info->connected == BLKIF_STATE_SUSPENDED && !err)
59588+ blkif_recover(info);
59589+
59590+ return err;
59591+}
59592+
59593+
59594+/* Common code used when first setting up, and when resuming. */
59595+static int talk_to_backend(struct xenbus_device *dev,
59596+ struct blkfront_info *info)
59597+{
59598+ const char *message = NULL;
59599+ struct xenbus_transaction xbt;
59600+ int err;
59601+
59602+ /* Create shared ring, alloc event channel. */
59603+ err = setup_blkring(dev, info);
59604+ if (err)
59605+ goto out;
59606+
59607+again:
59608+ err = xenbus_transaction_start(&xbt);
59609+ if (err) {
59610+ xenbus_dev_fatal(dev, err, "starting transaction");
59611+ goto destroy_blkring;
59612+ }
59613+
59614+ err = xenbus_printf(xbt, dev->nodename,
59615+ "ring-ref","%u", info->ring_ref);
59616+ if (err) {
59617+ message = "writing ring-ref";
59618+ goto abort_transaction;
59619+ }
59620+ err = xenbus_printf(xbt, dev->nodename,
59621+ "event-channel", "%u", info->evtchn);
59622+ if (err) {
59623+ message = "writing event-channel";
59624+ goto abort_transaction;
59625+ }
59626+
59627+ err = xenbus_transaction_end(xbt, 0);
59628+ if (err) {
59629+ if (err == -EAGAIN)
59630+ goto again;
59631+ xenbus_dev_fatal(dev, err, "completing transaction");
59632+ goto destroy_blkring;
59633+ }
59634+
59635+ xenbus_switch_state(dev, XenbusStateInitialised);
59636+
59637+ return 0;
59638+
59639+ abort_transaction:
59640+ xenbus_transaction_end(xbt, 1);
59641+ if (message)
59642+ xenbus_dev_fatal(dev, err, "%s", message);
59643+ destroy_blkring:
59644+ blkif_free(info, 0);
59645+ out:
59646+ return err;
59647+}
59648+
59649+
59650+static int setup_blkring(struct xenbus_device *dev,
59651+ struct blkfront_info *info)
59652+{
59653+ blkif_sring_t *sring;
59654+ int err;
59655+
59656+ info->ring_ref = GRANT_INVALID_REF;
59657+
59658+ sring = (blkif_sring_t *)__get_free_page(GFP_KERNEL);
59659+ if (!sring) {
59660+ xenbus_dev_fatal(dev, -ENOMEM, "allocating shared ring");
59661+ return -ENOMEM;
59662+ }
59663+ SHARED_RING_INIT(sring);
59664+ FRONT_RING_INIT(&info->ring, sring, PAGE_SIZE);
59665+
59666+ err = xenbus_grant_ring(dev, virt_to_mfn(info->ring.sring));
59667+ if (err < 0) {
59668+ free_page((unsigned long)sring);
59669+ info->ring.sring = NULL;
59670+ goto fail;
59671+ }
59672+ info->ring_ref = err;
59673+
59674+ err = xenbus_alloc_evtchn(dev, &info->evtchn);
59675+ if (err)
59676+ goto fail;
59677+
59678+ err = bind_evtchn_to_irqhandler(
59679+ info->evtchn, blkif_int, SA_SAMPLE_RANDOM, "blkif", info);
59680+ if (err <= 0) {
59681+ xenbus_dev_fatal(dev, err,
59682+ "bind_evtchn_to_irqhandler failed");
59683+ goto fail;
59684+ }
59685+ info->irq = err;
59686+
59687+ return 0;
59688+fail:
59689+ blkif_free(info, 0);
59690+ return err;
59691+}
59692+
59693+
59694+/**
59695+ * Callback received when the backend's state changes.
59696+ */
59697+static void backend_changed(struct xenbus_device *dev,
59698+ enum xenbus_state backend_state)
59699+{
59700+ struct blkfront_info *info = dev->dev.driver_data;
59701+ struct block_device *bd;
59702+
59703+ DPRINTK("blkfront:backend_changed.\n");
59704+
59705+ switch (backend_state) {
59706+ case XenbusStateInitialising:
59707+ case XenbusStateInitWait:
59708+ case XenbusStateInitialised:
59709+ case XenbusStateUnknown:
59710+ case XenbusStateClosed:
59711+ break;
59712+
59713+ case XenbusStateConnected:
59714+ connect(info);
59715+ break;
59716+
59717+ case XenbusStateClosing:
59718+ bd = bdget(info->dev);
59719+ if (bd == NULL)
59720+ xenbus_dev_fatal(dev, -ENODEV, "bdget failed");
59721+
59722+ down(&bd->bd_sem);
59723+ if (info->users > 0)
59724+ xenbus_dev_error(dev, -EBUSY,
59725+ "Device in use; refusing to close");
59726+ else
59727+ blkfront_closing(dev);
59728+ up(&bd->bd_sem);
59729+ bdput(bd);
59730+ break;
59731+ }
59732+}
59733+
59734+
59735+/* ** Connection ** */
59736+
59737+
59738+/*
59739+ * Invoked when the backend is finally 'ready' (and has produced the
59740+ * details about the physical device - #sectors, size, etc).
59741+ */
59742+static void connect(struct blkfront_info *info)
59743+{
59744+ unsigned long long sectors;
59745+ unsigned long sector_size;
59746+ unsigned int binfo;
59747+ int err;
59748+
59749+ if ((info->connected == BLKIF_STATE_CONNECTED) ||
59750+ (info->connected == BLKIF_STATE_SUSPENDED) )
59751+ return;
59752+
59753+ DPRINTK("blkfront.c:connect:%s.\n", info->xbdev->otherend);
59754+
59755+ err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
59756+ "sectors", "%Lu", &sectors,
59757+ "info", "%u", &binfo,
59758+ "sector-size", "%lu", &sector_size,
59759+ NULL);
59760+ if (err) {
59761+ xenbus_dev_fatal(info->xbdev, err,
59762+ "reading backend fields at %s",
59763+ info->xbdev->otherend);
59764+ return;
59765+ }
59766+
59767+ err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
59768+ "feature-barrier", "%lu", &info->feature_barrier,
59769+ NULL);
59770+ if (err)
59771+ info->feature_barrier = 0;
59772+
59773+ err = xlvbd_add(sectors, info->vdevice, binfo, sector_size, info);
59774+ if (err) {
59775+ xenbus_dev_fatal(info->xbdev, err, "xlvbd_add at %s",
59776+ info->xbdev->otherend);
59777+ return;
59778+ }
59779+
59780+ (void)xenbus_switch_state(info->xbdev, XenbusStateConnected);
59781+
59782+ /* Kick pending requests. */
59783+ spin_lock_irq(&blkif_io_lock);
59784+ info->connected = BLKIF_STATE_CONNECTED;
59785+ kick_pending_request_queues(info);
59786+ spin_unlock_irq(&blkif_io_lock);
59787+
59788+ add_disk(info->gd);
59789+}
59790+
59791+/**
59792+ * Handle the change of state of the backend to Closing. We must delete our
59793+ * device-layer structures now, to ensure that writes are flushed through to
59794+ * the backend. Once this is done, we can switch to Closed in
59795+ * acknowledgement.
59796+ */
59797+static void blkfront_closing(struct xenbus_device *dev)
59798+{
59799+ struct blkfront_info *info = dev->dev.driver_data;
59800+ unsigned long flags;
59801+
59802+ DPRINTK("blkfront_closing: %s removed\n", dev->nodename);
59803+
59804+ if (info->rq == NULL)
59805+ goto out;
59806+
59807+ spin_lock_irqsave(&blkif_io_lock, flags);
59808+ /* No more blkif_request(). */
59809+ blk_stop_queue(info->rq);
59810+ /* No more gnttab callback work. */
59811+ gnttab_cancel_free_callback(&info->callback);
59812+ spin_unlock_irqrestore(&blkif_io_lock, flags);
59813+
59814+ /* Flush gnttab callback work. Must be done with no locks held. */
59815+ flush_scheduled_work();
59816+
59817+ xlvbd_del(info);
59818+
59819+ out:
59820+ xenbus_frontend_closed(dev);
59821+}
59822+
59823+
59824+static int blkfront_remove(struct xenbus_device *dev)
59825+{
59826+ struct blkfront_info *info = dev->dev.driver_data;
59827+
59828+ DPRINTK("blkfront_remove: %s removed\n", dev->nodename);
59829+
59830+ blkif_free(info, 0);
59831+
59832+ kfree(info);
59833+
59834+ return 0;
59835+}
59836+
59837+
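+/* Shadow free-list helpers: shadow_free indexes the head slot and each free entry's req.id links to the next one. */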
59838+static inline int GET_ID_FROM_FREELIST(
59839+ struct blkfront_info *info)
59840+{
59841+ unsigned long free = info->shadow_free;
59842+ BUG_ON(free > BLK_RING_SIZE);
59843+ info->shadow_free = info->shadow[free].req.id;
59844+ info->shadow[free].req.id = 0x0fffffee; /* debug */
59845+ return free;
59846+}
59847+
59848+static inline void ADD_ID_TO_FREELIST(
59849+ struct blkfront_info *info, unsigned long id)
59850+{
59851+ info->shadow[id].req.id = info->shadow_free;
59852+ info->shadow[id].request = 0;
59853+ info->shadow_free = id;
59854+}
59855+
59856+static inline void flush_requests(struct blkfront_info *info)
59857+{
59858+ int notify;
59859+
59860+ RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&info->ring, notify);
59861+
59862+ if (notify)
59863+ notify_remote_via_irq(info->irq);
59864+}
59865+
59866+static void kick_pending_request_queues(struct blkfront_info *info)
59867+{
59868+ if (!RING_FULL(&info->ring)) {
59869+ /* Re-enable calldowns. */
59870+ blk_start_queue(info->rq);
59871+ /* Kick things off immediately. */
59872+ do_blkif_request(info->rq);
59873+ }
59874+}
59875+
59876+static void blkif_restart_queue(void *arg)
59877+{
59878+ struct blkfront_info *info = (struct blkfront_info *)arg;
59879+ spin_lock_irq(&blkif_io_lock);
59880+ if (info->connected == BLKIF_STATE_CONNECTED)
59881+ kick_pending_request_queues(info);
59882+ spin_unlock_irq(&blkif_io_lock);
59883+}
59884+
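+/* Registered with gnttab_request_free_callback() when grant references run out; it defers the queue restart to the work queue. */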
59885+static void blkif_restart_queue_callback(void *arg)
59886+{
59887+ struct blkfront_info *info = (struct blkfront_info *)arg;
59888+ schedule_work(&info->work);
59889+}
59890+
59891+int blkif_open(struct inode *inode, struct file *filep)
59892+{
59893+ struct blkfront_info *info = inode->i_bdev->bd_disk->private_data;
59894+ info->users++;
59895+ return 0;
59896+}
59897+
59898+
59899+int blkif_release(struct inode *inode, struct file *filep)
59900+{
59901+ struct blkfront_info *info = inode->i_bdev->bd_disk->private_data;
59902+ info->users--;
59903+ if (info->users == 0) {
59904+ /* Check whether we have been instructed to close. We will
59905+ have ignored this request initially, as the device was
59906+ still mounted. */
59907+ struct xenbus_device * dev = info->xbdev;
59908+ enum xenbus_state state = xenbus_read_driver_state(dev->otherend);
59909+
59910+ if (state == XenbusStateClosing)
59911+ blkfront_closing(dev);
59912+ }
59913+ return 0;
59914+}
59915+
59916+
59917+int blkif_ioctl(struct inode *inode, struct file *filep,
59918+ unsigned command, unsigned long argument)
59919+{
59920+ int i;
59921+
59922+ DPRINTK_IOCTL("command: 0x%x, argument: 0x%lx, dev: 0x%04x\n",
59923+ command, (long)argument, inode->i_rdev);
59924+
59925+ switch (command) {
59926+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,16)
59927+ case HDIO_GETGEO: {
59928+ struct block_device *bd = inode->i_bdev;
59929+ struct hd_geometry geo;
59930+ int ret;
59931+
59932+ if (!argument)
59933+ return -EINVAL;
59934+
59935+ geo.start = get_start_sect(bd);
59936+ ret = blkif_getgeo(bd, &geo);
59937+ if (ret)
59938+ return ret;
59939+
59940+ if (copy_to_user((struct hd_geometry __user *)argument, &geo,
59941+ sizeof(geo)))
59942+ return -EFAULT;
59943+
59944+ return 0;
59945+ }
59946+#endif
59947+ case CDROMMULTISESSION:
59948+ DPRINTK("FIXME: support multisession CDs later\n");
59949+ for (i = 0; i < sizeof(struct cdrom_multisession); i++)
59950+ if (put_user(0, (char __user *)(argument + i)))
59951+ return -EFAULT;
59952+ return 0;
59953+
59954+ default:
59955+ /*printk(KERN_ALERT "ioctl %08x not supported by Xen blkdev\n",
59956+ command);*/
59957+ return -EINVAL; /* same return as native Linux */
59958+ }
59959+
59960+ return 0;
59961+}
59962+
59963+
59964+int blkif_getgeo(struct block_device *bd, struct hd_geometry *hg)
59965+{
59966+ /* We don't have real geometry info, but let's at least return
59967+ values consistent with the size of the device */
59968+ sector_t nsect = get_capacity(bd->bd_disk);
59969+ sector_t cylinders = nsect;
59970+
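+	/* Fake a 255-head, 63-sector geometry and derive the cylinder count from the capacity, clamping to 0xffff for very large disks. */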
59971+ hg->heads = 0xff;
59972+ hg->sectors = 0x3f;
59973+ sector_div(cylinders, hg->heads * hg->sectors);
59974+ hg->cylinders = cylinders;
59975+ if ((sector_t)(hg->cylinders + 1) * hg->heads * hg->sectors < nsect)
59976+ hg->cylinders = 0xffff;
59977+ return 0;
59978+}
59979+
59980+
59981+/*
59982+ * blkif_queue_request
59983+ *
59984+ * request block io
59985+ *
59986+ * id: for guest use only.
59987+ * operation: BLKIF_OP_{READ,WRITE,PROBE}
59988+ * buffer: buffer to read/write into. this should be a
59989+ * virtual address in the guest os.
59990+ */
59991+static int blkif_queue_request(struct request *req)
59992+{
59993+ struct blkfront_info *info = req->rq_disk->private_data;
59994+ unsigned long buffer_mfn;
59995+ blkif_request_t *ring_req;
59996+ struct bio *bio;
59997+ struct bio_vec *bvec;
59998+ int idx;
59999+ unsigned long id;
60000+ unsigned int fsect, lsect;
60001+ int ref;
60002+ grant_ref_t gref_head;
60003+
60004+ if (unlikely(info->connected != BLKIF_STATE_CONNECTED))
60005+ return 1;
60006+
60007+ if (gnttab_alloc_grant_references(
60008+ BLKIF_MAX_SEGMENTS_PER_REQUEST, &gref_head) < 0) {
60009+ gnttab_request_free_callback(
60010+ &info->callback,
60011+ blkif_restart_queue_callback,
60012+ info,
60013+ BLKIF_MAX_SEGMENTS_PER_REQUEST);
60014+ return 1;
60015+ }
60016+
60017+ /* Fill out a communications ring structure. */
60018+ ring_req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt);
60019+ id = GET_ID_FROM_FREELIST(info);
60020+ info->shadow[id].request = (unsigned long)req;
60021+
60022+ ring_req->id = id;
60023+ ring_req->sector_number = (blkif_sector_t)req->sector;
60024+ ring_req->handle = info->handle;
60025+
60026+ ring_req->operation = rq_data_dir(req) ?
60027+ BLKIF_OP_WRITE : BLKIF_OP_READ;
60028+ if (blk_barrier_rq(req))
60029+ ring_req->operation = BLKIF_OP_WRITE_BARRIER;
60030+
60031+ ring_req->nr_segments = 0;
60032+ rq_for_each_bio (bio, req) {
60033+ bio_for_each_segment (bvec, bio, idx) {
60034+ BUG_ON(ring_req->nr_segments
60035+ == BLKIF_MAX_SEGMENTS_PER_REQUEST);
60036+ buffer_mfn = page_to_phys(bvec->bv_page) >> PAGE_SHIFT;
60037+ fsect = bvec->bv_offset >> 9;
60038+ lsect = fsect + (bvec->bv_len >> 9) - 1;
60039+ /* install a grant reference. */
60040+ ref = gnttab_claim_grant_reference(&gref_head);
60041+ BUG_ON(ref == -ENOSPC);
60042+
60043+ gnttab_grant_foreign_access_ref(
60044+ ref,
60045+ info->xbdev->otherend_id,
60046+ buffer_mfn,
60047+ rq_data_dir(req) );
60048+
60049+ info->shadow[id].frame[ring_req->nr_segments] =
60050+ mfn_to_pfn(buffer_mfn);
60051+
60052+ ring_req->seg[ring_req->nr_segments] =
60053+ (struct blkif_request_segment) {
60054+ .gref = ref,
60055+ .first_sect = fsect,
60056+ .last_sect = lsect };
60057+
60058+ ring_req->nr_segments++;
60059+ }
60060+ }
60061+
60062+ info->ring.req_prod_pvt++;
60063+
60064+ /* Keep a private copy so we can reissue requests when recovering. */
60065+ info->shadow[id].req = *ring_req;
60066+
60067+ gnttab_free_grant_references(gref_head);
60068+
60069+ return 0;
60070+}
60071+
60072+/*
60073+ * do_blkif_request
60074+ * read a block; request is in a request queue
60075+ */
60076+void do_blkif_request(request_queue_t *rq)
60077+{
60078+ struct blkfront_info *info = NULL;
60079+ struct request *req;
60080+ int queued;
60081+
60082+ DPRINTK("Entered do_blkif_request\n");
60083+
60084+ queued = 0;
60085+
60086+ while ((req = elv_next_request(rq)) != NULL) {
60087+ info = req->rq_disk->private_data;
60088+ if (!blk_fs_request(req)) {
60089+ end_request(req, 0);
60090+ continue;
60091+ }
60092+
60093+ if (RING_FULL(&info->ring))
60094+ goto wait;
60095+
60096+ DPRINTK("do_blk_req %p: cmd %p, sec %lx, "
60097+ "(%u/%li) buffer:%p [%s]\n",
60098+ req, req->cmd, req->sector, req->current_nr_sectors,
60099+ req->nr_sectors, req->buffer,
60100+ rq_data_dir(req) ? "write" : "read");
60101+
60102+
60103+ blkdev_dequeue_request(req);
60104+ if (blkif_queue_request(req)) {
60105+ blk_requeue_request(rq, req);
60106+ wait:
60107+ /* Avoid pointless unplugs. */
60108+ blk_stop_queue(rq);
60109+ break;
60110+ }
60111+
60112+ queued++;
60113+ }
60114+
60115+ if (queued != 0)
60116+ flush_requests(info);
60117+}
60118+
60119+
60120+static irqreturn_t blkif_int(int irq, void *dev_id, struct pt_regs *ptregs)
60121+{
60122+ struct request *req;
60123+ blkif_response_t *bret;
60124+ RING_IDX i, rp;
60125+ unsigned long flags;
60126+ struct blkfront_info *info = (struct blkfront_info *)dev_id;
60127+ int uptodate;
60128+
60129+ spin_lock_irqsave(&blkif_io_lock, flags);
60130+
60131+ if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) {
60132+ spin_unlock_irqrestore(&blkif_io_lock, flags);
60133+ return IRQ_HANDLED;
60134+ }
60135+
60136+ again:
60137+ rp = info->ring.sring->rsp_prod;
60138+ rmb(); /* Ensure we see queued responses up to 'rp'. */
60139+
60140+ for (i = info->ring.rsp_cons; i != rp; i++) {
60141+ unsigned long id;
60142+ int ret;
60143+
60144+ bret = RING_GET_RESPONSE(&info->ring, i);
60145+ id = bret->id;
60146+ req = (struct request *)info->shadow[id].request;
60147+
60148+ blkif_completion(&info->shadow[id]);
60149+
60150+ ADD_ID_TO_FREELIST(info, id);
60151+
60152+ uptodate = (bret->status == BLKIF_RSP_OKAY);
60153+ switch (bret->operation) {
60154+ case BLKIF_OP_WRITE_BARRIER:
60155+ if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) {
60156+ printk("blkfront: %s: write barrier op failed\n",
60157+ info->gd->disk_name);
60158+ uptodate = -EOPNOTSUPP;
60159+ info->feature_barrier = 0;
60160+ xlvbd_barrier(info);
60161+ }
60162+ /* fall through */
60163+ case BLKIF_OP_READ:
60164+ case BLKIF_OP_WRITE:
60165+ if (unlikely(bret->status != BLKIF_RSP_OKAY))
60166+ DPRINTK("Bad return from blkdev data "
60167+ "request: %x\n", bret->status);
60168+
60169+ ret = end_that_request_first(req, uptodate,
60170+ req->hard_nr_sectors);
60171+ BUG_ON(ret);
60172+ end_that_request_last(req, uptodate);
60173+ break;
60174+ default:
60175+ BUG();
60176+ }
60177+ }
60178+
60179+ info->ring.rsp_cons = i;
60180+
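+	/* If requests are still outstanding, look for responses that raced in; otherwise re-arm rsp_event so the backend raises a fresh notification. */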
60181+ if (i != info->ring.req_prod_pvt) {
60182+ int more_to_do;
60183+ RING_FINAL_CHECK_FOR_RESPONSES(&info->ring, more_to_do);
60184+ if (more_to_do)
60185+ goto again;
60186+ } else
60187+ info->ring.sring->rsp_event = i + 1;
60188+
60189+ kick_pending_request_queues(info);
60190+
60191+ spin_unlock_irqrestore(&blkif_io_lock, flags);
60192+
60193+ return IRQ_HANDLED;
60194+}
60195+
60196+static void blkif_free(struct blkfront_info *info, int suspend)
60197+{
60198+ /* Prevent new requests being issued until we fix things up. */
60199+ spin_lock_irq(&blkif_io_lock);
60200+ info->connected = suspend ?
60201+ BLKIF_STATE_SUSPENDED : BLKIF_STATE_DISCONNECTED;
60202+ /* No more blkif_request(). */
60203+ if (info->rq)
60204+ blk_stop_queue(info->rq);
60205+ /* No more gnttab callback work. */
60206+ gnttab_cancel_free_callback(&info->callback);
60207+ spin_unlock_irq(&blkif_io_lock);
60208+
60209+ /* Flush gnttab callback work. Must be done with no locks held. */
60210+ flush_scheduled_work();
60211+
60212+ /* Free resources associated with old device channel. */
60213+ if (info->ring_ref != GRANT_INVALID_REF) {
60214+ gnttab_end_foreign_access(info->ring_ref, 0,
60215+ (unsigned long)info->ring.sring);
60216+ info->ring_ref = GRANT_INVALID_REF;
60217+ info->ring.sring = NULL;
60218+ }
60219+ if (info->irq)
60220+ unbind_from_irqhandler(info->irq, info);
60221+ info->evtchn = info->irq = 0;
60222+
60223+}
60224+
60225+static void blkif_completion(struct blk_shadow *s)
60226+{
60227+ int i;
60228+ for (i = 0; i < s->req.nr_segments; i++)
60229+ gnttab_end_foreign_access(s->req.seg[i].gref, 0, 0UL);
60230+}
60231+
60232+static void blkif_recover(struct blkfront_info *info)
60233+{
60234+ int i;
60235+ blkif_request_t *req;
60236+ struct blk_shadow *copy;
60237+ int j;
60238+
60239+ /* Stage 1: Make a safe copy of the shadow state. */
60240+ copy = kmalloc(sizeof(info->shadow), GFP_KERNEL | __GFP_NOFAIL);
60241+ memcpy(copy, info->shadow, sizeof(info->shadow));
60242+
60243+ /* Stage 2: Set up free list. */
60244+ memset(&info->shadow, 0, sizeof(info->shadow));
60245+ for (i = 0; i < BLK_RING_SIZE; i++)
60246+ info->shadow[i].req.id = i+1;
60247+ info->shadow_free = info->ring.req_prod_pvt;
60248+ info->shadow[BLK_RING_SIZE-1].req.id = 0x0fffffff;
60249+
60250+ /* Stage 3: Find pending requests and requeue them. */
60251+ for (i = 0; i < BLK_RING_SIZE; i++) {
60252+ /* Not in use? */
60253+ if (copy[i].request == 0)
60254+ continue;
60255+
60256+ /* Grab a request slot and copy shadow state into it. */
60257+ req = RING_GET_REQUEST(
60258+ &info->ring, info->ring.req_prod_pvt);
60259+ *req = copy[i].req;
60260+
60261+ /* We get a new request id, and must reset the shadow state. */
60262+ req->id = GET_ID_FROM_FREELIST(info);
60263+ memcpy(&info->shadow[req->id], &copy[i], sizeof(copy[i]));
60264+
60265+ /* Rewrite any grant references invalidated by susp/resume. */
60266+ for (j = 0; j < req->nr_segments; j++)
60267+ gnttab_grant_foreign_access_ref(
60268+ req->seg[j].gref,
60269+ info->xbdev->otherend_id,
60270+ pfn_to_mfn(info->shadow[req->id].frame[j]),
60271+ rq_data_dir(
60272+ (struct request *)
60273+ info->shadow[req->id].request));
60274+ info->shadow[req->id].req = *req;
60275+
60276+ info->ring.req_prod_pvt++;
60277+ }
60278+
60279+ kfree(copy);
60280+
60281+ (void)xenbus_switch_state(info->xbdev, XenbusStateConnected);
60282+
60283+ spin_lock_irq(&blkif_io_lock);
60284+
60285+ /* Now safe for us to use the shared ring */
60286+ info->connected = BLKIF_STATE_CONNECTED;
60287+
60288+ /* Send off requeued requests */
60289+ flush_requests(info);
60290+
60291+ /* Kick any other new requests queued since we resumed */
60292+ kick_pending_request_queues(info);
60293+
60294+ spin_unlock_irq(&blkif_io_lock);
60295+}
60296+
60297+
60298+/* ** Driver Registration ** */
60299+
60300+
60301+static struct xenbus_device_id blkfront_ids[] = {
60302+ { "vbd" },
60303+ { "" }
60304+};
60305+
60306+
60307+static struct xenbus_driver blkfront = {
60308+ .name = "vbd",
60309+ .owner = THIS_MODULE,
60310+ .ids = blkfront_ids,
60311+ .probe = blkfront_probe,
60312+ .remove = blkfront_remove,
60313+ .resume = blkfront_resume,
60314+ .otherend_changed = backend_changed,
60315+};
60316+
60317+
60318+static int __init xlblk_init(void)
60319+{
60320+ if (!is_running_on_xen())
60321+ return -ENODEV;
60322+
60323+ return xenbus_register_frontend(&blkfront);
60324+}
60325+module_init(xlblk_init);
60326+
60327+
60328+static void xlblk_exit(void)
60329+{
60330+ return xenbus_unregister_driver(&blkfront);
60331+}
60332+module_exit(xlblk_exit);
60333+
60334+MODULE_LICENSE("Dual BSD/GPL");
60335diff -Nur linux-2.6.16.33-noxen/drivers/xen/blkfront/block.h linux-2.6.16.33/drivers/xen/blkfront/block.h
60336--- linux-2.6.16.33-noxen/drivers/xen/blkfront/block.h 1970-01-01 00:00:00.000000000 +0000
60337+++ linux-2.6.16.33/drivers/xen/blkfront/block.h 2007-01-08 15:00:45.000000000 +0000
60338@@ -0,0 +1,158 @@
60339+/******************************************************************************
60340+ * block.h
60341+ *
60342+ * Shared definitions between all levels of XenLinux Virtual block devices.
60343+ *
60344+ * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
60345+ * Modifications by Mark A. Williamson are (c) Intel Research Cambridge
60346+ * Copyright (c) 2004-2005, Christian Limpach
60347+ *
60348+ * This program is free software; you can redistribute it and/or
60349+ * modify it under the terms of the GNU General Public License version 2
60350+ * as published by the Free Software Foundation; or, when distributed
60351+ * separately from the Linux kernel or incorporated into other
60352+ * software packages, subject to the following license:
60353+ *
60354+ * Permission is hereby granted, free of charge, to any person obtaining a copy
60355+ * of this source file (the "Software"), to deal in the Software without
60356+ * restriction, including without limitation the rights to use, copy, modify,
60357+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
60358+ * and to permit persons to whom the Software is furnished to do so, subject to
60359+ * the following conditions:
60360+ *
60361+ * The above copyright notice and this permission notice shall be included in
60362+ * all copies or substantial portions of the Software.
60363+ *
60364+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
60365+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
60366+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
60367+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
60368+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
60369+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
60370+ * IN THE SOFTWARE.
60371+ */
60372+
60373+#ifndef __XEN_DRIVERS_BLOCK_H__
60374+#define __XEN_DRIVERS_BLOCK_H__
60375+
60376+#include <linux/config.h>
60377+#include <linux/version.h>
60378+#include <linux/module.h>
60379+#include <linux/kernel.h>
60380+#include <linux/sched.h>
60381+#include <linux/slab.h>
60382+#include <linux/string.h>
60383+#include <linux/errno.h>
60384+#include <linux/fs.h>
60385+#include <linux/hdreg.h>
60386+#include <linux/blkdev.h>
60387+#include <linux/major.h>
60388+#include <linux/devfs_fs_kernel.h>
60389+#include <asm/hypervisor.h>
60390+#include <xen/xenbus.h>
60391+#include <xen/gnttab.h>
60392+#include <xen/interface/xen.h>
60393+#include <xen/interface/io/blkif.h>
60394+#include <xen/interface/io/ring.h>
60395+#include <asm/io.h>
60396+#include <asm/atomic.h>
60397+#include <asm/uaccess.h>
60398+
60399+#if 1
60400+#define IPRINTK(fmt, args...) \
60401+ printk(KERN_INFO "xen_blk: " fmt, ##args)
60402+#else
60403+#define IPRINTK(fmt, args...) ((void)0)
60404+#endif
60405+
60406+#if 1
60407+#define WPRINTK(fmt, args...) \
60408+ printk(KERN_WARNING "xen_blk: " fmt, ##args)
60409+#else
60410+#define WPRINTK(fmt, args...) ((void)0)
60411+#endif
60412+
60413+#define DPRINTK(_f, _a...) pr_debug(_f, ## _a)
60414+
60415+#if 0
60416+#define DPRINTK_IOCTL(_f, _a...) printk(KERN_ALERT _f, ## _a)
60417+#else
60418+#define DPRINTK_IOCTL(_f, _a...) ((void)0)
60419+#endif
60420+
60421+struct xlbd_type_info
60422+{
60423+ int partn_shift;
60424+ int disks_per_major;
60425+ char *devname;
60426+ char *diskname;
60427+};
60428+
60429+struct xlbd_major_info
60430+{
60431+ int major;
60432+ int index;
60433+ int usage;
60434+ struct xlbd_type_info *type;
60435+};
60436+
60437+struct blk_shadow {
60438+ blkif_request_t req;
60439+ unsigned long request;
60440+ unsigned long frame[BLKIF_MAX_SEGMENTS_PER_REQUEST];
60441+};
60442+
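+/* Number of request slots that fit in a single page-sized shared ring. */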
60443+#define BLK_RING_SIZE __RING_SIZE((blkif_sring_t *)0, PAGE_SIZE)
60444+
60445+/*
60446+ * We have one of these per vbd, whether ide, scsi or 'other'. They
60447+ * hang in private_data off the gendisk structure. We may end up
60448+ * putting all kinds of interesting stuff here :-)
60449+ */
60450+struct blkfront_info
60451+{
60452+ struct xenbus_device *xbdev;
60453+ dev_t dev;
60454+ struct gendisk *gd;
60455+ int vdevice;
60456+ blkif_vdev_t handle;
60457+ int connected;
60458+ int ring_ref;
60459+ blkif_front_ring_t ring;
60460+ unsigned int evtchn, irq;
60461+ struct xlbd_major_info *mi;
60462+ request_queue_t *rq;
60463+ struct work_struct work;
60464+ struct gnttab_free_callback callback;
60465+ struct blk_shadow shadow[BLK_RING_SIZE];
60466+ unsigned long shadow_free;
60467+ int feature_barrier;
60468+
60469+ /**
60470+ * The number of people holding this device open. We won't allow a
60471+ * hot-unplug unless this is 0.
60472+ */
60473+ int users;
60474+};
60475+
60476+extern spinlock_t blkif_io_lock;
60477+
60478+extern int blkif_open(struct inode *inode, struct file *filep);
60479+extern int blkif_release(struct inode *inode, struct file *filep);
60480+extern int blkif_ioctl(struct inode *inode, struct file *filep,
60481+ unsigned command, unsigned long argument);
60482+extern int blkif_getgeo(struct block_device *, struct hd_geometry *);
60483+extern int blkif_check(dev_t dev);
60484+extern int blkif_revalidate(dev_t dev);
60485+extern void do_blkif_request (request_queue_t *rq);
60486+
60487+/* Virtual block-device subsystem. */
60488+/* Note that xlvbd_add doesn't call add_disk for you: you're expected
60489+ to call add_disk on info->gd once the disk is properly connected
60490+ up. */
60491+int xlvbd_add(blkif_sector_t capacity, int device,
60492+ u16 vdisk_info, u16 sector_size, struct blkfront_info *info);
60493+void xlvbd_del(struct blkfront_info *info);
60494+int xlvbd_barrier(struct blkfront_info *info);
60495+
60496+#endif /* __XEN_DRIVERS_BLOCK_H__ */
60497diff -Nur linux-2.6.16.33-noxen/drivers/xen/blkfront/vbd.c linux-2.6.16.33/drivers/xen/blkfront/vbd.c
60498--- linux-2.6.16.33-noxen/drivers/xen/blkfront/vbd.c 1970-01-01 00:00:00.000000000 +0000
60499+++ linux-2.6.16.33/drivers/xen/blkfront/vbd.c 2007-01-08 15:00:45.000000000 +0000
60500@@ -0,0 +1,375 @@
60501+/******************************************************************************
60502+ * vbd.c
60503+ *
60504+ * XenLinux virtual block-device driver (xvd).
60505+ *
60506+ * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
60507+ * Modifications by Mark A. Williamson are (c) Intel Research Cambridge
60508+ * Copyright (c) 2004-2005, Christian Limpach
60509+ *
60510+ * This program is free software; you can redistribute it and/or
60511+ * modify it under the terms of the GNU General Public License version 2
60512+ * as published by the Free Software Foundation; or, when distributed
60513+ * separately from the Linux kernel or incorporated into other
60514+ * software packages, subject to the following license:
60515+ *
60516+ * Permission is hereby granted, free of charge, to any person obtaining a copy
60517+ * of this source file (the "Software"), to deal in the Software without
60518+ * restriction, including without limitation the rights to use, copy, modify,
60519+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
60520+ * and to permit persons to whom the Software is furnished to do so, subject to
60521+ * the following conditions:
60522+ *
60523+ * The above copyright notice and this permission notice shall be included in
60524+ * all copies or substantial portions of the Software.
60525+ *
60526+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
60527+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
60528+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
60529+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
60530+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
60531+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
60532+ * IN THE SOFTWARE.
60533+ */
60534+
60535+#include "block.h"
60536+#include <linux/blkdev.h>
60537+#include <linux/list.h>
60538+
60539+#ifdef HAVE_XEN_PLATFORM_COMPAT_H
60540+#include <xen/platform-compat.h>
60541+#endif
60542+
60543+#define BLKIF_MAJOR(dev) ((dev)>>8)
60544+#define BLKIF_MINOR(dev) ((dev) & 0xff)
60545+
60546+/*
60547+ * For convenience we distinguish between ide, scsi and 'other' (i.e.,
60548+ * potentially combinations of the two) in the naming scheme and in a few other
60549+ * places.
60550+ */
60551+
60552+#define NUM_IDE_MAJORS 10
60553+#define NUM_SCSI_MAJORS 17
60554+#define NUM_VBD_MAJORS 1
60555+
60556+static struct xlbd_type_info xlbd_ide_type = {
60557+ .partn_shift = 6,
60558+ .disks_per_major = 2,
60559+ .devname = "ide",
60560+ .diskname = "hd",
60561+};
60562+
60563+static struct xlbd_type_info xlbd_scsi_type = {
60564+ .partn_shift = 4,
60565+ .disks_per_major = 16,
60566+ .devname = "sd",
60567+ .diskname = "sd",
60568+};
60569+
60570+static struct xlbd_type_info xlbd_vbd_type = {
60571+ .partn_shift = 4,
60572+ .disks_per_major = 16,
60573+ .devname = "xvd",
60574+ .diskname = "xvd",
60575+};
60576+
60577+static struct xlbd_major_info *major_info[NUM_IDE_MAJORS + NUM_SCSI_MAJORS +
60578+ NUM_VBD_MAJORS];
60579+
60580+#define XLBD_MAJOR_IDE_START 0
60581+#define XLBD_MAJOR_SCSI_START (NUM_IDE_MAJORS)
60582+#define XLBD_MAJOR_VBD_START (NUM_IDE_MAJORS + NUM_SCSI_MAJORS)
60583+
60584+#define XLBD_MAJOR_IDE_RANGE XLBD_MAJOR_IDE_START ... XLBD_MAJOR_SCSI_START - 1
60585+#define XLBD_MAJOR_SCSI_RANGE XLBD_MAJOR_SCSI_START ... XLBD_MAJOR_VBD_START - 1
60586+#define XLBD_MAJOR_VBD_RANGE XLBD_MAJOR_VBD_START ... XLBD_MAJOR_VBD_START + NUM_VBD_MAJORS - 1
60587+
60588+/* Information about our VBDs. */
60589+#define MAX_VBDS 64
60590+static LIST_HEAD(vbds_list);
60591+
60592+static struct block_device_operations xlvbd_block_fops =
60593+{
60594+ .owner = THIS_MODULE,
60595+ .open = blkif_open,
60596+ .release = blkif_release,
60597+ .ioctl = blkif_ioctl,
60598+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,16)
60599+ .getgeo = blkif_getgeo
60600+#endif
60601+};
60602+
60603+DEFINE_SPINLOCK(blkif_io_lock);
60604+
60605+static struct xlbd_major_info *
60606+xlbd_alloc_major_info(int major, int minor, int index)
60607+{
60608+ struct xlbd_major_info *ptr;
60609+
60610+ ptr = kzalloc(sizeof(struct xlbd_major_info), GFP_KERNEL);
60611+ if (ptr == NULL)
60612+ return NULL;
60613+
60614+ ptr->major = major;
60615+
60616+ switch (index) {
60617+ case XLBD_MAJOR_IDE_RANGE:
60618+ ptr->type = &xlbd_ide_type;
60619+ ptr->index = index - XLBD_MAJOR_IDE_START;
60620+ break;
60621+ case XLBD_MAJOR_SCSI_RANGE:
60622+ ptr->type = &xlbd_scsi_type;
60623+ ptr->index = index - XLBD_MAJOR_SCSI_START;
60624+ break;
60625+ case XLBD_MAJOR_VBD_RANGE:
60626+ ptr->type = &xlbd_vbd_type;
60627+ ptr->index = index - XLBD_MAJOR_VBD_START;
60628+ break;
60629+ }
60630+
60631+ printk("Registering block device major %i\n", ptr->major);
60632+ if (register_blkdev(ptr->major, ptr->type->devname)) {
60633+ WPRINTK("can't get major %d with name %s\n",
60634+ ptr->major, ptr->type->devname);
60635+ kfree(ptr);
60636+ return NULL;
60637+ }
60638+
60639+ devfs_mk_dir(ptr->type->devname);
60640+ major_info[index] = ptr;
60641+ return ptr;
60642+}
60643+
60644+static struct xlbd_major_info *
60645+xlbd_get_major_info(int vdevice)
60646+{
60647+ struct xlbd_major_info *mi;
60648+ int major, minor, index;
60649+
60650+ major = BLKIF_MAJOR(vdevice);
60651+ minor = BLKIF_MINOR(vdevice);
60652+
60653+ switch (major) {
60654+ case IDE0_MAJOR: index = 0; break;
60655+ case IDE1_MAJOR: index = 1; break;
60656+ case IDE2_MAJOR: index = 2; break;
60657+ case IDE3_MAJOR: index = 3; break;
60658+ case IDE4_MAJOR: index = 4; break;
60659+ case IDE5_MAJOR: index = 5; break;
60660+ case IDE6_MAJOR: index = 6; break;
60661+ case IDE7_MAJOR: index = 7; break;
60662+ case IDE8_MAJOR: index = 8; break;
60663+ case IDE9_MAJOR: index = 9; break;
60664+ case SCSI_DISK0_MAJOR: index = 10; break;
60665+ case SCSI_DISK1_MAJOR ... SCSI_DISK7_MAJOR:
60666+ index = 11 + major - SCSI_DISK1_MAJOR;
60667+ break;
60668+ case SCSI_DISK8_MAJOR ... SCSI_DISK15_MAJOR:
60669+ index = 18 + major - SCSI_DISK8_MAJOR;
60670+ break;
60671+ case SCSI_CDROM_MAJOR: index = 26; break;
60672+ default: index = 27; break;
60673+ }
60674+
60675+ mi = ((major_info[index] != NULL) ? major_info[index] :
60676+ xlbd_alloc_major_info(major, minor, index));
60677+ if (mi)
60678+ mi->usage++;
60679+ return mi;
60680+}
60681+
60682+static void
60683+xlbd_put_major_info(struct xlbd_major_info *mi)
60684+{
60685+ mi->usage--;
60686+ /* XXX: release major if 0 */
60687+}
60688+
60689+static int
60690+xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size)
60691+{
60692+ request_queue_t *rq;
60693+
60694+ rq = blk_init_queue(do_blkif_request, &blkif_io_lock);
60695+ if (rq == NULL)
60696+ return -1;
60697+
60698+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,10)
60699+ elevator_init(rq, "noop");
60700+#else
60701+ elevator_init(rq, &elevator_noop);
60702+#endif
60703+
60704+ /* Hard sector size and max sectors impersonate the equiv. hardware. */
60705+ blk_queue_hardsect_size(rq, sector_size);
60706+ blk_queue_max_sectors(rq, 512);
60707+
60708+ /* Each segment in a request is up to an aligned page in size. */
60709+ blk_queue_segment_boundary(rq, PAGE_SIZE - 1);
60710+ blk_queue_max_segment_size(rq, PAGE_SIZE);
60711+
60712+ /* Ensure a merged request will fit in a single I/O ring slot. */
60713+ blk_queue_max_phys_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST);
60714+ blk_queue_max_hw_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST);
60715+
60716+ /* Make sure buffer addresses are sector-aligned. */
60717+ blk_queue_dma_alignment(rq, 511);
60718+
60719+ gd->queue = rq;
60720+
60721+ return 0;
60722+}
60723+
60724+static int
60725+xlvbd_alloc_gendisk(int minor, blkif_sector_t capacity, int vdevice,
60726+ u16 vdisk_info, u16 sector_size,
60727+ struct blkfront_info *info)
60728+{
60729+ struct gendisk *gd;
60730+ struct xlbd_major_info *mi;
60731+ int nr_minors = 1;
60732+ int err = -ENODEV;
60733+ unsigned int offset;
60734+
60735+ BUG_ON(info->gd != NULL);
60736+ BUG_ON(info->mi != NULL);
60737+ BUG_ON(info->rq != NULL);
60738+
60739+ mi = xlbd_get_major_info(vdevice);
60740+ if (mi == NULL)
60741+ goto out;
60742+ info->mi = mi;
60743+
60744+ if ((minor & ((1 << mi->type->partn_shift) - 1)) == 0)
60745+ nr_minors = 1 << mi->type->partn_shift;
60746+
60747+ gd = alloc_disk(nr_minors);
60748+ if (gd == NULL)
60749+ goto out;
60750+
60751+ offset = mi->index * mi->type->disks_per_major +
60752+ (minor >> mi->type->partn_shift);
60753+ if (nr_minors > 1) {
60754+ if (offset < 26) {
60755+ sprintf(gd->disk_name, "%s%c",
60756+ mi->type->diskname, 'a' + offset );
60757+ }
60758+ else {
60759+ sprintf(gd->disk_name, "%s%c%c",
60760+ mi->type->diskname,
60761+ 'a' + ((offset/26)-1), 'a' + (offset%26) );
60762+ }
60763+ }
60764+ else {
60765+ if (offset < 26) {
60766+ sprintf(gd->disk_name, "%s%c%d",
60767+ mi->type->diskname,
60768+ 'a' + offset,
60769+ minor & ((1 << mi->type->partn_shift) - 1));
60770+ }
60771+ else {
60772+ sprintf(gd->disk_name, "%s%c%c%d",
60773+ mi->type->diskname,
60774+ 'a' + ((offset/26)-1), 'a' + (offset%26),
60775+ minor & ((1 << mi->type->partn_shift) - 1));
60776+ }
60777+ }
60778+
60779+ gd->major = mi->major;
60780+ gd->first_minor = minor;
60781+ gd->fops = &xlvbd_block_fops;
60782+ gd->private_data = info;
60783+ gd->driverfs_dev = &(info->xbdev->dev);
60784+ set_capacity(gd, capacity);
60785+
60786+ if (xlvbd_init_blk_queue(gd, sector_size)) {
60787+ del_gendisk(gd);
60788+ goto out;
60789+ }
60790+
60791+ info->rq = gd->queue;
60792+ info->gd = gd;
60793+
60794+ if (info->feature_barrier)
60795+ xlvbd_barrier(info);
60796+
60797+ if (vdisk_info & VDISK_READONLY)
60798+ set_disk_ro(gd, 1);
60799+
60800+ if (vdisk_info & VDISK_REMOVABLE)
60801+ gd->flags |= GENHD_FL_REMOVABLE;
60802+
60803+ if (vdisk_info & VDISK_CDROM)
60804+ gd->flags |= GENHD_FL_CD;
60805+
60806+ return 0;
60807+
60808+ out:
60809+ if (mi)
60810+ xlbd_put_major_info(mi);
60811+ info->mi = NULL;
60812+ return err;
60813+}
60814+
60815+int
60816+xlvbd_add(blkif_sector_t capacity, int vdevice, u16 vdisk_info,
60817+ u16 sector_size, struct blkfront_info *info)
60818+{
60819+ struct block_device *bd;
60820+ int err = 0;
60821+
60822+ info->dev = MKDEV(BLKIF_MAJOR(vdevice), BLKIF_MINOR(vdevice));
60823+
60824+ bd = bdget(info->dev);
60825+ if (bd == NULL)
60826+ return -ENODEV;
60827+
60828+ err = xlvbd_alloc_gendisk(BLKIF_MINOR(vdevice), capacity, vdevice,
60829+ vdisk_info, sector_size, info);
60830+
60831+ bdput(bd);
60832+ return err;
60833+}
60834+
60835+void
60836+xlvbd_del(struct blkfront_info *info)
60837+{
60838+ if (info->mi == NULL)
60839+ return;
60840+
60841+ BUG_ON(info->gd == NULL);
60842+ del_gendisk(info->gd);
60843+ put_disk(info->gd);
60844+ info->gd = NULL;
60845+
60846+ xlbd_put_major_info(info->mi);
60847+ info->mi = NULL;
60848+
60849+ BUG_ON(info->rq == NULL);
60850+ blk_cleanup_queue(info->rq);
60851+ info->rq = NULL;
60852+}
60853+
60854+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,16)
60855+int
60856+xlvbd_barrier(struct blkfront_info *info)
60857+{
60858+ int err;
60859+
60860+ err = blk_queue_ordered(info->rq,
60861+ info->feature_barrier ? QUEUE_ORDERED_DRAIN : QUEUE_ORDERED_NONE, NULL);
60862+ if (err)
60863+ return err;
60864+ printk("blkfront: %s: barriers %s\n",
60865+ info->gd->disk_name, info->feature_barrier ? "enabled" : "disabled");
60866+ return 0;
60867+}
60868+#else
60869+int
60870+xlvbd_barrier(struct blkfront_info *info)
60871+{
60872+ printk("blkfront: %s: barriers disabled\n", info->gd->disk_name);
60873+ return -ENOSYS;
60874+}
60875+#endif
60876diff -Nur linux-2.6.16.33-noxen/drivers/xen/blktap/Makefile linux-2.6.16.33/drivers/xen/blktap/Makefile
60877--- linux-2.6.16.33-noxen/drivers/xen/blktap/Makefile 1970-01-01 00:00:00.000000000 +0000
60878+++ linux-2.6.16.33/drivers/xen/blktap/Makefile 2007-01-08 15:00:45.000000000 +0000
60879@@ -0,0 +1,3 @@
60880+LINUXINCLUDE += -I../xen/include/public/io
60881+obj-y := xenbus.o interface.o blktap.o
60882+
60883diff -Nur linux-2.6.16.33-noxen/drivers/xen/blktap/blktap.c linux-2.6.16.33/drivers/xen/blktap/blktap.c
60884--- linux-2.6.16.33-noxen/drivers/xen/blktap/blktap.c 1970-01-01 00:00:00.000000000 +0000
60885+++ linux-2.6.16.33/drivers/xen/blktap/blktap.c 2007-01-08 15:00:45.000000000 +0000
60886@@ -0,0 +1,1517 @@
60887+/******************************************************************************
60888+ * drivers/xen/blktap/blktap.c
60889+ *
60890+ * Back-end driver for user level virtual block devices. This portion of the
60891+ * driver exports a 'unified' block-device interface that can be accessed
60892+ * by any operating system that implements a compatible front end. Requests
60893+ * are remapped to a user-space memory region.
60894+ *
60895+ * Based on the blkback driver code.
60896+ *
60897+ * Copyright (c) 2004-2005, Andrew Warfield and Julian Chesterfield
60898+ *
60899+ * Clean ups and fix ups:
60900+ * Copyright (c) 2006, Steven Rostedt - Red Hat, Inc.
60901+ *
60902+ * This program is free software; you can redistribute it and/or
60903+ * modify it under the terms of the GNU General Public License version 2
60904+ * as published by the Free Software Foundation; or, when distributed
60905+ * separately from the Linux kernel or incorporated into other
60906+ * software packages, subject to the following license:
60907+ *
60908+ * Permission is hereby granted, free of charge, to any person obtaining a copy
60909+ * of this source file (the "Software"), to deal in the Software without
60910+ * restriction, including without limitation the rights to use, copy, modify,
60911+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
60912+ * and to permit persons to whom the Software is furnished to do so, subject to
60913+ * the following conditions:
60914+ *
60915+ * The above copyright notice and this permission notice shall be included in
60916+ * all copies or substantial portions of the Software.
60917+ *
60918+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
60919+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
60920+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
60921+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
60922+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
60923+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
60924+ * IN THE SOFTWARE.
60925+ */
60926+
60927+#include <linux/spinlock.h>
60928+#include <linux/kthread.h>
60929+#include <linux/list.h>
60930+#include <asm/hypervisor.h>
60931+#include "common.h"
60932+#include <xen/balloon.h>
60933+#include <linux/kernel.h>
60934+#include <linux/fs.h>
60935+#include <linux/mm.h>
60936+#include <linux/errno.h>
60937+#include <linux/major.h>
60938+#include <linux/gfp.h>
60939+#include <linux/poll.h>
60940+#include <asm/tlbflush.h>
60941+#include <linux/devfs_fs_kernel.h>
60942+
60943+#define MAX_TAP_DEV 256 /*the maximum number of tapdisk ring devices */
60944+#define MAX_DEV_NAME 100 /*the max tapdisk ring device name e.g. blktap0 */
60945+
60946+
60947+struct class *xen_class;
60948+EXPORT_SYMBOL_GPL(xen_class);
60949+
60950+/*
60951+ * Set up the xen class. This should probably go in another file, but
60952+ * since blktap is the only user of it so far, it gets to keep it.
60953+ */
60954+int setup_xen_class(void)
60955+{
60956+ int ret;
60957+
60958+ if (xen_class)
60959+ return 0;
60960+
60961+ xen_class = class_create(THIS_MODULE, "xen");
60962+ if ((ret = IS_ERR(xen_class))) {
60963+ xen_class = NULL;
60964+ return ret;
60965+ }
60966+
60967+ return 0;
60968+}
60969+
60970+/*
60971+ * The maximum number of requests that can be outstanding at any time
60972+ * is determined by
60973+ *
60974+ * [mmap_alloc * MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST]
60975+ *
60976+ * where mmap_alloc < MAX_DYNAMIC_MEM.
60977+ *
60978+ * TODO:
60979+ * mmap_alloc is initialised to 2 and should be adjustable on the fly via
60980+ * sysfs.
60981+ */
60982+#define BLK_RING_SIZE __RING_SIZE((blkif_sring_t *)0, PAGE_SIZE)
60983+#define MAX_DYNAMIC_MEM BLK_RING_SIZE
60984+#define MAX_PENDING_REQS BLK_RING_SIZE
60985+#define MMAP_PAGES (MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST)
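+/* User-space virtual address of segment _seg of pending request _req within the mmap area starting at _start. */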
60986+#define MMAP_VADDR(_start, _req,_seg) \
60987+ (_start + \
60988+ ((_req) * BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE) + \
60989+ ((_seg) * PAGE_SIZE))
60990+static int blkif_reqs = MAX_PENDING_REQS;
60991+static int mmap_pages = MMAP_PAGES;
60992+
60993+#define RING_PAGES 1 /* BLKTAP - immediately before the mmap area, we
60994+ * have a bunch of pages reserved for shared
60995+ * memory rings.
60996+ */
60997+
60998+/*Data struct handed back to userspace for tapdisk device to VBD mapping*/
60999+typedef struct domid_translate {
61000+ unsigned short domid;
61001+ unsigned short busid;
61002+} domid_translate_t ;
61003+
61004+/*Data struct associated with each of the tapdisk devices*/
61005+typedef struct tap_blkif {
61006+ struct vm_area_struct *vma; /*Shared memory area */
61007+ unsigned long rings_vstart; /*Kernel memory mapping */
61008+ unsigned long user_vstart; /*User memory mapping */
61009+ unsigned long dev_inuse; /*One process opens device at a time. */
61010+ unsigned long dev_pending; /*In process of being opened */
61011+ unsigned long ring_ok; /*make this ring->state */
61012+ blkif_front_ring_t ufe_ring; /*Rings up to user space. */
61013+ wait_queue_head_t wait; /*for poll */
61014+ unsigned long mode; /*current switching mode */
61015+ int minor; /*Minor number for tapdisk device */
61016+ pid_t pid; /*tapdisk process id */
61017+ enum { RUNNING, CLEANSHUTDOWN } status; /*Detect a clean userspace
61018+ shutdown */
61019+ unsigned long *idx_map; /*Record the user ring id to kern
61020+ [req id, idx] tuple */
61021+ blkif_t *blkif; /*Associate blkif with tapdev */
61022+ struct domid_translate trans; /*Translation from domid to bus. */
61023+} tap_blkif_t;
61024+
61025+static struct tap_blkif *tapfds[MAX_TAP_DEV];
61026+static int blktap_next_minor;
61027+
61028+static int __init set_blkif_reqs(char *str)
61029+{
61030+ get_option(&str, &blkif_reqs);
61031+ return 1;
61032+}
61033+__setup("blkif_reqs=", set_blkif_reqs);
61034+
61035+/* Run-time switchable: /sys/module/blktap/parameters/ */
61036+static unsigned int log_stats = 0;
61037+static unsigned int debug_lvl = 0;
61038+module_param(log_stats, int, 0644);
61039+module_param(debug_lvl, int, 0644);
61040+
61041+/*
61042+ * Each outstanding request that we've passed to the lower device layers has a
61043+ * 'pending_req' allocated to it. Each buffer_head that completes decrements
61044+ * the pendcnt towards zero. When it hits zero, the specified domain has a
61045+ * response queued for it, with the saved 'id' passed back.
61046+ */
61047+typedef struct {
61048+ blkif_t *blkif;
61049+ unsigned long id;
61050+ unsigned short mem_idx;
61051+ int nr_pages;
61052+ atomic_t pendcnt;
61053+ unsigned short operation;
61054+ int status;
61055+ struct list_head free_list;
61056+ int inuse;
61057+} pending_req_t;
61058+
61059+static pending_req_t *pending_reqs[MAX_PENDING_REQS];
61060+static struct list_head pending_free;
61061+static DEFINE_SPINLOCK(pending_free_lock);
61062+static DECLARE_WAIT_QUEUE_HEAD (pending_free_wq);
61063+static int alloc_pending_reqs;
61064+
61065+typedef unsigned int PEND_RING_IDX;
61066+
61067+static inline int MASK_PEND_IDX(int i) {
61068+ return (i & (MAX_PENDING_REQS-1));
61069+}
61070+
61071+static inline unsigned int RTN_PEND_IDX(pending_req_t *req, int idx) {
61072+ return (req - pending_reqs[idx]);
61073+}
61074+
61075+#define NR_PENDING_REQS (MAX_PENDING_REQS - pending_prod + pending_cons)
61076+
61077+#define BLKBACK_INVALID_HANDLE (~0)
61078+
61079+static struct page **foreign_pages[MAX_DYNAMIC_MEM];
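+/* Kernel virtual address of the foreign page backing segment sg_idx of the request in slot req_idx of mmap area mmap_idx. */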
61080+static inline unsigned long idx_to_kaddr(
61081+ unsigned int mmap_idx, unsigned int req_idx, unsigned int sg_idx)
61082+{
61083+ unsigned int arr_idx = req_idx*BLKIF_MAX_SEGMENTS_PER_REQUEST + sg_idx;
61084+ unsigned long pfn = page_to_pfn(foreign_pages[mmap_idx][arr_idx]);
61085+ return (unsigned long)pfn_to_kaddr(pfn);
61086+}
61087+
61088+static unsigned short mmap_alloc = 0;
61089+static unsigned short mmap_lock = 0;
61090+static unsigned short mmap_inuse = 0;
61091+
61092+/******************************************************************
61093+ * GRANT HANDLES
61094+ */
61095+
61096+/* When using grant tables to map a frame for device access then the
61097+ * handle returned must be used to unmap the frame. This is needed to
61098+ * drop the ref count on the frame.
61099+ */
61100+struct grant_handle_pair
61101+{
61102+ grant_handle_t kernel;
61103+ grant_handle_t user;
61104+};
61105+#define INVALID_GRANT_HANDLE 0xFFFF
61106+
61107+static struct grant_handle_pair
61108+ pending_grant_handles[MAX_DYNAMIC_MEM][MMAP_PAGES];
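+/* pending_handle(): the kernel/user grant-handle pair for segment _i of the request in slot _idx of mmap area _id. */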
61109+#define pending_handle(_id, _idx, _i) \
61110+ (pending_grant_handles[_id][((_idx) * BLKIF_MAX_SEGMENTS_PER_REQUEST) \
61111+ + (_i)])
61112+
61113+
61114+static int blktap_read_ufe_ring(tap_blkif_t *info); /*local prototypes*/
61115+
61116+#define BLKTAP_MINOR 0 /*/dev/xen/blktap has a dynamic major */
61117+#define BLKTAP_DEV_DIR "/dev/xen"
61118+
61119+static int blktap_major;
61120+
61121+/* blktap IOCTLs: */
61122+#define BLKTAP_IOCTL_KICK_FE 1
61123+#define BLKTAP_IOCTL_KICK_BE 2 /* currently unused */
61124+#define BLKTAP_IOCTL_SETMODE 3
61125+#define BLKTAP_IOCTL_SENDPID 4
61126+#define BLKTAP_IOCTL_NEWINTF 5
61127+#define BLKTAP_IOCTL_MINOR 6
61128+#define BLKTAP_IOCTL_MAJOR 7
61129+#define BLKTAP_QUERY_ALLOC_REQS 8
61130+#define BLKTAP_IOCTL_FREEINTF 9
61131+#define BLKTAP_IOCTL_PRINT_IDXS 100
61132+
61133+/* blktap switching modes: (Set with BLKTAP_IOCTL_SETMODE) */
61134+#define BLKTAP_MODE_PASSTHROUGH 0x00000000 /* default */
61135+#define BLKTAP_MODE_INTERCEPT_FE 0x00000001
61136+#define BLKTAP_MODE_INTERCEPT_BE 0x00000002 /* unimp. */
61137+
61138+#define BLKTAP_MODE_INTERPOSE \
61139+ (BLKTAP_MODE_INTERCEPT_FE | BLKTAP_MODE_INTERCEPT_BE)
61140+
61141+
61142+static inline int BLKTAP_MODE_VALID(unsigned long arg)
61143+{
61144+ return ((arg == BLKTAP_MODE_PASSTHROUGH ) ||
61145+ (arg == BLKTAP_MODE_INTERCEPT_FE) ||
61146+ (arg == BLKTAP_MODE_INTERPOSE ));
61147+}
61148+
61149+/* Requests passing through the tap to userspace are re-assigned an ID.
61150+ * We must record a mapping between the BE [IDX,ID] tuple and the userspace
61151+ * ring ID.
61152+ */
61153+
61154+static inline unsigned long MAKE_ID(domid_t fe_dom, PEND_RING_IDX idx)
61155+{
61156+ return ((fe_dom << 16) | MASK_PEND_IDX(idx));
61157+}
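+/* Example: MAKE_ID(3, 5) yields 0x00030005; ID_TO_MIDX() recovers the 3 and ID_TO_IDX() the 5. */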
61158+
61159+extern inline PEND_RING_IDX ID_TO_IDX(unsigned long id)
61160+{
61161+ return (PEND_RING_IDX)(id & 0x0000ffff);
61162+}
61163+
61164+extern inline int ID_TO_MIDX(unsigned long id)
61165+{
61166+ return (int)(id >> 16);
61167+}
61168+
61169+#define INVALID_REQ 0xdead0000
61170+
61171+/*TODO: Convert to a free list*/
61172+static inline int GET_NEXT_REQ(unsigned long *idx_map)
61173+{
61174+ int i;
61175+ for (i = 0; i < MAX_PENDING_REQS; i++)
61176+ if (idx_map[i] == INVALID_REQ)
61177+ return i;
61178+
61179+ return INVALID_REQ;
61180+}
61181+
61182+
61183+#define BLKTAP_INVALID_HANDLE(_g) \
61184+ (((_g->kernel) == INVALID_GRANT_HANDLE) && \
61185+ ((_g->user) == INVALID_GRANT_HANDLE))
61186+
61187+#define BLKTAP_INVALIDATE_HANDLE(_g) do { \
61188+ (_g)->kernel = INVALID_GRANT_HANDLE; (_g)->user = INVALID_GRANT_HANDLE; \
61189+ } while(0)
61190+
61191+
61192+/******************************************************************
61193+ * BLKTAP VM OPS
61194+ */
61195+
61196+static struct page *blktap_nopage(struct vm_area_struct *vma,
61197+ unsigned long address,
61198+ int *type)
61199+{
61200+ /*
61201+ * if the page has not been mapped in by the driver then return
61202+ * NOPAGE_SIGBUS to the domain.
61203+ */
61204+
61205+ return NOPAGE_SIGBUS;
61206+}
61207+
61208+struct vm_operations_struct blktap_vm_ops = {
61209+ nopage: blktap_nopage,
61210+};
61211+
61212+/******************************************************************
61213+ * BLKTAP FILE OPS
61214+ */
61215+
61216+/*Function Declarations*/
61217+static tap_blkif_t *get_next_free_dev(void);
61218+static int blktap_open(struct inode *inode, struct file *filp);
61219+static int blktap_release(struct inode *inode, struct file *filp);
61220+static int blktap_mmap(struct file *filp, struct vm_area_struct *vma);
61221+static int blktap_ioctl(struct inode *inode, struct file *filp,
61222+ unsigned int cmd, unsigned long arg);
61223+static unsigned int blktap_poll(struct file *file, poll_table *wait);
61224+
61225+static struct file_operations blktap_fops = {
61226+ .owner = THIS_MODULE,
61227+ .poll = blktap_poll,
61228+ .ioctl = blktap_ioctl,
61229+ .open = blktap_open,
61230+ .release = blktap_release,
61231+ .mmap = blktap_mmap,
61232+};
61233+
61234+
61235+static tap_blkif_t *get_next_free_dev(void)
61236+{
61237+ tap_blkif_t *info;
61238+ int minor;
61239+
61240+ /*
61241+ * This is called only from the ioctl, which
61242+ * means we should always have interrupts enabled.
61243+ */
61244+ BUG_ON(irqs_disabled());
61245+
61246+ spin_lock_irq(&pending_free_lock);
61247+
61248+ /* tapfds[0] is always NULL */
61249+
61250+ for (minor = 1; minor < blktap_next_minor; minor++) {
61251+ info = tapfds[minor];
61252+ /* we could have failed a previous attempt. */
61253+ if (!info ||
61254+ ((info->dev_inuse == 0) &&
61255+ (info->dev_pending == 0)) ) {
61256+ info->dev_pending = 1;
61257+ goto found;
61258+ }
61259+ }
61260+ info = NULL;
61261+ minor = -1;
61262+
61263+ /*
61264+	 * We didn't find a free device. If we can still allocate
61265+ * more, then we grab the next device minor that is
61266+ * available. This is done while we are still under
61267+ * the protection of the pending_free_lock.
61268+ */
61269+ if (blktap_next_minor < MAX_TAP_DEV)
61270+ minor = blktap_next_minor++;
61271+found:
61272+ spin_unlock_irq(&pending_free_lock);
61273+
61274+ if (!info && minor > 0) {
61275+ info = kzalloc(sizeof(*info), GFP_KERNEL);
61276+ if (unlikely(!info)) {
61277+ /*
61278+ * If we failed here, try to put back
61279+ * the next minor number. But if one
61280+ * was just taken, then we just lose this
61281+ * minor. We can try to allocate this
61282+ * minor again later.
61283+ */
61284+ spin_lock_irq(&pending_free_lock);
61285+ if (blktap_next_minor == minor+1)
61286+ blktap_next_minor--;
61287+ spin_unlock_irq(&pending_free_lock);
61288+ goto out;
61289+ }
61290+
61291+ info->minor = minor;
61292+ /*
61293+ * Make sure that we have a minor before others can
61294+ * see us.
61295+ */
61296+ wmb();
61297+ tapfds[minor] = info;
61298+
61299+ class_device_create(xen_class, NULL,
61300+ MKDEV(blktap_major, minor), NULL,
61301+ "blktap%d", minor);
61302+ devfs_mk_cdev(MKDEV(blktap_major, minor),
61303+ S_IFCHR|S_IRUGO|S_IWUSR, "xen/blktap%d", minor);
61304+ }
61305+
61306+out:
61307+ return info;
61308+}
61309+
61310+int dom_to_devid(domid_t domid, int xenbus_id, blkif_t *blkif)
61311+{
61312+ tap_blkif_t *info;
61313+ int i;
61314+
61315+ for (i = 1; i < blktap_next_minor; i++) {
61316+ info = tapfds[i];
61317+ if ( info &&
61318+ (info->trans.domid == domid) &&
61319+ (info->trans.busid == xenbus_id) ) {
61320+ info->blkif = blkif;
61321+ info->status = RUNNING;
61322+ return i;
61323+ }
61324+ }
61325+ return -1;
61326+}
61327+
61328+void signal_tapdisk(int idx)
61329+{
61330+ tap_blkif_t *info;
61331+ struct task_struct *ptask;
61332+
61333+	info = (idx < 0 || idx > MAX_TAP_DEV) ? NULL : tapfds[idx];
61334+	if (!info)
61335+		return;
61336+
61337+ if (info->pid > 0) {
61338+ ptask = find_task_by_pid(info->pid);
61339+ if (ptask)
61340+ info->status = CLEANSHUTDOWN;
61341+ }
61342+ info->blkif = NULL;
61343+
61344+ return;
61345+}
61346+
61347+static int blktap_open(struct inode *inode, struct file *filp)
61348+{
61349+ blkif_sring_t *sring;
61350+ int idx = iminor(inode) - BLKTAP_MINOR;
61351+ tap_blkif_t *info;
61352+ int i;
61353+
61354+ /* ctrl device, treat differently */
61355+ if (!idx)
61356+ return 0;
61357+
61358+	info = (idx < 0 || idx > MAX_TAP_DEV) ? NULL : tapfds[idx];
61359+
61360+	if (!info) {
61361+ WPRINTK("Unable to open device /dev/xen/blktap%d\n",
61362+ idx);
61363+ return -ENODEV;
61364+ }
61365+
61366+ DPRINTK("Opening device /dev/xen/blktap%d\n",idx);
61367+
61368+ /*Only one process can access device at a time*/
61369+ if (test_and_set_bit(0, &info->dev_inuse))
61370+ return -EBUSY;
61371+
61372+ info->dev_pending = 0;
61373+
61374+ /* Allocate the fe ring. */
61375+ sring = (blkif_sring_t *)get_zeroed_page(GFP_KERNEL);
61376+ if (sring == NULL)
61377+ goto fail_nomem;
61378+
61379+ SetPageReserved(virt_to_page(sring));
61380+
61381+ SHARED_RING_INIT(sring);
61382+ FRONT_RING_INIT(&info->ufe_ring, sring, PAGE_SIZE);
61383+
61384+ filp->private_data = info;
61385+ info->vma = NULL;
61386+
61387+ info->idx_map = kmalloc(sizeof(unsigned long) * MAX_PENDING_REQS,
61388+ GFP_KERNEL);
61389+
61390+ if (idx > 0) {
61391+ init_waitqueue_head(&info->wait);
61392+ for (i = 0; i < MAX_PENDING_REQS; i++)
61393+ info->idx_map[i] = INVALID_REQ;
61394+ }
61395+
61396+ DPRINTK("Tap open: device /dev/xen/blktap%d\n",idx);
61397+ return 0;
61398+
61399+ fail_nomem:
61400+ return -ENOMEM;
61401+}
61402+
61403+static int blktap_release(struct inode *inode, struct file *filp)
61404+{
61405+ tap_blkif_t *info = filp->private_data;
61406+
61407+ /* check for control device */
61408+ if (!info)
61409+ return 0;
61410+
61411+ info->dev_inuse = 0;
61412+ DPRINTK("Freeing device [/dev/xen/blktap%d]\n",info->minor);
61413+
61414+ /* Free the ring page. */
61415+ ClearPageReserved(virt_to_page(info->ufe_ring.sring));
61416+ free_page((unsigned long) info->ufe_ring.sring);
61417+
61418+ /* Clear any active mappings and free foreign map table */
61419+ if (info->vma) {
61420+ zap_page_range(
61421+ info->vma, info->vma->vm_start,
61422+ info->vma->vm_end - info->vma->vm_start, NULL);
61423+ info->vma = NULL;
61424+ }
61425+
61426+ if ( (info->status != CLEANSHUTDOWN) && (info->blkif != NULL) ) {
61427+ if (info->blkif->xenblkd != NULL) {
61428+ kthread_stop(info->blkif->xenblkd);
61429+ info->blkif->xenblkd = NULL;
61430+ }
61431+ info->status = CLEANSHUTDOWN;
61432+ }
61433+ return 0;
61434+}
61435+
61436+
61437+/* Note on mmap:
61438+ * We need to map pages to user space in a way that will allow the block
61439+ * subsystem set up direct IO to them. This couldn't be done before, because
61440+ * there isn't really a sane way to translate a user virtual address down to a
61441+ * physical address when the page belongs to another domain.
61442+ *
61443+ * My first approach was to map the page in to kernel memory, add an entry
61444+ * for it in the physical frame list (using alloc_lomem_region as in blkback)
61445+ * and then attempt to map that page up to user space. This is disallowed
61446+ * by xen though, which realizes that we don't really own the machine frame
61447+ * underlying the physical page.
61448+ *
61449+ * The new approach is to provide explicit support for this in xen linux.
61450+ * The VMA now has a flag, VM_FOREIGN, to indicate that it contains pages
61451+ * mapped from other vms. vma->vm_private_data is set up as a mapping
61452+ * from pages to actual page structs. There is a new clause in get_user_pages
61453+ * that does the right thing for this sort of mapping.
61454+ */
61455+static int blktap_mmap(struct file *filp, struct vm_area_struct *vma)
61456+{
61457+ int size;
61458+ struct page **map;
61459+ int i;
61460+ tap_blkif_t *info = filp->private_data;
61461+
61462+ if (info == NULL) {
61463+ WPRINTK("blktap: mmap, retrieving idx failed\n");
61464+ return -ENOMEM;
61465+ }
61466+
61467+ vma->vm_flags |= VM_RESERVED;
61468+ vma->vm_ops = &blktap_vm_ops;
61469+
61470+ size = vma->vm_end - vma->vm_start;
61471+ if (size != ((mmap_pages + RING_PAGES) << PAGE_SHIFT)) {
61472+ WPRINTK("you _must_ map exactly %d pages!\n",
61473+ mmap_pages + RING_PAGES);
61474+ return -EAGAIN;
61475+ }
61476+
61477+ size >>= PAGE_SHIFT;
61478+ info->rings_vstart = vma->vm_start;
61479+ info->user_vstart = info->rings_vstart + (RING_PAGES << PAGE_SHIFT);
61480+
61481+ /* Map the ring pages to the start of the region and reserve it. */
61482+ if (remap_pfn_range(vma, vma->vm_start,
61483+ __pa(info->ufe_ring.sring) >> PAGE_SHIFT,
61484+ PAGE_SIZE, vma->vm_page_prot)) {
61485+ WPRINTK("Mapping user ring failed!\n");
61486+ goto fail;
61487+ }
61488+
61489+ /* Mark this VM as containing foreign pages, and set up mappings. */
61490+ map = kzalloc(((vma->vm_end - vma->vm_start) >> PAGE_SHIFT)
61491+		      * sizeof(struct page *),
61492+ GFP_KERNEL);
61493+ if (map == NULL) {
61494+ WPRINTK("Couldn't alloc VM_FOREIGN map.\n");
61495+ goto fail;
61496+ }
61497+
61498+ for (i = 0; i < ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT); i++)
61499+ map[i] = NULL;
61500+
61501+ vma->vm_private_data = map;
61502+ vma->vm_flags |= VM_FOREIGN;
61503+
61504+ info->vma = vma;
61505+ info->ring_ok = 1;
61506+ return 0;
61507+ fail:
61508+ /* Clear any active mappings. */
61509+ zap_page_range(vma, vma->vm_start,
61510+ vma->vm_end - vma->vm_start, NULL);
61511+
61512+ return -ENOMEM;
61513+}
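/*
 * Editorial sketch (not part of the patch): blktap_mmap() above insists that
 * the tapdisk process map exactly mmap_pages + RING_PAGES pages in a single
 * region; the first RING_PAGES page(s) hold the shared blkif ring and the
 * rest receive the foreign data pages.  A minimal user-space counterpart
 * could look like the sketch below -- the device path convention comes from
 * this patch, but the page counts must match whatever the driver was built
 * with, so the parameters here are assumptions of the sketch.
 */
#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>

/* Returns the fd on success (caller keeps it open), -1 on failure. */
int map_blktap(const char *devpath, size_t ring_pages, size_t data_pages,
	       void **ring_out, void **data_out)
{
	long page = sysconf(_SC_PAGESIZE);
	size_t len = (ring_pages + data_pages) * (size_t)page;
	int fd = open(devpath, O_RDWR);
	void *area;

	if (fd < 0)
		return -1;

	/* The whole region must be mapped at once or the driver returns -EAGAIN. */
	area = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (area == MAP_FAILED) {
		close(fd);
		return -1;
	}

	*ring_out = area;					/* shared ring comes first */
	*data_out = (char *)area + ring_pages * (size_t)page;	/* then the data pages */
	return fd;
}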
61514+
61515+
61516+static int blktap_ioctl(struct inode *inode, struct file *filp,
61517+ unsigned int cmd, unsigned long arg)
61518+{
61519+ tap_blkif_t *info = filp->private_data;
61520+
61521+ switch(cmd) {
61522+ case BLKTAP_IOCTL_KICK_FE:
61523+ {
61524+ /* There are fe messages to process. */
61525+ return blktap_read_ufe_ring(info);
61526+ }
61527+ case BLKTAP_IOCTL_SETMODE:
61528+ {
61529+ if (info) {
61530+ if (BLKTAP_MODE_VALID(arg)) {
61531+ info->mode = arg;
61532+ /* XXX: may need to flush rings here. */
61533+ DPRINTK("blktap: set mode to %lx\n",
61534+ arg);
61535+ return 0;
61536+ }
61537+ }
61538+ return 0;
61539+ }
61540+ case BLKTAP_IOCTL_PRINT_IDXS:
61541+ {
61542+ if (info) {
61543+ printk("User Rings: \n-----------\n");
61544+ printk("UF: rsp_cons: %2d, req_prod_prv: %2d "
61545+ "| req_prod: %2d, rsp_prod: %2d\n",
61546+ info->ufe_ring.rsp_cons,
61547+ info->ufe_ring.req_prod_pvt,
61548+ info->ufe_ring.sring->req_prod,
61549+ info->ufe_ring.sring->rsp_prod);
61550+ }
61551+ return 0;
61552+ }
61553+ case BLKTAP_IOCTL_SENDPID:
61554+ {
61555+ if (info) {
61556+ info->pid = (pid_t)arg;
61557+ DPRINTK("blktap: pid received %d\n",
61558+ info->pid);
61559+ }
61560+ return 0;
61561+ }
61562+ case BLKTAP_IOCTL_NEWINTF:
61563+ {
61564+ uint64_t val = (uint64_t)arg;
61565+ domid_translate_t *tr = (domid_translate_t *)&val;
61566+
61567+ DPRINTK("NEWINTF Req for domid %d and bus id %d\n",
61568+ tr->domid, tr->busid);
61569+ info = get_next_free_dev();
61570+ if (!info) {
61571+ WPRINTK("Error initialising /dev/xen/blktap - "
61572+ "No more devices\n");
61573+ return -1;
61574+ }
61575+ info->trans.domid = tr->domid;
61576+ info->trans.busid = tr->busid;
61577+ return info->minor;
61578+ }
61579+ case BLKTAP_IOCTL_FREEINTF:
61580+ {
61581+ unsigned long dev = arg;
61582+ unsigned long flags;
61583+
61584+		info = (dev > MAX_TAP_DEV) ? NULL : tapfds[dev];
61585+
61586+		if (!info)
61587+ return 0; /* should this be an error? */
61588+
61589+ spin_lock_irqsave(&pending_free_lock, flags);
61590+ if (info->dev_pending)
61591+ info->dev_pending = 0;
61592+ spin_unlock_irqrestore(&pending_free_lock, flags);
61593+
61594+ return 0;
61595+ }
61596+ case BLKTAP_IOCTL_MINOR:
61597+ {
61598+ unsigned long dev = arg;
61599+
61600+		info = (dev > MAX_TAP_DEV) ? NULL : tapfds[dev];
61601+
61602+		if (!info)
61603+ return -EINVAL;
61604+
61605+ return info->minor;
61606+ }
61607+ case BLKTAP_IOCTL_MAJOR:
61608+ return blktap_major;
61609+
61610+ case BLKTAP_QUERY_ALLOC_REQS:
61611+ {
61612+ WPRINTK("BLKTAP_QUERY_ALLOC_REQS ioctl: %d/%d\n",
61613+ alloc_pending_reqs, blkif_reqs);
61614+		return (alloc_pending_reqs * 100) / blkif_reqs;
61615+ }
61616+ }
61617+ return -ENOIOCTLCMD;
61618+}
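/*
 * Editorial sketch (not part of the patch): the BLKTAP_IOCTL_NEWINTF case
 * above reinterprets the ioctl argument as a domid_translate_t, i.e. the
 * management tool packs the frontend domid and the xenbus id into the single
 * argument word and gets the new device's minor number back.  The caller
 * below is only an illustration: the ioctl request code is passed in rather
 * than hard-coded, and the two 16-bit field layout is an assumption based on
 * how the handler unpacks it.
 */
#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>

struct domid_translate_demo {	/* assumed layout of domid_translate_t */
	uint16_t domid;
	uint16_t busid;
};

/* Returns the minor of the new /dev/xen/blktapN on success, negative on error. */
int new_tap_interface(int ctrl_fd, unsigned long newintf_cmd,
		      uint16_t domid, uint16_t busid)
{
	struct domid_translate_demo tr = { .domid = domid, .busid = busid };
	unsigned long arg = 0;

	memcpy(&arg, &tr, sizeof(tr));	/* pack both ids into the argument word */
	return ioctl(ctrl_fd, newintf_cmd, arg);
}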
61619+
61620+static unsigned int blktap_poll(struct file *filp, poll_table *wait)
61621+{
61622+ tap_blkif_t *info = filp->private_data;
61623+
61624+ /* do not work on the control device */
61625+ if (!info)
61626+ return 0;
61627+
61628+ poll_wait(filp, &info->wait, wait);
61629+ if (info->ufe_ring.req_prod_pvt != info->ufe_ring.sring->req_prod) {
61630+ RING_PUSH_REQUESTS(&info->ufe_ring);
61631+ return POLLIN | POLLRDNORM;
61632+ }
61633+ return 0;
61634+}
61635+
61636+void blktap_kick_user(int idx)
61637+{
61638+ tap_blkif_t *info;
61639+
61640+	info = (idx < 0 || idx > MAX_TAP_DEV) ? NULL : tapfds[idx];
61641+
61642+	if (!info)
61643+ return;
61644+
61645+ wake_up_interruptible(&info->wait);
61646+
61647+ return;
61648+}
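/*
 * Editorial sketch (not part of the patch): blktap_poll() above reports
 * POLLIN once new requests have been pushed onto the user-space ring, and
 * blktap_read_ufe_ring() (further down) only reaps the responses when user
 * space issues BLKTAP_IOCTL_KICK_FE.  The tapdisk main loop is therefore
 * essentially "poll, service the ring, kick".  In the sketch below the ring
 * handling is hidden behind a callback and the ioctl code is passed in, so
 * nothing beyond that structure is claimed about the real daemon.
 */
#include <poll.h>
#include <sys/ioctl.h>

void tap_event_loop(int fd, unsigned long kick_cmd,
		    void (*service_ring)(int fd), volatile int *stop)
{
	struct pollfd pfd = { .fd = fd, .events = POLLIN | POLLRDNORM };

	while (!*stop) {
		if (poll(&pfd, 1, 1000) <= 0)
			continue;			/* timeout or EINTR: retry */
		if (pfd.revents & (POLLIN | POLLRDNORM)) {
			service_ring(fd);		/* consume requests, queue responses */
			ioctl(fd, kick_cmd, 0);		/* let the driver reap them */
		}
	}
}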
61649+
61650+static int do_block_io_op(blkif_t *blkif);
61651+static void dispatch_rw_block_io(blkif_t *blkif,
61652+ blkif_request_t *req,
61653+ pending_req_t *pending_req);
61654+static void make_response(blkif_t *blkif, unsigned long id,
61655+ unsigned short op, int st);
61656+
61657+/******************************************************************
61658+ * misc small helpers
61659+ */
61660+static int req_increase(void)
61661+{
61662+ int i, j;
61663+
61664+ if (mmap_alloc >= MAX_PENDING_REQS || mmap_lock)
61665+ return -EINVAL;
61666+
61667+ pending_reqs[mmap_alloc] = kzalloc(sizeof(pending_req_t)
61668+ * blkif_reqs, GFP_KERNEL);
61669+ foreign_pages[mmap_alloc] = alloc_empty_pages_and_pagevec(mmap_pages);
61670+
61671+ if (!pending_reqs[mmap_alloc] || !foreign_pages[mmap_alloc])
61672+ goto out_of_memory;
61673+
61674+ DPRINTK("%s: reqs=%d, pages=%d\n",
61675+ __FUNCTION__, blkif_reqs, mmap_pages);
61676+
61677+ for (i = 0; i < MAX_PENDING_REQS; i++) {
61678+ list_add_tail(&pending_reqs[mmap_alloc][i].free_list,
61679+ &pending_free);
61680+ pending_reqs[mmap_alloc][i].mem_idx = mmap_alloc;
61681+ for (j = 0; j < BLKIF_MAX_SEGMENTS_PER_REQUEST; j++)
61682+ BLKTAP_INVALIDATE_HANDLE(&pending_handle(mmap_alloc,
61683+ i, j));
61684+ }
61685+
61686+ mmap_alloc++;
61687+ DPRINTK("# MMAPs increased to %d\n",mmap_alloc);
61688+ return 0;
61689+
61690+ out_of_memory:
61691+ free_empty_pages_and_pagevec(foreign_pages[mmap_alloc], mmap_pages);
61692+ kfree(pending_reqs[mmap_alloc]);
61693+ WPRINTK("%s: out of memory\n", __FUNCTION__);
61694+ return -ENOMEM;
61695+}
61696+
61697+static void mmap_req_del(int mmap)
61698+{
61699+ BUG_ON(!spin_is_locked(&pending_free_lock));
61700+
61701+ kfree(pending_reqs[mmap]);
61702+ pending_reqs[mmap] = NULL;
61703+
61704+	free_empty_pages_and_pagevec(foreign_pages[mmap], mmap_pages);
61705+ foreign_pages[mmap] = NULL;
61706+
61707+ mmap_lock = 0;
61708+ DPRINTK("# MMAPs decreased to %d\n",mmap_alloc);
61709+ mmap_alloc--;
61710+}
61711+
61712+static pending_req_t* alloc_req(void)
61713+{
61714+ pending_req_t *req = NULL;
61715+ unsigned long flags;
61716+
61717+ spin_lock_irqsave(&pending_free_lock, flags);
61718+
61719+ if (!list_empty(&pending_free)) {
61720+ req = list_entry(pending_free.next, pending_req_t, free_list);
61721+ list_del(&req->free_list);
61722+ }
61723+
61724+ if (req) {
61725+ req->inuse = 1;
61726+ alloc_pending_reqs++;
61727+ }
61728+ spin_unlock_irqrestore(&pending_free_lock, flags);
61729+
61730+ return req;
61731+}
61732+
61733+static void free_req(pending_req_t *req)
61734+{
61735+ unsigned long flags;
61736+ int was_empty;
61737+
61738+ spin_lock_irqsave(&pending_free_lock, flags);
61739+
61740+ alloc_pending_reqs--;
61741+ req->inuse = 0;
61742+ if (mmap_lock && (req->mem_idx == mmap_alloc-1)) {
61743+ mmap_inuse--;
61744+ if (mmap_inuse == 0) mmap_req_del(mmap_alloc-1);
61745+ spin_unlock_irqrestore(&pending_free_lock, flags);
61746+ return;
61747+ }
61748+ was_empty = list_empty(&pending_free);
61749+ list_add(&req->free_list, &pending_free);
61750+
61751+ spin_unlock_irqrestore(&pending_free_lock, flags);
61752+
61753+ if (was_empty)
61754+ wake_up(&pending_free_wq);
61755+}
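/*
 * Editorial sketch (not part of the patch): alloc_req()/free_req() above keep
 * the request descriptors on a spinlock-protected free list, and free_req()
 * wakes the scheduler thread only when the list goes from empty to non-empty,
 * so a burst of completions does not cost one wake-up per request.  The
 * user-space analogue below (a mutex and condition variable standing in for
 * the spinlock and waitqueue) only demonstrates that wake-on-transition idea.
 */
#include <pthread.h>
#include <stddef.h>

struct free_node { struct free_node *next; };

static struct free_node *free_head;
static pthread_mutex_t free_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t free_nonempty = PTHREAD_COND_INITIALIZER;

struct free_node *demo_alloc(void)
{
	struct free_node *n;

	pthread_mutex_lock(&free_lock);
	while ((n = free_head) == NULL)
		pthread_cond_wait(&free_nonempty, &free_lock);	/* like pending_free_wq */
	free_head = n->next;
	pthread_mutex_unlock(&free_lock);
	return n;
}

void demo_free(struct free_node *n)
{
	int was_empty;

	pthread_mutex_lock(&free_lock);
	was_empty = (free_head == NULL);
	n->next = free_head;
	free_head = n;
	pthread_mutex_unlock(&free_lock);

	if (was_empty)
		pthread_cond_broadcast(&free_nonempty);	/* wake only on empty -> non-empty */
}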
61756+
61757+static void fast_flush_area(pending_req_t *req, int k_idx, int u_idx,
61758+ int tapidx)
61759+{
61760+ struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST*2];
61761+ unsigned int i, invcount = 0;
61762+ struct grant_handle_pair *khandle;
61763+ uint64_t ptep;
61764+ int ret, mmap_idx;
61765+ unsigned long kvaddr, uvaddr;
61766+ tap_blkif_t *info;
61767+
61768+
61769+	info = (tapidx < 0 || tapidx > MAX_TAP_DEV) ? NULL : tapfds[tapidx];
61770+
61771+	if (!info) {
61772+ WPRINTK("fast_flush: Couldn't get info!\n");
61773+ return;
61774+ }
61775+
61776+ if (info->vma != NULL &&
61777+ xen_feature(XENFEAT_auto_translated_physmap)) {
61778+ down_write(&info->vma->vm_mm->mmap_sem);
61779+ zap_page_range(info->vma,
61780+ MMAP_VADDR(info->user_vstart, u_idx, 0),
61781+ req->nr_pages << PAGE_SHIFT, NULL);
61782+ up_write(&info->vma->vm_mm->mmap_sem);
61783+ }
61784+
61785+ mmap_idx = req->mem_idx;
61786+
61787+ for (i = 0; i < req->nr_pages; i++) {
61788+ kvaddr = idx_to_kaddr(mmap_idx, k_idx, i);
61789+ uvaddr = MMAP_VADDR(info->user_vstart, u_idx, i);
61790+
61791+ khandle = &pending_handle(mmap_idx, k_idx, i);
61792+
61793+ if (khandle->kernel != INVALID_GRANT_HANDLE) {
61794+ gnttab_set_unmap_op(&unmap[invcount],
61795+ idx_to_kaddr(mmap_idx, k_idx, i),
61796+ GNTMAP_host_map, khandle->kernel);
61797+ invcount++;
61798+ }
61799+
61800+ if (khandle->user != INVALID_GRANT_HANDLE) {
61801+ BUG_ON(xen_feature(XENFEAT_auto_translated_physmap));
61802+ if (create_lookup_pte_addr(
61803+ info->vma->vm_mm,
61804+ MMAP_VADDR(info->user_vstart, u_idx, i),
61805+ &ptep) !=0) {
61806+ WPRINTK("Couldn't get a pte addr!\n");
61807+ return;
61808+ }
61809+
61810+ gnttab_set_unmap_op(&unmap[invcount], ptep,
61811+ GNTMAP_host_map
61812+ | GNTMAP_application_map
61813+ | GNTMAP_contains_pte,
61814+ khandle->user);
61815+ invcount++;
61816+ }
61817+
61818+ BLKTAP_INVALIDATE_HANDLE(khandle);
61819+ }
61820+ ret = HYPERVISOR_grant_table_op(
61821+ GNTTABOP_unmap_grant_ref, unmap, invcount);
61822+ BUG_ON(ret);
61823+
61824+ if (info->vma != NULL && !xen_feature(XENFEAT_auto_translated_physmap))
61825+ zap_page_range(info->vma,
61826+ MMAP_VADDR(info->user_vstart, u_idx, 0),
61827+ req->nr_pages << PAGE_SHIFT, NULL);
61828+}
61829+
61830+/******************************************************************
61831+ * SCHEDULER FUNCTIONS
61832+ */
61833+
61834+static void print_stats(blkif_t *blkif)
61835+{
61836+ printk(KERN_DEBUG "%s: oo %3d | rd %4d | wr %4d\n",
61837+ current->comm, blkif->st_oo_req,
61838+ blkif->st_rd_req, blkif->st_wr_req);
61839+ blkif->st_print = jiffies + msecs_to_jiffies(10 * 1000);
61840+ blkif->st_rd_req = 0;
61841+ blkif->st_wr_req = 0;
61842+ blkif->st_oo_req = 0;
61843+}
61844+
61845+int tap_blkif_schedule(void *arg)
61846+{
61847+ blkif_t *blkif = arg;
61848+
61849+ blkif_get(blkif);
61850+
61851+ if (debug_lvl)
61852+ printk(KERN_DEBUG "%s: started\n", current->comm);
61853+
61854+ while (!kthread_should_stop()) {
61855+ wait_event_interruptible(
61856+ blkif->wq,
61857+ blkif->waiting_reqs || kthread_should_stop());
61858+ wait_event_interruptible(
61859+ pending_free_wq,
61860+ !list_empty(&pending_free) || kthread_should_stop());
61861+
61862+ blkif->waiting_reqs = 0;
61863+ smp_mb(); /* clear flag *before* checking for work */
61864+
61865+ if (do_block_io_op(blkif))
61866+ blkif->waiting_reqs = 1;
61867+
61868+ if (log_stats && time_after(jiffies, blkif->st_print))
61869+ print_stats(blkif);
61870+ }
61871+
61872+ if (log_stats)
61873+ print_stats(blkif);
61874+ if (debug_lvl)
61875+ printk(KERN_DEBUG "%s: exiting\n", current->comm);
61876+
61877+ blkif->xenblkd = NULL;
61878+ blkif_put(blkif);
61879+
61880+ return 0;
61881+}
61882+
61883+/******************************************************************
61884+ * COMPLETION CALLBACK -- Called by user level ioctl()
61885+ */
61886+
61887+static int blktap_read_ufe_ring(tap_blkif_t *info)
61888+{
61889+ /* This is called to read responses from the UFE ring. */
61890+ RING_IDX i, j, rp;
61891+ blkif_response_t *resp;
61892+ blkif_t *blkif=NULL;
61893+ int pending_idx, usr_idx, mmap_idx;
61894+ pending_req_t *pending_req;
61895+
61896+ if (!info)
61897+ return 0;
61898+
61899+	/* We currently only forward requests in INTERCEPT_FE mode. */
61900+ if (!(info->mode & BLKTAP_MODE_INTERCEPT_FE))
61901+ return 0;
61902+
61903+ /* for each outstanding message on the UFEring */
61904+ rp = info->ufe_ring.sring->rsp_prod;
61905+ rmb();
61906+
61907+ for (i = info->ufe_ring.rsp_cons; i != rp; i++) {
61908+ blkif_response_t res;
61909+ resp = RING_GET_RESPONSE(&info->ufe_ring, i);
61910+ memcpy(&res, resp, sizeof(res));
61911+ mb(); /* rsp_cons read by RING_FULL() in do_block_io_op(). */
61912+ ++info->ufe_ring.rsp_cons;
61913+
61914+ /*retrieve [usr_idx] to [mmap_idx,pending_idx] mapping*/
61915+ usr_idx = (int)res.id;
61916+ pending_idx = MASK_PEND_IDX(ID_TO_IDX(info->idx_map[usr_idx]));
61917+ mmap_idx = ID_TO_MIDX(info->idx_map[usr_idx]);
61918+
61919+ if ( (mmap_idx >= mmap_alloc) ||
61920+ (ID_TO_IDX(info->idx_map[usr_idx]) >= MAX_PENDING_REQS) )
61921+ WPRINTK("Incorrect req map"
61922+ "[%d], internal map [%d,%d (%d)]\n",
61923+ usr_idx, mmap_idx,
61924+ ID_TO_IDX(info->idx_map[usr_idx]),
61925+ MASK_PEND_IDX(
61926+ ID_TO_IDX(info->idx_map[usr_idx])));
61927+
61928+ pending_req = &pending_reqs[mmap_idx][pending_idx];
61929+ blkif = pending_req->blkif;
61930+
61931+ for (j = 0; j < pending_req->nr_pages; j++) {
61932+
61933+ unsigned long kvaddr, uvaddr;
61934+ struct page **map = info->vma->vm_private_data;
61935+ struct page *pg;
61936+ int offset;
61937+
61938+ uvaddr = MMAP_VADDR(info->user_vstart, usr_idx, j);
61939+ kvaddr = idx_to_kaddr(mmap_idx, pending_idx, j);
61940+
61941+ pg = pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT);
61942+ ClearPageReserved(pg);
61943+ offset = (uvaddr - info->vma->vm_start)
61944+ >> PAGE_SHIFT;
61945+ map[offset] = NULL;
61946+ }
61947+ fast_flush_area(pending_req, pending_idx, usr_idx, info->minor);
61948+ info->idx_map[usr_idx] = INVALID_REQ;
61949+ make_response(blkif, pending_req->id, res.operation,
61950+ res.status);
61951+ blkif_put(pending_req->blkif);
61952+ free_req(pending_req);
61953+ }
61954+
61955+ return 0;
61956+}
61957+
61958+
61959+/******************************************************************************
61960+ * NOTIFICATION FROM GUEST OS.
61961+ */
61962+
61963+static void blkif_notify_work(blkif_t *blkif)
61964+{
61965+ blkif->waiting_reqs = 1;
61966+ wake_up(&blkif->wq);
61967+}
61968+
61969+irqreturn_t tap_blkif_be_int(int irq, void *dev_id, struct pt_regs *regs)
61970+{
61971+ blkif_notify_work(dev_id);
61972+ return IRQ_HANDLED;
61973+}
61974+
61975+
61976+
61977+/******************************************************************
61978+ * DOWNWARD CALLS -- These interface with the block-device layer proper.
61979+ */
61980+static int print_dbug = 1;
61981+static int do_block_io_op(blkif_t *blkif)
61982+{
61983+ blkif_back_ring_t *blk_ring = &blkif->blk_ring;
61984+ blkif_request_t req;
61985+ pending_req_t *pending_req;
61986+ RING_IDX rc, rp;
61987+ int more_to_do = 0;
61988+ tap_blkif_t *info;
61989+
61990+ rc = blk_ring->req_cons;
61991+ rp = blk_ring->sring->req_prod;
61992+ rmb(); /* Ensure we see queued requests up to 'rp'. */
61993+
61994+ /*Check blkif has corresponding UE ring*/
61995+ if (blkif->dev_num < 0) {
61996+ /*oops*/
61997+ if (print_dbug) {
61998+ WPRINTK("Corresponding UE "
61999+ "ring does not exist!\n");
62000+ print_dbug = 0; /*We only print this message once*/
62001+ }
62002+ return 0;
62003+ }
62004+
62005+	info = (blkif->dev_num > MAX_TAP_DEV) ? NULL : tapfds[blkif->dev_num];
62006+
62007+	if (!info || !info->dev_inuse) {
62008+ if (print_dbug) {
62009+ WPRINTK("Can't get UE info!\n");
62010+ print_dbug = 0;
62011+ }
62012+ return 0;
62013+ }
62014+
62015+ while (rc != rp) {
62016+
62017+ if (RING_FULL(&info->ufe_ring)) {
62018+ WPRINTK("RING_FULL! More to do\n");
62019+ more_to_do = 1;
62020+ break;
62021+ }
62022+
62023+ if (RING_REQUEST_CONS_OVERFLOW(blk_ring, rc)) {
62024+ WPRINTK("RING_REQUEST_CONS_OVERFLOW!"
62025+ " More to do\n");
62026+ more_to_do = 1;
62027+ break;
62028+ }
62029+
62030+ pending_req = alloc_req();
62031+ if (NULL == pending_req) {
62032+ blkif->st_oo_req++;
62033+ more_to_do = 1;
62034+ break;
62035+ }
62036+
62037+ memcpy(&req, RING_GET_REQUEST(blk_ring, rc), sizeof(req));
62038+ blk_ring->req_cons = ++rc; /* before make_response() */
62039+
62040+ switch (req.operation) {
62041+ case BLKIF_OP_READ:
62042+ blkif->st_rd_req++;
62043+ dispatch_rw_block_io(blkif, &req, pending_req);
62044+ break;
62045+
62046+ case BLKIF_OP_WRITE:
62047+ blkif->st_wr_req++;
62048+ dispatch_rw_block_io(blkif, &req, pending_req);
62049+ break;
62050+
62051+ default:
62052+ WPRINTK("unknown operation [%d]\n",
62053+ req.operation);
62054+ make_response(blkif, req.id, req.operation,
62055+ BLKIF_RSP_ERROR);
62056+ free_req(pending_req);
62057+ break;
62058+ }
62059+ }
62060+
62061+ blktap_kick_user(blkif->dev_num);
62062+
62063+ return more_to_do;
62064+}
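/*
 * Editorial sketch (not part of the patch): do_block_io_op() above consumes a
 * Xen shared ring in the usual way -- snapshot the producer index, issue a
 * read barrier so the request payloads are observed no older than that index,
 * then walk the private consumer index up to the snapshot.  A minimal
 * single-producer/single-consumer analogue of that consume side, using a
 * compiler/CPU acquire fence where the kernel code uses rmb():
 */
#include <stdint.h>

#define DEMO_RING_SIZE 32		/* power of two, like the blkif rings */

struct demo_ring {
	volatile uint32_t req_prod;	/* advanced by the producer */
	uint32_t req_cons;		/* private to the consumer */
	int slots[DEMO_RING_SIZE];
};

/* Returns the number of requests handled in this pass. */
int demo_consume(struct demo_ring *r, void (*handle)(int payload))
{
	uint32_t rp = r->req_prod;
	int n = 0;

	__atomic_thread_fence(__ATOMIC_ACQUIRE);	/* rmb(): payloads up to rp are visible */

	while (r->req_cons != rp) {
		handle(r->slots[r->req_cons % DEMO_RING_SIZE]);
		r->req_cons++;
		n++;
	}
	return n;
}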
62065+
62066+static void dispatch_rw_block_io(blkif_t *blkif,
62067+ blkif_request_t *req,
62068+ pending_req_t *pending_req)
62069+{
62070+ extern void ll_rw_block(int rw, int nr, struct buffer_head * bhs[]);
62071+ int op, operation = (req->operation == BLKIF_OP_WRITE) ? WRITE : READ;
62072+ struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST*2];
62073+ unsigned int nseg;
62074+ int ret, i;
62075+ tap_blkif_t *info;
62076+ uint64_t sector;
62077+ blkif_request_t *target;
62078+ int pending_idx = RTN_PEND_IDX(pending_req,pending_req->mem_idx);
62079+ int usr_idx;
62080+ uint16_t mmap_idx = pending_req->mem_idx;
62081+
62082+ if (blkif->dev_num < 0 || blkif->dev_num > MAX_TAP_DEV)
62083+ goto fail_response;
62084+
62085+ info = tapfds[blkif->dev_num];
62086+ if (info == NULL)
62087+ goto fail_response;
62088+
62089+ /* Check we have space on user ring - should never fail. */
62090+ usr_idx = GET_NEXT_REQ(info->idx_map);
62091+ if (usr_idx == INVALID_REQ) {
62092+ BUG();
62093+ goto fail_response;
62094+ }
62095+
62096+ /* Check that number of segments is sane. */
62097+ nseg = req->nr_segments;
62098+ if ( unlikely(nseg == 0) ||
62099+ unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST) ) {
62100+ WPRINTK("Bad number of segments in request (%d)\n", nseg);
62101+ goto fail_response;
62102+ }
62103+
62104+ /* Make sure userspace is ready. */
62105+ if (!info->ring_ok) {
62106+ WPRINTK("blktap: ring not ready for requests!\n");
62107+ goto fail_response;
62108+ }
62109+
62110+ if (RING_FULL(&info->ufe_ring)) {
62111+		WPRINTK("blktap: fe_ring is full, can't add request; "
62112+			"IO will be dropped. %d %d\n",
62113+ RING_SIZE(&info->ufe_ring),
62114+ RING_SIZE(&blkif->blk_ring));
62115+ goto fail_response;
62116+ }
62117+
62118+ pending_req->blkif = blkif;
62119+ pending_req->id = req->id;
62120+ pending_req->operation = operation;
62121+ pending_req->status = BLKIF_RSP_OKAY;
62122+ pending_req->nr_pages = nseg;
62123+ op = 0;
62124+ for (i = 0; i < nseg; i++) {
62125+ unsigned long uvaddr;
62126+ unsigned long kvaddr;
62127+ uint64_t ptep;
62128+ uint32_t flags;
62129+
62130+ uvaddr = MMAP_VADDR(info->user_vstart, usr_idx, i);
62131+ kvaddr = idx_to_kaddr(mmap_idx, pending_idx, i);
62132+
62133+ sector = req->sector_number + ((PAGE_SIZE / 512) * i);
62134+ if( (blkif->sectors > 0) && (sector >= blkif->sectors) ) {
62135+			WPRINTK("BLKTAP: Sector request greater "
62136+				"than size\n");
62137+			WPRINTK("BLKTAP: %s request sector "
62138+				"[%llu,%llu], Total [%llu]\n",
62139+ (req->operation ==
62140+ BLKIF_OP_WRITE ? "WRITE" : "READ"),
62141+ (long long unsigned) sector,
62142+ (long long unsigned) sector>>9,
62143+ (long long unsigned) blkif->sectors);
62144+ }
62145+
62146+ flags = GNTMAP_host_map;
62147+ if (operation == WRITE)
62148+ flags |= GNTMAP_readonly;
62149+ gnttab_set_map_op(&map[op], kvaddr, flags,
62150+ req->seg[i].gref, blkif->domid);
62151+ op++;
62152+
62153+ if (!xen_feature(XENFEAT_auto_translated_physmap)) {
62154+ /* Now map it to user. */
62155+ ret = create_lookup_pte_addr(info->vma->vm_mm,
62156+ uvaddr, &ptep);
62157+ if (ret) {
62158+ WPRINTK("Couldn't get a pte addr!\n");
62159+ goto fail_flush;
62160+ }
62161+
62162+ flags = GNTMAP_host_map | GNTMAP_application_map
62163+ | GNTMAP_contains_pte;
62164+ if (operation == WRITE)
62165+ flags |= GNTMAP_readonly;
62166+ gnttab_set_map_op(&map[op], ptep, flags,
62167+ req->seg[i].gref, blkif->domid);
62168+ op++;
62169+ }
62170+ }
62171+
62172+ ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, map, op);
62173+ BUG_ON(ret);
62174+
62175+ if (!xen_feature(XENFEAT_auto_translated_physmap)) {
62176+ for (i = 0; i < (nseg*2); i+=2) {
62177+ unsigned long uvaddr;
62178+ unsigned long kvaddr;
62179+ unsigned long offset;
62180+ struct page *pg;
62181+
62182+ uvaddr = MMAP_VADDR(info->user_vstart, usr_idx, i/2);
62183+ kvaddr = idx_to_kaddr(mmap_idx, pending_idx, i/2);
62184+
62185+ if (unlikely(map[i].status != 0)) {
62186+ WPRINTK("invalid kernel buffer -- "
62187+ "could not remap it\n");
62188+ ret |= 1;
62189+ map[i].handle = INVALID_GRANT_HANDLE;
62190+ }
62191+
62192+ if (unlikely(map[i+1].status != 0)) {
62193+ WPRINTK("invalid user buffer -- "
62194+ "could not remap it\n");
62195+ ret |= 1;
62196+ map[i+1].handle = INVALID_GRANT_HANDLE;
62197+ }
62198+
62199+ pending_handle(mmap_idx, pending_idx, i/2).kernel
62200+ = map[i].handle;
62201+ pending_handle(mmap_idx, pending_idx, i/2).user
62202+ = map[i+1].handle;
62203+
62204+ if (ret)
62205+ continue;
62206+
62207+ set_phys_to_machine(__pa(kvaddr) >> PAGE_SHIFT,
62208+ FOREIGN_FRAME(map[i].dev_bus_addr
62209+ >> PAGE_SHIFT));
62210+ offset = (uvaddr - info->vma->vm_start) >> PAGE_SHIFT;
62211+ pg = pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT);
62212+ ((struct page **)info->vma->vm_private_data)[offset] =
62213+ pg;
62214+ }
62215+ } else {
62216+ for (i = 0; i < nseg; i++) {
62217+ unsigned long uvaddr;
62218+ unsigned long kvaddr;
62219+ unsigned long offset;
62220+ struct page *pg;
62221+
62222+ uvaddr = MMAP_VADDR(info->user_vstart, usr_idx, i);
62223+ kvaddr = idx_to_kaddr(mmap_idx, pending_idx, i);
62224+
62225+ if (unlikely(map[i].status != 0)) {
62226+ WPRINTK("invalid kernel buffer -- "
62227+ "could not remap it\n");
62228+ ret |= 1;
62229+ map[i].handle = INVALID_GRANT_HANDLE;
62230+ }
62231+
62232+ pending_handle(mmap_idx, pending_idx, i).kernel
62233+ = map[i].handle;
62234+
62235+ if (ret)
62236+ continue;
62237+
62238+ offset = (uvaddr - info->vma->vm_start) >> PAGE_SHIFT;
62239+ pg = pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT);
62240+ ((struct page **)info->vma->vm_private_data)[offset] =
62241+ pg;
62242+ }
62243+ }
62244+
62245+ if (ret)
62246+ goto fail_flush;
62247+
62248+ if (xen_feature(XENFEAT_auto_translated_physmap))
62249+ down_write(&info->vma->vm_mm->mmap_sem);
62250+ /* Mark mapped pages as reserved: */
62251+ for (i = 0; i < req->nr_segments; i++) {
62252+ unsigned long kvaddr;
62253+ struct page *pg;
62254+
62255+ kvaddr = idx_to_kaddr(mmap_idx, pending_idx, i);
62256+ pg = pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT);
62257+ SetPageReserved(pg);
62258+ if (xen_feature(XENFEAT_auto_translated_physmap)) {
62259+ ret = vm_insert_page(info->vma,
62260+ MMAP_VADDR(info->user_vstart,
62261+ usr_idx, i), pg);
62262+ if (ret) {
62263+ up_write(&info->vma->vm_mm->mmap_sem);
62264+ goto fail_flush;
62265+ }
62266+ }
62267+ }
62268+ if (xen_feature(XENFEAT_auto_translated_physmap))
62269+ up_write(&info->vma->vm_mm->mmap_sem);
62270+
62271+ /*record [mmap_idx,pending_idx] to [usr_idx] mapping*/
62272+ info->idx_map[usr_idx] = MAKE_ID(mmap_idx, pending_idx);
62273+
62274+ blkif_get(blkif);
62275+ /* Finally, write the request message to the user ring. */
62276+ target = RING_GET_REQUEST(&info->ufe_ring,
62277+ info->ufe_ring.req_prod_pvt);
62278+ memcpy(target, req, sizeof(*req));
62279+ target->id = usr_idx;
62280+ wmb(); /* blktap_poll() reads req_prod_pvt asynchronously */
62281+ info->ufe_ring.req_prod_pvt++;
62282+ return;
62283+
62284+ fail_flush:
62285+ WPRINTK("Reached Fail_flush\n");
62286+ fast_flush_area(pending_req, pending_idx, usr_idx, blkif->dev_num);
62287+ fail_response:
62288+ make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR);
62289+ free_req(pending_req);
62290+}
62291+
62292+
62293+
62294+/******************************************************************
62295+ * MISCELLANEOUS SETUP / TEARDOWN / DEBUGGING
62296+ */
62297+
62298+
62299+static void make_response(blkif_t *blkif, unsigned long id,
62300+ unsigned short op, int st)
62301+{
62302+ blkif_response_t *resp;
62303+ unsigned long flags;
62304+ blkif_back_ring_t *blk_ring = &blkif->blk_ring;
62305+ int more_to_do = 0;
62306+ int notify;
62307+
62308+ spin_lock_irqsave(&blkif->blk_ring_lock, flags);
62309+ /* Place on the response ring for the relevant domain. */
62310+ resp = RING_GET_RESPONSE(blk_ring, blk_ring->rsp_prod_pvt);
62311+ resp->id = id;
62312+ resp->operation = op;
62313+ resp->status = st;
62314+ blk_ring->rsp_prod_pvt++;
62315+ RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(blk_ring, notify);
62316+
62317+ if (blk_ring->rsp_prod_pvt == blk_ring->req_cons) {
62318+ /*
62319+ * Tail check for pending requests. Allows frontend to avoid
62320+ * notifications if requests are already in flight (lower
62321+ * overheads and promotes batching).
62322+ */
62323+ RING_FINAL_CHECK_FOR_REQUESTS(blk_ring, more_to_do);
62324+ } else if (RING_HAS_UNCONSUMED_REQUESTS(blk_ring)) {
62325+ more_to_do = 1;
62326+
62327+ }
62328+ spin_unlock_irqrestore(&blkif->blk_ring_lock, flags);
62329+ if (more_to_do)
62330+ blkif_notify_work(blkif);
62331+ if (notify)
62332+ notify_remote_via_irq(blkif->irq);
62333+}
62334+
62335+static int __init blkif_init(void)
62336+{
62337+ int i,ret,blktap_dir;
62338+
62339+ if (!is_running_on_xen())
62340+ return -ENODEV;
62341+
62342+ INIT_LIST_HEAD(&pending_free);
62343+ for(i = 0; i < 2; i++) {
62344+ ret = req_increase();
62345+ if (ret)
62346+ break;
62347+ }
62348+ if (i == 0)
62349+ return ret;
62350+
62351+ tap_blkif_interface_init();
62352+
62353+ alloc_pending_reqs = 0;
62354+
62355+ tap_blkif_xenbus_init();
62356+
62357+ /* Dynamically allocate a major for this device */
62358+ ret = register_chrdev(0, "blktap", &blktap_fops);
62359+ blktap_dir = devfs_mk_dir(NULL, "xen", 0, NULL);
62360+
62361+ if ( (ret < 0)||(blktap_dir < 0) ) {
62362+ WPRINTK("Couldn't register /dev/xen/blktap\n");
62363+ return -ENOMEM;
62364+ }
62365+
62366+ blktap_major = ret;
62367+
62368+ /* tapfds[0] is always NULL */
62369+ blktap_next_minor++;
62370+
62371+ ret = devfs_mk_cdev(MKDEV(blktap_major, i),
62372+ S_IFCHR|S_IRUGO|S_IWUSR, "xen/blktap%d", i);
62373+
62374+ if(ret != 0)
62375+ return -ENOMEM;
62376+
62377+ DPRINTK("Created misc_dev [/dev/xen/blktap%d]\n",i);
62378+
62379+ /* Make sure the xen class exists */
62380+ if (!setup_xen_class()) {
62381+ /*
62382+ * This will allow udev to create the blktap ctrl device.
62383+ * We only want to create blktap0 first. We don't want
62384+ * to flood the sysfs system with needless blktap devices.
62385+ * We only create the device when a request of a new device is
62386+ * made.
62387+ */
62388+ class_device_create(xen_class, NULL,
62389+ MKDEV(blktap_major, 0), NULL,
62390+ "blktap0");
62391+ } else {
62392+ /* this is bad, but not fatal */
62393+ WPRINTK("blktap: sysfs xen_class not created\n");
62394+ }
62395+
62396+ DPRINTK("Blktap device successfully created\n");
62397+
62398+ return 0;
62399+}
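/*
 * Editorial sketch (not part of the patch): blkif_init() above registers the
 * character device with register_chrdev(0, "blktap", ...), i.e. the major
 * number is allocated dynamically.  Tools can recover it either through the
 * BLKTAP_IOCTL_MAJOR ioctl handled earlier in this file or, as sketched
 * below, by scanning /proc/devices for the "blktap" entry.
 */
#include <stdio.h>
#include <string.h>

int blktap_major_from_proc(void)
{
	char line[128], name[64];
	int major, found = -1;
	FILE *f = fopen("/proc/devices", "r");

	if (!f)
		return -1;
	while (fgets(line, sizeof(line), f)) {
		if (sscanf(line, "%d %63s", &major, name) == 2 &&
		    strcmp(name, "blktap") == 0) {
			found = major;
			break;
		}
	}
	fclose(f);
	return found;
}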
62400+
62401+module_init(blkif_init);
62402+
62403+MODULE_LICENSE("Dual BSD/GPL");
62404diff -Nur linux-2.6.16.33-noxen/drivers/xen/blktap/common.h linux-2.6.16.33/drivers/xen/blktap/common.h
62405--- linux-2.6.16.33-noxen/drivers/xen/blktap/common.h 1970-01-01 00:00:00.000000000 +0000
62406+++ linux-2.6.16.33/drivers/xen/blktap/common.h 2007-01-08 15:00:45.000000000 +0000
62407@@ -0,0 +1,121 @@
62408+/*
62409+ * This program is free software; you can redistribute it and/or
62410+ * modify it under the terms of the GNU General Public License version 2
62411+ * as published by the Free Software Foundation; or, when distributed
62412+ * separately from the Linux kernel or incorporated into other
62413+ * software packages, subject to the following license:
62414+ *
62415+ * Permission is hereby granted, free of charge, to any person obtaining a copy
62416+ * of this source file (the "Software"), to deal in the Software without
62417+ * restriction, including without limitation the rights to use, copy, modify,
62418+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
62419+ * and to permit persons to whom the Software is furnished to do so, subject to
62420+ * the following conditions:
62421+ *
62422+ * The above copyright notice and this permission notice shall be included in
62423+ * all copies or substantial portions of the Software.
62424+ *
62425+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
62426+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
62427+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
62428+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
62429+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
62430+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
62431+ * IN THE SOFTWARE.
62432+ */
62433+
62434+#ifndef __BLKIF__BACKEND__COMMON_H__
62435+#define __BLKIF__BACKEND__COMMON_H__
62436+
62437+#include <linux/config.h>
62438+#include <linux/version.h>
62439+#include <linux/module.h>
62440+#include <linux/interrupt.h>
62441+#include <linux/slab.h>
62442+#include <linux/blkdev.h>
62443+#include <linux/vmalloc.h>
62444+#include <asm/io.h>
62445+#include <asm/setup.h>
62446+#include <asm/pgalloc.h>
62447+#include <xen/evtchn.h>
62448+#include <asm/hypervisor.h>
62449+#include <xen/interface/io/blkif.h>
62450+#include <xen/interface/io/ring.h>
62451+#include <xen/gnttab.h>
62452+#include <xen/driver_util.h>
62453+
62454+#define DPRINTK(_f, _a...) pr_debug("(file=%s, line=%d) " _f, \
62455+ __FILE__ , __LINE__ , ## _a )
62456+
62457+#define WPRINTK(fmt, args...) printk(KERN_WARNING "blk_tap: " fmt, ##args)
62458+
62459+struct backend_info;
62460+
62461+typedef struct blkif_st {
62462+ /* Unique identifier for this interface. */
62463+ domid_t domid;
62464+ unsigned int handle;
62465+ /* Physical parameters of the comms window. */
62466+ unsigned int evtchn;
62467+ unsigned int irq;
62468+ /* Comms information. */
62469+ blkif_back_ring_t blk_ring;
62470+ struct vm_struct *blk_ring_area;
62471+ /* Back pointer to the backend_info. */
62472+ struct backend_info *be;
62473+ /* Private fields. */
62474+ spinlock_t blk_ring_lock;
62475+ atomic_t refcnt;
62476+
62477+ wait_queue_head_t wq;
62478+ struct task_struct *xenblkd;
62479+ unsigned int waiting_reqs;
62480+ request_queue_t *plug;
62481+
62482+ /* statistics */
62483+ unsigned long st_print;
62484+ int st_rd_req;
62485+ int st_wr_req;
62486+ int st_oo_req;
62487+
62488+ wait_queue_head_t waiting_to_free;
62489+
62490+ grant_handle_t shmem_handle;
62491+ grant_ref_t shmem_ref;
62492+
62493+ int dev_num;
62494+ uint64_t sectors;
62495+} blkif_t;
62496+
62497+blkif_t *tap_alloc_blkif(domid_t domid);
62498+void tap_blkif_free(blkif_t *blkif);
62499+int tap_blkif_map(blkif_t *blkif, unsigned long shared_page,
62500+ unsigned int evtchn);
62501+void tap_blkif_unmap(blkif_t *blkif);
62502+
62503+#define blkif_get(_b) (atomic_inc(&(_b)->refcnt))
62504+#define blkif_put(_b) \
62505+ do { \
62506+ if (atomic_dec_and_test(&(_b)->refcnt)) \
62507+ wake_up(&(_b)->waiting_to_free);\
62508+ } while (0)
62509+
62510+
62511+struct phys_req {
62512+ unsigned short dev;
62513+ unsigned short nr_sects;
62514+ struct block_device *bdev;
62515+ blkif_sector_t sector_number;
62516+};
62517+
62518+void tap_blkif_interface_init(void);
62519+
62520+void tap_blkif_xenbus_init(void);
62521+
62522+irqreturn_t tap_blkif_be_int(int irq, void *dev_id, struct pt_regs *regs);
62523+int tap_blkif_schedule(void *arg);
62524+
62525+int dom_to_devid(domid_t domid, int xenbus_id, blkif_t *blkif);
62526+void signal_tapdisk(int idx);
62527+
62528+#endif /* __BLKIF__BACKEND__COMMON_H__ */
62529diff -Nur linux-2.6.16.33-noxen/drivers/xen/blktap/interface.c linux-2.6.16.33/drivers/xen/blktap/interface.c
62530--- linux-2.6.16.33-noxen/drivers/xen/blktap/interface.c 1970-01-01 00:00:00.000000000 +0000
62531+++ linux-2.6.16.33/drivers/xen/blktap/interface.c 2007-01-08 15:00:45.000000000 +0000
62532@@ -0,0 +1,164 @@
62533+/******************************************************************************
62534+ * drivers/xen/blktap/interface.c
62535+ *
62536+ * Block-device interface management.
62537+ *
62538+ * Copyright (c) 2004, Keir Fraser
62539+ *
62540+ * This program is free software; you can redistribute it and/or
62541+ * modify it under the terms of the GNU General Public License version 2
62542+ * as published by the Free Software Foundation; or, when distributed
62543+ * separately from the Linux kernel or incorporated into other
62544+ * software packages, subject to the following license:
62545+ *
62546+ * Permission is hereby granted, free of charge, to any person obtaining a copy
62547+ * of this source file (the "Software"), to deal in the Software without
62548+ * restriction, including without limitation the rights to use, copy, modify,
62549+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
62550+ * and to permit persons to whom the Software is furnished to do so, subject to
62551+ * the following conditions:
62552+ *
62553+ * The above copyright notice and this permission notice shall be included in
62554+ * all copies or substantial portions of the Software.
62555+ *
62556+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
62557+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
62558+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
62559+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
62560+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
62561+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
62562+ * IN THE SOFTWARE.
62563+
62564+ */
62565+
62566+#include "common.h"
62567+#include <xen/evtchn.h>
62568+
62569+static kmem_cache_t *blkif_cachep;
62570+
62571+blkif_t *tap_alloc_blkif(domid_t domid)
62572+{
62573+ blkif_t *blkif;
62574+
62575+ blkif = kmem_cache_alloc(blkif_cachep, GFP_KERNEL);
62576+ if (!blkif)
62577+ return ERR_PTR(-ENOMEM);
62578+
62579+ memset(blkif, 0, sizeof(*blkif));
62580+ blkif->domid = domid;
62581+ spin_lock_init(&blkif->blk_ring_lock);
62582+ atomic_set(&blkif->refcnt, 1);
62583+ init_waitqueue_head(&blkif->wq);
62584+ blkif->st_print = jiffies;
62585+ init_waitqueue_head(&blkif->waiting_to_free);
62586+
62587+ return blkif;
62588+}
62589+
62590+static int map_frontend_page(blkif_t *blkif, unsigned long shared_page)
62591+{
62592+ struct gnttab_map_grant_ref op;
62593+ int ret;
62594+
62595+ gnttab_set_map_op(&op, (unsigned long)blkif->blk_ring_area->addr,
62596+ GNTMAP_host_map, shared_page, blkif->domid);
62597+
62598+ lock_vm_area(blkif->blk_ring_area);
62599+ ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1);
62600+ unlock_vm_area(blkif->blk_ring_area);
62601+ BUG_ON(ret);
62602+
62603+ if (op.status) {
62604+ DPRINTK(" Grant table operation failure !\n");
62605+ return op.status;
62606+ }
62607+
62608+ blkif->shmem_ref = shared_page;
62609+ blkif->shmem_handle = op.handle;
62610+
62611+ return 0;
62612+}
62613+
62614+static void unmap_frontend_page(blkif_t *blkif)
62615+{
62616+ struct gnttab_unmap_grant_ref op;
62617+ int ret;
62618+
62619+ gnttab_set_unmap_op(&op, (unsigned long)blkif->blk_ring_area->addr,
62620+ GNTMAP_host_map, blkif->shmem_handle);
62621+
62622+ lock_vm_area(blkif->blk_ring_area);
62623+ ret = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1);
62624+ unlock_vm_area(blkif->blk_ring_area);
62625+ BUG_ON(ret);
62626+}
62627+
62628+int tap_blkif_map(blkif_t *blkif, unsigned long shared_page,
62629+ unsigned int evtchn)
62630+{
62631+ blkif_sring_t *sring;
62632+ int err;
62633+ struct evtchn_bind_interdomain bind_interdomain;
62634+
62635+ /* Already connected through? */
62636+ if (blkif->irq)
62637+ return 0;
62638+
62639+ if ( (blkif->blk_ring_area = alloc_vm_area(PAGE_SIZE)) == NULL )
62640+ return -ENOMEM;
62641+
62642+ err = map_frontend_page(blkif, shared_page);
62643+ if (err) {
62644+ free_vm_area(blkif->blk_ring_area);
62645+ return err;
62646+ }
62647+
62648+ bind_interdomain.remote_dom = blkif->domid;
62649+ bind_interdomain.remote_port = evtchn;
62650+
62651+ err = HYPERVISOR_event_channel_op(EVTCHNOP_bind_interdomain,
62652+ &bind_interdomain);
62653+ if (err) {
62654+ unmap_frontend_page(blkif);
62655+ free_vm_area(blkif->blk_ring_area);
62656+ return err;
62657+ }
62658+
62659+ blkif->evtchn = bind_interdomain.local_port;
62660+
62661+ sring = (blkif_sring_t *)blkif->blk_ring_area->addr;
62662+ BACK_RING_INIT(&blkif->blk_ring, sring, PAGE_SIZE);
62663+
62664+ blkif->irq = bind_evtchn_to_irqhandler(
62665+ blkif->evtchn, tap_blkif_be_int, 0, "blkif-backend", blkif);
62666+
62667+ return 0;
62668+}
62669+
62670+void tap_blkif_unmap(blkif_t *blkif)
62671+{
62672+ if (blkif->irq) {
62673+ unbind_from_irqhandler(blkif->irq, blkif);
62674+ blkif->irq = 0;
62675+ }
62676+ if (blkif->blk_ring.sring) {
62677+ unmap_frontend_page(blkif);
62678+ free_vm_area(blkif->blk_ring_area);
62679+ blkif->blk_ring.sring = NULL;
62680+ }
62681+}
62682+
62683+void tap_blkif_free(blkif_t *blkif)
62684+{
62685+ atomic_dec(&blkif->refcnt);
62686+ wait_event(blkif->waiting_to_free, atomic_read(&blkif->refcnt) == 0);
62687+
62688+ tap_blkif_unmap(blkif);
62689+ kmem_cache_free(blkif_cachep, blkif);
62690+}
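/*
 * Editorial sketch (not part of the patch): tap_blkif_free() above drops the
 * reference taken in tap_alloc_blkif() and then sleeps on waiting_to_free
 * until every outstanding blkif_put() (see common.h) has brought the count
 * to zero.  The same "drop the creator's ref, wait for the rest" teardown
 * pattern, rendered with C11 atomics and a condition variable purely for
 * illustration:
 */
#include <pthread.h>
#include <stdatomic.h>

struct demo_obj {
	atomic_int refcnt;			/* starts at 1 for the creator */
	pthread_mutex_t lock;
	pthread_cond_t all_gone;
};

void demo_put(struct demo_obj *o)
{
	if (atomic_fetch_sub(&o->refcnt, 1) == 1) {	/* we dropped the last ref */
		pthread_mutex_lock(&o->lock);
		pthread_cond_signal(&o->all_gone);
		pthread_mutex_unlock(&o->lock);
	}
}

void demo_free(struct demo_obj *o)
{
	demo_put(o);				/* give up the creator's reference */

	pthread_mutex_lock(&o->lock);
	while (atomic_load(&o->refcnt) != 0)
		pthread_cond_wait(&o->all_gone, &o->lock);
	pthread_mutex_unlock(&o->lock);
	/* no user left: safe to tear the object down here */
}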
62691+
62692+void __init tap_blkif_interface_init(void)
62693+{
62694+ blkif_cachep = kmem_cache_create("blktapif_cache", sizeof(blkif_t),
62695+ 0, 0, NULL, NULL);
62696+}
62697diff -Nur linux-2.6.16.33-noxen/drivers/xen/blktap/xenbus.c linux-2.6.16.33/drivers/xen/blktap/xenbus.c
62698--- linux-2.6.16.33-noxen/drivers/xen/blktap/xenbus.c 1970-01-01 00:00:00.000000000 +0000
62699+++ linux-2.6.16.33/drivers/xen/blktap/xenbus.c 2007-01-08 15:00:45.000000000 +0000
62700@@ -0,0 +1,366 @@
62701+/* drivers/xen/blktap/xenbus.c
62702+ *
62703+ * Xenbus code for blktap
62704+ *
62705+ * Copyright (c) 2004-2005, Andrew Warfield and Julian Chesterfield
62706+ *
62707+ * Based on the blkback xenbus code:
62708+ *
62709+ * Copyright (C) 2005 Rusty Russell <rusty@rustcorp.com.au>
62710+ * Copyright (C) 2005 XenSource Ltd
62711+ *
62712+ * This program is free software; you can redistribute it and/or
62713+ * modify it under the terms of the GNU General Public License version 2
62714+ * as published by the Free Software Foundation; or, when distributed
62715+ * separately from the Linux kernel or incorporated into other
62716+ * software packages, subject to the following license:
62717+ *
62718+ * Permission is hereby granted, free of charge, to any person obtaining a copy
62719+ * of this source file (the "Software"), to deal in the Software without
62720+ * restriction, including without limitation the rights to use, copy, modify,
62721+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
62722+ * and to permit persons to whom the Software is furnished to do so, subject to
62723+ * the following conditions:
62724+ *
62725+ * The above copyright notice and this permission notice shall be included in
62726+ * all copies or substantial portions of the Software.
62727+ *
62728+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
62729+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
62730+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
62731+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
62732+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
62733+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
62734+ * IN THE SOFTWARE.
62735+ */
62736+
62737+#include <stdarg.h>
62738+#include <linux/module.h>
62739+#include <linux/kthread.h>
62740+#include <xen/xenbus.h>
62741+#include "common.h"
62742+
62743+
62744+struct backend_info
62745+{
62746+ struct xenbus_device *dev;
62747+ blkif_t *blkif;
62748+ struct xenbus_watch backend_watch;
62749+ int xenbus_id;
62750+};
62751+
62752+
62753+static void connect(struct backend_info *);
62754+static int connect_ring(struct backend_info *);
62755+static int blktap_remove(struct xenbus_device *dev);
62756+static int blktap_probe(struct xenbus_device *dev,
62757+ const struct xenbus_device_id *id);
62758+static void tap_backend_changed(struct xenbus_watch *, const char **,
62759+ unsigned int);
62760+static void tap_frontend_changed(struct xenbus_device *dev,
62761+ enum xenbus_state frontend_state);
62762+
62763+static int strsep_len(const char *str, char c, unsigned int len)
62764+{
62765+ unsigned int i;
62766+
62767+ for (i = 0; str[i]; i++)
62768+ if (str[i] == c) {
62769+ if (len == 0)
62770+ return i;
62771+ len--;
62772+ }
62773+ return (len == 0) ? i : -ERANGE;
62774+}
62775+
62776+static long get_id(const char *str)
62777+{
62778+ int len,end;
62779+ const char *ptr;
62780+ char *tptr, num[10];
62781+
62782+ len = strsep_len(str, '/', 2);
62783+ end = strlen(str);
62784+ if ( (len < 0) || (end < 0) ) return -1;
62785+
62786+ ptr = str + len + 1;
62787+ strncpy(num,ptr,end - len);
62788+ tptr = num + (end - (len + 1));
62789+ *tptr = '\0';
62790+ DPRINTK("Get_id called for %s (%s)\n",str,num);
62791+
62792+ return simple_strtol(num, NULL, 10);
62793+}
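/*
 * Editorial sketch (not part of the patch): get_id() above pulls the numeric
 * device id out of the backend node name by locating the third '/' and
 * converting what follows.  For a node name of the usual
 * "backend/tap/<frontend-domid>/<devid>" shape (an assumption of this
 * sketch), the last path component gives the same answer, which the plain
 * C library version below makes a little more obvious:
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

long node_devid(const char *nodename)
{
	const char *slash = strrchr(nodename, '/');

	return slash ? strtol(slash + 1, NULL, 10) : -1;
}

int main(void)
{
	printf("%ld\n", node_devid("backend/tap/5/769"));	/* prints 769 */
	return 0;
}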
62794+
62795+static void tap_update_blkif_status(blkif_t *blkif)
62796+{
62797+ int err;
62798+
62799+ /* Not ready to connect? */
62800+ if(!blkif->irq || !blkif->sectors) {
62801+ return;
62802+ }
62803+
62804+ /* Already connected? */
62805+ if (blkif->be->dev->state == XenbusStateConnected)
62806+ return;
62807+
62808+ /* Attempt to connect: exit if we fail to. */
62809+ connect(blkif->be);
62810+ if (blkif->be->dev->state != XenbusStateConnected)
62811+ return;
62812+
62813+ blkif->xenblkd = kthread_run(tap_blkif_schedule, blkif,
62814+ "xvd %d",
62815+ blkif->domid);
62816+
62817+ if (IS_ERR(blkif->xenblkd)) {
62818+ err = PTR_ERR(blkif->xenblkd);
62819+ blkif->xenblkd = NULL;
62820+ xenbus_dev_fatal(blkif->be->dev, err, "start xenblkd");
62821+ WPRINTK("Error starting thread\n");
62822+ }
62823+}
62824+
62825+static int blktap_remove(struct xenbus_device *dev)
62826+{
62827+ struct backend_info *be = dev->dev.driver_data;
62828+
62829+ if (be->backend_watch.node) {
62830+ unregister_xenbus_watch(&be->backend_watch);
62831+ kfree(be->backend_watch.node);
62832+ be->backend_watch.node = NULL;
62833+ }
62834+ if (be->blkif) {
62835+ if (be->blkif->xenblkd)
62836+ kthread_stop(be->blkif->xenblkd);
62837+ signal_tapdisk(be->blkif->dev_num);
62838+ tap_blkif_free(be->blkif);
62839+ be->blkif = NULL;
62840+ }
62841+ kfree(be);
62842+ dev->dev.driver_data = NULL;
62843+ return 0;
62844+}
62845+
62846+/**
62847+ * Entry point to this code when a new device is created. Allocate
62848+ * the basic structures, and watch the store waiting for the
62849+ * user-space program to tell us the physical device info. Switch to
62850+ * InitWait.
62851+ */
62852+static int blktap_probe(struct xenbus_device *dev,
62853+ const struct xenbus_device_id *id)
62854+{
62855+ int err;
62856+ struct backend_info *be = kzalloc(sizeof(struct backend_info),
62857+ GFP_KERNEL);
62858+ if (!be) {
62859+ xenbus_dev_fatal(dev, -ENOMEM,
62860+ "allocating backend structure");
62861+ return -ENOMEM;
62862+ }
62863+
62864+ be->dev = dev;
62865+ dev->dev.driver_data = be;
62866+ be->xenbus_id = get_id(dev->nodename);
62867+
62868+ be->blkif = tap_alloc_blkif(dev->otherend_id);
62869+ if (IS_ERR(be->blkif)) {
62870+ err = PTR_ERR(be->blkif);
62871+ be->blkif = NULL;
62872+ xenbus_dev_fatal(dev, err, "creating block interface");
62873+ goto fail;
62874+ }
62875+
62876+ /* setup back pointer */
62877+ be->blkif->be = be;
62878+ be->blkif->sectors = 0;
62879+
62880+ /* set a watch on disk info, waiting for userspace to update details*/
62881+ err = xenbus_watch_path2(dev, dev->nodename, "info",
62882+ &be->backend_watch, tap_backend_changed);
62883+ if (err)
62884+ goto fail;
62885+
62886+ err = xenbus_switch_state(dev, XenbusStateInitWait);
62887+ if (err)
62888+ goto fail;
62889+ return 0;
62890+
62891+fail:
62892+ DPRINTK("blktap probe failed\n");
62893+ blktap_remove(dev);
62894+ return err;
62895+}
62896+
62897+
62898+/**
62899+ * Callback received when the user space code has placed the device
62900+ * information in xenstore.
62901+ */
62902+static void tap_backend_changed(struct xenbus_watch *watch,
62903+ const char **vec, unsigned int len)
62904+{
62905+ int err;
62906+ unsigned long info;
62907+ struct backend_info *be
62908+ = container_of(watch, struct backend_info, backend_watch);
62909+ struct xenbus_device *dev = be->dev;
62910+
62911+ /**
62912+ * Check to see whether userspace code has opened the image
62913+	 * and written the sector and
62914+	 * disk info to xenstore.
62915+	 */
62916+ err = xenbus_gather(XBT_NIL, dev->nodename, "info", "%lu", &info,
62917+ NULL);
62918+ if (err) {
62919+ xenbus_dev_error(dev, err, "getting info");
62920+ return;
62921+ }
62922+
62923+ DPRINTK("Userspace update on disk info, %lu\n",info);
62924+
62925+ err = xenbus_gather(XBT_NIL, dev->nodename, "sectors", "%llu",
62926+ &be->blkif->sectors, NULL);
62927+
62928+ /* Associate tap dev with domid*/
62929+ be->blkif->dev_num = dom_to_devid(be->blkif->domid, be->xenbus_id,
62930+ be->blkif);
62931+ DPRINTK("Thread started for domid [%d], connecting disk\n",
62932+ be->blkif->dev_num);
62933+
62934+ tap_update_blkif_status(be->blkif);
62935+}
62936+
62937+/**
62938+ * Callback received when the frontend's state changes.
62939+ */
62940+static void tap_frontend_changed(struct xenbus_device *dev,
62941+ enum xenbus_state frontend_state)
62942+{
62943+ struct backend_info *be = dev->dev.driver_data;
62944+ int err;
62945+
62946+ DPRINTK("\n");
62947+
62948+ switch (frontend_state) {
62949+ case XenbusStateInitialising:
62950+ if (dev->state == XenbusStateClosed) {
62951+ printk("%s: %s: prepare for reconnect\n",
62952+ __FUNCTION__, dev->nodename);
62953+ xenbus_switch_state(dev, XenbusStateInitWait);
62954+ }
62955+ break;
62956+
62957+ case XenbusStateInitialised:
62958+ case XenbusStateConnected:
62959+ /* Ensure we connect even when two watches fire in
62960+		   close succession and we miss the intermediate value
62961+ of frontend_state. */
62962+ if (dev->state == XenbusStateConnected)
62963+ break;
62964+
62965+ err = connect_ring(be);
62966+ if (err)
62967+ break;
62968+ tap_update_blkif_status(be->blkif);
62969+ break;
62970+
62971+ case XenbusStateClosing:
62972+ if (be->blkif->xenblkd) {
62973+ kthread_stop(be->blkif->xenblkd);
62974+ be->blkif->xenblkd = NULL;
62975+ }
62976+ xenbus_switch_state(dev, XenbusStateClosing);
62977+ break;
62978+
62979+ case XenbusStateClosed:
62980+ xenbus_switch_state(dev, XenbusStateClosed);
62981+ if (xenbus_dev_is_online(dev))
62982+ break;
62983+ /* fall through if not online */
62984+ case XenbusStateUnknown:
62985+ device_unregister(&dev->dev);
62986+ break;
62987+
62988+ default:
62989+ xenbus_dev_fatal(dev, -EINVAL, "saw state %d at frontend",
62990+ frontend_state);
62991+ break;
62992+ }
62993+}
62994+
62995+
62996+/**
62997+ * Switch to Connected state.
62998+ */
62999+static void connect(struct backend_info *be)
63000+{
63001+ int err;
63002+
63003+ struct xenbus_device *dev = be->dev;
63004+
63005+ err = xenbus_switch_state(dev, XenbusStateConnected);
63006+ if (err)
63007+ xenbus_dev_fatal(dev, err, "switching to Connected state",
63008+ dev->nodename);
63009+
63010+ return;
63011+}
63012+
63013+
63014+static int connect_ring(struct backend_info *be)
63015+{
63016+ struct xenbus_device *dev = be->dev;
63017+ unsigned long ring_ref;
63018+ unsigned int evtchn;
63019+ int err;
63020+
63021+ DPRINTK("%s\n", dev->otherend);
63022+
63023+ err = xenbus_gather(XBT_NIL, dev->otherend, "ring-ref", "%lu",
63024+ &ring_ref, "event-channel", "%u", &evtchn, NULL);
63025+ if (err) {
63026+ xenbus_dev_fatal(dev, err,
63027+ "reading %s/ring-ref and event-channel",
63028+ dev->otherend);
63029+ return err;
63030+ }
63031+
63032+ /* Map the shared frame, irq etc. */
63033+ err = tap_blkif_map(be->blkif, ring_ref, evtchn);
63034+ if (err) {
63035+ xenbus_dev_fatal(dev, err, "mapping ring-ref %lu port %u",
63036+ ring_ref, evtchn);
63037+ return err;
63038+ }
63039+
63040+ return 0;
63041+}
63042+
63043+
63044+/* ** Driver Registration ** */
63045+
63046+
63047+static struct xenbus_device_id blktap_ids[] = {
63048+ { "tap" },
63049+ { "" }
63050+};
63051+
63052+
63053+static struct xenbus_driver blktap = {
63054+ .name = "tap",
63055+ .owner = THIS_MODULE,
63056+ .ids = blktap_ids,
63057+ .probe = blktap_probe,
63058+ .remove = blktap_remove,
63059+ .otherend_changed = tap_frontend_changed
63060+};
63061+
63062+
63063+void tap_blkif_xenbus_init(void)
63064+{
63065+ xenbus_register_backend(&blktap);
63066+}
63067diff -Nur linux-2.6.16.33-noxen/drivers/xen/char/Makefile linux-2.6.16.33/drivers/xen/char/Makefile
63068--- linux-2.6.16.33-noxen/drivers/xen/char/Makefile 1970-01-01 00:00:00.000000000 +0000
63069+++ linux-2.6.16.33/drivers/xen/char/Makefile 2007-01-08 15:00:45.000000000 +0000
63070@@ -0,0 +1,2 @@
63071+
63072+obj-y := mem.o
63073diff -Nur linux-2.6.16.33-noxen/drivers/xen/char/mem.c linux-2.6.16.33/drivers/xen/char/mem.c
63074--- linux-2.6.16.33-noxen/drivers/xen/char/mem.c 1970-01-01 00:00:00.000000000 +0000
63075+++ linux-2.6.16.33/drivers/xen/char/mem.c 2007-01-08 15:00:45.000000000 +0000
63076@@ -0,0 +1,205 @@
63077+/*
63078+ * Originally from linux/drivers/char/mem.c
63079+ *
63080+ * Copyright (C) 1991, 1992 Linus Torvalds
63081+ *
63082+ * Added devfs support.
63083+ * Jan-11-1998, C. Scott Ananian <cananian@alumni.princeton.edu>
63084+ * Shared /dev/zero mmaping support, Feb 2000, Kanoj Sarcar <kanoj@sgi.com>
63085+ */
63086+
63087+#include <linux/config.h>
63088+#include <linux/mm.h>
63089+#include <linux/miscdevice.h>
63090+#include <linux/slab.h>
63091+#include <linux/vmalloc.h>
63092+#include <linux/mman.h>
63093+#include <linux/random.h>
63094+#include <linux/init.h>
63095+#include <linux/raw.h>
63096+#include <linux/tty.h>
63097+#include <linux/capability.h>
63098+#include <linux/smp_lock.h>
63099+#include <linux/devfs_fs_kernel.h>
63100+#include <linux/ptrace.h>
63101+#include <linux/device.h>
63102+#include <asm/pgalloc.h>
63103+#include <asm/uaccess.h>
63104+#include <asm/io.h>
63105+#include <asm/hypervisor.h>
63106+
63107+#ifndef ARCH_HAS_VALID_PHYS_ADDR_RANGE
63108+static inline int valid_phys_addr_range(unsigned long addr, size_t *count)
63109+{
63110+ return 1;
63111+}
63112+#endif
63113+
63114+/*
63115+ * This function reads the *physical* memory. The f_pos points directly to the
63116+ * memory location.
63117+ */
63118+static ssize_t read_mem(struct file * file, char __user * buf,
63119+ size_t count, loff_t *ppos)
63120+{
63121+ unsigned long p = *ppos, ignored;
63122+ ssize_t read = 0, sz;
63123+ void __iomem *v;
63124+
63125+ if (!valid_phys_addr_range(p, &count))
63126+ return -EFAULT;
63127+
63128+ while (count > 0) {
63129+ /*
63130+ * Handle first page in case it's not aligned
63131+ */
63132+ if (-p & (PAGE_SIZE - 1))
63133+ sz = -p & (PAGE_SIZE - 1);
63134+ else
63135+ sz = PAGE_SIZE;
63136+
63137+ sz = min_t(unsigned long, sz, count);
63138+
63139+ v = xlate_dev_mem_ptr(p, sz);
63140+ if (IS_ERR(v) || v == NULL) {
63141+ /*
63142+ * Some programs (e.g., dmidecode) groove off into
63143+ * weird RAM areas where no tables can possibly exist
63144+ * (because Xen will have stomped on them!). These
63145+ * programs get rather upset if we let them know that
63146+ * Xen failed their access, so we fake out a read of
63147+ * all zeroes.
63148+ */
63149+ if (clear_user(buf, count))
63150+ return -EFAULT;
63151+ read += count;
63152+ break;
63153+ }
63154+
63155+ ignored = copy_to_user(buf, v, sz);
63156+ xlate_dev_mem_ptr_unmap(v);
63157+ if (ignored)
63158+ return -EFAULT;
63159+ buf += sz;
63160+ p += sz;
63161+ count -= sz;
63162+ read += sz;
63163+ }
63164+
63165+ *ppos += read;
63166+ return read;
63167+}
63168+
63169+static ssize_t write_mem(struct file * file, const char __user * buf,
63170+ size_t count, loff_t *ppos)
63171+{
63172+ unsigned long p = *ppos, ignored;
63173+ ssize_t written = 0, sz;
63174+ void __iomem *v;
63175+
63176+ if (!valid_phys_addr_range(p, &count))
63177+ return -EFAULT;
63178+
63179+ while (count > 0) {
63180+ /*
63181+ * Handle first page in case it's not aligned
63182+ */
63183+ if (-p & (PAGE_SIZE - 1))
63184+ sz = -p & (PAGE_SIZE - 1);
63185+ else
63186+ sz = PAGE_SIZE;
63187+
63188+ sz = min_t(unsigned long, sz, count);
63189+
63190+ v = xlate_dev_mem_ptr(p, sz);
63191+ if (v == NULL)
63192+ break;
63193+ if (IS_ERR(v)) {
63194+ if (written == 0)
63195+ return PTR_ERR(v);
63196+ break;
63197+ }
63198+
63199+ ignored = copy_from_user(v, buf, sz);
63200+ xlate_dev_mem_ptr_unmap(v);
63201+ if (ignored) {
63202+ written += sz - ignored;
63203+ if (written)
63204+ break;
63205+ return -EFAULT;
63206+ }
63207+ buf += sz;
63208+ p += sz;
63209+ count -= sz;
63210+ written += sz;
63211+ }
63212+
63213+ *ppos += written;
63214+ return written;
63215+}
63216+
63217+#ifndef ARCH_HAS_DEV_MEM_MMAP_MEM
63218+static inline int uncached_access(struct file *file)
63219+{
63220+ if (file->f_flags & O_SYNC)
63221+ return 1;
63222+ /* Xen sets correct MTRR type on non-RAM for us. */
63223+ return 0;
63224+}
63225+
63226+static int xen_mmap_mem(struct file * file, struct vm_area_struct * vma)
63227+{
63228+ size_t size = vma->vm_end - vma->vm_start;
63229+
63230+ if (uncached_access(file))
63231+ vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
63232+
63233+ /* We want to return the real error code, not EAGAIN. */
63234+ return direct_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
63235+ size, vma->vm_page_prot, DOMID_IO);
63236+}
63237+#endif
63238+
63239+/*
63240+ * The memory devices use the full 32/64 bits of the offset, and so we cannot
63241+ * check against negative addresses: they are ok. The return value is weird,
63242+ * though, in that case (0).
63243+ *
63244+ * Also note that seeking relative to the "end of file" isn't supported:
63245+ * it has no meaning, so it returns -EINVAL.
63246+ */
63247+static loff_t memory_lseek(struct file * file, loff_t offset, int orig)
63248+{
63249+ loff_t ret;
63250+
63251+ mutex_lock(&file->f_dentry->d_inode->i_mutex);
63252+ switch (orig) {
63253+ case 0:
63254+ file->f_pos = offset;
63255+ ret = file->f_pos;
63256+ force_successful_syscall_return();
63257+ break;
63258+ case 1:
63259+ file->f_pos += offset;
63260+ ret = file->f_pos;
63261+ force_successful_syscall_return();
63262+ break;
63263+ default:
63264+ ret = -EINVAL;
63265+ }
63266+ mutex_unlock(&file->f_dentry->d_inode->i_mutex);
63267+ return ret;
63268+}
63269+
63270+static int open_mem(struct inode * inode, struct file * filp)
63271+{
63272+ return capable(CAP_SYS_RAWIO) ? 0 : -EPERM;
63273+}
63274+
63275+struct file_operations mem_fops = {
63276+ .llseek = memory_lseek,
63277+ .read = read_mem,
63278+ .write = write_mem,
63279+ .mmap = xen_mmap_mem,
63280+ .open = open_mem,
63281+};
63282diff -Nur linux-2.6.16.33-noxen/drivers/xen/console/Makefile linux-2.6.16.33/drivers/xen/console/Makefile
63283--- linux-2.6.16.33-noxen/drivers/xen/console/Makefile 1970-01-01 00:00:00.000000000 +0000
63284+++ linux-2.6.16.33/drivers/xen/console/Makefile 2007-01-08 15:00:45.000000000 +0000
63285@@ -0,0 +1,2 @@
63286+
63287+obj-y := console.o xencons_ring.o
63288diff -Nur linux-2.6.16.33-noxen/drivers/xen/console/console.c linux-2.6.16.33/drivers/xen/console/console.c
63289--- linux-2.6.16.33-noxen/drivers/xen/console/console.c 1970-01-01 00:00:00.000000000 +0000
63290+++ linux-2.6.16.33/drivers/xen/console/console.c 2007-01-08 15:00:45.000000000 +0000
63291@@ -0,0 +1,718 @@
63292+/******************************************************************************
63293+ * console.c
63294+ *
63295+ * Virtual console driver.
63296+ *
63297+ * Copyright (c) 2002-2004, K A Fraser.
63298+ *
63299+ * This program is free software; you can redistribute it and/or
63300+ * modify it under the terms of the GNU General Public License version 2
63301+ * as published by the Free Software Foundation; or, when distributed
63302+ * separately from the Linux kernel or incorporated into other
63303+ * software packages, subject to the following license:
63304+ *
63305+ * Permission is hereby granted, free of charge, to any person obtaining a copy
63306+ * of this source file (the "Software"), to deal in the Software without
63307+ * restriction, including without limitation the rights to use, copy, modify,
63308+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
63309+ * and to permit persons to whom the Software is furnished to do so, subject to
63310+ * the following conditions:
63311+ *
63312+ * The above copyright notice and this permission notice shall be included in
63313+ * all copies or substantial portions of the Software.
63314+ *
63315+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
63316+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
63317+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
63318+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
63319+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
63320+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
63321+ * IN THE SOFTWARE.
63322+ */
63323+
63324+#include <linux/config.h>
63325+#include <linux/version.h>
63326+#include <linux/module.h>
63327+#include <linux/errno.h>
63328+#include <linux/signal.h>
63329+#include <linux/sched.h>
63330+#include <linux/interrupt.h>
63331+#include <linux/tty.h>
63332+#include <linux/tty_flip.h>
63333+#include <linux/serial.h>
63334+#include <linux/major.h>
63335+#include <linux/ptrace.h>
63336+#include <linux/ioport.h>
63337+#include <linux/mm.h>
63338+#include <linux/slab.h>
63339+#include <linux/init.h>
63340+#include <linux/console.h>
63341+#include <linux/bootmem.h>
63342+#include <linux/sysrq.h>
63343+#include <linux/screen_info.h>
63344+#include <asm/io.h>
63345+#include <asm/irq.h>
63346+#include <asm/uaccess.h>
63347+#include <xen/interface/xen.h>
63348+#include <xen/interface/event_channel.h>
63349+#include <asm/hypervisor.h>
63350+#include <xen/evtchn.h>
63351+#include <xen/xenbus.h>
63352+#include <xen/xencons.h>
63353+
63354+/*
63355+ * Modes:
63356+ * 'xencons=off' [XC_OFF]: Console is disabled.
63357+ * 'xencons=tty' [XC_TTY]: Console attached to '/dev/tty[0-9]+'.
63358+ * 'xencons=ttyS' [XC_SERIAL]: Console attached to '/dev/ttyS[0-9]+'.
63359+ * 'xencons=xvc' [XC_XVC]: Console attached to '/dev/xvc0'.
63360+ * default: DOM0 -> XC_SERIAL ; all others -> XC_TTY.
63361+ *
63362+ * NB. In mode XC_TTY, we create dummy consoles for tty2-63. This suppresses
63363+ * warnings from standard distro startup scripts.
63364+ */
63365+static enum {
63366+ XC_OFF, XC_TTY, XC_SERIAL, XC_XVC
63367+} xc_mode;
63368+static int xc_num = -1;
63369+
63370+/* /dev/xvc0 device number allocated by lanana.org. */
63371+#define XEN_XVC_MAJOR 204
63372+#define XEN_XVC_MINOR 191
63373+
63374+#ifdef CONFIG_MAGIC_SYSRQ
63375+static unsigned long sysrq_requested;
63376+extern int sysrq_enabled;
63377+#endif
63378+
63379+void xencons_early_setup(void)
63380+{
63381+ extern int console_use_vt;
63382+
63383+ if (is_initial_xendomain()) {
63384+ xc_mode = XC_SERIAL;
63385+ } else {
63386+ xc_mode = XC_TTY;
63387+ console_use_vt = 0;
63388+ }
63389+}
63390+
63391+static int __init xencons_setup(char *str)
63392+{
63393+ char *q;
63394+ int n;
63395+ extern int console_use_vt;
63396+
63397+ console_use_vt = 1;
63398+ if (!strncmp(str, "ttyS", 4)) {
63399+ xc_mode = XC_SERIAL;
63400+ str += 4;
63401+ } else if (!strncmp(str, "tty", 3)) {
63402+ xc_mode = XC_TTY;
63403+ str += 3;
63404+ console_use_vt = 0;
63405+ } else if (!strncmp(str, "xvc", 3)) {
63406+ xc_mode = XC_XVC;
63407+ str += 3;
63408+ } else if (!strncmp(str, "off", 3)) {
63409+ xc_mode = XC_OFF;
63410+ str += 3;
63411+ }
63412+
63413+ n = simple_strtol(str, &q, 10);
63414+ if (q != str)
63415+ xc_num = n;
63416+
63417+ return 1;
63418+}
63419+__setup("xencons=", xencons_setup);
63420+
63421+/* The kernel and user-land drivers share a common transmit buffer. */
63422+static unsigned int wbuf_size = 4096;
63423+#define WBUF_MASK(_i) ((_i)&(wbuf_size-1))
63424+static char *wbuf;
63425+static unsigned int wc, wp; /* write_cons, write_prod */
63426+
63427+static int __init xencons_bufsz_setup(char *str)
63428+{
63429+ unsigned int goal;
63430+ goal = simple_strtoul(str, NULL, 0);
63431+ if (goal) {
63432+ goal = roundup_pow_of_two(goal);
63433+ if (wbuf_size < goal)
63434+ wbuf_size = goal;
63435+ }
63436+ return 1;
63437+}
63438+__setup("xencons_bufsz=", xencons_bufsz_setup);
63439+
63440+/* This lock protects accesses to the common transmit buffer. */
63441+static DEFINE_SPINLOCK(xencons_lock);
63442+
63443+/* Common transmit-kick routine. */
63444+static void __xencons_tx_flush(void);
63445+
63446+static struct tty_driver *xencons_driver;
63447+
63448+/******************** Kernel console driver ********************************/
63449+
63450+static void kcons_write(struct console *c, const char *s, unsigned int count)
63451+{
63452+ int i = 0;
63453+ unsigned long flags;
63454+
63455+ spin_lock_irqsave(&xencons_lock, flags);
63456+
63457+ while (i < count) {
63458+ for (; i < count; i++) {
63459+ if ((wp - wc) >= (wbuf_size - 1))
63460+ break;
63461+ if ((wbuf[WBUF_MASK(wp++)] = s[i]) == '\n')
63462+ wbuf[WBUF_MASK(wp++)] = '\r';
63463+ }
63464+
63465+ __xencons_tx_flush();
63466+ }
63467+
63468+ spin_unlock_irqrestore(&xencons_lock, flags);
63469+}
63470+
63471+static void kcons_write_dom0(struct console *c, const char *s, unsigned int count)
63472+{
63473+
63474+ while (count > 0) {
63475+ int rc;
63476+ rc = HYPERVISOR_console_io( CONSOLEIO_write, count, (char *)s);
63477+ if (rc <= 0)
63478+ break;
63479+ count -= rc;
63480+ s += rc;
63481+ }
63482+}
63483+
63484+static struct tty_driver *kcons_device(struct console *c, int *index)
63485+{
63486+ *index = 0;
63487+ return xencons_driver;
63488+}
63489+
63490+static struct console kcons_info = {
63491+ .device = kcons_device,
63492+ .flags = CON_PRINTBUFFER | CON_ENABLED,
63493+ .index = -1,
63494+};
63495+
63496+static int __init xen_console_init(void)
63497+{
63498+ if (!is_running_on_xen())
63499+ goto out;
63500+
63501+ if (is_initial_xendomain()) {
63502+ kcons_info.write = kcons_write_dom0;
63503+ } else {
63504+ if (!xen_start_info->console.domU.evtchn)
63505+ goto out;
63506+ kcons_info.write = kcons_write;
63507+ }
63508+
63509+ switch (xc_mode) {
63510+ case XC_XVC:
63511+ strcpy(kcons_info.name, "xvc");
63512+ if (xc_num == -1)
63513+ xc_num = 0;
63514+ break;
63515+
63516+ case XC_SERIAL:
63517+ strcpy(kcons_info.name, "ttyS");
63518+ if (xc_num == -1)
63519+ xc_num = 0;
63520+ break;
63521+
63522+ case XC_TTY:
63523+ strcpy(kcons_info.name, "tty");
63524+ if (xc_num == -1)
63525+ xc_num = 1;
63526+ break;
63527+
63528+ default:
63529+ goto out;
63530+ }
63531+
63532+ wbuf = alloc_bootmem(wbuf_size);
63533+
63534+ register_console(&kcons_info);
63535+
63536+ out:
63537+ return 0;
63538+}
63539+console_initcall(xen_console_init);
63540+
63541+/*** Useful function for console debugging -- goes straight to Xen. ***/
63542+asmlinkage int xprintk(const char *fmt, ...)
63543+{
63544+ va_list args;
63545+ int printk_len;
63546+ static char printk_buf[1024];
63547+
63548+ /* Emit the output into the temporary buffer */
63549+ va_start(args, fmt);
63550+ printk_len = vsnprintf(printk_buf, sizeof(printk_buf), fmt, args);
63551+ va_end(args);
63552+
63553+ /* Send the processed output directly to Xen. */
63554+ kcons_write_dom0(NULL, printk_buf, printk_len);
63555+
63556+ return 0;
63557+}
63558+
63559+/*** Forcibly flush console data before dying. ***/
63560+void xencons_force_flush(void)
63561+{
63562+ int sz;
63563+
63564+ /* Emergency console is synchronous, so there's nothing to flush. */
63565+ if (!is_running_on_xen() ||
63566+ is_initial_xendomain() ||
63567+ !xen_start_info->console.domU.evtchn)
63568+ return;
63569+
63570+ /* Spin until console data is flushed through to the daemon. */
63571+ while (wc != wp) {
63572+ int sent = 0;
63573+ if ((sz = wp - wc) == 0)
63574+ continue;
63575+ sent = xencons_ring_send(&wbuf[WBUF_MASK(wc)], sz);
63576+ if (sent > 0)
63577+ wc += sent;
63578+ }
63579+}
63580+
63581+
63582+void dom0_init_screen_info(const struct dom0_vga_console_info *info)
63583+{
63584+ switch (info->video_type) {
63585+ case XEN_VGATYPE_TEXT_MODE_3:
63586+ screen_info.orig_video_mode = 3;
63587+ screen_info.orig_video_ega_bx = 3;
63588+ screen_info.orig_video_isVGA = 1;
63589+ screen_info.orig_video_lines = info->u.text_mode_3.rows;
63590+ screen_info.orig_video_cols = info->u.text_mode_3.columns;
63591+ screen_info.orig_x = info->u.text_mode_3.cursor_x;
63592+ screen_info.orig_y = info->u.text_mode_3.cursor_y;
63593+ screen_info.orig_video_points =
63594+ info->u.text_mode_3.font_height;
63595+ break;
63596+ case XEN_VGATYPE_VESA_LFB:
63597+ screen_info.orig_video_isVGA = VIDEO_TYPE_VLFB;
63598+ screen_info.lfb_width = info->u.vesa_lfb.width;
63599+ screen_info.lfb_height = info->u.vesa_lfb.height;
63600+ screen_info.lfb_depth = info->u.vesa_lfb.bits_per_pixel;
63601+ screen_info.lfb_base = info->u.vesa_lfb.lfb_base;
63602+ screen_info.lfb_size = info->u.vesa_lfb.lfb_size;
63603+ screen_info.lfb_linelength = info->u.vesa_lfb.bytes_per_line;
63604+ screen_info.red_size = info->u.vesa_lfb.red_size;
63605+ screen_info.red_pos = info->u.vesa_lfb.red_pos;
63606+ screen_info.green_size = info->u.vesa_lfb.green_size;
63607+ screen_info.green_pos = info->u.vesa_lfb.green_pos;
63608+ screen_info.blue_size = info->u.vesa_lfb.blue_size;
63609+ screen_info.blue_pos = info->u.vesa_lfb.blue_pos;
63610+ screen_info.rsvd_size = info->u.vesa_lfb.rsvd_size;
63611+ screen_info.rsvd_pos = info->u.vesa_lfb.rsvd_pos;
63612+ break;
63613+ }
63614+}
63615+
63616+
63617+/******************** User-space console driver (/dev/console) ************/
63618+
63619+#define DRV(_d) (_d)
63620+#define DUMMY_TTY(_tty) ((xc_mode == XC_TTY) && \
63621+ ((_tty)->index != (xc_num - 1)))
63622+
63623+static struct termios *xencons_termios[MAX_NR_CONSOLES];
63624+static struct termios *xencons_termios_locked[MAX_NR_CONSOLES];
63625+static struct tty_struct *xencons_tty;
63626+static int xencons_priv_irq;
63627+static char x_char;
63628+
63629+void xencons_rx(char *buf, unsigned len, struct pt_regs *regs)
63630+{
63631+ int i;
63632+ unsigned long flags;
63633+
63634+ spin_lock_irqsave(&xencons_lock, flags);
63635+ if (xencons_tty == NULL)
63636+ goto out;
63637+
63638+ for (i = 0; i < len; i++) {
63639+#ifdef CONFIG_MAGIC_SYSRQ
63640+ if (sysrq_enabled) {
63641+ if (buf[i] == '\x0f') { /* ^O */
63642+ sysrq_requested = jiffies;
63643+ continue; /* don't print the sysrq key */
63644+ } else if (sysrq_requested) {
63645+ unsigned long sysrq_timeout =
63646+ sysrq_requested + HZ*2;
63647+ sysrq_requested = 0;
63648+ if (time_before(jiffies, sysrq_timeout)) {
63649+ spin_unlock_irqrestore(
63650+ &xencons_lock, flags);
63651+ handle_sysrq(
63652+ buf[i], regs, xencons_tty);
63653+ spin_lock_irqsave(
63654+ &xencons_lock, flags);
63655+ continue;
63656+ }
63657+ }
63658+ }
63659+#endif
63660+ tty_insert_flip_char(xencons_tty, buf[i], 0);
63661+ }
63662+ tty_flip_buffer_push(xencons_tty);
63663+
63664+ out:
63665+ spin_unlock_irqrestore(&xencons_lock, flags);
63666+}
63667+
63668+static void __xencons_tx_flush(void)
63669+{
63670+ int sent, sz, work_done = 0;
63671+
63672+ if (x_char) {
63673+ if (is_initial_xendomain())
63674+ kcons_write_dom0(NULL, &x_char, 1);
63675+ else
63676+ while (x_char)
63677+ if (xencons_ring_send(&x_char, 1) == 1)
63678+ break;
63679+ x_char = 0;
63680+ work_done = 1;
63681+ }
63682+
63683+ while (wc != wp) {
63684+ sz = wp - wc;
63685+ if (sz > (wbuf_size - WBUF_MASK(wc)))
63686+ sz = wbuf_size - WBUF_MASK(wc);
63687+ if (is_initial_xendomain()) {
63688+ kcons_write_dom0(NULL, &wbuf[WBUF_MASK(wc)], sz);
63689+ wc += sz;
63690+ } else {
63691+ sent = xencons_ring_send(&wbuf[WBUF_MASK(wc)], sz);
63692+ if (sent == 0)
63693+ break;
63694+ wc += sent;
63695+ }
63696+ work_done = 1;
63697+ }
63698+
63699+ if (work_done && (xencons_tty != NULL)) {
63700+ wake_up_interruptible(&xencons_tty->write_wait);
63701+ if ((xencons_tty->flags & (1 << TTY_DO_WRITE_WAKEUP)) &&
63702+ (xencons_tty->ldisc.write_wakeup != NULL))
63703+ (xencons_tty->ldisc.write_wakeup)(xencons_tty);
63704+ }
63705+}
63706+
63707+void xencons_tx(void)
63708+{
63709+ unsigned long flags;
63710+
63711+ spin_lock_irqsave(&xencons_lock, flags);
63712+ __xencons_tx_flush();
63713+ spin_unlock_irqrestore(&xencons_lock, flags);
63714+}
63715+
63716+/* Privileged receive callback and transmit kicker. */
63717+static irqreturn_t xencons_priv_interrupt(int irq, void *dev_id,
63718+ struct pt_regs *regs)
63719+{
63720+ static char rbuf[16];
63721+ int l;
63722+
63723+ while ((l = HYPERVISOR_console_io(CONSOLEIO_read, 16, rbuf)) > 0)
63724+ xencons_rx(rbuf, l, regs);
63725+
63726+ xencons_tx();
63727+
63728+ return IRQ_HANDLED;
63729+}
63730+
63731+static int xencons_write_room(struct tty_struct *tty)
63732+{
63733+ return wbuf_size - (wp - wc);
63734+}
63735+
63736+static int xencons_chars_in_buffer(struct tty_struct *tty)
63737+{
63738+ return wp - wc;
63739+}
63740+
63741+static void xencons_send_xchar(struct tty_struct *tty, char ch)
63742+{
63743+ unsigned long flags;
63744+
63745+ if (DUMMY_TTY(tty))
63746+ return;
63747+
63748+ spin_lock_irqsave(&xencons_lock, flags);
63749+ x_char = ch;
63750+ __xencons_tx_flush();
63751+ spin_unlock_irqrestore(&xencons_lock, flags);
63752+}
63753+
63754+static void xencons_throttle(struct tty_struct *tty)
63755+{
63756+ if (DUMMY_TTY(tty))
63757+ return;
63758+
63759+ if (I_IXOFF(tty))
63760+ xencons_send_xchar(tty, STOP_CHAR(tty));
63761+}
63762+
63763+static void xencons_unthrottle(struct tty_struct *tty)
63764+{
63765+ if (DUMMY_TTY(tty))
63766+ return;
63767+
63768+ if (I_IXOFF(tty)) {
63769+ if (x_char != 0)
63770+ x_char = 0;
63771+ else
63772+ xencons_send_xchar(tty, START_CHAR(tty));
63773+ }
63774+}
63775+
63776+static void xencons_flush_buffer(struct tty_struct *tty)
63777+{
63778+ unsigned long flags;
63779+
63780+ if (DUMMY_TTY(tty))
63781+ return;
63782+
63783+ spin_lock_irqsave(&xencons_lock, flags);
63784+ wc = wp = 0;
63785+ spin_unlock_irqrestore(&xencons_lock, flags);
63786+}
63787+
63788+static inline int __xencons_put_char(int ch)
63789+{
63790+ char _ch = (char)ch;
63791+ if ((wp - wc) == wbuf_size)
63792+ return 0;
63793+ wbuf[WBUF_MASK(wp++)] = _ch;
63794+ return 1;
63795+}
63796+
63797+static int xencons_write(
63798+ struct tty_struct *tty,
63799+ const unsigned char *buf,
63800+ int count)
63801+{
63802+ int i;
63803+ unsigned long flags;
63804+
63805+ if (DUMMY_TTY(tty))
63806+ return count;
63807+
63808+ spin_lock_irqsave(&xencons_lock, flags);
63809+
63810+ for (i = 0; i < count; i++)
63811+ if (!__xencons_put_char(buf[i]))
63812+ break;
63813+
63814+ if (i != 0)
63815+ __xencons_tx_flush();
63816+
63817+ spin_unlock_irqrestore(&xencons_lock, flags);
63818+
63819+ return i;
63820+}
63821+
63822+static void xencons_put_char(struct tty_struct *tty, u_char ch)
63823+{
63824+ unsigned long flags;
63825+
63826+ if (DUMMY_TTY(tty))
63827+ return;
63828+
63829+ spin_lock_irqsave(&xencons_lock, flags);
63830+ (void)__xencons_put_char(ch);
63831+ spin_unlock_irqrestore(&xencons_lock, flags);
63832+}
63833+
63834+static void xencons_flush_chars(struct tty_struct *tty)
63835+{
63836+ unsigned long flags;
63837+
63838+ if (DUMMY_TTY(tty))
63839+ return;
63840+
63841+ spin_lock_irqsave(&xencons_lock, flags);
63842+ __xencons_tx_flush();
63843+ spin_unlock_irqrestore(&xencons_lock, flags);
63844+}
63845+
63846+static void xencons_wait_until_sent(struct tty_struct *tty, int timeout)
63847+{
63848+ unsigned long orig_jiffies = jiffies;
63849+
63850+ if (DUMMY_TTY(tty))
63851+ return;
63852+
63853+ while (DRV(tty->driver)->chars_in_buffer(tty)) {
63854+ set_current_state(TASK_INTERRUPTIBLE);
63855+ schedule_timeout(1);
63856+ if (signal_pending(current))
63857+ break;
63858+ if (timeout && time_after(jiffies, orig_jiffies + timeout))
63859+ break;
63860+ }
63861+
63862+ set_current_state(TASK_RUNNING);
63863+}
63864+
63865+static int xencons_open(struct tty_struct *tty, struct file *filp)
63866+{
63867+ unsigned long flags;
63868+
63869+ if (DUMMY_TTY(tty))
63870+ return 0;
63871+
63872+ spin_lock_irqsave(&xencons_lock, flags);
63873+ tty->driver_data = NULL;
63874+ if (xencons_tty == NULL)
63875+ xencons_tty = tty;
63876+ __xencons_tx_flush();
63877+ spin_unlock_irqrestore(&xencons_lock, flags);
63878+
63879+ return 0;
63880+}
63881+
63882+static void xencons_close(struct tty_struct *tty, struct file *filp)
63883+{
63884+ unsigned long flags;
63885+
63886+ if (DUMMY_TTY(tty))
63887+ return;
63888+
63889+ down(&tty_sem);
63890+
63891+ if (tty->count != 1) {
63892+ up(&tty_sem);
63893+ return;
63894+ }
63895+
63896+ /* Prevent other threads from re-opening this tty. */
63897+ set_bit(TTY_CLOSING, &tty->flags);
63898+ up(&tty_sem);
63899+
63900+ tty->closing = 1;
63901+ tty_wait_until_sent(tty, 0);
63902+ if (DRV(tty->driver)->flush_buffer != NULL)
63903+ DRV(tty->driver)->flush_buffer(tty);
63904+ if (tty->ldisc.flush_buffer != NULL)
63905+ tty->ldisc.flush_buffer(tty);
63906+ tty->closing = 0;
63907+ spin_lock_irqsave(&xencons_lock, flags);
63908+ xencons_tty = NULL;
63909+ spin_unlock_irqrestore(&xencons_lock, flags);
63910+}
63911+
63912+static struct tty_operations xencons_ops = {
63913+ .open = xencons_open,
63914+ .close = xencons_close,
63915+ .write = xencons_write,
63916+ .write_room = xencons_write_room,
63917+ .put_char = xencons_put_char,
63918+ .flush_chars = xencons_flush_chars,
63919+ .chars_in_buffer = xencons_chars_in_buffer,
63920+ .send_xchar = xencons_send_xchar,
63921+ .flush_buffer = xencons_flush_buffer,
63922+ .throttle = xencons_throttle,
63923+ .unthrottle = xencons_unthrottle,
63924+ .wait_until_sent = xencons_wait_until_sent,
63925+};
63926+
63927+static int __init xencons_init(void)
63928+{
63929+ int rc;
63930+
63931+ if (!is_running_on_xen())
63932+ return -ENODEV;
63933+
63934+ if (xc_mode == XC_OFF)
63935+ return 0;
63936+
63937+ if (!is_initial_xendomain()) {
63938+ rc = xencons_ring_init();
63939+ if (rc)
63940+ return rc;
63941+ }
63942+
63943+ xencons_driver = alloc_tty_driver((xc_mode == XC_TTY) ?
63944+ MAX_NR_CONSOLES : 1);
63945+ if (xencons_driver == NULL)
63946+ return -ENOMEM;
63947+
63948+ DRV(xencons_driver)->name = "xencons";
63949+ DRV(xencons_driver)->major = TTY_MAJOR;
63950+ DRV(xencons_driver)->type = TTY_DRIVER_TYPE_SERIAL;
63951+ DRV(xencons_driver)->subtype = SERIAL_TYPE_NORMAL;
63952+ DRV(xencons_driver)->init_termios = tty_std_termios;
63953+ DRV(xencons_driver)->flags =
63954+ TTY_DRIVER_REAL_RAW |
63955+ TTY_DRIVER_RESET_TERMIOS;
63956+ DRV(xencons_driver)->termios = xencons_termios;
63957+ DRV(xencons_driver)->termios_locked = xencons_termios_locked;
63958+
63959+ switch (xc_mode) {
63960+ case XC_XVC:
63961+ DRV(xencons_driver)->name = "xvc";
63962+ DRV(xencons_driver)->major = XEN_XVC_MAJOR;
63963+ DRV(xencons_driver)->minor_start = XEN_XVC_MINOR;
63964+ DRV(xencons_driver)->name_base = xc_num;
63965+ break;
63966+ case XC_SERIAL:
63967+ DRV(xencons_driver)->name = "ttyS";
63968+ DRV(xencons_driver)->minor_start = 64 + xc_num;
63969+ DRV(xencons_driver)->name_base = xc_num;
63970+ break;
63971+ default:
63972+ DRV(xencons_driver)->name = "tty";
63973+ DRV(xencons_driver)->minor_start = 1;
63974+ DRV(xencons_driver)->name_base = 1;
63975+ break;
63976+ }
63977+
63978+ tty_set_operations(xencons_driver, &xencons_ops);
63979+
63980+ if ((rc = tty_register_driver(DRV(xencons_driver))) != 0) {
63981+ printk("WARNING: Failed to register Xen virtual "
63982+ "console driver as '%s%d'\n",
63983+ DRV(xencons_driver)->name,
63984+ DRV(xencons_driver)->name_base);
63985+ put_tty_driver(xencons_driver);
63986+ xencons_driver = NULL;
63987+ return rc;
63988+ }
63989+
63990+ if (is_initial_xendomain()) {
63991+ xencons_priv_irq = bind_virq_to_irqhandler(
63992+ VIRQ_CONSOLE,
63993+ 0,
63994+ xencons_priv_interrupt,
63995+ 0,
63996+ "console",
63997+ NULL);
63998+ BUG_ON(xencons_priv_irq < 0);
63999+ }
64000+
64001+ printk("Xen virtual console successfully installed as %s%d\n",
64002+ DRV(xencons_driver)->name, xc_num);
64003+
64004+ return 0;
64005+}
64006+
64007+module_init(xencons_init);
64008+
64009+MODULE_LICENSE("Dual BSD/GPL");
64010diff -Nur linux-2.6.16.33-noxen/drivers/xen/console/xencons_ring.c linux-2.6.16.33/drivers/xen/console/xencons_ring.c
64011--- linux-2.6.16.33-noxen/drivers/xen/console/xencons_ring.c 1970-01-01 00:00:00.000000000 +0000
64012+++ linux-2.6.16.33/drivers/xen/console/xencons_ring.c 2007-01-08 15:00:45.000000000 +0000
64013@@ -0,0 +1,143 @@
64014+/*
64015+ * This program is free software; you can redistribute it and/or
64016+ * modify it under the terms of the GNU General Public License version 2
64017+ * as published by the Free Software Foundation; or, when distributed
64018+ * separately from the Linux kernel or incorporated into other
64019+ * software packages, subject to the following license:
64020+ *
64021+ * Permission is hereby granted, free of charge, to any person obtaining a copy
64022+ * of this source file (the "Software"), to deal in the Software without
64023+ * restriction, including without limitation the rights to use, copy, modify,
64024+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
64025+ * and to permit persons to whom the Software is furnished to do so, subject to
64026+ * the following conditions:
64027+ *
64028+ * The above copyright notice and this permission notice shall be included in
64029+ * all copies or substantial portions of the Software.
64030+ *
64031+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
64032+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
64033+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
64034+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
64035+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
64036+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
64037+ * IN THE SOFTWARE.
64038+ */
64039+
64040+#include <linux/version.h>
64041+#include <linux/module.h>
64042+#include <linux/errno.h>
64043+#include <linux/signal.h>
64044+#include <linux/sched.h>
64045+#include <linux/interrupt.h>
64046+#include <linux/tty.h>
64047+#include <linux/tty_flip.h>
64048+#include <linux/serial.h>
64049+#include <linux/major.h>
64050+#include <linux/ptrace.h>
64051+#include <linux/ioport.h>
64052+#include <linux/mm.h>
64053+#include <linux/slab.h>
64054+
64055+#include <asm/hypervisor.h>
64056+#include <xen/evtchn.h>
64057+#include <xen/xencons.h>
64058+#include <linux/wait.h>
64059+#include <linux/interrupt.h>
64060+#include <linux/sched.h>
64061+#include <linux/err.h>
64062+#include <xen/interface/io/console.h>
64063+
64064+static int xencons_irq;
64065+
64066+static inline struct xencons_interface *xencons_interface(void)
64067+{
64068+ return mfn_to_virt(xen_start_info->console.domU.mfn);
64069+}
64070+
64071+static inline void notify_daemon(void)
64072+{
64073+ /* Use evtchn: this is called early, before irq is set up. */
64074+ notify_remote_via_evtchn(xen_start_info->console.domU.evtchn);
64075+}
64076+
64077+int xencons_ring_send(const char *data, unsigned len)
64078+{
64079+ int sent = 0;
64080+ struct xencons_interface *intf = xencons_interface();
64081+ XENCONS_RING_IDX cons, prod;
64082+
64083+ cons = intf->out_cons;
64084+ prod = intf->out_prod;
64085+ mb();
64086+ BUG_ON((prod - cons) > sizeof(intf->out));
64087+
64088+ while ((sent < len) && ((prod - cons) < sizeof(intf->out)))
64089+ intf->out[MASK_XENCONS_IDX(prod++, intf->out)] = data[sent++];
64090+
64091+ wmb();
64092+ intf->out_prod = prod;
64093+
64094+ notify_daemon();
64095+
64096+ return sent;
64097+}
64098+
64099+static irqreturn_t handle_input(int irq, void *unused, struct pt_regs *regs)
64100+{
64101+ struct xencons_interface *intf = xencons_interface();
64102+ XENCONS_RING_IDX cons, prod;
64103+
64104+ cons = intf->in_cons;
64105+ prod = intf->in_prod;
64106+ mb();
64107+ BUG_ON((prod - cons) > sizeof(intf->in));
64108+
64109+ while (cons != prod) {
64110+ xencons_rx(intf->in+MASK_XENCONS_IDX(cons,intf->in), 1, regs);
64111+ cons++;
64112+ }
64113+
64114+ mb();
64115+ intf->in_cons = cons;
64116+
64117+ notify_daemon();
64118+
64119+ xencons_tx();
64120+
64121+ return IRQ_HANDLED;
64122+}
64123+
64124+int xencons_ring_init(void)
64125+{
64126+ int irq;
64127+
64128+ if (xencons_irq)
64129+ unbind_from_irqhandler(xencons_irq, NULL);
64130+ xencons_irq = 0;
64131+
64132+ if (!is_running_on_xen() ||
64133+ is_initial_xendomain() ||
64134+ !xen_start_info->console.domU.evtchn)
64135+ return -ENODEV;
64136+
64137+ irq = bind_evtchn_to_irqhandler(
64138+ xen_start_info->console.domU.evtchn,
64139+ handle_input, 0, "xencons", NULL);
64140+ if (irq < 0) {
64141+ printk(KERN_ERR "XEN console request irq failed %i\n", irq);
64142+ return irq;
64143+ }
64144+
64145+ xencons_irq = irq;
64146+
64147+ /* In case we have in-flight data after save/restore... */
64148+ notify_daemon();
64149+
64150+ return 0;
64151+}
64152+
64153+void xencons_resume(void)
64154+{
64155+ (void)xencons_ring_init();
64156+}
64157diff -Nur linux-2.6.16.33-noxen/drivers/xen/core/Makefile linux-2.6.16.33/drivers/xen/core/Makefile
64158--- linux-2.6.16.33-noxen/drivers/xen/core/Makefile 1970-01-01 00:00:00.000000000 +0000
64159+++ linux-2.6.16.33/drivers/xen/core/Makefile 2007-01-08 15:00:45.000000000 +0000
64160@@ -0,0 +1,14 @@
64161+#
64162+# Makefile for the linux kernel.
64163+#
64164+
64165+obj-y := evtchn.o gnttab.o features.o
64166+
64167+obj-$(CONFIG_PROC_FS) += xen_proc.o
64168+obj-$(CONFIG_SYSFS) += hypervisor_sysfs.o
64169+obj-$(CONFIG_HOTPLUG_CPU) += cpu_hotplug.o
64170+obj-$(CONFIG_XEN_SYSFS) += xen_sysfs.o
64171+obj-$(CONFIG_XEN_SKBUFF) += skbuff.o
64172+obj-$(CONFIG_XEN_REBOOT) += reboot.o machine_reboot.o
64173+obj-$(CONFIG_XEN_SMPBOOT) += smpboot.o
64174+obj-$(CONFIG_KEXEC) += machine_kexec.o
64175diff -Nur linux-2.6.16.33-noxen/drivers/xen/core/cpu_hotplug.c linux-2.6.16.33/drivers/xen/core/cpu_hotplug.c
64176--- linux-2.6.16.33-noxen/drivers/xen/core/cpu_hotplug.c 1970-01-01 00:00:00.000000000 +0000
64177+++ linux-2.6.16.33/drivers/xen/core/cpu_hotplug.c 2007-01-08 15:00:45.000000000 +0000
64178@@ -0,0 +1,188 @@
64179+#include <linux/config.h>
64180+#include <linux/init.h>
64181+#include <linux/kernel.h>
64182+#include <linux/sched.h>
64183+#include <linux/notifier.h>
64184+#include <linux/cpu.h>
64185+#include <xen/cpu_hotplug.h>
64186+#include <xen/xenbus.h>
64187+
64188+/*
64189+ * Set of CPUs that remote admin software will allow us to bring online.
64190+ * Notified to us via xenbus.
64191+ */
64192+static cpumask_t xenbus_allowed_cpumask;
64193+
64194+/* Set of CPUs that local admin will allow us to bring online. */
64195+static cpumask_t local_allowed_cpumask = CPU_MASK_ALL;
64196+
64197+static int local_cpu_hotplug_request(void)
64198+{
64199+ /*
64200+ * We assume a CPU hotplug request comes from local admin if it is made
64201+ * via a userspace process (i.e., one with a real mm_struct).
64202+ */
64203+ return (current->mm != NULL);
64204+}
64205+
64206+static void vcpu_hotplug(unsigned int cpu)
64207+{
64208+ int err;
64209+ char dir[32], state[32];
64210+
64211+ if ((cpu >= NR_CPUS) || !cpu_possible(cpu))
64212+ return;
64213+
64214+ sprintf(dir, "cpu/%d", cpu);
64215+ err = xenbus_scanf(XBT_NIL, dir, "availability", "%s", state);
64216+ if (err != 1) {
64217+ printk(KERN_ERR "XENBUS: Unable to read cpu state\n");
64218+ return;
64219+ }
64220+
64221+ if (strcmp(state, "online") == 0) {
64222+ cpu_set(cpu, xenbus_allowed_cpumask);
64223+ (void)cpu_up(cpu);
64224+ } else if (strcmp(state, "offline") == 0) {
64225+ cpu_clear(cpu, xenbus_allowed_cpumask);
64226+ (void)cpu_down(cpu);
64227+ } else {
64228+ printk(KERN_ERR "XENBUS: unknown state(%s) on CPU%d\n",
64229+ state, cpu);
64230+ }
64231+}
64232+
64233+static void handle_vcpu_hotplug_event(
64234+ struct xenbus_watch *watch, const char **vec, unsigned int len)
64235+{
64236+ int cpu;
64237+ char *cpustr;
64238+ const char *node = vec[XS_WATCH_PATH];
64239+
64240+ if ((cpustr = strstr(node, "cpu/")) != NULL) {
64241+ sscanf(cpustr, "cpu/%d", &cpu);
64242+ vcpu_hotplug(cpu);
64243+ }
64244+}
64245+
64246+static int smpboot_cpu_notify(struct notifier_block *notifier,
64247+ unsigned long action, void *hcpu)
64248+{
64249+ int cpu = (long)hcpu;
64250+
64251+ /*
64252+ * We do this in a callback notifier rather than __cpu_disable()
64253+ * because local_cpu_hotplug_request() does not work in the latter
64254+ * as it's always executed from within a stopmachine kthread.
64255+ */
64256+ if ((action == CPU_DOWN_PREPARE) && local_cpu_hotplug_request())
64257+ cpu_clear(cpu, local_allowed_cpumask);
64258+
64259+ return NOTIFY_OK;
64260+}
64261+
64262+static int setup_cpu_watcher(struct notifier_block *notifier,
64263+ unsigned long event, void *data)
64264+{
64265+ int i;
64266+
64267+ static struct xenbus_watch cpu_watch = {
64268+ .node = "cpu",
64269+ .callback = handle_vcpu_hotplug_event,
64270+ .flags = XBWF_new_thread };
64271+ (void)register_xenbus_watch(&cpu_watch);
64272+
64273+ if (!is_initial_xendomain()) {
64274+ for_each_cpu(i)
64275+ vcpu_hotplug(i);
64276+ printk(KERN_INFO "Brought up %ld CPUs\n",
64277+ (long)num_online_cpus());
64278+ }
64279+
64280+ return NOTIFY_DONE;
64281+}
64282+
64283+static int __init setup_vcpu_hotplug_event(void)
64284+{
64285+ static struct notifier_block hotplug_cpu = {
64286+ .notifier_call = smpboot_cpu_notify };
64287+ static struct notifier_block xsn_cpu = {
64288+ .notifier_call = setup_cpu_watcher };
64289+
64290+ if (!is_running_on_xen())
64291+ return -ENODEV;
64292+
64293+ register_cpu_notifier(&hotplug_cpu);
64294+ register_xenstore_notifier(&xsn_cpu);
64295+
64296+ return 0;
64297+}
64298+
64299+arch_initcall(setup_vcpu_hotplug_event);
64300+
64301+int smp_suspend(void)
64302+{
64303+ int i, err;
64304+
64305+ lock_cpu_hotplug();
64306+
64307+ /*
64308+ * Take all other CPUs offline. We hold the hotplug mutex to
64309+ * avoid other processes bringing up CPUs under our feet.
64310+ */
64311+ while (num_online_cpus() > 1) {
64312+ unlock_cpu_hotplug();
64313+ for_each_online_cpu(i) {
64314+ if (i == 0)
64315+ continue;
64316+ err = cpu_down(i);
64317+ if (err) {
64318+ printk(KERN_CRIT "Failed to take all CPUs "
64319+ "down: %d.\n", err);
64320+ for_each_cpu(i)
64321+ vcpu_hotplug(i);
64322+ return err;
64323+ }
64324+ }
64325+ lock_cpu_hotplug();
64326+ }
64327+
64328+ return 0;
64329+}
64330+
64331+void smp_resume(void)
64332+{
64333+ int cpu;
64334+
64335+ for_each_cpu(cpu)
64336+ cpu_initialize_context(cpu);
64337+
64338+ unlock_cpu_hotplug();
64339+
64340+ for_each_cpu(cpu)
64341+ vcpu_hotplug(cpu);
64342+}
64343+
64344+int cpu_up_check(unsigned int cpu)
64345+{
64346+ int rc = 0;
64347+
64348+ if (local_cpu_hotplug_request()) {
64349+ cpu_set(cpu, local_allowed_cpumask);
64350+ if (!cpu_isset(cpu, xenbus_allowed_cpumask)) {
64351+ printk("%s: attempt to bring up CPU %u disallowed by "
64352+ "remote admin.\n", __FUNCTION__, cpu);
64353+ rc = -EBUSY;
64354+ }
64355+ } else if (!cpu_isset(cpu, local_allowed_cpumask) ||
64356+ !cpu_isset(cpu, xenbus_allowed_cpumask)) {
64357+ rc = -EBUSY;
64358+ }
64359+
64360+ return rc;
64361+}
64362+
64363+void init_xenbus_allowed_cpumask(void)
64364+{
64365+ xenbus_allowed_cpumask = cpu_present_map;
64366+}
64367diff -Nur linux-2.6.16.33-noxen/drivers/xen/core/evtchn.c linux-2.6.16.33/drivers/xen/core/evtchn.c
64368--- linux-2.6.16.33-noxen/drivers/xen/core/evtchn.c 1970-01-01 00:00:00.000000000 +0000
64369+++ linux-2.6.16.33/drivers/xen/core/evtchn.c 2007-01-08 15:00:45.000000000 +0000
64370@@ -0,0 +1,872 @@
64371+/******************************************************************************
64372+ * evtchn.c
64373+ *
64374+ * Communication via Xen event channels.
64375+ *
64376+ * Copyright (c) 2002-2005, K A Fraser
64377+ *
64378+ * This program is free software; you can redistribute it and/or
64379+ * modify it under the terms of the GNU General Public License version 2
64380+ * as published by the Free Software Foundation; or, when distributed
64381+ * separately from the Linux kernel or incorporated into other
64382+ * software packages, subject to the following license:
64383+ *
64384+ * Permission is hereby granted, free of charge, to any person obtaining a copy
64385+ * of this source file (the "Software"), to deal in the Software without
64386+ * restriction, including without limitation the rights to use, copy, modify,
64387+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
64388+ * and to permit persons to whom the Software is furnished to do so, subject to
64389+ * the following conditions:
64390+ *
64391+ * The above copyright notice and this permission notice shall be included in
64392+ * all copies or substantial portions of the Software.
64393+ *
64394+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
64395+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
64396+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
64397+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
64398+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
64399+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
64400+ * IN THE SOFTWARE.
64401+ */
64402+
64403+#include <linux/config.h>
64404+#include <linux/module.h>
64405+#include <linux/irq.h>
64406+#include <linux/interrupt.h>
64407+#include <linux/sched.h>
64408+#include <linux/kernel_stat.h>
64409+#include <linux/version.h>
64410+#include <asm/atomic.h>
64411+#include <asm/system.h>
64412+#include <asm/ptrace.h>
64413+#include <asm/synch_bitops.h>
64414+#include <xen/evtchn.h>
64415+#include <xen/interface/event_channel.h>
64416+#include <xen/interface/physdev.h>
64417+#include <asm/hypervisor.h>
64418+#include <linux/mc146818rtc.h> /* RTC_IRQ */
64419+
64420+/*
64421+ * This lock protects updates to the following mapping and reference-count
64422+ * arrays. The lock does not need to be acquired to read the mapping tables.
64423+ */
64424+static DEFINE_SPINLOCK(irq_mapping_update_lock);
64425+
64426+/* IRQ <-> event-channel mappings. */
64427+static int evtchn_to_irq[NR_EVENT_CHANNELS] = {
64428+ [0 ... NR_EVENT_CHANNELS-1] = -1 };
64429+
64430+/* Packed IRQ information: binding type, sub-type index, and event channel. */
64431+static u32 irq_info[NR_IRQS];
64432+
64433+/* Binding types. */
64434+enum { IRQT_UNBOUND, IRQT_PIRQ, IRQT_VIRQ, IRQT_IPI, IRQT_EVTCHN };
64435+
64436+/* Constructor for packed IRQ information. */
64437+static inline u32 mk_irq_info(u32 type, u32 index, u32 evtchn)
64438+{
64439+ return ((type << 24) | (index << 16) | evtchn);
64440+}
64441+
64442+/* Convenient shorthand for packed representation of an unbound IRQ. */
64443+#define IRQ_UNBOUND mk_irq_info(IRQT_UNBOUND, 0, 0)
64444+
64445+/*
64446+ * Accessors for packed IRQ information.
64447+ */
64448+
64449+static inline unsigned int evtchn_from_irq(int irq)
64450+{
64451+ return (u16)(irq_info[irq]);
64452+}
64453+
64454+static inline unsigned int index_from_irq(int irq)
64455+{
64456+ return (u8)(irq_info[irq] >> 16);
64457+}
64458+
64459+static inline unsigned int type_from_irq(int irq)
64460+{
64461+ return (u8)(irq_info[irq] >> 24);
64462+}
64463+
64464+/* IRQ <-> VIRQ mapping. */
64465+DEFINE_PER_CPU(int, virq_to_irq[NR_VIRQS]) = {[0 ... NR_VIRQS-1] = -1};
64466+
64467+/* IRQ <-> IPI mapping. */
64468+#ifndef NR_IPIS
64469+#define NR_IPIS 1
64470+#endif
64471+DEFINE_PER_CPU(int, ipi_to_irq[NR_IPIS]) = {[0 ... NR_IPIS-1] = -1};
64472+
64473+/* Reference counts for bindings to IRQs. */
64474+static int irq_bindcount[NR_IRQS];
64475+
64476+/* Bitmap indicating which PIRQs require Xen to be notified on unmask. */
64477+static unsigned long pirq_needs_eoi[NR_PIRQS/sizeof(unsigned long)];
64478+
64479+#ifdef CONFIG_SMP
64480+
64481+static u8 cpu_evtchn[NR_EVENT_CHANNELS];
64482+static unsigned long cpu_evtchn_mask[NR_CPUS][NR_EVENT_CHANNELS/BITS_PER_LONG];
64483+
64484+static inline unsigned long active_evtchns(unsigned int cpu, shared_info_t *sh,
64485+ unsigned int idx)
64486+{
64487+ return (sh->evtchn_pending[idx] &
64488+ cpu_evtchn_mask[cpu][idx] &
64489+ ~sh->evtchn_mask[idx]);
64490+}
64491+
64492+static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu)
64493+{
64494+ int irq = evtchn_to_irq[chn];
64495+
64496+ BUG_ON(irq == -1);
64497+ set_native_irq_info(irq, cpumask_of_cpu(cpu));
64498+
64499+ clear_bit(chn, (unsigned long *)cpu_evtchn_mask[cpu_evtchn[chn]]);
64500+ set_bit(chn, (unsigned long *)cpu_evtchn_mask[cpu]);
64501+ cpu_evtchn[chn] = cpu;
64502+}
64503+
64504+static void init_evtchn_cpu_bindings(void)
64505+{
64506+ int i;
64507+
64508+ /* By default all event channels notify CPU#0. */
64509+ for (i = 0; i < NR_IRQS; i++)
64510+ set_native_irq_info(i, cpumask_of_cpu(0));
64511+
64512+ memset(cpu_evtchn, 0, sizeof(cpu_evtchn));
64513+ memset(cpu_evtchn_mask[0], ~0, sizeof(cpu_evtchn_mask[0]));
64514+}
64515+
64516+static inline unsigned int cpu_from_evtchn(unsigned int evtchn)
64517+{
64518+ return cpu_evtchn[evtchn];
64519+}
64520+
64521+#else
64522+
64523+static inline unsigned long active_evtchns(unsigned int cpu, shared_info_t *sh,
64524+ unsigned int idx)
64525+{
64526+ return (sh->evtchn_pending[idx] & ~sh->evtchn_mask[idx]);
64527+}
64528+
64529+static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu)
64530+{
64531+}
64532+
64533+static void init_evtchn_cpu_bindings(void)
64534+{
64535+}
64536+
64537+static inline unsigned int cpu_from_evtchn(unsigned int evtchn)
64538+{
64539+ return 0;
64540+}
64541+
64542+#endif
64543+
64544+/* Upcall to generic IRQ layer. */
64545+#ifdef CONFIG_X86
64546+extern fastcall unsigned int do_IRQ(struct pt_regs *regs);
64547+void __init xen_init_IRQ(void);
64548+void __init init_IRQ(void)
64549+{
64550+ irq_ctx_init(0);
64551+ xen_init_IRQ();
64552+}
64553+#if defined (__i386__)
64554+static inline void exit_idle(void) {}
64555+#define IRQ_REG orig_eax
64556+#elif defined (__x86_64__)
64557+#include <asm/idle.h>
64558+#define IRQ_REG orig_rax
64559+#endif
64560+#define do_IRQ(irq, regs) do { \
64561+ (regs)->IRQ_REG = ~(irq); \
64562+ do_IRQ((regs)); \
64563+} while (0)
64564+#endif
64565+
64566+/* Xen will never allocate port zero for any purpose. */
64567+#define VALID_EVTCHN(chn) ((chn) != 0)
64568+
64569+/*
64570+ * Force a proper event-channel callback from Xen after clearing the
64571+ * callback mask. We do this in a very simple manner, by making a call
64572+ * down into Xen. The pending flag will be checked by Xen on return.
64573+ */
64574+void force_evtchn_callback(void)
64575+{
64576+ (void)HYPERVISOR_xen_version(0, NULL);
64577+}
64578+/* Not a GPL symbol: used in ubiquitous macros, so too restrictive. */
64579+EXPORT_SYMBOL(force_evtchn_callback);
64580+
64581+/* NB. Interrupts are disabled on entry. */
64582+asmlinkage void evtchn_do_upcall(struct pt_regs *regs)
64583+{
64584+ unsigned long l1, l2;
64585+ unsigned int l1i, l2i, port;
64586+ int irq, cpu = smp_processor_id();
64587+ shared_info_t *s = HYPERVISOR_shared_info;
64588+ vcpu_info_t *vcpu_info = &s->vcpu_info[cpu];
64589+
64590+ vcpu_info->evtchn_upcall_pending = 0;
64591+
64592+#ifndef CONFIG_X86 /* No need for a barrier -- XCHG is a barrier on x86. */
64593+ /* Clear master pending flag /before/ clearing selector flag. */
64594+ rmb();
64595+#endif
64596+ l1 = xchg(&vcpu_info->evtchn_pending_sel, 0);
64597+ while (l1 != 0) {
64598+ l1i = __ffs(l1);
64599+ l1 &= ~(1UL << l1i);
64600+
64601+ while ((l2 = active_evtchns(cpu, s, l1i)) != 0) {
64602+ l2i = __ffs(l2);
64603+
64604+ port = (l1i * BITS_PER_LONG) + l2i;
64605+ if ((irq = evtchn_to_irq[port]) != -1)
64606+ do_IRQ(irq, regs);
64607+ else {
64608+ exit_idle();
64609+ evtchn_device_upcall(port);
64610+ }
64611+ }
64612+ }
64613+}
64614+
64615+static int find_unbound_irq(void)
64616+{
64617+ static int warned;
64618+ int dynirq, irq;
64619+
64620+ for (dynirq = 0; dynirq < NR_DYNIRQS; dynirq++) {
64621+ irq = dynirq_to_irq(dynirq);
64622+ if (irq_bindcount[irq] == 0)
64623+ return irq;
64624+ }
64625+
64626+ if (!warned) {
64627+ warned = 1;
64628+ printk(KERN_WARNING "No available IRQ to bind to: "
64629+ "increase NR_DYNIRQS.\n");
64630+ }
64631+
64632+ return -ENOSPC;
64633+}
64634+
64635+static int bind_evtchn_to_irq(unsigned int evtchn)
64636+{
64637+ int irq;
64638+
64639+ spin_lock(&irq_mapping_update_lock);
64640+
64641+ if ((irq = evtchn_to_irq[evtchn]) == -1) {
64642+ if ((irq = find_unbound_irq()) < 0)
64643+ goto out;
64644+
64645+ evtchn_to_irq[evtchn] = irq;
64646+ irq_info[irq] = mk_irq_info(IRQT_EVTCHN, 0, evtchn);
64647+ }
64648+
64649+ irq_bindcount[irq]++;
64650+
64651+ out:
64652+ spin_unlock(&irq_mapping_update_lock);
64653+ return irq;
64654+}
64655+
64656+static int bind_virq_to_irq(unsigned int virq, unsigned int cpu)
64657+{
64658+ struct evtchn_bind_virq bind_virq;
64659+ int evtchn, irq;
64660+
64661+ spin_lock(&irq_mapping_update_lock);
64662+
64663+ if ((irq = per_cpu(virq_to_irq, cpu)[virq]) == -1) {
64664+ if ((irq = find_unbound_irq()) < 0)
64665+ goto out;
64666+
64667+ bind_virq.virq = virq;
64668+ bind_virq.vcpu = cpu;
64669+ if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq,
64670+ &bind_virq) != 0)
64671+ BUG();
64672+ evtchn = bind_virq.port;
64673+
64674+ evtchn_to_irq[evtchn] = irq;
64675+ irq_info[irq] = mk_irq_info(IRQT_VIRQ, virq, evtchn);
64676+
64677+ per_cpu(virq_to_irq, cpu)[virq] = irq;
64678+
64679+ bind_evtchn_to_cpu(evtchn, cpu);
64680+ }
64681+
64682+ irq_bindcount[irq]++;
64683+
64684+ out:
64685+ spin_unlock(&irq_mapping_update_lock);
64686+ return irq;
64687+}
64688+
64689+static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu)
64690+{
64691+ struct evtchn_bind_ipi bind_ipi;
64692+ int evtchn, irq;
64693+
64694+ spin_lock(&irq_mapping_update_lock);
64695+
64696+ if ((irq = per_cpu(ipi_to_irq, cpu)[ipi]) == -1) {
64697+ if ((irq = find_unbound_irq()) < 0)
64698+ goto out;
64699+
64700+ bind_ipi.vcpu = cpu;
64701+ if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi,
64702+ &bind_ipi) != 0)
64703+ BUG();
64704+ evtchn = bind_ipi.port;
64705+
64706+ evtchn_to_irq[evtchn] = irq;
64707+ irq_info[irq] = mk_irq_info(IRQT_IPI, ipi, evtchn);
64708+
64709+ per_cpu(ipi_to_irq, cpu)[ipi] = irq;
64710+
64711+ bind_evtchn_to_cpu(evtchn, cpu);
64712+ }
64713+
64714+ irq_bindcount[irq]++;
64715+
64716+ out:
64717+ spin_unlock(&irq_mapping_update_lock);
64718+ return irq;
64719+}
64720+
64721+static void unbind_from_irq(unsigned int irq)
64722+{
64723+ struct evtchn_close close;
64724+ int evtchn = evtchn_from_irq(irq);
64725+
64726+ spin_lock(&irq_mapping_update_lock);
64727+
64728+ if ((--irq_bindcount[irq] == 0) && VALID_EVTCHN(evtchn)) {
64729+ close.port = evtchn;
64730+ if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0)
64731+ BUG();
64732+
64733+ switch (type_from_irq(irq)) {
64734+ case IRQT_VIRQ:
64735+ per_cpu(virq_to_irq, cpu_from_evtchn(evtchn))
64736+ [index_from_irq(irq)] = -1;
64737+ break;
64738+ case IRQT_IPI:
64739+ per_cpu(ipi_to_irq, cpu_from_evtchn(evtchn))
64740+ [index_from_irq(irq)] = -1;
64741+ break;
64742+ default:
64743+ break;
64744+ }
64745+
64746+ /* Closed ports are implicitly re-bound to VCPU0. */
64747+ bind_evtchn_to_cpu(evtchn, 0);
64748+
64749+ evtchn_to_irq[evtchn] = -1;
64750+ irq_info[irq] = IRQ_UNBOUND;
64751+ }
64752+
64753+ spin_unlock(&irq_mapping_update_lock);
64754+}
64755+
64756+int bind_evtchn_to_irqhandler(
64757+ unsigned int evtchn,
64758+ irqreturn_t (*handler)(int, void *, struct pt_regs *),
64759+ unsigned long irqflags,
64760+ const char *devname,
64761+ void *dev_id)
64762+{
64763+ unsigned int irq;
64764+ int retval;
64765+
64766+ irq = bind_evtchn_to_irq(evtchn);
64767+ if (irq < 0)
64768+ return irq;
64769+
64770+ retval = request_irq(irq, handler, irqflags, devname, dev_id);
64771+ if (retval != 0) {
64772+ unbind_from_irq(irq);
64773+ return retval;
64774+ }
64775+
64776+ return irq;
64777+}
64778+EXPORT_SYMBOL_GPL(bind_evtchn_to_irqhandler);
64779+
64780+int bind_virq_to_irqhandler(
64781+ unsigned int virq,
64782+ unsigned int cpu,
64783+ irqreturn_t (*handler)(int, void *, struct pt_regs *),
64784+ unsigned long irqflags,
64785+ const char *devname,
64786+ void *dev_id)
64787+{
64788+ unsigned int irq;
64789+ int retval;
64790+
64791+ irq = bind_virq_to_irq(virq, cpu);
64792+ if (irq < 0)
64793+ return irq;
64794+
64795+ retval = request_irq(irq, handler, irqflags, devname, dev_id);
64796+ if (retval != 0) {
64797+ unbind_from_irq(irq);
64798+ return retval;
64799+ }
64800+
64801+ return irq;
64802+}
64803+EXPORT_SYMBOL_GPL(bind_virq_to_irqhandler);
64804+
64805+int bind_ipi_to_irqhandler(
64806+ unsigned int ipi,
64807+ unsigned int cpu,
64808+ irqreturn_t (*handler)(int, void *, struct pt_regs *),
64809+ unsigned long irqflags,
64810+ const char *devname,
64811+ void *dev_id)
64812+{
64813+ unsigned int irq;
64814+ int retval;
64815+
64816+ irq = bind_ipi_to_irq(ipi, cpu);
64817+ if (irq < 0)
64818+ return irq;
64819+
64820+ retval = request_irq(irq, handler, irqflags, devname, dev_id);
64821+ if (retval != 0) {
64822+ unbind_from_irq(irq);
64823+ return retval;
64824+ }
64825+
64826+ return irq;
64827+}
64828+EXPORT_SYMBOL_GPL(bind_ipi_to_irqhandler);
64829+
64830+void unbind_from_irqhandler(unsigned int irq, void *dev_id)
64831+{
64832+ free_irq(irq, dev_id);
64833+ unbind_from_irq(irq);
64834+}
64835+EXPORT_SYMBOL_GPL(unbind_from_irqhandler);
64836+
64837+/* Rebind an evtchn so that it gets delivered to a specific cpu */
64838+static void rebind_irq_to_cpu(unsigned irq, unsigned tcpu)
64839+{
64840+ struct evtchn_bind_vcpu bind_vcpu;
64841+ int evtchn = evtchn_from_irq(irq);
64842+
64843+ if (!VALID_EVTCHN(evtchn))
64844+ return;
64845+
64846+ /* Send future instances of this interrupt to other vcpu. */
64847+ bind_vcpu.port = evtchn;
64848+ bind_vcpu.vcpu = tcpu;
64849+
64850+ /*
64851+ * If this fails, it usually just indicates that we're dealing with a
64852+ * virq or IPI channel, which don't actually need to be rebound. Ignore
64853+ * it, but don't do the xenlinux-level rebind in that case.
64854+ */
64855+ if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_vcpu, &bind_vcpu) >= 0)
64856+ bind_evtchn_to_cpu(evtchn, tcpu);
64857+}
64858+
64859+
64860+static void set_affinity_irq(unsigned irq, cpumask_t dest)
64861+{
64862+ unsigned tcpu = first_cpu(dest);
64863+ rebind_irq_to_cpu(irq, tcpu);
64864+}
64865+
64866+/*
64867+ * Interface to generic handling in irq.c
64868+ */
64869+
64870+static unsigned int startup_dynirq(unsigned int irq)
64871+{
64872+ int evtchn = evtchn_from_irq(irq);
64873+
64874+ if (VALID_EVTCHN(evtchn))
64875+ unmask_evtchn(evtchn);
64876+ return 0;
64877+}
64878+
64879+static void shutdown_dynirq(unsigned int irq)
64880+{
64881+ int evtchn = evtchn_from_irq(irq);
64882+
64883+ if (VALID_EVTCHN(evtchn))
64884+ mask_evtchn(evtchn);
64885+}
64886+
64887+static void enable_dynirq(unsigned int irq)
64888+{
64889+ int evtchn = evtchn_from_irq(irq);
64890+
64891+ if (VALID_EVTCHN(evtchn))
64892+ unmask_evtchn(evtchn);
64893+}
64894+
64895+static void disable_dynirq(unsigned int irq)
64896+{
64897+ int evtchn = evtchn_from_irq(irq);
64898+
64899+ if (VALID_EVTCHN(evtchn))
64900+ mask_evtchn(evtchn);
64901+}
64902+
64903+static void ack_dynirq(unsigned int irq)
64904+{
64905+ int evtchn = evtchn_from_irq(irq);
64906+
64907+ move_native_irq(irq);
64908+
64909+ if (VALID_EVTCHN(evtchn)) {
64910+ mask_evtchn(evtchn);
64911+ clear_evtchn(evtchn);
64912+ }
64913+}
64914+
64915+static void end_dynirq(unsigned int irq)
64916+{
64917+ int evtchn = evtchn_from_irq(irq);
64918+
64919+ if (VALID_EVTCHN(evtchn) && !(irq_desc[irq].status & IRQ_DISABLED))
64920+ unmask_evtchn(evtchn);
64921+}
64922+
64923+static struct hw_interrupt_type dynirq_type = {
64924+ "Dynamic-irq",
64925+ startup_dynirq,
64926+ shutdown_dynirq,
64927+ enable_dynirq,
64928+ disable_dynirq,
64929+ ack_dynirq,
64930+ end_dynirq,
64931+ set_affinity_irq
64932+};
64933+
64934+static inline void pirq_unmask_notify(int pirq)
64935+{
64936+ struct physdev_eoi eoi = { .irq = pirq };
64937+ if (unlikely(test_bit(pirq, &pirq_needs_eoi[0])))
64938+ (void)HYPERVISOR_physdev_op(PHYSDEVOP_eoi, &eoi);
64939+}
64940+
64941+static inline void pirq_query_unmask(int pirq)
64942+{
64943+ struct physdev_irq_status_query irq_status;
64944+ irq_status.irq = pirq;
64945+ (void)HYPERVISOR_physdev_op(PHYSDEVOP_irq_status_query, &irq_status);
64946+ clear_bit(pirq, &pirq_needs_eoi[0]);
64947+ if (irq_status.flags & XENIRQSTAT_needs_eoi)
64948+ set_bit(pirq, &pirq_needs_eoi[0]);
64949+}
64950+
64951+/*
64952+ * On startup, if there is no action associated with the IRQ then we are
64953+ * probing. In this case we should not share with others as it will confuse us.
64954+ */
64955+#define probing_irq(_irq) (irq_desc[(_irq)].action == NULL)
64956+
64957+static unsigned int startup_pirq(unsigned int irq)
64958+{
64959+ struct evtchn_bind_pirq bind_pirq;
64960+ int evtchn = evtchn_from_irq(irq);
64961+
64962+ if (VALID_EVTCHN(evtchn))
64963+ goto out;
64964+
64965+ bind_pirq.pirq = irq;
64966+ /* NB. We are happy to share unless we are probing. */
64967+ bind_pirq.flags = probing_irq(irq) ? 0 : BIND_PIRQ__WILL_SHARE;
64968+ if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_pirq, &bind_pirq) != 0) {
64969+ if (!probing_irq(irq))
64970+ printk(KERN_INFO "Failed to obtain physical IRQ %d\n",
64971+ irq);
64972+ return 0;
64973+ }
64974+ evtchn = bind_pirq.port;
64975+
64976+ pirq_query_unmask(irq_to_pirq(irq));
64977+
64978+ evtchn_to_irq[evtchn] = irq;
64979+ bind_evtchn_to_cpu(evtchn, 0);
64980+ irq_info[irq] = mk_irq_info(IRQT_PIRQ, irq, evtchn);
64981+
64982+ out:
64983+ unmask_evtchn(evtchn);
64984+ pirq_unmask_notify(irq_to_pirq(irq));
64985+
64986+ return 0;
64987+}
64988+
64989+static void shutdown_pirq(unsigned int irq)
64990+{
64991+ struct evtchn_close close;
64992+ int evtchn = evtchn_from_irq(irq);
64993+
64994+ if (!VALID_EVTCHN(evtchn))
64995+ return;
64996+
64997+ mask_evtchn(evtchn);
64998+
64999+ close.port = evtchn;
65000+ if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0)
65001+ BUG();
65002+
65003+ bind_evtchn_to_cpu(evtchn, 0);
65004+ evtchn_to_irq[evtchn] = -1;
65005+ irq_info[irq] = IRQ_UNBOUND;
65006+}
65007+
65008+static void enable_pirq(unsigned int irq)
65009+{
65010+ int evtchn = evtchn_from_irq(irq);
65011+
65012+ if (VALID_EVTCHN(evtchn)) {
65013+ unmask_evtchn(evtchn);
65014+ pirq_unmask_notify(irq_to_pirq(irq));
65015+ }
65016+}
65017+
65018+static void disable_pirq(unsigned int irq)
65019+{
65020+ int evtchn = evtchn_from_irq(irq);
65021+
65022+ if (VALID_EVTCHN(evtchn))
65023+ mask_evtchn(evtchn);
65024+}
65025+
65026+static void ack_pirq(unsigned int irq)
65027+{
65028+ int evtchn = evtchn_from_irq(irq);
65029+
65030+ move_native_irq(irq);
65031+
65032+ if (VALID_EVTCHN(evtchn)) {
65033+ mask_evtchn(evtchn);
65034+ clear_evtchn(evtchn);
65035+ }
65036+}
65037+
65038+static void end_pirq(unsigned int irq)
65039+{
65040+ int evtchn = evtchn_from_irq(irq);
65041+
65042+ if (VALID_EVTCHN(evtchn) && !(irq_desc[irq].status & IRQ_DISABLED)) {
65043+ unmask_evtchn(evtchn);
65044+ pirq_unmask_notify(irq_to_pirq(irq));
65045+ }
65046+}
65047+
65048+static struct hw_interrupt_type pirq_type = {
65049+ "Phys-irq",
65050+ startup_pirq,
65051+ shutdown_pirq,
65052+ enable_pirq,
65053+ disable_pirq,
65054+ ack_pirq,
65055+ end_pirq,
65056+ set_affinity_irq
65057+};
65058+
65059+int irq_ignore_unhandled(unsigned int irq)
65060+{
65061+ struct physdev_irq_status_query irq_status = { .irq = irq };
65062+
65063+ if (!is_running_on_xen())
65064+ return 0;
65065+
65066+ (void)HYPERVISOR_physdev_op(PHYSDEVOP_irq_status_query, &irq_status);
65067+ return !!(irq_status.flags & XENIRQSTAT_shared);
65068+}
65069+
65070+void resend_irq_on_evtchn(struct hw_interrupt_type *h, unsigned int i)
65071+{
65072+ int evtchn = evtchn_from_irq(i);
65073+ shared_info_t *s = HYPERVISOR_shared_info;
65074+ if (!VALID_EVTCHN(evtchn))
65075+ return;
65076+ BUG_ON(!synch_test_bit(evtchn, &s->evtchn_mask[0]));
65077+ synch_set_bit(evtchn, &s->evtchn_pending[0]);
65078+}
65079+
65080+void notify_remote_via_irq(int irq)
65081+{
65082+ int evtchn = evtchn_from_irq(irq);
65083+
65084+ if (VALID_EVTCHN(evtchn))
65085+ notify_remote_via_evtchn(evtchn);
65086+}
65087+EXPORT_SYMBOL_GPL(notify_remote_via_irq);
65088+
65089+void mask_evtchn(int port)
65090+{
65091+ shared_info_t *s = HYPERVISOR_shared_info;
65092+ synch_set_bit(port, &s->evtchn_mask[0]);
65093+}
65094+EXPORT_SYMBOL_GPL(mask_evtchn);
65095+
65096+void unmask_evtchn(int port)
65097+{
65098+ shared_info_t *s = HYPERVISOR_shared_info;
65099+ unsigned int cpu = smp_processor_id();
65100+ vcpu_info_t *vcpu_info = &s->vcpu_info[cpu];
65101+
65102+ BUG_ON(!irqs_disabled());
65103+
65104+ /* Slow path (hypercall) if this is a non-local port. */
65105+ if (unlikely(cpu != cpu_from_evtchn(port))) {
65106+ struct evtchn_unmask unmask = { .port = port };
65107+ (void)HYPERVISOR_event_channel_op(EVTCHNOP_unmask, &unmask);
65108+ return;
65109+ }
65110+
65111+ synch_clear_bit(port, &s->evtchn_mask[0]);
65112+
65113+ /*
65114+ * The following is basically the equivalent of 'hw_resend_irq'. Just
65115+ * like a real IO-APIC we 'lose the interrupt edge' if the channel is
65116+ * masked.
65117+ */
65118+ if (synch_test_bit(port, &s->evtchn_pending[0]) &&
65119+ !synch_test_and_set_bit(port / BITS_PER_LONG,
65120+ &vcpu_info->evtchn_pending_sel))
65121+ vcpu_info->evtchn_upcall_pending = 1;
65122+}
65123+EXPORT_SYMBOL_GPL(unmask_evtchn);
65124+
65125+void irq_resume(void)
65126+{
65127+ struct evtchn_bind_virq bind_virq;
65128+ struct evtchn_bind_ipi bind_ipi;
65129+ int cpu, pirq, virq, ipi, irq, evtchn;
65130+
65131+ init_evtchn_cpu_bindings();
65132+
65133+ /* New event-channel space is not 'live' yet. */
65134+ for (evtchn = 0; evtchn < NR_EVENT_CHANNELS; evtchn++)
65135+ mask_evtchn(evtchn);
65136+
65137+ /* Check that no PIRQs are still bound. */
65138+ for (pirq = 0; pirq < NR_PIRQS; pirq++)
65139+ BUG_ON(irq_info[pirq_to_irq(pirq)] != IRQ_UNBOUND);
65140+
65141+ /* Secondary CPUs must have no VIRQ or IPI bindings. */
65142+ for_each_possible_cpu(cpu) {
65143+ if (cpu == 0)
65144+ continue;
65145+ for (virq = 0; virq < NR_VIRQS; virq++)
65146+ BUG_ON(per_cpu(virq_to_irq, cpu)[virq] != -1);
65147+ for (ipi = 0; ipi < NR_IPIS; ipi++)
65148+ BUG_ON(per_cpu(ipi_to_irq, cpu)[ipi] != -1);
65149+ }
65150+
65151+ /* No IRQ <-> event-channel mappings. */
65152+ for (irq = 0; irq < NR_IRQS; irq++)
65153+ irq_info[irq] &= ~0xFFFF; /* zap event-channel binding */
65154+ for (evtchn = 0; evtchn < NR_EVENT_CHANNELS; evtchn++)
65155+ evtchn_to_irq[evtchn] = -1;
65156+
65157+ /* Primary CPU: rebind VIRQs automatically. */
65158+ for (virq = 0; virq < NR_VIRQS; virq++) {
65159+ if ((irq = per_cpu(virq_to_irq, 0)[virq]) == -1)
65160+ continue;
65161+
65162+ BUG_ON(irq_info[irq] != mk_irq_info(IRQT_VIRQ, virq, 0));
65163+
65164+ /* Get a new binding from Xen. */
65165+ bind_virq.virq = virq;
65166+ bind_virq.vcpu = 0;
65167+ if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq,
65168+ &bind_virq) != 0)
65169+ BUG();
65170+ evtchn = bind_virq.port;
65171+
65172+ /* Record the new mapping. */
65173+ evtchn_to_irq[evtchn] = irq;
65174+ irq_info[irq] = mk_irq_info(IRQT_VIRQ, virq, evtchn);
65175+
65176+ /* Ready for use. */
65177+ unmask_evtchn(evtchn);
65178+ }
65179+
65180+ /* Primary CPU: rebind IPIs automatically. */
65181+ for (ipi = 0; ipi < NR_IPIS; ipi++) {
65182+ if ((irq = per_cpu(ipi_to_irq, 0)[ipi]) == -1)
65183+ continue;
65184+
65185+ BUG_ON(irq_info[irq] != mk_irq_info(IRQT_IPI, ipi, 0));
65186+
65187+ /* Get a new binding from Xen. */
65188+ bind_ipi.vcpu = 0;
65189+ if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi,
65190+ &bind_ipi) != 0)
65191+ BUG();
65192+ evtchn = bind_ipi.port;
65193+
65194+ /* Record the new mapping. */
65195+ evtchn_to_irq[evtchn] = irq;
65196+ irq_info[irq] = mk_irq_info(IRQT_IPI, ipi, evtchn);
65197+
65198+ /* Ready for use. */
65199+ unmask_evtchn(evtchn);
65200+ }
65201+}
65202+
65203+void __init xen_init_IRQ(void)
65204+{
65205+ int i;
65206+
65207+ init_evtchn_cpu_bindings();
65208+
65209+ /* No event channels are 'live' right now. */
65210+ for (i = 0; i < NR_EVENT_CHANNELS; i++)
65211+ mask_evtchn(i);
65212+
65213+ /* No IRQ -> event-channel mappings. */
65214+ for (i = 0; i < NR_IRQS; i++)
65215+ irq_info[i] = IRQ_UNBOUND;
65216+
65217+ /* Dynamic IRQ space is currently unbound. Zero the refcnts. */
65218+ for (i = 0; i < NR_DYNIRQS; i++) {
65219+ irq_bindcount[dynirq_to_irq(i)] = 0;
65220+
65221+ irq_desc[dynirq_to_irq(i)].status = IRQ_DISABLED;
65222+ irq_desc[dynirq_to_irq(i)].action = NULL;
65223+ irq_desc[dynirq_to_irq(i)].depth = 1;
65224+ irq_desc[dynirq_to_irq(i)].handler = &dynirq_type;
65225+ }
65226+
65227+ /* Phys IRQ space is statically bound (1:1 mapping). Nail refcnts. */
65228+ for (i = 0; i < NR_PIRQS; i++) {
65229+ irq_bindcount[pirq_to_irq(i)] = 1;
65230+
65231+#ifdef RTC_IRQ
65232+ /* If not domain 0, force our RTC driver to fail its probe. */
65233+ if ((i == RTC_IRQ) && !is_initial_xendomain())
65234+ continue;
65235+#endif
65236+
65237+ irq_desc[pirq_to_irq(i)].status = IRQ_DISABLED;
65238+ irq_desc[pirq_to_irq(i)].action = NULL;
65239+ irq_desc[pirq_to_irq(i)].depth = 1;
65240+ irq_desc[pirq_to_irq(i)].handler = &pirq_type;
65241+ }
65242+}
65243diff -Nur linux-2.6.16.33-noxen/drivers/xen/core/features.c linux-2.6.16.33/drivers/xen/core/features.c
65244--- linux-2.6.16.33-noxen/drivers/xen/core/features.c 1970-01-01 00:00:00.000000000 +0000
65245+++ linux-2.6.16.33/drivers/xen/core/features.c 2007-01-08 15:00:45.000000000 +0000
65246@@ -0,0 +1,34 @@
65247+/******************************************************************************
65248+ * features.c
65249+ *
65250+ * Xen feature flags.
65251+ *
65252+ * Copyright (c) 2006, Ian Campbell, XenSource Inc.
65253+ */
65254+#include <linux/types.h>
65255+#include <linux/cache.h>
65256+#include <linux/module.h>
65257+#include <asm/hypervisor.h>
65258+#include <xen/features.h>
65259+
65260+#ifdef HAVE_XEN_PLATFORM_COMPAT_H
65261+#include <xen/platform-compat.h>
65262+#endif
65263+
65264+u8 xen_features[XENFEAT_NR_SUBMAPS * 32] __read_mostly;
65265+/* Not a GPL-only export: used in ubiquitous macros, so GPL-only would be too restrictive. */
65266+EXPORT_SYMBOL(xen_features);
65267+
65268+void setup_xen_features(void)
65269+{
65270+ xen_feature_info_t fi;
65271+ int i, j;
65272+
65273+ for (i = 0; i < XENFEAT_NR_SUBMAPS; i++) {
65274+ fi.submap_idx = i;
65275+ if (HYPERVISOR_xen_version(XENVER_get_features, &fi) < 0)
65276+ break;
65277+		for (j = 0; j < 32; j++)
65278+			xen_features[i*32+j] = !!(fi.submap & (1<<j));
65279+ }
65280+}
65281diff -Nur linux-2.6.16.33-noxen/drivers/xen/core/gnttab.c linux-2.6.16.33/drivers/xen/core/gnttab.c
65282--- linux-2.6.16.33-noxen/drivers/xen/core/gnttab.c 1970-01-01 00:00:00.000000000 +0000
65283+++ linux-2.6.16.33/drivers/xen/core/gnttab.c 2007-01-08 15:00:45.000000000 +0000
65284@@ -0,0 +1,488 @@
65285+/******************************************************************************
65286+ * gnttab.c
65287+ *
65288+ * Granting foreign access to our memory reservation.
65289+ *
65290+ * Copyright (c) 2005, Christopher Clark
65291+ * Copyright (c) 2004-2005, K A Fraser
65292+ *
65293+ * This program is free software; you can redistribute it and/or
65294+ * modify it under the terms of the GNU General Public License version 2
65295+ * as published by the Free Software Foundation; or, when distributed
65296+ * separately from the Linux kernel or incorporated into other
65297+ * software packages, subject to the following license:
65298+ *
65299+ * Permission is hereby granted, free of charge, to any person obtaining a copy
65300+ * of this source file (the "Software"), to deal in the Software without
65301+ * restriction, including without limitation the rights to use, copy, modify,
65302+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
65303+ * and to permit persons to whom the Software is furnished to do so, subject to
65304+ * the following conditions:
65305+ *
65306+ * The above copyright notice and this permission notice shall be included in
65307+ * all copies or substantial portions of the Software.
65308+ *
65309+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
65310+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
65311+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
65312+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
65313+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
65314+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
65315+ * IN THE SOFTWARE.
65316+ */
65317+
65318+#include <linux/config.h>
65319+#include <linux/module.h>
65320+#include <linux/sched.h>
65321+#include <linux/mm.h>
65322+#include <linux/vmalloc.h>
65323+#include <xen/interface/xen.h>
65324+#include <xen/gnttab.h>
65325+#include <asm/pgtable.h>
65326+#include <asm/uaccess.h>
65327+#include <asm/synch_bitops.h>
65328+#include <asm/io.h>
65329+#include <xen/interface/memory.h>
65330+
65331+#ifdef HAVE_XEN_PLATFORM_COMPAT_H
65332+#include <xen/platform-compat.h>
65333+#endif
65334+
65335+/* External tools reserve first few grant table entries. */
65336+#define NR_RESERVED_ENTRIES 8
65337+
65338+#define NR_GRANT_ENTRIES \
65339+ (NR_GRANT_FRAMES * PAGE_SIZE / sizeof(struct grant_entry))
65340+#define GNTTAB_LIST_END (NR_GRANT_ENTRIES + 1)
65341+
65342+static grant_ref_t gnttab_list[NR_GRANT_ENTRIES];
65343+static int gnttab_free_count;
65344+static grant_ref_t gnttab_free_head;
65345+static DEFINE_SPINLOCK(gnttab_list_lock);
65346+
65347+static struct grant_entry *shared;
65348+
65349+static struct gnttab_free_callback *gnttab_free_callback_list;
65350+
65351+static int get_free_entries(int count)
65352+{
65353+ unsigned long flags;
65354+ int ref;
65355+ grant_ref_t head;
65356+ spin_lock_irqsave(&gnttab_list_lock, flags);
65357+ if (gnttab_free_count < count) {
65358+ spin_unlock_irqrestore(&gnttab_list_lock, flags);
65359+ return -1;
65360+ }
65361+ ref = head = gnttab_free_head;
65362+ gnttab_free_count -= count;
65363+ while (count-- > 1)
65364+ head = gnttab_list[head];
65365+ gnttab_free_head = gnttab_list[head];
65366+ gnttab_list[head] = GNTTAB_LIST_END;
65367+ spin_unlock_irqrestore(&gnttab_list_lock, flags);
65368+ return ref;
65369+}
65370+
65371+#define get_free_entry() get_free_entries(1)
65372+
65373+static void do_free_callbacks(void)
65374+{
65375+ struct gnttab_free_callback *callback, *next;
65376+
65377+ callback = gnttab_free_callback_list;
65378+ gnttab_free_callback_list = NULL;
65379+
65380+ while (callback != NULL) {
65381+ next = callback->next;
65382+ if (gnttab_free_count >= callback->count) {
65383+ callback->next = NULL;
65384+ callback->fn(callback->arg);
65385+ } else {
65386+ callback->next = gnttab_free_callback_list;
65387+ gnttab_free_callback_list = callback;
65388+ }
65389+ callback = next;
65390+ }
65391+}
65392+
65393+static inline void check_free_callbacks(void)
65394+{
65395+ if (unlikely(gnttab_free_callback_list))
65396+ do_free_callbacks();
65397+}
65398+
65399+static void put_free_entry(grant_ref_t ref)
65400+{
65401+ unsigned long flags;
65402+ spin_lock_irqsave(&gnttab_list_lock, flags);
65403+ gnttab_list[ref] = gnttab_free_head;
65404+ gnttab_free_head = ref;
65405+ gnttab_free_count++;
65406+ check_free_callbacks();
65407+ spin_unlock_irqrestore(&gnttab_list_lock, flags);
65408+}
65409+
65410+/*
65411+ * Public grant-issuing interface functions
65412+ */
65413+
65414+int gnttab_grant_foreign_access(domid_t domid, unsigned long frame,
65415+ int readonly)
65416+{
65417+ int ref;
65418+
65419+ if (unlikely((ref = get_free_entry()) == -1))
65420+ return -ENOSPC;
65421+
65422+ shared[ref].frame = frame;
65423+ shared[ref].domid = domid;
65424+ wmb();
65425+ shared[ref].flags = GTF_permit_access | (readonly ? GTF_readonly : 0);
65426+
65427+ return ref;
65428+}
65429+EXPORT_SYMBOL_GPL(gnttab_grant_foreign_access);
65430+
65431+void gnttab_grant_foreign_access_ref(grant_ref_t ref, domid_t domid,
65432+ unsigned long frame, int readonly)
65433+{
65434+ shared[ref].frame = frame;
65435+ shared[ref].domid = domid;
65436+ wmb();
65437+ shared[ref].flags = GTF_permit_access | (readonly ? GTF_readonly : 0);
65438+}
65439+EXPORT_SYMBOL_GPL(gnttab_grant_foreign_access_ref);
65440+
65441+
65442+int gnttab_query_foreign_access(grant_ref_t ref)
65443+{
65444+ u16 nflags;
65445+
65446+ nflags = shared[ref].flags;
65447+
65448+ return (nflags & (GTF_reading|GTF_writing));
65449+}
65450+EXPORT_SYMBOL_GPL(gnttab_query_foreign_access);
65451+
65452+int gnttab_end_foreign_access_ref(grant_ref_t ref, int readonly)
65453+{
65454+ u16 flags, nflags;
65455+
65456+ nflags = shared[ref].flags;
65457+ do {
65458+ if ((flags = nflags) & (GTF_reading|GTF_writing)) {
65459+ printk(KERN_ALERT "WARNING: g.e. still in use!\n");
65460+ return 0;
65461+ }
65462+ } while ((nflags = synch_cmpxchg_subword(&shared[ref].flags, flags, 0)) !=
65463+ flags);
65464+
65465+ return 1;
65466+}
65467+EXPORT_SYMBOL_GPL(gnttab_end_foreign_access_ref);
65468+
65469+void gnttab_end_foreign_access(grant_ref_t ref, int readonly,
65470+ unsigned long page)
65471+{
65472+ if (gnttab_end_foreign_access_ref(ref, readonly)) {
65473+ put_free_entry(ref);
65474+ if (page != 0)
65475+ free_page(page);
65476+ } else {
65477+ /* XXX This needs to be fixed so that the ref and page are
65478+ placed on a list to be freed up later. */
65479+ printk(KERN_WARNING
65480+ "WARNING: leaking g.e. and page still in use!\n");
65481+ }
65482+}
65483+EXPORT_SYMBOL_GPL(gnttab_end_foreign_access);
65484+
65485+int gnttab_grant_foreign_transfer(domid_t domid, unsigned long pfn)
65486+{
65487+ int ref;
65488+
65489+ if (unlikely((ref = get_free_entry()) == -1))
65490+ return -ENOSPC;
65491+ gnttab_grant_foreign_transfer_ref(ref, domid, pfn);
65492+
65493+ return ref;
65494+}
65495+EXPORT_SYMBOL_GPL(gnttab_grant_foreign_transfer);
65496+
65497+void gnttab_grant_foreign_transfer_ref(grant_ref_t ref, domid_t domid,
65498+ unsigned long pfn)
65499+{
65500+ shared[ref].frame = pfn;
65501+ shared[ref].domid = domid;
65502+ wmb();
65503+ shared[ref].flags = GTF_accept_transfer;
65504+}
65505+EXPORT_SYMBOL_GPL(gnttab_grant_foreign_transfer_ref);
65506+
65507+unsigned long gnttab_end_foreign_transfer_ref(grant_ref_t ref)
65508+{
65509+ unsigned long frame;
65510+ u16 flags;
65511+
65512+ /*
65513+	 * If a transfer has not even started yet, try to reclaim the grant
65514+ * reference and return failure (== 0).
65515+ */
65516+ while (!((flags = shared[ref].flags) & GTF_transfer_committed)) {
65517+ if (synch_cmpxchg_subword(&shared[ref].flags, flags, 0) == flags)
65518+ return 0;
65519+ cpu_relax();
65520+ }
65521+
65522+ /* If a transfer is in progress then wait until it is completed. */
65523+ while (!(flags & GTF_transfer_completed)) {
65524+ flags = shared[ref].flags;
65525+ cpu_relax();
65526+ }
65527+
65528+ /* Read the frame number /after/ reading completion status. */
65529+ rmb();
65530+ frame = shared[ref].frame;
65531+ BUG_ON(frame == 0);
65532+
65533+ return frame;
65534+}
65535+EXPORT_SYMBOL_GPL(gnttab_end_foreign_transfer_ref);
65536+
65537+unsigned long gnttab_end_foreign_transfer(grant_ref_t ref)
65538+{
65539+ unsigned long frame = gnttab_end_foreign_transfer_ref(ref);
65540+ put_free_entry(ref);
65541+ return frame;
65542+}
65543+EXPORT_SYMBOL_GPL(gnttab_end_foreign_transfer);
65544+
65545+void gnttab_free_grant_reference(grant_ref_t ref)
65546+{
65547+ put_free_entry(ref);
65548+}
65549+EXPORT_SYMBOL_GPL(gnttab_free_grant_reference);
65550+
65551+void gnttab_free_grant_references(grant_ref_t head)
65552+{
65553+ grant_ref_t ref;
65554+ unsigned long flags;
65555+ int count = 1;
65556+ if (head == GNTTAB_LIST_END)
65557+ return;
65558+ spin_lock_irqsave(&gnttab_list_lock, flags);
65559+ ref = head;
65560+ while (gnttab_list[ref] != GNTTAB_LIST_END) {
65561+ ref = gnttab_list[ref];
65562+ count++;
65563+ }
65564+ gnttab_list[ref] = gnttab_free_head;
65565+ gnttab_free_head = head;
65566+ gnttab_free_count += count;
65567+ check_free_callbacks();
65568+ spin_unlock_irqrestore(&gnttab_list_lock, flags);
65569+}
65570+EXPORT_SYMBOL_GPL(gnttab_free_grant_references);
65571+
65572+int gnttab_alloc_grant_references(u16 count, grant_ref_t *head)
65573+{
65574+ int h = get_free_entries(count);
65575+
65576+ if (h == -1)
65577+ return -ENOSPC;
65578+
65579+ *head = h;
65580+
65581+ return 0;
65582+}
65583+EXPORT_SYMBOL_GPL(gnttab_alloc_grant_references);
65584+
65585+int gnttab_empty_grant_references(const grant_ref_t *private_head)
65586+{
65587+ return (*private_head == GNTTAB_LIST_END);
65588+}
65589+EXPORT_SYMBOL_GPL(gnttab_empty_grant_references);
65590+
65591+int gnttab_claim_grant_reference(grant_ref_t *private_head)
65592+{
65593+ grant_ref_t g = *private_head;
65594+ if (unlikely(g == GNTTAB_LIST_END))
65595+ return -ENOSPC;
65596+ *private_head = gnttab_list[g];
65597+ return g;
65598+}
65599+EXPORT_SYMBOL_GPL(gnttab_claim_grant_reference);
65600+
65601+void gnttab_release_grant_reference(grant_ref_t *private_head,
65602+ grant_ref_t release)
65603+{
65604+ gnttab_list[release] = *private_head;
65605+ *private_head = release;
65606+}
65607+EXPORT_SYMBOL_GPL(gnttab_release_grant_reference);
65608+
65609+void gnttab_request_free_callback(struct gnttab_free_callback *callback,
65610+ void (*fn)(void *), void *arg, u16 count)
65611+{
65612+ unsigned long flags;
65613+ spin_lock_irqsave(&gnttab_list_lock, flags);
65614+ if (callback->next)
65615+ goto out;
65616+ callback->fn = fn;
65617+ callback->arg = arg;
65618+ callback->count = count;
65619+ callback->next = gnttab_free_callback_list;
65620+ gnttab_free_callback_list = callback;
65621+ check_free_callbacks();
65622+out:
65623+ spin_unlock_irqrestore(&gnttab_list_lock, flags);
65624+}
65625+EXPORT_SYMBOL_GPL(gnttab_request_free_callback);
65626+
65627+void gnttab_cancel_free_callback(struct gnttab_free_callback *callback)
65628+{
65629+ struct gnttab_free_callback **pcb;
65630+ unsigned long flags;
65631+
65632+ spin_lock_irqsave(&gnttab_list_lock, flags);
65633+ for (pcb = &gnttab_free_callback_list; *pcb; pcb = &(*pcb)->next) {
65634+ if (*pcb == callback) {
65635+ *pcb = callback->next;
65636+ break;
65637+ }
65638+ }
65639+ spin_unlock_irqrestore(&gnttab_list_lock, flags);
65640+}
65641+EXPORT_SYMBOL_GPL(gnttab_cancel_free_callback);
65642+
65643+#ifdef CONFIG_XEN
65644+
65645+#ifndef __ia64__
65646+static int map_pte_fn(pte_t *pte, struct page *pmd_page,
65647+ unsigned long addr, void *data)
65648+{
65649+ unsigned long **frames = (unsigned long **)data;
65650+
65651+ set_pte_at(&init_mm, addr, pte, pfn_pte_ma((*frames)[0], PAGE_KERNEL));
65652+ (*frames)++;
65653+ return 0;
65654+}
65655+
65656+static int unmap_pte_fn(pte_t *pte, struct page *pmd_page,
65657+ unsigned long addr, void *data)
65658+{
65659+
65660+ set_pte_at(&init_mm, addr, pte, __pte(0));
65661+ return 0;
65662+}
65663+#endif
65664+
65665+int gnttab_resume(void)
65666+{
65667+ struct gnttab_setup_table setup;
65668+ unsigned long frames[NR_GRANT_FRAMES];
65669+ int rc;
65670+#ifndef __ia64__
65671+ void *pframes = frames;
65672+ struct vm_struct *area;
65673+#endif
65674+
65675+ setup.dom = DOMID_SELF;
65676+ setup.nr_frames = NR_GRANT_FRAMES;
65677+ set_xen_guest_handle(setup.frame_list, frames);
65678+
65679+ rc = HYPERVISOR_grant_table_op(GNTTABOP_setup_table, &setup, 1);
65680+ if (rc == -ENOSYS)
65681+ return -ENOSYS;
65682+
65683+ BUG_ON(rc || setup.status);
65684+
65685+#ifndef __ia64__
65686+ if (shared == NULL) {
65687+ area = get_vm_area(PAGE_SIZE * NR_GRANT_FRAMES, VM_IOREMAP);
65688+ BUG_ON(area == NULL);
65689+ shared = area->addr;
65690+ }
65691+ rc = apply_to_page_range(&init_mm, (unsigned long)shared,
65692+ PAGE_SIZE * NR_GRANT_FRAMES,
65693+ map_pte_fn, &pframes);
65694+ BUG_ON(rc);
65695+#else
65696+ shared = __va(frames[0] << PAGE_SHIFT);
65697+ printk("grant table at %p\n", shared);
65698+#endif
65699+
65700+ return 0;
65701+}
65702+
65703+int gnttab_suspend(void)
65704+{
65705+#ifndef __ia64__
65706+ apply_to_page_range(&init_mm, (unsigned long)shared,
65707+ PAGE_SIZE * NR_GRANT_FRAMES,
65708+ unmap_pte_fn, NULL);
65709+#endif
65710+ return 0;
65711+}
65712+
65713+#else /* !CONFIG_XEN */
65714+
65715+#include <platform-pci.h>
65716+
65717+int gnttab_resume(void)
65718+{
65719+ unsigned long frames;
65720+ struct xen_add_to_physmap xatp;
65721+ unsigned int i;
65722+
65723+ frames = alloc_xen_mmio(PAGE_SIZE * NR_GRANT_FRAMES);
65724+
65725+ for (i = 0; i < NR_GRANT_FRAMES; i++) {
65726+ xatp.domid = DOMID_SELF;
65727+ xatp.idx = i;
65728+ xatp.space = XENMAPSPACE_grant_table;
65729+ xatp.gpfn = (frames >> PAGE_SHIFT) + i;
65730+ if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp))
65731+ BUG();
65732+ }
65733+
65734+ shared = ioremap(frames, PAGE_SIZE * NR_GRANT_FRAMES);
65735+ if (shared == NULL) {
65736+		printk(KERN_ERR "Failed to ioremap gnttab shared frames\n");
65737+ return -1;
65738+ }
65739+
65740+ return 0;
65741+}
65742+
65743+int gnttab_suspend(void)
65744+{
65745+ iounmap(shared);
65746+ return 0;
65747+}
65748+
65749+#endif /* !CONFIG_XEN */
65750+
65751+int __init gnttab_init(void)
65752+{
65753+ int i;
65754+
65755+ if (!is_running_on_xen())
65756+ return -ENODEV;
65757+
65758+ if (gnttab_resume() < 0)
65759+ return -ENODEV;
65760+
65761+ for (i = NR_RESERVED_ENTRIES; i < NR_GRANT_ENTRIES; i++)
65762+ gnttab_list[i] = i + 1;
65763+ gnttab_free_count = NR_GRANT_ENTRIES - NR_RESERVED_ENTRIES;
65764+ gnttab_free_head = NR_RESERVED_ENTRIES;
65765+
65766+ printk("Grant table initialized\n");
65767+ return 0;
65768+}
65769+
65770+#ifdef CONFIG_XEN
65771+core_initcall(gnttab_init);
65772+#endif
65773diff -Nur linux-2.6.16.33-noxen/drivers/xen/core/hypervisor_sysfs.c linux-2.6.16.33/drivers/xen/core/hypervisor_sysfs.c
65774--- linux-2.6.16.33-noxen/drivers/xen/core/hypervisor_sysfs.c 1970-01-01 00:00:00.000000000 +0000
65775+++ linux-2.6.16.33/drivers/xen/core/hypervisor_sysfs.c 2007-01-08 15:00:45.000000000 +0000
65776@@ -0,0 +1,60 @@
65777+/*
65778+ * copyright (c) 2006 IBM Corporation
65779+ * Authored by: Mike D. Day <ncmike@us.ibm.com>
65780+ *
65781+ * This program is free software; you can redistribute it and/or modify
65782+ * it under the terms of the GNU General Public License version 2 as
65783+ * published by the Free Software Foundation.
65784+ */
65785+
65786+#include <linux/config.h>
65787+#include <linux/kernel.h>
65788+#include <linux/module.h>
65789+#include <linux/kobject.h>
65790+#include <xen/hypervisor_sysfs.h>
65791+
65792+decl_subsys(hypervisor, NULL, NULL);
65793+
65794+static ssize_t hyp_sysfs_show(struct kobject *kobj,
65795+ struct attribute *attr,
65796+ char *buffer)
65797+{
65798+ struct hyp_sysfs_attr *hyp_attr;
65799+ hyp_attr = container_of(attr, struct hyp_sysfs_attr, attr);
65800+ if (hyp_attr->show)
65801+ return hyp_attr->show(hyp_attr, buffer);
65802+ return 0;
65803+}
65804+
65805+static ssize_t hyp_sysfs_store(struct kobject *kobj,
65806+ struct attribute *attr,
65807+ const char *buffer,
65808+ size_t len)
65809+{
65810+ struct hyp_sysfs_attr *hyp_attr;
65811+ hyp_attr = container_of(attr, struct hyp_sysfs_attr, attr);
65812+ if (hyp_attr->store)
65813+ return hyp_attr->store(hyp_attr, buffer, len);
65814+ return 0;
65815+}
65816+
65817+struct sysfs_ops hyp_sysfs_ops = {
65818+ .show = hyp_sysfs_show,
65819+ .store = hyp_sysfs_store,
65820+};
65821+
65822+static struct kobj_type hyp_sysfs_kobj_type = {
65823+ .sysfs_ops = &hyp_sysfs_ops,
65824+};
65825+
65826+static int __init hypervisor_subsys_init(void)
65827+{
65828+ if (!is_running_on_xen())
65829+ return -ENODEV;
65830+
65831+ hypervisor_subsys.kset.kobj.ktype = &hyp_sysfs_kobj_type;
65832+ return subsystem_register(&hypervisor_subsys);
65833+}
65834+
65835+device_initcall(hypervisor_subsys_init);
65836+EXPORT_SYMBOL_GPL(hypervisor_subsys);
65837diff -Nur linux-2.6.16.33-noxen/drivers/xen/core/machine_kexec.c linux-2.6.16.33/drivers/xen/core/machine_kexec.c
65838--- linux-2.6.16.33-noxen/drivers/xen/core/machine_kexec.c 1970-01-01 00:00:00.000000000 +0000
65839+++ linux-2.6.16.33/drivers/xen/core/machine_kexec.c 2007-01-08 15:00:45.000000000 +0000
65840@@ -0,0 +1,190 @@
65841+/*
65842+ * drivers/xen/core/machine_kexec.c
65843+ * handle transition of Linux booting another kernel
65844+ */
65845+
65846+#include <linux/kexec.h>
65847+#include <xen/interface/kexec.h>
65848+#include <linux/mm.h>
65849+#include <linux/bootmem.h>
65850+#include <asm/hypercall.h>
65851+
65852+extern void machine_kexec_setup_load_arg(xen_kexec_image_t *xki,
65853+ struct kimage *image);
65854+
65855+int xen_max_nr_phys_cpus;
65856+struct resource xen_hypervisor_res;
65857+struct resource *xen_phys_cpus;
65858+
65859+void xen_machine_kexec_setup_resources(void)
65860+{
65861+ xen_kexec_range_t range;
65862+ struct resource *res;
65863+ int k = 0;
65864+
65865+ if (!is_initial_xendomain())
65866+ return;
65867+
65868+ /* determine maximum number of physical cpus */
65869+
65870+ while (1) {
65871+ memset(&range, 0, sizeof(range));
65872+ range.range = KEXEC_RANGE_MA_CPU;
65873+ range.nr = k;
65874+
65875+ if(HYPERVISOR_kexec_op(KEXEC_CMD_kexec_get_range, &range))
65876+ break;
65877+
65878+ k++;
65879+ }
65880+
65881+ if (k == 0)
65882+ return;
65883+
65884+ xen_max_nr_phys_cpus = k;
65885+
65886+ /* allocate xen_phys_cpus */
65887+
65888+ xen_phys_cpus = alloc_bootmem_low(k * sizeof(struct resource));
65889+ BUG_ON(xen_phys_cpus == NULL);
65890+
65891+ /* fill in xen_phys_cpus with per-cpu crash note information */
65892+
65893+ for (k = 0; k < xen_max_nr_phys_cpus; k++) {
65894+ memset(&range, 0, sizeof(range));
65895+ range.range = KEXEC_RANGE_MA_CPU;
65896+ range.nr = k;
65897+
65898+ if (HYPERVISOR_kexec_op(KEXEC_CMD_kexec_get_range, &range))
65899+ goto err;
65900+
65901+ res = xen_phys_cpus + k;
65902+
65903+ memset(res, 0, sizeof(*res));
65904+ res->name = "Crash note";
65905+ res->start = range.start;
65906+ res->end = range.start + range.size - 1;
65907+ res->flags = IORESOURCE_BUSY | IORESOURCE_MEM;
65908+ }
65909+
65910+ /* fill in xen_hypervisor_res with hypervisor machine address range */
65911+
65912+ memset(&range, 0, sizeof(range));
65913+ range.range = KEXEC_RANGE_MA_XEN;
65914+
65915+ if (HYPERVISOR_kexec_op(KEXEC_CMD_kexec_get_range, &range))
65916+ goto err;
65917+
65918+ xen_hypervisor_res.name = "Hypervisor code and data";
65919+ xen_hypervisor_res.start = range.start;
65920+ xen_hypervisor_res.end = range.start + range.size - 1;
65921+ xen_hypervisor_res.flags = IORESOURCE_BUSY | IORESOURCE_MEM;
65922+
65923+ /* fill in crashk_res if range is reserved by hypervisor */
65924+
65925+ memset(&range, 0, sizeof(range));
65926+ range.range = KEXEC_RANGE_MA_CRASH;
65927+
65928+ if (HYPERVISOR_kexec_op(KEXEC_CMD_kexec_get_range, &range))
65929+ return;
65930+
65931+ if (range.size) {
65932+ crashk_res.start = range.start;
65933+ crashk_res.end = range.start + range.size - 1;
65934+ }
65935+
65936+ return;
65937+
65938+ err:
65939+ /*
65940+ * It isn't possible to free xen_phys_cpus this early in the
65941+ * boot. Since failure at this stage is unexpected and the
65942+	 * amount is small, we leak the memory.
65943+ */
65944+ xen_max_nr_phys_cpus = 0;
65945+ return;
65946+}
65947+
65948+void xen_machine_kexec_register_resources(struct resource *res)
65949+{
65950+ int k;
65951+
65952+ request_resource(res, &xen_hypervisor_res);
65953+
65954+ for (k = 0; k < xen_max_nr_phys_cpus; k++)
65955+ request_resource(&xen_hypervisor_res, xen_phys_cpus + k);
65956+
65957+}
65958+
65959+static void setup_load_arg(xen_kexec_image_t *xki, struct kimage *image)
65960+{
65961+ machine_kexec_setup_load_arg(xki, image);
65962+
65963+ xki->indirection_page = image->head;
65964+ xki->start_address = image->start;
65965+}
65966+
65967+/*
65968+ * Load the image into Xen so Xen can kdump itself.
65969+ * This might have been done in prepare, but prepare
65970+ * is currently called too early. It might make sense
65971+ * to move prepare, but for now, just add an extra hook.
65972+ */
65973+int xen_machine_kexec_load(struct kimage *image)
65974+{
65975+ xen_kexec_load_t xkl;
65976+
65977+ memset(&xkl, 0, sizeof(xkl));
65978+ xkl.type = image->type;
65979+ setup_load_arg(&xkl.image, image);
65980+ return HYPERVISOR_kexec_op(KEXEC_CMD_kexec_load, &xkl);
65981+}
65982+
65983+/*
65984+ * Unload the image that was stored by machine_kexec_load().
65985+ * This might have been done in machine_kexec_cleanup(), but it
65986+ * is called too late, and it's possible Xen could try to kdump
65987+ * using resources that have been freed.
65988+ */
65989+void xen_machine_kexec_unload(struct kimage *image)
65990+{
65991+ xen_kexec_load_t xkl;
65992+
65993+ memset(&xkl, 0, sizeof(xkl));
65994+ xkl.type = image->type;
65995+ HYPERVISOR_kexec_op(KEXEC_CMD_kexec_unload, &xkl);
65996+}
65997+
65998+/*
65999+ * Do not allocate memory (or fail in any way) in machine_kexec().
66000+ * We are past the point of no return, committed to rebooting now.
66001+ *
66002+ * This has the hypervisor move to the preferred reboot CPU,
66003+ * stop all CPUs and kexec. That is, it combines machine_shutdown()
66004+ * and machine_kexec() in Linux kexec terms.
66005+ */
66006+NORET_TYPE void machine_kexec(struct kimage *image)
66007+{
66008+ xen_kexec_exec_t xke;
66009+
66010+ memset(&xke, 0, sizeof(xke));
66011+ xke.type = image->type;
66012+ HYPERVISOR_kexec_op(KEXEC_CMD_kexec, &xke);
66013+ panic("KEXEC_CMD_kexec hypercall should not return\n");
66014+}
66015+
66016+void machine_shutdown(void)
66017+{
66018+ /* do nothing */
66019+}
66020+
66021+
66022+/*
66023+ * Local variables:
66024+ * c-file-style: "linux"
66025+ * indent-tabs-mode: t
66026+ * c-indent-level: 8
66027+ * c-basic-offset: 8
66028+ * tab-width: 8
66029+ * End:
66030+ */
66031diff -Nur linux-2.6.16.33-noxen/drivers/xen/core/machine_reboot.c linux-2.6.16.33/drivers/xen/core/machine_reboot.c
66032--- linux-2.6.16.33-noxen/drivers/xen/core/machine_reboot.c 1970-01-01 00:00:00.000000000 +0000
66033+++ linux-2.6.16.33/drivers/xen/core/machine_reboot.c 2007-01-08 15:00:45.000000000 +0000
66034@@ -0,0 +1,185 @@
66035+#define __KERNEL_SYSCALLS__
66036+#include <linux/version.h>
66037+#include <linux/kernel.h>
66038+#include <linux/mm.h>
66039+#include <linux/unistd.h>
66040+#include <linux/module.h>
66041+#include <linux/reboot.h>
66042+#include <linux/sysrq.h>
66043+#include <linux/stringify.h>
66044+#include <asm/irq.h>
66045+#include <asm/mmu_context.h>
66046+#include <xen/evtchn.h>
66047+#include <asm/hypervisor.h>
66048+#include <xen/interface/dom0_ops.h>
66049+#include <xen/xenbus.h>
66050+#include <linux/cpu.h>
66051+#include <linux/kthread.h>
66052+#include <xen/gnttab.h>
66053+#include <xen/xencons.h>
66054+#include <xen/cpu_hotplug.h>
66055+
66056+#if defined(__i386__) || defined(__x86_64__)
66057+
66058+/*
66059+ * Power off function, if any
66060+ */
66061+void (*pm_power_off)(void);
66062+EXPORT_SYMBOL(pm_power_off);
66063+
66064+void machine_emergency_restart(void)
66065+{
66066+ /* We really want to get pending console data out before we die. */
66067+ xencons_force_flush();
66068+ HYPERVISOR_shutdown(SHUTDOWN_reboot);
66069+}
66070+
66071+void machine_restart(char * __unused)
66072+{
66073+ machine_emergency_restart();
66074+}
66075+
66076+void machine_halt(void)
66077+{
66078+ machine_power_off();
66079+}
66080+
66081+void machine_power_off(void)
66082+{
66083+ /* We really want to get pending console data out before we die. */
66084+ xencons_force_flush();
66085+ if (pm_power_off)
66086+ pm_power_off();
66087+ HYPERVISOR_shutdown(SHUTDOWN_poweroff);
66088+}
66089+
66090+int reboot_thru_bios = 0; /* for dmi_scan.c */
66091+EXPORT_SYMBOL(machine_restart);
66092+EXPORT_SYMBOL(machine_halt);
66093+EXPORT_SYMBOL(machine_power_off);
66094+
66095+/* Ensure we run on the idle task page tables so that we will
66096+ switch page tables before running user space. This is needed
66097+ on architectures with separate kernel and user page tables
66098+ because the user page table pointer is not saved/restored. */
66099+static void switch_idle_mm(void)
66100+{
66101+ struct mm_struct *mm = current->active_mm;
66102+
66103+ if (mm == &init_mm)
66104+ return;
66105+
66106+ atomic_inc(&init_mm.mm_count);
66107+ switch_mm(mm, &init_mm, current);
66108+ current->active_mm = &init_mm;
66109+ mmdrop(mm);
66110+}
66111+
66112+static void pre_suspend(void)
66113+{
66114+ HYPERVISOR_shared_info = (shared_info_t *)empty_zero_page;
66115+ clear_fixmap(FIX_SHARED_INFO);
66116+
66117+ xen_start_info->store_mfn = mfn_to_pfn(xen_start_info->store_mfn);
66118+ xen_start_info->console.domU.mfn =
66119+ mfn_to_pfn(xen_start_info->console.domU.mfn);
66120+}
66121+
66122+static void post_suspend(void)
66123+{
66124+ int i, j, k, fpp;
66125+ extern unsigned long max_pfn;
66126+ extern unsigned long *pfn_to_mfn_frame_list_list;
66127+ extern unsigned long *pfn_to_mfn_frame_list[];
66128+
66129+ set_fixmap(FIX_SHARED_INFO, xen_start_info->shared_info);
66130+
66131+ HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO);
66132+
66133+ memset(empty_zero_page, 0, PAGE_SIZE);
66134+
66135+ HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
66136+ virt_to_mfn(pfn_to_mfn_frame_list_list);
66137+
66138+ fpp = PAGE_SIZE/sizeof(unsigned long);
66139+ for (i = 0, j = 0, k = -1; i < max_pfn; i += fpp, j++) {
66140+ if ((j % fpp) == 0) {
66141+ k++;
66142+ pfn_to_mfn_frame_list_list[k] =
66143+ virt_to_mfn(pfn_to_mfn_frame_list[k]);
66144+ j = 0;
66145+ }
66146+ pfn_to_mfn_frame_list[k][j] =
66147+ virt_to_mfn(&phys_to_machine_mapping[i]);
66148+ }
66149+ HYPERVISOR_shared_info->arch.max_pfn = max_pfn;
66150+}
66151+
66152+#else /* !(defined(__i386__) || defined(__x86_64__)) */
66153+
66154+#define switch_idle_mm() ((void)0)
66155+#define mm_pin_all() ((void)0)
66156+#define pre_suspend() ((void)0)
66157+#define post_suspend() ((void)0)
66158+
66159+#endif
66160+
66161+int __xen_suspend(void)
66162+{
66163+ int err;
66164+
66165+ extern void time_resume(void);
66166+
66167+ BUG_ON(smp_processor_id() != 0);
66168+ BUG_ON(in_interrupt());
66169+
66170+#if defined(__i386__) || defined(__x86_64__)
66171+ if (xen_feature(XENFEAT_auto_translated_physmap)) {
66172+ printk(KERN_WARNING "Cannot suspend in "
66173+ "auto_translated_physmap mode.\n");
66174+ return -EOPNOTSUPP;
66175+ }
66176+#endif
66177+
66178+ err = smp_suspend();
66179+ if (err)
66180+ return err;
66181+
66182+ xenbus_suspend();
66183+
66184+ preempt_disable();
66185+
66186+ mm_pin_all();
66187+ local_irq_disable();
66188+ preempt_enable();
66189+
66190+ gnttab_suspend();
66191+
66192+ pre_suspend();
66193+
66194+ /*
66195+ * We'll stop somewhere inside this hypercall. When it returns,
66196+ * we'll start resuming after the restore.
66197+ */
66198+ HYPERVISOR_suspend(virt_to_mfn(xen_start_info));
66199+
66200+ post_suspend();
66201+
66202+ gnttab_resume();
66203+
66204+ irq_resume();
66205+
66206+ time_resume();
66207+
66208+ switch_idle_mm();
66209+
66210+ local_irq_enable();
66211+
66212+ xencons_resume();
66213+
66214+ xenbus_resume();
66215+
66216+ smp_resume();
66217+
66218+ return err;
66219+}
66220diff -Nur linux-2.6.16.33-noxen/drivers/xen/core/reboot.c linux-2.6.16.33/drivers/xen/core/reboot.c
66221--- linux-2.6.16.33-noxen/drivers/xen/core/reboot.c 1970-01-01 00:00:00.000000000 +0000
66222+++ linux-2.6.16.33/drivers/xen/core/reboot.c 2007-01-08 15:00:45.000000000 +0000
66223@@ -0,0 +1,220 @@
66224+#define __KERNEL_SYSCALLS__
66225+#include <linux/version.h>
66226+#include <linux/kernel.h>
66227+#include <linux/unistd.h>
66228+#include <linux/module.h>
66229+#include <linux/reboot.h>
66230+#include <linux/sysrq.h>
66231+#include <asm/hypervisor.h>
66232+#include <xen/xenbus.h>
66233+#include <linux/kthread.h>
66234+
66235+MODULE_LICENSE("Dual BSD/GPL");
66236+
66237+#define SHUTDOWN_INVALID -1
66238+#define SHUTDOWN_POWEROFF 0
66239+#define SHUTDOWN_SUSPEND 2
66240+/* Code 3 is SHUTDOWN_CRASH, which we don't use because the domain can only
66241+ * report a crash, not be instructed to crash!
66242+ * HALT is the same as POWEROFF, as far as we're concerned. The tools use
66243+ * the distinction when we return the reason code to them.
66244+ */
66245+#define SHUTDOWN_HALT 4
66246+
66247+/* Ignore multiple shutdown requests. */
66248+static int shutting_down = SHUTDOWN_INVALID;
66249+
66250+static void __shutdown_handler(void *unused);
66251+static DECLARE_WORK(shutdown_work, __shutdown_handler, NULL);
66252+
66253+#ifdef CONFIG_XEN
66254+int __xen_suspend(void);
66255+#else
66256+#define __xen_suspend() (void)0
66257+#endif
66258+
66259+static int shutdown_process(void *__unused)
66260+{
66261+ static char *envp[] = { "HOME=/", "TERM=linux",
66262+ "PATH=/sbin:/usr/sbin:/bin:/usr/bin", NULL };
66263+ static char *poweroff_argv[] = { "/sbin/poweroff", NULL };
66264+
66265+ extern asmlinkage long sys_reboot(int magic1, int magic2,
66266+ unsigned int cmd, void *arg);
66267+
66268+ if ((shutting_down == SHUTDOWN_POWEROFF) ||
66269+ (shutting_down == SHUTDOWN_HALT)) {
66270+ if (call_usermodehelper("/sbin/poweroff", poweroff_argv, envp, 0) < 0) {
66271+#ifdef CONFIG_XEN
66272+ sys_reboot(LINUX_REBOOT_MAGIC1,
66273+ LINUX_REBOOT_MAGIC2,
66274+ LINUX_REBOOT_CMD_POWER_OFF,
66275+ NULL);
66276+#endif /* CONFIG_XEN */
66277+ }
66278+ }
66279+
66280+ shutting_down = SHUTDOWN_INVALID; /* could try again */
66281+
66282+ return 0;
66283+}
66284+
66285+static int xen_suspend(void *__unused)
66286+{
66287+ __xen_suspend();
66288+ shutting_down = SHUTDOWN_INVALID;
66289+ return 0;
66290+}
66291+
66292+static int kthread_create_on_cpu(int (*f)(void *arg),
66293+ void *arg,
66294+ const char *name,
66295+ int cpu)
66296+{
66297+ struct task_struct *p;
66298+ p = kthread_create(f, arg, name);
66299+ if (IS_ERR(p))
66300+ return PTR_ERR(p);
66301+ kthread_bind(p, cpu);
66302+ wake_up_process(p);
66303+ return 0;
66304+}
66305+
66306+static void __shutdown_handler(void *unused)
66307+{
66308+ int err;
66309+
66310+ if (shutting_down != SHUTDOWN_SUSPEND)
66311+ err = kernel_thread(shutdown_process, NULL,
66312+ CLONE_FS | CLONE_FILES);
66313+ else
66314+ err = kthread_create_on_cpu(xen_suspend, NULL, "suspend", 0);
66315+
66316+ if (err < 0) {
66317+ printk(KERN_WARNING "Error creating shutdown process (%d): "
66318+ "retrying...\n", -err);
66319+ schedule_delayed_work(&shutdown_work, HZ/2);
66320+ }
66321+}
66322+
66323+static void shutdown_handler(struct xenbus_watch *watch,
66324+ const char **vec, unsigned int len)
66325+{
66326+ char *str;
66327+ struct xenbus_transaction xbt;
66328+ int err;
66329+
66330+ if (shutting_down != SHUTDOWN_INVALID)
66331+ return;
66332+
66333+ again:
66334+ err = xenbus_transaction_start(&xbt);
66335+ if (err)
66336+ return;
66337+ str = (char *)xenbus_read(xbt, "control", "shutdown", NULL);
66338+ /* Ignore read errors and empty reads. */
66339+ if (XENBUS_IS_ERR_READ(str)) {
66340+ xenbus_transaction_end(xbt, 1);
66341+ return;
66342+ }
66343+
66344+ xenbus_write(xbt, "control", "shutdown", "");
66345+
66346+ err = xenbus_transaction_end(xbt, 0);
66347+ if (err == -EAGAIN) {
66348+ kfree(str);
66349+ goto again;
66350+ }
66351+
66352+ if (strcmp(str, "poweroff") == 0)
66353+ shutting_down = SHUTDOWN_POWEROFF;
66354+ else if (strcmp(str, "reboot") == 0)
66355+ kill_proc(1, SIGINT, 1); /* interrupt init */
66356+ else if (strcmp(str, "suspend") == 0)
66357+ shutting_down = SHUTDOWN_SUSPEND;
66358+ else if (strcmp(str, "halt") == 0)
66359+ shutting_down = SHUTDOWN_HALT;
66360+ else {
66361+ printk("Ignoring shutdown request: %s\n", str);
66362+ shutting_down = SHUTDOWN_INVALID;
66363+ }
66364+
66365+ if (shutting_down != SHUTDOWN_INVALID)
66366+ schedule_work(&shutdown_work);
66367+
66368+ kfree(str);
66369+}
66370+
66371+static void sysrq_handler(struct xenbus_watch *watch, const char **vec,
66372+ unsigned int len)
66373+{
66374+ char sysrq_key = '\0';
66375+ struct xenbus_transaction xbt;
66376+ int err;
66377+
66378+ again:
66379+ err = xenbus_transaction_start(&xbt);
66380+ if (err)
66381+ return;
66382+ if (!xenbus_scanf(xbt, "control", "sysrq", "%c", &sysrq_key)) {
66383+ printk(KERN_ERR "Unable to read sysrq code in "
66384+ "control/sysrq\n");
66385+ xenbus_transaction_end(xbt, 1);
66386+ return;
66387+ }
66388+
66389+ if (sysrq_key != '\0')
66390+ xenbus_printf(xbt, "control", "sysrq", "%c", '\0');
66391+
66392+ err = xenbus_transaction_end(xbt, 0);
66393+ if (err == -EAGAIN)
66394+ goto again;
66395+
66396+#ifdef CONFIG_MAGIC_SYSRQ
66397+ if (sysrq_key != '\0')
66398+ handle_sysrq(sysrq_key, NULL, NULL);
66399+#endif
66400+}
66401+
66402+static struct xenbus_watch shutdown_watch = {
66403+ .node = "control/shutdown",
66404+ .callback = shutdown_handler
66405+};
66406+
66407+static struct xenbus_watch sysrq_watch = {
66408+	.node = "control/sysrq",
66409+ .callback = sysrq_handler
66410+};
66411+
66412+static int setup_shutdown_watcher(struct notifier_block *notifier,
66413+ unsigned long event,
66414+ void *data)
66415+{
66416+ int err;
66417+
66418+ err = register_xenbus_watch(&shutdown_watch);
66419+ if (err)
66420+ printk(KERN_ERR "Failed to set shutdown watcher\n");
66421+ else
66422+ xenbus_write(XBT_NIL, "control", "feature-reboot", "1");
66423+
66424+ err = register_xenbus_watch(&sysrq_watch);
66425+ if (err)
66426+ printk(KERN_ERR "Failed to set sysrq watcher\n");
66427+ else
66428+ xenbus_write(XBT_NIL, "control", "feature-sysrq", "1");
66429+
66430+ return NOTIFY_DONE;
66431+}
66432+
66433+static int __init setup_shutdown_event(void)
66434+{
66435+ static struct notifier_block xenstore_notifier = {
66436+ .notifier_call = setup_shutdown_watcher
66437+ };
66438+ register_xenstore_notifier(&xenstore_notifier);
66439+
66440+ return 0;
66441+}
66442+
66443+subsys_initcall(setup_shutdown_event);
66444diff -Nur linux-2.6.16.33-noxen/drivers/xen/core/skbuff.c linux-2.6.16.33/drivers/xen/core/skbuff.c
66445--- linux-2.6.16.33-noxen/drivers/xen/core/skbuff.c 1970-01-01 00:00:00.000000000 +0000
66446+++ linux-2.6.16.33/drivers/xen/core/skbuff.c 2007-01-08 15:00:45.000000000 +0000
66447@@ -0,0 +1,145 @@
66448+
66449+#include <linux/config.h>
66450+#include <linux/module.h>
66451+#include <linux/version.h>
66452+#include <linux/kernel.h>
66453+#include <linux/sched.h>
66454+#include <linux/slab.h>
66455+#include <linux/netdevice.h>
66456+#include <linux/inetdevice.h>
66457+#include <linux/etherdevice.h>
66458+#include <linux/skbuff.h>
66459+#include <linux/init.h>
66460+#include <asm/io.h>
66461+#include <asm/page.h>
66462+#include <asm/hypervisor.h>
66463+
66464+/* Referenced in netback.c. */
66465+/*static*/ kmem_cache_t *skbuff_cachep;
66466+EXPORT_SYMBOL(skbuff_cachep);
66467+
66468+/* Allow up to 64kB or page-sized packets (whichever is greater). */
66469+#if PAGE_SHIFT < 16
66470+#define MAX_SKBUFF_ORDER (16 - PAGE_SHIFT)
66471+#else
66472+#define MAX_SKBUFF_ORDER 0
66473+#endif
66474+static kmem_cache_t *skbuff_order_cachep[MAX_SKBUFF_ORDER + 1];
66475+
66476+static struct {
66477+ int size;
66478+ kmem_cache_t *cachep;
66479+} skbuff_small[] = { { 512, NULL }, { 2048, NULL } };
66480+
66481+struct sk_buff *__alloc_skb(unsigned int length, gfp_t gfp_mask,
66482+ int fclone)
66483+{
66484+ int order, i;
66485+ kmem_cache_t *cachep;
66486+
66487+ length = SKB_DATA_ALIGN(length) + sizeof(struct skb_shared_info);
66488+
66489+ if (length <= skbuff_small[ARRAY_SIZE(skbuff_small)-1].size) {
66490+ for (i = 0; skbuff_small[i].size < length; i++)
66491+ continue;
66492+ cachep = skbuff_small[i].cachep;
66493+ } else {
66494+ order = get_order(length);
66495+ if (order > MAX_SKBUFF_ORDER) {
66496+ printk(KERN_ALERT "Attempt to allocate order %d "
66497+ "skbuff. Increase MAX_SKBUFF_ORDER.\n", order);
66498+ return NULL;
66499+ }
66500+ cachep = skbuff_order_cachep[order];
66501+ }
66502+
66503+ length -= sizeof(struct skb_shared_info);
66504+
66505+ return alloc_skb_from_cache(cachep, length, gfp_mask, fclone);
66506+}
66507+
66508+struct sk_buff *__dev_alloc_skb(unsigned int length, gfp_t gfp_mask)
66509+{
66510+ struct sk_buff *skb;
66511+ int order;
66512+
66513+ length = SKB_DATA_ALIGN(length + 16);
66514+ order = get_order(length + sizeof(struct skb_shared_info));
66515+ if (order > MAX_SKBUFF_ORDER) {
66516+ printk(KERN_ALERT "Attempt to allocate order %d skbuff. "
66517+ "Increase MAX_SKBUFF_ORDER.\n", order);
66518+ return NULL;
66519+ }
66520+
66521+ skb = alloc_skb_from_cache(
66522+ skbuff_order_cachep[order], length, gfp_mask, 0);
66523+ if (skb != NULL)
66524+ skb_reserve(skb, 16);
66525+
66526+ return skb;
66527+}
66528+
66529+static void skbuff_ctor(void *buf, kmem_cache_t *cachep, unsigned long unused)
66530+{
66531+ int order = 0;
66532+
66533+ while (skbuff_order_cachep[order] != cachep)
66534+ order++;
66535+
66536+ /* Do our best to allocate contiguous memory but fall back to IOMMU. */
66537+ if (order != 0)
66538+ (void)xen_create_contiguous_region(
66539+ (unsigned long)buf, order, 0);
66540+
66541+ scrub_pages(buf, 1 << order);
66542+}
66543+
66544+static void skbuff_dtor(void *buf, kmem_cache_t *cachep, unsigned long unused)
66545+{
66546+ int order = 0;
66547+
66548+ while (skbuff_order_cachep[order] != cachep)
66549+ order++;
66550+
66551+ if (order != 0)
66552+ xen_destroy_contiguous_region((unsigned long)buf, order);
66553+}
66554+
66555+static int __init skbuff_init(void)
66556+{
66557+ static char name[MAX_SKBUFF_ORDER + 1][20];
66558+ static char small_name[ARRAY_SIZE(skbuff_small)][20];
66559+ unsigned long size;
66560+ int i, order;
66561+
66562+ for (i = 0; i < ARRAY_SIZE(skbuff_small); i++) {
66563+ size = skbuff_small[i].size;
66564+ sprintf(small_name[i], "xen-skb-%lu", size);
66565+ /*
66566+ * No ctor/dtor: objects do not span page boundaries, and they
66567+	 * are only used on the transmit path, so there is no need for scrubbing.
66568+ */
66569+ skbuff_small[i].cachep = kmem_cache_create(
66570+ small_name[i], size, size, 0, NULL, NULL);
66571+ }
66572+
66573+ for (order = 0; order <= MAX_SKBUFF_ORDER; order++) {
66574+ size = PAGE_SIZE << order;
66575+ sprintf(name[order], "xen-skb-%lu", size);
66576+ if (is_running_on_xen() && is_initial_xendomain())
66577+ skbuff_order_cachep[order] = kmem_cache_create(
66578+ name[order], size, size, 0,
66579+ skbuff_ctor, skbuff_dtor);
66580+ else
66581+ skbuff_order_cachep[order] = kmem_cache_create(
66582+ name[order], size, size, 0, NULL, NULL);
66583+
66584+ }
66585+
66586+ skbuff_cachep = skbuff_order_cachep[0];
66587+
66588+ return 0;
66589+}
66590+core_initcall(skbuff_init);
66591+
66592+EXPORT_SYMBOL(__dev_alloc_skb);
66593diff -Nur linux-2.6.16.33-noxen/drivers/xen/core/smpboot.c linux-2.6.16.33/drivers/xen/core/smpboot.c
66594--- linux-2.6.16.33-noxen/drivers/xen/core/smpboot.c 1970-01-01 00:00:00.000000000 +0000
66595+++ linux-2.6.16.33/drivers/xen/core/smpboot.c 2007-01-08 15:00:45.000000000 +0000
66596@@ -0,0 +1,459 @@
66597+/*
66598+ * Xen SMP booting functions
66599+ *
66600+ * See arch/i386/kernel/smpboot.c for copyright and credits for derived
66601+ * portions of this file.
66602+ */
66603+
66604+#include <linux/module.h>
66605+#include <linux/config.h>
66606+#include <linux/init.h>
66607+#include <linux/kernel.h>
66608+#include <linux/mm.h>
66609+#include <linux/sched.h>
66610+#include <linux/kernel_stat.h>
66611+#include <linux/smp_lock.h>
66612+#include <linux/irq.h>
66613+#include <linux/bootmem.h>
66614+#include <linux/notifier.h>
66615+#include <linux/cpu.h>
66616+#include <linux/percpu.h>
66617+#include <asm/desc.h>
66618+#include <asm/arch_hooks.h>
66619+#include <asm/pgalloc.h>
66620+#include <xen/evtchn.h>
66621+#include <xen/interface/vcpu.h>
66622+#include <xen/cpu_hotplug.h>
66623+#include <xen/xenbus.h>
66624+
66625+#ifdef CONFIG_SMP_ALTERNATIVES
66626+#include <asm/smp_alt.h>
66627+#endif
66628+
66629+extern irqreturn_t smp_reschedule_interrupt(int, void *, struct pt_regs *);
66630+extern irqreturn_t smp_call_function_interrupt(int, void *, struct pt_regs *);
66631+
66632+extern int local_setup_timer(unsigned int cpu);
66633+extern void local_teardown_timer(unsigned int cpu);
66634+
66635+extern void hypervisor_callback(void);
66636+extern void failsafe_callback(void);
66637+extern void system_call(void);
66638+extern void smp_trap_init(trap_info_t *);
66639+
66640+/* Number of siblings per CPU package */
66641+int smp_num_siblings = 1;
66642+int phys_proc_id[NR_CPUS]; /* Package ID of each logical CPU */
66643+EXPORT_SYMBOL(phys_proc_id);
66644+int cpu_core_id[NR_CPUS]; /* Core ID of each logical CPU */
66645+EXPORT_SYMBOL(cpu_core_id);
66646+
66647+cpumask_t cpu_online_map;
66648+EXPORT_SYMBOL(cpu_online_map);
66649+cpumask_t cpu_possible_map;
66650+EXPORT_SYMBOL(cpu_possible_map);
66651+
66652+struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned;
66653+EXPORT_SYMBOL(cpu_data);
66654+
66655+#ifdef CONFIG_HOTPLUG_CPU
66656+DEFINE_PER_CPU(int, cpu_state) = { 0 };
66657+#endif
66658+
66659+static DEFINE_PER_CPU(int, resched_irq);
66660+static DEFINE_PER_CPU(int, callfunc_irq);
66661+static char resched_name[NR_CPUS][15];
66662+static char callfunc_name[NR_CPUS][15];
66663+
66664+u8 cpu_2_logical_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
66665+
66666+void *xquad_portio;
66667+
66668+cpumask_t cpu_sibling_map[NR_CPUS] __cacheline_aligned;
66669+cpumask_t cpu_core_map[NR_CPUS] __cacheline_aligned;
66670+EXPORT_SYMBOL(cpu_core_map);
66671+
66672+#if defined(__i386__)
66673+u8 x86_cpu_to_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = 0xff };
66674+EXPORT_SYMBOL(x86_cpu_to_apicid);
66675+#elif !defined(CONFIG_X86_IO_APIC)
66676+unsigned int maxcpus = NR_CPUS;
66677+#endif
66678+
66679+void __init prefill_possible_map(void)
66680+{
66681+ int i, rc;
66682+
66683+ if (!cpus_empty(cpu_possible_map))
66684+ return;
66685+
66686+ for (i = 0; i < NR_CPUS; i++) {
66687+ rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL);
66688+ if (rc >= 0)
66689+ cpu_set(i, cpu_possible_map);
66690+ }
66691+}
66692+
66693+void __init smp_alloc_memory(void)
66694+{
66695+}
66696+
66697+static inline void
66698+set_cpu_sibling_map(int cpu)
66699+{
66700+ phys_proc_id[cpu] = cpu;
66701+ cpu_core_id[cpu] = 0;
66702+
66703+ cpu_sibling_map[cpu] = cpumask_of_cpu(cpu);
66704+ cpu_core_map[cpu] = cpumask_of_cpu(cpu);
66705+
66706+ cpu_data[cpu].booted_cores = 1;
66707+}
66708+
66709+static void
66710+remove_siblinginfo(int cpu)
66711+{
66712+ phys_proc_id[cpu] = BAD_APICID;
66713+ cpu_core_id[cpu] = BAD_APICID;
66714+
66715+ cpus_clear(cpu_sibling_map[cpu]);
66716+ cpus_clear(cpu_core_map[cpu]);
66717+
66718+ cpu_data[cpu].booted_cores = 0;
66719+}
66720+
66721+static int xen_smp_intr_init(unsigned int cpu)
66722+{
66723+ int rc;
66724+
66725+ per_cpu(resched_irq, cpu) = per_cpu(callfunc_irq, cpu) = -1;
66726+
66727+ sprintf(resched_name[cpu], "resched%d", cpu);
66728+ rc = bind_ipi_to_irqhandler(RESCHEDULE_VECTOR,
66729+ cpu,
66730+ smp_reschedule_interrupt,
66731+ SA_INTERRUPT,
66732+ resched_name[cpu],
66733+ NULL);
66734+ if (rc < 0)
66735+ goto fail;
66736+ per_cpu(resched_irq, cpu) = rc;
66737+
66738+ sprintf(callfunc_name[cpu], "callfunc%d", cpu);
66739+ rc = bind_ipi_to_irqhandler(CALL_FUNCTION_VECTOR,
66740+ cpu,
66741+ smp_call_function_interrupt,
66742+ SA_INTERRUPT,
66743+ callfunc_name[cpu],
66744+ NULL);
66745+ if (rc < 0)
66746+ goto fail;
66747+ per_cpu(callfunc_irq, cpu) = rc;
66748+
66749+ if ((cpu != 0) && ((rc = local_setup_timer(cpu)) != 0))
66750+ goto fail;
66751+
66752+ return 0;
66753+
66754+ fail:
66755+ if (per_cpu(resched_irq, cpu) >= 0)
66756+ unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL);
66757+ if (per_cpu(callfunc_irq, cpu) >= 0)
66758+ unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL);
66759+ return rc;
66760+}
66761+
66762+#ifdef CONFIG_HOTPLUG_CPU
66763+static void xen_smp_intr_exit(unsigned int cpu)
66764+{
66765+ if (cpu != 0)
66766+ local_teardown_timer(cpu);
66767+
66768+ unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL);
66769+ unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL);
66770+}
66771+#endif
66772+
66773+void cpu_bringup(void)
66774+{
66775+ cpu_init();
66776+ touch_softlockup_watchdog();
66777+ preempt_disable();
66778+ local_irq_enable();
66779+}
66780+
66781+static void cpu_bringup_and_idle(void)
66782+{
66783+ cpu_bringup();
66784+ cpu_idle();
66785+}
66786+
66787+void cpu_initialize_context(unsigned int cpu)
66788+{
66789+ vcpu_guest_context_t ctxt;
66790+ struct task_struct *idle = idle_task(cpu);
66791+#ifdef __x86_64__
66792+ struct desc_ptr *gdt_descr = &cpu_gdt_descr[cpu];
66793+#else
66794+ struct Xgt_desc_struct *gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
66795+#endif
66796+
66797+ if (cpu == 0)
66798+ return;
66799+
66800+ memset(&ctxt, 0, sizeof(ctxt));
66801+
66802+ ctxt.flags = VGCF_IN_KERNEL;
66803+ ctxt.user_regs.ds = __USER_DS;
66804+ ctxt.user_regs.es = __USER_DS;
66805+ ctxt.user_regs.fs = 0;
66806+ ctxt.user_regs.gs = 0;
66807+ ctxt.user_regs.ss = __KERNEL_DS;
66808+ ctxt.user_regs.eip = (unsigned long)cpu_bringup_and_idle;
66809+ ctxt.user_regs.eflags = X86_EFLAGS_IF | 0x1000; /* IOPL_RING1 */
66810+
66811+ memset(&ctxt.fpu_ctxt, 0, sizeof(ctxt.fpu_ctxt));
66812+
66813+ smp_trap_init(ctxt.trap_ctxt);
66814+
66815+ ctxt.ldt_ents = 0;
66816+
66817+ ctxt.gdt_frames[0] = virt_to_mfn(gdt_descr->address);
66818+ ctxt.gdt_ents = gdt_descr->size / 8;
66819+
66820+#ifdef __i386__
66821+ ctxt.user_regs.cs = __KERNEL_CS;
66822+ ctxt.user_regs.esp = idle->thread.esp0 - sizeof(struct pt_regs);
66823+
66824+ ctxt.kernel_ss = __KERNEL_DS;
66825+ ctxt.kernel_sp = idle->thread.esp0;
66826+
66827+ ctxt.event_callback_cs = __KERNEL_CS;
66828+ ctxt.event_callback_eip = (unsigned long)hypervisor_callback;
66829+ ctxt.failsafe_callback_cs = __KERNEL_CS;
66830+ ctxt.failsafe_callback_eip = (unsigned long)failsafe_callback;
66831+
66832+ ctxt.ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(swapper_pg_dir));
66833+#else /* __x86_64__ */
66834+ ctxt.user_regs.cs = __KERNEL_CS;
66835+ ctxt.user_regs.esp = idle->thread.rsp0 - sizeof(struct pt_regs);
66836+
66837+ ctxt.kernel_ss = __KERNEL_DS;
66838+ ctxt.kernel_sp = idle->thread.rsp0;
66839+
66840+ ctxt.event_callback_eip = (unsigned long)hypervisor_callback;
66841+ ctxt.failsafe_callback_eip = (unsigned long)failsafe_callback;
66842+ ctxt.syscall_callback_eip = (unsigned long)system_call;
66843+
66844+ ctxt.ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(init_level4_pgt));
66845+
66846+ ctxt.gs_base_kernel = (unsigned long)(cpu_pda(cpu));
66847+#endif
66848+
66849+ BUG_ON(HYPERVISOR_vcpu_op(VCPUOP_initialise, cpu, &ctxt));
66850+}
66851+
66852+void __init smp_prepare_cpus(unsigned int max_cpus)
66853+{
66854+ int cpu;
66855+ struct task_struct *idle;
66856+#ifdef __x86_64__
66857+ struct desc_ptr *gdt_descr;
66858+#else
66859+ struct Xgt_desc_struct *gdt_descr;
66860+#endif
66861+
66862+ boot_cpu_data.apicid = 0;
66863+ cpu_data[0] = boot_cpu_data;
66864+
66865+ cpu_2_logical_apicid[0] = 0;
66866+ x86_cpu_to_apicid[0] = 0;
66867+
66868+ current_thread_info()->cpu = 0;
66869+
66870+ for (cpu = 0; cpu < NR_CPUS; cpu++) {
66871+ cpus_clear(cpu_sibling_map[cpu]);
66872+ cpus_clear(cpu_core_map[cpu]);
66873+ }
66874+
66875+ set_cpu_sibling_map(0);
66876+
66877+ if (xen_smp_intr_init(0))
66878+ BUG();
66879+
66880+ /* Restrict the possible_map according to max_cpus. */
66881+ while ((num_possible_cpus() > 1) && (num_possible_cpus() > max_cpus)) {
66882+ for (cpu = NR_CPUS-1; !cpu_isset(cpu, cpu_possible_map); cpu--)
66883+ continue;
66884+ cpu_clear(cpu, cpu_possible_map);
66885+ }
66886+
66887+ for_each_cpu (cpu) {
66888+ if (cpu == 0)
66889+ continue;
66890+
66891+#ifdef __x86_64__
66892+ gdt_descr = &cpu_gdt_descr[cpu];
66893+#else
66894+ gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
66895+#endif
66896+ gdt_descr->address = get_zeroed_page(GFP_KERNEL);
66897+ if (unlikely(!gdt_descr->address)) {
66898+ printk(KERN_CRIT "CPU%d failed to allocate GDT\n",
66899+ cpu);
66900+ continue;
66901+ }
66902+ gdt_descr->size = GDT_SIZE;
66903+ memcpy((void *)gdt_descr->address, cpu_gdt_table, GDT_SIZE);
66904+ make_page_readonly(
66905+ (void *)gdt_descr->address,
66906+ XENFEAT_writable_descriptor_tables);
66907+
66908+ cpu_data[cpu] = boot_cpu_data;
66909+ cpu_data[cpu].apicid = cpu;
66910+
66911+ cpu_2_logical_apicid[cpu] = cpu;
66912+ x86_cpu_to_apicid[cpu] = cpu;
66913+
66914+ idle = fork_idle(cpu);
66915+ if (IS_ERR(idle))
66916+ panic("failed fork for CPU %d", cpu);
66917+
66918+#ifdef __x86_64__
66919+ cpu_pda(cpu)->pcurrent = idle;
66920+ cpu_pda(cpu)->cpunumber = cpu;
66921+ clear_ti_thread_flag(idle->thread_info, TIF_FORK);
66922+#endif
66923+
66924+ irq_ctx_init(cpu);
66925+
66926+#ifdef CONFIG_HOTPLUG_CPU
66927+ if (is_initial_xendomain())
66928+ cpu_set(cpu, cpu_present_map);
66929+#else
66930+ cpu_set(cpu, cpu_present_map);
66931+#endif
66932+
66933+ cpu_initialize_context(cpu);
66934+ }
66935+
66936+ init_xenbus_allowed_cpumask();
66937+
66938+#ifdef CONFIG_X86_IO_APIC
66939+ /*
66940+ * Here we can be sure that there is an IO-APIC in the system. Let's
66941+ * go and set it up:
66942+ */
66943+ if (!skip_ioapic_setup && nr_ioapics)
66944+ setup_IO_APIC();
66945+#endif
66946+}
66947+
66948+void __devinit smp_prepare_boot_cpu(void)
66949+{
66950+ prefill_possible_map();
66951+ cpu_present_map = cpumask_of_cpu(0);
66952+ cpu_online_map = cpumask_of_cpu(0);
66953+}
66954+
66955+#ifdef CONFIG_HOTPLUG_CPU
66956+
66957+/*
66958+ * Initialize cpu_present_map late to skip SMP boot code in init/main.c.
66959+ * But do it early enough to catch critical for_each_present_cpu() loops
66960+ * in i386-specific code.
66961+ */
66962+static int __init initialize_cpu_present_map(void)
66963+{
66964+ cpu_present_map = cpu_possible_map;
66965+ return 0;
66966+}
66967+core_initcall(initialize_cpu_present_map);
66968+
66969+int __cpu_disable(void)
66970+{
66971+ cpumask_t map = cpu_online_map;
66972+ int cpu = smp_processor_id();
66973+
66974+ if (cpu == 0)
66975+ return -EBUSY;
66976+
66977+ remove_siblinginfo(cpu);
66978+
66979+ cpu_clear(cpu, map);
66980+ fixup_irqs(map);
66981+ cpu_clear(cpu, cpu_online_map);
66982+
66983+ return 0;
66984+}
66985+
66986+void __cpu_die(unsigned int cpu)
66987+{
66988+ while (HYPERVISOR_vcpu_op(VCPUOP_is_up, cpu, NULL)) {
66989+ current->state = TASK_UNINTERRUPTIBLE;
66990+ schedule_timeout(HZ/10);
66991+ }
66992+
66993+ xen_smp_intr_exit(cpu);
66994+
66995+#ifdef CONFIG_SMP_ALTERNATIVES
66996+ if (num_online_cpus() == 1)
66997+ unprepare_for_smp();
66998+#endif
66999+}
67000+
67001+#else /* !CONFIG_HOTPLUG_CPU */
67002+
67003+int __cpu_disable(void)
67004+{
67005+ return -ENOSYS;
67006+}
67007+
67008+void __cpu_die(unsigned int cpu)
67009+{
67010+ BUG();
67011+}
67012+
67013+#endif /* CONFIG_HOTPLUG_CPU */
67014+
67015+int __devinit __cpu_up(unsigned int cpu)
67016+{
67017+ int rc;
67018+
67019+ rc = cpu_up_check(cpu);
67020+ if (rc)
67021+ return rc;
67022+
67023+#ifdef CONFIG_SMP_ALTERNATIVES
67024+ if (num_online_cpus() == 1)
67025+ prepare_for_smp();
67026+#endif
67027+
67028+ /* This must be done before setting cpu_online_map */
67029+ set_cpu_sibling_map(cpu);
67030+ wmb();
67031+
67032+ rc = xen_smp_intr_init(cpu);
67033+ if (rc) {
67034+ remove_siblinginfo(cpu);
67035+ return rc;
67036+ }
67037+
67038+ cpu_set(cpu, cpu_online_map);
67039+
67040+ rc = HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL);
67041+ BUG_ON(rc);
67042+
67043+ return 0;
67044+}
67045+
67046+void __init smp_cpus_done(unsigned int max_cpus)
67047+{
67048+}
67049+
67050+#ifndef CONFIG_X86_LOCAL_APIC
67051+int setup_profiling_timer(unsigned int multiplier)
67052+{
67053+ return -EINVAL;
67054+}
67055+#endif
67056diff -Nur linux-2.6.16.33-noxen/drivers/xen/core/xen_proc.c linux-2.6.16.33/drivers/xen/core/xen_proc.c
67057--- linux-2.6.16.33-noxen/drivers/xen/core/xen_proc.c 1970-01-01 00:00:00.000000000 +0000
67058+++ linux-2.6.16.33/drivers/xen/core/xen_proc.c 2007-01-08 15:00:45.000000000 +0000
67059@@ -0,0 +1,19 @@
67060+
67061+#include <linux/config.h>
67062+#include <linux/proc_fs.h>
67063+#include <xen/xen_proc.h>
67064+
67065+static struct proc_dir_entry *xen_base;
67066+
67067+struct proc_dir_entry *create_xen_proc_entry(const char *name, mode_t mode)
67068+{
67069+ if ( xen_base == NULL )
67070+ if ( (xen_base = proc_mkdir("xen", &proc_root)) == NULL )
67071+ panic("Couldn't create /proc/xen");
67072+ return create_proc_entry(name, mode, xen_base);
67073+}
67074+
67075+void remove_xen_proc_entry(const char *name)
67076+{
67077+ remove_proc_entry(name, xen_base);
67078+}
67079diff -Nur linux-2.6.16.33-noxen/drivers/xen/core/xen_sysfs.c linux-2.6.16.33/drivers/xen/core/xen_sysfs.c
67080--- linux-2.6.16.33-noxen/drivers/xen/core/xen_sysfs.c 1970-01-01 00:00:00.000000000 +0000
67081+++ linux-2.6.16.33/drivers/xen/core/xen_sysfs.c 2007-01-08 15:00:45.000000000 +0000
67082@@ -0,0 +1,379 @@
67083+/*
67084+ * copyright (c) 2006 IBM Corporation
67085+ * Authored by: Mike D. Day <ncmike@us.ibm.com>
67086+ *
67087+ * This program is free software; you can redistribute it and/or modify
67088+ * it under the terms of the GNU General Public License version 2 as
67089+ * published by the Free Software Foundation.
67090+ */
67091+
67092+#include <linux/config.h>
67093+#include <linux/err.h>
67094+#include <linux/kernel.h>
67095+#include <linux/module.h>
67096+#include <linux/init.h>
67097+#include <asm/hypervisor.h>
67098+#include <xen/features.h>
67099+#include <xen/hypervisor_sysfs.h>
67100+#include <xen/xenbus.h>
67101+
67102+MODULE_LICENSE("GPL");
67103+MODULE_AUTHOR("Mike D. Day <ncmike@us.ibm.com>");
67104+
67105+static ssize_t type_show(struct hyp_sysfs_attr *attr, char *buffer)
67106+{
67107+ return sprintf(buffer, "xen\n");
67108+}
67109+
67110+HYPERVISOR_ATTR_RO(type);
67111+
67112+static int __init xen_sysfs_type_init(void)
67113+{
67114+ return sysfs_create_file(&hypervisor_subsys.kset.kobj, &type_attr.attr);
67115+}
67116+
67117+static void xen_sysfs_type_destroy(void)
67118+{
67119+ sysfs_remove_file(&hypervisor_subsys.kset.kobj, &type_attr.attr);
67120+}
67121+
67122+/* xen version attributes */
67123+static ssize_t major_show(struct hyp_sysfs_attr *attr, char *buffer)
67124+{
67125+ int version = HYPERVISOR_xen_version(XENVER_version, NULL);
67126+ if (version)
67127+ return sprintf(buffer, "%d\n", version >> 16);
67128+ return -ENODEV;
67129+}
67130+
67131+HYPERVISOR_ATTR_RO(major);
67132+
67133+static ssize_t minor_show(struct hyp_sysfs_attr *attr, char *buffer)
67134+{
67135+ int version = HYPERVISOR_xen_version(XENVER_version, NULL);
67136+ if (version)
67137+ return sprintf(buffer, "%d\n", version & 0xff);
67138+ return -ENODEV;
67139+}
67140+
67141+HYPERVISOR_ATTR_RO(minor);
67142+
67143+static ssize_t extra_show(struct hyp_sysfs_attr *attr, char *buffer)
67144+{
67145+ int ret = -ENOMEM;
67146+ char *extra;
67147+
67148+ extra = kmalloc(XEN_EXTRAVERSION_LEN, GFP_KERNEL);
67149+ if (extra) {
67150+ ret = HYPERVISOR_xen_version(XENVER_extraversion, extra);
67151+ if (!ret)
67152+ ret = sprintf(buffer, "%s\n", extra);
67153+ kfree(extra);
67154+ }
67155+
67156+ return ret;
67157+}
67158+
67159+HYPERVISOR_ATTR_RO(extra);
67160+
67161+static struct attribute *version_attrs[] = {
67162+ &major_attr.attr,
67163+ &minor_attr.attr,
67164+ &extra_attr.attr,
67165+ NULL
67166+};
67167+
67168+static struct attribute_group version_group = {
67169+ .name = "version",
67170+ .attrs = version_attrs,
67171+};
67172+
67173+static int __init xen_sysfs_version_init(void)
67174+{
67175+ return sysfs_create_group(&hypervisor_subsys.kset.kobj,
67176+ &version_group);
67177+}
67178+
67179+static void xen_sysfs_version_destroy(void)
67180+{
67181+ sysfs_remove_group(&hypervisor_subsys.kset.kobj, &version_group);
67182+}
67183+
67184+/* UUID */
67185+
67186+static ssize_t uuid_show(struct hyp_sysfs_attr *attr, char *buffer)
67187+{
67188+ char *vm, *val;
67189+ int ret;
67190+
67191+ vm = xenbus_read(XBT_NIL, "vm", "", NULL);
67192+ if (IS_ERR(vm))
67193+ return PTR_ERR(vm);
67194+ val = xenbus_read(XBT_NIL, vm, "uuid", NULL);
67195+ kfree(vm);
67196+ if (IS_ERR(val))
67197+ return PTR_ERR(val);
67198+ ret = sprintf(buffer, "%s\n", val);
67199+ kfree(val);
67200+ return ret;
67201+}
67202+
67203+HYPERVISOR_ATTR_RO(uuid);
67204+
67205+static int __init xen_sysfs_uuid_init(void)
67206+{
67207+ return sysfs_create_file(&hypervisor_subsys.kset.kobj, &uuid_attr.attr);
67208+}
67209+
67210+static void xen_sysfs_uuid_destroy(void)
67211+{
67212+ sysfs_remove_file(&hypervisor_subsys.kset.kobj, &uuid_attr.attr);
67213+}
67214+
67215+/* xen compilation attributes */
67216+
67217+static ssize_t compiler_show(struct hyp_sysfs_attr *attr, char *buffer)
67218+{
67219+ int ret = -ENOMEM;
67220+ struct xen_compile_info *info;
67221+
67222+ info = kmalloc(sizeof(struct xen_compile_info), GFP_KERNEL);
67223+ if (info) {
67224+ ret = HYPERVISOR_xen_version(XENVER_compile_info, info);
67225+ if (!ret)
67226+ ret = sprintf(buffer, "%s\n", info->compiler);
67227+ kfree(info);
67228+ }
67229+
67230+ return ret;
67231+}
67232+
67233+HYPERVISOR_ATTR_RO(compiler);
67234+
67235+static ssize_t compiled_by_show(struct hyp_sysfs_attr *attr, char *buffer)
67236+{
67237+ int ret = -ENOMEM;
67238+ struct xen_compile_info *info;
67239+
67240+ info = kmalloc(sizeof(struct xen_compile_info), GFP_KERNEL);
67241+ if (info) {
67242+ ret = HYPERVISOR_xen_version(XENVER_compile_info, info);
67243+ if (!ret)
67244+ ret = sprintf(buffer, "%s\n", info->compile_by);
67245+ kfree(info);
67246+ }
67247+
67248+ return ret;
67249+}
67250+
67251+HYPERVISOR_ATTR_RO(compiled_by);
67252+
67253+static ssize_t compile_date_show(struct hyp_sysfs_attr *attr, char *buffer)
67254+{
67255+ int ret = -ENOMEM;
67256+ struct xen_compile_info *info;
67257+
67258+ info = kmalloc(sizeof(struct xen_compile_info), GFP_KERNEL);
67259+ if (info) {
67260+ ret = HYPERVISOR_xen_version(XENVER_compile_info, info);
67261+ if (!ret)
67262+ ret = sprintf(buffer, "%s\n", info->compile_date);
67263+ kfree(info);
67264+ }
67265+
67266+ return ret;
67267+}
67268+
67269+HYPERVISOR_ATTR_RO(compile_date);
67270+
67271+static struct attribute *xen_compile_attrs[] = {
67272+ &compiler_attr.attr,
67273+ &compiled_by_attr.attr,
67274+ &compile_date_attr.attr,
67275+ NULL
67276+};
67277+
67278+static struct attribute_group xen_compilation_group = {
67279+ .name = "compilation",
67280+ .attrs = xen_compile_attrs,
67281+};
67282+
67283+static int __init xen_compilation_init(void)
67284+{
67285+ return sysfs_create_group(&hypervisor_subsys.kset.kobj,
67286+ &xen_compilation_group);
67287+}
67288+
67289+static void xen_compilation_destroy(void)
67290+{
67291+ sysfs_remove_group(&hypervisor_subsys.kset.kobj,
67292+ &xen_compilation_group);
67293+}
67294+
67295+/* xen properties info */
67296+
67297+static ssize_t capabilities_show(struct hyp_sysfs_attr *attr, char *buffer)
67298+{
67299+ int ret = -ENOMEM;
67300+ char *caps;
67301+
67302+ caps = kmalloc(XEN_CAPABILITIES_INFO_LEN, GFP_KERNEL);
67303+ if (caps) {
67304+ ret = HYPERVISOR_xen_version(XENVER_capabilities, caps);
67305+ if (!ret)
67306+ ret = sprintf(buffer, "%s\n", caps);
67307+ kfree(caps);
67308+ }
67309+
67310+ return ret;
67311+}
67312+
67313+HYPERVISOR_ATTR_RO(capabilities);
67314+
67315+static ssize_t changeset_show(struct hyp_sysfs_attr *attr, char *buffer)
67316+{
67317+ int ret = -ENOMEM;
67318+ char *cset;
67319+
67320+ cset = kmalloc(XEN_CHANGESET_INFO_LEN, GFP_KERNEL);
67321+ if (cset) {
67322+ ret = HYPERVISOR_xen_version(XENVER_changeset, cset);
67323+ if (!ret)
67324+ ret = sprintf(buffer, "%s\n", cset);
67325+ kfree(cset);
67326+ }
67327+
67328+ return ret;
67329+}
67330+
67331+HYPERVISOR_ATTR_RO(changeset);
67332+
67333+static ssize_t virtual_start_show(struct hyp_sysfs_attr *attr, char *buffer)
67334+{
67335+ int ret = -ENOMEM;
67336+ struct xen_platform_parameters *parms;
67337+
67338+ parms = kmalloc(sizeof(struct xen_platform_parameters), GFP_KERNEL);
67339+ if (parms) {
67340+ ret = HYPERVISOR_xen_version(XENVER_platform_parameters,
67341+ parms);
67342+ if (!ret)
67343+ ret = sprintf(buffer, "%lx\n", parms->virt_start);
67344+ kfree(parms);
67345+ }
67346+
67347+ return ret;
67348+}
67349+
67350+HYPERVISOR_ATTR_RO(virtual_start);
67351+
67352+static ssize_t pagesize_show(struct hyp_sysfs_attr *attr, char *buffer)
67353+{
67354+ int ret;
67355+
67356+ ret = HYPERVISOR_xen_version(XENVER_pagesize, NULL);
67357+ if (ret > 0)
67358+ ret = sprintf(buffer, "%x\n", ret);
67359+
67360+ return ret;
67361+}
67362+
67363+HYPERVISOR_ATTR_RO(pagesize);
67364+
67365+/* eventually there will be several more features to export */
67366+static ssize_t xen_feature_show(int index, char *buffer)
67367+{
67368+ int ret = -ENOMEM;
67369+ struct xen_feature_info *info;
67370+
67371+ info = kmalloc(sizeof(struct xen_feature_info), GFP_KERNEL);
67372+ if (info) {
67373+ info->submap_idx = index;
67374+ ret = HYPERVISOR_xen_version(XENVER_get_features, info);
67375+ if (!ret)
67376+ ret = sprintf(buffer, "%d\n", info->submap);
67377+ kfree(info);
67378+ }
67379+
67380+ return ret;
67381+}
67382+
67383+static ssize_t writable_pt_show(struct hyp_sysfs_attr *attr, char *buffer)
67384+{
67385+ return xen_feature_show(XENFEAT_writable_page_tables, buffer);
67386+}
67387+
67388+HYPERVISOR_ATTR_RO(writable_pt);
67389+
67390+static struct attribute *xen_properties_attrs[] = {
67391+ &capabilities_attr.attr,
67392+ &changeset_attr.attr,
67393+ &virtual_start_attr.attr,
67394+ &pagesize_attr.attr,
67395+ &writable_pt_attr.attr,
67396+ NULL
67397+};
67398+
67399+static struct attribute_group xen_properties_group = {
67400+ .name = "properties",
67401+ .attrs = xen_properties_attrs,
67402+};
67403+
67404+static int __init xen_properties_init(void)
67405+{
67406+ return sysfs_create_group(&hypervisor_subsys.kset.kobj,
67407+ &xen_properties_group);
67408+}
67409+
67410+static void xen_properties_destroy(void)
67411+{
67412+ sysfs_remove_group(&hypervisor_subsys.kset.kobj,
67413+ &xen_properties_group);
67414+}
67415+
67416+static int __init hyper_sysfs_init(void)
67417+{
67418+ int ret;
67419+
67420+ if (!is_running_on_xen())
67421+ return -ENODEV;
67422+
67423+ ret = xen_sysfs_type_init();
67424+ if (ret)
67425+ goto out;
67426+ ret = xen_sysfs_version_init();
67427+ if (ret)
67428+ goto version_out;
67429+ ret = xen_compilation_init();
67430+ if (ret)
67431+ goto comp_out;
67432+ ret = xen_sysfs_uuid_init();
67433+ if (ret)
67434+ goto uuid_out;
67435+ ret = xen_properties_init();
67436+ if (!ret)
67437+ goto out;
67438+
67439+ xen_sysfs_uuid_destroy();
67440+uuid_out:
67441+ xen_compilation_destroy();
67442+comp_out:
67443+ xen_sysfs_version_destroy();
67444+version_out:
67445+ xen_sysfs_type_destroy();
67446+out:
67447+ return ret;
67448+}
67449+
67450+static void hyper_sysfs_exit(void)
67451+{
67452+ xen_properties_destroy();
67453+ xen_compilation_destroy();
67454+ xen_sysfs_uuid_destroy();
67455+ xen_sysfs_version_destroy();
67456+ xen_sysfs_type_destroy();
67457+
67458+}
67459+
67460+module_init(hyper_sysfs_init);
67461+module_exit(hyper_sysfs_exit);
67462diff -Nur linux-2.6.16.33-noxen/drivers/xen/evtchn/Makefile linux-2.6.16.33/drivers/xen/evtchn/Makefile
67463--- linux-2.6.16.33-noxen/drivers/xen/evtchn/Makefile 1970-01-01 00:00:00.000000000 +0000
67464+++ linux-2.6.16.33/drivers/xen/evtchn/Makefile 2007-01-08 15:00:45.000000000 +0000
67465@@ -0,0 +1,2 @@
67466+
67467+obj-y := evtchn.o
67468diff -Nur linux-2.6.16.33-noxen/drivers/xen/evtchn/evtchn.c linux-2.6.16.33/drivers/xen/evtchn/evtchn.c
67469--- linux-2.6.16.33-noxen/drivers/xen/evtchn/evtchn.c 1970-01-01 00:00:00.000000000 +0000
67470+++ linux-2.6.16.33/drivers/xen/evtchn/evtchn.c 2007-01-08 15:00:45.000000000 +0000
67471@@ -0,0 +1,457 @@
67472+/******************************************************************************
67473+ * evtchn.c
67474+ *
67475+ * Driver for receiving and demuxing event-channel signals.
67476+ *
67477+ * Copyright (c) 2004-2005, K A Fraser
67478+ * Multi-process extensions Copyright (c) 2004, Steven Smith
67479+ *
67480+ * This program is free software; you can redistribute it and/or
67481+ * modify it under the terms of the GNU General Public License version 2
67482+ * as published by the Free Software Foundation; or, when distributed
67483+ * separately from the Linux kernel or incorporated into other
67484+ * software packages, subject to the following license:
67485+ *
67486+ * Permission is hereby granted, free of charge, to any person obtaining a copy
67487+ * of this source file (the "Software"), to deal in the Software without
67488+ * restriction, including without limitation the rights to use, copy, modify,
67489+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
67490+ * and to permit persons to whom the Software is furnished to do so, subject to
67491+ * the following conditions:
67492+ *
67493+ * The above copyright notice and this permission notice shall be included in
67494+ * all copies or substantial portions of the Software.
67495+ *
67496+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
67497+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
67498+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
67499+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
67500+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
67501+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
67502+ * IN THE SOFTWARE.
67503+ */
67504+
67505+#include <linux/config.h>
67506+#include <linux/module.h>
67507+#include <linux/kernel.h>
67508+#include <linux/sched.h>
67509+#include <linux/slab.h>
67510+#include <linux/string.h>
67511+#include <linux/errno.h>
67512+#include <linux/fs.h>
67513+#include <linux/errno.h>
67514+#include <linux/miscdevice.h>
67515+#include <linux/major.h>
67516+#include <linux/proc_fs.h>
67517+#include <linux/stat.h>
67518+#include <linux/poll.h>
67519+#include <linux/irq.h>
67520+#include <linux/init.h>
67521+#include <linux/gfp.h>
67522+#include <xen/evtchn.h>
67523+#include <xen/public/evtchn.h>
67524+
67525+struct per_user_data {
67526+ /* Notification ring, accessed via /dev/xen/evtchn. */
67527+#define EVTCHN_RING_SIZE (PAGE_SIZE / sizeof(evtchn_port_t))
67528+#define EVTCHN_RING_MASK(_i) ((_i)&(EVTCHN_RING_SIZE-1))
67529+ evtchn_port_t *ring;
67530+ unsigned int ring_cons, ring_prod, ring_overflow;
67531+
67532+ /* Processes wait on this queue when ring is empty. */
67533+ wait_queue_head_t evtchn_wait;
67534+ struct fasync_struct *evtchn_async_queue;
67535+};
67536+
67537+/* Who's bound to each port? */
67538+static struct per_user_data *port_user[NR_EVENT_CHANNELS];
67539+static spinlock_t port_user_lock;
67540+
67541+void evtchn_device_upcall(int port)
67542+{
67543+ struct per_user_data *u;
67544+
67545+ spin_lock(&port_user_lock);
67546+
67547+ mask_evtchn(port);
67548+ clear_evtchn(port);
67549+
67550+ if ((u = port_user[port]) != NULL) {
67551+ if ((u->ring_prod - u->ring_cons) < EVTCHN_RING_SIZE) {
67552+ u->ring[EVTCHN_RING_MASK(u->ring_prod)] = port;
67553+ if (u->ring_cons == u->ring_prod++) {
67554+ wake_up_interruptible(&u->evtchn_wait);
67555+ kill_fasync(&u->evtchn_async_queue,
67556+ SIGIO, POLL_IN);
67557+ }
67558+ } else {
67559+ u->ring_overflow = 1;
67560+ }
67561+ }
67562+
67563+ spin_unlock(&port_user_lock);
67564+}
67565+
67566+static ssize_t evtchn_read(struct file *file, char __user *buf,
67567+ size_t count, loff_t *ppos)
67568+{
67569+ int rc;
67570+ unsigned int c, p, bytes1 = 0, bytes2 = 0;
67571+ struct per_user_data *u = file->private_data;
67572+
67573+ /* Whole number of ports. */
67574+ count &= ~(sizeof(evtchn_port_t)-1);
67575+
67576+ if (count == 0)
67577+ return 0;
67578+
67579+ if (count > PAGE_SIZE)
67580+ count = PAGE_SIZE;
67581+
67582+ for (;;) {
67583+ if (u->ring_overflow)
67584+ return -EFBIG;
67585+
67586+ if ((c = u->ring_cons) != (p = u->ring_prod))
67587+ break;
67588+
67589+ if (file->f_flags & O_NONBLOCK)
67590+ return -EAGAIN;
67591+
67592+ rc = wait_event_interruptible(
67593+ u->evtchn_wait, u->ring_cons != u->ring_prod);
67594+ if (rc)
67595+ return rc;
67596+ }
67597+
67598+ /* Byte lengths of two chunks. Chunk split (if any) is at ring wrap. */
67599+ if (((c ^ p) & EVTCHN_RING_SIZE) != 0) {
67600+ bytes1 = (EVTCHN_RING_SIZE - EVTCHN_RING_MASK(c)) *
67601+ sizeof(evtchn_port_t);
67602+ bytes2 = EVTCHN_RING_MASK(p) * sizeof(evtchn_port_t);
67603+ } else {
67604+ bytes1 = (p - c) * sizeof(evtchn_port_t);
67605+ bytes2 = 0;
67606+ }
67607+
67608+ /* Truncate chunks according to caller's maximum byte count. */
67609+ if (bytes1 > count) {
67610+ bytes1 = count;
67611+ bytes2 = 0;
67612+ } else if ((bytes1 + bytes2) > count) {
67613+ bytes2 = count - bytes1;
67614+ }
67615+
67616+ if (copy_to_user(buf, &u->ring[EVTCHN_RING_MASK(c)], bytes1) ||
67617+ ((bytes2 != 0) &&
67618+ copy_to_user(&buf[bytes1], &u->ring[0], bytes2)))
67619+ return -EFAULT;
67620+
67621+ u->ring_cons += (bytes1 + bytes2) / sizeof(evtchn_port_t);
67622+
67623+ return bytes1 + bytes2;
67624+}
67625+
67626+static ssize_t evtchn_write(struct file *file, const char __user *buf,
67627+ size_t count, loff_t *ppos)
67628+{
67629+ int rc, i;
67630+ evtchn_port_t *kbuf = (evtchn_port_t *)__get_free_page(GFP_KERNEL);
67631+ struct per_user_data *u = file->private_data;
67632+
67633+ if (kbuf == NULL)
67634+ return -ENOMEM;
67635+
67636+ /* Whole number of ports. */
67637+ count &= ~(sizeof(evtchn_port_t)-1);
67638+
67639+ if (count == 0) {
67640+ rc = 0;
67641+ goto out;
67642+ }
67643+
67644+ if (count > PAGE_SIZE)
67645+ count = PAGE_SIZE;
67646+
67647+ if (copy_from_user(kbuf, buf, count) != 0) {
67648+ rc = -EFAULT;
67649+ goto out;
67650+ }
67651+
67652+ spin_lock_irq(&port_user_lock);
67653+ for (i = 0; i < (count/sizeof(evtchn_port_t)); i++)
67654+ if ((kbuf[i] < NR_EVENT_CHANNELS) && (port_user[kbuf[i]] == u))
67655+ unmask_evtchn(kbuf[i]);
67656+ spin_unlock_irq(&port_user_lock);
67657+
67658+ rc = count;
67659+
67660+ out:
67661+ free_page((unsigned long)kbuf);
67662+ return rc;
67663+}
67664+
67665+static void evtchn_bind_to_user(struct per_user_data *u, int port)
67666+{
67667+ spin_lock_irq(&port_user_lock);
67668+ BUG_ON(port_user[port] != NULL);
67669+ port_user[port] = u;
67670+ unmask_evtchn(port);
67671+ spin_unlock_irq(&port_user_lock);
67672+}
67673+
67674+static int evtchn_ioctl(struct inode *inode, struct file *file,
67675+ unsigned int cmd, unsigned long arg)
67676+{
67677+ int rc;
67678+ struct per_user_data *u = file->private_data;
67679+ void __user *uarg = (void __user *) arg;
67680+
67681+ switch (cmd) {
67682+ case IOCTL_EVTCHN_BIND_VIRQ: {
67683+ struct ioctl_evtchn_bind_virq bind;
67684+ struct evtchn_bind_virq bind_virq;
67685+
67686+ rc = -EFAULT;
67687+ if (copy_from_user(&bind, uarg, sizeof(bind)))
67688+ break;
67689+
67690+ bind_virq.virq = bind.virq;
67691+ bind_virq.vcpu = 0;
67692+ rc = HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq,
67693+ &bind_virq);
67694+ if (rc != 0)
67695+ break;
67696+
67697+ rc = bind_virq.port;
67698+ evtchn_bind_to_user(u, rc);
67699+ break;
67700+ }
67701+
67702+ case IOCTL_EVTCHN_BIND_INTERDOMAIN: {
67703+ struct ioctl_evtchn_bind_interdomain bind;
67704+ struct evtchn_bind_interdomain bind_interdomain;
67705+
67706+ rc = -EFAULT;
67707+ if (copy_from_user(&bind, uarg, sizeof(bind)))
67708+ break;
67709+
67710+ bind_interdomain.remote_dom = bind.remote_domain;
67711+ bind_interdomain.remote_port = bind.remote_port;
67712+ rc = HYPERVISOR_event_channel_op(EVTCHNOP_bind_interdomain,
67713+ &bind_interdomain);
67714+ if (rc != 0)
67715+ break;
67716+
67717+ rc = bind_interdomain.local_port;
67718+ evtchn_bind_to_user(u, rc);
67719+ break;
67720+ }
67721+
67722+ case IOCTL_EVTCHN_BIND_UNBOUND_PORT: {
67723+ struct ioctl_evtchn_bind_unbound_port bind;
67724+ struct evtchn_alloc_unbound alloc_unbound;
67725+
67726+ rc = -EFAULT;
67727+ if (copy_from_user(&bind, uarg, sizeof(bind)))
67728+ break;
67729+
67730+ alloc_unbound.dom = DOMID_SELF;
67731+ alloc_unbound.remote_dom = bind.remote_domain;
67732+ rc = HYPERVISOR_event_channel_op(EVTCHNOP_alloc_unbound,
67733+ &alloc_unbound);
67734+ if (rc != 0)
67735+ break;
67736+
67737+ rc = alloc_unbound.port;
67738+ evtchn_bind_to_user(u, rc);
67739+ break;
67740+ }
67741+
67742+ case IOCTL_EVTCHN_UNBIND: {
67743+ struct ioctl_evtchn_unbind unbind;
67744+ struct evtchn_close close;
67745+ int ret;
67746+
67747+ rc = -EFAULT;
67748+ if (copy_from_user(&unbind, uarg, sizeof(unbind)))
67749+ break;
67750+
67751+ rc = -EINVAL;
67752+ if (unbind.port >= NR_EVENT_CHANNELS)
67753+ break;
67754+
67755+ spin_lock_irq(&port_user_lock);
67756+
67757+ rc = -ENOTCONN;
67758+ if (port_user[unbind.port] != u) {
67759+ spin_unlock_irq(&port_user_lock);
67760+ break;
67761+ }
67762+
67763+ port_user[unbind.port] = NULL;
67764+ mask_evtchn(unbind.port);
67765+
67766+ spin_unlock_irq(&port_user_lock);
67767+
67768+ close.port = unbind.port;
67769+ ret = HYPERVISOR_event_channel_op(EVTCHNOP_close, &close);
67770+ BUG_ON(ret);
67771+
67772+ rc = 0;
67773+ break;
67774+ }
67775+
67776+ case IOCTL_EVTCHN_NOTIFY: {
67777+ struct ioctl_evtchn_notify notify;
67778+
67779+ rc = -EFAULT;
67780+ if (copy_from_user(&notify, uarg, sizeof(notify)))
67781+ break;
67782+
67783+ if (notify.port >= NR_EVENT_CHANNELS) {
67784+ rc = -EINVAL;
67785+ } else if (port_user[notify.port] != u) {
67786+ rc = -ENOTCONN;
67787+ } else {
67788+ notify_remote_via_evtchn(notify.port);
67789+ rc = 0;
67790+ }
67791+ break;
67792+ }
67793+
67794+ case IOCTL_EVTCHN_RESET: {
67795+ /* Initialise the ring to empty. Clear errors. */
67796+ spin_lock_irq(&port_user_lock);
67797+ u->ring_cons = u->ring_prod = u->ring_overflow = 0;
67798+ spin_unlock_irq(&port_user_lock);
67799+ rc = 0;
67800+ break;
67801+ }
67802+
67803+ default:
67804+ rc = -ENOSYS;
67805+ break;
67806+ }
67807+
67808+ return rc;
67809+}
67810+
67811+static unsigned int evtchn_poll(struct file *file, poll_table *wait)
67812+{
67813+ unsigned int mask = POLLOUT | POLLWRNORM;
67814+ struct per_user_data *u = file->private_data;
67815+
67816+ poll_wait(file, &u->evtchn_wait, wait);
67817+ if (u->ring_cons != u->ring_prod)
67818+ mask |= POLLIN | POLLRDNORM;
67819+ if (u->ring_overflow)
67820+ mask = POLLERR;
67821+ return mask;
67822+}
67823+
67824+static int evtchn_fasync(int fd, struct file *filp, int on)
67825+{
67826+ struct per_user_data *u = filp->private_data;
67827+ return fasync_helper(fd, filp, on, &u->evtchn_async_queue);
67828+}
67829+
67830+static int evtchn_open(struct inode *inode, struct file *filp)
67831+{
67832+ struct per_user_data *u;
67833+
67834+ if ((u = kmalloc(sizeof(*u), GFP_KERNEL)) == NULL)
67835+ return -ENOMEM;
67836+
67837+ memset(u, 0, sizeof(*u));
67838+ init_waitqueue_head(&u->evtchn_wait);
67839+
67840+ u->ring = (evtchn_port_t *)__get_free_page(GFP_KERNEL);
67841+ if (u->ring == NULL) {
67842+ kfree(u);
67843+ return -ENOMEM;
67844+ }
67845+
67846+ filp->private_data = u;
67847+
67848+ return 0;
67849+}
67850+
67851+static int evtchn_release(struct inode *inode, struct file *filp)
67852+{
67853+ int i;
67854+ struct per_user_data *u = filp->private_data;
67855+ struct evtchn_close close;
67856+
67857+ spin_lock_irq(&port_user_lock);
67858+
67859+ free_page((unsigned long)u->ring);
67860+
67861+ for (i = 0; i < NR_EVENT_CHANNELS; i++) {
67862+ int ret;
67863+ if (port_user[i] != u)
67864+ continue;
67865+
67866+ port_user[i] = NULL;
67867+ mask_evtchn(i);
67868+
67869+ close.port = i;
67870+ ret = HYPERVISOR_event_channel_op(EVTCHNOP_close, &close);
67871+ BUG_ON(ret);
67872+ }
67873+
67874+ spin_unlock_irq(&port_user_lock);
67875+
67876+ kfree(u);
67877+
67878+ return 0;
67879+}
67880+
67881+static struct file_operations evtchn_fops = {
67882+ .owner = THIS_MODULE,
67883+ .read = evtchn_read,
67884+ .write = evtchn_write,
67885+ .ioctl = evtchn_ioctl,
67886+ .poll = evtchn_poll,
67887+ .fasync = evtchn_fasync,
67888+ .open = evtchn_open,
67889+ .release = evtchn_release,
67890+};
67891+
67892+static struct miscdevice evtchn_miscdev = {
67893+ .minor = MISC_DYNAMIC_MINOR,
67894+ .name = "evtchn",
67895+ .fops = &evtchn_fops,
67896+};
67897+
67898+static int __init evtchn_init(void)
67899+{
67900+ int err;
67901+
67902+ if (!is_running_on_xen())
67903+ return -ENODEV;
67904+
67905+ spin_lock_init(&port_user_lock);
67906+ memset(port_user, 0, sizeof(port_user));
67907+
67908+ /* Create '/dev/misc/evtchn'. */
67909+ err = misc_register(&evtchn_miscdev);
67910+ if (err != 0) {
67911+ printk(KERN_ALERT "Could not register /dev/misc/evtchn\n");
67912+ return err;
67913+ }
67914+
67915+ printk("Event-channel device installed.\n");
67916+
67917+ return 0;
67918+}
67919+
67920+static void evtchn_cleanup(void)
67921+{
67922+ misc_deregister(&evtchn_miscdev);
67923+}
67924+
67925+module_init(evtchn_init);
67926+module_exit(evtchn_cleanup);
67927+
67928+MODULE_LICENSE("Dual BSD/GPL");
67929diff -Nur linux-2.6.16.33-noxen/drivers/xen/fbfront/Makefile linux-2.6.16.33/drivers/xen/fbfront/Makefile
67930--- linux-2.6.16.33-noxen/drivers/xen/fbfront/Makefile 1970-01-01 00:00:00.000000000 +0000
67931+++ linux-2.6.16.33/drivers/xen/fbfront/Makefile 2007-01-08 15:00:45.000000000 +0000
67932@@ -0,0 +1,2 @@
67933+obj-$(CONFIG_XEN_FRAMEBUFFER) := xenfb.o
67934+obj-$(CONFIG_XEN_KEYBOARD) += xenkbd.o
67935diff -Nur linux-2.6.16.33-noxen/drivers/xen/fbfront/xenfb.c linux-2.6.16.33/drivers/xen/fbfront/xenfb.c
67936--- linux-2.6.16.33-noxen/drivers/xen/fbfront/xenfb.c 1970-01-01 00:00:00.000000000 +0000
67937+++ linux-2.6.16.33/drivers/xen/fbfront/xenfb.c 2007-01-08 15:00:45.000000000 +0000
67938@@ -0,0 +1,750 @@
67939+/*
67940+ * linux/drivers/video/xenfb.c -- Xen para-virtual frame buffer device
67941+ *
67942+ * Copyright (C) 2005-2006 Anthony Liguori <aliguori@us.ibm.com>
67943+ * Copyright (C) 2006 Red Hat, Inc., Markus Armbruster <armbru@redhat.com>
67944+ *
67945+ * Based on linux/drivers/video/q40fb.c
67946+ *
67947+ * This file is subject to the terms and conditions of the GNU General Public
67948+ * License. See the file COPYING in the main directory of this archive for
67949+ * more details.
67950+ */
67951+
67952+/*
67953+ * TODO:
67954+ *
67955+ * Switch to grant tables when they become capable of dealing with the
67956+ * frame buffer.
67957+ */
67958+
67959+#include <linux/kernel.h>
67960+#include <linux/errno.h>
67961+#include <linux/fb.h>
67962+#include <linux/module.h>
67963+#include <linux/vmalloc.h>
67964+#include <linux/mm.h>
67965+#include <asm/hypervisor.h>
67966+#include <xen/evtchn.h>
67967+#include <xen/interface/io/fbif.h>
67968+#include <xen/xenbus.h>
67969+#include <linux/kthread.h>
67970+
67971+struct xenfb_mapping
67972+{
67973+ struct list_head link;
67974+ struct vm_area_struct *vma;
67975+ atomic_t map_refs;
67976+ int faults;
67977+ struct xenfb_info *info;
67978+};
67979+
67980+struct xenfb_info
67981+{
67982+ struct task_struct *kthread;
67983+ wait_queue_head_t wq;
67984+
67985+ unsigned char *fb;
67986+ struct fb_info *fb_info;
67987+ struct timer_list refresh;
67988+ int dirty;
67989+ int x1, y1, x2, y2; /* dirty rectangle,
67990+ protected by dirty_lock */
67991+ spinlock_t dirty_lock;
67992+ struct mutex mm_lock;
67993+ int nr_pages;
67994+ struct page **pages;
67995+ struct list_head mappings; /* protected by mm_lock */
67996+
67997+ unsigned evtchn;
67998+ int irq;
67999+ struct xenfb_page *page;
68000+ unsigned long *mfns;
68001+ int update_wanted; /* XENFB_TYPE_UPDATE wanted */
68002+
68003+ struct xenbus_device *xbdev;
68004+};
68005+
68006+/*
68007+ * How the locks work together
68008+ *
68009+ * There are two locks: spinlock dirty_lock protecting the dirty
68010+ * rectangle, and mutex mm_lock protecting mappings.
68011+ *
68012+ * The problem is that dirty rectangle and mappings aren't
68013+ * independent: the dirty rectangle must cover all faulted pages in
68014+ * mappings. We need to prove that our locking maintains this
68015+ * invariant.
68016+ *
68017+ * There are several kinds of critical regions:
68018+ *
68019+ * 1. Holding only dirty_lock: xenfb_refresh(). May run in
68020+ * interrupts. Extends the dirty rectangle. Trivially preserves
68021+ * invariant.
68022+ *
68023+ * 2. Holding only mm_lock: xenfb_mmap() and xenfb_vm_close(). Touch
68024+ * only mappings. The former creates unfaulted pages. Preserves
68025+ * invariant. The latter removes pages. Preserves invariant.
68026+ *
68027+ * 3. Holding both locks: xenfb_vm_nopage(). Extends the dirty
68028+ * rectangle and updates mappings consistently. Preserves
68029+ * invariant.
68030+ *
68031+ * 4. The ugliest one: xenfb_update_screen(). Clear the dirty
68032+ * rectangle and update mappings consistently.
68033+ *
68034+ * We can't simply hold both locks, because zap_page_range() cannot
68035+ * be called with a spinlock held.
68036+ *
68037+ * Therefore, we first clear the dirty rectangle with both locks
68038+ * held. Then we unlock dirty_lock and update the mappings.
68039+ * Critical regions that hold only dirty_lock may interfere with
68040+ * that. This can only be region 1: xenfb_refresh(). But that
68041+ * just extends the dirty rectangle, which can't harm the
68042+ * invariant.
68043+ *
68044+ * But FIXME: the invariant is too weak. It misses that the fault
68045+ * record in mappings must be consistent with the mapping of pages in
68046+ * the associated address space! do_no_page() updates the PTE after
68047+ * xenfb_vm_nopage() returns, i.e. outside the critical region. This
68048+ * allows the following race:
68049+ *
68050+ * X writes to some address in the Xen frame buffer
68051+ * Fault - call do_no_page()
68052+ * call xenfb_vm_nopage()
68053+ * grab mm_lock
68054+ * map->faults++;
68055+ * release mm_lock
68056+ * return back to do_no_page()
68057+ * (preempted, or SMP)
68058+ * Xen worker thread runs.
68059+ * grab mm_lock
68060+ * look at mappings
68061+ * find this mapping, zaps its pages (but page not in pte yet)
68062+ * clear map->faults
68063+ * releases mm_lock
68064+ * (back to X process)
68065+ * put page in X's pte
68066+ *
68067+ * Oh well, we won't be updating the writes to this page anytime soon.
68068+ */
68069+
68070+static int xenfb_fps = 20;
68071+static unsigned long xenfb_mem_len = XENFB_WIDTH * XENFB_HEIGHT * XENFB_DEPTH / 8;
68072+
68073+static int xenfb_remove(struct xenbus_device *);
68074+static void xenfb_init_shared_page(struct xenfb_info *);
68075+static int xenfb_connect_backend(struct xenbus_device *, struct xenfb_info *);
68076+static void xenfb_disconnect_backend(struct xenfb_info *);
68077+
68078+static void xenfb_do_update(struct xenfb_info *info,
68079+ int x, int y, int w, int h)
68080+{
68081+ union xenfb_out_event event;
68082+ __u32 prod;
68083+
68084+ event.type = XENFB_TYPE_UPDATE;
68085+ event.update.x = x;
68086+ event.update.y = y;
68087+ event.update.width = w;
68088+ event.update.height = h;
68089+
68090+ prod = info->page->out_prod;
68091+ /* caller ensures !xenfb_queue_full() */
68092+ mb(); /* ensure ring space available */
68093+ XENFB_OUT_RING_REF(info->page, prod) = event;
68094+ wmb(); /* ensure ring contents visible */
68095+ info->page->out_prod = prod + 1;
68096+
68097+ notify_remote_via_evtchn(info->evtchn);
68098+}
68099+
68100+static int xenfb_queue_full(struct xenfb_info *info)
68101+{
68102+ __u32 cons, prod;
68103+
68104+ prod = info->page->out_prod;
68105+ cons = info->page->out_cons;
68106+ return prod - cons == XENFB_OUT_RING_LEN;
68107+}
68108+
68109+static void xenfb_update_screen(struct xenfb_info *info)
68110+{
68111+ unsigned long flags;
68112+ int y1, y2, x1, x2;
68113+ struct xenfb_mapping *map;
68114+
68115+ if (!info->update_wanted)
68116+ return;
68117+ if (xenfb_queue_full(info))
68118+ return;
68119+
68120+ mutex_lock(&info->mm_lock);
68121+
68122+ spin_lock_irqsave(&info->dirty_lock, flags);
68123+ y1 = info->y1;
68124+ y2 = info->y2;
68125+ x1 = info->x1;
68126+ x2 = info->x2;
68127+ info->x1 = info->y1 = INT_MAX;
68128+ info->x2 = info->y2 = 0;
68129+ spin_unlock_irqrestore(&info->dirty_lock, flags);
68130+
68131+ list_for_each_entry(map, &info->mappings, link) {
68132+ if (!map->faults)
68133+ continue;
68134+ zap_page_range(map->vma, map->vma->vm_start,
68135+ map->vma->vm_end - map->vma->vm_start, NULL);
68136+ map->faults = 0;
68137+ }
68138+
68139+ mutex_unlock(&info->mm_lock);
68140+
68141+ xenfb_do_update(info, x1, y1, x2 - x1, y2 - y1);
68142+}
68143+
68144+static int xenfb_thread(void *data)
68145+{
68146+ struct xenfb_info *info = data;
68147+
68148+ while (!kthread_should_stop()) {
68149+ if (info->dirty) {
68150+ info->dirty = 0;
68151+ xenfb_update_screen(info);
68152+ }
68153+ wait_event_interruptible(info->wq,
68154+ kthread_should_stop() || info->dirty);
68155+ try_to_freeze();
68156+ }
68157+ return 0;
68158+}
68159+
68160+static int xenfb_setcolreg(unsigned regno, unsigned red, unsigned green,
68161+ unsigned blue, unsigned transp,
68162+ struct fb_info *info)
68163+{
68164+ u32 v;
68165+
68166+ if (regno > info->cmap.len)
68167+ return 1;
68168+
68169+ red >>= (16 - info->var.red.length);
68170+ green >>= (16 - info->var.green.length);
68171+ blue >>= (16 - info->var.blue.length);
68172+
68173+ v = (red << info->var.red.offset) |
68174+ (green << info->var.green.offset) |
68175+ (blue << info->var.blue.offset);
68176+
68177+ /* FIXME is this sane? check against xxxfb_setcolreg()! */
68178+ switch (info->var.bits_per_pixel) {
68179+ case 16:
68180+ case 24:
68181+ case 32:
68182+ ((u32 *)info->pseudo_palette)[regno] = v;
68183+ break;
68184+ }
68185+
68186+ return 0;
68187+}
68188+
68189+static void xenfb_timer(unsigned long data)
68190+{
68191+ struct xenfb_info *info = (struct xenfb_info *)data;
68192+ info->dirty = 1;
68193+ wake_up(&info->wq);
68194+}
68195+
68196+static void __xenfb_refresh(struct xenfb_info *info,
68197+ int x1, int y1, int w, int h)
68198+{
68199+ int y2, x2;
68200+
68201+ y2 = y1 + h;
68202+ x2 = x1 + w;
68203+
68204+ if (info->y1 > y1)
68205+ info->y1 = y1;
68206+ if (info->y2 < y2)
68207+ info->y2 = y2;
68208+ if (info->x1 > x1)
68209+ info->x1 = x1;
68210+ if (info->x2 < x2)
68211+ info->x2 = x2;
68212+
68213+ if (timer_pending(&info->refresh))
68214+ return;
68215+
68216+ mod_timer(&info->refresh, jiffies + HZ/xenfb_fps);
68217+}
68218+
68219+static void xenfb_refresh(struct xenfb_info *info,
68220+ int x1, int y1, int w, int h)
68221+{
68222+ unsigned long flags;
68223+
68224+ spin_lock_irqsave(&info->dirty_lock, flags);
68225+ __xenfb_refresh(info, x1, y1, w, h);
68226+ spin_unlock_irqrestore(&info->dirty_lock, flags);
68227+}
68228+
68229+static void xenfb_fillrect(struct fb_info *p, const struct fb_fillrect *rect)
68230+{
68231+ struct xenfb_info *info = p->par;
68232+
68233+ cfb_fillrect(p, rect);
68234+ xenfb_refresh(info, rect->dx, rect->dy, rect->width, rect->height);
68235+}
68236+
68237+static void xenfb_imageblit(struct fb_info *p, const struct fb_image *image)
68238+{
68239+ struct xenfb_info *info = p->par;
68240+
68241+ cfb_imageblit(p, image);
68242+ xenfb_refresh(info, image->dx, image->dy, image->width, image->height);
68243+}
68244+
68245+static void xenfb_copyarea(struct fb_info *p, const struct fb_copyarea *area)
68246+{
68247+ struct xenfb_info *info = p->par;
68248+
68249+ cfb_copyarea(p, area);
68250+ xenfb_refresh(info, area->dx, area->dy, area->width, area->height);
68251+}
68252+
68253+static void xenfb_vm_open(struct vm_area_struct *vma)
68254+{
68255+ struct xenfb_mapping *map = vma->vm_private_data;
68256+ atomic_inc(&map->map_refs);
68257+}
68258+
68259+static void xenfb_vm_close(struct vm_area_struct *vma)
68260+{
68261+ struct xenfb_mapping *map = vma->vm_private_data;
68262+ struct xenfb_info *info = map->info;
68263+
68264+ mutex_lock(&info->mm_lock);
68265+ if (atomic_dec_and_test(&map->map_refs)) {
68266+ list_del(&map->link);
68267+ kfree(map);
68268+ }
68269+ mutex_unlock(&info->mm_lock);
68270+}
68271+
68272+static struct page *xenfb_vm_nopage(struct vm_area_struct *vma,
68273+ unsigned long vaddr, int *type)
68274+{
68275+ struct xenfb_mapping *map = vma->vm_private_data;
68276+ struct xenfb_info *info = map->info;
68277+ int pgnr = (vaddr - vma->vm_start) >> PAGE_SHIFT;
68278+ unsigned long flags;
68279+ struct page *page;
68280+ int y1, y2;
68281+
68282+ if (pgnr >= info->nr_pages)
68283+ return NOPAGE_SIGBUS;
68284+
68285+ mutex_lock(&info->mm_lock);
68286+ spin_lock_irqsave(&info->dirty_lock, flags);
68287+ page = info->pages[pgnr];
68288+ get_page(page);
68289+ map->faults++;
68290+
68291+ y1 = pgnr * PAGE_SIZE / info->fb_info->fix.line_length;
68292+ y2 = (pgnr * PAGE_SIZE + PAGE_SIZE - 1) / info->fb_info->fix.line_length;
68293+ if (y2 > info->fb_info->var.yres)
68294+ y2 = info->fb_info->var.yres;
68295+ __xenfb_refresh(info, 0, y1, info->fb_info->var.xres, y2 - y1);
68296+ spin_unlock_irqrestore(&info->dirty_lock, flags);
68297+ mutex_unlock(&info->mm_lock);
68298+
68299+ if (type)
68300+ *type = VM_FAULT_MINOR;
68301+
68302+ return page;
68303+}
68304+
68305+static struct vm_operations_struct xenfb_vm_ops = {
68306+ .open = xenfb_vm_open,
68307+ .close = xenfb_vm_close,
68308+ .nopage = xenfb_vm_nopage,
68309+};
68310+
68311+static int xenfb_mmap(struct fb_info *fb_info, struct vm_area_struct *vma)
68312+{
68313+ struct xenfb_info *info = fb_info->par;
68314+ struct xenfb_mapping *map;
68315+ int map_pages;
68316+
68317+ if (!(vma->vm_flags & VM_WRITE))
68318+ return -EINVAL;
68319+ if (!(vma->vm_flags & VM_SHARED))
68320+ return -EINVAL;
68321+ if (vma->vm_pgoff != 0)
68322+ return -EINVAL;
68323+
68324+ map_pages = (vma->vm_end - vma->vm_start + PAGE_SIZE-1) >> PAGE_SHIFT;
68325+ if (map_pages > info->nr_pages)
68326+ return -EINVAL;
68327+
68328+ map = kzalloc(sizeof(*map), GFP_KERNEL);
68329+ if (map == NULL)
68330+ return -ENOMEM;
68331+
68332+ map->vma = vma;
68333+ map->faults = 0;
68334+ map->info = info;
68335+ atomic_set(&map->map_refs, 1);
68336+
68337+ mutex_lock(&info->mm_lock);
68338+ list_add(&map->link, &info->mappings);
68339+ mutex_unlock(&info->mm_lock);
68340+
68341+ vma->vm_ops = &xenfb_vm_ops;
68342+ vma->vm_flags |= (VM_DONTEXPAND | VM_RESERVED);
68343+ vma->vm_private_data = map;
68344+
68345+ return 0;
68346+}
68347+
68348+static struct fb_ops xenfb_fb_ops = {
68349+ .owner = THIS_MODULE,
68350+ .fb_setcolreg = xenfb_setcolreg,
68351+ .fb_fillrect = xenfb_fillrect,
68352+ .fb_copyarea = xenfb_copyarea,
68353+ .fb_imageblit = xenfb_imageblit,
68354+ .fb_mmap = xenfb_mmap,
68355+};
68356+
68357+static irqreturn_t xenfb_event_handler(int rq, void *dev_id,
68358+ struct pt_regs *regs)
68359+{
68360+ /*
68361+	 * No in events are recognized; simply ignore them all.
68362+	 * If you need to recognize some, see xenkbd's input_handler()
68363+ * for how to do that.
68364+ */
68365+ struct xenfb_info *info = dev_id;
68366+ struct xenfb_page *page = info->page;
68367+
68368+ if (page->in_cons != page->in_prod) {
68369+ info->page->in_cons = info->page->in_prod;
68370+ notify_remote_via_evtchn(info->evtchn);
68371+ }
68372+ return IRQ_HANDLED;
68373+}
68374+
68375+static unsigned long vmalloc_to_mfn(void *address)
68376+{
68377+ return pfn_to_mfn(vmalloc_to_pfn(address));
68378+}
68379+
68380+static int __devinit xenfb_probe(struct xenbus_device *dev,
68381+ const struct xenbus_device_id *id)
68382+{
68383+ struct xenfb_info *info;
68384+ struct fb_info *fb_info;
68385+ int ret;
68386+
68387+ info = kzalloc(sizeof(*info), GFP_KERNEL);
68388+ if (info == NULL) {
68389+ xenbus_dev_fatal(dev, -ENOMEM, "allocating info structure");
68390+ return -ENOMEM;
68391+ }
68392+ dev->dev.driver_data = info;
68393+ info->xbdev = dev;
68394+ info->irq = -1;
68395+ info->x1 = info->y1 = INT_MAX;
68396+ spin_lock_init(&info->dirty_lock);
68397+ mutex_init(&info->mm_lock);
68398+ init_waitqueue_head(&info->wq);
68399+ init_timer(&info->refresh);
68400+ info->refresh.function = xenfb_timer;
68401+ info->refresh.data = (unsigned long)info;
68402+ INIT_LIST_HEAD(&info->mappings);
68403+
68404+ info->fb = vmalloc(xenfb_mem_len);
68405+ if (info->fb == NULL)
68406+ goto error_nomem;
68407+ memset(info->fb, 0, xenfb_mem_len);
68408+
68409+ info->nr_pages = (xenfb_mem_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
68410+
68411+ info->pages = kmalloc(sizeof(struct page *) * info->nr_pages,
68412+ GFP_KERNEL);
68413+ if (info->pages == NULL)
68414+ goto error_nomem;
68415+
68416+ info->mfns = vmalloc(sizeof(unsigned long) * info->nr_pages);
68417+ if (!info->mfns)
68418+ goto error_nomem;
68419+
68420+ /* set up shared page */
68421+ info->page = (void *)__get_free_page(GFP_KERNEL);
68422+ if (!info->page)
68423+ goto error_nomem;
68424+
68425+ xenfb_init_shared_page(info);
68426+
68427+ fb_info = framebuffer_alloc(sizeof(u32) * 256, NULL);
68428+ /* see fishy hackery below */
68429+ if (fb_info == NULL)
68430+ goto error_nomem;
68431+
68432+ /* FIXME fishy hackery */
68433+ fb_info->pseudo_palette = fb_info->par;
68434+ fb_info->par = info;
68435+ /* /FIXME */
68436+ fb_info->screen_base = info->fb;
68437+
68438+ fb_info->fbops = &xenfb_fb_ops;
68439+ fb_info->var.xres_virtual = fb_info->var.xres = info->page->width;
68440+ fb_info->var.yres_virtual = fb_info->var.yres = info->page->height;
68441+ fb_info->var.bits_per_pixel = info->page->depth;
68442+
68443+ fb_info->var.red = (struct fb_bitfield){16, 8, 0};
68444+ fb_info->var.green = (struct fb_bitfield){8, 8, 0};
68445+ fb_info->var.blue = (struct fb_bitfield){0, 8, 0};
68446+
68447+ fb_info->var.activate = FB_ACTIVATE_NOW;
68448+ fb_info->var.height = -1;
68449+ fb_info->var.width = -1;
68450+ fb_info->var.vmode = FB_VMODE_NONINTERLACED;
68451+
68452+ fb_info->fix.visual = FB_VISUAL_TRUECOLOR;
68453+ fb_info->fix.line_length = info->page->line_length;
68454+ fb_info->fix.smem_start = 0;
68455+ fb_info->fix.smem_len = xenfb_mem_len;
68456+ strcpy(fb_info->fix.id, "xen");
68457+ fb_info->fix.type = FB_TYPE_PACKED_PIXELS;
68458+ fb_info->fix.accel = FB_ACCEL_NONE;
68459+
68460+ fb_info->flags = FBINFO_FLAG_DEFAULT;
68461+
68462+ ret = fb_alloc_cmap(&fb_info->cmap, 256, 0);
68463+ if (ret < 0) {
68464+ framebuffer_release(fb_info);
68465+ xenbus_dev_fatal(dev, ret, "fb_alloc_cmap");
68466+ goto error;
68467+ }
68468+
68469+ ret = register_framebuffer(fb_info);
68470+ if (ret) {
68471+ fb_dealloc_cmap(&info->fb_info->cmap);
68472+ framebuffer_release(fb_info);
68473+ xenbus_dev_fatal(dev, ret, "register_framebuffer");
68474+ goto error;
68475+ }
68476+ info->fb_info = fb_info;
68477+
68478+ /* FIXME should this be delayed until backend XenbusStateConnected? */
68479+ info->kthread = kthread_run(xenfb_thread, info, "xenfb thread");
68480+ if (IS_ERR(info->kthread)) {
68481+ ret = PTR_ERR(info->kthread);
68482+ info->kthread = NULL;
68483+		xenbus_dev_fatal(dev, ret, "kthread_run");
68484+ goto error;
68485+ }
68486+
68487+ ret = xenfb_connect_backend(dev, info);
68488+ if (ret < 0)
68489+ goto error;
68490+
68491+ return 0;
68492+
68493+ error_nomem:
68494+ ret = -ENOMEM;
68495+ xenbus_dev_fatal(dev, ret, "allocating device memory");
68496+ error:
68497+ xenfb_remove(dev);
68498+ return ret;
68499+}
68500+
68501+static int xenfb_resume(struct xenbus_device *dev)
68502+{
68503+ struct xenfb_info *info = dev->dev.driver_data;
68504+
68505+ xenfb_disconnect_backend(info);
68506+ xenfb_init_shared_page(info);
68507+ return xenfb_connect_backend(dev, info);
68508+}
68509+
68510+static int xenfb_remove(struct xenbus_device *dev)
68511+{
68512+ struct xenfb_info *info = dev->dev.driver_data;
68513+
68514+ del_timer(&info->refresh);
68515+ if (info->kthread)
68516+ kthread_stop(info->kthread);
68517+ xenfb_disconnect_backend(info);
68518+ if (info->fb_info) {
68519+ unregister_framebuffer(info->fb_info);
68520+ fb_dealloc_cmap(&info->fb_info->cmap);
68521+ framebuffer_release(info->fb_info);
68522+ }
68523+ free_page((unsigned long)info->page);
68524+ vfree(info->mfns);
68525+ kfree(info->pages);
68526+ vfree(info->fb);
68527+ kfree(info);
68528+
68529+ return 0;
68530+}
68531+
68532+static void xenfb_init_shared_page(struct xenfb_info *info)
68533+{
68534+ int i;
68535+
68536+ for (i = 0; i < info->nr_pages; i++)
68537+ info->pages[i] = vmalloc_to_page(info->fb + i * PAGE_SIZE);
68538+
68539+ for (i = 0; i < info->nr_pages; i++)
68540+ info->mfns[i] = vmalloc_to_mfn(info->fb + i * PAGE_SIZE);
68541+
68542+ info->page->pd[0] = vmalloc_to_mfn(info->mfns);
68543+ info->page->pd[1] = 0;
68544+ info->page->width = XENFB_WIDTH;
68545+ info->page->height = XENFB_HEIGHT;
68546+ info->page->depth = XENFB_DEPTH;
68547+ info->page->line_length = (info->page->depth / 8) * info->page->width;
68548+ info->page->mem_length = xenfb_mem_len;
68549+ info->page->in_cons = info->page->in_prod = 0;
68550+ info->page->out_cons = info->page->out_prod = 0;
68551+}
68552+
68553+static int xenfb_connect_backend(struct xenbus_device *dev,
68554+ struct xenfb_info *info)
68555+{
68556+ int ret;
68557+ struct xenbus_transaction xbt;
68558+
68559+ ret = xenbus_alloc_evtchn(dev, &info->evtchn);
68560+ if (ret)
68561+ return ret;
68562+ ret = bind_evtchn_to_irqhandler(info->evtchn, xenfb_event_handler,
68563+ 0, "xenfb", info);
68564+ if (ret < 0) {
68565+ xenbus_free_evtchn(dev, info->evtchn);
68566+ xenbus_dev_fatal(dev, ret, "bind_evtchn_to_irqhandler");
68567+ return ret;
68568+ }
68569+ info->irq = ret;
68570+
68571+ again:
68572+ ret = xenbus_transaction_start(&xbt);
68573+ if (ret) {
68574+ xenbus_dev_fatal(dev, ret, "starting transaction");
68575+ return ret;
68576+ }
68577+ ret = xenbus_printf(xbt, dev->nodename, "page-ref", "%lu",
68578+ virt_to_mfn(info->page));
68579+ if (ret)
68580+ goto error_xenbus;
68581+ ret = xenbus_printf(xbt, dev->nodename, "event-channel", "%u",
68582+ info->evtchn);
68583+ if (ret)
68584+ goto error_xenbus;
68585+ ret = xenbus_printf(xbt, dev->nodename, "feature-update", "1");
68586+ if (ret)
68587+ goto error_xenbus;
68588+ ret = xenbus_transaction_end(xbt, 0);
68589+ if (ret) {
68590+ if (ret == -EAGAIN)
68591+ goto again;
68592+ xenbus_dev_fatal(dev, ret, "completing transaction");
68593+ return ret;
68594+ }
68595+
68596+ xenbus_switch_state(dev, XenbusStateInitialised);
68597+ return 0;
68598+
68599+ error_xenbus:
68600+ xenbus_transaction_end(xbt, 1);
68601+ xenbus_dev_fatal(dev, ret, "writing xenstore");
68602+ return ret;
68603+}
68604+
68605+static void xenfb_disconnect_backend(struct xenfb_info *info)
68606+{
68607+ if (info->irq >= 0)
68608+ unbind_from_irqhandler(info->irq, info);
68609+ info->irq = -1;
68610+}
68611+
68612+static void xenfb_backend_changed(struct xenbus_device *dev,
68613+ enum xenbus_state backend_state)
68614+{
68615+ struct xenfb_info *info = dev->dev.driver_data;
68616+ int val;
68617+
68618+ switch (backend_state) {
68619+ case XenbusStateInitialising:
68620+ case XenbusStateInitialised:
68621+ case XenbusStateUnknown:
68622+ case XenbusStateClosed:
68623+ break;
68624+
68625+ case XenbusStateInitWait:
68626+ InitWait:
68627+ xenbus_switch_state(dev, XenbusStateConnected);
68628+ break;
68629+
68630+ case XenbusStateConnected:
68631+ /*
68632+ * Work around xenbus race condition: If backend goes
68633+ * through InitWait to Connected fast enough, we can
68634+ * get Connected twice here.
68635+ */
68636+ if (dev->state != XenbusStateConnected)
68637+ goto InitWait; /* no InitWait seen yet, fudge it */
68638+
68639+ if (xenbus_scanf(XBT_NIL, info->xbdev->otherend,
68640+ "request-update", "%d", &val) < 0)
68641+ val = 0;
68642+ if (val)
68643+ info->update_wanted = 1;
68644+ break;
68645+
68646+ case XenbusStateClosing:
68647+ // FIXME is this safe in any dev->state?
68648+ xenbus_frontend_closed(dev);
68649+ break;
68650+ }
68651+}
68652+
68653+static struct xenbus_device_id xenfb_ids[] = {
68654+ { "vfb" },
68655+ { "" }
68656+};
68657+
68658+static struct xenbus_driver xenfb = {
68659+ .name = "vfb",
68660+ .owner = THIS_MODULE,
68661+ .ids = xenfb_ids,
68662+ .probe = xenfb_probe,
68663+ .remove = xenfb_remove,
68664+ .resume = xenfb_resume,
68665+ .otherend_changed = xenfb_backend_changed,
68666+};
68667+
68668+static int __init xenfb_init(void)
68669+{
68670+ if (!is_running_on_xen())
68671+ return -ENODEV;
68672+
68673+ /* Nothing to do if running in dom0. */
68674+ if (is_initial_xendomain())
68675+ return -ENODEV;
68676+
68677+ return xenbus_register_frontend(&xenfb);
68678+}
68679+
68680+static void __exit xenfb_cleanup(void)
68681+{
68682+ return xenbus_unregister_driver(&xenfb);
68683+}
68684+
68685+module_init(xenfb_init);
68686+module_exit(xenfb_cleanup);
68687+
68688+MODULE_LICENSE("GPL");
68689diff -Nur linux-2.6.16.33-noxen/drivers/xen/fbfront/xenkbd.c linux-2.6.16.33/drivers/xen/fbfront/xenkbd.c
68690--- linux-2.6.16.33-noxen/drivers/xen/fbfront/xenkbd.c 1970-01-01 00:00:00.000000000 +0000
68691+++ linux-2.6.16.33/drivers/xen/fbfront/xenkbd.c 2007-01-08 15:00:45.000000000 +0000
68692@@ -0,0 +1,300 @@
68693+/*
68694+ * linux/drivers/input/keyboard/xenkbd.c -- Xen para-virtual input device
68695+ *
68696+ * Copyright (C) 2005 Anthony Liguori <aliguori@us.ibm.com>
68697+ * Copyright (C) 2006 Red Hat, Inc., Markus Armbruster <armbru@redhat.com>
68698+ *
68699+ * Based on linux/drivers/input/mouse/sermouse.c
68700+ *
68701+ * This file is subject to the terms and conditions of the GNU General Public
68702+ * License. See the file COPYING in the main directory of this archive for
68703+ * more details.
68704+ */
68705+
68706+/*
68707+ * TODO:
68708+ *
68709+ * Switch to grant tables together with xenfb.c.
68710+ */
68711+
68712+#include <linux/kernel.h>
68713+#include <linux/errno.h>
68714+#include <linux/module.h>
68715+#include <linux/input.h>
68716+#include <asm/hypervisor.h>
68717+#include <xen/evtchn.h>
68718+#include <xen/interface/io/fbif.h>
68719+#include <xen/interface/io/kbdif.h>
68720+#include <xen/xenbus.h>
68721+
68722+struct xenkbd_info
68723+{
68724+ struct input_dev *dev;
68725+ struct xenkbd_page *page;
68726+ unsigned evtchn;
68727+ int irq;
68728+ struct xenbus_device *xbdev;
68729+};
68730+
68731+static int xenkbd_remove(struct xenbus_device *);
68732+static int xenkbd_connect_backend(struct xenbus_device *, struct xenkbd_info *);
68733+static void xenkbd_disconnect_backend(struct xenkbd_info *);
68734+
68735+/*
68736+ * Note: if you need to send out events, see xenfb_do_update() for how
68737+ * to do that.
68738+ */
68739+
68740+static irqreturn_t input_handler(int rq, void *dev_id, struct pt_regs *regs)
68741+{
68742+ struct xenkbd_info *info = dev_id;
68743+ struct xenkbd_page *page = info->page;
68744+ __u32 cons, prod;
68745+
68746+ prod = page->in_prod;
68747+ if (prod == page->out_cons)
68748+ return IRQ_HANDLED;
68749+ rmb(); /* ensure we see ring contents up to prod */
68750+ for (cons = page->in_cons; cons != prod; cons++) {
68751+ union xenkbd_in_event *event;
68752+ event = &XENKBD_IN_RING_REF(page, cons);
68753+
68754+ switch (event->type) {
68755+ case XENKBD_TYPE_MOTION:
68756+ input_report_rel(info->dev, REL_X, event->motion.rel_x);
68757+ input_report_rel(info->dev, REL_Y, event->motion.rel_y);
68758+ break;
68759+ case XENKBD_TYPE_KEY:
68760+ input_report_key(info->dev, event->key.keycode, event->key.pressed);
68761+ break;
68762+ case XENKBD_TYPE_POS:
68763+ input_report_abs(info->dev, ABS_X, event->pos.abs_x);
68764+ input_report_abs(info->dev, ABS_Y, event->pos.abs_y);
68765+ break;
68766+ }
68767+ }
68768+ input_sync(info->dev);
68769+ mb(); /* ensure we got ring contents */
68770+ page->in_cons = cons;
68771+ notify_remote_via_evtchn(info->evtchn);
68772+
68773+ return IRQ_HANDLED;
68774+}
68775+
68776+int __devinit xenkbd_probe(struct xenbus_device *dev,
68777+ const struct xenbus_device_id *id)
68778+{
68779+ int ret, i;
68780+ struct xenkbd_info *info;
68781+ struct input_dev *input_dev;
68782+
68783+ info = kzalloc(sizeof(*info), GFP_KERNEL);
68784+ if (!info) {
68785+ xenbus_dev_fatal(dev, -ENOMEM, "allocating info structure");
68786+ return -ENOMEM;
68787+ }
68788+ dev->dev.driver_data = info;
68789+ info->xbdev = dev;
68790+
68791+ info->page = (void *)__get_free_page(GFP_KERNEL);
68792+ if (!info->page)
68793+ goto error_nomem;
68794+ info->page->in_cons = info->page->in_prod = 0;
68795+ info->page->out_cons = info->page->out_prod = 0;
68796+
68797+ input_dev = input_allocate_device();
68798+ if (!input_dev)
68799+ goto error_nomem;
68800+
68801+ input_dev->evbit[0] = BIT(EV_KEY) | BIT(EV_REL) | BIT(EV_ABS);
68802+ input_dev->keybit[LONG(BTN_MOUSE)]
68803+ = BIT(BTN_LEFT) | BIT(BTN_MIDDLE) | BIT(BTN_RIGHT);
68804+ /* TODO additional buttons */
68805+ input_dev->relbit[0] = BIT(REL_X) | BIT(REL_Y);
68806+
68807+ /* FIXME not sure this is quite right */
68808+ for (i = 0; i < 256; i++)
68809+ set_bit(i, input_dev->keybit);
68810+
68811+ input_dev->name = "Xen Virtual Keyboard/Mouse";
68812+
68813+ input_set_abs_params(input_dev, ABS_X, 0, XENFB_WIDTH, 0, 0);
68814+ input_set_abs_params(input_dev, ABS_Y, 0, XENFB_HEIGHT, 0, 0);
68815+
68816+ ret = input_register_device(input_dev);
68817+ if (ret) {
68818+ input_free_device(input_dev);
68819+ xenbus_dev_fatal(dev, ret, "input_register_device");
68820+ goto error;
68821+ }
68822+ info->dev = input_dev;
68823+
68824+ ret = xenkbd_connect_backend(dev, info);
68825+ if (ret < 0)
68826+ goto error;
68827+
68828+ return 0;
68829+
68830+ error_nomem:
68831+ ret = -ENOMEM;
68832+ xenbus_dev_fatal(dev, ret, "allocating device memory");
68833+ error:
68834+ xenkbd_remove(dev);
68835+ return ret;
68836+}
68837+
68838+static int xenkbd_resume(struct xenbus_device *dev)
68839+{
68840+ struct xenkbd_info *info = dev->dev.driver_data;
68841+
68842+ xenkbd_disconnect_backend(info);
68843+ return xenkbd_connect_backend(dev, info);
68844+}
68845+
68846+static int xenkbd_remove(struct xenbus_device *dev)
68847+{
68848+ struct xenkbd_info *info = dev->dev.driver_data;
68849+
68850+ xenkbd_disconnect_backend(info);
68851+ input_unregister_device(info->dev);
68852+ free_page((unsigned long)info->page);
68853+ kfree(info);
68854+ return 0;
68855+}
68856+
68857+static int xenkbd_connect_backend(struct xenbus_device *dev,
68858+ struct xenkbd_info *info)
68859+{
68860+ int ret;
68861+ struct xenbus_transaction xbt;
68862+
68863+ ret = xenbus_alloc_evtchn(dev, &info->evtchn);
68864+ if (ret)
68865+ return ret;
68866+ ret = bind_evtchn_to_irqhandler(info->evtchn, input_handler, 0,
68867+ "xenkbd", info);
68868+ if (ret < 0) {
68869+ xenbus_free_evtchn(dev, info->evtchn);
68870+ xenbus_dev_fatal(dev, ret, "bind_evtchn_to_irqhandler");
68871+ return ret;
68872+ }
68873+ info->irq = ret;
68874+
68875+ again:
68876+ ret = xenbus_transaction_start(&xbt);
68877+ if (ret) {
68878+ xenbus_dev_fatal(dev, ret, "starting transaction");
68879+ return ret;
68880+ }
68881+ ret = xenbus_printf(xbt, dev->nodename, "page-ref", "%lu",
68882+ virt_to_mfn(info->page));
68883+ if (ret)
68884+ goto error_xenbus;
68885+ ret = xenbus_printf(xbt, dev->nodename, "event-channel", "%u",
68886+ info->evtchn);
68887+ if (ret)
68888+ goto error_xenbus;
68889+ ret = xenbus_transaction_end(xbt, 0);
68890+ if (ret) {
68891+ if (ret == -EAGAIN)
68892+ goto again;
68893+ xenbus_dev_fatal(dev, ret, "completing transaction");
68894+ return ret;
68895+ }
68896+
68897+ xenbus_switch_state(dev, XenbusStateInitialised);
68898+ return 0;
68899+
68900+ error_xenbus:
68901+ xenbus_transaction_end(xbt, 1);
68902+ xenbus_dev_fatal(dev, ret, "writing xenstore");
68903+ return ret;
68904+}
68905+
68906+static void xenkbd_disconnect_backend(struct xenkbd_info *info)
68907+{
68908+ if (info->irq >= 0)
68909+ unbind_from_irqhandler(info->irq, info);
68910+ info->irq = -1;
68911+}
68912+
68913+static void xenkbd_backend_changed(struct xenbus_device *dev,
68914+ enum xenbus_state backend_state)
68915+{
68916+ struct xenkbd_info *info = dev->dev.driver_data;
68917+ int ret, val;
68918+
68919+ switch (backend_state) {
68920+ case XenbusStateInitialising:
68921+ case XenbusStateInitialised:
68922+ case XenbusStateUnknown:
68923+ case XenbusStateClosed:
68924+ break;
68925+
68926+ case XenbusStateInitWait:
68927+ InitWait:
68928+ ret = xenbus_scanf(XBT_NIL, info->xbdev->otherend,
68929+ "feature-abs-pointer", "%d", &val);
68930+ if (ret < 0)
68931+ val = 0;
68932+ if (val) {
68933+ ret = xenbus_printf(XBT_NIL, info->xbdev->nodename,
68934+ "request-abs-pointer", "1");
68935+ if (ret)
68936+ ; /* FIXME */
68937+ }
68938+ xenbus_switch_state(dev, XenbusStateConnected);
68939+ break;
68940+
68941+ case XenbusStateConnected:
68942+ /*
68943+ * Work around xenbus race condition: If backend goes
68944+ * through InitWait to Connected fast enough, we can
68945+ * get Connected twice here.
68946+ */
68947+ if (dev->state != XenbusStateConnected)
68948+ goto InitWait; /* no InitWait seen yet, fudge it */
68949+ break;
68950+
68951+ case XenbusStateClosing:
68952+ xenbus_frontend_closed(dev);
68953+ break;
68954+ }
68955+}
68956+
68957+static struct xenbus_device_id xenkbd_ids[] = {
68958+ { "vkbd" },
68959+ { "" }
68960+};
68961+
68962+static struct xenbus_driver xenkbd = {
68963+ .name = "vkbd",
68964+ .owner = THIS_MODULE,
68965+ .ids = xenkbd_ids,
68966+ .probe = xenkbd_probe,
68967+ .remove = xenkbd_remove,
68968+ .resume = xenkbd_resume,
68969+ .otherend_changed = xenkbd_backend_changed,
68970+};
68971+
68972+static int __init xenkbd_init(void)
68973+{
68974+ if (!is_running_on_xen())
68975+ return -ENODEV;
68976+
68977+ /* Nothing to do if running in dom0. */
68978+ if (is_initial_xendomain())
68979+ return -ENODEV;
68980+
68981+ return xenbus_register_frontend(&xenkbd);
68982+}
68983+
68984+static void __exit xenkbd_cleanup(void)
68985+{
68986+ return xenbus_unregister_driver(&xenkbd);
68987+}
68988+
68989+module_init(xenkbd_init);
68990+module_exit(xenkbd_cleanup);
68991+
68992+MODULE_LICENSE("GPL");
68993diff -Nur linux-2.6.16.33-noxen/drivers/xen/netback/Makefile linux-2.6.16.33/drivers/xen/netback/Makefile
68994--- linux-2.6.16.33-noxen/drivers/xen/netback/Makefile 1970-01-01 00:00:00.000000000 +0000
68995+++ linux-2.6.16.33/drivers/xen/netback/Makefile 2007-01-08 15:00:45.000000000 +0000
68996@@ -0,0 +1,5 @@
68997+obj-$(CONFIG_XEN_NETDEV_BACKEND) := netbk.o
68998+obj-$(CONFIG_XEN_NETDEV_LOOPBACK) += netloop.o
68999+
69000+netbk-y := netback.o xenbus.o interface.o
69001+netloop-y := loopback.o
69002diff -Nur linux-2.6.16.33-noxen/drivers/xen/netback/common.h linux-2.6.16.33/drivers/xen/netback/common.h
69003--- linux-2.6.16.33-noxen/drivers/xen/netback/common.h 1970-01-01 00:00:00.000000000 +0000
69004+++ linux-2.6.16.33/drivers/xen/netback/common.h 2007-01-08 15:00:45.000000000 +0000
69005@@ -0,0 +1,146 @@
69006+/******************************************************************************
69007+ * arch/xen/drivers/netif/backend/common.h
69008+ *
69009+ * This program is free software; you can redistribute it and/or
69010+ * modify it under the terms of the GNU General Public License version 2
69011+ * as published by the Free Software Foundation; or, when distributed
69012+ * separately from the Linux kernel or incorporated into other
69013+ * software packages, subject to the following license:
69014+ *
69015+ * Permission is hereby granted, free of charge, to any person obtaining a copy
69016+ * of this source file (the "Software"), to deal in the Software without
69017+ * restriction, including without limitation the rights to use, copy, modify,
69018+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
69019+ * and to permit persons to whom the Software is furnished to do so, subject to
69020+ * the following conditions:
69021+ *
69022+ * The above copyright notice and this permission notice shall be included in
69023+ * all copies or substantial portions of the Software.
69024+ *
69025+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
69026+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
69027+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
69028+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
69029+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
69030+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
69031+ * IN THE SOFTWARE.
69032+ */
69033+
69034+#ifndef __NETIF__BACKEND__COMMON_H__
69035+#define __NETIF__BACKEND__COMMON_H__
69036+
69037+#include <linux/config.h>
69038+#include <linux/version.h>
69039+#include <linux/module.h>
69040+#include <linux/interrupt.h>
69041+#include <linux/slab.h>
69042+#include <linux/ip.h>
69043+#include <linux/in.h>
69044+#include <linux/netdevice.h>
69045+#include <linux/etherdevice.h>
69046+#include <linux/wait.h>
69047+#include <xen/evtchn.h>
69048+#include <xen/interface/io/netif.h>
69049+#include <asm/io.h>
69050+#include <asm/pgalloc.h>
69051+#include <xen/interface/grant_table.h>
69052+#include <xen/gnttab.h>
69053+#include <xen/driver_util.h>
69054+
69055+#define DPRINTK(_f, _a...) \
69056+ pr_debug("(file=%s, line=%d) " _f, \
69057+ __FILE__ , __LINE__ , ## _a )
69058+#define IPRINTK(fmt, args...) \
69059+ printk(KERN_INFO "xen_net: " fmt, ##args)
69060+#define WPRINTK(fmt, args...) \
69061+ printk(KERN_WARNING "xen_net: " fmt, ##args)
69062+
69063+typedef struct netif_st {
69064+ /* Unique identifier for this interface. */
69065+ domid_t domid;
69066+ unsigned int handle;
69067+
69068+ u8 fe_dev_addr[6];
69069+
69070+ /* Physical parameters of the comms window. */
69071+ grant_handle_t tx_shmem_handle;
69072+ grant_ref_t tx_shmem_ref;
69073+ grant_handle_t rx_shmem_handle;
69074+ grant_ref_t rx_shmem_ref;
69075+ unsigned int evtchn;
69076+ unsigned int irq;
69077+
69078+ /* The shared rings and indexes. */
69079+ netif_tx_back_ring_t tx;
69080+ netif_rx_back_ring_t rx;
69081+ struct vm_struct *tx_comms_area;
69082+ struct vm_struct *rx_comms_area;
69083+
69084+ /* Set of features that can be turned on in dev->features. */
69085+ int features;
69086+
69087+ /* Internal feature information. */
69088+ int can_queue:1; /* can queue packets for receiver? */
69089+ int copying_receiver:1; /* copy packets to receiver? */
69090+
69091+ /* Allow netif_be_start_xmit() to peek ahead in the rx request ring. */
69092+ RING_IDX rx_req_cons_peek;
69093+
69094+ /* Transmit shaping: allow 'credit_bytes' every 'credit_usec'. */
69095+ unsigned long credit_bytes;
69096+ unsigned long credit_usec;
69097+ unsigned long remaining_credit;
69098+ struct timer_list credit_timeout;
69099+
69100+ /* Enforce draining of the transmit queue. */
69101+ struct timer_list tx_queue_timeout;
69102+
69103+ /* Miscellaneous private stuff. */
69104+ struct list_head list; /* scheduling list */
69105+ atomic_t refcnt;
69106+ struct net_device *dev;
69107+ struct net_device_stats stats;
69108+
69109+ wait_queue_head_t waiting_to_free;
69110+} netif_t;
69111+
69112+#define NET_TX_RING_SIZE __RING_SIZE((netif_tx_sring_t *)0, PAGE_SIZE)
69113+#define NET_RX_RING_SIZE __RING_SIZE((netif_rx_sring_t *)0, PAGE_SIZE)
69114+
69115+void netif_disconnect(netif_t *netif);
69116+
69117+netif_t *netif_alloc(domid_t domid, unsigned int handle);
69118+int netif_map(netif_t *netif, unsigned long tx_ring_ref,
69119+ unsigned long rx_ring_ref, unsigned int evtchn);
69120+
69121+#define netif_get(_b) (atomic_inc(&(_b)->refcnt))
69122+#define netif_put(_b) \
69123+ do { \
69124+ if ( atomic_dec_and_test(&(_b)->refcnt) ) \
69125+ wake_up(&(_b)->waiting_to_free); \
69126+ } while (0)
69127+
69128+void netif_xenbus_init(void);
69129+
69130+#define netif_schedulable(dev) (netif_running(dev) && netif_carrier_ok(dev))
69131+
69132+void netif_schedule_work(netif_t *netif);
69133+void netif_deschedule_work(netif_t *netif);
69134+
69135+int netif_be_start_xmit(struct sk_buff *skb, struct net_device *dev);
69136+struct net_device_stats *netif_be_get_stats(struct net_device *dev);
69137+irqreturn_t netif_be_int(int irq, void *dev_id, struct pt_regs *regs);
69138+
69139+static inline int netbk_can_queue(struct net_device *dev)
69140+{
69141+ netif_t *netif = netdev_priv(dev);
69142+ return netif->can_queue;
69143+}
69144+
69145+static inline int netbk_can_sg(struct net_device *dev)
69146+{
69147+ netif_t *netif = netdev_priv(dev);
69148+ return netif->features & NETIF_F_SG;
69149+}
69150+
69151+#endif /* __NETIF__BACKEND__COMMON_H__ */
69152diff -Nur linux-2.6.16.33-noxen/drivers/xen/netback/interface.c linux-2.6.16.33/drivers/xen/netback/interface.c
69153--- linux-2.6.16.33-noxen/drivers/xen/netback/interface.c 1970-01-01 00:00:00.000000000 +0000
69154+++ linux-2.6.16.33/drivers/xen/netback/interface.c 2007-01-08 15:00:45.000000000 +0000
69155@@ -0,0 +1,349 @@
69156+/******************************************************************************
69157+ * arch/xen/drivers/netif/backend/interface.c
69158+ *
69159+ * Network-device interface management.
69160+ *
69161+ * Copyright (c) 2004-2005, Keir Fraser
69162+ *
69163+ * This program is free software; you can redistribute it and/or
69164+ * modify it under the terms of the GNU General Public License version 2
69165+ * as published by the Free Software Foundation; or, when distributed
69166+ * separately from the Linux kernel or incorporated into other
69167+ * software packages, subject to the following license:
69168+ *
69169+ * Permission is hereby granted, free of charge, to any person obtaining a copy
69170+ * of this source file (the "Software"), to deal in the Software without
69171+ * restriction, including without limitation the rights to use, copy, modify,
69172+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
69173+ * and to permit persons to whom the Software is furnished to do so, subject to
69174+ * the following conditions:
69175+ *
69176+ * The above copyright notice and this permission notice shall be included in
69177+ * all copies or substantial portions of the Software.
69178+ *
69179+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
69180+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
69181+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
69182+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
69183+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
69184+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
69185+ * IN THE SOFTWARE.
69186+ */
69187+
69188+#include "common.h"
69189+#include <linux/ethtool.h>
69190+#include <linux/rtnetlink.h>
69191+
69192+/*
69193+ * Module parameter 'queue_length':
69194+ *
69195+ * Enables queuing in the network stack when a client has run out of receive
69196+ * descriptors. Although this feature can improve receive bandwidth by avoiding
69197+ * packet loss, it can also result in packets sitting in the 'tx_queue' for
69198+ * unbounded time. This is bad if those packets hold onto foreign resources.
69199+ * For example, consider a packet that holds onto resources belonging to the
69200+ * guest for which it is queued (e.g., packet received on vif1.0, destined for
69201+ * vif1.1 which is not activated in the guest): in this situation the guest
69202+ * will never be destroyed, unless vif1.1 is taken down. To avoid this, we
69203+ * run a timer (tx_queue_timeout) to drain the queue when the interface is
69204+ * blocked.
69205+ */
69206+static unsigned long netbk_queue_length = 32;
69207+module_param_named(queue_length, netbk_queue_length, ulong, 0);
69208+
69209+static void __netif_up(netif_t *netif)
69210+{
69211+ enable_irq(netif->irq);
69212+ netif_schedule_work(netif);
69213+}
69214+
69215+static void __netif_down(netif_t *netif)
69216+{
69217+ disable_irq(netif->irq);
69218+ netif_deschedule_work(netif);
69219+}
69220+
69221+static int net_open(struct net_device *dev)
69222+{
69223+ netif_t *netif = netdev_priv(dev);
69224+ if (netif_carrier_ok(dev))
69225+ __netif_up(netif);
69226+ return 0;
69227+}
69228+
69229+static int net_close(struct net_device *dev)
69230+{
69231+ netif_t *netif = netdev_priv(dev);
69232+ if (netif_carrier_ok(dev))
69233+ __netif_down(netif);
69234+ return 0;
69235+}
69236+
69237+static int netbk_change_mtu(struct net_device *dev, int mtu)
69238+{
69239+ int max = netbk_can_sg(dev) ? 65535 - ETH_HLEN : ETH_DATA_LEN;
69240+
69241+ if (mtu > max)
69242+ return -EINVAL;
69243+ dev->mtu = mtu;
69244+ return 0;
69245+}
69246+
69247+static int netbk_set_sg(struct net_device *dev, u32 data)
69248+{
69249+ if (data) {
69250+ netif_t *netif = netdev_priv(dev);
69251+
69252+ if (!(netif->features & NETIF_F_SG))
69253+ return -ENOSYS;
69254+ }
69255+
69256+ return ethtool_op_set_sg(dev, data);
69257+}
69258+
69259+static int netbk_set_tso(struct net_device *dev, u32 data)
69260+{
69261+ if (data) {
69262+ netif_t *netif = netdev_priv(dev);
69263+
69264+ if (!(netif->features & NETIF_F_TSO))
69265+ return -ENOSYS;
69266+ }
69267+
69268+ return ethtool_op_set_tso(dev, data);
69269+}
69270+
69271+static struct ethtool_ops network_ethtool_ops =
69272+{
69273+ .get_tx_csum = ethtool_op_get_tx_csum,
69274+ .set_tx_csum = ethtool_op_set_tx_csum,
69275+ .get_sg = ethtool_op_get_sg,
69276+ .set_sg = netbk_set_sg,
69277+ .get_tso = ethtool_op_get_tso,
69278+ .set_tso = netbk_set_tso,
69279+ .get_link = ethtool_op_get_link,
69280+};
69281+
69282+netif_t *netif_alloc(domid_t domid, unsigned int handle)
69283+{
69284+ int err = 0;
69285+ struct net_device *dev;
69286+ netif_t *netif;
69287+ char name[IFNAMSIZ] = {};
69288+
69289+ snprintf(name, IFNAMSIZ - 1, "vif%u.%u", domid, handle);
69290+ dev = alloc_netdev(sizeof(netif_t), name, ether_setup);
69291+ if (dev == NULL) {
69292+ DPRINTK("Could not create netif: out of memory\n");
69293+ return ERR_PTR(-ENOMEM);
69294+ }
69295+
69296+ netif_carrier_off(dev);
69297+
69298+ netif = netdev_priv(dev);
69299+ memset(netif, 0, sizeof(*netif));
69300+ netif->domid = domid;
69301+ netif->handle = handle;
69302+ atomic_set(&netif->refcnt, 1);
69303+ init_waitqueue_head(&netif->waiting_to_free);
69304+ netif->dev = dev;
69305+
69306+ netif->credit_bytes = netif->remaining_credit = ~0UL;
69307+ netif->credit_usec = 0UL;
69308+ init_timer(&netif->credit_timeout);
69309+ /* Initialize 'expires' now: it's used to track the credit window. */
69310+ netif->credit_timeout.expires = jiffies;
69311+
69312+ init_timer(&netif->tx_queue_timeout);
69313+
69314+ dev->hard_start_xmit = netif_be_start_xmit;
69315+ dev->get_stats = netif_be_get_stats;
69316+ dev->open = net_open;
69317+ dev->stop = net_close;
69318+ dev->change_mtu = netbk_change_mtu;
69319+ dev->features = NETIF_F_IP_CSUM;
69320+
69321+ SET_ETHTOOL_OPS(dev, &network_ethtool_ops);
69322+
69323+ dev->tx_queue_len = netbk_queue_length;
69324+
69325+ /*
69326+ * Initialise a dummy MAC address. We choose the numerically
69327+ * largest non-broadcast address to prevent the address getting
69328+ * stolen by an Ethernet bridge for STP purposes.
69329+ * (FE:FF:FF:FF:FF:FF)
69330+ */
69331+ memset(dev->dev_addr, 0xFF, ETH_ALEN);
69332+ dev->dev_addr[0] &= ~0x01;
69333+
69334+ rtnl_lock();
69335+ err = register_netdevice(dev);
69336+ rtnl_unlock();
69337+ if (err) {
69338+ DPRINTK("Could not register new net device %s: err=%d\n",
69339+ dev->name, err);
69340+ free_netdev(dev);
69341+ return ERR_PTR(err);
69342+ }
69343+
69344+ DPRINTK("Successfully created netif\n");
69345+ return netif;
69346+}
69347+
69348+static int map_frontend_pages(
69349+ netif_t *netif, grant_ref_t tx_ring_ref, grant_ref_t rx_ring_ref)
69350+{
69351+ struct gnttab_map_grant_ref op;
69352+ int ret;
69353+
69354+ gnttab_set_map_op(&op, (unsigned long)netif->tx_comms_area->addr,
69355+ GNTMAP_host_map, tx_ring_ref, netif->domid);
69356+
69357+ lock_vm_area(netif->tx_comms_area);
69358+ ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1);
69359+ unlock_vm_area(netif->tx_comms_area);
69360+ BUG_ON(ret);
69361+
69362+ if (op.status) {
69363+ DPRINTK(" Gnttab failure mapping tx_ring_ref!\n");
69364+ return op.status;
69365+ }
69366+
69367+ netif->tx_shmem_ref = tx_ring_ref;
69368+ netif->tx_shmem_handle = op.handle;
69369+
69370+ gnttab_set_map_op(&op, (unsigned long)netif->rx_comms_area->addr,
69371+ GNTMAP_host_map, rx_ring_ref, netif->domid);
69372+
69373+ lock_vm_area(netif->rx_comms_area);
69374+ ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1);
69375+ unlock_vm_area(netif->rx_comms_area);
69376+ BUG_ON(ret);
69377+
69378+ if (op.status) {
69379+ DPRINTK(" Gnttab failure mapping rx_ring_ref!\n");
69380+ return op.status;
69381+ }
69382+
69383+ netif->rx_shmem_ref = rx_ring_ref;
69384+ netif->rx_shmem_handle = op.handle;
69385+
69386+ return 0;
69387+}
69388+
69389+static void unmap_frontend_pages(netif_t *netif)
69390+{
69391+ struct gnttab_unmap_grant_ref op;
69392+ int ret;
69393+
69394+ gnttab_set_unmap_op(&op, (unsigned long)netif->tx_comms_area->addr,
69395+ GNTMAP_host_map, netif->tx_shmem_handle);
69396+
69397+ lock_vm_area(netif->tx_comms_area);
69398+ ret = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1);
69399+ unlock_vm_area(netif->tx_comms_area);
69400+ BUG_ON(ret);
69401+
69402+ gnttab_set_unmap_op(&op, (unsigned long)netif->rx_comms_area->addr,
69403+ GNTMAP_host_map, netif->rx_shmem_handle);
69404+
69405+ lock_vm_area(netif->rx_comms_area);
69406+ ret = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1);
69407+ unlock_vm_area(netif->rx_comms_area);
69408+ BUG_ON(ret);
69409+}
69410+
69411+int netif_map(netif_t *netif, unsigned long tx_ring_ref,
69412+ unsigned long rx_ring_ref, unsigned int evtchn)
69413+{
69414+ int err = -ENOMEM;
69415+ netif_tx_sring_t *txs;
69416+ netif_rx_sring_t *rxs;
69417+ struct evtchn_bind_interdomain bind_interdomain;
69418+
69419+ /* Already connected through? */
69420+ if (netif->irq)
69421+ return 0;
69422+
69423+ netif->tx_comms_area = alloc_vm_area(PAGE_SIZE);
69424+ if (netif->tx_comms_area == NULL)
69425+ return -ENOMEM;
69426+ netif->rx_comms_area = alloc_vm_area(PAGE_SIZE);
69427+ if (netif->rx_comms_area == NULL)
69428+ goto err_rx;
69429+
69430+ err = map_frontend_pages(netif, tx_ring_ref, rx_ring_ref);
69431+ if (err)
69432+ goto err_map;
69433+
69434+ bind_interdomain.remote_dom = netif->domid;
69435+ bind_interdomain.remote_port = evtchn;
69436+
69437+ err = HYPERVISOR_event_channel_op(EVTCHNOP_bind_interdomain,
69438+ &bind_interdomain);
69439+ if (err)
69440+ goto err_hypervisor;
69441+
69442+ netif->evtchn = bind_interdomain.local_port;
69443+
69444+ netif->irq = bind_evtchn_to_irqhandler(
69445+ netif->evtchn, netif_be_int, 0, netif->dev->name, netif);
69446+ disable_irq(netif->irq);
69447+
69448+ txs = (netif_tx_sring_t *)netif->tx_comms_area->addr;
69449+ BACK_RING_INIT(&netif->tx, txs, PAGE_SIZE);
69450+
69451+ rxs = (netif_rx_sring_t *)
69452+ ((char *)netif->rx_comms_area->addr);
69453+ BACK_RING_INIT(&netif->rx, rxs, PAGE_SIZE);
69454+
69455+ netif->rx_req_cons_peek = 0;
69456+
69457+ netif_get(netif);
69458+
69459+ rtnl_lock();
69460+ netif_carrier_on(netif->dev);
69461+ if (netif_running(netif->dev))
69462+ __netif_up(netif);
69463+ rtnl_unlock();
69464+
69465+ return 0;
69466+err_hypervisor:
69467+ unmap_frontend_pages(netif);
69468+err_map:
69469+ free_vm_area(netif->rx_comms_area);
69470+err_rx:
69471+ free_vm_area(netif->tx_comms_area);
69472+ return err;
69473+}
69474+
69475+void netif_disconnect(netif_t *netif)
69476+{
69477+ if (netif_carrier_ok(netif->dev)) {
69478+ rtnl_lock();
69479+ netif_carrier_off(netif->dev);
69480+ if (netif_running(netif->dev))
69481+ __netif_down(netif);
69482+ rtnl_unlock();
69483+ netif_put(netif);
69484+ }
69485+
69486+ atomic_dec(&netif->refcnt);
69487+ wait_event(netif->waiting_to_free, atomic_read(&netif->refcnt) == 0);
69488+
69489+ del_timer_sync(&netif->credit_timeout);
69490+ del_timer_sync(&netif->tx_queue_timeout);
69491+
69492+ if (netif->irq)
69493+ unbind_from_irqhandler(netif->irq, netif);
69494+
69495+ unregister_netdev(netif->dev);
69496+
69497+ if (netif->tx.sring) {
69498+ unmap_frontend_pages(netif);
69499+ free_vm_area(netif->tx_comms_area);
69500+ free_vm_area(netif->rx_comms_area);
69501+ }
69502+
69503+ free_netdev(netif->dev);
69504+}
69505diff -Nur linux-2.6.16.33-noxen/drivers/xen/netback/loopback.c linux-2.6.16.33/drivers/xen/netback/loopback.c
69506--- linux-2.6.16.33-noxen/drivers/xen/netback/loopback.c 1970-01-01 00:00:00.000000000 +0000
69507+++ linux-2.6.16.33/drivers/xen/netback/loopback.c 2007-01-08 15:00:45.000000000 +0000
69508@@ -0,0 +1,321 @@
69509+/******************************************************************************
69510+ * netback/loopback.c
69511+ *
69512+ * A two-interface loopback device to emulate a local netfront-netback
69513+ * connection. This ensures that local packet delivery looks identical
69514+ * to inter-domain delivery. Most importantly, packets delivered locally
69515+ * originating from other domains will get *copied* when they traverse this
69516+ * driver. This prevents unbounded delays in socket-buffer queues from
69517+ * causing the netback driver to "seize up".
69518+ *
69519+ * This driver creates a symmetric pair of loopback interfaces with names
69520+ * vif0.0 and veth0. The intention is that 'vif0.0' is bound to an Ethernet
69521+ * bridge, just like a proper netback interface, while a local IP interface
69522+ * is configured on 'veth0'.
69523+ *
69524+ * As with a real netback interface, vif0.0 is configured with a suitable
69525+ * dummy MAC address. No default is provided for veth0: a reasonable strategy
69526+ * is to transfer eth0's MAC address to veth0, and give eth0 a dummy address
69527+ * (to avoid confusing the Etherbridge).
69528+ *
69529+ * Copyright (c) 2005 K A Fraser
69530+ *
69531+ * This program is free software; you can redistribute it and/or
69532+ * modify it under the terms of the GNU General Public License version 2
69533+ * as published by the Free Software Foundation; or, when distributed
69534+ * separately from the Linux kernel or incorporated into other
69535+ * software packages, subject to the following license:
69536+ *
69537+ * Permission is hereby granted, free of charge, to any person obtaining a copy
69538+ * of this source file (the "Software"), to deal in the Software without
69539+ * restriction, including without limitation the rights to use, copy, modify,
69540+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
69541+ * and to permit persons to whom the Software is furnished to do so, subject to
69542+ * the following conditions:
69543+ *
69544+ * The above copyright notice and this permission notice shall be included in
69545+ * all copies or substantial portions of the Software.
69546+ *
69547+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
69548+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
69549+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
69550+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
69551+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
69552+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
69553+ * IN THE SOFTWARE.
69554+ */
69555+
69556+#include <linux/config.h>
69557+#include <linux/module.h>
69558+#include <linux/netdevice.h>
69559+#include <linux/inetdevice.h>
69560+#include <linux/etherdevice.h>
69561+#include <linux/skbuff.h>
69562+#include <linux/ethtool.h>
69563+#include <net/dst.h>
69564+#include <net/xfrm.h> /* secpath_reset() */
69565+#include <asm/hypervisor.h> /* is_initial_xendomain() */
69566+
69567+static int nloopbacks = -1;
69568+module_param(nloopbacks, int, 0);
69569+MODULE_PARM_DESC(nloopbacks, "Number of netback-loopback devices to create");
69570+
69571+struct net_private {
69572+ struct net_device *loopback_dev;
69573+ struct net_device_stats stats;
69574+};
69575+
69576+static int loopback_open(struct net_device *dev)
69577+{
69578+ struct net_private *np = netdev_priv(dev);
69579+ memset(&np->stats, 0, sizeof(np->stats));
69580+ netif_start_queue(dev);
69581+ return 0;
69582+}
69583+
69584+static int loopback_close(struct net_device *dev)
69585+{
69586+ netif_stop_queue(dev);
69587+ return 0;
69588+}
69589+
69590+#ifdef CONFIG_X86
69591+static int is_foreign(unsigned long pfn)
69592+{
69593+ /* NB. Play it safe for auto-translation mode. */
69594+ return (xen_feature(XENFEAT_auto_translated_physmap) ||
69595+ (phys_to_machine_mapping[pfn] & FOREIGN_FRAME_BIT));
69596+}
69597+#else
69598+/* How to detect a foreign mapping? Play it safe. */
69599+#define is_foreign(pfn) (1)
69600+#endif
69601+
69602+static int skb_remove_foreign_references(struct sk_buff *skb)
69603+{
69604+ struct page *page;
69605+ unsigned long pfn;
69606+ int i, off;
69607+ char *vaddr;
69608+
69609+ BUG_ON(skb_shinfo(skb)->frag_list);
69610+
69611+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
69612+ pfn = page_to_pfn(skb_shinfo(skb)->frags[i].page);
69613+ if (!is_foreign(pfn))
69614+ continue;
69615+
69616+ page = alloc_page(GFP_ATOMIC | __GFP_NOWARN);
69617+ if (unlikely(!page))
69618+ return 0;
69619+
69620+ vaddr = kmap_skb_frag(&skb_shinfo(skb)->frags[i]);
69621+ off = skb_shinfo(skb)->frags[i].page_offset;
69622+ memcpy(page_address(page) + off,
69623+ vaddr + off,
69624+ skb_shinfo(skb)->frags[i].size);
69625+ kunmap_skb_frag(vaddr);
69626+
69627+ put_page(skb_shinfo(skb)->frags[i].page);
69628+ skb_shinfo(skb)->frags[i].page = page;
69629+ }
69630+
69631+ return 1;
69632+}
69633+
69634+static int loopback_start_xmit(struct sk_buff *skb, struct net_device *dev)
69635+{
69636+ struct net_private *np = netdev_priv(dev);
69637+
69638+ if (!skb_remove_foreign_references(skb)) {
69639+ np->stats.tx_dropped++;
69640+ dev_kfree_skb(skb);
69641+ return 0;
69642+ }
69643+
69644+ dst_release(skb->dst);
69645+ skb->dst = NULL;
69646+
69647+ skb_orphan(skb);
69648+
69649+ np->stats.tx_bytes += skb->len;
69650+ np->stats.tx_packets++;
69651+
69652+ /* Switch to loopback context. */
69653+ dev = np->loopback_dev;
69654+ np = netdev_priv(dev);
69655+
69656+ np->stats.rx_bytes += skb->len;
69657+ np->stats.rx_packets++;
69658+
69659+ if (skb->ip_summed == CHECKSUM_HW) {
69660+ /* Defer checksum calculation. */
69661+ skb->proto_csum_blank = 1;
69662+ /* Must be a local packet: assert its integrity. */
69663+ skb->proto_data_valid = 1;
69664+ }
69665+
69666+ skb->ip_summed = skb->proto_data_valid ?
69667+ CHECKSUM_UNNECESSARY : CHECKSUM_NONE;
69668+
69669+ skb->pkt_type = PACKET_HOST; /* overridden by eth_type_trans() */
69670+ skb->protocol = eth_type_trans(skb, dev);
69671+ skb->dev = dev;
69672+ dev->last_rx = jiffies;
69673+
69674+ /* Flush netfilter context: rx'ed skbuffs not expected to have any. */
69675+ nf_reset(skb);
69676+ secpath_reset(skb);
69677+
69678+ netif_rx(skb);
69679+
69680+ return 0;
69681+}
69682+
69683+static struct net_device_stats *loopback_get_stats(struct net_device *dev)
69684+{
69685+ struct net_private *np = netdev_priv(dev);
69686+ return &np->stats;
69687+}
69688+
69689+static struct ethtool_ops network_ethtool_ops =
69690+{
69691+ .get_tx_csum = ethtool_op_get_tx_csum,
69692+ .set_tx_csum = ethtool_op_set_tx_csum,
69693+ .get_sg = ethtool_op_get_sg,
69694+ .set_sg = ethtool_op_set_sg,
69695+ .get_tso = ethtool_op_get_tso,
69696+ .set_tso = ethtool_op_set_tso,
69697+ .get_link = ethtool_op_get_link,
69698+};
69699+
69700+/*
69701+ * Nothing to do here. Virtual interface is point-to-point and the
69702+ * physical interface is probably promiscuous anyway.
69703+ */
69704+static void loopback_set_multicast_list(struct net_device *dev)
69705+{
69706+}
69707+
69708+static void loopback_construct(struct net_device *dev, struct net_device *lo)
69709+{
69710+ struct net_private *np = netdev_priv(dev);
69711+
69712+ np->loopback_dev = lo;
69713+
69714+ dev->open = loopback_open;
69715+ dev->stop = loopback_close;
69716+ dev->hard_start_xmit = loopback_start_xmit;
69717+ dev->get_stats = loopback_get_stats;
69718+ dev->set_multicast_list = loopback_set_multicast_list;
69719+ dev->change_mtu = NULL; /* allow arbitrary mtu */
69720+
69721+ dev->tx_queue_len = 0;
69722+
69723+ dev->features = (NETIF_F_HIGHDMA |
69724+ NETIF_F_LLTX |
69725+ NETIF_F_TSO |
69726+ NETIF_F_SG |
69727+ NETIF_F_IP_CSUM);
69728+
69729+ SET_ETHTOOL_OPS(dev, &network_ethtool_ops);
69730+
69731+ /*
69732+ * We do not set a jumbo MTU on the interface. Otherwise the network
69733+ * stack will try to send large packets that will get dropped by the
69734+ * Ethernet bridge (unless the physical Ethernet interface is
69735+ * configured to transfer jumbo packets). If a larger MTU is desired
69736+ * then the system administrator can specify it using the 'ifconfig'
69737+ * command.
69738+ */
69739+ /*dev->mtu = 16*1024;*/
69740+}
69741+
69742+static int __init make_loopback(int i)
69743+{
69744+ struct net_device *dev1, *dev2;
69745+ char dev_name[IFNAMSIZ];
69746+ int err = -ENOMEM;
69747+
69748+ sprintf(dev_name, "vif0.%d", i);
69749+ dev1 = alloc_netdev(sizeof(struct net_private), dev_name, ether_setup);
69750+ if (!dev1)
69751+ return err;
69752+
69753+ sprintf(dev_name, "veth%d", i);
69754+ dev2 = alloc_netdev(sizeof(struct net_private), dev_name, ether_setup);
69755+ if (!dev2)
69756+ goto fail_netdev2;
69757+
69758+ loopback_construct(dev1, dev2);
69759+ loopback_construct(dev2, dev1);
69760+
69761+ /*
69762+ * Initialise a dummy MAC address for the 'dummy backend' interface. We
69763+ * choose the numerically largest non-broadcast address to prevent the
69764+ * address getting stolen by an Ethernet bridge for STP purposes.
69765+ */
69766+ memset(dev1->dev_addr, 0xFF, ETH_ALEN);
69767+ dev1->dev_addr[0] &= ~0x01;
69768+
69769+ if ((err = register_netdev(dev1)) != 0)
69770+ goto fail;
69771+
69772+ if ((err = register_netdev(dev2)) != 0) {
69773+ unregister_netdev(dev1);
69774+ goto fail;
69775+ }
69776+
69777+ return 0;
69778+
69779+ fail:
69780+ free_netdev(dev2);
69781+ fail_netdev2:
69782+ free_netdev(dev1);
69783+ return err;
69784+}
69785+
69786+static void __exit clean_loopback(int i)
69787+{
69788+ struct net_device *dev1, *dev2;
69789+ char dev_name[IFNAMSIZ];
69790+
69791+ sprintf(dev_name, "vif0.%d", i);
69792+ dev1 = dev_get_by_name(dev_name);
69793+ sprintf(dev_name, "veth%d", i);
69794+ dev2 = dev_get_by_name(dev_name);
69795+ if (dev1 && dev2) {
69796+ unregister_netdev(dev2);
69797+ unregister_netdev(dev1);
69798+ free_netdev(dev2);
69799+ free_netdev(dev1);
69800+ }
69801+}
69802+
69803+static int __init loopback_init(void)
69804+{
69805+ int i, err = 0;
69806+
69807+ if (nloopbacks == -1)
69808+ nloopbacks = is_initial_xendomain() ? 4 : 0;
69809+
69810+ for (i = 0; i < nloopbacks; i++)
69811+ if ((err = make_loopback(i)) != 0)
69812+ break;
69813+
69814+ return err;
69815+}
69816+
69817+module_init(loopback_init);
69818+
69819+static void __exit loopback_exit(void)
69820+{
69821+ int i;
69822+
69823+ for (i = nloopbacks; i-- > 0; )
69824+ clean_loopback(i);
69825+}
69826+
69827+module_exit(loopback_exit);
69828+
69829+MODULE_LICENSE("Dual BSD/GPL");
69830diff -Nur linux-2.6.16.33-noxen/drivers/xen/netback/netback.c linux-2.6.16.33/drivers/xen/netback/netback.c
69831--- linux-2.6.16.33-noxen/drivers/xen/netback/netback.c 1970-01-01 00:00:00.000000000 +0000
69832+++ linux-2.6.16.33/drivers/xen/netback/netback.c 2007-01-08 15:00:45.000000000 +0000
69833@@ -0,0 +1,1523 @@
69834+/******************************************************************************
69835+ * drivers/xen/netback/netback.c
69836+ *
69837+ * Back-end of the driver for virtual network devices. This portion of the
69838+ * driver exports a 'unified' network-device interface that can be accessed
69839+ * by any operating system that implements a compatible front end. A
69840+ * reference front-end implementation can be found in:
69841+ * drivers/xen/netfront/netfront.c
69842+ *
69843+ * Copyright (c) 2002-2005, K A Fraser
69844+ *
69845+ * This program is free software; you can redistribute it and/or
69846+ * modify it under the terms of the GNU General Public License version 2
69847+ * as published by the Free Software Foundation; or, when distributed
69848+ * separately from the Linux kernel or incorporated into other
69849+ * software packages, subject to the following license:
69850+ *
69851+ * Permission is hereby granted, free of charge, to any person obtaining a copy
69852+ * of this source file (the "Software"), to deal in the Software without
69853+ * restriction, including without limitation the rights to use, copy, modify,
69854+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
69855+ * and to permit persons to whom the Software is furnished to do so, subject to
69856+ * the following conditions:
69857+ *
69858+ * The above copyright notice and this permission notice shall be included in
69859+ * all copies or substantial portions of the Software.
69860+ *
69861+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
69862+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
69863+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
69864+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
69865+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
69866+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
69867+ * IN THE SOFTWARE.
69868+ */
69869+
69870+#include "common.h"
69871+#include <xen/balloon.h>
69872+#include <xen/interface/memory.h>
69873+
69874+/*#define NETBE_DEBUG_INTERRUPT*/
69875+
69876+struct netbk_rx_meta {
69877+ skb_frag_t frag;
69878+ int id;
69879+ int copy:1;
69880+};
69881+
69882+static void netif_idx_release(u16 pending_idx);
69883+static void netif_page_release(struct page *page);
69884+static void make_tx_response(netif_t *netif,
69885+ netif_tx_request_t *txp,
69886+ s8 st);
69887+static netif_rx_response_t *make_rx_response(netif_t *netif,
69888+ u16 id,
69889+ s8 st,
69890+ u16 offset,
69891+ u16 size,
69892+ u16 flags);
69893+
69894+static void net_tx_action(unsigned long unused);
69895+static DECLARE_TASKLET(net_tx_tasklet, net_tx_action, 0);
69896+
69897+static void net_rx_action(unsigned long unused);
69898+static DECLARE_TASKLET(net_rx_tasklet, net_rx_action, 0);
69899+
69900+static struct timer_list net_timer;
69901+
69902+#define MAX_PENDING_REQS 256
69903+
69904+static struct sk_buff_head rx_queue;
69905+
69906+static struct page **mmap_pages;
69907+static inline unsigned long idx_to_kaddr(unsigned int idx)
69908+{
69909+ return (unsigned long)pfn_to_kaddr(page_to_pfn(mmap_pages[idx]));
69910+}
69911+
69912+#define PKT_PROT_LEN 64
69913+
69914+static struct pending_tx_info {
69915+ netif_tx_request_t req;
69916+ netif_t *netif;
69917+} pending_tx_info[MAX_PENDING_REQS];
69918+static u16 pending_ring[MAX_PENDING_REQS];
69919+typedef unsigned int PEND_RING_IDX;
69920+#define MASK_PEND_IDX(_i) ((_i)&(MAX_PENDING_REQS-1))
69921+static PEND_RING_IDX pending_prod, pending_cons;
69922+#define NR_PENDING_REQS (MAX_PENDING_REQS - pending_prod + pending_cons)
69923+
69924+/* Freed TX SKBs get batched on this ring before return to pending_ring. */
69925+static u16 dealloc_ring[MAX_PENDING_REQS];
69926+static PEND_RING_IDX dealloc_prod, dealloc_cons;
69927+
69928+static struct sk_buff_head tx_queue;
69929+
69930+static grant_handle_t grant_tx_handle[MAX_PENDING_REQS];
69931+static gnttab_unmap_grant_ref_t tx_unmap_ops[MAX_PENDING_REQS];
69932+static gnttab_map_grant_ref_t tx_map_ops[MAX_PENDING_REQS];
69933+
69934+static struct list_head net_schedule_list;
69935+static spinlock_t net_schedule_list_lock;
69936+
69937+#define MAX_MFN_ALLOC 64
69938+static unsigned long mfn_list[MAX_MFN_ALLOC];
69939+static unsigned int alloc_index = 0;
69940+
69941+static inline unsigned long alloc_mfn(void)
69942+{
69943+ return mfn_list[--alloc_index];
69944+}
69945+
69946+static int check_mfn(int nr)
69947+{
69948+ struct xen_memory_reservation reservation = {
69949+ .extent_order = 0,
69950+ .domid = DOMID_SELF
69951+ };
69952+
69953+ if (likely(alloc_index >= nr))
69954+ return 0;
69955+
69956+ set_xen_guest_handle(reservation.extent_start, mfn_list + alloc_index);
69957+ reservation.nr_extents = MAX_MFN_ALLOC - alloc_index;
69958+ alloc_index += HYPERVISOR_memory_op(XENMEM_increase_reservation,
69959+ &reservation);
69960+
69961+ return alloc_index >= nr ? 0 : -ENOMEM;
69962+}
69963+
69964+static inline void maybe_schedule_tx_action(void)
69965+{
69966+ smp_mb();
69967+ if ((NR_PENDING_REQS < (MAX_PENDING_REQS/2)) &&
69968+ !list_empty(&net_schedule_list))
69969+ tasklet_schedule(&net_tx_tasklet);
69970+}
69971+
69972+/*
69973+ * A gross way of confirming the origin of an skb data page. The slab
69974+ * allocator abuses a field in the page struct to cache the kmem_cache_t ptr.
69975+ */
69976+static inline int is_xen_skb(struct sk_buff *skb)
69977+{
69978+ extern kmem_cache_t *skbuff_cachep;
69979+ kmem_cache_t *cp = (kmem_cache_t *)virt_to_page(skb->head)->lru.next;
69980+ return (cp == skbuff_cachep);
69981+}
69982+
69983+/*
69984+ * We can flip without copying the packet unless:
69985+ * 1. The data is not allocated from our special cache; or
69986+ * 2. The main data area is shared; or
69987+ * 3. One or more fragments are shared; or
69988+ * 4. There are chained fragments.
69989+ */
69990+static inline int is_flippable_skb(struct sk_buff *skb)
69991+{
69992+ int frag;
69993+
69994+ if (!is_xen_skb(skb) || skb_cloned(skb))
69995+ return 0;
69996+
69997+ for (frag = 0; frag < skb_shinfo(skb)->nr_frags; frag++) {
69998+ if (page_count(skb_shinfo(skb)->frags[frag].page) > 1)
69999+ return 0;
70000+ }
70001+
70002+ if (skb_shinfo(skb)->frag_list != NULL)
70003+ return 0;
70004+
70005+ return 1;
70006+}
70007+
70008+static struct sk_buff *netbk_copy_skb(struct sk_buff *skb)
70009+{
70010+ struct skb_shared_info *ninfo;
70011+ struct sk_buff *nskb;
70012+ unsigned long offset;
70013+ int ret;
70014+ int len;
70015+ int headlen;
70016+
70017+ BUG_ON(skb_shinfo(skb)->frag_list != NULL);
70018+
70019+ nskb = alloc_skb(SKB_MAX_HEAD(0), GFP_ATOMIC | __GFP_NOWARN);
70020+ if (unlikely(!nskb))
70021+ goto err;
70022+
70023+ skb_reserve(nskb, 16 + NET_IP_ALIGN);
70024+ headlen = nskb->end - nskb->data;
70025+ if (headlen > skb_headlen(skb))
70026+ headlen = skb_headlen(skb);
70027+ ret = skb_copy_bits(skb, 0, __skb_put(nskb, headlen), headlen);
70028+ BUG_ON(ret);
70029+
70030+ ninfo = skb_shinfo(nskb);
70031+ ninfo->gso_size = skb_shinfo(skb)->gso_size;
70032+ ninfo->gso_type = skb_shinfo(skb)->gso_type;
70033+
70034+ offset = headlen;
70035+ len = skb->len - headlen;
70036+
70037+ nskb->len = skb->len;
70038+ nskb->data_len = len;
70039+ nskb->truesize += len;
70040+
70041+ while (len) {
70042+ struct page *page;
70043+ int copy;
70044+ int zero;
70045+
70046+ if (unlikely(ninfo->nr_frags >= MAX_SKB_FRAGS)) {
70047+ dump_stack();
70048+ goto err_free;
70049+ }
70050+
70051+ copy = len >= PAGE_SIZE ? PAGE_SIZE : len;
70052+ zero = len >= PAGE_SIZE ? 0 : __GFP_ZERO;
70053+
70054+ page = alloc_page(GFP_ATOMIC | __GFP_NOWARN | zero);
70055+ if (unlikely(!page))
70056+ goto err_free;
70057+
70058+ ret = skb_copy_bits(skb, offset, page_address(page), copy);
70059+ BUG_ON(ret);
70060+
70061+ ninfo->frags[ninfo->nr_frags].page = page;
70062+ ninfo->frags[ninfo->nr_frags].page_offset = 0;
70063+ ninfo->frags[ninfo->nr_frags].size = copy;
70064+ ninfo->nr_frags++;
70065+
70066+ offset += copy;
70067+ len -= copy;
70068+ }
70069+
70070+ offset = nskb->data - skb->data;
70071+
70072+ nskb->h.raw = skb->h.raw + offset;
70073+ nskb->nh.raw = skb->nh.raw + offset;
70074+ nskb->mac.raw = skb->mac.raw + offset;
70075+
70076+ return nskb;
70077+
70078+ err_free:
70079+ kfree_skb(nskb);
70080+ err:
70081+ return NULL;
70082+}
70083+
70084+static inline int netbk_max_required_rx_slots(netif_t *netif)
70085+{
70086+ if (netif->features & (NETIF_F_SG|NETIF_F_TSO))
70087+ return MAX_SKB_FRAGS + 2; /* header + extra_info + frags */
70088+ return 1; /* all in one */
70089+}
70090+
70091+static inline int netbk_queue_full(netif_t *netif)
70092+{
70093+ RING_IDX peek = netif->rx_req_cons_peek;
70094+ RING_IDX needed = netbk_max_required_rx_slots(netif);
70095+
70096+ return ((netif->rx.sring->req_prod - peek) < needed) ||
70097+ ((netif->rx.rsp_prod_pvt + NET_RX_RING_SIZE - peek) < needed);
70098+}
70099+
70100+static void tx_queue_callback(unsigned long data)
70101+{
70102+ netif_t *netif = (netif_t *)data;
70103+ if (netif_schedulable(netif->dev))
70104+ netif_wake_queue(netif->dev);
70105+}
70106+
70107+int netif_be_start_xmit(struct sk_buff *skb, struct net_device *dev)
70108+{
70109+ netif_t *netif = netdev_priv(dev);
70110+
70111+ BUG_ON(skb->dev != dev);
70112+
70113+ /* Drop the packet if the target domain has no receive buffers. */
70114+ if (unlikely(!netif_schedulable(dev) || netbk_queue_full(netif)))
70115+ goto drop;
70116+
70117+ /*
70118+ * Copy the packet here if it's destined for a flipping interface
70119+ * but isn't flippable (e.g. extra references to data).
70120+ */
70121+ if (!netif->copying_receiver && !is_flippable_skb(skb)) {
70122+ struct sk_buff *nskb = netbk_copy_skb(skb);
70123+ if ( unlikely(nskb == NULL) )
70124+ goto drop;
70125+ /* Copy only the header fields we use in this driver. */
70126+ nskb->dev = skb->dev;
70127+ nskb->ip_summed = skb->ip_summed;
70128+ nskb->proto_data_valid = skb->proto_data_valid;
70129+ dev_kfree_skb(skb);
70130+ skb = nskb;
70131+ }
70132+
70133+ netif->rx_req_cons_peek += skb_shinfo(skb)->nr_frags + 1 +
70134+ !!skb_shinfo(skb)->gso_size;
70135+ netif_get(netif);
70136+
70137+ if (netbk_can_queue(dev) && netbk_queue_full(netif)) {
70138+ netif->rx.sring->req_event = netif->rx_req_cons_peek +
70139+ netbk_max_required_rx_slots(netif);
70140+ mb(); /* request notification /then/ check & stop the queue */
70141+ if (netbk_queue_full(netif)) {
70142+ netif_stop_queue(dev);
70143+ /*
70144+ * Schedule 500ms timeout to restart the queue, thus
70145+ * ensuring that an inactive queue will be drained.
70146+			 * Packets will be immediately dropped until more
70147+ * receive buffers become available (see
70148+ * netbk_queue_full() check above).
70149+ */
70150+ netif->tx_queue_timeout.data = (unsigned long)netif;
70151+ netif->tx_queue_timeout.function = tx_queue_callback;
70152+ __mod_timer(&netif->tx_queue_timeout, jiffies + HZ/2);
70153+ }
70154+ }
70155+
70156+ skb_queue_tail(&rx_queue, skb);
70157+ tasklet_schedule(&net_rx_tasklet);
70158+
70159+ return 0;
70160+
70161+ drop:
70162+ netif->stats.tx_dropped++;
70163+ dev_kfree_skb(skb);
70164+ return 0;
70165+}
70166+
70167+#if 0
70168+static void xen_network_done_notify(void)
70169+{
70170+ static struct net_device *eth0_dev = NULL;
70171+ if (unlikely(eth0_dev == NULL))
70172+ eth0_dev = __dev_get_by_name("eth0");
70173+ netif_rx_schedule(eth0_dev);
70174+}
70175+/*
70176+ * Add following to poll() function in NAPI driver (Tigon3 is example):
70177+ * if ( xen_network_done() )
70178+ * tg3_enable_ints(tp);
70179+ */
70180+int xen_network_done(void)
70181+{
70182+ return skb_queue_empty(&rx_queue);
70183+}
70184+#endif
70185+
70186+struct netrx_pending_operations {
70187+ unsigned trans_prod, trans_cons;
70188+ unsigned mmu_prod, mmu_cons;
70189+ unsigned mcl_prod, mcl_cons;
70190+ unsigned copy_prod, copy_cons;
70191+ unsigned meta_prod, meta_cons;
70192+ mmu_update_t *mmu;
70193+ gnttab_transfer_t *trans;
70194+ gnttab_copy_t *copy;
70195+ multicall_entry_t *mcl;
70196+ struct netbk_rx_meta *meta;
70197+};
70198+
70199+/* Set up the grant operations for this fragment. If it's a flipping
70200+ interface, we also set up the unmap request from here. */
70201+static u16 netbk_gop_frag(netif_t *netif, struct netbk_rx_meta *meta,
70202+ int i, struct netrx_pending_operations *npo,
70203+ struct page *page, unsigned long size,
70204+ unsigned long offset)
70205+{
70206+ mmu_update_t *mmu;
70207+ gnttab_transfer_t *gop;
70208+ gnttab_copy_t *copy_gop;
70209+ multicall_entry_t *mcl;
70210+ netif_rx_request_t *req;
70211+ unsigned long old_mfn, new_mfn;
70212+
70213+ old_mfn = virt_to_mfn(page_address(page));
70214+
70215+ req = RING_GET_REQUEST(&netif->rx, netif->rx.req_cons + i);
70216+ if (netif->copying_receiver) {
70217+ /* The fragment needs to be copied rather than
70218+ flipped. */
70219+ meta->copy = 1;
70220+ copy_gop = npo->copy + npo->copy_prod++;
70221+ copy_gop->flags = GNTCOPY_dest_gref;
70222+ if (PageForeign(page)) {
70223+ struct pending_tx_info *src_pend =
70224+ &pending_tx_info[page->index];
70225+ copy_gop->source.domid = src_pend->netif->domid;
70226+ copy_gop->source.u.ref = src_pend->req.gref;
70227+ copy_gop->flags |= GNTCOPY_source_gref;
70228+ } else {
70229+ copy_gop->source.domid = DOMID_SELF;
70230+ copy_gop->source.u.gmfn = old_mfn;
70231+ }
70232+ copy_gop->source.offset = offset;
70233+ copy_gop->dest.domid = netif->domid;
70234+ copy_gop->dest.offset = 0;
70235+ copy_gop->dest.u.ref = req->gref;
70236+ copy_gop->len = size;
70237+ } else {
70238+ meta->copy = 0;
70239+ if (!xen_feature(XENFEAT_auto_translated_physmap)) {
70240+ new_mfn = alloc_mfn();
70241+
70242+ /*
70243+ * Set the new P2M table entry before
70244+ * reassigning the old data page. Heed the
70245+ * comment in pgtable-2level.h:pte_page(). :-)
70246+ */
70247+ set_phys_to_machine(page_to_pfn(page), new_mfn);
70248+
70249+ mcl = npo->mcl + npo->mcl_prod++;
70250+ MULTI_update_va_mapping(mcl,
70251+ (unsigned long)page_address(page),
70252+ pfn_pte_ma(new_mfn, PAGE_KERNEL),
70253+ 0);
70254+
70255+ mmu = npo->mmu + npo->mmu_prod++;
70256+ mmu->ptr = ((maddr_t)new_mfn << PAGE_SHIFT) |
70257+ MMU_MACHPHYS_UPDATE;
70258+ mmu->val = page_to_pfn(page);
70259+ }
70260+
70261+ gop = npo->trans + npo->trans_prod++;
70262+ gop->mfn = old_mfn;
70263+ gop->domid = netif->domid;
70264+ gop->ref = req->gref;
70265+ }
70266+ return req->id;
70267+}
70268+
70269+static void netbk_gop_skb(struct sk_buff *skb,
70270+ struct netrx_pending_operations *npo)
70271+{
70272+ netif_t *netif = netdev_priv(skb->dev);
70273+ int nr_frags = skb_shinfo(skb)->nr_frags;
70274+ int i;
70275+ int extra;
70276+ struct netbk_rx_meta *head_meta, *meta;
70277+
70278+ head_meta = npo->meta + npo->meta_prod++;
70279+ head_meta->frag.page_offset = skb_shinfo(skb)->gso_type;
70280+ head_meta->frag.size = skb_shinfo(skb)->gso_size;
70281+ extra = !!head_meta->frag.size + 1;
70282+
70283+ for (i = 0; i < nr_frags; i++) {
70284+ meta = npo->meta + npo->meta_prod++;
70285+ meta->frag = skb_shinfo(skb)->frags[i];
70286+ meta->id = netbk_gop_frag(netif, meta, i + extra, npo,
70287+ meta->frag.page,
70288+ meta->frag.size,
70289+ meta->frag.page_offset);
70290+ }
70291+
70292+ /*
70293+ * This must occur at the end to ensure that we don't trash
70294+ * skb_shinfo until we're done.
70295+ */
70296+ head_meta->id = netbk_gop_frag(netif, head_meta, 0, npo,
70297+ virt_to_page(skb->data),
70298+ skb_headlen(skb),
70299+ offset_in_page(skb->data));
70300+
70301+ netif->rx.req_cons += nr_frags + extra;
70302+}
70303+
70304+static inline void netbk_free_pages(int nr_frags, struct netbk_rx_meta *meta)
70305+{
70306+ int i;
70307+
70308+ for (i = 0; i < nr_frags; i++)
70309+ put_page(meta[i].frag.page);
70310+}
70311+
70312+/* This is a twin to netbk_gop_skb. Assume that netbk_gop_skb was
70313+ used to set up the operations on the top of
70314+ netrx_pending_operations, which have since been done. Check that
70315+ they didn't give any errors and advance over them. */
70316+static int netbk_check_gop(int nr_frags, domid_t domid,
70317+ struct netrx_pending_operations *npo)
70318+{
70319+ multicall_entry_t *mcl;
70320+ gnttab_transfer_t *gop;
70321+ gnttab_copy_t *copy_op;
70322+ int status = NETIF_RSP_OKAY;
70323+ int i;
70324+
70325+ for (i = 0; i <= nr_frags; i++) {
70326+ if (npo->meta[npo->meta_cons + i].copy) {
70327+ copy_op = npo->copy + npo->copy_cons++;
70328+ if (copy_op->status != GNTST_okay) {
70329+ DPRINTK("Bad status %d from copy to DOM%d.\n",
70330+ copy_op->status, domid);
70331+ status = NETIF_RSP_ERROR;
70332+ }
70333+ } else {
70334+ if (!xen_feature(XENFEAT_auto_translated_physmap)) {
70335+ mcl = npo->mcl + npo->mcl_cons++;
70336+ /* The update_va_mapping() must not fail. */
70337+ BUG_ON(mcl->result != 0);
70338+ }
70339+
70340+ gop = npo->trans + npo->trans_cons++;
70341+ /* Check the reassignment error code. */
70342+ if (gop->status != 0) {
70343+ DPRINTK("Bad status %d from grant transfer to DOM%u\n",
70344+ gop->status, domid);
70345+ /*
70346+ * Page no longer belongs to us unless
70347+ * GNTST_bad_page, but that should be
70348+ * a fatal error anyway.
70349+ */
70350+ BUG_ON(gop->status == GNTST_bad_page);
70351+ status = NETIF_RSP_ERROR;
70352+ }
70353+ }
70354+ }
70355+
70356+ return status;
70357+}
70358+
70359+static void netbk_add_frag_responses(netif_t *netif, int status,
70360+ struct netbk_rx_meta *meta, int nr_frags)
70361+{
70362+ int i;
70363+ unsigned long offset;
70364+
70365+ for (i = 0; i < nr_frags; i++) {
70366+ int id = meta[i].id;
70367+ int flags = (i == nr_frags - 1) ? 0 : NETRXF_more_data;
70368+
70369+ if (meta[i].copy)
70370+ offset = 0;
70371+ else
70372+ offset = meta[i].frag.page_offset;
70373+ make_rx_response(netif, id, status, offset,
70374+ meta[i].frag.size, flags);
70375+ }
70376+}
70377+
70378+static void net_rx_action(unsigned long unused)
70379+{
70380+ netif_t *netif = NULL;
70381+ s8 status;
70382+ u16 id, irq, flags;
70383+ netif_rx_response_t *resp;
70384+ multicall_entry_t *mcl;
70385+ struct sk_buff_head rxq;
70386+ struct sk_buff *skb;
70387+ int notify_nr = 0;
70388+ int ret;
70389+ int nr_frags;
70390+ int count;
70391+ unsigned long offset;
70392+
70393+ /*
70394+ * Putting hundreds of bytes on the stack is considered rude.
70395+ * Static works because a tasklet can only be on one CPU at any time.
70396+ */
70397+ static multicall_entry_t rx_mcl[NET_RX_RING_SIZE+3];
70398+ static mmu_update_t rx_mmu[NET_RX_RING_SIZE];
70399+ static gnttab_transfer_t grant_trans_op[NET_RX_RING_SIZE];
70400+ static gnttab_copy_t grant_copy_op[NET_RX_RING_SIZE];
70401+ static unsigned char rx_notify[NR_IRQS];
70402+ static u16 notify_list[NET_RX_RING_SIZE];
70403+ static struct netbk_rx_meta meta[NET_RX_RING_SIZE];
70404+
70405+ struct netrx_pending_operations npo = {
70406+ mmu: rx_mmu,
70407+ trans: grant_trans_op,
70408+ copy: grant_copy_op,
70409+ mcl: rx_mcl,
70410+ meta: meta};
70411+
70412+ skb_queue_head_init(&rxq);
70413+
70414+ count = 0;
70415+
70416+ while ((skb = skb_dequeue(&rx_queue)) != NULL) {
70417+ nr_frags = skb_shinfo(skb)->nr_frags;
70418+ *(int *)skb->cb = nr_frags;
70419+
70420+ if (!xen_feature(XENFEAT_auto_translated_physmap) &&
70421+ check_mfn(nr_frags + 1)) {
70422+ /* Memory squeeze? Back off for an arbitrary while. */
70423+ if ( net_ratelimit() )
70424+ WPRINTK("Memory squeeze in netback "
70425+ "driver.\n");
70426+ mod_timer(&net_timer, jiffies + HZ);
70427+ skb_queue_head(&rx_queue, skb);
70428+ break;
70429+ }
70430+
70431+ netbk_gop_skb(skb, &npo);
70432+
70433+ count += nr_frags + 1;
70434+
70435+ __skb_queue_tail(&rxq, skb);
70436+
70437+ /* Filled the batch queue? */
70438+ if (count + MAX_SKB_FRAGS >= NET_RX_RING_SIZE)
70439+ break;
70440+ }
70441+
70442+ if (npo.mcl_prod &&
70443+ !xen_feature(XENFEAT_auto_translated_physmap)) {
70444+ mcl = npo.mcl + npo.mcl_prod++;
70445+
70446+ BUG_ON(mcl[-1].op != __HYPERVISOR_update_va_mapping);
70447+ mcl[-1].args[MULTI_UVMFLAGS_INDEX] = UVMF_TLB_FLUSH|UVMF_ALL;
70448+
70449+ mcl->op = __HYPERVISOR_mmu_update;
70450+ mcl->args[0] = (unsigned long)rx_mmu;
70451+ mcl->args[1] = npo.mmu_prod;
70452+ mcl->args[2] = 0;
70453+ mcl->args[3] = DOMID_SELF;
70454+ }
70455+
70456+ if (npo.trans_prod) {
70457+ mcl = npo.mcl + npo.mcl_prod++;
70458+ mcl->op = __HYPERVISOR_grant_table_op;
70459+ mcl->args[0] = GNTTABOP_transfer;
70460+ mcl->args[1] = (unsigned long)grant_trans_op;
70461+ mcl->args[2] = npo.trans_prod;
70462+ }
70463+
70464+ if (npo.copy_prod) {
70465+ mcl = npo.mcl + npo.mcl_prod++;
70466+ mcl->op = __HYPERVISOR_grant_table_op;
70467+ mcl->args[0] = GNTTABOP_copy;
70468+ mcl->args[1] = (unsigned long)grant_copy_op;
70469+ mcl->args[2] = npo.copy_prod;
70470+ }
70471+
70472+ /* Nothing to do? */
70473+ if (!npo.mcl_prod)
70474+ return;
70475+
70476+ BUG_ON(npo.copy_prod > NET_RX_RING_SIZE);
70477+ BUG_ON(npo.mmu_prod > NET_RX_RING_SIZE);
70478+ BUG_ON(npo.trans_prod > NET_RX_RING_SIZE);
70479+ BUG_ON(npo.mcl_prod > NET_RX_RING_SIZE+3);
70480+ BUG_ON(npo.meta_prod > NET_RX_RING_SIZE);
70481+
70482+ ret = HYPERVISOR_multicall(npo.mcl, npo.mcl_prod);
70483+ BUG_ON(ret != 0);
70484+
70485+ while ((skb = __skb_dequeue(&rxq)) != NULL) {
70486+ nr_frags = *(int *)skb->cb;
70487+
70488+ netif = netdev_priv(skb->dev);
70489+ /* We can't rely on skb_release_data to release the
70490+ pages used by fragments for us, since it tries to
70491+ touch the pages in the fraglist. If we're in
70492+ flipping mode, that doesn't work. In copying mode,
70493+ we still have access to all of the pages, and so
70494+ it's safe to let release_data deal with it. */
70495+ /* (Freeing the fragments is safe since we copy
70496+ non-linear skbs destined for flipping interfaces) */
70497+ if (!netif->copying_receiver) {
70498+ atomic_set(&(skb_shinfo(skb)->dataref), 1);
70499+ skb_shinfo(skb)->frag_list = NULL;
70500+ skb_shinfo(skb)->nr_frags = 0;
70501+ netbk_free_pages(nr_frags, meta + npo.meta_cons + 1);
70502+ }
70503+
70504+ netif->stats.tx_bytes += skb->len;
70505+ netif->stats.tx_packets++;
70506+
70507+ status = netbk_check_gop(nr_frags, netif->domid, &npo);
70508+
70509+ id = meta[npo.meta_cons].id;
70510+ flags = nr_frags ? NETRXF_more_data : 0;
70511+
70512+ if (skb->ip_summed == CHECKSUM_HW) /* local packet? */
70513+ flags |= NETRXF_csum_blank | NETRXF_data_validated;
70514+ else if (skb->proto_data_valid) /* remote but checksummed? */
70515+ flags |= NETRXF_data_validated;
70516+
70517+ if (meta[npo.meta_cons].copy)
70518+ offset = 0;
70519+ else
70520+ offset = offset_in_page(skb->data);
70521+ resp = make_rx_response(netif, id, status, offset,
70522+ skb_headlen(skb), flags);
70523+
70524+ if (meta[npo.meta_cons].frag.size) {
70525+ struct netif_extra_info *gso =
70526+ (struct netif_extra_info *)
70527+ RING_GET_RESPONSE(&netif->rx,
70528+ netif->rx.rsp_prod_pvt++);
70529+
70530+ resp->flags |= NETRXF_extra_info;
70531+
70532+ gso->u.gso.size = meta[npo.meta_cons].frag.size;
70533+ gso->u.gso.type = XEN_NETIF_GSO_TYPE_TCPV4;
70534+ gso->u.gso.pad = 0;
70535+ gso->u.gso.features = 0;
70536+
70537+ gso->type = XEN_NETIF_EXTRA_TYPE_GSO;
70538+ gso->flags = 0;
70539+ }
70540+
70541+ netbk_add_frag_responses(netif, status,
70542+ meta + npo.meta_cons + 1,
70543+ nr_frags);
70544+
70545+ RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&netif->rx, ret);
70546+ irq = netif->irq;
70547+ if (ret && !rx_notify[irq]) {
70548+ rx_notify[irq] = 1;
70549+ notify_list[notify_nr++] = irq;
70550+ }
70551+
70552+ if (netif_queue_stopped(netif->dev) &&
70553+ netif_schedulable(netif->dev) &&
70554+ !netbk_queue_full(netif))
70555+ netif_wake_queue(netif->dev);
70556+
70557+ netif_put(netif);
70558+ dev_kfree_skb(skb);
70559+ npo.meta_cons += nr_frags + 1;
70560+ }
70561+
70562+ while (notify_nr != 0) {
70563+ irq = notify_list[--notify_nr];
70564+ rx_notify[irq] = 0;
70565+ notify_remote_via_irq(irq);
70566+ }
70567+
70568+ /* More work to do? */
70569+ if (!skb_queue_empty(&rx_queue) && !timer_pending(&net_timer))
70570+ tasklet_schedule(&net_rx_tasklet);
70571+#if 0
70572+ else
70573+ xen_network_done_notify();
70574+#endif
70575+}
70576+
70577+static void net_alarm(unsigned long unused)
70578+{
70579+ tasklet_schedule(&net_rx_tasklet);
70580+}
70581+
70582+struct net_device_stats *netif_be_get_stats(struct net_device *dev)
70583+{
70584+ netif_t *netif = netdev_priv(dev);
70585+ return &netif->stats;
70586+}
70587+
70588+static int __on_net_schedule_list(netif_t *netif)
70589+{
70590+ return netif->list.next != NULL;
70591+}
70592+
70593+static void remove_from_net_schedule_list(netif_t *netif)
70594+{
70595+ spin_lock_irq(&net_schedule_list_lock);
70596+ if (likely(__on_net_schedule_list(netif))) {
70597+ list_del(&netif->list);
70598+ netif->list.next = NULL;
70599+ netif_put(netif);
70600+ }
70601+ spin_unlock_irq(&net_schedule_list_lock);
70602+}
70603+
70604+static void add_to_net_schedule_list_tail(netif_t *netif)
70605+{
70606+ if (__on_net_schedule_list(netif))
70607+ return;
70608+
70609+ spin_lock_irq(&net_schedule_list_lock);
70610+ if (!__on_net_schedule_list(netif) &&
70611+ likely(netif_schedulable(netif->dev))) {
70612+ list_add_tail(&netif->list, &net_schedule_list);
70613+ netif_get(netif);
70614+ }
70615+ spin_unlock_irq(&net_schedule_list_lock);
70616+}
70617+
70618+/*
70619+ * Note on CONFIG_XEN_NETDEV_PIPELINED_TRANSMITTER:
70620+ * If this driver is pipelining transmit requests then we can be very
70621+ * aggressive in avoiding new-packet notifications -- frontend only needs to
70622+ * send a notification if there are no outstanding unreceived responses.
70623+ * If we may be buffering transmit requests for any reason then we must be rather
70624+ * more conservative and treat this as the final check for pending work.
70625+ */
70626+void netif_schedule_work(netif_t *netif)
70627+{
70628+ int more_to_do;
70629+
70630+#ifdef CONFIG_XEN_NETDEV_PIPELINED_TRANSMITTER
70631+ more_to_do = RING_HAS_UNCONSUMED_REQUESTS(&netif->tx);
70632+#else
70633+ RING_FINAL_CHECK_FOR_REQUESTS(&netif->tx, more_to_do);
70634+#endif
70635+
70636+ if (more_to_do) {
70637+ add_to_net_schedule_list_tail(netif);
70638+ maybe_schedule_tx_action();
70639+ }
70640+}
70641+
70642+void netif_deschedule_work(netif_t *netif)
70643+{
70644+ remove_from_net_schedule_list(netif);
70645+}
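
The note above contrasts the aggressive check used when pipelining with the conservative final check used otherwise. A rough, standalone C sketch of the final-check idea follows; the struct layout and names are invented for illustration and are not the real ring.h macros, which also involve memory barriers and the shared-ring layout.

/* Illustrative sketch: a simplified "final check for requests".
 * demo_sring and demo_final_check_for_requests are hypothetical names,
 * not part of this patch or of Xen's ring.h. */
struct demo_sring {
	volatile unsigned int req_prod;		/* advanced by the frontend */
	volatile unsigned int req_event;	/* frontend notifies when req_prod passes this */
};

static int demo_final_check_for_requests(struct demo_sring *s, unsigned int req_cons)
{
	if (s->req_prod != req_cons)
		return 1;			/* work already queued */
	s->req_event = req_cons + 1;		/* ask to be notified about the next request */
	/* a memory barrier would sit here in real code */
	return s->req_prod != req_cons;		/* re-check to close the notification race */
}

int main(void)
{
	struct demo_sring s = { .req_prod = 3, .req_event = 0 };
	return demo_final_check_for_requests(&s, 3);	/* returns 0: nothing pending */
}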
70646+
70647+
70648+static void tx_add_credit(netif_t *netif)
70649+{
70650+ unsigned long max_burst, max_credit;
70651+
70652+ /*
70653+ * Allow a burst big enough to transmit a jumbo packet of up to 128kB.
70654+ * Otherwise the interface can seize up due to insufficient credit.
70655+ */
70656+ max_burst = RING_GET_REQUEST(&netif->tx, netif->tx.req_cons)->size;
70657+ max_burst = min(max_burst, 131072UL);
70658+ max_burst = max(max_burst, netif->credit_bytes);
70659+
70660+ /* Take care that adding a new chunk of credit doesn't wrap to zero. */
70661+ max_credit = netif->remaining_credit + netif->credit_bytes;
70662+ if (max_credit < netif->remaining_credit)
70663+ max_credit = ULONG_MAX; /* wrapped: clamp to ULONG_MAX */
70664+
70665+ netif->remaining_credit = min(max_credit, max_burst);
70666+}
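
The clamping in tx_add_credit() can be exercised on its own. Below is a minimal user-space sketch, with made-up names (credit_state, replenish_credit), showing how the wrap check caps the accumulated credit at ULONG_MAX before the burst limit is applied.

/* Minimal sketch of the credit-replenishment clamping above.
 * credit_state and replenish_credit are illustrative names only. */
#include <assert.h>
#include <limits.h>

struct credit_state {
	unsigned long credit_bytes;	/* bytes granted per replenish period */
	unsigned long remaining_credit;	/* current allowance */
};

static void replenish_credit(struct credit_state *c, unsigned long max_burst)
{
	unsigned long max_credit = c->remaining_credit + c->credit_bytes;

	if (max_credit < c->remaining_credit)	/* addition wrapped around */
		max_credit = ULONG_MAX;

	c->remaining_credit = (max_credit < max_burst) ? max_credit : max_burst;
}

int main(void)
{
	struct credit_state c = {
		.credit_bytes = 100000,
		.remaining_credit = ULONG_MAX - 1,	/* about to wrap */
	};

	replenish_credit(&c, 131072);		/* 128kB burst cap, as above */
	assert(c.remaining_credit == 131072);	/* wrap detected, then capped */
	return 0;
}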
70667+
70668+static void tx_credit_callback(unsigned long data)
70669+{
70670+ netif_t *netif = (netif_t *)data;
70671+ tx_add_credit(netif);
70672+ netif_schedule_work(netif);
70673+}
70674+
70675+static inline void net_tx_action_dealloc(void)
70676+{
70677+ gnttab_unmap_grant_ref_t *gop;
70678+ u16 pending_idx;
70679+ PEND_RING_IDX dc, dp;
70680+ netif_t *netif;
70681+ int ret;
70682+
70683+ dc = dealloc_cons;
70684+ dp = dealloc_prod;
70685+
70686+ /* Ensure we see all indexes enqueued by netif_idx_release(). */
70687+ smp_rmb();
70688+
70689+ /*
70690+ * Free up any grants we have finished using
70691+ */
70692+ gop = tx_unmap_ops;
70693+ while (dc != dp) {
70694+ pending_idx = dealloc_ring[MASK_PEND_IDX(dc++)];
70695+ gnttab_set_unmap_op(gop, idx_to_kaddr(pending_idx),
70696+ GNTMAP_host_map,
70697+ grant_tx_handle[pending_idx]);
70698+ gop++;
70699+ }
70700+ ret = HYPERVISOR_grant_table_op(
70701+ GNTTABOP_unmap_grant_ref, tx_unmap_ops, gop - tx_unmap_ops);
70702+ BUG_ON(ret);
70703+
70704+ while (dealloc_cons != dp) {
70705+ pending_idx = dealloc_ring[MASK_PEND_IDX(dealloc_cons++)];
70706+
70707+ netif = pending_tx_info[pending_idx].netif;
70708+
70709+ make_tx_response(netif, &pending_tx_info[pending_idx].req,
70710+ NETIF_RSP_OKAY);
70711+
70712+ pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx;
70713+
70714+ netif_put(netif);
70715+ }
70716+}
70717+
70718+static void netbk_tx_err(netif_t *netif, netif_tx_request_t *txp, RING_IDX end)
70719+{
70720+ RING_IDX cons = netif->tx.req_cons;
70721+
70722+ do {
70723+ make_tx_response(netif, txp, NETIF_RSP_ERROR);
70724+ if (cons >= end)
70725+ break;
70726+ txp = RING_GET_REQUEST(&netif->tx, cons++);
70727+ } while (1);
70728+ netif->tx.req_cons = cons;
70729+ netif_schedule_work(netif);
70730+ netif_put(netif);
70731+}
70732+
70733+static int netbk_count_requests(netif_t *netif, netif_tx_request_t *first,
70734+ netif_tx_request_t *txp, int work_to_do)
70735+{
70736+ RING_IDX cons = netif->tx.req_cons;
70737+ int frags = 0;
70738+
70739+ if (!(first->flags & NETTXF_more_data))
70740+ return 0;
70741+
70742+ do {
70743+ if (frags >= work_to_do) {
70744+ DPRINTK("Need more frags\n");
70745+ return -frags;
70746+ }
70747+
70748+ if (unlikely(frags >= MAX_SKB_FRAGS)) {
70749+ DPRINTK("Too many frags\n");
70750+ return -frags;
70751+ }
70752+
70753+ memcpy(txp, RING_GET_REQUEST(&netif->tx, cons + frags),
70754+ sizeof(*txp));
70755+ if (txp->size > first->size) {
70756+ DPRINTK("Frags galore\n");
70757+ return -frags;
70758+ }
70759+
70760+ first->size -= txp->size;
70761+ frags++;
70762+
70763+ if (unlikely((txp->offset + txp->size) > PAGE_SIZE)) {
70764+ DPRINTK("txp->offset: %x, size: %u\n",
70765+ txp->offset, txp->size);
70766+ return -frags;
70767+ }
70768+ } while ((txp++)->flags & NETTXF_more_data);
70769+
70770+ return frags;
70771+}
70772+
70773+static gnttab_map_grant_ref_t *netbk_get_requests(netif_t *netif,
70774+ struct sk_buff *skb,
70775+ netif_tx_request_t *txp,
70776+ gnttab_map_grant_ref_t *mop)
70777+{
70778+ struct skb_shared_info *shinfo = skb_shinfo(skb);
70779+ skb_frag_t *frags = shinfo->frags;
70780+ unsigned long pending_idx = *((u16 *)skb->data);
70781+ int i, start;
70782+
70783+ /* Skip first skb fragment if it is on same page as header fragment. */
70784+ start = ((unsigned long)shinfo->frags[0].page == pending_idx);
70785+
70786+ for (i = start; i < shinfo->nr_frags; i++, txp++) {
70787+ pending_idx = pending_ring[MASK_PEND_IDX(pending_cons++)];
70788+
70789+ gnttab_set_map_op(mop++, idx_to_kaddr(pending_idx),
70790+ GNTMAP_host_map | GNTMAP_readonly,
70791+ txp->gref, netif->domid);
70792+
70793+ memcpy(&pending_tx_info[pending_idx].req, txp, sizeof(*txp));
70794+ netif_get(netif);
70795+ pending_tx_info[pending_idx].netif = netif;
70796+ frags[i].page = (void *)pending_idx;
70797+ }
70798+
70799+ return mop;
70800+}
70801+
70802+static int netbk_tx_check_mop(struct sk_buff *skb,
70803+ gnttab_map_grant_ref_t **mopp)
70804+{
70805+ gnttab_map_grant_ref_t *mop = *mopp;
70806+ int pending_idx = *((u16 *)skb->data);
70807+ netif_t *netif = pending_tx_info[pending_idx].netif;
70808+ netif_tx_request_t *txp;
70809+ struct skb_shared_info *shinfo = skb_shinfo(skb);
70810+ int nr_frags = shinfo->nr_frags;
70811+ int i, err, start;
70812+
70813+ /* Check status of header. */
70814+ err = mop->status;
70815+ if (unlikely(err)) {
70816+ txp = &pending_tx_info[pending_idx].req;
70817+ make_tx_response(netif, txp, NETIF_RSP_ERROR);
70818+ pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx;
70819+ netif_put(netif);
70820+ } else {
70821+ set_phys_to_machine(
70822+ __pa(idx_to_kaddr(pending_idx)) >> PAGE_SHIFT,
70823+ FOREIGN_FRAME(mop->dev_bus_addr >> PAGE_SHIFT));
70824+ grant_tx_handle[pending_idx] = mop->handle;
70825+ }
70826+
70827+ /* Skip first skb fragment if it is on same page as header fragment. */
70828+ start = ((unsigned long)shinfo->frags[0].page == pending_idx);
70829+
70830+ for (i = start; i < nr_frags; i++) {
70831+ int j, newerr;
70832+
70833+ pending_idx = (unsigned long)shinfo->frags[i].page;
70834+
70835+ /* Check error status: if okay then remember grant handle. */
70836+ newerr = (++mop)->status;
70837+ if (likely(!newerr)) {
70838+ set_phys_to_machine(
70839+ __pa(idx_to_kaddr(pending_idx))>>PAGE_SHIFT,
70840+ FOREIGN_FRAME(mop->dev_bus_addr>>PAGE_SHIFT));
70841+ grant_tx_handle[pending_idx] = mop->handle;
70842+ /* Had a previous error? Invalidate this fragment. */
70843+ if (unlikely(err))
70844+ netif_idx_release(pending_idx);
70845+ continue;
70846+ }
70847+
70848+ /* Error on this fragment: respond to client with an error. */
70849+ txp = &pending_tx_info[pending_idx].req;
70850+ make_tx_response(netif, txp, NETIF_RSP_ERROR);
70851+ pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx;
70852+ netif_put(netif);
70853+
70854+ /* Not the first error? Preceding frags already invalidated. */
70855+ if (err)
70856+ continue;
70857+
70858+ /* First error: invalidate header and preceding fragments. */
70859+ pending_idx = *((u16 *)skb->data);
70860+ netif_idx_release(pending_idx);
70861+ for (j = start; j < i; j++) {
70862+ pending_idx = (unsigned long)shinfo->frags[j].page;
70863+ netif_idx_release(pending_idx);
70864+ }
70865+
70866+ /* Remember the error: invalidate all subsequent fragments. */
70867+ err = newerr;
70868+ }
70869+
70870+ *mopp = mop + 1;
70871+ return err;
70872+}
70873+
70874+static void netbk_fill_frags(struct sk_buff *skb)
70875+{
70876+ struct skb_shared_info *shinfo = skb_shinfo(skb);
70877+ int nr_frags = shinfo->nr_frags;
70878+ int i;
70879+
70880+ for (i = 0; i < nr_frags; i++) {
70881+ skb_frag_t *frag = shinfo->frags + i;
70882+ netif_tx_request_t *txp;
70883+ unsigned long pending_idx;
70884+
70885+ pending_idx = (unsigned long)frag->page;
70886+ txp = &pending_tx_info[pending_idx].req;
70887+ frag->page = virt_to_page(idx_to_kaddr(pending_idx));
70888+ frag->size = txp->size;
70889+ frag->page_offset = txp->offset;
70890+
70891+ skb->len += txp->size;
70892+ skb->data_len += txp->size;
70893+ skb->truesize += txp->size;
70894+ }
70895+}
70896+
70897+int netbk_get_extras(netif_t *netif, struct netif_extra_info *extras,
70898+ int work_to_do)
70899+{
70900+ struct netif_extra_info extra;
70901+ RING_IDX cons = netif->tx.req_cons;
70902+
70903+ do {
70904+ if (unlikely(work_to_do-- <= 0)) {
70905+ DPRINTK("Missing extra info\n");
70906+ return -EBADR;
70907+ }
70908+
70909+ memcpy(&extra, RING_GET_REQUEST(&netif->tx, cons),
70910+ sizeof(extra));
70911+ if (unlikely(!extra.type ||
70912+ extra.type >= XEN_NETIF_EXTRA_TYPE_MAX)) {
70913+ netif->tx.req_cons = ++cons;
70914+ DPRINTK("Invalid extra type: %d\n", extra.type);
70915+ return -EINVAL;
70916+ }
70917+
70918+ memcpy(&extras[extra.type - 1], &extra, sizeof(extra));
70919+ netif->tx.req_cons = ++cons;
70920+ } while (extra.flags & XEN_NETIF_EXTRA_FLAG_MORE);
70921+
70922+ return work_to_do;
70923+}
70924+
70925+static int netbk_set_skb_gso(struct sk_buff *skb, struct netif_extra_info *gso)
70926+{
70927+ if (!gso->u.gso.size) {
70928+ DPRINTK("GSO size must not be zero.\n");
70929+ return -EINVAL;
70930+ }
70931+
70932+ /* Currently only TCPv4 segmentation offload is supported. */
70933+ if (gso->u.gso.type != XEN_NETIF_GSO_TYPE_TCPV4) {
70934+ DPRINTK("Bad GSO type %d.\n", gso->u.gso.type);
70935+ return -EINVAL;
70936+ }
70937+
70938+ skb_shinfo(skb)->gso_size = gso->u.gso.size;
70939+ skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
70940+
70941+ /* Header must be checked, and gso_segs computed. */
70942+ skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
70943+ skb_shinfo(skb)->gso_segs = 0;
70944+
70945+ return 0;
70946+}
70947+
70948+/* Called after netfront has transmitted */
70949+static void net_tx_action(unsigned long unused)
70950+{
70951+ struct list_head *ent;
70952+ struct sk_buff *skb;
70953+ netif_t *netif;
70954+ netif_tx_request_t txreq;
70955+ netif_tx_request_t txfrags[MAX_SKB_FRAGS];
70956+ struct netif_extra_info extras[XEN_NETIF_EXTRA_TYPE_MAX - 1];
70957+ u16 pending_idx;
70958+ RING_IDX i;
70959+ gnttab_map_grant_ref_t *mop;
70960+ unsigned int data_len;
70961+ int ret, work_to_do;
70962+
70963+ if (dealloc_cons != dealloc_prod)
70964+ net_tx_action_dealloc();
70965+
70966+ mop = tx_map_ops;
70967+ while (((NR_PENDING_REQS + MAX_SKB_FRAGS) < MAX_PENDING_REQS) &&
70968+ !list_empty(&net_schedule_list)) {
70969+ /* Get a netif from the list with work to do. */
70970+ ent = net_schedule_list.next;
70971+ netif = list_entry(ent, netif_t, list);
70972+ netif_get(netif);
70973+ remove_from_net_schedule_list(netif);
70974+
70975+ RING_FINAL_CHECK_FOR_REQUESTS(&netif->tx, work_to_do);
70976+ if (!work_to_do) {
70977+ netif_put(netif);
70978+ continue;
70979+ }
70980+
70981+ i = netif->tx.req_cons;
70982+ rmb(); /* Ensure that we see the request before we copy it. */
70983+ memcpy(&txreq, RING_GET_REQUEST(&netif->tx, i), sizeof(txreq));
70984+
70985+ /* Credit-based scheduling. */
70986+ if (txreq.size > netif->remaining_credit) {
70987+ unsigned long now = jiffies;
70988+ unsigned long next_credit =
70989+ netif->credit_timeout.expires +
70990+ msecs_to_jiffies(netif->credit_usec / 1000);
70991+
70992+ /* Timer could already be pending in rare cases. */
70993+ if (timer_pending(&netif->credit_timeout)) {
70994+ netif_put(netif);
70995+ continue;
70996+ }
70997+
70998+ /* Passed the point where we can replenish credit? */
70999+ if (time_after_eq(now, next_credit)) {
71000+ netif->credit_timeout.expires = now;
71001+ tx_add_credit(netif);
71002+ }
71003+
71004+ /* Still too big to send right now? Set a callback. */
71005+ if (txreq.size > netif->remaining_credit) {
71006+ netif->credit_timeout.data =
71007+ (unsigned long)netif;
71008+ netif->credit_timeout.function =
71009+ tx_credit_callback;
71010+ __mod_timer(&netif->credit_timeout,
71011+ next_credit);
71012+ netif_put(netif);
71013+ continue;
71014+ }
71015+ }
71016+ netif->remaining_credit -= txreq.size;
71017+
71018+ work_to_do--;
71019+ netif->tx.req_cons = ++i;
71020+
71021+ memset(extras, 0, sizeof(extras));
71022+ if (txreq.flags & NETTXF_extra_info) {
71023+ work_to_do = netbk_get_extras(netif, extras,
71024+ work_to_do);
71025+ i = netif->tx.req_cons;
71026+ if (unlikely(work_to_do < 0)) {
71027+ netbk_tx_err(netif, &txreq, i);
71028+ continue;
71029+ }
71030+ }
71031+
71032+ ret = netbk_count_requests(netif, &txreq, txfrags, work_to_do);
71033+ if (unlikely(ret < 0)) {
71034+ netbk_tx_err(netif, &txreq, i - ret);
71035+ continue;
71036+ }
71037+ i += ret;
71038+
71039+ if (unlikely(txreq.size < ETH_HLEN)) {
71040+ DPRINTK("Bad packet size: %d\n", txreq.size);
71041+ netbk_tx_err(netif, &txreq, i);
71042+ continue;
71043+ }
71044+
71045+ /* Data must not cross a page boundary, as the payload cannot be split across pages. */
71046+ if (unlikely((txreq.offset + txreq.size) > PAGE_SIZE)) {
71047+ DPRINTK("txreq.offset: %x, size: %u, end: %lu\n",
71048+ txreq.offset, txreq.size,
71049+ (txreq.offset &~PAGE_MASK) + txreq.size);
71050+ netbk_tx_err(netif, &txreq, i);
71051+ continue;
71052+ }
71053+
71054+ pending_idx = pending_ring[MASK_PEND_IDX(pending_cons)];
71055+
71056+ data_len = (txreq.size > PKT_PROT_LEN &&
71057+ ret < MAX_SKB_FRAGS) ?
71058+ PKT_PROT_LEN : txreq.size;
71059+
71060+ skb = alloc_skb(data_len + 16 + NET_IP_ALIGN,
71061+ GFP_ATOMIC | __GFP_NOWARN);
71062+ if (unlikely(skb == NULL)) {
71063+ DPRINTK("Can't allocate a skb in start_xmit.\n");
71064+ netbk_tx_err(netif, &txreq, i);
71065+ break;
71066+ }
71067+
71068+ /* Packets passed to netif_rx() must have some headroom. */
71069+ skb_reserve(skb, 16 + NET_IP_ALIGN);
71070+
71071+ if (extras[XEN_NETIF_EXTRA_TYPE_GSO - 1].type) {
71072+ struct netif_extra_info *gso;
71073+ gso = &extras[XEN_NETIF_EXTRA_TYPE_GSO - 1];
71074+
71075+ if (netbk_set_skb_gso(skb, gso)) {
71076+ kfree_skb(skb);
71077+ netbk_tx_err(netif, &txreq, i);
71078+ continue;
71079+ }
71080+ }
71081+
71082+ gnttab_set_map_op(mop, idx_to_kaddr(pending_idx),
71083+ GNTMAP_host_map | GNTMAP_readonly,
71084+ txreq.gref, netif->domid);
71085+ mop++;
71086+
71087+ memcpy(&pending_tx_info[pending_idx].req,
71088+ &txreq, sizeof(txreq));
71089+ pending_tx_info[pending_idx].netif = netif;
71090+ *((u16 *)skb->data) = pending_idx;
71091+
71092+ __skb_put(skb, data_len);
71093+
71094+ skb_shinfo(skb)->nr_frags = ret;
71095+ if (data_len < txreq.size) {
71096+ skb_shinfo(skb)->nr_frags++;
71097+ skb_shinfo(skb)->frags[0].page =
71098+ (void *)(unsigned long)pending_idx;
71099+ } else {
71100+ /* Discriminate from any valid pending_idx value. */
71101+ skb_shinfo(skb)->frags[0].page = (void *)~0UL;
71102+ }
71103+
71104+ __skb_queue_tail(&tx_queue, skb);
71105+
71106+ pending_cons++;
71107+
71108+ mop = netbk_get_requests(netif, skb, txfrags, mop);
71109+
71110+ netif->tx.req_cons = i;
71111+ netif_schedule_work(netif);
71112+
71113+ if ((mop - tx_map_ops) >= ARRAY_SIZE(tx_map_ops))
71114+ break;
71115+ }
71116+
71117+ if (mop == tx_map_ops)
71118+ return;
71119+
71120+ ret = HYPERVISOR_grant_table_op(
71121+ GNTTABOP_map_grant_ref, tx_map_ops, mop - tx_map_ops);
71122+ BUG_ON(ret);
71123+
71124+ mop = tx_map_ops;
71125+ while ((skb = __skb_dequeue(&tx_queue)) != NULL) {
71126+ netif_tx_request_t *txp;
71127+
71128+ pending_idx = *((u16 *)skb->data);
71129+ netif = pending_tx_info[pending_idx].netif;
71130+ txp = &pending_tx_info[pending_idx].req;
71131+
71132+ /* Check the remap error code. */
71133+ if (unlikely(netbk_tx_check_mop(skb, &mop))) {
71134+ printk(KERN_ALERT "#### netback grant fails\n");
71135+ skb_shinfo(skb)->nr_frags = 0;
71136+ kfree_skb(skb);
71137+ continue;
71138+ }
71139+
71140+ data_len = skb->len;
71141+ memcpy(skb->data,
71142+ (void *)(idx_to_kaddr(pending_idx)|txp->offset),
71143+ data_len);
71144+ if (data_len < txp->size) {
71145+ /* Append the packet payload as a fragment. */
71146+ txp->offset += data_len;
71147+ txp->size -= data_len;
71148+ } else {
71149+ /* Schedule a response immediately. */
71150+ netif_idx_release(pending_idx);
71151+ }
71152+
71153+ /*
71154+ * Old frontends do not assert data_validated but we
71155+ * can infer it from csum_blank so test both flags.
71156+ */
71157+ if (txp->flags & (NETTXF_data_validated|NETTXF_csum_blank)) {
71158+ skb->ip_summed = CHECKSUM_UNNECESSARY;
71159+ skb->proto_data_valid = 1;
71160+ } else {
71161+ skb->ip_summed = CHECKSUM_NONE;
71162+ skb->proto_data_valid = 0;
71163+ }
71164+ skb->proto_csum_blank = !!(txp->flags & NETTXF_csum_blank);
71165+
71166+ netbk_fill_frags(skb);
71167+
71168+ skb->dev = netif->dev;
71169+ skb->protocol = eth_type_trans(skb, skb->dev);
71170+
71171+ netif->stats.rx_bytes += skb->len;
71172+ netif->stats.rx_packets++;
71173+
71174+ netif_rx(skb);
71175+ netif->dev->last_rx = jiffies;
71176+ }
71177+}
71178+
71179+static void netif_idx_release(u16 pending_idx)
71180+{
71181+ static DEFINE_SPINLOCK(_lock);
71182+ unsigned long flags;
71183+
71184+ spin_lock_irqsave(&_lock, flags);
71185+ dealloc_ring[MASK_PEND_IDX(dealloc_prod)] = pending_idx;
71186+ /* Sync with net_tx_action_dealloc: insert idx /then/ incr producer. */
71187+ smp_wmb();
71188+ dealloc_prod++;
71189+ spin_unlock_irqrestore(&_lock, flags);
71190+
71191+ tasklet_schedule(&net_tx_tasklet);
71192+}
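
netif_idx_release() stores the slot first and only then advances dealloc_prod, with smp_wmb() pairing against the smp_rmb() in net_tx_action_dealloc(). A rough user-space analogue of that single-producer/single-consumer handoff is sketched below, using C11 fences in place of the kernel barriers; the names and sizes are illustrative only.

/* Single-producer/single-consumer sketch: publish the slot, then the index.
 * C11 fences stand in for smp_wmb()/smp_rmb(); names are illustrative. */
#include <stdatomic.h>
#include <stdint.h>

#define DEMO_RING 256u				/* power of two */
static uint16_t demo_slots[DEMO_RING];
static _Atomic unsigned int demo_prod, demo_cons;

static void demo_produce(uint16_t idx)
{
	unsigned int p = atomic_load_explicit(&demo_prod, memory_order_relaxed);

	demo_slots[p & (DEMO_RING - 1)] = idx;
	atomic_thread_fence(memory_order_release);	/* like smp_wmb() */
	atomic_store_explicit(&demo_prod, p + 1, memory_order_relaxed);
}

static int demo_consume(uint16_t *out)
{
	unsigned int p = atomic_load_explicit(&demo_prod, memory_order_relaxed);
	unsigned int c = atomic_load_explicit(&demo_cons, memory_order_relaxed);

	if (p == c)
		return 0;				/* nothing queued */
	atomic_thread_fence(memory_order_acquire);	/* like smp_rmb() */
	*out = demo_slots[c & (DEMO_RING - 1)];
	atomic_store_explicit(&demo_cons, c + 1, memory_order_relaxed);
	return 1;
}

int main(void)
{
	uint16_t v = 0;

	demo_produce(7);
	return (demo_consume(&v) && v == 7) ? 0 : 1;
}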
71193+
71194+static void netif_page_release(struct page *page)
71195+{
71196+ /* Ready for next use. */
71197+ set_page_count(page, 1);
71198+
71199+ netif_idx_release(page->index);
71200+}
71201+
71202+irqreturn_t netif_be_int(int irq, void *dev_id, struct pt_regs *regs)
71203+{
71204+ netif_t *netif = dev_id;
71205+
71206+ add_to_net_schedule_list_tail(netif);
71207+ maybe_schedule_tx_action();
71208+
71209+ if (netif_schedulable(netif->dev) && !netbk_queue_full(netif))
71210+ netif_wake_queue(netif->dev);
71211+
71212+ return IRQ_HANDLED;
71213+}
71214+
71215+static void make_tx_response(netif_t *netif,
71216+ netif_tx_request_t *txp,
71217+ s8 st)
71218+{
71219+ RING_IDX i = netif->tx.rsp_prod_pvt;
71220+ netif_tx_response_t *resp;
71221+ int notify;
71222+
71223+ resp = RING_GET_RESPONSE(&netif->tx, i);
71224+ resp->id = txp->id;
71225+ resp->status = st;
71226+
71227+ if (txp->flags & NETTXF_extra_info)
71228+ RING_GET_RESPONSE(&netif->tx, ++i)->status = NETIF_RSP_NULL;
71229+
71230+ netif->tx.rsp_prod_pvt = ++i;
71231+ RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&netif->tx, notify);
71232+ if (notify)
71233+ notify_remote_via_irq(netif->irq);
71234+
71235+#ifdef CONFIG_XEN_NETDEV_PIPELINED_TRANSMITTER
71236+ if (i == netif->tx.req_cons) {
71237+ int more_to_do;
71238+ RING_FINAL_CHECK_FOR_REQUESTS(&netif->tx, more_to_do);
71239+ if (more_to_do)
71240+ add_to_net_schedule_list_tail(netif);
71241+ }
71242+#endif
71243+}
71244+
71245+static netif_rx_response_t *make_rx_response(netif_t *netif,
71246+ u16 id,
71247+ s8 st,
71248+ u16 offset,
71249+ u16 size,
71250+ u16 flags)
71251+{
71252+ RING_IDX i = netif->rx.rsp_prod_pvt;
71253+ netif_rx_response_t *resp;
71254+
71255+ resp = RING_GET_RESPONSE(&netif->rx, i);
71256+ resp->offset = offset;
71257+ resp->flags = flags;
71258+ resp->id = id;
71259+ resp->status = (s16)size;
71260+ if (st < 0)
71261+ resp->status = (s16)st;
71262+
71263+ netif->rx.rsp_prod_pvt = ++i;
71264+
71265+ return resp;
71266+}
71267+
71268+#ifdef NETBE_DEBUG_INTERRUPT
71269+static irqreturn_t netif_be_dbg(int irq, void *dev_id, struct pt_regs *regs)
71270+{
71271+ struct list_head *ent;
71272+ netif_t *netif;
71273+ int i = 0;
71274+
71275+ printk(KERN_ALERT "netif_schedule_list:\n");
71276+ spin_lock_irq(&net_schedule_list_lock);
71277+
71278+ list_for_each (ent, &net_schedule_list) {
71279+ netif = list_entry(ent, netif_t, list);
71280+ printk(KERN_ALERT " %d: private(rx_req_cons=%08x "
71281+ "rx_resp_prod=%08x\n",
71282+ i, netif->rx.req_cons, netif->rx.rsp_prod_pvt);
71283+ printk(KERN_ALERT " tx_req_cons=%08x tx_resp_prod=%08x)\n",
71284+ netif->tx.req_cons, netif->tx.rsp_prod_pvt);
71285+ printk(KERN_ALERT " shared(rx_req_prod=%08x "
71286+ "rx_resp_prod=%08x\n",
71287+ netif->rx.sring->req_prod, netif->rx.sring->rsp_prod);
71288+ printk(KERN_ALERT " rx_event=%08x tx_req_prod=%08x\n",
71289+ netif->rx.sring->rsp_event, netif->tx.sring->req_prod);
71290+ printk(KERN_ALERT " tx_resp_prod=%08x, tx_event=%08x)\n",
71291+ netif->tx.sring->rsp_prod, netif->tx.sring->rsp_event);
71292+ i++;
71293+ }
71294+
71295+ spin_unlock_irq(&net_schedule_list_lock);
71296+ printk(KERN_ALERT " ** End of netif_schedule_list **\n");
71297+
71298+ return IRQ_HANDLED;
71299+}
71300+#endif
71301+
71302+static int __init netback_init(void)
71303+{
71304+ int i;
71305+ struct page *page;
71306+
71307+ if (!is_running_on_xen())
71308+ return -ENODEV;
71309+
71310+ /* We can increase reservation by this much in net_rx_action(). */
71311+ balloon_update_driver_allowance(NET_RX_RING_SIZE);
71312+
71313+ skb_queue_head_init(&rx_queue);
71314+ skb_queue_head_init(&tx_queue);
71315+
71316+ init_timer(&net_timer);
71317+ net_timer.data = 0;
71318+ net_timer.function = net_alarm;
71319+
71320+ mmap_pages = alloc_empty_pages_and_pagevec(MAX_PENDING_REQS);
71321+ if (mmap_pages == NULL) {
71322+ printk("%s: out of memory\n", __FUNCTION__);
71323+ return -ENOMEM;
71324+ }
71325+
71326+ for (i = 0; i < MAX_PENDING_REQS; i++) {
71327+ page = mmap_pages[i];
71328+ SetPageForeign(page, netif_page_release);
71329+ page->index = i;
71330+ }
71331+
71332+ pending_cons = 0;
71333+ pending_prod = MAX_PENDING_REQS;
71334+ for (i = 0; i < MAX_PENDING_REQS; i++)
71335+ pending_ring[i] = i;
71336+
71337+ spin_lock_init(&net_schedule_list_lock);
71338+ INIT_LIST_HEAD(&net_schedule_list);
71339+
71340+ netif_xenbus_init();
71341+
71342+#ifdef NETBE_DEBUG_INTERRUPT
71343+ (void)bind_virq_to_irqhandler(VIRQ_DEBUG,
71344+ 0,
71345+ netif_be_dbg,
71346+ SA_SHIRQ,
71347+ "net-be-dbg",
71348+ &netif_be_dbg);
71349+#endif
71350+
71351+ return 0;
71352+}
71353+
71354+module_init(netback_init);
71355+
71356+MODULE_LICENSE("Dual BSD/GPL");
71357diff -Nur linux-2.6.16.33-noxen/drivers/xen/netback/xenbus.c linux-2.6.16.33/drivers/xen/netback/xenbus.c
71358--- linux-2.6.16.33-noxen/drivers/xen/netback/xenbus.c 1970-01-01 00:00:00.000000000 +0000
71359+++ linux-2.6.16.33/drivers/xen/netback/xenbus.c 2007-01-08 15:00:45.000000000 +0000
71360@@ -0,0 +1,450 @@
71361+/* Xenbus code for netif backend
71362+ Copyright (C) 2005 Rusty Russell <rusty@rustcorp.com.au>
71363+ Copyright (C) 2005 XenSource Ltd
71364+
71365+ This program is free software; you can redistribute it and/or modify
71366+ it under the terms of the GNU General Public License as published by
71367+ the Free Software Foundation; either version 2 of the License, or
71368+ (at your option) any later version.
71369+
71370+ This program is distributed in the hope that it will be useful,
71371+ but WITHOUT ANY WARRANTY; without even the implied warranty of
71372+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
71373+ GNU General Public License for more details.
71374+
71375+ You should have received a copy of the GNU General Public License
71376+ along with this program; if not, write to the Free Software
71377+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
71378+*/
71379+
71380+#include <stdarg.h>
71381+#include <linux/module.h>
71382+#include <xen/xenbus.h>
71383+#include "common.h"
71384+
71385+#if 0
71386+#undef DPRINTK
71387+#define DPRINTK(fmt, args...) \
71388+ printk("netback/xenbus (%s:%d) " fmt ".\n", __FUNCTION__, __LINE__, ##args)
71389+#endif
71390+
71391+struct backend_info {
71392+ struct xenbus_device *dev;
71393+ netif_t *netif;
71394+ enum xenbus_state frontend_state;
71395+};
71396+
71397+static int connect_rings(struct backend_info *);
71398+static void connect(struct backend_info *);
71399+static void backend_create_netif(struct backend_info *be);
71400+
71401+static int netback_remove(struct xenbus_device *dev)
71402+{
71403+ struct backend_info *be = dev->dev.driver_data;
71404+
71405+ if (be->netif) {
71406+ netif_disconnect(be->netif);
71407+ be->netif = NULL;
71408+ }
71409+ kfree(be);
71410+ dev->dev.driver_data = NULL;
71411+ return 0;
71412+}
71413+
71414+
71415+/**
71416+ * Entry point to this code when a new device is created. Allocate the basic
71417+ * structures and switch to InitWait.
71418+ */
71419+static int netback_probe(struct xenbus_device *dev,
71420+ const struct xenbus_device_id *id)
71421+{
71422+ const char *message;
71423+ struct xenbus_transaction xbt;
71424+ int err;
71425+ struct backend_info *be = kzalloc(sizeof(struct backend_info),
71426+ GFP_KERNEL);
71427+ if (!be) {
71428+ xenbus_dev_fatal(dev, -ENOMEM,
71429+ "allocating backend structure");
71430+ return -ENOMEM;
71431+ }
71432+
71433+ be->dev = dev;
71434+ dev->dev.driver_data = be;
71435+
71436+ do {
71437+ err = xenbus_transaction_start(&xbt);
71438+ if (err) {
71439+ xenbus_dev_fatal(dev, err, "starting transaction");
71440+ goto fail;
71441+ }
71442+
71443+ err = xenbus_printf(xbt, dev->nodename, "feature-sg", "%d", 1);
71444+ if (err) {
71445+ message = "writing feature-sg";
71446+ goto abort_transaction;
71447+ }
71448+
71449+ err = xenbus_printf(xbt, dev->nodename, "feature-gso-tcpv4",
71450+ "%d", 1);
71451+ if (err) {
71452+ message = "writing feature-gso-tcpv4";
71453+ goto abort_transaction;
71454+ }
71455+
71456+ /* We support rx-copy path. */
71457+ err = xenbus_printf(xbt, dev->nodename,
71458+ "feature-rx-copy", "%d", 1);
71459+ if (err) {
71460+ message = "writing feature-rx-copy";
71461+ goto abort_transaction;
71462+ }
71463+
71464+ /*
71465+ * We don't support rx-flip path (except old guests who don't
71466+ * grok this feature flag).
71467+ */
71468+ err = xenbus_printf(xbt, dev->nodename,
71469+ "feature-rx-flip", "%d", 0);
71470+ if (err) {
71471+ message = "writing feature-rx-flip";
71472+ goto abort_transaction;
71473+ }
71474+
71475+ err = xenbus_transaction_end(xbt, 0);
71476+ } while (err == -EAGAIN);
71477+
71478+ if (err) {
71479+ xenbus_dev_fatal(dev, err, "completing transaction");
71480+ goto fail;
71481+ }
71482+
71483+ err = xenbus_switch_state(dev, XenbusStateInitWait);
71484+ if (err)
71485+ goto fail;
71486+
71487+ /* This kicks hotplug scripts, so do it immediately. */
71488+ backend_create_netif(be);
71489+
71490+ return 0;
71491+
71492+abort_transaction:
71493+ xenbus_transaction_end(xbt, 1);
71494+ xenbus_dev_fatal(dev, err, "%s", message);
71495+fail:
71496+ DPRINTK("failed");
71497+ netback_remove(dev);
71498+ return err;
71499+}
71500+
71501+
71502+/**
71503+ * Handle the creation of the hotplug script environment. We add the script
71504+ * and vif variables to the environment, for the benefit of the vif-* hotplug
71505+ * scripts.
71506+ */
71507+static int netback_uevent(struct xenbus_device *xdev, char **envp,
71508+ int num_envp, char *buffer, int buffer_size)
71509+{
71510+ struct backend_info *be = xdev->dev.driver_data;
71511+ netif_t *netif = be->netif;
71512+ int i = 0, length = 0;
71513+ char *val;
71514+
71515+ DPRINTK("netback_uevent");
71516+
71517+ val = xenbus_read(XBT_NIL, xdev->nodename, "script", NULL);
71518+ if (IS_ERR(val)) {
71519+ int err = PTR_ERR(val);
71520+ xenbus_dev_fatal(xdev, err, "reading script");
71521+ return err;
71522+ }
71523+ else {
71524+ add_uevent_var(envp, num_envp, &i, buffer, buffer_size,
71525+ &length, "script=%s", val);
71526+ kfree(val);
71527+ }
71528+
71529+ add_uevent_var(envp, num_envp, &i, buffer, buffer_size, &length,
71530+ "vif=%s", netif->dev->name);
71531+
71532+ envp[i] = NULL;
71533+
71534+ return 0;
71535+}
71536+
71537+
71538+static void backend_create_netif(struct backend_info *be)
71539+{
71540+ int err;
71541+ long handle;
71542+ struct xenbus_device *dev = be->dev;
71543+
71544+ if (be->netif != NULL)
71545+ return;
71546+
71547+ err = xenbus_scanf(XBT_NIL, dev->nodename, "handle", "%li", &handle);
71548+ if (err != 1) {
71549+ xenbus_dev_fatal(dev, err, "reading handle");
71550+ return;
71551+ }
71552+
71553+ be->netif = netif_alloc(dev->otherend_id, handle);
71554+ if (IS_ERR(be->netif)) {
71555+ err = PTR_ERR(be->netif);
71556+ be->netif = NULL;
71557+ xenbus_dev_fatal(dev, err, "creating interface");
71558+ return;
71559+ }
71560+
71561+ kobject_uevent(&dev->dev.kobj, KOBJ_ONLINE);
71562+}
71563+
71564+
71565+/**
71566+ * Callback received when the frontend's state changes.
71567+ */
71568+static void frontend_changed(struct xenbus_device *dev,
71569+ enum xenbus_state frontend_state)
71570+{
71571+ struct backend_info *be = dev->dev.driver_data;
71572+
71573+ DPRINTK("%s", xenbus_strstate(frontend_state));
71574+
71575+ be->frontend_state = frontend_state;
71576+
71577+ switch (frontend_state) {
71578+ case XenbusStateInitialising:
71579+ if (dev->state == XenbusStateClosed) {
71580+ printk("%s: %s: prepare for reconnect\n",
71581+ __FUNCTION__, dev->nodename);
71582+ if (be->netif) {
71583+ netif_disconnect(be->netif);
71584+ be->netif = NULL;
71585+ }
71586+ xenbus_switch_state(dev, XenbusStateInitWait);
71587+ }
71588+ break;
71589+
71590+ case XenbusStateInitialised:
71591+ break;
71592+
71593+ case XenbusStateConnected:
71594+ backend_create_netif(be);
71595+ if (be->netif)
71596+ connect(be);
71597+ break;
71598+
71599+ case XenbusStateClosing:
71600+ xenbus_switch_state(dev, XenbusStateClosing);
71601+ break;
71602+
71603+ case XenbusStateClosed:
71604+ xenbus_switch_state(dev, XenbusStateClosed);
71605+ if (xenbus_dev_is_online(dev))
71606+ break;
71607+ /* fall through if not online */
71608+ case XenbusStateUnknown:
71609+ if (be->netif != NULL)
71610+ kobject_uevent(&dev->dev.kobj, KOBJ_OFFLINE);
71611+ device_unregister(&dev->dev);
71612+ break;
71613+
71614+ default:
71615+ xenbus_dev_fatal(dev, -EINVAL, "saw state %d at frontend",
71616+ frontend_state);
71617+ break;
71618+ }
71619+}
71620+
71621+
71622+static void xen_net_read_rate(struct xenbus_device *dev,
71623+ unsigned long *bytes, unsigned long *usec)
71624+{
71625+ char *s, *e;
71626+ unsigned long b, u;
71627+ char *ratestr;
71628+
71629+ /* Default to unlimited bandwidth. */
71630+ *bytes = ~0UL;
71631+ *usec = 0;
71632+
71633+ ratestr = xenbus_read(XBT_NIL, dev->nodename, "rate", NULL);
71634+ if (IS_ERR(ratestr))
71635+ return;
71636+
71637+ s = ratestr;
71638+ b = simple_strtoul(s, &e, 10);
71639+ if ((s == e) || (*e != ','))
71640+ goto fail;
71641+
71642+ s = e + 1;
71643+ u = simple_strtoul(s, &e, 10);
71644+ if ((s == e) || (*e != '\0'))
71645+ goto fail;
71646+
71647+ *bytes = b;
71648+ *usec = u;
71649+
71650+ kfree(ratestr);
71651+ return;
71652+
71653+ fail:
71654+ WPRINTK("Failed to parse network rate limit. Traffic unlimited.\n");
71655+ kfree(ratestr);
71656+}
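
xen_net_read_rate() parses the xenstore "rate" node as "<bytes>,<usec>". A small user-space sketch of the same parse, using strtoul instead of the kernel's simple_strtoul (the name parse_rate is made up), is shown below.

/* User-space sketch of parsing a "<bytes>,<usec>" rate string.
 * parse_rate is an illustrative name, not part of the patch. */
#include <assert.h>
#include <stdlib.h>

static int parse_rate(const char *s, unsigned long *bytes, unsigned long *usec)
{
	char *e;
	unsigned long b, u;

	b = strtoul(s, &e, 10);
	if (e == s || *e != ',')
		return -1;

	s = e + 1;
	u = strtoul(s, &e, 10);
	if (e == s || *e != '\0')
		return -1;

	*bytes = b;
	*usec = u;
	return 0;
}

int main(void)
{
	unsigned long bytes, usec;

	assert(parse_rate("10000000,50000", &bytes, &usec) == 0);
	assert(bytes == 10000000 && usec == 50000);
	assert(parse_rate("10000000", &bytes, &usec) != 0);	/* missing ",usec" part */
	return 0;
}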
71657+
71658+static int xen_net_read_mac(struct xenbus_device *dev, u8 mac[])
71659+{
71660+ char *s, *e, *macstr;
71661+ int i;
71662+
71663+ macstr = s = xenbus_read(XBT_NIL, dev->nodename, "mac", NULL);
71664+ if (IS_ERR(macstr))
71665+ return PTR_ERR(macstr);
71666+
71667+ for (i = 0; i < ETH_ALEN; i++) {
71668+ mac[i] = simple_strtoul(s, &e, 16);
71669+ if ((s == e) || (*e != ((i == ETH_ALEN-1) ? '\0' : ':'))) {
71670+ kfree(macstr);
71671+ return -ENOENT;
71672+ }
71673+ s = e+1;
71674+ }
71675+
71676+ kfree(macstr);
71677+ return 0;
71678+}
71679+
71680+static void connect(struct backend_info *be)
71681+{
71682+ int err;
71683+ struct xenbus_device *dev = be->dev;
71684+
71685+ err = connect_rings(be);
71686+ if (err)
71687+ return;
71688+
71689+ err = xen_net_read_mac(dev, be->netif->fe_dev_addr);
71690+ if (err) {
71691+ xenbus_dev_fatal(dev, err, "parsing %s/mac", dev->nodename);
71692+ return;
71693+ }
71694+
71695+ xen_net_read_rate(dev, &be->netif->credit_bytes,
71696+ &be->netif->credit_usec);
71697+ be->netif->remaining_credit = be->netif->credit_bytes;
71698+
71699+ xenbus_switch_state(dev, XenbusStateConnected);
71700+
71701+ /* May not get a kick from the frontend, so start the tx_queue now. */
71702+ if (!netbk_can_queue(be->netif->dev))
71703+ netif_wake_queue(be->netif->dev);
71704+}
71705+
71706+
71707+static int connect_rings(struct backend_info *be)
71708+{
71709+ struct xenbus_device *dev = be->dev;
71710+ unsigned long tx_ring_ref, rx_ring_ref;
71711+ unsigned int evtchn, rx_copy;
71712+ int err;
71713+ int val;
71714+
71715+ DPRINTK("");
71716+
71717+ err = xenbus_gather(XBT_NIL, dev->otherend,
71718+ "tx-ring-ref", "%lu", &tx_ring_ref,
71719+ "rx-ring-ref", "%lu", &rx_ring_ref,
71720+ "event-channel", "%u", &evtchn, NULL);
71721+ if (err) {
71722+ xenbus_dev_fatal(dev, err,
71723+ "reading %s/ring-ref and event-channel",
71724+ dev->otherend);
71725+ return err;
71726+ }
71727+
71728+ err = xenbus_scanf(XBT_NIL, dev->otherend, "request-rx-copy", "%u",
71729+ &rx_copy);
71730+ if (err == -ENOENT) {
71731+ err = 0;
71732+ rx_copy = 0;
71733+ }
71734+ if (err < 0) {
71735+ xenbus_dev_fatal(dev, err, "reading %s/request-rx-copy",
71736+ dev->otherend);
71737+ return err;
71738+ }
71739+ be->netif->copying_receiver = !!rx_copy;
71740+
71741+ if (be->netif->dev->tx_queue_len != 0) {
71742+ if (xenbus_scanf(XBT_NIL, dev->otherend,
71743+ "feature-rx-notify", "%d", &val) < 0)
71744+ val = 0;
71745+ if (val)
71746+ be->netif->can_queue = 1;
71747+ else
71748+ /* Must be non-zero for pfifo_fast to work. */
71749+ be->netif->dev->tx_queue_len = 1;
71750+ }
71751+
71752+ if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-sg", "%d", &val) < 0)
71753+ val = 0;
71754+ if (val) {
71755+ be->netif->features |= NETIF_F_SG;
71756+ be->netif->dev->features |= NETIF_F_SG;
71757+ }
71758+
71759+ if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-gso-tcpv4", "%d",
71760+ &val) < 0)
71761+ val = 0;
71762+ if (val) {
71763+ be->netif->features |= NETIF_F_TSO;
71764+ be->netif->dev->features |= NETIF_F_TSO;
71765+ }
71766+
71767+ if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-no-csum-offload",
71768+ "%d", &val) < 0)
71769+ val = 0;
71770+ if (val) {
71771+ be->netif->features &= ~NETIF_F_IP_CSUM;
71772+ be->netif->dev->features &= ~NETIF_F_IP_CSUM;
71773+ }
71774+
71775+ /* Map the shared frame, irq etc. */
71776+ err = netif_map(be->netif, tx_ring_ref, rx_ring_ref, evtchn);
71777+ if (err) {
71778+ xenbus_dev_fatal(dev, err,
71779+ "mapping shared-frames %lu/%lu port %u",
71780+ tx_ring_ref, rx_ring_ref, evtchn);
71781+ return err;
71782+ }
71783+ return 0;
71784+}
71785+
71786+
71787+/* ** Driver Registration ** */
71788+
71789+
71790+static struct xenbus_device_id netback_ids[] = {
71791+ { "vif" },
71792+ { "" }
71793+};
71794+
71795+
71796+static struct xenbus_driver netback = {
71797+ .name = "vif",
71798+ .owner = THIS_MODULE,
71799+ .ids = netback_ids,
71800+ .probe = netback_probe,
71801+ .remove = netback_remove,
71802+ .uevent = netback_uevent,
71803+ .otherend_changed = frontend_changed,
71804+};
71805+
71806+
71807+void netif_xenbus_init(void)
71808+{
71809+ xenbus_register_backend(&netback);
71810+}
71811diff -Nur linux-2.6.16.33-noxen/drivers/xen/netfront/Makefile linux-2.6.16.33/drivers/xen/netfront/Makefile
71812--- linux-2.6.16.33-noxen/drivers/xen/netfront/Makefile 1970-01-01 00:00:00.000000000 +0000
71813+++ linux-2.6.16.33/drivers/xen/netfront/Makefile 2007-01-08 15:00:45.000000000 +0000
71814@@ -0,0 +1,4 @@
71815+
71816+obj-$(CONFIG_XEN_NETDEV_FRONTEND) := xennet.o
71817+
71818+xennet-objs := netfront.o
71819diff -Nur linux-2.6.16.33-noxen/drivers/xen/netfront/netfront.c linux-2.6.16.33/drivers/xen/netfront/netfront.c
71820--- linux-2.6.16.33-noxen/drivers/xen/netfront/netfront.c 1970-01-01 00:00:00.000000000 +0000
71821+++ linux-2.6.16.33/drivers/xen/netfront/netfront.c 2007-01-08 15:00:45.000000000 +0000
71822@@ -0,0 +1,2114 @@
71823+/******************************************************************************
71824+ * Virtual network driver for conversing with remote driver backends.
71825+ *
71826+ * Copyright (c) 2002-2005, K A Fraser
71827+ * Copyright (c) 2005, XenSource Ltd
71828+ *
71829+ * This program is free software; you can redistribute it and/or
71830+ * modify it under the terms of the GNU General Public License version 2
71831+ * as published by the Free Software Foundation; or, when distributed
71832+ * separately from the Linux kernel or incorporated into other
71833+ * software packages, subject to the following license:
71834+ *
71835+ * Permission is hereby granted, free of charge, to any person obtaining a copy
71836+ * of this source file (the "Software"), to deal in the Software without
71837+ * restriction, including without limitation the rights to use, copy, modify,
71838+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
71839+ * and to permit persons to whom the Software is furnished to do so, subject to
71840+ * the following conditions:
71841+ *
71842+ * The above copyright notice and this permission notice shall be included in
71843+ * all copies or substantial portions of the Software.
71844+ *
71845+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
71846+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
71847+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
71848+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
71849+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
71850+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
71851+ * IN THE SOFTWARE.
71852+ */
71853+
71854+#include <linux/config.h>
71855+#include <linux/module.h>
71856+#include <linux/version.h>
71857+#include <linux/kernel.h>
71858+#include <linux/sched.h>
71859+#include <linux/slab.h>
71860+#include <linux/string.h>
71861+#include <linux/errno.h>
71862+#include <linux/netdevice.h>
71863+#include <linux/inetdevice.h>
71864+#include <linux/etherdevice.h>
71865+#include <linux/skbuff.h>
71866+#include <linux/init.h>
71867+#include <linux/bitops.h>
71868+#include <linux/ethtool.h>
71869+#include <linux/in.h>
71870+#include <linux/if_ether.h>
71871+#include <linux/io.h>
71872+#include <linux/moduleparam.h>
71873+#include <net/sock.h>
71874+#include <net/pkt_sched.h>
71875+#include <net/arp.h>
71876+#include <net/route.h>
71877+#include <asm/uaccess.h>
71878+#include <xen/evtchn.h>
71879+#include <xen/xenbus.h>
71880+#include <xen/interface/io/netif.h>
71881+#include <xen/interface/memory.h>
71882+#include <xen/balloon.h>
71883+#include <asm/page.h>
71884+#include <asm/maddr.h>
71885+#include <asm/uaccess.h>
71886+#include <xen/interface/grant_table.h>
71887+#include <xen/gnttab.h>
71888+
71889+#ifdef HAVE_XEN_PLATFORM_COMPAT_H
71890+#include <xen/platform-compat.h>
71891+#endif
71892+
71893+/*
71894+ * Mutually-exclusive module options to select receive data path:
71895+ * rx_copy : Packets are copied by network backend into local memory
71896+ * rx_flip : Page containing packet data is transferred to our ownership
71897+ * For fully-virtualised guests there is no option - copying must be used.
71898+ * For paravirtualised guests, flipping is the default.
71899+ */
71900+#ifdef CONFIG_XEN
71901+static int MODPARM_rx_copy = 0;
71902+module_param_named(rx_copy, MODPARM_rx_copy, bool, 0);
71903+MODULE_PARM_DESC(rx_copy, "Copy packets from network card (rather than flip)");
71904+static int MODPARM_rx_flip = 0;
71905+module_param_named(rx_flip, MODPARM_rx_flip, bool, 0);
71906+MODULE_PARM_DESC(rx_flip, "Flip packets from network card (rather than copy)");
71907+#else
71908+static const int MODPARM_rx_copy = 1;
71909+static const int MODPARM_rx_flip = 0;
71910+#endif
71911+
71912+#define RX_COPY_THRESHOLD 256
71913+
71914+/* If we don't have GSO, fake things up so that we never try to use it. */
71915+#if defined(NETIF_F_GSO)
71916+#define HAVE_GSO 1
71917+#define HAVE_TSO 1 /* TSO is a subset of GSO */
71918+static inline void dev_disable_gso_features(struct net_device *dev)
71919+{
71920+ /* Turn off all GSO bits except ROBUST. */
71921+ dev->features &= (1 << NETIF_F_GSO_SHIFT) - 1;
71922+ dev->features |= NETIF_F_GSO_ROBUST;
71923+}
71924+#elif defined(NETIF_F_TSO)
71925+#define HAVE_TSO 1
71926+
71927+/* Some older kernels cannot cope with incorrect checksums,
71928+ * particularly in netfilter. I'm not sure there is 100% correlation
71929+ * with the presence of NETIF_F_TSO but it appears to be a good first
71930+ * approximation.
71931+ */
71932+#define HAVE_NO_CSUM_OFFLOAD 1
71933+
71934+#define gso_size tso_size
71935+#define gso_segs tso_segs
71936+static inline void dev_disable_gso_features(struct net_device *dev)
71937+{
71938+ /* Turn off all TSO bits. */
71939+ dev->features &= ~NETIF_F_TSO;
71940+}
71941+static inline int skb_is_gso(const struct sk_buff *skb)
71942+{
71943+ return skb_shinfo(skb)->tso_size;
71944+}
71945+static inline int skb_gso_ok(struct sk_buff *skb, int features)
71946+{
71947+ return (features & NETIF_F_TSO);
71948+}
71949+
71950+static inline int netif_needs_gso(struct net_device *dev, struct sk_buff *skb)
71951+{
71952+ return skb_is_gso(skb) &&
71953+ (!skb_gso_ok(skb, dev->features) ||
71954+ unlikely(skb->ip_summed != CHECKSUM_HW));
71955+}
71956+#else
71957+#define netif_needs_gso(dev, skb) 0
71958+#define dev_disable_gso_features(dev) ((void)0)
71959+#endif
71960+
71961+#define GRANT_INVALID_REF 0
71962+
71963+#define NET_TX_RING_SIZE __RING_SIZE((struct netif_tx_sring *)0, PAGE_SIZE)
71964+#define NET_RX_RING_SIZE __RING_SIZE((struct netif_rx_sring *)0, PAGE_SIZE)
71965+
71966+struct netfront_info {
71967+ struct list_head list;
71968+ struct net_device *netdev;
71969+
71970+ struct net_device_stats stats;
71971+
71972+ struct netif_tx_front_ring tx;
71973+ struct netif_rx_front_ring rx;
71974+
71975+ spinlock_t tx_lock;
71976+ spinlock_t rx_lock;
71977+
71978+ unsigned int evtchn, irq;
71979+ unsigned int copying_receiver;
71980+
71981+ /* Receive-ring batched refills. */
71982+#define RX_MIN_TARGET 8
71983+#define RX_DFL_MIN_TARGET 64
71984+#define RX_MAX_TARGET min_t(int, NET_RX_RING_SIZE, 256)
71985+ unsigned rx_min_target, rx_max_target, rx_target;
71986+ struct sk_buff_head rx_batch;
71987+
71988+ struct timer_list rx_refill_timer;
71989+
71990+ /*
71991+ * {tx,rx}_skbs store outstanding skbuffs. The first entry in tx_skbs
71992+ * is an index into a chain of free entries.
71993+ */
71994+ struct sk_buff *tx_skbs[NET_TX_RING_SIZE+1];
71995+ struct sk_buff *rx_skbs[NET_RX_RING_SIZE];
71996+
71997+#define TX_MAX_TARGET min_t(int, NET_RX_RING_SIZE, 256)
71998+ grant_ref_t gref_tx_head;
71999+ grant_ref_t grant_tx_ref[NET_TX_RING_SIZE + 1];
72000+ grant_ref_t gref_rx_head;
72001+ grant_ref_t grant_rx_ref[NET_RX_RING_SIZE];
72002+
72003+ struct xenbus_device *xbdev;
72004+ int tx_ring_ref;
72005+ int rx_ring_ref;
72006+ u8 mac[ETH_ALEN];
72007+
72008+ unsigned long rx_pfn_array[NET_RX_RING_SIZE];
72009+ struct multicall_entry rx_mcl[NET_RX_RING_SIZE+1];
72010+ struct mmu_update rx_mmu[NET_RX_RING_SIZE];
72011+};
72012+
72013+struct netfront_rx_info {
72014+ struct netif_rx_response rx;
72015+ struct netif_extra_info extras[XEN_NETIF_EXTRA_TYPE_MAX - 1];
72016+};
72017+
72018+/*
72019+ * Helpers for acquiring/freeing slots in tx_skbs[].
72020+ */
72021+
72022+static inline void add_id_to_freelist(struct sk_buff **list, unsigned short id)
72023+{
72024+ list[id] = list[0];
72025+ list[0] = (void *)(unsigned long)id;
72026+}
72027+
72028+static inline unsigned short get_id_from_freelist(struct sk_buff **list)
72029+{
72030+ unsigned int id = (unsigned int)(unsigned long)list[0];
72031+ list[0] = list[id];
72032+ return id;
72033+}
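
The comment in struct netfront_info explains that tx_skbs[0] heads a chain of free entries; add_id_to_freelist() and get_id_from_freelist() simply thread indices through the unused pointer slots. A standalone sketch of that index-chained freelist (NSLOTS, put_id and get_id are illustrative names) is given below.

/* Standalone sketch of the index-chained freelist used for tx_skbs[]:
 * slot 0 is the head, free slots hold the next free index cast to a pointer.
 * NSLOTS/put_id/get_id are illustrative names, not from the patch. */
#include <assert.h>

#define NSLOTS 8			/* usable ids are 1..NSLOTS */
static void *slots[NSLOTS + 1];		/* slot 0 reserved for the head */

static void put_id(unsigned short id)
{
	slots[id] = slots[0];
	slots[0] = (void *)(unsigned long)id;
}

static unsigned short get_id(void)
{
	unsigned short id = (unsigned short)(unsigned long)slots[0];

	slots[0] = slots[id];
	return id;
}

int main(void)
{
	unsigned short i, a;

	for (i = 1; i <= NSLOTS; i++)	/* chain every slot onto the freelist */
		put_id(i);

	a = get_id();			/* grab one id ... */
	put_id(a);			/* ... and release it again */
	assert(get_id() == a);		/* LIFO: most recently freed comes back first */
	return 0;
}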
72034+
72035+static inline int xennet_rxidx(RING_IDX idx)
72036+{
72037+ return idx & (NET_RX_RING_SIZE - 1);
72038+}
72039+
72040+static inline struct sk_buff *xennet_get_rx_skb(struct netfront_info *np,
72041+ RING_IDX ri)
72042+{
72043+ int i = xennet_rxidx(ri);
72044+ struct sk_buff *skb = np->rx_skbs[i];
72045+ np->rx_skbs[i] = NULL;
72046+ return skb;
72047+}
72048+
72049+static inline grant_ref_t xennet_get_rx_ref(struct netfront_info *np,
72050+ RING_IDX ri)
72051+{
72052+ int i = xennet_rxidx(ri);
72053+ grant_ref_t ref = np->grant_rx_ref[i];
72054+ np->grant_rx_ref[i] = GRANT_INVALID_REF;
72055+ return ref;
72056+}
72057+
72058+#define DPRINTK(fmt, args...) \
72059+ pr_debug("netfront (%s:%d) " fmt, \
72060+ __FUNCTION__, __LINE__, ##args)
72061+#define IPRINTK(fmt, args...) \
72062+ printk(KERN_INFO "netfront: " fmt, ##args)
72063+#define WPRINTK(fmt, args...) \
72064+ printk(KERN_WARNING "netfront: " fmt, ##args)
72065+
72066+static int setup_device(struct xenbus_device *, struct netfront_info *);
72067+static struct net_device *create_netdev(struct xenbus_device *);
72068+
72069+static void end_access(int, void *);
72070+static void netif_disconnect_backend(struct netfront_info *);
72071+
72072+static int network_connect(struct net_device *);
72073+static void network_tx_buf_gc(struct net_device *);
72074+static void network_alloc_rx_buffers(struct net_device *);
72075+static int send_fake_arp(struct net_device *);
72076+
72077+static irqreturn_t netif_int(int irq, void *dev_id, struct pt_regs *ptregs);
72078+
72079+#ifdef CONFIG_SYSFS
72080+static int xennet_sysfs_addif(struct net_device *netdev);
72081+static void xennet_sysfs_delif(struct net_device *netdev);
72082+#else /* !CONFIG_SYSFS */
72083+#define xennet_sysfs_addif(dev) (0)
72084+#define xennet_sysfs_delif(dev) do { } while(0)
72085+#endif
72086+
72087+static inline int xennet_can_sg(struct net_device *dev)
72088+{
72089+ return dev->features & NETIF_F_SG;
72090+}
72091+
72092+/**
72093+ * Entry point to this code when a new device is created. Allocate the basic
72094+ * structures and the ring buffers for communication with the backend, and
72095+ * inform the backend of the appropriate details for those.
72096+ */
72097+static int __devinit netfront_probe(struct xenbus_device *dev,
72098+ const struct xenbus_device_id *id)
72099+{
72100+ int err;
72101+ struct net_device *netdev;
72102+ struct netfront_info *info;
72103+
72104+ netdev = create_netdev(dev);
72105+ if (IS_ERR(netdev)) {
72106+ err = PTR_ERR(netdev);
72107+ xenbus_dev_fatal(dev, err, "creating netdev");
72108+ return err;
72109+ }
72110+
72111+ info = netdev_priv(netdev);
72112+ dev->dev.driver_data = info;
72113+
72114+ err = register_netdev(info->netdev);
72115+ if (err) {
72116+ printk(KERN_WARNING "%s: register_netdev err=%d\n",
72117+ __FUNCTION__, err);
72118+ goto fail;
72119+ }
72120+
72121+ err = xennet_sysfs_addif(info->netdev);
72122+ if (err) {
72123+ unregister_netdev(info->netdev);
72124+ printk(KERN_WARNING "%s: add sysfs failed err=%d\n",
72125+ __FUNCTION__, err);
72126+ goto fail;
72127+ }
72128+
72129+ return 0;
72130+
72131+ fail:
72132+ free_netdev(netdev);
72133+ dev->dev.driver_data = NULL;
72134+ return err;
72135+}
72136+
72137+static int __devexit netfront_remove(struct xenbus_device *dev)
72138+{
72139+ struct netfront_info *info = dev->dev.driver_data;
72140+
72141+ DPRINTK("%s\n", dev->nodename);
72142+
72143+ netif_disconnect_backend(info);
72144+
72145+ del_timer_sync(&info->rx_refill_timer);
72146+
72147+ xennet_sysfs_delif(info->netdev);
72148+
72149+ unregister_netdev(info->netdev);
72150+
72151+ free_netdev(info->netdev);
72152+
72153+ return 0;
72154+}
72155+
72156+/**
72157+ * We are reconnecting to the backend, due to a suspend/resume, or a backend
72158+ * driver restart. We tear down our netif structure and recreate it, but
72159+ * leave the device-layer structures intact so that this is transparent to the
72160+ * rest of the kernel.
72161+ */
72162+static int netfront_resume(struct xenbus_device *dev)
72163+{
72164+ struct netfront_info *info = dev->dev.driver_data;
72165+
72166+ DPRINTK("%s\n", dev->nodename);
72167+
72168+ netif_disconnect_backend(info);
72169+ return 0;
72170+}
72171+
72172+static int xen_net_read_mac(struct xenbus_device *dev, u8 mac[])
72173+{
72174+ char *s, *e, *macstr;
72175+ int i;
72176+
72177+ macstr = s = xenbus_read(XBT_NIL, dev->nodename, "mac", NULL);
72178+ if (IS_ERR(macstr))
72179+ return PTR_ERR(macstr);
72180+
72181+ for (i = 0; i < ETH_ALEN; i++) {
72182+ mac[i] = simple_strtoul(s, &e, 16);
72183+ if ((s == e) || (*e != ((i == ETH_ALEN-1) ? '\0' : ':'))) {
72184+ kfree(macstr);
72185+ return -ENOENT;
72186+ }
72187+ s = e+1;
72188+ }
72189+
72190+ kfree(macstr);
72191+ return 0;
72192+}
72193+
72194+/* Common code used when first setting up, and when resuming. */
72195+static int talk_to_backend(struct xenbus_device *dev,
72196+ struct netfront_info *info)
72197+{
72198+ const char *message;
72199+ struct xenbus_transaction xbt;
72200+ int err;
72201+
72202+ err = xen_net_read_mac(dev, info->mac);
72203+ if (err) {
72204+ xenbus_dev_fatal(dev, err, "parsing %s/mac", dev->nodename);
72205+ goto out;
72206+ }
72207+
72208+ /* Create shared ring, alloc event channel. */
72209+ err = setup_device(dev, info);
72210+ if (err)
72211+ goto out;
72212+
72213+again:
72214+ err = xenbus_transaction_start(&xbt);
72215+ if (err) {
72216+ xenbus_dev_fatal(dev, err, "starting transaction");
72217+ goto destroy_ring;
72218+ }
72219+
72220+ err = xenbus_printf(xbt, dev->nodename, "tx-ring-ref","%u",
72221+ info->tx_ring_ref);
72222+ if (err) {
72223+ message = "writing tx ring-ref";
72224+ goto abort_transaction;
72225+ }
72226+ err = xenbus_printf(xbt, dev->nodename, "rx-ring-ref","%u",
72227+ info->rx_ring_ref);
72228+ if (err) {
72229+ message = "writing rx ring-ref";
72230+ goto abort_transaction;
72231+ }
72232+ err = xenbus_printf(xbt, dev->nodename,
72233+ "event-channel", "%u", info->evtchn);
72234+ if (err) {
72235+ message = "writing event-channel";
72236+ goto abort_transaction;
72237+ }
72238+
72239+ err = xenbus_printf(xbt, dev->nodename, "request-rx-copy", "%u",
72240+ info->copying_receiver);
72241+ if (err) {
72242+ message = "writing request-rx-copy";
72243+ goto abort_transaction;
72244+ }
72245+
72246+ err = xenbus_printf(xbt, dev->nodename, "feature-rx-notify", "%d", 1);
72247+ if (err) {
72248+ message = "writing feature-rx-notify";
72249+ goto abort_transaction;
72250+ }
72251+
72252+#ifdef HAVE_NO_CSUM_OFFLOAD
72253+ err = xenbus_printf(xbt, dev->nodename, "feature-no-csum-offload", "%d", 1);
72254+ if (err) {
72255+ message = "writing feature-no-csum-offload";
72256+ goto abort_transaction;
72257+ }
72258+#endif
72259+
72260+ err = xenbus_printf(xbt, dev->nodename, "feature-sg", "%d", 1);
72261+ if (err) {
72262+ message = "writing feature-sg";
72263+ goto abort_transaction;
72264+ }
72265+
72266+#ifdef HAVE_TSO
72267+ err = xenbus_printf(xbt, dev->nodename, "feature-gso-tcpv4", "%d", 1);
72268+ if (err) {
72269+ message = "writing feature-gso-tcpv4";
72270+ goto abort_transaction;
72271+ }
72272+#endif
72273+
72274+ err = xenbus_transaction_end(xbt, 0);
72275+ if (err) {
72276+ if (err == -EAGAIN)
72277+ goto again;
72278+ xenbus_dev_fatal(dev, err, "completing transaction");
72279+ goto destroy_ring;
72280+ }
72281+
72282+ return 0;
72283+
72284+ abort_transaction:
72285+ xenbus_transaction_end(xbt, 1);
72286+ xenbus_dev_fatal(dev, err, "%s", message);
72287+ destroy_ring:
72288+ netif_disconnect_backend(info);
72289+ out:
72290+ return err;
72291+}
72292+
72293+static int setup_device(struct xenbus_device *dev, struct netfront_info *info)
72294+{
72295+ struct netif_tx_sring *txs;
72296+ struct netif_rx_sring *rxs;
72297+ int err;
72298+ struct net_device *netdev = info->netdev;
72299+
72300+ info->tx_ring_ref = GRANT_INVALID_REF;
72301+ info->rx_ring_ref = GRANT_INVALID_REF;
72302+ info->rx.sring = NULL;
72303+ info->tx.sring = NULL;
72304+ info->irq = 0;
72305+
72306+ txs = (struct netif_tx_sring *)get_zeroed_page(GFP_KERNEL);
72307+ if (!txs) {
72308+ err = -ENOMEM;
72309+ xenbus_dev_fatal(dev, err, "allocating tx ring page");
72310+ goto fail;
72311+ }
72312+ SHARED_RING_INIT(txs);
72313+ FRONT_RING_INIT(&info->tx, txs, PAGE_SIZE);
72314+
72315+ err = xenbus_grant_ring(dev, virt_to_mfn(txs));
72316+ if (err < 0) {
72317+ free_page((unsigned long)txs);
72318+ goto fail;
72319+ }
72320+ info->tx_ring_ref = err;
72321+
72322+ rxs = (struct netif_rx_sring *)get_zeroed_page(GFP_KERNEL);
72323+ if (!rxs) {
72324+ err = -ENOMEM;
72325+ xenbus_dev_fatal(dev, err, "allocating rx ring page");
72326+ goto fail;
72327+ }
72328+ SHARED_RING_INIT(rxs);
72329+ FRONT_RING_INIT(&info->rx, rxs, PAGE_SIZE);
72330+
72331+ err = xenbus_grant_ring(dev, virt_to_mfn(rxs));
72332+ if (err < 0) {
72333+ free_page((unsigned long)rxs);
72334+ goto fail;
72335+ }
72336+ info->rx_ring_ref = err;
72337+
72338+ err = xenbus_alloc_evtchn(dev, &info->evtchn);
72339+ if (err)
72340+ goto fail;
72341+
72342+ memcpy(netdev->dev_addr, info->mac, ETH_ALEN);
72343+ err = bind_evtchn_to_irqhandler(info->evtchn, netif_int,
72344+ SA_SAMPLE_RANDOM, netdev->name,
72345+ netdev);
72346+ if (err < 0)
72347+ goto fail;
72348+ info->irq = err;
72349+ return 0;
72350+
72351+ fail:
72352+ return err;
72353+}
72354+
72355+/**
72356+ * Callback received when the backend's state changes.
72357+ */
72358+static void backend_changed(struct xenbus_device *dev,
72359+ enum xenbus_state backend_state)
72360+{
72361+ struct netfront_info *np = dev->dev.driver_data;
72362+ struct net_device *netdev = np->netdev;
72363+
72364+ DPRINTK("%s\n", xenbus_strstate(backend_state));
72365+
72366+ switch (backend_state) {
72367+ case XenbusStateInitialising:
72368+ case XenbusStateInitialised:
72369+ case XenbusStateConnected:
72370+ case XenbusStateUnknown:
72371+ case XenbusStateClosed:
72372+ break;
72373+
72374+ case XenbusStateInitWait:
72375+ if (dev->state != XenbusStateInitialising)
72376+ break;
72377+ if (network_connect(netdev) != 0)
72378+ break;
72379+ xenbus_switch_state(dev, XenbusStateConnected);
72380+ (void)send_fake_arp(netdev);
72381+ break;
72382+
72383+ case XenbusStateClosing:
72384+ xenbus_frontend_closed(dev);
72385+ break;
72386+ }
72387+}
72388+
72389+/** Send a packet on a net device to encourage switches to learn the
72390+ * MAC. We send a fake ARP request.
72391+ *
72392+ * @param dev device
72393+ * @return 0 on success, error code otherwise
72394+ */
72395+static int send_fake_arp(struct net_device *dev)
72396+{
72397+ struct sk_buff *skb;
72398+ u32 src_ip, dst_ip;
72399+
72400+ dst_ip = INADDR_BROADCAST;
72401+ src_ip = inet_select_addr(dev, dst_ip, RT_SCOPE_LINK);
72402+
72403+ /* No IP? Then nothing to do. */
72404+ if (src_ip == 0)
72405+ return 0;
72406+
72407+ skb = arp_create(ARPOP_REPLY, ETH_P_ARP,
72408+ dst_ip, dev, src_ip,
72409+ /*dst_hw*/ NULL, /*src_hw*/ NULL,
72410+ /*target_hw*/ dev->dev_addr);
72411+ if (skb == NULL)
72412+ return -ENOMEM;
72413+
72414+ return dev_queue_xmit(skb);
72415+}
72416+
72417+static int network_open(struct net_device *dev)
72418+{
72419+ struct netfront_info *np = netdev_priv(dev);
72420+
72421+ memset(&np->stats, 0, sizeof(np->stats));
72422+
72423+ spin_lock(&np->rx_lock);
72424+ if (netif_carrier_ok(dev)) {
72425+ network_alloc_rx_buffers(dev);
72426+ np->rx.sring->rsp_event = np->rx.rsp_cons + 1;
72427+ if (RING_HAS_UNCONSUMED_RESPONSES(&np->rx))
72428+ netif_rx_schedule(dev);
72429+ }
72430+ spin_unlock(&np->rx_lock);
72431+
72432+ netif_start_queue(dev);
72433+
72434+ return 0;
72435+}
72436+
72437+static inline int netfront_tx_slot_available(struct netfront_info *np)
72438+{
72439+ return RING_FREE_REQUESTS(&np->tx) >= MAX_SKB_FRAGS + 2;
72440+}
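/*
 * Annotation (not part of the original patch): the "+ 2" headroom matches
 * the worst case in network_start_xmit() below -- a packet may consume up
 * to MAX_SKB_FRAGS + 1 data slots (see the "skb rides the rocket" check)
 * plus one extra-info slot for GSO metadata, so the queue is only woken
 * when a full worst-case packet is guaranteed to fit.
 */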
72441+
72442+static inline void network_maybe_wake_tx(struct net_device *dev)
72443+{
72444+ struct netfront_info *np = netdev_priv(dev);
72445+
72446+ if (unlikely(netif_queue_stopped(dev)) &&
72447+ netfront_tx_slot_available(np) &&
72448+ likely(netif_running(dev)))
72449+ netif_wake_queue(dev);
72450+}
72451+
72452+static void network_tx_buf_gc(struct net_device *dev)
72453+{
72454+ RING_IDX cons, prod;
72455+ unsigned short id;
72456+ struct netfront_info *np = netdev_priv(dev);
72457+ struct sk_buff *skb;
72458+
72459+ BUG_ON(!netif_carrier_ok(dev));
72460+
72461+ do {
72462+ prod = np->tx.sring->rsp_prod;
72463+ rmb(); /* Ensure we see responses up to 'rp'. */
72464+
72465+ for (cons = np->tx.rsp_cons; cons != prod; cons++) {
72466+ struct netif_tx_response *txrsp;
72467+
72468+ txrsp = RING_GET_RESPONSE(&np->tx, cons);
72469+ if (txrsp->status == NETIF_RSP_NULL)
72470+ continue;
72471+
72472+ id = txrsp->id;
72473+ skb = np->tx_skbs[id];
72474+ if (unlikely(gnttab_query_foreign_access(
72475+ np->grant_tx_ref[id]) != 0)) {
72476+ printk(KERN_ALERT "network_tx_buf_gc: warning "
72477+ "-- grant still in use by backend "
72478+ "domain.\n");
72479+ BUG();
72480+ }
72481+ gnttab_end_foreign_access_ref(
72482+ np->grant_tx_ref[id], GNTMAP_readonly);
72483+ gnttab_release_grant_reference(
72484+ &np->gref_tx_head, np->grant_tx_ref[id]);
72485+ np->grant_tx_ref[id] = GRANT_INVALID_REF;
72486+ add_id_to_freelist(np->tx_skbs, id);
72487+ dev_kfree_skb_irq(skb);
72488+ }
72489+
72490+ np->tx.rsp_cons = prod;
72491+
72492+ /*
72493+ * Set a new event, then check for race with update of tx_cons.
72494+ * Note that it is essential to schedule a callback, no matter
72495+ * how few buffers are pending. Even if there is space in the
72496+ * transmit ring, higher layers may be blocked because too much
72497+ * data is outstanding: in such cases notification from Xen is
72498+ * likely to be the only kick that we'll get.
72499+ */
72500+ np->tx.sring->rsp_event =
72501+ prod + ((np->tx.sring->req_prod - prod) >> 1) + 1;
72502+ mb();
72503+ } while ((cons == prod) && (prod != np->tx.sring->rsp_prod));
72504+
72505+ network_maybe_wake_tx(dev);
72506+}
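/*
 * Worked example of the rsp_event arithmetic above (annotation, not part
 * of the original patch): with rsp_prod == 10 and req_prod == 18 there are
 * eight requests outstanding, so rsp_event becomes 10 + (8 >> 1) + 1 = 15.
 * The backend therefore raises the next event once it has produced the
 * response with index 15, i.e. roughly half-way through the outstanding
 * work, which bounds both the interrupt rate and the garbage-collection
 * latency.
 */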
72507+
72508+static void rx_refill_timeout(unsigned long data)
72509+{
72510+ struct net_device *dev = (struct net_device *)data;
72511+ netif_rx_schedule(dev);
72512+}
72513+
72514+static void network_alloc_rx_buffers(struct net_device *dev)
72515+{
72516+ unsigned short id;
72517+ struct netfront_info *np = netdev_priv(dev);
72518+ struct sk_buff *skb;
72519+ struct page *page;
72520+ int i, batch_target, notify;
72521+ RING_IDX req_prod = np->rx.req_prod_pvt;
72522+ struct xen_memory_reservation reservation;
72523+ grant_ref_t ref;
72524+ unsigned long pfn;
72525+ void *vaddr;
72526+ int nr_flips;
72527+ netif_rx_request_t *req;
72528+
72529+ if (unlikely(!netif_carrier_ok(dev)))
72530+ return;
72531+
72532+ /*
72533+ * Allocate skbuffs greedily, even though we batch updates to the
72534+ * receive ring. This creates a less bursty demand on the memory
72535+ * allocator, so should reduce the chance of failed allocation requests
72536+	 * both for ourselves and for other kernel subsystems.
72537+ */
72538+ batch_target = np->rx_target - (req_prod - np->rx.rsp_cons);
72539+ for (i = skb_queue_len(&np->rx_batch); i < batch_target; i++) {
72540+ /*
72541+ * Allocate an skb and a page. Do not use __dev_alloc_skb as
72542+ * that will allocate page-sized buffers which is not
72543+ * necessary here.
72544+ * 16 bytes added as necessary headroom for netif_receive_skb.
72545+ */
72546+ skb = alloc_skb(RX_COPY_THRESHOLD + 16 + NET_IP_ALIGN,
72547+ GFP_ATOMIC | __GFP_NOWARN);
72548+ if (unlikely(!skb))
72549+ goto no_skb;
72550+
72551+ page = alloc_page(GFP_ATOMIC | __GFP_NOWARN);
72552+ if (!page) {
72553+ kfree_skb(skb);
72554+no_skb:
72555+ /* Any skbuffs queued for refill? Force them out. */
72556+ if (i != 0)
72557+ goto refill;
72558+ /* Could not allocate any skbuffs. Try again later. */
72559+ mod_timer(&np->rx_refill_timer,
72560+ jiffies + (HZ/10));
72561+ break;
72562+ }
72563+
72564+ skb_reserve(skb, 16 + NET_IP_ALIGN); /* mimic dev_alloc_skb() */
72565+ skb_shinfo(skb)->frags[0].page = page;
72566+ skb_shinfo(skb)->nr_frags = 1;
72567+ __skb_queue_tail(&np->rx_batch, skb);
72568+ }
72569+
72570+ /* Is the batch large enough to be worthwhile? */
72571+ if (i < (np->rx_target/2)) {
72572+ if (req_prod > np->rx.sring->req_prod)
72573+ goto push;
72574+ return;
72575+ }
72576+
72577+ /* Adjust our fill target if we risked running out of buffers. */
72578+ if (((req_prod - np->rx.sring->rsp_prod) < (np->rx_target / 4)) &&
72579+ ((np->rx_target *= 2) > np->rx_max_target))
72580+ np->rx_target = np->rx_max_target;
72581+
72582+ refill:
72583+ for (nr_flips = i = 0; ; i++) {
72584+ if ((skb = __skb_dequeue(&np->rx_batch)) == NULL)
72585+ break;
72586+
72587+ skb->dev = dev;
72588+
72589+ id = xennet_rxidx(req_prod + i);
72590+
72591+ BUG_ON(np->rx_skbs[id]);
72592+ np->rx_skbs[id] = skb;
72593+
72594+ ref = gnttab_claim_grant_reference(&np->gref_rx_head);
72595+ BUG_ON((signed short)ref < 0);
72596+ np->grant_rx_ref[id] = ref;
72597+
72598+ pfn = page_to_pfn(skb_shinfo(skb)->frags[0].page);
72599+ vaddr = page_address(skb_shinfo(skb)->frags[0].page);
72600+
72601+ req = RING_GET_REQUEST(&np->rx, req_prod + i);
72602+ if (!np->copying_receiver) {
72603+ gnttab_grant_foreign_transfer_ref(ref,
72604+ np->xbdev->otherend_id,
72605+ pfn);
72606+ np->rx_pfn_array[nr_flips] = pfn_to_mfn(pfn);
72607+ if (!xen_feature(XENFEAT_auto_translated_physmap)) {
72608+ /* Remove this page before passing
72609+ * back to Xen. */
72610+ set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
72611+ MULTI_update_va_mapping(np->rx_mcl+i,
72612+ (unsigned long)vaddr,
72613+ __pte(0), 0);
72614+ }
72615+ nr_flips++;
72616+ } else {
72617+ gnttab_grant_foreign_access_ref(ref,
72618+ np->xbdev->otherend_id,
72619+ pfn_to_mfn(pfn),
72620+ 0);
72621+ }
72622+
72623+ req->id = id;
72624+ req->gref = ref;
72625+ }
72626+
72627+ if ( nr_flips != 0 ) {
72628+		/* Tell the balloon driver what is going on. */
72629+ balloon_update_driver_allowance(i);
72630+
72631+ set_xen_guest_handle(reservation.extent_start,
72632+ np->rx_pfn_array);
72633+ reservation.nr_extents = nr_flips;
72634+ reservation.extent_order = 0;
72635+ reservation.address_bits = 0;
72636+ reservation.domid = DOMID_SELF;
72637+
72638+ if (!xen_feature(XENFEAT_auto_translated_physmap)) {
72639+ /* After all PTEs have been zapped, flush the TLB. */
72640+ np->rx_mcl[i-1].args[MULTI_UVMFLAGS_INDEX] =
72641+ UVMF_TLB_FLUSH|UVMF_ALL;
72642+
72643+ /* Give away a batch of pages. */
72644+ np->rx_mcl[i].op = __HYPERVISOR_memory_op;
72645+ np->rx_mcl[i].args[0] = XENMEM_decrease_reservation;
72646+ np->rx_mcl[i].args[1] = (unsigned long)&reservation;
72647+
72648+ /* Zap PTEs and give away pages in one big
72649+ * multicall. */
72650+ (void)HYPERVISOR_multicall(np->rx_mcl, i+1);
72651+
72652+ /* Check return status of HYPERVISOR_memory_op(). */
72653+ if (unlikely(np->rx_mcl[i].result != i))
72654+ panic("Unable to reduce memory reservation\n");
72655+ } else {
72656+ if (HYPERVISOR_memory_op(XENMEM_decrease_reservation,
72657+ &reservation) != i)
72658+ panic("Unable to reduce memory reservation\n");
72659+ }
72660+ } else {
72661+ wmb();
72662+ }
72663+
72664+ /* Above is a suitable barrier to ensure backend will see requests. */
72665+ np->rx.req_prod_pvt = req_prod + i;
72666+ push:
72667+ RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&np->rx, notify);
72668+ if (notify)
72669+ notify_remote_via_irq(np->irq);
72670+}
72671+
72672+static void xennet_make_frags(struct sk_buff *skb, struct net_device *dev,
72673+ struct netif_tx_request *tx)
72674+{
72675+ struct netfront_info *np = netdev_priv(dev);
72676+ char *data = skb->data;
72677+ unsigned long mfn;
72678+ RING_IDX prod = np->tx.req_prod_pvt;
72679+ int frags = skb_shinfo(skb)->nr_frags;
72680+ unsigned int offset = offset_in_page(data);
72681+ unsigned int len = skb_headlen(skb);
72682+ unsigned int id;
72683+ grant_ref_t ref;
72684+ int i;
72685+
72686+ while (len > PAGE_SIZE - offset) {
72687+ tx->size = PAGE_SIZE - offset;
72688+ tx->flags |= NETTXF_more_data;
72689+ len -= tx->size;
72690+ data += tx->size;
72691+ offset = 0;
72692+
72693+ id = get_id_from_freelist(np->tx_skbs);
72694+ np->tx_skbs[id] = skb_get(skb);
72695+ tx = RING_GET_REQUEST(&np->tx, prod++);
72696+ tx->id = id;
72697+ ref = gnttab_claim_grant_reference(&np->gref_tx_head);
72698+ BUG_ON((signed short)ref < 0);
72699+
72700+ mfn = virt_to_mfn(data);
72701+ gnttab_grant_foreign_access_ref(ref, np->xbdev->otherend_id,
72702+ mfn, GNTMAP_readonly);
72703+
72704+ tx->gref = np->grant_tx_ref[id] = ref;
72705+ tx->offset = offset;
72706+ tx->size = len;
72707+ tx->flags = 0;
72708+ }
72709+
72710+ for (i = 0; i < frags; i++) {
72711+ skb_frag_t *frag = skb_shinfo(skb)->frags + i;
72712+
72713+ tx->flags |= NETTXF_more_data;
72714+
72715+ id = get_id_from_freelist(np->tx_skbs);
72716+ np->tx_skbs[id] = skb_get(skb);
72717+ tx = RING_GET_REQUEST(&np->tx, prod++);
72718+ tx->id = id;
72719+ ref = gnttab_claim_grant_reference(&np->gref_tx_head);
72720+ BUG_ON((signed short)ref < 0);
72721+
72722+ mfn = pfn_to_mfn(page_to_pfn(frag->page));
72723+ gnttab_grant_foreign_access_ref(ref, np->xbdev->otherend_id,
72724+ mfn, GNTMAP_readonly);
72725+
72726+ tx->gref = np->grant_tx_ref[id] = ref;
72727+ tx->offset = frag->page_offset;
72728+ tx->size = frag->size;
72729+ tx->flags = 0;
72730+ }
72731+
72732+ np->tx.req_prod_pvt = prod;
72733+}
72734+
72735+static int network_start_xmit(struct sk_buff *skb, struct net_device *dev)
72736+{
72737+ unsigned short id;
72738+ struct netfront_info *np = netdev_priv(dev);
72739+ struct netif_tx_request *tx;
72740+ struct netif_extra_info *extra;
72741+ char *data = skb->data;
72742+ RING_IDX i;
72743+ grant_ref_t ref;
72744+ unsigned long mfn;
72745+ int notify;
72746+ int frags = skb_shinfo(skb)->nr_frags;
72747+ unsigned int offset = offset_in_page(data);
72748+ unsigned int len = skb_headlen(skb);
72749+
72750+ frags += (offset + len + PAGE_SIZE - 1) / PAGE_SIZE;
72751+ if (unlikely(frags > MAX_SKB_FRAGS + 1)) {
72752+ printk(KERN_ALERT "xennet: skb rides the rocket: %d frags\n",
72753+ frags);
72754+ dump_stack();
72755+ goto drop;
72756+ }
72757+
72758+ spin_lock_irq(&np->tx_lock);
72759+
72760+ if (unlikely(!netif_carrier_ok(dev) ||
72761+ (frags > 1 && !xennet_can_sg(dev)) ||
72762+ netif_needs_gso(dev, skb))) {
72763+ spin_unlock_irq(&np->tx_lock);
72764+ goto drop;
72765+ }
72766+
72767+ i = np->tx.req_prod_pvt;
72768+
72769+ id = get_id_from_freelist(np->tx_skbs);
72770+ np->tx_skbs[id] = skb;
72771+
72772+ tx = RING_GET_REQUEST(&np->tx, i);
72773+
72774+ tx->id = id;
72775+ ref = gnttab_claim_grant_reference(&np->gref_tx_head);
72776+ BUG_ON((signed short)ref < 0);
72777+ mfn = virt_to_mfn(data);
72778+ gnttab_grant_foreign_access_ref(
72779+ ref, np->xbdev->otherend_id, mfn, GNTMAP_readonly);
72780+ tx->gref = np->grant_tx_ref[id] = ref;
72781+ tx->offset = offset;
72782+ tx->size = len;
72783+
72784+ tx->flags = 0;
72785+ extra = NULL;
72786+
72787+ if (skb->ip_summed == CHECKSUM_HW) /* local packet? */
72788+ tx->flags |= NETTXF_csum_blank | NETTXF_data_validated;
72789+#ifdef CONFIG_XEN
72790+ if (skb->proto_data_valid) /* remote but checksummed? */
72791+ tx->flags |= NETTXF_data_validated;
72792+#endif
72793+
72794+#ifdef HAVE_TSO
72795+ if (skb_shinfo(skb)->gso_size) {
72796+ struct netif_extra_info *gso = (struct netif_extra_info *)
72797+ RING_GET_REQUEST(&np->tx, ++i);
72798+
72799+ if (extra)
72800+ extra->flags |= XEN_NETIF_EXTRA_FLAG_MORE;
72801+ else
72802+ tx->flags |= NETTXF_extra_info;
72803+
72804+ gso->u.gso.size = skb_shinfo(skb)->gso_size;
72805+ gso->u.gso.type = XEN_NETIF_GSO_TYPE_TCPV4;
72806+ gso->u.gso.pad = 0;
72807+ gso->u.gso.features = 0;
72808+
72809+ gso->type = XEN_NETIF_EXTRA_TYPE_GSO;
72810+ gso->flags = 0;
72811+ extra = gso;
72812+ }
72813+#endif
72814+
72815+ np->tx.req_prod_pvt = i + 1;
72816+
72817+ xennet_make_frags(skb, dev, tx);
72818+ tx->size = skb->len;
72819+
72820+ RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&np->tx, notify);
72821+ if (notify)
72822+ notify_remote_via_irq(np->irq);
72823+
72824+ network_tx_buf_gc(dev);
72825+
72826+ if (!netfront_tx_slot_available(np))
72827+ netif_stop_queue(dev);
72828+
72829+ spin_unlock_irq(&np->tx_lock);
72830+
72831+ np->stats.tx_bytes += skb->len;
72832+ np->stats.tx_packets++;
72833+
72834+ return 0;
72835+
72836+ drop:
72837+ np->stats.tx_dropped++;
72838+ dev_kfree_skb(skb);
72839+ return 0;
72840+}
72841+
72842+static irqreturn_t netif_int(int irq, void *dev_id, struct pt_regs *ptregs)
72843+{
72844+ struct net_device *dev = dev_id;
72845+ struct netfront_info *np = netdev_priv(dev);
72846+ unsigned long flags;
72847+
72848+ spin_lock_irqsave(&np->tx_lock, flags);
72849+
72850+ if (likely(netif_carrier_ok(dev))) {
72851+ network_tx_buf_gc(dev);
72852+ /* Under tx_lock: protects access to rx shared-ring indexes. */
72853+ if (RING_HAS_UNCONSUMED_RESPONSES(&np->rx))
72854+ netif_rx_schedule(dev);
72855+ }
72856+
72857+ spin_unlock_irqrestore(&np->tx_lock, flags);
72858+
72859+ return IRQ_HANDLED;
72860+}
72861+
72862+static void xennet_move_rx_slot(struct netfront_info *np, struct sk_buff *skb,
72863+ grant_ref_t ref)
72864+{
72865+ int new = xennet_rxidx(np->rx.req_prod_pvt);
72866+
72867+ BUG_ON(np->rx_skbs[new]);
72868+ np->rx_skbs[new] = skb;
72869+ np->grant_rx_ref[new] = ref;
72870+ RING_GET_REQUEST(&np->rx, np->rx.req_prod_pvt)->id = new;
72871+ RING_GET_REQUEST(&np->rx, np->rx.req_prod_pvt)->gref = ref;
72872+ np->rx.req_prod_pvt++;
72873+}
72874+
72875+int xennet_get_extras(struct netfront_info *np,
72876+ struct netif_extra_info *extras, RING_IDX rp)
72877+
72878+{
72879+ struct netif_extra_info *extra;
72880+ RING_IDX cons = np->rx.rsp_cons;
72881+ int err = 0;
72882+
72883+ do {
72884+ struct sk_buff *skb;
72885+ grant_ref_t ref;
72886+
72887+ if (unlikely(cons + 1 == rp)) {
72888+ if (net_ratelimit())
72889+ WPRINTK("Missing extra info\n");
72890+ err = -EBADR;
72891+ break;
72892+ }
72893+
72894+ extra = (struct netif_extra_info *)
72895+ RING_GET_RESPONSE(&np->rx, ++cons);
72896+
72897+ if (unlikely(!extra->type ||
72898+ extra->type >= XEN_NETIF_EXTRA_TYPE_MAX)) {
72899+ if (net_ratelimit())
72900+ WPRINTK("Invalid extra type: %d\n",
72901+ extra->type);
72902+ err = -EINVAL;
72903+ } else {
72904+ memcpy(&extras[extra->type - 1], extra,
72905+ sizeof(*extra));
72906+ }
72907+
72908+ skb = xennet_get_rx_skb(np, cons);
72909+ ref = xennet_get_rx_ref(np, cons);
72910+ xennet_move_rx_slot(np, skb, ref);
72911+ } while (extra->flags & XEN_NETIF_EXTRA_FLAG_MORE);
72912+
72913+ np->rx.rsp_cons = cons;
72914+ return err;
72915+}
72916+
72917+static int xennet_get_responses(struct netfront_info *np,
72918+ struct netfront_rx_info *rinfo, RING_IDX rp,
72919+ struct sk_buff_head *list,
72920+ int *pages_flipped_p)
72921+{
72922+ int pages_flipped = *pages_flipped_p;
72923+ struct mmu_update *mmu;
72924+ struct multicall_entry *mcl;
72925+ struct netif_rx_response *rx = &rinfo->rx;
72926+ struct netif_extra_info *extras = rinfo->extras;
72927+ RING_IDX cons = np->rx.rsp_cons;
72928+ struct sk_buff *skb = xennet_get_rx_skb(np, cons);
72929+ grant_ref_t ref = xennet_get_rx_ref(np, cons);
72930+ int max = MAX_SKB_FRAGS + (rx->status <= RX_COPY_THRESHOLD);
72931+ int frags = 1;
72932+ int err = 0;
72933+ unsigned long ret;
72934+
72935+ if (rx->flags & NETRXF_extra_info) {
72936+ err = xennet_get_extras(np, extras, rp);
72937+ cons = np->rx.rsp_cons;
72938+ }
72939+
72940+ for (;;) {
72941+ unsigned long mfn;
72942+
72943+ if (unlikely(rx->status < 0 ||
72944+ rx->offset + rx->status > PAGE_SIZE)) {
72945+ if (net_ratelimit())
72946+ WPRINTK("rx->offset: %x, size: %u\n",
72947+ rx->offset, rx->status);
72948+ xennet_move_rx_slot(np, skb, ref);
72949+ err = -EINVAL;
72950+ goto next;
72951+ }
72952+
72953+ /*
72954+ * This definitely indicates a bug, either in this driver or in
72955+ * the backend driver. In future this should flag the bad
72956+	 * situation to the system controller to reboot the backend.
72957+ */
72958+ if (ref == GRANT_INVALID_REF) {
72959+ if (net_ratelimit())
72960+ WPRINTK("Bad rx response id %d.\n", rx->id);
72961+ err = -EINVAL;
72962+ goto next;
72963+ }
72964+
72965+ if (!np->copying_receiver) {
72966+ /* Memory pressure, insufficient buffer
72967+ * headroom, ... */
72968+ if (!(mfn = gnttab_end_foreign_transfer_ref(ref))) {
72969+ if (net_ratelimit())
72970+ WPRINTK("Unfulfilled rx req "
72971+ "(id=%d, st=%d).\n",
72972+ rx->id, rx->status);
72973+ xennet_move_rx_slot(np, skb, ref);
72974+ err = -ENOMEM;
72975+ goto next;
72976+ }
72977+
72978+ if (!xen_feature(XENFEAT_auto_translated_physmap)) {
72979+ /* Remap the page. */
72980+ struct page *page =
72981+ skb_shinfo(skb)->frags[0].page;
72982+ unsigned long pfn = page_to_pfn(page);
72983+ void *vaddr = page_address(page);
72984+
72985+ mcl = np->rx_mcl + pages_flipped;
72986+ mmu = np->rx_mmu + pages_flipped;
72987+
72988+ MULTI_update_va_mapping(mcl,
72989+ (unsigned long)vaddr,
72990+ pfn_pte_ma(mfn,
72991+ PAGE_KERNEL),
72992+ 0);
72993+ mmu->ptr = ((maddr_t)mfn << PAGE_SHIFT)
72994+ | MMU_MACHPHYS_UPDATE;
72995+ mmu->val = pfn;
72996+
72997+ set_phys_to_machine(pfn, mfn);
72998+ }
72999+ pages_flipped++;
73000+ } else {
73001+ ret = gnttab_end_foreign_access_ref(ref, 0);
73002+ BUG_ON(!ret);
73003+ }
73004+
73005+ gnttab_release_grant_reference(&np->gref_rx_head, ref);
73006+
73007+ __skb_queue_tail(list, skb);
73008+
73009+next:
73010+ if (!(rx->flags & NETRXF_more_data))
73011+ break;
73012+
73013+ if (cons + frags == rp) {
73014+ if (net_ratelimit())
73015+ WPRINTK("Need more frags\n");
73016+ err = -ENOENT;
73017+ break;
73018+ }
73019+
73020+ rx = RING_GET_RESPONSE(&np->rx, cons + frags);
73021+ skb = xennet_get_rx_skb(np, cons + frags);
73022+ ref = xennet_get_rx_ref(np, cons + frags);
73023+ frags++;
73024+ }
73025+
73026+ if (unlikely(frags > max)) {
73027+ if (net_ratelimit())
73028+ WPRINTK("Too many frags\n");
73029+ err = -E2BIG;
73030+ }
73031+
73032+ if (unlikely(err))
73033+ np->rx.rsp_cons = cons + frags;
73034+
73035+ *pages_flipped_p = pages_flipped;
73036+
73037+ return err;
73038+}
73039+
73040+static RING_IDX xennet_fill_frags(struct netfront_info *np,
73041+ struct sk_buff *skb,
73042+ struct sk_buff_head *list)
73043+{
73044+ struct skb_shared_info *shinfo = skb_shinfo(skb);
73045+ int nr_frags = shinfo->nr_frags;
73046+ RING_IDX cons = np->rx.rsp_cons;
73047+ skb_frag_t *frag = shinfo->frags + nr_frags;
73048+ struct sk_buff *nskb;
73049+
73050+ while ((nskb = __skb_dequeue(list))) {
73051+ struct netif_rx_response *rx =
73052+ RING_GET_RESPONSE(&np->rx, ++cons);
73053+
73054+ frag->page = skb_shinfo(nskb)->frags[0].page;
73055+ frag->page_offset = rx->offset;
73056+ frag->size = rx->status;
73057+
73058+ skb->data_len += rx->status;
73059+
73060+ skb_shinfo(nskb)->nr_frags = 0;
73061+ kfree_skb(nskb);
73062+
73063+ frag++;
73064+ nr_frags++;
73065+ }
73066+
73067+ shinfo->nr_frags = nr_frags;
73068+ return cons;
73069+}
73070+
73071+static int xennet_set_skb_gso(struct sk_buff *skb,
73072+ struct netif_extra_info *gso)
73073+{
73074+ if (!gso->u.gso.size) {
73075+ if (net_ratelimit())
73076+ WPRINTK("GSO size must not be zero.\n");
73077+ return -EINVAL;
73078+ }
73079+
73080+ /* Currently only TCPv4 S.O. is supported. */
73081+ if (gso->u.gso.type != XEN_NETIF_GSO_TYPE_TCPV4) {
73082+ if (net_ratelimit())
73083+ WPRINTK("Bad GSO type %d.\n", gso->u.gso.type);
73084+ return -EINVAL;
73085+ }
73086+
73087+#ifdef HAVE_TSO
73088+ skb_shinfo(skb)->gso_size = gso->u.gso.size;
73089+#ifdef HAVE_GSO
73090+ skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
73091+
73092+ /* Header must be checked, and gso_segs computed. */
73093+ skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
73094+#endif
73095+ skb_shinfo(skb)->gso_segs = 0;
73096+
73097+ return 0;
73098+#else
73099+ if (net_ratelimit())
73100+ WPRINTK("GSO unsupported by this kernel.\n");
73101+ return -EINVAL;
73102+#endif
73103+}
73104+
73105+static int netif_poll(struct net_device *dev, int *pbudget)
73106+{
73107+ struct netfront_info *np = netdev_priv(dev);
73108+ struct sk_buff *skb;
73109+ struct netfront_rx_info rinfo;
73110+ struct netif_rx_response *rx = &rinfo.rx;
73111+ struct netif_extra_info *extras = rinfo.extras;
73112+ RING_IDX i, rp;
73113+ struct multicall_entry *mcl;
73114+ int work_done, budget, more_to_do = 1;
73115+ struct sk_buff_head rxq;
73116+ struct sk_buff_head errq;
73117+ struct sk_buff_head tmpq;
73118+ unsigned long flags;
73119+ unsigned int len;
73120+ int pages_flipped = 0;
73121+ int err;
73122+
73123+ spin_lock(&np->rx_lock);
73124+
73125+ if (unlikely(!netif_carrier_ok(dev))) {
73126+ spin_unlock(&np->rx_lock);
73127+ return 0;
73128+ }
73129+
73130+ skb_queue_head_init(&rxq);
73131+ skb_queue_head_init(&errq);
73132+ skb_queue_head_init(&tmpq);
73133+
73134+ if ((budget = *pbudget) > dev->quota)
73135+ budget = dev->quota;
73136+ rp = np->rx.sring->rsp_prod;
73137+ rmb(); /* Ensure we see queued responses up to 'rp'. */
73138+
73139+ i = np->rx.rsp_cons;
73140+ work_done = 0;
73141+ while ((i != rp) && (work_done < budget)) {
73142+ memcpy(rx, RING_GET_RESPONSE(&np->rx, i), sizeof(*rx));
73143+		memset(extras, 0, sizeof(rinfo.extras));
73144+
73145+ err = xennet_get_responses(np, &rinfo, rp, &tmpq,
73146+ &pages_flipped);
73147+
73148+ if (unlikely(err)) {
73149+err:
73150+ while ((skb = __skb_dequeue(&tmpq)))
73151+ __skb_queue_tail(&errq, skb);
73152+ np->stats.rx_errors++;
73153+ i = np->rx.rsp_cons;
73154+ continue;
73155+ }
73156+
73157+ skb = __skb_dequeue(&tmpq);
73158+
73159+ if (extras[XEN_NETIF_EXTRA_TYPE_GSO - 1].type) {
73160+ struct netif_extra_info *gso;
73161+ gso = &extras[XEN_NETIF_EXTRA_TYPE_GSO - 1];
73162+
73163+ if (unlikely(xennet_set_skb_gso(skb, gso))) {
73164+ __skb_queue_head(&tmpq, skb);
73165+ np->rx.rsp_cons += skb_queue_len(&tmpq);
73166+ goto err;
73167+ }
73168+ }
73169+
73170+ skb->nh.raw = (void *)skb_shinfo(skb)->frags[0].page;
73171+ skb->h.raw = skb->nh.raw + rx->offset;
73172+
73173+ len = rx->status;
73174+ if (len > RX_COPY_THRESHOLD)
73175+ len = RX_COPY_THRESHOLD;
73176+ skb_put(skb, len);
73177+
73178+ if (rx->status > len) {
73179+ skb_shinfo(skb)->frags[0].page_offset =
73180+ rx->offset + len;
73181+ skb_shinfo(skb)->frags[0].size = rx->status - len;
73182+ skb->data_len = rx->status - len;
73183+ } else {
73184+ skb_shinfo(skb)->frags[0].page = NULL;
73185+ skb_shinfo(skb)->nr_frags = 0;
73186+ }
73187+
73188+ i = xennet_fill_frags(np, skb, &tmpq);
73189+
73190+ /*
73191+	 * Truesize must approximate the size of true data plus
73192+ * any supervisor overheads. Adding hypervisor overheads
73193+ * has been shown to significantly reduce achievable
73194+ * bandwidth with the default receive buffer size. It is
73195+ * therefore not wise to account for it here.
73196+ *
73197+ * After alloc_skb(RX_COPY_THRESHOLD), truesize is set to
73198+ * RX_COPY_THRESHOLD + the supervisor overheads. Here, we
73199+ * add the size of the data pulled in xennet_fill_frags().
73200+ *
73201+ * We also adjust for any unused space in the main data
73202+ * area by subtracting (RX_COPY_THRESHOLD - len). This is
73203+ * especially important with drivers which split incoming
73204+ * packets into header and data, using only 66 bytes of
73205+ * the main data area (see the e1000 driver for example.)
73206+	 * On such systems, without this last adjustment, our
73207+	 * achievable receive throughput using the standard receive
73208+ * buffer size was cut by 25%(!!!).
73209+ */
73210+ skb->truesize += skb->data_len - (RX_COPY_THRESHOLD - len);
73211+ skb->len += skb->data_len;
73212+
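		/*
		 * Worked numeric example (annotation, not part of the original
		 * patch; RX_COPY_THRESHOLD assumed to be 256 here): for a
		 * single-slot 1000-byte response, len is clamped to 256,
		 * data_len becomes 744 and truesize grows by 744 - (256 - 256)
		 * = 744.  For a 100-byte response, len = 100, data_len = 0 and
		 * truesize shrinks by the 156 bytes of unused linear area.
		 */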
73213+ /*
73214+ * Old backends do not assert data_validated but we
73215+ * can infer it from csum_blank so test both flags.
73216+ */
73217+ if (rx->flags & (NETRXF_data_validated|NETRXF_csum_blank))
73218+ skb->ip_summed = CHECKSUM_UNNECESSARY;
73219+ else
73220+ skb->ip_summed = CHECKSUM_NONE;
73221+#ifdef CONFIG_XEN
73222+ skb->proto_data_valid = (skb->ip_summed != CHECKSUM_NONE);
73223+ skb->proto_csum_blank = !!(rx->flags & NETRXF_csum_blank);
73224+#endif
73225+ np->stats.rx_packets++;
73226+ np->stats.rx_bytes += skb->len;
73227+
73228+ __skb_queue_tail(&rxq, skb);
73229+
73230+ np->rx.rsp_cons = ++i;
73231+ work_done++;
73232+ }
73233+
73234+ if (pages_flipped) {
73235+ /* Some pages are no longer absent... */
73236+ balloon_update_driver_allowance(-pages_flipped);
73237+
73238+ /* Do all the remapping work and M2P updates. */
73239+ if (!xen_feature(XENFEAT_auto_translated_physmap)) {
73240+ mcl = np->rx_mcl + pages_flipped;
73241+ mcl->op = __HYPERVISOR_mmu_update;
73242+ mcl->args[0] = (unsigned long)np->rx_mmu;
73243+ mcl->args[1] = pages_flipped;
73244+ mcl->args[2] = 0;
73245+ mcl->args[3] = DOMID_SELF;
73246+ (void)HYPERVISOR_multicall(np->rx_mcl,
73247+ pages_flipped + 1);
73248+ }
73249+ }
73250+
73251+ while ((skb = __skb_dequeue(&errq)))
73252+ kfree_skb(skb);
73253+
73254+ while ((skb = __skb_dequeue(&rxq)) != NULL) {
73255+ struct page *page = (struct page *)skb->nh.raw;
73256+ void *vaddr = page_address(page);
73257+
73258+ memcpy(skb->data, vaddr + (skb->h.raw - skb->nh.raw),
73259+ skb_headlen(skb));
73260+
73261+ if (page != skb_shinfo(skb)->frags[0].page)
73262+ __free_page(page);
73263+
73264+ /* Ethernet work: Delayed to here as it peeks the header. */
73265+ skb->protocol = eth_type_trans(skb, dev);
73266+
73267+ /* Pass it up. */
73268+ netif_receive_skb(skb);
73269+ dev->last_rx = jiffies;
73270+ }
73271+
73272+ /* If we get a callback with very few responses, reduce fill target. */
73273+ /* NB. Note exponential increase, linear decrease. */
73274+ if (((np->rx.req_prod_pvt - np->rx.sring->rsp_prod) >
73275+ ((3*np->rx_target) / 4)) &&
73276+ (--np->rx_target < np->rx_min_target))
73277+ np->rx_target = np->rx_min_target;
73278+
73279+ network_alloc_rx_buffers(dev);
73280+
73281+ *pbudget -= work_done;
73282+ dev->quota -= work_done;
73283+
73284+ if (work_done < budget) {
73285+ local_irq_save(flags);
73286+
73287+ RING_FINAL_CHECK_FOR_RESPONSES(&np->rx, more_to_do);
73288+ if (!more_to_do)
73289+ __netif_rx_complete(dev);
73290+
73291+ local_irq_restore(flags);
73292+ }
73293+
73294+ spin_unlock(&np->rx_lock);
73295+
73296+ return more_to_do;
73297+}
73298+
73299+static void netif_release_tx_bufs(struct netfront_info *np)
73300+{
73301+ struct sk_buff *skb;
73302+ int i;
73303+
73304+ for (i = 1; i <= NET_TX_RING_SIZE; i++) {
73305+ if ((unsigned long)np->tx_skbs[i] < PAGE_OFFSET)
73306+ continue;
73307+
73308+ skb = np->tx_skbs[i];
73309+ gnttab_end_foreign_access_ref(
73310+ np->grant_tx_ref[i], GNTMAP_readonly);
73311+ gnttab_release_grant_reference(
73312+ &np->gref_tx_head, np->grant_tx_ref[i]);
73313+ np->grant_tx_ref[i] = GRANT_INVALID_REF;
73314+ add_id_to_freelist(np->tx_skbs, i);
73315+ dev_kfree_skb_irq(skb);
73316+ }
73317+}
73318+
73319+static void netif_release_rx_bufs(struct netfront_info *np)
73320+{
73321+ struct mmu_update *mmu = np->rx_mmu;
73322+ struct multicall_entry *mcl = np->rx_mcl;
73323+ struct sk_buff_head free_list;
73324+ struct sk_buff *skb;
73325+ unsigned long mfn;
73326+ int xfer = 0, noxfer = 0, unused = 0;
73327+ int id, ref;
73328+
73329+ if (np->copying_receiver) {
73330+ printk("%s: fix me for copying receiver.\n", __FUNCTION__);
73331+ return;
73332+ }
73333+
73334+ skb_queue_head_init(&free_list);
73335+
73336+ spin_lock(&np->rx_lock);
73337+
73338+ for (id = 0; id < NET_RX_RING_SIZE; id++) {
73339+ if ((ref = np->grant_rx_ref[id]) == GRANT_INVALID_REF) {
73340+ unused++;
73341+ continue;
73342+ }
73343+
73344+ skb = np->rx_skbs[id];
73345+ mfn = gnttab_end_foreign_transfer_ref(ref);
73346+ gnttab_release_grant_reference(&np->gref_rx_head, ref);
73347+ np->grant_rx_ref[id] = GRANT_INVALID_REF;
73348+ add_id_to_freelist(np->rx_skbs, id);
73349+
73350+ if (0 == mfn) {
73351+ struct page *page = skb_shinfo(skb)->frags[0].page;
73352+ balloon_release_driver_page(page);
73353+ skb_shinfo(skb)->nr_frags = 0;
73354+ dev_kfree_skb(skb);
73355+ noxfer++;
73356+ continue;
73357+ }
73358+
73359+ if (!xen_feature(XENFEAT_auto_translated_physmap)) {
73360+ /* Remap the page. */
73361+ struct page *page = skb_shinfo(skb)->frags[0].page;
73362+ unsigned long pfn = page_to_pfn(page);
73363+ void *vaddr = page_address(page);
73364+
73365+ MULTI_update_va_mapping(mcl, (unsigned long)vaddr,
73366+ pfn_pte_ma(mfn, PAGE_KERNEL),
73367+ 0);
73368+ mcl++;
73369+ mmu->ptr = ((maddr_t)mfn << PAGE_SHIFT)
73370+ | MMU_MACHPHYS_UPDATE;
73371+ mmu->val = pfn;
73372+ mmu++;
73373+
73374+ set_phys_to_machine(pfn, mfn);
73375+ }
73376+ __skb_queue_tail(&free_list, skb);
73377+ xfer++;
73378+ }
73379+
73380+ printk("%s: %d xfer, %d noxfer, %d unused\n",
73381+ __FUNCTION__, xfer, noxfer, unused);
73382+
73383+ if (xfer) {
73384+ /* Some pages are no longer absent... */
73385+ balloon_update_driver_allowance(-xfer);
73386+
73387+ if (!xen_feature(XENFEAT_auto_translated_physmap)) {
73388+ /* Do all the remapping work and M2P updates. */
73389+ mcl->op = __HYPERVISOR_mmu_update;
73390+ mcl->args[0] = (unsigned long)np->rx_mmu;
73391+ mcl->args[1] = mmu - np->rx_mmu;
73392+ mcl->args[2] = 0;
73393+ mcl->args[3] = DOMID_SELF;
73394+ mcl++;
73395+ HYPERVISOR_multicall(np->rx_mcl, mcl - np->rx_mcl);
73396+ }
73397+ }
73398+
73399+ while ((skb = __skb_dequeue(&free_list)) != NULL)
73400+ dev_kfree_skb(skb);
73401+
73402+ spin_unlock(&np->rx_lock);
73403+}
73404+
73405+static int network_close(struct net_device *dev)
73406+{
73407+ struct netfront_info *np = netdev_priv(dev);
73408+ netif_stop_queue(np->netdev);
73409+ return 0;
73410+}
73411+
73412+
73413+static struct net_device_stats *network_get_stats(struct net_device *dev)
73414+{
73415+ struct netfront_info *np = netdev_priv(dev);
73416+ return &np->stats;
73417+}
73418+
73419+static int xennet_change_mtu(struct net_device *dev, int mtu)
73420+{
73421+ int max = xennet_can_sg(dev) ? 65535 - ETH_HLEN : ETH_DATA_LEN;
73422+
73423+ if (mtu > max)
73424+ return -EINVAL;
73425+ dev->mtu = mtu;
73426+ return 0;
73427+}
73428+
73429+static int xennet_set_sg(struct net_device *dev, u32 data)
73430+{
73431+ if (data) {
73432+ struct netfront_info *np = netdev_priv(dev);
73433+ int val;
73434+
73435+ if (xenbus_scanf(XBT_NIL, np->xbdev->otherend, "feature-sg",
73436+ "%d", &val) < 0)
73437+ val = 0;
73438+ if (!val)
73439+ return -ENOSYS;
73440+ } else if (dev->mtu > ETH_DATA_LEN)
73441+ dev->mtu = ETH_DATA_LEN;
73442+
73443+ return ethtool_op_set_sg(dev, data);
73444+}
73445+
73446+static int xennet_set_tso(struct net_device *dev, u32 data)
73447+{
73448+#ifdef HAVE_TSO
73449+ if (data) {
73450+ struct netfront_info *np = netdev_priv(dev);
73451+ int val;
73452+
73453+ if (xenbus_scanf(XBT_NIL, np->xbdev->otherend,
73454+ "feature-gso-tcpv4", "%d", &val) < 0)
73455+ val = 0;
73456+ if (!val)
73457+ return -ENOSYS;
73458+ }
73459+
73460+ return ethtool_op_set_tso(dev, data);
73461+#else
73462+ return -ENOSYS;
73463+#endif
73464+}
73465+
73466+static void xennet_set_features(struct net_device *dev)
73467+{
73468+ dev_disable_gso_features(dev);
73469+ xennet_set_sg(dev, 0);
73470+
73471+ /* We need checksum offload to enable scatter/gather and TSO. */
73472+ if (!(dev->features & NETIF_F_IP_CSUM))
73473+ return;
73474+
73475+ if (xennet_set_sg(dev, 1))
73476+ return;
73477+
73478+ /* Before 2.6.9 TSO seems to be unreliable so do not enable it
73479+ * on older kernels.
73480+ */
73481+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,9)
73482+ xennet_set_tso(dev, 1);
73483+#endif
73484+
73485+}
73486+
73487+static int network_connect(struct net_device *dev)
73488+{
73489+ struct netfront_info *np = netdev_priv(dev);
73490+ int i, requeue_idx, err;
73491+ struct sk_buff *skb;
73492+ grant_ref_t ref;
73493+ netif_rx_request_t *req;
73494+ unsigned int feature_rx_copy, feature_rx_flip;
73495+
73496+ err = xenbus_scanf(XBT_NIL, np->xbdev->otherend,
73497+ "feature-rx-copy", "%u", &feature_rx_copy);
73498+ if (err != 1)
73499+ feature_rx_copy = 0;
73500+ err = xenbus_scanf(XBT_NIL, np->xbdev->otherend,
73501+ "feature-rx-flip", "%u", &feature_rx_flip);
73502+ if (err != 1)
73503+ feature_rx_flip = 1;
73504+
73505+ /*
73506+ * Copy packets on receive path if:
73507+ * (a) This was requested by user, and the backend supports it; or
73508+ * (b) Flipping was requested, but this is unsupported by the backend.
73509+ */
73510+ np->copying_receiver = ((MODPARM_rx_copy && feature_rx_copy) ||
73511+ (MODPARM_rx_flip && !feature_rx_flip));
73512+
73513+ err = talk_to_backend(np->xbdev, np);
73514+ if (err)
73515+ return err;
73516+
73517+ xennet_set_features(dev);
73518+
73519+ IPRINTK("device %s has %sing receive path.\n",
73520+ dev->name, np->copying_receiver ? "copy" : "flipp");
73521+
73522+ spin_lock_irq(&np->tx_lock);
73523+ spin_lock(&np->rx_lock);
73524+
73525+ /*
73526+ * Recovery procedure:
73527+ * NB. Freelist index entries are always going to be less than
73528+ * PAGE_OFFSET, whereas pointers to skbs will always be equal or
73529+ * greater than PAGE_OFFSET: we use this property to distinguish
73530+ * them.
73531+ */
73532+
73533+ /* Step 1: Discard all pending TX packet fragments. */
73534+ netif_release_tx_bufs(np);
73535+
73536+ /* Step 2: Rebuild the RX buffer freelist and the RX ring itself. */
73537+ for (requeue_idx = 0, i = 0; i < NET_RX_RING_SIZE; i++) {
73538+ if (!np->rx_skbs[i])
73539+ continue;
73540+
73541+ skb = np->rx_skbs[requeue_idx] = xennet_get_rx_skb(np, i);
73542+ ref = np->grant_rx_ref[requeue_idx] = xennet_get_rx_ref(np, i);
73543+ req = RING_GET_REQUEST(&np->rx, requeue_idx);
73544+
73545+ if (!np->copying_receiver) {
73546+ gnttab_grant_foreign_transfer_ref(
73547+ ref, np->xbdev->otherend_id,
73548+ page_to_pfn(skb_shinfo(skb)->frags->page));
73549+ } else {
73550+ gnttab_grant_foreign_access_ref(
73551+ ref, np->xbdev->otherend_id,
73552+ pfn_to_mfn(page_to_pfn(skb_shinfo(skb)->
73553+ frags->page)),
73554+ 0);
73555+ }
73556+ req->gref = ref;
73557+ req->id = requeue_idx;
73558+
73559+ requeue_idx++;
73560+ }
73561+
73562+ np->rx.req_prod_pvt = requeue_idx;
73563+
73564+ /*
73565+ * Step 3: All public and private state should now be sane. Get
73566+ * ready to start sending and receiving packets and give the driver
73567+ * domain a kick because we've probably just requeued some
73568+ * packets.
73569+ */
73570+ netif_carrier_on(dev);
73571+ notify_remote_via_irq(np->irq);
73572+ network_tx_buf_gc(dev);
73573+ network_alloc_rx_buffers(dev);
73574+
73575+ spin_unlock(&np->rx_lock);
73576+ spin_unlock_irq(&np->tx_lock);
73577+
73578+ return 0;
73579+}
73580+
73581+static void netif_uninit(struct net_device *dev)
73582+{
73583+ struct netfront_info *np = netdev_priv(dev);
73584+ netif_release_tx_bufs(np);
73585+ netif_release_rx_bufs(np);
73586+ gnttab_free_grant_references(np->gref_tx_head);
73587+ gnttab_free_grant_references(np->gref_rx_head);
73588+}
73589+
73590+static struct ethtool_ops network_ethtool_ops =
73591+{
73592+ .get_tx_csum = ethtool_op_get_tx_csum,
73593+ .set_tx_csum = ethtool_op_set_tx_csum,
73594+ .get_sg = ethtool_op_get_sg,
73595+ .set_sg = xennet_set_sg,
73596+ .get_tso = ethtool_op_get_tso,
73597+ .set_tso = xennet_set_tso,
73598+ .get_link = ethtool_op_get_link,
73599+};
73600+
73601+#ifdef CONFIG_SYSFS
73602+static ssize_t show_rxbuf_min(struct class_device *cd, char *buf)
73603+{
73604+ struct net_device *netdev = container_of(cd, struct net_device,
73605+ class_dev);
73606+ struct netfront_info *info = netdev_priv(netdev);
73607+
73608+ return sprintf(buf, "%u\n", info->rx_min_target);
73609+}
73610+
73611+static ssize_t store_rxbuf_min(struct class_device *cd,
73612+ const char *buf, size_t len)
73613+{
73614+ struct net_device *netdev = container_of(cd, struct net_device,
73615+ class_dev);
73616+ struct netfront_info *np = netdev_priv(netdev);
73617+ char *endp;
73618+ unsigned long target;
73619+
73620+ if (!capable(CAP_NET_ADMIN))
73621+ return -EPERM;
73622+
73623+ target = simple_strtoul(buf, &endp, 0);
73624+ if (endp == buf)
73625+ return -EBADMSG;
73626+
73627+ if (target < RX_MIN_TARGET)
73628+ target = RX_MIN_TARGET;
73629+ if (target > RX_MAX_TARGET)
73630+ target = RX_MAX_TARGET;
73631+
73632+ spin_lock(&np->rx_lock);
73633+ if (target > np->rx_max_target)
73634+ np->rx_max_target = target;
73635+ np->rx_min_target = target;
73636+ if (target > np->rx_target)
73637+ np->rx_target = target;
73638+
73639+ network_alloc_rx_buffers(netdev);
73640+
73641+ spin_unlock(&np->rx_lock);
73642+ return len;
73643+}
73644+
73645+static ssize_t show_rxbuf_max(struct class_device *cd, char *buf)
73646+{
73647+ struct net_device *netdev = container_of(cd, struct net_device,
73648+ class_dev);
73649+ struct netfront_info *info = netdev_priv(netdev);
73650+
73651+ return sprintf(buf, "%u\n", info->rx_max_target);
73652+}
73653+
73654+static ssize_t store_rxbuf_max(struct class_device *cd,
73655+ const char *buf, size_t len)
73656+{
73657+ struct net_device *netdev = container_of(cd, struct net_device,
73658+ class_dev);
73659+ struct netfront_info *np = netdev_priv(netdev);
73660+ char *endp;
73661+ unsigned long target;
73662+
73663+ if (!capable(CAP_NET_ADMIN))
73664+ return -EPERM;
73665+
73666+ target = simple_strtoul(buf, &endp, 0);
73667+ if (endp == buf)
73668+ return -EBADMSG;
73669+
73670+ if (target < RX_MIN_TARGET)
73671+ target = RX_MIN_TARGET;
73672+ if (target > RX_MAX_TARGET)
73673+ target = RX_MAX_TARGET;
73674+
73675+ spin_lock(&np->rx_lock);
73676+ if (target < np->rx_min_target)
73677+ np->rx_min_target = target;
73678+ np->rx_max_target = target;
73679+ if (target < np->rx_target)
73680+ np->rx_target = target;
73681+
73682+ network_alloc_rx_buffers(netdev);
73683+
73684+ spin_unlock(&np->rx_lock);
73685+ return len;
73686+}
73687+
73688+static ssize_t show_rxbuf_cur(struct class_device *cd, char *buf)
73689+{
73690+ struct net_device *netdev = container_of(cd, struct net_device,
73691+ class_dev);
73692+ struct netfront_info *info = netdev_priv(netdev);
73693+
73694+ return sprintf(buf, "%u\n", info->rx_target);
73695+}
73696+
73697+static const struct class_device_attribute xennet_attrs[] = {
73698+ __ATTR(rxbuf_min, S_IRUGO|S_IWUSR, show_rxbuf_min, store_rxbuf_min),
73699+ __ATTR(rxbuf_max, S_IRUGO|S_IWUSR, show_rxbuf_max, store_rxbuf_max),
73700+ __ATTR(rxbuf_cur, S_IRUGO, show_rxbuf_cur, NULL),
73701+};
73702+
73703+static int xennet_sysfs_addif(struct net_device *netdev)
73704+{
73705+ int i;
73706+ int error = 0;
73707+
73708+ for (i = 0; i < ARRAY_SIZE(xennet_attrs); i++) {
73709+ error = class_device_create_file(&netdev->class_dev,
73710+ &xennet_attrs[i]);
73711+ if (error)
73712+ goto fail;
73713+ }
73714+ return 0;
73715+
73716+ fail:
73717+ while (--i >= 0)
73718+ class_device_remove_file(&netdev->class_dev,
73719+ &xennet_attrs[i]);
73720+ return error;
73721+}
73722+
73723+static void xennet_sysfs_delif(struct net_device *netdev)
73724+{
73725+ int i;
73726+
73727+ for (i = 0; i < ARRAY_SIZE(xennet_attrs); i++) {
73728+ class_device_remove_file(&netdev->class_dev,
73729+ &xennet_attrs[i]);
73730+ }
73731+}
73732+
73733+#endif /* CONFIG_SYSFS */
73734+
73735+
73736+/*
73737+ * Nothing to do here. Virtual interface is point-to-point and the
73738+ * physical interface is probably promiscuous anyway.
73739+ */
73740+static void network_set_multicast_list(struct net_device *dev)
73741+{
73742+}
73743+
73744+static struct net_device * __devinit create_netdev(struct xenbus_device *dev)
73745+{
73746+ int i, err = 0;
73747+ struct net_device *netdev = NULL;
73748+ struct netfront_info *np = NULL;
73749+
73750+ netdev = alloc_etherdev(sizeof(struct netfront_info));
73751+ if (!netdev) {
73752+ printk(KERN_WARNING "%s> alloc_etherdev failed.\n",
73753+ __FUNCTION__);
73754+ return ERR_PTR(-ENOMEM);
73755+ }
73756+
73757+ np = netdev_priv(netdev);
73758+ np->xbdev = dev;
73759+
73760+ netif_carrier_off(netdev);
73761+
73762+ spin_lock_init(&np->tx_lock);
73763+ spin_lock_init(&np->rx_lock);
73764+
73765+ skb_queue_head_init(&np->rx_batch);
73766+ np->rx_target = RX_DFL_MIN_TARGET;
73767+ np->rx_min_target = RX_DFL_MIN_TARGET;
73768+ np->rx_max_target = RX_MAX_TARGET;
73769+
73770+ init_timer(&np->rx_refill_timer);
73771+ np->rx_refill_timer.data = (unsigned long)netdev;
73772+ np->rx_refill_timer.function = rx_refill_timeout;
73773+
73774+ /* Initialise {tx,rx}_skbs as a free chain containing every entry. */
73775+ for (i = 0; i <= NET_TX_RING_SIZE; i++) {
73776+ np->tx_skbs[i] = (void *)((unsigned long) i+1);
73777+ np->grant_tx_ref[i] = GRANT_INVALID_REF;
73778+ }
73779+
73780+ for (i = 0; i < NET_RX_RING_SIZE; i++) {
73781+ np->rx_skbs[i] = NULL;
73782+ np->grant_rx_ref[i] = GRANT_INVALID_REF;
73783+ }
73784+
73785+ /* A grant for every tx ring slot */
73786+ if (gnttab_alloc_grant_references(TX_MAX_TARGET,
73787+ &np->gref_tx_head) < 0) {
73788+ printk(KERN_ALERT "#### netfront can't alloc tx grant refs\n");
73789+ err = -ENOMEM;
73790+ goto exit;
73791+ }
73792+ /* A grant for every rx ring slot */
73793+ if (gnttab_alloc_grant_references(RX_MAX_TARGET,
73794+ &np->gref_rx_head) < 0) {
73795+ printk(KERN_ALERT "#### netfront can't alloc rx grant refs\n");
73796+ err = -ENOMEM;
73797+ goto exit_free_tx;
73798+ }
73799+
73800+ netdev->open = network_open;
73801+ netdev->hard_start_xmit = network_start_xmit;
73802+ netdev->stop = network_close;
73803+ netdev->get_stats = network_get_stats;
73804+ netdev->poll = netif_poll;
73805+ netdev->set_multicast_list = network_set_multicast_list;
73806+ netdev->uninit = netif_uninit;
73807+ netdev->change_mtu = xennet_change_mtu;
73808+ netdev->weight = 64;
73809+ netdev->features = NETIF_F_IP_CSUM;
73810+
73811+ SET_ETHTOOL_OPS(netdev, &network_ethtool_ops);
73812+ SET_MODULE_OWNER(netdev);
73813+ SET_NETDEV_DEV(netdev, &dev->dev);
73814+
73815+ np->netdev = netdev;
73816+ return netdev;
73817+
73818+ exit_free_tx:
73819+ gnttab_free_grant_references(np->gref_tx_head);
73820+ exit:
73821+ free_netdev(netdev);
73822+ return ERR_PTR(err);
73823+}
73824+
73825+/*
73826+ * We use this notifier to send out a fake ARP reply to reset switches and
73827+ * router ARP caches when an IP interface is brought up on a VIF.
73828+ */
73829+static int
73830+inetdev_notify(struct notifier_block *this, unsigned long event, void *ptr)
73831+{
73832+ struct in_ifaddr *ifa = (struct in_ifaddr *)ptr;
73833+ struct net_device *dev = ifa->ifa_dev->dev;
73834+
73835+ /* UP event and is it one of our devices? */
73836+ if (event == NETDEV_UP && dev->open == network_open)
73837+ (void)send_fake_arp(dev);
73838+
73839+ return NOTIFY_DONE;
73840+}
73841+
73842+
73843+static void netif_disconnect_backend(struct netfront_info *info)
73844+{
73845+ /* Stop old i/f to prevent errors whilst we rebuild the state. */
73846+ spin_lock_irq(&info->tx_lock);
73847+ spin_lock(&info->rx_lock);
73848+ netif_carrier_off(info->netdev);
73849+ spin_unlock(&info->rx_lock);
73850+ spin_unlock_irq(&info->tx_lock);
73851+
73852+ if (info->irq)
73853+ unbind_from_irqhandler(info->irq, info->netdev);
73854+ info->evtchn = info->irq = 0;
73855+
73856+ end_access(info->tx_ring_ref, info->tx.sring);
73857+ end_access(info->rx_ring_ref, info->rx.sring);
73858+ info->tx_ring_ref = GRANT_INVALID_REF;
73859+ info->rx_ring_ref = GRANT_INVALID_REF;
73860+ info->tx.sring = NULL;
73861+ info->rx.sring = NULL;
73862+}
73863+
73864+
73865+static void end_access(int ref, void *page)
73866+{
73867+ if (ref != GRANT_INVALID_REF)
73868+ gnttab_end_foreign_access(ref, 0, (unsigned long)page);
73869+}
73870+
73871+
73872+/* ** Driver registration ** */
73873+
73874+
73875+static struct xenbus_device_id netfront_ids[] = {
73876+ { "vif" },
73877+ { "" }
73878+};
73879+
73880+
73881+static struct xenbus_driver netfront = {
73882+ .name = "vif",
73883+ .owner = THIS_MODULE,
73884+ .ids = netfront_ids,
73885+ .probe = netfront_probe,
73886+ .remove = __devexit_p(netfront_remove),
73887+ .resume = netfront_resume,
73888+ .otherend_changed = backend_changed,
73889+};
73890+
73891+
73892+static struct notifier_block notifier_inetdev = {
73893+ .notifier_call = inetdev_notify,
73894+ .next = NULL,
73895+ .priority = 0
73896+};
73897+
73898+static int __init netif_init(void)
73899+{
73900+ if (!is_running_on_xen())
73901+ return -ENODEV;
73902+
73903+#ifdef CONFIG_XEN
73904+ if (MODPARM_rx_flip && MODPARM_rx_copy) {
73905+ WPRINTK("Cannot specify both rx_copy and rx_flip.\n");
73906+ return -EINVAL;
73907+ }
73908+
73909+ if (!MODPARM_rx_flip && !MODPARM_rx_copy)
73910+ MODPARM_rx_flip = 1; /* Default is to flip. */
73911+#endif
73912+
73913+ if (is_initial_xendomain())
73914+ return 0;
73915+
73916+ IPRINTK("Initialising virtual ethernet driver.\n");
73917+
73918+ (void)register_inetaddr_notifier(&notifier_inetdev);
73919+
73920+ return xenbus_register_frontend(&netfront);
73921+}
73922+module_init(netif_init);
73923+
73924+
73925+static void __exit netif_exit(void)
73926+{
73927+ if (is_initial_xendomain())
73928+ return;
73929+
73930+ unregister_inetaddr_notifier(&notifier_inetdev);
73931+
73932+ return xenbus_unregister_driver(&netfront);
73933+}
73934+module_exit(netif_exit);
73935+
73936+MODULE_LICENSE("Dual BSD/GPL");
73937diff -Nur linux-2.6.16.33-noxen/drivers/xen/pciback/Makefile linux-2.6.16.33/drivers/xen/pciback/Makefile
73938--- linux-2.6.16.33-noxen/drivers/xen/pciback/Makefile 1970-01-01 00:00:00.000000000 +0000
73939+++ linux-2.6.16.33/drivers/xen/pciback/Makefile 2007-01-08 15:00:45.000000000 +0000
73940@@ -0,0 +1,15 @@
73941+obj-$(CONFIG_XEN_PCIDEV_BACKEND) += pciback.o
73942+
73943+pciback-y := pci_stub.o pciback_ops.o xenbus.o
73944+pciback-y += conf_space.o conf_space_header.o \
73945+ conf_space_capability.o \
73946+ conf_space_capability_vpd.o \
73947+ conf_space_capability_pm.o \
73948+ conf_space_quirks.o
73949+pciback-$(CONFIG_XEN_PCIDEV_BACKEND_VPCI) += vpci.o
73950+pciback-$(CONFIG_XEN_PCIDEV_BACKEND_SLOT) += slot.o
73951+pciback-$(CONFIG_XEN_PCIDEV_BACKEND_PASS) += passthrough.o
73952+
73953+ifeq ($(CONFIG_XEN_PCIDEV_BE_DEBUG),y)
73954+EXTRA_CFLAGS += -DDEBUG
73955+endif
73956diff -Nur linux-2.6.16.33-noxen/drivers/xen/pciback/conf_space.c linux-2.6.16.33/drivers/xen/pciback/conf_space.c
73957--- linux-2.6.16.33-noxen/drivers/xen/pciback/conf_space.c 1970-01-01 00:00:00.000000000 +0000
73958+++ linux-2.6.16.33/drivers/xen/pciback/conf_space.c 2007-01-08 15:00:45.000000000 +0000
73959@@ -0,0 +1,425 @@
73960+/*
73961+ * PCI Backend - Functions for creating a virtual configuration space for
73962+ * exported PCI Devices.
73963+ * It's dangerous to allow PCI Driver Domains to change their
73964+ * device's resources (memory, i/o ports, interrupts). We need to
73965+ * restrict changes to certain PCI Configuration registers:
73966+ * BARs, INTERRUPT_PIN, most registers in the header...
73967+ *
73968+ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
73969+ */
73970+
73971+#include <linux/kernel.h>
73972+#include <linux/pci.h>
73973+#include "pciback.h"
73974+#include "conf_space.h"
73975+#include "conf_space_quirks.h"
73976+
73977+#define DEFINE_PCI_CONFIG(op,size,type) \
73978+int pciback_##op##_config_##size \
73979+(struct pci_dev *dev, int offset, type value, void *data) \
73980+{ \
73981+ return pci_##op##_config_##size (dev, offset, value); \
73982+}
73983+
73984+DEFINE_PCI_CONFIG(read, byte, u8 *)
73985+DEFINE_PCI_CONFIG(read, word, u16 *)
73986+DEFINE_PCI_CONFIG(read, dword, u32 *)
73987+
73988+DEFINE_PCI_CONFIG(write, byte, u8)
73989+DEFINE_PCI_CONFIG(write, word, u16)
73990+DEFINE_PCI_CONFIG(write, dword, u32)
73991+
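/*
 * For reference (annotation, not part of the original patch), the first
 * invocation above expands to a thin pass-through wrapper:
 *
 *	int pciback_read_config_byte(struct pci_dev *dev, int offset,
 *				     u8 *value, void *data)
 *	{
 *		return pci_read_config_byte(dev, offset, value);
 *	}
 *
 * i.e. the generated helpers simply forward to the kernel's ordinary
 * config-space accessors and ignore the per-field 'data' cookie.
 */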
73992+static int conf_space_read(struct pci_dev *dev,
73993+ struct config_field_entry *entry, int offset,
73994+ u32 * value)
73995+{
73996+ int ret = 0;
73997+ struct config_field *field = entry->field;
73998+
73999+ *value = 0;
74000+
74001+ switch (field->size) {
74002+ case 1:
74003+ if (field->u.b.read)
74004+ ret = field->u.b.read(dev, offset, (u8 *) value,
74005+ entry->data);
74006+ break;
74007+ case 2:
74008+ if (field->u.w.read)
74009+ ret = field->u.w.read(dev, offset, (u16 *) value,
74010+ entry->data);
74011+ break;
74012+ case 4:
74013+ if (field->u.dw.read)
74014+ ret = field->u.dw.read(dev, offset, value, entry->data);
74015+ break;
74016+ }
74017+ return ret;
74018+}
74019+
74020+static int conf_space_write(struct pci_dev *dev,
74021+ struct config_field_entry *entry, int offset,
74022+ u32 value)
74023+{
74024+ int ret = 0;
74025+ struct config_field *field = entry->field;
74026+
74027+ switch (field->size) {
74028+ case 1:
74029+ if (field->u.b.write)
74030+ ret = field->u.b.write(dev, offset, (u8) value,
74031+ entry->data);
74032+ break;
74033+ case 2:
74034+ if (field->u.w.write)
74035+ ret = field->u.w.write(dev, offset, (u16) value,
74036+ entry->data);
74037+ break;
74038+ case 4:
74039+ if (field->u.dw.write)
74040+ ret = field->u.dw.write(dev, offset, value,
74041+ entry->data);
74042+ break;
74043+ }
74044+ return ret;
74045+}
74046+
74047+static inline u32 get_mask(int size)
74048+{
74049+ if (size == 1)
74050+ return 0xff;
74051+ else if (size == 2)
74052+ return 0xffff;
74053+ else
74054+ return 0xffffffff;
74055+}
74056+
74057+static inline int valid_request(int offset, int size)
74058+{
74059+ /* Validate request (no un-aligned requests) */
74060+ if ((size == 1 || size == 2 || size == 4) && (offset % size) == 0)
74061+ return 1;
74062+ return 0;
74063+}
74064+
74065+static inline u32 merge_value(u32 val, u32 new_val, u32 new_val_mask,
74066+ int offset)
74067+{
74068+ if (offset >= 0) {
74069+ new_val_mask <<= (offset * 8);
74070+ new_val <<= (offset * 8);
74071+ } else {
74072+ new_val_mask >>= (offset * -8);
74073+ new_val >>= (offset * -8);
74074+ }
74075+ val = (val & ~new_val_mask) | (new_val & new_val_mask);
74076+
74077+ return val;
74078+}
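/*
 * Worked example of merge_value()/get_mask() (annotation, not part of the
 * original patch): suppose a one-byte write of 0xAB arrives at config
 * offset 0x06 and a virtualised dword field starts at offset 0x04.
 * pciback_config_write() below calls merge_value(tmp_val, 0xAB,
 * get_mask(1), 0x06 - 0x04): new_val_mask becomes 0xff << 16 and new_val
 * becomes 0xAB << 16, so only byte 2 of the cached dword is replaced and
 * the other three bytes of tmp_val are preserved.
 */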
74079+
74080+static int pcibios_err_to_errno(int err)
74081+{
74082+ switch (err) {
74083+ case PCIBIOS_SUCCESSFUL:
74084+ return XEN_PCI_ERR_success;
74085+ case PCIBIOS_DEVICE_NOT_FOUND:
74086+ return XEN_PCI_ERR_dev_not_found;
74087+ case PCIBIOS_BAD_REGISTER_NUMBER:
74088+ return XEN_PCI_ERR_invalid_offset;
74089+ case PCIBIOS_FUNC_NOT_SUPPORTED:
74090+ return XEN_PCI_ERR_not_implemented;
74091+ case PCIBIOS_SET_FAILED:
74092+ return XEN_PCI_ERR_access_denied;
74093+ }
74094+ return err;
74095+}
74096+
74097+int pciback_config_read(struct pci_dev *dev, int offset, int size,
74098+ u32 * ret_val)
74099+{
74100+ int err = 0;
74101+ struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
74102+ struct config_field_entry *cfg_entry;
74103+ struct config_field *field;
74104+ int req_start, req_end, field_start, field_end;
74105+ /* if read fails for any reason, return 0 (as if device didn't respond) */
74106+ u32 value = 0, tmp_val;
74107+
74108+ if (unlikely(verbose_request))
74109+ printk(KERN_DEBUG "pciback: %s: read %d bytes at 0x%x\n",
74110+ pci_name(dev), size, offset);
74111+
74112+ if (!valid_request(offset, size)) {
74113+ err = XEN_PCI_ERR_invalid_offset;
74114+ goto out;
74115+ }
74116+
74117+ /* Get the real value first, then modify as appropriate */
74118+ switch (size) {
74119+ case 1:
74120+ err = pci_read_config_byte(dev, offset, (u8 *) & value);
74121+ break;
74122+ case 2:
74123+ err = pci_read_config_word(dev, offset, (u16 *) & value);
74124+ break;
74125+ case 4:
74126+ err = pci_read_config_dword(dev, offset, &value);
74127+ break;
74128+ }
74129+
74130+ list_for_each_entry(cfg_entry, &dev_data->config_fields, list) {
74131+ field = cfg_entry->field;
74132+
74133+ req_start = offset;
74134+ req_end = offset + size;
74135+ field_start = OFFSET(cfg_entry);
74136+ field_end = OFFSET(cfg_entry) + field->size;
74137+
74138+ if ((req_start >= field_start && req_start < field_end)
74139+ || (req_end > field_start && req_end <= field_end)) {
74140+ err = conf_space_read(dev, cfg_entry, field_start,
74141+ &tmp_val);
74142+ if (err)
74143+ goto out;
74144+
74145+ value = merge_value(value, tmp_val,
74146+ get_mask(field->size),
74147+ field_start - req_start);
74148+ }
74149+ }
74150+
74151+ out:
74152+ if (unlikely(verbose_request))
74153+ printk(KERN_DEBUG "pciback: %s: read %d bytes at 0x%x = %x\n",
74154+ pci_name(dev), size, offset, value);
74155+
74156+ *ret_val = value;
74157+ return pcibios_err_to_errno(err);
74158+}
74159+
74160+int pciback_config_write(struct pci_dev *dev, int offset, int size, u32 value)
74161+{
74162+ int err = 0, handled = 0;
74163+ struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
74164+ struct config_field_entry *cfg_entry;
74165+ struct config_field *field;
74166+ u32 tmp_val;
74167+ int req_start, req_end, field_start, field_end;
74168+
74169+ if (unlikely(verbose_request))
74170+ printk(KERN_DEBUG
74171+ "pciback: %s: write request %d bytes at 0x%x = %x\n",
74172+ pci_name(dev), size, offset, value);
74173+
74174+ if (!valid_request(offset, size))
74175+ return XEN_PCI_ERR_invalid_offset;
74176+
74177+ list_for_each_entry(cfg_entry, &dev_data->config_fields, list) {
74178+ field = cfg_entry->field;
74179+
74180+ req_start = offset;
74181+ req_end = offset + size;
74182+ field_start = OFFSET(cfg_entry);
74183+ field_end = OFFSET(cfg_entry) + field->size;
74184+
74185+ if ((req_start >= field_start && req_start < field_end)
74186+ || (req_end > field_start && req_end <= field_end)) {
74187+ tmp_val = 0;
74188+
74189+ err = pciback_config_read(dev, field_start,
74190+ field->size, &tmp_val);
74191+ if (err)
74192+ break;
74193+
74194+ tmp_val = merge_value(tmp_val, value, get_mask(size),
74195+ req_start - field_start);
74196+
74197+ err = conf_space_write(dev, cfg_entry, field_start,
74198+ tmp_val);
74199+
74200+ /* handled is set true here, but not every byte
74201+ * may have been written! Properly detecting if
74202+ * every byte is handled is unnecessary as the
74203+ * flag is used to detect devices that need
74204+ * special helpers to work correctly.
74205+ */
74206+ handled = 1;
74207+ }
74208+ }
74209+
74210+ if (!handled && !err) {
74211+	 /* By default, anything not specifically handled above is
74212+ * read-only. The permissive flag changes this behavior so
74213+ * that anything not specifically handled above is writable.
74214+ * This means that some fields may still be read-only because
74215+ * they have entries in the config_field list that intercept
74216+ * the write and do nothing. */
74217+ if (dev_data->permissive) {
74218+ switch (size) {
74219+ case 1:
74220+ err = pci_write_config_byte(dev, offset,
74221+ (u8) value);
74222+ break;
74223+ case 2:
74224+ err = pci_write_config_word(dev, offset,
74225+ (u16) value);
74226+ break;
74227+ case 4:
74228+ err = pci_write_config_dword(dev, offset,
74229+ (u32) value);
74230+ break;
74231+ }
74232+ } else if (!dev_data->warned_on_write) {
74233+ dev_data->warned_on_write = 1;
74234+ dev_warn(&dev->dev, "Driver tried to write to a "
74235+ "read-only configuration space field at offset "
74236+ "0x%x, size %d. This may be harmless, but if "
74237+ "you have problems with your device:\n"
74238+ "1) see permissive attribute in sysfs\n"
74239+ "2) report problems to the xen-devel "
74240+ "mailing list along with details of your "
74241+ "device obtained from lspci.\n", offset, size);
74242+ }
74243+ }
74244+
74245+ return pcibios_err_to_errno(err);
74246+}
74247+
74248+void pciback_config_free_dyn_fields(struct pci_dev *dev)
74249+{
74250+ struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
74251+ struct config_field_entry *cfg_entry, *t;
74252+ struct config_field *field;
74253+
74254+ dev_dbg(&dev->dev,
74255+		"freeing dynamically allocated virtual configuration space fields\n");
74256+
74257+ list_for_each_entry_safe(cfg_entry, t, &dev_data->config_fields, list) {
74258+ field = cfg_entry->field;
74259+
74260+ if (field->clean) {
74261+ field->clean(field);
74262+
74263+ if (cfg_entry->data)
74264+ kfree(cfg_entry->data);
74265+
74266+ list_del(&cfg_entry->list);
74267+ kfree(cfg_entry);
74268+ }
74269+
74270+ }
74271+}
74272+
74273+void pciback_config_reset_dev(struct pci_dev *dev)
74274+{
74275+ struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
74276+ struct config_field_entry *cfg_entry;
74277+ struct config_field *field;
74278+
74279+ dev_dbg(&dev->dev, "resetting virtual configuration space\n");
74280+
74281+ list_for_each_entry(cfg_entry, &dev_data->config_fields, list) {
74282+ field = cfg_entry->field;
74283+
74284+ if (field->reset)
74285+ field->reset(dev, OFFSET(cfg_entry), cfg_entry->data);
74286+ }
74287+}
74288+
74289+void pciback_config_free_dev(struct pci_dev *dev)
74290+{
74291+ struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
74292+ struct config_field_entry *cfg_entry, *t;
74293+ struct config_field *field;
74294+
74295+	dev_dbg(&dev->dev, "freeing virtual configuration space fields\n");
74296+
74297+ list_for_each_entry_safe(cfg_entry, t, &dev_data->config_fields, list) {
74298+ list_del(&cfg_entry->list);
74299+
74300+ field = cfg_entry->field;
74301+
74302+ if (field->release)
74303+ field->release(dev, OFFSET(cfg_entry), cfg_entry->data);
74304+
74305+ kfree(cfg_entry);
74306+ }
74307+}
74308+
74309+int pciback_config_add_field_offset(struct pci_dev *dev,
74310+ struct config_field *field,
74311+ unsigned int offset)
74312+{
74313+ int err = 0;
74314+ struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
74315+ struct config_field_entry *cfg_entry;
74316+ void *tmp;
74317+
74318+ /* silently ignore duplicate fields */
74319+ if (pciback_field_is_dup(dev, field->offset))
74320+ goto out;
74321+
74322+ cfg_entry = kmalloc(sizeof(*cfg_entry), GFP_KERNEL);
74323+ if (!cfg_entry) {
74324+ err = -ENOMEM;
74325+ goto out;
74326+ }
74327+
74328+ cfg_entry->data = NULL;
74329+ cfg_entry->field = field;
74330+ cfg_entry->base_offset = offset;
74331+
74332+ if (field->init) {
74333+ tmp = field->init(dev, OFFSET(cfg_entry));
74334+
74335+ if (IS_ERR(tmp)) {
74336+ err = PTR_ERR(tmp);
74337+ goto out;
74338+ }
74339+
74340+ cfg_entry->data = tmp;
74341+ }
74342+
74343+ dev_dbg(&dev->dev, "added config field at offset 0x%02x\n",
74344+ OFFSET(cfg_entry));
74345+ list_add_tail(&cfg_entry->list, &dev_data->config_fields);
74346+
74347+ out:
74348+ if (err)
74349+ kfree(cfg_entry);
74350+
74351+ return err;
74352+}
74353+
74354+/* This sets up the device's virtual configuration space to keep track of
74355+ * certain registers (like the base address registers (BARs) so that we can
74356+ * keep the client from manipulating them directly.
74357+ */
74358+int pciback_config_init_dev(struct pci_dev *dev)
74359+{
74360+ int err = 0;
74361+ struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
74362+
74363+ dev_dbg(&dev->dev, "initializing virtual configuration space\n");
74364+
74365+ INIT_LIST_HEAD(&dev_data->config_fields);
74366+
74367+ err = pciback_config_header_add_fields(dev);
74368+ if (err)
74369+ goto out;
74370+
74371+ err = pciback_config_capability_add_fields(dev);
74372+ if (err)
74373+ goto out;
74374+
74375+ err = pciback_config_quirks_init(dev);
74376+
74377+ out:
74378+ return err;
74379+}
74380+
74381+int pciback_config_init(void)
74382+{
74383+ return pciback_config_capability_init();
74384+}
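pciback_config_read() above fetches the real register first, then splices in every overlapping virtual field at the matching byte offset. A standalone sketch of that byte-merge follows; mask_for_size() and merge_at() are illustrative stand-ins for the get_mask()/merge_value() helpers defined earlier in conf_space.c, and the register values are invented:

/* Standalone sketch of the byte-merge used by pciback_config_read(). */
#include <stdint.h>
#include <stdio.h>

static uint32_t mask_for_size(int size)
{
	return size == 1 ? 0xffu : size == 2 ? 0xffffu : 0xffffffffu;
}

/* Overlay new_val (limited by mask) onto val, shifted by offset bytes
 * (offset = field_start - req_start, and may be negative). */
static uint32_t merge_at(uint32_t val, uint32_t new_val, uint32_t mask, int offset)
{
	if (offset >= 0) {
		mask <<= 8 * offset;
		new_val <<= 8 * offset;
	} else {
		mask >>= 8 * -offset;
		new_val >>= 8 * -offset;
	}
	return (val & ~mask) | (new_val & mask);
}

int main(void)
{
	/* 4-byte guest read at 0x04 (COMMAND+STATUS); a 2-byte virtual
	 * COMMAND field at 0x04 overrides the low half of the result. */
	uint32_t real = 0x02100107;	/* value read from hardware */
	uint32_t virt_command = 0x0103;	/* value kept by the overlay */
	uint32_t out = merge_at(real, virt_command, mask_for_size(2), 0x04 - 0x04);
	printf("%08x\n", out);		/* -> 02100103 */
	return 0;
}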
74385diff -Nur linux-2.6.16.33-noxen/drivers/xen/pciback/conf_space.h linux-2.6.16.33/drivers/xen/pciback/conf_space.h
74386--- linux-2.6.16.33-noxen/drivers/xen/pciback/conf_space.h 1970-01-01 00:00:00.000000000 +0000
74387+++ linux-2.6.16.33/drivers/xen/pciback/conf_space.h 2007-01-08 15:00:45.000000000 +0000
74388@@ -0,0 +1,126 @@
74389+/*
74390+ * PCI Backend - Common data structures for overriding the configuration space
74391+ *
74392+ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
74393+ */
74394+
74395+#ifndef __XEN_PCIBACK_CONF_SPACE_H__
74396+#define __XEN_PCIBACK_CONF_SPACE_H__
74397+
74398+#include <linux/list.h>
74399+#include <linux/err.h>
74400+
74401+/* conf_field_init can return an errno in a ptr with ERR_PTR() */
74402+typedef void *(*conf_field_init) (struct pci_dev * dev, int offset);
74403+typedef void (*conf_field_reset) (struct pci_dev * dev, int offset, void *data);
74404+typedef void (*conf_field_free) (struct pci_dev * dev, int offset, void *data);
74405+
74406+typedef int (*conf_dword_write) (struct pci_dev * dev, int offset, u32 value,
74407+ void *data);
74408+typedef int (*conf_word_write) (struct pci_dev * dev, int offset, u16 value,
74409+ void *data);
74410+typedef int (*conf_byte_write) (struct pci_dev * dev, int offset, u8 value,
74411+ void *data);
74412+typedef int (*conf_dword_read) (struct pci_dev * dev, int offset, u32 * value,
74413+ void *data);
74414+typedef int (*conf_word_read) (struct pci_dev * dev, int offset, u16 * value,
74415+ void *data);
74416+typedef int (*conf_byte_read) (struct pci_dev * dev, int offset, u8 * value,
74417+ void *data);
74418+
74419+/* These are the fields within the configuration space which we
74420+ * are interested in intercepting reads/writes to and changing their
74421+ * values.
74422+ */
74423+struct config_field {
74424+ unsigned int offset;
74425+ unsigned int size;
74426+ unsigned int mask;
74427+ conf_field_init init;
74428+ conf_field_reset reset;
74429+ conf_field_free release;
74430+ void (*clean) (struct config_field * field);
74431+ union {
74432+ struct {
74433+ conf_dword_write write;
74434+ conf_dword_read read;
74435+ } dw;
74436+ struct {
74437+ conf_word_write write;
74438+ conf_word_read read;
74439+ } w;
74440+ struct {
74441+ conf_byte_write write;
74442+ conf_byte_read read;
74443+ } b;
74444+ } u;
74445+ struct list_head list;
74446+};
74447+
74448+struct config_field_entry {
74449+ struct list_head list;
74450+ struct config_field *field;
74451+ unsigned int base_offset;
74452+ void *data;
74453+};
74454+
74455+#define OFFSET(cfg_entry) ((cfg_entry)->base_offset+(cfg_entry)->field->offset)
74456+
74457+/* Add fields to a device - the add_fields macro expects to get a pointer to
74458+ * the first entry in an array (of which the ending is marked by size==0)
74459+ */
74460+int pciback_config_add_field_offset(struct pci_dev *dev,
74461+ struct config_field *field,
74462+ unsigned int offset);
74463+
74464+static inline int pciback_config_add_field(struct pci_dev *dev,
74465+ struct config_field *field)
74466+{
74467+ return pciback_config_add_field_offset(dev, field, 0);
74468+}
74469+
74470+static inline int pciback_config_add_fields(struct pci_dev *dev,
74471+ struct config_field *field)
74472+{
74473+ int i, err = 0;
74474+ for (i = 0; field[i].size != 0; i++) {
74475+ err = pciback_config_add_field(dev, &field[i]);
74476+ if (err)
74477+ break;
74478+ }
74479+ return err;
74480+}
74481+
74482+static inline int pciback_config_add_fields_offset(struct pci_dev *dev,
74483+ struct config_field *field,
74484+ unsigned int offset)
74485+{
74486+ int i, err = 0;
74487+ for (i = 0; field[i].size != 0; i++) {
74488+ err = pciback_config_add_field_offset(dev, &field[i], offset);
74489+ if (err)
74490+ break;
74491+ }
74492+ return err;
74493+}
74494+
74495+/* Read/Write the real configuration space */
74496+int pciback_read_config_byte(struct pci_dev *dev, int offset, u8 * value,
74497+ void *data);
74498+int pciback_read_config_word(struct pci_dev *dev, int offset, u16 * value,
74499+ void *data);
74500+int pciback_read_config_dword(struct pci_dev *dev, int offset, u32 * value,
74501+ void *data);
74502+int pciback_write_config_byte(struct pci_dev *dev, int offset, u8 value,
74503+ void *data);
74504+int pciback_write_config_word(struct pci_dev *dev, int offset, u16 value,
74505+ void *data);
74506+int pciback_write_config_dword(struct pci_dev *dev, int offset, u32 value,
74507+ void *data);
74508+
74509+int pciback_config_capability_init(void);
74510+
74511+int pciback_config_header_add_fields(struct pci_dev *dev);
74512+int pciback_config_capability_add_fields(struct pci_dev *dev);
74513+
74514+#endif /* __XEN_PCIBACK_CONF_SPACE_H__ */
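conf_space.h above defines the overlay building blocks: a config_field describes one intercepted register (with per-size read/write hooks in the union), OFFSET() adds the entry's base_offset, and field arrays end with a .size == 0 terminator. A hypothetical declaration in that style (the choice of PCI_SUBSYSTEM_ID and its read-only treatment are only for illustration):

/* Hypothetical overlay: expose PCI_SUBSYSTEM_ID to the guest read-only.
 * Arrays passed to pciback_config_add_fields() end with .size == 0. */
static struct config_field example_fields[] = {
	{
		.offset = PCI_SUBSYSTEM_ID,
		.size = 2,
		.u.w.read = pciback_read_config_word,	/* pass reads through */
		.u.w.write = NULL,			/* no write hook: writes are dropped */
	},
	{
		.size = 0,	/* terminator */
	},
};

/* in some init path: err = pciback_config_add_fields(dev, example_fields); */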
74515diff -Nur linux-2.6.16.33-noxen/drivers/xen/pciback/conf_space_capability.c linux-2.6.16.33/drivers/xen/pciback/conf_space_capability.c
74516--- linux-2.6.16.33-noxen/drivers/xen/pciback/conf_space_capability.c 1970-01-01 00:00:00.000000000 +0000
74517+++ linux-2.6.16.33/drivers/xen/pciback/conf_space_capability.c 2007-01-08 15:00:45.000000000 +0000
74518@@ -0,0 +1,71 @@
74519+/*
74520+ * PCI Backend - Handles the virtual fields found on the capability lists
74521+ * in the configuration space.
74522+ *
74523+ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
74524+ */
74525+
74526+#include <linux/kernel.h>
74527+#include <linux/pci.h>
74528+#include "pciback.h"
74529+#include "conf_space.h"
74530+#include "conf_space_capability.h"
74531+
74532+static LIST_HEAD(capabilities);
74533+
74534+static struct config_field caplist_header[] = {
74535+ {
74536+ .offset = PCI_CAP_LIST_ID,
74537+ .size = 2, /* encompass PCI_CAP_LIST_ID & PCI_CAP_LIST_NEXT */
74538+ .u.w.read = pciback_read_config_word,
74539+ .u.w.write = NULL,
74540+ },
74541+ {
74542+ .size = 0,
74543+ },
74544+};
74545+
74546+static inline void register_capability(struct pciback_config_capability *cap)
74547+{
74548+ list_add_tail(&cap->cap_list, &capabilities);
74549+}
74550+
74551+int pciback_config_capability_add_fields(struct pci_dev *dev)
74552+{
74553+ int err = 0;
74554+ struct pciback_config_capability *cap;
74555+ int cap_offset;
74556+
74557+ list_for_each_entry(cap, &capabilities, cap_list) {
74558+ cap_offset = pci_find_capability(dev, cap->capability);
74559+ if (cap_offset) {
74560+ dev_dbg(&dev->dev, "Found capability 0x%x at 0x%x\n",
74561+ cap->capability, cap_offset);
74562+
74563+ err = pciback_config_add_fields_offset(dev,
74564+ caplist_header,
74565+ cap_offset);
74566+ if (err)
74567+ goto out;
74568+ err = pciback_config_add_fields_offset(dev,
74569+ cap->fields,
74570+ cap_offset);
74571+ if (err)
74572+ goto out;
74573+ }
74574+ }
74575+
74576+ out:
74577+ return err;
74578+}
74579+
74580+extern struct pciback_config_capability pciback_config_capability_vpd;
74581+extern struct pciback_config_capability pciback_config_capability_pm;
74582+
74583+int pciback_config_capability_init(void)
74584+{
74585+ register_capability(&pciback_config_capability_vpd);
74586+ register_capability(&pciback_config_capability_pm);
74587+
74588+ return 0;
74589+}
74590diff -Nur linux-2.6.16.33-noxen/drivers/xen/pciback/conf_space_capability.h linux-2.6.16.33/drivers/xen/pciback/conf_space_capability.h
74591--- linux-2.6.16.33-noxen/drivers/xen/pciback/conf_space_capability.h 1970-01-01 00:00:00.000000000 +0000
74592+++ linux-2.6.16.33/drivers/xen/pciback/conf_space_capability.h 2007-01-08 15:00:45.000000000 +0000
74593@@ -0,0 +1,23 @@
74594+/*
74595+ * PCI Backend - Data structures for special overlays for structures on
74596+ * the capability list.
74597+ *
74598+ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
74599+ */
74600+
74601+#ifndef __PCIBACK_CONFIG_CAPABILITY_H__
74602+#define __PCIBACK_CONFIG_CAPABILITY_H__
74603+
74604+#include <linux/pci.h>
74605+#include <linux/list.h>
74606+
74607+struct pciback_config_capability {
74608+ struct list_head cap_list;
74609+
74610+ int capability;
74611+
74612+ /* If the device has the capability found above, add these fields */
74613+ struct config_field *fields;
74614+};
74615+
74616+#endif
74617diff -Nur linux-2.6.16.33-noxen/drivers/xen/pciback/conf_space_capability_pm.c linux-2.6.16.33/drivers/xen/pciback/conf_space_capability_pm.c
74618--- linux-2.6.16.33-noxen/drivers/xen/pciback/conf_space_capability_pm.c 1970-01-01 00:00:00.000000000 +0000
74619+++ linux-2.6.16.33/drivers/xen/pciback/conf_space_capability_pm.c 2007-01-08 15:00:45.000000000 +0000
74620@@ -0,0 +1,113 @@
74621+/*
74622+ * PCI Backend - Configuration space overlay for power management
74623+ *
74624+ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
74625+ */
74626+
74627+#include <linux/pci.h>
74628+#include "conf_space.h"
74629+#include "conf_space_capability.h"
74630+
74631+static int pm_caps_read(struct pci_dev *dev, int offset, u16 *value,
74632+ void *data)
74633+{
74634+ int err;
74635+ u16 real_value;
74636+
74637+ err = pci_read_config_word(dev, offset, &real_value);
74638+ if (err)
74639+ goto out;
74640+
74641+ *value = real_value & ~PCI_PM_CAP_PME_MASK;
74642+
74643+ out:
74644+ return err;
74645+}
74646+
74647+/* PM_OK_BITS specifies the bits that the driver domain is allowed to change.
74648+ * Can't allow driver domain to enable PMEs - they're shared */
74649+#define PM_OK_BITS (PCI_PM_CTRL_PME_STATUS|PCI_PM_CTRL_DATA_SEL_MASK)
74650+
74651+static int pm_ctrl_write(struct pci_dev *dev, int offset, u16 new_value,
74652+ void *data)
74653+{
74654+ int err;
74655+ u16 cur_value;
74656+ pci_power_t new_state;
74657+
74658+ /* Handle setting power state separately */
74659+ new_state = (pci_power_t)(new_value & PCI_PM_CTRL_STATE_MASK);
74660+
74661+ err = pci_read_config_word(dev, offset, &cur_value);
74662+ if (err)
74663+ goto out;
74664+
74665+ new_value &= PM_OK_BITS;
74666+ if ((cur_value & PM_OK_BITS) != new_value) {
74667+ new_value = (cur_value & ~PM_OK_BITS) | new_value;
74668+ err = pci_write_config_word(dev, offset, new_value);
74669+ if (err)
74670+ goto out;
74671+ }
74672+
74673+ /* Let pci core handle the power management change */
74674+ dev_dbg(&dev->dev, "set power state to %x\n", new_state);
74675+ err = pci_set_power_state(dev, new_state);
74676+ if (err)
74677+ err = PCIBIOS_SET_FAILED;
74678+
74679+ out:
74680+ return err;
74681+}
74682+
74683+/* Ensure PMEs are disabled */
74684+static void *pm_ctrl_init(struct pci_dev *dev, int offset)
74685+{
74686+ int err;
74687+ u16 value;
74688+
74689+ err = pci_read_config_word(dev, offset, &value);
74690+ if (err)
74691+ goto out;
74692+
74693+ if (value & PCI_PM_CTRL_PME_ENABLE) {
74694+ value &= ~PCI_PM_CTRL_PME_ENABLE;
74695+ err = pci_write_config_word(dev, offset, value);
74696+ }
74697+
74698+ out:
74699+ return ERR_PTR(err);
74700+}
74701+
74702+static struct config_field caplist_pm[] = {
74703+ {
74704+ .offset = PCI_PM_PMC,
74705+ .size = 2,
74706+ .u.w.read = pm_caps_read,
74707+ },
74708+ {
74709+ .offset = PCI_PM_CTRL,
74710+ .size = 2,
74711+ .init = pm_ctrl_init,
74712+ .u.w.read = pciback_read_config_word,
74713+ .u.w.write = pm_ctrl_write,
74714+ },
74715+ {
74716+ .offset = PCI_PM_PPB_EXTENSIONS,
74717+ .size = 1,
74718+ .u.b.read = pciback_read_config_byte,
74719+ },
74720+ {
74721+ .offset = PCI_PM_DATA_REGISTER,
74722+ .size = 1,
74723+ .u.b.read = pciback_read_config_byte,
74724+ },
74725+ {
74726+ .size = 0,
74727+ },
74728+};
74729+
74730+struct pciback_config_capability pciback_config_capability_pm = {
74731+ .capability = PCI_CAP_ID_PM,
74732+ .fields = caplist_pm,
74733+};
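pm_ctrl_write() above lets only the PM_OK_BITS portion of a guest write reach the hardware; everything else keeps its current value, and the power-state change itself is delegated to pci_set_power_state(). A small worked example of that filtering (register values are invented; 0x9e00 is PCI_PM_CTRL_PME_STATUS | PCI_PM_CTRL_DATA_SEL_MASK from pci_regs.h):

/* Worked example of the PM_OK_BITS filtering in pm_ctrl_write(). */
#include <stdint.h>
#include <stdio.h>

#define PM_OK_BITS_EX 0x9e00u	/* PME_STATUS | DATA_SEL_MASK */

int main(void)
{
	uint16_t cur = 0x0008;		/* current PMCSR value from hardware */
	uint16_t guest = 0x8103;	/* guest writes PME_STATUS | PME_EN | D3hot */
	uint16_t allowed = guest & PM_OK_BITS_EX;	/* only PME_STATUS survives */
	uint16_t merged = (cur & ~PM_OK_BITS_EX) | allowed;
	printf("%04x\n", merged);	/* -> 8008: PME_EN dropped, state change handled separately */
	return 0;
}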
74734diff -Nur linux-2.6.16.33-noxen/drivers/xen/pciback/conf_space_capability_vpd.c linux-2.6.16.33/drivers/xen/pciback/conf_space_capability_vpd.c
74735--- linux-2.6.16.33-noxen/drivers/xen/pciback/conf_space_capability_vpd.c 1970-01-01 00:00:00.000000000 +0000
74736+++ linux-2.6.16.33/drivers/xen/pciback/conf_space_capability_vpd.c 2007-01-08 15:00:45.000000000 +0000
74737@@ -0,0 +1,42 @@
74738+/*
74739+ * PCI Backend - Configuration space overlay for Vital Product Data
74740+ *
74741+ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
74742+ */
74743+
74744+#include <linux/pci.h>
74745+#include "conf_space.h"
74746+#include "conf_space_capability.h"
74747+
74748+static int vpd_address_write(struct pci_dev *dev, int offset, u16 value,
74749+ void *data)
74750+{
74751+ /* Disallow writes to the vital product data */
74752+ if (value & PCI_VPD_ADDR_F)
74753+ return PCIBIOS_SET_FAILED;
74754+ else
74755+ return pci_write_config_word(dev, offset, value);
74756+}
74757+
74758+static struct config_field caplist_vpd[] = {
74759+ {
74760+ .offset = PCI_VPD_ADDR,
74761+ .size = 2,
74762+ .u.w.read = pciback_read_config_word,
74763+ .u.w.write = vpd_address_write,
74764+ },
74765+ {
74766+ .offset = PCI_VPD_DATA,
74767+ .size = 4,
74768+ .u.dw.read = pciback_read_config_dword,
74769+ .u.dw.write = NULL,
74770+ },
74771+ {
74772+ .size = 0,
74773+ },
74774+};
74775+
74776+struct pciback_config_capability pciback_config_capability_vpd = {
74777+ .capability = PCI_CAP_ID_VPD,
74778+ .fields = caplist_vpd,
74779+};
74780diff -Nur linux-2.6.16.33-noxen/drivers/xen/pciback/conf_space_header.c linux-2.6.16.33/drivers/xen/pciback/conf_space_header.c
74781--- linux-2.6.16.33-noxen/drivers/xen/pciback/conf_space_header.c 1970-01-01 00:00:00.000000000 +0000
74782+++ linux-2.6.16.33/drivers/xen/pciback/conf_space_header.c 2007-01-08 15:00:45.000000000 +0000
74783@@ -0,0 +1,299 @@
74784+/*
74785+ * PCI Backend - Handles the virtual fields in the configuration space headers.
74786+ *
74787+ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
74788+ */
74789+
74790+#include <linux/kernel.h>
74791+#include <linux/pci.h>
74792+#include "pciback.h"
74793+#include "conf_space.h"
74794+
74795+struct pci_bar_info {
74796+ u32 val;
74797+ u32 len_val;
74798+ int which;
74799+};
74800+
74801+#define is_enable_cmd(value) ((value)&(PCI_COMMAND_MEMORY|PCI_COMMAND_IO))
74802+#define is_master_cmd(value) ((value)&PCI_COMMAND_MASTER)
74803+
74804+static int command_write(struct pci_dev *dev, int offset, u16 value, void *data)
74805+{
74806+ if (!dev->is_enabled && is_enable_cmd(value)) {
74807+ if (unlikely(verbose_request))
74808+ printk(KERN_DEBUG "pciback: %s: enable\n",
74809+ pci_name(dev));
74810+ pci_enable_device(dev);
74811+ } else if (dev->is_enabled && !is_enable_cmd(value)) {
74812+ if (unlikely(verbose_request))
74813+ printk(KERN_DEBUG "pciback: %s: disable\n",
74814+ pci_name(dev));
74815+ pci_disable_device(dev);
74816+ }
74817+
74818+ if (!dev->is_busmaster && is_master_cmd(value)) {
74819+ if (unlikely(verbose_request))
74820+ printk(KERN_DEBUG "pciback: %s: set bus master\n",
74821+ pci_name(dev));
74822+ pci_set_master(dev);
74823+ }
74824+
74825+ if (value & PCI_COMMAND_INVALIDATE) {
74826+ if (unlikely(verbose_request))
74827+ printk(KERN_DEBUG
74828+ "pciback: %s: enable memory-write-invalidate\n",
74829+ pci_name(dev));
74830+ pci_set_mwi(dev);
74831+ }
74832+
74833+ return pci_write_config_word(dev, offset, value);
74834+}
74835+
74836+static int rom_write(struct pci_dev *dev, int offset, u32 value, void *data)
74837+{
74838+ struct pci_bar_info *bar = data;
74839+
74840+ if (unlikely(!bar)) {
74841+ printk(KERN_WARNING "pciback: driver data not found for %s\n",
74842+ pci_name(dev));
74843+ return XEN_PCI_ERR_op_failed;
74844+ }
74845+
74846+ /* A write to obtain the length must happen as a 32-bit write.
74847+ * This does not (yet) support writing individual bytes
74848+ */
74849+ if (value == ~PCI_ROM_ADDRESS_ENABLE)
74850+ bar->which = 1;
74851+ else
74852+ bar->which = 0;
74853+
74854+ /* Do we need to support enabling/disabling the rom address here? */
74855+
74856+ return 0;
74857+}
74858+
74859+/* For the BARs, only allow writes which write ~0 or
74860+ * the correct resource information
74861+ * (Needed for when the driver probes the resource usage)
74862+ */
74863+static int bar_write(struct pci_dev *dev, int offset, u32 value, void *data)
74864+{
74865+ struct pci_bar_info *bar = data;
74866+
74867+ if (unlikely(!bar)) {
74868+ printk(KERN_WARNING "pciback: driver data not found for %s\n",
74869+ pci_name(dev));
74870+ return XEN_PCI_ERR_op_failed;
74871+ }
74872+
74873+ /* A write to obtain the length must happen as a 32-bit write.
74874+ * This does not (yet) support writing individual bytes
74875+ */
74876+ if (value == ~0)
74877+ bar->which = 1;
74878+ else
74879+ bar->which = 0;
74880+
74881+ return 0;
74882+}
74883+
74884+static int bar_read(struct pci_dev *dev, int offset, u32 * value, void *data)
74885+{
74886+ struct pci_bar_info *bar = data;
74887+
74888+ if (unlikely(!bar)) {
74889+ printk(KERN_WARNING "pciback: driver data not found for %s\n",
74890+ pci_name(dev));
74891+ return XEN_PCI_ERR_op_failed;
74892+ }
74893+
74894+ *value = bar->which ? bar->len_val : bar->val;
74895+
74896+ return 0;
74897+}
74898+
74899+static inline void read_dev_bar(struct pci_dev *dev,
74900+ struct pci_bar_info *bar_info, int offset,
74901+ u32 len_mask)
74902+{
74903+ pci_read_config_dword(dev, offset, &bar_info->val);
74904+ pci_write_config_dword(dev, offset, len_mask);
74905+ pci_read_config_dword(dev, offset, &bar_info->len_val);
74906+ pci_write_config_dword(dev, offset, bar_info->val);
74907+}
74908+
74909+static void *bar_init(struct pci_dev *dev, int offset)
74910+{
74911+ struct pci_bar_info *bar = kmalloc(sizeof(*bar), GFP_KERNEL);
74912+
74913+ if (!bar)
74914+ return ERR_PTR(-ENOMEM);
74915+
74916+ read_dev_bar(dev, bar, offset, ~0);
74917+ bar->which = 0;
74918+
74919+ return bar;
74920+}
74921+
74922+static void *rom_init(struct pci_dev *dev, int offset)
74923+{
74924+ struct pci_bar_info *bar = kmalloc(sizeof(*bar), GFP_KERNEL);
74925+
74926+ if (!bar)
74927+ return ERR_PTR(-ENOMEM);
74928+
74929+ read_dev_bar(dev, bar, offset, ~PCI_ROM_ADDRESS_ENABLE);
74930+ bar->which = 0;
74931+
74932+ return bar;
74933+}
74934+
74935+static void bar_reset(struct pci_dev *dev, int offset, void *data)
74936+{
74937+ struct pci_bar_info *bar = data;
74938+
74939+ bar->which = 0;
74940+}
74941+
74942+static void bar_release(struct pci_dev *dev, int offset, void *data)
74943+{
74944+ kfree(data);
74945+}
74946+
74947+static int interrupt_read(struct pci_dev *dev, int offset, u8 * value,
74948+ void *data)
74949+{
74950+ *value = (u8) dev->irq;
74951+
74952+ return 0;
74953+}
74954+
74955+static int bist_write(struct pci_dev *dev, int offset, u8 value, void *data)
74956+{
74957+ u8 cur_value;
74958+ int err;
74959+
74960+ err = pci_read_config_byte(dev, offset, &cur_value);
74961+ if (err)
74962+ goto out;
74963+
74964+ if ((cur_value & ~PCI_BIST_START) == (value & ~PCI_BIST_START)
74965+ || value == PCI_BIST_START)
74966+ err = pci_write_config_byte(dev, offset, value);
74967+
74968+ out:
74969+ return err;
74970+}
74971+
74972+static struct config_field header_common[] = {
74973+ {
74974+ .offset = PCI_COMMAND,
74975+ .size = 2,
74976+ .u.w.read = pciback_read_config_word,
74977+ .u.w.write = command_write,
74978+ },
74979+ {
74980+ .offset = PCI_INTERRUPT_LINE,
74981+ .size = 1,
74982+ .u.b.read = interrupt_read,
74983+ },
74984+ {
74985+ .offset = PCI_INTERRUPT_PIN,
74986+ .size = 1,
74987+ .u.b.read = pciback_read_config_byte,
74988+ },
74989+ {
74990+ /* Any side effects of letting driver domain control cache line? */
74991+ .offset = PCI_CACHE_LINE_SIZE,
74992+ .size = 1,
74993+ .u.b.read = pciback_read_config_byte,
74994+ .u.b.write = pciback_write_config_byte,
74995+ },
74996+ {
74997+ .offset = PCI_LATENCY_TIMER,
74998+ .size = 1,
74999+ .u.b.read = pciback_read_config_byte,
75000+ },
75001+ {
75002+ .offset = PCI_BIST,
75003+ .size = 1,
75004+ .u.b.read = pciback_read_config_byte,
75005+ .u.b.write = bist_write,
75006+ },
75007+ {
75008+ .size = 0,
75009+ },
75010+};
75011+
75012+#define CFG_FIELD_BAR(reg_offset) \
75013+ { \
75014+ .offset = reg_offset, \
75015+ .size = 4, \
75016+ .init = bar_init, \
75017+ .reset = bar_reset, \
75018+ .release = bar_release, \
75019+ .u.dw.read = bar_read, \
75020+ .u.dw.write = bar_write, \
75021+ }
75022+
75023+#define CFG_FIELD_ROM(reg_offset) \
75024+ { \
75025+ .offset = reg_offset, \
75026+ .size = 4, \
75027+ .init = rom_init, \
75028+ .reset = bar_reset, \
75029+ .release = bar_release, \
75030+ .u.dw.read = bar_read, \
75031+ .u.dw.write = rom_write, \
75032+ }
75033+
75034+static struct config_field header_0[] = {
75035+ CFG_FIELD_BAR(PCI_BASE_ADDRESS_0),
75036+ CFG_FIELD_BAR(PCI_BASE_ADDRESS_1),
75037+ CFG_FIELD_BAR(PCI_BASE_ADDRESS_2),
75038+ CFG_FIELD_BAR(PCI_BASE_ADDRESS_3),
75039+ CFG_FIELD_BAR(PCI_BASE_ADDRESS_4),
75040+ CFG_FIELD_BAR(PCI_BASE_ADDRESS_5),
75041+ CFG_FIELD_ROM(PCI_ROM_ADDRESS),
75042+ {
75043+ .size = 0,
75044+ },
75045+};
75046+
75047+static struct config_field header_1[] = {
75048+ CFG_FIELD_BAR(PCI_BASE_ADDRESS_0),
75049+ CFG_FIELD_BAR(PCI_BASE_ADDRESS_1),
75050+ CFG_FIELD_ROM(PCI_ROM_ADDRESS1),
75051+ {
75052+ .size = 0,
75053+ },
75054+};
75055+
75056+int pciback_config_header_add_fields(struct pci_dev *dev)
75057+{
75058+ int err;
75059+
75060+ err = pciback_config_add_fields(dev, header_common);
75061+ if (err)
75062+ goto out;
75063+
75064+ switch (dev->hdr_type) {
75065+ case PCI_HEADER_TYPE_NORMAL:
75066+ err = pciback_config_add_fields(dev, header_0);
75067+ break;
75068+
75069+ case PCI_HEADER_TYPE_BRIDGE:
75070+ err = pciback_config_add_fields(dev, header_1);
75071+ break;
75072+
75073+ default:
75074+ err = -EINVAL;
75075+ printk(KERN_ERR "pciback: %s: Unsupported header type %d!\n",
75076+ pci_name(dev), dev->hdr_type);
75077+ break;
75078+ }
75079+
75080+ out:
75081+ return err;
75082+}
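bar_write()/rom_write() above flip bar->which when the guest writes the all-ones sizing pattern, so the next bar_read() returns len_val (captured by read_dev_bar() at init) instead of the live BAR. The guest then decodes the region size exactly as it would on bare metal; a sketch of that decoding, assuming a 32-bit non-prefetchable memory BAR and an invented len_val:

/* How a frontend decodes the size from the value bar_read() returns
 * after the ~0 sizing write. */
#include <stdint.h>
#include <stdio.h>

#define BAR_MEM_MASK 0xfffffff0u	/* PCI_BASE_ADDRESS_MEM_MASK */

int main(void)
{
	uint32_t len_val = 0xffffc008;	/* read back after writing 0xffffffff */
	uint32_t size = ~(len_val & BAR_MEM_MASK) + 1;
	printf("BAR size: 0x%x bytes\n", size);	/* -> 0x4000 (16 KiB) */
	return 0;
}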
75083diff -Nur linux-2.6.16.33-noxen/drivers/xen/pciback/conf_space_quirks.c linux-2.6.16.33/drivers/xen/pciback/conf_space_quirks.c
75084--- linux-2.6.16.33-noxen/drivers/xen/pciback/conf_space_quirks.c 1970-01-01 00:00:00.000000000 +0000
75085+++ linux-2.6.16.33/drivers/xen/pciback/conf_space_quirks.c 2007-01-08 15:00:45.000000000 +0000
75086@@ -0,0 +1,128 @@
75087+/*
75088+ * PCI Backend - Handle special overlays for broken devices.
75089+ *
75090+ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
75091+ * Author: Chris Bookholt <hap10@epoch.ncsc.mil>
75092+ */
75093+
75094+#include <linux/kernel.h>
75095+#include <linux/pci.h>
75096+#include "pciback.h"
75097+#include "conf_space.h"
75098+#include "conf_space_quirks.h"
75099+
75100+LIST_HEAD(pciback_quirks);
75101+
75102+struct pciback_config_quirk *pciback_find_quirk(struct pci_dev *dev)
75103+{
75104+ struct pciback_config_quirk *tmp_quirk;
75105+
75106+ list_for_each_entry(tmp_quirk, &pciback_quirks, quirks_list)
75107+ if (pci_match_id(&tmp_quirk->devid, dev))
75108+ goto out;
75109+ tmp_quirk = NULL;
75110+ printk(KERN_DEBUG
75111+ "quirk didn't match any device pciback knows about\n");
75112+ out:
75113+ return tmp_quirk;
75114+}
75115+
75116+static inline void register_quirk(struct pciback_config_quirk *quirk)
75117+{
75118+ list_add_tail(&quirk->quirks_list, &pciback_quirks);
75119+}
75120+
75121+int pciback_field_is_dup(struct pci_dev *dev, int reg)
75122+{
75123+ int ret = 0;
75124+ struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
75125+ struct config_field *field;
75126+ struct config_field_entry *cfg_entry;
75127+
75128+ list_for_each_entry(cfg_entry, &dev_data->config_fields, list) {
75129+ field = cfg_entry->field;
75130+ if (field->offset == reg) {
75131+ ret = 1;
75132+ break;
75133+ }
75134+ }
75135+ return ret;
75136+}
75137+
75138+int pciback_config_quirks_add_field(struct pci_dev *dev, struct config_field
75139+ *field)
75140+{
75141+ int err = 0;
75142+
75143+ switch (field->size) {
75144+ case 1:
75145+ field->u.b.read = pciback_read_config_byte;
75146+ field->u.b.write = pciback_write_config_byte;
75147+ break;
75148+ case 2:
75149+ field->u.w.read = pciback_read_config_word;
75150+ field->u.w.write = pciback_write_config_word;
75151+ break;
75152+ case 4:
75153+ field->u.dw.read = pciback_read_config_dword;
75154+ field->u.dw.write = pciback_write_config_dword;
75155+ break;
75156+ default:
75157+ err = -EINVAL;
75158+ goto out;
75159+ }
75160+
75161+ pciback_config_add_field(dev, field);
75162+
75163+ out:
75164+ return err;
75165+}
75166+
75167+int pciback_config_quirks_init(struct pci_dev *dev)
75168+{
75169+ struct pciback_config_quirk *quirk;
75170+ int ret = 0;
75171+
75172+ quirk = kzalloc(sizeof(*quirk), GFP_ATOMIC);
75173+ if (!quirk) {
75174+ ret = -ENOMEM;
75175+ goto out;
75176+ }
75177+
75178+ quirk->devid.vendor = dev->vendor;
75179+ quirk->devid.device = dev->device;
75180+ quirk->devid.subvendor = dev->subsystem_vendor;
75181+ quirk->devid.subdevice = dev->subsystem_device;
75182+ quirk->devid.class = 0;
75183+ quirk->devid.class_mask = 0;
75184+ quirk->devid.driver_data = 0UL;
75185+
75186+ quirk->pdev = dev;
75187+
75188+ register_quirk(quirk);
75189+ out:
75190+ return ret;
75191+}
75192+
75193+void pciback_config_field_free(struct config_field *field)
75194+{
75195+ kfree(field);
75196+}
75197+
75198+int pciback_config_quirk_release(struct pci_dev *dev)
75199+{
75200+ struct pciback_config_quirk *quirk;
75201+ int ret = 0;
75202+
75203+ quirk = pciback_find_quirk(dev);
75204+ if (!quirk) {
75205+ ret = -ENXIO;
75206+ goto out;
75207+ }
75208+
75209+ list_del(&quirk->quirks_list);
75210+ kfree(quirk);
75211+
75212+ out:
75213+ return ret;
75214+}
75215diff -Nur linux-2.6.16.33-noxen/drivers/xen/pciback/conf_space_quirks.h linux-2.6.16.33/drivers/xen/pciback/conf_space_quirks.h
75216--- linux-2.6.16.33-noxen/drivers/xen/pciback/conf_space_quirks.h 1970-01-01 00:00:00.000000000 +0000
75217+++ linux-2.6.16.33/drivers/xen/pciback/conf_space_quirks.h 2007-01-08 15:00:45.000000000 +0000
75218@@ -0,0 +1,35 @@
75219+/*
75220+ * PCI Backend - Data structures for special overlays for broken devices.
75221+ *
75222+ * Ryan Wilson <hap9@epoch.ncsc.mil>
75223+ * Chris Bookholt <hap10@epoch.ncsc.mil>
75224+ */
75225+
75226+#ifndef __XEN_PCIBACK_CONF_SPACE_QUIRKS_H__
75227+#define __XEN_PCIBACK_CONF_SPACE_QUIRKS_H__
75228+
75229+#include <linux/pci.h>
75230+#include <linux/list.h>
75231+
75232+struct pciback_config_quirk {
75233+ struct list_head quirks_list;
75234+ struct pci_device_id devid;
75235+ struct pci_dev *pdev;
75236+};
75237+
75238+struct pciback_config_quirk *pciback_find_quirk(struct pci_dev *dev);
75239+
75240+int pciback_config_quirks_add_field(struct pci_dev *dev, struct config_field
75241+ *field);
75242+
75243+int pciback_config_quirks_remove_field(struct pci_dev *dev, int reg);
75244+
75245+int pciback_config_quirks_init(struct pci_dev *dev);
75246+
75247+void pciback_config_field_free(struct config_field *field);
75248+
75249+int pciback_config_quirk_release(struct pci_dev *dev);
75250+
75251+int pciback_field_is_dup(struct pci_dev *dev, int reg);
75252+
75253+#endif
75254diff -Nur linux-2.6.16.33-noxen/drivers/xen/pciback/passthrough.c linux-2.6.16.33/drivers/xen/pciback/passthrough.c
75255--- linux-2.6.16.33-noxen/drivers/xen/pciback/passthrough.c 1970-01-01 00:00:00.000000000 +0000
75256+++ linux-2.6.16.33/drivers/xen/pciback/passthrough.c 2007-01-08 15:00:45.000000000 +0000
75257@@ -0,0 +1,157 @@
75258+/*
75259+ * PCI Backend - Provides restricted access to the real PCI bus topology
75260+ * to the frontend
75261+ *
75262+ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
75263+ */
75264+
75265+#include <linux/list.h>
75266+#include <linux/pci.h>
75267+#include <linux/spinlock.h>
75268+#include "pciback.h"
75269+
75270+struct passthrough_dev_data {
75271+ /* Access to dev_list must be protected by lock */
75272+ struct list_head dev_list;
75273+ spinlock_t lock;
75274+};
75275+
75276+struct pci_dev *pciback_get_pci_dev(struct pciback_device *pdev,
75277+ unsigned int domain, unsigned int bus,
75278+ unsigned int devfn)
75279+{
75280+ struct passthrough_dev_data *dev_data = pdev->pci_dev_data;
75281+ struct pci_dev_entry *dev_entry;
75282+ struct pci_dev *dev = NULL;
75283+ unsigned long flags;
75284+
75285+ spin_lock_irqsave(&dev_data->lock, flags);
75286+
75287+ list_for_each_entry(dev_entry, &dev_data->dev_list, list) {
75288+ if (domain == (unsigned int)pci_domain_nr(dev_entry->dev->bus)
75289+ && bus == (unsigned int)dev_entry->dev->bus->number
75290+ && devfn == dev_entry->dev->devfn) {
75291+ dev = dev_entry->dev;
75292+ break;
75293+ }
75294+ }
75295+
75296+ spin_unlock_irqrestore(&dev_data->lock, flags);
75297+
75298+ return dev;
75299+}
75300+
75301+int pciback_add_pci_dev(struct pciback_device *pdev, struct pci_dev *dev)
75302+{
75303+ struct passthrough_dev_data *dev_data = pdev->pci_dev_data;
75304+ struct pci_dev_entry *dev_entry;
75305+ unsigned long flags;
75306+
75307+ dev_entry = kmalloc(sizeof(*dev_entry), GFP_KERNEL);
75308+ if (!dev_entry)
75309+ return -ENOMEM;
75310+ dev_entry->dev = dev;
75311+
75312+ spin_lock_irqsave(&dev_data->lock, flags);
75313+ list_add_tail(&dev_entry->list, &dev_data->dev_list);
75314+ spin_unlock_irqrestore(&dev_data->lock, flags);
75315+
75316+ return 0;
75317+}
75318+
75319+void pciback_release_pci_dev(struct pciback_device *pdev, struct pci_dev *dev)
75320+{
75321+ struct passthrough_dev_data *dev_data = pdev->pci_dev_data;
75322+ struct pci_dev_entry *dev_entry, *t;
75323+ struct pci_dev *found_dev = NULL;
75324+ unsigned long flags;
75325+
75326+ spin_lock_irqsave(&dev_data->lock, flags);
75327+
75328+ list_for_each_entry_safe(dev_entry, t, &dev_data->dev_list, list) {
75329+ if (dev_entry->dev == dev) {
75330+ list_del(&dev_entry->list);
75331+ found_dev = dev_entry->dev;
75332+ kfree(dev_entry);
75333+ }
75334+ }
75335+
75336+ spin_unlock_irqrestore(&dev_data->lock, flags);
75337+
75338+ if (found_dev)
75339+ pcistub_put_pci_dev(found_dev);
75340+}
75341+
75342+int pciback_init_devices(struct pciback_device *pdev)
75343+{
75344+ struct passthrough_dev_data *dev_data;
75345+
75346+ dev_data = kmalloc(sizeof(*dev_data), GFP_KERNEL);
75347+ if (!dev_data)
75348+ return -ENOMEM;
75349+
75350+ spin_lock_init(&dev_data->lock);
75351+
75352+ INIT_LIST_HEAD(&dev_data->dev_list);
75353+
75354+ pdev->pci_dev_data = dev_data;
75355+
75356+ return 0;
75357+}
75358+
75359+int pciback_publish_pci_roots(struct pciback_device *pdev,
75360+ publish_pci_root_cb publish_root_cb)
75361+{
75362+ int err = 0;
75363+ struct passthrough_dev_data *dev_data = pdev->pci_dev_data;
75364+ struct pci_dev_entry *dev_entry, *e;
75365+ struct pci_dev *dev;
75366+ int found;
75367+ unsigned int domain, bus;
75368+
75369+ spin_lock(&dev_data->lock);
75370+
75371+ list_for_each_entry(dev_entry, &dev_data->dev_list, list) {
75372+ /* Only publish this device as a root if none of its
75373+ * parent bridges are exported
75374+ */
75375+ found = 0;
75376+ dev = dev_entry->dev->bus->self;
75377+ for (; !found && dev != NULL; dev = dev->bus->self) {
75378+ list_for_each_entry(e, &dev_data->dev_list, list) {
75379+ if (dev == e->dev) {
75380+ found = 1;
75381+ break;
75382+ }
75383+ }
75384+ }
75385+
75386+ domain = (unsigned int)pci_domain_nr(dev_entry->dev->bus);
75387+ bus = (unsigned int)dev_entry->dev->bus->number;
75388+
75389+ if (!found) {
75390+ err = publish_root_cb(pdev, domain, bus);
75391+ if (err)
75392+ break;
75393+ }
75394+ }
75395+
75396+ spin_unlock(&dev_data->lock);
75397+
75398+ return err;
75399+}
75400+
75401+void pciback_release_devices(struct pciback_device *pdev)
75402+{
75403+ struct passthrough_dev_data *dev_data = pdev->pci_dev_data;
75404+ struct pci_dev_entry *dev_entry, *t;
75405+
75406+ list_for_each_entry_safe(dev_entry, t, &dev_data->dev_list, list) {
75407+ list_del(&dev_entry->list);
75408+ pcistub_put_pci_dev(dev_entry->dev);
75409+ kfree(dev_entry);
75410+ }
75411+
75412+ kfree(dev_data);
75413+ pdev->pci_dev_data = NULL;
75414+}
75415diff -Nur linux-2.6.16.33-noxen/drivers/xen/pciback/pci_stub.c linux-2.6.16.33/drivers/xen/pciback/pci_stub.c
75416--- linux-2.6.16.33-noxen/drivers/xen/pciback/pci_stub.c 1970-01-01 00:00:00.000000000 +0000
75417+++ linux-2.6.16.33/drivers/xen/pciback/pci_stub.c 2007-01-08 15:00:45.000000000 +0000
75418@@ -0,0 +1,916 @@
75419+/*
75420+ * PCI Stub Driver - Grabs devices in backend to be exported later
75421+ *
75422+ * Ryan Wilson <hap9@epoch.ncsc.mil>
75423+ * Chris Bookholt <hap10@epoch.ncsc.mil>
75424+ */
75425+#include <linux/module.h>
75426+#include <linux/init.h>
75427+#include <linux/list.h>
75428+#include <linux/spinlock.h>
75429+#include <linux/kref.h>
75430+#include <asm/atomic.h>
75431+#include "pciback.h"
75432+#include "conf_space.h"
75433+#include "conf_space_quirks.h"
75434+
75435+static char *pci_devs_to_hide = NULL;
75436+module_param_named(hide, pci_devs_to_hide, charp, 0444);
75437+
75438+struct pcistub_device_id {
75439+ struct list_head slot_list;
75440+ int domain;
75441+ unsigned char bus;
75442+ unsigned int devfn;
75443+};
75444+static LIST_HEAD(pcistub_device_ids);
75445+static DEFINE_SPINLOCK(device_ids_lock);
75446+
75447+struct pcistub_device {
75448+ struct kref kref;
75449+ struct list_head dev_list;
75450+ spinlock_t lock;
75451+
75452+ struct pci_dev *dev;
75453+ struct pciback_device *pdev; /* non-NULL if struct pci_dev is in use */
75454+};
75455+
75456+/* Access to pcistub_devices & seized_devices lists and the initialize_devices
75457+ * flag must be locked with pcistub_devices_lock
75458+ */
75459+static DEFINE_SPINLOCK(pcistub_devices_lock);
75460+static LIST_HEAD(pcistub_devices);
75461+
75462+/* wait for device_initcall before initializing our devices
75463+ * (see pcistub_init_devices_late)
75464+ */
75465+static int initialize_devices = 0;
75466+static LIST_HEAD(seized_devices);
75467+
75468+static struct pcistub_device *pcistub_device_alloc(struct pci_dev *dev)
75469+{
75470+ struct pcistub_device *psdev;
75471+
75472+ dev_dbg(&dev->dev, "pcistub_device_alloc\n");
75473+
75474+ psdev = kzalloc(sizeof(*psdev), GFP_ATOMIC);
75475+ if (!psdev)
75476+ return NULL;
75477+
75478+ psdev->dev = pci_dev_get(dev);
75479+ if (!psdev->dev) {
75480+ kfree(psdev);
75481+ return NULL;
75482+ }
75483+
75484+ kref_init(&psdev->kref);
75485+ spin_lock_init(&psdev->lock);
75486+
75487+ return psdev;
75488+}
75489+
75490+/* Don't call this directly as it's called by pcistub_device_put */
75491+static void pcistub_device_release(struct kref *kref)
75492+{
75493+ struct pcistub_device *psdev;
75494+
75495+ psdev = container_of(kref, struct pcistub_device, kref);
75496+
75497+ dev_dbg(&psdev->dev->dev, "pcistub_device_release\n");
75498+
75499+ /* Clean-up the device */
75500+ pciback_reset_device(psdev->dev);
75501+ pciback_config_free_dyn_fields(psdev->dev);
75502+ pciback_config_free_dev(psdev->dev);
75503+ kfree(pci_get_drvdata(psdev->dev));
75504+ pci_set_drvdata(psdev->dev, NULL);
75505+
75506+ pci_dev_put(psdev->dev);
75507+
75508+ kfree(psdev);
75509+}
75510+
75511+static inline void pcistub_device_get(struct pcistub_device *psdev)
75512+{
75513+ kref_get(&psdev->kref);
75514+}
75515+
75516+static inline void pcistub_device_put(struct pcistub_device *psdev)
75517+{
75518+ kref_put(&psdev->kref, pcistub_device_release);
75519+}
75520+
75521+static struct pcistub_device *pcistub_device_find(int domain, int bus,
75522+ int slot, int func)
75523+{
75524+ struct pcistub_device *psdev = NULL;
75525+ unsigned long flags;
75526+
75527+ spin_lock_irqsave(&pcistub_devices_lock, flags);
75528+
75529+ list_for_each_entry(psdev, &pcistub_devices, dev_list) {
75530+ if (psdev->dev != NULL
75531+ && domain == pci_domain_nr(psdev->dev->bus)
75532+ && bus == psdev->dev->bus->number
75533+ && PCI_DEVFN(slot, func) == psdev->dev->devfn) {
75534+ pcistub_device_get(psdev);
75535+ goto out;
75536+ }
75537+ }
75538+
75539+ /* didn't find it */
75540+ psdev = NULL;
75541+
75542+ out:
75543+ spin_unlock_irqrestore(&pcistub_devices_lock, flags);
75544+ return psdev;
75545+}
75546+
75547+static struct pci_dev *pcistub_device_get_pci_dev(struct pciback_device *pdev,
75548+ struct pcistub_device *psdev)
75549+{
75550+ struct pci_dev *pci_dev = NULL;
75551+ unsigned long flags;
75552+
75553+ pcistub_device_get(psdev);
75554+
75555+ spin_lock_irqsave(&psdev->lock, flags);
75556+ if (!psdev->pdev) {
75557+ psdev->pdev = pdev;
75558+ pci_dev = psdev->dev;
75559+ }
75560+ spin_unlock_irqrestore(&psdev->lock, flags);
75561+
75562+ if (!pci_dev)
75563+ pcistub_device_put(psdev);
75564+
75565+ return pci_dev;
75566+}
75567+
75568+struct pci_dev *pcistub_get_pci_dev_by_slot(struct pciback_device *pdev,
75569+ int domain, int bus,
75570+ int slot, int func)
75571+{
75572+ struct pcistub_device *psdev;
75573+ struct pci_dev *found_dev = NULL;
75574+ unsigned long flags;
75575+
75576+ spin_lock_irqsave(&pcistub_devices_lock, flags);
75577+
75578+ list_for_each_entry(psdev, &pcistub_devices, dev_list) {
75579+ if (psdev->dev != NULL
75580+ && domain == pci_domain_nr(psdev->dev->bus)
75581+ && bus == psdev->dev->bus->number
75582+ && PCI_DEVFN(slot, func) == psdev->dev->devfn) {
75583+ found_dev = pcistub_device_get_pci_dev(pdev, psdev);
75584+ break;
75585+ }
75586+ }
75587+
75588+ spin_unlock_irqrestore(&pcistub_devices_lock, flags);
75589+ return found_dev;
75590+}
75591+
75592+struct pci_dev *pcistub_get_pci_dev(struct pciback_device *pdev,
75593+ struct pci_dev *dev)
75594+{
75595+ struct pcistub_device *psdev;
75596+ struct pci_dev *found_dev = NULL;
75597+ unsigned long flags;
75598+
75599+ spin_lock_irqsave(&pcistub_devices_lock, flags);
75600+
75601+ list_for_each_entry(psdev, &pcistub_devices, dev_list) {
75602+ if (psdev->dev == dev) {
75603+ found_dev = pcistub_device_get_pci_dev(pdev, psdev);
75604+ break;
75605+ }
75606+ }
75607+
75608+ spin_unlock_irqrestore(&pcistub_devices_lock, flags);
75609+ return found_dev;
75610+}
75611+
75612+void pcistub_put_pci_dev(struct pci_dev *dev)
75613+{
75614+ struct pcistub_device *psdev, *found_psdev = NULL;
75615+ unsigned long flags;
75616+
75617+ spin_lock_irqsave(&pcistub_devices_lock, flags);
75618+
75619+ list_for_each_entry(psdev, &pcistub_devices, dev_list) {
75620+ if (psdev->dev == dev) {
75621+ found_psdev = psdev;
75622+ break;
75623+ }
75624+ }
75625+
75626+ spin_unlock_irqrestore(&pcistub_devices_lock, flags);
75627+
75628+ /* Cleanup our device
75629+ * (so it's ready for the next domain)
75630+ */
75631+ pciback_reset_device(found_psdev->dev);
75632+ pciback_config_free_dyn_fields(found_psdev->dev);
75633+ pciback_config_reset_dev(found_psdev->dev);
75634+
75635+ spin_lock_irqsave(&found_psdev->lock, flags);
75636+ found_psdev->pdev = NULL;
75637+ spin_unlock_irqrestore(&found_psdev->lock, flags);
75638+
75639+ pcistub_device_put(found_psdev);
75640+}
75641+
75642+static int __devinit pcistub_match_one(struct pci_dev *dev,
75643+ struct pcistub_device_id *pdev_id)
75644+{
75645+ /* Match the specified device by domain, bus, slot, func and also if
75646+ * any of the device's parent bridges match.
75647+ */
75648+ for (; dev != NULL; dev = dev->bus->self) {
75649+ if (pci_domain_nr(dev->bus) == pdev_id->domain
75650+ && dev->bus->number == pdev_id->bus
75651+ && dev->devfn == pdev_id->devfn)
75652+ return 1;
75653+
75654+ /* Sometimes topmost bridge links to itself. */
75655+ if (dev == dev->bus->self)
75656+ break;
75657+ }
75658+
75659+ return 0;
75660+}
75661+
75662+static int __devinit pcistub_match(struct pci_dev *dev)
75663+{
75664+ struct pcistub_device_id *pdev_id;
75665+ unsigned long flags;
75666+ int found = 0;
75667+
75668+ spin_lock_irqsave(&device_ids_lock, flags);
75669+ list_for_each_entry(pdev_id, &pcistub_device_ids, slot_list) {
75670+ if (pcistub_match_one(dev, pdev_id)) {
75671+ found = 1;
75672+ break;
75673+ }
75674+ }
75675+ spin_unlock_irqrestore(&device_ids_lock, flags);
75676+
75677+ return found;
75678+}
75679+
75680+static int __devinit pcistub_init_device(struct pci_dev *dev)
75681+{
75682+ struct pciback_dev_data *dev_data;
75683+ int err = 0;
75684+
75685+ dev_dbg(&dev->dev, "initializing...\n");
75686+
75687+ /* The PCI backend is not intended to be a module (or to work with
75688+	 * removable PCI devices yet). If it were, pciback_config_free()
75689+ * would need to be called somewhere to free the memory allocated
75690+ * here and then to call kfree(pci_get_drvdata(psdev->dev)).
75691+ */
75692+ dev_data = kzalloc(sizeof(*dev_data), GFP_ATOMIC);
75693+ if (!dev_data) {
75694+ err = -ENOMEM;
75695+ goto out;
75696+ }
75697+ pci_set_drvdata(dev, dev_data);
75698+
75699+ dev_dbg(&dev->dev, "initializing config\n");
75700+ err = pciback_config_init_dev(dev);
75701+ if (err)
75702+ goto out;
75703+
75704+ /* HACK: Force device (& ACPI) to determine what IRQ it's on - we
75705+ * must do this here because pcibios_enable_device may specify
75706+ * the pci device's true irq (and possibly its other resources)
75707+ * if they differ from what's in the configuration space.
75708+ * This makes the assumption that the device's resources won't
75709+ * change after this point (otherwise this code may break!)
75710+ */
75711+ dev_dbg(&dev->dev, "enabling device\n");
75712+ err = pci_enable_device(dev);
75713+ if (err)
75714+ goto config_release;
75715+
75716+ /* Now disable the device (this also ensures some private device
75717+ * data is setup before we export)
75718+ */
75719+ dev_dbg(&dev->dev, "reset device\n");
75720+ pciback_reset_device(dev);
75721+
75722+ return 0;
75723+
75724+ config_release:
75725+ pciback_config_free_dev(dev);
75726+
75727+ out:
75728+ pci_set_drvdata(dev, NULL);
75729+ kfree(dev_data);
75730+ return err;
75731+}
75732+
75733+/*
75734+ * Because some initialization still happens on
75735+ * devices during fs_initcall, we need to defer
75736+ * full initialization of our devices until
75737+ * device_initcall.
75738+ */
75739+static int __init pcistub_init_devices_late(void)
75740+{
75741+ struct pcistub_device *psdev;
75742+ unsigned long flags;
75743+ int err = 0;
75744+
75745+ pr_debug("pciback: pcistub_init_devices_late\n");
75746+
75747+ spin_lock_irqsave(&pcistub_devices_lock, flags);
75748+
75749+ while (!list_empty(&seized_devices)) {
75750+ psdev = container_of(seized_devices.next,
75751+ struct pcistub_device, dev_list);
75752+ list_del(&psdev->dev_list);
75753+
75754+ spin_unlock_irqrestore(&pcistub_devices_lock, flags);
75755+
75756+ err = pcistub_init_device(psdev->dev);
75757+ if (err) {
75758+ dev_err(&psdev->dev->dev,
75759+ "error %d initializing device\n", err);
75760+ kfree(psdev);
75761+ psdev = NULL;
75762+ }
75763+
75764+ spin_lock_irqsave(&pcistub_devices_lock, flags);
75765+
75766+ if (psdev)
75767+ list_add_tail(&psdev->dev_list, &pcistub_devices);
75768+ }
75769+
75770+ initialize_devices = 1;
75771+
75772+ spin_unlock_irqrestore(&pcistub_devices_lock, flags);
75773+
75774+ return 0;
75775+}
75776+
75777+static int __devinit pcistub_seize(struct pci_dev *dev)
75778+{
75779+ struct pcistub_device *psdev;
75780+ unsigned long flags;
75781+ int err = 0;
75782+
75783+ psdev = pcistub_device_alloc(dev);
75784+ if (!psdev)
75785+ return -ENOMEM;
75786+
75787+ spin_lock_irqsave(&pcistub_devices_lock, flags);
75788+
75789+ if (initialize_devices) {
75790+ spin_unlock_irqrestore(&pcistub_devices_lock, flags);
75791+
75792+ /* don't want irqs disabled when calling pcistub_init_device */
75793+ err = pcistub_init_device(psdev->dev);
75794+
75795+ spin_lock_irqsave(&pcistub_devices_lock, flags);
75796+
75797+ if (!err)
75798+ list_add(&psdev->dev_list, &pcistub_devices);
75799+ } else {
75800+ dev_dbg(&dev->dev, "deferring initialization\n");
75801+ list_add(&psdev->dev_list, &seized_devices);
75802+ }
75803+
75804+ spin_unlock_irqrestore(&pcistub_devices_lock, flags);
75805+
75806+ if (err)
75807+ pcistub_device_put(psdev);
75808+
75809+ return err;
75810+}
75811+
75812+static int __devinit pcistub_probe(struct pci_dev *dev,
75813+ const struct pci_device_id *id)
75814+{
75815+ int err = 0;
75816+
75817+ dev_dbg(&dev->dev, "probing...\n");
75818+
75819+ if (pcistub_match(dev)) {
75820+
75821+ if (dev->hdr_type != PCI_HEADER_TYPE_NORMAL
75822+ && dev->hdr_type != PCI_HEADER_TYPE_BRIDGE) {
75823+ dev_err(&dev->dev, "can't export pci devices that "
75824+ "don't have a normal (0) or bridge (1) "
75825+ "header type!\n");
75826+ err = -ENODEV;
75827+ goto out;
75828+ }
75829+
75830+ dev_info(&dev->dev, "seizing device\n");
75831+ err = pcistub_seize(dev);
75832+ } else
75833+ /* Didn't find the device */
75834+ err = -ENODEV;
75835+
75836+ out:
75837+ return err;
75838+}
75839+
75840+static void pcistub_remove(struct pci_dev *dev)
75841+{
75842+ struct pcistub_device *psdev, *found_psdev = NULL;
75843+ unsigned long flags;
75844+
75845+ dev_dbg(&dev->dev, "removing\n");
75846+
75847+ spin_lock_irqsave(&pcistub_devices_lock, flags);
75848+
75849+ pciback_config_quirk_release(dev);
75850+
75851+ list_for_each_entry(psdev, &pcistub_devices, dev_list) {
75852+ if (psdev->dev == dev) {
75853+ found_psdev = psdev;
75854+ break;
75855+ }
75856+ }
75857+
75858+ spin_unlock_irqrestore(&pcistub_devices_lock, flags);
75859+
75860+ if (found_psdev) {
75861+ dev_dbg(&dev->dev, "found device to remove - in use? %p\n",
75862+ found_psdev->pdev);
75863+
75864+ if (found_psdev->pdev) {
75865+ printk(KERN_WARNING "pciback: ****** removing device "
75866+ "%s while still in-use! ******\n",
75867+ pci_name(found_psdev->dev));
75868+ printk(KERN_WARNING "pciback: ****** driver domain may "
75869+ "still access this device's i/o resources!\n");
75870+ printk(KERN_WARNING "pciback: ****** shutdown driver "
75871+ "domain before binding device\n");
75872+ printk(KERN_WARNING "pciback: ****** to other drivers "
75873+ "or domains\n");
75874+
75875+ pciback_release_pci_dev(found_psdev->pdev,
75876+ found_psdev->dev);
75877+ }
75878+
75879+ spin_lock_irqsave(&pcistub_devices_lock, flags);
75880+ list_del(&found_psdev->dev_list);
75881+ spin_unlock_irqrestore(&pcistub_devices_lock, flags);
75882+
75883+ /* the final put for releasing from the list */
75884+ pcistub_device_put(found_psdev);
75885+ }
75886+}
75887+
75888+static struct pci_device_id pcistub_ids[] = {
75889+ {
75890+ .vendor = PCI_ANY_ID,
75891+ .device = PCI_ANY_ID,
75892+ .subvendor = PCI_ANY_ID,
75893+ .subdevice = PCI_ANY_ID,
75894+ },
75895+ {0,},
75896+};
75897+
75898+/*
75899+ * Note: There is no MODULE_DEVICE_TABLE entry here because this isn't
75900+ * for a normal device. I don't want it to be loaded automatically.
75901+ */
75902+
75903+static struct pci_driver pciback_pci_driver = {
75904+ .name = "pciback",
75905+ .id_table = pcistub_ids,
75906+ .probe = pcistub_probe,
75907+ .remove = pcistub_remove,
75908+};
75909+
75910+static inline int str_to_slot(const char *buf, int *domain, int *bus,
75911+ int *slot, int *func)
75912+{
75913+ int err;
75914+
75915+ err = sscanf(buf, " %x:%x:%x.%x", domain, bus, slot, func);
75916+ if (err == 4)
75917+ return 0;
75918+ else if (err < 0)
75919+ return -EINVAL;
75920+
75921+ /* try again without domain */
75922+ *domain = 0;
75923+ err = sscanf(buf, " %x:%x.%x", bus, slot, func);
75924+ if (err == 3)
75925+ return 0;
75926+
75927+ return -EINVAL;
75928+}
75929+
75930+static inline int str_to_quirk(const char *buf, int *domain, int *bus, int
75931+ *slot, int *func, int *reg, int *size, int *mask)
75932+{
75933+ int err;
75934+
75935+ err =
75936+ sscanf(buf, " %04x:%02x:%02x.%1x-%08x:%1x:%08x", domain, bus, slot,
75937+ func, reg, size, mask);
75938+ if (err == 7)
75939+ return 0;
75940+ return -EINVAL;
75941+}
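str_to_quirk() above expects the string written to the driver's quirks attribute in the form domain:bus:slot.func-reg:size:mask, all fields in hex. A userspace sketch of parsing one such entry with the same format string (the device address, register and mask are invented):

/* Sketch of the quirks-attribute format parsed by str_to_quirk():
 *   dddd:bb:ss.f-rrrrrrrr:s:mmmmmmmm (all hex). */
#include <stdio.h>

int main(void)
{
	const char *entry = "0000:00:1d.7-00000040:2:0000ffff";
	int domain, bus, slot, func, reg, size, mask;

	if (sscanf(entry, " %04x:%02x:%02x.%1x-%08x:%1x:%08x",
		   &domain, &bus, &slot, &func, &reg, &size, &mask) == 7)
		printf("%04x:%02x:%02x.%x reg 0x%x size %d mask 0x%x\n",
		       domain, bus, slot, func, reg, size, mask);
	return 0;
}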
75942+
75943+static int pcistub_device_id_add(int domain, int bus, int slot, int func)
75944+{
75945+ struct pcistub_device_id *pci_dev_id;
75946+ unsigned long flags;
75947+
75948+ pci_dev_id = kmalloc(sizeof(*pci_dev_id), GFP_KERNEL);
75949+ if (!pci_dev_id)
75950+ return -ENOMEM;
75951+
75952+ pci_dev_id->domain = domain;
75953+ pci_dev_id->bus = bus;
75954+ pci_dev_id->devfn = PCI_DEVFN(slot, func);
75955+
75956+ pr_debug("pciback: wants to seize %04x:%02x:%02x.%01x\n",
75957+ domain, bus, slot, func);
75958+
75959+ spin_lock_irqsave(&device_ids_lock, flags);
75960+ list_add_tail(&pci_dev_id->slot_list, &pcistub_device_ids);
75961+ spin_unlock_irqrestore(&device_ids_lock, flags);
75962+
75963+ return 0;
75964+}
75965+
75966+static int pcistub_device_id_remove(int domain, int bus, int slot, int func)
75967+{
75968+ struct pcistub_device_id *pci_dev_id, *t;
75969+ int devfn = PCI_DEVFN(slot, func);
75970+ int err = -ENOENT;
75971+ unsigned long flags;
75972+
75973+ spin_lock_irqsave(&device_ids_lock, flags);
75974+ list_for_each_entry_safe(pci_dev_id, t, &pcistub_device_ids, slot_list) {
75975+
75976+ if (pci_dev_id->domain == domain
75977+ && pci_dev_id->bus == bus && pci_dev_id->devfn == devfn) {
75978+ /* Don't break; here because it's possible the same
75979+ * slot could be in the list more than once
75980+ */
75981+ list_del(&pci_dev_id->slot_list);
75982+ kfree(pci_dev_id);
75983+
75984+ err = 0;
75985+
75986+ pr_debug("pciback: removed %04x:%02x:%02x.%01x from "
75987+ "seize list\n", domain, bus, slot, func);
75988+ }
75989+ }
75990+ spin_unlock_irqrestore(&device_ids_lock, flags);
75991+
75992+ return err;
75993+}
75994+
75995+static int pcistub_reg_add(int domain, int bus, int slot, int func, int reg,
75996+ int size, int mask)
75997+{
75998+ int err = 0;
75999+ struct pcistub_device *psdev;
76000+ struct pci_dev *dev;
76001+ struct config_field *field;
76002+
76003+ psdev = pcistub_device_find(domain, bus, slot, func);
76004+ if (!psdev || !psdev->dev) {
76005+ err = -ENODEV;
76006+ goto out;
76007+ }
76008+ dev = psdev->dev;
76009+
76010+ /* check for duplicate field */
76011+ if (pciback_field_is_dup(dev, reg))
76012+ goto out;
76013+
76014+ field = kzalloc(sizeof(*field), GFP_ATOMIC);
76015+ if (!field) {
76016+ err = -ENOMEM;
76017+ goto out;
76018+ }
76019+
76020+ field->offset = reg;
76021+ field->size = size;
76022+ field->mask = mask;
76023+ field->init = NULL;
76024+ field->reset = NULL;
76025+ field->release = NULL;
76026+ field->clean = pciback_config_field_free;
76027+
76028+ err = pciback_config_quirks_add_field(dev, field);
76029+ if (err)
76030+ kfree(field);
76031+ out:
76032+ return err;
76033+}
76034+
76035+static ssize_t pcistub_slot_add(struct device_driver *drv, const char *buf,
76036+ size_t count)
76037+{
76038+ int domain, bus, slot, func;
76039+ int err;
76040+
76041+ err = str_to_slot(buf, &domain, &bus, &slot, &func);
76042+ if (err)
76043+ goto out;
76044+
76045+ err = pcistub_device_id_add(domain, bus, slot, func);
76046+
76047+ out:
76048+ if (!err)
76049+ err = count;
76050+ return err;
76051+}
76052+
76053+DRIVER_ATTR(new_slot, S_IWUSR, NULL, pcistub_slot_add);
76054+
76055+static ssize_t pcistub_slot_remove(struct device_driver *drv, const char *buf,
76056+ size_t count)
76057+{
76058+ int domain, bus, slot, func;
76059+ int err;
76060+
76061+ err = str_to_slot(buf, &domain, &bus, &slot, &func);
76062+ if (err)
76063+ goto out;
76064+
76065+ err = pcistub_device_id_remove(domain, bus, slot, func);
76066+
76067+ out:
76068+ if (!err)
76069+ err = count;
76070+ return err;
76071+}
76072+
76073+DRIVER_ATTR(remove_slot, S_IWUSR, NULL, pcistub_slot_remove);
76074+
76075+static ssize_t pcistub_slot_show(struct device_driver *drv, char *buf)
76076+{
76077+ struct pcistub_device_id *pci_dev_id;
76078+ size_t count = 0;
76079+ unsigned long flags;
76080+
76081+ spin_lock_irqsave(&device_ids_lock, flags);
76082+ list_for_each_entry(pci_dev_id, &pcistub_device_ids, slot_list) {
76083+ if (count >= PAGE_SIZE)
76084+ break;
76085+
76086+ count += scnprintf(buf + count, PAGE_SIZE - count,
76087+ "%04x:%02x:%02x.%01x\n",
76088+ pci_dev_id->domain, pci_dev_id->bus,
76089+ PCI_SLOT(pci_dev_id->devfn),
76090+ PCI_FUNC(pci_dev_id->devfn));
76091+ }
76092+ spin_unlock_irqrestore(&device_ids_lock, flags);
76093+
76094+ return count;
76095+}
76096+
76097+DRIVER_ATTR(slots, S_IRUSR, pcistub_slot_show, NULL);
76098+
76099+static ssize_t pcistub_quirk_add(struct device_driver *drv, const char *buf,
76100+ size_t count)
76101+{
76102+ int domain, bus, slot, func, reg, size, mask;
76103+ int err;
76104+
76105+ err = str_to_quirk(buf, &domain, &bus, &slot, &func, &reg, &size,
76106+ &mask);
76107+ if (err)
76108+ goto out;
76109+
76110+ err = pcistub_reg_add(domain, bus, slot, func, reg, size, mask);
76111+
76112+ out:
76113+ if (!err)
76114+ err = count;
76115+ return err;
76116+}
76117+
76118+static ssize_t pcistub_quirk_show(struct device_driver *drv, char *buf)
76119+{
76120+ int count = 0;
76121+ unsigned long flags;
76122+ extern struct list_head pciback_quirks;
76123+ struct pciback_config_quirk *quirk;
76124+ struct pciback_dev_data *dev_data;
76125+ struct config_field *field;
76126+ struct config_field_entry *cfg_entry;
76127+
76128+ spin_lock_irqsave(&device_ids_lock, flags);
76129+ list_for_each_entry(quirk, &pciback_quirks, quirks_list) {
76130+ if (count >= PAGE_SIZE)
76131+ goto out;
76132+
76133+ count += scnprintf(buf + count, PAGE_SIZE - count,
76134+ "%02x:%02x.%01x\n\t%04x:%04x:%04x:%04x\n",
76135+ quirk->pdev->bus->number,
76136+ PCI_SLOT(quirk->pdev->devfn),
76137+ PCI_FUNC(quirk->pdev->devfn),
76138+ quirk->devid.vendor, quirk->devid.device,
76139+ quirk->devid.subvendor,
76140+ quirk->devid.subdevice);
76141+
76142+ dev_data = pci_get_drvdata(quirk->pdev);
76143+
76144+ list_for_each_entry(cfg_entry, &dev_data->config_fields, list) {
76145+ field = cfg_entry->field;
76146+ if (count >= PAGE_SIZE)
76147+ goto out;
76148+
76149+ count += scnprintf(buf + count, PAGE_SIZE -
76150+ count, "\t\t%08x:%01x:%08x\n",
76151+ field->offset, field->size,
76152+ field->mask);
76153+ }
76154+ }
76155+
76156+ out:
76157+ spin_unlock_irqrestore(&device_ids_lock, flags);
76158+
76159+ return count;
76160+}
76161+
76162+DRIVER_ATTR(quirks, S_IRUSR | S_IWUSR, pcistub_quirk_show, pcistub_quirk_add);
76163+
76164+static ssize_t permissive_add(struct device_driver *drv, const char *buf,
76165+ size_t count)
76166+{
76167+ int domain, bus, slot, func;
76168+ int err;
76169+ struct pcistub_device *psdev;
76170+ struct pciback_dev_data *dev_data;
76171+ err = str_to_slot(buf, &domain, &bus, &slot, &func);
76172+ if (err)
76173+ goto out;
76174+ psdev = pcistub_device_find(domain, bus, slot, func);
76175+ if (!psdev) {
76176+ err = -ENODEV;
76177+ goto out;
76178+ }
76179+ if (!psdev->dev) {
76180+ err = -ENODEV;
76181+ goto release;
76182+ }
76183+ dev_data = pci_get_drvdata(psdev->dev);
76184+ /* the driver data for a device should never be null at this point */
76185+ if (!dev_data) {
76186+ err = -ENXIO;
76187+ goto release;
76188+ }
76189+ if (!dev_data->permissive) {
76190+ dev_data->permissive = 1;
76191+ /* Let user know that what they're doing could be unsafe */
76192+ dev_warn(&psdev->dev->dev,
76193+ "enabling permissive mode configuration space accesses!\n");
76194+ dev_warn(&psdev->dev->dev,
76195+ "permissive mode is potentially unsafe!\n");
76196+ }
76197+ release:
76198+ pcistub_device_put(psdev);
76199+ out:
76200+ if (!err)
76201+ err = count;
76202+ return err;
76203+}
76204+
76205+static ssize_t permissive_show(struct device_driver *drv, char *buf)
76206+{
76207+ struct pcistub_device *psdev;
76208+ struct pciback_dev_data *dev_data;
76209+ size_t count = 0;
76210+ unsigned long flags;
76211+ spin_lock_irqsave(&pcistub_devices_lock, flags);
76212+ list_for_each_entry(psdev, &pcistub_devices, dev_list) {
76213+ if (count >= PAGE_SIZE)
76214+ break;
76215+ if (!psdev->dev)
76216+ continue;
76217+ dev_data = pci_get_drvdata(psdev->dev);
76218+ if (!dev_data || !dev_data->permissive)
76219+ continue;
76220+ count +=
76221+ scnprintf(buf + count, PAGE_SIZE - count, "%s\n",
76222+ pci_name(psdev->dev));
76223+ }
76224+ spin_unlock_irqrestore(&pcistub_devices_lock, flags);
76225+ return count;
76226+}
76227+
76228+DRIVER_ATTR(permissive, S_IRUSR | S_IWUSR, permissive_show, permissive_add);
76229+
76230+static int __init pcistub_init(void)
76231+{
76232+ int pos = 0;
76233+ int err = 0;
76234+ int domain, bus, slot, func;
76235+ int parsed;
76236+
76237+ if (pci_devs_to_hide && *pci_devs_to_hide) {
76238+ do {
76239+ parsed = 0;
76240+
76241+ err = sscanf(pci_devs_to_hide + pos,
76242+ " (%x:%x:%x.%x) %n",
76243+ &domain, &bus, &slot, &func, &parsed);
76244+ if (err != 4) {
76245+ domain = 0;
76246+ err = sscanf(pci_devs_to_hide + pos,
76247+ " (%x:%x.%x) %n",
76248+ &bus, &slot, &func, &parsed);
76249+ if (err != 3)
76250+ goto parse_error;
76251+ }
76252+
76253+ err = pcistub_device_id_add(domain, bus, slot, func);
76254+ if (err)
76255+ goto out;
76256+
76257+ /* if parsed<=0, we've reached the end of the string */
76258+ pos += parsed;
76259+ } while (parsed > 0 && pci_devs_to_hide[pos]);
76260+ }
76261+
76262+ /* If we're the first PCI Device Driver to register, we're the
76263+ * first one to get offered PCI devices as they become
76264+ * available (and thus we can be the first to grab them)
76265+ */
76266+ err = pci_register_driver(&pciback_pci_driver);
76267+ if (err < 0)
76268+ goto out;
76269+
76270+ driver_create_file(&pciback_pci_driver.driver, &driver_attr_new_slot);
76271+ driver_create_file(&pciback_pci_driver.driver,
76272+ &driver_attr_remove_slot);
76273+ driver_create_file(&pciback_pci_driver.driver, &driver_attr_slots);
76274+ driver_create_file(&pciback_pci_driver.driver, &driver_attr_quirks);
76275+ driver_create_file(&pciback_pci_driver.driver, &driver_attr_permissive);
76276+
76277+ out:
76278+ return err;
76279+
76280+ parse_error:
76281+ printk(KERN_ERR "pciback: Error parsing pci_devs_to_hide at \"%s\"\n",
76282+ pci_devs_to_hide + pos);
76283+ return -EINVAL;
76284+}
76285+
76286+#ifndef MODULE
76287+/*
76288+ * fs_initcall happens before device_initcall
76289+ * so pciback *should* get called first (b/c we
76290+ * want to suck up any device before other drivers
76291+ * get a chance by being the first pci device
76292+ * driver to register)
76293+ */
76294+fs_initcall(pcistub_init);
76295+#endif
76296+
76297+static int __init pciback_init(void)
76298+{
76299+ int err;
76300+
76301+ err = pciback_config_init();
76302+ if (err)
76303+ return err;
76304+
76305+#ifdef MODULE
76306+ err = pcistub_init();
76307+ if (err < 0)
76308+ return err;
76309+#endif
76310+
76311+ pcistub_init_devices_late();
76312+ pciback_xenbus_register();
76313+
76314+ return 0;
76315+}
76316+
76317+static void __exit pciback_cleanup(void)
76318+{
76319+ pciback_xenbus_unregister();
76320+
76321+ driver_remove_file(&pciback_pci_driver.driver, &driver_attr_new_slot);
76322+ driver_remove_file(&pciback_pci_driver.driver,
76323+ &driver_attr_remove_slot);
76324+ driver_remove_file(&pciback_pci_driver.driver, &driver_attr_slots);
76325+ driver_remove_file(&pciback_pci_driver.driver, &driver_attr_quirks);
76326+ driver_remove_file(&pciback_pci_driver.driver, &driver_attr_permissive);
76327+
76328+ pci_unregister_driver(&pciback_pci_driver);
76329+}
76330+
76331+module_init(pciback_init);
76332+module_exit(pciback_cleanup);
76333+
76334+MODULE_LICENSE("Dual BSD/GPL");
76335diff -Nur linux-2.6.16.33-noxen/drivers/xen/pciback/pciback.h linux-2.6.16.33/drivers/xen/pciback/pciback.h
76336--- linux-2.6.16.33-noxen/drivers/xen/pciback/pciback.h 1970-01-01 00:00:00.000000000 +0000
76337+++ linux-2.6.16.33/drivers/xen/pciback/pciback.h 2007-01-08 15:00:45.000000000 +0000
76338@@ -0,0 +1,93 @@
76339+/*
76340+ * PCI Backend Common Data Structures & Function Declarations
76341+ *
76342+ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
76343+ */
76344+#ifndef __XEN_PCIBACK_H__
76345+#define __XEN_PCIBACK_H__
76346+
76347+#include <linux/pci.h>
76348+#include <linux/interrupt.h>
76349+#include <xen/xenbus.h>
76350+#include <linux/list.h>
76351+#include <linux/spinlock.h>
76352+#include <linux/workqueue.h>
76353+#include <asm/atomic.h>
76354+#include <xen/interface/io/pciif.h>
76355+
76356+struct pci_dev_entry {
76357+ struct list_head list;
76358+ struct pci_dev *dev;
76359+};
76360+
76361+#define _PDEVF_op_active (0)
76362+#define PDEVF_op_active (1<<(_PDEVF_op_active))
76363+
76364+struct pciback_device {
76365+ void *pci_dev_data;
76366+ spinlock_t dev_lock;
76367+
76368+ struct xenbus_device *xdev;
76369+
76370+ struct xenbus_watch be_watch;
76371+ u8 be_watching;
76372+
76373+ int evtchn_irq;
76374+
76375+ struct vm_struct *sh_area;
76376+ struct xen_pci_sharedinfo *sh_info;
76377+
76378+ unsigned long flags;
76379+
76380+ struct work_struct op_work;
76381+};
76382+
76383+struct pciback_dev_data {
76384+ struct list_head config_fields;
76385+ int permissive;
76386+ int warned_on_write;
76387+};
76388+
76389+/* Get/Put PCI Devices that are hidden from the PCI Backend Domain */
76390+struct pci_dev *pcistub_get_pci_dev_by_slot(struct pciback_device *pdev,
76391+ int domain, int bus,
76392+ int slot, int func);
76393+struct pci_dev *pcistub_get_pci_dev(struct pciback_device *pdev,
76394+ struct pci_dev *dev);
76395+void pcistub_put_pci_dev(struct pci_dev *dev);
76396+
76397+/* Ensure a device is turned off or reset */
76398+void pciback_reset_device(struct pci_dev *pdev);
76399+
76400+/* Access a virtual configuration space for a PCI device */
76401+int pciback_config_init(void);
76402+int pciback_config_init_dev(struct pci_dev *dev);
76403+void pciback_config_free_dyn_fields(struct pci_dev *dev);
76404+void pciback_config_reset_dev(struct pci_dev *dev);
76405+void pciback_config_free_dev(struct pci_dev *dev);
76406+int pciback_config_read(struct pci_dev *dev, int offset, int size,
76407+ u32 * ret_val);
76408+int pciback_config_write(struct pci_dev *dev, int offset, int size, u32 value);
76409+
76410+/* Handle requests for specific devices from the frontend */
76411+typedef int (*publish_pci_root_cb) (struct pciback_device * pdev,
76412+ unsigned int domain, unsigned int bus);
76413+int pciback_add_pci_dev(struct pciback_device *pdev, struct pci_dev *dev);
76414+void pciback_release_pci_dev(struct pciback_device *pdev, struct pci_dev *dev);
76415+struct pci_dev *pciback_get_pci_dev(struct pciback_device *pdev,
76416+ unsigned int domain, unsigned int bus,
76417+ unsigned int devfn);
76418+int pciback_init_devices(struct pciback_device *pdev);
76419+int pciback_publish_pci_roots(struct pciback_device *pdev,
76420+ publish_pci_root_cb cb);
76421+void pciback_release_devices(struct pciback_device *pdev);
76422+
76423+/* Handles events from front-end */
76424+irqreturn_t pciback_handle_event(int irq, void *dev_id, struct pt_regs *regs);
76425+void pciback_do_op(void *data);
76426+
76427+int pciback_xenbus_register(void);
76428+void pciback_xenbus_unregister(void);
76429+
76430+extern int verbose_request;
76431+#endif
76432diff -Nur linux-2.6.16.33-noxen/drivers/xen/pciback/pciback_ops.c linux-2.6.16.33/drivers/xen/pciback/pciback_ops.c
76433--- linux-2.6.16.33-noxen/drivers/xen/pciback/pciback_ops.c 1970-01-01 00:00:00.000000000 +0000
76434+++ linux-2.6.16.33/drivers/xen/pciback/pciback_ops.c 2007-01-08 15:00:45.000000000 +0000
76435@@ -0,0 +1,95 @@
76436+/*
76437+ * PCI Backend Operations - respond to PCI requests from Frontend
76438+ *
76439+ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
76440+ */
76441+#include <linux/module.h>
76442+#include <asm/bitops.h>
76443+#include <xen/evtchn.h>
76444+#include "pciback.h"
76445+
76446+int verbose_request = 0;
76447+module_param(verbose_request, int, 0644);
76448+
76449+/* Ensure a device is "turned off" and ready to be exported.
76450+ * (Also see pciback_config_reset to ensure virtual configuration space is
76451+ * ready to be re-exported)
76452+ */
76453+void pciback_reset_device(struct pci_dev *dev)
76454+{
76455+ u16 cmd;
76456+
76457+ /* Disable devices (but not bridges) */
76458+ if (dev->hdr_type == PCI_HEADER_TYPE_NORMAL) {
76459+ pci_disable_device(dev);
76460+
76461+ pci_write_config_word(dev, PCI_COMMAND, 0);
76462+
76463+ dev->is_enabled = 0;
76464+ dev->is_busmaster = 0;
76465+ } else {
76466+ pci_read_config_word(dev, PCI_COMMAND, &cmd);
76467+ if (cmd & (PCI_COMMAND_INVALIDATE)) {
76468+ cmd &= ~(PCI_COMMAND_INVALIDATE);
76469+ pci_write_config_word(dev, PCI_COMMAND, cmd);
76470+
76471+ dev->is_busmaster = 0;
76472+ }
76473+ }
76474+}
76475+
76476+static inline void test_and_schedule_op(struct pciback_device *pdev)
76477+{
76478+ /* Check that frontend is requesting an operation and that we are not
76479+ * already processing a request */
76480+ if (test_bit(_XEN_PCIF_active, (unsigned long *)&pdev->sh_info->flags)
76481+ && !test_and_set_bit(_PDEVF_op_active, &pdev->flags))
76482+ schedule_work(&pdev->op_work);
76483+}
76484+
76485+/* Performing the configuration space reads/writes must not be done in atomic
76486+ * context because some of the pci_* functions can sleep (mostly due to ACPI
76487+ * use of semaphores). This function is intended to be called from a work
76488+ * queue in process context taking a struct pciback_device as a parameter */
76489+void pciback_do_op(void *data)
76490+{
76491+ struct pciback_device *pdev = data;
76492+ struct pci_dev *dev;
76493+ struct xen_pci_op *op = &pdev->sh_info->op;
76494+
76495+ dev = pciback_get_pci_dev(pdev, op->domain, op->bus, op->devfn);
76496+
76497+ if (dev == NULL)
76498+ op->err = XEN_PCI_ERR_dev_not_found;
76499+ else if (op->cmd == XEN_PCI_OP_conf_read)
76500+ op->err = pciback_config_read(dev, op->offset, op->size,
76501+ &op->value);
76502+ else if (op->cmd == XEN_PCI_OP_conf_write)
76503+ op->err = pciback_config_write(dev, op->offset, op->size,
76504+ op->value);
76505+ else
76506+ op->err = XEN_PCI_ERR_not_implemented;
76507+
76508+ /* Tell the driver domain that we're done. */
76509+ wmb();
76510+ clear_bit(_XEN_PCIF_active, (unsigned long *)&pdev->sh_info->flags);
76511+ notify_remote_via_irq(pdev->evtchn_irq);
76512+
76513+ /* Mark that we're done. */
76514+ smp_mb__before_clear_bit(); /* /after/ clearing PCIF_active */
76515+ clear_bit(_PDEVF_op_active, &pdev->flags);
76516+ smp_mb__after_clear_bit(); /* /before/ final check for work */
76517+
76518+ /* Check to see if the driver domain tried to start another request in
76519+ * between clearing _XEN_PCIF_active and clearing _PDEVF_op_active. */
76520+ test_and_schedule_op(pdev);
76521+}
76522+
76523+irqreturn_t pciback_handle_event(int irq, void *dev_id, struct pt_regs *regs)
76524+{
76525+ struct pciback_device *pdev = dev_id;
76526+
76527+ test_and_schedule_op(pdev);
76528+
76529+ return IRQ_HANDLED;
76530+}
76531diff -Nur linux-2.6.16.33-noxen/drivers/xen/pciback/slot.c linux-2.6.16.33/drivers/xen/pciback/slot.c
76532--- linux-2.6.16.33-noxen/drivers/xen/pciback/slot.c 1970-01-01 00:00:00.000000000 +0000
76533+++ linux-2.6.16.33/drivers/xen/pciback/slot.c 2007-01-08 15:00:45.000000000 +0000
76534@@ -0,0 +1,151 @@
76535+/*
76536+ * PCI Backend - Provides a Virtual PCI bus (with real devices)
76537+ * to the frontend
76538+ *
76539+ * Author: Ryan Wilson <hap9@epoch.ncsc.mil> (vpci.c)
76540+ * Author: Tristan Gingold <tristan.gingold@bull.net>, from vpci.c
76541+ */
76542+
76543+#include <linux/list.h>
76544+#include <linux/slab.h>
76545+#include <linux/pci.h>
76546+#include <linux/spinlock.h>
76547+#include "pciback.h"
76548+
76549+/* There are at most 32 slots in a pci bus. */
76550+#define PCI_SLOT_MAX 32
76551+
76552+#define PCI_BUS_NBR 2
76553+
76554+struct slot_dev_data {
76555+ /* Access to dev_list must be protected by lock */
76556+ struct pci_dev *slots[PCI_BUS_NBR][PCI_SLOT_MAX];
76557+ spinlock_t lock;
76558+};
76559+
76560+struct pci_dev *pciback_get_pci_dev(struct pciback_device *pdev,
76561+ unsigned int domain, unsigned int bus,
76562+ unsigned int devfn)
76563+{
76564+ struct pci_dev *dev = NULL;
76565+ struct slot_dev_data *slot_dev = pdev->pci_dev_data;
76566+ unsigned long flags;
76567+
76568+ if (domain != 0 || PCI_FUNC(devfn) != 0)
76569+ return NULL;
76570+
76571+ if (PCI_SLOT(devfn) >= PCI_SLOT_MAX || bus >= PCI_BUS_NBR)
76572+ return NULL;
76573+
76574+ spin_lock_irqsave(&slot_dev->lock, flags);
76575+ dev = slot_dev->slots[bus][PCI_SLOT(devfn)];
76576+ spin_unlock_irqrestore(&slot_dev->lock, flags);
76577+
76578+ return dev;
76579+}
76580+
76581+int pciback_add_pci_dev(struct pciback_device *pdev, struct pci_dev *dev)
76582+{
76583+ int err = 0, slot, bus;
76584+ struct slot_dev_data *slot_dev = pdev->pci_dev_data;
76585+ unsigned long flags;
76586+
76587+ if ((dev->class >> 24) == PCI_BASE_CLASS_BRIDGE) {
76588+ err = -EFAULT;
76589+ xenbus_dev_fatal(pdev->xdev, err,
76590+ "Can't export bridges on the virtual PCI bus");
76591+ goto out;
76592+ }
76593+
76594+ spin_lock_irqsave(&slot_dev->lock, flags);
76595+
76596+ /* Assign to a new slot on the virtual PCI bus */
76597+ for (bus = 0; bus < PCI_BUS_NBR; bus++)
76598+ for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
76599+ if (slot_dev->slots[bus][slot] == NULL) {
76600+ printk(KERN_INFO
76601+ "pciback: slot: %s: assign to virtual slot %d, bus %d\n",
76602+ pci_name(dev), slot, bus);
76603+ slot_dev->slots[bus][slot] = dev;
76604+ goto unlock;
76605+ }
76606+ }
76607+
76608+ err = -ENOMEM;
76609+ xenbus_dev_fatal(pdev->xdev, err,
76610+ "No more space on root virtual PCI bus");
76611+
76612+ unlock:
76613+ spin_unlock_irqrestore(&slot_dev->lock, flags);
76614+ out:
76615+ return err;
76616+}
76617+
76618+void pciback_release_pci_dev(struct pciback_device *pdev, struct pci_dev *dev)
76619+{
76620+ int slot, bus;
76621+ struct slot_dev_data *slot_dev = pdev->pci_dev_data;
76622+ struct pci_dev *found_dev = NULL;
76623+ unsigned long flags;
76624+
76625+ spin_lock_irqsave(&slot_dev->lock, flags);
76626+
76627+ for (bus = 0; bus < PCI_BUS_NBR; bus++)
76628+ for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
76629+ if (slot_dev->slots[bus][slot] == dev) {
76630+ slot_dev->slots[bus][slot] = NULL;
76631+ found_dev = dev;
76632+ goto out;
76633+ }
76634+ }
76635+
76636+ out:
76637+ spin_unlock_irqrestore(&slot_dev->lock, flags);
76638+
76639+ if (found_dev)
76640+ pcistub_put_pci_dev(found_dev);
76641+}
76642+
76643+int pciback_init_devices(struct pciback_device *pdev)
76644+{
76645+ int slot, bus;
76646+ struct slot_dev_data *slot_dev;
76647+
76648+ slot_dev = kmalloc(sizeof(*slot_dev), GFP_KERNEL);
76649+ if (!slot_dev)
76650+ return -ENOMEM;
76651+
76652+ spin_lock_init(&slot_dev->lock);
76653+
76654+ for (bus = 0; bus < PCI_BUS_NBR; bus++)
76655+ for (slot = 0; slot < PCI_SLOT_MAX; slot++)
76656+ slot_dev->slots[bus][slot] = NULL;
76657+
76658+ pdev->pci_dev_data = slot_dev;
76659+
76660+ return 0;
76661+}
76662+
76663+int pciback_publish_pci_roots(struct pciback_device *pdev,
76664+ publish_pci_root_cb publish_cb)
76665+{
76666+ /* The Virtual PCI bus has only one root */
76667+ return publish_cb(pdev, 0, 0);
76668+}
76669+
76670+void pciback_release_devices(struct pciback_device *pdev)
76671+{
76672+ int slot, bus;
76673+ struct slot_dev_data *slot_dev = pdev->pci_dev_data;
76674+ struct pci_dev *dev;
76675+
76676+ for (bus = 0; bus < PCI_BUS_NBR; bus++)
76677+ for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
76678+ dev = slot_dev->slots[bus][slot];
76679+ if (dev != NULL)
76680+ pcistub_put_pci_dev(dev);
76681+ }
76682+
76683+ kfree(slot_dev);
76684+ pdev->pci_dev_data = NULL;
76685+}
76686diff -Nur linux-2.6.16.33-noxen/drivers/xen/pciback/vpci.c linux-2.6.16.33/drivers/xen/pciback/vpci.c
76687--- linux-2.6.16.33-noxen/drivers/xen/pciback/vpci.c 1970-01-01 00:00:00.000000000 +0000
76688+++ linux-2.6.16.33/drivers/xen/pciback/vpci.c 2007-01-08 15:00:45.000000000 +0000
76689@@ -0,0 +1,204 @@
76690+/*
76691+ * PCI Backend - Provides a Virtual PCI bus (with real devices)
76692+ * to the frontend
76693+ *
76694+ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
76695+ */
76696+
76697+#include <linux/list.h>
76698+#include <linux/slab.h>
76699+#include <linux/pci.h>
76700+#include <linux/spinlock.h>
76701+#include "pciback.h"
76702+
76703+#define PCI_SLOT_MAX 32
76704+
76705+struct vpci_dev_data {
76706+ /* Access to dev_list must be protected by lock */
76707+ struct list_head dev_list[PCI_SLOT_MAX];
76708+ spinlock_t lock;
76709+};
76710+
76711+static inline struct list_head *list_first(struct list_head *head)
76712+{
76713+ return head->next;
76714+}
76715+
76716+struct pci_dev *pciback_get_pci_dev(struct pciback_device *pdev,
76717+ unsigned int domain, unsigned int bus,
76718+ unsigned int devfn)
76719+{
76720+ struct pci_dev_entry *entry;
76721+ struct pci_dev *dev = NULL;
76722+ struct vpci_dev_data *vpci_dev = pdev->pci_dev_data;
76723+ unsigned long flags;
76724+
76725+ if (domain != 0 || bus != 0)
76726+ return NULL;
76727+
76728+ if (PCI_SLOT(devfn) < PCI_SLOT_MAX) {
76729+ spin_lock_irqsave(&vpci_dev->lock, flags);
76730+
76731+ list_for_each_entry(entry,
76732+ &vpci_dev->dev_list[PCI_SLOT(devfn)],
76733+ list) {
76734+ if (PCI_FUNC(entry->dev->devfn) == PCI_FUNC(devfn)) {
76735+ dev = entry->dev;
76736+ break;
76737+ }
76738+ }
76739+
76740+ spin_unlock_irqrestore(&vpci_dev->lock, flags);
76741+ }
76742+ return dev;
76743+}
76744+
76745+static inline int match_slot(struct pci_dev *l, struct pci_dev *r)
76746+{
76747+ if (pci_domain_nr(l->bus) == pci_domain_nr(r->bus)
76748+ && l->bus == r->bus && PCI_SLOT(l->devfn) == PCI_SLOT(r->devfn))
76749+ return 1;
76750+
76751+ return 0;
76752+}
76753+
76754+int pciback_add_pci_dev(struct pciback_device *pdev, struct pci_dev *dev)
76755+{
76756+ int err = 0, slot;
76757+ struct pci_dev_entry *t, *dev_entry;
76758+ struct vpci_dev_data *vpci_dev = pdev->pci_dev_data;
76759+ unsigned long flags;
76760+
76761+ if ((dev->class >> 24) == PCI_BASE_CLASS_BRIDGE) {
76762+ err = -EFAULT;
76763+ xenbus_dev_fatal(pdev->xdev, err,
76764+ "Can't export bridges on the virtual PCI bus");
76765+ goto out;
76766+ }
76767+
76768+ dev_entry = kmalloc(sizeof(*dev_entry), GFP_KERNEL);
76769+ if (!dev_entry) {
76770+ err = -ENOMEM;
76771+ xenbus_dev_fatal(pdev->xdev, err,
76772+ "Error adding entry to virtual PCI bus");
76773+ goto out;
76774+ }
76775+
76776+ dev_entry->dev = dev;
76777+
76778+ spin_lock_irqsave(&vpci_dev->lock, flags);
76779+
76780+ /* Keep multi-function devices together on the virtual PCI bus */
76781+ for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
76782+ if (!list_empty(&vpci_dev->dev_list[slot])) {
76783+ t = list_entry(list_first(&vpci_dev->dev_list[slot]),
76784+ struct pci_dev_entry, list);
76785+
76786+ if (match_slot(dev, t->dev)) {
76787+ pr_info("pciback: vpci: %s: "
76788+ "assign to virtual slot %d func %d\n",
76789+ pci_name(dev), slot,
76790+ PCI_FUNC(dev->devfn));
76791+ list_add_tail(&dev_entry->list,
76792+ &vpci_dev->dev_list[slot]);
76793+ goto unlock;
76794+ }
76795+ }
76796+ }
76797+
76798+ /* Assign to a new slot on the virtual PCI bus */
76799+ for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
76800+ if (list_empty(&vpci_dev->dev_list[slot])) {
76801+ printk(KERN_INFO
76802+ "pciback: vpci: %s: assign to virtual slot %d\n",
76803+ pci_name(dev), slot);
76804+ list_add_tail(&dev_entry->list,
76805+ &vpci_dev->dev_list[slot]);
76806+ goto unlock;
76807+ }
76808+ }
76809+
76810+ err = -ENOMEM;
76811+ xenbus_dev_fatal(pdev->xdev, err,
76812+ "No more space on root virtual PCI bus");
76813+
76814+ unlock:
76815+ spin_unlock_irqrestore(&vpci_dev->lock, flags);
76816+ out:
76817+ return err;
76818+}
76819+
76820+void pciback_release_pci_dev(struct pciback_device *pdev, struct pci_dev *dev)
76821+{
76822+ int slot;
76823+ struct vpci_dev_data *vpci_dev = pdev->pci_dev_data;
76824+ struct pci_dev *found_dev = NULL;
76825+ unsigned long flags;
76826+
76827+ spin_lock_irqsave(&vpci_dev->lock, flags);
76828+
76829+ for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
76830+ struct pci_dev_entry *e, *tmp;
76831+ list_for_each_entry_safe(e, tmp, &vpci_dev->dev_list[slot],
76832+ list) {
76833+ if (e->dev == dev) {
76834+ list_del(&e->list);
76835+ found_dev = e->dev;
76836+ kfree(e);
76837+ goto out;
76838+ }
76839+ }
76840+ }
76841+
76842+ out:
76843+ spin_unlock_irqrestore(&vpci_dev->lock, flags);
76844+
76845+ if (found_dev)
76846+ pcistub_put_pci_dev(found_dev);
76847+}
76848+
76849+int pciback_init_devices(struct pciback_device *pdev)
76850+{
76851+ int slot;
76852+ struct vpci_dev_data *vpci_dev;
76853+
76854+ vpci_dev = kmalloc(sizeof(*vpci_dev), GFP_KERNEL);
76855+ if (!vpci_dev)
76856+ return -ENOMEM;
76857+
76858+ spin_lock_init(&vpci_dev->lock);
76859+
76860+ for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
76861+ INIT_LIST_HEAD(&vpci_dev->dev_list[slot]);
76862+ }
76863+
76864+ pdev->pci_dev_data = vpci_dev;
76865+
76866+ return 0;
76867+}
76868+
76869+int pciback_publish_pci_roots(struct pciback_device *pdev,
76870+ publish_pci_root_cb publish_cb)
76871+{
76872+ /* The Virtual PCI bus has only one root */
76873+ return publish_cb(pdev, 0, 0);
76874+}
76875+
76876+void pciback_release_devices(struct pciback_device *pdev)
76877+{
76878+ int slot;
76879+ struct vpci_dev_data *vpci_dev = pdev->pci_dev_data;
76880+
76881+ for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
76882+ struct pci_dev_entry *e, *tmp;
76883+ list_for_each_entry_safe(e, tmp, &vpci_dev->dev_list[slot],
76884+ list) {
76885+ list_del(&e->list);
76886+ pcistub_put_pci_dev(e->dev);
76887+ kfree(e);
76888+ }
76889+ }
76890+
76891+ kfree(vpci_dev);
76892+ pdev->pci_dev_data = NULL;
76893+}
76894diff -Nur linux-2.6.16.33-noxen/drivers/xen/pciback/xenbus.c linux-2.6.16.33/drivers/xen/pciback/xenbus.c
76895--- linux-2.6.16.33-noxen/drivers/xen/pciback/xenbus.c 1970-01-01 00:00:00.000000000 +0000
76896+++ linux-2.6.16.33/drivers/xen/pciback/xenbus.c 2007-01-08 15:00:45.000000000 +0000
76897@@ -0,0 +1,458 @@
76898+/*
76899+ * PCI Backend Xenbus Setup - handles setup with frontend and xend
76900+ *
76901+ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
76902+ */
76903+#include <linux/module.h>
76904+#include <linux/init.h>
76905+#include <linux/list.h>
76906+#include <linux/vmalloc.h>
76907+#include <xen/xenbus.h>
76908+#include <xen/evtchn.h>
76909+#include "pciback.h"
76910+
76911+#define INVALID_EVTCHN_IRQ (-1)
76912+
76913+static struct pciback_device *alloc_pdev(struct xenbus_device *xdev)
76914+{
76915+ struct pciback_device *pdev;
76916+
76917+ pdev = kzalloc(sizeof(struct pciback_device), GFP_KERNEL);
76918+ if (pdev == NULL)
76919+ goto out;
76920+ dev_dbg(&xdev->dev, "allocated pdev @ 0x%p\n", pdev);
76921+
76922+ pdev->xdev = xdev;
76923+ xdev->dev.driver_data = pdev;
76924+
76925+ spin_lock_init(&pdev->dev_lock);
76926+
76927+ pdev->sh_area = NULL;
76928+ pdev->sh_info = NULL;
76929+ pdev->evtchn_irq = INVALID_EVTCHN_IRQ;
76930+ pdev->be_watching = 0;
76931+
76932+ INIT_WORK(&pdev->op_work, pciback_do_op, pdev);
76933+
76934+ if (pciback_init_devices(pdev)) {
76935+ kfree(pdev);
76936+ pdev = NULL;
76937+ }
76938+ out:
76939+ return pdev;
76940+}
76941+
76942+static void free_pdev(struct pciback_device *pdev)
76943+{
76944+ if (pdev->be_watching)
76945+ unregister_xenbus_watch(&pdev->be_watch);
76946+
76947+ /* Ensure the guest can't trigger our handler before removing devices */
76948+ if (pdev->evtchn_irq != INVALID_EVTCHN_IRQ)
76949+ unbind_from_irqhandler(pdev->evtchn_irq, pdev);
76950+
76951+ /* If the driver domain started an op, make sure we complete it or
76952+ * delete it before releasing the shared memory */
76953+ cancel_delayed_work(&pdev->op_work);
76954+ flush_scheduled_work();
76955+
76956+ if (pdev->sh_info)
76957+ xenbus_unmap_ring_vfree(pdev->xdev, pdev->sh_area);
76958+
76959+ pciback_release_devices(pdev);
76960+
76961+ pdev->xdev->dev.driver_data = NULL;
76962+ pdev->xdev = NULL;
76963+
76964+ kfree(pdev);
76965+}
76966+
76967+static int pciback_do_attach(struct pciback_device *pdev, int gnt_ref,
76968+ int remote_evtchn)
76969+{
76970+ int err = 0;
76971+ int evtchn;
76972+ struct vm_struct *area;
76973+
76974+ dev_dbg(&pdev->xdev->dev,
76975+ "Attaching to frontend resources - gnt_ref=%d evtchn=%d\n",
76976+ gnt_ref, remote_evtchn);
76977+
76978+ area = xenbus_map_ring_valloc(pdev->xdev, gnt_ref);
76979+ if (IS_ERR(area)) {
76980+ err = PTR_ERR(area);
76981+ goto out;
76982+ }
76983+ pdev->sh_area = area;
76984+ pdev->sh_info = area->addr;
76985+
76986+ err = xenbus_bind_evtchn(pdev->xdev, remote_evtchn, &evtchn);
76987+ if (err)
76988+ goto out;
76989+
76990+ err = bind_evtchn_to_irqhandler(evtchn, pciback_handle_event,
76991+ SA_SAMPLE_RANDOM, "pciback", pdev);
76992+ if (err < 0) {
76993+ xenbus_dev_fatal(pdev->xdev, err,
76994+ "Error binding event channel to IRQ");
76995+ goto out;
76996+ }
76997+ pdev->evtchn_irq = err;
76998+ err = 0;
76999+
77000+ dev_dbg(&pdev->xdev->dev, "Attached!\n");
77001+ out:
77002+ return err;
77003+}
77004+
77005+static int pciback_attach(struct pciback_device *pdev)
77006+{
77007+ int err = 0;
77008+ int gnt_ref, remote_evtchn;
77009+ char *magic = NULL;
77010+
77011+ spin_lock(&pdev->dev_lock);
77012+
77013+ /* Make sure we only do this setup once */
77014+ if (xenbus_read_driver_state(pdev->xdev->nodename) !=
77015+ XenbusStateInitialised)
77016+ goto out;
77017+
77018+ /* Wait for frontend to state that it has published the configuration */
77019+ if (xenbus_read_driver_state(pdev->xdev->otherend) !=
77020+ XenbusStateInitialised)
77021+ goto out;
77022+
77023+ dev_dbg(&pdev->xdev->dev, "Reading frontend config\n");
77024+
77025+ err = xenbus_gather(XBT_NIL, pdev->xdev->otherend,
77026+ "pci-op-ref", "%u", &gnt_ref,
77027+ "event-channel", "%u", &remote_evtchn,
77028+ "magic", NULL, &magic, NULL);
77029+ if (err) {
77030+ /* If configuration didn't get read correctly, wait longer */
77031+ xenbus_dev_fatal(pdev->xdev, err,
77032+ "Error reading configuration from frontend");
77033+ goto out;
77034+ }
77035+
77036+ if (magic == NULL || strcmp(magic, XEN_PCI_MAGIC) != 0) {
77037+ xenbus_dev_fatal(pdev->xdev, -EFAULT,
77038+ "version mismatch (%s/%s) with pcifront - "
77039+ "halting pciback",
77040+ magic, XEN_PCI_MAGIC);
77041+ goto out;
77042+ }
77043+
77044+ err = pciback_do_attach(pdev, gnt_ref, remote_evtchn);
77045+ if (err)
77046+ goto out;
77047+
77048+ dev_dbg(&pdev->xdev->dev, "Connecting...\n");
77049+
77050+ err = xenbus_switch_state(pdev->xdev, XenbusStateConnected);
77051+ if (err)
77052+ xenbus_dev_fatal(pdev->xdev, err,
77053+ "Error switching to connected state!");
77054+
77055+ dev_dbg(&pdev->xdev->dev, "Connected? %d\n", err);
77056+ out:
77057+ spin_unlock(&pdev->dev_lock);
77058+
77059+ if (magic)
77060+ kfree(magic);
77061+
77062+ return err;
77063+}
77064+
77065+static void pciback_frontend_changed(struct xenbus_device *xdev,
77066+ enum xenbus_state fe_state)
77067+{
77068+ struct pciback_device *pdev = xdev->dev.driver_data;
77069+
77070+ dev_dbg(&xdev->dev, "fe state changed %d\n", fe_state);
77071+
77072+ switch (fe_state) {
77073+ case XenbusStateInitialised:
77074+ pciback_attach(pdev);
77075+ break;
77076+
77077+ case XenbusStateClosing:
77078+ xenbus_switch_state(xdev, XenbusStateClosing);
77079+ break;
77080+
77081+ case XenbusStateUnknown:
77082+ case XenbusStateClosed:
77083+ dev_dbg(&xdev->dev, "frontend is gone! unregister device\n");
77084+ device_unregister(&xdev->dev);
77085+ break;
77086+
77087+ default:
77088+ break;
77089+ }
77090+}
77091+
77092+static int pciback_publish_pci_root(struct pciback_device *pdev,
77093+ unsigned int domain, unsigned int bus)
77094+{
77095+ unsigned int d, b;
77096+ int i, root_num, len, err;
77097+ char str[64];
77098+
77099+ dev_dbg(&pdev->xdev->dev, "Publishing pci roots\n");
77100+
77101+ err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename,
77102+ "root_num", "%d", &root_num);
77103+ if (err == 0 || err == -ENOENT)
77104+ root_num = 0;
77105+ else if (err < 0)
77106+ goto out;
77107+
77108+ /* Verify that we haven't already published this pci root */
77109+ for (i = 0; i < root_num; i++) {
77110+ len = snprintf(str, sizeof(str), "root-%d", i);
77111+ if (unlikely(len >= (sizeof(str) - 1))) {
77112+ err = -ENOMEM;
77113+ goto out;
77114+ }
77115+
77116+ err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename,
77117+ str, "%x:%x", &d, &b);
77118+ if (err < 0)
77119+ goto out;
77120+ if (err != 2) {
77121+ err = -EINVAL;
77122+ goto out;
77123+ }
77124+
77125+ if (d == domain && b == bus) {
77126+ err = 0;
77127+ goto out;
77128+ }
77129+ }
77130+
77131+ len = snprintf(str, sizeof(str), "root-%d", root_num);
77132+ if (unlikely(len >= (sizeof(str) - 1))) {
77133+ err = -ENOMEM;
77134+ goto out;
77135+ }
77136+
77137+ dev_dbg(&pdev->xdev->dev, "writing root %d at %04x:%02x\n",
77138+ root_num, domain, bus);
77139+
77140+ err = xenbus_printf(XBT_NIL, pdev->xdev->nodename, str,
77141+ "%04x:%02x", domain, bus);
77142+ if (err)
77143+ goto out;
77144+
77145+ err = xenbus_printf(XBT_NIL, pdev->xdev->nodename,
77146+ "root_num", "%d", (root_num + 1));
77147+
77148+ out:
77149+ return err;
77150+}
77151+
77152+static int pciback_export_device(struct pciback_device *pdev,
77153+ int domain, int bus, int slot, int func)
77154+{
77155+ struct pci_dev *dev;
77156+ int err = 0;
77157+
77158+ dev_dbg(&pdev->xdev->dev, "exporting dom %x bus %x slot %x func %x\n",
77159+ domain, bus, slot, func);
77160+
77161+ dev = pcistub_get_pci_dev_by_slot(pdev, domain, bus, slot, func);
77162+ if (!dev) {
77163+ err = -EINVAL;
77164+ xenbus_dev_fatal(pdev->xdev, err,
77165+ "Couldn't locate PCI device "
77166+ "(%04x:%02x:%02x.%01x)! "
77167+ "perhaps already in-use?",
77168+ domain, bus, slot, func);
77169+ goto out;
77170+ }
77171+
77172+ err = pciback_add_pci_dev(pdev, dev);
77173+ if (err)
77174+ goto out;
77175+
77176+ /* TODO: It'd be nice to export a bridge and have all of its children
77177+ * get exported with it. This may be best done in xend (which will
77178+ * have to calculate resource usage anyway) but we probably want to
77179+ * put something in here to ensure that if a bridge gets given to a
77180+ * driver domain, that all devices under that bridge are not given
77181+ * to other driver domains (as he who controls the bridge can disable
77182+ * it and stop the other devices from working).
77183+ */
77184+ out:
77185+ return err;
77186+}
77187+
77188+static int pciback_setup_backend(struct pciback_device *pdev)
77189+{
77190+ /* Get configuration from xend (if available now) */
77191+ int domain, bus, slot, func;
77192+ int err = 0;
77193+ int i, num_devs;
77194+ char dev_str[64];
77195+
77196+ spin_lock(&pdev->dev_lock);
77197+
77198+ /* It's possible we could get the call to setup twice, so make sure
77199+ * we're not already connected.
77200+ */
77201+ if (xenbus_read_driver_state(pdev->xdev->nodename) !=
77202+ XenbusStateInitWait)
77203+ goto out;
77204+
77205+ dev_dbg(&pdev->xdev->dev, "getting be setup\n");
77206+
77207+ err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename, "num_devs", "%d",
77208+ &num_devs);
77209+ if (err != 1) {
77210+ if (err >= 0)
77211+ err = -EINVAL;
77212+ xenbus_dev_fatal(pdev->xdev, err,
77213+ "Error reading number of devices");
77214+ goto out;
77215+ }
77216+
77217+ for (i = 0; i < num_devs; i++) {
77218+ int l = snprintf(dev_str, sizeof(dev_str), "dev-%d", i);
77219+ if (unlikely(l >= (sizeof(dev_str) - 1))) {
77220+ err = -ENOMEM;
77221+ xenbus_dev_fatal(pdev->xdev, err,
77222+ "String overflow while reading "
77223+ "configuration");
77224+ goto out;
77225+ }
77226+
77227+ err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename, dev_str,
77228+ "%x:%x:%x.%x", &domain, &bus, &slot, &func);
77229+ if (err < 0) {
77230+ xenbus_dev_fatal(pdev->xdev, err,
77231+ "Error reading device configuration");
77232+ goto out;
77233+ }
77234+ if (err != 4) {
77235+ err = -EINVAL;
77236+ xenbus_dev_fatal(pdev->xdev, err,
77237+ "Error parsing pci device "
77238+ "configuration");
77239+ goto out;
77240+ }
77241+
77242+ err = pciback_export_device(pdev, domain, bus, slot, func);
77243+ if (err)
77244+ goto out;
77245+ }
77246+
77247+ err = pciback_publish_pci_roots(pdev, pciback_publish_pci_root);
77248+ if (err) {
77249+ xenbus_dev_fatal(pdev->xdev, err,
77250+				 "Error while publishing PCI root buses "
77251+ "for frontend");
77252+ goto out;
77253+ }
77254+
77255+ err = xenbus_switch_state(pdev->xdev, XenbusStateInitialised);
77256+ if (err)
77257+ xenbus_dev_fatal(pdev->xdev, err,
77258+ "Error switching to initialised state!");
77259+
77260+ out:
77261+ spin_unlock(&pdev->dev_lock);
77262+
77263+ if (!err)
77264+ /* see if pcifront is already configured (if not, we'll wait) */
77265+ pciback_attach(pdev);
77266+
77267+ return err;
77268+}
77269+
77270+static void pciback_be_watch(struct xenbus_watch *watch,
77271+ const char **vec, unsigned int len)
77272+{
77273+ struct pciback_device *pdev =
77274+ container_of(watch, struct pciback_device, be_watch);
77275+
77276+ switch (xenbus_read_driver_state(pdev->xdev->nodename)) {
77277+ case XenbusStateInitWait:
77278+ pciback_setup_backend(pdev);
77279+ break;
77280+
77281+ default:
77282+ break;
77283+ }
77284+}
77285+
77286+static int pciback_xenbus_probe(struct xenbus_device *dev,
77287+ const struct xenbus_device_id *id)
77288+{
77289+ int err = 0;
77290+ struct pciback_device *pdev = alloc_pdev(dev);
77291+
77292+ if (pdev == NULL) {
77293+ err = -ENOMEM;
77294+ xenbus_dev_fatal(dev, err,
77295+ "Error allocating pciback_device struct");
77296+ goto out;
77297+ }
77298+
77299+ /* wait for xend to configure us */
77300+ err = xenbus_switch_state(dev, XenbusStateInitWait);
77301+ if (err)
77302+ goto out;
77303+
77304+ /* watch the backend node for backend configuration information */
77305+ err = xenbus_watch_path(dev, dev->nodename, &pdev->be_watch,
77306+ pciback_be_watch);
77307+ if (err)
77308+ goto out;
77309+ pdev->be_watching = 1;
77310+
77311+ /* We need to force a call to our callback here in case
77312+ * xend already configured us!
77313+ */
77314+ pciback_be_watch(&pdev->be_watch, NULL, 0);
77315+
77316+ out:
77317+ return err;
77318+}
77319+
77320+static int pciback_xenbus_remove(struct xenbus_device *dev)
77321+{
77322+ struct pciback_device *pdev = dev->dev.driver_data;
77323+
77324+ if (pdev != NULL)
77325+ free_pdev(pdev);
77326+
77327+ return 0;
77328+}
77329+
77330+static struct xenbus_device_id xenpci_ids[] = {
77331+ {"pci"},
77332+ {{0}},
77333+};
77334+
77335+static struct xenbus_driver xenbus_pciback_driver = {
77336+ .name = "pciback",
77337+ .owner = THIS_MODULE,
77338+ .ids = xenpci_ids,
77339+ .probe = pciback_xenbus_probe,
77340+ .remove = pciback_xenbus_remove,
77341+ .otherend_changed = pciback_frontend_changed,
77342+};
77343+
77344+int __init pciback_xenbus_register(void)
77345+{
77346+ if (!is_running_on_xen())
77347+ return -ENODEV;
77348+
77349+ return xenbus_register_backend(&xenbus_pciback_driver);
77350+}
77351+
77352+void __exit pciback_xenbus_unregister(void)
77353+{
77354+ xenbus_unregister_driver(&xenbus_pciback_driver);
77355+}
77356diff -Nur linux-2.6.16.33-noxen/drivers/xen/pcifront/Makefile linux-2.6.16.33/drivers/xen/pcifront/Makefile
77357--- linux-2.6.16.33-noxen/drivers/xen/pcifront/Makefile 1970-01-01 00:00:00.000000000 +0000
77358+++ linux-2.6.16.33/drivers/xen/pcifront/Makefile 2007-01-08 15:00:45.000000000 +0000
77359@@ -0,0 +1,7 @@
77360+obj-y += pcifront.o
77361+
77362+pcifront-y := pci_op.o xenbus.o pci.o
77363+
77364+ifeq ($(CONFIG_XEN_PCIDEV_FE_DEBUG),y)
77365+EXTRA_CFLAGS += -DDEBUG
77366+endif
77367diff -Nur linux-2.6.16.33-noxen/drivers/xen/pcifront/pci.c linux-2.6.16.33/drivers/xen/pcifront/pci.c
77368--- linux-2.6.16.33-noxen/drivers/xen/pcifront/pci.c 1970-01-01 00:00:00.000000000 +0000
77369+++ linux-2.6.16.33/drivers/xen/pcifront/pci.c 2007-01-08 15:00:45.000000000 +0000
77370@@ -0,0 +1,46 @@
77371+/*
77372+ * PCI Frontend Operations - ensure only one PCI frontend runs at a time
77373+ *
77374+ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
77375+ */
77376+#include <linux/module.h>
77377+#include <linux/init.h>
77378+#include <linux/pci.h>
77379+#include <linux/spinlock.h>
77380+#include "pcifront.h"
77381+
77382+DEFINE_SPINLOCK(pcifront_dev_lock);
77383+static struct pcifront_device *pcifront_dev = NULL;
77384+
77385+int pcifront_connect(struct pcifront_device *pdev)
77386+{
77387+ int err = 0;
77388+
77389+ spin_lock(&pcifront_dev_lock);
77390+
77391+ if (!pcifront_dev) {
77392+ dev_info(&pdev->xdev->dev, "Installing PCI frontend\n");
77393+ pcifront_dev = pdev;
77394+ }
77395+ else {
77396+ dev_err(&pdev->xdev->dev, "PCI frontend already installed!\n");
77397+ err = -EEXIST;
77398+ }
77399+
77400+ spin_unlock(&pcifront_dev_lock);
77401+
77402+ return err;
77403+}
77404+
77405+void pcifront_disconnect(struct pcifront_device *pdev)
77406+{
77407+ spin_lock(&pcifront_dev_lock);
77408+
77409+ if (pdev == pcifront_dev) {
77410+ dev_info(&pdev->xdev->dev,
77411+ "Disconnecting PCI Frontend Buses\n");
77412+ pcifront_dev = NULL;
77413+ }
77414+
77415+ spin_unlock(&pcifront_dev_lock);
77416+}
77417diff -Nur linux-2.6.16.33-noxen/drivers/xen/pcifront/pci_op.c linux-2.6.16.33/drivers/xen/pcifront/pci_op.c
77418--- linux-2.6.16.33-noxen/drivers/xen/pcifront/pci_op.c 1970-01-01 00:00:00.000000000 +0000
77419+++ linux-2.6.16.33/drivers/xen/pcifront/pci_op.c 2007-01-08 15:00:45.000000000 +0000
77420@@ -0,0 +1,273 @@
77421+/*
77422+ * PCI Frontend Operations - Communicates with backend
77423+ *
77424+ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
77425+ */
77426+#include <linux/module.h>
77427+#include <linux/version.h>
77428+#include <linux/init.h>
77429+#include <linux/pci.h>
77430+#include <linux/spinlock.h>
77431+#include <linux/time.h>
77432+#include <xen/evtchn.h>
77433+#include "pcifront.h"
77434+
77435+static int verbose_request = 0;
77436+module_param(verbose_request, int, 0644);
77437+
77438+static int errno_to_pcibios_err(int errno)
77439+{
77440+ switch (errno) {
77441+ case XEN_PCI_ERR_success:
77442+ return PCIBIOS_SUCCESSFUL;
77443+
77444+ case XEN_PCI_ERR_dev_not_found:
77445+ return PCIBIOS_DEVICE_NOT_FOUND;
77446+
77447+ case XEN_PCI_ERR_invalid_offset:
77448+ case XEN_PCI_ERR_op_failed:
77449+ return PCIBIOS_BAD_REGISTER_NUMBER;
77450+
77451+ case XEN_PCI_ERR_not_implemented:
77452+ return PCIBIOS_FUNC_NOT_SUPPORTED;
77453+
77454+ case XEN_PCI_ERR_access_denied:
77455+ return PCIBIOS_SET_FAILED;
77456+ }
77457+ return errno;
77458+}
77459+
77460+static int do_pci_op(struct pcifront_device *pdev, struct xen_pci_op *op)
77461+{
77462+ int err = 0;
77463+ struct xen_pci_op *active_op = &pdev->sh_info->op;
77464+ unsigned long irq_flags;
77465+ evtchn_port_t port = pdev->evtchn;
77466+ nsec_t ns, ns_timeout;
77467+ struct timeval tv;
77468+
77469+ spin_lock_irqsave(&pdev->sh_info_lock, irq_flags);
77470+
77471+ memcpy(active_op, op, sizeof(struct xen_pci_op));
77472+
77473+ /* Go */
77474+ wmb();
77475+ set_bit(_XEN_PCIF_active, (unsigned long *)&pdev->sh_info->flags);
77476+ notify_remote_via_evtchn(port);
77477+
77478+ /*
77479+ * We set a poll timeout of 3 seconds but give up on return after
77480+ * 2 seconds. It is better to time out too late rather than too early
77481+ * (in the latter case we end up continually re-executing poll() with a
77482+ * timeout in the past). 1s difference gives plenty of slack for error.
77483+ */
77484+ do_gettimeofday(&tv);
77485+ ns_timeout = timeval_to_ns(&tv) + 2 * (nsec_t)NSEC_PER_SEC;
77486+
77487+ clear_evtchn(port);
77488+
77489+ while (test_bit(_XEN_PCIF_active,
77490+ (unsigned long *)&pdev->sh_info->flags)) {
77491+ if (HYPERVISOR_poll(&port, 1, jiffies + 3*HZ))
77492+ BUG();
77493+ clear_evtchn(port);
77494+ do_gettimeofday(&tv);
77495+ ns = timeval_to_ns(&tv);
77496+ if (ns > ns_timeout) {
77497+ dev_err(&pdev->xdev->dev,
77498+ "pciback not responding!!!\n");
77499+ clear_bit(_XEN_PCIF_active,
77500+ (unsigned long *)&pdev->sh_info->flags);
77501+ err = XEN_PCI_ERR_dev_not_found;
77502+ goto out;
77503+ }
77504+ }
77505+
77506+ memcpy(op, active_op, sizeof(struct xen_pci_op));
77507+
77508+ err = op->err;
77509+ out:
77510+ spin_unlock_irqrestore(&pdev->sh_info_lock, irq_flags);
77511+ return err;
77512+}
77513+
77514+/* Access to this function is spinlocked in drivers/pci/access.c */
77515+static int pcifront_bus_read(struct pci_bus *bus, unsigned int devfn,
77516+ int where, int size, u32 * val)
77517+{
77518+ int err = 0;
77519+ struct xen_pci_op op = {
77520+ .cmd = XEN_PCI_OP_conf_read,
77521+ .domain = pci_domain_nr(bus),
77522+ .bus = bus->number,
77523+ .devfn = devfn,
77524+ .offset = where,
77525+ .size = size,
77526+ };
77527+ struct pcifront_sd *sd = bus->sysdata;
77528+ struct pcifront_device *pdev = pcifront_get_pdev(sd);
77529+
77530+ if (verbose_request)
77531+ dev_info(&pdev->xdev->dev,
77532+ "read dev=%04x:%02x:%02x.%01x - offset %x size %d\n",
77533+ pci_domain_nr(bus), bus->number, PCI_SLOT(devfn),
77534+ PCI_FUNC(devfn), where, size);
77535+
77536+ err = do_pci_op(pdev, &op);
77537+
77538+ if (likely(!err)) {
77539+ if (verbose_request)
77540+ dev_info(&pdev->xdev->dev, "read got back value %x\n",
77541+ op.value);
77542+
77543+ *val = op.value;
77544+ } else if (err == -ENODEV) {
77545+ /* No device here, pretend that it just returned 0 */
77546+ err = 0;
77547+ *val = 0;
77548+ }
77549+
77550+ return errno_to_pcibios_err(err);
77551+}
77552+
77553+/* Access to this function is spinlocked in drivers/pci/access.c */
77554+static int pcifront_bus_write(struct pci_bus *bus, unsigned int devfn,
77555+ int where, int size, u32 val)
77556+{
77557+ struct xen_pci_op op = {
77558+ .cmd = XEN_PCI_OP_conf_write,
77559+ .domain = pci_domain_nr(bus),
77560+ .bus = bus->number,
77561+ .devfn = devfn,
77562+ .offset = where,
77563+ .size = size,
77564+ .value = val,
77565+ };
77566+ struct pcifront_sd *sd = bus->sysdata;
77567+ struct pcifront_device *pdev = pcifront_get_pdev(sd);
77568+
77569+ if (verbose_request)
77570+ dev_info(&pdev->xdev->dev,
77571+ "write dev=%04x:%02x:%02x.%01x - "
77572+ "offset %x size %d val %x\n",
77573+ pci_domain_nr(bus), bus->number,
77574+ PCI_SLOT(devfn), PCI_FUNC(devfn), where, size, val);
77575+
77576+ return errno_to_pcibios_err(do_pci_op(pdev, &op));
77577+}
77578+
77579+struct pci_ops pcifront_bus_ops = {
77580+ .read = pcifront_bus_read,
77581+ .write = pcifront_bus_write,
77582+};
77583+
77584+/* Claim resources for the PCI frontend as-is, backend won't allow changes */
77585+static void pcifront_claim_resource(struct pci_dev *dev, void *data)
77586+{
77587+ struct pcifront_device *pdev = data;
77588+ int i;
77589+ struct resource *r;
77590+
77591+ for (i = 0; i < PCI_NUM_RESOURCES; i++) {
77592+ r = &dev->resource[i];
77593+
77594+ if (!r->parent && r->start && r->flags) {
77595+ dev_dbg(&pdev->xdev->dev, "claiming resource %s/%d\n",
77596+ pci_name(dev), i);
77597+ pci_claim_resource(dev, i);
77598+ }
77599+ }
77600+}
77601+
77602+int pcifront_scan_root(struct pcifront_device *pdev,
77603+ unsigned int domain, unsigned int bus)
77604+{
77605+ struct pci_bus *b;
77606+ struct pcifront_sd *sd = NULL;
77607+ struct pci_bus_entry *bus_entry = NULL;
77608+ int err = 0;
77609+
77610+#ifndef CONFIG_PCI_DOMAINS
77611+ if (domain != 0) {
77612+ dev_err(&pdev->xdev->dev,
77613+ "PCI Root in non-zero PCI Domain! domain=%d\n", domain);
77614+ dev_err(&pdev->xdev->dev,
77615+ "Please compile with CONFIG_PCI_DOMAINS\n");
77616+ err = -EINVAL;
77617+ goto err_out;
77618+ }
77619+#endif
77620+
77621+ dev_info(&pdev->xdev->dev, "Creating PCI Frontend Bus %04x:%02x\n",
77622+ domain, bus);
77623+
77624+ bus_entry = kmalloc(sizeof(*bus_entry), GFP_KERNEL);
77625+ sd = kmalloc(sizeof(*sd), GFP_KERNEL);
77626+ if (!bus_entry || !sd) {
77627+ err = -ENOMEM;
77628+ goto err_out;
77629+ }
77630+ pcifront_init_sd(sd, domain, pdev);
77631+
77632+ b = pci_scan_bus_parented(&pdev->xdev->dev, bus,
77633+ &pcifront_bus_ops, sd);
77634+ if (!b) {
77635+ dev_err(&pdev->xdev->dev,
77636+ "Error creating PCI Frontend Bus!\n");
77637+ err = -ENOMEM;
77638+ goto err_out;
77639+ }
77640+ bus_entry->bus = b;
77641+
77642+ list_add(&bus_entry->list, &pdev->root_buses);
77643+
77644+ /* Claim resources before going "live" with our devices */
77645+ pci_walk_bus(b, pcifront_claim_resource, pdev);
77646+
77647+ pci_bus_add_devices(b);
77648+
77649+ return 0;
77650+
77651+ err_out:
77652+ kfree(bus_entry);
77653+ kfree(sd);
77654+
77655+ return err;
77656+}
77657+
77658+static void free_root_bus_devs(struct pci_bus *bus)
77659+{
77660+ struct pci_dev *dev;
77661+
77662+ spin_lock(&pci_bus_lock);
77663+ while (!list_empty(&bus->devices)) {
77664+ dev = container_of(bus->devices.next, struct pci_dev, bus_list);
77665+ spin_unlock(&pci_bus_lock);
77666+
77667+ dev_dbg(&dev->dev, "removing device\n");
77668+ pci_remove_bus_device(dev);
77669+
77670+ spin_lock(&pci_bus_lock);
77671+ }
77672+ spin_unlock(&pci_bus_lock);
77673+}
77674+
77675+void pcifront_free_roots(struct pcifront_device *pdev)
77676+{
77677+ struct pci_bus_entry *bus_entry, *t;
77678+
77679+ dev_dbg(&pdev->xdev->dev, "cleaning up root buses\n");
77680+
77681+ list_for_each_entry_safe(bus_entry, t, &pdev->root_buses, list) {
77682+ list_del(&bus_entry->list);
77683+
77684+ free_root_bus_devs(bus_entry->bus);
77685+
77686+ kfree(bus_entry->bus->sysdata);
77687+
77688+ device_unregister(bus_entry->bus->bridge);
77689+ pci_remove_bus(bus_entry->bus);
77690+
77691+ kfree(bus_entry);
77692+ }
77693+}
77694diff -Nur linux-2.6.16.33-noxen/drivers/xen/pcifront/pcifront.h linux-2.6.16.33/drivers/xen/pcifront/pcifront.h
77695--- linux-2.6.16.33-noxen/drivers/xen/pcifront/pcifront.h 1970-01-01 00:00:00.000000000 +0000
77696+++ linux-2.6.16.33/drivers/xen/pcifront/pcifront.h 2007-01-08 15:00:45.000000000 +0000
77697@@ -0,0 +1,40 @@
77698+/*
77699+ * PCI Frontend - Common data structures & function declarations
77700+ *
77701+ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
77702+ */
77703+#ifndef __XEN_PCIFRONT_H__
77704+#define __XEN_PCIFRONT_H__
77705+
77706+#include <linux/spinlock.h>
77707+#include <linux/pci.h>
77708+#include <xen/xenbus.h>
77709+#include <xen/interface/io/pciif.h>
77710+#include <xen/pcifront.h>
77711+
77712+struct pci_bus_entry {
77713+ struct list_head list;
77714+ struct pci_bus *bus;
77715+};
77716+
77717+struct pcifront_device {
77718+ struct xenbus_device *xdev;
77719+ struct list_head root_buses;
77720+ spinlock_t dev_lock;
77721+
77722+ int evtchn;
77723+ int gnt_ref;
77724+
77725+ /* Lock this when doing any operations in sh_info */
77726+ spinlock_t sh_info_lock;
77727+ struct xen_pci_sharedinfo *sh_info;
77728+};
77729+
77730+int pcifront_connect(struct pcifront_device *pdev);
77731+void pcifront_disconnect(struct pcifront_device *pdev);
77732+
77733+int pcifront_scan_root(struct pcifront_device *pdev,
77734+ unsigned int domain, unsigned int bus);
77735+void pcifront_free_roots(struct pcifront_device *pdev);
77736+
77737+#endif /* __XEN_PCIFRONT_H__ */
77738diff -Nur linux-2.6.16.33-noxen/drivers/xen/pcifront/xenbus.c linux-2.6.16.33/drivers/xen/pcifront/xenbus.c
77739--- linux-2.6.16.33-noxen/drivers/xen/pcifront/xenbus.c 1970-01-01 00:00:00.000000000 +0000
77740+++ linux-2.6.16.33/drivers/xen/pcifront/xenbus.c 2007-01-08 15:00:45.000000000 +0000
77741@@ -0,0 +1,295 @@
77742+/*
77743+ * PCI Frontend Xenbus Setup - handles setup with backend (imports page/evtchn)
77744+ *
77745+ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
77746+ */
77747+#include <linux/module.h>
77748+#include <linux/init.h>
77749+#include <linux/mm.h>
77750+#include <xen/xenbus.h>
77751+#include <xen/gnttab.h>
77752+#include "pcifront.h"
77753+
77754+#define INVALID_GRANT_REF (0)
77755+#define INVALID_EVTCHN (-1)
77756+
77757+static struct pcifront_device *alloc_pdev(struct xenbus_device *xdev)
77758+{
77759+ struct pcifront_device *pdev;
77760+
77761+ pdev = kmalloc(sizeof(struct pcifront_device), GFP_KERNEL);
77762+ if (pdev == NULL)
77763+ goto out;
77764+
77765+ pdev->sh_info =
77766+ (struct xen_pci_sharedinfo *)__get_free_page(GFP_KERNEL);
77767+ if (pdev->sh_info == NULL) {
77768+ kfree(pdev);
77769+ pdev = NULL;
77770+ goto out;
77771+ }
77772+ pdev->sh_info->flags = 0;
77773+
77774+ xdev->dev.driver_data = pdev;
77775+ pdev->xdev = xdev;
77776+
77777+ INIT_LIST_HEAD(&pdev->root_buses);
77778+
77779+ spin_lock_init(&pdev->dev_lock);
77780+ spin_lock_init(&pdev->sh_info_lock);
77781+
77782+ pdev->evtchn = INVALID_EVTCHN;
77783+ pdev->gnt_ref = INVALID_GRANT_REF;
77784+
77785+ dev_dbg(&xdev->dev, "Allocated pdev @ 0x%p pdev->sh_info @ 0x%p\n",
77786+ pdev, pdev->sh_info);
77787+ out:
77788+ return pdev;
77789+}
77790+
77791+static void free_pdev(struct pcifront_device *pdev)
77792+{
77793+ dev_dbg(&pdev->xdev->dev, "freeing pdev @ 0x%p\n", pdev);
77794+
77795+ pcifront_free_roots(pdev);
77796+
77797+ if (pdev->evtchn != INVALID_EVTCHN)
77798+ xenbus_free_evtchn(pdev->xdev, pdev->evtchn);
77799+
77800+ if (pdev->gnt_ref != INVALID_GRANT_REF)
77801+ gnttab_end_foreign_access(pdev->gnt_ref, 0,
77802+ (unsigned long)pdev->sh_info);
77803+
77804+ pdev->xdev->dev.driver_data = NULL;
77805+
77806+ kfree(pdev);
77807+}
77808+
77809+static int pcifront_publish_info(struct pcifront_device *pdev)
77810+{
77811+ int err = 0;
77812+ struct xenbus_transaction trans;
77813+
77814+ err = xenbus_grant_ring(pdev->xdev, virt_to_mfn(pdev->sh_info));
77815+ if (err < 0)
77816+ goto out;
77817+
77818+ pdev->gnt_ref = err;
77819+
77820+ err = xenbus_alloc_evtchn(pdev->xdev, &pdev->evtchn);
77821+ if (err)
77822+ goto out;
77823+
77824+ do_publish:
77825+ err = xenbus_transaction_start(&trans);
77826+ if (err) {
77827+ xenbus_dev_fatal(pdev->xdev, err,
77828+ "Error writing configuration for backend "
77829+ "(start transaction)");
77830+ goto out;
77831+ }
77832+
77833+ err = xenbus_printf(trans, pdev->xdev->nodename,
77834+ "pci-op-ref", "%u", pdev->gnt_ref);
77835+ if (!err)
77836+ err = xenbus_printf(trans, pdev->xdev->nodename,
77837+ "event-channel", "%u", pdev->evtchn);
77838+ if (!err)
77839+ err = xenbus_printf(trans, pdev->xdev->nodename,
77840+ "magic", XEN_PCI_MAGIC);
77841+
77842+ if (err) {
77843+ xenbus_transaction_end(trans, 1);
77844+ xenbus_dev_fatal(pdev->xdev, err,
77845+ "Error writing configuration for backend");
77846+ goto out;
77847+ } else {
77848+ err = xenbus_transaction_end(trans, 0);
77849+ if (err == -EAGAIN)
77850+ goto do_publish;
77851+ else if (err) {
77852+ xenbus_dev_fatal(pdev->xdev, err,
77853+ "Error completing transaction "
77854+ "for backend");
77855+ goto out;
77856+ }
77857+ }
77858+
77859+ xenbus_switch_state(pdev->xdev, XenbusStateInitialised);
77860+
77861+ dev_dbg(&pdev->xdev->dev, "publishing successful!\n");
77862+
77863+ out:
77864+ return err;
77865+}
77866+
77867+static int pcifront_try_connect(struct pcifront_device *pdev)
77868+{
77869+ int err = -EFAULT;
77870+ int i, num_roots, len;
77871+ char str[64];
77872+ unsigned int domain, bus;
77873+
77874+ spin_lock(&pdev->dev_lock);
77875+
77876+ /* Only connect once */
77877+ if (xenbus_read_driver_state(pdev->xdev->nodename) !=
77878+ XenbusStateInitialised)
77879+ goto out;
77880+
77881+ err = pcifront_connect(pdev);
77882+ if (err) {
77883+ xenbus_dev_fatal(pdev->xdev, err,
77884+ "Error connecting PCI Frontend");
77885+ goto out;
77886+ }
77887+
77888+ err = xenbus_scanf(XBT_NIL, pdev->xdev->otherend,
77889+ "root_num", "%d", &num_roots);
77890+ if (err == -ENOENT) {
77891+ xenbus_dev_error(pdev->xdev, err,
77892+ "No PCI Roots found, trying 0000:00");
77893+ err = pcifront_scan_root(pdev, 0, 0);
77894+ num_roots = 0;
77895+ } else if (err != 1) {
77896+ if (err == 0)
77897+ err = -EINVAL;
77898+ xenbus_dev_fatal(pdev->xdev, err,
77899+ "Error reading number of PCI roots");
77900+ goto out;
77901+ }
77902+
77903+ for (i = 0; i < num_roots; i++) {
77904+ len = snprintf(str, sizeof(str), "root-%d", i);
77905+ if (unlikely(len >= (sizeof(str) - 1))) {
77906+ err = -ENOMEM;
77907+ goto out;
77908+ }
77909+
77910+ err = xenbus_scanf(XBT_NIL, pdev->xdev->otherend, str,
77911+ "%x:%x", &domain, &bus);
77912+ if (err != 2) {
77913+ if (err >= 0)
77914+ err = -EINVAL;
77915+ xenbus_dev_fatal(pdev->xdev, err,
77916+ "Error reading PCI root %d", i);
77917+ goto out;
77918+ }
77919+
77920+ err = pcifront_scan_root(pdev, domain, bus);
77921+ if (err) {
77922+ xenbus_dev_fatal(pdev->xdev, err,
77923+ "Error scanning PCI root %04x:%02x",
77924+ domain, bus);
77925+ goto out;
77926+ }
77927+ }
77928+
77929+ err = xenbus_switch_state(pdev->xdev, XenbusStateConnected);
77930+ if (err)
77931+ goto out;
77932+
77933+ out:
77934+ spin_unlock(&pdev->dev_lock);
77935+ return err;
77936+}
77937+
77938+static int pcifront_try_disconnect(struct pcifront_device *pdev)
77939+{
77940+ int err = 0;
77941+ enum xenbus_state prev_state;
77942+
77943+ spin_lock(&pdev->dev_lock);
77944+
77945+ prev_state = xenbus_read_driver_state(pdev->xdev->nodename);
77946+
77947+ if (prev_state < XenbusStateClosing)
77948+ err = xenbus_switch_state(pdev->xdev, XenbusStateClosing);
77949+
77950+ if (!err && prev_state == XenbusStateConnected)
77951+ pcifront_disconnect(pdev);
77952+
77953+ spin_unlock(&pdev->dev_lock);
77954+
77955+ return err;
77956+}
77957+
77958+static void pcifront_backend_changed(struct xenbus_device *xdev,
77959+ enum xenbus_state be_state)
77960+{
77961+ struct pcifront_device *pdev = xdev->dev.driver_data;
77962+
77963+ switch (be_state) {
77964+ case XenbusStateClosing:
77965+ dev_warn(&xdev->dev, "backend going away!\n");
77966+ pcifront_try_disconnect(pdev);
77967+ break;
77968+
77969+ case XenbusStateUnknown:
77970+ case XenbusStateClosed:
77971+ dev_warn(&xdev->dev, "backend went away!\n");
77972+ pcifront_try_disconnect(pdev);
77973+
77974+ device_unregister(&pdev->xdev->dev);
77975+ break;
77976+
77977+ case XenbusStateConnected:
77978+ pcifront_try_connect(pdev);
77979+ break;
77980+
77981+ default:
77982+ break;
77983+ }
77984+}
77985+
77986+static int pcifront_xenbus_probe(struct xenbus_device *xdev,
77987+ const struct xenbus_device_id *id)
77988+{
77989+ int err = 0;
77990+ struct pcifront_device *pdev = alloc_pdev(xdev);
77991+
77992+ if (pdev == NULL) {
77993+ err = -ENOMEM;
77994+ xenbus_dev_fatal(xdev, err,
77995+ "Error allocating pcifront_device struct");
77996+ goto out;
77997+ }
77998+
77999+ err = pcifront_publish_info(pdev);
78000+
78001+ out:
78002+ return err;
78003+}
78004+
78005+static int pcifront_xenbus_remove(struct xenbus_device *xdev)
78006+{
78007+ if (xdev->dev.driver_data)
78008+ free_pdev(xdev->dev.driver_data);
78009+
78010+ return 0;
78011+}
78012+
78013+static struct xenbus_device_id xenpci_ids[] = {
78014+ {"pci"},
78015+ {{0}},
78016+};
78017+
78018+static struct xenbus_driver xenbus_pcifront_driver = {
78019+ .name = "pcifront",
78020+ .owner = THIS_MODULE,
78021+ .ids = xenpci_ids,
78022+ .probe = pcifront_xenbus_probe,
78023+ .remove = pcifront_xenbus_remove,
78024+ .otherend_changed = pcifront_backend_changed,
78025+};
78026+
78027+static int __init pcifront_init(void)
78028+{
78029+ if (!is_running_on_xen())
78030+ return -ENODEV;
78031+
78032+ return xenbus_register_frontend(&xenbus_pcifront_driver);
78033+}
78034+
78035+/* Initialize after the Xen PCI Frontend Stub is initialized */
78036+subsys_initcall(pcifront_init);
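The publish path above uses the standard xenbus transaction idiom: start a transaction, write the keys, commit, and restart from the top when the commit returns -EAGAIN because the store changed underneath. A minimal sketch of that pattern, with a hypothetical key name and device pointer (not part of this patch):

	struct xenbus_transaction xbt;
	int err;

again:
	err = xenbus_transaction_start(&xbt);
	if (err)
		return err;

	/* Write one or more keys under the device's own xenstore node. */
	err = xenbus_printf(xbt, dev->nodename, "example-key", "%u", 1);
	if (err) {
		xenbus_transaction_end(xbt, 1);		/* abort the transaction */
		return err;
	}

	err = xenbus_transaction_end(xbt, 0);		/* commit */
	if (err == -EAGAIN)
		goto again;				/* store changed underneath us; retry */
	return err;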
78037diff -Nur linux-2.6.16.33-noxen/drivers/xen/privcmd/Makefile linux-2.6.16.33/drivers/xen/privcmd/Makefile
78038--- linux-2.6.16.33-noxen/drivers/xen/privcmd/Makefile 1970-01-01 00:00:00.000000000 +0000
78039+++ linux-2.6.16.33/drivers/xen/privcmd/Makefile 2007-01-08 15:00:45.000000000 +0000
78040@@ -0,0 +1,2 @@
78041+
78042+obj-$(CONFIG_XEN_PRIVCMD) := privcmd.o
78043diff -Nur linux-2.6.16.33-noxen/drivers/xen/privcmd/privcmd.c linux-2.6.16.33/drivers/xen/privcmd/privcmd.c
78044--- linux-2.6.16.33-noxen/drivers/xen/privcmd/privcmd.c 1970-01-01 00:00:00.000000000 +0000
78045+++ linux-2.6.16.33/drivers/xen/privcmd/privcmd.c 2007-01-08 15:00:45.000000000 +0000
78046@@ -0,0 +1,286 @@
78047+/******************************************************************************
78048+ * privcmd.c
78049+ *
78050+ * Interface to privileged domain-0 commands.
78051+ *
78052+ * Copyright (c) 2002-2004, K A Fraser, B Dragovic
78053+ */
78054+
78055+#include <linux/config.h>
78056+#include <linux/kernel.h>
78057+#include <linux/sched.h>
78058+#include <linux/slab.h>
78059+#include <linux/string.h>
78060+#include <linux/errno.h>
78061+#include <linux/mm.h>
78062+#include <linux/mman.h>
78063+#include <linux/swap.h>
78064+#include <linux/smp_lock.h>
78065+#include <linux/highmem.h>
78066+#include <linux/pagemap.h>
78067+#include <linux/seq_file.h>
78068+#include <linux/kthread.h>
78069+#include <asm/hypervisor.h>
78070+
78071+#include <asm/pgalloc.h>
78072+#include <asm/pgtable.h>
78073+#include <asm/uaccess.h>
78074+#include <asm/tlb.h>
78075+#include <asm/hypervisor.h>
78076+#include <xen/public/privcmd.h>
78077+#include <xen/interface/xen.h>
78078+#include <xen/interface/dom0_ops.h>
78079+#include <xen/xen_proc.h>
78080+
78081+static struct proc_dir_entry *privcmd_intf;
78082+static struct proc_dir_entry *capabilities_intf;
78083+
78084+#ifndef HAVE_ARCH_PRIVCMD_MMAP
78085+static int privcmd_enforce_singleshot_mapping(struct vm_area_struct *vma);
78086+#endif
78087+
78088+static int privcmd_ioctl(struct inode *inode, struct file *file,
78089+ unsigned int cmd, unsigned long data)
78090+{
78091+ int ret = -ENOSYS;
78092+ void __user *udata = (void __user *) data;
78093+
78094+ switch (cmd) {
78095+ case IOCTL_PRIVCMD_HYPERCALL: {
78096+ privcmd_hypercall_t hypercall;
78097+
78098+ if (copy_from_user(&hypercall, udata, sizeof(hypercall)))
78099+ return -EFAULT;
78100+
78101+#if defined(__i386__)
78102+ if (hypercall.op >= (PAGE_SIZE >> 5))
78103+ break;
78104+ __asm__ __volatile__ (
78105+ "pushl %%ebx; pushl %%ecx; pushl %%edx; "
78106+ "pushl %%esi; pushl %%edi; "
78107+ "movl 8(%%eax),%%ebx ;"
78108+ "movl 16(%%eax),%%ecx ;"
78109+ "movl 24(%%eax),%%edx ;"
78110+ "movl 32(%%eax),%%esi ;"
78111+ "movl 40(%%eax),%%edi ;"
78112+ "movl (%%eax),%%eax ;"
78113+ "shll $5,%%eax ;"
78114+ "addl $hypercall_page,%%eax ;"
78115+ "call *%%eax ;"
78116+ "popl %%edi; popl %%esi; popl %%edx; "
78117+ "popl %%ecx; popl %%ebx"
78118+ : "=a" (ret) : "0" (&hypercall) : "memory" );
78119+#elif defined (__x86_64__)
78120+ if (hypercall.op < (PAGE_SIZE >> 5)) {
78121+ long ign1, ign2, ign3;
78122+ __asm__ __volatile__ (
78123+ "movq %8,%%r10; movq %9,%%r8;"
78124+ "shll $5,%%eax ;"
78125+ "addq $hypercall_page,%%rax ;"
78126+ "call *%%rax"
78127+ : "=a" (ret), "=D" (ign1),
78128+ "=S" (ign2), "=d" (ign3)
78129+ : "0" ((unsigned int)hypercall.op),
78130+ "1" (hypercall.arg[0]),
78131+ "2" (hypercall.arg[1]),
78132+ "3" (hypercall.arg[2]),
78133+ "g" (hypercall.arg[3]),
78134+ "g" (hypercall.arg[4])
78135+ : "r8", "r10", "memory" );
78136+ }
78137+#elif defined (__ia64__)
78138+ ret = privcmd_hypercall(&hypercall);
78139+#endif
78140+ }
78141+ break;
78142+
78143+ case IOCTL_PRIVCMD_MMAP: {
78144+ privcmd_mmap_t mmapcmd;
78145+ privcmd_mmap_entry_t msg;
78146+ privcmd_mmap_entry_t __user *p;
78147+ struct mm_struct *mm = current->mm;
78148+ struct vm_area_struct *vma;
78149+ unsigned long va;
78150+ int i, rc;
78151+
78152+ if (!is_initial_xendomain())
78153+ return -EPERM;
78154+
78155+ if (copy_from_user(&mmapcmd, udata, sizeof(mmapcmd)))
78156+ return -EFAULT;
78157+
78158+ p = mmapcmd.entry;
78159+ if (copy_from_user(&msg, p, sizeof(msg)))
78160+ return -EFAULT;
78161+
78162+ down_read(&mm->mmap_sem);
78163+
78164+ vma = find_vma(mm, msg.va);
78165+ rc = -EINVAL;
78166+ if (!vma || (msg.va != vma->vm_start) ||
78167+ !privcmd_enforce_singleshot_mapping(vma))
78168+ goto mmap_out;
78169+
78170+ va = vma->vm_start;
78171+
78172+ for (i = 0; i < mmapcmd.num; i++) {
78173+ rc = -EFAULT;
78174+ if (copy_from_user(&msg, p, sizeof(msg)))
78175+ goto mmap_out;
78176+
78177+ /* Do not allow range to wrap the address space. */
78178+ rc = -EINVAL;
78179+ if ((msg.npages > (LONG_MAX >> PAGE_SHIFT)) ||
78180+ ((unsigned long)(msg.npages << PAGE_SHIFT) >= -va))
78181+ goto mmap_out;
78182+
78183+ /* Range chunks must be contiguous in va space. */
78184+ if ((msg.va != va) ||
78185+ ((msg.va+(msg.npages<<PAGE_SHIFT)) > vma->vm_end))
78186+ goto mmap_out;
78187+
78188+ if ((rc = direct_remap_pfn_range(
78189+ vma,
78190+ msg.va & PAGE_MASK,
78191+ msg.mfn,
78192+ msg.npages << PAGE_SHIFT,
78193+ vma->vm_page_prot,
78194+ mmapcmd.dom)) < 0)
78195+ goto mmap_out;
78196+
78197+ p++;
78198+ va += msg.npages << PAGE_SHIFT;
78199+ }
78200+
78201+ rc = 0;
78202+
78203+ mmap_out:
78204+ up_read(&mm->mmap_sem);
78205+ ret = rc;
78206+ }
78207+ break;
78208+
78209+ case IOCTL_PRIVCMD_MMAPBATCH: {
78210+ privcmd_mmapbatch_t m;
78211+ struct mm_struct *mm = current->mm;
78212+ struct vm_area_struct *vma;
78213+ xen_pfn_t __user *p;
78214+ unsigned long addr, mfn, nr_pages;
78215+ int i;
78216+
78217+ if (!is_initial_xendomain())
78218+ return -EPERM;
78219+
78220+ if (copy_from_user(&m, udata, sizeof(m)))
78221+ return -EFAULT;
78222+
78223+ nr_pages = m.num;
78224+ if ((m.num <= 0) || (nr_pages > (LONG_MAX >> PAGE_SHIFT)))
78225+ return -EINVAL;
78226+
78227+ down_read(&mm->mmap_sem);
78228+
78229+ vma = find_vma(mm, m.addr);
78230+ if (!vma ||
78231+ (m.addr != vma->vm_start) ||
78232+ ((m.addr + (nr_pages << PAGE_SHIFT)) != vma->vm_end) ||
78233+ !privcmd_enforce_singleshot_mapping(vma)) {
78234+ up_read(&mm->mmap_sem);
78235+ return -EINVAL;
78236+ }
78237+
78238+ p = m.arr;
78239+ addr = m.addr;
78240+ for (i = 0; i < nr_pages; i++, addr += PAGE_SIZE, p++) {
78241+ if (get_user(mfn, p)) {
78242+ up_read(&mm->mmap_sem);
78243+ return -EFAULT;
78244+ }
78245+
78246+ ret = direct_remap_pfn_range(vma, addr & PAGE_MASK,
78247+ mfn, PAGE_SIZE,
78248+ vma->vm_page_prot, m.dom);
78249+ if (ret < 0)
78250+ put_user(0xF0000000 | mfn, p);
78251+ }
78252+
78253+ up_read(&mm->mmap_sem);
78254+ ret = 0;
78255+ }
78256+ break;
78257+
78258+ default:
78259+ ret = -EINVAL;
78260+ break;
78261+ }
78262+
78263+ return ret;
78264+}
78265+
78266+#ifndef HAVE_ARCH_PRIVCMD_MMAP
78267+static struct page *privcmd_nopage(struct vm_area_struct *vma,
78268+ unsigned long address,
78269+ int *type)
78270+{
78271+ return NOPAGE_SIGBUS;
78272+}
78273+
78274+static struct vm_operations_struct privcmd_vm_ops = {
78275+ .nopage = privcmd_nopage
78276+};
78277+
78278+static int privcmd_mmap(struct file * file, struct vm_area_struct * vma)
78279+{
78280+ /* Unsupported for auto-translate guests. */
78281+ if (xen_feature(XENFEAT_auto_translated_physmap))
78282+ return -ENOSYS;
78283+
78284+ /* DONTCOPY is essential for Xen as copy_page_range is broken. */
78285+ vma->vm_flags |= VM_RESERVED | VM_IO | VM_DONTCOPY;
78286+ vma->vm_ops = &privcmd_vm_ops;
78287+ vma->vm_private_data = NULL;
78288+
78289+ return 0;
78290+}
78291+
78292+static int privcmd_enforce_singleshot_mapping(struct vm_area_struct *vma)
78293+{
78294+ return (xchg(&vma->vm_private_data, (void *)1) == NULL);
78295+}
78296+#endif
78297+
78298+static struct file_operations privcmd_file_ops = {
78299+ .ioctl = privcmd_ioctl,
78300+ .mmap = privcmd_mmap,
78301+};
78302+
78303+static int capabilities_read(char *page, char **start, off_t off,
78304+ int count, int *eof, void *data)
78305+{
78306+ int len = 0;
78307+ *page = 0;
78308+
78309+ if (is_initial_xendomain())
78310+ len = sprintf( page, "control_d\n" );
78311+
78312+ *eof = 1;
78313+ return len;
78314+}
78315+
78316+static int __init privcmd_init(void)
78317+{
78318+ if (!is_running_on_xen())
78319+ return -ENODEV;
78320+
78321+ privcmd_intf = create_xen_proc_entry("privcmd", 0400);
78322+ if (privcmd_intf != NULL)
78323+ privcmd_intf->proc_fops = &privcmd_file_ops;
78324+
78325+ capabilities_intf = create_xen_proc_entry("capabilities", 0400 );
78326+ if (capabilities_intf != NULL)
78327+ capabilities_intf->read_proc = capabilities_read;
78328+
78329+ return 0;
78330+}
78331+
78332+__initcall(privcmd_init);
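The proc node registered above is the channel through which privileged user-space tools issue hypercalls. A hedged user-space sketch of driving it, assuming the conventional /proc/xen/privcmd path, the user-space header locations, and the __HYPERVISOR_xen_version hypercall number (none of which are defined in this hunk):

	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <xen/sys/privcmd.h>	/* assumed location of privcmd_hypercall_t, IOCTL_PRIVCMD_HYPERCALL */
	#include <xen/interface/xen.h>	/* assumed location of __HYPERVISOR_xen_version */

	static int query_xen_version(void)
	{
		privcmd_hypercall_t call = { .op = __HYPERVISOR_xen_version };
		int fd, ret;

		fd = open("/proc/xen/privcmd", O_RDWR);
		if (fd < 0)
			return -1;

		call.arg[0] = 0;	/* XENVER_version: no argument buffer required */

		ret = ioctl(fd, IOCTL_PRIVCMD_HYPERCALL, &call);
		close(fd);
		return ret;		/* on success, major/minor version packed in the return value */
	}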
78333diff -Nur linux-2.6.16.33-noxen/drivers/xen/tpmback/Makefile linux-2.6.16.33/drivers/xen/tpmback/Makefile
78334--- linux-2.6.16.33-noxen/drivers/xen/tpmback/Makefile 1970-01-01 00:00:00.000000000 +0000
78335+++ linux-2.6.16.33/drivers/xen/tpmback/Makefile 2007-01-08 15:00:45.000000000 +0000
78336@@ -0,0 +1,4 @@
78337+
78338+obj-$(CONFIG_XEN_TPMDEV_BACKEND) += tpmbk.o
78339+
78340+tpmbk-y += tpmback.o interface.o xenbus.o
78341diff -Nur linux-2.6.16.33-noxen/drivers/xen/tpmback/common.h linux-2.6.16.33/drivers/xen/tpmback/common.h
78342--- linux-2.6.16.33-noxen/drivers/xen/tpmback/common.h 1970-01-01 00:00:00.000000000 +0000
78343+++ linux-2.6.16.33/drivers/xen/tpmback/common.h 2007-01-08 15:00:45.000000000 +0000
78344@@ -0,0 +1,87 @@
78345+/******************************************************************************
78346+ * drivers/xen/tpmback/common.h
78347+ */
78348+
78349+#ifndef __TPMIF__BACKEND__COMMON_H__
78350+#define __TPMIF__BACKEND__COMMON_H__
78351+
78352+#include <linux/config.h>
78353+#include <linux/version.h>
78354+#include <linux/module.h>
78355+#include <linux/interrupt.h>
78356+#include <linux/slab.h>
78357+#include <xen/evtchn.h>
78358+#include <xen/driver_util.h>
78359+#include <xen/interface/grant_table.h>
78360+#include <xen/interface/io/tpmif.h>
78361+#include <asm/io.h>
78362+#include <asm/pgalloc.h>
78363+
78364+#define DPRINTK(_f, _a...) \
78365+ pr_debug("(file=%s, line=%d) " _f, \
78366+ __FILE__ , __LINE__ , ## _a )
78367+
78368+struct backend_info;
78369+
78370+typedef struct tpmif_st {
78371+ struct list_head tpmif_list;
78372+ /* Unique identifier for this interface. */
78373+ domid_t domid;
78374+ unsigned int handle;
78375+
78376+ /* Physical parameters of the comms window. */
78377+ unsigned int evtchn;
78378+ unsigned int irq;
78379+
78380+ /* The shared rings and indexes. */
78381+ tpmif_tx_interface_t *tx;
78382+ struct vm_struct *tx_area;
78383+
78384+ /* Miscellaneous private stuff. */
78385+ enum { DISCONNECTED, DISCONNECTING, CONNECTED } status;
78386+ int active;
78387+
78388+ struct tpmif_st *hash_next;
78389+ struct list_head list; /* scheduling list */
78390+ atomic_t refcnt;
78391+
78392+ struct backend_info *bi;
78393+
78394+ grant_handle_t shmem_handle;
78395+ grant_ref_t shmem_ref;
78396+ struct page **mmap_pages;
78397+
78398+ char devname[20];
78399+} tpmif_t;
78400+
78401+void tpmif_disconnect_complete(tpmif_t * tpmif);
78402+tpmif_t *tpmif_find(domid_t domid, struct backend_info *bi);
78403+void tpmif_interface_init(void);
78404+void tpmif_interface_exit(void);
78405+void tpmif_schedule_work(tpmif_t * tpmif);
78406+void tpmif_deschedule_work(tpmif_t * tpmif);
78407+void tpmif_xenbus_init(void);
78408+void tpmif_xenbus_exit(void);
78409+int tpmif_map(tpmif_t *tpmif, unsigned long shared_page, unsigned int evtchn);
78410+irqreturn_t tpmif_be_int(int irq, void *dev_id, struct pt_regs *regs);
78411+
78412+long int tpmback_get_instance(struct backend_info *bi);
78413+
78414+int vtpm_release_packets(tpmif_t * tpmif, int send_msgs);
78415+
78416+
78417+#define tpmif_get(_b) (atomic_inc(&(_b)->refcnt))
78418+#define tpmif_put(_b) \
78419+ do { \
78420+ if (atomic_dec_and_test(&(_b)->refcnt)) \
78421+ tpmif_disconnect_complete(_b); \
78422+ } while (0)
78423+
78424+extern int num_frontends;
78425+
78426+static inline unsigned long idx_to_kaddr(tpmif_t *t, unsigned int idx)
78427+{
78428+ return (unsigned long)pfn_to_kaddr(page_to_pfn(t->mmap_pages[idx]));
78429+}
78430+
78431+#endif /* __TPMIF__BACKEND__COMMON_H__ */
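The get/put macros above define the lifetime rule followed throughout the backend: a tpmif returned by tpmif_find() carries a reference, and the final tpmif_put() invokes tpmif_disconnect_complete(). A minimal, hypothetical caller showing the pairing:

	tpmif_t *tpmif;

	tpmif = tpmif_find(domid, bi);	/* reference held on success */
	if (IS_ERR(tpmif))
		return PTR_ERR(tpmif);

	/* ... map the ring, service requests ... */

	tpmif_put(tpmif);		/* the last put tears the interface down */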
78432diff -Nur linux-2.6.16.33-noxen/drivers/xen/tpmback/interface.c linux-2.6.16.33/drivers/xen/tpmback/interface.c
78433--- linux-2.6.16.33-noxen/drivers/xen/tpmback/interface.c 1970-01-01 00:00:00.000000000 +0000
78434+++ linux-2.6.16.33/drivers/xen/tpmback/interface.c 2007-01-08 15:00:45.000000000 +0000
78435@@ -0,0 +1,182 @@
78436+ /*****************************************************************************
78437+ * drivers/xen/tpmback/interface.c
78438+ *
78439+ * Virtual TPM interface management.
78440+ *
78441+ * Copyright (c) 2005, IBM Corporation
78442+ *
78443+ * Author: Stefan Berger, stefanb@us.ibm.com
78444+ *
78445+ * This code has been derived from drivers/xen/netback/interface.c
78446+ * Copyright (c) 2004, Keir Fraser
78447+ */
78448+
78449+#include "common.h"
78450+#include <xen/balloon.h>
78451+#include <xen/gnttab.h>
78452+
78453+static kmem_cache_t *tpmif_cachep;
78454+int num_frontends = 0;
78455+
78456+LIST_HEAD(tpmif_list);
78457+
78458+static tpmif_t *alloc_tpmif(domid_t domid, struct backend_info *bi)
78459+{
78460+ tpmif_t *tpmif;
78461+
78462+ tpmif = kmem_cache_alloc(tpmif_cachep, GFP_KERNEL);
78463+ if (tpmif == NULL)
78464+ goto out_of_memory;
78465+
78466+ memset(tpmif, 0, sizeof (*tpmif));
78467+ tpmif->domid = domid;
78468+ tpmif->status = DISCONNECTED;
78469+ tpmif->bi = bi;
78470+ snprintf(tpmif->devname, sizeof(tpmif->devname), "tpmif%d", domid);
78471+ atomic_set(&tpmif->refcnt, 1);
78472+
78473+ tpmif->mmap_pages = alloc_empty_pages_and_pagevec(TPMIF_TX_RING_SIZE);
78474+ if (tpmif->mmap_pages == NULL)
78475+ goto out_of_memory;
78476+
78477+ list_add(&tpmif->tpmif_list, &tpmif_list);
78478+ num_frontends++;
78479+
78480+ return tpmif;
78481+
78482+ out_of_memory:
78483+ if (tpmif != NULL)
78484+ kmem_cache_free(tpmif_cachep, tpmif);
78485+ printk("%s: out of memory\n", __FUNCTION__);
78486+ return ERR_PTR(-ENOMEM);
78487+}
78488+
78489+static void free_tpmif(tpmif_t * tpmif)
78490+{
78491+ num_frontends--;
78492+ list_del(&tpmif->tpmif_list);
78493+ free_empty_pages_and_pagevec(tpmif->mmap_pages, TPMIF_TX_RING_SIZE);
78494+ kmem_cache_free(tpmif_cachep, tpmif);
78495+}
78496+
78497+tpmif_t *tpmif_find(domid_t domid, struct backend_info *bi)
78498+{
78499+ tpmif_t *tpmif;
78500+
78501+ list_for_each_entry(tpmif, &tpmif_list, tpmif_list) {
78502+ if (tpmif->bi == bi) {
78503+ if (tpmif->domid == domid) {
78504+ tpmif_get(tpmif);
78505+ return tpmif;
78506+ } else {
78507+ return ERR_PTR(-EEXIST);
78508+ }
78509+ }
78510+ }
78511+
78512+ return alloc_tpmif(domid, bi);
78513+}
78514+
78515+static int map_frontend_page(tpmif_t *tpmif, unsigned long shared_page)
78516+{
78517+ int ret;
78518+ struct gnttab_map_grant_ref op;
78519+
78520+ gnttab_set_map_op(&op, (unsigned long)tpmif->tx_area->addr,
78521+ GNTMAP_host_map, shared_page, tpmif->domid);
78522+
78523+ lock_vm_area(tpmif->tx_area);
78524+ ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1);
78525+ unlock_vm_area(tpmif->tx_area);
78526+ BUG_ON(ret);
78527+
78528+ if (op.status) {
78529+ DPRINTK(" Grant table operation failure !\n");
78530+ return op.status;
78531+ }
78532+
78533+ tpmif->shmem_ref = shared_page;
78534+ tpmif->shmem_handle = op.handle;
78535+
78536+ return 0;
78537+}
78538+
78539+static void unmap_frontend_page(tpmif_t *tpmif)
78540+{
78541+ struct gnttab_unmap_grant_ref op;
78542+ int ret;
78543+
78544+ gnttab_set_unmap_op(&op, (unsigned long)tpmif->tx_area->addr,
78545+ GNTMAP_host_map, tpmif->shmem_handle);
78546+
78547+ lock_vm_area(tpmif->tx_area);
78548+ ret = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1);
78549+ unlock_vm_area(tpmif->tx_area);
78550+ BUG_ON(ret);
78551+}
78552+
78553+int tpmif_map(tpmif_t *tpmif, unsigned long shared_page, unsigned int evtchn)
78554+{
78555+ int err;
78556+ struct evtchn_bind_interdomain bind_interdomain;
78557+
78558+ if (tpmif->irq) {
78559+ return 0;
78560+ }
78561+
78562+ if ((tpmif->tx_area = alloc_vm_area(PAGE_SIZE)) == NULL)
78563+ return -ENOMEM;
78564+
78565+ err = map_frontend_page(tpmif, shared_page);
78566+ if (err) {
78567+ free_vm_area(tpmif->tx_area);
78568+ return err;
78569+ }
78570+
78571+
78572+ bind_interdomain.remote_dom = tpmif->domid;
78573+ bind_interdomain.remote_port = evtchn;
78574+
78575+ err = HYPERVISOR_event_channel_op(EVTCHNOP_bind_interdomain,
78576+ &bind_interdomain);
78577+ if (err) {
78578+ unmap_frontend_page(tpmif);
78579+ free_vm_area(tpmif->tx_area);
78580+ return err;
78581+ }
78582+
78583+ tpmif->evtchn = bind_interdomain.local_port;
78584+
78585+ tpmif->tx = (tpmif_tx_interface_t *)tpmif->tx_area->addr;
78586+
78587+ tpmif->irq = bind_evtchn_to_irqhandler(
78588+ tpmif->evtchn, tpmif_be_int, 0, tpmif->devname, tpmif);
78589+ tpmif->shmem_ref = shared_page;
78590+ tpmif->active = 1;
78591+
78592+ return 0;
78593+}
78594+
78595+void tpmif_disconnect_complete(tpmif_t *tpmif)
78596+{
78597+ if (tpmif->irq)
78598+ unbind_from_irqhandler(tpmif->irq, tpmif);
78599+
78600+ if (tpmif->tx) {
78601+ unmap_frontend_page(tpmif);
78602+ free_vm_area(tpmif->tx_area);
78603+ }
78604+
78605+ free_tpmif(tpmif);
78606+}
78607+
78608+void __init tpmif_interface_init(void)
78609+{
78610+ tpmif_cachep = kmem_cache_create("tpmif_cache", sizeof (tpmif_t),
78611+ 0, 0, NULL, NULL);
78612+}
78613+
78614+void __exit tpmif_interface_exit(void)
78615+{
78616+ kmem_cache_destroy(tpmif_cachep);
78617+}
78618diff -Nur linux-2.6.16.33-noxen/drivers/xen/tpmback/tpmback.c linux-2.6.16.33/drivers/xen/tpmback/tpmback.c
78619--- linux-2.6.16.33-noxen/drivers/xen/tpmback/tpmback.c 1970-01-01 00:00:00.000000000 +0000
78620+++ linux-2.6.16.33/drivers/xen/tpmback/tpmback.c 2007-01-08 15:00:45.000000000 +0000
78621@@ -0,0 +1,944 @@
78622+/******************************************************************************
78623+ * drivers/xen/tpmback/tpmback.c
78624+ *
78625+ * Copyright (c) 2005, IBM Corporation
78626+ *
78627+ * Author: Stefan Berger, stefanb@us.ibm.com
78628+ * Grant table support: Mahadevan Gomathisankaran
78629+ *
78630+ * This code has been derived from drivers/xen/netback/netback.c
78631+ * Copyright (c) 2002-2004, K A Fraser
78632+ *
78633+ */
78634+
78635+#include "common.h"
78636+#include <xen/evtchn.h>
78637+
78638+#include <linux/types.h>
78639+#include <linux/list.h>
78640+#include <linux/miscdevice.h>
78641+#include <linux/poll.h>
78642+#include <asm/uaccess.h>
78643+#include <xen/xenbus.h>
78644+#include <xen/interface/grant_table.h>
78645+#include <xen/gnttab.h>
78646+
78647+/* local data structures */
78648+struct data_exchange {
78649+ struct list_head pending_pak;
78650+ struct list_head current_pak;
78651+ unsigned int copied_so_far;
78652+ u8 has_opener:1;
78653+ u8 aborted:1;
78654+ rwlock_t pak_lock; // protects all of the previous fields
78655+ wait_queue_head_t wait_queue;
78656+};
78657+
78658+struct vtpm_resp_hdr {
78659+ uint32_t instance_no;
78660+ uint16_t tag_no;
78661+ uint32_t len_no;
78662+ uint32_t ordinal_no;
78663+} __attribute__ ((packed));
78664+
78665+struct packet {
78666+ struct list_head next;
78667+ unsigned int data_len;
78668+ u8 *data_buffer;
78669+ tpmif_t *tpmif;
78670+ u32 tpm_instance;
78671+ u8 req_tag;
78672+ u32 last_read;
78673+ u8 flags;
78674+ struct timer_list processing_timer;
78675+};
78676+
78677+enum {
78678+ PACKET_FLAG_DISCARD_RESPONSE = 1,
78679+};
78680+
78681+/* local variables */
78682+static struct data_exchange dataex;
78683+
78684+/* local function prototypes */
78685+static int _packet_write(struct packet *pak,
78686+ const char *data, size_t size, int userbuffer);
78687+static void processing_timeout(unsigned long ptr);
78688+static int packet_read_shmem(struct packet *pak,
78689+ tpmif_t * tpmif,
78690+ u32 offset,
78691+ char *buffer, int isuserbuffer, u32 left);
78692+static int vtpm_queue_packet(struct packet *pak);
78693+
78694+/***************************************************************
78695+ Buffer copying for user and kernel space buffers.
78696+***************************************************************/
78697+static inline int copy_from_buffer(void *to,
78698+ const void *from, unsigned long size,
78699+ int isuserbuffer)
78700+{
78701+ if (isuserbuffer) {
78702+ if (copy_from_user(to, (void __user *)from, size))
78703+ return -EFAULT;
78704+ } else {
78705+ memcpy(to, from, size);
78706+ }
78707+ return 0;
78708+}
78709+
78710+static inline int copy_to_buffer(void *to,
78711+ const void *from, unsigned long size,
78712+ int isuserbuffer)
78713+{
78714+ if (isuserbuffer) {
78715+ if (copy_to_user((void __user *)to, from, size))
78716+ return -EFAULT;
78717+ } else {
78718+ memcpy(to, from, size);
78719+ }
78720+ return 0;
78721+}
78722+
78723+
78724+static void dataex_init(struct data_exchange *dataex)
78725+{
78726+ INIT_LIST_HEAD(&dataex->pending_pak);
78727+ INIT_LIST_HEAD(&dataex->current_pak);
78728+ dataex->has_opener = 0;
78729+ rwlock_init(&dataex->pak_lock);
78730+ init_waitqueue_head(&dataex->wait_queue);
78731+}
78732+
78733+/***************************************************************
78734+ Packet-related functions
78735+***************************************************************/
78736+
78737+static struct packet *packet_find_instance(struct list_head *head,
78738+ u32 tpm_instance)
78739+{
78740+ struct packet *pak;
78741+ struct list_head *p;
78742+
78743+ /*
78744+ * traverse the list of packets and return the first
78745+ * one with the given instance number
78746+ */
78747+ list_for_each(p, head) {
78748+ pak = list_entry(p, struct packet, next);
78749+
78750+ if (pak->tpm_instance == tpm_instance) {
78751+ return pak;
78752+ }
78753+ }
78754+ return NULL;
78755+}
78756+
78757+static struct packet *packet_find_packet(struct list_head *head, void *packet)
78758+{
78759+ struct packet *pak;
78760+ struct list_head *p;
78761+
78762+ /*
78763+	 * traverse the list of packets and return the first
78764+	 * one that matches the given packet pointer
78765+ */
78766+ list_for_each(p, head) {
78767+ pak = list_entry(p, struct packet, next);
78768+
78769+ if (pak == packet) {
78770+ return pak;
78771+ }
78772+ }
78773+ return NULL;
78774+}
78775+
78776+static struct packet *packet_alloc(tpmif_t * tpmif,
78777+ u32 size, u8 req_tag, u8 flags)
78778+{
78779+ struct packet *pak = NULL;
78780+ pak = kzalloc(sizeof (struct packet), GFP_ATOMIC);
78781+ if (NULL != pak) {
78782+ if (tpmif) {
78783+ pak->tpmif = tpmif;
78784+ pak->tpm_instance = tpmback_get_instance(tpmif->bi);
78785+ tpmif_get(tpmif);
78786+ }
78787+ pak->data_len = size;
78788+ pak->req_tag = req_tag;
78789+ pak->last_read = 0;
78790+ pak->flags = flags;
78791+
78792+ /*
78793+ * cannot do tpmif_get(tpmif); bad things happen
78794+ * on the last tpmif_put()
78795+ */
78796+ init_timer(&pak->processing_timer);
78797+ pak->processing_timer.function = processing_timeout;
78798+ pak->processing_timer.data = (unsigned long)pak;
78799+ }
78800+ return pak;
78801+}
78802+
78803+static inline void packet_reset(struct packet *pak)
78804+{
78805+ pak->last_read = 0;
78806+}
78807+
78808+static void packet_free(struct packet *pak)
78809+{
78810+ if (timer_pending(&pak->processing_timer)) {
78811+ BUG();
78812+ }
78813+
78814+ if (pak->tpmif)
78815+ tpmif_put(pak->tpmif);
78816+ kfree(pak->data_buffer);
78817+ /*
78818+ * cannot do tpmif_put(pak->tpmif); bad things happen
78819+ * on the last tpmif_put()
78820+ */
78821+ kfree(pak);
78822+}
78823+
78824+
78825+/*
78826+ * Write data to the shared memory and send it to the FE.
78827+ */
78828+static int packet_write(struct packet *pak,
78829+ const char *data, size_t size, int isuserbuffer)
78830+{
78831+ int rc = 0;
78832+
78833+ if (0 != (pak->flags & PACKET_FLAG_DISCARD_RESPONSE)) {
78834+		/* Don't send a response to this packet. Just acknowledge it. */
78835+ rc = size;
78836+ } else {
78837+ rc = _packet_write(pak, data, size, isuserbuffer);
78838+ }
78839+
78840+ return rc;
78841+}
78842+
78843+int _packet_write(struct packet *pak,
78844+ const char *data, size_t size, int isuserbuffer)
78845+{
78846+ /*
78847+ * Write into the shared memory pages directly
78848+ * and send it to the front end.
78849+ */
78850+ tpmif_t *tpmif = pak->tpmif;
78851+ grant_handle_t handle;
78852+ int rc = 0;
78853+ unsigned int i = 0;
78854+ unsigned int offset = 0;
78855+
78856+ if (tpmif == NULL) {
78857+ return -EFAULT;
78858+ }
78859+
78860+ if (tpmif->status == DISCONNECTED) {
78861+ return size;
78862+ }
78863+
78864+ while (offset < size && i < TPMIF_TX_RING_SIZE) {
78865+ unsigned int tocopy;
78866+ struct gnttab_map_grant_ref map_op;
78867+ struct gnttab_unmap_grant_ref unmap_op;
78868+ tpmif_tx_request_t *tx;
78869+
78870+ tx = &tpmif->tx->ring[i].req;
78871+
78872+ if (0 == tx->addr) {
78873+ DPRINTK("ERROR: Buffer for outgoing packet NULL?! i=%d\n", i);
78874+ return 0;
78875+ }
78876+
78877+ gnttab_set_map_op(&map_op, idx_to_kaddr(tpmif, i),
78878+ GNTMAP_host_map, tx->ref, tpmif->domid);
78879+
78880+ if (unlikely(HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
78881+ &map_op, 1))) {
78882+ BUG();
78883+ }
78884+
78885+ handle = map_op.handle;
78886+
78887+ if (map_op.status) {
78888+ DPRINTK(" Grant table operation failure !\n");
78889+ return 0;
78890+ }
78891+
78892+ tocopy = min_t(size_t, size - offset, PAGE_SIZE);
78893+
78894+ if (copy_from_buffer((void *)(idx_to_kaddr(tpmif, i) |
78895+ (tx->addr & ~PAGE_MASK)),
78896+ &data[offset], tocopy, isuserbuffer)) {
78897+ tpmif_put(tpmif);
78898+ return -EFAULT;
78899+ }
78900+ tx->size = tocopy;
78901+
78902+ gnttab_set_unmap_op(&unmap_op, idx_to_kaddr(tpmif, i),
78903+ GNTMAP_host_map, handle);
78904+
78905+ if (unlikely
78906+ (HYPERVISOR_grant_table_op
78907+ (GNTTABOP_unmap_grant_ref, &unmap_op, 1))) {
78908+ BUG();
78909+ }
78910+
78911+ offset += tocopy;
78912+ i++;
78913+ }
78914+
78915+ rc = offset;
78916+ DPRINTK("Notifying frontend via irq %d\n", tpmif->irq);
78917+ notify_remote_via_irq(tpmif->irq);
78918+
78919+ return rc;
78920+}
78921+
78922+/*
78923+ * Read data from the shared memory and copy it directly into the
78924+ * provided buffer. Advance the read_last indicator which tells
78925+ * how many bytes have already been read.
78926+ */
78927+static int packet_read(struct packet *pak, size_t numbytes,
78928+ char *buffer, size_t buffersize, int isuserbuffer)
78929+{
78930+ tpmif_t *tpmif = pak->tpmif;
78931+
78932+ /*
78933+ * Read 'numbytes' of data from the buffer. The first 4
78934+ * bytes are the instance number in network byte order,
78935+ * after that come the data from the shared memory buffer.
78936+ */
78937+ u32 to_copy;
78938+ u32 offset = 0;
78939+ u32 room_left = buffersize;
78940+
78941+ if (pak->last_read < 4) {
78942+ /*
78943+ * copy the instance number into the buffer
78944+ */
78945+ u32 instance_no = htonl(pak->tpm_instance);
78946+ u32 last_read = pak->last_read;
78947+
78948+ to_copy = min_t(size_t, 4 - last_read, numbytes);
78949+
78950+ if (copy_to_buffer(&buffer[0],
78951+ &(((u8 *) & instance_no)[last_read]),
78952+ to_copy, isuserbuffer)) {
78953+ return -EFAULT;
78954+ }
78955+
78956+ pak->last_read += to_copy;
78957+ offset += to_copy;
78958+ room_left -= to_copy;
78959+ }
78960+
78961+ /*
78962+ * If the packet has a data buffer appended, read from it...
78963+ */
78964+
78965+ if (room_left > 0) {
78966+ if (pak->data_buffer) {
78967+ u32 to_copy = min_t(u32, pak->data_len - offset, room_left);
78968+ u32 last_read = pak->last_read - 4;
78969+
78970+ if (copy_to_buffer(&buffer[offset],
78971+ &pak->data_buffer[last_read],
78972+ to_copy, isuserbuffer)) {
78973+ return -EFAULT;
78974+ }
78975+ pak->last_read += to_copy;
78976+ offset += to_copy;
78977+ } else {
78978+ offset = packet_read_shmem(pak,
78979+ tpmif,
78980+ offset,
78981+ buffer,
78982+ isuserbuffer, room_left);
78983+ }
78984+ }
78985+ return offset;
78986+}
78987+
78988+static int packet_read_shmem(struct packet *pak,
78989+ tpmif_t * tpmif,
78990+ u32 offset, char *buffer, int isuserbuffer,
78991+ u32 room_left)
78992+{
78993+ u32 last_read = pak->last_read - 4;
78994+ u32 i = (last_read / PAGE_SIZE);
78995+ u32 pg_offset = last_read & (PAGE_SIZE - 1);
78996+ u32 to_copy;
78997+ grant_handle_t handle;
78998+
78999+ tpmif_tx_request_t *tx;
79000+
79001+ tx = &tpmif->tx->ring[0].req;
79002+ /*
79003+	 * Start copying data at the page with index 'i'
79004+	 * and within that page at offset 'pg_offset'.
79005+ * Copy a maximum of 'room_left' bytes.
79006+ */
79007+ to_copy = min_t(u32, PAGE_SIZE - pg_offset, room_left);
79008+ while (to_copy > 0) {
79009+ void *src;
79010+ struct gnttab_map_grant_ref map_op;
79011+ struct gnttab_unmap_grant_ref unmap_op;
79012+
79013+ tx = &tpmif->tx->ring[i].req;
79014+
79015+ gnttab_set_map_op(&map_op, idx_to_kaddr(tpmif, i),
79016+ GNTMAP_host_map, tx->ref, tpmif->domid);
79017+
79018+ if (unlikely(HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
79019+ &map_op, 1))) {
79020+ BUG();
79021+ }
79022+
79023+ if (map_op.status) {
79024+ DPRINTK(" Grant table operation failure !\n");
79025+ return -EFAULT;
79026+ }
79027+
79028+ handle = map_op.handle;
79029+
79030+ if (to_copy > tx->size) {
79031+ /*
79032+ * User requests more than what's available
79033+ */
79034+ to_copy = min_t(u32, tx->size, to_copy);
79035+ }
79036+
79037+ DPRINTK("Copying from mapped memory at %08lx\n",
79038+ (unsigned long)(idx_to_kaddr(tpmif, i) |
79039+ (tx->addr & ~PAGE_MASK)));
79040+
79041+ src = (void *)(idx_to_kaddr(tpmif, i) |
79042+ ((tx->addr & ~PAGE_MASK) + pg_offset));
79043+ if (copy_to_buffer(&buffer[offset],
79044+ src, to_copy, isuserbuffer)) {
79045+ return -EFAULT;
79046+ }
79047+
79048+ DPRINTK("Data from TPM-FE of domain %d are %d %d %d %d\n",
79049+ tpmif->domid, buffer[offset], buffer[offset + 1],
79050+ buffer[offset + 2], buffer[offset + 3]);
79051+
79052+ gnttab_set_unmap_op(&unmap_op, idx_to_kaddr(tpmif, i),
79053+ GNTMAP_host_map, handle);
79054+
79055+ if (unlikely
79056+ (HYPERVISOR_grant_table_op
79057+ (GNTTABOP_unmap_grant_ref, &unmap_op, 1))) {
79058+ BUG();
79059+ }
79060+
79061+ offset += to_copy;
79062+ pg_offset = 0;
79063+ last_read += to_copy;
79064+ room_left -= to_copy;
79065+
79066+ to_copy = min_t(u32, PAGE_SIZE, room_left);
79067+ i++;
79068+ } /* while (to_copy > 0) */
79069+ /*
79070+ * Adjust the last_read pointer
79071+ */
79072+ pak->last_read = last_read + 4;
79073+ return offset;
79074+}
79075+
79076+/* ============================================================
79077+ * The file layer for reading data from this device
79078+ * ============================================================
79079+ */
79080+static int vtpm_op_open(struct inode *inode, struct file *f)
79081+{
79082+ int rc = 0;
79083+ unsigned long flags;
79084+
79085+ write_lock_irqsave(&dataex.pak_lock, flags);
79086+ if (dataex.has_opener == 0) {
79087+ dataex.has_opener = 1;
79088+ } else {
79089+ rc = -EPERM;
79090+ }
79091+ write_unlock_irqrestore(&dataex.pak_lock, flags);
79092+ return rc;
79093+}
79094+
79095+static ssize_t vtpm_op_read(struct file *file,
79096+ char __user * data, size_t size, loff_t * offset)
79097+{
79098+ int ret_size = -ENODATA;
79099+ struct packet *pak = NULL;
79100+ unsigned long flags;
79101+
79102+ write_lock_irqsave(&dataex.pak_lock, flags);
79103+ if (dataex.aborted) {
79104+ dataex.aborted = 0;
79105+ dataex.copied_so_far = 0;
79106+ write_unlock_irqrestore(&dataex.pak_lock, flags);
79107+ return -EIO;
79108+ }
79109+
79110+ if (list_empty(&dataex.pending_pak)) {
79111+ write_unlock_irqrestore(&dataex.pak_lock, flags);
79112+ wait_event_interruptible(dataex.wait_queue,
79113+ !list_empty(&dataex.pending_pak));
79114+ write_lock_irqsave(&dataex.pak_lock, flags);
79115+ dataex.copied_so_far = 0;
79116+ }
79117+
79118+ if (!list_empty(&dataex.pending_pak)) {
79119+ unsigned int left;
79120+
79121+ pak = list_entry(dataex.pending_pak.next, struct packet, next);
79122+ left = pak->data_len - dataex.copied_so_far;
79123+ list_del(&pak->next);
79124+ write_unlock_irqrestore(&dataex.pak_lock, flags);
79125+
79126+ DPRINTK("size given by app: %d, available: %d\n", size, left);
79127+
79128+ ret_size = min_t(size_t, size, left);
79129+
79130+ ret_size = packet_read(pak, ret_size, data, size, 1);
79131+
79132+ write_lock_irqsave(&dataex.pak_lock, flags);
79133+
79134+ if (ret_size < 0) {
79135+ del_singleshot_timer_sync(&pak->processing_timer);
79136+ packet_free(pak);
79137+ dataex.copied_so_far = 0;
79138+ } else {
79139+ DPRINTK("Copied %d bytes to user buffer\n", ret_size);
79140+
79141+ dataex.copied_so_far += ret_size;
79142+ if (dataex.copied_so_far >= pak->data_len + 4) {
79143+ DPRINTK("All data from this packet given to app.\n");
79144+ /* All data given to app */
79145+
79146+ del_singleshot_timer_sync(&pak->
79147+ processing_timer);
79148+ list_add_tail(&pak->next, &dataex.current_pak);
79149+ /*
79150+	 * The more frontends that are handled at the same time,
79151+ * the more time we give the TPM to process the request.
79152+ */
79153+ mod_timer(&pak->processing_timer,
79154+ jiffies + (num_frontends * 60 * HZ));
79155+ dataex.copied_so_far = 0;
79156+ } else {
79157+ list_add(&pak->next, &dataex.pending_pak);
79158+ }
79159+ }
79160+ }
79161+ write_unlock_irqrestore(&dataex.pak_lock, flags);
79162+
79163+ DPRINTK("Returning result from read to app: %d\n", ret_size);
79164+
79165+ return ret_size;
79166+}
79167+
79168+/*
79169+ * Write operation - only works after a previous read operation!
79170+ */
79171+static ssize_t vtpm_op_write(struct file *file,
79172+ const char __user * data, size_t size,
79173+ loff_t * offset)
79174+{
79175+ struct packet *pak;
79176+ int rc = 0;
79177+ unsigned int off = 4;
79178+ unsigned long flags;
79179+ struct vtpm_resp_hdr vrh;
79180+
79181+ /*
79182+ * Minimum required packet size is:
79183+ * 4 bytes for instance number
79184+ * 2 bytes for tag
79185+ * 4 bytes for paramSize
79186+ * 4 bytes for the ordinal
79187+ * sum: 14 bytes
79188+ */
79189+ if (size < sizeof (vrh))
79190+ return -EFAULT;
79191+
79192+ if (copy_from_user(&vrh, data, sizeof (vrh)))
79193+ return -EFAULT;
79194+
79195+ /* malformed packet? */
79196+ if ((off + ntohl(vrh.len_no)) != size)
79197+ return -EFAULT;
79198+
79199+ write_lock_irqsave(&dataex.pak_lock, flags);
79200+ pak = packet_find_instance(&dataex.current_pak,
79201+ ntohl(vrh.instance_no));
79202+
79203+ if (pak == NULL) {
79204+ write_unlock_irqrestore(&dataex.pak_lock, flags);
79205+ DPRINTK(KERN_ALERT "No associated packet! (inst=%d)\n",
79206+ ntohl(vrh.instance_no));
79207+ return -EFAULT;
79208+ }
79209+
79210+ del_singleshot_timer_sync(&pak->processing_timer);
79211+ list_del(&pak->next);
79212+
79213+ write_unlock_irqrestore(&dataex.pak_lock, flags);
79214+
79215+ /*
79216+	 * The first 'off' bytes hold the instance number - skip them.
79217+ */
79218+ size -= off;
79219+
79220+ rc = packet_write(pak, &data[off], size, 1);
79221+
79222+ if (rc > 0) {
79223+		/* account for the 4 instance-number bytes skipped above */
79224+ rc += off;
79225+ }
79226+ packet_free(pak);
79227+ return rc;
79228+}
79229+
79230+static int vtpm_op_release(struct inode *inode, struct file *file)
79231+{
79232+ unsigned long flags;
79233+
79234+ vtpm_release_packets(NULL, 1);
79235+ write_lock_irqsave(&dataex.pak_lock, flags);
79236+ dataex.has_opener = 0;
79237+ write_unlock_irqrestore(&dataex.pak_lock, flags);
79238+ return 0;
79239+}
79240+
79241+static unsigned int vtpm_op_poll(struct file *file,
79242+ struct poll_table_struct *pts)
79243+{
79244+ unsigned int flags = POLLOUT | POLLWRNORM;
79245+
79246+ poll_wait(file, &dataex.wait_queue, pts);
79247+ if (!list_empty(&dataex.pending_pak)) {
79248+ flags |= POLLIN | POLLRDNORM;
79249+ }
79250+ return flags;
79251+}
79252+
79253+static struct file_operations vtpm_ops = {
79254+ .owner = THIS_MODULE,
79255+ .llseek = no_llseek,
79256+ .open = vtpm_op_open,
79257+ .read = vtpm_op_read,
79258+ .write = vtpm_op_write,
79259+ .release = vtpm_op_release,
79260+ .poll = vtpm_op_poll,
79261+};
79262+
79263+static struct miscdevice vtpms_miscdevice = {
79264+ .minor = 225,
79265+ .name = "vtpm",
79266+ .fops = &vtpm_ops,
79267+};
79268+
79269+/***************************************************************
79270+ Utility functions
79271+***************************************************************/
79272+
79273+static int tpm_send_fail_message(struct packet *pak, u8 req_tag)
79274+{
79275+ int rc;
79276+ static const unsigned char tpm_error_message_fail[] = {
79277+ 0x00, 0x00,
79278+ 0x00, 0x00, 0x00, 0x0a,
79279+ 0x00, 0x00, 0x00, 0x09 /* TPM_FAIL */
79280+ };
79281+ unsigned char buffer[sizeof (tpm_error_message_fail)];
79282+
79283+ memcpy(buffer, tpm_error_message_fail,
79284+ sizeof (tpm_error_message_fail));
79285+ /*
79286+ * Insert the right response tag depending on the given tag
79287+ * All response tags are '+3' to the request tag.
79288+ */
79289+ buffer[1] = req_tag + 3;
79290+
79291+ /*
79292+ * Write the data to shared memory and notify the front-end
79293+ */
79294+ rc = packet_write(pak, buffer, sizeof (buffer), 0);
79295+
79296+ return rc;
79297+}
79298+
79299+static int _vtpm_release_packets(struct list_head *head,
79300+ tpmif_t * tpmif, int send_msgs)
79301+{
79302+ int aborted = 0;
79303+ int c = 0;
79304+ struct packet *pak;
79305+ struct list_head *pos, *tmp;
79306+
79307+ list_for_each_safe(pos, tmp, head) {
79308+ pak = list_entry(pos, struct packet, next);
79309+ c += 1;
79310+
79311+ if (tpmif == NULL || pak->tpmif == tpmif) {
79312+ int can_send = 0;
79313+
79314+ del_singleshot_timer_sync(&pak->processing_timer);
79315+ list_del(&pak->next);
79316+
79317+ if (pak->tpmif && pak->tpmif->status == CONNECTED) {
79318+ can_send = 1;
79319+ }
79320+
79321+ if (send_msgs && can_send) {
79322+ tpm_send_fail_message(pak, pak->req_tag);
79323+ }
79324+ packet_free(pak);
79325+ if (c == 1)
79326+ aborted = 1;
79327+ }
79328+ }
79329+ return aborted;
79330+}
79331+
79332+int vtpm_release_packets(tpmif_t * tpmif, int send_msgs)
79333+{
79334+ unsigned long flags;
79335+
79336+ write_lock_irqsave(&dataex.pak_lock, flags);
79337+
79338+ dataex.aborted = _vtpm_release_packets(&dataex.pending_pak,
79339+ tpmif,
79340+ send_msgs);
79341+ _vtpm_release_packets(&dataex.current_pak, tpmif, send_msgs);
79342+
79343+ write_unlock_irqrestore(&dataex.pak_lock, flags);
79344+ return 0;
79345+}
79346+
79347+static int vtpm_queue_packet(struct packet *pak)
79348+{
79349+ int rc = 0;
79350+
79351+ if (dataex.has_opener) {
79352+ unsigned long flags;
79353+
79354+ write_lock_irqsave(&dataex.pak_lock, flags);
79355+ list_add_tail(&pak->next, &dataex.pending_pak);
79356+ /* give the TPM some time to pick up the request */
79357+ mod_timer(&pak->processing_timer, jiffies + (30 * HZ));
79358+ write_unlock_irqrestore(&dataex.pak_lock, flags);
79359+
79360+ wake_up_interruptible(&dataex.wait_queue);
79361+ } else {
79362+ rc = -EFAULT;
79363+ }
79364+ return rc;
79365+}
79366+
79367+static int vtpm_receive(tpmif_t * tpmif, u32 size)
79368+{
79369+ int rc = 0;
79370+ unsigned char buffer[10];
79371+ __be32 *native_size;
79372+ struct packet *pak = packet_alloc(tpmif, size, 0, 0);
79373+
79374+ if (!pak)
79375+ return -ENOMEM;
79376+ /*
79377+ * Read 10 bytes from the received buffer to test its
79378+ * content for validity.
79379+ */
79380+ if (sizeof (buffer) != packet_read(pak,
79381+ sizeof (buffer), buffer,
79382+ sizeof (buffer), 0)) {
79383+ goto failexit;
79384+ }
79385+ /*
79386+ * Reset the packet read pointer so we can read all its
79387+ * contents again.
79388+ */
79389+ packet_reset(pak);
79390+
79391+ native_size = (__force __be32 *) (&buffer[4 + 2]);
79392+ /*
79393+ * Verify that the size of the packet is correct
79394+ * as indicated and that there's actually someone reading packets.
79395+ * The minimum size of the packet is '10' for tag, size indicator
79396+ * and ordinal.
79397+ */
79398+ if (size < 10 ||
79399+ be32_to_cpu(*native_size) != size ||
79400+ 0 == dataex.has_opener || tpmif->status != CONNECTED) {
79401+ rc = -EINVAL;
79402+ goto failexit;
79403+ } else {
79404+ rc = vtpm_queue_packet(pak);
79405+ if (rc < 0)
79406+ goto failexit;
79407+ }
79408+ return 0;
79409+
79410+ failexit:
79411+ if (pak) {
79412+ tpm_send_fail_message(pak, buffer[4 + 1]);
79413+ packet_free(pak);
79414+ }
79415+ return rc;
79416+}
79417+
79418+/*
79419+ * Timeout function that gets invoked when a packet has not been processed
79420+ * during the timeout period.
79421+ * The packet must be on a list when this function is invoked. This
79422+ * also means that once it is taken off a list, the timer must be
79423+ * destroyed as well.
79424+ */
79425+static void processing_timeout(unsigned long ptr)
79426+{
79427+ struct packet *pak = (struct packet *)ptr;
79428+ unsigned long flags;
79429+
79430+ write_lock_irqsave(&dataex.pak_lock, flags);
79431+ /*
79432+	 * Check whether the packet is still on one of
79433+	 * the lists before acting on it.
79434+ */
79435+ if (pak == packet_find_packet(&dataex.pending_pak, pak) ||
79436+ pak == packet_find_packet(&dataex.current_pak, pak)) {
79437+ if ((pak->flags & PACKET_FLAG_DISCARD_RESPONSE) == 0) {
79438+ tpm_send_fail_message(pak, pak->req_tag);
79439+ }
79440+ /* discard future responses */
79441+ pak->flags |= PACKET_FLAG_DISCARD_RESPONSE;
79442+ }
79443+
79444+ write_unlock_irqrestore(&dataex.pak_lock, flags);
79445+}
79446+
79447+static void tpm_tx_action(unsigned long unused);
79448+static DECLARE_TASKLET(tpm_tx_tasklet, tpm_tx_action, 0);
79449+
79450+static struct list_head tpm_schedule_list;
79451+static spinlock_t tpm_schedule_list_lock;
79452+
79453+static inline void maybe_schedule_tx_action(void)
79454+{
79455+ smp_mb();
79456+ tasklet_schedule(&tpm_tx_tasklet);
79457+}
79458+
79459+static inline int __on_tpm_schedule_list(tpmif_t * tpmif)
79460+{
79461+ return tpmif->list.next != NULL;
79462+}
79463+
79464+static void remove_from_tpm_schedule_list(tpmif_t * tpmif)
79465+{
79466+ spin_lock_irq(&tpm_schedule_list_lock);
79467+ if (likely(__on_tpm_schedule_list(tpmif))) {
79468+ list_del(&tpmif->list);
79469+ tpmif->list.next = NULL;
79470+ tpmif_put(tpmif);
79471+ }
79472+ spin_unlock_irq(&tpm_schedule_list_lock);
79473+}
79474+
79475+static void add_to_tpm_schedule_list_tail(tpmif_t * tpmif)
79476+{
79477+ if (__on_tpm_schedule_list(tpmif))
79478+ return;
79479+
79480+ spin_lock_irq(&tpm_schedule_list_lock);
79481+ if (!__on_tpm_schedule_list(tpmif) && tpmif->active) {
79482+ list_add_tail(&tpmif->list, &tpm_schedule_list);
79483+ tpmif_get(tpmif);
79484+ }
79485+ spin_unlock_irq(&tpm_schedule_list_lock);
79486+}
79487+
79488+void tpmif_schedule_work(tpmif_t * tpmif)
79489+{
79490+ add_to_tpm_schedule_list_tail(tpmif);
79491+ maybe_schedule_tx_action();
79492+}
79493+
79494+void tpmif_deschedule_work(tpmif_t * tpmif)
79495+{
79496+ remove_from_tpm_schedule_list(tpmif);
79497+}
79498+
79499+static void tpm_tx_action(unsigned long unused)
79500+{
79501+ struct list_head *ent;
79502+ tpmif_t *tpmif;
79503+ tpmif_tx_request_t *tx;
79504+
79505+ DPRINTK("%s: Getting data from front-end(s)!\n", __FUNCTION__);
79506+
79507+ while (!list_empty(&tpm_schedule_list)) {
79508+ /* Get a tpmif from the list with work to do. */
79509+ ent = tpm_schedule_list.next;
79510+ tpmif = list_entry(ent, tpmif_t, list);
79511+ tpmif_get(tpmif);
79512+ remove_from_tpm_schedule_list(tpmif);
79513+
79514+ tx = &tpmif->tx->ring[0].req;
79515+
79516+ /* pass it up */
79517+ vtpm_receive(tpmif, tx->size);
79518+
79519+ tpmif_put(tpmif);
79520+ }
79521+}
79522+
79523+irqreturn_t tpmif_be_int(int irq, void *dev_id, struct pt_regs *regs)
79524+{
79525+ tpmif_t *tpmif = (tpmif_t *) dev_id;
79526+
79527+ add_to_tpm_schedule_list_tail(tpmif);
79528+ maybe_schedule_tx_action();
79529+ return IRQ_HANDLED;
79530+}
79531+
79532+static int __init tpmback_init(void)
79533+{
79534+ int rc;
79535+
79536+ if ((rc = misc_register(&vtpms_miscdevice)) != 0) {
79537+ printk(KERN_ALERT
79538+ "Could not register misc device for TPM BE.\n");
79539+ return rc;
79540+ }
79541+
79542+ dataex_init(&dataex);
79543+
79544+ spin_lock_init(&tpm_schedule_list_lock);
79545+ INIT_LIST_HEAD(&tpm_schedule_list);
79546+
79547+ tpmif_interface_init();
79548+ tpmif_xenbus_init();
79549+
79550+ printk(KERN_ALERT "Successfully initialized TPM backend driver.\n");
79551+
79552+ return 0;
79553+}
79554+
79555+module_init(tpmback_init);
79556+
79557+void __exit tpmback_exit(void)
79558+{
79559+ vtpm_release_packets(NULL, 0);
79560+ tpmif_xenbus_exit();
79561+ tpmif_interface_exit();
79562+ misc_deregister(&vtpms_miscdevice);
79563+}
79564+
79565+MODULE_LICENSE("Dual BSD/GPL");
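For reference, the 10-byte failure reply built by tpm_send_fail_message() above decodes as follows, assuming the frontend used the ordinary single-command request tag 0x00C1 (TPM_TAG_RQU_COMMAND), so the '+3' rule yields the matching response tag:

	bytes 0-1   tag        = 0x00C4      (request tag 0x00C1 + 3)
	bytes 2-5   paramSize  = 0x0000000A  (10, the length of the entire reply)
	bytes 6-9   returnCode = 0x00000009  (TPM_FAIL)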
79566diff -Nur linux-2.6.16.33-noxen/drivers/xen/tpmback/xenbus.c linux-2.6.16.33/drivers/xen/tpmback/xenbus.c
79567--- linux-2.6.16.33-noxen/drivers/xen/tpmback/xenbus.c 1970-01-01 00:00:00.000000000 +0000
79568+++ linux-2.6.16.33/drivers/xen/tpmback/xenbus.c 2007-01-08 15:00:45.000000000 +0000
79569@@ -0,0 +1,289 @@
79570+/* Xenbus code for tpmif backend
79571+ Copyright (C) 2005 IBM Corporation
79572+ Copyright (C) 2005 Rusty Russell <rusty@rustcorp.com.au>
79573+
79574+ This program is free software; you can redistribute it and/or modify
79575+ it under the terms of the GNU General Public License as published by
79576+ the Free Software Foundation; either version 2 of the License, or
79577+ (at your option) any later version.
79578+
79579+ This program is distributed in the hope that it will be useful,
79580+ but WITHOUT ANY WARRANTY; without even the implied warranty of
79581+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
79582+ GNU General Public License for more details.
79583+
79584+ You should have received a copy of the GNU General Public License
79585+ along with this program; if not, write to the Free Software
79586+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
79587+*/
79588+#include <stdarg.h>
79589+#include <linux/module.h>
79590+#include <xen/xenbus.h>
79591+#include "common.h"
79592+
79593+struct backend_info
79594+{
79595+ struct xenbus_device *dev;
79596+
79597+ /* our communications channel */
79598+ tpmif_t *tpmif;
79599+
79600+ long int frontend_id;
79601+ long int instance; // instance of TPM
79602+ u8 is_instance_set;// whether instance number has been set
79603+
79604+ /* watch front end for changes */
79605+ struct xenbus_watch backend_watch;
79606+};
79607+
79608+static void maybe_connect(struct backend_info *be);
79609+static void connect(struct backend_info *be);
79610+static int connect_ring(struct backend_info *be);
79611+static void backend_changed(struct xenbus_watch *watch,
79612+ const char **vec, unsigned int len);
79613+static void frontend_changed(struct xenbus_device *dev,
79614+ enum xenbus_state frontend_state);
79615+
79616+long int tpmback_get_instance(struct backend_info *bi)
79617+{
79618+ long int res = -1;
79619+ if (bi && bi->is_instance_set)
79620+ res = bi->instance;
79621+ return res;
79622+}
79623+
79624+static int tpmback_remove(struct xenbus_device *dev)
79625+{
79626+ struct backend_info *be = dev->dev.driver_data;
79627+
79628+ if (!be) return 0;
79629+
79630+ if (be->backend_watch.node) {
79631+ unregister_xenbus_watch(&be->backend_watch);
79632+ kfree(be->backend_watch.node);
79633+ be->backend_watch.node = NULL;
79634+ }
79635+ if (be->tpmif) {
79636+ be->tpmif->bi = NULL;
79637+ vtpm_release_packets(be->tpmif, 0);
79638+ tpmif_put(be->tpmif);
79639+ be->tpmif = NULL;
79640+ }
79641+ kfree(be);
79642+ dev->dev.driver_data = NULL;
79643+ return 0;
79644+}
79645+
79646+static int tpmback_probe(struct xenbus_device *dev,
79647+ const struct xenbus_device_id *id)
79648+{
79649+ int err;
79650+ struct backend_info *be = kzalloc(sizeof(struct backend_info),
79651+ GFP_KERNEL);
79652+
79653+ if (!be) {
79654+ xenbus_dev_fatal(dev, -ENOMEM,
79655+ "allocating backend structure");
79656+ return -ENOMEM;
79657+ }
79658+
79659+ be->is_instance_set = 0;
79660+ be->dev = dev;
79661+ dev->dev.driver_data = be;
79662+
79663+ err = xenbus_watch_path2(dev, dev->nodename,
79664+ "instance", &be->backend_watch,
79665+ backend_changed);
79666+ if (err) {
79667+ goto fail;
79668+ }
79669+
79670+ err = xenbus_switch_state(dev, XenbusStateInitWait);
79671+ if (err) {
79672+ goto fail;
79673+ }
79674+ return 0;
79675+fail:
79676+ tpmback_remove(dev);
79677+ return err;
79678+}
79679+
79680+
79681+static void backend_changed(struct xenbus_watch *watch,
79682+ const char **vec, unsigned int len)
79683+{
79684+ int err;
79685+ long instance;
79686+ struct backend_info *be
79687+ = container_of(watch, struct backend_info, backend_watch);
79688+ struct xenbus_device *dev = be->dev;
79689+
79690+ err = xenbus_scanf(XBT_NIL, dev->nodename,
79691+ "instance","%li", &instance);
79692+ if (XENBUS_EXIST_ERR(err)) {
79693+ return;
79694+ }
79695+
79696+ if (err != 1) {
79697+ xenbus_dev_fatal(dev, err, "reading instance");
79698+ return;
79699+ }
79700+
79701+ if (be->is_instance_set == 0) {
79702+ be->instance = instance;
79703+ be->is_instance_set = 1;
79704+ }
79705+}
79706+
79707+
79708+static void frontend_changed(struct xenbus_device *dev,
79709+ enum xenbus_state frontend_state)
79710+{
79711+ struct backend_info *be = dev->dev.driver_data;
79712+ int err;
79713+
79714+ switch (frontend_state) {
79715+ case XenbusStateInitialising:
79716+ case XenbusStateInitialised:
79717+ break;
79718+
79719+ case XenbusStateConnected:
79720+ err = connect_ring(be);
79721+ if (err) {
79722+ return;
79723+ }
79724+ maybe_connect(be);
79725+ break;
79726+
79727+ case XenbusStateClosing:
79728+ be->instance = -1;
79729+ xenbus_switch_state(dev, XenbusStateClosing);
79730+ break;
79731+
79732+ case XenbusStateUnknown: /* keep it here */
79733+ case XenbusStateClosed:
79734+ xenbus_switch_state(dev, XenbusStateClosed);
79735+ device_unregister(&be->dev->dev);
79736+ tpmback_remove(dev);
79737+ break;
79738+
79739+ default:
79740+ xenbus_dev_fatal(dev, -EINVAL,
79741+ "saw state %d at frontend",
79742+ frontend_state);
79743+ break;
79744+ }
79745+}
79746+
79747+
79748+
79749+static void maybe_connect(struct backend_info *be)
79750+{
79751+ if (be->tpmif == NULL || be->tpmif->status == CONNECTED)
79752+ return;
79753+
79754+ connect(be);
79755+}
79756+
79757+
79758+static void connect(struct backend_info *be)
79759+{
79760+ struct xenbus_transaction xbt;
79761+ int err;
79762+ struct xenbus_device *dev = be->dev;
79763+ unsigned long ready = 1;
79764+
79765+again:
79766+ err = xenbus_transaction_start(&xbt);
79767+ if (err) {
79768+ xenbus_dev_fatal(be->dev, err, "starting transaction");
79769+ return;
79770+ }
79771+
79772+ err = xenbus_printf(xbt, be->dev->nodename,
79773+ "ready", "%lu", ready);
79774+ if (err) {
79775+ xenbus_dev_fatal(be->dev, err, "writing 'ready'");
79776+ goto abort;
79777+ }
79778+
79779+ err = xenbus_transaction_end(xbt, 0);
79780+ if (err == -EAGAIN)
79781+ goto again;
79782+ if (err)
79783+ xenbus_dev_fatal(be->dev, err, "end of transaction");
79784+
79785+ err = xenbus_switch_state(dev, XenbusStateConnected);
79786+ if (!err)
79787+ be->tpmif->status = CONNECTED;
79788+ return;
79789+abort:
79790+ xenbus_transaction_end(xbt, 1);
79791+}
79792+
79793+
79794+static int connect_ring(struct backend_info *be)
79795+{
79796+ struct xenbus_device *dev = be->dev;
79797+ unsigned long ring_ref;
79798+ unsigned int evtchn;
79799+ int err;
79800+
79801+ err = xenbus_gather(XBT_NIL, dev->otherend,
79802+ "ring-ref", "%lu", &ring_ref,
79803+ "event-channel", "%u", &evtchn, NULL);
79804+ if (err) {
79805+ xenbus_dev_error(dev, err,
79806+ "reading %s/ring-ref and event-channel",
79807+ dev->otherend);
79808+ return err;
79809+ }
79810+
79811+ if (!be->tpmif) {
79812+ be->tpmif = tpmif_find(dev->otherend_id, be);
79813+ if (IS_ERR(be->tpmif)) {
79814+ err = PTR_ERR(be->tpmif);
79815+ be->tpmif = NULL;
79816+ xenbus_dev_fatal(dev,err,"creating vtpm interface");
79817+ return err;
79818+ }
79819+ }
79820+
79821+ if (be->tpmif != NULL) {
79822+ err = tpmif_map(be->tpmif, ring_ref, evtchn);
79823+ if (err) {
79824+ xenbus_dev_error(dev, err,
79825+ "mapping shared-frame %lu port %u",
79826+ ring_ref, evtchn);
79827+ return err;
79828+ }
79829+ }
79830+ return 0;
79831+}
79832+
79833+
79834+static struct xenbus_device_id tpmback_ids[] = {
79835+ { "vtpm" },
79836+ { "" }
79837+};
79838+
79839+
79840+static struct xenbus_driver tpmback = {
79841+ .name = "vtpm",
79842+ .owner = THIS_MODULE,
79843+ .ids = tpmback_ids,
79844+ .probe = tpmback_probe,
79845+ .remove = tpmback_remove,
79846+ .otherend_changed = frontend_changed,
79847+};
79848+
79849+
79850+void tpmif_xenbus_init(void)
79851+{
79852+ xenbus_register_backend(&tpmback);
79853+}
79854+
79855+void tpmif_xenbus_exit(void)
79856+{
79857+ xenbus_unregister_driver(&tpmback);
79858+}
79859diff -Nur linux-2.6.16.33-noxen/drivers/xen/util.c linux-2.6.16.33/drivers/xen/util.c
79860--- linux-2.6.16.33-noxen/drivers/xen/util.c 1970-01-01 00:00:00.000000000 +0000
79861+++ linux-2.6.16.33/drivers/xen/util.c 2007-01-08 15:00:45.000000000 +0000
79862@@ -0,0 +1,70 @@
79863+#include <linux/config.h>
79864+#include <linux/mm.h>
79865+#include <linux/module.h>
79866+#include <linux/slab.h>
79867+#include <linux/vmalloc.h>
79868+#include <asm/uaccess.h>
79869+#include <xen/driver_util.h>
79870+
79871+static int f(pte_t *pte, struct page *pmd_page, unsigned long addr, void *data)
79872+{
79873+ /* apply_to_page_range() does all the hard work. */
79874+ return 0;
79875+}
79876+
79877+struct vm_struct *alloc_vm_area(unsigned long size)
79878+{
79879+ struct vm_struct *area;
79880+
79881+ area = get_vm_area(size, VM_IOREMAP);
79882+ if (area == NULL)
79883+ return NULL;
79884+
79885+ /*
79886+ * This ensures that page tables are constructed for this region
79887+ * of kernel virtual address space and mapped into init_mm.
79888+ */
79889+ if (apply_to_page_range(&init_mm, (unsigned long)area->addr,
79890+ area->size, f, NULL)) {
79891+ free_vm_area(area);
79892+ return NULL;
79893+ }
79894+
79895+ return area;
79896+}
79897+EXPORT_SYMBOL_GPL(alloc_vm_area);
79898+
79899+void free_vm_area(struct vm_struct *area)
79900+{
79901+ struct vm_struct *ret;
79902+ ret = remove_vm_area(area->addr);
79903+ BUG_ON(ret != area);
79904+ kfree(area);
79905+}
79906+EXPORT_SYMBOL_GPL(free_vm_area);
79907+
79908+void lock_vm_area(struct vm_struct *area)
79909+{
79910+ unsigned long i;
79911+ char c;
79912+
79913+ /*
79914+ * Prevent context switch to a lazy mm that doesn't have this area
79915+ * mapped into its page tables.
79916+ */
79917+ preempt_disable();
79918+
79919+ /*
79920+ * Ensure that the page tables are mapped into the current mm. The
79921+ * page-fault path will copy the page directory pointers from init_mm.
79922+ */
79923+ for (i = 0; i < area->size; i += PAGE_SIZE)
79924+ (void)__get_user(c, (char __user *)area->addr + i);
79925+}
79926+EXPORT_SYMBOL_GPL(lock_vm_area);
79927+
79928+void unlock_vm_area(struct vm_struct *area)
79929+{
79930+ preempt_enable();
79931+}
79932+EXPORT_SYMBOL_GPL(unlock_vm_area);
79933diff -Nur linux-2.6.16.33-noxen/drivers/xen/xenbus/Makefile linux-2.6.16.33/drivers/xen/xenbus/Makefile
79934--- linux-2.6.16.33-noxen/drivers/xen/xenbus/Makefile 1970-01-01 00:00:00.000000000 +0000
79935+++ linux-2.6.16.33/drivers/xen/xenbus/Makefile 2007-01-08 15:00:45.000000000 +0000
79936@@ -0,0 +1,9 @@
79937+obj-y += xenbus_client.o xenbus_comms.o xenbus_xs.o xenbus_probe.o
79938+obj-$(CONFIG_XEN_BACKEND) += xenbus_be.o
79939+
79940+xenbus_be-objs =
79941+xenbus_be-objs += xenbus_backend_client.o
79942+
79943+xenbus-$(CONFIG_XEN_BACKEND) += xenbus_probe_backend.o
79944+obj-y += $(xenbus-y) $(xenbus-m)
79945+obj-$(CONFIG_XEN_XENBUS_DEV) += xenbus_dev.o
79946diff -Nur linux-2.6.16.33-noxen/drivers/xen/xenbus/xenbus_backend_client.c linux-2.6.16.33/drivers/xen/xenbus/xenbus_backend_client.c
79947--- linux-2.6.16.33-noxen/drivers/xen/xenbus/xenbus_backend_client.c 1970-01-01 00:00:00.000000000 +0000
79948+++ linux-2.6.16.33/drivers/xen/xenbus/xenbus_backend_client.c 2007-01-08 15:00:45.000000000 +0000
79949@@ -0,0 +1,147 @@
79950+/******************************************************************************
79951+ * Backend-client-facing interface for the Xenbus driver. In other words, the
79952+ * interface between the Xenbus and the device-specific code in the backend
79953+ * driver.
79954+ *
79955+ * Copyright (C) 2005-2006 XenSource Ltd
79956+ *
79957+ * This program is free software; you can redistribute it and/or
79958+ * modify it under the terms of the GNU General Public License version 2
79959+ * as published by the Free Software Foundation; or, when distributed
79960+ * separately from the Linux kernel or incorporated into other
79961+ * software packages, subject to the following license:
79962+ *
79963+ * Permission is hereby granted, free of charge, to any person obtaining a copy
79964+ * of this source file (the "Software"), to deal in the Software without
79965+ * restriction, including without limitation the rights to use, copy, modify,
79966+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
79967+ * and to permit persons to whom the Software is furnished to do so, subject to
79968+ * the following conditions:
79969+ *
79970+ * The above copyright notice and this permission notice shall be included in
79971+ * all copies or substantial portions of the Software.
79972+ *
79973+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
79974+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
79975+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
79976+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
79977+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
79978+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
79979+ * IN THE SOFTWARE.
79980+ */
79981+
79982+#include <linux/err.h>
79983+#include <xen/gnttab.h>
79984+#include <xen/xenbus.h>
79985+#include <xen/driver_util.h>
79986+
79987+/* Based on Rusty Russell's skeleton driver's map_page */
79988+struct vm_struct *xenbus_map_ring_valloc(struct xenbus_device *dev, int gnt_ref)
79989+{
79990+ struct gnttab_map_grant_ref op;
79991+ struct vm_struct *area;
79992+
79993+ area = alloc_vm_area(PAGE_SIZE);
79994+ if (!area)
79995+ return ERR_PTR(-ENOMEM);
79996+
79997+ gnttab_set_map_op(&op, (unsigned long)area->addr, GNTMAP_host_map,
79998+ gnt_ref, dev->otherend_id);
79999+
80000+ lock_vm_area(area);
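+ /* The hypercall itself fails only on malformed arguments; per-op errors land in op.status and are handled below. */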
80001+ BUG_ON(HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1));
80002+ unlock_vm_area(area);
80003+
80004+ if (op.status != GNTST_okay) {
80005+ free_vm_area(area);
80006+ xenbus_dev_fatal(dev, op.status,
80007+ "mapping in shared page %d from domain %d",
80008+ gnt_ref, dev->otherend_id);
80009+ BUG_ON(!IS_ERR(ERR_PTR(op.status)));
80010+ return ERR_PTR(op.status);
80011+ }
80012+
80013+ /* Stuff the handle in an unused field */
80014+ area->phys_addr = (unsigned long)op.handle;
80015+
80016+ return area;
80017+}
80018+EXPORT_SYMBOL_GPL(xenbus_map_ring_valloc);
80019+
80020+
80021+int xenbus_map_ring(struct xenbus_device *dev, int gnt_ref,
80022+ grant_handle_t *handle, void *vaddr)
80023+{
80024+ struct gnttab_map_grant_ref op;
80025+
80026+ gnttab_set_map_op(&op, (unsigned long)vaddr, GNTMAP_host_map,
80027+ gnt_ref, dev->otherend_id);
80028+ BUG_ON(HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1));
80029+
80030+ if (op.status != GNTST_okay) {
80031+ xenbus_dev_fatal(dev, op.status,
80032+ "mapping in shared page %d from domain %d",
80033+ gnt_ref, dev->otherend_id);
80034+ } else
80035+ *handle = op.handle;
80036+
80037+ return op.status;
80038+}
80039+EXPORT_SYMBOL_GPL(xenbus_map_ring);
80040+
80041+
80042+/* Based on Rusty Russell's skeleton driver's unmap_page */
80043+int xenbus_unmap_ring_vfree(struct xenbus_device *dev, struct vm_struct *area)
80044+{
80045+ struct gnttab_unmap_grant_ref op;
80046+
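+ /* The grant handle was stashed in area->phys_addr by xenbus_map_ring_valloc(). */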
80047+ gnttab_set_unmap_op(&op, (unsigned long)area->addr, GNTMAP_host_map,
80048+ (grant_handle_t)area->phys_addr);
80049+
80050+ lock_vm_area(area);
80051+ BUG_ON(HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1));
80052+ unlock_vm_area(area);
80053+
80054+ if (op.status == GNTST_okay)
80055+ free_vm_area(area);
80056+ else
80057+ xenbus_dev_error(dev, op.status,
80058+ "unmapping page at handle %d error %d",
80059+ (int16_t)area->phys_addr, op.status);
80060+
80061+ return op.status;
80062+}
80063+EXPORT_SYMBOL_GPL(xenbus_unmap_ring_vfree);
80064+
80065+
80066+int xenbus_unmap_ring(struct xenbus_device *dev,
80067+ grant_handle_t handle, void *vaddr)
80068+{
80069+ struct gnttab_unmap_grant_ref op;
80070+
80071+ gnttab_set_unmap_op(&op, (unsigned long)vaddr, GNTMAP_host_map,
80072+ handle);
80073+ BUG_ON(HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1));
80074+
80075+ if (op.status != GNTST_okay)
80076+ xenbus_dev_error(dev, op.status,
80077+ "unmapping page at handle %d error %d",
80078+ handle, op.status);
80079+
80080+ return op.status;
80081+}
80082+EXPORT_SYMBOL_GPL(xenbus_unmap_ring);
80083+
80084+int xenbus_dev_is_online(struct xenbus_device *dev)
80085+{
80086+ int rc, val;
80087+
80088+ rc = xenbus_scanf(XBT_NIL, dev->nodename, "online", "%d", &val);
80089+ if (rc != 1)
80090+ val = 0; /* no online node present */
80091+
80092+ return val;
80093+}
80094+EXPORT_SYMBOL_GPL(xenbus_dev_is_online);
80095+
80096+MODULE_LICENSE("Dual BSD/GPL");
80097diff -Nur linux-2.6.16.33-noxen/drivers/xen/xenbus/xenbus_client.c linux-2.6.16.33/drivers/xen/xenbus/xenbus_client.c
80098--- linux-2.6.16.33-noxen/drivers/xen/xenbus/xenbus_client.c 1970-01-01 00:00:00.000000000 +0000
80099+++ linux-2.6.16.33/drivers/xen/xenbus/xenbus_client.c 2007-01-08 15:00:45.000000000 +0000
80100@@ -0,0 +1,304 @@
80101+/******************************************************************************
80102+ * Client-facing interface for the Xenbus driver. In other words, the
80103+ * interface between the Xenbus and the device-specific code, be it the
80104+ * frontend or the backend of that driver.
80105+ *
80106+ * Copyright (C) 2005 XenSource Ltd
80107+ *
80108+ * This program is free software; you can redistribute it and/or
80109+ * modify it under the terms of the GNU General Public License version 2
80110+ * as published by the Free Software Foundation; or, when distributed
80111+ * separately from the Linux kernel or incorporated into other
80112+ * software packages, subject to the following license:
80113+ *
80114+ * Permission is hereby granted, free of charge, to any person obtaining a copy
80115+ * of this source file (the "Software"), to deal in the Software without
80116+ * restriction, including without limitation the rights to use, copy, modify,
80117+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
80118+ * and to permit persons to whom the Software is furnished to do so, subject to
80119+ * the following conditions:
80120+ *
80121+ * The above copyright notice and this permission notice shall be included in
80122+ * all copies or substantial portions of the Software.
80123+ *
80124+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
80125+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
80126+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
80127+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
80128+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
80129+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
80130+ * IN THE SOFTWARE.
80131+ */
80132+
80133+#include <xen/evtchn.h>
80134+#include <xen/gnttab.h>
80135+#include <xen/xenbus.h>
80136+#include <xen/driver_util.h>
80137+
80138+#ifdef HAVE_XEN_PLATFORM_COMPAT_H
80139+#include <xen/platform-compat.h>
80140+#endif
80141+
80142+#define DPRINTK(fmt, args...) \
80143+ pr_debug("xenbus_client (%s:%d) " fmt ".\n", __FUNCTION__, __LINE__, ##args)
80144+
80145+char *xenbus_strstate(enum xenbus_state state)
80146+{
80147+ static char *name[] = {
80148+ [ XenbusStateUnknown ] = "Unknown",
80149+ [ XenbusStateInitialising ] = "Initialising",
80150+ [ XenbusStateInitWait ] = "InitWait",
80151+ [ XenbusStateInitialised ] = "Initialised",
80152+ [ XenbusStateConnected ] = "Connected",
80153+ [ XenbusStateClosing ] = "Closing",
80154+ [ XenbusStateClosed ] = "Closed",
80155+ };
80156+ return (state < ARRAY_SIZE(name)) ? name[state] : "INVALID";
80157+}
80158+
80159+int xenbus_watch_path(struct xenbus_device *dev, const char *path,
80160+ struct xenbus_watch *watch,
80161+ void (*callback)(struct xenbus_watch *,
80162+ const char **, unsigned int))
80163+{
80164+ int err;
80165+
80166+ watch->node = path;
80167+ watch->callback = callback;
80168+
80169+ err = register_xenbus_watch(watch);
80170+
80171+ if (err) {
80172+ watch->node = NULL;
80173+ watch->callback = NULL;
80174+ xenbus_dev_fatal(dev, err, "adding watch on %s", path);
80175+ }
80176+
80177+ return err;
80178+}
80179+EXPORT_SYMBOL_GPL(xenbus_watch_path);
80180+
80181+
80182+int xenbus_watch_path2(struct xenbus_device *dev, const char *path,
80183+ const char *path2, struct xenbus_watch *watch,
80184+ void (*callback)(struct xenbus_watch *,
80185+ const char **, unsigned int))
80186+{
80187+ int err;
80188+ char *state = kasprintf(GFP_KERNEL, "%s/%s", path, path2);
80189+ if (!state) {
80190+ xenbus_dev_fatal(dev, -ENOMEM, "allocating path for watch");
80191+ return -ENOMEM;
80192+ }
80193+ err = xenbus_watch_path(dev, state, watch, callback);
80194+
80195+ if (err)
80196+ kfree(state);
80197+ return err;
80198+}
80199+EXPORT_SYMBOL_GPL(xenbus_watch_path2);
80200+
80201+
80202+int xenbus_switch_state(struct xenbus_device *dev, enum xenbus_state state)
80203+{
80204+ /* We check whether the state is currently set to the given value, and
80205+ if not, then the state is set. We don't want to unconditionally
80206+ write the given state, because we don't want to fire watches
80207+ unnecessarily. Furthermore, if the node has gone, we don't write
80208+ to it, as the device will be tearing down, and we don't want to
80209+ resurrect that directory.
80210+
80211+ Note that, because of this cached value of our state, this function
80212+ will not work inside a Xenstore transaction (something it used to
80213+ attempt in the past) because dev->state would not get reset if
80214+ the transaction was aborted.
80215+
80216+ */
80217+
80218+ int current_state;
80219+ int err;
80220+
80221+ if (state == dev->state)
80222+ return 0;
80223+
80224+ err = xenbus_scanf(XBT_NIL, dev->nodename, "state", "%d",
80225+ &current_state);
80226+ if (err != 1)
80227+ return 0;
80228+
80229+ err = xenbus_printf(XBT_NIL, dev->nodename, "state", "%d", state);
80230+ if (err) {
80231+ if (state != XenbusStateClosing) /* Avoid looping */
80232+ xenbus_dev_fatal(dev, err, "writing new state");
80233+ return err;
80234+ }
80235+
80236+ dev->state = state;
80237+
80238+ return 0;
80239+}
80240+EXPORT_SYMBOL_GPL(xenbus_switch_state);
80241+
80242+int xenbus_frontend_closed(struct xenbus_device *dev)
80243+{
80244+ xenbus_switch_state(dev, XenbusStateClosed);
80245+ complete(&dev->down);
80246+ return 0;
80247+}
80248+EXPORT_SYMBOL_GPL(xenbus_frontend_closed);
80249+
80250+/**
80251+ * Return the path to the error node for the given device, or NULL on failure.
80252+ * If the value returned is non-NULL, then it is the caller's to kfree.
80253+ */
80254+static char *error_path(struct xenbus_device *dev)
80255+{
80256+ return kasprintf(GFP_KERNEL, "error/%s", dev->nodename);
80257+}
80258+
80259+
80260+void _dev_error(struct xenbus_device *dev, int err, const char *fmt,
80261+ va_list ap)
80262+{
80263+ int ret;
80264+ unsigned int len;
80265+ char *printf_buffer = NULL, *path_buffer = NULL;
80266+
80267+#define PRINTF_BUFFER_SIZE 4096
80268+ printf_buffer = kmalloc(PRINTF_BUFFER_SIZE, GFP_KERNEL);
80269+ if (printf_buffer == NULL)
80270+ goto fail;
80271+
80272+ len = sprintf(printf_buffer, "%i ", -err);
80273+ ret = vsnprintf(printf_buffer+len, PRINTF_BUFFER_SIZE-len, fmt, ap);
80274+
80275+ BUG_ON(len + ret > PRINTF_BUFFER_SIZE-1);
80276+
80277+ dev_err(&dev->dev, "%s\n", printf_buffer);
80278+
80279+ path_buffer = error_path(dev);
80280+
80281+ if (path_buffer == NULL) {
80282+ printk("xenbus: failed to write error node for %s (%s)\n",
80283+ dev->nodename, printf_buffer);
80284+ goto fail;
80285+ }
80286+
80287+ if (xenbus_write(XBT_NIL, path_buffer, "error", printf_buffer) != 0) {
80288+ printk("xenbus: failed to write error node for %s (%s)\n",
80289+ dev->nodename, printf_buffer);
80290+ goto fail;
80291+ }
80292+
80293+fail:
80294+ if (printf_buffer)
80295+ kfree(printf_buffer);
80296+ if (path_buffer)
80297+ kfree(path_buffer);
80298+}
80299+
80300+
80301+void xenbus_dev_error(struct xenbus_device *dev, int err, const char *fmt,
80302+ ...)
80303+{
80304+ va_list ap;
80305+
80306+ va_start(ap, fmt);
80307+ _dev_error(dev, err, fmt, ap);
80308+ va_end(ap);
80309+}
80310+EXPORT_SYMBOL_GPL(xenbus_dev_error);
80311+
80312+
80313+void xenbus_dev_fatal(struct xenbus_device *dev, int err, const char *fmt,
80314+ ...)
80315+{
80316+ va_list ap;
80317+
80318+ va_start(ap, fmt);
80319+ _dev_error(dev, err, fmt, ap);
80320+ va_end(ap);
80321+
80322+ xenbus_switch_state(dev, XenbusStateClosing);
80323+}
80324+EXPORT_SYMBOL_GPL(xenbus_dev_fatal);
80325+
80326+
80327+int xenbus_grant_ring(struct xenbus_device *dev, unsigned long ring_mfn)
80328+{
80329+ int err = gnttab_grant_foreign_access(dev->otherend_id, ring_mfn, 0);
80330+ if (err < 0)
80331+ xenbus_dev_fatal(dev, err, "granting access to ring page");
80332+ return err;
80333+}
80334+EXPORT_SYMBOL_GPL(xenbus_grant_ring);
80335+
80336+
80337+int xenbus_alloc_evtchn(struct xenbus_device *dev, int *port)
80338+{
80339+ struct evtchn_alloc_unbound alloc_unbound;
80340+ int err;
80341+
80342+ alloc_unbound.dom = DOMID_SELF;
80343+ alloc_unbound.remote_dom = dev->otherend_id;
80344+
80345+ err = HYPERVISOR_event_channel_op(EVTCHNOP_alloc_unbound,
80346+ &alloc_unbound);
80347+ if (err)
80348+ xenbus_dev_fatal(dev, err, "allocating event channel");
80349+ else
80350+ *port = alloc_unbound.port;
80351+
80352+ return err;
80353+}
80354+EXPORT_SYMBOL_GPL(xenbus_alloc_evtchn);
80355+
80356+
80357+int xenbus_bind_evtchn(struct xenbus_device *dev, int remote_port, int *port)
80358+{
80359+ struct evtchn_bind_interdomain bind_interdomain;
80360+ int err;
80361+
80362+ bind_interdomain.remote_dom = dev->otherend_id;
80363+ bind_interdomain.remote_port = remote_port;
80364+
80365+ err = HYPERVISOR_event_channel_op(EVTCHNOP_bind_interdomain,
80366+ &bind_interdomain);
80367+ if (err)
80368+ xenbus_dev_fatal(dev, err,
80369+ "binding to event channel %d from domain %d",
80370+ remote_port, dev->otherend_id);
80371+ else
80372+ *port = bind_interdomain.local_port;
80373+
80374+ return err;
80375+}
80376+EXPORT_SYMBOL_GPL(xenbus_bind_evtchn);
80377+
80378+
80379+int xenbus_free_evtchn(struct xenbus_device *dev, int port)
80380+{
80381+ struct evtchn_close close;
80382+ int err;
80383+
80384+ close.port = port;
80385+
80386+ err = HYPERVISOR_event_channel_op(EVTCHNOP_close, &close);
80387+ if (err)
80388+ xenbus_dev_error(dev, err, "freeing event channel %d", port);
80389+
80390+ return err;
80391+}
80392+EXPORT_SYMBOL_GPL(xenbus_free_evtchn);
80393+
80394+
80395+enum xenbus_state xenbus_read_driver_state(const char *path)
80396+{
80397+ enum xenbus_state result;
80398+ int err = xenbus_gather(XBT_NIL, path, "state", "%d", &result, NULL);
80399+ if (err)
80400+ result = XenbusStateUnknown;
80401+
80402+ return result;
80403+}
80404+EXPORT_SYMBOL_GPL(xenbus_read_driver_state);
80405diff -Nur linux-2.6.16.33-noxen/drivers/xen/xenbus/xenbus_comms.c linux-2.6.16.33/drivers/xen/xenbus/xenbus_comms.c
80406--- linux-2.6.16.33-noxen/drivers/xen/xenbus/xenbus_comms.c 1970-01-01 00:00:00.000000000 +0000
80407+++ linux-2.6.16.33/drivers/xen/xenbus/xenbus_comms.c 2007-01-08 15:00:45.000000000 +0000
80408@@ -0,0 +1,210 @@
80409+/******************************************************************************
80410+ * xenbus_comms.c
80411+ *
80412+ * Low level code to talk to Xen Store: ringbuffer and event channel.
80413+ *
80414+ * Copyright (C) 2005 Rusty Russell, IBM Corporation
80415+ *
80416+ * This program is free software; you can redistribute it and/or
80417+ * modify it under the terms of the GNU General Public License version 2
80418+ * as published by the Free Software Foundation; or, when distributed
80419+ * separately from the Linux kernel or incorporated into other
80420+ * software packages, subject to the following license:
80421+ *
80422+ * Permission is hereby granted, free of charge, to any person obtaining a copy
80423+ * of this source file (the "Software"), to deal in the Software without
80424+ * restriction, including without limitation the rights to use, copy, modify,
80425+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
80426+ * and to permit persons to whom the Software is furnished to do so, subject to
80427+ * the following conditions:
80428+ *
80429+ * The above copyright notice and this permission notice shall be included in
80430+ * all copies or substantial portions of the Software.
80431+ *
80432+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
80433+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
80434+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
80435+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
80436+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
80437+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
80438+ * IN THE SOFTWARE.
80439+ */
80440+
80441+#include <linux/wait.h>
80442+#include <linux/interrupt.h>
80443+#include <linux/sched.h>
80444+#include <linux/err.h>
80445+#include <linux/ptrace.h>
80446+#include <xen/evtchn.h>
80447+#include <xen/xenbus.h>
80448+
80449+#include <asm/hypervisor.h>
80450+
80451+#include "xenbus_comms.h"
80452+
80453+#ifdef HAVE_XEN_PLATFORM_COMPAT_H
80454+#include <xen/platform-compat.h>
80455+#endif
80456+
80457+static int xenbus_irq;
80458+
80459+extern void xenbus_probe(void *);
80460+extern int xenstored_ready;
80461+static DECLARE_WORK(probe_work, xenbus_probe, NULL);
80462+
80463+static DECLARE_WAIT_QUEUE_HEAD(xb_waitq);
80464+
80465+static irqreturn_t wake_waiting(int irq, void *unused, struct pt_regs *regs)
80466+{
80467+ if (unlikely(xenstored_ready == 0)) {
80468+ xenstored_ready = 1;
80469+ schedule_work(&probe_work);
80470+ }
80471+
80472+ wake_up(&xb_waitq);
80473+ return IRQ_HANDLED;
80474+}
80475+
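+ /* A sane ring has the producer at most XENSTORE_RING_SIZE bytes ahead of the consumer. */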
80476+static int check_indexes(XENSTORE_RING_IDX cons, XENSTORE_RING_IDX prod)
80477+{
80478+ return ((prod - cons) <= XENSTORE_RING_SIZE);
80479+}
80480+
80481+static void *get_output_chunk(XENSTORE_RING_IDX cons,
80482+ XENSTORE_RING_IDX prod,
80483+ char *buf, uint32_t *len)
80484+{
80485+ *len = XENSTORE_RING_SIZE - MASK_XENSTORE_IDX(prod);
80486+ if ((XENSTORE_RING_SIZE - (prod - cons)) < *len)
80487+ *len = XENSTORE_RING_SIZE - (prod - cons);
80488+ return buf + MASK_XENSTORE_IDX(prod);
80489+}
80490+
80491+static const void *get_input_chunk(XENSTORE_RING_IDX cons,
80492+ XENSTORE_RING_IDX prod,
80493+ const char *buf, uint32_t *len)
80494+{
80495+ *len = XENSTORE_RING_SIZE - MASK_XENSTORE_IDX(cons);
80496+ if ((prod - cons) < *len)
80497+ *len = prod - cons;
80498+ return buf + MASK_XENSTORE_IDX(cons);
80499+}
80500+
80501+int xb_write(const void *data, unsigned len)
80502+{
80503+ struct xenstore_domain_interface *intf = xen_store_interface;
80504+ XENSTORE_RING_IDX cons, prod;
80505+ int rc;
80506+
80507+ while (len != 0) {
80508+ void *dst;
80509+ unsigned int avail;
80510+
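+ /* Block until the request ring has at least one free byte. */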
80511+ rc = wait_event_interruptible(
80512+ xb_waitq,
80513+ (intf->req_prod - intf->req_cons) !=
80514+ XENSTORE_RING_SIZE);
80515+ if (rc < 0)
80516+ return rc;
80517+
80518+ /* Read indexes, then verify. */
80519+ cons = intf->req_cons;
80520+ prod = intf->req_prod;
80521+ mb();
80522+ if (!check_indexes(cons, prod)) {
80523+ intf->req_cons = intf->req_prod = 0;
80524+ return -EIO;
80525+ }
80526+
80527+ dst = get_output_chunk(cons, prod, intf->req, &avail);
80528+ if (avail == 0)
80529+ continue;
80530+ if (avail > len)
80531+ avail = len;
80532+
80533+ memcpy(dst, data, avail);
80534+ data += avail;
80535+ len -= avail;
80536+
80537+ /* Other side must not see new header until data is there. */
80538+ wmb();
80539+ intf->req_prod += avail;
80540+
80541+ /* This implies mb() before other side sees interrupt. */
80542+ notify_remote_via_evtchn(xen_store_evtchn);
80543+ }
80544+
80545+ return 0;
80546+}
80547+
80548+int xb_read(void *data, unsigned len)
80549+{
80550+ struct xenstore_domain_interface *intf = xen_store_interface;
80551+ XENSTORE_RING_IDX cons, prod;
80552+ int rc;
80553+
80554+ while (len != 0) {
80555+ unsigned int avail;
80556+ const char *src;
80557+
80558+ rc = wait_event_interruptible(
80559+ xb_waitq,
80560+ intf->rsp_cons != intf->rsp_prod);
80561+ if (rc < 0)
80562+ return rc;
80563+
80564+ /* Read indexes, then verify. */
80565+ cons = intf->rsp_cons;
80566+ prod = intf->rsp_prod;
80567+ mb();
80568+ if (!check_indexes(cons, prod)) {
80569+ intf->rsp_cons = intf->rsp_prod = 0;
80570+ return -EIO;
80571+ }
80572+
80573+ src = get_input_chunk(cons, prod, intf->rsp, &avail);
80574+ if (avail == 0)
80575+ continue;
80576+ if (avail > len)
80577+ avail = len;
80578+
80579+ /* We must read header before we read data. */
80580+ rmb();
80581+
80582+ memcpy(data, src, avail);
80583+ data += avail;
80584+ len -= avail;
80585+
80586+ /* Other side must not see free space until we've copied out */
80587+ mb();
80588+ intf->rsp_cons += avail;
80589+
80590+ pr_debug("Finished read of %i bytes (%i to go)\n", avail, len);
80591+
80592+ /* Implies mb(): they will see new header. */
80593+ notify_remote_via_evtchn(xen_store_evtchn);
80594+ }
80595+
80596+ return 0;
80597+}
80598+
80599+/* Set up interrupt handler off store event channel. */
80600+int xb_init_comms(void)
80601+{
80602+ int err;
80603+
80604+ if (xenbus_irq)
80605+ unbind_from_irqhandler(xenbus_irq, &xb_waitq);
80606+
80607+ err = bind_evtchn_to_irqhandler(
80608+ xen_store_evtchn, wake_waiting,
80609+ 0, "xenbus", &xb_waitq);
80610+ if (err <= 0) {
80611+ printk(KERN_ERR "XENBUS request irq failed %i\n", err);
80612+ return err;
80613+ }
80614+
80615+ xenbus_irq = err;
80616+
80617+ return 0;
80618+}
80619diff -Nur linux-2.6.16.33-noxen/drivers/xen/xenbus/xenbus_comms.h linux-2.6.16.33/drivers/xen/xenbus/xenbus_comms.h
80620--- linux-2.6.16.33-noxen/drivers/xen/xenbus/xenbus_comms.h 1970-01-01 00:00:00.000000000 +0000
80621+++ linux-2.6.16.33/drivers/xen/xenbus/xenbus_comms.h 2007-01-08 15:00:45.000000000 +0000
80622@@ -0,0 +1,44 @@
80623+/*
80624+ * Private include for xenbus communications.
80625+ *
80626+ * Copyright (C) 2005 Rusty Russell, IBM Corporation
80627+ *
80628+ * This program is free software; you can redistribute it and/or
80629+ * modify it under the terms of the GNU General Public License version 2
80630+ * as published by the Free Software Foundation; or, when distributed
80631+ * separately from the Linux kernel or incorporated into other
80632+ * software packages, subject to the following license:
80633+ *
80634+ * Permission is hereby granted, free of charge, to any person obtaining a copy
80635+ * of this source file (the "Software"), to deal in the Software without
80636+ * restriction, including without limitation the rights to use, copy, modify,
80637+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
80638+ * and to permit persons to whom the Software is furnished to do so, subject to
80639+ * the following conditions:
80640+ *
80641+ * The above copyright notice and this permission notice shall be included in
80642+ * all copies or substantial portions of the Software.
80643+ *
80644+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
80645+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
80646+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
80647+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
80648+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
80649+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
80650+ * IN THE SOFTWARE.
80651+ */
80652+
80653+#ifndef _XENBUS_COMMS_H
80654+#define _XENBUS_COMMS_H
80655+
80656+int xs_init(void);
80657+int xb_init_comms(void);
80658+
80659+/* Low level routines. */
80660+int xb_write(const void *data, unsigned len);
80661+int xb_read(void *data, unsigned len);
80662+int xs_input_avail(void);
80663+extern struct xenstore_domain_interface *xen_store_interface;
80664+extern int xen_store_evtchn;
80665+
80666+#endif /* _XENBUS_COMMS_H */
80667diff -Nur linux-2.6.16.33-noxen/drivers/xen/xenbus/xenbus_dev.c linux-2.6.16.33/drivers/xen/xenbus/xenbus_dev.c
80668--- linux-2.6.16.33-noxen/drivers/xen/xenbus/xenbus_dev.c 1970-01-01 00:00:00.000000000 +0000
80669+++ linux-2.6.16.33/drivers/xen/xenbus/xenbus_dev.c 2007-01-08 15:00:45.000000000 +0000
80670@@ -0,0 +1,362 @@
80671+/*
80672+ * xenbus_dev.c
80673+ *
80674+ * Driver giving user-space access to the kernel's xenbus connection
80675+ * to xenstore.
80676+ *
80677+ * Copyright (c) 2005, Christian Limpach
80678+ * Copyright (c) 2005, Rusty Russell, IBM Corporation
80679+ *
80680+ * This program is free software; you can redistribute it and/or
80681+ * modify it under the terms of the GNU General Public License version 2
80682+ * as published by the Free Software Foundation; or, when distributed
80683+ * separately from the Linux kernel or incorporated into other
80684+ * software packages, subject to the following license:
80685+ *
80686+ * Permission is hereby granted, free of charge, to any person obtaining a copy
80687+ * of this source file (the "Software"), to deal in the Software without
80688+ * restriction, including without limitation the rights to use, copy, modify,
80689+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
80690+ * and to permit persons to whom the Software is furnished to do so, subject to
80691+ * the following conditions:
80692+ *
80693+ * The above copyright notice and this permission notice shall be included in
80694+ * all copies or substantial portions of the Software.
80695+ *
80696+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
80697+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
80698+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
80699+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
80700+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
80701+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
80702+ * IN THE SOFTWARE.
80703+ */
80704+
80705+#include <linux/config.h>
80706+#include <linux/kernel.h>
80707+#include <linux/errno.h>
80708+#include <linux/uio.h>
80709+#include <linux/notifier.h>
80710+#include <linux/wait.h>
80711+#include <linux/fs.h>
80712+#include <linux/poll.h>
80713+#include <linux/mutex.h>
80714+
80715+#include "xenbus_comms.h"
80716+
80717+#include <asm/uaccess.h>
80718+#include <asm/hypervisor.h>
80719+#include <xen/xenbus.h>
80720+#include <xen/xen_proc.h>
80721+#include <asm/hypervisor.h>
80722+
80723+#ifdef HAVE_XEN_PLATFORM_COMPAT_H
80724+#include <xen/platform-compat.h>
80725+#endif
80726+
80727+struct xenbus_dev_transaction {
80728+ struct list_head list;
80729+ struct xenbus_transaction handle;
80730+};
80731+
80732+struct xenbus_dev_data {
80733+ /* In-progress transaction. */
80734+ struct list_head transactions;
80735+
80736+ /* Active watches. */
80737+ struct list_head watches;
80738+
80739+ /* Partial request. */
80740+ unsigned int len;
80741+ union {
80742+ struct xsd_sockmsg msg;
80743+ char buffer[PAGE_SIZE];
80744+ } u;
80745+
80746+ /* Response queue. */
80747+#define MASK_READ_IDX(idx) ((idx)&(PAGE_SIZE-1))
80748+ char read_buffer[PAGE_SIZE];
80749+ unsigned int read_cons, read_prod;
80750+ wait_queue_head_t read_waitq;
80751+
80752+ struct mutex reply_mutex;
80753+};
80754+
80755+static struct proc_dir_entry *xenbus_dev_intf;
80756+
80757+static ssize_t xenbus_dev_read(struct file *filp,
80758+ char __user *ubuf,
80759+ size_t len, loff_t *ppos)
80760+{
80761+ struct xenbus_dev_data *u = filp->private_data;
80762+ int i;
80763+
80764+ if (wait_event_interruptible(u->read_waitq,
80765+ u->read_prod != u->read_cons))
80766+ return -EINTR;
80767+
80768+ for (i = 0; i < len; i++) {
80769+ if (u->read_cons == u->read_prod)
80770+ break;
80771+ put_user(u->read_buffer[MASK_READ_IDX(u->read_cons)], ubuf+i);
80772+ u->read_cons++;
80773+ }
80774+
80775+ return i;
80776+}
80777+
80778+static void queue_reply(struct xenbus_dev_data *u,
80779+ char *data, unsigned int len)
80780+{
80781+ int i;
80782+
80783+ mutex_lock(&u->reply_mutex);
80784+
80785+ for (i = 0; i < len; i++, u->read_prod++)
80786+ u->read_buffer[MASK_READ_IDX(u->read_prod)] = data[i];
80787+
80788+ BUG_ON((u->read_prod - u->read_cons) > sizeof(u->read_buffer));
80789+
80790+ mutex_unlock(&u->reply_mutex);
80791+
80792+ wake_up(&u->read_waitq);
80793+}
80794+
80795+struct watch_adapter
80796+{
80797+ struct list_head list;
80798+ struct xenbus_watch watch;
80799+ struct xenbus_dev_data *dev_data;
80800+ char *token;
80801+};
80802+
80803+static void free_watch_adapter (struct watch_adapter *watch)
80804+{
80805+ kfree(watch->watch.node);
80806+ kfree(watch->token);
80807+ kfree(watch);
80808+}
80809+
80810+static void watch_fired(struct xenbus_watch *watch,
80811+ const char **vec,
80812+ unsigned int len)
80813+{
80814+ struct watch_adapter *adap =
80815+ container_of(watch, struct watch_adapter, watch);
80816+ struct xsd_sockmsg hdr;
80817+ const char *path, *token;
80818+ int path_len, tok_len, body_len;
80819+
80820+ path = vec[XS_WATCH_PATH];
80821+ token = adap->token;
80822+
80823+ path_len = strlen(path) + 1;
80824+ tok_len = strlen(token) + 1;
80825+ body_len = path_len + tok_len;
80826+
80827+ hdr.type = XS_WATCH_EVENT;
80828+ hdr.len = body_len;
80829+
80830+ queue_reply(adap->dev_data, (char *)&hdr, sizeof(hdr));
80831+ queue_reply(adap->dev_data, (char *)path, path_len);
80832+ queue_reply(adap->dev_data, (char *)token, tok_len);
80833+}
80834+
80835+static LIST_HEAD(watch_list);
80836+
80837+static ssize_t xenbus_dev_write(struct file *filp,
80838+ const char __user *ubuf,
80839+ size_t len, loff_t *ppos)
80840+{
80841+ struct xenbus_dev_data *u = filp->private_data;
80842+ struct xenbus_dev_transaction *trans = NULL;
80843+ uint32_t msg_type;
80844+ void *reply;
80845+ char *path, *token;
80846+ struct watch_adapter *watch, *tmp_watch;
80847+ int err;
80848+
80849+ if ((len + u->len) > sizeof(u->u.buffer))
80850+ return -EINVAL;
80851+
80852+ if (copy_from_user(u->u.buffer + u->len, ubuf, len) != 0)
80853+ return -EFAULT;
80854+
80855+ u->len += len;
80856+ if (u->len < (sizeof(u->u.msg) + u->u.msg.len))
80857+ return len;
80858+
80859+ msg_type = u->u.msg.type;
80860+
80861+ switch (msg_type) {
80862+ case XS_TRANSACTION_START:
80863+ case XS_TRANSACTION_END:
80864+ case XS_DIRECTORY:
80865+ case XS_READ:
80866+ case XS_GET_PERMS:
80867+ case XS_RELEASE:
80868+ case XS_GET_DOMAIN_PATH:
80869+ case XS_WRITE:
80870+ case XS_MKDIR:
80871+ case XS_RM:
80872+ case XS_SET_PERMS:
80873+ if (msg_type == XS_TRANSACTION_START) {
80874+ trans = kmalloc(sizeof(*trans), GFP_KERNEL);
80875+ if (!trans)
80876+ return -ENOMEM;
80877+ }
80878+
80879+ reply = xenbus_dev_request_and_reply(&u->u.msg);
80880+ if (IS_ERR(reply)) {
80881+ kfree(trans);
80882+ return PTR_ERR(reply);
80883+ }
80884+
80885+ if (msg_type == XS_TRANSACTION_START) {
80886+ trans->handle.id = simple_strtoul(reply, NULL, 0);
80887+ list_add(&trans->list, &u->transactions);
80888+ } else if (msg_type == XS_TRANSACTION_END) {
80889+ list_for_each_entry(trans, &u->transactions, list)
80890+ if (trans->handle.id == u->u.msg.tx_id)
80891+ break;
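+ /* If no transaction matched, trans points at the list head itself. */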
80892+ BUG_ON(&trans->list == &u->transactions);
80893+ list_del(&trans->list);
80894+ kfree(trans);
80895+ }
80896+ queue_reply(u, (char *)&u->u.msg, sizeof(u->u.msg));
80897+ queue_reply(u, (char *)reply, u->u.msg.len);
80898+ kfree(reply);
80899+ break;
80900+
80901+ case XS_WATCH:
80902+ case XS_UNWATCH:
80903+ path = u->u.buffer + sizeof(u->u.msg);
80904+ token = memchr(path, 0, u->u.msg.len);
80905+ if (token == NULL)
80906+ return -EILSEQ;
80907+ token++;
80908+
80909+ if (msg_type == XS_WATCH) {
80910+ static const char * XS_WATCH_RESP = "OK";
80911+ struct xsd_sockmsg hdr;
80912+
80913+ watch = kmalloc(sizeof(*watch), GFP_KERNEL);
80914+ watch->watch.node = kmalloc(strlen(path)+1,
80915+ GFP_KERNEL);
80916+ strcpy((char *)watch->watch.node, path);
80917+ watch->watch.callback = watch_fired;
80918+ watch->token = kmalloc(strlen(token)+1, GFP_KERNEL);
80919+ strcpy(watch->token, token);
80920+ watch->dev_data = u;
80921+
80922+ err = register_xenbus_watch(&watch->watch);
80923+ if (err) {
80924+ free_watch_adapter(watch);
80925+ return err;
80926+ }
80927+
80928+ list_add(&watch->list, &u->watches);
80929+
80930+ hdr.type = XS_WATCH;
80931+ hdr.len = strlen(XS_WATCH_RESP) + 1;
80932+ queue_reply(u, (char *)&hdr, sizeof(hdr));
80933+ queue_reply(u, (char *)XS_WATCH_RESP, hdr.len);
80934+ } else {
80935+ list_for_each_entry_safe(watch, tmp_watch,
80936+ &u->watches, list) {
80937+ if (!strcmp(watch->token, token) &&
80938+ !strcmp(watch->watch.node, path))
80939+ {
80940+ /* Found the watch being unwatched: unregister and free it. */
80941+ unregister_xenbus_watch(&watch->watch);
80942+ list_del(&watch->list);
80943+ free_watch_adapter(watch);
80944+ break;
80945+ }
80946+ }
80947+ }
80948+
80949+ break;
80950+
80951+ default:
80952+ return -EINVAL;
80953+ }
80954+
80955+ u->len = 0;
80956+ return len;
80957+}
80958+
80959+static int xenbus_dev_open(struct inode *inode, struct file *filp)
80960+{
80961+ struct xenbus_dev_data *u;
80962+
80963+ if (xen_store_evtchn == 0)
80964+ return -ENOENT;
80965+
80966+ nonseekable_open(inode, filp);
80967+
80968+ u = kzalloc(sizeof(*u), GFP_KERNEL);
80969+ if (u == NULL)
80970+ return -ENOMEM;
80971+
80972+ INIT_LIST_HEAD(&u->transactions);
80973+ INIT_LIST_HEAD(&u->watches);
80974+ init_waitqueue_head(&u->read_waitq);
80975+
80976+ mutex_init(&u->reply_mutex);
80977+
80978+ filp->private_data = u;
80979+
80980+ return 0;
80981+}
80982+
80983+static int xenbus_dev_release(struct inode *inode, struct file *filp)
80984+{
80985+ struct xenbus_dev_data *u = filp->private_data;
80986+ struct xenbus_dev_transaction *trans, *tmp;
80987+ struct watch_adapter *watch, *tmp_watch;
80988+
80989+ list_for_each_entry_safe(trans, tmp, &u->transactions, list) {
80990+ xenbus_transaction_end(trans->handle, 1);
80991+ list_del(&trans->list);
80992+ kfree(trans);
80993+ }
80994+
80995+ list_for_each_entry_safe(watch, tmp_watch, &u->watches, list) {
80996+ unregister_xenbus_watch(&watch->watch);
80997+ list_del(&watch->list);
80998+ free_watch_adapter(watch);
80999+ }
81000+
81001+ kfree(u);
81002+
81003+ return 0;
81004+}
81005+
81006+static unsigned int xenbus_dev_poll(struct file *file, poll_table *wait)
81007+{
81008+ struct xenbus_dev_data *u = file->private_data;
81009+
81010+ poll_wait(file, &u->read_waitq, wait);
81011+ if (u->read_cons != u->read_prod)
81012+ return POLLIN | POLLRDNORM;
81013+ return 0;
81014+}
81015+
81016+static struct file_operations xenbus_dev_file_ops = {
81017+ .read = xenbus_dev_read,
81018+ .write = xenbus_dev_write,
81019+ .open = xenbus_dev_open,
81020+ .release = xenbus_dev_release,
81021+ .poll = xenbus_dev_poll,
81022+};
81023+
81024+int __init
81025+xenbus_dev_init(void)
81026+{
81027+ xenbus_dev_intf = create_xen_proc_entry("xenbus", 0400);
81028+ if (xenbus_dev_intf)
81029+ xenbus_dev_intf->proc_fops = &xenbus_dev_file_ops;
81030+
81031+ return 0;
81032+}
81033diff -Nur linux-2.6.16.33-noxen/drivers/xen/xenbus/xenbus_probe.c linux-2.6.16.33/drivers/xen/xenbus/xenbus_probe.c
81034--- linux-2.6.16.33-noxen/drivers/xen/xenbus/xenbus_probe.c 1970-01-01 00:00:00.000000000 +0000
81035+++ linux-2.6.16.33/drivers/xen/xenbus/xenbus_probe.c 2007-01-08 15:00:45.000000000 +0000
81036@@ -0,0 +1,1017 @@
81037+/******************************************************************************
81038+ * Talks to Xen Store to figure out what devices we have.
81039+ *
81040+ * Copyright (C) 2005 Rusty Russell, IBM Corporation
81041+ * Copyright (C) 2005 Mike Wray, Hewlett-Packard
81042+ * Copyright (C) 2005, 2006 XenSource Ltd
81043+ *
81044+ * This program is free software; you can redistribute it and/or
81045+ * modify it under the terms of the GNU General Public License version 2
81046+ * as published by the Free Software Foundation; or, when distributed
81047+ * separately from the Linux kernel or incorporated into other
81048+ * software packages, subject to the following license:
81049+ *
81050+ * Permission is hereby granted, free of charge, to any person obtaining a copy
81051+ * of this source file (the "Software"), to deal in the Software without
81052+ * restriction, including without limitation the rights to use, copy, modify,
81053+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
81054+ * and to permit persons to whom the Software is furnished to do so, subject to
81055+ * the following conditions:
81056+ *
81057+ * The above copyright notice and this permission notice shall be included in
81058+ * all copies or substantial portions of the Software.
81059+ *
81060+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
81061+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
81062+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
81063+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
81064+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
81065+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
81066+ * IN THE SOFTWARE.
81067+ */
81068+
81069+#define DPRINTK(fmt, args...) \
81070+ pr_debug("xenbus_probe (%s:%d) " fmt ".\n", \
81071+ __FUNCTION__, __LINE__, ##args)
81072+
81073+#include <linux/kernel.h>
81074+#include <linux/err.h>
81075+#include <linux/string.h>
81076+#include <linux/ctype.h>
81077+#include <linux/fcntl.h>
81078+#include <linux/mm.h>
81079+#include <linux/notifier.h>
81080+#include <linux/kthread.h>
81081+#include <linux/mutex.h>
81082+
81083+#include <asm/io.h>
81084+#include <asm/page.h>
81085+#include <asm/maddr.h>
81086+#include <asm/pgtable.h>
81087+#include <asm/hypervisor.h>
81088+#include <xen/xenbus.h>
81089+#include <xen/xen_proc.h>
81090+#include <xen/evtchn.h>
81091+#include <xen/features.h>
81092+#include <xen/hvm.h>
81093+
81094+#include "xenbus_comms.h"
81095+#include "xenbus_probe.h"
81096+
81097+#ifdef HAVE_XEN_PLATFORM_COMPAT_H
81098+#include <xen/platform-compat.h>
81099+#endif
81100+
81101+int xen_store_evtchn;
81102+struct xenstore_domain_interface *xen_store_interface;
81103+static unsigned long xen_store_mfn;
81104+
81105+extern struct mutex xenwatch_mutex;
81106+
81107+static struct notifier_block *xenstore_chain;
81108+
81109+static void wait_for_devices(struct xenbus_driver *xendrv);
81110+
81111+static int xenbus_probe_frontend(const char *type, const char *name);
81112+
81113+static void xenbus_dev_shutdown(struct device *_dev);
81114+
81115+/* If something in array of ids matches this device, return it. */
81116+static const struct xenbus_device_id *
81117+match_device(const struct xenbus_device_id *arr, struct xenbus_device *dev)
81118+{
81119+ for (; *arr->devicetype != '\0'; arr++) {
81120+ if (!strcmp(arr->devicetype, dev->devicetype))
81121+ return arr;
81122+ }
81123+ return NULL;
81124+}
81125+
81126+int xenbus_match(struct device *_dev, struct device_driver *_drv)
81127+{
81128+ struct xenbus_driver *drv = to_xenbus_driver(_drv);
81129+
81130+ if (!drv->ids)
81131+ return 0;
81132+
81133+ return match_device(drv->ids, to_xenbus_device(_dev)) != NULL;
81134+}
81135+
81136+/* device/<type>/<id> => <type>-<id> */
81137+static int frontend_bus_id(char bus_id[BUS_ID_SIZE], const char *nodename)
81138+{
81139+ nodename = strchr(nodename, '/');
81140+ if (!nodename || strlen(nodename + 1) >= BUS_ID_SIZE) {
81141+ printk(KERN_WARNING "XENBUS: bad frontend %s\n", nodename);
81142+ return -EINVAL;
81143+ }
81144+
81145+ strlcpy(bus_id, nodename + 1, BUS_ID_SIZE);
81146+ if (!strchr(bus_id, '/')) {
81147+ printk(KERN_WARNING "XENBUS: bus_id %s no slash\n", bus_id);
81148+ return -EINVAL;
81149+ }
81150+ *strchr(bus_id, '/') = '-';
81151+ return 0;
81152+}
81153+
81154+
81155+static void free_otherend_details(struct xenbus_device *dev)
81156+{
81157+ kfree(dev->otherend);
81158+ dev->otherend = NULL;
81159+}
81160+
81161+
81162+static void free_otherend_watch(struct xenbus_device *dev)
81163+{
81164+ if (dev->otherend_watch.node) {
81165+ unregister_xenbus_watch(&dev->otherend_watch);
81166+ kfree(dev->otherend_watch.node);
81167+ dev->otherend_watch.node = NULL;
81168+ }
81169+}
81170+
81171+
81172+int read_otherend_details(struct xenbus_device *xendev,
81173+ char *id_node, char *path_node)
81174+{
81175+ int err = xenbus_gather(XBT_NIL, xendev->nodename,
81176+ id_node, "%i", &xendev->otherend_id,
81177+ path_node, NULL, &xendev->otherend,
81178+ NULL);
81179+ if (err) {
81180+ xenbus_dev_fatal(xendev, err,
81181+ "reading other end details from %s",
81182+ xendev->nodename);
81183+ return err;
81184+ }
81185+ if (strlen(xendev->otherend) == 0 ||
81186+ !xenbus_exists(XBT_NIL, xendev->otherend, "")) {
81187+ xenbus_dev_fatal(xendev, -ENOENT,
81188+ "unable to read other end from %s. "
81189+ "missing or inaccessible.",
81190+ xendev->nodename);
81191+ free_otherend_details(xendev);
81192+ return -ENOENT;
81193+ }
81194+
81195+ return 0;
81196+}
81197+
81198+
81199+static int read_backend_details(struct xenbus_device *xendev)
81200+{
81201+ return read_otherend_details(xendev, "backend-id", "backend");
81202+}
81203+
81204+
81205+/* Bus type for frontend drivers. */
81206+static struct xen_bus_type xenbus_frontend = {
81207+ .root = "device",
81208+ .levels = 2, /* device/type/<id> */
81209+ .get_bus_id = frontend_bus_id,
81210+ .probe = xenbus_probe_frontend,
81211+ .bus = {
81212+ .name = "xen",
81213+ .match = xenbus_match,
81214+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,16)
81215+ .probe = xenbus_dev_probe,
81216+ .remove = xenbus_dev_remove,
81217+ .shutdown = xenbus_dev_shutdown,
81218+#endif
81219+ },
81220+ .dev = {
81221+ .bus_id = "xen",
81222+ },
81223+};
81224+
81225+static void otherend_changed(struct xenbus_watch *watch,
81226+ const char **vec, unsigned int len)
81227+{
81228+ struct xenbus_device *dev =
81229+ container_of(watch, struct xenbus_device, otherend_watch);
81230+ struct xenbus_driver *drv = to_xenbus_driver(dev->dev.driver);
81231+ enum xenbus_state state;
81232+
81233+ /* Protect us against watches firing on old details when the otherend
81234+ details change, say immediately after a resume. */
81235+ if (!dev->otherend ||
81236+ strncmp(dev->otherend, vec[XS_WATCH_PATH],
81237+ strlen(dev->otherend))) {
81238+ DPRINTK("Ignoring watch at %s", vec[XS_WATCH_PATH]);
81239+ return;
81240+ }
81241+
81242+ state = xenbus_read_driver_state(dev->otherend);
81243+
81244+ DPRINTK("state is %d (%s), %s, %s", state, xenbus_strstate(state),
81245+ dev->otherend_watch.node, vec[XS_WATCH_PATH]);
81246+
81247+ /*
81248+ * Ignore xenbus transitions during shutdown. This prevents us from
81249+ * doing work that can fail, e.g. when the rootfs is gone.
81250+ */
81251+ if (system_state > SYSTEM_RUNNING) {
81252+ struct xen_bus_type *bus =
81253+ container_of(dev->dev.bus, struct xen_bus_type, bus);
81254+ /* If we're frontend, drive the state machine to Closed. */
81255+ /* This should cause the backend to release our resources. */
81256+ if ((bus == &xenbus_frontend) && (state == XenbusStateClosing))
81257+ xenbus_frontend_closed(dev);
81258+ return;
81259+ }
81260+
81261+ if (drv->otherend_changed)
81262+ drv->otherend_changed(dev, state);
81263+}
81264+
81265+
81266+static int talk_to_otherend(struct xenbus_device *dev)
81267+{
81268+ struct xenbus_driver *drv = to_xenbus_driver(dev->dev.driver);
81269+
81270+ free_otherend_watch(dev);
81271+ free_otherend_details(dev);
81272+
81273+ return drv->read_otherend_details(dev);
81274+}
81275+
81276+
81277+static int watch_otherend(struct xenbus_device *dev)
81278+{
81279+ return xenbus_watch_path2(dev, dev->otherend, "state",
81280+ &dev->otherend_watch, otherend_changed);
81281+}
81282+
81283+
81284+int xenbus_dev_probe(struct device *_dev)
81285+{
81286+ struct xenbus_device *dev = to_xenbus_device(_dev);
81287+ struct xenbus_driver *drv = to_xenbus_driver(_dev->driver);
81288+ const struct xenbus_device_id *id;
81289+ int err;
81290+
81291+ DPRINTK("%s", dev->nodename);
81292+
81293+ if (!drv->probe) {
81294+ err = -ENODEV;
81295+ goto fail;
81296+ }
81297+
81298+ id = match_device(drv->ids, dev);
81299+ if (!id) {
81300+ err = -ENODEV;
81301+ goto fail;
81302+ }
81303+
81304+ err = talk_to_otherend(dev);
81305+ if (err) {
81306+ printk(KERN_WARNING
81307+ "xenbus_probe: talk_to_otherend on %s failed.\n",
81308+ dev->nodename);
81309+ return err;
81310+ }
81311+
81312+ err = drv->probe(dev, id);
81313+ if (err)
81314+ goto fail;
81315+
81316+ err = watch_otherend(dev);
81317+ if (err) {
81318+ printk(KERN_WARNING
81319+ "xenbus_probe: watch_otherend on %s failed.\n",
81320+ dev->nodename);
81321+ return err;
81322+ }
81323+
81324+ return 0;
81325+fail:
81326+ xenbus_dev_error(dev, err, "xenbus_dev_probe on %s", dev->nodename);
81327+ xenbus_switch_state(dev, XenbusStateClosed);
81328+ return -ENODEV;
81329+}
81330+
81331+int xenbus_dev_remove(struct device *_dev)
81332+{
81333+ struct xenbus_device *dev = to_xenbus_device(_dev);
81334+ struct xenbus_driver *drv = to_xenbus_driver(_dev->driver);
81335+
81336+ DPRINTK("%s", dev->nodename);
81337+
81338+ free_otherend_watch(dev);
81339+ free_otherend_details(dev);
81340+
81341+ if (drv->remove)
81342+ drv->remove(dev);
81343+
81344+ xenbus_switch_state(dev, XenbusStateClosed);
81345+ return 0;
81346+}
81347+
81348+static void xenbus_dev_shutdown(struct device *_dev)
81349+{
81350+ struct xenbus_device *dev = to_xenbus_device(_dev);
81351+ unsigned long timeout = 5*HZ;
81352+
81353+ DPRINTK("%s", dev->nodename);
81354+
81355+ get_device(&dev->dev);
81356+ if (dev->state != XenbusStateConnected) {
81357+ printk("%s: %s: %s != Connected, skipping\n", __FUNCTION__,
81358+ dev->nodename, xenbus_strstate(dev->state));
81359+ goto out;
81360+ }
81361+ xenbus_switch_state(dev, XenbusStateClosing);
81362+ timeout = wait_for_completion_timeout(&dev->down, timeout);
81363+ if (!timeout)
81364+ printk("%s: %s timeout closing device\n", __FUNCTION__, dev->nodename);
81365+ out:
81366+ put_device(&dev->dev);
81367+}
81368+
81369+int xenbus_register_driver_common(struct xenbus_driver *drv,
81370+ struct xen_bus_type *bus)
81371+{
81372+ int ret;
81373+
81374+ drv->driver.name = drv->name;
81375+ drv->driver.bus = &bus->bus;
81376+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,10)
81377+ drv->driver.owner = drv->owner;
81378+#endif
81379+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,16)
81380+ drv->driver.probe = xenbus_dev_probe;
81381+ drv->driver.remove = xenbus_dev_remove;
81382+ drv->driver.shutdown = xenbus_dev_shutdown;
81383+#endif
81384+
81385+ mutex_lock(&xenwatch_mutex);
81386+ ret = driver_register(&drv->driver);
81387+ mutex_unlock(&xenwatch_mutex);
81388+ return ret;
81389+}
81390+
81391+int xenbus_register_frontend(struct xenbus_driver *drv)
81392+{
81393+ int ret;
81394+
81395+ drv->read_otherend_details = read_backend_details;
81396+
81397+ ret = xenbus_register_driver_common(drv, &xenbus_frontend);
81398+ if (ret)
81399+ return ret;
81400+
81401+ /* If this driver is loaded as a module wait for devices to attach. */
81402+ wait_for_devices(drv);
81403+
81404+ return 0;
81405+}
81406+EXPORT_SYMBOL_GPL(xenbus_register_frontend);
81407+
81408+void xenbus_unregister_driver(struct xenbus_driver *drv)
81409+{
81410+ driver_unregister(&drv->driver);
81411+}
81412+EXPORT_SYMBOL_GPL(xenbus_unregister_driver);
81413+
81414+struct xb_find_info
81415+{
81416+ struct xenbus_device *dev;
81417+ const char *nodename;
81418+};
81419+
81420+static int cmp_dev(struct device *dev, void *data)
81421+{
81422+ struct xenbus_device *xendev = to_xenbus_device(dev);
81423+ struct xb_find_info *info = data;
81424+
81425+ if (!strcmp(xendev->nodename, info->nodename)) {
81426+ info->dev = xendev;
81427+ get_device(dev);
81428+ return 1;
81429+ }
81430+ return 0;
81431+}
81432+
81433+struct xenbus_device *xenbus_device_find(const char *nodename,
81434+ struct bus_type *bus)
81435+{
81436+ struct xb_find_info info = { .dev = NULL, .nodename = nodename };
81437+
81438+ bus_for_each_dev(bus, NULL, &info, cmp_dev);
81439+ return info.dev;
81440+}
81441+
81442+static int cleanup_dev(struct device *dev, void *data)
81443+{
81444+ struct xenbus_device *xendev = to_xenbus_device(dev);
81445+ struct xb_find_info *info = data;
81446+ int len = strlen(info->nodename);
81447+
81448+ DPRINTK("%s", info->nodename);
81449+
81450+ /* Match the info->nodename path, or any subdirectory of that path. */
81451+ if (strncmp(xendev->nodename, info->nodename, len))
81452+ return 0;
81453+
81454+ /* If the node name is longer, ensure it really is a subdirectory. */
81455+ if ((strlen(xendev->nodename) > len) && (xendev->nodename[len] != '/'))
81456+ return 0;
81457+
81458+ info->dev = xendev;
81459+ get_device(dev);
81460+ return 1;
81461+}
81462+
81463+static void xenbus_cleanup_devices(const char *path, struct bus_type *bus)
81464+{
81465+ struct xb_find_info info = { .nodename = path };
81466+
81467+ do {
81468+ info.dev = NULL;
81469+ bus_for_each_dev(bus, NULL, &info, cleanup_dev);
81470+ if (info.dev) {
81471+ device_unregister(&info.dev->dev);
81472+ put_device(&info.dev->dev);
81473+ }
81474+ } while (info.dev);
81475+}
81476+
81477+static void xenbus_dev_release(struct device *dev)
81478+{
81479+ if (dev)
81480+ kfree(to_xenbus_device(dev));
81481+}
81482+
81483+static ssize_t xendev_show_nodename(struct device *dev,
81484+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,13)
81485+ struct device_attribute *attr,
81486+#endif
81487+ char *buf)
81488+{
81489+ return sprintf(buf, "%s\n", to_xenbus_device(dev)->nodename);
81490+}
81491+DEVICE_ATTR(nodename, S_IRUSR | S_IRGRP | S_IROTH, xendev_show_nodename, NULL);
81492+
81493+static ssize_t xendev_show_devtype(struct device *dev,
81494+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,13)
81495+ struct device_attribute *attr,
81496+#endif
81497+ char *buf)
81498+{
81499+ return sprintf(buf, "%s\n", to_xenbus_device(dev)->devicetype);
81500+}
81501+DEVICE_ATTR(devtype, S_IRUSR | S_IRGRP | S_IROTH, xendev_show_devtype, NULL);
81502+
81503+
81504+int xenbus_probe_node(struct xen_bus_type *bus,
81505+ const char *type,
81506+ const char *nodename)
81507+{
81508+ int err;
81509+ struct xenbus_device *xendev;
81510+ size_t stringlen;
81511+ char *tmpstring;
81512+
81513+ enum xenbus_state state = xenbus_read_driver_state(nodename);
81514+
81515+ if (state != XenbusStateInitialising) {
81516+ /* Device is not new, so ignore it. This can happen if a
81517+ device is going away after switching to Closed. */
81518+ return 0;
81519+ }
81520+
81521+ stringlen = strlen(nodename) + 1 + strlen(type) + 1;
81522+ xendev = kzalloc(sizeof(*xendev) + stringlen, GFP_KERNEL);
81523+ if (!xendev)
81524+ return -ENOMEM;
81525+
81526+ xendev->state = XenbusStateInitialising;
81527+
81528+ /* Copy the strings into the extra space. */
81529+
81530+ tmpstring = (char *)(xendev + 1);
81531+ strcpy(tmpstring, nodename);
81532+ xendev->nodename = tmpstring;
81533+
81534+ tmpstring += strlen(tmpstring) + 1;
81535+ strcpy(tmpstring, type);
81536+ xendev->devicetype = tmpstring;
81537+ init_completion(&xendev->down);
81538+
81539+ xendev->dev.parent = &bus->dev;
81540+ xendev->dev.bus = &bus->bus;
81541+ xendev->dev.release = xenbus_dev_release;
81542+
81543+ err = bus->get_bus_id(xendev->dev.bus_id, xendev->nodename);
81544+ if (err)
81545+ goto fail;
81546+
81547+ /* Register with generic device framework. */
81548+ err = device_register(&xendev->dev);
81549+ if (err)
81550+ goto fail;
81551+
81552+ device_create_file(&xendev->dev, &dev_attr_nodename);
81553+ device_create_file(&xendev->dev, &dev_attr_devtype);
81554+
81555+ return 0;
81556+fail:
81557+ kfree(xendev);
81558+ return err;
81559+}
81560+
81561+/* device/<typename>/<name> */
81562+static int xenbus_probe_frontend(const char *type, const char *name)
81563+{
81564+ char *nodename;
81565+ int err;
81566+
81567+ nodename = kasprintf(GFP_KERNEL, "%s/%s/%s", xenbus_frontend.root, type, name);
81568+ if (!nodename)
81569+ return -ENOMEM;
81570+
81571+ DPRINTK("%s", nodename);
81572+
81573+ err = xenbus_probe_node(&xenbus_frontend, type, nodename);
81574+ kfree(nodename);
81575+ return err;
81576+}
81577+
81578+static int xenbus_probe_device_type(struct xen_bus_type *bus, const char *type)
81579+{
81580+ int err = 0;
81581+ char **dir;
81582+ unsigned int dir_n = 0;
81583+ int i;
81584+
81585+ dir = xenbus_directory(XBT_NIL, bus->root, type, &dir_n);
81586+ if (IS_ERR(dir))
81587+ return PTR_ERR(dir);
81588+
81589+ for (i = 0; i < dir_n; i++) {
81590+ err = bus->probe(type, dir[i]);
81591+ if (err)
81592+ break;
81593+ }
81594+ kfree(dir);
81595+ return err;
81596+}
81597+
81598+int xenbus_probe_devices(struct xen_bus_type *bus)
81599+{
81600+ int err = 0;
81601+ char **dir;
81602+ unsigned int i, dir_n;
81603+
81604+ dir = xenbus_directory(XBT_NIL, bus->root, "", &dir_n);
81605+ if (IS_ERR(dir))
81606+ return PTR_ERR(dir);
81607+
81608+ for (i = 0; i < dir_n; i++) {
81609+ err = xenbus_probe_device_type(bus, dir[i]);
81610+ if (err)
81611+ break;
81612+ }
81613+ kfree(dir);
81614+ return err;
81615+}
81616+
81617+static unsigned int char_count(const char *str, char c)
81618+{
81619+ unsigned int i, ret = 0;
81620+
81621+ for (i = 0; str[i]; i++)
81622+ if (str[i] == c)
81623+ ret++;
81624+ return ret;
81625+}
81626+
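+ /* Offset of the (len+1)-th occurrence of c in str; strlen(str) if there are exactly len occurrences; -ERANGE if fewer. */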
81627+static int strsep_len(const char *str, char c, unsigned int len)
81628+{
81629+ unsigned int i;
81630+
81631+ for (i = 0; str[i]; i++)
81632+ if (str[i] == c) {
81633+ if (len == 0)
81634+ return i;
81635+ len--;
81636+ }
81637+ return (len == 0) ? i : -ERANGE;
81638+}
81639+
81640+void dev_changed(const char *node, struct xen_bus_type *bus)
81641+{
81642+ int exists, rootlen;
81643+ struct xenbus_device *dev;
81644+ char type[BUS_ID_SIZE];
81645+ const char *p, *root;
81646+
81647+ if (char_count(node, '/') < 2)
81648+ return;
81649+
81650+ exists = xenbus_exists(XBT_NIL, node, "");
81651+ if (!exists) {
81652+ xenbus_cleanup_devices(node, &bus->bus);
81653+ return;
81654+ }
81655+
81656+ /* backend/<type>/... or device/<type>/... */
81657+ p = strchr(node, '/') + 1;
81658+ snprintf(type, BUS_ID_SIZE, "%.*s", (int)strcspn(p, "/"), p);
81659+ type[BUS_ID_SIZE-1] = '\0';
81660+
81661+ rootlen = strsep_len(node, '/', bus->levels);
81662+ if (rootlen < 0)
81663+ return;
81664+ root = kasprintf(GFP_KERNEL, "%.*s", rootlen, node);
81665+ if (!root)
81666+ return;
81667+
81668+ dev = xenbus_device_find(root, &bus->bus);
81669+ if (!dev)
81670+ xenbus_probe_node(bus, type, root);
81671+ else
81672+ put_device(&dev->dev);
81673+
81674+ kfree(root);
81675+}
81676+
81677+static void frontend_changed(struct xenbus_watch *watch,
81678+ const char **vec, unsigned int len)
81679+{
81680+ DPRINTK("");
81681+
81682+ dev_changed(vec[XS_WATCH_PATH], &xenbus_frontend);
81683+}
81684+
81685+/* We watch for devices appearing and vanishing. */
81686+static struct xenbus_watch fe_watch = {
81687+ .node = "device",
81688+ .callback = frontend_changed,
81689+};
81690+
81691+static int suspend_dev(struct device *dev, void *data)
81692+{
81693+ int err = 0;
81694+ struct xenbus_driver *drv;
81695+ struct xenbus_device *xdev;
81696+
81697+ DPRINTK("");
81698+
81699+ if (dev->driver == NULL)
81700+ return 0;
81701+ drv = to_xenbus_driver(dev->driver);
81702+ xdev = container_of(dev, struct xenbus_device, dev);
81703+ if (drv->suspend)
81704+ err = drv->suspend(xdev);
81705+ if (err)
81706+ printk(KERN_WARNING
81707+ "xenbus: suspend %s failed: %i\n", dev->bus_id, err);
81708+ return 0;
81709+}
81710+
81711+static int resume_dev(struct device *dev, void *data)
81712+{
81713+ int err;
81714+ struct xenbus_driver *drv;
81715+ struct xenbus_device *xdev;
81716+
81717+ DPRINTK("");
81718+
81719+ if (dev->driver == NULL)
81720+ return 0;
81721+
81722+ drv = to_xenbus_driver(dev->driver);
81723+ xdev = container_of(dev, struct xenbus_device, dev);
81724+
81725+ err = talk_to_otherend(xdev);
81726+ if (err) {
81727+ printk(KERN_WARNING
81728+ "xenbus: resume (talk_to_otherend) %s failed: %i\n",
81729+ dev->bus_id, err);
81730+ return err;
81731+ }
81732+
81733+ xdev->state = XenbusStateInitialising;
81734+
81735+ if (drv->resume) {
81736+ err = drv->resume(xdev);
81737+ if (err) {
81738+ printk(KERN_WARNING
81739+ "xenbus: resume %s failed: %i\n",
81740+ dev->bus_id, err);
81741+ return err;
81742+ }
81743+ }
81744+
81745+ err = watch_otherend(xdev);
81746+ if (err) {
81747+ printk(KERN_WARNING
81748+ "xenbus_probe: resume (watch_otherend) %s failed: "
81749+ "%d.\n", dev->bus_id, err);
81750+ return err;
81751+ }
81752+
81753+ return 0;
81754+}
81755+
81756+void xenbus_suspend(void)
81757+{
81758+ DPRINTK("");
81759+
81760+ bus_for_each_dev(&xenbus_frontend.bus, NULL, NULL, suspend_dev);
81761+ xenbus_backend_suspend(suspend_dev);
81762+ xs_suspend();
81763+}
81764+EXPORT_SYMBOL_GPL(xenbus_suspend);
81765+
81766+void xenbus_resume(void)
81767+{
81768+ xb_init_comms();
81769+ xs_resume();
81770+ bus_for_each_dev(&xenbus_frontend.bus, NULL, NULL, resume_dev);
81771+ xenbus_backend_resume(resume_dev);
81772+}
81773+EXPORT_SYMBOL_GPL(xenbus_resume);
81774+
81775+
81776+/* A flag to determine if xenstored is 'ready' (i.e. has started) */
81777+int xenstored_ready = 0;
81778+
81779+
81780+int register_xenstore_notifier(struct notifier_block *nb)
81781+{
81782+ int ret = 0;
81783+
81784+ if (xenstored_ready > 0)
81785+ ret = nb->notifier_call(nb, 0, NULL);
81786+ else
81787+ notifier_chain_register(&xenstore_chain, nb);
81788+
81789+ return ret;
81790+}
81791+EXPORT_SYMBOL_GPL(register_xenstore_notifier);
81792+
81793+void unregister_xenstore_notifier(struct notifier_block *nb)
81794+{
81795+ notifier_chain_unregister(&xenstore_chain, nb);
81796+}
81797+EXPORT_SYMBOL_GPL(unregister_xenstore_notifier);
81798+
81799+
81800+void xenbus_probe(void *unused)
81801+{
81802+ BUG_ON((xenstored_ready <= 0));
81803+
81804+ /* Enumerate devices in xenstore and watch for changes. */
81805+ xenbus_probe_devices(&xenbus_frontend);
81806+ register_xenbus_watch(&fe_watch);
81807+ xenbus_backend_probe_and_watch();
81808+
81809+ /* Notify others that xenstore is up */
81810+ notifier_call_chain(&xenstore_chain, 0, NULL);
81811+}
81812+
81813+
81814+#if defined(CONFIG_PROC_FS) && defined(CONFIG_XEN_PRIVILEGED_GUEST)
81815+static struct file_operations xsd_kva_fops;
81816+static struct proc_dir_entry *xsd_kva_intf;
81817+static struct proc_dir_entry *xsd_port_intf;
81818+
81819+static int xsd_kva_mmap(struct file *file, struct vm_area_struct *vma)
81820+{
81821+ size_t size = vma->vm_end - vma->vm_start;
81822+
81823+ if ((size > PAGE_SIZE) || (vma->vm_pgoff != 0))
81824+ return -EINVAL;
81825+
81826+ if (remap_pfn_range(vma, vma->vm_start, mfn_to_pfn(xen_store_mfn),
81827+ size, vma->vm_page_prot))
81828+ return -EAGAIN;
81829+
81830+ return 0;
81831+}
81832+
81833+static int xsd_kva_read(char *page, char **start, off_t off,
81834+ int count, int *eof, void *data)
81835+{
81836+ int len;
81837+
81838+ len = sprintf(page, "0x%p", xen_store_interface);
81839+ *eof = 1;
81840+ return len;
81841+}
81842+
81843+static int xsd_port_read(char *page, char **start, off_t off,
81844+ int count, int *eof, void *data)
81845+{
81846+ int len;
81847+
81848+ len = sprintf(page, "%d", xen_store_evtchn);
81849+ *eof = 1;
81850+ return len;
81851+}
81852+#endif
81853+
81854+static int __init xenbus_probe_init(void)
81855+{
81856+ int err = 0;
81857+ unsigned long page = 0;
81858+
81859+ DPRINTK("");
81860+
81861+ if (!is_running_on_xen())
81862+ return -ENODEV;
81863+
81864+ /* Register ourselves with the kernel bus subsystem */
81865+ bus_register(&xenbus_frontend.bus);
81866+ xenbus_backend_bus_register();
81867+
81868+ /*
81869+ * Domain0 doesn't have a store_evtchn or store_mfn yet.
81870+ */
81871+ if (is_initial_xendomain()) {
81872+ struct evtchn_alloc_unbound alloc_unbound;
81873+
81874+ /* Allocate page. */
81875+ page = get_zeroed_page(GFP_KERNEL);
81876+ if (!page)
81877+ return -ENOMEM;
81878+
81879+ xen_store_mfn = xen_start_info->store_mfn =
81880+ pfn_to_mfn(virt_to_phys((void *)page) >>
81881+ PAGE_SHIFT);
81882+
81883+ /* Next allocate a local port which xenstored can bind to */
81884+ alloc_unbound.dom = DOMID_SELF;
81885+ alloc_unbound.remote_dom = 0;
81886+
81887+ err = HYPERVISOR_event_channel_op(EVTCHNOP_alloc_unbound,
81888+ &alloc_unbound);
81889+ if (err == -ENOSYS)
81890+ goto err;
81891+ BUG_ON(err);
81892+ xen_store_evtchn = xen_start_info->store_evtchn =
81893+ alloc_unbound.port;
81894+
81895+#if defined(CONFIG_PROC_FS) && defined(CONFIG_XEN_PRIVILEGED_GUEST)
81896+ /* And finally publish the above info in /proc/xen */
81897+ xsd_kva_intf = create_xen_proc_entry("xsd_kva", 0600);
81898+ if (xsd_kva_intf) {
81899+ memcpy(&xsd_kva_fops, xsd_kva_intf->proc_fops,
81900+ sizeof(xsd_kva_fops));
81901+ xsd_kva_fops.mmap = xsd_kva_mmap;
81902+ xsd_kva_intf->proc_fops = &xsd_kva_fops;
81903+ xsd_kva_intf->read_proc = xsd_kva_read;
81904+ }
81905+ xsd_port_intf = create_xen_proc_entry("xsd_port", 0400);
81906+ if (xsd_port_intf)
81907+ xsd_port_intf->read_proc = xsd_port_read;
81908+#endif
81909+ xen_store_interface = mfn_to_virt(xen_store_mfn);
81910+ } else {
81911+ xenstored_ready = 1;
81912+#ifdef CONFIG_XEN
81913+ xen_store_evtchn = xen_start_info->store_evtchn;
81914+ xen_store_mfn = xen_start_info->store_mfn;
81915+ xen_store_interface = mfn_to_virt(xen_store_mfn);
81916+#else
81917+ xen_store_evtchn = hvm_get_parameter(HVM_PARAM_STORE_EVTCHN);
81918+ xen_store_mfn = hvm_get_parameter(HVM_PARAM_STORE_PFN);
81919+ xen_store_interface = ioremap(xen_store_mfn << PAGE_SHIFT,
81920+ PAGE_SIZE);
81921+#endif
81922+ }
81923+
81924+
81925+ xenbus_dev_init();
81926+
81927+ /* Initialize the interface to xenstore. */
81928+ err = xs_init();
81929+ if (err) {
81930+ printk(KERN_WARNING
81931+ "XENBUS: Error initializing xenstore comms: %i\n", err);
81932+ goto err;
81933+ }
81934+
81935+ /* Register ourselves with the kernel device subsystem */
81936+ device_register(&xenbus_frontend.dev);
81937+ xenbus_backend_device_register();
81938+
81939+ if (!is_initial_xendomain())
81940+ xenbus_probe(NULL);
81941+
81942+ return 0;
81943+
81944+ err:
81945+ if (page)
81946+ free_page(page);
81947+
81948+ /*
81949+ * Do not unregister the xenbus front/backend buses here. The buses
81950+ * must exist because front/backend drivers will use them when they are
81951+ * registered.
81952+ */
81953+
81954+ return err;
81955+}
81956+
81957+postcore_initcall(xenbus_probe_init);
81958+
81959+MODULE_LICENSE("Dual BSD/GPL");
81960+
81961+
81962+static int is_disconnected_device(struct device *dev, void *data)
81963+{
81964+ struct xenbus_device *xendev = to_xenbus_device(dev);
81965+ struct device_driver *drv = data;
81966+
81967+ /*
81968+ * A device with no driver will never connect. We care only about
81969+ * devices which should currently be in the process of connecting.
81970+ */
81971+ if (!dev->driver)
81972+ return 0;
81973+
81974+ /* Is this search limited to a particular driver? */
81975+ if (drv && (dev->driver != drv))
81976+ return 0;
81977+
81978+ return (xendev->state != XenbusStateConnected);
81979+}
81980+
81981+static int exists_disconnected_device(struct device_driver *drv)
81982+{
81983+ return bus_for_each_dev(&xenbus_frontend.bus, NULL, drv,
81984+ is_disconnected_device);
81985+}
81986+
81987+static int print_device_status(struct device *dev, void *data)
81988+{
81989+ struct xenbus_device *xendev = to_xenbus_device(dev);
81990+ struct device_driver *drv = data;
81991+
81992+ /* Is this operation limited to a particular driver? */
81993+ if (drv && (dev->driver != drv))
81994+ return 0;
81995+
81996+ if (!dev->driver) {
81997+ /* Information only: is this too noisy? */
81998+ printk(KERN_INFO "XENBUS: Device with no driver: %s\n",
81999+ xendev->nodename);
82000+ } else if (xendev->state != XenbusStateConnected) {
82001+ printk(KERN_WARNING "XENBUS: Timeout connecting "
82002+ "to device: %s (state %d)\n",
82003+ xendev->nodename, xendev->state);
82004+ }
82005+
82006+ return 0;
82007+}
82008+
82009+/* We only wait for device setup after most initcalls have run. */
82010+static int ready_to_wait_for_devices;
82011+
82012+/*
82013+ * On a 10 second timeout, wait for all devices currently configured. We need
82014+ * to do this to guarantee that the filesystems and / or network devices
82015+ * needed for boot are available, before we can allow the boot to proceed.
82016+ *
82017+ * This needs to be on a late_initcall, to happen after the frontend device
82018+ * drivers have been initialised, but before the root fs is mounted.
82019+ *
82020+ * A possible improvement here would be to have the tools add a per-device
82021+ * flag to the store entry, indicating whether it is needed at boot time.
82022+ * This would allow people who knew what they were doing to accelerate their
82023+ * boot slightly, but of course needs tools or manual intervention to set up
82024+ * those flags correctly.
82025+ */
82026+static void wait_for_devices(struct xenbus_driver *xendrv)
82027+{
82028+ unsigned long timeout = jiffies + 10*HZ;
82029+ struct device_driver *drv = xendrv ? &xendrv->driver : NULL;
82030+
82031+ if (!ready_to_wait_for_devices || !is_running_on_xen())
82032+ return;
82033+
82034+ while (exists_disconnected_device(drv)) {
82035+ if (time_after(jiffies, timeout))
82036+ break;
82037+ schedule_timeout_interruptible(HZ/10);
82038+ }
82039+
82040+ bus_for_each_dev(&xenbus_frontend.bus, NULL, drv,
82041+ print_device_status);
82042+}
82043+
82044+#ifndef MODULE
82045+static int __init boot_wait_for_devices(void)
82046+{
82047+ ready_to_wait_for_devices = 1;
82048+ wait_for_devices(NULL);
82049+ return 0;
82050+}
82051+
82052+late_initcall(boot_wait_for_devices);
82053+#endif
82054diff -Nur linux-2.6.16.33-noxen/drivers/xen/xenbus/xenbus_probe.h linux-2.6.16.33/drivers/xen/xenbus/xenbus_probe.h
82055--- linux-2.6.16.33-noxen/drivers/xen/xenbus/xenbus_probe.h 1970-01-01 00:00:00.000000000 +0000
82056+++ linux-2.6.16.33/drivers/xen/xenbus/xenbus_probe.h 2007-01-08 15:00:45.000000000 +0000
82057@@ -0,0 +1,74 @@
82058+/******************************************************************************
82059+ * xenbus_probe.h
82060+ *
82061+ * Talks to Xen Store to figure out what devices we have.
82062+ *
82063+ * Copyright (C) 2005 Rusty Russell, IBM Corporation
82064+ * Copyright (C) 2005 XenSource Ltd.
82065+ *
82066+ * This program is free software; you can redistribute it and/or
82067+ * modify it under the terms of the GNU General Public License version 2
82068+ * as published by the Free Software Foundation; or, when distributed
82069+ * separately from the Linux kernel or incorporated into other
82070+ * software packages, subject to the following license:
82071+ *
82072+ * Permission is hereby granted, free of charge, to any person obtaining a copy
82073+ * of this source file (the "Software"), to deal in the Software without
82074+ * restriction, including without limitation the rights to use, copy, modify,
82075+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
82076+ * and to permit persons to whom the Software is furnished to do so, subject to
82077+ * the following conditions:
82078+ *
82079+ * The above copyright notice and this permission notice shall be included in
82080+ * all copies or substantial portions of the Software.
82081+ *
82082+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
82083+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
82084+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
82085+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
82086+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
82087+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
82088+ * IN THE SOFTWARE.
82089+ */
82090+
82091+#ifndef _XENBUS_PROBE_H
82092+#define _XENBUS_PROBE_H
82093+
82094+#if defined(CONFIG_XEN_BACKEND) || defined(CONFIG_XEN_BACKEND_MODULE)
82095+extern void xenbus_backend_suspend(int (*fn)(struct device *, void *));
82096+extern void xenbus_backend_resume(int (*fn)(struct device *, void *));
82097+extern void xenbus_backend_probe_and_watch(void);
82098+extern void xenbus_backend_bus_register(void);
82099+extern void xenbus_backend_device_register(void);
82100+#else
82101+static inline void xenbus_backend_suspend(int (*fn)(struct device *, void *)) {}
82102+static inline void xenbus_backend_resume(int (*fn)(struct device *, void *)) {}
82103+static inline void xenbus_backend_probe_and_watch(void) {}
82104+static inline void xenbus_backend_bus_register(void) {}
82105+static inline void xenbus_backend_device_register(void) {}
82106+#endif
82107+
82108+struct xen_bus_type
82109+{
82110+ char *root;
82111+ unsigned int levels;
82112+ int (*get_bus_id)(char bus_id[BUS_ID_SIZE], const char *nodename);
82113+ int (*probe)(const char *type, const char *dir);
82114+ struct bus_type bus;
82115+ struct device dev;
82116+};
82117+
82118+extern int xenbus_match(struct device *_dev, struct device_driver *_drv);
82119+extern int xenbus_dev_probe(struct device *_dev);
82120+extern int xenbus_dev_remove(struct device *_dev);
82121+extern int xenbus_register_driver_common(struct xenbus_driver *drv,
82122+ struct xen_bus_type *bus);
82123+extern int xenbus_probe_node(struct xen_bus_type *bus,
82124+ const char *type,
82125+ const char *nodename);
82126+extern int xenbus_probe_devices(struct xen_bus_type *bus);
82127+
82128+extern void dev_changed(const char *node, struct xen_bus_type *bus);
82129+
82130+#endif
82131+
82132diff -Nur linux-2.6.16.33-noxen/drivers/xen/xenbus/xenbus_probe_backend.c linux-2.6.16.33/drivers/xen/xenbus/xenbus_probe_backend.c
82133--- linux-2.6.16.33-noxen/drivers/xen/xenbus/xenbus_probe_backend.c 1970-01-01 00:00:00.000000000 +0000
82134+++ linux-2.6.16.33/drivers/xen/xenbus/xenbus_probe_backend.c 2007-01-08 15:00:45.000000000 +0000
82135@@ -0,0 +1,271 @@
82136+/******************************************************************************
82137+ * Talks to Xen Store to figure out what devices we have (backend half).
82138+ *
82139+ * Copyright (C) 2005 Rusty Russell, IBM Corporation
82140+ * Copyright (C) 2005 Mike Wray, Hewlett-Packard
82141+ * Copyright (C) 2005, 2006 XenSource Ltd
82142+ *
82143+ * This program is free software; you can redistribute it and/or
82144+ * modify it under the terms of the GNU General Public License version 2
82145+ * as published by the Free Software Foundation; or, when distributed
82146+ * separately from the Linux kernel or incorporated into other
82147+ * software packages, subject to the following license:
82148+ *
82149+ * Permission is hereby granted, free of charge, to any person obtaining a copy
82150+ * of this source file (the "Software"), to deal in the Software without
82151+ * restriction, including without limitation the rights to use, copy, modify,
82152+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
82153+ * and to permit persons to whom the Software is furnished to do so, subject to
82154+ * the following conditions:
82155+ *
82156+ * The above copyright notice and this permission notice shall be included in
82157+ * all copies or substantial portions of the Software.
82158+ *
82159+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
82160+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
82161+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
82162+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
82163+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
82164+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
82165+ * IN THE SOFTWARE.
82166+ */
82167+
82168+#define DPRINTK(fmt, args...) \
82169+ pr_debug("xenbus_probe (%s:%d) " fmt ".\n", \
82170+ __FUNCTION__, __LINE__, ##args)
82171+
82172+#include <linux/kernel.h>
82173+#include <linux/err.h>
82174+#include <linux/string.h>
82175+#include <linux/ctype.h>
82176+#include <linux/fcntl.h>
82177+#include <linux/mm.h>
82178+#include <linux/notifier.h>
82179+#include <linux/kthread.h>
82180+
82181+#include <asm/io.h>
82182+#include <asm/page.h>
82183+#include <asm/maddr.h>
82184+#include <asm/pgtable.h>
82185+#include <asm/hypervisor.h>
82186+#include <xen/xenbus.h>
82187+#include <xen/xen_proc.h>
82188+#include <xen/evtchn.h>
82189+#include <xen/features.h>
82190+#include <xen/hvm.h>
82191+
82192+#include "xenbus_comms.h"
82193+#include "xenbus_probe.h"
82194+
82195+#ifdef HAVE_XEN_PLATFORM_COMPAT_H
82196+#include <xen/platform-compat.h>
82197+#endif
82198+
82199+static int xenbus_uevent_backend(struct device *dev, char **envp,
82200+ int num_envp, char *buffer, int buffer_size);
82201+static int xenbus_probe_backend(const char *type, const char *domid);
82202+
82203+extern int read_otherend_details(struct xenbus_device *xendev,
82204+ char *id_node, char *path_node);
82205+
82206+static int read_frontend_details(struct xenbus_device *xendev)
82207+{
82208+ return read_otherend_details(xendev, "frontend-id", "frontend");
82209+}
82210+
82211+/* backend/<type>/<fe-uuid>/<id> => <type>-<fe-domid>-<id> */
82212+static int backend_bus_id(char bus_id[BUS_ID_SIZE], const char *nodename)
82213+{
82214+ int domid, err;
82215+ const char *devid, *type, *frontend;
82216+ unsigned int typelen;
82217+
82218+ type = strchr(nodename, '/');
82219+ if (!type)
82220+ return -EINVAL;
82221+ type++;
82222+ typelen = strcspn(type, "/");
82223+ if (!typelen || type[typelen] != '/')
82224+ return -EINVAL;
82225+
82226+ devid = strrchr(nodename, '/') + 1;
82227+
82228+ err = xenbus_gather(XBT_NIL, nodename, "frontend-id", "%i", &domid,
82229+ "frontend", NULL, &frontend,
82230+ NULL);
82231+ if (err)
82232+ return err;
82233+ if (strlen(frontend) == 0)
82234+ err = -ERANGE;
82235+ if (!err && !xenbus_exists(XBT_NIL, frontend, ""))
82236+ err = -ENOENT;
82237+ kfree(frontend);
82238+
82239+ if (err)
82240+ return err;
82241+
82242+ if (snprintf(bus_id, BUS_ID_SIZE,
82243+ "%.*s-%i-%s", typelen, type, domid, devid) >= BUS_ID_SIZE)
82244+ return -ENOSPC;
82245+ return 0;
82246+}
82247+
82248+static struct xen_bus_type xenbus_backend = {
82249+ .root = "backend",
82250+ .levels = 3, /* backend/type/<frontend>/<id> */
82251+ .get_bus_id = backend_bus_id,
82252+ .probe = xenbus_probe_backend,
82253+ .bus = {
82254+ .name = "xen-backend",
82255+ .match = xenbus_match,
82256+ .probe = xenbus_dev_probe,
82257+ .remove = xenbus_dev_remove,
82258+// .shutdown = xenbus_dev_shutdown,
82259+ .uevent = xenbus_uevent_backend,
82260+ },
82261+ .dev = {
82262+ .bus_id = "xen-backend",
82263+ },
82264+};
82265+
82266+static int xenbus_uevent_backend(struct device *dev, char **envp,
82267+ int num_envp, char *buffer, int buffer_size)
82268+{
82269+ struct xenbus_device *xdev;
82270+ struct xenbus_driver *drv;
82271+ int i = 0;
82272+ int length = 0;
82273+
82274+ DPRINTK("");
82275+
82276+ if (dev == NULL)
82277+ return -ENODEV;
82278+
82279+ xdev = to_xenbus_device(dev);
82280+ if (xdev == NULL)
82281+ return -ENODEV;
82282+
82283+ /* stuff we want to pass to /sbin/hotplug */
82284+ add_uevent_var(envp, num_envp, &i, buffer, buffer_size, &length,
82285+ "XENBUS_TYPE=%s", xdev->devicetype);
82286+
82287+ add_uevent_var(envp, num_envp, &i, buffer, buffer_size, &length,
82288+ "XENBUS_PATH=%s", xdev->nodename);
82289+
82290+ add_uevent_var(envp, num_envp, &i, buffer, buffer_size, &length,
82291+ "XENBUS_BASE_PATH=%s", xenbus_backend.root);
82292+
82293+ /* terminate, set to next free slot, shrink available space */
82294+ envp[i] = NULL;
82295+ envp = &envp[i];
82296+ num_envp -= i;
82297+ buffer = &buffer[length];
82298+ buffer_size -= length;
82299+
82300+ if (dev->driver) {
82301+ drv = to_xenbus_driver(dev->driver);
82302+ if (drv && drv->uevent)
82303+ return drv->uevent(xdev, envp, num_envp, buffer,
82304+ buffer_size);
82305+ }
82306+
82307+ return 0;
82308+}
82309+
82310+int xenbus_register_backend(struct xenbus_driver *drv)
82311+{
82312+ drv->read_otherend_details = read_frontend_details;
82313+
82314+ return xenbus_register_driver_common(drv, &xenbus_backend);
82315+}
82316+EXPORT_SYMBOL_GPL(xenbus_register_backend);
82317+
82318+/* backend/<typename>/<frontend-uuid>/<name> */
82319+static int xenbus_probe_backend_unit(const char *dir,
82320+ const char *type,
82321+ const char *name)
82322+{
82323+ char *nodename;
82324+ int err;
82325+
82326+ nodename = kasprintf(GFP_KERNEL, "%s/%s", dir, name);
82327+ if (!nodename)
82328+ return -ENOMEM;
82329+
82330+ DPRINTK("%s\n", nodename);
82331+
82332+ err = xenbus_probe_node(&xenbus_backend, type, nodename);
82333+ kfree(nodename);
82334+ return err;
82335+}
82336+
82337+/* backend/<typename>/<frontend-domid> */
82338+static int xenbus_probe_backend(const char *type, const char *domid)
82339+{
82340+ char *nodename;
82341+ int err = 0;
82342+ char **dir;
82343+ unsigned int i, dir_n = 0;
82344+
82345+ DPRINTK("");
82346+
82347+ nodename = kasprintf(GFP_KERNEL, "%s/%s/%s", xenbus_backend.root, type, domid);
82348+ if (!nodename)
82349+ return -ENOMEM;
82350+
82351+ dir = xenbus_directory(XBT_NIL, nodename, "", &dir_n);
82352+ if (IS_ERR(dir)) {
82353+ kfree(nodename);
82354+ return PTR_ERR(dir);
82355+ }
82356+
82357+ for (i = 0; i < dir_n; i++) {
82358+ err = xenbus_probe_backend_unit(nodename, type, dir[i]);
82359+ if (err)
82360+ break;
82361+ }
82362+ kfree(dir);
82363+ kfree(nodename);
82364+ return err;
82365+}
82366+
82367+static void backend_changed(struct xenbus_watch *watch,
82368+ const char **vec, unsigned int len)
82369+{
82370+ DPRINTK("");
82371+
82372+ dev_changed(vec[XS_WATCH_PATH], &xenbus_backend);
82373+}
82374+
82375+static struct xenbus_watch be_watch = {
82376+ .node = "backend",
82377+ .callback = backend_changed,
82378+};
82379+
82380+void xenbus_backend_suspend(int (*fn)(struct device *, void *))
82381+{
82382+ DPRINTK("");
82383+ bus_for_each_dev(&xenbus_backend.bus, NULL, NULL, fn);
82384+}
82385+
82386+void xenbus_backend_resume(int (*fn)(struct device *, void *))
82387+{
82388+ DPRINTK("");
82389+ bus_for_each_dev(&xenbus_backend.bus, NULL, NULL, fn);
82390+}
82391+
82392+void xenbus_backend_probe_and_watch(void)
82393+{
82394+ xenbus_probe_devices(&xenbus_backend);
82395+ register_xenbus_watch(&be_watch);
82396+}
82397+
82398+void xenbus_backend_bus_register(void)
82399+{
82400+ bus_register(&xenbus_backend.bus);
82401+}
82402+
82403+void xenbus_backend_device_register(void)
82404+{
82405+ device_register(&xenbus_backend.dev);
82406+}
82407diff -Nur linux-2.6.16.33-noxen/drivers/xen/xenbus/xenbus_xs.c linux-2.6.16.33/drivers/xen/xenbus/xenbus_xs.c
82408--- linux-2.6.16.33-noxen/drivers/xen/xenbus/xenbus_xs.c 1970-01-01 00:00:00.000000000 +0000
82409+++ linux-2.6.16.33/drivers/xen/xenbus/xenbus_xs.c 2007-01-08 15:00:45.000000000 +0000
82410@@ -0,0 +1,859 @@
82411+/******************************************************************************
82412+ * xenbus_xs.c
82413+ *
82414+ * This is the kernel equivalent of the "xs" library. We don't need everything
82415+ * and we use xenbus_comms for communication.
82416+ *
82417+ * Copyright (C) 2005 Rusty Russell, IBM Corporation
82418+ *
82419+ * This program is free software; you can redistribute it and/or
82420+ * modify it under the terms of the GNU General Public License version 2
82421+ * as published by the Free Software Foundation; or, when distributed
82422+ * separately from the Linux kernel or incorporated into other
82423+ * software packages, subject to the following license:
82424+ *
82425+ * Permission is hereby granted, free of charge, to any person obtaining a copy
82426+ * of this source file (the "Software"), to deal in the Software without
82427+ * restriction, including without limitation the rights to use, copy, modify,
82428+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
82429+ * and to permit persons to whom the Software is furnished to do so, subject to
82430+ * the following conditions:
82431+ *
82432+ * The above copyright notice and this permission notice shall be included in
82433+ * all copies or substantial portions of the Software.
82434+ *
82435+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
82436+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
82437+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
82438+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
82439+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
82440+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
82441+ * IN THE SOFTWARE.
82442+ */
82443+
82444+#include <linux/unistd.h>
82445+#include <linux/errno.h>
82446+#include <linux/types.h>
82447+#include <linux/uio.h>
82448+#include <linux/kernel.h>
82449+#include <linux/string.h>
82450+#include <linux/err.h>
82451+#include <linux/slab.h>
82452+#include <linux/fcntl.h>
82453+#include <linux/kthread.h>
82454+#include <linux/rwsem.h>
82455+#include <linux/module.h>
82456+#include <linux/mutex.h>
82457+#include <xen/xenbus.h>
82458+#include "xenbus_comms.h"
82459+
82460+#ifdef HAVE_XEN_PLATFORM_COMPAT_H
82461+#include <xen/platform-compat.h>
82462+#endif
82463+
82464+struct xs_stored_msg {
82465+ struct list_head list;
82466+
82467+ struct xsd_sockmsg hdr;
82468+
82469+ union {
82470+ /* Queued replies. */
82471+ struct {
82472+ char *body;
82473+ } reply;
82474+
82475+ /* Queued watch events. */
82476+ struct {
82477+ struct xenbus_watch *handle;
82478+ char **vec;
82479+ unsigned int vec_size;
82480+ } watch;
82481+ } u;
82482+};
82483+
82484+struct xs_handle {
82485+ /* A list of replies. Currently only one will ever be outstanding. */
82486+ struct list_head reply_list;
82487+ spinlock_t reply_lock;
82488+ wait_queue_head_t reply_waitq;
82489+
82490+ /* One request at a time. */
82491+ struct mutex request_mutex;
82492+
82493+ /* Protect transactions against save/restore. */
82494+ struct rw_semaphore suspend_mutex;
82495+};
82496+
82497+static struct xs_handle xs_state;
82498+
82499+/* List of registered watches, and a lock to protect it. */
82500+static LIST_HEAD(watches);
82501+static DEFINE_SPINLOCK(watches_lock);
82502+
82503+/* List of pending watch callback events, and a lock to protect it. */
82504+static LIST_HEAD(watch_events);
82505+static DEFINE_SPINLOCK(watch_events_lock);
82506+
82507+/*
82508+ * Details of the xenwatch callback kernel thread. The thread waits on the
82509+ * watch_events_waitq for work to do (queued on watch_events list). When it
82510+ * wakes up it acquires the xenwatch_mutex before reading the list and
82511+ * carrying out work.
82512+ */
82513+static pid_t xenwatch_pid;
82514+/* static */ DEFINE_MUTEX(xenwatch_mutex);
82515+static DECLARE_WAIT_QUEUE_HEAD(watch_events_waitq);
82516+
82517+static int get_error(const char *errorstring)
82518+{
82519+ unsigned int i;
82520+
82521+ for (i = 0; strcmp(errorstring, xsd_errors[i].errstring) != 0; i++) {
82522+ if (i == ARRAY_SIZE(xsd_errors) - 1) {
82523+ printk(KERN_WARNING
82524+			       "XENBUS xen store gave: unknown error %s\n",
82525+ errorstring);
82526+ return EINVAL;
82527+ }
82528+ }
82529+ return xsd_errors[i].errnum;
82530+}
82531+
82532+static void *read_reply(enum xsd_sockmsg_type *type, unsigned int *len)
82533+{
82534+ struct xs_stored_msg *msg;
82535+ char *body;
82536+
82537+ spin_lock(&xs_state.reply_lock);
82538+
82539+ while (list_empty(&xs_state.reply_list)) {
82540+ spin_unlock(&xs_state.reply_lock);
82541+ /* XXX FIXME: Avoid synchronous wait for response here. */
82542+ wait_event(xs_state.reply_waitq,
82543+ !list_empty(&xs_state.reply_list));
82544+ spin_lock(&xs_state.reply_lock);
82545+ }
82546+
82547+ msg = list_entry(xs_state.reply_list.next,
82548+ struct xs_stored_msg, list);
82549+ list_del(&msg->list);
82550+
82551+ spin_unlock(&xs_state.reply_lock);
82552+
82553+ *type = msg->hdr.type;
82554+ if (len)
82555+ *len = msg->hdr.len;
82556+ body = msg->u.reply.body;
82557+
82558+ kfree(msg);
82559+
82560+ return body;
82561+}
82562+
82563+/* Emergency write. */
82564+void xenbus_debug_write(const char *str, unsigned int count)
82565+{
82566+ struct xsd_sockmsg msg = { 0 };
82567+
82568+ msg.type = XS_DEBUG;
82569+ msg.len = sizeof("print") + count + 1;
82570+
82571+ mutex_lock(&xs_state.request_mutex);
82572+ xb_write(&msg, sizeof(msg));
82573+ xb_write("print", sizeof("print"));
82574+ xb_write(str, count);
82575+ xb_write("", 1);
82576+ mutex_unlock(&xs_state.request_mutex);
82577+}
82578+
82579+void *xenbus_dev_request_and_reply(struct xsd_sockmsg *msg)
82580+{
82581+ void *ret;
82582+ struct xsd_sockmsg req_msg = *msg;
82583+ int err;
82584+
82585+ if (req_msg.type == XS_TRANSACTION_START)
82586+ down_read(&xs_state.suspend_mutex);
82587+
82588+ mutex_lock(&xs_state.request_mutex);
82589+
82590+ err = xb_write(msg, sizeof(*msg) + msg->len);
82591+ if (err) {
82592+ msg->type = XS_ERROR;
82593+ ret = ERR_PTR(err);
82594+ } else
82595+ ret = read_reply(&msg->type, &msg->len);
82596+
82597+ mutex_unlock(&xs_state.request_mutex);
82598+
82599+ if ((req_msg.type == XS_TRANSACTION_END) ||
82600+ ((req_msg.type == XS_TRANSACTION_START) &&
82601+ (msg->type == XS_ERROR)))
82602+ up_read(&xs_state.suspend_mutex);
82603+
82604+ return ret;
82605+}
82606+
82607+/* Send message to xs, get kmalloc'ed reply. ERR_PTR() on error. */
82608+static void *xs_talkv(struct xenbus_transaction t,
82609+ enum xsd_sockmsg_type type,
82610+ const struct kvec *iovec,
82611+ unsigned int num_vecs,
82612+ unsigned int *len)
82613+{
82614+ struct xsd_sockmsg msg;
82615+ void *ret = NULL;
82616+ unsigned int i;
82617+ int err;
82618+
82619+ msg.tx_id = t.id;
82620+ msg.req_id = 0;
82621+ msg.type = type;
82622+ msg.len = 0;
82623+ for (i = 0; i < num_vecs; i++)
82624+ msg.len += iovec[i].iov_len;
82625+
82626+ mutex_lock(&xs_state.request_mutex);
82627+
82628+ err = xb_write(&msg, sizeof(msg));
82629+ if (err) {
82630+ mutex_unlock(&xs_state.request_mutex);
82631+ return ERR_PTR(err);
82632+ }
82633+
82634+ for (i = 0; i < num_vecs; i++) {
82635+		err = xb_write(iovec[i].iov_base, iovec[i].iov_len);
82636+ if (err) {
82637+ mutex_unlock(&xs_state.request_mutex);
82638+ return ERR_PTR(err);
82639+ }
82640+ }
82641+
82642+ ret = read_reply(&msg.type, len);
82643+
82644+ mutex_unlock(&xs_state.request_mutex);
82645+
82646+ if (IS_ERR(ret))
82647+ return ret;
82648+
82649+ if (msg.type == XS_ERROR) {
82650+ err = get_error(ret);
82651+ kfree(ret);
82652+ return ERR_PTR(-err);
82653+ }
82654+
82655+ if (msg.type != type) {
82656+ if (printk_ratelimit())
82657+ printk(KERN_WARNING
82658+ "XENBUS unexpected type [%d], expected [%d]\n",
82659+ msg.type, type);
82660+ kfree(ret);
82661+ return ERR_PTR(-EINVAL);
82662+ }
82663+ return ret;
82664+}
82665+
82666+/* Simplified version of xs_talkv: single message. */
82667+static void *xs_single(struct xenbus_transaction t,
82668+ enum xsd_sockmsg_type type,
82669+ const char *string,
82670+ unsigned int *len)
82671+{
82672+ struct kvec iovec;
82673+
82674+ iovec.iov_base = (void *)string;
82675+ iovec.iov_len = strlen(string) + 1;
82676+ return xs_talkv(t, type, &iovec, 1, len);
82677+}
82678+
82679+/* Many commands only need an ack, don't care what it says. */
82680+static int xs_error(char *reply)
82681+{
82682+ if (IS_ERR(reply))
82683+ return PTR_ERR(reply);
82684+ kfree(reply);
82685+ return 0;
82686+}
82687+
82688+static unsigned int count_strings(const char *strings, unsigned int len)
82689+{
82690+ unsigned int num;
82691+ const char *p;
82692+
82693+ for (p = strings, num = 0; p < strings + len; p += strlen(p) + 1)
82694+ num++;
82695+
82696+ return num;
82697+}
82698+
82699+/* Return the path to dir with /name appended. Buffer must be kfree()'ed. */
82700+static char *join(const char *dir, const char *name)
82701+{
82702+ char *buffer;
82703+
82704+ if (strlen(name) == 0)
82705+ buffer = kasprintf(GFP_KERNEL, "%s", dir);
82706+ else
82707+ buffer = kasprintf(GFP_KERNEL, "%s/%s", dir, name);
82708+ return (!buffer) ? ERR_PTR(-ENOMEM) : buffer;
82709+}
82710+
82711+static char **split(char *strings, unsigned int len, unsigned int *num)
82712+{
82713+ char *p, **ret;
82714+
82715+ /* Count the strings. */
82716+ *num = count_strings(strings, len);
82717+
82718+ /* Transfer to one big alloc for easy freeing. */
82719+ ret = kmalloc(*num * sizeof(char *) + len, GFP_KERNEL);
82720+ if (!ret) {
82721+ kfree(strings);
82722+ return ERR_PTR(-ENOMEM);
82723+ }
82724+ memcpy(&ret[*num], strings, len);
82725+ kfree(strings);
82726+
82727+ strings = (char *)&ret[*num];
82728+ for (p = strings, *num = 0; p < strings + len; p += strlen(p) + 1)
82729+ ret[(*num)++] = p;
82730+
82731+ return ret;
82732+}
82733+
82734+char **xenbus_directory(struct xenbus_transaction t,
82735+ const char *dir, const char *node, unsigned int *num)
82736+{
82737+ char *strings, *path;
82738+ unsigned int len;
82739+
82740+ path = join(dir, node);
82741+ if (IS_ERR(path))
82742+ return (char **)path;
82743+
82744+ strings = xs_single(t, XS_DIRECTORY, path, &len);
82745+ kfree(path);
82746+ if (IS_ERR(strings))
82747+ return (char **)strings;
82748+
82749+ return split(strings, len, num);
82750+}
82751+EXPORT_SYMBOL_GPL(xenbus_directory);
82752+
82753+/* Check if a path exists. Return 1 if it does. */
82754+int xenbus_exists(struct xenbus_transaction t,
82755+ const char *dir, const char *node)
82756+{
82757+ char **d;
82758+ int dir_n;
82759+
82760+ d = xenbus_directory(t, dir, node, &dir_n);
82761+ if (IS_ERR(d))
82762+ return 0;
82763+ kfree(d);
82764+ return 1;
82765+}
82766+EXPORT_SYMBOL_GPL(xenbus_exists);
82767+
82768+/* Get the value of a single file.
82769+ * Returns a kmalloced value: call kfree() on it after use.
82770+ * len indicates length in bytes.
82771+ */
82772+void *xenbus_read(struct xenbus_transaction t,
82773+ const char *dir, const char *node, unsigned int *len)
82774+{
82775+ char *path;
82776+ void *ret;
82777+
82778+ path = join(dir, node);
82779+ if (IS_ERR(path))
82780+ return (void *)path;
82781+
82782+ ret = xs_single(t, XS_READ, path, len);
82783+ kfree(path);
82784+ return ret;
82785+}
82786+EXPORT_SYMBOL_GPL(xenbus_read);
82787+
82788+/* Write the value of a single file.
82789+ * Returns -err on failure.
82790+ */
82791+int xenbus_write(struct xenbus_transaction t,
82792+ const char *dir, const char *node, const char *string)
82793+{
82794+ const char *path;
82795+ struct kvec iovec[2];
82796+ int ret;
82797+
82798+ path = join(dir, node);
82799+ if (IS_ERR(path))
82800+ return PTR_ERR(path);
82801+
82802+ iovec[0].iov_base = (void *)path;
82803+ iovec[0].iov_len = strlen(path) + 1;
82804+ iovec[1].iov_base = (void *)string;
82805+ iovec[1].iov_len = strlen(string);
82806+
82807+ ret = xs_error(xs_talkv(t, XS_WRITE, iovec, ARRAY_SIZE(iovec), NULL));
82808+ kfree(path);
82809+ return ret;
82810+}
82811+EXPORT_SYMBOL_GPL(xenbus_write);
82812+
82813+/* Create a new directory. */
82814+int xenbus_mkdir(struct xenbus_transaction t,
82815+ const char *dir, const char *node)
82816+{
82817+ char *path;
82818+ int ret;
82819+
82820+ path = join(dir, node);
82821+ if (IS_ERR(path))
82822+ return PTR_ERR(path);
82823+
82824+ ret = xs_error(xs_single(t, XS_MKDIR, path, NULL));
82825+ kfree(path);
82826+ return ret;
82827+}
82828+EXPORT_SYMBOL_GPL(xenbus_mkdir);
82829+
82830+/* Destroy a file or directory (directories must be empty). */
82831+int xenbus_rm(struct xenbus_transaction t, const char *dir, const char *node)
82832+{
82833+ char *path;
82834+ int ret;
82835+
82836+ path = join(dir, node);
82837+ if (IS_ERR(path))
82838+ return PTR_ERR(path);
82839+
82840+ ret = xs_error(xs_single(t, XS_RM, path, NULL));
82841+ kfree(path);
82842+ return ret;
82843+}
82844+EXPORT_SYMBOL_GPL(xenbus_rm);
82845+
82846+/* Start a transaction: changes by others will not be seen during this
82847+ * transaction, and changes will not be visible to others until end.
82848+ */
82849+int xenbus_transaction_start(struct xenbus_transaction *t)
82850+{
82851+ char *id_str;
82852+
82853+ down_read(&xs_state.suspend_mutex);
82854+
82855+ id_str = xs_single(XBT_NIL, XS_TRANSACTION_START, "", NULL);
82856+ if (IS_ERR(id_str)) {
82857+ up_read(&xs_state.suspend_mutex);
82858+ return PTR_ERR(id_str);
82859+ }
82860+
82861+ t->id = simple_strtoul(id_str, NULL, 0);
82862+ kfree(id_str);
82863+ return 0;
82864+}
82865+EXPORT_SYMBOL_GPL(xenbus_transaction_start);
82866+
82867+/* End a transaction.
82868+ * If abort is true, the transaction is discarded instead of committed.
82869+ */
82870+int xenbus_transaction_end(struct xenbus_transaction t, int abort)
82871+{
82872+ char abortstr[2];
82873+ int err;
82874+
82875+ if (abort)
82876+ strcpy(abortstr, "F");
82877+ else
82878+ strcpy(abortstr, "T");
82879+
82880+ err = xs_error(xs_single(t, XS_TRANSACTION_END, abortstr, NULL));
82881+
82882+ up_read(&xs_state.suspend_mutex);
82883+
82884+ return err;
82885+}
82886+EXPORT_SYMBOL_GPL(xenbus_transaction_end);
82887+
82888+/* Single read and scanf: returns -errno or num scanned. */
82889+int xenbus_scanf(struct xenbus_transaction t,
82890+ const char *dir, const char *node, const char *fmt, ...)
82891+{
82892+ va_list ap;
82893+ int ret;
82894+ char *val;
82895+
82896+ val = xenbus_read(t, dir, node, NULL);
82897+ if (IS_ERR(val))
82898+ return PTR_ERR(val);
82899+
82900+ va_start(ap, fmt);
82901+ ret = vsscanf(val, fmt, ap);
82902+ va_end(ap);
82903+ kfree(val);
82904+ /* Distinctive errno. */
82905+ if (ret == 0)
82906+ return -ERANGE;
82907+ return ret;
82908+}
82909+EXPORT_SYMBOL_GPL(xenbus_scanf);
82910+
82911+/* Single printf and write: returns -errno or 0. */
82912+int xenbus_printf(struct xenbus_transaction t,
82913+ const char *dir, const char *node, const char *fmt, ...)
82914+{
82915+ va_list ap;
82916+ int ret;
82917+#define PRINTF_BUFFER_SIZE 4096
82918+ char *printf_buffer;
82919+
82920+ printf_buffer = kmalloc(PRINTF_BUFFER_SIZE, GFP_KERNEL);
82921+ if (printf_buffer == NULL)
82922+ return -ENOMEM;
82923+
82924+ va_start(ap, fmt);
82925+ ret = vsnprintf(printf_buffer, PRINTF_BUFFER_SIZE, fmt, ap);
82926+ va_end(ap);
82927+
82928+ BUG_ON(ret > PRINTF_BUFFER_SIZE-1);
82929+ ret = xenbus_write(t, dir, node, printf_buffer);
82930+
82931+ kfree(printf_buffer);
82932+
82933+ return ret;
82934+}
82935+EXPORT_SYMBOL_GPL(xenbus_printf);
82936+
82937+/* Takes tuples of names, scanf-style args, and void **, NULL terminated. */
82938+int xenbus_gather(struct xenbus_transaction t, const char *dir, ...)
82939+{
82940+ va_list ap;
82941+ const char *name;
82942+ int ret = 0;
82943+
82944+ va_start(ap, dir);
82945+ while (ret == 0 && (name = va_arg(ap, char *)) != NULL) {
82946+ const char *fmt = va_arg(ap, char *);
82947+ void *result = va_arg(ap, void *);
82948+ char *p;
82949+
82950+ p = xenbus_read(t, dir, name, NULL);
82951+ if (IS_ERR(p)) {
82952+ ret = PTR_ERR(p);
82953+ break;
82954+ }
82955+ if (fmt) {
82956+ if (sscanf(p, fmt, result) == 0)
82957+ ret = -EINVAL;
82958+ kfree(p);
82959+ } else
82960+ *(char **)result = p;
82961+ }
82962+ va_end(ap);
82963+ return ret;
82964+}
82965+EXPORT_SYMBOL_GPL(xenbus_gather);
82966+
82967+static int xs_watch(const char *path, const char *token)
82968+{
82969+ struct kvec iov[2];
82970+
82971+ iov[0].iov_base = (void *)path;
82972+ iov[0].iov_len = strlen(path) + 1;
82973+ iov[1].iov_base = (void *)token;
82974+ iov[1].iov_len = strlen(token) + 1;
82975+
82976+ return xs_error(xs_talkv(XBT_NIL, XS_WATCH, iov,
82977+ ARRAY_SIZE(iov), NULL));
82978+}
82979+
82980+static int xs_unwatch(const char *path, const char *token)
82981+{
82982+ struct kvec iov[2];
82983+
82984+ iov[0].iov_base = (char *)path;
82985+ iov[0].iov_len = strlen(path) + 1;
82986+ iov[1].iov_base = (char *)token;
82987+ iov[1].iov_len = strlen(token) + 1;
82988+
82989+ return xs_error(xs_talkv(XBT_NIL, XS_UNWATCH, iov,
82990+ ARRAY_SIZE(iov), NULL));
82991+}
82992+
82993+static struct xenbus_watch *find_watch(const char *token)
82994+{
82995+ struct xenbus_watch *i, *cmp;
82996+
82997+ cmp = (void *)simple_strtoul(token, NULL, 16);
82998+
82999+ list_for_each_entry(i, &watches, list)
83000+ if (i == cmp)
83001+ return i;
83002+
83003+ return NULL;
83004+}
83005+
83006+/* Register callback to watch this node. */
83007+int register_xenbus_watch(struct xenbus_watch *watch)
83008+{
83009+ /* Pointer in ascii is the token. */
83010+ char token[sizeof(watch) * 2 + 1];
83011+ int err;
83012+
83013+ sprintf(token, "%lX", (long)watch);
83014+
83015+ down_read(&xs_state.suspend_mutex);
83016+
83017+ spin_lock(&watches_lock);
83018+ BUG_ON(find_watch(token));
83019+ list_add(&watch->list, &watches);
83020+ spin_unlock(&watches_lock);
83021+
83022+ err = xs_watch(watch->node, token);
83023+
83024+ /* Ignore errors due to multiple registration. */
83025+ if ((err != 0) && (err != -EEXIST)) {
83026+ spin_lock(&watches_lock);
83027+ list_del(&watch->list);
83028+ spin_unlock(&watches_lock);
83029+ }
83030+
83031+ up_read(&xs_state.suspend_mutex);
83032+
83033+ return err;
83034+}
83035+EXPORT_SYMBOL_GPL(register_xenbus_watch);
83036+
83037+void unregister_xenbus_watch(struct xenbus_watch *watch)
83038+{
83039+ struct xs_stored_msg *msg, *tmp;
83040+ char token[sizeof(watch) * 2 + 1];
83041+ int err;
83042+
83043+ sprintf(token, "%lX", (long)watch);
83044+
83045+ down_read(&xs_state.suspend_mutex);
83046+
83047+ spin_lock(&watches_lock);
83048+ BUG_ON(!find_watch(token));
83049+ list_del(&watch->list);
83050+ spin_unlock(&watches_lock);
83051+
83052+ err = xs_unwatch(watch->node, token);
83053+ if (err)
83054+ printk(KERN_WARNING
83055+ "XENBUS Failed to release watch %s: %i\n",
83056+ watch->node, err);
83057+
83058+ up_read(&xs_state.suspend_mutex);
83059+
83060+ /* Cancel pending watch events. */
83061+ spin_lock(&watch_events_lock);
83062+ list_for_each_entry_safe(msg, tmp, &watch_events, list) {
83063+ if (msg->u.watch.handle != watch)
83064+ continue;
83065+ list_del(&msg->list);
83066+ kfree(msg->u.watch.vec);
83067+ kfree(msg);
83068+ }
83069+ spin_unlock(&watch_events_lock);
83070+
83071+ /* Flush any currently-executing callback, unless we are it. :-) */
83072+ if (current->pid != xenwatch_pid) {
83073+ mutex_lock(&xenwatch_mutex);
83074+ mutex_unlock(&xenwatch_mutex);
83075+ }
83076+}
83077+EXPORT_SYMBOL_GPL(unregister_xenbus_watch);
83078+
83079+void xs_suspend(void)
83080+{
83081+ struct xenbus_watch *watch;
83082+ char token[sizeof(watch) * 2 + 1];
83083+
83084+ down_write(&xs_state.suspend_mutex);
83085+
83086+ /* No need for watches_lock: the suspend_mutex is sufficient. */
83087+ list_for_each_entry(watch, &watches, list) {
83088+ sprintf(token, "%lX", (long)watch);
83089+ xs_unwatch(watch->node, token);
83090+ }
83091+
83092+ mutex_lock(&xs_state.request_mutex);
83093+}
83094+
83095+void xs_resume(void)
83096+{
83097+ struct xenbus_watch *watch;
83098+ char token[sizeof(watch) * 2 + 1];
83099+
83100+ mutex_unlock(&xs_state.request_mutex);
83101+
83102+ /* No need for watches_lock: the suspend_mutex is sufficient. */
83103+ list_for_each_entry(watch, &watches, list) {
83104+ sprintf(token, "%lX", (long)watch);
83105+ xs_watch(watch->node, token);
83106+ }
83107+
83108+ up_write(&xs_state.suspend_mutex);
83109+}
83110+
83111+static int xenwatch_handle_callback(void *data)
83112+{
83113+ struct xs_stored_msg *msg = data;
83114+
83115+ msg->u.watch.handle->callback(msg->u.watch.handle,
83116+ (const char **)msg->u.watch.vec,
83117+ msg->u.watch.vec_size);
83118+
83119+ kfree(msg->u.watch.vec);
83120+ kfree(msg);
83121+
83122+ /* Kill this kthread if we were spawned just for this callback. */
83123+ if (current->pid != xenwatch_pid)
83124+ do_exit(0);
83125+
83126+ return 0;
83127+}
83128+
83129+static int xenwatch_thread(void *unused)
83130+{
83131+ struct list_head *ent;
83132+ struct xs_stored_msg *msg;
83133+
83134+ for (;;) {
83135+ wait_event_interruptible(watch_events_waitq,
83136+ !list_empty(&watch_events));
83137+
83138+ if (kthread_should_stop())
83139+ break;
83140+
83141+ mutex_lock(&xenwatch_mutex);
83142+
83143+ spin_lock(&watch_events_lock);
83144+ ent = watch_events.next;
83145+ if (ent != &watch_events)
83146+ list_del(ent);
83147+ spin_unlock(&watch_events_lock);
83148+
83149+ if (ent != &watch_events) {
83150+ msg = list_entry(ent, struct xs_stored_msg, list);
83151+ if (msg->u.watch.handle->flags & XBWF_new_thread)
83152+ kthread_run(xenwatch_handle_callback,
83153+ msg, "xenwatch_cb");
83154+ else
83155+ xenwatch_handle_callback(msg);
83156+ }
83157+
83158+ mutex_unlock(&xenwatch_mutex);
83159+ }
83160+
83161+ return 0;
83162+}
83163+
83164+static int process_msg(void)
83165+{
83166+ struct xs_stored_msg *msg;
83167+ char *body;
83168+ int err;
83169+
83170+ msg = kmalloc(sizeof(*msg), GFP_KERNEL);
83171+ if (msg == NULL)
83172+ return -ENOMEM;
83173+
83174+ err = xb_read(&msg->hdr, sizeof(msg->hdr));
83175+ if (err) {
83176+ kfree(msg);
83177+ return err;
83178+ }
83179+
83180+ body = kmalloc(msg->hdr.len + 1, GFP_KERNEL);
83181+ if (body == NULL) {
83182+ kfree(msg);
83183+ return -ENOMEM;
83184+ }
83185+
83186+ err = xb_read(body, msg->hdr.len);
83187+ if (err) {
83188+ kfree(body);
83189+ kfree(msg);
83190+ return err;
83191+ }
83192+ body[msg->hdr.len] = '\0';
83193+
83194+ if (msg->hdr.type == XS_WATCH_EVENT) {
83195+ msg->u.watch.vec = split(body, msg->hdr.len,
83196+ &msg->u.watch.vec_size);
83197+ if (IS_ERR(msg->u.watch.vec)) {
83198+ kfree(msg);
83199+ return PTR_ERR(msg->u.watch.vec);
83200+ }
83201+
83202+ spin_lock(&watches_lock);
83203+ msg->u.watch.handle = find_watch(
83204+ msg->u.watch.vec[XS_WATCH_TOKEN]);
83205+ if (msg->u.watch.handle != NULL) {
83206+ spin_lock(&watch_events_lock);
83207+ list_add_tail(&msg->list, &watch_events);
83208+ wake_up(&watch_events_waitq);
83209+ spin_unlock(&watch_events_lock);
83210+ } else {
83211+ kfree(msg->u.watch.vec);
83212+ kfree(msg);
83213+ }
83214+ spin_unlock(&watches_lock);
83215+ } else {
83216+ msg->u.reply.body = body;
83217+ spin_lock(&xs_state.reply_lock);
83218+ list_add_tail(&msg->list, &xs_state.reply_list);
83219+ spin_unlock(&xs_state.reply_lock);
83220+ wake_up(&xs_state.reply_waitq);
83221+ }
83222+
83223+ return 0;
83224+}
83225+
83226+static int xenbus_thread(void *unused)
83227+{
83228+ int err;
83229+
83230+ for (;;) {
83231+ err = process_msg();
83232+ if (err)
83233+ printk(KERN_WARNING "XENBUS error %d while reading "
83234+ "message\n", err);
83235+ if (kthread_should_stop())
83236+ break;
83237+ }
83238+
83239+ return 0;
83240+}
83241+
83242+int xs_init(void)
83243+{
83244+ int err;
83245+ struct task_struct *task;
83246+
83247+ INIT_LIST_HEAD(&xs_state.reply_list);
83248+ spin_lock_init(&xs_state.reply_lock);
83249+ init_waitqueue_head(&xs_state.reply_waitq);
83250+
83251+ mutex_init(&xs_state.request_mutex);
83252+ init_rwsem(&xs_state.suspend_mutex);
83253+
83254+ /* Initialize the shared memory rings to talk to xenstored */
83255+ err = xb_init_comms();
83256+ if (err)
83257+ return err;
83258+
83259+ task = kthread_run(xenwatch_thread, NULL, "xenwatch");
83260+ if (IS_ERR(task))
83261+ return PTR_ERR(task);
83262+ xenwatch_pid = task->pid;
83263+
83264+ task = kthread_run(xenbus_thread, NULL, "xenbus");
83265+ if (IS_ERR(task))
83266+ return PTR_ERR(task);
83267+
83268+ return 0;
83269+}
83270diff -Nur linux-2.6.16.33-noxen/drivers/xen/xenoprof/xenoprofile.c linux-2.6.16.33/drivers/xen/xenoprof/xenoprofile.c
83271--- linux-2.6.16.33-noxen/drivers/xen/xenoprof/xenoprofile.c 1970-01-01 00:00:00.000000000 +0000
83272+++ linux-2.6.16.33/drivers/xen/xenoprof/xenoprofile.c 2007-01-08 15:00:45.000000000 +0000
83273@@ -0,0 +1,500 @@
83274+/**
83275+ * @file xenoprofile.c
83276+ *
83277+ * @remark Copyright 2002 OProfile authors
83278+ * @remark Read the file COPYING
83279+ *
83280+ * @author John Levon <levon@movementarian.org>
83281+ *
83282+ * Modified by Aravind Menon and Jose Renato Santos for Xen
83283+ * These modifications are:
83284+ * Copyright (C) 2005 Hewlett-Packard Co.
83285+ *
83286+ * Separated out arch-generic part
83287+ * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp>
83288+ * VA Linux Systems Japan K.K.
83289+ */
83290+
83291+#include <linux/init.h>
83292+#include <linux/notifier.h>
83293+#include <linux/smp.h>
83294+#include <linux/oprofile.h>
83295+#include <linux/sysdev.h>
83296+#include <linux/slab.h>
83297+#include <linux/interrupt.h>
83298+#include <linux/vmalloc.h>
83299+#include <asm/pgtable.h>
83300+#include <xen/evtchn.h>
83301+#include <xen/xenoprof.h>
83302+#include <xen/driver_util.h>
83303+#include <xen/interface/xen.h>
83304+#include <xen/interface/xenoprof.h>
83305+#include "../../../drivers/oprofile/cpu_buffer.h"
83306+#include "../../../drivers/oprofile/event_buffer.h"
83307+
83308+#define MAX_XENOPROF_SAMPLES 16
83309+
83310+/* sample buffers shared with Xen */
83311+xenoprof_buf_t * xenoprof_buf[MAX_VIRT_CPUS];
83312+/* Shared buffer area */
83313+struct xenoprof_shared_buffer shared_buffer;
83314+
83315+/* Passive sample buffers shared with Xen */
83316+xenoprof_buf_t *p_xenoprof_buf[MAX_OPROF_DOMAINS][MAX_VIRT_CPUS];
83317+/* Passive shared buffer area */
83318+struct xenoprof_shared_buffer p_shared_buffer[MAX_OPROF_DOMAINS];
83319+
83320+static int xenoprof_start(void);
83321+static void xenoprof_stop(void);
83322+
83323+static int xenoprof_enabled = 0;
83324+static int xenoprof_is_primary = 0;
83325+static int active_defined;
83326+
83327+/* Number of buffers in shared area (one per VCPU) */
83328+int nbuf;
83329+/* Mappings of VIRQ_XENOPROF to irq number (per cpu) */
83330+int ovf_irq[NR_CPUS];
83331+/* cpu model type string - copied from Xen memory space on XENOPROF_init command */
83332+char cpu_type[XENOPROF_CPU_TYPE_SIZE];
83333+
83334+#ifdef CONFIG_PM
83335+
83336+static int xenoprof_suspend(struct sys_device * dev, pm_message_t state)
83337+{
83338+ if (xenoprof_enabled == 1)
83339+ xenoprof_stop();
83340+ return 0;
83341+}
83342+
83343+
83344+static int xenoprof_resume(struct sys_device * dev)
83345+{
83346+ if (xenoprof_enabled == 1)
83347+ xenoprof_start();
83348+ return 0;
83349+}
83350+
83351+
83352+static struct sysdev_class oprofile_sysclass = {
83353+ set_kset_name("oprofile"),
83354+ .resume = xenoprof_resume,
83355+ .suspend = xenoprof_suspend
83356+};
83357+
83358+
83359+static struct sys_device device_oprofile = {
83360+ .id = 0,
83361+ .cls = &oprofile_sysclass,
83362+};
83363+
83364+
83365+static int __init init_driverfs(void)
83366+{
83367+ int error;
83368+ if (!(error = sysdev_class_register(&oprofile_sysclass)))
83369+ error = sysdev_register(&device_oprofile);
83370+ return error;
83371+}
83372+
83373+
83374+static void exit_driverfs(void)
83375+{
83376+ sysdev_unregister(&device_oprofile);
83377+ sysdev_class_unregister(&oprofile_sysclass);
83378+}
83379+
83380+#else
83381+#define init_driverfs() do { } while (0)
83382+#define exit_driverfs() do { } while (0)
83383+#endif /* CONFIG_PM */
83384+
83385+unsigned long long oprofile_samples = 0;
83386+unsigned long long p_oprofile_samples = 0;
83387+
83388+unsigned int pdomains;
83389+struct xenoprof_passive passive_domains[MAX_OPROF_DOMAINS];
83390+
83391+static void xenoprof_add_pc(xenoprof_buf_t *buf, int is_passive)
83392+{
83393+ int head, tail, size;
83394+
83395+ head = buf->event_head;
83396+ tail = buf->event_tail;
83397+ size = buf->event_size;
83398+
83399+ if (tail > head) {
83400+ while (tail < size) {
83401+ oprofile_add_pc(buf->event_log[tail].eip,
83402+ buf->event_log[tail].mode,
83403+ buf->event_log[tail].event);
83404+ if (!is_passive)
83405+ oprofile_samples++;
83406+ else
83407+ p_oprofile_samples++;
83408+ tail++;
83409+ }
83410+ tail = 0;
83411+ }
83412+ while (tail < head) {
83413+ oprofile_add_pc(buf->event_log[tail].eip,
83414+ buf->event_log[tail].mode,
83415+ buf->event_log[tail].event);
83416+ if (!is_passive)
83417+ oprofile_samples++;
83418+ else
83419+ p_oprofile_samples++;
83420+ tail++;
83421+ }
83422+
83423+ buf->event_tail = tail;
83424+}
83425+
83426+static void xenoprof_handle_passive(void)
83427+{
83428+ int i, j;
83429+ int flag_domain, flag_switch = 0;
83430+
83431+ for (i = 0; i < pdomains; i++) {
83432+ flag_domain = 0;
83433+ for (j = 0; j < passive_domains[i].nbuf; j++) {
83434+ xenoprof_buf_t *buf = p_xenoprof_buf[i][j];
83435+ if (buf->event_head == buf->event_tail)
83436+ continue;
83437+ if (!flag_domain) {
83438+ if (!oprofile_add_domain_switch(passive_domains[i].
83439+ domain_id))
83440+ goto done;
83441+ flag_domain = 1;
83442+ }
83443+ xenoprof_add_pc(buf, 1);
83444+ flag_switch = 1;
83445+ }
83446+ }
83447+done:
83448+ if (flag_switch)
83449+ oprofile_add_domain_switch(COORDINATOR_DOMAIN);
83450+}
83451+
83452+static irqreturn_t
83453+xenoprof_ovf_interrupt(int irq, void * dev_id, struct pt_regs * regs)
83454+{
83455+ struct xenoprof_buf * buf;
83456+ int cpu;
83457+ static unsigned long flag;
83458+
83459+ cpu = smp_processor_id();
83460+ buf = xenoprof_buf[cpu];
83461+
83462+ xenoprof_add_pc(buf, 0);
83463+
83464+ if (xenoprof_is_primary && !test_and_set_bit(0, &flag)) {
83465+ xenoprof_handle_passive();
83466+ smp_mb__before_clear_bit();
83467+ clear_bit(0, &flag);
83468+ }
83469+
83470+ return IRQ_HANDLED;
83471+}
83472+
83473+
83474+static void unbind_virq(void)
83475+{
83476+ int i;
83477+
83478+ for_each_online_cpu(i) {
83479+ if (ovf_irq[i] >= 0) {
83480+ unbind_from_irqhandler(ovf_irq[i], NULL);
83481+ ovf_irq[i] = -1;
83482+ }
83483+ }
83484+}
83485+
83486+
83487+static int bind_virq(void)
83488+{
83489+ int i, result;
83490+
83491+ for_each_online_cpu(i) {
83492+ result = bind_virq_to_irqhandler(VIRQ_XENOPROF,
83493+ i,
83494+ xenoprof_ovf_interrupt,
83495+ SA_INTERRUPT,
83496+ "xenoprof",
83497+ NULL);
83498+
83499+ if (result < 0) {
83500+ unbind_virq();
83501+ return result;
83502+ }
83503+
83504+ ovf_irq[i] = result;
83505+ }
83506+
83507+ return 0;
83508+}
83509+
83510+
83511+static void unmap_passive_list(void)
83512+{
83513+ int i;
83514+ for (i = 0; i < pdomains; i++)
83515+ xenoprof_arch_unmap_shared_buffer(&p_shared_buffer[i]);
83516+ pdomains = 0;
83517+}
83518+
83519+
83520+static int map_xenoprof_buffer(int max_samples)
83521+{
83522+ struct xenoprof_get_buffer get_buffer;
83523+ struct xenoprof_buf *buf;
83524+ int ret, i;
83525+
83526+ if ( shared_buffer.buffer )
83527+ return 0;
83528+
83529+ get_buffer.max_samples = max_samples;
83530+ ret = xenoprof_arch_map_shared_buffer(&get_buffer, &shared_buffer);
83531+ if (ret)
83532+ return ret;
83533+ nbuf = get_buffer.nbuf;
83534+
83535+ for (i=0; i< nbuf; i++) {
83536+ buf = (struct xenoprof_buf*)
83537+ &shared_buffer.buffer[i * get_buffer.bufsize];
83538+ BUG_ON(buf->vcpu_id >= MAX_VIRT_CPUS);
83539+ xenoprof_buf[buf->vcpu_id] = buf;
83540+ }
83541+
83542+ return 0;
83543+}
83544+
83545+
83546+static int xenoprof_setup(void)
83547+{
83548+ int ret;
83549+
83550+ if ( (ret = map_xenoprof_buffer(MAX_XENOPROF_SAMPLES)) )
83551+ return ret;
83552+
83553+ if ( (ret = bind_virq()) )
83554+ return ret;
83555+
83556+ if (xenoprof_is_primary) {
83557+ /* Define dom0 as an active domain if not done yet */
83558+ if (!active_defined) {
83559+ domid_t domid;
83560+ ret = HYPERVISOR_xenoprof_op(XENOPROF_reset_active_list, NULL);
83561+ if (ret)
83562+ goto err;
83563+ domid = 0;
83564+ ret = HYPERVISOR_xenoprof_op(XENOPROF_set_active, &domid);
83565+ if (ret)
83566+ goto err;
83567+ active_defined = 1;
83568+ }
83569+
83570+ ret = HYPERVISOR_xenoprof_op(XENOPROF_reserve_counters, NULL);
83571+ if (ret)
83572+ goto err;
83573+ xenoprof_arch_counter();
83574+ ret = HYPERVISOR_xenoprof_op(XENOPROF_setup_events, NULL);
83575+
83576+ if (ret)
83577+ goto err;
83578+ }
83579+
83580+ ret = HYPERVISOR_xenoprof_op(XENOPROF_enable_virq, NULL);
83581+ if (ret)
83582+ goto err;
83583+
83584+ xenoprof_enabled = 1;
83585+ return 0;
83586+ err:
83587+ unbind_virq();
83588+ return ret;
83589+}
83590+
83591+
83592+static void xenoprof_shutdown(void)
83593+{
83594+ xenoprof_enabled = 0;
83595+
83596+ HYPERVISOR_xenoprof_op(XENOPROF_disable_virq, NULL);
83597+
83598+ if (xenoprof_is_primary) {
83599+ HYPERVISOR_xenoprof_op(XENOPROF_release_counters, NULL);
83600+ active_defined = 0;
83601+ }
83602+
83603+ unbind_virq();
83604+
83605+ xenoprof_arch_unmap_shared_buffer(&shared_buffer);
83606+ if (xenoprof_is_primary)
83607+ unmap_passive_list();
83608+}
83609+
83610+
83611+static int xenoprof_start(void)
83612+{
83613+ int ret = 0;
83614+
83615+ if (xenoprof_is_primary)
83616+ ret = HYPERVISOR_xenoprof_op(XENOPROF_start, NULL);
83617+ if (!ret)
83618+ xenoprof_arch_start();
83619+ return ret;
83620+}
83621+
83622+
83623+static void xenoprof_stop(void)
83624+{
83625+ if (xenoprof_is_primary)
83626+ HYPERVISOR_xenoprof_op(XENOPROF_stop, NULL);
83627+ xenoprof_arch_stop();
83628+}
83629+
83630+
83631+static int xenoprof_set_active(int * active_domains,
83632+ unsigned int adomains)
83633+{
83634+ int ret = 0;
83635+ int i;
83636+ int set_dom0 = 0;
83637+ domid_t domid;
83638+
83639+ if (!xenoprof_is_primary)
83640+ return 0;
83641+
83642+ if (adomains > MAX_OPROF_DOMAINS)
83643+ return -E2BIG;
83644+
83645+ ret = HYPERVISOR_xenoprof_op(XENOPROF_reset_active_list, NULL);
83646+ if (ret)
83647+ return ret;
83648+
83649+ for (i=0; i<adomains; i++) {
83650+ domid = active_domains[i];
83651+ if (domid != active_domains[i]) {
83652+ ret = -EINVAL;
83653+ goto out;
83654+ }
83655+ ret = HYPERVISOR_xenoprof_op(XENOPROF_set_active, &domid);
83656+ if (ret)
83657+ goto out;
83658+ if (active_domains[i] == 0)
83659+ set_dom0 = 1;
83660+ }
83661+ /* dom0 must always be active but may not be in the list */
83662+ if (!set_dom0) {
83663+ domid = 0;
83664+ ret = HYPERVISOR_xenoprof_op(XENOPROF_set_active, &domid);
83665+ }
83666+
83667+out:
83668+ if (ret)
83669+ HYPERVISOR_xenoprof_op(XENOPROF_reset_active_list, NULL);
83670+ active_defined = !ret;
83671+ return ret;
83672+}
83673+
83674+static int xenoprof_set_passive(int * p_domains,
83675+ unsigned int pdoms)
83676+{
83677+ int ret;
83678+ int i, j;
83679+ struct xenoprof_buf *buf;
83680+
83681+ if (!xenoprof_is_primary)
83682+ return 0;
83683+
83684+ if (pdoms > MAX_OPROF_DOMAINS)
83685+ return -E2BIG;
83686+
83687+ ret = HYPERVISOR_xenoprof_op(XENOPROF_reset_passive_list, NULL);
83688+ if (ret)
83689+ return ret;
83690+ unmap_passive_list();
83691+
83692+ for (i = 0; i < pdoms; i++) {
83693+ passive_domains[i].domain_id = p_domains[i];
83694+ passive_domains[i].max_samples = 2048;
83695+ ret = xenoprof_arch_set_passive(&passive_domains[i],
83696+ &p_shared_buffer[i]);
83697+ if (ret)
83698+ goto out;
83699+ for (j = 0; j < passive_domains[i].nbuf; j++) {
83700+ buf = (struct xenoprof_buf *)
83701+ &p_shared_buffer[i].buffer[j * passive_domains[i].bufsize];
83702+ BUG_ON(buf->vcpu_id >= MAX_VIRT_CPUS);
83703+ p_xenoprof_buf[i][buf->vcpu_id] = buf;
83704+ }
83705+ }
83706+
83707+ pdomains = pdoms;
83708+ return 0;
83709+
83710+out:
83711+ for (j = 0; j < i; j++)
83712+ xenoprof_arch_unmap_shared_buffer(&p_shared_buffer[i]);
83713+
83714+ return ret;
83715+}
83716+
83717+struct oprofile_operations xenoprof_ops = {
83718+#ifdef HAVE_XENOPROF_CREATE_FILES
83719+ .create_files = xenoprof_create_files,
83720+#endif
83721+ .set_active = xenoprof_set_active,
83722+ .set_passive = xenoprof_set_passive,
83723+ .setup = xenoprof_setup,
83724+ .shutdown = xenoprof_shutdown,
83725+ .start = xenoprof_start,
83726+ .stop = xenoprof_stop
83727+};
83728+
83729+
83730+/* in order to get driverfs right */
83731+static int using_xenoprof;
83732+
83733+int __init xenoprofile_init(struct oprofile_operations * ops)
83734+{
83735+ struct xenoprof_init init;
83736+ int ret, i;
83737+
83738+ ret = HYPERVISOR_xenoprof_op(XENOPROF_init, &init);
83739+ if (!ret) {
83740+ xenoprof_arch_init_counter(&init);
83741+ xenoprof_is_primary = init.is_primary;
83742+
83743+ /* cpu_type is detected by Xen */
83744+ cpu_type[XENOPROF_CPU_TYPE_SIZE-1] = 0;
83745+ strncpy(cpu_type, init.cpu_type, XENOPROF_CPU_TYPE_SIZE - 1);
83746+ xenoprof_ops.cpu_type = cpu_type;
83747+
83748+ init_driverfs();
83749+ using_xenoprof = 1;
83750+ *ops = xenoprof_ops;
83751+
83752+ for (i=0; i<NR_CPUS; i++)
83753+ ovf_irq[i] = -1;
83754+
83755+ active_defined = 0;
83756+ }
83757+ printk(KERN_INFO "%s: ret %d, events %d, xenoprof_is_primary %d\n",
83758+ __func__, ret, init.num_events, xenoprof_is_primary);
83759+ return ret;
83760+}
83761+
83762+
83763+void xenoprofile_exit(void)
83764+{
83765+ if (using_xenoprof)
83766+ exit_driverfs();
83767+
83768+ xenoprof_arch_unmap_shared_buffer(&shared_buffer);
83769+ if (xenoprof_is_primary) {
83770+ unmap_passive_list();
83771+ HYPERVISOR_xenoprof_op(XENOPROF_shutdown, NULL);
83772+ }
83773+}
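The xenoprof_add_pc() drain above walks a shared sample ring whose valid region may wrap past the end of the buffer: when tail is greater than head it first consumes entries up to the buffer size, resets tail to zero, and then consumes the remaining entries up to head. The following standalone sketch is illustrative only and not part of the patch; ring, size, head and tail stand in for the event_log, event_size, event_head and event_tail fields of the shared buffer.

/* Two-phase drain of a ring buffer whose valid region may wrap,
 * mirroring the tail/head handling in xenoprof_add_pc() above. */
#include <stdio.h>

#define RING_SIZE 8

static int ring[RING_SIZE];

static void drain(int *tail_p, int head, int size)
{
	int tail = *tail_p;

	if (tail > head) {		/* valid entries wrap past the end */
		while (tail < size)
			printf("sample %d\n", ring[tail++]);
		tail = 0;
	}
	while (tail < head)		/* remaining entries up to head */
		printf("sample %d\n", ring[tail++]);

	*tail_p = tail;			/* publish the new tail */
}

int main(void)
{
	int i;
	int tail = 6, head = 3;		/* producer wrapped: slots 6,7,0,1,2 are valid */

	for (i = 0; i < RING_SIZE; i++)
		ring[i] = i;
	drain(&tail, head, RING_SIZE);
	return 0;
}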
83774diff -Nur linux-2.6.16.33-noxen/fs/Kconfig linux-2.6.16.33/fs/Kconfig
83775--- linux-2.6.16.33-noxen/fs/Kconfig 2006-11-22 18:06:31.000000000 +0000
83776+++ linux-2.6.16.33/fs/Kconfig 2007-01-08 15:00:45.000000000 +0000
83777@@ -841,6 +841,7 @@
83778 config HUGETLBFS
83779 bool "HugeTLB file system support"
83780 depends X86 || IA64 || PPC64 || SPARC64 || SUPERH || BROKEN
83781+ depends !XEN
83782
83783 config HUGETLB_PAGE
83784 def_bool HUGETLBFS
83785diff -Nur linux-2.6.16.33-noxen/fs/aio.c linux-2.6.16.33/fs/aio.c
83786--- linux-2.6.16.33-noxen/fs/aio.c 2006-11-22 18:06:31.000000000 +0000
83787+++ linux-2.6.16.33/fs/aio.c 2007-05-23 21:00:01.000000000 +0000
83788@@ -34,6 +34,11 @@
83789 #include <asm/uaccess.h>
83790 #include <asm/mmu_context.h>
83791
83792+#ifdef CONFIG_EPOLL
83793+#include <linux/poll.h>
83794+#include <linux/eventpoll.h>
83795+#endif
83796+
83797 #if DEBUG > 1
83798 #define dprintk printk
83799 #else
83800@@ -1016,6 +1021,10 @@
83801 if (waitqueue_active(&ctx->wait))
83802 wake_up(&ctx->wait);
83803
83804+#ifdef CONFIG_EPOLL
83805+ if (ctx->file && waitqueue_active(&ctx->poll_wait))
83806+ wake_up(&ctx->poll_wait);
83807+#endif
83808 if (ret)
83809 put_ioctx(ctx);
83810
83811@@ -1025,6 +1034,8 @@
83812 /* aio_read_evt
83813 * Pull an event off of the ioctx's event ring. Returns the number of
83814 * events fetched (0 or 1 ;-)
83815+ * If ent parameter is 0, just returns the number of events that would
83816+ * be fetched.
83817 * FIXME: make this use cmpxchg.
83818 * TODO: make the ringbuffer user mmap()able (requires FIXME).
83819 */
83820@@ -1047,13 +1058,18 @@
83821
83822 head = ring->head % info->nr;
83823 if (head != ring->tail) {
83824- struct io_event *evp = aio_ring_event(info, head, KM_USER1);
83825- *ent = *evp;
83826- head = (head + 1) % info->nr;
83827- smp_mb(); /* finish reading the event before updatng the head */
83828- ring->head = head;
83829- ret = 1;
83830- put_aio_ring_event(evp, KM_USER1);
83831+ if (ent) { /* event requested */
83832+ struct io_event *evp =
83833+ aio_ring_event(info, head, KM_USER1);
83834+ *ent = *evp;
83835+ head = (head + 1) % info->nr;
83836+			/* finish reading the event before updating the head */
83837+ smp_mb();
83838+ ring->head = head;
83839+ ret = 1;
83840+ put_aio_ring_event(evp, KM_USER1);
83841+ } else /* only need to know availability */
83842+ ret = 1;
83843 }
83844 spin_unlock(&info->ring_lock);
83845
83846@@ -1236,9 +1252,78 @@
83847
83848 aio_cancel_all(ioctx);
83849 wait_for_all_aios(ioctx);
83850+#ifdef CONFIG_EPOLL
83851+ /* forget the poll file, but it's up to the user to close it */
83852+ if (ioctx->file) {
83853+ ioctx->file->private_data = 0;
83854+ ioctx->file = 0;
83855+ }
83856+#endif
83857 put_ioctx(ioctx); /* once for the lookup */
83858 }
83859
83860+#ifdef CONFIG_EPOLL
83861+
83862+static int aio_queue_fd_close(struct inode *inode, struct file *file)
83863+{
83864+ struct kioctx *ioctx = file->private_data;
83865+ if (ioctx) {
83866+ file->private_data = 0;
83867+ spin_lock_irq(&ioctx->ctx_lock);
83868+ ioctx->file = 0;
83869+ spin_unlock_irq(&ioctx->ctx_lock);
83870+ }
83871+ return 0;
83872+}
83873+
83874+static unsigned int aio_queue_fd_poll(struct file *file, poll_table *wait)
83875+{ unsigned int pollflags = 0;
83876+ struct kioctx *ioctx = file->private_data;
83877+
83878+ if (ioctx) {
83879+
83880+ spin_lock_irq(&ioctx->ctx_lock);
83881+ /* Insert inside our poll wait queue */
83882+ poll_wait(file, &ioctx->poll_wait, wait);
83883+
83884+ /* Check our condition */
83885+ if (aio_read_evt(ioctx, 0))
83886+ pollflags = POLLIN | POLLRDNORM;
83887+ spin_unlock_irq(&ioctx->ctx_lock);
83888+ }
83889+
83890+ return pollflags;
83891+}
83892+
83893+static struct file_operations aioq_fops = {
83894+ .release = aio_queue_fd_close,
83895+ .poll = aio_queue_fd_poll
83896+};
83897+
83898+/* make_aio_fd:
83899+ * Create a file descriptor that can be used to poll the event queue.
83900+ * Based and piggybacked on the excellent epoll code.
83901+ */
83902+
83903+static int make_aio_fd(struct kioctx *ioctx)
83904+{
83905+ int error, fd;
83906+ struct inode *inode;
83907+ struct file *file;
83908+
83909+ error = ep_getfd(&fd, &inode, &file, NULL, &aioq_fops);
83910+ if (error)
83911+ return error;
83912+
83913+ /* associate the file with the IO context */
83914+ file->private_data = ioctx;
83915+ ioctx->file = file;
83916+ init_waitqueue_head(&ioctx->poll_wait);
83917+ return fd;
83918+}
83919+#endif
83920+
83921+
83922 /* sys_io_setup:
83923 * Create an aio_context capable of receiving at least nr_events.
83924 * ctxp must not point to an aio_context that already exists, and
83925@@ -1251,18 +1336,30 @@
83926 * resources are available. May fail with -EFAULT if an invalid
83927 * pointer is passed for ctxp. Will fail with -ENOSYS if not
83928 * implemented.
83929+ *
83930+ * To request a selectable fd, the user context has to be initialized
83931+ * to 1, instead of 0, and the return value is the fd.
83932+ * This keeps the system call compatible, since a non-zero value
83933+ * was not allowed so far.
83934 */
83935 asmlinkage long sys_io_setup(unsigned nr_events, aio_context_t __user *ctxp)
83936 {
83937 struct kioctx *ioctx = NULL;
83938 unsigned long ctx;
83939 long ret;
83940+ int make_fd = 0;
83941
83942 ret = get_user(ctx, ctxp);
83943 if (unlikely(ret))
83944 goto out;
83945
83946 ret = -EINVAL;
83947+#ifdef CONFIG_EPOLL
83948+ if (ctx == 1) {
83949+ make_fd = 1;
83950+ ctx = 0;
83951+ }
83952+#endif
83953 if (unlikely(ctx || nr_events == 0)) {
83954 pr_debug("EINVAL: io_setup: ctx %lu nr_events %u\n",
83955 ctx, nr_events);
83956@@ -1273,8 +1370,12 @@
83957 ret = PTR_ERR(ioctx);
83958 if (!IS_ERR(ioctx)) {
83959 ret = put_user(ioctx->user_id, ctxp);
83960- if (!ret)
83961- return 0;
83962+#ifdef CONFIG_EPOLL
83963+ if (make_fd && ret >= 0)
83964+ ret = make_aio_fd(ioctx);
83965+#endif
83966+ if (ret >= 0)
83967+ return ret;
83968
83969 get_ioctx(ioctx); /* io_destroy() expects us to hold a ref */
83970 io_destroy(ioctx);
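The sys_io_setup() extension above overloads the context argument: a caller that pre-initialises its aio_context_t to 1 receives a file descriptor (created by make_aio_fd()) that can be polled for completed events, while the real context id is still written back through ctxp. A hypothetical userspace sketch, assuming the patched kernel, a raw syscall(SYS_io_setup, ...) invocation and a local aio_context_t typedef:

/* Request a pollable AIO event fd from the patched sys_io_setup(). */
#include <poll.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

typedef unsigned long aio_context_t;	/* matches the kernel ABI on i386 */

int main(void)
{
	aio_context_t ctx = 1;		/* 1 (not 0) requests the selectable fd */
	struct pollfd pfd;
	long fd;

	fd = syscall(SYS_io_setup, 128, &ctx);
	if (fd < 0) {
		perror("io_setup");
		return 1;
	}
	/* ctx now holds the real context id; fd reports event availability */
	pfd.fd = (int)fd;
	pfd.events = POLLIN;
	poll(&pfd, 1, 0);
	printf("ctx=%#lx fd=%ld revents=%#x\n", ctx, fd, (unsigned)pfd.revents);
	return 0;
}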
83971diff -Nur linux-2.6.16.33-noxen/fs/eventpoll.c linux-2.6.16.33/fs/eventpoll.c
83972--- linux-2.6.16.33-noxen/fs/eventpoll.c 2006-11-22 18:06:31.000000000 +0000
83973+++ linux-2.6.16.33/fs/eventpoll.c 2007-05-23 21:00:01.000000000 +0000
83974@@ -235,8 +235,6 @@
83975
83976 static void ep_poll_safewake_init(struct poll_safewake *psw);
83977 static void ep_poll_safewake(struct poll_safewake *psw, wait_queue_head_t *wq);
83978-static int ep_getfd(int *efd, struct inode **einode, struct file **efile,
83979- struct eventpoll *ep);
83980 static int ep_alloc(struct eventpoll **pep);
83981 static void ep_free(struct eventpoll *ep);
83982 static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd);
83983@@ -266,7 +264,7 @@
83984 static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
83985 int maxevents, long timeout);
83986 static int eventpollfs_delete_dentry(struct dentry *dentry);
83987-static struct inode *ep_eventpoll_inode(void);
83988+static struct inode *ep_eventpoll_inode(struct file_operations *fops);
83989 static struct super_block *eventpollfs_get_sb(struct file_system_type *fs_type,
83990 int flags, const char *dev_name,
83991 void *data);
83992@@ -525,7 +523,7 @@
83993 * Creates all the items needed to setup an eventpoll file. That is,
83994 * a file structure, and inode and a free file descriptor.
83995 */
83996- error = ep_getfd(&fd, &inode, &file, ep);
83997+ error = ep_getfd(&fd, &inode, &file, ep, &eventpoll_fops);
83998 if (error)
83999 goto eexit_2;
84000
84001@@ -710,8 +708,8 @@
84002 /*
84003 * Creates the file descriptor to be used by the epoll interface.
84004 */
84005-static int ep_getfd(int *efd, struct inode **einode, struct file **efile,
84006- struct eventpoll *ep)
84007+int ep_getfd(int *efd, struct inode **einode, struct file **efile,
84008+ struct eventpoll *ep, struct file_operations *fops)
84009 {
84010 struct qstr this;
84011 char name[32];
84012@@ -727,7 +725,7 @@
84013 goto eexit_1;
84014
84015 /* Allocates an inode from the eventpoll file system */
84016- inode = ep_eventpoll_inode();
84017+ inode = ep_eventpoll_inode(fops);
84018 error = PTR_ERR(inode);
84019 if (IS_ERR(inode))
84020 goto eexit_2;
84021@@ -758,7 +756,7 @@
84022
84023 file->f_pos = 0;
84024 file->f_flags = O_RDONLY;
84025- file->f_op = &eventpoll_fops;
84026+ file->f_op = fops;
84027 file->f_mode = FMODE_READ;
84028 file->f_version = 0;
84029 file->private_data = ep;
84030@@ -1574,7 +1572,7 @@
84031 }
84032
84033
84034-static struct inode *ep_eventpoll_inode(void)
84035+static struct inode *ep_eventpoll_inode(struct file_operations *fops)
84036 {
84037 int error = -ENOMEM;
84038 struct inode *inode = new_inode(eventpoll_mnt->mnt_sb);
84039@@ -1582,7 +1580,7 @@
84040 if (!inode)
84041 goto eexit_1;
84042
84043- inode->i_fop = &eventpoll_fops;
84044+ inode->i_fop = fops;
84045
84046 /*
84047 * Mark the inode dirty from the very beginning,
84048diff -Nur linux-2.6.16.33-noxen/fs/proc/proc_misc.c linux-2.6.16.33/fs/proc/proc_misc.c
84049--- linux-2.6.16.33-noxen/fs/proc/proc_misc.c 2006-11-22 18:06:31.000000000 +0000
84050+++ linux-2.6.16.33/fs/proc/proc_misc.c 2007-05-23 21:00:01.000000000 +0000
84051@@ -433,7 +433,7 @@
84052 (unsigned long long)cputime64_to_clock_t(irq),
84053 (unsigned long long)cputime64_to_clock_t(softirq),
84054 (unsigned long long)cputime64_to_clock_t(steal));
84055- for_each_online_cpu(i) {
84056+ for_each_cpu(i) {
84057
84058 /* Copy values here to work around gcc-2.95.3, gcc-2.96 */
84059 user = kstat_cpu(i).cpustat.user;
84060diff -Nur linux-2.6.16.33-noxen/include/asm-generic/vmlinux.lds.h linux-2.6.16.33/include/asm-generic/vmlinux.lds.h
84061--- linux-2.6.16.33-noxen/include/asm-generic/vmlinux.lds.h 2006-11-22 18:06:31.000000000 +0000
84062+++ linux-2.6.16.33/include/asm-generic/vmlinux.lds.h 2007-05-23 21:00:01.000000000 +0000
84063@@ -152,3 +152,6 @@
84064 .stab.index 0 : { *(.stab.index) } \
84065 .stab.indexstr 0 : { *(.stab.indexstr) } \
84066 .comment 0 : { *(.comment) }
84067+
84068+#define NOTES \
84069+ .notes : { *(.note.*) } :note
84070diff -Nur linux-2.6.16.33-noxen/include/asm-i386/a.out.h linux-2.6.16.33/include/asm-i386/a.out.h
84071--- linux-2.6.16.33-noxen/include/asm-i386/a.out.h 2006-11-22 18:06:31.000000000 +0000
84072+++ linux-2.6.16.33/include/asm-i386/a.out.h 2007-01-08 15:00:45.000000000 +0000
84073@@ -19,7 +19,7 @@
84074
84075 #ifdef __KERNEL__
84076
84077-#define STACK_TOP TASK_SIZE
84078+#define STACK_TOP (TASK_SIZE - 3*PAGE_SIZE)
84079
84080 #endif
84081
84082diff -Nur linux-2.6.16.33-noxen/include/asm-i386/apic.h linux-2.6.16.33/include/asm-i386/apic.h
84083--- linux-2.6.16.33-noxen/include/asm-i386/apic.h 2006-11-22 18:06:31.000000000 +0000
84084+++ linux-2.6.16.33/include/asm-i386/apic.h 2007-01-08 15:00:45.000000000 +0000
84085@@ -132,10 +132,12 @@
84086
84087 extern int disable_timer_pin_1;
84088
84089+#ifndef CONFIG_XEN
84090 void smp_send_timer_broadcast_ipi(struct pt_regs *regs);
84091 void switch_APIC_timer_to_ipi(void *cpumask);
84092 void switch_ipi_to_APIC_timer(void *cpumask);
84093 #define ARCH_APICTIMER_STOPS_ON_C3 1
84094+#endif
84095
84096 extern int timer_over_8254;
84097
84098diff -Nur linux-2.6.16.33-noxen/include/asm-i386/atomic.h linux-2.6.16.33/include/asm-i386/atomic.h
84099--- linux-2.6.16.33-noxen/include/asm-i386/atomic.h 2006-11-22 18:06:31.000000000 +0000
84100+++ linux-2.6.16.33/include/asm-i386/atomic.h 2007-01-08 15:00:45.000000000 +0000
84101@@ -4,18 +4,13 @@
84102 #include <linux/config.h>
84103 #include <linux/compiler.h>
84104 #include <asm/processor.h>
84105+#include <asm/smp_alt.h>
84106
84107 /*
84108 * Atomic operations that C can't guarantee us. Useful for
84109 * resource counting etc..
84110 */
84111
84112-#ifdef CONFIG_SMP
84113-#define LOCK "lock ; "
84114-#else
84115-#define LOCK ""
84116-#endif
84117-
84118 /*
84119 * Make sure gcc doesn't try to be clever and move things around
84120 * on us. We need to use _exactly_ the address the user gave us,
84121diff -Nur linux-2.6.16.33-noxen/include/asm-i386/bitops.h linux-2.6.16.33/include/asm-i386/bitops.h
84122--- linux-2.6.16.33-noxen/include/asm-i386/bitops.h 2006-11-22 18:06:31.000000000 +0000
84123+++ linux-2.6.16.33/include/asm-i386/bitops.h 2007-01-08 15:00:45.000000000 +0000
84124@@ -7,6 +7,7 @@
84125
84126 #include <linux/config.h>
84127 #include <linux/compiler.h>
84128+#include <asm/smp_alt.h>
84129
84130 /*
84131 * These have to be done with inline assembly: that way the bit-setting
84132@@ -16,12 +17,6 @@
84133 * bit 0 is the LSB of addr; bit 32 is the LSB of (addr+1).
84134 */
84135
84136-#ifdef CONFIG_SMP
84137-#define LOCK_PREFIX "lock ; "
84138-#else
84139-#define LOCK_PREFIX ""
84140-#endif
84141-
84142 #define ADDR (*(volatile long *) addr)
84143
84144 /**
84145@@ -41,7 +36,7 @@
84146 */
84147 static inline void set_bit(int nr, volatile unsigned long * addr)
84148 {
84149- __asm__ __volatile__( LOCK_PREFIX
84150+ __asm__ __volatile__( LOCK
84151 "btsl %1,%0"
84152 :"+m" (ADDR)
84153 :"Ir" (nr));
84154@@ -76,7 +71,7 @@
84155 */
84156 static inline void clear_bit(int nr, volatile unsigned long * addr)
84157 {
84158- __asm__ __volatile__( LOCK_PREFIX
84159+ __asm__ __volatile__( LOCK
84160 "btrl %1,%0"
84161 :"+m" (ADDR)
84162 :"Ir" (nr));
84163@@ -121,7 +116,7 @@
84164 */
84165 static inline void change_bit(int nr, volatile unsigned long * addr)
84166 {
84167- __asm__ __volatile__( LOCK_PREFIX
84168+ __asm__ __volatile__( LOCK
84169 "btcl %1,%0"
84170 :"+m" (ADDR)
84171 :"Ir" (nr));
84172@@ -140,7 +135,7 @@
84173 {
84174 int oldbit;
84175
84176- __asm__ __volatile__( LOCK_PREFIX
84177+ __asm__ __volatile__( LOCK
84178 "btsl %2,%1\n\tsbbl %0,%0"
84179 :"=r" (oldbit),"+m" (ADDR)
84180 :"Ir" (nr) : "memory");
84181@@ -180,7 +175,7 @@
84182 {
84183 int oldbit;
84184
84185- __asm__ __volatile__( LOCK_PREFIX
84186+ __asm__ __volatile__( LOCK
84187 "btrl %2,%1\n\tsbbl %0,%0"
84188 :"=r" (oldbit),"+m" (ADDR)
84189 :"Ir" (nr) : "memory");
84190@@ -231,7 +226,7 @@
84191 {
84192 int oldbit;
84193
84194- __asm__ __volatile__( LOCK_PREFIX
84195+ __asm__ __volatile__( LOCK
84196 "btcl %2,%1\n\tsbbl %0,%0"
84197 :"=r" (oldbit),"+m" (ADDR)
84198 :"Ir" (nr) : "memory");
84199diff -Nur linux-2.6.16.33-noxen/include/asm-i386/elf.h linux-2.6.16.33/include/asm-i386/elf.h
84200--- linux-2.6.16.33-noxen/include/asm-i386/elf.h 2006-11-22 18:06:31.000000000 +0000
84201+++ linux-2.6.16.33/include/asm-i386/elf.h 2007-01-08 15:00:45.000000000 +0000
84202@@ -129,11 +129,16 @@
84203 #define ELF_CORE_COPY_FPREGS(tsk, elf_fpregs) dump_task_fpu(tsk, elf_fpregs)
84204 #define ELF_CORE_COPY_XFPREGS(tsk, elf_xfpregs) dump_task_extended_fpu(tsk, elf_xfpregs)
84205
84206-#define VSYSCALL_BASE (__fix_to_virt(FIX_VSYSCALL))
84207+#define VSYSCALL_BASE (PAGE_OFFSET - 2*PAGE_SIZE)
84208 #define VSYSCALL_EHDR ((const struct elfhdr *) VSYSCALL_BASE)
84209 #define VSYSCALL_ENTRY ((unsigned long) &__kernel_vsyscall)
84210 extern void __kernel_vsyscall;
84211
84212+#define ARCH_HAS_SETUP_ADDITIONAL_PAGES
84213+struct linux_binprm;
84214+extern int arch_setup_additional_pages(struct linux_binprm *bprm,
84215+ int executable_stack);
84216+
84217 #define ARCH_DLINFO \
84218 do { \
84219 NEW_AUX_ENT(AT_SYSINFO, VSYSCALL_ENTRY); \
84220diff -Nur linux-2.6.16.33-noxen/include/asm-i386/fixmap.h linux-2.6.16.33/include/asm-i386/fixmap.h
84221--- linux-2.6.16.33-noxen/include/asm-i386/fixmap.h 2006-11-22 18:06:31.000000000 +0000
84222+++ linux-2.6.16.33/include/asm-i386/fixmap.h 2007-01-08 15:00:45.000000000 +0000
84223@@ -20,7 +20,7 @@
84224 * Leave one empty page between vmalloc'ed areas and
84225 * the start of the fixmap.
84226 */
84227-#define __FIXADDR_TOP 0xfffff000
84228+extern unsigned long __FIXADDR_TOP;
84229
84230 #ifndef __ASSEMBLY__
84231 #include <linux/kernel.h>
84232@@ -52,7 +52,6 @@
84233 */
84234 enum fixed_addresses {
84235 FIX_HOLE,
84236- FIX_VSYSCALL,
84237 #ifdef CONFIG_X86_LOCAL_APIC
84238 FIX_APIC_BASE, /* local (CPU) APIC) -- required for SMP or not */
84239 #endif
84240@@ -95,6 +94,8 @@
84241 extern void __set_fixmap (enum fixed_addresses idx,
84242 unsigned long phys, pgprot_t flags);
84243
84244+extern void set_fixaddr_top(unsigned long top);
84245+
84246 #define set_fixmap(idx, phys) \
84247 __set_fixmap(idx, phys, PAGE_KERNEL)
84248 /*
84249@@ -116,14 +117,6 @@
84250 #define __fix_to_virt(x) (FIXADDR_TOP - ((x) << PAGE_SHIFT))
84251 #define __virt_to_fix(x) ((FIXADDR_TOP - ((x)&PAGE_MASK)) >> PAGE_SHIFT)
84252
84253-/*
84254- * This is the range that is readable by user mode, and things
84255- * acting like user mode such as get_user_pages.
84256- */
84257-#define FIXADDR_USER_START (__fix_to_virt(FIX_VSYSCALL))
84258-#define FIXADDR_USER_END (FIXADDR_USER_START + PAGE_SIZE)
84259-
84260-
84261 extern void __this_fixmap_does_not_exist(void);
84262
84263 /*
84264diff -Nur linux-2.6.16.33-noxen/include/asm-i386/futex.h linux-2.6.16.33/include/asm-i386/futex.h
84265--- linux-2.6.16.33-noxen/include/asm-i386/futex.h 2006-11-22 18:06:31.000000000 +0000
84266+++ linux-2.6.16.33/include/asm-i386/futex.h 2007-01-08 15:00:45.000000000 +0000
84267@@ -28,7 +28,7 @@
84268 "1: movl %2, %0\n\
84269 movl %0, %3\n" \
84270 insn "\n" \
84271-"2: " LOCK_PREFIX "cmpxchgl %3, %2\n\
84272+"2: " LOCK "cmpxchgl %3, %2\n\
84273 jnz 1b\n\
84274 3: .section .fixup,\"ax\"\n\
84275 4: mov %5, %1\n\
84276@@ -68,7 +68,7 @@
84277 #endif
84278 switch (op) {
84279 case FUTEX_OP_ADD:
84280- __futex_atomic_op1(LOCK_PREFIX "xaddl %0, %2", ret,
84281+ __futex_atomic_op1(LOCK "xaddl %0, %2", ret,
84282 oldval, uaddr, oparg);
84283 break;
84284 case FUTEX_OP_OR:
84285diff -Nur linux-2.6.16.33-noxen/include/asm-i386/kexec.h linux-2.6.16.33/include/asm-i386/kexec.h
84286--- linux-2.6.16.33-noxen/include/asm-i386/kexec.h 2006-11-22 18:06:31.000000000 +0000
84287+++ linux-2.6.16.33/include/asm-i386/kexec.h 2007-01-08 15:00:45.000000000 +0000
84288@@ -1,6 +1,26 @@
84289 #ifndef _I386_KEXEC_H
84290 #define _I386_KEXEC_H
84291
84292+#define PA_CONTROL_PAGE 0
84293+#define VA_CONTROL_PAGE 1
84294+#define PA_PGD 2
84295+#define VA_PGD 3
84296+#define PA_PTE_0 4
84297+#define VA_PTE_0 5
84298+#define PA_PTE_1 6
84299+#define VA_PTE_1 7
84300+#ifdef CONFIG_X86_PAE
84301+#define PA_PMD_0 8
84302+#define VA_PMD_0 9
84303+#define PA_PMD_1 10
84304+#define VA_PMD_1 11
84305+#define PAGES_NR 12
84306+#else
84307+#define PAGES_NR 8
84308+#endif
84309+
84310+#ifndef __ASSEMBLY__
84311+
84312 #include <asm/fixmap.h>
84313 #include <asm/ptrace.h>
84314 #include <asm/string.h>
84315@@ -72,5 +92,26 @@
84316 newregs->eip = (unsigned long)current_text_addr();
84317 }
84318 }
84319+asmlinkage NORET_TYPE void
84320+relocate_kernel(unsigned long indirection_page,
84321+ unsigned long control_page,
84322+ unsigned long start_address,
84323+ unsigned int has_pae) ATTRIB_NORET;
84324+
84325+
84326+/* Under Xen we need to work with machine addresses. These macros give the
84327+ * machine address of a certain page to the generic kexec code instead of
84328+ * the pseudo physical address which would be given by the default macros.
84329+ */
84330+
84331+#ifdef CONFIG_XEN
84332+#define KEXEC_ARCH_HAS_PAGE_MACROS
84333+#define kexec_page_to_pfn(page) pfn_to_mfn(page_to_pfn(page))
84334+#define kexec_pfn_to_page(pfn) pfn_to_page(mfn_to_pfn(pfn))
84335+#define kexec_virt_to_phys(addr) virt_to_machine(addr)
84336+#define kexec_phys_to_virt(addr) phys_to_virt(machine_to_phys(addr))
84337+#endif
84338+
84339+#endif /* __ASSEMBLY__ */
84340
84341 #endif /* _I386_KEXEC_H */
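The comment above distinguishes Xen's pseudo-physical frame numbers (what the guest sees) from machine frame numbers (what the hardware uses); the kexec_* macros simply route the generic kexec code through that translation. A toy sketch of the idea, not from the patch; the tiny lookup tables below are stand-ins for the real translation tables maintained by the hypervisor and guest.

/* Toy pfn<->mfn translation illustrating what the kexec_* macros rely on. */
#include <stdio.h>

#define NR_FRAMES 4

/* guest pseudo-physical frame -> machine frame, and the inverse */
static const unsigned long pfn_to_mfn_table[NR_FRAMES] = { 7, 2, 9, 4 };
static unsigned long mfn_to_pfn_table[16];

static unsigned long pfn_to_mfn(unsigned long pfn) { return pfn_to_mfn_table[pfn]; }
static unsigned long mfn_to_pfn(unsigned long mfn) { return mfn_to_pfn_table[mfn]; }

int main(void)
{
	unsigned long pfn;

	for (pfn = 0; pfn < NR_FRAMES; pfn++)
		mfn_to_pfn_table[pfn_to_mfn_table[pfn]] = pfn;

	for (pfn = 0; pfn < NR_FRAMES; pfn++)
		printf("pfn %lu -> mfn %lu -> pfn %lu\n",
		       pfn, pfn_to_mfn(pfn), mfn_to_pfn(pfn_to_mfn(pfn)));
	return 0;
}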
84342diff -Nur linux-2.6.16.33-noxen/include/asm-i386/mach-default/mach_traps.h linux-2.6.16.33/include/asm-i386/mach-default/mach_traps.h
84343--- linux-2.6.16.33-noxen/include/asm-i386/mach-default/mach_traps.h 2006-11-22 18:06:31.000000000 +0000
84344+++ linux-2.6.16.33/include/asm-i386/mach-default/mach_traps.h 2007-01-08 15:00:45.000000000 +0000
84345@@ -15,6 +15,18 @@
84346 outb(reason, 0x61);
84347 }
84348
84349+static inline void clear_io_check_error(unsigned char reason)
84350+{
84351+ unsigned long i;
84352+
84353+ reason = (reason & 0xf) | 8;
84354+ outb(reason, 0x61);
84355+ i = 2000;
84356+ while (--i) udelay(1000);
84357+ reason &= ~8;
84358+ outb(reason, 0x61);
84359+}
84360+
84361 static inline unsigned char get_nmi_reason(void)
84362 {
84363 return inb(0x61);
84364diff -Nur linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/agp.h linux-2.6.16.33/include/asm-i386/mach-xen/asm/agp.h
84365--- linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/agp.h 1970-01-01 00:00:00.000000000 +0000
84366+++ linux-2.6.16.33/include/asm-i386/mach-xen/asm/agp.h 2007-01-08 15:00:45.000000000 +0000
84367@@ -0,0 +1,37 @@
84368+#ifndef AGP_H
84369+#define AGP_H 1
84370+
84371+#include <asm/pgtable.h>
84372+#include <asm/cacheflush.h>
84373+#include <asm/system.h>
84374+
84375+/*
84376+ * Functions to keep the agpgart mappings coherent with the MMU.
84377+ * The GART gives the CPU a physical alias of pages in memory. The alias region is
84378+ * mapped uncacheable. Make sure there are no conflicting mappings
84379+ * with different cacheability attributes for the same page. This avoids
84380+ * data corruption on some CPUs.
84381+ */
84382+
84383+int map_page_into_agp(struct page *page);
84384+int unmap_page_from_agp(struct page *page);
84385+#define flush_agp_mappings() global_flush_tlb()
84386+
84387+/* Could use CLFLUSH here if the cpu supports it. But then it would
84388+ need to be called for each cacheline of the whole page so it may not be
84389+ worth it. Would need a page for it. */
84390+#define flush_agp_cache() wbinvd()
84391+
84392+/* Convert a physical address to an address suitable for the GART. */
84393+#define phys_to_gart(x) phys_to_machine(x)
84394+#define gart_to_phys(x) machine_to_phys(x)
84395+
84396+/* GATT allocation. Returns/accepts GATT kernel virtual address. */
84397+#define alloc_gatt_pages(order) ({ \
84398+ char *_t; dma_addr_t _d; \
84399+ _t = dma_alloc_coherent(NULL,PAGE_SIZE<<(order),&_d,GFP_KERNEL); \
84400+ _t; })
84401+#define free_gatt_pages(table, order) \
84402+ dma_free_coherent(NULL,PAGE_SIZE<<(order),(table),virt_to_bus(table))
84403+
84404+#endif
84405diff -Nur linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/desc.h linux-2.6.16.33/include/asm-i386/mach-xen/asm/desc.h
84406--- linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/desc.h 1970-01-01 00:00:00.000000000 +0000
84407+++ linux-2.6.16.33/include/asm-i386/mach-xen/asm/desc.h 2007-01-08 15:00:45.000000000 +0000
84408@@ -0,0 +1,164 @@
84409+#ifndef __ARCH_DESC_H
84410+#define __ARCH_DESC_H
84411+
84412+#include <asm/ldt.h>
84413+#include <asm/segment.h>
84414+
84415+#define CPU_16BIT_STACK_SIZE 1024
84416+
84417+#ifndef __ASSEMBLY__
84418+
84419+#include <linux/preempt.h>
84420+#include <linux/smp.h>
84421+
84422+#include <asm/mmu.h>
84423+
84424+extern struct desc_struct cpu_gdt_table[GDT_ENTRIES];
84425+
84426+DECLARE_PER_CPU(unsigned char, cpu_16bit_stack[CPU_16BIT_STACK_SIZE]);
84427+
84428+struct Xgt_desc_struct {
84429+ unsigned short size;
84430+ unsigned long address __attribute__((packed));
84431+ unsigned short pad;
84432+} __attribute__ ((packed));
84433+
84434+extern struct Xgt_desc_struct idt_descr;
84435+DECLARE_PER_CPU(struct Xgt_desc_struct, cpu_gdt_descr);
84436+
84437+
84438+static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
84439+{
84440+ return (struct desc_struct *)per_cpu(cpu_gdt_descr, cpu).address;
84441+}
84442+
84443+#define load_TR_desc() __asm__ __volatile__("ltr %w0"::"q" (GDT_ENTRY_TSS*8))
84444+#define load_LDT_desc() __asm__ __volatile__("lldt %w0"::"q" (GDT_ENTRY_LDT*8))
84445+
84446+#define load_gdt(dtr) __asm__ __volatile("lgdt %0"::"m" (*dtr))
84447+#define load_idt(dtr) __asm__ __volatile("lidt %0"::"m" (*dtr))
84448+#define load_tr(tr) __asm__ __volatile("ltr %0"::"mr" (tr))
84449+#define load_ldt(ldt) __asm__ __volatile("lldt %0"::"mr" (ldt))
84450+
84451+#define store_gdt(dtr) __asm__ ("sgdt %0":"=m" (*dtr))
84452+#define store_idt(dtr) __asm__ ("sidt %0":"=m" (*dtr))
84453+#define store_tr(tr) __asm__ ("str %0":"=mr" (tr))
84454+#define store_ldt(ldt) __asm__ ("sldt %0":"=mr" (ldt))
84455+
84456+/*
84457+ * This is the ldt that every process will get unless we need
84458+ * something other than this.
84459+ */
84460+extern struct desc_struct default_ldt[];
84461+extern void set_intr_gate(unsigned int irq, void * addr);
84462+
84463+#define _set_tssldt_desc(n,addr,limit,type) \
84464+__asm__ __volatile__ ("movw %w3,0(%2)\n\t" \
84465+ "movw %w1,2(%2)\n\t" \
84466+ "rorl $16,%1\n\t" \
84467+ "movb %b1,4(%2)\n\t" \
84468+ "movb %4,5(%2)\n\t" \
84469+ "movb $0,6(%2)\n\t" \
84470+ "movb %h1,7(%2)\n\t" \
84471+ "rorl $16,%1" \
84472+ : "=m"(*(n)) : "q" (addr), "r"(n), "ir"(limit), "i"(type))
84473+
84474+#ifndef CONFIG_X86_NO_TSS
84475+static inline void __set_tss_desc(unsigned int cpu, unsigned int entry, void *addr)
84476+{
84477+ _set_tssldt_desc(&get_cpu_gdt_table(cpu)[entry], (int)addr,
84478+ offsetof(struct tss_struct, __cacheline_filler) - 1, 0x89);
84479+}
84480+
84481+#define set_tss_desc(cpu,addr) __set_tss_desc(cpu, GDT_ENTRY_TSS, addr)
84482+#endif
84483+
84484+static inline void set_ldt_desc(unsigned int cpu, void *addr, unsigned int size)
84485+{
84486+ _set_tssldt_desc(&get_cpu_gdt_table(cpu)[GDT_ENTRY_LDT], (int)addr, ((size << 3)-1), 0x82);
84487+}
84488+
84489+#define LDT_entry_a(info) \
84490+ ((((info)->base_addr & 0x0000ffff) << 16) | ((info)->limit & 0x0ffff))
84491+
84492+#define LDT_entry_b(info) \
84493+ (((info)->base_addr & 0xff000000) | \
84494+ (((info)->base_addr & 0x00ff0000) >> 16) | \
84495+ ((info)->limit & 0xf0000) | \
84496+ (((info)->read_exec_only ^ 1) << 9) | \
84497+ ((info)->contents << 10) | \
84498+ (((info)->seg_not_present ^ 1) << 15) | \
84499+ ((info)->seg_32bit << 22) | \
84500+ ((info)->limit_in_pages << 23) | \
84501+ ((info)->useable << 20) | \
84502+ 0x7000)
84503+
84504+#define LDT_empty(info) (\
84505+ (info)->base_addr == 0 && \
84506+ (info)->limit == 0 && \
84507+ (info)->contents == 0 && \
84508+ (info)->read_exec_only == 1 && \
84509+ (info)->seg_32bit == 0 && \
84510+ (info)->limit_in_pages == 0 && \
84511+ (info)->seg_not_present == 1 && \
84512+ (info)->useable == 0 )
84513+
84514+extern int write_ldt_entry(void *ldt, int entry, __u32 entry_a, __u32 entry_b);
84515+
84516+#if TLS_SIZE != 24
84517+# error update this code.
84518+#endif
84519+
84520+static inline void load_TLS(struct thread_struct *t, unsigned int cpu)
84521+{
84522+#define C(i) HYPERVISOR_update_descriptor(virt_to_machine(&get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i]), *(u64 *)&t->tls_array[i])
84523+ C(0); C(1); C(2);
84524+#undef C
84525+}
84526+
84527+static inline void clear_LDT(void)
84528+{
84529+ int cpu = get_cpu();
84530+
84531+ /*
84532+ * NB. We load the default_ldt for lcall7/27 handling on demand, as
84533+	 * it slows down context switching. No one uses it anyway.
84534+ */
84535+ cpu = cpu; /* XXX avoid compiler warning */
84536+ xen_set_ldt(0UL, 0);
84537+ put_cpu();
84538+}
84539+
84540+/*
84541+ * load one particular LDT into the current CPU
84542+ */
84543+static inline void load_LDT_nolock(mm_context_t *pc, int cpu)
84544+{
84545+ void *segments = pc->ldt;
84546+ int count = pc->size;
84547+
84548+ if (likely(!count))
84549+ segments = NULL;
84550+
84551+ xen_set_ldt((unsigned long)segments, count);
84552+}
84553+
84554+static inline void load_LDT(mm_context_t *pc)
84555+{
84556+ int cpu = get_cpu();
84557+ load_LDT_nolock(pc, cpu);
84558+ put_cpu();
84559+}
84560+
84561+static inline unsigned long get_desc_base(unsigned long *desc)
84562+{
84563+ unsigned long base;
84564+ base = ((desc[0] >> 16) & 0x0000ffff) |
84565+ ((desc[1] << 16) & 0x00ff0000) |
84566+ (desc[1] & 0xff000000);
84567+ return base;
84568+}
84569+
84570+#endif /* !__ASSEMBLY__ */
84571+
84572+#endif
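get_desc_base() above reassembles the base address that the i386 descriptor format scatters across both words of a GDT/LDT entry: bits 15..0 of the base live in the upper half of the first word, bits 23..16 in the low byte of the second word, and bits 31..24 in its top byte. A small worked example, not part of the patch, using an arbitrary descriptor that encodes base 0x12345678:

/* Reassemble the scattered base field of an x86 segment descriptor. */
#include <stdio.h>

static unsigned long get_desc_base(unsigned long *desc)
{
	unsigned long base;
	base = ((desc[0] >> 16) & 0x0000ffff) |	/* base 15..0  */
	       ((desc[1] << 16) & 0x00ff0000) |	/* base 23..16 */
	       (desc[1] & 0xff000000);		/* base 31..24 */
	return base;
}

int main(void)
{
	/* word 0: limit 15..0 = 0xffff, base 15..0 = 0x5678
	 * word 1: base 23..16 = 0x34, access/flags, base 31..24 = 0x12 */
	unsigned long desc[2] = { 0x5678ffffUL, 0x12cf9a34UL };

	printf("base = %#lx\n", get_desc_base(desc));	/* prints 0x12345678 */
	return 0;
}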
84573diff -Nur linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/dma-mapping.h linux-2.6.16.33/include/asm-i386/mach-xen/asm/dma-mapping.h
84574--- linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/dma-mapping.h 1970-01-01 00:00:00.000000000 +0000
84575+++ linux-2.6.16.33/include/asm-i386/mach-xen/asm/dma-mapping.h 2007-01-08 15:00:45.000000000 +0000
84576@@ -0,0 +1,152 @@
84577+#ifndef _ASM_I386_DMA_MAPPING_H
84578+#define _ASM_I386_DMA_MAPPING_H
84579+
84580+/*
84581+ * IOMMU interface. See Documentation/DMA-mapping.txt and DMA-API.txt for
84582+ * documentation.
84583+ */
84584+
84585+#include <linux/config.h>
84586+#include <linux/mm.h>
84587+#include <asm/cache.h>
84588+#include <asm/io.h>
84589+#include <asm/scatterlist.h>
84590+#include <asm/swiotlb.h>
84591+
84592+static inline int
84593+address_needs_mapping(struct device *hwdev, dma_addr_t addr)
84594+{
84595+ dma_addr_t mask = 0xffffffff;
84596+ /* If the device has a mask, use it, otherwise default to 32 bits */
84597+ if (hwdev && hwdev->dma_mask)
84598+ mask = *hwdev->dma_mask;
84599+ return (addr & ~mask) != 0;
84600+}
84601+
84602+static inline int
84603+range_straddles_page_boundary(void *p, size_t size)
84604+{
84605+ extern unsigned long *contiguous_bitmap;
84606+ return (((((unsigned long)p & ~PAGE_MASK) + size) > PAGE_SIZE) &&
84607+ !test_bit(__pa(p) >> PAGE_SHIFT, contiguous_bitmap));
84608+}
84609+
84610+#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f)
84611+#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h)
84612+
84613+void *dma_alloc_coherent(struct device *dev, size_t size,
84614+ dma_addr_t *dma_handle, gfp_t flag);
84615+
84616+void dma_free_coherent(struct device *dev, size_t size,
84617+ void *vaddr, dma_addr_t dma_handle);
84618+
84619+extern dma_addr_t
84620+dma_map_single(struct device *dev, void *ptr, size_t size,
84621+ enum dma_data_direction direction);
84622+
84623+extern void
84624+dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size,
84625+ enum dma_data_direction direction);
84626+
84627+extern int dma_map_sg(struct device *hwdev, struct scatterlist *sg,
84628+ int nents, enum dma_data_direction direction);
84629+extern void dma_unmap_sg(struct device *hwdev, struct scatterlist *sg,
84630+ int nents, enum dma_data_direction direction);
84631+
84632+extern dma_addr_t
84633+dma_map_page(struct device *dev, struct page *page, unsigned long offset,
84634+ size_t size, enum dma_data_direction direction);
84635+
84636+extern void
84637+dma_unmap_page(struct device *dev, dma_addr_t dma_address, size_t size,
84638+ enum dma_data_direction direction);
84639+
84640+extern void
84641+dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size,
84642+ enum dma_data_direction direction);
84643+
84644+extern void
84645+dma_sync_single_for_device(struct device *dev, dma_addr_t dma_handle, size_t size,
84646+ enum dma_data_direction direction);
84647+
84648+static inline void
84649+dma_sync_single_range_for_cpu(struct device *dev, dma_addr_t dma_handle,
84650+ unsigned long offset, size_t size,
84651+ enum dma_data_direction direction)
84652+{
84653+ dma_sync_single_for_cpu(dev, dma_handle+offset, size, direction);
84654+}
84655+
84656+static inline void
84657+dma_sync_single_range_for_device(struct device *dev, dma_addr_t dma_handle,
84658+ unsigned long offset, size_t size,
84659+ enum dma_data_direction direction)
84660+{
84661+ dma_sync_single_for_device(dev, dma_handle+offset, size, direction);
84662+}
84663+
84664+static inline void
84665+dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems,
84666+ enum dma_data_direction direction)
84667+{
84668+ if (swiotlb)
84669+ swiotlb_sync_sg_for_cpu(dev,sg,nelems,direction);
84670+ flush_write_buffers();
84671+}
84672+
84673+static inline void
84674+dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems,
84675+ enum dma_data_direction direction)
84676+{
84677+ if (swiotlb)
84678+ swiotlb_sync_sg_for_device(dev,sg,nelems,direction);
84679+ flush_write_buffers();
84680+}
84681+
84682+extern int
84683+dma_mapping_error(dma_addr_t dma_addr);
84684+
84685+extern int
84686+dma_supported(struct device *dev, u64 mask);
84687+
84688+static inline int
84689+dma_set_mask(struct device *dev, u64 mask)
84690+{
84691+ if(!dev->dma_mask || !dma_supported(dev, mask))
84692+ return -EIO;
84693+
84694+ *dev->dma_mask = mask;
84695+
84696+ return 0;
84697+}
84698+
84699+static inline int
84700+dma_get_cache_alignment(void)
84701+{
84702+ /* no easy way to get cache size on all x86, so return the
84703+ * maximum possible, to be safe */
84704+ return (1 << INTERNODE_CACHE_SHIFT);
84705+}
84706+
84707+#define dma_is_consistent(d) (1)
84708+
84709+static inline void
84710+dma_cache_sync(void *vaddr, size_t size,
84711+ enum dma_data_direction direction)
84712+{
84713+ flush_write_buffers();
84714+}
84715+
84716+#define ARCH_HAS_DMA_DECLARE_COHERENT_MEMORY
84717+extern int
84718+dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr,
84719+ dma_addr_t device_addr, size_t size, int flags);
84720+
84721+extern void
84722+dma_release_declared_memory(struct device *dev);
84723+
84724+extern void *
84725+dma_mark_declared_memory_occupied(struct device *dev,
84726+ dma_addr_t device_addr, size_t size);
84727+
84728+#endif
84729diff -Nur linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/fixmap.h linux-2.6.16.33/include/asm-i386/mach-xen/asm/fixmap.h
84730--- linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/fixmap.h 1970-01-01 00:00:00.000000000 +0000
84731+++ linux-2.6.16.33/include/asm-i386/mach-xen/asm/fixmap.h 2007-01-08 15:00:45.000000000 +0000
84732@@ -0,0 +1,155 @@
84733+/*
84734+ * fixmap.h: compile-time virtual memory allocation
84735+ *
84736+ * This file is subject to the terms and conditions of the GNU General Public
84737+ * License. See the file "COPYING" in the main directory of this archive
84738+ * for more details.
84739+ *
84740+ * Copyright (C) 1998 Ingo Molnar
84741+ *
84742+ * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
84743+ */
84744+
84745+#ifndef _ASM_FIXMAP_H
84746+#define _ASM_FIXMAP_H
84747+
84748+#include <linux/config.h>
84749+
84750+/* used by vmalloc.c, vsyscall.lds.S.
84751+ *
84752+ * Leave one empty page between vmalloc'ed areas and
84753+ * the start of the fixmap.
84754+ */
84755+extern unsigned long __FIXADDR_TOP;
84756+
84757+#ifndef __ASSEMBLY__
84758+#include <linux/kernel.h>
84759+#include <asm/acpi.h>
84760+#include <asm/apicdef.h>
84761+#include <asm/page.h>
84762+#ifdef CONFIG_HIGHMEM
84763+#include <linux/threads.h>
84764+#include <asm/kmap_types.h>
84765+#endif
84766+
84767+/*
84768+ * Here we define all the compile-time 'special' virtual
84769+ * addresses. The point is to have a constant address at
84770+ * compile time, but to set the physical address only
84771+ * in the boot process. We allocate these special addresses
84772+ * from the end of virtual memory (0xfffff000) backwards.
84773+ * Also this lets us do fail-safe vmalloc(), we
84774+ * can guarantee that these special addresses and
84775+ * vmalloc()-ed addresses never overlap.
84776+ *
84777+ * these 'compile-time allocated' memory buffers are
84778+ * fixed-size 4k pages. (or larger if used with an increment
84779+ * higher than 1) use fixmap_set(idx,phys) to associate
84780+ * physical memory with fixmap indices.
84781+ *
84782+ * TLB entries of such buffers will not be flushed across
84783+ * task switches.
84784+ */
84785+enum fixed_addresses {
84786+ FIX_HOLE,
84787+#ifdef CONFIG_X86_LOCAL_APIC
84788+ FIX_APIC_BASE, /* local (CPU) APIC) -- required for SMP or not */
84789+#endif
84790+#ifdef CONFIG_X86_IO_APIC
84791+ FIX_IO_APIC_BASE_0,
84792+ FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS-1,
84793+#endif
84794+#ifdef CONFIG_X86_VISWS_APIC
84795+ FIX_CO_CPU, /* Cobalt timer */
84796+ FIX_CO_APIC, /* Cobalt APIC Redirection Table */
84797+ FIX_LI_PCIA, /* Lithium PCI Bridge A */
84798+ FIX_LI_PCIB, /* Lithium PCI Bridge B */
84799+#endif
84800+#ifdef CONFIG_X86_F00F_BUG
84801+ FIX_F00F_IDT, /* Virtual mapping for IDT */
84802+#endif
84803+#ifdef CONFIG_X86_CYCLONE_TIMER
84804+ FIX_CYCLONE_TIMER, /*cyclone timer register*/
84805+#endif
84806+#ifdef CONFIG_HIGHMEM
84807+ FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */
84808+ FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1,
84809+#endif
84810+#ifdef CONFIG_ACPI
84811+ FIX_ACPI_BEGIN,
84812+ FIX_ACPI_END = FIX_ACPI_BEGIN + FIX_ACPI_PAGES - 1,
84813+#endif
84814+#ifdef CONFIG_PCI_MMCONFIG
84815+ FIX_PCIE_MCFG,
84816+#endif
84817+ FIX_SHARED_INFO,
84818+#define NR_FIX_ISAMAPS 256
84819+ FIX_ISAMAP_END,
84820+ FIX_ISAMAP_BEGIN = FIX_ISAMAP_END + NR_FIX_ISAMAPS - 1,
84821+ __end_of_permanent_fixed_addresses,
84822+ /* temporary boot-time mappings, used before ioremap() is functional */
84823+#define NR_FIX_BTMAPS 16
84824+ FIX_BTMAP_END = __end_of_permanent_fixed_addresses,
84825+ FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS - 1,
84826+ FIX_WP_TEST,
84827+ __end_of_fixed_addresses
84828+};
84829+
84830+extern void __set_fixmap(enum fixed_addresses idx,
84831+ maddr_t phys, pgprot_t flags);
84832+
84833+extern void set_fixaddr_top(void);
84834+
84835+#define set_fixmap(idx, phys) \
84836+ __set_fixmap(idx, phys, PAGE_KERNEL)
84837+/*
84838+ * Some hardware wants to get fixmapped without caching.
84839+ */
84840+#define set_fixmap_nocache(idx, phys) \
84841+ __set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE)
84842+
84843+#define clear_fixmap(idx) \
84844+ __set_fixmap(idx, 0, __pgprot(0))
84845+
84846+#define FIXADDR_TOP ((unsigned long)__FIXADDR_TOP)
84847+
84848+#define __FIXADDR_SIZE (__end_of_permanent_fixed_addresses << PAGE_SHIFT)
84849+#define __FIXADDR_BOOT_SIZE (__end_of_fixed_addresses << PAGE_SHIFT)
84850+#define FIXADDR_START (FIXADDR_TOP - __FIXADDR_SIZE)
84851+#define FIXADDR_BOOT_START (FIXADDR_TOP - __FIXADDR_BOOT_SIZE)
84852+
84853+#define __fix_to_virt(x) (FIXADDR_TOP - ((x) << PAGE_SHIFT))
84854+#define __virt_to_fix(x) ((FIXADDR_TOP - ((x)&PAGE_MASK)) >> PAGE_SHIFT)
84855+
84856+extern void __this_fixmap_does_not_exist(void);
84857+
84858+/*
84859+ * 'index to address' translation. If anyone tries to use the idx
84860+ * directly without translation, we catch the bug with a NULL-dereference
84861+ * kernel oops. Illegal ranges of incoming indices are caught too.
84862+ */
84863+static __always_inline unsigned long fix_to_virt(const unsigned int idx)
84864+{
84865+ /*
84866+ * this branch gets completely eliminated after inlining,
84867+ * except when someone tries to use fixaddr indices in an
84868+ * illegal way. (such as mixing up address types or using
84869+ * out-of-range indices).
84870+ *
84871+ * If it doesn't get removed, the linker will complain
84872+ * loudly with a reasonably clear error message..
84873+ */
84874+ if (idx >= __end_of_fixed_addresses)
84875+ __this_fixmap_does_not_exist();
84876+
84877+ return __fix_to_virt(idx);
84878+}
84879+
84880+static inline unsigned long virt_to_fix(const unsigned long vaddr)
84881+{
84882+ BUG_ON(vaddr >= FIXADDR_TOP || vaddr < FIXADDR_START);
84883+ return __virt_to_fix(vaddr);
84884+}
84885+
84886+#endif /* !__ASSEMBLY__ */
84887+#endif
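The comment block above explains that fixmap slots are 'compile-time allocated' virtual pages handed out downwards from FIXADDR_TOP, one page per fixed_addresses index, with fix_to_virt()/__fix_to_virt() doing the index-to-address arithmetic. A tiny illustration of that arithmetic follows; it is not part of the patch, and the 4 KiB page size and 0xfffff000 top are example values only (in the Xen variant __FIXADDR_TOP is a runtime variable rather than this constant):

/* Fixmap index-to-address arithmetic: one page per index, growing down. */
#include <stdio.h>

#define PAGE_SHIFT	12
#define FIXADDR_TOP	0xfffff000UL	/* example value; runtime-set under Xen */

#define __fix_to_virt(x)  (FIXADDR_TOP - ((unsigned long)(x) << PAGE_SHIFT))

int main(void)
{
	unsigned int idx;

	/* index 0 maps at the top; each further index is one page lower */
	for (idx = 0; idx < 4; idx++)
		printf("fixmap index %u -> %#lx\n", idx, __fix_to_virt(idx));
	return 0;
}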
84888diff -Nur linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/floppy.h linux-2.6.16.33/include/asm-i386/mach-xen/asm/floppy.h
84889--- linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/floppy.h 1970-01-01 00:00:00.000000000 +0000
84890+++ linux-2.6.16.33/include/asm-i386/mach-xen/asm/floppy.h 2007-01-08 15:00:45.000000000 +0000
84891@@ -0,0 +1,147 @@
84892+/*
84893+ * Architecture specific parts of the Floppy driver
84894+ *
84895+ * This file is subject to the terms and conditions of the GNU General Public
84896+ * License. See the file "COPYING" in the main directory of this archive
84897+ * for more details.
84898+ *
84899+ * Copyright (C) 1995
84900+ *
84901+ * Modifications for Xen are Copyright (c) 2004, Keir Fraser.
84902+ */
84903+#ifndef __ASM_XEN_I386_FLOPPY_H
84904+#define __ASM_XEN_I386_FLOPPY_H
84905+
84906+#include <linux/vmalloc.h>
84907+
84908+/* XEN: Hit DMA paths on the head. This trick from asm-m68k/floppy.h. */
84909+#include <asm/dma.h>
84910+#undef MAX_DMA_ADDRESS
84911+#define MAX_DMA_ADDRESS 0
84912+#define CROSS_64KB(a,s) (0)
84913+
84914+#define fd_inb(port) inb_p(port)
84915+#define fd_outb(value,port) outb_p(value,port)
84916+
84917+#define fd_request_dma() (0)
84918+#define fd_free_dma() ((void)0)
84919+#define fd_enable_irq() enable_irq(FLOPPY_IRQ)
84920+#define fd_disable_irq() disable_irq(FLOPPY_IRQ)
84921+#define fd_free_irq() free_irq(FLOPPY_IRQ, NULL)
84922+#define fd_get_dma_residue() (virtual_dma_count + virtual_dma_residue)
84923+#define fd_dma_setup(addr, size, mode, io) vdma_dma_setup(addr, size, mode, io)
84924+/*
84925+ * Do not use vmalloc/vfree: floppy_release_irq_and_dma() gets called from
84926+ * softirq context via motor_off_callback. A generic bug we happen to trigger.
84927+ */
84928+#define fd_dma_mem_alloc(size) __get_free_pages(GFP_KERNEL, get_order(size))
84929+#define fd_dma_mem_free(addr, size) free_pages(addr, get_order(size))
84930+
84931+static int virtual_dma_count;
84932+static int virtual_dma_residue;
84933+static char *virtual_dma_addr;
84934+static int virtual_dma_mode;
84935+static int doing_pdma;
84936+
84937+static irqreturn_t floppy_hardint(int irq, void *dev_id, struct pt_regs * regs)
84938+{
84939+ register unsigned char st;
84940+ register int lcount;
84941+ register char *lptr;
84942+
84943+ if (!doing_pdma)
84944+ return floppy_interrupt(irq, dev_id, regs);
84945+
84946+ st = 1;
84947+ for(lcount=virtual_dma_count, lptr=virtual_dma_addr;
84948+ lcount; lcount--, lptr++) {
84949+ st=inb(virtual_dma_port+4) & 0xa0 ;
84950+ if(st != 0xa0)
84951+ break;
84952+ if(virtual_dma_mode)
84953+ outb_p(*lptr, virtual_dma_port+5);
84954+ else
84955+ *lptr = inb_p(virtual_dma_port+5);
84956+ }
84957+ virtual_dma_count = lcount;
84958+ virtual_dma_addr = lptr;
84959+ st = inb(virtual_dma_port+4);
84960+
84961+ if(st == 0x20)
84962+ return IRQ_HANDLED;
84963+ if(!(st & 0x20)) {
84964+ virtual_dma_residue += virtual_dma_count;
84965+ virtual_dma_count=0;
84966+ doing_pdma = 0;
84967+ floppy_interrupt(irq, dev_id, regs);
84968+ return IRQ_HANDLED;
84969+ }
84970+ return IRQ_HANDLED;
84971+}
84972+
84973+static void fd_disable_dma(void)
84974+{
84975+ doing_pdma = 0;
84976+ virtual_dma_residue += virtual_dma_count;
84977+ virtual_dma_count=0;
84978+}
84979+
84980+static int fd_request_irq(void)
84981+{
84982+ return request_irq(FLOPPY_IRQ, floppy_hardint,SA_INTERRUPT,
84983+ "floppy", NULL);
84984+}
84985+
84986+static int vdma_dma_setup(char *addr, unsigned long size, int mode, int io)
84987+{
84988+ doing_pdma = 1;
84989+ virtual_dma_port = io;
84990+ virtual_dma_mode = (mode == DMA_MODE_WRITE);
84991+ virtual_dma_addr = addr;
84992+ virtual_dma_count = size;
84993+ virtual_dma_residue = 0;
84994+ return 0;
84995+}
84996+
84997+/* XEN: This trick to force 'virtual DMA' is from include/asm-m68k/floppy.h. */
84998+#define FDC1 xen_floppy_init()
84999+static int FDC2 = -1;
85000+
85001+static int xen_floppy_init(void)
85002+{
85003+ use_virtual_dma = 1;
85004+ can_use_virtual_dma = 1;
85005+ return 0x3f0;
85006+}
85007+
85008+/*
85009+ * Floppy types are stored in the rtc's CMOS RAM and so rtc_lock
85010+ * is needed to prevent corrupted CMOS RAM in case "insmod floppy"
85011+ * coincides with another rtc CMOS user. Paul G.
85012+ */
85013+#define FLOPPY0_TYPE ({ \
85014+ unsigned long flags; \
85015+ unsigned char val; \
85016+ spin_lock_irqsave(&rtc_lock, flags); \
85017+ val = (CMOS_READ(0x10) >> 4) & 15; \
85018+ spin_unlock_irqrestore(&rtc_lock, flags); \
85019+ val; \
85020+})
85021+
85022+#define FLOPPY1_TYPE ({ \
85023+ unsigned long flags; \
85024+ unsigned char val; \
85025+ spin_lock_irqsave(&rtc_lock, flags); \
85026+ val = CMOS_READ(0x10) & 15; \
85027+ spin_unlock_irqrestore(&rtc_lock, flags); \
85028+ val; \
85029+})
85030+
85031+#define N_FDC 2
85032+#define N_DRIVE 8
85033+
85034+#define FLOPPY_MOTOR_MASK 0xf0
85035+
85036+#define EXTRA_FLOPPY_PARAMS
85037+
85038+#endif /* __ASM_XEN_I386_FLOPPY_H */
85039diff -Nur linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/highmem.h linux-2.6.16.33/include/asm-i386/mach-xen/asm/highmem.h
85040--- linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/highmem.h 1970-01-01 00:00:00.000000000 +0000
85041+++ linux-2.6.16.33/include/asm-i386/mach-xen/asm/highmem.h 2007-01-08 15:00:45.000000000 +0000
85042@@ -0,0 +1,81 @@
85043+/*
85044+ * highmem.h: virtual kernel memory mappings for high memory
85045+ *
85046+ * Used in CONFIG_HIGHMEM systems for memory pages which
85047+ * are not addressable by direct kernel virtual addresses.
85048+ *
85049+ * Copyright (C) 1999 Gerhard Wichert, Siemens AG
85050+ * Gerhard.Wichert@pdb.siemens.de
85051+ *
85052+ *
85053+ * Redesigned the x86 32-bit VM architecture to deal with
85054+ * up to 16 Terabyte physical memory. With current x86 CPUs
85055+ * we now support up to 64 Gigabytes physical RAM.
85056+ *
85057+ * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
85058+ */
85059+
85060+#ifndef _ASM_HIGHMEM_H
85061+#define _ASM_HIGHMEM_H
85062+
85063+#ifdef __KERNEL__
85064+
85065+#include <linux/config.h>
85066+#include <linux/interrupt.h>
85067+#include <linux/threads.h>
85068+#include <asm/kmap_types.h>
85069+#include <asm/tlbflush.h>
85070+
85071+/* declarations for highmem.c */
85072+extern unsigned long highstart_pfn, highend_pfn;
85073+
85074+extern pte_t *kmap_pte;
85075+extern pgprot_t kmap_prot;
85076+extern pte_t *pkmap_page_table;
85077+
85078+/*
85079+ * Right now we initialize only a single pte table. It can be extended
85080+ * easily, subsequent pte tables have to be allocated in one physical
85081+ * chunk of RAM.
85082+ */
85083+#ifdef CONFIG_X86_PAE
85084+#define LAST_PKMAP 512
85085+#else
85086+#define LAST_PKMAP 1024
85087+#endif
85088+/*
85089+ * Ordering is:
85090+ *
85091+ * FIXADDR_TOP
85092+ * fixed_addresses
85093+ * FIXADDR_START
85094+ * temp fixed addresses
85095+ * FIXADDR_BOOT_START
85096+ * Persistent kmap area
85097+ * PKMAP_BASE
85098+ * VMALLOC_END
85099+ * Vmalloc area
85100+ * VMALLOC_START
85101+ * high_memory
85102+ */
85103+#define PKMAP_BASE ( (FIXADDR_BOOT_START - PAGE_SIZE*(LAST_PKMAP + 1)) & PMD_MASK )
85104+#define LAST_PKMAP_MASK (LAST_PKMAP-1)
85105+#define PKMAP_NR(virt) ((virt-PKMAP_BASE) >> PAGE_SHIFT)
85106+#define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT))
85107+
85108+extern void * FASTCALL(kmap_high(struct page *page));
85109+extern void FASTCALL(kunmap_high(struct page *page));
85110+
85111+void *kmap(struct page *page);
85112+void kunmap(struct page *page);
85113+void *kmap_atomic(struct page *page, enum km_type type);
85114+void *kmap_atomic_pte(struct page *page, enum km_type type);
85115+void kunmap_atomic(void *kvaddr, enum km_type type);
85116+void *kmap_atomic_pfn(unsigned long pfn, enum km_type type);
85117+struct page *kmap_atomic_to_page(void *ptr);
85118+
85119+#define flush_cache_kmaps() do { } while (0)
85120+
85121+#endif /* __KERNEL__ */
85122+
85123+#endif /* _ASM_HIGHMEM_H */
85124diff -Nur linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/hw_irq.h linux-2.6.16.33/include/asm-i386/mach-xen/asm/hw_irq.h
85125--- linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/hw_irq.h 1970-01-01 00:00:00.000000000 +0000
85126+++ linux-2.6.16.33/include/asm-i386/mach-xen/asm/hw_irq.h 2007-01-08 15:00:45.000000000 +0000
85127@@ -0,0 +1,77 @@
85128+#ifndef _ASM_HW_IRQ_H
85129+#define _ASM_HW_IRQ_H
85130+
85131+/*
85132+ * linux/include/asm/hw_irq.h
85133+ *
85134+ * (C) 1992, 1993 Linus Torvalds, (C) 1997 Ingo Molnar
85135+ *
85136+ * moved some of the old arch/i386/kernel/irq.h to here. VY
85137+ *
85138+ * IRQ/IPI changes taken from work by Thomas Radke
85139+ * <tomsoft@informatik.tu-chemnitz.de>
85140+ */
85141+
85142+#include <linux/config.h>
85143+#include <linux/profile.h>
85144+#include <asm/atomic.h>
85145+#include <asm/irq.h>
85146+#include <asm/sections.h>
85147+
85148+struct hw_interrupt_type;
85149+
85150+/*
85151+ * Various low-level irq details needed by irq.c, process.c,
85152+ * time.c, io_apic.c and smp.c
85153+ *
85154+ * Interrupt entry/exit code at both C and assembly level
85155+ */
85156+
85157+extern u8 irq_vector[NR_IRQ_VECTORS];
85158+#define IO_APIC_VECTOR(irq) (irq_vector[irq])
85159+#define AUTO_ASSIGN -1
85160+
85161+extern void (*interrupt[NR_IRQS])(void);
85162+
85163+#ifdef CONFIG_SMP
85164+fastcall void reschedule_interrupt(void);
85165+fastcall void invalidate_interrupt(void);
85166+fastcall void call_function_interrupt(void);
85167+#endif
85168+
85169+#ifdef CONFIG_X86_LOCAL_APIC
85170+fastcall void apic_timer_interrupt(void);
85171+fastcall void error_interrupt(void);
85172+fastcall void spurious_interrupt(void);
85173+fastcall void thermal_interrupt(struct pt_regs *);
85174+#define platform_legacy_irq(irq) ((irq) < 16)
85175+#endif
85176+
85177+void disable_8259A_irq(unsigned int irq);
85178+void enable_8259A_irq(unsigned int irq);
85179+int i8259A_irq_pending(unsigned int irq);
85180+void make_8259A_irq(unsigned int irq);
85181+void init_8259A(int aeoi);
85182+void FASTCALL(send_IPI_self(int vector));
85183+void init_VISWS_APIC_irqs(void);
85184+void setup_IO_APIC(void);
85185+void disable_IO_APIC(void);
85186+void print_IO_APIC(void);
85187+int IO_APIC_get_PCI_irq_vector(int bus, int slot, int fn);
85188+void send_IPI(int dest, int vector);
85189+void setup_ioapic_dest(void);
85190+
85191+extern unsigned long io_apic_irqs;
85192+
85193+extern atomic_t irq_err_count;
85194+extern atomic_t irq_mis_count;
85195+
85196+#define IO_APIC_IRQ(x) (((x) >= 16) || ((1<<(x)) & io_apic_irqs))
85197+
85198+extern void resend_irq_on_evtchn(struct hw_interrupt_type *h, unsigned int i);
85199+static inline void hw_resend_irq(struct hw_interrupt_type *h, unsigned int i)
85200+{
85201+ resend_irq_on_evtchn(h, i);
85202+}
85203+
85204+#endif /* _ASM_HW_IRQ_H */
85205diff -Nur linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/hypercall.h linux-2.6.16.33/include/asm-i386/mach-xen/asm/hypercall.h
85206--- linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/hypercall.h 1970-01-01 00:00:00.000000000 +0000
85207+++ linux-2.6.16.33/include/asm-i386/mach-xen/asm/hypercall.h 2007-01-08 15:00:45.000000000 +0000
85208@@ -0,0 +1,407 @@
85209+/******************************************************************************
85210+ * hypercall.h
85211+ *
85212+ * Linux-specific hypervisor handling.
85213+ *
85214+ * Copyright (c) 2002-2004, K A Fraser
85215+ *
85216+ * This program is free software; you can redistribute it and/or
85217+ * modify it under the terms of the GNU General Public License version 2
85218+ * as published by the Free Software Foundation; or, when distributed
85219+ * separately from the Linux kernel or incorporated into other
85220+ * software packages, subject to the following license:
85221+ *
85222+ * Permission is hereby granted, free of charge, to any person obtaining a copy
85223+ * of this source file (the "Software"), to deal in the Software without
85224+ * restriction, including without limitation the rights to use, copy, modify,
85225+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
85226+ * and to permit persons to whom the Software is furnished to do so, subject to
85227+ * the following conditions:
85228+ *
85229+ * The above copyright notice and this permission notice shall be included in
85230+ * all copies or substantial portions of the Software.
85231+ *
85232+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
85233+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
85234+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
85235+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
85236+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
85237+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
85238+ * IN THE SOFTWARE.
85239+ */
85240+
85241+#ifndef __HYPERCALL_H__
85242+#define __HYPERCALL_H__
85243+
85244+#include <linux/string.h> /* memcpy() */
85245+
85246+#ifndef __HYPERVISOR_H__
85247+# error "please don't include this file directly"
85248+#endif
85249+
85250+#define __STR(x) #x
85251+#define STR(x) __STR(x)
85252+
85253+#ifdef CONFIG_XEN
85254+#define HYPERCALL_STR(name) \
85255+ "call hypercall_page + ("STR(__HYPERVISOR_##name)" * 32)"
85256+#else
85257+#define HYPERCALL_STR(name) \
85258+ "mov hypercall_stubs,%%eax; " \
85259+ "add $("STR(__HYPERVISOR_##name)" * 32),%%eax; " \
85260+ "call *%%eax"
85261+#endif
85262+
85263+#define _hypercall0(type, name) \
85264+({ \
85265+ long __res; \
85266+ asm volatile ( \
85267+ HYPERCALL_STR(name) \
85268+ : "=a" (__res) \
85269+ : \
85270+ : "memory" ); \
85271+ (type)__res; \
85272+})
85273+
85274+#define _hypercall1(type, name, a1) \
85275+({ \
85276+ long __res, __ign1; \
85277+ asm volatile ( \
85278+ HYPERCALL_STR(name) \
85279+ : "=a" (__res), "=b" (__ign1) \
85280+ : "1" ((long)(a1)) \
85281+ : "memory" ); \
85282+ (type)__res; \
85283+})
85284+
85285+#define _hypercall2(type, name, a1, a2) \
85286+({ \
85287+ long __res, __ign1, __ign2; \
85288+ asm volatile ( \
85289+ HYPERCALL_STR(name) \
85290+ : "=a" (__res), "=b" (__ign1), "=c" (__ign2) \
85291+ : "1" ((long)(a1)), "2" ((long)(a2)) \
85292+ : "memory" ); \
85293+ (type)__res; \
85294+})
85295+
85296+#define _hypercall3(type, name, a1, a2, a3) \
85297+({ \
85298+ long __res, __ign1, __ign2, __ign3; \
85299+ asm volatile ( \
85300+ HYPERCALL_STR(name) \
85301+ : "=a" (__res), "=b" (__ign1), "=c" (__ign2), \
85302+ "=d" (__ign3) \
85303+ : "1" ((long)(a1)), "2" ((long)(a2)), \
85304+ "3" ((long)(a3)) \
85305+ : "memory" ); \
85306+ (type)__res; \
85307+})
85308+
85309+#define _hypercall4(type, name, a1, a2, a3, a4) \
85310+({ \
85311+ long __res, __ign1, __ign2, __ign3, __ign4; \
85312+ asm volatile ( \
85313+ HYPERCALL_STR(name) \
85314+ : "=a" (__res), "=b" (__ign1), "=c" (__ign2), \
85315+ "=d" (__ign3), "=S" (__ign4) \
85316+ : "1" ((long)(a1)), "2" ((long)(a2)), \
85317+ "3" ((long)(a3)), "4" ((long)(a4)) \
85318+ : "memory" ); \
85319+ (type)__res; \
85320+})
85321+
85322+#define _hypercall5(type, name, a1, a2, a3, a4, a5) \
85323+({ \
85324+ long __res, __ign1, __ign2, __ign3, __ign4, __ign5; \
85325+ asm volatile ( \
85326+ HYPERCALL_STR(name) \
85327+ : "=a" (__res), "=b" (__ign1), "=c" (__ign2), \
85328+ "=d" (__ign3), "=S" (__ign4), "=D" (__ign5) \
85329+ : "1" ((long)(a1)), "2" ((long)(a2)), \
85330+ "3" ((long)(a3)), "4" ((long)(a4)), \
85331+ "5" ((long)(a5)) \
85332+ : "memory" ); \
85333+ (type)__res; \
85334+})
85335+
85336+static inline int
85337+HYPERVISOR_set_trap_table(
85338+ trap_info_t *table)
85339+{
85340+ return _hypercall1(int, set_trap_table, table);
85341+}
85342+
85343+static inline int
85344+HYPERVISOR_mmu_update(
85345+ mmu_update_t *req, int count, int *success_count, domid_t domid)
85346+{
85347+ return _hypercall4(int, mmu_update, req, count, success_count, domid);
85348+}
85349+
85350+static inline int
85351+HYPERVISOR_mmuext_op(
85352+ struct mmuext_op *op, int count, int *success_count, domid_t domid)
85353+{
85354+ return _hypercall4(int, mmuext_op, op, count, success_count, domid);
85355+}
85356+
85357+static inline int
85358+HYPERVISOR_set_gdt(
85359+ unsigned long *frame_list, int entries)
85360+{
85361+ return _hypercall2(int, set_gdt, frame_list, entries);
85362+}
85363+
85364+static inline int
85365+HYPERVISOR_stack_switch(
85366+ unsigned long ss, unsigned long esp)
85367+{
85368+ return _hypercall2(int, stack_switch, ss, esp);
85369+}
85370+
85371+static inline int
85372+HYPERVISOR_set_callbacks(
85373+ unsigned long event_selector, unsigned long event_address,
85374+ unsigned long failsafe_selector, unsigned long failsafe_address)
85375+{
85376+ return _hypercall4(int, set_callbacks,
85377+ event_selector, event_address,
85378+ failsafe_selector, failsafe_address);
85379+}
85380+
85381+static inline int
85382+HYPERVISOR_fpu_taskswitch(
85383+ int set)
85384+{
85385+ return _hypercall1(int, fpu_taskswitch, set);
85386+}
85387+
85388+static inline int
85389+HYPERVISOR_sched_op_compat(
85390+ int cmd, unsigned long arg)
85391+{
85392+ return _hypercall2(int, sched_op_compat, cmd, arg);
85393+}
85394+
85395+static inline int
85396+HYPERVISOR_sched_op(
85397+ int cmd, void *arg)
85398+{
85399+ return _hypercall2(int, sched_op, cmd, arg);
85400+}
85401+
85402+static inline long
85403+HYPERVISOR_set_timer_op(
85404+ u64 timeout)
85405+{
85406+ unsigned long timeout_hi = (unsigned long)(timeout>>32);
85407+ unsigned long timeout_lo = (unsigned long)timeout;
85408+ return _hypercall2(long, set_timer_op, timeout_lo, timeout_hi);
85409+}
85410+
85411+static inline int
85412+HYPERVISOR_dom0_op(
85413+ dom0_op_t *dom0_op)
85414+{
85415+ dom0_op->interface_version = DOM0_INTERFACE_VERSION;
85416+ return _hypercall1(int, dom0_op, dom0_op);
85417+}
85418+
85419+static inline int
85420+HYPERVISOR_set_debugreg(
85421+ int reg, unsigned long value)
85422+{
85423+ return _hypercall2(int, set_debugreg, reg, value);
85424+}
85425+
85426+static inline unsigned long
85427+HYPERVISOR_get_debugreg(
85428+ int reg)
85429+{
85430+ return _hypercall1(unsigned long, get_debugreg, reg);
85431+}
85432+
85433+static inline int
85434+HYPERVISOR_update_descriptor(
85435+ u64 ma, u64 desc)
85436+{
85437+ return _hypercall4(int, update_descriptor, ma, ma>>32, desc, desc>>32);
85438+}
85439+
85440+static inline int
85441+HYPERVISOR_memory_op(
85442+ unsigned int cmd, void *arg)
85443+{
85444+ return _hypercall2(int, memory_op, cmd, arg);
85445+}
85446+
85447+static inline int
85448+HYPERVISOR_multicall(
85449+ void *call_list, int nr_calls)
85450+{
85451+ return _hypercall2(int, multicall, call_list, nr_calls);
85452+}
85453+
85454+static inline int
85455+HYPERVISOR_update_va_mapping(
85456+ unsigned long va, pte_t new_val, unsigned long flags)
85457+{
85458+ unsigned long pte_hi = 0;
85459+#ifdef CONFIG_X86_PAE
85460+ pte_hi = new_val.pte_high;
85461+#endif
85462+ return _hypercall4(int, update_va_mapping, va,
85463+ new_val.pte_low, pte_hi, flags);
85464+}
85465+
85466+static inline int
85467+HYPERVISOR_event_channel_op(
85468+ int cmd, void *arg)
85469+{
85470+ int rc = _hypercall2(int, event_channel_op, cmd, arg);
85471+
85472+#ifdef CONFIG_XEN_COMPAT_030002
85473+ if (unlikely(rc == -ENOSYS)) {
85474+ struct evtchn_op op;
85475+ op.cmd = cmd;
85476+ memcpy(&op.u, arg, sizeof(op.u));
85477+ rc = _hypercall1(int, event_channel_op_compat, &op);
85478+ memcpy(arg, &op.u, sizeof(op.u));
85479+ }
85480+#endif
85481+
85482+ return rc;
85483+}
85484+
85485+static inline int
85486+HYPERVISOR_acm_op(
85487+ int cmd, void *arg)
85488+{
85489+ return _hypercall2(int, acm_op, cmd, arg);
85490+}
85491+
85492+static inline int
85493+HYPERVISOR_xen_version(
85494+ int cmd, void *arg)
85495+{
85496+ return _hypercall2(int, xen_version, cmd, arg);
85497+}
85498+
85499+static inline int
85500+HYPERVISOR_console_io(
85501+ int cmd, int count, char *str)
85502+{
85503+ return _hypercall3(int, console_io, cmd, count, str);
85504+}
85505+
85506+static inline int
85507+HYPERVISOR_physdev_op(
85508+ int cmd, void *arg)
85509+{
85510+ int rc = _hypercall2(int, physdev_op, cmd, arg);
85511+
85512+#ifdef CONFIG_XEN_COMPAT_030002
85513+ if (unlikely(rc == -ENOSYS)) {
85514+ struct physdev_op op;
85515+ op.cmd = cmd;
85516+ memcpy(&op.u, arg, sizeof(op.u));
85517+ rc = _hypercall1(int, physdev_op_compat, &op);
85518+ memcpy(arg, &op.u, sizeof(op.u));
85519+ }
85520+#endif
85521+
85522+ return rc;
85523+}
85524+
85525+static inline int
85526+HYPERVISOR_grant_table_op(
85527+ unsigned int cmd, void *uop, unsigned int count)
85528+{
85529+ return _hypercall3(int, grant_table_op, cmd, uop, count);
85530+}
85531+
85532+static inline int
85533+HYPERVISOR_update_va_mapping_otherdomain(
85534+ unsigned long va, pte_t new_val, unsigned long flags, domid_t domid)
85535+{
85536+ unsigned long pte_hi = 0;
85537+#ifdef CONFIG_X86_PAE
85538+ pte_hi = new_val.pte_high;
85539+#endif
85540+ return _hypercall5(int, update_va_mapping_otherdomain, va,
85541+ new_val.pte_low, pte_hi, flags, domid);
85542+}
85543+
85544+static inline int
85545+HYPERVISOR_vm_assist(
85546+ unsigned int cmd, unsigned int type)
85547+{
85548+ return _hypercall2(int, vm_assist, cmd, type);
85549+}
85550+
85551+static inline int
85552+HYPERVISOR_vcpu_op(
85553+ int cmd, int vcpuid, void *extra_args)
85554+{
85555+ return _hypercall3(int, vcpu_op, cmd, vcpuid, extra_args);
85556+}
85557+
85558+static inline int
85559+HYPERVISOR_suspend(
85560+ unsigned long srec)
85561+{
85562+ struct sched_shutdown sched_shutdown = {
85563+ .reason = SHUTDOWN_suspend
85564+ };
85565+
85566+ int rc = _hypercall3(int, sched_op, SCHEDOP_shutdown,
85567+ &sched_shutdown, srec);
85568+
85569+#ifdef CONFIG_XEN_COMPAT_030002
85570+ if (rc == -ENOSYS)
85571+ rc = _hypercall3(int, sched_op_compat, SCHEDOP_shutdown,
85572+ SHUTDOWN_suspend, srec);
85573+#endif
85574+
85575+ return rc;
85576+}
85577+
85578+static inline int
85579+HYPERVISOR_nmi_op(
85580+ unsigned long op, void *arg)
85581+{
85582+ return _hypercall2(int, nmi_op, op, arg);
85583+}
85584+
85585+static inline unsigned long
85586+HYPERVISOR_hvm_op(
85587+ int op, void *arg)
85588+{
85589+ return _hypercall2(unsigned long, hvm_op, op, arg);
85590+}
85591+
85592+static inline int
85593+HYPERVISOR_callback_op(
85594+ int cmd, void *arg)
85595+{
85596+ return _hypercall2(int, callback_op, cmd, arg);
85597+}
85598+
85599+static inline int
85600+HYPERVISOR_xenoprof_op(
85601+ int op, void *arg)
85602+{
85603+ return _hypercall2(int, xenoprof_op, op, arg);
85604+}
85605+
85606+static inline int
85607+HYPERVISOR_kexec_op(
85608+ unsigned long op, void *args)
85609+{
85610+ return _hypercall2(int, kexec_op, op, args);
85611+}
85612+
85613+
85614+
85615+#endif /* __HYPERCALL_H__ */
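The wrappers above all funnel through the _hypercallN macros, which load the arguments into registers and call into the hypercall page (or the relocated stubs when not built with CONFIG_XEN). A minimal usage sketch follows (an editor's illustration, not part of the patch), assuming a Xen guest-kernel context where xen/interface/version.h provides XENVER_version and its major<<16|minor return encoding; the function name is hypothetical:

static void example_report_xen_version(void)
{
	/* Query the running hypervisor via the generic xen_version wrapper. */
	int ver = HYPERVISOR_xen_version(XENVER_version, NULL);

	printk(KERN_INFO "running on Xen %d.%d\n", ver >> 16, ver & 0xffff);
}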
85616diff -Nur linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/hypervisor.h linux-2.6.16.33/include/asm-i386/mach-xen/asm/hypervisor.h
85617--- linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/hypervisor.h 1970-01-01 00:00:00.000000000 +0000
85618+++ linux-2.6.16.33/include/asm-i386/mach-xen/asm/hypervisor.h 2007-01-08 15:00:45.000000000 +0000
85619@@ -0,0 +1,246 @@
85620+/******************************************************************************
85621+ * hypervisor.h
85622+ *
85623+ * Linux-specific hypervisor handling.
85624+ *
85625+ * Copyright (c) 2002-2004, K A Fraser
85626+ *
85627+ * This program is free software; you can redistribute it and/or
85628+ * modify it under the terms of the GNU General Public License version 2
85629+ * as published by the Free Software Foundation; or, when distributed
85630+ * separately from the Linux kernel or incorporated into other
85631+ * software packages, subject to the following license:
85632+ *
85633+ * Permission is hereby granted, free of charge, to any person obtaining a copy
85634+ * of this source file (the "Software"), to deal in the Software without
85635+ * restriction, including without limitation the rights to use, copy, modify,
85636+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
85637+ * and to permit persons to whom the Software is furnished to do so, subject to
85638+ * the following conditions:
85639+ *
85640+ * The above copyright notice and this permission notice shall be included in
85641+ * all copies or substantial portions of the Software.
85642+ *
85643+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
85644+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
85645+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
85646+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
85647+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
85648+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
85649+ * IN THE SOFTWARE.
85650+ */
85651+
85652+#ifndef __HYPERVISOR_H__
85653+#define __HYPERVISOR_H__
85654+
85655+#include <linux/config.h>
85656+#include <linux/types.h>
85657+#include <linux/kernel.h>
85658+#include <linux/version.h>
85659+#include <linux/errno.h>
85660+#include <xen/interface/xen.h>
85661+#include <xen/interface/dom0_ops.h>
85662+#include <xen/interface/event_channel.h>
85663+#include <xen/interface/physdev.h>
85664+#include <xen/interface/sched.h>
85665+#include <xen/interface/nmi.h>
85666+#include <asm/ptrace.h>
85667+#include <asm/page.h>
85668+#if defined(__i386__)
85669+# ifdef CONFIG_X86_PAE
85670+# include <asm-generic/pgtable-nopud.h>
85671+# else
85672+# include <asm-generic/pgtable-nopmd.h>
85673+# endif
85674+#endif
85675+
85676+extern shared_info_t *HYPERVISOR_shared_info;
85677+
85678+#ifdef CONFIG_X86_32
85679+extern unsigned long hypervisor_virt_start;
85680+#endif
85681+
85682+/* arch/xen/i386/kernel/setup.c */
85683+extern start_info_t *xen_start_info;
85684+#ifdef CONFIG_XEN_PRIVILEGED_GUEST
85685+#define is_initial_xendomain() (xen_start_info->flags & SIF_INITDOMAIN)
85686+#else
85687+#define is_initial_xendomain() 0
85688+#endif
85689+
85690+/* arch/xen/kernel/evtchn.c */
85691+/* Force a proper event-channel callback from Xen. */
85692+void force_evtchn_callback(void);
85693+
85694+/* arch/xen/kernel/process.c */
85695+void xen_cpu_idle (void);
85696+
85697+/* arch/xen/i386/kernel/hypervisor.c */
85698+void do_hypervisor_callback(struct pt_regs *regs);
85699+
85700+/* arch/xen/i386/mm/hypervisor.c */
85701+/*
85702+ * NB. ptr values should be PHYSICAL, not MACHINE. 'vals' should be already
85703+ * be MACHINE addresses.
85704+ */
85705+
85706+void xen_pt_switch(unsigned long ptr);
85707+void xen_new_user_pt(unsigned long ptr); /* x86_64 only */
85708+void xen_load_gs(unsigned int selector); /* x86_64 only */
85709+void xen_tlb_flush(void);
85710+void xen_invlpg(unsigned long ptr);
85711+
85712+void xen_l1_entry_update(pte_t *ptr, pte_t val);
85713+void xen_l2_entry_update(pmd_t *ptr, pmd_t val);
85714+void xen_l3_entry_update(pud_t *ptr, pud_t val); /* x86_64/PAE */
85715+void xen_l4_entry_update(pgd_t *ptr, pgd_t val); /* x86_64 only */
85716+void xen_pgd_pin(unsigned long ptr);
85717+void xen_pgd_unpin(unsigned long ptr);
85718+
85719+void xen_set_ldt(unsigned long ptr, unsigned long bytes);
85720+
85721+#ifdef CONFIG_SMP
85722+#include <linux/cpumask.h>
85723+void xen_tlb_flush_all(void);
85724+void xen_invlpg_all(unsigned long ptr);
85725+void xen_tlb_flush_mask(cpumask_t *mask);
85726+void xen_invlpg_mask(cpumask_t *mask, unsigned long ptr);
85727+#endif
85728+
85729+/* Returns zero on success else negative errno. */
85730+int xen_create_contiguous_region(
85731+ unsigned long vstart, unsigned int order, unsigned int address_bits);
85732+void xen_destroy_contiguous_region(
85733+ unsigned long vstart, unsigned int order);
85734+
85735+/* Turn jiffies into Xen system time. */
85736+u64 jiffies_to_st(unsigned long jiffies);
85737+
85738+#include <asm/hypercall.h>
85739+
85740+#if defined(CONFIG_X86_64)
85741+#define MULTI_UVMFLAGS_INDEX 2
85742+#define MULTI_UVMDOMID_INDEX 3
85743+#else
85744+#define MULTI_UVMFLAGS_INDEX 3
85745+#define MULTI_UVMDOMID_INDEX 4
85746+#endif
85747+
85748+#define is_running_on_xen() 1
85749+
85750+static inline int
85751+HYPERVISOR_yield(
85752+ void)
85753+{
85754+ int rc = HYPERVISOR_sched_op(SCHEDOP_yield, NULL);
85755+
85756+#ifdef CONFIG_XEN_COMPAT_030002
85757+ if (rc == -ENOSYS)
85758+ rc = HYPERVISOR_sched_op_compat(SCHEDOP_yield, 0);
85759+#endif
85760+
85761+ return rc;
85762+}
85763+
85764+static inline int
85765+HYPERVISOR_block(
85766+ void)
85767+{
85768+ int rc = HYPERVISOR_sched_op(SCHEDOP_block, NULL);
85769+
85770+#ifdef CONFIG_XEN_COMPAT_030002
85771+ if (rc == -ENOSYS)
85772+ rc = HYPERVISOR_sched_op_compat(SCHEDOP_block, 0);
85773+#endif
85774+
85775+ return rc;
85776+}
85777+
85778+static inline int
85779+HYPERVISOR_shutdown(
85780+ unsigned int reason)
85781+{
85782+ struct sched_shutdown sched_shutdown = {
85783+ .reason = reason
85784+ };
85785+
85786+ int rc = HYPERVISOR_sched_op(SCHEDOP_shutdown, &sched_shutdown);
85787+
85788+#ifdef CONFIG_XEN_COMPAT_030002
85789+ if (rc == -ENOSYS)
85790+ rc = HYPERVISOR_sched_op_compat(SCHEDOP_shutdown, reason);
85791+#endif
85792+
85793+ return rc;
85794+}
85795+
85796+static inline int
85797+HYPERVISOR_poll(
85798+ evtchn_port_t *ports, unsigned int nr_ports, u64 timeout)
85799+{
85800+ int rc;
85801+ struct sched_poll sched_poll = {
85802+ .nr_ports = nr_ports,
85803+ .timeout = jiffies_to_st(timeout)
85804+ };
85805+ set_xen_guest_handle(sched_poll.ports, ports);
85806+
85807+ rc = HYPERVISOR_sched_op(SCHEDOP_poll, &sched_poll);
85808+#ifdef CONFIG_XEN_COMPAT_030002
85809+ if (rc == -ENOSYS)
85810+ rc = HYPERVISOR_sched_op_compat(SCHEDOP_yield, 0);
85811+#endif
85812+
85813+ return rc;
85814+}
85815+
85816+static inline void
85817+MULTI_update_va_mapping(
85818+ multicall_entry_t *mcl, unsigned long va,
85819+ pte_t new_val, unsigned long flags)
85820+{
85821+ mcl->op = __HYPERVISOR_update_va_mapping;
85822+ mcl->args[0] = va;
85823+#if defined(CONFIG_X86_64)
85824+ mcl->args[1] = new_val.pte;
85825+#elif defined(CONFIG_X86_PAE)
85826+ mcl->args[1] = new_val.pte_low;
85827+ mcl->args[2] = new_val.pte_high;
85828+#else
85829+ mcl->args[1] = new_val.pte_low;
85830+ mcl->args[2] = 0;
85831+#endif
85832+ mcl->args[MULTI_UVMFLAGS_INDEX] = flags;
85833+}
85834+
85835+static inline void
85836+MULTI_grant_table_op(multicall_entry_t *mcl, unsigned int cmd,
85837+ void *uop, unsigned int count)
85838+{
85839+ mcl->op = __HYPERVISOR_grant_table_op;
85840+ mcl->args[0] = cmd;
85841+ mcl->args[1] = (unsigned long)uop;
85842+ mcl->args[2] = count;
85843+}
85844+
85845+static inline void
85846+MULTI_update_va_mapping_otherdomain(
85847+ multicall_entry_t *mcl, unsigned long va,
85848+ pte_t new_val, unsigned long flags, domid_t domid)
85849+{
85850+ mcl->op = __HYPERVISOR_update_va_mapping_otherdomain;
85851+ mcl->args[0] = va;
85852+#if defined(CONFIG_X86_64)
85853+ mcl->args[1] = new_val.pte;
85854+#elif defined(CONFIG_X86_PAE)
85855+ mcl->args[1] = new_val.pte_low;
85856+ mcl->args[2] = new_val.pte_high;
85857+#else
85858+ mcl->args[1] = new_val.pte_low;
85859+ mcl->args[2] = 0;
85860+#endif
85861+ mcl->args[MULTI_UVMFLAGS_INDEX] = flags;
85862+ mcl->args[MULTI_UVMDOMID_INDEX] = domid;
85863+}
85864+
85865+#endif /* __HYPERVISOR_H__ */
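The MULTI_* helpers above fill multicall_entry_t slots so that several operations can be submitted to Xen in a single trap via HYPERVISOR_multicall(). A sketch of that batching pattern (editor's illustration, not part of the patch; UVMF_INVLPG is the per-VA TLB-flush flag from xen/interface/xen.h, and the function and variable names are hypothetical):

static void example_update_two_mappings(unsigned long va0, pte_t pte0,
					unsigned long va1, pte_t pte1)
{
	multicall_entry_t mcl[2];

	/* Queue both PTE updates, each flushing its own TLB entry. */
	MULTI_update_va_mapping(&mcl[0], va0, pte0, UVMF_INVLPG);
	MULTI_update_va_mapping(&mcl[1], va1, pte1, UVMF_INVLPG);

	/* One trap into Xen instead of two separate update_va_mapping hypercalls. */
	BUG_ON(HYPERVISOR_multicall(mcl, 2));
}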
85866diff -Nur linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/io.h linux-2.6.16.33/include/asm-i386/mach-xen/asm/io.h
85867--- linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/io.h 1970-01-01 00:00:00.000000000 +0000
85868+++ linux-2.6.16.33/include/asm-i386/mach-xen/asm/io.h 2007-01-08 15:00:45.000000000 +0000
85869@@ -0,0 +1,403 @@
85870+#ifndef _ASM_IO_H
85871+#define _ASM_IO_H
85872+
85873+#include <linux/config.h>
85874+#include <linux/string.h>
85875+#include <linux/compiler.h>
85876+
85877+/*
85878+ * This file contains the definitions for the x86 IO instructions
85879+ * inb/inw/inl/outb/outw/outl and the "string versions" of the same
85880+ * (insb/insw/insl/outsb/outsw/outsl). You can also use "pausing"
85881+ * versions of the single-IO instructions (inb_p/inw_p/..).
85882+ *
85883+ * This file is not meant to be obfuscating: it's just complicated
85884+ * to (a) handle it all in a way that makes gcc able to optimize it
85885+ * as well as possible and (b) avoid writing the same thing
85886+ * over and over again with slight variations and possibly making a
85887+ * mistake somewhere.
85888+ */
85889+
85890+/*
85891+ * Thanks to James van Artsdalen for a better timing-fix than
85892+ * the two short jumps: using outb's to a nonexistent port seems
85893+ * to guarantee better timings even on fast machines.
85894+ *
85895+ * On the other hand, I'd like to be sure of a non-existent port:
85896+ * I feel a bit unsafe about using 0x80 (should be safe, though)
85897+ *
85898+ * Linus
85899+ */
85900+
85901+ /*
85902+ * Bit simplified and optimized by Jan Hubicka
85903+ * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999.
85904+ *
85905+ * isa_memset_io, isa_memcpy_fromio, isa_memcpy_toio added,
85906+ * isa_read[wl] and isa_write[wl] fixed
85907+ * - Arnaldo Carvalho de Melo <acme@conectiva.com.br>
85908+ */
85909+
85910+#define IO_SPACE_LIMIT 0xffff
85911+
85912+#define XQUAD_PORTIO_BASE 0xfe400000
85913+#define XQUAD_PORTIO_QUAD 0x40000 /* 256k per quad. */
85914+
85915+#ifdef __KERNEL__
85916+
85917+#include <asm-generic/iomap.h>
85918+
85919+#include <linux/vmalloc.h>
85920+#include <asm/fixmap.h>
85921+
85922+/*
85923+ * Convert a physical pointer to a virtual kernel pointer for /dev/mem
85924+ * access
85925+ */
85926+#define xlate_dev_mem_ptr(p, sz) ioremap(p, sz)
85927+#define xlate_dev_mem_ptr_unmap(p) iounmap(p)
85928+
85929+/*
85930+ * Convert a virtual cached pointer to an uncached pointer
85931+ */
85932+#define xlate_dev_kmem_ptr(p) p
85933+
85934+/**
85935+ * virt_to_phys - map virtual addresses to physical
85936+ * @address: address to remap
85937+ *
85938+ * The returned physical address is the physical (CPU) mapping for
85939+ * the memory address given. It is only valid to use this function on
85940+ * addresses directly mapped or allocated via kmalloc.
85941+ *
85942+ * This function does not give bus mappings for DMA transfers. In
85943+ * almost all conceivable cases a device driver should not be using
85944+ * this function
85945+ */
85946+
85947+static inline unsigned long virt_to_phys(volatile void * address)
85948+{
85949+ return __pa(address);
85950+}
85951+
85952+/**
85953+ * phys_to_virt - map physical address to virtual
85954+ * @address: address to remap
85955+ *
85956+ * The returned virtual address is a current CPU mapping for
85957+ * the memory address given. It is only valid to use this function on
85958+ * addresses that have a kernel mapping
85959+ *
85960+ * This function does not handle bus mappings for DMA transfers. In
85961+ * almost all conceivable cases a device driver should not be using
85962+ * this function
85963+ */
85964+
85965+static inline void * phys_to_virt(unsigned long address)
85966+{
85967+ return __va(address);
85968+}
85969+
85970+/*
85971+ * Change "struct page" to physical address.
85972+ */
85973+#define page_to_pseudophys(page) ((dma_addr_t)page_to_pfn(page) << PAGE_SHIFT)
85974+#define page_to_phys(page) (phys_to_machine(page_to_pseudophys(page)))
85975+#define page_to_bus(page) (phys_to_machine(page_to_pseudophys(page)))
85976+
85977+#define bio_to_pseudophys(bio) (page_to_pseudophys(bio_page((bio))) + \
85978+ (unsigned long) bio_offset((bio)))
85979+#define bvec_to_pseudophys(bv) (page_to_pseudophys((bv)->bv_page) + \
85980+ (unsigned long) (bv)->bv_offset)
85981+
85982+#define BIOVEC_PHYS_MERGEABLE(vec1, vec2) \
85983+ (((bvec_to_phys((vec1)) + (vec1)->bv_len) == bvec_to_phys((vec2))) && \
85984+ ((bvec_to_pseudophys((vec1)) + (vec1)->bv_len) == \
85985+ bvec_to_pseudophys((vec2))))
85986+
85987+extern void __iomem * __ioremap(unsigned long offset, unsigned long size, unsigned long flags);
85988+
85989+/**
85990+ * ioremap - map bus memory into CPU space
85991+ * @offset: bus address of the memory
85992+ * @size: size of the resource to map
85993+ *
85994+ * ioremap performs a platform specific sequence of operations to
85995+ * make bus memory CPU accessible via the readb/readw/readl/writeb/
85996+ * writew/writel functions and the other mmio helpers. The returned
85997+ * address is not guaranteed to be usable directly as a virtual
85998+ * address.
85999+ */
86000+
86001+static inline void __iomem * ioremap(unsigned long offset, unsigned long size)
86002+{
86003+ return __ioremap(offset, size, 0);
86004+}
86005+
86006+extern void __iomem * ioremap_nocache(unsigned long offset, unsigned long size);
86007+extern void iounmap(volatile void __iomem *addr);
86008+
86009+/*
86010+ * bt_ioremap() and bt_iounmap() are for temporary early boot-time
86011+ * mappings, before the real ioremap() is functional.
86012+ * A boot-time mapping is currently limited to at most 16 pages.
86013+ */
86014+extern void *bt_ioremap(unsigned long offset, unsigned long size);
86015+extern void bt_iounmap(void *addr, unsigned long size);
86016+
86017+/* Use early IO mappings for DMI because it's initialized early */
86018+#define dmi_ioremap bt_ioremap
86019+#define dmi_iounmap bt_iounmap
86020+#define dmi_alloc alloc_bootmem
86021+
86022+/*
86023+ * ISA I/O bus memory addresses are 1:1 with the physical address.
86024+ */
86025+#define isa_virt_to_bus(_x) isa_virt_to_bus_is_UNSUPPORTED->x
86026+#define isa_page_to_bus(_x) isa_page_to_bus_is_UNSUPPORTED->x
86027+#define isa_bus_to_virt(_x) (void *)(__fix_to_virt(FIX_ISAMAP_BEGIN) + (_x))
86028+
86029+/*
86030+ * However PCI ones are not necessarily 1:1 and therefore these interfaces
86031+ * are forbidden in portable PCI drivers.
86032+ *
86033+ * Allow them on x86 for legacy drivers, though.
86034+ */
86035+#define virt_to_bus(_x) phys_to_machine(__pa(_x))
86036+#define bus_to_virt(_x) __va(machine_to_phys(_x))
86037+
86038+/*
86039+ * readX/writeX() are used to access memory mapped devices. On some
86040+ * architectures the memory mapped IO stuff needs to be accessed
86041+ * differently. On the x86 architecture, we just read/write the
86042+ * memory location directly.
86043+ */
86044+
86045+static inline unsigned char readb(const volatile void __iomem *addr)
86046+{
86047+ return *(volatile unsigned char __force *) addr;
86048+}
86049+static inline unsigned short readw(const volatile void __iomem *addr)
86050+{
86051+ return *(volatile unsigned short __force *) addr;
86052+}
86053+static inline unsigned int readl(const volatile void __iomem *addr)
86054+{
86055+ return *(volatile unsigned int __force *) addr;
86056+}
86057+#define readb_relaxed(addr) readb(addr)
86058+#define readw_relaxed(addr) readw(addr)
86059+#define readl_relaxed(addr) readl(addr)
86060+#define __raw_readb readb
86061+#define __raw_readw readw
86062+#define __raw_readl readl
86063+
86064+static inline void writeb(unsigned char b, volatile void __iomem *addr)
86065+{
86066+ *(volatile unsigned char __force *) addr = b;
86067+}
86068+static inline void writew(unsigned short b, volatile void __iomem *addr)
86069+{
86070+ *(volatile unsigned short __force *) addr = b;
86071+}
86072+static inline void writel(unsigned int b, volatile void __iomem *addr)
86073+{
86074+ *(volatile unsigned int __force *) addr = b;
86075+}
86076+#define __raw_writeb writeb
86077+#define __raw_writew writew
86078+#define __raw_writel writel
86079+
86080+#define mmiowb()
86081+
86082+static inline void memset_io(volatile void __iomem *addr, unsigned char val, int count)
86083+{
86084+ memset((void __force *) addr, val, count);
86085+}
86086+static inline void memcpy_fromio(void *dst, const volatile void __iomem *src, int count)
86087+{
86088+ __memcpy(dst, (void __force *) src, count);
86089+}
86090+static inline void memcpy_toio(volatile void __iomem *dst, const void *src, int count)
86091+{
86092+ __memcpy((void __force *) dst, src, count);
86093+}
86094+
86095+/*
86096+ * ISA space is 'always mapped' on a typical x86 system, no need to
86097+ * explicitly ioremap() it. The fact that the ISA IO space is mapped
86098+ * to PAGE_OFFSET is pure coincidence - it does not mean ISA values
86099+ * are physical addresses. The following constant pointer can be
86100+ * used as the IO-area pointer (it can be iounmapped as well, so the
86101+ * analogy with PCI is quite close):
86102+ */
86103+#define __ISA_IO_base ((char __iomem *)(fix_to_virt(FIX_ISAMAP_BEGIN)))
86104+
86105+#define isa_readb(a) readb(__ISA_IO_base + (a))
86106+#define isa_readw(a) readw(__ISA_IO_base + (a))
86107+#define isa_readl(a) readl(__ISA_IO_base + (a))
86108+#define isa_writeb(b,a) writeb(b,__ISA_IO_base + (a))
86109+#define isa_writew(w,a) writew(w,__ISA_IO_base + (a))
86110+#define isa_writel(l,a) writel(l,__ISA_IO_base + (a))
86111+#define isa_memset_io(a,b,c) memset_io(__ISA_IO_base + (a),(b),(c))
86112+#define isa_memcpy_fromio(a,b,c) memcpy_fromio((a),__ISA_IO_base + (b),(c))
86113+#define isa_memcpy_toio(a,b,c) memcpy_toio(__ISA_IO_base + (a),(b),(c))
86114+
86115+
86116+/*
86117+ * Again, i386 does not require mem IO specific function.
86118+ */
86119+
86120+#define eth_io_copy_and_sum(a,b,c,d) eth_copy_and_sum((a),(void __force *)(b),(c),(d))
86121+#define isa_eth_io_copy_and_sum(a,b,c,d) eth_copy_and_sum((a),(void __force *)(__ISA_IO_base + (b)),(c),(d))
86122+
86123+/**
86124+ * check_signature - find BIOS signatures
86125+ * @io_addr: mmio address to check
86126+ * @signature: signature block
86127+ * @length: length of signature
86128+ *
86129+ * Perform a signature comparison with the mmio address io_addr. This
86130+ * address should have been obtained by ioremap.
86131+ * Returns 1 on a match.
86132+ */
86133+
86134+static inline int check_signature(volatile void __iomem * io_addr,
86135+ const unsigned char *signature, int length)
86136+{
86137+ int retval = 0;
86138+ do {
86139+ if (readb(io_addr) != *signature)
86140+ goto out;
86141+ io_addr++;
86142+ signature++;
86143+ length--;
86144+ } while (length);
86145+ retval = 1;
86146+out:
86147+ return retval;
86148+}
86149+
86150+/*
86151+ * Cache management
86152+ *
86153+ * This is needed for two cases:
86154+ * 1. Out of order aware processors
86155+ * 2. Accidentally out of order processors (PPro errata #51)
86156+ */
86157+
86158+#if defined(CONFIG_X86_OOSTORE) || defined(CONFIG_X86_PPRO_FENCE)
86159+
86160+static inline void flush_write_buffers(void)
86161+{
86162+ __asm__ __volatile__ ("lock; addl $0,0(%%esp)": : :"memory");
86163+}
86164+
86165+#define dma_cache_inv(_start,_size) flush_write_buffers()
86166+#define dma_cache_wback(_start,_size) flush_write_buffers()
86167+#define dma_cache_wback_inv(_start,_size) flush_write_buffers()
86168+
86169+#else
86170+
86171+/* Nothing to do */
86172+
86173+#define dma_cache_inv(_start,_size) do { } while (0)
86174+#define dma_cache_wback(_start,_size) do { } while (0)
86175+#define dma_cache_wback_inv(_start,_size) do { } while (0)
86176+#define flush_write_buffers()
86177+
86178+#endif
86179+
86180+#endif /* __KERNEL__ */
86181+
86182+#ifdef SLOW_IO_BY_JUMPING
86183+#define __SLOW_DOWN_IO "jmp 1f; 1: jmp 1f; 1:"
86184+#else
86185+#define __SLOW_DOWN_IO "outb %%al,$0x80;"
86186+#endif
86187+
86188+static inline void slow_down_io(void) {
86189+ __asm__ __volatile__(
86190+ __SLOW_DOWN_IO
86191+#ifdef REALLY_SLOW_IO
86192+ __SLOW_DOWN_IO __SLOW_DOWN_IO __SLOW_DOWN_IO
86193+#endif
86194+ : : );
86195+}
86196+
86197+#ifdef CONFIG_X86_NUMAQ
86198+extern void *xquad_portio; /* Where the IO area was mapped */
86199+#define XQUAD_PORT_ADDR(port, quad) (xquad_portio + (XQUAD_PORTIO_QUAD*quad) + port)
86200+#define __BUILDIO(bwl,bw,type) \
86201+static inline void out##bwl##_quad(unsigned type value, int port, int quad) { \
86202+ if (xquad_portio) \
86203+ write##bwl(value, XQUAD_PORT_ADDR(port, quad)); \
86204+ else \
86205+ out##bwl##_local(value, port); \
86206+} \
86207+static inline void out##bwl(unsigned type value, int port) { \
86208+ out##bwl##_quad(value, port, 0); \
86209+} \
86210+static inline unsigned type in##bwl##_quad(int port, int quad) { \
86211+ if (xquad_portio) \
86212+ return read##bwl(XQUAD_PORT_ADDR(port, quad)); \
86213+ else \
86214+ return in##bwl##_local(port); \
86215+} \
86216+static inline unsigned type in##bwl(int port) { \
86217+ return in##bwl##_quad(port, 0); \
86218+}
86219+#else
86220+#define __BUILDIO(bwl,bw,type) \
86221+static inline void out##bwl(unsigned type value, int port) { \
86222+ out##bwl##_local(value, port); \
86223+} \
86224+static inline unsigned type in##bwl(int port) { \
86225+ return in##bwl##_local(port); \
86226+}
86227+#endif
86228+
86229+
86230+#define BUILDIO(bwl,bw,type) \
86231+static inline void out##bwl##_local(unsigned type value, int port) { \
86232+ __asm__ __volatile__("out" #bwl " %" #bw "0, %w1" : : "a"(value), "Nd"(port)); \
86233+} \
86234+static inline unsigned type in##bwl##_local(int port) { \
86235+ unsigned type value; \
86236+ __asm__ __volatile__("in" #bwl " %w1, %" #bw "0" : "=a"(value) : "Nd"(port)); \
86237+ return value; \
86238+} \
86239+static inline void out##bwl##_local_p(unsigned type value, int port) { \
86240+ out##bwl##_local(value, port); \
86241+ slow_down_io(); \
86242+} \
86243+static inline unsigned type in##bwl##_local_p(int port) { \
86244+ unsigned type value = in##bwl##_local(port); \
86245+ slow_down_io(); \
86246+ return value; \
86247+} \
86248+__BUILDIO(bwl,bw,type) \
86249+static inline void out##bwl##_p(unsigned type value, int port) { \
86250+ out##bwl(value, port); \
86251+ slow_down_io(); \
86252+} \
86253+static inline unsigned type in##bwl##_p(int port) { \
86254+ unsigned type value = in##bwl(port); \
86255+ slow_down_io(); \
86256+ return value; \
86257+} \
86258+static inline void outs##bwl(int port, const void *addr, unsigned long count) { \
86259+ __asm__ __volatile__("rep; outs" #bwl : "+S"(addr), "+c"(count) : "d"(port)); \
86260+} \
86261+static inline void ins##bwl(int port, void *addr, unsigned long count) { \
86262+ __asm__ __volatile__("rep; ins" #bwl : "+D"(addr), "+c"(count) : "d"(port)); \
86263+}
86264+
86265+BUILDIO(b,b,char)
86266+BUILDIO(w,w,short)
86267+BUILDIO(l,,int)
86268+
86269+/* We will be supplying our own /dev/mem implementation */
86270+#define ARCH_HAS_DEV_MEM
86271+
86272+#endif
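The BUILDIO()/__BUILDIO() macros above generate the whole family of port-I/O primitives. Roughly what BUILDIO(b,b,char) expands to for the byte case in the non-NUMAQ configuration (editor's illustration only, not part of the patch; the w/l variants follow the same pattern):

static inline void outb_local(unsigned char value, int port)
{
	__asm__ __volatile__("outb %b0, %w1" : : "a"(value), "Nd"(port));
}

static inline unsigned char inb_local(int port)
{
	unsigned char value;
	__asm__ __volatile__("inb %w1, %b0" : "=a"(value) : "Nd"(port));
	return value;
}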
86273diff -Nur linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/kmap_types.h linux-2.6.16.33/include/asm-i386/mach-xen/asm/kmap_types.h
86274--- linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/kmap_types.h 1970-01-01 00:00:00.000000000 +0000
86275+++ linux-2.6.16.33/include/asm-i386/mach-xen/asm/kmap_types.h 2007-01-08 15:00:45.000000000 +0000
86276@@ -0,0 +1,32 @@
86277+#ifndef _ASM_KMAP_TYPES_H
86278+#define _ASM_KMAP_TYPES_H
86279+
86280+#include <linux/config.h>
86281+
86282+#ifdef CONFIG_DEBUG_HIGHMEM
86283+# define D(n) __KM_FENCE_##n ,
86284+#else
86285+# define D(n)
86286+#endif
86287+
86288+enum km_type {
86289+D(0) KM_BOUNCE_READ,
86290+D(1) KM_SKB_SUNRPC_DATA,
86291+D(2) KM_SKB_DATA_SOFTIRQ,
86292+D(3) KM_USER0,
86293+D(4) KM_USER1,
86294+D(5) KM_BIO_SRC_IRQ,
86295+D(6) KM_BIO_DST_IRQ,
86296+D(7) KM_PTE0,
86297+D(8) KM_PTE1,
86298+D(9) KM_IRQ0,
86299+D(10) KM_IRQ1,
86300+D(11) KM_SOFTIRQ0,
86301+D(12) KM_SOFTIRQ1,
86302+D(13) KM_SWIOTLB,
86303+D(14) KM_TYPE_NR
86304+};
86305+
86306+#undef D
86307+
86308+#endif
86309diff -Nur linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/maddr.h linux-2.6.16.33/include/asm-i386/mach-xen/asm/maddr.h
86310--- linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/maddr.h 1970-01-01 00:00:00.000000000 +0000
86311+++ linux-2.6.16.33/include/asm-i386/mach-xen/asm/maddr.h 2007-01-08 15:00:45.000000000 +0000
86312@@ -0,0 +1,176 @@
86313+#ifndef _I386_MADDR_H
86314+#define _I386_MADDR_H
86315+
86316+#include <xen/features.h>
86317+#include <xen/interface/xen.h>
86318+
86319+/**** MACHINE <-> PHYSICAL CONVERSION MACROS ****/
86320+#define INVALID_P2M_ENTRY (~0UL)
86321+#define FOREIGN_FRAME_BIT (1UL<<31)
86322+#define FOREIGN_FRAME(m) ((m) | FOREIGN_FRAME_BIT)
86323+
86324+/* Definitions for machine and pseudophysical addresses. */
86325+#ifdef CONFIG_X86_PAE
86326+typedef unsigned long long paddr_t;
86327+typedef unsigned long long maddr_t;
86328+#else
86329+typedef unsigned long paddr_t;
86330+typedef unsigned long maddr_t;
86331+#endif
86332+
86333+#ifdef CONFIG_XEN
86334+
86335+extern unsigned long *phys_to_machine_mapping;
86336+
86337+#undef machine_to_phys_mapping
86338+extern unsigned long *machine_to_phys_mapping;
86339+extern unsigned int machine_to_phys_order;
86340+
86341+static inline unsigned long pfn_to_mfn(unsigned long pfn)
86342+{
86343+ if (xen_feature(XENFEAT_auto_translated_physmap))
86344+ return pfn;
86345+ return phys_to_machine_mapping[(unsigned int)(pfn)] &
86346+ ~FOREIGN_FRAME_BIT;
86347+}
86348+
86349+static inline int phys_to_machine_mapping_valid(unsigned long pfn)
86350+{
86351+ if (xen_feature(XENFEAT_auto_translated_physmap))
86352+ return 1;
86353+ return (phys_to_machine_mapping[pfn] != INVALID_P2M_ENTRY);
86354+}
86355+
86356+static inline unsigned long mfn_to_pfn(unsigned long mfn)
86357+{
86358+ extern unsigned long max_mapnr;
86359+ unsigned long pfn;
86360+
86361+ if (xen_feature(XENFEAT_auto_translated_physmap))
86362+ return mfn;
86363+
86364+ if (unlikely((mfn >> machine_to_phys_order) != 0))
86365+ return max_mapnr;
86366+
86367+ /* The array access can fail (e.g., device space beyond end of RAM). */
86368+ asm (
86369+ "1: movl %1,%0\n"
86370+ "2:\n"
86371+ ".section .fixup,\"ax\"\n"
86372+ "3: movl %2,%0\n"
86373+ " jmp 2b\n"
86374+ ".previous\n"
86375+ ".section __ex_table,\"a\"\n"
86376+ " .align 4\n"
86377+ " .long 1b,3b\n"
86378+ ".previous"
86379+ : "=r" (pfn)
86380+ : "m" (machine_to_phys_mapping[mfn]), "m" (max_mapnr) );
86381+
86382+ return pfn;
86383+}
86384+
86385+/*
86386+ * We detect special mappings in one of two ways:
86387+ * 1. If the MFN is an I/O page then Xen will set the m2p entry
86388+ * to be outside our maximum possible pseudophys range.
86389+ * 2. If the MFN belongs to a different domain then we will certainly
86390+ * not have MFN in our p2m table. Conversely, if the page is ours,
86391+ * then we'll have p2m(m2p(MFN))==MFN.
86392+ * If we detect a special mapping then it doesn't have a 'struct page'.
86393+ * We force !pfn_valid() by returning an out-of-range pointer.
86394+ *
86395+ * NB. These checks require that, for any MFN that is not in our reservation,
86396+ * there is no PFN such that p2m(PFN) == MFN. Otherwise we can get confused if
86397+ * we are foreign-mapping the MFN, and the other domain has m2p(MFN) == PFN.
86398+ * Yikes! Various places must poke in INVALID_P2M_ENTRY for safety.
86399+ *
86400+ * NB2. When deliberately mapping foreign pages into the p2m table, you *must*
86401+ * use FOREIGN_FRAME(). This will cause pte_pfn() to choke on it, as we
86402+ * require. In all the cases we care about, the FOREIGN_FRAME bit is
86403+ * masked (e.g., pfn_to_mfn()) so behaviour there is correct.
86404+ */
86405+static inline unsigned long mfn_to_local_pfn(unsigned long mfn)
86406+{
86407+ extern unsigned long max_mapnr;
86408+ unsigned long pfn = mfn_to_pfn(mfn);
86409+ if ((pfn < max_mapnr)
86410+ && !xen_feature(XENFEAT_auto_translated_physmap)
86411+ && (phys_to_machine_mapping[pfn] != mfn))
86412+ return max_mapnr; /* force !pfn_valid() */
86413+ return pfn;
86414+}
86415+
86416+static inline void set_phys_to_machine(unsigned long pfn, unsigned long mfn)
86417+{
86418+ if (xen_feature(XENFEAT_auto_translated_physmap)) {
86419+ BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
86420+ return;
86421+ }
86422+ phys_to_machine_mapping[pfn] = mfn;
86423+}
86424+
86425+static inline maddr_t phys_to_machine(paddr_t phys)
86426+{
86427+ maddr_t machine = pfn_to_mfn(phys >> PAGE_SHIFT);
86428+ machine = (machine << PAGE_SHIFT) | (phys & ~PAGE_MASK);
86429+ return machine;
86430+}
86431+
86432+static inline paddr_t machine_to_phys(maddr_t machine)
86433+{
86434+ paddr_t phys = mfn_to_pfn(machine >> PAGE_SHIFT);
86435+ phys = (phys << PAGE_SHIFT) | (machine & ~PAGE_MASK);
86436+ return phys;
86437+}
86438+
86439+static inline paddr_t pte_machine_to_phys(maddr_t machine)
86440+{
86441+ /*
86442+ * In PAE mode, the NX bit needs to be dealt with in the value
86443+ * passed to mfn_to_pfn(). On x86_64, we need to mask it off,
86444+ * but for i386 the conversion to ulong for the argument will
86445+ * clip it off.
86446+ */
86447+ paddr_t phys = mfn_to_pfn(machine >> PAGE_SHIFT);
86448+ phys = (phys << PAGE_SHIFT) | (machine & ~PHYSICAL_PAGE_MASK);
86449+ return phys;
86450+}
86451+
86452+#else /* !CONFIG_XEN */
86453+
86454+#define pfn_to_mfn(pfn) (pfn)
86455+#define mfn_to_pfn(mfn) (mfn)
86456+#define mfn_to_local_pfn(mfn) (mfn)
86457+#define set_phys_to_machine(pfn, mfn) BUG_ON((pfn) != (mfn))
86458+#define phys_to_machine_mapping_valid(pfn) (1)
86459+#define phys_to_machine(phys) ((maddr_t)(phys))
86460+#define machine_to_phys(mach) ((paddr_t)(mach))
86461+#define pte_machine_to_phys(mach) ((paddr_t)(mach))
86462+
86463+#endif /* !CONFIG_XEN */
86464+
86465+/* VIRT <-> MACHINE conversion */
86466+#define virt_to_machine(v) (phys_to_machine(__pa(v)))
86467+#define virt_to_mfn(v) (pfn_to_mfn(__pa(v) >> PAGE_SHIFT))
86468+#define mfn_to_virt(m) (__va(mfn_to_pfn(m) << PAGE_SHIFT))
86469+
86470+#ifdef CONFIG_X86_PAE
86471+static inline pte_t pfn_pte_ma(unsigned long page_nr, pgprot_t pgprot)
86472+{
86473+ pte_t pte;
86474+
86475+ pte.pte_high = (page_nr >> (32 - PAGE_SHIFT)) | \
86476+ (pgprot_val(pgprot) >> 32);
86477+ pte.pte_high &= (__supported_pte_mask >> 32);
86478+ pte.pte_low = ((page_nr << PAGE_SHIFT) | pgprot_val(pgprot)) & \
86479+ __supported_pte_mask;
86480+ return pte;
86481+}
86482+#else
86483+#define pfn_pte_ma(pfn, prot) __pte_ma(((pfn) << PAGE_SHIFT) | pgprot_val(prot))
86484+#endif
86485+
86486+#define __pte_ma(x) ((pte_t) { (x) } )
86487+
86488+#endif /* _I386_MADDR_H */
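The key point maddr.h encodes is that the guest's pseudophysical frame numbers (PFNs) are not the machine frame numbers (MFNs) Xen and the hardware see, so anything handed to the hypervisor or used for DMA must go through the p2m translation. A small sketch using the macros above (editor's illustration, not part of the patch; the helper name is hypothetical):

static maddr_t example_buffer_machine_addr(void *buf)
{
	paddr_t phys = __pa(buf);	/* guest pseudophysical address */

	/* pfn_to_mfn() on the frame, keeping the offset within the page */
	return phys_to_machine(phys);
}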
86489diff -Nur linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/mmu.h linux-2.6.16.33/include/asm-i386/mach-xen/asm/mmu.h
86490--- linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/mmu.h 1970-01-01 00:00:00.000000000 +0000
86491+++ linux-2.6.16.33/include/asm-i386/mach-xen/asm/mmu.h 2007-01-08 15:00:45.000000000 +0000
86492@@ -0,0 +1,28 @@
86493+#ifndef __i386_MMU_H
86494+#define __i386_MMU_H
86495+
86496+#include <asm/semaphore.h>
86497+/*
86498+ * The i386 doesn't have a mmu context, but
86499+ * we put the segment information here.
86500+ *
86501+ * cpu_vm_mask is used to optimize ldt flushing.
86502+ */
86503+typedef struct {
86504+ int size;
86505+ struct semaphore sem;
86506+ void *ldt;
86507+#ifdef CONFIG_XEN
86508+ int has_foreign_mappings;
86509+#endif
86510+} mm_context_t;
86511+
86512+/* mm/memory.c:exit_mmap hook */
86513+extern void _arch_exit_mmap(struct mm_struct *mm);
86514+#define arch_exit_mmap(_mm) _arch_exit_mmap(_mm)
86515+
86516+/* kernel/fork.c:dup_mmap hook */
86517+extern void _arch_dup_mmap(struct mm_struct *mm);
86518+#define arch_dup_mmap(mm, oldmm) ((void)(oldmm), _arch_dup_mmap(mm))
86519+
86520+#endif
86521diff -Nur linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/mmu_context.h linux-2.6.16.33/include/asm-i386/mach-xen/asm/mmu_context.h
86522--- linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/mmu_context.h 1970-01-01 00:00:00.000000000 +0000
86523+++ linux-2.6.16.33/include/asm-i386/mach-xen/asm/mmu_context.h 2007-01-08 15:00:45.000000000 +0000
86524@@ -0,0 +1,109 @@
86525+#ifndef __I386_SCHED_H
86526+#define __I386_SCHED_H
86527+
86528+#include <linux/config.h>
86529+#include <asm/desc.h>
86530+#include <asm/atomic.h>
86531+#include <asm/pgalloc.h>
86532+#include <asm/tlbflush.h>
86533+
86534+/*
86535+ * Used for LDT copy/destruction.
86536+ */
86537+int init_new_context(struct task_struct *tsk, struct mm_struct *mm);
86538+void destroy_context(struct mm_struct *mm);
86539+
86540+
86541+static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
86542+{
86543+#if 0 /* XEN: no lazy tlb */
86544+ unsigned cpu = smp_processor_id();
86545+ if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK)
86546+ per_cpu(cpu_tlbstate, cpu).state = TLBSTATE_LAZY;
86547+#endif
86548+}
86549+
86550+#define prepare_arch_switch(next) __prepare_arch_switch()
86551+
86552+static inline void __prepare_arch_switch(void)
86553+{
86554+ /*
86555+ * Save away %fs and %gs. No need to save %es and %ds, as those
86556+ * are always kernel segments while inside the kernel. Must
86557+ * happen before reload of cr3/ldt (i.e., not in __switch_to).
86558+ */
86559+ asm volatile ( "mov %%fs,%0 ; mov %%gs,%1"
86560+ : "=m" (current->thread.fs),
86561+ "=m" (current->thread.gs));
86562+ asm volatile ( "movl %0,%%fs ; movl %0,%%gs"
86563+ : : "r" (0) );
86564+}
86565+
86566+extern void mm_pin(struct mm_struct *mm);
86567+extern void mm_unpin(struct mm_struct *mm);
86568+void mm_pin_all(void);
86569+
86570+static inline void switch_mm(struct mm_struct *prev,
86571+ struct mm_struct *next,
86572+ struct task_struct *tsk)
86573+{
86574+ int cpu = smp_processor_id();
86575+ struct mmuext_op _op[2], *op = _op;
86576+
86577+ if (likely(prev != next)) {
86578+ BUG_ON(!xen_feature(XENFEAT_writable_page_tables) &&
86579+ !test_bit(PG_pinned, &virt_to_page(next->pgd)->flags));
86580+
86581+ /* stop flush ipis for the previous mm */
86582+ cpu_clear(cpu, prev->cpu_vm_mask);
86583+#if 0 /* XEN: no lazy tlb */
86584+ per_cpu(cpu_tlbstate, cpu).state = TLBSTATE_OK;
86585+ per_cpu(cpu_tlbstate, cpu).active_mm = next;
86586+#endif
86587+ cpu_set(cpu, next->cpu_vm_mask);
86588+
86589+ /* Re-load page tables: load_cr3(next->pgd) */
86590+ op->cmd = MMUEXT_NEW_BASEPTR;
86591+ op->arg1.mfn = pfn_to_mfn(__pa(next->pgd) >> PAGE_SHIFT);
86592+ op++;
86593+
86594+ /*
86595+ * load the LDT, if the LDT is different:
86596+ */
86597+ if (unlikely(prev->context.ldt != next->context.ldt)) {
86598+ /* load_LDT_nolock(&next->context, cpu) */
86599+ op->cmd = MMUEXT_SET_LDT;
86600+ op->arg1.linear_addr = (unsigned long)next->context.ldt;
86601+ op->arg2.nr_ents = next->context.size;
86602+ op++;
86603+ }
86604+
86605+ BUG_ON(HYPERVISOR_mmuext_op(_op, op-_op, NULL, DOMID_SELF));
86606+ }
86607+#if 0 /* XEN: no lazy tlb */
86608+ else {
86609+ per_cpu(cpu_tlbstate, cpu).state = TLBSTATE_OK;
86610+ BUG_ON(per_cpu(cpu_tlbstate, cpu).active_mm != next);
86611+
86612+ if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) {
86613+ /* We were in lazy tlb mode and leave_mm disabled
86614+ * tlb flush IPI delivery. We must reload %cr3.
86615+ */
86616+ load_cr3(next->pgd);
86617+ load_LDT_nolock(&next->context, cpu);
86618+ }
86619+ }
86620+#endif
86621+}
86622+
86623+#define deactivate_mm(tsk, mm) \
86624+ asm("movl %0,%%fs ; movl %0,%%gs": :"r" (0))
86625+
86626+static inline void activate_mm(struct mm_struct *prev, struct mm_struct *next)
86627+{
86628+ if (!test_bit(PG_pinned, &virt_to_page(next->pgd)->flags))
86629+ mm_pin(next);
86630+ switch_mm(prev, next, NULL);
86631+}
86632+
86633+#endif
86634diff -Nur linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/page.h linux-2.6.16.33/include/asm-i386/mach-xen/asm/page.h
86635--- linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/page.h 1970-01-01 00:00:00.000000000 +0000
86636+++ linux-2.6.16.33/include/asm-i386/mach-xen/asm/page.h 2007-01-08 15:00:45.000000000 +0000
86637@@ -0,0 +1,227 @@
86638+#ifndef _I386_PAGE_H
86639+#define _I386_PAGE_H
86640+
86641+/* PAGE_SHIFT determines the page size */
86642+#define PAGE_SHIFT 12
86643+#define PAGE_SIZE (1UL << PAGE_SHIFT)
86644+#define PAGE_MASK (~(PAGE_SIZE-1))
86645+
86646+#ifdef CONFIG_X86_PAE
86647+#define __PHYSICAL_MASK_SHIFT 36
86648+#define __PHYSICAL_MASK ((1ULL << __PHYSICAL_MASK_SHIFT) - 1)
86649+#define PHYSICAL_PAGE_MASK (~((1ULL << PAGE_SHIFT) - 1) & __PHYSICAL_MASK)
86650+#else
86651+#define __PHYSICAL_MASK_SHIFT 32
86652+#define __PHYSICAL_MASK (~0UL)
86653+#define PHYSICAL_PAGE_MASK (PAGE_MASK & __PHYSICAL_MASK)
86654+#endif
86655+
86656+#define LARGE_PAGE_MASK (~(LARGE_PAGE_SIZE-1))
86657+#define LARGE_PAGE_SIZE (1UL << PMD_SHIFT)
86658+
86659+#ifdef __KERNEL__
86660+#ifndef __ASSEMBLY__
86661+
86662+#include <linux/config.h>
86663+#include <linux/string.h>
86664+#include <linux/types.h>
86665+#include <linux/kernel.h>
86666+#include <asm/bug.h>
86667+#include <xen/interface/xen.h>
86668+#include <xen/features.h>
86669+#include <xen/foreign_page.h>
86670+
86671+#define arch_free_page(_page,_order) \
86672+({ int foreign = PageForeign(_page); \
86673+ if (foreign) \
86674+ (PageForeignDestructor(_page))(_page); \
86675+ foreign; \
86676+})
86677+#define HAVE_ARCH_FREE_PAGE
86678+
86679+#ifdef CONFIG_XEN_SCRUB_PAGES
86680+#define scrub_pages(_p,_n) memset((void *)(_p), 0, (_n) << PAGE_SHIFT)
86681+#else
86682+#define scrub_pages(_p,_n) ((void)0)
86683+#endif
86684+
86685+#ifdef CONFIG_X86_USE_3DNOW
86686+
86687+#include <asm/mmx.h>
86688+
86689+#define clear_page(page) mmx_clear_page((void *)(page))
86690+#define copy_page(to,from) mmx_copy_page(to,from)
86691+
86692+#else
86693+
86694+#define alloc_zeroed_user_highpage(vma, vaddr) alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, vaddr)
86695+#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
86696+
86697+/*
86698+ * On older X86 processors it's not a win to use MMX here it seems.
86699+ * Maybe the K6-III ?
86700+ */
86701+
86702+#define clear_page(page) memset((void *)(page), 0, PAGE_SIZE)
86703+#define copy_page(to,from) memcpy((void *)(to), (void *)(from), PAGE_SIZE)
86704+
86705+#endif
86706+
86707+#define clear_user_page(page, vaddr, pg) clear_page(page)
86708+#define copy_user_page(to, from, vaddr, pg) copy_page(to, from)
86709+
86710+/*
86711+ * These are used to make use of C type-checking..
86712+ */
86713+extern int nx_enabled;
86714+#ifdef CONFIG_X86_PAE
86715+extern unsigned long long __supported_pte_mask;
86716+typedef struct { unsigned long pte_low, pte_high; } pte_t;
86717+typedef struct { unsigned long long pmd; } pmd_t;
86718+typedef struct { unsigned long long pgd; } pgd_t;
86719+typedef struct { unsigned long long pgprot; } pgprot_t;
86720+#define pgprot_val(x) ((x).pgprot)
86721+#include <asm/maddr.h>
86722+#define __pte(x) ({ unsigned long long _x = (x); \
86723+ if (_x & 1) _x = phys_to_machine(_x); \
86724+ ((pte_t) {(unsigned long)(_x), (unsigned long)(_x>>32)}); })
86725+#define __pgd(x) ({ unsigned long long _x = (x); \
86726+ (((_x)&1) ? ((pgd_t) {phys_to_machine(_x)}) : ((pgd_t) {(_x)})); })
86727+#define __pmd(x) ({ unsigned long long _x = (x); \
86728+ (((_x)&1) ? ((pmd_t) {phys_to_machine(_x)}) : ((pmd_t) {(_x)})); })
86729+static inline unsigned long long pte_val(pte_t x)
86730+{
86731+ unsigned long long ret;
86732+
86733+ if (x.pte_low) {
86734+ ret = x.pte_low | (unsigned long long)x.pte_high << 32;
86735+ ret = pte_machine_to_phys(ret) | 1;
86736+ } else {
86737+ ret = 0;
86738+ }
86739+ return ret;
86740+}
86741+static inline unsigned long long pmd_val(pmd_t x)
86742+{
86743+ unsigned long long ret = x.pmd;
86744+ if (ret) ret = pte_machine_to_phys(ret) | 1;
86745+ return ret;
86746+}
86747+static inline unsigned long long pgd_val(pgd_t x)
86748+{
86749+ unsigned long long ret = x.pgd;
86750+ if (ret) ret = pte_machine_to_phys(ret) | 1;
86751+ return ret;
86752+}
86753+static inline unsigned long long pte_val_ma(pte_t x)
86754+{
86755+ return (unsigned long long)x.pte_high << 32 | x.pte_low;
86756+}
86757+#define HPAGE_SHIFT 21
86758+#else
86759+typedef struct { unsigned long pte_low; } pte_t;
86760+typedef struct { unsigned long pgd; } pgd_t;
86761+typedef struct { unsigned long pgprot; } pgprot_t;
86762+#define pgprot_val(x) ((x).pgprot)
86763+#include <asm/maddr.h>
86764+#define boot_pte_t pte_t /* or would you rather have a typedef */
86765+#define pte_val(x) (((x).pte_low & 1) ? \
86766+ pte_machine_to_phys((x).pte_low) : \
86767+ (x).pte_low)
86768+#define pte_val_ma(x) ((x).pte_low)
86769+#define __pte(x) ({ unsigned long _x = (x); \
86770+ (((_x)&1) ? ((pte_t) {phys_to_machine(_x)}) : ((pte_t) {(_x)})); })
86771+#define __pgd(x) ({ unsigned long _x = (x); \
86772+ (((_x)&1) ? ((pgd_t) {phys_to_machine(_x)}) : ((pgd_t) {(_x)})); })
86773+static inline unsigned long pgd_val(pgd_t x)
86774+{
86775+ unsigned long ret = x.pgd;
86776+ if (ret) ret = pte_machine_to_phys(ret) | 1;
86777+ return ret;
86778+}
86779+#define HPAGE_SHIFT 22
86780+#endif
86781+#define PTE_MASK PAGE_MASK
86782+
86783+#ifdef CONFIG_HUGETLB_PAGE
86784+#define HPAGE_SIZE ((1UL) << HPAGE_SHIFT)
86785+#define HPAGE_MASK (~(HPAGE_SIZE - 1))
86786+#define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT)
86787+#define HAVE_ARCH_HUGETLB_UNMAPPED_AREA
86788+#endif
86789+
86790+#define __pgprot(x) ((pgprot_t) { (x) } )
86791+
86792+#endif /* !__ASSEMBLY__ */
86793+
86794+/* to align the pointer to the (next) page boundary */
86795+#define PAGE_ALIGN(addr) (((addr)+PAGE_SIZE-1)&PAGE_MASK)
86796+
86797+/*
86798+ * This handles the memory map.. We could make this a config
86799+ * option, but too many people screw it up, and too few need
86800+ * it.
86801+ *
86802+ * A __PAGE_OFFSET of 0xC0000000 means that the kernel has
86803+ * a virtual address space of one gigabyte, which limits the
86804+ * amount of physical memory you can use to about 950MB.
86805+ *
86806+ * If you want more physical memory than this then see the CONFIG_HIGHMEM4G
86807+ * and CONFIG_HIGHMEM64G options in the kernel configuration.
86808+ */
86809+
86810+#ifndef __ASSEMBLY__
86811+
86812+/*
86813+ * This much address space is reserved for vmalloc() and iomap()
86814+ * as well as fixmap mappings.
86815+ */
86816+extern unsigned int __VMALLOC_RESERVE;
86817+
86818+extern int sysctl_legacy_va_layout;
86819+
86820+extern int page_is_ram(unsigned long pagenr);
86821+
86822+#endif /* __ASSEMBLY__ */
86823+
86824+#ifdef __ASSEMBLY__
86825+#define __PAGE_OFFSET CONFIG_PAGE_OFFSET
86826+#define __PHYSICAL_START CONFIG_PHYSICAL_START
86827+#else
86828+#define __PAGE_OFFSET ((unsigned long)CONFIG_PAGE_OFFSET)
86829+#define __PHYSICAL_START ((unsigned long)CONFIG_PHYSICAL_START)
86830+#endif
86831+#define __KERNEL_START (__PAGE_OFFSET + __PHYSICAL_START)
86832+
86833+#ifdef CONFIG_XEN_COMPAT_030002
86834+#undef LOAD_OFFSET
86835+#define LOAD_OFFSET 0
86836+#endif /* CONFIG_XEN_COMPAT_030002 */
86837+
86838+#define PAGE_OFFSET ((unsigned long)__PAGE_OFFSET)
86839+#define VMALLOC_RESERVE ((unsigned long)__VMALLOC_RESERVE)
86840+#define MAXMEM (__FIXADDR_TOP-__PAGE_OFFSET-__VMALLOC_RESERVE)
86841+#define __pa(x) ((unsigned long)(x)-PAGE_OFFSET)
86842+#define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET))
86843+#define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT)
86844+#ifdef CONFIG_FLATMEM
86845+#define pfn_to_page(pfn) (mem_map + (pfn))
86846+#define page_to_pfn(page) ((unsigned long)((page) - mem_map))
86847+#define pfn_valid(pfn) ((pfn) < max_mapnr)
86848+#endif /* CONFIG_FLATMEM */
86849+#define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)
86850+
86851+#define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT)
86852+
86853+#define VM_DATA_DEFAULT_FLAGS \
86854+ (VM_READ | VM_WRITE | \
86855+ ((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0 ) | \
86856+ VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
86857+
86858+#define __HAVE_ARCH_GATE_AREA 1
86859+
86860+#endif /* __KERNEL__ */
86861+
86862+#include <asm-generic/page.h>
86863+
86864+#endif /* _I386_PAGE_H */
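The __pte()/pte_val() definitions above are where the PFN<->MFN swap happens for page-table entries: when the present bit is set, the frame number stored in the PTE is translated, so generic mm code keeps working with pseudophysical addresses while the tables Xen actually walks hold machine frames. A sketch for the non-PAE case (editor's illustration, not part of the patch; the helper name is hypothetical):

static pte_t example_make_present_pte(unsigned long phys, pgprot_t prot)
{
	/* __pte() sees bit 0 (present, forced on here for clarity) and converts
	 * phys -> machine internally; the low flag bits are preserved by
	 * phys_to_machine(). */
	return __pte((phys & PAGE_MASK) | pgprot_val(prot) | 1);
}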
86865diff -Nur linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/param.h linux-2.6.16.33/include/asm-i386/mach-xen/asm/param.h
86866--- linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/param.h 1970-01-01 00:00:00.000000000 +0000
86867+++ linux-2.6.16.33/include/asm-i386/mach-xen/asm/param.h 2007-01-08 15:00:45.000000000 +0000
86868@@ -0,0 +1,24 @@
86869+#ifndef _ASMi386_PARAM_H
86870+#define _ASMi386_PARAM_H
86871+
86872+#ifdef __KERNEL__
86873+# include <linux/config.h>
86874+# define HZ CONFIG_HZ /* Internal kernel timer frequency */
86875+# define USER_HZ 100 /* .. some user interfaces are in "ticks" */
86876+# define CLOCKS_PER_SEC (USER_HZ) /* like times() */
86877+#endif
86878+
86879+#ifndef HZ
86880+#define HZ 100
86881+#endif
86882+
86883+#define EXEC_PAGESIZE 4096
86884+
86885+#ifndef NOGROUP
86886+#define NOGROUP (-1)
86887+#endif
86888+
86889+#define MAXHOSTNAMELEN 64 /* max length of hostname */
86890+#define COMMAND_LINE_SIZE 256
86891+
86892+#endif
86893diff -Nur linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/pci.h linux-2.6.16.33/include/asm-i386/mach-xen/asm/pci.h
86894--- linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/pci.h 1970-01-01 00:00:00.000000000 +0000
86895+++ linux-2.6.16.33/include/asm-i386/mach-xen/asm/pci.h 2007-01-08 15:00:45.000000000 +0000
86896@@ -0,0 +1,154 @@
86897+#ifndef __i386_PCI_H
86898+#define __i386_PCI_H
86899+
86900+#include <linux/config.h>
86901+
86902+#ifdef __KERNEL__
86903+#include <linux/mm.h> /* for struct page */
86904+
86905+/* Can be used to override the logic in pci_scan_bus for skipping
86906+ already-configured bus numbers - to be used for buggy BIOSes
86907+ or architectures with incomplete PCI setup by the loader */
86908+
86909+#ifdef CONFIG_PCI
86910+extern unsigned int pcibios_assign_all_busses(void);
86911+#else
86912+#define pcibios_assign_all_busses() 0
86913+#endif
86914+#define pcibios_scan_all_fns(a, b) 0
86915+
86916+extern unsigned long pci_mem_start;
86917+#define PCIBIOS_MIN_IO 0x1000
86918+#define PCIBIOS_MIN_MEM (pci_mem_start)
86919+
86920+#define PCIBIOS_MIN_CARDBUS_IO 0x4000
86921+
86922+void pcibios_config_init(void);
86923+struct pci_bus * pcibios_scan_root(int bus);
86924+
86925+void pcibios_set_master(struct pci_dev *dev);
86926+void pcibios_penalize_isa_irq(int irq, int active);
86927+struct irq_routing_table *pcibios_get_irq_routing_table(void);
86928+int pcibios_set_irq_routing(struct pci_dev *dev, int pin, int irq);
86929+
86930+/* Dynamic DMA mapping stuff.
86931+ * i386 has everything mapped statically.
86932+ */
86933+
86934+#include <linux/types.h>
86935+#include <linux/slab.h>
86936+#include <asm/scatterlist.h>
86937+#include <linux/string.h>
86938+#include <asm/io.h>
86939+
86940+struct pci_dev;
86941+
86942+#ifdef CONFIG_SWIOTLB
86943+
86944+
86945+/* On Xen we use SWIOTLB instead of blk-specific bounce buffers. */
86946+#define PCI_DMA_BUS_IS_PHYS (0)
86947+
86948+#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME) \
86949+ dma_addr_t ADDR_NAME;
86950+#define DECLARE_PCI_UNMAP_LEN(LEN_NAME) \
86951+ __u32 LEN_NAME;
86952+#define pci_unmap_addr(PTR, ADDR_NAME) \
86953+ ((PTR)->ADDR_NAME)
86954+#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL) \
86955+ (((PTR)->ADDR_NAME) = (VAL))
86956+#define pci_unmap_len(PTR, LEN_NAME) \
86957+ ((PTR)->LEN_NAME)
86958+#define pci_unmap_len_set(PTR, LEN_NAME, VAL) \
86959+ (((PTR)->LEN_NAME) = (VAL))
86960+
86961+#else
86962+
86963+/* The PCI address space does equal the physical memory
86964+ * address space. The networking and block device layers use
86965+ * this boolean for bounce buffer decisions.
86966+ */
86967+#define PCI_DMA_BUS_IS_PHYS (1)
86968+
86969+/* pci_unmap_{page,single} is a nop so... */
86970+#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME)
86971+#define DECLARE_PCI_UNMAP_LEN(LEN_NAME)
86972+#define pci_unmap_addr(PTR, ADDR_NAME) (0)
86973+#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL) do { } while (0)
86974+#define pci_unmap_len(PTR, LEN_NAME) (0)
86975+#define pci_unmap_len_set(PTR, LEN_NAME, VAL) do { } while (0)
86976+
86977+#endif
86978+
86979+/* This is always fine. */
86980+#define pci_dac_dma_supported(pci_dev, mask) (1)
86981+
86982+static inline dma64_addr_t
86983+pci_dac_page_to_dma(struct pci_dev *pdev, struct page *page, unsigned long offset, int direction)
86984+{
86985+ return ((dma64_addr_t) page_to_phys(page) +
86986+ (dma64_addr_t) offset);
86987+}
86988+
86989+static inline struct page *
86990+pci_dac_dma_to_page(struct pci_dev *pdev, dma64_addr_t dma_addr)
86991+{
86992+ return pfn_to_page(dma_addr >> PAGE_SHIFT);
86993+}
86994+
86995+static inline unsigned long
86996+pci_dac_dma_to_offset(struct pci_dev *pdev, dma64_addr_t dma_addr)
86997+{
86998+ return (dma_addr & ~PAGE_MASK);
86999+}
87000+
87001+static inline void
87002+pci_dac_dma_sync_single_for_cpu(struct pci_dev *pdev, dma64_addr_t dma_addr, size_t len, int direction)
87003+{
87004+}
87005+
87006+static inline void
87007+pci_dac_dma_sync_single_for_device(struct pci_dev *pdev, dma64_addr_t dma_addr, size_t len, int direction)
87008+{
87009+ flush_write_buffers();
87010+}
87011+
87012+#define HAVE_PCI_MMAP
87013+extern int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
87014+ enum pci_mmap_state mmap_state, int write_combine);
87015+
87016+
87017+static inline void pcibios_add_platform_entries(struct pci_dev *dev)
87018+{
87019+}
87020+
87021+#ifdef CONFIG_PCI
87022+static inline void pci_dma_burst_advice(struct pci_dev *pdev,
87023+ enum pci_dma_burst_strategy *strat,
87024+ unsigned long *strategy_parameter)
87025+{
87026+ *strat = PCI_DMA_BURST_INFINITY;
87027+ *strategy_parameter = ~0UL;
87028+}
87029+#endif
87030+
87031+#endif /* __KERNEL__ */
87032+
87033+#ifdef CONFIG_XEN_PCIDEV_FRONTEND
87034+#include <xen/pcifront.h>
87035+#endif /* CONFIG_XEN_PCIDEV_FRONTEND */
87036+
87037+/* implement the pci_ DMA API in terms of the generic device dma_ one */
87038+#include <asm-generic/pci-dma-compat.h>
87039+
87040+/* generic pci stuff */
87041+#include <asm-generic/pci.h>
87042+
87043+/* On Xen we have to scan all functions since Xen hides bridges from
87044+ * us. If a bridge is at fn=0 and that slot has a multifunction
87045+ * device, we won't find the additional devices without scanning all
87046+ * functions. */
87047+#undef pcibios_scan_all_fns
87048+#define pcibios_scan_all_fns(a, b) 1
87049+
87050+#endif /* __i386_PCI_H */
87051diff -Nur linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/pgalloc.h linux-2.6.16.33/include/asm-i386/mach-xen/asm/pgalloc.h
87052--- linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/pgalloc.h 1970-01-01 00:00:00.000000000 +0000
87053+++ linux-2.6.16.33/include/asm-i386/mach-xen/asm/pgalloc.h 2007-01-08 15:00:46.000000000 +0000
87054@@ -0,0 +1,64 @@
87055+#ifndef _I386_PGALLOC_H
87056+#define _I386_PGALLOC_H
87057+
87058+#include <linux/config.h>
87059+#include <asm/fixmap.h>
87060+#include <linux/threads.h>
87061+#include <linux/mm.h> /* for struct page */
87062+#include <asm/io.h> /* for phys_to_virt and page_to_pseudophys */
87063+
87064+/* Is this pagetable pinned? */
87065+#define PG_pinned PG_arch_1
87066+
87067+#define pmd_populate_kernel(mm, pmd, pte) \
87068+ set_pmd(pmd, __pmd(_PAGE_TABLE + __pa(pte)))
87069+
87070+#define pmd_populate(mm, pmd, pte) \
87071+do { \
87072+ if (test_bit(PG_pinned, &virt_to_page((mm)->pgd)->flags)) { \
87073+ if (!PageHighMem(pte)) \
87074+ BUG_ON(HYPERVISOR_update_va_mapping( \
87075+ (unsigned long)__va(page_to_pfn(pte)<<PAGE_SHIFT),\
87076+ pfn_pte(page_to_pfn(pte), PAGE_KERNEL_RO), 0));\
87077+ set_pmd(pmd, __pmd(_PAGE_TABLE + \
87078+ ((unsigned long long)page_to_pfn(pte) << \
87079+ (unsigned long long) PAGE_SHIFT))); \
87080+ } else { \
87081+ *(pmd) = __pmd(_PAGE_TABLE + \
87082+ ((unsigned long long)page_to_pfn(pte) << \
87083+ (unsigned long long) PAGE_SHIFT)); \
87084+ } \
87085+} while (0)
87086+
87087+/*
87088+ * Allocate and free page tables.
87089+ */
87090+extern pgd_t *pgd_alloc(struct mm_struct *);
87091+extern void pgd_free(pgd_t *pgd);
87092+
87093+extern pte_t *pte_alloc_one_kernel(struct mm_struct *, unsigned long);
87094+extern struct page *pte_alloc_one(struct mm_struct *, unsigned long);
87095+
87096+static inline void pte_free_kernel(pte_t *pte)
87097+{
87098+ free_page((unsigned long)pte);
87099+ make_page_writable(pte, XENFEAT_writable_page_tables);
87100+}
87101+
87102+extern void pte_free(struct page *pte);
87103+
87104+#define __pte_free_tlb(tlb,pte) tlb_remove_page((tlb),(pte))
87105+
87106+#ifdef CONFIG_X86_PAE
87107+/*
87108+ * In the PAE case we free the pmds as part of the pgd.
87109+ */
87110+#define pmd_alloc_one(mm, addr) ({ BUG(); ((pmd_t *)2); })
87111+#define pmd_free(x) do { } while (0)
87112+#define __pmd_free_tlb(tlb,x) do { } while (0)
87113+#define pud_populate(mm, pmd, pte) BUG()
87114+#endif
87115+
87116+#define check_pgt_cache() do { } while (0)
87117+
87118+#endif /* _I386_PGALLOC_H */
87119diff -Nur linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/pgtable-2level-defs.h linux-2.6.16.33/include/asm-i386/mach-xen/asm/pgtable-2level-defs.h
87120--- linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/pgtable-2level-defs.h 1970-01-01 00:00:00.000000000 +0000
87121+++ linux-2.6.16.33/include/asm-i386/mach-xen/asm/pgtable-2level-defs.h 2007-01-08 15:00:46.000000000 +0000
87122@@ -0,0 +1,20 @@
87123+#ifndef _I386_PGTABLE_2LEVEL_DEFS_H
87124+#define _I386_PGTABLE_2LEVEL_DEFS_H
87125+
87126+#define HAVE_SHARED_KERNEL_PMD 0
87127+
87128+/*
87129+ * traditional i386 two-level paging structure:
87130+ */
87131+
87132+#define PGDIR_SHIFT 22
87133+#define PTRS_PER_PGD 1024
87134+
87135+/*
87136+ * the i386 is two-level, so we don't really have any
87137+ * PMD directory physically.
87138+ */
87139+
87140+#define PTRS_PER_PTE 1024
87141+
87142+#endif /* _I386_PGTABLE_2LEVEL_DEFS_H */
87143diff -Nur linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/pgtable-2level.h linux-2.6.16.33/include/asm-i386/mach-xen/asm/pgtable-2level.h
87144--- linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/pgtable-2level.h 1970-01-01 00:00:00.000000000 +0000
87145+++ linux-2.6.16.33/include/asm-i386/mach-xen/asm/pgtable-2level.h 2007-01-08 15:00:46.000000000 +0000
87146@@ -0,0 +1,85 @@
87147+#ifndef _I386_PGTABLE_2LEVEL_H
87148+#define _I386_PGTABLE_2LEVEL_H
87149+
87150+#include <asm-generic/pgtable-nopmd.h>
87151+
87152+#define pte_ERROR(e) \
87153+ printk("%s:%d: bad pte %08lx.\n", __FILE__, __LINE__, (e).pte_low)
87154+#define pgd_ERROR(e) \
87155+ printk("%s:%d: bad pgd %08lx.\n", __FILE__, __LINE__, pgd_val(e))
87156+
87157+/*
87158+ * Certain architectures need to do special things when PTEs
87159+ * within a page table are directly modified. Thus, the following
87160+ * hook is made available.
87161+ */
87162+#define set_pte(pteptr, pteval) (*(pteptr) = pteval)
87163+
87164+#define set_pte_at(_mm,addr,ptep,pteval) do { \
87165+ if (((_mm) != current->mm && (_mm) != &init_mm) || \
87166+ HYPERVISOR_update_va_mapping((addr), (pteval), 0)) \
87167+ set_pte((ptep), (pteval)); \
87168+} while (0)
87169+
87170+#define set_pte_at_sync(_mm,addr,ptep,pteval) do { \
87171+ if (((_mm) != current->mm && (_mm) != &init_mm) || \
87172+ HYPERVISOR_update_va_mapping((addr), (pteval), UVMF_INVLPG)) { \
87173+ set_pte((ptep), (pteval)); \
87174+ xen_invlpg((addr)); \
87175+ } \
87176+} while (0)
87177+
87178+#define set_pte_atomic(pteptr, pteval) set_pte(pteptr,pteval)
87179+
87180+#define set_pmd(pmdptr, pmdval) xen_l2_entry_update((pmdptr), (pmdval))
87181+
87182+#define pte_clear(mm,addr,xp) do { set_pte_at(mm, addr, xp, __pte(0)); } while (0)
87183+#define pmd_clear(xp) do { set_pmd(xp, __pmd(0)); } while (0)
87184+
87185+#define ptep_get_and_clear(mm,addr,xp) __pte_ma(xchg(&(xp)->pte_low, 0))
87186+#define pte_same(a, b) ((a).pte_low == (b).pte_low)
87187+#define pte_mfn(_pte) ((_pte).pte_low >> PAGE_SHIFT)
87188+#define pte_pfn(_pte) mfn_to_local_pfn(pte_mfn(_pte))
87189+
87190+#define pte_page(_pte) pfn_to_page(pte_pfn(_pte))
87191+
87192+#define pte_none(x) (!(x).pte_low)
87193+#define pfn_pte(pfn, prot) __pte(((pfn) << PAGE_SHIFT) | pgprot_val(prot))
87194+#define pfn_pmd(pfn, prot) __pmd(((pfn) << PAGE_SHIFT) | pgprot_val(prot))
87195+
87196+/*
87197+ * All present user pages are user-executable:
87198+ */
87199+static inline int pte_exec(pte_t pte)
87200+{
87201+ return pte_user(pte);
87202+}
87203+
87204+/*
87205+ * All present pages are kernel-executable:
87206+ */
87207+static inline int pte_exec_kernel(pte_t pte)
87208+{
87209+ return 1;
87210+}
87211+
87212+/*
87213+ * Bits 0, 6 and 7 are taken, split up the 29 bits of offset
87214+ * into this range:
87215+ */
87216+#define PTE_FILE_MAX_BITS 29
87217+
87218+#define pte_to_pgoff(pte) \
87219+ ((((pte).pte_low >> 1) & 0x1f ) + (((pte).pte_low >> 8) << 5 ))
87220+
87221+#define pgoff_to_pte(off) \
87222+ ((pte_t) { (((off) & 0x1f) << 1) + (((off) >> 5) << 8) + _PAGE_FILE })
87223+
87224+/* Encode and de-code a swap entry */
87225+#define __swp_type(x) (((x).val >> 1) & 0x1f)
87226+#define __swp_offset(x) ((x).val >> 8)
87227+#define __swp_entry(type, offset) ((swp_entry_t) { ((type) << 1) | ((offset) << 8) })
87228+#define __pte_to_swp_entry(pte) ((swp_entry_t) { (pte).pte_low })
87229+#define __swp_entry_to_pte(x) ((pte_t) { (x).val })
87230+
87231+#endif /* _I386_PGTABLE_2LEVEL_H */
87232diff -Nur linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/pgtable-3level-defs.h linux-2.6.16.33/include/asm-i386/mach-xen/asm/pgtable-3level-defs.h
87233--- linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/pgtable-3level-defs.h 1970-01-01 00:00:00.000000000 +0000
87234+++ linux-2.6.16.33/include/asm-i386/mach-xen/asm/pgtable-3level-defs.h 2007-01-08 15:00:46.000000000 +0000
87235@@ -0,0 +1,24 @@
87236+#ifndef _I386_PGTABLE_3LEVEL_DEFS_H
87237+#define _I386_PGTABLE_3LEVEL_DEFS_H
87238+
87239+#define HAVE_SHARED_KERNEL_PMD 0
87240+
87241+/*
87242+ * PGDIR_SHIFT determines what a top-level page table entry can map
87243+ */
87244+#define PGDIR_SHIFT 30
87245+#define PTRS_PER_PGD 4
87246+
87247+/*
87248+ * PMD_SHIFT determines the size of the area a middle-level
87249+ * page table can map
87250+ */
87251+#define PMD_SHIFT 21
87252+#define PTRS_PER_PMD 512
87253+
87254+/*
87255+ * entries per page directory level
87256+ */
87257+#define PTRS_PER_PTE 512
87258+
87259+#endif /* _I386_PGTABLE_3LEVEL_DEFS_H */
87260diff -Nur linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/pgtable-3level.h linux-2.6.16.33/include/asm-i386/mach-xen/asm/pgtable-3level.h
87261--- linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/pgtable-3level.h 1970-01-01 00:00:00.000000000 +0000
87262+++ linux-2.6.16.33/include/asm-i386/mach-xen/asm/pgtable-3level.h 2007-01-08 15:00:46.000000000 +0000
87263@@ -0,0 +1,183 @@
87264+#ifndef _I386_PGTABLE_3LEVEL_H
87265+#define _I386_PGTABLE_3LEVEL_H
87266+
87267+#include <asm-generic/pgtable-nopud.h>
87268+
87269+/*
87270+ * Intel Physical Address Extension (PAE) Mode - three-level page
87271+ * tables on PPro+ CPUs.
87272+ *
87273+ * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
87274+ */
87275+
87276+#define pte_ERROR(e) \
87277+ printk("%s:%d: bad pte %p(%08lx%08lx).\n", __FILE__, __LINE__, &(e), (e).pte_high, (e).pte_low)
87278+#define pmd_ERROR(e) \
87279+ printk("%s:%d: bad pmd %p(%016Lx).\n", __FILE__, __LINE__, &(e), pmd_val(e))
87280+#define pgd_ERROR(e) \
87281+ printk("%s:%d: bad pgd %p(%016Lx).\n", __FILE__, __LINE__, &(e), pgd_val(e))
87282+
87283+#define pud_none(pud) 0
87284+#define pud_bad(pud) 0
87285+#define pud_present(pud) 1
87286+
87287+/*
87288+ * Is the pte executable?
87289+ */
87290+static inline int pte_x(pte_t pte)
87291+{
87292+ return !(pte_val(pte) & _PAGE_NX);
87293+}
87294+
87295+/*
87296+ * All present user-pages with !NX bit are user-executable:
87297+ */
87298+static inline int pte_exec(pte_t pte)
87299+{
87300+ return pte_user(pte) && pte_x(pte);
87301+}
87302+/*
87303+ * All present pages with !NX bit are kernel-executable:
87304+ */
87305+static inline int pte_exec_kernel(pte_t pte)
87306+{
87307+ return pte_x(pte);
87308+}
87309+
87310+/* Rules for using set_pte: the pte being assigned *must* be
87311+ * either not present or in a state where the hardware will
87312+ * not attempt to update the pte. In places where this is
87313+ * not possible, use pte_get_and_clear to obtain the old pte
87314+ * value and then use set_pte to update it. -ben
87315+ */
87316+#define __HAVE_ARCH_SET_PTE_ATOMIC
87317+
87318+#if 1
87319+/* use writable pagetables */
87320+static inline void set_pte(pte_t *ptep, pte_t pte)
87321+{
87322+ ptep->pte_high = pte.pte_high;
87323+ smp_wmb();
87324+ ptep->pte_low = pte.pte_low;
87325+}
87326+# define set_pte_atomic(pteptr,pteval) \
87327+ set_64bit((unsigned long long *)(pteptr),pte_val_ma(pteval))
87328+#else
87329+/* no writable pagetables */
87330+# define set_pte(pteptr,pteval) \
87331+ xen_l1_entry_update((pteptr), (pteval))
87332+# define set_pte_atomic(pteptr,pteval) set_pte(pteptr,pteval)
87333+#endif
87334+
87335+#define set_pte_at(_mm,addr,ptep,pteval) do { \
87336+ if (((_mm) != current->mm && (_mm) != &init_mm) || \
87337+ HYPERVISOR_update_va_mapping((addr), (pteval), 0)) \
87338+ set_pte((ptep), (pteval)); \
87339+} while (0)
87340+
87341+#define set_pte_at_sync(_mm,addr,ptep,pteval) do { \
87342+ if (((_mm) != current->mm && (_mm) != &init_mm) || \
87343+ HYPERVISOR_update_va_mapping((addr), (pteval), UVMF_INVLPG)) { \
87344+ set_pte((ptep), (pteval)); \
87345+ xen_invlpg((addr)); \
87346+ } \
87347+} while (0)
87348+
87349+#define set_pmd(pmdptr,pmdval) \
87350+ xen_l2_entry_update((pmdptr), (pmdval))
87351+#define set_pud(pudptr,pudval) \
87352+ xen_l3_entry_update((pudptr), (pudval))
87353+
87354+/*
87355+ * Pentium-II erratum A13: in PAE mode we explicitly have to flush
87356+ * the TLB via cr3 if the top-level pgd is changed...
87357+ * We do not let the generic code free and clear pgd entries due to
87358+ * this erratum.
87359+ */
87360+static inline void pud_clear (pud_t * pud) { }
87361+
87362+#define pud_page(pud) \
87363+((struct page *) __va(pud_val(pud) & PAGE_MASK))
87364+
87365+#define pud_page_kernel(pud) \
87366+((unsigned long) __va(pud_val(pud) & PAGE_MASK))
87367+
87368+
87369+/* Find an entry in the second-level page table.. */
87370+#define pmd_offset(pud, address) ((pmd_t *) pud_page(*(pud)) + \
87371+ pmd_index(address))
87372+
87373+/*
87374+ * For PTEs and PDEs, we must clear the P-bit first when clearing a page table
87375+ * entry, so clear the bottom half first and enforce ordering with a compiler
87376+ * barrier.
87377+ */
87378+static inline void pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
87379+{
87380+ ptep->pte_low = 0;
87381+ smp_wmb();
87382+ ptep->pte_high = 0;
87383+}
87384+
87385+#define pmd_clear(xp) do { set_pmd(xp, __pmd(0)); } while (0)
87386+
87387+static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
87388+{
87389+ pte_t res;
87390+
87391+ /* xchg acts as a barrier before the setting of the high bits */
87392+ res.pte_low = xchg(&ptep->pte_low, 0);
87393+ res.pte_high = ptep->pte_high;
87394+ ptep->pte_high = 0;
87395+
87396+ return res;
87397+}
87398+
87399+static inline int pte_same(pte_t a, pte_t b)
87400+{
87401+ return a.pte_low == b.pte_low && a.pte_high == b.pte_high;
87402+}
87403+
87404+#define pte_page(x) pfn_to_page(pte_pfn(x))
87405+
87406+static inline int pte_none(pte_t pte)
87407+{
87408+ return !pte.pte_low && !pte.pte_high;
87409+}
87410+
87411+#define pte_mfn(_pte) (((_pte).pte_low >> PAGE_SHIFT) |\
87412+ (((_pte).pte_high & 0xfff) << (32-PAGE_SHIFT)))
87413+#define pte_pfn(_pte) mfn_to_local_pfn(pte_mfn(_pte))
87414+
87415+extern unsigned long long __supported_pte_mask;
87416+
87417+static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot)
87418+{
87419+ return pfn_pte_ma(pfn_to_mfn(page_nr), pgprot);
87420+}
87421+
87422+static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot)
87423+{
87424+ BUG(); panic("needs review");
87425+ return __pmd((((unsigned long long)page_nr << PAGE_SHIFT) | \
87426+ pgprot_val(pgprot)) & __supported_pte_mask);
87427+}
87428+
87429+/*
87430+ * Bits 0, 6 and 7 are taken in the low part of the pte,
87431+ * put the 32 bits of offset into the high part.
87432+ */
87433+#define pte_to_pgoff(pte) ((pte).pte_high)
87434+#define pgoff_to_pte(off) ((pte_t) { _PAGE_FILE, (off) })
87435+#define PTE_FILE_MAX_BITS 32
87436+
87437+/* Encode and de-code a swap entry */
87438+#define __swp_type(x) (((x).val) & 0x1f)
87439+#define __swp_offset(x) ((x).val >> 5)
87440+#define __swp_entry(type, offset) ((swp_entry_t){(type) | (offset) << 5})
87441+#define __pte_to_swp_entry(pte) ((swp_entry_t){ (pte).pte_high })
87442+#define __swp_entry_to_pte(x) ((pte_t){ 0, (x).val })
87443+
87444+#define __pmd_free_tlb(tlb, x) do { } while (0)
87445+
87446+#endif /* _I386_PGTABLE_3LEVEL_H */
87447diff -Nur linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/pgtable.h linux-2.6.16.33/include/asm-i386/mach-xen/asm/pgtable.h
87448--- linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/pgtable.h 1970-01-01 00:00:00.000000000 +0000
87449+++ linux-2.6.16.33/include/asm-i386/mach-xen/asm/pgtable.h 2007-01-08 15:00:46.000000000 +0000
87450@@ -0,0 +1,510 @@
87451+#ifndef _I386_PGTABLE_H
87452+#define _I386_PGTABLE_H
87453+
87454+#include <linux/config.h>
87455+#include <asm/hypervisor.h>
87456+
87457+/*
87458+ * The Linux memory management assumes a three-level page table setup. On
87459+ * the i386, we use that, but "fold" the mid level into the top-level page
87460+ * table, so that we physically have the same two-level page table as the
87461+ * i386 mmu expects.
87462+ *
87463+ * This file contains the functions and defines necessary to modify and use
87464+ * the i386 page table tree.
87465+ */
87466+#ifndef __ASSEMBLY__
87467+#include <asm/processor.h>
87468+#include <asm/fixmap.h>
87469+#include <linux/threads.h>
87470+
87471+#ifndef _I386_BITOPS_H
87472+#include <asm/bitops.h>
87473+#endif
87474+
87475+#include <linux/slab.h>
87476+#include <linux/list.h>
87477+#include <linux/spinlock.h>
87478+
87479+struct mm_struct;
87480+struct vm_area_struct;
87481+
87482+/*
87483+ * ZERO_PAGE is a global shared page that is always zero: used
87484+ * for zero-mapped memory areas etc..
87485+ */
87486+#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page))
87487+extern unsigned long empty_zero_page[1024];
87488+extern pgd_t *swapper_pg_dir;
87489+extern kmem_cache_t *pgd_cache;
87490+extern kmem_cache_t *pmd_cache;
87491+extern spinlock_t pgd_lock;
87492+extern struct page *pgd_list;
87493+
87494+void pmd_ctor(void *, kmem_cache_t *, unsigned long);
87495+void pgd_ctor(void *, kmem_cache_t *, unsigned long);
87496+void pgd_dtor(void *, kmem_cache_t *, unsigned long);
87497+void pgtable_cache_init(void);
87498+void paging_init(void);
87499+
87500+/*
87501+ * The Linux x86 paging architecture is 'compile-time dual-mode', it
87502+ * implements both the traditional 2-level x86 page tables and the
87503+ * newer 3-level PAE-mode page tables.
87504+ */
87505+#ifdef CONFIG_X86_PAE
87506+# include <asm/pgtable-3level-defs.h>
87507+# define PMD_SIZE (1UL << PMD_SHIFT)
87508+# define PMD_MASK (~(PMD_SIZE-1))
87509+#else
87510+# include <asm/pgtable-2level-defs.h>
87511+#endif
87512+
87513+#define PGDIR_SIZE (1UL << PGDIR_SHIFT)
87514+#define PGDIR_MASK (~(PGDIR_SIZE-1))
87515+
87516+#define USER_PTRS_PER_PGD (TASK_SIZE/PGDIR_SIZE)
87517+#define FIRST_USER_ADDRESS 0
87518+
87519+#define USER_PGD_PTRS (PAGE_OFFSET >> PGDIR_SHIFT)
87520+#define KERNEL_PGD_PTRS (PTRS_PER_PGD-USER_PGD_PTRS)
87521+
87522+#define TWOLEVEL_PGDIR_SHIFT 22
87523+#define BOOT_USER_PGD_PTRS (__PAGE_OFFSET >> TWOLEVEL_PGDIR_SHIFT)
87524+#define BOOT_KERNEL_PGD_PTRS (1024-BOOT_USER_PGD_PTRS)
87525+
87526+/* Just any arbitrary offset to the start of the vmalloc VM area: the
87527+ * current 8MB value just means that there will be a 8MB "hole" after the
87528+ * physical memory until the kernel virtual memory starts. That means that
87529+ * any out-of-bounds memory accesses will hopefully be caught.
87530+ * The vmalloc() routines leave a hole of 4kB between each vmalloced
87531+ * area for the same reason. ;)
87532+ */
87533+#define VMALLOC_OFFSET (8*1024*1024)
87534+#define VMALLOC_START (((unsigned long) high_memory + vmalloc_earlyreserve + \
87535+ 2*VMALLOC_OFFSET-1) & ~(VMALLOC_OFFSET-1))
87536+#ifdef CONFIG_HIGHMEM
87537+# define VMALLOC_END (PKMAP_BASE-2*PAGE_SIZE)
87538+#else
87539+# define VMALLOC_END (FIXADDR_START-2*PAGE_SIZE)
87540+#endif
87541+
87542+/*
87543+ * _PAGE_PSE set in the page directory entry just means that
87544+ * the page directory entry points directly to a 4MB-aligned block of
87545+ * memory.
87546+ */
87547+#define _PAGE_BIT_PRESENT 0
87548+#define _PAGE_BIT_RW 1
87549+#define _PAGE_BIT_USER 2
87550+#define _PAGE_BIT_PWT 3
87551+#define _PAGE_BIT_PCD 4
87552+#define _PAGE_BIT_ACCESSED 5
87553+#define _PAGE_BIT_DIRTY 6
87554+#define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page, Pentium+, if present.. */
87555+#define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */
87556+#define _PAGE_BIT_UNUSED1 9 /* available for programmer */
87557+#define _PAGE_BIT_UNUSED2 10
87558+#define _PAGE_BIT_UNUSED3 11
87559+#define _PAGE_BIT_NX 63
87560+
87561+#define _PAGE_PRESENT 0x001
87562+#define _PAGE_RW 0x002
87563+#define _PAGE_USER 0x004
87564+#define _PAGE_PWT 0x008
87565+#define _PAGE_PCD 0x010
87566+#define _PAGE_ACCESSED 0x020
87567+#define _PAGE_DIRTY 0x040
87568+#define _PAGE_PSE 0x080 /* 4 MB (or 2MB) page, Pentium+, if present.. */
87569+#define _PAGE_GLOBAL 0x100 /* Global TLB entry PPro+ */
87570+#define _PAGE_UNUSED1 0x200 /* available for programmer */
87571+#define _PAGE_UNUSED2 0x400
87572+#define _PAGE_UNUSED3 0x800
87573+
87574+/* If _PAGE_PRESENT is clear, we use these: */
87575+#define _PAGE_FILE 0x040 /* nonlinear file mapping, saved PTE; unset:swap */
87576+#define _PAGE_PROTNONE 0x080 /* if the user mapped it with PROT_NONE;
87577+ pte_present gives true */
87578+#ifdef CONFIG_X86_PAE
87579+#define _PAGE_NX (1ULL<<_PAGE_BIT_NX)
87580+#else
87581+#define _PAGE_NX 0
87582+#endif
87583+
87584+#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY)
87585+#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
87586+#define _PAGE_CHG_MASK (PTE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY)
87587+
87588+#define PAGE_NONE \
87589+ __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED)
87590+#define PAGE_SHARED \
87591+ __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED)
87592+
87593+#define PAGE_SHARED_EXEC \
87594+ __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED)
87595+#define PAGE_COPY_NOEXEC \
87596+ __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
87597+#define PAGE_COPY_EXEC \
87598+ __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
87599+#define PAGE_COPY \
87600+ PAGE_COPY_NOEXEC
87601+#define PAGE_READONLY \
87602+ __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
87603+#define PAGE_READONLY_EXEC \
87604+ __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
87605+
87606+#define _PAGE_KERNEL \
87607+ (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_NX)
87608+#define _PAGE_KERNEL_EXEC \
87609+ (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED)
87610+
87611+extern unsigned long long __PAGE_KERNEL, __PAGE_KERNEL_EXEC;
87612+#define __PAGE_KERNEL_RO (__PAGE_KERNEL & ~_PAGE_RW)
87613+#define __PAGE_KERNEL_NOCACHE (__PAGE_KERNEL | _PAGE_PCD)
87614+#define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE)
87615+#define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE)
87616+
87617+#define PAGE_KERNEL __pgprot(__PAGE_KERNEL)
87618+#define PAGE_KERNEL_RO __pgprot(__PAGE_KERNEL_RO)
87619+#define PAGE_KERNEL_EXEC __pgprot(__PAGE_KERNEL_EXEC)
87620+#define PAGE_KERNEL_NOCACHE __pgprot(__PAGE_KERNEL_NOCACHE)
87621+#define PAGE_KERNEL_LARGE __pgprot(__PAGE_KERNEL_LARGE)
87622+#define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC)
87623+
87624+/*
87625+ * The i386 can't do page protection for execute, and treats it the
87626+ * same as read. Also, write permissions imply read permissions.
87627+ * This is the closest we can get..
87628+ */
87629+#define __P000 PAGE_NONE
87630+#define __P001 PAGE_READONLY
87631+#define __P010 PAGE_COPY
87632+#define __P011 PAGE_COPY
87633+#define __P100 PAGE_READONLY_EXEC
87634+#define __P101 PAGE_READONLY_EXEC
87635+#define __P110 PAGE_COPY_EXEC
87636+#define __P111 PAGE_COPY_EXEC
87637+
87638+#define __S000 PAGE_NONE
87639+#define __S001 PAGE_READONLY
87640+#define __S010 PAGE_SHARED
87641+#define __S011 PAGE_SHARED
87642+#define __S100 PAGE_READONLY_EXEC
87643+#define __S101 PAGE_READONLY_EXEC
87644+#define __S110 PAGE_SHARED_EXEC
87645+#define __S111 PAGE_SHARED_EXEC
87646+
87647+/*
87648+ * Define this if things work differently on an i386 and an i486:
87649+ * it will (on an i486) warn about kernel memory accesses that are
87650+ * done without an 'access_ok(VERIFY_WRITE,..)'
87651+ */
87652+#undef TEST_ACCESS_OK
87653+
87654+/* The boot page tables (all created as a single array) */
87655+extern unsigned long pg0[];
87656+
87657+#define pte_present(x) ((x).pte_low & (_PAGE_PRESENT | _PAGE_PROTNONE))
87658+
87659+/* To avoid harmful races, pmd_none(x) should check only the lower word when PAE */
87660+#define pmd_none(x) (!(unsigned long)pmd_val(x))
87661+/* pmd_present doesn't just test the _PAGE_PRESENT bit since wr.p.t.
87662+ can temporarily clear it. */
87663+#define pmd_present(x) (pmd_val(x))
87664+#define pmd_bad(x) ((pmd_val(x) & (~PAGE_MASK & ~_PAGE_USER & ~_PAGE_PRESENT)) != (_KERNPG_TABLE & ~_PAGE_PRESENT))
87665+
87666+
87667+#define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT))
87668+
87669+/*
87670+ * The following only work if pte_present() is true.
87671+ * Undefined behaviour if not..
87672+ */
87673+#define __LARGE_PTE (_PAGE_PSE | _PAGE_PRESENT)
87674+static inline int pte_user(pte_t pte) { return (pte).pte_low & _PAGE_USER; }
87675+static inline int pte_read(pte_t pte) { return (pte).pte_low & _PAGE_USER; }
87676+static inline int pte_dirty(pte_t pte) { return (pte).pte_low & _PAGE_DIRTY; }
87677+static inline int pte_young(pte_t pte) { return (pte).pte_low & _PAGE_ACCESSED; }
87678+static inline int pte_write(pte_t pte) { return (pte).pte_low & _PAGE_RW; }
87679+static inline int pte_huge(pte_t pte) { return ((pte).pte_low & __LARGE_PTE) == __LARGE_PTE; }
87680+
87681+/*
87682+ * The following only works if pte_present() is not true.
87683+ */
87684+static inline int pte_file(pte_t pte) { return (pte).pte_low & _PAGE_FILE; }
87685+
87686+static inline pte_t pte_rdprotect(pte_t pte) { (pte).pte_low &= ~_PAGE_USER; return pte; }
87687+static inline pte_t pte_exprotect(pte_t pte) { (pte).pte_low &= ~_PAGE_USER; return pte; }
87688+static inline pte_t pte_mkclean(pte_t pte) { (pte).pte_low &= ~_PAGE_DIRTY; return pte; }
87689+static inline pte_t pte_mkold(pte_t pte) { (pte).pte_low &= ~_PAGE_ACCESSED; return pte; }
87690+static inline pte_t pte_wrprotect(pte_t pte) { (pte).pte_low &= ~_PAGE_RW; return pte; }
87691+static inline pte_t pte_mkread(pte_t pte) { (pte).pte_low |= _PAGE_USER; return pte; }
87692+static inline pte_t pte_mkexec(pte_t pte) { (pte).pte_low |= _PAGE_USER; return pte; }
87693+static inline pte_t pte_mkdirty(pte_t pte) { (pte).pte_low |= _PAGE_DIRTY; return pte; }
87694+static inline pte_t pte_mkyoung(pte_t pte) { (pte).pte_low |= _PAGE_ACCESSED; return pte; }
87695+static inline pte_t pte_mkwrite(pte_t pte) { (pte).pte_low |= _PAGE_RW; return pte; }
87696+static inline pte_t pte_mkhuge(pte_t pte) { (pte).pte_low |= __LARGE_PTE; return pte; }
87697+
87698+#ifdef CONFIG_X86_PAE
87699+# include <asm/pgtable-3level.h>
87700+#else
87701+# include <asm/pgtable-2level.h>
87702+#endif
87703+
87704+static inline int ptep_test_and_clear_dirty(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep)
87705+{
87706+ if (!pte_dirty(*ptep))
87707+ return 0;
87708+ return test_and_clear_bit(_PAGE_BIT_DIRTY, &ptep->pte_low);
87709+}
87710+
87711+static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep)
87712+{
87713+ if (!pte_young(*ptep))
87714+ return 0;
87715+ return test_and_clear_bit(_PAGE_BIT_ACCESSED, &ptep->pte_low);
87716+}
87717+
87718+static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, unsigned long addr, pte_t *ptep, int full)
87719+{
87720+ pte_t pte;
87721+ if (full) {
87722+ pte = *ptep;
87723+ pte_clear(mm, addr, ptep);
87724+ } else {
87725+ pte = ptep_get_and_clear(mm, addr, ptep);
87726+ }
87727+ return pte;
87728+}
87729+
87730+static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
87731+{
87732+ if (pte_write(*ptep))
87733+ clear_bit(_PAGE_BIT_RW, &ptep->pte_low);
87734+}
87735+
87736+/*
87737+ * clone_pgd_range(pgd_t *dst, pgd_t *src, int count);
87738+ *
87739+ * dst - pointer to pgd range anywhere on a pgd page
87740+ * src - ""
87741+ * count - the number of pgds to copy.
87742+ *
87743+ * dst and src can be on the same page, but the range must not overlap,
87744+ * and must not cross a page boundary.
87745+ */
87746+static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
87747+{
87748+ memcpy(dst, src, count * sizeof(pgd_t));
87749+}
87750+
87751+/*
87752+ * Macro to mark a page protection value as "uncacheable". On processors which do not support
87753+ * it, this is a no-op.
87754+ */
87755+#define pgprot_noncached(prot) ((boot_cpu_data.x86 > 3) \
87756+ ? (__pgprot(pgprot_val(prot) | _PAGE_PCD | _PAGE_PWT)) : (prot))
87757+
87758+/*
87759+ * Conversion functions: convert a page and protection to a page entry,
87760+ * and a page entry and page directory to the page they refer to.
87761+ */
87762+
87763+#define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot))
87764+
87765+static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
87766+{
87767+ pte.pte_low &= _PAGE_CHG_MASK;
87768+ pte.pte_low |= pgprot_val(newprot);
87769+#ifdef CONFIG_X86_PAE
87770+ /*
87771+ * Chop off the NX bit (if present), and add the NX portion of
87772+ * the newprot (if present):
87773+ */
87774+ pte.pte_high &= ~(1 << (_PAGE_BIT_NX - 32));
87775+ pte.pte_high |= (pgprot_val(newprot) >> 32) & \
87776+ (__supported_pte_mask >> 32);
87777+#endif
87778+ return pte;
87779+}
87780+
87781+#define pmd_large(pmd) \
87782+((pmd_val(pmd) & (_PAGE_PSE|_PAGE_PRESENT)) == (_PAGE_PSE|_PAGE_PRESENT))
87783+
87784+/*
87785+ * the pgd page can be thought of an array like this: pgd_t[PTRS_PER_PGD]
87786+ *
87787+ * this macro returns the index of the entry in the pgd page which would
87788+ * control the given virtual address
87789+ */
87790+#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD-1))
87791+#define pgd_index_k(addr) pgd_index(addr)
87792+
87793+/*
87794+ * pgd_offset() returns a (pgd_t *)
87795+ * pgd_index() is used get the offset into the pgd page's array of pgd_t's;
87796+ */
87797+#define pgd_offset(mm, address) ((mm)->pgd+pgd_index(address))
87798+
87799+/*
87800+ * a shortcut which implies the use of the kernel's pgd, instead
87801+ * of a process's
87802+ */
87803+#define pgd_offset_k(address) pgd_offset(&init_mm, address)
87804+
87805+/*
87806+ * the pmd page can be thought of an array like this: pmd_t[PTRS_PER_PMD]
87807+ *
87808+ * this macro returns the index of the entry in the pmd page which would
87809+ * control the given virtual address
87810+ */
87811+#define pmd_index(address) \
87812+ (((address) >> PMD_SHIFT) & (PTRS_PER_PMD-1))
87813+
87814+/*
87815+ * the pte page can be thought of an array like this: pte_t[PTRS_PER_PTE]
87816+ *
87817+ * this macro returns the index of the entry in the pte page which would
87818+ * control the given virtual address
87819+ */
87820+#define pte_index(address) \
87821+ (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
87822+#define pte_offset_kernel(dir, address) \
87823+ ((pte_t *) pmd_page_kernel(*(dir)) + pte_index(address))
87824+
87825+#define pmd_page(pmd) (pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT))
87826+
87827+#define pmd_page_kernel(pmd) \
87828+ ((unsigned long) __va(pmd_val(pmd) & PAGE_MASK))
87829+
87830+/*
87831+ * Helper function that returns the kernel pagetable entry controlling
87832+ * the virtual address 'address'. NULL means no pagetable entry present.
87833+ * NOTE: the return type is pte_t but if the pmd is PSE then we return it
87834+ * as a pte too.
87835+ */
87836+extern pte_t *lookup_address(unsigned long address);
87837+
87838+/*
87839+ * Make a given kernel text page executable/non-executable.
87840+ * Returns the previous executability setting of that page (which
87841+ * is used to restore the previous state). Used by the SMP bootup code.
87842+ * NOTE: this is an __init function for security reasons.
87843+ */
87844+#ifdef CONFIG_X86_PAE
87845+ extern int set_kernel_exec(unsigned long vaddr, int enable);
87846+#else
87847+ static inline int set_kernel_exec(unsigned long vaddr, int enable) { return 0;}
87848+#endif
87849+
87850+extern void noexec_setup(const char *str);
87851+
87852+#if defined(CONFIG_HIGHPTE)
87853+#define pte_offset_map(dir, address) \
87854+ ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)),KM_PTE0) + \
87855+ pte_index(address))
87856+#define pte_offset_map_nested(dir, address) \
87857+ ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)),KM_PTE1) + \
87858+ pte_index(address))
87859+#define pte_unmap(pte) kunmap_atomic(pte, KM_PTE0)
87860+#define pte_unmap_nested(pte) kunmap_atomic(pte, KM_PTE1)
87861+#else
87862+#define pte_offset_map(dir, address) \
87863+ ((pte_t *)page_address(pmd_page(*(dir))) + pte_index(address))
87864+#define pte_offset_map_nested(dir, address) pte_offset_map(dir, address)
87865+#define pte_unmap(pte) do { } while (0)
87866+#define pte_unmap_nested(pte) do { } while (0)
87867+#endif
87868+
87869+/*
87870+ * The i386 doesn't have any external MMU info: the kernel page
87871+ * tables contain all the necessary information.
87872+ *
87873+ * Also, we only update the dirty/accessed state if we set
87874+ * the dirty bit by hand in the kernel, since the hardware
87875+ * will do the accessed bit for us, and we don't want to
87876+ * race with other CPU's that might be updating the dirty
87877+ * bit at the same time.
87878+ */
87879+#define update_mmu_cache(vma,address,pte) do { } while (0)
87880+#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
87881+#define ptep_set_access_flags(__vma, __address, __ptep, __entry, __dirty) \
87882+ do { \
87883+ if (__dirty) { \
87884+ if ( likely((__vma)->vm_mm == current->mm) ) { \
87885+ BUG_ON(HYPERVISOR_update_va_mapping((__address), (__entry), UVMF_INVLPG|UVMF_MULTI|(unsigned long)((__vma)->vm_mm->cpu_vm_mask.bits))); \
87886+ } else { \
87887+ xen_l1_entry_update((__ptep), (__entry)); \
87888+ flush_tlb_page((__vma), (__address)); \
87889+ } \
87890+ } \
87891+ } while (0)
87892+
87893+#define __HAVE_ARCH_PTEP_ESTABLISH
87894+#define ptep_establish(__vma, __address, __ptep, __entry) \
87895+do { \
87896+ ptep_set_access_flags(__vma, __address, __ptep, __entry, 1); \
87897+} while (0)
87898+
87899+#include <xen/features.h>
87900+void make_lowmem_page_readonly(void *va, unsigned int feature);
87901+void make_lowmem_page_writable(void *va, unsigned int feature);
87902+void make_page_readonly(void *va, unsigned int feature);
87903+void make_page_writable(void *va, unsigned int feature);
87904+void make_pages_readonly(void *va, unsigned int nr, unsigned int feature);
87905+void make_pages_writable(void *va, unsigned int nr, unsigned int feature);
87906+
87907+#define virt_to_ptep(__va) \
87908+({ \
87909+ pgd_t *__pgd = pgd_offset_k((unsigned long)(__va)); \
87910+ pud_t *__pud = pud_offset(__pgd, (unsigned long)(__va)); \
87911+ pmd_t *__pmd = pmd_offset(__pud, (unsigned long)(__va)); \
87912+ pte_offset_kernel(__pmd, (unsigned long)(__va)); \
87913+})
87914+
87915+#define arbitrary_virt_to_machine(__va) \
87916+({ \
87917+ maddr_t m = (maddr_t)pte_mfn(*virt_to_ptep(__va)) << PAGE_SHIFT;\
87918+ m | ((unsigned long)(__va) & (PAGE_SIZE-1)); \
87919+})
87920+
87921+#endif /* !__ASSEMBLY__ */
87922+
87923+#ifdef CONFIG_FLATMEM
87924+#define kern_addr_valid(addr) (1)
87925+#endif /* CONFIG_FLATMEM */
87926+
87927+int direct_remap_pfn_range(struct vm_area_struct *vma,
87928+ unsigned long address,
87929+ unsigned long mfn,
87930+ unsigned long size,
87931+ pgprot_t prot,
87932+ domid_t domid);
87933+int direct_kernel_remap_pfn_range(unsigned long address,
87934+ unsigned long mfn,
87935+ unsigned long size,
87936+ pgprot_t prot,
87937+ domid_t domid);
87938+int create_lookup_pte_addr(struct mm_struct *mm,
87939+ unsigned long address,
87940+ uint64_t *ptep);
87941+int touch_pte_range(struct mm_struct *mm,
87942+ unsigned long address,
87943+ unsigned long size);
87944+
87945+#define io_remap_pfn_range(vma,from,pfn,size,prot) \
87946+direct_remap_pfn_range(vma,from,pfn,size,prot,DOMID_IO)
87947+
87948+#define MK_IOSPACE_PFN(space, pfn) (pfn)
87949+#define GET_IOSPACE(pfn) 0
87950+#define GET_PFN(pfn) (pfn)
87951+
87952+#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
87953+#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_DIRTY
87954+#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
87955+#define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
87956+#define __HAVE_ARCH_PTEP_SET_WRPROTECT
87957+#define __HAVE_ARCH_PTE_SAME
87958+#include <asm-generic/pgtable.h>
87959+
87960+#endif /* _I386_PGTABLE_H */
87961diff -Nur linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/processor.h linux-2.6.16.33/include/asm-i386/mach-xen/asm/processor.h
87962--- linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/processor.h 1970-01-01 00:00:00.000000000 +0000
87963+++ linux-2.6.16.33/include/asm-i386/mach-xen/asm/processor.h 2007-01-08 15:00:46.000000000 +0000
87964@@ -0,0 +1,750 @@
87965+/*
87966+ * include/asm-i386/processor.h
87967+ *
87968+ * Copyright (C) 1994 Linus Torvalds
87969+ */
87970+
87971+#ifndef __ASM_I386_PROCESSOR_H
87972+#define __ASM_I386_PROCESSOR_H
87973+
87974+#include <asm/vm86.h>
87975+#include <asm/math_emu.h>
87976+#include <asm/segment.h>
87977+#include <asm/page.h>
87978+#include <asm/types.h>
87979+#include <asm/sigcontext.h>
87980+#include <asm/cpufeature.h>
87981+#include <asm/msr.h>
87982+#include <asm/system.h>
87983+#include <linux/cache.h>
87984+#include <linux/config.h>
87985+#include <linux/threads.h>
87986+#include <asm/percpu.h>
87987+#include <xen/interface/physdev.h>
87988+
87989+/* flag for disabling the tsc */
87990+extern int tsc_disable;
87991+
87992+struct desc_struct {
87993+ unsigned long a,b;
87994+};
87995+
87996+#define desc_empty(desc) \
87997+ (!((desc)->a | (desc)->b))
87998+
87999+#define desc_equal(desc1, desc2) \
88000+ (((desc1)->a == (desc2)->a) && ((desc1)->b == (desc2)->b))
88001+/*
88002+ * Default implementation of macro that returns current
88003+ * instruction pointer ("program counter").
88004+ */
88005+#define current_text_addr() ({ void *pc; __asm__("movl $1f,%0\n1:":"=g" (pc)); pc; })
88006+
88007+/*
88008+ * CPU type and hardware bug flags. Kept separately for each CPU.
88009+ * Members of this structure are referenced in head.S, so think twice
88010+ * before touching them. [mj]
88011+ */
88012+
88013+struct cpuinfo_x86 {
88014+ __u8 x86; /* CPU family */
88015+ __u8 x86_vendor; /* CPU vendor */
88016+ __u8 x86_model;
88017+ __u8 x86_mask;
88018+ char wp_works_ok; /* It doesn't on 386's */
88019+ char hlt_works_ok; /* Problems on some 486Dx4's and old 386's */
88020+ char hard_math;
88021+ char rfu;
88022+ int cpuid_level; /* Maximum supported CPUID level, -1=no CPUID */
88023+ unsigned long x86_capability[NCAPINTS];
88024+ char x86_vendor_id[16];
88025+ char x86_model_id[64];
88026+ int x86_cache_size; /* in KB - valid for CPUS which support this
88027+ call */
88028+ int x86_cache_alignment; /* In bytes */
88029+ char fdiv_bug;
88030+ char f00f_bug;
88031+ char coma_bug;
88032+ char pad0;
88033+ int x86_power;
88034+ unsigned long loops_per_jiffy;
88035+ unsigned char x86_max_cores; /* cpuid returned max cores value */
88036+ unsigned char booted_cores; /* number of cores as seen by OS */
88037+ unsigned char apicid;
88038+} __attribute__((__aligned__(SMP_CACHE_BYTES)));
88039+
88040+#define X86_VENDOR_INTEL 0
88041+#define X86_VENDOR_CYRIX 1
88042+#define X86_VENDOR_AMD 2
88043+#define X86_VENDOR_UMC 3
88044+#define X86_VENDOR_NEXGEN 4
88045+#define X86_VENDOR_CENTAUR 5
88046+#define X86_VENDOR_RISE 6
88047+#define X86_VENDOR_TRANSMETA 7
88048+#define X86_VENDOR_NSC 8
88049+#define X86_VENDOR_NUM 9
88050+#define X86_VENDOR_UNKNOWN 0xff
88051+
88052+/*
88053+ * capabilities of CPUs
88054+ */
88055+
88056+extern struct cpuinfo_x86 boot_cpu_data;
88057+extern struct cpuinfo_x86 new_cpu_data;
88058+#ifndef CONFIG_X86_NO_TSS
88059+extern struct tss_struct doublefault_tss;
88060+DECLARE_PER_CPU(struct tss_struct, init_tss);
88061+#endif
88062+
88063+#ifdef CONFIG_SMP
88064+extern struct cpuinfo_x86 cpu_data[];
88065+#define current_cpu_data cpu_data[smp_processor_id()]
88066+#else
88067+#define cpu_data (&boot_cpu_data)
88068+#define current_cpu_data boot_cpu_data
88069+#endif
88070+
88071+extern int phys_proc_id[NR_CPUS];
88072+extern int cpu_core_id[NR_CPUS];
88073+extern char ignore_fpu_irq;
88074+
88075+extern void identify_cpu(struct cpuinfo_x86 *);
88076+extern void print_cpu_info(struct cpuinfo_x86 *);
88077+extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c);
88078+
88079+#ifdef CONFIG_X86_HT
88080+extern void detect_ht(struct cpuinfo_x86 *c);
88081+#else
88082+static inline void detect_ht(struct cpuinfo_x86 *c) {}
88083+#endif
88084+
88085+/*
88086+ * EFLAGS bits
88087+ */
88088+#define X86_EFLAGS_CF 0x00000001 /* Carry Flag */
88089+#define X86_EFLAGS_PF 0x00000004 /* Parity Flag */
88090+#define X86_EFLAGS_AF 0x00000010 /* Auxiliary carry Flag */
88091+#define X86_EFLAGS_ZF 0x00000040 /* Zero Flag */
88092+#define X86_EFLAGS_SF 0x00000080 /* Sign Flag */
88093+#define X86_EFLAGS_TF 0x00000100 /* Trap Flag */
88094+#define X86_EFLAGS_IF 0x00000200 /* Interrupt Flag */
88095+#define X86_EFLAGS_DF 0x00000400 /* Direction Flag */
88096+#define X86_EFLAGS_OF 0x00000800 /* Overflow Flag */
88097+#define X86_EFLAGS_IOPL 0x00003000 /* IOPL mask */
88098+#define X86_EFLAGS_NT 0x00004000 /* Nested Task */
88099+#define X86_EFLAGS_RF 0x00010000 /* Resume Flag */
88100+#define X86_EFLAGS_VM 0x00020000 /* Virtual Mode */
88101+#define X86_EFLAGS_AC 0x00040000 /* Alignment Check */
88102+#define X86_EFLAGS_VIF 0x00080000 /* Virtual Interrupt Flag */
88103+#define X86_EFLAGS_VIP 0x00100000 /* Virtual Interrupt Pending */
88104+#define X86_EFLAGS_ID 0x00200000 /* CPUID detection flag */
88105+
88106+/*
88107+ * Generic CPUID function
88108+ * clear %ecx since some cpus (Cyrix MII) do not set or clear %ecx
88109+ * resulting in stale register contents being returned.
88110+ */
88111+static inline void cpuid(unsigned int op, unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx)
88112+{
88113+ __asm__(XEN_CPUID
88114+ : "=a" (*eax),
88115+ "=b" (*ebx),
88116+ "=c" (*ecx),
88117+ "=d" (*edx)
88118+ : "0" (op), "c"(0));
88119+}
88120+
88121+/* Some CPUID calls want 'count' to be placed in ecx */
88122+static inline void cpuid_count(int op, int count, int *eax, int *ebx, int *ecx,
88123+ int *edx)
88124+{
88125+ __asm__(XEN_CPUID
88126+ : "=a" (*eax),
88127+ "=b" (*ebx),
88128+ "=c" (*ecx),
88129+ "=d" (*edx)
88130+ : "0" (op), "c" (count));
88131+}
88132+
88133+/*
88134+ * CPUID functions returning a single datum
88135+ */
88136+static inline unsigned int cpuid_eax(unsigned int op)
88137+{
88138+ unsigned int eax;
88139+
88140+ __asm__(XEN_CPUID
88141+ : "=a" (eax)
88142+ : "0" (op)
88143+ : "bx", "cx", "dx");
88144+ return eax;
88145+}
88146+static inline unsigned int cpuid_ebx(unsigned int op)
88147+{
88148+ unsigned int eax, ebx;
88149+
88150+ __asm__(XEN_CPUID
88151+ : "=a" (eax), "=b" (ebx)
88152+ : "0" (op)
88153+ : "cx", "dx" );
88154+ return ebx;
88155+}
88156+static inline unsigned int cpuid_ecx(unsigned int op)
88157+{
88158+ unsigned int eax, ecx;
88159+
88160+ __asm__(XEN_CPUID
88161+ : "=a" (eax), "=c" (ecx)
88162+ : "0" (op)
88163+ : "bx", "dx" );
88164+ return ecx;
88165+}
88166+static inline unsigned int cpuid_edx(unsigned int op)
88167+{
88168+ unsigned int eax, edx;
88169+
88170+ __asm__(XEN_CPUID
88171+ : "=a" (eax), "=d" (edx)
88172+ : "0" (op)
88173+ : "bx", "cx");
88174+ return edx;
88175+}
88176+
88177+#define load_cr3(pgdir) write_cr3(__pa(pgdir))
88178+
88179+/*
88180+ * Intel CPU features in CR4
88181+ */
88182+#define X86_CR4_VME 0x0001 /* enable vm86 extensions */
88183+#define X86_CR4_PVI 0x0002 /* virtual interrupts flag enable */
88184+#define X86_CR4_TSD 0x0004 /* disable time stamp at ipl 3 */
88185+#define X86_CR4_DE 0x0008 /* enable debugging extensions */
88186+#define X86_CR4_PSE 0x0010 /* enable page size extensions */
88187+#define X86_CR4_PAE 0x0020 /* enable physical address extensions */
88188+#define X86_CR4_MCE 0x0040 /* Machine check enable */
88189+#define X86_CR4_PGE 0x0080 /* enable global pages */
88190+#define X86_CR4_PCE 0x0100 /* enable performance counters at ipl 3 */
88191+#define X86_CR4_OSFXSR 0x0200 /* enable fast FPU save and restore */
88192+#define X86_CR4_OSXMMEXCPT 0x0400 /* enable unmasked SSE exceptions */
88193+
88194+/*
88195+ * Save the cr4 feature set we're using (ie
88196+ * Pentium 4MB enable and PPro Global page
88197+ * enable), so that any CPU's that boot up
88198+ * after us can get the correct flags.
88199+ */
88200+extern unsigned long mmu_cr4_features;
88201+
88202+static inline void set_in_cr4 (unsigned long mask)
88203+{
88204+ unsigned cr4;
88205+ mmu_cr4_features |= mask;
88206+ cr4 = read_cr4();
88207+ cr4 |= mask;
88208+ write_cr4(cr4);
88209+}
88210+
88211+static inline void clear_in_cr4 (unsigned long mask)
88212+{
88213+ unsigned cr4;
88214+ mmu_cr4_features &= ~mask;
88215+ cr4 = read_cr4();
88216+ cr4 &= ~mask;
88217+ write_cr4(cr4);
88218+}
88219+
88220+/*
88221+ * NSC/Cyrix CPU configuration register indexes
88222+ */
88223+
88224+#define CX86_PCR0 0x20
88225+#define CX86_GCR 0xb8
88226+#define CX86_CCR0 0xc0
88227+#define CX86_CCR1 0xc1
88228+#define CX86_CCR2 0xc2
88229+#define CX86_CCR3 0xc3
88230+#define CX86_CCR4 0xe8
88231+#define CX86_CCR5 0xe9
88232+#define CX86_CCR6 0xea
88233+#define CX86_CCR7 0xeb
88234+#define CX86_PCR1 0xf0
88235+#define CX86_DIR0 0xfe
88236+#define CX86_DIR1 0xff
88237+#define CX86_ARR_BASE 0xc4
88238+#define CX86_RCR_BASE 0xdc
88239+
88240+/*
88241+ * NSC/Cyrix CPU indexed register access macros
88242+ */
88243+
88244+#define getCx86(reg) ({ outb((reg), 0x22); inb(0x23); })
88245+
88246+#define setCx86(reg, data) do { \
88247+ outb((reg), 0x22); \
88248+ outb((data), 0x23); \
88249+} while (0)
88250+
88251+/* Stop speculative execution */
88252+static inline void sync_core(void)
88253+{
88254+ int tmp;
88255+ asm volatile("cpuid" : "=a" (tmp) : "0" (1) : "ebx","ecx","edx","memory");
88256+}
88257+
88258+static inline void __monitor(const void *eax, unsigned long ecx,
88259+ unsigned long edx)
88260+{
88261+ /* "monitor %eax,%ecx,%edx;" */
88262+ asm volatile(
88263+ ".byte 0x0f,0x01,0xc8;"
88264+ : :"a" (eax), "c" (ecx), "d"(edx));
88265+}
88266+
88267+static inline void __mwait(unsigned long eax, unsigned long ecx)
88268+{
88269+ /* "mwait %eax,%ecx;" */
88270+ asm volatile(
88271+ ".byte 0x0f,0x01,0xc9;"
88272+ : :"a" (eax), "c" (ecx));
88273+}
88274+
88275+/* from system description table in BIOS. Mostly for MCA use, but
88276+others may find it useful. */
88277+extern unsigned int machine_id;
88278+extern unsigned int machine_submodel_id;
88279+extern unsigned int BIOS_revision;
88280+extern unsigned int mca_pentium_flag;
88281+
88282+/* Boot loader type from the setup header */
88283+extern int bootloader_type;
88284+
88285+/*
88286+ * User space process size: 3GB (default).
88287+ */
88288+#define TASK_SIZE (PAGE_OFFSET)
88289+
88290+/* This decides where the kernel will search for a free chunk of vm
88291+ * space during mmap's.
88292+ */
88293+#define TASK_UNMAPPED_BASE (PAGE_ALIGN(TASK_SIZE / 3))
88294+
88295+#define HAVE_ARCH_PICK_MMAP_LAYOUT
88296+
88297+/*
88298+ * Size of io_bitmap.
88299+ */
88300+#define IO_BITMAP_BITS 65536
88301+#define IO_BITMAP_BYTES (IO_BITMAP_BITS/8)
88302+#define IO_BITMAP_LONGS (IO_BITMAP_BYTES/sizeof(long))
88303+#ifndef CONFIG_X86_NO_TSS
88304+#define IO_BITMAP_OFFSET offsetof(struct tss_struct,io_bitmap)
88305+#endif
88306+#define INVALID_IO_BITMAP_OFFSET 0x8000
88307+#define INVALID_IO_BITMAP_OFFSET_LAZY 0x9000
88308+
88309+struct i387_fsave_struct {
88310+ long cwd;
88311+ long swd;
88312+ long twd;
88313+ long fip;
88314+ long fcs;
88315+ long foo;
88316+ long fos;
88317+ long st_space[20]; /* 8*10 bytes for each FP-reg = 80 bytes */
88318+ long status; /* software status information */
88319+};
88320+
88321+struct i387_fxsave_struct {
88322+ unsigned short cwd;
88323+ unsigned short swd;
88324+ unsigned short twd;
88325+ unsigned short fop;
88326+ long fip;
88327+ long fcs;
88328+ long foo;
88329+ long fos;
88330+ long mxcsr;
88331+ long mxcsr_mask;
88332+ long st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */
88333+ long xmm_space[32]; /* 8*16 bytes for each XMM-reg = 128 bytes */
88334+ long padding[56];
88335+} __attribute__ ((aligned (16)));
88336+
88337+struct i387_soft_struct {
88338+ long cwd;
88339+ long swd;
88340+ long twd;
88341+ long fip;
88342+ long fcs;
88343+ long foo;
88344+ long fos;
88345+ long st_space[20]; /* 8*10 bytes for each FP-reg = 80 bytes */
88346+ unsigned char ftop, changed, lookahead, no_update, rm, alimit;
88347+ struct info *info;
88348+ unsigned long entry_eip;
88349+};
88350+
88351+union i387_union {
88352+ struct i387_fsave_struct fsave;
88353+ struct i387_fxsave_struct fxsave;
88354+ struct i387_soft_struct soft;
88355+};
88356+
88357+typedef struct {
88358+ unsigned long seg;
88359+} mm_segment_t;
88360+
88361+struct thread_struct;
88362+
88363+#ifndef CONFIG_X86_NO_TSS
88364+struct tss_struct {
88365+ unsigned short back_link,__blh;
88366+ unsigned long esp0;
88367+ unsigned short ss0,__ss0h;
88368+ unsigned long esp1;
88369+ unsigned short ss1,__ss1h; /* ss1 is used to cache MSR_IA32_SYSENTER_CS */
88370+ unsigned long esp2;
88371+ unsigned short ss2,__ss2h;
88372+ unsigned long __cr3;
88373+ unsigned long eip;
88374+ unsigned long eflags;
88375+ unsigned long eax,ecx,edx,ebx;
88376+ unsigned long esp;
88377+ unsigned long ebp;
88378+ unsigned long esi;
88379+ unsigned long edi;
88380+ unsigned short es, __esh;
88381+ unsigned short cs, __csh;
88382+ unsigned short ss, __ssh;
88383+ unsigned short ds, __dsh;
88384+ unsigned short fs, __fsh;
88385+ unsigned short gs, __gsh;
88386+ unsigned short ldt, __ldth;
88387+ unsigned short trace, io_bitmap_base;
88388+ /*
88389+ * The extra 1 is there because the CPU will access an
88390+ * additional byte beyond the end of the IO permission
88391+ * bitmap. The extra byte must be all 1 bits, and must
88392+ * be within the limit.
88393+ */
88394+ unsigned long io_bitmap[IO_BITMAP_LONGS + 1];
88395+ /*
88396+ * Cache the current maximum and the last task that used the bitmap:
88397+ */
88398+ unsigned long io_bitmap_max;
88399+ struct thread_struct *io_bitmap_owner;
88400+ /*
88401+ * pads the TSS to be cacheline-aligned (size is 0x100)
88402+ */
88403+ unsigned long __cacheline_filler[35];
88404+ /*
88405+ * .. and then another 0x100 bytes for emergency kernel stack
88406+ */
88407+ unsigned long stack[64];
88408+} __attribute__((packed));
88409+#endif
88410+
88411+#define ARCH_MIN_TASKALIGN 16
88412+
88413+struct thread_struct {
88414+/* cached TLS descriptors. */
88415+ struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES];
88416+ unsigned long esp0;
88417+ unsigned long sysenter_cs;
88418+ unsigned long eip;
88419+ unsigned long esp;
88420+ unsigned long fs;
88421+ unsigned long gs;
88422+/* Hardware debugging registers */
88423+ unsigned long debugreg[8]; /* %%db0-7 debug registers */
88424+/* fault info */
88425+ unsigned long cr2, trap_no, error_code;
88426+/* floating point info */
88427+ union i387_union i387;
88428+/* virtual 86 mode info */
88429+ struct vm86_struct __user * vm86_info;
88430+ unsigned long screen_bitmap;
88431+ unsigned long v86flags, v86mask, saved_esp0;
88432+ unsigned int saved_fs, saved_gs;
88433+/* IO permissions */
88434+ unsigned long *io_bitmap_ptr;
88435+ unsigned long iopl;
88436+/* max allowed port in the bitmap, in bytes: */
88437+ unsigned long io_bitmap_max;
88438+};
88439+
88440+#define INIT_THREAD { \
88441+ .vm86_info = NULL, \
88442+ .sysenter_cs = __KERNEL_CS, \
88443+ .io_bitmap_ptr = NULL, \
88444+}
88445+
88446+#ifndef CONFIG_X86_NO_TSS
88447+/*
88448+ * Note that the .io_bitmap member must be extra-big. This is because
88449+ * the CPU will access an additional byte beyond the end of the IO
88450+ * permission bitmap. The extra byte must be all 1 bits, and must
88451+ * be within the limit.
88452+ */
88453+#define INIT_TSS { \
88454+ .esp0 = sizeof(init_stack) + (long)&init_stack, \
88455+ .ss0 = __KERNEL_DS, \
88456+ .ss1 = __KERNEL_CS, \
88457+ .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, \
88458+ .io_bitmap = { [ 0 ... IO_BITMAP_LONGS] = ~0 }, \
88459+}
88460+
88461+static inline void __load_esp0(struct tss_struct *tss, struct thread_struct *thread)
88462+{
88463+ tss->esp0 = thread->esp0;
88464+ /* This can only happen when SEP is enabled, no need to test "SEP"arately */
88465+ if (unlikely(tss->ss1 != thread->sysenter_cs)) {
88466+ tss->ss1 = thread->sysenter_cs;
88467+ wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
88468+ }
88469+}
88470+#define load_esp0(tss, thread) \
88471+ __load_esp0(tss, thread)
88472+#else
88473+#define load_esp0(tss, thread) \
88474+ HYPERVISOR_stack_switch(__KERNEL_DS, (thread)->esp0)
88475+#endif
88476+
88477+#define start_thread(regs, new_eip, new_esp) do { \
88478+ __asm__("movl %0,%%fs ; movl %0,%%gs": :"r" (0)); \
88479+ set_fs(USER_DS); \
88480+ regs->xds = __USER_DS; \
88481+ regs->xes = __USER_DS; \
88482+ regs->xss = __USER_DS; \
88483+ regs->xcs = __USER_CS; \
88484+ regs->eip = new_eip; \
88485+ regs->esp = new_esp; \
88486+} while (0)
88487+
88488+/*
88489+ * These special macros can be used to get or set a debugging register
88490+ */
88491+#define get_debugreg(var, register) \
88492+ (var) = HYPERVISOR_get_debugreg((register))
88493+#define set_debugreg(value, register) \
88494+ HYPERVISOR_set_debugreg((register), (value))
88495+
88496+/*
88497+ * Set IOPL bits in EFLAGS from given mask
88498+ */
88499+static inline void set_iopl_mask(unsigned mask)
88500+{
88501+ struct physdev_set_iopl set_iopl;
88502+
88503+ /* Force the change at ring 0. */
88504+ set_iopl.iopl = (mask == 0) ? 1 : (mask >> 12) & 3;
88505+ HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
88506+}
88507+
88508+/* Forward declaration, a strange C thing */
88509+struct task_struct;
88510+struct mm_struct;
88511+
88512+/* Free all resources held by a thread. */
88513+extern void release_thread(struct task_struct *);
88514+
88515+/* Prepare to copy thread state - unlazy all lazy status */
88516+extern void prepare_to_copy(struct task_struct *tsk);
88517+
88518+/*
88519+ * create a kernel thread without removing it from tasklists
88520+ */
88521+extern int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags);
88522+
88523+extern unsigned long thread_saved_pc(struct task_struct *tsk);
88524+void show_trace(struct task_struct *task, unsigned long *stack);
88525+
88526+unsigned long get_wchan(struct task_struct *p);
88527+
88528+#define THREAD_SIZE_LONGS (THREAD_SIZE/sizeof(unsigned long))
88529+#define KSTK_TOP(info) \
88530+({ \
88531+ unsigned long *__ptr = (unsigned long *)(info); \
88532+ (unsigned long)(&__ptr[THREAD_SIZE_LONGS]); \
88533+})
88534+
88535+/*
88536+ * The below -8 is to reserve 8 bytes on top of the ring0 stack.
88537+ * This is necessary to guarantee that the entire "struct pt_regs"
88538+ * is accessible even if the CPU hasn't stored the SS/ESP registers
88539+ * on the stack (interrupt gate does not save these registers
88540+ * when switching to the same priv ring).
88541+ * Therefore beware: accessing the xss/esp fields of the
88542+ * "struct pt_regs" is possible, but they may contain the
88543+ * completely wrong values.
88544+ */
88545+#define task_pt_regs(task) \
88546+({ \
88547+ struct pt_regs *__regs__; \
88548+ __regs__ = (struct pt_regs *)(KSTK_TOP(task_stack_page(task))-8); \
88549+ __regs__ - 1; \
88550+})
88551+
88552+#define KSTK_EIP(task) (task_pt_regs(task)->eip)
88553+#define KSTK_ESP(task) (task_pt_regs(task)->esp)
88554+
88555+
88556+struct microcode_header {
88557+ unsigned int hdrver;
88558+ unsigned int rev;
88559+ unsigned int date;
88560+ unsigned int sig;
88561+ unsigned int cksum;
88562+ unsigned int ldrver;
88563+ unsigned int pf;
88564+ unsigned int datasize;
88565+ unsigned int totalsize;
88566+ unsigned int reserved[3];
88567+};
88568+
88569+struct microcode {
88570+ struct microcode_header hdr;
88571+ unsigned int bits[0];
88572+};
88573+
88574+typedef struct microcode microcode_t;
88575+typedef struct microcode_header microcode_header_t;
88576+
88577+/* microcode format is extended from prescott processors */
88578+struct extended_signature {
88579+ unsigned int sig;
88580+ unsigned int pf;
88581+ unsigned int cksum;
88582+};
88583+
88584+struct extended_sigtable {
88585+ unsigned int count;
88586+ unsigned int cksum;
88587+ unsigned int reserved[3];
88588+ struct extended_signature sigs[0];
88589+};
88590+/* '6' because it used to be for P6 only (but now covers Pentium 4 as well) */
88591+#define MICROCODE_IOCFREE _IO('6',0)
88592+
88593+/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */
88594+static inline void rep_nop(void)
88595+{
88596+ __asm__ __volatile__("rep;nop": : :"memory");
88597+}
88598+
88599+#define cpu_relax() rep_nop()
88600+
88601+/* generic versions from gas */
88602+#define GENERIC_NOP1 ".byte 0x90\n"
88603+#define GENERIC_NOP2 ".byte 0x89,0xf6\n"
88604+#define GENERIC_NOP3 ".byte 0x8d,0x76,0x00\n"
88605+#define GENERIC_NOP4 ".byte 0x8d,0x74,0x26,0x00\n"
88606+#define GENERIC_NOP5 GENERIC_NOP1 GENERIC_NOP4
88607+#define GENERIC_NOP6 ".byte 0x8d,0xb6,0x00,0x00,0x00,0x00\n"
88608+#define GENERIC_NOP7 ".byte 0x8d,0xb4,0x26,0x00,0x00,0x00,0x00\n"
88609+#define GENERIC_NOP8 GENERIC_NOP1 GENERIC_NOP7
88610+
88611+/* Opteron nops */
88612+#define K8_NOP1 GENERIC_NOP1
88613+#define K8_NOP2 ".byte 0x66,0x90\n"
88614+#define K8_NOP3 ".byte 0x66,0x66,0x90\n"
88615+#define K8_NOP4 ".byte 0x66,0x66,0x66,0x90\n"
88616+#define K8_NOP5 K8_NOP3 K8_NOP2
88617+#define K8_NOP6 K8_NOP3 K8_NOP3
88618+#define K8_NOP7 K8_NOP4 K8_NOP3
88619+#define K8_NOP8 K8_NOP4 K8_NOP4
88620+
88621+/* K7 nops */
88622+/* uses eax dependencies (arbitrary choice) */
88623+#define K7_NOP1 GENERIC_NOP1
88624+#define K7_NOP2 ".byte 0x8b,0xc0\n"
88625+#define K7_NOP3 ".byte 0x8d,0x04,0x20\n"
88626+#define K7_NOP4 ".byte 0x8d,0x44,0x20,0x00\n"
88627+#define K7_NOP5 K7_NOP4 ASM_NOP1
88628+#define K7_NOP6 ".byte 0x8d,0x80,0,0,0,0\n"
88629+#define K7_NOP7 ".byte 0x8D,0x04,0x05,0,0,0,0\n"
88630+#define K7_NOP8 K7_NOP7 ASM_NOP1
88631+
88632+#ifdef CONFIG_MK8
88633+#define ASM_NOP1 K8_NOP1
88634+#define ASM_NOP2 K8_NOP2
88635+#define ASM_NOP3 K8_NOP3
88636+#define ASM_NOP4 K8_NOP4
88637+#define ASM_NOP5 K8_NOP5
88638+#define ASM_NOP6 K8_NOP6
88639+#define ASM_NOP7 K8_NOP7
88640+#define ASM_NOP8 K8_NOP8
88641+#elif defined(CONFIG_MK7)
88642+#define ASM_NOP1 K7_NOP1
88643+#define ASM_NOP2 K7_NOP2
88644+#define ASM_NOP3 K7_NOP3
88645+#define ASM_NOP4 K7_NOP4
88646+#define ASM_NOP5 K7_NOP5
88647+#define ASM_NOP6 K7_NOP6
88648+#define ASM_NOP7 K7_NOP7
88649+#define ASM_NOP8 K7_NOP8
88650+#else
88651+#define ASM_NOP1 GENERIC_NOP1
88652+#define ASM_NOP2 GENERIC_NOP2
88653+#define ASM_NOP3 GENERIC_NOP3
88654+#define ASM_NOP4 GENERIC_NOP4
88655+#define ASM_NOP5 GENERIC_NOP5
88656+#define ASM_NOP6 GENERIC_NOP6
88657+#define ASM_NOP7 GENERIC_NOP7
88658+#define ASM_NOP8 GENERIC_NOP8
88659+#endif
88660+
88661+#define ASM_NOP_MAX 8
88662+
88663+/* Prefetch instructions for Pentium III and AMD Athlon */
88664+/* It's not worth caring about 3dnow! prefetches for the K6
88665+ because they are microcoded there and very slow.
88666+ However we don't do prefetches for pre-XP Athlons currently.
88667+ That should be fixed. */
88668+#define ARCH_HAS_PREFETCH
88669+static inline void prefetch(const void *x)
88670+{
88671+ alternative_input(ASM_NOP4,
88672+ "prefetchnta (%1)",
88673+ X86_FEATURE_XMM,
88674+ "r" (x));
88675+}
88676+
88677+#define ARCH_HAS_PREFETCH
88678+#define ARCH_HAS_PREFETCHW
88679+#define ARCH_HAS_SPINLOCK_PREFETCH
88680+
88681+/* 3dnow! prefetch to get an exclusive cache line. Useful for
88682+ spinlocks to avoid one state transition in the cache coherency protocol. */
88683+static inline void prefetchw(const void *x)
88684+{
88685+ alternative_input(ASM_NOP4,
88686+ "prefetchw (%1)",
88687+ X86_FEATURE_3DNOW,
88688+ "r" (x));
88689+}
88690+#define spin_lock_prefetch(x) prefetchw(x)
88691+
88692+extern void select_idle_routine(const struct cpuinfo_x86 *c);
88693+
88694+#define cache_line_size() (boot_cpu_data.x86_cache_alignment)
88695+
88696+extern unsigned long boot_option_idle_override;
88697+extern void enable_sep_cpu(void);
88698+extern int sysenter_setup(void);
88699+
88700+#ifdef CONFIG_MTRR
88701+extern void mtrr_ap_init(void);
88702+extern void mtrr_bp_init(void);
88703+#else
88704+#define mtrr_ap_init() do {} while (0)
88705+#define mtrr_bp_init() do {} while (0)
88706+#endif
88707+
88708+#ifdef CONFIG_X86_MCE
88709+extern void mcheck_init(struct cpuinfo_x86 *c);
88710+#else
88711+#define mcheck_init(c) do {} while(0)
88712+#endif
88713+
88714+#endif /* __ASM_I386_PROCESSOR_H */
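
A Xen guest kernel cannot set the IOPL bits in EFLAGS directly, so set_iopl_mask() above converts the requested EFLAGS mask into a ring number and hands it to the hypervisor as a physdev operation. The sketch below is illustrative only: it reproduces just that mask-to-ring conversion in userspace, the helper name iopl_ring() and the sample masks are made up for the example, and the hypercall itself is omitted.

    #include <stdio.h>

    /* Mirrors the computation in set_iopl_mask(): the IOPL field lives in
     * bits 12-13 of EFLAGS, and a zero mask falls back to ring 1, the ring
     * the paravirtualised kernel itself runs in. */
    static unsigned int iopl_ring(unsigned int eflags_mask)
    {
        return (eflags_mask == 0) ? 1 : (eflags_mask >> 12) & 3;
    }

    int main(void)
    {
        printf("mask 0x0000 -> iopl %u\n", iopl_ring(0x0000)); /* 1 */
        printf("mask 0x3000 -> iopl %u\n", iopl_ring(0x3000)); /* 3 */
        return 0;
    }
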
88715diff -Nur linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/ptrace.h linux-2.6.16.33/include/asm-i386/mach-xen/asm/ptrace.h
88716--- linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/ptrace.h 1970-01-01 00:00:00.000000000 +0000
88717+++ linux-2.6.16.33/include/asm-i386/mach-xen/asm/ptrace.h 2007-01-08 15:00:46.000000000 +0000
88718@@ -0,0 +1,90 @@
88719+#ifndef _I386_PTRACE_H
88720+#define _I386_PTRACE_H
88721+
88722+#define EBX 0
88723+#define ECX 1
88724+#define EDX 2
88725+#define ESI 3
88726+#define EDI 4
88727+#define EBP 5
88728+#define EAX 6
88729+#define DS 7
88730+#define ES 8
88731+#define FS 9
88732+#define GS 10
88733+#define ORIG_EAX 11
88734+#define EIP 12
88735+#define CS 13
88736+#define EFL 14
88737+#define UESP 15
88738+#define SS 16
88739+#define FRAME_SIZE 17
88740+
88741+/* this struct defines the way the registers are stored on the
88742+ stack during a system call. */
88743+
88744+struct pt_regs {
88745+ long ebx;
88746+ long ecx;
88747+ long edx;
88748+ long esi;
88749+ long edi;
88750+ long ebp;
88751+ long eax;
88752+ int xds;
88753+ int xes;
88754+ long orig_eax;
88755+ long eip;
88756+ int xcs;
88757+ long eflags;
88758+ long esp;
88759+ int xss;
88760+};
88761+
88762+/* Arbitrarily choose the same ptrace numbers as used by the Sparc code. */
88763+#define PTRACE_GETREGS 12
88764+#define PTRACE_SETREGS 13
88765+#define PTRACE_GETFPREGS 14
88766+#define PTRACE_SETFPREGS 15
88767+#define PTRACE_GETFPXREGS 18
88768+#define PTRACE_SETFPXREGS 19
88769+
88770+#define PTRACE_OLDSETOPTIONS 21
88771+
88772+#define PTRACE_GET_THREAD_AREA 25
88773+#define PTRACE_SET_THREAD_AREA 26
88774+
88775+#define PTRACE_SYSEMU 31
88776+#define PTRACE_SYSEMU_SINGLESTEP 32
88777+
88778+#ifdef __KERNEL__
88779+
88780+#include <asm/vm86.h>
88781+
88782+struct task_struct;
88783+extern void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code);
88784+
88785+/*
88786+ * user_mode_vm(regs) determines whether a register set came from user mode.
88787+ * This is true if V8086 mode was enabled OR if the register set was from
88788+ * protected mode with RPL-3 CS value. This tricky test checks that with
88789+ * one comparison. Many places in the kernel can bypass this full check
88790+ * if they have already ruled out V8086 mode, so user_mode(regs) can be used.
88791+ */
88792+static inline int user_mode(struct pt_regs *regs)
88793+{
88794+ return (regs->xcs & 2) != 0;
88795+}
88796+static inline int user_mode_vm(struct pt_regs *regs)
88797+{
88798+ return ((regs->xcs & 2) | (regs->eflags & VM_MASK)) != 0;
88799+}
88800+#define instruction_pointer(regs) ((regs)->eip)
88801+#if defined(CONFIG_SMP) && defined(CONFIG_FRAME_POINTER)
88802+extern unsigned long profile_pc(struct pt_regs *regs);
88803+#else
88804+#define profile_pc(regs) instruction_pointer(regs)
88805+#endif
88806+#endif /* __KERNEL__ */
88807+
88808+#endif
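
The user_mode() test above checks bit 1 of the saved CS selector rather than the whole RPL field: that bit is set for rings 2 and 3 but clear for rings 0 and 1, and under Xen the kernel itself runs with an RPL-1 code selector (see GET_KERNEL_CS() in the segment.h hunk further down). Below is a minimal userspace sketch of the same test; xen_user_mode() is a made-up name and the selector values are derived from the GDT layout in this patch, purely for illustration.

    #include <stdio.h>

    /* Bit 1 of the selector's RPL distinguishes user mode (RPL 3) from the
     * Xen guest kernel (RPL 1). */
    static int xen_user_mode(unsigned int xcs)
    {
        return (xcs & 2) != 0;
    }

    int main(void)
    {
        /* 0x61 = GDT entry 12, RPL 1 (kernel); 0x73 = entry 14, RPL 3 (user) */
        printf("xcs 0x61 -> user_mode %d\n", xen_user_mode(0x61)); /* 0 */
        printf("xcs 0x73 -> user_mode %d\n", xen_user_mode(0x73)); /* 1 */
        return 0;
    }
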
88809diff -Nur linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/scatterlist.h linux-2.6.16.33/include/asm-i386/mach-xen/asm/scatterlist.h
88810--- linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/scatterlist.h 1970-01-01 00:00:00.000000000 +0000
88811+++ linux-2.6.16.33/include/asm-i386/mach-xen/asm/scatterlist.h 2007-01-08 15:00:46.000000000 +0000
88812@@ -0,0 +1,22 @@
88813+#ifndef _I386_SCATTERLIST_H
88814+#define _I386_SCATTERLIST_H
88815+
88816+struct scatterlist {
88817+ struct page *page;
88818+ unsigned int offset;
88819+ unsigned int length;
88820+ dma_addr_t dma_address;
88821+ unsigned int dma_length;
88822+};
88823+
88824+/* These macros should be used after a pci_map_sg call has been done
88825+ * to get bus addresses of each of the SG entries and their lengths.
88826+ * You should only work with the number of sg entries pci_map_sg
88827+ * returns.
88828+ */
88829+#define sg_dma_address(sg) ((sg)->dma_address)
88830+#define sg_dma_len(sg) ((sg)->dma_length)
88831+
88832+#define ISA_DMA_THRESHOLD (0x00ffffff)
88833+
88834+#endif /* !(_I386_SCATTERLIST_H) */
88835diff -Nur linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/segment.h linux-2.6.16.33/include/asm-i386/mach-xen/asm/segment.h
88836--- linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/segment.h 1970-01-01 00:00:00.000000000 +0000
88837+++ linux-2.6.16.33/include/asm-i386/mach-xen/asm/segment.h 2007-01-08 15:00:46.000000000 +0000
88838@@ -0,0 +1,117 @@
88839+#ifndef _ASM_SEGMENT_H
88840+#define _ASM_SEGMENT_H
88841+
88842+/*
88843+ * The layout of the per-CPU GDT under Linux:
88844+ *
88845+ * 0 - null
88846+ * 1 - reserved
88847+ * 2 - reserved
88848+ * 3 - reserved
88849+ *
88850+ * 4 - unused <==== new cacheline
88851+ * 5 - unused
88852+ *
88853+ * ------- start of TLS (Thread-Local Storage) segments:
88854+ *
88855+ * 6 - TLS segment #1 [ glibc's TLS segment ]
88856+ * 7 - TLS segment #2 [ Wine's %fs Win32 segment ]
88857+ * 8 - TLS segment #3
88858+ * 9 - reserved
88859+ * 10 - reserved
88860+ * 11 - reserved
88861+ *
88862+ * ------- start of kernel segments:
88863+ *
88864+ * 12 - kernel code segment <==== new cacheline
88865+ * 13 - kernel data segment
88866+ * 14 - default user CS
88867+ * 15 - default user DS
88868+ * 16 - TSS
88869+ * 17 - LDT
88870+ * 18 - PNPBIOS support (16->32 gate)
88871+ * 19 - PNPBIOS support
88872+ * 20 - PNPBIOS support
88873+ * 21 - PNPBIOS support
88874+ * 22 - PNPBIOS support
88875+ * 23 - APM BIOS support
88876+ * 24 - APM BIOS support
88877+ * 25 - APM BIOS support
88878+ *
88879+ * 26 - ESPFIX small SS
88880+ * 27 - unused
88881+ * 28 - unused
88882+ * 29 - unused
88883+ * 30 - unused
88884+ * 31 - TSS for double fault handler
88885+ */
88886+#define GDT_ENTRY_TLS_ENTRIES 3
88887+#define GDT_ENTRY_TLS_MIN 6
88888+#define GDT_ENTRY_TLS_MAX (GDT_ENTRY_TLS_MIN + GDT_ENTRY_TLS_ENTRIES - 1)
88889+
88890+#define TLS_SIZE (GDT_ENTRY_TLS_ENTRIES * 8)
88891+
88892+#define GDT_ENTRY_DEFAULT_USER_CS 14
88893+#define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS * 8 + 3)
88894+
88895+#define GDT_ENTRY_DEFAULT_USER_DS 15
88896+#define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS * 8 + 3)
88897+
88898+#define GDT_ENTRY_KERNEL_BASE 12
88899+
88900+#define GDT_ENTRY_KERNEL_CS (GDT_ENTRY_KERNEL_BASE + 0)
88901+#define __KERNEL_CS (GDT_ENTRY_KERNEL_CS * 8)
88902+#define GET_KERNEL_CS() (__KERNEL_CS | (xen_feature(XENFEAT_supervisor_mode_kernel)?0:1) )
88903+
88904+#define GDT_ENTRY_KERNEL_DS (GDT_ENTRY_KERNEL_BASE + 1)
88905+#define __KERNEL_DS (GDT_ENTRY_KERNEL_DS * 8)
88906+#define GET_KERNEL_DS() (__KERNEL_DS | (xen_feature(XENFEAT_supervisor_mode_kernel)?0:1) )
88907+
88908+#define GDT_ENTRY_TSS (GDT_ENTRY_KERNEL_BASE + 4)
88909+#define GDT_ENTRY_LDT (GDT_ENTRY_KERNEL_BASE + 5)
88910+
88911+#define GDT_ENTRY_PNPBIOS_BASE (GDT_ENTRY_KERNEL_BASE + 6)
88912+#define GDT_ENTRY_APMBIOS_BASE (GDT_ENTRY_KERNEL_BASE + 11)
88913+
88914+#define GDT_ENTRY_ESPFIX_SS (GDT_ENTRY_KERNEL_BASE + 14)
88915+#define __ESPFIX_SS (GDT_ENTRY_ESPFIX_SS * 8)
88916+
88917+#define GDT_ENTRY_DOUBLEFAULT_TSS 31
88918+
88919+/*
88920+ * The GDT has 32 entries
88921+ */
88922+#define GDT_ENTRIES 32
88923+
88924+#define GDT_SIZE (GDT_ENTRIES * 8)
88925+
88926+/* Simple and small GDT entries for booting only */
88927+
88928+#define GDT_ENTRY_BOOT_CS 2
88929+#define __BOOT_CS (GDT_ENTRY_BOOT_CS * 8)
88930+
88931+#define GDT_ENTRY_BOOT_DS (GDT_ENTRY_BOOT_CS + 1)
88932+#define __BOOT_DS (GDT_ENTRY_BOOT_DS * 8)
88933+
88934+/* The PnP BIOS entries in the GDT */
88935+#define GDT_ENTRY_PNPBIOS_CS32 (GDT_ENTRY_PNPBIOS_BASE + 0)
88936+#define GDT_ENTRY_PNPBIOS_CS16 (GDT_ENTRY_PNPBIOS_BASE + 1)
88937+#define GDT_ENTRY_PNPBIOS_DS (GDT_ENTRY_PNPBIOS_BASE + 2)
88938+#define GDT_ENTRY_PNPBIOS_TS1 (GDT_ENTRY_PNPBIOS_BASE + 3)
88939+#define GDT_ENTRY_PNPBIOS_TS2 (GDT_ENTRY_PNPBIOS_BASE + 4)
88940+
88941+/* The PnP BIOS selectors */
88942+#define PNP_CS32 (GDT_ENTRY_PNPBIOS_CS32 * 8) /* segment for calling fn */
88943+#define PNP_CS16 (GDT_ENTRY_PNPBIOS_CS16 * 8) /* code segment for BIOS */
88944+#define PNP_DS (GDT_ENTRY_PNPBIOS_DS * 8) /* data segment for BIOS */
88945+#define PNP_TS1 (GDT_ENTRY_PNPBIOS_TS1 * 8) /* transfer data segment */
88946+#define PNP_TS2 (GDT_ENTRY_PNPBIOS_TS2 * 8) /* another data segment */
88947+
88948+/*
88949+ * The interrupt descriptor table has room for 256 entries;
88950+ * the size of the global descriptor table depends on the number
88951+ * of tasks we can have.
88952+ */
88953+#define IDT_ENTRIES 256
88954+
88955+#endif
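
Each selector above is simply the GDT index shifted left by three with the requested privilege level in the low two bits. The sketch below is illustrative only and recomputes a few of the selectors; it assumes a guest without XENFEAT_supervisor_mode_kernel, where GET_KERNEL_CS() ORs in RPL 1, and the SEL() macro is invented for the example.

    #include <stdio.h>

    /* selector = (GDT index << 3) | RPL */
    #define SEL(index, rpl) (((index) << 3) | (rpl))

    int main(void)
    {
        printf("__KERNEL_CS     = 0x%02x\n", SEL(12, 0)); /* 0x60 */
        printf("GET_KERNEL_CS() = 0x%02x\n", SEL(12, 1)); /* 0x61 */
        printf("__USER_CS       = 0x%02x\n", SEL(14, 3)); /* 0x73 */
        printf("__USER_DS       = 0x%02x\n", SEL(15, 3)); /* 0x7b */
        return 0;
    }
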
88956diff -Nur linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/setup.h linux-2.6.16.33/include/asm-i386/mach-xen/asm/setup.h
88957--- linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/setup.h 1970-01-01 00:00:00.000000000 +0000
88958+++ linux-2.6.16.33/include/asm-i386/mach-xen/asm/setup.h 2007-01-08 15:00:46.000000000 +0000
88959@@ -0,0 +1,66 @@
88960+/*
88961+ * Just a placeholder. We don't want to have to test x86 before
88962+ * we include stuff
88963+ */
88964+
88965+#ifndef _i386_SETUP_H
88966+#define _i386_SETUP_H
88967+
88968+#define PFN_UP(x) (((x) + PAGE_SIZE-1) >> PAGE_SHIFT)
88969+#define PFN_DOWN(x) ((x) >> PAGE_SHIFT)
88970+#define PFN_PHYS(x) ((unsigned long long)(x) << PAGE_SHIFT)
88971+
88972+/*
88973+ * Reserved space for vmalloc and iomap - defined in asm/page.h
88974+ */
88975+#define MAXMEM_PFN PFN_DOWN(MAXMEM)
88976+#define MAX_NONPAE_PFN (1 << 20)
88977+
88978+#define PARAM_SIZE 4096
88979+#define COMMAND_LINE_SIZE 256
88980+
88981+#define OLD_CL_MAGIC_ADDR 0x90020
88982+#define OLD_CL_MAGIC 0xA33F
88983+#define OLD_CL_BASE_ADDR 0x90000
88984+#define OLD_CL_OFFSET 0x90022
88985+#define NEW_CL_POINTER 0x228 /* Relative to real mode data */
88986+
88987+#ifndef __ASSEMBLY__
88988+/*
88989+ * This is set up by the setup-routine at boot-time
88990+ */
88991+extern unsigned char boot_params[PARAM_SIZE];
88992+
88993+#define PARAM (boot_params)
88994+#define SCREEN_INFO (*(struct screen_info *) (PARAM+0))
88995+#define EXT_MEM_K (*(unsigned short *) (PARAM+2))
88996+#define ALT_MEM_K (*(unsigned long *) (PARAM+0x1e0))
88997+#define E820_MAP_NR (*(char*) (PARAM+E820NR))
88998+#define E820_MAP ((struct e820entry *) (PARAM+E820MAP))
88999+#define APM_BIOS_INFO (*(struct apm_bios_info *) (PARAM+0x40))
89000+#define IST_INFO (*(struct ist_info *) (PARAM+0x60))
89001+#define DRIVE_INFO (*(struct drive_info_struct *) (PARAM+0x80))
89002+#define SYS_DESC_TABLE (*(struct sys_desc_table_struct*)(PARAM+0xa0))
89003+#define EFI_SYSTAB ((efi_system_table_t *) *((unsigned long *)(PARAM+0x1c4)))
89004+#define EFI_MEMDESC_SIZE (*((unsigned long *) (PARAM+0x1c8)))
89005+#define EFI_MEMDESC_VERSION (*((unsigned long *) (PARAM+0x1cc)))
89006+#define EFI_MEMMAP ((void *) *((unsigned long *)(PARAM+0x1d0)))
89007+#define EFI_MEMMAP_SIZE (*((unsigned long *) (PARAM+0x1d4)))
89008+#define MOUNT_ROOT_RDONLY (*(unsigned short *) (PARAM+0x1F2))
89009+#define RAMDISK_FLAGS (*(unsigned short *) (PARAM+0x1F8))
89010+#define VIDEO_MODE (*(unsigned short *) (PARAM+0x1FA))
89011+#define ORIG_ROOT_DEV (*(unsigned short *) (PARAM+0x1FC))
89012+#define AUX_DEVICE_INFO (*(unsigned char *) (PARAM+0x1FF))
89013+#define LOADER_TYPE (*(unsigned char *) (PARAM+0x210))
89014+#define KERNEL_START (*(unsigned long *) (PARAM+0x214))
89015+#define INITRD_START (__pa(xen_start_info->mod_start))
89016+#define INITRD_SIZE (xen_start_info->mod_len)
89017+#define EDID_INFO (*(struct edid_info *) (PARAM+0x440))
89018+#define EDD_NR (*(unsigned char *) (PARAM+EDDNR))
89019+#define EDD_MBR_SIG_NR (*(unsigned char *) (PARAM+EDD_MBR_SIG_NR_BUF))
89020+#define EDD_MBR_SIGNATURE ((unsigned int *) (PARAM+EDD_MBR_SIG_BUF))
89021+#define EDD_BUF ((struct edd_info *) (PARAM+EDDBUF))
89022+
89023+#endif /* __ASSEMBLY__ */
89024+
89025+#endif /* _i386_SETUP_H */
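
The PFN_* macros above round a physical address down or up to a page-frame number and convert a frame number back to a physical address. A small standalone check, assuming the usual i386 PAGE_SHIFT of 12 (4 KiB pages); the sample address is arbitrary.

    #include <stdio.h>

    #define PAGE_SHIFT 12
    #define PAGE_SIZE  (1UL << PAGE_SHIFT)
    #define PFN_UP(x)   (((x) + PAGE_SIZE - 1) >> PAGE_SHIFT)
    #define PFN_DOWN(x) ((x) >> PAGE_SHIFT)
    #define PFN_PHYS(x) ((unsigned long long)(x) << PAGE_SHIFT)

    int main(void)
    {
        unsigned long addr = 0x12345;                              /* mid-page address */
        printf("PFN_DOWN(0x%lx) = 0x%lx\n", addr, PFN_DOWN(addr)); /* 0x12 */
        printf("PFN_UP(0x%lx)   = 0x%lx\n", addr, PFN_UP(addr));   /* 0x13 */
        printf("PFN_PHYS(0x12)  = 0x%llx\n", PFN_PHYS(0x12UL));    /* 0x12000 */
        return 0;
    }
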
89026diff -Nur linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/smp.h linux-2.6.16.33/include/asm-i386/mach-xen/asm/smp.h
89027--- linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/smp.h 1970-01-01 00:00:00.000000000 +0000
89028+++ linux-2.6.16.33/include/asm-i386/mach-xen/asm/smp.h 2007-01-08 15:00:46.000000000 +0000
89029@@ -0,0 +1,103 @@
89030+#ifndef __ASM_SMP_H
89031+#define __ASM_SMP_H
89032+
89033+/*
89034+ * We need the APIC definitions automatically as part of 'smp.h'
89035+ */
89036+#ifndef __ASSEMBLY__
89037+#include <linux/config.h>
89038+#include <linux/kernel.h>
89039+#include <linux/threads.h>
89040+#include <linux/cpumask.h>
89041+#endif
89042+
89043+#ifdef CONFIG_X86_LOCAL_APIC
89044+#ifndef __ASSEMBLY__
89045+#include <asm/fixmap.h>
89046+#include <asm/bitops.h>
89047+#include <asm/mpspec.h>
89048+#ifdef CONFIG_X86_IO_APIC
89049+#include <asm/io_apic.h>
89050+#endif
89051+#include <asm/apic.h>
89052+#endif
89053+#endif
89054+
89055+#define BAD_APICID 0xFFu
89056+#ifdef CONFIG_SMP
89057+#ifndef __ASSEMBLY__
89058+
89059+/*
89060+ * Private routines/data
89061+ */
89062+
89063+extern void smp_alloc_memory(void);
89064+extern int pic_mode;
89065+extern int smp_num_siblings;
89066+extern cpumask_t cpu_sibling_map[];
89067+extern cpumask_t cpu_core_map[];
89068+
89069+extern void (*mtrr_hook) (void);
89070+extern void zap_low_mappings (void);
89071+extern void lock_ipi_call_lock(void);
89072+extern void unlock_ipi_call_lock(void);
89073+
89074+#define MAX_APICID 256
89075+extern u8 x86_cpu_to_apicid[];
89076+
89077+#define cpu_physical_id(cpu) x86_cpu_to_apicid[cpu]
89078+
89079+#ifdef CONFIG_HOTPLUG_CPU
89080+extern void cpu_exit_clear(void);
89081+extern void cpu_uninit(void);
89082+#endif
89083+
89084+/*
89085+ * This function is needed by all SMP systems. It must _always_ be valid
89086+ * from the initial startup. We map APIC_BASE very early in page_setup(),
89087+ * so this is correct in the x86 case.
89088+ */
89089+#define raw_smp_processor_id() (current_thread_info()->cpu)
89090+
89091+extern cpumask_t cpu_possible_map;
89092+#define cpu_callin_map cpu_possible_map
89093+
89094+/* We don't mark CPUs online until __cpu_up(), so we need another measure */
89095+static inline int num_booting_cpus(void)
89096+{
89097+ return cpus_weight(cpu_possible_map);
89098+}
89099+
89100+#ifdef CONFIG_X86_LOCAL_APIC
89101+
89102+#ifdef APIC_DEFINITION
89103+extern int hard_smp_processor_id(void);
89104+#else
89105+#include <mach_apicdef.h>
89106+static inline int hard_smp_processor_id(void)
89107+{
89108+ /* we don't want to mark this access volatile - bad code generation */
89109+ return GET_APIC_ID(*(unsigned long *)(APIC_BASE+APIC_ID));
89110+}
89111+#endif
89112+
89113+static __inline int logical_smp_processor_id(void)
89114+{
89115+ /* we don't want to mark this access volatile - bad code generation */
89116+ return GET_APIC_LOGICAL_ID(*(unsigned long *)(APIC_BASE+APIC_LDR));
89117+}
89118+
89119+#endif
89120+
89121+extern int __cpu_disable(void);
89122+extern void __cpu_die(unsigned int cpu);
89123+#endif /* !__ASSEMBLY__ */
89124+
89125+#else /* CONFIG_SMP */
89126+
89127+#define cpu_physical_id(cpu) boot_cpu_physical_apicid
89128+
89129+#define NO_PROC_ID 0xFF /* No processor magic marker */
89130+
89131+#endif
89132+#endif
89133diff -Nur linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/spinlock.h linux-2.6.16.33/include/asm-i386/mach-xen/asm/spinlock.h
89134--- linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/spinlock.h 1970-01-01 00:00:00.000000000 +0000
89135+++ linux-2.6.16.33/include/asm-i386/mach-xen/asm/spinlock.h 2007-01-08 15:00:46.000000000 +0000
89136@@ -0,0 +1,217 @@
89137+#ifndef __ASM_SPINLOCK_H
89138+#define __ASM_SPINLOCK_H
89139+
89140+#include <asm/atomic.h>
89141+#include <asm/rwlock.h>
89142+#include <asm/page.h>
89143+#include <linux/config.h>
89144+#include <linux/compiler.h>
89145+#include <asm/smp_alt.h>
89146+
89147+/*
89148+ * Your basic SMP spinlocks, allowing only a single CPU anywhere
89149+ *
89150+ * Simple spin lock operations. There are two variants, one clears IRQ's
89151+ * on the local processor, one does not.
89152+ *
89153+ * We make no fairness assumptions. They have a cost.
89154+ *
89155+ * (the type definitions are in asm/spinlock_types.h)
89156+ */
89157+
89158+#define __raw_spin_is_locked(x) \
89159+ (*(volatile signed char *)(&(x)->slock) <= 0)
89160+
89161+#define __raw_spin_lock_string \
89162+ "\n1:\n" \
89163+ LOCK \
89164+ "decb %0\n\t" \
89165+ "jns 3f\n" \
89166+ "2:\t" \
89167+ "rep;nop\n\t" \
89168+ "cmpb $0,%0\n\t" \
89169+ "jle 2b\n\t" \
89170+ "jmp 1b\n" \
89171+ "3:\n\t"
89172+
89173+#define __raw_spin_lock_string_flags \
89174+ "\n1:\n" \
89175+ LOCK \
89176+ "decb %0\n\t" \
89177+ "jns 4f\n\t" \
89178+ "2:\t" \
89179+ "testl $0x200, %1\n\t" \
89180+ "jz 3f\n\t" \
89181+ "#sti\n\t" \
89182+ "3:\t" \
89183+ "rep;nop\n\t" \
89184+ "cmpb $0, %0\n\t" \
89185+ "jle 3b\n\t" \
89186+ "#cli\n\t" \
89187+ "jmp 1b\n" \
89188+ "4:\n\t"
89189+
89190+static inline void __raw_spin_lock(raw_spinlock_t *lock)
89191+{
89192+ __asm__ __volatile__(
89193+ __raw_spin_lock_string
89194+ :"=m" (lock->slock) : : "memory");
89195+}
89196+
89197+static inline void __raw_spin_lock_flags(raw_spinlock_t *lock, unsigned long flags)
89198+{
89199+ __asm__ __volatile__(
89200+ __raw_spin_lock_string_flags
89201+ :"=m" (lock->slock) : "r" (flags) : "memory");
89202+}
89203+
89204+static inline int __raw_spin_trylock(raw_spinlock_t *lock)
89205+{
89206+ char oldval;
89207+#ifdef CONFIG_SMP_ALTERNATIVES
89208+ __asm__ __volatile__(
89209+ "1:movb %1,%b0\n"
89210+ "movb $0,%1\n"
89211+ "2:"
89212+ ".section __smp_alternatives,\"a\"\n"
89213+ ".long 1b\n"
89214+ ".long 3f\n"
89215+ ".previous\n"
89216+ ".section __smp_replacements,\"a\"\n"
89217+ "3: .byte 2b - 1b\n"
89218+ ".byte 5f-4f\n"
89219+ ".byte 0\n"
89220+ ".byte 6f-5f\n"
89221+ ".byte -1\n"
89222+ "4: xchgb %b0,%1\n"
89223+ "5: movb %1,%b0\n"
89224+ "movb $0,%1\n"
89225+ "6:\n"
89226+ ".previous\n"
89227+ :"=q" (oldval), "=m" (lock->slock)
89228+ :"0" (0) : "memory");
89229+#else
89230+ __asm__ __volatile__(
89231+ "xchgb %b0,%1"
89232+ :"=q" (oldval), "=m" (lock->slock)
89233+ :"0" (0) : "memory");
89234+#endif
89235+ return oldval > 0;
89236+}
89237+
89238+/*
89239+ * __raw_spin_unlock based on writing $1 to the low byte.
89240+ * This method works. Despite all the confusion.
89241+ * (except on PPro SMP or if we are using OOSTORE, so we use xchgb there)
89242+ * (PPro errata 66, 92)
89243+ */
89244+
89245+#if !defined(CONFIG_X86_OOSTORE) && !defined(CONFIG_X86_PPRO_FENCE)
89246+
89247+#define __raw_spin_unlock_string \
89248+ "movb $1,%0" \
89249+ :"=m" (lock->slock) : : "memory"
89250+
89251+
89252+static inline void __raw_spin_unlock(raw_spinlock_t *lock)
89253+{
89254+ __asm__ __volatile__(
89255+ __raw_spin_unlock_string
89256+ );
89257+}
89258+
89259+#else
89260+
89261+#define __raw_spin_unlock_string \
89262+ "xchgb %b0, %1" \
89263+ :"=q" (oldval), "=m" (lock->slock) \
89264+ :"0" (oldval) : "memory"
89265+
89266+static inline void __raw_spin_unlock(raw_spinlock_t *lock)
89267+{
89268+ char oldval = 1;
89269+
89270+ __asm__ __volatile__(
89271+ __raw_spin_unlock_string
89272+ );
89273+}
89274+
89275+#endif
89276+
89277+#define __raw_spin_unlock_wait(lock) \
89278+ do { while (__raw_spin_is_locked(lock)) cpu_relax(); } while (0)
89279+
89280+/*
89281+ * Read-write spinlocks, allowing multiple readers
89282+ * but only one writer.
89283+ *
89284+ * NOTE! it is quite common to have readers in interrupts
89285+ * but no interrupt writers. For those circumstances we
89286+ * can "mix" irq-safe locks - any writer needs to get a
89287+ * irq-safe write-lock, but readers can get non-irqsafe
89288+ * read-locks.
89289+ *
89290+ * On x86, we implement read-write locks as a 32-bit counter
89291+ * with the high bit (sign) being the "contended" bit.
89292+ *
89293+ * The inline assembly is non-obvious. Think about it.
89294+ *
89295+ * Changed to use the same technique as rw semaphores. See
89296+ * semaphore.h for details. -ben
89297+ *
89298+ * the helpers are in arch/i386/kernel/semaphore.c
89299+ */
89300+
89301+/**
89302+ * read_can_lock - would read_trylock() succeed?
89303+ * @lock: the rwlock in question.
89304+ */
89305+#define __raw_read_can_lock(x) ((int)(x)->lock > 0)
89306+
89307+/**
89308+ * write_can_lock - would write_trylock() succeed?
89309+ * @lock: the rwlock in question.
89310+ */
89311+#define __raw_write_can_lock(x) ((x)->lock == RW_LOCK_BIAS)
89312+
89313+static inline void __raw_read_lock(raw_rwlock_t *rw)
89314+{
89315+ __build_read_lock(rw, "__read_lock_failed");
89316+}
89317+
89318+static inline void __raw_write_lock(raw_rwlock_t *rw)
89319+{
89320+ __build_write_lock(rw, "__write_lock_failed");
89321+}
89322+
89323+static inline int __raw_read_trylock(raw_rwlock_t *lock)
89324+{
89325+ atomic_t *count = (atomic_t *)lock;
89326+ atomic_dec(count);
89327+ if (atomic_read(count) >= 0)
89328+ return 1;
89329+ atomic_inc(count);
89330+ return 0;
89331+}
89332+
89333+static inline int __raw_write_trylock(raw_rwlock_t *lock)
89334+{
89335+ atomic_t *count = (atomic_t *)lock;
89336+ if (atomic_sub_and_test(RW_LOCK_BIAS, count))
89337+ return 1;
89338+ atomic_add(RW_LOCK_BIAS, count);
89339+ return 0;
89340+}
89341+
89342+static inline void __raw_read_unlock(raw_rwlock_t *rw)
89343+{
89344+ asm volatile(LOCK "incl %0" :"=m" (rw->lock) : : "memory");
89345+}
89346+
89347+static inline void __raw_write_unlock(raw_rwlock_t *rw)
89348+{
89349+ asm volatile(LOCK "addl $" RW_LOCK_BIAS_STR ", %0"
89350+ : "=m" (rw->lock) : : "memory");
89351+}
89352+
89353+#endif /* __ASM_SPINLOCK_H */
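
The lock word above is a single signed byte: 1 means free, zero or negative means held. __raw_spin_lock() atomically decrements it and owns the lock if the result is non-negative, otherwise it spins until the byte goes positive and retries; __raw_spin_trylock() exchanges in a 0 and succeeds if the old value was positive; unlock simply stores 1 back. The userspace model below is illustrative only and single-threaded: it uses GCC __atomic builtins in place of the inline assembly and ignores the SMP-alternatives machinery, and the model_* names are made up for the example.

    #include <stdio.h>

    static signed char slock = 1;           /* 1 = unlocked, <= 0 = locked */

    static int model_trylock(void)
    {
        /* xchgb %b0,%1 : swap in 0, succeed if the old value was positive */
        return __atomic_exchange_n(&slock, 0, __ATOMIC_ACQUIRE) > 0;
    }

    static void model_lock(void)
    {
        for (;;) {
            /* lock decb %0 ; jns taken */
            if (__atomic_sub_fetch(&slock, 1, __ATOMIC_ACQUIRE) >= 0)
                return;
            /* rep;nop loop: wait until the byte goes positive, then retry */
            while (__atomic_load_n(&slock, __ATOMIC_RELAXED) <= 0)
                ;
        }
    }

    static void model_unlock(void)
    {
        /* movb $1,%0 : a plain store of 1 releases the lock */
        __atomic_store_n(&slock, 1, __ATOMIC_RELEASE);
    }

    int main(void)
    {
        model_lock();
        printf("trylock while held: %d\n", model_trylock());  /* 0 */
        model_unlock();
        printf("trylock when free:  %d\n", model_trylock());  /* 1 */
        model_unlock();
        return 0;
    }
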
89354diff -Nur linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/swiotlb.h linux-2.6.16.33/include/asm-i386/mach-xen/asm/swiotlb.h
89355--- linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/swiotlb.h 1970-01-01 00:00:00.000000000 +0000
89356+++ linux-2.6.16.33/include/asm-i386/mach-xen/asm/swiotlb.h 2007-01-08 15:00:46.000000000 +0000
89357@@ -0,0 +1,45 @@
89358+#ifndef _ASM_SWIOTLB_H
89359+#define _ASM_SWIOTLB_H 1
89360+
89361+#include <linux/config.h>
89362+
89363+/* SWIOTLB interface */
89364+
89365+extern dma_addr_t swiotlb_map_single(struct device *hwdev, void *ptr, size_t size,
89366+ int dir);
89367+extern void swiotlb_unmap_single(struct device *hwdev, dma_addr_t dev_addr,
89368+ size_t size, int dir);
89369+extern void swiotlb_sync_single_for_cpu(struct device *hwdev,
89370+ dma_addr_t dev_addr,
89371+ size_t size, int dir);
89372+extern void swiotlb_sync_single_for_device(struct device *hwdev,
89373+ dma_addr_t dev_addr,
89374+ size_t size, int dir);
89375+extern void swiotlb_sync_sg_for_cpu(struct device *hwdev,
89376+ struct scatterlist *sg, int nelems,
89377+ int dir);
89378+extern void swiotlb_sync_sg_for_device(struct device *hwdev,
89379+ struct scatterlist *sg, int nelems,
89380+ int dir);
89381+extern int swiotlb_map_sg(struct device *hwdev, struct scatterlist *sg,
89382+ int nents, int direction);
89383+extern void swiotlb_unmap_sg(struct device *hwdev, struct scatterlist *sg,
89384+ int nents, int direction);
89385+extern int swiotlb_dma_mapping_error(dma_addr_t dma_addr);
89386+extern dma_addr_t swiotlb_map_page(struct device *hwdev, struct page *page,
89387+ unsigned long offset, size_t size,
89388+ enum dma_data_direction direction);
89389+extern void swiotlb_unmap_page(struct device *hwdev, dma_addr_t dma_address,
89390+ size_t size, enum dma_data_direction direction);
89391+extern int swiotlb_dma_supported(struct device *hwdev, u64 mask);
89392+extern void swiotlb_init(void);
89393+
89394+extern unsigned int dma_bits;
89395+
89396+#ifdef CONFIG_SWIOTLB
89397+extern int swiotlb;
89398+#else
89399+#define swiotlb 0
89400+#endif
89401+
89402+#endif
89403diff -Nur linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/synch_bitops.h linux-2.6.16.33/include/asm-i386/mach-xen/asm/synch_bitops.h
89404--- linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/synch_bitops.h 1970-01-01 00:00:00.000000000 +0000
89405+++ linux-2.6.16.33/include/asm-i386/mach-xen/asm/synch_bitops.h 2007-01-08 15:00:46.000000000 +0000
89406@@ -0,0 +1,147 @@
89407+#ifndef __XEN_SYNCH_BITOPS_H__
89408+#define __XEN_SYNCH_BITOPS_H__
89409+
89410+/*
89411+ * Copyright 1992, Linus Torvalds.
89412+ * Heavily modified to provide guaranteed strong synchronisation
89413+ * when communicating with Xen or other guest OSes running on other CPUs.
89414+ */
89415+
89416+#include <linux/config.h>
89417+
89418+#ifdef HAVE_XEN_PLATFORM_COMPAT_H
89419+#include <xen/platform-compat.h>
89420+#endif
89421+
89422+#define ADDR (*(volatile long *) addr)
89423+
89424+static __inline__ void synch_set_bit(int nr, volatile void * addr)
89425+{
89426+ __asm__ __volatile__ (
89427+ "lock btsl %1,%0"
89428+ : "+m" (ADDR) : "Ir" (nr) : "memory" );
89429+}
89430+
89431+static __inline__ void synch_clear_bit(int nr, volatile void * addr)
89432+{
89433+ __asm__ __volatile__ (
89434+ "lock btrl %1,%0"
89435+ : "+m" (ADDR) : "Ir" (nr) : "memory" );
89436+}
89437+
89438+static __inline__ void synch_change_bit(int nr, volatile void * addr)
89439+{
89440+ __asm__ __volatile__ (
89441+ "lock btcl %1,%0"
89442+ : "+m" (ADDR) : "Ir" (nr) : "memory" );
89443+}
89444+
89445+static __inline__ int synch_test_and_set_bit(int nr, volatile void * addr)
89446+{
89447+ int oldbit;
89448+ __asm__ __volatile__ (
89449+ "lock btsl %2,%1\n\tsbbl %0,%0"
89450+ : "=r" (oldbit), "+m" (ADDR) : "Ir" (nr) : "memory");
89451+ return oldbit;
89452+}
89453+
89454+static __inline__ int synch_test_and_clear_bit(int nr, volatile void * addr)
89455+{
89456+ int oldbit;
89457+ __asm__ __volatile__ (
89458+ "lock btrl %2,%1\n\tsbbl %0,%0"
89459+ : "=r" (oldbit), "+m" (ADDR) : "Ir" (nr) : "memory");
89460+ return oldbit;
89461+}
89462+
89463+static __inline__ int synch_test_and_change_bit(int nr, volatile void * addr)
89464+{
89465+ int oldbit;
89466+
89467+ __asm__ __volatile__ (
89468+ "lock btcl %2,%1\n\tsbbl %0,%0"
89469+ : "=r" (oldbit), "+m" (ADDR) : "Ir" (nr) : "memory");
89470+ return oldbit;
89471+}
89472+
89473+struct __synch_xchg_dummy { unsigned long a[100]; };
89474+#define __synch_xg(x) ((struct __synch_xchg_dummy *)(x))
89475+
89476+#define synch_cmpxchg(ptr, old, new) \
89477+((__typeof__(*(ptr)))__synch_cmpxchg((ptr),\
89478+ (unsigned long)(old), \
89479+ (unsigned long)(new), \
89480+ sizeof(*(ptr))))
89481+
89482+static inline unsigned long __synch_cmpxchg(volatile void *ptr,
89483+ unsigned long old,
89484+ unsigned long new, int size)
89485+{
89486+ unsigned long prev;
89487+ switch (size) {
89488+ case 1:
89489+ __asm__ __volatile__("lock; cmpxchgb %b1,%2"
89490+ : "=a"(prev)
89491+ : "q"(new), "m"(*__synch_xg(ptr)),
89492+ "0"(old)
89493+ : "memory");
89494+ return prev;
89495+ case 2:
89496+ __asm__ __volatile__("lock; cmpxchgw %w1,%2"
89497+ : "=a"(prev)
89498+ : "r"(new), "m"(*__synch_xg(ptr)),
89499+ "0"(old)
89500+ : "memory");
89501+ return prev;
89502+#ifdef CONFIG_X86_64
89503+ case 4:
89504+ __asm__ __volatile__("lock; cmpxchgl %k1,%2"
89505+ : "=a"(prev)
89506+ : "r"(new), "m"(*__synch_xg(ptr)),
89507+ "0"(old)
89508+ : "memory");
89509+ return prev;
89510+ case 8:
89511+ __asm__ __volatile__("lock; cmpxchgq %1,%2"
89512+ : "=a"(prev)
89513+ : "r"(new), "m"(*__synch_xg(ptr)),
89514+ "0"(old)
89515+ : "memory");
89516+ return prev;
89517+#else
89518+ case 4:
89519+ __asm__ __volatile__("lock; cmpxchgl %1,%2"
89520+ : "=a"(prev)
89521+ : "r"(new), "m"(*__synch_xg(ptr)),
89522+ "0"(old)
89523+ : "memory");
89524+ return prev;
89525+#endif
89526+ }
89527+ return old;
89528+}
89529+
89530+static __always_inline int synch_const_test_bit(int nr,
89531+ const volatile void * addr)
89532+{
89533+ return ((1UL << (nr & 31)) &
89534+ (((const volatile unsigned int *) addr)[nr >> 5])) != 0;
89535+}
89536+
89537+static __inline__ int synch_var_test_bit(int nr, volatile void * addr)
89538+{
89539+ int oldbit;
89540+ __asm__ __volatile__ (
89541+ "btl %2,%1\n\tsbbl %0,%0"
89542+ : "=r" (oldbit) : "m" (ADDR), "Ir" (nr) );
89543+ return oldbit;
89544+}
89545+
89546+#define synch_test_bit(nr,addr) \
89547+(__builtin_constant_p(nr) ? \
89548+ synch_const_test_bit((nr),(addr)) : \
89549+ synch_var_test_bit((nr),(addr)))
89550+
89551+#define synch_cmpxchg_subword synch_cmpxchg
89552+
89553+#endif /* __XEN_SYNCH_BITOPS_H__ */
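
The synch_* operations above always carry the lock prefix so that read-modify-write updates stay atomic with respect to the hypervisor and other guests, not just other local CPUs, and the test-and-set variants report the previous state of the bit. The sketch below is illustrative only: it models synch_test_and_set_bit() with __atomic_fetch_or on 32-bit words instead of the "lock btsl" inline assembly, and the pending[] array is just a stand-in for a shared bitmap such as an event-channel mask.

    #include <stdio.h>

    static int model_test_and_set_bit(int nr, unsigned int *addr)
    {
        unsigned int mask = 1u << (nr & 31);
        unsigned int old  = __atomic_fetch_or(&addr[nr >> 5], mask,
                                              __ATOMIC_SEQ_CST);
        return (old & mask) != 0;           /* previous state of the bit */
    }

    int main(void)
    {
        unsigned int pending[2] = { 0, 0 }; /* stand-in for a shared bitmap */
        printf("first set of bit 33:  was %d\n",
               model_test_and_set_bit(33, pending));   /* 0 */
        printf("second set of bit 33: was %d\n",
               model_test_and_set_bit(33, pending));   /* 1 */
        return 0;
    }
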
89554diff -Nur linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/system.h linux-2.6.16.33/include/asm-i386/mach-xen/asm/system.h
89555--- linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/system.h 1970-01-01 00:00:00.000000000 +0000
89556+++ linux-2.6.16.33/include/asm-i386/mach-xen/asm/system.h 2007-01-08 15:00:46.000000000 +0000
89557@@ -0,0 +1,681 @@
89558+#ifndef __ASM_SYSTEM_H
89559+#define __ASM_SYSTEM_H
89560+
89561+#include <linux/config.h>
89562+#include <linux/kernel.h>
89563+#include <linux/bitops.h>
89564+#include <asm/synch_bitops.h>
89565+#include <asm/segment.h>
89566+#include <asm/cpufeature.h>
89567+#include <asm/hypervisor.h>
89568+#include <asm/smp_alt.h>
89569+
89570+#ifdef __KERNEL__
89571+
89572+#ifdef CONFIG_SMP
89573+#define __vcpu_id smp_processor_id()
89574+#else
89575+#define __vcpu_id 0
89576+#endif
89577+
89578+struct task_struct; /* one of the stranger aspects of C forward declarations.. */
89579+extern struct task_struct * FASTCALL(__switch_to(struct task_struct *prev, struct task_struct *next));
89580+
89581+#define switch_to(prev,next,last) do { \
89582+ unsigned long esi,edi; \
89583+ asm volatile("pushl %%ebp\n\t" \
89584+ "movl %%esp,%0\n\t" /* save ESP */ \
89585+ "movl %5,%%esp\n\t" /* restore ESP */ \
89586+ "movl $1f,%1\n\t" /* save EIP */ \
89587+ "pushl %6\n\t" /* restore EIP */ \
89588+ "jmp __switch_to\n" \
89589+ "1:\t" \
89590+ "popl %%ebp\n\t" \
89591+ :"=m" (prev->thread.esp),"=m" (prev->thread.eip), \
89592+ "=a" (last),"=S" (esi),"=D" (edi) \
89593+ :"m" (next->thread.esp),"m" (next->thread.eip), \
89594+ "2" (prev), "d" (next)); \
89595+} while (0)
89596+
89597+#define _set_base(addr,base) do { unsigned long __pr; \
89598+__asm__ __volatile__ ("movw %%dx,%1\n\t" \
89599+ "rorl $16,%%edx\n\t" \
89600+ "movb %%dl,%2\n\t" \
89601+ "movb %%dh,%3" \
89602+ :"=&d" (__pr) \
89603+ :"m" (*((addr)+2)), \
89604+ "m" (*((addr)+4)), \
89605+ "m" (*((addr)+7)), \
89606+ "0" (base) \
89607+ ); } while(0)
89608+
89609+#define _set_limit(addr,limit) do { unsigned long __lr; \
89610+__asm__ __volatile__ ("movw %%dx,%1\n\t" \
89611+ "rorl $16,%%edx\n\t" \
89612+ "movb %2,%%dh\n\t" \
89613+ "andb $0xf0,%%dh\n\t" \
89614+ "orb %%dh,%%dl\n\t" \
89615+ "movb %%dl,%2" \
89616+ :"=&d" (__lr) \
89617+ :"m" (*(addr)), \
89618+ "m" (*((addr)+6)), \
89619+ "0" (limit) \
89620+ ); } while(0)
89621+
89622+#define set_base(ldt,base) _set_base( ((char *)&(ldt)) , (base) )
89623+#define set_limit(ldt,limit) _set_limit( ((char *)&(ldt)) , ((limit)-1) )
89624+
89625+/*
89626+ * Load a segment. Fall back on loading the zero
89627+ * segment if something goes wrong..
89628+ */
89629+#define loadsegment(seg,value) \
89630+ asm volatile("\n" \
89631+ "1:\t" \
89632+ "mov %0,%%" #seg "\n" \
89633+ "2:\n" \
89634+ ".section .fixup,\"ax\"\n" \
89635+ "3:\t" \
89636+ "pushl $0\n\t" \
89637+ "popl %%" #seg "\n\t" \
89638+ "jmp 2b\n" \
89639+ ".previous\n" \
89640+ ".section __ex_table,\"a\"\n\t" \
89641+ ".align 4\n\t" \
89642+ ".long 1b,3b\n" \
89643+ ".previous" \
89644+ : :"rm" (value))
89645+
89646+/*
89647+ * Save a segment register away
89648+ */
89649+#define savesegment(seg, value) \
89650+ asm volatile("mov %%" #seg ",%0":"=rm" (value))
89651+
89652+/*
89653+ * Clear and set 'TS' bit respectively
89654+ */
89655+#define clts() (HYPERVISOR_fpu_taskswitch(0))
89656+#define read_cr0() ({ \
89657+ unsigned int __dummy; \
89658+ __asm__ __volatile__( \
89659+ "movl %%cr0,%0\n\t" \
89660+ :"=r" (__dummy)); \
89661+ __dummy; \
89662+})
89663+#define write_cr0(x) \
89664+ __asm__ __volatile__("movl %0,%%cr0": :"r" (x));
89665+
89666+#define read_cr2() \
89667+ (HYPERVISOR_shared_info->vcpu_info[smp_processor_id()].arch.cr2)
89668+#define write_cr2(x) \
89669+ __asm__ __volatile__("movl %0,%%cr2": :"r" (x));
89670+
89671+#define read_cr3() ({ \
89672+ unsigned int __dummy; \
89673+ __asm__ ( \
89674+ "movl %%cr3,%0\n\t" \
89675+ :"=r" (__dummy)); \
89676+ __dummy = xen_cr3_to_pfn(__dummy); \
89677+ mfn_to_pfn(__dummy) << PAGE_SHIFT; \
89678+})
89679+#define write_cr3(x) ({ \
89680+ unsigned int __dummy = pfn_to_mfn((x) >> PAGE_SHIFT); \
89681+ __dummy = xen_pfn_to_cr3(__dummy); \
89682+ __asm__ __volatile__("movl %0,%%cr3": :"r" (__dummy)); \
89683+})
89684+
89685+#define read_cr4() ({ \
89686+ unsigned int __dummy; \
89687+ __asm__( \
89688+ "movl %%cr4,%0\n\t" \
89689+ :"=r" (__dummy)); \
89690+ __dummy; \
89691+})
89692+
89693+#define read_cr4_safe() ({ \
89694+ unsigned int __dummy; \
89695+ /* This could fault if %cr4 does not exist */ \
89696+ __asm__("1: movl %%cr4, %0 \n" \
89697+ "2: \n" \
89698+ ".section __ex_table,\"a\" \n" \
89699+ ".long 1b,2b \n" \
89700+ ".previous \n" \
89701+ : "=r" (__dummy): "0" (0)); \
89702+ __dummy; \
89703+})
89704+
89705+#define write_cr4(x) \
89706+ __asm__ __volatile__("movl %0,%%cr4": :"r" (x));
89707+#define stts() (HYPERVISOR_fpu_taskswitch(1))
89708+
89709+#endif /* __KERNEL__ */
89710+
89711+#define wbinvd() \
89712+ __asm__ __volatile__ ("wbinvd": : :"memory");
89713+
89714+static inline unsigned long get_limit(unsigned long segment)
89715+{
89716+ unsigned long __limit;
89717+ __asm__("lsll %1,%0"
89718+ :"=r" (__limit):"r" (segment));
89719+ return __limit+1;
89720+}
89721+
89722+#define nop() __asm__ __volatile__ ("nop")
89723+
89724+#define xchg(ptr,v) ((__typeof__(*(ptr)))__xchg((unsigned long)(v),(ptr),sizeof(*(ptr))))
89725+
89726+#define tas(ptr) (xchg((ptr),1))
89727+
89728+struct __xchg_dummy { unsigned long a[100]; };
89729+#define __xg(x) ((struct __xchg_dummy *)(x))
89730+
89731+
89732+#ifdef CONFIG_X86_CMPXCHG64
89733+
89734+/*
89735+ * The semantics of CMPXCHG8B are a bit strange; this is why
89736+ * there is a loop and the loading of %%eax and %%edx has to
89737+ * be inside. This inlines well in most cases, the cached
89738+ * cost is around ~38 cycles. (in the future we might want
89739+ * to do an SIMD/3DNOW!/MMX/FPU 64-bit store here, but that
89740+ * might have an implicit FPU-save as a cost, so it's not
89741+ * clear which path to go.)
89742+ *
89743+ * cmpxchg8b must be used with the lock prefix here to allow
89744+ * the instruction to be executed atomically, see page 3-102
89745+ * of the instruction set reference 24319102.pdf. We need
89746+ * the reader side to see the coherent 64bit value.
89747+ */
89748+static inline void __set_64bit (unsigned long long * ptr,
89749+ unsigned int low, unsigned int high)
89750+{
89751+ __asm__ __volatile__ (
89752+ "\n1:\t"
89753+ "movl (%0), %%eax\n\t"
89754+ "movl 4(%0), %%edx\n\t"
89755+ "lock cmpxchg8b (%0)\n\t"
89756+ "jnz 1b"
89757+ : /* no outputs */
89758+ : "D"(ptr),
89759+ "b"(low),
89760+ "c"(high)
89761+ : "ax","dx","memory");
89762+}
89763+
89764+static inline void __set_64bit_constant (unsigned long long *ptr,
89765+ unsigned long long value)
89766+{
89767+ __set_64bit(ptr,(unsigned int)(value), (unsigned int)((value)>>32ULL));
89768+}
89769+#define ll_low(x) *(((unsigned int*)&(x))+0)
89770+#define ll_high(x) *(((unsigned int*)&(x))+1)
89771+
89772+static inline void __set_64bit_var (unsigned long long *ptr,
89773+ unsigned long long value)
89774+{
89775+ __set_64bit(ptr,ll_low(value), ll_high(value));
89776+}
89777+
89778+#define set_64bit(ptr,value) \
89779+(__builtin_constant_p(value) ? \
89780+ __set_64bit_constant(ptr, value) : \
89781+ __set_64bit_var(ptr, value) )
89782+
89783+#define _set_64bit(ptr,value) \
89784+(__builtin_constant_p(value) ? \
89785+ __set_64bit(ptr, (unsigned int)(value), (unsigned int)((value)>>32ULL) ) : \
89786+ __set_64bit(ptr, ll_low(value), ll_high(value)) )
89787+
89788+#endif
89789+
89790+/*
89791+ * Note: no "lock" prefix even on SMP: xchg always implies lock anyway
89792+ * Note 2: xchg has side effect, so that attribute volatile is necessary,
89793+ * but generally the primitive is invalid, *ptr is output argument. --ANK
89794+ */
89795+static inline unsigned long __xchg(unsigned long x, volatile void * ptr, int size)
89796+{
89797+ switch (size) {
89798+ case 1:
89799+ __asm__ __volatile__("xchgb %b0,%1"
89800+ :"=q" (x)
89801+ :"m" (*__xg(ptr)), "0" (x)
89802+ :"memory");
89803+ break;
89804+ case 2:
89805+ __asm__ __volatile__("xchgw %w0,%1"
89806+ :"=r" (x)
89807+ :"m" (*__xg(ptr)), "0" (x)
89808+ :"memory");
89809+ break;
89810+ case 4:
89811+ __asm__ __volatile__("xchgl %0,%1"
89812+ :"=r" (x)
89813+ :"m" (*__xg(ptr)), "0" (x)
89814+ :"memory");
89815+ break;
89816+ }
89817+ return x;
89818+}
89819+
89820+/*
89821+ * Atomic compare and exchange. Compare OLD with MEM, if identical,
89822+ * store NEW in MEM. Return the initial value in MEM. Success is
89823+ * indicated by comparing RETURN with OLD.
89824+ */
89825+
89826+#ifdef CONFIG_X86_CMPXCHG
89827+#define __HAVE_ARCH_CMPXCHG 1
89828+#define cmpxchg(ptr,o,n)\
89829+ ((__typeof__(*(ptr)))__cmpxchg((ptr),(unsigned long)(o),\
89830+ (unsigned long)(n),sizeof(*(ptr))))
89831+#endif
89832+
89833+static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old,
89834+ unsigned long new, int size)
89835+{
89836+ unsigned long prev;
89837+ switch (size) {
89838+ case 1:
89839+ __asm__ __volatile__(LOCK "cmpxchgb %b1,%2"
89840+ : "=a"(prev)
89841+ : "q"(new), "m"(*__xg(ptr)), "0"(old)
89842+ : "memory");
89843+ return prev;
89844+ case 2:
89845+ __asm__ __volatile__(LOCK "cmpxchgw %w1,%2"
89846+ : "=a"(prev)
89847+ : "r"(new), "m"(*__xg(ptr)), "0"(old)
89848+ : "memory");
89849+ return prev;
89850+ case 4:
89851+ __asm__ __volatile__(LOCK "cmpxchgl %1,%2"
89852+ : "=a"(prev)
89853+ : "r"(new), "m"(*__xg(ptr)), "0"(old)
89854+ : "memory");
89855+ return prev;
89856+ }
89857+ return old;
89858+}
89859+
89860+#ifndef CONFIG_X86_CMPXCHG
89861+/*
89862+ * Building a kernel capable of running on an 80386. It may be necessary to
89863+ * simulate the cmpxchg on the 80386 CPU. For that purpose we define
89864+ * a function for each of the sizes we support.
89865+ */
89866+
89867+extern unsigned long cmpxchg_386_u8(volatile void *, u8, u8);
89868+extern unsigned long cmpxchg_386_u16(volatile void *, u16, u16);
89869+extern unsigned long cmpxchg_386_u32(volatile void *, u32, u32);
89870+
89871+static inline unsigned long cmpxchg_386(volatile void *ptr, unsigned long old,
89872+ unsigned long new, int size)
89873+{
89874+ switch (size) {
89875+ case 1:
89876+ return cmpxchg_386_u8(ptr, old, new);
89877+ case 2:
89878+ return cmpxchg_386_u16(ptr, old, new);
89879+ case 4:
89880+ return cmpxchg_386_u32(ptr, old, new);
89881+ }
89882+ return old;
89883+}
89884+
89885+#define cmpxchg(ptr,o,n) \
89886+({ \
89887+ __typeof__(*(ptr)) __ret; \
89888+ if (likely(boot_cpu_data.x86 > 3)) \
89889+ __ret = __cmpxchg((ptr), (unsigned long)(o), \
89890+ (unsigned long)(n), sizeof(*(ptr))); \
89891+ else \
89892+ __ret = cmpxchg_386((ptr), (unsigned long)(o), \
89893+ (unsigned long)(n), sizeof(*(ptr))); \
89894+ __ret; \
89895+})
89896+#endif
89897+
89898+#ifdef CONFIG_X86_CMPXCHG64
89899+
89900+static inline unsigned long long __cmpxchg64(volatile void *ptr, unsigned long long old,
89901+ unsigned long long new)
89902+{
89903+ unsigned long long prev;
89904+ __asm__ __volatile__(LOCK "cmpxchg8b %3"
89905+ : "=A"(prev)
89906+ : "b"((unsigned long)new),
89907+ "c"((unsigned long)(new >> 32)),
89908+ "m"(*__xg(ptr)),
89909+ "0"(old)
89910+ : "memory");
89911+ return prev;
89912+}
89913+
89914+#define cmpxchg64(ptr,o,n)\
89915+ ((__typeof__(*(ptr)))__cmpxchg64((ptr),(unsigned long long)(o),\
89916+ (unsigned long long)(n)))
89917+
89918+#endif
89919+
89920+#ifdef __KERNEL__
89921+struct alt_instr {
89922+ __u8 *instr; /* original instruction */
89923+ __u8 *replacement;
89924+ __u8 cpuid; /* cpuid bit set for replacement */
89925+ __u8 instrlen; /* length of original instruction */
89926+ __u8 replacementlen; /* length of new instruction, <= instrlen */
89927+ __u8 pad;
89928+};
89929+#endif
89930+
89931+/*
89932+ * Alternative instructions for different CPU types or capabilities.
89933+ *
89934+ * This allows the use of optimized instructions even on generic binary
89935+ * kernels.
89936+ *
89937+ * oldinstr must be at least as long as newinstr.
89938+ * It can be padded with nops as needed.
89939+ *
89940+ * For non barrier like inlines please define new variants
89941+ * without volatile and memory clobber.
89942+ */
89943+#define alternative(oldinstr, newinstr, feature) \
89944+ asm volatile ("661:\n\t" oldinstr "\n662:\n" \
89945+ ".section .altinstructions,\"a\"\n" \
89946+ " .align 4\n" \
89947+ " .long 661b\n" /* label */ \
89948+ " .long 663f\n" /* new instruction */ \
89949+ " .byte %c0\n" /* feature bit */ \
89950+ " .byte 662b-661b\n" /* sourcelen */ \
89951+ " .byte 664f-663f\n" /* replacementlen */ \
89952+ ".previous\n" \
89953+ ".section .altinstr_replacement,\"ax\"\n" \
89954+ "663:\n\t" newinstr "\n664:\n" /* replacement */ \
89955+ ".previous" :: "i" (feature) : "memory")
89956+
89957+/*
89958+ * Alternative inline assembly with input.
89959+ *
89960+ * Peculiarities:
89961+ * No memory clobber here.
89962+ * Argument numbers start with 1.
89963+ * It is best to use constraints that are fixed size (like (%1) ... "r").
89964+ * If you use variable-sized constraints like "m" or "g" in the
89965+ * replacement, make sure to pad to the worst-case length.
89966+ */
89967+#define alternative_input(oldinstr, newinstr, feature, input...) \
89968+ asm volatile ("661:\n\t" oldinstr "\n662:\n" \
89969+ ".section .altinstructions,\"a\"\n" \
89970+ " .align 4\n" \
89971+ " .long 661b\n" /* label */ \
89972+ " .long 663f\n" /* new instruction */ \
89973+ " .byte %c0\n" /* feature bit */ \
89974+ " .byte 662b-661b\n" /* sourcelen */ \
89975+ " .byte 664f-663f\n" /* replacementlen */ \
89976+ ".previous\n" \
89977+ ".section .altinstr_replacement,\"ax\"\n" \
89978+ "663:\n\t" newinstr "\n664:\n" /* replacement */ \
89979+ ".previous" :: "i" (feature), ##input)
89980+
89981+/*
89982+ * Force strict CPU ordering.
89983+ * And yes, this is required on UP too when we're talking
89984+ * to devices.
89985+ *
89986+ * For now, "wmb()" doesn't actually do anything, as all
89987+ * Intel CPU's follow what Intel calls a *Processor Order*,
89988+ * in which all writes are seen in the program order even
89989+ * outside the CPU.
89990+ *
89991+ * I expect future Intel CPU's to have a weaker ordering,
89992+ * but I'd also expect them to finally get their act together
89993+ * and add some real memory barriers if so.
89994+ *
89995+ * Some non-Intel clones support out-of-order stores. wmb() ceases to be a
89996+ * nop for these.
89997+ */
89998+
89999+
90000+/*
90001+ * Actually only lfence would be needed for mb() because all stores done
90002+ * by the kernel should be already ordered. But keep a full barrier for now.
90003+ */
90004+
90005+#define mb() alternative("lock; addl $0,0(%%esp)", "mfence", X86_FEATURE_XMM2)
90006+#define rmb() alternative("lock; addl $0,0(%%esp)", "lfence", X86_FEATURE_XMM2)
90007+
90008+/**
90009+ * read_barrier_depends - Flush all pending reads that subsequent reads
90010+ * depend on.
90011+ *
90012+ * No data-dependent reads from memory-like regions are ever reordered
90013+ * over this barrier. All reads preceding this primitive are guaranteed
90014+ * to access memory (but not necessarily other CPUs' caches) before any
90015+ * reads following this primitive that depend on the data returned by
90016+ * any of the preceding reads. This primitive is much lighter weight than
90017+ * rmb() on most CPUs, and is never heavier weight than is
90018+ * rmb().
90019+ *
90020+ * These ordering constraints are respected by both the local CPU
90021+ * and the compiler.
90022+ *
90023+ * Ordering is not guaranteed by anything other than these primitives,
90024+ * not even by data dependencies. See the documentation for
90025+ * memory_barrier() for examples and URLs to more information.
90026+ *
90027+ * For example, the following code would force ordering (the initial
90028+ * value of "a" is zero, "b" is one, and "p" is "&a"):
90029+ *
90030+ * <programlisting>
90031+ * CPU 0 CPU 1
90032+ *
90033+ * b = 2;
90034+ * memory_barrier();
90035+ * p = &b; q = p;
90036+ * read_barrier_depends();
90037+ * d = *q;
90038+ * </programlisting>
90039+ *
90040+ * because the read of "*q" depends on the read of "p" and these
90041+ * two reads are separated by a read_barrier_depends(). However,
90042+ * the following code, with the same initial values for "a" and "b":
90043+ *
90044+ * <programlisting>
90045+ * CPU 0 CPU 1
90046+ *
90047+ * a = 2;
90048+ * memory_barrier();
90049+ * b = 3; y = b;
90050+ * read_barrier_depends();
90051+ * x = a;
90052+ * </programlisting>
90053+ *
90054+ * does not enforce ordering, since there is no data dependency between
90055+ * the read of "a" and the read of "b". Therefore, on some CPUs, such
90056+ * as Alpha, "y" could be set to 3 and "x" to 0. Use rmb()
90057+ * in cases like this where there are no data dependencies.
90058+ **/
90059+
90060+#define read_barrier_depends() do { } while(0)
90061+
90062+#ifdef CONFIG_X86_OOSTORE
90063+/* Actually there are no OOO store capable CPUs for now that do SSE,
90064+ but make it already a possibility. */
90065+#define wmb() alternative("lock; addl $0,0(%%esp)", "sfence", X86_FEATURE_XMM)
90066+#else
90067+#define wmb() __asm__ __volatile__ ("": : :"memory")
90068+#endif
90069+
90070+#ifdef CONFIG_SMP
90071+#define smp_wmb() wmb()
90072+#if defined(CONFIG_SMP_ALTERNATIVES) && !defined(MODULE)
90073+#define smp_alt_mb(instr) \
90074+__asm__ __volatile__("6667:\nnop\nnop\nnop\nnop\nnop\nnop\n6668:\n" \
90075+ ".section __smp_alternatives,\"a\"\n" \
90076+ ".long 6667b\n" \
90077+ ".long 6673f\n" \
90078+ ".previous\n" \
90079+ ".section __smp_replacements,\"a\"\n" \
90080+ "6673:.byte 6668b-6667b\n" \
90081+ ".byte 6670f-6669f\n" \
90082+ ".byte 6671f-6670f\n" \
90083+ ".byte 0\n" \
90084+ ".byte %c0\n" \
90085+ "6669:lock;addl $0,0(%%esp)\n" \
90086+ "6670:" instr "\n" \
90087+ "6671:\n" \
90088+ ".previous\n" \
90089+ : \
90090+ : "i" (X86_FEATURE_XMM2) \
90091+ : "memory")
90092+#define smp_rmb() smp_alt_mb("lfence")
90093+#define smp_mb() smp_alt_mb("mfence")
90094+#define set_mb(var, value) do { \
90095+unsigned long __set_mb_temp; \
90096+__asm__ __volatile__("6667:movl %1, %0\n6668:\n" \
90097+ ".section __smp_alternatives,\"a\"\n" \
90098+ ".long 6667b\n" \
90099+ ".long 6673f\n" \
90100+ ".previous\n" \
90101+ ".section __smp_replacements,\"a\"\n" \
90102+ "6673: .byte 6668b-6667b\n" \
90103+ ".byte 6670f-6669f\n" \
90104+ ".byte 0\n" \
90105+ ".byte 6671f-6670f\n" \
90106+ ".byte -1\n" \
90107+ "6669: xchg %1, %0\n" \
90108+ "6670:movl %1, %0\n" \
90109+ "6671:\n" \
90110+ ".previous\n" \
90111+ : "=m" (var), "=r" (__set_mb_temp) \
90112+ : "1" (value) \
90113+ : "memory"); } while (0)
90114+#else
90115+#define smp_rmb() rmb()
90116+#define smp_mb() mb()
90117+#define set_mb(var, value) do { (void) xchg(&var, value); } while (0)
90118+#endif
90119+#define smp_read_barrier_depends() read_barrier_depends()
90120+#else
90121+#define smp_mb() barrier()
90122+#define smp_rmb() barrier()
90123+#define smp_wmb() barrier()
90124+#define smp_read_barrier_depends() do { } while(0)
90125+#define set_mb(var, value) do { var = value; barrier(); } while (0)
90126+#endif
90127+
90128+#define set_wmb(var, value) do { var = value; wmb(); } while (0)
90129+
90130+/* interrupt control.. */
90131+
90132+/*
90133+ * The use of 'barrier' in the following reflects their use as local-lock
90134+ * operations. Reentrancy must be prevented (e.g., __cli()) /before/ following
90135+ * critical operations are executed. All critical operations must complete
90136+ * /before/ reentrancy is permitted (e.g., __sti()). Alpha architecture also
90137+ * includes these barriers, for example.
90138+ */
90139+
90140+#define __cli() \
90141+do { \
90142+ vcpu_info_t *_vcpu; \
90143+ preempt_disable(); \
90144+ _vcpu = &HYPERVISOR_shared_info->vcpu_info[__vcpu_id]; \
90145+ _vcpu->evtchn_upcall_mask = 1; \
90146+ preempt_enable_no_resched(); \
90147+ barrier(); \
90148+} while (0)
90149+
90150+#define __sti() \
90151+do { \
90152+ vcpu_info_t *_vcpu; \
90153+ barrier(); \
90154+ preempt_disable(); \
90155+ _vcpu = &HYPERVISOR_shared_info->vcpu_info[__vcpu_id]; \
90156+ _vcpu->evtchn_upcall_mask = 0; \
90157+ barrier(); /* unmask then check (avoid races) */ \
90158+ if (unlikely(_vcpu->evtchn_upcall_pending)) \
90159+ force_evtchn_callback(); \
90160+ preempt_enable(); \
90161+} while (0)
90162+
90163+#define __save_flags(x) \
90164+do { \
90165+ vcpu_info_t *_vcpu; \
90166+ preempt_disable(); \
90167+ _vcpu = &HYPERVISOR_shared_info->vcpu_info[__vcpu_id]; \
90168+ (x) = _vcpu->evtchn_upcall_mask; \
90169+ preempt_enable(); \
90170+} while (0)
90171+
90172+#define __restore_flags(x) \
90173+do { \
90174+ vcpu_info_t *_vcpu; \
90175+ barrier(); \
90176+ preempt_disable(); \
90177+ _vcpu = &HYPERVISOR_shared_info->vcpu_info[__vcpu_id]; \
90178+ if ((_vcpu->evtchn_upcall_mask = (x)) == 0) { \
90179+ barrier(); /* unmask then check (avoid races) */ \
90180+ if (unlikely(_vcpu->evtchn_upcall_pending)) \
90181+ force_evtchn_callback(); \
90182+ preempt_enable(); \
90183+ } else \
90184+ preempt_enable_no_resched(); \
90185+} while (0)
90186+
90187+void safe_halt(void);
90188+void halt(void);
90189+
90190+#define __save_and_cli(x) \
90191+do { \
90192+ vcpu_info_t *_vcpu; \
90193+ preempt_disable(); \
90194+ _vcpu = &HYPERVISOR_shared_info->vcpu_info[__vcpu_id]; \
90195+ (x) = _vcpu->evtchn_upcall_mask; \
90196+ _vcpu->evtchn_upcall_mask = 1; \
90197+ preempt_enable_no_resched(); \
90198+ barrier(); \
90199+} while (0)
90200+
90201+#define local_irq_save(x) __save_and_cli(x)
90202+#define local_irq_restore(x) __restore_flags(x)
90203+#define local_save_flags(x) __save_flags(x)
90204+#define local_irq_disable() __cli()
90205+#define local_irq_enable() __sti()
90206+
90207+/* Cannot use preempt_enable() here as we would recurse in preempt_sched(). */
90208+#define irqs_disabled() \
90209+({ int ___x; \
90210+ vcpu_info_t *_vcpu; \
90211+ preempt_disable(); \
90212+ _vcpu = &HYPERVISOR_shared_info->vcpu_info[__vcpu_id]; \
90213+ ___x = (_vcpu->evtchn_upcall_mask != 0); \
90214+ preempt_enable_no_resched(); \
90215+ ___x; })
90216+
90217+/*
90218+ * disable hlt during certain critical i/o operations
90219+ */
90220+#define HAVE_DISABLE_HLT
90221+void disable_hlt(void);
90222+void enable_hlt(void);
90223+
90224+extern int es7000_plat;
90225+void cpu_idle_wait(void);
90226+
90227+/*
90228+ * On SMP systems, when the scheduler does migration-cost autodetection,
90229+ * it needs a way to flush as much of the CPU's caches as possible:
90230+ */
90231+static inline void sched_cacheflush(void)
90232+{
90233+ wbinvd();
90234+}
90235+
90236+extern unsigned long arch_align_stack(unsigned long sp);
90237+
90238+#endif
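The interrupt-control macros above never touch the real cli/sti instructions: they only flip the per-VCPU evtchn_upcall_mask field in the shared-info page and re-check evtchn_upcall_pending when unmasking. Callers are unaffected and keep using the usual kernel idiom; a minimal caller-side sketch (ordinary kernel usage, not part of this patch):

    static int counter;

    static void bump_counter(void)
    {
            unsigned long flags;

            local_irq_save(flags);    /* __save_and_cli: evtchn_upcall_mask = 1 */
            counter++;                /* no event upcalls are delivered here    */
            local_irq_restore(flags); /* __restore_flags: may run
                                         force_evtchn_callback() if an event
                                         became pending while masked */
    }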
90239diff -Nur linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/tlbflush.h linux-2.6.16.33/include/asm-i386/mach-xen/asm/tlbflush.h
90240--- linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/tlbflush.h 1970-01-01 00:00:00.000000000 +0000
90241+++ linux-2.6.16.33/include/asm-i386/mach-xen/asm/tlbflush.h 2007-01-08 15:00:46.000000000 +0000
90242@@ -0,0 +1,102 @@
90243+#ifndef _I386_TLBFLUSH_H
90244+#define _I386_TLBFLUSH_H
90245+
90246+#include <linux/config.h>
90247+#include <linux/mm.h>
90248+#include <asm/processor.h>
90249+
90250+#define __flush_tlb() xen_tlb_flush()
90251+#define __flush_tlb_global() xen_tlb_flush()
90252+#define __flush_tlb_all() xen_tlb_flush()
90253+
90254+extern unsigned long pgkern_mask;
90255+
90256+#define cpu_has_invlpg (boot_cpu_data.x86 > 3)
90257+
90258+#define __flush_tlb_single(addr) xen_invlpg(addr)
90259+
90260+#define __flush_tlb_one(addr) __flush_tlb_single(addr)
90261+
90262+/*
90263+ * TLB flushing:
90264+ *
90265+ * - flush_tlb() flushes the current mm struct TLBs
90266+ * - flush_tlb_all() flushes all processes TLBs
90267+ * - flush_tlb_mm(mm) flushes the specified mm context TLB's
90268+ * - flush_tlb_page(vma, vmaddr) flushes one page
90269+ * - flush_tlb_range(vma, start, end) flushes a range of pages
90270+ * - flush_tlb_kernel_range(start, end) flushes a range of kernel pages
90271+ * - flush_tlb_pgtables(mm, start, end) flushes a range of page tables
90272+ *
90273+ * ..but the i386 has somewhat limited tlb flushing capabilities,
90274+ * and page-granular flushes are available only on i486 and up.
90275+ */
90276+
90277+#ifndef CONFIG_SMP
90278+
90279+#define flush_tlb() __flush_tlb()
90280+#define flush_tlb_all() __flush_tlb_all()
90281+#define local_flush_tlb() __flush_tlb()
90282+
90283+static inline void flush_tlb_mm(struct mm_struct *mm)
90284+{
90285+ if (mm == current->active_mm)
90286+ __flush_tlb();
90287+}
90288+
90289+static inline void flush_tlb_page(struct vm_area_struct *vma,
90290+ unsigned long addr)
90291+{
90292+ if (vma->vm_mm == current->active_mm)
90293+ __flush_tlb_one(addr);
90294+}
90295+
90296+static inline void flush_tlb_range(struct vm_area_struct *vma,
90297+ unsigned long start, unsigned long end)
90298+{
90299+ if (vma->vm_mm == current->active_mm)
90300+ __flush_tlb();
90301+}
90302+
90303+#else
90304+
90305+#include <asm/smp.h>
90306+
90307+#define local_flush_tlb() \
90308+ __flush_tlb()
90309+
90310+extern void flush_tlb_all(void);
90311+extern void flush_tlb_current_task(void);
90312+extern void flush_tlb_mm(struct mm_struct *);
90313+extern void flush_tlb_page(struct vm_area_struct *, unsigned long);
90314+
90315+#define flush_tlb() flush_tlb_current_task()
90316+
90317+static inline void flush_tlb_range(struct vm_area_struct * vma, unsigned long start, unsigned long end)
90318+{
90319+ flush_tlb_mm(vma->vm_mm);
90320+}
90321+
90322+#define TLBSTATE_OK 1
90323+#define TLBSTATE_LAZY 2
90324+
90325+struct tlb_state
90326+{
90327+ struct mm_struct *active_mm;
90328+ int state;
90329+ char __cacheline_padding[L1_CACHE_BYTES-8];
90330+};
90331+DECLARE_PER_CPU(struct tlb_state, cpu_tlbstate);
90332+
90333+
90334+#endif
90335+
90336+#define flush_tlb_kernel_range(start, end) flush_tlb_all()
90337+
90338+static inline void flush_tlb_pgtables(struct mm_struct *mm,
90339+ unsigned long start, unsigned long end)
90340+{
90341+ /* i386 does not keep any page table caches in TLB */
90342+}
90343+
90344+#endif /* _I386_TLBFLUSH_H */
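The interface listed in the comment above is the stock i386 one; only the underlying primitives are redirected to xen_tlb_flush()/xen_invlpg(). As a reminder of how callers choose between them, a typical (purely illustrative) pattern is:

    flush_tlb_page(vma, address);        /* one PTE of one user mapping changed */
    flush_tlb_mm(mm);                    /* many mappings of one mm torn down   */
    flush_tlb_kernel_range(start, end);  /* kernel mappings changed (here a     */
                                         /* full flush, see the #define above)  */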
90345diff -Nur linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/vga.h linux-2.6.16.33/include/asm-i386/mach-xen/asm/vga.h
90346--- linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/vga.h 1970-01-01 00:00:00.000000000 +0000
90347+++ linux-2.6.16.33/include/asm-i386/mach-xen/asm/vga.h 2007-01-08 15:00:46.000000000 +0000
90348@@ -0,0 +1,20 @@
90349+/*
90350+ * Access to VGA videoram
90351+ *
90352+ * (c) 1998 Martin Mares <mj@ucw.cz>
90353+ */
90354+
90355+#ifndef _LINUX_ASM_VGA_H_
90356+#define _LINUX_ASM_VGA_H_
90357+
90358+/*
90359+ * On the PC, we can just recalculate addresses and then
90360+ * access the videoram directly without any black magic.
90361+ */
90362+
90363+#define VGA_MAP_MEM(x) (unsigned long)isa_bus_to_virt(x)
90364+
90365+#define vga_readb(x) (*(x))
90366+#define vga_writeb(x,y) (*(y) = (x))
90367+
90368+#endif
90369diff -Nur linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/xenoprof.h linux-2.6.16.33/include/asm-i386/mach-xen/asm/xenoprof.h
90370--- linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/xenoprof.h 1970-01-01 00:00:00.000000000 +0000
90371+++ linux-2.6.16.33/include/asm-i386/mach-xen/asm/xenoprof.h 2007-01-08 15:00:46.000000000 +0000
90372@@ -0,0 +1,48 @@
90373+/******************************************************************************
90374+ * asm-i386/mach-xen/asm/xenoprof.h
90375+ *
90376+ * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp>
90377+ * VA Linux Systems Japan K.K.
90378+ *
90379+ * This program is free software; you can redistribute it and/or modify
90380+ * it under the terms of the GNU General Public License as published by
90381+ * the Free Software Foundation; either version 2 of the License, or
90382+ * (at your option) any later version.
90383+ *
90384+ * This program is distributed in the hope that it will be useful,
90385+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
90386+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
90387+ * GNU General Public License for more details.
90388+ *
90389+ * You should have received a copy of the GNU General Public License
90390+ * along with this program; if not, write to the Free Software
90391+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
90392+ *
90393+ */
90394+#ifndef __ASM_XENOPROF_H__
90395+#define __ASM_XENOPROF_H__
90396+#ifdef CONFIG_XEN
90397+
90398+struct super_block;
90399+struct dentry;
90400+int xenoprof_create_files(struct super_block * sb, struct dentry * root);
90401+#define HAVE_XENOPROF_CREATE_FILES
90402+
90403+struct xenoprof_init;
90404+void xenoprof_arch_init_counter(struct xenoprof_init *init);
90405+void xenoprof_arch_counter(void);
90406+void xenoprof_arch_start(void);
90407+void xenoprof_arch_stop(void);
90408+
90409+struct xenoprof_arch_shared_buffer {
90410+ /* nothing */
90411+};
90412+struct xenoprof_shared_buffer;
90413+void xenoprof_arch_unmap_shared_buffer(struct xenoprof_shared_buffer* sbuf);
90414+struct xenoprof_get_buffer;
90415+int xenoprof_arch_map_shared_buffer(struct xenoprof_get_buffer* get_buffer, struct xenoprof_shared_buffer* sbuf);
90416+struct xenoprof_passive;
90417+int xenoprof_arch_set_passive(struct xenoprof_passive* pdomain, struct xenoprof_shared_buffer* sbuf);
90418+
90419+#endif /* CONFIG_XEN */
90420+#endif /* __ASM_XENOPROF_H__ */
90421diff -Nur linux-2.6.16.33-noxen/include/asm-i386/mach-xen/irq_vectors.h linux-2.6.16.33/include/asm-i386/mach-xen/irq_vectors.h
90422--- linux-2.6.16.33-noxen/include/asm-i386/mach-xen/irq_vectors.h 1970-01-01 00:00:00.000000000 +0000
90423+++ linux-2.6.16.33/include/asm-i386/mach-xen/irq_vectors.h 2007-01-08 15:00:46.000000000 +0000
90424@@ -0,0 +1,125 @@
90425+/*
90426+ * This file should contain #defines for all of the interrupt vector
90427+ * numbers used by this architecture.
90428+ *
90429+ * In addition, there are some standard defines:
90430+ *
90431+ * FIRST_EXTERNAL_VECTOR:
90432+ * The first free place for external interrupts
90433+ *
90434+ * SYSCALL_VECTOR:
90435+ * The IRQ vector under which a syscall makes the user-to-kernel
90436+ * transition.
90437+ *
90438+ * TIMER_IRQ:
90439+ * The IRQ number the timer interrupt comes in at.
90440+ *
90441+ * NR_IRQS:
90442+ * The total number of interrupt vectors (including all the
90443+ * architecture specific interrupts) needed.
90444+ *
90445+ */
90446+#ifndef _ASM_IRQ_VECTORS_H
90447+#define _ASM_IRQ_VECTORS_H
90448+
90449+/*
90450+ * IDT vectors usable for external interrupt sources start
90451+ * at 0x20:
90452+ */
90453+#define FIRST_EXTERNAL_VECTOR 0x20
90454+
90455+#define SYSCALL_VECTOR 0x80
90456+
90457+/*
90458+ * Vectors 0x20-0x2f are used for ISA interrupts.
90459+ */
90460+
90461+#if 0
90462+/*
90463+ * Special IRQ vectors used by the SMP architecture, 0xf0-0xff
90464+ *
90465+ * some of the following vectors are 'rare', they are merged
90466+ * into a single vector (CALL_FUNCTION_VECTOR) to save vector space.
90467+ * TLB, reschedule and local APIC vectors are performance-critical.
90468+ *
90469+ * Vectors 0xf0-0xfa are free (reserved for future Linux use).
90470+ */
90471+#define SPURIOUS_APIC_VECTOR 0xff
90472+#define ERROR_APIC_VECTOR 0xfe
90473+#define INVALIDATE_TLB_VECTOR 0xfd
90474+#define RESCHEDULE_VECTOR 0xfc
90475+#define CALL_FUNCTION_VECTOR 0xfb
90476+
90477+#define THERMAL_APIC_VECTOR 0xf0
90478+/*
90479+ * Local APIC timer IRQ vector is on a different priority level,
90480+ * to work around the 'lost local interrupt if more than 2 IRQ
90481+ * sources per level' errata.
90482+ */
90483+#define LOCAL_TIMER_VECTOR 0xef
90484+#endif
90485+
90486+#define SPURIOUS_APIC_VECTOR 0xff
90487+#define ERROR_APIC_VECTOR 0xfe
90488+
90489+/*
90490+ * First APIC vector available to drivers: (vectors 0x30-0xee)
90491+ * we start at 0x31 to spread out vectors evenly between priority
90492+ * levels. (0x80 is the syscall vector)
90493+ */
90494+#define FIRST_DEVICE_VECTOR 0x31
90495+#define FIRST_SYSTEM_VECTOR 0xef
90496+
90497+/*
90498+ * 16 8259A IRQ's, 208 potential APIC interrupt sources.
90499+ * Right now the APIC is mostly only used for SMP.
90500+ * 256 vectors is an architectural limit. (we can have
90501+ * more than 256 devices theoretically, but they will
90502+ * have to use shared interrupts)
90503+ * Since vectors 0x00-0x1f are used/reserved for the CPU,
90504+ * the usable vector space is 0x20-0xff (224 vectors)
90505+ */
90506+
90507+#define RESCHEDULE_VECTOR 0
90508+#define CALL_FUNCTION_VECTOR 1
90509+#define NR_IPIS 2
90510+
90511+/*
90512+ * The maximum number of vectors supported by i386 processors
90513+ * is limited to 256. For processors other than i386, NR_VECTORS
90514+ * should be changed accordingly.
90515+ */
90516+#define NR_VECTORS 256
90517+
90518+#define FPU_IRQ 13
90519+
90520+#define FIRST_VM86_IRQ 3
90521+#define LAST_VM86_IRQ 15
90522+#define invalid_vm86_irq(irq) ((irq) < 3 || (irq) > 15)
90523+
90524+/*
90525+ * The flat IRQ space is divided into two regions:
90526+ * 1. A one-to-one mapping of real physical IRQs. This space is only used
90527+ * if we have physical device-access privilege. This region is at the
90528+ * start of the IRQ space so that existing device drivers do not need
90529+ * to be modified to translate physical IRQ numbers into our IRQ space.
90530+ * 2. A dynamic mapping of inter-domain and Xen-sourced virtual IRQs. These
90531+ * are bound using the provided bind/unbind functions.
90532+ */
90533+
90534+#define PIRQ_BASE 0
90535+#define NR_PIRQS 256
90536+
90537+#define DYNIRQ_BASE (PIRQ_BASE + NR_PIRQS)
90538+#define NR_DYNIRQS 256
90539+
90540+#define NR_IRQS (NR_PIRQS + NR_DYNIRQS)
90541+#define NR_IRQ_VECTORS NR_IRQS
90542+
90543+#define pirq_to_irq(_x) ((_x) + PIRQ_BASE)
90544+#define irq_to_pirq(_x) ((_x) - PIRQ_BASE)
90545+
90546+#define dynirq_to_irq(_x) ((_x) + DYNIRQ_BASE)
90547+#define irq_to_dynirq(_x) ((_x) - DYNIRQ_BASE)
90548+
90549+#endif /* _ASM_IRQ_VECTORS_H */
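With PIRQ_BASE 0 and NR_PIRQS 256, physical IRQs occupy flat IRQs 0-255, and the 256 dynamically bound event-channel IRQs follow at 256-511 (so, e.g., dynirq_to_irq(5) == 261). A few compile-time sanity checks that merely restate the layout above (illustrative only, not part of the patch):

    static inline void irq_layout_checks(void)
    {
            BUILD_BUG_ON(PIRQ_BASE != 0);
            BUILD_BUG_ON(DYNIRQ_BASE != NR_PIRQS);
            BUILD_BUG_ON(NR_IRQS != NR_PIRQS + NR_DYNIRQS);
    }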
90550diff -Nur linux-2.6.16.33-noxen/include/asm-i386/mach-xen/mach_traps.h linux-2.6.16.33/include/asm-i386/mach-xen/mach_traps.h
90551--- linux-2.6.16.33-noxen/include/asm-i386/mach-xen/mach_traps.h 1970-01-01 00:00:00.000000000 +0000
90552+++ linux-2.6.16.33/include/asm-i386/mach-xen/mach_traps.h 2007-01-08 15:00:46.000000000 +0000
90553@@ -0,0 +1,33 @@
90554+/*
90555+ * include/asm-xen/asm-i386/mach-xen/mach_traps.h
90556+ *
90557+ * Machine specific NMI handling for Xen
90558+ */
90559+#ifndef _MACH_TRAPS_H
90560+#define _MACH_TRAPS_H
90561+
90562+#include <linux/bitops.h>
90563+#include <xen/interface/nmi.h>
90564+
90565+static inline void clear_mem_error(unsigned char reason) {}
90566+static inline void clear_io_check_error(unsigned char reason) {}
90567+
90568+static inline unsigned char get_nmi_reason(void)
90569+{
90570+ shared_info_t *s = HYPERVISOR_shared_info;
90571+ unsigned char reason = 0;
90572+
90573+ /* construct a value which looks like it came from
90574+ * port 0x61.
90575+ */
90576+ if (test_bit(_XEN_NMIREASON_io_error, &s->arch.nmi_reason))
90577+ reason |= 0x40;
90578+ if (test_bit(_XEN_NMIREASON_parity_error, &s->arch.nmi_reason))
90579+ reason |= 0x80;
90580+
90581+ return reason;
90582+}
90583+
90584+static inline void reassert_nmi(void) {}
90585+
90586+#endif /* !_MACH_TRAPS_H */
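get_nmi_reason() synthesises the value native code would read from system control port B (I/O port 0x61): bit 6 signals an I/O channel check, bit 7 a memory parity/SERR error. Named masks for the two literals used above, added purely for illustration:

    #define NMI_REASON_IOCHK   0x40  /* bit 6: _XEN_NMIREASON_io_error     */
    #define NMI_REASON_PARITY  0x80  /* bit 7: _XEN_NMIREASON_parity_error */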
90587diff -Nur linux-2.6.16.33-noxen/include/asm-i386/mach-xen/setup_arch_post.h linux-2.6.16.33/include/asm-i386/mach-xen/setup_arch_post.h
90588--- linux-2.6.16.33-noxen/include/asm-i386/mach-xen/setup_arch_post.h 1970-01-01 00:00:00.000000000 +0000
90589+++ linux-2.6.16.33/include/asm-i386/mach-xen/setup_arch_post.h 2007-01-08 15:00:46.000000000 +0000
90590@@ -0,0 +1,108 @@
90591+/**
90592+ * machine_specific_memory_setup - Hook for machine specific memory setup.
90593+ *
90594+ * Description:
90595+ * This is included late in kernel/setup.c so that it can make
90596+ * use of all of the static functions.
90597+ **/
90598+
90599+#include <xen/interface/callback.h>
90600+#include <xen/interface/memory.h>
90601+
90602+static char * __init machine_specific_memory_setup(void)
90603+{
90604+ int rc;
90605+ struct xen_memory_map memmap;
90606+ /*
90607+ * This is rather large for a stack variable but this early in
90608+ * the boot process we know we have plenty of slack space.
90609+ */
90610+ struct e820entry map[E820MAX];
90611+
90612+ memmap.nr_entries = E820MAX;
90613+ set_xen_guest_handle(memmap.buffer, map);
90614+
90615+ rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap);
90616+ if ( rc == -ENOSYS ) {
90617+ memmap.nr_entries = 1;
90618+ map[0].addr = 0ULL;
90619+ map[0].size = PFN_PHYS(xen_start_info->nr_pages);
90620+ /* 8MB slack (to balance backend allocations). */
90621+ map[0].size += 8ULL << 20;
90622+ map[0].type = E820_RAM;
90623+ rc = 0;
90624+ }
90625+ BUG_ON(rc);
90626+
90627+ sanitize_e820_map(map, (char *)&memmap.nr_entries);
90628+
90629+ BUG_ON(copy_e820_map(map, (char)memmap.nr_entries) < 0);
90630+
90631+ return "Xen";
90632+}
90633+
90634+extern void hypervisor_callback(void);
90635+extern void failsafe_callback(void);
90636+extern void nmi(void);
90637+
90638+unsigned long *machine_to_phys_mapping;
90639+EXPORT_SYMBOL(machine_to_phys_mapping);
90640+unsigned int machine_to_phys_order;
90641+EXPORT_SYMBOL(machine_to_phys_order);
90642+
90643+static void __init machine_specific_arch_setup(void)
90644+{
90645+ int ret;
90646+ struct xen_machphys_mapping mapping;
90647+ unsigned long machine_to_phys_nr_ents;
90648+ struct xen_platform_parameters pp;
90649+ static struct callback_register __initdata event = {
90650+ .type = CALLBACKTYPE_event,
90651+ .address = { __KERNEL_CS, (unsigned long)hypervisor_callback },
90652+ };
90653+ static struct callback_register __initdata failsafe = {
90654+ .type = CALLBACKTYPE_failsafe,
90655+ .address = { __KERNEL_CS, (unsigned long)failsafe_callback },
90656+ };
90657+ static struct callback_register __initdata nmi_cb = {
90658+ .type = CALLBACKTYPE_nmi,
90659+ .address = { __KERNEL_CS, (unsigned long)nmi },
90660+ };
90661+
90662+ ret = HYPERVISOR_callback_op(CALLBACKOP_register, &event);
90663+ if (ret == 0)
90664+ ret = HYPERVISOR_callback_op(CALLBACKOP_register, &failsafe);
90665+#ifdef CONFIG_XEN_COMPAT_030002
90666+ if (ret == -ENOSYS)
90667+ ret = HYPERVISOR_set_callbacks(
90668+ event.address.cs, event.address.eip,
90669+ failsafe.address.cs, failsafe.address.eip);
90670+#endif
90671+ BUG_ON(ret);
90672+
90673+ ret = HYPERVISOR_callback_op(CALLBACKOP_register, &nmi_cb);
90674+#ifdef CONFIG_XEN_COMPAT_030002
90675+ if (ret == -ENOSYS) {
90676+ static struct xennmi_callback __initdata cb = {
90677+ .handler_address = (unsigned long)nmi
90678+ };
90679+
90680+ HYPERVISOR_nmi_op(XENNMI_register_callback, &cb);
90681+ }
90682+#endif
90683+
90684+ if (HYPERVISOR_xen_version(XENVER_platform_parameters,
90685+ &pp) == 0) {
90686+ hypervisor_virt_start = pp.virt_start;
90687+ set_fixaddr_top();
90688+ }
90689+
90690+ machine_to_phys_mapping = (unsigned long *)MACH2PHYS_VIRT_START;
90691+ machine_to_phys_nr_ents = MACH2PHYS_NR_ENTRIES;
90692+ if (HYPERVISOR_memory_op(XENMEM_machphys_mapping, &mapping) == 0) {
90693+ machine_to_phys_mapping = (unsigned long *)mapping.v_start;
90694+ machine_to_phys_nr_ents = mapping.max_mfn + 1;
90695+ }
90696+ while ((1UL << machine_to_phys_order) < machine_to_phys_nr_ents )
90697+ machine_to_phys_order++;
90698+}
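When the hypervisor predates XENMEM_memory_map and returns -ENOSYS, machine_specific_memory_setup() above falls back to a single E820_RAM entry covering the domain's initial allocation plus 8 MB of slack. As a hypothetical worked example, a guest started with 262144 pages (1 GB, PAGE_SHIFT == 12) would end up with:

    map[0].addr = 0;
    map[0].size = (262144UL << 12) + (8ULL << 20);  /* 0x40000000 + 0x800000       */
    map[0].type = E820_RAM;                         /* one region: 0 .. 0x40800000 */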
90699diff -Nur linux-2.6.16.33-noxen/include/asm-i386/mach-xen/setup_arch_pre.h linux-2.6.16.33/include/asm-i386/mach-xen/setup_arch_pre.h
90700--- linux-2.6.16.33-noxen/include/asm-i386/mach-xen/setup_arch_pre.h 1970-01-01 00:00:00.000000000 +0000
90701+++ linux-2.6.16.33/include/asm-i386/mach-xen/setup_arch_pre.h 2007-01-08 15:00:46.000000000 +0000
90702@@ -0,0 +1,5 @@
90703+/* Hook to call BIOS initialisation function */
90704+
90705+#define ARCH_SETUP machine_specific_arch_setup();
90706+
90707+static void __init machine_specific_arch_setup(void);
90708diff -Nur linux-2.6.16.33-noxen/include/asm-i386/page.h linux-2.6.16.33/include/asm-i386/page.h
90709--- linux-2.6.16.33-noxen/include/asm-i386/page.h 2006-11-22 18:06:31.000000000 +0000
90710+++ linux-2.6.16.33/include/asm-i386/page.h 2007-01-08 15:00:46.000000000 +0000
90711@@ -121,7 +121,7 @@
90712
90713 #define PAGE_OFFSET ((unsigned long)__PAGE_OFFSET)
90714 #define VMALLOC_RESERVE ((unsigned long)__VMALLOC_RESERVE)
90715-#define MAXMEM (-__PAGE_OFFSET-__VMALLOC_RESERVE)
90716+#define MAXMEM (__FIXADDR_TOP-__PAGE_OFFSET-__VMALLOC_RESERVE)
90717 #define __pa(x) ((unsigned long)(x)-PAGE_OFFSET)
90718 #define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET))
90719 #define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT)
90720@@ -139,6 +139,8 @@
90721 ((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0 ) | \
90722 VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
90723
90724+#define __HAVE_ARCH_GATE_AREA 1
90725+
90726 #endif /* __KERNEL__ */
90727
90728 #include <asm-generic/page.h>
90729diff -Nur linux-2.6.16.33-noxen/include/asm-i386/pgtable-2level-defs.h linux-2.6.16.33/include/asm-i386/pgtable-2level-defs.h
90730--- linux-2.6.16.33-noxen/include/asm-i386/pgtable-2level-defs.h 2006-11-22 18:06:31.000000000 +0000
90731+++ linux-2.6.16.33/include/asm-i386/pgtable-2level-defs.h 2007-05-23 21:00:01.000000000 +0000
90732@@ -1,6 +1,8 @@
90733 #ifndef _I386_PGTABLE_2LEVEL_DEFS_H
90734 #define _I386_PGTABLE_2LEVEL_DEFS_H
90735
90736+#define HAVE_SHARED_KERNEL_PMD 0
90737+
90738 /*
90739 * traditional i386 two-level paging structure:
90740 */
90741diff -Nur linux-2.6.16.33-noxen/include/asm-i386/pgtable-3level-defs.h linux-2.6.16.33/include/asm-i386/pgtable-3level-defs.h
90742--- linux-2.6.16.33-noxen/include/asm-i386/pgtable-3level-defs.h 2006-11-22 18:06:31.000000000 +0000
90743+++ linux-2.6.16.33/include/asm-i386/pgtable-3level-defs.h 2007-05-23 21:00:01.000000000 +0000
90744@@ -1,6 +1,8 @@
90745 #ifndef _I386_PGTABLE_3LEVEL_DEFS_H
90746 #define _I386_PGTABLE_3LEVEL_DEFS_H
90747
90748+#define HAVE_SHARED_KERNEL_PMD 1
90749+
90750 /*
90751 * PGDIR_SHIFT determines what a top-level page table entry can map
90752 */
90753diff -Nur linux-2.6.16.33-noxen/include/asm-i386/rwsem.h linux-2.6.16.33/include/asm-i386/rwsem.h
90754--- linux-2.6.16.33-noxen/include/asm-i386/rwsem.h 2006-11-22 18:06:31.000000000 +0000
90755+++ linux-2.6.16.33/include/asm-i386/rwsem.h 2007-01-08 15:00:46.000000000 +0000
90756@@ -40,6 +40,7 @@
90757
90758 #include <linux/list.h>
90759 #include <linux/spinlock.h>
90760+#include <asm/smp_alt.h>
90761
90762 struct rwsem_waiter;
90763
90764@@ -99,7 +100,7 @@
90765 {
90766 __asm__ __volatile__(
90767 "# beginning down_read\n\t"
90768-LOCK_PREFIX " incl (%%eax)\n\t" /* adds 0x00000001, returns the old value */
90769+LOCK " incl (%%eax)\n\t" /* adds 0x00000001, returns the old value */
90770 " js 2f\n\t" /* jump if we weren't granted the lock */
90771 "1:\n\t"
90772 LOCK_SECTION_START("")
90773@@ -130,7 +131,7 @@
90774 " movl %1,%2\n\t"
90775 " addl %3,%2\n\t"
90776 " jle 2f\n\t"
90777-LOCK_PREFIX " cmpxchgl %2,%0\n\t"
90778+LOCK " cmpxchgl %2,%0\n\t"
90779 " jnz 1b\n\t"
90780 "2:\n\t"
90781 "# ending __down_read_trylock\n\t"
90782@@ -150,7 +151,7 @@
90783 tmp = RWSEM_ACTIVE_WRITE_BIAS;
90784 __asm__ __volatile__(
90785 "# beginning down_write\n\t"
90786-LOCK_PREFIX " xadd %%edx,(%%eax)\n\t" /* subtract 0x0000ffff, returns the old value */
90787+LOCK " xadd %%edx,(%%eax)\n\t" /* subtract 0x0000ffff, returns the old value */
90788 " testl %%edx,%%edx\n\t" /* was the count 0 before? */
90789 " jnz 2f\n\t" /* jump if we weren't granted the lock */
90790 "1:\n\t"
90791@@ -188,7 +189,7 @@
90792 __s32 tmp = -RWSEM_ACTIVE_READ_BIAS;
90793 __asm__ __volatile__(
90794 "# beginning __up_read\n\t"
90795-LOCK_PREFIX " xadd %%edx,(%%eax)\n\t" /* subtracts 1, returns the old value */
90796+LOCK " xadd %%edx,(%%eax)\n\t" /* subtracts 1, returns the old value */
90797 " js 2f\n\t" /* jump if the lock is being waited upon */
90798 "1:\n\t"
90799 LOCK_SECTION_START("")
90800@@ -214,7 +215,7 @@
90801 __asm__ __volatile__(
90802 "# beginning __up_write\n\t"
90803 " movl %2,%%edx\n\t"
90804-LOCK_PREFIX " xaddl %%edx,(%%eax)\n\t" /* tries to transition 0xffff0001 -> 0x00000000 */
90805+LOCK " xaddl %%edx,(%%eax)\n\t" /* tries to transition 0xffff0001 -> 0x00000000 */
90806 " jnz 2f\n\t" /* jump if the lock is being waited upon */
90807 "1:\n\t"
90808 LOCK_SECTION_START("")
90809@@ -239,7 +240,7 @@
90810 {
90811 __asm__ __volatile__(
90812 "# beginning __downgrade_write\n\t"
90813-LOCK_PREFIX " addl %2,(%%eax)\n\t" /* transitions 0xZZZZ0001 -> 0xYYYY0001 */
90814+LOCK " addl %2,(%%eax)\n\t" /* transitions 0xZZZZ0001 -> 0xYYYY0001 */
90815 " js 2f\n\t" /* jump if the lock is being waited upon */
90816 "1:\n\t"
90817 LOCK_SECTION_START("")
90818@@ -263,7 +264,7 @@
90819 static inline void rwsem_atomic_add(int delta, struct rw_semaphore *sem)
90820 {
90821 __asm__ __volatile__(
90822-LOCK_PREFIX "addl %1,%0"
90823+LOCK "addl %1,%0"
90824 : "=m"(sem->count)
90825 : "ir"(delta), "m"(sem->count));
90826 }
90827@@ -276,7 +277,7 @@
90828 int tmp = delta;
90829
90830 __asm__ __volatile__(
90831-LOCK_PREFIX "xadd %0,(%2)"
90832+LOCK "xadd %0,(%2)"
90833 : "+r"(tmp), "=m"(sem->count)
90834 : "r"(sem), "m"(sem->count)
90835 : "memory");
90836diff -Nur linux-2.6.16.33-noxen/include/asm-i386/smp_alt.h linux-2.6.16.33/include/asm-i386/smp_alt.h
90837--- linux-2.6.16.33-noxen/include/asm-i386/smp_alt.h 1970-01-01 00:00:00.000000000 +0000
90838+++ linux-2.6.16.33/include/asm-i386/smp_alt.h 2007-01-08 15:00:46.000000000 +0000
90839@@ -0,0 +1,32 @@
90840+#ifndef __ASM_SMP_ALT_H__
90841+#define __ASM_SMP_ALT_H__
90842+
90843+#include <linux/config.h>
90844+
90845+#ifdef CONFIG_SMP
90846+#if defined(CONFIG_SMP_ALTERNATIVES) && !defined(MODULE)
90847+#define LOCK \
90848+ "6677: nop\n" \
90849+ ".section __smp_alternatives,\"a\"\n" \
90850+ ".long 6677b\n" \
90851+ ".long 6678f\n" \
90852+ ".previous\n" \
90853+ ".section __smp_replacements,\"a\"\n" \
90854+ "6678: .byte 1\n" \
90855+ ".byte 1\n" \
90856+ ".byte 0\n" \
90857+ ".byte 1\n" \
90858+ ".byte -1\n" \
90859+ "lock\n" \
90860+ "nop\n" \
90861+ ".previous\n"
90862+void prepare_for_smp(void);
90863+void unprepare_for_smp(void);
90864+#else
90865+#define LOCK "lock ; "
90866+#endif
90867+#else
90868+#define LOCK ""
90869+#endif
90870+
90871+#endif /* __ASM_SMP_ALT_H__ */
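The LOCK macro emits a single nop at each locked-instruction site and records the site plus its replacement bytes in the __smp_alternatives/__smp_replacements sections, so the boot code behind prepare_for_smp()/unprepare_for_smp() can rewrite those sites once it knows how many CPUs are online. A deliberately simplified model of that rewrite, with an assumed record layout rather than the exact .byte format emitted above:

    #include <string.h>

    /* Assumed, simplified record -- the real format is what the LOCK and
     * smp_alt_mb macros emit. */
    struct alt_site {
            unsigned char *addr;             /* patch site in the text segment */
            unsigned char len;               /* bytes to rewrite at the site   */
            const unsigned char *smp_bytes;  /* e.g. 0xf0, the "lock" prefix   */
            const unsigned char *up_bytes;   /* e.g. 0x90, a plain nop         */
    };

    static void apply_alternatives(struct alt_site *tab, int n, int smp)
    {
            int i;

            for (i = 0; i < n; i++)
                    memcpy(tab[i].addr,
                           smp ? tab[i].smp_bytes : tab[i].up_bytes,
                           tab[i].len);
    }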
90872diff -Nur linux-2.6.16.33-noxen/include/asm-i386/spinlock.h linux-2.6.16.33/include/asm-i386/spinlock.h
90873--- linux-2.6.16.33-noxen/include/asm-i386/spinlock.h 2006-11-22 18:06:31.000000000 +0000
90874+++ linux-2.6.16.33/include/asm-i386/spinlock.h 2007-01-08 15:00:46.000000000 +0000
90875@@ -6,6 +6,7 @@
90876 #include <asm/page.h>
90877 #include <linux/config.h>
90878 #include <linux/compiler.h>
90879+#include <asm/smp_alt.h>
90880
90881 /*
90882 * Your basic SMP spinlocks, allowing only a single CPU anywhere
90883@@ -23,7 +24,8 @@
90884
90885 #define __raw_spin_lock_string \
90886 "\n1:\t" \
90887- "lock ; decb %0\n\t" \
90888+ LOCK \
90889+ "decb %0\n\t" \
90890 "jns 3f\n" \
90891 "2:\t" \
90892 "rep;nop\n\t" \
90893@@ -34,7 +36,8 @@
90894
90895 #define __raw_spin_lock_string_flags \
90896 "\n1:\t" \
90897- "lock ; decb %0\n\t" \
90898+ LOCK \
90899+ "decb %0\n\t" \
90900 "jns 4f\n\t" \
90901 "2:\t" \
90902 "testl $0x200, %1\n\t" \
90903@@ -65,10 +68,34 @@
90904 static inline int __raw_spin_trylock(raw_spinlock_t *lock)
90905 {
90906 char oldval;
90907+#ifdef CONFIG_SMP_ALTERNATIVES
90908+ __asm__ __volatile__(
90909+ "1:movb %1,%b0\n"
90910+ "movb $0,%1\n"
90911+ "2:"
90912+ ".section __smp_alternatives,\"a\"\n"
90913+ ".long 1b\n"
90914+ ".long 3f\n"
90915+ ".previous\n"
90916+ ".section __smp_replacements,\"a\"\n"
90917+ "3: .byte 2b - 1b\n"
90918+ ".byte 5f-4f\n"
90919+ ".byte 0\n"
90920+ ".byte 6f-5f\n"
90921+ ".byte -1\n"
90922+ "4: xchgb %b0,%1\n"
90923+ "5: movb %1,%b0\n"
90924+ "movb $0,%1\n"
90925+ "6:\n"
90926+ ".previous\n"
90927+ :"=q" (oldval), "=m" (lock->slock)
90928+ :"0" (0) : "memory");
90929+#else
90930 __asm__ __volatile__(
90931 "xchgb %b0,%1"
90932 :"=q" (oldval), "=m" (lock->slock)
90933 :"0" (0) : "memory");
90934+#endif
90935 return oldval > 0;
90936 }
90937
90938@@ -178,12 +205,12 @@
90939
90940 static inline void __raw_read_unlock(raw_rwlock_t *rw)
90941 {
90942- asm volatile("lock ; incl %0" :"=m" (rw->lock) : : "memory");
90943+ asm volatile(LOCK "incl %0" :"=m" (rw->lock) : : "memory");
90944 }
90945
90946 static inline void __raw_write_unlock(raw_rwlock_t *rw)
90947 {
90948- asm volatile("lock ; addl $" RW_LOCK_BIAS_STR ", %0"
90949+ asm volatile(LOCK "addl $" RW_LOCK_BIAS_STR ", %0"
90950 : "=m" (rw->lock) : : "memory");
90951 }
90952
90953diff -Nur linux-2.6.16.33-noxen/include/asm-i386/system.h linux-2.6.16.33/include/asm-i386/system.h
90954--- linux-2.6.16.33-noxen/include/asm-i386/system.h 2006-11-22 18:06:31.000000000 +0000
90955+++ linux-2.6.16.33/include/asm-i386/system.h 2007-01-08 15:00:46.000000000 +0000
90956@@ -5,7 +5,7 @@
90957 #include <linux/kernel.h>
90958 #include <asm/segment.h>
90959 #include <asm/cpufeature.h>
90960-#include <linux/bitops.h> /* for LOCK_PREFIX */
90961+#include <asm/smp_alt.h>
90962
90963 #ifdef __KERNEL__
90964
90965@@ -271,19 +271,19 @@
90966 unsigned long prev;
90967 switch (size) {
90968 case 1:
90969- __asm__ __volatile__(LOCK_PREFIX "cmpxchgb %b1,%2"
90970+ __asm__ __volatile__(LOCK "cmpxchgb %b1,%2"
90971 : "=a"(prev)
90972 : "q"(new), "m"(*__xg(ptr)), "0"(old)
90973 : "memory");
90974 return prev;
90975 case 2:
90976- __asm__ __volatile__(LOCK_PREFIX "cmpxchgw %w1,%2"
90977+ __asm__ __volatile__(LOCK "cmpxchgw %w1,%2"
90978 : "=a"(prev)
90979 : "r"(new), "m"(*__xg(ptr)), "0"(old)
90980 : "memory");
90981 return prev;
90982 case 4:
90983- __asm__ __volatile__(LOCK_PREFIX "cmpxchgl %1,%2"
90984+ __asm__ __volatile__(LOCK "cmpxchgl %1,%2"
90985 : "=a"(prev)
90986 : "r"(new), "m"(*__xg(ptr)), "0"(old)
90987 : "memory");
90988@@ -336,7 +336,7 @@
90989 unsigned long long new)
90990 {
90991 unsigned long long prev;
90992- __asm__ __volatile__(LOCK_PREFIX "cmpxchg8b %3"
90993+ __asm__ __volatile__(LOCK "cmpxchg8b %3"
90994 : "=A"(prev)
90995 : "b"((unsigned long)new),
90996 "c"((unsigned long)(new >> 32)),
90997@@ -503,11 +503,55 @@
90998 #endif
90999
91000 #ifdef CONFIG_SMP
91001+#if defined(CONFIG_SMP_ALTERNATIVES) && !defined(MODULE)
91002+#define smp_alt_mb(instr) \
91003+__asm__ __volatile__("6667:\nnop\nnop\nnop\nnop\nnop\nnop\n6668:\n" \
91004+ ".section __smp_alternatives,\"a\"\n" \
91005+ ".long 6667b\n" \
91006+ ".long 6673f\n" \
91007+ ".previous\n" \
91008+ ".section __smp_replacements,\"a\"\n" \
91009+ "6673:.byte 6668b-6667b\n" \
91010+ ".byte 6670f-6669f\n" \
91011+ ".byte 6671f-6670f\n" \
91012+ ".byte 0\n" \
91013+ ".byte %c0\n" \
91014+ "6669:lock;addl $0,0(%%esp)\n" \
91015+ "6670:" instr "\n" \
91016+ "6671:\n" \
91017+ ".previous\n" \
91018+ : \
91019+ : "i" (X86_FEATURE_XMM2) \
91020+ : "memory")
91021+#define smp_mb() smp_alt_mb("mfence")
91022+#define smp_rmb() smp_alt_mb("lfence")
91023+#define set_mb(var, value) do { \
91024+unsigned long __set_mb_temp; \
91025+__asm__ __volatile__("6667:movl %1, %0\n6668:\n" \
91026+ ".section __smp_alternatives,\"a\"\n" \
91027+ ".long 6667b\n" \
91028+ ".long 6673f\n" \
91029+ ".previous\n" \
91030+ ".section __smp_replacements,\"a\"\n" \
91031+ "6673: .byte 6668b-6667b\n" \
91032+ ".byte 6670f-6669f\n" \
91033+ ".byte 0\n" \
91034+ ".byte 6671f-6670f\n" \
91035+ ".byte -1\n" \
91036+ "6669: xchg %1, %0\n" \
91037+ "6670:movl %1, %0\n" \
91038+ "6671:\n" \
91039+ ".previous\n" \
91040+ : "=m" (var), "=r" (__set_mb_temp) \
91041+ : "1" (value) \
91042+ : "memory"); } while (0)
91043+#else
91044 #define smp_mb() mb()
91045 #define smp_rmb() rmb()
91046+#define set_mb(var, value) do { (void) xchg(&var, value); } while (0)
91047+#endif
91048 #define smp_wmb() wmb()
91049 #define smp_read_barrier_depends() read_barrier_depends()
91050-#define set_mb(var, value) do { (void) xchg(&var, value); } while (0)
91051 #else
91052 #define smp_mb() barrier()
91053 #define smp_rmb() barrier()
91054diff -Nur linux-2.6.16.33-noxen/include/asm-ia64/agp.h linux-2.6.16.33/include/asm-ia64/agp.h
91055--- linux-2.6.16.33-noxen/include/asm-ia64/agp.h 2006-11-22 18:06:31.000000000 +0000
91056+++ linux-2.6.16.33/include/asm-ia64/agp.h 2007-01-08 15:00:46.000000000 +0000
91057@@ -19,13 +19,44 @@
91058 #define flush_agp_cache() mb()
91059
91060 /* Convert a physical address to an address suitable for the GART. */
91061+#ifndef CONFIG_XEN
91062 #define phys_to_gart(x) (x)
91063 #define gart_to_phys(x) (x)
91064+#else
91065+#define phys_to_gart(x) phys_to_machine_for_dma(x)
91066+#define gart_to_phys(x) machine_to_phys_for_dma(x)
91067+#endif
91068
91069 /* GATT allocation. Returns/accepts GATT kernel virtual address. */
91070+#ifndef CONFIG_XEN
91071 #define alloc_gatt_pages(order) \
91072 ((char *)__get_free_pages(GFP_KERNEL, (order)))
91073 #define free_gatt_pages(table, order) \
91074 free_pages((unsigned long)(table), (order))
91075+#else
91076+#include <asm/hypervisor.h>
91077+static inline char*
91078+alloc_gatt_pages(unsigned int order)
91079+{
91080+ unsigned long error;
91081+ unsigned long ret = __get_free_pages(GFP_KERNEL, (order));
91082+ if (ret == 0) {
91083+ goto out;
91084+ }
91085+ error = xen_create_contiguous_region(ret, order, 0);
91086+ if (error) {
91087+ free_pages(ret, order);
91088+ ret = 0;
91089+ }
91090+out:
91091+ return (char*)ret;
91092+}
91093+static inline void
91094+free_gatt_pages(void* table, unsigned int order)
91095+{
91096+ xen_destroy_contiguous_region((unsigned long)table, order);
91097+ free_pages((unsigned long)table, order);
91098+}
91099+#endif /* CONFIG_XEN */
91100
91101 #endif /* _ASM_IA64_AGP_H */
91102diff -Nur linux-2.6.16.33-noxen/include/asm-ia64/dma-mapping.h linux-2.6.16.33/include/asm-ia64/dma-mapping.h
91103--- linux-2.6.16.33-noxen/include/asm-ia64/dma-mapping.h 2006-11-22 18:06:31.000000000 +0000
91104+++ linux-2.6.16.33/include/asm-ia64/dma-mapping.h 2007-01-08 15:00:46.000000000 +0000
91105@@ -7,7 +7,14 @@
91106 */
91107 #include <linux/config.h>
91108 #include <asm/machvec.h>
91109+#ifdef CONFIG_XEN
91110+/* Needed for arch/i386/kernel/swiotlb.c and arch/i386/kernel/pci-dma-xen.c */
91111+#include <asm/hypervisor.h>
91112+/* Needed for arch/i386/kernel/swiotlb.c */
91113+#include <asm-i386/mach-xen/asm/swiotlb.h>
91114+#endif
91115
91116+#ifndef CONFIG_XEN
91117 #define dma_alloc_coherent platform_dma_alloc_coherent
91118 #define dma_alloc_noncoherent platform_dma_alloc_coherent /* coherent mem. is cheap */
91119 #define dma_free_coherent platform_dma_free_coherent
91120@@ -21,6 +28,46 @@
91121 #define dma_sync_single_for_device platform_dma_sync_single_for_device
91122 #define dma_sync_sg_for_device platform_dma_sync_sg_for_device
91123 #define dma_mapping_error platform_dma_mapping_error
91124+#else
91125+int dma_map_sg(struct device *hwdev, struct scatterlist *sg, int nents,
91126+ enum dma_data_direction direction);
91127+void dma_unmap_sg(struct device *hwdev, struct scatterlist *sg, int nents,
91128+ enum dma_data_direction direction);
91129+int dma_supported(struct device *dev, u64 mask);
91130+void *dma_alloc_coherent(struct device *dev, size_t size,
91131+ dma_addr_t *dma_handle, gfp_t gfp);
91132+void dma_free_coherent(struct device *dev, size_t size, void *vaddr,
91133+ dma_addr_t dma_handle);
91134+dma_addr_t dma_map_single(struct device *dev, void *ptr, size_t size,
91135+ enum dma_data_direction direction);
91136+void dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size,
91137+ enum dma_data_direction direction);
91138+void dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle,
91139+ size_t size, enum dma_data_direction direction);
91140+void dma_sync_single_for_device(struct device *dev, dma_addr_t dma_handle,
91141+ size_t size,
91142+ enum dma_data_direction direction);
91143+int dma_mapping_error(dma_addr_t dma_addr);
91144+
91145+#define flush_write_buffers() do { } while (0)
91146+static inline void
91147+dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems,
91148+ enum dma_data_direction direction)
91149+{
91150+ if (swiotlb)
91151+ swiotlb_sync_sg_for_cpu(dev,sg,nelems,direction);
91152+ flush_write_buffers();
91153+}
91154+
91155+static inline void
91156+dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems,
91157+ enum dma_data_direction direction)
91158+{
91159+ if (swiotlb)
91160+ swiotlb_sync_sg_for_device(dev,sg,nelems,direction);
91161+ flush_write_buffers();
91162+}
91163+#endif
91164
91165 #define dma_map_page(dev, pg, off, size, dir) \
91166 dma_map_single(dev, page_address(pg) + (off), (size), (dir))
91167@@ -37,7 +84,9 @@
91168 #define dma_sync_single_range_for_device(dev, dma_handle, offset, size, dir) \
91169 dma_sync_single_for_device(dev, dma_handle, size, dir)
91170
91171+#ifndef CONFIG_XEN
91172 #define dma_supported platform_dma_supported
91173+#endif
91174
91175 static inline int
91176 dma_set_mask (struct device *dev, u64 mask)
91177@@ -62,4 +111,27 @@
91178
91179 #define dma_is_consistent(dma_handle) (1) /* all we do is coherent memory... */
91180
91181+#ifdef CONFIG_XEN
91182+/* arch/i386/kernel/swiotlb.o requires */
91183+void contiguous_bitmap_init(unsigned long end_pfn);
91184+
91185+static inline int
91186+address_needs_mapping(struct device *hwdev, dma_addr_t addr)
91187+{
91188+ dma_addr_t mask = DMA_64BIT_MASK;
91189+ /* If the device has a mask, use it, otherwise default to 64 bits */
91190+ if (hwdev && hwdev->dma_mask)
91191+ mask = *hwdev->dma_mask;
91192+ return (addr & ~mask) != 0;
91193+}
91194+
91195+static inline int
91196+range_straddles_page_boundary(void *p, size_t size)
91197+{
91198+ extern unsigned long *contiguous_bitmap;
91199+ return (((((unsigned long)p & ~PAGE_MASK) + size) > PAGE_SIZE) &&
91200+ !test_bit(__pa(p) >> PAGE_SHIFT, contiguous_bitmap));
91201+}
91202+#endif
91203+
91204 #endif /* _ASM_IA64_DMA_MAPPING_H */
91205diff -Nur linux-2.6.16.33-noxen/include/asm-ia64/fixmap.h linux-2.6.16.33/include/asm-ia64/fixmap.h
91206--- linux-2.6.16.33-noxen/include/asm-ia64/fixmap.h 1970-01-01 00:00:00.000000000 +0000
91207+++ linux-2.6.16.33/include/asm-ia64/fixmap.h 2007-01-08 15:00:46.000000000 +0000
91208@@ -0,0 +1,2 @@
91209+#define clear_fixmap(x) do {} while (0)
91210+#define set_fixmap(x,y) do {} while (0)
91211diff -Nur linux-2.6.16.33-noxen/include/asm-ia64/gcc_intrin.h linux-2.6.16.33/include/asm-ia64/gcc_intrin.h
91212--- linux-2.6.16.33-noxen/include/asm-ia64/gcc_intrin.h 2006-11-22 18:06:31.000000000 +0000
91213+++ linux-2.6.16.33/include/asm-ia64/gcc_intrin.h 2007-01-08 15:00:46.000000000 +0000
91214@@ -26,7 +26,7 @@
91215
91216 register unsigned long ia64_r13 asm ("r13") __attribute_used__;
91217
91218-#define ia64_setreg(regnum, val) \
91219+#define __ia64_setreg(regnum, val) \
91220 ({ \
91221 switch (regnum) { \
91222 case _IA64_REG_PSR_L: \
91223@@ -55,7 +55,7 @@
91224 } \
91225 })
91226
91227-#define ia64_getreg(regnum) \
91228+#define __ia64_getreg(regnum) \
91229 ({ \
91230 __u64 ia64_intri_res; \
91231 \
91232@@ -92,7 +92,7 @@
91233
91234 #define ia64_hint_pause 0
91235
91236-#define ia64_hint(mode) \
91237+#define __ia64_hint(mode) \
91238 ({ \
91239 switch (mode) { \
91240 case ia64_hint_pause: \
91241@@ -374,7 +374,7 @@
91242
91243 #define ia64_invala() asm volatile ("invala" ::: "memory")
91244
91245-#define ia64_thash(addr) \
91246+#define __ia64_thash(addr) \
91247 ({ \
91248 __u64 ia64_intri_res; \
91249 asm volatile ("thash %0=%1" : "=r"(ia64_intri_res) : "r" (addr)); \
91250@@ -394,18 +394,18 @@
91251
91252 #define ia64_nop(x) asm volatile ("nop %0"::"i"(x));
91253
91254-#define ia64_itci(addr) asm volatile ("itc.i %0;;" :: "r"(addr) : "memory")
91255+#define __ia64_itci(addr) asm volatile ("itc.i %0;;" :: "r"(addr) : "memory")
91256
91257-#define ia64_itcd(addr) asm volatile ("itc.d %0;;" :: "r"(addr) : "memory")
91258+#define __ia64_itcd(addr) asm volatile ("itc.d %0;;" :: "r"(addr) : "memory")
91259
91260
91261-#define ia64_itri(trnum, addr) asm volatile ("itr.i itr[%0]=%1" \
91262+#define __ia64_itri(trnum, addr) asm volatile ("itr.i itr[%0]=%1" \
91263 :: "r"(trnum), "r"(addr) : "memory")
91264
91265-#define ia64_itrd(trnum, addr) asm volatile ("itr.d dtr[%0]=%1" \
91266+#define __ia64_itrd(trnum, addr) asm volatile ("itr.d dtr[%0]=%1" \
91267 :: "r"(trnum), "r"(addr) : "memory")
91268
91269-#define ia64_tpa(addr) \
91270+#define __ia64_tpa(addr) \
91271 ({ \
91272 __u64 ia64_pa; \
91273 asm volatile ("tpa %0 = %1" : "=r"(ia64_pa) : "r"(addr) : "memory"); \
91274@@ -415,22 +415,22 @@
91275 #define __ia64_set_dbr(index, val) \
91276 asm volatile ("mov dbr[%0]=%1" :: "r"(index), "r"(val) : "memory")
91277
91278-#define ia64_set_ibr(index, val) \
91279+#define __ia64_set_ibr(index, val) \
91280 asm volatile ("mov ibr[%0]=%1" :: "r"(index), "r"(val) : "memory")
91281
91282-#define ia64_set_pkr(index, val) \
91283+#define __ia64_set_pkr(index, val) \
91284 asm volatile ("mov pkr[%0]=%1" :: "r"(index), "r"(val) : "memory")
91285
91286-#define ia64_set_pmc(index, val) \
91287+#define __ia64_set_pmc(index, val) \
91288 asm volatile ("mov pmc[%0]=%1" :: "r"(index), "r"(val) : "memory")
91289
91290-#define ia64_set_pmd(index, val) \
91291+#define __ia64_set_pmd(index, val) \
91292 asm volatile ("mov pmd[%0]=%1" :: "r"(index), "r"(val) : "memory")
91293
91294-#define ia64_set_rr(index, val) \
91295+#define __ia64_set_rr(index, val) \
91296 asm volatile ("mov rr[%0]=%1" :: "r"(index), "r"(val) : "memory");
91297
91298-#define ia64_get_cpuid(index) \
91299+#define __ia64_get_cpuid(index) \
91300 ({ \
91301 __u64 ia64_intri_res; \
91302 asm volatile ("mov %0=cpuid[%r1]" : "=r"(ia64_intri_res) : "rO"(index)); \
91303@@ -444,21 +444,21 @@
91304 ia64_intri_res; \
91305 })
91306
91307-#define ia64_get_ibr(index) \
91308+#define __ia64_get_ibr(index) \
91309 ({ \
91310 __u64 ia64_intri_res; \
91311 asm volatile ("mov %0=ibr[%1]" : "=r"(ia64_intri_res) : "r"(index)); \
91312 ia64_intri_res; \
91313 })
91314
91315-#define ia64_get_pkr(index) \
91316+#define __ia64_get_pkr(index) \
91317 ({ \
91318 __u64 ia64_intri_res; \
91319 asm volatile ("mov %0=pkr[%1]" : "=r"(ia64_intri_res) : "r"(index)); \
91320 ia64_intri_res; \
91321 })
91322
91323-#define ia64_get_pmc(index) \
91324+#define __ia64_get_pmc(index) \
91325 ({ \
91326 __u64 ia64_intri_res; \
91327 asm volatile ("mov %0=pmc[%1]" : "=r"(ia64_intri_res) : "r"(index)); \
91328@@ -466,48 +466,48 @@
91329 })
91330
91331
91332-#define ia64_get_pmd(index) \
91333+#define __ia64_get_pmd(index) \
91334 ({ \
91335 __u64 ia64_intri_res; \
91336 asm volatile ("mov %0=pmd[%1]" : "=r"(ia64_intri_res) : "r"(index)); \
91337 ia64_intri_res; \
91338 })
91339
91340-#define ia64_get_rr(index) \
91341+#define __ia64_get_rr(index) \
91342 ({ \
91343 __u64 ia64_intri_res; \
91344 asm volatile ("mov %0=rr[%1]" : "=r"(ia64_intri_res) : "r" (index)); \
91345 ia64_intri_res; \
91346 })
91347
91348-#define ia64_fc(addr) asm volatile ("fc %0" :: "r"(addr) : "memory")
91349+#define __ia64_fc(addr) asm volatile ("fc %0" :: "r"(addr) : "memory")
91350
91351
91352 #define ia64_sync_i() asm volatile (";; sync.i" ::: "memory")
91353
91354-#define ia64_ssm(mask) asm volatile ("ssm %0":: "i"((mask)) : "memory")
91355-#define ia64_rsm(mask) asm volatile ("rsm %0":: "i"((mask)) : "memory")
91356+#define __ia64_ssm(mask) asm volatile ("ssm %0":: "i"((mask)) : "memory")
91357+#define __ia64_rsm(mask) asm volatile ("rsm %0":: "i"((mask)) : "memory")
91358 #define ia64_sum(mask) asm volatile ("sum %0":: "i"((mask)) : "memory")
91359 #define ia64_rum(mask) asm volatile ("rum %0":: "i"((mask)) : "memory")
91360
91361-#define ia64_ptce(addr) asm volatile ("ptc.e %0" :: "r"(addr))
91362+#define __ia64_ptce(addr) asm volatile ("ptc.e %0" :: "r"(addr))
91363
91364-#define ia64_ptcga(addr, size) \
91365+#define __ia64_ptcga(addr, size) \
91366 do { \
91367 asm volatile ("ptc.ga %0,%1" :: "r"(addr), "r"(size) : "memory"); \
91368 ia64_dv_serialize_data(); \
91369 } while (0)
91370
91371-#define ia64_ptcl(addr, size) \
91372+#define __ia64_ptcl(addr, size) \
91373 do { \
91374 asm volatile ("ptc.l %0,%1" :: "r"(addr), "r"(size) : "memory"); \
91375 ia64_dv_serialize_data(); \
91376 } while (0)
91377
91378-#define ia64_ptri(addr, size) \
91379+#define __ia64_ptri(addr, size) \
91380 asm volatile ("ptr.i %0,%1" :: "r"(addr), "r"(size) : "memory")
91381
91382-#define ia64_ptrd(addr, size) \
91383+#define __ia64_ptrd(addr, size) \
91384 asm volatile ("ptr.d %0,%1" :: "r"(addr), "r"(size) : "memory")
91385
91386 /* Values for lfhint in ia64_lfetch and ia64_lfetch_fault */
91387@@ -589,7 +589,7 @@
91388 } \
91389 })
91390
91391-#define ia64_intrin_local_irq_restore(x) \
91392+#define __ia64_intrin_local_irq_restore(x) \
91393 do { \
91394 asm volatile (";; cmp.ne p6,p7=%0,r0;;" \
91395 "(p6) ssm psr.i;" \
91396@@ -598,4 +598,6 @@
91397 :: "r"((x)) : "p6", "p7", "memory"); \
91398 } while (0)
91399
91400+#define __ia64_get_psr_i() (__ia64_getreg(_IA64_REG_PSR) & 0x4000UL)
91401+
91402 #endif /* _ASM_IA64_GCC_INTRIN_H */
91403diff -Nur linux-2.6.16.33-noxen/include/asm-ia64/hw_irq.h linux-2.6.16.33/include/asm-ia64/hw_irq.h
91404--- linux-2.6.16.33-noxen/include/asm-ia64/hw_irq.h 2006-11-22 18:06:31.000000000 +0000
91405+++ linux-2.6.16.33/include/asm-ia64/hw_irq.h 2007-01-08 15:00:46.000000000 +0000
91406@@ -15,7 +15,11 @@
91407 #include <asm/ptrace.h>
91408 #include <asm/smp.h>
91409
91410+#ifndef CONFIG_XEN
91411 typedef u8 ia64_vector;
91412+#else
91413+typedef u16 ia64_vector;
91414+#endif
91415
91416 /*
91417 * 0 special
91418@@ -89,6 +93,13 @@
91419 static inline void
91420 hw_resend_irq (struct hw_interrupt_type *h, unsigned int vector)
91421 {
91422+#ifdef CONFIG_XEN
91423+ extern void resend_irq_on_evtchn(struct hw_interrupt_type *h,
91424+ unsigned int i);
91425+ if (is_running_on_xen())
91426+ resend_irq_on_evtchn(h, vector);
91427+ else
91428+#endif /* CONFIG_XEN */
91429 platform_send_ipi(smp_processor_id(), vector, IA64_IPI_DM_INT, 0);
91430 }
91431
91432diff -Nur linux-2.6.16.33-noxen/include/asm-ia64/hypercall.h linux-2.6.16.33/include/asm-ia64/hypercall.h
91433--- linux-2.6.16.33-noxen/include/asm-ia64/hypercall.h 1970-01-01 00:00:00.000000000 +0000
91434+++ linux-2.6.16.33/include/asm-ia64/hypercall.h 2007-01-08 15:00:46.000000000 +0000
91435@@ -0,0 +1,463 @@
91436+/******************************************************************************
91437+ * hypercall.h
91438+ *
91439+ * Linux-specific hypervisor handling.
91440+ *
91441+ * Copyright (c) 2002-2004, K A Fraser
91442+ *
91443+ * This program is free software; you can redistribute it and/or
91444+ * modify it under the terms of the GNU General Public License version 2
91445+ * as published by the Free Software Foundation; or, when distributed
91446+ * separately from the Linux kernel or incorporated into other
91447+ * software packages, subject to the following license:
91448+ *
91449+ * Permission is hereby granted, free of charge, to any person obtaining a copy
91450+ * of this source file (the "Software"), to deal in the Software without
91451+ * restriction, including without limitation the rights to use, copy, modify,
91452+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
91453+ * and to permit persons to whom the Software is furnished to do so, subject to
91454+ * the following conditions:
91455+ *
91456+ * The above copyright notice and this permission notice shall be included in
91457+ * all copies or substantial portions of the Software.
91458+ *
91459+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
91460+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
91461+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
91462+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
91463+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
91464+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
91465+ * IN THE SOFTWARE.
91466+ */
91467+
91468+#ifndef __HYPERCALL_H__
91469+#define __HYPERCALL_H__
91470+
91471+#ifndef __HYPERVISOR_H__
91472+# error "please don't include this file directly"
91473+#endif
91474+
91475+#include <asm/xen/xcom_hcall.h>
91476+struct xencomm_handle;
91477+
91478+/*
91479+ * Assembler stubs for hyper-calls.
91480+ */
91481+
91482+#define _hypercall0(type, name) \
91483+({ \
91484+ long __res; \
91485+ __asm__ __volatile__ (";;\n" \
91486+ "mov r2=%1\n" \
91487+ "break 0x1000 ;;\n" \
91488+ "mov %0=r8 ;;\n" \
91489+ : "=r" (__res) \
91490+ : "J" (__HYPERVISOR_##name) \
91491+ : "r2","r8", \
91492+ "memory" ); \
91493+ (type)__res; \
91494+})
91495+
91496+#define _hypercall1(type, name, a1) \
91497+({ \
91498+ long __res; \
91499+ __asm__ __volatile__ (";;\n" \
91500+ "mov r14=%2\n" \
91501+ "mov r2=%1\n" \
91502+ "break 0x1000 ;;\n" \
91503+ "mov %0=r8 ;;\n" \
91504+ : "=r" (__res) \
91505+ : "J" (__HYPERVISOR_##name), \
91506+ "rI" ((unsigned long)(a1)) \
91507+ : "r14","r2","r8", \
91508+ "memory" ); \
91509+ (type)__res; \
91510+})
91511+
91512+#define _hypercall2(type, name, a1, a2) \
91513+({ \
91514+ long __res; \
91515+ __asm__ __volatile__ (";;\n" \
91516+ "mov r14=%2\n" \
91517+ "mov r15=%3\n" \
91518+ "mov r2=%1\n" \
91519+ "break 0x1000 ;;\n" \
91520+ "mov %0=r8 ;;\n" \
91521+ : "=r" (__res) \
91522+ : "J" (__HYPERVISOR_##name), \
91523+ "rI" ((unsigned long)(a1)), \
91524+ "rI" ((unsigned long)(a2)) \
91525+ : "r14","r15","r2","r8", \
91526+ "memory" ); \
91527+ (type)__res; \
91528+})
91529+
91530+#define _hypercall3(type, name, a1, a2, a3) \
91531+({ \
91532+ long __res; \
91533+ __asm__ __volatile__ (";;\n" \
91534+ "mov r14=%2\n" \
91535+ "mov r15=%3\n" \
91536+ "mov r16=%4\n" \
91537+ "mov r2=%1\n" \
91538+ "break 0x1000 ;;\n" \
91539+ "mov %0=r8 ;;\n" \
91540+ : "=r" (__res) \
91541+ : "J" (__HYPERVISOR_##name), \
91542+ "rI" ((unsigned long)(a1)), \
91543+ "rI" ((unsigned long)(a2)), \
91544+ "rI" ((unsigned long)(a3)) \
91545+ : "r14","r15","r16","r2","r8", \
91546+ "memory" ); \
91547+ (type)__res; \
91548+})
91549+
91550+#define _hypercall4(type, name, a1, a2, a3, a4) \
91551+({ \
91552+ long __res; \
91553+ __asm__ __volatile__ (";;\n" \
91554+ "mov r14=%2\n" \
91555+ "mov r15=%3\n" \
91556+ "mov r16=%4\n" \
91557+ "mov r17=%5\n" \
91558+ "mov r2=%1\n" \
91559+ "break 0x1000 ;;\n" \
91560+ "mov %0=r8 ;;\n" \
91561+ : "=r" (__res) \
91562+ : "J" (__HYPERVISOR_##name), \
91563+ "rI" ((unsigned long)(a1)), \
91564+ "rI" ((unsigned long)(a2)), \
91565+ "rI" ((unsigned long)(a3)), \
91566+ "rI" ((unsigned long)(a4)) \
91567+ : "r14","r15","r16","r2","r8", \
91568+ "r17","memory" ); \
91569+ (type)__res; \
91570+})
91571+
91572+#define _hypercall5(type, name, a1, a2, a3, a4, a5) \
91573+({ \
91574+ long __res; \
91575+ __asm__ __volatile__ (";;\n" \
91576+ "mov r14=%2\n" \
91577+ "mov r15=%3\n" \
91578+ "mov r16=%4\n" \
91579+ "mov r17=%5\n" \
91580+ "mov r18=%6\n" \
91581+ "mov r2=%1\n" \
91582+ "break 0x1000 ;;\n" \
91583+ "mov %0=r8 ;;\n" \
91584+ : "=r" (__res) \
91585+ : "J" (__HYPERVISOR_##name), \
91586+ "rI" ((unsigned long)(a1)), \
91587+ "rI" ((unsigned long)(a2)), \
91588+ "rI" ((unsigned long)(a3)), \
91589+ "rI" ((unsigned long)(a4)), \
91590+ "rI" ((unsigned long)(a5)) \
91591+ : "r14","r15","r16","r2","r8", \
91592+ "r17","r18","memory" ); \
91593+ (type)__res; \
91594+})
91595+
91596+
91597+static inline int
91598+xencomm_arch_hypercall_sched_op(int cmd, struct xencomm_handle *arg)
91599+{
91600+ return _hypercall2(int, sched_op, cmd, arg);
91601+}
91602+
91603+static inline long
91604+HYPERVISOR_set_timer_op(u64 timeout)
91605+{
91606+ unsigned long timeout_hi = (unsigned long)(timeout >> 32);
91607+ unsigned long timeout_lo = (unsigned long)timeout;
91608+ return _hypercall2(long, set_timer_op, timeout_lo, timeout_hi);
91609+}
91610+
91611+static inline int
91612+xencomm_arch_hypercall_dom0_op(struct xencomm_handle *op)
91613+{
91614+ return _hypercall1(int, dom0_op, op);
91615+}
91616+
91617+static inline int
91618+xencomm_arch_hypercall_sysctl(struct xencomm_handle *op)
91619+{
91620+ return _hypercall1(int, sysctl, op);
91621+}
91622+
91623+static inline int
91624+xencomm_arch_hypercall_domctl(struct xencomm_handle *op)
91625+{
91626+ return _hypercall1(int, domctl, op);
91627+}
91628+
91629+static inline int
91630+xencomm_arch_hypercall_multicall(struct xencomm_handle *call_list,
91631+ int nr_calls)
91632+{
91633+ return _hypercall2(int, multicall, call_list, nr_calls);
91634+}
91635+
91636+static inline int
91637+xencomm_arch_hypercall_memory_op(unsigned int cmd, struct xencomm_handle *arg)
91638+{
91639+ return _hypercall2(int, memory_op, cmd, arg);
91640+}
91641+
91642+static inline int
91643+xencomm_arch_hypercall_event_channel_op(int cmd, struct xencomm_handle *arg)
91644+{
91645+ return _hypercall2(int, event_channel_op, cmd, arg);
91646+}
91647+
91648+static inline int
91649+xencomm_arch_hypercall_acm_op(unsigned int cmd, struct xencomm_handle *arg)
91650+{
91651+ return _hypercall2(int, acm_op, cmd, arg);
91652+}
91653+
91654+static inline int
91655+xencomm_arch_hypercall_xen_version(int cmd, struct xencomm_handle *arg)
91656+{
91657+ return _hypercall2(int, xen_version, cmd, arg);
91658+}
91659+
91660+static inline int
91661+xencomm_arch_hypercall_console_io(int cmd, int count,
91662+ struct xencomm_handle *str)
91663+{
91664+ return _hypercall3(int, console_io, cmd, count, str);
91665+}
91666+
91667+static inline int
91668+xencomm_arch_hypercall_physdev_op(int cmd, struct xencomm_handle *arg)
91669+{
91670+ return _hypercall2(int, physdev_op, cmd, arg);
91671+}
91672+
91673+static inline int
91674+xencomm_arch_hypercall_grant_table_op(unsigned int cmd,
91675+ struct xencomm_handle *uop,
91676+ unsigned int count)
91677+{
91678+ return _hypercall3(int, grant_table_op, cmd, uop, count);
91679+}
91680+
91681+int HYPERVISOR_grant_table_op(unsigned int cmd, void *uop, unsigned int count);
91682+
91683+extern int xencomm_arch_hypercall_suspend(struct xencomm_handle *arg);
91684+
91685+static inline int
91686+xencomm_arch_hypercall_callback_op(int cmd, struct xencomm_handle *arg)
91687+{
91688+ return _hypercall2(int, callback_op, cmd, arg);
91689+}
91690+
91691+static inline unsigned long
91692+xencomm_arch_hypercall_hvm_op(int cmd, void *arg)
91693+{
91694+ return _hypercall2(unsigned long, hvm_op, cmd, arg);
91695+}
91696+
91697+static inline int
91698+HYPERVISOR_physdev_op(int cmd, void *arg)
91699+{
91700+ switch (cmd) {
91701+ case PHYSDEVOP_eoi:
91702+ return _hypercall1(int, ia64_fast_eoi,
91703+ ((struct physdev_eoi *)arg)->irq);
91704+ default:
91705+ return xencomm_hypercall_physdev_op(cmd, arg);
91706+ }
91707+}
91708+
91709+static inline int
91710+xencomm_arch_hypercall_xenoprof_op(int op, struct xencomm_handle *arg)
91711+{
91712+ return _hypercall2(int, xenoprof_op, op, arg);
91713+}
91714+
91715+extern fastcall unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs);
91716+static inline void exit_idle(void) {}
91717+#define do_IRQ(irq, regs) ({ \
91718+ irq_enter(); \
91719+ __do_IRQ((irq), (regs)); \
91720+ irq_exit(); \
91721+})
91722+
91723+#include <linux/err.h>
91724+#ifdef CONFIG_XEN
91725+#include <asm/xen/privop.h>
91726+#endif /* CONFIG_XEN */
91727+#ifdef HAVE_XEN_PLATFORM_COMPAT_H
91728+#include <xen/platform-compat.h>
91729+#endif
91730+
91731+static inline unsigned long
91732+__HYPERVISOR_ioremap(unsigned long ioaddr, unsigned long size)
91733+{
91734+ return _hypercall3(unsigned long, ia64_dom0vp_op,
91735+ IA64_DOM0VP_ioremap, ioaddr, size);
91736+}
91737+
91738+static inline unsigned long
91739+HYPERVISOR_ioremap(unsigned long ioaddr, unsigned long size)
91740+{
91741+ unsigned long ret = ioaddr;
91742+ if (is_running_on_xen()) {
91743+ ret = __HYPERVISOR_ioremap(ioaddr, size);
91744+ if (unlikely(ret == -ENOSYS))
91745+ panic("hypercall %s failed with %ld. "
91746+ "Please check Xen and Linux config mismatch\n",
91747+ __func__, -ret);
91748+ else if (unlikely(IS_ERR_VALUE(ret)))
91749+ ret = ioaddr;
91750+ }
91751+ return ret;
91752+}
91753+
91754+static inline unsigned long
91755+__HYPERVISOR_phystomach(unsigned long gpfn)
91756+{
91757+ return _hypercall2(unsigned long, ia64_dom0vp_op,
91758+ IA64_DOM0VP_phystomach, gpfn);
91759+}
91760+
91761+static inline unsigned long
91762+HYPERVISOR_phystomach(unsigned long gpfn)
91763+{
91764+ unsigned long ret = gpfn;
91765+ if (is_running_on_xen()) {
91766+ ret = __HYPERVISOR_phystomach(gpfn);
91767+ }
91768+ return ret;
91769+}
91770+
91771+static inline unsigned long
91772+__HYPERVISOR_machtophys(unsigned long mfn)
91773+{
91774+ return _hypercall2(unsigned long, ia64_dom0vp_op,
91775+ IA64_DOM0VP_machtophys, mfn);
91776+}
91777+
91778+static inline unsigned long
91779+HYPERVISOR_machtophys(unsigned long mfn)
91780+{
91781+ unsigned long ret = mfn;
91782+ if (is_running_on_xen()) {
91783+ ret = __HYPERVISOR_machtophys(mfn);
91784+ }
91785+ return ret;
91786+}
91787+
91788+static inline unsigned long
91789+__HYPERVISOR_zap_physmap(unsigned long gpfn, unsigned int extent_order)
91790+{
91791+ return _hypercall3(unsigned long, ia64_dom0vp_op,
91792+ IA64_DOM0VP_zap_physmap, gpfn, extent_order);
91793+}
91794+
91795+static inline unsigned long
91796+HYPERVISOR_zap_physmap(unsigned long gpfn, unsigned int extent_order)
91797+{
91798+ unsigned long ret = 0;
91799+ if (is_running_on_xen()) {
91800+ ret = __HYPERVISOR_zap_physmap(gpfn, extent_order);
91801+ }
91802+ return ret;
91803+}
91804+
91805+static inline unsigned long
91806+__HYPERVISOR_add_physmap(unsigned long gpfn, unsigned long mfn,
91807+ unsigned long flags, domid_t domid)
91808+{
91809+ return _hypercall5(unsigned long, ia64_dom0vp_op,
91810+ IA64_DOM0VP_add_physmap, gpfn, mfn, flags, domid);
91811+}
91812+
91813+static inline unsigned long
91814+HYPERVISOR_add_physmap(unsigned long gpfn, unsigned long mfn,
91815+ unsigned long flags, domid_t domid)
91816+{
91817+ unsigned long ret = 0;
91818+ BUG_ON(!is_running_on_xen());//XXX
91819+ if (is_running_on_xen()) {
91820+ ret = __HYPERVISOR_add_physmap(gpfn, mfn, flags, domid);
91821+ }
91822+ return ret;
91823+}
91824+
91825+static inline unsigned long
91826+__HYPERVISOR_add_physmap_with_gmfn(unsigned long gpfn, unsigned long gmfn,
91827+ unsigned long flags, domid_t domid)
91828+{
91829+ return _hypercall5(unsigned long, ia64_dom0vp_op,
91830+ IA64_DOM0VP_add_physmap_with_gmfn,
91831+ gpfn, gmfn, flags, domid);
91832+}
91833+
91834+static inline unsigned long
91835+HYPERVISOR_add_physmap_with_gmfn(unsigned long gpfn, unsigned long gmfn,
91836+ unsigned long flags, domid_t domid)
91837+{
91838+ unsigned long ret = 0;
91839+ BUG_ON(!is_running_on_xen());//XXX
91840+ if (is_running_on_xen()) {
91841+ ret = __HYPERVISOR_add_physmap_with_gmfn(gpfn, gmfn,
91842+ flags, domid);
91843+ }
91844+ return ret;
91845+}
91846+
91847+#ifdef CONFIG_XEN_IA64_EXPOSE_P2M
91848+static inline unsigned long
91849+HYPERVISOR_expose_p2m(unsigned long conv_start_gpfn,
91850+ unsigned long assign_start_gpfn,
91851+ unsigned long expose_size, unsigned long granule_pfn)
91852+{
91853+ return _hypercall5(unsigned long, ia64_dom0vp_op,
91854+ IA64_DOM0VP_expose_p2m, conv_start_gpfn,
91855+ assign_start_gpfn, expose_size, granule_pfn);
91856+}
91857+#endif
91858+
91859+static inline int
91860+xencomm_arch_hypercall_perfmon_op(unsigned long cmd,
91861+ struct xencomm_handle *arg,
91862+ unsigned long count)
91863+{
91864+ return _hypercall4(int, ia64_dom0vp_op,
91865+ IA64_DOM0VP_perfmon, cmd, arg, count);
91866+}
91867+
91868+// for balloon driver
91869+#define HYPERVISOR_update_va_mapping(va, new_val, flags) (0)
91870+
91871+/* Use xencomm to do hypercalls. */
91872+#ifdef MODULE
91873+#define HYPERVISOR_sched_op xencomm_mini_hypercall_sched_op
91874+#define HYPERVISOR_event_channel_op xencomm_mini_hypercall_event_channel_op
91875+#define HYPERVISOR_callback_op xencomm_mini_hypercall_callback_op
91876+#define HYPERVISOR_multicall xencomm_mini_hypercall_multicall
91877+#define HYPERVISOR_xen_version xencomm_mini_hypercall_xen_version
91878+#define HYPERVISOR_console_io xencomm_mini_hypercall_console_io
91879+#define HYPERVISOR_hvm_op xencomm_mini_hypercall_hvm_op
91880+#define HYPERVISOR_memory_op xencomm_mini_hypercall_memory_op
91881+#define HYPERVISOR_xenoprof_op xencomm_mini_hypercall_xenoprof_op
91882+#define HYPERVISOR_perfmon_op xencomm_mini_hypercall_perfmon_op
91883+#else
91884+#define HYPERVISOR_sched_op xencomm_hypercall_sched_op
91885+#define HYPERVISOR_event_channel_op xencomm_hypercall_event_channel_op
91886+#define HYPERVISOR_callback_op xencomm_hypercall_callback_op
91887+#define HYPERVISOR_multicall xencomm_hypercall_multicall
91888+#define HYPERVISOR_xen_version xencomm_hypercall_xen_version
91889+#define HYPERVISOR_console_io xencomm_hypercall_console_io
91890+#define HYPERVISOR_hvm_op xencomm_hypercall_hvm_op
91891+#define HYPERVISOR_memory_op xencomm_hypercall_memory_op
91892+#define HYPERVISOR_xenoprof_op xencomm_hypercall_xenoprof_op
91893+#define HYPERVISOR_perfmon_op xencomm_hypercall_perfmon_op
91894+#endif
91895+
91896+#define HYPERVISOR_suspend xencomm_hypercall_suspend
91897+
91898+#endif /* __HYPERCALL_H__ */
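Most wrappers in this hypercall.h follow one shape: the public HYPERVISOR_* function checks is_running_on_xen() and only then issues the raw __HYPERVISOR_* hypercall, otherwise it falls back to a neutral value (for HYPERVISOR_phystomach that is the unchanged gpfn). The standalone C sketch below mirrors that guard pattern with made-up stubs; it is an illustration only, not part of the patch, and the "hypercall" is a fake translation.

#include <stdio.h>

static int running_on_xen = 0;                 /* stand-in for the real flag */
#define is_running_on_xen() (running_on_xen)

static unsigned long __raw_phystomach(unsigned long gpfn)
{
        /* stand-in for the real _hypercall2(...) */
        return gpfn + 0x100000;                /* pretend Xen relocated the frame */
}

static unsigned long sketch_phystomach(unsigned long gpfn)
{
        unsigned long ret = gpfn;              /* native fallback: identity */
        if (is_running_on_xen())
                ret = __raw_phystomach(gpfn);
        return ret;
}

int main(void)
{
        printf("native: %#lx\n", sketch_phystomach(0x42));
        running_on_xen = 1;
        printf("xen:    %#lx\n", sketch_phystomach(0x42));
        return 0;
}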
91899diff -Nur linux-2.6.16.33-noxen/include/asm-ia64/hypervisor.h linux-2.6.16.33/include/asm-ia64/hypervisor.h
91900--- linux-2.6.16.33-noxen/include/asm-ia64/hypervisor.h 1970-01-01 00:00:00.000000000 +0000
91901+++ linux-2.6.16.33/include/asm-ia64/hypervisor.h 2007-01-08 15:00:46.000000000 +0000
91902@@ -0,0 +1,223 @@
91903+/******************************************************************************
91904+ * hypervisor.h
91905+ *
91906+ * Linux-specific hypervisor handling.
91907+ *
91908+ * Copyright (c) 2002-2004, K A Fraser
91909+ *
91910+ * This program is free software; you can redistribute it and/or
91911+ * modify it under the terms of the GNU General Public License version 2
91912+ * as published by the Free Software Foundation; or, when distributed
91913+ * separately from the Linux kernel or incorporated into other
91914+ * software packages, subject to the following license:
91915+ *
91916+ * Permission is hereby granted, free of charge, to any person obtaining a copy
91917+ * of this source file (the "Software"), to deal in the Software without
91918+ * restriction, including without limitation the rights to use, copy, modify,
91919+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
91920+ * and to permit persons to whom the Software is furnished to do so, subject to
91921+ * the following conditions:
91922+ *
91923+ * The above copyright notice and this permission notice shall be included in
91924+ * all copies or substantial portions of the Software.
91925+ *
91926+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
91927+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
91928+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
91929+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
91930+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
91931+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
91932+ * IN THE SOFTWARE.
91933+ */
91934+
91935+#ifndef __HYPERVISOR_H__
91936+#define __HYPERVISOR_H__
91937+
91938+#ifdef CONFIG_XEN
91939+extern int running_on_xen;
91940+#define is_running_on_xen() (running_on_xen)
91941+#else /* CONFIG_XEN */
91942+# ifdef CONFIG_VMX_GUEST
91943+# define is_running_on_xen() (1)
91944+# else /* CONFIG_VMX_GUEST */
91945+# define is_running_on_xen() (0)
91946+# define HYPERVISOR_ioremap(offset, size) (offset)
91947+# endif /* CONFIG_VMX_GUEST */
91948+#endif /* CONFIG_XEN */
91949+
91950+#if defined(CONFIG_XEN) || defined(CONFIG_VMX_GUEST)
91951+#include <linux/config.h>
91952+#include <linux/types.h>
91953+#include <linux/kernel.h>
91954+#include <linux/version.h>
91955+#include <linux/errno.h>
91956+#include <xen/interface/xen.h>
91957+#include <xen/interface/dom0_ops.h>
91958+#include <xen/interface/event_channel.h>
91959+#include <xen/interface/physdev.h>
91960+#include <xen/interface/sched.h>
91961+#include <asm/hypercall.h>
91962+#include <asm/ptrace.h>
91963+#include <asm/page.h>
91964+
91965+extern shared_info_t *HYPERVISOR_shared_info;
91966+extern start_info_t *xen_start_info;
91967+
91968+void force_evtchn_callback(void);
91969+
91970+#ifndef CONFIG_VMX_GUEST
91971+/* Turn jiffies into Xen system time. XXX Implement me. */
91972+#define jiffies_to_st(j) 0
91973+
91974+static inline int
91975+HYPERVISOR_yield(
91976+ void)
91977+{
91978+ int rc = HYPERVISOR_sched_op(SCHEDOP_yield, NULL);
91979+
91980+ return rc;
91981+}
91982+
91983+static inline int
91984+HYPERVISOR_block(
91985+ void)
91986+{
91987+ int rc = HYPERVISOR_sched_op(SCHEDOP_block, NULL);
91988+
91989+ return rc;
91990+}
91991+
91992+static inline int
91993+HYPERVISOR_shutdown(
91994+ unsigned int reason)
91995+{
91996+ struct sched_shutdown sched_shutdown = {
91997+ .reason = reason
91998+ };
91999+
92000+ int rc = HYPERVISOR_sched_op(SCHEDOP_shutdown, &sched_shutdown);
92001+
92002+ return rc;
92003+}
92004+
92005+static inline int
92006+HYPERVISOR_poll(
92007+ evtchn_port_t *ports, unsigned int nr_ports, u64 timeout)
92008+{
92009+ struct sched_poll sched_poll = {
92010+ .nr_ports = nr_ports,
92011+ .timeout = jiffies_to_st(timeout)
92012+ };
92013+
92014+ int rc;
92015+
92016+ set_xen_guest_handle(sched_poll.ports, ports);
92017+ rc = HYPERVISOR_sched_op(SCHEDOP_poll, &sched_poll);
92018+
92019+ return rc;
92020+}
92021+
92022+// for drivers/xen/privcmd/privcmd.c
92023+#define machine_to_phys_mapping 0
92024+struct vm_area_struct;
92025+int direct_remap_pfn_range(struct vm_area_struct *vma,
92026+ unsigned long address,
92027+ unsigned long mfn,
92028+ unsigned long size,
92029+ pgprot_t prot,
92030+ domid_t domid);
92031+struct file;
92032+int privcmd_enforce_singleshot_mapping(struct vm_area_struct *vma);
92033+int privcmd_mmap(struct file * file, struct vm_area_struct * vma);
92034+#define HAVE_ARCH_PRIVCMD_MMAP
92035+
92036+// for drivers/xen/balloon/balloon.c
92037+#ifdef CONFIG_XEN_SCRUB_PAGES
92038+#define scrub_pages(_p,_n) memset((void *)(_p), 0, (_n) << PAGE_SHIFT)
92039+#else
92040+#define scrub_pages(_p,_n) ((void)0)
92041+#endif
92042+#define pte_mfn(_x) pte_pfn(_x)
92043+#define phys_to_machine_mapping_valid(_x) (1)
92044+
92045+#endif /* !CONFIG_VMX_GUEST */
92046+
92047+#define __pte_ma(_x) ((pte_t) {(_x)}) /* unmodified use */
92048+#define pfn_pte_ma(_x,_y) __pte_ma(0) /* unmodified use */
92049+
92050+#ifndef CONFIG_VMX_GUEST
92051+int __xen_create_contiguous_region(unsigned long vstart, unsigned int order, unsigned int address_bits);
92052+static inline int
92053+xen_create_contiguous_region(unsigned long vstart,
92054+ unsigned int order, unsigned int address_bits)
92055+{
92056+ int ret = 0;
92057+ if (is_running_on_xen()) {
92058+ ret = __xen_create_contiguous_region(vstart, order,
92059+ address_bits);
92060+ }
92061+ return ret;
92062+}
92063+
92064+void __xen_destroy_contiguous_region(unsigned long vstart, unsigned int order);
92065+static inline void
92066+xen_destroy_contiguous_region(unsigned long vstart, unsigned int order)
92067+{
92068+ if (is_running_on_xen())
92069+ __xen_destroy_contiguous_region(vstart, order);
92070+}
92071+
92072+#endif /* !CONFIG_VMX_GUEST */
92073+
92074+// for netfront.c, netback.c
92075+#define MULTI_UVMFLAGS_INDEX 0 //XXX any value
92076+
92077+static inline void
92078+MULTI_update_va_mapping(
92079+ multicall_entry_t *mcl, unsigned long va,
92080+ pte_t new_val, unsigned long flags)
92081+{
92082+ mcl->op = __HYPERVISOR_update_va_mapping;
92083+ mcl->result = 0;
92084+}
92085+
92086+static inline void
92087+MULTI_grant_table_op(multicall_entry_t *mcl, unsigned int cmd,
92088+ void *uop, unsigned int count)
92089+{
92090+ mcl->op = __HYPERVISOR_grant_table_op;
92091+ mcl->args[0] = cmd;
92092+ mcl->args[1] = (unsigned long)uop;
92093+ mcl->args[2] = count;
92094+}
92095+
92096+/*
92097+ * for blktap.c
92098+ * int create_lookup_pte_addr(struct mm_struct *mm,
92099+ * unsigned long address,
92100+ * uint64_t *ptep);
92101+ */
92102+#define create_lookup_pte_addr(mm, address, ptep) \
92103+ ({ \
92104+ printk(KERN_EMERG \
92105+ "%s:%d " \
92106+ "create_lookup_pte_addr() isn't supported.\n", \
92107+ __func__, __LINE__); \
92108+ BUG(); \
92109+ (-ENOSYS); \
92110+ })
92111+
92112+// for debug
92113+asmlinkage int xprintk(const char *fmt, ...);
92114+#define xprintd(fmt, ...) xprintk("%s:%d " fmt, __func__, __LINE__, \
92115+ ##__VA_ARGS__)
92116+
92117+#endif /* CONFIG_XEN || CONFIG_VMX_GUEST */
92118+
92119+#ifdef CONFIG_XEN_PRIVILEGED_GUEST
92120+#define is_initial_xendomain() (xen_start_info->flags & SIF_INITDOMAIN)
92121+#else
92122+#define is_initial_xendomain() 0
92123+#endif
92124+
92125+#endif /* __HYPERVISOR_H__ */
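The scheduler wrappers above (HYPERVISOR_yield, HYPERVISOR_block, HYPERVISOR_shutdown, HYPERVISOR_poll) all use one calling convention: pack the operation-specific arguments into a small struct and hand the operation code plus a pointer to a single sched_op entry point. A minimal standalone sketch of that convention, with the hypercall replaced by a printf stub and all names invented for the example:

#include <stdio.h>

enum { SCHEDOP_yield, SCHEDOP_block, SCHEDOP_shutdown };

struct sched_shutdown_sketch { unsigned int reason; };

static int sched_op_stub(int cmd, void *arg)
{
        printf("sched_op(cmd=%d, arg=%p)\n", cmd, arg);
        return 0;
}

static int sketch_shutdown(unsigned int reason)
{
        struct sched_shutdown_sketch s = { .reason = reason };
        return sched_op_stub(SCHEDOP_shutdown, &s);
}

int main(void)
{
        sched_op_stub(SCHEDOP_yield, NULL);    /* op with no arguments */
        sketch_shutdown(0);                    /* op with a packed argument struct */
        return 0;
}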
92126diff -Nur linux-2.6.16.33-noxen/include/asm-ia64/intel_intrin.h linux-2.6.16.33/include/asm-ia64/intel_intrin.h
92127--- linux-2.6.16.33-noxen/include/asm-ia64/intel_intrin.h 2006-11-22 18:06:31.000000000 +0000
92128+++ linux-2.6.16.33/include/asm-ia64/intel_intrin.h 2007-01-08 15:00:46.000000000 +0000
92129@@ -119,10 +119,10 @@
92130 * intrinsic
92131 */
92132
92133-#define ia64_getreg __getReg
92134-#define ia64_setreg __setReg
92135+#define __ia64_getreg __getReg
92136+#define __ia64_setreg __setReg
92137
92138-#define ia64_hint(x)
92139+#define __ia64_hint(x)
92140
92141 #define ia64_mux1_brcst 0
92142 #define ia64_mux1_mix 8
92143@@ -135,16 +135,16 @@
92144 #define ia64_getf_exp __getf_exp
92145 #define ia64_shrp _m64_shrp
92146
92147-#define ia64_tpa __tpa
92148+#define __ia64_tpa __tpa
92149 #define ia64_invala __invala
92150 #define ia64_invala_gr __invala_gr
92151 #define ia64_invala_fr __invala_fr
92152 #define ia64_nop __nop
92153 #define ia64_sum __sum
92154-#define ia64_ssm __ssm
92155+#define __ia64_ssm __ssm
92156 #define ia64_rum __rum
92157-#define ia64_rsm __rsm
92158-#define ia64_fc __fc
92159+#define __ia64_rsm __rsm
92160+#define __ia64_fc __fc
92161
92162 #define ia64_ldfs __ldfs
92163 #define ia64_ldfd __ldfd
92164@@ -182,24 +182,24 @@
92165
92166 #define __ia64_set_dbr(index, val) \
92167 __setIndReg(_IA64_REG_INDR_DBR, index, val)
92168-#define ia64_set_ibr(index, val) \
92169+#define __ia64_set_ibr(index, val) \
92170 __setIndReg(_IA64_REG_INDR_IBR, index, val)
92171-#define ia64_set_pkr(index, val) \
92172+#define __ia64_set_pkr(index, val) \
92173 __setIndReg(_IA64_REG_INDR_PKR, index, val)
92174-#define ia64_set_pmc(index, val) \
92175+#define __ia64_set_pmc(index, val) \
92176 __setIndReg(_IA64_REG_INDR_PMC, index, val)
92177-#define ia64_set_pmd(index, val) \
92178+#define __ia64_set_pmd(index, val) \
92179 __setIndReg(_IA64_REG_INDR_PMD, index, val)
92180-#define ia64_set_rr(index, val) \
92181+#define __ia64_set_rr(index, val) \
92182 __setIndReg(_IA64_REG_INDR_RR, index, val)
92183
92184-#define ia64_get_cpuid(index) __getIndReg(_IA64_REG_INDR_CPUID, index)
92185+#define __ia64_get_cpuid(index) __getIndReg(_IA64_REG_INDR_CPUID, index)
92186 #define __ia64_get_dbr(index) __getIndReg(_IA64_REG_INDR_DBR, index)
92187-#define ia64_get_ibr(index) __getIndReg(_IA64_REG_INDR_IBR, index)
92188-#define ia64_get_pkr(index) __getIndReg(_IA64_REG_INDR_PKR, index)
92189-#define ia64_get_pmc(index) __getIndReg(_IA64_REG_INDR_PMC, index)
92190-#define ia64_get_pmd(index) __getIndReg(_IA64_REG_INDR_PMD, index)
92191-#define ia64_get_rr(index) __getIndReg(_IA64_REG_INDR_RR, index)
92192+#define __ia64_get_ibr(index) __getIndReg(_IA64_REG_INDR_IBR, index)
92193+#define __ia64_get_pkr(index) __getIndReg(_IA64_REG_INDR_PKR, index)
92194+#define __ia64_get_pmc(index) __getIndReg(_IA64_REG_INDR_PMC, index)
92195+#define __ia64_get_pmd(index) __getIndReg(_IA64_REG_INDR_PMD, index)
92196+#define __ia64_get_rr(index) __getIndReg(_IA64_REG_INDR_RR, index)
92197
92198 #define ia64_srlz_d __dsrlz
92199 #define ia64_srlz_i __isrlz
92200@@ -218,18 +218,18 @@
92201 #define ia64_ld8_acq __ld8_acq
92202
92203 #define ia64_sync_i __synci
92204-#define ia64_thash __thash
92205-#define ia64_ttag __ttag
92206-#define ia64_itcd __itcd
92207-#define ia64_itci __itci
92208-#define ia64_itrd __itrd
92209-#define ia64_itri __itri
92210-#define ia64_ptce __ptce
92211-#define ia64_ptcl __ptcl
92212-#define ia64_ptcg __ptcg
92213-#define ia64_ptcga __ptcga
92214-#define ia64_ptri __ptri
92215-#define ia64_ptrd __ptrd
92216+#define __ia64_thash __thash
92217+#define __ia64_ttag __ttag
92218+#define __ia64_itcd __itcd
92219+#define __ia64_itci __itci
92220+#define __ia64_itrd __itrd
92221+#define __ia64_itri __itri
92222+#define __ia64_ptce __ptce
92223+#define __ia64_ptcl __ptcl
92224+#define __ia64_ptcg __ptcg
92225+#define __ia64_ptcga __ptcga
92226+#define __ia64_ptri __ptri
92227+#define __ia64_ptrd __ptrd
92228 #define ia64_dep_mi _m64_dep_mi
92229
92230 /* Values for lfhint in __lfetch and __lfetch_fault */
92231@@ -244,14 +244,16 @@
92232 #define ia64_lfetch_fault __lfetch_fault
92233 #define ia64_lfetch_fault_excl __lfetch_fault_excl
92234
92235-#define ia64_intrin_local_irq_restore(x) \
92236+#define __ia64_intrin_local_irq_restore(x) \
92237 do { \
92238 if ((x) != 0) { \
92239- ia64_ssm(IA64_PSR_I); \
92240+ __ia64_ssm(IA64_PSR_I); \
92241 ia64_srlz_d(); \
92242 } else { \
92243- ia64_rsm(IA64_PSR_I); \
92244+ __ia64_rsm(IA64_PSR_I); \
92245 } \
92246 } while (0)
92247
92248+#define __ia64_get_psr_i() (__ia64_getreg(_IA64_REG_PSR) & 0x4000UL)
92249+
92250 #endif /* _ASM_IA64_INTEL_INTRIN_H */
92251diff -Nur linux-2.6.16.33-noxen/include/asm-ia64/io.h linux-2.6.16.33/include/asm-ia64/io.h
92252--- linux-2.6.16.33-noxen/include/asm-ia64/io.h 2006-11-22 18:06:31.000000000 +0000
92253+++ linux-2.6.16.33/include/asm-ia64/io.h 2007-01-08 15:00:46.000000000 +0000
92254@@ -66,9 +66,11 @@
92255 #define PIO_RESERVED __IA64_UNCACHED_OFFSET
92256 #define HAVE_ARCH_PIO_SIZE
92257
92258+#include <asm/hypervisor.h>
92259 #include <asm/intrinsics.h>
92260 #include <asm/machvec.h>
92261 #include <asm/page.h>
92262+#include <asm/privop.h>
92263 #include <asm/system.h>
92264 #include <asm-generic/iomap.h>
92265
92266@@ -95,9 +97,44 @@
92267 * The following two macros are deprecated and scheduled for removal.
92268 * Please use the PCI-DMA interface defined in <asm/pci.h> instead.
92269 */
92270+#ifndef CONFIG_XEN
92271 #define bus_to_virt phys_to_virt
92272 #define virt_to_bus virt_to_phys
92273 #define page_to_bus page_to_phys
92274+#else
92275+#define bus_to_virt(bus) \
92276+ phys_to_virt(machine_to_phys_for_dma(bus))
92277+#define virt_to_bus(virt) \
92278+ phys_to_machine_for_dma(virt_to_phys(virt))
92279+#define page_to_bus(page) \
92280+ phys_to_machine_for_dma(page_to_pseudophys(page))
92281+
92282+#define page_to_pseudophys(page) \
92283+ ((dma_addr_t)page_to_pfn(page) << PAGE_SHIFT)
92284+
92285+/*
92286+ * Drivers that use page_to_phys() for bus addresses are broken.
92287+ * This includes:
92288+ * drivers/ide/cris/ide-cris.c
92289+ * drivers/scsi/dec_esp.c
92290+ */
92291+#define page_to_phys(page) (page_to_pseudophys(page))
92292+#define bvec_to_bus(bv) (page_to_bus((bv)->bv_page) + \
92293+ (unsigned long) (bv)->bv_offset)
92294+#define bio_to_pseudophys(bio) (page_to_pseudophys(bio_page((bio))) + \
92295+ (unsigned long) bio_offset((bio)))
92296+#define bvec_to_pseudophys(bv) (page_to_pseudophys((bv)->bv_page) + \
92297+ (unsigned long) (bv)->bv_offset)
92298+#define BIOVEC_PHYS_MERGEABLE(vec1, vec2) \
92299+ (((bvec_to_bus((vec1)) + (vec1)->bv_len) == bvec_to_bus((vec2))) && \
92300+ ((bvec_to_pseudophys((vec1)) + (vec1)->bv_len) == \
92301+ bvec_to_pseudophys((vec2))))
92302+
92303+/* We will be supplying our own /dev/mem implementation */
92304+#define ARCH_HAS_DEV_MEM
92305+#define ARCH_HAS_DEV_MEM_MMAP_MEM
92306+int xen_mmap_mem(struct file * file, struct vm_area_struct * vma);
92307+#endif /* CONFIG_XEN */
92308
92309 # endif /* KERNEL */
92310
92311@@ -425,6 +462,9 @@
92312 static inline void __iomem *
92313 ioremap (unsigned long offset, unsigned long size)
92314 {
92315+ offset = HYPERVISOR_ioremap(offset, size);
92316+ if (IS_ERR_VALUE(offset))
92317+ return (void __iomem*)offset;
92318 return (void __iomem *) (__IA64_UNCACHED_OFFSET | (offset));
92319 }
92320
92321diff -Nur linux-2.6.16.33-noxen/include/asm-ia64/iosapic.h linux-2.6.16.33/include/asm-ia64/iosapic.h
92322--- linux-2.6.16.33-noxen/include/asm-ia64/iosapic.h 2006-11-22 18:06:31.000000000 +0000
92323+++ linux-2.6.16.33/include/asm-ia64/iosapic.h 2007-01-08 15:00:46.000000000 +0000
92324@@ -53,6 +53,7 @@
92325
92326 #define NR_IOSAPICS 256
92327
92328+#ifndef CONFIG_XEN
92329 static inline unsigned int iosapic_read(char __iomem *iosapic, unsigned int reg)
92330 {
92331 writel(reg, iosapic + IOSAPIC_REG_SELECT);
92332@@ -64,6 +65,7 @@
92333 writel(reg, iosapic + IOSAPIC_REG_SELECT);
92334 writel(val, iosapic + IOSAPIC_WINDOW);
92335 }
92336+#endif
92337
92338 static inline void iosapic_eoi(char __iomem *iosapic, u32 vector)
92339 {
92340diff -Nur linux-2.6.16.33-noxen/include/asm-ia64/irq.h linux-2.6.16.33/include/asm-ia64/irq.h
92341--- linux-2.6.16.33-noxen/include/asm-ia64/irq.h 2006-11-22 18:06:31.000000000 +0000
92342+++ linux-2.6.16.33/include/asm-ia64/irq.h 2007-01-08 15:00:46.000000000 +0000
92343@@ -11,8 +11,41 @@
92344 * 02/29/00 D.Mosberger moved most things into hw_irq.h
92345 */
92346
92347+#ifndef CONFIG_XEN
92348 #define NR_IRQS 256
92349 #define NR_IRQ_VECTORS NR_IRQS
92350+#else
92351+/*
92352+ * The flat IRQ space is divided into two regions:
92353+ * 1. A one-to-one mapping of real physical IRQs. This space is only used
92354+ * if we have physical device-access privilege. This region is at the
92355+ * start of the IRQ space so that existing device drivers do not need
92356+ * to be modified to translate physical IRQ numbers into our IRQ space.
92357+ * 2. A dynamic mapping of inter-domain and Xen-sourced virtual IRQs. These
92358+ * are bound using the provided bind/unbind functions.
92359+ */
92360+
92361+#define PIRQ_BASE 0
92362+#define NR_PIRQS 256
92363+
92364+#define DYNIRQ_BASE (PIRQ_BASE + NR_PIRQS)
92365+#define NR_DYNIRQS 256
92366+
92367+#define NR_IRQS (NR_PIRQS + NR_DYNIRQS)
92368+#define NR_IRQ_VECTORS NR_IRQS
92369+
92370+#define pirq_to_irq(_x) ((_x) + PIRQ_BASE)
92371+#define irq_to_pirq(_x) ((_x) - PIRQ_BASE)
92372+
92373+#define dynirq_to_irq(_x) ((_x) + DYNIRQ_BASE)
92374+#define irq_to_dynirq(_x) ((_x) - DYNIRQ_BASE)
92375+
92376+#define RESCHEDULE_VECTOR 0
92377+#define IPI_VECTOR 1
92378+#define CMCP_VECTOR 2
92379+#define CPEP_VECTOR 3
92380+#define NR_IPIS 4
92381+#endif /* CONFIG_XEN */
92382
92383 /*
92384 * IRQ line status macro IRQ_PER_CPU is used
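The irq.h comment above describes a flat IRQ number space: physical IRQs keep their numbers in [PIRQ_BASE, PIRQ_BASE+NR_PIRQS) and dynamically bound event-channel IRQs follow at DYNIRQ_BASE. A tiny standalone sketch of the resulting arithmetic, using the same constants as the patch (the main() is only for illustration):

#include <stdio.h>

#define PIRQ_BASE    0
#define NR_PIRQS     256
#define DYNIRQ_BASE  (PIRQ_BASE + NR_PIRQS)
#define NR_DYNIRQS   256

#define pirq_to_irq(x)   ((x) + PIRQ_BASE)
#define dynirq_to_irq(x) ((x) + DYNIRQ_BASE)

int main(void)
{
        /* physical IRQ 14 keeps its number; dynamic IRQ 3 lands above the PIRQ region */
        printf("pirq 14  -> irq %d\n", pirq_to_irq(14));    /* 14  */
        printf("dynirq 3 -> irq %d\n", dynirq_to_irq(3));   /* 259 */
        return 0;
}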
92385diff -Nur linux-2.6.16.33-noxen/include/asm-ia64/machvec_dig.h linux-2.6.16.33/include/asm-ia64/machvec_dig.h
92386--- linux-2.6.16.33-noxen/include/asm-ia64/machvec_dig.h 2006-11-22 18:06:31.000000000 +0000
92387+++ linux-2.6.16.33/include/asm-ia64/machvec_dig.h 2007-01-08 15:00:46.000000000 +0000
92388@@ -15,4 +15,19 @@
92389 #define platform_setup dig_setup
92390 #define platform_irq_init dig_irq_init
92391
92392+#ifdef CONFIG_XEN
92393+# define platform_dma_map_sg dma_map_sg
92394+# define platform_dma_unmap_sg dma_unmap_sg
92395+# define platform_dma_mapping_error dma_mapping_error
92396+# define platform_dma_supported dma_supported
92397+# define platform_dma_alloc_coherent dma_alloc_coherent
92398+# define platform_dma_free_coherent dma_free_coherent
92399+# define platform_dma_map_single dma_map_single
92400+# define platform_dma_unmap_single dma_unmap_single
92401+# define platform_dma_sync_single_for_cpu \
92402+ dma_sync_single_for_cpu
92403+# define platform_dma_sync_single_for_device \
92404+ dma_sync_single_for_device
92405+#endif
92406+
92407 #endif /* _ASM_IA64_MACHVEC_DIG_h */
92408diff -Nur linux-2.6.16.33-noxen/include/asm-ia64/maddr.h linux-2.6.16.33/include/asm-ia64/maddr.h
92409--- linux-2.6.16.33-noxen/include/asm-ia64/maddr.h 1970-01-01 00:00:00.000000000 +0000
92410+++ linux-2.6.16.33/include/asm-ia64/maddr.h 2007-01-08 15:00:46.000000000 +0000
92411@@ -0,0 +1,102 @@
92412+#ifndef _ASM_IA64_MADDR_H
92413+#define _ASM_IA64_MADDR_H
92414+
92415+#include <linux/kernel.h>
92416+#include <asm/hypervisor.h>
92417+#include <xen/features.h>
92418+#include <xen/interface/xen.h>
92419+
92420+#ifdef CONFIG_XEN
92421+
92422+#define INVALID_P2M_ENTRY (~0UL)
92423+
92424+#ifdef CONFIG_XEN_IA64_EXPOSE_P2M
92425+extern int p2m_initialized;
92426+extern unsigned long p2m_min_low_pfn;
92427+extern unsigned long p2m_max_low_pfn;
92428+extern unsigned long p2m_convert_min_pfn;
92429+extern unsigned long p2m_convert_max_pfn;
92430+extern volatile const pte_t* p2m_pte;
92431+unsigned long p2m_phystomach(unsigned long gpfn);
92432+#else
92433+#define p2m_initialized (0)
92434+#define p2m_phystomach(gpfn) INVALID_MFN
92435+#endif
92436+
92437+/* XXX xen page size != page size */
92438+static inline unsigned long
92439+pfn_to_mfn_for_dma(unsigned long pfn)
92440+{
92441+ unsigned long mfn;
92442+ if (p2m_initialized)
92443+ return p2m_phystomach(pfn);
92444+ mfn = HYPERVISOR_phystomach(pfn);
92445+ BUG_ON(mfn == 0); // XXX
92446+ BUG_ON(mfn == INVALID_P2M_ENTRY); // XXX
92447+ BUG_ON(mfn == INVALID_MFN);
92448+ return mfn;
92449+}
92450+
92451+static inline unsigned long
92452+phys_to_machine_for_dma(unsigned long phys)
92453+{
92454+ unsigned long machine =
92455+ pfn_to_mfn_for_dma(phys >> PAGE_SHIFT) << PAGE_SHIFT;
92456+ machine |= (phys & ~PAGE_MASK);
92457+ return machine;
92458+}
92459+
92460+static inline unsigned long
92461+mfn_to_pfn_for_dma(unsigned long mfn)
92462+{
92463+ unsigned long pfn;
92464+ pfn = HYPERVISOR_machtophys(mfn);
92465+ BUG_ON(pfn == 0);
92466+ //BUG_ON(pfn == INVALID_M2P_ENTRY);
92467+ return pfn;
92468+}
92469+
92470+static inline unsigned long
92471+machine_to_phys_for_dma(unsigned long machine)
92472+{
92473+ unsigned long phys =
92474+ mfn_to_pfn_for_dma(machine >> PAGE_SHIFT) << PAGE_SHIFT;
92475+ phys |= (machine & ~PAGE_MASK);
92476+ return phys;
92477+}
92478+
92479+static inline unsigned long
92480+mfn_to_local_pfn(unsigned long mfn)
92481+{
92482+ extern unsigned long max_mapnr;
92483+ unsigned long pfn = mfn_to_pfn_for_dma(mfn);
92484+ if (!pfn_valid(pfn))
92485+ return INVALID_P2M_ENTRY;
92486+ return pfn;
92487+}
92488+
92489+#else /* !CONFIG_XEN */
92490+
92491+#define pfn_to_mfn_for_dma(pfn) (pfn)
92492+#define mfn_to_pfn_for_dma(mfn) (mfn)
92493+#define phys_to_machine_for_dma(phys) (phys)
92494+#define machine_to_phys_for_dma(machine) (machine)
92495+#define mfn_to_local_pfn(mfn) (mfn)
92496+
92497+#endif /* !CONFIG_XEN */
92498+
92499+/* XXX to compile set_phys_to_machine(vaddr, FOREIGN_FRAME(m)) */
92500+#define FOREIGN_FRAME(m) (INVALID_P2M_ENTRY)
92501+
92502+#define mfn_to_pfn(mfn) (mfn)
92503+#define pfn_to_mfn(pfn) (pfn)
92504+
92505+#define mfn_to_virt(mfn) (__va((mfn) << PAGE_SHIFT))
92506+#define virt_to_mfn(virt) (__pa(virt) >> PAGE_SHIFT)
92507+#define virt_to_machine(virt) __pa(virt) // for tpmfront.c
92508+
92509+#define set_phys_to_machine(pfn, mfn) do { } while (0)
92510+
92511+typedef unsigned long maddr_t; // to compile netback, netfront
92512+
92513+#endif /* _ASM_IA64_MADDR_H */
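phys_to_machine_for_dma() above translates only the page frame number and carries the in-page offset over unchanged. The standalone sketch below reproduces that frame/offset split; the p2m lookup is a made-up stub and the 16 KB page size is only an assumption for the example.

#include <stdio.h>

#define PAGE_SHIFT 14UL                        /* assume 16 KB pages for the sketch */
#define PAGE_SIZE  (1UL << PAGE_SHIFT)
#define PAGE_MASK  (~(PAGE_SIZE - 1))

static unsigned long stub_phystomach(unsigned long gpfn)
{
        return gpfn ^ 0x80;                    /* pretend p2m lookup */
}

static unsigned long phys_to_machine_sketch(unsigned long phys)
{
        unsigned long machine = stub_phystomach(phys >> PAGE_SHIFT) << PAGE_SHIFT;
        machine |= (phys & ~PAGE_MASK);        /* keep the sub-page offset */
        return machine;
}

int main(void)
{
        unsigned long phys = (0x123UL << PAGE_SHIFT) | 0x2a;
        printf("phys %#lx -> machine %#lx\n", phys, phys_to_machine_sketch(phys));
        return 0;
}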
92514diff -Nur linux-2.6.16.33-noxen/include/asm-ia64/meminit.h linux-2.6.16.33/include/asm-ia64/meminit.h
92515--- linux-2.6.16.33-noxen/include/asm-ia64/meminit.h 2006-11-22 18:06:31.000000000 +0000
92516+++ linux-2.6.16.33/include/asm-ia64/meminit.h 2007-01-08 15:00:46.000000000 +0000
92517@@ -17,10 +17,15 @@
92518 * - command line string
92519 * - kernel code & data
92520 * - Kernel memory map built from EFI memory map
92521+ * - xen start info
92522 *
92523 * More could be added if necessary
92524 */
92525+#ifndef CONFIG_XEN
92526 #define IA64_MAX_RSVD_REGIONS 6
92527+#else
92528+#define IA64_MAX_RSVD_REGIONS 7
92529+#endif
92530
92531 struct rsvd_region {
92532 unsigned long start; /* virtual address of beginning of element */
92533diff -Nur linux-2.6.16.33-noxen/include/asm-ia64/page.h linux-2.6.16.33/include/asm-ia64/page.h
92534--- linux-2.6.16.33-noxen/include/asm-ia64/page.h 2006-11-22 18:06:31.000000000 +0000
92535+++ linux-2.6.16.33/include/asm-ia64/page.h 2007-01-08 15:00:46.000000000 +0000
92536@@ -117,7 +117,9 @@
92537 # define pfn_to_page(pfn) (vmem_map + (pfn))
92538 #endif
92539
92540+#ifndef CONFIG_XEN
92541 #define page_to_phys(page) (page_to_pfn(page) << PAGE_SHIFT)
92542+#endif
92543 #define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)
92544 #define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT)
92545
92546@@ -219,4 +221,53 @@
92547 (((current->personality & READ_IMPLIES_EXEC) != 0) \
92548 ? VM_EXEC : 0))
92549
92550+#ifndef __ASSEMBLY__
92551+#ifdef CONFIG_XEN
92552+
92553+#include <linux/kernel.h>
92554+#include <asm/hypervisor.h>
92555+#include <xen/features.h> // to compile netback, netfront
92556+
92557+/*
92558+ * XXX hack!
92559+ * Linux/IA64 uses PG_arch_1.
92560+ * This hack will be removed once PG_foreign bit is taken.
92561+ * #include <xen/foreign_page.h>
92562+ */
92563+#ifdef __ASM_XEN_FOREIGN_PAGE_H__
92564+# error "don't include include/xen/foreign_page.h!"
92565+#endif
92566+
92567+extern struct address_space xen_ia64_foreign_dummy_mapping;
92568+#define PageForeign(page) \
92569+ ((page)->mapping == &xen_ia64_foreign_dummy_mapping)
92570+
92571+#define SetPageForeign(page, dtor) do { \
92572+ set_page_private((page), (unsigned long)(dtor)); \
92573+ (page)->mapping = &xen_ia64_foreign_dummy_mapping; \
92574+ smp_rmb(); \
92575+} while (0)
92576+
92577+#define ClearPageForeign(page) do { \
92578+ (page)->mapping = NULL; \
92579+ smp_rmb(); \
92580+ set_page_private((page), 0); \
92581+} while (0)
92582+
92583+#define PageForeignDestructor(page) \
92584+ ( (void (*) (struct page *)) page_private(page) )
92585+
92586+#define arch_free_page(_page,_order) \
92587+({ int foreign = PageForeign(_page); \
92588+ if (foreign) \
92589+ (PageForeignDestructor(_page))(_page); \
92590+ foreign; \
92591+})
92592+#define HAVE_ARCH_FREE_PAGE
92593+
92594+#include <asm/maddr.h>
92595+
92596+#endif /* CONFIG_XEN */
92597+#endif /* __ASSEMBLY__ */
92598+
92599 #endif /* _ASM_IA64_PAGE_H */
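The page.h hunk marks foreign pages by pointing page->mapping at a dummy address_space and parking the destructor in the page's private word, so arch_free_page() can divert the free path. A toy standalone sketch of that scheme; struct page, the dummy mapping and the destructor below are all stand-ins, not the kernel types:

#include <stdio.h>

struct page {
        void *mapping;
        unsigned long private;
};

static struct page foreign_dummy_mapping_anchor;   /* plays the dummy address_space */

#define PageForeign(p)  ((p)->mapping == &foreign_dummy_mapping_anchor)

static void my_dtor(struct page *p) { (void)p; printf("foreign destructor ran\n"); }

static void SetPageForeignSketch(struct page *p, void (*dtor)(struct page *))
{
        p->private = (unsigned long)dtor;           /* stash the destructor        */
        p->mapping = &foreign_dummy_mapping_anchor; /* mark the page as foreign    */
}

static int arch_free_page_sketch(struct page *p)
{
        int foreign = PageForeign(p);
        if (foreign)
                ((void (*)(struct page *))p->private)(p);
        return foreign;                             /* non-zero: skip the normal free */
}

int main(void)
{
        struct page pg = { 0 };
        SetPageForeignSketch(&pg, my_dtor);
        printf("diverted: %d\n", arch_free_page_sketch(&pg));
        return 0;
}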
92600diff -Nur linux-2.6.16.33-noxen/include/asm-ia64/pal.h linux-2.6.16.33/include/asm-ia64/pal.h
92601--- linux-2.6.16.33-noxen/include/asm-ia64/pal.h 2006-11-22 18:06:31.000000000 +0000
92602+++ linux-2.6.16.33/include/asm-ia64/pal.h 2007-01-08 15:00:46.000000000 +0000
92603@@ -81,6 +81,7 @@
92604 #ifndef __ASSEMBLY__
92605
92606 #include <linux/types.h>
92607+#include <asm/processor.h>
92608 #include <asm/fpu.h>
92609
92610 /*
92611diff -Nur linux-2.6.16.33-noxen/include/asm-ia64/pgalloc.h linux-2.6.16.33/include/asm-ia64/pgalloc.h
92612--- linux-2.6.16.33-noxen/include/asm-ia64/pgalloc.h 2006-11-22 18:06:31.000000000 +0000
92613+++ linux-2.6.16.33/include/asm-ia64/pgalloc.h 2007-01-08 15:00:46.000000000 +0000
92614@@ -126,7 +126,11 @@
92615 static inline void
92616 pmd_populate(struct mm_struct *mm, pmd_t * pmd_entry, struct page *pte)
92617 {
92618+#ifndef CONFIG_XEN
92619 pmd_val(*pmd_entry) = page_to_phys(pte);
92620+#else
92621+ pmd_val(*pmd_entry) = page_to_pseudophys(pte);
92622+#endif
92623 }
92624
92625 static inline void
92626diff -Nur linux-2.6.16.33-noxen/include/asm-ia64/privop.h linux-2.6.16.33/include/asm-ia64/privop.h
92627--- linux-2.6.16.33-noxen/include/asm-ia64/privop.h 1970-01-01 00:00:00.000000000 +0000
92628+++ linux-2.6.16.33/include/asm-ia64/privop.h 2007-01-08 15:00:46.000000000 +0000
92629@@ -0,0 +1,60 @@
92630+#ifndef _ASM_IA64_PRIVOP_H
92631+#define _ASM_IA64_PRIVOP_H
92632+
92633+/*
92634+ * Copyright (C) 2005 Hewlett-Packard Co
92635+ * Dan Magenheimer <dan.magenheimer@hp.com>
92636+ *
92637+ */
92638+
92639+#ifdef CONFIG_XEN
92640+#include <asm/xen/privop.h>
92641+#endif
92642+
92643+#ifndef __ASSEMBLY
92644+
92645+#ifndef IA64_PARAVIRTUALIZED
92646+
92647+#define ia64_getreg __ia64_getreg
92648+#define ia64_setreg __ia64_setreg
92649+#define ia64_hint __ia64_hint
92650+#define ia64_thash __ia64_thash
92651+#define ia64_itci __ia64_itci
92652+#define ia64_itcd __ia64_itcd
92653+#define ia64_itri __ia64_itri
92654+#define ia64_itrd __ia64_itrd
92655+#define ia64_tpa __ia64_tpa
92656+#define ia64_set_ibr __ia64_set_ibr
92657+#define ia64_set_pkr __ia64_set_pkr
92658+#define ia64_set_pmc __ia64_set_pmc
92659+#define ia64_set_pmd __ia64_set_pmd
92660+#define ia64_set_rr __ia64_set_rr
92661+#define ia64_get_cpuid __ia64_get_cpuid
92662+#define ia64_get_ibr __ia64_get_ibr
92663+#define ia64_get_pkr __ia64_get_pkr
92664+#define ia64_get_pmc __ia64_get_pmc
92665+#define ia64_get_pmd __ia64_get_pmd
92666+#define ia64_get_rr __ia64_get_rr
92667+#define ia64_fc __ia64_fc
92668+#define ia64_ssm __ia64_ssm
92669+#define ia64_rsm __ia64_rsm
92670+#define ia64_ptce __ia64_ptce
92671+#define ia64_ptcga __ia64_ptcga
92672+#define ia64_ptcl __ia64_ptcl
92673+#define ia64_ptri __ia64_ptri
92674+#define ia64_ptrd __ia64_ptrd
92675+#define ia64_get_psr_i __ia64_get_psr_i
92676+#define ia64_intrin_local_irq_restore __ia64_intrin_local_irq_restore
92677+#define ia64_pal_halt_light __ia64_pal_halt_light
92678+#define ia64_leave_kernel __ia64_leave_kernel
92679+#define ia64_leave_syscall __ia64_leave_syscall
92680+#define ia64_trace_syscall __ia64_trace_syscall
92681+#define ia64_ret_from_clone __ia64_ret_from_clone
92682+#define ia64_switch_to __ia64_switch_to
92683+#define ia64_pal_call_static __ia64_pal_call_static
92684+
92685+#endif /* !IA64_PARAVIRTUALIZED */
92686+
92687+#endif /* !__ASSEMBLY */
92688+
92689+#endif /* _ASM_IA64_PRIVOP_H */
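privop.h above completes the renaming started in intel_intrin.h: the raw operation keeps a double-underscore name and the public ia64_* name is a macro bound either to the raw op (native build) or to a xen_* wrapper (when IA64_PARAVIRTUALIZED is defined). A minimal sketch of that two-level binding, using invented stub functions and a SKETCH_XEN switch in place of the real config option:

#include <stdio.h>

static unsigned long __raw_get_rr(unsigned long idx)      { return 0x1000 + idx; }
static unsigned long xen_sketch_get_rr(unsigned long idx) { return 0x2000 + idx; }

/* #define SKETCH_XEN */                 /* stand-in for IA64_PARAVIRTUALIZED */
#ifdef SKETCH_XEN
# define sketch_get_rr xen_sketch_get_rr
#else
# define sketch_get_rr __raw_get_rr
#endif

int main(void)
{
        printf("bound rr[3]       = %#lx\n", sketch_get_rr(3));
        printf("xen variant rr[3] = %#lx\n", xen_sketch_get_rr(3));
        return 0;
}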
92690diff -Nur linux-2.6.16.33-noxen/include/asm-ia64/processor.h linux-2.6.16.33/include/asm-ia64/processor.h
92691--- linux-2.6.16.33-noxen/include/asm-ia64/processor.h 2006-11-22 18:06:31.000000000 +0000
92692+++ linux-2.6.16.33/include/asm-ia64/processor.h 2007-01-08 15:00:46.000000000 +0000
92693@@ -19,6 +19,7 @@
92694 #include <asm/kregs.h>
92695 #include <asm/ptrace.h>
92696 #include <asm/ustack.h>
92697+#include <asm/privop.h>
92698
92699 #define IA64_NUM_DBG_REGS 8
92700 /*
92701diff -Nur linux-2.6.16.33-noxen/include/asm-ia64/sal.h linux-2.6.16.33/include/asm-ia64/sal.h
92702--- linux-2.6.16.33-noxen/include/asm-ia64/sal.h 2006-11-22 18:06:31.000000000 +0000
92703+++ linux-2.6.16.33/include/asm-ia64/sal.h 2007-01-08 15:00:46.000000000 +0000
92704@@ -42,6 +42,9 @@
92705 #include <asm/pal.h>
92706 #include <asm/system.h>
92707 #include <asm/fpu.h>
92708+#ifdef CONFIG_XEN
92709+#include <asm/xen/xencomm.h>
92710+#endif
92711
92712 extern spinlock_t sal_lock;
92713
92714@@ -686,10 +689,28 @@
92715 /* Get the processor and platform information logged by SAL with respect to the machine
92716 * state at the time of the MCAs, INITs, CMCs, or CPEs.
92717 */
92718+#ifdef CONFIG_XEN
92719+static inline u64 ia64_sal_get_state_info_size (u64 sal_info_type);
92720+#endif
92721+
92722 static inline u64
92723 ia64_sal_get_state_info (u64 sal_info_type, u64 *sal_info)
92724 {
92725 struct ia64_sal_retval isrv;
92726+#ifdef CONFIG_XEN
92727+ if (is_running_on_xen()) {
92728+ struct xencomm_handle *desc;
92729+
92730+ if (xencomm_create(sal_info,
92731+ ia64_sal_get_state_info_size(sal_info_type),
92732+ &desc, GFP_KERNEL))
92733+ return 0;
92734+
92735+ SAL_CALL_REENTRANT(isrv, SAL_GET_STATE_INFO, sal_info_type, 0,
92736+ desc, 0, 0, 0, 0);
92737+ xencomm_free(desc);
92738+ } else
92739+#endif
92740 SAL_CALL_REENTRANT(isrv, SAL_GET_STATE_INFO, sal_info_type, 0,
92741 sal_info, 0, 0, 0, 0);
92742 if (isrv.status)
92743diff -Nur linux-2.6.16.33-noxen/include/asm-ia64/synch_bitops.h linux-2.6.16.33/include/asm-ia64/synch_bitops.h
92744--- linux-2.6.16.33-noxen/include/asm-ia64/synch_bitops.h 1970-01-01 00:00:00.000000000 +0000
92745+++ linux-2.6.16.33/include/asm-ia64/synch_bitops.h 2007-01-08 15:00:46.000000000 +0000
92746@@ -0,0 +1,63 @@
92747+#ifndef __XEN_SYNCH_BITOPS_H__
92748+#define __XEN_SYNCH_BITOPS_H__
92749+
92750+/*
92751+ * Copyright 1992, Linus Torvalds.
92752+ * Heavily modified to provide guaranteed strong synchronisation
92753+ * when communicating with Xen or other guest OSes running on other CPUs.
92754+ */
92755+
92756+#include <linux/config.h>
92757+
92758+#define ADDR (*(volatile long *) addr)
92759+
92760+static __inline__ void synch_set_bit(int nr, volatile void * addr)
92761+{
92762+ set_bit(nr, addr);
92763+}
92764+
92765+static __inline__ void synch_clear_bit(int nr, volatile void * addr)
92766+{
92767+ clear_bit(nr, addr);
92768+}
92769+
92770+static __inline__ void synch_change_bit(int nr, volatile void * addr)
92771+{
92772+ change_bit(nr, addr);
92773+}
92774+
92775+static __inline__ int synch_test_and_set_bit(int nr, volatile void * addr)
92776+{
92777+ return test_and_set_bit(nr, addr);
92778+}
92779+
92780+static __inline__ int synch_test_and_clear_bit(int nr, volatile void * addr)
92781+{
92782+ return test_and_clear_bit(nr, addr);
92783+}
92784+
92785+static __inline__ int synch_test_and_change_bit(int nr, volatile void * addr)
92786+{
92787+ return test_and_change_bit(nr, addr);
92788+}
92789+
92790+static __inline__ int synch_const_test_bit(int nr, const volatile void * addr)
92791+{
92792+ return test_bit(nr, addr);
92793+}
92794+
92795+static __inline__ int synch_var_test_bit(int nr, volatile void * addr)
92796+{
92797+ return test_bit(nr, addr);
92798+}
92799+
92800+#define synch_cmpxchg ia64_cmpxchg4_acq
92801+
92802+#define synch_test_bit(nr,addr) \
92803+(__builtin_constant_p(nr) ? \
92804+ synch_const_test_bit((nr),(addr)) : \
92805+ synch_var_test_bit((nr),(addr)))
92806+
92807+#define synch_cmpxchg_subword synch_cmpxchg
92808+
92809+#endif /* __XEN_SYNCH_BITOPS_H__ */
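synch_test_bit() above dispatches on __builtin_constant_p() so a compile-time-constant bit number can be routed to a dedicated variant; on ia64 both variants simply forward to the generic bitops, which already give the required ordering. A standalone sketch of the dispatch (it assumes a GCC/Clang compiler for the builtin; both paths are deliberately identical here, as in the patch):

#include <stdio.h>

static int const_test_bit(int nr, const unsigned long *w) { return (*w >> nr) & 1; }
static int var_test_bit(int nr, const unsigned long *w)   { return (*w >> nr) & 1; }

#define sketch_test_bit(nr, addr)              \
        (__builtin_constant_p(nr) ?            \
         const_test_bit((nr), (addr)) :        \
         var_test_bit((nr), (addr)))

int main(void)
{
        unsigned long word = 0x5;
        int n = 2;
        printf("bit 0: %d\n", sketch_test_bit(0, &word));  /* constant-nr path */
        printf("bit n: %d\n", sketch_test_bit(n, &word));  /* variable-nr path */
        return 0;
}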
92810diff -Nur linux-2.6.16.33-noxen/include/asm-ia64/system.h linux-2.6.16.33/include/asm-ia64/system.h
92811--- linux-2.6.16.33-noxen/include/asm-ia64/system.h 2006-11-22 18:06:31.000000000 +0000
92812+++ linux-2.6.16.33/include/asm-ia64/system.h 2007-01-08 15:00:46.000000000 +0000
92813@@ -125,7 +125,7 @@
92814 #define __local_irq_save(x) \
92815 do { \
92816 ia64_stop(); \
92817- (x) = ia64_getreg(_IA64_REG_PSR); \
92818+ (x) = ia64_get_psr_i(); \
92819 ia64_stop(); \
92820 ia64_rsm(IA64_PSR_I); \
92821 } while (0)
92822@@ -173,7 +173,7 @@
92823 #endif /* !CONFIG_IA64_DEBUG_IRQ */
92824
92825 #define local_irq_enable() ({ ia64_stop(); ia64_ssm(IA64_PSR_I); ia64_srlz_d(); })
92826-#define local_save_flags(flags) ({ ia64_stop(); (flags) = ia64_getreg(_IA64_REG_PSR); })
92827+#define local_save_flags(flags) ({ ia64_stop(); (flags) = ia64_get_psr_i(); })
92828
92829 #define irqs_disabled() \
92830 ({ \
92831diff -Nur linux-2.6.16.33-noxen/include/asm-ia64/uaccess.h linux-2.6.16.33/include/asm-ia64/uaccess.h
92832--- linux-2.6.16.33-noxen/include/asm-ia64/uaccess.h 2006-11-22 18:06:31.000000000 +0000
92833+++ linux-2.6.16.33/include/asm-ia64/uaccess.h 2007-01-08 15:00:46.000000000 +0000
92834@@ -365,6 +365,7 @@
92835 }
92836
92837 #define ARCH_HAS_TRANSLATE_MEM_PTR 1
92838+#ifndef CONFIG_XEN
92839 static __inline__ char *
92840 xlate_dev_mem_ptr (unsigned long p)
92841 {
92842@@ -379,6 +380,25 @@
92843
92844 return ptr;
92845 }
92846+#else
92847+static __inline__ char *
92848+xlate_dev_mem_ptr (unsigned long p, ssize_t sz)
92849+{
92850+ unsigned long pfn = p >> PAGE_SHIFT;
92851+
92852+ if (pfn_valid(pfn) && !PageUncached(pfn_to_page(pfn)))
92853+ return __va(p);
92854+
92855+ return ioremap(p, sz);
92856+}
92857+
92858+static __inline__ void
92859+xlate_dev_mem_ptr_unmap (char* v)
92860+{
92861+ if (REGION_NUMBER(v) == RGN_UNCACHED)
92862+ iounmap(v);
92863+}
92864+#endif
92865
92866 /*
92867 * Convert a virtual cached kernel memory pointer to an uncached pointer
92868diff -Nur linux-2.6.16.33-noxen/include/asm-ia64/xen/privop.h linux-2.6.16.33/include/asm-ia64/xen/privop.h
92869--- linux-2.6.16.33-noxen/include/asm-ia64/xen/privop.h 1970-01-01 00:00:00.000000000 +0000
92870+++ linux-2.6.16.33/include/asm-ia64/xen/privop.h 2007-01-08 15:00:46.000000000 +0000
92871@@ -0,0 +1,303 @@
92872+#ifndef _ASM_IA64_XEN_PRIVOP_H
92873+#define _ASM_IA64_XEN_PRIVOP_H
92874+
92875+/*
92876+ * Copyright (C) 2005 Hewlett-Packard Co
92877+ * Dan Magenheimer <dan.magenheimer@hp.com>
92878+ *
92879+ * Paravirtualizations of privileged operations for Xen/ia64
92880+ *
92881+ */
92882+
92883+
92884+#include <xen/interface/arch-ia64.h>
92885+
92886+#define IA64_PARAVIRTUALIZED
92887+
92888+/* At 1 MB, before per-cpu space but still addressable using addl instead
92889+ of movl. */
92890+#define XSI_BASE 0xfffffffffff00000
92891+
92892+/* Address of mapped regs. */
92893+#define XMAPPEDREGS_BASE (XSI_BASE + XSI_SIZE)
92894+
92895+#ifdef __ASSEMBLY__
92896+#define XEN_HYPER_RFI break HYPERPRIVOP_RFI
92897+#define XEN_HYPER_RSM_PSR_DT break HYPERPRIVOP_RSM_DT
92898+#define XEN_HYPER_SSM_PSR_DT break HYPERPRIVOP_SSM_DT
92899+#define XEN_HYPER_COVER break HYPERPRIVOP_COVER
92900+#define XEN_HYPER_ITC_D break HYPERPRIVOP_ITC_D
92901+#define XEN_HYPER_ITC_I break HYPERPRIVOP_ITC_I
92902+#define XEN_HYPER_SSM_I break HYPERPRIVOP_SSM_I
92903+#define XEN_HYPER_GET_IVR break HYPERPRIVOP_GET_IVR
92904+#define XEN_HYPER_GET_TPR break HYPERPRIVOP_GET_TPR
92905+#define XEN_HYPER_SET_TPR break HYPERPRIVOP_SET_TPR
92906+#define XEN_HYPER_EOI break HYPERPRIVOP_EOI
92907+#define XEN_HYPER_SET_ITM break HYPERPRIVOP_SET_ITM
92908+#define XEN_HYPER_THASH break HYPERPRIVOP_THASH
92909+#define XEN_HYPER_PTC_GA break HYPERPRIVOP_PTC_GA
92910+#define XEN_HYPER_ITR_D break HYPERPRIVOP_ITR_D
92911+#define XEN_HYPER_GET_RR break HYPERPRIVOP_GET_RR
92912+#define XEN_HYPER_SET_RR break HYPERPRIVOP_SET_RR
92913+#define XEN_HYPER_SET_KR break HYPERPRIVOP_SET_KR
92914+#define XEN_HYPER_FC break HYPERPRIVOP_FC
92915+#define XEN_HYPER_GET_CPUID break HYPERPRIVOP_GET_CPUID
92916+#define XEN_HYPER_GET_PMD break HYPERPRIVOP_GET_PMD
92917+#define XEN_HYPER_GET_EFLAG break HYPERPRIVOP_GET_EFLAG
92918+#define XEN_HYPER_SET_EFLAG break HYPERPRIVOP_SET_EFLAG
92919+#define XEN_HYPER_RSM_BE break HYPERPRIVOP_RSM_BE
92920+#define XEN_HYPER_GET_PSR break HYPERPRIVOP_GET_PSR
92921+
92922+#define XSI_IFS (XSI_BASE + XSI_IFS_OFS)
92923+#define XSI_PRECOVER_IFS (XSI_BASE + XSI_PRECOVER_IFS_OFS)
92924+#define XSI_INCOMPL_REGFR (XSI_BASE + XSI_INCOMPL_REGFR_OFS)
92925+#define XSI_IFA (XSI_BASE + XSI_IFA_OFS)
92926+#define XSI_ISR (XSI_BASE + XSI_ISR_OFS)
92927+#define XSI_IIM (XSI_BASE + XSI_IIM_OFS)
92928+#define XSI_ITIR (XSI_BASE + XSI_ITIR_OFS)
92929+#define XSI_PSR_I_ADDR (XSI_BASE + XSI_PSR_I_ADDR_OFS)
92930+#define XSI_PSR_IC (XSI_BASE + XSI_PSR_IC_OFS)
92931+#define XSI_IPSR (XSI_BASE + XSI_IPSR_OFS)
92932+#define XSI_IIP (XSI_BASE + XSI_IIP_OFS)
92933+#define XSI_BANK1_R16 (XSI_BASE + XSI_BANK1_R16_OFS)
92934+#define XSI_BANKNUM (XSI_BASE + XSI_BANKNUM_OFS)
92935+#define XSI_IHA (XSI_BASE + XSI_IHA_OFS)
92936+#endif
92937+
92938+#ifndef __ASSEMBLY__
92939+#define XEN_HYPER_SSM_I asm("break %0" : : "i" (HYPERPRIVOP_SSM_I))
92940+#define XEN_HYPER_GET_IVR asm("break %0" : : "i" (HYPERPRIVOP_GET_IVR))
92941+
92942+/************************************************/
92943+/* Instructions paravirtualized for correctness */
92944+/************************************************/
92945+
92946+/* "fc" and "thash" are privilege-sensitive instructions, meaning they
92947+ * may have different semantics depending on whether they are executed
92948+ * at PL0 vs PL!=0. When paravirtualized, these instructions mustn't
92949+ * be allowed to execute directly, lest incorrect semantics result. */
92950+extern unsigned long xen_fc(unsigned long addr);
92951+#define ia64_fc(addr) xen_fc((unsigned long)(addr))
92952+extern unsigned long xen_thash(unsigned long addr);
92953+#define ia64_thash(addr) xen_thash((unsigned long)(addr))
92954+/* Note that "ttag" and "cover" are also privilege-sensitive; "ttag"
92955+ * is not currently used (though it may be in a long-format VHPT system!)
92956+ * and the semantics of cover only change if psr.ic is off which is very
92957+ * rare (and currently non-existent outside of assembly code). */
92958+
92959+/* There are also privilege-sensitive registers. These registers are
92960+ * readable at any privilege level but only writable at PL0. */
92961+extern unsigned long xen_get_cpuid(int index);
92962+#define ia64_get_cpuid(i) xen_get_cpuid(i)
92963+extern unsigned long xen_get_pmd(int index);
92964+#define ia64_get_pmd(i) xen_get_pmd(i)
92965+extern unsigned long xen_get_eflag(void); /* see xen_ia64_getreg */
92966+extern void xen_set_eflag(unsigned long); /* see xen_ia64_setreg */
92967+
92968+/************************************************/
92969+/* Instructions paravirtualized for performance */
92970+/************************************************/
92971+
92972+/* Xen uses memory-mapped virtual privileged registers for access to many
92973+ * performance-sensitive privileged registers. Some, like the processor
92974+ * status register (psr), are broken up into multiple memory locations.
92975+ * Others, like "pend", are abstractions based on privileged registers.
92976+ * "Pend" is guaranteed to be set if reading cr.ivr would return a
92977+ * (non-spurious) interrupt. */
92978+#define XEN_MAPPEDREGS ((struct mapped_regs *)XMAPPEDREGS_BASE)
92979+#define XSI_PSR_I \
92980+ (*XEN_MAPPEDREGS->interrupt_mask_addr)
92981+#define xen_get_virtual_psr_i() \
92982+ (!XSI_PSR_I)
92983+#define xen_set_virtual_psr_i(_val) \
92984+ ({ XSI_PSR_I = (uint8_t)(_val) ? 0 : 1; })
92985+#define xen_set_virtual_psr_ic(_val) \
92986+ ({ XEN_MAPPEDREGS->interrupt_collection_enabled = _val ? 1 : 0; })
92987+#define xen_get_virtual_pend() \
92988+ (*(((uint8_t *)XEN_MAPPEDREGS->interrupt_mask_addr) - 1))
92989+
92990+/* Hyperprivops are "break" instructions with a well-defined API.
92991+ * In particular, the virtual psr.ic bit must be off; in this way
92992+ * it is guaranteed to never conflict with a linux break instruction.
92993+ * Normally, this is done in a xen stub but this one is frequent enough
92994+ * that we inline it */
92995+#define xen_hyper_ssm_i() \
92996+({ \
92997+ xen_set_virtual_psr_i(0); \
92998+ xen_set_virtual_psr_ic(0); \
92999+ XEN_HYPER_SSM_I; \
93000+})
93001+
93002+/* turning off interrupts can be paravirtualized simply by writing
93003+ * to a memory-mapped virtual psr.i bit (implemented as a 16-bit bool) */
93004+#define xen_rsm_i() xen_set_virtual_psr_i(0)
93005+
93006+/* turning on interrupts is a bit more complicated: write to the
93007+ * memory-mapped virtual psr.i bit first (to avoid a race condition),
93008+ * then if any interrupts were pending, we have to execute a hyperprivop
93009+ * to ensure the pending interrupt gets delivered; else we're done! */
93010+#define xen_ssm_i() \
93011+({ \
93012+ int old = xen_get_virtual_psr_i(); \
93013+ xen_set_virtual_psr_i(1); \
93014+ if (!old && xen_get_virtual_pend()) xen_hyper_ssm_i(); \
93015+})
93016+
93017+#define xen_ia64_intrin_local_irq_restore(x) \
93018+{ \
93019+ if (is_running_on_xen()) { \
93020+ if ((x) & IA64_PSR_I) { xen_ssm_i(); } \
93021+ else { xen_rsm_i(); } \
93022+ } \
93023+ else __ia64_intrin_local_irq_restore((x)); \
93024+}
93025+
93026+#define xen_get_psr_i() \
93027+( \
93028+ (is_running_on_xen()) ? \
93029+ (xen_get_virtual_psr_i() ? IA64_PSR_I : 0) \
93030+ : __ia64_get_psr_i() \
93031+)
93032+
93033+#define xen_ia64_ssm(mask) \
93034+{ \
93035+ if ((mask)==IA64_PSR_I) { \
93036+ if (is_running_on_xen()) { xen_ssm_i(); } \
93037+ else { __ia64_ssm(mask); } \
93038+ } \
93039+ else { __ia64_ssm(mask); } \
93040+}
93041+
93042+#define xen_ia64_rsm(mask) \
93043+{ \
93044+ if ((mask)==IA64_PSR_I) { \
93045+ if (is_running_on_xen()) { xen_rsm_i(); } \
93046+ else { __ia64_rsm(mask); } \
93047+ } \
93048+ else { __ia64_rsm(mask); } \
93049+}
93050+
93051+
93052+/* Although all privileged operations can be left to trap and will
93053+ * be properly handled by Xen, some are frequent enough that we use
93054+ * hyperprivops for performance. */
93055+
93056+extern unsigned long xen_get_ivr(void);
93057+extern unsigned long xen_get_tpr(void);
93058+extern void xen_set_itm(unsigned long);
93059+extern void xen_set_tpr(unsigned long);
93060+extern void xen_eoi(void);
93061+extern void xen_set_rr(unsigned long index, unsigned long val);
93062+extern unsigned long xen_get_rr(unsigned long index);
93063+extern void xen_set_kr(unsigned long index, unsigned long val);
93064+extern void xen_ptcga(unsigned long addr, unsigned long size);
93065+
93066+/* Note: It may look wrong to test for is_running_on_xen() in each case.
93067+ * However regnum is always a constant so, as written, the compiler
93068+ * eliminates the switch statement, whereas is_running_on_xen() must be
93069+ * tested dynamically. */
93070+#define xen_ia64_getreg(regnum) \
93071+({ \
93072+ __u64 ia64_intri_res; \
93073+ \
93074+ switch(regnum) { \
93075+ case _IA64_REG_CR_IVR: \
93076+ ia64_intri_res = (is_running_on_xen()) ? \
93077+ xen_get_ivr() : \
93078+ __ia64_getreg(regnum); \
93079+ break; \
93080+ case _IA64_REG_CR_TPR: \
93081+ ia64_intri_res = (is_running_on_xen()) ? \
93082+ xen_get_tpr() : \
93083+ __ia64_getreg(regnum); \
93084+ break; \
93085+ case _IA64_REG_AR_EFLAG: \
93086+ ia64_intri_res = (is_running_on_xen()) ? \
93087+ xen_get_eflag() : \
93088+ __ia64_getreg(regnum); \
93089+ break; \
93090+ default: \
93091+ ia64_intri_res = __ia64_getreg(regnum); \
93092+ break; \
93093+ } \
93094+ ia64_intri_res; \
93095+})
93096+
93097+#define xen_ia64_setreg(regnum,val) \
93098+({ \
93099+ switch(regnum) { \
93100+ case _IA64_REG_AR_KR0 ... _IA64_REG_AR_KR7: \
93101+ (is_running_on_xen()) ? \
93102+ xen_set_kr((regnum-_IA64_REG_AR_KR0), val) : \
93103+ __ia64_setreg(regnum,val); \
93104+ break; \
93105+ case _IA64_REG_CR_ITM: \
93106+ (is_running_on_xen()) ? \
93107+ xen_set_itm(val) : \
93108+ __ia64_setreg(regnum,val); \
93109+ break; \
93110+ case _IA64_REG_CR_TPR: \
93111+ (is_running_on_xen()) ? \
93112+ xen_set_tpr(val) : \
93113+ __ia64_setreg(regnum,val); \
93114+ break; \
93115+ case _IA64_REG_CR_EOI: \
93116+ (is_running_on_xen()) ? \
93117+ xen_eoi() : \
93118+ __ia64_setreg(regnum,val); \
93119+ break; \
93120+ case _IA64_REG_AR_EFLAG: \
93121+ (is_running_on_xen()) ? \
93122+ xen_set_eflag(val) : \
93123+ __ia64_setreg(regnum,val); \
93124+ break; \
93125+ default: \
93126+ __ia64_setreg(regnum,val); \
93127+ break; \
93128+ } \
93129+})
93130+
93131+#define ia64_ssm xen_ia64_ssm
93132+#define ia64_rsm xen_ia64_rsm
93133+#define ia64_intrin_local_irq_restore xen_ia64_intrin_local_irq_restore
93134+#define ia64_ptcga xen_ptcga
93135+#define ia64_set_rr(index,val) xen_set_rr(index,val)
93136+#define ia64_get_rr(index) xen_get_rr(index)
93137+#define ia64_getreg xen_ia64_getreg
93138+#define ia64_setreg xen_ia64_setreg
93139+#define ia64_get_psr_i xen_get_psr_i
93140+
93141+/* the remainder of these are not performance-sensitive, so it's
93142+ * OK to not paravirtualize and just take a privop trap and emulate */
93143+#define ia64_hint __ia64_hint
93144+#define ia64_set_pmd __ia64_set_pmd
93145+#define ia64_itci __ia64_itci
93146+#define ia64_itcd __ia64_itcd
93147+#define ia64_itri __ia64_itri
93148+#define ia64_itrd __ia64_itrd
93149+#define ia64_tpa __ia64_tpa
93150+#define ia64_set_ibr __ia64_set_ibr
93151+#define ia64_set_pkr __ia64_set_pkr
93152+#define ia64_set_pmc __ia64_set_pmc
93153+#define ia64_get_ibr __ia64_get_ibr
93154+#define ia64_get_pkr __ia64_get_pkr
93155+#define ia64_get_pmc __ia64_get_pmc
93156+#define ia64_ptce __ia64_ptce
93157+#define ia64_ptcl __ia64_ptcl
93158+#define ia64_ptri __ia64_ptri
93159+#define ia64_ptrd __ia64_ptrd
93160+
93161+#endif /* !__ASSEMBLY__ */
93162+
93163+/* these routines utilize privilege-sensitive or performance-sensitive
93164+ * privileged instructions so the code must be replaced with
93165+ * paravirtualized versions */
93166+#define ia64_pal_halt_light xen_pal_halt_light
93167+#define ia64_leave_kernel xen_leave_kernel
93168+#define ia64_leave_syscall xen_leave_syscall
93169+#define ia64_trace_syscall xen_trace_syscall
93170+#define ia64_ret_from_clone xen_ret_from_clone
93171+#define ia64_switch_to xen_switch_to
93172+#define ia64_pal_call_static xen_pal_call_static
93173+
93174+#endif /* _ASM_IA64_XEN_PRIVOP_H */
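xen_ssm_i() above re-enables interrupts by flipping the memory-mapped virtual psr.i bit first and only issues the SSM_I hyperprivop if an interrupt was already pending while masked, so the common case stays trap-free. The standalone sketch below models that decision with plain ints and a printf in place of the hyperprivop:

#include <stdio.h>

static int vpsr_i;          /* 1 = interrupts enabled (virtual psr.i) */
static int vpend;           /* 1 = an interrupt is pending            */

static void hyper_ssm_i_stub(void) { printf("  -> hyperprivop SSM_I issued\n"); }

static void sketch_ssm_i(void)
{
        int old = vpsr_i;
        vpsr_i = 1;                            /* enable first, avoids the race  */
        if (!old && vpend)
                hyper_ssm_i_stub();            /* deliver what was held back     */
}

int main(void)
{
        vpsr_i = 0; vpend = 0;
        printf("enable, nothing pending:\n");   sketch_ssm_i();
        vpsr_i = 0; vpend = 1;
        printf("enable, interrupt pending:\n"); sketch_ssm_i();
        return 0;
}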
93175diff -Nur linux-2.6.16.33-noxen/include/asm-ia64/xen/xcom_hcall.h linux-2.6.16.33/include/asm-ia64/xen/xcom_hcall.h
93176--- linux-2.6.16.33-noxen/include/asm-ia64/xen/xcom_hcall.h 1970-01-01 00:00:00.000000000 +0000
93177+++ linux-2.6.16.33/include/asm-ia64/xen/xcom_hcall.h 2007-01-08 15:00:46.000000000 +0000
93178@@ -0,0 +1,86 @@
93179+/*
93180+ * Copyright (C) 2006 Tristan Gingold <tristan.gingold@bull.net>, Bull SAS
93181+ *
93182+ * This program is free software; you can redistribute it and/or modify
93183+ * it under the terms of the GNU General Public License as published by
93184+ * the Free Software Foundation; either version 2 of the License, or
93185+ * (at your option) any later version.
93186+ *
93187+ * This program is distributed in the hope that it will be useful,
93188+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
93189+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
93190+ * GNU General Public License for more details.
93191+ *
93192+ * You should have received a copy of the GNU General Public License
93193+ * along with this program; if not, write to the Free Software
93194+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
93195+ */
93196+
93197+#ifndef _LINUX_XENCOMM_HCALL_H_
93198+#define _LINUX_XENCOMM_HCALL_H_
93199+
93200+/* These functions create an inline descriptor for the parameters and
93201+   call the corresponding xencomm_arch_hypercall_X.
93202+   Architectures should define HYPERVISOR_xxx as xencomm_hypercall_xxx unless
93203+ they want to use their own wrapper. */
93204+extern int xencomm_hypercall_console_io(int cmd, int count, char *str);
93205+
93206+extern int xencomm_hypercall_event_channel_op(int cmd, void *op);
93207+
93208+extern int xencomm_hypercall_xen_version(int cmd, void *arg);
93209+
93210+extern int xencomm_hypercall_physdev_op(int cmd, void *op);
93211+
93212+extern int xencomm_hypercall_grant_table_op(unsigned int cmd, void *op,
93213+ unsigned int count);
93214+
93215+extern int xencomm_hypercall_sched_op(int cmd, void *arg);
93216+
93217+extern int xencomm_hypercall_multicall(void *call_list, int nr_calls);
93218+
93219+extern int xencomm_hypercall_callback_op(int cmd, void *arg);
93220+
93221+extern int xencomm_hypercall_memory_op(unsigned int cmd, void *arg);
93222+
93223+extern unsigned long xencomm_hypercall_hvm_op(int cmd, void *arg);
93224+
93225+extern int xencomm_hypercall_suspend(unsigned long srec);
93226+
93227+extern int xencomm_hypercall_xenoprof_op(int op, void *arg);
93228+
93229+extern int xencomm_hypercall_perfmon_op(unsigned long cmd, void* arg,
93230+ unsigned long count);
93231+
93232+/* Using mini xencomm. */
93233+extern int xencomm_mini_hypercall_console_io(int cmd, int count, char *str);
93234+
93235+extern int xencomm_mini_hypercall_event_channel_op(int cmd, void *op);
93236+
93237+extern int xencomm_mini_hypercall_xen_version(int cmd, void *arg);
93238+
93239+extern int xencomm_mini_hypercall_physdev_op(int cmd, void *op);
93240+
93241+extern int xencomm_mini_hypercall_grant_table_op(unsigned int cmd, void *op,
93242+ unsigned int count);
93243+
93244+extern int xencomm_mini_hypercall_sched_op(int cmd, void *arg);
93245+
93246+extern int xencomm_mini_hypercall_multicall(void *call_list, int nr_calls);
93247+
93248+extern int xencomm_mini_hypercall_callback_op(int cmd, void *arg);
93249+
93250+extern int xencomm_mini_hypercall_memory_op(unsigned int cmd, void *arg);
93251+
93252+extern unsigned long xencomm_mini_hypercall_hvm_op(int cmd, void *arg);
93253+
93254+extern int xencomm_mini_hypercall_xenoprof_op(int op, void *arg);
93255+
93256+extern int xencomm_mini_hypercall_perfmon_op(unsigned long cmd, void* arg,
93257+ unsigned long count);
93258+
93259+/* For privcmd. Locally declare argument type to avoid include storm.
93260+ Type coherency will be checked within privcmd.c */
93261+struct privcmd_hypercall;
93262+extern int privcmd_hypercall(struct privcmd_hypercall *hypercall);
93263+
93264+#endif /* _LINUX_XENCOMM_HCALL_H_ */
93265diff -Nur linux-2.6.16.33-noxen/include/asm-ia64/xen/xencomm.h linux-2.6.16.33/include/asm-ia64/xen/xencomm.h
93266--- linux-2.6.16.33-noxen/include/asm-ia64/xen/xencomm.h 1970-01-01 00:00:00.000000000 +0000
93267+++ linux-2.6.16.33/include/asm-ia64/xen/xencomm.h 2007-01-08 15:00:46.000000000 +0000
93268@@ -0,0 +1,60 @@
93269+/*
93270+ * Copyright (C) 2006 Hollis Blanchard <hollisb@us.ibm.com>, IBM Corporation
93271+ *
93272+ * This program is free software; you can redistribute it and/or modify
93273+ * it under the terms of the GNU General Public License as published by
93274+ * the Free Software Foundation; either version 2 of the License, or
93275+ * (at your option) any later version.
93276+ *
93277+ * This program is distributed in the hope that it will be useful,
93278+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
93279+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
93280+ * GNU General Public License for more details.
93281+ *
93282+ * You should have received a copy of the GNU General Public License
93283+ * along with this program; if not, write to the Free Software
93284+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
93285+ */
93286+
93287+#ifndef _LINUX_XENCOMM_H_
93288+#define _LINUX_XENCOMM_H_
93289+
93290+#include <xen/interface/xencomm.h>
93291+
93292+#define XENCOMM_MINI_ADDRS 3
93293+struct xencomm_mini {
93294+ struct xencomm_desc _desc;
93295+ uint64_t address[XENCOMM_MINI_ADDRS];
93296+};
93297+
93298+/* Must be called before any hypercall. */
93299+extern void xencomm_init (void);
93300+
93301+/* To avoid an additional virt to phys conversion, an opaque structure is
93302+ presented. */
93303+struct xencomm_handle;
93304+
93305+extern int xencomm_create(void *buffer, unsigned long bytes,
93306+ struct xencomm_handle **desc, gfp_t type);
93307+extern void xencomm_free(struct xencomm_handle *desc);
93308+
93309+extern int xencomm_create_mini(struct xencomm_mini *area, int *nbr_area,
93310+ void *buffer, unsigned long bytes,
93311+ struct xencomm_handle **ret);
93312+
93313+/* Translate virtual address to physical address. */
93314+extern unsigned long xencomm_vaddr_to_paddr(unsigned long vaddr);
93315+
93316+/* Inline version. To be used only on linear space (kernel space). */
93317+static inline struct xencomm_handle *
93318+xencomm_create_inline(void *buffer)
93319+{
93320+ unsigned long paddr;
93321+
93322+ paddr = xencomm_vaddr_to_paddr((unsigned long)buffer);
93323+ return (struct xencomm_handle *)(paddr | XENCOMM_INLINE_FLAG);
93324+}
93325+
93326+#define xen_guest_handle(hnd) ((hnd).p)
93327+
93328+#endif /* _LINUX_XENCOMM_H_ */
93329diff -Nur linux-2.6.16.33-noxen/include/asm-ia64/xenoprof.h linux-2.6.16.33/include/asm-ia64/xenoprof.h
93330--- linux-2.6.16.33-noxen/include/asm-ia64/xenoprof.h 1970-01-01 00:00:00.000000000 +0000
93331+++ linux-2.6.16.33/include/asm-ia64/xenoprof.h 2007-01-08 15:00:46.000000000 +0000
93332@@ -0,0 +1,48 @@
93333+/******************************************************************************
93334+ * asm-ia64/xenoprof.h
93335+ *
93336+ * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp>
93337+ * VA Linux Systems Japan K.K.
93338+ *
93339+ * This program is free software; you can redistribute it and/or modify
93340+ * it under the terms of the GNU General Public License as published by
93341+ * the Free Software Foundation; either version 2 of the License, or
93342+ * (at your option) any later version.
93343+ *
93344+ * This program is distributed in the hope that it will be useful,
93345+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
93346+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
93347+ * GNU General Public License for more details.
93348+ *
93349+ * You should have received a copy of the GNU General Public License
93350+ * along with this program; if not, write to the Free Software
93351+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
93352+ *
93353+ */
93354+#ifndef __ASM_XENOPROF_H__
93355+#define __ASM_XENOPROF_H__
93356+#ifdef CONFIG_XEN
93357+
93358+#undef HAVE_XENOPROF_CREATE_FILES
93359+
93360+struct xenoprof_init;
93361+void xenoprof_arch_init_counter(struct xenoprof_init *init);
93362+void xenoprof_arch_counter(void);
93363+void xenoprof_arch_start(void);
93364+void xenoprof_arch_stop(void);
93365+
93366+struct xenoprof_arch_shared_buffer {
93367+ struct resource* res;
93368+};
93369+
93370+struct xenoprof_shared_buffer;
93371+void xenoprof_arch_unmap_shared_buffer(struct xenoprof_shared_buffer* sbuf);
93372+struct xenoprof_get_buffer;
93373+int xenoprof_arch_map_shared_buffer(struct xenoprof_get_buffer* get_buffer,
93374+ struct xenoprof_shared_buffer* sbuf);
93375+struct xenoprof_passive;
93376+int xenoprof_arch_set_passive(struct xenoprof_passive* pdomain,
93377+ struct xenoprof_shared_buffer* sbuf);
93378+
93379+#endif /* CONFIG_XEN */
93380+#endif /* __ASM_XENOPROF_H__ */
93381diff -Nur linux-2.6.16.33-noxen/include/asm-um/page.h linux-2.6.16.33/include/asm-um/page.h
93382--- linux-2.6.16.33-noxen/include/asm-um/page.h 2006-11-22 18:06:31.000000000 +0000
93383+++ linux-2.6.16.33/include/asm-um/page.h 2007-01-08 15:00:46.000000000 +0000
93384@@ -118,7 +118,7 @@
93385 extern struct page *arch_validate(struct page *page, gfp_t mask, int order);
93386 #define HAVE_ARCH_VALIDATE
93387
93388-extern void arch_free_page(struct page *page, int order);
93389+extern int arch_free_page(struct page *page, int order);
93390 #define HAVE_ARCH_FREE_PAGE
93391
93392 #include <asm-generic/page.h>
93393diff -Nur linux-2.6.16.33-noxen/include/asm-x86_64/apic.h linux-2.6.16.33/include/asm-x86_64/apic.h
93394--- linux-2.6.16.33-noxen/include/asm-x86_64/apic.h 2006-11-22 18:06:31.000000000 +0000
93395+++ linux-2.6.16.33/include/asm-x86_64/apic.h 2007-01-08 15:00:46.000000000 +0000
93396@@ -105,11 +105,13 @@
93397
93398 extern void setup_threshold_lvt(unsigned long lvt_off);
93399
93400+#ifndef CONFIG_XEN
93401 void smp_send_timer_broadcast_ipi(void);
93402 void switch_APIC_timer_to_ipi(void *cpumask);
93403 void switch_ipi_to_APIC_timer(void *cpumask);
93404
93405 #define ARCH_APICTIMER_STOPS_ON_C3 1
93406+#endif
93407
93408 #endif /* CONFIG_X86_LOCAL_APIC */
93409
93410diff -Nur linux-2.6.16.33-noxen/include/asm-x86_64/hw_irq.h linux-2.6.16.33/include/asm-x86_64/hw_irq.h
93411--- linux-2.6.16.33-noxen/include/asm-x86_64/hw_irq.h 2006-11-22 18:06:31.000000000 +0000
93412+++ linux-2.6.16.33/include/asm-x86_64/hw_irq.h 2007-05-23 21:00:01.000000000 +0000
93413@@ -127,7 +127,7 @@
93414 __asm__( \
93415 "\n.p2align\n" \
93416 "IRQ" #nr "_interrupt:\n\t" \
93417- "push $" #nr "-256 ; " \
93418+ "push $~(" #nr ") ; " \
93419 "jmp common_interrupt");
93420
93421 #if defined(CONFIG_X86_IO_APIC)
93422diff -Nur linux-2.6.16.33-noxen/include/asm-x86_64/kexec.h linux-2.6.16.33/include/asm-x86_64/kexec.h
93423--- linux-2.6.16.33-noxen/include/asm-x86_64/kexec.h 2006-11-22 18:06:31.000000000 +0000
93424+++ linux-2.6.16.33/include/asm-x86_64/kexec.h 2007-01-08 15:00:46.000000000 +0000
93425@@ -1,6 +1,27 @@
93426 #ifndef _X86_64_KEXEC_H
93427 #define _X86_64_KEXEC_H
93428
93429+#define PA_CONTROL_PAGE 0
93430+#define VA_CONTROL_PAGE 1
93431+#define PA_PGD 2
93432+#define VA_PGD 3
93433+#define PA_PUD_0 4
93434+#define VA_PUD_0 5
93435+#define PA_PMD_0 6
93436+#define VA_PMD_0 7
93437+#define PA_PTE_0 8
93438+#define VA_PTE_0 9
93439+#define PA_PUD_1 10
93440+#define VA_PUD_1 11
93441+#define PA_PMD_1 12
93442+#define VA_PMD_1 13
93443+#define PA_PTE_1 14
93444+#define VA_PTE_1 15
93445+#define PA_TABLE_PAGE 16
93446+#define PAGES_NR 17
93447+
93448+#ifndef __ASSEMBLY__
93449+
93450 #include <linux/string.h>
93451
93452 #include <asm/page.h>
93453@@ -64,4 +85,25 @@
93454 newregs->rip = (unsigned long)current_text_addr();
93455 }
93456 }
93457+
93458+NORET_TYPE void
93459+relocate_kernel(unsigned long indirection_page,
93460+ unsigned long page_list,
93461+ unsigned long start_address) ATTRIB_NORET;
93462+
93463+/* Under Xen we need to work with machine addresses. These macros give the
93464+ * machine address of a certain page to the generic kexec code instead of
93465+ * the pseudo physical address which would be given by the default macros.
93466+ */
93467+
93468+#ifdef CONFIG_XEN
93469+#define KEXEC_ARCH_HAS_PAGE_MACROS
93470+#define kexec_page_to_pfn(page) pfn_to_mfn(page_to_pfn(page))
93471+#define kexec_pfn_to_page(pfn) pfn_to_page(mfn_to_pfn(pfn))
93472+#define kexec_virt_to_phys(addr) virt_to_machine(addr)
93473+#define kexec_phys_to_virt(addr) phys_to_virt(machine_to_phys(addr))
93474+#endif
93475+
93476+#endif /* __ASSEMBLY__ */
93477+
93478 #endif /* _X86_64_KEXEC_H */
93479diff -Nur linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/agp.h linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/agp.h
93480--- linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/agp.h 1970-01-01 00:00:00.000000000 +0000
93481+++ linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/agp.h 2007-01-08 15:00:46.000000000 +0000
93482@@ -0,0 +1,35 @@
93483+#ifndef AGP_H
93484+#define AGP_H 1
93485+
93486+#include <asm/cacheflush.h>
93487+#include <asm/system.h>
93488+
93489+/*
93490+ * Functions to keep the agpgart mappings coherent.
93491+ * The GART gives the CPU a physical alias of memory. The alias is
93492+ * mapped uncacheable. Make sure there are no conflicting mappings
93493+ * with different cachability attributes for the same page.
93494+ */
93495+
93496+int map_page_into_agp(struct page *page);
93497+int unmap_page_from_agp(struct page *page);
93498+#define flush_agp_mappings() global_flush_tlb()
93499+
93500+/* Could use CLFLUSH here if the cpu supports it. But then it would
93501+ need to be called for each cacheline of the whole page so it may not be
93502+ worth it. Would need a page for it. */
93503+#define flush_agp_cache() wbinvd()
93504+
93505+/* Convert a physical address to an address suitable for the GART. */
93506+#define phys_to_gart(x) phys_to_machine(x)
93507+#define gart_to_phys(x) machine_to_phys(x)
93508+
93509+/* GATT allocation. Returns/accepts GATT kernel virtual address. */
93510+#define alloc_gatt_pages(order) ({ \
93511+ char *_t; dma_addr_t _d; \
93512+ _t = dma_alloc_coherent(NULL,PAGE_SIZE<<(order),&_d,GFP_KERNEL); \
93513+ _t; })
93514+#define free_gatt_pages(table, order) \
93515+ dma_free_coherent(NULL,PAGE_SIZE<<(order),(table),virt_to_bus(table))
93516+
93517+#endif
93518diff -Nur linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/arch_hooks.h linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/arch_hooks.h
93519--- linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/arch_hooks.h 1970-01-01 00:00:00.000000000 +0000
93520+++ linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/arch_hooks.h 2007-01-08 15:00:46.000000000 +0000
93521@@ -0,0 +1,27 @@
93522+#ifndef _ASM_ARCH_HOOKS_H
93523+#define _ASM_ARCH_HOOKS_H
93524+
93525+#include <linux/interrupt.h>
93526+
93527+/*
93528+ * linux/include/asm/arch_hooks.h
93529+ *
93530+ * define the architecture specific hooks
93531+ */
93532+
93533+/* these aren't arch hooks, they are generic routines
93534+ * that can be used by the hooks */
93535+extern void init_ISA_irqs(void);
93536+extern void apic_intr_init(void);
93537+extern void smp_intr_init(void);
93538+extern irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs);
93539+
93540+/* these are the defined hooks */
93541+extern void intr_init_hook(void);
93542+extern void pre_intr_init_hook(void);
93543+extern void pre_setup_arch_hook(void);
93544+extern void trap_init_hook(void);
93545+extern void time_init_hook(void);
93546+extern void mca_nmi_hook(void);
93547+
93548+#endif
93549diff -Nur linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/bootsetup.h linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/bootsetup.h
93550--- linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/bootsetup.h 1970-01-01 00:00:00.000000000 +0000
93551+++ linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/bootsetup.h 2007-01-08 15:00:46.000000000 +0000
93552@@ -0,0 +1,42 @@
93553+
93554+#ifndef _X86_64_BOOTSETUP_H
93555+#define _X86_64_BOOTSETUP_H 1
93556+
93557+#define BOOT_PARAM_SIZE 4096
93558+extern char x86_boot_params[BOOT_PARAM_SIZE];
93559+
93560+/*
93561+ * This is set up by the setup-routine at boot-time
93562+ */
93563+#define PARAM ((unsigned char *)x86_boot_params)
93564+#define SCREEN_INFO (*(struct screen_info *) (PARAM+0))
93565+#define EXT_MEM_K (*(unsigned short *) (PARAM+2))
93566+#define ALT_MEM_K (*(unsigned int *) (PARAM+0x1e0))
93567+#define E820_MAP_NR (*(char*) (PARAM+E820NR))
93568+#define E820_MAP ((struct e820entry *) (PARAM+E820MAP))
93569+#define APM_BIOS_INFO (*(struct apm_bios_info *) (PARAM+0x40))
93570+#define DRIVE_INFO (*(struct drive_info_struct *) (PARAM+0x80))
93571+#define SYS_DESC_TABLE (*(struct sys_desc_table_struct*)(PARAM+0xa0))
93572+#define MOUNT_ROOT_RDONLY (*(unsigned short *) (PARAM+0x1F2))
93573+#define RAMDISK_FLAGS (*(unsigned short *) (PARAM+0x1F8))
93574+#define SAVED_VIDEO_MODE (*(unsigned short *) (PARAM+0x1FA))
93575+#define ORIG_ROOT_DEV (*(unsigned short *) (PARAM+0x1FC))
93576+#define AUX_DEVICE_INFO (*(unsigned char *) (PARAM+0x1FF))
93577+#define LOADER_TYPE (*(unsigned char *) (PARAM+0x210))
93578+#define KERNEL_START (*(unsigned int *) (PARAM+0x214))
93579+
93580+#define INITRD_START (__pa(xen_start_info->mod_start))
93581+#define INITRD_SIZE (xen_start_info->mod_len)
93582+#define EDID_INFO (*(struct edid_info *) (PARAM+0x440))
93583+
93584+#define EDD_NR (*(unsigned char *) (PARAM+EDDNR))
93585+#define EDD_MBR_SIG_NR (*(unsigned char *) (PARAM+EDD_MBR_SIG_NR_BUF))
93586+#define EDD_MBR_SIGNATURE ((unsigned int *) (PARAM+EDD_MBR_SIG_BUF))
93587+#define EDD_BUF ((struct edd_info *) (PARAM+EDDBUF))
93588+#define COMMAND_LINE saved_command_line
93589+
93590+#define RAMDISK_IMAGE_START_MASK 0x07FF
93591+#define RAMDISK_PROMPT_FLAG 0x8000
93592+#define RAMDISK_LOAD_FLAG 0x4000
93593+
93594+#endif
93595diff -Nur linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/desc.h linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/desc.h
93596--- linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/desc.h 1970-01-01 00:00:00.000000000 +0000
93597+++ linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/desc.h 2007-01-08 15:00:46.000000000 +0000
93598@@ -0,0 +1,263 @@
93599+/* Written 2000 by Andi Kleen */
93600+#ifndef __ARCH_DESC_H
93601+#define __ARCH_DESC_H
93602+
93603+#include <linux/threads.h>
93604+#include <asm/ldt.h>
93605+
93606+#ifndef __ASSEMBLY__
93607+
93608+#include <linux/string.h>
93609+#include <linux/smp.h>
93610+
93611+#include <asm/segment.h>
93612+#include <asm/mmu.h>
93613+
93614+// 8 byte segment descriptor
93615+struct desc_struct {
93616+ u16 limit0;
93617+ u16 base0;
93618+ unsigned base1 : 8, type : 4, s : 1, dpl : 2, p : 1;
93619+ unsigned limit : 4, avl : 1, l : 1, d : 1, g : 1, base2 : 8;
93620+} __attribute__((packed));
93621+
93622+struct n_desc_struct {
93623+ unsigned int a,b;
93624+};
93625+
93626+enum {
93627+ GATE_INTERRUPT = 0xE,
93628+ GATE_TRAP = 0xF,
93629+ GATE_CALL = 0xC,
93630+};
93631+
93632+// 16byte gate
93633+struct gate_struct {
93634+ u16 offset_low;
93635+ u16 segment;
93636+ unsigned ist : 3, zero0 : 5, type : 5, dpl : 2, p : 1;
93637+ u16 offset_middle;
93638+ u32 offset_high;
93639+ u32 zero1;
93640+} __attribute__((packed));
93641+
93642+#define PTR_LOW(x) ((unsigned long)(x) & 0xFFFF)
93643+#define PTR_MIDDLE(x) (((unsigned long)(x) >> 16) & 0xFFFF)
93644+#define PTR_HIGH(x) ((unsigned long)(x) >> 32)
93645+
93646+enum {
93647+ DESC_TSS = 0x9,
93648+ DESC_LDT = 0x2,
93649+};
93650+
93651+// LDT or TSS descriptor in the GDT. 16 bytes.
93652+struct ldttss_desc {
93653+ u16 limit0;
93654+ u16 base0;
93655+ unsigned base1 : 8, type : 5, dpl : 2, p : 1;
93656+ unsigned limit1 : 4, zero0 : 3, g : 1, base2 : 8;
93657+ u32 base3;
93658+ u32 zero1;
93659+} __attribute__((packed));
93660+
93661+struct desc_ptr {
93662+ unsigned short size;
93663+ unsigned long address;
93664+} __attribute__((packed)) ;
93665+
93666+extern struct desc_ptr idt_descr, cpu_gdt_descr[NR_CPUS];
93667+
93668+extern struct desc_struct cpu_gdt_table[GDT_ENTRIES];
93669+
93670+#define load_TR_desc() asm volatile("ltr %w0"::"r" (GDT_ENTRY_TSS*8))
93671+#define load_LDT_desc() asm volatile("lldt %w0"::"r" (GDT_ENTRY_LDT*8))
93672+
93673+static inline void clear_LDT(void)
93674+{
93675+ int cpu = get_cpu();
93676+
93677+ /*
93678+ * NB. We load the default_ldt for lcall7/27 handling on demand, as
93679+ * it slows down context switching. No one uses it anyway.
93680+ */
93681+ cpu = cpu; /* XXX avoid compiler warning */
93682+ xen_set_ldt(0UL, 0);
93683+ put_cpu();
93684+}
93685+
93686+/*
93687+ * This is the ldt that every process will get unless we need
93688+ * something other than this.
93689+ */
93690+extern struct desc_struct default_ldt[];
93691+#ifndef CONFIG_X86_NO_IDT
93692+extern struct gate_struct idt_table[];
93693+#endif
93694+extern struct desc_ptr cpu_gdt_descr[];
93695+
93696+/* the cpu gdt accessor */
93697+#define cpu_gdt(_cpu) ((struct desc_struct *)cpu_gdt_descr[_cpu].address)
93698+
93699+static inline void _set_gate(void *adr, unsigned type, unsigned long func, unsigned dpl, unsigned ist)
93700+{
93701+ struct gate_struct s;
93702+ s.offset_low = PTR_LOW(func);
93703+ s.segment = __KERNEL_CS;
93704+ s.ist = ist;
93705+ s.p = 1;
93706+ s.dpl = dpl;
93707+ s.zero0 = 0;
93708+ s.zero1 = 0;
93709+ s.type = type;
93710+ s.offset_middle = PTR_MIDDLE(func);
93711+ s.offset_high = PTR_HIGH(func);
93712+ /* does not need to be atomic because it is only done once at setup time */
93713+ memcpy(adr, &s, 16);
93714+}
93715+
93716+#ifndef CONFIG_X86_NO_IDT
93717+static inline void set_intr_gate(int nr, void *func)
93718+{
93719+ BUG_ON((unsigned)nr > 0xFF);
93720+ _set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 0, 0);
93721+}
93722+
93723+static inline void set_intr_gate_ist(int nr, void *func, unsigned ist)
93724+{
93725+ BUG_ON((unsigned)nr > 0xFF);
93726+ _set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 0, ist);
93727+}
93728+
93729+static inline void set_system_gate(int nr, void *func)
93730+{
93731+ BUG_ON((unsigned)nr > 0xFF);
93732+ _set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 3, 0);
93733+}
93734+
93735+static inline void set_system_gate_ist(int nr, void *func, unsigned ist)
93736+{
93737+ _set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 3, ist);
93738+}
93739+#endif
93740+
93741+static inline void set_tssldt_descriptor(void *ptr, unsigned long tss, unsigned type,
93742+ unsigned size)
93743+{
93744+ struct ldttss_desc d;
93745+ memset(&d,0,sizeof(d));
93746+ d.limit0 = size & 0xFFFF;
93747+ d.base0 = PTR_LOW(tss);
93748+ d.base1 = PTR_MIDDLE(tss) & 0xFF;
93749+ d.type = type;
93750+ d.p = 1;
93751+ d.limit1 = (size >> 16) & 0xF;
93752+ d.base2 = (PTR_MIDDLE(tss) >> 8) & 0xFF;
93753+ d.base3 = PTR_HIGH(tss);
93754+ memcpy(ptr, &d, 16);
93755+}
93756+
93757+#ifndef CONFIG_X86_NO_TSS
93758+static inline void set_tss_desc(unsigned cpu, void *addr)
93759+{
93760+ /*
93761+ * sizeof(unsigned long) coming from an extra "long" at the end
93762+ * of the iobitmap. See tss_struct definition in processor.h
93763+ *
93764+ * -1? seg base+limit should be pointing to the address of the
93765+ * last valid byte
93766+ */
93767+ set_tssldt_descriptor(&cpu_gdt(cpu)[GDT_ENTRY_TSS],
93768+ (unsigned long)addr, DESC_TSS,
93769+ IO_BITMAP_OFFSET + IO_BITMAP_BYTES + sizeof(unsigned long) - 1);
93770+}
93771+#endif
93772+
93773+static inline void set_ldt_desc(unsigned cpu, void *addr, int size)
93774+{
93775+ set_tssldt_descriptor(&cpu_gdt(cpu)[GDT_ENTRY_LDT], (unsigned long)addr,
93776+ DESC_LDT, size * 8 - 1);
93777+}
93778+
93779+static inline void set_seg_base(unsigned cpu, int entry, void *base)
93780+{
93781+ struct desc_struct *d = &cpu_gdt(cpu)[entry];
93782+ u32 addr = (u32)(u64)base;
93783+ BUG_ON((u64)base >> 32);
93784+ d->base0 = addr & 0xffff;
93785+ d->base1 = (addr >> 16) & 0xff;
93786+ d->base2 = (addr >> 24) & 0xff;
93787+}
93788+
93789+#define LDT_entry_a(info) \
93790+ ((((info)->base_addr & 0x0000ffff) << 16) | ((info)->limit & 0x0ffff))
93791+/* Don't allow setting of the lm bit. It is useless anyways because
93792+ 64bit system calls require __USER_CS. */
93793+#define LDT_entry_b(info) \
93794+ (((info)->base_addr & 0xff000000) | \
93795+ (((info)->base_addr & 0x00ff0000) >> 16) | \
93796+ ((info)->limit & 0xf0000) | \
93797+ (((info)->read_exec_only ^ 1) << 9) | \
93798+ ((info)->contents << 10) | \
93799+ (((info)->seg_not_present ^ 1) << 15) | \
93800+ ((info)->seg_32bit << 22) | \
93801+ ((info)->limit_in_pages << 23) | \
93802+ ((info)->useable << 20) | \
93803+ /* ((info)->lm << 21) | */ \
93804+ 0x7000)
93805+
93806+#define LDT_empty(info) (\
93807+ (info)->base_addr == 0 && \
93808+ (info)->limit == 0 && \
93809+ (info)->contents == 0 && \
93810+ (info)->read_exec_only == 1 && \
93811+ (info)->seg_32bit == 0 && \
93812+ (info)->limit_in_pages == 0 && \
93813+ (info)->seg_not_present == 1 && \
93814+ (info)->useable == 0 && \
93815+ (info)->lm == 0)
93816+
93817+#if TLS_SIZE != 24
93818+# error update this code.
93819+#endif
93820+
93821+static inline void load_TLS(struct thread_struct *t, unsigned int cpu)
93822+{
93823+#if 0
93824+ u64 *gdt = (u64 *)(cpu_gdt(cpu) + GDT_ENTRY_TLS_MIN);
93825+ gdt[0] = t->tls_array[0];
93826+ gdt[1] = t->tls_array[1];
93827+ gdt[2] = t->tls_array[2];
93828+#endif
93829+#define C(i) \
93830+ HYPERVISOR_update_descriptor(virt_to_machine(&cpu_gdt(cpu)[GDT_ENTRY_TLS_MIN + i]), t->tls_array[i])
93831+
93832+ C(0); C(1); C(2);
93833+#undef C
93834+}
93835+
93836+/*
93837+ * load one particular LDT into the current CPU
93838+ */
93839+static inline void load_LDT_nolock (mm_context_t *pc, int cpu)
93840+{
93841+ void *segments = pc->ldt;
93842+ int count = pc->size;
93843+
93844+ if (likely(!count))
93845+ segments = NULL;
93846+
93847+ xen_set_ldt((unsigned long)segments, count);
93848+}
93849+
93850+static inline void load_LDT(mm_context_t *pc)
93851+{
93852+ int cpu = get_cpu();
93853+ load_LDT_nolock(pc, cpu);
93854+ put_cpu();
93855+}
93856+
93857+extern struct desc_ptr idt_descr;
93858+
93859+#endif /* !__ASSEMBLY__ */
93860+
93861+#endif
93862diff -Nur linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/dma-mapping.h linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/dma-mapping.h
93863--- linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/dma-mapping.h 1970-01-01 00:00:00.000000000 +0000
93864+++ linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/dma-mapping.h 2007-01-08 15:00:46.000000000 +0000
93865@@ -0,0 +1,191 @@
93866+#ifndef _X8664_DMA_MAPPING_H
93867+#define _X8664_DMA_MAPPING_H 1
93868+
93869+/*
93870+ * IOMMU interface. See Documentation/DMA-mapping.txt and DMA-API.txt for
93871+ * documentation.
93872+ */
93873+
93874+#include <linux/config.h>
93875+
93876+#include <asm/scatterlist.h>
93877+#include <asm/io.h>
93878+#include <asm/swiotlb.h>
93879+
93880+struct dma_mapping_ops {
93881+ int (*mapping_error)(dma_addr_t dma_addr);
93882+ void* (*alloc_coherent)(struct device *dev, size_t size,
93883+ dma_addr_t *dma_handle, gfp_t gfp);
93884+ void (*free_coherent)(struct device *dev, size_t size,
93885+ void *vaddr, dma_addr_t dma_handle);
93886+ dma_addr_t (*map_single)(struct device *hwdev, void *ptr,
93887+ size_t size, int direction);
93888+ /* like map_single, but doesn't check the device mask */
93889+ dma_addr_t (*map_simple)(struct device *hwdev, char *ptr,
93890+ size_t size, int direction);
93891+ void (*unmap_single)(struct device *dev, dma_addr_t addr,
93892+ size_t size, int direction);
93893+ void (*sync_single_for_cpu)(struct device *hwdev,
93894+ dma_addr_t dma_handle, size_t size,
93895+ int direction);
93896+ void (*sync_single_for_device)(struct device *hwdev,
93897+ dma_addr_t dma_handle, size_t size,
93898+ int direction);
93899+ void (*sync_single_range_for_cpu)(struct device *hwdev,
93900+ dma_addr_t dma_handle, unsigned long offset,
93901+ size_t size, int direction);
93902+ void (*sync_single_range_for_device)(struct device *hwdev,
93903+ dma_addr_t dma_handle, unsigned long offset,
93904+ size_t size, int direction);
93905+ void (*sync_sg_for_cpu)(struct device *hwdev,
93906+ struct scatterlist *sg, int nelems,
93907+ int direction);
93908+ void (*sync_sg_for_device)(struct device *hwdev,
93909+ struct scatterlist *sg, int nelems,
93910+ int direction);
93911+ int (*map_sg)(struct device *hwdev, struct scatterlist *sg,
93912+ int nents, int direction);
93913+ void (*unmap_sg)(struct device *hwdev,
93914+ struct scatterlist *sg, int nents,
93915+ int direction);
93916+ int (*dma_supported)(struct device *hwdev, u64 mask);
93917+ int is_phys;
93918+};
93919+
93920+extern dma_addr_t bad_dma_address;
93921+extern struct dma_mapping_ops* dma_ops;
93922+extern int iommu_merge;
93923+
93924+#if 0
93925+static inline int dma_mapping_error(dma_addr_t dma_addr)
93926+{
93927+ if (dma_ops->mapping_error)
93928+ return dma_ops->mapping_error(dma_addr);
93929+
93930+ return (dma_addr == bad_dma_address);
93931+}
93932+
93933+extern void *dma_alloc_coherent(struct device *dev, size_t size,
93934+ dma_addr_t *dma_handle, gfp_t gfp);
93935+extern void dma_free_coherent(struct device *dev, size_t size, void *vaddr,
93936+ dma_addr_t dma_handle);
93937+
93938+static inline dma_addr_t
93939+dma_map_single(struct device *hwdev, void *ptr, size_t size,
93940+ int direction)
93941+{
93942+ return dma_ops->map_single(hwdev, ptr, size, direction);
93943+}
93944+
93945+static inline void
93946+dma_unmap_single(struct device *dev, dma_addr_t addr,size_t size,
93947+ int direction)
93948+{
93949+ dma_ops->unmap_single(dev, addr, size, direction);
93950+}
93951+
93952+#define dma_map_page(dev,page,offset,size,dir) \
93953+ dma_map_single((dev), page_address(page)+(offset), (size), (dir))
93954+
93955+#define dma_unmap_page dma_unmap_single
93956+
93957+static inline void
93958+dma_sync_single_for_cpu(struct device *hwdev, dma_addr_t dma_handle,
93959+ size_t size, int direction)
93960+{
93961+ if (dma_ops->sync_single_for_cpu)
93962+ dma_ops->sync_single_for_cpu(hwdev, dma_handle, size,
93963+ direction);
93964+ flush_write_buffers();
93965+}
93966+
93967+static inline void
93968+dma_sync_single_for_device(struct device *hwdev, dma_addr_t dma_handle,
93969+ size_t size, int direction)
93970+{
93971+ if (dma_ops->sync_single_for_device)
93972+ dma_ops->sync_single_for_device(hwdev, dma_handle, size,
93973+ direction);
93974+ flush_write_buffers();
93975+}
93976+
93977+static inline void
93978+dma_sync_single_range_for_cpu(struct device *hwdev, dma_addr_t dma_handle,
93979+ unsigned long offset, size_t size, int direction)
93980+{
93981+ if (dma_ops->sync_single_range_for_cpu) {
93982+ dma_ops->sync_single_range_for_cpu(hwdev, dma_handle, offset, size, direction);
93983+ }
93984+
93985+ flush_write_buffers();
93986+}
93987+
93988+static inline void
93989+dma_sync_single_range_for_device(struct device *hwdev, dma_addr_t dma_handle,
93990+ unsigned long offset, size_t size, int direction)
93991+{
93992+ if (dma_ops->sync_single_range_for_device)
93993+ dma_ops->sync_single_range_for_device(hwdev, dma_handle,
93994+ offset, size, direction);
93995+
93996+ flush_write_buffers();
93997+}
93998+
93999+static inline void
94000+dma_sync_sg_for_cpu(struct device *hwdev, struct scatterlist *sg,
94001+ int nelems, int direction)
94002+{
94003+ if (dma_ops->sync_sg_for_cpu)
94004+ dma_ops->sync_sg_for_cpu(hwdev, sg, nelems, direction);
94005+ flush_write_buffers();
94006+}
94007+
94008+static inline void
94009+dma_sync_sg_for_device(struct device *hwdev, struct scatterlist *sg,
94010+ int nelems, int direction)
94011+{
94012+ if (dma_ops->sync_sg_for_device) {
94013+ dma_ops->sync_sg_for_device(hwdev, sg, nelems, direction);
94014+ }
94015+
94016+ flush_write_buffers();
94017+}
94018+
94019+static inline int
94020+dma_map_sg(struct device *hwdev, struct scatterlist *sg, int nents, int direction)
94021+{
94022+ return dma_ops->map_sg(hwdev, sg, nents, direction);
94023+}
94024+
94025+static inline void
94026+dma_unmap_sg(struct device *hwdev, struct scatterlist *sg, int nents,
94027+ int direction)
94028+{
94029+ dma_ops->unmap_sg(hwdev, sg, nents, direction);
94030+}
94031+
94032+extern int dma_supported(struct device *hwdev, u64 mask);
94033+
94034+/* same for gart, swiotlb, and nommu */
94035+static inline int dma_get_cache_alignment(void)
94036+{
94037+ return boot_cpu_data.x86_clflush_size;
94038+}
94039+
94040+#define dma_is_consistent(h) 1
94041+
94042+extern int dma_set_mask(struct device *dev, u64 mask);
94043+
94044+static inline void
94045+dma_cache_sync(void *vaddr, size_t size, enum dma_data_direction dir)
94046+{
94047+ flush_write_buffers();
94048+}
94049+
94050+extern struct device fallback_dev;
94051+extern int panic_on_overflow;
94052+#endif
94053+
94054+#endif /* _X8664_DMA_MAPPING_H */
94055+
94056+#include <asm-i386/mach-xen/asm/dma-mapping.h>
94057diff -Nur linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/e820.h linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/e820.h
94058--- linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/e820.h 1970-01-01 00:00:00.000000000 +0000
94059+++ linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/e820.h 2007-01-08 15:00:46.000000000 +0000
94060@@ -0,0 +1,63 @@
94061+/*
94062+ * structures and definitions for the int 15, ax=e820 memory map
94063+ * scheme.
94064+ *
94065+ * In a nutshell, setup.S populates a scratch table in the
94066+ * empty_zero_block that contains a list of usable address/size
94067+ * duples. In setup.c, this information is transferred into the e820map,
94068+ * and in init.c/numa.c, that new information is used to mark pages
94069+ * reserved or not.
94070+ */
94071+#ifndef __E820_HEADER
94072+#define __E820_HEADER
94073+
94074+#include <linux/mmzone.h>
94075+
94076+#define E820MAP 0x2d0 /* our map */
94077+#define E820MAX 128 /* number of entries in E820MAP */
94078+#define E820NR 0x1e8 /* # entries in E820MAP */
94079+
94080+#define E820_RAM 1
94081+#define E820_RESERVED 2
94082+#define E820_ACPI 3 /* usable as RAM once ACPI tables have been read */
94083+#define E820_NVS 4
94084+
94085+#define HIGH_MEMORY (1024*1024)
94086+
94087+#define LOWMEMSIZE() (0x9f000)
94088+
94089+#ifndef __ASSEMBLY__
94090+struct e820entry {
94091+ u64 addr; /* start of memory segment */
94092+ u64 size; /* size of memory segment */
94093+ u32 type; /* type of memory segment */
94094+} __attribute__((packed));
94095+
94096+struct e820map {
94097+ int nr_map;
94098+ struct e820entry map[E820MAX];
94099+};
94100+
94101+extern unsigned long find_e820_area(unsigned long start, unsigned long end,
94102+ unsigned size);
94103+extern void add_memory_region(unsigned long start, unsigned long size,
94104+ int type);
94105+extern void setup_memory_region(void);
94106+extern void contig_e820_setup(void);
94107+extern unsigned long e820_end_of_ram(void);
94108+extern void e820_reserve_resources(struct e820entry *e820, int nr_map);
94109+extern void e820_print_map(char *who);
94110+extern int e820_mapped(unsigned long start, unsigned long end, unsigned type);
94111+
94112+extern void e820_bootmem_free(pg_data_t *pgdat, unsigned long start,unsigned long end);
94113+extern void e820_setup_gap(struct e820entry *e820, int nr_map);
94114+extern unsigned long e820_hole_size(unsigned long start_pfn,
94115+ unsigned long end_pfn);
94116+
94117+extern void __init parse_memopt(char *p, char **end);
94118+extern void __init parse_memmapopt(char *p, char **end);
94119+
94120+extern struct e820map e820;
94121+#endif/*!__ASSEMBLY__*/
94122+
94123+#endif/*__E820_HEADER*/
94124diff -Nur linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/fixmap.h linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/fixmap.h
94125--- linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/fixmap.h 1970-01-01 00:00:00.000000000 +0000
94126+++ linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/fixmap.h 2007-01-08 15:00:46.000000000 +0000
94127@@ -0,0 +1,108 @@
94128+/*
94129+ * fixmap.h: compile-time virtual memory allocation
94130+ *
94131+ * This file is subject to the terms and conditions of the GNU General Public
94132+ * License. See the file "COPYING" in the main directory of this archive
94133+ * for more details.
94134+ *
94135+ * Copyright (C) 1998 Ingo Molnar
94136+ */
94137+
94138+#ifndef _ASM_FIXMAP_H
94139+#define _ASM_FIXMAP_H
94140+
94141+#include <linux/config.h>
94142+#include <linux/kernel.h>
94143+#include <asm/apicdef.h>
94144+#include <asm/page.h>
94145+#include <asm/vsyscall.h>
94146+#include <asm/vsyscall32.h>
94147+#include <asm/acpi.h>
94148+
94149+/*
94150+ * Here we define all the compile-time 'special' virtual
94151+ * addresses. The point is to have a constant address at
94152+ * compile time, but to set the physical address only
94153+ * in the boot process.
94154+ *
94155+ * these 'compile-time allocated' memory buffers are
94156+ * fixed-size 4k pages. (or larger if used with an increment
94157+ * higher than 1) use fixmap_set(idx,phys) to associate
94158+ * physical memory with fixmap indices.
94159+ *
94160+ * TLB entries of such buffers will not be flushed across
94161+ * task switches.
94162+ */
94163+
94164+enum fixed_addresses {
94165+ VSYSCALL_LAST_PAGE,
94166+ VSYSCALL_FIRST_PAGE = VSYSCALL_LAST_PAGE + ((VSYSCALL_END-VSYSCALL_START) >> PAGE_SHIFT) - 1,
94167+ VSYSCALL_HPET,
94168+ FIX_HPET_BASE,
94169+#ifdef CONFIG_X86_LOCAL_APIC
94170+ FIX_APIC_BASE, /* local (CPU) APIC) -- required for SMP or not */
94171+#endif
94172+#ifdef CONFIG_X86_IO_APIC
94173+ FIX_IO_APIC_BASE_0,
94174+ FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS-1,
94175+#endif
94176+#ifdef CONFIG_ACPI
94177+ FIX_ACPI_BEGIN,
94178+ FIX_ACPI_END = FIX_ACPI_BEGIN + FIX_ACPI_PAGES - 1,
94179+#endif
94180+ FIX_SHARED_INFO,
94181+#define NR_FIX_ISAMAPS 256
94182+ FIX_ISAMAP_END,
94183+ FIX_ISAMAP_BEGIN = FIX_ISAMAP_END + NR_FIX_ISAMAPS - 1,
94184+ __end_of_fixed_addresses
94185+};
94186+
94187+extern void __set_fixmap (enum fixed_addresses idx,
94188+ unsigned long phys, pgprot_t flags);
94189+
94190+#define set_fixmap(idx, phys) \
94191+ __set_fixmap(idx, phys, PAGE_KERNEL)
94192+/*
94193+ * Some hardware wants to get fixmapped without caching.
94194+ */
94195+#define set_fixmap_nocache(idx, phys) \
94196+ __set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE)
94197+
94198+#define clear_fixmap(idx) \
94199+ __set_fixmap(idx, 0, __pgprot(0))
94200+
94201+#define FIXADDR_TOP (VSYSCALL_END-PAGE_SIZE)
94202+#define FIXADDR_SIZE (__end_of_fixed_addresses << PAGE_SHIFT)
94203+#define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE)
94204+
94205+/* Only covers 32bit vsyscalls currently. Need another set for 64bit. */
94206+#define FIXADDR_USER_START ((unsigned long)VSYSCALL32_VSYSCALL)
94207+#define FIXADDR_USER_END (FIXADDR_USER_START + PAGE_SIZE)
94208+
94209+#define __fix_to_virt(x) (FIXADDR_TOP - ((x) << PAGE_SHIFT))
94210+
94211+extern void __this_fixmap_does_not_exist(void);
94212+
94213+/*
94214+ * 'index to address' translation. If anyone tries to use the idx
94215+ * directly without translation, we catch the bug with a NULL-dereference
94216+ * kernel oops. Illegal ranges of incoming indices are caught too.
94217+ */
94218+static __always_inline unsigned long fix_to_virt(const unsigned int idx)
94219+{
94220+ /*
94221+ * this branch gets completely eliminated after inlining,
94222+ * except when someone tries to use fixaddr indices in an
94223+ * illegal way. (such as mixing up address types or using
94224+ * out-of-range indices).
94225+ *
94226+ * If it doesn't get removed, the linker will complain
94227+ * loudly with a reasonably clear error message..
94228+ */
94229+ if (idx >= __end_of_fixed_addresses)
94230+ __this_fixmap_does_not_exist();
94231+
94232+ return __fix_to_virt(idx);
94233+}
94234+
94235+#endif
94236diff -Nur linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/floppy.h linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/floppy.h
94237--- linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/floppy.h 1970-01-01 00:00:00.000000000 +0000
94238+++ linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/floppy.h 2007-01-08 15:00:46.000000000 +0000
94239@@ -0,0 +1,206 @@
94240+/*
94241+ * Architecture specific parts of the Floppy driver
94242+ *
94243+ * This file is subject to the terms and conditions of the GNU General Public
94244+ * License. See the file "COPYING" in the main directory of this archive
94245+ * for more details.
94246+ *
94247+ * Copyright (C) 1995
94248+ *
94249+ * Modifications for Xen are Copyright (c) 2004, Keir Fraser.
94250+ */
94251+#ifndef __ASM_XEN_X86_64_FLOPPY_H
94252+#define __ASM_XEN_X86_64_FLOPPY_H
94253+
94254+#include <linux/vmalloc.h>
94255+
94256+/*
94257+ * The DMA channel used by the floppy controller cannot access data at
94258+ * addresses >= 16MB
94259+ *
94260+ * Went back to the 1MB limit, as some people had problems with the floppy
94261+ * driver otherwise. It doesn't matter much for performance anyway, as most
94262+ * floppy accesses go through the track buffer.
94263+ */
94264+#define _CROSS_64KB(a,s,vdma) \
94265+(!(vdma) && ((unsigned long)(a)/K_64 != ((unsigned long)(a) + (s) - 1) / K_64))
94266+
94267+/* XEN: Hit DMA paths on the head. This trick from asm-m68k/floppy.h. */
94268+#include <asm/dma.h>
94269+#undef MAX_DMA_ADDRESS
94270+#define MAX_DMA_ADDRESS 0
94271+#define CROSS_64KB(a,s) (0)
94272+
94273+#define fd_inb(port) inb_p(port)
94274+#define fd_outb(value,port) outb_p(value,port)
94275+
94276+#define fd_request_dma() (0)
94277+#define fd_free_dma() ((void)0)
94278+#define fd_enable_irq() enable_irq(FLOPPY_IRQ)
94279+#define fd_disable_irq() disable_irq(FLOPPY_IRQ)
94280+#define fd_free_irq() free_irq(FLOPPY_IRQ, NULL)
94281+#define fd_get_dma_residue() vdma_get_dma_residue(FLOPPY_DMA)
94282+/*
94283+ * Do not use vmalloc/vfree: floppy_release_irq_and_dma() gets called from
94284+ * softirq context via motor_off_callback. A generic bug we happen to trigger.
94285+ */
94286+#define fd_dma_mem_alloc(size) __get_free_pages(GFP_KERNEL, get_order(size))
94287+#define fd_dma_mem_free(addr, size) free_pages(addr, get_order(size))
94288+#define fd_dma_setup(addr, size, mode, io) vdma_dma_setup(addr, size, mode, io)
94289+
94290+static int virtual_dma_count;
94291+static int virtual_dma_residue;
94292+static char *virtual_dma_addr;
94293+static int virtual_dma_mode;
94294+static int doing_pdma;
94295+
94296+static irqreturn_t floppy_hardint(int irq, void *dev_id, struct pt_regs * regs)
94297+{
94298+ register unsigned char st;
94299+
94300+#undef TRACE_FLPY_INT
94301+
94302+#ifdef TRACE_FLPY_INT
94303+ static int calls=0;
94304+ static int bytes=0;
94305+ static int dma_wait=0;
94306+#endif
94307+ if (!doing_pdma)
94308+ return floppy_interrupt(irq, dev_id, regs);
94309+
94310+#ifdef TRACE_FLPY_INT
94311+ if(!calls)
94312+ bytes = virtual_dma_count;
94313+#endif
94314+
94315+ {
94316+ register int lcount;
94317+ register char *lptr;
94318+
94319+ st = 1;
94320+ for(lcount=virtual_dma_count, lptr=virtual_dma_addr;
94321+ lcount; lcount--, lptr++) {
94322+ st=inb(virtual_dma_port+4) & 0xa0 ;
94323+ if(st != 0xa0)
94324+ break;
94325+ if(virtual_dma_mode)
94326+ outb_p(*lptr, virtual_dma_port+5);
94327+ else
94328+ *lptr = inb_p(virtual_dma_port+5);
94329+ }
94330+ virtual_dma_count = lcount;
94331+ virtual_dma_addr = lptr;
94332+ st = inb(virtual_dma_port+4);
94333+ }
94334+
94335+#ifdef TRACE_FLPY_INT
94336+ calls++;
94337+#endif
94338+ if(st == 0x20)
94339+ return IRQ_HANDLED;
94340+ if(!(st & 0x20)) {
94341+ virtual_dma_residue += virtual_dma_count;
94342+ virtual_dma_count=0;
94343+#ifdef TRACE_FLPY_INT
94344+ printk("count=%x, residue=%x calls=%d bytes=%d dma_wait=%d\n",
94345+ virtual_dma_count, virtual_dma_residue, calls, bytes,
94346+ dma_wait);
94347+ calls = 0;
94348+ dma_wait=0;
94349+#endif
94350+ doing_pdma = 0;
94351+ floppy_interrupt(irq, dev_id, regs);
94352+ return IRQ_HANDLED;
94353+ }
94354+#ifdef TRACE_FLPY_INT
94355+ if(!virtual_dma_count)
94356+ dma_wait++;
94357+#endif
94358+ return IRQ_HANDLED;
94359+}
94360+
94361+static void fd_disable_dma(void)
94362+{
94363+ doing_pdma = 0;
94364+ virtual_dma_residue += virtual_dma_count;
94365+ virtual_dma_count=0;
94366+}
94367+
94368+static int vdma_get_dma_residue(unsigned int dummy)
94369+{
94370+ return virtual_dma_count + virtual_dma_residue;
94371+}
94372+
94373+
94374+static int fd_request_irq(void)
94375+{
94376+ return request_irq(FLOPPY_IRQ, floppy_hardint,SA_INTERRUPT,
94377+ "floppy", NULL);
94378+}
94379+
94380+#if 0
94381+static unsigned long vdma_mem_alloc(unsigned long size)
94382+{
94383+ return (unsigned long) vmalloc(size);
94384+
94385+}
94386+
94387+static void vdma_mem_free(unsigned long addr, unsigned long size)
94388+{
94389+ vfree((void *)addr);
94390+}
94391+#endif
94392+
94393+static int vdma_dma_setup(char *addr, unsigned long size, int mode, int io)
94394+{
94395+ doing_pdma = 1;
94396+ virtual_dma_port = io;
94397+ virtual_dma_mode = (mode == DMA_MODE_WRITE);
94398+ virtual_dma_addr = addr;
94399+ virtual_dma_count = size;
94400+ virtual_dma_residue = 0;
94401+ return 0;
94402+}
94403+
94404+/* XEN: This trick to force 'virtual DMA' is from include/asm-m68k/floppy.h. */
94405+#define FDC1 xen_floppy_init()
94406+static int FDC2 = -1;
94407+
94408+static int xen_floppy_init(void)
94409+{
94410+ use_virtual_dma = 1;
94411+ can_use_virtual_dma = 1;
94412+ return 0x3f0;
94413+}
94414+
94415+/*
94416+ * Floppy types are stored in the rtc's CMOS RAM and so rtc_lock
94417+ * is needed to prevent corrupted CMOS RAM in case "insmod floppy"
94418+ * coincides with another rtc CMOS user. Paul G.
94419+ */
94420+#define FLOPPY0_TYPE ({ \
94421+ unsigned long flags; \
94422+ unsigned char val; \
94423+ spin_lock_irqsave(&rtc_lock, flags); \
94424+ val = (CMOS_READ(0x10) >> 4) & 15; \
94425+ spin_unlock_irqrestore(&rtc_lock, flags); \
94426+ val; \
94427+})
94428+
94429+#define FLOPPY1_TYPE ({ \
94430+ unsigned long flags; \
94431+ unsigned char val; \
94432+ spin_lock_irqsave(&rtc_lock, flags); \
94433+ val = CMOS_READ(0x10) & 15; \
94434+ spin_unlock_irqrestore(&rtc_lock, flags); \
94435+ val; \
94436+})
94437+
94438+#define N_FDC 2
94439+#define N_DRIVE 8
94440+
94441+#define FLOPPY_MOTOR_MASK 0xf0
94442+
94443+#define EXTRA_FLOPPY_PARAMS
94444+
94445+#endif /* __ASM_XEN_X86_64_FLOPPY_H */
94446diff -Nur linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/hw_irq.h linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/hw_irq.h
94447--- linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/hw_irq.h 1970-01-01 00:00:00.000000000 +0000
94448+++ linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/hw_irq.h 2007-01-08 15:00:46.000000000 +0000
94449@@ -0,0 +1,145 @@
94450+#ifndef _ASM_HW_IRQ_H
94451+#define _ASM_HW_IRQ_H
94452+
94453+/*
94454+ * linux/include/asm/hw_irq.h
94455+ *
94456+ * (C) 1992, 1993 Linus Torvalds, (C) 1997 Ingo Molnar
94457+ *
94458+ * moved some of the old arch/i386/kernel/irq.h to here. VY
94459+ *
94460+ * IRQ/IPI changes taken from work by Thomas Radke
94461+ * <tomsoft@informatik.tu-chemnitz.de>
94462+ *
94463+ * hacked by Andi Kleen for x86-64.
94464+ *
94465+ * $Id: hw_irq.h,v 1.24 2001/09/14 20:55:03 vojtech Exp $
94466+ */
94467+
94468+#ifndef __ASSEMBLY__
94469+#include <linux/config.h>
94470+#include <asm/atomic.h>
94471+#include <asm/irq.h>
94472+#include <linux/profile.h>
94473+#include <linux/smp.h>
94474+
94475+struct hw_interrupt_type;
94476+#endif
94477+
94478+#define NMI_VECTOR 0x02
94479+/*
94480+ * IDT vectors usable for external interrupt sources start
94481+ * at 0x20:
94482+ */
94483+#define FIRST_EXTERNAL_VECTOR 0x20
94484+
94485+#define IA32_SYSCALL_VECTOR 0x80
94486+
94487+
94488+/*
94489+ * Vectors 0x20-0x2f are used for ISA interrupts.
94490+ */
94491+
94492+/*
94493+ * Special IRQ vectors used by the SMP architecture, 0xf0-0xff
94494+ *
94495+ * some of the following vectors are 'rare', they are merged
94496+ * into a single vector (CALL_FUNCTION_VECTOR) to save vector space.
94497+ * TLB, reschedule and local APIC vectors are performance-critical.
94498+ */
94499+#ifndef CONFIG_XEN
94500+#define SPURIOUS_APIC_VECTOR 0xff
94501+#define ERROR_APIC_VECTOR 0xfe
94502+#define RESCHEDULE_VECTOR 0xfd
94503+#define CALL_FUNCTION_VECTOR 0xfc
94504+/* fb free - please don't readd KDB here because it's useless
94505+ (hint - think what a NMI bit does to a vector) */
94506+#define THERMAL_APIC_VECTOR 0xfa
94507+#define THRESHOLD_APIC_VECTOR 0xf9
94508+/* f8 free */
94509+#define INVALIDATE_TLB_VECTOR_END 0xf7
94510+#define INVALIDATE_TLB_VECTOR_START 0xf0 /* f0-f7 used for TLB flush */
94511+
94512+#define NUM_INVALIDATE_TLB_VECTORS 8
94513+#endif
94514+
94515+/*
94516+ * Local APIC timer IRQ vector is on a different priority level,
94517+ * to work around the 'lost local interrupt if more than 2 IRQ
94518+ * sources per level' errata.
94519+ */
94520+#define LOCAL_TIMER_VECTOR 0xef
94521+
94522+/*
94523+ * First APIC vector available to drivers: (vectors 0x30-0xee)
94524+ * we start at 0x31 to spread out vectors evenly between priority
94525+ * levels. (0x80 is the syscall vector)
94526+ */
94527+#define FIRST_DEVICE_VECTOR 0x31
94528+#define FIRST_SYSTEM_VECTOR 0xef /* duplicated in irq.h */
94529+
94530+
94531+#ifndef __ASSEMBLY__
94532+extern u8 irq_vector[NR_IRQ_VECTORS];
94533+#define IO_APIC_VECTOR(irq) (irq_vector[irq])
94534+#define AUTO_ASSIGN -1
94535+
94536+/*
94537+ * Various low-level irq details needed by irq.c, process.c,
94538+ * time.c, io_apic.c and smp.c
94539+ *
94540+ * Interrupt entry/exit code at both C and assembly level
94541+ */
94542+
94543+extern void disable_8259A_irq(unsigned int irq);
94544+extern void enable_8259A_irq(unsigned int irq);
94545+extern int i8259A_irq_pending(unsigned int irq);
94546+extern void make_8259A_irq(unsigned int irq);
94547+extern void init_8259A(int aeoi);
94548+extern void FASTCALL(send_IPI_self(int vector));
94549+extern void init_VISWS_APIC_irqs(void);
94550+extern void setup_IO_APIC(void);
94551+extern void disable_IO_APIC(void);
94552+extern void print_IO_APIC(void);
94553+extern int IO_APIC_get_PCI_irq_vector(int bus, int slot, int fn);
94554+extern void send_IPI(int dest, int vector);
94555+extern void setup_ioapic_dest(void);
94556+
94557+extern unsigned long io_apic_irqs;
94558+
94559+extern atomic_t irq_err_count;
94560+extern atomic_t irq_mis_count;
94561+
94562+#define IO_APIC_IRQ(x) (((x) >= 16) || ((1<<(x)) & io_apic_irqs))
94563+
94564+#define __STR(x) #x
94565+#define STR(x) __STR(x)
94566+
94567+#include <asm/ptrace.h>
94568+
94569+#define IRQ_NAME2(nr) nr##_interrupt(void)
94570+#define IRQ_NAME(nr) IRQ_NAME2(IRQ##nr)
94571+
94572+/*
94573+ * SMP has a few special interrupts for IPI messages
94574+ */
94575+
94576+#define BUILD_IRQ(nr) \
94577+asmlinkage void IRQ_NAME(nr); \
94578+__asm__( \
94579+"\n.p2align\n" \
94580+"IRQ" #nr "_interrupt:\n\t" \
94581+ "push $" #nr "-256 ; " \
94582+ "jmp common_interrupt");
94583+
94584+extern void resend_irq_on_evtchn(struct hw_interrupt_type *h, unsigned int i);
94585+static inline void hw_resend_irq(struct hw_interrupt_type *h, unsigned int i)
94586+{
94587+ resend_irq_on_evtchn(h, i);
94588+}
94589+
94590+#define platform_legacy_irq(irq) ((irq) < 16)
94591+
94592+#endif
94593+
94594+#endif /* _ASM_HW_IRQ_H */
94595diff -Nur linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/hypercall.h linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/hypercall.h
94596--- linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/hypercall.h 1970-01-01 00:00:00.000000000 +0000
94597+++ linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/hypercall.h 2007-01-08 15:00:46.000000000 +0000
94598@@ -0,0 +1,406 @@
94599+/******************************************************************************
94600+ * hypercall.h
94601+ *
94602+ * Linux-specific hypervisor handling.
94603+ *
94604+ * Copyright (c) 2002-2004, K A Fraser
94605+ *
94606+ * 64-bit updates:
94607+ * Benjamin Liu <benjamin.liu@intel.com>
94608+ * Jun Nakajima <jun.nakajima@intel.com>
94609+ *
94610+ * This program is free software; you can redistribute it and/or
94611+ * modify it under the terms of the GNU General Public License version 2
94612+ * as published by the Free Software Foundation; or, when distributed
94613+ * separately from the Linux kernel or incorporated into other
94614+ * software packages, subject to the following license:
94615+ *
94616+ * Permission is hereby granted, free of charge, to any person obtaining a copy
94617+ * of this source file (the "Software"), to deal in the Software without
94618+ * restriction, including without limitation the rights to use, copy, modify,
94619+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
94620+ * and to permit persons to whom the Software is furnished to do so, subject to
94621+ * the following conditions:
94622+ *
94623+ * The above copyright notice and this permission notice shall be included in
94624+ * all copies or substantial portions of the Software.
94625+ *
94626+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
94627+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
94628+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
94629+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
94630+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
94631+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
94632+ * IN THE SOFTWARE.
94633+ */
94634+
94635+#ifndef __HYPERCALL_H__
94636+#define __HYPERCALL_H__
94637+
94638+#include <linux/string.h> /* memcpy() */
94639+
94640+#ifndef __HYPERVISOR_H__
94641+# error "please don't include this file directly"
94642+#endif
94643+
94644+#define __STR(x) #x
94645+#define STR(x) __STR(x)
94646+
94647+#ifdef CONFIG_XEN
94648+#define HYPERCALL_STR(name) \
94649+ "call hypercall_page + ("STR(__HYPERVISOR_##name)" * 32)"
94650+#else
94651+#define HYPERCALL_STR(name) \
94652+ "mov hypercall_stubs,%%rax; " \
94653+ "add $("STR(__HYPERVISOR_##name)" * 32),%%rax; " \
94654+ "call *%%rax"
94655+#endif
94656+
94657+#define _hypercall0(type, name) \
94658+({ \
94659+ long __res; \
94660+ asm volatile ( \
94661+ HYPERCALL_STR(name) \
94662+ : "=a" (__res) \
94663+ : \
94664+ : "memory" ); \
94665+ (type)__res; \
94666+})
94667+
94668+#define _hypercall1(type, name, a1) \
94669+({ \
94670+ long __res, __ign1; \
94671+ asm volatile ( \
94672+ HYPERCALL_STR(name) \
94673+ : "=a" (__res), "=D" (__ign1) \
94674+ : "1" ((long)(a1)) \
94675+ : "memory" ); \
94676+ (type)__res; \
94677+})
94678+
94679+#define _hypercall2(type, name, a1, a2) \
94680+({ \
94681+ long __res, __ign1, __ign2; \
94682+ asm volatile ( \
94683+ HYPERCALL_STR(name) \
94684+ : "=a" (__res), "=D" (__ign1), "=S" (__ign2) \
94685+ : "1" ((long)(a1)), "2" ((long)(a2)) \
94686+ : "memory" ); \
94687+ (type)__res; \
94688+})
94689+
94690+#define _hypercall3(type, name, a1, a2, a3) \
94691+({ \
94692+ long __res, __ign1, __ign2, __ign3; \
94693+ asm volatile ( \
94694+ HYPERCALL_STR(name) \
94695+ : "=a" (__res), "=D" (__ign1), "=S" (__ign2), \
94696+ "=d" (__ign3) \
94697+ : "1" ((long)(a1)), "2" ((long)(a2)), \
94698+ "3" ((long)(a3)) \
94699+ : "memory" ); \
94700+ (type)__res; \
94701+})
94702+
94703+#define _hypercall4(type, name, a1, a2, a3, a4) \
94704+({ \
94705+ long __res, __ign1, __ign2, __ign3; \
94706+ asm volatile ( \
94707+ "movq %7,%%r10; " \
94708+ HYPERCALL_STR(name) \
94709+ : "=a" (__res), "=D" (__ign1), "=S" (__ign2), \
94710+ "=d" (__ign3) \
94711+ : "1" ((long)(a1)), "2" ((long)(a2)), \
94712+ "3" ((long)(a3)), "g" ((long)(a4)) \
94713+ : "memory", "r10" ); \
94714+ (type)__res; \
94715+})
94716+
94717+#define _hypercall5(type, name, a1, a2, a3, a4, a5) \
94718+({ \
94719+ long __res, __ign1, __ign2, __ign3; \
94720+ asm volatile ( \
94721+ "movq %7,%%r10; movq %8,%%r8; " \
94722+ HYPERCALL_STR(name) \
94723+ : "=a" (__res), "=D" (__ign1), "=S" (__ign2), \
94724+ "=d" (__ign3) \
94725+ : "1" ((long)(a1)), "2" ((long)(a2)), \
94726+ "3" ((long)(a3)), "g" ((long)(a4)), \
94727+ "g" ((long)(a5)) \
94728+ : "memory", "r10", "r8" ); \
94729+ (type)__res; \
94730+})
94731+
94732+static inline int
94733+HYPERVISOR_set_trap_table(
94734+ trap_info_t *table)
94735+{
94736+ return _hypercall1(int, set_trap_table, table);
94737+}
94738+
94739+static inline int
94740+HYPERVISOR_mmu_update(
94741+ mmu_update_t *req, int count, int *success_count, domid_t domid)
94742+{
94743+ return _hypercall4(int, mmu_update, req, count, success_count, domid);
94744+}
94745+
94746+static inline int
94747+HYPERVISOR_mmuext_op(
94748+ struct mmuext_op *op, int count, int *success_count, domid_t domid)
94749+{
94750+ return _hypercall4(int, mmuext_op, op, count, success_count, domid);
94751+}
94752+
94753+static inline int
94754+HYPERVISOR_set_gdt(
94755+ unsigned long *frame_list, int entries)
94756+{
94757+ return _hypercall2(int, set_gdt, frame_list, entries);
94758+}
94759+
94760+static inline int
94761+HYPERVISOR_stack_switch(
94762+ unsigned long ss, unsigned long esp)
94763+{
94764+ return _hypercall2(int, stack_switch, ss, esp);
94765+}
94766+
94767+static inline int
94768+HYPERVISOR_set_callbacks(
94769+ unsigned long event_address, unsigned long failsafe_address,
94770+ unsigned long syscall_address)
94771+{
94772+ return _hypercall3(int, set_callbacks,
94773+ event_address, failsafe_address, syscall_address);
94774+}
94775+
94776+static inline int
94777+HYPERVISOR_fpu_taskswitch(
94778+ int set)
94779+{
94780+ return _hypercall1(int, fpu_taskswitch, set);
94781+}
94782+
94783+static inline int
94784+HYPERVISOR_sched_op_compat(
94785+ int cmd, unsigned long arg)
94786+{
94787+ return _hypercall2(int, sched_op_compat, cmd, arg);
94788+}
94789+
94790+static inline int
94791+HYPERVISOR_sched_op(
94792+ int cmd, void *arg)
94793+{
94794+ return _hypercall2(int, sched_op, cmd, arg);
94795+}
94796+
94797+static inline long
94798+HYPERVISOR_set_timer_op(
94799+ u64 timeout)
94800+{
94801+ return _hypercall1(long, set_timer_op, timeout);
94802+}
94803+
94804+static inline int
94805+HYPERVISOR_dom0_op(
94806+ dom0_op_t *dom0_op)
94807+{
94808+ dom0_op->interface_version = DOM0_INTERFACE_VERSION;
94809+ return _hypercall1(int, dom0_op, dom0_op);
94810+}
94811+
94812+static inline int
94813+HYPERVISOR_set_debugreg(
94814+ int reg, unsigned long value)
94815+{
94816+ return _hypercall2(int, set_debugreg, reg, value);
94817+}
94818+
94819+static inline unsigned long
94820+HYPERVISOR_get_debugreg(
94821+ int reg)
94822+{
94823+ return _hypercall1(unsigned long, get_debugreg, reg);
94824+}
94825+
94826+static inline int
94827+HYPERVISOR_update_descriptor(
94828+ unsigned long ma, unsigned long word)
94829+{
94830+ return _hypercall2(int, update_descriptor, ma, word);
94831+}
94832+
94833+static inline int
94834+HYPERVISOR_memory_op(
94835+ unsigned int cmd, void *arg)
94836+{
94837+ return _hypercall2(int, memory_op, cmd, arg);
94838+}
94839+
94840+static inline int
94841+HYPERVISOR_multicall(
94842+ void *call_list, int nr_calls)
94843+{
94844+ return _hypercall2(int, multicall, call_list, nr_calls);
94845+}
94846+
94847+static inline int
94848+HYPERVISOR_update_va_mapping(
94849+ unsigned long va, pte_t new_val, unsigned long flags)
94850+{
94851+ return _hypercall3(int, update_va_mapping, va, new_val.pte, flags);
94852+}
94853+
94854+static inline int
94855+HYPERVISOR_event_channel_op(
94856+ int cmd, void *arg)
94857+{
94858+ int rc = _hypercall2(int, event_channel_op, cmd, arg);
94859+
94860+#ifdef CONFIG_XEN_COMPAT_030002
94861+ if (unlikely(rc == -ENOSYS)) {
94862+ struct evtchn_op op;
94863+ op.cmd = cmd;
94864+ memcpy(&op.u, arg, sizeof(op.u));
94865+ rc = _hypercall1(int, event_channel_op_compat, &op);
94866+ memcpy(arg, &op.u, sizeof(op.u));
94867+ }
94868+#endif
94869+
94870+ return rc;
94871+}
94872+
94873+static inline int
94874+HYPERVISOR_acm_op(
94875+ int cmd, void *arg)
94876+{
94877+ return _hypercall2(int, acm_op, cmd, arg);
94878+}
94879+
94880+static inline int
94881+HYPERVISOR_xen_version(
94882+ int cmd, void *arg)
94883+{
94884+ return _hypercall2(int, xen_version, cmd, arg);
94885+}
94886+
94887+static inline int
94888+HYPERVISOR_console_io(
94889+ int cmd, int count, char *str)
94890+{
94891+ return _hypercall3(int, console_io, cmd, count, str);
94892+}
94893+
94894+static inline int
94895+HYPERVISOR_physdev_op(
94896+ int cmd, void *arg)
94897+{
94898+ int rc = _hypercall2(int, physdev_op, cmd, arg);
94899+
94900+#ifdef CONFIG_XEN_COMPAT_030002
94901+ if (unlikely(rc == -ENOSYS)) {
94902+ struct physdev_op op;
94903+ op.cmd = cmd;
94904+ memcpy(&op.u, arg, sizeof(op.u));
94905+ rc = _hypercall1(int, physdev_op_compat, &op);
94906+ memcpy(arg, &op.u, sizeof(op.u));
94907+ }
94908+#endif
94909+
94910+ return rc;
94911+}
94912+
94913+static inline int
94914+HYPERVISOR_grant_table_op(
94915+ unsigned int cmd, void *uop, unsigned int count)
94916+{
94917+ return _hypercall3(int, grant_table_op, cmd, uop, count);
94918+}
94919+
94920+static inline int
94921+HYPERVISOR_update_va_mapping_otherdomain(
94922+ unsigned long va, pte_t new_val, unsigned long flags, domid_t domid)
94923+{
94924+ return _hypercall4(int, update_va_mapping_otherdomain, va,
94925+ new_val.pte, flags, domid);
94926+}
94927+
94928+static inline int
94929+HYPERVISOR_vm_assist(
94930+ unsigned int cmd, unsigned int type)
94931+{
94932+ return _hypercall2(int, vm_assist, cmd, type);
94933+}
94934+
94935+static inline int
94936+HYPERVISOR_vcpu_op(
94937+ int cmd, int vcpuid, void *extra_args)
94938+{
94939+ return _hypercall3(int, vcpu_op, cmd, vcpuid, extra_args);
94940+}
94941+
94942+static inline int
94943+HYPERVISOR_set_segment_base(
94944+ int reg, unsigned long value)
94945+{
94946+ return _hypercall2(int, set_segment_base, reg, value);
94947+}
94948+
94949+static inline int
94950+HYPERVISOR_suspend(
94951+ unsigned long srec)
94952+{
94953+ struct sched_shutdown sched_shutdown = {
94954+ .reason = SHUTDOWN_suspend
94955+ };
94956+
94957+ int rc = _hypercall3(int, sched_op, SCHEDOP_shutdown,
94958+ &sched_shutdown, srec);
94959+
94960+#ifdef CONFIG_XEN_COMPAT_030002
94961+ if (rc == -ENOSYS)
94962+ rc = _hypercall3(int, sched_op_compat, SCHEDOP_shutdown,
94963+ SHUTDOWN_suspend, srec);
94964+#endif
94965+
94966+ return rc;
94967+}
94968+
94969+static inline int
94970+HYPERVISOR_nmi_op(
94971+ unsigned long op, void *arg)
94972+{
94973+ return _hypercall2(int, nmi_op, op, arg);
94974+}
94975+
94976+static inline unsigned long
94977+HYPERVISOR_hvm_op(
94978+ int op, void *arg)
94979+{
94980+ return _hypercall2(unsigned long, hvm_op, op, arg);
94981+}
94982+
94983+static inline int
94984+HYPERVISOR_callback_op(
94985+ int cmd, void *arg)
94986+{
94987+ return _hypercall2(int, callback_op, cmd, arg);
94988+}
94989+
94990+static inline int
94991+HYPERVISOR_xenoprof_op(
94992+ int op, void *arg)
94993+{
94994+ return _hypercall2(int, xenoprof_op, op, arg);
94995+}
94996+
94997+static inline int
94998+HYPERVISOR_kexec_op(
94999+ unsigned long op, void *args)
95000+{
95001+ return _hypercall2(int, kexec_op, op, args);
95002+}
95003+
95004+#endif /* __HYPERCALL_H__ */
95005diff -Nur linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/hypervisor.h linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/hypervisor.h
95006--- linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/hypervisor.h 1970-01-01 00:00:00.000000000 +0000
95007+++ linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/hypervisor.h 2007-01-08 15:00:46.000000000 +0000
95008@@ -0,0 +1,2 @@
95009+
95010+#include <asm-i386/mach-xen/asm/hypervisor.h>
95011diff -Nur linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/io.h linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/io.h
95012--- linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/io.h 1970-01-01 00:00:00.000000000 +0000
95013+++ linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/io.h 2007-01-08 15:00:46.000000000 +0000
95014@@ -0,0 +1,361 @@
95015+#ifndef _ASM_IO_H
95016+#define _ASM_IO_H
95017+
95018+#include <linux/config.h>
95019+#include <asm/fixmap.h>
95020+
95021+/*
95022+ * This file contains the definitions for the x86 IO instructions
95023+ * inb/inw/inl/outb/outw/outl and the "string versions" of the same
95024+ * (insb/insw/insl/outsb/outsw/outsl). You can also use "pausing"
95025+ * versions of the single-IO instructions (inb_p/inw_p/..).
95026+ *
95027+ * This file is not meant to be obfuscating: it's just complicated
95028+ * to (a) handle it all in a way that makes gcc able to optimize it
95029+ * as well as possible and (b) trying to avoid writing the same thing
95030+ * over and over again with slight variations and possibly making a
95031+ * mistake somewhere.
95032+ */
95033+
95034+/*
95035+ * Thanks to James van Artsdalen for a better timing-fix than
95036+ * the two short jumps: using outb's to a nonexistent port seems
95037+ * to guarantee better timings even on fast machines.
95038+ *
95039+ * On the other hand, I'd like to be sure of a non-existent port:
95040+ * I feel a bit unsafe about using 0x80 (should be safe, though)
95041+ *
95042+ * Linus
95043+ */
95044+
95045+ /*
95046+ * Bit simplified and optimized by Jan Hubicka
95047+ * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999.
95048+ *
95049+ * isa_memset_io, isa_memcpy_fromio, isa_memcpy_toio added,
95050+ * isa_read[wl] and isa_write[wl] fixed
95051+ * - Arnaldo Carvalho de Melo <acme@conectiva.com.br>
95052+ */
95053+
95054+#define __SLOW_DOWN_IO "\noutb %%al,$0x80"
95055+
95056+#ifdef REALLY_SLOW_IO
95057+#define __FULL_SLOW_DOWN_IO __SLOW_DOWN_IO __SLOW_DOWN_IO __SLOW_DOWN_IO __SLOW_DOWN_IO
95058+#else
95059+#define __FULL_SLOW_DOWN_IO __SLOW_DOWN_IO
95060+#endif
95061+
95062+/*
95063+ * Talk about misusing macros..
95064+ */
95065+#define __OUT1(s,x) \
95066+static inline void out##s(unsigned x value, unsigned short port) {
95067+
95068+#define __OUT2(s,s1,s2) \
95069+__asm__ __volatile__ ("out" #s " %" s1 "0,%" s2 "1"
95070+
95071+#define __OUT(s,s1,x) \
95072+__OUT1(s,x) __OUT2(s,s1,"w") : : "a" (value), "Nd" (port)); } \
95073+__OUT1(s##_p,x) __OUT2(s,s1,"w") __FULL_SLOW_DOWN_IO : : "a" (value), "Nd" (port));} \
95074+
95075+#define __IN1(s) \
95076+static inline RETURN_TYPE in##s(unsigned short port) { RETURN_TYPE _v;
95077+
95078+#define __IN2(s,s1,s2) \
95079+__asm__ __volatile__ ("in" #s " %" s2 "1,%" s1 "0"
95080+
95081+#define __IN(s,s1,i...) \
95082+__IN1(s) __IN2(s,s1,"w") : "=a" (_v) : "Nd" (port) ,##i ); return _v; } \
95083+__IN1(s##_p) __IN2(s,s1,"w") __FULL_SLOW_DOWN_IO : "=a" (_v) : "Nd" (port) ,##i ); return _v; } \
95084+
95085+#define __INS(s) \
95086+static inline void ins##s(unsigned short port, void * addr, unsigned long count) \
95087+{ __asm__ __volatile__ ("rep ; ins" #s \
95088+: "=D" (addr), "=c" (count) : "d" (port),"0" (addr),"1" (count)); }
95089+
95090+#define __OUTS(s) \
95091+static inline void outs##s(unsigned short port, const void * addr, unsigned long count) \
95092+{ __asm__ __volatile__ ("rep ; outs" #s \
95093+: "=S" (addr), "=c" (count) : "d" (port),"0" (addr),"1" (count)); }
95094+
95095+#define RETURN_TYPE unsigned char
95096+__IN(b,"")
95097+#undef RETURN_TYPE
95098+#define RETURN_TYPE unsigned short
95099+__IN(w,"")
95100+#undef RETURN_TYPE
95101+#define RETURN_TYPE unsigned int
95102+__IN(l,"")
95103+#undef RETURN_TYPE
95104+
95105+__OUT(b,"b",char)
95106+__OUT(w,"w",short)
95107+__OUT(l,,int)
95108+
95109+__INS(b)
95110+__INS(w)
95111+__INS(l)
95112+
95113+__OUTS(b)
95114+__OUTS(w)
95115+__OUTS(l)
95116+
95117+#define IO_SPACE_LIMIT 0xffff
95118+
95119+#if defined(__KERNEL__) && __x86_64__
95120+
95121+#include <linux/vmalloc.h>
95122+
95123+#ifndef __i386__
95124+/*
95125+ * Change virtual addresses to physical addresses and vv.
95126+ * These are pretty trivial
95127+ */
95128+static inline unsigned long virt_to_phys(volatile void * address)
95129+{
95130+ return __pa(address);
95131+}
95132+
95133+static inline void * phys_to_virt(unsigned long address)
95134+{
95135+ return __va(address);
95136+}
95137+
95138+#define virt_to_bus(_x) phys_to_machine(__pa(_x))
95139+#define bus_to_virt(_x) __va(machine_to_phys(_x))
95140+#endif
95141+
95142+/*
95143+ * Change "struct page" to physical address.
95144+ */
95145+#define page_to_pseudophys(page) ((dma_addr_t)page_to_pfn(page) << PAGE_SHIFT)
95146+#define page_to_phys(page) (phys_to_machine(page_to_pseudophys(page)))
95147+#define page_to_bus(page) (phys_to_machine(page_to_pseudophys(page)))
95148+
95149+#define bio_to_pseudophys(bio) (page_to_pseudophys(bio_page((bio))) + \
95150+ (unsigned long) bio_offset((bio)))
95151+#define bvec_to_pseudophys(bv) (page_to_pseudophys((bv)->bv_page) + \
95152+ (unsigned long) (bv)->bv_offset)
95153+
95154+#define BIOVEC_PHYS_MERGEABLE(vec1, vec2) \
95155+ (((bvec_to_phys((vec1)) + (vec1)->bv_len) == bvec_to_phys((vec2))) && \
95156+ ((bvec_to_pseudophys((vec1)) + (vec1)->bv_len) == \
95157+ bvec_to_pseudophys((vec2))))
95158+
95159+#include <asm-generic/iomap.h>
95160+
95161+extern void __iomem *__ioremap(unsigned long offset, unsigned long size, unsigned long flags);
95162+
95163+static inline void __iomem * ioremap (unsigned long offset, unsigned long size)
95164+{
95165+ return __ioremap(offset, size, 0);
95166+}
95167+
95168+/*
95169+ * This one maps high address device memory and turns off caching for that area.
95170+ * it's useful if some control registers are in such an area and write combining
95171+ * or read caching is not desirable:
95172+ */
95173+extern void __iomem * ioremap_nocache (unsigned long offset, unsigned long size);
95174+extern void iounmap(volatile void __iomem *addr);
95175+
95176+/* Use normal IO mappings for DMI */
95177+#define dmi_ioremap ioremap
95178+#define dmi_iounmap(x,l) iounmap(x)
95179+#define dmi_alloc(l) kmalloc(l, GFP_ATOMIC)
95180+
95181+/*
95182+ * ISA I/O bus memory addresses are 1:1 with the physical address.
95183+ */
95184+
95185+#define isa_virt_to_bus(_x) isa_virt_to_bus_is_UNSUPPORTED->x
95186+#define isa_page_to_bus(_x) isa_page_to_bus_is_UNSUPPORTED->x
95187+#define isa_bus_to_virt(_x) (void *)(__fix_to_virt(FIX_ISAMAP_BEGIN) + (_x))
95188+
95189+/*
95190+ * However PCI ones are not necessarily 1:1 and therefore these interfaces
95191+ * are forbidden in portable PCI drivers.
95192+ *
95193+ * Allow them on x86 for legacy drivers, though.
95194+ */
95195+#define virt_to_bus(_x) phys_to_machine(__pa(_x))
95196+#define bus_to_virt(_x) __va(machine_to_phys(_x))
95197+
95198+/*
95199+ * readX/writeX() are used to access memory mapped devices. On some
95200+ * architectures the memory mapped IO stuff needs to be accessed
95201+ * differently. On the x86 architecture, we just read/write the
95202+ * memory location directly.
95203+ */
95204+
95205+static inline __u8 __readb(const volatile void __iomem *addr)
95206+{
95207+ return *(__force volatile __u8 *)addr;
95208+}
95209+static inline __u16 __readw(const volatile void __iomem *addr)
95210+{
95211+ return *(__force volatile __u16 *)addr;
95212+}
95213+static inline __u32 __readl(const volatile void __iomem *addr)
95214+{
95215+ return *(__force volatile __u32 *)addr;
95216+}
95217+static inline __u64 __readq(const volatile void __iomem *addr)
95218+{
95219+ return *(__force volatile __u64 *)addr;
95220+}
95221+#define readb(x) __readb(x)
95222+#define readw(x) __readw(x)
95223+#define readl(x) __readl(x)
95224+#define readq(x) __readq(x)
95225+#define readb_relaxed(a) readb(a)
95226+#define readw_relaxed(a) readw(a)
95227+#define readl_relaxed(a) readl(a)
95228+#define readq_relaxed(a) readq(a)
95229+#define __raw_readb readb
95230+#define __raw_readw readw
95231+#define __raw_readl readl
95232+#define __raw_readq readq
95233+
95234+#define mmiowb()
95235+
95236+#ifdef CONFIG_UNORDERED_IO
95237+static inline void __writel(__u32 val, volatile void __iomem *addr)
95238+{
95239+ volatile __u32 __iomem *target = addr;
95240+ asm volatile("movnti %1,%0"
95241+ : "=m" (*target)
95242+ : "r" (val) : "memory");
95243+}
95244+
95245+static inline void __writeq(__u64 val, volatile void __iomem *addr)
95246+{
95247+ volatile __u64 __iomem *target = addr;
95248+ asm volatile("movnti %1,%0"
95249+ : "=m" (*target)
95250+ : "r" (val) : "memory");
95251+}
95252+#else
95253+static inline void __writel(__u32 b, volatile void __iomem *addr)
95254+{
95255+ *(__force volatile __u32 *)addr = b;
95256+}
95257+static inline void __writeq(__u64 b, volatile void __iomem *addr)
95258+{
95259+ *(__force volatile __u64 *)addr = b;
95260+}
95261+#endif
95262+static inline void __writeb(__u8 b, volatile void __iomem *addr)
95263+{
95264+ *(__force volatile __u8 *)addr = b;
95265+}
95266+static inline void __writew(__u16 b, volatile void __iomem *addr)
95267+{
95268+ *(__force volatile __u16 *)addr = b;
95269+}
95270+#define writeq(val,addr) __writeq((val),(addr))
95271+#define writel(val,addr) __writel((val),(addr))
95272+#define writew(val,addr) __writew((val),(addr))
95273+#define writeb(val,addr) __writeb((val),(addr))
95274+#define __raw_writeb writeb
95275+#define __raw_writew writew
95276+#define __raw_writel writel
95277+#define __raw_writeq writeq
95278+
95279+void __memcpy_fromio(void*,unsigned long,unsigned);
95280+void __memcpy_toio(unsigned long,const void*,unsigned);
95281+
95282+static inline void memcpy_fromio(void *to, const volatile void __iomem *from, unsigned len)
95283+{
95284+ __memcpy_fromio(to,(unsigned long)from,len);
95285+}
95286+static inline void memcpy_toio(volatile void __iomem *to, const void *from, unsigned len)
95287+{
95288+ __memcpy_toio((unsigned long)to,from,len);
95289+}
95290+
95291+void memset_io(volatile void __iomem *a, int b, size_t c);
95292+
95293+/*
95294+ * ISA space is 'always mapped' on a typical x86 system, no need to
95295+ * explicitly ioremap() it. The fact that the ISA IO space is mapped
95296+ * to PAGE_OFFSET is pure coincidence - it does not mean ISA values
95297+ * are physical addresses. The following constant pointer can be
95298+ * used as the IO-area pointer (it can be iounmapped as well, so the
95299+ * analogy with PCI is quite large):
95300+ */
95301+#define __ISA_IO_base ((char __iomem *)(fix_to_virt(FIX_ISAMAP_BEGIN)))
95302+
95303+#define isa_readb(a) readb(__ISA_IO_base + (a))
95304+#define isa_readw(a) readw(__ISA_IO_base + (a))
95305+#define isa_readl(a) readl(__ISA_IO_base + (a))
95306+#define isa_writeb(b,a) writeb(b,__ISA_IO_base + (a))
95307+#define isa_writew(w,a) writew(w,__ISA_IO_base + (a))
95308+#define isa_writel(l,a) writel(l,__ISA_IO_base + (a))
95309+#define isa_memset_io(a,b,c) memset_io(__ISA_IO_base + (a),(b),(c))
95310+#define isa_memcpy_fromio(a,b,c) memcpy_fromio((a),__ISA_IO_base + (b),(c))
95311+#define isa_memcpy_toio(a,b,c) memcpy_toio(__ISA_IO_base + (a),(b),(c))
95312+
95313+
95314+/*
95315+ * Again, x86-64 does not require mem IO specific function.
95316+ */
95317+
95318+#define eth_io_copy_and_sum(a,b,c,d) eth_copy_and_sum((a),(void *)(b),(c),(d))
95319+#define isa_eth_io_copy_and_sum(a,b,c,d) eth_copy_and_sum((a),(void *)(__ISA_IO_base + (b)),(c),(d))
95320+
95321+/**
95322+ * check_signature - find BIOS signatures
95323+ * @io_addr: mmio address to check
95324+ * @signature: signature block
95325+ * @length: length of signature
95326+ *
95327+ * Perform a signature comparison with the mmio address io_addr. This
95328+ * address should have been obtained by ioremap.
95329+ * Returns 1 on a match.
95330+ */
95331+
95332+static inline int check_signature(void __iomem *io_addr,
95333+ const unsigned char *signature, int length)
95334+{
95335+ int retval = 0;
95336+ do {
95337+ if (readb(io_addr) != *signature)
95338+ goto out;
95339+ io_addr++;
95340+ signature++;
95341+ length--;
95342+ } while (length);
95343+ retval = 1;
95344+out:
95345+ return retval;
95346+}
95347+
95348+/* Nothing to do */
95349+
95350+#define dma_cache_inv(_start,_size) do { } while (0)
95351+#define dma_cache_wback(_start,_size) do { } while (0)
95352+#define dma_cache_wback_inv(_start,_size) do { } while (0)
95353+
95354+#define flush_write_buffers()
95355+
95356+extern int iommu_bio_merge;
95357+#define BIO_VMERGE_BOUNDARY iommu_bio_merge
95358+
95359+/*
95360+ * Convert a physical pointer to a virtual kernel pointer for /dev/mem
95361+ * access
95362+ */
95363+#define xlate_dev_mem_ptr(p, sz) ioremap(p, sz)
95364+#define xlate_dev_mem_ptr_unmap(p) iounmap(p)
95365+
95366+/*
95367+ * Convert a virtual cached pointer to an uncached pointer
95368+ */
95369+#define xlate_dev_kmem_ptr(p) p
95370+
95371+#endif /* __KERNEL__ */
95372+
95373+#define ARCH_HAS_DEV_MEM
95374+
95375+#endif
95376diff -Nur linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/irq.h linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/irq.h
95377--- linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/irq.h 1970-01-01 00:00:00.000000000 +0000
95378+++ linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/irq.h 2007-01-08 15:00:46.000000000 +0000
95379@@ -0,0 +1,39 @@
95380+#ifndef _ASM_IRQ_H
95381+#define _ASM_IRQ_H
95382+
95383+/*
95384+ * linux/include/asm/irq.h
95385+ *
95386+ * (C) 1992, 1993 Linus Torvalds, (C) 1997 Ingo Molnar
95387+ *
95388+ * IRQ/IPI changes taken from work by Thomas Radke
95389+ * <tomsoft@informatik.tu-chemnitz.de>
95390+ */
95391+
95392+#include <linux/config.h>
95393+#include <linux/sched.h>
95394+/* include comes from machine specific directory */
95395+#include "irq_vectors.h"
95396+#include <asm/thread_info.h>
95397+
95398+static __inline__ int irq_canonicalize(int irq)
95399+{
95400+ return ((irq == 2) ? 9 : irq);
95401+}
95402+
95403+#ifdef CONFIG_X86_LOCAL_APIC
95404+#define ARCH_HAS_NMI_WATCHDOG /* See include/linux/nmi.h */
95405+#endif
95406+
95407+#define KDB_VECTOR 0xf9
95408+
95409+# define irq_ctx_init(cpu) do { } while (0)
95410+
95411+#ifdef CONFIG_HOTPLUG_CPU
95412+#include <linux/cpumask.h>
95413+extern void fixup_irqs(cpumask_t map);
95414+#endif
95415+
95416+#define __ARCH_HAS_DO_SOFTIRQ 1
95417+
95418+#endif /* _ASM_IRQ_H */
95419diff -Nur linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/maddr.h linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/maddr.h
95420--- linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/maddr.h 1970-01-01 00:00:00.000000000 +0000
95421+++ linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/maddr.h 2007-01-08 15:00:46.000000000 +0000
95422@@ -0,0 +1,150 @@
95423+#ifndef _X86_64_MADDR_H
95424+#define _X86_64_MADDR_H
95425+
95426+#include <xen/features.h>
95427+#include <xen/interface/xen.h>
95428+
95429+/**** MACHINE <-> PHYSICAL CONVERSION MACROS ****/
95430+#define INVALID_P2M_ENTRY (~0UL)
95431+#define FOREIGN_FRAME_BIT (1UL<<63)
95432+#define FOREIGN_FRAME(m) ((m) | FOREIGN_FRAME_BIT)
95433+
95434+/* Definitions for machine and pseudophysical addresses. */
95435+typedef unsigned long paddr_t;
95436+typedef unsigned long maddr_t;
95437+
95438+#ifdef CONFIG_XEN
95439+
95440+extern unsigned long *phys_to_machine_mapping;
95441+
95442+#undef machine_to_phys_mapping
95443+extern unsigned long *machine_to_phys_mapping;
95444+extern unsigned int machine_to_phys_order;
95445+
95446+static inline unsigned long pfn_to_mfn(unsigned long pfn)
95447+{
95448+ if (xen_feature(XENFEAT_auto_translated_physmap))
95449+ return pfn;
95450+ return phys_to_machine_mapping[(unsigned int)(pfn)] &
95451+ ~FOREIGN_FRAME_BIT;
95452+}
95453+
95454+static inline int phys_to_machine_mapping_valid(unsigned long pfn)
95455+{
95456+ if (xen_feature(XENFEAT_auto_translated_physmap))
95457+ return 1;
95458+ return (phys_to_machine_mapping[pfn] != INVALID_P2M_ENTRY);
95459+}
95460+
95461+static inline unsigned long mfn_to_pfn(unsigned long mfn)
95462+{
95463+ unsigned long pfn;
95464+
95465+ if (xen_feature(XENFEAT_auto_translated_physmap))
95466+ return mfn;
95467+
95468+ if (unlikely((mfn >> machine_to_phys_order) != 0))
95469+ return end_pfn;
95470+
95471+ /* The array access can fail (e.g., device space beyond end of RAM). */
95472+ asm (
95473+ "1: movq %1,%0\n"
95474+ "2:\n"
95475+ ".section .fixup,\"ax\"\n"
95476+ "3: movq %2,%0\n"
95477+ " jmp 2b\n"
95478+ ".previous\n"
95479+ ".section __ex_table,\"a\"\n"
95480+ " .align 8\n"
95481+ " .quad 1b,3b\n"
95482+ ".previous"
95483+ : "=r" (pfn)
95484+ : "m" (machine_to_phys_mapping[mfn]), "m" (end_pfn) );
95485+
95486+ return pfn;
95487+}
95488+
95489+/*
95490+ * We detect special mappings in one of two ways:
95491+ * 1. If the MFN is an I/O page then Xen will set the m2p entry
95492+ * to be outside our maximum possible pseudophys range.
95493+ * 2. If the MFN belongs to a different domain then we will certainly
95494+ * not have MFN in our p2m table. Conversely, if the page is ours,
95495+ * then we'll have p2m(m2p(MFN))==MFN.
95496+ * If we detect a special mapping then it doesn't have a 'struct page'.
95497+ * We force !pfn_valid() by returning an out-of-range pointer.
95498+ *
95499+ * NB. These checks require that, for any MFN that is not in our reservation,
95500+ * there is no PFN such that p2m(PFN) == MFN. Otherwise we can get confused if
95501+ * we are foreign-mapping the MFN, and the other domain has m2p(MFN) == PFN.
95502+ * Yikes! Various places must poke in INVALID_P2M_ENTRY for safety.
95503+ *
95504+ * NB2. When deliberately mapping foreign pages into the p2m table, you *must*
95505+ * use FOREIGN_FRAME(). This will cause pte_pfn() to choke on it, as we
95506+ * require. In all the cases we care about, the FOREIGN_FRAME bit is
95507+ * masked (e.g., pfn_to_mfn()) so behaviour there is correct.
95508+ */
95509+static inline unsigned long mfn_to_local_pfn(unsigned long mfn)
95510+{
95511+ unsigned long pfn = mfn_to_pfn(mfn);
95512+ if ((pfn < end_pfn)
95513+ && !xen_feature(XENFEAT_auto_translated_physmap)
95514+ && (phys_to_machine_mapping[pfn] != mfn))
95515+ return end_pfn; /* force !pfn_valid() */
95516+ return pfn;
95517+}
95518+
95519+static inline void set_phys_to_machine(unsigned long pfn, unsigned long mfn)
95520+{
95521+ if (xen_feature(XENFEAT_auto_translated_physmap)) {
95522+ BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
95523+ return;
95524+ }
95525+ phys_to_machine_mapping[pfn] = mfn;
95526+}
95527+
95528+static inline maddr_t phys_to_machine(paddr_t phys)
95529+{
95530+ maddr_t machine = pfn_to_mfn(phys >> PAGE_SHIFT);
95531+ machine = (machine << PAGE_SHIFT) | (phys & ~PAGE_MASK);
95532+ return machine;
95533+}
95534+
95535+static inline paddr_t machine_to_phys(maddr_t machine)
95536+{
95537+ paddr_t phys = mfn_to_pfn(machine >> PAGE_SHIFT);
95538+ phys = (phys << PAGE_SHIFT) | (machine & ~PAGE_MASK);
95539+ return phys;
95540+}
95541+
95542+static inline paddr_t pte_machine_to_phys(maddr_t machine)
95543+{
95544+ paddr_t phys;
95545+ phys = mfn_to_pfn((machine & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT);
95546+ phys = (phys << PAGE_SHIFT) | (machine & ~PHYSICAL_PAGE_MASK);
95547+ return phys;
95548+}
95549+
95550+#else /* !CONFIG_XEN */
95551+
95552+#define pfn_to_mfn(pfn) (pfn)
95553+#define mfn_to_pfn(mfn) (mfn)
95554+#define mfn_to_local_pfn(mfn) (mfn)
95555+#define set_phys_to_machine(pfn, mfn) BUG_ON((pfn) != (mfn))
95556+#define phys_to_machine_mapping_valid(pfn) (1)
95557+#define phys_to_machine(phys) ((maddr_t)(phys))
95558+#define machine_to_phys(mach) ((paddr_t)(mach))
95559+#define pte_machine_to_phys(mach) ((paddr_t)(mach))
95560+
95561+#endif /* !CONFIG_XEN */
95562+
95563+/* VIRT <-> MACHINE conversion */
95564+#define virt_to_machine(v) (phys_to_machine(__pa(v)))
95565+#define virt_to_mfn(v) (pfn_to_mfn(__pa(v) >> PAGE_SHIFT))
95566+#define mfn_to_virt(m) (__va(mfn_to_pfn(m) << PAGE_SHIFT))
95567+
95568+#define __pte_ma(x) ((pte_t) { (x) } )
95569+#define pfn_pte_ma(pfn, prot) __pte_ma((((pfn) << PAGE_SHIFT) | pgprot_val(prot)) & __supported_pte_mask)
95570+
95571+#endif /* _X86_64_MADDR_H */
95572+
95573diff -Nur linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/mmu.h linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/mmu.h
95574--- linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/mmu.h 1970-01-01 00:00:00.000000000 +0000
95575+++ linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/mmu.h 2007-01-08 15:00:46.000000000 +0000
95576@@ -0,0 +1,38 @@
95577+#ifndef __x86_64_MMU_H
95578+#define __x86_64_MMU_H
95579+
95580+#include <linux/spinlock.h>
95581+#include <asm/semaphore.h>
95582+
95583+/*
95584+ * The x86_64 doesn't have an mmu context, but
95585+ * we put the segment information here.
95586+ *
95587+ * cpu_vm_mask is used to optimize ldt flushing.
95588+ */
95589+typedef struct {
95590+ void *ldt;
95591+ rwlock_t ldtlock;
95592+ int size;
95593+ struct semaphore sem;
95594+#ifdef CONFIG_XEN
95595+ unsigned pinned:1;
95596+ unsigned has_foreign_mappings:1;
95597+ struct list_head unpinned;
95598+#endif
95599+} mm_context_t;
95600+
95601+#ifdef CONFIG_XEN
95602+extern struct list_head mm_unpinned;
95603+extern spinlock_t mm_unpinned_lock;
95604+
95605+/* mm/memory.c:exit_mmap hook */
95606+extern void _arch_exit_mmap(struct mm_struct *mm);
95607+#define arch_exit_mmap(_mm) _arch_exit_mmap(_mm)
95608+
95609+/* kernel/fork.c:dup_mmap hook */
95610+extern void _arch_dup_mmap(struct mm_struct *mm);
95611+#define arch_dup_mmap(mm, oldmm) ((void)(oldmm), _arch_dup_mmap(mm))
95612+#endif
95613+
95614+#endif
95615diff -Nur linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/mmu_context.h linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/mmu_context.h
95616--- linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/mmu_context.h 1970-01-01 00:00:00.000000000 +0000
95617+++ linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/mmu_context.h 2007-01-08 15:00:46.000000000 +0000
95618@@ -0,0 +1,136 @@
95619+#ifndef __X86_64_MMU_CONTEXT_H
95620+#define __X86_64_MMU_CONTEXT_H
95621+
95622+#include <linux/config.h>
95623+#include <asm/desc.h>
95624+#include <asm/atomic.h>
95625+#include <asm/pgalloc.h>
95626+#include <asm/page.h>
95627+#include <asm/pda.h>
95628+#include <asm/pgtable.h>
95629+#include <asm/tlbflush.h>
95630+
95631+/*
95632+ * possibly do the LDT unload here?
95633+ */
95634+int init_new_context(struct task_struct *tsk, struct mm_struct *mm);
95635+void destroy_context(struct mm_struct *mm);
95636+
95637+static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
95638+{
95639+#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
95640+ if (read_pda(mmu_state) == TLBSTATE_OK)
95641+ write_pda(mmu_state, TLBSTATE_LAZY);
95642+#endif
95643+}
95644+
95645+#define prepare_arch_switch(next) __prepare_arch_switch()
95646+
95647+static inline void __prepare_arch_switch(void)
95648+{
95649+ /*
95650+ * Save away %es, %ds, %fs and %gs. Must happen before reload
95651+ * of cr3/ldt (i.e., not in __switch_to).
95652+ */
95653+ __asm__ __volatile__ (
95654+ "mov %%es,%0 ; mov %%ds,%1 ; mov %%fs,%2 ; mov %%gs,%3"
95655+ : "=m" (current->thread.es),
95656+ "=m" (current->thread.ds),
95657+ "=m" (current->thread.fsindex),
95658+ "=m" (current->thread.gsindex) );
95659+
95660+ if (current->thread.ds)
95661+ __asm__ __volatile__ ( "movl %0,%%ds" : : "r" (0) );
95662+
95663+ if (current->thread.es)
95664+ __asm__ __volatile__ ( "movl %0,%%es" : : "r" (0) );
95665+
95666+ if (current->thread.fsindex) {
95667+ __asm__ __volatile__ ( "movl %0,%%fs" : : "r" (0) );
95668+ current->thread.fs = 0;
95669+ }
95670+
95671+ if (current->thread.gsindex) {
95672+ load_gs_index(0);
95673+ current->thread.gs = 0;
95674+ }
95675+}
95676+
95677+extern void mm_pin(struct mm_struct *mm);
95678+extern void mm_unpin(struct mm_struct *mm);
95679+void mm_pin_all(void);
95680+
95681+static inline void load_cr3(pgd_t *pgd)
95682+{
95683+ asm volatile("movq %0,%%cr3" :: "r" (phys_to_machine(__pa(pgd))) :
95684+ "memory");
95685+}
95686+
95687+static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
95688+ struct task_struct *tsk)
95689+{
95690+ unsigned cpu = smp_processor_id();
95691+ struct mmuext_op _op[3], *op = _op;
95692+
95693+ if (likely(prev != next)) {
95694+ BUG_ON(!next->context.pinned);
95695+
95696+ /* stop flush ipis for the previous mm */
95697+ clear_bit(cpu, &prev->cpu_vm_mask);
95698+#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
95699+ write_pda(mmu_state, TLBSTATE_OK);
95700+ write_pda(active_mm, next);
95701+#endif
95702+ set_bit(cpu, &next->cpu_vm_mask);
95703+
95704+ /* load_cr3(next->pgd) */
95705+ op->cmd = MMUEXT_NEW_BASEPTR;
95706+ op->arg1.mfn = pfn_to_mfn(__pa(next->pgd) >> PAGE_SHIFT);
95707+ op++;
95708+
95709+ /* xen_new_user_pt(__pa(__user_pgd(next->pgd))) */
95710+ op->cmd = MMUEXT_NEW_USER_BASEPTR;
95711+ op->arg1.mfn = pfn_to_mfn(__pa(__user_pgd(next->pgd)) >> PAGE_SHIFT);
95712+ op++;
95713+
95714+ if (unlikely(next->context.ldt != prev->context.ldt)) {
95715+ /* load_LDT_nolock(&next->context, cpu) */
95716+ op->cmd = MMUEXT_SET_LDT;
95717+ op->arg1.linear_addr = (unsigned long)next->context.ldt;
95718+ op->arg2.nr_ents = next->context.size;
95719+ op++;
95720+ }
95721+
95722+ BUG_ON(HYPERVISOR_mmuext_op(_op, op-_op, NULL, DOMID_SELF));
95723+ }
95724+#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
95725+ else {
95726+ write_pda(mmu_state, TLBSTATE_OK);
95727+ if (read_pda(active_mm) != next)
95728+ out_of_line_bug();
95729+ if(!test_and_set_bit(cpu, &next->cpu_vm_mask)) {
95730+ /* We were in lazy tlb mode and leave_mm disabled
95731+ * tlb flush IPI delivery. We must reload CR3
95732+ * to make sure to use no freed page tables.
95733+ */
95734+ load_cr3(next->pgd);
95735+ xen_new_user_pt(__pa(__user_pgd(next->pgd)));
95736+ load_LDT_nolock(&next->context, cpu);
95737+ }
95738+ }
95739+#endif
95740+}
95741+
95742+#define deactivate_mm(tsk,mm) do { \
95743+ load_gs_index(0); \
95744+ asm volatile("movl %0,%%fs"::"r"(0)); \
95745+} while(0)
95746+
95747+static inline void activate_mm(struct mm_struct *prev, struct mm_struct *next)
95748+{
95749+ if (!next->context.pinned)
95750+ mm_pin(next);
95751+ switch_mm(prev, next, NULL);
95752+}
95753+
95754+#endif
95755diff -Nur linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/msr.h linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/msr.h
95756--- linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/msr.h 1970-01-01 00:00:00.000000000 +0000
95757+++ linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/msr.h 2007-01-08 15:00:46.000000000 +0000
95758@@ -0,0 +1,399 @@
95759+#ifndef X86_64_MSR_H
95760+#define X86_64_MSR_H 1
95761+
95762+#ifndef __ASSEMBLY__
95763+/*
95764+ * Access to machine-specific registers (available on 586 and better only)
95765+ * Note: the rd* operations modify the parameters directly (without using
95766+ * pointer indirection), this allows gcc to optimize better
95767+ */
95768+
95769+#define rdmsr(msr,val1,val2) \
95770+ __asm__ __volatile__("rdmsr" \
95771+ : "=a" (val1), "=d" (val2) \
95772+ : "c" (msr))
95773+
95774+
95775+#define rdmsrl(msr,val) do { unsigned long a__,b__; \
95776+ __asm__ __volatile__("rdmsr" \
95777+ : "=a" (a__), "=d" (b__) \
95778+ : "c" (msr)); \
95779+ val = a__ | (b__<<32); \
95780+} while(0)
95781+
95782+#define wrmsr(msr,val1,val2) \
95783+ __asm__ __volatile__("wrmsr" \
95784+ : /* no outputs */ \
95785+ : "c" (msr), "a" (val1), "d" (val2))
95786+
95787+#define wrmsrl(msr,val) wrmsr(msr,(__u32)((__u64)(val)),((__u64)(val))>>32)
95788+
95789+/* wrmsr with exception handling */
95790+#define wrmsr_safe(msr,a,b) ({ int ret__; \
95791+ asm volatile("2: wrmsr ; xorl %0,%0\n" \
95792+ "1:\n\t" \
95793+ ".section .fixup,\"ax\"\n\t" \
95794+ "3: movl %4,%0 ; jmp 1b\n\t" \
95795+ ".previous\n\t" \
95796+ ".section __ex_table,\"a\"\n" \
95797+ " .align 8\n\t" \
95798+ " .quad 2b,3b\n\t" \
95799+ ".previous" \
95800+ : "=a" (ret__) \
95801+ : "c" (msr), "0" (a), "d" (b), "i" (-EFAULT)); \
95802+ ret__; })
95803+
95804+#define checking_wrmsrl(msr,val) wrmsr_safe(msr,(u32)(val),(u32)((val)>>32))
95805+
95806+#define rdmsr_safe(msr,a,b) \
95807+ ({ int ret__; \
95808+ asm volatile ("1: rdmsr\n" \
95809+ "2:\n" \
95810+ ".section .fixup,\"ax\"\n" \
95811+ "3: movl %4,%0\n" \
95812+ " jmp 2b\n" \
95813+ ".previous\n" \
95814+ ".section __ex_table,\"a\"\n" \
95815+ " .align 8\n" \
95816+ " .quad 1b,3b\n" \
95817+ ".previous":"=&bDS" (ret__), "=a"(*(a)), "=d"(*(b))\
95818+ :"c"(msr), "i"(-EIO), "0"(0)); \
95819+ ret__; })
95820+
95821+#define rdtsc(low,high) \
95822+ __asm__ __volatile__("rdtsc" : "=a" (low), "=d" (high))
95823+
95824+#define rdtscl(low) \
95825+ __asm__ __volatile__ ("rdtsc" : "=a" (low) : : "edx")
95826+
95827+#define rdtscll(val) do { \
95828+ unsigned int __a,__d; \
95829+ asm volatile("rdtsc" : "=a" (__a), "=d" (__d)); \
95830+ (val) = ((unsigned long)__a) | (((unsigned long)__d)<<32); \
95831+} while(0)
95832+
95833+#define write_tsc(val1,val2) wrmsr(0x10, val1, val2)
95834+
95835+#define rdpmc(counter,low,high) \
95836+ __asm__ __volatile__("rdpmc" \
95837+ : "=a" (low), "=d" (high) \
95838+ : "c" (counter))
95839+
95840+static inline void cpuid(int op, unsigned int *eax, unsigned int *ebx,
95841+ unsigned int *ecx, unsigned int *edx)
95842+{
95843+ __asm__(XEN_CPUID
95844+ : "=a" (*eax),
95845+ "=b" (*ebx),
95846+ "=c" (*ecx),
95847+ "=d" (*edx)
95848+ : "0" (op));
95849+}
95850+
95851+/* Some CPUID calls want 'count' to be placed in ecx */
95852+static inline void cpuid_count(int op, int count, int *eax, int *ebx, int *ecx,
95853+ int *edx)
95854+{
95855+ __asm__(XEN_CPUID
95856+ : "=a" (*eax),
95857+ "=b" (*ebx),
95858+ "=c" (*ecx),
95859+ "=d" (*edx)
95860+ : "0" (op), "c" (count));
95861+}
95862+
95863+/*
95864+ * CPUID functions returning a single datum
95865+ */
95866+static inline unsigned int cpuid_eax(unsigned int op)
95867+{
95868+ unsigned int eax;
95869+
95870+ __asm__(XEN_CPUID
95871+ : "=a" (eax)
95872+ : "0" (op)
95873+ : "bx", "cx", "dx");
95874+ return eax;
95875+}
95876+static inline unsigned int cpuid_ebx(unsigned int op)
95877+{
95878+ unsigned int eax, ebx;
95879+
95880+ __asm__(XEN_CPUID
95881+ : "=a" (eax), "=b" (ebx)
95882+ : "0" (op)
95883+ : "cx", "dx" );
95884+ return ebx;
95885+}
95886+static inline unsigned int cpuid_ecx(unsigned int op)
95887+{
95888+ unsigned int eax, ecx;
95889+
95890+ __asm__(XEN_CPUID
95891+ : "=a" (eax), "=c" (ecx)
95892+ : "0" (op)
95893+ : "bx", "dx" );
95894+ return ecx;
95895+}
95896+static inline unsigned int cpuid_edx(unsigned int op)
95897+{
95898+ unsigned int eax, edx;
95899+
95900+ __asm__(XEN_CPUID
95901+ : "=a" (eax), "=d" (edx)
95902+ : "0" (op)
95903+ : "bx", "cx");
95904+ return edx;
95905+}
95906+
95907+#define MSR_IA32_UCODE_WRITE 0x79
95908+#define MSR_IA32_UCODE_REV 0x8b
95909+
95910+
95911+#endif
95912+
95913+/* AMD/K8 specific MSRs */
95914+#define MSR_EFER 0xc0000080 /* extended feature register */
95915+#define MSR_STAR 0xc0000081 /* legacy mode SYSCALL target */
95916+#define MSR_LSTAR 0xc0000082 /* long mode SYSCALL target */
95917+#define MSR_CSTAR 0xc0000083 /* compatibility mode SYSCALL target */
95918+#define MSR_SYSCALL_MASK 0xc0000084 /* EFLAGS mask for syscall */
95919+#define MSR_FS_BASE 0xc0000100 /* 64bit FS base */
95920+#define MSR_GS_BASE 0xc0000101 /* 64bit GS base */
95921+#define MSR_KERNEL_GS_BASE 0xc0000102 /* SwapGS GS shadow (or USER_GS from kernel) */
95922+/* EFER bits: */
95923+#define _EFER_SCE 0 /* SYSCALL/SYSRET */
95924+#define _EFER_LME 8 /* Long mode enable */
95925+#define _EFER_LMA 10 /* Long mode active (read-only) */
95926+#define _EFER_NX 11 /* No execute enable */
95927+
95928+#define EFER_SCE (1<<_EFER_SCE)
95929+#define EFER_LME (1<<_EFER_LME)
95930+#define EFER_LMA (1<<_EFER_LMA)
95931+#define EFER_NX (1<<_EFER_NX)
95932+
95933+/* Intel MSRs. Some also available on other CPUs */
95934+#define MSR_IA32_TSC 0x10
95935+#define MSR_IA32_PLATFORM_ID 0x17
95936+
95937+#define MSR_IA32_PERFCTR0 0xc1
95938+#define MSR_IA32_PERFCTR1 0xc2
95939+
95940+#define MSR_MTRRcap 0x0fe
95941+#define MSR_IA32_BBL_CR_CTL 0x119
95942+
95943+#define MSR_IA32_SYSENTER_CS 0x174
95944+#define MSR_IA32_SYSENTER_ESP 0x175
95945+#define MSR_IA32_SYSENTER_EIP 0x176
95946+
95947+#define MSR_IA32_MCG_CAP 0x179
95948+#define MSR_IA32_MCG_STATUS 0x17a
95949+#define MSR_IA32_MCG_CTL 0x17b
95950+
95951+#define MSR_IA32_EVNTSEL0 0x186
95952+#define MSR_IA32_EVNTSEL1 0x187
95953+
95954+#define MSR_IA32_DEBUGCTLMSR 0x1d9
95955+#define MSR_IA32_LASTBRANCHFROMIP 0x1db
95956+#define MSR_IA32_LASTBRANCHTOIP 0x1dc
95957+#define MSR_IA32_LASTINTFROMIP 0x1dd
95958+#define MSR_IA32_LASTINTTOIP 0x1de
95959+
95960+#define MSR_MTRRfix64K_00000 0x250
95961+#define MSR_MTRRfix16K_80000 0x258
95962+#define MSR_MTRRfix16K_A0000 0x259
95963+#define MSR_MTRRfix4K_C0000 0x268
95964+#define MSR_MTRRfix4K_C8000 0x269
95965+#define MSR_MTRRfix4K_D0000 0x26a
95966+#define MSR_MTRRfix4K_D8000 0x26b
95967+#define MSR_MTRRfix4K_E0000 0x26c
95968+#define MSR_MTRRfix4K_E8000 0x26d
95969+#define MSR_MTRRfix4K_F0000 0x26e
95970+#define MSR_MTRRfix4K_F8000 0x26f
95971+#define MSR_MTRRdefType 0x2ff
95972+
95973+#define MSR_IA32_MC0_CTL 0x400
95974+#define MSR_IA32_MC0_STATUS 0x401
95975+#define MSR_IA32_MC0_ADDR 0x402
95976+#define MSR_IA32_MC0_MISC 0x403
95977+
95978+#define MSR_P6_PERFCTR0 0xc1
95979+#define MSR_P6_PERFCTR1 0xc2
95980+#define MSR_P6_EVNTSEL0 0x186
95981+#define MSR_P6_EVNTSEL1 0x187
95982+
95983+/* K7/K8 MSRs. Not complete. See the architecture manual for a more complete list. */
95984+#define MSR_K7_EVNTSEL0 0xC0010000
95985+#define MSR_K7_PERFCTR0 0xC0010004
95986+#define MSR_K7_EVNTSEL1 0xC0010001
95987+#define MSR_K7_PERFCTR1 0xC0010005
95988+#define MSR_K7_EVNTSEL2 0xC0010002
95989+#define MSR_K7_PERFCTR2 0xC0010006
95990+#define MSR_K7_EVNTSEL3 0xC0010003
95991+#define MSR_K7_PERFCTR3 0xC0010007
95992+#define MSR_K8_TOP_MEM1 0xC001001A
95993+#define MSR_K8_TOP_MEM2 0xC001001D
95994+#define MSR_K8_SYSCFG 0xC0010010
95995+#define MSR_K8_HWCR 0xC0010015
95996+
95997+/* K6 MSRs */
95998+#define MSR_K6_EFER 0xC0000080
95999+#define MSR_K6_STAR 0xC0000081
96000+#define MSR_K6_WHCR 0xC0000082
96001+#define MSR_K6_UWCCR 0xC0000085
96002+#define MSR_K6_PSOR 0xC0000087
96003+#define MSR_K6_PFIR 0xC0000088
96004+
96005+/* Centaur-Hauls/IDT defined MSRs. */
96006+#define MSR_IDT_FCR1 0x107
96007+#define MSR_IDT_FCR2 0x108
96008+#define MSR_IDT_FCR3 0x109
96009+#define MSR_IDT_FCR4 0x10a
96010+
96011+#define MSR_IDT_MCR0 0x110
96012+#define MSR_IDT_MCR1 0x111
96013+#define MSR_IDT_MCR2 0x112
96014+#define MSR_IDT_MCR3 0x113
96015+#define MSR_IDT_MCR4 0x114
96016+#define MSR_IDT_MCR5 0x115
96017+#define MSR_IDT_MCR6 0x116
96018+#define MSR_IDT_MCR7 0x117
96019+#define MSR_IDT_MCR_CTRL 0x120
96020+
96021+/* VIA Cyrix defined MSRs*/
96022+#define MSR_VIA_FCR 0x1107
96023+#define MSR_VIA_LONGHAUL 0x110a
96024+#define MSR_VIA_RNG 0x110b
96025+#define MSR_VIA_BCR2 0x1147
96026+
96027+/* Intel defined MSRs. */
96028+#define MSR_IA32_P5_MC_ADDR 0
96029+#define MSR_IA32_P5_MC_TYPE 1
96030+#define MSR_IA32_PLATFORM_ID 0x17
96031+#define MSR_IA32_EBL_CR_POWERON 0x2a
96032+
96033+#define MSR_IA32_APICBASE 0x1b
96034+#define MSR_IA32_APICBASE_BSP (1<<8)
96035+#define MSR_IA32_APICBASE_ENABLE (1<<11)
96036+#define MSR_IA32_APICBASE_BASE (0xfffff<<12)
96037+
96038+/* P4/Xeon+ specific */
96039+#define MSR_IA32_MCG_EAX 0x180
96040+#define MSR_IA32_MCG_EBX 0x181
96041+#define MSR_IA32_MCG_ECX 0x182
96042+#define MSR_IA32_MCG_EDX 0x183
96043+#define MSR_IA32_MCG_ESI 0x184
96044+#define MSR_IA32_MCG_EDI 0x185
96045+#define MSR_IA32_MCG_EBP 0x186
96046+#define MSR_IA32_MCG_ESP 0x187
96047+#define MSR_IA32_MCG_EFLAGS 0x188
96048+#define MSR_IA32_MCG_EIP 0x189
96049+#define MSR_IA32_MCG_RESERVED 0x18A
96050+
96051+#define MSR_P6_EVNTSEL0 0x186
96052+#define MSR_P6_EVNTSEL1 0x187
96053+
96054+#define MSR_IA32_PERF_STATUS 0x198
96055+#define MSR_IA32_PERF_CTL 0x199
96056+
96057+#define MSR_IA32_THERM_CONTROL 0x19a
96058+#define MSR_IA32_THERM_INTERRUPT 0x19b
96059+#define MSR_IA32_THERM_STATUS 0x19c
96060+#define MSR_IA32_MISC_ENABLE 0x1a0
96061+
96062+#define MSR_IA32_DEBUGCTLMSR 0x1d9
96063+#define MSR_IA32_LASTBRANCHFROMIP 0x1db
96064+#define MSR_IA32_LASTBRANCHTOIP 0x1dc
96065+#define MSR_IA32_LASTINTFROMIP 0x1dd
96066+#define MSR_IA32_LASTINTTOIP 0x1de
96067+
96068+#define MSR_IA32_MC0_CTL 0x400
96069+#define MSR_IA32_MC0_STATUS 0x401
96070+#define MSR_IA32_MC0_ADDR 0x402
96071+#define MSR_IA32_MC0_MISC 0x403
96072+
96073+/* Pentium IV performance counter MSRs */
96074+#define MSR_P4_BPU_PERFCTR0 0x300
96075+#define MSR_P4_BPU_PERFCTR1 0x301
96076+#define MSR_P4_BPU_PERFCTR2 0x302
96077+#define MSR_P4_BPU_PERFCTR3 0x303
96078+#define MSR_P4_MS_PERFCTR0 0x304
96079+#define MSR_P4_MS_PERFCTR1 0x305
96080+#define MSR_P4_MS_PERFCTR2 0x306
96081+#define MSR_P4_MS_PERFCTR3 0x307
96082+#define MSR_P4_FLAME_PERFCTR0 0x308
96083+#define MSR_P4_FLAME_PERFCTR1 0x309
96084+#define MSR_P4_FLAME_PERFCTR2 0x30a
96085+#define MSR_P4_FLAME_PERFCTR3 0x30b
96086+#define MSR_P4_IQ_PERFCTR0 0x30c
96087+#define MSR_P4_IQ_PERFCTR1 0x30d
96088+#define MSR_P4_IQ_PERFCTR2 0x30e
96089+#define MSR_P4_IQ_PERFCTR3 0x30f
96090+#define MSR_P4_IQ_PERFCTR4 0x310
96091+#define MSR_P4_IQ_PERFCTR5 0x311
96092+#define MSR_P4_BPU_CCCR0 0x360
96093+#define MSR_P4_BPU_CCCR1 0x361
96094+#define MSR_P4_BPU_CCCR2 0x362
96095+#define MSR_P4_BPU_CCCR3 0x363
96096+#define MSR_P4_MS_CCCR0 0x364
96097+#define MSR_P4_MS_CCCR1 0x365
96098+#define MSR_P4_MS_CCCR2 0x366
96099+#define MSR_P4_MS_CCCR3 0x367
96100+#define MSR_P4_FLAME_CCCR0 0x368
96101+#define MSR_P4_FLAME_CCCR1 0x369
96102+#define MSR_P4_FLAME_CCCR2 0x36a
96103+#define MSR_P4_FLAME_CCCR3 0x36b
96104+#define MSR_P4_IQ_CCCR0 0x36c
96105+#define MSR_P4_IQ_CCCR1 0x36d
96106+#define MSR_P4_IQ_CCCR2 0x36e
96107+#define MSR_P4_IQ_CCCR3 0x36f
96108+#define MSR_P4_IQ_CCCR4 0x370
96109+#define MSR_P4_IQ_CCCR5 0x371
96110+#define MSR_P4_ALF_ESCR0 0x3ca
96111+#define MSR_P4_ALF_ESCR1 0x3cb
96112+#define MSR_P4_BPU_ESCR0 0x3b2
96113+#define MSR_P4_BPU_ESCR1 0x3b3
96114+#define MSR_P4_BSU_ESCR0 0x3a0
96115+#define MSR_P4_BSU_ESCR1 0x3a1
96116+#define MSR_P4_CRU_ESCR0 0x3b8
96117+#define MSR_P4_CRU_ESCR1 0x3b9
96118+#define MSR_P4_CRU_ESCR2 0x3cc
96119+#define MSR_P4_CRU_ESCR3 0x3cd
96120+#define MSR_P4_CRU_ESCR4 0x3e0
96121+#define MSR_P4_CRU_ESCR5 0x3e1
96122+#define MSR_P4_DAC_ESCR0 0x3a8
96123+#define MSR_P4_DAC_ESCR1 0x3a9
96124+#define MSR_P4_FIRM_ESCR0 0x3a4
96125+#define MSR_P4_FIRM_ESCR1 0x3a5
96126+#define MSR_P4_FLAME_ESCR0 0x3a6
96127+#define MSR_P4_FLAME_ESCR1 0x3a7
96128+#define MSR_P4_FSB_ESCR0 0x3a2
96129+#define MSR_P4_FSB_ESCR1 0x3a3
96130+#define MSR_P4_IQ_ESCR0 0x3ba
96131+#define MSR_P4_IQ_ESCR1 0x3bb
96132+#define MSR_P4_IS_ESCR0 0x3b4
96133+#define MSR_P4_IS_ESCR1 0x3b5
96134+#define MSR_P4_ITLB_ESCR0 0x3b6
96135+#define MSR_P4_ITLB_ESCR1 0x3b7
96136+#define MSR_P4_IX_ESCR0 0x3c8
96137+#define MSR_P4_IX_ESCR1 0x3c9
96138+#define MSR_P4_MOB_ESCR0 0x3aa
96139+#define MSR_P4_MOB_ESCR1 0x3ab
96140+#define MSR_P4_MS_ESCR0 0x3c0
96141+#define MSR_P4_MS_ESCR1 0x3c1
96142+#define MSR_P4_PMH_ESCR0 0x3ac
96143+#define MSR_P4_PMH_ESCR1 0x3ad
96144+#define MSR_P4_RAT_ESCR0 0x3bc
96145+#define MSR_P4_RAT_ESCR1 0x3bd
96146+#define MSR_P4_SAAT_ESCR0 0x3ae
96147+#define MSR_P4_SAAT_ESCR1 0x3af
96148+#define MSR_P4_SSU_ESCR0 0x3be
96149+#define MSR_P4_SSU_ESCR1 0x3bf /* guess: not defined in manual */
96150+#define MSR_P4_TBPU_ESCR0 0x3c2
96151+#define MSR_P4_TBPU_ESCR1 0x3c3
96152+#define MSR_P4_TC_ESCR0 0x3c4
96153+#define MSR_P4_TC_ESCR1 0x3c5
96154+#define MSR_P4_U2L_ESCR0 0x3b0
96155+#define MSR_P4_U2L_ESCR1 0x3b1
96156+
96157+#endif
96158diff -Nur linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/nmi.h linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/nmi.h
96159--- linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/nmi.h 1970-01-01 00:00:00.000000000 +0000
96160+++ linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/nmi.h 2007-01-08 15:00:46.000000000 +0000
96161@@ -0,0 +1,75 @@
96162+/*
96163+ * linux/include/asm-i386/nmi.h
96164+ */
96165+#ifndef ASM_NMI_H
96166+#define ASM_NMI_H
96167+
96168+#include <linux/pm.h>
96169+
96170+#include <xen/interface/nmi.h>
96171+
96172+struct pt_regs;
96173+
96174+typedef int (*nmi_callback_t)(struct pt_regs * regs, int cpu);
96175+
96176+/**
96177+ * set_nmi_callback
96178+ *
96179+ * Set a handler for an NMI. Only one handler may be
96180+ * set. Return 1 if the NMI was handled.
96181+ */
96182+void set_nmi_callback(nmi_callback_t callback);
96183+
96184+/**
96185+ * unset_nmi_callback
96186+ *
96187+ * Remove the handler previously set.
96188+ */
96189+void unset_nmi_callback(void);
96190+
96191+#ifdef CONFIG_PM
96192+
96193+/** Replace the PM callback routine for NMI. */
96194+struct pm_dev * set_nmi_pm_callback(pm_callback callback);
96195+
96196+/** Unset the PM callback routine back to the default. */
96197+void unset_nmi_pm_callback(struct pm_dev * dev);
96198+
96199+#else
96200+
96201+static inline struct pm_dev * set_nmi_pm_callback(pm_callback callback)
96202+{
96203+ return 0;
96204+}
96205+
96206+static inline void unset_nmi_pm_callback(struct pm_dev * dev)
96207+{
96208+}
96209+
96210+#endif /* CONFIG_PM */
96211+
96212+extern void default_do_nmi(struct pt_regs *);
96213+extern void die_nmi(char *str, struct pt_regs *regs);
96214+
96215+static inline unsigned char get_nmi_reason(void)
96216+{
96217+ shared_info_t *s = HYPERVISOR_shared_info;
96218+ unsigned char reason = 0;
96219+
96220+ /* construct a value which looks like it came from
96221+ * port 0x61.
96222+ */
96223+ if (test_bit(_XEN_NMIREASON_io_error, &s->arch.nmi_reason))
96224+ reason |= 0x40;
96225+ if (test_bit(_XEN_NMIREASON_parity_error, &s->arch.nmi_reason))
96226+ reason |= 0x80;
96227+
96228+ return reason;
96229+}
96230+
96231+extern int panic_on_timeout;
96232+extern int unknown_nmi_panic;
96233+
96234+extern int check_nmi_watchdog(void);
96235+
96236+#endif /* ASM_NMI_H */
96237diff -Nur linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/page.h linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/page.h
96238--- linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/page.h 1970-01-01 00:00:00.000000000 +0000
96239+++ linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/page.h 2007-01-08 15:00:46.000000000 +0000
96240@@ -0,0 +1,211 @@
96241+#ifndef _X86_64_PAGE_H
96242+#define _X86_64_PAGE_H
96243+
96244+#include <linux/config.h>
96245+/* #include <linux/string.h> */
96246+#ifndef __ASSEMBLY__
96247+#include <linux/kernel.h>
96248+#include <linux/types.h>
96249+#include <asm/bug.h>
96250+#endif
96251+#include <xen/interface/xen.h>
96252+#include <xen/foreign_page.h>
96253+
96254+#define arch_free_page(_page,_order) \
96255+({ int foreign = PageForeign(_page); \
96256+ if (foreign) \
96257+ (PageForeignDestructor(_page))(_page); \
96258+ foreign; \
96259+})
96260+#define HAVE_ARCH_FREE_PAGE
96261+
96262+#ifdef CONFIG_XEN_SCRUB_PAGES
96263+#define scrub_pages(_p,_n) memset((void *)(_p), 0, (_n) << PAGE_SHIFT)
96264+#else
96265+#define scrub_pages(_p,_n) ((void)0)
96266+#endif
96267+
96268+/* PAGE_SHIFT determines the page size */
96269+#define PAGE_SHIFT 12
96270+#ifdef __ASSEMBLY__
96271+#define PAGE_SIZE (0x1 << PAGE_SHIFT)
96272+#else
96273+#define PAGE_SIZE (1UL << PAGE_SHIFT)
96274+#endif
96275+#define PAGE_MASK (~(PAGE_SIZE-1))
96276+
96277+/* See Documentation/x86_64/mm.txt for a description of the memory map. */
96278+#define __PHYSICAL_MASK_SHIFT 46
96279+#define __PHYSICAL_MASK ((1UL << __PHYSICAL_MASK_SHIFT) - 1)
96280+#define __VIRTUAL_MASK_SHIFT 48
96281+#define __VIRTUAL_MASK ((1UL << __VIRTUAL_MASK_SHIFT) - 1)
96282+
96283+#define PHYSICAL_PAGE_MASK (~(PAGE_SIZE-1) & __PHYSICAL_MASK)
96284+
96285+#define THREAD_ORDER 1
96286+#define THREAD_SIZE (PAGE_SIZE << THREAD_ORDER)
96287+#define CURRENT_MASK (~(THREAD_SIZE-1))
96288+
96289+#define EXCEPTION_STACK_ORDER 0
96290+#define EXCEPTION_STKSZ (PAGE_SIZE << EXCEPTION_STACK_ORDER)
96291+
96292+#define DEBUG_STACK_ORDER EXCEPTION_STACK_ORDER
96293+#define DEBUG_STKSZ (PAGE_SIZE << DEBUG_STACK_ORDER)
96294+
96295+#define IRQSTACK_ORDER 2
96296+#define IRQSTACKSIZE (PAGE_SIZE << IRQSTACK_ORDER)
96297+
96298+#define STACKFAULT_STACK 1
96299+#define DOUBLEFAULT_STACK 2
96300+#define NMI_STACK 3
96301+#define DEBUG_STACK 4
96302+#define MCE_STACK 5
96303+#define N_EXCEPTION_STACKS 5 /* hw limit: 7 */
96304+
96305+#define LARGE_PAGE_MASK (~(LARGE_PAGE_SIZE-1))
96306+#define LARGE_PAGE_SIZE (1UL << PMD_SHIFT)
96307+
96308+#define HPAGE_SHIFT PMD_SHIFT
96309+#define HPAGE_SIZE ((1UL) << HPAGE_SHIFT)
96310+#define HPAGE_MASK (~(HPAGE_SIZE - 1))
96311+#define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT)
96312+
96313+#ifdef __KERNEL__
96314+#ifndef __ASSEMBLY__
96315+
96316+extern unsigned long end_pfn;
96317+
96318+#include <asm/maddr.h>
96319+
96320+void clear_page(void *);
96321+void copy_page(void *, void *);
96322+
96323+#define clear_user_page(page, vaddr, pg) clear_page(page)
96324+#define copy_user_page(to, from, vaddr, pg) copy_page(to, from)
96325+
96326+#define alloc_zeroed_user_highpage(vma, vaddr) alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, vaddr)
96327+#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
96328+
96329+/*
96330+ * These are used to make use of C type-checking..
96331+ */
96332+typedef struct { unsigned long pte; } pte_t;
96333+typedef struct { unsigned long pmd; } pmd_t;
96334+typedef struct { unsigned long pud; } pud_t;
96335+typedef struct { unsigned long pgd; } pgd_t;
96336+#define PTE_MASK PHYSICAL_PAGE_MASK
96337+
96338+typedef struct { unsigned long pgprot; } pgprot_t;
96339+
96340+#define pte_val(x) (((x).pte & 1) ? pte_machine_to_phys((x).pte) : \
96341+ (x).pte)
96342+#define pte_val_ma(x) ((x).pte)
96343+
96344+static inline unsigned long pmd_val(pmd_t x)
96345+{
96346+ unsigned long ret = x.pmd;
96347+ if (ret) ret = pte_machine_to_phys(ret);
96348+ return ret;
96349+}
96350+
96351+static inline unsigned long pud_val(pud_t x)
96352+{
96353+ unsigned long ret = x.pud;
96354+ if (ret) ret = pte_machine_to_phys(ret);
96355+ return ret;
96356+}
96357+
96358+static inline unsigned long pgd_val(pgd_t x)
96359+{
96360+ unsigned long ret = x.pgd;
96361+ if (ret) ret = pte_machine_to_phys(ret);
96362+ return ret;
96363+}
96364+
96365+#define pgprot_val(x) ((x).pgprot)
96366+
96367+static inline pte_t __pte(unsigned long x)
96368+{
96369+ if (x & 1) x = phys_to_machine(x);
96370+ return ((pte_t) { (x) });
96371+}
96372+
96373+static inline pmd_t __pmd(unsigned long x)
96374+{
96375+ if ((x & 1)) x = phys_to_machine(x);
96376+ return ((pmd_t) { (x) });
96377+}
96378+
96379+static inline pud_t __pud(unsigned long x)
96380+{
96381+ if ((x & 1)) x = phys_to_machine(x);
96382+ return ((pud_t) { (x) });
96383+}
96384+
96385+static inline pgd_t __pgd(unsigned long x)
96386+{
96387+ if ((x & 1)) x = phys_to_machine(x);
96388+ return ((pgd_t) { (x) });
96389+}
96390+
96391+#define __pgprot(x) ((pgprot_t) { (x) } )
96392+
96393+#define __PHYSICAL_START ((unsigned long)CONFIG_PHYSICAL_START)
96394+#define __START_KERNEL (__START_KERNEL_map + __PHYSICAL_START)
96395+#define __START_KERNEL_map 0xffffffff80000000UL
96396+#define __PAGE_OFFSET 0xffff880000000000UL
96397+
96398+#else
96399+#define __PHYSICAL_START CONFIG_PHYSICAL_START
96400+#define __START_KERNEL (__START_KERNEL_map + __PHYSICAL_START)
96401+#define __START_KERNEL_map 0xffffffff80000000
96402+#define __PAGE_OFFSET 0xffff880000000000
96403+#endif /* !__ASSEMBLY__ */
96404+
96405+#ifdef CONFIG_XEN_COMPAT_030002
96406+#undef LOAD_OFFSET
96407+#define LOAD_OFFSET 0
96408+#endif /* CONFIG_XEN_COMPAT_030002 */
96409+
96410+/* to align the pointer to the (next) page boundary */
96411+#define PAGE_ALIGN(addr) (((addr)+PAGE_SIZE-1)&PAGE_MASK)
96412+
96413+#define KERNEL_TEXT_SIZE (40UL*1024*1024)
96414+#define KERNEL_TEXT_START 0xffffffff80000000UL
96415+
96416+#define PAGE_OFFSET ((unsigned long)__PAGE_OFFSET)
96417+
96418+/* Note: __pa(&symbol_visible_to_c) should be always replaced with __pa_symbol.
96419+ Otherwise you risk miscompilation. */
96420+#define __pa(x) (((unsigned long)(x)>=__START_KERNEL_map)?(unsigned long)(x) - (unsigned long)__START_KERNEL_map:(unsigned long)(x) - PAGE_OFFSET)
96421+/* __pa_symbol should be used for C visible symbols.
96422+ This seems to be the official gcc blessed way to do such arithmetic. */
96423+#define __pa_symbol(x) \
96424+ ({unsigned long v; \
96425+ asm("" : "=r" (v) : "0" (x)); \
96426+ __pa(v); })
96427+
96428+#define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET))
96429+#define __boot_va(x) __va(x)
96430+#define __boot_pa(x) __pa(x)
96431+#ifdef CONFIG_FLATMEM
96432+#define pfn_to_page(pfn) (mem_map + (pfn))
96433+#define page_to_pfn(page) ((unsigned long)((page) - mem_map))
96434+#define pfn_valid(pfn) ((pfn) < end_pfn)
96435+#endif
96436+
96437+#define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)
96438+#define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT)
96439+#define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT)
96440+
96441+#define VM_DATA_DEFAULT_FLAGS \
96442+ (((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0 ) | \
96443+ VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
96444+
96445+#define __HAVE_ARCH_GATE_AREA 1
96446+
96447+#endif /* __KERNEL__ */
96448+
96449+#include <asm-generic/page.h>
96450+
96451+#endif /* _X86_64_PAGE_H */
96452diff -Nur linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/pci.h linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/pci.h
96453--- linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/pci.h 1970-01-01 00:00:00.000000000 +0000
96454+++ linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/pci.h 2007-01-08 15:00:46.000000000 +0000
96455@@ -0,0 +1,174 @@
96456+#ifndef __x8664_PCI_H
96457+#define __x8664_PCI_H
96458+
96459+#include <linux/config.h>
96460+#include <asm/io.h>
96461+
96462+#ifdef __KERNEL__
96463+
96464+#include <linux/mm.h> /* for struct page */
96465+
96466+/* Can be used to override the logic in pci_scan_bus for skipping
96467+ already-configured bus numbers - to be used for buggy BIOSes
96468+ or architectures with incomplete PCI setup by the loader */
96469+
96470+#ifdef CONFIG_PCI
96471+extern unsigned int pcibios_assign_all_busses(void);
96472+#else
96473+#define pcibios_assign_all_busses() 0
96474+#endif
96475+#define pcibios_scan_all_fns(a, b) 0
96476+
96477+extern unsigned long pci_mem_start;
96478+#define PCIBIOS_MIN_IO 0x1000
96479+#define PCIBIOS_MIN_MEM (pci_mem_start)
96480+
96481+#define PCIBIOS_MIN_CARDBUS_IO 0x4000
96482+
96483+void pcibios_config_init(void);
96484+struct pci_bus * pcibios_scan_root(int bus);
96485+extern int (*pci_config_read)(int seg, int bus, int dev, int fn, int reg, int len, u32 *value);
96486+extern int (*pci_config_write)(int seg, int bus, int dev, int fn, int reg, int len, u32 value);
96487+
96488+void pcibios_set_master(struct pci_dev *dev);
96489+void pcibios_penalize_isa_irq(int irq, int active);
96490+struct irq_routing_table *pcibios_get_irq_routing_table(void);
96491+int pcibios_set_irq_routing(struct pci_dev *dev, int pin, int irq);
96492+
96493+#include <linux/types.h>
96494+#include <linux/slab.h>
96495+#include <asm/scatterlist.h>
96496+#include <linux/string.h>
96497+#include <asm/page.h>
96498+#include <linux/dma-mapping.h> /* for have_iommu */
96499+
96500+extern int iommu_setup(char *opt);
96501+
96502+/* The PCI address space does equal the physical memory
96503+ * address space. The networking and block device layers use
96504+ * this boolean for bounce buffer decisions
96505+ *
96506+ * On AMD64 it mostly equals, but we set it to zero if a hardware
96507+ * IOMMU (gart) or software IOMMU (swiotlb) is available.
96508+ */
96509+#define PCI_DMA_BUS_IS_PHYS (dma_ops->is_phys)
96510+
96511+#ifdef CONFIG_GART_IOMMU
96512+
96513+/*
96514+ * x86-64 always supports DAC, but sometimes it is useful to force
96515+ * devices through the IOMMU to get automatic sg list merging.
96516+ * Optional right now.
96517+ */
96518+extern int iommu_sac_force;
96519+#define pci_dac_dma_supported(pci_dev, mask) (!iommu_sac_force)
96520+
96521+#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME) \
96522+ dma_addr_t ADDR_NAME;
96523+#define DECLARE_PCI_UNMAP_LEN(LEN_NAME) \
96524+ __u32 LEN_NAME;
96525+#define pci_unmap_addr(PTR, ADDR_NAME) \
96526+ ((PTR)->ADDR_NAME)
96527+#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL) \
96528+ (((PTR)->ADDR_NAME) = (VAL))
96529+#define pci_unmap_len(PTR, LEN_NAME) \
96530+ ((PTR)->LEN_NAME)
96531+#define pci_unmap_len_set(PTR, LEN_NAME, VAL) \
96532+ (((PTR)->LEN_NAME) = (VAL))
96533+
96534+#elif defined(CONFIG_SWIOTLB)
96535+
96536+#define pci_dac_dma_supported(pci_dev, mask) 1
96537+
96538+#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME) \
96539+ dma_addr_t ADDR_NAME;
96540+#define DECLARE_PCI_UNMAP_LEN(LEN_NAME) \
96541+ __u32 LEN_NAME;
96542+#define pci_unmap_addr(PTR, ADDR_NAME) \
96543+ ((PTR)->ADDR_NAME)
96544+#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL) \
96545+ (((PTR)->ADDR_NAME) = (VAL))
96546+#define pci_unmap_len(PTR, LEN_NAME) \
96547+ ((PTR)->LEN_NAME)
96548+#define pci_unmap_len_set(PTR, LEN_NAME, VAL) \
96549+ (((PTR)->LEN_NAME) = (VAL))
96550+
96551+#else
96552+/* No IOMMU */
96553+
96554+#define pci_dac_dma_supported(pci_dev, mask) 1
96555+
96556+#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME)
96557+#define DECLARE_PCI_UNMAP_LEN(LEN_NAME)
96558+#define pci_unmap_addr(PTR, ADDR_NAME) (0)
96559+#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL) do { } while (0)
96560+#define pci_unmap_len(PTR, LEN_NAME) (0)
96561+#define pci_unmap_len_set(PTR, LEN_NAME, VAL) do { } while (0)
96562+
96563+#endif
96564+
96565+#include <asm-generic/pci-dma-compat.h>
96566+
96567+static inline dma64_addr_t
96568+pci_dac_page_to_dma(struct pci_dev *pdev, struct page *page, unsigned long offset, int direction)
96569+{
96570+ return ((dma64_addr_t) page_to_phys(page) +
96571+ (dma64_addr_t) offset);
96572+}
96573+
96574+static inline struct page *
96575+pci_dac_dma_to_page(struct pci_dev *pdev, dma64_addr_t dma_addr)
96576+{
96577+ return virt_to_page(__va(dma_addr));
96578+}
96579+
96580+static inline unsigned long
96581+pci_dac_dma_to_offset(struct pci_dev *pdev, dma64_addr_t dma_addr)
96582+{
96583+ return (dma_addr & ~PAGE_MASK);
96584+}
96585+
96586+static inline void
96587+pci_dac_dma_sync_single_for_cpu(struct pci_dev *pdev, dma64_addr_t dma_addr, size_t len, int direction)
96588+{
96589+}
96590+
96591+static inline void
96592+pci_dac_dma_sync_single_for_device(struct pci_dev *pdev, dma64_addr_t dma_addr, size_t len, int direction)
96593+{
96594+ flush_write_buffers();
96595+}
96596+
96597+#ifdef CONFIG_PCI
96598+static inline void pci_dma_burst_advice(struct pci_dev *pdev,
96599+ enum pci_dma_burst_strategy *strat,
96600+ unsigned long *strategy_parameter)
96601+{
96602+ *strat = PCI_DMA_BURST_INFINITY;
96603+ *strategy_parameter = ~0UL;
96604+}
96605+#endif
96606+
96607+#define HAVE_PCI_MMAP
96608+extern int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
96609+ enum pci_mmap_state mmap_state, int write_combine);
96610+
96611+static inline void pcibios_add_platform_entries(struct pci_dev *dev)
96612+{
96613+}
96614+
96615+#endif /* __KERNEL__ */
96616+
96617+/* generic pci stuff */
96618+#ifdef CONFIG_PCI
96619+#include <asm-generic/pci.h>
96620+#endif
96621+
96622+/* On Xen we have to scan all functions since Xen hides bridges from
96623+ * us. If a bridge is at fn=0 and that slot has a multifunction
96624+ * device, we won't find the additional devices without scanning all
96625+ * functions. */
96626+#undef pcibios_scan_all_fns
96627+#define pcibios_scan_all_fns(a, b) 1
96628+
96629+#endif /* __x8664_PCI_H */
96630diff -Nur linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/pgalloc.h linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/pgalloc.h
96631--- linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/pgalloc.h 1970-01-01 00:00:00.000000000 +0000
96632+++ linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/pgalloc.h 2007-01-08 15:00:46.000000000 +0000
96633@@ -0,0 +1,198 @@
96634+#ifndef _X86_64_PGALLOC_H
96635+#define _X86_64_PGALLOC_H
96636+
96637+#include <asm/fixmap.h>
96638+#include <asm/pda.h>
96639+#include <linux/threads.h>
96640+#include <linux/mm.h>
96641+#include <asm/io.h> /* for phys_to_virt and page_to_pseudophys */
96642+
96643+#include <xen/features.h>
96644+void make_page_readonly(void *va, unsigned int feature);
96645+void make_page_writable(void *va, unsigned int feature);
96646+void make_pages_readonly(void *va, unsigned int nr, unsigned int feature);
96647+void make_pages_writable(void *va, unsigned int nr, unsigned int feature);
96648+
96649+#define __user_pgd(pgd) ((pgd) + PTRS_PER_PGD)
96650+
96651+static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, pte_t *pte)
96652+{
96653+ set_pmd(pmd, __pmd(_PAGE_TABLE | __pa(pte)));
96654+}
96655+
96656+static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, struct page *pte)
96657+{
96658+ if (unlikely((mm)->context.pinned)) {
96659+ BUG_ON(HYPERVISOR_update_va_mapping(
96660+ (unsigned long)__va(page_to_pfn(pte) << PAGE_SHIFT),
96661+ pfn_pte(page_to_pfn(pte), PAGE_KERNEL_RO), 0));
96662+ set_pmd(pmd, __pmd(_PAGE_TABLE | (page_to_pfn(pte) << PAGE_SHIFT)));
96663+ } else {
96664+ *(pmd) = __pmd(_PAGE_TABLE | (page_to_pfn(pte) << PAGE_SHIFT));
96665+ }
96666+}
96667+
96668+static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
96669+{
96670+ if (unlikely((mm)->context.pinned)) {
96671+ BUG_ON(HYPERVISOR_update_va_mapping(
96672+ (unsigned long)pmd,
96673+ pfn_pte(virt_to_phys(pmd)>>PAGE_SHIFT,
96674+ PAGE_KERNEL_RO), 0));
96675+ set_pud(pud, __pud(_PAGE_TABLE | __pa(pmd)));
96676+ } else {
96677+ *(pud) = __pud(_PAGE_TABLE | __pa(pmd));
96678+ }
96679+}
96680+
96681+/*
96682+ * We need to use the batch mode here, but pgd_populate() won't
96683+ * be called frequently.
96684+ */
96685+static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud)
96686+{
96687+ if (unlikely((mm)->context.pinned)) {
96688+ BUG_ON(HYPERVISOR_update_va_mapping(
96689+ (unsigned long)pud,
96690+ pfn_pte(virt_to_phys(pud)>>PAGE_SHIFT,
96691+ PAGE_KERNEL_RO), 0));
96692+ set_pgd(pgd, __pgd(_PAGE_TABLE | __pa(pud)));
96693+ set_pgd(__user_pgd(pgd), __pgd(_PAGE_TABLE | __pa(pud)));
96694+ } else {
96695+ *(pgd) = __pgd(_PAGE_TABLE | __pa(pud));
96696+ *(__user_pgd(pgd)) = *(pgd);
96697+ }
96698+}
96699+
96700+static inline void pmd_free(pmd_t *pmd)
96701+{
96702+ pte_t *ptep = virt_to_ptep(pmd);
96703+
96704+ if (!pte_write(*ptep)) {
96705+ BUG_ON(HYPERVISOR_update_va_mapping(
96706+ (unsigned long)pmd,
96707+ pfn_pte(virt_to_phys(pmd)>>PAGE_SHIFT, PAGE_KERNEL),
96708+ 0));
96709+ }
96710+ free_page((unsigned long)pmd);
96711+}
96712+
96713+static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
96714+{
96715+ pmd_t *pmd = (pmd_t *) get_zeroed_page(GFP_KERNEL|__GFP_REPEAT);
96716+ return pmd;
96717+}
96718+
96719+static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
96720+{
96721+ pud_t *pud = (pud_t *) get_zeroed_page(GFP_KERNEL|__GFP_REPEAT);
96722+ return pud;
96723+}
96724+
96725+static inline void pud_free(pud_t *pud)
96726+{
96727+ pte_t *ptep = virt_to_ptep(pud);
96728+
96729+ if (!pte_write(*ptep)) {
96730+ BUG_ON(HYPERVISOR_update_va_mapping(
96731+ (unsigned long)pud,
96732+ pfn_pte(virt_to_phys(pud)>>PAGE_SHIFT, PAGE_KERNEL),
96733+ 0));
96734+ }
96735+ free_page((unsigned long)pud);
96736+}
96737+
96738+static inline pgd_t *pgd_alloc(struct mm_struct *mm)
96739+{
96740+ /*
96741+ * We allocate two contiguous pages for kernel and user.
96742+ */
96743+ unsigned boundary;
96744+ pgd_t *pgd = (pgd_t *)__get_free_pages(GFP_KERNEL|__GFP_REPEAT, 1);
96745+
96746+ if (!pgd)
96747+ return NULL;
96748+ /*
96749+ * Copy kernel pointers in from init.
96750+ * Could keep a freelist or slab cache of those because the kernel
96751+ * part never changes.
96752+ */
96753+ boundary = pgd_index(__PAGE_OFFSET);
96754+ memset(pgd, 0, boundary * sizeof(pgd_t));
96755+ memcpy(pgd + boundary,
96756+ init_level4_pgt + boundary,
96757+ (PTRS_PER_PGD - boundary) * sizeof(pgd_t));
96758+
96759+ memset(__user_pgd(pgd), 0, PAGE_SIZE); /* clean up user pgd */
96760+ /*
96761+ * Set level3_user_pgt for vsyscall area
96762+ */
96763+ set_pgd(__user_pgd(pgd) + pgd_index(VSYSCALL_START),
96764+ mk_kernel_pgd(__pa_symbol(level3_user_pgt)));
96765+ return pgd;
96766+}
96767+
96768+static inline void pgd_free(pgd_t *pgd)
96769+{
96770+ pte_t *ptep = virt_to_ptep(pgd);
96771+
96772+ if (!pte_write(*ptep)) {
96773+ xen_pgd_unpin(__pa(pgd));
96774+ BUG_ON(HYPERVISOR_update_va_mapping(
96775+ (unsigned long)pgd,
96776+ pfn_pte(virt_to_phys(pgd)>>PAGE_SHIFT, PAGE_KERNEL),
96777+ 0));
96778+ }
96779+
96780+ ptep = virt_to_ptep(__user_pgd(pgd));
96781+
96782+ if (!pte_write(*ptep)) {
96783+ xen_pgd_unpin(__pa(__user_pgd(pgd)));
96784+ BUG_ON(HYPERVISOR_update_va_mapping(
96785+ (unsigned long)__user_pgd(pgd),
96786+ pfn_pte(virt_to_phys(__user_pgd(pgd))>>PAGE_SHIFT,
96787+ PAGE_KERNEL),
96788+ 0));
96789+ }
96790+
96791+ free_pages((unsigned long)pgd, 1);
96792+}
96793+
96794+static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
96795+{
96796+ pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT);
96797+ if (pte)
96798+ make_page_readonly(pte, XENFEAT_writable_page_tables);
96799+
96800+ return pte;
96801+}
96802+
96803+static inline struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
96804+{
96805+ struct page *pte;
96806+
96807+ pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
96808+ return pte;
96809+}
96810+
96811+/* Should really implement gc for free page table pages. This could be
96812+ done with a reference count in struct page. */
96813+
96814+static inline void pte_free_kernel(pte_t *pte)
96815+{
96816+ BUG_ON((unsigned long)pte & (PAGE_SIZE-1));
96817+ make_page_writable(pte, XENFEAT_writable_page_tables);
96818+ free_page((unsigned long)pte);
96819+}
96820+
96821+extern void pte_free(struct page *pte);
96822+
96823+//#define __pte_free_tlb(tlb,pte) tlb_remove_page((tlb),(pte))
96824+//#define __pmd_free_tlb(tlb,x) tlb_remove_page((tlb),virt_to_page(x))
96825+//#define __pud_free_tlb(tlb,x) tlb_remove_page((tlb),virt_to_page(x))
96826+
96827+#define __pte_free_tlb(tlb,x) pte_free((x))
96828+#define __pmd_free_tlb(tlb,x) pmd_free((x))
96829+#define __pud_free_tlb(tlb,x) pud_free((x))
96830+
96831+#endif /* _X86_64_PGALLOC_H */
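/*
 * Illustrative sketch, invented for explanation and not the patch's own
 * code: the allocators above all follow one rule -- before a page can be
 * installed in a pinned mm's page-table tree, the guest's own writable
 * mapping of it must be downgraded to read-only, because Xen only
 * validates page-table pages the guest cannot scribble on directly.
 * make_page_readonly()/make_page_writable() (declared above, implemented
 * elsewhere in this patch) handle this when the
 * XENFEAT_writable_page_tables feature is absent; the core of that
 * operation looks roughly like this, mirroring pmd_free()/pud_free():
 */
static inline void example_make_pagetable_ro(void *va)
{
        pte_t *ptep = virt_to_ptep(va);

        if (pte_write(*ptep))
                BUG_ON(HYPERVISOR_update_va_mapping(
                        (unsigned long)va,
                        pfn_pte(virt_to_phys(va) >> PAGE_SHIFT,
                                PAGE_KERNEL_RO), 0));
}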
96832diff -Nur linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/pgtable.h linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/pgtable.h
96833--- linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/pgtable.h 1970-01-01 00:00:00.000000000 +0000
96834+++ linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/pgtable.h 2007-01-08 15:00:46.000000000 +0000
96835@@ -0,0 +1,565 @@
96836+#ifndef _X86_64_PGTABLE_H
96837+#define _X86_64_PGTABLE_H
96838+
96839+/*
96840+ * This file contains the functions and defines necessary to modify and use
96841+ * the x86-64 page table tree.
96842+ */
96843+#include <asm/processor.h>
96844+#include <asm/fixmap.h>
96845+#include <asm/bitops.h>
96846+#include <linux/threads.h>
96847+#include <linux/sched.h>
96848+#include <asm/pda.h>
96849+#ifdef CONFIG_XEN
96850+#include <asm/hypervisor.h>
96851+
96852+extern pud_t level3_user_pgt[512];
96853+extern pud_t init_level4_user_pgt[];
96854+
96855+extern void xen_init_pt(void);
96856+
96857+#define virt_to_ptep(__va) \
96858+({ \
96859+ pgd_t *__pgd = pgd_offset_k((unsigned long)(__va)); \
96860+ pud_t *__pud = pud_offset(__pgd, (unsigned long)(__va)); \
96861+ pmd_t *__pmd = pmd_offset(__pud, (unsigned long)(__va)); \
96862+ pte_offset_kernel(__pmd, (unsigned long)(__va)); \
96863+})
96864+
96865+#define arbitrary_virt_to_machine(__va) \
96866+({ \
96867+ maddr_t m = (maddr_t)pte_mfn(*virt_to_ptep(__va)) << PAGE_SHIFT;\
96868+ m | ((unsigned long)(__va) & (PAGE_SIZE-1)); \
96869+})
96870+#endif
96871+
96872+extern pud_t level3_kernel_pgt[512];
96873+extern pud_t level3_physmem_pgt[512];
96874+extern pud_t level3_ident_pgt[512];
96875+extern pmd_t level2_kernel_pgt[512];
96876+extern pgd_t init_level4_pgt[];
96877+extern pgd_t boot_level4_pgt[];
96878+extern unsigned long __supported_pte_mask;
96879+
96880+#define swapper_pg_dir init_level4_pgt
96881+
96882+extern int nonx_setup(char *str);
96883+extern void paging_init(void);
96884+extern void clear_kernel_mapping(unsigned long addr, unsigned long size);
96885+
96886+extern unsigned long pgkern_mask;
96887+
96888+/*
96889+ * ZERO_PAGE is a global shared page that is always zero: used
96890+ * for zero-mapped memory areas etc.
96891+ */
96892+extern unsigned long empty_zero_page[PAGE_SIZE/sizeof(unsigned long)];
96893+#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page))
96894+
96895+/*
96896+ * PGDIR_SHIFT determines what a top-level page table entry can map
96897+ */
96898+#define PGDIR_SHIFT 39
96899+#define PTRS_PER_PGD 512
96900+
96901+/*
96902+ * 3rd level page
96903+ */
96904+#define PUD_SHIFT 30
96905+#define PTRS_PER_PUD 512
96906+
96907+/*
96908+ * PMD_SHIFT determines the size of the area a middle-level
96909+ * page table can map
96910+ */
96911+#define PMD_SHIFT 21
96912+#define PTRS_PER_PMD 512
96913+
96914+/*
96915+ * entries per page directory level
96916+ */
96917+#define PTRS_PER_PTE 512
96918+
96919+#define pte_ERROR(e) \
96920+ printk("%s:%d: bad pte %p(%016lx).\n", __FILE__, __LINE__, &(e), pte_val(e))
96921+#define pmd_ERROR(e) \
96922+ printk("%s:%d: bad pmd %p(%016lx).\n", __FILE__, __LINE__, &(e), pmd_val(e))
96923+#define pud_ERROR(e) \
96924+ printk("%s:%d: bad pud %p(%016lx).\n", __FILE__, __LINE__, &(e), pud_val(e))
96925+#define pgd_ERROR(e) \
96926+ printk("%s:%d: bad pgd %p(%016lx).\n", __FILE__, __LINE__, &(e), pgd_val(e))
96927+
96928+#define pgd_none(x) (!pgd_val(x))
96929+#define pud_none(x) (!pud_val(x))
96930+
96931+#define set_pte_batched(pteptr, pteval) \
96932+ queue_l1_entry_update(pteptr, (pteval))
96933+
96934+extern inline int pud_present(pud_t pud) { return !pud_none(pud); }
96935+
96936+static inline void set_pte(pte_t *dst, pte_t val)
96937+{
96938+ *dst = val;
96939+}
96940+
96941+#define set_pmd(pmdptr, pmdval) xen_l2_entry_update(pmdptr, (pmdval))
96942+#define set_pud(pudptr, pudval) xen_l3_entry_update(pudptr, (pudval))
96943+#define set_pgd(pgdptr, pgdval) xen_l4_entry_update(pgdptr, (pgdval))
96944+
96945+static inline void pud_clear (pud_t * pud)
96946+{
96947+ set_pud(pud, __pud(0));
96948+}
96949+
96950+#define __user_pgd(pgd) ((pgd) + PTRS_PER_PGD)
96951+
96952+static inline void pgd_clear (pgd_t * pgd)
96953+{
96954+ set_pgd(pgd, __pgd(0));
96955+ set_pgd(__user_pgd(pgd), __pgd(0));
96956+}
96957+
96958+#define pud_page(pud) \
96959+ ((unsigned long) __va(pud_val(pud) & PHYSICAL_PAGE_MASK))
96960+
96961+/*
96962+ * A note on implementation of this atomic 'get-and-clear' operation.
96963+ * This is actually very simple because Xen Linux can only run on a single
96964+ * processor. Therefore, we cannot race other processors setting the 'accessed'
96965+ * or 'dirty' bits on a page-table entry.
96966+ * Even if pages are shared between domains, that is not a problem because
96967+ * each domain will have separate page tables, with their own versions of
96968+ * accessed & dirty state.
96969+ */
96970+#define ptep_get_and_clear(mm,addr,xp) __pte_ma(xchg(&(xp)->pte, 0))
96971+
96972+#if 0
96973+static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *xp)
96974+{
96975+ pte_t pte = *xp;
96976+ if (pte.pte)
96977+ set_pte(xp, __pte_ma(0));
96978+ return pte;
96979+}
96980+#endif
96981+
96982+struct mm_struct;
96983+
96984+static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, unsigned long addr, pte_t *ptep, int full)
96985+{
96986+ pte_t pte;
96987+ if (full) {
96988+ pte = *ptep;
96989+ *ptep = __pte(0);
96990+ } else {
96991+ pte = ptep_get_and_clear(mm, addr, ptep);
96992+ }
96993+ return pte;
96994+}
96995+
96996+#define pte_same(a, b) ((a).pte == (b).pte)
96997+
96998+#define pte_pgprot(a) (__pgprot((a).pte & ~PHYSICAL_PAGE_MASK))
96999+
97000+#define PMD_SIZE (1UL << PMD_SHIFT)
97001+#define PMD_MASK (~(PMD_SIZE-1))
97002+#define PUD_SIZE (1UL << PUD_SHIFT)
97003+#define PUD_MASK (~(PUD_SIZE-1))
97004+#define PGDIR_SIZE (1UL << PGDIR_SHIFT)
97005+#define PGDIR_MASK (~(PGDIR_SIZE-1))
97006+
97007+#define USER_PTRS_PER_PGD ((TASK_SIZE-1)/PGDIR_SIZE+1)
97008+#define FIRST_USER_ADDRESS 0
97009+
97010+#ifndef __ASSEMBLY__
97011+#define MAXMEM 0x3fffffffffffUL
97012+#define VMALLOC_START 0xffffc20000000000UL
97013+#define VMALLOC_END 0xffffe1ffffffffffUL
97014+#define MODULES_VADDR 0xffffffff88000000UL
97015+#define MODULES_END 0xfffffffffff00000UL
97016+#define MODULES_LEN (MODULES_END - MODULES_VADDR)
97017+
97018+#define _PAGE_BIT_PRESENT 0
97019+#define _PAGE_BIT_RW 1
97020+#define _PAGE_BIT_USER 2
97021+#define _PAGE_BIT_PWT 3
97022+#define _PAGE_BIT_PCD 4
97023+#define _PAGE_BIT_ACCESSED 5
97024+#define _PAGE_BIT_DIRTY 6
97025+#define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page */
97026+#define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */
97027+#define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */
97028+
97029+#define _PAGE_PRESENT 0x001
97030+#define _PAGE_RW 0x002
97031+#define _PAGE_USER 0x004
97032+#define _PAGE_PWT 0x008
97033+#define _PAGE_PCD 0x010
97034+#define _PAGE_ACCESSED 0x020
97035+#define _PAGE_DIRTY 0x040
97036+#define _PAGE_PSE 0x080 /* 2MB page */
97037+#define _PAGE_FILE 0x040 /* nonlinear file mapping, saved PTE; unset:swap */
97038+#define _PAGE_GLOBAL 0x100 /* Global TLB entry */
97039+
97040+#define _PAGE_PROTNONE 0x080 /* If not present */
97041+#define _PAGE_NX (1UL<<_PAGE_BIT_NX)
97042+
97043+#ifdef CONFIG_XEN_COMPAT_030002
97044+extern unsigned int __kernel_page_user;
97045+#else
97046+#define __kernel_page_user 0
97047+#endif
97048+
97049+#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY)
97050+#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY | __kernel_page_user)
97051+
97052+#define _PAGE_CHG_MASK (PTE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY)
97053+
97054+#define PAGE_NONE __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED)
97055+#define PAGE_SHARED __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
97056+#define PAGE_SHARED_EXEC __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED)
97057+#define PAGE_COPY_NOEXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
97058+#define PAGE_COPY PAGE_COPY_NOEXEC
97059+#define PAGE_COPY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
97060+#define PAGE_READONLY __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
97061+#define PAGE_READONLY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
97062+#define __PAGE_KERNEL \
97063+ (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_NX | __kernel_page_user)
97064+#define __PAGE_KERNEL_EXEC \
97065+ (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | __kernel_page_user)
97066+#define __PAGE_KERNEL_NOCACHE \
97067+ (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_PCD | _PAGE_ACCESSED | _PAGE_NX | __kernel_page_user)
97068+#define __PAGE_KERNEL_RO \
97069+ (_PAGE_PRESENT | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_NX | __kernel_page_user)
97070+#define __PAGE_KERNEL_VSYSCALL \
97071+ (_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
97072+#define __PAGE_KERNEL_VSYSCALL_NOCACHE \
97073+ (_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_PCD)
97074+#define __PAGE_KERNEL_LARGE \
97075+ (__PAGE_KERNEL | _PAGE_PSE)
97076+#define __PAGE_KERNEL_LARGE_EXEC \
97077+ (__PAGE_KERNEL_EXEC | _PAGE_PSE)
97078+
97079+/*
97080+ * We don't support GLOBAL page in xenolinux64
97081+ */
97082+#define MAKE_GLOBAL(x) __pgprot((x))
97083+
97084+#define PAGE_KERNEL MAKE_GLOBAL(__PAGE_KERNEL)
97085+#define PAGE_KERNEL_EXEC MAKE_GLOBAL(__PAGE_KERNEL_EXEC)
97086+#define PAGE_KERNEL_RO MAKE_GLOBAL(__PAGE_KERNEL_RO)
97087+#define PAGE_KERNEL_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_NOCACHE)
97088+#define PAGE_KERNEL_VSYSCALL32 __pgprot(__PAGE_KERNEL_VSYSCALL)
97089+#define PAGE_KERNEL_VSYSCALL MAKE_GLOBAL(__PAGE_KERNEL_VSYSCALL)
97090+#define PAGE_KERNEL_LARGE MAKE_GLOBAL(__PAGE_KERNEL_LARGE)
97091+#define PAGE_KERNEL_VSYSCALL_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_VSYSCALL_NOCACHE)
97092+
97093+/* xwr */
97094+#define __P000 PAGE_NONE
97095+#define __P001 PAGE_READONLY
97096+#define __P010 PAGE_COPY
97097+#define __P011 PAGE_COPY
97098+#define __P100 PAGE_READONLY_EXEC
97099+#define __P101 PAGE_READONLY_EXEC
97100+#define __P110 PAGE_COPY_EXEC
97101+#define __P111 PAGE_COPY_EXEC
97102+
97103+#define __S000 PAGE_NONE
97104+#define __S001 PAGE_READONLY
97105+#define __S010 PAGE_SHARED
97106+#define __S011 PAGE_SHARED
97107+#define __S100 PAGE_READONLY_EXEC
97108+#define __S101 PAGE_READONLY_EXEC
97109+#define __S110 PAGE_SHARED_EXEC
97110+#define __S111 PAGE_SHARED_EXEC
97111+
97112+static inline unsigned long pgd_bad(pgd_t pgd)
97113+{
97114+ unsigned long val = pgd_val(pgd);
97115+ val &= ~PTE_MASK;
97116+ val &= ~(_PAGE_USER | _PAGE_DIRTY);
97117+ return val & ~(_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED);
97118+}
97119+
97120+static inline unsigned long pud_bad(pud_t pud)
97121+{
97122+ unsigned long val = pud_val(pud);
97123+ val &= ~PTE_MASK;
97124+ val &= ~(_PAGE_USER | _PAGE_DIRTY);
97125+ return val & ~(_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED);
97126+}
97127+
97128+#define set_pte_at(_mm,addr,ptep,pteval) do { \
97129+ if (((_mm) != current->mm && (_mm) != &init_mm) || \
97130+ HYPERVISOR_update_va_mapping((addr), (pteval), 0)) \
97131+ set_pte((ptep), (pteval)); \
97132+} while (0)
97133+
97134+#define pte_none(x) (!(x).pte)
97135+#define pte_present(x) ((x).pte & (_PAGE_PRESENT | _PAGE_PROTNONE))
97136+#define pte_clear(mm,addr,xp) do { set_pte_at(mm, addr, xp, __pte(0)); } while (0)
97137+
97138+#define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT))
97139+
97140+#define pte_mfn(_pte) (((_pte).pte & PTE_MASK) >> PAGE_SHIFT)
97141+#define pte_pfn(_pte) mfn_to_local_pfn(pte_mfn(_pte))
97142+
97143+#define pte_page(x) pfn_to_page(pte_pfn(x))
97144+
97145+static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot)
97146+{
97147+ pte_t pte;
97148+
97149+ (pte).pte = (pfn_to_mfn(page_nr) << PAGE_SHIFT);
97150+ (pte).pte |= pgprot_val(pgprot);
97151+ (pte).pte &= __supported_pte_mask;
97152+ return pte;
97153+}
97154+
97155+/*
97156+ * The following only work if pte_present() is true.
97157+ * Undefined behaviour if not.
97158+ */
97159+#define __pte_val(x) ((x).pte)
97160+
97161+#define __LARGE_PTE (_PAGE_PSE|_PAGE_PRESENT)
97162+static inline int pte_user(pte_t pte) { return __pte_val(pte) & _PAGE_USER; }
97163+static inline int pte_read(pte_t pte) { return __pte_val(pte) & _PAGE_USER; }
97164+static inline int pte_exec(pte_t pte) { return __pte_val(pte) & _PAGE_USER; }
97165+static inline int pte_dirty(pte_t pte) { return __pte_val(pte) & _PAGE_DIRTY; }
97166+static inline int pte_young(pte_t pte) { return __pte_val(pte) & _PAGE_ACCESSED; }
97167+static inline int pte_write(pte_t pte) { return __pte_val(pte) & _PAGE_RW; }
97168+static inline int pte_file(pte_t pte) { return __pte_val(pte) & _PAGE_FILE; }
97169+static inline int pte_huge(pte_t pte) { return (__pte_val(pte) & __LARGE_PTE) == __LARGE_PTE; }
97170+
97171+static inline pte_t pte_rdprotect(pte_t pte) { __pte_val(pte) &= ~_PAGE_USER; return pte; }
97172+static inline pte_t pte_exprotect(pte_t pte) { __pte_val(pte) &= ~_PAGE_USER; return pte; }
97173+static inline pte_t pte_mkclean(pte_t pte) { __pte_val(pte) &= ~_PAGE_DIRTY; return pte; }
97174+static inline pte_t pte_mkold(pte_t pte) { __pte_val(pte) &= ~_PAGE_ACCESSED; return pte; }
97175+static inline pte_t pte_wrprotect(pte_t pte) { __pte_val(pte) &= ~_PAGE_RW; return pte; }
97176+static inline pte_t pte_mkread(pte_t pte) { __pte_val(pte) |= _PAGE_USER; return pte; }
97177+static inline pte_t pte_mkexec(pte_t pte) { __pte_val(pte) |= _PAGE_USER; return pte; }
97178+static inline pte_t pte_mkdirty(pte_t pte) { __pte_val(pte) |= _PAGE_DIRTY; return pte; }
97179+static inline pte_t pte_mkyoung(pte_t pte) { __pte_val(pte) |= _PAGE_ACCESSED; return pte; }
97180+static inline pte_t pte_mkwrite(pte_t pte) { __pte_val(pte) |= _PAGE_RW; return pte; }
97181+static inline pte_t pte_mkhuge(pte_t pte) { __pte_val(pte) |= __LARGE_PTE; return pte; }
97182+
97183+struct vm_area_struct;
97184+
97185+static inline int ptep_test_and_clear_dirty(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep)
97186+{
97187+ pte_t pte = *ptep;
97188+ int ret = pte_dirty(pte);
97189+ if (ret)
97190+ set_pte(ptep, pte_mkclean(pte));
97191+ return ret;
97192+}
97193+
97194+static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep)
97195+{
97196+ pte_t pte = *ptep;
97197+ int ret = pte_young(pte);
97198+ if (ret)
97199+ set_pte(ptep, pte_mkold(pte));
97200+ return ret;
97201+}
97202+
97203+static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
97204+{
97205+ pte_t pte = *ptep;
97206+ if (pte_write(pte))
97207+ set_pte(ptep, pte_wrprotect(pte));
97208+}
97209+
97210+/*
97211+ * Macro to mark a page protection value as "uncacheable".
97212+ */
97213+#define pgprot_noncached(prot) (__pgprot(pgprot_val(prot) | _PAGE_PCD | _PAGE_PWT))
97214+
97215+static inline int pmd_large(pmd_t pte) {
97216+ return (pmd_val(pte) & __LARGE_PTE) == __LARGE_PTE;
97217+}
97218+
97219+
97220+/*
97221+ * Conversion functions: convert a page and protection to a page entry,
97222+ * and a page entry and page directory to the page they refer to.
97223+ */
97224+
97225+/*
97226+ * Level 4 access.
97227+ * Never use these in the common code.
97228+ */
97229+#define pgd_page(pgd) ((unsigned long) __va(pgd_val(pgd) & PTE_MASK))
97230+#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD-1))
97231+#define pgd_offset(mm, addr) ((mm)->pgd + pgd_index(addr))
97232+#define pgd_offset_k(address) (pgd_t *)(init_level4_pgt + pgd_index(address))
97233+#define pgd_present(pgd) (pgd_val(pgd) & _PAGE_PRESENT)
97234+#define mk_kernel_pgd(address) __pgd((address) | _KERNPG_TABLE)
97235+
97236+/* PUD - Level3 access */
97237+/* to find an entry in a page-table-directory. */
97238+#define pud_index(address) (((address) >> PUD_SHIFT) & (PTRS_PER_PUD-1))
97239+#define pud_offset(pgd, address) ((pud_t *) pgd_page(*(pgd)) + pud_index(address))
97240+static inline pud_t *__pud_offset_k(pud_t *pud, unsigned long address)
97241+{
97242+ return pud + pud_index(address);
97243+}
97244+
97245+/* Find the correct pud via the hidden fourth page-table level: */
97246+
97247+/* This accesses the reference page table of the boot cpu.
97248+ Other CPUs get synced lazily via the page fault handler. */
97249+static inline pud_t *pud_offset_k(pgd_t *pgd, unsigned long address)
97250+{
97251+ return pud_offset(pgd_offset_k(address), address);
97252+}
97253+
97254+/* PMD - Level 2 access */
97255+#define pmd_page_kernel(pmd) ((unsigned long) __va(pmd_val(pmd) & PTE_MASK))
97256+#define pmd_page(pmd) (pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT))
97257+
97258+#define pmd_index(address) (((address) >> PMD_SHIFT) & (PTRS_PER_PMD-1))
97259+#define pmd_offset(dir, address) ((pmd_t *) pud_page(*(dir)) + \
97260+ pmd_index(address))
97261+#define pmd_none(x) (!pmd_val(x))
97262+/* pmd_present doesn't just test the _PAGE_PRESENT bit since the writable
97263+ page tables (wr.p.t.) code can temporarily clear it. */
97264+#define pmd_present(x) (pmd_val(x))
97265+#define pmd_clear(xp) do { set_pmd(xp, __pmd(0)); } while (0)
97266+#define pmd_bad(x) ((pmd_val(x) & ~(PTE_MASK | _PAGE_USER | _PAGE_PRESENT)) \
97267+ != (_KERNPG_TABLE & ~(_PAGE_USER | _PAGE_PRESENT)))
97268+#define pfn_pmd(nr,prot) (__pmd(((nr) << PAGE_SHIFT) | pgprot_val(prot)))
97269+#define pmd_pfn(x) ((pmd_val(x) & __PHYSICAL_MASK) >> PAGE_SHIFT)
97270+
97271+#define pte_to_pgoff(pte) ((pte_val(pte) & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT)
97272+#define pgoff_to_pte(off) ((pte_t) { ((off) << PAGE_SHIFT) | _PAGE_FILE })
97273+#define PTE_FILE_MAX_BITS __PHYSICAL_MASK_SHIFT
97274+
97275+/* PTE - Level 1 access. */
97276+
97277+/* page, protection -> pte */
97278+#define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot))
97279+#define mk_pte_huge(entry) (pte_val(entry) |= _PAGE_PRESENT | _PAGE_PSE)
97280+
97281+/* physical address -> PTE */
97282+static inline pte_t mk_pte_phys(unsigned long physpage, pgprot_t pgprot)
97283+{
97284+ pte_t pte;
97285+ (pte).pte = physpage | pgprot_val(pgprot);
97286+ return pte;
97287+}
97288+
97289+/* Change flags of a PTE */
97290+static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
97291+{
97292+ (pte).pte &= _PAGE_CHG_MASK;
97293+ (pte).pte |= pgprot_val(newprot);
97294+ (pte).pte &= __supported_pte_mask;
97295+ return pte;
97296+}
97297+
97298+#define pte_index(address) \
97299+ (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
97300+#define pte_offset_kernel(dir, address) ((pte_t *) pmd_page_kernel(*(dir)) + \
97301+ pte_index(address))
97302+
97303+/* x86-64 always has all page tables mapped. */
97304+#define pte_offset_map(dir,address) pte_offset_kernel(dir,address)
97305+#define pte_offset_map_nested(dir,address) pte_offset_kernel(dir,address)
97306+#define pte_unmap(pte) /* NOP */
97307+#define pte_unmap_nested(pte) /* NOP */
97308+
97309+#define update_mmu_cache(vma,address,pte) do { } while (0)
97310+
97311+/* We only update the dirty/accessed state if we set
97312+ * the dirty bit by hand in the kernel, since the hardware
97313+ * will do the accessed bit for us, and we don't want to
97314+ * race with other CPUs that might be updating the dirty
97315+ * bit at the same time. */
97316+#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
97317+#if 0
97318+#define ptep_set_access_flags(__vma, __address, __ptep, __entry, __dirty) \
97319+ do { \
97320+ if (__dirty) { \
97321+ set_pte(__ptep, __entry); \
97322+ flush_tlb_page(__vma, __address); \
97323+ } \
97324+ } while (0)
97325+#endif
97326+#define ptep_set_access_flags(__vma, __address, __ptep, __entry, __dirty) \
97327+ do { \
97328+ if (__dirty) { \
97329+ if ( likely((__vma)->vm_mm == current->mm) ) { \
97330+ BUG_ON(HYPERVISOR_update_va_mapping((__address), (__entry), UVMF_INVLPG|UVMF_MULTI|(unsigned long)((__vma)->vm_mm->cpu_vm_mask.bits))); \
97331+ } else { \
97332+ xen_l1_entry_update((__ptep), (__entry)); \
97333+ flush_tlb_page((__vma), (__address)); \
97334+ } \
97335+ } \
97336+ } while (0)
97337+
97338+/* Encode and de-code a swap entry */
97339+#define __swp_type(x) (((x).val >> 1) & 0x3f)
97340+#define __swp_offset(x) ((x).val >> 8)
97341+#define __swp_entry(type, offset) ((swp_entry_t) { ((type) << 1) | ((offset) << 8) })
97342+#define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) })
97343+#define __swp_entry_to_pte(x) ((pte_t) { (x).val })
97344+
97345+#endif /* !__ASSEMBLY__ */
97346+
97347+extern int kern_addr_valid(unsigned long addr);
97348+
97349+#define DOMID_LOCAL (0xFFFFU)
97350+
97351+int direct_remap_pfn_range(struct vm_area_struct *vma,
97352+ unsigned long address,
97353+ unsigned long mfn,
97354+ unsigned long size,
97355+ pgprot_t prot,
97356+ domid_t domid);
97357+
97358+int direct_kernel_remap_pfn_range(unsigned long address,
97359+ unsigned long mfn,
97360+ unsigned long size,
97361+ pgprot_t prot,
97362+ domid_t domid);
97363+
97364+int create_lookup_pte_addr(struct mm_struct *mm,
97365+ unsigned long address,
97366+ uint64_t *ptep);
97367+
97368+int touch_pte_range(struct mm_struct *mm,
97369+ unsigned long address,
97370+ unsigned long size);
97371+
97372+#define io_remap_pfn_range(vma, vaddr, pfn, size, prot) \
97373+ direct_remap_pfn_range(vma,vaddr,pfn,size,prot,DOMID_IO)
97374+
97375+#define MK_IOSPACE_PFN(space, pfn) (pfn)
97376+#define GET_IOSPACE(pfn) 0
97377+#define GET_PFN(pfn) (pfn)
97378+
97379+#define HAVE_ARCH_UNMAPPED_AREA
97380+
97381+#define pgtable_cache_init() do { } while (0)
97382+#define check_pgt_cache() do { } while (0)
97383+
97384+#define PAGE_AGP PAGE_KERNEL_NOCACHE
97385+#define HAVE_PAGE_AGP 1
97386+
97387+/* fs/proc/kcore.c */
97388+#define kc_vaddr_to_offset(v) ((v) & __VIRTUAL_MASK)
97389+#define kc_offset_to_vaddr(o) \
97390+ (((o) & (1UL << (__VIRTUAL_MASK_SHIFT-1))) ? ((o) | (~__VIRTUAL_MASK)) : (o))
97391+
97392+#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
97393+#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_DIRTY
97394+#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
97395+#define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
97396+#define __HAVE_ARCH_PTEP_SET_WRPROTECT
97397+#define __HAVE_ARCH_PTE_SAME
97398+#include <asm-generic/pgtable.h>
97399+
97400+#endif /* _X86_64_PGTABLE_H */
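/*
 * Sketch, invented for illustration, of the pseudo-physical vs. machine
 * frame translation performed by pfn_pte()/pte_mfn()/pte_pfn() above.
 * For a page owned by this domain the round trip is an identity;
 * arbitrary_virt_to_machine() (defined near the top of this header) is
 * what hypercall callers use when they need the machine address of a
 * kernel virtual address.  example_show_translation() is hypothetical.
 */
static inline void example_show_translation(struct page *page)
{
        unsigned long pfn = page_to_pfn(page);
        pte_t pte = pfn_pte(pfn, PAGE_KERNEL);

        /* The PTE stores the machine frame number... */
        BUG_ON(pte_mfn(pte) != pfn_to_mfn(pfn));
        /* ...while pte_pfn() maps it back to the guest-local pfn
         * (this holds for pages owned by this domain). */
        BUG_ON(pte_pfn(pte) != pfn);

        /* Machine address of the page's kernel mapping, for hypercalls. */
        (void)arbitrary_virt_to_machine(page_address(page));
}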
97401diff -Nur linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/processor.h linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/processor.h
97402--- linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/processor.h 1970-01-01 00:00:00.000000000 +0000
97403+++ linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/processor.h 2007-01-08 15:00:46.000000000 +0000
97404@@ -0,0 +1,493 @@
97405+/*
97406+ * include/asm-x86_64/processor.h
97407+ *
97408+ * Copyright (C) 1994 Linus Torvalds
97409+ */
97410+
97411+#ifndef __ASM_X86_64_PROCESSOR_H
97412+#define __ASM_X86_64_PROCESSOR_H
97413+
97414+#include <asm/segment.h>
97415+#include <asm/page.h>
97416+#include <asm/types.h>
97417+#include <asm/sigcontext.h>
97418+#include <asm/cpufeature.h>
97419+#include <linux/config.h>
97420+#include <linux/threads.h>
97421+#include <asm/msr.h>
97422+#include <asm/current.h>
97423+#include <asm/system.h>
97424+#include <asm/mmsegment.h>
97425+#include <asm/percpu.h>
97426+#include <linux/personality.h>
97427+
97428+#define TF_MASK 0x00000100
97429+#define IF_MASK 0x00000200
97430+#define IOPL_MASK 0x00003000
97431+#define NT_MASK 0x00004000
97432+#define VM_MASK 0x00020000
97433+#define AC_MASK 0x00040000
97434+#define VIF_MASK 0x00080000 /* virtual interrupt flag */
97435+#define VIP_MASK 0x00100000 /* virtual interrupt pending */
97436+#define ID_MASK 0x00200000
97437+
97438+#define desc_empty(desc) \
97439+ (!((desc)->a | (desc)->b))
97440+
97441+#define desc_equal(desc1, desc2) \
97442+ (((desc1)->a == (desc2)->a) && ((desc1)->b == (desc2)->b))
97443+
97444+/*
97445+ * Default implementation of macro that returns current
97446+ * instruction pointer ("program counter").
97447+ */
97448+#define current_text_addr() ({ void *pc; asm volatile("leaq 1f(%%rip),%0\n1:":"=r"(pc)); pc; })
97449+
97450+/*
97451+ * CPU type and hardware bug flags. Kept separately for each CPU.
97452+ */
97453+
97454+struct cpuinfo_x86 {
97455+ __u8 x86; /* CPU family */
97456+ __u8 x86_vendor; /* CPU vendor */
97457+ __u8 x86_model;
97458+ __u8 x86_mask;
97459+ int cpuid_level; /* Maximum supported CPUID level, -1=no CPUID */
97460+ __u32 x86_capability[NCAPINTS];
97461+ char x86_vendor_id[16];
97462+ char x86_model_id[64];
97463+ int x86_cache_size; /* in KB */
97464+ int x86_clflush_size;
97465+ int x86_cache_alignment;
97466+ int x86_tlbsize; /* number of 4K pages in DTLB/ITLB combined (in pages) */
97467+ __u8 x86_virt_bits, x86_phys_bits;
97468+ __u8 x86_max_cores; /* cpuid returned max cores value */
97469+ __u32 x86_power;
97470+ __u32 extended_cpuid_level; /* Max extended CPUID function supported */
97471+ unsigned long loops_per_jiffy;
97472+ __u8 apicid;
97473+ __u8 booted_cores; /* number of cores as seen by OS */
97474+} ____cacheline_aligned;
97475+
97476+#define X86_VENDOR_INTEL 0
97477+#define X86_VENDOR_CYRIX 1
97478+#define X86_VENDOR_AMD 2
97479+#define X86_VENDOR_UMC 3
97480+#define X86_VENDOR_NEXGEN 4
97481+#define X86_VENDOR_CENTAUR 5
97482+#define X86_VENDOR_RISE 6
97483+#define X86_VENDOR_TRANSMETA 7
97484+#define X86_VENDOR_NUM 8
97485+#define X86_VENDOR_UNKNOWN 0xff
97486+
97487+#ifdef CONFIG_SMP
97488+extern struct cpuinfo_x86 cpu_data[];
97489+#define current_cpu_data cpu_data[smp_processor_id()]
97490+#else
97491+#define cpu_data (&boot_cpu_data)
97492+#define current_cpu_data boot_cpu_data
97493+#endif
97494+
97495+extern char ignore_irq13;
97496+
97497+extern void identify_cpu(struct cpuinfo_x86 *);
97498+extern void print_cpu_info(struct cpuinfo_x86 *);
97499+extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c);
97500+
97501+/*
97502+ * EFLAGS bits
97503+ */
97504+#define X86_EFLAGS_CF 0x00000001 /* Carry Flag */
97505+#define X86_EFLAGS_PF 0x00000004 /* Parity Flag */
97506+#define X86_EFLAGS_AF 0x00000010 /* Auxiliary carry Flag */
97507+#define X86_EFLAGS_ZF 0x00000040 /* Zero Flag */
97508+#define X86_EFLAGS_SF 0x00000080 /* Sign Flag */
97509+#define X86_EFLAGS_TF 0x00000100 /* Trap Flag */
97510+#define X86_EFLAGS_IF 0x00000200 /* Interrupt Flag */
97511+#define X86_EFLAGS_DF 0x00000400 /* Direction Flag */
97512+#define X86_EFLAGS_OF 0x00000800 /* Overflow Flag */
97513+#define X86_EFLAGS_IOPL 0x00003000 /* IOPL mask */
97514+#define X86_EFLAGS_NT 0x00004000 /* Nested Task */
97515+#define X86_EFLAGS_RF 0x00010000 /* Resume Flag */
97516+#define X86_EFLAGS_VM 0x00020000 /* Virtual Mode */
97517+#define X86_EFLAGS_AC 0x00040000 /* Alignment Check */
97518+#define X86_EFLAGS_VIF 0x00080000 /* Virtual Interrupt Flag */
97519+#define X86_EFLAGS_VIP 0x00100000 /* Virtual Interrupt Pending */
97520+#define X86_EFLAGS_ID 0x00200000 /* CPUID detection flag */
97521+
97522+/*
97523+ * Intel CPU features in CR4
97524+ */
97525+#define X86_CR4_VME 0x0001 /* enable vm86 extensions */
97526+#define X86_CR4_PVI 0x0002 /* virtual interrupts flag enable */
97527+#define X86_CR4_TSD 0x0004 /* disable time stamp at ipl 3 */
97528+#define X86_CR4_DE 0x0008 /* enable debugging extensions */
97529+#define X86_CR4_PSE 0x0010 /* enable page size extensions */
97530+#define X86_CR4_PAE 0x0020 /* enable physical address extensions */
97531+#define X86_CR4_MCE 0x0040 /* Machine check enable */
97532+#define X86_CR4_PGE 0x0080 /* enable global pages */
97533+#define X86_CR4_PCE 0x0100 /* enable performance counters at ipl 3 */
97534+#define X86_CR4_OSFXSR 0x0200 /* enable fast FPU save and restore */
97535+#define X86_CR4_OSXMMEXCPT 0x0400 /* enable unmasked SSE exceptions */
97536+
97537+/*
97538+ * Save the cr4 feature set we're using (i.e.
97539+ * Pentium 4MB enable and PPro Global page
97540+ * enable), so that any CPUs that boot up
97541+ * after us can get the correct flags.
97542+ */
97543+extern unsigned long mmu_cr4_features;
97544+
97545+static inline void set_in_cr4 (unsigned long mask)
97546+{
97547+ mmu_cr4_features |= mask;
97548+ __asm__("movq %%cr4,%%rax\n\t"
97549+ "orq %0,%%rax\n\t"
97550+ "movq %%rax,%%cr4\n"
97551+ : : "irg" (mask)
97552+ :"ax");
97553+}
97554+
97555+static inline void clear_in_cr4 (unsigned long mask)
97556+{
97557+ mmu_cr4_features &= ~mask;
97558+ __asm__("movq %%cr4,%%rax\n\t"
97559+ "andq %0,%%rax\n\t"
97560+ "movq %%rax,%%cr4\n"
97561+ : : "irg" (~mask)
97562+ :"ax");
97563+}
97564+
97565+
97566+/*
97567+ * Bus types
97568+ */
97569+#define MCA_bus 0
97570+#define MCA_bus__is_a_macro
97571+
97572+/*
97573+ * User space process size: 47 bits minus one guard page.
97574+ */
97575+#define TASK_SIZE64 (0x800000000000UL - 4096)
97576+
97577+/* This decides where the kernel will search for a free chunk of vm
97578+ * space during mmap's.
97579+ */
97580+#define IA32_PAGE_OFFSET ((current->personality & ADDR_LIMIT_3GB) ? 0xc0000000 : 0xFFFFe000)
97581+
97582+#define TASK_SIZE (test_thread_flag(TIF_IA32) ? IA32_PAGE_OFFSET : TASK_SIZE64)
97583+#define TASK_SIZE_OF(child) ((test_tsk_thread_flag(child, TIF_IA32)) ? IA32_PAGE_OFFSET : TASK_SIZE64)
97584+
97585+#define TASK_UNMAPPED_BASE PAGE_ALIGN(TASK_SIZE/3)
97586+
97587+/*
97588+ * Size of io_bitmap.
97589+ */
97590+#define IO_BITMAP_BITS 65536
97591+#define IO_BITMAP_BYTES (IO_BITMAP_BITS/8)
97592+#define IO_BITMAP_LONGS (IO_BITMAP_BYTES/sizeof(long))
97593+#ifndef CONFIG_X86_NO_TSS
97594+#define IO_BITMAP_OFFSET offsetof(struct tss_struct,io_bitmap)
97595+#endif
97596+#define INVALID_IO_BITMAP_OFFSET 0x8000
97597+
97598+struct i387_fxsave_struct {
97599+ u16 cwd;
97600+ u16 swd;
97601+ u16 twd;
97602+ u16 fop;
97603+ u64 rip;
97604+ u64 rdp;
97605+ u32 mxcsr;
97606+ u32 mxcsr_mask;
97607+ u32 st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */
97608+ u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 128 bytes */
97609+ u32 padding[24];
97610+} __attribute__ ((aligned (16)));
97611+
97612+union i387_union {
97613+ struct i387_fxsave_struct fxsave;
97614+};
97615+
97616+#ifndef CONFIG_X86_NO_TSS
97617+struct tss_struct {
97618+ u32 reserved1;
97619+ u64 rsp0;
97620+ u64 rsp1;
97621+ u64 rsp2;
97622+ u64 reserved2;
97623+ u64 ist[7];
97624+ u32 reserved3;
97625+ u32 reserved4;
97626+ u16 reserved5;
97627+ u16 io_bitmap_base;
97628+ /*
97629+ * The extra 1 is there because the CPU will access an
97630+ * additional byte beyond the end of the IO permission
97631+ * bitmap. The extra byte must be all 1 bits, and must
97632+ * be within the limit. Thus we have:
97633+ *
97634+ * 128 bytes, the bitmap itself, for ports 0..0x3ff
97635+ * 8 bytes, for an extra "long" of ~0UL
97636+ */
97637+ unsigned long io_bitmap[IO_BITMAP_LONGS + 1];
97638+} __attribute__((packed)) ____cacheline_aligned;
97639+
97640+DECLARE_PER_CPU(struct tss_struct,init_tss);
97641+#endif
97642+
97643+extern struct cpuinfo_x86 boot_cpu_data;
97644+
97645+#ifdef CONFIG_X86_VSMP
97646+#define ARCH_MIN_TASKALIGN (1 << INTERNODE_CACHE_SHIFT)
97647+#define ARCH_MIN_MMSTRUCT_ALIGN (1 << INTERNODE_CACHE_SHIFT)
97648+#else
97649+#define ARCH_MIN_TASKALIGN 16
97650+#define ARCH_MIN_MMSTRUCT_ALIGN 0
97651+#endif
97652+
97653+struct thread_struct {
97654+ unsigned long rsp0;
97655+ unsigned long rsp;
97656+ unsigned long userrsp; /* Copy from PDA */
97657+ unsigned long fs;
97658+ unsigned long gs;
97659+ unsigned short es, ds, fsindex, gsindex;
97660+/* Hardware debugging registers */
97661+ unsigned long debugreg0;
97662+ unsigned long debugreg1;
97663+ unsigned long debugreg2;
97664+ unsigned long debugreg3;
97665+ unsigned long debugreg6;
97666+ unsigned long debugreg7;
97667+/* fault info */
97668+ unsigned long cr2, trap_no, error_code;
97669+/* floating point info */
97670+ union i387_union i387 __attribute__((aligned(16)));
97671+/* IO permissions. The bitmap could be moved into the GDT, which would make
97672+ the switch faster for a limited number of ioperm-using tasks. -AK */
97673+ int ioperm;
97674+ unsigned long *io_bitmap_ptr;
97675+ unsigned io_bitmap_max;
97676+/* cached TLS descriptors. */
97677+ u64 tls_array[GDT_ENTRY_TLS_ENTRIES];
97678+ unsigned int iopl;
97679+} __attribute__((aligned(16)));
97680+
97681+#define INIT_THREAD { \
97682+ .rsp0 = (unsigned long)&init_stack + sizeof(init_stack) \
97683+}
97684+
97685+#ifndef CONFIG_X86_NO_TSS
97686+#define INIT_TSS { \
97687+ .rsp0 = (unsigned long)&init_stack + sizeof(init_stack) \
97688+}
97689+#endif
97690+
97691+#define INIT_MMAP \
97692+{ &init_mm, 0, 0, NULL, PAGE_SHARED, VM_READ | VM_WRITE | VM_EXEC, 1, NULL, NULL }
97693+
97694+#define start_thread(regs,new_rip,new_rsp) do { \
97695+ asm volatile("movl %0,%%fs; movl %0,%%es; movl %0,%%ds": :"r" (0)); \
97696+ load_gs_index(0); \
97697+ (regs)->rip = (new_rip); \
97698+ (regs)->rsp = (new_rsp); \
97699+ write_pda(oldrsp, (new_rsp)); \
97700+ (regs)->cs = __USER_CS; \
97701+ (regs)->ss = __USER_DS; \
97702+ (regs)->eflags = 0x200; \
97703+ set_fs(USER_DS); \
97704+} while(0)
97705+
97706+#define get_debugreg(var, register) \
97707+ var = HYPERVISOR_get_debugreg(register)
97708+#define set_debugreg(value, register) \
97709+ HYPERVISOR_set_debugreg(register, value)
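/*
 * Usage sketch (hypothetical caller, not from this patch): the accessors
 * keep the usual native form even though they are routed through the
 * hypervisor, e.g. a debug-trap path would still write:
 *
 *      unsigned long condition;
 *      get_debugreg(condition, 6);
 *      set_debugreg(0UL, 7);        (clears DR7, disabling breakpoints)
 */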
97710+
97711+struct task_struct;
97712+struct mm_struct;
97713+
97714+/* Free all resources held by a thread. */
97715+extern void release_thread(struct task_struct *);
97716+
97717+/* Prepare to copy thread state - unlazy all lazy status */
97718+extern void prepare_to_copy(struct task_struct *tsk);
97719+
97720+/*
97721+ * create a kernel thread without removing it from tasklists
97722+ */
97723+extern long kernel_thread(int (*fn)(void *), void * arg, unsigned long flags);
97724+
97725+/*
97726+ * Return saved PC of a blocked thread.
97727+ * What is this good for? it will be always the scheduler or ret_from_fork.
97728+ */
97729+#define thread_saved_pc(t) (*(unsigned long *)((t)->thread.rsp - 8))
97730+
97731+extern unsigned long get_wchan(struct task_struct *p);
97732+#define task_pt_regs(tsk) ((struct pt_regs *)(tsk)->thread.rsp0 - 1)
97733+#define KSTK_EIP(tsk) (task_pt_regs(tsk)->rip)
97734+#define KSTK_ESP(tsk) -1 /* sorry. doesn't work for syscall. */
97735+
97736+
97737+struct microcode_header {
97738+ unsigned int hdrver;
97739+ unsigned int rev;
97740+ unsigned int date;
97741+ unsigned int sig;
97742+ unsigned int cksum;
97743+ unsigned int ldrver;
97744+ unsigned int pf;
97745+ unsigned int datasize;
97746+ unsigned int totalsize;
97747+ unsigned int reserved[3];
97748+};
97749+
97750+struct microcode {
97751+ struct microcode_header hdr;
97752+ unsigned int bits[0];
97753+};
97754+
97755+typedef struct microcode microcode_t;
97756+typedef struct microcode_header microcode_header_t;
97757+
97758+/* microcode format is extended from Prescott processors */
97759+struct extended_signature {
97760+ unsigned int sig;
97761+ unsigned int pf;
97762+ unsigned int cksum;
97763+};
97764+
97765+struct extended_sigtable {
97766+ unsigned int count;
97767+ unsigned int cksum;
97768+ unsigned int reserved[3];
97769+ struct extended_signature sigs[0];
97770+};
97771+
97772+/* '6' because it used to be for P6 only (but now covers Pentium 4 as well) */
97773+#define MICROCODE_IOCFREE _IO('6',0)
97774+
97775+
97776+#define ASM_NOP1 K8_NOP1
97777+#define ASM_NOP2 K8_NOP2
97778+#define ASM_NOP3 K8_NOP3
97779+#define ASM_NOP4 K8_NOP4
97780+#define ASM_NOP5 K8_NOP5
97781+#define ASM_NOP6 K8_NOP6
97782+#define ASM_NOP7 K8_NOP7
97783+#define ASM_NOP8 K8_NOP8
97784+
97785+/* Opteron nops */
97786+#define K8_NOP1 ".byte 0x90\n"
97787+#define K8_NOP2 ".byte 0x66,0x90\n"
97788+#define K8_NOP3 ".byte 0x66,0x66,0x90\n"
97789+#define K8_NOP4 ".byte 0x66,0x66,0x66,0x90\n"
97790+#define K8_NOP5 K8_NOP3 K8_NOP2
97791+#define K8_NOP6 K8_NOP3 K8_NOP3
97792+#define K8_NOP7 K8_NOP4 K8_NOP3
97793+#define K8_NOP8 K8_NOP4 K8_NOP4
97794+
97795+#define ASM_NOP_MAX 8
97796+
97797+/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */
97798+static inline void rep_nop(void)
97799+{
97800+ __asm__ __volatile__("rep;nop": : :"memory");
97801+}
97802+
97803+/* Stop speculative execution */
97804+static inline void sync_core(void)
97805+{
97806+ int tmp;
97807+ asm volatile("cpuid" : "=a" (tmp) : "0" (1) : "ebx","ecx","edx","memory");
97808+}
97809+
97810+#define cpu_has_fpu 1
97811+
97812+#define ARCH_HAS_PREFETCH
97813+static inline void prefetch(void *x)
97814+{
97815+ asm volatile("prefetcht0 %0" :: "m" (*(unsigned long *)x));
97816+}
97817+
97818+#define ARCH_HAS_PREFETCHW 1
97819+static inline void prefetchw(void *x)
97820+{
97821+ alternative_input("prefetcht0 (%1)",
97822+ "prefetchw (%1)",
97823+ X86_FEATURE_3DNOW,
97824+ "r" (x));
97825+}
97826+
97827+#define ARCH_HAS_SPINLOCK_PREFETCH 1
97828+
97829+#define spin_lock_prefetch(x) prefetchw(x)
97830+
97831+#define cpu_relax() rep_nop()
97832+
97833+/*
97834+ * NSC/Cyrix CPU configuration register indexes
97835+ */
97836+#define CX86_CCR0 0xc0
97837+#define CX86_CCR1 0xc1
97838+#define CX86_CCR2 0xc2
97839+#define CX86_CCR3 0xc3
97840+#define CX86_CCR4 0xe8
97841+#define CX86_CCR5 0xe9
97842+#define CX86_CCR6 0xea
97843+#define CX86_CCR7 0xeb
97844+#define CX86_DIR0 0xfe
97845+#define CX86_DIR1 0xff
97846+#define CX86_ARR_BASE 0xc4
97847+#define CX86_RCR_BASE 0xdc
97848+
97849+/*
97850+ * NSC/Cyrix CPU indexed register access macros
97851+ */
97852+
97853+#define getCx86(reg) ({ outb((reg), 0x22); inb(0x23); })
97854+
97855+#define setCx86(reg, data) do { \
97856+ outb((reg), 0x22); \
97857+ outb((data), 0x23); \
97858+} while (0)
97859+
97860+static inline void serialize_cpu(void)
97861+{
97862+ __asm__ __volatile__ ("cpuid" : : : "ax", "bx", "cx", "dx");
97863+}
97864+
97865+static inline void __monitor(const void *eax, unsigned long ecx,
97866+ unsigned long edx)
97867+{
97868+ /* "monitor %eax,%ecx,%edx;" */
97869+ asm volatile(
97870+ ".byte 0x0f,0x01,0xc8;"
97871+ : :"a" (eax), "c" (ecx), "d"(edx));
97872+}
97873+
97874+static inline void __mwait(unsigned long eax, unsigned long ecx)
97875+{
97876+ /* "mwait %eax,%ecx;" */
97877+ asm volatile(
97878+ ".byte 0x0f,0x01,0xc9;"
97879+ : :"a" (eax), "c" (ecx));
97880+}
97881+
97882+#define stack_current() \
97883+({ \
97884+ struct thread_info *ti; \
97885+ asm("andq %%rsp,%0; ":"=r" (ti) : "0" (CURRENT_MASK)); \
97886+ ti->task; \
97887+})
97888+
97889+#define cache_line_size() (boot_cpu_data.x86_cache_alignment)
97890+
97891+extern unsigned long boot_option_idle_override;
97892+/* Boot loader type from the setup header */
97893+extern int bootloader_type;
97894+
97895+#define HAVE_ARCH_PICK_MMAP_LAYOUT 1
97896+
97897+#endif /* __ASM_X86_64_PROCESSOR_H */
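/*
 * Minimal sketch (invented, not part of the patch) of the busy-wait
 * pattern the rep_nop()/cpu_relax() helpers above are meant for:
 * spinning on a flag while emitting PAUSE so a hyperthreaded sibling
 * is not starved.  example_wait_for_flag() is hypothetical.
 */
static inline void example_wait_for_flag(volatile int *flag)
{
        while (!*flag)
                cpu_relax();
}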
97898diff -Nur linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/ptrace.h linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/ptrace.h
97899--- linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/ptrace.h 1970-01-01 00:00:00.000000000 +0000
97900+++ linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/ptrace.h 2007-01-08 15:00:46.000000000 +0000
97901@@ -0,0 +1,127 @@
97902+#ifndef _X86_64_PTRACE_H
97903+#define _X86_64_PTRACE_H
97904+
97905+#if defined(__ASSEMBLY__) || defined(__FRAME_OFFSETS)
97906+#define R15 0
97907+#define R14 8
97908+#define R13 16
97909+#define R12 24
97910+#define RBP 32
97911+#define RBX 40
97912+/* arguments: interrupts/non-tracing syscalls only save up to here */
97913+#define R11 48
97914+#define R10 56
97915+#define R9 64
97916+#define R8 72
97917+#define RAX 80
97918+#define RCX 88
97919+#define RDX 96
97920+#define RSI 104
97921+#define RDI 112
97922+#define ORIG_RAX 120 /* = ERROR */
97923+/* end of arguments */
97924+/* cpu exception frame or undefined in case of fast syscall. */
97925+#define RIP 128
97926+#define CS 136
97927+#define EFLAGS 144
97928+#define RSP 152
97929+#define SS 160
97930+#define ARGOFFSET R11
97931+#endif /* __ASSEMBLY__ */
97932+
97933+/* top of stack page */
97934+#define FRAME_SIZE 168
97935+
97936+#define PTRACE_OLDSETOPTIONS 21
97937+
97938+#ifndef __ASSEMBLY__
97939+
97940+struct pt_regs {
97941+ unsigned long r15;
97942+ unsigned long r14;
97943+ unsigned long r13;
97944+ unsigned long r12;
97945+ unsigned long rbp;
97946+ unsigned long rbx;
97947+/* arguments: non-interrupt/non-tracing syscalls only save up to here */
97948+ unsigned long r11;
97949+ unsigned long r10;
97950+ unsigned long r9;
97951+ unsigned long r8;
97952+ unsigned long rax;
97953+ unsigned long rcx;
97954+ unsigned long rdx;
97955+ unsigned long rsi;
97956+ unsigned long rdi;
97957+ unsigned long orig_rax;
97958+/* end of arguments */
97959+/* cpu exception frame or undefined */
97960+ unsigned long rip;
97961+ unsigned long cs;
97962+ unsigned long eflags;
97963+ unsigned long rsp;
97964+ unsigned long ss;
97965+/* top of stack page */
97966+};
97967+
97968+#endif
97969+
97970+/* Arbitrarily choose the same ptrace numbers as used by the Sparc code. */
97971+#define PTRACE_GETREGS 12
97972+#define PTRACE_SETREGS 13
97973+#define PTRACE_GETFPREGS 14
97974+#define PTRACE_SETFPREGS 15
97975+#define PTRACE_GETFPXREGS 18
97976+#define PTRACE_SETFPXREGS 19
97977+
97978+/* only useful for accessing 32-bit programs */
97979+#define PTRACE_GET_THREAD_AREA 25
97980+#define PTRACE_SET_THREAD_AREA 26
97981+
97982+#define PTRACE_ARCH_PRCTL 30 /* arch_prctl for child */
97983+
97984+#if defined(__KERNEL__) && !defined(__ASSEMBLY__)
97985+#define user_mode(regs) (!!((regs)->cs & 3))
97986+#define user_mode_vm(regs) user_mode(regs)
97987+#define instruction_pointer(regs) ((regs)->rip)
97988+#if defined(CONFIG_SMP) && defined(CONFIG_FRAME_POINTER)
97989+extern unsigned long profile_pc(struct pt_regs *regs);
97990+#else
97991+#define profile_pc(regs) instruction_pointer(regs)
97992+#endif
97993+
97994+#include <linux/compiler.h>
97995+
97996+void signal_fault(struct pt_regs *regs, void __user *frame, char *where);
97997+
97998+struct task_struct;
97999+
98000+extern unsigned long
98001+convert_rip_to_linear(struct task_struct *child, struct pt_regs *regs);
98002+
98003+enum {
98004+ EF_CF = 0x00000001,
98005+ EF_PF = 0x00000004,
98006+ EF_AF = 0x00000010,
98007+ EF_ZF = 0x00000040,
98008+ EF_SF = 0x00000080,
98009+ EF_TF = 0x00000100,
98010+ EF_IE = 0x00000200,
98011+ EF_DF = 0x00000400,
98012+ EF_OF = 0x00000800,
98013+ EF_IOPL = 0x00003000,
98014+ EF_IOPL_RING0 = 0x00000000,
98015+ EF_IOPL_RING1 = 0x00001000,
98016+ EF_IOPL_RING2 = 0x00002000,
98017+ EF_NT = 0x00004000, /* nested task */
98018+ EF_RF = 0x00010000, /* resume */
98019+ EF_VM = 0x00020000, /* virtual mode */
98020+ EF_AC = 0x00040000, /* alignment */
98021+ EF_VIF = 0x00080000, /* virtual interrupt */
98022+ EF_VIP = 0x00100000, /* virtual interrupt pending */
98023+ EF_ID = 0x00200000, /* id */
98024+};
98025+
98026+#endif
98027+
98028+#endif
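/*
 * Sketch (invented, not part of the patch) of how the pt_regs helpers
 * above are typically used from a fault or profiling path.
 * example_report_fault() is hypothetical; printk() needs <linux/kernel.h>.
 */
static inline void example_report_fault(struct pt_regs *regs)
{
        if (user_mode(regs))
                printk("fault at user rip %016lx, rsp %016lx\n",
                       instruction_pointer(regs), regs->rsp);
        else
                printk("fault at kernel pc %016lx\n", profile_pc(regs));
}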
98029diff -Nur linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/smp.h linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/smp.h
98030--- linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/smp.h 1970-01-01 00:00:00.000000000 +0000
98031+++ linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/smp.h 2007-01-08 15:00:46.000000000 +0000
98032@@ -0,0 +1,152 @@
98033+#ifndef __ASM_SMP_H
98034+#define __ASM_SMP_H
98035+
98036+/*
98037+ * We need the APIC definitions automatically as part of 'smp.h'
98038+ */
98039+#ifndef __ASSEMBLY__
98040+#include <linux/config.h>
98041+#include <linux/threads.h>
98042+#include <linux/cpumask.h>
98043+#include <linux/bitops.h>
98044+extern int disable_apic;
98045+#endif
98046+
98047+#ifdef CONFIG_X86_LOCAL_APIC
98048+#ifndef __ASSEMBLY__
98049+#include <asm/fixmap.h>
98050+#include <asm/mpspec.h>
98051+#ifdef CONFIG_X86_IO_APIC
98052+#include <asm/io_apic.h>
98053+#endif
98054+#include <asm/apic.h>
98055+#include <asm/thread_info.h>
98056+#endif
98057+#endif
98058+
98059+#ifdef CONFIG_SMP
98060+#ifndef ASSEMBLY
98061+
98062+#include <asm/pda.h>
98063+
98064+struct pt_regs;
98065+
98066+extern cpumask_t cpu_present_mask;
98067+extern cpumask_t cpu_possible_map;
98068+extern cpumask_t cpu_online_map;
98069+extern cpumask_t cpu_initialized;
98070+
98071+/*
98072+ * Private routines/data
98073+ */
98074+
98075+extern void smp_alloc_memory(void);
98076+extern volatile unsigned long smp_invalidate_needed;
98077+extern int pic_mode;
98078+extern void lock_ipi_call_lock(void);
98079+extern void unlock_ipi_call_lock(void);
98080+extern int smp_num_siblings;
98081+extern void smp_send_reschedule(int cpu);
98082+void smp_stop_cpu(void);
98083+extern int smp_call_function_single(int cpuid, void (*func) (void *info),
98084+ void *info, int retry, int wait);
98085+
98086+extern cpumask_t cpu_sibling_map[NR_CPUS];
98087+extern cpumask_t cpu_core_map[NR_CPUS];
98088+extern int phys_proc_id[NR_CPUS];
98089+extern int cpu_core_id[NR_CPUS];
98090+
98091+#define SMP_TRAMPOLINE_BASE 0x6000
98092+
98093+/*
98094+ * On x86 all CPUs are mapped 1:1 to the APIC space.
98095+ * This simplifies scheduling and IPI sending and
98096+ * compresses data structures.
98097+ */
98098+
98099+static inline int num_booting_cpus(void)
98100+{
98101+ return cpus_weight(cpu_possible_map);
98102+}
98103+
98104+#define raw_smp_processor_id() read_pda(cpunumber)
98105+
98106+#ifdef CONFIG_X86_LOCAL_APIC
98107+static inline int hard_smp_processor_id(void)
98108+{
98109+ /* we don't want to mark this access volatile - bad code generation */
98110+ return GET_APIC_ID(*(unsigned int *)(APIC_BASE+APIC_ID));
98111+}
98112+#endif
98113+
98114+extern int safe_smp_processor_id(void);
98115+extern int __cpu_disable(void);
98116+extern void __cpu_die(unsigned int cpu);
98117+extern void prefill_possible_map(void);
98118+extern unsigned num_processors;
98119+extern unsigned disabled_cpus;
98120+
98121+#endif /* !ASSEMBLY */
98122+
98123+#define NO_PROC_ID 0xFF /* No processor magic marker */
98124+
98125+#endif
98126+
98127+#ifndef ASSEMBLY
98128+/*
98129+ * Some lowlevel functions might want to know about
98130+ * the real APIC ID <-> CPU # mapping.
98131+ */
98132+extern u8 x86_cpu_to_apicid[NR_CPUS]; /* physical ID */
98133+extern u8 x86_cpu_to_log_apicid[NR_CPUS];
98134+extern u8 bios_cpu_apicid[];
98135+
98136+#ifdef CONFIG_X86_LOCAL_APIC
98137+static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask)
98138+{
98139+ return cpus_addr(cpumask)[0];
98140+}
98141+
98142+static inline int cpu_present_to_apicid(int mps_cpu)
98143+{
98144+ if (mps_cpu < NR_CPUS)
98145+ return (int)bios_cpu_apicid[mps_cpu];
98146+ else
98147+ return BAD_APICID;
98148+}
98149+#endif
98150+
98151+#endif /* !ASSEMBLY */
98152+
98153+#ifndef CONFIG_SMP
98154+#define stack_smp_processor_id() 0
98155+#define safe_smp_processor_id() 0
98156+#define cpu_logical_map(x) (x)
98157+#else
98158+#include <asm/thread_info.h>
98159+#define stack_smp_processor_id() \
98160+({ \
98161+ struct thread_info *ti; \
98162+ __asm__("andq %%rsp,%0; ":"=r" (ti) : "0" (CURRENT_MASK)); \
98163+ ti->cpu; \
98164+})
98165+#endif
98166+
98167+#ifndef __ASSEMBLY__
98168+#ifdef CONFIG_X86_LOCAL_APIC
98169+static __inline int logical_smp_processor_id(void)
98170+{
98171+ /* we don't want to mark this access volatile - bad code generation */
98172+ return GET_APIC_LOGICAL_ID(*(unsigned long *)(APIC_BASE+APIC_LDR));
98173+}
98174+#endif
98175+#endif
98176+
98177+#ifdef CONFIG_SMP
98178+#define cpu_physical_id(cpu) x86_cpu_to_apicid[cpu]
98179+#else
98180+#define cpu_physical_id(cpu) boot_cpu_id
98181+#endif
98182+
98183+#endif
98184+
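/*
 * Sketch (invented, not part of the patch) of the cross-CPU call
 * interface declared above: run a callback on one specific CPU and
 * report its APIC id.  example_ping()/example_poke_cpu() are
 * hypothetical; the smp_call_function_single() call matches the
 * declaration above (retry, wait), and printk() needs <linux/kernel.h>.
 */
static void example_ping(void *info)
{
        printk("hello from cpu %d (apicid %u)\n",
               raw_smp_processor_id(),
               cpu_physical_id(raw_smp_processor_id()));
}

static int example_poke_cpu(int cpu)
{
        /* retry = 0, wait = 1: return only after example_ping() has run. */
        return smp_call_function_single(cpu, example_ping, NULL, 0, 1);
}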
98185diff -Nur linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/synch_bitops.h linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/synch_bitops.h
98186--- linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/synch_bitops.h 1970-01-01 00:00:00.000000000 +0000
98187+++ linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/synch_bitops.h 2007-01-08 15:00:46.000000000 +0000
98188@@ -0,0 +1,2 @@
98189+
98190+#include <asm-i386/mach-xen/asm/synch_bitops.h>
98191diff -Nur linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/system.h linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/system.h
98192--- linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/system.h 1970-01-01 00:00:00.000000000 +0000
98193+++ linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/system.h 2007-01-08 15:00:46.000000000 +0000
98194@@ -0,0 +1,434 @@
98195+#ifndef __ASM_SYSTEM_H
98196+#define __ASM_SYSTEM_H
98197+
98198+#include <linux/config.h>
98199+#include <linux/kernel.h>
98200+#include <asm/segment.h>
98201+#include <asm/synch_bitops.h>
98202+#include <asm/hypervisor.h>
98203+#include <xen/interface/arch-x86_64.h>
98204+
98205+#ifdef __KERNEL__
98206+
98207+#ifdef CONFIG_SMP
98208+#define __vcpu_id smp_processor_id()
98209+#else
98210+#define __vcpu_id 0
98211+#endif
98212+
98213+#ifdef CONFIG_SMP
98214+#define LOCK_PREFIX "lock ; "
98215+#else
98216+#define LOCK_PREFIX ""
98217+#endif
98218+
98219+#define __STR(x) #x
98220+#define STR(x) __STR(x)
98221+
98222+#define __SAVE(reg,offset) "movq %%" #reg ",(14-" #offset ")*8(%%rsp)\n\t"
98223+#define __RESTORE(reg,offset) "movq (14-" #offset ")*8(%%rsp),%%" #reg "\n\t"
98224+
98225+/* frame pointer must be last for get_wchan */
98226+#define SAVE_CONTEXT "pushq %%rbp ; movq %%rsi,%%rbp\n\t"
98227+#define RESTORE_CONTEXT "movq %%rbp,%%rsi ; popq %%rbp\n\t"
98228+
98229+#define __EXTRA_CLOBBER \
98230+ ,"rcx","rbx","rdx","r8","r9","r10","r11","r12","r13","r14","r15"
98231+
98232+#define switch_to(prev,next,last) \
98233+ asm volatile(SAVE_CONTEXT \
98234+ "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */ \
98235+ "movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */ \
98236+ "call __switch_to\n\t" \
98237+ ".globl thread_return\n" \
98238+ "thread_return:\n\t" \
98239+ "movq %%gs:%P[pda_pcurrent],%%rsi\n\t" \
98240+ "movq %P[thread_info](%%rsi),%%r8\n\t" \
98241+ LOCK "btr %[tif_fork],%P[ti_flags](%%r8)\n\t" \
98242+ "movq %%rax,%%rdi\n\t" \
98243+ "jc ret_from_fork\n\t" \
98244+ RESTORE_CONTEXT \
98245+ : "=a" (last) \
98246+ : [next] "S" (next), [prev] "D" (prev), \
98247+ [threadrsp] "i" (offsetof(struct task_struct, thread.rsp)), \
98248+ [ti_flags] "i" (offsetof(struct thread_info, flags)),\
98249+ [tif_fork] "i" (TIF_FORK), \
98250+ [thread_info] "i" (offsetof(struct task_struct, thread_info)), \
98251+ [pda_pcurrent] "i" (offsetof(struct x8664_pda, pcurrent)) \
98252+ : "memory", "cc" __EXTRA_CLOBBER)
98253+
98254+
98255+extern void load_gs_index(unsigned);
98256+
98257+/*
98258+ * Load a segment. Fall back on loading the zero
98259+ * segment if something goes wrong.
98260+ */
98261+#define loadsegment(seg,value) \
98262+ asm volatile("\n" \
98263+ "1:\t" \
98264+ "movl %k0,%%" #seg "\n" \
98265+ "2:\n" \
98266+ ".section .fixup,\"ax\"\n" \
98267+ "3:\t" \
98268+ "movl %1,%%" #seg "\n\t" \
98269+ "jmp 2b\n" \
98270+ ".previous\n" \
98271+ ".section __ex_table,\"a\"\n\t" \
98272+ ".align 8\n\t" \
98273+ ".quad 1b,3b\n" \
98274+ ".previous" \
98275+ : :"r" (value), "r" (0))
98276+
98277+#define set_debug(value,register) \
98278+ __asm__("movq %0,%%db" #register \
98279+ : /* no output */ \
98280+ :"r" ((unsigned long) value))
98281+
98282+
98283+#ifdef __KERNEL__
98284+struct alt_instr {
98285+ __u8 *instr; /* original instruction */
98286+ __u8 *replacement;
98287+ __u8 cpuid; /* cpuid bit set for replacement */
98288+ __u8 instrlen; /* length of original instruction */
98289+ __u8 replacementlen; /* length of new instruction, <= instrlen */
98290+ __u8 pad[5];
98291+};
98292+#endif
98293+
98294+/*
98295+ * Alternative instructions for different CPU types or capabilities.
98296+ *
98297+ * This allows the use of optimized instructions even on generic binary
98298+ * kernels.
98299+ *
98300+ * The length of oldinstr must be greater than or equal to the length of
98301+ * newinstr; it can be padded with nops as needed.
98302+ *
98303+ * For non-barrier-like inlines please define new variants
98304+ * without volatile and memory clobber.
98305+ */
98306+#define alternative(oldinstr, newinstr, feature) \
98307+ asm volatile ("661:\n\t" oldinstr "\n662:\n" \
98308+ ".section .altinstructions,\"a\"\n" \
98309+ " .align 8\n" \
98310+ " .quad 661b\n" /* label */ \
98311+ " .quad 663f\n" /* new instruction */ \
98312+ " .byte %c0\n" /* feature bit */ \
98313+ " .byte 662b-661b\n" /* sourcelen */ \
98314+ " .byte 664f-663f\n" /* replacementlen */ \
98315+ ".previous\n" \
98316+ ".section .altinstr_replacement,\"ax\"\n" \
98317+ "663:\n\t" newinstr "\n664:\n" /* replacement */ \
98318+ ".previous" :: "i" (feature) : "memory")
98319+
98320+/*
98321+ * Alternative inline assembly with input.
98322+ *
98323+ * Peculiarities:
98324+ * No memory clobber here.
98325+ * Argument numbers start with 1.
98326+ * Best is to use constraints that are fixed size (like (%1) ... "r")
98327+ * If you use variable sized constraints like "m" or "g" in the
98328+ * replacement make sure to pad to the worst case length.
98329+ */
98330+#define alternative_input(oldinstr, newinstr, feature, input...) \
98331+ asm volatile ("661:\n\t" oldinstr "\n662:\n" \
98332+ ".section .altinstructions,\"a\"\n" \
98333+ " .align 8\n" \
98334+ " .quad 661b\n" /* label */ \
98335+ " .quad 663f\n" /* new instruction */ \
98336+ " .byte %c0\n" /* feature bit */ \
98337+ " .byte 662b-661b\n" /* sourcelen */ \
98338+ " .byte 664f-663f\n" /* replacementlen */ \
98339+ ".previous\n" \
98340+ ".section .altinstr_replacement,\"ax\"\n" \
98341+ "663:\n\t" newinstr "\n664:\n" /* replacement */ \
98342+ ".previous" :: "i" (feature), ##input)
98343+
98344+/* Like alternative_input, but with a single output argument */
98345+#define alternative_io(oldinstr, newinstr, feature, output, input...) \
98346+ asm volatile ("661:\n\t" oldinstr "\n662:\n" \
98347+ ".section .altinstructions,\"a\"\n" \
98348+ " .align 8\n" \
98349+ " .quad 661b\n" /* label */ \
98350+ " .quad 663f\n" /* new instruction */ \
98351+ " .byte %c[feat]\n" /* feature bit */ \
98352+ " .byte 662b-661b\n" /* sourcelen */ \
98353+ " .byte 664f-663f\n" /* replacementlen */ \
98354+ ".previous\n" \
98355+ ".section .altinstr_replacement,\"ax\"\n" \
98356+ "663:\n\t" newinstr "\n664:\n" /* replacement */ \
98357+ ".previous" : output : [feat] "i" (feature), ##input)
98358+
98359+/*
98360+ * Clear and set 'TS' bit respectively
98361+ */
98362+#define clts() (HYPERVISOR_fpu_taskswitch(0))
98363+
98364+static inline unsigned long read_cr0(void)
98365+{
98366+ unsigned long cr0;
98367+ asm volatile("movq %%cr0,%0" : "=r" (cr0));
98368+ return cr0;
98369+}
98370+
98371+static inline void write_cr0(unsigned long val)
98372+{
98373+ asm volatile("movq %0,%%cr0" :: "r" (val));
98374+}
98375+
98376+#define read_cr3() ({ \
98377+ unsigned long __dummy; \
98378+ asm("movq %%cr3,%0" : "=r" (__dummy)); \
98379+ machine_to_phys(__dummy); \
98380+})
98381+
98382+static inline unsigned long read_cr4(void)
98383+{
98384+ unsigned long cr4;
98385+ asm("movq %%cr4,%0" : "=r" (cr4));
98386+ return cr4;
98387+}
98388+
98389+static inline void write_cr4(unsigned long val)
98390+{
98391+ asm volatile("movq %0,%%cr4" :: "r" (val));
98392+}
98393+
98394+#define stts() (HYPERVISOR_fpu_taskswitch(1))
98395+
98396+#define wbinvd() \
98397+ __asm__ __volatile__ ("wbinvd": : :"memory");
98398+
98399+/*
98400+ * On SMP systems, when the scheduler does migration-cost autodetection,
98401+ * it needs a way to flush as much of the CPU's caches as possible.
98402+ */
98403+static inline void sched_cacheflush(void)
98404+{
98405+ wbinvd();
98406+}
98407+
98408+#endif /* __KERNEL__ */
98409+
98410+#define nop() __asm__ __volatile__ ("nop")
98411+
98412+#define xchg(ptr,v) ((__typeof__(*(ptr)))__xchg((unsigned long)(v),(ptr),sizeof(*(ptr))))
98413+
98414+#define tas(ptr) (xchg((ptr),1))
98415+
98416+#define __xg(x) ((volatile long *)(x))
98417+
98418+static inline void set_64bit(volatile unsigned long *ptr, unsigned long val)
98419+{
98420+ *ptr = val;
98421+}
98422+
98423+#define _set_64bit set_64bit
98424+
98425+/*
98426+ * Note: no "lock" prefix even on SMP: xchg always implies lock anyway
98427+ * Note 2: xchg has a side effect, so the volatile attribute is necessary;
98428+ * strictly the primitive is invalid as written since *ptr is an output argument. --ANK
98429+ */
98430+static inline unsigned long __xchg(unsigned long x, volatile void * ptr, int size)
98431+{
98432+ switch (size) {
98433+ case 1:
98434+ __asm__ __volatile__("xchgb %b0,%1"
98435+ :"=q" (x)
98436+ :"m" (*__xg(ptr)), "0" (x)
98437+ :"memory");
98438+ break;
98439+ case 2:
98440+ __asm__ __volatile__("xchgw %w0,%1"
98441+ :"=r" (x)
98442+ :"m" (*__xg(ptr)), "0" (x)
98443+ :"memory");
98444+ break;
98445+ case 4:
98446+ __asm__ __volatile__("xchgl %k0,%1"
98447+ :"=r" (x)
98448+ :"m" (*__xg(ptr)), "0" (x)
98449+ :"memory");
98450+ break;
98451+ case 8:
98452+ __asm__ __volatile__("xchgq %0,%1"
98453+ :"=r" (x)
98454+ :"m" (*__xg(ptr)), "0" (x)
98455+ :"memory");
98456+ break;
98457+ }
98458+ return x;
98459+}
98460+
98461+/*
98462+ * Atomic compare and exchange. Compare OLD with MEM, if identical,
98463+ * store NEW in MEM. Return the initial value in MEM. Success is
98464+ * indicated by comparing RETURN with OLD.
98465+ */
98466+
98467+#define __HAVE_ARCH_CMPXCHG 1
98468+
98469+static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old,
98470+ unsigned long new, int size)
98471+{
98472+ unsigned long prev;
98473+ switch (size) {
98474+ case 1:
98475+ __asm__ __volatile__(LOCK_PREFIX "cmpxchgb %b1,%2"
98476+ : "=a"(prev)
98477+ : "q"(new), "m"(*__xg(ptr)), "0"(old)
98478+ : "memory");
98479+ return prev;
98480+ case 2:
98481+ __asm__ __volatile__(LOCK_PREFIX "cmpxchgw %w1,%2"
98482+ : "=a"(prev)
98483+ : "r"(new), "m"(*__xg(ptr)), "0"(old)
98484+ : "memory");
98485+ return prev;
98486+ case 4:
98487+ __asm__ __volatile__(LOCK_PREFIX "cmpxchgl %k1,%2"
98488+ : "=a"(prev)
98489+ : "r"(new), "m"(*__xg(ptr)), "0"(old)
98490+ : "memory");
98491+ return prev;
98492+ case 8:
98493+ __asm__ __volatile__(LOCK_PREFIX "cmpxchgq %1,%2"
98494+ : "=a"(prev)
98495+ : "r"(new), "m"(*__xg(ptr)), "0"(old)
98496+ : "memory");
98497+ return prev;
98498+ }
98499+ return old;
98500+}
98501+
98502+#define cmpxchg(ptr,o,n)\
98503+ ((__typeof__(*(ptr)))__cmpxchg((ptr),(unsigned long)(o),\
98504+ (unsigned long)(n),sizeof(*(ptr))))
98505+
98506+#ifdef CONFIG_SMP
98507+#define smp_mb() mb()
98508+#define smp_rmb() rmb()
98509+#define smp_wmb() wmb()
98510+#define smp_read_barrier_depends() do {} while(0)
98511+#else
98512+#define smp_mb() barrier()
98513+#define smp_rmb() barrier()
98514+#define smp_wmb() barrier()
98515+#define smp_read_barrier_depends() do {} while(0)
98516+#endif
98517+
98518+
98519+/*
98520+ * Force strict CPU ordering.
98521+ * And yes, this is required on UP too when we're talking
98522+ * to devices.
98523+ */
98524+#define mb() asm volatile("mfence":::"memory")
98525+#define rmb() asm volatile("lfence":::"memory")
98526+
98527+#ifdef CONFIG_UNORDERED_IO
98528+#define wmb() asm volatile("sfence" ::: "memory")
98529+#else
98530+#define wmb() asm volatile("" ::: "memory")
98531+#endif
98532+#define read_barrier_depends() do {} while(0)
98533+#define set_mb(var, value) do { (void) xchg(&var, value); } while (0)
98534+#define set_wmb(var, value) do { var = value; wmb(); } while (0)
98535+
98536+#define warn_if_not_ulong(x) do { unsigned long foo; (void) (&(x) == &foo); } while (0)
98537+
98538+
98539+/*
98540+ * The use of 'barrier' in the following reflects their use as local-lock
98541+ * operations. Reentrancy must be prevented (e.g., __cli()) /before/ following
98542+ * critical operations are executed. All critical operations must complete
98543+ * /before/ reentrancy is permitted (e.g., __sti()). Alpha architecture also
98544+ * includes these barriers, for example.
98545+ */
98546+
98547+#define __cli() \
98548+do { \
98549+ vcpu_info_t *_vcpu; \
98550+ preempt_disable(); \
98551+ _vcpu = &HYPERVISOR_shared_info->vcpu_info[__vcpu_id]; \
98552+ _vcpu->evtchn_upcall_mask = 1; \
98553+ preempt_enable_no_resched(); \
98554+ barrier(); \
98555+} while (0)
98556+
98557+#define __sti() \
98558+do { \
98559+ vcpu_info_t *_vcpu; \
98560+ barrier(); \
98561+ preempt_disable(); \
98562+ _vcpu = &HYPERVISOR_shared_info->vcpu_info[__vcpu_id]; \
98563+ _vcpu->evtchn_upcall_mask = 0; \
98564+ barrier(); /* unmask then check (avoid races) */ \
98565+ if ( unlikely(_vcpu->evtchn_upcall_pending) ) \
98566+ force_evtchn_callback(); \
98567+ preempt_enable(); \
98568+} while (0)
98569+
98570+#define __save_flags(x) \
98571+do { \
98572+ vcpu_info_t *_vcpu; \
98573+ preempt_disable(); \
98574+ _vcpu = &HYPERVISOR_shared_info->vcpu_info[__vcpu_id]; \
98575+ (x) = _vcpu->evtchn_upcall_mask; \
98576+ preempt_enable(); \
98577+} while (0)
98578+
98579+#define __restore_flags(x) \
98580+do { \
98581+ vcpu_info_t *_vcpu; \
98582+ barrier(); \
98583+ preempt_disable(); \
98584+ _vcpu = &HYPERVISOR_shared_info->vcpu_info[__vcpu_id]; \
98585+ if ((_vcpu->evtchn_upcall_mask = (x)) == 0) { \
98586+ barrier(); /* unmask then check (avoid races) */ \
98587+ if ( unlikely(_vcpu->evtchn_upcall_pending) ) \
98588+ force_evtchn_callback(); \
98589+ preempt_enable(); \
98590+ } else \
98591+ preempt_enable_no_resched(); \
98592+} while (0)
98593+
98594+#define __save_and_cli(x) \
98595+do { \
98596+ vcpu_info_t *_vcpu; \
98597+ preempt_disable(); \
98598+ _vcpu = &HYPERVISOR_shared_info->vcpu_info[__vcpu_id]; \
98599+ (x) = _vcpu->evtchn_upcall_mask; \
98600+ _vcpu->evtchn_upcall_mask = 1; \
98601+ preempt_enable_no_resched(); \
98602+ barrier(); \
98603+} while (0)
98604+
98605+#define local_irq_save(x) __save_and_cli(x)
98606+#define local_irq_restore(x) __restore_flags(x)
98607+#define local_save_flags(x) __save_flags(x)
98608+#define local_irq_disable() __cli()
98609+#define local_irq_enable() __sti()
98610+
98611+/* Cannot use preempt_enable() here as we would recurse in preempt_sched(). */
98612+#define irqs_disabled() \
98613+({ int ___x; \
98614+ vcpu_info_t *_vcpu; \
98615+ preempt_disable(); \
98616+ _vcpu = &HYPERVISOR_shared_info->vcpu_info[__vcpu_id]; \
98617+ ___x = (_vcpu->evtchn_upcall_mask != 0); \
98618+ preempt_enable_no_resched(); \
98619+ ___x; })
98620+
98621+void safe_halt(void);
98622+void halt(void);
98623+
98624+void cpu_idle_wait(void);
98625+
98626+extern unsigned long arch_align_stack(unsigned long sp);
98627+
98628+#endif
98629diff -Nur linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/timer.h linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/timer.h
98630--- linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/timer.h 1970-01-01 00:00:00.000000000 +0000
98631+++ linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/timer.h 2007-01-08 15:00:46.000000000 +0000
98632@@ -0,0 +1,67 @@
98633+#ifndef _ASMi386_TIMER_H
98634+#define _ASMi386_TIMER_H
98635+#include <linux/init.h>
98636+
98637+/**
98638+ * struct timer_opts - used to define a timer source
98639+ *
98640+ * @name: name of the timer.
98641+ * @init: Probes and initializes the timer. Takes clock= override
98642+ * string as an argument. Returns 0 on success, anything else
98643+ * on failure.
98644+ * @mark_offset: called by the timer interrupt.
98645+ * @get_offset: called by gettimeofday(). Returns the number of microseconds
98646+ * since the last timer interupt.
98647+ * @monotonic_clock: returns the number of nanoseconds since the init of the
98648+ * timer.
98649+ * @delay: delays this many clock cycles.
98650+ */
98651+struct timer_opts {
98652+ char* name;
98653+ void (*mark_offset)(void);
98654+ unsigned long (*get_offset)(void);
98655+ unsigned long long (*monotonic_clock)(void);
98656+ void (*delay)(unsigned long);
98657+ unsigned long (*read_timer)(void);
98658+ int (*suspend)(pm_message_t state);
98659+ int (*resume)(void);
98660+};
98661+
98662+struct init_timer_opts {
98663+ int (*init)(char *override);
98664+ struct timer_opts *opts;
98665+};
98666+
98667+#define TICK_SIZE (tick_nsec / 1000)
98668+
98669+extern struct timer_opts* __init select_timer(void);
98670+extern void clock_fallback(void);
98671+void setup_pit_timer(void);
98672+
98673+/* Modifiers for buggy PIT handling */
98674+
98675+extern int pit_latch_buggy;
98676+
98677+extern struct timer_opts *cur_timer;
98678+extern int timer_ack;
98679+
98680+/* list of externed timers */
98681+extern struct timer_opts timer_none;
98682+extern struct timer_opts timer_pit;
98683+extern struct init_timer_opts timer_pit_init;
98684+extern struct init_timer_opts timer_tsc_init;
98685+#ifdef CONFIG_X86_CYCLONE_TIMER
98686+extern struct init_timer_opts timer_cyclone_init;
98687+#endif
98688+
98689+extern unsigned long calibrate_tsc(void);
98690+extern void init_cpu_khz(void);
98691+#ifdef CONFIG_HPET_TIMER
98692+extern struct init_timer_opts timer_hpet_init;
98693+extern unsigned long calibrate_tsc_hpet(unsigned long *tsc_hpet_quotient_ptr);
98694+#endif
98695+
98696+#ifdef CONFIG_X86_PM_TIMER
98697+extern struct init_timer_opts timer_pmtmr_init;
98698+#endif
98699+#endif
98700diff -Nur linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/tlbflush.h linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/tlbflush.h
98701--- linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/tlbflush.h 1970-01-01 00:00:00.000000000 +0000
98702+++ linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/tlbflush.h 2007-01-08 15:00:46.000000000 +0000
98703@@ -0,0 +1,104 @@
98704+#ifndef _X8664_TLBFLUSH_H
98705+#define _X8664_TLBFLUSH_H
98706+
98707+#include <linux/config.h>
98708+#include <linux/mm.h>
98709+#include <asm/processor.h>
98710+
98711+#define __flush_tlb() xen_tlb_flush()
98712+
98713+/*
98714+ * Global pages have to be flushed a bit differently. Not a real
98715+ * performance problem because this does not happen often.
98716+ */
98717+#define __flush_tlb_global() xen_tlb_flush()
98718+
98719+
98720+extern unsigned long pgkern_mask;
98721+
98722+#define __flush_tlb_all() __flush_tlb_global()
98723+
98724+#define __flush_tlb_one(addr) xen_invlpg((unsigned long)addr)
98725+
98726+
98727+/*
98728+ * TLB flushing:
98729+ *
98730+ * - flush_tlb() flushes the current mm struct TLBs
98731+ * - flush_tlb_all() flushes all processes TLBs
98732+ * - flush_tlb_mm(mm) flushes the specified mm context TLBs
98733+ * - flush_tlb_page(vma, vmaddr) flushes one page
98734+ * - flush_tlb_range(vma, start, end) flushes a range of pages
98735+ * - flush_tlb_kernel_range(start, end) flushes a range of kernel pages
98736+ * - flush_tlb_pgtables(mm, start, end) flushes a range of page tables
98737+ *
98738+ * x86-64 can only flush individual pages or full VMs. For a range flush
98739+ * we always do the full VM. It might be worth testing whether a few
98740+ * INVLPGs in a row are a win for a small range.
98741+ */
98742+
98743+#ifndef CONFIG_SMP
98744+
98745+#define flush_tlb() __flush_tlb()
98746+#define flush_tlb_all() __flush_tlb_all()
98747+#define local_flush_tlb() __flush_tlb()
98748+
98749+static inline void flush_tlb_mm(struct mm_struct *mm)
98750+{
98751+ if (mm == current->active_mm)
98752+ __flush_tlb();
98753+}
98754+
98755+static inline void flush_tlb_page(struct vm_area_struct *vma,
98756+ unsigned long addr)
98757+{
98758+ if (vma->vm_mm == current->active_mm)
98759+ __flush_tlb_one(addr);
98760+}
98761+
98762+static inline void flush_tlb_range(struct vm_area_struct *vma,
98763+ unsigned long start, unsigned long end)
98764+{
98765+ if (vma->vm_mm == current->active_mm)
98766+ __flush_tlb();
98767+}
98768+
98769+#else
98770+
98771+#include <asm/smp.h>
98772+
98773+#define local_flush_tlb() \
98774+ __flush_tlb()
98775+
98776+extern void flush_tlb_all(void);
98777+extern void flush_tlb_current_task(void);
98778+extern void flush_tlb_mm(struct mm_struct *);
98779+extern void flush_tlb_page(struct vm_area_struct *, unsigned long);
98780+
98781+#define flush_tlb() flush_tlb_current_task()
98782+
98783+static inline void flush_tlb_range(struct vm_area_struct * vma, unsigned long start, unsigned long end)
98784+{
98785+ flush_tlb_mm(vma->vm_mm);
98786+}
98787+
98788+#define TLBSTATE_OK 1
98789+#define TLBSTATE_LAZY 2
98790+
98791+/* Roughly an IPI every 20MB with 4k pages for freeing page table
98792+ ranges. Cost is about 42k of memory for each CPU. */
98793+#define ARCH_FREE_PTE_NR 5350
98794+
98795+#endif
98796+
98797+#define flush_tlb_kernel_range(start, end) flush_tlb_all()
98798+
98799+static inline void flush_tlb_pgtables(struct mm_struct *mm,
98800+ unsigned long start, unsigned long end)
98801+{
98802+ /* x86_64 does not keep any page table caches in a software TLB.
98803+ The CPUs do in their hardware TLBs, but they are handled
98804+ by the normal TLB flushing algorithms. */
98805+}
98806+
98807+#endif /* _X8664_TLBFLUSH_H */
98808diff -Nur linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/vga.h linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/vga.h
98809--- linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/vga.h 1970-01-01 00:00:00.000000000 +0000
98810+++ linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/vga.h 2007-01-08 15:00:46.000000000 +0000
98811@@ -0,0 +1,20 @@
98812+/*
98813+ * Access to VGA videoram
98814+ *
98815+ * (c) 1998 Martin Mares <mj@ucw.cz>
98816+ */
98817+
98818+#ifndef _LINUX_ASM_VGA_H_
98819+#define _LINUX_ASM_VGA_H_
98820+
98821+/*
98822+ * On the PC, we can just recalculate addresses and then
98823+ * access the videoram directly without any black magic.
98824+ */
98825+
98826+#define VGA_MAP_MEM(x) (unsigned long)isa_bus_to_virt(x)
98827+
98828+#define vga_readb(x) (*(x))
98829+#define vga_writeb(x,y) (*(y) = (x))
98830+
98831+#endif
98832diff -Nur linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/xenoprof.h linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/xenoprof.h
98833--- linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/xenoprof.h 1970-01-01 00:00:00.000000000 +0000
98834+++ linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/xenoprof.h 2007-01-08 15:00:46.000000000 +0000
98835@@ -0,0 +1 @@
98836+#include <asm-i386/mach-xen/asm/xenoprof.h>
98837diff -Nur linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/xor.h linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/xor.h
98838--- linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/xor.h 1970-01-01 00:00:00.000000000 +0000
98839+++ linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/xor.h 2007-01-08 15:00:46.000000000 +0000
98840@@ -0,0 +1,328 @@
98841+/*
98842+ * x86-64 changes / gcc fixes from Andi Kleen.
98843+ * Copyright 2002 Andi Kleen, SuSE Labs.
98844+ *
98845+ * This hasn't been optimized for the hammer yet, but there are likely
98846+ * no advantages to be gained from x86-64 here anyway.
98847+ */
98848+
98849+typedef struct { unsigned long a,b; } __attribute__((aligned(16))) xmm_store_t;
98850+
98851+/* Doesn't use gcc to save the XMM registers, because there is no easy way to
98852+ tell it to do a clts before the register saving. */
98853+#define XMMS_SAVE do { \
98854+ preempt_disable(); \
98855+ if (!(current_thread_info()->status & TS_USEDFPU)) \
98856+ clts(); \
98857+ __asm__ __volatile__ ( \
98858+ "movups %%xmm0,(%1) ;\n\t" \
98859+ "movups %%xmm1,0x10(%1) ;\n\t" \
98860+ "movups %%xmm2,0x20(%1) ;\n\t" \
98861+ "movups %%xmm3,0x30(%1) ;\n\t" \
98862+ : "=&r" (cr0) \
98863+ : "r" (xmm_save) \
98864+ : "memory"); \
98865+} while(0)
98866+
98867+#define XMMS_RESTORE do { \
98868+ asm volatile ( \
98869+ "sfence ;\n\t" \
98870+ "movups (%1),%%xmm0 ;\n\t" \
98871+ "movups 0x10(%1),%%xmm1 ;\n\t" \
98872+ "movups 0x20(%1),%%xmm2 ;\n\t" \
98873+ "movups 0x30(%1),%%xmm3 ;\n\t" \
98874+ : \
98875+ : "r" (cr0), "r" (xmm_save) \
98876+ : "memory"); \
98877+ if (!(current_thread_info()->status & TS_USEDFPU)) \
98878+ stts(); \
98879+ preempt_enable(); \
98880+} while(0)
98881+
98882+#define OFFS(x) "16*("#x")"
98883+#define PF_OFFS(x) "256+16*("#x")"
98884+#define PF0(x) " prefetchnta "PF_OFFS(x)"(%[p1]) ;\n"
98885+#define LD(x,y) " movaps "OFFS(x)"(%[p1]), %%xmm"#y" ;\n"
98886+#define ST(x,y) " movaps %%xmm"#y", "OFFS(x)"(%[p1]) ;\n"
98887+#define PF1(x) " prefetchnta "PF_OFFS(x)"(%[p2]) ;\n"
98888+#define PF2(x) " prefetchnta "PF_OFFS(x)"(%[p3]) ;\n"
98889+#define PF3(x) " prefetchnta "PF_OFFS(x)"(%[p4]) ;\n"
98890+#define PF4(x) " prefetchnta "PF_OFFS(x)"(%[p5]) ;\n"
98891+#define PF5(x) " prefetchnta "PF_OFFS(x)"(%[p6]) ;\n"
98892+#define XO1(x,y) " xorps "OFFS(x)"(%[p2]), %%xmm"#y" ;\n"
98893+#define XO2(x,y) " xorps "OFFS(x)"(%[p3]), %%xmm"#y" ;\n"
98894+#define XO3(x,y) " xorps "OFFS(x)"(%[p4]), %%xmm"#y" ;\n"
98895+#define XO4(x,y) " xorps "OFFS(x)"(%[p5]), %%xmm"#y" ;\n"
98896+#define XO5(x,y) " xorps "OFFS(x)"(%[p6]), %%xmm"#y" ;\n"
98897+
98898+
98899+static void
98900+xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
98901+{
98902+ unsigned int lines = bytes >> 8;
98903+ unsigned long cr0;
98904+ xmm_store_t xmm_save[4];
98905+
98906+ XMMS_SAVE;
98907+
98908+ asm volatile (
98909+#undef BLOCK
98910+#define BLOCK(i) \
98911+ LD(i,0) \
98912+ LD(i+1,1) \
98913+ PF1(i) \
98914+ PF1(i+2) \
98915+ LD(i+2,2) \
98916+ LD(i+3,3) \
98917+ PF0(i+4) \
98918+ PF0(i+6) \
98919+ XO1(i,0) \
98920+ XO1(i+1,1) \
98921+ XO1(i+2,2) \
98922+ XO1(i+3,3) \
98923+ ST(i,0) \
98924+ ST(i+1,1) \
98925+ ST(i+2,2) \
98926+ ST(i+3,3) \
98927+
98928+
98929+ PF0(0)
98930+ PF0(2)
98931+
98932+ " .align 32 ;\n"
98933+ " 1: ;\n"
98934+
98935+ BLOCK(0)
98936+ BLOCK(4)
98937+ BLOCK(8)
98938+ BLOCK(12)
98939+
98940+ " addq %[inc], %[p1] ;\n"
98941+ " addq %[inc], %[p2] ;\n"
98942+ " decl %[cnt] ; jnz 1b"
98943+ : [p1] "+r" (p1), [p2] "+r" (p2), [cnt] "+r" (lines)
98944+ : [inc] "r" (256UL)
98945+ : "memory");
98946+
98947+ XMMS_RESTORE;
98948+}
98949+
98950+static void
98951+xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
98952+ unsigned long *p3)
98953+{
98954+ unsigned int lines = bytes >> 8;
98955+ xmm_store_t xmm_save[4];
98956+ unsigned long cr0;
98957+
98958+ XMMS_SAVE;
98959+
98960+ __asm__ __volatile__ (
98961+#undef BLOCK
98962+#define BLOCK(i) \
98963+ PF1(i) \
98964+ PF1(i+2) \
98965+ LD(i,0) \
98966+ LD(i+1,1) \
98967+ LD(i+2,2) \
98968+ LD(i+3,3) \
98969+ PF2(i) \
98970+ PF2(i+2) \
98971+ PF0(i+4) \
98972+ PF0(i+6) \
98973+ XO1(i,0) \
98974+ XO1(i+1,1) \
98975+ XO1(i+2,2) \
98976+ XO1(i+3,3) \
98977+ XO2(i,0) \
98978+ XO2(i+1,1) \
98979+ XO2(i+2,2) \
98980+ XO2(i+3,3) \
98981+ ST(i,0) \
98982+ ST(i+1,1) \
98983+ ST(i+2,2) \
98984+ ST(i+3,3) \
98985+
98986+
98987+ PF0(0)
98988+ PF0(2)
98989+
98990+ " .align 32 ;\n"
98991+ " 1: ;\n"
98992+
98993+ BLOCK(0)
98994+ BLOCK(4)
98995+ BLOCK(8)
98996+ BLOCK(12)
98997+
98998+ " addq %[inc], %[p1] ;\n"
98999+ " addq %[inc], %[p2] ;\n"
99000+ " addq %[inc], %[p3] ;\n"
99001+ " decl %[cnt] ; jnz 1b"
99002+ : [cnt] "+r" (lines),
99003+ [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
99004+ : [inc] "r" (256UL)
99005+ : "memory");
99006+ XMMS_RESTORE;
99007+}
99008+
99009+static void
99010+xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
99011+ unsigned long *p3, unsigned long *p4)
99012+{
99013+ unsigned int lines = bytes >> 8;
99014+ xmm_store_t xmm_save[4];
99015+ unsigned long cr0;
99016+
99017+ XMMS_SAVE;
99018+
99019+ __asm__ __volatile__ (
99020+#undef BLOCK
99021+#define BLOCK(i) \
99022+ PF1(i) \
99023+ PF1(i+2) \
99024+ LD(i,0) \
99025+ LD(i+1,1) \
99026+ LD(i+2,2) \
99027+ LD(i+3,3) \
99028+ PF2(i) \
99029+ PF2(i+2) \
99030+ XO1(i,0) \
99031+ XO1(i+1,1) \
99032+ XO1(i+2,2) \
99033+ XO1(i+3,3) \
99034+ PF3(i) \
99035+ PF3(i+2) \
99036+ PF0(i+4) \
99037+ PF0(i+6) \
99038+ XO2(i,0) \
99039+ XO2(i+1,1) \
99040+ XO2(i+2,2) \
99041+ XO2(i+3,3) \
99042+ XO3(i,0) \
99043+ XO3(i+1,1) \
99044+ XO3(i+2,2) \
99045+ XO3(i+3,3) \
99046+ ST(i,0) \
99047+ ST(i+1,1) \
99048+ ST(i+2,2) \
99049+ ST(i+3,3) \
99050+
99051+
99052+ PF0(0)
99053+ PF0(2)
99054+
99055+ " .align 32 ;\n"
99056+ " 1: ;\n"
99057+
99058+ BLOCK(0)
99059+ BLOCK(4)
99060+ BLOCK(8)
99061+ BLOCK(12)
99062+
99063+ " addq %[inc], %[p1] ;\n"
99064+ " addq %[inc], %[p2] ;\n"
99065+ " addq %[inc], %[p3] ;\n"
99066+ " addq %[inc], %[p4] ;\n"
99067+ " decl %[cnt] ; jnz 1b"
99068+ : [cnt] "+c" (lines),
99069+ [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
99070+ : [inc] "r" (256UL)
99071+ : "memory" );
99072+
99073+ XMMS_RESTORE;
99074+}
99075+
99076+static void
99077+xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
99078+ unsigned long *p3, unsigned long *p4, unsigned long *p5)
99079+{
99080+ unsigned int lines = bytes >> 8;
99081+ xmm_store_t xmm_save[4];
99082+ unsigned long cr0;
99083+
99084+ XMMS_SAVE;
99085+
99086+ __asm__ __volatile__ (
99087+#undef BLOCK
99088+#define BLOCK(i) \
99089+ PF1(i) \
99090+ PF1(i+2) \
99091+ LD(i,0) \
99092+ LD(i+1,1) \
99093+ LD(i+2,2) \
99094+ LD(i+3,3) \
99095+ PF2(i) \
99096+ PF2(i+2) \
99097+ XO1(i,0) \
99098+ XO1(i+1,1) \
99099+ XO1(i+2,2) \
99100+ XO1(i+3,3) \
99101+ PF3(i) \
99102+ PF3(i+2) \
99103+ XO2(i,0) \
99104+ XO2(i+1,1) \
99105+ XO2(i+2,2) \
99106+ XO2(i+3,3) \
99107+ PF4(i) \
99108+ PF4(i+2) \
99109+ PF0(i+4) \
99110+ PF0(i+6) \
99111+ XO3(i,0) \
99112+ XO3(i+1,1) \
99113+ XO3(i+2,2) \
99114+ XO3(i+3,3) \
99115+ XO4(i,0) \
99116+ XO4(i+1,1) \
99117+ XO4(i+2,2) \
99118+ XO4(i+3,3) \
99119+ ST(i,0) \
99120+ ST(i+1,1) \
99121+ ST(i+2,2) \
99122+ ST(i+3,3) \
99123+
99124+
99125+ PF0(0)
99126+ PF0(2)
99127+
99128+ " .align 32 ;\n"
99129+ " 1: ;\n"
99130+
99131+ BLOCK(0)
99132+ BLOCK(4)
99133+ BLOCK(8)
99134+ BLOCK(12)
99135+
99136+ " addq %[inc], %[p1] ;\n"
99137+ " addq %[inc], %[p2] ;\n"
99138+ " addq %[inc], %[p3] ;\n"
99139+ " addq %[inc], %[p4] ;\n"
99140+ " addq %[inc], %[p5] ;\n"
99141+ " decl %[cnt] ; jnz 1b"
99142+ : [cnt] "+c" (lines),
99143+ [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4),
99144+ [p5] "+r" (p5)
99145+ : [inc] "r" (256UL)
99146+ : "memory");
99147+
99148+ XMMS_RESTORE;
99149+}
99150+
99151+static struct xor_block_template xor_block_sse = {
99152+ .name = "generic_sse",
99153+ .do_2 = xor_sse_2,
99154+ .do_3 = xor_sse_3,
99155+ .do_4 = xor_sse_4,
99156+ .do_5 = xor_sse_5,
99157+};
99158+
99159+#undef XOR_TRY_TEMPLATES
99160+#define XOR_TRY_TEMPLATES \
99161+ do { \
99162+ xor_speed(&xor_block_sse); \
99163+ } while (0)
99164+
99165+/* We force the use of the SSE xor block because it can write around L2.
99166+ We may also be able to load into the L1 only depending on how the cpu
99167+ deals with a load to a line that is being prefetched. */
99168+#define XOR_SELECT_TEMPLATE(FASTEST) (&xor_block_sse)
99169diff -Nur linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/irq_vectors.h linux-2.6.16.33/include/asm-x86_64/mach-xen/irq_vectors.h
99170--- linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/irq_vectors.h 1970-01-01 00:00:00.000000000 +0000
99171+++ linux-2.6.16.33/include/asm-x86_64/mach-xen/irq_vectors.h 2007-01-08 15:00:46.000000000 +0000
99172@@ -0,0 +1,123 @@
99173+/*
99174+ * This file should contain #defines for all of the interrupt vector
99175+ * numbers used by this architecture.
99176+ *
99177+ * In addition, there are some standard defines:
99178+ *
99179+ * FIRST_EXTERNAL_VECTOR:
99180+ * The first free place for external interrupts
99181+ *
99182+ * SYSCALL_VECTOR:
99183+ * The IRQ vector a syscall makes the user to kernel transition
99184+ * under.
99185+ *
99186+ * TIMER_IRQ:
99187+ * The IRQ number the timer interrupt comes in at.
99188+ *
99189+ * NR_IRQS:
99190+ * The total number of interrupt vectors (including all the
99191+ * architecture specific interrupts) needed.
99192+ *
99193+ */
99194+#ifndef _ASM_IRQ_VECTORS_H
99195+#define _ASM_IRQ_VECTORS_H
99196+
99197+/*
99198+ * IDT vectors usable for external interrupt sources start
99199+ * at 0x20:
99200+ */
99201+#define FIRST_EXTERNAL_VECTOR 0x20
99202+
99203+#define SYSCALL_VECTOR 0x80
99204+
99205+/*
99206+ * Vectors 0x20-0x2f are used for ISA interrupts.
99207+ */
99208+
99209+#if 0
99210+/*
99211+ * Special IRQ vectors used by the SMP architecture, 0xf0-0xff
99212+ *
99213+ * some of the following vectors are 'rare', they are merged
99214+ * into a single vector (CALL_FUNCTION_VECTOR) to save vector space.
99215+ * TLB, reschedule and local APIC vectors are performance-critical.
99216+ *
99217+ * Vectors 0xf0-0xfa are free (reserved for future Linux use).
99218+ */
99219+#define INVALIDATE_TLB_VECTOR 0xfd
99220+#define RESCHEDULE_VECTOR 0xfc
99221+#define CALL_FUNCTION_VECTOR 0xfb
99222+
99223+#define THERMAL_APIC_VECTOR 0xf0
99224+/*
99225+ * Local APIC timer IRQ vector is on a different priority level,
99226+ * to work around the 'lost local interrupt if more than 2 IRQ
99227+ * sources per level' errata.
99228+ */
99229+#define LOCAL_TIMER_VECTOR 0xef
99230+#endif
99231+
99232+#define SPURIOUS_APIC_VECTOR 0xff
99233+#define ERROR_APIC_VECTOR 0xfe
99234+
99235+/*
99236+ * First APIC vector available to drivers: (vectors 0x30-0xee)
99237+ * we start at 0x31 to spread out vectors evenly between priority
99238+ * levels. (0x80 is the syscall vector)
99239+ */
99240+#define FIRST_DEVICE_VECTOR 0x31
99241+#define FIRST_SYSTEM_VECTOR 0xef
99242+
99243+/*
99244+ * 16 8259A IRQs, 208 potential APIC interrupt sources.
99245+ * Right now the APIC is mostly only used for SMP.
99246+ * 256 vectors is an architectural limit. (we can have
99247+ * more than 256 devices theoretically, but they will
99248+ * have to use shared interrupts)
99249+ * Since vectors 0x00-0x1f are used/reserved for the CPU,
99250+ * the usable vector space is 0x20-0xff (224 vectors)
99251+ */
99252+
99253+#define RESCHEDULE_VECTOR 0
99254+#define CALL_FUNCTION_VECTOR 1
99255+#define NR_IPIS 2
99256+
99257+/*
99258+ * The maximum number of vectors supported by i386 processors
99259+ * is limited to 256. For processors other than i386, NR_VECTORS
99260+ * should be changed accordingly.
99261+ */
99262+#define NR_VECTORS 256
99263+
99264+#define FPU_IRQ 13
99265+
99266+#define FIRST_VM86_IRQ 3
99267+#define LAST_VM86_IRQ 15
99268+#define invalid_vm86_irq(irq) ((irq) < 3 || (irq) > 15)
99269+
99270+/*
99271+ * The flat IRQ space is divided into two regions:
99272+ * 1. A one-to-one mapping of real physical IRQs. This space is only used
99273+ * if we have physical device-access privilege. This region is at the
99274+ * start of the IRQ space so that existing device drivers do not need
99275+ * to be modified to translate physical IRQ numbers into our IRQ space.
99276+ * 2. A dynamic mapping of inter-domain and Xen-sourced virtual IRQs. These
99277+ * are bound using the provided bind/unbind functions.
99278+ */
99279+
99280+#define PIRQ_BASE 0
99281+#define NR_PIRQS 256
99282+
99283+#define DYNIRQ_BASE (PIRQ_BASE + NR_PIRQS)
99284+#define NR_DYNIRQS 256
99285+
99286+#define NR_IRQS (NR_PIRQS + NR_DYNIRQS)
99287+#define NR_IRQ_VECTORS NR_IRQS
99288+
99289+#define pirq_to_irq(_x) ((_x) + PIRQ_BASE)
99290+#define irq_to_pirq(_x) ((_x) - PIRQ_BASE)
99291+
99292+#define dynirq_to_irq(_x) ((_x) + DYNIRQ_BASE)
99293+#define irq_to_dynirq(_x) ((_x) - DYNIRQ_BASE)
99294+
99295+#endif /* _ASM_IRQ_VECTORS_H */
99296diff -Nur linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/mach_time.h linux-2.6.16.33/include/asm-x86_64/mach-xen/mach_time.h
99297--- linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/mach_time.h 1970-01-01 00:00:00.000000000 +0000
99298+++ linux-2.6.16.33/include/asm-x86_64/mach-xen/mach_time.h 2007-01-08 15:00:46.000000000 +0000
99299@@ -0,0 +1,122 @@
99300+/*
99301+ * include/asm-i386/mach-default/mach_time.h
99302+ *
99303+ * Machine specific set RTC function for generic.
99304+ * Split out from time.c by Osamu Tomita <tomita@cinet.co.jp>
99305+ */
99306+#ifndef _MACH_TIME_H
99307+#define _MACH_TIME_H
99308+
99309+#include <asm-i386/mc146818rtc.h>
99310+
99311+/* for check timing call set_rtc_mmss() 500ms */
99312+/* used in arch/i386/time.c::do_timer_interrupt() */
99313+#define USEC_AFTER 500000
99314+#define USEC_BEFORE 500000
99315+
99316+/*
99317+ * In order to set the CMOS clock precisely, set_rtc_mmss has to be
99318+ * called 500 ms after the second nowtime has started, because when
99319+ * nowtime is written into the registers of the CMOS clock, it will
99320+ * jump to the next second precisely 500 ms later. Check the Motorola
99321+ * MC146818A or Dallas DS12887 data sheet for details.
99322+ *
99323+ * BUG: This routine does not handle hour overflow properly; it just
99324+ * sets the minutes. Usually you'll only notice that after reboot!
99325+ */
99326+static inline int mach_set_rtc_mmss(unsigned long nowtime)
99327+{
99328+ int retval = 0;
99329+ int real_seconds, real_minutes, cmos_minutes;
99330+ unsigned char save_control, save_freq_select;
99331+
99332+ save_control = CMOS_READ(RTC_CONTROL); /* tell the clock it's being set */
99333+ CMOS_WRITE((save_control|RTC_SET), RTC_CONTROL);
99334+
99335+ save_freq_select = CMOS_READ(RTC_FREQ_SELECT); /* stop and reset prescaler */
99336+ CMOS_WRITE((save_freq_select|RTC_DIV_RESET2), RTC_FREQ_SELECT);
99337+
99338+ cmos_minutes = CMOS_READ(RTC_MINUTES);
99339+ if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD)
99340+ BCD_TO_BIN(cmos_minutes);
99341+
99342+ /*
99343+ * since we're only adjusting minutes and seconds,
99344+ * don't interfere with hour overflow. This avoids
99345+ * messing with unknown time zones but requires your
99346+ * RTC not to be off by more than 15 minutes
99347+ */
99348+ real_seconds = nowtime % 60;
99349+ real_minutes = nowtime / 60;
99350+ if (((abs(real_minutes - cmos_minutes) + 15)/30) & 1)
99351+ real_minutes += 30; /* correct for half hour time zone */
99352+ real_minutes %= 60;
99353+
99354+ if (abs(real_minutes - cmos_minutes) < 30) {
99355+ if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD) {
99356+ BIN_TO_BCD(real_seconds);
99357+ BIN_TO_BCD(real_minutes);
99358+ }
99359+ CMOS_WRITE(real_seconds,RTC_SECONDS);
99360+ CMOS_WRITE(real_minutes,RTC_MINUTES);
99361+ } else {
99362+ printk(KERN_WARNING
99363+ "set_rtc_mmss: can't update from %d to %d\n",
99364+ cmos_minutes, real_minutes);
99365+ retval = -1;
99366+ }
99367+
99368+ /* The following flags have to be released exactly in this order,
99369+ * otherwise the DS12887 (popular MC146818A clone with integrated
99370+ * battery and quartz) will not reset the oscillator and will not
99371+ * update precisely 500 ms later. You won't find this mentioned in
99372+ * the Dallas Semiconductor data sheets, but who believes data
99373+ * sheets anyway ... -- Markus Kuhn
99374+ */
99375+ CMOS_WRITE(save_control, RTC_CONTROL);
99376+ CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT);
99377+
99378+ return retval;
99379+}
99380+
99381+static inline unsigned long mach_get_cmos_time(void)
99382+{
99383+ unsigned int year, mon, day, hour, min, sec;
99384+ int i;
99385+
99386+ /* The Linux interpretation of the CMOS clock register contents:
99387+ * When the Update-In-Progress (UIP) flag goes from 1 to 0, the
99388+ * RTC registers show the second which has precisely just started.
99389+ * Let's hope other operating systems interpret the RTC the same way.
99390+ */
99391+ /* read RTC exactly on falling edge of update flag */
99392+ for (i = 0 ; i < 1000000 ; i++) /* may take up to 1 second... */
99393+ if (CMOS_READ(RTC_FREQ_SELECT) & RTC_UIP)
99394+ break;
99395+ for (i = 0 ; i < 1000000 ; i++) /* must try at least 2.228 ms */
99396+ if (!(CMOS_READ(RTC_FREQ_SELECT) & RTC_UIP))
99397+ break;
99398+ do { /* Isn't this overkill ? UIP above should guarantee consistency */
99399+ sec = CMOS_READ(RTC_SECONDS);
99400+ min = CMOS_READ(RTC_MINUTES);
99401+ hour = CMOS_READ(RTC_HOURS);
99402+ day = CMOS_READ(RTC_DAY_OF_MONTH);
99403+ mon = CMOS_READ(RTC_MONTH);
99404+ year = CMOS_READ(RTC_YEAR);
99405+ } while (sec != CMOS_READ(RTC_SECONDS));
99406+ if (!(CMOS_READ(RTC_CONTROL) & RTC_DM_BINARY) || RTC_ALWAYS_BCD)
99407+ {
99408+ BCD_TO_BIN(sec);
99409+ BCD_TO_BIN(min);
99410+ BCD_TO_BIN(hour);
99411+ BCD_TO_BIN(day);
99412+ BCD_TO_BIN(mon);
99413+ BCD_TO_BIN(year);
99414+ }
99415+ if ((year += 1900) < 1970)
99416+ year += 100;
99417+
99418+ return mktime(year, mon, day, hour, min, sec);
99419+}
99420+
99421+#endif /* !_MACH_TIME_H */
99422diff -Nur linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/mach_timer.h linux-2.6.16.33/include/asm-x86_64/mach-xen/mach_timer.h
99423--- linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/mach_timer.h 1970-01-01 00:00:00.000000000 +0000
99424+++ linux-2.6.16.33/include/asm-x86_64/mach-xen/mach_timer.h 2007-01-08 15:00:46.000000000 +0000
99425@@ -0,0 +1,48 @@
99426+/*
99427+ * include/asm-i386/mach-default/mach_timer.h
99428+ *
99429+ * Machine specific calibrate_tsc() for generic.
99430+ * Split out from timer_tsc.c by Osamu Tomita <tomita@cinet.co.jp>
99431+ */
99432+/* ------ Calibrate the TSC -------
99433+ * Return 2^32 * (1 / (TSC clocks per usec)) for do_fast_gettimeoffset().
99434+ * Too much 64-bit arithmetic here to do this cleanly in C, and for
99435+ * accuracy's sake we want to keep the overhead on the CTC speaker (channel 2)
99436+ * output busy loop as low as possible. We avoid reading the CTC registers
99437+ * directly because of the awkward 8-bit access mechanism of the 82C54
99438+ * device.
99439+ */
99440+#ifndef _MACH_TIMER_H
99441+#define _MACH_TIMER_H
99442+
99443+#define CALIBRATE_LATCH (5 * LATCH)
99444+
99445+static inline void mach_prepare_counter(void)
99446+{
99447+ /* Set the Gate high, disable speaker */
99448+ outb((inb(0x61) & ~0x02) | 0x01, 0x61);
99449+
99450+ /*
99451+ * Now let's take care of CTC channel 2
99452+ *
99453+ * Set the Gate high, program CTC channel 2 for mode 0,
99454+ * (interrupt on terminal count mode), binary count,
99455+ * load 5 * LATCH count, (LSB and MSB) to begin countdown.
99456+ *
99457+ * Some devices need a delay here.
99458+ */
99459+ outb(0xb0, 0x43); /* binary, mode 0, LSB/MSB, Ch 2 */
99460+ outb_p(CALIBRATE_LATCH & 0xff, 0x42); /* LSB of count */
99461+ outb_p(CALIBRATE_LATCH >> 8, 0x42); /* MSB of count */
99462+}
99463+
99464+static inline void mach_countup(unsigned long *count_p)
99465+{
99466+ unsigned long count = 0;
99467+ do {
99468+ count++;
99469+ } while ((inb_p(0x61) & 0x20) == 0);
99470+ *count_p = count;
99471+}
99472+
99473+#endif /* !_MACH_TIMER_H */
99474diff -Nur linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/setup_arch_post.h linux-2.6.16.33/include/asm-x86_64/mach-xen/setup_arch_post.h
99475--- linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/setup_arch_post.h 1970-01-01 00:00:00.000000000 +0000
99476+++ linux-2.6.16.33/include/asm-x86_64/mach-xen/setup_arch_post.h 2007-01-08 15:00:46.000000000 +0000
99477@@ -0,0 +1,63 @@
99478+/**
99479+ * machine_specific_* - Hooks for machine specific setup.
99480+ *
99481+ * Description:
99482+ * This is included late in kernel/setup.c so that it can make
99483+ * use of all of the static functions.
99484+ **/
99485+
99486+#include <xen/interface/callback.h>
99487+
99488+extern void hypervisor_callback(void);
99489+extern void failsafe_callback(void);
99490+extern void nmi(void);
99491+
99492+static void __init machine_specific_arch_setup(void)
99493+{
99494+ int ret;
99495+ static struct callback_register __initdata event = {
99496+ .type = CALLBACKTYPE_event,
99497+ .address = (unsigned long) hypervisor_callback,
99498+ };
99499+ static struct callback_register __initdata failsafe = {
99500+ .type = CALLBACKTYPE_failsafe,
99501+ .address = (unsigned long)failsafe_callback,
99502+ };
99503+ static struct callback_register __initdata syscall = {
99504+ .type = CALLBACKTYPE_syscall,
99505+ .address = (unsigned long)system_call,
99506+ };
99507+#ifdef CONFIG_X86_LOCAL_APIC
99508+ static struct callback_register __initdata nmi_cb = {
99509+ .type = CALLBACKTYPE_nmi,
99510+ .address = (unsigned long)nmi,
99511+ };
99512+#endif
99513+
99514+ ret = HYPERVISOR_callback_op(CALLBACKOP_register, &event);
99515+ if (ret == 0)
99516+ ret = HYPERVISOR_callback_op(CALLBACKOP_register, &failsafe);
99517+ if (ret == 0)
99518+ ret = HYPERVISOR_callback_op(CALLBACKOP_register, &syscall);
99519+#ifdef CONFIG_XEN_COMPAT_030002
99520+ if (ret == -ENOSYS)
99521+ ret = HYPERVISOR_set_callbacks(
99522+ event.address,
99523+ failsafe.address,
99524+ syscall.address);
99525+#endif
99526+ BUG_ON(ret);
99527+
99528+#ifdef CONFIG_X86_LOCAL_APIC
99529+ ret = HYPERVISOR_callback_op(CALLBACKOP_register, &nmi_cb);
99530+#ifdef CONFIG_XEN_COMPAT_030002
99531+ if (ret == -ENOSYS) {
99532+ static struct xennmi_callback __initdata cb = {
99533+ .handler_address = (unsigned long)nmi
99534+ };
99535+
99536+ HYPERVISOR_nmi_op(XENNMI_register_callback, &cb);
99537+ }
99538+#endif
99539+#endif
99540+}
99541diff -Nur linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/setup_arch_pre.h linux-2.6.16.33/include/asm-x86_64/mach-xen/setup_arch_pre.h
99542--- linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/setup_arch_pre.h 1970-01-01 00:00:00.000000000 +0000
99543+++ linux-2.6.16.33/include/asm-x86_64/mach-xen/setup_arch_pre.h 2007-01-08 15:00:46.000000000 +0000
99544@@ -0,0 +1,5 @@
99545+/* Hook to call BIOS initialisation function */
99546+
99547+#define ARCH_SETUP machine_specific_arch_setup();
99548+
99549+static void __init machine_specific_arch_setup(void);
99550diff -Nur linux-2.6.16.33-noxen/include/linux/aio.h linux-2.6.16.33/include/linux/aio.h
99551--- linux-2.6.16.33-noxen/include/linux/aio.h 2006-11-22 18:06:31.000000000 +0000
99552+++ linux-2.6.16.33/include/linux/aio.h 2007-05-23 21:00:01.000000000 +0000
99553@@ -191,6 +191,11 @@
99554 struct aio_ring_info ring_info;
99555
99556 struct work_struct wq;
99557+#ifdef CONFIG_EPOLL
99558+ // poll integration
99559+ wait_queue_head_t poll_wait;
99560+ struct file *file;
99561+#endif
99562 };
99563
99564 /* prototypes */
99565diff -Nur linux-2.6.16.33-noxen/include/linux/elfnote.h linux-2.6.16.33/include/linux/elfnote.h
99566--- linux-2.6.16.33-noxen/include/linux/elfnote.h 1970-01-01 00:00:00.000000000 +0000
99567+++ linux-2.6.16.33/include/linux/elfnote.h 2007-05-23 21:00:01.000000000 +0000
99568@@ -0,0 +1,90 @@
99569+#ifndef _LINUX_ELFNOTE_H
99570+#define _LINUX_ELFNOTE_H
99571+/*
99572+ * Helper macros to generate ELF Note structures, which are put into a
99573+ * PT_NOTE segment of the final vmlinux image. These are useful for
99574+ * including name-value pairs of metadata into the kernel binary (or
99575+ * modules?) for use by external programs.
99576+ *
99577+ * Each note has three parts: a name, a type and a desc. The name is
99578+ * intended to distinguish the note's originator, so it would be a
99579+ * company, project, subsystem, etc; it must be in a suitable form for
99580+ * use in a section name. The type is an integer which is used to tag
99581+ * the data, and is considered to be within the "name" namespace (so
99582+ * "FooCo"'s type 42 is distinct from "BarProj"'s type 42). The
99583+ * "desc" field is the actual data. There are no constraints on the
99584+ * desc field's contents, though typically they're fairly small.
99585+ *
99586+ * All notes from a given NAME are put into a section named
99587+ * .note.NAME. When the kernel image is finally linked, all the notes
99588+ * are packed into a single .notes section, which is mapped into the
99589+ * PT_NOTE segment. Because notes for a given name are grouped into
99590+ * the same section, they'll all be adjacent in the output file.
99591+ *
99592+ * This file defines macros for both C and assembler use. Their
99593+ * syntax is slightly different, but they're semantically similar.
99594+ *
99595+ * See the ELF specification for more detail about ELF notes.
99596+ */
99597+
99598+#ifdef __ASSEMBLER__
99599+/*
99600+ * Generate a structure with the same shape as Elf{32,64}_Nhdr (which
99601+ * turn out to be the same size and shape), followed by the name and
99602+ * desc data with appropriate padding. The 'desctype' argument is the
99603+ * assembler pseudo op defining the type of the data e.g. .asciz while
99604+ * 'descdata' is the data itself e.g. "hello, world".
99605+ *
99606+ * e.g. ELFNOTE(XYZCo, 42, .asciz, "forty-two")
99607+ * ELFNOTE(XYZCo, 12, .long, 0xdeadbeef)
99608+ */
99609+#define ELFNOTE(name, type, desctype, descdata) \
99610+.pushsection .note.name ; \
99611+ .align 4 ; \
99612+ .long 2f - 1f /* namesz */ ; \
99613+ .long 4f - 3f /* descsz */ ; \
99614+ .long type ; \
99615+1:.asciz "name" ; \
99616+2:.align 4 ; \
99617+3:desctype descdata ; \
99618+4:.align 4 ; \
99619+.popsection ;
99620+#else /* !__ASSEMBLER__ */
99621+#include <linux/elf.h>
99622+/*
99623+ * Use an anonymous structure which matches the shape of
99624+ * Elf{32,64}_Nhdr, but includes the name and desc data. The size and
99625+ * type of name and desc depend on the macro arguments. "name" must
99626+ * be a literal string, and "desc" must be passed by value. You may
99627+ * only define one note per line, since __LINE__ is used to generate
99628+ * unique symbols.
99629+ */
99630+#define _ELFNOTE_PASTE(a,b) a##b
99631+#define _ELFNOTE(size, name, unique, type, desc) \
99632+ static const struct { \
99633+ struct elf##size##_note _nhdr; \
99634+ unsigned char _name[sizeof(name)] \
99635+ __attribute__((aligned(sizeof(Elf##size##_Word)))); \
99636+ typeof(desc) _desc \
99637+ __attribute__((aligned(sizeof(Elf##size##_Word)))); \
99638+ } _ELFNOTE_PASTE(_note_, unique) \
99639+ __attribute_used__ \
99640+ __attribute__((section(".note." name), \
99641+ aligned(sizeof(Elf##size##_Word)), \
99642+ unused)) = { \
99643+ { \
99644+ sizeof(name), \
99645+ sizeof(desc), \
99646+ type, \
99647+ }, \
99648+ name, \
99649+ desc \
99650+ }
99651+#define ELFNOTE(size, name, type, desc) \
99652+ _ELFNOTE(size, name, __LINE__, type, desc)
99653+
99654+#define ELFNOTE32(name, type, desc) ELFNOTE(32, name, type, desc)
99655+#define ELFNOTE64(name, type, desc) ELFNOTE(64, name, type, desc)
99656+#endif /* __ASSEMBLER__ */
99657+
99658+#endif /* _LINUX_ELFNOTE_H */
99659diff -Nur linux-2.6.16.33-noxen/include/linux/ethtool.h linux-2.6.16.33/include/linux/ethtool.h
99660--- linux-2.6.16.33-noxen/include/linux/ethtool.h 2006-11-22 18:06:31.000000000 +0000
99661+++ linux-2.6.16.33/include/linux/ethtool.h 2007-05-23 21:00:01.000000000 +0000
99662@@ -408,6 +408,8 @@
99663 #define ETHTOOL_GPERMADDR 0x00000020 /* Get permanent hardware address */
99664 #define ETHTOOL_GUFO 0x00000021 /* Get UFO enable (ethtool_value) */
99665 #define ETHTOOL_SUFO 0x00000022 /* Set UFO enable (ethtool_value) */
99666+#define ETHTOOL_GGSO 0x00000023 /* Get GSO enable (ethtool_value) */
99667+#define ETHTOOL_SGSO 0x00000024 /* Set GSO enable (ethtool_value) */
99668
99669 /* compatibility with older code */
99670 #define SPARC_ETH_GSET ETHTOOL_GSET
99671diff -Nur linux-2.6.16.33-noxen/include/linux/eventpoll.h linux-2.6.16.33/include/linux/eventpoll.h
99672--- linux-2.6.16.33-noxen/include/linux/eventpoll.h 2006-11-22 18:06:31.000000000 +0000
99673+++ linux-2.6.16.33/include/linux/eventpoll.h 2007-05-23 21:00:01.000000000 +0000
99674@@ -86,6 +86,12 @@
99675 }
99676
99677
99678+/*
99679+ * called by the aio code to create an fd that can poll the aio event queue
99680+ */
99681+struct eventpoll;
99682+int ep_getfd(int *efd, struct inode **einode, struct file **efile,
99683+ struct eventpoll *ep, struct file_operations *fops);
99684 #else
99685
99686 static inline void eventpoll_init_file(struct file *file) {}
99687diff -Nur linux-2.6.16.33-noxen/include/linux/gfp.h linux-2.6.16.33/include/linux/gfp.h
99688--- linux-2.6.16.33-noxen/include/linux/gfp.h 2006-11-22 18:06:31.000000000 +0000
99689+++ linux-2.6.16.33/include/linux/gfp.h 2007-01-08 15:00:46.000000000 +0000
99690@@ -98,7 +98,11 @@
99691 */
99692
99693 #ifndef HAVE_ARCH_FREE_PAGE
99694-static inline void arch_free_page(struct page *page, int order) { }
99695+/*
99696+ * If arch_free_page returns non-zero then the generic free_page code can
99697+ * immediately bail: the arch-specific function has done all the work.
99698+ */
99699+static inline int arch_free_page(struct page *page, int order) { return 0; }
99700 #endif
99701
99702 extern struct page *
99703diff -Nur linux-2.6.16.33-noxen/include/linux/highmem.h linux-2.6.16.33/include/linux/highmem.h
99704--- linux-2.6.16.33-noxen/include/linux/highmem.h 2006-11-22 18:06:31.000000000 +0000
99705+++ linux-2.6.16.33/include/linux/highmem.h 2007-01-08 15:00:46.000000000 +0000
99706@@ -13,10 +13,16 @@
99707
99708 /* declarations for linux/mm/highmem.c */
99709 unsigned int nr_free_highpages(void);
99710+#ifdef CONFIG_XEN
99711+void kmap_flush_unused(void);
99712+#endif
99713
99714 #else /* CONFIG_HIGHMEM */
99715
99716 static inline unsigned int nr_free_highpages(void) { return 0; }
99717+#ifdef CONFIG_XEN
99718+static inline void kmap_flush_unused(void) { }
99719+#endif
99720
99721 static inline void *kmap(struct page *page)
99722 {
99723diff -Nur linux-2.6.16.33-noxen/include/linux/interrupt.h linux-2.6.16.33/include/linux/interrupt.h
99724--- linux-2.6.16.33-noxen/include/linux/interrupt.h 2006-11-22 18:06:31.000000000 +0000
99725+++ linux-2.6.16.33/include/linux/interrupt.h 2007-01-08 15:00:46.000000000 +0000
99726@@ -58,6 +58,12 @@
99727 extern void enable_irq(unsigned int irq);
99728 #endif
99729
99730+#ifdef CONFIG_HAVE_IRQ_IGNORE_UNHANDLED
99731+int irq_ignore_unhandled(unsigned int irq);
99732+#else
99733+#define irq_ignore_unhandled(irq) 0
99734+#endif
99735+
99736 #ifndef __ARCH_SET_SOFTIRQ_PENDING
99737 #define set_softirq_pending(x) (local_softirq_pending() = (x))
99738 #define or_softirq_pending(x) (local_softirq_pending() |= (x))
99739diff -Nur linux-2.6.16.33-noxen/include/linux/kernel.h linux-2.6.16.33/include/linux/kernel.h
99740--- linux-2.6.16.33-noxen/include/linux/kernel.h 2006-11-22 18:06:31.000000000 +0000
99741+++ linux-2.6.16.33/include/linux/kernel.h 2007-05-23 21:00:01.000000000 +0000
99742@@ -111,6 +111,8 @@
99743 __attribute__ ((format (printf, 3, 4)));
99744 extern int vscnprintf(char *buf, size_t size, const char *fmt, va_list args)
99745 __attribute__ ((format (printf, 3, 0)));
99746+extern char *kasprintf(gfp_t gfp, const char *fmt, ...)
99747+ __attribute__ ((format (printf, 2, 3)));
99748
99749 extern int sscanf(const char *, const char *, ...)
99750 __attribute__ ((format (scanf, 2, 3)));
99751diff -Nur linux-2.6.16.33-noxen/include/linux/kernel.h~ linux-2.6.16.33/include/linux/kernel.h~
99752--- linux-2.6.16.33-noxen/include/linux/kernel.h~ 1970-01-01 00:00:00.000000000 +0000
99753+++ linux-2.6.16.33/include/linux/kernel.h~ 2006-11-22 18:06:31.000000000 +0000
99754@@ -0,0 +1,332 @@
99755+#ifndef _LINUX_KERNEL_H
99756+#define _LINUX_KERNEL_H
99757+
99758+/*
99759+ * 'kernel.h' contains some often-used function prototypes etc
99760+ */
99761+
99762+#ifdef __KERNEL__
99763+
99764+#include <stdarg.h>
99765+#include <linux/linkage.h>
99766+#include <linux/stddef.h>
99767+#include <linux/types.h>
99768+#include <linux/compiler.h>
99769+#include <linux/bitops.h>
99770+#include <asm/byteorder.h>
99771+#include <asm/bug.h>
99772+
99773+extern const char linux_banner[];
99774+
99775+#define INT_MAX ((int)(~0U>>1))
99776+#define INT_MIN (-INT_MAX - 1)
99777+#define UINT_MAX (~0U)
99778+#define LONG_MAX ((long)(~0UL>>1))
99779+#define LONG_MIN (-LONG_MAX - 1)
99780+#define ULONG_MAX (~0UL)
99781+
99782+#define STACK_MAGIC 0xdeadbeef
99783+
99784+#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
99785+#define ALIGN(x,a) (((x)+(a)-1)&~((a)-1))
99786+
99787+#define KERN_EMERG "<0>" /* system is unusable */
99788+#define KERN_ALERT "<1>" /* action must be taken immediately */
99789+#define KERN_CRIT "<2>" /* critical conditions */
99790+#define KERN_ERR "<3>" /* error conditions */
99791+#define KERN_WARNING "<4>" /* warning conditions */
99792+#define KERN_NOTICE "<5>" /* normal but significant condition */
99793+#define KERN_INFO "<6>" /* informational */
99794+#define KERN_DEBUG "<7>" /* debug-level messages */
99795+
99796+extern int console_printk[];
99797+
99798+#define console_loglevel (console_printk[0])
99799+#define default_message_loglevel (console_printk[1])
99800+#define minimum_console_loglevel (console_printk[2])
99801+#define default_console_loglevel (console_printk[3])
99802+
99803+struct completion;
99804+struct pt_regs;
99805+struct user;
99806+
99807+/**
99808+ * might_sleep - annotation for functions that can sleep
99809+ *
99810+ * this macro will print a stack trace if it is executed in an atomic
99811+ * context (spinlock, irq-handler, ...).
99812+ *
99813+ * This is a useful debugging aid for catching such problems early rather
99814+ * than being bitten later when the calling function happens to sleep when
99815+ * it is not supposed to.
99816+ */
99817+#ifdef CONFIG_PREEMPT_VOLUNTARY
99818+extern int cond_resched(void);
99819+# define might_resched() cond_resched()
99820+#else
99821+# define might_resched() do { } while (0)
99822+#endif
99823+
99824+#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
99825+ void __might_sleep(char *file, int line);
99826+# define might_sleep() \
99827+ do { __might_sleep(__FILE__, __LINE__); might_resched(); } while (0)
99828+#else
99829+# define might_sleep() do { might_resched(); } while (0)
99830+#endif
99831+
99832+#define might_sleep_if(cond) do { if (unlikely(cond)) might_sleep(); } while (0)
99833+
99834+#define abs(x) ({ \
99835+ int __x = (x); \
99836+ (__x < 0) ? -__x : __x; \
99837+ })
99838+
99839+#define labs(x) ({ \
99840+ long __x = (x); \
99841+ (__x < 0) ? -__x : __x; \
99842+ })
99843+
99844+extern struct notifier_block *panic_notifier_list;
99845+extern long (*panic_blink)(long time);
99846+NORET_TYPE void panic(const char * fmt, ...)
99847+ __attribute__ ((NORET_AND format (printf, 1, 2)));
99848+fastcall NORET_TYPE void do_exit(long error_code)
99849+ ATTRIB_NORET;
99850+NORET_TYPE void complete_and_exit(struct completion *, long)
99851+ ATTRIB_NORET;
99852+extern unsigned long simple_strtoul(const char *,char **,unsigned int);
99853+extern long simple_strtol(const char *,char **,unsigned int);
99854+extern unsigned long long simple_strtoull(const char *,char **,unsigned int);
99855+extern long long simple_strtoll(const char *,char **,unsigned int);
99856+extern int sprintf(char * buf, const char * fmt, ...)
99857+ __attribute__ ((format (printf, 2, 3)));
99858+extern int vsprintf(char *buf, const char *, va_list)
99859+ __attribute__ ((format (printf, 2, 0)));
99860+extern int snprintf(char * buf, size_t size, const char * fmt, ...)
99861+ __attribute__ ((format (printf, 3, 4)));
99862+extern int vsnprintf(char *buf, size_t size, const char *fmt, va_list args)
99863+ __attribute__ ((format (printf, 3, 0)));
99864+extern int scnprintf(char * buf, size_t size, const char * fmt, ...)
99865+ __attribute__ ((format (printf, 3, 4)));
99866+extern int vscnprintf(char *buf, size_t size, const char *fmt, va_list args)
99867+ __attribute__ ((format (printf, 3, 0)));
99868+
99869+extern int sscanf(const char *, const char *, ...)
99870+ __attribute__ ((format (scanf, 2, 3)));
99871+extern int vsscanf(const char *, const char *, va_list)
99872+ __attribute__ ((format (scanf, 2, 0)));
99873+
99874+extern int get_option(char **str, int *pint);
99875+extern char *get_options(const char *str, int nints, int *ints);
99876+extern unsigned long long memparse(char *ptr, char **retptr);
99877+
99878+extern int __kernel_text_address(unsigned long addr);
99879+extern int kernel_text_address(unsigned long addr);
99880+extern int session_of_pgrp(int pgrp);
99881+
99882+extern void dump_thread(struct pt_regs *regs, struct user *dump);
99883+
99884+#ifdef CONFIG_PRINTK
99885+asmlinkage int vprintk(const char *fmt, va_list args)
99886+ __attribute__ ((format (printf, 1, 0)));
99887+asmlinkage int printk(const char * fmt, ...)
99888+ __attribute__ ((format (printf, 1, 2)));
99889+#else
99890+static inline int vprintk(const char *s, va_list args)
99891+ __attribute__ ((format (printf, 1, 0)));
99892+static inline int vprintk(const char *s, va_list args) { return 0; }
99893+static inline int printk(const char *s, ...)
99894+ __attribute__ ((format (printf, 1, 2)));
99895+static inline int printk(const char *s, ...) { return 0; }
99896+#endif
99897+
99898+unsigned long int_sqrt(unsigned long);
99899+
99900+static inline int __attribute_pure__ long_log2(unsigned long x)
99901+{
99902+ int r = 0;
99903+ for (x >>= 1; x > 0; x >>= 1)
99904+ r++;
99905+ return r;
99906+}
99907+
99908+static inline unsigned long __attribute_const__ roundup_pow_of_two(unsigned long x)
99909+{
99910+ return (1UL << fls(x - 1));
99911+}
99912+
99913+extern int printk_ratelimit(void);
99914+extern int __printk_ratelimit(int ratelimit_jiffies, int ratelimit_burst);
99915+
99916+static inline void console_silent(void)
99917+{
99918+ console_loglevel = 0;
99919+}
99920+
99921+static inline void console_verbose(void)
99922+{
99923+ if (console_loglevel)
99924+ console_loglevel = 15;
99925+}
99926+
99927+extern void bust_spinlocks(int yes);
99928+extern int oops_in_progress; /* If set, an oops, panic(), BUG() or die() is in progress */
99929+extern __deprecated_for_modules int panic_timeout;
99930+extern int panic_on_oops;
99931+extern int tainted;
99932+extern const char *print_tainted(void);
99933+extern void add_taint(unsigned);
99934+
99935+/* Values used for system_state */
99936+extern enum system_states {
99937+ SYSTEM_BOOTING,
99938+ SYSTEM_RUNNING,
99939+ SYSTEM_HALT,
99940+ SYSTEM_POWER_OFF,
99941+ SYSTEM_RESTART,
99942+ SYSTEM_SUSPEND_DISK,
99943+} system_state;
99944+
99945+#define TAINT_PROPRIETARY_MODULE (1<<0)
99946+#define TAINT_FORCED_MODULE (1<<1)
99947+#define TAINT_UNSAFE_SMP (1<<2)
99948+#define TAINT_FORCED_RMMOD (1<<3)
99949+#define TAINT_MACHINE_CHECK (1<<4)
99950+#define TAINT_BAD_PAGE (1<<5)
99951+
99952+extern void dump_stack(void);
99953+
99954+#ifdef DEBUG
99955+#define pr_debug(fmt,arg...) \
99956+ printk(KERN_DEBUG fmt,##arg)
99957+#else
99958+#define pr_debug(fmt,arg...) \
99959+ do { } while (0)
99960+#endif
99961+
99962+#define pr_info(fmt,arg...) \
99963+ printk(KERN_INFO fmt,##arg)
99964+
99965+/*
99966+ * Display an IP address in readable format.
99967+ */
99968+
99969+#define NIPQUAD(addr) \
99970+ ((unsigned char *)&addr)[0], \
99971+ ((unsigned char *)&addr)[1], \
99972+ ((unsigned char *)&addr)[2], \
99973+ ((unsigned char *)&addr)[3]
99974+#define NIPQUAD_FMT "%u.%u.%u.%u"
99975+
99976+#define NIP6(addr) \
99977+ ntohs((addr).s6_addr16[0]), \
99978+ ntohs((addr).s6_addr16[1]), \
99979+ ntohs((addr).s6_addr16[2]), \
99980+ ntohs((addr).s6_addr16[3]), \
99981+ ntohs((addr).s6_addr16[4]), \
99982+ ntohs((addr).s6_addr16[5]), \
99983+ ntohs((addr).s6_addr16[6]), \
99984+ ntohs((addr).s6_addr16[7])
99985+#define NIP6_FMT "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x"
99986+#define NIP6_SEQFMT "%04x%04x%04x%04x%04x%04x%04x%04x"
99987+
99988+#if defined(__LITTLE_ENDIAN)
99989+#define HIPQUAD(addr) \
99990+ ((unsigned char *)&addr)[3], \
99991+ ((unsigned char *)&addr)[2], \
99992+ ((unsigned char *)&addr)[1], \
99993+ ((unsigned char *)&addr)[0]
99994+#elif defined(__BIG_ENDIAN)
99995+#define HIPQUAD NIPQUAD
99996+#else
99997+#error "Please fix asm/byteorder.h"
99998+#endif /* __LITTLE_ENDIAN */
99999+
100000+/*
100001+ * min()/max() macros that also do
100002+ * strict type-checking.. See the
100003+ * "unnecessary" pointer comparison.
100004+ */
100005+#define min(x,y) ({ \
100006+ typeof(x) _x = (x); \
100007+ typeof(y) _y = (y); \
100008+ (void) (&_x == &_y); \
100009+ _x < _y ? _x : _y; })
100010+
100011+#define max(x,y) ({ \
100012+ typeof(x) _x = (x); \
100013+ typeof(y) _y = (y); \
100014+ (void) (&_x == &_y); \
100015+ _x > _y ? _x : _y; })
100016+
100017+/*
100018+ * ..and if you can't take the strict
100019+ * types, you can specify one yourself.
100020+ *
100021+ * Or not use min/max at all, of course.
100022+ */
100023+#define min_t(type,x,y) \
100024+ ({ type __x = (x); type __y = (y); __x < __y ? __x: __y; })
100025+#define max_t(type,x,y) \
100026+ ({ type __x = (x); type __y = (y); __x > __y ? __x: __y; })
100027+
100028+
100029+/**
100030+ * container_of - cast a member of a structure out to the containing structure
100031+ * @ptr: the pointer to the member.
100032+ * @type: the type of the container struct this is embedded in.
100033+ * @member: the name of the member within the struct.
100034+ *
100035+ */
100036+#define container_of(ptr, type, member) ({ \
100037+ const typeof( ((type *)0)->member ) *__mptr = (ptr); \
100038+ (type *)( (char *)__mptr - offsetof(type,member) );})
100039+
100040+/*
100041+ * Check at compile time that something is of a particular type.
100042+ * Always evaluates to 1 so you may use it easily in comparisons.
100043+ */
100044+#define typecheck(type,x) \
100045+({ type __dummy; \
100046+ typeof(x) __dummy2; \
100047+ (void)(&__dummy == &__dummy2); \
100048+ 1; \
100049+})
100050+
100051+/*
100052+ * Check at compile time that 'function' is a certain type, or is a pointer
100053+ * to that type (needs to use typedef for the function type.)
100054+ */
100055+#define typecheck_fn(type,function) \
100056+({ typeof(type) __tmp = function; \
100057+ (void)__tmp; \
100058+})
100059+
100060+#endif /* __KERNEL__ */
100061+
100062+#define SI_LOAD_SHIFT 16
100063+struct sysinfo {
100064+ long uptime; /* Seconds since boot */
100065+ unsigned long loads[3]; /* 1, 5, and 15 minute load averages */
100066+ unsigned long totalram; /* Total usable main memory size */
100067+ unsigned long freeram; /* Available memory size */
100068+ unsigned long sharedram; /* Amount of shared memory */
100069+ unsigned long bufferram; /* Memory used by buffers */
100070+ unsigned long totalswap; /* Total swap space size */
100071+ unsigned long freeswap; /* swap space still available */
100072+ unsigned short procs; /* Number of current processes */
100073+ unsigned short pad; /* explicit padding for m68k */
100074+ unsigned long totalhigh; /* Total high memory size */
100075+ unsigned long freehigh; /* Available high memory size */
100076+ unsigned int mem_unit; /* Memory unit size in bytes */
100077+ char _f[20-2*sizeof(long)-sizeof(int)]; /* Padding: libc5 uses this.. */
100078+};
100079+
100080+/* Force a compilation error if condition is true */
100081+#define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)]))
100082+
100083+/* Trap pasters of __FUNCTION__ at compile-time */
100084+#define __FUNCTION__ (__func__)
100085+
100086+#endif
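To make the helpers above concrete: container_of() recovers the enclosing structure from a pointer to one of its members, and min_t()/max_t() force both operands to a single type where the strictly checked min()/max() would warn. A minimal sketch, assuming <linux/list.h>; struct pkt, pkt_from_list() and clamp_len() are invented names for illustration.

#include <linux/kernel.h>
#include <linux/list.h>

/* Sketch: 'struct pkt' embeds a list_head; given a pointer to that member,
 * container_of() steps back to the enclosing structure. */
struct pkt {
	int len;
	struct list_head node;
};

static struct pkt *pkt_from_list(struct list_head *entry)
{
	return container_of(entry, struct pkt, node);
}

/* min_t() casts both sides to one type, so mixing size_t and int is safe. */
static size_t clamp_len(size_t want, int limit)
{
	return min_t(size_t, want, (size_t)limit);
}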
100087diff -Nur linux-2.6.16.33-noxen/include/linux/kexec.h linux-2.6.16.33/include/linux/kexec.h
100088--- linux-2.6.16.33-noxen/include/linux/kexec.h 2006-11-22 18:06:31.000000000 +0000
100089+++ linux-2.6.16.33/include/linux/kexec.h 2007-01-08 15:00:46.000000000 +0000
100090@@ -31,6 +31,13 @@
100091 #error KEXEC_ARCH not defined
100092 #endif
100093
100094+#ifndef KEXEC_ARCH_HAS_PAGE_MACROS
100095+#define kexec_page_to_pfn(page) page_to_pfn(page)
100096+#define kexec_pfn_to_page(pfn) pfn_to_page(pfn)
100097+#define kexec_virt_to_phys(addr) virt_to_phys(addr)
100098+#define kexec_phys_to_virt(addr) phys_to_virt(addr)
100099+#endif
100100+
100101 /*
100102 * This structure is used to hold the arguments that are used when loading
100103 * kernel binaries.
100104@@ -91,6 +98,12 @@
100105 extern NORET_TYPE void machine_kexec(struct kimage *image) ATTRIB_NORET;
100106 extern int machine_kexec_prepare(struct kimage *image);
100107 extern void machine_kexec_cleanup(struct kimage *image);
100108+#ifdef CONFIG_XEN
100109+extern int xen_machine_kexec_load(struct kimage *image);
100110+extern void xen_machine_kexec_unload(struct kimage *image);
100111+extern void xen_machine_kexec_setup_resources(void);
100112+extern void xen_machine_kexec_register_resources(struct resource *res);
100113+#endif
100114 extern asmlinkage long sys_kexec_load(unsigned long entry,
100115 unsigned long nr_segments,
100116 struct kexec_segment __user *segments,
100117diff -Nur linux-2.6.16.33-noxen/include/linux/mm.h linux-2.6.16.33/include/linux/mm.h
100118--- linux-2.6.16.33-noxen/include/linux/mm.h 2006-11-22 18:06:31.000000000 +0000
100119+++ linux-2.6.16.33/include/linux/mm.h 2007-01-08 15:00:46.000000000 +0000
100120@@ -166,6 +166,9 @@
100121 #define VM_NONLINEAR 0x00800000 /* Is non-linear (remap_file_pages) */
100122 #define VM_MAPPED_COPY 0x01000000 /* T if mapped copy of data (nommu mmap) */
100123 #define VM_INSERTPAGE 0x02000000 /* The vma has had "vm_insert_page()" done on it */
100124+#ifdef CONFIG_XEN
100125+#define VM_FOREIGN 0x04000000 /* Has pages belonging to another VM */
100126+#endif
100127
100128 #ifndef VM_STACK_DEFAULT_FLAGS /* arch can override this */
100129 #define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS
100130@@ -1012,6 +1015,13 @@
100131 #define FOLL_GET 0x04 /* do get_page on page */
100132 #define FOLL_ANON 0x08 /* give ZERO_PAGE if no pgtable */
100133
100134+#ifdef CONFIG_XEN
100135+typedef int (*pte_fn_t)(pte_t *pte, struct page *pmd_page, unsigned long addr,
100136+ void *data);
100137+extern int apply_to_page_range(struct mm_struct *mm, unsigned long address,
100138+ unsigned long size, pte_fn_t fn, void *data);
100139+#endif
100140+
100141 #ifdef CONFIG_PROC_FS
100142 void vm_stat_account(struct mm_struct *, unsigned long, struct file *, long);
100143 #else
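apply_to_page_range() gives Xen drivers a way to visit every PTE covering a virtual range; the pte_fn_t callback is invoked once per PTE and a non-zero return aborts the walk. A minimal sketch under CONFIG_XEN; count_present() and count_mapped() are invented names.

#include <linux/mm.h>
#include <asm/pgtable.h>

/* Sketch: called once for each PTE in [start, start + len). */
static int count_present(pte_t *pte, struct page *pmd_page,
			 unsigned long addr, void *data)
{
	unsigned long *nr = data;

	if (pte_present(*pte))
		(*nr)++;
	return 0;			/* non-zero would abort the walk */
}

static unsigned long count_mapped(struct mm_struct *mm,
				  unsigned long start, unsigned long len)
{
	unsigned long nr = 0;

	if (apply_to_page_range(mm, start, len, count_present, &nr) < 0)
		return 0;
	return nr;
}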
100144diff -Nur linux-2.6.16.33-noxen/include/linux/netdevice.h linux-2.6.16.33/include/linux/netdevice.h
100145--- linux-2.6.16.33-noxen/include/linux/netdevice.h 2006-11-22 18:06:31.000000000 +0000
100146+++ linux-2.6.16.33/include/linux/netdevice.h 2007-05-23 21:00:01.000000000 +0000
100147@@ -230,7 +230,8 @@
100148 __LINK_STATE_SCHED,
100149 __LINK_STATE_NOCARRIER,
100150 __LINK_STATE_RX_SCHED,
100151- __LINK_STATE_LINKWATCH_PENDING
100152+ __LINK_STATE_LINKWATCH_PENDING,
100153+ __LINK_STATE_QDISC_RUNNING,
100154 };
100155
100156
100157@@ -306,9 +307,17 @@
100158 #define NETIF_F_HW_VLAN_RX 256 /* Receive VLAN hw acceleration */
100159 #define NETIF_F_HW_VLAN_FILTER 512 /* Receive filtering on VLAN */
100160 #define NETIF_F_VLAN_CHALLENGED 1024 /* Device cannot handle VLAN packets */
100161-#define NETIF_F_TSO 2048 /* Can offload TCP/IP segmentation */
100162+#define NETIF_F_GSO 2048 /* Enable software GSO. */
100163 #define NETIF_F_LLTX 4096 /* LockLess TX */
100164-#define NETIF_F_UFO 8192 /* Can offload UDP Large Send*/
100165+
100166+ /* Segmentation offload features */
100167+#define NETIF_F_GSO_SHIFT 16
100168+#define NETIF_F_TSO (SKB_GSO_TCPV4 << NETIF_F_GSO_SHIFT)
100169+#define NETIF_F_UFO (SKB_GSO_UDPV4 << NETIF_F_GSO_SHIFT)
100170+#define NETIF_F_GSO_ROBUST (SKB_GSO_DODGY << NETIF_F_GSO_SHIFT)
100171+
100172+#define NETIF_F_GEN_CSUM (NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)
100173+#define NETIF_F_ALL_CSUM (NETIF_F_IP_CSUM | NETIF_F_GEN_CSUM)
100174
100175 struct net_device *next_sched;
100176
100177@@ -394,6 +403,9 @@
100178 struct list_head qdisc_list;
100179 unsigned long tx_queue_len; /* Max frames per queue allowed */
100180
100181+ /* Partially transmitted GSO packet. */
100182+ struct sk_buff *gso_skb;
100183+
100184 /* ingress path synchronizer */
100185 spinlock_t ingress_lock;
100186 struct Qdisc *qdisc_ingress;
100187@@ -402,7 +414,7 @@
100188 * One part is mostly used on xmit path (device)
100189 */
100190 /* hard_start_xmit synchronizer */
100191- spinlock_t xmit_lock ____cacheline_aligned_in_smp;
100192+ spinlock_t _xmit_lock ____cacheline_aligned_in_smp;
100193 /* cpu id of processor entered to hard_start_xmit or -1,
100194 if nobody entered there.
100195 */
100196@@ -527,6 +539,9 @@
100197 struct net_device *,
100198 struct packet_type *,
100199 struct net_device *);
100200+ struct sk_buff *(*gso_segment)(struct sk_buff *skb,
100201+ int features);
100202+ int (*gso_send_check)(struct sk_buff *skb);
100203 void *af_packet_priv;
100204 struct list_head list;
100205 };
100206@@ -693,7 +708,8 @@
100207 extern int dev_set_mtu(struct net_device *, int);
100208 extern int dev_set_mac_address(struct net_device *,
100209 struct sockaddr *);
100210-extern void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev);
100211+extern int dev_hard_start_xmit(struct sk_buff *skb,
100212+ struct net_device *dev);
100213
100214 extern void dev_init(void);
100215
100216@@ -900,11 +916,43 @@
100217 clear_bit(__LINK_STATE_RX_SCHED, &dev->state);
100218 }
100219
100220+static inline void netif_tx_lock(struct net_device *dev)
100221+{
100222+ spin_lock(&dev->_xmit_lock);
100223+ dev->xmit_lock_owner = smp_processor_id();
100224+}
100225+
100226+static inline void netif_tx_lock_bh(struct net_device *dev)
100227+{
100228+ spin_lock_bh(&dev->_xmit_lock);
100229+ dev->xmit_lock_owner = smp_processor_id();
100230+}
100231+
100232+static inline int netif_tx_trylock(struct net_device *dev)
100233+{
100234+ int ok = spin_trylock(&dev->_xmit_lock);
100235+ if (likely(ok))
100236+ dev->xmit_lock_owner = smp_processor_id();
100237+ return ok;
100238+}
100239+
100240+static inline void netif_tx_unlock(struct net_device *dev)
100241+{
100242+ dev->xmit_lock_owner = -1;
100243+ spin_unlock(&dev->_xmit_lock);
100244+}
100245+
100246+static inline void netif_tx_unlock_bh(struct net_device *dev)
100247+{
100248+ dev->xmit_lock_owner = -1;
100249+ spin_unlock_bh(&dev->_xmit_lock);
100250+}
100251+
100252 static inline void netif_tx_disable(struct net_device *dev)
100253 {
100254- spin_lock_bh(&dev->xmit_lock);
100255+ netif_tx_lock_bh(dev);
100256 netif_stop_queue(dev);
100257- spin_unlock_bh(&dev->xmit_lock);
100258+ netif_tx_unlock_bh(dev);
100259 }
100260
100261 /* These functions live elsewhere (drivers/net/net_init.c, but related) */
100262@@ -932,6 +980,7 @@
100263 extern int weight_p;
100264 extern int netdev_set_master(struct net_device *dev, struct net_device *master);
100265 extern int skb_checksum_help(struct sk_buff *skb, int inward);
100266+extern struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features);
100267 #ifdef CONFIG_BUG
100268 extern void netdev_rx_csum_fault(struct net_device *dev);
100269 #else
100270@@ -951,6 +1000,19 @@
100271
100272 extern void linkwatch_run_queue(void);
100273
100274+static inline int skb_gso_ok(struct sk_buff *skb, int features)
100275+{
100276+ int feature = skb_shinfo(skb)->gso_type << NETIF_F_GSO_SHIFT;
100277+ return (features & feature) == feature;
100278+}
100279+
100280+static inline int netif_needs_gso(struct net_device *dev, struct sk_buff *skb)
100281+{
100282+ return skb_is_gso(skb) &&
100283+ (!skb_gso_ok(skb, dev->features) ||
100284+ unlikely(skb->ip_summed != CHECKSUM_HW));
100285+}
100286+
100287 #endif /* __KERNEL__ */
100288
100289 #endif /* _LINUX_DEV_H */
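The netif_tx_lock*() helpers above replace direct use of the old dev->xmit_lock (now _xmit_lock) and additionally record the owning CPU in xmit_lock_owner. A sketch of how a driver path that used to take the lock by hand would look after the change; mydrv_restart() is an invented name.

/* Sketch: restart transmission from a driver's watchdog/timeout path. */
static void mydrv_restart(struct net_device *dev)
{
	/* was: spin_lock_bh(&dev->xmit_lock); */
	netif_tx_lock_bh(dev);

	if (netif_queue_stopped(dev))
		netif_wake_queue(dev);

	/* was: spin_unlock_bh(&dev->xmit_lock); */
	netif_tx_unlock_bh(dev);
}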
100290diff -Nur linux-2.6.16.33-noxen/include/linux/oprofile.h linux-2.6.16.33/include/linux/oprofile.h
100291--- linux-2.6.16.33-noxen/include/linux/oprofile.h 2006-11-22 18:06:31.000000000 +0000
100292+++ linux-2.6.16.33/include/linux/oprofile.h 2007-05-23 21:00:01.000000000 +0000
100293@@ -16,6 +16,8 @@
100294 #include <linux/types.h>
100295 #include <linux/spinlock.h>
100296 #include <asm/atomic.h>
100297+
100298+#include <xen/interface/xenoprof.h>
100299
100300 struct super_block;
100301 struct dentry;
100302@@ -27,6 +29,11 @@
100303 /* create any necessary configuration files in the oprofile fs.
100304 * Optional. */
100305 int (*create_files)(struct super_block * sb, struct dentry * root);
100306+ /* setup active domains with Xen */
100307+ int (*set_active)(int *active_domains, unsigned int adomains);
100308+ /* setup passive domains with Xen */
100309+ int (*set_passive)(int *passive_domains, unsigned int pdomains);
100310+
100311 /* Do any necessary interrupt setup. Optional. */
100312 int (*setup)(void);
100313 /* Do any necessary interrupt shutdown. Optional. */
100314@@ -68,6 +75,8 @@
100315 /* add a backtrace entry, to be called from the ->backtrace callback */
100316 void oprofile_add_trace(unsigned long eip);
100317
100318+/* add a domain switch entry */
100319+int oprofile_add_domain_switch(int32_t domain_id);
100320
100321 /**
100322 * Create a file of the given name as a child of the given root, with
100323diff -Nur linux-2.6.16.33-noxen/include/linux/rcupdate.h linux-2.6.16.33/include/linux/rcupdate.h
100324--- linux-2.6.16.33-noxen/include/linux/rcupdate.h 2006-11-22 18:06:31.000000000 +0000
100325+++ linux-2.6.16.33/include/linux/rcupdate.h 2007-05-23 21:00:01.000000000 +0000
100326@@ -134,6 +134,7 @@
100327 }
100328
100329 extern int rcu_pending(int cpu);
100330+extern int rcu_needs_cpu(int cpu);
100331
100332 /**
100333 * rcu_read_lock - mark the beginning of an RCU read-side critical section.
100334diff -Nur linux-2.6.16.33-noxen/include/linux/skbuff.h linux-2.6.16.33/include/linux/skbuff.h
100335--- linux-2.6.16.33-noxen/include/linux/skbuff.h 2006-11-22 18:06:31.000000000 +0000
100336+++ linux-2.6.16.33/include/linux/skbuff.h 2007-01-08 15:00:46.000000000 +0000
100337@@ -134,9 +134,10 @@
100338 struct skb_shared_info {
100339 atomic_t dataref;
100340 unsigned short nr_frags;
100341- unsigned short tso_size;
100342- unsigned short tso_segs;
100343- unsigned short ufo_size;
100344+ unsigned short gso_size;
100345+ /* Warning: this field is not always filled in (UFO)! */
100346+ unsigned short gso_segs;
100347+ unsigned short gso_type;
100348 unsigned int ip6_frag_id;
100349 struct sk_buff *frag_list;
100350 skb_frag_t frags[MAX_SKB_FRAGS];
100351@@ -168,6 +169,14 @@
100352 SKB_FCLONE_CLONE,
100353 };
100354
100355+enum {
100356+ SKB_GSO_TCPV4 = 1 << 0,
100357+ SKB_GSO_UDPV4 = 1 << 1,
100358+
100359+ /* This indicates the skb is from an untrusted source. */
100360+ SKB_GSO_DODGY = 1 << 2,
100361+};
100362+
100363 /**
100364 * struct sk_buff - socket buffer
100365 * @next: Next buffer in list
100366@@ -189,6 +198,8 @@
100367 * @local_df: allow local fragmentation
100368 * @cloned: Head may be cloned (check refcnt to be sure)
100369 * @nohdr: Payload reference only, must not modify header
100370+ * @proto_data_valid: Protocol data validated since arriving at localhost
100371+ * @proto_csum_blank: Protocol csum must be added before leaving localhost
100372 * @pkt_type: Packet class
100373 * @fclone: skbuff clone status
100374 * @ip_summed: Driver fed us an IP checksum
100375@@ -265,7 +276,13 @@
100376 nfctinfo:3;
100377 __u8 pkt_type:3,
100378 fclone:2,
100379+#ifndef CONFIG_XEN
100380 ipvs_property:1;
100381+#else
100382+ ipvs_property:1,
100383+ proto_data_valid:1,
100384+ proto_csum_blank:1;
100385+#endif
100386 __be16 protocol;
100387
100388 void (*destructor)(struct sk_buff *skb);
100389@@ -321,7 +338,8 @@
100390
100391 extern struct sk_buff *alloc_skb_from_cache(kmem_cache_t *cp,
100392 unsigned int size,
100393- gfp_t priority);
100394+ gfp_t priority,
100395+ int fclone);
100396 extern void kfree_skbmem(struct sk_buff *skb);
100397 extern struct sk_buff *skb_clone(struct sk_buff *skb,
100398 gfp_t priority);
100399@@ -1051,7 +1069,7 @@
100400 return skb;
100401 }
100402 #else
100403-extern struct sk_buff *__dev_alloc_skb(unsigned int length, int gfp_mask);
100404+extern struct sk_buff *__dev_alloc_skb(unsigned int length, gfp_t gfp_mask);
100405 #endif
100406
100407 /**
100408@@ -1148,18 +1166,34 @@
100409 return 0;
100410 }
100411
100412+static inline int __skb_linearize(struct sk_buff *skb)
100413+{
100414+ return __pskb_pull_tail(skb, skb->data_len) ? 0 : -ENOMEM;
100415+}
100416+
100417 /**
100418 * skb_linearize - convert paged skb to linear one
100419 * @skb: buffer to linearize
100420- * @gfp: allocation mode
100421 *
100422 * If there is no free memory -ENOMEM is returned, otherwise zero
100423 * is returned and the old skb data released.
100424 */
100425-extern int __skb_linearize(struct sk_buff *skb, gfp_t gfp);
100426-static inline int skb_linearize(struct sk_buff *skb, gfp_t gfp)
100427+static inline int skb_linearize(struct sk_buff *skb)
100428+{
100429+ return skb_is_nonlinear(skb) ? __skb_linearize(skb) : 0;
100430+}
100431+
100432+/**
100433+ * skb_linearize_cow - make sure skb is linear and writable
100434+ * @skb: buffer to process
100435+ *
100436+ * If there is no free memory -ENOMEM is returned, otherwise zero
100437+ * is returned and the old skb data released.
100438+ */
100439+static inline int skb_linearize_cow(struct sk_buff *skb)
100440 {
100441- return __skb_linearize(skb, gfp);
100442+ return skb_is_nonlinear(skb) || skb_cloned(skb) ?
100443+ __skb_linearize(skb) : 0;
100444 }
100445
100446 /**
100447@@ -1254,6 +1288,7 @@
100448 struct sk_buff *skb1, const u32 len);
100449
100450 extern void skb_release_data(struct sk_buff *skb);
100451+extern struct sk_buff *skb_segment(struct sk_buff *skb, int features);
100452
100453 static inline void *skb_header_pointer(const struct sk_buff *skb, int offset,
100454 int len, void *buffer)
100455@@ -1377,5 +1412,10 @@
100456 static inline void nf_reset(struct sk_buff *skb) {}
100457 #endif /* CONFIG_NETFILTER */
100458
100459+static inline int skb_is_gso(const struct sk_buff *skb)
100460+{
100461+ return skb_shinfo(skb)->gso_size;
100462+}
100463+
100464 #endif /* __KERNEL__ */
100465 #endif /* _LINUX_SKBUFF_H */
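With tso_size/tso_segs/ufo_size folded into the generic gso_* fields, callers test skb_is_gso() instead of poking at tso_size, and skb_linearize() no longer takes a gfp argument; skb_linearize_cow() additionally guarantees the data is private before headers are edited in place. A short sketch of a transmit-path check; mydrv_prep_skb() is an invented name.

#include <linux/errno.h>
#include <linux/skbuff.h>

/* Sketch: reject GSO packets this example device cannot segment, and make
 * the buffer linear and writable before touching headers in place. */
static int mydrv_prep_skb(struct sk_buff *skb)
{
	if (skb_is_gso(skb))
		return -EINVAL;

	if (skb_linearize_cow(skb))
		return -ENOMEM;

	return 0;
}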
100466diff -Nur linux-2.6.16.33-noxen/include/net/pkt_sched.h linux-2.6.16.33/include/net/pkt_sched.h
100467--- linux-2.6.16.33-noxen/include/net/pkt_sched.h 2006-11-22 18:06:31.000000000 +0000
100468+++ linux-2.6.16.33/include/net/pkt_sched.h 2007-05-23 21:00:01.000000000 +0000
100469@@ -218,12 +218,13 @@
100470 struct rtattr *tab);
100471 extern void qdisc_put_rtab(struct qdisc_rate_table *tab);
100472
100473-extern int qdisc_restart(struct net_device *dev);
100474+extern void __qdisc_run(struct net_device *dev);
100475
100476 static inline void qdisc_run(struct net_device *dev)
100477 {
100478- while (!netif_queue_stopped(dev) && qdisc_restart(dev) < 0)
100479- /* NOTHING */;
100480+ if (!netif_queue_stopped(dev) &&
100481+ !test_and_set_bit(__LINK_STATE_QDISC_RUNNING, &dev->state))
100482+ __qdisc_run(dev);
100483 }
100484
100485 extern int tc_classify(struct sk_buff *skb, struct tcf_proto *tp,
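qdisc_run() now uses the __LINK_STATE_QDISC_RUNNING bit as a latch so that only one CPU pumps a device's queue at a time; the heavy lifting moves into __qdisc_run(). A rough sketch of the shape that function presumably takes in net/sched/sch_generic.c (not shown in this hunk): keep calling qdisc_restart() until the queue drains or the device stops, then drop the latch.

/* Sketch only -- the real body lives in net/sched/sch_generic.c. */
void __qdisc_run(struct net_device *dev)
{
	while (qdisc_restart(dev) < 0 && !netif_queue_stopped(dev))
		/* keep dequeueing */;

	clear_bit(__LINK_STATE_QDISC_RUNNING, &dev->state);
}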
100486diff -Nur linux-2.6.16.33-noxen/include/net/protocol.h linux-2.6.16.33/include/net/protocol.h
100487--- linux-2.6.16.33-noxen/include/net/protocol.h 2006-11-22 18:06:31.000000000 +0000
100488+++ linux-2.6.16.33/include/net/protocol.h 2007-05-23 21:00:01.000000000 +0000
100489@@ -37,6 +37,9 @@
100490 struct net_protocol {
100491 int (*handler)(struct sk_buff *skb);
100492 void (*err_handler)(struct sk_buff *skb, u32 info);
100493+ int (*gso_send_check)(struct sk_buff *skb);
100494+ struct sk_buff *(*gso_segment)(struct sk_buff *skb,
100495+ int features);
100496 int no_policy;
100497 };
100498
100499diff -Nur linux-2.6.16.33-noxen/include/net/sock.h linux-2.6.16.33/include/net/sock.h
100500--- linux-2.6.16.33-noxen/include/net/sock.h 2006-11-22 18:06:31.000000000 +0000
100501+++ linux-2.6.16.33/include/net/sock.h 2007-05-23 21:00:01.000000000 +0000
100502@@ -1064,9 +1064,13 @@
100503 {
100504 __sk_dst_set(sk, dst);
100505 sk->sk_route_caps = dst->dev->features;
100506+ if (sk->sk_route_caps & NETIF_F_GSO)
100507+ sk->sk_route_caps |= NETIF_F_TSO;
100508 if (sk->sk_route_caps & NETIF_F_TSO) {
100509 if (sock_flag(sk, SOCK_NO_LARGESEND) || dst->header_len)
100510 sk->sk_route_caps &= ~NETIF_F_TSO;
100511+ else
100512+ sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
100513 }
100514 }
100515
100516diff -Nur linux-2.6.16.33-noxen/include/net/tcp.h linux-2.6.16.33/include/net/tcp.h
100517--- linux-2.6.16.33-noxen/include/net/tcp.h 2006-11-22 18:06:31.000000000 +0000
100518+++ linux-2.6.16.33/include/net/tcp.h 2007-05-23 21:00:01.000000000 +0000
100519@@ -552,13 +552,13 @@
100520 */
100521 static inline int tcp_skb_pcount(const struct sk_buff *skb)
100522 {
100523- return skb_shinfo(skb)->tso_segs;
100524+ return skb_shinfo(skb)->gso_segs;
100525 }
100526
100527 /* This is valid iff tcp_skb_pcount() > 1. */
100528 static inline int tcp_skb_mss(const struct sk_buff *skb)
100529 {
100530- return skb_shinfo(skb)->tso_size;
100531+ return skb_shinfo(skb)->gso_size;
100532 }
100533
100534 static inline void tcp_dec_pcount_approx(__u32 *count,
100535@@ -1063,6 +1063,9 @@
100536
100537 extern int tcp_v4_destroy_sock(struct sock *sk);
100538
100539+extern int tcp_v4_gso_send_check(struct sk_buff *skb);
100540+extern struct sk_buff *tcp_tso_segment(struct sk_buff *skb, int features);
100541+
100542 #ifdef CONFIG_PROC_FS
100543 extern int tcp4_proc_init(void);
100544 extern void tcp4_proc_exit(void);
100545diff -Nur linux-2.6.16.33-noxen/include/xen/balloon.h linux-2.6.16.33/include/xen/balloon.h
100546--- linux-2.6.16.33-noxen/include/xen/balloon.h 1970-01-01 00:00:00.000000000 +0000
100547+++ linux-2.6.16.33/include/xen/balloon.h 2007-01-08 15:00:46.000000000 +0000
100548@@ -0,0 +1,57 @@
100549+/******************************************************************************
100550+ * balloon.h
100551+ *
100552+ * Xen balloon driver - enables returning/claiming memory to/from Xen.
100553+ *
100554+ * Copyright (c) 2003, B Dragovic
100555+ * Copyright (c) 2003-2004, M Williamson, K Fraser
100556+ *
100557+ * This program is free software; you can redistribute it and/or
100558+ * modify it under the terms of the GNU General Public License version 2
100559+ * as published by the Free Software Foundation; or, when distributed
100560+ * separately from the Linux kernel or incorporated into other
100561+ * software packages, subject to the following license:
100562+ *
100563+ * Permission is hereby granted, free of charge, to any person obtaining a copy
100564+ * of this source file (the "Software"), to deal in the Software without
100565+ * restriction, including without limitation the rights to use, copy, modify,
100566+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
100567+ * and to permit persons to whom the Software is furnished to do so, subject to
100568+ * the following conditions:
100569+ *
100570+ * The above copyright notice and this permission notice shall be included in
100571+ * all copies or substantial portions of the Software.
100572+ *
100573+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
100574+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
100575+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
100576+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
100577+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
100578+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
100579+ * IN THE SOFTWARE.
100580+ */
100581+
100582+#ifndef __ASM_BALLOON_H__
100583+#define __ASM_BALLOON_H__
100584+
100585+/*
100586+ * Inform the balloon driver that it should allow some slop for device-driver
100587+ * memory activities.
100588+ */
100589+void balloon_update_driver_allowance(long delta);
100590+
100591+/* Allocate/free a set of empty pages in low memory (i.e., no RAM mapped). */
100592+struct page **alloc_empty_pages_and_pagevec(int nr_pages);
100593+void free_empty_pages_and_pagevec(struct page **pagevec, int nr_pages);
100594+
100595+void balloon_release_driver_page(struct page *page);
100596+
100597+/*
100598+ * Prevent the balloon driver from changing the memory reservation during
100599+ * a driver critical region.
100600+ */
100601+extern spinlock_t balloon_lock;
100602+#define balloon_lock(__flags) spin_lock_irqsave(&balloon_lock, __flags)
100603+#define balloon_unlock(__flags) spin_unlock_irqrestore(&balloon_lock, __flags)
100604+
100605+#endif /* __ASM_BALLOON_H__ */
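Backend drivers typically use the interface above to borrow "empty" low-memory page structures (no RAM behind them) into which foreign frames can later be mapped, returning them when the device is torn down. A minimal sketch; mydrv_setup() is an invented name and error handling is kept to the bare minimum.

#include <linux/errno.h>
#include <linux/mm.h>
#include <xen/balloon.h>

static int mydrv_setup(int nr_pages)
{
	struct page **pages;

	pages = alloc_empty_pages_and_pagevec(nr_pages);
	if (pages == NULL)
		return -ENOMEM;

	/* ... map foreign frames into these empty pages ... */

	free_empty_pages_and_pagevec(pages, nr_pages);
	return 0;
}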
100606diff -Nur linux-2.6.16.33-noxen/include/xen/cpu_hotplug.h linux-2.6.16.33/include/xen/cpu_hotplug.h
100607--- linux-2.6.16.33-noxen/include/xen/cpu_hotplug.h 1970-01-01 00:00:00.000000000 +0000
100608+++ linux-2.6.16.33/include/xen/cpu_hotplug.h 2007-01-08 15:00:46.000000000 +0000
100609@@ -0,0 +1,44 @@
100610+#ifndef __XEN_CPU_HOTPLUG_H__
100611+#define __XEN_CPU_HOTPLUG_H__
100612+
100613+#include <linux/config.h>
100614+#include <linux/kernel.h>
100615+#include <linux/cpumask.h>
100616+
100617+#if defined(CONFIG_HOTPLUG_CPU)
100618+
100619+#if defined(CONFIG_X86)
100620+void cpu_initialize_context(unsigned int cpu);
100621+#else
100622+#define cpu_initialize_context(cpu) ((void)0)
100623+#endif
100624+
100625+int cpu_up_check(unsigned int cpu);
100626+void init_xenbus_allowed_cpumask(void);
100627+int smp_suspend(void);
100628+void smp_resume(void);
100629+
100630+void cpu_bringup(void);
100631+
100632+#else /* !defined(CONFIG_HOTPLUG_CPU) */
100633+
100634+#define cpu_up_check(cpu) (0)
100635+#define init_xenbus_allowed_cpumask() ((void)0)
100636+
100637+static inline int smp_suspend(void)
100638+{
100639+ if (num_online_cpus() > 1) {
100640+ printk(KERN_WARNING "Can't suspend SMP guests "
100641+ "without CONFIG_HOTPLUG_CPU\n");
100642+ return -EOPNOTSUPP;
100643+ }
100644+ return 0;
100645+}
100646+
100647+static inline void smp_resume(void)
100648+{
100649+}
100650+
100651+#endif /* !defined(CONFIG_HOTPLUG_CPU) */
100652+
100653+#endif /* __XEN_CPU_HOTPLUG_H__ */
100654diff -Nur linux-2.6.16.33-noxen/include/xen/driver_util.h linux-2.6.16.33/include/xen/driver_util.h
100655--- linux-2.6.16.33-noxen/include/xen/driver_util.h 1970-01-01 00:00:00.000000000 +0000
100656+++ linux-2.6.16.33/include/xen/driver_util.h 2007-01-08 15:00:46.000000000 +0000
100657@@ -0,0 +1,16 @@
100658+
100659+#ifndef __ASM_XEN_DRIVER_UTIL_H__
100660+#define __ASM_XEN_DRIVER_UTIL_H__
100661+
100662+#include <linux/config.h>
100663+#include <linux/vmalloc.h>
100664+
100665+/* Allocate/destroy a 'vmalloc' VM area. */
100666+extern struct vm_struct *alloc_vm_area(unsigned long size);
100667+extern void free_vm_area(struct vm_struct *area);
100668+
100669+/* Lock an area so that PTEs are accessible in the current address space. */
100670+extern void lock_vm_area(struct vm_struct *area);
100671+extern void unlock_vm_area(struct vm_struct *area);
100672+
100673+#endif /* __ASM_XEN_DRIVER_UTIL_H__ */
100674diff -Nur linux-2.6.16.33-noxen/include/xen/evtchn.h linux-2.6.16.33/include/xen/evtchn.h
100675--- linux-2.6.16.33-noxen/include/xen/evtchn.h 1970-01-01 00:00:00.000000000 +0000
100676+++ linux-2.6.16.33/include/xen/evtchn.h 2007-01-08 15:00:46.000000000 +0000
100677@@ -0,0 +1,114 @@
100678+/******************************************************************************
100679+ * evtchn.h
100680+ *
100681+ * Communication via Xen event channels.
100682+ * Also definitions for the device that demuxes notifications to userspace.
100683+ *
100684+ * Copyright (c) 2004-2005, K A Fraser
100685+ *
100686+ * This program is free software; you can redistribute it and/or
100687+ * modify it under the terms of the GNU General Public License version 2
100688+ * as published by the Free Software Foundation; or, when distributed
100689+ * separately from the Linux kernel or incorporated into other
100690+ * software packages, subject to the following license:
100691+ *
100692+ * Permission is hereby granted, free of charge, to any person obtaining a copy
100693+ * of this source file (the "Software"), to deal in the Software without
100694+ * restriction, including without limitation the rights to use, copy, modify,
100695+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
100696+ * and to permit persons to whom the Software is furnished to do so, subject to
100697+ * the following conditions:
100698+ *
100699+ * The above copyright notice and this permission notice shall be included in
100700+ * all copies or substantial portions of the Software.
100701+ *
100702+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
100703+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
100704+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
100705+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
100706+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
100707+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
100708+ * IN THE SOFTWARE.
100709+ */
100710+
100711+#ifndef __ASM_EVTCHN_H__
100712+#define __ASM_EVTCHN_H__
100713+
100714+#include <linux/config.h>
100715+#include <linux/interrupt.h>
100716+#include <asm/hypervisor.h>
100717+#include <asm/ptrace.h>
100718+#include <asm/synch_bitops.h>
100719+#include <xen/interface/event_channel.h>
100720+#include <linux/smp.h>
100721+
100722+/*
100723+ * LOW-LEVEL DEFINITIONS
100724+ */
100725+
100726+/*
100727+ * Dynamically bind an event source to an IRQ-like callback handler.
100728+ * On some platforms this may not be implemented via the Linux IRQ subsystem.
100729+ * The IRQ argument passed to the callback handler is the same as returned
100730+ * from the bind call. It may not correspond to a Linux IRQ number.
100731+ * Returns IRQ or negative errno.
100732+ * UNBIND: Takes IRQ to unbind from; automatically closes the event channel.
100733+ */
100734+extern int bind_evtchn_to_irqhandler(
100735+ unsigned int evtchn,
100736+ irqreturn_t (*handler)(int, void *, struct pt_regs *),
100737+ unsigned long irqflags,
100738+ const char *devname,
100739+ void *dev_id);
100740+extern int bind_virq_to_irqhandler(
100741+ unsigned int virq,
100742+ unsigned int cpu,
100743+ irqreturn_t (*handler)(int, void *, struct pt_regs *),
100744+ unsigned long irqflags,
100745+ const char *devname,
100746+ void *dev_id);
100747+extern int bind_ipi_to_irqhandler(
100748+ unsigned int ipi,
100749+ unsigned int cpu,
100750+ irqreturn_t (*handler)(int, void *, struct pt_regs *),
100751+ unsigned long irqflags,
100752+ const char *devname,
100753+ void *dev_id);
100754+
100755+/*
100756+ * Common unbind function for all event sources. Takes IRQ to unbind from.
100757+ * Automatically closes the underlying event channel (even for bindings
100758+ * made with bind_evtchn_to_irqhandler()).
100759+ */
100760+extern void unbind_from_irqhandler(unsigned int irq, void *dev_id);
100761+
100762+extern void irq_resume(void);
100763+
100764+/* Entry point for notifications into Linux subsystems. */
100765+asmlinkage void evtchn_do_upcall(struct pt_regs *regs);
100766+
100767+/* Entry point for notifications into the userland character device. */
100768+extern void evtchn_device_upcall(int port);
100769+
100770+extern void mask_evtchn(int port);
100771+extern void unmask_evtchn(int port);
100772+
100773+static inline void clear_evtchn(int port)
100774+{
100775+ shared_info_t *s = HYPERVISOR_shared_info;
100776+ synch_clear_bit(port, &s->evtchn_pending[0]);
100777+}
100778+
100779+static inline void notify_remote_via_evtchn(int port)
100780+{
100781+ struct evtchn_send send = { .port = port };
100782+ (void)HYPERVISOR_event_channel_op(EVTCHNOP_send, &send);
100783+}
100784+
100785+/*
100786+ * Unlike notify_remote_via_evtchn(), this is safe to use across
100787+ * save/restore. Notifications on a broken connection are silently dropped.
100788+ */
100789+extern void notify_remote_via_irq(int irq);
100790+
100791+#endif /* __ASM_EVTCHN_H__ */
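Event channels play the role interrupts play on bare metal: a driver binds a channel to a handler and keeps the returned IRQ cookie for later unbinding. A minimal sketch using the handler signature declared above; mydrv_interrupt(), mydrv_connect() and the "mydrv" name are invented.

#include <linux/interrupt.h>
#include <xen/evtchn.h>

static irqreturn_t mydrv_interrupt(int irq, void *dev_id, struct pt_regs *regs)
{
	/* process the notification from the remote domain */
	return IRQ_HANDLED;
}

static int mydrv_connect(unsigned int evtchn, void *dev_id)
{
	int irq;

	irq = bind_evtchn_to_irqhandler(evtchn, mydrv_interrupt,
					0 /* irqflags */, "mydrv", dev_id);
	if (irq < 0)
		return irq;

	/* ... and on teardown: */
	unbind_from_irqhandler(irq, dev_id);
	return 0;
}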
100792diff -Nur linux-2.6.16.33-noxen/include/xen/features.h linux-2.6.16.33/include/xen/features.h
100793--- linux-2.6.16.33-noxen/include/xen/features.h 1970-01-01 00:00:00.000000000 +0000
100794+++ linux-2.6.16.33/include/xen/features.h 2007-01-08 15:00:46.000000000 +0000
100795@@ -0,0 +1,20 @@
100796+/******************************************************************************
100797+ * features.h
100798+ *
100799+ * Query the features reported by Xen.
100800+ *
100801+ * Copyright (c) 2006, Ian Campbell
100802+ */
100803+
100804+#ifndef __ASM_XEN_FEATURES_H__
100805+#define __ASM_XEN_FEATURES_H__
100806+
100807+#include <xen/interface/version.h>
100808+
100809+extern void setup_xen_features(void);
100810+
100811+extern u8 xen_features[XENFEAT_NR_SUBMAPS * 32];
100812+
100813+#define xen_feature(flag) (xen_features[flag])
100814+
100815+#endif /* __ASM_XEN_FEATURES_H__ */
100816diff -Nur linux-2.6.16.33-noxen/include/xen/foreign_page.h linux-2.6.16.33/include/xen/foreign_page.h
100817--- linux-2.6.16.33-noxen/include/xen/foreign_page.h 1970-01-01 00:00:00.000000000 +0000
100818+++ linux-2.6.16.33/include/xen/foreign_page.h 2007-01-08 15:00:46.000000000 +0000
100819@@ -0,0 +1,30 @@
100820+/******************************************************************************
100821+ * foreign_page.h
100822+ *
100823+ * Provide a "foreign" page type that is owned by a foreign allocator and
100824+ * not the normal buddy allocator in page_alloc.c
100825+ *
100826+ * Copyright (c) 2004, K A Fraser
100827+ */
100828+
100829+#ifndef __ASM_XEN_FOREIGN_PAGE_H__
100830+#define __ASM_XEN_FOREIGN_PAGE_H__
100831+
100832+#define PG_foreign PG_arch_1
100833+
100834+#define PageForeign(page) test_bit(PG_foreign, &(page)->flags)
100835+
100836+#define SetPageForeign(page, dtor) do { \
100837+ set_bit(PG_foreign, &(page)->flags); \
100838+ (page)->mapping = (void *)dtor; \
100839+} while (0)
100840+
100841+#define ClearPageForeign(page) do { \
100842+ clear_bit(PG_foreign, &(page)->flags); \
100843+ (page)->mapping = NULL; \
100844+} while (0)
100845+
100846+#define PageForeignDestructor(page) \
100847+ ( (void (*) (struct page *)) (page)->mapping )
100848+
100849+#endif /* __ASM_XEN_FOREIGN_PAGE_H__ */
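PG_foreign marks a page as owned by a foreign allocator, and page->mapping is reused to hold a destructor that the free paths invoke instead of returning the page to the buddy allocator. A sketch of the intended usage; mydrv_page_release() and mydrv_adopt_page() are invented names.

#include <linux/mm.h>
#include <xen/foreign_page.h>

/* Sketch: the destructor runs when the page is finally released. */
static void mydrv_page_release(struct page *page)
{
	ClearPageForeign(page);
	/* ... hand the underlying frame back to its real owner ... */
}

static void mydrv_adopt_page(struct page *page)
{
	SetPageForeign(page, mydrv_page_release);
}

/* A free path checks the flag and calls the stored destructor, roughly:
 *
 *	if (PageForeign(page))
 *		PageForeignDestructor(page)(page);
 */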
100850diff -Nur linux-2.6.16.33-noxen/include/xen/gnttab.h linux-2.6.16.33/include/xen/gnttab.h
100851--- linux-2.6.16.33-noxen/include/xen/gnttab.h 1970-01-01 00:00:00.000000000 +0000
100852+++ linux-2.6.16.33/include/xen/gnttab.h 2007-01-08 15:00:46.000000000 +0000
100853@@ -0,0 +1,152 @@
100854+/******************************************************************************
100855+ * gnttab.h
100856+ *
100857+ * Two sets of functionality:
100858+ * 1. Granting foreign access to our memory reservation.
100859+ * 2. Accessing others' memory reservations via grant references.
100860+ * (i.e., mechanisms for both sender and recipient of grant references)
100861+ *
100862+ * Copyright (c) 2004-2005, K A Fraser
100863+ * Copyright (c) 2005, Christopher Clark
100864+ *
100865+ * This program is free software; you can redistribute it and/or
100866+ * modify it under the terms of the GNU General Public License version 2
100867+ * as published by the Free Software Foundation; or, when distributed
100868+ * separately from the Linux kernel or incorporated into other
100869+ * software packages, subject to the following license:
100870+ *
100871+ * Permission is hereby granted, free of charge, to any person obtaining a copy
100872+ * of this source file (the "Software"), to deal in the Software without
100873+ * restriction, including without limitation the rights to use, copy, modify,
100874+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
100875+ * and to permit persons to whom the Software is furnished to do so, subject to
100876+ * the following conditions:
100877+ *
100878+ * The above copyright notice and this permission notice shall be included in
100879+ * all copies or substantial portions of the Software.
100880+ *
100881+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
100882+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
100883+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
100884+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
100885+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
100886+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
100887+ * IN THE SOFTWARE.
100888+ */
100889+
100890+#ifndef __ASM_GNTTAB_H__
100891+#define __ASM_GNTTAB_H__
100892+
100893+#include <linux/config.h>
100894+#include <asm/hypervisor.h>
100895+#include <asm/maddr.h> /* maddr_t */
100896+#include <xen/interface/grant_table.h>
100897+#include <xen/features.h>
100898+
100899+/* NR_GRANT_FRAMES must be less than or equal to that configured in Xen */
100900+#ifdef __ia64__
100901+#define NR_GRANT_FRAMES 1
100902+#else
100903+#define NR_GRANT_FRAMES 4
100904+#endif
100905+
100906+struct gnttab_free_callback {
100907+ struct gnttab_free_callback *next;
100908+ void (*fn)(void *);
100909+ void *arg;
100910+ u16 count;
100911+};
100912+
100913+int gnttab_grant_foreign_access(domid_t domid, unsigned long frame,
100914+ int readonly);
100915+
100916+/*
100917+ * End access through the given grant reference, iff the grant entry is no
100918+ * longer in use. Return 1 if the grant entry was freed, 0 if it is still in
100919+ * use.
100920+ */
100921+int gnttab_end_foreign_access_ref(grant_ref_t ref, int readonly);
100922+
100923+/*
100924+ * Eventually end access through the given grant reference, and once that
100925+ * access has been ended, free the given page too. Access will be ended
100926+ * immediately iff the grant entry is not in use, otherwise it will happen
100927+ * some time later. page may be 0, in which case no freeing will occur.
100928+ */
100929+void gnttab_end_foreign_access(grant_ref_t ref, int readonly,
100930+ unsigned long page);
100931+
100932+int gnttab_grant_foreign_transfer(domid_t domid, unsigned long pfn);
100933+
100934+unsigned long gnttab_end_foreign_transfer_ref(grant_ref_t ref);
100935+unsigned long gnttab_end_foreign_transfer(grant_ref_t ref);
100936+
100937+int gnttab_query_foreign_access(grant_ref_t ref);
100938+
100939+/*
100940+ * operations on reserved batches of grant references
100941+ */
100942+int gnttab_alloc_grant_references(u16 count, grant_ref_t *pprivate_head);
100943+
100944+void gnttab_free_grant_reference(grant_ref_t ref);
100945+
100946+void gnttab_free_grant_references(grant_ref_t head);
100947+
100948+int gnttab_empty_grant_references(const grant_ref_t *pprivate_head);
100949+
100950+int gnttab_claim_grant_reference(grant_ref_t *pprivate_head);
100951+
100952+void gnttab_release_grant_reference(grant_ref_t *private_head,
100953+ grant_ref_t release);
100954+
100955+void gnttab_request_free_callback(struct gnttab_free_callback *callback,
100956+ void (*fn)(void *), void *arg, u16 count);
100957+void gnttab_cancel_free_callback(struct gnttab_free_callback *callback);
100958+
100959+void gnttab_grant_foreign_access_ref(grant_ref_t ref, domid_t domid,
100960+ unsigned long frame, int readonly);
100961+
100962+void gnttab_grant_foreign_transfer_ref(grant_ref_t, domid_t domid,
100963+ unsigned long pfn);
100964+
100965+#ifdef __ia64__
100966+#define gnttab_map_vaddr(map) __va(map.dev_bus_addr)
100967+#else
100968+#define gnttab_map_vaddr(map) ((void *)(map.host_virt_addr))
100969+#endif
100970+
100971+int gnttab_suspend(void);
100972+int gnttab_resume(void);
100973+
100974+static inline void
100975+gnttab_set_map_op(struct gnttab_map_grant_ref *map, maddr_t addr,
100976+ uint32_t flags, grant_ref_t ref, domid_t domid)
100977+{
100978+ if (flags & GNTMAP_contains_pte)
100979+ map->host_addr = addr;
100980+ else if (xen_feature(XENFEAT_auto_translated_physmap))
100981+ map->host_addr = __pa(addr);
100982+ else
100983+ map->host_addr = addr;
100984+
100985+ map->flags = flags;
100986+ map->ref = ref;
100987+ map->dom = domid;
100988+}
100989+
100990+static inline void
100991+gnttab_set_unmap_op(struct gnttab_unmap_grant_ref *unmap, maddr_t addr,
100992+ uint32_t flags, grant_handle_t handle)
100993+{
100994+ if (flags & GNTMAP_contains_pte)
100995+ unmap->host_addr = addr;
100996+ else if (xen_feature(XENFEAT_auto_translated_physmap))
100997+ unmap->host_addr = __pa(addr);
100998+ else
100999+ unmap->host_addr = addr;
101000+
101001+ unmap->handle = handle;
101002+ unmap->dev_bus_addr = 0;
101003+}
101004+
101005+#endif /* __ASM_GNTTAB_H__ */
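On the granting side the usual lifecycle is: grant a peer domain access to a frame, publish the returned grant reference (via xenstore or a shared ring), and end the access once the connection is torn down. A minimal sketch; mydrv_share_page() is an invented name and the mfn is assumed to be already known.

#include <xen/gnttab.h>

static int mydrv_share_page(domid_t otherend, unsigned long mfn)
{
	int ref;

	ref = gnttab_grant_foreign_access(otherend, mfn, 0 /* writable */);
	if (ref < 0)
		return ref;

	/* ... tell 'otherend' about 'ref', run the connection ... */

	/* 0 for the page argument: nothing to free here */
	gnttab_end_foreign_access(ref, 0 /* not readonly */, 0);
	return 0;
}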
101006diff -Nur linux-2.6.16.33-noxen/include/xen/hvm.h linux-2.6.16.33/include/xen/hvm.h
101007--- linux-2.6.16.33-noxen/include/xen/hvm.h 1970-01-01 00:00:00.000000000 +0000
101008+++ linux-2.6.16.33/include/xen/hvm.h 2007-01-08 15:00:46.000000000 +0000
101009@@ -0,0 +1,24 @@
101010+/* Simple wrappers around HVM functions */
101011+#ifndef XEN_HVM_H__
101012+#define XEN_HVM_H__
101013+
101014+#include <xen/interface/hvm/params.h>
101015+#include <asm/hypercall.h>
101016+
101017+static inline unsigned long hvm_get_parameter(int idx)
101018+{
101019+ struct xen_hvm_param xhv;
101020+ int r;
101021+
101022+ xhv.domid = DOMID_SELF;
101023+ xhv.index = idx;
101024+ r = HYPERVISOR_hvm_op(HVMOP_get_param, &xhv);
101025+ if (r < 0) {
101026+ printk(KERN_ERR "cannot get hvm parameter %d: %d.\n",
101027+ idx, r);
101028+ return 0;
101029+ }
101030+ return xhv.value;
101031+}
101032+
101033+#endif /* XEN_HVM_H__ */
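hvm_get_parameter() is a thin wrapper around HVMOP_get_param. For instance, an HVM guest can look up its xenstore event channel this way, assuming HVM_PARAM_STORE_EVTCHN is among the indices defined in xen/interface/hvm/params.h; mydrv_find_store_evtchn() is an invented name.

#include <linux/errno.h>
#include <xen/hvm.h>

static int mydrv_find_store_evtchn(void)
{
	unsigned long evtchn = hvm_get_parameter(HVM_PARAM_STORE_EVTCHN);

	if (evtchn == 0)
		return -ENODEV;	/* hvm_get_parameter() returns 0 on error */
	return (int)evtchn;
}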
101034diff -Nur linux-2.6.16.33-noxen/include/xen/hypervisor_sysfs.h linux-2.6.16.33/include/xen/hypervisor_sysfs.h
101035--- linux-2.6.16.33-noxen/include/xen/hypervisor_sysfs.h 1970-01-01 00:00:00.000000000 +0000
101036+++ linux-2.6.16.33/include/xen/hypervisor_sysfs.h 2007-01-08 15:00:46.000000000 +0000
101037@@ -0,0 +1,32 @@
101038+/*
101039+ * copyright (c) 2006 IBM Corporation
101040+ * Authored by: Mike D. Day <ncmike@us.ibm.com>
101041+ *
101042+ * This program is free software; you can redistribute it and/or modify
101043+ * it under the terms of the GNU General Public License version 2 as
101044+ * published by the Free Software Foundation.
101045+ */
101046+
101047+#ifndef _HYP_SYSFS_H_
101048+#define _HYP_SYSFS_H_
101049+
101050+#include <linux/kobject.h>
101051+#include <linux/sysfs.h>
101052+
101053+#define HYPERVISOR_ATTR_RO(_name) \
101054+static struct hyp_sysfs_attr _name##_attr = __ATTR_RO(_name)
101055+
101056+#define HYPERVISOR_ATTR_RW(_name) \
101057+static struct hyp_sysfs_attr _name##_attr = \
101058+ __ATTR(_name, 0644, _name##_show, _name##_store)
101059+
101060+extern struct subsystem hypervisor_subsys;
101061+
101062+struct hyp_sysfs_attr {
101063+ struct attribute attr;
101064+ ssize_t (*show)(struct hyp_sysfs_attr *, char *);
101065+ ssize_t (*store)(struct hyp_sysfs_attr *, const char *, size_t);
101066+ void *hyp_attr_data;
101067+};
101068+
101069+#endif /* _HYP_SYSFS_H_ */
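HYPERVISOR_ATTR_RO(name) expects a matching name##_show() function and emits a static struct hyp_sysfs_attr called name_attr, whose embedded attribute can then be registered under hypervisor_subsys. A sketch; the "uptime" attribute and its placeholder value are invented.

#include <linux/kernel.h>
#include <linux/sysfs.h>
#include <xen/hypervisor_sysfs.h>

static ssize_t uptime_show(struct hyp_sysfs_attr *attr, char *buffer)
{
	return sprintf(buffer, "unknown\n");	/* placeholder value */
}

HYPERVISOR_ATTR_RO(uptime);

/* Registration then looks something like:
 *	sysfs_create_file(&hypervisor_subsys.kset.kobj, &uptime_attr.attr);
 */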
101070diff -Nur linux-2.6.16.33-noxen/include/xen/interface/COPYING linux-2.6.16.33/include/xen/interface/COPYING
101071--- linux-2.6.16.33-noxen/include/xen/interface/COPYING 1970-01-01 00:00:00.000000000 +0000
101072+++ linux-2.6.16.33/include/xen/interface/COPYING 2007-01-08 15:00:55.000000000 +0000
101073@@ -0,0 +1,38 @@
101074+XEN NOTICE
101075+==========
101076+
101077+This copyright applies to all files within this subdirectory and its
101078+subdirectories:
101079+ include/public/*.h
101080+ include/public/hvm/*.h
101081+ include/public/io/*.h
101082+
101083+The intention is that these files can be freely copied into the source
101084+tree of an operating system when porting that OS to run on Xen. Doing
101085+so does *not* cause the OS to become subject to the terms of the GPL.
101086+
101087+All other files in the Xen source distribution are covered by version
101088+2 of the GNU General Public License except where explicitly stated
101089+otherwise within individual source files.
101090+
101091+ -- Keir Fraser (on behalf of the Xen team)
101092+
101093+=====================================================================
101094+
101095+Permission is hereby granted, free of charge, to any person obtaining a copy
101096+of this software and associated documentation files (the "Software"), to
101097+deal in the Software without restriction, including without limitation the
101098+rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
101099+sell copies of the Software, and to permit persons to whom the Software is
101100+furnished to do so, subject to the following conditions:
101101+
101102+The above copyright notice and this permission notice shall be included in
101103+all copies or substantial portions of the Software.
101104+
101105+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
101106+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
101107+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
101108+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
101109+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
101110+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
101111+DEALINGS IN THE SOFTWARE.
101112diff -Nur linux-2.6.16.33-noxen/include/xen/interface/acm.h linux-2.6.16.33/include/xen/interface/acm.h
101113--- linux-2.6.16.33-noxen/include/xen/interface/acm.h 1970-01-01 00:00:00.000000000 +0000
101114+++ linux-2.6.16.33/include/xen/interface/acm.h 2007-01-08 15:00:55.000000000 +0000
101115@@ -0,0 +1,205 @@
101116+/*
101117+ * acm.h: Xen access control module interface definitions
101118+ *
101119+ * Permission is hereby granted, free of charge, to any person obtaining a copy
101120+ * of this software and associated documentation files (the "Software"), to
101121+ * deal in the Software without restriction, including without limitation the
101122+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
101123+ * sell copies of the Software, and to permit persons to whom the Software is
101124+ * furnished to do so, subject to the following conditions:
101125+ *
101126+ * The above copyright notice and this permission notice shall be included in
101127+ * all copies or substantial portions of the Software.
101128+ *
101129+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
101130+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
101131+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
101132+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
101133+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
101134+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
101135+ * DEALINGS IN THE SOFTWARE.
101136+ *
101137+ * Reiner Sailer <sailer@watson.ibm.com>
101138+ * Copyright (c) 2005, International Business Machines Corporation.
101139+ */
101140+
101141+#ifndef _XEN_PUBLIC_ACM_H
101142+#define _XEN_PUBLIC_ACM_H
101143+
101144+#include "xen.h"
101145+
101146+/* if ACM_DEBUG is defined, all hooks should
101147+ * print a short trace message (comment it out
101148+ * when not in testing mode)
101149+ */
101150+/* #define ACM_DEBUG */
101151+
101152+#ifdef ACM_DEBUG
101153+# define printkd(fmt, args...) printk(fmt,## args)
101154+#else
101155+# define printkd(fmt, args...)
101156+#endif
101157+
101158+/* default ssid reference value if not supplied */
101159+#define ACM_DEFAULT_SSID 0x0
101160+#define ACM_DEFAULT_LOCAL_SSID 0x0
101161+
101162+/* Internal ACM ERROR types */
101163+#define ACM_OK 0
101164+#define ACM_UNDEF -1
101165+#define ACM_INIT_SSID_ERROR -2
101166+#define ACM_INIT_SOID_ERROR -3
101167+#define ACM_ERROR -4
101168+
101169+/* External ACCESS DECISIONS */
101170+#define ACM_ACCESS_PERMITTED 0
101171+#define ACM_ACCESS_DENIED -111
101172+#define ACM_NULL_POINTER_ERROR -200
101173+
101174+/* primary policy in lower 4 bits */
101175+#define ACM_NULL_POLICY 0
101176+#define ACM_CHINESE_WALL_POLICY 1
101177+#define ACM_SIMPLE_TYPE_ENFORCEMENT_POLICY 2
101178+#define ACM_POLICY_UNDEFINED 15
101179+
101180+/* combinations have secondary policy component in higher 4bit */
101181+#define ACM_CHINESE_WALL_AND_SIMPLE_TYPE_ENFORCEMENT_POLICY \
101182+ ((ACM_SIMPLE_TYPE_ENFORCEMENT_POLICY << 4) | ACM_CHINESE_WALL_POLICY)
101183+
101184+/* policy: */
101185+#define ACM_POLICY_NAME(X) \
101186+ ((X) == (ACM_NULL_POLICY)) ? "NULL" : \
101187+ ((X) == (ACM_CHINESE_WALL_POLICY)) ? "CHINESE WALL" : \
101188+ ((X) == (ACM_SIMPLE_TYPE_ENFORCEMENT_POLICY)) ? "SIMPLE TYPE ENFORCEMENT" : \
101189+ ((X) == (ACM_CHINESE_WALL_AND_SIMPLE_TYPE_ENFORCEMENT_POLICY)) ? "CHINESE WALL AND SIMPLE TYPE ENFORCEMENT" : \
101190+ "UNDEFINED"
101191+
101192+/* the following policy versions must be increased
101193+ * whenever the interpretation of the related
101194+ * policy's data structure changes
101195+ */
101196+#define ACM_POLICY_VERSION 2
101197+#define ACM_CHWALL_VERSION 1
101198+#define ACM_STE_VERSION 1
101199+
101200+/* defines a ssid reference used by xen */
101201+typedef uint32_t ssidref_t;
101202+
101203+/* hooks that are known to domains */
101204+#define ACMHOOK_none 0
101205+#define ACMHOOK_sharing 1
101206+
101207+/* -------security policy relevant type definitions-------- */
101208+
101209+/* type identifier; compares to "equal" or "not equal" */
101210+typedef uint16_t domaintype_t;
101211+
101212+/* CHINESE WALL POLICY DATA STRUCTURES
101213+ *
101214+ * current accumulated conflict type set:
101215+ * When a domain is started and has a type that is in
101216+ * a conflict set, the conflicting types are incremented in
101217+ * the aggregate set. When a domain is destroyed, the
101218+ * conflicting types to its type are decremented.
101219+ * If a domain has multiple types, this procedure works over
101220+ * all those types.
101221+ *
101222+ * conflict_aggregate_set[i] holds the number of
101223+ * running domains that have a conflict with type i.
101224+ *
101225+ * running_types[i] holds the number of running domains
101226+ * that include type i in their ssidref-referenced type set
101227+ *
101228+ * conflict_sets[i][j] is "0" if type j has no conflict
101229+ * with type i and is "1" otherwise.
101230+ */
101231+/* high-16 = version, low-16 = check magic */
101232+#define ACM_MAGIC 0x0001debc
101233+
101234+/* each offset in bytes from start of the struct they
101235+ * are part of */
101236+
101237+/* each buffer consists of all policy information for
101238+ * the respective policy given in the policy code
101239+ *
101240+ * acm_policy_buffer, acm_chwall_policy_buffer,
101241+ * and acm_ste_policy_buffer need to stay 32-bit aligned
101242+ * because we create binary policies also with external
101243+ * tools that assume packed representations (e.g. the java tool)
101244+ */
101245+struct acm_policy_buffer {
101246+ uint32_t policy_version; /* ACM_POLICY_VERSION */
101247+ uint32_t magic;
101248+ uint32_t len;
101249+ uint32_t policy_reference_offset;
101250+ uint32_t primary_policy_code;
101251+ uint32_t primary_buffer_offset;
101252+ uint32_t secondary_policy_code;
101253+ uint32_t secondary_buffer_offset;
101254+};
101255+
101256+struct acm_policy_reference_buffer {
101257+ uint32_t len;
101258+};
101259+
101260+struct acm_chwall_policy_buffer {
101261+ uint32_t policy_version; /* ACM_CHWALL_VERSION */
101262+ uint32_t policy_code;
101263+ uint32_t chwall_max_types;
101264+ uint32_t chwall_max_ssidrefs;
101265+ uint32_t chwall_max_conflictsets;
101266+ uint32_t chwall_ssid_offset;
101267+ uint32_t chwall_conflict_sets_offset;
101268+ uint32_t chwall_running_types_offset;
101269+ uint32_t chwall_conflict_aggregate_offset;
101270+};
101271+
101272+struct acm_ste_policy_buffer {
101273+ uint32_t policy_version; /* ACM_STE_VERSION */
101274+ uint32_t policy_code;
101275+ uint32_t ste_max_types;
101276+ uint32_t ste_max_ssidrefs;
101277+ uint32_t ste_ssid_offset;
101278+};
101279+
101280+struct acm_stats_buffer {
101281+ uint32_t magic;
101282+ uint32_t len;
101283+ uint32_t primary_policy_code;
101284+ uint32_t primary_stats_offset;
101285+ uint32_t secondary_policy_code;
101286+ uint32_t secondary_stats_offset;
101287+};
101288+
101289+struct acm_ste_stats_buffer {
101290+ uint32_t ec_eval_count;
101291+ uint32_t gt_eval_count;
101292+ uint32_t ec_denied_count;
101293+ uint32_t gt_denied_count;
101294+ uint32_t ec_cachehit_count;
101295+ uint32_t gt_cachehit_count;
101296+};
101297+
101298+struct acm_ssid_buffer {
101299+ uint32_t len;
101300+ ssidref_t ssidref;
101301+ uint32_t policy_reference_offset;
101302+ uint32_t primary_policy_code;
101303+ uint32_t primary_max_types;
101304+ uint32_t primary_types_offset;
101305+ uint32_t secondary_policy_code;
101306+ uint32_t secondary_max_types;
101307+ uint32_t secondary_types_offset;
101308+};
101309+
101310+#endif
101311+
101312+/*
101313+ * Local variables:
101314+ * mode: C
101315+ * c-set-style: "BSD"
101316+ * c-basic-offset: 4
101317+ * tab-width: 4
101318+ * indent-tabs-mode: nil
101319+ * End:
101320+ */
101321diff -Nur linux-2.6.16.33-noxen/include/xen/interface/acm_ops.h linux-2.6.16.33/include/xen/interface/acm_ops.h
101322--- linux-2.6.16.33-noxen/include/xen/interface/acm_ops.h 1970-01-01 00:00:00.000000000 +0000
101323+++ linux-2.6.16.33/include/xen/interface/acm_ops.h 2007-01-08 15:00:55.000000000 +0000
101324@@ -0,0 +1,120 @@
101325+/*
101326+ * acm_ops.h: Xen access control module hypervisor commands
101327+ *
101328+ * Permission is hereby granted, free of charge, to any person obtaining a copy
101329+ * of this software and associated documentation files (the "Software"), to
101330+ * deal in the Software without restriction, including without limitation the
101331+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
101332+ * sell copies of the Software, and to permit persons to whom the Software is
101333+ * furnished to do so, subject to the following conditions:
101334+ *
101335+ * The above copyright notice and this permission notice shall be included in
101336+ * all copies or substantial portions of the Software.
101337+ *
101338+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
101339+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
101340+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
101341+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
101342+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
101343+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
101344+ * DEALINGS IN THE SOFTWARE.
101345+ *
101346+ * Reiner Sailer <sailer@watson.ibm.com>
101347+ * Copyright (c) 2005,2006 International Business Machines Corporation.
101348+ */
101349+
101350+#ifndef __XEN_PUBLIC_ACM_OPS_H__
101351+#define __XEN_PUBLIC_ACM_OPS_H__
101352+
101353+#include "xen.h"
101354+#include "acm.h"
101355+
101356+/*
101357+ * Increment the interface version whenever you modify this file! This
101358+ * ensures that old versions of acm tools will stop working in a
101359+ * well-defined way (rather than crashing the machine, for instance).
101360+ */
101361+#define ACM_INTERFACE_VERSION 0xAAAA0008
101362+
101363+/************************************************************************/
101364+
101365+/*
101366+ * Prototype for this hypercall is:
101367+ * int acm_op(int cmd, void *args)
101368+ * @cmd == ACMOP_??? (access control module operation).
101369+ * @args == Operation-specific extra arguments (NULL if none).
101370+ */
101371+
101372+
101373+#define ACMOP_setpolicy 1
101374+struct acm_setpolicy {
101375+ /* IN */
101376+ uint32_t interface_version;
101377+ XEN_GUEST_HANDLE(void) pushcache;
101378+ uint32_t pushcache_size;
101379+};
101380+
101381+
101382+#define ACMOP_getpolicy 2
101383+struct acm_getpolicy {
101384+ /* IN */
101385+ uint32_t interface_version;
101386+ XEN_GUEST_HANDLE(void) pullcache;
101387+ uint32_t pullcache_size;
101388+};
101389+
101390+
101391+#define ACMOP_dumpstats 3
101392+struct acm_dumpstats {
101393+ /* IN */
101394+ uint32_t interface_version;
101395+ XEN_GUEST_HANDLE(void) pullcache;
101396+ uint32_t pullcache_size;
101397+};
101398+
101399+
101400+#define ACMOP_getssid 4
101401+#define ACM_GETBY_ssidref 1
101402+#define ACM_GETBY_domainid 2
101403+struct acm_getssid {
101404+ /* IN */
101405+ uint32_t interface_version;
101406+ uint32_t get_ssid_by; /* ACM_GETBY_* */
101407+ union {
101408+ domaintype_t domainid;
101409+ ssidref_t ssidref;
101410+ } id;
101411+ XEN_GUEST_HANDLE(void) ssidbuf;
101412+ uint32_t ssidbuf_size;
101413+};
101414+
101415+#define ACMOP_getdecision 5
101416+struct acm_getdecision {
101417+ /* IN */
101418+ uint32_t interface_version;
101419+ uint32_t get_decision_by1; /* ACM_GETBY_* */
101420+ uint32_t get_decision_by2; /* ACM_GETBY_* */
101421+ union {
101422+ domaintype_t domainid;
101423+ ssidref_t ssidref;
101424+ } id1;
101425+ union {
101426+ domaintype_t domainid;
101427+ ssidref_t ssidref;
101428+ } id2;
101429+ uint32_t hook;
101430+ /* OUT */
101431+ uint32_t acm_decision;
101432+};
101433+
101434+#endif /* __XEN_PUBLIC_ACM_OPS_H__ */
101435+
101436+/*
101437+ * Local variables:
101438+ * mode: C
101439+ * c-set-style: "BSD"
101440+ * c-basic-offset: 4
101441+ * tab-width: 4
101442+ * indent-tabs-mode: nil
101443+ * End:
101444+ */
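
To see how the IN fields of struct acm_getssid fit together, here is a minimal sketch of querying a domain's ssid via ACMOP_getssid. Only the prototype documented above (int acm_op(int cmd, void *args)) is taken from the header; the extern wrapper, buffer size and function name are assumptions for illustration, and the acm headers above are assumed to be included:

    #include <string.h>

    extern int acm_op(int cmd, void *args);   /* assumed wrapper, per the comment above */

    static int query_domain_ssid(domaintype_t dom, void *buf, uint32_t buf_size)
    {
        struct acm_getssid op;

        memset(&op, 0, sizeof(op));
        op.interface_version = ACM_INTERFACE_VERSION; /* checked by the hypervisor */
        op.get_ssid_by       = ACM_GETBY_domainid;    /* look the ssid up by domain id */
        op.id.domainid       = dom;
        set_xen_guest_handle(op.ssidbuf, buf);        /* where the ssid data is written */
        op.ssidbuf_size      = buf_size;

        return acm_op(ACMOP_getssid, &op);            /* status from the hypervisor */
    }
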
101445diff -Nur linux-2.6.16.33-noxen/include/xen/interface/arch-ia64.h linux-2.6.16.33/include/xen/interface/arch-ia64.h
101446--- linux-2.6.16.33-noxen/include/xen/interface/arch-ia64.h 1970-01-01 00:00:00.000000000 +0000
101447+++ linux-2.6.16.33/include/xen/interface/arch-ia64.h 2007-01-08 15:00:55.000000000 +0000
101448@@ -0,0 +1,500 @@
101449+/******************************************************************************
101450+ * arch-ia64/hypervisor-if.h
101451+ *
101452+ * Guest OS interface to IA64 Xen.
101453+ *
101454+ * Permission is hereby granted, free of charge, to any person obtaining a copy
101455+ * of this software and associated documentation files (the "Software"), to
101456+ * deal in the Software without restriction, including without limitation the
101457+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
101458+ * sell copies of the Software, and to permit persons to whom the Software is
101459+ * furnished to do so, subject to the following conditions:
101460+ *
101461+ * The above copyright notice and this permission notice shall be included in
101462+ * all copies or substantial portions of the Software.
101463+ *
101464+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
101465+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
101466+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
101467+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
101468+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
101469+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
101470+ * DEALINGS IN THE SOFTWARE.
101471+ *
101472+ */
101473+
101474+#ifndef __HYPERVISOR_IF_IA64_H__
101475+#define __HYPERVISOR_IF_IA64_H__
101476+
101477+/* Structural guest handles introduced in 0x00030201. */
101478+#if __XEN_INTERFACE_VERSION__ >= 0x00030201
101479+#define __DEFINE_XEN_GUEST_HANDLE(name, type) \
101480+ typedef struct { type *p; } __guest_handle_ ## name
101481+#else
101482+#define __DEFINE_XEN_GUEST_HANDLE(name, type) \
101483+ typedef type * __guest_handle_ ## name
101484+#endif
101485+
101486+#define DEFINE_XEN_GUEST_HANDLE(name) __DEFINE_XEN_GUEST_HANDLE(name, name)
101487+#define XEN_GUEST_HANDLE(name) __guest_handle_ ## name
101488+#define set_xen_guest_handle(hnd, val) do { (hnd).p = val; } while (0)
101489+#ifdef __XEN_TOOLS__
101490+#define get_xen_guest_handle(val, hnd) do { val = (hnd).p; } while (0)
101491+#endif
101492+
101493+#ifndef __ASSEMBLY__
101494+/* Guest handles for primitive C types. */
101495+__DEFINE_XEN_GUEST_HANDLE(uchar, unsigned char);
101496+__DEFINE_XEN_GUEST_HANDLE(uint, unsigned int);
101497+__DEFINE_XEN_GUEST_HANDLE(ulong, unsigned long);
101498+__DEFINE_XEN_GUEST_HANDLE(u64, unsigned long);
101499+DEFINE_XEN_GUEST_HANDLE(char);
101500+DEFINE_XEN_GUEST_HANDLE(int);
101501+DEFINE_XEN_GUEST_HANDLE(long);
101502+DEFINE_XEN_GUEST_HANDLE(void);
101503+
101504+typedef unsigned long xen_pfn_t;
101505+DEFINE_XEN_GUEST_HANDLE(xen_pfn_t);
101506+#endif
101507+
101508+/* Arch specific VIRQs definition */
101509+#define VIRQ_ITC VIRQ_ARCH_0 /* V. Virtual itc timer */
101510+#define VIRQ_MCA_CMC VIRQ_ARCH_1 /* MCA cmc interrupt */
101511+#define VIRQ_MCA_CPE VIRQ_ARCH_2 /* MCA cpe interrupt */
101512+
101513+/* Maximum number of virtual CPUs in multi-processor guests. */
101514+/* WARNING: before changing this, check that shared_info fits on a page */
101515+#define MAX_VIRT_CPUS 64
101516+
101517+#ifndef __ASSEMBLY__
101518+
101519+typedef unsigned long xen_ulong_t;
101520+
101521+#define INVALID_MFN (~0UL)
101522+
101523+#define MEM_G (1UL << 30)
101524+#define MEM_M (1UL << 20)
101525+
101526+#define MMIO_START (3 * MEM_G)
101527+#define MMIO_SIZE (512 * MEM_M)
101528+
101529+#define VGA_IO_START 0xA0000UL
101530+#define VGA_IO_SIZE 0x20000
101531+
101532+#define LEGACY_IO_START (MMIO_START + MMIO_SIZE)
101533+#define LEGACY_IO_SIZE (64*MEM_M)
101534+
101535+#define IO_PAGE_START (LEGACY_IO_START + LEGACY_IO_SIZE)
101536+#define IO_PAGE_SIZE PAGE_SIZE
101537+
101538+#define STORE_PAGE_START (IO_PAGE_START + IO_PAGE_SIZE)
101539+#define STORE_PAGE_SIZE PAGE_SIZE
101540+
101541+#define BUFFER_IO_PAGE_START (STORE_PAGE_START+PAGE_SIZE)
101542+#define BUFFER_IO_PAGE_SIZE PAGE_SIZE
101543+
101544+#define IO_SAPIC_START 0xfec00000UL
101545+#define IO_SAPIC_SIZE 0x100000
101546+
101547+#define PIB_START 0xfee00000UL
101548+#define PIB_SIZE 0x200000
101549+
101550+#define GFW_START (4*MEM_G -16*MEM_M)
101551+#define GFW_SIZE (16*MEM_M)
101552+
101553+struct pt_fpreg {
101554+ union {
101555+ unsigned long bits[2];
101556+ long double __dummy; /* force 16-byte alignment */
101557+ } u;
101558+};
101559+
101560+struct cpu_user_regs {
101561+ /* The following registers are saved by SAVE_MIN: */
101562+ unsigned long b6; /* scratch */
101563+ unsigned long b7; /* scratch */
101564+
101565+ unsigned long ar_csd; /* used by cmp8xchg16 (scratch) */
101566+ unsigned long ar_ssd; /* reserved for future use (scratch) */
101567+
101568+ unsigned long r8; /* scratch (return value register 0) */
101569+ unsigned long r9; /* scratch (return value register 1) */
101570+ unsigned long r10; /* scratch (return value register 2) */
101571+ unsigned long r11; /* scratch (return value register 3) */
101572+
101573+ unsigned long cr_ipsr; /* interrupted task's psr */
101574+ unsigned long cr_iip; /* interrupted task's instruction pointer */
101575+ unsigned long cr_ifs; /* interrupted task's function state */
101576+
101577+ unsigned long ar_unat; /* interrupted task's NaT register (preserved) */
101578+ unsigned long ar_pfs; /* prev function state */
101579+ unsigned long ar_rsc; /* RSE configuration */
101580+ /* The following two are valid only if cr_ipsr.cpl > 0: */
101581+ unsigned long ar_rnat; /* RSE NaT */
101582+ unsigned long ar_bspstore; /* RSE bspstore */
101583+
101584+ unsigned long pr; /* 64 predicate registers (1 bit each) */
101585+ unsigned long b0; /* return pointer (bp) */
101586+ unsigned long loadrs; /* size of dirty partition << 16 */
101587+
101588+ unsigned long r1; /* the gp pointer */
101589+ unsigned long r12; /* interrupted task's memory stack pointer */
101590+ unsigned long r13; /* thread pointer */
101591+
101592+ unsigned long ar_fpsr; /* floating point status (preserved) */
101593+ unsigned long r15; /* scratch */
101594+
101595+ /* The remaining registers are NOT saved for system calls. */
101596+
101597+ unsigned long r14; /* scratch */
101598+ unsigned long r2; /* scratch */
101599+ unsigned long r3; /* scratch */
101600+ unsigned long r16; /* scratch */
101601+ unsigned long r17; /* scratch */
101602+ unsigned long r18; /* scratch */
101603+ unsigned long r19; /* scratch */
101604+ unsigned long r20; /* scratch */
101605+ unsigned long r21; /* scratch */
101606+ unsigned long r22; /* scratch */
101607+ unsigned long r23; /* scratch */
101608+ unsigned long r24; /* scratch */
101609+ unsigned long r25; /* scratch */
101610+ unsigned long r26; /* scratch */
101611+ unsigned long r27; /* scratch */
101612+ unsigned long r28; /* scratch */
101613+ unsigned long r29; /* scratch */
101614+ unsigned long r30; /* scratch */
101615+ unsigned long r31; /* scratch */
101616+ unsigned long ar_ccv; /* compare/exchange value (scratch) */
101617+
101618+ /*
101619+ * Floating point registers that the kernel considers scratch:
101620+ */
101621+ struct pt_fpreg f6; /* scratch */
101622+ struct pt_fpreg f7; /* scratch */
101623+ struct pt_fpreg f8; /* scratch */
101624+ struct pt_fpreg f9; /* scratch */
101625+ struct pt_fpreg f10; /* scratch */
101626+ struct pt_fpreg f11; /* scratch */
101627+ unsigned long r4; /* preserved */
101628+ unsigned long r5; /* preserved */
101629+ unsigned long r6; /* preserved */
101630+ unsigned long r7; /* preserved */
101631+ unsigned long eml_unat; /* used for emulating instruction */
101632+ unsigned long pad0; /* alignment pad */
101633+
101634+};
101635+typedef struct cpu_user_regs cpu_user_regs_t;
101636+
101637+union vac {
101638+ unsigned long value;
101639+ struct {
101640+ int a_int:1;
101641+ int a_from_int_cr:1;
101642+ int a_to_int_cr:1;
101643+ int a_from_psr:1;
101644+ int a_from_cpuid:1;
101645+ int a_cover:1;
101646+ int a_bsw:1;
101647+ long reserved:57;
101648+ };
101649+};
101650+typedef union vac vac_t;
101651+
101652+union vdc {
101653+ unsigned long value;
101654+ struct {
101655+ int d_vmsw:1;
101656+ int d_extint:1;
101657+ int d_ibr_dbr:1;
101658+ int d_pmc:1;
101659+ int d_to_pmd:1;
101660+ int d_itm:1;
101661+ long reserved:58;
101662+ };
101663+};
101664+typedef union vdc vdc_t;
101665+
101666+struct mapped_regs {
101667+ union vac vac;
101668+ union vdc vdc;
101669+ unsigned long virt_env_vaddr;
101670+ unsigned long reserved1[29];
101671+ unsigned long vhpi;
101672+ unsigned long reserved2[95];
101673+ union {
101674+ unsigned long vgr[16];
101675+ unsigned long bank1_regs[16]; // bank1 regs (r16-r31) when bank0 active
101676+ };
101677+ union {
101678+ unsigned long vbgr[16];
101679+ unsigned long bank0_regs[16]; // bank0 regs (r16-r31) when bank1 active
101680+ };
101681+ unsigned long vnat;
101682+ unsigned long vbnat;
101683+ unsigned long vcpuid[5];
101684+ unsigned long reserved3[11];
101685+ unsigned long vpsr;
101686+ unsigned long vpr;
101687+ unsigned long reserved4[76];
101688+ union {
101689+ unsigned long vcr[128];
101690+ struct {
101691+ unsigned long dcr; // CR0
101692+ unsigned long itm;
101693+ unsigned long iva;
101694+ unsigned long rsv1[5];
101695+ unsigned long pta; // CR8
101696+ unsigned long rsv2[7];
101697+ unsigned long ipsr; // CR16
101698+ unsigned long isr;
101699+ unsigned long rsv3;
101700+ unsigned long iip;
101701+ unsigned long ifa;
101702+ unsigned long itir;
101703+ unsigned long iipa;
101704+ unsigned long ifs;
101705+ unsigned long iim; // CR24
101706+ unsigned long iha;
101707+ unsigned long rsv4[38];
101708+ unsigned long lid; // CR64
101709+ unsigned long ivr;
101710+ unsigned long tpr;
101711+ unsigned long eoi;
101712+ unsigned long irr[4];
101713+ unsigned long itv; // CR72
101714+ unsigned long pmv;
101715+ unsigned long cmcv;
101716+ unsigned long rsv5[5];
101717+ unsigned long lrr0; // CR80
101718+ unsigned long lrr1;
101719+ unsigned long rsv6[46];
101720+ };
101721+ };
101722+ union {
101723+ unsigned long reserved5[128];
101724+ struct {
101725+ unsigned long precover_ifs;
101726+ unsigned long unat; // not sure if this is needed until NaT arch is done
101727+ int interrupt_collection_enabled; // virtual psr.ic
101728+ /* The virtual interrupt deliverable flag is now evtchn_upcall_mask in
101729+ * the shared info area; interrupt_mask_addr is the address
101730+ * of evtchn_upcall_mask for the current vcpu.
101731+ */
101732+ unsigned char *interrupt_mask_addr;
101733+ int pending_interruption;
101734+ int incomplete_regframe; // see SDM vol2 6.8
101735+ unsigned char vpsr_pp;
101736+ unsigned char reserved5_2[7];
101737+ unsigned long reserved5_1[3];
101738+ int metaphysical_mode; // 1 = use metaphys mapping, 0 = use virtual
101739+ int banknum; // 0 or 1, which virtual register bank is active
101740+ unsigned long rrs[8]; // region registers
101741+ unsigned long krs[8]; // kernel registers
101742+ unsigned long pkrs[8]; // protection key registers
101743+ unsigned long tmp[8]; // temp registers (e.g. for hyperprivops)
101744+ };
101745+ };
101746+};
101747+typedef struct mapped_regs mapped_regs_t;
101748+
101749+struct vpd {
101750+ struct mapped_regs vpd_low;
101751+ unsigned long reserved6[3456];
101752+ unsigned long vmm_avail[128];
101753+ unsigned long reserved7[4096];
101754+};
101755+typedef struct vpd vpd_t;
101756+
101757+struct arch_vcpu_info {
101758+};
101759+typedef struct arch_vcpu_info arch_vcpu_info_t;
101760+
101761+struct arch_shared_info {
101762+ /* PFN of the start_info page. */
101763+ unsigned long start_info_pfn;
101764+
101765+ /* Interrupt vector for event channel. */
101766+ int evtchn_vector;
101767+
101768+ uint64_t pad[32];
101769+};
101770+typedef struct arch_shared_info arch_shared_info_t;
101771+
101772+typedef unsigned long xen_callback_t;
101773+
101774+struct ia64_tr_entry {
101775+ unsigned long pte;
101776+ unsigned long itir;
101777+ unsigned long vadr;
101778+ unsigned long rid;
101779+};
101780+
101781+struct vcpu_extra_regs {
101782+ struct ia64_tr_entry itrs[8];
101783+ struct ia64_tr_entry dtrs[8];
101784+ unsigned long iva;
101785+ unsigned long dcr;
101786+ unsigned long event_callback_ip;
101787+};
101788+
101789+struct vcpu_guest_context {
101790+#define VGCF_EXTRA_REGS (1<<1) /* Get/Set extra regs. */
101791+ unsigned long flags; /* VGCF_* flags */
101792+
101793+ struct cpu_user_regs user_regs;
101794+ struct vcpu_extra_regs extra_regs;
101795+ unsigned long privregs_pfn;
101796+};
101797+typedef struct vcpu_guest_context vcpu_guest_context_t;
101798+DEFINE_XEN_GUEST_HANDLE(vcpu_guest_context_t);
101799+
101800+/* dom0 vp op */
101801+#define __HYPERVISOR_ia64_dom0vp_op __HYPERVISOR_arch_0
101802+/* Map I/O space in machine address space to dom0 physical address space.
101803+ Currently the assigned physical address equals the machine address. */
101804+#define IA64_DOM0VP_ioremap 0
101805+
101806+/* Convert a pseudo physical page frame number to the corresponding
101807+ machine page frame number. If no page is assigned, INVALID_MFN or
101808+ GPFN_INV_MASK is returned depending on domain's non-vti/vti mode. */
101809+#define IA64_DOM0VP_phystomach 1
101810+
101811+/* Convert a machine page frame number to the corresponding pseudo physical
101812+ page frame number of the caller domain. */
101813+#define IA64_DOM0VP_machtophys 3
101814+
101815+/* Reserved for future use. */
101816+#define IA64_DOM0VP_iounmap 4
101817+
101818+/* Unmap and free pages contained in the specified pseudo physical region. */
101819+#define IA64_DOM0VP_zap_physmap 5
101820+
101821+/* Assign machine page frame to dom0's pseudo physical address space. */
101822+#define IA64_DOM0VP_add_physmap 6
101823+
101824+/* expose the p2m table into domain */
101825+#define IA64_DOM0VP_expose_p2m 7
101826+
101827+/* xen perfmon */
101828+#define IA64_DOM0VP_perfmon 8
101829+
101830+/* gmfn version of IA64_DOM0VP_add_physmap */
101831+#define IA64_DOM0VP_add_physmap_with_gmfn 9
101832+
101833+// flags for page assignment to pseudo physical address space
101834+#define _ASSIGN_readonly 0
101835+#define ASSIGN_readonly (1UL << _ASSIGN_readonly)
101836+#define ASSIGN_writable (0UL << _ASSIGN_readonly) // dummy flag
101837+/* Internal only: memory attribute must be WC/UC/UCE. */
101838+#define _ASSIGN_nocache 1
101839+#define ASSIGN_nocache (1UL << _ASSIGN_nocache)
101840+// tlb tracking
101841+#define _ASSIGN_tlb_track 2
101842+#define ASSIGN_tlb_track (1UL << _ASSIGN_tlb_track)
101843+/* Internal only: associated with PGC_allocated bit */
101844+#define _ASSIGN_pgc_allocated 3
101845+#define ASSIGN_pgc_allocated (1UL << _ASSIGN_pgc_allocated)
101846+
101847+/* This structure has the same layout as struct ia64_boot_param, defined in
101848+ <asm/system.h>. It is redefined here for ease of use. */
101849+struct xen_ia64_boot_param {
101850+ unsigned long command_line; /* physical address of cmd line args */
101851+ unsigned long efi_systab; /* physical address of EFI system table */
101852+ unsigned long efi_memmap; /* physical address of EFI memory map */
101853+ unsigned long efi_memmap_size; /* size of EFI memory map */
101854+ unsigned long efi_memdesc_size; /* size of an EFI memory map descriptor */
101855+ unsigned int efi_memdesc_version; /* memory descriptor version */
101856+ struct {
101857+ unsigned short num_cols; /* number of columns on console. */
101858+ unsigned short num_rows; /* number of rows on console. */
101859+ unsigned short orig_x; /* cursor's x position */
101860+ unsigned short orig_y; /* cursor's y position */
101861+ } console_info;
101862+ unsigned long fpswa; /* physical address of the fpswa interface */
101863+ unsigned long initrd_start;
101864+ unsigned long initrd_size;
101865+ unsigned long domain_start; /* va where the boot time domain begins */
101866+ unsigned long domain_size; /* how big is the boot domain */
101867+};
101868+
101869+#endif /* !__ASSEMBLY__ */
101870+
101871+/* Size of the shared_info area (this is not related to page size). */
101872+#define XSI_SHIFT 14
101873+#define XSI_SIZE (1 << XSI_SHIFT)
101874+/* Log size of mapped_regs area (64 KB - only 4KB is used). */
101875+#define XMAPPEDREGS_SHIFT 12
101876+#define XMAPPEDREGS_SIZE (1 << XMAPPEDREGS_SHIFT)
101877+/* Offset of XASI (Xen arch shared info) wrt XSI_BASE. */
101878+#define XMAPPEDREGS_OFS XSI_SIZE
101879+
101880+/* Hyperprivops. */
101881+#define HYPERPRIVOP_RFI 0x1
101882+#define HYPERPRIVOP_RSM_DT 0x2
101883+#define HYPERPRIVOP_SSM_DT 0x3
101884+#define HYPERPRIVOP_COVER 0x4
101885+#define HYPERPRIVOP_ITC_D 0x5
101886+#define HYPERPRIVOP_ITC_I 0x6
101887+#define HYPERPRIVOP_SSM_I 0x7
101888+#define HYPERPRIVOP_GET_IVR 0x8
101889+#define HYPERPRIVOP_GET_TPR 0x9
101890+#define HYPERPRIVOP_SET_TPR 0xa
101891+#define HYPERPRIVOP_EOI 0xb
101892+#define HYPERPRIVOP_SET_ITM 0xc
101893+#define HYPERPRIVOP_THASH 0xd
101894+#define HYPERPRIVOP_PTC_GA 0xe
101895+#define HYPERPRIVOP_ITR_D 0xf
101896+#define HYPERPRIVOP_GET_RR 0x10
101897+#define HYPERPRIVOP_SET_RR 0x11
101898+#define HYPERPRIVOP_SET_KR 0x12
101899+#define HYPERPRIVOP_FC 0x13
101900+#define HYPERPRIVOP_GET_CPUID 0x14
101901+#define HYPERPRIVOP_GET_PMD 0x15
101902+#define HYPERPRIVOP_GET_EFLAG 0x16
101903+#define HYPERPRIVOP_SET_EFLAG 0x17
101904+#define HYPERPRIVOP_RSM_BE 0x18
101905+#define HYPERPRIVOP_GET_PSR 0x19
101906+#define HYPERPRIVOP_MAX 0x19
101907+
101908+/* Fast and light hypercalls. */
101909+#define __HYPERVISOR_ia64_fast_eoi 0x0200
101910+
101911+/* Xencomm macros. */
101912+#define XENCOMM_INLINE_MASK 0xf800000000000000UL
101913+#define XENCOMM_INLINE_FLAG 0x8000000000000000UL
101914+
101915+#define XENCOMM_IS_INLINE(addr) \
101916+ (((unsigned long)(addr) & XENCOMM_INLINE_MASK) == XENCOMM_INLINE_FLAG)
101917+#define XENCOMM_INLINE_ADDR(addr) \
101918+ ((unsigned long)(addr) & ~XENCOMM_INLINE_MASK)
101919+
101920+/* xen perfmon */
101921+#ifdef XEN
101922+#ifndef __ASSEMBLY__
101923+#ifndef _ASM_IA64_PERFMON_H
101924+
101925+#include <xen/list.h> // asm/perfmon.h requires struct list_head
101926+#include <asm/perfmon.h>
101927+// for PFM_xxx and pfarg_features_t, pfarg_context_t, pfarg_reg_t, pfarg_load_t
101928+
101929+#endif /* _ASM_IA64_PERFMON_H */
101930+
101931+DEFINE_XEN_GUEST_HANDLE(pfarg_features_t);
101932+DEFINE_XEN_GUEST_HANDLE(pfarg_context_t);
101933+DEFINE_XEN_GUEST_HANDLE(pfarg_reg_t);
101934+DEFINE_XEN_GUEST_HANDLE(pfarg_load_t);
101935+#endif /* __ASSEMBLY__ */
101936+#endif /* XEN */
101937+
101938+#endif /* __HYPERVISOR_IF_IA64_H__ */
101939+
101940+/*
101941+ * Local variables:
101942+ * mode: C
101943+ * c-set-style: "BSD"
101944+ * c-basic-offset: 4
101945+ * tab-width: 4
101946+ * indent-tabs-mode: nil
101947+ * End:
101948+ */
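
The XENCOMM_* macros near the end of this header tag a hypercall argument address with the top bit when the data area is passed "inline" rather than through a descriptor; the remaining bits give the data address. A self-contained check of that encoding (the macros are repeated verbatim so the snippet builds on its own; it assumes a 64-bit unsigned long, as on ia64, and the sample offset is made up):

    #include <stdio.h>

    #define XENCOMM_INLINE_MASK 0xf800000000000000UL
    #define XENCOMM_INLINE_FLAG 0x8000000000000000UL

    #define XENCOMM_IS_INLINE(addr) \
        (((unsigned long)(addr) & XENCOMM_INLINE_MASK) == XENCOMM_INLINE_FLAG)
    #define XENCOMM_INLINE_ADDR(addr) \
        ((unsigned long)(addr) & ~XENCOMM_INLINE_MASK)

    int main(void)
    {
        /* Hypothetical data offset, tagged as inline. */
        unsigned long desc = XENCOMM_INLINE_FLAG | 0x12345000UL;

        if (XENCOMM_IS_INLINE(desc))
            printf("inline xencomm data at offset 0x%lx\n",
                   XENCOMM_INLINE_ADDR(desc));
        return 0;
    }
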
101949diff -Nur linux-2.6.16.33-noxen/include/xen/interface/arch-powerpc.h linux-2.6.16.33/include/xen/interface/arch-powerpc.h
101950--- linux-2.6.16.33-noxen/include/xen/interface/arch-powerpc.h 1970-01-01 00:00:00.000000000 +0000
101951+++ linux-2.6.16.33/include/xen/interface/arch-powerpc.h 2007-01-08 15:00:55.000000000 +0000
101952@@ -0,0 +1,121 @@
101953+/*
101954+ * Permission is hereby granted, free of charge, to any person obtaining a copy
101955+ * of this software and associated documentation files (the "Software"), to
101956+ * deal in the Software without restriction, including without limitation the
101957+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
101958+ * sell copies of the Software, and to permit persons to whom the Software is
101959+ * furnished to do so, subject to the following conditions:
101960+ *
101961+ * The above copyright notice and this permission notice shall be included in
101962+ * all copies or substantial portions of the Software.
101963+ *
101964+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
101965+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
101966+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
101967+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
101968+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
101969+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
101970+ * DEALINGS IN THE SOFTWARE.
101971+ *
101972+ * Copyright (C) IBM Corp. 2005, 2006
101973+ *
101974+ * Authors: Hollis Blanchard <hollisb@us.ibm.com>
101975+ */
101976+
101977+#ifndef __XEN_PUBLIC_ARCH_PPC_64_H__
101978+#define __XEN_PUBLIC_ARCH_PPC_64_H__
101979+
101980+#define __DEFINE_XEN_GUEST_HANDLE(name, type) \
101981+ typedef struct { \
101982+ int __pad[(sizeof (long long) - sizeof (void *)) / sizeof (int)]; \
101983+ type *p; \
101984+ } __attribute__((__aligned__(8))) __guest_handle_ ## name
101985+
101986+#define DEFINE_XEN_GUEST_HANDLE(name) __DEFINE_XEN_GUEST_HANDLE(name, name)
101987+#define XEN_GUEST_HANDLE(name) __guest_handle_ ## name
101988+#define set_xen_guest_handle(hnd, val) \
101989+ do { \
101990+ if (sizeof ((hnd).__pad)) \
101991+ (hnd).__pad[0] = 0; \
101992+ (hnd).p = val; \
101993+ } while (0)
101994+
101995+#ifdef __XEN_TOOLS__
101996+#define get_xen_guest_handle(val, hnd) do { val = (hnd).p; } while (0)
101997+#endif
101998+
101999+#ifndef __ASSEMBLY__
102000+/* Guest handles for primitive C types. */
102001+__DEFINE_XEN_GUEST_HANDLE(uchar, unsigned char);
102002+__DEFINE_XEN_GUEST_HANDLE(uint, unsigned int);
102003+__DEFINE_XEN_GUEST_HANDLE(ulong, unsigned long);
102004+DEFINE_XEN_GUEST_HANDLE(char);
102005+DEFINE_XEN_GUEST_HANDLE(int);
102006+DEFINE_XEN_GUEST_HANDLE(long);
102007+DEFINE_XEN_GUEST_HANDLE(void);
102008+
102009+typedef unsigned long long xen_pfn_t;
102010+DEFINE_XEN_GUEST_HANDLE(xen_pfn_t);
102011+#endif
102012+
102013+/*
102014+ * Pointers and other address fields inside interface structures are padded to
102015+ * 64 bits. This means that field alignments aren't different between 32- and
102016+ * 64-bit architectures.
102017+ */
102018+/* NB. Multi-level macro ensures __LINE__ is expanded before concatenation. */
102019+#define __MEMORY_PADDING(_X)
102020+#define _MEMORY_PADDING(_X) __MEMORY_PADDING(_X)
102021+#define MEMORY_PADDING _MEMORY_PADDING(__LINE__)
102022+
102023+/* And the trap vector is... */
102024+#define TRAP_INSTR "li 0,-1; sc" /* XXX just "sc"? */
102025+
102026+#ifndef __ASSEMBLY__
102027+
102028+#define XENCOMM_INLINE_FLAG (1UL << 63)
102029+
102030+typedef uint64_t xen_ulong_t;
102031+
102032+/* User-accessible registers: need to be saved/restored for every nested Xen
102033+ * invocation. */
102034+struct cpu_user_regs
102035+{
102036+ uint64_t gprs[32];
102037+ uint64_t lr;
102038+ uint64_t ctr;
102039+ uint64_t srr0;
102040+ uint64_t srr1;
102041+ uint64_t pc;
102042+ uint64_t msr;
102043+ uint64_t fpscr;
102044+ uint64_t xer;
102045+ uint64_t hid4;
102046+ uint32_t cr;
102047+ uint32_t entry_vector;
102048+};
102049+typedef struct cpu_user_regs cpu_user_regs_t;
102050+
102051+typedef uint64_t tsc_timestamp_t; /* RDTSC timestamp */ /* XXX timebase */
102052+
102053+/* ONLY used to communicate with dom0! See also struct exec_domain. */
102054+struct vcpu_guest_context {
102055+ cpu_user_regs_t user_regs; /* User-level CPU registers */
102056+ uint64_t sdr1; /* Pagetable base */
102057+ /* XXX etc */
102058+};
102059+typedef struct vcpu_guest_context vcpu_guest_context_t;
102060+DEFINE_XEN_GUEST_HANDLE(vcpu_guest_context_t);
102061+
102062+struct arch_shared_info {
102063+ uint64_t pad[32];
102064+};
102065+
102066+struct arch_vcpu_info {
102067+};
102068+
102069+/* Support for multi-processor guests. */
102070+#define MAX_VIRT_CPUS 32
102071+#endif
102072+
102073+#endif
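
Note the difference from the other architectures: the PowerPC guest handle above pads the pointer out to a full 64-bit slot so structure layouts agree between 32- and 64-bit callers, and set_xen_guest_handle() clears that pad before storing the pointer. A small sketch of creating such a handle (assuming the macros and the xen_pfn_t typedef from this header are in scope; the function name is illustrative):

    static XEN_GUEST_HANDLE(xen_pfn_t) make_extent_handle(xen_pfn_t *frame_list)
    {
        XEN_GUEST_HANDLE(xen_pfn_t) extents;

        /* On 32-bit callers this zeroes the pad word first, so the 64-bit
         * slot seen by the hypervisor carries no stale bits; on 64-bit the
         * pad has zero size and the test inside the macro compiles away. */
        set_xen_guest_handle(extents, frame_list);
        return extents;
    }
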
102074diff -Nur linux-2.6.16.33-noxen/include/xen/interface/arch-x86/xen-x86_32.h linux-2.6.16.33/include/xen/interface/arch-x86/xen-x86_32.h
102075--- linux-2.6.16.33-noxen/include/xen/interface/arch-x86/xen-x86_32.h 1970-01-01 00:00:00.000000000 +0000
102076+++ linux-2.6.16.33/include/xen/interface/arch-x86/xen-x86_32.h 2007-01-08 15:00:55.000000000 +0000
102077@@ -0,0 +1,151 @@
102078+/******************************************************************************
102079+ * xen-x86_32.h
102080+ *
102081+ * Guest OS interface to x86 32-bit Xen.
102082+ *
102083+ * Permission is hereby granted, free of charge, to any person obtaining a copy
102084+ * of this software and associated documentation files (the "Software"), to
102085+ * deal in the Software without restriction, including without limitation the
102086+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
102087+ * sell copies of the Software, and to permit persons to whom the Software is
102088+ * furnished to do so, subject to the following conditions:
102089+ *
102090+ * The above copyright notice and this permission notice shall be included in
102091+ * all copies or substantial portions of the Software.
102092+ *
102093+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
102094+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
102095+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
102096+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
102097+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
102098+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
102099+ * DEALINGS IN THE SOFTWARE.
102100+ *
102101+ * Copyright (c) 2004-2006, K A Fraser
102102+ */
102103+
102104+#ifndef __XEN_PUBLIC_ARCH_X86_XEN_X86_32_H__
102105+#define __XEN_PUBLIC_ARCH_X86_XEN_X86_32_H__
102106+
102107+/*
102108+ * Hypercall interface:
102109+ * Input: %ebx, %ecx, %edx, %esi, %edi (arguments 1-5)
102110+ * Output: %eax
102111+ * Access is via hypercall page (set up by guest loader or via a Xen MSR):
102112+ * call hypercall_page + hypercall-number * 32
102113+ * Clobbered: Argument registers (e.g., 2-arg hypercall clobbers %ebx,%ecx)
102114+ */
102115+
102116+#if __XEN_INTERFACE_VERSION__ < 0x00030203
102117+/*
102118+ * Legacy hypercall interface:
102119+ * As above, except the entry sequence to the hypervisor is:
102120+ * mov $hypercall-number*32,%eax ; int $0x82
102121+ */
102122+#define TRAP_INSTR "int $0x82"
102123+#endif
102124+
102125+/*
102126+ * These flat segments are in the Xen-private section of every GDT. Since these
102127+ * are also present in the initial GDT, many OSes will be able to avoid
102128+ * installing their own GDT.
102129+ */
102130+#define FLAT_RING1_CS 0xe019 /* GDT index 259 */
102131+#define FLAT_RING1_DS 0xe021 /* GDT index 260 */
102132+#define FLAT_RING1_SS 0xe021 /* GDT index 260 */
102133+#define FLAT_RING3_CS 0xe02b /* GDT index 261 */
102134+#define FLAT_RING3_DS 0xe033 /* GDT index 262 */
102135+#define FLAT_RING3_SS 0xe033 /* GDT index 262 */
102136+
102137+#define FLAT_KERNEL_CS FLAT_RING1_CS
102138+#define FLAT_KERNEL_DS FLAT_RING1_DS
102139+#define FLAT_KERNEL_SS FLAT_RING1_SS
102140+#define FLAT_USER_CS FLAT_RING3_CS
102141+#define FLAT_USER_DS FLAT_RING3_DS
102142+#define FLAT_USER_SS FLAT_RING3_SS
102143+
102144+/*
102145+ * Virtual addresses beyond this are not modifiable by guest OSes. The
102146+ * machine->physical mapping table starts at this address, read-only.
102147+ */
102148+#ifdef CONFIG_X86_PAE
102149+#define __HYPERVISOR_VIRT_START 0xF5800000
102150+#define __MACH2PHYS_VIRT_START 0xF5800000
102151+#define __MACH2PHYS_VIRT_END 0xF6800000
102152+#else
102153+#define __HYPERVISOR_VIRT_START 0xFC000000
102154+#define __MACH2PHYS_VIRT_START 0xFC000000
102155+#define __MACH2PHYS_VIRT_END 0xFC400000
102156+#endif
102157+
102158+#ifndef HYPERVISOR_VIRT_START
102159+#define HYPERVISOR_VIRT_START mk_unsigned_long(__HYPERVISOR_VIRT_START)
102160+#endif
102161+
102162+#define MACH2PHYS_VIRT_START mk_unsigned_long(__MACH2PHYS_VIRT_START)
102163+#define MACH2PHYS_VIRT_END mk_unsigned_long(__MACH2PHYS_VIRT_END)
102164+#define MACH2PHYS_NR_ENTRIES ((MACH2PHYS_VIRT_END-MACH2PHYS_VIRT_START)>>2)
102165+#ifndef machine_to_phys_mapping
102166+#define machine_to_phys_mapping ((unsigned long *)MACH2PHYS_VIRT_START)
102167+#endif
102168+
102169+#ifndef __ASSEMBLY__
102170+
102171+struct cpu_user_regs {
102172+ uint32_t ebx;
102173+ uint32_t ecx;
102174+ uint32_t edx;
102175+ uint32_t esi;
102176+ uint32_t edi;
102177+ uint32_t ebp;
102178+ uint32_t eax;
102179+ uint16_t error_code; /* private */
102180+ uint16_t entry_vector; /* private */
102181+ uint32_t eip;
102182+ uint16_t cs;
102183+ uint8_t saved_upcall_mask;
102184+ uint8_t _pad0;
102185+ uint32_t eflags; /* eflags.IF == !saved_upcall_mask */
102186+ uint32_t esp;
102187+ uint16_t ss, _pad1;
102188+ uint16_t es, _pad2;
102189+ uint16_t ds, _pad3;
102190+ uint16_t fs, _pad4;
102191+ uint16_t gs, _pad5;
102192+};
102193+typedef struct cpu_user_regs cpu_user_regs_t;
102194+DEFINE_XEN_GUEST_HANDLE(cpu_user_regs_t);
102195+
102196+/*
102197+ * Page-directory addresses above 4GB do not fit into architectural %cr3.
102198+ * When accessing %cr3, or equivalent field in vcpu_guest_context, guests
102199+ * must use the following accessor macros to pack/unpack valid MFNs.
102200+ */
102201+#define xen_pfn_to_cr3(pfn) (((unsigned)(pfn) << 12) | ((unsigned)(pfn) >> 20))
102202+#define xen_cr3_to_pfn(cr3) (((unsigned)(cr3) >> 12) | ((unsigned)(cr3) << 20))
102203+
102204+struct arch_vcpu_info {
102205+ unsigned long cr2;
102206+ unsigned long pad[5]; /* sizeof(vcpu_info_t) == 64 */
102207+};
102208+typedef struct arch_vcpu_info arch_vcpu_info_t;
102209+
102210+struct xen_callback {
102211+ unsigned long cs;
102212+ unsigned long eip;
102213+};
102214+typedef struct xen_callback xen_callback_t;
102215+
102216+#endif /* !__ASSEMBLY__ */
102217+
102218+#endif /* __XEN_PUBLIC_ARCH_X86_XEN_X86_32_H__ */
102219+
102220+/*
102221+ * Local variables:
102222+ * mode: C
102223+ * c-set-style: "BSD"
102224+ * c-basic-offset: 4
102225+ * tab-width: 4
102226+ * indent-tabs-mode: nil
102227+ * End:
102228+ */
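
The xen_pfn_to_cr3()/xen_cr3_to_pfn() macros above rotate the frame number so that a PAE page directory located above 4GB still fits a 32-bit %cr3-style field: PFN bits 20 and up end up in the low 12 bits of the packed value. A stand-alone round-trip check (the sample PFN is arbitrary, chosen to sit above the 4GB boundary; it assumes a 32-bit unsigned int):

    #include <assert.h>
    #include <stdio.h>

    /* Copied from the header above. */
    #define xen_pfn_to_cr3(pfn) (((unsigned)(pfn) << 12) | ((unsigned)(pfn) >> 20))
    #define xen_cr3_to_pfn(cr3) (((unsigned)(cr3) >> 12) | ((unsigned)(cr3) << 20))

    int main(void)
    {
        unsigned pfn = 0x123456;             /* frame just above 4GB of physical memory */
        unsigned cr3 = xen_pfn_to_cr3(pfn);

        printf("pfn 0x%x -> packed cr3 0x%x -> pfn 0x%x\n",
               pfn, cr3, xen_cr3_to_pfn(cr3));
        assert(xen_cr3_to_pfn(cr3) == pfn);  /* packing round-trips losslessly */
        return 0;
    }
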
102229diff -Nur linux-2.6.16.33-noxen/include/xen/interface/arch-x86/xen-x86_64.h linux-2.6.16.33/include/xen/interface/arch-x86/xen-x86_64.h
102230--- linux-2.6.16.33-noxen/include/xen/interface/arch-x86/xen-x86_64.h 1970-01-01 00:00:00.000000000 +0000
102231+++ linux-2.6.16.33/include/xen/interface/arch-x86/xen-x86_64.h 2007-01-08 15:00:55.000000000 +0000
102232@@ -0,0 +1,208 @@
102233+/******************************************************************************
102234+ * xen-x86_64.h
102235+ *
102236+ * Guest OS interface to x86 64-bit Xen.
102237+ *
102238+ * Permission is hereby granted, free of charge, to any person obtaining a copy
102239+ * of this software and associated documentation files (the "Software"), to
102240+ * deal in the Software without restriction, including without limitation the
102241+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
102242+ * sell copies of the Software, and to permit persons to whom the Software is
102243+ * furnished to do so, subject to the following conditions:
102244+ *
102245+ * The above copyright notice and this permission notice shall be included in
102246+ * all copies or substantial portions of the Software.
102247+ *
102248+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
102249+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
102250+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
102251+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
102252+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
102253+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
102254+ * DEALINGS IN THE SOFTWARE.
102255+ *
102256+ * Copyright (c) 2004-2006, K A Fraser
102257+ */
102258+
102259+#ifndef __XEN_PUBLIC_ARCH_X86_XEN_X86_64_H__
102260+#define __XEN_PUBLIC_ARCH_X86_XEN_X86_64_H__
102261+
102262+/*
102263+ * Hypercall interface:
102264+ * Input: %rdi, %rsi, %rdx, %r10, %r8 (arguments 1-5)
102265+ * Output: %rax
102266+ * Access is via hypercall page (set up by guest loader or via a Xen MSR):
102267+ * call hypercall_page + hypercall-number * 32
102268+ * Clobbered: argument registers (e.g., 2-arg hypercall clobbers %rdi,%rsi)
102269+ */
102270+
102271+#if __XEN_INTERFACE_VERSION__ < 0x00030203
102272+/*
102273+ * Legacy hypercall interface:
102274+ * As above, except the entry sequence to the hypervisor is:
102275+ * mov $hypercall-number*32,%eax ; syscall
102276+ * Clobbered: %rcx, %r11, argument registers (as above)
102277+ */
102278+#define TRAP_INSTR "syscall"
102279+#endif
102280+
102281+/*
102282+ * 64-bit segment selectors
102283+ * These flat segments are in the Xen-private section of every GDT. Since these
102284+ * are also present in the initial GDT, many OSes will be able to avoid
102285+ * installing their own GDT.
102286+ */
102287+
102288+#define FLAT_RING3_CS32 0xe023 /* GDT index 260 */
102289+#define FLAT_RING3_CS64 0xe033 /* GDT index 261 */
102290+#define FLAT_RING3_DS32 0xe02b /* GDT index 262 */
102291+#define FLAT_RING3_DS64 0x0000 /* NULL selector */
102292+#define FLAT_RING3_SS32 0xe02b /* GDT index 262 */
102293+#define FLAT_RING3_SS64 0xe02b /* GDT index 262 */
102294+
102295+#define FLAT_KERNEL_DS64 FLAT_RING3_DS64
102296+#define FLAT_KERNEL_DS32 FLAT_RING3_DS32
102297+#define FLAT_KERNEL_DS FLAT_KERNEL_DS64
102298+#define FLAT_KERNEL_CS64 FLAT_RING3_CS64
102299+#define FLAT_KERNEL_CS32 FLAT_RING3_CS32
102300+#define FLAT_KERNEL_CS FLAT_KERNEL_CS64
102301+#define FLAT_KERNEL_SS64 FLAT_RING3_SS64
102302+#define FLAT_KERNEL_SS32 FLAT_RING3_SS32
102303+#define FLAT_KERNEL_SS FLAT_KERNEL_SS64
102304+
102305+#define FLAT_USER_DS64 FLAT_RING3_DS64
102306+#define FLAT_USER_DS32 FLAT_RING3_DS32
102307+#define FLAT_USER_DS FLAT_USER_DS64
102308+#define FLAT_USER_CS64 FLAT_RING3_CS64
102309+#define FLAT_USER_CS32 FLAT_RING3_CS32
102310+#define FLAT_USER_CS FLAT_USER_CS64
102311+#define FLAT_USER_SS64 FLAT_RING3_SS64
102312+#define FLAT_USER_SS32 FLAT_RING3_SS32
102313+#define FLAT_USER_SS FLAT_USER_SS64
102314+
102315+#define __HYPERVISOR_VIRT_START 0xFFFF800000000000
102316+#define __HYPERVISOR_VIRT_END 0xFFFF880000000000
102317+#define __MACH2PHYS_VIRT_START 0xFFFF800000000000
102318+#define __MACH2PHYS_VIRT_END 0xFFFF804000000000
102319+
102320+#ifndef HYPERVISOR_VIRT_START
102321+#define HYPERVISOR_VIRT_START mk_unsigned_long(__HYPERVISOR_VIRT_START)
102322+#define HYPERVISOR_VIRT_END mk_unsigned_long(__HYPERVISOR_VIRT_END)
102323+#endif
102324+
102325+#define MACH2PHYS_VIRT_START mk_unsigned_long(__MACH2PHYS_VIRT_START)
102326+#define MACH2PHYS_VIRT_END mk_unsigned_long(__MACH2PHYS_VIRT_END)
102327+#define MACH2PHYS_NR_ENTRIES ((MACH2PHYS_VIRT_END-MACH2PHYS_VIRT_START)>>3)
102328+#ifndef machine_to_phys_mapping
102329+#define machine_to_phys_mapping ((unsigned long *)HYPERVISOR_VIRT_START)
102330+#endif
102331+
102332+#ifndef __ASSEMBLY__
102333+
102334+/*
102335+ * int HYPERVISOR_set_segment_base(unsigned int which, unsigned long base)
102336+ * @which == SEGBASE_* ; @base == 64-bit base address
102337+ * Returns 0 on success.
102338+ */
102339+#define SEGBASE_FS 0
102340+#define SEGBASE_GS_USER 1
102341+#define SEGBASE_GS_KERNEL 2
102342+#define SEGBASE_GS_USER_SEL 3 /* Set user %gs specified in base[15:0] */
102343+
102344+/*
102345+ * int HYPERVISOR_iret(void)
102346+ * All arguments are on the kernel stack, in the following format.
102347+ * Never returns if successful. Current kernel context is lost.
102348+ * The saved CS is mapped as follows:
102349+ * RING0 -> RING3 kernel mode.
102350+ * RING1 -> RING3 kernel mode.
102351+ * RING2 -> RING3 kernel mode.
102352+ * RING3 -> RING3 user mode.
102353+ * However RING0 indicates that the guest kernel should return to itself
102354+ * directly with
102355+ * orb $3,1*8(%rsp)
102356+ * iretq
102357+ * If flags contains VGCF_in_syscall:
102358+ * Restore RAX, RIP, RFLAGS, RSP.
102359+ * Discard R11, RCX, CS, SS.
102360+ * Otherwise:
102361+ * Restore RAX, R11, RCX, CS:RIP, RFLAGS, SS:RSP.
102362+ * All other registers are saved on hypercall entry and restored to user.
102363+ */
102364+/* Guest exited in SYSCALL context? Return to guest with SYSRET? */
102365+#define _VGCF_in_syscall 8
102366+#define VGCF_in_syscall (1<<_VGCF_in_syscall)
102367+#define VGCF_IN_SYSCALL VGCF_in_syscall
102368+struct iret_context {
102369+ /* Top of stack (%rsp at point of hypercall). */
102370+ uint64_t rax, r11, rcx, flags, rip, cs, rflags, rsp, ss;
102371+ /* Bottom of iret stack frame. */
102372+};
102373+
102374+#ifdef __GNUC__
102375+/* Anonymous union includes both 32- and 64-bit names (e.g., eax/rax). */
102376+#define __DECL_REG(name) union { uint64_t r ## name, e ## name; }
102377+#else
102378+/* Non-gcc sources must always use the proper 64-bit name (e.g., rax). */
102379+#define __DECL_REG(name) uint64_t r ## name
102380+#endif
102381+
102382+struct cpu_user_regs {
102383+ uint64_t r15;
102384+ uint64_t r14;
102385+ uint64_t r13;
102386+ uint64_t r12;
102387+ __DECL_REG(bp);
102388+ __DECL_REG(bx);
102389+ uint64_t r11;
102390+ uint64_t r10;
102391+ uint64_t r9;
102392+ uint64_t r8;
102393+ __DECL_REG(ax);
102394+ __DECL_REG(cx);
102395+ __DECL_REG(dx);
102396+ __DECL_REG(si);
102397+ __DECL_REG(di);
102398+ uint32_t error_code; /* private */
102399+ uint32_t entry_vector; /* private */
102400+ __DECL_REG(ip);
102401+ uint16_t cs, _pad0[1];
102402+ uint8_t saved_upcall_mask;
102403+ uint8_t _pad1[3];
102404+ __DECL_REG(flags); /* rflags.IF == !saved_upcall_mask */
102405+ __DECL_REG(sp);
102406+ uint16_t ss, _pad2[3];
102407+ uint16_t es, _pad3[3];
102408+ uint16_t ds, _pad4[3];
102409+ uint16_t fs, _pad5[3]; /* Non-zero => takes precedence over fs_base. */
102410+ uint16_t gs, _pad6[3]; /* Non-zero => takes precedence over gs_base_usr. */
102411+};
102412+typedef struct cpu_user_regs cpu_user_regs_t;
102413+DEFINE_XEN_GUEST_HANDLE(cpu_user_regs_t);
102414+
102415+#undef __DECL_REG
102416+
102417+#define xen_pfn_to_cr3(pfn) ((unsigned long)(pfn) << 12)
102418+#define xen_cr3_to_pfn(cr3) ((unsigned long)(cr3) >> 12)
102419+
102420+struct arch_vcpu_info {
102421+ unsigned long cr2;
102422+ unsigned long pad; /* sizeof(vcpu_info_t) == 64 */
102423+};
102424+typedef struct arch_vcpu_info arch_vcpu_info_t;
102425+
102426+typedef unsigned long xen_callback_t;
102427+
102428+#endif /* !__ASSEMBLY__ */
102429+
102430+#endif /* __XEN_PUBLIC_ARCH_X86_XEN_X86_64_H__ */
102431+
102432+/*
102433+ * Local variables:
102434+ * mode: C
102435+ * c-set-style: "BSD"
102436+ * c-basic-offset: 4
102437+ * tab-width: 4
102438+ * indent-tabs-mode: nil
102439+ * End:
102440+ */
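
With gcc, __DECL_REG() above turns each general-purpose register slot into an anonymous union, so the legacy 32-bit name and the 64-bit name refer to the same uint64_t field (the macro is #undef'd again once the struct is defined). A tiny sketch, assuming <stdio.h> and this header are included and a gcc-compatible compiler is used:

    static void show_register_alias(struct cpu_user_regs *regs)
    {
        regs->rax = 0x1122334455667788ULL;

        /* eax is not a 32-bit truncation here: it is the same uint64_t
         * field under its legacy name, so both lines print the full value. */
        printf("rax=%#llx\n", (unsigned long long)regs->rax);
        printf("eax=%#llx\n", (unsigned long long)regs->eax);
    }
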
102441diff -Nur linux-2.6.16.33-noxen/include/xen/interface/arch-x86/xen.h linux-2.6.16.33/include/xen/interface/arch-x86/xen.h
102442--- linux-2.6.16.33-noxen/include/xen/interface/arch-x86/xen.h 1970-01-01 00:00:00.000000000 +0000
102443+++ linux-2.6.16.33/include/xen/interface/arch-x86/xen.h 2007-01-08 15:00:55.000000000 +0000
102444@@ -0,0 +1,190 @@
102445+/******************************************************************************
102446+ * arch-x86/xen.h
102447+ *
102448+ * Guest OS interface to x86 Xen.
102449+ *
102450+ * Permission is hereby granted, free of charge, to any person obtaining a copy
102451+ * of this software and associated documentation files (the "Software"), to
102452+ * deal in the Software without restriction, including without limitation the
102453+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
102454+ * sell copies of the Software, and to permit persons to whom the Software is
102455+ * furnished to do so, subject to the following conditions:
102456+ *
102457+ * The above copyright notice and this permission notice shall be included in
102458+ * all copies or substantial portions of the Software.
102459+ *
102460+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
102461+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
102462+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
102463+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
102464+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
102465+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
102466+ * DEALINGS IN THE SOFTWARE.
102467+ *
102468+ * Copyright (c) 2004-2006, K A Fraser
102469+ */
102470+
102471+#ifndef __XEN_PUBLIC_ARCH_X86_XEN_H__
102472+#define __XEN_PUBLIC_ARCH_X86_XEN_H__
102473+
102474+/* Structural guest handles introduced in 0x00030201. */
102475+#if __XEN_INTERFACE_VERSION__ >= 0x00030201
102476+#define __DEFINE_XEN_GUEST_HANDLE(name, type) \
102477+ typedef struct { type *p; } __guest_handle_ ## name
102478+#else
102479+#define __DEFINE_XEN_GUEST_HANDLE(name, type) \
102480+ typedef type * __guest_handle_ ## name
102481+#endif
102482+
102483+#define DEFINE_XEN_GUEST_HANDLE(name) __DEFINE_XEN_GUEST_HANDLE(name, name)
102484+#define XEN_GUEST_HANDLE(name) __guest_handle_ ## name
102485+#define set_xen_guest_handle(hnd, val) do { (hnd).p = val; } while (0)
102486+#ifdef __XEN_TOOLS__
102487+#define get_xen_guest_handle(val, hnd) do { val = (hnd).p; } while (0)
102488+#endif
102489+
102490+#ifndef __ASSEMBLY__
102491+/* Guest handles for primitive C types. */
102492+__DEFINE_XEN_GUEST_HANDLE(uchar, unsigned char);
102493+__DEFINE_XEN_GUEST_HANDLE(uint, unsigned int);
102494+__DEFINE_XEN_GUEST_HANDLE(ulong, unsigned long);
102495+DEFINE_XEN_GUEST_HANDLE(char);
102496+DEFINE_XEN_GUEST_HANDLE(int);
102497+DEFINE_XEN_GUEST_HANDLE(long);
102498+DEFINE_XEN_GUEST_HANDLE(void);
102499+
102500+typedef unsigned long xen_pfn_t;
102501+DEFINE_XEN_GUEST_HANDLE(xen_pfn_t);
102502+#endif
102503+
102504+#if defined(__i386__)
102505+#include "xen-x86_32.h"
102506+#elif defined(__x86_64__)
102507+#include "xen-x86_64.h"
102508+#endif
102509+
102510+/*
102511+ * SEGMENT DESCRIPTOR TABLES
102512+ */
102513+/*
102514+ * A number of GDT entries are reserved by Xen. These are not situated at the
102515+ * start of the GDT because some stupid OSes export hard-coded selector values
102516+ * in their ABI. These hard-coded values are always near the start of the GDT,
102517+ * so Xen places itself out of the way, at the far end of the GDT.
102518+ */
102519+#define FIRST_RESERVED_GDT_PAGE 14
102520+#define FIRST_RESERVED_GDT_BYTE (FIRST_RESERVED_GDT_PAGE * 4096)
102521+#define FIRST_RESERVED_GDT_ENTRY (FIRST_RESERVED_GDT_BYTE / 8)
102522+
102523+/* Maximum number of virtual CPUs in multi-processor guests. */
102524+#define MAX_VIRT_CPUS 32
102525+
102526+#ifndef __ASSEMBLY__
102527+
102528+typedef unsigned long xen_ulong_t;
102529+
102530+/*
102531+ * Send an array of these to HYPERVISOR_set_trap_table().
102532+ * The privilege level specifies which modes may enter a trap via a software
102533+ * interrupt. On x86/64, since rings 1 and 2 are unavailable, we allocate
102534+ * privilege levels as follows:
102535+ * Level == 0: No one may enter
102536+ * Level == 1: Kernel may enter
102537+ * Level == 2: Kernel may enter
102538+ * Level == 3: Everyone may enter
102539+ */
102540+#define TI_GET_DPL(_ti) ((_ti)->flags & 3)
102541+#define TI_GET_IF(_ti) ((_ti)->flags & 4)
102542+#define TI_SET_DPL(_ti,_dpl) ((_ti)->flags |= (_dpl))
102543+#define TI_SET_IF(_ti,_if) ((_ti)->flags |= ((!!(_if))<<2))
102544+struct trap_info {
102545+ uint8_t vector; /* exception vector */
102546+ uint8_t flags; /* 0-3: privilege level; 4: clear event enable? */
102547+ uint16_t cs; /* code selector */
102548+ unsigned long address; /* code offset */
102549+};
102550+typedef struct trap_info trap_info_t;
102551+DEFINE_XEN_GUEST_HANDLE(trap_info_t);
102552+
102553+typedef uint64_t tsc_timestamp_t; /* RDTSC timestamp */
102554+
102555+/*
102556+ * The following is all CPU context. Note that the fpu_ctxt block is filled
102557+ * in by FXSAVE if the CPU has feature FXSR; otherwise FSAVE is used.
102558+ */
102559+struct vcpu_guest_context {
102560+ /* FPU registers come first so they can be aligned for FXSAVE/FXRSTOR. */
102561+ struct { char x[512]; } fpu_ctxt; /* User-level FPU registers */
102562+#define VGCF_I387_VALID (1<<0)
102563+#define VGCF_IN_KERNEL (1<<2)
102564+#define _VGCF_i387_valid 0
102565+#define VGCF_i387_valid (1<<_VGCF_i387_valid)
102566+#define _VGCF_in_kernel 2
102567+#define VGCF_in_kernel (1<<_VGCF_in_kernel)
102568+#define _VGCF_failsafe_disables_events 3
102569+#define VGCF_failsafe_disables_events (1<<_VGCF_failsafe_disables_events)
102570+#define _VGCF_syscall_disables_events 4
102571+#define VGCF_syscall_disables_events (1<<_VGCF_syscall_disables_events)
102572+ unsigned long flags; /* VGCF_* flags */
102573+ struct cpu_user_regs user_regs; /* User-level CPU registers */
102574+ struct trap_info trap_ctxt[256]; /* Virtual IDT */
102575+ unsigned long ldt_base, ldt_ents; /* LDT (linear address, # ents) */
102576+ unsigned long gdt_frames[16], gdt_ents; /* GDT (machine frames, # ents) */
102577+ unsigned long kernel_ss, kernel_sp; /* Virtual TSS (only SS1/SP1) */
102578+ unsigned long ctrlreg[8]; /* CR0-CR7 (control registers) */
102579+ unsigned long debugreg[8]; /* DB0-DB7 (debug registers) */
102580+#ifdef __i386__
102581+ unsigned long event_callback_cs; /* CS:EIP of event callback */
102582+ unsigned long event_callback_eip;
102583+ unsigned long failsafe_callback_cs; /* CS:EIP of failsafe callback */
102584+ unsigned long failsafe_callback_eip;
102585+#else
102586+ unsigned long event_callback_eip;
102587+ unsigned long failsafe_callback_eip;
102588+ unsigned long syscall_callback_eip;
102589+#endif
102590+ unsigned long vm_assist; /* VMASST_TYPE_* bitmap */
102591+#ifdef __x86_64__
102592+ /* Segment base addresses. */
102593+ uint64_t fs_base;
102594+ uint64_t gs_base_kernel;
102595+ uint64_t gs_base_user;
102596+#endif
102597+};
102598+typedef struct vcpu_guest_context vcpu_guest_context_t;
102599+DEFINE_XEN_GUEST_HANDLE(vcpu_guest_context_t);
102600+
102601+struct arch_shared_info {
102602+ unsigned long max_pfn; /* max pfn that appears in table */
102603+ /* Frame containing list of mfns containing list of mfns containing p2m. */
102604+ xen_pfn_t pfn_to_mfn_frame_list_list;
102605+ unsigned long nmi_reason;
102606+ uint64_t pad[32];
102607+};
102608+typedef struct arch_shared_info arch_shared_info_t;
102609+
102610+#endif /* !__ASSEMBLY__ */
102611+
102612+/*
102613+ * Prefix forces emulation of some non-trapping instructions.
102614+ * Currently only CPUID.
102615+ */
102616+#ifdef __ASSEMBLY__
102617+#define XEN_EMULATE_PREFIX .byte 0x0f,0x0b,0x78,0x65,0x6e ;
102618+#define XEN_CPUID XEN_EMULATE_PREFIX cpuid
102619+#else
102620+#define XEN_EMULATE_PREFIX ".byte 0x0f,0x0b,0x78,0x65,0x6e ; "
102621+#define XEN_CPUID XEN_EMULATE_PREFIX "cpuid"
102622+#endif
102623+
102624+#endif /* __XEN_PUBLIC_ARCH_X86_XEN_H__ */
102625+
102626+/*
102627+ * Local variables:
102628+ * mode: C
102629+ * c-set-style: "BSD"
102630+ * c-basic-offset: 4
102631+ * tab-width: 4
102632+ * indent-tabs-mode: nil
102633+ * End:
102634+ */
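
The TI_* helpers above pack the privilege level and the "clear event enable" bit into trap_info.flags, and they only OR bits in, so flags must start at zero. A hedged sketch of filling one virtual IDT entry for an int $0x80-style trap callable from user mode (the vector, the handler symbol and the use of FLAT_KERNEL_CS are illustrative, not taken from a real guest):

    extern void entry_0x80(void);            /* hypothetical trap handler */

    static void fill_syscall_trap(struct trap_info *ti)
    {
        ti->vector  = 0x80;                  /* software interrupt vector */
        ti->flags   = 0;                     /* TI_SET_* only ORs bits in */
        ti->cs      = FLAT_KERNEL_CS;        /* flat kernel code selector */
        ti->address = (unsigned long)entry_0x80;

        TI_SET_DPL(ti, 3);                   /* user mode may enter via int $0x80 */
        TI_SET_IF(ti, 1);                    /* "clear event enable" on entry */
    }
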
102635diff -Nur linux-2.6.16.33-noxen/include/xen/interface/arch-x86_32.h linux-2.6.16.33/include/xen/interface/arch-x86_32.h
102636--- linux-2.6.16.33-noxen/include/xen/interface/arch-x86_32.h 1970-01-01 00:00:00.000000000 +0000
102637+++ linux-2.6.16.33/include/xen/interface/arch-x86_32.h 2007-01-08 15:00:55.000000000 +0000
102638@@ -0,0 +1,27 @@
102639+/******************************************************************************
102640+ * arch-x86_32.h
102641+ *
102642+ * Guest OS interface to x86 32-bit Xen.
102643+ *
102644+ * Permission is hereby granted, free of charge, to any person obtaining a copy
102645+ * of this software and associated documentation files (the "Software"), to
102646+ * deal in the Software without restriction, including without limitation the
102647+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
102648+ * sell copies of the Software, and to permit persons to whom the Software is
102649+ * furnished to do so, subject to the following conditions:
102650+ *
102651+ * The above copyright notice and this permission notice shall be included in
102652+ * all copies or substantial portions of the Software.
102653+ *
102654+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
102655+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
102656+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
102657+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
102658+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
102659+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
102660+ * DEALINGS IN THE SOFTWARE.
102661+ *
102662+ * Copyright (c) 2004-2006, K A Fraser
102663+ */
102664+
102665+#include "arch-x86/xen.h"
102666diff -Nur linux-2.6.16.33-noxen/include/xen/interface/arch-x86_64.h linux-2.6.16.33/include/xen/interface/arch-x86_64.h
102667--- linux-2.6.16.33-noxen/include/xen/interface/arch-x86_64.h 1970-01-01 00:00:00.000000000 +0000
102668+++ linux-2.6.16.33/include/xen/interface/arch-x86_64.h 2007-01-08 15:00:55.000000000 +0000
102669@@ -0,0 +1,27 @@
102670+/******************************************************************************
102671+ * arch-x86_64.h
102672+ *
102673+ * Guest OS interface to x86 64-bit Xen.
102674+ *
102675+ * Permission is hereby granted, free of charge, to any person obtaining a copy
102676+ * of this software and associated documentation files (the "Software"), to
102677+ * deal in the Software without restriction, including without limitation the
102678+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
102679+ * sell copies of the Software, and to permit persons to whom the Software is
102680+ * furnished to do so, subject to the following conditions:
102681+ *
102682+ * The above copyright notice and this permission notice shall be included in
102683+ * all copies or substantial portions of the Software.
102684+ *
102685+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
102686+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
102687+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
102688+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
102689+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
102690+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
102691+ * DEALINGS IN THE SOFTWARE.
102692+ *
102693+ * Copyright (c) 2004-2006, K A Fraser
102694+ */
102695+
102696+#include "arch-x86/xen.h"
102697diff -Nur linux-2.6.16.33-noxen/include/xen/interface/callback.h linux-2.6.16.33/include/xen/interface/callback.h
102698--- linux-2.6.16.33-noxen/include/xen/interface/callback.h 1970-01-01 00:00:00.000000000 +0000
102699+++ linux-2.6.16.33/include/xen/interface/callback.h 2007-01-08 15:00:55.000000000 +0000
102700@@ -0,0 +1,92 @@
102701+/******************************************************************************
102702+ * callback.h
102703+ *
102704+ * Register guest OS callbacks with Xen.
102705+ *
102706+ * Permission is hereby granted, free of charge, to any person obtaining a copy
102707+ * of this software and associated documentation files (the "Software"), to
102708+ * deal in the Software without restriction, including without limitation the
102709+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
102710+ * sell copies of the Software, and to permit persons to whom the Software is
102711+ * furnished to do so, subject to the following conditions:
102712+ *
102713+ * The above copyright notice and this permission notice shall be included in
102714+ * all copies or substantial portions of the Software.
102715+ *
102716+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
102717+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
102718+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
102719+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
102720+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
102721+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
102722+ * DEALINGS IN THE SOFTWARE.
102723+ *
102724+ * Copyright (c) 2006, Ian Campbell
102725+ */
102726+
102727+#ifndef __XEN_PUBLIC_CALLBACK_H__
102728+#define __XEN_PUBLIC_CALLBACK_H__
102729+
102730+#include "xen.h"
102731+
102732+/*
102733+ * Prototype for this hypercall is:
102734+ * long callback_op(int cmd, void *extra_args)
102735+ * @cmd == CALLBACKOP_??? (callback operation).
102736+ * @extra_args == Operation-specific extra arguments (NULL if none).
102737+ */
102738+
102739+#define CALLBACKTYPE_event 0
102740+#define CALLBACKTYPE_failsafe 1
102741+#define CALLBACKTYPE_syscall 2 /* x86_64 only */
102742+/*
102743+ * sysenter is only available on x86_32 with the
102744+ * supervisor_mode_kernel option enabled.
102745+ */
102746+#define CALLBACKTYPE_sysenter 3
102747+#define CALLBACKTYPE_nmi 4
102748+
102749+/*
102750+ * Disable event delivery during callback? This flag is ignored for event and
102751+ * NMI callbacks: event delivery is unconditionally disabled.
102752+ */
102753+#define _CALLBACKF_mask_events 0
102754+#define CALLBACKF_mask_events (1U << _CALLBACKF_mask_events)
102755+
102756+/*
102757+ * Register a callback.
102758+ */
102759+#define CALLBACKOP_register 0
102760+struct callback_register {
102761+ uint16_t type;
102762+ uint16_t flags;
102763+ xen_callback_t address;
102764+};
102765+typedef struct callback_register callback_register_t;
102766+DEFINE_XEN_GUEST_HANDLE(callback_register_t);
102767+
102768+/*
102769+ * Unregister a callback.
102770+ *
102771+ * Not all callbacks can be unregistered. -EINVAL will be returned if
102772+ * you attempt to unregister such a callback.
102773+ */
102774+#define CALLBACKOP_unregister 1
102775+struct callback_unregister {
102776+ uint16_t type;
102777+ uint16_t _unused;
102778+};
102779+typedef struct callback_unregister callback_unregister_t;
102780+DEFINE_XEN_GUEST_HANDLE(callback_unregister_t);
102781+
102782+#endif /* __XEN_PUBLIC_CALLBACK_H__ */
102783+
102784+/*
102785+ * Local variables:
102786+ * mode: C
102787+ * c-set-style: "BSD"
102788+ * c-basic-offset: 4
102789+ * tab-width: 4
102790+ * indent-tabs-mode: nil
102791+ * End:
102792+ */
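
A minimal sketch of how a guest might use this interface to register its event upcall. The HYPERVISOR_callback_op() wrapper prototype is taken from the comment at the top of the header and is assumed to be provided elsewhere in this patch; the xen_callback_t initialisation is architecture specific and only indicated schematically.

/* Sketch: register an event-delivery callback via CALLBACKOP_register.
 * HYPERVISOR_callback_op() is assumed; prototype follows the header comment. */
#include <xen/interface/callback.h>

extern long HYPERVISOR_callback_op(int cmd, void *extra_args);   /* assumed */

static long register_event_callback(void)
{
    struct callback_register event = {
        .type  = CALLBACKTYPE_event,
        .flags = 0,   /* CALLBACKF_mask_events is ignored for event callbacks */
        /* .address is architecture specific (cs:eip pair on x86_32, a plain
         * virtual address on x86_64); initialise it in arch-specific code. */
    };

    return HYPERVISOR_callback_op(CALLBACKOP_register, &event);
}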
102793diff -Nur linux-2.6.16.33-noxen/include/xen/interface/dom0_ops.h linux-2.6.16.33/include/xen/interface/dom0_ops.h
102794--- linux-2.6.16.33-noxen/include/xen/interface/dom0_ops.h 1970-01-01 00:00:00.000000000 +0000
102795+++ linux-2.6.16.33/include/xen/interface/dom0_ops.h 2007-01-08 15:00:55.000000000 +0000
102796@@ -0,0 +1,120 @@
102797+/******************************************************************************
102798+ * dom0_ops.h
102799+ *
102800+ * Process command requests from domain-0 guest OS.
102801+ *
102802+ * Permission is hereby granted, free of charge, to any person obtaining a copy
102803+ * of this software and associated documentation files (the "Software"), to
102804+ * deal in the Software without restriction, including without limitation the
102805+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
102806+ * sell copies of the Software, and to permit persons to whom the Software is
102807+ * furnished to do so, subject to the following conditions:
102808+ *
102809+ * The above copyright notice and this permission notice shall be included in
102810+ * all copies or substantial portions of the Software.
102811+ *
102812+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
102813+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
102814+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
102815+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
102816+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
102817+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
102818+ * DEALINGS IN THE SOFTWARE.
102819+ *
102820+ * Copyright (c) 2002-2003, B Dragovic
102821+ * Copyright (c) 2002-2006, K Fraser
102822+ */
102823+
102824+#ifndef __XEN_PUBLIC_DOM0_OPS_H__
102825+#define __XEN_PUBLIC_DOM0_OPS_H__
102826+
102827+#include "xen.h"
102828+#include "platform.h"
102829+
102830+#if __XEN_INTERFACE_VERSION__ >= 0x00030204
102831+#error "dom0_ops.h is a compatibility interface only"
102832+#endif
102833+
102834+#define DOM0_INTERFACE_VERSION XENPF_INTERFACE_VERSION
102835+
102836+#define DOM0_SETTIME XENPF_settime
102837+#define dom0_settime xenpf_settime
102838+#define dom0_settime_t xenpf_settime_t
102839+
102840+#define DOM0_ADD_MEMTYPE XENPF_add_memtype
102841+#define dom0_add_memtype xenpf_add_memtype
102842+#define dom0_add_memtype_t xenpf_add_memtype_t
102843+
102844+#define DOM0_DEL_MEMTYPE XENPF_del_memtype
102845+#define dom0_del_memtype xenpf_del_memtype
102846+#define dom0_del_memtype_t xenpf_del_memtype_t
102847+
102848+#define DOM0_READ_MEMTYPE XENPF_read_memtype
102849+#define dom0_read_memtype xenpf_read_memtype
102850+#define dom0_read_memtype_t xenpf_read_memtype_t
102851+
102852+#define DOM0_MICROCODE XENPF_microcode_update
102853+#define dom0_microcode xenpf_microcode_update
102854+#define dom0_microcode_t xenpf_microcode_update_t
102855+
102856+#define DOM0_PLATFORM_QUIRK XENPF_platform_quirk
102857+#define dom0_platform_quirk xenpf_platform_quirk
102858+#define dom0_platform_quirk_t xenpf_platform_quirk_t
102859+
102860+typedef uint64_t cpumap_t;
102861+
102862+/* Unsupported legacy operation -- defined for API compatibility. */
102863+#define DOM0_MSR 15
102864+struct dom0_msr {
102865+ /* IN variables. */
102866+ uint32_t write;
102867+ cpumap_t cpu_mask;
102868+ uint32_t msr;
102869+ uint32_t in1;
102870+ uint32_t in2;
102871+ /* OUT variables. */
102872+ uint32_t out1;
102873+ uint32_t out2;
102874+};
102875+typedef struct dom0_msr dom0_msr_t;
102876+DEFINE_XEN_GUEST_HANDLE(dom0_msr_t);
102877+
102878+/* Unsupported legacy operation -- defined for API compatibility. */
102879+#define DOM0_PHYSICAL_MEMORY_MAP 40
102880+struct dom0_memory_map_entry {
102881+ uint64_t start, end;
102882+ uint32_t flags; /* reserved */
102883+ uint8_t is_ram;
102884+};
102885+typedef struct dom0_memory_map_entry dom0_memory_map_entry_t;
102886+DEFINE_XEN_GUEST_HANDLE(dom0_memory_map_entry_t);
102887+
102888+struct dom0_op {
102889+ uint32_t cmd;
102890+ uint32_t interface_version; /* DOM0_INTERFACE_VERSION */
102891+ union {
102892+ struct dom0_msr msr;
102893+ struct dom0_settime settime;
102894+ struct dom0_add_memtype add_memtype;
102895+ struct dom0_del_memtype del_memtype;
102896+ struct dom0_read_memtype read_memtype;
102897+ struct dom0_microcode microcode;
102898+ struct dom0_platform_quirk platform_quirk;
102899+ struct dom0_memory_map_entry physical_memory_map;
102900+ uint8_t pad[128];
102901+ } u;
102902+};
102903+typedef struct dom0_op dom0_op_t;
102904+DEFINE_XEN_GUEST_HANDLE(dom0_op_t);
102905+
102906+#endif /* __XEN_PUBLIC_DOM0_OPS_H__ */
102907+
102908+/*
102909+ * Local variables:
102910+ * mode: C
102911+ * c-set-style: "BSD"
102912+ * c-basic-offset: 4
102913+ * tab-width: 4
102914+ * indent-tabs-mode: nil
102915+ * End:
102916+ */
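
The pattern for every legacy operation in this compatibility header is the same: fill in the dom0_op envelope (cmd plus interface_version) and one member of the union, then issue the hypercall. A sketch using the DOM0_MSR structure shown above, purely to illustrate the envelope; the operation itself is documented as unsupported, the field semantics in the comments are assumptions, and the HYPERVISOR_dom0_op() wrapper name is assumed from the rest of this patch. It also presumes a build with __XEN_INTERFACE_VERSION__ below 0x00030204, as required by the #error guard.

/* Sketch only: DOM0_MSR is kept for API compatibility and is marked
 * unsupported above; it is used here just to show the envelope. */
#include <stdint.h>
#include <xen/interface/dom0_ops.h>

extern long HYPERVISOR_dom0_op(struct dom0_op *op);   /* assumed compat wrapper */

static long read_msr_on_cpu0(uint32_t msr, uint32_t *lo, uint32_t *hi)
{
    struct dom0_op op = {
        .cmd               = DOM0_MSR,
        .interface_version = DOM0_INTERFACE_VERSION,
        .u.msr = {
            .write    = 0,     /* assumption: 0 selects a read */
            .cpu_mask = 1,     /* assumption: one bit per CPU, bit 0 = CPU 0 */
            .msr      = msr,
        },
    };
    long rc = HYPERVISOR_dom0_op(&op);

    if (rc == 0) {
        *lo = op.u.msr.out1;
        *hi = op.u.msr.out2;
    }
    return rc;
}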
102917diff -Nur linux-2.6.16.33-noxen/include/xen/interface/domctl.h linux-2.6.16.33/include/xen/interface/domctl.h
102918--- linux-2.6.16.33-noxen/include/xen/interface/domctl.h 1970-01-01 00:00:00.000000000 +0000
102919+++ linux-2.6.16.33/include/xen/interface/domctl.h 2007-01-08 15:00:55.000000000 +0000
102920@@ -0,0 +1,437 @@
102921+/******************************************************************************
102922+ * domctl.h
102923+ *
102924+ * Domain management operations. For use by node control stack.
102925+ *
102926+ * Permission is hereby granted, free of charge, to any person obtaining a copy
102927+ * of this software and associated documentation files (the "Software"), to
102928+ * deal in the Software without restriction, including without limitation the
102929+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
102930+ * sell copies of the Software, and to permit persons to whom the Software is
102931+ * furnished to do so, subject to the following conditions:
102932+ *
102933+ * The above copyright notice and this permission notice shall be included in
102934+ * all copies or substantial portions of the Software.
102935+ *
102936+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
102937+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
102938+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
102939+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
102940+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
102941+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
102942+ * DEALINGS IN THE SOFTWARE.
102943+ *
102944+ * Copyright (c) 2002-2003, B Dragovic
102945+ * Copyright (c) 2002-2006, K Fraser
102946+ */
102947+
102948+#ifndef __XEN_PUBLIC_DOMCTL_H__
102949+#define __XEN_PUBLIC_DOMCTL_H__
102950+
102951+#if !defined(__XEN__) && !defined(__XEN_TOOLS__)
102952+#error "domctl operations are intended for use by node control tools only"
102953+#endif
102954+
102955+#include "xen.h"
102956+
102957+#define XEN_DOMCTL_INTERFACE_VERSION 0x00000004
102958+
102959+struct xenctl_cpumap {
102960+ XEN_GUEST_HANDLE(uint8_t) bitmap;
102961+ uint32_t nr_cpus;
102962+};
102963+
102964+/*
102965+ * NB. xen_domctl.domain is an IN/OUT parameter for this operation.
102966+ * If it is specified as zero, an id is auto-allocated and returned.
102967+ */
102968+#define XEN_DOMCTL_createdomain 1
102969+struct xen_domctl_createdomain {
102970+ /* IN parameters */
102971+ uint32_t ssidref;
102972+ xen_domain_handle_t handle;
102973+ /* Is this an HVM guest (as opposed to a PV guest)? */
102974+#define _XEN_DOMCTL_CDF_hvm_guest 0
102975+#define XEN_DOMCTL_CDF_hvm_guest (1U<<_XEN_DOMCTL_CDF_hvm_guest)
102976+ uint32_t flags;
102977+};
102978+typedef struct xen_domctl_createdomain xen_domctl_createdomain_t;
102979+DEFINE_XEN_GUEST_HANDLE(xen_domctl_createdomain_t);
102980+
102981+#define XEN_DOMCTL_destroydomain 2
102982+#define XEN_DOMCTL_pausedomain 3
102983+#define XEN_DOMCTL_unpausedomain 4
102984+
102985+#define XEN_DOMCTL_getdomaininfo 5
102986+struct xen_domctl_getdomaininfo {
102987+ /* OUT variables. */
102988+ domid_t domain; /* Also echoed in domctl.domain */
102989+ /* Domain is scheduled to die. */
102990+#define _XEN_DOMINF_dying 0
102991+#define XEN_DOMINF_dying (1U<<_XEN_DOMINF_dying)
102992+ /* Domain is an HVM guest (as opposed to a PV guest). */
102993+#define _XEN_DOMINF_hvm_guest 1
102994+#define XEN_DOMINF_hvm_guest (1U<<_XEN_DOMINF_hvm_guest)
102995+ /* The guest OS has shut down. */
102996+#define _XEN_DOMINF_shutdown 2
102997+#define XEN_DOMINF_shutdown (1U<<_XEN_DOMINF_shutdown)
102998+ /* Currently paused by control software. */
102999+#define _XEN_DOMINF_paused 3
103000+#define XEN_DOMINF_paused (1U<<_XEN_DOMINF_paused)
103001+ /* Currently blocked pending an event. */
103002+#define _XEN_DOMINF_blocked 4
103003+#define XEN_DOMINF_blocked (1U<<_XEN_DOMINF_blocked)
103004+ /* Domain is currently running. */
103005+#define _XEN_DOMINF_running 5
103006+#define XEN_DOMINF_running (1U<<_XEN_DOMINF_running)
103007+ /* CPU to which this domain is bound. */
103008+#define XEN_DOMINF_cpumask 255
103009+#define XEN_DOMINF_cpushift 8
103010+ /* XEN_DOMINF_shutdown guest-supplied code. */
103011+#define XEN_DOMINF_shutdownmask 255
103012+#define XEN_DOMINF_shutdownshift 16
103013+ uint32_t flags; /* XEN_DOMINF_* */
103014+ uint64_t tot_pages;
103015+ uint64_t max_pages;
103016+ uint64_t shared_info_frame; /* GMFN of shared_info struct */
103017+ uint64_t cpu_time;
103018+ uint32_t nr_online_vcpus; /* Number of VCPUs currently online. */
103019+ uint32_t max_vcpu_id; /* Maximum VCPUID in use by this domain. */
103020+ uint32_t ssidref;
103021+ xen_domain_handle_t handle;
103022+};
103023+typedef struct xen_domctl_getdomaininfo xen_domctl_getdomaininfo_t;
103024+DEFINE_XEN_GUEST_HANDLE(xen_domctl_getdomaininfo_t);
103025+
103026+
103027+#define XEN_DOMCTL_getmemlist 6
103028+struct xen_domctl_getmemlist {
103029+ /* IN variables. */
103030+ /* Max entries to write to output buffer. */
103031+ uint64_t max_pfns;
103032+ /* Start index in guest's page list. */
103033+ uint64_t start_pfn;
103034+ XEN_GUEST_HANDLE(xen_pfn_t) buffer;
103035+ /* OUT variables. */
103036+ uint64_t num_pfns;
103037+};
103038+typedef struct xen_domctl_getmemlist xen_domctl_getmemlist_t;
103039+DEFINE_XEN_GUEST_HANDLE(xen_domctl_getmemlist_t);
103040+
103041+
103042+#define XEN_DOMCTL_getpageframeinfo 7
103043+
103044+#define XEN_DOMCTL_PFINFO_LTAB_SHIFT 28
103045+#define XEN_DOMCTL_PFINFO_NOTAB (0x0<<28)
103046+#define XEN_DOMCTL_PFINFO_L1TAB (0x1<<28)
103047+#define XEN_DOMCTL_PFINFO_L2TAB (0x2<<28)
103048+#define XEN_DOMCTL_PFINFO_L3TAB (0x3<<28)
103049+#define XEN_DOMCTL_PFINFO_L4TAB (0x4<<28)
103050+#define XEN_DOMCTL_PFINFO_LTABTYPE_MASK (0x7<<28)
103051+#define XEN_DOMCTL_PFINFO_LPINTAB (0x1<<31)
103052+#define XEN_DOMCTL_PFINFO_XTAB (0xf<<28) /* invalid page */
103053+#define XEN_DOMCTL_PFINFO_LTAB_MASK (0xf<<28)
103054+
103055+struct xen_domctl_getpageframeinfo {
103056+ /* IN variables. */
103057+ uint64_t gmfn; /* GMFN to query */
103058+ /* OUT variables. */
103059+ /* Is the page PINNED to a type? */
103060+ uint32_t type; /* see above type defs */
103061+};
103062+typedef struct xen_domctl_getpageframeinfo xen_domctl_getpageframeinfo_t;
103063+DEFINE_XEN_GUEST_HANDLE(xen_domctl_getpageframeinfo_t);
103064+
103065+
103066+#define XEN_DOMCTL_getpageframeinfo2 8
103067+struct xen_domctl_getpageframeinfo2 {
103068+ /* IN variables. */
103069+ uint64_t num;
103070+ /* IN/OUT variables. */
103071+ XEN_GUEST_HANDLE(ulong) array;
103072+};
103073+typedef struct xen_domctl_getpageframeinfo2 xen_domctl_getpageframeinfo2_t;
103074+DEFINE_XEN_GUEST_HANDLE(xen_domctl_getpageframeinfo2_t);
103075+
103076+
103077+/*
103078+ * Control shadow pagetables operation
103079+ */
103080+#define XEN_DOMCTL_shadow_op 10
103081+
103082+/* Disable shadow mode. */
103083+#define XEN_DOMCTL_SHADOW_OP_OFF 0
103084+
103085+/* Enable shadow mode (mode contains ORed XEN_DOMCTL_SHADOW_ENABLE_* flags). */
103086+#define XEN_DOMCTL_SHADOW_OP_ENABLE 32
103087+
103088+/* Log-dirty bitmap operations. */
103089+ /* Return the bitmap and clean internal copy for next round. */
103090+#define XEN_DOMCTL_SHADOW_OP_CLEAN 11
103091+ /* Return the bitmap but do not modify internal copy. */
103092+#define XEN_DOMCTL_SHADOW_OP_PEEK 12
103093+
103094+/* Memory allocation accessors. */
103095+#define XEN_DOMCTL_SHADOW_OP_GET_ALLOCATION 30
103096+#define XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION 31
103097+
103098+/* Legacy enable operations. */
103099+ /* Equiv. to ENABLE with no mode flags. */
103100+#define XEN_DOMCTL_SHADOW_OP_ENABLE_TEST 1
103101+ /* Equiv. to ENABLE with mode flag ENABLE_LOG_DIRTY. */
103102+#define XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY 2
103103+ /* Equiv. to ENABLE with mode flags ENABLE_REFCOUNT and ENABLE_TRANSLATE. */
103104+#define XEN_DOMCTL_SHADOW_OP_ENABLE_TRANSLATE 3
103105+
103106+/* Mode flags for XEN_DOMCTL_SHADOW_OP_ENABLE. */
103107+ /*
103108+ * Shadow pagetables are refcounted: guest does not use explicit mmu
103109+ * operations nor write-protect its pagetables.
103110+ */
103111+#define XEN_DOMCTL_SHADOW_ENABLE_REFCOUNT (1 << 1)
103112+ /*
103113+ * Log pages in a bitmap as they are dirtied.
103114+ * Used for live relocation to determine which pages must be re-sent.
103115+ */
103116+#define XEN_DOMCTL_SHADOW_ENABLE_LOG_DIRTY (1 << 2)
103117+ /*
103118+ * Automatically translate GPFNs into MFNs.
103119+ */
103120+#define XEN_DOMCTL_SHADOW_ENABLE_TRANSLATE (1 << 3)
103121+ /*
103122+ * Xen does not steal virtual address space from the guest.
103123+ * Requires HVM support.
103124+ */
103125+#define XEN_DOMCTL_SHADOW_ENABLE_EXTERNAL (1 << 4)
103126+
103127+struct xen_domctl_shadow_op_stats {
103128+ uint32_t fault_count;
103129+ uint32_t dirty_count;
103130+};
103131+typedef struct xen_domctl_shadow_op_stats xen_domctl_shadow_op_stats_t;
103132+DEFINE_XEN_GUEST_HANDLE(xen_domctl_shadow_op_stats_t);
103133+
103134+struct xen_domctl_shadow_op {
103135+ /* IN variables. */
103136+ uint32_t op; /* XEN_DOMCTL_SHADOW_OP_* */
103137+
103138+ /* OP_ENABLE */
103139+ uint32_t mode; /* XEN_DOMCTL_SHADOW_ENABLE_* */
103140+
103141+ /* OP_GET_ALLOCATION / OP_SET_ALLOCATION */
103142+ uint32_t mb; /* Shadow memory allocation in MB */
103143+
103144+ /* OP_PEEK / OP_CLEAN */
103145+ XEN_GUEST_HANDLE(ulong) dirty_bitmap;
103146+ uint64_t pages; /* Size of buffer. Updated with actual size. */
103147+ struct xen_domctl_shadow_op_stats stats;
103148+};
103149+typedef struct xen_domctl_shadow_op xen_domctl_shadow_op_t;
103150+DEFINE_XEN_GUEST_HANDLE(xen_domctl_shadow_op_t);
103151+
103152+
103153+#define XEN_DOMCTL_max_mem 11
103154+struct xen_domctl_max_mem {
103155+ /* IN variables. */
103156+ uint64_t max_memkb;
103157+};
103158+typedef struct xen_domctl_max_mem xen_domctl_max_mem_t;
103159+DEFINE_XEN_GUEST_HANDLE(xen_domctl_max_mem_t);
103160+
103161+
103162+#define XEN_DOMCTL_setvcpucontext 12
103163+#define XEN_DOMCTL_getvcpucontext 13
103164+struct xen_domctl_vcpucontext {
103165+ uint32_t vcpu; /* IN */
103166+ XEN_GUEST_HANDLE(vcpu_guest_context_t) ctxt; /* IN/OUT */
103167+};
103168+typedef struct xen_domctl_vcpucontext xen_domctl_vcpucontext_t;
103169+DEFINE_XEN_GUEST_HANDLE(xen_domctl_vcpucontext_t);
103170+
103171+
103172+#define XEN_DOMCTL_getvcpuinfo 14
103173+struct xen_domctl_getvcpuinfo {
103174+ /* IN variables. */
103175+ uint32_t vcpu;
103176+ /* OUT variables. */
103177+ uint8_t online; /* currently online (not hotplugged)? */
103178+ uint8_t blocked; /* blocked waiting for an event? */
103179+ uint8_t running; /* currently scheduled on its CPU? */
103180+ uint64_t cpu_time; /* total cpu time consumed (ns) */
103181+ uint32_t cpu; /* current mapping */
103182+};
103183+typedef struct xen_domctl_getvcpuinfo xen_domctl_getvcpuinfo_t;
103184+DEFINE_XEN_GUEST_HANDLE(xen_domctl_getvcpuinfo_t);
103185+
103186+
103187+/* Get/set which physical cpus a vcpu can execute on. */
103188+#define XEN_DOMCTL_setvcpuaffinity 9
103189+#define XEN_DOMCTL_getvcpuaffinity 25
103190+struct xen_domctl_vcpuaffinity {
103191+ uint32_t vcpu; /* IN */
103192+ struct xenctl_cpumap cpumap; /* IN/OUT */
103193+};
103194+typedef struct xen_domctl_vcpuaffinity xen_domctl_vcpuaffinity_t;
103195+DEFINE_XEN_GUEST_HANDLE(xen_domctl_vcpuaffinity_t);
103196+
103197+
103198+#define XEN_DOMCTL_max_vcpus 15
103199+struct xen_domctl_max_vcpus {
103200+ uint32_t max; /* maximum number of vcpus */
103201+};
103202+typedef struct xen_domctl_max_vcpus xen_domctl_max_vcpus_t;
103203+DEFINE_XEN_GUEST_HANDLE(xen_domctl_max_vcpus_t);
103204+
103205+
103206+#define XEN_DOMCTL_scheduler_op 16
103207+/* Scheduler types. */
103208+#define XEN_SCHEDULER_SEDF 4
103209+#define XEN_SCHEDULER_CREDIT 5
103210+/* Set or get info? */
103211+#define XEN_DOMCTL_SCHEDOP_putinfo 0
103212+#define XEN_DOMCTL_SCHEDOP_getinfo 1
103213+struct xen_domctl_scheduler_op {
103214+ uint32_t sched_id; /* XEN_SCHEDULER_* */
103215+ uint32_t cmd; /* XEN_DOMCTL_SCHEDOP_* */
103216+ union {
103217+ struct xen_domctl_sched_sedf {
103218+ uint64_t period;
103219+ uint64_t slice;
103220+ uint64_t latency;
103221+ uint32_t extratime;
103222+ uint32_t weight;
103223+ } sedf;
103224+ struct xen_domctl_sched_credit {
103225+ uint16_t weight;
103226+ uint16_t cap;
103227+ } credit;
103228+ } u;
103229+};
103230+typedef struct xen_domctl_scheduler_op xen_domctl_scheduler_op_t;
103231+DEFINE_XEN_GUEST_HANDLE(xen_domctl_scheduler_op_t);
103232+
103233+
103234+#define XEN_DOMCTL_setdomainhandle 17
103235+struct xen_domctl_setdomainhandle {
103236+ xen_domain_handle_t handle;
103237+};
103238+typedef struct xen_domctl_setdomainhandle xen_domctl_setdomainhandle_t;
103239+DEFINE_XEN_GUEST_HANDLE(xen_domctl_setdomainhandle_t);
103240+
103241+
103242+#define XEN_DOMCTL_setdebugging 18
103243+struct xen_domctl_setdebugging {
103244+ uint8_t enable;
103245+};
103246+typedef struct xen_domctl_setdebugging xen_domctl_setdebugging_t;
103247+DEFINE_XEN_GUEST_HANDLE(xen_domctl_setdebugging_t);
103248+
103249+
103250+#define XEN_DOMCTL_irq_permission 19
103251+struct xen_domctl_irq_permission {
103252+ uint8_t pirq;
103253+ uint8_t allow_access; /* flag to specify enable/disable of IRQ access */
103254+};
103255+typedef struct xen_domctl_irq_permission xen_domctl_irq_permission_t;
103256+DEFINE_XEN_GUEST_HANDLE(xen_domctl_irq_permission_t);
103257+
103258+
103259+#define XEN_DOMCTL_iomem_permission 20
103260+struct xen_domctl_iomem_permission {
103261+ uint64_t first_mfn; /* first page (physical page number) in range */
103262+ uint64_t nr_mfns; /* number of pages in range (>0) */
103263+ uint8_t allow_access; /* allow (!0) or deny (0) access to range? */
103264+};
103265+typedef struct xen_domctl_iomem_permission xen_domctl_iomem_permission_t;
103266+DEFINE_XEN_GUEST_HANDLE(xen_domctl_iomem_permission_t);
103267+
103268+
103269+#define XEN_DOMCTL_ioport_permission 21
103270+struct xen_domctl_ioport_permission {
103271+ uint32_t first_port; /* first port in range */
103272+ uint32_t nr_ports; /* size of port range */
103273+ uint8_t allow_access; /* allow or deny access to range? */
103274+};
103275+typedef struct xen_domctl_ioport_permission xen_domctl_ioport_permission_t;
103276+DEFINE_XEN_GUEST_HANDLE(xen_domctl_ioport_permission_t);
103277+
103278+#define XEN_DOMCTL_hypercall_init 22
103279+struct xen_domctl_hypercall_init {
103280+ uint64_t gmfn; /* GMFN to be initialised */
103281+};
103282+typedef struct xen_domctl_hypercall_init xen_domctl_hypercall_init_t;
103283+DEFINE_XEN_GUEST_HANDLE(xen_domctl_hypercall_init_t);
103284+
103285+#define XEN_DOMCTL_arch_setup 23
103286+#define _XEN_DOMAINSETUP_hvm_guest 0
103287+#define XEN_DOMAINSETUP_hvm_guest (1UL<<_XEN_DOMAINSETUP_hvm_guest)
103288+#define _XEN_DOMAINSETUP_query 1 /* Get parameters (for save) */
103289+#define XEN_DOMAINSETUP_query (1UL<<_XEN_DOMAINSETUP_query)
103290+typedef struct xen_domctl_arch_setup {
103291+ uint64_t flags; /* XEN_DOMAINSETUP_* */
103292+#ifdef __ia64__
103293+ uint64_t bp; /* mpaddr of boot param area */
103294+ uint64_t maxmem; /* Highest memory address for MDT. */
103295+ uint64_t xsi_va; /* Xen shared_info area virtual address. */
103296+ uint32_t hypercall_imm; /* Break imm for Xen hypercalls. */
103297+#endif
103298+} xen_domctl_arch_setup_t;
103299+DEFINE_XEN_GUEST_HANDLE(xen_domctl_arch_setup_t);
103300+
103301+#define XEN_DOMCTL_settimeoffset 24
103302+struct xen_domctl_settimeoffset {
103303+ int32_t time_offset_seconds; /* applied to domain wallclock time */
103304+};
103305+typedef struct xen_domctl_settimeoffset xen_domctl_settimeoffset_t;
103306+DEFINE_XEN_GUEST_HANDLE(xen_domctl_settimeoffset_t);
103307+
103308+#define XEN_DOMCTL_real_mode_area 26
103309+struct xen_domctl_real_mode_area {
103310+ uint32_t log; /* log2 of Real Mode Area size */
103311+};
103312+typedef struct xen_domctl_real_mode_area xen_domctl_real_mode_area_t;
103313+DEFINE_XEN_GUEST_HANDLE(xen_domctl_real_mode_area_t);
103314+
103315+struct xen_domctl {
103316+ uint32_t cmd;
103317+ uint32_t interface_version; /* XEN_DOMCTL_INTERFACE_VERSION */
103318+ domid_t domain;
103319+ union {
103320+ struct xen_domctl_createdomain createdomain;
103321+ struct xen_domctl_getdomaininfo getdomaininfo;
103322+ struct xen_domctl_getmemlist getmemlist;
103323+ struct xen_domctl_getpageframeinfo getpageframeinfo;
103324+ struct xen_domctl_getpageframeinfo2 getpageframeinfo2;
103325+ struct xen_domctl_vcpuaffinity vcpuaffinity;
103326+ struct xen_domctl_shadow_op shadow_op;
103327+ struct xen_domctl_max_mem max_mem;
103328+ struct xen_domctl_vcpucontext vcpucontext;
103329+ struct xen_domctl_getvcpuinfo getvcpuinfo;
103330+ struct xen_domctl_max_vcpus max_vcpus;
103331+ struct xen_domctl_scheduler_op scheduler_op;
103332+ struct xen_domctl_setdomainhandle setdomainhandle;
103333+ struct xen_domctl_setdebugging setdebugging;
103334+ struct xen_domctl_irq_permission irq_permission;
103335+ struct xen_domctl_iomem_permission iomem_permission;
103336+ struct xen_domctl_ioport_permission ioport_permission;
103337+ struct xen_domctl_hypercall_init hypercall_init;
103338+ struct xen_domctl_arch_setup arch_setup;
103339+ struct xen_domctl_settimeoffset settimeoffset;
103340+ struct xen_domctl_real_mode_area real_mode_area;
103341+ uint8_t pad[128];
103342+ } u;
103343+};
103344+typedef struct xen_domctl xen_domctl_t;
103345+DEFINE_XEN_GUEST_HANDLE(xen_domctl_t);
103346+
103347+#endif /* __XEN_PUBLIC_DOMCTL_H__ */
103348+
103349+/*
103350+ * Local variables:
103351+ * mode: C
103352+ * c-set-style: "BSD"
103353+ * c-basic-offset: 4
103354+ * tab-width: 4
103355+ * indent-tabs-mode: nil
103356+ * End:
103357+ */
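
domctl is a tools-only interface (note the __XEN_TOOLS__ guard above), so a control stack issues it through its privileged command channel rather than as an ordinary guest hypercall. A sketch of filling the envelope for XEN_DOMCTL_getdomaininfo and decoding the result; do_domctl() is a hypothetical placeholder for whatever issuing mechanism the tools use, not something defined by this header.

/* Sketch: query basic information about a domain with XEN_DOMCTL_getdomaininfo.
 * do_domctl() stands in for the tools' privileged hypercall path (assumed). */
#define __XEN_TOOLS__
#include <stdio.h>
#include <xen/interface/domctl.h>

extern int do_domctl(struct xen_domctl *domctl);   /* hypothetical issuer */

static int show_domain(domid_t domid)
{
    struct xen_domctl domctl = {
        .cmd               = XEN_DOMCTL_getdomaininfo,
        .interface_version = XEN_DOMCTL_INTERFACE_VERSION,
        .domain            = domid,
    };
    int rc = do_domctl(&domctl);

    if (rc)
        return rc;

    /* All results come back in the getdomaininfo member of the union. */
    printf("dom%u: %u online vcpus, %s\n",
           (unsigned int)domctl.u.getdomaininfo.domain,
           domctl.u.getdomaininfo.nr_online_vcpus,
           (domctl.u.getdomaininfo.flags & XEN_DOMINF_running) ? "running" :
           (domctl.u.getdomaininfo.flags & XEN_DOMINF_paused)  ? "paused"  :
                                                                 "idle");
    return 0;
}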
103358diff -Nur linux-2.6.16.33-noxen/include/xen/interface/elfnote.h linux-2.6.16.33/include/xen/interface/elfnote.h
103359--- linux-2.6.16.33-noxen/include/xen/interface/elfnote.h 1970-01-01 00:00:00.000000000 +0000
103360+++ linux-2.6.16.33/include/xen/interface/elfnote.h 2007-01-08 15:00:55.000000000 +0000
103361@@ -0,0 +1,179 @@
103362+/******************************************************************************
103363+ * elfnote.h
103364+ *
103365+ * Definitions used for the Xen ELF notes.
103366+ *
103367+ * Permission is hereby granted, free of charge, to any person obtaining a copy
103368+ * of this software and associated documentation files (the "Software"), to
103369+ * deal in the Software without restriction, including without limitation the
103370+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
103371+ * sell copies of the Software, and to permit persons to whom the Software is
103372+ * furnished to do so, subject to the following conditions:
103373+ *
103374+ * The above copyright notice and this permission notice shall be included in
103375+ * all copies or substantial portions of the Software.
103376+ *
103377+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
103378+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
103379+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
103380+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
103381+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
103382+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
103383+ * DEALINGS IN THE SOFTWARE.
103384+ *
103385+ * Copyright (c) 2006, Ian Campbell, XenSource Ltd.
103386+ */
103387+
103388+#ifndef __XEN_PUBLIC_ELFNOTE_H__
103389+#define __XEN_PUBLIC_ELFNOTE_H__
103390+
103391+/*
103392+ * The notes should live in a SHT_NOTE segment and have "Xen" in the
103393+ * name field.
103394+ *
103395+ * Numeric types are either 4 or 8 bytes depending on the content of
103396+ * the desc field.
103397+ *
103398+ * LEGACY indicates the fields in the legacy __xen_guest string which
103399+ * this note type replaces.
103400+ */
103401+
103402+/*
103403+ * NAME=VALUE pair (string).
103404+ *
103405+ * LEGACY: FEATURES and PAE
103406+ */
103407+#define XEN_ELFNOTE_INFO 0
103408+
103409+/*
103410+ * The virtual address of the entry point (numeric).
103411+ *
103412+ * LEGACY: VIRT_ENTRY
103413+ */
103414+#define XEN_ELFNOTE_ENTRY 1
103415+
103416+/* The virtual address of the hypercall transfer page (numeric).
103417+ *
103418+ * LEGACY: HYPERCALL_PAGE. (n.b. legacy value is a physical page
103419+ * number not a virtual address)
103420+ */
103421+#define XEN_ELFNOTE_HYPERCALL_PAGE 2
103422+
103423+/* The virtual address where the kernel image should be mapped (numeric).
103424+ *
103425+ * Defaults to 0.
103426+ *
103427+ * LEGACY: VIRT_BASE
103428+ */
103429+#define XEN_ELFNOTE_VIRT_BASE 3
103430+
103431+/*
103432+ * The offset of the ELF paddr field from the actual required
103433+ * pseudo-physical address (numeric).
103434+ *
103435+ * This is used to maintain backwards compatibility with older kernels
103436+ * which wrote __PAGE_OFFSET into that field. This field defaults to 0
103437+ * if not present.
103438+ *
103439+ * LEGACY: ELF_PADDR_OFFSET. (n.b. legacy default is VIRT_BASE)
103440+ */
103441+#define XEN_ELFNOTE_PADDR_OFFSET 4
103442+
103443+/*
103444+ * The version of Xen that we work with (string).
103445+ *
103446+ * LEGACY: XEN_VER
103447+ */
103448+#define XEN_ELFNOTE_XEN_VERSION 5
103449+
103450+/*
103451+ * The name of the guest operating system (string).
103452+ *
103453+ * LEGACY: GUEST_OS
103454+ */
103455+#define XEN_ELFNOTE_GUEST_OS 6
103456+
103457+/*
103458+ * The version of the guest operating system (string).
103459+ *
103460+ * LEGACY: GUEST_VER
103461+ */
103462+#define XEN_ELFNOTE_GUEST_VERSION 7
103463+
103464+/*
103465+ * The loader type (string).
103466+ *
103467+ * LEGACY: LOADER
103468+ */
103469+#define XEN_ELFNOTE_LOADER 8
103470+
103471+/*
103472+ * The kernel supports PAE (x86/32 only, string = "yes" or "no").
103473+ *
103474+ * LEGACY: PAE (n.b. The legacy interface included a provision to
103475+ * indicate 'extended-cr3' support allowing L3 page tables to be
103476+ * placed above 4G. It is assumed that any kernel new enough to use
103477+ * these ELF notes will include this and therefore "yes" here is
103478+ * equivalent to "yes[extended-cr3]" in the __xen_guest interface.)
103479+ */
103480+#define XEN_ELFNOTE_PAE_MODE 9
103481+
103482+/*
103483+ * The features supported/required by this kernel (string).
103484+ *
103485+ * The string must consist of a list of feature names (as given in
103486+ * features.h, without the "XENFEAT_" prefix) separated by '|'
103487+ * characters. If a feature is required for the kernel to function
103488+ * then the feature name must be preceded by a '!' character.
103489+ *
103490+ * LEGACY: FEATURES
103491+ */
103492+#define XEN_ELFNOTE_FEATURES 10
103493+
103494+/*
103495+ * The kernel requires the symbol table to be loaded (string = "yes" or "no")
103496+ * LEGACY: BSD_SYMTAB (n.b. The legacy treated the presence or absence
103497+ * of this string as a boolean flag rather than requiring "yes" or
103498+ * "no".
103499+ */
103500+#define XEN_ELFNOTE_BSD_SYMTAB 11
103501+
103502+/*
103503+ * The lowest address the hypervisor hole can begin at (numeric).
103504+ *
103505+ * This must not be set higher than HYPERVISOR_VIRT_START. Its presence
103506+ * also indicates to the hypervisor that the kernel can deal with the
103507+ * hole starting at a higher address.
103508+ */
103509+#define XEN_ELFNOTE_HV_START_LOW 12
103510+
103511+/*
103512+ * System information exported through crash notes.
103513+ *
103514+ * The kexec / kdump code will create one XEN_ELFNOTE_CRASH_INFO
103515+ * note in case of a system crash. This note will contain various
103516+ * information about the system, see xen/include/xen/elfcore.h.
103517+ */
103518+#define XEN_ELFNOTE_CRASH_INFO 0x1000001
103519+
103520+/*
103521+ * System registers exported through crash notes.
103522+ *
103523+ * The kexec / kdump code will create one XEN_ELFNOTE_CRASH_REGS
103524+ * note per cpu in case of a system crash. This note is architecture
103525+ * specific and will contain registers not saved in the "CORE" note.
103526+ * See xen/include/xen/elfcore.h for more information.
103527+ */
103528+#define XEN_ELFNOTE_CRASH_REGS 0x1000002
103529+
103530+#endif /* __XEN_PUBLIC_ELFNOTE_H__ */
103531+
103532+/*
103533+ * Local variables:
103534+ * mode: C
103535+ * c-set-style: "BSD"
103536+ * c-basic-offset: 4
103537+ * tab-width: 4
103538+ * indent-tabs-mode: nil
103539+ * End:
103540+ */
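
A guest kernel emits these notes from its linked image; Linux does it with an assembler macro, but the layout is just a standard ELF note: a namesz/descsz/type header, "Xen" as the name, and both name and desc padded to a 4-byte boundary. A standalone C sketch of one string-valued note, XEN_ELFNOTE_GUEST_OS; the .note.Xen section name and the packed-struct approach are illustrative assumptions, not something this header mandates.

/* Sketch: emit XEN_ELFNOTE_GUEST_OS = "linux" as an ELF note.
 * Layout: note header (namesz, descsz, type), then name and desc,
 * each padded to a 4-byte boundary. Section name is an assumption. */
#include <stdint.h>
#include <xen/interface/elfnote.h>

struct xen_guest_os_note {
    uint32_t namesz;       /* strlen("Xen") + 1   = 4 */
    uint32_t descsz;       /* strlen("linux") + 1 = 6 */
    uint32_t type;         /* XEN_ELFNOTE_GUEST_OS */
    char     name[4];      /* "Xen", exactly 4 bytes with NUL */
    char     desc[8];      /* "linux", padded to 8 bytes */
} __attribute__((packed, aligned(4)));

static const struct xen_guest_os_note guest_os_note
    __attribute__((used, section(".note.Xen"))) = {
    .namesz = 4,
    .descsz = 6,
    .type   = XEN_ELFNOTE_GUEST_OS,
    .name   = "Xen",
    .desc   = "linux",
};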
103541diff -Nur linux-2.6.16.33-noxen/include/xen/interface/event_channel.h linux-2.6.16.33/include/xen/interface/event_channel.h
103542--- linux-2.6.16.33-noxen/include/xen/interface/event_channel.h 1970-01-01 00:00:00.000000000 +0000
103543+++ linux-2.6.16.33/include/xen/interface/event_channel.h 2007-01-08 15:00:55.000000000 +0000
103544@@ -0,0 +1,251 @@
103545+/******************************************************************************
103546+ * event_channel.h
103547+ *
103548+ * Event channels between domains.
103549+ *
103550+ * Permission is hereby granted, free of charge, to any person obtaining a copy
103551+ * of this software and associated documentation files (the "Software"), to
103552+ * deal in the Software without restriction, including without limitation the
103553+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
103554+ * sell copies of the Software, and to permit persons to whom the Software is
103555+ * furnished to do so, subject to the following conditions:
103556+ *
103557+ * The above copyright notice and this permission notice shall be included in
103558+ * all copies or substantial portions of the Software.
103559+ *
103560+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
103561+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
103562+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
103563+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
103564+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
103565+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
103566+ * DEALINGS IN THE SOFTWARE.
103567+ *
103568+ * Copyright (c) 2003-2004, K A Fraser.
103569+ */
103570+
103571+#ifndef __XEN_PUBLIC_EVENT_CHANNEL_H__
103572+#define __XEN_PUBLIC_EVENT_CHANNEL_H__
103573+
103574+/*
103575+ * Prototype for this hypercall is:
103576+ * int event_channel_op(int cmd, void *args)
103577+ * @cmd == EVTCHNOP_??? (event-channel operation).
103578+ * @args == Operation-specific extra arguments (NULL if none).
103579+ */
103580+
103581+typedef uint32_t evtchn_port_t;
103582+DEFINE_XEN_GUEST_HANDLE(evtchn_port_t);
103583+
103584+/*
103585+ * EVTCHNOP_alloc_unbound: Allocate a port in domain <dom> and mark as
103586+ * accepting interdomain bindings from domain <remote_dom>. A fresh port
103587+ * is allocated in <dom> and returned as <port>.
103588+ * NOTES:
103589+ * 1. If the caller is unprivileged then <dom> must be DOMID_SELF.
103590+ * 2. <remote_dom> may be DOMID_SELF, allowing loopback connections.
103591+ */
103592+#define EVTCHNOP_alloc_unbound 6
103593+struct evtchn_alloc_unbound {
103594+ /* IN parameters */
103595+ domid_t dom, remote_dom;
103596+ /* OUT parameters */
103597+ evtchn_port_t port;
103598+};
103599+typedef struct evtchn_alloc_unbound evtchn_alloc_unbound_t;
103600+
103601+/*
103602+ * EVTCHNOP_bind_interdomain: Construct an interdomain event channel between
103603+ * the calling domain and <remote_dom>. <remote_dom,remote_port> must identify
103604+ * a port that is unbound and marked as accepting bindings from the calling
103605+ * domain. A fresh port is allocated in the calling domain and returned as
103606+ * <local_port>.
103607+ * NOTES:
103608+ * 1. <remote_dom> may be DOMID_SELF, allowing loopback connections.
103609+ */
103610+#define EVTCHNOP_bind_interdomain 0
103611+struct evtchn_bind_interdomain {
103612+ /* IN parameters. */
103613+ domid_t remote_dom;
103614+ evtchn_port_t remote_port;
103615+ /* OUT parameters. */
103616+ evtchn_port_t local_port;
103617+};
103618+typedef struct evtchn_bind_interdomain evtchn_bind_interdomain_t;
103619+
103620+/*
103621+ * EVTCHNOP_bind_virq: Bind a local event channel to VIRQ <irq> on specified
103622+ * vcpu.
103623+ * NOTES:
103624+ * 1. Virtual IRQs are classified as per-vcpu or global. See the VIRQ list
103625+ * in xen.h for the classification of each VIRQ.
103626+ * 2. Global VIRQs must be allocated on VCPU0 but can subsequently be
103627+ * re-bound via EVTCHNOP_bind_vcpu.
103628+ * 3. Per-vcpu VIRQs may be bound to at most one event channel per vcpu.
103629+ * The allocated event channel is bound to the specified vcpu and the
103630+ * binding cannot be changed.
103631+ */
103632+#define EVTCHNOP_bind_virq 1
103633+struct evtchn_bind_virq {
103634+ /* IN parameters. */
103635+ uint32_t virq;
103636+ uint32_t vcpu;
103637+ /* OUT parameters. */
103638+ evtchn_port_t port;
103639+};
103640+typedef struct evtchn_bind_virq evtchn_bind_virq_t;
103641+
103642+/*
103643+ * EVTCHNOP_bind_pirq: Bind a local event channel to PIRQ <irq>.
103644+ * NOTES:
103645+ * 1. A physical IRQ may be bound to at most one event channel per domain.
103646+ * 2. Only a sufficiently-privileged domain may bind to a physical IRQ.
103647+ */
103648+#define EVTCHNOP_bind_pirq 2
103649+struct evtchn_bind_pirq {
103650+ /* IN parameters. */
103651+ uint32_t pirq;
103652+#define BIND_PIRQ__WILL_SHARE 1
103653+ uint32_t flags; /* BIND_PIRQ__* */
103654+ /* OUT parameters. */
103655+ evtchn_port_t port;
103656+};
103657+typedef struct evtchn_bind_pirq evtchn_bind_pirq_t;
103658+
103659+/*
103660+ * EVTCHNOP_bind_ipi: Bind a local event channel to receive events.
103661+ * NOTES:
103662+ * 1. The allocated event channel is bound to the specified vcpu. The binding
103663+ * may not be changed.
103664+ */
103665+#define EVTCHNOP_bind_ipi 7
103666+struct evtchn_bind_ipi {
103667+ uint32_t vcpu;
103668+ /* OUT parameters. */
103669+ evtchn_port_t port;
103670+};
103671+typedef struct evtchn_bind_ipi evtchn_bind_ipi_t;
103672+
103673+/*
103674+ * EVTCHNOP_close: Close a local event channel <port>. If the channel is
103675+ * interdomain then the remote end is placed in the unbound state
103676+ * (EVTCHNSTAT_unbound), awaiting a new connection.
103677+ */
103678+#define EVTCHNOP_close 3
103679+struct evtchn_close {
103680+ /* IN parameters. */
103681+ evtchn_port_t port;
103682+};
103683+typedef struct evtchn_close evtchn_close_t;
103684+
103685+/*
103686+ * EVTCHNOP_send: Send an event to the remote end of the channel whose local
103687+ * endpoint is <port>.
103688+ */
103689+#define EVTCHNOP_send 4
103690+struct evtchn_send {
103691+ /* IN parameters. */
103692+ evtchn_port_t port;
103693+};
103694+typedef struct evtchn_send evtchn_send_t;
103695+
103696+/*
103697+ * EVTCHNOP_status: Get the current status of the communication channel which
103698+ * has an endpoint at <dom, port>.
103699+ * NOTES:
103700+ * 1. <dom> may be specified as DOMID_SELF.
103701+ * 2. Only a sufficiently-privileged domain may obtain the status of an event
103702+ * channel for which <dom> is not DOMID_SELF.
103703+ */
103704+#define EVTCHNOP_status 5
103705+struct evtchn_status {
103706+ /* IN parameters */
103707+ domid_t dom;
103708+ evtchn_port_t port;
103709+ /* OUT parameters */
103710+#define EVTCHNSTAT_closed 0 /* Channel is not in use. */
103711+#define EVTCHNSTAT_unbound 1 /* Channel is waiting interdom connection.*/
103712+#define EVTCHNSTAT_interdomain 2 /* Channel is connected to remote domain. */
103713+#define EVTCHNSTAT_pirq 3 /* Channel is bound to a phys IRQ line. */
103714+#define EVTCHNSTAT_virq 4 /* Channel is bound to a virtual IRQ line */
103715+#define EVTCHNSTAT_ipi 5 /* Channel is bound to a virtual IPI line */
103716+ uint32_t status;
103717+ uint32_t vcpu; /* VCPU to which this channel is bound. */
103718+ union {
103719+ struct {
103720+ domid_t dom;
103721+ } unbound; /* EVTCHNSTAT_unbound */
103722+ struct {
103723+ domid_t dom;
103724+ evtchn_port_t port;
103725+ } interdomain; /* EVTCHNSTAT_interdomain */
103726+ uint32_t pirq; /* EVTCHNSTAT_pirq */
103727+ uint32_t virq; /* EVTCHNSTAT_virq */
103728+ } u;
103729+};
103730+typedef struct evtchn_status evtchn_status_t;
103731+
103732+/*
103733+ * EVTCHNOP_bind_vcpu: Specify which vcpu a channel should notify when an
103734+ * event is pending.
103735+ * NOTES:
103736+ * 1. IPI-bound channels always notify the vcpu specified at bind time.
103737+ * This binding cannot be changed.
103738+ * 2. Per-VCPU VIRQ channels always notify the vcpu specified at bind time.
103739+ * This binding cannot be changed.
103740+ * 3. All other channels notify vcpu0 by default. This default is set when
103741+ * the channel is allocated (a port that is freed and subsequently reused
103742+ * has its binding reset to vcpu0).
103743+ */
103744+#define EVTCHNOP_bind_vcpu 8
103745+struct evtchn_bind_vcpu {
103746+ /* IN parameters. */
103747+ evtchn_port_t port;
103748+ uint32_t vcpu;
103749+};
103750+typedef struct evtchn_bind_vcpu evtchn_bind_vcpu_t;
103751+
103752+/*
103753+ * EVTCHNOP_unmask: Unmask the specified local event-channel port and deliver
103754+ * a notification to the appropriate VCPU if an event is pending.
103755+ */
103756+#define EVTCHNOP_unmask 9
103757+struct evtchn_unmask {
103758+ /* IN parameters. */
103759+ evtchn_port_t port;
103760+};
103761+typedef struct evtchn_unmask evtchn_unmask_t;
103762+
103763+/*
103764+ * Argument to event_channel_op_compat() hypercall. Superseded by new
103765+ * event_channel_op() hypercall since 0x00030202.
103766+ */
103767+struct evtchn_op {
103768+ uint32_t cmd; /* EVTCHNOP_* */
103769+ union {
103770+ struct evtchn_alloc_unbound alloc_unbound;
103771+ struct evtchn_bind_interdomain bind_interdomain;
103772+ struct evtchn_bind_virq bind_virq;
103773+ struct evtchn_bind_pirq bind_pirq;
103774+ struct evtchn_bind_ipi bind_ipi;
103775+ struct evtchn_close close;
103776+ struct evtchn_send send;
103777+ struct evtchn_status status;
103778+ struct evtchn_bind_vcpu bind_vcpu;
103779+ struct evtchn_unmask unmask;
103780+ } u;
103781+};
103782+typedef struct evtchn_op evtchn_op_t;
103783+DEFINE_XEN_GUEST_HANDLE(evtchn_op_t);
103784+
103785+#endif /* __XEN_PUBLIC_EVENT_CHANNEL_H__ */
103786+
103787+/*
103788+ * Local variables:
103789+ * mode: C
103790+ * c-set-style: "BSD"
103791+ * c-basic-offset: 4
103792+ * tab-width: 4
103793+ * indent-tabs-mode: nil
103794+ * End:
103795+ */
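
A sketch of the usual allocate-then-notify pattern: one domain allocates an unbound port that accepts bindings from a given remote domain, the remote side later binds to it with EVTCHNOP_bind_interdomain, and either side kicks the other with EVTCHNOP_send. The two-argument HYPERVISOR_event_channel_op() wrapper is assumed to be available, as used by the Xen drivers elsewhere in this patch.

/* Sketch: allocate an unbound event channel for <remote> and notify it later.
 * HYPERVISOR_event_channel_op(cmd, arg) is assumed from the rest of the patch. */
#include <xen/interface/xen.h>
#include <xen/interface/event_channel.h>

extern int HYPERVISOR_event_channel_op(int cmd, void *arg);   /* assumed */

static int alloc_port_for(domid_t remote, evtchn_port_t *port)
{
    struct evtchn_alloc_unbound alloc = {
        .dom        = DOMID_SELF,   /* allocate in our own table */
        .remote_dom = remote,       /* only <remote> may bind to it */
    };
    int rc = HYPERVISOR_event_channel_op(EVTCHNOP_alloc_unbound, &alloc);

    if (rc == 0)
        *port = alloc.port;
    return rc;
}

static int notify(evtchn_port_t port)
{
    struct evtchn_send send = { .port = port };

    return HYPERVISOR_event_channel_op(EVTCHNOP_send, &send);
}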
103796diff -Nur linux-2.6.16.33-noxen/include/xen/interface/features.h linux-2.6.16.33/include/xen/interface/features.h
103797--- linux-2.6.16.33-noxen/include/xen/interface/features.h 1970-01-01 00:00:00.000000000 +0000
103798+++ linux-2.6.16.33/include/xen/interface/features.h 2007-01-08 15:00:55.000000000 +0000
103799@@ -0,0 +1,71 @@
103800+/******************************************************************************
103801+ * features.h
103802+ *
103803+ * Feature flags, reported by XENVER_get_features.
103804+ *
103805+ * Permission is hereby granted, free of charge, to any person obtaining a copy
103806+ * of this software and associated documentation files (the "Software"), to
103807+ * deal in the Software without restriction, including without limitation the
103808+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
103809+ * sell copies of the Software, and to permit persons to whom the Software is
103810+ * furnished to do so, subject to the following conditions:
103811+ *
103812+ * The above copyright notice and this permission notice shall be included in
103813+ * all copies or substantial portions of the Software.
103814+ *
103815+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
103816+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
103817+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
103818+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
103819+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
103820+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
103821+ * DEALINGS IN THE SOFTWARE.
103822+ *
103823+ * Copyright (c) 2006, Keir Fraser <keir@xensource.com>
103824+ */
103825+
103826+#ifndef __XEN_PUBLIC_FEATURES_H__
103827+#define __XEN_PUBLIC_FEATURES_H__
103828+
103829+/*
103830+ * If set, the guest does not need to write-protect its pagetables, and can
103831+ * update them via direct writes.
103832+ */
103833+#define XENFEAT_writable_page_tables 0
103834+
103835+/*
103836+ * If set, the guest does not need to write-protect its segment descriptor
103837+ * tables, and can update them via direct writes.
103838+ */
103839+#define XENFEAT_writable_descriptor_tables 1
103840+
103841+/*
103842+ * If set, translation between the guest's 'pseudo-physical' address space
103843+ * and the host's machine address space is handled by the hypervisor. In this
103844+ * mode the guest does not need to perform phys-to/from-machine translations
103845+ * when performing page table operations.
103846+ */
103847+#define XENFEAT_auto_translated_physmap 2
103848+
103849+/* If set, the guest is running in supervisor mode (e.g., x86 ring 0). */
103850+#define XENFEAT_supervisor_mode_kernel 3
103851+
103852+/*
103853+ * If set, the guest does not need to allocate x86 PAE page directories
103854+ * below 4GB. This flag is usually implied by auto_translated_physmap.
103855+ */
103856+#define XENFEAT_pae_pgdir_above_4gb 4
103857+
103858+#define XENFEAT_NR_SUBMAPS 1
103859+
103860+#endif /* __XEN_PUBLIC_FEATURES_H__ */
103861+
103862+/*
103863+ * Local variables:
103864+ * mode: C
103865+ * c-set-style: "BSD"
103866+ * c-basic-offset: 4
103867+ * tab-width: 4
103868+ * indent-tabs-mode: nil
103869+ * End:
103870+ */
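
Guests normally query the feature bitmap once at start of day, via the XENVER_get_features version-op, and cache the result. A minimal sketch of that cache-and-test pattern; struct xen_feature_info, XENVER_get_features and the HYPERVISOR_xen_version() wrapper live in the companion headers of this patch, not in features.h itself, so treating them as available here is an assumption.

/* Sketch: cache the XENFEAT_* bitmap once, then test individual bits.
 * xen_feature_info / XENVER_get_features / HYPERVISOR_xen_version are
 * assumed from version.h and the hypercall wrappers in this patch. */
#include <xen/interface/xen.h>
#include <xen/interface/features.h>
#include <xen/interface/version.h>

extern int HYPERVISOR_xen_version(int cmd, void *arg);   /* assumed wrapper */

static uint32_t xen_features[XENFEAT_NR_SUBMAPS];

static void setup_xen_features(void)
{
    struct xen_feature_info fi;
    int i;

    for (i = 0; i < XENFEAT_NR_SUBMAPS; i++) {
        fi.submap_idx = i;
        if (HYPERVISOR_xen_version(XENVER_get_features, &fi) < 0)
            break;
        xen_features[i] = fi.submap;
    }
}

static int xen_feature(unsigned int flag)
{
    return (xen_features[flag / 32] >> (flag % 32)) & 1;
}

/* e.g. skip explicit pte-update hypercalls when direct writes are allowed:
 *     if (xen_feature(XENFEAT_writable_page_tables)) ...                   */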
103871diff -Nur linux-2.6.16.33-noxen/include/xen/interface/grant_table.h linux-2.6.16.33/include/xen/interface/grant_table.h
103872--- linux-2.6.16.33-noxen/include/xen/interface/grant_table.h 1970-01-01 00:00:00.000000000 +0000
103873+++ linux-2.6.16.33/include/xen/interface/grant_table.h 2007-01-08 15:00:55.000000000 +0000
103874@@ -0,0 +1,380 @@
103875+/******************************************************************************
103876+ * grant_table.h
103877+ *
103878+ * Interface for granting foreign access to page frames, and receiving
103879+ * page-ownership transfers.
103880+ *
103881+ * Permission is hereby granted, free of charge, to any person obtaining a copy
103882+ * of this software and associated documentation files (the "Software"), to
103883+ * deal in the Software without restriction, including without limitation the
103884+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
103885+ * sell copies of the Software, and to permit persons to whom the Software is
103886+ * furnished to do so, subject to the following conditions:
103887+ *
103888+ * The above copyright notice and this permission notice shall be included in
103889+ * all copies or substantial portions of the Software.
103890+ *
103891+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
103892+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
103893+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
103894+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
103895+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
103896+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
103897+ * DEALINGS IN THE SOFTWARE.
103898+ *
103899+ * Copyright (c) 2004, K A Fraser
103900+ */
103901+
103902+#ifndef __XEN_PUBLIC_GRANT_TABLE_H__
103903+#define __XEN_PUBLIC_GRANT_TABLE_H__
103904+
103905+
103906+/***********************************
103907+ * GRANT TABLE REPRESENTATION
103908+ */
103909+
103910+/* Some rough guidelines on accessing and updating grant-table entries
103911+ * in a concurrency-safe manner. For more information, Linux contains a
103912+ * reference implementation for guest OSes (arch/xen/kernel/grant_table.c).
103913+ *
103914+ * NB. WMB is a no-op on current-generation x86 processors. However, a
103915+ * compiler barrier will still be required.
103916+ *
103917+ * Introducing a valid entry into the grant table:
103918+ * 1. Write ent->domid.
103919+ * 2. Write ent->frame:
103920+ * GTF_permit_access: Frame to which access is permitted.
103921+ * GTF_accept_transfer: Pseudo-phys frame slot being filled by new
103922+ * frame, or zero if none.
103923+ * 3. Write memory barrier (WMB).
103924+ * 4. Write ent->flags, inc. valid type.
103925+ *
103926+ * Invalidating an unused GTF_permit_access entry:
103927+ * 1. flags = ent->flags.
103928+ * 2. Observe that !(flags & (GTF_reading|GTF_writing)).
103929+ * 3. Check result of SMP-safe CMPXCHG(&ent->flags, flags, 0).
103930+ * NB. No need for WMB as reuse of entry is control-dependent on success of
103931+ * step 3, and all architectures guarantee ordering of ctrl-dep writes.
103932+ *
103933+ * Invalidating an in-use GTF_permit_access entry:
103934+ * This cannot be done directly. Request assistance from the domain controller
103935+ * which can set a timeout on the use of a grant entry and take necessary
103936+ * action. (NB. This is not yet implemented!).
103937+ *
103938+ * Invalidating an unused GTF_accept_transfer entry:
103939+ * 1. flags = ent->flags.
103940+ * 2. Observe that !(flags & GTF_transfer_committed). [*]
103941+ * 3. Check result of SMP-safe CMPXCHG(&ent->flags, flags, 0).
103942+ * NB. No need for WMB as reuse of entry is control-dependent on success of
103943+ * step 3, and all architectures guarantee ordering of ctrl-dep writes.
103944+ * [*] If GTF_transfer_committed is set then the grant entry is 'committed'.
103945+ * The guest must /not/ modify the grant entry until the address of the
103946+ * transferred frame is written. It is safe for the guest to spin waiting
103947+ * for this to occur (detect by observing GTF_transfer_completed in
103948+ * ent->flags).
103949+ *
103950+ * Invalidating a committed GTF_accept_transfer entry:
103951+ * 1. Wait for (ent->flags & GTF_transfer_completed).
103952+ *
103953+ * Changing a GTF_permit_access from writable to read-only:
103954+ * Use SMP-safe CMPXCHG to set GTF_readonly, while checking !GTF_writing.
103955+ *
103956+ * Changing a GTF_permit_access from read-only to writable:
103957+ * Use SMP-safe bit-setting instruction.
103958+ */
103959+
103960+/*
103961+ * A grant table comprises a packed array of grant entries in one or more
103962+ * page frames shared between Xen and a guest.
103963+ * [XEN]: This field is written by Xen and read by the sharing guest.
103964+ * [GST]: This field is written by the guest and read by Xen.
103965+ */
103966+struct grant_entry {
103967+ /* GTF_xxx: various type and flag information. [XEN,GST] */
103968+ uint16_t flags;
103969+ /* The domain being granted foreign privileges. [GST] */
103970+ domid_t domid;
103971+ /*
103972+ * GTF_permit_access: Frame that @domid is allowed to map and access. [GST]
103973+ * GTF_accept_transfer: Frame whose ownership transferred by @domid. [XEN]
103974+ */
103975+ uint32_t frame;
103976+};
103977+typedef struct grant_entry grant_entry_t;
103978+
103979+/*
103980+ * Type of grant entry.
103981+ * GTF_invalid: This grant entry grants no privileges.
103982+ * GTF_permit_access: Allow @domid to map/access @frame.
103983+ * GTF_accept_transfer: Allow @domid to transfer ownership of one page frame
103984+ * to this guest. Xen writes the page number to @frame.
103985+ */
103986+#define GTF_invalid (0U<<0)
103987+#define GTF_permit_access (1U<<0)
103988+#define GTF_accept_transfer (2U<<0)
103989+#define GTF_type_mask (3U<<0)
103990+
103991+/*
103992+ * Subflags for GTF_permit_access.
103993+ * GTF_readonly: Restrict @domid to read-only mappings and accesses. [GST]
103994+ * GTF_reading: Grant entry is currently mapped for reading by @domid. [XEN]
103995+ * GTF_writing: Grant entry is currently mapped for writing by @domid. [XEN]
103996+ */
103997+#define _GTF_readonly (2)
103998+#define GTF_readonly (1U<<_GTF_readonly)
103999+#define _GTF_reading (3)
104000+#define GTF_reading (1U<<_GTF_reading)
104001+#define _GTF_writing (4)
104002+#define GTF_writing (1U<<_GTF_writing)
104003+
104004+/*
104005+ * Subflags for GTF_accept_transfer:
104006+ * GTF_transfer_committed: Xen sets this flag to indicate that it is committed
104007+ * to transferring ownership of a page frame. When a guest sees this flag
104008+ * it must /not/ modify the grant entry until GTF_transfer_completed is
104009+ * set by Xen.
104010+ * GTF_transfer_completed: It is safe for the guest to spin-wait on this flag
104011+ * after reading GTF_transfer_committed. Xen will always write the frame
104012+ * address, followed by ORing this flag, in a timely manner.
104013+ */
104014+#define _GTF_transfer_committed (2)
104015+#define GTF_transfer_committed (1U<<_GTF_transfer_committed)
104016+#define _GTF_transfer_completed (3)
104017+#define GTF_transfer_completed (1U<<_GTF_transfer_completed)
104018+
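
Concretely, the "introducing a valid entry" sequence described above amounts to two plain stores separated by a write barrier, with the flags written last. A sketch under two assumptions: gnttab_shared points at the guest's mapped grant-table frames, and wmb() is the kernel's write barrier (per the note above, a compiler barrier suffices on current x86).

/* Sketch: grant <domid> read-only access to <frame> via entry <ref>.
 * gnttab_shared (the mapped table) and the wmb() stand-in are assumptions. */

/* Compiler barrier; real kernel code would use its own wmb(). */
#define wmb() __asm__ __volatile__("" ::: "memory")

static struct grant_entry *gnttab_shared;   /* assumed mapping of the table */

static void grant_foreign_access_readonly(grant_ref_t ref, domid_t domid,
                                          uint32_t frame)
{
    struct grant_entry *ent = &gnttab_shared[ref];

    ent->domid = domid;     /* step 1 */
    ent->frame = frame;     /* step 2: frame to which access is permitted */
    wmb();                  /* step 3: order the stores before the flags */
    ent->flags = GTF_permit_access | GTF_readonly;   /* step 4: entry valid */
}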
104019+
104020+/***********************************
104021+ * GRANT TABLE QUERIES AND USES
104022+ */
104023+
104024+/*
104025+ * Reference to a grant entry in a specified domain's grant table.
104026+ */
104027+typedef uint32_t grant_ref_t;
104028+
104029+/*
104030+ * Handle to track a mapping created via a grant reference.
104031+ */
104032+typedef uint32_t grant_handle_t;
104033+
104034+/*
104035+ * GNTTABOP_map_grant_ref: Map the grant entry (<dom>,<ref>) for access
104036+ * by devices and/or host CPUs. If successful, <handle> is a tracking number
104037+ * that must be presented later to destroy the mapping(s). On error, <handle>
104038+ * is a negative status code.
104039+ * NOTES:
104040+ * 1. If GNTMAP_device_map is specified then <dev_bus_addr> is the address
104041+ * via which I/O devices may access the granted frame.
104042+ * 2. If GNTMAP_host_map is specified then a mapping will be added at
104043+ * either a host virtual address in the current address space, or at
104044+ * a PTE at the specified machine address. The type of mapping to
104045+ * perform is selected through the GNTMAP_contains_pte flag, and the
104046+ * address is specified in <host_addr>.
104047+ * 3. Mappings should only be destroyed via GNTTABOP_unmap_grant_ref. If a
104048+ * host mapping is destroyed by other means then it is *NOT* guaranteed
104049+ * to be accounted to the correct grant reference!
104050+ */
104051+#define GNTTABOP_map_grant_ref 0
104052+struct gnttab_map_grant_ref {
104053+ /* IN parameters. */
104054+ uint64_t host_addr;
104055+ uint32_t flags; /* GNTMAP_* */
104056+ grant_ref_t ref;
104057+ domid_t dom;
104058+ /* OUT parameters. */
104059+ int16_t status; /* GNTST_* */
104060+ grant_handle_t handle;
104061+ uint64_t dev_bus_addr;
104062+};
104063+typedef struct gnttab_map_grant_ref gnttab_map_grant_ref_t;
104064+DEFINE_XEN_GUEST_HANDLE(gnttab_map_grant_ref_t);
104065+
104066+/*
104067+ * GNTTABOP_unmap_grant_ref: Destroy one or more grant-reference mappings
104068+ * tracked by <handle>. If <host_addr> or <dev_bus_addr> is zero, that
104069+ * field is ignored. If non-zero, they must refer to a device/host mapping
104070+ * that is tracked by <handle>
104071+ * NOTES:
104072+ * 1. The call may fail in an undefined manner if either mapping is not
104073+ * tracked by <handle>.
104074+ * 2. After executing a batch of unmaps, it is guaranteed that no stale
104075+ * mappings will remain in the device or host TLBs.
104076+ */
104077+#define GNTTABOP_unmap_grant_ref 1
104078+struct gnttab_unmap_grant_ref {
104079+ /* IN parameters. */
104080+ uint64_t host_addr;
104081+ uint64_t dev_bus_addr;
104082+ grant_handle_t handle;
104083+ /* OUT parameters. */
104084+ int16_t status; /* GNTST_* */
104085+};
104086+typedef struct gnttab_unmap_grant_ref gnttab_unmap_grant_ref_t;
104087+DEFINE_XEN_GUEST_HANDLE(gnttab_unmap_grant_ref_t);
104088+
104089+/*
104090+ * GNTTABOP_setup_table: Set up a grant table for <dom> comprising at least
104091+ * <nr_frames> pages. The frame addresses are written to the <frame_list>.
104092+ * Only <nr_frames> addresses are written, even if the table is larger.
104093+ * NOTES:
104094+ * 1. <dom> may be specified as DOMID_SELF.
104095+ * 2. Only a sufficiently-privileged domain may specify <dom> != DOMID_SELF.
104096+ * 3. Xen may not support more than a single grant-table page per domain.
104097+ */
104098+#define GNTTABOP_setup_table 2
104099+struct gnttab_setup_table {
104100+ /* IN parameters. */
104101+ domid_t dom;
104102+ uint32_t nr_frames;
104103+ /* OUT parameters. */
104104+ int16_t status; /* GNTST_* */
104105+ XEN_GUEST_HANDLE(ulong) frame_list;
104106+};
104107+typedef struct gnttab_setup_table gnttab_setup_table_t;
104108+DEFINE_XEN_GUEST_HANDLE(gnttab_setup_table_t);
104109+
104110+/*
104111+ * GNTTABOP_dump_table: Dump the contents of the grant table to the
104112+ * xen console. Debugging use only.
104113+ */
104114+#define GNTTABOP_dump_table 3
104115+struct gnttab_dump_table {
104116+ /* IN parameters. */
104117+ domid_t dom;
104118+ /* OUT parameters. */
104119+ int16_t status; /* GNTST_* */
104120+};
104121+typedef struct gnttab_dump_table gnttab_dump_table_t;
104122+DEFINE_XEN_GUEST_HANDLE(gnttab_dump_table_t);
104123+
104124+/*
104125+ * GNTTABOP_transfer_grant_ref: Transfer <frame> to a foreign domain. The
104126+ * foreign domain has previously registered its interest in the transfer via
104127+ * <domid, ref>.
104128+ *
104129+ * Note that, even if the transfer fails, the specified page no longer belongs
104130+ * to the calling domain *unless* the error is GNTST_bad_page.
104131+ */
104132+#define GNTTABOP_transfer 4
104133+struct gnttab_transfer {
104134+ /* IN parameters. */
104135+ xen_pfn_t mfn;
104136+ domid_t domid;
104137+ grant_ref_t ref;
104138+ /* OUT parameters. */
104139+ int16_t status;
104140+};
104141+typedef struct gnttab_transfer gnttab_transfer_t;
104142+DEFINE_XEN_GUEST_HANDLE(gnttab_transfer_t);
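On the sending side a transfer is a single batched operation; the sketch below (hypothetical mfn/peer/ref values, same hypercall-wrapper assumption as above) shows its shape:

    /* Sketch: give page <mfn> away to domain <peer> via transfer ref <ref>. */
    struct gnttab_transfer xfer;

    xfer.mfn   = mfn;      /* page being handed over */
    xfer.domid = peer;     /* receiving domain */
    xfer.ref   = ref;      /* transfer reference advertised by the peer */

    HYPERVISOR_grant_table_op(GNTTABOP_transfer, &xfer, 1);
    /* On return, xfer.status is GNTST_okay on success; note that the page is
     * gone from this domain either way unless status is GNTST_bad_page. */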
104143+
104144+
104145+/*
104146+ * GNTTABOP_copy: Hypervisor-based copy.
104147+ * Source and destination can be either MFNs or, for foreign domains,
104148+ * grant references. The foreign domain has to grant read/write access
104149+ * in its grant table.
104150+ *
104151+ * The flags specify what type source and destinations are (either MFN
104152+ * or grant reference).
104153+ *
104154+ * Note that this can also be used to copy data between two domains
104155+ * via a third party if the source and destination domains had previously
104156+ * granted appropriate access to their pages to the third party.
104157+ *
104158+ * source_offset specifies an offset in the source frame, dest_offset
104159+ * the offset in the target frame and len specifies the number of
104160+ * bytes to be copied.
104161+ */
104162+
104163+#define _GNTCOPY_source_gref (0)
104164+#define GNTCOPY_source_gref (1<<_GNTCOPY_source_gref)
104165+#define _GNTCOPY_dest_gref (1)
104166+#define GNTCOPY_dest_gref (1<<_GNTCOPY_dest_gref)
104167+
104168+#define GNTTABOP_copy 5
104169+typedef struct gnttab_copy {
104170+ /* IN parameters. */
104171+ struct {
104172+ union {
104173+ grant_ref_t ref;
104174+ xen_pfn_t gmfn;
104175+ } u;
104176+ domid_t domid;
104177+ uint16_t offset;
104178+ } source, dest;
104179+ uint16_t len;
104180+ uint16_t flags; /* GNTCOPY_* */
104181+ /* OUT parameters. */
104182+ int16_t status;
104183+} gnttab_copy_t;
104184+DEFINE_XEN_GUEST_HANDLE(gnttab_copy_t);
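As a minimal sketch of a hypervisor copy, assuming a local machine frame local_mfn and a read-granted reference remote_ref from domain otherdom (all hypothetical names), copying the first 512 bytes of the foreign page could look like:

    /* Sketch: copy 512 bytes from a foreign grant into a local frame. */
    struct gnttab_copy copy;

    copy.source.u.ref  = remote_ref;          /* foreign page, by grant ref */
    copy.source.domid  = otherdom;
    copy.source.offset = 0;
    copy.dest.u.gmfn   = local_mfn;           /* local page, by frame number */
    copy.dest.domid    = DOMID_SELF;
    copy.dest.offset   = 0;
    copy.len           = 512;
    copy.flags         = GNTCOPY_source_gref; /* only the source is a gref */

    if (HYPERVISOR_grant_table_op(GNTTABOP_copy, &copy, 1) != 0 ||
        copy.status != GNTST_okay) {
        /* handle error: copy.status holds a GNTST_* code */
    }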
104185+
104186+
104187+/*
104188+ * Bitfield values for update_pin_status.flags.
104189+ */
104190+ /* Map the grant entry for access by I/O devices. */
104191+#define _GNTMAP_device_map (0)
104192+#define GNTMAP_device_map (1<<_GNTMAP_device_map)
104193+ /* Map the grant entry for access by host CPUs. */
104194+#define _GNTMAP_host_map (1)
104195+#define GNTMAP_host_map (1<<_GNTMAP_host_map)
104196+ /* Accesses to the granted frame will be restricted to read-only access. */
104197+#define _GNTMAP_readonly (2)
104198+#define GNTMAP_readonly (1<<_GNTMAP_readonly)
104199+ /*
104200+ * GNTMAP_host_map subflag:
104201+ * 0 => The host mapping is usable only by the guest OS.
104202+ * 1 => The host mapping is usable by guest OS + current application.
104203+ */
104204+#define _GNTMAP_application_map (3)
104205+#define GNTMAP_application_map (1<<_GNTMAP_application_map)
104206+
104207+ /*
104208+ * GNTMAP_contains_pte subflag:
104209+ * 0 => This map request contains a host virtual address.
104210+ * 1 => This map request contains the machine address of the PTE to update.
104211+ */
104212+#define _GNTMAP_contains_pte (4)
104213+#define GNTMAP_contains_pte (1<<_GNTMAP_contains_pte)
104214+
104215+/*
104216+ * Values for error status returns. All errors are -ve.
104217+ */
104218+#define GNTST_okay (0) /* Normal return. */
104219+#define GNTST_general_error (-1) /* General undefined error. */
104220+#define GNTST_bad_domain (-2) /* Unrecognised domain id. */
104221+#define GNTST_bad_gntref (-3) /* Unrecognised or inappropriate gntref. */
104222+#define GNTST_bad_handle (-4) /* Unrecognised or inappropriate handle. */
104223+#define GNTST_bad_virt_addr (-5) /* Inappropriate virtual address to map. */
104224+#define GNTST_bad_dev_addr (-6) /* Inappropriate device address to unmap.*/
104225+#define GNTST_no_device_space (-7) /* Out of space in I/O MMU. */
104226+#define GNTST_permission_denied (-8) /* Not enough privilege for operation. */
104227+#define GNTST_bad_page (-9) /* Specified page was invalid for op. */
104228+#define GNTST_bad_copy_arg (-10) /* copy arguments cross page boundary */
104229+
104230+#define GNTTABOP_error_msgs { \
104231+ "okay", \
104232+ "undefined error", \
104233+ "unrecognised domain id", \
104234+ "invalid grant reference", \
104235+ "invalid mapping handle", \
104236+ "invalid virtual address", \
104237+ "invalid device address", \
104238+ "no spare translation slot in the I/O MMU", \
104239+ "permission denied", \
104240+ "bad page", \
104241+ "copy arguments cross page boundary" \
104242+}
104243+
104244+#endif /* __XEN_PUBLIC_GRANT_TABLE_H__ */
104245+
104246+/*
104247+ * Local variables:
104248+ * mode: C
104249+ * c-set-style: "BSD"
104250+ * c-basic-offset: 4
104251+ * tab-width: 4
104252+ * indent-tabs-mode: nil
104253+ * End:
104254+ */
104255diff -Nur linux-2.6.16.33-noxen/include/xen/interface/hvm/e820.h linux-2.6.16.33/include/xen/interface/hvm/e820.h
104256--- linux-2.6.16.33-noxen/include/xen/interface/hvm/e820.h 1970-01-01 00:00:00.000000000 +0000
104257+++ linux-2.6.16.33/include/xen/interface/hvm/e820.h 2007-01-08 15:00:55.000000000 +0000
104258@@ -0,0 +1,47 @@
104259+
104260+/*
104261+ * Permission is hereby granted, free of charge, to any person obtaining a copy
104262+ * of this software and associated documentation files (the "Software"), to
104263+ * deal in the Software without restriction, including without limitation the
104264+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
104265+ * sell copies of the Software, and to permit persons to whom the Software is
104266+ * furnished to do so, subject to the following conditions:
104267+ *
104268+ * The above copyright notice and this permission notice shall be included in
104269+ * all copies or substantial portions of the Software.
104270+ *
104271+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
104272+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
104273+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
104274+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
104275+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
104276+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
104277+ * DEALINGS IN THE SOFTWARE.
104278+ */
104279+
104280+#ifndef __XEN_PUBLIC_HVM_E820_H__
104281+#define __XEN_PUBLIC_HVM_E820_H__
104282+
104283+/* PC BIOS standard E820 types. */
104284+#define E820_RAM 1
104285+#define E820_RESERVED 2
104286+#define E820_ACPI 3
104287+#define E820_NVS 4
104288+
104289+/* E820 location in HVM virtual address space. */
104290+#define E820_MAP_PAGE 0x00090000
104291+#define E820_MAP_NR_OFFSET 0x000001E8
104292+#define E820_MAP_OFFSET 0x000002D0
104293+
104294+struct e820entry {
104295+ uint64_t addr;
104296+ uint64_t size;
104297+ uint32_t type;
104298+} __attribute__((packed));
104299+
104300+#define HVM_BELOW_4G_RAM_END 0xF0000000
104301+
104302+#define HVM_BELOW_4G_MMIO_START HVM_BELOW_4G_RAM_END
104303+#define HVM_BELOW_4G_MMIO_LENGTH ((1ULL << 32) - HVM_BELOW_4G_MMIO_START)
104304+
104305+#endif /* __XEN_PUBLIC_HVM_E820_H__ */
104306diff -Nur linux-2.6.16.33-noxen/include/xen/interface/hvm/hvm_info_table.h linux-2.6.16.33/include/xen/interface/hvm/hvm_info_table.h
104307--- linux-2.6.16.33-noxen/include/xen/interface/hvm/hvm_info_table.h 1970-01-01 00:00:00.000000000 +0000
104308+++ linux-2.6.16.33/include/xen/interface/hvm/hvm_info_table.h 2007-01-08 15:00:55.000000000 +0000
104309@@ -0,0 +1,41 @@
104310+/******************************************************************************
104311+ * hvm/hvm_info_table.h
104312+ *
104313+ * HVM parameter and information table, written into guest memory map.
104314+ *
104315+ * Permission is hereby granted, free of charge, to any person obtaining a copy
104316+ * of this software and associated documentation files (the "Software"), to
104317+ * deal in the Software without restriction, including without limitation the
104318+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
104319+ * sell copies of the Software, and to permit persons to whom the Software is
104320+ * furnished to do so, subject to the following conditions:
104321+ *
104322+ * The above copyright notice and this permission notice shall be included in
104323+ * all copies or substantial portions of the Software.
104324+ *
104325+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
104326+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
104327+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
104328+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
104329+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
104330+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
104331+ * DEALINGS IN THE SOFTWARE.
104332+ */
104333+
104334+#ifndef __XEN_PUBLIC_HVM_HVM_INFO_TABLE_H__
104335+#define __XEN_PUBLIC_HVM_HVM_INFO_TABLE_H__
104336+
104337+#define HVM_INFO_PFN 0x09F
104338+#define HVM_INFO_OFFSET 0x800
104339+#define HVM_INFO_PADDR ((HVM_INFO_PFN << 12) + HVM_INFO_OFFSET)
104340+
104341+struct hvm_info_table {
104342+ char signature[8]; /* "HVM INFO" */
104343+ uint32_t length;
104344+ uint8_t checksum;
104345+ uint8_t acpi_enabled;
104346+ uint8_t apic_mode;
104347+ uint32_t nr_vcpus;
104348+};
104349+
104350+#endif /* __XEN_PUBLIC_HVM_HVM_INFO_TABLE_H__ */
104351diff -Nur linux-2.6.16.33-noxen/include/xen/interface/hvm/hvm_op.h linux-2.6.16.33/include/xen/interface/hvm/hvm_op.h
104352--- linux-2.6.16.33-noxen/include/xen/interface/hvm/hvm_op.h 1970-01-01 00:00:00.000000000 +0000
104353+++ linux-2.6.16.33/include/xen/interface/hvm/hvm_op.h 2007-01-08 15:00:55.000000000 +0000
104354@@ -0,0 +1,53 @@
104355+#ifndef __XEN_PUBLIC_HVM_HVM_OP_H__
104356+#define __XEN_PUBLIC_HVM_HVM_OP_H__
104357+
104358+/* Get/set subcommands: extra argument == pointer to xen_hvm_param struct. */
104359+#define HVMOP_set_param 0
104360+#define HVMOP_get_param 1
104361+struct xen_hvm_param {
104362+ domid_t domid; /* IN */
104363+ uint32_t index; /* IN */
104364+ uint64_t value; /* IN/OUT */
104365+};
104366+typedef struct xen_hvm_param xen_hvm_param_t;
104367+DEFINE_XEN_GUEST_HANDLE(xen_hvm_param_t);
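To illustrate the get/set pair (a sketch only, assuming the HYPERVISOR_hvm_op() wrapper added by this patch and the parameter indices from hvm/params.h further below), an HVM guest could read one of its own parameters like this:

    /* Sketch: read this domain's xenstore event-channel parameter. */
    static int example_get_store_evtchn(uint64_t *out)
    {
        struct xen_hvm_param p;
        int rc;

        p.domid = DOMID_SELF;
        p.index = HVM_PARAM_STORE_EVTCHN;   /* defined in hvm/params.h */

        rc = HYPERVISOR_hvm_op(HVMOP_get_param, &p);
        if (rc == 0)
            *out = p.value;                 /* value is IN/OUT */
        return rc;
    }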
104368+
104369+/* Set the logical level of one of a domain's PCI INTx wires. */
104370+#define HVMOP_set_pci_intx_level 2
104371+struct xen_hvm_set_pci_intx_level {
104372+ /* Domain to be updated. */
104373+ domid_t domid;
104374+ /* PCI INTx identification in PCI topology (domain:bus:device:intx). */
104375+ uint8_t domain, bus, device, intx;
104376+ /* Assertion level (0 = unasserted, 1 = asserted). */
104377+ uint8_t level;
104378+};
104379+typedef struct xen_hvm_set_pci_intx_level xen_hvm_set_pci_intx_level_t;
104380+DEFINE_XEN_GUEST_HANDLE(xen_hvm_set_pci_intx_level_t);
104381+
104382+/* Set the logical level of one of a domain's ISA IRQ wires. */
104383+#define HVMOP_set_isa_irq_level 3
104384+struct xen_hvm_set_isa_irq_level {
104385+ /* Domain to be updated. */
104386+ domid_t domid;
104387+ /* ISA device identification, by ISA IRQ (0-15). */
104388+ uint8_t isa_irq;
104389+ /* Assertion level (0 = unasserted, 1 = asserted). */
104390+ uint8_t level;
104391+};
104392+typedef struct xen_hvm_set_isa_irq_level xen_hvm_set_isa_irq_level_t;
104393+DEFINE_XEN_GUEST_HANDLE(xen_hvm_set_isa_irq_level_t);
104394+
104395+#define HVMOP_set_pci_link_route 4
104396+struct xen_hvm_set_pci_link_route {
104397+ /* Domain to be updated. */
104398+ domid_t domid;
104399+ /* PCI link identifier (0-3). */
104400+ uint8_t link;
104401+ /* ISA IRQ (1-15), or 0 (disable link). */
104402+ uint8_t isa_irq;
104403+};
104404+typedef struct xen_hvm_set_pci_link_route xen_hvm_set_pci_link_route_t;
104405+DEFINE_XEN_GUEST_HANDLE(xen_hvm_set_pci_link_route_t);
104406+
104407+#endif /* __XEN_PUBLIC_HVM_HVM_OP_H__ */
104408diff -Nur linux-2.6.16.33-noxen/include/xen/interface/hvm/ioreq.h linux-2.6.16.33/include/xen/interface/hvm/ioreq.h
104409--- linux-2.6.16.33-noxen/include/xen/interface/hvm/ioreq.h 1970-01-01 00:00:00.000000000 +0000
104410+++ linux-2.6.16.33/include/xen/interface/hvm/ioreq.h 2007-01-08 15:00:55.000000000 +0000
104411@@ -0,0 +1,97 @@
104412+/*
104413+ * ioreq.h: I/O request definitions for device models
104414+ * Copyright (c) 2004, Intel Corporation.
104415+ *
104416+ * Permission is hereby granted, free of charge, to any person obtaining a copy
104417+ * of this software and associated documentation files (the "Software"), to
104418+ * deal in the Software without restriction, including without limitation the
104419+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
104420+ * sell copies of the Software, and to permit persons to whom the Software is
104421+ * furnished to do so, subject to the following conditions:
104422+ *
104423+ * The above copyright notice and this permission notice shall be included in
104424+ * all copies or substantial portions of the Software.
104425+ *
104426+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
104427+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
104428+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
104429+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
104430+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
104431+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
104432+ * DEALINGS IN THE SOFTWARE.
104433+ */
104434+
104435+#ifndef _IOREQ_H_
104436+#define _IOREQ_H_
104437+
104438+#define IOREQ_READ 1
104439+#define IOREQ_WRITE 0
104440+
104441+#define STATE_IOREQ_NONE 0
104442+#define STATE_IOREQ_READY 1
104443+#define STATE_IOREQ_INPROCESS 2
104444+#define STATE_IORESP_READY 3
104445+
104446+#define IOREQ_TYPE_PIO 0 /* pio */
104447+#define IOREQ_TYPE_COPY 1 /* mmio ops */
104448+#define IOREQ_TYPE_AND 2
104449+#define IOREQ_TYPE_OR 3
104450+#define IOREQ_TYPE_XOR 4
104451+#define IOREQ_TYPE_XCHG 5
104452+#define IOREQ_TYPE_ADD 6
104453+
104454+/*
104455+ * The VMExit dispatcher should cooperate with the instruction decoder to
104456+ * prepare this structure and notify the service OS and DM by sending
104457+ * a virq.
104458+ */
104459+struct ioreq {
104460+ uint64_t addr; /* physical address */
104461+ uint64_t size; /* size in bytes */
104462+ uint64_t count; /* for rep prefixes */
104463+ uint64_t data; /* data (or paddr of data) */
104464+ uint8_t state:4;
104465+ uint8_t data_is_ptr:1; /* if 1, data above is the guest paddr
104466+ * of the real data to use. */
104467+ uint8_t dir:1; /* 1=read, 0=write */
104468+ uint8_t df:1;
104469+ uint8_t type; /* I/O type */
104470+ uint64_t io_count; /* How many IO done on a vcpu */
104471+};
104472+typedef struct ioreq ioreq_t;
104473+
104474+struct vcpu_iodata {
104475+ struct ioreq vp_ioreq;
104476+ /* Event channel port */
104477+ unsigned int vp_eport; /* VMX vcpu uses this to notify DM */
104478+};
104479+typedef struct vcpu_iodata vcpu_iodata_t;
104480+
104481+struct shared_iopage {
104482+ struct vcpu_iodata vcpu_iodata[1];
104483+};
104484+typedef struct shared_iopage shared_iopage_t;
104485+
104486+#define IOREQ_BUFFER_SLOT_NUM 80
104487+struct buffered_iopage {
104488+ unsigned long read_pointer;
104489+ unsigned long write_pointer;
104490+ ioreq_t ioreq[IOREQ_BUFFER_SLOT_NUM];
104491+}; /* the size of this structure must not exceed one page */
104492+typedef struct buffered_iopage buffered_iopage_t;
104493+
104494+#define ACPI_PM1A_EVT_BLK_ADDRESS 0x0000000000001f40
104495+#define ACPI_PM1A_CNT_BLK_ADDRESS (ACPI_PM1A_EVT_BLK_ADDRESS + 0x04)
104496+#define ACPI_PM_TMR_BLK_ADDRESS (ACPI_PM1A_EVT_BLK_ADDRESS + 0x08)
104497+
104498+#endif /* _IOREQ_H_ */
104499+
104500+/*
104501+ * Local variables:
104502+ * mode: C
104503+ * c-set-style: "BSD"
104504+ * c-basic-offset: 4
104505+ * tab-width: 4
104506+ * indent-tabs-mode: nil
104507+ * End:
104508+ */
104509diff -Nur linux-2.6.16.33-noxen/include/xen/interface/hvm/params.h linux-2.6.16.33/include/xen/interface/hvm/params.h
104510--- linux-2.6.16.33-noxen/include/xen/interface/hvm/params.h 1970-01-01 00:00:00.000000000 +0000
104511+++ linux-2.6.16.33/include/xen/interface/hvm/params.h 2007-01-08 15:00:55.000000000 +0000
104512@@ -0,0 +1,36 @@
104513+
104514+/*
104515+ * Permission is hereby granted, free of charge, to any person obtaining a copy
104516+ * of this software and associated documentation files (the "Software"), to
104517+ * deal in the Software without restriction, including without limitation the
104518+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
104519+ * sell copies of the Software, and to permit persons to whom the Software is
104520+ * furnished to do so, subject to the following conditions:
104521+ *
104522+ * The above copyright notice and this permission notice shall be included in
104523+ * all copies or substantial portions of the Software.
104524+ *
104525+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
104526+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
104527+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
104528+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
104529+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
104530+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
104531+ * DEALINGS IN THE SOFTWARE.
104532+ */
104533+
104534+#ifndef __XEN_PUBLIC_HVM_PARAMS_H__
104535+#define __XEN_PUBLIC_HVM_PARAMS_H__
104536+
104537+#include "hvm_op.h"
104538+
104539+/* Parameter space for HVMOP_{set,get}_param. */
104540+#define HVM_PARAM_CALLBACK_IRQ 0
104541+#define HVM_PARAM_STORE_PFN 1
104542+#define HVM_PARAM_STORE_EVTCHN 2
104543+#define HVM_PARAM_PAE_ENABLED 4
104544+#define HVM_PARAM_IOREQ_PFN 5
104545+#define HVM_PARAM_BUFIOREQ_PFN 6
104546+#define HVM_NR_PARAMS 7
104547+
104548+#endif /* __XEN_PUBLIC_HVM_PARAMS_H__ */
104549diff -Nur linux-2.6.16.33-noxen/include/xen/interface/hvm/vmx_assist.h linux-2.6.16.33/include/xen/interface/hvm/vmx_assist.h
104550--- linux-2.6.16.33-noxen/include/xen/interface/hvm/vmx_assist.h 1970-01-01 00:00:00.000000000 +0000
104551+++ linux-2.6.16.33/include/xen/interface/hvm/vmx_assist.h 2007-01-08 15:00:55.000000000 +0000
104552@@ -0,0 +1,116 @@
104553+/*
104554+ * vmx_assist.h: Context definitions for the VMXASSIST world switch.
104555+ *
104556+ * Permission is hereby granted, free of charge, to any person obtaining a copy
104557+ * of this software and associated documentation files (the "Software"), to
104558+ * deal in the Software without restriction, including without limitation the
104559+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
104560+ * sell copies of the Software, and to permit persons to whom the Software is
104561+ * furnished to do so, subject to the following conditions:
104562+ *
104563+ * The above copyright notice and this permission notice shall be included in
104564+ * all copies or substantial portions of the Software.
104565+ *
104566+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
104567+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
104568+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
104569+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
104570+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
104571+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
104572+ * DEALINGS IN THE SOFTWARE.
104573+ *
104574+ * Leendert van Doorn, leendert@watson.ibm.com
104575+ * Copyright (c) 2005, International Business Machines Corporation.
104576+ */
104577+
104578+#ifndef _VMX_ASSIST_H_
104579+#define _VMX_ASSIST_H_
104580+
104581+#define VMXASSIST_BASE 0xD0000
104582+#define VMXASSIST_MAGIC 0x17101966
104583+#define VMXASSIST_MAGIC_OFFSET (VMXASSIST_BASE+8)
104584+
104585+#define VMXASSIST_NEW_CONTEXT (VMXASSIST_BASE + 12)
104586+#define VMXASSIST_OLD_CONTEXT (VMXASSIST_NEW_CONTEXT + 4)
104587+
104588+#ifndef __ASSEMBLY__
104589+
104590+union vmcs_arbytes {
104591+ struct arbyte_fields {
104592+ unsigned int seg_type : 4,
104593+ s : 1,
104594+ dpl : 2,
104595+ p : 1,
104596+ reserved0 : 4,
104597+ avl : 1,
104598+ reserved1 : 1,
104599+ default_ops_size: 1,
104600+ g : 1,
104601+ null_bit : 1,
104602+ reserved2 : 15;
104603+ } fields;
104604+ unsigned int bytes;
104605+};
104606+
104607+/*
104608+ * World switch state
104609+ */
104610+struct vmx_assist_context {
104611+ uint32_t eip; /* execution pointer */
104612+ uint32_t esp; /* stack pointer */
104613+ uint32_t eflags; /* flags register */
104614+ uint32_t cr0;
104615+ uint32_t cr3; /* page table directory */
104616+ uint32_t cr4;
104617+ uint32_t idtr_limit; /* idt */
104618+ uint32_t idtr_base;
104619+ uint32_t gdtr_limit; /* gdt */
104620+ uint32_t gdtr_base;
104621+ uint32_t cs_sel; /* cs selector */
104622+ uint32_t cs_limit;
104623+ uint32_t cs_base;
104624+ union vmcs_arbytes cs_arbytes;
104625+ uint32_t ds_sel; /* ds selector */
104626+ uint32_t ds_limit;
104627+ uint32_t ds_base;
104628+ union vmcs_arbytes ds_arbytes;
104629+ uint32_t es_sel; /* es selector */
104630+ uint32_t es_limit;
104631+ uint32_t es_base;
104632+ union vmcs_arbytes es_arbytes;
104633+ uint32_t ss_sel; /* ss selector */
104634+ uint32_t ss_limit;
104635+ uint32_t ss_base;
104636+ union vmcs_arbytes ss_arbytes;
104637+ uint32_t fs_sel; /* fs selector */
104638+ uint32_t fs_limit;
104639+ uint32_t fs_base;
104640+ union vmcs_arbytes fs_arbytes;
104641+ uint32_t gs_sel; /* gs selector */
104642+ uint32_t gs_limit;
104643+ uint32_t gs_base;
104644+ union vmcs_arbytes gs_arbytes;
104645+ uint32_t tr_sel; /* task selector */
104646+ uint32_t tr_limit;
104647+ uint32_t tr_base;
104648+ union vmcs_arbytes tr_arbytes;
104649+ uint32_t ldtr_sel; /* ldtr selector */
104650+ uint32_t ldtr_limit;
104651+ uint32_t ldtr_base;
104652+ union vmcs_arbytes ldtr_arbytes;
104653+};
104654+typedef struct vmx_assist_context vmx_assist_context_t;
104655+
104656+#endif /* __ASSEMBLY__ */
104657+
104658+#endif /* _VMX_ASSIST_H_ */
104659+
104660+/*
104661+ * Local variables:
104662+ * mode: C
104663+ * c-set-style: "BSD"
104664+ * c-basic-offset: 4
104665+ * tab-width: 4
104666+ * indent-tabs-mode: nil
104667+ * End:
104668+ */
104669diff -Nur linux-2.6.16.33-noxen/include/xen/interface/io/blkif.h linux-2.6.16.33/include/xen/interface/io/blkif.h
104670--- linux-2.6.16.33-noxen/include/xen/interface/io/blkif.h 1970-01-01 00:00:00.000000000 +0000
104671+++ linux-2.6.16.33/include/xen/interface/io/blkif.h 2007-01-08 15:00:55.000000000 +0000
104672@@ -0,0 +1,126 @@
104673+/******************************************************************************
104674+ * blkif.h
104675+ *
104676+ * Unified block-device I/O interface for Xen guest OSes.
104677+ *
104678+ * Permission is hereby granted, free of charge, to any person obtaining a copy
104679+ * of this software and associated documentation files (the "Software"), to
104680+ * deal in the Software without restriction, including without limitation the
104681+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
104682+ * sell copies of the Software, and to permit persons to whom the Software is
104683+ * furnished to do so, subject to the following conditions:
104684+ *
104685+ * The above copyright notice and this permission notice shall be included in
104686+ * all copies or substantial portions of the Software.
104687+ *
104688+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
104689+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
104690+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
104691+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
104692+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
104693+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
104694+ * DEALINGS IN THE SOFTWARE.
104695+ *
104696+ * Copyright (c) 2003-2004, Keir Fraser
104697+ */
104698+
104699+#ifndef __XEN_PUBLIC_IO_BLKIF_H__
104700+#define __XEN_PUBLIC_IO_BLKIF_H__
104701+
104702+#include "ring.h"
104703+#include "../grant_table.h"
104704+
104705+/*
104706+ * Front->back notifications: When enqueuing a new request, sending a
104707+ * notification can be made conditional on req_event (i.e., the generic
104708+ * hold-off mechanism provided by the ring macros). Backends must set
104709+ * req_event appropriately (e.g., using RING_FINAL_CHECK_FOR_REQUESTS()).
104710+ *
104711+ * Back->front notifications: When enqueuing a new response, sending a
104712+ * notification can be made conditional on rsp_event (i.e., the generic
104713+ * hold-off mechanism provided by the ring macros). Frontends must set
104714+ * rsp_event appropriately (e.g., using RING_FINAL_CHECK_FOR_RESPONSES()).
104715+ */
104716+
104717+#ifndef blkif_vdev_t
104718+#define blkif_vdev_t uint16_t
104719+#endif
104720+#define blkif_sector_t uint64_t
104721+
104722+/*
104723+ * REQUEST CODES.
104724+ */
104725+#define BLKIF_OP_READ 0
104726+#define BLKIF_OP_WRITE 1
104727+/*
104728+ * Recognised only if "feature-barrier" is present in backend xenbus info.
104729+ * The "feature-barrier" node contains a boolean indicating whether barrier
104730+ * requests are likely to succeed or fail. Either way, a barrier request
104731+ * may fail at any time with BLKIF_RSP_EOPNOTSUPP if it is unsupported by
104732+ * the underlying block-device hardware. The boolean simply indicates whether
104733+ * or not it is worthwhile for the frontend to attempt barrier requests.
104734+ * If a backend does not recognise BLKIF_OP_WRITE_BARRIER, it should *not*
104735+ * create the "feature-barrier" node!
104736+ */
104737+#define BLKIF_OP_WRITE_BARRIER 2
104738+
104739+/*
104740+ * Maximum scatter/gather segments per request.
104741+ * This is carefully chosen so that sizeof(blkif_sring_t) <= PAGE_SIZE.
104742+ * NB. This could be 12 if the ring indexes weren't stored in the same page.
104743+ */
104744+#define BLKIF_MAX_SEGMENTS_PER_REQUEST 11
104745+
104746+struct blkif_request {
104747+ uint8_t operation; /* BLKIF_OP_??? */
104748+ uint8_t nr_segments; /* number of segments */
104749+ blkif_vdev_t handle; /* only for read/write requests */
104750+ uint64_t id; /* private guest value, echoed in resp */
104751+ blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */
104752+ struct blkif_request_segment {
104753+ grant_ref_t gref; /* reference to I/O buffer frame */
104754+ /* @first_sect: first sector in frame to transfer (inclusive). */
104755+ /* @last_sect: last sector in frame to transfer (inclusive). */
104756+ uint8_t first_sect, last_sect;
104757+ } seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
104758+};
104759+typedef struct blkif_request blkif_request_t;
104760+
104761+struct blkif_response {
104762+ uint64_t id; /* copied from request */
104763+ uint8_t operation; /* copied from request */
104764+ int16_t status; /* BLKIF_RSP_??? */
104765+};
104766+typedef struct blkif_response blkif_response_t;
104767+
104768+/*
104769+ * STATUS RETURN CODES.
104770+ */
104771+ /* Operation not supported (only happens on barrier writes). */
104772+#define BLKIF_RSP_EOPNOTSUPP -2
104773+ /* Operation failed for some unspecified reason (-EIO). */
104774+#define BLKIF_RSP_ERROR -1
104775+ /* Operation completed successfully. */
104776+#define BLKIF_RSP_OKAY 0
104777+
104778+/*
104779+ * Generate blkif ring structures and types.
104780+ */
104781+
104782+DEFINE_RING_TYPES(blkif, struct blkif_request, struct blkif_response);
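As a hedged sketch of the frontend side (assuming a front ring already prepared with SHARED_RING_INIT()/FRONT_RING_INIT() as described in ring.h, the RING_GET_REQUEST() and RING_PUSH_REQUESTS() macros from that same header, and hypothetical gref/id values), queuing a one-segment read might look like:

    /* Sketch: enqueue a read of the first eight sectors on a blkif ring. */
    static void example_queue_read(blkif_front_ring_t *ring,
                                   grant_ref_t gref, uint64_t id)
    {
        blkif_request_t *req = RING_GET_REQUEST(ring, ring->req_prod_pvt);

        req->operation         = BLKIF_OP_READ;
        req->nr_segments       = 1;
        req->handle            = 0;     /* virtual device, frontend-specific */
        req->id                = id;    /* echoed back in the response */
        req->sector_number     = 0;
        req->seg[0].gref       = gref;  /* granted I/O buffer frame */
        req->seg[0].first_sect = 0;
        req->seg[0].last_sect  = 7;     /* inclusive: one 4 KiB page */

        ring->req_prod_pvt++;
        RING_PUSH_REQUESTS(ring);       /* publish to the shared ring */
        /* A real frontend then notifies the backend over its event channel. */
    }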
104783+
104784+#define VDISK_CDROM 0x1
104785+#define VDISK_REMOVABLE 0x2
104786+#define VDISK_READONLY 0x4
104787+
104788+#endif /* __XEN_PUBLIC_IO_BLKIF_H__ */
104789+
104790+/*
104791+ * Local variables:
104792+ * mode: C
104793+ * c-set-style: "BSD"
104794+ * c-basic-offset: 4
104795+ * tab-width: 4
104796+ * indent-tabs-mode: nil
104797+ * End:
104798+ */
104799diff -Nur linux-2.6.16.33-noxen/include/xen/interface/io/console.h linux-2.6.16.33/include/xen/interface/io/console.h
104800--- linux-2.6.16.33-noxen/include/xen/interface/io/console.h 1970-01-01 00:00:00.000000000 +0000
104801+++ linux-2.6.16.33/include/xen/interface/io/console.h 2007-01-08 15:00:55.000000000 +0000
104802@@ -0,0 +1,51 @@
104803+/******************************************************************************
104804+ * console.h
104805+ *
104806+ * Console I/O interface for Xen guest OSes.
104807+ *
104808+ * Permission is hereby granted, free of charge, to any person obtaining a copy
104809+ * of this software and associated documentation files (the "Software"), to
104810+ * deal in the Software without restriction, including without limitation the
104811+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
104812+ * sell copies of the Software, and to permit persons to whom the Software is
104813+ * furnished to do so, subject to the following conditions:
104814+ *
104815+ * The above copyright notice and this permission notice shall be included in
104816+ * all copies or substantial portions of the Software.
104817+ *
104818+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
104819+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
104820+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
104821+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
104822+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
104823+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
104824+ * DEALINGS IN THE SOFTWARE.
104825+ *
104826+ * Copyright (c) 2005, Keir Fraser
104827+ */
104828+
104829+#ifndef __XEN_PUBLIC_IO_CONSOLE_H__
104830+#define __XEN_PUBLIC_IO_CONSOLE_H__
104831+
104832+typedef uint32_t XENCONS_RING_IDX;
104833+
104834+#define MASK_XENCONS_IDX(idx, ring) ((idx) & (sizeof(ring)-1))
104835+
104836+struct xencons_interface {
104837+ char in[1024];
104838+ char out[2048];
104839+ XENCONS_RING_IDX in_cons, in_prod;
104840+ XENCONS_RING_IDX out_cons, out_prod;
104841+};
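A minimal sketch of pushing one character into the output ring (assuming the shared page is already mapped at intf; real code adds a write barrier before publishing out_prod and then notifies the console event channel):

    /* Sketch: write one byte to the console output ring, if there is room. */
    static int example_cons_putc(struct xencons_interface *intf, char c)
    {
        XENCONS_RING_IDX prod = intf->out_prod;

        if ((prod - intf->out_cons) >= sizeof(intf->out))
            return 0;                                    /* ring is full */

        intf->out[MASK_XENCONS_IDX(prod, intf->out)] = c;
        intf->out_prod = prod + 1;                       /* wmb() needed here */
        return 1;
    }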
104842+
104843+#endif /* __XEN_PUBLIC_IO_CONSOLE_H__ */
104844+
104845+/*
104846+ * Local variables:
104847+ * mode: C
104848+ * c-set-style: "BSD"
104849+ * c-basic-offset: 4
104850+ * tab-width: 4
104851+ * indent-tabs-mode: nil
104852+ * End:
104853+ */
104854diff -Nur linux-2.6.16.33-noxen/include/xen/interface/io/fbif.h linux-2.6.16.33/include/xen/interface/io/fbif.h
104855--- linux-2.6.16.33-noxen/include/xen/interface/io/fbif.h 1970-01-01 00:00:00.000000000 +0000
104856+++ linux-2.6.16.33/include/xen/interface/io/fbif.h 2007-01-08 15:00:55.000000000 +0000
104857@@ -0,0 +1,138 @@
104858+/*
104859+ * fbif.h -- Xen virtual frame buffer device
104860+ *
104861+ * Permission is hereby granted, free of charge, to any person obtaining a copy
104862+ * of this software and associated documentation files (the "Software"), to
104863+ * deal in the Software without restriction, including without limitation the
104864+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
104865+ * sell copies of the Software, and to permit persons to whom the Software is
104866+ * furnished to do so, subject to the following conditions:
104867+ *
104868+ * The above copyright notice and this permission notice shall be included in
104869+ * all copies or substantial portions of the Software.
104870+ *
104871+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
104872+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
104873+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
104874+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
104875+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
104876+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
104877+ * DEALINGS IN THE SOFTWARE.
104878+ *
104879+ * Copyright (C) 2005 Anthony Liguori <aliguori@us.ibm.com>
104880+ * Copyright (C) 2006 Red Hat, Inc., Markus Armbruster <armbru@redhat.com>
104881+ */
104882+
104883+#ifndef __XEN_PUBLIC_IO_FBIF_H__
104884+#define __XEN_PUBLIC_IO_FBIF_H__
104885+
104886+/* Out events (frontend -> backend) */
104887+
104888+/*
104889+ * Out events may be sent only when requested by backend, and receipt
104890+ * of an unknown out event is an error.
104891+ */
104892+
104893+/* Event type 1 currently not used */
104894+/*
104895+ * Framebuffer update notification event
104896+ * Capable frontend sets feature-update in xenstore.
104897+ * Backend requests it by setting request-update in xenstore.
104898+ */
104899+#define XENFB_TYPE_UPDATE 2
104900+
104901+struct xenfb_update
104902+{
104903+ uint8_t type; /* XENFB_TYPE_UPDATE */
104904+ int32_t x; /* source x */
104905+ int32_t y; /* source y */
104906+ int32_t width; /* rect width */
104907+ int32_t height; /* rect height */
104908+};
104909+
104910+#define XENFB_OUT_EVENT_SIZE 40
104911+
104912+union xenfb_out_event
104913+{
104914+ uint8_t type;
104915+ struct xenfb_update update;
104916+ char pad[XENFB_OUT_EVENT_SIZE];
104917+};
104918+
104919+/* In events (backend -> frontend) */
104920+
104921+/*
104922+ * Frontends should ignore unknown in events.
104923+ * No in events currently defined.
104924+ */
104925+
104926+#define XENFB_IN_EVENT_SIZE 40
104927+
104928+union xenfb_in_event
104929+{
104930+ uint8_t type;
104931+ char pad[XENFB_IN_EVENT_SIZE];
104932+};
104933+
104934+/* shared page */
104935+
104936+#define XENFB_IN_RING_SIZE 1024
104937+#define XENFB_IN_RING_LEN (XENFB_IN_RING_SIZE / XENFB_IN_EVENT_SIZE)
104938+#define XENFB_IN_RING_OFFS 1024
104939+#define XENFB_IN_RING(page) \
104940+ ((union xenfb_in_event *)((char *)(page) + XENFB_IN_RING_OFFS))
104941+#define XENFB_IN_RING_REF(page, idx) \
104942+ (XENFB_IN_RING((page))[(idx) % XENFB_IN_RING_LEN])
104943+
104944+#define XENFB_OUT_RING_SIZE 2048
104945+#define XENFB_OUT_RING_LEN (XENFB_OUT_RING_SIZE / XENFB_OUT_EVENT_SIZE)
104946+#define XENFB_OUT_RING_OFFS (XENFB_IN_RING_OFFS + XENFB_IN_RING_SIZE)
104947+#define XENFB_OUT_RING(page) \
104948+ ((union xenfb_out_event *)((char *)(page) + XENFB_OUT_RING_OFFS))
104949+#define XENFB_OUT_RING_REF(page, idx) \
104950+ (XENFB_OUT_RING((page))[(idx) % XENFB_OUT_RING_LEN])
104951+
104952+struct xenfb_page
104953+{
104954+ uint32_t in_cons, in_prod;
104955+ uint32_t out_cons, out_prod;
104956+
104957+ int32_t width; /* the width of the framebuffer (in pixels) */
104958+ int32_t height; /* the height of the framebuffer (in pixels) */
104959+ uint32_t line_length; /* the length of a row of pixels (in bytes) */
104960+ uint32_t mem_length; /* the length of the framebuffer (in bytes) */
104961+ uint8_t depth; /* the depth of a pixel (in bits) */
104962+
104963+ /*
104964+ * Framebuffer page directory
104965+ *
104966+ * Each directory page holds PAGE_SIZE / sizeof(*pd)
104967+ * framebuffer pages, and can thus map up to PAGE_SIZE *
104968+ * PAGE_SIZE / sizeof(*pd) bytes. With PAGE_SIZE == 4096 and
104969+ * sizeof(unsigned long) == 4, that's 4 Megs. Two directory
104970+ * pages should be enough for a while.
104971+ */
104972+ unsigned long pd[2];
104973+};
104974+
104975+/*
104976+ * Wart: xenkbd needs to know resolution. Put it here until a better
104977+ * solution is found, but don't leak it to the backend.
104978+ */
104979+#ifdef __KERNEL__
104980+#define XENFB_WIDTH 800
104981+#define XENFB_HEIGHT 600
104982+#define XENFB_DEPTH 32
104983+#endif
104984+
104985+#endif
104986+
104987+/*
104988+ * Local variables:
104989+ * mode: C
104990+ * c-set-style: "BSD"
104991+ * c-basic-offset: 4
104992+ * tab-width: 4
104993+ * indent-tabs-mode: nil
104994+ * End:
104995+ */
104996diff -Nur linux-2.6.16.33-noxen/include/xen/interface/io/kbdif.h linux-2.6.16.33/include/xen/interface/io/kbdif.h
104997--- linux-2.6.16.33-noxen/include/xen/interface/io/kbdif.h 1970-01-01 00:00:00.000000000 +0000
104998+++ linux-2.6.16.33/include/xen/interface/io/kbdif.h 2007-01-08 15:00:55.000000000 +0000
104999@@ -0,0 +1,130 @@
105000+/*
105001+ * kbdif.h -- Xen virtual keyboard/mouse
105002+ *
105003+ * Permission is hereby granted, free of charge, to any person obtaining a copy
105004+ * of this software and associated documentation files (the "Software"), to
105005+ * deal in the Software without restriction, including without limitation the
105006+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
105007+ * sell copies of the Software, and to permit persons to whom the Software is
105008+ * furnished to do so, subject to the following conditions:
105009+ *
105010+ * The above copyright notice and this permission notice shall be included in
105011+ * all copies or substantial portions of the Software.
105012+ *
105013+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
105014+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
105015+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
105016+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
105017+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
105018+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
105019+ * DEALINGS IN THE SOFTWARE.
105020+ *
105021+ * Copyright (C) 2005 Anthony Liguori <aliguori@us.ibm.com>
105022+ * Copyright (C) 2006 Red Hat, Inc., Markus Armbruster <armbru@redhat.com>
105023+ */
105024+
105025+#ifndef __XEN_PUBLIC_IO_KBDIF_H__
105026+#define __XEN_PUBLIC_IO_KBDIF_H__
105027+
105028+/* In events (backend -> frontend) */
105029+
105030+/*
105031+ * Frontends should ignore unknown in events.
105032+ */
105033+
105034+/* Pointer movement event */
105035+#define XENKBD_TYPE_MOTION 1
105036+/* Event type 2 currently not used */
105037+/* Key event (includes pointer buttons) */
105038+#define XENKBD_TYPE_KEY 3
105039+/*
105040+ * Pointer position event
105041+ * Capable backend sets feature-abs-pointer in xenstore.
105042+ * Frontend requests it instead of XENKBD_TYPE_MOTION by setting
105043+ * request-abs-update in xenstore.
105044+ */
105045+#define XENKBD_TYPE_POS 4
105046+
105047+struct xenkbd_motion
105048+{
105049+ uint8_t type; /* XENKBD_TYPE_MOTION */
105050+ int32_t rel_x; /* relative X motion */
105051+ int32_t rel_y; /* relative Y motion */
105052+};
105053+
105054+struct xenkbd_key
105055+{
105056+ uint8_t type; /* XENKBD_TYPE_KEY */
105057+ uint8_t pressed; /* 1 if pressed; 0 otherwise */
105058+ uint32_t keycode; /* KEY_* from linux/input.h */
105059+};
105060+
105061+struct xenkbd_position
105062+{
105063+ uint8_t type; /* XENKBD_TYPE_POS */
105064+ int32_t abs_x; /* absolute X position (in FB pixels) */
105065+ int32_t abs_y; /* absolute Y position (in FB pixels) */
105066+};
105067+
105068+#define XENKBD_IN_EVENT_SIZE 40
105069+
105070+union xenkbd_in_event
105071+{
105072+ uint8_t type;
105073+ struct xenkbd_motion motion;
105074+ struct xenkbd_key key;
105075+ struct xenkbd_position pos;
105076+ char pad[XENKBD_IN_EVENT_SIZE];
105077+};
105078+
105079+/* Out events (frontend -> backend) */
105080+
105081+/*
105082+ * Out events may be sent only when requested by backend, and receipt
105083+ * of an unknown out event is an error.
105084+ * No out events currently defined.
105085+ */
105086+
105087+#define XENKBD_OUT_EVENT_SIZE 40
105088+
105089+union xenkbd_out_event
105090+{
105091+ uint8_t type;
105092+ char pad[XENKBD_OUT_EVENT_SIZE];
105093+};
105094+
105095+/* shared page */
105096+
105097+#define XENKBD_IN_RING_SIZE 2048
105098+#define XENKBD_IN_RING_LEN (XENKBD_IN_RING_SIZE / XENKBD_IN_EVENT_SIZE)
105099+#define XENKBD_IN_RING_OFFS 1024
105100+#define XENKBD_IN_RING(page) \
105101+ ((union xenkbd_in_event *)((char *)(page) + XENKBD_IN_RING_OFFS))
105102+#define XENKBD_IN_RING_REF(page, idx) \
105103+ (XENKBD_IN_RING((page))[(idx) % XENKBD_IN_RING_LEN])
105104+
105105+#define XENKBD_OUT_RING_SIZE 1024
105106+#define XENKBD_OUT_RING_LEN (XENKBD_OUT_RING_SIZE / XENKBD_OUT_EVENT_SIZE)
105107+#define XENKBD_OUT_RING_OFFS (XENKBD_IN_RING_OFFS + XENKBD_IN_RING_SIZE)
105108+#define XENKBD_OUT_RING(page) \
105109+ ((union xenkbd_out_event *)((char *)(page) + XENKBD_OUT_RING_OFFS))
105110+#define XENKBD_OUT_RING_REF(page, idx) \
105111+ (XENKBD_OUT_RING((page))[(idx) % XENKBD_OUT_RING_LEN])
105112+
105113+struct xenkbd_page
105114+{
105115+ uint32_t in_cons, in_prod;
105116+ uint32_t out_cons, out_prod;
105117+};
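As a sketch of the frontend's consume loop (assuming the shared page is mapped at page; real code adds the appropriate read barrier after sampling in_prod and hooks this into the event-channel handler):

    /* Sketch: drain pending input events from the xenkbd in-ring. */
    static void example_kbd_poll(struct xenkbd_page *page)
    {
        uint32_t cons = page->in_cons;

        while (cons != page->in_prod) {
            union xenkbd_in_event *ev = &XENKBD_IN_RING_REF(page, cons);

            if (ev->type == XENKBD_TYPE_KEY) {
                /* ev->key.keycode and ev->key.pressed describe the key */
            }
            cons++;
        }
        page->in_cons = cons;   /* free the consumed slots for the backend */
    }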
105118+
105119+#endif
105120+
105121+/*
105122+ * Local variables:
105123+ * mode: C
105124+ * c-set-style: "BSD"
105125+ * c-basic-offset: 4
105126+ * tab-width: 4
105127+ * indent-tabs-mode: nil
105128+ * End:
105129+ */
105130diff -Nur linux-2.6.16.33-noxen/include/xen/interface/io/netif.h linux-2.6.16.33/include/xen/interface/io/netif.h
105131--- linux-2.6.16.33-noxen/include/xen/interface/io/netif.h 1970-01-01 00:00:00.000000000 +0000
105132+++ linux-2.6.16.33/include/xen/interface/io/netif.h 2007-01-08 15:00:55.000000000 +0000
105133@@ -0,0 +1,184 @@
105134+/******************************************************************************
105135+ * netif.h
105136+ *
105137+ * Unified network-device I/O interface for Xen guest OSes.
105138+ *
105139+ * Permission is hereby granted, free of charge, to any person obtaining a copy
105140+ * of this software and associated documentation files (the "Software"), to
105141+ * deal in the Software without restriction, including without limitation the
105142+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
105143+ * sell copies of the Software, and to permit persons to whom the Software is
105144+ * furnished to do so, subject to the following conditions:
105145+ *
105146+ * The above copyright notice and this permission notice shall be included in
105147+ * all copies or substantial portions of the Software.
105148+ *
105149+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
105150+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
105151+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
105152+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
105153+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
105154+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
105155+ * DEALINGS IN THE SOFTWARE.
105156+ *
105157+ * Copyright (c) 2003-2004, Keir Fraser
105158+ */
105159+
105160+#ifndef __XEN_PUBLIC_IO_NETIF_H__
105161+#define __XEN_PUBLIC_IO_NETIF_H__
105162+
105163+#include "ring.h"
105164+#include "../grant_table.h"
105165+
105166+/*
105167+ * Notifications after enqueuing any type of message should be conditional on
105168+ * the appropriate req_event or rsp_event field in the shared ring.
105169+ * If the client sends notification for rx requests then it should specify
105170+ * feature 'feature-rx-notify' via xenbus. Otherwise the backend will assume
105171+ * that it cannot safely queue packets (as it may not be kicked to send them).
105172+ */
105173+
105174+/*
105175+ * This is the 'wire' format for packets:
105176+ * Request 1: netif_tx_request -- NETTXF_* (any flags)
105177+ * [Request 2: netif_tx_extra] (only if request 1 has NETTXF_extra_info)
105178+ * [Request 3: netif_tx_extra] (only if request 2 has XEN_NETIF_EXTRA_MORE)
105179+ * Request 4: netif_tx_request -- NETTXF_more_data
105180+ * Request 5: netif_tx_request -- NETTXF_more_data
105181+ * ...
105182+ * Request N: netif_tx_request -- 0
105183+ */
105184+
105185+/* Protocol checksum field is blank in the packet (hardware offload)? */
105186+#define _NETTXF_csum_blank (0)
105187+#define NETTXF_csum_blank (1U<<_NETTXF_csum_blank)
105188+
105189+/* Packet data has been validated against protocol checksum. */
105190+#define _NETTXF_data_validated (1)
105191+#define NETTXF_data_validated (1U<<_NETTXF_data_validated)
105192+
105193+/* Packet continues in the next request descriptor. */
105194+#define _NETTXF_more_data (2)
105195+#define NETTXF_more_data (1U<<_NETTXF_more_data)
105196+
105197+/* Packet to be followed by extra descriptor(s). */
105198+#define _NETTXF_extra_info (3)
105199+#define NETTXF_extra_info (1U<<_NETTXF_extra_info)
105200+
105201+struct netif_tx_request {
105202+ grant_ref_t gref; /* Reference to buffer page */
105203+ uint16_t offset; /* Offset within buffer page */
105204+ uint16_t flags; /* NETTXF_* */
105205+ uint16_t id; /* Echoed in response message. */
105206+ uint16_t size; /* Packet size in bytes. */
105207+};
105208+typedef struct netif_tx_request netif_tx_request_t;
105209+
105210+/* Types of netif_extra_info descriptors. */
105211+#define XEN_NETIF_EXTRA_TYPE_NONE (0) /* Never used - invalid */
105212+#define XEN_NETIF_EXTRA_TYPE_GSO (1) /* u.gso */
105213+#define XEN_NETIF_EXTRA_TYPE_MAX (2)
105214+
105215+/* netif_extra_info flags. */
105216+#define _XEN_NETIF_EXTRA_FLAG_MORE (0)
105217+#define XEN_NETIF_EXTRA_FLAG_MORE (1U<<_XEN_NETIF_EXTRA_FLAG_MORE)
105218+
105219+/* GSO types - only TCPv4 currently supported. */
105220+#define XEN_NETIF_GSO_TYPE_TCPV4 (1)
105221+
105222+/*
105223+ * This structure needs to fit within both netif_tx_request and
105224+ * netif_rx_response for compatibility.
105225+ */
105226+struct netif_extra_info {
105227+ uint8_t type; /* XEN_NETIF_EXTRA_TYPE_* */
105228+ uint8_t flags; /* XEN_NETIF_EXTRA_FLAG_* */
105229+
105230+ union {
105231+ struct {
105232+ /*
105233+ * Maximum payload size of each segment. For example, for TCP this
105234+ * is just the path MSS.
105235+ */
105236+ uint16_t size;
105237+
105238+ /*
105239+ * GSO type. This determines the protocol of the packet and any
105240+ * extra features required to segment the packet properly.
105241+ */
105242+ uint8_t type; /* XEN_NETIF_GSO_TYPE_* */
105243+
105244+ /* Future expansion. */
105245+ uint8_t pad;
105246+
105247+ /*
105248+ * GSO features. This specifies any extra GSO features required
105249+ * to process this packet, such as ECN support for TCPv4.
105250+ */
105251+ uint16_t features; /* XEN_NETIF_GSO_FEAT_* */
105252+ } gso;
105253+
105254+ uint16_t pad[3];
105255+ } u;
105256+};
105257+
105258+struct netif_tx_response {
105259+ uint16_t id;
105260+ int16_t status; /* NETIF_RSP_* */
105261+};
105262+typedef struct netif_tx_response netif_tx_response_t;
105263+
105264+struct netif_rx_request {
105265+ uint16_t id; /* Echoed in response message. */
105266+ grant_ref_t gref; /* Reference to incoming granted frame */
105267+};
105268+typedef struct netif_rx_request netif_rx_request_t;
105269+
105270+/* Packet data has been validated against protocol checksum. */
105271+#define _NETRXF_data_validated (0)
105272+#define NETRXF_data_validated (1U<<_NETRXF_data_validated)
105273+
105274+/* Protocol checksum field is blank in the packet (hardware offload)? */
105275+#define _NETRXF_csum_blank (1)
105276+#define NETRXF_csum_blank (1U<<_NETRXF_csum_blank)
105277+
105278+/* Packet continues in the next request descriptor. */
105279+#define _NETRXF_more_data (2)
105280+#define NETRXF_more_data (1U<<_NETRXF_more_data)
105281+
105282+/* Packet to be followed by extra descriptor(s). */
105283+#define _NETRXF_extra_info (3)
105284+#define NETRXF_extra_info (1U<<_NETRXF_extra_info)
105285+
105286+struct netif_rx_response {
105287+ uint16_t id;
105288+ uint16_t offset; /* Offset in page of start of received packet */
105289+ uint16_t flags; /* NETRXF_* */
105290+ int16_t status; /* -ve: BLKIF_RSP_* ; +ve: Rx'ed pkt size. */
105291+};
105292+typedef struct netif_rx_response netif_rx_response_t;
105293+
105294+/*
105295+ * Generate netif ring structures and types.
105296+ */
105297+
105298+DEFINE_RING_TYPES(netif_tx, struct netif_tx_request, struct netif_tx_response);
105299+DEFINE_RING_TYPES(netif_rx, struct netif_rx_request, struct netif_rx_response);
105300+
105301+#define NETIF_RSP_DROPPED -2
105302+#define NETIF_RSP_ERROR -1
105303+#define NETIF_RSP_OKAY 0
105304+/* No response: used for auxiliary requests (e.g., netif_tx_extra). */
105305+#define NETIF_RSP_NULL 1
105306+
105307+#endif
105308+
105309+/*
105310+ * Local variables:
105311+ * mode: C
105312+ * c-set-style: "BSD"
105313+ * c-basic-offset: 4
105314+ * tab-width: 4
105315+ * indent-tabs-mode: nil
105316+ * End:
105317+ */
105318diff -Nur linux-2.6.16.33-noxen/include/xen/interface/io/pciif.h linux-2.6.16.33/include/xen/interface/io/pciif.h
105319--- linux-2.6.16.33-noxen/include/xen/interface/io/pciif.h 1970-01-01 00:00:00.000000000 +0000
105320+++ linux-2.6.16.33/include/xen/interface/io/pciif.h 2007-01-08 15:00:55.000000000 +0000
105321@@ -0,0 +1,83 @@
105322+/*
105323+ * PCI Backend/Frontend Common Data Structures & Macros
105324+ *
105325+ * Permission is hereby granted, free of charge, to any person obtaining a copy
105326+ * of this software and associated documentation files (the "Software"), to
105327+ * deal in the Software without restriction, including without limitation the
105328+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
105329+ * sell copies of the Software, and to permit persons to whom the Software is
105330+ * furnished to do so, subject to the following conditions:
105331+ *
105332+ * The above copyright notice and this permission notice shall be included in
105333+ * all copies or substantial portions of the Software.
105334+ *
105335+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
105336+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
105337+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
105338+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
105339+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
105340+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
105341+ * DEALINGS IN THE SOFTWARE.
105342+ *
105343+ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
105344+ */
105345+#ifndef __XEN_PCI_COMMON_H__
105346+#define __XEN_PCI_COMMON_H__
105347+
105348+/* Be sure to bump this number if you change this file */
105349+#define XEN_PCI_MAGIC "7"
105350+
105351+/* xen_pci_sharedinfo flags */
105352+#define _XEN_PCIF_active (0)
105353+#define XEN_PCIF_active (1<<_XEN_PCIF_active)
105354+
105355+/* xen_pci_op commands */
105356+#define XEN_PCI_OP_conf_read (0)
105357+#define XEN_PCI_OP_conf_write (1)
105358+
105359+/* xen_pci_op error numbers */
105360+#define XEN_PCI_ERR_success (0)
105361+#define XEN_PCI_ERR_dev_not_found (-1)
105362+#define XEN_PCI_ERR_invalid_offset (-2)
105363+#define XEN_PCI_ERR_access_denied (-3)
105364+#define XEN_PCI_ERR_not_implemented (-4)
105365+/* XEN_PCI_ERR_op_failed - backend failed to complete the operation */
105366+#define XEN_PCI_ERR_op_failed (-5)
105367+
105368+struct xen_pci_op {
105369+ /* IN: what action to perform: XEN_PCI_OP_* */
105370+ uint32_t cmd;
105371+
105372+ /* OUT: will contain an error number (if any) from errno.h */
105373+ int32_t err;
105374+
105375+ /* IN: which device to touch */
105376+ uint32_t domain; /* PCI Domain/Segment */
105377+ uint32_t bus;
105378+ uint32_t devfn;
105379+
105380+ /* IN: which configuration registers to touch */
105381+ int32_t offset;
105382+ int32_t size;
105383+
105384+ /* IN/OUT: Contains the result after a READ or the value to WRITE */
105385+ uint32_t value;
105386+};
105387+
105388+struct xen_pci_sharedinfo {
105389+ /* flags - XEN_PCIF_* */
105390+ uint32_t flags;
105391+ struct xen_pci_op op;
105392+};
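A hedged sketch of how the frontend might fill in a configuration-space read (the handshake over flags and the event-channel notification/wait are omitted; the shared-page pointer info is hypothetical):

    /* Sketch: request a 4-byte config read of register 0 on 0000:00:00.0. */
    static void example_pci_conf_read(struct xen_pci_sharedinfo *info)
    {
        struct xen_pci_op *op = &info->op;

        op->cmd    = XEN_PCI_OP_conf_read;
        op->domain = 0;                  /* PCI domain/segment */
        op->bus    = 0;
        op->devfn  = 0;                  /* device 0, function 0 */
        op->offset = 0;                  /* config register offset */
        op->size   = 4;

        info->flags |= XEN_PCIF_active;  /* hand the op to the backend */
        /* Real code notifies the backend and waits for the active flag to
         * clear; the result then appears in op->value with op->err set. */
    }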
105393+
105394+#endif /* __XEN_PCI_COMMON_H__ */
105395+
105396+/*
105397+ * Local variables:
105398+ * mode: C
105399+ * c-set-style: "BSD"
105400+ * c-basic-offset: 4
105401+ * tab-width: 4
105402+ * indent-tabs-mode: nil
105403+ * End:
105404+ */
105405diff -Nur linux-2.6.16.33-noxen/include/xen/interface/io/ring.h linux-2.6.16.33/include/xen/interface/io/ring.h
105406--- linux-2.6.16.33-noxen/include/xen/interface/io/ring.h 1970-01-01 00:00:00.000000000 +0000
105407+++ linux-2.6.16.33/include/xen/interface/io/ring.h 2007-01-08 15:00:55.000000000 +0000
105408@@ -0,0 +1,299 @@
105409+/******************************************************************************
105410+ * ring.h
105411+ *
105412+ * Shared producer-consumer ring macros.
105413+ *
105414+ * Permission is hereby granted, free of charge, to any person obtaining a copy
105415+ * of this software and associated documentation files (the "Software"), to
105416+ * deal in the Software without restriction, including without limitation the
105417+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
105418+ * sell copies of the Software, and to permit persons to whom the Software is
105419+ * furnished to do so, subject to the following conditions:
105420+ *
105421+ * The above copyright notice and this permission notice shall be included in
105422+ * all copies or substantial portions of the Software.
105423+ *
105424+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
105425+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
105426+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
105427+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
105428+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
105429+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
105430+ * DEALINGS IN THE SOFTWARE.
105431+ *
105432+ * Tim Deegan and Andrew Warfield November 2004.
105433+ */
105434+
105435+#ifndef __XEN_PUBLIC_IO_RING_H__
105436+#define __XEN_PUBLIC_IO_RING_H__
105437+
105438+typedef unsigned int RING_IDX;
105439+
105440+/* Round a 32-bit unsigned constant down to the nearest power of two. */
105441+#define __RD2(_x) (((_x) & 0x00000002) ? 0x2 : ((_x) & 0x1))
105442+#define __RD4(_x) (((_x) & 0x0000000c) ? __RD2((_x)>>2)<<2 : __RD2(_x))
105443+#define __RD8(_x) (((_x) & 0x000000f0) ? __RD4((_x)>>4)<<4 : __RD4(_x))
105444+#define __RD16(_x) (((_x) & 0x0000ff00) ? __RD8((_x)>>8)<<8 : __RD8(_x))
105445+#define __RD32(_x) (((_x) & 0xffff0000) ? __RD16((_x)>>16)<<16 : __RD16(_x))
105446+
105447+/*
105448+ * Calculate size of a shared ring, given the total available space for the
105449+ * ring and indexes (_sz), and the name tag of the request/response structure.
105450+ * A ring contains as many entries as will fit, rounded down to the nearest
105451+ * power of two (so we can mask with (size-1) to loop around).
105452+ */
105453+#define __RING_SIZE(_s, _sz) \
105454+ (__RD32(((_sz) - (long)(_s)->ring + (long)(_s)) / sizeof((_s)->ring[0])))
105455+
105456+/*
105457+ * Macros to make the correct C datatypes for a new kind of ring.
105458+ *
105459+ * To make a new ring datatype, you need to have two message structures,
105460+ * let's say request_t, and response_t already defined.
105461+ *
105462+ * In a header where you want the ring datatype declared, you then do:
105463+ *
105464+ * DEFINE_RING_TYPES(mytag, request_t, response_t);
105465+ *
105466+ * These expand out to give you a set of types, as you can see below.
105467+ * The most important of these are:
105468+ *
105469+ * mytag_sring_t - The shared ring.
105470+ * mytag_front_ring_t - The 'front' half of the ring.
105471+ * mytag_back_ring_t - The 'back' half of the ring.
105472+ *
105473+ * To initialize a ring in your code you need to know the location and size
105474+ * of the shared memory area (PAGE_SIZE, for instance). To initialise
105475+ * the front half:
105476+ *
105477+ * mytag_front_ring_t front_ring;
105478+ * SHARED_RING_INIT((mytag_sring_t *)shared_page);
105479+ * FRONT_RING_INIT(&front_ring, (mytag_sring_t *)shared_page, PAGE_SIZE);
105480+ *
105481+ * Initializing the back follows similarly (note that only the front
105482+ * initializes the shared ring):
105483+ *
105484+ * mytag_back_ring_t back_ring;
105485+ * BACK_RING_INIT(&back_ring, (mytag_sring_t *)shared_page, PAGE_SIZE);
105486+ */
105487+
105488+#define DEFINE_RING_TYPES(__name, __req_t, __rsp_t) \
105489+ \
105490+/* Shared ring entry */ \
105491+union __name##_sring_entry { \
105492+ __req_t req; \
105493+ __rsp_t rsp; \
105494+}; \
105495+ \
105496+/* Shared ring page */ \
105497+struct __name##_sring { \
105498+ RING_IDX req_prod, req_event; \
105499+ RING_IDX rsp_prod, rsp_event; \
105500+ uint8_t pad[48]; \
105501+ union __name##_sring_entry ring[1]; /* variable-length */ \
105502+}; \
105503+ \
105504+/* "Front" end's private variables */ \
105505+struct __name##_front_ring { \
105506+ RING_IDX req_prod_pvt; \
105507+ RING_IDX rsp_cons; \
105508+ unsigned int nr_ents; \
105509+ struct __name##_sring *sring; \
105510+}; \
105511+ \
105512+/* "Back" end's private variables */ \
105513+struct __name##_back_ring { \
105514+ RING_IDX rsp_prod_pvt; \
105515+ RING_IDX req_cons; \
105516+ unsigned int nr_ents; \
105517+ struct __name##_sring *sring; \
105518+}; \
105519+ \
105520+/* Syntactic sugar */ \
105521+typedef struct __name##_sring __name##_sring_t; \
105522+typedef struct __name##_front_ring __name##_front_ring_t; \
105523+typedef struct __name##_back_ring __name##_back_ring_t
105524+
105525+/*
105526+ * Macros for manipulating rings.
105527+ *
105528+ * FRONT_RING_whatever works on the "front end" of a ring: here
105529+ * requests are pushed on to the ring and responses taken off it.
105530+ *
105531+ * BACK_RING_whatever works on the "back end" of a ring: here
105532+ * requests are taken off the ring and responses put on.
105533+ *
105534+ * N.B. these macros do NO INTERLOCKS OR FLOW CONTROL.
105535+ * This is OK in 1-for-1 request-response situations where the
105536+ * requestor (front end) never has more than RING_SIZE()-1
105537+ * outstanding requests.
105538+ */
105539+
105540+/* Initialising empty rings */
105541+#define SHARED_RING_INIT(_s) do { \
105542+ (_s)->req_prod = (_s)->rsp_prod = 0; \
105543+ (_s)->req_event = (_s)->rsp_event = 1; \
105544+ memset((_s)->pad, 0, sizeof((_s)->pad)); \
105545+} while(0)
105546+
105547+#define FRONT_RING_INIT(_r, _s, __size) do { \
105548+ (_r)->req_prod_pvt = 0; \
105549+ (_r)->rsp_cons = 0; \
105550+ (_r)->nr_ents = __RING_SIZE(_s, __size); \
105551+ (_r)->sring = (_s); \
105552+} while (0)
105553+
105554+#define BACK_RING_INIT(_r, _s, __size) do { \
105555+ (_r)->rsp_prod_pvt = 0; \
105556+ (_r)->req_cons = 0; \
105557+ (_r)->nr_ents = __RING_SIZE(_s, __size); \
105558+ (_r)->sring = (_s); \
105559+} while (0)
105560+
105561+/* Initialize to existing shared indexes -- for recovery */
105562+#define FRONT_RING_ATTACH(_r, _s, __size) do { \
105563+ (_r)->sring = (_s); \
105564+ (_r)->req_prod_pvt = (_s)->req_prod; \
105565+ (_r)->rsp_cons = (_s)->rsp_prod; \
105566+ (_r)->nr_ents = __RING_SIZE(_s, __size); \
105567+} while (0)
105568+
105569+#define BACK_RING_ATTACH(_r, _s, __size) do { \
105570+ (_r)->sring = (_s); \
105571+ (_r)->rsp_prod_pvt = (_s)->rsp_prod; \
105572+ (_r)->req_cons = (_s)->req_prod; \
105573+ (_r)->nr_ents = __RING_SIZE(_s, __size); \
105574+} while (0)
105575+
105576+/* How big is this ring? */
105577+#define RING_SIZE(_r) \
105578+ ((_r)->nr_ents)
105579+
105580+/* Number of free requests (for use on front side only). */
105581+#define RING_FREE_REQUESTS(_r) \
105582+ (RING_SIZE(_r) - ((_r)->req_prod_pvt - (_r)->rsp_cons))
105583+
105584+/* Test if there is an empty slot available on the front ring.
105585+ * (This is only meaningful from the front.)
105586+ */
105587+#define RING_FULL(_r) \
105588+ (RING_FREE_REQUESTS(_r) == 0)
105589+
105590+/* Test if there are outstanding messages to be processed on a ring. */
105591+#define RING_HAS_UNCONSUMED_RESPONSES(_r) \
105592+ ((_r)->sring->rsp_prod - (_r)->rsp_cons)
105593+
105594+#ifdef __GNUC__
105595+#define RING_HAS_UNCONSUMED_REQUESTS(_r) ({ \
105596+ unsigned int req = (_r)->sring->req_prod - (_r)->req_cons; \
105597+ unsigned int rsp = RING_SIZE(_r) - \
105598+ ((_r)->req_cons - (_r)->rsp_prod_pvt); \
105599+ req < rsp ? req : rsp; \
105600+})
105601+#else
105602+/* Same as above, but without the nice GCC ({ ... }) syntax. */
105603+#define RING_HAS_UNCONSUMED_REQUESTS(_r) \
105604+ ((((_r)->sring->req_prod - (_r)->req_cons) < \
105605+ (RING_SIZE(_r) - ((_r)->req_cons - (_r)->rsp_prod_pvt))) ? \
105606+ ((_r)->sring->req_prod - (_r)->req_cons) : \
105607+ (RING_SIZE(_r) - ((_r)->req_cons - (_r)->rsp_prod_pvt)))
105608+#endif
105609+
105610+/* Direct access to individual ring elements, by index. */
105611+#define RING_GET_REQUEST(_r, _idx) \
105612+ (&((_r)->sring->ring[((_idx) & (RING_SIZE(_r) - 1))].req))
105613+
105614+#define RING_GET_RESPONSE(_r, _idx) \
105615+ (&((_r)->sring->ring[((_idx) & (RING_SIZE(_r) - 1))].rsp))
105616+
105617+/* Loop termination condition: Would the specified index overflow the ring? */
105618+#define RING_REQUEST_CONS_OVERFLOW(_r, _cons) \
105619+ (((_cons) - (_r)->rsp_prod_pvt) >= RING_SIZE(_r))
105620+
105621+#define RING_PUSH_REQUESTS(_r) do { \
105622+ wmb(); /* back sees requests /before/ updated producer index */ \
105623+ (_r)->sring->req_prod = (_r)->req_prod_pvt; \
105624+} while (0)
105625+
105626+#define RING_PUSH_RESPONSES(_r) do { \
105627+ wmb(); /* front sees responses /before/ updated producer index */ \
105628+ (_r)->sring->rsp_prod = (_r)->rsp_prod_pvt; \
105629+} while (0)
105630+
105631+/*
105632+ * Notification hold-off (req_event and rsp_event):
105633+ *
105634+ * When queueing requests or responses on a shared ring, it may not always be
105635+ * necessary to notify the remote end. For example, if requests are in flight
105636+ * in a backend, the front may be able to queue further requests without
105637+ * notifying the back (if the back checks for new requests when it queues
105638+ * responses).
105639+ *
105640+ * When enqueuing requests or responses:
105641+ *
105642+ * Use RING_PUSH_{REQUESTS,RESPONSES}_AND_CHECK_NOTIFY(). The second argument
105643+ * is a boolean return value. True indicates that the receiver requires an
105644+ * asynchronous notification.
105645+ *
105646+ * After dequeuing requests or responses (before sleeping the connection):
105647+ *
105648+ * Use RING_FINAL_CHECK_FOR_REQUESTS() or RING_FINAL_CHECK_FOR_RESPONSES().
105649+ * The second argument is a boolean return value. True indicates that there
105650+ * are pending messages on the ring (i.e., the connection should not be put
105651+ * to sleep).
105652+ *
105653+ * These macros will set the req_event/rsp_event field to trigger a
105654+ * notification on the very next message that is enqueued. If you want to
105655+ * create batches of work (i.e., only receive a notification after several
105656+ * messages have been enqueued) then you will need to create a customised
105657+ * version of the FINAL_CHECK macro in your own code, which sets the event
105658+ * field appropriately.
105659+ */
105660+
105661+#define RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(_r, _notify) do { \
105662+ RING_IDX __old = (_r)->sring->req_prod; \
105663+ RING_IDX __new = (_r)->req_prod_pvt; \
105664+ wmb(); /* back sees requests /before/ updated producer index */ \
105665+ (_r)->sring->req_prod = __new; \
105666+ mb(); /* back sees new requests /before/ we check req_event */ \
105667+ (_notify) = ((RING_IDX)(__new - (_r)->sring->req_event) < \
105668+ (RING_IDX)(__new - __old)); \
105669+} while (0)
105670+
105671+#define RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(_r, _notify) do { \
105672+ RING_IDX __old = (_r)->sring->rsp_prod; \
105673+ RING_IDX __new = (_r)->rsp_prod_pvt; \
105674+ wmb(); /* front sees responses /before/ updated producer index */ \
105675+ (_r)->sring->rsp_prod = __new; \
105676+ mb(); /* front sees new responses /before/ we check rsp_event */ \
105677+ (_notify) = ((RING_IDX)(__new - (_r)->sring->rsp_event) < \
105678+ (RING_IDX)(__new - __old)); \
105679+} while (0)
105680+
105681+#define RING_FINAL_CHECK_FOR_REQUESTS(_r, _work_to_do) do { \
105682+ (_work_to_do) = RING_HAS_UNCONSUMED_REQUESTS(_r); \
105683+ if (_work_to_do) break; \
105684+ (_r)->sring->req_event = (_r)->req_cons + 1; \
105685+ mb(); \
105686+ (_work_to_do) = RING_HAS_UNCONSUMED_REQUESTS(_r); \
105687+} while (0)
105688+
105689+#define RING_FINAL_CHECK_FOR_RESPONSES(_r, _work_to_do) do { \
105690+ (_work_to_do) = RING_HAS_UNCONSUMED_RESPONSES(_r); \
105691+ if (_work_to_do) break; \
105692+ (_r)->sring->rsp_event = (_r)->rsp_cons + 1; \
105693+ mb(); \
105694+ (_work_to_do) = RING_HAS_UNCONSUMED_RESPONSES(_r); \
105695+} while (0)
105696+
105697+#endif /* __XEN_PUBLIC_IO_RING_H__ */
105698+
105699+/*
105700+ * Local variables:
105701+ * mode: C
105702+ * c-set-style: "BSD"
105703+ * c-basic-offset: 4
105704+ * tab-width: 4
105705+ * indent-tabs-mode: nil
105706+ * End:
105707+ */
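
As a quick sketch of how the ring.h macros above fit together (hypothetical names only, not part of the patch: my_request_t, my_response_t, the shared_page pointer and the event-channel notification are all assumptions; PAGE_SIZE is the usual kernel constant), a front end might do:

    /* Hypothetical illustration only -- not part of the patch. */
    DEFINE_RING_TYPES(my, my_request_t, my_response_t);

    static void my_front_send(void *shared_page, const my_request_t *src)
    {
        my_front_ring_t front;
        my_request_t *req;
        int notify;

        /* Only the front end initialises the shared page. */
        SHARED_RING_INIT((my_sring_t *)shared_page);
        FRONT_RING_INIT(&front, (my_sring_t *)shared_page, PAGE_SIZE);

        if (RING_FULL(&front))
            return;                     /* no free request slots */

        req = RING_GET_REQUEST(&front, front.req_prod_pvt);
        *req = *src;
        front.req_prod_pvt++;

        RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&front, notify);
        if (notify)
            /* notify the back end via its event channel here */ ;
    }
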
105708diff -Nur linux-2.6.16.33-noxen/include/xen/interface/io/tpmif.h linux-2.6.16.33/include/xen/interface/io/tpmif.h
105709--- linux-2.6.16.33-noxen/include/xen/interface/io/tpmif.h 1970-01-01 00:00:00.000000000 +0000
105710+++ linux-2.6.16.33/include/xen/interface/io/tpmif.h 2007-01-08 15:00:55.000000000 +0000
105711@@ -0,0 +1,77 @@
105712+/******************************************************************************
105713+ * tpmif.h
105714+ *
105715+ * TPM I/O interface for Xen guest OSes.
105716+ *
105717+ * Permission is hereby granted, free of charge, to any person obtaining a copy
105718+ * of this software and associated documentation files (the "Software"), to
105719+ * deal in the Software without restriction, including without limitation the
105720+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
105721+ * sell copies of the Software, and to permit persons to whom the Software is
105722+ * furnished to do so, subject to the following conditions:
105723+ *
105724+ * The above copyright notice and this permission notice shall be included in
105725+ * all copies or substantial portions of the Software.
105726+ *
105727+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
105728+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
105729+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
105730+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
105731+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
105732+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
105733+ * DEALINGS IN THE SOFTWARE.
105734+ *
105735+ * Copyright (c) 2005, IBM Corporation
105736+ *
105737+ * Author: Stefan Berger, stefanb@us.ibm.com
105738+ * Grant table support: Mahadevan Gomathisankaran
105739+ *
105740+ * This code has been derived from tools/libxc/xen/io/netif.h
105741+ *
105742+ * Copyright (c) 2003-2004, Keir Fraser
105743+ */
105744+
105745+#ifndef __XEN_PUBLIC_IO_TPMIF_H__
105746+#define __XEN_PUBLIC_IO_TPMIF_H__
105747+
105748+#include "../grant_table.h"
105749+
105750+struct tpmif_tx_request {
105751+ unsigned long addr; /* Machine address of packet. */
105752+ grant_ref_t ref; /* grant table access reference */
105753+ uint16_t unused;
105754+ uint16_t size; /* Packet size in bytes. */
105755+};
105756+typedef struct tpmif_tx_request tpmif_tx_request_t;
105757+
105758+/*
105759+ * The TPMIF_TX_RING_SIZE defines the number of pages the
105760+ * front-end and backend can exchange (= size of array).
105761+ */
105762+typedef uint32_t TPMIF_RING_IDX;
105763+
105764+#define TPMIF_TX_RING_SIZE 10
105765+
105766+/* This structure must fit in a memory page. */
105767+
105768+struct tpmif_ring {
105769+ struct tpmif_tx_request req;
105770+};
105771+typedef struct tpmif_ring tpmif_ring_t;
105772+
105773+struct tpmif_tx_interface {
105774+ struct tpmif_ring ring[TPMIF_TX_RING_SIZE];
105775+};
105776+typedef struct tpmif_tx_interface tpmif_tx_interface_t;
105777+
105778+#endif
105779+
105780+/*
105781+ * Local variables:
105782+ * mode: C
105783+ * c-set-style: "BSD"
105784+ * c-basic-offset: 4
105785+ * tab-width: 4
105786+ * indent-tabs-mode: nil
105787+ * End:
105788+ */
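
For illustration only (not part of the patch): the TPM transmit ring above is a fixed array of single-request slots, so resetting it is just a loop over TPMIF_TX_RING_SIZE entries.

    /* Hypothetical illustration only. */
    static void tpmif_ring_clear(tpmif_tx_interface_t *tx)
    {
        TPMIF_RING_IDX i;

        for (i = 0; i < TPMIF_TX_RING_SIZE; i++) {
            tx->ring[i].req.addr = 0;
            tx->ring[i].req.ref  = 0;
            tx->ring[i].req.size = 0;
        }
    }
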
105789diff -Nur linux-2.6.16.33-noxen/include/xen/interface/io/xenbus.h linux-2.6.16.33/include/xen/interface/io/xenbus.h
105790--- linux-2.6.16.33-noxen/include/xen/interface/io/xenbus.h 1970-01-01 00:00:00.000000000 +0000
105791+++ linux-2.6.16.33/include/xen/interface/io/xenbus.h 2007-01-08 15:00:55.000000000 +0000
105792@@ -0,0 +1,73 @@
105793+/*****************************************************************************
105794+ * xenbus.h
105795+ *
105796+ * Xenbus protocol details.
105797+ *
105798+ * Permission is hereby granted, free of charge, to any person obtaining a copy
105799+ * of this software and associated documentation files (the "Software"), to
105800+ * deal in the Software without restriction, including without limitation the
105801+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
105802+ * sell copies of the Software, and to permit persons to whom the Software is
105803+ * furnished to do so, subject to the following conditions:
105804+ *
105805+ * The above copyright notice and this permission notice shall be included in
105806+ * all copies or substantial portions of the Software.
105807+ *
105808+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
105809+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
105810+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
105811+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
105812+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
105813+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
105814+ * DEALINGS IN THE SOFTWARE.
105815+ *
105816+ * Copyright (C) 2005 XenSource Ltd.
105817+ */
105818+
105819+#ifndef _XEN_PUBLIC_IO_XENBUS_H
105820+#define _XEN_PUBLIC_IO_XENBUS_H
105821+
105822+/*
105823+ * The state of either end of the Xenbus, i.e. the current communication
105824+ * status of initialisation across the bus. States here imply nothing about
105825+ * the state of the connection between the driver and the kernel's device
105826+ * layers.
105827+ */
105828+enum xenbus_state {
105829+ XenbusStateUnknown = 0,
105830+
105831+ XenbusStateInitialising = 1,
105832+
105833+ /*
105834+ * InitWait: Finished early initialisation but waiting for information
105835+ * from the peer or hotplug scripts.
105836+ */
105837+ XenbusStateInitWait = 2,
105838+
105839+ /*
105840+ * Initialised: Waiting for a connection from the peer.
105841+ */
105842+ XenbusStateInitialised = 3,
105843+
105844+ XenbusStateConnected = 4,
105845+
105846+ /*
105847+ * Closing: The device is being closed due to an error or an unplug event.
105848+ */
105849+ XenbusStateClosing = 5,
105850+
105851+ XenbusStateClosed = 6
105852+};
105853+typedef enum xenbus_state XenbusState;
105854+
105855+#endif /* _XEN_PUBLIC_IO_XENBUS_H */
105856+
105857+/*
105858+ * Local variables:
105859+ * mode: C
105860+ * c-set-style: "BSD"
105861+ * c-basic-offset: 4
105862+ * tab-width: 4
105863+ * indent-tabs-mode: nil
105864+ * End:
105865+ */
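
As a small illustrative helper (not part of the patch), the state enumeration above is typically mapped to names when logging connection negotiation:

    /* Hypothetical illustration only. */
    static const char *xenbus_state_name(enum xenbus_state s)
    {
        switch (s) {
        case XenbusStateInitialising: return "Initialising";
        case XenbusStateInitWait:     return "InitWait";
        case XenbusStateInitialised:  return "Initialised";
        case XenbusStateConnected:    return "Connected";
        case XenbusStateClosing:      return "Closing";
        case XenbusStateClosed:       return "Closed";
        default:                      return "Unknown";
        }
    }
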
105866diff -Nur linux-2.6.16.33-noxen/include/xen/interface/io/xs_wire.h linux-2.6.16.33/include/xen/interface/io/xs_wire.h
105867--- linux-2.6.16.33-noxen/include/xen/interface/io/xs_wire.h 1970-01-01 00:00:00.000000000 +0000
105868+++ linux-2.6.16.33/include/xen/interface/io/xs_wire.h 2007-01-08 15:00:55.000000000 +0000
105869@@ -0,0 +1,116 @@
105870+/*
105871+ * Details of the "wire" protocol between Xen Store Daemon and client
105872+ * library or guest kernel.
105873+ *
105874+ * Permission is hereby granted, free of charge, to any person obtaining a copy
105875+ * of this software and associated documentation files (the "Software"), to
105876+ * deal in the Software without restriction, including without limitation the
105877+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
105878+ * sell copies of the Software, and to permit persons to whom the Software is
105879+ * furnished to do so, subject to the following conditions:
105880+ *
105881+ * The above copyright notice and this permission notice shall be included in
105882+ * all copies or substantial portions of the Software.
105883+ *
105884+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
105885+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
105886+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
105887+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
105888+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
105889+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
105890+ * DEALINGS IN THE SOFTWARE.
105891+ *
105892+ * Copyright (C) 2005 Rusty Russell IBM Corporation
105893+ */
105894+
105895+#ifndef _XS_WIRE_H
105896+#define _XS_WIRE_H
105897+
105898+enum xsd_sockmsg_type
105899+{
105900+ XS_DEBUG,
105901+ XS_DIRECTORY,
105902+ XS_READ,
105903+ XS_GET_PERMS,
105904+ XS_WATCH,
105905+ XS_UNWATCH,
105906+ XS_TRANSACTION_START,
105907+ XS_TRANSACTION_END,
105908+ XS_INTRODUCE,
105909+ XS_RELEASE,
105910+ XS_GET_DOMAIN_PATH,
105911+ XS_WRITE,
105912+ XS_MKDIR,
105913+ XS_RM,
105914+ XS_SET_PERMS,
105915+ XS_WATCH_EVENT,
105916+ XS_ERROR,
105917+ XS_IS_DOMAIN_INTRODUCED
105918+};
105919+
105920+#define XS_WRITE_NONE "NONE"
105921+#define XS_WRITE_CREATE "CREATE"
105922+#define XS_WRITE_CREATE_EXCL "CREATE|EXCL"
105923+
105924+/* We hand errors as strings, for portability. */
105925+struct xsd_errors
105926+{
105927+ int errnum;
105928+ const char *errstring;
105929+};
105930+#define XSD_ERROR(x) { x, #x }
105931+static struct xsd_errors xsd_errors[] __attribute__((unused)) = {
105932+ XSD_ERROR(EINVAL),
105933+ XSD_ERROR(EACCES),
105934+ XSD_ERROR(EEXIST),
105935+ XSD_ERROR(EISDIR),
105936+ XSD_ERROR(ENOENT),
105937+ XSD_ERROR(ENOMEM),
105938+ XSD_ERROR(ENOSPC),
105939+ XSD_ERROR(EIO),
105940+ XSD_ERROR(ENOTEMPTY),
105941+ XSD_ERROR(ENOSYS),
105942+ XSD_ERROR(EROFS),
105943+ XSD_ERROR(EBUSY),
105944+ XSD_ERROR(EAGAIN),
105945+ XSD_ERROR(EISCONN)
105946+};
105947+
105948+struct xsd_sockmsg
105949+{
105950+ uint32_t type; /* XS_??? */
105951+ uint32_t req_id;/* Request identifier, echoed in daemon's response. */
105952+ uint32_t tx_id; /* Transaction id (0 if not related to a transaction). */
105953+ uint32_t len; /* Length of data following this. */
105954+
105955+ /* Generally followed by nul-terminated string(s). */
105956+};
105957+
105958+enum xs_watch_type
105959+{
105960+ XS_WATCH_PATH = 0,
105961+ XS_WATCH_TOKEN
105962+};
105963+
105964+/* Inter-domain shared memory communications. */
105965+#define XENSTORE_RING_SIZE 1024
105966+typedef uint32_t XENSTORE_RING_IDX;
105967+#define MASK_XENSTORE_IDX(idx) ((idx) & (XENSTORE_RING_SIZE-1))
105968+struct xenstore_domain_interface {
105969+ char req[XENSTORE_RING_SIZE]; /* Requests to xenstore daemon. */
105970+ char rsp[XENSTORE_RING_SIZE]; /* Replies and async watch events. */
105971+ XENSTORE_RING_IDX req_cons, req_prod;
105972+ XENSTORE_RING_IDX rsp_cons, rsp_prod;
105973+};
105974+
105975+#endif /* _XS_WIRE_H */
105976+
105977+/*
105978+ * Local variables:
105979+ * mode: C
105980+ * c-set-style: "BSD"
105981+ * c-basic-offset: 4
105982+ * tab-width: 4
105983+ * indent-tabs-mode: nil
105984+ * End:
105985+ */
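
A minimal sketch of framing a request with the header above (hypothetical buffer handling, not part of the patch): a nul-terminated path follows the fixed header, and hdr.len covers only that payload.

    /* Hypothetical illustration only. */
    #include <stdint.h>
    #include <string.h>

    static size_t xs_frame_read(char *buf, uint32_t req_id, const char *path)
    {
        struct xsd_sockmsg hdr;

        hdr.type   = XS_READ;
        hdr.req_id = req_id;               /* echoed back by the daemon */
        hdr.tx_id  = 0;                    /* not part of a transaction */
        hdr.len    = strlen(path) + 1;     /* nul-terminated payload */

        memcpy(buf, &hdr, sizeof(hdr));
        memcpy(buf + sizeof(hdr), path, hdr.len);
        return sizeof(hdr) + hdr.len;
    }
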
105986diff -Nur linux-2.6.16.33-noxen/include/xen/interface/kexec.h linux-2.6.16.33/include/xen/interface/kexec.h
105987--- linux-2.6.16.33-noxen/include/xen/interface/kexec.h 1970-01-01 00:00:00.000000000 +0000
105988+++ linux-2.6.16.33/include/xen/interface/kexec.h 2007-01-08 15:00:55.000000000 +0000
105989@@ -0,0 +1,137 @@
105990+/******************************************************************************
105991+ * kexec.h - Public portion
105992+ *
105993+ * Xen port written by:
105994+ * - Simon 'Horms' Horman <horms@verge.net.au>
105995+ * - Magnus Damm <magnus@valinux.co.jp>
105996+ */
105997+
105998+#ifndef _XEN_PUBLIC_KEXEC_H
105999+#define _XEN_PUBLIC_KEXEC_H
106000+
106001+
106002+/* This file describes the Kexec / Kdump hypercall interface for Xen.
106003+ *
106004+ * Kexec under vanilla Linux allows a user to reboot the physical machine
106005+ * into a new user-specified kernel. The Xen port extends this idea
106006+ * to allow rebooting of the machine from dom0. When kexec for dom0
106007+ * is used to reboot, both the hypervisor and the domains get replaced
106008+ * with some other kernel. It is possible to kexec between vanilla
106009+ * Linux and Xen and back again. Xen to Xen works well too.
106010+ *
106011+ * The hypercall interface for kexec can be divided into three main
106012+ * types of hypercall operations:
106013+ *
106014+ * 1) Range information:
106015+ * This is used by the dom0 kernel to ask the hypervisor about various
106016+ * address information. This information is needed to allow kexec-tools
106017+ * to fill in the ELF headers for /proc/vmcore properly.
106018+ *
106019+ * 2) Load and unload of images:
106020+ * There are no big surprises here, the kexec binary from kexec-tools
106021+ * runs in userspace in dom0. The tool loads/unloads data into the
106022+ * dom0 kernel such as new kernel, initramfs and hypervisor. When
106023+ * loaded the dom0 kernel performs a load hypercall operation, and
106024+ * before releasing all page references the dom0 kernel calls unload.
106025+ *
106026+ * 3) Kexec operation:
106027+ * This is used to start a previously loaded kernel.
106028+ */
106029+
106030+#include "xen.h"
106031+
106032+#if defined(__i386__) || defined(__x86_64__)
106033+#define KEXEC_XEN_NO_PAGES 17
106034+#endif
106035+
106036+/*
106037+ * Prototype for this hypercall is:
106038+ * int kexec_op(int cmd, void *args)
106039+ * @cmd == KEXEC_CMD_...
106040+ * KEXEC operation to perform
106041+ * @args == Operation-specific extra arguments (NULL if none).
106042+ */
106043+
106044+/*
106045+ * Kexec supports two types of operation:
106046+ * - kexec into a regular kernel, very similar to a standard reboot
106047+ * - KEXEC_TYPE_DEFAULT is used to specify this type
106048+ * - kexec into a special "crash kernel", aka kexec-on-panic
106049+ * - KEXEC_TYPE_CRASH is used to specify this type
106050+ * - parts of our system may be broken at kexec-on-panic time
106051+ * - the code should be kept as simple and self-contained as possible
106052+ */
106053+
106054+#define KEXEC_TYPE_DEFAULT 0
106055+#define KEXEC_TYPE_CRASH 1
106056+
106057+
106058+/* The kexec implementation for Xen allows the user to load two
106059+ * types of kernels, KEXEC_TYPE_DEFAULT and KEXEC_TYPE_CRASH.
106060+ * All data needed for a kexec reboot is kept in one xen_kexec_image_t
106061+ * per "instance". The data mainly consists of machine address lists to pages
106062+ * together with destination addresses. The data in xen_kexec_image_t
106063+ * is passed to the "code page" which is one page of code that performs
106064+ * the final relocations before jumping to the new kernel.
106065+ */
106066+
106067+typedef struct xen_kexec_image {
106068+#if defined(__i386__) || defined(__x86_64__)
106069+ unsigned long page_list[KEXEC_XEN_NO_PAGES];
106070+#endif
106071+ unsigned long indirection_page;
106072+ unsigned long start_address;
106073+} xen_kexec_image_t;
106074+
106075+/*
106076+ * Perform kexec having previously loaded a kexec or kdump kernel
106077+ * as appropriate.
106078+ * type == KEXEC_TYPE_DEFAULT or KEXEC_TYPE_CRASH [in]
106079+ */
106080+#define KEXEC_CMD_kexec 0
106081+typedef struct xen_kexec_exec {
106082+ int type;
106083+} xen_kexec_exec_t;
106084+
106085+/*
106086+ * Load/Unload kernel image for kexec or kdump.
106087+ * type == KEXEC_TYPE_DEFAULT or KEXEC_TYPE_CRASH [in]
106088+ * image == relocation information for kexec (ignored for unload) [in]
106089+ */
106090+#define KEXEC_CMD_kexec_load 1
106091+#define KEXEC_CMD_kexec_unload 2
106092+typedef struct xen_kexec_load {
106093+ int type;
106094+ xen_kexec_image_t image;
106095+} xen_kexec_load_t;
106096+
106097+#define KEXEC_RANGE_MA_CRASH 0 /* machine address and size of crash area */
106098+#define KEXEC_RANGE_MA_XEN 1 /* machine address and size of Xen itself */
106099+#define KEXEC_RANGE_MA_CPU 2 /* machine address and size of a CPU note */
106100+
106101+/*
106102+ * Find the address and size of certain memory areas
106103+ * range == KEXEC_RANGE_... [in]
106104+ * nr == physical CPU number (starting from 0) if KEXEC_RANGE_MA_CPU [in]
106105+ * size == number of bytes reserved in window [out]
106106+ * start == address of the first byte in the window [out]
106107+ */
106108+#define KEXEC_CMD_kexec_get_range 3
106109+typedef struct xen_kexec_range {
106110+ int range;
106111+ int nr;
106112+ unsigned long size;
106113+ unsigned long start;
106114+} xen_kexec_range_t;
106115+
106116+#endif /* _XEN_PUBLIC_KEXEC_H */
106117+
106118+/*
106119+ * Local variables:
106120+ * mode: C
106121+ * c-set-style: "BSD"
106122+ * c-basic-offset: 4
106123+ * tab-width: 4
106124+ * indent-tabs-mode: nil
106125+ * End:
106126+ */
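
Purely as an illustration (not part of the patch), a dom0 tool asking for the crash region fills the range structure above and issues KEXEC_CMD_kexec_get_range; the hypercall wrapper itself is kernel-specific and only hinted at in a comment.

    /* Hypothetical illustration only. */
    static void query_crash_range(xen_kexec_range_t *range)
    {
        range->range = KEXEC_RANGE_MA_CRASH;
        range->nr    = 0;              /* only used for KEXEC_RANGE_MA_CPU */
        range->size  = 0;              /* filled in by Xen */
        range->start = 0;              /* filled in by Xen */
        /* kexec_op(KEXEC_CMD_kexec_get_range, range) would be issued here. */
    }
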
106127diff -Nur linux-2.6.16.33-noxen/include/xen/interface/memory.h linux-2.6.16.33/include/xen/interface/memory.h
106128--- linux-2.6.16.33-noxen/include/xen/interface/memory.h 1970-01-01 00:00:00.000000000 +0000
106129+++ linux-2.6.16.33/include/xen/interface/memory.h 2007-01-08 15:00:55.000000000 +0000
106130@@ -0,0 +1,276 @@
106131+/******************************************************************************
106132+ * memory.h
106133+ *
106134+ * Memory reservation and information.
106135+ *
106136+ * Permission is hereby granted, free of charge, to any person obtaining a copy
106137+ * of this software and associated documentation files (the "Software"), to
106138+ * deal in the Software without restriction, including without limitation the
106139+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
106140+ * sell copies of the Software, and to permit persons to whom the Software is
106141+ * furnished to do so, subject to the following conditions:
106142+ *
106143+ * The above copyright notice and this permission notice shall be included in
106144+ * all copies or substantial portions of the Software.
106145+ *
106146+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
106147+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
106148+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
106149+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
106150+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
106151+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
106152+ * DEALINGS IN THE SOFTWARE.
106153+ *
106154+ * Copyright (c) 2005, Keir Fraser <keir@xensource.com>
106155+ */
106156+
106157+#ifndef __XEN_PUBLIC_MEMORY_H__
106158+#define __XEN_PUBLIC_MEMORY_H__
106159+
106160+/*
106161+ * Increase or decrease the specified domain's memory reservation. Returns the
106162+ * number of extents successfully allocated or freed.
106163+ * arg == addr of struct xen_memory_reservation.
106164+ */
106165+#define XENMEM_increase_reservation 0
106166+#define XENMEM_decrease_reservation 1
106167+#define XENMEM_populate_physmap 6
106168+struct xen_memory_reservation {
106169+
106170+ /*
106171+ * XENMEM_increase_reservation:
106172+ * OUT: MFN (*not* GMFN) bases of extents that were allocated
106173+ * XENMEM_decrease_reservation:
106174+ * IN: GMFN bases of extents to free
106175+ * XENMEM_populate_physmap:
106176+ * IN: GPFN bases of extents to populate with memory
106177+ * OUT: GMFN bases of extents that were allocated
106178+ * (NB. This command also updates the mach_to_phys translation table)
106179+ */
106180+ XEN_GUEST_HANDLE(xen_pfn_t) extent_start;
106181+
106182+ /* Number of extents, and size/alignment of each (2^extent_order pages). */
106183+ xen_ulong_t nr_extents;
106184+ unsigned int extent_order;
106185+
106186+ /*
106187+ * Maximum # bits addressable by the user of the allocated region (e.g.,
106188+ * I/O devices often have a 32-bit limitation even in 64-bit systems). If
106189+ * zero then the user has no addressing restriction.
106190+ * This field is not used by XENMEM_decrease_reservation.
106191+ */
106192+ unsigned int address_bits;
106193+
106194+ /*
106195+ * Domain whose reservation is being changed.
106196+ * Unprivileged domains can specify only DOMID_SELF.
106197+ */
106198+ domid_t domid;
106199+};
106200+typedef struct xen_memory_reservation xen_memory_reservation_t;
106201+DEFINE_XEN_GUEST_HANDLE(xen_memory_reservation_t);
106202+
106203+/*
106204+ * An atomic exchange of memory pages. If return code is zero then
106205+ * @out.extent_list provides GMFNs of the newly-allocated memory.
106206+ * Returns zero on complete success, otherwise a negative error code.
106207+ * On complete success then always @nr_exchanged == @in.nr_extents.
106208+ * On partial success @nr_exchanged indicates how much work was done.
106209+ */
106210+#define XENMEM_exchange 11
106211+struct xen_memory_exchange {
106212+ /*
106213+ * [IN] Details of memory extents to be exchanged (GMFN bases).
106214+ * Note that @in.address_bits is ignored and unused.
106215+ */
106216+ struct xen_memory_reservation in;
106217+
106218+ /*
106219+ * [IN/OUT] Details of new memory extents.
106220+ * We require that:
106221+ * 1. @in.domid == @out.domid
106222+ * 2. @in.nr_extents << @in.extent_order ==
106223+ * @out.nr_extents << @out.extent_order
106224+ * 3. @in.extent_start and @out.extent_start lists must not overlap
106225+ * 4. @out.extent_start lists GPFN bases to be populated
106226+ * 5. @out.extent_start is overwritten with allocated GMFN bases
106227+ */
106228+ struct xen_memory_reservation out;
106229+
106230+ /*
106231+ * [OUT] Number of input extents that were successfully exchanged:
106232+ * 1. The first @nr_exchanged input extents were successfully
106233+ * deallocated.
106234+ * 2. The corresponding first entries in the output extent list correctly
106235+ * indicate the GMFNs that were successfully exchanged.
106236+ * 3. All other input and output extents are untouched.
106237+ * 4. If not all input extents are exchanged then the return code of this
106238+ * command will be non-zero.
106239+ * 5. THIS FIELD MUST BE INITIALISED TO ZERO BY THE CALLER!
106240+ */
106241+ xen_ulong_t nr_exchanged;
106242+};
106243+typedef struct xen_memory_exchange xen_memory_exchange_t;
106244+DEFINE_XEN_GUEST_HANDLE(xen_memory_exchange_t);
106245+
106246+/*
106247+ * Returns the maximum machine frame number of mapped RAM in this system.
106248+ * This command always succeeds (it never returns an error code).
106249+ * arg == NULL.
106250+ */
106251+#define XENMEM_maximum_ram_page 2
106252+
106253+/*
106254+ * Returns the current or maximum memory reservation, in pages, of the
106255+ * specified domain (may be DOMID_SELF). Returns -ve errcode on failure.
106256+ * arg == addr of domid_t.
106257+ */
106258+#define XENMEM_current_reservation 3
106259+#define XENMEM_maximum_reservation 4
106260+
106261+/*
106262+ * Returns a list of MFN bases of 2MB extents comprising the machine_to_phys
106263+ * mapping table. Architectures which do not have a m2p table do not implement
106264+ * this command.
106265+ * arg == addr of xen_machphys_mfn_list_t.
106266+ */
106267+#define XENMEM_machphys_mfn_list 5
106268+struct xen_machphys_mfn_list {
106269+ /*
106270+ * Size of the 'extent_start' array. Fewer entries will be filled if the
106271+ * machphys table is smaller than max_extents * 2MB.
106272+ */
106273+ unsigned int max_extents;
106274+
106275+ /*
106276+ * Pointer to buffer to fill with list of extent starts. If there are
106277+ * any large discontiguities in the machine address space, 2MB gaps in
106278+ * the machphys table will be represented by an MFN base of zero.
106279+ */
106280+ XEN_GUEST_HANDLE(xen_pfn_t) extent_start;
106281+
106282+ /*
106283+ * Number of extents written to the above array. This will be smaller
106284+ * than 'max_extents' if the machphys table is smaller than max_extents * 2MB.
106285+ */
106286+ unsigned int nr_extents;
106287+};
106288+typedef struct xen_machphys_mfn_list xen_machphys_mfn_list_t;
106289+DEFINE_XEN_GUEST_HANDLE(xen_machphys_mfn_list_t);
106290+
106291+/*
106292+ * Returns the location in virtual address space of the machine_to_phys
106293+ * mapping table. Architectures which do not have a m2p table, or which do not
106294+ * map it by default into guest address space, do not implement this command.
106295+ * arg == addr of xen_machphys_mapping_t.
106296+ */
106297+#define XENMEM_machphys_mapping 12
106298+struct xen_machphys_mapping {
106299+ xen_ulong_t v_start, v_end; /* Start and end virtual addresses. */
106300+ xen_ulong_t max_mfn; /* Maximum MFN that can be looked up. */
106301+};
106302+typedef struct xen_machphys_mapping xen_machphys_mapping_t;
106303+DEFINE_XEN_GUEST_HANDLE(xen_machphys_mapping_t);
106304+
106305+/*
106306+ * Sets the GPFN at which a particular page appears in the specified guest's
106307+ * pseudophysical address space.
106308+ * arg == addr of xen_add_to_physmap_t.
106309+ */
106310+#define XENMEM_add_to_physmap 7
106311+struct xen_add_to_physmap {
106312+ /* Which domain to change the mapping for. */
106313+ domid_t domid;
106314+
106315+ /* Source mapping space. */
106316+#define XENMAPSPACE_shared_info 0 /* shared info page */
106317+#define XENMAPSPACE_grant_table 1 /* grant table page */
106318+ unsigned int space;
106319+
106320+ /* Index into source mapping space. */
106321+ xen_ulong_t idx;
106322+
106323+ /* GPFN where the source mapping page should appear. */
106324+ xen_pfn_t gpfn;
106325+};
106326+typedef struct xen_add_to_physmap xen_add_to_physmap_t;
106327+DEFINE_XEN_GUEST_HANDLE(xen_add_to_physmap_t);
106328+
106329+/*
106330+ * Translates a list of domain-specific GPFNs into MFNs. Returns a -ve error
106331+ * code on failure. This call only works for auto-translated guests.
106332+ */
106333+#define XENMEM_translate_gpfn_list 8
106334+struct xen_translate_gpfn_list {
106335+ /* Which domain to translate for? */
106336+ domid_t domid;
106337+
106338+ /* Length of list. */
106339+ xen_ulong_t nr_gpfns;
106340+
106341+ /* List of GPFNs to translate. */
106342+ XEN_GUEST_HANDLE(xen_pfn_t) gpfn_list;
106343+
106344+ /*
106345+ * Output list to contain MFN translations. May be the same as the input
106346+ * list (in which case each input GPFN is overwritten with the output MFN).
106347+ */
106348+ XEN_GUEST_HANDLE(xen_pfn_t) mfn_list;
106349+};
106350+typedef struct xen_translate_gpfn_list xen_translate_gpfn_list_t;
106351+DEFINE_XEN_GUEST_HANDLE(xen_translate_gpfn_list_t);
106352+
106353+/*
106354+ * Returns the pseudo-physical memory map as it was when the domain
106355+ * was started (specified by XENMEM_set_memory_map).
106356+ * arg == addr of xen_memory_map_t.
106357+ */
106358+#define XENMEM_memory_map 9
106359+struct xen_memory_map {
106360+ /*
106361+ * On call the number of entries which can be stored in buffer. On
106362+ * return the number of entries which have been stored in
106363+ * buffer.
106364+ */
106365+ unsigned int nr_entries;
106366+
106367+ /*
106368+ * Entries in the buffer are in the same format as returned by the
106369+ * BIOS INT 0x15 EAX=0xE820 call.
106370+ */
106371+ XEN_GUEST_HANDLE(void) buffer;
106372+};
106373+typedef struct xen_memory_map xen_memory_map_t;
106374+DEFINE_XEN_GUEST_HANDLE(xen_memory_map_t);
106375+
106376+/*
106377+ * Returns the real physical memory map. Passes the same structure as
106378+ * XENMEM_memory_map.
106379+ * arg == addr of xen_memory_map_t.
106380+ */
106381+#define XENMEM_machine_memory_map 10
106382+
106383+/*
106384+ * Set the pseudo-physical memory map of a domain, as returned by
106385+ * XENMEM_memory_map.
106386+ * arg == addr of xen_foreign_memory_map_t.
106387+ */
106388+#define XENMEM_set_memory_map 13
106389+struct xen_foreign_memory_map {
106390+ domid_t domid;
106391+ struct xen_memory_map map;
106392+};
106393+typedef struct xen_foreign_memory_map xen_foreign_memory_map_t;
106394+DEFINE_XEN_GUEST_HANDLE(xen_foreign_memory_map_t);
106395+
106396+#endif /* __XEN_PUBLIC_MEMORY_H__ */
106397+
106398+/*
106399+ * Local variables:
106400+ * mode: C
106401+ * c-set-style: "BSD"
106402+ * c-basic-offset: 4
106403+ * tab-width: 4
106404+ * indent-tabs-mode: nil
106405+ * End:
106406+ */
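
A hedged sketch of preparing a XENMEM_populate_physmap reservation with the structure above (not part of the patch; it assumes the usual set_xen_guest_handle() accessor from the arch headers and a caller-provided GPFN array):

    /* Hypothetical illustration only. */
    static void fill_populate_physmap(struct xen_memory_reservation *r,
                                      xen_pfn_t *gpfn_list,
                                      xen_ulong_t nr, unsigned int order)
    {
        set_xen_guest_handle(r->extent_start, gpfn_list); /* GPFN bases (IN) */
        r->nr_extents   = nr;
        r->extent_order = order;          /* each extent is 2^order pages */
        r->address_bits = 0;              /* no addressing restriction */
        r->domid        = DOMID_SELF;
        /* Passed as the argument of a XENMEM_populate_physmap memory op. */
    }
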
106407diff -Nur linux-2.6.16.33-noxen/include/xen/interface/nmi.h linux-2.6.16.33/include/xen/interface/nmi.h
106408--- linux-2.6.16.33-noxen/include/xen/interface/nmi.h 1970-01-01 00:00:00.000000000 +0000
106409+++ linux-2.6.16.33/include/xen/interface/nmi.h 2007-01-08 15:00:55.000000000 +0000
106410@@ -0,0 +1,78 @@
106411+/******************************************************************************
106412+ * nmi.h
106413+ *
106414+ * NMI callback registration and reason codes.
106415+ *
106416+ * Permission is hereby granted, free of charge, to any person obtaining a copy
106417+ * of this software and associated documentation files (the "Software"), to
106418+ * deal in the Software without restriction, including without limitation the
106419+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
106420+ * sell copies of the Software, and to permit persons to whom the Software is
106421+ * furnished to do so, subject to the following conditions:
106422+ *
106423+ * The above copyright notice and this permission notice shall be included in
106424+ * all copies or substantial portions of the Software.
106425+ *
106426+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
106427+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
106428+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
106429+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
106430+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
106431+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
106432+ * DEALINGS IN THE SOFTWARE.
106433+ *
106434+ * Copyright (c) 2005, Keir Fraser <keir@xensource.com>
106435+ */
106436+
106437+#ifndef __XEN_PUBLIC_NMI_H__
106438+#define __XEN_PUBLIC_NMI_H__
106439+
106440+/*
106441+ * NMI reason codes:
106442+ * Currently these are x86-specific, stored in arch_shared_info.nmi_reason.
106443+ */
106444+ /* I/O-check error reported via ISA port 0x61, bit 6. */
106445+#define _XEN_NMIREASON_io_error 0
106446+#define XEN_NMIREASON_io_error (1UL << _XEN_NMIREASON_io_error)
106447+ /* Parity error reported via ISA port 0x61, bit 7. */
106448+#define _XEN_NMIREASON_parity_error 1
106449+#define XEN_NMIREASON_parity_error (1UL << _XEN_NMIREASON_parity_error)
106450+ /* Unknown hardware-generated NMI. */
106451+#define _XEN_NMIREASON_unknown 2
106452+#define XEN_NMIREASON_unknown (1UL << _XEN_NMIREASON_unknown)
106453+
106454+/*
106455+ * long nmi_op(unsigned int cmd, void *arg)
106456+ * NB. All ops return zero on success, else a negative error code.
106457+ */
106458+
106459+/*
106460+ * Register NMI callback for this (calling) VCPU. Currently this only makes
106461+ * sense for domain 0, vcpu 0. All other callers will receive EINVAL.
106462+ * arg == pointer to xennmi_callback structure.
106463+ */
106464+#define XENNMI_register_callback 0
106465+struct xennmi_callback {
106466+ unsigned long handler_address;
106467+ unsigned long pad;
106468+};
106469+typedef struct xennmi_callback xennmi_callback_t;
106470+DEFINE_XEN_GUEST_HANDLE(xennmi_callback_t);
106471+
106472+/*
106473+ * Deregister NMI callback for this (calling) VCPU.
106474+ * arg == NULL.
106475+ */
106476+#define XENNMI_unregister_callback 1
106477+
106478+#endif /* __XEN_PUBLIC_NMI_H__ */
106479+
106480+/*
106481+ * Local variables:
106482+ * mode: C
106483+ * c-set-style: "BSD"
106484+ * c-basic-offset: 4
106485+ * tab-width: 4
106486+ * indent-tabs-mode: nil
106487+ * End:
106488+ */
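
For illustration (not part of the patch), registering the dom0 NMI callback amounts to filling the structure above and issuing XENNMI_register_callback:

    /* Hypothetical illustration only. */
    static void fill_nmi_callback(struct xennmi_callback *cb,
                                  void (*handler)(void))
    {
        cb->handler_address = (unsigned long)handler;
        cb->pad             = 0;
        /* Passed as the argument of XENNMI_register_callback
         * (only meaningful for domain 0, vcpu 0). */
    }
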
106489diff -Nur linux-2.6.16.33-noxen/include/xen/interface/physdev.h linux-2.6.16.33/include/xen/interface/physdev.h
106490--- linux-2.6.16.33-noxen/include/xen/interface/physdev.h 1970-01-01 00:00:00.000000000 +0000
106491+++ linux-2.6.16.33/include/xen/interface/physdev.h 2007-01-08 15:00:55.000000000 +0000
106492@@ -0,0 +1,169 @@
106493+/*
106494+ * Permission is hereby granted, free of charge, to any person obtaining a copy
106495+ * of this software and associated documentation files (the "Software"), to
106496+ * deal in the Software without restriction, including without limitation the
106497+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
106498+ * sell copies of the Software, and to permit persons to whom the Software is
106499+ * furnished to do so, subject to the following conditions:
106500+ *
106501+ * The above copyright notice and this permission notice shall be included in
106502+ * all copies or substantial portions of the Software.
106503+ *
106504+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
106505+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
106506+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
106507+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
106508+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
106509+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
106510+ * DEALINGS IN THE SOFTWARE.
106511+ */
106512+
106513+#ifndef __XEN_PUBLIC_PHYSDEV_H__
106514+#define __XEN_PUBLIC_PHYSDEV_H__
106515+
106516+/*
106517+ * Prototype for this hypercall is:
106518+ * int physdev_op(int cmd, void *args)
106519+ * @cmd == PHYSDEVOP_??? (physdev operation).
106520+ * @args == Operation-specific extra arguments (NULL if none).
106521+ */
106522+
106523+/*
106524+ * Notify end-of-interrupt (EOI) for the specified IRQ.
106525+ * @arg == pointer to physdev_eoi structure.
106526+ */
106527+#define PHYSDEVOP_eoi 12
106528+struct physdev_eoi {
106529+ /* IN */
106530+ uint32_t irq;
106531+};
106532+typedef struct physdev_eoi physdev_eoi_t;
106533+DEFINE_XEN_GUEST_HANDLE(physdev_eoi_t);
106534+
106535+/*
106536+ * Query the status of an IRQ line.
106537+ * @arg == pointer to physdev_irq_status_query structure.
106538+ */
106539+#define PHYSDEVOP_irq_status_query 5
106540+struct physdev_irq_status_query {
106541+ /* IN */
106542+ uint32_t irq;
106543+ /* OUT */
106544+ uint32_t flags; /* XENIRQSTAT_* */
106545+};
106546+typedef struct physdev_irq_status_query physdev_irq_status_query_t;
106547+DEFINE_XEN_GUEST_HANDLE(physdev_irq_status_query_t);
106548+
106549+/* Need to call PHYSDEVOP_eoi when the IRQ has been serviced? */
106550+#define _XENIRQSTAT_needs_eoi (0)
106551+#define XENIRQSTAT_needs_eoi (1U<<_XENIRQSTAT_needs_eoi)
106552+
106553+/* IRQ shared by multiple guests? */
106554+#define _XENIRQSTAT_shared (1)
106555+#define XENIRQSTAT_shared (1U<<_XENIRQSTAT_shared)
106556+
106557+/*
106558+ * Set the current VCPU's I/O privilege level.
106559+ * @arg == pointer to physdev_set_iopl structure.
106560+ */
106561+#define PHYSDEVOP_set_iopl 6
106562+struct physdev_set_iopl {
106563+ /* IN */
106564+ uint32_t iopl;
106565+};
106566+typedef struct physdev_set_iopl physdev_set_iopl_t;
106567+DEFINE_XEN_GUEST_HANDLE(physdev_set_iopl_t);
106568+
106569+/*
106570+ * Set the current VCPU's I/O-port permissions bitmap.
106571+ * @arg == pointer to physdev_set_iobitmap structure.
106572+ */
106573+#define PHYSDEVOP_set_iobitmap 7
106574+struct physdev_set_iobitmap {
106575+ /* IN */
106576+ XEN_GUEST_HANDLE_00030205(uint8_t) bitmap;
106577+ uint32_t nr_ports;
106578+};
106579+typedef struct physdev_set_iobitmap physdev_set_iobitmap_t;
106580+DEFINE_XEN_GUEST_HANDLE(physdev_set_iobitmap_t);
106581+
106582+/*
106583+ * Read or write an IO-APIC register.
106584+ * @arg == pointer to physdev_apic structure.
106585+ */
106586+#define PHYSDEVOP_apic_read 8
106587+#define PHYSDEVOP_apic_write 9
106588+struct physdev_apic {
106589+ /* IN */
106590+ unsigned long apic_physbase;
106591+ uint32_t reg;
106592+ /* IN or OUT */
106593+ uint32_t value;
106594+};
106595+typedef struct physdev_apic physdev_apic_t;
106596+DEFINE_XEN_GUEST_HANDLE(physdev_apic_t);
106597+
106598+/*
106599+ * Allocate or free a physical upcall vector for the specified IRQ line.
106600+ * @arg == pointer to physdev_irq structure.
106601+ */
106602+#define PHYSDEVOP_alloc_irq_vector 10
106603+#define PHYSDEVOP_free_irq_vector 11
106604+struct physdev_irq {
106605+ /* IN */
106606+ uint32_t irq;
106607+ /* IN or OUT */
106608+ uint32_t vector;
106609+};
106610+typedef struct physdev_irq physdev_irq_t;
106611+DEFINE_XEN_GUEST_HANDLE(physdev_irq_t);
106612+
106613+/*
106614+ * Argument to physdev_op_compat() hypercall. Superseded by the new physdev_op()
106615+ * hypercall since 0x00030202.
106616+ */
106617+struct physdev_op {
106618+ uint32_t cmd;
106619+ union {
106620+ struct physdev_irq_status_query irq_status_query;
106621+ struct physdev_set_iopl set_iopl;
106622+ struct physdev_set_iobitmap set_iobitmap;
106623+ struct physdev_apic apic_op;
106624+ struct physdev_irq irq_op;
106625+ } u;
106626+};
106627+typedef struct physdev_op physdev_op_t;
106628+DEFINE_XEN_GUEST_HANDLE(physdev_op_t);
106629+
106630+/*
106631+ * Notify that some PIRQ-bound event channels have been unmasked.
106632+ * ** This command is obsolete since interface version 0x00030202 and is **
106633+ * ** unsupported by newer versions of Xen. **
106634+ */
106635+#define PHYSDEVOP_IRQ_UNMASK_NOTIFY 4
106636+
106637+/*
106638+ * These all-capitals physdev operation names are superseded by the new names
106639+ * (defined above) since interface version 0x00030202.
106640+ */
106641+#define PHYSDEVOP_IRQ_STATUS_QUERY PHYSDEVOP_irq_status_query
106642+#define PHYSDEVOP_SET_IOPL PHYSDEVOP_set_iopl
106643+#define PHYSDEVOP_SET_IOBITMAP PHYSDEVOP_set_iobitmap
106644+#define PHYSDEVOP_APIC_READ PHYSDEVOP_apic_read
106645+#define PHYSDEVOP_APIC_WRITE PHYSDEVOP_apic_write
106646+#define PHYSDEVOP_ASSIGN_VECTOR PHYSDEVOP_alloc_irq_vector
106647+#define PHYSDEVOP_FREE_VECTOR PHYSDEVOP_free_irq_vector
106648+#define PHYSDEVOP_IRQ_NEEDS_UNMASK_NOTIFY XENIRQSTAT_needs_eoi
106649+#define PHYSDEVOP_IRQ_SHARED XENIRQSTAT_shared
106650+
106651+#endif /* __XEN_PUBLIC_PHYSDEV_H__ */
106652+
106653+/*
106654+ * Local variables:
106655+ * mode: C
106656+ * c-set-style: "BSD"
106657+ * c-basic-offset: 4
106658+ * tab-width: 4
106659+ * indent-tabs-mode: nil
106660+ * End:
106661+ */
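
A small illustrative sequence (not part of the patch): after servicing an IRQ whose status query reported XENIRQSTAT_needs_eoi, the guest fills a physdev_eoi and issues PHYSDEVOP_eoi; the hypercall wrapper is omitted here.

    /* Hypothetical illustration only. */
    static void guest_irq_done(uint32_t irq, uint32_t status_flags)
    {
        if (status_flags & XENIRQSTAT_needs_eoi) {
            struct physdev_eoi eoi = { .irq = irq };
            /* physdev_op(PHYSDEVOP_eoi, &eoi) would be issued here. */
            (void)eoi;
        }
    }
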
106662diff -Nur linux-2.6.16.33-noxen/include/xen/interface/platform.h linux-2.6.16.33/include/xen/interface/platform.h
106663--- linux-2.6.16.33-noxen/include/xen/interface/platform.h 1970-01-01 00:00:00.000000000 +0000
106664+++ linux-2.6.16.33/include/xen/interface/platform.h 2007-01-08 15:00:55.000000000 +0000
106665@@ -0,0 +1,143 @@
106666+/******************************************************************************
106667+ * platform.h
106668+ *
106669+ * Hardware platform operations. Intended for use by domain-0 kernel.
106670+ *
106671+ * Permission is hereby granted, free of charge, to any person obtaining a copy
106672+ * of this software and associated documentation files (the "Software"), to
106673+ * deal in the Software without restriction, including without limitation the
106674+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
106675+ * sell copies of the Software, and to permit persons to whom the Software is
106676+ * furnished to do so, subject to the following conditions:
106677+ *
106678+ * The above copyright notice and this permission notice shall be included in
106679+ * all copies or substantial portions of the Software.
106680+ *
106681+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
106682+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
106683+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
106684+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
106685+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
106686+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
106687+ * DEALINGS IN THE SOFTWARE.
106688+ *
106689+ * Copyright (c) 2002-2006, K Fraser
106690+ */
106691+
106692+#ifndef __XEN_PUBLIC_PLATFORM_H__
106693+#define __XEN_PUBLIC_PLATFORM_H__
106694+
106695+#include "xen.h"
106696+
106697+#define XENPF_INTERFACE_VERSION 0x03000001
106698+
106699+/*
106700+ * Set clock such that it would read <secs,nsecs> after 00:00:00 UTC,
106701+ * 1 January, 1970 if the current system time was <system_time>.
106702+ */
106703+#define XENPF_settime 17
106704+struct xenpf_settime {
106705+ /* IN variables. */
106706+ uint32_t secs;
106707+ uint32_t nsecs;
106708+ uint64_t system_time;
106709+};
106710+typedef struct xenpf_settime xenpf_settime_t;
106711+DEFINE_XEN_GUEST_HANDLE(xenpf_settime_t);
106712+
106713+/*
106714+ * Request memory range (@mfn, @mfn+@nr_mfns-1) to have type @type.
106715+ * On x86, @type is an architecture-defined MTRR memory type.
106716+ * On success, returns the MTRR that was used (@reg) and a handle that can
106717+ * be passed to XENPF_del_memtype to accurately tear down the new setting.
106718+ * (x86-specific).
106719+ */
106720+#define XENPF_add_memtype 31
106721+struct xenpf_add_memtype {
106722+ /* IN variables. */
106723+ xen_pfn_t mfn;
106724+ uint64_t nr_mfns;
106725+ uint32_t type;
106726+ /* OUT variables. */
106727+ uint32_t handle;
106728+ uint32_t reg;
106729+};
106730+typedef struct xenpf_add_memtype xenpf_add_memtype_t;
106731+DEFINE_XEN_GUEST_HANDLE(xenpf_add_memtype_t);
106732+
106733+/*
106734+ * Tear down an existing memory-range type. If @handle is remembered then it
106735+ * should be passed in to accurately tear down the correct setting (in case
106736+ * of overlapping memory regions with differing types). If it is not known
106737+ * then @handle should be set to zero. In all cases @reg must be set.
106738+ * (x86-specific).
106739+ */
106740+#define XENPF_del_memtype 32
106741+struct xenpf_del_memtype {
106742+ /* IN variables. */
106743+ uint32_t handle;
106744+ uint32_t reg;
106745+};
106746+typedef struct xenpf_del_memtype xenpf_del_memtype_t;
106747+DEFINE_XEN_GUEST_HANDLE(xenpf_del_memtype_t);
106748+
106749+/* Read current type of an MTRR (x86-specific). */
106750+#define XENPF_read_memtype 33
106751+struct xenpf_read_memtype {
106752+ /* IN variables. */
106753+ uint32_t reg;
106754+ /* OUT variables. */
106755+ xen_pfn_t mfn;
106756+ uint64_t nr_mfns;
106757+ uint32_t type;
106758+};
106759+typedef struct xenpf_read_memtype xenpf_read_memtype_t;
106760+DEFINE_XEN_GUEST_HANDLE(xenpf_read_memtype_t);
106761+
106762+#define XENPF_microcode_update 35
106763+struct xenpf_microcode_update {
106764+ /* IN variables. */
106765+ XEN_GUEST_HANDLE(void) data; /* Pointer to microcode data */
106766+ uint32_t length; /* Length of microcode data. */
106767+};
106768+typedef struct xenpf_microcode_update xenpf_microcode_update_t;
106769+DEFINE_XEN_GUEST_HANDLE(xenpf_microcode_update_t);
106770+
106771+#define XENPF_platform_quirk 39
106772+#define QUIRK_NOIRQBALANCING 1 /* Do not restrict IO-APIC RTE targets */
106773+#define QUIRK_IOAPIC_BAD_REGSEL 2 /* IO-APIC REGSEL forgets its value */
106774+#define QUIRK_IOAPIC_GOOD_REGSEL 3 /* IO-APIC REGSEL behaves properly */
106775+struct xenpf_platform_quirk {
106776+ /* IN variables. */
106777+ uint32_t quirk_id;
106778+};
106779+typedef struct xenpf_platform_quirk xenpf_platform_quirk_t;
106780+DEFINE_XEN_GUEST_HANDLE(xenpf_platform_quirk_t);
106781+
106782+struct xen_platform_op {
106783+ uint32_t cmd;
106784+ uint32_t interface_version; /* XENPF_INTERFACE_VERSION */
106785+ union {
106786+ struct xenpf_settime settime;
106787+ struct xenpf_add_memtype add_memtype;
106788+ struct xenpf_del_memtype del_memtype;
106789+ struct xenpf_read_memtype read_memtype;
106790+ struct xenpf_microcode_update microcode;
106791+ struct xenpf_platform_quirk platform_quirk;
106792+ uint8_t pad[128];
106793+ } u;
106794+};
106795+typedef struct xen_platform_op xen_platform_op_t;
106796+DEFINE_XEN_GUEST_HANDLE(xen_platform_op_t);
106797+
106798+#endif /* __XEN_PUBLIC_PLATFORM_H__ */
106799+
106800+/*
106801+ * Local variables:
106802+ * mode: C
106803+ * c-set-style: "BSD"
106804+ * c-basic-offset: 4
106805+ * tab-width: 4
106806+ * indent-tabs-mode: nil
106807+ * End:
106808+ */
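
As an illustration of the operation wrapper above (not part of the patch), setting the wallclock from dom0 fills the settime sub-structure inside xen_platform_op before handing it to the platform hypercall:

    /* Hypothetical illustration only. */
    static void fill_settime(struct xen_platform_op *op,
                             uint32_t secs, uint32_t nsecs, uint64_t now)
    {
        op->cmd               = XENPF_settime;
        op->interface_version = XENPF_INTERFACE_VERSION;
        op->u.settime.secs        = secs;
        op->u.settime.nsecs       = nsecs;
        op->u.settime.system_time = now;   /* current Xen system time */
    }
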
106809diff -Nur linux-2.6.16.33-noxen/include/xen/interface/sched.h linux-2.6.16.33/include/xen/interface/sched.h
106810--- linux-2.6.16.33-noxen/include/xen/interface/sched.h 1970-01-01 00:00:00.000000000 +0000
106811+++ linux-2.6.16.33/include/xen/interface/sched.h 2007-01-08 15:00:55.000000000 +0000
106812@@ -0,0 +1,121 @@
106813+/******************************************************************************
106814+ * sched.h
106815+ *
106816+ * Scheduler state interactions
106817+ *
106818+ * Permission is hereby granted, free of charge, to any person obtaining a copy
106819+ * of this software and associated documentation files (the "Software"), to
106820+ * deal in the Software without restriction, including without limitation the
106821+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
106822+ * sell copies of the Software, and to permit persons to whom the Software is
106823+ * furnished to do so, subject to the following conditions:
106824+ *
106825+ * The above copyright notice and this permission notice shall be included in
106826+ * all copies or substantial portions of the Software.
106827+ *
106828+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
106829+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
106830+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
106831+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
106832+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
106833+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
106834+ * DEALINGS IN THE SOFTWARE.
106835+ *
106836+ * Copyright (c) 2005, Keir Fraser <keir@xensource.com>
106837+ */
106838+
106839+#ifndef __XEN_PUBLIC_SCHED_H__
106840+#define __XEN_PUBLIC_SCHED_H__
106841+
106842+#include "event_channel.h"
106843+
106844+/*
106845+ * The prototype for this hypercall is:
106846+ * long sched_op(int cmd, void *arg)
106847+ * @cmd == SCHEDOP_??? (scheduler operation).
106848+ * @arg == Operation-specific extra argument(s), as described below.
106849+ *
106850+ * Versions of Xen prior to 3.0.2 provided only the following legacy version
106851+ * of this hypercall, supporting only the commands yield, block and shutdown:
106852+ * long sched_op(int cmd, unsigned long arg)
106853+ * @cmd == SCHEDOP_??? (scheduler operation).
106854+ * @arg == 0 (SCHEDOP_yield and SCHEDOP_block)
106855+ * == SHUTDOWN_* code (SCHEDOP_shutdown)
106856+ * This legacy version is available to new guests as sched_op_compat().
106857+ */
106858+
106859+/*
106860+ * Voluntarily yield the CPU.
106861+ * @arg == NULL.
106862+ */
106863+#define SCHEDOP_yield 0
106864+
106865+/*
106866+ * Block execution of this VCPU until an event is received for processing.
106867+ * If called with event upcalls masked, this operation will atomically
106868+ * reenable event delivery and check for pending events before blocking the
106869+ * VCPU. This avoids a "wakeup waiting" race.
106870+ * @arg == NULL.
106871+ */
106872+#define SCHEDOP_block 1
106873+
106874+/*
106875+ * Halt execution of this domain (all VCPUs) and notify the system controller.
106876+ * @arg == pointer to sched_shutdown structure.
106877+ */
106878+#define SCHEDOP_shutdown 2
106879+struct sched_shutdown {
106880+ unsigned int reason; /* SHUTDOWN_* */
106881+};
106882+typedef struct sched_shutdown sched_shutdown_t;
106883+DEFINE_XEN_GUEST_HANDLE(sched_shutdown_t);
106884+
106885+/*
106886+ * Poll a set of event-channel ports. Return when one or more are pending. An
106887+ * optional timeout may be specified.
106888+ * @arg == pointer to sched_poll structure.
106889+ */
106890+#define SCHEDOP_poll 3
106891+struct sched_poll {
106892+ XEN_GUEST_HANDLE(evtchn_port_t) ports;
106893+ unsigned int nr_ports;
106894+ uint64_t timeout;
106895+};
106896+typedef struct sched_poll sched_poll_t;
106897+DEFINE_XEN_GUEST_HANDLE(sched_poll_t);
106898+
106899+/*
106900+ * Declare a shutdown for another domain. The main use of this function is
106901+ * in interpreting shutdown requests and reasons for fully-virtualized
106902+ * domains. A para-virtualized domain may use SCHEDOP_shutdown directly.
106903+ * @arg == pointer to sched_remote_shutdown structure.
106904+ */
106905+#define SCHEDOP_remote_shutdown 4
106906+struct sched_remote_shutdown {
106907+ domid_t domain_id; /* Remote domain ID */
106908+ unsigned int reason; /* SHUTDOWN_xxx reason */
106909+};
106910+typedef struct sched_remote_shutdown sched_remote_shutdown_t;
106911+DEFINE_XEN_GUEST_HANDLE(sched_remote_shutdown_t);
106912+
106913+/*
106914+ * Reason codes for SCHEDOP_shutdown. These may be interpreted by control
106915+ * software to determine the appropriate action. For the most part, Xen does
106916+ * not care about the shutdown code.
106917+ */
106918+#define SHUTDOWN_poweroff 0 /* Domain exited normally. Clean up and kill. */
106919+#define SHUTDOWN_reboot 1 /* Clean up, kill, and then restart. */
106920+#define SHUTDOWN_suspend 2 /* Clean up, save suspend info, kill. */
106921+#define SHUTDOWN_crash 3 /* Tell controller we've crashed. */
106922+
106923+#endif /* __XEN_PUBLIC_SCHED_H__ */
106924+
106925+/*
106926+ * Local variables:
106927+ * mode: C
106928+ * c-set-style: "BSD"
106929+ * c-basic-offset: 4
106930+ * tab-width: 4
106931+ * indent-tabs-mode: nil
106932+ * End:
106933+ */
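As an aside for readers of this patch (not part of the patch itself), the following minimal sketch shows how a paravirtualised guest might use SCHEDOP_shutdown from the interface above. It assumes a HYPERVISOR_sched_op(int, void *) hypercall wrapper such as the one the hypercall headers added elsewhere in this patch provide.

/* Sketch only: request a clean reboot of the calling domain. */
#include <xen/interface/sched.h>

extern long HYPERVISOR_sched_op(int cmd, void *arg); /* assumed wrapper */

static void guest_request_reboot(void)
{
    struct sched_shutdown op = { .reason = SHUTDOWN_reboot };

    /* Halt all VCPUs of this domain and ask the control stack to restart it. */
    HYPERVISOR_sched_op(SCHEDOP_shutdown, &op);
}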
106934diff -Nur linux-2.6.16.33-noxen/include/xen/interface/sysctl.h linux-2.6.16.33/include/xen/interface/sysctl.h
106935--- linux-2.6.16.33-noxen/include/xen/interface/sysctl.h 1970-01-01 00:00:00.000000000 +0000
106936+++ linux-2.6.16.33/include/xen/interface/sysctl.h 2007-01-08 15:00:55.000000000 +0000
106937@@ -0,0 +1,169 @@
106938+/******************************************************************************
106939+ * sysctl.h
106940+ *
106941+ * System management operations. For use by node control stack.
106942+ *
106943+ * Permission is hereby granted, free of charge, to any person obtaining a copy
106944+ * of this software and associated documentation files (the "Software"), to
106945+ * deal in the Software without restriction, including without limitation the
106946+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
106947+ * sell copies of the Software, and to permit persons to whom the Software is
106948+ * furnished to do so, subject to the following conditions:
106949+ *
106950+ * The above copyright notice and this permission notice shall be included in
106951+ * all copies or substantial portions of the Software.
106952+ *
106953+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
106954+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
106955+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
106956+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
106957+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
106958+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
106959+ * DEALINGS IN THE SOFTWARE.
106960+ *
106961+ * Copyright (c) 2002-2006, K Fraser
106962+ */
106963+
106964+#ifndef __XEN_PUBLIC_SYSCTL_H__
106965+#define __XEN_PUBLIC_SYSCTL_H__
106966+
106967+#if !defined(__XEN__) && !defined(__XEN_TOOLS__)
106968+#error "sysctl operations are intended for use by node control tools only"
106969+#endif
106970+
106971+#include "xen.h"
106972+#include "domctl.h"
106973+
106974+#define XEN_SYSCTL_INTERFACE_VERSION 0x00000002
106975+
106976+/*
106977+ * Read console content from Xen buffer ring.
106978+ */
106979+#define XEN_SYSCTL_readconsole 1
106980+struct xen_sysctl_readconsole {
106981+ /* IN variables. */
106982+ uint32_t clear; /* Non-zero -> clear after reading. */
106983+ XEN_GUEST_HANDLE(char) buffer; /* Buffer start */
106984+ /* IN/OUT variables. */
106985+ uint32_t count; /* In: Buffer size; Out: Used buffer size */
106986+};
106987+typedef struct xen_sysctl_readconsole xen_sysctl_readconsole_t;
106988+DEFINE_XEN_GUEST_HANDLE(xen_sysctl_readconsole_t);
106989+
106990+/* Get trace buffers machine base address */
106991+#define XEN_SYSCTL_tbuf_op 2
106992+struct xen_sysctl_tbuf_op {
106993+ /* IN variables */
106994+#define XEN_SYSCTL_TBUFOP_get_info 0
106995+#define XEN_SYSCTL_TBUFOP_set_cpu_mask 1
106996+#define XEN_SYSCTL_TBUFOP_set_evt_mask 2
106997+#define XEN_SYSCTL_TBUFOP_set_size 3
106998+#define XEN_SYSCTL_TBUFOP_enable 4
106999+#define XEN_SYSCTL_TBUFOP_disable 5
107000+ uint32_t cmd;
107001+ /* IN/OUT variables */
107002+ struct xenctl_cpumap cpu_mask;
107003+ uint32_t evt_mask;
107004+ /* OUT variables */
107005+ uint64_t buffer_mfn;
107006+ uint32_t size;
107007+};
107008+typedef struct xen_sysctl_tbuf_op xen_sysctl_tbuf_op_t;
107009+DEFINE_XEN_GUEST_HANDLE(xen_sysctl_tbuf_op_t);
107010+
107011+/*
107012+ * Get physical information about the host machine
107013+ */
107014+#define XEN_SYSCTL_physinfo 3
107015+struct xen_sysctl_physinfo {
107016+ uint32_t threads_per_core;
107017+ uint32_t cores_per_socket;
107018+ uint32_t sockets_per_node;
107019+ uint32_t nr_nodes;
107020+ uint32_t cpu_khz;
107021+ uint64_t total_pages;
107022+ uint64_t free_pages;
107023+ uint64_t scrub_pages;
107024+ uint32_t hw_cap[8];
107025+};
107026+typedef struct xen_sysctl_physinfo xen_sysctl_physinfo_t;
107027+DEFINE_XEN_GUEST_HANDLE(xen_sysctl_physinfo_t);
107028+
107029+/*
107030+ * Get the ID of the current scheduler.
107031+ */
107032+#define XEN_SYSCTL_sched_id 4
107033+struct xen_sysctl_sched_id {
107034+ /* OUT variable */
107035+ uint32_t sched_id;
107036+};
107037+typedef struct xen_sysctl_sched_id xen_sysctl_sched_id_t;
107038+DEFINE_XEN_GUEST_HANDLE(xen_sysctl_sched_id_t);
107039+
107040+/* Interface for controlling Xen software performance counters. */
107041+#define XEN_SYSCTL_perfc_op 5
107042+/* Sub-operations: */
107043+#define XEN_SYSCTL_PERFCOP_reset 1 /* Reset all counters to zero. */
107044+#define XEN_SYSCTL_PERFCOP_query 2 /* Get perfctr information. */
107045+struct xen_sysctl_perfc_desc {
107046+ char name[80]; /* name of perf counter */
107047+ uint32_t nr_vals; /* number of values for this counter */
107048+};
107049+typedef struct xen_sysctl_perfc_desc xen_sysctl_perfc_desc_t;
107050+DEFINE_XEN_GUEST_HANDLE(xen_sysctl_perfc_desc_t);
107051+typedef uint32_t xen_sysctl_perfc_val_t;
107052+DEFINE_XEN_GUEST_HANDLE(xen_sysctl_perfc_val_t);
107053+
107054+struct xen_sysctl_perfc_op {
107055+ /* IN variables. */
107056+ uint32_t cmd; /* XEN_SYSCTL_PERFCOP_??? */
107057+ /* OUT variables. */
107058+ uint32_t nr_counters; /* number of counter descriptions */
107059+ uint32_t nr_vals; /* number of values */
107060+ /* counter information (or NULL) */
107061+ XEN_GUEST_HANDLE(xen_sysctl_perfc_desc_t) desc;
107062+ /* counter values (or NULL) */
107063+ XEN_GUEST_HANDLE(xen_sysctl_perfc_val_t) val;
107064+};
107065+typedef struct xen_sysctl_perfc_op xen_sysctl_perfc_op_t;
107066+DEFINE_XEN_GUEST_HANDLE(xen_sysctl_perfc_op_t);
107067+
107068+#define XEN_SYSCTL_getdomaininfolist 6
107069+struct xen_sysctl_getdomaininfolist {
107070+ /* IN variables. */
107071+ domid_t first_domain;
107072+ uint32_t max_domains;
107073+ XEN_GUEST_HANDLE(xen_domctl_getdomaininfo_t) buffer;
107074+ /* OUT variables. */
107075+ uint32_t num_domains;
107076+};
107077+typedef struct xen_sysctl_getdomaininfolist xen_sysctl_getdomaininfolist_t;
107078+DEFINE_XEN_GUEST_HANDLE(xen_sysctl_getdomaininfolist_t);
107079+
107080+struct xen_sysctl {
107081+ uint32_t cmd;
107082+ uint32_t interface_version; /* XEN_SYSCTL_INTERFACE_VERSION */
107083+ union {
107084+ struct xen_sysctl_readconsole readconsole;
107085+ struct xen_sysctl_tbuf_op tbuf_op;
107086+ struct xen_sysctl_physinfo physinfo;
107087+ struct xen_sysctl_sched_id sched_id;
107088+ struct xen_sysctl_perfc_op perfc_op;
107089+ struct xen_sysctl_getdomaininfolist getdomaininfolist;
107090+ uint8_t pad[128];
107091+ } u;
107092+};
107093+typedef struct xen_sysctl xen_sysctl_t;
107094+DEFINE_XEN_GUEST_HANDLE(xen_sysctl_t);
107095+
107096+#endif /* __XEN_PUBLIC_SYSCTL_H__ */
107097+
107098+/*
107099+ * Local variables:
107100+ * mode: C
107101+ * c-set-style: "BSD"
107102+ * c-basic-offset: 4
107103+ * tab-width: 4
107104+ * indent-tabs-mode: nil
107105+ * End:
107106+ */
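A hedged sketch (not part of the patch) of how a dom0 control tool might use XEN_SYSCTL_physinfo from the interface above. do_sysctl() is a hypothetical helper standing in for the privcmd ioctl path that real control tools use to issue the hypercall.

#define __XEN_TOOLS__ 1
#include <xen/interface/sysctl.h>

extern int do_sysctl(struct xen_sysctl *op); /* hypothetical hypercall helper */

static int query_free_pages(uint64_t *free_pages)
{
    struct xen_sysctl op = {
        .cmd               = XEN_SYSCTL_physinfo,
        .interface_version = XEN_SYSCTL_INTERFACE_VERSION,
    };

    if (do_sysctl(&op) != 0)
        return -1;

    *free_pages = op.u.physinfo.free_pages; /* pages on Xen's free list */
    return 0;
}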
107107diff -Nur linux-2.6.16.33-noxen/include/xen/interface/trace.h linux-2.6.16.33/include/xen/interface/trace.h
107108--- linux-2.6.16.33-noxen/include/xen/interface/trace.h 1970-01-01 00:00:00.000000000 +0000
107109+++ linux-2.6.16.33/include/xen/interface/trace.h 2007-01-08 15:00:55.000000000 +0000
107110@@ -0,0 +1,102 @@
107111+/******************************************************************************
107112+ * include/public/trace.h
107113+ *
107114+ * Permission is hereby granted, free of charge, to any person obtaining a copy
107115+ * of this software and associated documentation files (the "Software"), to
107116+ * deal in the Software without restriction, including without limitation the
107117+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
107118+ * sell copies of the Software, and to permit persons to whom the Software is
107119+ * furnished to do so, subject to the following conditions:
107120+ *
107121+ * The above copyright notice and this permission notice shall be included in
107122+ * all copies or substantial portions of the Software.
107123+ *
107124+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
107125+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
107126+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
107127+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
107128+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
107129+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
107130+ * DEALINGS IN THE SOFTWARE.
107131+ *
107132+ * Mark Williamson, (C) 2004 Intel Research Cambridge
107133+ * Copyright (C) 2005 Bin Ren
107134+ */
107135+
107136+#ifndef __XEN_PUBLIC_TRACE_H__
107137+#define __XEN_PUBLIC_TRACE_H__
107138+
107139+/* Trace classes */
107140+#define TRC_CLS_SHIFT 16
107141+#define TRC_GEN 0x0001f000 /* General trace */
107142+#define TRC_SCHED 0x0002f000 /* Xen Scheduler trace */
107143+#define TRC_DOM0OP 0x0004f000 /* Xen DOM0 operation trace */
107144+#define TRC_VMX 0x0008f000 /* Xen VMX trace */
107145+#define TRC_MEM 0x0010f000 /* Xen memory trace */
107146+#define TRC_ALL 0xfffff000
107147+
107148+/* Trace subclasses */
107149+#define TRC_SUBCLS_SHIFT 12
107150+
107151+/* trace subclasses for VMX */
107152+#define TRC_VMXEXIT 0x00081000 /* VMX exit trace */
107153+#define TRC_VMXENTRY 0x00082000 /* VMX entry trace */
107154+#define TRC_VMXINTR 0x00084000 /* VMX interrupt trace */
107155+
107156+/* Trace events per class */
107157+#define TRC_LOST_RECORDS (TRC_GEN + 1)
107158+
107159+#define TRC_SCHED_DOM_ADD (TRC_SCHED + 1)
107160+#define TRC_SCHED_DOM_REM (TRC_SCHED + 2)
107161+#define TRC_SCHED_SLEEP (TRC_SCHED + 3)
107162+#define TRC_SCHED_WAKE (TRC_SCHED + 4)
107163+#define TRC_SCHED_YIELD (TRC_SCHED + 5)
107164+#define TRC_SCHED_BLOCK (TRC_SCHED + 6)
107165+#define TRC_SCHED_SHUTDOWN (TRC_SCHED + 7)
107166+#define TRC_SCHED_CTL (TRC_SCHED + 8)
107167+#define TRC_SCHED_ADJDOM (TRC_SCHED + 9)
107168+#define TRC_SCHED_SWITCH (TRC_SCHED + 10)
107169+#define TRC_SCHED_S_TIMER_FN (TRC_SCHED + 11)
107170+#define TRC_SCHED_T_TIMER_FN (TRC_SCHED + 12)
107171+#define TRC_SCHED_DOM_TIMER_FN (TRC_SCHED + 13)
107172+#define TRC_SCHED_SWITCH_INFPREV (TRC_SCHED + 14)
107173+#define TRC_SCHED_SWITCH_INFNEXT (TRC_SCHED + 15)
107174+
107175+#define TRC_MEM_PAGE_GRANT_MAP (TRC_MEM + 1)
107176+#define TRC_MEM_PAGE_GRANT_UNMAP (TRC_MEM + 2)
107177+#define TRC_MEM_PAGE_GRANT_TRANSFER (TRC_MEM + 3)
107178+
107179+/* trace events per subclass */
107180+#define TRC_VMX_VMEXIT (TRC_VMXEXIT + 1)
107181+#define TRC_VMX_VMENTRY (TRC_VMXENTRY + 1)
107182+#define TRC_VMX_INTR (TRC_VMXINTR + 1)
107183+
107184+
107185+/* This structure represents a single trace buffer record. */
107186+struct t_rec {
107187+ uint64_t cycles; /* cycle counter timestamp */
107188+ uint32_t event; /* event ID */
107189+ unsigned long data[5]; /* event data items */
107190+};
107191+
107192+/*
107193+ * This structure contains the metadata for a single trace buffer. The cons
107194+ * and prod fields index into an array of struct t_rec's.
107195+ */
107196+struct t_buf {
107197+ uint32_t cons; /* Next item to be consumed by control tools. */
107198+ uint32_t prod; /* Next item to be produced by Xen. */
107199+ /* 'nr_recs' records follow immediately after the meta-data header. */
107200+};
107201+
107202+#endif /* __XEN_PUBLIC_TRACE_H__ */
107203+
107204+/*
107205+ * Local variables:
107206+ * mode: C
107207+ * c-set-style: "BSD"
107208+ * c-basic-offset: 4
107209+ * tab-width: 4
107210+ * indent-tabs-mode: nil
107211+ * End:
107212+ */
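A sketch (not part of the patch) of how a tracing tool might drain one trace buffer once the metadata and records have been mapped. The exact wrap-around convention of cons/prod varies between Xen versions; free-running counters are assumed here, and nr_recs is assumed to have been derived from the buffer size returned by XEN_SYSCTL_tbuf_op.

#include <xen/interface/trace.h>

static unsigned long drain_trace_buffer(struct t_buf *meta,
                                        struct t_rec *recs,
                                        uint32_t nr_recs)
{
    unsigned long lost = 0;

    /* Records follow the t_buf header; Xen advances 'prod', we advance 'cons'.
     * Indices are treated as free-running counters here (an assumption). */
    while (meta->cons != meta->prod) {
        struct t_rec *r = &recs[meta->cons % nr_recs];

        if (r->event == TRC_LOST_RECORDS)
            lost++;                 /* Xen overran the ring at some point */
        /* ... decode r->event and r->data[] as needed ... */
        meta->cons++;
    }
    return lost;
}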
107213diff -Nur linux-2.6.16.33-noxen/include/xen/interface/vcpu.h linux-2.6.16.33/include/xen/interface/vcpu.h
107214--- linux-2.6.16.33-noxen/include/xen/interface/vcpu.h 1970-01-01 00:00:00.000000000 +0000
107215+++ linux-2.6.16.33/include/xen/interface/vcpu.h 2007-01-08 15:00:55.000000000 +0000
107216@@ -0,0 +1,142 @@
107217+/******************************************************************************
107218+ * vcpu.h
107219+ *
107220+ * VCPU initialisation, query, and hotplug.
107221+ *
107222+ * Permission is hereby granted, free of charge, to any person obtaining a copy
107223+ * of this software and associated documentation files (the "Software"), to
107224+ * deal in the Software without restriction, including without limitation the
107225+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
107226+ * sell copies of the Software, and to permit persons to whom the Software is
107227+ * furnished to do so, subject to the following conditions:
107228+ *
107229+ * The above copyright notice and this permission notice shall be included in
107230+ * all copies or substantial portions of the Software.
107231+ *
107232+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
107233+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
107234+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
107235+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
107236+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
107237+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
107238+ * DEALINGS IN THE SOFTWARE.
107239+ *
107240+ * Copyright (c) 2005, Keir Fraser <keir@xensource.com>
107241+ */
107242+
107243+#ifndef __XEN_PUBLIC_VCPU_H__
107244+#define __XEN_PUBLIC_VCPU_H__
107245+
107246+/*
107247+ * Prototype for this hypercall is:
107248+ * int vcpu_op(int cmd, int vcpuid, void *extra_args)
107249+ * @cmd == VCPUOP_??? (VCPU operation).
107250+ * @vcpuid == VCPU to operate on.
107251+ * @extra_args == Operation-specific extra arguments (NULL if none).
107252+ */
107253+
107254+/*
107255+ * Initialise a VCPU. Each VCPU can be initialised only once. A
107256+ * newly-initialised VCPU will not run until it is brought up by VCPUOP_up.
107257+ *
107258+ * @extra_arg == pointer to vcpu_guest_context structure containing initial
107259+ * state for the VCPU.
107260+ */
107261+#define VCPUOP_initialise 0
107262+
107263+/*
107264+ * Bring up a VCPU. This makes the VCPU runnable. This operation will fail
107265+ * if the VCPU has not been initialised (VCPUOP_initialise).
107266+ */
107267+#define VCPUOP_up 1
107268+
107269+/*
107270+ * Bring down a VCPU (i.e., make it non-runnable).
107271+ * There are a few caveats that callers should observe:
107272+ * 1. This operation may return, and VCPUOP_is_up may return false, before the
107273+ * VCPU stops running (i.e., the command is asynchronous). It is a good
107274+ * idea to ensure that the VCPU has entered a non-critical loop before
107275+ * bringing it down. Alternatively, this operation is guaranteed
107276+ * synchronous if invoked by the VCPU itself.
107277+ * 2. After a VCPU is initialised, there is currently no way to drop all its
107278+ * references to domain memory. Even a VCPU that is down still holds
107279+ * memory references via its pagetable base pointer and GDT. It is good
107280+ * practise to move a VCPU onto an 'idle' or default page table, LDT and
107281+ * GDT before bringing it down.
107282+ */
107283+#define VCPUOP_down 2
107284+
107285+/* Returns 1 if the given VCPU is up. */
107286+#define VCPUOP_is_up 3
107287+
107288+/*
107289+ * Return information about the state and running time of a VCPU.
107290+ * @extra_arg == pointer to vcpu_runstate_info structure.
107291+ */
107292+#define VCPUOP_get_runstate_info 4
107293+struct vcpu_runstate_info {
107294+ /* VCPU's current state (RUNSTATE_*). */
107295+ int state;
107296+ /* When was current state entered (system time, ns)? */
107297+ uint64_t state_entry_time;
107298+ /*
107299+ * Time spent in each RUNSTATE_* (ns). The sum of these times is
107300+ * guaranteed not to drift from system time.
107301+ */
107302+ uint64_t time[4];
107303+};
107304+typedef struct vcpu_runstate_info vcpu_runstate_info_t;
107305+DEFINE_XEN_GUEST_HANDLE(vcpu_runstate_info_t);
107306+
107307+/* VCPU is currently running on a physical CPU. */
107308+#define RUNSTATE_running 0
107309+
107310+/* VCPU is runnable, but not currently scheduled on any physical CPU. */
107311+#define RUNSTATE_runnable 1
107312+
107313+/* VCPU is blocked (a.k.a. idle). It is therefore not runnable. */
107314+#define RUNSTATE_blocked 2
107315+
107316+/*
107317+ * VCPU is not runnable, but it is not blocked.
107318+ * This is a 'catch all' state for things like hotplug and pauses by the
107319+ * system administrator (or for critical sections in the hypervisor).
107320+ * RUNSTATE_blocked dominates this state (it is the preferred state).
107321+ */
107322+#define RUNSTATE_offline 3
107323+
107324+/*
107325+ * Register a shared memory area from which the guest may obtain its own
107326+ * runstate information without needing to execute a hypercall.
107327+ * Notes:
107328+ * 1. The registered address may be virtual or physical or guest handle,
107329+ * depending on the platform. Virtual address or guest handle should be
107330+ * registered on x86 systems.
107331+ * 2. Only one shared area may be registered per VCPU. The shared area is
107332+ * updated by the hypervisor each time the VCPU is scheduled. Thus
107333+ * runstate.state will always be RUNSTATE_running and
107334+ * runstate.state_entry_time will indicate the system time at which the
107335+ * VCPU was last scheduled to run.
107336+ * @extra_arg == pointer to vcpu_register_runstate_memory_area structure.
107337+ */
107338+#define VCPUOP_register_runstate_memory_area 5
107339+struct vcpu_register_runstate_memory_area {
107340+ union {
107341+ XEN_GUEST_HANDLE(vcpu_runstate_info_t) h;
107342+ struct vcpu_runstate_info *v;
107343+ uint64_t p;
107344+ } addr;
107345+};
107346+typedef struct vcpu_register_runstate_memory_area vcpu_register_runstate_memory_area_t;
107347+
107348+#endif /* __XEN_PUBLIC_VCPU_H__ */
107349+
107350+/*
107351+ * Local variables:
107352+ * mode: C
107353+ * c-set-style: "BSD"
107354+ * c-basic-offset: 4
107355+ * tab-width: 4
107356+ * indent-tabs-mode: nil
107357+ * End:
107358+ */
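To illustrate VCPUOP_get_runstate_info, here is a minimal sketch (not part of the patch) that sums the time a VCPU spent runnable or offline, i.e. time "stolen" by the hypervisor. It assumes the HYPERVISOR_vcpu_op() wrapper from the hypercall headers added elsewhere in this patch.

#include <xen/interface/vcpu.h>

extern int HYPERVISOR_vcpu_op(int cmd, int vcpuid, void *extra_args); /* assumed */

static uint64_t vcpu_stolen_ns(int vcpuid)
{
    struct vcpu_runstate_info info;

    if (HYPERVISOR_vcpu_op(VCPUOP_get_runstate_info, vcpuid, &info) != 0)
        return 0;

    /* Time spent runnable-but-not-running or offline, in nanoseconds. */
    return info.time[RUNSTATE_runnable] + info.time[RUNSTATE_offline];
}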
107359diff -Nur linux-2.6.16.33-noxen/include/xen/interface/version.h linux-2.6.16.33/include/xen/interface/version.h
107360--- linux-2.6.16.33-noxen/include/xen/interface/version.h 1970-01-01 00:00:00.000000000 +0000
107361+++ linux-2.6.16.33/include/xen/interface/version.h 2007-01-08 15:00:55.000000000 +0000
107362@@ -0,0 +1,91 @@
107363+/******************************************************************************
107364+ * version.h
107365+ *
107366+ * Xen version, type, and compile information.
107367+ *
107368+ * Permission is hereby granted, free of charge, to any person obtaining a copy
107369+ * of this software and associated documentation files (the "Software"), to
107370+ * deal in the Software without restriction, including without limitation the
107371+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
107372+ * sell copies of the Software, and to permit persons to whom the Software is
107373+ * furnished to do so, subject to the following conditions:
107374+ *
107375+ * The above copyright notice and this permission notice shall be included in
107376+ * all copies or substantial portions of the Software.
107377+ *
107378+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
107379+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
107380+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
107381+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
107382+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
107383+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
107384+ * DEALINGS IN THE SOFTWARE.
107385+ *
107386+ * Copyright (c) 2005, Nguyen Anh Quynh <aquynh@gmail.com>
107387+ * Copyright (c) 2005, Keir Fraser <keir@xensource.com>
107388+ */
107389+
107390+#ifndef __XEN_PUBLIC_VERSION_H__
107391+#define __XEN_PUBLIC_VERSION_H__
107392+
107393+/* NB. All ops return zero on success, except XENVER_{version,pagesize} */
107394+
107395+/* arg == NULL; returns major:minor (16:16). */
107396+#define XENVER_version 0
107397+
107398+/* arg == xen_extraversion_t. */
107399+#define XENVER_extraversion 1
107400+typedef char xen_extraversion_t[16];
107401+#define XEN_EXTRAVERSION_LEN (sizeof(xen_extraversion_t))
107402+
107403+/* arg == xen_compile_info_t. */
107404+#define XENVER_compile_info 2
107405+struct xen_compile_info {
107406+ char compiler[64];
107407+ char compile_by[16];
107408+ char compile_domain[32];
107409+ char compile_date[32];
107410+};
107411+typedef struct xen_compile_info xen_compile_info_t;
107412+
107413+#define XENVER_capabilities 3
107414+typedef char xen_capabilities_info_t[1024];
107415+#define XEN_CAPABILITIES_INFO_LEN (sizeof(xen_capabilities_info_t))
107416+
107417+#define XENVER_changeset 4
107418+typedef char xen_changeset_info_t[64];
107419+#define XEN_CHANGESET_INFO_LEN (sizeof(xen_changeset_info_t))
107420+
107421+#define XENVER_platform_parameters 5
107422+struct xen_platform_parameters {
107423+ unsigned long virt_start;
107424+};
107425+typedef struct xen_platform_parameters xen_platform_parameters_t;
107426+
107427+#define XENVER_get_features 6
107428+struct xen_feature_info {
107429+ unsigned int submap_idx; /* IN: which 32-bit submap to return */
107430+ uint32_t submap; /* OUT: 32-bit submap */
107431+};
107432+typedef struct xen_feature_info xen_feature_info_t;
107433+
107434+/* Declares the features reported by XENVER_get_features. */
107435+#include "features.h"
107436+
107437+/* arg == NULL; returns host memory page size. */
107438+#define XENVER_pagesize 7
107439+
107440+/* arg == xen_domain_handle_t. */
107441+#define XENVER_guest_handle 8
107442+
107443+#endif /* __XEN_PUBLIC_VERSION_H__ */
107444+
107445+/*
107446+ * Local variables:
107447+ * mode: C
107448+ * c-set-style: "BSD"
107449+ * c-basic-offset: 4
107450+ * tab-width: 4
107451+ * indent-tabs-mode: nil
107452+ * End:
107453+ */
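As a hedged example of the version operations above (not part of the patch), a guest kernel could assemble a printable Xen version string roughly as follows, assuming the HYPERVISOR_xen_version() wrapper from the hypercall headers in this patch.

#include <linux/kernel.h>
#include <xen/interface/version.h>

extern int HYPERVISOR_xen_version(int cmd, void *arg); /* assumed wrapper */

static void report_xen_version(void)
{
    int ver = HYPERVISOR_xen_version(XENVER_version, NULL); /* major:minor (16:16) */
    xen_extraversion_t extra;

    HYPERVISOR_xen_version(XENVER_extraversion, extra);
    printk(KERN_INFO "Running on Xen %d.%d%s\n",
           ver >> 16, ver & 0xffff, extra);
}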
107454diff -Nur linux-2.6.16.33-noxen/include/xen/interface/xen-compat.h linux-2.6.16.33/include/xen/interface/xen-compat.h
107455--- linux-2.6.16.33-noxen/include/xen/interface/xen-compat.h 1970-01-01 00:00:00.000000000 +0000
107456+++ linux-2.6.16.33/include/xen/interface/xen-compat.h 2007-01-08 15:00:55.000000000 +0000
107457@@ -0,0 +1,51 @@
107458+/******************************************************************************
107459+ * xen-compat.h
107460+ *
107461+ * Guest OS interface to Xen. Compatibility layer.
107462+ *
107463+ * Permission is hereby granted, free of charge, to any person obtaining a copy
107464+ * of this software and associated documentation files (the "Software"), to
107465+ * deal in the Software without restriction, including without limitation the
107466+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
107467+ * sell copies of the Software, and to permit persons to whom the Software is
107468+ * furnished to do so, subject to the following conditions:
107469+ *
107470+ * The above copyright notice and this permission notice shall be included in
107471+ * all copies or substantial portions of the Software.
107472+ *
107473+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
107474+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
107475+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
107476+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
107477+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
107478+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
107479+ * DEALINGS IN THE SOFTWARE.
107480+ *
107481+ * Copyright (c) 2006, Christian Limpach
107482+ */
107483+
107484+#ifndef __XEN_PUBLIC_XEN_COMPAT_H__
107485+#define __XEN_PUBLIC_XEN_COMPAT_H__
107486+
107487+#define __XEN_LATEST_INTERFACE_VERSION__ 0x00030205
107488+
107489+#if defined(__XEN__) || defined(__XEN_TOOLS__)
107490+/* Xen is built with matching headers and implements the latest interface. */
107491+#define __XEN_INTERFACE_VERSION__ __XEN_LATEST_INTERFACE_VERSION__
107492+#elif !defined(__XEN_INTERFACE_VERSION__)
107493+/* Guests which do not specify a version get the legacy interface. */
107494+#define __XEN_INTERFACE_VERSION__ 0x00000000
107495+#endif
107496+
107497+#if __XEN_INTERFACE_VERSION__ > __XEN_LATEST_INTERFACE_VERSION__
107498+#error "These header files do not support the requested interface version."
107499+#endif
107500+
107501+/* Fields defined as a Xen guest handle since 0x00030205. */
107502+#if __XEN_INTERFACE_VERSION__ >= 0x00030205
107503+#define XEN_GUEST_HANDLE_00030205(type) XEN_GUEST_HANDLE(type)
107504+#else
107505+#define XEN_GUEST_HANDLE_00030205(type) type *
107506+#endif
107507+
107508+#endif /* __XEN_PUBLIC_XEN_COMPAT_H__ */
107509diff -Nur linux-2.6.16.33-noxen/include/xen/interface/xen.h linux-2.6.16.33/include/xen/interface/xen.h
107510--- linux-2.6.16.33-noxen/include/xen/interface/xen.h 1970-01-01 00:00:00.000000000 +0000
107511+++ linux-2.6.16.33/include/xen/interface/xen.h 2007-01-08 15:00:56.000000000 +0000
107512@@ -0,0 +1,597 @@
107513+/******************************************************************************
107514+ * xen.h
107515+ *
107516+ * Guest OS interface to Xen.
107517+ *
107518+ * Permission is hereby granted, free of charge, to any person obtaining a copy
107519+ * of this software and associated documentation files (the "Software"), to
107520+ * deal in the Software without restriction, including without limitation the
107521+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
107522+ * sell copies of the Software, and to permit persons to whom the Software is
107523+ * furnished to do so, subject to the following conditions:
107524+ *
107525+ * The above copyright notice and this permission notice shall be included in
107526+ * all copies or substantial portions of the Software.
107527+ *
107528+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
107529+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
107530+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
107531+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
107532+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
107533+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
107534+ * DEALINGS IN THE SOFTWARE.
107535+ *
107536+ * Copyright (c) 2004, K A Fraser
107537+ */
107538+
107539+#ifndef __XEN_PUBLIC_XEN_H__
107540+#define __XEN_PUBLIC_XEN_H__
107541+
107542+#include "xen-compat.h"
107543+
107544+#if defined(__i386__) || defined(__x86_64__)
107545+#include "arch-x86/xen.h"
107546+#elif defined(__ia64__)
107547+#include "arch-ia64.h"
107548+#elif defined(__powerpc__)
107549+#include "arch-powerpc.h"
107550+#else
107551+#error "Unsupported architecture"
107552+#endif
107553+
107554+/*
107555+ * HYPERCALLS
107556+ */
107557+
107558+#define __HYPERVISOR_set_trap_table 0
107559+#define __HYPERVISOR_mmu_update 1
107560+#define __HYPERVISOR_set_gdt 2
107561+#define __HYPERVISOR_stack_switch 3
107562+#define __HYPERVISOR_set_callbacks 4
107563+#define __HYPERVISOR_fpu_taskswitch 5
107564+#define __HYPERVISOR_sched_op_compat 6 /* compat since 0x00030101 */
107565+#define __HYPERVISOR_platform_op 7
107566+#define __HYPERVISOR_set_debugreg 8
107567+#define __HYPERVISOR_get_debugreg 9
107568+#define __HYPERVISOR_update_descriptor 10
107569+#define __HYPERVISOR_memory_op 12
107570+#define __HYPERVISOR_multicall 13
107571+#define __HYPERVISOR_update_va_mapping 14
107572+#define __HYPERVISOR_set_timer_op 15
107573+#define __HYPERVISOR_event_channel_op_compat 16 /* compat since 0x00030202 */
107574+#define __HYPERVISOR_xen_version 17
107575+#define __HYPERVISOR_console_io 18
107576+#define __HYPERVISOR_physdev_op_compat 19 /* compat since 0x00030202 */
107577+#define __HYPERVISOR_grant_table_op 20
107578+#define __HYPERVISOR_vm_assist 21
107579+#define __HYPERVISOR_update_va_mapping_otherdomain 22
107580+#define __HYPERVISOR_iret 23 /* x86 only */
107581+#define __HYPERVISOR_vcpu_op 24
107582+#define __HYPERVISOR_set_segment_base 25 /* x86/64 only */
107583+#define __HYPERVISOR_mmuext_op 26
107584+#define __HYPERVISOR_acm_op 27
107585+#define __HYPERVISOR_nmi_op 28
107586+#define __HYPERVISOR_sched_op 29
107587+#define __HYPERVISOR_callback_op 30
107588+#define __HYPERVISOR_xenoprof_op 31
107589+#define __HYPERVISOR_event_channel_op 32
107590+#define __HYPERVISOR_physdev_op 33
107591+#define __HYPERVISOR_hvm_op 34
107592+#define __HYPERVISOR_sysctl 35
107593+#define __HYPERVISOR_domctl 36
107594+#define __HYPERVISOR_kexec_op 37
107595+
107596+/* Architecture-specific hypercall definitions. */
107597+#define __HYPERVISOR_arch_0 48
107598+#define __HYPERVISOR_arch_1 49
107599+#define __HYPERVISOR_arch_2 50
107600+#define __HYPERVISOR_arch_3 51
107601+#define __HYPERVISOR_arch_4 52
107602+#define __HYPERVISOR_arch_5 53
107603+#define __HYPERVISOR_arch_6 54
107604+#define __HYPERVISOR_arch_7 55
107605+
107606+/*
107607+ * HYPERCALL COMPATIBILITY.
107608+ */
107609+
107610+/* New sched_op hypercall introduced in 0x00030101. */
107611+#if __XEN_INTERFACE_VERSION__ < 0x00030101
107612+#undef __HYPERVISOR_sched_op
107613+#define __HYPERVISOR_sched_op __HYPERVISOR_sched_op_compat
107614+#endif
107615+
107616+/* New event-channel and physdev hypercalls introduced in 0x00030202. */
107617+#if __XEN_INTERFACE_VERSION__ < 0x00030202
107618+#undef __HYPERVISOR_event_channel_op
107619+#define __HYPERVISOR_event_channel_op __HYPERVISOR_event_channel_op_compat
107620+#undef __HYPERVISOR_physdev_op
107621+#define __HYPERVISOR_physdev_op __HYPERVISOR_physdev_op_compat
107622+#endif
107623+
107624+/* New platform_op hypercall introduced in 0x00030204. */
107625+#if __XEN_INTERFACE_VERSION__ < 0x00030204
107626+#define __HYPERVISOR_dom0_op __HYPERVISOR_platform_op
107627+#endif
107628+
107629+/*
107630+ * VIRTUAL INTERRUPTS
107631+ *
107632+ * Virtual interrupts that a guest OS may receive from Xen.
107633+ *
107634+ * In the side comments, 'V.' denotes a per-VCPU VIRQ while 'G.' denotes a
107635+ * global VIRQ. The former can be bound once per VCPU and cannot be re-bound.
107636+ * The latter can be allocated only once per guest: they must initially be
107637+ * allocated to VCPU0 but can subsequently be re-bound.
107638+ */
107639+#define VIRQ_TIMER 0 /* V. Timebase update, and/or requested timeout. */
107640+#define VIRQ_DEBUG 1 /* V. Request guest to dump debug info. */
107641+#define VIRQ_CONSOLE 2 /* G. (DOM0) Bytes received on emergency console. */
107642+#define VIRQ_DOM_EXC 3 /* G. (DOM0) Exceptional event for some domain. */
107643+#define VIRQ_TBUF 4 /* G. (DOM0) Trace buffer has records available. */
107644+#define VIRQ_DEBUGGER 6 /* G. (DOM0) A domain has paused for debugging. */
107645+#define VIRQ_XENOPROF 7 /* V. XenOprofile interrupt: new sample available */
107646+
107647+/* Architecture-specific VIRQ definitions. */
107648+#define VIRQ_ARCH_0 16
107649+#define VIRQ_ARCH_1 17
107650+#define VIRQ_ARCH_2 18
107651+#define VIRQ_ARCH_3 19
107652+#define VIRQ_ARCH_4 20
107653+#define VIRQ_ARCH_5 21
107654+#define VIRQ_ARCH_6 22
107655+#define VIRQ_ARCH_7 23
107656+
107657+#define NR_VIRQS 24
107658+
107659+/*
107660+ * MMU-UPDATE REQUESTS
107661+ *
107662+ * HYPERVISOR_mmu_update() accepts a list of (ptr, val) pairs.
107663+ * A foreigndom (FD) can be specified (or DOMID_SELF for none).
107664+ * Where the FD has some effect, it is described below.
107665+ * ptr[1:0] specifies the appropriate MMU_* command.
107666+ *
107667+ * ptr[1:0] == MMU_NORMAL_PT_UPDATE:
107668+ * Updates an entry in a page table. If updating an L1 table, and the new
107669+ * table entry is valid/present, the mapped frame must belong to the FD, if
107670+ * an FD has been specified. If attempting to map an I/O page then the
107671+ * caller assumes the privilege of the FD.
107672+ * FD == DOMID_IO: Permit /only/ I/O mappings, at the priv level of the caller.
107673+ * FD == DOMID_XEN: Map restricted areas of Xen's heap space.
107674+ * ptr[:2] -- Machine address of the page-table entry to modify.
107675+ * val -- Value to write.
107676+ *
107677+ * ptr[1:0] == MMU_MACHPHYS_UPDATE:
107678+ * Updates an entry in the machine->pseudo-physical mapping table.
107679+ * ptr[:2] -- Machine address within the frame whose mapping to modify.
107680+ * The frame must belong to the FD, if one is specified.
107681+ * val -- Value to write into the mapping entry.
107682+ */
107683+#define MMU_NORMAL_PT_UPDATE 0 /* checked '*ptr = val'. ptr is MA. */
107684+#define MMU_MACHPHYS_UPDATE 1 /* ptr = MA of frame to modify entry for */
107685+
107686+/*
107687+ * MMU EXTENDED OPERATIONS
107688+ *
107689+ * HYPERVISOR_mmuext_op() accepts a list of mmuext_op structures.
107690+ * A foreigndom (FD) can be specified (or DOMID_SELF for none).
107691+ * Where the FD has some effect, it is described below.
107692+ *
107693+ * cmd: MMUEXT_(UN)PIN_*_TABLE
107694+ * mfn: Machine frame number to be (un)pinned as a p.t. page.
107695+ * The frame must belong to the FD, if one is specified.
107696+ *
107697+ * cmd: MMUEXT_NEW_BASEPTR
107698+ * mfn: Machine frame number of new page-table base to install in MMU.
107699+ *
107700+ * cmd: MMUEXT_NEW_USER_BASEPTR [x86/64 only]
107701+ * mfn: Machine frame number of new page-table base to install in MMU
107702+ * when in user space.
107703+ *
107704+ * cmd: MMUEXT_TLB_FLUSH_LOCAL
107705+ * No additional arguments. Flushes local TLB.
107706+ *
107707+ * cmd: MMUEXT_INVLPG_LOCAL
107708+ * linear_addr: Linear address to be flushed from the local TLB.
107709+ *
107710+ * cmd: MMUEXT_TLB_FLUSH_MULTI
107711+ * vcpumask: Pointer to bitmap of VCPUs to be flushed.
107712+ *
107713+ * cmd: MMUEXT_INVLPG_MULTI
107714+ * linear_addr: Linear address to be flushed.
107715+ * vcpumask: Pointer to bitmap of VCPUs to be flushed.
107716+ *
107717+ * cmd: MMUEXT_TLB_FLUSH_ALL
107718+ * No additional arguments. Flushes all VCPUs' TLBs.
107719+ *
107720+ * cmd: MMUEXT_INVLPG_ALL
107721+ * linear_addr: Linear address to be flushed from all VCPUs' TLBs.
107722+ *
107723+ * cmd: MMUEXT_FLUSH_CACHE
107724+ * No additional arguments. Writes back and flushes cache contents.
107725+ *
107726+ * cmd: MMUEXT_SET_LDT
107727+ * linear_addr: Linear address of LDT base (NB. must be page-aligned).
107728+ * nr_ents: Number of entries in LDT.
107729+ */
107730+#define MMUEXT_PIN_L1_TABLE 0
107731+#define MMUEXT_PIN_L2_TABLE 1
107732+#define MMUEXT_PIN_L3_TABLE 2
107733+#define MMUEXT_PIN_L4_TABLE 3
107734+#define MMUEXT_UNPIN_TABLE 4
107735+#define MMUEXT_NEW_BASEPTR 5
107736+#define MMUEXT_TLB_FLUSH_LOCAL 6
107737+#define MMUEXT_INVLPG_LOCAL 7
107738+#define MMUEXT_TLB_FLUSH_MULTI 8
107739+#define MMUEXT_INVLPG_MULTI 9
107740+#define MMUEXT_TLB_FLUSH_ALL 10
107741+#define MMUEXT_INVLPG_ALL 11
107742+#define MMUEXT_FLUSH_CACHE 12
107743+#define MMUEXT_SET_LDT 13
107744+#define MMUEXT_NEW_USER_BASEPTR 15
107745+
107746+#ifndef __ASSEMBLY__
107747+struct mmuext_op {
107748+ unsigned int cmd;
107749+ union {
107750+ /* [UN]PIN_TABLE, NEW_BASEPTR, NEW_USER_BASEPTR */
107751+ xen_pfn_t mfn;
107752+ /* INVLPG_LOCAL, INVLPG_ALL, SET_LDT */
107753+ unsigned long linear_addr;
107754+ } arg1;
107755+ union {
107756+ /* SET_LDT */
107757+ unsigned int nr_ents;
107758+ /* TLB_FLUSH_MULTI, INVLPG_MULTI */
107759+ XEN_GUEST_HANDLE_00030205(void) vcpumask;
107760+ } arg2;
107761+};
107762+typedef struct mmuext_op mmuext_op_t;
107763+DEFINE_XEN_GUEST_HANDLE(mmuext_op_t);
107764+#endif
107765+
107766+/* These are passed as 'flags' to update_va_mapping. They can be ORed. */
107767+/* When specifying UVMF_MULTI, also OR in a pointer to a CPU bitmap. */
107768+/* UVMF_LOCAL is merely UVMF_MULTI with a NULL bitmap pointer. */
107769+#define UVMF_NONE (0UL<<0) /* No flushing at all. */
107770+#define UVMF_TLB_FLUSH (1UL<<0) /* Flush entire TLB(s). */
107771+#define UVMF_INVLPG (2UL<<0) /* Flush only one entry. */
107772+#define UVMF_FLUSHTYPE_MASK (3UL<<0)
107773+#define UVMF_MULTI (0UL<<2) /* Flush subset of TLBs. */
107774+#define UVMF_LOCAL (0UL<<2) /* Flush local TLB. */
107775+#define UVMF_ALL (1UL<<2) /* Flush all TLBs. */
107776+
107777+/*
107778+ * Commands to HYPERVISOR_console_io().
107779+ */
107780+#define CONSOLEIO_write 0
107781+#define CONSOLEIO_read 1
107782+
107783+/*
107784+ * Commands to HYPERVISOR_vm_assist().
107785+ */
107786+#define VMASST_CMD_enable 0
107787+#define VMASST_CMD_disable 1
107788+
107789+/* x86/32 guests: simulate full 4GB segment limits. */
107790+#define VMASST_TYPE_4gb_segments 0
107791+
107792+/* x86/32 guests: trap (vector 15) whenever above vmassist is used. */
107793+#define VMASST_TYPE_4gb_segments_notify 1
107794+
107795+/*
107796+ * x86 guests: support writes to bottom-level PTEs.
107797+ * NB1. Page-directory entries cannot be written.
107798+ * NB2. Guest must continue to remove all writable mappings of PTEs.
107799+ */
107800+#define VMASST_TYPE_writable_pagetables 2
107801+
107802+/* x86/PAE guests: support PDPTs above 4GB. */
107803+#define VMASST_TYPE_pae_extended_cr3 3
107804+
107805+#define MAX_VMASST_TYPE 3
107806+
107807+#ifndef __ASSEMBLY__
107808+
107809+typedef uint16_t domid_t;
107810+
107811+/* Domain ids >= DOMID_FIRST_RESERVED cannot be used for ordinary domains. */
107812+#define DOMID_FIRST_RESERVED (0x7FF0U)
107813+
107814+/* DOMID_SELF is used in certain contexts to refer to oneself. */
107815+#define DOMID_SELF (0x7FF0U)
107816+
107817+/*
107818+ * DOMID_IO is used to restrict page-table updates to mapping I/O memory.
107819+ * Although no Foreign Domain need be specified to map I/O pages, DOMID_IO
107820+ * is useful to ensure that no mappings to the OS's own heap are accidentally
107821+ * installed. (e.g., in Linux this could cause havoc as reference counts
107822+ * aren't adjusted on the I/O-mapping code path).
107823+ * This only makes sense in MMUEXT_SET_FOREIGNDOM, but in that context can
107824+ * be specified by any calling domain.
107825+ */
107826+#define DOMID_IO (0x7FF1U)
107827+
107828+/*
107829+ * DOMID_XEN is used to allow privileged domains to map restricted parts of
107830+ * Xen's heap space (e.g., the machine_to_phys table).
107831+ * This only makes sense in MMUEXT_SET_FOREIGNDOM, and is only permitted if
107832+ * the caller is privileged.
107833+ */
107834+#define DOMID_XEN (0x7FF2U)
107835+
107836+/*
107837+ * Send an array of these to HYPERVISOR_mmu_update().
107838+ * NB. The fields are natural pointer/address size for this architecture.
107839+ */
107840+struct mmu_update {
107841+ uint64_t ptr; /* Machine address of PTE. */
107842+ uint64_t val; /* New contents of PTE. */
107843+};
107844+typedef struct mmu_update mmu_update_t;
107845+DEFINE_XEN_GUEST_HANDLE(mmu_update_t);
107846+
107847+/*
107848+ * Send an array of these to HYPERVISOR_multicall().
107849+ * NB. The fields are natural register size for this architecture.
107850+ */
107851+struct multicall_entry {
107852+ unsigned long op, result;
107853+ unsigned long args[6];
107854+};
107855+typedef struct multicall_entry multicall_entry_t;
107856+DEFINE_XEN_GUEST_HANDLE(multicall_entry_t);
107857+
107858+/*
107859+ * Event channel endpoints per domain:
107860+ * 1024 if a long is 32 bits; 4096 if a long is 64 bits.
107861+ */
107862+#define NR_EVENT_CHANNELS (sizeof(unsigned long) * sizeof(unsigned long) * 64)
107863+
107864+struct vcpu_time_info {
107865+ /*
107866+ * Updates to the following values are preceded and followed by an
107867+ * increment of 'version'. The guest can therefore detect updates by
107868+ * looking for changes to 'version'. If the least-significant bit of
107869+ * the version number is set then an update is in progress and the guest
107870+ * must wait to read a consistent set of values.
107871+ * The correct way to interact with the version number is similar to
107872+ * Linux's seqlock: see the implementations of read_seqbegin/read_seqretry.
107873+ */
107874+ uint32_t version;
107875+ uint32_t pad0;
107876+ uint64_t tsc_timestamp; /* TSC at last update of time vals. */
107877+ uint64_t system_time; /* Time, in nanosecs, since boot. */
107878+ /*
107879+ * Current system time:
107880+ * system_time +
107881+ * ((((tsc - tsc_timestamp) << tsc_shift) * tsc_to_system_mul) >> 32)
107882+ * CPU frequency (Hz):
107883+ * ((10^9 << 32) / tsc_to_system_mul) >> tsc_shift
107884+ */
107885+ uint32_t tsc_to_system_mul;
107886+ int8_t tsc_shift;
107887+ int8_t pad1[3];
107888+}; /* 32 bytes */
107889+typedef struct vcpu_time_info vcpu_time_info_t;
107890+
107891+struct vcpu_info {
107892+ /*
107893+ * 'evtchn_upcall_pending' is written non-zero by Xen to indicate
107894+ * a pending notification for a particular VCPU. It is then cleared
107895+ * by the guest OS /before/ checking for pending work, thus avoiding
107896+ * a set-and-check race. Note that the mask is only accessed by Xen
107897+ * on the CPU that is currently hosting the VCPU. This means that the
107898+ * pending and mask flags can be updated by the guest without special
107899+ * synchronisation (i.e., no need for the x86 LOCK prefix).
107900+ * This may seem suboptimal because if the pending flag is set by
107901+ * a different CPU then an IPI may be scheduled even when the mask
107902+ * is set. However, note:
107903+ * 1. The task of 'interrupt holdoff' is covered by the per-event-
107904+ * channel mask bits. A 'noisy' event that is continually being
107905+ * triggered can be masked at source at this very precise
107906+ * granularity.
107907+ * 2. The main purpose of the per-VCPU mask is therefore to restrict
107908+ * reentrant execution: whether for concurrency control, or to
107909+ * prevent unbounded stack usage. Whatever the purpose, we expect
107910+ * that the mask will be asserted only for short periods at a time,
107911+ * and so the likelihood of a 'spurious' IPI is suitably small.
107912+ * The mask is read before making an event upcall to the guest: a
107913+ * non-zero mask therefore guarantees that the VCPU will not receive
107914+ * an upcall activation. The mask is cleared when the VCPU requests
107915+ * to block: this avoids wakeup-waiting races.
107916+ */
107917+ uint8_t evtchn_upcall_pending;
107918+ uint8_t evtchn_upcall_mask;
107919+ unsigned long evtchn_pending_sel;
107920+ struct arch_vcpu_info arch;
107921+ struct vcpu_time_info time;
107922+}; /* 64 bytes (x86) */
107923+typedef struct vcpu_info vcpu_info_t;
107924+
107925+/*
107926+ * Xen/kernel shared data -- pointer provided in start_info.
107927+ *
107928+ * This structure is defined to be both smaller than a page, and the
107929+ * only data on the shared page, but may vary in actual size even within
107930+ * compatible Xen versions; guests should not rely on the size
107931+ * of this structure remaining constant.
107932+ */
107933+struct shared_info {
107934+ struct vcpu_info vcpu_info[MAX_VIRT_CPUS];
107935+
107936+ /*
107937+ * A domain can create "event channels" on which it can send and receive
107938+ * asynchronous event notifications. There are three classes of event that
107939+ * are delivered by this mechanism:
107940+ * 1. Bi-directional inter- and intra-domain connections. Domains must
107941+ * arrange out-of-band to set up a connection (usually by allocating
107942+ * an unbound 'listener' port and advertising that via a storage service
107943+ * such as xenstore).
107944+ * 2. Physical interrupts. A domain with suitable hardware-access
107945+ * privileges can bind an event-channel port to a physical interrupt
107946+ * source.
107947+ * 3. Virtual interrupts ('events'). A domain can bind an event-channel
107948+ * port to a virtual interrupt source, such as the virtual-timer
107949+ * device or the emergency console.
107950+ *
107951+ * Event channels are addressed by a "port index". Each channel is
107952+ * associated with two bits of information:
107953+ * 1. PENDING -- notifies the domain that there is a pending notification
107954+ * to be processed. This bit is cleared by the guest.
107955+ * 2. MASK -- if this bit is clear then a 0->1 transition of PENDING
107956+ * will cause an asynchronous upcall to be scheduled. This bit is only
107957+ * updated by the guest. It is read-only within Xen. If a channel
107958+ * becomes pending while the channel is masked then the 'edge' is lost
107959+ * (i.e., when the channel is unmasked, the guest must manually handle
107960+ * pending notifications as no upcall will be scheduled by Xen).
107961+ *
107962+ * To expedite scanning of pending notifications, any 0->1 pending
107963+ * transition on an unmasked channel causes a corresponding bit in a
107964+ * per-vcpu selector word to be set. Each bit in the selector covers a
107965+ * 'C long' in the PENDING bitfield array.
107966+ */
107967+ unsigned long evtchn_pending[sizeof(unsigned long) * 8];
107968+ unsigned long evtchn_mask[sizeof(unsigned long) * 8];
107969+
107970+ /*
107971+ * Wallclock time: updated only by control software. Guests should base
107972+ * their gettimeofday() syscall on this wallclock-base value.
107973+ */
107974+ uint32_t wc_version; /* Version counter: see vcpu_time_info_t. */
107975+ uint32_t wc_sec; /* Secs 00:00:00 UTC, Jan 1, 1970. */
107976+ uint32_t wc_nsec; /* Nsecs 00:00:00 UTC, Jan 1, 1970. */
107977+
107978+ struct arch_shared_info arch;
107979+
107980+};
107981+typedef struct shared_info shared_info_t;
107982+
107983+/*
107984+ * Start-of-day memory layout for the initial domain (DOM0):
107985+ * 1. The domain is started within a contiguous virtual-memory region.
107986+ * 2. The contiguous region begins and ends on an aligned 4MB boundary.
107987+ * 3. The region start corresponds to the load address of the OS image.
107988+ * If the load address is not 4MB aligned then the address is rounded down.
107989+ * 4. This is the order of bootstrap elements in the initial virtual region:
107990+ * a. relocated kernel image
107991+ * b. initial ram disk [mod_start, mod_len]
107992+ * c. list of allocated page frames [mfn_list, nr_pages]
107993+ * d. start_info_t structure [register ESI (x86)]
107994+ * e. bootstrap page tables [pt_base, CR3 (x86)]
107995+ * f. bootstrap stack [register ESP (x86)]
107996+ * 5. Bootstrap elements are packed together, but each is 4kB-aligned.
107997+ * 6. The initial ram disk may be omitted.
107998+ * 7. The list of page frames forms a contiguous 'pseudo-physical' memory
107999+ * layout for the domain. In particular, the bootstrap virtual-memory
108000+ * region is a 1:1 mapping to the first section of the pseudo-physical map.
108001+ * 8. All bootstrap elements are mapped read-writable for the guest OS. The
108002+ * only exception is the bootstrap page table, which is mapped read-only.
108003+ * 9. There is guaranteed to be at least 512kB padding after the final
108004+ * bootstrap element. If necessary, the bootstrap virtual region is
108005+ * extended by an extra 4MB to ensure this.
108006+ */
108007+
108008+#define MAX_GUEST_CMDLINE 1024
108009+struct start_info {
108010+ /* THE FOLLOWING ARE FILLED IN BOTH ON INITIAL BOOT AND ON RESUME. */
108011+ char magic[32]; /* "xen-<version>-<platform>". */
108012+ unsigned long nr_pages; /* Total pages allocated to this domain. */
108013+ unsigned long shared_info; /* MACHINE address of shared info struct. */
108014+ uint32_t flags; /* SIF_xxx flags. */
108015+ xen_pfn_t store_mfn; /* MACHINE page number of shared page. */
108016+ uint32_t store_evtchn; /* Event channel for store communication. */
108017+ union {
108018+ struct {
108019+ xen_pfn_t mfn; /* MACHINE page number of console page. */
108020+ uint32_t evtchn; /* Event channel for console page. */
108021+ } domU;
108022+ struct {
108023+ uint32_t info_off; /* Offset of console_info struct. */
108024+ uint32_t info_size; /* Size of console_info struct from start.*/
108025+ } dom0;
108026+ } console;
108027+ /* THE FOLLOWING ARE ONLY FILLED IN ON INITIAL BOOT (NOT RESUME). */
108028+ unsigned long pt_base; /* VIRTUAL address of page directory. */
108029+ unsigned long nr_pt_frames; /* Number of bootstrap p.t. frames. */
108030+ unsigned long mfn_list; /* VIRTUAL address of page-frame list. */
108031+ unsigned long mod_start; /* VIRTUAL address of pre-loaded module. */
108032+ unsigned long mod_len; /* Size (bytes) of pre-loaded module. */
108033+ int8_t cmd_line[MAX_GUEST_CMDLINE];
108034+};
108035+typedef struct start_info start_info_t;
108036+
108037+/* New console union for dom0 introduced in 0x00030203. */
108038+#if __XEN_INTERFACE_VERSION__ < 0x00030203
108039+#define console_mfn console.domU.mfn
108040+#define console_evtchn console.domU.evtchn
108041+#endif
108042+
108043+/* These flags are passed in the 'flags' field of start_info_t. */
108044+#define SIF_PRIVILEGED (1<<0) /* Is the domain privileged? */
108045+#define SIF_INITDOMAIN (1<<1) /* Is this the initial control domain? */
108046+
108047+typedef struct dom0_vga_console_info {
108048+ uint8_t video_type; /* DOM0_VGA_CONSOLE_??? */
108049+#define XEN_VGATYPE_TEXT_MODE_3 0x03
108050+#define XEN_VGATYPE_VESA_LFB 0x23
108051+
108052+ union {
108053+ struct {
108054+ /* Font height, in pixels. */
108055+ uint16_t font_height;
108056+ /* Cursor location (column, row). */
108057+ uint16_t cursor_x, cursor_y;
108058+ /* Number of rows and columns (dimensions in characters). */
108059+ uint16_t rows, columns;
108060+ } text_mode_3;
108061+
108062+ struct {
108063+ /* Width and height, in pixels. */
108064+ uint16_t width, height;
108065+ /* Bytes per scan line. */
108066+ uint16_t bytes_per_line;
108067+ /* Bits per pixel. */
108068+ uint16_t bits_per_pixel;
108069+ /* LFB physical address, and size (in units of 64kB). */
108070+ uint32_t lfb_base;
108071+ uint32_t lfb_size;
108072+ /* RGB mask offsets and sizes, as defined by VBE 1.2+ */
108073+ uint8_t red_pos, red_size;
108074+ uint8_t green_pos, green_size;
108075+ uint8_t blue_pos, blue_size;
108076+ uint8_t rsvd_pos, rsvd_size;
108077+ } vesa_lfb;
108078+ } u;
108079+} dom0_vga_console_info_t;
108080+
108081+typedef uint8_t xen_domain_handle_t[16];
108082+
108083+/* Turn a plain number into a C unsigned long constant. */
108084+#define __mk_unsigned_long(x) x ## UL
108085+#define mk_unsigned_long(x) __mk_unsigned_long(x)
108086+
108087+DEFINE_XEN_GUEST_HANDLE(uint8_t);
108088+DEFINE_XEN_GUEST_HANDLE(uint16_t);
108089+DEFINE_XEN_GUEST_HANDLE(uint32_t);
108090+DEFINE_XEN_GUEST_HANDLE(uint64_t);
108091+
108092+#else /* __ASSEMBLY__ */
108093+
108094+/* In assembly code we cannot use C numeric constant suffixes. */
108095+#define mk_unsigned_long(x) x
108096+
108097+#endif /* !__ASSEMBLY__ */
108098+
108099+#endif /* __XEN_PUBLIC_XEN_H__ */
108100+
108101+/*
108102+ * Local variables:
108103+ * mode: C
108104+ * c-set-style: "BSD"
108105+ * c-basic-offset: 4
108106+ * tab-width: 4
108107+ * indent-tabs-mode: nil
108108+ * End:
108109+ */
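The vcpu_time_info comment above describes a seqlock-style protocol for reading time values. A minimal sketch of a conforming reader (not part of the patch, assuming the kernel's rmb() read barrier is available) might look like this:

#include <asm/system.h>          /* rmb() */
#include <xen/interface/xen.h>

static void read_time_info(const volatile struct vcpu_time_info *t,
                           uint64_t *tsc_stamp, uint64_t *sys_time)
{
    uint32_t version;

    do {
        version = t->version;            /* odd => update in progress */
        rmb();
        *tsc_stamp = t->tsc_timestamp;
        *sys_time  = t->system_time;
        rmb();
    } while ((version & 1) || (version != t->version));
}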
108110diff -Nur linux-2.6.16.33-noxen/include/xen/interface/xencomm.h linux-2.6.16.33/include/xen/interface/xencomm.h
108111--- linux-2.6.16.33-noxen/include/xen/interface/xencomm.h 1970-01-01 00:00:00.000000000 +0000
108112+++ linux-2.6.16.33/include/xen/interface/xencomm.h 2007-01-08 15:00:56.000000000 +0000
108113@@ -0,0 +1,41 @@
108114+/*
108115+ * Permission is hereby granted, free of charge, to any person obtaining a copy
108116+ * of this software and associated documentation files (the "Software"), to
108117+ * deal in the Software without restriction, including without limitation the
108118+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
108119+ * sell copies of the Software, and to permit persons to whom the Software is
108120+ * furnished to do so, subject to the following conditions:
108121+ *
108122+ * The above copyright notice and this permission notice shall be included in
108123+ * all copies or substantial portions of the Software.
108124+ *
108125+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
108126+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
108127+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
108128+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
108129+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
108130+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
108131+ * DEALINGS IN THE SOFTWARE.
108132+ *
108133+ * Copyright (C) IBM Corp. 2006
108134+ */
108135+
108136+#ifndef _XEN_XENCOMM_H_
108137+#define _XEN_XENCOMM_H_
108138+
108139+/* A xencomm descriptor is a scatter/gather list containing physical
108140+ * addresses corresponding to a virtually contiguous memory area. The
108141+ * hypervisor translates these physical addresses to machine addresses to copy
108142+ * to and from the virtually contiguous area.
108143+ */
108144+
108145+#define XENCOMM_MAGIC 0x58434F4D /* 'XCOM' */
108146+#define XENCOMM_INVALID (~0UL)
108147+
108148+struct xencomm_desc {
108149+ uint32_t magic;
108150+ uint32_t nr_addrs; /* the number of entries in address[] */
108151+ uint64_t address[0];
108152+};
108153+
108154+#endif /* _XEN_XENCOMM_H_ */
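For illustration only (not part of the patch), a xencomm descriptor for a page-aligned, virtually contiguous kernel buffer could be filled in roughly as follows. virt_to_phys() and PAGE_SIZE are the usual kernel facilities; allocation of the descriptor and error handling are omitted.

#include <linux/mm.h>
#include <asm/io.h>              /* virt_to_phys() */
#include <xen/interface/xencomm.h>

static void xencomm_fill(struct xencomm_desc *desc, void *buf,
                         unsigned int nr_pages)
{
    unsigned int i;

    desc->magic    = XENCOMM_MAGIC;
    desc->nr_addrs = nr_pages;
    for (i = 0; i < nr_pages; i++)
        desc->address[i] = virt_to_phys((char *)buf + i * PAGE_SIZE);
}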
108155diff -Nur linux-2.6.16.33-noxen/include/xen/interface/xenoprof.h linux-2.6.16.33/include/xen/interface/xenoprof.h
108156--- linux-2.6.16.33-noxen/include/xen/interface/xenoprof.h 1970-01-01 00:00:00.000000000 +0000
108157+++ linux-2.6.16.33/include/xen/interface/xenoprof.h 2007-01-08 15:00:56.000000000 +0000
108158@@ -0,0 +1,130 @@
108159+/******************************************************************************
108160+ * xenoprof.h
108161+ *
108162+ * Interface for enabling system-wide profiling based on hardware performance
108163+ * counters
108164+ *
108165+ * Permission is hereby granted, free of charge, to any person obtaining a copy
108166+ * of this software and associated documentation files (the "Software"), to
108167+ * deal in the Software without restriction, including without limitation the
108168+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
108169+ * sell copies of the Software, and to permit persons to whom the Software is
108170+ * furnished to do so, subject to the following conditions:
108171+ *
108172+ * The above copyright notice and this permission notice shall be included in
108173+ * all copies or substantial portions of the Software.
108174+ *
108175+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
108176+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
108177+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
108178+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
108179+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
108180+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
108181+ * DEALINGS IN THE SOFTWARE.
108182+ *
108183+ * Copyright (C) 2005 Hewlett-Packard Co.
108184+ * Written by Aravind Menon & Jose Renato Santos
108185+ */
108186+
108187+#ifndef __XEN_PUBLIC_XENOPROF_H__
108188+#define __XEN_PUBLIC_XENOPROF_H__
108189+
108190+#include "xen.h"
108191+
108192+/*
108193+ * Commands to HYPERVISOR_xenoprof_op().
108194+ */
108195+#define XENOPROF_init 0
108196+#define XENOPROF_reset_active_list 1
108197+#define XENOPROF_reset_passive_list 2
108198+#define XENOPROF_set_active 3
108199+#define XENOPROF_set_passive 4
108200+#define XENOPROF_reserve_counters 5
108201+#define XENOPROF_counter 6
108202+#define XENOPROF_setup_events 7
108203+#define XENOPROF_enable_virq 8
108204+#define XENOPROF_start 9
108205+#define XENOPROF_stop 10
108206+#define XENOPROF_disable_virq 11
108207+#define XENOPROF_release_counters 12
108208+#define XENOPROF_shutdown 13
108209+#define XENOPROF_get_buffer 14
108210+#define XENOPROF_last_op 14
108211+
108212+#define MAX_OPROF_EVENTS 32
108213+#define MAX_OPROF_DOMAINS 25
108214+#define XENOPROF_CPU_TYPE_SIZE 64
108215+
108216+/* Xenoprof performance events (not Xen events) */
108217+struct event_log {
108218+ uint64_t eip;
108219+ uint8_t mode;
108220+ uint8_t event;
108221+};
108222+
108223+/* Xenoprof buffer shared between Xen and domain - 1 per VCPU */
108224+struct xenoprof_buf {
108225+ uint32_t event_head;
108226+ uint32_t event_tail;
108227+ uint32_t event_size;
108228+ uint32_t vcpu_id;
108229+ uint64_t xen_samples;
108230+ uint64_t kernel_samples;
108231+ uint64_t user_samples;
108232+ uint64_t lost_samples;
108233+ struct event_log event_log[1];
108234+};
108235+typedef struct xenoprof_buf xenoprof_buf_t;
108236+DEFINE_XEN_GUEST_HANDLE(xenoprof_buf_t);
108237+
108238+struct xenoprof_init {
108239+ int32_t num_events;
108240+ int32_t is_primary;
108241+ char cpu_type[XENOPROF_CPU_TYPE_SIZE];
108242+};
108243+typedef struct xenoprof_init xenoprof_init_t;
108244+DEFINE_XEN_GUEST_HANDLE(xenoprof_init_t);
108245+
108246+struct xenoprof_get_buffer {
108247+ int32_t max_samples;
108248+ int32_t nbuf;
108249+ int32_t bufsize;
108250+ uint64_t buf_gmaddr;
108251+};
108252+typedef struct xenoprof_get_buffer xenoprof_get_buffer_t;
108253+DEFINE_XEN_GUEST_HANDLE(xenoprof_get_buffer_t);
108254+
108255+struct xenoprof_counter {
108256+ uint32_t ind;
108257+ uint64_t count;
108258+ uint32_t enabled;
108259+ uint32_t event;
108260+ uint32_t hypervisor;
108261+ uint32_t kernel;
108262+ uint32_t user;
108263+ uint64_t unit_mask;
108264+};
108265+typedef struct xenoprof_counter xenoprof_counter_t;
108266+DEFINE_XEN_GUEST_HANDLE(xenoprof_counter_t);
108267+
108268+typedef struct xenoprof_passive {
108269+ uint16_t domain_id;
108270+ int32_t max_samples;
108271+ int32_t nbuf;
108272+ int32_t bufsize;
108273+ uint64_t buf_gmaddr;
108274+} xenoprof_passive_t;
108275+DEFINE_XEN_GUEST_HANDLE(xenoprof_passive_t);
108276+
108277+
108278+#endif /* __XEN_PUBLIC_XENOPROF_H__ */
108279+
108280+/*
108281+ * Local variables:
108282+ * mode: C
108283+ * c-set-style: "BSD"
108284+ * c-basic-offset: 4
108285+ * tab-width: 4
108286+ * indent-tabs-mode: nil
108287+ * End:
108288+ */
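The event_head/event_tail fields above describe a simple per-VCPU ring of event_log entries shared with Xen, where event_size is the number of entries. The consumer sketch below only illustrates that layout; the handle_sample callback is hypothetical and this is not code from this patch.

/* Drain samples from one per-VCPU xenoprof ring (illustrative only). */
static void drain_xenoprof_buf(struct xenoprof_buf *buf,
			       void (*handle_sample)(struct event_log *))
{
	uint32_t tail = buf->event_tail;

	while (tail != buf->event_head) {
		handle_sample(&buf->event_log[tail]);
		if (++tail == buf->event_size)
			tail = 0;
	}
	buf->event_tail = tail;
}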
108289diff -Nur linux-2.6.16.33-noxen/include/xen/pcifront.h linux-2.6.16.33/include/xen/pcifront.h
108290--- linux-2.6.16.33-noxen/include/xen/pcifront.h 1970-01-01 00:00:00.000000000 +0000
108291+++ linux-2.6.16.33/include/xen/pcifront.h 2007-01-08 15:00:46.000000000 +0000
108292@@ -0,0 +1,77 @@
108293+/*
108294+ * PCI Frontend - arch-dependent declarations
108295+ *
108296+ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
108297+ */
108298+#ifndef __XEN_ASM_PCIFRONT_H__
108299+#define __XEN_ASM_PCIFRONT_H__
108300+
108301+#include <linux/config.h>
108302+#include <linux/spinlock.h>
108303+
108304+#ifdef __KERNEL__
108305+
108306+#ifndef __ia64__
108307+
108308+struct pcifront_device;
108309+struct pci_bus;
108310+
108311+struct pcifront_sd {
108312+ int domain;
108313+ struct pcifront_device *pdev;
108314+};
108315+
108316+static inline struct pcifront_device *
108317+pcifront_get_pdev(struct pcifront_sd *sd)
108318+{
108319+ return sd->pdev;
108320+}
108321+
108322+static inline void pcifront_init_sd(struct pcifront_sd *sd, int domain,
108323+ struct pcifront_device *pdev)
108324+{
108325+ sd->domain = domain;
108326+ sd->pdev = pdev;
108327+}
108328+
108329+#if defined(CONFIG_PCI_DOMAINS)
108330+static inline int pci_domain_nr(struct pci_bus *bus)
108331+{
108332+ struct pcifront_sd *sd = bus->sysdata;
108333+ return sd->domain;
108334+}
108335+static inline int pci_proc_domain(struct pci_bus *bus)
108336+{
108337+ return pci_domain_nr(bus);
108338+}
108339+#endif /* CONFIG_PCI_DOMAINS */
108340+
108341+#else /* __ia64__ */
108342+
108343+#include <asm/pci.h>
108344+#define pcifront_sd pci_controller
108345+
108346+static inline struct pcifront_device *
108347+pcifront_get_pdev(struct pcifront_sd *sd)
108348+{
108349+ return (struct pcifront_device *)sd->platform_data;
108350+}
108351+
108352+static inline void pcifront_init_sd(struct pcifront_sd *sd, int domain,
108353+ struct pcifront_device *pdev)
108354+{
108355+ sd->segment = domain;
108356+ sd->acpi_handle = NULL;
108357+ sd->iommu = NULL;
108358+ sd->windows = 0;
108359+ sd->window = NULL;
108360+ sd->platform_data = pdev;
108361+}
108362+
108363+#endif /* __ia64__ */
108364+
108365+extern spinlock_t pci_bus_lock;
108366+
108367+#endif /* __KERNEL__ */
108368+
108369+#endif /* __XEN_ASM_PCIFRONT_H__ */
108370diff -Nur linux-2.6.16.33-noxen/include/xen/public/evtchn.h linux-2.6.16.33/include/xen/public/evtchn.h
108371--- linux-2.6.16.33-noxen/include/xen/public/evtchn.h 1970-01-01 00:00:00.000000000 +0000
108372+++ linux-2.6.16.33/include/xen/public/evtchn.h 2007-01-08 15:00:46.000000000 +0000
108373@@ -0,0 +1,88 @@
108374+/******************************************************************************
108375+ * evtchn.h
108376+ *
108377+ * Interface to /dev/xen/evtchn.
108378+ *
108379+ * Copyright (c) 2003-2005, K A Fraser
108380+ *
108381+ * This program is free software; you can redistribute it and/or
108382+ * modify it under the terms of the GNU General Public License version 2
108383+ * as published by the Free Software Foundation; or, when distributed
108384+ * separately from the Linux kernel or incorporated into other
108385+ * software packages, subject to the following license:
108386+ *
108387+ * Permission is hereby granted, free of charge, to any person obtaining a copy
108388+ * of this source file (the "Software"), to deal in the Software without
108389+ * restriction, including without limitation the rights to use, copy, modify,
108390+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
108391+ * and to permit persons to whom the Software is furnished to do so, subject to
108392+ * the following conditions:
108393+ *
108394+ * The above copyright notice and this permission notice shall be included in
108395+ * all copies or substantial portions of the Software.
108396+ *
108397+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
108398+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
108399+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
108400+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
108401+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
108402+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
108403+ * IN THE SOFTWARE.
108404+ */
108405+
108406+#ifndef __LINUX_PUBLIC_EVTCHN_H__
108407+#define __LINUX_PUBLIC_EVTCHN_H__
108408+
108409+/*
108410+ * Bind a fresh port to VIRQ @virq.
108411+ * Return allocated port.
108412+ */
108413+#define IOCTL_EVTCHN_BIND_VIRQ \
108414+ _IOC(_IOC_NONE, 'E', 0, sizeof(struct ioctl_evtchn_bind_virq))
108415+struct ioctl_evtchn_bind_virq {
108416+ unsigned int virq;
108417+};
108418+
108419+/*
108420+ * Bind a fresh port to remote <@remote_domain, @remote_port>.
108421+ * Return allocated port.
108422+ */
108423+#define IOCTL_EVTCHN_BIND_INTERDOMAIN \
108424+ _IOC(_IOC_NONE, 'E', 1, sizeof(struct ioctl_evtchn_bind_interdomain))
108425+struct ioctl_evtchn_bind_interdomain {
108426+ unsigned int remote_domain, remote_port;
108427+};
108428+
108429+/*
108430+ * Allocate a fresh port for binding to @remote_domain.
108431+ * Return allocated port.
108432+ */
108433+#define IOCTL_EVTCHN_BIND_UNBOUND_PORT \
108434+ _IOC(_IOC_NONE, 'E', 2, sizeof(struct ioctl_evtchn_bind_unbound_port))
108435+struct ioctl_evtchn_bind_unbound_port {
108436+ unsigned int remote_domain;
108437+};
108438+
108439+/*
108440+ * Unbind previously allocated @port.
108441+ */
108442+#define IOCTL_EVTCHN_UNBIND \
108443+ _IOC(_IOC_NONE, 'E', 3, sizeof(struct ioctl_evtchn_unbind))
108444+struct ioctl_evtchn_unbind {
108445+ unsigned int port;
108446+};
108447+
108448+/*
108449+ * Send an event notification to previously allocated @port.
108450+ */
108451+#define IOCTL_EVTCHN_NOTIFY \
108452+ _IOC(_IOC_NONE, 'E', 4, sizeof(struct ioctl_evtchn_notify))
108453+struct ioctl_evtchn_notify {
108454+ unsigned int port;
108455+};
108456+
108457+/* Clear and reinitialise the event buffer. Clear error condition. */
108458+#define IOCTL_EVTCHN_RESET \
108459+ _IOC(_IOC_NONE, 'E', 5, 0)
108460+
108461+#endif /* __LINUX_PUBLIC_EVTCHN_H__ */
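A minimal user-space sketch of the ioctls above, assuming the device node is /dev/xen/evtchn, that a successful bind ioctl returns the allocated port (as the comments above state), and that the include path resolves in the build environment; VIRQ_TIMER (0) is taken from xen/interface/xen.h and error handling is trimmed.

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <xen/public/evtchn.h>

#define VIRQ_TIMER 0	/* from xen/interface/xen.h */

int main(void)
{
	struct ioctl_evtchn_bind_virq bind = { .virq = VIRQ_TIMER };
	unsigned int port;
	int fd = open("/dev/xen/evtchn", O_RDWR);

	if (fd < 0)
		return 1;
	port = ioctl(fd, IOCTL_EVTCHN_BIND_VIRQ, &bind);  /* returns the port */
	read(fd, &port, sizeof(port));                     /* block until it fires */
	printf("event on port %u\n", port);
	close(fd);
	return 0;
}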
108462diff -Nur linux-2.6.16.33-noxen/include/xen/public/privcmd.h linux-2.6.16.33/include/xen/public/privcmd.h
108463--- linux-2.6.16.33-noxen/include/xen/public/privcmd.h 1970-01-01 00:00:00.000000000 +0000
108464+++ linux-2.6.16.33/include/xen/public/privcmd.h 2007-01-08 15:00:46.000000000 +0000
108465@@ -0,0 +1,79 @@
108466+/******************************************************************************
108467+ * privcmd.h
108468+ *
108469+ * Interface to /proc/xen/privcmd.
108470+ *
108471+ * Copyright (c) 2003-2005, K A Fraser
108472+ *
108473+ * This program is free software; you can redistribute it and/or
108474+ * modify it under the terms of the GNU General Public License version 2
108475+ * as published by the Free Software Foundation; or, when distributed
108476+ * separately from the Linux kernel or incorporated into other
108477+ * software packages, subject to the following license:
108478+ *
108479+ * Permission is hereby granted, free of charge, to any person obtaining a copy
108480+ * of this source file (the "Software"), to deal in the Software without
108481+ * restriction, including without limitation the rights to use, copy, modify,
108482+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
108483+ * and to permit persons to whom the Software is furnished to do so, subject to
108484+ * the following conditions:
108485+ *
108486+ * The above copyright notice and this permission notice shall be included in
108487+ * all copies or substantial portions of the Software.
108488+ *
108489+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
108490+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
108491+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
108492+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
108493+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
108494+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
108495+ * IN THE SOFTWARE.
108496+ */
108497+
108498+#ifndef __LINUX_PUBLIC_PRIVCMD_H__
108499+#define __LINUX_PUBLIC_PRIVCMD_H__
108500+
108501+#include <linux/types.h>
108502+
108503+#ifndef __user
108504+#define __user
108505+#endif
108506+
108507+typedef struct privcmd_hypercall
108508+{
108509+ __u64 op;
108510+ __u64 arg[5];
108511+} privcmd_hypercall_t;
108512+
108513+typedef struct privcmd_mmap_entry {
108514+ __u64 va;
108515+ __u64 mfn;
108516+ __u64 npages;
108517+} privcmd_mmap_entry_t;
108518+
108519+typedef struct privcmd_mmap {
108520+ int num;
108521+ domid_t dom; /* target domain */
108522+ privcmd_mmap_entry_t __user *entry;
108523+} privcmd_mmap_t;
108524+
108525+typedef struct privcmd_mmapbatch {
108526+ int num; /* number of pages to populate */
108527+ domid_t dom; /* target domain */
108528+ __u64 addr; /* virtual address */
108529+ xen_pfn_t __user *arr; /* array of mfns - top nibble set on err */
108530+} privcmd_mmapbatch_t;
108531+
108532+/*
108533+ * @cmd: IOCTL_PRIVCMD_HYPERCALL
108534+ * @arg: &privcmd_hypercall_t
108535+ * Return: Value returned from execution of the specified hypercall.
108536+ */
108537+#define IOCTL_PRIVCMD_HYPERCALL \
108538+ _IOC(_IOC_NONE, 'P', 0, sizeof(privcmd_hypercall_t))
108539+#define IOCTL_PRIVCMD_MMAP \
108540+ _IOC(_IOC_NONE, 'P', 2, sizeof(privcmd_mmap_t))
108541+#define IOCTL_PRIVCMD_MMAPBATCH \
108542+ _IOC(_IOC_NONE, 'P', 3, sizeof(privcmd_mmapbatch_t))
108543+
108544+#endif /* __LINUX_PUBLIC_PRIVCMD_H__ */
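For comparison, this is roughly how a privileged user-space tool would issue a hypercall through the interface above (the path /proc/xen/privcmd matches the comment at the top of this header). The hypercall and command numbers are taken from the Xen public headers as noted in the comments; the include path and the absence of error handling are simplifications.

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <xen/public/privcmd.h>

#define __HYPERVISOR_xen_version 17	/* from xen/interface/xen.h */
#define XENVER_version            0	/* from xen/interface/version.h */

int main(void)
{
	privcmd_hypercall_t hc = {
		.op  = __HYPERVISOR_xen_version,
		.arg = { XENVER_version, 0 },
	};
	long ver;
	int fd = open("/proc/xen/privcmd", O_RDWR);

	if (fd < 0)
		return 1;
	ver = ioctl(fd, IOCTL_PRIVCMD_HYPERCALL, &hc);
	printf("Xen %ld.%ld\n", ver >> 16, ver & 0xffff);
	close(fd);
	return 0;
}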
108545diff -Nur linux-2.6.16.33-noxen/include/xen/xen_proc.h linux-2.6.16.33/include/xen/xen_proc.h
108546--- linux-2.6.16.33-noxen/include/xen/xen_proc.h 1970-01-01 00:00:00.000000000 +0000
108547+++ linux-2.6.16.33/include/xen/xen_proc.h 2007-01-08 15:00:46.000000000 +0000
108548@@ -0,0 +1,13 @@
108549+
108550+#ifndef __ASM_XEN_PROC_H__
108551+#define __ASM_XEN_PROC_H__
108552+
108553+#include <linux/config.h>
108554+#include <linux/proc_fs.h>
108555+
108556+extern struct proc_dir_entry *create_xen_proc_entry(
108557+ const char *name, mode_t mode);
108558+extern void remove_xen_proc_entry(
108559+ const char *name);
108560+
108561+#endif /* __ASM_XEN_PROC_H__ */
108562diff -Nur linux-2.6.16.33-noxen/include/xen/xenbus.h linux-2.6.16.33/include/xen/xenbus.h
108563--- linux-2.6.16.33-noxen/include/xen/xenbus.h 1970-01-01 00:00:00.000000000 +0000
108564+++ linux-2.6.16.33/include/xen/xenbus.h 2007-01-08 15:00:46.000000000 +0000
108565@@ -0,0 +1,307 @@
108566+/******************************************************************************
108567+ * xenbus.h
108568+ *
108569+ * Talks to Xen Store to figure out what devices we have.
108570+ *
108571+ * Copyright (C) 2005 Rusty Russell, IBM Corporation
108572+ * Copyright (C) 2005 XenSource Ltd.
108573+ *
108574+ * This program is free software; you can redistribute it and/or
108575+ * modify it under the terms of the GNU General Public License version 2
108576+ * as published by the Free Software Foundation; or, when distributed
108577+ * separately from the Linux kernel or incorporated into other
108578+ * software packages, subject to the following license:
108579+ *
108580+ * Permission is hereby granted, free of charge, to any person obtaining a copy
108581+ * of this source file (the "Software"), to deal in the Software without
108582+ * restriction, including without limitation the rights to use, copy, modify,
108583+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
108584+ * and to permit persons to whom the Software is furnished to do so, subject to
108585+ * the following conditions:
108586+ *
108587+ * The above copyright notice and this permission notice shall be included in
108588+ * all copies or substantial portions of the Software.
108589+ *
108590+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
108591+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
108592+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
108593+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
108594+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
108595+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
108596+ * IN THE SOFTWARE.
108597+ */
108598+
108599+#ifndef _XEN_XENBUS_H
108600+#define _XEN_XENBUS_H
108601+
108602+#include <linux/device.h>
108603+#include <linux/notifier.h>
108604+#include <linux/mutex.h>
108605+#include <linux/completion.h>
108606+#include <linux/init.h>
108607+#include <xen/interface/xen.h>
108608+#include <xen/interface/grant_table.h>
108609+#include <xen/interface/io/xenbus.h>
108610+#include <xen/interface/io/xs_wire.h>
108611+
108612+/* Register callback to watch this node. */
108613+struct xenbus_watch
108614+{
108615+ struct list_head list;
108616+
108617+ /* Path being watched. */
108618+ const char *node;
108619+
108620+ /* Callback (executed in a process context with no locks held). */
108621+ void (*callback)(struct xenbus_watch *,
108622+ const char **vec, unsigned int len);
108623+
108624+ /* See XBWF_ definitions below. */
108625+ unsigned long flags;
108626+};
108627+
108628+/*
108629+ * Execute callback in its own kthread. Useful if the callback is long
108630+ * running or heavily serialised, to avoid taking out the main xenwatch thread
108631+ * for a long period of time (or even unwittingly causing a deadlock).
108632+ */
108633+#define XBWF_new_thread 1
108634+
108635+/* A xenbus device. */
108636+struct xenbus_device {
108637+ const char *devicetype;
108638+ const char *nodename;
108639+ const char *otherend;
108640+ int otherend_id;
108641+ struct xenbus_watch otherend_watch;
108642+ struct device dev;
108643+ enum xenbus_state state;
108644+ struct completion down;
108645+};
108646+
108647+static inline struct xenbus_device *to_xenbus_device(struct device *dev)
108648+{
108649+ return container_of(dev, struct xenbus_device, dev);
108650+}
108651+
108652+struct xenbus_device_id
108653+{
108654+ /* .../device/<device_type>/<identifier> */
108655+ char devicetype[32]; /* General class of device. */
108656+};
108657+
108658+/* A xenbus driver. */
108659+struct xenbus_driver {
108660+ char *name;
108661+ struct module *owner;
108662+ const struct xenbus_device_id *ids;
108663+ int (*probe)(struct xenbus_device *dev,
108664+ const struct xenbus_device_id *id);
108665+ void (*otherend_changed)(struct xenbus_device *dev,
108666+ enum xenbus_state backend_state);
108667+ int (*remove)(struct xenbus_device *dev);
108668+ int (*suspend)(struct xenbus_device *dev);
108669+ int (*resume)(struct xenbus_device *dev);
108670+ int (*uevent)(struct xenbus_device *, char **, int, char *, int);
108671+ struct device_driver driver;
108672+ int (*read_otherend_details)(struct xenbus_device *dev);
108673+};
108674+
108675+static inline struct xenbus_driver *to_xenbus_driver(struct device_driver *drv)
108676+{
108677+ return container_of(drv, struct xenbus_driver, driver);
108678+}
108679+
108680+int xenbus_register_frontend(struct xenbus_driver *drv);
108681+int xenbus_register_backend(struct xenbus_driver *drv);
108682+void xenbus_unregister_driver(struct xenbus_driver *drv);
108683+
108684+struct xenbus_transaction
108685+{
108686+ u32 id;
108687+};
108688+
108689+/* Nil transaction ID. */
108690+#define XBT_NIL ((struct xenbus_transaction) { 0 })
108691+
108692+char **xenbus_directory(struct xenbus_transaction t,
108693+ const char *dir, const char *node, unsigned int *num);
108694+void *xenbus_read(struct xenbus_transaction t,
108695+ const char *dir, const char *node, unsigned int *len);
108696+int xenbus_write(struct xenbus_transaction t,
108697+ const char *dir, const char *node, const char *string);
108698+int xenbus_mkdir(struct xenbus_transaction t,
108699+ const char *dir, const char *node);
108700+int xenbus_exists(struct xenbus_transaction t,
108701+ const char *dir, const char *node);
108702+int xenbus_rm(struct xenbus_transaction t, const char *dir, const char *node);
108703+int xenbus_transaction_start(struct xenbus_transaction *t);
108704+int xenbus_transaction_end(struct xenbus_transaction t, int abort);
108705+
108706+/* Single read and scanf: returns -errno or num scanned if > 0. */
108707+int xenbus_scanf(struct xenbus_transaction t,
108708+ const char *dir, const char *node, const char *fmt, ...)
108709+ __attribute__((format(scanf, 4, 5)));
108710+
108711+/* Single printf and write: returns -errno or 0. */
108712+int xenbus_printf(struct xenbus_transaction t,
108713+ const char *dir, const char *node, const char *fmt, ...)
108714+ __attribute__((format(printf, 4, 5)));
108715+
108716+/* Generic read function: NULL-terminated triples of name,
108717+ * sprintf-style type string, and pointer. Returns 0 or errno. */
108718+int xenbus_gather(struct xenbus_transaction t, const char *dir, ...);
108719+
108720+/* notifier routines for when the xenstore comes up */
108721+int register_xenstore_notifier(struct notifier_block *nb);
108722+void unregister_xenstore_notifier(struct notifier_block *nb);
108723+
108724+int register_xenbus_watch(struct xenbus_watch *watch);
108725+void unregister_xenbus_watch(struct xenbus_watch *watch);
108726+void xs_suspend(void);
108727+void xs_resume(void);
108728+
108729+/* Used by xenbus_dev to borrow kernel's store connection. */
108730+void *xenbus_dev_request_and_reply(struct xsd_sockmsg *msg);
108731+
108732+/* Called from xen core code. */
108733+void xenbus_suspend(void);
108734+void xenbus_resume(void);
108735+
108736+#define XENBUS_IS_ERR_READ(str) ({ \
108737+ if (!IS_ERR(str) && strlen(str) == 0) { \
108738+ kfree(str); \
108739+ str = ERR_PTR(-ERANGE); \
108740+ } \
108741+ IS_ERR(str); \
108742+})
108743+
108744+#define XENBUS_EXIST_ERR(err) ((err) == -ENOENT || (err) == -ERANGE)
108745+
108746+
108747+/**
108748+ * Register a watch on the given path, using the given xenbus_watch structure
108749+ * for storage, and the given callback function as the callback. Return 0 on
108750+ * success, or -errno on error. On success, the given path will be saved as
108751+ * watch->node, and remains the caller's to free. On error, watch->node will
108752+ * be NULL, the device will switch to XenbusStateClosing, and the error will
108753+ * be saved in the store.
108754+ */
108755+int xenbus_watch_path(struct xenbus_device *dev, const char *path,
108756+ struct xenbus_watch *watch,
108757+ void (*callback)(struct xenbus_watch *,
108758+ const char **, unsigned int));
108759+
108760+
108761+/**
108762+ * Register a watch on the given path/path2, using the given xenbus_watch
108763+ * structure for storage, and the given callback function as the callback.
108764+ * Return 0 on success, or -errno on error. On success, the watched path
108765+ * (path/path2) will be saved as watch->node, and becomes the caller's to
108766+ * kfree(). On error, watch->node will be NULL, so the caller has nothing to
108767+ * free, the device will switch to XenbusStateClosing, and the error will be
108768+ * saved in the store.
108769+ */
108770+int xenbus_watch_path2(struct xenbus_device *dev, const char *path,
108771+ const char *path2, struct xenbus_watch *watch,
108772+ void (*callback)(struct xenbus_watch *,
108773+ const char **, unsigned int));
108774+
108775+
108776+/**
108777+ * Advertise in the store a change of the given driver to the given new_state.
108778+ * Return 0 on success, or -errno on error. On error, the device will switch
108779+ * to XenbusStateClosing, and the error will be saved in the store.
108780+ */
108781+int xenbus_switch_state(struct xenbus_device *dev, enum xenbus_state new_state);
108782+
108783+
108784+/**
108785+ * Grant access to the given ring_mfn to the peer of the given device. Return
108786+ * 0 on success, or -errno on error. On error, the device will switch to
108787+ * XenbusStateClosing, and the error will be saved in the store.
108788+ */
108789+int xenbus_grant_ring(struct xenbus_device *dev, unsigned long ring_mfn);
108790+
108791+
108792+/**
108793+ * Map a page of memory into this domain from another domain's grant table.
108794+ * xenbus_map_ring_valloc allocates a page of virtual address space, maps the
108795+ * page to that address, and sets *vaddr to that address.
108796+ * xenbus_map_ring does not allocate the virtual address space (you must do
108797+ * this yourself!). It only maps in the page to the specified address.
108798+ * Returns 0 on success, and GNTST_* (see xen/include/interface/grant_table.h)
108799+ * or -ENOMEM on error. If an error is returned, device will switch to
108800+ * XenbusStateClosing and the error message will be saved in XenStore.
108801+ */
108802+struct vm_struct *xenbus_map_ring_valloc(struct xenbus_device *dev,
108803+ int gnt_ref);
108804+int xenbus_map_ring(struct xenbus_device *dev, int gnt_ref,
108805+ grant_handle_t *handle, void *vaddr);
108806+
108807+
108808+/**
108809+ * Unmap a page of memory in this domain that was imported from another domain.
108810+ * Use xenbus_unmap_ring_vfree if you mapped in your memory with
108811+ * xenbus_map_ring_valloc (it will free the virtual address space).
108812+ * Returns 0 on success and returns GNTST_* on error
108813+ * (see xen/include/interface/grant_table.h).
108814+ */
108815+int xenbus_unmap_ring_vfree(struct xenbus_device *dev, struct vm_struct *);
108816+int xenbus_unmap_ring(struct xenbus_device *dev,
108817+ grant_handle_t handle, void *vaddr);
108818+
108819+
108820+/**
108821+ * Allocate an event channel for the given xenbus_device, assigning the newly
108822+ * created local port to *port. Return 0 on success, or -errno on error. On
108823+ * error, the device will switch to XenbusStateClosing, and the error will be
108824+ * saved in the store.
108825+ */
108826+int xenbus_alloc_evtchn(struct xenbus_device *dev, int *port);
108827+
108828+
108829+/**
108830+ * Bind to an existing interdomain event channel in another domain. Returns 0
108831+ * on success and stores the local port in *port. On error, returns -errno,
108832+ * switches the device to XenbusStateClosing, and saves the error in XenStore.
108833+ */
108834+int xenbus_bind_evtchn(struct xenbus_device *dev, int remote_port, int *port);
108835+
108836+
108837+/**
108838+ * Free an existing event channel. Returns 0 on success or -errno on error.
108839+ */
108840+int xenbus_free_evtchn(struct xenbus_device *dev, int port);
108841+
108842+
108843+/**
108844+ * Return the state of the driver rooted at the given store path, or
108845+ * XenbusStateUnknown if no state can be read.
108846+ */
108847+enum xenbus_state xenbus_read_driver_state(const char *path);
108848+
108849+
108850+/***
108851+ * Report the given negative errno into the store, along with the given
108852+ * formatted message.
108853+ */
108854+void xenbus_dev_error(struct xenbus_device *dev, int err, const char *fmt,
108855+ ...);
108856+
108857+
108858+/***
108859+ * Equivalent to xenbus_dev_error(dev, err, fmt, args), followed by
108860+ * xenbus_switch_state(dev, XenbusStateClosing) to schedule an orderly
108861+ * closedown of this driver and its peer.
108862+ */
108863+void xenbus_dev_fatal(struct xenbus_device *dev, int err, const char *fmt,
108864+ ...);
108865+
108866+int __init xenbus_dev_init(void);
108867+
108868+char *xenbus_strstate(enum xenbus_state state);
108869+int xenbus_dev_is_online(struct xenbus_device *dev);
108870+int xenbus_frontend_closed(struct xenbus_device *dev);
108871+
108872+#endif /* _XEN_XENBUS_H */
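To make the flow of the xenbus API above concrete, here is a hedged sketch of the transaction/state-change pattern a frontend driver would use after setting up its ring: it writes its details into the store under dev->nodename, retries on -EAGAIN, then advertises XenbusStateConnected. The store keys "ring-ref" and "event-channel" and the function name are illustrative, not code from this patch.

/* Illustrative frontend connect step (not part of this patch). */
static int example_connect(struct xenbus_device *dev, int ring_ref, int evtchn)
{
	struct xenbus_transaction xbt;
	int err;

again:
	err = xenbus_transaction_start(&xbt);
	if (err)
		return err;

	err = xenbus_printf(xbt, dev->nodename, "ring-ref", "%d", ring_ref);
	if (!err)
		err = xenbus_printf(xbt, dev->nodename, "event-channel", "%d", evtchn);
	if (err) {
		xenbus_transaction_end(xbt, 1);		/* abort */
		return err;
	}

	err = xenbus_transaction_end(xbt, 0);
	if (err == -EAGAIN)
		goto again;
	if (err)
		return err;

	return xenbus_switch_state(dev, XenbusStateConnected);
}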
108873diff -Nur linux-2.6.16.33-noxen/include/xen/xencons.h linux-2.6.16.33/include/xen/xencons.h
108874--- linux-2.6.16.33-noxen/include/xen/xencons.h 1970-01-01 00:00:00.000000000 +0000
108875+++ linux-2.6.16.33/include/xen/xencons.h 2007-01-08 15:00:46.000000000 +0000
108876@@ -0,0 +1,19 @@
108877+#ifndef __ASM_XENCONS_H__
108878+#define __ASM_XENCONS_H__
108879+
108880+struct dom0_vga_console_info;
108881+void dom0_init_screen_info(const struct dom0_vga_console_info *info);
108882+
108883+void xencons_force_flush(void);
108884+void xencons_resume(void);
108885+
108886+/* Interrupt work hooks. Receive data, or kick data out. */
108887+void xencons_rx(char *buf, unsigned len, struct pt_regs *regs);
108888+void xencons_tx(void);
108889+
108890+int xencons_ring_init(void);
108891+int xencons_ring_send(const char *data, unsigned len);
108892+
108893+void xencons_early_setup(void);
108894+
108895+#endif /* __ASM_XENCONS_H__ */
108896diff -Nur linux-2.6.16.33-noxen/include/xen/xenoprof.h linux-2.6.16.33/include/xen/xenoprof.h
108897--- linux-2.6.16.33-noxen/include/xen/xenoprof.h 1970-01-01 00:00:00.000000000 +0000
108898+++ linux-2.6.16.33/include/xen/xenoprof.h 2007-01-08 15:00:46.000000000 +0000
108899@@ -0,0 +1,42 @@
108900+/******************************************************************************
108901+ * xen/xenoprof.h
108902+ *
108903+ * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp>
108904+ * VA Linux Systems Japan K.K.
108905+ *
108906+ * This program is free software; you can redistribute it and/or modify
108907+ * it under the terms of the GNU General Public License as published by
108908+ * the Free Software Foundation; either version 2 of the License, or
108909+ * (at your option) any later version.
108910+ *
108911+ * This program is distributed in the hope that it will be useful,
108912+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
108913+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
108914+ * GNU General Public License for more details.
108915+ *
108916+ * You should have received a copy of the GNU General Public License
108917+ * along with this program; if not, write to the Free Software
108918+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
108919+ *
108920+ */
108921+
108922+#ifndef __XEN_XENOPROF_H__
108923+#define __XEN_XENOPROF_H__
108924+#ifdef CONFIG_XEN
108925+
108926+#include <asm/xenoprof.h>
108927+
108928+struct oprofile_operations;
108929+int xenoprofile_init(struct oprofile_operations * ops);
108930+void xenoprofile_exit(void);
108931+
108932+struct xenoprof_shared_buffer {
108933+ char *buffer;
108934+ struct xenoprof_arch_shared_buffer arch;
108935+};
108936+#else
108937+#define xenoprofile_init(ops) (-ENOSYS)
108938+#define xenoprofile_exit() do { } while (0)
108939+
108940+#endif /* CONFIG_XEN */
108941+#endif /* __XEN_XENOPROF_H__ */
108942diff -Nur linux-2.6.16.33-noxen/kernel/Kconfig.preempt linux-2.6.16.33/kernel/Kconfig.preempt
108943--- linux-2.6.16.33-noxen/kernel/Kconfig.preempt 2006-11-22 18:06:31.000000000 +0000
108944+++ linux-2.6.16.33/kernel/Kconfig.preempt 2007-01-08 15:00:46.000000000 +0000
108945@@ -35,6 +35,7 @@
108946
108947 config PREEMPT
108948 bool "Preemptible Kernel (Low-Latency Desktop)"
108949+ depends on !XEN
108950 help
108951 This option reduces the latency of the kernel by making
108952 all kernel code (that is not executing in a critical section)
108953diff -Nur linux-2.6.16.33-noxen/kernel/fork.c linux-2.6.16.33/kernel/fork.c
108954--- linux-2.6.16.33-noxen/kernel/fork.c 2006-11-22 18:06:31.000000000 +0000
108955+++ linux-2.6.16.33/kernel/fork.c 2007-01-08 15:00:46.000000000 +0000
108956@@ -274,6 +274,9 @@
108957 if (retval)
108958 goto out;
108959 }
108960+#ifdef arch_dup_mmap
108961+ arch_dup_mmap(mm, oldmm);
108962+#endif
108963 retval = 0;
108964 out:
108965 up_write(&mm->mmap_sem);
108966diff -Nur linux-2.6.16.33-noxen/kernel/irq/spurious.c linux-2.6.16.33/kernel/irq/spurious.c
108967--- linux-2.6.16.33-noxen/kernel/irq/spurious.c 2006-11-22 18:06:31.000000000 +0000
108968+++ linux-2.6.16.33/kernel/irq/spurious.c 2007-01-08 15:00:46.000000000 +0000
108969@@ -137,7 +137,8 @@
108970 struct pt_regs *regs)
108971 {
108972 if (action_ret != IRQ_HANDLED) {
108973- desc->irqs_unhandled++;
108974+ if (!irq_ignore_unhandled(irq))
108975+ desc->irqs_unhandled++;
108976 if (action_ret != IRQ_NONE)
108977 report_bad_irq(irq, desc, action_ret);
108978 }
108979diff -Nur linux-2.6.16.33-noxen/kernel/kexec.c linux-2.6.16.33/kernel/kexec.c
108980--- linux-2.6.16.33-noxen/kernel/kexec.c 2006-11-22 18:06:31.000000000 +0000
108981+++ linux-2.6.16.33/kernel/kexec.c 2007-01-08 15:00:46.000000000 +0000
108982@@ -403,7 +403,7 @@
108983 pages = kimage_alloc_pages(GFP_KERNEL, order);
108984 if (!pages)
108985 break;
108986- pfn = page_to_pfn(pages);
108987+ pfn = kexec_page_to_pfn(pages);
108988 epfn = pfn + count;
108989 addr = pfn << PAGE_SHIFT;
108990 eaddr = epfn << PAGE_SHIFT;
108991@@ -437,6 +437,7 @@
108992 return pages;
108993 }
108994
108995+#ifndef CONFIG_XEN
108996 static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
108997 unsigned int order)
108998 {
108999@@ -490,7 +491,7 @@
109000 }
109001 /* If I don't overlap any segments I have found my hole! */
109002 if (i == image->nr_segments) {
109003- pages = pfn_to_page(hole_start >> PAGE_SHIFT);
109004+ pages = kexec_pfn_to_page(hole_start >> PAGE_SHIFT);
109005 break;
109006 }
109007 }
109008@@ -517,6 +518,13 @@
109009
109010 return pages;
109011 }
109012+#else /* !CONFIG_XEN */
109013+struct page *kimage_alloc_control_pages(struct kimage *image,
109014+ unsigned int order)
109015+{
109016+ return kimage_alloc_normal_control_pages(image, order);
109017+}
109018+#endif
109019
109020 static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
109021 {
109022@@ -532,7 +540,7 @@
109023 return -ENOMEM;
109024
109025 ind_page = page_address(page);
109026- *image->entry = virt_to_phys(ind_page) | IND_INDIRECTION;
109027+ *image->entry = kexec_virt_to_phys(ind_page) | IND_INDIRECTION;
109028 image->entry = ind_page;
109029 image->last_entry = ind_page +
109030 ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
109031@@ -593,13 +601,13 @@
109032 #define for_each_kimage_entry(image, ptr, entry) \
109033 for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
109034 ptr = (entry & IND_INDIRECTION)? \
109035- phys_to_virt((entry & PAGE_MASK)): ptr +1)
109036+ kexec_phys_to_virt((entry & PAGE_MASK)): ptr +1)
109037
109038 static void kimage_free_entry(kimage_entry_t entry)
109039 {
109040 struct page *page;
109041
109042- page = pfn_to_page(entry >> PAGE_SHIFT);
109043+ page = kexec_pfn_to_page(entry >> PAGE_SHIFT);
109044 kimage_free_pages(page);
109045 }
109046
109047@@ -611,6 +619,10 @@
109048 if (!image)
109049 return;
109050
109051+#ifdef CONFIG_XEN
109052+ xen_machine_kexec_unload(image);
109053+#endif
109054+
109055 kimage_free_extra_pages(image);
109056 for_each_kimage_entry(image, ptr, entry) {
109057 if (entry & IND_INDIRECTION) {
109058@@ -686,7 +698,7 @@
109059 * have a match.
109060 */
109061 list_for_each_entry(page, &image->dest_pages, lru) {
109062- addr = page_to_pfn(page) << PAGE_SHIFT;
109063+ addr = kexec_page_to_pfn(page) << PAGE_SHIFT;
109064 if (addr == destination) {
109065 list_del(&page->lru);
109066 return page;
109067@@ -701,12 +713,12 @@
109068 if (!page)
109069 return NULL;
109070 /* If the page cannot be used file it away */
109071- if (page_to_pfn(page) >
109072+ if (kexec_page_to_pfn(page) >
109073 (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
109074 list_add(&page->lru, &image->unuseable_pages);
109075 continue;
109076 }
109077- addr = page_to_pfn(page) << PAGE_SHIFT;
109078+ addr = kexec_page_to_pfn(page) << PAGE_SHIFT;
109079
109080 /* If it is the destination page we want use it */
109081 if (addr == destination)
109082@@ -729,7 +741,7 @@
109083 struct page *old_page;
109084
109085 old_addr = *old & PAGE_MASK;
109086- old_page = pfn_to_page(old_addr >> PAGE_SHIFT);
109087+ old_page = kexec_pfn_to_page(old_addr >> PAGE_SHIFT);
109088 copy_highpage(page, old_page);
109089 *old = addr | (*old & ~PAGE_MASK);
109090
109091@@ -779,7 +791,7 @@
109092 result = -ENOMEM;
109093 goto out;
109094 }
109095- result = kimage_add_page(image, page_to_pfn(page)
109096+ result = kimage_add_page(image, kexec_page_to_pfn(page)
109097 << PAGE_SHIFT);
109098 if (result < 0)
109099 goto out;
109100@@ -811,6 +823,7 @@
109101 return result;
109102 }
109103
109104+#ifndef CONFIG_XEN
109105 static int kimage_load_crash_segment(struct kimage *image,
109106 struct kexec_segment *segment)
109107 {
109108@@ -833,7 +846,7 @@
109109 char *ptr;
109110 size_t uchunk, mchunk;
109111
109112- page = pfn_to_page(maddr >> PAGE_SHIFT);
109113+ page = kexec_pfn_to_page(maddr >> PAGE_SHIFT);
109114 if (page == 0) {
109115 result = -ENOMEM;
109116 goto out;
109117@@ -881,6 +894,13 @@
109118
109119 return result;
109120 }
109121+#else /* CONFIG_XEN */
109122+static int kimage_load_segment(struct kimage *image,
109123+ struct kexec_segment *segment)
109124+{
109125+ return kimage_load_normal_segment(image, segment);
109126+}
109127+#endif
109128
109129 /*
109130 * Exec Kernel system call: for obvious reasons only root may call it.
109131@@ -991,6 +1011,11 @@
109132 if (result)
109133 goto out;
109134 }
109135+#ifdef CONFIG_XEN
109136+ result = xen_machine_kexec_load(image);
109137+ if (result)
109138+ goto out;
109139+#endif
109140 /* Install the new kernel, and Uninstall the old */
109141 image = xchg(dest_image, image);
109142
109143@@ -1045,7 +1070,6 @@
109144 struct kimage *image;
109145 int locked;
109146
109147-
109148 /* Take the kexec_lock here to prevent sys_kexec_load
109149 * running on one cpu from replacing the crash kernel
109150 * we are using after a panic on a different cpu.
109151diff -Nur linux-2.6.16.33-noxen/kernel/rcupdate.c linux-2.6.16.33/kernel/rcupdate.c
109152--- linux-2.6.16.33-noxen/kernel/rcupdate.c 2006-11-22 18:06:31.000000000 +0000
109153+++ linux-2.6.16.33/kernel/rcupdate.c 2007-05-23 21:00:01.000000000 +0000
109154@@ -485,6 +485,20 @@
109155 __rcu_pending(&rcu_bh_ctrlblk, &per_cpu(rcu_bh_data, cpu));
109156 }
109157
109158+/*
109159+ * Check to see if any future RCU-related work will need to be done
109160+ * by the current CPU, even if none need be done immediately, returning
109161+ * 1 if so. This function is part of the RCU implementation; it is -not-
109162+ * an exported member of the RCU API.
109163+ */
109164+int rcu_needs_cpu(int cpu)
109165+{
109166+ struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
109167+ struct rcu_data *rdp_bh = &per_cpu(rcu_bh_data, cpu);
109168+
109169+ return (!!rdp->curlist || !!rdp_bh->curlist || rcu_pending(cpu));
109170+}
109171+
109172 void rcu_check_callbacks(int cpu, int user)
109173 {
109174 if (user ||
109175diff -Nur linux-2.6.16.33-noxen/kernel/timer.c linux-2.6.16.33/kernel/timer.c
109176--- linux-2.6.16.33-noxen/kernel/timer.c 2006-11-22 18:06:31.000000000 +0000
109177+++ linux-2.6.16.33/kernel/timer.c 2007-05-23 21:00:01.000000000 +0000
109178@@ -555,6 +555,22 @@
109179 }
109180 spin_unlock(&base->t_base.lock);
109181
109182+ /*
109183+ * It can happen that other CPUs service timer IRQs and increment
109184+ * jiffies, but we have not yet got a local timer tick to process
109185+ * the timer wheels. In that case, the expiry time can be before
109186+ * jiffies, but since the high-resolution timer here is relative to
109187+ * jiffies, the default expression when high-resolution timers are
109188+ * not active,
109189+ *
109190+ * time_before(MAX_JIFFY_OFFSET + jiffies, expires)
109191+ *
109192+ * would falsely evaluate to true. If that is the case, just
109193+ * return jiffies so that we can immediately fire the local timer
109194+ */
109195+ if (time_before(expires, jiffies))
109196+ return jiffies;
109197+
109198 if (time_before(hr_expires, expires))
109199 return hr_expires;
109200
109201diff -Nur linux-2.6.16.33-noxen/lib/Makefile linux-2.6.16.33/lib/Makefile
109202--- linux-2.6.16.33-noxen/lib/Makefile 2006-11-22 18:06:31.000000000 +0000
109203+++ linux-2.6.16.33/lib/Makefile 2007-01-08 15:00:46.000000000 +0000
109204@@ -45,6 +45,7 @@
109205 obj-$(CONFIG_TEXTSEARCH_FSM) += ts_fsm.o
109206
109207 obj-$(CONFIG_SWIOTLB) += swiotlb.o
109208+swiotlb-$(CONFIG_XEN) := ../arch/i386/kernel/swiotlb.o
109209
109210 hostprogs-y := gen_crc32table
109211 clean-files := crc32table.h
109212diff -Nur linux-2.6.16.33-noxen/lib/vsprintf.c linux-2.6.16.33/lib/vsprintf.c
109213--- linux-2.6.16.33-noxen/lib/vsprintf.c 2006-11-22 18:06:31.000000000 +0000
109214+++ linux-2.6.16.33/lib/vsprintf.c 2007-05-23 21:00:01.000000000 +0000
109215@@ -187,49 +187,49 @@
109216 size -= precision;
109217 if (!(type&(ZEROPAD+LEFT))) {
109218 while(size-->0) {
109219- if (buf <= end)
109220+ if (buf < end)
109221 *buf = ' ';
109222 ++buf;
109223 }
109224 }
109225 if (sign) {
109226- if (buf <= end)
109227+ if (buf < end)
109228 *buf = sign;
109229 ++buf;
109230 }
109231 if (type & SPECIAL) {
109232 if (base==8) {
109233- if (buf <= end)
109234+ if (buf < end)
109235 *buf = '0';
109236 ++buf;
109237 } else if (base==16) {
109238- if (buf <= end)
109239+ if (buf < end)
109240 *buf = '0';
109241 ++buf;
109242- if (buf <= end)
109243+ if (buf < end)
109244 *buf = digits[33];
109245 ++buf;
109246 }
109247 }
109248 if (!(type & LEFT)) {
109249 while (size-- > 0) {
109250- if (buf <= end)
109251+ if (buf < end)
109252 *buf = c;
109253 ++buf;
109254 }
109255 }
109256 while (i < precision--) {
109257- if (buf <= end)
109258+ if (buf < end)
109259 *buf = '0';
109260 ++buf;
109261 }
109262 while (i-- > 0) {
109263- if (buf <= end)
109264+ if (buf < end)
109265 *buf = tmp[i];
109266 ++buf;
109267 }
109268 while (size-- > 0) {
109269- if (buf <= end)
109270+ if (buf < end)
109271 *buf = ' ';
109272 ++buf;
109273 }
109274@@ -272,7 +272,8 @@
109275 /* 'z' changed to 'Z' --davidm 1/25/99 */
109276 /* 't' added for ptrdiff_t */
109277
109278- /* Reject out-of-range values early */
109279+ /* Reject out-of-range values early. Large positive sizes are
109280+ used for unknown buffer sizes. */
109281 if (unlikely((int) size < 0)) {
109282 /* There can be only one.. */
109283 static int warn = 1;
109284@@ -282,16 +283,17 @@
109285 }
109286
109287 str = buf;
109288- end = buf + size - 1;
109289+ end = buf + size;
109290
109291- if (end < buf - 1) {
109292- end = ((void *) -1);
109293- size = end - buf + 1;
109294+ /* Make sure end is always >= buf */
109295+ if (end < buf) {
109296+ end = ((void *)-1);
109297+ size = end - buf;
109298 }
109299
109300 for (; *fmt ; ++fmt) {
109301 if (*fmt != '%') {
109302- if (str <= end)
109303+ if (str < end)
109304 *str = *fmt;
109305 ++str;
109306 continue;
109307@@ -357,17 +359,17 @@
109308 case 'c':
109309 if (!(flags & LEFT)) {
109310 while (--field_width > 0) {
109311- if (str <= end)
109312+ if (str < end)
109313 *str = ' ';
109314 ++str;
109315 }
109316 }
109317 c = (unsigned char) va_arg(args, int);
109318- if (str <= end)
109319+ if (str < end)
109320 *str = c;
109321 ++str;
109322 while (--field_width > 0) {
109323- if (str <= end)
109324+ if (str < end)
109325 *str = ' ';
109326 ++str;
109327 }
109328@@ -382,18 +384,18 @@
109329
109330 if (!(flags & LEFT)) {
109331 while (len < field_width--) {
109332- if (str <= end)
109333+ if (str < end)
109334 *str = ' ';
109335 ++str;
109336 }
109337 }
109338 for (i = 0; i < len; ++i) {
109339- if (str <= end)
109340+ if (str < end)
109341 *str = *s;
109342 ++str; ++s;
109343 }
109344 while (len < field_width--) {
109345- if (str <= end)
109346+ if (str < end)
109347 *str = ' ';
109348 ++str;
109349 }
109350@@ -426,7 +428,7 @@
109351 continue;
109352
109353 case '%':
109354- if (str <= end)
109355+ if (str < end)
109356 *str = '%';
109357 ++str;
109358 continue;
109359@@ -449,11 +451,11 @@
109360 break;
109361
109362 default:
109363- if (str <= end)
109364+ if (str < end)
109365 *str = '%';
109366 ++str;
109367 if (*fmt) {
109368- if (str <= end)
109369+ if (str < end)
109370 *str = *fmt;
109371 ++str;
109372 } else {
109373@@ -483,14 +485,13 @@
109374 str = number(str, end, num, base,
109375 field_width, precision, flags);
109376 }
109377- if (str <= end)
109378- *str = '\0';
109379- else if (size > 0)
109380- /* don't write out a null byte if the buf size is zero */
109381- *end = '\0';
109382- /* the trailing null byte doesn't count towards the total
109383- * ++str;
109384- */
109385+ if (size > 0) {
109386+ if (str < end)
109387+ *str = '\0';
109388+ else
109389+ end[-1] = '\0';
109390+ }
109391+ /* the trailing null byte doesn't count towards the total */
109392 return str-buf;
109393 }
109394
109395@@ -848,3 +849,26 @@
109396 }
109397
109398 EXPORT_SYMBOL(sscanf);
109399+
109400+
109401+/* Simplified asprintf. */
109402+char *kasprintf(gfp_t gfp, const char *fmt, ...)
109403+{
109404+ va_list ap;
109405+ unsigned int len;
109406+ char *p;
109407+
109408+ va_start(ap, fmt);
109409+ len = vsnprintf(NULL, 0, fmt, ap);
109410+ va_end(ap);
109411+
109412+ p = kmalloc(len+1, gfp);
109413+ if (!p)
109414+ return NULL;
109415+ va_start(ap, fmt);
109416+ vsnprintf(p, len+1, fmt, ap);
109417+ va_end(ap);
109418+ return p;
109419+}
109420+
109421+EXPORT_SYMBOL(kasprintf);
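The helper added above follows the usual asprintf contract: the caller owns the returned buffer and must kfree() it. A small sketch of a caller (the function name and format string are illustrative only):

#include <linux/kernel.h>
#include <linux/slab.h>

/* Illustrative caller: build a per-index name, use it, free it. */
static int example_use_kasprintf(int i)
{
	char *name = kasprintf(GFP_KERNEL, "device/vif/%d", i);

	if (!name)
		return -ENOMEM;
	pr_debug("looking at %s\n", name);
	kfree(name);
	return 0;
}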
109422diff -Nur linux-2.6.16.33-noxen/mm/Kconfig linux-2.6.16.33/mm/Kconfig
109423--- linux-2.6.16.33-noxen/mm/Kconfig 2006-11-22 18:06:31.000000000 +0000
109424+++ linux-2.6.16.33/mm/Kconfig 2007-01-08 15:00:46.000000000 +0000
109425@@ -126,11 +126,14 @@
109426 # Default to 4 for wider testing, though 8 might be more appropriate.
109427 # ARM's adjust_pte (unused if VIPT) depends on mm-wide page_table_lock.
109428 # PA-RISC 7xxx's spinlock_t would enlarge struct page from 32 to 44 bytes.
109429+# XEN on x86 architecture uses the mapping field on pagetable pages to store a
109430+# pointer to the destructor. This conflicts with pte_lock_deinit().
109431 #
109432 config SPLIT_PTLOCK_CPUS
109433 int
109434 default "4096" if ARM && !CPU_CACHE_VIPT
109435 default "4096" if PARISC && !PA20
109436+ default "4096" if X86_XEN || X86_64_XEN
109437 default "4"
109438
109439 #
109440diff -Nur linux-2.6.16.33-noxen/mm/highmem.c linux-2.6.16.33/mm/highmem.c
109441--- linux-2.6.16.33-noxen/mm/highmem.c 2006-11-22 18:06:31.000000000 +0000
109442+++ linux-2.6.16.33/mm/highmem.c 2007-01-08 15:00:46.000000000 +0000
109443@@ -152,6 +152,17 @@
109444 return vaddr;
109445 }
109446
109447+#ifdef CONFIG_XEN
109448+void kmap_flush_unused(void)
109449+{
109450+ spin_lock(&kmap_lock);
109451+ flush_all_zero_pkmaps();
109452+ spin_unlock(&kmap_lock);
109453+}
109454+
109455+EXPORT_SYMBOL(kmap_flush_unused);
109456+#endif
109457+
109458 void fastcall *kmap_high(struct page *page)
109459 {
109460 unsigned long vaddr;
109461diff -Nur linux-2.6.16.33-noxen/mm/memory.c linux-2.6.16.33/mm/memory.c
109462--- linux-2.6.16.33-noxen/mm/memory.c 2006-11-22 18:06:31.000000000 +0000
109463+++ linux-2.6.16.33/mm/memory.c 2007-01-08 15:00:46.000000000 +0000
109464@@ -405,7 +405,8 @@
109465 * Remove this test eventually!
109466 */
109467 if (unlikely(!pfn_valid(pfn))) {
109468- print_bad_pte(vma, pte, addr);
109469+ if (!(vma->vm_flags & VM_RESERVED))
109470+ print_bad_pte(vma, pte, addr);
109471 return NULL;
109472 }
109473
109474@@ -881,6 +882,7 @@
109475 tlb_finish_mmu(tlb, address, end);
109476 return end;
109477 }
109478+EXPORT_SYMBOL(zap_page_range);
109479
109480 /*
109481 * Do a quick page-table lookup for a single page.
109482@@ -1020,6 +1022,26 @@
109483 continue;
109484 }
109485
109486+#ifdef CONFIG_XEN
109487+ if (vma && (vma->vm_flags & VM_FOREIGN)) {
109488+ struct page **map = vma->vm_private_data;
109489+ int offset = (start - vma->vm_start) >> PAGE_SHIFT;
109490+ if (map[offset] != NULL) {
109491+ if (pages) {
109492+ struct page *page = map[offset];
109493+
109494+ pages[i] = page;
109495+ get_page(page);
109496+ }
109497+ if (vmas)
109498+ vmas[i] = vma;
109499+ i++;
109500+ start += PAGE_SIZE;
109501+ len--;
109502+ continue;
109503+ }
109504+ }
109505+#endif
109506 if (!vma || (vma->vm_flags & (VM_IO | VM_PFNMAP))
109507 || !(vm_flags & vma->vm_flags))
109508 return i ? : -EFAULT;
109509@@ -1359,6 +1381,102 @@
109510 }
109511 EXPORT_SYMBOL(remap_pfn_range);
109512
109513+#ifdef CONFIG_XEN
109514+static inline int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
109515+ unsigned long addr, unsigned long end,
109516+ pte_fn_t fn, void *data)
109517+{
109518+ pte_t *pte;
109519+ int err;
109520+ struct page *pmd_page;
109521+ spinlock_t *ptl;
109522+
109523+ pte = (mm == &init_mm) ?
109524+ pte_alloc_kernel(pmd, addr) :
109525+ pte_alloc_map_lock(mm, pmd, addr, &ptl);
109526+ if (!pte)
109527+ return -ENOMEM;
109528+
109529+ BUG_ON(pmd_huge(*pmd));
109530+
109531+ pmd_page = pmd_page(*pmd);
109532+
109533+ do {
109534+ err = fn(pte, pmd_page, addr, data);
109535+ if (err)
109536+ break;
109537+ } while (pte++, addr += PAGE_SIZE, addr != end);
109538+
109539+ if (mm != &init_mm)
109540+ pte_unmap_unlock(pte-1, ptl);
109541+ return err;
109542+}
109543+
109544+static inline int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
109545+ unsigned long addr, unsigned long end,
109546+ pte_fn_t fn, void *data)
109547+{
109548+ pmd_t *pmd;
109549+ unsigned long next;
109550+ int err;
109551+
109552+ pmd = pmd_alloc(mm, pud, addr);
109553+ if (!pmd)
109554+ return -ENOMEM;
109555+ do {
109556+ next = pmd_addr_end(addr, end);
109557+ err = apply_to_pte_range(mm, pmd, addr, next, fn, data);
109558+ if (err)
109559+ break;
109560+ } while (pmd++, addr = next, addr != end);
109561+ return err;
109562+}
109563+
109564+static inline int apply_to_pud_range(struct mm_struct *mm, pgd_t *pgd,
109565+ unsigned long addr, unsigned long end,
109566+ pte_fn_t fn, void *data)
109567+{
109568+ pud_t *pud;
109569+ unsigned long next;
109570+ int err;
109571+
109572+ pud = pud_alloc(mm, pgd, addr);
109573+ if (!pud)
109574+ return -ENOMEM;
109575+ do {
109576+ next = pud_addr_end(addr, end);
109577+ err = apply_to_pmd_range(mm, pud, addr, next, fn, data);
109578+ if (err)
109579+ break;
109580+ } while (pud++, addr = next, addr != end);
109581+ return err;
109582+}
109583+
109584+/*
109585+ * Scan a region of virtual memory, filling in page tables as necessary
109586+ * and calling a provided function on each leaf page table.
109587+ */
109588+int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
109589+ unsigned long size, pte_fn_t fn, void *data)
109590+{
109591+ pgd_t *pgd;
109592+ unsigned long next;
109593+ unsigned long end = addr + size;
109594+ int err;
109595+
109596+ BUG_ON(addr >= end);
109597+ pgd = pgd_offset(mm, addr);
109598+ do {
109599+ next = pgd_addr_end(addr, end);
109600+ err = apply_to_pud_range(mm, pgd, addr, next, fn, data);
109601+ if (err)
109602+ break;
109603+ } while (pgd++, addr = next, addr != end);
109604+ return err;
109605+}
109606+EXPORT_SYMBOL_GPL(apply_to_page_range);
109607+#endif
109608+
109609 /*
109610 * handle_pte_fault chooses page fault handler according to an entry
109611 * which was read non-atomically. Before making any commitment, on
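The apply_to_page_range() helper added in the hunk above walks (and if necessary allocates) page tables over a virtual range and invokes a pte_fn_t callback on every leaf PTE. A minimal hypothetical caller that clears every PTE in a kernel range might look like the sketch below; the callback and wrapper names are made up for illustration, and real users in the Xen code pass their own callbacks.

static int clear_one_pte(pte_t *pte, struct page *pmd_page,
			 unsigned long addr, void *data)
{
	pte_clear(&init_mm, addr, pte);
	return 0;
}

static int clear_kernel_range(unsigned long addr, unsigned long size)
{
	return apply_to_page_range(&init_mm, addr, size, clear_one_pte, NULL);
}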
109612diff -Nur linux-2.6.16.33-noxen/mm/mmap.c linux-2.6.16.33/mm/mmap.c
109613--- linux-2.6.16.33-noxen/mm/mmap.c 2006-11-22 18:06:31.000000000 +0000
109614+++ linux-2.6.16.33/mm/mmap.c 2007-01-08 15:00:46.000000000 +0000
109615@@ -1950,6 +1950,10 @@
109616 unsigned long nr_accounted = 0;
109617 unsigned long end;
109618
109619+#ifdef arch_exit_mmap
109620+ arch_exit_mmap(mm);
109621+#endif
109622+
109623 lru_add_drain();
109624 flush_cache_mm(mm);
109625 tlb = tlb_gather_mmu(mm, 1);
109626diff -Nur linux-2.6.16.33-noxen/mm/page_alloc.c linux-2.6.16.33/mm/page_alloc.c
109627--- linux-2.6.16.33-noxen/mm/page_alloc.c 2006-11-22 18:06:31.000000000 +0000
109628+++ linux-2.6.16.33/mm/page_alloc.c 2007-01-08 15:00:46.000000000 +0000
109629@@ -422,7 +422,8 @@
109630 int i;
109631 int reserved = 0;
109632
109633- arch_free_page(page, order);
109634+ if (arch_free_page(page, order))
109635+ return;
109636 if (!PageHighMem(page))
109637 mutex_debug_check_no_locks_freed(page_address(page),
109638 PAGE_SIZE<<order);
109639@@ -716,7 +717,8 @@
109640 struct per_cpu_pages *pcp;
109641 unsigned long flags;
109642
109643- arch_free_page(page, 0);
109644+ if (arch_free_page(page, 0))
109645+ return;
109646
109647 if (PageAnon(page))
109648 page->mapping = NULL;
109649diff -Nur linux-2.6.16.33-noxen/net/atm/clip.c linux-2.6.16.33/net/atm/clip.c
109650--- linux-2.6.16.33-noxen/net/atm/clip.c 2006-11-22 18:06:31.000000000 +0000
109651+++ linux-2.6.16.33/net/atm/clip.c 2007-05-23 21:00:01.000000000 +0000
109652@@ -101,7 +101,7 @@
109653 printk(KERN_CRIT "!clip_vcc->entry (clip_vcc %p)\n",clip_vcc);
109654 return;
109655 }
109656- spin_lock_bh(&entry->neigh->dev->xmit_lock); /* block clip_start_xmit() */
109657+ netif_tx_lock_bh(entry->neigh->dev); /* block clip_start_xmit() */
109658 entry->neigh->used = jiffies;
109659 for (walk = &entry->vccs; *walk; walk = &(*walk)->next)
109660 if (*walk == clip_vcc) {
109661@@ -125,7 +125,7 @@
109662 printk(KERN_CRIT "ATMARP: unlink_clip_vcc failed (entry %p, vcc "
109663 "0x%p)\n",entry,clip_vcc);
109664 out:
109665- spin_unlock_bh(&entry->neigh->dev->xmit_lock);
109666+ netif_tx_unlock_bh(entry->neigh->dev);
109667 }
109668
109669 /* The neighbour entry n->lock is held. */
109670diff -Nur linux-2.6.16.33-noxen/net/bridge/br_device.c linux-2.6.16.33/net/bridge/br_device.c
109671--- linux-2.6.16.33-noxen/net/bridge/br_device.c 2006-11-22 18:06:31.000000000 +0000
109672+++ linux-2.6.16.33/net/bridge/br_device.c 2007-05-23 21:00:01.000000000 +0000
109673@@ -146,9 +146,9 @@
109674 struct net_bridge *br = netdev_priv(dev);
109675
109676 if (data)
109677- br->feature_mask |= NETIF_F_IP_CSUM;
109678+ br->feature_mask |= NETIF_F_NO_CSUM;
109679 else
109680- br->feature_mask &= ~NETIF_F_IP_CSUM;
109681+ br->feature_mask &= ~NETIF_F_ALL_CSUM;
109682
109683 br_features_recompute(br);
109684 return 0;
109685@@ -185,6 +185,6 @@
109686 dev->set_mac_address = br_set_mac_address;
109687 dev->priv_flags = IFF_EBRIDGE;
109688
109689- dev->features = NETIF_F_SG | NETIF_F_FRAGLIST
109690- | NETIF_F_HIGHDMA | NETIF_F_TSO | NETIF_F_IP_CSUM;
109691+ dev->features = NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HIGHDMA |
109692+ NETIF_F_TSO | NETIF_F_NO_CSUM | NETIF_F_GSO_ROBUST;
109693 }
109694diff -Nur linux-2.6.16.33-noxen/net/bridge/br_forward.c linux-2.6.16.33/net/bridge/br_forward.c
109695--- linux-2.6.16.33-noxen/net/bridge/br_forward.c 2006-11-22 18:06:31.000000000 +0000
109696+++ linux-2.6.16.33/net/bridge/br_forward.c 2007-05-23 21:00:01.000000000 +0000
109697@@ -32,7 +32,7 @@
109698 int br_dev_queue_push_xmit(struct sk_buff *skb)
109699 {
109700 /* drop mtu oversized packets except tso */
109701- if (skb->len > skb->dev->mtu && !skb_shinfo(skb)->tso_size)
109702+ if (skb->len > skb->dev->mtu && !skb_is_gso(skb))
109703 kfree_skb(skb);
109704 else {
109705 #ifdef CONFIG_BRIDGE_NETFILTER
109706diff -Nur linux-2.6.16.33-noxen/net/bridge/br_if.c linux-2.6.16.33/net/bridge/br_if.c
109707--- linux-2.6.16.33-noxen/net/bridge/br_if.c 2006-11-22 18:06:31.000000000 +0000
109708+++ linux-2.6.16.33/net/bridge/br_if.c 2007-05-23 21:00:01.000000000 +0000
109709@@ -385,17 +385,28 @@
109710 struct net_bridge_port *p;
109711 unsigned long features, checksum;
109712
109713- features = br->feature_mask &~ NETIF_F_IP_CSUM;
109714- checksum = br->feature_mask & NETIF_F_IP_CSUM;
109715+ checksum = br->feature_mask & NETIF_F_ALL_CSUM ? NETIF_F_NO_CSUM : 0;
109716+ features = br->feature_mask & ~NETIF_F_ALL_CSUM;
109717
109718 list_for_each_entry(p, &br->port_list, list) {
109719- if (!(p->dev->features
109720- & (NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM)))
109721+ unsigned long feature = p->dev->features;
109722+
109723+ if (checksum & NETIF_F_NO_CSUM && !(feature & NETIF_F_NO_CSUM))
109724+ checksum ^= NETIF_F_NO_CSUM | NETIF_F_HW_CSUM;
109725+ if (checksum & NETIF_F_HW_CSUM && !(feature & NETIF_F_HW_CSUM))
109726+ checksum ^= NETIF_F_HW_CSUM | NETIF_F_IP_CSUM;
109727+ if (!(feature & NETIF_F_IP_CSUM))
109728 checksum = 0;
109729- features &= p->dev->features;
109730+
109731+ if (feature & NETIF_F_GSO)
109732+ feature |= NETIF_F_TSO;
109733+ feature |= NETIF_F_GSO;
109734+
109735+ features &= feature;
109736 }
109737
109738- br->dev->features = features | checksum | NETIF_F_LLTX;
109739+ br->dev->features = features | checksum | NETIF_F_LLTX |
109740+ NETIF_F_GSO_ROBUST;
109741 }
109742
109743 /* called with RTNL */
109744diff -Nur linux-2.6.16.33-noxen/net/bridge/br_netfilter.c linux-2.6.16.33/net/bridge/br_netfilter.c
109745--- linux-2.6.16.33-noxen/net/bridge/br_netfilter.c 2006-11-22 18:06:31.000000000 +0000
109746+++ linux-2.6.16.33/net/bridge/br_netfilter.c 2007-05-23 21:00:01.000000000 +0000
109747@@ -743,7 +743,7 @@
109748 {
109749 if (skb->protocol == htons(ETH_P_IP) &&
109750 skb->len > skb->dev->mtu &&
109751- !(skb_shinfo(skb)->ufo_size || skb_shinfo(skb)->tso_size))
109752+ !skb_is_gso(skb))
109753 return ip_fragment(skb, br_dev_queue_push_xmit);
109754 else
109755 return br_dev_queue_push_xmit(skb);
109756diff -Nur linux-2.6.16.33-noxen/net/core/dev.c linux-2.6.16.33/net/core/dev.c
109757--- linux-2.6.16.33-noxen/net/core/dev.c 2006-11-22 18:06:31.000000000 +0000
109758+++ linux-2.6.16.33/net/core/dev.c 2007-01-08 15:00:46.000000000 +0000
109759@@ -115,6 +115,13 @@
109760 #include <net/iw_handler.h>
109761 #endif /* CONFIG_NET_RADIO */
109762 #include <asm/current.h>
109763+#include <linux/err.h>
109764+
109765+#ifdef CONFIG_XEN
109766+#include <net/ip.h>
109767+#include <linux/tcp.h>
109768+#include <linux/udp.h>
109769+#endif
109770
109771 /*
109772 * The list of packet types we will receive (as opposed to discard)
109773@@ -1032,7 +1039,7 @@
109774 * taps currently in use.
109775 */
109776
109777-void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
109778+static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
109779 {
109780 struct packet_type *ptype;
109781
109782@@ -1082,9 +1089,12 @@
109783 unsigned int csum;
109784 int ret = 0, offset = skb->h.raw - skb->data;
109785
109786- if (inward) {
109787- skb->ip_summed = CHECKSUM_NONE;
109788- goto out;
109789+ if (inward)
109790+ goto out_set_summed;
109791+
109792+ if (unlikely(skb_shinfo(skb)->gso_size)) {
109793+ /* Let GSO fix up the checksum. */
109794+ goto out_set_summed;
109795 }
109796
109797 if (skb_cloned(skb)) {
109798@@ -1101,11 +1111,65 @@
109799 BUG_ON(skb->csum + 2 > offset);
109800
109801 *(u16*)(skb->h.raw + skb->csum) = csum_fold(csum);
109802+
109803+out_set_summed:
109804 skb->ip_summed = CHECKSUM_NONE;
109805 out:
109806 return ret;
109807 }
109808
109809+/**
109810+ * skb_gso_segment - Perform segmentation on skb.
109811+ * @skb: buffer to segment
109812+ * @features: features for the output path (see dev->features)
109813+ *
109814+ * This function segments the given skb and returns a list of segments.
109815+ *
109816+ * It may return NULL if the skb requires no segmentation. This is
109817+ * only possible when GSO is used for verifying header integrity.
109818+ */
109819+struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features)
109820+{
109821+ struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
109822+ struct packet_type *ptype;
109823+ int type = skb->protocol;
109824+ int err;
109825+
109826+ BUG_ON(skb_shinfo(skb)->frag_list);
109827+
109828+ skb->mac.raw = skb->data;
109829+ skb->mac_len = skb->nh.raw - skb->data;
109830+ __skb_pull(skb, skb->mac_len);
109831+
109832+ if (unlikely(skb->ip_summed != CHECKSUM_HW)) {
109833+ if (skb_header_cloned(skb) &&
109834+ (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
109835+ return ERR_PTR(err);
109836+ }
109837+
109838+ rcu_read_lock();
109839+ list_for_each_entry_rcu(ptype, &ptype_base[ntohs(type) & 15], list) {
109840+ if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
109841+ if (unlikely(skb->ip_summed != CHECKSUM_HW)) {
109842+ err = ptype->gso_send_check(skb);
109843+ segs = ERR_PTR(err);
109844+ if (err || skb_gso_ok(skb, features))
109845+ break;
109846+ __skb_push(skb, skb->data - skb->nh.raw);
109847+ }
109848+ segs = ptype->gso_segment(skb, features);
109849+ break;
109850+ }
109851+ }
109852+ rcu_read_unlock();
109853+
109854+ __skb_push(skb, skb->data - skb->mac.raw);
109855+
109856+ return segs;
109857+}
109858+
109859+EXPORT_SYMBOL(skb_gso_segment);
109860+
109861 /* Take action when hardware reception checksum errors are detected. */
109862 #ifdef CONFIG_BUG
109863 void netdev_rx_csum_fault(struct net_device *dev)
109864@@ -1142,79 +1206,148 @@
109865 #define illegal_highdma(dev, skb) (0)
109866 #endif
109867
109868-/* Keep head the same: replace data */
109869-int __skb_linearize(struct sk_buff *skb, gfp_t gfp_mask)
109870+struct dev_gso_cb {
109871+ void (*destructor)(struct sk_buff *skb);
109872+};
109873+
109874+#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
109875+
109876+static void dev_gso_skb_destructor(struct sk_buff *skb)
109877 {
109878- unsigned int size;
109879- u8 *data;
109880- long offset;
109881- struct skb_shared_info *ninfo;
109882- int headerlen = skb->data - skb->head;
109883- int expand = (skb->tail + skb->data_len) - skb->end;
109884-
109885- if (skb_shared(skb))
109886- BUG();
109887-
109888- if (expand <= 0)
109889- expand = 0;
109890-
109891- size = skb->end - skb->head + expand;
109892- size = SKB_DATA_ALIGN(size);
109893- data = kmalloc(size + sizeof(struct skb_shared_info), gfp_mask);
109894- if (!data)
109895- return -ENOMEM;
109896-
109897- /* Copy entire thing */
109898- if (skb_copy_bits(skb, -headerlen, data, headerlen + skb->len))
109899- BUG();
109900-
109901- /* Set up shinfo */
109902- ninfo = (struct skb_shared_info*)(data + size);
109903- atomic_set(&ninfo->dataref, 1);
109904- ninfo->tso_size = skb_shinfo(skb)->tso_size;
109905- ninfo->tso_segs = skb_shinfo(skb)->tso_segs;
109906- ninfo->ufo_size = skb_shinfo(skb)->ufo_size;
109907- ninfo->nr_frags = 0;
109908- ninfo->frag_list = NULL;
109909-
109910- /* Offset between the two in bytes */
109911- offset = data - skb->head;
109912-
109913- /* Free old data. */
109914- skb_release_data(skb);
109915-
109916- skb->head = data;
109917- skb->end = data + size;
109918-
109919- /* Set up new pointers */
109920- skb->h.raw += offset;
109921- skb->nh.raw += offset;
109922- skb->mac.raw += offset;
109923- skb->tail += offset;
109924- skb->data += offset;
109925+ struct dev_gso_cb *cb;
109926+
109927+ do {
109928+ struct sk_buff *nskb = skb->next;
109929
109930- /* We are no longer a clone, even if we were. */
109931- skb->cloned = 0;
109932+ skb->next = nskb->next;
109933+ nskb->next = NULL;
109934+ kfree_skb(nskb);
109935+ } while (skb->next);
109936+
109937+ cb = DEV_GSO_CB(skb);
109938+ if (cb->destructor)
109939+ cb->destructor(skb);
109940+}
109941+
109942+/**
109943+ * dev_gso_segment - Perform emulated hardware segmentation on skb.
109944+ * @skb: buffer to segment
109945+ *
109946+ * This function segments the given skb and stores the list of segments
109947+ * in skb->next.
109948+ */
109949+static int dev_gso_segment(struct sk_buff *skb)
109950+{
109951+ struct net_device *dev = skb->dev;
109952+ struct sk_buff *segs;
109953+ int features = dev->features & ~(illegal_highdma(dev, skb) ?
109954+ NETIF_F_SG : 0);
109955+
109956+ segs = skb_gso_segment(skb, features);
109957+
109958+ /* Verifying header integrity only. */
109959+ if (!segs)
109960+ return 0;
109961+
109962+ if (unlikely(IS_ERR(segs)))
109963+ return PTR_ERR(segs);
109964+
109965+ skb->next = segs;
109966+ DEV_GSO_CB(skb)->destructor = skb->destructor;
109967+ skb->destructor = dev_gso_skb_destructor;
109968
109969- skb->tail += skb->data_len;
109970- skb->data_len = 0;
109971+ return 0;
109972+}
109973+
109974+int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev)
109975+{
109976+ if (likely(!skb->next)) {
109977+ if (netdev_nit)
109978+ dev_queue_xmit_nit(skb, dev);
109979+
109980+ if (netif_needs_gso(dev, skb)) {
109981+ if (unlikely(dev_gso_segment(skb)))
109982+ goto out_kfree_skb;
109983+ if (skb->next)
109984+ goto gso;
109985+ }
109986+
109987+ return dev->hard_start_xmit(skb, dev);
109988+ }
109989+
109990+gso:
109991+ do {
109992+ struct sk_buff *nskb = skb->next;
109993+ int rc;
109994+
109995+ skb->next = nskb->next;
109996+ nskb->next = NULL;
109997+ rc = dev->hard_start_xmit(nskb, dev);
109998+ if (unlikely(rc)) {
109999+ nskb->next = skb->next;
110000+ skb->next = nskb;
110001+ return rc;
110002+ }
110003+ if (unlikely(netif_queue_stopped(dev) && skb->next))
110004+ return NETDEV_TX_BUSY;
110005+ } while (skb->next);
110006+
110007+ skb->destructor = DEV_GSO_CB(skb)->destructor;
110008+
110009+out_kfree_skb:
110010+ kfree_skb(skb);
110011 return 0;
110012 }
110013
110014 #define HARD_TX_LOCK(dev, cpu) { \
110015 if ((dev->features & NETIF_F_LLTX) == 0) { \
110016- spin_lock(&dev->xmit_lock); \
110017- dev->xmit_lock_owner = cpu; \
110018+ netif_tx_lock(dev); \
110019 } \
110020 }
110021
110022 #define HARD_TX_UNLOCK(dev) { \
110023 if ((dev->features & NETIF_F_LLTX) == 0) { \
110024- dev->xmit_lock_owner = -1; \
110025- spin_unlock(&dev->xmit_lock); \
110026+ netif_tx_unlock(dev); \
110027 } \
110028 }
110029
110030+#ifdef CONFIG_XEN
110031+inline int skb_checksum_setup(struct sk_buff *skb)
110032+{
110033+ if (skb->proto_csum_blank) {
110034+ if (skb->protocol != htons(ETH_P_IP))
110035+ goto out;
110036+ skb->h.raw = (unsigned char *)skb->nh.iph + 4*skb->nh.iph->ihl;
110037+ if (skb->h.raw >= skb->tail)
110038+ goto out;
110039+ switch (skb->nh.iph->protocol) {
110040+ case IPPROTO_TCP:
110041+ skb->csum = offsetof(struct tcphdr, check);
110042+ break;
110043+ case IPPROTO_UDP:
110044+ skb->csum = offsetof(struct udphdr, check);
110045+ break;
110046+ default:
110047+ if (net_ratelimit())
110048+ printk(KERN_ERR "Attempting to checksum a non-"
110049+ "TCP/UDP packet, dropping a protocol"
110050+ " %d packet", skb->nh.iph->protocol);
110051+ goto out;
110052+ }
110053+ if ((skb->h.raw + skb->csum + 2) > skb->tail)
110054+ goto out;
110055+ skb->ip_summed = CHECKSUM_HW;
110056+ skb->proto_csum_blank = 0;
110057+ }
110058+ return 0;
110059+out:
110060+ return -EPROTO;
110061+}
110062+#else
110063+inline int skb_checksum_setup(struct sk_buff *skb) { return 0; }
110064+#endif
110065+
110066+
110067 /**
110068 * dev_queue_xmit - transmit a buffer
110069 * @skb: buffer to transmit
110070@@ -1247,9 +1380,19 @@
110071 struct Qdisc *q;
110072 int rc = -ENOMEM;
110073
110074+ /* If a checksum-deferred packet is forwarded to a device that needs a
110075+ * checksum, correct the pointers and force checksumming.
110076+ */
110077+ if (skb_checksum_setup(skb))
110078+ goto out_kfree_skb;
110079+
110080+ /* GSO will handle the following emulations directly. */
110081+ if (netif_needs_gso(dev, skb))
110082+ goto gso;
110083+
110084 if (skb_shinfo(skb)->frag_list &&
110085 !(dev->features & NETIF_F_FRAGLIST) &&
110086- __skb_linearize(skb, GFP_ATOMIC))
110087+ __skb_linearize(skb))
110088 goto out_kfree_skb;
110089
110090 /* Fragmented skb is linearized if device does not support SG,
110091@@ -1258,25 +1401,26 @@
110092 */
110093 if (skb_shinfo(skb)->nr_frags &&
110094 (!(dev->features & NETIF_F_SG) || illegal_highdma(dev, skb)) &&
110095- __skb_linearize(skb, GFP_ATOMIC))
110096+ __skb_linearize(skb))
110097 goto out_kfree_skb;
110098
110099 /* If packet is not checksummed and device does not support
110100 * checksumming for this protocol, complete checksumming here.
110101 */
110102 if (skb->ip_summed == CHECKSUM_HW &&
110103- (!(dev->features & (NETIF_F_HW_CSUM | NETIF_F_NO_CSUM)) &&
110104+ (!(dev->features & NETIF_F_GEN_CSUM) &&
110105 (!(dev->features & NETIF_F_IP_CSUM) ||
110106 skb->protocol != htons(ETH_P_IP))))
110107 if (skb_checksum_help(skb, 0))
110108 goto out_kfree_skb;
110109
110110+gso:
110111 spin_lock_prefetch(&dev->queue_lock);
110112
110113 /* Disable soft irqs for various locks below. Also
110114 * stops preemption for RCU.
110115 */
110116- local_bh_disable();
110117+ rcu_read_lock_bh();
110118
110119 /* Updates of qdisc are serialized by queue_lock.
110120 * The struct Qdisc which is pointed to by qdisc is now a
110121@@ -1310,8 +1454,8 @@
110122 /* The device has no queue. Common case for software devices:
110123 loopback, all the sorts of tunnels...
110124
110125- Really, it is unlikely that xmit_lock protection is necessary here.
110126- (f.e. loopback and IP tunnels are clean ignoring statistics
110127+ Really, it is unlikely that netif_tx_lock protection is necessary
110128+ here. (f.e. loopback and IP tunnels are clean ignoring statistics
110129 counters.)
110130 However, it is possible, that they rely on protection
110131 made by us here.
110132@@ -1327,11 +1471,8 @@
110133 HARD_TX_LOCK(dev, cpu);
110134
110135 if (!netif_queue_stopped(dev)) {
110136- if (netdev_nit)
110137- dev_queue_xmit_nit(skb, dev);
110138-
110139 rc = 0;
110140- if (!dev->hard_start_xmit(skb, dev)) {
110141+ if (!dev_hard_start_xmit(skb, dev)) {
110142 HARD_TX_UNLOCK(dev);
110143 goto out;
110144 }
110145@@ -1350,13 +1491,13 @@
110146 }
110147
110148 rc = -ENETDOWN;
110149- local_bh_enable();
110150+ rcu_read_unlock_bh();
110151
110152 out_kfree_skb:
110153 kfree_skb(skb);
110154 return rc;
110155 out:
110156- local_bh_enable();
110157+ rcu_read_unlock_bh();
110158 return rc;
110159 }
110160
110161@@ -1610,6 +1751,19 @@
110162 }
110163 #endif
110164
110165+#ifdef CONFIG_XEN
110166+ switch (skb->ip_summed) {
110167+ case CHECKSUM_UNNECESSARY:
110168+ skb->proto_data_valid = 1;
110169+ break;
110170+ case CHECKSUM_HW:
110171+ /* XXX Implement me. */
110172+ default:
110173+ skb->proto_data_valid = 0;
110174+ break;
110175+ }
110176+#endif
110177+
110178 list_for_each_entry_rcu(ptype, &ptype_all, list) {
110179 if (!ptype->dev || ptype->dev == skb->dev) {
110180 if (pt_prev)
110181@@ -2671,7 +2825,7 @@
110182 BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
110183
110184 spin_lock_init(&dev->queue_lock);
110185- spin_lock_init(&dev->xmit_lock);
110186+ spin_lock_init(&dev->_xmit_lock);
110187 dev->xmit_lock_owner = -1;
110188 #ifdef CONFIG_NET_CLS_ACT
110189 spin_lock_init(&dev->ingress_lock);
110190@@ -2715,9 +2869,7 @@
110191
110192 /* Fix illegal SG+CSUM combinations. */
110193 if ((dev->features & NETIF_F_SG) &&
110194- !(dev->features & (NETIF_F_IP_CSUM |
110195- NETIF_F_NO_CSUM |
110196- NETIF_F_HW_CSUM))) {
110197+ !(dev->features & NETIF_F_ALL_CSUM)) {
110198 printk("%s: Dropping NETIF_F_SG since no checksum feature.\n",
110199 dev->name);
110200 dev->features &= ~NETIF_F_SG;
110201@@ -3269,7 +3421,6 @@
110202 EXPORT_SYMBOL(__dev_get_by_index);
110203 EXPORT_SYMBOL(__dev_get_by_name);
110204 EXPORT_SYMBOL(__dev_remove_pack);
110205-EXPORT_SYMBOL(__skb_linearize);
110206 EXPORT_SYMBOL(dev_valid_name);
110207 EXPORT_SYMBOL(dev_add_pack);
110208 EXPORT_SYMBOL(dev_alloc_name);
110209@@ -3301,6 +3452,7 @@
110210 EXPORT_SYMBOL(net_enable_timestamp);
110211 EXPORT_SYMBOL(net_disable_timestamp);
110212 EXPORT_SYMBOL(dev_get_flags);
110213+EXPORT_SYMBOL(skb_checksum_setup);
110214
110215 #if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)
110216 EXPORT_SYMBOL(br_handle_frame_hook);
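A large part of the net/core/dev.c hunk above is dev_hard_start_xmit(), which walks the segment chain that GSO emulation builds, hands each segment to the driver, and re-links whatever was not sent when the driver reports congestion so the caller can requeue it. The following self-contained sketch mirrors that loop; struct seg and fake_hard_start_xmit() are invented for the example, and the "driver" simply refuses segment 2 to show the re-linking.

    #include <stdio.h>

    /* Toy segment list standing in for the skb->next chain built by GSO. */
    struct seg {
        int id;
        struct seg *next;
    };

    /* Fake driver hook: pretend the hardware queue fills up at segment 2. */
    static int fake_hard_start_xmit(struct seg *s)
    {
        if (s->id == 2)
            return 1;               /* non-zero = driver asked us to retry */
        printf("sent segment %d\n", s->id);
        return 0;
    }

    /* Mirrors the dev_hard_start_xmit() loop: detach one segment at a time,
     * transmit it, and on failure re-link it so the unsent remainder can be
     * requeued by the caller. */
    static int xmit_gso_chain(struct seg *head)
    {
        while (head->next) {
            struct seg *nseg = head->next;
            int rc;

            head->next = nseg->next;
            nseg->next = NULL;
            rc = fake_hard_start_xmit(nseg);
            if (rc) {
                nseg->next = head->next;
                head->next = nseg;      /* put the failed segment back */
                return rc;
            }
        }
        return 0;
    }

    int main(void)
    {
        struct seg s3 = { 3, NULL }, s2 = { 2, &s3 }, s1 = { 1, &s2 };
        struct seg head = { 0, &s1 };   /* head models the original GSO skb */

        if (xmit_gso_chain(&head))
            printf("stopped; next unsent segment is %d\n", head.next->id);
        return 0;
    }

On the simulated failure the program reports that segment 2 is still at the head of the chain, which is the state dev_hard_start_xmit() leaves behind for a later retry.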
110217diff -Nur linux-2.6.16.33-noxen/net/core/dev_mcast.c linux-2.6.16.33/net/core/dev_mcast.c
110218--- linux-2.6.16.33-noxen/net/core/dev_mcast.c 2006-11-22 18:06:31.000000000 +0000
110219+++ linux-2.6.16.33/net/core/dev_mcast.c 2007-05-23 21:00:01.000000000 +0000
110220@@ -62,7 +62,7 @@
110221 * Device mc lists are changed by bh at least if IPv6 is enabled,
110222 * so that it must be bh protected.
110223 *
110224- * We block accesses to device mc filters with dev->xmit_lock.
110225+ * We block accesses to device mc filters with netif_tx_lock.
110226 */
110227
110228 /*
110229@@ -93,9 +93,9 @@
110230
110231 void dev_mc_upload(struct net_device *dev)
110232 {
110233- spin_lock_bh(&dev->xmit_lock);
110234+ netif_tx_lock_bh(dev);
110235 __dev_mc_upload(dev);
110236- spin_unlock_bh(&dev->xmit_lock);
110237+ netif_tx_unlock_bh(dev);
110238 }
110239
110240 /*
110241@@ -107,7 +107,7 @@
110242 int err = 0;
110243 struct dev_mc_list *dmi, **dmip;
110244
110245- spin_lock_bh(&dev->xmit_lock);
110246+ netif_tx_lock_bh(dev);
110247
110248 for (dmip = &dev->mc_list; (dmi = *dmip) != NULL; dmip = &dmi->next) {
110249 /*
110250@@ -139,13 +139,13 @@
110251 */
110252 __dev_mc_upload(dev);
110253
110254- spin_unlock_bh(&dev->xmit_lock);
110255+ netif_tx_unlock_bh(dev);
110256 return 0;
110257 }
110258 }
110259 err = -ENOENT;
110260 done:
110261- spin_unlock_bh(&dev->xmit_lock);
110262+ netif_tx_unlock_bh(dev);
110263 return err;
110264 }
110265
110266@@ -160,7 +160,7 @@
110267
110268 dmi1 = kmalloc(sizeof(*dmi), GFP_ATOMIC);
110269
110270- spin_lock_bh(&dev->xmit_lock);
110271+ netif_tx_lock_bh(dev);
110272 for (dmi = dev->mc_list; dmi != NULL; dmi = dmi->next) {
110273 if (memcmp(dmi->dmi_addr, addr, dmi->dmi_addrlen) == 0 &&
110274 dmi->dmi_addrlen == alen) {
110275@@ -176,7 +176,7 @@
110276 }
110277
110278 if ((dmi = dmi1) == NULL) {
110279- spin_unlock_bh(&dev->xmit_lock);
110280+ netif_tx_unlock_bh(dev);
110281 return -ENOMEM;
110282 }
110283 memcpy(dmi->dmi_addr, addr, alen);
110284@@ -189,11 +189,11 @@
110285
110286 __dev_mc_upload(dev);
110287
110288- spin_unlock_bh(&dev->xmit_lock);
110289+ netif_tx_unlock_bh(dev);
110290 return 0;
110291
110292 done:
110293- spin_unlock_bh(&dev->xmit_lock);
110294+ netif_tx_unlock_bh(dev);
110295 kfree(dmi1);
110296 return err;
110297 }
110298@@ -204,7 +204,7 @@
110299
110300 void dev_mc_discard(struct net_device *dev)
110301 {
110302- spin_lock_bh(&dev->xmit_lock);
110303+ netif_tx_lock_bh(dev);
110304
110305 while (dev->mc_list != NULL) {
110306 struct dev_mc_list *tmp = dev->mc_list;
110307@@ -215,7 +215,7 @@
110308 }
110309 dev->mc_count = 0;
110310
110311- spin_unlock_bh(&dev->xmit_lock);
110312+ netif_tx_unlock_bh(dev);
110313 }
110314
110315 #ifdef CONFIG_PROC_FS
110316@@ -250,7 +250,7 @@
110317 struct dev_mc_list *m;
110318 struct net_device *dev = v;
110319
110320- spin_lock_bh(&dev->xmit_lock);
110321+ netif_tx_lock_bh(dev);
110322 for (m = dev->mc_list; m; m = m->next) {
110323 int i;
110324
110325@@ -262,7 +262,7 @@
110326
110327 seq_putc(seq, '\n');
110328 }
110329- spin_unlock_bh(&dev->xmit_lock);
110330+ netif_tx_unlock_bh(dev);
110331 return 0;
110332 }
110333
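The dev_mcast.c changes above are part of the patch-wide switch from open-coded dev->xmit_lock plus xmit_lock_owner handling to the netif_tx_lock*() helpers. A rough pthread-based analogue of hiding that lock-and-owner bookkeeping behind two helpers is sketched below; struct toy_dev, toy_tx_lock() and toy_tx_unlock() are illustrative only, and the CPU number is faked.

    /* build with: cc -pthread toy_txlock.c */
    #include <pthread.h>
    #include <stdio.h>

    /* Toy device: a private transmit lock plus an owner field, standing in
     * for the dev->_xmit_lock / dev->xmit_lock_owner pair the patch wraps. */
    struct toy_dev {
        pthread_mutex_t _xmit_lock;
        int xmit_lock_owner;            /* -1 while unlocked */
    };

    /* netif_tx_lock()/netif_tx_unlock()-style helpers: callers never touch
     * the raw lock or the owner field directly. */
    static void toy_tx_lock(struct toy_dev *dev, int cpu)
    {
        pthread_mutex_lock(&dev->_xmit_lock);
        dev->xmit_lock_owner = cpu;
    }

    static void toy_tx_unlock(struct toy_dev *dev)
    {
        dev->xmit_lock_owner = -1;
        pthread_mutex_unlock(&dev->_xmit_lock);
    }

    int main(void)
    {
        struct toy_dev dev = { PTHREAD_MUTEX_INITIALIZER, -1 };

        toy_tx_lock(&dev, 0);           /* pretend we run on CPU 0 */
        printf("updating multicast list while owner is cpu %d\n",
               dev.xmit_lock_owner);
        toy_tx_unlock(&dev);
        return 0;
    }

The benefit mirrored here is the same as in the patch: callers such as the multicast-list code go through one interface instead of touching the raw spinlock and the owner field themselves.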
110334diff -Nur linux-2.6.16.33-noxen/net/core/ethtool.c linux-2.6.16.33/net/core/ethtool.c
110335--- linux-2.6.16.33-noxen/net/core/ethtool.c 2006-11-22 18:06:31.000000000 +0000
110336+++ linux-2.6.16.33/net/core/ethtool.c 2007-05-23 21:00:01.000000000 +0000
110337@@ -30,7 +30,7 @@
110338
110339 u32 ethtool_op_get_tx_csum(struct net_device *dev)
110340 {
110341- return (dev->features & (NETIF_F_IP_CSUM | NETIF_F_HW_CSUM)) != 0;
110342+ return (dev->features & NETIF_F_ALL_CSUM) != 0;
110343 }
110344
110345 int ethtool_op_set_tx_csum(struct net_device *dev, u32 data)
110346@@ -551,9 +551,7 @@
110347 return -EFAULT;
110348
110349 if (edata.data &&
110350- !(dev->features & (NETIF_F_IP_CSUM |
110351- NETIF_F_NO_CSUM |
110352- NETIF_F_HW_CSUM)))
110353+ !(dev->features & NETIF_F_ALL_CSUM))
110354 return -EINVAL;
110355
110356 return __ethtool_set_sg(dev, edata.data);
110357@@ -561,7 +559,7 @@
110358
110359 static int ethtool_get_tso(struct net_device *dev, char __user *useraddr)
110360 {
110361- struct ethtool_value edata = { ETHTOOL_GTSO };
110362+ struct ethtool_value edata = { ETHTOOL_GUFO };
110363
110364 if (!dev->ethtool_ops->get_tso)
110365 return -EOPNOTSUPP;
110366@@ -616,6 +614,29 @@
110367 return dev->ethtool_ops->set_ufo(dev, edata.data);
110368 }
110369
110370+static int ethtool_get_gso(struct net_device *dev, char __user *useraddr)
110371+{
110372+ struct ethtool_value edata = { ETHTOOL_GGSO };
110373+
110374+ edata.data = dev->features & NETIF_F_GSO;
110375+ if (copy_to_user(useraddr, &edata, sizeof(edata)))
110376+ return -EFAULT;
110377+ return 0;
110378+}
110379+
110380+static int ethtool_set_gso(struct net_device *dev, char __user *useraddr)
110381+{
110382+ struct ethtool_value edata;
110383+
110384+ if (copy_from_user(&edata, useraddr, sizeof(edata)))
110385+ return -EFAULT;
110386+ if (edata.data)
110387+ dev->features |= NETIF_F_GSO;
110388+ else
110389+ dev->features &= ~NETIF_F_GSO;
110390+ return 0;
110391+}
110392+
110393 static int ethtool_self_test(struct net_device *dev, char __user *useraddr)
110394 {
110395 struct ethtool_test test;
110396@@ -907,6 +928,12 @@
110397 case ETHTOOL_SUFO:
110398 rc = ethtool_set_ufo(dev, useraddr);
110399 break;
110400+ case ETHTOOL_GGSO:
110401+ rc = ethtool_get_gso(dev, useraddr);
110402+ break;
110403+ case ETHTOOL_SGSO:
110404+ rc = ethtool_set_gso(dev, useraddr);
110405+ break;
110406 default:
110407 rc = -EOPNOTSUPP;
110408 }
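With the ETHTOOL_GGSO and ETHTOOL_SGSO commands wired up above, user space can query or toggle GSO per interface through the usual SIOCETHTOOL ioctl. The short query program below is a sketch of that; it assumes headers that already define ETHTOOL_GGSO (a tree with this patch applied, or any later kernel) and takes the interface name on the command line.

    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <sys/socket.h>
    #include <net/if.h>
    #include <linux/ethtool.h>
    #include <linux/sockios.h>

    int main(int argc, char **argv)
    {
        struct ethtool_value eval = { .cmd = ETHTOOL_GGSO };
        struct ifreq ifr;
        int fd;

        if (argc != 2) {
            fprintf(stderr, "usage: %s <interface>\n", argv[0]);
            return 1;
        }

        fd = socket(AF_INET, SOCK_DGRAM, 0);
        if (fd < 0) {
            perror("socket");
            return 1;
        }

        memset(&ifr, 0, sizeof(ifr));
        strncpy(ifr.ifr_name, argv[1], IFNAMSIZ - 1);
        ifr.ifr_data = (char *)&eval;           /* command + result buffer */

        if (ioctl(fd, SIOCETHTOOL, &ifr) < 0) {
            perror("SIOCETHTOOL ETHTOOL_GGSO");
            close(fd);
            return 1;
        }

        printf("%s: generic-segmentation-offload %s\n",
               argv[1], eval.data ? "on" : "off");
        close(fd);
        return 0;
    }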
110409diff -Nur linux-2.6.16.33-noxen/net/core/netpoll.c linux-2.6.16.33/net/core/netpoll.c
110410--- linux-2.6.16.33-noxen/net/core/netpoll.c 2006-11-22 18:06:31.000000000 +0000
110411+++ linux-2.6.16.33/net/core/netpoll.c 2007-05-23 21:00:01.000000000 +0000
110412@@ -273,24 +273,21 @@
110413
110414 do {
110415 npinfo->tries--;
110416- spin_lock(&np->dev->xmit_lock);
110417- np->dev->xmit_lock_owner = smp_processor_id();
110418+ netif_tx_lock(np->dev);
110419
110420 /*
110421 * network drivers do not expect to be called if the queue is
110422 * stopped.
110423 */
110424 if (netif_queue_stopped(np->dev)) {
110425- np->dev->xmit_lock_owner = -1;
110426- spin_unlock(&np->dev->xmit_lock);
110427+ netif_tx_unlock(np->dev);
110428 netpoll_poll(np);
110429 udelay(50);
110430 continue;
110431 }
110432
110433 status = np->dev->hard_start_xmit(skb, np->dev);
110434- np->dev->xmit_lock_owner = -1;
110435- spin_unlock(&np->dev->xmit_lock);
110436+ netif_tx_unlock(np->dev);
110437
110438 /* success */
110439 if(!status) {
110440diff -Nur linux-2.6.16.33-noxen/net/core/pktgen.c linux-2.6.16.33/net/core/pktgen.c
110441--- linux-2.6.16.33-noxen/net/core/pktgen.c 2006-11-22 18:06:31.000000000 +0000
110442+++ linux-2.6.16.33/net/core/pktgen.c 2007-05-23 21:00:01.000000000 +0000
110443@@ -2586,7 +2586,7 @@
110444 }
110445 }
110446
110447- spin_lock_bh(&odev->xmit_lock);
110448+ netif_tx_lock_bh(odev);
110449 if (!netif_queue_stopped(odev)) {
110450
110451 atomic_inc(&(pkt_dev->skb->users));
110452@@ -2631,7 +2631,7 @@
110453 pkt_dev->next_tx_ns = 0;
110454 }
110455
110456- spin_unlock_bh(&odev->xmit_lock);
110457+ netif_tx_unlock_bh(odev);
110458
110459 /* If pkt_dev->count is zero, then run forever */
110460 if ((pkt_dev->count != 0) && (pkt_dev->sofar >= pkt_dev->count)) {
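pktgen.c above only picks up the netif_tx_lock_bh() conversion, so it needs no example of its own; instead, here is a user-space model of the skb_checksum_setup() helper added to net/core/dev.c earlier in this patch, which locates the transport header from the IP header length and records where the TCP or UDP checksum field sits so it can be filled in later. The toy_iphdr and toy_tcphdr layouts and checksum_field_offset() are simplified stand-ins, not the kernel's types.

    #include <stdio.h>
    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    /* Minimal IPv4 and TCP header layouts; only what this sketch needs,
     * laid out so the interesting field offsets match the real protocols. */
    struct toy_iphdr {
        uint8_t  ver_ihl;       /* version in the high nibble, IHL in the low */
        uint8_t  tos;
        uint16_t tot_len;
        uint16_t id;
        uint16_t frag_off;
        uint8_t  ttl;
        uint8_t  protocol;      /* 6 = TCP, 17 = UDP */
        uint16_t check;
        uint32_t saddr, daddr;
    };

    struct toy_tcphdr {
        uint16_t source, dest;
        uint32_t seq, ack_seq;
        uint16_t doff_flags;
        uint16_t window;
        uint16_t check;         /* offset 16, as in the real TCP header */
        uint16_t urg_ptr;
    };

    /* Like skb_checksum_setup(): derive the transport-header position from
     * the IP header length and report where the checksum field sits inside
     * it. Returns that offset, or -1 for packets it cannot handle. */
    static long checksum_field_offset(const unsigned char *pkt, size_t len)
    {
        struct toy_iphdr iph;

        if (len < sizeof(iph))
            return -1;
        memcpy(&iph, pkt, sizeof(iph));         /* sidestep alignment issues */
        if ((size_t)(iph.ver_ihl & 0x0f) * 4 < sizeof(iph))
            return -1;                          /* bogus IHL */

        switch (iph.protocol) {
        case 6:                                 /* TCP */
            return (long)offsetof(struct toy_tcphdr, check);
        case 17:                                /* UDP: checksum at offset 6 */
            return 6;
        default:
            return -1;                          /* would be dropped */
        }
    }

    int main(void)
    {
        unsigned char pkt[64] = { 0 };
        struct toy_iphdr iph = { .ver_ihl = 0x45, .protocol = 6 };
        size_t thoff = (iph.ver_ihl & 0x0f) * 4;    /* 20-byte IP header */
        long csum_off;

        memcpy(pkt, &iph, sizeof(iph));
        csum_off = checksum_field_offset(pkt, sizeof(pkt));
        if (csum_off < 0)
            return 1;
        printf("TCP checksum field starts %zu bytes into the packet\n",
               thoff + (size_t)csum_off);
        return 0;
    }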
110461diff -Nur linux-2.6.16.33-noxen/net/core/skbuff.c linux-2.6.16.33/net/core/skbuff.c
110462--- linux-2.6.16.33-noxen/net/core/skbuff.c 2006-11-22 18:06:31.000000000 +0000
110463+++ linux-2.6.16.33/net/core/skbuff.c 2007-01-08 15:00:46.000000000 +0000
110464@@ -132,6 +132,7 @@
110465 * Buffers may only be allocated from interrupts using a @gfp_mask of
110466 * %GFP_ATOMIC.
110467 */
110468+#ifndef CONFIG_HAVE_ARCH_ALLOC_SKB
110469 struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
110470 int fclone)
110471 {
110472@@ -164,9 +165,9 @@
110473 shinfo = skb_shinfo(skb);
110474 atomic_set(&shinfo->dataref, 1);
110475 shinfo->nr_frags = 0;
110476- shinfo->tso_size = 0;
110477- shinfo->tso_segs = 0;
110478- shinfo->ufo_size = 0;
110479+ shinfo->gso_size = 0;
110480+ shinfo->gso_segs = 0;
110481+ shinfo->gso_type = 0;
110482 shinfo->ip6_frag_id = 0;
110483 shinfo->frag_list = NULL;
110484
110485@@ -186,6 +187,7 @@
110486 skb = NULL;
110487 goto out;
110488 }
110489+#endif /* !CONFIG_HAVE_ARCH_ALLOC_SKB */
110490
110491 /**
110492 * alloc_skb_from_cache - allocate a network buffer
110493@@ -203,14 +205,18 @@
110494 */
110495 struct sk_buff *alloc_skb_from_cache(kmem_cache_t *cp,
110496 unsigned int size,
110497- gfp_t gfp_mask)
110498+ gfp_t gfp_mask,
110499+ int fclone)
110500 {
110501+ kmem_cache_t *cache;
110502+ struct skb_shared_info *shinfo;
110503 struct sk_buff *skb;
110504 u8 *data;
110505
110506+ cache = fclone ? skbuff_fclone_cache : skbuff_head_cache;
110507+
110508 /* Get the HEAD */
110509- skb = kmem_cache_alloc(skbuff_head_cache,
110510- gfp_mask & ~__GFP_DMA);
110511+ skb = kmem_cache_alloc(cache, gfp_mask & ~__GFP_DMA);
110512 if (!skb)
110513 goto out;
110514
110515@@ -227,17 +233,29 @@
110516 skb->data = data;
110517 skb->tail = data;
110518 skb->end = data + size;
110519+ /* make sure we initialize shinfo sequentially */
110520+ shinfo = skb_shinfo(skb);
110521+ atomic_set(&shinfo->dataref, 1);
110522+ shinfo->nr_frags = 0;
110523+ shinfo->gso_size = 0;
110524+ shinfo->gso_segs = 0;
110525+ shinfo->gso_type = 0;
110526+ shinfo->ip6_frag_id = 0;
110527+ shinfo->frag_list = NULL;
110528
110529- atomic_set(&(skb_shinfo(skb)->dataref), 1);
110530- skb_shinfo(skb)->nr_frags = 0;
110531- skb_shinfo(skb)->tso_size = 0;
110532- skb_shinfo(skb)->tso_segs = 0;
110533- skb_shinfo(skb)->ufo_size = 0;
110534- skb_shinfo(skb)->frag_list = NULL;
110535+ if (fclone) {
110536+ struct sk_buff *child = skb + 1;
110537+ atomic_t *fclone_ref = (atomic_t *) (child + 1);
110538+
110539+ skb->fclone = SKB_FCLONE_ORIG;
110540+ atomic_set(fclone_ref, 1);
110541+
110542+ child->fclone = SKB_FCLONE_UNAVAILABLE;
110543+ }
110544 out:
110545 return skb;
110546 nodata:
110547- kmem_cache_free(skbuff_head_cache, skb);
110548+ kmem_cache_free(cache, skb);
110549 skb = NULL;
110550 goto out;
110551 }
110552@@ -414,6 +432,10 @@
110553 C(local_df);
110554 n->cloned = 1;
110555 n->nohdr = 0;
110556+#ifdef CONFIG_XEN
110557+ C(proto_data_valid);
110558+ C(proto_csum_blank);
110559+#endif
110560 C(pkt_type);
110561 C(ip_summed);
110562 C(priority);
110563@@ -507,9 +529,9 @@
110564 new->tc_index = old->tc_index;
110565 #endif
110566 atomic_set(&new->users, 1);
110567- skb_shinfo(new)->tso_size = skb_shinfo(old)->tso_size;
110568- skb_shinfo(new)->tso_segs = skb_shinfo(old)->tso_segs;
110569- skb_shinfo(new)->ufo_size = skb_shinfo(old)->ufo_size;
110570+ skb_shinfo(new)->gso_size = skb_shinfo(old)->gso_size;
110571+ skb_shinfo(new)->gso_segs = skb_shinfo(old)->gso_segs;
110572+ skb_shinfo(new)->gso_type = skb_shinfo(old)->gso_type;
110573 }
110574
110575 /**
110576@@ -1822,6 +1844,132 @@
110577 return 0;
110578 }
110579
110580+/**
110581+ * skb_segment - Perform protocol segmentation on skb.
110582+ * @skb: buffer to segment
110583+ * @features: features for the output path (see dev->features)
110584+ *
110585+ * This function performs segmentation on the given skb. It returns
110586+ * the list of new segments, linked through skb->next, or an ERR_PTR()
110587+ * value when an error is encountered.
110588+ */
110589+struct sk_buff *skb_segment(struct sk_buff *skb, int features)
110590+{
110591+ struct sk_buff *segs = NULL;
110592+ struct sk_buff *tail = NULL;
110593+ unsigned int mss = skb_shinfo(skb)->gso_size;
110594+ unsigned int doffset = skb->data - skb->mac.raw;
110595+ unsigned int offset = doffset;
110596+ unsigned int headroom;
110597+ unsigned int len;
110598+ int sg = features & NETIF_F_SG;
110599+ int nfrags = skb_shinfo(skb)->nr_frags;
110600+ int err = -ENOMEM;
110601+ int i = 0;
110602+ int pos;
110603+
110604+ __skb_push(skb, doffset);
110605+ headroom = skb_headroom(skb);
110606+ pos = skb_headlen(skb);
110607+
110608+ do {
110609+ struct sk_buff *nskb;
110610+ skb_frag_t *frag;
110611+ int hsize;
110612+ int k;
110613+ int size;
110614+
110615+ len = skb->len - offset;
110616+ if (len > mss)
110617+ len = mss;
110618+
110619+ hsize = skb_headlen(skb) - offset;
110620+ if (hsize < 0)
110621+ hsize = 0;
110622+ if (hsize > len || !sg)
110623+ hsize = len;
110624+
110625+ nskb = alloc_skb(hsize + doffset + headroom, GFP_ATOMIC);
110626+ if (unlikely(!nskb))
110627+ goto err;
110628+
110629+ if (segs)
110630+ tail->next = nskb;
110631+ else
110632+ segs = nskb;
110633+ tail = nskb;
110634+
110635+ nskb->dev = skb->dev;
110636+ nskb->priority = skb->priority;
110637+ nskb->protocol = skb->protocol;
110638+ nskb->dst = dst_clone(skb->dst);
110639+ memcpy(nskb->cb, skb->cb, sizeof(skb->cb));
110640+ nskb->pkt_type = skb->pkt_type;
110641+ nskb->mac_len = skb->mac_len;
110642+
110643+ skb_reserve(nskb, headroom);
110644+ nskb->mac.raw = nskb->data;
110645+ nskb->nh.raw = nskb->data + skb->mac_len;
110646+ nskb->h.raw = nskb->nh.raw + (skb->h.raw - skb->nh.raw);
110647+ memcpy(skb_put(nskb, doffset), skb->data, doffset);
110648+
110649+ if (!sg) {
110650+ nskb->csum = skb_copy_and_csum_bits(skb, offset,
110651+ skb_put(nskb, len),
110652+ len, 0);
110653+ continue;
110654+ }
110655+
110656+ frag = skb_shinfo(nskb)->frags;
110657+ k = 0;
110658+
110659+ nskb->ip_summed = CHECKSUM_HW;
110660+ nskb->csum = skb->csum;
110661+ memcpy(skb_put(nskb, hsize), skb->data + offset, hsize);
110662+
110663+ while (pos < offset + len) {
110664+ BUG_ON(i >= nfrags);
110665+
110666+ *frag = skb_shinfo(skb)->frags[i];
110667+ get_page(frag->page);
110668+ size = frag->size;
110669+
110670+ if (pos < offset) {
110671+ frag->page_offset += offset - pos;
110672+ frag->size -= offset - pos;
110673+ }
110674+
110675+ k++;
110676+
110677+ if (pos + size <= offset + len) {
110678+ i++;
110679+ pos += size;
110680+ } else {
110681+ frag->size -= pos + size - (offset + len);
110682+ break;
110683+ }
110684+
110685+ frag++;
110686+ }
110687+
110688+ skb_shinfo(nskb)->nr_frags = k;
110689+ nskb->data_len = len - hsize;
110690+ nskb->len += nskb->data_len;
110691+ nskb->truesize += nskb->data_len;
110692+ } while ((offset += len) < skb->len);
110693+
110694+ return segs;
110695+
110696+err:
110697+ while ((skb = segs)) {
110698+ segs = skb->next;
110699+ kfree_skb(skb);
110700+ }
110701+ return ERR_PTR(err);
110702+}
110703+
110704+EXPORT_SYMBOL_GPL(skb_segment);
110705+
110706 void __init skb_init(void)
110707 {
110708 skbuff_head_cache = kmem_cache_create("skbuff_head_cache",
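The centrepiece of the skbuff.c hunk above is skb_segment(), which carves a GSO super-packet into mss-sized pieces, each carrying its own copy of the headers. The sketch below models the non-scatter-gather path of that loop in plain user-space C; toy_seg and toy_segment() are invented names, and where the real function reuses page fragments when the device supports SG, this model always copies the payload.

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    /* One toy "segment": copied header bytes followed by a payload slice. */
    struct toy_seg {
        struct toy_seg *next;
        size_t len;
        unsigned char data[];
    };

    /* Model of the skb_segment() walk: replicate the headers in front of
     * every mss-sized slice of the payload and chain the results. */
    static struct toy_seg *toy_segment(const unsigned char *hdr, size_t hlen,
                                       const unsigned char *payload,
                                       size_t plen, size_t mss)
    {
        struct toy_seg *segs = NULL, *tail = NULL;
        size_t off = 0;

        while (off < plen) {
            size_t len = plen - off > mss ? mss : plen - off;
            struct toy_seg *s = malloc(sizeof(*s) + hlen + len);

            if (!s)
                goto err;
            s->next = NULL;
            s->len = hlen + len;
            memcpy(s->data, hdr, hlen);              /* duplicated headers */
            memcpy(s->data + hlen, payload + off, len);

            if (tail)
                tail->next = s;
            else
                segs = s;
            tail = s;
            off += len;
        }
        return segs;

    err:
        while (segs) {                               /* free partial chain */
            struct toy_seg *n = segs->next;
            free(segs);
            segs = n;
        }
        return NULL;
    }

    int main(void)
    {
        unsigned char hdr[14], payload[4000];
        struct toy_seg *s, *next;
        int i = 0;

        memset(hdr, 'H', sizeof(hdr));               /* stand-in L2 header */
        memset(payload, 'P', sizeof(payload));
        for (s = toy_segment(hdr, sizeof(hdr), payload, sizeof(payload), 1448);
             s; s = next) {
            next = s->next;
            printf("segment %d: %zu bytes\n", i++, s->len);
            free(s);
        }
        return 0;
    }

With a 4000-byte payload and an mss of 1448 it yields three segments of 1462, 1462 and 1118 bytes including the copied 14-byte header, matching how the kernel version splits off the last short piece.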
110709diff -Nur linux-2.6.16.33-noxen/net/core/skbuff.c~ linux-2.6.16.33/net/core/skbuff.c~
110710--- linux-2.6.16.33-noxen/net/core/skbuff.c~ 1970-01-01 00:00:00.000000000 +0000
110711+++ linux-2.6.16.33/net/core/skbuff.c~ 2007-05-23 21:00:01.000000000 +0000
110712@@ -0,0 +1,2003 @@
110713+/*
110714+ * Routines having to do with the 'struct sk_buff' memory handlers.
110715+ *
110716+ * Authors: Alan Cox <iiitac@pyr.swan.ac.uk>
110717+ * Florian La Roche <rzsfl@rz.uni-sb.de>
110718+ *
110719+ * Version: $Id: skbuff.c,v 1.90 2001/11/07 05:56:19 davem Exp $
110720+ *
110721+ * Fixes:
110722+ * Alan Cox : Fixed the worst of the load
110723+ * balancer bugs.
110724+ * Dave Platt : Interrupt stacking fix.
110725+ * Richard Kooijman : Timestamp fixes.
110726+ * Alan Cox : Changed buffer format.
110727+ * Alan Cox : destructor hook for AF_UNIX etc.
110728+ * Linus Torvalds : Better skb_clone.
110729+ * Alan Cox : Added skb_copy.
110730+ * Alan Cox : Added all the changed routines Linus
110731+ * only put in the headers
110732+ * Ray VanTassle : Fixed --skb->lock in free
110733+ * Alan Cox : skb_copy copy arp field
110734+ * Andi Kleen : slabified it.
110735+ * Robert Olsson : Removed skb_head_pool
110736+ *
110737+ * NOTE:
110738+ * The __skb_ routines should be called with interrupts
110739+ * disabled, or you better be *real* sure that the operation is atomic
110740+ * with respect to whatever list is being frobbed (e.g. via lock_sock()
110741+ * or via disabling bottom half handlers, etc).
110742+ *
110743+ * This program is free software; you can redistribute it and/or
110744+ * modify it under the terms of the GNU General Public License
110745+ * as published by the Free Software Foundation; either version
110746+ * 2 of the License, or (at your option) any later version.
110747+ */
110748+
110749+/*
110750+ * The functions in this file will not compile correctly with gcc 2.4.x
110751+ */
110752+
110753+#include <linux/config.h>
110754+#include <linux/module.h>
110755+#include <linux/types.h>
110756+#include <linux/kernel.h>
110757+#include <linux/sched.h>
110758+#include <linux/mm.h>
110759+#include <linux/interrupt.h>
110760+#include <linux/in.h>
110761+#include <linux/inet.h>
110762+#include <linux/slab.h>
110763+#include <linux/netdevice.h>
110764+#ifdef CONFIG_NET_CLS_ACT
110765+#include <net/pkt_sched.h>
110766+#endif
110767+#include <linux/string.h>
110768+#include <linux/skbuff.h>
110769+#include <linux/cache.h>
110770+#include <linux/rtnetlink.h>
110771+#include <linux/init.h>
110772+#include <linux/highmem.h>
110773+
110774+#include <net/protocol.h>
110775+#include <net/dst.h>
110776+#include <net/sock.h>
110777+#include <net/checksum.h>
110778+#include <net/xfrm.h>
110779+
110780+#include <asm/uaccess.h>
110781+#include <asm/system.h>
110782+
110783+static kmem_cache_t *skbuff_head_cache __read_mostly;
110784+static kmem_cache_t *skbuff_fclone_cache __read_mostly;
110785+
110786+/*
110787+ * Keep out-of-line to prevent kernel bloat.
110788+ * __builtin_return_address is not used because it is not always
110789+ * reliable.
110790+ */
110791+
110792+/**
110793+ * skb_over_panic - private function
110794+ * @skb: buffer
110795+ * @sz: size
110796+ * @here: address
110797+ *
110798+ * Out of line support code for skb_put(). Not user callable.
110799+ */
110800+void skb_over_panic(struct sk_buff *skb, int sz, void *here)
110801+{
110802+ printk(KERN_EMERG "skb_over_panic: text:%p len:%d put:%d head:%p "
110803+ "data:%p tail:%p end:%p dev:%s\n",
110804+ here, skb->len, sz, skb->head, skb->data, skb->tail, skb->end,
110805+ skb->dev ? skb->dev->name : "<NULL>");
110806+ BUG();
110807+}
110808+
110809+/**
110810+ * skb_under_panic - private function
110811+ * @skb: buffer
110812+ * @sz: size
110813+ * @here: address
110814+ *
110815+ * Out of line support code for skb_push(). Not user callable.
110816+ */
110817+
110818+void skb_under_panic(struct sk_buff *skb, int sz, void *here)
110819+{
110820+ printk(KERN_EMERG "skb_under_panic: text:%p len:%d put:%d head:%p "
110821+ "data:%p tail:%p end:%p dev:%s\n",
110822+ here, skb->len, sz, skb->head, skb->data, skb->tail, skb->end,
110823+ skb->dev ? skb->dev->name : "<NULL>");
110824+ BUG();
110825+}
110826+
110827+/* Allocate a new skbuff. We do this ourselves so we can fill in a few
110828+ * 'private' fields and also do memory statistics to find all the
110829+ * [BEEP] leaks.
110830+ *
110831+ */
110832+
110833+/**
110834+ * __alloc_skb - allocate a network buffer
110835+ * @size: size to allocate
110836+ * @gfp_mask: allocation mask
110837+ * @fclone: allocate from fclone cache instead of head cache
110838+ * and allocate a cloned (child) skb
110839+ *
110840+ * Allocate a new &sk_buff. The returned buffer has no headroom and a
110841+ * tail room of size bytes. The object has a reference count of one.
110842+ * The return is the buffer. On a failure the return is %NULL.
110843+ *
110844+ * Buffers may only be allocated from interrupts using a @gfp_mask of
110845+ * %GFP_ATOMIC.
110846+ */
110847+struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
110848+ int fclone)
110849+{
110850+ kmem_cache_t *cache;
110851+ struct skb_shared_info *shinfo;
110852+ struct sk_buff *skb;
110853+ u8 *data;
110854+
110855+ cache = fclone ? skbuff_fclone_cache : skbuff_head_cache;
110856+
110857+ /* Get the HEAD */
110858+ skb = kmem_cache_alloc(cache, gfp_mask & ~__GFP_DMA);
110859+ if (!skb)
110860+ goto out;
110861+
110862+ /* Get the DATA. Size must match skb_add_mtu(). */
110863+ size = SKB_DATA_ALIGN(size);
110864+ data = kmalloc(size + sizeof(struct skb_shared_info), gfp_mask);
110865+ if (!data)
110866+ goto nodata;
110867+
110868+ memset(skb, 0, offsetof(struct sk_buff, truesize));
110869+ skb->truesize = size + sizeof(struct sk_buff);
110870+ atomic_set(&skb->users, 1);
110871+ skb->head = data;
110872+ skb->data = data;
110873+ skb->tail = data;
110874+ skb->end = data + size;
110875+ /* make sure we initialize shinfo sequentially */
110876+ shinfo = skb_shinfo(skb);
110877+ atomic_set(&shinfo->dataref, 1);
110878+ shinfo->nr_frags = 0;
110879+ shinfo->gso_size = 0;
110880+ shinfo->gso_segs = 0;
110881+ shinfo->gso_type = 0;
110882+ shinfo->ip6_frag_id = 0;
110883+ shinfo->frag_list = NULL;
110884+
110885+ if (fclone) {
110886+ struct sk_buff *child = skb + 1;
110887+ atomic_t *fclone_ref = (atomic_t *) (child + 1);
110888+
110889+ skb->fclone = SKB_FCLONE_ORIG;
110890+ atomic_set(fclone_ref, 1);
110891+
110892+ child->fclone = SKB_FCLONE_UNAVAILABLE;
110893+ }
110894+out:
110895+ return skb;
110896+nodata:
110897+ kmem_cache_free(cache, skb);
110898+ skb = NULL;
110899+ goto out;
110900+}
110901+
110902+/**
110903+ * alloc_skb_from_cache - allocate a network buffer
110904+ * @cp: kmem_cache from which to allocate the data area
110905+ * (object size must be big enough for @size bytes + skb overheads)
110906+ * @size: size to allocate
110907+ * @gfp_mask: allocation mask
110908+ *
110909+ * Allocate a new &sk_buff. The returned buffer has no headroom and
110910+ * tail room of size bytes. The object has a reference count of one.
110911+ * The return is the buffer. On a failure the return is %NULL.
110912+ *
110913+ * Buffers may only be allocated from interrupts using a @gfp_mask of
110914+ * %GFP_ATOMIC.
110915+ */
110916+struct sk_buff *alloc_skb_from_cache(kmem_cache_t *cp,
110917+ unsigned int size,
110918+ gfp_t gfp_mask)
110919+{
110920+ struct sk_buff *skb;
110921+ u8 *data;
110922+
110923+ /* Get the HEAD */
110924+ skb = kmem_cache_alloc(skbuff_head_cache,
110925+ gfp_mask & ~__GFP_DMA);
110926+ if (!skb)
110927+ goto out;
110928+
110929+ /* Get the DATA. */
110930+ size = SKB_DATA_ALIGN(size);
110931+ data = kmem_cache_alloc(cp, gfp_mask);
110932+ if (!data)
110933+ goto nodata;
110934+
110935+ memset(skb, 0, offsetof(struct sk_buff, truesize));
110936+ skb->truesize = size + sizeof(struct sk_buff);
110937+ atomic_set(&skb->users, 1);
110938+ skb->head = data;
110939+ skb->data = data;
110940+ skb->tail = data;
110941+ skb->end = data + size;
110942+
110943+ atomic_set(&(skb_shinfo(skb)->dataref), 1);
110944+ skb_shinfo(skb)->nr_frags = 0;
110945+ skb_shinfo(skb)->gso_size = 0;
110946+ skb_shinfo(skb)->gso_segs = 0;
110947+ skb_shinfo(skb)->gso_type = 0;
110948+ skb_shinfo(skb)->frag_list = NULL;
110949+out:
110950+ return skb;
110951+nodata:
110952+ kmem_cache_free(skbuff_head_cache, skb);
110953+ skb = NULL;
110954+ goto out;
110955+}
110956+
110957+
110958+static void skb_drop_list(struct sk_buff **listp)
110959+{
110960+ struct sk_buff *list = *listp;
110961+
110962+ *listp = NULL;
110963+
110964+ do {
110965+ struct sk_buff *this = list;
110966+ list = list->next;
110967+ kfree_skb(this);
110968+ } while (list);
110969+}
110970+
110971+static inline void skb_drop_fraglist(struct sk_buff *skb)
110972+{
110973+ skb_drop_list(&skb_shinfo(skb)->frag_list);
110974+}
110975+
110976+static void skb_clone_fraglist(struct sk_buff *skb)
110977+{
110978+ struct sk_buff *list;
110979+
110980+ for (list = skb_shinfo(skb)->frag_list; list; list = list->next)
110981+ skb_get(list);
110982+}
110983+
110984+void skb_release_data(struct sk_buff *skb)
110985+{
110986+ if (!skb->cloned ||
110987+ !atomic_sub_return(skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1,
110988+ &skb_shinfo(skb)->dataref)) {
110989+ if (skb_shinfo(skb)->nr_frags) {
110990+ int i;
110991+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
110992+ put_page(skb_shinfo(skb)->frags[i].page);
110993+ }
110994+
110995+ if (skb_shinfo(skb)->frag_list)
110996+ skb_drop_fraglist(skb);
110997+
110998+ kfree(skb->head);
110999+ }
111000+}
111001+
111002+/*
111003+ * Free an skbuff by memory without cleaning the state.
111004+ */
111005+void kfree_skbmem(struct sk_buff *skb)
111006+{
111007+ struct sk_buff *other;
111008+ atomic_t *fclone_ref;
111009+
111010+ skb_release_data(skb);
111011+ switch (skb->fclone) {
111012+ case SKB_FCLONE_UNAVAILABLE:
111013+ kmem_cache_free(skbuff_head_cache, skb);
111014+ break;
111015+
111016+ case SKB_FCLONE_ORIG:
111017+ fclone_ref = (atomic_t *) (skb + 2);
111018+ if (atomic_dec_and_test(fclone_ref))
111019+ kmem_cache_free(skbuff_fclone_cache, skb);
111020+ break;
111021+
111022+ case SKB_FCLONE_CLONE:
111023+ fclone_ref = (atomic_t *) (skb + 1);
111024+ other = skb - 1;
111025+
111026+ /* The clone portion is available for
111027+ * fast-cloning again.
111028+ */
111029+ skb->fclone = SKB_FCLONE_UNAVAILABLE;
111030+
111031+ if (atomic_dec_and_test(fclone_ref))
111032+ kmem_cache_free(skbuff_fclone_cache, other);
111033+ break;
111034+ };
111035+}
111036+
111037+/**
111038+ * __kfree_skb - private function
111039+ * @skb: buffer
111040+ *
111041+ * Free an sk_buff. Release anything attached to the buffer.
111042+ * Clean the state. This is an internal helper function. Users should
111043+ * always call kfree_skb
111044+ */
111045+
111046+void __kfree_skb(struct sk_buff *skb)
111047+{
111048+ dst_release(skb->dst);
111049+#ifdef CONFIG_XFRM
111050+ secpath_put(skb->sp);
111051+#endif
111052+ if (skb->destructor) {
111053+ WARN_ON(in_irq());
111054+ skb->destructor(skb);
111055+ }
111056+#ifdef CONFIG_NETFILTER
111057+ nf_conntrack_put(skb->nfct);
111058+#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
111059+ nf_conntrack_put_reasm(skb->nfct_reasm);
111060+#endif
111061+#ifdef CONFIG_BRIDGE_NETFILTER
111062+ nf_bridge_put(skb->nf_bridge);
111063+#endif
111064+#endif
111065+/* XXX: IS this still necessary? - JHS */
111066+#ifdef CONFIG_NET_SCHED
111067+ skb->tc_index = 0;
111068+#ifdef CONFIG_NET_CLS_ACT
111069+ skb->tc_verd = 0;
111070+#endif
111071+#endif
111072+
111073+ kfree_skbmem(skb);
111074+}
111075+
111076+/**
111077+ * skb_clone - duplicate an sk_buff
111078+ * @skb: buffer to clone
111079+ * @gfp_mask: allocation priority
111080+ *
111081+ * Duplicate an &sk_buff. The new one is not owned by a socket. Both
111082+ * copies share the same packet data but not structure. The new
111083+ * buffer has a reference count of 1. If the allocation fails the
111084+ * function returns %NULL otherwise the new buffer is returned.
111085+ *
111086+ * If this function is called from an interrupt gfp_mask() must be
111087+ * %GFP_ATOMIC.
111088+ */
111089+
111090+struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
111091+{
111092+ struct sk_buff *n;
111093+
111094+ n = skb + 1;
111095+ if (skb->fclone == SKB_FCLONE_ORIG &&
111096+ n->fclone == SKB_FCLONE_UNAVAILABLE) {
111097+ atomic_t *fclone_ref = (atomic_t *) (n + 1);
111098+ n->fclone = SKB_FCLONE_CLONE;
111099+ atomic_inc(fclone_ref);
111100+ } else {
111101+ n = kmem_cache_alloc(skbuff_head_cache, gfp_mask);
111102+ if (!n)
111103+ return NULL;
111104+ n->fclone = SKB_FCLONE_UNAVAILABLE;
111105+ }
111106+
111107+#define C(x) n->x = skb->x
111108+
111109+ n->next = n->prev = NULL;
111110+ n->sk = NULL;
111111+ C(tstamp);
111112+ C(dev);
111113+ C(h);
111114+ C(nh);
111115+ C(mac);
111116+ C(dst);
111117+ dst_clone(skb->dst);
111118+ C(sp);
111119+#ifdef CONFIG_INET
111120+ secpath_get(skb->sp);
111121+#endif
111122+ memcpy(n->cb, skb->cb, sizeof(skb->cb));
111123+ C(len);
111124+ C(data_len);
111125+ C(csum);
111126+ C(local_df);
111127+ n->cloned = 1;
111128+ n->nohdr = 0;
111129+ C(pkt_type);
111130+ C(ip_summed);
111131+ C(priority);
111132+#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
111133+ C(ipvs_property);
111134+#endif
111135+ C(protocol);
111136+ n->destructor = NULL;
111137+#ifdef CONFIG_NETFILTER
111138+ C(nfmark);
111139+ C(nfct);
111140+ nf_conntrack_get(skb->nfct);
111141+ C(nfctinfo);
111142+#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
111143+ C(nfct_reasm);
111144+ nf_conntrack_get_reasm(skb->nfct_reasm);
111145+#endif
111146+#ifdef CONFIG_BRIDGE_NETFILTER
111147+ C(nf_bridge);
111148+ nf_bridge_get(skb->nf_bridge);
111149+#endif
111150+#endif /*CONFIG_NETFILTER*/
111151+#ifdef CONFIG_NET_SCHED
111152+ C(tc_index);
111153+#ifdef CONFIG_NET_CLS_ACT
111154+ n->tc_verd = SET_TC_VERD(skb->tc_verd,0);
111155+ n->tc_verd = CLR_TC_OK2MUNGE(n->tc_verd);
111156+ n->tc_verd = CLR_TC_MUNGED(n->tc_verd);
111157+ C(input_dev);
111158+#endif
111159+
111160+#endif
111161+ C(truesize);
111162+ atomic_set(&n->users, 1);
111163+ C(head);
111164+ C(data);
111165+ C(tail);
111166+ C(end);
111167+
111168+ atomic_inc(&(skb_shinfo(skb)->dataref));
111169+ skb->cloned = 1;
111170+
111171+ return n;
111172+}
111173+
111174+static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
111175+{
111176+ /*
111177+ * Shift between the two data areas in bytes
111178+ */
111179+ unsigned long offset = new->data - old->data;
111180+
111181+ new->sk = NULL;
111182+ new->dev = old->dev;
111183+ new->priority = old->priority;
111184+ new->protocol = old->protocol;
111185+ new->dst = dst_clone(old->dst);
111186+#ifdef CONFIG_INET
111187+ new->sp = secpath_get(old->sp);
111188+#endif
111189+ new->h.raw = old->h.raw + offset;
111190+ new->nh.raw = old->nh.raw + offset;
111191+ new->mac.raw = old->mac.raw + offset;
111192+ memcpy(new->cb, old->cb, sizeof(old->cb));
111193+ new->local_df = old->local_df;
111194+ new->fclone = SKB_FCLONE_UNAVAILABLE;
111195+ new->pkt_type = old->pkt_type;
111196+ new->tstamp = old->tstamp;
111197+ new->destructor = NULL;
111198+#ifdef CONFIG_NETFILTER
111199+ new->nfmark = old->nfmark;
111200+ new->nfct = old->nfct;
111201+ nf_conntrack_get(old->nfct);
111202+ new->nfctinfo = old->nfctinfo;
111203+#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
111204+ new->nfct_reasm = old->nfct_reasm;
111205+ nf_conntrack_get_reasm(old->nfct_reasm);
111206+#endif
111207+#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
111208+ new->ipvs_property = old->ipvs_property;
111209+#endif
111210+#ifdef CONFIG_BRIDGE_NETFILTER
111211+ new->nf_bridge = old->nf_bridge;
111212+ nf_bridge_get(old->nf_bridge);
111213+#endif
111214+#endif
111215+#ifdef CONFIG_NET_SCHED
111216+#ifdef CONFIG_NET_CLS_ACT
111217+ new->tc_verd = old->tc_verd;
111218+#endif
111219+ new->tc_index = old->tc_index;
111220+#endif
111221+ atomic_set(&new->users, 1);
111222+ skb_shinfo(new)->gso_size = skb_shinfo(old)->gso_size;
111223+ skb_shinfo(new)->gso_segs = skb_shinfo(old)->gso_segs;
111224+ skb_shinfo(new)->gso_type = skb_shinfo(old)->gso_type;
111225+}
111226+
111227+/**
111228+ * skb_copy - create private copy of an sk_buff
111229+ * @skb: buffer to copy
111230+ * @gfp_mask: allocation priority
111231+ *
111232+ * Make a copy of both an &sk_buff and its data. This is used when the
111233+ * caller wishes to modify the data and needs a private copy of the
111234+ * data to alter. Returns %NULL on failure or the pointer to the buffer
111235+ * on success. The returned buffer has a reference count of 1.
111236+ *
111237+ * As by-product this function converts non-linear &sk_buff to linear
111238+ * one, so that &sk_buff becomes completely private and caller is allowed
111239+ * to modify all the data of returned buffer. This means that this
111240+ * function is not recommended for use in circumstances when only
111241+ * header is going to be modified. Use pskb_copy() instead.
111242+ */
111243+
111244+struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask)
111245+{
111246+ int headerlen = skb->data - skb->head;
111247+ /*
111248+ * Allocate the copy buffer
111249+ */
111250+ struct sk_buff *n = alloc_skb(skb->end - skb->head + skb->data_len,
111251+ gfp_mask);
111252+ if (!n)
111253+ return NULL;
111254+
111255+ /* Set the data pointer */
111256+ skb_reserve(n, headerlen);
111257+ /* Set the tail pointer and length */
111258+ skb_put(n, skb->len);
111259+ n->csum = skb->csum;
111260+ n->ip_summed = skb->ip_summed;
111261+
111262+ if (skb_copy_bits(skb, -headerlen, n->head, headerlen + skb->len))
111263+ BUG();
111264+
111265+ copy_skb_header(n, skb);
111266+ return n;
111267+}
111268+
111269+
111270+/**
111271+ * pskb_copy - create copy of an sk_buff with private head.
111272+ * @skb: buffer to copy
111273+ * @gfp_mask: allocation priority
111274+ *
111275+ * Make a copy of both an &sk_buff and part of its data, located
111276+ * in header. Fragmented data remain shared. This is used when
111277+ * the caller wishes to modify only header of &sk_buff and needs
111278+ * private copy of the header to alter. Returns %NULL on failure
111279+ * or the pointer to the buffer on success.
111280+ * The returned buffer has a reference count of 1.
111281+ */
111282+
111283+struct sk_buff *pskb_copy(struct sk_buff *skb, gfp_t gfp_mask)
111284+{
111285+ /*
111286+ * Allocate the copy buffer
111287+ */
111288+ struct sk_buff *n = alloc_skb(skb->end - skb->head, gfp_mask);
111289+
111290+ if (!n)
111291+ goto out;
111292+
111293+ /* Set the data pointer */
111294+ skb_reserve(n, skb->data - skb->head);
111295+ /* Set the tail pointer and length */
111296+ skb_put(n, skb_headlen(skb));
111297+ /* Copy the bytes */
111298+ memcpy(n->data, skb->data, n->len);
111299+ n->csum = skb->csum;
111300+ n->ip_summed = skb->ip_summed;
111301+
111302+ n->truesize += skb->data_len;
111303+ n->data_len = skb->data_len;
111304+ n->len = skb->len;
111305+
111306+ if (skb_shinfo(skb)->nr_frags) {
111307+ int i;
111308+
111309+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
111310+ skb_shinfo(n)->frags[i] = skb_shinfo(skb)->frags[i];
111311+ get_page(skb_shinfo(n)->frags[i].page);
111312+ }
111313+ skb_shinfo(n)->nr_frags = i;
111314+ }
111315+
111316+ if (skb_shinfo(skb)->frag_list) {
111317+ skb_shinfo(n)->frag_list = skb_shinfo(skb)->frag_list;
111318+ skb_clone_fraglist(n);
111319+ }
111320+
111321+ copy_skb_header(n, skb);
111322+out:
111323+ return n;
111324+}
111325+
111326+/**
111327+ * pskb_expand_head - reallocate header of &sk_buff
111328+ * @skb: buffer to reallocate
111329+ * @nhead: room to add at head
111330+ * @ntail: room to add at tail
111331+ * @gfp_mask: allocation priority
111332+ *
111333+ * Expands (or creates identical copy, if &nhead and &ntail are zero)
111334+ * header of skb. &sk_buff itself is not changed. &sk_buff MUST have
111335+ * reference count of 1. Returns zero in the case of success or error,
111336+ * if expansion failed. In the last case, &sk_buff is not changed.
111337+ *
111338+ * All the pointers pointing into skb header may change and must be
111339+ * reloaded after call to this function.
111340+ */
111341+
111342+int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
111343+ gfp_t gfp_mask)
111344+{
111345+ int i;
111346+ u8 *data;
111347+ int size = nhead + (skb->end - skb->head) + ntail;
111348+ long off;
111349+
111350+ if (skb_shared(skb))
111351+ BUG();
111352+
111353+ size = SKB_DATA_ALIGN(size);
111354+
111355+ data = kmalloc(size + sizeof(struct skb_shared_info), gfp_mask);
111356+ if (!data)
111357+ goto nodata;
111358+
111359+ /* Copy only real data... and, alas, header. This should be
111360+ * optimized for the cases when header is void. */
111361+ memcpy(data + nhead, skb->head, skb->tail - skb->head);
111362+ memcpy(data + size, skb->end, sizeof(struct skb_shared_info));
111363+
111364+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
111365+ get_page(skb_shinfo(skb)->frags[i].page);
111366+
111367+ if (skb_shinfo(skb)->frag_list)
111368+ skb_clone_fraglist(skb);
111369+
111370+ skb_release_data(skb);
111371+
111372+ off = (data + nhead) - skb->head;
111373+
111374+ skb->head = data;
111375+ skb->end = data + size;
111376+ skb->data += off;
111377+ skb->tail += off;
111378+ skb->mac.raw += off;
111379+ skb->h.raw += off;
111380+ skb->nh.raw += off;
111381+ skb->cloned = 0;
111382+ skb->nohdr = 0;
111383+ atomic_set(&skb_shinfo(skb)->dataref, 1);
111384+ return 0;
111385+
111386+nodata:
111387+ return -ENOMEM;
111388+}
111389+
111390+/* Make private copy of skb with writable head and some headroom */
111391+
111392+struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom)
111393+{
111394+ struct sk_buff *skb2;
111395+ int delta = headroom - skb_headroom(skb);
111396+
111397+ if (delta <= 0)
111398+ skb2 = pskb_copy(skb, GFP_ATOMIC);
111399+ else {
111400+ skb2 = skb_clone(skb, GFP_ATOMIC);
111401+ if (skb2 && pskb_expand_head(skb2, SKB_DATA_ALIGN(delta), 0,
111402+ GFP_ATOMIC)) {
111403+ kfree_skb(skb2);
111404+ skb2 = NULL;
111405+ }
111406+ }
111407+ return skb2;
111408+}
111409+
111410+
111411+/**
111412+ * skb_copy_expand - copy and expand sk_buff
111413+ * @skb: buffer to copy
111414+ * @newheadroom: new free bytes at head
111415+ * @newtailroom: new free bytes at tail
111416+ * @gfp_mask: allocation priority
111417+ *
111418+ * Make a copy of both an &sk_buff and its data and while doing so
111419+ * allocate additional space.
111420+ *
111421+ * This is used when the caller wishes to modify the data and needs a
111422+ * private copy of the data to alter as well as more space for new fields.
111423+ * Returns %NULL on failure or the pointer to the buffer
111424+ * on success. The returned buffer has a reference count of 1.
111425+ *
111426+ * You must pass %GFP_ATOMIC as the allocation priority if this function
111427+ * is called from an interrupt.
111428+ *
111429+ * BUG ALERT: ip_summed is not copied. Why does this work? Is it used
111430+ * only by netfilter in the cases when checksum is recalculated? --ANK
111431+ */
111432+struct sk_buff *skb_copy_expand(const struct sk_buff *skb,
111433+ int newheadroom, int newtailroom,
111434+ gfp_t gfp_mask)
111435+{
111436+ /*
111437+ * Allocate the copy buffer
111438+ */
111439+ struct sk_buff *n = alloc_skb(newheadroom + skb->len + newtailroom,
111440+ gfp_mask);
111441+ int head_copy_len, head_copy_off;
111442+
111443+ if (!n)
111444+ return NULL;
111445+
111446+ skb_reserve(n, newheadroom);
111447+
111448+ /* Set the tail pointer and length */
111449+ skb_put(n, skb->len);
111450+
111451+ head_copy_len = skb_headroom(skb);
111452+ head_copy_off = 0;
111453+ if (newheadroom <= head_copy_len)
111454+ head_copy_len = newheadroom;
111455+ else
111456+ head_copy_off = newheadroom - head_copy_len;
111457+
111458+ /* Copy the linear header and data. */
111459+ if (skb_copy_bits(skb, -head_copy_len, n->head + head_copy_off,
111460+ skb->len + head_copy_len))
111461+ BUG();
111462+
111463+ copy_skb_header(n, skb);
111464+
111465+ return n;
111466+}
111467+
111468+/**
111469+ * skb_pad - zero pad the tail of an skb
111470+ * @skb: buffer to pad
111471+ * @pad: space to pad
111472+ *
111473+ * Ensure that a buffer is followed by a padding area that is zero
111474+ * filled. Used by network drivers which may DMA or transfer data
111475+ * beyond the buffer end onto the wire.
111476+ *
111477+ * May return NULL in out of memory cases.
111478+ */
111479+
111480+struct sk_buff *skb_pad(struct sk_buff *skb, int pad)
111481+{
111482+ struct sk_buff *nskb;
111483+
111484+ /* If the skbuff is non linear tailroom is always zero.. */
111485+ if (skb_tailroom(skb) >= pad) {
111486+ memset(skb->data+skb->len, 0, pad);
111487+ return skb;
111488+ }
111489+
111490+ nskb = skb_copy_expand(skb, skb_headroom(skb), skb_tailroom(skb) + pad, GFP_ATOMIC);
111491+ kfree_skb(skb);
111492+ if (nskb)
111493+ memset(nskb->data+nskb->len, 0, pad);
111494+ return nskb;
111495+}
111496+
111497+/* Trims skb to length len. It can change skb pointers.
111498+ */
111499+
111500+int ___pskb_trim(struct sk_buff *skb, unsigned int len)
111501+{
111502+ struct sk_buff **fragp;
111503+ struct sk_buff *frag;
111504+ int offset = skb_headlen(skb);
111505+ int nfrags = skb_shinfo(skb)->nr_frags;
111506+ int i;
111507+ int err;
111508+
111509+ if (skb_cloned(skb) &&
111510+ unlikely((err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC))))
111511+ return err;
111512+
111513+ i = 0;
111514+ if (offset >= len)
111515+ goto drop_pages;
111516+
111517+ for (; i < nfrags; i++) {
111518+ int end = offset + skb_shinfo(skb)->frags[i].size;
111519+
111520+ if (end < len) {
111521+ offset = end;
111522+ continue;
111523+ }
111524+
111525+ skb_shinfo(skb)->frags[i++].size = len - offset;
111526+
111527+drop_pages:
111528+ skb_shinfo(skb)->nr_frags = i;
111529+
111530+ for (; i < nfrags; i++)
111531+ put_page(skb_shinfo(skb)->frags[i].page);
111532+
111533+ if (skb_shinfo(skb)->frag_list)
111534+ skb_drop_fraglist(skb);
111535+ goto done;
111536+ }
111537+
111538+ for (fragp = &skb_shinfo(skb)->frag_list; (frag = *fragp);
111539+ fragp = &frag->next) {
111540+ int end = offset + frag->len;
111541+
111542+ if (skb_shared(frag)) {
111543+ struct sk_buff *nfrag;
111544+
111545+ nfrag = skb_clone(frag, GFP_ATOMIC);
111546+ if (unlikely(!nfrag))
111547+ return -ENOMEM;
111548+
111549+ nfrag->next = frag->next;
111550+ kfree_skb(frag);
111551+ frag = nfrag;
111552+ *fragp = frag;
111553+ }
111554+
111555+ if (end < len) {
111556+ offset = end;
111557+ continue;
111558+ }
111559+
111560+ if (end > len &&
111561+ unlikely((err = pskb_trim(frag, len - offset))))
111562+ return err;
111563+
111564+ if (frag->next)
111565+ skb_drop_list(&frag->next);
111566+ break;
111567+ }
111568+
111569+done:
111570+ if (len > skb_headlen(skb)) {
111571+ skb->data_len -= skb->len - len;
111572+ skb->len = len;
111573+ } else {
111574+ skb->len = len;
111575+ skb->data_len = 0;
111576+ skb->tail = skb->data + len;
111577+ }
111578+
111579+ return 0;
111580+}
111581+
111582+/**
111583+ * __pskb_pull_tail - advance tail of skb header
111584+ * @skb: buffer to reallocate
111585+ * @delta: number of bytes to advance tail
111586+ *
111587+ * The function makes a sense only on a fragmented &sk_buff,
111588+ * it expands header moving its tail forward and copying necessary
111589+ * data from fragmented part.
111590+ *
111591+ * &sk_buff MUST have reference count of 1.
111592+ *
111593+ * Returns %NULL (and &sk_buff does not change) if pull failed
111594+ * or value of new tail of skb in the case of success.
111595+ *
111596+ * All the pointers pointing into skb header may change and must be
111597+ * reloaded after call to this function.
111598+ */
111599+
111600+/* Moves tail of skb head forward, copying data from fragmented part,
111601+ * when it is necessary.
111602+ * 1. It may fail due to malloc failure.
111603+ * 2. It may change skb pointers.
111604+ *
111605+ * It is pretty complicated. Luckily, it is called only in exceptional cases.
111606+ */
111607+unsigned char *__pskb_pull_tail(struct sk_buff *skb, int delta)
111608+{
111609+ /* If skb has not enough free space at tail, get new one
111610+ * plus 128 bytes for future expansions. If we have enough
111611+ * room at tail, reallocate without expansion only if skb is cloned.
111612+ */
111613+ int i, k, eat = (skb->tail + delta) - skb->end;
111614+
111615+ if (eat > 0 || skb_cloned(skb)) {
111616+ if (pskb_expand_head(skb, 0, eat > 0 ? eat + 128 : 0,
111617+ GFP_ATOMIC))
111618+ return NULL;
111619+ }
111620+
111621+ if (skb_copy_bits(skb, skb_headlen(skb), skb->tail, delta))
111622+ BUG();
111623+
111624+ /* Optimization: no fragments, no reasons to preestimate
111625+ * size of pulled pages. Superb.
111626+ */
111627+ if (!skb_shinfo(skb)->frag_list)
111628+ goto pull_pages;
111629+
111630+ /* Estimate size of pulled pages. */
111631+ eat = delta;
111632+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
111633+ if (skb_shinfo(skb)->frags[i].size >= eat)
111634+ goto pull_pages;
111635+ eat -= skb_shinfo(skb)->frags[i].size;
111636+ }
111637+
111638+ /* If we need update frag list, we are in troubles.
111639+ * Certainly, it possible to add an offset to skb data,
111640+ * but taking into account that pulling is expected to
111641+ * be very rare operation, it is worth to fight against
111642+ * further bloating skb head and crucify ourselves here instead.
111643+ * Pure masohism, indeed. 8)8)
111644+ */
111645+ if (eat) {
111646+ struct sk_buff *list = skb_shinfo(skb)->frag_list;
111647+ struct sk_buff *clone = NULL;
111648+ struct sk_buff *insp = NULL;
111649+
111650+ do {
111651+ BUG_ON(!list);
111652+
111653+ if (list->len <= eat) {
111654+ /* Eaten as whole. */
111655+ eat -= list->len;
111656+ list = list->next;
111657+ insp = list;
111658+ } else {
111659+ /* Eaten partially. */
111660+
111661+ if (skb_shared(list)) {
111662+ /* Sucks! We need to fork list. :-( */
111663+ clone = skb_clone(list, GFP_ATOMIC);
111664+ if (!clone)
111665+ return NULL;
111666+ insp = list->next;
111667+ list = clone;
111668+ } else {
111669+ /* This may be pulled without
111670+ * problems. */
111671+ insp = list;
111672+ }
111673+ if (!pskb_pull(list, eat)) {
111674+ if (clone)
111675+ kfree_skb(clone);
111676+ return NULL;
111677+ }
111678+ break;
111679+ }
111680+ } while (eat);
111681+
111682+ /* Free pulled out fragments. */
111683+ while ((list = skb_shinfo(skb)->frag_list) != insp) {
111684+ skb_shinfo(skb)->frag_list = list->next;
111685+ kfree_skb(list);
111686+ }
111687+ /* And insert new clone at head. */
111688+ if (clone) {
111689+ clone->next = list;
111690+ skb_shinfo(skb)->frag_list = clone;
111691+ }
111692+ }
111693+ /* Success! Now we may commit changes to skb data. */
111694+
111695+pull_pages:
111696+ eat = delta;
111697+ k = 0;
111698+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
111699+ if (skb_shinfo(skb)->frags[i].size <= eat) {
111700+ put_page(skb_shinfo(skb)->frags[i].page);
111701+ eat -= skb_shinfo(skb)->frags[i].size;
111702+ } else {
111703+ skb_shinfo(skb)->frags[k] = skb_shinfo(skb)->frags[i];
111704+ if (eat) {
111705+ skb_shinfo(skb)->frags[k].page_offset += eat;
111706+ skb_shinfo(skb)->frags[k].size -= eat;
111707+ eat = 0;
111708+ }
111709+ k++;
111710+ }
111711+ }
111712+ skb_shinfo(skb)->nr_frags = k;
111713+
111714+ skb->tail += delta;
111715+ skb->data_len -= delta;
111716+
111717+ return skb->tail;
111718+}
111719+
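Editor's note, for orientation only and not part of the patch: callers rarely invoke __pskb_pull_tail() directly; they usually reach it through pskb_may_pull(), which falls back to this function when the requested bytes are not yet in the linear head. A minimal, hedged sketch under that assumption, with a hypothetical helper name:

#include <linux/skbuff.h>
#include <linux/ip.h>

/* Hypothetical helper: make sure an IP header is in the linear area
 * before dereferencing it.  pskb_may_pull() only ends up calling
 * __pskb_pull_tail() when skb_headlen(skb) is too short. */
static int example_pull_iphdr(struct sk_buff *skb)
{
	if (!pskb_may_pull(skb, sizeof(struct iphdr)))
		return -EINVAL;	/* pull failed; the skb is unchanged */
	/* skb->data may have been reallocated, so any cached pointers
	 * into the header must be re-derived after this point. */
	return 0;
}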
111720+/* Copy some data bits from skb to kernel buffer. */
111721+
111722+int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len)
111723+{
111724+ int i, copy;
111725+ int start = skb_headlen(skb);
111726+
111727+ if (offset > (int)skb->len - len)
111728+ goto fault;
111729+
111730+ /* Copy header. */
111731+ if ((copy = start - offset) > 0) {
111732+ if (copy > len)
111733+ copy = len;
111734+ memcpy(to, skb->data + offset, copy);
111735+ if ((len -= copy) == 0)
111736+ return 0;
111737+ offset += copy;
111738+ to += copy;
111739+ }
111740+
111741+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
111742+ int end;
111743+
111744+ BUG_TRAP(start <= offset + len);
111745+
111746+ end = start + skb_shinfo(skb)->frags[i].size;
111747+ if ((copy = end - offset) > 0) {
111748+ u8 *vaddr;
111749+
111750+ if (copy > len)
111751+ copy = len;
111752+
111753+ vaddr = kmap_skb_frag(&skb_shinfo(skb)->frags[i]);
111754+ memcpy(to,
111755+ vaddr + skb_shinfo(skb)->frags[i].page_offset+
111756+ offset - start, copy);
111757+ kunmap_skb_frag(vaddr);
111758+
111759+ if ((len -= copy) == 0)
111760+ return 0;
111761+ offset += copy;
111762+ to += copy;
111763+ }
111764+ start = end;
111765+ }
111766+
111767+ if (skb_shinfo(skb)->frag_list) {
111768+ struct sk_buff *list = skb_shinfo(skb)->frag_list;
111769+
111770+ for (; list; list = list->next) {
111771+ int end;
111772+
111773+ BUG_TRAP(start <= offset + len);
111774+
111775+ end = start + list->len;
111776+ if ((copy = end - offset) > 0) {
111777+ if (copy > len)
111778+ copy = len;
111779+ if (skb_copy_bits(list, offset - start,
111780+ to, copy))
111781+ goto fault;
111782+ if ((len -= copy) == 0)
111783+ return 0;
111784+ offset += copy;
111785+ to += copy;
111786+ }
111787+ start = end;
111788+ }
111789+ }
111790+ if (!len)
111791+ return 0;
111792+
111793+fault:
111794+ return -EFAULT;
111795+}
111796+
111797+/**
111798+ * skb_store_bits - store bits from kernel buffer to skb
111799+ * @skb: destination buffer
111800+ * @offset: offset in destination
111801+ * @from: source buffer
111802+ * @len: number of bytes to copy
111803+ *
111804+ * Copy the specified number of bytes from the source buffer to the
111805+ * destination skb. This function handles all the messy bits of
111806+ * traversing fragment lists and such.
111807+ */
111808+
111809+int skb_store_bits(const struct sk_buff *skb, int offset, void *from, int len)
111810+{
111811+ int i, copy;
111812+ int start = skb_headlen(skb);
111813+
111814+ if (offset > (int)skb->len - len)
111815+ goto fault;
111816+
111817+ if ((copy = start - offset) > 0) {
111818+ if (copy > len)
111819+ copy = len;
111820+ memcpy(skb->data + offset, from, copy);
111821+ if ((len -= copy) == 0)
111822+ return 0;
111823+ offset += copy;
111824+ from += copy;
111825+ }
111826+
111827+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
111828+ skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
111829+ int end;
111830+
111831+ BUG_TRAP(start <= offset + len);
111832+
111833+ end = start + frag->size;
111834+ if ((copy = end - offset) > 0) {
111835+ u8 *vaddr;
111836+
111837+ if (copy > len)
111838+ copy = len;
111839+
111840+ vaddr = kmap_skb_frag(frag);
111841+ memcpy(vaddr + frag->page_offset + offset - start,
111842+ from, copy);
111843+ kunmap_skb_frag(vaddr);
111844+
111845+ if ((len -= copy) == 0)
111846+ return 0;
111847+ offset += copy;
111848+ from += copy;
111849+ }
111850+ start = end;
111851+ }
111852+
111853+ if (skb_shinfo(skb)->frag_list) {
111854+ struct sk_buff *list = skb_shinfo(skb)->frag_list;
111855+
111856+ for (; list; list = list->next) {
111857+ int end;
111858+
111859+ BUG_TRAP(start <= offset + len);
111860+
111861+ end = start + list->len;
111862+ if ((copy = end - offset) > 0) {
111863+ if (copy > len)
111864+ copy = len;
111865+ if (skb_store_bits(list, offset - start,
111866+ from, copy))
111867+ goto fault;
111868+ if ((len -= copy) == 0)
111869+ return 0;
111870+ offset += copy;
111871+ from += copy;
111872+ }
111873+ start = end;
111874+ }
111875+ }
111876+ if (!len)
111877+ return 0;
111878+
111879+fault:
111880+ return -EFAULT;
111881+}
111882+
111883+EXPORT_SYMBOL(skb_store_bits);
111884+
111885+/* Checksum skb data. */
111886+
111887+unsigned int skb_checksum(const struct sk_buff *skb, int offset,
111888+ int len, unsigned int csum)
111889+{
111890+ int start = skb_headlen(skb);
111891+ int i, copy = start - offset;
111892+ int pos = 0;
111893+
111894+ /* Checksum header. */
111895+ if (copy > 0) {
111896+ if (copy > len)
111897+ copy = len;
111898+ csum = csum_partial(skb->data + offset, copy, csum);
111899+ if ((len -= copy) == 0)
111900+ return csum;
111901+ offset += copy;
111902+ pos = copy;
111903+ }
111904+
111905+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
111906+ int end;
111907+
111908+ BUG_TRAP(start <= offset + len);
111909+
111910+ end = start + skb_shinfo(skb)->frags[i].size;
111911+ if ((copy = end - offset) > 0) {
111912+ unsigned int csum2;
111913+ u8 *vaddr;
111914+ skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
111915+
111916+ if (copy > len)
111917+ copy = len;
111918+ vaddr = kmap_skb_frag(frag);
111919+ csum2 = csum_partial(vaddr + frag->page_offset +
111920+ offset - start, copy, 0);
111921+ kunmap_skb_frag(vaddr);
111922+ csum = csum_block_add(csum, csum2, pos);
111923+ if (!(len -= copy))
111924+ return csum;
111925+ offset += copy;
111926+ pos += copy;
111927+ }
111928+ start = end;
111929+ }
111930+
111931+ if (skb_shinfo(skb)->frag_list) {
111932+ struct sk_buff *list = skb_shinfo(skb)->frag_list;
111933+
111934+ for (; list; list = list->next) {
111935+ int end;
111936+
111937+ BUG_TRAP(start <= offset + len);
111938+
111939+ end = start + list->len;
111940+ if ((copy = end - offset) > 0) {
111941+ unsigned int csum2;
111942+ if (copy > len)
111943+ copy = len;
111944+ csum2 = skb_checksum(list, offset - start,
111945+ copy, 0);
111946+ csum = csum_block_add(csum, csum2, pos);
111947+ if ((len -= copy) == 0)
111948+ return csum;
111949+ offset += copy;
111950+ pos += copy;
111951+ }
111952+ start = end;
111953+ }
111954+ }
111955+ BUG_ON(len);
111956+
111957+ return csum;
111958+}
111959+
111960+/* Both of above in one bottle. */
111961+
111962+unsigned int skb_copy_and_csum_bits(const struct sk_buff *skb, int offset,
111963+ u8 *to, int len, unsigned int csum)
111964+{
111965+ int start = skb_headlen(skb);
111966+ int i, copy = start - offset;
111967+ int pos = 0;
111968+
111969+ /* Copy header. */
111970+ if (copy > 0) {
111971+ if (copy > len)
111972+ copy = len;
111973+ csum = csum_partial_copy_nocheck(skb->data + offset, to,
111974+ copy, csum);
111975+ if ((len -= copy) == 0)
111976+ return csum;
111977+ offset += copy;
111978+ to += copy;
111979+ pos = copy;
111980+ }
111981+
111982+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
111983+ int end;
111984+
111985+ BUG_TRAP(start <= offset + len);
111986+
111987+ end = start + skb_shinfo(skb)->frags[i].size;
111988+ if ((copy = end - offset) > 0) {
111989+ unsigned int csum2;
111990+ u8 *vaddr;
111991+ skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
111992+
111993+ if (copy > len)
111994+ copy = len;
111995+ vaddr = kmap_skb_frag(frag);
111996+ csum2 = csum_partial_copy_nocheck(vaddr +
111997+ frag->page_offset +
111998+ offset - start, to,
111999+ copy, 0);
112000+ kunmap_skb_frag(vaddr);
112001+ csum = csum_block_add(csum, csum2, pos);
112002+ if (!(len -= copy))
112003+ return csum;
112004+ offset += copy;
112005+ to += copy;
112006+ pos += copy;
112007+ }
112008+ start = end;
112009+ }
112010+
112011+ if (skb_shinfo(skb)->frag_list) {
112012+ struct sk_buff *list = skb_shinfo(skb)->frag_list;
112013+
112014+ for (; list; list = list->next) {
112015+ unsigned int csum2;
112016+ int end;
112017+
112018+ BUG_TRAP(start <= offset + len);
112019+
112020+ end = start + list->len;
112021+ if ((copy = end - offset) > 0) {
112022+ if (copy > len)
112023+ copy = len;
112024+ csum2 = skb_copy_and_csum_bits(list,
112025+ offset - start,
112026+ to, copy, 0);
112027+ csum = csum_block_add(csum, csum2, pos);
112028+ if ((len -= copy) == 0)
112029+ return csum;
112030+ offset += copy;
112031+ to += copy;
112032+ pos += copy;
112033+ }
112034+ start = end;
112035+ }
112036+ }
112037+ BUG_ON(len);
112038+ return csum;
112039+}
112040+
112041+void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to)
112042+{
112043+ unsigned int csum;
112044+ long csstart;
112045+
112046+ if (skb->ip_summed == CHECKSUM_HW)
112047+ csstart = skb->h.raw - skb->data;
112048+ else
112049+ csstart = skb_headlen(skb);
112050+
112051+ BUG_ON(csstart > skb_headlen(skb));
112052+
112053+ memcpy(to, skb->data, csstart);
112054+
112055+ csum = 0;
112056+ if (csstart != skb->len)
112057+ csum = skb_copy_and_csum_bits(skb, csstart, to + csstart,
112058+ skb->len - csstart, 0);
112059+
112060+ if (skb->ip_summed == CHECKSUM_HW) {
112061+ long csstuff = csstart + skb->csum;
112062+
112063+ *((unsigned short *)(to + csstuff)) = csum_fold(csum);
112064+ }
112065+}
112066+
112067+/**
112068+ * skb_dequeue - remove from the head of the queue
112069+ * @list: list to dequeue from
112070+ *
112071+ * Remove the head of the list. The list lock is taken so the function
112072+ * may be used safely with other locking list functions. The head item is
112073+ * returned or %NULL if the list is empty.
112074+ */
112075+
112076+struct sk_buff *skb_dequeue(struct sk_buff_head *list)
112077+{
112078+ unsigned long flags;
112079+ struct sk_buff *result;
112080+
112081+ spin_lock_irqsave(&list->lock, flags);
112082+ result = __skb_dequeue(list);
112083+ spin_unlock_irqrestore(&list->lock, flags);
112084+ return result;
112085+}
112086+
112087+/**
112088+ * skb_dequeue_tail - remove from the tail of the queue
112089+ * @list: list to dequeue from
112090+ *
112091+ * Remove the tail of the list. The list lock is taken so the function
112092+ * may be used safely with other locking list functions. The tail item is
112093+ * returned or %NULL if the list is empty.
112094+ */
112095+struct sk_buff *skb_dequeue_tail(struct sk_buff_head *list)
112096+{
112097+ unsigned long flags;
112098+ struct sk_buff *result;
112099+
112100+ spin_lock_irqsave(&list->lock, flags);
112101+ result = __skb_dequeue_tail(list);
112102+ spin_unlock_irqrestore(&list->lock, flags);
112103+ return result;
112104+}
112105+
112106+/**
112107+ * skb_queue_purge - empty a list
112108+ * @list: list to empty
112109+ *
112110+ * Delete all buffers on an &sk_buff list. Each buffer is removed from
112111+ * the list and one reference dropped. This function takes the list
112112+ * lock and is atomic with respect to other list locking functions.
112113+ */
112114+void skb_queue_purge(struct sk_buff_head *list)
112115+{
112116+ struct sk_buff *skb;
112117+ while ((skb = skb_dequeue(list)) != NULL)
112118+ kfree_skb(skb);
112119+}
112120+
112121+/**
112122+ * skb_queue_head - queue a buffer at the list head
112123+ * @list: list to use
112124+ * @newsk: buffer to queue
112125+ *
112126+ * Queue a buffer at the start of the list. This function takes the
112127+ * list lock and can be used safely with other locking &sk_buff
112128+ * functions.
112129+ *
112130+ * A buffer cannot be placed on two lists at the same time.
112131+ */
112132+void skb_queue_head(struct sk_buff_head *list, struct sk_buff *newsk)
112133+{
112134+ unsigned long flags;
112135+
112136+ spin_lock_irqsave(&list->lock, flags);
112137+ __skb_queue_head(list, newsk);
112138+ spin_unlock_irqrestore(&list->lock, flags);
112139+}
112140+
112141+/**
112142+ * skb_queue_tail - queue a buffer at the list tail
112143+ * @list: list to use
112144+ * @newsk: buffer to queue
112145+ *
112146+ * Queue a buffer at the tail of the list. This function takes the
112147+ * list lock and can be used safely with other locking &sk_buff
112148+ * functions.
112149+ *
112150+ * A buffer cannot be placed on two lists at the same time.
112151+ */
112152+void skb_queue_tail(struct sk_buff_head *list, struct sk_buff *newsk)
112153+{
112154+ unsigned long flags;
112155+
112156+ spin_lock_irqsave(&list->lock, flags);
112157+ __skb_queue_tail(list, newsk);
112158+ spin_unlock_irqrestore(&list->lock, flags);
112159+}
112160+
112161+/**
112162+ * skb_unlink - remove a buffer from a list
112163+ * @skb: buffer to remove
112164+ * @list: list to use
112165+ *
112166+ * Remove a packet from a list. The list locks are taken and this
112167+ * function is atomic with respect to other list locked calls.
112168+ *
112169+ * You must know what list the SKB is on.
112170+ */
112171+void skb_unlink(struct sk_buff *skb, struct sk_buff_head *list)
112172+{
112173+ unsigned long flags;
112174+
112175+ spin_lock_irqsave(&list->lock, flags);
112176+ __skb_unlink(skb, list);
112177+ spin_unlock_irqrestore(&list->lock, flags);
112178+}
112179+
112180+/**
112181+ * skb_append - append a buffer
112182+ * @old: buffer to insert after
112183+ * @newsk: buffer to insert
112184+ * @list: list to use
112185+ *
112186+ * Place a packet after a given packet in a list. The list locks are taken
112187+ * and this function is atomic with respect to other list locked calls.
112188+ * A buffer cannot be placed on two lists at the same time.
112189+ */
112190+void skb_append(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head *list)
112191+{
112192+ unsigned long flags;
112193+
112194+ spin_lock_irqsave(&list->lock, flags);
112195+ __skb_append(old, newsk, list);
112196+ spin_unlock_irqrestore(&list->lock, flags);
112197+}
112198+
112199+
112200+/**
112201+ * skb_insert - insert a buffer
112202+ * @old: buffer to insert before
112203+ * @newsk: buffer to insert
112204+ * @list: list to use
112205+ *
112206+ * Place a packet before a given packet in a list. The list locks are
112207+ * taken and this function is atomic with respect to other list locked
112208+ * calls.
112209+ *
112210+ * A buffer cannot be placed on two lists at the same time.
112211+ */
112212+void skb_insert(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head *list)
112213+{
112214+ unsigned long flags;
112215+
112216+ spin_lock_irqsave(&list->lock, flags);
112217+ __skb_insert(newsk, old->prev, old, list);
112218+ spin_unlock_irqrestore(&list->lock, flags);
112219+}
112220+
112221+#if 0
112222+/*
112223+ * Tune the memory allocator for a new MTU size.
112224+ */
112225+void skb_add_mtu(int mtu)
112226+{
112227+ /* Must match allocation in alloc_skb */
112228+ mtu = SKB_DATA_ALIGN(mtu) + sizeof(struct skb_shared_info);
112229+
112230+ kmem_add_cache_size(mtu);
112231+}
112232+#endif
112233+
112234+static inline void skb_split_inside_header(struct sk_buff *skb,
112235+ struct sk_buff* skb1,
112236+ const u32 len, const int pos)
112237+{
112238+ int i;
112239+
112240+ memcpy(skb_put(skb1, pos - len), skb->data + len, pos - len);
112241+
112242+ /* And move data appendix as is. */
112243+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
112244+ skb_shinfo(skb1)->frags[i] = skb_shinfo(skb)->frags[i];
112245+
112246+ skb_shinfo(skb1)->nr_frags = skb_shinfo(skb)->nr_frags;
112247+ skb_shinfo(skb)->nr_frags = 0;
112248+ skb1->data_len = skb->data_len;
112249+ skb1->len += skb1->data_len;
112250+ skb->data_len = 0;
112251+ skb->len = len;
112252+ skb->tail = skb->data + len;
112253+}
112254+
112255+static inline void skb_split_no_header(struct sk_buff *skb,
112256+ struct sk_buff* skb1,
112257+ const u32 len, int pos)
112258+{
112259+ int i, k = 0;
112260+ const int nfrags = skb_shinfo(skb)->nr_frags;
112261+
112262+ skb_shinfo(skb)->nr_frags = 0;
112263+ skb1->len = skb1->data_len = skb->len - len;
112264+ skb->len = len;
112265+ skb->data_len = len - pos;
112266+
112267+ for (i = 0; i < nfrags; i++) {
112268+ int size = skb_shinfo(skb)->frags[i].size;
112269+
112270+ if (pos + size > len) {
112271+ skb_shinfo(skb1)->frags[k] = skb_shinfo(skb)->frags[i];
112272+
112273+ if (pos < len) {
112274+ /* Split frag.
112275+ * We have two variants in this case:
112276+ * 1. Move all the frag to the second
112277+ * part, if it is possible. F.e.
112278+ * this approach is mandatory for TUX,
112279+ * where splitting is expensive.
112280+ * 2. Split accurately. This is what we do here.
112281+ */
112282+ get_page(skb_shinfo(skb)->frags[i].page);
112283+ skb_shinfo(skb1)->frags[0].page_offset += len - pos;
112284+ skb_shinfo(skb1)->frags[0].size -= len - pos;
112285+ skb_shinfo(skb)->frags[i].size = len - pos;
112286+ skb_shinfo(skb)->nr_frags++;
112287+ }
112288+ k++;
112289+ } else
112290+ skb_shinfo(skb)->nr_frags++;
112291+ pos += size;
112292+ }
112293+ skb_shinfo(skb1)->nr_frags = k;
112294+}
112295+
112296+/**
112297+ * skb_split - Split a fragmented skb into two parts at length len.
112298+ * @skb: the buffer to split
112299+ * @skb1: the buffer to receive the second part
112300+ * @len: new length for skb
112301+ */
112302+void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len)
112303+{
112304+ int pos = skb_headlen(skb);
112305+
112306+ if (len < pos) /* Split line is inside header. */
112307+ skb_split_inside_header(skb, skb1, len, pos);
112308+ else /* Second chunk has no header, nothing to copy. */
112309+ skb_split_no_header(skb, skb1, len, pos);
112310+}
112311+
112312+/**
112313+ * skb_prepare_seq_read - Prepare a sequential read of skb data
112314+ * @skb: the buffer to read
112315+ * @from: lower offset of data to be read
112316+ * @to: upper offset of data to be read
112317+ * @st: state variable
112318+ *
112319+ * Initializes the specified state variable. Must be called before
112320+ * invoking skb_seq_read() for the first time.
112321+ */
112322+void skb_prepare_seq_read(struct sk_buff *skb, unsigned int from,
112323+ unsigned int to, struct skb_seq_state *st)
112324+{
112325+ st->lower_offset = from;
112326+ st->upper_offset = to;
112327+ st->root_skb = st->cur_skb = skb;
112328+ st->frag_idx = st->stepped_offset = 0;
112329+ st->frag_data = NULL;
112330+}
112331+
112332+/**
112333+ * skb_seq_read - Sequentially read skb data
112334+ * @consumed: number of bytes consumed by the caller so far
112335+ * @data: destination pointer for data to be returned
112336+ * @st: state variable
112337+ *
112338+ * Reads a block of skb data at &consumed relative to the
112339+ * lower offset specified to skb_prepare_seq_read(). Assigns
112340+ * the head of the data block to &data and returns the length
112341+ * of the block or 0 if the end of the skb data or the upper
112342+ * offset has been reached.
112343+ *
112344+ * The caller is not required to consume all of the data
112345+ * returned, i.e. &consumed is typically set to the number
112346+ * of bytes already consumed and the next call to
112347+ * skb_seq_read() will return the remaining part of the block.
112348+ *
112349+ * Note: The size of each block of data returned can be arbitrary;
112350+ * this limitation is the cost of zerocopy sequential
112351+ * reads of potentially non-linear data.
112352+ *
112353+ * Note: Fragment lists within fragments are not implemented
112354+ * at the moment, state->root_skb could be replaced with
112355+ * a stack for this purpose.
112356+ */
112357+unsigned int skb_seq_read(unsigned int consumed, const u8 **data,
112358+ struct skb_seq_state *st)
112359+{
112360+ unsigned int block_limit, abs_offset = consumed + st->lower_offset;
112361+ skb_frag_t *frag;
112362+
112363+ if (unlikely(abs_offset >= st->upper_offset))
112364+ return 0;
112365+
112366+next_skb:
112367+ block_limit = skb_headlen(st->cur_skb);
112368+
112369+ if (abs_offset < block_limit) {
112370+ *data = st->cur_skb->data + abs_offset;
112371+ return block_limit - abs_offset;
112372+ }
112373+
112374+ if (st->frag_idx == 0 && !st->frag_data)
112375+ st->stepped_offset += skb_headlen(st->cur_skb);
112376+
112377+ while (st->frag_idx < skb_shinfo(st->cur_skb)->nr_frags) {
112378+ frag = &skb_shinfo(st->cur_skb)->frags[st->frag_idx];
112379+ block_limit = frag->size + st->stepped_offset;
112380+
112381+ if (abs_offset < block_limit) {
112382+ if (!st->frag_data)
112383+ st->frag_data = kmap_skb_frag(frag);
112384+
112385+ *data = (u8 *) st->frag_data + frag->page_offset +
112386+ (abs_offset - st->stepped_offset);
112387+
112388+ return block_limit - abs_offset;
112389+ }
112390+
112391+ if (st->frag_data) {
112392+ kunmap_skb_frag(st->frag_data);
112393+ st->frag_data = NULL;
112394+ }
112395+
112396+ st->frag_idx++;
112397+ st->stepped_offset += frag->size;
112398+ }
112399+
112400+ if (st->cur_skb->next) {
112401+ st->cur_skb = st->cur_skb->next;
112402+ st->frag_idx = 0;
112403+ goto next_skb;
112404+ } else if (st->root_skb == st->cur_skb &&
112405+ skb_shinfo(st->root_skb)->frag_list) {
112406+ st->cur_skb = skb_shinfo(st->root_skb)->frag_list;
112407+ goto next_skb;
112408+ }
112409+
112410+ return 0;
112411+}
112412+
112413+/**
112414+ * skb_abort_seq_read - Abort a sequential read of skb data
112415+ * @st: state variable
112416+ *
112417+ * Must be called if the read was abandoned before skb_seq_read()
112418+ * returned 0 (i.e. the end of the skb data was not reached).
112419+ */
112420+void skb_abort_seq_read(struct skb_seq_state *st)
112421+{
112422+ if (st->frag_data)
112423+ kunmap_skb_frag(st->frag_data);
112424+}
112425+
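Editor's note, illustrative only and not part of the patch: the calling protocol documented above is prepare, read in a loop, and abort only if the walk stops before skb_seq_read() returns 0. A hedged sketch with a hypothetical consumer function:

#include <linux/skbuff.h>

/* Walk bytes [from, to) of an skb without linearizing it. */
static void example_walk(struct sk_buff *skb, unsigned int from,
			 unsigned int to)
{
	struct skb_seq_state st;
	const u8 *data;
	unsigned int consumed = 0, len;

	skb_prepare_seq_read(skb, from, to, &st);
	while ((len = skb_seq_read(consumed, &data, &st)) != 0) {
		/* ... process len bytes at data ... */
		consumed += len;
	}
	/* skb_seq_read() returned 0 here, so the fragment mapping is
	 * already released; skb_abort_seq_read(&st) would be required
	 * only if we had broken out of the loop early. */
}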
112426+#define TS_SKB_CB(state) ((struct skb_seq_state *) &((state)->cb))
112427+
112428+static unsigned int skb_ts_get_next_block(unsigned int offset, const u8 **text,
112429+ struct ts_config *conf,
112430+ struct ts_state *state)
112431+{
112432+ return skb_seq_read(offset, text, TS_SKB_CB(state));
112433+}
112434+
112435+static void skb_ts_finish(struct ts_config *conf, struct ts_state *state)
112436+{
112437+ skb_abort_seq_read(TS_SKB_CB(state));
112438+}
112439+
112440+/**
112441+ * skb_find_text - Find a text pattern in skb data
112442+ * @skb: the buffer to look in
112443+ * @from: search offset
112444+ * @to: search limit
112445+ * @config: textsearch configuration
112446+ * @state: uninitialized textsearch state variable
112447+ *
112448+ * Finds a pattern in the skb data according to the specified
112449+ * textsearch configuration. Use textsearch_next() to retrieve
112450+ * subsequent occurrences of the pattern. Returns the offset
112451+ * to the first occurrence or UINT_MAX if no match was found.
112452+ */
112453+unsigned int skb_find_text(struct sk_buff *skb, unsigned int from,
112454+ unsigned int to, struct ts_config *config,
112455+ struct ts_state *state)
112456+{
112457+ config->get_next_block = skb_ts_get_next_block;
112458+ config->finish = skb_ts_finish;
112459+
112460+ skb_prepare_seq_read(skb, from, to, TS_SKB_CB(state));
112461+
112462+ return textsearch_find(config, state);
112463+}
112464+
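Editor's note, illustrative only and not part of the patch: skb_find_text() is driven with the textsearch API. A hedged sketch; the "kmp" algorithm, the "HTTP" pattern, and the helper name are arbitrary examples, not anything mandated by this code:

#include <linux/skbuff.h>
#include <linux/textsearch.h>
#include <linux/err.h>

/* Return the offset of the first "HTTP" in the skb, or UINT_MAX. */
static unsigned int example_find(struct sk_buff *skb)
{
	struct ts_config *conf;
	struct ts_state state;
	unsigned int pos;

	conf = textsearch_prepare("kmp", "HTTP", 4, GFP_ATOMIC, TS_AUTOLOAD);
	if (IS_ERR(conf))
		return UINT_MAX;

	pos = skb_find_text(skb, 0, skb->len, conf, &state);
	textsearch_destroy(conf);
	return pos;
}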
112465+/**
112466+ * skb_append_datato_frags: - append the user data to a skb
112467+ * @sk: sock structure
112468+ * @skb: skb structure to be appended with user data.
112469+ * @getfrag: callback function to be used for getting the user data
112470+ * @from: pointer to user message iov
112471+ * @length: length of the iov message
112472+ *
112473+ * Description: This procedure appends the user data to the fragment part
112474+ * of the skb. If any page allocation fails, it returns -ENOMEM.
112475+ */
112476+int skb_append_datato_frags(struct sock *sk, struct sk_buff *skb,
112477+ int (*getfrag)(void *from, char *to, int offset,
112478+ int len, int odd, struct sk_buff *skb),
112479+ void *from, int length)
112480+{
112481+ int frg_cnt = 0;
112482+ skb_frag_t *frag = NULL;
112483+ struct page *page = NULL;
112484+ int copy, left;
112485+ int offset = 0;
112486+ int ret;
112487+
112488+ do {
112489+ /* Return error if we don't have space for new frag */
112490+ frg_cnt = skb_shinfo(skb)->nr_frags;
112491+ if (frg_cnt >= MAX_SKB_FRAGS)
112492+ return -EFAULT;
112493+
112494+ /* allocate a new page for next frag */
112495+ page = alloc_pages(sk->sk_allocation, 0);
112496+
112497+ /* If alloc_page fails just return failure and caller will
112498+ * free previous allocated pages by doing kfree_skb()
112499+ */
112500+ if (page == NULL)
112501+ return -ENOMEM;
112502+
112503+ /* initialize the next frag */
112504+ sk->sk_sndmsg_page = page;
112505+ sk->sk_sndmsg_off = 0;
112506+ skb_fill_page_desc(skb, frg_cnt, page, 0, 0);
112507+ skb->truesize += PAGE_SIZE;
112508+ atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc);
112509+
112510+ /* get the new initialized frag */
112511+ frg_cnt = skb_shinfo(skb)->nr_frags;
112512+ frag = &skb_shinfo(skb)->frags[frg_cnt - 1];
112513+
112514+ /* copy the user data to page */
112515+ left = PAGE_SIZE - frag->page_offset;
112516+ copy = (length > left)? left : length;
112517+
112518+ ret = getfrag(from, (page_address(frag->page) +
112519+ frag->page_offset + frag->size),
112520+ offset, copy, 0, skb);
112521+ if (ret < 0)
112522+ return -EFAULT;
112523+
112524+ /* copy was successful so update the size parameters */
112525+ sk->sk_sndmsg_off += copy;
112526+ frag->size += copy;
112527+ skb->len += copy;
112528+ skb->data_len += copy;
112529+ offset += copy;
112530+ length -= copy;
112531+
112532+ } while (length > 0);
112533+
112534+ return 0;
112535+}
112536+
112537+/**
112538+ * skb_segment - Perform protocol segmentation on skb.
112539+ * @skb: buffer to segment
112540+ * @features: features for the output path (see dev->features)
112541+ *
112542+ * This function performs segmentation on the given skb. It returns
112543+ * a list of the resulting segments, or an ERR_PTR() value when an
112544+ * error is encountered.
112545+ */
112546+struct sk_buff *skb_segment(struct sk_buff *skb, int features)
112547+{
112548+ struct sk_buff *segs = NULL;
112549+ struct sk_buff *tail = NULL;
112550+ unsigned int mss = skb_shinfo(skb)->gso_size;
112551+ unsigned int doffset = skb->data - skb->mac.raw;
112552+ unsigned int offset = doffset;
112553+ unsigned int headroom;
112554+ unsigned int len;
112555+ int sg = features & NETIF_F_SG;
112556+ int nfrags = skb_shinfo(skb)->nr_frags;
112557+ int err = -ENOMEM;
112558+ int i = 0;
112559+ int pos;
112560+
112561+ __skb_push(skb, doffset);
112562+ headroom = skb_headroom(skb);
112563+ pos = skb_headlen(skb);
112564+
112565+ do {
112566+ struct sk_buff *nskb;
112567+ skb_frag_t *frag;
112568+ int hsize, nsize;
112569+ int k;
112570+ int size;
112571+
112572+ len = skb->len - offset;
112573+ if (len > mss)
112574+ len = mss;
112575+
112576+ hsize = skb_headlen(skb) - offset;
112577+ if (hsize < 0)
112578+ hsize = 0;
112579+ nsize = hsize + doffset;
112580+ if (nsize > len + doffset || !sg)
112581+ nsize = len + doffset;
112582+
112583+ nskb = alloc_skb(nsize + headroom, GFP_ATOMIC);
112584+ if (unlikely(!nskb))
112585+ goto err;
112586+
112587+ if (segs)
112588+ tail->next = nskb;
112589+ else
112590+ segs = nskb;
112591+ tail = nskb;
112592+
112593+ nskb->dev = skb->dev;
112594+ nskb->priority = skb->priority;
112595+ nskb->protocol = skb->protocol;
112596+ nskb->dst = dst_clone(skb->dst);
112597+ memcpy(nskb->cb, skb->cb, sizeof(skb->cb));
112598+ nskb->pkt_type = skb->pkt_type;
112599+ nskb->mac_len = skb->mac_len;
112600+
112601+ skb_reserve(nskb, headroom);
112602+ nskb->mac.raw = nskb->data;
112603+ nskb->nh.raw = nskb->data + skb->mac_len;
112604+ nskb->h.raw = nskb->nh.raw + (skb->h.raw - skb->nh.raw);
112605+ memcpy(skb_put(nskb, doffset), skb->data, doffset);
112606+
112607+ if (!sg) {
112608+ nskb->csum = skb_copy_and_csum_bits(skb, offset,
112609+ skb_put(nskb, len),
112610+ len, 0);
112611+ continue;
112612+ }
112613+
112614+ frag = skb_shinfo(nskb)->frags;
112615+ k = 0;
112616+
112617+ nskb->ip_summed = CHECKSUM_HW;
112618+ nskb->csum = skb->csum;
112619+ memcpy(skb_put(nskb, hsize), skb->data + offset, hsize);
112620+
112621+ while (pos < offset + len) {
112622+ BUG_ON(i >= nfrags);
112623+
112624+ *frag = skb_shinfo(skb)->frags[i];
112625+ get_page(frag->page);
112626+ size = frag->size;
112627+
112628+ if (pos < offset) {
112629+ frag->page_offset += offset - pos;
112630+ frag->size -= offset - pos;
112631+ }
112632+
112633+ k++;
112634+
112635+ if (pos + size <= offset + len) {
112636+ i++;
112637+ pos += size;
112638+ } else {
112639+ frag->size -= pos + size - (offset + len);
112640+ break;
112641+ }
112642+
112643+ frag++;
112644+ }
112645+
112646+ skb_shinfo(nskb)->nr_frags = k;
112647+ nskb->data_len = len - hsize;
112648+ nskb->len += nskb->data_len;
112649+ nskb->truesize += nskb->data_len;
112650+ } while ((offset += len) < skb->len);
112651+
112652+ return segs;
112653+
112654+err:
112655+ while ((skb = segs)) {
112656+ segs = skb->next;
112657+ kfree_skb(skb);
112658+ }
112659+ return ERR_PTR(err);
112660+}
112661+
112662+EXPORT_SYMBOL_GPL(skb_segment);
112663+
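Editor's note, illustrative only and not part of the patch: the list returned by skb_segment() is consumed segment by segment. The sketch below mirrors the xfrm4_output_finish() hunk later in this patch; the helper name and the xmit callback are hypothetical stand-ins for a real output path:

#include <linux/skbuff.h>
#include <linux/err.h>

/* Segment a GSO skb and hand each piece to xmit(); hedged sketch only. */
static int example_xmit_segments(struct sk_buff *skb, int features,
				 int (*xmit)(struct sk_buff *))
{
	struct sk_buff *segs;
	int err = 0;

	segs = skb_segment(skb, features);
	kfree_skb(skb);			/* the original skb is not consumed */
	if (IS_ERR(segs))
		return PTR_ERR(segs);

	while (segs) {
		struct sk_buff *next = segs->next;

		segs->next = NULL;
		if (!err)
			err = xmit(segs);	/* assumed to consume segs */
		else
			kfree_skb(segs);	/* drop the remainder on error */
		segs = next;
	}
	return err;
}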
112664+void __init skb_init(void)
112665+{
112666+ skbuff_head_cache = kmem_cache_create("skbuff_head_cache",
112667+ sizeof(struct sk_buff),
112668+ 0,
112669+ SLAB_HWCACHE_ALIGN,
112670+ NULL, NULL);
112671+ if (!skbuff_head_cache)
112672+ panic("cannot create skbuff cache");
112673+
112674+ skbuff_fclone_cache = kmem_cache_create("skbuff_fclone_cache",
112675+ (2*sizeof(struct sk_buff)) +
112676+ sizeof(atomic_t),
112677+ 0,
112678+ SLAB_HWCACHE_ALIGN,
112679+ NULL, NULL);
112680+ if (!skbuff_fclone_cache)
112681+ panic("cannot create skbuff cache");
112682+}
112683+
112684+EXPORT_SYMBOL(___pskb_trim);
112685+EXPORT_SYMBOL(__kfree_skb);
112686+EXPORT_SYMBOL(__pskb_pull_tail);
112687+EXPORT_SYMBOL(__alloc_skb);
112688+EXPORT_SYMBOL(pskb_copy);
112689+EXPORT_SYMBOL(pskb_expand_head);
112690+EXPORT_SYMBOL(skb_checksum);
112691+EXPORT_SYMBOL(skb_clone);
112692+EXPORT_SYMBOL(skb_clone_fraglist);
112693+EXPORT_SYMBOL(skb_copy);
112694+EXPORT_SYMBOL(skb_copy_and_csum_bits);
112695+EXPORT_SYMBOL(skb_copy_and_csum_dev);
112696+EXPORT_SYMBOL(skb_copy_bits);
112697+EXPORT_SYMBOL(skb_copy_expand);
112698+EXPORT_SYMBOL(skb_over_panic);
112699+EXPORT_SYMBOL(skb_pad);
112700+EXPORT_SYMBOL(skb_realloc_headroom);
112701+EXPORT_SYMBOL(skb_under_panic);
112702+EXPORT_SYMBOL(skb_dequeue);
112703+EXPORT_SYMBOL(skb_dequeue_tail);
112704+EXPORT_SYMBOL(skb_insert);
112705+EXPORT_SYMBOL(skb_queue_purge);
112706+EXPORT_SYMBOL(skb_queue_head);
112707+EXPORT_SYMBOL(skb_queue_tail);
112708+EXPORT_SYMBOL(skb_unlink);
112709+EXPORT_SYMBOL(skb_append);
112710+EXPORT_SYMBOL(skb_split);
112711+EXPORT_SYMBOL(skb_prepare_seq_read);
112712+EXPORT_SYMBOL(skb_seq_read);
112713+EXPORT_SYMBOL(skb_abort_seq_read);
112714+EXPORT_SYMBOL(skb_find_text);
112715+EXPORT_SYMBOL(skb_append_datato_frags);
112716diff -Nur linux-2.6.16.33-noxen/net/decnet/dn_nsp_in.c linux-2.6.16.33/net/decnet/dn_nsp_in.c
112717--- linux-2.6.16.33-noxen/net/decnet/dn_nsp_in.c 2006-11-22 18:06:31.000000000 +0000
112718+++ linux-2.6.16.33/net/decnet/dn_nsp_in.c 2007-05-23 21:00:01.000000000 +0000
112719@@ -801,8 +801,7 @@
112720 * We linearize everything except data segments here.
112721 */
112722 if (cb->nsp_flags & ~0x60) {
112723- if (unlikely(skb_is_nonlinear(skb)) &&
112724- skb_linearize(skb, GFP_ATOMIC) != 0)
112725+ if (unlikely(skb_linearize(skb)))
112726 goto free_out;
112727 }
112728
112729diff -Nur linux-2.6.16.33-noxen/net/decnet/dn_route.c linux-2.6.16.33/net/decnet/dn_route.c
112730--- linux-2.6.16.33-noxen/net/decnet/dn_route.c 2006-11-22 18:06:31.000000000 +0000
112731+++ linux-2.6.16.33/net/decnet/dn_route.c 2007-05-23 21:00:01.000000000 +0000
112732@@ -629,8 +629,7 @@
112733 padlen);
112734
112735 if (flags & DN_RT_PKT_CNTL) {
112736- if (unlikely(skb_is_nonlinear(skb)) &&
112737- skb_linearize(skb, GFP_ATOMIC) != 0)
112738+ if (unlikely(skb_linearize(skb)))
112739 goto dump_it;
112740
112741 switch(flags & DN_RT_CNTL_MSK) {
112742diff -Nur linux-2.6.16.33-noxen/net/ipv4/af_inet.c linux-2.6.16.33/net/ipv4/af_inet.c
112743--- linux-2.6.16.33-noxen/net/ipv4/af_inet.c 2006-11-22 18:06:31.000000000 +0000
112744+++ linux-2.6.16.33/net/ipv4/af_inet.c 2007-05-23 21:00:01.000000000 +0000
112745@@ -68,6 +68,7 @@
112746 */
112747
112748 #include <linux/config.h>
112749+#include <linux/err.h>
112750 #include <linux/errno.h>
112751 #include <linux/types.h>
112752 #include <linux/socket.h>
112753@@ -1084,6 +1085,88 @@
112754
112755 EXPORT_SYMBOL(inet_sk_rebuild_header);
112756
112757+static int inet_gso_send_check(struct sk_buff *skb)
112758+{
112759+ struct iphdr *iph;
112760+ struct net_protocol *ops;
112761+ int proto;
112762+ int ihl;
112763+ int err = -EINVAL;
112764+
112765+ if (unlikely(!pskb_may_pull(skb, sizeof(*iph))))
112766+ goto out;
112767+
112768+ iph = skb->nh.iph;
112769+ ihl = iph->ihl * 4;
112770+ if (ihl < sizeof(*iph))
112771+ goto out;
112772+
112773+ if (unlikely(!pskb_may_pull(skb, ihl)))
112774+ goto out;
112775+
112776+ skb->h.raw = __skb_pull(skb, ihl);
112777+ iph = skb->nh.iph;
112778+ proto = iph->protocol & (MAX_INET_PROTOS - 1);
112779+ err = -EPROTONOSUPPORT;
112780+
112781+ rcu_read_lock();
112782+ ops = rcu_dereference(inet_protos[proto]);
112783+ if (likely(ops && ops->gso_send_check))
112784+ err = ops->gso_send_check(skb);
112785+ rcu_read_unlock();
112786+
112787+out:
112788+ return err;
112789+}
112790+
112791+static struct sk_buff *inet_gso_segment(struct sk_buff *skb, int features)
112792+{
112793+ struct sk_buff *segs = ERR_PTR(-EINVAL);
112794+ struct iphdr *iph;
112795+ struct net_protocol *ops;
112796+ int proto;
112797+ int ihl;
112798+ int id;
112799+
112800+ if (!pskb_may_pull(skb, sizeof(*iph)))
112801+ goto out;
112802+
112803+ iph = skb->nh.iph;
112804+ ihl = iph->ihl * 4;
112805+ if (ihl < sizeof(*iph))
112806+ goto out;
112807+
112808+ if (!pskb_may_pull(skb, ihl))
112809+ goto out;
112810+
112811+ skb->h.raw = __skb_pull(skb, ihl);
112812+ iph = skb->nh.iph;
112813+ id = ntohs(iph->id);
112814+ proto = iph->protocol & (MAX_INET_PROTOS - 1);
112815+ segs = ERR_PTR(-EPROTONOSUPPORT);
112816+
112817+ rcu_read_lock();
112818+ ops = rcu_dereference(inet_protos[proto]);
112819+ if (ops && ops->gso_segment)
112820+ segs = ops->gso_segment(skb, features);
112821+ rcu_read_unlock();
112822+
112823+ if (!segs || unlikely(IS_ERR(segs)))
112824+ goto out;
112825+
112826+ skb = segs;
112827+ do {
112828+ iph = skb->nh.iph;
112829+ iph->id = htons(id++);
112830+ iph->tot_len = htons(skb->len - skb->mac_len);
112831+ iph->check = 0;
112832+ iph->check = ip_fast_csum(skb->nh.raw, iph->ihl);
112833+ } while ((skb = skb->next));
112834+
112835+out:
112836+ return segs;
112837+}
112838+
112839 #ifdef CONFIG_IP_MULTICAST
112840 static struct net_protocol igmp_protocol = {
112841 .handler = igmp_rcv,
112842@@ -1093,6 +1176,8 @@
112843 static struct net_protocol tcp_protocol = {
112844 .handler = tcp_v4_rcv,
112845 .err_handler = tcp_v4_err,
112846+ .gso_send_check = tcp_v4_gso_send_check,
112847+ .gso_segment = tcp_tso_segment,
112848 .no_policy = 1,
112849 };
112850
112851@@ -1138,6 +1223,8 @@
112852 static struct packet_type ip_packet_type = {
112853 .type = __constant_htons(ETH_P_IP),
112854 .func = ip_rcv,
112855+ .gso_send_check = inet_gso_send_check,
112856+ .gso_segment = inet_gso_segment,
112857 };
112858
112859 static int __init inet_init(void)
112860diff -Nur linux-2.6.16.33-noxen/net/ipv4/ip_output.c linux-2.6.16.33/net/ipv4/ip_output.c
112861--- linux-2.6.16.33-noxen/net/ipv4/ip_output.c 2006-11-22 18:06:31.000000000 +0000
112862+++ linux-2.6.16.33/net/ipv4/ip_output.c 2007-05-23 21:00:01.000000000 +0000
112863@@ -210,8 +210,7 @@
112864 return dst_output(skb);
112865 }
112866 #endif
112867- if (skb->len > dst_mtu(skb->dst) &&
112868- !(skb_shinfo(skb)->ufo_size || skb_shinfo(skb)->tso_size))
112869+ if (skb->len > dst_mtu(skb->dst) && !skb_is_gso(skb))
112870 return ip_fragment(skb, ip_finish_output2);
112871 else
112872 return ip_finish_output2(skb);
112873@@ -362,7 +361,7 @@
112874 }
112875
112876 ip_select_ident_more(iph, &rt->u.dst, sk,
112877- (skb_shinfo(skb)->tso_segs ?: 1) - 1);
112878+ (skb_shinfo(skb)->gso_segs ?: 1) - 1);
112879
112880 /* Add an IP checksum. */
112881 ip_send_check(iph);
112882@@ -743,7 +742,8 @@
112883 (length - transhdrlen));
112884 if (!err) {
112885 /* specify the length of each IP datagram fragment*/
112886- skb_shinfo(skb)->ufo_size = (mtu - fragheaderlen);
112887+ skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
112888+ skb_shinfo(skb)->gso_type = SKB_GSO_UDPV4;
112889 __skb_queue_tail(&sk->sk_write_queue, skb);
112890
112891 return 0;
112892@@ -839,7 +839,7 @@
112893 */
112894 if (transhdrlen &&
112895 length + fragheaderlen <= mtu &&
112896- rt->u.dst.dev->features&(NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM) &&
112897+ rt->u.dst.dev->features & NETIF_F_ALL_CSUM &&
112898 !exthdrlen)
112899 csummode = CHECKSUM_HW;
112900
112901@@ -1086,14 +1086,16 @@
112902
112903 inet->cork.length += size;
112904 if ((sk->sk_protocol == IPPROTO_UDP) &&
112905- (rt->u.dst.dev->features & NETIF_F_UFO))
112906- skb_shinfo(skb)->ufo_size = (mtu - fragheaderlen);
112907+ (rt->u.dst.dev->features & NETIF_F_UFO)) {
112908+ skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
112909+ skb_shinfo(skb)->gso_type = SKB_GSO_UDPV4;
112910+ }
112911
112912
112913 while (size > 0) {
112914 int i;
112915
112916- if (skb_shinfo(skb)->ufo_size)
112917+ if (skb_is_gso(skb))
112918 len = size;
112919 else {
112920
112921diff -Nur linux-2.6.16.33-noxen/net/ipv4/ipcomp.c linux-2.6.16.33/net/ipv4/ipcomp.c
112922--- linux-2.6.16.33-noxen/net/ipv4/ipcomp.c 2006-11-22 18:06:31.000000000 +0000
112923+++ linux-2.6.16.33/net/ipv4/ipcomp.c 2007-05-23 21:00:01.000000000 +0000
112924@@ -84,7 +84,7 @@
112925 struct xfrm_decap_state *decap, struct sk_buff *skb)
112926 {
112927 u8 nexthdr;
112928- int err = 0;
112929+ int err = -ENOMEM;
112930 struct iphdr *iph;
112931 union {
112932 struct iphdr iph;
112933@@ -92,11 +92,8 @@
112934 } tmp_iph;
112935
112936
112937- if ((skb_is_nonlinear(skb) || skb_cloned(skb)) &&
112938- skb_linearize(skb, GFP_ATOMIC) != 0) {
112939- err = -ENOMEM;
112940+ if (skb_linearize_cow(skb))
112941 goto out;
112942- }
112943
112944 skb->ip_summed = CHECKSUM_NONE;
112945
112946@@ -171,10 +168,8 @@
112947 goto out_ok;
112948 }
112949
112950- if ((skb_is_nonlinear(skb) || skb_cloned(skb)) &&
112951- skb_linearize(skb, GFP_ATOMIC) != 0) {
112952+ if (skb_linearize_cow(skb))
112953 goto out_ok;
112954- }
112955
112956 err = ipcomp_compress(x, skb);
112957 iph = skb->nh.iph;
112958diff -Nur linux-2.6.16.33-noxen/net/ipv4/netfilter/ip_nat_proto_tcp.c linux-2.6.16.33/net/ipv4/netfilter/ip_nat_proto_tcp.c
112959--- linux-2.6.16.33-noxen/net/ipv4/netfilter/ip_nat_proto_tcp.c 2006-11-22 18:06:31.000000000 +0000
112960+++ linux-2.6.16.33/net/ipv4/netfilter/ip_nat_proto_tcp.c 2007-05-23 21:00:01.000000000 +0000
112961@@ -129,7 +129,12 @@
112962 if (hdrsize < sizeof(*hdr))
112963 return 1;
112964
112965- hdr->check = ip_nat_cheat_check(~oldip, newip,
112966+#ifdef CONFIG_XEN
112967+ if ((*pskb)->proto_csum_blank)
112968+ hdr->check = ip_nat_cheat_check(oldip, ~newip, hdr->check);
112969+ else
112970+#endif
112971+ hdr->check = ip_nat_cheat_check(~oldip, newip,
112972 ip_nat_cheat_check(oldport ^ 0xFFFF,
112973 newport,
112974 hdr->check));
112975diff -Nur linux-2.6.16.33-noxen/net/ipv4/netfilter/ip_nat_proto_udp.c linux-2.6.16.33/net/ipv4/netfilter/ip_nat_proto_udp.c
112976--- linux-2.6.16.33-noxen/net/ipv4/netfilter/ip_nat_proto_udp.c 2006-11-22 18:06:31.000000000 +0000
112977+++ linux-2.6.16.33/net/ipv4/netfilter/ip_nat_proto_udp.c 2007-05-23 21:00:01.000000000 +0000
112978@@ -113,11 +113,17 @@
112979 newport = tuple->dst.u.udp.port;
112980 portptr = &hdr->dest;
112981 }
112982- if (hdr->check) /* 0 is a special case meaning no checksum */
112983- hdr->check = ip_nat_cheat_check(~oldip, newip,
112984+ if (hdr->check) { /* 0 is a special case meaning no checksum */
112985+#ifdef CONFIG_XEN
112986+ if ((*pskb)->proto_csum_blank)
112987+ hdr->check = ip_nat_cheat_check(oldip, ~newip, hdr->check);
112988+ else
112989+#endif
112990+ hdr->check = ip_nat_cheat_check(~oldip, newip,
112991 ip_nat_cheat_check(*portptr ^ 0xFFFF,
112992 newport,
112993 hdr->check));
112994+ }
112995 *portptr = newport;
112996 return 1;
112997 }
112998diff -Nur linux-2.6.16.33-noxen/net/ipv4/tcp.c linux-2.6.16.33/net/ipv4/tcp.c
112999--- linux-2.6.16.33-noxen/net/ipv4/tcp.c 2006-11-22 18:06:31.000000000 +0000
113000+++ linux-2.6.16.33/net/ipv4/tcp.c 2007-05-23 21:00:01.000000000 +0000
113001@@ -257,6 +257,7 @@
113002 #include <linux/fs.h>
113003 #include <linux/random.h>
113004 #include <linux/bootmem.h>
113005+#include <linux/err.h>
113006
113007 #include <net/icmp.h>
113008 #include <net/tcp.h>
113009@@ -570,7 +571,7 @@
113010 skb->ip_summed = CHECKSUM_HW;
113011 tp->write_seq += copy;
113012 TCP_SKB_CB(skb)->end_seq += copy;
113013- skb_shinfo(skb)->tso_segs = 0;
113014+ skb_shinfo(skb)->gso_segs = 0;
113015
113016 if (!copied)
113017 TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
113018@@ -621,14 +622,10 @@
113019 ssize_t res;
113020 struct sock *sk = sock->sk;
113021
113022-#define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)
113023-
113024 if (!(sk->sk_route_caps & NETIF_F_SG) ||
113025- !(sk->sk_route_caps & TCP_ZC_CSUM_FLAGS))
113026+ !(sk->sk_route_caps & NETIF_F_ALL_CSUM))
113027 return sock_no_sendpage(sock, page, offset, size, flags);
113028
113029-#undef TCP_ZC_CSUM_FLAGS
113030-
113031 lock_sock(sk);
113032 TCP_CHECK_TIMER(sk);
113033 res = do_tcp_sendpages(sk, &page, offset, size, flags);
113034@@ -725,9 +722,7 @@
113035 /*
113036 * Check whether we can use HW checksum.
113037 */
113038- if (sk->sk_route_caps &
113039- (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM |
113040- NETIF_F_HW_CSUM))
113041+ if (sk->sk_route_caps & NETIF_F_ALL_CSUM)
113042 skb->ip_summed = CHECKSUM_HW;
113043
113044 skb_entail(sk, tp, skb);
113045@@ -823,7 +818,7 @@
113046
113047 tp->write_seq += copy;
113048 TCP_SKB_CB(skb)->end_seq += copy;
113049- skb_shinfo(skb)->tso_segs = 0;
113050+ skb_shinfo(skb)->gso_segs = 0;
113051
113052 from += copy;
113053 copied += copy;
113054@@ -2026,6 +2021,77 @@
113055 }
113056
113057
113058+struct sk_buff *tcp_tso_segment(struct sk_buff *skb, int features)
113059+{
113060+ struct sk_buff *segs = ERR_PTR(-EINVAL);
113061+ struct tcphdr *th;
113062+ unsigned thlen;
113063+ unsigned int seq;
113064+ unsigned int delta;
113065+ unsigned int oldlen;
113066+ unsigned int len;
113067+
113068+ if (!pskb_may_pull(skb, sizeof(*th)))
113069+ goto out;
113070+
113071+ th = skb->h.th;
113072+ thlen = th->doff * 4;
113073+ if (thlen < sizeof(*th))
113074+ goto out;
113075+
113076+ if (!pskb_may_pull(skb, thlen))
113077+ goto out;
113078+
113079+ oldlen = (u16)~skb->len;
113080+ __skb_pull(skb, thlen);
113081+
113082+ if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) {
113083+ /* Packet is from an untrusted source, reset gso_segs. */
113084+ int mss = skb_shinfo(skb)->gso_size;
113085+
113086+ skb_shinfo(skb)->gso_segs = (skb->len + mss - 1) / mss;
113087+
113088+ segs = NULL;
113089+ goto out;
113090+ }
113091+
113092+ segs = skb_segment(skb, features);
113093+ if (IS_ERR(segs))
113094+ goto out;
113095+
113096+ len = skb_shinfo(skb)->gso_size;
113097+ delta = htonl(oldlen + (thlen + len));
113098+
113099+ skb = segs;
113100+ th = skb->h.th;
113101+ seq = ntohl(th->seq);
113102+
113103+ do {
113104+ th->fin = th->psh = 0;
113105+
113106+ th->check = ~csum_fold(th->check + delta);
113107+ if (skb->ip_summed != CHECKSUM_HW)
113108+ th->check = csum_fold(csum_partial(skb->h.raw, thlen,
113109+ skb->csum));
113110+
113111+ seq += len;
113112+ skb = skb->next;
113113+ th = skb->h.th;
113114+
113115+ th->seq = htonl(seq);
113116+ th->cwr = 0;
113117+ } while (skb->next);
113118+
113119+ delta = htonl(oldlen + (skb->tail - skb->h.raw) + skb->data_len);
113120+ th->check = ~csum_fold(th->check + delta);
113121+ if (skb->ip_summed != CHECKSUM_HW)
113122+ th->check = csum_fold(csum_partial(skb->h.raw, thlen,
113123+ skb->csum));
113124+
113125+out:
113126+ return segs;
113127+}
113128+
113129 extern void __skb_cb_too_small_for_tcp(int, int);
113130 extern struct tcp_congestion_ops tcp_reno;
113131
113132diff -Nur linux-2.6.16.33-noxen/net/ipv4/tcp_input.c linux-2.6.16.33/net/ipv4/tcp_input.c
113133--- linux-2.6.16.33-noxen/net/ipv4/tcp_input.c 2006-11-22 18:06:31.000000000 +0000
113134+++ linux-2.6.16.33/net/ipv4/tcp_input.c 2007-05-23 21:00:01.000000000 +0000
113135@@ -127,7 +127,7 @@
113136 /* skb->len may jitter because of SACKs, even if peer
113137 * sends good full-sized frames.
113138 */
113139- len = skb->len;
113140+ len = skb_shinfo(skb)->gso_size ?: skb->len;
113141 if (len >= icsk->icsk_ack.rcv_mss) {
113142 icsk->icsk_ack.rcv_mss = len;
113143 } else {
113144@@ -1072,7 +1072,7 @@
113145 else
113146 pkt_len = (end_seq -
113147 TCP_SKB_CB(skb)->seq);
113148- if (tcp_fragment(sk, skb, pkt_len, skb_shinfo(skb)->tso_size))
113149+ if (tcp_fragment(sk, skb, pkt_len, skb_shinfo(skb)->gso_size))
113150 break;
113151 pcount = tcp_skb_pcount(skb);
113152 }
113153diff -Nur linux-2.6.16.33-noxen/net/ipv4/tcp_ipv4.c linux-2.6.16.33/net/ipv4/tcp_ipv4.c
113154--- linux-2.6.16.33-noxen/net/ipv4/tcp_ipv4.c 2006-11-22 18:06:31.000000000 +0000
113155+++ linux-2.6.16.33/net/ipv4/tcp_ipv4.c 2007-05-23 21:00:01.000000000 +0000
113156@@ -495,6 +495,24 @@
113157 }
113158 }
113159
113160+int tcp_v4_gso_send_check(struct sk_buff *skb)
113161+{
113162+ struct iphdr *iph;
113163+ struct tcphdr *th;
113164+
113165+ if (!pskb_may_pull(skb, sizeof(*th)))
113166+ return -EINVAL;
113167+
113168+ iph = skb->nh.iph;
113169+ th = skb->h.th;
113170+
113171+ th->check = 0;
113172+ th->check = ~tcp_v4_check(th, skb->len, iph->saddr, iph->daddr, 0);
113173+ skb->csum = offsetof(struct tcphdr, check);
113174+ skb->ip_summed = CHECKSUM_HW;
113175+ return 0;
113176+}
113177+
113178 /*
113179 * This routine will send an RST to the other tcp.
113180 *
113181diff -Nur linux-2.6.16.33-noxen/net/ipv4/tcp_output.c linux-2.6.16.33/net/ipv4/tcp_output.c
113182--- linux-2.6.16.33-noxen/net/ipv4/tcp_output.c 2006-11-22 18:06:31.000000000 +0000
113183+++ linux-2.6.16.33/net/ipv4/tcp_output.c 2007-05-23 21:00:01.000000000 +0000
113184@@ -497,15 +497,17 @@
113185 /* Avoid the costly divide in the normal
113186 * non-TSO case.
113187 */
113188- skb_shinfo(skb)->tso_segs = 1;
113189- skb_shinfo(skb)->tso_size = 0;
113190+ skb_shinfo(skb)->gso_segs = 1;
113191+ skb_shinfo(skb)->gso_size = 0;
113192+ skb_shinfo(skb)->gso_type = 0;
113193 } else {
113194 unsigned int factor;
113195
113196 factor = skb->len + (mss_now - 1);
113197 factor /= mss_now;
113198- skb_shinfo(skb)->tso_segs = factor;
113199- skb_shinfo(skb)->tso_size = mss_now;
113200+ skb_shinfo(skb)->gso_segs = factor;
113201+ skb_shinfo(skb)->gso_size = mss_now;
113202+ skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
113203 }
113204 }
113205
113206@@ -850,7 +852,7 @@
113207
113208 if (!tso_segs ||
113209 (tso_segs > 1 &&
113210- skb_shinfo(skb)->tso_size != mss_now)) {
113211+ tcp_skb_mss(skb) != mss_now)) {
113212 tcp_set_skb_tso_segs(sk, skb, mss_now);
113213 tso_segs = tcp_skb_pcount(skb);
113214 }
113215@@ -1510,8 +1512,9 @@
113216 tp->snd_una == (TCP_SKB_CB(skb)->end_seq - 1)) {
113217 if (!pskb_trim(skb, 0)) {
113218 TCP_SKB_CB(skb)->seq = TCP_SKB_CB(skb)->end_seq - 1;
113219- skb_shinfo(skb)->tso_segs = 1;
113220- skb_shinfo(skb)->tso_size = 0;
113221+ skb_shinfo(skb)->gso_segs = 1;
113222+ skb_shinfo(skb)->gso_size = 0;
113223+ skb_shinfo(skb)->gso_type = 0;
113224 skb->ip_summed = CHECKSUM_NONE;
113225 skb->csum = 0;
113226 }
113227@@ -1716,8 +1719,9 @@
113228 skb->csum = 0;
113229 TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_FIN);
113230 TCP_SKB_CB(skb)->sacked = 0;
113231- skb_shinfo(skb)->tso_segs = 1;
113232- skb_shinfo(skb)->tso_size = 0;
113233+ skb_shinfo(skb)->gso_segs = 1;
113234+ skb_shinfo(skb)->gso_size = 0;
113235+ skb_shinfo(skb)->gso_type = 0;
113236
113237 /* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */
113238 TCP_SKB_CB(skb)->seq = tp->write_seq;
113239@@ -1749,8 +1753,9 @@
113240 skb->csum = 0;
113241 TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_RST);
113242 TCP_SKB_CB(skb)->sacked = 0;
113243- skb_shinfo(skb)->tso_segs = 1;
113244- skb_shinfo(skb)->tso_size = 0;
113245+ skb_shinfo(skb)->gso_segs = 1;
113246+ skb_shinfo(skb)->gso_size = 0;
113247+ skb_shinfo(skb)->gso_type = 0;
113248
113249 /* Send it off. */
113250 TCP_SKB_CB(skb)->seq = tcp_acceptable_seq(sk, tp);
113251@@ -1833,8 +1838,9 @@
113252 TCP_SKB_CB(skb)->seq = tcp_rsk(req)->snt_isn;
113253 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1;
113254 TCP_SKB_CB(skb)->sacked = 0;
113255- skb_shinfo(skb)->tso_segs = 1;
113256- skb_shinfo(skb)->tso_size = 0;
113257+ skb_shinfo(skb)->gso_segs = 1;
113258+ skb_shinfo(skb)->gso_size = 0;
113259+ skb_shinfo(skb)->gso_type = 0;
113260 th->seq = htonl(TCP_SKB_CB(skb)->seq);
113261 th->ack_seq = htonl(tcp_rsk(req)->rcv_isn + 1);
113262 if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */
113263@@ -1937,8 +1943,9 @@
113264 TCP_SKB_CB(buff)->flags = TCPCB_FLAG_SYN;
113265 TCP_ECN_send_syn(sk, tp, buff);
113266 TCP_SKB_CB(buff)->sacked = 0;
113267- skb_shinfo(buff)->tso_segs = 1;
113268- skb_shinfo(buff)->tso_size = 0;
113269+ skb_shinfo(buff)->gso_segs = 1;
113270+ skb_shinfo(buff)->gso_size = 0;
113271+ skb_shinfo(buff)->gso_type = 0;
113272 buff->csum = 0;
113273 TCP_SKB_CB(buff)->seq = tp->write_seq++;
113274 TCP_SKB_CB(buff)->end_seq = tp->write_seq;
113275@@ -2042,8 +2049,9 @@
113276 buff->csum = 0;
113277 TCP_SKB_CB(buff)->flags = TCPCB_FLAG_ACK;
113278 TCP_SKB_CB(buff)->sacked = 0;
113279- skb_shinfo(buff)->tso_segs = 1;
113280- skb_shinfo(buff)->tso_size = 0;
113281+ skb_shinfo(buff)->gso_segs = 1;
113282+ skb_shinfo(buff)->gso_size = 0;
113283+ skb_shinfo(buff)->gso_type = 0;
113284
113285 /* Send it off, this clears delayed acks for us. */
113286 TCP_SKB_CB(buff)->seq = TCP_SKB_CB(buff)->end_seq = tcp_acceptable_seq(sk, tp);
113287@@ -2078,8 +2086,9 @@
113288 skb->csum = 0;
113289 TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;
113290 TCP_SKB_CB(skb)->sacked = urgent;
113291- skb_shinfo(skb)->tso_segs = 1;
113292- skb_shinfo(skb)->tso_size = 0;
113293+ skb_shinfo(skb)->gso_segs = 1;
113294+ skb_shinfo(skb)->gso_size = 0;
113295+ skb_shinfo(skb)->gso_type = 0;
113296
113297 /* Use a previous sequence. This should cause the other
113298 * end to send an ack. Don't queue or clone SKB, just
113299diff -Nur linux-2.6.16.33-noxen/net/ipv4/xfrm4_output.c linux-2.6.16.33/net/ipv4/xfrm4_output.c
113300--- linux-2.6.16.33-noxen/net/ipv4/xfrm4_output.c 2006-11-22 18:06:31.000000000 +0000
113301+++ linux-2.6.16.33/net/ipv4/xfrm4_output.c 2007-05-23 21:00:01.000000000 +0000
113302@@ -9,6 +9,8 @@
113303 */
113304
113305 #include <linux/compiler.h>
113306+#include <linux/if_ether.h>
113307+#include <linux/kernel.h>
113308 #include <linux/skbuff.h>
113309 #include <linux/spinlock.h>
113310 #include <linux/netfilter_ipv4.h>
113311@@ -17,6 +19,8 @@
113312 #include <net/xfrm.h>
113313 #include <net/icmp.h>
113314
113315+extern int skb_checksum_setup(struct sk_buff *skb);
113316+
113317 /* Add encapsulation header.
113318 *
113319 * In transport mode, the IP header will be moved forward to make space
113320@@ -103,6 +107,10 @@
113321 struct xfrm_state *x = dst->xfrm;
113322 int err;
113323
113324+ err = skb_checksum_setup(skb);
113325+ if (err)
113326+ goto error_nolock;
113327+
113328 if (skb->ip_summed == CHECKSUM_HW) {
113329 err = skb_checksum_help(skb, 0);
113330 if (err)
113331@@ -152,16 +160,10 @@
113332 goto out_exit;
113333 }
113334
113335-static int xfrm4_output_finish(struct sk_buff *skb)
113336+static int xfrm4_output_finish2(struct sk_buff *skb)
113337 {
113338 int err;
113339
113340-#ifdef CONFIG_NETFILTER
113341- if (!skb->dst->xfrm) {
113342- IPCB(skb)->flags |= IPSKB_REROUTED;
113343- return dst_output(skb);
113344- }
113345-#endif
113346 while (likely((err = xfrm4_output_one(skb)) == 0)) {
113347 nf_reset(skb);
113348
113349@@ -174,7 +176,7 @@
113350 return dst_output(skb);
113351
113352 err = nf_hook(PF_INET, NF_IP_POST_ROUTING, &skb, NULL,
113353- skb->dst->dev, xfrm4_output_finish);
113354+ skb->dst->dev, xfrm4_output_finish2);
113355 if (unlikely(err != 1))
113356 break;
113357 }
113358@@ -182,6 +184,48 @@
113359 return err;
113360 }
113361
113362+static int xfrm4_output_finish(struct sk_buff *skb)
113363+{
113364+ struct sk_buff *segs;
113365+
113366+#ifdef CONFIG_NETFILTER
113367+ if (!skb->dst->xfrm) {
113368+ IPCB(skb)->flags |= IPSKB_REROUTED;
113369+ return dst_output(skb);
113370+ }
113371+#endif
113372+
113373+ if (!skb_is_gso(skb))
113374+ return xfrm4_output_finish2(skb);
113375+
113376+ skb->protocol = htons(ETH_P_IP);
113377+ segs = skb_gso_segment(skb, 0);
113378+ kfree_skb(skb);
113379+ if (unlikely(IS_ERR(segs)))
113380+ return PTR_ERR(segs);
113381+
113382+ do {
113383+ struct sk_buff *nskb = segs->next;
113384+ int err;
113385+
113386+ segs->next = NULL;
113387+ err = xfrm4_output_finish2(segs);
113388+
113389+ if (unlikely(err)) {
113390+ while ((segs = nskb)) {
113391+ nskb = segs->next;
113392+ segs->next = NULL;
113393+ kfree_skb(segs);
113394+ }
113395+ return err;
113396+ }
113397+
113398+ segs = nskb;
113399+ } while (segs);
113400+
113401+ return 0;
113402+}
113403+
113404 int xfrm4_output(struct sk_buff *skb)
113405 {
113406 return NF_HOOK_COND(PF_INET, NF_IP_POST_ROUTING, skb, NULL, skb->dst->dev,
113407diff -Nur linux-2.6.16.33-noxen/net/ipv6/addrconf.c linux-2.6.16.33/net/ipv6/addrconf.c
113408--- linux-2.6.16.33-noxen/net/ipv6/addrconf.c 2006-11-22 18:06:31.000000000 +0000
113409+++ linux-2.6.16.33/net/ipv6/addrconf.c 2007-05-23 21:00:01.000000000 +0000
113410@@ -2471,6 +2471,7 @@
113411 spin_lock_bh(&ifp->lock);
113412
113413 if (dev->flags&(IFF_NOARP|IFF_LOOPBACK) ||
113414+ !(dev->flags&IFF_MULTICAST) ||
113415 !(ifp->flags&IFA_F_TENTATIVE)) {
113416 ifp->flags &= ~IFA_F_TENTATIVE;
113417 spin_unlock_bh(&ifp->lock);
113418@@ -2555,6 +2556,7 @@
113419 if (ifp->idev->cnf.forwarding == 0 &&
113420 ifp->idev->cnf.rtr_solicits > 0 &&
113421 (dev->flags&IFF_LOOPBACK) == 0 &&
113422+ (dev->flags & IFF_MULTICAST) &&
113423 (ipv6_addr_type(&ifp->addr) & IPV6_ADDR_LINKLOCAL)) {
113424 struct in6_addr all_routers;
113425
113426diff -Nur linux-2.6.16.33-noxen/net/ipv6/ip6_output.c linux-2.6.16.33/net/ipv6/ip6_output.c
113427--- linux-2.6.16.33-noxen/net/ipv6/ip6_output.c 2006-11-22 18:06:31.000000000 +0000
113428+++ linux-2.6.16.33/net/ipv6/ip6_output.c 2007-05-23 21:00:01.000000000 +0000
113429@@ -147,7 +147,7 @@
113430
113431 int ip6_output(struct sk_buff *skb)
113432 {
113433- if ((skb->len > dst_mtu(skb->dst) && !skb_shinfo(skb)->ufo_size) ||
113434+ if ((skb->len > dst_mtu(skb->dst) && !skb_is_gso(skb)) ||
113435 dst_allfrag(skb->dst))
113436 return ip6_fragment(skb, ip6_output2);
113437 else
113438@@ -829,8 +829,9 @@
113439 struct frag_hdr fhdr;
113440
113441 /* specify the length of each IP datagram fragment*/
113442- skb_shinfo(skb)->ufo_size = (mtu - fragheaderlen) -
113443- sizeof(struct frag_hdr);
113444+ skb_shinfo(skb)->gso_size = mtu - fragheaderlen -
113445+ sizeof(struct frag_hdr);
113446+ skb_shinfo(skb)->gso_type = SKB_GSO_UDPV4;
113447 ipv6_select_ident(skb, &fhdr);
113448 skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
113449 __skb_queue_tail(&sk->sk_write_queue, skb);
113450diff -Nur linux-2.6.16.33-noxen/net/ipv6/ipcomp6.c linux-2.6.16.33/net/ipv6/ipcomp6.c
113451--- linux-2.6.16.33-noxen/net/ipv6/ipcomp6.c 2006-11-22 18:06:31.000000000 +0000
113452+++ linux-2.6.16.33/net/ipv6/ipcomp6.c 2007-05-23 21:00:01.000000000 +0000
113453@@ -64,7 +64,7 @@
113454
113455 static int ipcomp6_input(struct xfrm_state *x, struct xfrm_decap_state *decap, struct sk_buff *skb)
113456 {
113457- int err = 0;
113458+ int err = -ENOMEM;
113459 u8 nexthdr = 0;
113460 int hdr_len = skb->h.raw - skb->nh.raw;
113461 unsigned char *tmp_hdr = NULL;
113462@@ -75,11 +75,8 @@
113463 struct crypto_tfm *tfm;
113464 int cpu;
113465
113466- if ((skb_is_nonlinear(skb) || skb_cloned(skb)) &&
113467- skb_linearize(skb, GFP_ATOMIC) != 0) {
113468- err = -ENOMEM;
113469+ if (skb_linearize_cow(skb))
113470 goto out;
113471- }
113472
113473 skb->ip_summed = CHECKSUM_NONE;
113474
113475@@ -158,10 +155,8 @@
113476 goto out_ok;
113477 }
113478
113479- if ((skb_is_nonlinear(skb) || skb_cloned(skb)) &&
113480- skb_linearize(skb, GFP_ATOMIC) != 0) {
113481+ if (skb_linearize_cow(skb))
113482 goto out_ok;
113483- }
113484
113485 /* compression */
113486 plen = skb->len - hdr_len;
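
Both ipcomp6.c hunks above replace the open-coded "nonlinear or cloned, then linearize" test with skb_linearize_cow(), a helper presumably introduced by the skbuff.h part of this patch. A behaviourally equivalent stand-in, written against the 2.6.16 skb_linearize() signature, would look roughly like this (assumed shape, not the patch's actual definition):

	/* Stand-in for skb_linearize_cow(): only pay the linearisation cost
	 * when the buffer is paged or shared; returns 0 or -ENOMEM, matching
	 * the err = -ENOMEM initialisation in the callers above. */
	static inline int skb_linearize_cow(struct sk_buff *skb)
	{
		if (skb_is_nonlinear(skb) || skb_cloned(skb))
			return skb_linearize(skb, GFP_ATOMIC);
		return 0;
	}
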
113487diff -Nur linux-2.6.16.33-noxen/net/ipv6/xfrm6_output.c linux-2.6.16.33/net/ipv6/xfrm6_output.c
113488--- linux-2.6.16.33-noxen/net/ipv6/xfrm6_output.c 2006-11-22 18:06:31.000000000 +0000
113489+++ linux-2.6.16.33/net/ipv6/xfrm6_output.c 2007-05-23 21:00:01.000000000 +0000
113490@@ -151,7 +151,7 @@
113491 goto out_exit;
113492 }
113493
113494-static int xfrm6_output_finish(struct sk_buff *skb)
113495+static int xfrm6_output_finish2(struct sk_buff *skb)
113496 {
113497 int err;
113498
113499@@ -167,7 +167,7 @@
113500 return dst_output(skb);
113501
113502 err = nf_hook(PF_INET6, NF_IP6_POST_ROUTING, &skb, NULL,
113503- skb->dst->dev, xfrm6_output_finish);
113504+ skb->dst->dev, xfrm6_output_finish2);
113505 if (unlikely(err != 1))
113506 break;
113507 }
113508@@ -175,6 +175,41 @@
113509 return err;
113510 }
113511
113512+static int xfrm6_output_finish(struct sk_buff *skb)
113513+{
113514+ struct sk_buff *segs;
113515+
113516+ if (!skb_is_gso(skb))
113517+ return xfrm6_output_finish2(skb);
113518+
113519+ skb->protocol = htons(ETH_P_IP);
113520+ segs = skb_gso_segment(skb, 0);
113521+ kfree_skb(skb);
113522+ if (unlikely(IS_ERR(segs)))
113523+ return PTR_ERR(segs);
113524+
113525+ do {
113526+ struct sk_buff *nskb = segs->next;
113527+ int err;
113528+
113529+ segs->next = NULL;
113530+ err = xfrm6_output_finish2(segs);
113531+
113532+ if (unlikely(err)) {
113533+ while ((segs = nskb)) {
113534+ nskb = segs->next;
113535+ segs->next = NULL;
113536+ kfree_skb(segs);
113537+ }
113538+ return err;
113539+ }
113540+
113541+ segs = nskb;
113542+ } while (segs);
113543+
113544+ return 0;
113545+}
113546+
113547 int xfrm6_output(struct sk_buff *skb)
113548 {
113549 return NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, skb, NULL, skb->dst->dev,
113550diff -Nur linux-2.6.16.33-noxen/net/sched/sch_generic.c linux-2.6.16.33/net/sched/sch_generic.c
113551--- linux-2.6.16.33-noxen/net/sched/sch_generic.c 2006-11-22 18:06:31.000000000 +0000
113552+++ linux-2.6.16.33/net/sched/sch_generic.c 2007-05-23 21:00:01.000000000 +0000
113553@@ -72,9 +72,9 @@
113554 dev->queue_lock serializes queue accesses for this device
113555 AND dev->qdisc pointer itself.
113556
113557- dev->xmit_lock serializes accesses to device driver.
113558+ netif_tx_lock serializes accesses to device driver.
113559
113560- dev->queue_lock and dev->xmit_lock are mutually exclusive,
113561+ dev->queue_lock and netif_tx_lock are mutually exclusive,
113562 if one is grabbed, another must be free.
113563 */
113564
113565@@ -90,14 +90,17 @@
113566 NOTE: Called under dev->queue_lock with locally disabled BH.
113567 */
113568
113569-int qdisc_restart(struct net_device *dev)
113570+static inline int qdisc_restart(struct net_device *dev)
113571 {
113572 struct Qdisc *q = dev->qdisc;
113573 struct sk_buff *skb;
113574
113575 /* Dequeue packet */
113576- if ((skb = q->dequeue(q)) != NULL) {
113577+ if (((skb = dev->gso_skb)) || ((skb = q->dequeue(q)))) {
113578 unsigned nolock = (dev->features & NETIF_F_LLTX);
113579+
113580+ dev->gso_skb = NULL;
113581+
113582 /*
113583 * When the driver has LLTX set it does its own locking
113584 * in start_xmit. No need to add additional overhead by
113585@@ -108,7 +111,7 @@
113586 * will be requeued.
113587 */
113588 if (!nolock) {
113589- if (!spin_trylock(&dev->xmit_lock)) {
113590+ if (!netif_tx_trylock(dev)) {
113591 collision:
113592 /* So, someone grabbed the driver. */
113593
113594@@ -126,8 +129,6 @@
113595 __get_cpu_var(netdev_rx_stat).cpu_collision++;
113596 goto requeue;
113597 }
113598- /* Remember that the driver is grabbed by us. */
113599- dev->xmit_lock_owner = smp_processor_id();
113600 }
113601
113602 {
113603@@ -136,14 +137,11 @@
113604
113605 if (!netif_queue_stopped(dev)) {
113606 int ret;
113607- if (netdev_nit)
113608- dev_queue_xmit_nit(skb, dev);
113609
113610- ret = dev->hard_start_xmit(skb, dev);
113611+ ret = dev_hard_start_xmit(skb, dev);
113612 if (ret == NETDEV_TX_OK) {
113613 if (!nolock) {
113614- dev->xmit_lock_owner = -1;
113615- spin_unlock(&dev->xmit_lock);
113616+ netif_tx_unlock(dev);
113617 }
113618 spin_lock(&dev->queue_lock);
113619 return -1;
113620@@ -157,8 +155,7 @@
113621 /* NETDEV_TX_BUSY - we need to requeue */
113622 /* Release the driver */
113623 if (!nolock) {
113624- dev->xmit_lock_owner = -1;
113625- spin_unlock(&dev->xmit_lock);
113626+ netif_tx_unlock(dev);
113627 }
113628 spin_lock(&dev->queue_lock);
113629 q = dev->qdisc;
113630@@ -175,7 +172,10 @@
113631 */
113632
113633 requeue:
113634- q->ops->requeue(skb, q);
113635+ if (skb->next)
113636+ dev->gso_skb = skb;
113637+ else
113638+ q->ops->requeue(skb, q);
113639 netif_schedule(dev);
113640 return 1;
113641 }
113642@@ -183,11 +183,23 @@
113643 return q->q.qlen;
113644 }
113645
113646+void __qdisc_run(struct net_device *dev)
113647+{
113648+ if (unlikely(dev->qdisc == &noop_qdisc))
113649+ goto out;
113650+
113651+ while (qdisc_restart(dev) < 0 && !netif_queue_stopped(dev))
113652+ /* NOTHING */;
113653+
113654+out:
113655+ clear_bit(__LINK_STATE_QDISC_RUNNING, &dev->state);
113656+}
113657+
113658 static void dev_watchdog(unsigned long arg)
113659 {
113660 struct net_device *dev = (struct net_device *)arg;
113661
113662- spin_lock(&dev->xmit_lock);
113663+ netif_tx_lock(dev);
113664 if (dev->qdisc != &noop_qdisc) {
113665 if (netif_device_present(dev) &&
113666 netif_running(dev) &&
113667@@ -201,7 +213,7 @@
113668 dev_hold(dev);
113669 }
113670 }
113671- spin_unlock(&dev->xmit_lock);
113672+ netif_tx_unlock(dev);
113673
113674 dev_put(dev);
113675 }
113676@@ -225,17 +237,17 @@
113677
113678 static void dev_watchdog_up(struct net_device *dev)
113679 {
113680- spin_lock_bh(&dev->xmit_lock);
113681+ netif_tx_lock_bh(dev);
113682 __netdev_watchdog_up(dev);
113683- spin_unlock_bh(&dev->xmit_lock);
113684+ netif_tx_unlock_bh(dev);
113685 }
113686
113687 static void dev_watchdog_down(struct net_device *dev)
113688 {
113689- spin_lock_bh(&dev->xmit_lock);
113690+ netif_tx_lock_bh(dev);
113691 if (del_timer(&dev->watchdog_timer))
113692 __dev_put(dev);
113693- spin_unlock_bh(&dev->xmit_lock);
113694+ netif_tx_unlock_bh(dev);
113695 }
113696
113697 void netif_carrier_on(struct net_device *dev)
113698@@ -577,10 +589,17 @@
113699
113700 dev_watchdog_down(dev);
113701
113702- while (test_bit(__LINK_STATE_SCHED, &dev->state))
113703+ /* Wait for outstanding dev_queue_xmit calls. */
113704+ synchronize_rcu();
113705+
113706+ /* Wait for outstanding qdisc_run calls. */
113707+ while (test_bit(__LINK_STATE_QDISC_RUNNING, &dev->state))
113708 yield();
113709
113710- spin_unlock_wait(&dev->xmit_lock);
113711+ if (dev->gso_skb) {
113712+ kfree_skb(dev->gso_skb);
113713+ dev->gso_skb = NULL;
113714+ }
113715 }
113716
113717 void dev_init_scheduler(struct net_device *dev)
113718@@ -622,6 +641,5 @@
113719 EXPORT_SYMBOL(qdisc_alloc);
113720 EXPORT_SYMBOL(qdisc_destroy);
113721 EXPORT_SYMBOL(qdisc_reset);
113722-EXPORT_SYMBOL(qdisc_restart);
113723 EXPORT_SYMBOL(qdisc_lock_tree);
113724 EXPORT_SYMBOL(qdisc_unlock_tree);
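
The sch_generic.c changes above make qdisc_restart() static, stash a partially transmitted GSO segment list in dev->gso_skb so it can be requeued ahead of the qdisc, and move the transmit loop into __qdisc_run(), serialised by the new __LINK_STATE_QDISC_RUNNING bit. Callers are expected to enter through a qdisc_run() wrapper along the following lines (assumed shape; the actual definition belongs to the pkt_sched.h hunks of this patch, not shown here):

	/* Assumed wrapper around __qdisc_run(): only one CPU runs the qdisc
	 * for a device at a time, guarded by __LINK_STATE_QDISC_RUNNING,
	 * which __qdisc_run() clears when it finishes. */
	static inline void qdisc_run(struct net_device *dev)
	{
		if (!netif_queue_stopped(dev) &&
		    !test_and_set_bit(__LINK_STATE_QDISC_RUNNING, &dev->state))
			__qdisc_run(dev);
	}
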
113725diff -Nur linux-2.6.16.33-noxen/net/sched/sch_teql.c linux-2.6.16.33/net/sched/sch_teql.c
113726--- linux-2.6.16.33-noxen/net/sched/sch_teql.c 2006-11-22 18:06:31.000000000 +0000
113727+++ linux-2.6.16.33/net/sched/sch_teql.c 2007-05-23 21:00:01.000000000 +0000
113728@@ -302,20 +302,17 @@
113729
113730 switch (teql_resolve(skb, skb_res, slave)) {
113731 case 0:
113732- if (spin_trylock(&slave->xmit_lock)) {
113733- slave->xmit_lock_owner = smp_processor_id();
113734+ if (netif_tx_trylock(slave)) {
113735 if (!netif_queue_stopped(slave) &&
113736 slave->hard_start_xmit(skb, slave) == 0) {
113737- slave->xmit_lock_owner = -1;
113738- spin_unlock(&slave->xmit_lock);
113739+ netif_tx_unlock(slave);
113740 master->slaves = NEXT_SLAVE(q);
113741 netif_wake_queue(dev);
113742 master->stats.tx_packets++;
113743 master->stats.tx_bytes += len;
113744 return 0;
113745 }
113746- slave->xmit_lock_owner = -1;
113747- spin_unlock(&slave->xmit_lock);
113748+ netif_tx_unlock(slave);
113749 }
113750 if (netif_queue_stopped(dev))
113751 busy = 1;
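
The sch_generic.c and sch_teql.c hunks above replace direct manipulation of dev->xmit_lock and dev->xmit_lock_owner with the netif_tx_lock helper family. Their assumed shape, as defined by the netdevice.h part of this patch (where the spinlock itself is renamed to _xmit_lock), is roughly:

	/* Assumed netif_tx_lock helpers: bundle the transmit spinlock with the
	 * owner-CPU bookkeeping the callers previously open-coded.  The _bh
	 * variants differ only in using spin_lock_bh()/spin_unlock_bh(). */
	static inline void netif_tx_lock(struct net_device *dev)
	{
		spin_lock(&dev->_xmit_lock);
		dev->xmit_lock_owner = smp_processor_id();
	}

	static inline int netif_tx_trylock(struct net_device *dev)
	{
		int ok = spin_trylock(&dev->_xmit_lock);
		if (likely(ok))
			dev->xmit_lock_owner = smp_processor_id();
		return ok;
	}

	static inline void netif_tx_unlock(struct net_device *dev)
	{
		dev->xmit_lock_owner = -1;
		spin_unlock(&dev->_xmit_lock);
	}
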
113752diff -Nur linux-2.6.16.33-noxen/scripts/Makefile.xen linux-2.6.16.33/scripts/Makefile.xen
113753--- linux-2.6.16.33-noxen/scripts/Makefile.xen 1970-01-01 00:00:00.000000000 +0000
113754+++ linux-2.6.16.33/scripts/Makefile.xen 2007-01-08 15:00:46.000000000 +0000
113755@@ -0,0 +1,14 @@
113756+
113757+# cherrypickxen($1 = allobj)
113758+cherrypickxen = $(foreach var, $(1), \
113759+ $(shell o=$(var); \
113760+ c=$${o%.o}-xen.c; \
113761+ s=$${o%.o}-xen.S; \
113762+ oxen=$${o%.o}-xen.o; \
113763+ [ -f $(srctree)/$(src)/$${c} ] || \
113764+ [ -f $(srctree)/$(src)/$${s} ] \
113765+ && echo $$oxen \
113766+ || echo $(var) ) \
113767+ )
113768+# filterxen($1 = allobj, $2 = noobjs)
113769+filterxen = $(filter-out $(2), $(1))