src/patches/xen-3.0.4-2.6.16.x.patch
1 diff -Nur linux-2.6.16.33-noxen/Documentation/networking/netdevices.txt linux-2.6.16.33/Documentation/networking/netdevices.txt
2 --- linux-2.6.16.33-noxen/Documentation/networking/netdevices.txt 2006-11-22 18:06:31.000000000 +0000
3 +++ linux-2.6.16.33/Documentation/networking/netdevices.txt 2007-05-23 21:00:01.000000000 +0000
4 @@ -42,9 +42,9 @@
5 Context: nominally process, but don't sleep inside an rwlock
6
7 dev->hard_start_xmit:
8 - Synchronization: dev->xmit_lock spinlock.
9 + Synchronization: netif_tx_lock spinlock.
10 When the driver sets NETIF_F_LLTX in dev->features this will be
11 - called without holding xmit_lock. In this case the driver
12 + called without holding netif_tx_lock. In this case the driver
13 has to lock by itself when needed. It is recommended to use a try lock
14 for this and return -1 when the spin lock fails.
15 The locking there should also properly protect against
16 @@ -62,12 +62,12 @@
17 Only valid when NETIF_F_LLTX is set.
18
19 dev->tx_timeout:
20 - Synchronization: dev->xmit_lock spinlock.
21 + Synchronization: netif_tx_lock spinlock.
22 Context: BHs disabled
23 Notes: netif_queue_stopped() is guaranteed true
24
25 dev->set_multicast_list:
26 - Synchronization: dev->xmit_lock spinlock.
27 + Synchronization: netif_tx_lock spinlock.
28 Context: BHs disabled
29
30 dev->poll:
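
A minimal sketch of the try-lock pattern recommended for hard_start_xmit above, assuming a hypothetical driver (the mydev_* names and priv->tx_lock are illustrative, not part of the kernel sources being patched) that sets NETIF_F_LLTX at probe time and therefore runs without netif_tx_lock held:

#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/spinlock.h>

struct mydev_priv {                     /* hypothetical per-device state */
        spinlock_t tx_lock;
};

/* Illustrative only: with NETIF_F_LLTX set (dev->features |= NETIF_F_LLTX
 * at probe time, not shown), the core does not take netif_tx_lock, so the
 * driver serializes its own TX path; a try-lock that returns
 * NETDEV_TX_LOCKED (-1) on contention follows the recommendation above. */
static int mydev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev)
{
        struct mydev_priv *priv = netdev_priv(dev);

        if (!spin_trylock(&priv->tx_lock))
                return NETDEV_TX_LOCKED;        /* core requeues and retries */

        /* ... hand skb to the hardware TX ring here ... */

        spin_unlock(&priv->tx_lock);
        return NETDEV_TX_OK;
}
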
31 diff -Nur linux-2.6.16.33-noxen/arch/i386/Kconfig linux-2.6.16.33/arch/i386/Kconfig
32 --- linux-2.6.16.33-noxen/arch/i386/Kconfig 2006-11-22 18:06:31.000000000 +0000
33 +++ linux-2.6.16.33/arch/i386/Kconfig 2007-01-08 15:00:45.000000000 +0000
34 @@ -58,6 +58,15 @@
35 help
36 Choose this option if your computer is a standard PC or compatible.
37
38 +config X86_XEN
39 + bool "Xen-compatible"
40 + select X86_UP_APIC if !SMP && XEN_PRIVILEGED_GUEST
41 + select X86_UP_IOAPIC if !SMP && XEN_PRIVILEGED_GUEST
42 + select SWIOTLB
43 + help
44 + Choose this option if you plan to run this kernel on top of the
45 + Xen Hypervisor.
46 +
47 config X86_ELAN
48 bool "AMD Elan"
49 help
50 @@ -159,6 +168,7 @@
51
52 config HPET_TIMER
53 bool "HPET Timer Support"
54 + depends on !X86_XEN
55 help
56 This enables the use of the HPET for the kernel's internal timer.
57 HPET is the next generation timer replacing legacy 8254s.
58 @@ -202,6 +212,19 @@
59
60 If you don't know what to do here, say N.
61
62 +config SMP_ALTERNATIVES
63 + bool "SMP alternatives support (EXPERIMENTAL)"
64 + depends on SMP && EXPERIMENTAL
65 + help
66 + Try to reduce the overhead of running an SMP kernel on a uniprocessor
67 + host slightly by replacing certain key instruction sequences
68 + according to whether we currently have more than one CPU available.
69 + This should provide a noticeable boost to performance when
70 + running SMP kernels on UP machines, and have negligible impact
71 + when running on a true SMP host.
72 +
73 + If unsure, say N.
74 +
75 config NR_CPUS
76 int "Maximum number of CPUs (2-255)"
77 range 2 255
78 @@ -218,7 +241,7 @@
79
80 config SCHED_SMT
81 bool "SMT (Hyperthreading) scheduler support"
82 - depends on SMP
83 + depends on SMP && !X86_XEN
84 default off
85 help
86 SMT scheduler support improves the CPU scheduler's decision making
87 @@ -230,7 +253,7 @@
88
89 config X86_UP_APIC
90 bool "Local APIC support on uniprocessors"
91 - depends on !SMP && !(X86_VISWS || X86_VOYAGER)
92 + depends on !SMP && !(X86_VISWS || X86_VOYAGER || XEN_UNPRIVILEGED_GUEST)
93 help
94 A local APIC (Advanced Programmable Interrupt Controller) is an
95 integrated interrupt controller in the CPU. If you have a single-CPU
96 @@ -255,12 +278,12 @@
97
98 config X86_LOCAL_APIC
99 bool
100 - depends on X86_UP_APIC || ((X86_VISWS || SMP) && !X86_VOYAGER)
101 + depends on X86_UP_APIC || ((X86_VISWS || SMP) && !(X86_VOYAGER || XEN_UNPRIVILEGED_GUEST))
102 default y
103
104 config X86_IO_APIC
105 bool
106 - depends on X86_UP_IOAPIC || (SMP && !(X86_VISWS || X86_VOYAGER))
107 + depends on X86_UP_IOAPIC || (SMP && !(X86_VISWS || X86_VOYAGER || XEN_UNPRIVILEGED_GUEST))
108 default y
109
110 config X86_VISWS_APIC
111 @@ -268,9 +291,14 @@
112 depends on X86_VISWS
113 default y
114
115 +config X86_TSC
116 + bool
117 + depends on (MWINCHIP3D || MWINCHIP2 || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MGEODEGX1) && !X86_NUMAQ && !X86_XEN
118 + default y
119 +
120 config X86_MCE
121 bool "Machine Check Exception"
122 - depends on !X86_VOYAGER
123 + depends on !(X86_VOYAGER || X86_XEN)
124 ---help---
125 Machine Check Exception support allows the processor to notify the
126 kernel if it detects a problem (e.g. overheating, component failure).
127 @@ -360,6 +388,7 @@
128
129 config MICROCODE
130 tristate "/dev/cpu/microcode - Intel IA32 CPU microcode support"
131 + depends on !XEN_UNPRIVILEGED_GUEST
132 ---help---
133 If you say Y here and also to "/dev file system support" in the
134 'File systems' section, you will be able to update the microcode on
135 @@ -377,6 +406,7 @@
136
137 config X86_MSR
138 tristate "/dev/cpu/*/msr - Model-specific register support"
139 + depends on !X86_XEN
140 help
141 This device gives privileged processes access to the x86
142 Model-Specific Registers (MSRs). It is a character device with
143 @@ -392,6 +422,10 @@
144 with major 203 and minors 0 to 31 for /dev/cpu/0/cpuid to
145 /dev/cpu/31/cpuid.
146
147 +config SWIOTLB
148 + bool
149 + default n
150 +
151 source "drivers/firmware/Kconfig"
152
153 choice
154 @@ -560,7 +594,7 @@
155
156 config HIGHPTE
157 bool "Allocate 3rd-level pagetables from highmem"
158 - depends on HIGHMEM4G || HIGHMEM64G
159 + depends on (HIGHMEM4G || HIGHMEM64G) && !X86_XEN
160 help
161 The VM uses one page table entry for each page of physical memory.
162 For systems with a lot of RAM, this can be wasteful of precious
163 @@ -569,6 +603,7 @@
164
165 config MATH_EMULATION
166 bool "Math emulation"
167 + depends on !X86_XEN
168 ---help---
169 Linux can emulate a math coprocessor (used for floating point
170 operations) if you don't have one. 486DX and Pentium processors have
171 @@ -594,6 +629,8 @@
172
173 config MTRR
174 bool "MTRR (Memory Type Range Register) support"
175 + depends on !XEN_UNPRIVILEGED_GUEST
176 + default y if X86_XEN
177 ---help---
178 On Intel P6 family processors (Pentium Pro, Pentium II and later)
179 the Memory Type Range Registers (MTRRs) may be used to control
180 @@ -628,7 +665,7 @@
181
182 config EFI
183 bool "Boot from EFI support (EXPERIMENTAL)"
184 - depends on ACPI
185 + depends on ACPI && !X86_XEN
186 default n
187 ---help---
188 This enables the the kernel to boot on EFI platforms using
189 @@ -646,7 +683,7 @@
190
191 config IRQBALANCE
192 bool "Enable kernel irq balancing"
193 - depends on SMP && X86_IO_APIC
194 + depends on SMP && X86_IO_APIC && !X86_XEN
195 default y
196 help
197 The default yes will allow the kernel to do irq load balancing.
198 @@ -689,7 +726,7 @@
199
200 config KEXEC
201 bool "kexec system call (EXPERIMENTAL)"
202 - depends on EXPERIMENTAL
203 + depends on EXPERIMENTAL && !XEN_UNPRIVILEGED_GUEST
204 help
205 kexec is a system call that implements the ability to shutdown your
206 current kernel, and to start another kernel. It is like a reboot
207 @@ -743,6 +780,7 @@
208 config DOUBLEFAULT
209 default y
210 bool "Enable doublefault exception handler" if EMBEDDED
211 + depends on !X86_NO_TSS
212 help
213 This option allows trapping of rare doublefault exceptions that
214 would otherwise cause a system to silently reboot. Disabling this
215 @@ -756,18 +794,20 @@
216 depends on HIGHMEM
217
218 menu "Power management options (ACPI, APM)"
219 - depends on !X86_VOYAGER
220 + depends on !(X86_VOYAGER || XEN_UNPRIVILEGED_GUEST)
221
222 +if !X86_XEN
223 source kernel/power/Kconfig
224 +endif
225
226 source "drivers/acpi/Kconfig"
227
228 menu "APM (Advanced Power Management) BIOS Support"
229 -depends on PM && !X86_VISWS
230 +depends on PM && !(X86_VISWS || X86_XEN)
231
232 config APM
233 tristate "APM (Advanced Power Management) BIOS support"
234 - depends on PM
235 + depends on PM && PM_LEGACY
236 ---help---
237 APM is a BIOS specification for saving power using several different
238 techniques. This is mostly useful for battery powered laptops with
239 @@ -952,6 +992,7 @@
240
241 config PCI_GOBIOS
242 bool "BIOS"
243 + depends on !X86_XEN
244
245 config PCI_GOMMCONFIG
246 bool "MMConfig"
247 @@ -959,6 +1000,13 @@
248 config PCI_GODIRECT
249 bool "Direct"
250
251 +config PCI_GOXEN_FE
252 + bool "Xen PCI Frontend"
253 + depends on X86_XEN
254 + help
255 + The PCI device frontend driver allows the kernel to import arbitrary
256 + PCI devices from a PCI backend to support PCI driver domains.
257 +
258 config PCI_GOANY
259 bool "Any"
260
261 @@ -966,7 +1014,7 @@
262
263 config PCI_BIOS
264 bool
265 - depends on !X86_VISWS && PCI && (PCI_GOBIOS || PCI_GOANY)
266 + depends on !(X86_VISWS || X86_XEN) && PCI && (PCI_GOBIOS || PCI_GOANY)
267 default y
268
269 config PCI_DIRECT
270 @@ -979,6 +1027,18 @@
271 depends on PCI && ACPI && (PCI_GOMMCONFIG || PCI_GOANY)
272 default y
273
274 +config XEN_PCIDEV_FRONTEND
275 + bool
276 + depends on PCI && X86_XEN && (PCI_GOXEN_FE || PCI_GOANY)
277 + default y
278 +
279 +config XEN_PCIDEV_FE_DEBUG
280 + bool "Xen PCI Frontend Debugging"
281 + depends on XEN_PCIDEV_FRONTEND
282 + default n
283 + help
284 + Enables some debug statements within the PCI Frontend.
285 +
286 source "drivers/pci/pcie/Kconfig"
287
288 source "drivers/pci/Kconfig"
289 @@ -989,7 +1049,7 @@
290
291 config ISA
292 bool "ISA support"
293 - depends on !(X86_VOYAGER || X86_VISWS)
294 + depends on !(X86_VOYAGER || X86_VISWS || X86_XEN)
295 help
296 Find out whether you have ISA slots on your motherboard. ISA is the
297 name of a bus system, i.e. the way the CPU talks to the other stuff
298 @@ -1016,7 +1076,7 @@
299 source "drivers/eisa/Kconfig"
300
301 config MCA
302 - bool "MCA support" if !(X86_VISWS || X86_VOYAGER)
303 + bool "MCA support" if !(X86_VISWS || X86_VOYAGER || X86_XEN)
304 default y if X86_VOYAGER
305 help
306 MicroChannel Architecture is found in some IBM PS/2 machines and
307 @@ -1078,6 +1138,8 @@
308
309 source "crypto/Kconfig"
310
311 +source "drivers/xen/Kconfig"
312 +
313 source "lib/Kconfig"
314
315 #
316 @@ -1103,7 +1165,7 @@
317
318 config X86_HT
319 bool
320 - depends on SMP && !(X86_VISWS || X86_VOYAGER)
321 + depends on SMP && !(X86_VISWS || X86_VOYAGER || X86_XEN)
322 default y
323
324 config X86_BIOS_REBOOT
325 @@ -1116,6 +1178,16 @@
326 depends on X86_SMP || (X86_VOYAGER && SMP)
327 default y
328
329 +config X86_NO_TSS
330 + bool
331 + depends on X86_XEN
332 + default y
333 +
334 +config X86_NO_IDT
335 + bool
336 + depends on X86_XEN
337 + default y
338 +
339 config KTIME_SCALAR
340 bool
341 default y
342 diff -Nur linux-2.6.16.33-noxen/arch/i386/Kconfig.cpu linux-2.6.16.33/arch/i386/Kconfig.cpu
343 --- linux-2.6.16.33-noxen/arch/i386/Kconfig.cpu 2006-11-22 18:06:31.000000000 +0000
344 +++ linux-2.6.16.33/arch/i386/Kconfig.cpu 2007-01-08 15:00:45.000000000 +0000
345 @@ -251,7 +251,7 @@
346
347 config X86_F00F_BUG
348 bool
349 - depends on M586MMX || M586TSC || M586 || M486 || M386
350 + depends on (M586MMX || M586TSC || M586 || M486 || M386) && !X86_NO_IDT
351 default y
352
353 config X86_WP_WORKS_OK
354 diff -Nur linux-2.6.16.33-noxen/arch/i386/Makefile linux-2.6.16.33/arch/i386/Makefile
355 --- linux-2.6.16.33-noxen/arch/i386/Makefile 2006-11-22 18:06:31.000000000 +0000
356 +++ linux-2.6.16.33/arch/i386/Makefile 2007-01-08 15:00:45.000000000 +0000
357 @@ -45,6 +45,11 @@
358
359 CFLAGS += $(cflags-y)
360
361 +cppflags-$(CONFIG_XEN) += \
362 + -D__XEN_INTERFACE_VERSION__=$(CONFIG_XEN_INTERFACE_VERSION)
363 +
364 +CPPFLAGS += $(cppflags-y)
365 +
366 # Default subarch .c files
367 mcore-y := mach-default
368
369 @@ -68,6 +73,10 @@
370 mflags-$(CONFIG_X86_SUMMIT) := -Iinclude/asm-i386/mach-summit
371 mcore-$(CONFIG_X86_SUMMIT) := mach-default
372
373 +# Xen subarch support
374 +mflags-$(CONFIG_X86_XEN) := -Iinclude/asm-i386/mach-xen
375 +mcore-$(CONFIG_X86_XEN) := mach-xen
376 +
377 # generic subarchitecture
378 mflags-$(CONFIG_X86_GENERICARCH) := -Iinclude/asm-i386/mach-generic
379 mcore-$(CONFIG_X86_GENERICARCH) := mach-default
380 @@ -102,6 +111,19 @@
381 .PHONY: zImage bzImage compressed zlilo bzlilo \
382 zdisk bzdisk fdimage fdimage144 fdimage288 install
383
384 +ifdef CONFIG_XEN
385 +CPPFLAGS := -Iinclude$(if $(KBUILD_SRC),2)/asm/mach-xen $(CPPFLAGS)
386 +head-y := arch/i386/kernel/head-xen.o arch/i386/kernel/init_task-xen.o
387 +boot := arch/i386/boot-xen
388 +.PHONY: vmlinuz
389 +all: vmlinuz
390 +
391 +vmlinuz: vmlinux
392 + $(Q)$(MAKE) $(build)=$(boot) $@
393 +
394 +install:
395 + $(Q)$(MAKE) $(build)=$(boot) XENGUEST=$(XENGUEST) $@
396 +else
397 all: bzImage
398
399 # KBUILD_IMAGE specify target image being built
400 @@ -124,6 +146,7 @@
401
402 install:
403 $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(KBUILD_IMAGE) install
404 +endif
405
406 archclean:
407 $(Q)$(MAKE) $(clean)=arch/i386/boot
408 @@ -139,3 +162,4 @@
409 endef
410
411 CLEAN_FILES += arch/$(ARCH)/boot/fdimage arch/$(ARCH)/boot/mtools.conf
412 +CLEAN_FILES += vmlinuz vmlinux-stripped
413 diff -Nur linux-2.6.16.33-noxen/arch/i386/boot-xen/Makefile linux-2.6.16.33/arch/i386/boot-xen/Makefile
414 --- linux-2.6.16.33-noxen/arch/i386/boot-xen/Makefile 1970-01-01 00:00:00.000000000 +0000
415 +++ linux-2.6.16.33/arch/i386/boot-xen/Makefile 2007-01-08 15:00:45.000000000 +0000
416 @@ -0,0 +1,21 @@
417 +
418 +OBJCOPYFLAGS := -g --strip-unneeded
419 +
420 +vmlinuz: vmlinux-stripped FORCE
421 + $(call if_changed,gzip)
422 +
423 +vmlinux-stripped: vmlinux FORCE
424 + $(call if_changed,objcopy)
425 +
426 +INSTALL_ROOT := $(patsubst %/boot,%,$(INSTALL_PATH))
427 +
428 +XINSTALL_NAME ?= $(KERNELRELEASE)
429 +install:
430 + mkdir -p $(INSTALL_ROOT)/boot
431 + ln -f -s vmlinuz-$(XINSTALL_NAME)$(INSTALL_SUFFIX) $(INSTALL_ROOT)/boot/vmlinuz-$(VERSION).$(PATCHLEVEL).$(SUBLEVEL)$(XENGUEST)$(INSTALL_SUFFIX)
432 + rm -f $(INSTALL_ROOT)/boot/vmlinuz-$(XINSTALL_NAME)$(INSTALL_SUFFIX)
433 + install -m0644 vmlinuz $(INSTALL_ROOT)/boot/vmlinuz-$(XINSTALL_NAME)$(INSTALL_SUFFIX)
434 + install -m0644 vmlinux $(INSTALL_ROOT)/boot/vmlinux-syms-$(XINSTALL_NAME)$(INSTALL_SUFFIX)
435 + install -m0664 .config $(INSTALL_ROOT)/boot/config-$(XINSTALL_NAME)$(INSTALL_SUFFIX)
436 + install -m0664 System.map $(INSTALL_ROOT)/boot/System.map-$(XINSTALL_NAME)$(INSTALL_SUFFIX)
437 + ln -f -s vmlinuz-$(XINSTALL_NAME)$(INSTALL_SUFFIX) $(INSTALL_ROOT)/boot/vmlinuz-$(VERSION).$(PATCHLEVEL)$(XENGUEST)$(INSTALL_SUFFIX)
438 diff -Nur linux-2.6.16.33-noxen/arch/i386/kernel/Makefile linux-2.6.16.33/arch/i386/kernel/Makefile
439 --- linux-2.6.16.33-noxen/arch/i386/kernel/Makefile 2006-11-22 18:06:31.000000000 +0000
440 +++ linux-2.6.16.33/arch/i386/kernel/Makefile 2007-01-08 15:00:45.000000000 +0000
441 @@ -37,11 +37,18 @@
442 obj-$(CONFIG_DOUBLEFAULT) += doublefault.o
443 obj-$(CONFIG_VM86) += vm86.o
444 obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
445 +obj-$(CONFIG_SMP_ALTERNATIVES) += smpalts.o
446
447 EXTRA_AFLAGS := -traditional
448
449 obj-$(CONFIG_SCx200) += scx200.o
450
451 +ifdef CONFIG_XEN
452 +vsyscall_note := vsyscall-note-xen.o
453 +else
454 +vsyscall_note := vsyscall-note.o
455 +endif
456 +
457 # vsyscall.o contains the vsyscall DSO images as __initdata.
458 # We must build both images before we can assemble it.
459 # Note: kbuild does not track this dependency due to usage of .incbin
460 @@ -62,7 +69,7 @@
461
462 $(obj)/vsyscall-int80.so $(obj)/vsyscall-sysenter.so: \
463 $(obj)/vsyscall-%.so: $(src)/vsyscall.lds \
464 - $(obj)/vsyscall-%.o $(obj)/vsyscall-note.o FORCE
465 + $(obj)/vsyscall-%.o $(obj)/$(vsyscall_note) FORCE
466 $(call if_changed,syscall)
467
468 # We also create a special relocatable object that should mirror the symbol
469 @@ -74,5 +81,17 @@
470
471 SYSCFLAGS_vsyscall-syms.o = -r
472 $(obj)/vsyscall-syms.o: $(src)/vsyscall.lds \
473 - $(obj)/vsyscall-sysenter.o $(obj)/vsyscall-note.o FORCE
474 + $(obj)/vsyscall-sysenter.o $(obj)/$(vsyscall_note) FORCE
475 $(call if_changed,syscall)
476 +
477 +ifdef CONFIG_XEN
478 +include $(srctree)/scripts/Makefile.xen
479 +
480 +obj-y += fixup.o
481 +microcode-$(subst m,y,$(CONFIG_MICROCODE)) := microcode-xen.o
482 +n-obj-xen := i8259.o timers/ reboot.o smpboot.o trampoline.o
483 +
484 +obj-y := $(call filterxen, $(obj-y), $(n-obj-xen))
485 +obj-y := $(call cherrypickxen, $(obj-y))
486 +extra-y := $(call cherrypickxen, $(extra-y))
487 +endif
488 diff -Nur linux-2.6.16.33-noxen/arch/i386/kernel/acpi/Makefile linux-2.6.16.33/arch/i386/kernel/acpi/Makefile
489 --- linux-2.6.16.33-noxen/arch/i386/kernel/acpi/Makefile 2006-11-22 18:06:31.000000000 +0000
490 +++ linux-2.6.16.33/arch/i386/kernel/acpi/Makefile 2007-01-08 15:00:45.000000000 +0000
491 @@ -6,3 +6,7 @@
492 obj-y += cstate.o processor.o
493 endif
494
495 +ifdef CONFIG_XEN
496 +include $(srctree)/scripts/Makefile.xen
497 +obj-y := $(call cherrypickxen, $(obj-y), $(src))
498 +endif
499 diff -Nur linux-2.6.16.33-noxen/arch/i386/kernel/acpi/boot-xen.c linux-2.6.16.33/arch/i386/kernel/acpi/boot-xen.c
500 --- linux-2.6.16.33-noxen/arch/i386/kernel/acpi/boot-xen.c 1970-01-01 00:00:00.000000000 +0000
501 +++ linux-2.6.16.33/arch/i386/kernel/acpi/boot-xen.c 2007-01-08 15:00:45.000000000 +0000
502 @@ -0,0 +1,1161 @@
503 +/*
504 + * boot.c - Architecture-Specific Low-Level ACPI Boot Support
505 + *
506 + * Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com>
507 + * Copyright (C) 2001 Jun Nakajima <jun.nakajima@intel.com>
508 + *
509 + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
510 + *
511 + * This program is free software; you can redistribute it and/or modify
512 + * it under the terms of the GNU General Public License as published by
513 + * the Free Software Foundation; either version 2 of the License, or
514 + * (at your option) any later version.
515 + *
516 + * This program is distributed in the hope that it will be useful,
517 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
518 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
519 + * GNU General Public License for more details.
520 + *
521 + * You should have received a copy of the GNU General Public License
522 + * along with this program; if not, write to the Free Software
523 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
524 + *
525 + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
526 + */
527 +
528 +#include <linux/init.h>
529 +#include <linux/config.h>
530 +#include <linux/acpi.h>
531 +#include <linux/efi.h>
532 +#include <linux/module.h>
533 +#include <linux/dmi.h>
534 +#include <linux/irq.h>
535 +
536 +#include <asm/pgtable.h>
537 +#include <asm/io_apic.h>
538 +#include <asm/apic.h>
539 +#include <asm/io.h>
540 +#include <asm/mpspec.h>
541 +
542 +#ifdef CONFIG_X86_64
543 +
544 +extern void __init clustered_apic_check(void);
545 +
546 +extern int gsi_irq_sharing(int gsi);
547 +#include <asm/proto.h>
548 +
549 +static inline int acpi_madt_oem_check(char *oem_id, char *oem_table_id) { return 0; }
550 +
551 +
552 +#else /* X86 */
553 +
554 +#ifdef CONFIG_X86_LOCAL_APIC
555 +#include <mach_apic.h>
556 +#include <mach_mpparse.h>
557 +#endif /* CONFIG_X86_LOCAL_APIC */
558 +
559 +static inline int gsi_irq_sharing(int gsi) { return gsi; }
560 +
561 +#endif /* X86 */
562 +
563 +#define BAD_MADT_ENTRY(entry, end) ( \
564 + (!entry) || (unsigned long)entry + sizeof(*entry) > end || \
565 + ((acpi_table_entry_header *)entry)->length != sizeof(*entry))
566 +
567 +#define PREFIX "ACPI: "
568 +
569 +int acpi_noirq __initdata; /* skip ACPI IRQ initialization */
570 +int acpi_pci_disabled __initdata; /* skip ACPI PCI scan and IRQ initialization */
571 +int acpi_ht __initdata = 1; /* enable HT */
572 +
573 +int acpi_lapic;
574 +int acpi_ioapic;
575 +int acpi_strict;
576 +EXPORT_SYMBOL(acpi_strict);
577 +
578 +acpi_interrupt_flags acpi_sci_flags __initdata;
579 +int acpi_sci_override_gsi __initdata;
580 +int acpi_skip_timer_override __initdata;
581 +
582 +#ifdef CONFIG_X86_LOCAL_APIC
583 +static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE;
584 +#endif
585 +
586 +#ifndef __HAVE_ARCH_CMPXCHG
587 +#warning ACPI uses CMPXCHG, i486 and later hardware
588 +#endif
589 +
590 +#define MAX_MADT_ENTRIES 256
591 +u8 x86_acpiid_to_apicid[MAX_MADT_ENTRIES] =
592 + {[0 ... MAX_MADT_ENTRIES - 1] = 0xff };
593 +EXPORT_SYMBOL(x86_acpiid_to_apicid);
594 +
595 +/* --------------------------------------------------------------------------
596 + Boot-time Configuration
597 + -------------------------------------------------------------------------- */
598 +
599 +/*
600 + * The default interrupt routing model is PIC (8259). This gets
601 + * overridden if IOAPICs are enumerated (below).
602 + */
603 +enum acpi_irq_model_id acpi_irq_model = ACPI_IRQ_MODEL_PIC;
604 +
605 +#if defined(CONFIG_X86_64) && !defined(CONFIG_XEN)
606 +
607 +/* rely on all ACPI tables being in the direct mapping */
608 +char *__acpi_map_table(unsigned long phys_addr, unsigned long size)
609 +{
610 + if (!phys_addr || !size)
611 + return NULL;
612 +
613 + if (phys_addr+size <= (end_pfn_map << PAGE_SHIFT) + PAGE_SIZE)
614 + return __va(phys_addr);
615 +
616 + return NULL;
617 +}
618 +
619 +#else
620 +
621 +/*
622 + * Temporarily use the virtual area starting from FIX_IO_APIC_BASE_END,
623 + * to map the target physical address. The problem is that set_fixmap()
624 + * provides a single page, and it is possible that the page is not
625 + * sufficient.
626 + * By using this area, we can map up to MAX_IO_APICS pages temporarily,
627 + * i.e. until the next __va_range() call.
628 + *
629 + * Important Safety Note: The fixed I/O APIC page numbers are *subtracted*
630 + * from the fixed base. That's why we start at FIX_IO_APIC_BASE_END and
631 + * count idx down while incrementing the phys address.
632 + */
633 +char *__acpi_map_table(unsigned long phys, unsigned long size)
634 +{
635 + unsigned long base, offset, mapped_size;
636 + int idx;
637 +
638 +#ifndef CONFIG_XEN
639 + if (phys + size < 8 * 1024 * 1024)
640 + return __va(phys);
641 +#endif
642 +
643 + offset = phys & (PAGE_SIZE - 1);
644 + mapped_size = PAGE_SIZE - offset;
645 + set_fixmap(FIX_ACPI_END, phys);
646 + base = fix_to_virt(FIX_ACPI_END);
647 +
648 + /*
649 + * Most cases can be covered by the below.
650 + */
651 + idx = FIX_ACPI_END;
652 + while (mapped_size < size) {
653 + if (--idx < FIX_ACPI_BEGIN)
654 + return NULL; /* cannot handle this */
655 + phys += PAGE_SIZE;
656 + set_fixmap(idx, phys);
657 + mapped_size += PAGE_SIZE;
658 + }
659 +
660 + return ((unsigned char *)base + offset);
661 +}
662 +#endif
663 +
664 +#ifdef CONFIG_PCI_MMCONFIG
665 +/* The physical address of the MMCONFIG aperture. Set from ACPI tables. */
666 +struct acpi_table_mcfg_config *pci_mmcfg_config;
667 +int pci_mmcfg_config_num;
668 +
669 +int __init acpi_parse_mcfg(unsigned long phys_addr, unsigned long size)
670 +{
671 + struct acpi_table_mcfg *mcfg;
672 + unsigned long i;
673 + int config_size;
674 +
675 + if (!phys_addr || !size)
676 + return -EINVAL;
677 +
678 + mcfg = (struct acpi_table_mcfg *)__acpi_map_table(phys_addr, size);
679 + if (!mcfg) {
680 + printk(KERN_WARNING PREFIX "Unable to map MCFG\n");
681 + return -ENODEV;
682 + }
683 +
684 + /* how many config structures do we have */
685 + pci_mmcfg_config_num = 0;
686 + i = size - sizeof(struct acpi_table_mcfg);
687 + while (i >= sizeof(struct acpi_table_mcfg_config)) {
688 + ++pci_mmcfg_config_num;
689 + i -= sizeof(struct acpi_table_mcfg_config);
690 + };
691 + if (pci_mmcfg_config_num == 0) {
692 + printk(KERN_ERR PREFIX "MMCONFIG has no entries\n");
693 + return -ENODEV;
694 + }
695 +
696 + config_size = pci_mmcfg_config_num * sizeof(*pci_mmcfg_config);
697 + pci_mmcfg_config = kmalloc(config_size, GFP_KERNEL);
698 + if (!pci_mmcfg_config) {
699 + printk(KERN_WARNING PREFIX
700 + "No memory for MCFG config tables\n");
701 + return -ENOMEM;
702 + }
703 +
704 + memcpy(pci_mmcfg_config, &mcfg->config, config_size);
705 + for (i = 0; i < pci_mmcfg_config_num; ++i) {
706 + if (mcfg->config[i].base_reserved) {
707 + printk(KERN_ERR PREFIX
708 + "MMCONFIG not in low 4GB of memory\n");
709 + return -ENODEV;
710 + }
711 + }
712 +
713 + return 0;
714 +}
715 +#endif /* CONFIG_PCI_MMCONFIG */
716 +
717 +#ifdef CONFIG_X86_LOCAL_APIC
718 +static int __init acpi_parse_madt(unsigned long phys_addr, unsigned long size)
719 +{
720 + struct acpi_table_madt *madt = NULL;
721 +
722 + if (!phys_addr || !size)
723 + return -EINVAL;
724 +
725 + madt = (struct acpi_table_madt *)__acpi_map_table(phys_addr, size);
726 + if (!madt) {
727 + printk(KERN_WARNING PREFIX "Unable to map MADT\n");
728 + return -ENODEV;
729 + }
730 +
731 + if (madt->lapic_address) {
732 + acpi_lapic_addr = (u64) madt->lapic_address;
733 +
734 + printk(KERN_DEBUG PREFIX "Local APIC address 0x%08x\n",
735 + madt->lapic_address);
736 + }
737 +
738 + acpi_madt_oem_check(madt->header.oem_id, madt->header.oem_table_id);
739 +
740 + return 0;
741 +}
742 +
743 +static int __init
744 +acpi_parse_lapic(acpi_table_entry_header * header, const unsigned long end)
745 +{
746 + struct acpi_table_lapic *processor = NULL;
747 +
748 + processor = (struct acpi_table_lapic *)header;
749 +
750 + if (BAD_MADT_ENTRY(processor, end))
751 + return -EINVAL;
752 +
753 + acpi_table_print_madt_entry(header);
754 +
755 + /* Record local apic id only when enabled */
756 + if (processor->flags.enabled)
757 + x86_acpiid_to_apicid[processor->acpi_id] = processor->id;
758 +
759 + /*
760 + * We need to register disabled CPUs as well to permit
761 + * counting disabled CPUs. This allows us to size
762 + * cpus_possible_map more accurately, so that we need
763 + * not preallocate memory for all NR_CPUS
764 + * when we use CPU hotplug.
765 + */
766 + mp_register_lapic(processor->id, /* APIC ID */
767 + processor->flags.enabled); /* Enabled? */
768 +
769 + return 0;
770 +}
771 +
772 +static int __init
773 +acpi_parse_lapic_addr_ovr(acpi_table_entry_header * header,
774 + const unsigned long end)
775 +{
776 + struct acpi_table_lapic_addr_ovr *lapic_addr_ovr = NULL;
777 +
778 + lapic_addr_ovr = (struct acpi_table_lapic_addr_ovr *)header;
779 +
780 + if (BAD_MADT_ENTRY(lapic_addr_ovr, end))
781 + return -EINVAL;
782 +
783 + acpi_lapic_addr = lapic_addr_ovr->address;
784 +
785 + return 0;
786 +}
787 +
788 +static int __init
789 +acpi_parse_lapic_nmi(acpi_table_entry_header * header, const unsigned long end)
790 +{
791 + struct acpi_table_lapic_nmi *lapic_nmi = NULL;
792 +
793 + lapic_nmi = (struct acpi_table_lapic_nmi *)header;
794 +
795 + if (BAD_MADT_ENTRY(lapic_nmi, end))
796 + return -EINVAL;
797 +
798 + acpi_table_print_madt_entry(header);
799 +
800 + if (lapic_nmi->lint != 1)
801 + printk(KERN_WARNING PREFIX "NMI not connected to LINT 1!\n");
802 +
803 + return 0;
804 +}
805 +
806 +#endif /*CONFIG_X86_LOCAL_APIC */
807 +
808 +#ifdef CONFIG_X86_IO_APIC
809 +
810 +static int __init
811 +acpi_parse_ioapic(acpi_table_entry_header * header, const unsigned long end)
812 +{
813 + struct acpi_table_ioapic *ioapic = NULL;
814 +
815 + ioapic = (struct acpi_table_ioapic *)header;
816 +
817 + if (BAD_MADT_ENTRY(ioapic, end))
818 + return -EINVAL;
819 +
820 + acpi_table_print_madt_entry(header);
821 +
822 + mp_register_ioapic(ioapic->id,
823 + ioapic->address, ioapic->global_irq_base);
824 +
825 + return 0;
826 +}
827 +
828 +/*
829 + * Parse Interrupt Source Override for the ACPI SCI
830 + */
831 +static void acpi_sci_ioapic_setup(u32 gsi, u16 polarity, u16 trigger)
832 +{
833 + if (trigger == 0) /* compatible SCI trigger is level */
834 + trigger = 3;
835 +
836 + if (polarity == 0) /* compatible SCI polarity is low */
837 + polarity = 3;
838 +
839 + /* Command-line over-ride via acpi_sci= */
840 + if (acpi_sci_flags.trigger)
841 + trigger = acpi_sci_flags.trigger;
842 +
843 + if (acpi_sci_flags.polarity)
844 + polarity = acpi_sci_flags.polarity;
845 +
846 + /*
847 + * mp_config_acpi_legacy_irqs() already setup IRQs < 16
848 + * If GSI is < 16, this will update its flags,
849 + * else it will create a new mp_irqs[] entry.
850 + */
851 + mp_override_legacy_irq(gsi, polarity, trigger, gsi);
852 +
853 + /*
854 + * stash over-ride to indicate we've been here
855 + * and for later update of acpi_fadt
856 + */
857 + acpi_sci_override_gsi = gsi;
858 + return;
859 +}
860 +
861 +static int __init
862 +acpi_parse_int_src_ovr(acpi_table_entry_header * header,
863 + const unsigned long end)
864 +{
865 + struct acpi_table_int_src_ovr *intsrc = NULL;
866 +
867 + intsrc = (struct acpi_table_int_src_ovr *)header;
868 +
869 + if (BAD_MADT_ENTRY(intsrc, end))
870 + return -EINVAL;
871 +
872 + acpi_table_print_madt_entry(header);
873 +
874 + if (intsrc->bus_irq == acpi_fadt.sci_int) {
875 + acpi_sci_ioapic_setup(intsrc->global_irq,
876 + intsrc->flags.polarity,
877 + intsrc->flags.trigger);
878 + return 0;
879 + }
880 +
881 + if (acpi_skip_timer_override &&
882 + intsrc->bus_irq == 0 && intsrc->global_irq == 2) {
883 + printk(PREFIX "BIOS IRQ0 pin2 override ignored.\n");
884 + return 0;
885 + }
886 +
887 + mp_override_legacy_irq(intsrc->bus_irq,
888 + intsrc->flags.polarity,
889 + intsrc->flags.trigger, intsrc->global_irq);
890 +
891 + return 0;
892 +}
893 +
894 +static int __init
895 +acpi_parse_nmi_src(acpi_table_entry_header * header, const unsigned long end)
896 +{
897 + struct acpi_table_nmi_src *nmi_src = NULL;
898 +
899 + nmi_src = (struct acpi_table_nmi_src *)header;
900 +
901 + if (BAD_MADT_ENTRY(nmi_src, end))
902 + return -EINVAL;
903 +
904 + acpi_table_print_madt_entry(header);
905 +
906 + /* TBD: Support nmi_src entries? */
907 +
908 + return 0;
909 +}
910 +
911 +#endif /* CONFIG_X86_IO_APIC */
912 +
913 +/*
914 + * acpi_pic_sci_set_trigger()
915 + *
916 + * use ELCR to set PIC-mode trigger type for SCI
917 + *
918 + * If a PIC-mode SCI is not recognized or gives spurious IRQ7's
919 + * it may require Edge Trigger -- use "acpi_sci=edge"
920 + *
921 + * Ports 0x4d0-0x4d1 are ELCR1 and ELCR2, the Edge/Level Control Registers
922 + * for the 8259 PIC. bit[n] = 1 means irq[n] is Level, otherwise Edge.
923 + * ELCR1 is IRQs 0-7 (IRQ 0, 1, 2 must be 0)
924 + * ELCR2 is IRQs 8-15 (IRQ 8, 13 must be 0)
925 + */
926 +
927 +void __init acpi_pic_sci_set_trigger(unsigned int irq, u16 trigger)
928 +{
929 + unsigned int mask = 1 << irq;
930 + unsigned int old, new;
931 +
932 + /* Real old ELCR mask */
933 + old = inb(0x4d0) | (inb(0x4d1) << 8);
934 +
935 + /*
936 + * If we use ACPI to set PCI irq's, then we should clear ELCR
937 + * since we will set it correctly as we enable the PCI irq
938 + * routing.
939 + */
940 + new = acpi_noirq ? old : 0;
941 +
942 + /*
943 + * Update SCI information in the ELCR, it isn't in the PCI
944 + * routing tables..
945 + */
946 + switch (trigger) {
947 + case 1: /* Edge - clear */
948 + new &= ~mask;
949 + break;
950 + case 3: /* Level - set */
951 + new |= mask;
952 + break;
953 + }
954 +
955 + if (old == new)
956 + return;
957 +
958 + printk(PREFIX "setting ELCR to %04x (from %04x)\n", new, old);
959 + outb(new, 0x4d0);
960 + outb(new >> 8, 0x4d1);
961 +}
962 +
963 +int acpi_gsi_to_irq(u32 gsi, unsigned int *irq)
964 +{
965 +#ifdef CONFIG_X86_IO_APIC
966 + if (use_pci_vector() && !platform_legacy_irq(gsi))
967 + *irq = IO_APIC_VECTOR(gsi);
968 + else
969 +#endif
970 + *irq = gsi_irq_sharing(gsi);
971 + return 0;
972 +}
973 +
974 +/*
975 + * success: return IRQ number (>=0)
976 + * failure: return < 0
977 + */
978 +int acpi_register_gsi(u32 gsi, int triggering, int polarity)
979 +{
980 + unsigned int irq;
981 + unsigned int plat_gsi = gsi;
982 +
983 +#ifdef CONFIG_PCI
984 + /*
985 + * Make sure all (legacy) PCI IRQs are set as level-triggered.
986 + */
987 + if (acpi_irq_model == ACPI_IRQ_MODEL_PIC) {
988 + extern void eisa_set_level_irq(unsigned int irq);
989 +
990 + if (triggering == ACPI_LEVEL_SENSITIVE)
991 + eisa_set_level_irq(gsi);
992 + }
993 +#endif
994 +
995 +#ifdef CONFIG_X86_IO_APIC
996 + if (acpi_irq_model == ACPI_IRQ_MODEL_IOAPIC) {
997 + plat_gsi = mp_register_gsi(gsi, triggering, polarity);
998 + }
999 +#endif
1000 + acpi_gsi_to_irq(plat_gsi, &irq);
1001 + return irq;
1002 +}
1003 +
1004 +EXPORT_SYMBOL(acpi_register_gsi);
1005 +
1006 +/*
1007 + * ACPI based hotplug support for CPU
1008 + */
1009 +#ifdef CONFIG_ACPI_HOTPLUG_CPU
1010 +int acpi_map_lsapic(acpi_handle handle, int *pcpu)
1011 +{
1012 + /* TBD */
1013 + return -EINVAL;
1014 +}
1015 +
1016 +EXPORT_SYMBOL(acpi_map_lsapic);
1017 +
1018 +int acpi_unmap_lsapic(int cpu)
1019 +{
1020 + /* TBD */
1021 + return -EINVAL;
1022 +}
1023 +
1024 +EXPORT_SYMBOL(acpi_unmap_lsapic);
1025 +#endif /* CONFIG_ACPI_HOTPLUG_CPU */
1026 +
1027 +int acpi_register_ioapic(acpi_handle handle, u64 phys_addr, u32 gsi_base)
1028 +{
1029 + /* TBD */
1030 + return -EINVAL;
1031 +}
1032 +
1033 +EXPORT_SYMBOL(acpi_register_ioapic);
1034 +
1035 +int acpi_unregister_ioapic(acpi_handle handle, u32 gsi_base)
1036 +{
1037 + /* TBD */
1038 + return -EINVAL;
1039 +}
1040 +
1041 +EXPORT_SYMBOL(acpi_unregister_ioapic);
1042 +
1043 +static unsigned long __init
1044 +acpi_scan_rsdp(unsigned long start, unsigned long length)
1045 +{
1046 + unsigned long offset = 0;
1047 + unsigned long sig_len = sizeof("RSD PTR ") - 1;
1048 + unsigned long vstart = (unsigned long)isa_bus_to_virt(start);
1049 +
1050 + /*
1051 + * Scan all 16-byte boundaries of the physical memory region for the
1052 + * RSDP signature.
1053 + */
1054 + for (offset = 0; offset < length; offset += 16) {
1055 + if (strncmp((char *)(vstart + offset), "RSD PTR ", sig_len))
1056 + continue;
1057 + return (start + offset);
1058 + }
1059 +
1060 + return 0;
1061 +}
1062 +
1063 +static int __init acpi_parse_sbf(unsigned long phys_addr, unsigned long size)
1064 +{
1065 + struct acpi_table_sbf *sb;
1066 +
1067 + if (!phys_addr || !size)
1068 + return -EINVAL;
1069 +
1070 + sb = (struct acpi_table_sbf *)__acpi_map_table(phys_addr, size);
1071 + if (!sb) {
1072 + printk(KERN_WARNING PREFIX "Unable to map SBF\n");
1073 + return -ENODEV;
1074 + }
1075 +
1076 + sbf_port = sb->sbf_cmos; /* Save CMOS port */
1077 +
1078 + return 0;
1079 +}
1080 +
1081 +#ifdef CONFIG_HPET_TIMER
1082 +
1083 +static int __init acpi_parse_hpet(unsigned long phys, unsigned long size)
1084 +{
1085 + struct acpi_table_hpet *hpet_tbl;
1086 +
1087 + if (!phys || !size)
1088 + return -EINVAL;
1089 +
1090 + hpet_tbl = (struct acpi_table_hpet *)__acpi_map_table(phys, size);
1091 + if (!hpet_tbl) {
1092 + printk(KERN_WARNING PREFIX "Unable to map HPET\n");
1093 + return -ENODEV;
1094 + }
1095 +
1096 + if (hpet_tbl->addr.space_id != ACPI_SPACE_MEM) {
1097 + printk(KERN_WARNING PREFIX "HPET timers must be located in "
1098 + "memory.\n");
1099 + return -1;
1100 + }
1101 +#ifdef CONFIG_X86_64
1102 + vxtime.hpet_address = hpet_tbl->addr.addrl |
1103 + ((long)hpet_tbl->addr.addrh << 32);
1104 +
1105 + printk(KERN_INFO PREFIX "HPET id: %#x base: %#lx\n",
1106 + hpet_tbl->id, vxtime.hpet_address);
1107 +#else /* X86 */
1108 + {
1109 + extern unsigned long hpet_address;
1110 +
1111 + hpet_address = hpet_tbl->addr.addrl;
1112 + printk(KERN_INFO PREFIX "HPET id: %#x base: %#lx\n",
1113 + hpet_tbl->id, hpet_address);
1114 + }
1115 +#endif /* X86 */
1116 +
1117 + return 0;
1118 +}
1119 +#else
1120 +#define acpi_parse_hpet NULL
1121 +#endif
1122 +
1123 +#ifdef CONFIG_X86_PM_TIMER
1124 +extern u32 pmtmr_ioport;
1125 +#endif
1126 +
1127 +static int __init acpi_parse_fadt(unsigned long phys, unsigned long size)
1128 +{
1129 + struct fadt_descriptor_rev2 *fadt = NULL;
1130 +
1131 + fadt = (struct fadt_descriptor_rev2 *)__acpi_map_table(phys, size);
1132 + if (!fadt) {
1133 + printk(KERN_WARNING PREFIX "Unable to map FADT\n");
1134 + return 0;
1135 + }
1136 + /* initialize sci_int early for INT_SRC_OVR MADT parsing */
1137 + acpi_fadt.sci_int = fadt->sci_int;
1138 +
1139 + /* initialize rev and apic_phys_dest_mode for x86_64 genapic */
1140 + acpi_fadt.revision = fadt->revision;
1141 + acpi_fadt.force_apic_physical_destination_mode =
1142 + fadt->force_apic_physical_destination_mode;
1143 +
1144 +#if defined(CONFIG_X86_PM_TIMER) && !defined(CONFIG_XEN)
1145 + /* detect the location of the ACPI PM Timer */
1146 + if (fadt->revision >= FADT2_REVISION_ID) {
1147 + /* FADT rev. 2 */
1148 + if (fadt->xpm_tmr_blk.address_space_id !=
1149 + ACPI_ADR_SPACE_SYSTEM_IO)
1150 + return 0;
1151 +
1152 + pmtmr_ioport = fadt->xpm_tmr_blk.address;
1153 + /*
1154 + * "X" fields are optional extensions to the original V1.0
1155 + * fields, so we must selectively expand V1.0 fields if the
1156 + * corresponding X field is zero.
1157 + */
1158 + if (!pmtmr_ioport)
1159 + pmtmr_ioport = fadt->V1_pm_tmr_blk;
1160 + } else {
1161 + /* FADT rev. 1 */
1162 + pmtmr_ioport = fadt->V1_pm_tmr_blk;
1163 + }
1164 + if (pmtmr_ioport)
1165 + printk(KERN_INFO PREFIX "PM-Timer IO Port: %#x\n",
1166 + pmtmr_ioport);
1167 +#endif
1168 + return 0;
1169 +}
1170 +
1171 +unsigned long __init acpi_find_rsdp(void)
1172 +{
1173 + unsigned long rsdp_phys = 0;
1174 +
1175 + if (efi_enabled) {
1176 + if (efi.acpi20)
1177 + return __pa(efi.acpi20);
1178 + else if (efi.acpi)
1179 + return __pa(efi.acpi);
1180 + }
1181 + /*
1182 + * Scan memory looking for the RSDP signature. First search EBDA (low
1183 + * memory) paragraphs and then search upper memory (E0000-FFFFF).
1184 + */
1185 + rsdp_phys = acpi_scan_rsdp(0, 0x400);
1186 + if (!rsdp_phys)
1187 + rsdp_phys = acpi_scan_rsdp(0xE0000, 0x20000);
1188 +
1189 + return rsdp_phys;
1190 +}
1191 +
1192 +#ifdef CONFIG_X86_LOCAL_APIC
1193 +/*
1194 + * Parse LAPIC entries in MADT
1195 + * returns 0 on success, < 0 on error
1196 + */
1197 +static int __init acpi_parse_madt_lapic_entries(void)
1198 +{
1199 + int count;
1200 +
1201 + /*
1202 + * Note that the LAPIC address is obtained from the MADT (32-bit value)
1203 + * and (optionally) overridden by a LAPIC_ADDR_OVR entry (64-bit value).
1204 + */
1205 +
1206 + count =
1207 + acpi_table_parse_madt(ACPI_MADT_LAPIC_ADDR_OVR,
1208 + acpi_parse_lapic_addr_ovr, 0);
1209 + if (count < 0) {
1210 + printk(KERN_ERR PREFIX
1211 + "Error parsing LAPIC address override entry\n");
1212 + return count;
1213 + }
1214 +
1215 + mp_register_lapic_address(acpi_lapic_addr);
1216 +
1217 + count = acpi_table_parse_madt(ACPI_MADT_LAPIC, acpi_parse_lapic,
1218 + MAX_APICS);
1219 + if (!count) {
1220 + printk(KERN_ERR PREFIX "No LAPIC entries present\n");
1221 + /* TBD: Cleanup to allow fallback to MPS */
1222 + return -ENODEV;
1223 + } else if (count < 0) {
1224 + printk(KERN_ERR PREFIX "Error parsing LAPIC entry\n");
1225 + /* TBD: Cleanup to allow fallback to MPS */
1226 + return count;
1227 + }
1228 +
1229 + count =
1230 + acpi_table_parse_madt(ACPI_MADT_LAPIC_NMI, acpi_parse_lapic_nmi, 0);
1231 + if (count < 0) {
1232 + printk(KERN_ERR PREFIX "Error parsing LAPIC NMI entry\n");
1233 + /* TBD: Cleanup to allow fallback to MPS */
1234 + return count;
1235 + }
1236 + return 0;
1237 +}
1238 +#endif /* CONFIG_X86_LOCAL_APIC */
1239 +
1240 +#ifdef CONFIG_X86_IO_APIC
1241 +/*
1242 + * Parse IOAPIC related entries in MADT
1243 + * returns 0 on success, < 0 on error
1244 + */
1245 +static int __init acpi_parse_madt_ioapic_entries(void)
1246 +{
1247 + int count;
1248 +
1249 + /*
1250 + * ACPI interpreter is required to complete interrupt setup,
1251 + * so if it is off, don't enumerate the io-apics with ACPI.
1252 + * If MPS is present, it will handle them,
1253 + * otherwise the system will stay in PIC mode
1254 + */
1255 + if (acpi_disabled || acpi_noirq) {
1256 + return -ENODEV;
1257 + }
1258 +
1259 + /*
1260 + * if "noapic" boot option, don't look for IO-APICs
1261 + */
1262 + if (skip_ioapic_setup) {
1263 + printk(KERN_INFO PREFIX "Skipping IOAPIC probe "
1264 + "due to 'noapic' option.\n");
1265 + return -ENODEV;
1266 + }
1267 +
1268 + count =
1269 + acpi_table_parse_madt(ACPI_MADT_IOAPIC, acpi_parse_ioapic,
1270 + MAX_IO_APICS);
1271 + if (!count) {
1272 + printk(KERN_ERR PREFIX "No IOAPIC entries present\n");
1273 + return -ENODEV;
1274 + } else if (count < 0) {
1275 + printk(KERN_ERR PREFIX "Error parsing IOAPIC entry\n");
1276 + return count;
1277 + }
1278 +
1279 + count =
1280 + acpi_table_parse_madt(ACPI_MADT_INT_SRC_OVR, acpi_parse_int_src_ovr,
1281 + NR_IRQ_VECTORS);
1282 + if (count < 0) {
1283 + printk(KERN_ERR PREFIX
1284 + "Error parsing interrupt source overrides entry\n");
1285 + /* TBD: Cleanup to allow fallback to MPS */
1286 + return count;
1287 + }
1288 +
1289 + /*
1290 + * If BIOS did not supply an INT_SRC_OVR for the SCI
1291 + * pretend we got one so we can set the SCI flags.
1292 + */
1293 + if (!acpi_sci_override_gsi)
1294 + acpi_sci_ioapic_setup(acpi_fadt.sci_int, 0, 0);
1295 +
1296 + /* Fill in identity legacy mappings where no override */
1297 + mp_config_acpi_legacy_irqs();
1298 +
1299 + count =
1300 + acpi_table_parse_madt(ACPI_MADT_NMI_SRC, acpi_parse_nmi_src,
1301 + NR_IRQ_VECTORS);
1302 + if (count < 0) {
1303 + printk(KERN_ERR PREFIX "Error parsing NMI SRC entry\n");
1304 + /* TBD: Cleanup to allow fallback to MPS */
1305 + return count;
1306 + }
1307 +
1308 + return 0;
1309 +}
1310 +#else
1311 +static inline int acpi_parse_madt_ioapic_entries(void)
1312 +{
1313 + return -1;
1314 +}
1315 +#endif /* !CONFIG_X86_IO_APIC */
1316 +
1317 +static void __init acpi_process_madt(void)
1318 +{
1319 +#ifdef CONFIG_X86_LOCAL_APIC
1320 + int count, error;
1321 +
1322 + count = acpi_table_parse(ACPI_APIC, acpi_parse_madt);
1323 + if (count >= 1) {
1324 +
1325 + /*
1326 + * Parse MADT LAPIC entries
1327 + */
1328 + error = acpi_parse_madt_lapic_entries();
1329 + if (!error) {
1330 + acpi_lapic = 1;
1331 +
1332 +#ifdef CONFIG_X86_GENERICARCH
1333 + generic_bigsmp_probe();
1334 +#endif
1335 + /*
1336 + * Parse MADT IO-APIC entries
1337 + */
1338 + error = acpi_parse_madt_ioapic_entries();
1339 + if (!error) {
1340 + acpi_irq_model = ACPI_IRQ_MODEL_IOAPIC;
1341 + acpi_irq_balance_set(NULL);
1342 + acpi_ioapic = 1;
1343 +
1344 + smp_found_config = 1;
1345 + clustered_apic_check();
1346 + }
1347 + }
1348 + if (error == -EINVAL) {
1349 + /*
1350 + * Dell Precision Workstation 410, 610 come here.
1351 + */
1352 + printk(KERN_ERR PREFIX
1353 + "Invalid BIOS MADT, disabling ACPI\n");
1354 + disable_acpi();
1355 + }
1356 + }
1357 +#endif
1358 + return;
1359 +}
1360 +
1361 +extern int acpi_force;
1362 +
1363 +#ifdef __i386__
1364 +
1365 +static int __init disable_acpi_irq(struct dmi_system_id *d)
1366 +{
1367 + if (!acpi_force) {
1368 + printk(KERN_NOTICE "%s detected: force use of acpi=noirq\n",
1369 + d->ident);
1370 + acpi_noirq_set();
1371 + }
1372 + return 0;
1373 +}
1374 +
1375 +static int __init disable_acpi_pci(struct dmi_system_id *d)
1376 +{
1377 + if (!acpi_force) {
1378 + printk(KERN_NOTICE "%s detected: force use of pci=noacpi\n",
1379 + d->ident);
1380 + acpi_disable_pci();
1381 + }
1382 + return 0;
1383 +}
1384 +
1385 +static int __init dmi_disable_acpi(struct dmi_system_id *d)
1386 +{
1387 + if (!acpi_force) {
1388 + printk(KERN_NOTICE "%s detected: acpi off\n", d->ident);
1389 + disable_acpi();
1390 + } else {
1391 + printk(KERN_NOTICE
1392 + "Warning: DMI blacklist says broken, but acpi forced\n");
1393 + }
1394 + return 0;
1395 +}
1396 +
1397 +/*
1398 + * Limit ACPI to CPU enumeration for HT
1399 + */
1400 +static int __init force_acpi_ht(struct dmi_system_id *d)
1401 +{
1402 + if (!acpi_force) {
1403 + printk(KERN_NOTICE "%s detected: force use of acpi=ht\n",
1404 + d->ident);
1405 + disable_acpi();
1406 + acpi_ht = 1;
1407 + } else {
1408 + printk(KERN_NOTICE
1409 + "Warning: acpi=force overrules DMI blacklist: acpi=ht\n");
1410 + }
1411 + return 0;
1412 +}
1413 +
1414 +/*
1415 + * If your system is blacklisted here, but you find that acpi=force
1416 + * works for you, please contact acpi-devel@sourceforge.net
1417 + */
1418 +static struct dmi_system_id __initdata acpi_dmi_table[] = {
1419 + /*
1420 + * Boxes that need ACPI disabled
1421 + */
1422 + {
1423 + .callback = dmi_disable_acpi,
1424 + .ident = "IBM Thinkpad",
1425 + .matches = {
1426 + DMI_MATCH(DMI_BOARD_VENDOR, "IBM"),
1427 + DMI_MATCH(DMI_BOARD_NAME, "2629H1G"),
1428 + },
1429 + },
1430 +
1431 + /*
1432 + * Boxes that need acpi=ht
1433 + */
1434 + {
1435 + .callback = force_acpi_ht,
1436 + .ident = "FSC Primergy T850",
1437 + .matches = {
1438 + DMI_MATCH(DMI_SYS_VENDOR, "FUJITSU SIEMENS"),
1439 + DMI_MATCH(DMI_PRODUCT_NAME, "PRIMERGY T850"),
1440 + },
1441 + },
1442 + {
1443 + .callback = force_acpi_ht,
1444 + .ident = "DELL GX240",
1445 + .matches = {
1446 + DMI_MATCH(DMI_BOARD_VENDOR, "Dell Computer Corporation"),
1447 + DMI_MATCH(DMI_BOARD_NAME, "OptiPlex GX240"),
1448 + },
1449 + },
1450 + {
1451 + .callback = force_acpi_ht,
1452 + .ident = "HP VISUALIZE NT Workstation",
1453 + .matches = {
1454 + DMI_MATCH(DMI_BOARD_VENDOR, "Hewlett-Packard"),
1455 + DMI_MATCH(DMI_PRODUCT_NAME, "HP VISUALIZE NT Workstation"),
1456 + },
1457 + },
1458 + {
1459 + .callback = force_acpi_ht,
1460 + .ident = "Compaq Workstation W8000",
1461 + .matches = {
1462 + DMI_MATCH(DMI_SYS_VENDOR, "Compaq"),
1463 + DMI_MATCH(DMI_PRODUCT_NAME, "Workstation W8000"),
1464 + },
1465 + },
1466 + {
1467 + .callback = force_acpi_ht,
1468 + .ident = "ASUS P4B266",
1469 + .matches = {
1470 + DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."),
1471 + DMI_MATCH(DMI_BOARD_NAME, "P4B266"),
1472 + },
1473 + },
1474 + {
1475 + .callback = force_acpi_ht,
1476 + .ident = "ASUS P2B-DS",
1477 + .matches = {
1478 + DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."),
1479 + DMI_MATCH(DMI_BOARD_NAME, "P2B-DS"),
1480 + },
1481 + },
1482 + {
1483 + .callback = force_acpi_ht,
1484 + .ident = "ASUS CUR-DLS",
1485 + .matches = {
1486 + DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."),
1487 + DMI_MATCH(DMI_BOARD_NAME, "CUR-DLS"),
1488 + },
1489 + },
1490 + {
1491 + .callback = force_acpi_ht,
1492 + .ident = "ABIT i440BX-W83977",
1493 + .matches = {
1494 + DMI_MATCH(DMI_BOARD_VENDOR, "ABIT <http://www.abit.com>"),
1495 + DMI_MATCH(DMI_BOARD_NAME, "i440BX-W83977 (BP6)"),
1496 + },
1497 + },
1498 + {
1499 + .callback = force_acpi_ht,
1500 + .ident = "IBM Bladecenter",
1501 + .matches = {
1502 + DMI_MATCH(DMI_BOARD_VENDOR, "IBM"),
1503 + DMI_MATCH(DMI_BOARD_NAME, "IBM eServer BladeCenter HS20"),
1504 + },
1505 + },
1506 + {
1507 + .callback = force_acpi_ht,
1508 + .ident = "IBM eServer xSeries 360",
1509 + .matches = {
1510 + DMI_MATCH(DMI_BOARD_VENDOR, "IBM"),
1511 + DMI_MATCH(DMI_BOARD_NAME, "eServer xSeries 360"),
1512 + },
1513 + },
1514 + {
1515 + .callback = force_acpi_ht,
1516 + .ident = "IBM eserver xSeries 330",
1517 + .matches = {
1518 + DMI_MATCH(DMI_BOARD_VENDOR, "IBM"),
1519 + DMI_MATCH(DMI_BOARD_NAME, "eserver xSeries 330"),
1520 + },
1521 + },
1522 + {
1523 + .callback = force_acpi_ht,
1524 + .ident = "IBM eserver xSeries 440",
1525 + .matches = {
1526 + DMI_MATCH(DMI_BOARD_VENDOR, "IBM"),
1527 + DMI_MATCH(DMI_PRODUCT_NAME, "eserver xSeries 440"),
1528 + },
1529 + },
1530 +
1531 + /*
1532 + * Boxes that need ACPI PCI IRQ routing disabled
1533 + */
1534 + {
1535 + .callback = disable_acpi_irq,
1536 + .ident = "ASUS A7V",
1537 + .matches = {
1538 + DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC"),
1539 + DMI_MATCH(DMI_BOARD_NAME, "<A7V>"),
1540 + /* newer BIOS, Revision 1011, does work */
1541 + DMI_MATCH(DMI_BIOS_VERSION,
1542 + "ASUS A7V ACPI BIOS Revision 1007"),
1543 + },
1544 + },
1545 +
1546 + /*
1547 + * Boxes that need ACPI PCI IRQ routing and PCI scan disabled
1548 + */
1549 + { /* _BBN 0 bug */
1550 + .callback = disable_acpi_pci,
1551 + .ident = "ASUS PR-DLS",
1552 + .matches = {
1553 + DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."),
1554 + DMI_MATCH(DMI_BOARD_NAME, "PR-DLS"),
1555 + DMI_MATCH(DMI_BIOS_VERSION,
1556 + "ASUS PR-DLS ACPI BIOS Revision 1010"),
1557 + DMI_MATCH(DMI_BIOS_DATE, "03/21/2003")
1558 + },
1559 + },
1560 + {
1561 + .callback = disable_acpi_pci,
1562 + .ident = "Acer TravelMate 36x Laptop",
1563 + .matches = {
1564 + DMI_MATCH(DMI_SYS_VENDOR, "Acer"),
1565 + DMI_MATCH(DMI_PRODUCT_NAME, "TravelMate 360"),
1566 + },
1567 + },
1568 + {}
1569 +};
1570 +
1571 +#endif /* __i386__ */
1572 +
1573 +/*
1574 + * acpi_boot_table_init() and acpi_boot_init()
1575 + * called from setup_arch(), always.
1576 + * 1. checksums all tables
1577 + * 2. enumerates lapics
1578 + * 3. enumerates io-apics
1579 + *
1580 + * acpi_table_init() is separate to allow reading SRAT without
1581 + * other side effects.
1582 + *
1583 + * side effects of acpi_boot_init:
1584 + * acpi_lapic = 1 if LAPIC found
1585 + * acpi_ioapic = 1 if IOAPIC found
1586 + * if (acpi_lapic && acpi_ioapic) smp_found_config = 1;
1587 + * if acpi_blacklisted() acpi_disabled = 1;
1588 + * acpi_irq_model=...
1589 + * ...
1590 + *
1591 + * return value: (currently ignored)
1592 + * 0: success
1593 + * !0: failure
1594 + */
1595 +
1596 +int __init acpi_boot_table_init(void)
1597 +{
1598 + int error;
1599 +
1600 +#ifdef __i386__
1601 + dmi_check_system(acpi_dmi_table);
1602 +#endif
1603 +
1604 + /*
1605 + * If acpi_disabled, bail out
1606 + * One exception: acpi=ht continues far enough to enumerate LAPICs
1607 + */
1608 + if (acpi_disabled && !acpi_ht)
1609 + return 1;
1610 +
1611 + /*
1612 + * Initialize the ACPI boot-time table parser.
1613 + */
1614 + error = acpi_table_init();
1615 + if (error) {
1616 + disable_acpi();
1617 + return error;
1618 + }
1619 +
1620 + acpi_table_parse(ACPI_BOOT, acpi_parse_sbf);
1621 +
1622 + /*
1623 + * blacklist may disable ACPI entirely
1624 + */
1625 + error = acpi_blacklisted();
1626 + if (error) {
1627 + if (acpi_force) {
1628 + printk(KERN_WARNING PREFIX "acpi=force override\n");
1629 + } else {
1630 + printk(KERN_WARNING PREFIX "Disabling ACPI support\n");
1631 + disable_acpi();
1632 + return error;
1633 + }
1634 + }
1635 +
1636 + return 0;
1637 +}
1638 +
1639 +int __init acpi_boot_init(void)
1640 +{
1641 + /*
1642 + * If acpi_disabled, bail out
1643 + * One exception: acpi=ht continues far enough to enumerate LAPICs
1644 + */
1645 + if (acpi_disabled && !acpi_ht)
1646 + return 1;
1647 +
1648 + acpi_table_parse(ACPI_BOOT, acpi_parse_sbf);
1649 +
1650 + /*
1651 + * set sci_int and PM timer address
1652 + */
1653 + acpi_table_parse(ACPI_FADT, acpi_parse_fadt);
1654 +
1655 + /*
1656 + * Process the Multiple APIC Description Table (MADT), if present
1657 + */
1658 + acpi_process_madt();
1659 +
1660 + acpi_table_parse(ACPI_HPET, acpi_parse_hpet);
1661 +
1662 + return 0;
1663 +}
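
A hedged usage sketch for acpi_register_gsi() as defined above: how a 2.6.16-era driver might map an ACPI GSI to a Linux IRQ and then claim it with request_irq(). The mydrv_* names and the example GSI are illustrative assumptions, not part of this patch; SA_SHIRQ is the shared-IRQ flag of this kernel generation.

#include <linux/interrupt.h>
#include <linux/acpi.h>

/* Illustrative only: acpi_register_gsi() returns a Linux IRQ number (>= 0)
 * or a negative error, which the caller then hands to request_irq(). */
static int mydrv_hook_gsi(u32 gsi, void *mydrv_dev,
                          irqreturn_t (*mydrv_isr)(int, void *, struct pt_regs *))
{
        int irq = acpi_register_gsi(gsi, ACPI_LEVEL_SENSITIVE, ACPI_ACTIVE_LOW);

        if (irq < 0)
                return irq;             /* no route for this GSI */

        return request_irq(irq, mydrv_isr, SA_SHIRQ, "mydrv", mydrv_dev);
}
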
1664 diff -Nur linux-2.6.16.33-noxen/arch/i386/kernel/apic-xen.c linux-2.6.16.33/arch/i386/kernel/apic-xen.c
1665 --- linux-2.6.16.33-noxen/arch/i386/kernel/apic-xen.c 1970-01-01 00:00:00.000000000 +0000
1666 +++ linux-2.6.16.33/arch/i386/kernel/apic-xen.c 2007-01-08 15:00:45.000000000 +0000
1667 @@ -0,0 +1,140 @@
1668 +/*
1669 + * Local APIC handling, local APIC timers
1670 + *
1671 + * (c) 1999, 2000 Ingo Molnar <mingo@redhat.com>
1672 + *
1673 + * Fixes
1674 + * Maciej W. Rozycki : Bits for genuine 82489DX APICs;
1675 + * thanks to Eric Gilmore
1676 + * and Rolf G. Tews
1677 + * for testing these extensively.
1678 + * Maciej W. Rozycki : Various updates and fixes.
1679 + * Mikael Pettersson : Power Management for UP-APIC.
1680 + * Pavel Machek and
1681 + * Mikael Pettersson : PM converted to driver model.
1682 + */
1683 +
1684 +#include <linux/config.h>
1685 +#include <linux/init.h>
1686 +
1687 +#include <linux/mm.h>
1688 +#include <linux/delay.h>
1689 +#include <linux/bootmem.h>
1690 +#include <linux/smp_lock.h>
1691 +#include <linux/interrupt.h>
1692 +#include <linux/mc146818rtc.h>
1693 +#include <linux/kernel_stat.h>
1694 +#include <linux/sysdev.h>
1695 +#include <linux/cpu.h>
1696 +#include <linux/module.h>
1697 +
1698 +#include <asm/atomic.h>
1699 +#include <asm/smp.h>
1700 +#include <asm/mtrr.h>
1701 +#include <asm/mpspec.h>
1702 +#include <asm/desc.h>
1703 +#include <asm/arch_hooks.h>
1704 +#include <asm/hpet.h>
1705 +#include <asm/i8253.h>
1706 +
1707 +#include <mach_apic.h>
1708 +#include <mach_ipi.h>
1709 +
1710 +#include "io_ports.h"
1711 +
1712 +#ifndef CONFIG_XEN
1713 +/*
1714 + * cpu_mask that denotes the CPUs that needs timer interrupt coming in as
1715 + * IPIs in place of local APIC timers
1716 + */
1717 +static cpumask_t timer_bcast_ipi;
1718 +#endif
1719 +
1720 +/*
1721 + * Knob to control our willingness to enable the local APIC.
1722 + */
1723 +int enable_local_apic __initdata = 0; /* -1=force-disable, +1=force-enable */
1724 +
1725 +/*
1726 + * Debug level
1727 + */
1728 +int apic_verbosity;
1729 +
1730 +/*
1731 + * 'what should we do if we get a hw irq event on an illegal vector'.
1732 + * each architecture has to answer this themselves.
1733 + */
1734 +void ack_bad_irq(unsigned int irq)
1735 +{
1736 + printk("unexpected IRQ trap at vector %02x\n", irq);
1737 + /*
1738 + * Currently unexpected vectors happen only on SMP and APIC.
1739 + * We _must_ ack these because every local APIC has only N
1740 + * irq slots per priority level, and a 'hanging, unacked' IRQ
1741 + * holds up an irq slot - in excessive cases (when multiple
1742 + * unexpected vectors occur) that might lock up the APIC
1743 + * completely.
1744 + * But only ack when the APIC is enabled -AK
1745 + */
1746 + if (cpu_has_apic)
1747 + ack_APIC_irq();
1748 +}
1749 +
1750 +int get_physical_broadcast(void)
1751 +{
1752 + return 0xff;
1753 +}
1754 +
1755 +#ifndef CONFIG_XEN
1756 +#ifndef CONFIG_SMP
1757 +static void up_apic_timer_interrupt_call(struct pt_regs *regs)
1758 +{
1759 + int cpu = smp_processor_id();
1760 +
1761 + /*
1762 + * the NMI deadlock-detector uses this.
1763 + */
1764 + per_cpu(irq_stat, cpu).apic_timer_irqs++;
1765 +
1766 + smp_local_timer_interrupt(regs);
1767 +}
1768 +#endif
1769 +
1770 +void smp_send_timer_broadcast_ipi(struct pt_regs *regs)
1771 +{
1772 + cpumask_t mask;
1773 +
1774 + cpus_and(mask, cpu_online_map, timer_bcast_ipi);
1775 + if (!cpus_empty(mask)) {
1776 +#ifdef CONFIG_SMP
1777 + send_IPI_mask(mask, LOCAL_TIMER_VECTOR);
1778 +#else
1779 + /*
1780 + * We can directly call the apic timer interrupt handler
1781 + * in UP case. Minus all irq related functions
1782 + */
1783 + up_apic_timer_interrupt_call(regs);
1784 +#endif
1785 + }
1786 +}
1787 +#endif
1788 +
1789 +int setup_profiling_timer(unsigned int multiplier)
1790 +{
1791 + return -EINVAL;
1792 +}
1793 +
1794 +/*
1795 + * This initializes the IO-APIC and APIC hardware if this is
1796 + * a UP kernel.
1797 + */
1798 +int __init APIC_init_uniprocessor (void)
1799 +{
1800 +#ifdef CONFIG_X86_IO_APIC
1801 + if (smp_found_config)
1802 + if (!skip_ioapic_setup && nr_ioapics)
1803 + setup_IO_APIC();
1804 +#endif
1805 +
1806 + return 0;
1807 +}
1808 diff -Nur linux-2.6.16.33-noxen/arch/i386/kernel/asm-offsets.c linux-2.6.16.33/arch/i386/kernel/asm-offsets.c
1809 --- linux-2.6.16.33-noxen/arch/i386/kernel/asm-offsets.c 2006-11-22 18:06:31.000000000 +0000
1810 +++ linux-2.6.16.33/arch/i386/kernel/asm-offsets.c 2007-01-08 15:00:45.000000000 +0000
1811 @@ -13,6 +13,7 @@
1812 #include <asm/fixmap.h>
1813 #include <asm/processor.h>
1814 #include <asm/thread_info.h>
1815 +#include <asm/elf.h>
1816
1817 #define DEFINE(sym, val) \
1818 asm volatile("\n->" #sym " %0 " #val : : "i" (val))
1819 @@ -63,10 +64,15 @@
1820 OFFSET(pbe_orig_address, pbe, orig_address);
1821 OFFSET(pbe_next, pbe, next);
1822
1823 +#ifndef CONFIG_X86_NO_TSS
1824 /* Offset from the sysenter stack to tss.esp0 */
1825 - DEFINE(TSS_sysenter_esp0, offsetof(struct tss_struct, esp0) -
1826 + DEFINE(SYSENTER_stack_esp0, offsetof(struct tss_struct, esp0) -
1827 sizeof(struct tss_struct));
1828 +#else
1829 + /* sysenter stack points directly to esp0 */
1830 + DEFINE(SYSENTER_stack_esp0, 0);
1831 +#endif
1832
1833 DEFINE(PAGE_SIZE_asm, PAGE_SIZE);
1834 - DEFINE(VSYSCALL_BASE, __fix_to_virt(FIX_VSYSCALL));
1835 + DEFINE(VSYSCALL_BASE, VSYSCALL_BASE);
1836 }
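
For context on the asm-offsets.c hunk above: kbuild compiles this file to assembly, scans the "->" markers emitted by DEFINE()/OFFSET(), and writes plain constants into asm-offsets.h for use from entry code. A hedged sketch of the relevant generated lines follows; the values are placeholders, since the real numbers depend on the build and configuration.

/* asm-offsets.h (illustrative sketch only; values are placeholders) */
#define SYSENTER_stack_esp0 0           /* 0 under CONFIG_X86_NO_TSS; on native
                                           kernels a negative offset from the
                                           end of struct tss_struct to esp0 */
#define PAGE_SIZE_asm 4096
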
1837 diff -Nur linux-2.6.16.33-noxen/arch/i386/kernel/cpu/Makefile linux-2.6.16.33/arch/i386/kernel/cpu/Makefile
1838 --- linux-2.6.16.33-noxen/arch/i386/kernel/cpu/Makefile 2006-11-22 18:06:31.000000000 +0000
1839 +++ linux-2.6.16.33/arch/i386/kernel/cpu/Makefile 2007-01-08 15:00:45.000000000 +0000
1840 @@ -17,3 +17,8 @@
1841
1842 obj-$(CONFIG_MTRR) += mtrr/
1843 obj-$(CONFIG_CPU_FREQ) += cpufreq/
1844 +
1845 +ifdef CONFIG_XEN
1846 +include $(srctree)/scripts/Makefile.xen
1847 +obj-y := $(call cherrypickxen, $(obj-y), $(src))
1848 +endif
1849 diff -Nur linux-2.6.16.33-noxen/arch/i386/kernel/cpu/common-xen.c linux-2.6.16.33/arch/i386/kernel/cpu/common-xen.c
1850 --- linux-2.6.16.33-noxen/arch/i386/kernel/cpu/common-xen.c 1970-01-01 00:00:00.000000000 +0000
1851 +++ linux-2.6.16.33/arch/i386/kernel/cpu/common-xen.c 2007-01-08 15:00:45.000000000 +0000
1852 @@ -0,0 +1,715 @@
1853 +#include <linux/init.h>
1854 +#include <linux/string.h>
1855 +#include <linux/delay.h>
1856 +#include <linux/smp.h>
1857 +#include <linux/module.h>
1858 +#include <linux/percpu.h>
1859 +#include <linux/bootmem.h>
1860 +#include <asm/semaphore.h>
1861 +#include <asm/processor.h>
1862 +#include <asm/i387.h>
1863 +#include <asm/msr.h>
1864 +#include <asm/io.h>
1865 +#include <asm/mmu_context.h>
1866 +#ifdef CONFIG_X86_LOCAL_APIC
1867 +#include <asm/mpspec.h>
1868 +#include <asm/apic.h>
1869 +#include <mach_apic.h>
1870 +#endif
1871 +#include <asm/hypervisor.h>
1872 +
1873 +#include "cpu.h"
1874 +
1875 +DEFINE_PER_CPU(struct Xgt_desc_struct, cpu_gdt_descr);
1876 +EXPORT_PER_CPU_SYMBOL(cpu_gdt_descr);
1877 +
1878 +#ifndef CONFIG_XEN
1879 +DEFINE_PER_CPU(unsigned char, cpu_16bit_stack[CPU_16BIT_STACK_SIZE]);
1880 +EXPORT_PER_CPU_SYMBOL(cpu_16bit_stack);
1881 +#endif
1882 +
1883 +static int cachesize_override __devinitdata = -1;
1884 +static int disable_x86_fxsr __devinitdata = 0;
1885 +static int disable_x86_serial_nr __devinitdata = 1;
1886 +
1887 +struct cpu_dev * cpu_devs[X86_VENDOR_NUM] = {};
1888 +
1889 +extern int disable_pse;
1890 +
1891 +static void default_init(struct cpuinfo_x86 * c)
1892 +{
1893 + /* Not much we can do here... */
1894 + /* Check if at least it has cpuid */
1895 + if (c->cpuid_level == -1) {
1896 + /* No cpuid. It must be an ancient CPU */
1897 + if (c->x86 == 4)
1898 + strcpy(c->x86_model_id, "486");
1899 + else if (c->x86 == 3)
1900 + strcpy(c->x86_model_id, "386");
1901 + }
1902 +}
1903 +
1904 +static struct cpu_dev default_cpu = {
1905 + .c_init = default_init,
1906 + .c_vendor = "Unknown",
1907 +};
1908 +static struct cpu_dev * this_cpu = &default_cpu;
1909 +
1910 +static int __init cachesize_setup(char *str)
1911 +{
1912 + get_option (&str, &cachesize_override);
1913 + return 1;
1914 +}
1915 +__setup("cachesize=", cachesize_setup);
1916 +
1917 +int __devinit get_model_name(struct cpuinfo_x86 *c)
1918 +{
1919 + unsigned int *v;
1920 + char *p, *q;
1921 +
1922 + if (cpuid_eax(0x80000000) < 0x80000004)
1923 + return 0;
1924 +
1925 + v = (unsigned int *) c->x86_model_id;
1926 + cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]);
1927 + cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]);
1928 + cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]);
1929 + c->x86_model_id[48] = 0;
1930 +
1931 + /* Intel chips right-justify this string for some dumb reason;
1932 + undo that brain damage */
1933 + p = q = &c->x86_model_id[0];
1934 + while ( *p == ' ' )
1935 + p++;
1936 + if ( p != q ) {
1937 + while ( *p )
1938 + *q++ = *p++;
1939 + while ( q <= &c->x86_model_id[48] )
1940 + *q++ = '\0'; /* Zero-pad the rest */
1941 + }
1942 +
1943 + return 1;
1944 +}
1945 +
1946 +
1947 +void __devinit display_cacheinfo(struct cpuinfo_x86 *c)
1948 +{
1949 + unsigned int n, dummy, ecx, edx, l2size;
1950 +
1951 + n = cpuid_eax(0x80000000);
1952 +
1953 + if (n >= 0x80000005) {
1954 + cpuid(0x80000005, &dummy, &dummy, &ecx, &edx);
1955 + printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n",
1956 + edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
1957 + c->x86_cache_size=(ecx>>24)+(edx>>24);
1958 + }
1959 +
1960 + if (n < 0x80000006) /* Some chips just have a large L1. */
1961 + return;
1962 +
1963 + ecx = cpuid_ecx(0x80000006);
1964 + l2size = ecx >> 16;
1965 +
1966 + /* do processor-specific cache resizing */
1967 + if (this_cpu->c_size_cache)
1968 + l2size = this_cpu->c_size_cache(c,l2size);
1969 +
1970 + /* Allow user to override all this if necessary. */
1971 + if (cachesize_override != -1)
1972 + l2size = cachesize_override;
1973 +
1974 + if ( l2size == 0 )
1975 + return; /* Again, no L2 cache is possible */
1976 +
1977 + c->x86_cache_size = l2size;
1978 +
1979 + printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n",
1980 + l2size, ecx & 0xFF);
1981 +}
1982 +
1983 +/* Naming convention should be: <Name> [(<Codename>)] */
1984 +/* This table is only used if init_<vendor>() below doesn't set the model name; */
1985 +/* in particular, if CPUID levels 0x80000002..4 are supported, this isn't used */
1986 +
1987 +/* Look up CPU names by table lookup. */
1988 +static char __devinit *table_lookup_model(struct cpuinfo_x86 *c)
1989 +{
1990 + struct cpu_model_info *info;
1991 +
1992 + if ( c->x86_model >= 16 )
1993 + return NULL; /* Range check */
1994 +
1995 + if (!this_cpu)
1996 + return NULL;
1997 +
1998 + info = this_cpu->c_models;
1999 +
2000 + while (info && info->family) {
2001 + if (info->family == c->x86)
2002 + return info->model_names[c->x86_model];
2003 + info++;
2004 + }
2005 + return NULL; /* Not found */
2006 +}
2007 +
2008 +
2009 +static void __devinit get_cpu_vendor(struct cpuinfo_x86 *c, int early)
2010 +{
2011 + char *v = c->x86_vendor_id;
2012 + int i;
2013 + static int printed;
2014 +
2015 + for (i = 0; i < X86_VENDOR_NUM; i++) {
2016 + if (cpu_devs[i]) {
2017 + if (!strcmp(v,cpu_devs[i]->c_ident[0]) ||
2018 + (cpu_devs[i]->c_ident[1] &&
2019 + !strcmp(v,cpu_devs[i]->c_ident[1]))) {
2020 + c->x86_vendor = i;
2021 + if (!early)
2022 + this_cpu = cpu_devs[i];
2023 + return;
2024 + }
2025 + }
2026 + }
2027 + if (!printed) {
2028 + printed++;
2029 + printk(KERN_ERR "CPU: Vendor unknown, using generic init.\n");
2030 + printk(KERN_ERR "CPU: Your system may be unstable.\n");
2031 + }
2032 + c->x86_vendor = X86_VENDOR_UNKNOWN;
2033 + this_cpu = &default_cpu;
2034 +}
2035 +
2036 +
2037 +static int __init x86_fxsr_setup(char * s)
2038 +{
2039 + disable_x86_fxsr = 1;
2040 + return 1;
2041 +}
2042 +__setup("nofxsr", x86_fxsr_setup);
2043 +
2044 +
2045 +/* Standard macro to see if a specific flag is changeable */
2046 +static inline int flag_is_changeable_p(u32 flag)
2047 +{
2048 + u32 f1, f2;
2049 +
2050 + asm("pushfl\n\t"
2051 + "pushfl\n\t"
2052 + "popl %0\n\t"
2053 + "movl %0,%1\n\t"
2054 + "xorl %2,%0\n\t"
2055 + "pushl %0\n\t"
2056 + "popfl\n\t"
2057 + "pushfl\n\t"
2058 + "popl %0\n\t"
2059 + "popfl\n\t"
2060 + : "=&r" (f1), "=&r" (f2)
2061 + : "ir" (flag));
2062 +
2063 + return ((f1^f2) & flag) != 0;
2064 +}
2065 +
2066 +
2067 +/* Probe for the CPUID instruction */
2068 +static int __devinit have_cpuid_p(void)
2069 +{
2070 + return flag_is_changeable_p(X86_EFLAGS_ID);
2071 +}
2072 +
2073 +/* Do minimum CPU detection early.
2074 + Fields really needed: vendor, cpuid_level, family, model, mask, cache alignment.
2075 + The others are not touched to avoid unwanted side effects.
2076 +
2077 + WARNING: this function is only called on the BP. Don't add code here
2078 + that is supposed to run on all CPUs. */
2079 +static void __init early_cpu_detect(void)
2080 +{
2081 + struct cpuinfo_x86 *c = &boot_cpu_data;
2082 +
2083 + c->x86_cache_alignment = 32;
2084 +
2085 + if (!have_cpuid_p())
2086 + return;
2087 +
2088 + /* Get vendor name */
2089 + cpuid(0x00000000, &c->cpuid_level,
2090 + (int *)&c->x86_vendor_id[0],
2091 + (int *)&c->x86_vendor_id[8],
2092 + (int *)&c->x86_vendor_id[4]);
2093 +
2094 + get_cpu_vendor(c, 1);
2095 +
2096 + c->x86 = 4;
2097 + if (c->cpuid_level >= 0x00000001) {
2098 + u32 junk, tfms, cap0, misc;
2099 + cpuid(0x00000001, &tfms, &misc, &junk, &cap0);
2100 + c->x86 = (tfms >> 8) & 15;
2101 + c->x86_model = (tfms >> 4) & 15;
2102 + if (c->x86 == 0xf)
2103 + c->x86 += (tfms >> 20) & 0xff;
2104 + if (c->x86 >= 0x6)
2105 + c->x86_model += ((tfms >> 16) & 0xF) << 4;
2106 + c->x86_mask = tfms & 15;
2107 + if (cap0 & (1<<19))
2108 + c->x86_cache_alignment = ((misc >> 8) & 0xff) * 8;
2109 + }
2110 +}
2111 +
2112 +void __devinit generic_identify(struct cpuinfo_x86 * c)
2113 +{
2114 + u32 tfms, xlvl;
2115 + int junk;
2116 +
2117 + if (have_cpuid_p()) {
2118 + /* Get vendor name */
2119 + cpuid(0x00000000, &c->cpuid_level,
2120 + (int *)&c->x86_vendor_id[0],
2121 + (int *)&c->x86_vendor_id[8],
2122 + (int *)&c->x86_vendor_id[4]);
2123 +
2124 + get_cpu_vendor(c, 0);
2125 + /* Initialize the standard set of capabilities */
2126 + /* Note that the vendor-specific code below might override */
2127 +
2128 + /* Intel-defined flags: level 0x00000001 */
2129 + if ( c->cpuid_level >= 0x00000001 ) {
2130 + u32 capability, excap;
2131 + cpuid(0x00000001, &tfms, &junk, &excap, &capability);
2132 + c->x86_capability[0] = capability;
2133 + c->x86_capability[4] = excap;
2134 + c->x86 = (tfms >> 8) & 15;
2135 + c->x86_model = (tfms >> 4) & 15;
2136 + if (c->x86 == 0xf)
2137 + c->x86 += (tfms >> 20) & 0xff;
2138 + if (c->x86 >= 0x6)
2139 + c->x86_model += ((tfms >> 16) & 0xF) << 4;
2140 + c->x86_mask = tfms & 15;
2141 + } else {
2142 + /* Have CPUID level 0 only - unheard of */
2143 + c->x86 = 4;
2144 + }
2145 +
2146 + /* AMD-defined flags: level 0x80000001 */
2147 + xlvl = cpuid_eax(0x80000000);
2148 + if ( (xlvl & 0xffff0000) == 0x80000000 ) {
2149 + if ( xlvl >= 0x80000001 ) {
2150 + c->x86_capability[1] = cpuid_edx(0x80000001);
2151 + c->x86_capability[6] = cpuid_ecx(0x80000001);
2152 + }
2153 + if ( xlvl >= 0x80000004 )
2154 + get_model_name(c); /* Default name */
2155 + }
2156 + }
2157 +
2158 + early_intel_workaround(c);
2159 +
2160 +#ifdef CONFIG_X86_HT
2161 + phys_proc_id[smp_processor_id()] = (cpuid_ebx(1) >> 24) & 0xff;
2162 +#endif
2163 +}
2164 +
2165 +static void __devinit squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
2166 +{
2167 + if (cpu_has(c, X86_FEATURE_PN) && disable_x86_serial_nr ) {
2168 + /* Disable processor serial number */
2169 + unsigned long lo,hi;
2170 + rdmsr(MSR_IA32_BBL_CR_CTL,lo,hi);
2171 + lo |= 0x200000;
2172 + wrmsr(MSR_IA32_BBL_CR_CTL,lo,hi);
2173 + printk(KERN_NOTICE "CPU serial number disabled.\n");
2174 + clear_bit(X86_FEATURE_PN, c->x86_capability);
2175 +
2176 + /* Disabling the serial number may affect the cpuid level */
2177 + c->cpuid_level = cpuid_eax(0);
2178 + }
2179 +}
2180 +
2181 +static int __init x86_serial_nr_setup(char *s)
2182 +{
2183 + disable_x86_serial_nr = 0;
2184 + return 1;
2185 +}
2186 +__setup("serialnumber", x86_serial_nr_setup);
2187 +
2188 +
2189 +
2190 +/*
2191 + * This does the hard work of actually picking apart the CPU stuff...
2192 + */
2193 +void __devinit identify_cpu(struct cpuinfo_x86 *c)
2194 +{
2195 + int i;
2196 +
2197 + c->loops_per_jiffy = loops_per_jiffy;
2198 + c->x86_cache_size = -1;
2199 + c->x86_vendor = X86_VENDOR_UNKNOWN;
2200 + c->cpuid_level = -1; /* CPUID not detected */
2201 + c->x86_model = c->x86_mask = 0; /* So far unknown... */
2202 + c->x86_vendor_id[0] = '\0'; /* Unset */
2203 + c->x86_model_id[0] = '\0'; /* Unset */
2204 + c->x86_max_cores = 1;
2205 + memset(&c->x86_capability, 0, sizeof c->x86_capability);
2206 +
2207 + if (!have_cpuid_p()) {
2208 + /* First of all, decide if this is a 486 or higher */
2209 + /* It's a 486 if we can modify the AC flag */
2210 + if ( flag_is_changeable_p(X86_EFLAGS_AC) )
2211 + c->x86 = 4;
2212 + else
2213 + c->x86 = 3;
2214 + }
2215 +
2216 + generic_identify(c);
2217 +
2218 + printk(KERN_DEBUG "CPU: After generic identify, caps:");
2219 + for (i = 0; i < NCAPINTS; i++)
2220 + printk(" %08lx", c->x86_capability[i]);
2221 + printk("\n");
2222 +
2223 + if (this_cpu->c_identify) {
2224 + this_cpu->c_identify(c);
2225 +
2226 + printk(KERN_DEBUG "CPU: After vendor identify, caps:");
2227 + for (i = 0; i < NCAPINTS; i++)
2228 + printk(" %08lx", c->x86_capability[i]);
2229 + printk("\n");
2230 + }
2231 +
2232 + /*
2233 + * Vendor-specific initialization. In this section we
2234 + * canonicalize the feature flags, meaning if there are
2235 + * features a certain CPU supports which CPUID doesn't
2236 + * tell us, CPUID claiming incorrect flags, or other bugs,
2237 + * we handle them here.
2238 + *
2239 + * At the end of this section, c->x86_capability better
2240 + * indicate the features this CPU genuinely supports!
2241 + */
2242 + if (this_cpu->c_init)
2243 + this_cpu->c_init(c);
2244 +
2245 + /* Disable the PN if appropriate */
2246 + squash_the_stupid_serial_number(c);
2247 +
2248 + /*
2249 + * The vendor-specific functions might have changed features. Now
2250 + * we do "generic changes."
2251 + */
2252 +
2253 + /* TSC disabled? */
2254 + if ( tsc_disable )
2255 + clear_bit(X86_FEATURE_TSC, c->x86_capability);
2256 +
2257 + /* FXSR disabled? */
2258 + if (disable_x86_fxsr) {
2259 + clear_bit(X86_FEATURE_FXSR, c->x86_capability);
2260 + clear_bit(X86_FEATURE_XMM, c->x86_capability);
2261 + }
2262 +
2263 + if (disable_pse)
2264 + clear_bit(X86_FEATURE_PSE, c->x86_capability);
2265 +
2266 + /* If the model name is still unset, do table lookup. */
2267 + if ( !c->x86_model_id[0] ) {
2268 + char *p;
2269 + p = table_lookup_model(c);
2270 + if ( p )
2271 + strcpy(c->x86_model_id, p);
2272 + else
2273 + /* Last resort... */
2274 + sprintf(c->x86_model_id, "%02x/%02x",
2275 + c->x86_vendor, c->x86_model);
2276 + }
2277 +
2278 + /* Now the feature flags better reflect actual CPU features! */
2279 +
2280 + printk(KERN_DEBUG "CPU: After all inits, caps:");
2281 + for (i = 0; i < NCAPINTS; i++)
2282 + printk(" %08lx", c->x86_capability[i]);
2283 + printk("\n");
2284 +
2285 + /*
2286 + * On SMP, boot_cpu_data holds the common feature set between
2287 + * all CPUs; so make sure that we indicate which features are
2288 + * common between the CPUs. The first time this routine gets
2289 + * executed, c == &boot_cpu_data.
2290 + */
2291 + if ( c != &boot_cpu_data ) {
2292 + /* AND the already accumulated flags with these */
2293 + for ( i = 0 ; i < NCAPINTS ; i++ )
2294 + boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
2295 + }
2296 +
2297 + /* Init Machine Check Exception if available. */
2298 + mcheck_init(c);
2299 +
2300 + if (c == &boot_cpu_data)
2301 + sysenter_setup();
2302 + enable_sep_cpu();
2303 +
2304 + if (c == &boot_cpu_data)
2305 + mtrr_bp_init();
2306 + else
2307 + mtrr_ap_init();
2308 +}
2309 +
2310 +#ifdef CONFIG_X86_HT
2311 +void __devinit detect_ht(struct cpuinfo_x86 *c)
2312 +{
2313 + u32 eax, ebx, ecx, edx;
2314 + int index_msb, core_bits;
2315 + int cpu = smp_processor_id();
2316 +
2317 + cpuid(1, &eax, &ebx, &ecx, &edx);
2318 +
2319 + c->apicid = phys_pkg_id((ebx >> 24) & 0xFF, 0);
2320 +
2321 + if (!cpu_has(c, X86_FEATURE_HT) || cpu_has(c, X86_FEATURE_CMP_LEGACY))
2322 + return;
2323 +
2324 + smp_num_siblings = (ebx & 0xff0000) >> 16;
2325 +
2326 + if (smp_num_siblings == 1) {
2327 + printk(KERN_INFO "CPU: Hyper-Threading is disabled\n");
2328 + } else if (smp_num_siblings > 1 ) {
2329 +
2330 + if (smp_num_siblings > NR_CPUS) {
2331 + printk(KERN_WARNING "CPU: Unsupported number of the siblings %d", smp_num_siblings);
2332 + smp_num_siblings = 1;
2333 + return;
2334 + }
2335 +
2336 + index_msb = get_count_order(smp_num_siblings);
2337 + phys_proc_id[cpu] = phys_pkg_id((ebx >> 24) & 0xFF, index_msb);
2338 +
2339 + printk(KERN_INFO "CPU: Physical Processor ID: %d\n",
2340 + phys_proc_id[cpu]);
2341 +
2342 + smp_num_siblings = smp_num_siblings / c->x86_max_cores;
2343 +
2344 + index_msb = get_count_order(smp_num_siblings) ;
2345 +
2346 + core_bits = get_count_order(c->x86_max_cores);
2347 +
2348 + cpu_core_id[cpu] = phys_pkg_id((ebx >> 24) & 0xFF, index_msb) &
2349 + ((1 << core_bits) - 1);
2350 +
2351 + if (c->x86_max_cores > 1)
2352 + printk(KERN_INFO "CPU: Processor Core ID: %d\n",
2353 + cpu_core_id[cpu]);
2354 + }
2355 +}
2356 +#endif
2357 +
2358 +void __devinit print_cpu_info(struct cpuinfo_x86 *c)
2359 +{
2360 + char *vendor = NULL;
2361 +
2362 + if (c->x86_vendor < X86_VENDOR_NUM)
2363 + vendor = this_cpu->c_vendor;
2364 + else if (c->cpuid_level >= 0)
2365 + vendor = c->x86_vendor_id;
2366 +
2367 + if (vendor && strncmp(c->x86_model_id, vendor, strlen(vendor)))
2368 + printk("%s ", vendor);
2369 +
2370 + if (!c->x86_model_id[0])
2371 + printk("%d86", c->x86);
2372 + else
2373 + printk("%s", c->x86_model_id);
2374 +
2375 + if (c->x86_mask || c->cpuid_level >= 0)
2376 + printk(" stepping %02x\n", c->x86_mask);
2377 + else
2378 + printk("\n");
2379 +}
2380 +
2381 +cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
2382 +
2383 +/* This is hacky. :)
2384 + * We're emulating future behavior.
2385 + * In the future, the cpu-specific init functions will be called implicitly
2386 + * via the magic of initcalls.
2387 + * They will insert themselves into the cpu_devs structure.
2388 + * Then, when cpu_init() is called, we can just iterate over that array.
2389 + */
2390 +
2391 +extern int intel_cpu_init(void);
2392 +extern int cyrix_init_cpu(void);
2393 +extern int nsc_init_cpu(void);
2394 +extern int amd_init_cpu(void);
2395 +extern int centaur_init_cpu(void);
2396 +extern int transmeta_init_cpu(void);
2397 +extern int rise_init_cpu(void);
2398 +extern int nexgen_init_cpu(void);
2399 +extern int umc_init_cpu(void);
2400 +
2401 +void __init early_cpu_init(void)
2402 +{
2403 + intel_cpu_init();
2404 + cyrix_init_cpu();
2405 + nsc_init_cpu();
2406 + amd_init_cpu();
2407 + centaur_init_cpu();
2408 + transmeta_init_cpu();
2409 + rise_init_cpu();
2410 + nexgen_init_cpu();
2411 + umc_init_cpu();
2412 + early_cpu_detect();
2413 +
2414 +#ifdef CONFIG_DEBUG_PAGEALLOC
2415 + /* pse is not compatible with on-the-fly unmapping,
2416 + * disable it even if the cpus claim to support it.
2417 + */
2418 + clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability);
2419 + disable_pse = 1;
2420 +#endif
2421 +}
2422 +
2423 +void __cpuinit cpu_gdt_init(struct Xgt_desc_struct *gdt_descr)
2424 +{
2425 + unsigned long frames[16];
2426 + unsigned long va;
2427 + int f;
2428 +
2429 + for (va = gdt_descr->address, f = 0;
2430 + va < gdt_descr->address + gdt_descr->size;
2431 + va += PAGE_SIZE, f++) {
2432 + frames[f] = virt_to_mfn(va);
2433 + make_lowmem_page_readonly(
2434 + (void *)va, XENFEAT_writable_descriptor_tables);
2435 + }
2436 + if (HYPERVISOR_set_gdt(frames, gdt_descr->size / 8))
2437 + BUG();
2438 +}
2439 +
2440 +/*
2441 + * cpu_init() initializes state that is per-CPU. Some data is already
2442 + * initialized (naturally) in the bootstrap process, such as the GDT
2443 + * and IDT. We reload them nevertheless; this function acts as a
2444 + * 'CPU state barrier', nothing should get across.
2445 + */
2446 +void __cpuinit cpu_init(void)
2447 +{
2448 + int cpu = smp_processor_id();
2449 +#ifndef CONFIG_X86_NO_TSS
2450 + struct tss_struct * t = &per_cpu(init_tss, cpu);
2451 +#endif
2452 + struct thread_struct *thread = &current->thread;
2453 + struct desc_struct *gdt;
2454 + struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
2455 +
2456 + if (cpu_test_and_set(cpu, cpu_initialized)) {
2457 + printk(KERN_WARNING "CPU#%d already initialized!\n", cpu);
2458 + for (;;) local_irq_enable();
2459 + }
2460 + printk(KERN_INFO "Initializing CPU#%d\n", cpu);
2461 +
2462 + if (cpu_has_vme || cpu_has_de)
2463 + clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
2464 + if (tsc_disable && cpu_has_tsc) {
2465 + printk(KERN_NOTICE "Disabling TSC...\n");
2466 + /**** FIX-HPA: DOES THIS REALLY BELONG HERE? ****/
2467 + clear_bit(X86_FEATURE_TSC, boot_cpu_data.x86_capability);
2468 + set_in_cr4(X86_CR4_TSD);
2469 + }
2470 +
2471 +#ifndef CONFIG_XEN
2472 + /*
2473 + * This is a horrible hack to allocate the GDT. The problem
2474 + * is that cpu_init() is called really early for the boot CPU
2475 + * (and hence needs bootmem) but much later for the secondary
2476 + * CPUs, when bootmem will have gone away
2477 + */
2478 + if (NODE_DATA(0)->bdata->node_bootmem_map) {
2479 + gdt = (struct desc_struct *)alloc_bootmem_pages(PAGE_SIZE);
2480 + /* alloc_bootmem_pages panics on failure, so no check */
2481 + memset(gdt, 0, PAGE_SIZE);
2482 + } else {
2483 + gdt = (struct desc_struct *)get_zeroed_page(GFP_KERNEL);
2484 + if (unlikely(!gdt)) {
2485 + printk(KERN_CRIT "CPU%d failed to allocate GDT\n", cpu);
2486 + for (;;)
2487 + local_irq_enable();
2488 + }
2489 + }
2490 +
2491 + /*
2492 + * Initialize the per-CPU GDT with the boot GDT,
2493 + * and set up the GDT descriptor:
2494 + */
2495 + memcpy(gdt, cpu_gdt_table, GDT_SIZE);
2496 +
2497 + /* Set up GDT entry for 16bit stack */
2498 + *(__u64 *)(&gdt[GDT_ENTRY_ESPFIX_SS]) |=
2499 + ((((__u64)stk16_off) << 16) & 0x000000ffffff0000ULL) |
2500 + ((((__u64)stk16_off) << 32) & 0xff00000000000000ULL) |
2501 + (CPU_16BIT_STACK_SIZE - 1);
2502 +
2503 + cpu_gdt_descr->size = GDT_SIZE - 1;
2504 + cpu_gdt_descr->address = (unsigned long)gdt;
2505 +#else
2506 + if (cpu == 0 && cpu_gdt_descr->address == 0) {
2507 + gdt = (struct desc_struct *)alloc_bootmem_pages(PAGE_SIZE);
2508 + /* alloc_bootmem_pages panics on failure, so no check */
2509 + memset(gdt, 0, PAGE_SIZE);
2510 +
2511 + memcpy(gdt, cpu_gdt_table, GDT_SIZE);
2512 +
2513 + cpu_gdt_descr->size = GDT_SIZE;
2514 + cpu_gdt_descr->address = (unsigned long)gdt;
2515 + }
2516 +#endif
2517 +
2518 + cpu_gdt_init(cpu_gdt_descr);
2519 +
2520 + /*
2521 + * Set up and load the per-CPU TSS and LDT
2522 + */
2523 + atomic_inc(&init_mm.mm_count);
2524 + current->active_mm = &init_mm;
2525 + if (current->mm)
2526 + BUG();
2527 + enter_lazy_tlb(&init_mm, current);
2528 +
2529 + load_esp0(t, thread);
2530 +
2531 + load_LDT(&init_mm.context);
2532 +
2533 +#ifdef CONFIG_DOUBLEFAULT
2534 + /* Set up doublefault TSS pointer in the GDT */
2535 + __set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss);
2536 +#endif
2537 +
2538 + /* Clear %fs and %gs. */
2539 + asm volatile ("xorl %eax, %eax; movl %eax, %fs; movl %eax, %gs");
2540 +
2541 + /* Clear all 6 debug registers: */
2542 + set_debugreg(0, 0);
2543 + set_debugreg(0, 1);
2544 + set_debugreg(0, 2);
2545 + set_debugreg(0, 3);
2546 + set_debugreg(0, 6);
2547 + set_debugreg(0, 7);
2548 +
2549 + /*
2550 + * Force FPU initialization:
2551 + */
2552 + current_thread_info()->status = 0;
2553 + clear_used_math();
2554 + mxcsr_feature_mask_init();
2555 +}
2556 +
2557 +#ifdef CONFIG_HOTPLUG_CPU
2558 +void __devinit cpu_uninit(void)
2559 +{
2560 + int cpu = raw_smp_processor_id();
2561 + cpu_clear(cpu, cpu_initialized);
2562 +
2563 + /* lazy TLB state */
2564 + per_cpu(cpu_tlbstate, cpu).state = 0;
2565 + per_cpu(cpu_tlbstate, cpu).active_mm = &init_mm;
2566 +}
2567 +#endif
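
One detail of early_cpu_detect()/generic_identify() above that is easy to misread is the family/model decoding: the extended family and extended model fields of CPUID leaf 1 only contribute when the base family is 0xf (respectively >= 6). A small stand-alone example of the same bit manipulation; the sample signature 0x00000f29 is illustrative test input, not a value taken from the patch:

/* Decode a CPUID leaf-1 EAX value the way the code above does. */
#include <stdio.h>

int main(void)
{
	unsigned int tfms = 0x00000f29;		/* sample Pentium 4 signature */
	unsigned int family = (tfms >> 8) & 15;
	unsigned int model = (tfms >> 4) & 15;
	unsigned int stepping = tfms & 15;

	if (family == 0xf)			/* extended family field */
		family += (tfms >> 20) & 0xff;
	if (family >= 0x6)			/* extended model field */
		model += ((tfms >> 16) & 0xf) << 4;

	printf("family %u, model %u, stepping %u\n", family, model, stepping);
	return 0;				/* prints: family 15, model 2, stepping 9 */
}
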
2568 diff -Nur linux-2.6.16.33-noxen/arch/i386/kernel/cpu/mtrr/Makefile linux-2.6.16.33/arch/i386/kernel/cpu/mtrr/Makefile
2569 --- linux-2.6.16.33-noxen/arch/i386/kernel/cpu/mtrr/Makefile 2006-11-22 18:06:31.000000000 +0000
2570 +++ linux-2.6.16.33/arch/i386/kernel/cpu/mtrr/Makefile 2007-01-08 15:00:45.000000000 +0000
2571 @@ -3,3 +3,10 @@
2572 obj-y += cyrix.o
2573 obj-y += centaur.o
2574
2575 +ifdef CONFIG_XEN
2576 +include $(srctree)/scripts/Makefile.xen
2577 +n-obj-xen := generic.o state.o amd.o cyrix.o centaur.o
2578 +
2579 +obj-y := $(call filterxen, $(obj-y), $(n-obj-xen))
2580 +obj-y := $(call cherrypickxen, $(obj-y))
2581 +endif
2582 diff -Nur linux-2.6.16.33-noxen/arch/i386/kernel/cpu/mtrr/main-xen.c linux-2.6.16.33/arch/i386/kernel/cpu/mtrr/main-xen.c
2583 --- linux-2.6.16.33-noxen/arch/i386/kernel/cpu/mtrr/main-xen.c 1970-01-01 00:00:00.000000000 +0000
2584 +++ linux-2.6.16.33/arch/i386/kernel/cpu/mtrr/main-xen.c 2007-01-08 15:00:45.000000000 +0000
2585 @@ -0,0 +1,196 @@
2586 +#include <linux/init.h>
2587 +#include <linux/proc_fs.h>
2588 +#include <linux/ctype.h>
2589 +#include <linux/module.h>
2590 +#include <linux/seq_file.h>
2591 +#include <asm/uaccess.h>
2592 +
2593 +#include <asm/mtrr.h>
2594 +#include "mtrr.h"
2595 +
2596 +static DECLARE_MUTEX(mtrr_sem);
2597 +
2598 +void generic_get_mtrr(unsigned int reg, unsigned long *base,
2599 + unsigned int *size, mtrr_type * type)
2600 +{
2601 + dom0_op_t op;
2602 +
2603 + op.cmd = DOM0_READ_MEMTYPE;
2604 + op.u.read_memtype.reg = reg;
2605 + (void)HYPERVISOR_dom0_op(&op);
2606 +
2607 + *size = op.u.read_memtype.nr_mfns;
2608 + *base = op.u.read_memtype.mfn;
2609 + *type = op.u.read_memtype.type;
2610 +}
2611 +
2612 +struct mtrr_ops generic_mtrr_ops = {
2613 + .use_intel_if = 1,
2614 + .get = generic_get_mtrr,
2615 +};
2616 +
2617 +struct mtrr_ops *mtrr_if = &generic_mtrr_ops;
2618 +unsigned int num_var_ranges;
2619 +unsigned int *usage_table;
2620 +
2621 +static void __init set_num_var_ranges(void)
2622 +{
2623 + dom0_op_t op;
2624 +
2625 + for (num_var_ranges = 0; ; num_var_ranges++) {
2626 + op.cmd = DOM0_READ_MEMTYPE;
2627 + op.u.read_memtype.reg = num_var_ranges;
2628 + if (HYPERVISOR_dom0_op(&op) != 0)
2629 + break;
2630 + }
2631 +}
2632 +
2633 +static void __init init_table(void)
2634 +{
2635 + int i, max;
2636 +
2637 + max = num_var_ranges;
2638 + if ((usage_table = kmalloc(max * sizeof *usage_table, GFP_KERNEL))
2639 + == NULL) {
2640 + printk(KERN_ERR "mtrr: could not allocate\n");
2641 + return;
2642 + }
2643 + for (i = 0; i < max; i++)
2644 + usage_table[i] = 0;
2645 +}
2646 +
2647 +int mtrr_add_page(unsigned long base, unsigned long size,
2648 + unsigned int type, char increment)
2649 +{
2650 + int error;
2651 + dom0_op_t op;
2652 +
2653 + down(&mtrr_sem);
2654 +
2655 + op.cmd = DOM0_ADD_MEMTYPE;
2656 + op.u.add_memtype.mfn = base;
2657 + op.u.add_memtype.nr_mfns = size;
2658 + op.u.add_memtype.type = type;
2659 + error = HYPERVISOR_dom0_op(&op);
2660 + if (error) {
2661 + up(&mtrr_sem);
2662 + BUG_ON(error > 0);
2663 + return error;
2664 + }
2665 +
2666 + if (increment)
2667 + ++usage_table[op.u.add_memtype.reg];
2668 +
2669 + up(&mtrr_sem);
2670 +
2671 + return op.u.add_memtype.reg;
2672 +}
2673 +
2674 +static int mtrr_check(unsigned long base, unsigned long size)
2675 +{
2676 + if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) {
2677 + printk(KERN_WARNING
2678 + "mtrr: size and base must be multiples of 4 kiB\n");
2679 + printk(KERN_DEBUG
2680 + "mtrr: size: 0x%lx base: 0x%lx\n", size, base);
2681 + dump_stack();
2682 + return -1;
2683 + }
2684 + return 0;
2685 +}
2686 +
2687 +int
2688 +mtrr_add(unsigned long base, unsigned long size, unsigned int type,
2689 + char increment)
2690 +{
2691 + if (mtrr_check(base, size))
2692 + return -EINVAL;
2693 + return mtrr_add_page(base >> PAGE_SHIFT, size >> PAGE_SHIFT, type,
2694 + increment);
2695 +}
2696 +
2697 +int mtrr_del_page(int reg, unsigned long base, unsigned long size)
2698 +{
2699 + unsigned i;
2700 + mtrr_type ltype;
2701 + unsigned long lbase;
2702 + unsigned int lsize;
2703 + int error = -EINVAL;
2704 + dom0_op_t op;
2705 +
2706 + down(&mtrr_sem);
2707 +
2708 + if (reg < 0) {
2709 + /* Search for existing MTRR */
2710 + for (i = 0; i < num_var_ranges; ++i) {
2711 + mtrr_if->get(i, &lbase, &lsize, &ltype);
2712 + if (lbase == base && lsize == size) {
2713 + reg = i;
2714 + break;
2715 + }
2716 + }
2717 + if (reg < 0) {
2718 + printk(KERN_DEBUG "mtrr: no MTRR for %lx000,%lx000 found\n", base,
2719 + size);
2720 + goto out;
2721 + }
2722 + }
2723 + if (usage_table[reg] < 1) {
2724 + printk(KERN_WARNING "mtrr: reg: %d has count=0\n", reg);
2725 + goto out;
2726 + }
2727 + if (--usage_table[reg] < 1) {
2728 + op.cmd = DOM0_DEL_MEMTYPE;
2729 + op.u.del_memtype.handle = 0;
2730 + op.u.del_memtype.reg = reg;
2731 + error = HYPERVISOR_dom0_op(&op);
2732 + if (error) {
2733 + BUG_ON(error > 0);
2734 + goto out;
2735 + }
2736 + }
2737 + error = reg;
2738 + out:
2739 + up(&mtrr_sem);
2740 + return error;
2741 +}
2742 +
2743 +int
2744 +mtrr_del(int reg, unsigned long base, unsigned long size)
2745 +{
2746 + if (mtrr_check(base, size))
2747 + return -EINVAL;
2748 + return mtrr_del_page(reg, base >> PAGE_SHIFT, size >> PAGE_SHIFT);
2749 +}
2750 +
2751 +EXPORT_SYMBOL(mtrr_add);
2752 +EXPORT_SYMBOL(mtrr_del);
2753 +
2754 +void __init mtrr_bp_init(void)
2755 +{
2756 +}
2757 +
2758 +void mtrr_ap_init(void)
2759 +{
2760 +}
2761 +
2762 +static int __init mtrr_init(void)
2763 +{
2764 + struct cpuinfo_x86 *c = &boot_cpu_data;
2765 +
2766 + if (!is_initial_xendomain())
2767 + return -ENODEV;
2768 +
2769 + if ((!cpu_has(c, X86_FEATURE_MTRR)) &&
2770 + (!cpu_has(c, X86_FEATURE_K6_MTRR)) &&
2771 + (!cpu_has(c, X86_FEATURE_CYRIX_ARR)) &&
2772 + (!cpu_has(c, X86_FEATURE_CENTAUR_MCR)))
2773 + return -ENODEV;
2774 +
2775 + set_num_var_ranges();
2776 + init_table();
2777 +
2778 + return 0;
2779 +}
2780 +
2781 +subsys_initcall(mtrr_init);
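
The mtrr/main-xen.c file above keeps the usual mtrr_add()/mtrr_del() kernel interface but, instead of programming MSRs directly, forwards each request to the hypervisor as a DOM0_ADD_MEMTYPE/DOM0_DEL_MEMTYPE dom0 operation (and mtrr_init() bails out unless it is running in the initial xen domain). Callers are unaffected; a hedged sketch of the usual driver-side usage, where fb_base/fb_size stand in for a hypothetical framebuffer aperture and are not values taken from the patch:

#include <linux/kernel.h>
#include <asm/mtrr.h>

static int fb_mtrr = -1;

/* Ask for write-combining on a (page-aligned) framebuffer aperture. */
static void fb_enable_write_combining(unsigned long fb_base,
				      unsigned long fb_size)
{
	/* base and size must be multiples of 4 KiB, see mtrr_check() above */
	fb_mtrr = mtrr_add(fb_base, fb_size, MTRR_TYPE_WRCOMB, 1);
	if (fb_mtrr < 0)
		printk(KERN_INFO "fb: write-combining not enabled (%d)\n",
		       fb_mtrr);
}

static void fb_disable_write_combining(unsigned long fb_base,
				       unsigned long fb_size)
{
	if (fb_mtrr >= 0)
		mtrr_del(fb_mtrr, fb_base, fb_size);
}
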
2782 diff -Nur linux-2.6.16.33-noxen/arch/i386/kernel/crash.c linux-2.6.16.33/arch/i386/kernel/crash.c
2783 --- linux-2.6.16.33-noxen/arch/i386/kernel/crash.c 2006-11-22 18:06:31.000000000 +0000
2784 +++ linux-2.6.16.33/arch/i386/kernel/crash.c 2007-01-08 15:00:45.000000000 +0000
2785 @@ -90,6 +90,7 @@
2786 crash_save_this_cpu(regs, cpu);
2787 }
2788
2789 +#ifndef CONFIG_XEN
2790 #ifdef CONFIG_SMP
2791 static atomic_t waiting_for_crash_ipi;
2792
2793 @@ -158,6 +159,7 @@
2794 /* There are no cpus to shootdown */
2795 }
2796 #endif
2797 +#endif /* CONFIG_XEN */
2798
2799 void machine_crash_shutdown(struct pt_regs *regs)
2800 {
2801 @@ -174,10 +176,12 @@
2802
2803 /* Make a note of crashing cpu. Will be used in NMI callback.*/
2804 crashing_cpu = smp_processor_id();
2805 +#ifndef CONFIG_XEN
2806 nmi_shootdown_cpus();
2807 lapic_shutdown();
2808 #if defined(CONFIG_X86_IO_APIC)
2809 disable_IO_APIC();
2810 #endif
2811 +#endif /* CONFIG_XEN */
2812 crash_save_self(regs);
2813 }
2814 diff -Nur linux-2.6.16.33-noxen/arch/i386/kernel/early_printk-xen.c linux-2.6.16.33/arch/i386/kernel/early_printk-xen.c
2815 --- linux-2.6.16.33-noxen/arch/i386/kernel/early_printk-xen.c 1970-01-01 00:00:00.000000000 +0000
2816 +++ linux-2.6.16.33/arch/i386/kernel/early_printk-xen.c 2007-01-08 15:00:45.000000000 +0000
2817 @@ -0,0 +1,2 @@
2818 +
2819 +#include "../../x86_64/kernel/early_printk-xen.c"
2820 diff -Nur linux-2.6.16.33-noxen/arch/i386/kernel/entry-xen.S linux-2.6.16.33/arch/i386/kernel/entry-xen.S
2821 --- linux-2.6.16.33-noxen/arch/i386/kernel/entry-xen.S 1970-01-01 00:00:00.000000000 +0000
2822 +++ linux-2.6.16.33/arch/i386/kernel/entry-xen.S 2007-01-08 15:00:45.000000000 +0000
2823 @@ -0,0 +1,899 @@
2824 +/*
2825 + * linux/arch/i386/entry.S
2826 + *
2827 + * Copyright (C) 1991, 1992 Linus Torvalds
2828 + */
2829 +
2830 +/*
2831 + * entry.S contains the system-call and fault low-level handling routines.
2832 + * This also contains the timer-interrupt handler, as well as all interrupts
2833 + * and faults that can result in a task-switch.
2834 + *
2835 + * NOTE: This code handles signal-recognition, which happens every time
2836 + * after a timer-interrupt and after each system call.
2837 + *
2838 + * I changed all the .align's to 4 (16 byte alignment), as that's faster
2839 + * on a 486.
2840 + *
2841 + * Stack layout in 'ret_from_system_call':
2842 + * ptrace needs to have all regs on the stack.
2843 + * if the order here is changed, it needs to be
2844 + * updated in fork.c:copy_process, signal.c:do_signal,
2845 + * ptrace.c and ptrace.h
2846 + *
2847 + * 0(%esp) - %ebx
2848 + * 4(%esp) - %ecx
2849 + * 8(%esp) - %edx
2850 + * C(%esp) - %esi
2851 + * 10(%esp) - %edi
2852 + * 14(%esp) - %ebp
2853 + * 18(%esp) - %eax
2854 + * 1C(%esp) - %ds
2855 + * 20(%esp) - %es
2856 + * 24(%esp) - orig_eax
2857 + * 28(%esp) - %eip
2858 + * 2C(%esp) - %cs
2859 + * 30(%esp) - %eflags
2860 + * 34(%esp) - %oldesp
2861 + * 38(%esp) - %oldss
2862 + *
2863 + * "current" is in register %ebx during any slow entries.
2864 + */
2865 +
2866 +#include <linux/config.h>
2867 +#include <linux/linkage.h>
2868 +#include <asm/thread_info.h>
2869 +#include <asm/errno.h>
2870 +#include <asm/segment.h>
2871 +#include <asm/smp.h>
2872 +#include <asm/page.h>
2873 +#include <asm/desc.h>
2874 +#include "irq_vectors.h"
2875 +#include <xen/interface/xen.h>
2876 +
2877 +#define nr_syscalls ((syscall_table_size)/4)
2878 +
2879 +EBX = 0x00
2880 +ECX = 0x04
2881 +EDX = 0x08
2882 +ESI = 0x0C
2883 +EDI = 0x10
2884 +EBP = 0x14
2885 +EAX = 0x18
2886 +DS = 0x1C
2887 +ES = 0x20
2888 +ORIG_EAX = 0x24
2889 +EIP = 0x28
2890 +CS = 0x2C
2891 +EFLAGS = 0x30
2892 +OLDESP = 0x34
2893 +OLDSS = 0x38
2894 +
2895 +CF_MASK = 0x00000001
2896 +TF_MASK = 0x00000100
2897 +IF_MASK = 0x00000200
2898 +DF_MASK = 0x00000400
2899 +NT_MASK = 0x00004000
2900 +VM_MASK = 0x00020000
2901 +/* Pseudo-eflags. */
2902 +NMI_MASK = 0x80000000
2903 +
2904 +#ifndef CONFIG_XEN
2905 +#define DISABLE_INTERRUPTS cli
2906 +#define ENABLE_INTERRUPTS sti
2907 +#else
2908 +/* Offsets into shared_info_t. */
2909 +#define evtchn_upcall_pending /* 0 */
2910 +#define evtchn_upcall_mask 1
2911 +
2912 +#define sizeof_vcpu_shift 6
2913 +
2914 +#ifdef CONFIG_SMP
2915 +#define GET_VCPU_INFO movl TI_cpu(%ebp),%esi ; \
2916 + shl $sizeof_vcpu_shift,%esi ; \
2917 + addl HYPERVISOR_shared_info,%esi
2918 +#else
2919 +#define GET_VCPU_INFO movl HYPERVISOR_shared_info,%esi
2920 +#endif
2921 +
2922 +#define __DISABLE_INTERRUPTS movb $1,evtchn_upcall_mask(%esi)
2923 +#define __ENABLE_INTERRUPTS movb $0,evtchn_upcall_mask(%esi)
2924 +#define DISABLE_INTERRUPTS GET_VCPU_INFO ; \
2925 + __DISABLE_INTERRUPTS
2926 +#define ENABLE_INTERRUPTS GET_VCPU_INFO ; \
2927 + __ENABLE_INTERRUPTS
2928 +#define __TEST_PENDING testb $0xFF,evtchn_upcall_pending(%esi)
2929 +#endif
2930 +
2931 +#ifdef CONFIG_PREEMPT
2932 +#define preempt_stop cli
2933 +#else
2934 +#define preempt_stop
2935 +#define resume_kernel restore_nocheck
2936 +#endif
2937 +
2938 +#define SAVE_ALL \
2939 + cld; \
2940 + pushl %es; \
2941 + pushl %ds; \
2942 + pushl %eax; \
2943 + pushl %ebp; \
2944 + pushl %edi; \
2945 + pushl %esi; \
2946 + pushl %edx; \
2947 + pushl %ecx; \
2948 + pushl %ebx; \
2949 + movl $(__USER_DS), %edx; \
2950 + movl %edx, %ds; \
2951 + movl %edx, %es;
2952 +
2953 +#define RESTORE_INT_REGS \
2954 + popl %ebx; \
2955 + popl %ecx; \
2956 + popl %edx; \
2957 + popl %esi; \
2958 + popl %edi; \
2959 + popl %ebp; \
2960 + popl %eax
2961 +
2962 +#define RESTORE_REGS \
2963 + RESTORE_INT_REGS; \
2964 +1: popl %ds; \
2965 +2: popl %es; \
2966 +.section .fixup,"ax"; \
2967 +3: movl $0,(%esp); \
2968 + jmp 1b; \
2969 +4: movl $0,(%esp); \
2970 + jmp 2b; \
2971 +.previous; \
2972 +.section __ex_table,"a";\
2973 + .align 4; \
2974 + .long 1b,3b; \
2975 + .long 2b,4b; \
2976 +.previous
2977 +
2978 +
2979 +ENTRY(ret_from_fork)
2980 + pushl %eax
2981 + call schedule_tail
2982 + GET_THREAD_INFO(%ebp)
2983 + popl %eax
2984 + jmp syscall_exit
2985 +
2986 +/*
2987 + * Return to user mode is not as complex as all this looks,
2988 + * but we want the default path for a system call return to
2989 + * go as quickly as possible which is why some of this is
2990 + * less clear than it otherwise should be.
2991 + */
2992 +
2993 + # userspace resumption stub bypassing syscall exit tracing
2994 + ALIGN
2995 +ret_from_exception:
2996 + preempt_stop
2997 +ret_from_intr:
2998 + GET_THREAD_INFO(%ebp)
2999 + movl EFLAGS(%esp), %eax # mix EFLAGS and CS
3000 + movb CS(%esp), %al
3001 + testl $(VM_MASK | 2), %eax
3002 + jz resume_kernel
3003 +ENTRY(resume_userspace)
3004 + DISABLE_INTERRUPTS # make sure we don't miss an interrupt
3005 + # setting need_resched or sigpending
3006 + # between sampling and the iret
3007 + movl TI_flags(%ebp), %ecx
3008 + andl $_TIF_WORK_MASK, %ecx # is there any work to be done on
3009 + # int/exception return?
3010 + jne work_pending
3011 + jmp restore_all
3012 +
3013 +#ifdef CONFIG_PREEMPT
3014 +ENTRY(resume_kernel)
3015 + cli
3016 + cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ?
3017 + jnz restore_nocheck
3018 +need_resched:
3019 + movl TI_flags(%ebp), %ecx # need_resched set ?
3020 + testb $_TIF_NEED_RESCHED, %cl
3021 + jz restore_all
3022 + testl $IF_MASK,EFLAGS(%esp) # interrupts off (exception path) ?
3023 + jz restore_all
3024 + call preempt_schedule_irq
3025 + jmp need_resched
3026 +#endif
3027 +
3028 +/* SYSENTER_RETURN points to after the "sysenter" instruction in
3029 + the vsyscall page. See vsyscall-sysenter.S, which defines the symbol. */
3030 +
3031 + # sysenter call handler stub
3032 +ENTRY(sysenter_entry)
3033 + movl SYSENTER_stack_esp0(%esp),%esp
3034 +sysenter_past_esp:
3035 + sti
3036 + pushl $(__USER_DS)
3037 + pushl %ebp
3038 + pushfl
3039 + pushl $(__USER_CS)
3040 + pushl $SYSENTER_RETURN
3041 +
3042 +/*
3043 + * Load the potential sixth argument from user stack.
3044 + * Careful about security.
3045 + */
3046 + cmpl $__PAGE_OFFSET-3,%ebp
3047 + jae syscall_fault
3048 +1: movl (%ebp),%ebp
3049 +.section __ex_table,"a"
3050 + .align 4
3051 + .long 1b,syscall_fault
3052 +.previous
3053 +
3054 + pushl %eax
3055 + SAVE_ALL
3056 + GET_THREAD_INFO(%ebp)
3057 +
3058 + /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
3059 + testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
3060 + jnz syscall_trace_entry
3061 + cmpl $(nr_syscalls), %eax
3062 + jae syscall_badsys
3063 + call *sys_call_table(,%eax,4)
3064 + movl %eax,EAX(%esp)
3065 + DISABLE_INTERRUPTS
3066 + movl TI_flags(%ebp), %ecx
3067 + testw $_TIF_ALLWORK_MASK, %cx
3068 + jne syscall_exit_work
3069 +/* if something modifies registers it must also disable sysexit */
3070 + movl EIP(%esp), %edx
3071 + movl OLDESP(%esp), %ecx
3072 + xorl %ebp,%ebp
3073 +#ifdef CONFIG_XEN
3074 + __ENABLE_INTERRUPTS
3075 +sysexit_scrit: /**** START OF SYSEXIT CRITICAL REGION ****/
3076 + __TEST_PENDING
3077 + jnz 14f # process more events if necessary...
3078 + movl ESI(%esp), %esi
3079 + sysexit
3080 +14: __DISABLE_INTERRUPTS
3081 +sysexit_ecrit: /**** END OF SYSEXIT CRITICAL REGION ****/
3082 + push %esp
3083 + call evtchn_do_upcall
3084 + add $4,%esp
3085 + jmp ret_from_intr
3086 +#else
3087 + sti
3088 + sysexit
3089 +#endif /* !CONFIG_XEN */
3090 +
3091 +
3092 + # system call handler stub
3093 +ENTRY(system_call)
3094 + pushl %eax # save orig_eax
3095 + SAVE_ALL
3096 + GET_THREAD_INFO(%ebp)
3097 + # system call tracing in operation / emulation
3098 + /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
3099 + testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
3100 + jnz syscall_trace_entry
3101 + cmpl $(nr_syscalls), %eax
3102 + jae syscall_badsys
3103 +syscall_call:
3104 + call *sys_call_table(,%eax,4)
3105 + movl %eax,EAX(%esp) # store the return value
3106 +syscall_exit:
3107 + DISABLE_INTERRUPTS # make sure we don't miss an interrupt
3108 + # setting need_resched or sigpending
3109 + # between sampling and the iret
3110 + movl TI_flags(%ebp), %ecx
3111 + testw $_TIF_ALLWORK_MASK, %cx # current->work
3112 + jne syscall_exit_work
3113 +
3114 +restore_all:
3115 +#ifndef CONFIG_XEN
3116 + movl EFLAGS(%esp), %eax # mix EFLAGS, SS and CS
3117 + # Warning: OLDSS(%esp) contains the wrong/random values if we
3118 + # are returning to the kernel.
3119 + # See comments in process.c:copy_thread() for details.
3120 + movb OLDSS(%esp), %ah
3121 + movb CS(%esp), %al
3122 + andl $(VM_MASK | (4 << 8) | 3), %eax
3123 + cmpl $((4 << 8) | 3), %eax
3124 + je ldt_ss # returning to user-space with LDT SS
3125 +restore_nocheck:
3126 +#else
3127 +restore_nocheck:
3128 + movl EFLAGS(%esp), %eax
3129 + testl $(VM_MASK|NMI_MASK), %eax
3130 + jnz hypervisor_iret
3131 + shr $9, %eax # EAX[0] == IRET_EFLAGS.IF
3132 + GET_VCPU_INFO
3133 + andb evtchn_upcall_mask(%esi),%al
3134 + andb $1,%al # EAX[0] == IRET_EFLAGS.IF & event_mask
3135 + jnz restore_all_enable_events # != 0 => enable event delivery
3136 +#endif
3137 + RESTORE_REGS
3138 + addl $4, %esp
3139 +1: iret
3140 +.section .fixup,"ax"
3141 +iret_exc:
3142 +#ifndef CONFIG_XEN
3143 + sti
3144 +#endif
3145 + pushl $0 # no error code
3146 + pushl $do_iret_error
3147 + jmp error_code
3148 +.previous
3149 +.section __ex_table,"a"
3150 + .align 4
3151 + .long 1b,iret_exc
3152 +.previous
3153 +
3154 +#ifndef CONFIG_XEN
3155 +ldt_ss:
3156 + larl OLDSS(%esp), %eax
3157 + jnz restore_nocheck
3158 + testl $0x00400000, %eax # returning to 32bit stack?
3159 + jnz restore_nocheck # all right, normal return
3160 + /* If returning to userspace with 16bit stack,
3161 + * try to fix the higher word of ESP, as the CPU
3162 + * won't restore it.
3163 + * This is an "official" bug of all the x86-compatible
3164 + * CPUs, which we can try to work around to make
3165 + * dosemu and wine happy. */
3166 + subl $8, %esp # reserve space for switch16 pointer
3167 + cli
3168 + movl %esp, %eax
3169 + /* Set up the 16bit stack frame with switch32 pointer on top,
3170 + * and a switch16 pointer on top of the current frame. */
3171 + call setup_x86_bogus_stack
3172 + RESTORE_REGS
3173 + lss 20+4(%esp), %esp # switch to 16bit stack
3174 +1: iret
3175 +.section __ex_table,"a"
3176 + .align 4
3177 + .long 1b,iret_exc
3178 +.previous
3179 +#else
3180 +hypervisor_iret:
3181 + andl $~NMI_MASK, EFLAGS(%esp)
3182 + RESTORE_REGS
3183 + addl $4, %esp
3184 + jmp hypercall_page + (__HYPERVISOR_iret * 32)
3185 +#endif
3186 +
3187 + # perform work that needs to be done immediately before resumption
3188 + ALIGN
3189 +work_pending:
3190 + testb $_TIF_NEED_RESCHED, %cl
3191 + jz work_notifysig
3192 +work_resched:
3193 + call schedule
3194 + DISABLE_INTERRUPTS # make sure we don't miss an interrupt
3195 + # setting need_resched or sigpending
3196 + # between sampling and the iret
3197 + movl TI_flags(%ebp), %ecx
3198 + andl $_TIF_WORK_MASK, %ecx # is there any work to be done other
3199 + # than syscall tracing?
3200 + jz restore_all
3201 + testb $_TIF_NEED_RESCHED, %cl
3202 + jnz work_resched
3203 +
3204 +work_notifysig: # deal with pending signals and
3205 + # notify-resume requests
3206 + testl $VM_MASK, EFLAGS(%esp)
3207 + movl %esp, %eax
3208 + jne work_notifysig_v86 # returning to kernel-space or
3209 + # vm86-space
3210 + xorl %edx, %edx
3211 + call do_notify_resume
3212 + jmp resume_userspace
3213 +
3214 + ALIGN
3215 +work_notifysig_v86:
3216 +#ifdef CONFIG_VM86
3217 + pushl %ecx # save ti_flags for do_notify_resume
3218 + call save_v86_state # %eax contains pt_regs pointer
3219 + popl %ecx
3220 + movl %eax, %esp
3221 + xorl %edx, %edx
3222 + call do_notify_resume
3223 + jmp resume_userspace
3224 +#endif
3225 +
3226 + # perform syscall exit tracing
3227 + ALIGN
3228 +syscall_trace_entry:
3229 + movl $-ENOSYS,EAX(%esp)
3230 + movl %esp, %eax
3231 + xorl %edx,%edx
3232 + call do_syscall_trace
3233 + cmpl $0, %eax
3234 + jne resume_userspace # ret != 0 -> running under PTRACE_SYSEMU,
3235 + # so must skip actual syscall
3236 + movl ORIG_EAX(%esp), %eax
3237 + cmpl $(nr_syscalls), %eax
3238 + jnae syscall_call
3239 + jmp syscall_exit
3240 +
3241 + # perform syscall exit tracing
3242 + ALIGN
3243 +syscall_exit_work:
3244 + testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP), %cl
3245 + jz work_pending
3246 + ENABLE_INTERRUPTS # could let do_syscall_trace() call
3247 + # schedule() instead
3248 + movl %esp, %eax
3249 + movl $1, %edx
3250 + call do_syscall_trace
3251 + jmp resume_userspace
3252 +
3253 + ALIGN
3254 +syscall_fault:
3255 + pushl %eax # save orig_eax
3256 + SAVE_ALL
3257 + GET_THREAD_INFO(%ebp)
3258 + movl $-EFAULT,EAX(%esp)
3259 + jmp resume_userspace
3260 +
3261 + ALIGN
3262 +syscall_badsys:
3263 + movl $-ENOSYS,EAX(%esp)
3264 + jmp resume_userspace
3265 +
3266 +#ifndef CONFIG_XEN
3267 +#define FIXUP_ESPFIX_STACK \
3268 + movl %esp, %eax; \
3269 + /* switch to 32bit stack using the pointer on top of 16bit stack */ \
3270 + lss %ss:CPU_16BIT_STACK_SIZE-8, %esp; \
3271 + /* copy data from 16bit stack to 32bit stack */ \
3272 + call fixup_x86_bogus_stack; \
3273 + /* put ESP to the proper location */ \
3274 + movl %eax, %esp;
3275 +#define UNWIND_ESPFIX_STACK \
3276 + pushl %eax; \
3277 + movl %ss, %eax; \
3278 + /* see if on 16bit stack */ \
3279 + cmpw $__ESPFIX_SS, %ax; \
3280 + jne 28f; \
3281 + movl $__KERNEL_DS, %edx; \
3282 + movl %edx, %ds; \
3283 + movl %edx, %es; \
3284 + /* switch to 32bit stack */ \
3285 + FIXUP_ESPFIX_STACK \
3286 +28: popl %eax;
3287 +
3288 +/*
3289 + * Build the entry stubs and pointer table with
3290 + * some assembler magic.
3291 + */
3292 +.data
3293 +ENTRY(interrupt)
3294 +.text
3295 +
3296 +vector=0
3297 +ENTRY(irq_entries_start)
3298 +.rept NR_IRQS
3299 + ALIGN
3300 +1: pushl $~(vector)
3301 + jmp common_interrupt
3302 +.data
3303 + .long 1b
3304 +.text
3305 +vector=vector+1
3306 +.endr
3307 +
3308 + ALIGN
3309 +common_interrupt:
3310 + SAVE_ALL
3311 + movl %esp,%eax
3312 + call do_IRQ
3313 + jmp ret_from_intr
3314 +
3315 +#define BUILD_INTERRUPT(name, nr) \
3316 +ENTRY(name) \
3317 + pushl $~(nr); \
3318 + SAVE_ALL \
3319 + movl %esp,%eax; \
3320 + call smp_/**/name; \
3321 + jmp ret_from_intr;
3322 +
3323 +/* The include is where all of the SMP etc. interrupts come from */
3324 +#include "entry_arch.h"
3325 +#else
3326 +#define UNWIND_ESPFIX_STACK
3327 +#endif
3328 +
3329 +ENTRY(divide_error)
3330 + pushl $0 # no error code
3331 + pushl $do_divide_error
3332 + ALIGN
3333 +error_code:
3334 + pushl %ds
3335 + pushl %eax
3336 + xorl %eax, %eax
3337 + pushl %ebp
3338 + pushl %edi
3339 + pushl %esi
3340 + pushl %edx
3341 + decl %eax # eax = -1
3342 + pushl %ecx
3343 + pushl %ebx
3344 + cld
3345 + pushl %es
3346 + UNWIND_ESPFIX_STACK
3347 + popl %ecx
3348 + movl ES(%esp), %edi # get the function address
3349 + movl ORIG_EAX(%esp), %edx # get the error code
3350 + movl %eax, ORIG_EAX(%esp)
3351 + movl %ecx, ES(%esp)
3352 + movl $(__USER_DS), %ecx
3353 + movl %ecx, %ds
3354 + movl %ecx, %es
3355 + movl %esp,%eax # pt_regs pointer
3356 + call *%edi
3357 + jmp ret_from_exception
3358 +
3359 +#ifdef CONFIG_XEN
3360 +# A note on the "critical region" in our callback handler.
3361 +# We want to avoid stacking callback handlers due to events occurring
3362 +# during handling of the last event. To do this, we keep events disabled
3363 +# until we've done all processing. HOWEVER, we must enable events before
3364 +# popping the stack frame (can't be done atomically) and so it would still
3365 +# be possible to get enough handler activations to overflow the stack.
3366 +# Although unlikely, bugs of that kind are hard to track down, so we'd
3367 +# like to avoid the possibility.
3368 +# So, on entry to the handler we detect whether we interrupted an
3369 +# existing activation in its critical region -- if so, we pop the current
3370 +# activation and restart the handler using the previous one.
3371 +#
3372 +# The sysexit critical region is slightly different. sysexit
3373 +# atomically removes the entire stack frame. If we interrupt in the
3374 +# critical region we know that the entire frame is present and correct
3375 +# so we can simply throw away the new one.
3376 +ENTRY(hypervisor_callback)
3377 + pushl %eax
3378 + SAVE_ALL
3379 + movl EIP(%esp),%eax
3380 + cmpl $scrit,%eax
3381 + jb 11f
3382 + cmpl $ecrit,%eax
3383 + jb critical_region_fixup
3384 + cmpl $sysexit_scrit,%eax
3385 + jb 11f
3386 + cmpl $sysexit_ecrit,%eax
3387 + ja 11f
3388 + addl $0x34,%esp # Remove cs...ebx from stack frame.
3389 +11: push %esp
3390 + call evtchn_do_upcall
3391 + add $4,%esp
3392 + jmp ret_from_intr
3393 +
3394 + ALIGN
3395 +restore_all_enable_events:
3396 + __ENABLE_INTERRUPTS
3397 +scrit: /**** START OF CRITICAL REGION ****/
3398 + __TEST_PENDING
3399 + jnz 14f # process more events if necessary...
3400 + RESTORE_REGS
3401 + addl $4, %esp
3402 +1: iret
3403 +.section __ex_table,"a"
3404 + .align 4
3405 + .long 1b,iret_exc
3406 +.previous
3407 +14: __DISABLE_INTERRUPTS
3408 + jmp 11b
3409 +ecrit: /**** END OF CRITICAL REGION ****/
3410 +# [How we do the fixup]. We want to merge the current stack frame with the
3411 +# just-interrupted frame. How we do this depends on where in the critical
3412 +# region the interrupted handler was executing, and so how many saved
3413 +# registers are in each frame. We do this quickly using the lookup table
3414 +# 'critical_fixup_table'. For each byte offset in the critical region, it
3415 +# provides the number of bytes which have already been popped from the
3416 +# interrupted stack frame.
3417 +critical_region_fixup:
3418 + addl $critical_fixup_table-scrit,%eax
3419 + movzbl (%eax),%eax # %eax contains num bytes popped
3420 + cmpb $0xff,%al # 0xff => vcpu_info critical region
3421 + jne 15f
3422 + GET_THREAD_INFO(%ebp)
3423 + xorl %eax,%eax
3424 +15: mov %esp,%esi
3425 + add %eax,%esi # %esi points at end of src region
3426 + mov %esp,%edi
3427 + add $0x34,%edi # %edi points at end of dst region
3428 + mov %eax,%ecx
3429 + shr $2,%ecx # convert byte count to dword count
3430 + je 17f # skip loop if nothing to copy
3431 +16: subl $4,%esi # pre-decrementing copy loop
3432 + subl $4,%edi
3433 + movl (%esi),%eax
3434 + movl %eax,(%edi)
3435 + loop 16b
3436 +17: movl %edi,%esp # final %edi is top of merged stack
3437 + jmp 11b
3438 +
3439 +critical_fixup_table:
3440 + .byte 0xff,0xff,0xff # testb $0xff,(%esi) = __TEST_PENDING
3441 + .byte 0xff,0xff # jnz 14f
3442 + .byte 0x00 # pop %ebx
3443 + .byte 0x04 # pop %ecx
3444 + .byte 0x08 # pop %edx
3445 + .byte 0x0c # pop %esi
3446 + .byte 0x10 # pop %edi
3447 + .byte 0x14 # pop %ebp
3448 + .byte 0x18 # pop %eax
3449 + .byte 0x1c # pop %ds
3450 + .byte 0x20 # pop %es
3451 + .byte 0x24,0x24,0x24 # add $4,%esp
3452 + .byte 0x28 # iret
3453 + .byte 0xff,0xff,0xff,0xff # movb $1,1(%esi)
3454 + .byte 0x00,0x00 # jmp 11b
3455 +
3456 +# Hypervisor uses this for application faults while it executes.
3457 +# We get here for two reasons:
3458 +# 1. Fault while reloading DS, ES, FS or GS
3459 +# 2. Fault while executing IRET
3460 +# Category 1 we fix up by reattempting the load, and zeroing the segment
3461 +# register if the load fails.
3462 +# Category 2 we fix up by jumping to do_iret_error. We cannot use the
3463 +# normal Linux return path in this case because if we use the IRET hypercall
3464 +# to pop the stack frame we end up in an infinite loop of failsafe callbacks.
3465 +# We distinguish between categories by maintaining a status value in EAX.
3466 +ENTRY(failsafe_callback)
3467 + pushl %eax
3468 + movl $1,%eax
3469 +1: mov 4(%esp),%ds
3470 +2: mov 8(%esp),%es
3471 +3: mov 12(%esp),%fs
3472 +4: mov 16(%esp),%gs
3473 + testl %eax,%eax
3474 + popl %eax
3475 + jz 5f
3476 + addl $16,%esp # EAX != 0 => Category 2 (Bad IRET)
3477 + jmp iret_exc
3478 +5: addl $16,%esp # EAX == 0 => Category 1 (Bad segment)
3479 + pushl $0
3480 + SAVE_ALL
3481 + jmp ret_from_exception
3482 +.section .fixup,"ax"; \
3483 +6: xorl %eax,%eax; \
3484 + movl %eax,4(%esp); \
3485 + jmp 1b; \
3486 +7: xorl %eax,%eax; \
3487 + movl %eax,8(%esp); \
3488 + jmp 2b; \
3489 +8: xorl %eax,%eax; \
3490 + movl %eax,12(%esp); \
3491 + jmp 3b; \
3492 +9: xorl %eax,%eax; \
3493 + movl %eax,16(%esp); \
3494 + jmp 4b; \
3495 +.previous; \
3496 +.section __ex_table,"a"; \
3497 + .align 4; \
3498 + .long 1b,6b; \
3499 + .long 2b,7b; \
3500 + .long 3b,8b; \
3501 + .long 4b,9b; \
3502 +.previous
3503 +#endif
3504 +
3505 +ENTRY(coprocessor_error)
3506 + pushl $0
3507 + pushl $do_coprocessor_error
3508 + jmp error_code
3509 +
3510 +ENTRY(simd_coprocessor_error)
3511 + pushl $0
3512 + pushl $do_simd_coprocessor_error
3513 + jmp error_code
3514 +
3515 +ENTRY(device_not_available)
3516 + pushl $-1 # mark this as an int
3517 + SAVE_ALL
3518 +#ifndef CONFIG_XEN
3519 + movl %cr0, %eax
3520 + testl $0x4, %eax # EM (math emulation bit)
3521 + je device_available_emulate
3522 + pushl $0 # temporary storage for ORIG_EIP
3523 + call math_emulate
3524 + addl $4, %esp
3525 + jmp ret_from_exception
3526 +device_available_emulate:
3527 +#endif
3528 + preempt_stop
3529 + call math_state_restore
3530 + jmp ret_from_exception
3531 +
3532 +#ifndef CONFIG_XEN
3533 +/*
3534 + * Debug traps and NMI can happen at the one SYSENTER instruction
3535 + * that sets up the real kernel stack. Check here, since we can't
3536 + * allow the wrong stack to be used.
3537 + *
3538 + * "SYSENTER_stack_esp0+12" is because the NMI/debug handler will have
3539 + * already pushed 3 words if it hits on the sysenter instruction:
3540 + * eflags, cs and eip.
3541 + *
3542 + * We just load the right stack, and push the three (known) values
3543 + * by hand onto the new stack - while updating the return eip past
3544 + * the instruction that would have done it for sysenter.
3545 + */
3546 +#define FIX_STACK(offset, ok, label) \
3547 + cmpw $__KERNEL_CS,4(%esp); \
3548 + jne ok; \
3549 +label: \
3550 + movl SYSENTER_stack_esp0+offset(%esp),%esp; \
3551 + pushfl; \
3552 + pushl $__KERNEL_CS; \
3553 + pushl $sysenter_past_esp
3554 +#endif /* CONFIG_XEN */
3555 +
3556 +KPROBE_ENTRY(debug)
3557 +#ifndef CONFIG_XEN
3558 + cmpl $sysenter_entry,(%esp)
3559 + jne debug_stack_correct
3560 + FIX_STACK(12, debug_stack_correct, debug_esp_fix_insn)
3561 +debug_stack_correct:
3562 +#endif /* !CONFIG_XEN */
3563 + pushl $-1 # mark this as an int
3564 + SAVE_ALL
3565 + xorl %edx,%edx # error code 0
3566 + movl %esp,%eax # pt_regs pointer
3567 + call do_debug
3568 + jmp ret_from_exception
3569 + .previous .text
3570 +
3571 +#ifndef CONFIG_XEN
3572 +/*
3573 + * NMI is doubly nasty. It can happen _while_ we're handling
3574 + * a debug fault, and the debug fault hasn't yet been able to
3575 + * clear up the stack. So we first check whether we got an
3576 + * NMI on the sysenter entry path, but after that we need to
3577 + * check whether we got an NMI on the debug path where the debug
3578 + * fault happened on the sysenter path.
3579 + */
3580 +ENTRY(nmi)
3581 + pushl %eax
3582 + movl %ss, %eax
3583 + cmpw $__ESPFIX_SS, %ax
3584 + popl %eax
3585 + je nmi_16bit_stack
3586 + cmpl $sysenter_entry,(%esp)
3587 + je nmi_stack_fixup
3588 + pushl %eax
3589 + movl %esp,%eax
3590 + /* Do not access memory above the end of our stack page,
3591 + * it might not exist.
3592 + */
3593 + andl $(THREAD_SIZE-1),%eax
3594 + cmpl $(THREAD_SIZE-20),%eax
3595 + popl %eax
3596 + jae nmi_stack_correct
3597 + cmpl $sysenter_entry,12(%esp)
3598 + je nmi_debug_stack_check
3599 +nmi_stack_correct:
3600 + pushl %eax
3601 + SAVE_ALL
3602 + xorl %edx,%edx # zero error code
3603 + movl %esp,%eax # pt_regs pointer
3604 + call do_nmi
3605 + jmp restore_all
3606 +
3607 +nmi_stack_fixup:
3608 + FIX_STACK(12,nmi_stack_correct, 1)
3609 + jmp nmi_stack_correct
3610 +nmi_debug_stack_check:
3611 + cmpw $__KERNEL_CS,16(%esp)
3612 + jne nmi_stack_correct
3613 + cmpl $debug,(%esp)
3614 + jb nmi_stack_correct
3615 + cmpl $debug_esp_fix_insn,(%esp)
3616 + ja nmi_stack_correct
3617 + FIX_STACK(24,nmi_stack_correct, 1)
3618 + jmp nmi_stack_correct
3619 +
3620 +nmi_16bit_stack:
3621 + /* create the pointer to lss back */
3622 + pushl %ss
3623 + pushl %esp
3624 + movzwl %sp, %esp
3625 + addw $4, (%esp)
3626 + /* copy the iret frame of 12 bytes */
3627 + .rept 3
3628 + pushl 16(%esp)
3629 + .endr
3630 + pushl %eax
3631 + SAVE_ALL
3632 + FIXUP_ESPFIX_STACK # %eax == %esp
3633 + xorl %edx,%edx # zero error code
3634 + call do_nmi
3635 + RESTORE_REGS
3636 + lss 12+4(%esp), %esp # back to 16bit stack
3637 +1: iret
3638 +.section __ex_table,"a"
3639 + .align 4
3640 + .long 1b,iret_exc
3641 +.previous
3642 +#else
3643 +ENTRY(nmi)
3644 + pushl %eax
3645 + SAVE_ALL
3646 + xorl %edx,%edx # zero error code
3647 + movl %esp,%eax # pt_regs pointer
3648 + call do_nmi
3649 + orl $NMI_MASK, EFLAGS(%esp)
3650 + jmp restore_all
3651 +#endif
3652 +
3653 +KPROBE_ENTRY(int3)
3654 + pushl $-1 # mark this as an int
3655 + SAVE_ALL
3656 + xorl %edx,%edx # zero error code
3657 + movl %esp,%eax # pt_regs pointer
3658 + call do_int3
3659 + jmp ret_from_exception
3660 + .previous .text
3661 +
3662 +ENTRY(overflow)
3663 + pushl $0
3664 + pushl $do_overflow
3665 + jmp error_code
3666 +
3667 +ENTRY(bounds)
3668 + pushl $0
3669 + pushl $do_bounds
3670 + jmp error_code
3671 +
3672 +ENTRY(invalid_op)
3673 + pushl $0
3674 + pushl $do_invalid_op
3675 + jmp error_code
3676 +
3677 +ENTRY(coprocessor_segment_overrun)
3678 + pushl $0
3679 + pushl $do_coprocessor_segment_overrun
3680 + jmp error_code
3681 +
3682 +ENTRY(invalid_TSS)
3683 + pushl $do_invalid_TSS
3684 + jmp error_code
3685 +
3686 +ENTRY(segment_not_present)
3687 + pushl $do_segment_not_present
3688 + jmp error_code
3689 +
3690 +ENTRY(stack_segment)
3691 + pushl $do_stack_segment
3692 + jmp error_code
3693 +
3694 +KPROBE_ENTRY(general_protection)
3695 + pushl $do_general_protection
3696 + jmp error_code
3697 + .previous .text
3698 +
3699 +ENTRY(alignment_check)
3700 + pushl $do_alignment_check
3701 + jmp error_code
3702 +
3703 +KPROBE_ENTRY(page_fault)
3704 + pushl $do_page_fault
3705 + jmp error_code
3706 + .previous .text
3707 +
3708 +#ifdef CONFIG_X86_MCE
3709 +ENTRY(machine_check)
3710 + pushl $0
3711 + pushl machine_check_vector
3712 + jmp error_code
3713 +#endif
3714 +
3715 +ENTRY(fixup_4gb_segment)
3716 + pushl $do_fixup_4gb_segment
3717 + jmp error_code
3718 +
3719 +.section .rodata,"a"
3720 +#include "syscall_table.S"
3721 +
3722 +syscall_table_size=(.-sys_call_table)
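
Everywhere entry-xen.S above would normally use a cli/sti pair it goes through the GET_VCPU_INFO / __DISABLE_INTERRUPTS / __ENABLE_INTERRUPTS / __TEST_PENDING macros instead: a Xen guest cannot rely on the real EFLAGS.IF, so it keeps a per-VCPU "virtual interrupt flag" in the shared info page and asks the hypervisor to re-deliver anything that became pending while events were masked. A rough C-level sketch of the same idea; the field names follow xen/interface/xen.h, but the helper names (xen_irq_disable() and friends) and the use of force_evtchn_callback() are illustrative assumptions, not the patch's own API:

#include <linux/compiler.h>
#include <xen/interface/xen.h>

extern shared_info_t *HYPERVISOR_shared_info;
extern void force_evtchn_callback(void);	/* assumed: poke the hypervisor so a
						 * pending upcall is delivered now */

static inline vcpu_info_t *this_vcpu(unsigned int cpu)
{
	/* same addressing as GET_VCPU_INFO: vcpu_info[] sits at the start
	 * of the shared info page, one fixed-size slot per virtual CPU */
	return &HYPERVISOR_shared_info->vcpu_info[cpu];
}

static inline void xen_irq_disable(unsigned int cpu)
{
	this_vcpu(cpu)->evtchn_upcall_mask = 1;	/* __DISABLE_INTERRUPTS */
}

static inline void xen_irq_enable(unsigned int cpu)
{
	vcpu_info_t *v = this_vcpu(cpu);

	v->evtchn_upcall_mask = 0;		/* __ENABLE_INTERRUPTS */
	barrier();
	if (v->evtchn_upcall_pending)		/* __TEST_PENDING */
		force_evtchn_callback();
}
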
3723 diff -Nur linux-2.6.16.33-noxen/arch/i386/kernel/entry.S linux-2.6.16.33/arch/i386/kernel/entry.S
3724 --- linux-2.6.16.33-noxen/arch/i386/kernel/entry.S 2006-11-22 18:06:31.000000000 +0000
3725 +++ linux-2.6.16.33/arch/i386/kernel/entry.S 2007-05-23 21:00:01.000000000 +0000
3726 @@ -177,7 +177,7 @@
3727
3728 # sysenter call handler stub
3729 ENTRY(sysenter_entry)
3730 - movl TSS_sysenter_esp0(%esp),%esp
3731 + movl SYSENTER_stack_esp0(%esp),%esp
3732 sysenter_past_esp:
3733 sti
3734 pushl $(__USER_DS)
3735 @@ -406,7 +406,7 @@
3736 ENTRY(irq_entries_start)
3737 .rept NR_IRQS
3738 ALIGN
3739 -1: pushl $vector-256
3740 +1: pushl $~(vector)
3741 jmp common_interrupt
3742 .data
3743 .long 1b
3744 @@ -423,7 +423,7 @@
3745
3746 #define BUILD_INTERRUPT(name, nr) \
3747 ENTRY(name) \
3748 - pushl $nr-256; \
3749 + pushl $~(nr); \
3750 SAVE_ALL \
3751 movl %esp,%eax; \
3752 call smp_/**/name; \
3753 @@ -492,7 +492,7 @@
3754 * that sets up the real kernel stack. Check here, since we can't
3755 * allow the wrong stack to be used.
3756 *
3757 - * "TSS_sysenter_esp0+12" is because the NMI/debug handler will have
3758 + * "SYSENTER_stack_esp0+12" is because the NMI/debug handler will have
3759 * already pushed 3 words if it hits on the sysenter instruction:
3760 * eflags, cs and eip.
3761 *
3762 @@ -504,7 +504,7 @@
3763 cmpw $__KERNEL_CS,4(%esp); \
3764 jne ok; \
3765 label: \
3766 - movl TSS_sysenter_esp0+offset(%esp),%esp; \
3767 + movl SYSENTER_stack_esp0+offset(%esp),%esp; \
3768 pushfl; \
3769 pushl $__KERNEL_CS; \
3770 pushl $sysenter_past_esp
3771 diff -Nur linux-2.6.16.33-noxen/arch/i386/kernel/fixup.c linux-2.6.16.33/arch/i386/kernel/fixup.c
3772 --- linux-2.6.16.33-noxen/arch/i386/kernel/fixup.c 1970-01-01 00:00:00.000000000 +0000
3773 +++ linux-2.6.16.33/arch/i386/kernel/fixup.c 2007-01-08 15:00:45.000000000 +0000
3774 @@ -0,0 +1,89 @@
3775 +/******************************************************************************
3776 + * fixup.c
3777 + *
3778 + * Binary-rewriting of certain IA32 instructions, on notification by Xen.
3779 + * Used to avoid repeated slow emulation of common instructions used by the
3780 + * user-space TLS (Thread-Local Storage) libraries.
3781 + *
3782 + * **** NOTE ****
3783 + * Issues with the binary rewriting have caused it to be removed. Instead
3784 + * we rely on Xen's emulator to boot the kernel, and then print a banner
3785 + * message recommending that the user disable /lib/tls.
3786 + *
3787 + * Copyright (c) 2004, K A Fraser
3788 + *
3789 + * This program is free software; you can redistribute it and/or modify
3790 + * it under the terms of the GNU General Public License as published by
3791 + * the Free Software Foundation; either version 2 of the License, or
3792 + * (at your option) any later version.
3793 + *
3794 + * This program is distributed in the hope that it will be useful,
3795 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
3796 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
3797 + * GNU General Public License for more details.
3798 + *
3799 + * You should have received a copy of the GNU General Public License
3800 + * along with this program; if not, write to the Free Software
3801 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
3802 + */
3803 +
3804 +#include <linux/config.h>
3805 +#include <linux/init.h>
3806 +#include <linux/sched.h>
3807 +#include <linux/slab.h>
3808 +#include <linux/kernel.h>
3809 +#include <linux/delay.h>
3810 +#include <linux/version.h>
3811 +
3812 +#define DP(_f, _args...) printk(KERN_ALERT " " _f "\n" , ## _args )
3813 +
3814 +fastcall void do_fixup_4gb_segment(struct pt_regs *regs, long error_code)
3815 +{
3816 + static unsigned long printed = 0;
3817 + char info[100];
3818 + int i;
3819 +
3820 + /* Ignore statically-linked init. */
3821 + if (current->tgid == 1)
3822 + return;
3823 +
3824 + HYPERVISOR_vm_assist(
3825 + VMASST_CMD_disable, VMASST_TYPE_4gb_segments_notify);
3826 +
3827 + if (test_and_set_bit(0, &printed))
3828 + return;
3829 +
3830 + sprintf(info, "%s (pid=%d)", current->comm, current->tgid);
3831 +
3832 + DP("");
3833 + DP("***************************************************************");
3834 + DP("***************************************************************");
3835 + DP("** WARNING: Currently emulating unsupported memory accesses **");
3836 + DP("** in /lib/tls glibc libraries. The emulation is **");
3837 + DP("** slow. To ensure full performance you should **");
3838 + DP("** install a 'xen-friendly' (nosegneg) version of **");
3839 + DP("** the library, or disable tls support by executing **");
3840 + DP("** the following as root: **");
3841 + DP("** mv /lib/tls /lib/tls.disabled **");
3842 + DP("** Offending process: %-38.38s **", info);
3843 + DP("***************************************************************");
3844 + DP("***************************************************************");
3845 + DP("");
3846 +
3847 + for (i = 5; i > 0; i--) {
3848 + touch_softlockup_watchdog();
3849 + printk("Pausing... %d", i);
3850 + mdelay(1000);
3851 + printk("\b\b\b\b\b\b\b\b\b\b\b\b");
3852 + }
3853 +
3854 + printk("Continuing...\n\n");
3855 +}
3856 +
3857 +static int __init fixup_init(void)
3858 +{
3859 + HYPERVISOR_vm_assist(
3860 + VMASST_CMD_enable, VMASST_TYPE_4gb_segments_notify);
3861 + return 0;
3862 +}
3863 +__initcall(fixup_init);
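The control flow of the new fixup.c above reduces to a small pattern: enable the 4gb-segments notification assist at boot, then, on the first notification, disable it again and warn exactly once. A minimal sketch of that pattern, using the same hypercall and constants as the file; the function and variable names are illustrative only, not part of the patch:

/* Sketch only: the enable-at-boot / warn-once shape of fixup.c above. */
static unsigned long example_warned;

static int __init example_notify_init(void)
{
	/* ask Xen to notify us instead of silently emulating */
	HYPERVISOR_vm_assist(VMASST_CMD_enable,
			     VMASST_TYPE_4gb_segments_notify);
	return 0;
}

fastcall void example_notify_handler(struct pt_regs *regs, long error_code)
{
	/* stop further notifications, then print the banner only once */
	HYPERVISOR_vm_assist(VMASST_CMD_disable,
			     VMASST_TYPE_4gb_segments_notify);
	if (!test_and_set_bit(0, &example_warned))
		printk(KERN_ALERT "slow emulation of /lib/tls segment use\n");
}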
3864 diff -Nur linux-2.6.16.33-noxen/arch/i386/kernel/head-xen.S linux-2.6.16.33/arch/i386/kernel/head-xen.S
3865 --- linux-2.6.16.33-noxen/arch/i386/kernel/head-xen.S 1970-01-01 00:00:00.000000000 +0000
3866 +++ linux-2.6.16.33/arch/i386/kernel/head-xen.S 2007-01-08 15:00:45.000000000 +0000
3867 @@ -0,0 +1,202 @@
3868 +
3869 +
3870 +.text
3871 +#include <linux/config.h>
3872 +#include <linux/elfnote.h>
3873 +#include <linux/threads.h>
3874 +#include <linux/linkage.h>
3875 +#include <asm/segment.h>
3876 +#include <asm/page.h>
3877 +#include <asm/thread_info.h>
3878 +#include <asm/asm-offsets.h>
3879 +#include <xen/interface/xen.h>
3880 +#include <xen/interface/elfnote.h>
3881 +
3882 +/*
3883 + * References to members of the new_cpu_data structure.
3884 + */
3885 +
3886 +#define X86 new_cpu_data+CPUINFO_x86
3887 +#define X86_VENDOR new_cpu_data+CPUINFO_x86_vendor
3888 +#define X86_MODEL new_cpu_data+CPUINFO_x86_model
3889 +#define X86_MASK new_cpu_data+CPUINFO_x86_mask
3890 +#define X86_HARD_MATH new_cpu_data+CPUINFO_hard_math
3891 +#define X86_CPUID new_cpu_data+CPUINFO_cpuid_level
3892 +#define X86_CAPABILITY new_cpu_data+CPUINFO_x86_capability
3893 +#define X86_VENDOR_ID new_cpu_data+CPUINFO_x86_vendor_id
3894 +
3895 +#define VIRT_ENTRY_OFFSET 0x0
3896 +.org VIRT_ENTRY_OFFSET
3897 +ENTRY(startup_32)
3898 + movl %esi,xen_start_info
3899 + cld
3900 +
3901 + /* Set up the stack pointer */
3902 + movl $(init_thread_union+THREAD_SIZE),%esp
3903 +
3904 + /* get vendor info */
3905 + xorl %eax,%eax # call CPUID with 0 -> return vendor ID
3906 + XEN_CPUID
3907 + movl %eax,X86_CPUID # save CPUID level
3908 + movl %ebx,X86_VENDOR_ID # lo 4 chars
3909 + movl %edx,X86_VENDOR_ID+4 # next 4 chars
3910 + movl %ecx,X86_VENDOR_ID+8 # last 4 chars
3911 +
3912 + movl $1,%eax # Use the CPUID instruction to get CPU type
3913 + XEN_CPUID
3914 + movb %al,%cl # save reg for future use
3915 + andb $0x0f,%ah # mask processor family
3916 + movb %ah,X86
3917 + andb $0xf0,%al # mask model
3918 + shrb $4,%al
3919 + movb %al,X86_MODEL
3920 + andb $0x0f,%cl # mask mask revision
3921 + movb %cl,X86_MASK
3922 + movl %edx,X86_CAPABILITY
3923 +
3924 + movb $1,X86_HARD_MATH
3925 +
3926 + xorl %eax,%eax # Clear FS/GS and LDT
3927 + movl %eax,%fs
3928 + movl %eax,%gs
3929 + cld # gcc2 wants the direction flag cleared at all times
3930 +
3931 + call start_kernel
3932 +L6:
3933 + jmp L6 # main should never return here, but
3934 + # just in case, we know what happens.
3935 +
3936 +#define HYPERCALL_PAGE_OFFSET 0x1000
3937 +.org HYPERCALL_PAGE_OFFSET
3938 +ENTRY(hypercall_page)
3939 +.skip 0x1000
3940 +
3941 +/*
3942 + * Real beginning of normal "text" segment
3943 + */
3944 +ENTRY(stext)
3945 +ENTRY(_stext)
3946 +
3947 +/*
3948 + * BSS section
3949 + */
3950 +.section ".bss.page_aligned","w"
3951 +ENTRY(empty_zero_page)
3952 + .fill 4096,1,0
3953 +
3954 +/*
3955 + * This starts the data section.
3956 + */
3957 +.data
3958 +
3959 +/*
3960 + * The Global Descriptor Table contains 32 quadwords, per-CPU.
3961 + */
3962 +ENTRY(cpu_gdt_table)
3963 + .quad 0x0000000000000000 /* NULL descriptor */
3964 + .quad 0x0000000000000000 /* 0x0b reserved */
3965 + .quad 0x0000000000000000 /* 0x13 reserved */
3966 + .quad 0x0000000000000000 /* 0x1b reserved */
3967 + .quad 0x0000000000000000 /* 0x20 unused */
3968 + .quad 0x0000000000000000 /* 0x28 unused */
3969 + .quad 0x0000000000000000 /* 0x33 TLS entry 1 */
3970 + .quad 0x0000000000000000 /* 0x3b TLS entry 2 */
3971 + .quad 0x0000000000000000 /* 0x43 TLS entry 3 */
3972 + .quad 0x0000000000000000 /* 0x4b reserved */
3973 + .quad 0x0000000000000000 /* 0x53 reserved */
3974 + .quad 0x0000000000000000 /* 0x5b reserved */
3975 +
3976 + .quad 0x00cf9a000000ffff /* 0x60 kernel 4GB code at 0x00000000 */
3977 + .quad 0x00cf92000000ffff /* 0x68 kernel 4GB data at 0x00000000 */
3978 + .quad 0x00cffa000000ffff /* 0x73 user 4GB code at 0x00000000 */
3979 + .quad 0x00cff2000000ffff /* 0x7b user 4GB data at 0x00000000 */
3980 +
3981 + .quad 0x0000000000000000 /* 0x80 TSS descriptor */
3982 + .quad 0x0000000000000000 /* 0x88 LDT descriptor */
3983 +
3984 + /*
3985 + * Segments used for calling PnP BIOS have byte granularity.
3986 + * The code segments and data segments have fixed 64k limits,
3987 + * the transfer segment sizes are set at run time.
3988 + */
3989 + .quad 0x0000000000000000 /* 0x90 32-bit code */
3990 + .quad 0x0000000000000000 /* 0x98 16-bit code */
3991 + .quad 0x0000000000000000 /* 0xa0 16-bit data */
3992 + .quad 0x0000000000000000 /* 0xa8 16-bit data */
3993 + .quad 0x0000000000000000 /* 0xb0 16-bit data */
3994 +
3995 + /*
3996 + * The APM segments have byte granularity and their bases
3997 + * are set at run time. All have 64k limits.
3998 + */
3999 + .quad 0x0000000000000000 /* 0xb8 APM CS code */
4000 + .quad 0x0000000000000000 /* 0xc0 APM CS 16 code (16 bit) */
4001 + .quad 0x0000000000000000 /* 0xc8 APM DS data */
4002 +
4003 + .quad 0x0000000000000000 /* 0xd0 - ESPFIX 16-bit SS */
4004 + .quad 0x0000000000000000 /* 0xd8 - unused */
4005 + .quad 0x0000000000000000 /* 0xe0 - unused */
4006 + .quad 0x0000000000000000 /* 0xe8 - unused */
4007 + .quad 0x0000000000000000 /* 0xf0 - unused */
4008 + .quad 0x0000000000000000 /* 0xf8 - GDT entry 31: double-fault TSS */
4009 +
4010 +#ifdef CONFIG_XEN_COMPAT_030002
4011 +/*
4012 + * __xen_guest information
4013 + */
4014 +.macro utoa value
4015 + .if (\value) < 0 || (\value) >= 0x10
4016 + utoa (((\value)>>4)&0x0fffffff)
4017 + .endif
4018 + .if ((\value) & 0xf) < 10
4019 + .byte '0' + ((\value) & 0xf)
4020 + .else
4021 + .byte 'A' + ((\value) & 0xf) - 10
4022 + .endif
4023 +.endm
4024 +
4025 +.section __xen_guest
4026 + .ascii "GUEST_OS=linux,GUEST_VER=2.6"
4027 + .ascii ",XEN_VER=xen-3.0"
4028 + .ascii ",VIRT_BASE=0x"
4029 + utoa __PAGE_OFFSET
4030 + .ascii ",ELF_PADDR_OFFSET=0x"
4031 + utoa __PAGE_OFFSET
4032 + .ascii ",VIRT_ENTRY=0x"
4033 + utoa (__PAGE_OFFSET + __PHYSICAL_START + VIRT_ENTRY_OFFSET)
4034 + .ascii ",HYPERCALL_PAGE=0x"
4035 + utoa ((__PHYSICAL_START+HYPERCALL_PAGE_OFFSET)>>PAGE_SHIFT)
4036 + .ascii ",FEATURES=writable_page_tables"
4037 + .ascii "|writable_descriptor_tables"
4038 + .ascii "|auto_translated_physmap"
4039 + .ascii "|pae_pgdir_above_4gb"
4040 + .ascii "|supervisor_mode_kernel"
4041 +#ifdef CONFIG_X86_PAE
4042 + .ascii ",PAE=yes[extended-cr3]"
4043 +#else
4044 + .ascii ",PAE=no"
4045 +#endif
4046 + .ascii ",LOADER=generic"
4047 + .byte 0
4048 +#endif /* CONFIG_XEN_COMPAT_030002 */
4049 +
4050 +
4051 + ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz, "linux")
4052 + ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz, "2.6")
4053 + ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz, "xen-3.0")
4054 + ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, .long, __PAGE_OFFSET)
4055 +#ifdef CONFIG_XEN_COMPAT_030002
4056 + ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET, .long, __PAGE_OFFSET)
4057 +#else
4058 + ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET, .long, 0)
4059 +#endif /* !CONFIG_XEN_COMPAT_030002 */
4060 + ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .long, startup_32)
4061 + ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .long, hypercall_page)
4062 + ELFNOTE(Xen, XEN_ELFNOTE_HV_START_LOW, .long, HYPERVISOR_VIRT_START)
4063 + ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz, "writable_page_tables|writable_descriptor_tables|auto_translated_physmap|pae_pgdir_above_4gb|supervisor_mode_kernel")
4064 +#ifdef CONFIG_X86_PAE
4065 + ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz, "yes")
4066 +#else
4067 + ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz, "no")
4068 +#endif
4069 + ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz, "generic")
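The recursive utoa macro defined earlier in head-xen.S builds, at assembly time, the uppercase hexadecimal strings embedded in the legacy __xen_guest section (VIRT_BASE, HYPERCALL_PAGE and friends). A hypothetical C analogue, only to illustrate the digit ordering; the function name and buffer handling are not part of the patch:

/* Emit the hex digits of value into out, most significant nibble first,
 * using '0'-'9' and 'A'-'F' exactly as the utoa macro does.
 * Returns a pointer just past the last digit written. */
static char *example_utoa_hex(unsigned long value, char *out)
{
	unsigned long digit = value & 0xf;

	if (value >= 0x10)		/* recurse for the higher nibbles first */
		out = example_utoa_hex(value >> 4, out);
	*out++ = digit < 10 ? '0' + digit : 'A' + digit - 10;
	return out;
}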
4070 diff -Nur linux-2.6.16.33-noxen/arch/i386/kernel/init_task-xen.c linux-2.6.16.33/arch/i386/kernel/init_task-xen.c
4071 --- linux-2.6.16.33-noxen/arch/i386/kernel/init_task-xen.c 1970-01-01 00:00:00.000000000 +0000
4072 +++ linux-2.6.16.33/arch/i386/kernel/init_task-xen.c 2007-01-08 15:00:45.000000000 +0000
4073 @@ -0,0 +1,51 @@
4074 +#include <linux/mm.h>
4075 +#include <linux/module.h>
4076 +#include <linux/sched.h>
4077 +#include <linux/init.h>
4078 +#include <linux/init_task.h>
4079 +#include <linux/fs.h>
4080 +#include <linux/mqueue.h>
4081 +
4082 +#include <asm/uaccess.h>
4083 +#include <asm/pgtable.h>
4084 +#include <asm/desc.h>
4085 +
4086 +static struct fs_struct init_fs = INIT_FS;
4087 +static struct files_struct init_files = INIT_FILES;
4088 +static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
4089 +static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
4090 +
4091 +#define swapper_pg_dir ((pgd_t *)NULL)
4092 +struct mm_struct init_mm = INIT_MM(init_mm);
4093 +#undef swapper_pg_dir
4094 +
4095 +EXPORT_SYMBOL(init_mm);
4096 +
4097 +/*
4098 + * Initial thread structure.
4099 + *
4100 + * We need to make sure that this is THREAD_SIZE aligned due to the
4101 + * way process stacks are handled. This is done by having a special
4102 + * "init_task" linker map entry..
4103 + */
4104 +union thread_union init_thread_union
4105 + __attribute__((__section__(".data.init_task"))) =
4106 + { INIT_THREAD_INFO(init_task) };
4107 +
4108 +/*
4109 + * Initial task structure.
4110 + *
4111 + * All other task structs will be allocated on slabs in fork.c
4112 + */
4113 +struct task_struct init_task = INIT_TASK(init_task);
4114 +
4115 +EXPORT_SYMBOL(init_task);
4116 +
4117 +#ifndef CONFIG_X86_NO_TSS
4118 +/*
4119 + * per-CPU TSS segments. Threads are completely 'soft' on Linux,
4120 + * no more per-task TSS's.
4121 + */
4122 +DEFINE_PER_CPU(struct tss_struct, init_tss) ____cacheline_internodealigned_in_smp = INIT_TSS;
4123 +#endif
4124 +
4125 diff -Nur linux-2.6.16.33-noxen/arch/i386/kernel/io_apic-xen.c linux-2.6.16.33/arch/i386/kernel/io_apic-xen.c
4126 --- linux-2.6.16.33-noxen/arch/i386/kernel/io_apic-xen.c 1970-01-01 00:00:00.000000000 +0000
4127 +++ linux-2.6.16.33/arch/i386/kernel/io_apic-xen.c 2007-01-08 15:00:45.000000000 +0000
4128 @@ -0,0 +1,2748 @@
4129 +/*
4130 + * Intel IO-APIC support for multi-Pentium hosts.
4131 + *
4132 + * Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar, Hajnalka Szabo
4133 + *
4134 + * Many thanks to Stig Venaas for trying out countless experimental
4135 + * patches and reporting/debugging problems patiently!
4136 + *
4137 + * (c) 1999, Multiple IO-APIC support, developed by
4138 + * Ken-ichi Yaku <yaku@css1.kbnes.nec.co.jp> and
4139 + * Hidemi Kishimoto <kisimoto@css1.kbnes.nec.co.jp>,
4140 + * further tested and cleaned up by Zach Brown <zab@redhat.com>
4141 + * and Ingo Molnar <mingo@redhat.com>
4142 + *
4143 + * Fixes
4144 + * Maciej W. Rozycki : Bits for genuine 82489DX APICs;
4145 + * thanks to Eric Gilmore
4146 + * and Rolf G. Tews
4147 + * for testing these extensively
4148 + * Paul Diefenbaugh : Added full ACPI support
4149 + */
4150 +
4151 +#include <linux/mm.h>
4152 +#include <linux/interrupt.h>
4153 +#include <linux/init.h>
4154 +#include <linux/delay.h>
4155 +#include <linux/sched.h>
4156 +#include <linux/config.h>
4157 +#include <linux/smp_lock.h>
4158 +#include <linux/mc146818rtc.h>
4159 +#include <linux/compiler.h>
4160 +#include <linux/acpi.h>
4161 +#include <linux/module.h>
4162 +#include <linux/sysdev.h>
4163 +
4164 +#include <asm/io.h>
4165 +#include <asm/smp.h>
4166 +#include <asm/desc.h>
4167 +#include <asm/timer.h>
4168 +#include <asm/i8259.h>
4169 +
4170 +#include <mach_apic.h>
4171 +
4172 +#include "io_ports.h"
4173 +
4174 +#ifdef CONFIG_XEN
4175 +
4176 +#include <xen/interface/xen.h>
4177 +#include <xen/interface/physdev.h>
4178 +
4179 +/* Fake i8259 */
4180 +#define make_8259A_irq(_irq) (io_apic_irqs &= ~(1UL<<(_irq)))
4181 +#define disable_8259A_irq(_irq) ((void)0)
4182 +#define i8259A_irq_pending(_irq) (0)
4183 +
4184 +unsigned long io_apic_irqs;
4185 +
4186 +static inline unsigned int xen_io_apic_read(unsigned int apic, unsigned int reg)
4187 +{
4188 + struct physdev_apic apic_op;
4189 + int ret;
4190 +
4191 + apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr;
4192 + apic_op.reg = reg;
4193 + ret = HYPERVISOR_physdev_op(PHYSDEVOP_apic_read, &apic_op);
4194 + if (ret)
4195 + return ret;
4196 + return apic_op.value;
4197 +}
4198 +
4199 +static inline void xen_io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)
4200 +{
4201 + struct physdev_apic apic_op;
4202 +
4203 + apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr;
4204 + apic_op.reg = reg;
4205 + apic_op.value = value;
4206 + HYPERVISOR_physdev_op(PHYSDEVOP_apic_write, &apic_op);
4207 +}
4208 +
4209 +#define io_apic_read(a,r) xen_io_apic_read(a,r)
4210 +#define io_apic_write(a,r,v) xen_io_apic_write(a,r,v)
4211 +
4212 +#endif /* CONFIG_XEN */
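With these two macro overrides in place, the remainder of this file keeps calling io_apic_read()/io_apic_write() unchanged; under CONFIG_XEN each access becomes a PHYSDEVOP hypercall instead of an MMIO access to the IO-APIC register window. A hedged usage sketch, mirroring how later code in this file reads a redirection-table entry (the helper name is illustrative; real callers hold ioapic_lock around the pair of reads):

/* Illustrative only: fetch the 64-bit redirection entry for (apic, pin)
 * through the wrappers above, as clear_IO_APIC_pin() does further down. */
static void example_read_rte(unsigned int apic, unsigned int pin,
			     struct IO_APIC_route_entry *entry)
{
	*(((int *)entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin);
	*(((int *)entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin);
}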
4213 +
4214 +int (*ioapic_renumber_irq)(int ioapic, int irq);
4215 +atomic_t irq_mis_count;
4216 +
4217 +/* Where, if anywhere, is the i8259 connected in external int mode */
4218 +static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
4219 +
4220 +static DEFINE_SPINLOCK(ioapic_lock);
4221 +
4222 +int timer_over_8254 __initdata = 1;
4223 +
4224 +/*
4225 + * Is the SiS APIC rmw bug present ?
4226 + * -1 = don't know, 0 = no, 1 = yes
4227 + */
4228 +int sis_apic_bug = -1;
4229 +
4230 +/*
4231 + * # of IRQ routing registers
4232 + */
4233 +int nr_ioapic_registers[MAX_IO_APICS];
4234 +
4235 +int disable_timer_pin_1 __initdata;
4236 +
4237 +/*
4238 + * Rough estimation of how many shared IRQs there are; can
4239 + * be changed anytime.
4240 + */
4241 +#define MAX_PLUS_SHARED_IRQS NR_IRQS
4242 +#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS)
4243 +
4244 +/*
4245 + * This is performance-critical, we want to do it O(1)
4246 + *
4247 + * the indexing order of this array favors 1:1 mappings
4248 + * between pins and IRQs.
4249 + */
4250 +
4251 +static struct irq_pin_list {
4252 + int apic, pin, next;
4253 +} irq_2_pin[PIN_MAP_SIZE];
4254 +
4255 +int vector_irq[NR_VECTORS] __read_mostly = { [0 ... NR_VECTORS - 1] = -1};
4256 +#ifdef CONFIG_PCI_MSI
4257 +#define vector_to_irq(vector) \
4258 + (platform_legacy_irq(vector) ? vector : vector_irq[vector])
4259 +#else
4260 +#define vector_to_irq(vector) (vector)
4261 +#endif
4262 +
4263 +/*
4264 + * The common case is 1:1 IRQ<->pin mappings. Sometimes there are
4265 + * shared ISA-space IRQs, so we have to support them. We are super
4266 + * fast in the common case, and fast for shared ISA-space IRQs.
4267 + */
4268 +static void add_pin_to_irq(unsigned int irq, int apic, int pin)
4269 +{
4270 + static int first_free_entry = NR_IRQS;
4271 + struct irq_pin_list *entry = irq_2_pin + irq;
4272 +
4273 + while (entry->next)
4274 + entry = irq_2_pin + entry->next;
4275 +
4276 + if (entry->pin != -1) {
4277 + entry->next = first_free_entry;
4278 + entry = irq_2_pin + entry->next;
4279 + if (++first_free_entry >= PIN_MAP_SIZE)
4280 + panic("io_apic.c: whoops");
4281 + }
4282 + entry->apic = apic;
4283 + entry->pin = pin;
4284 +}
4285 +
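irq_2_pin[] above is a linked list packed into a static array: the first NR_IRQS slots act as per-IRQ list heads, and slots handed out from first_free_entry chain further (apic, pin) pairs for IRQs shared between pins. A hedged sketch of a chain walk; the helper is illustrative, and the real code below open-codes the same loop in __modify_IO_APIC_irq() and set_ioapic_affinity_irq():

/* Illustrative only: visit every (apic, pin) currently routed to irq. */
static void example_for_each_pin(unsigned int irq,
				 void (*fn)(int apic, int pin))
{
	struct irq_pin_list *entry = irq_2_pin + irq;

	for (;;) {
		if (entry->pin == -1)		/* no pin recorded here */
			break;
		fn(entry->apic, entry->pin);
		if (!entry->next)		/* index 0 terminates the chain */
			break;
		entry = irq_2_pin + entry->next;
	}
}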
4286 +#ifdef CONFIG_XEN
4287 +#define clear_IO_APIC() ((void)0)
4288 +#else
4289 +/*
4290 + * Reroute an IRQ to a different pin.
4291 + */
4292 +static void __init replace_pin_at_irq(unsigned int irq,
4293 + int oldapic, int oldpin,
4294 + int newapic, int newpin)
4295 +{
4296 + struct irq_pin_list *entry = irq_2_pin + irq;
4297 +
4298 + while (1) {
4299 + if (entry->apic == oldapic && entry->pin == oldpin) {
4300 + entry->apic = newapic;
4301 + entry->pin = newpin;
4302 + }
4303 + if (!entry->next)
4304 + break;
4305 + entry = irq_2_pin + entry->next;
4306 + }
4307 +}
4308 +
4309 +static void __modify_IO_APIC_irq (unsigned int irq, unsigned long enable, unsigned long disable)
4310 +{
4311 + struct irq_pin_list *entry = irq_2_pin + irq;
4312 + unsigned int pin, reg;
4313 +
4314 + for (;;) {
4315 + pin = entry->pin;
4316 + if (pin == -1)
4317 + break;
4318 + reg = io_apic_read(entry->apic, 0x10 + pin*2);
4319 + reg &= ~disable;
4320 + reg |= enable;
4321 + io_apic_modify(entry->apic, 0x10 + pin*2, reg);
4322 + if (!entry->next)
4323 + break;
4324 + entry = irq_2_pin + entry->next;
4325 + }
4326 +}
4327 +
4328 +/* mask = 1 */
4329 +static void __mask_IO_APIC_irq (unsigned int irq)
4330 +{
4331 + __modify_IO_APIC_irq(irq, 0x00010000, 0);
4332 +}
4333 +
4334 +/* mask = 0 */
4335 +static void __unmask_IO_APIC_irq (unsigned int irq)
4336 +{
4337 + __modify_IO_APIC_irq(irq, 0, 0x00010000);
4338 +}
4339 +
4340 +/* mask = 1, trigger = 0 */
4341 +static void __mask_and_edge_IO_APIC_irq (unsigned int irq)
4342 +{
4343 + __modify_IO_APIC_irq(irq, 0x00010000, 0x00008000);
4344 +}
4345 +
4346 +/* mask = 0, trigger = 1 */
4347 +static void __unmask_and_level_IO_APIC_irq (unsigned int irq)
4348 +{
4349 + __modify_IO_APIC_irq(irq, 0x00008000, 0x00010000);
4350 +}
4351 +
4352 +static void mask_IO_APIC_irq (unsigned int irq)
4353 +{
4354 + unsigned long flags;
4355 +
4356 + spin_lock_irqsave(&ioapic_lock, flags);
4357 + __mask_IO_APIC_irq(irq);
4358 + spin_unlock_irqrestore(&ioapic_lock, flags);
4359 +}
4360 +
4361 +static void unmask_IO_APIC_irq (unsigned int irq)
4362 +{
4363 + unsigned long flags;
4364 +
4365 + spin_lock_irqsave(&ioapic_lock, flags);
4366 + __unmask_IO_APIC_irq(irq);
4367 + spin_unlock_irqrestore(&ioapic_lock, flags);
4368 +}
4369 +
4370 +static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
4371 +{
4372 + struct IO_APIC_route_entry entry;
4373 + unsigned long flags;
4374 +
4375 + /* Check delivery_mode to be sure we're not clearing an SMI pin */
4376 + spin_lock_irqsave(&ioapic_lock, flags);
4377 + *(((int*)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin);
4378 + *(((int*)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin);
4379 + spin_unlock_irqrestore(&ioapic_lock, flags);
4380 + if (entry.delivery_mode == dest_SMI)
4381 + return;
4382 +
4383 + /*
4384 + * Disable it in the IO-APIC irq-routing table:
4385 + */
4386 + memset(&entry, 0, sizeof(entry));
4387 + entry.mask = 1;
4388 + spin_lock_irqsave(&ioapic_lock, flags);
4389 + io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry) + 0));
4390 + io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry) + 1));
4391 + spin_unlock_irqrestore(&ioapic_lock, flags);
4392 +}
4393 +
4394 +static void clear_IO_APIC (void)
4395 +{
4396 + int apic, pin;
4397 +
4398 + for (apic = 0; apic < nr_ioapics; apic++)
4399 + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++)
4400 + clear_IO_APIC_pin(apic, pin);
4401 +}
4402 +
4403 +#ifdef CONFIG_SMP
4404 +static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask)
4405 +{
4406 + unsigned long flags;
4407 + int pin;
4408 + struct irq_pin_list *entry = irq_2_pin + irq;
4409 + unsigned int apicid_value;
4410 + cpumask_t tmp;
4411 +
4412 + cpus_and(tmp, cpumask, cpu_online_map);
4413 + if (cpus_empty(tmp))
4414 + tmp = TARGET_CPUS;
4415 +
4416 + cpus_and(cpumask, tmp, CPU_MASK_ALL);
4417 +
4418 + apicid_value = cpu_mask_to_apicid(cpumask);
4419 + /* Prepare to do the io_apic_write */
4420 + apicid_value = apicid_value << 24;
4421 + spin_lock_irqsave(&ioapic_lock, flags);
4422 + for (;;) {
4423 + pin = entry->pin;
4424 + if (pin == -1)
4425 + break;
4426 + io_apic_write(entry->apic, 0x10 + 1 + pin*2, apicid_value);
4427 + if (!entry->next)
4428 + break;
4429 + entry = irq_2_pin + entry->next;
4430 + }
4431 + set_irq_info(irq, cpumask);
4432 + spin_unlock_irqrestore(&ioapic_lock, flags);
4433 +}
4434 +
4435 +#if defined(CONFIG_IRQBALANCE)
4436 +# include <asm/processor.h> /* kernel_thread() */
4437 +# include <linux/kernel_stat.h> /* kstat */
4438 +# include <linux/slab.h> /* kmalloc() */
4439 +# include <linux/timer.h> /* time_after() */
4440 +
4441 +# ifdef CONFIG_BALANCED_IRQ_DEBUG
4442 +# define TDprintk(x...) do { printk("<%ld:%s:%d>: ", jiffies, __FILE__, __LINE__); printk(x); } while (0)
4443 +# define Dprintk(x...) do { TDprintk(x); } while (0)
4444 +# else
4445 +# define TDprintk(x...)
4446 +# define Dprintk(x...)
4447 +# endif
4448 +
4449 +
4450 +#define IRQBALANCE_CHECK_ARCH -999
4451 +static int irqbalance_disabled = IRQBALANCE_CHECK_ARCH;
4452 +static int physical_balance = 0;
4453 +
4454 +static struct irq_cpu_info {
4455 + unsigned long * last_irq;
4456 + unsigned long * irq_delta;
4457 + unsigned long irq;
4458 +} irq_cpu_data[NR_CPUS];
4459 +
4460 +#define CPU_IRQ(cpu) (irq_cpu_data[cpu].irq)
4461 +#define LAST_CPU_IRQ(cpu,irq) (irq_cpu_data[cpu].last_irq[irq])
4462 +#define IRQ_DELTA(cpu,irq) (irq_cpu_data[cpu].irq_delta[irq])
4463 +
4464 +#define IDLE_ENOUGH(cpu,now) \
4465 + (idle_cpu(cpu) && ((now) - per_cpu(irq_stat, (cpu)).idle_timestamp > 1))
4466 +
4467 +#define IRQ_ALLOWED(cpu, allowed_mask) cpu_isset(cpu, allowed_mask)
4468 +
4469 +#define CPU_TO_PACKAGEINDEX(i) (first_cpu(cpu_sibling_map[i]))
4470 +
4471 +#define MAX_BALANCED_IRQ_INTERVAL (5*HZ)
4472 +#define MIN_BALANCED_IRQ_INTERVAL (HZ/2)
4473 +#define BALANCED_IRQ_MORE_DELTA (HZ/10)
4474 +#define BALANCED_IRQ_LESS_DELTA (HZ)
4475 +
4476 +static long balanced_irq_interval = MAX_BALANCED_IRQ_INTERVAL;
4477 +
4478 +static unsigned long move(int curr_cpu, cpumask_t allowed_mask,
4479 + unsigned long now, int direction)
4480 +{
4481 + int search_idle = 1;
4482 + int cpu = curr_cpu;
4483 +
4484 + goto inside;
4485 +
4486 + do {
4487 + if (unlikely(cpu == curr_cpu))
4488 + search_idle = 0;
4489 +inside:
4490 + if (direction == 1) {
4491 + cpu++;
4492 + if (cpu >= NR_CPUS)
4493 + cpu = 0;
4494 + } else {
4495 + cpu--;
4496 + if (cpu == -1)
4497 + cpu = NR_CPUS-1;
4498 + }
4499 + } while (!cpu_online(cpu) || !IRQ_ALLOWED(cpu,allowed_mask) ||
4500 + (search_idle && !IDLE_ENOUGH(cpu,now)));
4501 +
4502 + return cpu;
4503 +}
4504 +
4505 +static inline void balance_irq(int cpu, int irq)
4506 +{
4507 + unsigned long now = jiffies;
4508 + cpumask_t allowed_mask;
4509 + unsigned int new_cpu;
4510 +
4511 + if (irqbalance_disabled)
4512 + return;
4513 +
4514 + cpus_and(allowed_mask, cpu_online_map, irq_affinity[irq]);
4515 + new_cpu = move(cpu, allowed_mask, now, 1);
4516 + if (cpu != new_cpu) {
4517 + set_pending_irq(irq, cpumask_of_cpu(new_cpu));
4518 + }
4519 +}
4520 +
4521 +static inline void rotate_irqs_among_cpus(unsigned long useful_load_threshold)
4522 +{
4523 + int i, j;
4524 + Dprintk("Rotating IRQs among CPUs.\n");
4525 + for (i = 0; i < NR_CPUS; i++) {
4526 + for (j = 0; cpu_online(i) && (j < NR_IRQS); j++) {
4527 + if (!irq_desc[j].action)
4528 + continue;
4529 + /* Is it a significant load ? */
4530 + if (IRQ_DELTA(CPU_TO_PACKAGEINDEX(i),j) <
4531 + useful_load_threshold)
4532 + continue;
4533 + balance_irq(i, j);
4534 + }
4535 + }
4536 + balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL,
4537 + balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);
4538 + return;
4539 +}
4540 +
4541 +static void do_irq_balance(void)
4542 +{
4543 + int i, j;
4544 + unsigned long max_cpu_irq = 0, min_cpu_irq = (~0);
4545 + unsigned long move_this_load = 0;
4546 + int max_loaded = 0, min_loaded = 0;
4547 + int load;
4548 + unsigned long useful_load_threshold = balanced_irq_interval + 10;
4549 + int selected_irq;
4550 + int tmp_loaded, first_attempt = 1;
4551 + unsigned long tmp_cpu_irq;
4552 + unsigned long imbalance = 0;
4553 + cpumask_t allowed_mask, target_cpu_mask, tmp;
4554 +
4555 + for (i = 0; i < NR_CPUS; i++) {
4556 + int package_index;
4557 + CPU_IRQ(i) = 0;
4558 + if (!cpu_online(i))
4559 + continue;
4560 + package_index = CPU_TO_PACKAGEINDEX(i);
4561 + for (j = 0; j < NR_IRQS; j++) {
4562 + unsigned long value_now, delta;
4563 + /* Is this an active IRQ? */
4564 + if (!irq_desc[j].action)
4565 + continue;
4566 + if ( package_index == i )
4567 + IRQ_DELTA(package_index,j) = 0;
4568 + /* Determine the total count per processor per IRQ */
4569 + value_now = (unsigned long) kstat_cpu(i).irqs[j];
4570 +
4571 + /* Determine the activity per processor per IRQ */
4572 + delta = value_now - LAST_CPU_IRQ(i,j);
4573 +
4574 + /* Update last_cpu_irq[][] for the next time */
4575 + LAST_CPU_IRQ(i,j) = value_now;
4576 +
4577 + /* Ignore IRQs whose rate is less than the clock */
4578 + if (delta < useful_load_threshold)
4579 + continue;
4580 + /* update the load for the processor or package total */
4581 + IRQ_DELTA(package_index,j) += delta;
4582 +
4583 + /* Keep track of the higher numbered sibling as well */
4584 + if (i != package_index)
4585 + CPU_IRQ(i) += delta;
4586 + /*
4587 + * We have sibling A and sibling B in the package
4588 + *
4589 + * cpu_irq[A] = load for cpu A + load for cpu B
4590 + * cpu_irq[B] = load for cpu B
4591 + */
4592 + CPU_IRQ(package_index) += delta;
4593 + }
4594 + }
4595 + /* Find the least loaded processor package */
4596 + for (i = 0; i < NR_CPUS; i++) {
4597 + if (!cpu_online(i))
4598 + continue;
4599 + if (i != CPU_TO_PACKAGEINDEX(i))
4600 + continue;
4601 + if (min_cpu_irq > CPU_IRQ(i)) {
4602 + min_cpu_irq = CPU_IRQ(i);
4603 + min_loaded = i;
4604 + }
4605 + }
4606 + max_cpu_irq = ULONG_MAX;
4607 +
4608 +tryanothercpu:
4609 + /* Look for heaviest loaded processor.
4610 + * We may come back to get the next heaviest loaded processor.
4611 + * Skip processors with trivial loads.
4612 + */
4613 + tmp_cpu_irq = 0;
4614 + tmp_loaded = -1;
4615 + for (i = 0; i < NR_CPUS; i++) {
4616 + if (!cpu_online(i))
4617 + continue;
4618 + if (i != CPU_TO_PACKAGEINDEX(i))
4619 + continue;
4620 + if (max_cpu_irq <= CPU_IRQ(i))
4621 + continue;
4622 + if (tmp_cpu_irq < CPU_IRQ(i)) {
4623 + tmp_cpu_irq = CPU_IRQ(i);
4624 + tmp_loaded = i;
4625 + }
4626 + }
4627 +
4628 + if (tmp_loaded == -1) {
4629 + /* In the case of a small number of heavy interrupt sources,
4630 + * some cpus can end up loaded too much. We use Ingo's original
4631 + * approach to rotate them around.
4632 + */
4633 + if (!first_attempt && imbalance >= useful_load_threshold) {
4634 + rotate_irqs_among_cpus(useful_load_threshold);
4635 + return;
4636 + }
4637 + goto not_worth_the_effort;
4638 + }
4639 +
4640 + first_attempt = 0; /* heaviest search */
4641 + max_cpu_irq = tmp_cpu_irq; /* load */
4642 + max_loaded = tmp_loaded; /* processor */
4643 + imbalance = (max_cpu_irq - min_cpu_irq) / 2;
4644 +
4645 + Dprintk("max_loaded cpu = %d\n", max_loaded);
4646 + Dprintk("min_loaded cpu = %d\n", min_loaded);
4647 + Dprintk("max_cpu_irq load = %ld\n", max_cpu_irq);
4648 + Dprintk("min_cpu_irq load = %ld\n", min_cpu_irq);
4649 + Dprintk("load imbalance = %lu\n", imbalance);
4650 +
4651 + /* if imbalance is less than approx 10% of max load, then
4652 + * further balancing yields diminishing returns - quit
4653 + */
4654 + if (imbalance < (max_cpu_irq >> 3)) {
4655 + Dprintk("Imbalance too trivial\n");
4656 + goto not_worth_the_effort;
4657 + }
4658 +
4659 +tryanotherirq:
4660 + /* if we select an IRQ to move that can't go where we want, then
4661 + * see if there is another one to try.
4662 + */
4663 + move_this_load = 0;
4664 + selected_irq = -1;
4665 + for (j = 0; j < NR_IRQS; j++) {
4666 + /* Is this an active IRQ? */
4667 + if (!irq_desc[j].action)
4668 + continue;
4669 + if (imbalance <= IRQ_DELTA(max_loaded,j))
4670 + continue;
4671 + /* Try to find the IRQ that is closest to the imbalance
4672 + * without going over.
4673 + */
4674 + if (move_this_load < IRQ_DELTA(max_loaded,j)) {
4675 + move_this_load = IRQ_DELTA(max_loaded,j);
4676 + selected_irq = j;
4677 + }
4678 + }
4679 + if (selected_irq == -1) {
4680 + goto tryanothercpu;
4681 + }
4682 +
4683 + imbalance = move_this_load;
4684 +
4685 + /* For the physical_balance case, we accumulated both load
4686 + * values in one of the siblings' cpu_irq[],
4687 + * to use the same code for physical and logical processors
4688 + * as much as possible.
4689 + *
4690 + * NOTE: the cpu_irq[] array holds the sum of the load for
4691 + * sibling A and sibling B in the slot for the lowest numbered
4692 + * sibling (A), _AND_ the load for sibling B in the slot for
4693 + * the higher numbered sibling.
4694 + *
4695 + * We seek the least loaded sibling by making the comparison
4696 + * (A+B)/2 vs B
4697 + */
4698 + load = CPU_IRQ(min_loaded) >> 1;
4699 + for_each_cpu_mask(j, cpu_sibling_map[min_loaded]) {
4700 + if (load > CPU_IRQ(j)) {
4701 + /* This won't change cpu_sibling_map[min_loaded] */
4702 + load = CPU_IRQ(j);
4703 + min_loaded = j;
4704 + }
4705 + }
4706 +
4707 + cpus_and(allowed_mask, cpu_online_map, irq_affinity[selected_irq]);
4708 + target_cpu_mask = cpumask_of_cpu(min_loaded);
4709 + cpus_and(tmp, target_cpu_mask, allowed_mask);
4710 +
4711 + if (!cpus_empty(tmp)) {
4712 +
4713 + Dprintk("irq = %d moved to cpu = %d\n",
4714 + selected_irq, min_loaded);
4715 + /* mark for change destination */
4716 + set_pending_irq(selected_irq, cpumask_of_cpu(min_loaded));
4717 +
4718 + /* Since we made a change, come back sooner to
4719 + * check for more variation.
4720 + */
4721 + balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL,
4722 + balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);
4723 + return;
4724 + }
4725 + goto tryanotherirq;
4726 +
4727 +not_worth_the_effort:
4728 + /*
4729 + * if we did not find an IRQ to move, then adjust the time interval
4730 + * upward
4731 + */
4732 + balanced_irq_interval = min((long)MAX_BALANCED_IRQ_INTERVAL,
4733 + balanced_irq_interval + BALANCED_IRQ_MORE_DELTA);
4734 + Dprintk("IRQ worth rotating not found\n");
4735 + return;
4736 +}
4737 +
4738 +static int balanced_irq(void *unused)
4739 +{
4740 + int i;
4741 + unsigned long prev_balance_time = jiffies;
4742 + long time_remaining = balanced_irq_interval;
4743 +
4744 + daemonize("kirqd");
4745 +
4746 + /* push everything to CPU 0 to give us a starting point. */
4747 + for (i = 0 ; i < NR_IRQS ; i++) {
4748 + pending_irq_cpumask[i] = cpumask_of_cpu(0);
4749 + set_pending_irq(i, cpumask_of_cpu(0));
4750 + }
4751 +
4752 + for ( ; ; ) {
4753 + time_remaining = schedule_timeout_interruptible(time_remaining);
4754 + try_to_freeze();
4755 + if (time_after(jiffies,
4756 + prev_balance_time+balanced_irq_interval)) {
4757 + preempt_disable();
4758 + do_irq_balance();
4759 + prev_balance_time = jiffies;
4760 + time_remaining = balanced_irq_interval;
4761 + preempt_enable();
4762 + }
4763 + }
4764 + return 0;
4765 +}
4766 +
4767 +static int __init balanced_irq_init(void)
4768 +{
4769 + int i;
4770 + struct cpuinfo_x86 *c;
4771 + cpumask_t tmp;
4772 +
4773 + cpus_shift_right(tmp, cpu_online_map, 2);
4774 + c = &boot_cpu_data;
4775 + /* When not overridden by the command line, ask the subarchitecture. */
4776 + if (irqbalance_disabled == IRQBALANCE_CHECK_ARCH)
4777 + irqbalance_disabled = NO_BALANCE_IRQ;
4778 + if (irqbalance_disabled)
4779 + return 0;
4780 +
4781 + /* disable irqbalance completely if there is only one processor online */
4782 + if (num_online_cpus() < 2) {
4783 + irqbalance_disabled = 1;
4784 + return 0;
4785 + }
4786 + /*
4787 + * Enable physical balance only if more than 1 physical processor
4788 + * is present
4789 + */
4790 + if (smp_num_siblings > 1 && !cpus_empty(tmp))
4791 + physical_balance = 1;
4792 +
4793 + for (i = 0; i < NR_CPUS; i++) {
4794 + if (!cpu_online(i))
4795 + continue;
4796 + irq_cpu_data[i].irq_delta = kmalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
4797 + irq_cpu_data[i].last_irq = kmalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
4798 + if (irq_cpu_data[i].irq_delta == NULL || irq_cpu_data[i].last_irq == NULL) {
4799 + printk(KERN_ERR "balanced_irq_init: out of memory");
4800 + goto failed;
4801 + }
4802 + memset(irq_cpu_data[i].irq_delta,0,sizeof(unsigned long) * NR_IRQS);
4803 + memset(irq_cpu_data[i].last_irq,0,sizeof(unsigned long) * NR_IRQS);
4804 + }
4805 +
4806 + printk(KERN_INFO "Starting balanced_irq\n");
4807 + if (kernel_thread(balanced_irq, NULL, CLONE_KERNEL) >= 0)
4808 + return 0;
4809 + else
4810 + printk(KERN_ERR "balanced_irq_init: failed to spawn balanced_irq");
4811 +failed:
4812 + for (i = 0; i < NR_CPUS; i++) {
4813 + kfree(irq_cpu_data[i].irq_delta);
4814 + kfree(irq_cpu_data[i].last_irq);
4815 + }
4816 + return 0;
4817 +}
4818 +
4819 +int __init irqbalance_disable(char *str)
4820 +{
4821 + irqbalance_disabled = 1;
4822 + return 0;
4823 +}
4824 +
4825 +__setup("noirqbalance", irqbalance_disable);
4826 +
4827 +late_initcall(balanced_irq_init);
4828 +#endif /* CONFIG_IRQBALANCE */
4829 +#endif /* CONFIG_SMP */
4830 +#endif
4831 +
4832 +#ifndef CONFIG_SMP
4833 +void fastcall send_IPI_self(int vector)
4834 +{
4835 +#ifndef CONFIG_XEN
4836 + unsigned int cfg;
4837 +
4838 + /*
4839 + * Wait for idle.
4840 + */
4841 + apic_wait_icr_idle();
4842 + cfg = APIC_DM_FIXED | APIC_DEST_SELF | vector | APIC_DEST_LOGICAL;
4843 + /*
4844 + * Send the IPI. The write to APIC_ICR fires this off.
4845 + */
4846 + apic_write_around(APIC_ICR, cfg);
4847 +#endif
4848 +}
4849 +#endif /* !CONFIG_SMP */
4850 +
4851 +
4852 +/*
4853 + * Support for broken MP BIOSes: enables hand-redirection of PIRQ0-7 to
4854 + * specific CPU-side IRQs.
4855 + */
4856 +
4857 +#define MAX_PIRQS 8
4858 +static int pirq_entries [MAX_PIRQS];
4859 +static int pirqs_enabled;
4860 +int skip_ioapic_setup;
4861 +
4862 +static int __init ioapic_setup(char *str)
4863 +{
4864 + skip_ioapic_setup = 1;
4865 + return 1;
4866 +}
4867 +
4868 +__setup("noapic", ioapic_setup);
4869 +
4870 +static int __init ioapic_pirq_setup(char *str)
4871 +{
4872 + int i, max;
4873 + int ints[MAX_PIRQS+1];
4874 +
4875 + get_options(str, ARRAY_SIZE(ints), ints);
4876 +
4877 + for (i = 0; i < MAX_PIRQS; i++)
4878 + pirq_entries[i] = -1;
4879 +
4880 + pirqs_enabled = 1;
4881 + apic_printk(APIC_VERBOSE, KERN_INFO
4882 + "PIRQ redirection, working around broken MP-BIOS.\n");
4883 + max = MAX_PIRQS;
4884 + if (ints[0] < MAX_PIRQS)
4885 + max = ints[0];
4886 +
4887 + for (i = 0; i < max; i++) {
4888 + apic_printk(APIC_VERBOSE, KERN_DEBUG
4889 + "... PIRQ%d -> IRQ %d\n", i, ints[i+1]);
4890 + /*
4891 + * PIRQs are mapped upside down, usually.
4892 + */
4893 + pirq_entries[MAX_PIRQS-i-1] = ints[i+1];
4894 + }
4895 + return 1;
4896 +}
4897 +
4898 +__setup("pirq=", ioapic_pirq_setup);
4899 +
4900 +/*
4901 + * Find the IRQ entry number of a certain pin.
4902 + */
4903 +static int find_irq_entry(int apic, int pin, int type)
4904 +{
4905 + int i;
4906 +
4907 + for (i = 0; i < mp_irq_entries; i++)
4908 + if (mp_irqs[i].mpc_irqtype == type &&
4909 + (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mpc_apicid ||
4910 + mp_irqs[i].mpc_dstapic == MP_APIC_ALL) &&
4911 + mp_irqs[i].mpc_dstirq == pin)
4912 + return i;
4913 +
4914 + return -1;
4915 +}
4916 +
4917 +/*
4918 + * Find the pin to which IRQ[irq] (ISA) is connected
4919 + */
4920 +static int __init find_isa_irq_pin(int irq, int type)
4921 +{
4922 + int i;
4923 +
4924 + for (i = 0; i < mp_irq_entries; i++) {
4925 + int lbus = mp_irqs[i].mpc_srcbus;
4926 +
4927 + if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA ||
4928 + mp_bus_id_to_type[lbus] == MP_BUS_EISA ||
4929 + mp_bus_id_to_type[lbus] == MP_BUS_MCA ||
4930 + mp_bus_id_to_type[lbus] == MP_BUS_NEC98
4931 + ) &&
4932 + (mp_irqs[i].mpc_irqtype == type) &&
4933 + (mp_irqs[i].mpc_srcbusirq == irq))
4934 +
4935 + return mp_irqs[i].mpc_dstirq;
4936 + }
4937 + return -1;
4938 +}
4939 +
4940 +static int __init find_isa_irq_apic(int irq, int type)
4941 +{
4942 + int i;
4943 +
4944 + for (i = 0; i < mp_irq_entries; i++) {
4945 + int lbus = mp_irqs[i].mpc_srcbus;
4946 +
4947 + if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA ||
4948 + mp_bus_id_to_type[lbus] == MP_BUS_EISA ||
4949 + mp_bus_id_to_type[lbus] == MP_BUS_MCA ||
4950 + mp_bus_id_to_type[lbus] == MP_BUS_NEC98
4951 + ) &&
4952 + (mp_irqs[i].mpc_irqtype == type) &&
4953 + (mp_irqs[i].mpc_srcbusirq == irq))
4954 + break;
4955 + }
4956 + if (i < mp_irq_entries) {
4957 + int apic;
4958 + for(apic = 0; apic < nr_ioapics; apic++) {
4959 + if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic)
4960 + return apic;
4961 + }
4962 + }
4963 +
4964 + return -1;
4965 +}
4966 +
4967 +/*
4968 + * Find a specific PCI IRQ entry.
4969 + * Not an __init, possibly needed by modules
4970 + */
4971 +static int pin_2_irq(int idx, int apic, int pin);
4972 +
4973 +int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
4974 +{
4975 + int apic, i, best_guess = -1;
4976 +
4977 + apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, "
4978 + "slot:%d, pin:%d.\n", bus, slot, pin);
4979 + if (mp_bus_id_to_pci_bus[bus] == -1) {
4980 + printk(KERN_WARNING "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
4981 + return -1;
4982 + }
4983 + for (i = 0; i < mp_irq_entries; i++) {
4984 + int lbus = mp_irqs[i].mpc_srcbus;
4985 +
4986 + for (apic = 0; apic < nr_ioapics; apic++)
4987 + if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic ||
4988 + mp_irqs[i].mpc_dstapic == MP_APIC_ALL)
4989 + break;
4990 +
4991 + if ((mp_bus_id_to_type[lbus] == MP_BUS_PCI) &&
4992 + !mp_irqs[i].mpc_irqtype &&
4993 + (bus == lbus) &&
4994 + (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) {
4995 + int irq = pin_2_irq(i,apic,mp_irqs[i].mpc_dstirq);
4996 +
4997 + if (!(apic || IO_APIC_IRQ(irq)))
4998 + continue;
4999 +
5000 + if (pin == (mp_irqs[i].mpc_srcbusirq & 3))
5001 + return irq;
5002 + /*
5003 + * Use the first all-but-pin matching entry as a
5004 + * best-guess fuzzy result for broken mptables.
5005 + */
5006 + if (best_guess < 0)
5007 + best_guess = irq;
5008 + }
5009 + }
5010 + return best_guess;
5011 +}
5012 +EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector);
5013 +
5014 +/*
5015 + * This function currently is only a helper for the i386 smp boot process, where
5016 + * we need to reprogram the ioredtbls to cater for the cpus which have come online,
5017 + * so the mask in all cases should simply be TARGET_CPUS
5018 + */
5019 +#ifdef CONFIG_SMP
5020 +#ifndef CONFIG_XEN
5021 +void __init setup_ioapic_dest(void)
5022 +{
5023 + int pin, ioapic, irq, irq_entry;
5024 +
5025 + if (skip_ioapic_setup == 1)
5026 + return;
5027 +
5028 + for (ioapic = 0; ioapic < nr_ioapics; ioapic++) {
5029 + for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) {
5030 + irq_entry = find_irq_entry(ioapic, pin, mp_INT);
5031 + if (irq_entry == -1)
5032 + continue;
5033 + irq = pin_2_irq(irq_entry, ioapic, pin);
5034 + set_ioapic_affinity_irq(irq, TARGET_CPUS);
5035 + }
5036 +
5037 + }
5038 +}
5039 +#endif /* !CONFIG_XEN */
5040 +#endif
5041 +
5042 +/*
5043 + * EISA Edge/Level control register, ELCR
5044 + */
5045 +static int EISA_ELCR(unsigned int irq)
5046 +{
5047 + if (irq < 16) {
5048 + unsigned int port = 0x4d0 + (irq >> 3);
5049 + return (inb(port) >> (irq & 7)) & 1;
5050 + }
5051 + apic_printk(APIC_VERBOSE, KERN_INFO
5052 + "Broken MPtable reports ISA irq %d\n", irq);
5053 + return 0;
5054 +}
5055 +
5056 +/* EISA interrupts are always polarity zero and can be edge or level
5057 + * trigger depending on the ELCR value. If an interrupt is listed as
5058 + * EISA conforming in the MP table, that means its trigger type must
5059 + * be read in from the ELCR */
5060 +
5061 +#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mpc_srcbusirq))
5062 +#define default_EISA_polarity(idx) (0)
5063 +
5064 +/* ISA interrupts are always polarity zero edge triggered,
5065 + * when listed as conforming in the MP table. */
5066 +
5067 +#define default_ISA_trigger(idx) (0)
5068 +#define default_ISA_polarity(idx) (0)
5069 +
5070 +/* PCI interrupts are always polarity one level triggered,
5071 + * when listed as conforming in the MP table. */
5072 +
5073 +#define default_PCI_trigger(idx) (1)
5074 +#define default_PCI_polarity(idx) (1)
5075 +
5076 +/* MCA interrupts are always polarity zero level triggered,
5077 + * when listed as conforming in the MP table. */
5078 +
5079 +#define default_MCA_trigger(idx) (1)
5080 +#define default_MCA_polarity(idx) (0)
5081 +
5082 +/* NEC98 interrupts are always polarity zero edge triggered,
5083 + * when listed as conforming in the MP table. */
5084 +
5085 +#define default_NEC98_trigger(idx) (0)
5086 +#define default_NEC98_polarity(idx) (0)
5087 +
5088 +static int __init MPBIOS_polarity(int idx)
5089 +{
5090 + int bus = mp_irqs[idx].mpc_srcbus;
5091 + int polarity;
5092 +
5093 + /*
5094 + * Determine IRQ line polarity (high active or low active):
5095 + */
5096 + switch (mp_irqs[idx].mpc_irqflag & 3)
5097 + {
5098 + case 0: /* conforms, ie. bus-type dependent polarity */
5099 + {
5100 + switch (mp_bus_id_to_type[bus])
5101 + {
5102 + case MP_BUS_ISA: /* ISA pin */
5103 + {
5104 + polarity = default_ISA_polarity(idx);
5105 + break;
5106 + }
5107 + case MP_BUS_EISA: /* EISA pin */
5108 + {
5109 + polarity = default_EISA_polarity(idx);
5110 + break;
5111 + }
5112 + case MP_BUS_PCI: /* PCI pin */
5113 + {
5114 + polarity = default_PCI_polarity(idx);
5115 + break;
5116 + }
5117 + case MP_BUS_MCA: /* MCA pin */
5118 + {
5119 + polarity = default_MCA_polarity(idx);
5120 + break;
5121 + }
5122 + case MP_BUS_NEC98: /* NEC 98 pin */
5123 + {
5124 + polarity = default_NEC98_polarity(idx);
5125 + break;
5126 + }
5127 + default:
5128 + {
5129 + printk(KERN_WARNING "broken BIOS!!\n");
5130 + polarity = 1;
5131 + break;
5132 + }
5133 + }
5134 + break;
5135 + }
5136 + case 1: /* high active */
5137 + {
5138 + polarity = 0;
5139 + break;
5140 + }
5141 + case 2: /* reserved */
5142 + {
5143 + printk(KERN_WARNING "broken BIOS!!\n");
5144 + polarity = 1;
5145 + break;
5146 + }
5147 + case 3: /* low active */
5148 + {
5149 + polarity = 1;
5150 + break;
5151 + }
5152 + default: /* invalid */
5153 + {
5154 + printk(KERN_WARNING "broken BIOS!!\n");
5155 + polarity = 1;
5156 + break;
5157 + }
5158 + }
5159 + return polarity;
5160 +}
5161 +
5162 +static int MPBIOS_trigger(int idx)
5163 +{
5164 + int bus = mp_irqs[idx].mpc_srcbus;
5165 + int trigger;
5166 +
5167 + /*
5168 + * Determine IRQ trigger mode (edge or level sensitive):
5169 + */
5170 + switch ((mp_irqs[idx].mpc_irqflag>>2) & 3)
5171 + {
5172 + case 0: /* conforms, ie. bus-type dependent */
5173 + {
5174 + switch (mp_bus_id_to_type[bus])
5175 + {
5176 + case MP_BUS_ISA: /* ISA pin */
5177 + {
5178 + trigger = default_ISA_trigger(idx);
5179 + break;
5180 + }
5181 + case MP_BUS_EISA: /* EISA pin */
5182 + {
5183 + trigger = default_EISA_trigger(idx);
5184 + break;
5185 + }
5186 + case MP_BUS_PCI: /* PCI pin */
5187 + {
5188 + trigger = default_PCI_trigger(idx);
5189 + break;
5190 + }
5191 + case MP_BUS_MCA: /* MCA pin */
5192 + {
5193 + trigger = default_MCA_trigger(idx);
5194 + break;
5195 + }
5196 + case MP_BUS_NEC98: /* NEC 98 pin */
5197 + {
5198 + trigger = default_NEC98_trigger(idx);
5199 + break;
5200 + }
5201 + default:
5202 + {
5203 + printk(KERN_WARNING "broken BIOS!!\n");
5204 + trigger = 1;
5205 + break;
5206 + }
5207 + }
5208 + break;
5209 + }
5210 + case 1: /* edge */
5211 + {
5212 + trigger = 0;
5213 + break;
5214 + }
5215 + case 2: /* reserved */
5216 + {
5217 + printk(KERN_WARNING "broken BIOS!!\n");
5218 + trigger = 1;
5219 + break;
5220 + }
5221 + case 3: /* level */
5222 + {
5223 + trigger = 1;
5224 + break;
5225 + }
5226 + default: /* invalid */
5227 + {
5228 + printk(KERN_WARNING "broken BIOS!!\n");
5229 + trigger = 0;
5230 + break;
5231 + }
5232 + }
5233 + return trigger;
5234 +}
5235 +
5236 +static inline int irq_polarity(int idx)
5237 +{
5238 + return MPBIOS_polarity(idx);
5239 +}
5240 +
5241 +static inline int irq_trigger(int idx)
5242 +{
5243 + return MPBIOS_trigger(idx);
5244 +}
5245 +
5246 +static int pin_2_irq(int idx, int apic, int pin)
5247 +{
5248 + int irq, i;
5249 + int bus = mp_irqs[idx].mpc_srcbus;
5250 +
5251 + /*
5252 + * Debugging check, we are in big trouble if this message pops up!
5253 + */
5254 + if (mp_irqs[idx].mpc_dstirq != pin)
5255 + printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n");
5256 +
5257 + switch (mp_bus_id_to_type[bus])
5258 + {
5259 + case MP_BUS_ISA: /* ISA pin */
5260 + case MP_BUS_EISA:
5261 + case MP_BUS_MCA:
5262 + case MP_BUS_NEC98:
5263 + {
5264 + irq = mp_irqs[idx].mpc_srcbusirq;
5265 + break;
5266 + }
5267 + case MP_BUS_PCI: /* PCI pin */
5268 + {
5269 + /*
5270 + * PCI IRQs are mapped in order
5271 + */
5272 + i = irq = 0;
5273 + while (i < apic)
5274 + irq += nr_ioapic_registers[i++];
5275 + irq += pin;
5276 +
5277 + /*
5278 + * For MPS mode, so far only needed by ES7000 platform
5279 + */
5280 + if (ioapic_renumber_irq)
5281 + irq = ioapic_renumber_irq(apic, irq);
5282 +
5283 + break;
5284 + }
5285 + default:
5286 + {
5287 + printk(KERN_ERR "unknown bus type %d.\n",bus);
5288 + irq = 0;
5289 + break;
5290 + }
5291 + }
5292 +
5293 + /*
5294 + * PCI IRQ command line redirection. Yes, limits are hardcoded.
5295 + */
5296 + if ((pin >= 16) && (pin <= 23)) {
5297 + if (pirq_entries[pin-16] != -1) {
5298 + if (!pirq_entries[pin-16]) {
5299 + apic_printk(APIC_VERBOSE, KERN_DEBUG
5300 + "disabling PIRQ%d\n", pin-16);
5301 + } else {
5302 + irq = pirq_entries[pin-16];
5303 + apic_printk(APIC_VERBOSE, KERN_DEBUG
5304 + "using PIRQ%d -> IRQ %d\n",
5305 + pin-16, irq);
5306 + }
5307 + }
5308 + }
5309 + return irq;
5310 +}
5311 +
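For PCI buses, pin_2_irq() above simply concatenates the redirection tables of all IO-APICs: the resulting IRQ is the pin index plus the register counts of all lower-numbered IO-APICs, before the optional ES7000 renumbering hook and the PIRQ command-line override are applied. A reduced sketch of just that arithmetic, under an assumed layout of two 24-register IO-APICs (the numbers and helper name are illustrative, not taken from any particular board):

/* Illustrative only: with nr_ioapic_registers[] = {24, 24},
 * apic 0 / pin 5 maps to irq 5 and apic 1 / pin 3 maps to irq 27. */
static int example_pci_pin_to_irq(int apic, int pin)
{
	int i, irq = pin;

	for (i = 0; i < apic; i++)
		irq += nr_ioapic_registers[i];	/* skip earlier IO-APICs */
	return irq;
}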
5312 +static inline int IO_APIC_irq_trigger(int irq)
5313 +{
5314 + int apic, idx, pin;
5315 +
5316 + for (apic = 0; apic < nr_ioapics; apic++) {
5317 + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
5318 + idx = find_irq_entry(apic,pin,mp_INT);
5319 + if ((idx != -1) && (irq == pin_2_irq(idx,apic,pin)))
5320 + return irq_trigger(idx);
5321 + }
5322 + }
5323 + /*
5324 + * nonexistent IRQs are edge default
5325 + */
5326 + return 0;
5327 +}
5328 +
5329 +/* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. */
5330 +u8 irq_vector[NR_IRQ_VECTORS] __read_mostly; /* = { FIRST_DEVICE_VECTOR , 0 }; */
5331 +
5332 +int assign_irq_vector(int irq)
5333 +{
5334 + struct physdev_irq irq_op;
5335 +
5336 + BUG_ON(irq >= NR_IRQ_VECTORS);
5337 + if (irq != AUTO_ASSIGN && IO_APIC_VECTOR(irq) > 0)
5338 + return IO_APIC_VECTOR(irq);
5339 +
5340 + irq_op.irq = irq;
5341 + if (HYPERVISOR_physdev_op(PHYSDEVOP_alloc_irq_vector, &irq_op))
5342 + return -ENOSPC;
5343 +
5344 + vector_irq[irq_op.vector] = irq;
5345 + if (irq != AUTO_ASSIGN)
5346 + IO_APIC_VECTOR(irq) = irq_op.vector;
5347 +
5348 + return irq_op.vector;
5349 +}
5350 +
5351 +#ifndef CONFIG_XEN
5352 +static struct hw_interrupt_type ioapic_level_type;
5353 +static struct hw_interrupt_type ioapic_edge_type;
5354 +
5355 +#define IOAPIC_AUTO -1
5356 +#define IOAPIC_EDGE 0
5357 +#define IOAPIC_LEVEL 1
5358 +
5359 +static inline void ioapic_register_intr(int irq, int vector, unsigned long trigger)
5360 +{
5361 + if (use_pci_vector() && !platform_legacy_irq(irq)) {
5362 + if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
5363 + trigger == IOAPIC_LEVEL)
5364 + irq_desc[vector].handler = &ioapic_level_type;
5365 + else
5366 + irq_desc[vector].handler = &ioapic_edge_type;
5367 + set_intr_gate(vector, interrupt[vector]);
5368 + } else {
5369 + if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
5370 + trigger == IOAPIC_LEVEL)
5371 + irq_desc[irq].handler = &ioapic_level_type;
5372 + else
5373 + irq_desc[irq].handler = &ioapic_edge_type;
5374 + set_intr_gate(vector, interrupt[irq]);
5375 + }
5376 +}
5377 +#else
5378 +#define ioapic_register_intr(_irq,_vector,_trigger) ((void)0)
5379 +#endif
5380 +
5381 +static void __init setup_IO_APIC_irqs(void)
5382 +{
5383 + struct IO_APIC_route_entry entry;
5384 + int apic, pin, idx, irq, first_notcon = 1, vector;
5385 + unsigned long flags;
5386 +
5387 + apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
5388 +
5389 + for (apic = 0; apic < nr_ioapics; apic++) {
5390 + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
5391 +
5392 + /*
5393 + * add it to the IO-APIC irq-routing table:
5394 + */
5395 + memset(&entry,0,sizeof(entry));
5396 +
5397 + entry.delivery_mode = INT_DELIVERY_MODE;
5398 + entry.dest_mode = INT_DEST_MODE;
5399 + entry.mask = 0; /* enable IRQ */
5400 + entry.dest.logical.logical_dest =
5401 + cpu_mask_to_apicid(TARGET_CPUS);
5402 +
5403 + idx = find_irq_entry(apic,pin,mp_INT);
5404 + if (idx == -1) {
5405 + if (first_notcon) {
5406 + apic_printk(APIC_VERBOSE, KERN_DEBUG
5407 + " IO-APIC (apicid-pin) %d-%d",
5408 + mp_ioapics[apic].mpc_apicid,
5409 + pin);
5410 + first_notcon = 0;
5411 + } else
5412 + apic_printk(APIC_VERBOSE, ", %d-%d",
5413 + mp_ioapics[apic].mpc_apicid, pin);
5414 + continue;
5415 + }
5416 +
5417 + entry.trigger = irq_trigger(idx);
5418 + entry.polarity = irq_polarity(idx);
5419 +
5420 + if (irq_trigger(idx)) {
5421 + entry.trigger = 1;
5422 + entry.mask = 1;
5423 + }
5424 +
5425 + irq = pin_2_irq(idx, apic, pin);
5426 + /*
5427 + * skip adding the timer int on secondary nodes, which causes
5428 + * a small but painful rift in the time-space continuum
5429 + */
5430 + if (multi_timer_check(apic, irq))
5431 + continue;
5432 + else
5433 + add_pin_to_irq(irq, apic, pin);
5434 +
5435 + if (/*!apic &&*/ !IO_APIC_IRQ(irq))
5436 + continue;
5437 +
5438 + if (IO_APIC_IRQ(irq)) {
5439 + vector = assign_irq_vector(irq);
5440 + entry.vector = vector;
5441 + ioapic_register_intr(irq, vector, IOAPIC_AUTO);
5442 +
5443 + if (!apic && (irq < 16))
5444 + disable_8259A_irq(irq);
5445 + }
5446 + spin_lock_irqsave(&ioapic_lock, flags);
5447 + io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1));
5448 + io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0));
5449 + set_native_irq_info(irq, TARGET_CPUS);
5450 + spin_unlock_irqrestore(&ioapic_lock, flags);
5451 + }
5452 + }
5453 +
5454 + if (!first_notcon)
5455 + apic_printk(APIC_VERBOSE, " not connected.\n");
5456 +}
5457 +
5458 +/*
5459 + * Set up the 8259A-master output pin:
5460 + */
5461 +#ifndef CONFIG_XEN
5462 +static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, int vector)
5463 +{
5464 + struct IO_APIC_route_entry entry;
5465 + unsigned long flags;
5466 +
5467 + memset(&entry,0,sizeof(entry));
5468 +
5469 + disable_8259A_irq(0);
5470 +
5471 + /* mask LVT0 */
5472 + apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
5473 +
5474 + /*
5475 + * We use logical delivery to get the timer IRQ
5476 + * to the first CPU.
5477 + */
5478 + entry.dest_mode = INT_DEST_MODE;
5479 + entry.mask = 0; /* unmask IRQ now */
5480 + entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
5481 + entry.delivery_mode = INT_DELIVERY_MODE;
5482 + entry.polarity = 0;
5483 + entry.trigger = 0;
5484 + entry.vector = vector;
5485 +
5486 + /*
5487 + * The timer IRQ doesn't have to know that behind the
5488 + * scene we have a 8259A-master in AEOI mode ...
5489 + */
5490 + irq_desc[0].handler = &ioapic_edge_type;
5491 +
5492 + /*
5493 + * Add it to the IO-APIC irq-routing table:
5494 + */
5495 + spin_lock_irqsave(&ioapic_lock, flags);
5496 + io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1));
5497 + io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0));
5498 + spin_unlock_irqrestore(&ioapic_lock, flags);
5499 +
5500 + enable_8259A_irq(0);
5501 +}
5502 +
5503 +static inline void UNEXPECTED_IO_APIC(void)
5504 +{
5505 +}
5506 +
5507 +void __init print_IO_APIC(void)
5508 +{
5509 + int apic, i;
5510 + union IO_APIC_reg_00 reg_00;
5511 + union IO_APIC_reg_01 reg_01;
5512 + union IO_APIC_reg_02 reg_02;
5513 + union IO_APIC_reg_03 reg_03;
5514 + unsigned long flags;
5515 +
5516 + if (apic_verbosity == APIC_QUIET)
5517 + return;
5518 +
5519 + printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
5520 + for (i = 0; i < nr_ioapics; i++)
5521 + printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
5522 + mp_ioapics[i].mpc_apicid, nr_ioapic_registers[i]);
5523 +
5524 + /*
5525 + * We are a bit conservative about what we expect. We have to
5526 + * know about every hardware change ASAP.
5527 + */
5528 + printk(KERN_INFO "testing the IO APIC.......................\n");
5529 +
5530 + for (apic = 0; apic < nr_ioapics; apic++) {
5531 +
5532 + spin_lock_irqsave(&ioapic_lock, flags);
5533 + reg_00.raw = io_apic_read(apic, 0);
5534 + reg_01.raw = io_apic_read(apic, 1);
5535 + if (reg_01.bits.version >= 0x10)
5536 + reg_02.raw = io_apic_read(apic, 2);
5537 + if (reg_01.bits.version >= 0x20)
5538 + reg_03.raw = io_apic_read(apic, 3);
5539 + spin_unlock_irqrestore(&ioapic_lock, flags);
5540 +
5541 + printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid);
5542 + printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
5543 + printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID);
5544 + printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type);
5545 + printk(KERN_DEBUG "....... : LTS : %X\n", reg_00.bits.LTS);
5546 + if (reg_00.bits.ID >= get_physical_broadcast())
5547 + UNEXPECTED_IO_APIC();
5548 + if (reg_00.bits.__reserved_1 || reg_00.bits.__reserved_2)
5549 + UNEXPECTED_IO_APIC();
5550 +
5551 + printk(KERN_DEBUG ".... register #01: %08X\n", reg_01.raw);
5552 + printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries);
5553 + if ( (reg_01.bits.entries != 0x0f) && /* older (Neptune) boards */
5554 + (reg_01.bits.entries != 0x17) && /* typical ISA+PCI boards */
5555 + (reg_01.bits.entries != 0x1b) && /* Compaq Proliant boards */
5556 + (reg_01.bits.entries != 0x1f) && /* dual Xeon boards */
5557 + (reg_01.bits.entries != 0x22) && /* bigger Xeon boards */
5558 + (reg_01.bits.entries != 0x2E) &&
5559 + (reg_01.bits.entries != 0x3F)
5560 + )
5561 + UNEXPECTED_IO_APIC();
5562 +
5563 + printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ);
5564 + printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version);
5565 + if ( (reg_01.bits.version != 0x01) && /* 82489DX IO-APICs */
5566 + (reg_01.bits.version != 0x10) && /* oldest IO-APICs */
5567 + (reg_01.bits.version != 0x11) && /* Pentium/Pro IO-APICs */
5568 + (reg_01.bits.version != 0x13) && /* Xeon IO-APICs */
5569 + (reg_01.bits.version != 0x20) /* Intel P64H (82806 AA) */
5570 + )
5571 + UNEXPECTED_IO_APIC();
5572 + if (reg_01.bits.__reserved_1 || reg_01.bits.__reserved_2)
5573 + UNEXPECTED_IO_APIC();
5574 +
5575 + /*
5576 + * Some Intel chipsets with IO APIC VERSION of 0x1? don't have reg_02,
5577 + * but the value of reg_02 is read as the previous read register
5578 + * value, so ignore it if reg_02 == reg_01.
5579 + */
5580 + if (reg_01.bits.version >= 0x10 && reg_02.raw != reg_01.raw) {
5581 + printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw);
5582 + printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration);
5583 + if (reg_02.bits.__reserved_1 || reg_02.bits.__reserved_2)
5584 + UNEXPECTED_IO_APIC();
5585 + }
5586 +
5587 + /*
5588 + * Some Intel chipsets with IO APIC VERSION of 0x2? don't have reg_02
5589 + * or reg_03, but the value of reg_0[23] is read as the previous read
5590 + * register value, so ignore it if reg_03 == reg_0[12].
5591 + */
5592 + if (reg_01.bits.version >= 0x20 && reg_03.raw != reg_02.raw &&
5593 + reg_03.raw != reg_01.raw) {
5594 + printk(KERN_DEBUG ".... register #03: %08X\n", reg_03.raw);
5595 + printk(KERN_DEBUG "....... : Boot DT : %X\n", reg_03.bits.boot_DT);
5596 + if (reg_03.bits.__reserved_1)
5597 + UNEXPECTED_IO_APIC();
5598 + }
5599 +
5600 + printk(KERN_DEBUG ".... IRQ redirection table:\n");
5601 +
5602 + printk(KERN_DEBUG " NR Log Phy Mask Trig IRR Pol"
5603 + " Stat Dest Deli Vect: \n");
5604 +
5605 + for (i = 0; i <= reg_01.bits.entries; i++) {
5606 + struct IO_APIC_route_entry entry;
5607 +
5608 + spin_lock_irqsave(&ioapic_lock, flags);
5609 + *(((int *)&entry)+0) = io_apic_read(apic, 0x10+i*2);
5610 + *(((int *)&entry)+1) = io_apic_read(apic, 0x11+i*2);
5611 + spin_unlock_irqrestore(&ioapic_lock, flags);
5612 +
5613 + printk(KERN_DEBUG " %02x %03X %02X ",
5614 + i,
5615 + entry.dest.logical.logical_dest,
5616 + entry.dest.physical.physical_dest
5617 + );
5618 +
5619 + printk("%1d %1d %1d %1d %1d %1d %1d %02X\n",
5620 + entry.mask,
5621 + entry.trigger,
5622 + entry.irr,
5623 + entry.polarity,
5624 + entry.delivery_status,
5625 + entry.dest_mode,
5626 + entry.delivery_mode,
5627 + entry.vector
5628 + );
5629 + }
5630 + }
5631 + if (use_pci_vector())
5632 + printk(KERN_INFO "Using vector-based indexing\n");
5633 + printk(KERN_DEBUG "IRQ to pin mappings:\n");
5634 + for (i = 0; i < NR_IRQS; i++) {
5635 + struct irq_pin_list *entry = irq_2_pin + i;
5636 + if (entry->pin < 0)
5637 + continue;
5638 + if (use_pci_vector() && !platform_legacy_irq(i))
5639 + printk(KERN_DEBUG "IRQ%d ", IO_APIC_VECTOR(i));
5640 + else
5641 + printk(KERN_DEBUG "IRQ%d ", i);
5642 + for (;;) {
5643 + printk("-> %d:%d", entry->apic, entry->pin);
5644 + if (!entry->next)
5645 + break;
5646 + entry = irq_2_pin + entry->next;
5647 + }
5648 + printk("\n");
5649 + }
5650 +
5651 + printk(KERN_INFO ".................................... done.\n");
5652 +
5653 + return;
5654 +}
5655 +
5656 +#if 0
5657 +
5658 +static void print_APIC_bitfield (int base)
5659 +{
5660 + unsigned int v;
5661 + int i, j;
5662 +
5663 + if (apic_verbosity == APIC_QUIET)
5664 + return;
5665 +
5666 + printk(KERN_DEBUG "0123456789abcdef0123456789abcdef\n" KERN_DEBUG);
5667 + for (i = 0; i < 8; i++) {
5668 + v = apic_read(base + i*0x10);
5669 + for (j = 0; j < 32; j++) {
5670 + if (v & (1<<j))
5671 + printk("1");
5672 + else
5673 + printk("0");
5674 + }
5675 + printk("\n");
5676 + }
5677 +}
5678 +
5679 +void /*__init*/ print_local_APIC(void * dummy)
5680 +{
5681 + unsigned int v, ver, maxlvt;
5682 +
5683 + if (apic_verbosity == APIC_QUIET)
5684 + return;
5685 +
5686 + printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
5687 + smp_processor_id(), hard_smp_processor_id());
5688 + v = apic_read(APIC_ID);
5689 + printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, GET_APIC_ID(v));
5690 + v = apic_read(APIC_LVR);
5691 + printk(KERN_INFO "... APIC VERSION: %08x\n", v);
5692 + ver = GET_APIC_VERSION(v);
5693 + maxlvt = get_maxlvt();
5694 +
5695 + v = apic_read(APIC_TASKPRI);
5696 + printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK);
5697 +
5698 + if (APIC_INTEGRATED(ver)) { /* !82489DX */
5699 + v = apic_read(APIC_ARBPRI);
5700 + printk(KERN_DEBUG "... APIC ARBPRI: %08x (%02x)\n", v,
5701 + v & APIC_ARBPRI_MASK);
5702 + v = apic_read(APIC_PROCPRI);
5703 + printk(KERN_DEBUG "... APIC PROCPRI: %08x\n", v);
5704 + }
5705 +
5706 + v = apic_read(APIC_EOI);
5707 + printk(KERN_DEBUG "... APIC EOI: %08x\n", v);
5708 + v = apic_read(APIC_RRR);
5709 + printk(KERN_DEBUG "... APIC RRR: %08x\n", v);
5710 + v = apic_read(APIC_LDR);
5711 + printk(KERN_DEBUG "... APIC LDR: %08x\n", v);
5712 + v = apic_read(APIC_DFR);
5713 + printk(KERN_DEBUG "... APIC DFR: %08x\n", v);
5714 + v = apic_read(APIC_SPIV);
5715 + printk(KERN_DEBUG "... APIC SPIV: %08x\n", v);
5716 +
5717 + printk(KERN_DEBUG "... APIC ISR field:\n");
5718 + print_APIC_bitfield(APIC_ISR);
5719 + printk(KERN_DEBUG "... APIC TMR field:\n");
5720 + print_APIC_bitfield(APIC_TMR);
5721 + printk(KERN_DEBUG "... APIC IRR field:\n");
5722 + print_APIC_bitfield(APIC_IRR);
5723 +
5724 + if (APIC_INTEGRATED(ver)) { /* !82489DX */
5725 + if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
5726 + apic_write(APIC_ESR, 0);
5727 + v = apic_read(APIC_ESR);
5728 + printk(KERN_DEBUG "... APIC ESR: %08x\n", v);
5729 + }
5730 +
5731 + v = apic_read(APIC_ICR);
5732 + printk(KERN_DEBUG "... APIC ICR: %08x\n", v);
5733 + v = apic_read(APIC_ICR2);
5734 + printk(KERN_DEBUG "... APIC ICR2: %08x\n", v);
5735 +
5736 + v = apic_read(APIC_LVTT);
5737 + printk(KERN_DEBUG "... APIC LVTT: %08x\n", v);
5738 +
5739 + if (maxlvt > 3) { /* PC is LVT#4. */
5740 + v = apic_read(APIC_LVTPC);
5741 + printk(KERN_DEBUG "... APIC LVTPC: %08x\n", v);
5742 + }
5743 + v = apic_read(APIC_LVT0);
5744 + printk(KERN_DEBUG "... APIC LVT0: %08x\n", v);
5745 + v = apic_read(APIC_LVT1);
5746 + printk(KERN_DEBUG "... APIC LVT1: %08x\n", v);
5747 +
5748 + if (maxlvt > 2) { /* ERR is LVT#3. */
5749 + v = apic_read(APIC_LVTERR);
5750 + printk(KERN_DEBUG "... APIC LVTERR: %08x\n", v);
5751 + }
5752 +
5753 + v = apic_read(APIC_TMICT);
5754 + printk(KERN_DEBUG "... APIC TMICT: %08x\n", v);
5755 + v = apic_read(APIC_TMCCT);
5756 + printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v);
5757 + v = apic_read(APIC_TDCR);
5758 + printk(KERN_DEBUG "... APIC TDCR: %08x\n", v);
5759 + printk("\n");
5760 +}
5761 +
5762 +void print_all_local_APICs (void)
5763 +{
5764 + on_each_cpu(print_local_APIC, NULL, 1, 1);
5765 +}
5766 +
5767 +void /*__init*/ print_PIC(void)
5768 +{
5769 + unsigned int v;
5770 + unsigned long flags;
5771 +
5772 + if (apic_verbosity == APIC_QUIET)
5773 + return;
5774 +
5775 + printk(KERN_DEBUG "\nprinting PIC contents\n");
5776 +
5777 + spin_lock_irqsave(&i8259A_lock, flags);
5778 +
5779 + v = inb(0xa1) << 8 | inb(0x21);
5780 + printk(KERN_DEBUG "... PIC IMR: %04x\n", v);
5781 +
5782 + v = inb(0xa0) << 8 | inb(0x20);
5783 + printk(KERN_DEBUG "... PIC IRR: %04x\n", v);
5784 +
5785 + outb(0x0b,0xa0);
5786 + outb(0x0b,0x20);
5787 + v = inb(0xa0) << 8 | inb(0x20);
5788 + outb(0x0a,0xa0);
5789 + outb(0x0a,0x20);
5790 +
5791 + spin_unlock_irqrestore(&i8259A_lock, flags);
5792 +
5793 + printk(KERN_DEBUG "... PIC ISR: %04x\n", v);
5794 +
5795 + v = inb(0x4d1) << 8 | inb(0x4d0);
5796 + printk(KERN_DEBUG "... PIC ELCR: %04x\n", v);
5797 +}
5798 +
5799 +#endif /* 0 */
5800 +
5801 +#else
5802 +void __init print_IO_APIC(void) { }
5803 +#endif /* !CONFIG_XEN */
5804 +
5805 +static void __init enable_IO_APIC(void)
5806 +{
5807 + union IO_APIC_reg_01 reg_01;
5808 + int i8259_apic, i8259_pin;
5809 + int i, apic;
5810 + unsigned long flags;
5811 +
5812 + for (i = 0; i < PIN_MAP_SIZE; i++) {
5813 + irq_2_pin[i].pin = -1;
5814 + irq_2_pin[i].next = 0;
5815 + }
5816 + if (!pirqs_enabled)
5817 + for (i = 0; i < MAX_PIRQS; i++)
5818 + pirq_entries[i] = -1;
5819 +
5820 + /*
5821 + * The number of IO-APIC IRQ registers (== #pins):
5822 + */
5823 + for (apic = 0; apic < nr_ioapics; apic++) {
5824 + spin_lock_irqsave(&ioapic_lock, flags);
5825 + reg_01.raw = io_apic_read(apic, 1);
5826 + spin_unlock_irqrestore(&ioapic_lock, flags);
5827 + nr_ioapic_registers[apic] = reg_01.bits.entries+1;
5828 + }
5829 + for(apic = 0; apic < nr_ioapics; apic++) {
5830 + int pin;
5831 + /* See if any of the pins is in ExtINT mode */
5832 + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
5833 + struct IO_APIC_route_entry entry;
5834 + spin_lock_irqsave(&ioapic_lock, flags);
5835 + *(((int *)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin);
5836 + *(((int *)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin);
5837 + spin_unlock_irqrestore(&ioapic_lock, flags);
5838 +
5839 +
5840 + /* If the interrupt line is enabled and in ExtInt mode
5841 + * I have found the pin where the i8259 is connected.
5842 + */
5843 + if ((entry.mask == 0) && (entry.delivery_mode == dest_ExtINT)) {
5844 + ioapic_i8259.apic = apic;
5845 + ioapic_i8259.pin = pin;
5846 + goto found_i8259;
5847 + }
5848 + }
5849 + }
5850 + found_i8259:
5851 + /* Look to see if the MP table has reported the ExtINT */
5852 + /* If we could not find the appropriate pin by looking at the ioapic
5853 + * the i8259 is probably not connected to the ioapic, but give the
5854 + * mptable a chance anyway.
5855 + */
5856 + i8259_pin = find_isa_irq_pin(0, mp_ExtINT);
5857 + i8259_apic = find_isa_irq_apic(0, mp_ExtINT);
5858 + /* Trust the MP table if nothing is set up in the hardware */
5859 + if ((ioapic_i8259.pin == -1) && (i8259_pin >= 0)) {
5860 + printk(KERN_WARNING "ExtINT not setup in hardware but reported by MP table\n");
5861 + ioapic_i8259.pin = i8259_pin;
5862 + ioapic_i8259.apic = i8259_apic;
5863 + }
5864 + /* Complain if the MP table and the hardware disagree */
5865 + if (((ioapic_i8259.apic != i8259_apic) || (ioapic_i8259.pin != i8259_pin)) &&
5866 + (i8259_pin >= 0) && (ioapic_i8259.pin >= 0))
5867 + {
5868 + printk(KERN_WARNING "ExtINT in hardware and MP table differ\n");
5869 + }
5870 +
5871 + /*
5872 + * Do not trust the IO-APIC being empty at bootup
5873 + */
5874 + clear_IO_APIC();
5875 +}
5876 +
5877 +/*
5878 + * Not an __init, needed by the reboot code
5879 + */
5880 +void disable_IO_APIC(void)
5881 +{
5882 + /*
5883 + * Clear the IO-APIC before rebooting:
5884 + */
5885 + clear_IO_APIC();
5886 +
5887 +#ifndef CONFIG_XEN
5888 + /*
5889 + * If the i8259 is routed through an IOAPIC,
5890 + * put that IOAPIC in virtual wire mode
5891 + * so legacy interrupts can be delivered.
5892 + */
5893 + if (ioapic_i8259.pin != -1) {
5894 + struct IO_APIC_route_entry entry;
5895 + unsigned long flags;
5896 +
5897 + memset(&entry, 0, sizeof(entry));
5898 + entry.mask = 0; /* Enabled */
5899 + entry.trigger = 0; /* Edge */
5900 + entry.irr = 0;
5901 + entry.polarity = 0; /* High */
5902 + entry.delivery_status = 0;
5903 + entry.dest_mode = 0; /* Physical */
5904 + entry.delivery_mode = dest_ExtINT; /* ExtInt */
5905 + entry.vector = 0;
5906 + entry.dest.physical.physical_dest =
5907 + GET_APIC_ID(apic_read(APIC_ID));
5908 +
5909 + /*
5910 + * Add it to the IO-APIC irq-routing table:
5911 + */
5912 + spin_lock_irqsave(&ioapic_lock, flags);
5913 + io_apic_write(ioapic_i8259.apic, 0x11+2*ioapic_i8259.pin,
5914 + *(((int *)&entry)+1));
5915 + io_apic_write(ioapic_i8259.apic, 0x10+2*ioapic_i8259.pin,
5916 + *(((int *)&entry)+0));
5917 + spin_unlock_irqrestore(&ioapic_lock, flags);
5918 + }
5919 + disconnect_bsp_APIC(ioapic_i8259.pin != -1);
5920 +#endif
5921 +}
5922 +
5923 +/*
5924 + * function to set the IO-APIC physical IDs based on the
5925 + * values stored in the MPC table.
5926 + *
5927 + * by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999
5928 + */
5929 +
5930 +#if !defined(CONFIG_XEN) && !defined(CONFIG_X86_NUMAQ)
5931 +static void __init setup_ioapic_ids_from_mpc(void)
5932 +{
5933 + union IO_APIC_reg_00 reg_00;
5934 + physid_mask_t phys_id_present_map;
5935 + int apic;
5936 + int i;
5937 + unsigned char old_id;
5938 + unsigned long flags;
5939 +
5940 + /*
5941 + * Don't check I/O APIC IDs for xAPIC systems. They have
5942 + * no meaning without the serial APIC bus.
5943 + */
5944 + if (!(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && boot_cpu_data.x86 < 15))
5945 + return;
5946 + /*
5947 + * This is broken; anything with a real cpu count has to
5948 + * circumvent this idiocy regardless.
5949 + */
5950 + phys_id_present_map = ioapic_phys_id_map(phys_cpu_present_map);
5951 +
5952 + /*
5953 + * Set the IOAPIC ID to the value stored in the MPC table.
5954 + */
5955 + for (apic = 0; apic < nr_ioapics; apic++) {
5956 +
5957 + /* Read the register 0 value */
5958 + spin_lock_irqsave(&ioapic_lock, flags);
5959 + reg_00.raw = io_apic_read(apic, 0);
5960 + spin_unlock_irqrestore(&ioapic_lock, flags);
5961 +
5962 + old_id = mp_ioapics[apic].mpc_apicid;
5963 +
5964 + if (mp_ioapics[apic].mpc_apicid >= get_physical_broadcast()) {
5965 + printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n",
5966 + apic, mp_ioapics[apic].mpc_apicid);
5967 + printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
5968 + reg_00.bits.ID);
5969 + mp_ioapics[apic].mpc_apicid = reg_00.bits.ID;
5970 + }
5971 +
5972 + /*
5973 + * Sanity check, is the ID really free? Every APIC in a
5974 + * system must have a unique ID or we get lots of nice
5975 + * 'stuck on smp_invalidate_needed IPI wait' messages.
5976 + */
5977 + if (check_apicid_used(phys_id_present_map,
5978 + mp_ioapics[apic].mpc_apicid)) {
5979 + printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n",
5980 + apic, mp_ioapics[apic].mpc_apicid);
5981 + for (i = 0; i < get_physical_broadcast(); i++)
5982 + if (!physid_isset(i, phys_id_present_map))
5983 + break;
5984 + if (i >= get_physical_broadcast())
5985 + panic("Max APIC ID exceeded!\n");
5986 + printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
5987 + i);
5988 + physid_set(i, phys_id_present_map);
5989 + mp_ioapics[apic].mpc_apicid = i;
5990 + } else {
5991 + physid_mask_t tmp;
5992 + tmp = apicid_to_cpu_present(mp_ioapics[apic].mpc_apicid);
5993 + apic_printk(APIC_VERBOSE, "Setting %d in the "
5994 + "phys_id_present_map\n",
5995 + mp_ioapics[apic].mpc_apicid);
5996 + physids_or(phys_id_present_map, phys_id_present_map, tmp);
5997 + }
5998 +
5999 +
6000 + /*
6001 + * We need to adjust the IRQ routing table
6002 + * if the ID changed.
6003 + */
6004 + if (old_id != mp_ioapics[apic].mpc_apicid)
6005 + for (i = 0; i < mp_irq_entries; i++)
6006 + if (mp_irqs[i].mpc_dstapic == old_id)
6007 + mp_irqs[i].mpc_dstapic
6008 + = mp_ioapics[apic].mpc_apicid;
6009 +
6010 + /*
6011 + * Read the right value from the MPC table and
6012 + * write it into the ID register.
6013 + */
6014 + apic_printk(APIC_VERBOSE, KERN_INFO
6015 + "...changing IO-APIC physical APIC ID to %d ...",
6016 + mp_ioapics[apic].mpc_apicid);
6017 +
6018 + reg_00.bits.ID = mp_ioapics[apic].mpc_apicid;
6019 + spin_lock_irqsave(&ioapic_lock, flags);
6020 + io_apic_write(apic, 0, reg_00.raw);
6021 + spin_unlock_irqrestore(&ioapic_lock, flags);
6022 +
6023 + /*
6024 + * Sanity check
6025 + */
6026 + spin_lock_irqsave(&ioapic_lock, flags);
6027 + reg_00.raw = io_apic_read(apic, 0);
6028 + spin_unlock_irqrestore(&ioapic_lock, flags);
6029 + if (reg_00.bits.ID != mp_ioapics[apic].mpc_apicid)
6030 + printk("could not set ID!\n");
6031 + else
6032 + apic_printk(APIC_VERBOSE, " ok.\n");
6033 + }
6034 +}
6035 +#else
6036 +static void __init setup_ioapic_ids_from_mpc(void) { }
6037 +#endif
6038 +
6039 +#ifndef CONFIG_XEN
6040 +/*
6041 + * There is a nasty bug in some older SMP boards: their mptable lies
6042 + * about the timer IRQ. We do the following to work around the situation:
6043 + *
6044 + * - timer IRQ defaults to IO-APIC IRQ
6045 + * - if this function detects that timer IRQs are defunct, then we fall
6046 + * back to ISA timer IRQs
6047 + */
6048 +static int __init timer_irq_works(void)
6049 +{
6050 + unsigned long t1 = jiffies;
6051 +
6052 + local_irq_enable();
6053 + /* Let ten ticks pass... */
6054 + mdelay((10 * 1000) / HZ);
6055 +
6056 + /*
6057 + * Expect a few ticks at least, to be sure some possible
6058 + * glue logic does not lock up after the first one or two
6059 + * ticks in a non-ExtINT mode. Also the local APIC
6060 + * might have cached one ExtINT interrupt. Finally, at
6061 + * least one tick may be lost due to delays.
6062 + */
6063 + if (jiffies - t1 > 4)
6064 + return 1;
6065 +
6066 + return 0;
6067 +}
6068 +
6069 +/*
6070 + * In the SMP+IOAPIC case it might happen that there are an unspecified
6071 + * number of pending IRQ events unhandled. These cases are very rare,
6072 + * so we 'resend' these IRQs via IPIs, to the same CPU. It's much
6073 + * better to do it this way, as we then do not have to be aware of
6074 + * 'pending' interrupts in the IRQ path, except at this point.
6075 + */
6076 +/*
6077 + * Edge-triggered interrupts need to resend any interrupt
6078 + * that was delayed, but this is now handled in the device-
6079 + * independent code.
6080 + */
6081 +
6082 +/*
6083 + * Starting up an edge-triggered IO-APIC interrupt is
6084 + * nasty - we need to make sure that we get the edge.
6085 + * If it is already asserted for some reason, we need to
6086 + * return 1 to indicate that it was pending.
6087 + *
6088 + * This is not complete - we should be able to fake
6089 + * an edge even if it isn't on the 8259A...
6090 + */
6091 +static unsigned int startup_edge_ioapic_irq(unsigned int irq)
6092 +{
6093 + int was_pending = 0;
6094 + unsigned long flags;
6095 +
6096 + spin_lock_irqsave(&ioapic_lock, flags);
6097 + if (irq < 16) {
6098 + disable_8259A_irq(irq);
6099 + if (i8259A_irq_pending(irq))
6100 + was_pending = 1;
6101 + }
6102 + __unmask_IO_APIC_irq(irq);
6103 + spin_unlock_irqrestore(&ioapic_lock, flags);
6104 +
6105 + return was_pending;
6106 +}
6107 +
6108 +/*
6109 + * Once we have recorded IRQ_PENDING already, we can mask the
6110 + * interrupt for real. This prevents IRQ storms from unhandled
6111 + * devices.
6112 + */
6113 +static void ack_edge_ioapic_irq(unsigned int irq)
6114 +{
6115 + move_irq(irq);
6116 + if ((irq_desc[irq].status & (IRQ_PENDING | IRQ_DISABLED))
6117 + == (IRQ_PENDING | IRQ_DISABLED))
6118 + mask_IO_APIC_irq(irq);
6119 + ack_APIC_irq();
6120 +}
6121 +
6122 +/*
6123 + * Level triggered interrupts can just be masked,
6124 + * and shutting down and starting up the interrupt
6125 + * is the same as enabling and disabling them -- except
6126 + * that startup needs to return a "was pending" value.
6127 + *
6128 + * Level triggered interrupts are special because we
6129 + * do not touch any IO-APIC register while handling
6130 + * them. We ack the APIC in the end-IRQ handler, not
6131 + * in the start-IRQ-handler. Protection against reentrance
6132 + * from the same interrupt is still provided, both by the
6133 + * generic IRQ layer and by the fact that an unacked local
6134 + * APIC does not accept IRQs.
6135 + */
6136 +static unsigned int startup_level_ioapic_irq (unsigned int irq)
6137 +{
6138 + unmask_IO_APIC_irq(irq);
6139 +
6140 + return 0; /* don't check for pending */
6141 +}
6142 +
6143 +static void end_level_ioapic_irq (unsigned int irq)
6144 +{
6145 + unsigned long v;
6146 + int i;
6147 +
6148 + move_irq(irq);
6149 +/*
6150 + * It appears there is an erratum which affects at least version 0x11
6151 + * of I/O APIC (that's the 82093AA and cores integrated into various
6152 + * chipsets). Under certain conditions a level-triggered interrupt is
6153 + * erroneously delivered as edge-triggered one but the respective IRR
6154 + * bit gets set nevertheless. As a result the I/O unit expects an EOI
6155 + * message but it will never arrive and further interrupts are blocked
6156 + * from the source. The exact reason is so far unknown, but the
6157 + * phenomenon was observed when two consecutive interrupt requests
6158 + * from a given source get delivered to the same CPU and the source is
6159 + * temporarily disabled in between.
6160 + *
6161 + * A workaround is to simulate an EOI message manually. We achieve it
6162 + * by setting the trigger mode to edge and then to level when the edge
6163 + * trigger mode gets detected in the TMR of a local APIC for a
6164 + * level-triggered interrupt. We mask the source for the time of the
6165 + * operation to prevent an edge-triggered interrupt escaping meanwhile.
6166 + * The idea is from Manfred Spraul. --macro
6167 + */
6168 + i = IO_APIC_VECTOR(irq);
6169 +
6170 + v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1));
6171 +
6172 + ack_APIC_irq();
6173 +
6174 + if (!(v & (1 << (i & 0x1f)))) {
6175 + atomic_inc(&irq_mis_count);
6176 + spin_lock(&ioapic_lock);
6177 + __mask_and_edge_IO_APIC_irq(irq);
6178 + __unmask_and_level_IO_APIC_irq(irq);
6179 + spin_unlock(&ioapic_lock);
6180 + }
6181 +}
6182 +
6183 +#ifdef CONFIG_PCI_MSI
6184 +static unsigned int startup_edge_ioapic_vector(unsigned int vector)
6185 +{
6186 + int irq = vector_to_irq(vector);
6187 +
6188 + return startup_edge_ioapic_irq(irq);
6189 +}
6190 +
6191 +static void ack_edge_ioapic_vector(unsigned int vector)
6192 +{
6193 + int irq = vector_to_irq(vector);
6194 +
6195 + move_native_irq(vector);
6196 + ack_edge_ioapic_irq(irq);
6197 +}
6198 +
6199 +static unsigned int startup_level_ioapic_vector (unsigned int vector)
6200 +{
6201 + int irq = vector_to_irq(vector);
6202 +
6203 + return startup_level_ioapic_irq (irq);
6204 +}
6205 +
6206 +static void end_level_ioapic_vector (unsigned int vector)
6207 +{
6208 + int irq = vector_to_irq(vector);
6209 +
6210 + move_native_irq(vector);
6211 + end_level_ioapic_irq(irq);
6212 +}
6213 +
6214 +static void mask_IO_APIC_vector (unsigned int vector)
6215 +{
6216 + int irq = vector_to_irq(vector);
6217 +
6218 + mask_IO_APIC_irq(irq);
6219 +}
6220 +
6221 +static void unmask_IO_APIC_vector (unsigned int vector)
6222 +{
6223 + int irq = vector_to_irq(vector);
6224 +
6225 + unmask_IO_APIC_irq(irq);
6226 +}
6227 +
6228 +#ifdef CONFIG_SMP
6229 +static void set_ioapic_affinity_vector (unsigned int vector,
6230 + cpumask_t cpu_mask)
6231 +{
6232 + int irq = vector_to_irq(vector);
6233 +
6234 + set_native_irq_info(vector, cpu_mask);
6235 + set_ioapic_affinity_irq(irq, cpu_mask);
6236 +}
6237 +#endif
6238 +#endif
6239 +
6240 +/*
6241 + * Level and edge triggered IO-APIC interrupts need different handling,
6242 + * so we use two separate IRQ descriptors. Edge triggered IRQs can be
6243 + * handled with the level-triggered descriptor, but that one has slightly
6244 + * more overhead. Level-triggered interrupts cannot be handled with the
6245 + * edge-triggered handler, without risking IRQ storms and other ugly
6246 + * races.
6247 + */
6248 +static struct hw_interrupt_type ioapic_edge_type __read_mostly = {
6249 + .typename = "IO-APIC-edge",
6250 + .startup = startup_edge_ioapic,
6251 + .shutdown = shutdown_edge_ioapic,
6252 + .enable = enable_edge_ioapic,
6253 + .disable = disable_edge_ioapic,
6254 + .ack = ack_edge_ioapic,
6255 + .end = end_edge_ioapic,
6256 +#ifdef CONFIG_SMP
6257 + .set_affinity = set_ioapic_affinity,
6258 +#endif
6259 +};
6260 +
6261 +static struct hw_interrupt_type ioapic_level_type __read_mostly = {
6262 + .typename = "IO-APIC-level",
6263 + .startup = startup_level_ioapic,
6264 + .shutdown = shutdown_level_ioapic,
6265 + .enable = enable_level_ioapic,
6266 + .disable = disable_level_ioapic,
6267 + .ack = mask_and_ack_level_ioapic,
6268 + .end = end_level_ioapic,
6269 +#ifdef CONFIG_SMP
6270 + .set_affinity = set_ioapic_affinity,
6271 +#endif
6272 +};
6273 +#endif /* !CONFIG_XEN */
6274 +
6275 +static inline void init_IO_APIC_traps(void)
6276 +{
6277 + int irq;
6278 +
6279 + /*
6280 + * NOTE! The local APIC isn't very good at handling
6281 + * multiple interrupts at the same interrupt level.
6282 + * As the interrupt level is determined by taking the
6283 + * vector number and shifting that right by 4, we
6284 + * want to spread these out a bit so that they don't
6285 + * all fall in the same interrupt level.
6286 + *
6287 + * Also, we've got to be careful not to trash gate
6288 + * 0x80, because int 0x80 is hm, kind of importantish. ;)
6289 + */
6290 + for (irq = 0; irq < NR_IRQS ; irq++) {
6291 + int tmp = irq;
6292 + if (use_pci_vector()) {
6293 + if (!platform_legacy_irq(tmp))
6294 + if ((tmp = vector_to_irq(tmp)) == -1)
6295 + continue;
6296 + }
6297 + if (IO_APIC_IRQ(tmp) && !IO_APIC_VECTOR(tmp)) {
6298 + /*
6299 + * Hmm.. We don't have an entry for this,
6300 + * so default to an old-fashioned 8259
6301 + * interrupt if we can..
6302 + */
6303 + if (irq < 16)
6304 + make_8259A_irq(irq);
6305 +#ifndef CONFIG_XEN
6306 + else
6307 + /* Strange. Oh, well.. */
6308 + irq_desc[irq].handler = &no_irq_type;
6309 +#endif
6310 + }
6311 + }
6312 +}
6313 +
6314 +#ifndef CONFIG_XEN
6315 +static void enable_lapic_irq (unsigned int irq)
6316 +{
6317 + unsigned long v;
6318 +
6319 + v = apic_read(APIC_LVT0);
6320 + apic_write_around(APIC_LVT0, v & ~APIC_LVT_MASKED);
6321 +}
6322 +
6323 +static void disable_lapic_irq (unsigned int irq)
6324 +{
6325 + unsigned long v;
6326 +
6327 + v = apic_read(APIC_LVT0);
6328 + apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED);
6329 +}
6330 +
6331 +static void ack_lapic_irq (unsigned int irq)
6332 +{
6333 + ack_APIC_irq();
6334 +}
6335 +
6336 +static void end_lapic_irq (unsigned int i) { /* nothing */ }
6337 +
6338 +static struct hw_interrupt_type lapic_irq_type __read_mostly = {
6339 + .typename = "local-APIC-edge",
6340 + .startup = NULL, /* startup_irq() not used for IRQ0 */
6341 + .shutdown = NULL, /* shutdown_irq() not used for IRQ0 */
6342 + .enable = enable_lapic_irq,
6343 + .disable = disable_lapic_irq,
6344 + .ack = ack_lapic_irq,
6345 + .end = end_lapic_irq
6346 +};
6347 +
6348 +static void setup_nmi (void)
6349 +{
6350 + /*
6351 + * Dirty trick to enable the NMI watchdog ...
6352 + * We put the 8259A master into AEOI mode and
6353 + * unmask on all local APICs LVT0 as NMI.
6354 + *
6355 + * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire')
6356 + * is from Maciej W. Rozycki - so we do not have to EOI from
6357 + * the NMI handler or the timer interrupt.
6358 + */
6359 + apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ...");
6360 +
6361 + on_each_cpu(enable_NMI_through_LVT0, NULL, 1, 1);
6362 +
6363 + apic_printk(APIC_VERBOSE, " done.\n");
6364 +}
6365 +
6366 +/*
6367 + * This looks a bit hackish but it's about the only way of sending
6368 + * a few INTA cycles to 8259As and any associated glue logic. ICR does
6369 + * not support the ExtINT mode, unfortunately. We need to send these
6370 + * cycles as some i82489DX-based boards have glue logic that keeps the
6371 + * 8259A interrupt line asserted until INTA. --macro
6372 + */
6373 +static inline void unlock_ExtINT_logic(void)
6374 +{
6375 + int apic, pin, i;
6376 + struct IO_APIC_route_entry entry0, entry1;
6377 + unsigned char save_control, save_freq_select;
6378 + unsigned long flags;
6379 +
6380 + pin = find_isa_irq_pin(8, mp_INT);
6381 + apic = find_isa_irq_apic(8, mp_INT);
6382 + if (pin == -1)
6383 + return;
6384 +
6385 + spin_lock_irqsave(&ioapic_lock, flags);
6386 + *(((int *)&entry0) + 1) = io_apic_read(apic, 0x11 + 2 * pin);
6387 + *(((int *)&entry0) + 0) = io_apic_read(apic, 0x10 + 2 * pin);
6388 + spin_unlock_irqrestore(&ioapic_lock, flags);
6389 + clear_IO_APIC_pin(apic, pin);
6390 +
6391 + memset(&entry1, 0, sizeof(entry1));
6392 +
6393 + entry1.dest_mode = 0; /* physical delivery */
6394 + entry1.mask = 0; /* unmask IRQ now */
6395 + entry1.dest.physical.physical_dest = hard_smp_processor_id();
6396 + entry1.delivery_mode = dest_ExtINT;
6397 + entry1.polarity = entry0.polarity;
6398 + entry1.trigger = 0;
6399 + entry1.vector = 0;
6400 +
6401 + spin_lock_irqsave(&ioapic_lock, flags);
6402 + io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry1) + 1));
6403 + io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry1) + 0));
6404 + spin_unlock_irqrestore(&ioapic_lock, flags);
6405 +
6406 + save_control = CMOS_READ(RTC_CONTROL);
6407 + save_freq_select = CMOS_READ(RTC_FREQ_SELECT);
6408 + CMOS_WRITE((save_freq_select & ~RTC_RATE_SELECT) | 0x6,
6409 + RTC_FREQ_SELECT);
6410 + CMOS_WRITE(save_control | RTC_PIE, RTC_CONTROL);
6411 +
6412 + i = 100;
6413 + while (i-- > 0) {
6414 + mdelay(10);
6415 + if ((CMOS_READ(RTC_INTR_FLAGS) & RTC_PF) == RTC_PF)
6416 + i -= 10;
6417 + }
6418 +
6419 + CMOS_WRITE(save_control, RTC_CONTROL);
6420 + CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT);
6421 + clear_IO_APIC_pin(apic, pin);
6422 +
6423 + spin_lock_irqsave(&ioapic_lock, flags);
6424 + io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry0) + 1));
6425 + io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry0) + 0));
6426 + spin_unlock_irqrestore(&ioapic_lock, flags);
6427 +}
6428 +
6429 +/*
6430 + * This code may look a bit paranoid, but it's supposed to cooperate with
6431 + * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ
6432 + * is so screwy. Thanks to Brian Perkins for testing/hacking this beast
6433 + * fanatically on his truly buggy board.
6434 + */
6435 +static inline void check_timer(void)
6436 +{
6437 + int apic1, pin1, apic2, pin2;
6438 + int vector;
6439 +
6440 + /*
6441 + * get/set the timer IRQ vector:
6442 + */
6443 + disable_8259A_irq(0);
6444 + vector = assign_irq_vector(0);
6445 + set_intr_gate(vector, interrupt[0]);
6446 +
6447 + /*
6448 + * Subtle, code in do_timer_interrupt() expects an AEOI
6449 + * mode for the 8259A whenever interrupts are routed
6450 + * through I/O APICs. Also IRQ0 has to be enabled in
6451 + * the 8259A which implies the virtual wire has to be
6452 + * disabled in the local APIC.
6453 + */
6454 + apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
6455 + init_8259A(1);
6456 + timer_ack = 1;
6457 + if (timer_over_8254 > 0)
6458 + enable_8259A_irq(0);
6459 +
6460 + pin1 = find_isa_irq_pin(0, mp_INT);
6461 + apic1 = find_isa_irq_apic(0, mp_INT);
6462 + pin2 = ioapic_i8259.pin;
6463 + apic2 = ioapic_i8259.apic;
6464 +
6465 + printk(KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n",
6466 + vector, apic1, pin1, apic2, pin2);
6467 +
6468 + if (pin1 != -1) {
6469 + /*
6470 + * Ok, does IRQ0 through the IOAPIC work?
6471 + */
6472 + unmask_IO_APIC_irq(0);
6473 + if (timer_irq_works()) {
6474 + if (nmi_watchdog == NMI_IO_APIC) {
6475 + disable_8259A_irq(0);
6476 + setup_nmi();
6477 + enable_8259A_irq(0);
6478 + }
6479 + if (disable_timer_pin_1 > 0)
6480 + clear_IO_APIC_pin(0, pin1);
6481 + return;
6482 + }
6483 + clear_IO_APIC_pin(apic1, pin1);
6484 + printk(KERN_ERR "..MP-BIOS bug: 8254 timer not connected to "
6485 + "IO-APIC\n");
6486 + }
6487 +
6488 + printk(KERN_INFO "...trying to set up timer (IRQ0) through the 8259A ... ");
6489 + if (pin2 != -1) {
6490 + printk("\n..... (found pin %d) ...", pin2);
6491 + /*
6492 + * legacy devices should be connected to IO APIC #0
6493 + */
6494 + setup_ExtINT_IRQ0_pin(apic2, pin2, vector);
6495 + if (timer_irq_works()) {
6496 + printk("works.\n");
6497 + if (pin1 != -1)
6498 + replace_pin_at_irq(0, apic1, pin1, apic2, pin2);
6499 + else
6500 + add_pin_to_irq(0, apic2, pin2);
6501 + if (nmi_watchdog == NMI_IO_APIC) {
6502 + setup_nmi();
6503 + }
6504 + return;
6505 + }
6506 + /*
6507 + * Cleanup, just in case ...
6508 + */
6509 + clear_IO_APIC_pin(apic2, pin2);
6510 + }
6511 + printk(" failed.\n");
6512 +
6513 + if (nmi_watchdog == NMI_IO_APIC) {
6514 + printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n");
6515 + nmi_watchdog = 0;
6516 + }
6517 +
6518 + printk(KERN_INFO "...trying to set up timer as Virtual Wire IRQ...");
6519 +
6520 + disable_8259A_irq(0);
6521 + irq_desc[0].handler = &lapic_irq_type;
6522 + apic_write_around(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */
6523 + enable_8259A_irq(0);
6524 +
6525 + if (timer_irq_works()) {
6526 + printk(" works.\n");
6527 + return;
6528 + }
6529 + apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector);
6530 + printk(" failed.\n");
6531 +
6532 + printk(KERN_INFO "...trying to set up timer as ExtINT IRQ...");
6533 +
6534 + timer_ack = 0;
6535 + init_8259A(0);
6536 + make_8259A_irq(0);
6537 + apic_write_around(APIC_LVT0, APIC_DM_EXTINT);
6538 +
6539 + unlock_ExtINT_logic();
6540 +
6541 + if (timer_irq_works()) {
6542 + printk(" works.\n");
6543 + return;
6544 + }
6545 + printk(" failed :(.\n");
6546 + panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a "
6547 + "report. Then try booting with the 'noapic' option");
6548 +}
6549 +#else
6550 +#define check_timer() ((void)0)
6551 +#endif
6552 +
6553 +/*
6554 + *
6555 + * IRQs that are handled by the PIC in the MPS IOAPIC case.
6556 + * - IRQ2 is the cascade IRQ, and cannot be an io-apic IRQ.
6557 + * Linux doesn't really care, as it's not actually used
6558 + * for any interrupt handling anyway.
6559 + */
6560 +#define PIC_IRQS (1 << PIC_CASCADE_IR)
6561 +
6562 +void __init setup_IO_APIC(void)
6563 +{
6564 + enable_IO_APIC();
6565 +
6566 + if (acpi_ioapic)
6567 + io_apic_irqs = ~0; /* all IRQs go through IOAPIC */
6568 + else
6569 + io_apic_irqs = ~PIC_IRQS;
6570 +
6571 + printk("ENABLING IO-APIC IRQs\n");
6572 +
6573 + /*
6574 + * Set up IO-APIC IRQ routing.
6575 + */
6576 + if (!acpi_ioapic)
6577 + setup_ioapic_ids_from_mpc();
6578 +#ifndef CONFIG_XEN
6579 + sync_Arb_IDs();
6580 +#endif
6581 + setup_IO_APIC_irqs();
6582 + init_IO_APIC_traps();
6583 + check_timer();
6584 + if (!acpi_ioapic)
6585 + print_IO_APIC();
6586 +}
6587 +
6588 +static int __init setup_disable_8254_timer(char *s)
6589 +{
6590 + timer_over_8254 = -1;
6591 + return 1;
6592 +}
6593 +static int __init setup_enable_8254_timer(char *s)
6594 +{
6595 + timer_over_8254 = 2;
6596 + return 1;
6597 +}
6598 +
6599 +__setup("disable_8254_timer", setup_disable_8254_timer);
6600 +__setup("enable_8254_timer", setup_enable_8254_timer);
6601 +
6602 +/*
6603 + * Called after all the initialization is done. If we didn't find any
6604 + * APIC bugs, then we can allow the modify fast path
6605 + */
6606 +
6607 +static int __init io_apic_bug_finalize(void)
6608 +{
6609 + if(sis_apic_bug == -1)
6610 + sis_apic_bug = 0;
6611 + if (is_initial_xendomain()) {
6612 + dom0_op_t op = { .cmd = DOM0_PLATFORM_QUIRK };
6613 + op.u.platform_quirk.quirk_id = sis_apic_bug ?
6614 + QUIRK_IOAPIC_BAD_REGSEL : QUIRK_IOAPIC_GOOD_REGSEL;
6615 + HYPERVISOR_dom0_op(&op);
6616 + }
6617 + return 0;
6618 +}
6619 +
6620 +late_initcall(io_apic_bug_finalize);
6621 +
6622 +struct sysfs_ioapic_data {
6623 + struct sys_device dev;
6624 + struct IO_APIC_route_entry entry[0];
6625 +};
6626 +static struct sysfs_ioapic_data * mp_ioapic_data[MAX_IO_APICS];
6627 +
6628 +static int ioapic_suspend(struct sys_device *dev, pm_message_t state)
6629 +{
6630 + struct IO_APIC_route_entry *entry;
6631 + struct sysfs_ioapic_data *data;
6632 + unsigned long flags;
6633 + int i;
6634 +
6635 + data = container_of(dev, struct sysfs_ioapic_data, dev);
6636 + entry = data->entry;
6637 + spin_lock_irqsave(&ioapic_lock, flags);
6638 + for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) {
6639 + *(((int *)entry) + 1) = io_apic_read(dev->id, 0x11 + 2 * i);
6640 + *(((int *)entry) + 0) = io_apic_read(dev->id, 0x10 + 2 * i);
6641 + }
6642 + spin_unlock_irqrestore(&ioapic_lock, flags);
6643 +
6644 + return 0;
6645 +}
6646 +
6647 +static int ioapic_resume(struct sys_device *dev)
6648 +{
6649 + struct IO_APIC_route_entry *entry;
6650 + struct sysfs_ioapic_data *data;
6651 + unsigned long flags;
6652 + union IO_APIC_reg_00 reg_00;
6653 + int i;
6654 +
6655 + data = container_of(dev, struct sysfs_ioapic_data, dev);
6656 + entry = data->entry;
6657 +
6658 + spin_lock_irqsave(&ioapic_lock, flags);
6659 + reg_00.raw = io_apic_read(dev->id, 0);
6660 + if (reg_00.bits.ID != mp_ioapics[dev->id].mpc_apicid) {
6661 + reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid;
6662 + io_apic_write(dev->id, 0, reg_00.raw);
6663 + }
6664 + for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) {
6665 + io_apic_write(dev->id, 0x11+2*i, *(((int *)entry)+1));
6666 + io_apic_write(dev->id, 0x10+2*i, *(((int *)entry)+0));
6667 + }
6668 + spin_unlock_irqrestore(&ioapic_lock, flags);
6669 +
6670 + return 0;
6671 +}
6672 +
6673 +static struct sysdev_class ioapic_sysdev_class = {
6674 + set_kset_name("ioapic"),
6675 + .suspend = ioapic_suspend,
6676 + .resume = ioapic_resume,
6677 +};
6678 +
6679 +static int __init ioapic_init_sysfs(void)
6680 +{
6681 + struct sys_device * dev;
6682 + int i, size, error = 0;
6683 +
6684 + error = sysdev_class_register(&ioapic_sysdev_class);
6685 + if (error)
6686 + return error;
6687 +
6688 + for (i = 0; i < nr_ioapics; i++ ) {
6689 + size = sizeof(struct sys_device) + nr_ioapic_registers[i]
6690 + * sizeof(struct IO_APIC_route_entry);
6691 + mp_ioapic_data[i] = kmalloc(size, GFP_KERNEL);
6692 + if (!mp_ioapic_data[i]) {
6693 + printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
6694 + continue;
6695 + }
6696 + memset(mp_ioapic_data[i], 0, size);
6697 + dev = &mp_ioapic_data[i]->dev;
6698 + dev->id = i;
6699 + dev->cls = &ioapic_sysdev_class;
6700 + error = sysdev_register(dev);
6701 + if (error) {
6702 + kfree(mp_ioapic_data[i]);
6703 + mp_ioapic_data[i] = NULL;
6704 + printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
6705 + continue;
6706 + }
6707 + }
6708 +
6709 + return 0;
6710 +}
6711 +
6712 +device_initcall(ioapic_init_sysfs);
6713 +
6714 +/* --------------------------------------------------------------------------
6715 + ACPI-based IOAPIC Configuration
6716 + -------------------------------------------------------------------------- */
6717 +
6718 +#ifdef CONFIG_ACPI
6719 +
6720 +int __init io_apic_get_unique_id (int ioapic, int apic_id)
6721 +{
6722 +#ifndef CONFIG_XEN
6723 + union IO_APIC_reg_00 reg_00;
6724 + static physid_mask_t apic_id_map = PHYSID_MASK_NONE;
6725 + physid_mask_t tmp;
6726 + unsigned long flags;
6727 + int i = 0;
6728 +
6729 + /*
6730 + * The P4 platform supports up to 256 APIC IDs on two separate APIC
6731 + * buses (one for LAPICs, one for IOAPICs), where predecessors only
6732 + * support up to 16 on one shared APIC bus.
6733 + *
6734 + * TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full
6735 + * advantage of new APIC bus architecture.
6736 + */
6737 +
6738 + if (physids_empty(apic_id_map))
6739 + apic_id_map = ioapic_phys_id_map(phys_cpu_present_map);
6740 +
6741 + spin_lock_irqsave(&ioapic_lock, flags);
6742 + reg_00.raw = io_apic_read(ioapic, 0);
6743 + spin_unlock_irqrestore(&ioapic_lock, flags);
6744 +
6745 + if (apic_id >= get_physical_broadcast()) {
6746 + printk(KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying "
6747 + "%d\n", ioapic, apic_id, reg_00.bits.ID);
6748 + apic_id = reg_00.bits.ID;
6749 + }
6750 +
6751 + /*
6752 + * Every APIC in a system must have a unique ID or we get lots of nice
6753 + * 'stuck on smp_invalidate_needed IPI wait' messages.
6754 + */
6755 + if (check_apicid_used(apic_id_map, apic_id)) {
6756 +
6757 + for (i = 0; i < get_physical_broadcast(); i++) {
6758 + if (!check_apicid_used(apic_id_map, i))
6759 + break;
6760 + }
6761 +
6762 + if (i == get_physical_broadcast())
6763 + panic("Max apic_id exceeded!\n");
6764 +
6765 + printk(KERN_WARNING "IOAPIC[%d]: apic_id %d already used, "
6766 + "trying %d\n", ioapic, apic_id, i);
6767 +
6768 + apic_id = i;
6769 + }
6770 +
6771 + tmp = apicid_to_cpu_present(apic_id);
6772 + physids_or(apic_id_map, apic_id_map, tmp);
6773 +
6774 + if (reg_00.bits.ID != apic_id) {
6775 + reg_00.bits.ID = apic_id;
6776 +
6777 + spin_lock_irqsave(&ioapic_lock, flags);
6778 + io_apic_write(ioapic, 0, reg_00.raw);
6779 + reg_00.raw = io_apic_read(ioapic, 0);
6780 + spin_unlock_irqrestore(&ioapic_lock, flags);
6781 +
6782 + /* Sanity check */
6783 + if (reg_00.bits.ID != apic_id) {
6784 + printk("IOAPIC[%d]: Unable to change apic_id!\n", ioapic);
6785 + return -1;
6786 + }
6787 + }
6788 +
6789 + apic_printk(APIC_VERBOSE, KERN_INFO
6790 + "IOAPIC[%d]: Assigned apic_id %d\n", ioapic, apic_id);
6791 +#endif /* !CONFIG_XEN */
6792 +
6793 + return apic_id;
6794 +}
6795 +
6796 +
6797 +int __init io_apic_get_version (int ioapic)
6798 +{
6799 + union IO_APIC_reg_01 reg_01;
6800 + unsigned long flags;
6801 +
6802 + spin_lock_irqsave(&ioapic_lock, flags);
6803 + reg_01.raw = io_apic_read(ioapic, 1);
6804 + spin_unlock_irqrestore(&ioapic_lock, flags);
6805 +
6806 + return reg_01.bits.version;
6807 +}
6808 +
6809 +
6810 +int __init io_apic_get_redir_entries (int ioapic)
6811 +{
6812 + union IO_APIC_reg_01 reg_01;
6813 + unsigned long flags;
6814 +
6815 + spin_lock_irqsave(&ioapic_lock, flags);
6816 + reg_01.raw = io_apic_read(ioapic, 1);
6817 + spin_unlock_irqrestore(&ioapic_lock, flags);
6818 +
6819 + return reg_01.bits.entries;
6820 +}
6821 +
6822 +
6823 +int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int active_high_low)
6824 +{
6825 + struct IO_APIC_route_entry entry;
6826 + unsigned long flags;
6827 +
6828 + if (!IO_APIC_IRQ(irq)) {
6829 + printk(KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
6830 + ioapic);
6831 + return -EINVAL;
6832 + }
6833 +
6834 + /*
6835 + * Generate a PCI IRQ routing entry and program the IOAPIC accordingly.
6836 + * Note that we mask (disable) IRQs now -- these get enabled when the
6837 + * corresponding device driver registers for this IRQ.
6838 + */
6839 +
6840 + memset(&entry,0,sizeof(entry));
6841 +
6842 + entry.delivery_mode = INT_DELIVERY_MODE;
6843 + entry.dest_mode = INT_DEST_MODE;
6844 + entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
6845 + entry.trigger = edge_level;
6846 + entry.polarity = active_high_low;
6847 + entry.mask = 1;
6848 +
6849 + /*
6850 + * IRQs < 16 are already in the irq_2_pin[] map
6851 + */
6852 + if (irq >= 16)
6853 + add_pin_to_irq(irq, ioapic, pin);
6854 +
6855 + entry.vector = assign_irq_vector(irq);
6856 +
6857 + apic_printk(APIC_DEBUG, KERN_DEBUG "IOAPIC[%d]: Set PCI routing entry "
6858 + "(%d-%d -> 0x%x -> IRQ %d Mode:%i Active:%i)\n", ioapic,
6859 + mp_ioapics[ioapic].mpc_apicid, pin, entry.vector, irq,
6860 + edge_level, active_high_low);
6861 +
6862 + ioapic_register_intr(irq, entry.vector, edge_level);
6863 +
6864 + if (!ioapic && (irq < 16))
6865 + disable_8259A_irq(irq);
6866 +
6867 + spin_lock_irqsave(&ioapic_lock, flags);
6868 + io_apic_write(ioapic, 0x11+2*pin, *(((int *)&entry)+1));
6869 + io_apic_write(ioapic, 0x10+2*pin, *(((int *)&entry)+0));
6870 + set_native_irq_info(use_pci_vector() ? entry.vector : irq, TARGET_CPUS);
6871 + spin_unlock_irqrestore(&ioapic_lock, flags);
6872 +
6873 + return 0;
6874 +}
6875 +
6876 +#endif /* CONFIG_ACPI */
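
An illustrative aside (not part of the patch): the paired writes at offsets 0x11+2*pin and 0x10+2*pin that recur throughout io_apic-xen.c program one 64-bit redirection-table entry as two 32-bit halves. A minimal sketch of that idiom, with ioapic_write_entry as an assumed helper name (the patch itself open-codes it each time):

	/* Illustrative only: pin N's redirection entry lives at registers
	 * 0x10+2*N (low dword: vector, delivery/dest mode, polarity,
	 * trigger, mask) and 0x11+2*N (high dword: destination).  The high
	 * dword is written first, presumably so the low dword, which
	 * carries the mask bit, takes effect last. */
	static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
	{
		io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&e) + 1));
		io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&e) + 0));
	}
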
6877 diff -Nur linux-2.6.16.33-noxen/arch/i386/kernel/ioport-xen.c linux-2.6.16.33/arch/i386/kernel/ioport-xen.c
6878 --- linux-2.6.16.33-noxen/arch/i386/kernel/ioport-xen.c 1970-01-01 00:00:00.000000000 +0000
6879 +++ linux-2.6.16.33/arch/i386/kernel/ioport-xen.c 2007-01-08 15:00:45.000000000 +0000
6880 @@ -0,0 +1,121 @@
6881 +/*
6882 + * linux/arch/i386/kernel/ioport.c
6883 + *
6884 + * This contains the io-permission bitmap code - written by obz, with changes
6885 + * by Linus.
6886 + */
6887 +
6888 +#include <linux/sched.h>
6889 +#include <linux/kernel.h>
6890 +#include <linux/capability.h>
6891 +#include <linux/errno.h>
6892 +#include <linux/types.h>
6893 +#include <linux/ioport.h>
6894 +#include <linux/smp.h>
6895 +#include <linux/smp_lock.h>
6896 +#include <linux/stddef.h>
6897 +#include <linux/slab.h>
6898 +#include <linux/thread_info.h>
6899 +#include <xen/interface/physdev.h>
6900 +
6901 +/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */
6902 +static void set_bitmap(unsigned long *bitmap, unsigned int base, unsigned int extent, int new_value)
6903 +{
6904 + unsigned long mask;
6905 + unsigned long *bitmap_base = bitmap + (base / BITS_PER_LONG);
6906 + unsigned int low_index = base & (BITS_PER_LONG-1);
6907 + int length = low_index + extent;
6908 +
6909 + if (low_index != 0) {
6910 + mask = (~0UL << low_index);
6911 + if (length < BITS_PER_LONG)
6912 + mask &= ~(~0UL << length);
6913 + if (new_value)
6914 + *bitmap_base++ |= mask;
6915 + else
6916 + *bitmap_base++ &= ~mask;
6917 + length -= BITS_PER_LONG;
6918 + }
6919 +
6920 + mask = (new_value ? ~0UL : 0UL);
6921 + while (length >= BITS_PER_LONG) {
6922 + *bitmap_base++ = mask;
6923 + length -= BITS_PER_LONG;
6924 + }
6925 +
6926 + if (length > 0) {
6927 + mask = ~(~0UL << length);
6928 + if (new_value)
6929 + *bitmap_base++ |= mask;
6930 + else
6931 + *bitmap_base++ &= ~mask;
6932 + }
6933 +}
6934 +
6935 +
6936 +/*
6937 + * this changes the io permissions bitmap in the current task.
6938 + */
6939 +asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
6940 +{
6941 + struct thread_struct * t = &current->thread;
6942 + unsigned long *bitmap;
6943 + struct physdev_set_iobitmap set_iobitmap;
6944 +
6945 + if ((from + num <= from) || (from + num > IO_BITMAP_BITS))
6946 + return -EINVAL;
6947 + if (turn_on && !capable(CAP_SYS_RAWIO))
6948 + return -EPERM;
6949 +
6950 + /*
6951 + * If it's the first ioperm() call in this thread's lifetime, set the
6952 + * IO bitmap up. ioperm() is much less timing critical than clone(),
6953 + * which is why we delay this operation until now:
6954 + */
6955 + if (!t->io_bitmap_ptr) {
6956 + bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
6957 + if (!bitmap)
6958 + return -ENOMEM;
6959 +
6960 + memset(bitmap, 0xff, IO_BITMAP_BYTES);
6961 + t->io_bitmap_ptr = bitmap;
6962 +
6963 + set_iobitmap.bitmap = (char *)bitmap;
6964 + set_iobitmap.nr_ports = IO_BITMAP_BITS;
6965 + HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap, &set_iobitmap);
6966 + }
6967 +
6968 + set_bitmap(t->io_bitmap_ptr, from, num, !turn_on);
6969 +
6970 + return 0;
6971 +}
6972 +
6973 +/*
6974 + * sys_iopl has to be used when you want to access the IO ports
6975 + * beyond the 0x3ff range: to get the full 65536 ports bitmapped
6976 + * you'd need 8kB of bitmaps/process, which is a bit excessive.
6977 + *
6978 + * Here we just change the eflags value on the stack: we allow
6979 + * only the super-user to do it. This depends on the stack-layout
6980 + * on system-call entry - see also fork() and the signal handling
6981 + * code.
6982 + */
6983 +
6984 +asmlinkage long sys_iopl(unsigned long unused)
6985 +{
6986 + volatile struct pt_regs * regs = (struct pt_regs *) &unused;
6987 + unsigned int level = regs->ebx;
6988 + struct thread_struct *t = &current->thread;
6989 + unsigned int old = (t->iopl >> 12) & 3;
6990 +
6991 + if (level > 3)
6992 + return -EINVAL;
6993 + /* Trying to gain more privileges? */
6994 + if (level > old) {
6995 + if (!capable(CAP_SYS_RAWIO))
6996 + return -EPERM;
6997 + }
6998 + t->iopl = level << 12;
6999 + set_iopl_mask(t->iopl);
7000 + return 0;
7001 +}
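
An illustrative aside (not part of the patch): the sys_ioperm()/sys_iopl() implementations above are reached through the usual glibc wrappers. A minimal user-space sketch, assuming port 0x378 (the legacy parallel port) merely as an example address; root or CAP_SYS_RAWIO is required:

	#include <stdio.h>
	#include <sys/io.h>

	int main(void)
	{
		/* Open ports 0x378-0x37a in this task's I/O bitmap via sys_ioperm(). */
		if (ioperm(0x378, 3, 1) < 0) {
			perror("ioperm");
			return 1;
		}
		outb(0xff, 0x378);	/* raise the data lines */
		ioperm(0x378, 3, 0);	/* revoke the permission again */
		return 0;
	}
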
7002 diff -Nur linux-2.6.16.33-noxen/arch/i386/kernel/irq-xen.c linux-2.6.16.33/arch/i386/kernel/irq-xen.c
7003 --- linux-2.6.16.33-noxen/arch/i386/kernel/irq-xen.c 1970-01-01 00:00:00.000000000 +0000
7004 +++ linux-2.6.16.33/arch/i386/kernel/irq-xen.c 2007-01-08 15:00:45.000000000 +0000
7005 @@ -0,0 +1,306 @@
7006 +/*
7007 + * linux/arch/i386/kernel/irq.c
7008 + *
7009 + * Copyright (C) 1992, 1998 Linus Torvalds, Ingo Molnar
7010 + *
7011 + * This file contains the lowest level x86-specific interrupt
7012 + * entry, irq-stacks and irq statistics code. All the remaining
7013 + * irq logic is done by the generic kernel/irq/ code and
7014 + * by the x86-specific irq controller code. (e.g. i8259.c and
7015 + * io_apic.c.)
7016 + */
7017 +
7018 +#include <asm/uaccess.h>
7019 +#include <linux/module.h>
7020 +#include <linux/seq_file.h>
7021 +#include <linux/interrupt.h>
7022 +#include <linux/kernel_stat.h>
7023 +#include <linux/notifier.h>
7024 +#include <linux/cpu.h>
7025 +#include <linux/delay.h>
7026 +
7027 +DEFINE_PER_CPU(irq_cpustat_t, irq_stat) ____cacheline_internodealigned_in_smp;
7028 +EXPORT_PER_CPU_SYMBOL(irq_stat);
7029 +
7030 +#ifndef CONFIG_X86_LOCAL_APIC
7031 +/*
7032 + * 'what should we do if we get a hw irq event on an illegal vector'.
7033 + * Each architecture has to answer this itself.
7034 + */
7035 +void ack_bad_irq(unsigned int irq)
7036 +{
7037 + printk("unexpected IRQ trap at vector %02x\n", irq);
7038 +}
7039 +#endif
7040 +
7041 +#ifdef CONFIG_4KSTACKS
7042 +/*
7043 + * per-CPU IRQ handling contexts (thread information and stack)
7044 + */
7045 +union irq_ctx {
7046 + struct thread_info tinfo;
7047 + u32 stack[THREAD_SIZE/sizeof(u32)];
7048 +};
7049 +
7050 +static union irq_ctx *hardirq_ctx[NR_CPUS];
7051 +static union irq_ctx *softirq_ctx[NR_CPUS];
7052 +#endif
7053 +
7054 +/*
7055 + * do_IRQ handles all normal device IRQs (the special
7056 + * SMP cross-CPU interrupts have their own specific
7057 + * handlers).
7058 + */
7059 +fastcall unsigned int do_IRQ(struct pt_regs *regs)
7060 +{
7061 + /* high bit used in ret_from_ code */
7062 + int irq = ~regs->orig_eax;
7063 +#ifdef CONFIG_4KSTACKS
7064 + union irq_ctx *curctx, *irqctx;
7065 + u32 *isp;
7066 +#endif
7067 +
7068 + irq_enter();
7069 +#ifdef CONFIG_DEBUG_STACKOVERFLOW
7070 + /* Debugging check for stack overflow: is there less than 1KB free? */
7071 + {
7072 + long esp;
7073 +
7074 + __asm__ __volatile__("andl %%esp,%0" :
7075 + "=r" (esp) : "0" (THREAD_SIZE - 1));
7076 + if (unlikely(esp < (sizeof(struct thread_info) + STACK_WARN))) {
7077 + printk("do_IRQ: stack overflow: %ld\n",
7078 + esp - sizeof(struct thread_info));
7079 + dump_stack();
7080 + }
7081 + }
7082 +#endif
7083 +
7084 +#ifdef CONFIG_4KSTACKS
7085 +
7086 + curctx = (union irq_ctx *) current_thread_info();
7087 + irqctx = hardirq_ctx[smp_processor_id()];
7088 +
7089 + /*
7090 + * this is where we switch to the IRQ stack. However, if we are
7091 + * already using the IRQ stack (because we interrupted a hardirq
7092 + * handler) we can't do that and just have to keep using the
7093 + * current stack (which is the irq stack already after all)
7094 + */
7095 + if (curctx != irqctx) {
7096 + int arg1, arg2, ebx;
7097 +
7098 + /* build the stack frame on the IRQ stack */
7099 + isp = (u32*) ((char*)irqctx + sizeof(*irqctx));
7100 + irqctx->tinfo.task = curctx->tinfo.task;
7101 + irqctx->tinfo.previous_esp = current_stack_pointer;
7102 +
7103 + asm volatile(
7104 + " xchgl %%ebx,%%esp \n"
7105 + " call __do_IRQ \n"
7106 + " movl %%ebx,%%esp \n"
7107 + : "=a" (arg1), "=d" (arg2), "=b" (ebx)
7108 + : "0" (irq), "1" (regs), "2" (isp)
7109 + : "memory", "cc", "ecx"
7110 + );
7111 + } else
7112 +#endif
7113 + __do_IRQ(irq, regs);
7114 +
7115 + irq_exit();
7116 +
7117 + return 1;
7118 +}
7119 +
7120 +#ifdef CONFIG_4KSTACKS
7121 +
7122 +/*
7123 + * These should really be __section__(".bss.page_aligned") as well, but
7124 + * gcc 3.0 and earlier don't handle that correctly.
7125 + */
7126 +static char softirq_stack[NR_CPUS * THREAD_SIZE]
7127 + __attribute__((__aligned__(THREAD_SIZE)));
7128 +
7129 +static char hardirq_stack[NR_CPUS * THREAD_SIZE]
7130 + __attribute__((__aligned__(THREAD_SIZE)));
7131 +
7132 +/*
7133 + * allocate per-cpu stacks for hardirq and for softirq processing
7134 + */
7135 +void irq_ctx_init(int cpu)
7136 +{
7137 + union irq_ctx *irqctx;
7138 +
7139 + if (hardirq_ctx[cpu])
7140 + return;
7141 +
7142 + irqctx = (union irq_ctx*) &hardirq_stack[cpu*THREAD_SIZE];
7143 + irqctx->tinfo.task = NULL;
7144 + irqctx->tinfo.exec_domain = NULL;
7145 + irqctx->tinfo.cpu = cpu;
7146 + irqctx->tinfo.preempt_count = HARDIRQ_OFFSET;
7147 + irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
7148 +
7149 + hardirq_ctx[cpu] = irqctx;
7150 +
7151 + irqctx = (union irq_ctx*) &softirq_stack[cpu*THREAD_SIZE];
7152 + irqctx->tinfo.task = NULL;
7153 + irqctx->tinfo.exec_domain = NULL;
7154 + irqctx->tinfo.cpu = cpu;
7155 + irqctx->tinfo.preempt_count = SOFTIRQ_OFFSET;
7156 + irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
7157 +
7158 + softirq_ctx[cpu] = irqctx;
7159 +
7160 + printk("CPU %u irqstacks, hard=%p soft=%p\n",
7161 + cpu,hardirq_ctx[cpu],softirq_ctx[cpu]);
7162 +}
7163 +
7164 +void irq_ctx_exit(int cpu)
7165 +{
7166 + hardirq_ctx[cpu] = NULL;
7167 +}
7168 +
7169 +extern asmlinkage void __do_softirq(void);
7170 +
7171 +asmlinkage void do_softirq(void)
7172 +{
7173 + unsigned long flags;
7174 + struct thread_info *curctx;
7175 + union irq_ctx *irqctx;
7176 + u32 *isp;
7177 +
7178 + if (in_interrupt())
7179 + return;
7180 +
7181 + local_irq_save(flags);
7182 +
7183 + if (local_softirq_pending()) {
7184 + curctx = current_thread_info();
7185 + irqctx = softirq_ctx[smp_processor_id()];
7186 + irqctx->tinfo.task = curctx->task;
7187 + irqctx->tinfo.previous_esp = current_stack_pointer;
7188 +
7189 + /* build the stack frame on the softirq stack */
7190 + isp = (u32*) ((char*)irqctx + sizeof(*irqctx));
7191 +
7192 + asm volatile(
7193 + " xchgl %%ebx,%%esp \n"
7194 + " call __do_softirq \n"
7195 + " movl %%ebx,%%esp \n"
7196 + : "=b"(isp)
7197 + : "0"(isp)
7198 + : "memory", "cc", "edx", "ecx", "eax"
7199 + );
7200 + }
7201 +
7202 + local_irq_restore(flags);
7203 +}
7204 +
7205 +EXPORT_SYMBOL(do_softirq);
7206 +#endif
7207 +
7208 +/*
7209 + * Interrupt statistics:
7210 + */
7211 +
7212 +atomic_t irq_err_count;
7213 +
7214 +/*
7215 + * /proc/interrupts printing:
7216 + */
7217 +
7218 +int show_interrupts(struct seq_file *p, void *v)
7219 +{
7220 + int i = *(loff_t *) v, j;
7221 + struct irqaction * action;
7222 + unsigned long flags;
7223 +
7224 + if (i == 0) {
7225 + seq_printf(p, " ");
7226 + for_each_online_cpu(j)
7227 + seq_printf(p, "CPU%d ",j);
7228 + seq_putc(p, '\n');
7229 + }
7230 +
7231 + if (i < NR_IRQS) {
7232 + spin_lock_irqsave(&irq_desc[i].lock, flags);
7233 + action = irq_desc[i].action;
7234 + if (!action)
7235 + goto skip;
7236 + seq_printf(p, "%3d: ",i);
7237 +#ifndef CONFIG_SMP
7238 + seq_printf(p, "%10u ", kstat_irqs(i));
7239 +#else
7240 + for_each_online_cpu(j)
7241 + seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]);
7242 +#endif
7243 + seq_printf(p, " %14s", irq_desc[i].handler->typename);
7244 + seq_printf(p, " %s", action->name);
7245 +
7246 + for (action=action->next; action; action = action->next)
7247 + seq_printf(p, ", %s", action->name);
7248 +
7249 + seq_putc(p, '\n');
7250 +skip:
7251 + spin_unlock_irqrestore(&irq_desc[i].lock, flags);
7252 + } else if (i == NR_IRQS) {
7253 + seq_printf(p, "NMI: ");
7254 + for_each_online_cpu(j)
7255 + seq_printf(p, "%10u ", nmi_count(j));
7256 + seq_putc(p, '\n');
7257 +#ifdef CONFIG_X86_LOCAL_APIC
7258 + seq_printf(p, "LOC: ");
7259 + for_each_online_cpu(j)
7260 + seq_printf(p, "%10u ",
7261 + per_cpu(irq_stat,j).apic_timer_irqs);
7262 + seq_putc(p, '\n');
7263 +#endif
7264 + seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count));
7265 +#if defined(CONFIG_X86_IO_APIC)
7266 + seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count));
7267 +#endif
7268 + }
7269 + return 0;
7270 +}
7271 +
7272 +#ifdef CONFIG_HOTPLUG_CPU
7273 +
7274 +void fixup_irqs(cpumask_t map)
7275 +{
7276 + unsigned int irq;
7277 + static int warned;
7278 +
7279 + for (irq = 0; irq < NR_IRQS; irq++) {
7280 + cpumask_t mask;
7281 + if (irq == 2)
7282 + continue;
7283 +
7284 + cpus_and(mask, irq_affinity[irq], map);
7285 + if (any_online_cpu(mask) == NR_CPUS) {
7286 + /*printk("Breaking affinity for irq %i\n", irq);*/
7287 + mask = map;
7288 + }
7289 + if (irq_desc[irq].handler->set_affinity)
7290 + irq_desc[irq].handler->set_affinity(irq, mask);
7291 + else if (irq_desc[irq].action && !(warned++))
7292 + printk("Cannot set affinity for irq %i\n", irq);
7293 + }
7294 +
7295 +#if 0
7296 + barrier();
7297 + /* Ingo Molnar says: "after the IO-APIC masks have been redirected
7298 + [note the nop - the interrupt-enable boundary on x86 is two
7299 + instructions from sti] - to flush out pending hardirqs and
7300 + IPIs. After this point nothing is supposed to reach this CPU." */
7301 + __asm__ __volatile__("sti; nop; cli");
7302 + barrier();
7303 +#else
7304 + /* That doesn't seem sufficient. Give it 1ms. */
7305 + local_irq_enable();
7306 + mdelay(1);
7307 + local_irq_disable();
7308 +#endif
7309 +}
7310 +#endif
7311 +
7312 diff -Nur linux-2.6.16.33-noxen/arch/i386/kernel/irq.c linux-2.6.16.33/arch/i386/kernel/irq.c
7313 --- linux-2.6.16.33-noxen/arch/i386/kernel/irq.c 2006-11-22 18:06:31.000000000 +0000
7314 +++ linux-2.6.16.33/arch/i386/kernel/irq.c 2007-05-23 21:00:01.000000000 +0000
7315 @@ -53,8 +53,8 @@
7316 */
7317 fastcall unsigned int do_IRQ(struct pt_regs *regs)
7318 {
7319 - /* high bits used in ret_from_ code */
7320 - int irq = regs->orig_eax & 0xff;
7321 + /* high bit used in ret_from_ code */
7322 + int irq = ~regs->orig_eax;
7323 #ifdef CONFIG_4KSTACKS
7324 union irq_ctx *curctx, *irqctx;
7325 u32 *isp;
7326 diff -Nur linux-2.6.16.33-noxen/arch/i386/kernel/ldt-xen.c linux-2.6.16.33/arch/i386/kernel/ldt-xen.c
7327 --- linux-2.6.16.33-noxen/arch/i386/kernel/ldt-xen.c 1970-01-01 00:00:00.000000000 +0000
7328 +++ linux-2.6.16.33/arch/i386/kernel/ldt-xen.c 2007-01-08 15:00:45.000000000 +0000
7329 @@ -0,0 +1,270 @@
7330 +/*
7331 + * linux/kernel/ldt.c
7332 + *
7333 + * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds
7334 + * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
7335 + */
7336 +
7337 +#include <linux/errno.h>
7338 +#include <linux/sched.h>
7339 +#include <linux/string.h>
7340 +#include <linux/mm.h>
7341 +#include <linux/smp.h>
7342 +#include <linux/smp_lock.h>
7343 +#include <linux/vmalloc.h>
7344 +#include <linux/slab.h>
7345 +
7346 +#include <asm/uaccess.h>
7347 +#include <asm/system.h>
7348 +#include <asm/ldt.h>
7349 +#include <asm/desc.h>
7350 +#include <asm/mmu_context.h>
7351 +
7352 +#ifdef CONFIG_SMP /* avoids "defined but not used" warning */
7353 +static void flush_ldt(void *null)
7354 +{
7355 + if (current->active_mm)
7356 + load_LDT(&current->active_mm->context);
7357 +}
7358 +#endif
7359 +
7360 +static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
7361 +{
7362 + void *oldldt;
7363 + void *newldt;
7364 + int oldsize;
7365 +
7366 + if (mincount <= pc->size)
7367 + return 0;
7368 + oldsize = pc->size;
7369 + mincount = (mincount+511)&(~511);
7370 + if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE)
7371 + newldt = vmalloc(mincount*LDT_ENTRY_SIZE);
7372 + else
7373 + newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL);
7374 +
7375 + if (!newldt)
7376 + return -ENOMEM;
7377 +
7378 + if (oldsize)
7379 + memcpy(newldt, pc->ldt, oldsize*LDT_ENTRY_SIZE);
7380 + oldldt = pc->ldt;
7381 + memset(newldt+oldsize*LDT_ENTRY_SIZE, 0, (mincount-oldsize)*LDT_ENTRY_SIZE);
7382 + pc->ldt = newldt;
7383 + wmb();
7384 + pc->size = mincount;
7385 + wmb();
7386 +
7387 + if (reload) {
7388 +#ifdef CONFIG_SMP
7389 + cpumask_t mask;
7390 + preempt_disable();
7391 +#endif
7392 + make_pages_readonly(
7393 + pc->ldt,
7394 + (pc->size * LDT_ENTRY_SIZE) / PAGE_SIZE,
7395 + XENFEAT_writable_descriptor_tables);
7396 + load_LDT(pc);
7397 +#ifdef CONFIG_SMP
7398 + mask = cpumask_of_cpu(smp_processor_id());
7399 + if (!cpus_equal(current->mm->cpu_vm_mask, mask))
7400 + smp_call_function(flush_ldt, NULL, 1, 1);
7401 + preempt_enable();
7402 +#endif
7403 + }
7404 + if (oldsize) {
7405 + make_pages_writable(
7406 + oldldt,
7407 + (oldsize * LDT_ENTRY_SIZE) / PAGE_SIZE,
7408 + XENFEAT_writable_descriptor_tables);
7409 + if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE)
7410 + vfree(oldldt);
7411 + else
7412 + kfree(oldldt);
7413 + }
7414 + return 0;
7415 +}
7416 +
7417 +static inline int copy_ldt(mm_context_t *new, mm_context_t *old)
7418 +{
7419 + int err = alloc_ldt(new, old->size, 0);
7420 + if (err < 0)
7421 + return err;
7422 + memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE);
7423 + make_pages_readonly(
7424 + new->ldt,
7425 + (new->size * LDT_ENTRY_SIZE) / PAGE_SIZE,
7426 + XENFEAT_writable_descriptor_tables);
7427 + return 0;
7428 +}
7429 +
7430 +/*
7431 + * we do not have to muck with descriptors here, that is
7432 + * done in switch_mm() as needed.
7433 + */
7434 +int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
7435 +{
7436 + struct mm_struct * old_mm;
7437 + int retval = 0;
7438 +
7439 + init_MUTEX(&mm->context.sem);
7440 + mm->context.size = 0;
7441 + mm->context.has_foreign_mappings = 0;
7442 + old_mm = current->mm;
7443 + if (old_mm && old_mm->context.size > 0) {
7444 + down(&old_mm->context.sem);
7445 + retval = copy_ldt(&mm->context, &old_mm->context);
7446 + up(&old_mm->context.sem);
7447 + }
7448 + return retval;
7449 +}
7450 +
7451 +/*
7452 + * No need to lock the MM as we are the last user
7453 + */
7454 +void destroy_context(struct mm_struct *mm)
7455 +{
7456 + if (mm->context.size) {
7457 + if (mm == current->active_mm)
7458 + clear_LDT();
7459 + make_pages_writable(
7460 + mm->context.ldt,
7461 + (mm->context.size * LDT_ENTRY_SIZE) / PAGE_SIZE,
7462 + XENFEAT_writable_descriptor_tables);
7463 + if (mm->context.size*LDT_ENTRY_SIZE > PAGE_SIZE)
7464 + vfree(mm->context.ldt);
7465 + else
7466 + kfree(mm->context.ldt);
7467 + mm->context.size = 0;
7468 + }
7469 +}
7470 +
7471 +static int read_ldt(void __user * ptr, unsigned long bytecount)
7472 +{
7473 + int err;
7474 + unsigned long size;
7475 + struct mm_struct * mm = current->mm;
7476 +
7477 + if (!mm->context.size)
7478 + return 0;
7479 + if (bytecount > LDT_ENTRY_SIZE*LDT_ENTRIES)
7480 + bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES;
7481 +
7482 + down(&mm->context.sem);
7483 + size = mm->context.size*LDT_ENTRY_SIZE;
7484 + if (size > bytecount)
7485 + size = bytecount;
7486 +
7487 + err = 0;
7488 + if (copy_to_user(ptr, mm->context.ldt, size))
7489 + err = -EFAULT;
7490 + up(&mm->context.sem);
7491 + if (err < 0)
7492 + goto error_return;
7493 + if (size != bytecount) {
7494 + /* zero-fill the rest */
7495 + if (clear_user(ptr+size, bytecount-size) != 0) {
7496 + err = -EFAULT;
7497 + goto error_return;
7498 + }
7499 + }
7500 + return bytecount;
7501 +error_return:
7502 + return err;
7503 +}
7504 +
7505 +static int read_default_ldt(void __user * ptr, unsigned long bytecount)
7506 +{
7507 + int err;
7508 + unsigned long size;
7509 + void *address;
7510 +
7511 + err = 0;
7512 + address = &default_ldt[0];
7513 + size = 5*sizeof(struct desc_struct);
7514 + if (size > bytecount)
7515 + size = bytecount;
7516 +
7517 + err = size;
7518 + if (copy_to_user(ptr, address, size))
7519 + err = -EFAULT;
7520 +
7521 + return err;
7522 +}
7523 +
7524 +static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode)
7525 +{
7526 + struct mm_struct * mm = current->mm;
7527 + __u32 entry_1, entry_2;
7528 + int error;
7529 + struct user_desc ldt_info;
7530 +
7531 + error = -EINVAL;
7532 + if (bytecount != sizeof(ldt_info))
7533 + goto out;
7534 + error = -EFAULT;
7535 + if (copy_from_user(&ldt_info, ptr, sizeof(ldt_info)))
7536 + goto out;
7537 +
7538 + error = -EINVAL;
7539 + if (ldt_info.entry_number >= LDT_ENTRIES)
7540 + goto out;
7541 + if (ldt_info.contents == 3) {
7542 + if (oldmode)
7543 + goto out;
7544 + if (ldt_info.seg_not_present == 0)
7545 + goto out;
7546 + }
7547 +
7548 + down(&mm->context.sem);
7549 + if (ldt_info.entry_number >= mm->context.size) {
7550 + error = alloc_ldt(&current->mm->context, ldt_info.entry_number+1, 1);
7551 + if (error < 0)
7552 + goto out_unlock;
7553 + }
7554 +
7555 + /* Allow LDTs to be cleared by the user. */
7556 + if (ldt_info.base_addr == 0 && ldt_info.limit == 0) {
7557 + if (oldmode || LDT_empty(&ldt_info)) {
7558 + entry_1 = 0;
7559 + entry_2 = 0;
7560 + goto install;
7561 + }
7562 + }
7563 +
7564 + entry_1 = LDT_entry_a(&ldt_info);
7565 + entry_2 = LDT_entry_b(&ldt_info);
7566 + if (oldmode)
7567 + entry_2 &= ~(1 << 20);
7568 +
7569 + /* Install the new entry ... */
7570 +install:
7571 + error = write_ldt_entry(mm->context.ldt, ldt_info.entry_number,
7572 + entry_1, entry_2);
7573 +
7574 +out_unlock:
7575 + up(&mm->context.sem);
7576 +out:
7577 + return error;
7578 +}
7579 +
7580 +asmlinkage int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount)
7581 +{
7582 + int ret = -ENOSYS;
7583 +
7584 + switch (func) {
7585 + case 0:
7586 + ret = read_ldt(ptr, bytecount);
7587 + break;
7588 + case 1:
7589 + ret = write_ldt(ptr, bytecount, 1);
7590 + break;
7591 + case 2:
7592 + ret = read_default_ldt(ptr, bytecount);
7593 + break;
7594 + case 0x11:
7595 + ret = write_ldt(ptr, bytecount, 0);
7596 + break;
7597 + }
7598 + return ret;
7599 +}
7600 diff -Nur linux-2.6.16.33-noxen/arch/i386/kernel/machine_kexec.c linux-2.6.16.33/arch/i386/kernel/machine_kexec.c
7601 --- linux-2.6.16.33-noxen/arch/i386/kernel/machine_kexec.c 2006-11-22 18:06:31.000000000 +0000
7602 +++ linux-2.6.16.33/arch/i386/kernel/machine_kexec.c 2007-01-08 15:00:45.000000000 +0000
7603 @@ -19,123 +19,52 @@
7604 #include <asm/desc.h>
7605 #include <asm/system.h>
7606
7607 -#define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE)))
7608 -
7609 -#define L0_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
7610 -#define L1_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
7611 -#define L2_ATTR (_PAGE_PRESENT)
7612 -
7613 -#define LEVEL0_SIZE (1UL << 12UL)
7614 -
7615 -#ifndef CONFIG_X86_PAE
7616 -#define LEVEL1_SIZE (1UL << 22UL)
7617 -static u32 pgtable_level1[1024] PAGE_ALIGNED;
7618 -
7619 -static void identity_map_page(unsigned long address)
7620 -{
7621 - unsigned long level1_index, level2_index;
7622 - u32 *pgtable_level2;
7623 -
7624 - /* Find the current page table */
7625 - pgtable_level2 = __va(read_cr3());
7626 -
7627 - /* Find the indexes of the physical address to identity map */
7628 - level1_index = (address % LEVEL1_SIZE)/LEVEL0_SIZE;
7629 - level2_index = address / LEVEL1_SIZE;
7630 -
7631 - /* Identity map the page table entry */
7632 - pgtable_level1[level1_index] = address | L0_ATTR;
7633 - pgtable_level2[level2_index] = __pa(pgtable_level1) | L1_ATTR;
7634 -
7635 - /* Flush the tlb so the new mapping takes effect.
7636 - * Global tlb entries are not flushed but that is not an issue.
7637 - */
7638 - load_cr3(pgtable_level2);
7639 -}
7640 -
7641 -#else
7642 -#define LEVEL1_SIZE (1UL << 21UL)
7643 -#define LEVEL2_SIZE (1UL << 30UL)
7644 -static u64 pgtable_level1[512] PAGE_ALIGNED;
7645 -static u64 pgtable_level2[512] PAGE_ALIGNED;
7646 -
7647 -static void identity_map_page(unsigned long address)
7648 -{
7649 - unsigned long level1_index, level2_index, level3_index;
7650 - u64 *pgtable_level3;
7651 -
7652 - /* Find the current page table */
7653 - pgtable_level3 = __va(read_cr3());
7654 +#ifdef CONFIG_XEN
7655 +#include <xen/interface/kexec.h>
7656 +#endif
7657
7658 - /* Find the indexes of the physical address to identity map */
7659 - level1_index = (address % LEVEL1_SIZE)/LEVEL0_SIZE;
7660 - level2_index = (address % LEVEL2_SIZE)/LEVEL1_SIZE;
7661 - level3_index = address / LEVEL2_SIZE;
7662 -
7663 - /* Identity map the page table entry */
7664 - pgtable_level1[level1_index] = address | L0_ATTR;
7665 - pgtable_level2[level2_index] = __pa(pgtable_level1) | L1_ATTR;
7666 - set_64bit(&pgtable_level3[level3_index],
7667 - __pa(pgtable_level2) | L2_ATTR);
7668 -
7669 - /* Flush the tlb so the new mapping takes effect.
7670 - * Global tlb entries are not flushed but that is not an issue.
7671 - */
7672 - load_cr3(pgtable_level3);
7673 -}
7674 +#define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE)))
7675 +static u32 kexec_pgd[1024] PAGE_ALIGNED;
7676 +#ifdef CONFIG_X86_PAE
7677 +static u32 kexec_pmd0[1024] PAGE_ALIGNED;
7678 +static u32 kexec_pmd1[1024] PAGE_ALIGNED;
7679 #endif
7680 +static u32 kexec_pte0[1024] PAGE_ALIGNED;
7681 +static u32 kexec_pte1[1024] PAGE_ALIGNED;
7682
7683 -static void set_idt(void *newidt, __u16 limit)
7684 -{
7685 - struct Xgt_desc_struct curidt;
7686 +#ifdef CONFIG_XEN
7687
7688 - /* ia32 supports unaliged loads & stores */
7689 - curidt.size = limit;
7690 - curidt.address = (unsigned long)newidt;
7691 +#define __ma(x) (pfn_to_mfn(__pa((x)) >> PAGE_SHIFT) << PAGE_SHIFT)
7692
7693 - load_idt(&curidt);
7694 -};
7695 +#if PAGES_NR > KEXEC_XEN_NO_PAGES
7696 +#error PAGES_NR is greater than KEXEC_XEN_NO_PAGES - Xen support will break
7697 +#endif
7698
7699 +#if PA_CONTROL_PAGE != 0
7700 +#error PA_CONTROL_PAGE is non zero - Xen support will break
7701 +#endif
7702
7703 -static void set_gdt(void *newgdt, __u16 limit)
7704 +void machine_kexec_setup_load_arg(xen_kexec_image_t *xki, struct kimage *image)
7705 {
7706 - struct Xgt_desc_struct curgdt;
7707 + void *control_page;
7708
7709 - /* ia32 supports unaligned loads & stores */
7710 - curgdt.size = limit;
7711 - curgdt.address = (unsigned long)newgdt;
7712 + memset(xki->page_list, 0, sizeof(xki->page_list));
7713
7714 - load_gdt(&curgdt);
7715 -};
7716 + control_page = page_address(image->control_code_page);
7717 + memcpy(control_page, relocate_kernel, PAGE_SIZE);
7718
7719 -static void load_segments(void)
7720 -{
7721 -#define __STR(X) #X
7722 -#define STR(X) __STR(X)
7723 + xki->page_list[PA_CONTROL_PAGE] = __ma(control_page);
7724 + xki->page_list[PA_PGD] = __ma(kexec_pgd);
7725 +#ifdef CONFIG_X86_PAE
7726 + xki->page_list[PA_PMD_0] = __ma(kexec_pmd0);
7727 + xki->page_list[PA_PMD_1] = __ma(kexec_pmd1);
7728 +#endif
7729 + xki->page_list[PA_PTE_0] = __ma(kexec_pte0);
7730 + xki->page_list[PA_PTE_1] = __ma(kexec_pte1);
7731
7732 - __asm__ __volatile__ (
7733 - "\tljmp $"STR(__KERNEL_CS)",$1f\n"
7734 - "\t1:\n"
7735 - "\tmovl $"STR(__KERNEL_DS)",%%eax\n"
7736 - "\tmovl %%eax,%%ds\n"
7737 - "\tmovl %%eax,%%es\n"
7738 - "\tmovl %%eax,%%fs\n"
7739 - "\tmovl %%eax,%%gs\n"
7740 - "\tmovl %%eax,%%ss\n"
7741 - ::: "eax", "memory");
7742 -#undef STR
7743 -#undef __STR
7744 }
7745
7746 -typedef asmlinkage NORET_TYPE void (*relocate_new_kernel_t)(
7747 - unsigned long indirection_page,
7748 - unsigned long reboot_code_buffer,
7749 - unsigned long start_address,
7750 - unsigned int has_pae) ATTRIB_NORET;
7751 -
7752 -const extern unsigned char relocate_new_kernel[];
7753 -extern void relocate_new_kernel_end(void);
7754 -const extern unsigned int relocate_new_kernel_size;
7755 +#endif /* CONFIG_XEN */
7756
7757 /*
7758 * A architecture hook called to validate the
7759 @@ -163,52 +92,38 @@
7760 {
7761 }
7762
7763 +#ifndef CONFIG_XEN
7764 /*
7765 * Do not allocate memory (or fail in any way) in machine_kexec().
7766 * We are past the point of no return, committed to rebooting now.
7767 */
7768 NORET_TYPE void machine_kexec(struct kimage *image)
7769 {
7770 - unsigned long page_list;
7771 - unsigned long reboot_code_buffer;
7772 -
7773 - relocate_new_kernel_t rnk;
7774 + unsigned long page_list[PAGES_NR];
7775 + void *control_page;
7776
7777 /* Interrupts aren't acceptable while we reboot */
7778 local_irq_disable();
7779
7780 - /* Compute some offsets */
7781 - reboot_code_buffer = page_to_pfn(image->control_code_page)
7782 - << PAGE_SHIFT;
7783 - page_list = image->head;
7784 -
7785 - /* Set up an identity mapping for the reboot_code_buffer */
7786 - identity_map_page(reboot_code_buffer);
7787 -
7788 - /* copy it out */
7789 - memcpy((void *)reboot_code_buffer, relocate_new_kernel,
7790 - relocate_new_kernel_size);
7791 -
7792 - /* The segment registers are funny things, they are
7793 - * automatically loaded from a table, in memory wherever you
7794 - * set them to a specific selector, but this table is never
7795 - * accessed again you set the segment to a different selector.
7796 - *
7797 - * The more common model is are caches where the behide
7798 - * the scenes work is done, but is also dropped at arbitrary
7799 - * times.
7800 - *
7801 - * I take advantage of this here by force loading the
7802 - * segments, before I zap the gdt with an invalid value.
7803 - */
7804 - load_segments();
7805 - /* The gdt & idt are now invalid.
7806 - * If you want to load them you must set up your own idt & gdt.
7807 - */
7808 - set_gdt(phys_to_virt(0),0);
7809 - set_idt(phys_to_virt(0),0);
7810 -
7811 - /* now call it */
7812 - rnk = (relocate_new_kernel_t) reboot_code_buffer;
7813 - (*rnk)(page_list, reboot_code_buffer, image->start, cpu_has_pae);
7814 + control_page = page_address(image->control_code_page);
7815 + memcpy(control_page, relocate_kernel, PAGE_SIZE);
7816 +
7817 + page_list[PA_CONTROL_PAGE] = __pa(control_page);
7818 + page_list[VA_CONTROL_PAGE] = (unsigned long)relocate_kernel;
7819 + page_list[PA_PGD] = __pa(kexec_pgd);
7820 + page_list[VA_PGD] = (unsigned long)kexec_pgd;
7821 +#ifdef CONFIG_X86_PAE
7822 + page_list[PA_PMD_0] = __pa(kexec_pmd0);
7823 + page_list[VA_PMD_0] = (unsigned long)kexec_pmd0;
7824 + page_list[PA_PMD_1] = __pa(kexec_pmd1);
7825 + page_list[VA_PMD_1] = (unsigned long)kexec_pmd1;
7826 +#endif
7827 + page_list[PA_PTE_0] = __pa(kexec_pte0);
7828 + page_list[VA_PTE_0] = (unsigned long)kexec_pte0;
7829 + page_list[PA_PTE_1] = __pa(kexec_pte1);
7830 + page_list[VA_PTE_1] = (unsigned long)kexec_pte1;
7831 +
7832 + relocate_kernel((unsigned long)image->head, (unsigned long)page_list,
7833 + image->start, cpu_has_pae);
7834 }
7835 +#endif
7836 diff -Nur linux-2.6.16.33-noxen/arch/i386/kernel/machine_kexec.c~ linux-2.6.16.33/arch/i386/kernel/machine_kexec.c~
7837 --- linux-2.6.16.33-noxen/arch/i386/kernel/machine_kexec.c~ 1970-01-01 00:00:00.000000000 +0000
7838 +++ linux-2.6.16.33/arch/i386/kernel/machine_kexec.c~ 2007-05-23 21:00:01.000000000 +0000
7839 @@ -0,0 +1,148 @@
7840 +/*
7841 + * machine_kexec.c - handle transition of Linux booting another kernel
7842 + * Copyright (C) 2002-2005 Eric Biederman <ebiederm@xmission.com>
7843 + *
7844 + * This source code is licensed under the GNU General Public License,
7845 + * Version 2. See the file COPYING for more details.
7846 + */
7847 +
7848 +#include <linux/mm.h>
7849 +#include <linux/kexec.h>
7850 +#include <linux/delay.h>
7851 +#include <asm/pgtable.h>
7852 +#include <asm/pgalloc.h>
7853 +#include <asm/tlbflush.h>
7854 +#include <asm/mmu_context.h>
7855 +#include <asm/io.h>
7856 +#include <asm/apic.h>
7857 +#include <asm/cpufeature.h>
7858 +#include <asm/desc.h>
7859 +#include <asm/system.h>
7860 +
7861 +#define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE)))
7862 +static u32 kexec_pgd[1024] PAGE_ALIGNED;
7863 +#ifdef CONFIG_X86_PAE
7864 +static u32 kexec_pmd0[1024] PAGE_ALIGNED;
7865 +static u32 kexec_pmd1[1024] PAGE_ALIGNED;
7866 +#endif
7867 +static u32 kexec_pte0[1024] PAGE_ALIGNED;
7868 +static u32 kexec_pte1[1024] PAGE_ALIGNED;
7869 +
7870 +static void set_idt(void *newidt, __u16 limit)
7871 +{
7872 + struct Xgt_desc_struct curidt;
7873 +
7874 + /* ia32 supports unaligned loads & stores */
7875 + curidt.size = limit;
7876 + curidt.address = (unsigned long)newidt;
7877 +
7878 + load_idt(&curidt);
7879 +};
7880 +
7881 +
7882 +static void set_gdt(void *newgdt, __u16 limit)
7883 +{
7884 + struct Xgt_desc_struct curgdt;
7885 +
7886 + /* ia32 supports unaligned loads & stores */
7887 + curgdt.size = limit;
7888 + curgdt.address = (unsigned long)newgdt;
7889 +
7890 + load_gdt(&curgdt);
7891 +};
7892 +
7893 +static void load_segments(void)
7894 +{
7895 +#define __STR(X) #X
7896 +#define STR(X) __STR(X)
7897 +
7898 + __asm__ __volatile__ (
7899 + "\tljmp $"STR(__KERNEL_CS)",$1f\n"
7900 + "\t1:\n"
7901 + "\tmovl $"STR(__KERNEL_DS)",%%eax\n"
7902 + "\tmovl %%eax,%%ds\n"
7903 + "\tmovl %%eax,%%es\n"
7904 + "\tmovl %%eax,%%fs\n"
7905 + "\tmovl %%eax,%%gs\n"
7906 + "\tmovl %%eax,%%ss\n"
7907 + ::: "eax", "memory");
7908 +#undef STR
7909 +#undef __STR
7910 +}
7911 +
7912 +/*
7913 + * An architecture hook called to validate the
7914 + * proposed image and prepare the control pages
7915 + * as needed. The pages for KEXEC_CONTROL_CODE_SIZE
7916 + * have been allocated, but the segments have not yet
7917 + * been copied into the kernel.
7918 + *
7919 + * Do whatever setup is needed on the image and the
7920 + * reboot code buffer to allow us to avoid allocations
7921 + * later.
7922 + *
7923 + * Currently nothing.
7924 + */
7925 +int machine_kexec_prepare(struct kimage *image)
7926 +{
7927 + return 0;
7928 +}
7929 +
7930 +/*
7931 + * Undo anything leftover by machine_kexec_prepare
7932 + * when an image is freed.
7933 + */
7934 +void machine_kexec_cleanup(struct kimage *image)
7935 +{
7936 +}
7937 +
7938 +/*
7939 + * Do not allocate memory (or fail in any way) in machine_kexec().
7940 + * We are past the point of no return, committed to rebooting now.
7941 + */
7942 +NORET_TYPE void machine_kexec(struct kimage *image)
7943 +{
7944 + unsigned long page_list[PAGES_NR];
7945 + void *control_page;
7946 +
7947 + /* Interrupts aren't acceptable while we reboot */
7948 + local_irq_disable();
7949 +
7950 + control_page = page_address(image->control_code_page);
7951 + memcpy(control_page, relocate_kernel, PAGE_SIZE);
7952 +
7953 + page_list[PA_CONTROL_PAGE] = __pa(control_page);
7954 + page_list[VA_CONTROL_PAGE] = (unsigned long)relocate_kernel;
7955 + page_list[PA_PGD] = __pa(kexec_pgd);
7956 + page_list[VA_PGD] = (unsigned long)kexec_pgd;
7957 +#ifdef CONFIG_X86_PAE
7958 + page_list[PA_PMD_0] = __pa(kexec_pmd0);
7959 + page_list[VA_PMD_0] = (unsigned long)kexec_pmd0;
7960 + page_list[PA_PMD_1] = __pa(kexec_pmd1);
7961 + page_list[VA_PMD_1] = (unsigned long)kexec_pmd1;
7962 +#endif
7963 + page_list[PA_PTE_0] = __pa(kexec_pte0);
7964 + page_list[VA_PTE_0] = (unsigned long)kexec_pte0;
7965 + page_list[PA_PTE_1] = __pa(kexec_pte1);
7966 + page_list[VA_PTE_1] = (unsigned long)kexec_pte1;
7967 +
7968 + /* The segment registers are funny things, they have both a
7969 + * visible and an invisible part. Whenever the visible part is
7970 + * set to a specific selector, the invisible part is loaded
7971 + * from a table in memory. At no other time is the
7972 + * descriptor table in memory accessed.
7973 + *
7974 + * I take advantage of this here by force loading the
7975 + * segments, before I zap the gdt with an invalid value.
7976 + */
7977 + load_segments();
7978 + /* The gdt & idt are now invalid.
7979 + * If you want to load them you must set up your own idt & gdt.
7980 + */
7981 + set_gdt(phys_to_virt(0),0);
7982 + set_idt(phys_to_virt(0),0);
7983 +
7984 + /* now call it */
7985 + relocate_kernel((unsigned long)image->head, (unsigned long)page_list,
7986 + image->start, cpu_has_pae);
7987 +}
7988 diff -Nur linux-2.6.16.33-noxen/arch/i386/kernel/microcode-xen.c linux-2.6.16.33/arch/i386/kernel/microcode-xen.c
7989 --- linux-2.6.16.33-noxen/arch/i386/kernel/microcode-xen.c 1970-01-01 00:00:00.000000000 +0000
7990 +++ linux-2.6.16.33/arch/i386/kernel/microcode-xen.c 2007-01-08 15:00:45.000000000 +0000
7991 @@ -0,0 +1,159 @@
7992 +/*
7993 + * Intel CPU Microcode Update Driver for Linux
7994 + *
7995 + * Copyright (C) 2000-2004 Tigran Aivazian
7996 + *
7997 + * This driver allows upgrading microcode on Intel processors
7998 + * belonging to the IA-32 family - PentiumPro, Pentium II,
7999 + * Pentium III, Xeon, Pentium 4, etc.
8000 + *
8001 + * Reference: Section 8.10 of Volume III, Intel Pentium 4 Manual,
8002 + * Order Number 245472 or free download from:
8003 + *
8004 + * http://developer.intel.com/design/pentium4/manuals/245472.htm
8005 + *
8006 + * For more information, go to http://www.urbanmyth.org/microcode
8007 + *
8008 + * This program is free software; you can redistribute it and/or
8009 + * modify it under the terms of the GNU General Public License
8010 + * as published by the Free Software Foundation; either version
8011 + * 2 of the License, or (at your option) any later version.
8012 + */
8013 +
8014 +//#define DEBUG /* pr_debug */
8015 +#include <linux/capability.h>
8016 +#include <linux/kernel.h>
8017 +#include <linux/init.h>
8018 +#include <linux/sched.h>
8019 +#include <linux/cpumask.h>
8020 +#include <linux/module.h>
8021 +#include <linux/slab.h>
8022 +#include <linux/vmalloc.h>
8023 +#include <linux/miscdevice.h>
8024 +#include <linux/spinlock.h>
8025 +#include <linux/mm.h>
8026 +#include <linux/syscalls.h>
8027 +
8028 +#include <asm/msr.h>
8029 +#include <asm/uaccess.h>
8030 +#include <asm/processor.h>
8031 +
8032 +MODULE_DESCRIPTION("Intel CPU (IA-32) Microcode Update Driver");
8033 +MODULE_AUTHOR("Tigran Aivazian <tigran@veritas.com>");
8034 +MODULE_LICENSE("GPL");
8035 +
8036 +#define MICROCODE_VERSION "1.14-xen"
8037 +
8038 +#define DEFAULT_UCODE_DATASIZE (2000) /* 2000 bytes */
8039 +#define MC_HEADER_SIZE (sizeof (microcode_header_t)) /* 48 bytes */
8040 +#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE) /* 2048 bytes */
8041 +
8042 +/* no concurrent ->write()s are allowed on /dev/cpu/microcode */
8043 +static DECLARE_MUTEX(microcode_sem);
8044 +
8045 +static int microcode_open (struct inode *unused1, struct file *unused2)
8046 +{
8047 + return capable(CAP_SYS_RAWIO) ? 0 : -EPERM;
8048 +}
8049 +
8050 +
8051 +static int do_microcode_update (const void __user *ubuf, size_t len)
8052 +{
8053 + int err;
8054 + void *kbuf;
8055 +
8056 + kbuf = vmalloc(len);
8057 + if (!kbuf)
8058 + return -ENOMEM;
8059 +
8060 + if (copy_from_user(kbuf, ubuf, len) == 0) {
8061 + dom0_op_t op;
8062 +
8063 + op.cmd = DOM0_MICROCODE;
8064 + set_xen_guest_handle(op.u.microcode.data, kbuf);
8065 + op.u.microcode.length = len;
8066 + err = HYPERVISOR_dom0_op(&op);
8067 + } else
8068 + err = -EFAULT;
8069 +
8070 + vfree(kbuf);
8071 +
8072 + return err;
8073 +}
8074 +
8075 +static ssize_t microcode_write (struct file *file, const char __user *buf, size_t len, loff_t *ppos)
8076 +{
8077 + ssize_t ret;
8078 +
8079 + if (len < DEFAULT_UCODE_TOTALSIZE) {
8080 + printk(KERN_ERR "microcode: not enough data\n");
8081 + return -EINVAL;
8082 + }
8083 +
8084 + down(&microcode_sem);
8085 +
8086 + ret = do_microcode_update(buf, len);
8087 + if (!ret)
8088 + ret = (ssize_t)len;
8089 +
8090 + up(&microcode_sem);
8091 +
8092 + return ret;
8093 +}
8094 +
8095 +static int microcode_ioctl (struct inode *inode, struct file *file,
8096 + unsigned int cmd, unsigned long arg)
8097 +{
8098 + switch (cmd) {
8099 + /*
8100 + * XXX: will be removed after microcode_ctl
8101 + * is updated to ignore failure of this ioctl()
8102 + */
8103 + case MICROCODE_IOCFREE:
8104 + return 0;
8105 + default:
8106 + return -EINVAL;
8107 + }
8108 + return -EINVAL;
8109 +}
8110 +
8111 +static struct file_operations microcode_fops = {
8112 + .owner = THIS_MODULE,
8113 + .write = microcode_write,
8114 + .ioctl = microcode_ioctl,
8115 + .open = microcode_open,
8116 +};
8117 +
8118 +static struct miscdevice microcode_dev = {
8119 + .minor = MICROCODE_MINOR,
8120 + .name = "microcode",
8121 + .devfs_name = "cpu/microcode",
8122 + .fops = &microcode_fops,
8123 +};
8124 +
8125 +static int __init microcode_init (void)
8126 +{
8127 + int error;
8128 +
8129 + error = misc_register(&microcode_dev);
8130 + if (error) {
8131 + printk(KERN_ERR
8132 + "microcode: can't misc_register on minor=%d\n",
8133 + MICROCODE_MINOR);
8134 + return error;
8135 + }
8136 +
8137 + printk(KERN_INFO
8138 + "IA-32 Microcode Update Driver: v" MICROCODE_VERSION " <tigran@veritas.com>\n");
8139 + return 0;
8140 +}
8141 +
8142 +static void __exit microcode_exit (void)
8143 +{
8144 + misc_deregister(&microcode_dev);
8145 + printk(KERN_INFO "IA-32 Microcode Update Driver v" MICROCODE_VERSION " unregistered\n");
8146 +}
8147 +
8148 +module_init(microcode_init)
8149 +module_exit(microcode_exit)
8150 +MODULE_ALIAS_MISCDEV(MICROCODE_MINOR);
8151 diff -Nur linux-2.6.16.33-noxen/arch/i386/kernel/mpparse-xen.c linux-2.6.16.33/arch/i386/kernel/mpparse-xen.c
8152 --- linux-2.6.16.33-noxen/arch/i386/kernel/mpparse-xen.c 1970-01-01 00:00:00.000000000 +0000
8153 +++ linux-2.6.16.33/arch/i386/kernel/mpparse-xen.c 2007-01-08 15:00:45.000000000 +0000
8154 @@ -0,0 +1,1188 @@
8155 +/*
8156 + * Intel Multiprocessor Specification 1.1 and 1.4
8157 + * compliant MP-table parsing routines.
8158 + *
8159 + * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
8160 + * (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com>
8161 + *
8162 + * Fixes
8163 + * Erich Boleyn : MP v1.4 and additional changes.
8164 + * Alan Cox : Added EBDA scanning
8165 + * Ingo Molnar : various cleanups and rewrites
8166 + * Maciej W. Rozycki: Bits for default MP configurations
8167 + * Paul Diefenbaugh: Added full ACPI support
8168 + */
8169 +
8170 +#include <linux/mm.h>
8171 +#include <linux/init.h>
8172 +#include <linux/acpi.h>
8173 +#include <linux/delay.h>
8174 +#include <linux/config.h>
8175 +#include <linux/bootmem.h>
8176 +#include <linux/smp_lock.h>
8177 +#include <linux/kernel_stat.h>
8178 +#include <linux/mc146818rtc.h>
8179 +#include <linux/bitops.h>
8180 +
8181 +#include <asm/smp.h>
8182 +#include <asm/acpi.h>
8183 +#include <asm/mtrr.h>
8184 +#include <asm/mpspec.h>
8185 +#include <asm/io_apic.h>
8186 +
8187 +#include <mach_apic.h>
8188 +#include <mach_mpparse.h>
8189 +#include <bios_ebda.h>
8190 +
8191 +/* Have we found an MP table */
8192 +int smp_found_config;
8193 +unsigned int __initdata maxcpus = NR_CPUS;
8194 +
8195 +#ifdef CONFIG_HOTPLUG_CPU
8196 +#define CPU_HOTPLUG_ENABLED (1)
8197 +#else
8198 +#define CPU_HOTPLUG_ENABLED (0)
8199 +#endif
8200 +
8201 +/*
8202 + * Various Linux-internal data structures created from the
8203 + * MP-table.
8204 + */
8205 +int apic_version [MAX_APICS];
8206 +int mp_bus_id_to_type [MAX_MP_BUSSES];
8207 +int mp_bus_id_to_node [MAX_MP_BUSSES];
8208 +int mp_bus_id_to_local [MAX_MP_BUSSES];
8209 +int quad_local_to_mp_bus_id [NR_CPUS/4][4];
8210 +int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 };
8211 +static int mp_current_pci_id;
8212 +
8213 +/* I/O APIC entries */
8214 +struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS];
8215 +
8216 +/* # of MP IRQ source entries */
8217 +struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
8218 +
8219 +/* MP IRQ source entries */
8220 +int mp_irq_entries;
8221 +
8222 +int nr_ioapics;
8223 +
8224 +int pic_mode;
8225 +unsigned long mp_lapic_addr;
8226 +
8227 +unsigned int def_to_bigsmp = 0;
8228 +
8229 +/* Processor that is doing the boot up */
8230 +unsigned int boot_cpu_physical_apicid = -1U;
8231 +/* Internal processor count */
8232 +static unsigned int __devinitdata num_processors;
8233 +
8234 +/* Bitmask of physically existing CPUs */
8235 +physid_mask_t phys_cpu_present_map;
8236 +
8237 +u8 bios_cpu_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
8238 +
8239 +/*
8240 + * Intel MP BIOS table parsing routines:
8241 + */
8242 +
8243 +
8244 +/*
8245 + * Checksum an MP configuration block.
8246 + */
8247 +
8248 +static int __init mpf_checksum(unsigned char *mp, int len)
8249 +{
8250 + int sum = 0;
8251 +
8252 + while (len--)
8253 + sum += *mp++;
8254 +
8255 + return sum & 0xFF;
8256 +}
8257 +
8258 +/*
8259 + * Have to match translation table entries to main table entries by counter
8260 + * hence the mpc_record variable .... can't see a less disgusting way of
8261 + * doing this ....
8262 + */
8263 +
8264 +static int mpc_record;
8265 +static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY] __initdata;
8266 +
8267 +#ifdef CONFIG_X86_NUMAQ
8268 +static int MP_valid_apicid(int apicid, int version)
8269 +{
8270 + return hweight_long(apicid & 0xf) == 1 && (apicid >> 4) != 0xf;
8271 +}
8272 +#elif !defined(CONFIG_XEN)
8273 +static int MP_valid_apicid(int apicid, int version)
8274 +{
8275 + if (version >= 0x14)
8276 + return apicid < 0xff;
8277 + else
8278 + return apicid < 0xf;
8279 +}
8280 +#endif
8281 +
8282 +#ifndef CONFIG_XEN
8283 +static void __devinit MP_processor_info (struct mpc_config_processor *m)
8284 +{
8285 + int ver, apicid;
8286 + physid_mask_t phys_cpu;
8287 +
8288 + if (!(m->mpc_cpuflag & CPU_ENABLED))
8289 + return;
8290 +
8291 + apicid = mpc_apic_id(m, translation_table[mpc_record]);
8292 +
8293 + if (m->mpc_featureflag&(1<<0))
8294 + Dprintk(" Floating point unit present.\n");
8295 + if (m->mpc_featureflag&(1<<7))
8296 + Dprintk(" Machine Exception supported.\n");
8297 + if (m->mpc_featureflag&(1<<8))
8298 + Dprintk(" 64 bit compare & exchange supported.\n");
8299 + if (m->mpc_featureflag&(1<<9))
8300 + Dprintk(" Internal APIC present.\n");
8301 + if (m->mpc_featureflag&(1<<11))
8302 + Dprintk(" SEP present.\n");
8303 + if (m->mpc_featureflag&(1<<12))
8304 + Dprintk(" MTRR present.\n");
8305 + if (m->mpc_featureflag&(1<<13))
8306 + Dprintk(" PGE present.\n");
8307 + if (m->mpc_featureflag&(1<<14))
8308 + Dprintk(" MCA present.\n");
8309 + if (m->mpc_featureflag&(1<<15))
8310 + Dprintk(" CMOV present.\n");
8311 + if (m->mpc_featureflag&(1<<16))
8312 + Dprintk(" PAT present.\n");
8313 + if (m->mpc_featureflag&(1<<17))
8314 + Dprintk(" PSE present.\n");
8315 + if (m->mpc_featureflag&(1<<18))
8316 + Dprintk(" PSN present.\n");
8317 + if (m->mpc_featureflag&(1<<19))
8318 + Dprintk(" Cache Line Flush Instruction present.\n");
8319 + /* 20 Reserved */
8320 + if (m->mpc_featureflag&(1<<21))
8321 + Dprintk(" Debug Trace and EMON Store present.\n");
8322 + if (m->mpc_featureflag&(1<<22))
8323 + Dprintk(" ACPI Thermal Throttle Registers present.\n");
8324 + if (m->mpc_featureflag&(1<<23))
8325 + Dprintk(" MMX present.\n");
8326 + if (m->mpc_featureflag&(1<<24))
8327 + Dprintk(" FXSR present.\n");
8328 + if (m->mpc_featureflag&(1<<25))
8329 + Dprintk(" XMM present.\n");
8330 + if (m->mpc_featureflag&(1<<26))
8331 + Dprintk(" Willamette New Instructions present.\n");
8332 + if (m->mpc_featureflag&(1<<27))
8333 + Dprintk(" Self Snoop present.\n");
8334 + if (m->mpc_featureflag&(1<<28))
8335 + Dprintk(" HT present.\n");
8336 + if (m->mpc_featureflag&(1<<29))
8337 + Dprintk(" Thermal Monitor present.\n");
8338 + /* 30, 31 Reserved */
8339 +
8340 +
8341 + if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
8342 + Dprintk(" Bootup CPU\n");
8343 + boot_cpu_physical_apicid = m->mpc_apicid;
8344 + }
8345 +
8346 + ver = m->mpc_apicver;
8347 +
8348 + if (!MP_valid_apicid(apicid, ver)) {
8349 + printk(KERN_WARNING "Processor #%d INVALID. (Max ID: %d).\n",
8350 + m->mpc_apicid, MAX_APICS);
8351 + return;
8352 + }
8353 +
8354 + /*
8355 + * Validate version
8356 + */
8357 + if (ver == 0x0) {
8358 + printk(KERN_WARNING "BIOS bug, APIC version is 0 for CPU#%d! "
8359 + "fixing up to 0x10. (tell your hw vendor)\n",
8360 + m->mpc_apicid);
8361 + ver = 0x10;
8362 + }
8363 + apic_version[m->mpc_apicid] = ver;
8364 +
8365 + phys_cpu = apicid_to_cpu_present(apicid);
8366 + physids_or(phys_cpu_present_map, phys_cpu_present_map, phys_cpu);
8367 +
8368 + if (num_processors >= NR_CPUS) {
8369 + printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached."
8370 + " Processor ignored.\n", NR_CPUS);
8371 + return;
8372 + }
8373 +
8374 + if (num_processors >= maxcpus) {
8375 + printk(KERN_WARNING "WARNING: maxcpus limit of %i reached."
8376 + " Processor ignored.\n", maxcpus);
8377 + return;
8378 + }
8379 +
8380 + cpu_set(num_processors, cpu_possible_map);
8381 + num_processors++;
8382 +
8383 + if (CPU_HOTPLUG_ENABLED || (num_processors > 8)) {
8384 + switch (boot_cpu_data.x86_vendor) {
8385 + case X86_VENDOR_INTEL:
8386 + if (!APIC_XAPIC(ver)) {
8387 + def_to_bigsmp = 0;
8388 + break;
8389 + }
8390 + /* If P4 and above fall through */
8391 + case X86_VENDOR_AMD:
8392 + def_to_bigsmp = 1;
8393 + }
8394 + }
8395 + bios_cpu_apicid[num_processors - 1] = m->mpc_apicid;
8396 +}
8397 +#else
8398 +void __init MP_processor_info (struct mpc_config_processor *m)
8399 +{
8400 + num_processors++;
8401 +}
8402 +#endif /* CONFIG_XEN */
8403 +
8404 +static void __init MP_bus_info (struct mpc_config_bus *m)
8405 +{
8406 + char str[7];
8407 +
8408 + memcpy(str, m->mpc_bustype, 6);
8409 + str[6] = 0;
8410 +
8411 + mpc_oem_bus_info(m, str, translation_table[mpc_record]);
8412 +
8413 + if (strncmp(str, BUSTYPE_ISA, sizeof(BUSTYPE_ISA)-1) == 0) {
8414 + mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA;
8415 + } else if (strncmp(str, BUSTYPE_EISA, sizeof(BUSTYPE_EISA)-1) == 0) {
8416 + mp_bus_id_to_type[m->mpc_busid] = MP_BUS_EISA;
8417 + } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI)-1) == 0) {
8418 + mpc_oem_pci_bus(m, translation_table[mpc_record]);
8419 + mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI;
8420 + mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id;
8421 + mp_current_pci_id++;
8422 + } else if (strncmp(str, BUSTYPE_MCA, sizeof(BUSTYPE_MCA)-1) == 0) {
8423 + mp_bus_id_to_type[m->mpc_busid] = MP_BUS_MCA;
8424 + } else if (strncmp(str, BUSTYPE_NEC98, sizeof(BUSTYPE_NEC98)-1) == 0) {
8425 + mp_bus_id_to_type[m->mpc_busid] = MP_BUS_NEC98;
8426 + } else {
8427 + printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str);
8428 + }
8429 +}
8430 +
8431 +static void __init MP_ioapic_info (struct mpc_config_ioapic *m)
8432 +{
8433 + if (!(m->mpc_flags & MPC_APIC_USABLE))
8434 + return;
8435 +
8436 + printk(KERN_INFO "I/O APIC #%d Version %d at 0x%lX.\n",
8437 + m->mpc_apicid, m->mpc_apicver, m->mpc_apicaddr);
8438 + if (nr_ioapics >= MAX_IO_APICS) {
8439 + printk(KERN_CRIT "Max # of I/O APICs (%d) exceeded (found %d).\n",
8440 + MAX_IO_APICS, nr_ioapics);
8441 + panic("Recompile kernel with bigger MAX_IO_APICS!.\n");
8442 + }
8443 + if (!m->mpc_apicaddr) {
8444 + printk(KERN_ERR "WARNING: bogus zero I/O APIC address"
8445 + " found in MP table, skipping!\n");
8446 + return;
8447 + }
8448 + mp_ioapics[nr_ioapics] = *m;
8449 + nr_ioapics++;
8450 +}
8451 +
8452 +static void __init MP_intsrc_info (struct mpc_config_intsrc *m)
8453 +{
8454 + mp_irqs [mp_irq_entries] = *m;
8455 + Dprintk("Int: type %d, pol %d, trig %d, bus %d,"
8456 + " IRQ %02x, APIC ID %x, APIC INT %02x\n",
8457 + m->mpc_irqtype, m->mpc_irqflag & 3,
8458 + (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus,
8459 + m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq);
8460 + if (++mp_irq_entries == MAX_IRQ_SOURCES)
8461 + panic("Max # of irq sources exceeded!!\n");
8462 +}
8463 +
8464 +static void __init MP_lintsrc_info (struct mpc_config_lintsrc *m)
8465 +{
8466 + Dprintk("Lint: type %d, pol %d, trig %d, bus %d,"
8467 + " IRQ %02x, APIC ID %x, APIC LINT %02x\n",
8468 + m->mpc_irqtype, m->mpc_irqflag & 3,
8469 + (m->mpc_irqflag >> 2) &3, m->mpc_srcbusid,
8470 + m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint);
8471 + /*
8472 + * Well it seems all SMP boards in existence
8473 + * use ExtINT/LVT1 == LINT0 and
8474 + * NMI/LVT2 == LINT1 - the following check
8475 + * will show us if this assumption is false.
8476 + * Until then we do not have to add baggage.
8477 + */
8478 + if ((m->mpc_irqtype == mp_ExtINT) &&
8479 + (m->mpc_destapiclint != 0))
8480 + BUG();
8481 + if ((m->mpc_irqtype == mp_NMI) &&
8482 + (m->mpc_destapiclint != 1))
8483 + BUG();
8484 +}
8485 +
8486 +#ifdef CONFIG_X86_NUMAQ
8487 +static void __init MP_translation_info (struct mpc_config_translation *m)
8488 +{
8489 + printk(KERN_INFO "Translation: record %d, type %d, quad %d, global %d, local %d\n", mpc_record, m->trans_type, m->trans_quad, m->trans_global, m->trans_local);
8490 +
8491 + if (mpc_record >= MAX_MPC_ENTRY)
8492 + printk(KERN_ERR "MAX_MPC_ENTRY exceeded!\n");
8493 + else
8494 + translation_table[mpc_record] = m; /* stash this for later */
8495 + if (m->trans_quad < MAX_NUMNODES && !node_online(m->trans_quad))
8496 + node_set_online(m->trans_quad);
8497 +}
8498 +
8499 +/*
8500 + * Read/parse the MPC oem tables
8501 + */
8502 +
8503 +static void __init smp_read_mpc_oem(struct mp_config_oemtable *oemtable, \
8504 + unsigned short oemsize)
8505 +{
8506 + int count = sizeof (*oemtable); /* the header size */
8507 + unsigned char *oemptr = ((unsigned char *)oemtable)+count;
8508 +
8509 + mpc_record = 0;
8510 + printk(KERN_INFO "Found an OEM MPC table at %8p - parsing it ... \n", oemtable);
8511 + if (memcmp(oemtable->oem_signature,MPC_OEM_SIGNATURE,4))
8512 + {
8513 + printk(KERN_WARNING "SMP mpc oemtable: bad signature [%c%c%c%c]!\n",
8514 + oemtable->oem_signature[0],
8515 + oemtable->oem_signature[1],
8516 + oemtable->oem_signature[2],
8517 + oemtable->oem_signature[3]);
8518 + return;
8519 + }
8520 + if (mpf_checksum((unsigned char *)oemtable,oemtable->oem_length))
8521 + {
8522 + printk(KERN_WARNING "SMP oem mptable: checksum error!\n");
8523 + return;
8524 + }
8525 + while (count < oemtable->oem_length) {
8526 + switch (*oemptr) {
8527 + case MP_TRANSLATION:
8528 + {
8529 + struct mpc_config_translation *m=
8530 + (struct mpc_config_translation *)oemptr;
8531 + MP_translation_info(m);
8532 + oemptr += sizeof(*m);
8533 + count += sizeof(*m);
8534 + ++mpc_record;
8535 + break;
8536 + }
8537 + default:
8538 + {
8539 + printk(KERN_WARNING "Unrecognised OEM table entry type! - %d\n", (int) *oemptr);
8540 + return;
8541 + }
8542 + }
8543 + }
8544 +}
8545 +
8546 +static inline void mps_oem_check(struct mp_config_table *mpc, char *oem,
8547 + char *productid)
8548 +{
8549 + if (strncmp(oem, "IBM NUMA", 8))
8550 + printk("Warning! May not be a NUMA-Q system!\n");
8551 + if (mpc->mpc_oemptr)
8552 + smp_read_mpc_oem((struct mp_config_oemtable *) mpc->mpc_oemptr,
8553 + mpc->mpc_oemsize);
8554 +}
8555 +#endif /* CONFIG_X86_NUMAQ */
8556 +
8557 +/*
8558 + * Read/parse the MPC
8559 + */
8560 +
8561 +static int __init smp_read_mpc(struct mp_config_table *mpc)
8562 +{
8563 + char str[16];
8564 + char oem[10];
8565 + int count=sizeof(*mpc);
8566 + unsigned char *mpt=((unsigned char *)mpc)+count;
8567 +
8568 + if (memcmp(mpc->mpc_signature,MPC_SIGNATURE,4)) {
8569 + printk(KERN_ERR "SMP mptable: bad signature [0x%x]!\n",
8570 + *(u32 *)mpc->mpc_signature);
8571 + return 0;
8572 + }
8573 + if (mpf_checksum((unsigned char *)mpc,mpc->mpc_length)) {
8574 + printk(KERN_ERR "SMP mptable: checksum error!\n");
8575 + return 0;
8576 + }
8577 + if (mpc->mpc_spec!=0x01 && mpc->mpc_spec!=0x04) {
8578 + printk(KERN_ERR "SMP mptable: bad table version (%d)!!\n",
8579 + mpc->mpc_spec);
8580 + return 0;
8581 + }
8582 + if (!mpc->mpc_lapic) {
8583 + printk(KERN_ERR "SMP mptable: null local APIC address!\n");
8584 + return 0;
8585 + }
8586 + memcpy(oem,mpc->mpc_oem,8);
8587 + oem[8]=0;
8588 + printk(KERN_INFO "OEM ID: %s ",oem);
8589 +
8590 + memcpy(str,mpc->mpc_productid,12);
8591 + str[12]=0;
8592 + printk("Product ID: %s ",str);
8593 +
8594 + mps_oem_check(mpc, oem, str);
8595 +
8596 + printk("APIC at: 0x%lX\n",mpc->mpc_lapic);
8597 +
8598 + /*
8599 + * Save the local APIC address (it might be non-default) -- but only
8600 + * if we're not using ACPI.
8601 + */
8602 + if (!acpi_lapic)
8603 + mp_lapic_addr = mpc->mpc_lapic;
8604 +
8605 + /*
8606 + * Now process the configuration blocks.
8607 + */
8608 + mpc_record = 0;
8609 + while (count < mpc->mpc_length) {
8610 + switch(*mpt) {
8611 + case MP_PROCESSOR:
8612 + {
8613 + struct mpc_config_processor *m=
8614 + (struct mpc_config_processor *)mpt;
8615 + /* ACPI may have already provided this data */
8616 + if (!acpi_lapic)
8617 + MP_processor_info(m);
8618 + mpt += sizeof(*m);
8619 + count += sizeof(*m);
8620 + break;
8621 + }
8622 + case MP_BUS:
8623 + {
8624 + struct mpc_config_bus *m=
8625 + (struct mpc_config_bus *)mpt;
8626 + MP_bus_info(m);
8627 + mpt += sizeof(*m);
8628 + count += sizeof(*m);
8629 + break;
8630 + }
8631 + case MP_IOAPIC:
8632 + {
8633 + struct mpc_config_ioapic *m=
8634 + (struct mpc_config_ioapic *)mpt;
8635 + MP_ioapic_info(m);
8636 + mpt+=sizeof(*m);
8637 + count+=sizeof(*m);
8638 + break;
8639 + }
8640 + case MP_INTSRC:
8641 + {
8642 + struct mpc_config_intsrc *m=
8643 + (struct mpc_config_intsrc *)mpt;
8644 +
8645 + MP_intsrc_info(m);
8646 + mpt+=sizeof(*m);
8647 + count+=sizeof(*m);
8648 + break;
8649 + }
8650 + case MP_LINTSRC:
8651 + {
8652 + struct mpc_config_lintsrc *m=
8653 + (struct mpc_config_lintsrc *)mpt;
8654 + MP_lintsrc_info(m);
8655 + mpt+=sizeof(*m);
8656 + count+=sizeof(*m);
8657 + break;
8658 + }
8659 + default:
8660 + {
8661 + count = mpc->mpc_length;
8662 + break;
8663 + }
8664 + }
8665 + ++mpc_record;
8666 + }
8667 + clustered_apic_check();
8668 + if (!num_processors)
8669 + printk(KERN_ERR "SMP mptable: no processors registered!\n");
8670 + return num_processors;
8671 +}
8672 +
8673 +static int __init ELCR_trigger(unsigned int irq)
8674 +{
8675 + unsigned int port;
8676 +
8677 + port = 0x4d0 + (irq >> 3);
8678 + return (inb(port) >> (irq & 7)) & 1;
8679 +}
8680 +
8681 +static void __init construct_default_ioirq_mptable(int mpc_default_type)
8682 +{
8683 + struct mpc_config_intsrc intsrc;
8684 + int i;
8685 + int ELCR_fallback = 0;
8686 +
8687 + intsrc.mpc_type = MP_INTSRC;
8688 + intsrc.mpc_irqflag = 0; /* conforming */
8689 + intsrc.mpc_srcbus = 0;
8690 + intsrc.mpc_dstapic = mp_ioapics[0].mpc_apicid;
8691 +
8692 + intsrc.mpc_irqtype = mp_INT;
8693 +
8694 + /*
8695 + * If true, we have an ISA/PCI system with no IRQ entries
8696 + * in the MP table. To prevent the PCI interrupts from being set up
8697 + * incorrectly, we try to use the ELCR. The sanity check to see if
8698 + * there is good ELCR data is very simple - IRQ0, 1, 2 and 13 can
8699 + * never be level sensitive, so we simply see if the ELCR agrees.
8700 + * If it does, we assume it's valid.
8701 + */
8702 + if (mpc_default_type == 5) {
8703 + printk(KERN_INFO "ISA/PCI bus type with no IRQ information... falling back to ELCR\n");
8704 +
8705 + if (ELCR_trigger(0) || ELCR_trigger(1) || ELCR_trigger(2) || ELCR_trigger(13))
8706 + printk(KERN_WARNING "ELCR contains invalid data... not using ELCR\n");
8707 + else {
8708 + printk(KERN_INFO "Using ELCR to identify PCI interrupts\n");
8709 + ELCR_fallback = 1;
8710 + }
8711 + }
8712 +
8713 + for (i = 0; i < 16; i++) {
8714 + switch (mpc_default_type) {
8715 + case 2:
8716 + if (i == 0 || i == 13)
8717 + continue; /* IRQ0 & IRQ13 not connected */
8718 + /* fall through */
8719 + default:
8720 + if (i == 2)
8721 + continue; /* IRQ2 is never connected */
8722 + }
8723 +
8724 + if (ELCR_fallback) {
8725 + /*
8726 + * If the ELCR indicates a level-sensitive interrupt, we
8727 + * copy that information over to the MP table in the
8728 + * irqflag field (level sensitive, active high polarity).
8729 + */
8730 + if (ELCR_trigger(i))
8731 + intsrc.mpc_irqflag = 13;
8732 + else
8733 + intsrc.mpc_irqflag = 0;
8734 + }
8735 +
8736 + intsrc.mpc_srcbusirq = i;
8737 + intsrc.mpc_dstirq = i ? i : 2; /* IRQ0 to INTIN2 */
8738 + MP_intsrc_info(&intsrc);
8739 + }
8740 +
8741 + intsrc.mpc_irqtype = mp_ExtINT;
8742 + intsrc.mpc_srcbusirq = 0;
8743 + intsrc.mpc_dstirq = 0; /* 8259A to INTIN0 */
8744 + MP_intsrc_info(&intsrc);
8745 +}
8746 +
8747 +static inline void __init construct_default_ISA_mptable(int mpc_default_type)
8748 +{
8749 + struct mpc_config_processor processor;
8750 + struct mpc_config_bus bus;
8751 + struct mpc_config_ioapic ioapic;
8752 + struct mpc_config_lintsrc lintsrc;
8753 + int linttypes[2] = { mp_ExtINT, mp_NMI };
8754 + int i;
8755 +
8756 + /*
8757 + * local APIC has default address
8758 + */
8759 + mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
8760 +
8761 + /*
8762 + * 2 CPUs, numbered 0 & 1.
8763 + */
8764 + processor.mpc_type = MP_PROCESSOR;
8765 + /* Either an integrated APIC or a discrete 82489DX. */
8766 + processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
8767 + processor.mpc_cpuflag = CPU_ENABLED;
8768 + processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
8769 + (boot_cpu_data.x86_model << 4) |
8770 + boot_cpu_data.x86_mask;
8771 + processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
8772 + processor.mpc_reserved[0] = 0;
8773 + processor.mpc_reserved[1] = 0;
8774 + for (i = 0; i < 2; i++) {
8775 + processor.mpc_apicid = i;
8776 + MP_processor_info(&processor);
8777 + }
8778 +
8779 + bus.mpc_type = MP_BUS;
8780 + bus.mpc_busid = 0;
8781 + switch (mpc_default_type) {
8782 + default:
8783 + printk("???\n");
8784 + printk(KERN_ERR "Unknown standard configuration %d\n",
8785 + mpc_default_type);
8786 + /* fall through */
8787 + case 1:
8788 + case 5:
8789 + memcpy(bus.mpc_bustype, "ISA ", 6);
8790 + break;
8791 + case 2:
8792 + case 6:
8793 + case 3:
8794 + memcpy(bus.mpc_bustype, "EISA ", 6);
8795 + break;
8796 + case 4:
8797 + case 7:
8798 + memcpy(bus.mpc_bustype, "MCA ", 6);
8799 + }
8800 + MP_bus_info(&bus);
8801 + if (mpc_default_type > 4) {
8802 + bus.mpc_busid = 1;
8803 + memcpy(bus.mpc_bustype, "PCI ", 6);
8804 + MP_bus_info(&bus);
8805 + }
8806 +
8807 + ioapic.mpc_type = MP_IOAPIC;
8808 + ioapic.mpc_apicid = 2;
8809 + ioapic.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
8810 + ioapic.mpc_flags = MPC_APIC_USABLE;
8811 + ioapic.mpc_apicaddr = 0xFEC00000;
8812 + MP_ioapic_info(&ioapic);
8813 +
8814 + /*
8815 + * We set up most of the low 16 IO-APIC pins according to MPS rules.
8816 + */
8817 + construct_default_ioirq_mptable(mpc_default_type);
8818 +
8819 + lintsrc.mpc_type = MP_LINTSRC;
8820 + lintsrc.mpc_irqflag = 0; /* conforming */
8821 + lintsrc.mpc_srcbusid = 0;
8822 + lintsrc.mpc_srcbusirq = 0;
8823 + lintsrc.mpc_destapic = MP_APIC_ALL;
8824 + for (i = 0; i < 2; i++) {
8825 + lintsrc.mpc_irqtype = linttypes[i];
8826 + lintsrc.mpc_destapiclint = i;
8827 + MP_lintsrc_info(&lintsrc);
8828 + }
8829 +}
8830 +
8831 +static struct intel_mp_floating *mpf_found;
8832 +
8833 +/*
8834 + * Scan the memory blocks for an SMP configuration block.
8835 + */
8836 +void __init get_smp_config (void)
8837 +{
8838 + struct intel_mp_floating *mpf = mpf_found;
8839 +
8840 + /*
8841 + * ACPI supports both logical (e.g. Hyper-Threading) and physical
8842 + * processors, where MPS only supports physical.
8843 + */
8844 + if (acpi_lapic && acpi_ioapic) {
8845 + printk(KERN_INFO "Using ACPI (MADT) for SMP configuration information\n");
8846 + return;
8847 + }
8848 + else if (acpi_lapic)
8849 + printk(KERN_INFO "Using ACPI for processor (LAPIC) configuration information\n");
8850 +
8851 + printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n", mpf->mpf_specification);
8852 + if (mpf->mpf_feature2 & (1<<7)) {
8853 + printk(KERN_INFO " IMCR and PIC compatibility mode.\n");
8854 + pic_mode = 1;
8855 + } else {
8856 + printk(KERN_INFO " Virtual Wire compatibility mode.\n");
8857 + pic_mode = 0;
8858 + }
8859 +
8860 + /*
8861 + * Now see if we need to read further.
8862 + */
8863 + if (mpf->mpf_feature1 != 0) {
8864 +
8865 + printk(KERN_INFO "Default MP configuration #%d\n", mpf->mpf_feature1);
8866 + construct_default_ISA_mptable(mpf->mpf_feature1);
8867 +
8868 + } else if (mpf->mpf_physptr) {
8869 +
8870 + /*
8871 + * Read the physical hardware table. Anything here will
8872 + * override the defaults.
8873 + */
8874 + if (!smp_read_mpc(isa_bus_to_virt(mpf->mpf_physptr))) {
8875 + smp_found_config = 0;
8876 + printk(KERN_ERR "BIOS bug, MP table errors detected!...\n");
8877 + printk(KERN_ERR "... disabling SMP support. (tell your hw vendor)\n");
8878 + return;
8879 + }
8880 + /*
8881 + * If there are no explicit MP IRQ entries, then we are
8882 + * broken. We set up most of the low 16 IO-APIC pins to
8883 + * ISA defaults and hope it will work.
8884 + */
8885 + if (!mp_irq_entries) {
8886 + struct mpc_config_bus bus;
8887 +
8888 + printk(KERN_ERR "BIOS bug, no explicit IRQ entries, using default mptable. (tell your hw vendor)\n");
8889 +
8890 + bus.mpc_type = MP_BUS;
8891 + bus.mpc_busid = 0;
8892 + memcpy(bus.mpc_bustype, "ISA ", 6);
8893 + MP_bus_info(&bus);
8894 +
8895 + construct_default_ioirq_mptable(0);
8896 + }
8897 +
8898 + } else
8899 + BUG();
8900 +
8901 + printk(KERN_INFO "Processors: %d\n", num_processors);
8902 + /*
8903 + * Only use the first configuration found.
8904 + */
8905 +}
8906 +
8907 +static int __init smp_scan_config (unsigned long base, unsigned long length)
8908 +{
8909 + unsigned long *bp = isa_bus_to_virt(base);
8910 + struct intel_mp_floating *mpf;
8911 +
8912 + Dprintk("Scan SMP from %p for %ld bytes.\n", bp,length);
8913 + if (sizeof(*mpf) != 16)
8914 + printk("Error: MPF size\n");
8915 +
8916 + while (length > 0) {
8917 + mpf = (struct intel_mp_floating *)bp;
8918 + if ((*bp == SMP_MAGIC_IDENT) &&
8919 + (mpf->mpf_length == 1) &&
8920 + !mpf_checksum((unsigned char *)bp, 16) &&
8921 + ((mpf->mpf_specification == 1)
8922 + || (mpf->mpf_specification == 4)) ) {
8923 +
8924 + smp_found_config = 1;
8925 +#ifndef CONFIG_XEN
8926 + printk(KERN_INFO "found SMP MP-table at %08lx\n",
8927 + virt_to_phys(mpf));
8928 + reserve_bootmem(virt_to_phys(mpf), PAGE_SIZE);
8929 + if (mpf->mpf_physptr) {
8930 + /*
8931 + * We cannot access the MPC table to compute
8932 + * its size yet, as only a few megabytes from
8933 + * the bottom are mapped now.
8934 + * PC-9800's MPC table is placed at the very end
8935 + * of physical memory, so simply reserving
8936 + * PAGE_SIZE from mpf->mpf_physptr yields BUG()
8937 + * in reserve_bootmem.
8938 + */
8939 + unsigned long size = PAGE_SIZE;
8940 + unsigned long end = max_low_pfn * PAGE_SIZE;
8941 + if (mpf->mpf_physptr + size > end)
8942 + size = end - mpf->mpf_physptr;
8943 + reserve_bootmem(mpf->mpf_physptr, size);
8944 + }
8945 +#else
8946 + printk(KERN_INFO "found SMP MP-table at %08lx\n",
8947 + ((unsigned long)bp - (unsigned long)isa_bus_to_virt(base)) + base);
8948 +#endif
8949 +
8950 + mpf_found = mpf;
8951 + return 1;
8952 + }
8953 + bp += 4;
8954 + length -= 16;
8955 + }
8956 + return 0;
8957 +}
8958 +
8959 +void __init find_smp_config (void)
8960 +{
8961 +#ifndef CONFIG_XEN
8962 + unsigned int address;
8963 +#endif
8964 +
8965 + /*
8966 + * FIXME: Linux assumes you have 640K of base ram..
8967 + * this continues the error...
8968 + *
8969 + * 1) Scan the bottom 1K for a signature
8970 + * 2) Scan the top 1K of base RAM
8971 + * 3) Scan the 64K of bios
8972 + */
8973 + if (smp_scan_config(0x0,0x400) ||
8974 + smp_scan_config(639*0x400,0x400) ||
8975 + smp_scan_config(0xF0000,0x10000))
8976 + return;
8977 + /*
8978 + * If it is an SMP machine we should know now, unless the
8979 + * configuration is in an EISA/MCA bus machine with an
8980 + * extended bios data area.
8981 + *
8982 + * there is a real-mode segmented pointer pointing to the
8983 + * 4K EBDA area at 0x40E, calculate and scan it here.
8984 + *
8985 + * NOTE! There are Linux loaders that will corrupt the EBDA
8986 + * area, and as such this kind of SMP config may be less
8987 + * trustworthy, simply because the SMP table may have been
8988 + * stomped on during early boot. These loaders are buggy and
8989 + * should be fixed.
8990 + *
8991 + * MP1.4 SPEC states to only scan first 1K of 4K EBDA.
8992 + */
8993 +
8994 +#ifndef CONFIG_XEN
8995 + address = get_bios_ebda();
8996 + if (address)
8997 + smp_scan_config(address, 0x400);
8998 +#endif
8999 +}
9000 +
9001 +/* --------------------------------------------------------------------------
9002 + ACPI-based MP Configuration
9003 + -------------------------------------------------------------------------- */
9004 +
9005 +#ifdef CONFIG_ACPI
9006 +
9007 +void __init mp_register_lapic_address (
9008 + u64 address)
9009 +{
9010 +#ifndef CONFIG_XEN
9011 + mp_lapic_addr = (unsigned long) address;
9012 +
9013 + set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr);
9014 +
9015 + if (boot_cpu_physical_apicid == -1U)
9016 + boot_cpu_physical_apicid = GET_APIC_ID(apic_read(APIC_ID));
9017 +
9018 + Dprintk("Boot CPU = %d\n", boot_cpu_physical_apicid);
9019 +#endif
9020 +}
9021 +
9022 +
9023 +void __devinit mp_register_lapic (
9024 + u8 id,
9025 + u8 enabled)
9026 +{
9027 + struct mpc_config_processor processor;
9028 + int boot_cpu = 0;
9029 +
9030 + if (MAX_APICS - id <= 0) {
9031 + printk(KERN_WARNING "Processor #%d invalid (max %d)\n",
9032 + id, MAX_APICS);
9033 + return;
9034 + }
9035 +
9036 + if (id == boot_cpu_physical_apicid)
9037 + boot_cpu = 1;
9038 +
9039 +#ifndef CONFIG_XEN
9040 + processor.mpc_type = MP_PROCESSOR;
9041 + processor.mpc_apicid = id;
9042 + processor.mpc_apicver = GET_APIC_VERSION(apic_read(APIC_LVR));
9043 + processor.mpc_cpuflag = (enabled ? CPU_ENABLED : 0);
9044 + processor.mpc_cpuflag |= (boot_cpu ? CPU_BOOTPROCESSOR : 0);
9045 + processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
9046 + (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask;
9047 + processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
9048 + processor.mpc_reserved[0] = 0;
9049 + processor.mpc_reserved[1] = 0;
9050 +#endif
9051 +
9052 + MP_processor_info(&processor);
9053 +}
9054 +
9055 +#ifdef CONFIG_X86_IO_APIC
9056 +
9057 +#define MP_ISA_BUS 0
9058 +#define MP_MAX_IOAPIC_PIN 127
9059 +
9060 +static struct mp_ioapic_routing {
9061 + int apic_id;
9062 + int gsi_base;
9063 + int gsi_end;
9064 + u32 pin_programmed[4];
9065 +} mp_ioapic_routing[MAX_IO_APICS];
9066 +
9067 +
9068 +static int mp_find_ioapic (
9069 + int gsi)
9070 +{
9071 + int i = 0;
9072 +
9073 + /* Find the IOAPIC that manages this GSI. */
9074 + for (i = 0; i < nr_ioapics; i++) {
9075 + if ((gsi >= mp_ioapic_routing[i].gsi_base)
9076 + && (gsi <= mp_ioapic_routing[i].gsi_end))
9077 + return i;
9078 + }
9079 +
9080 + printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi);
9081 +
9082 + return -1;
9083 +}
9084 +
9085 +
9086 +void __init mp_register_ioapic (
9087 + u8 id,
9088 + u32 address,
9089 + u32 gsi_base)
9090 +{
9091 + int idx = 0;
9092 + int tmpid;
9093 +
9094 + if (nr_ioapics >= MAX_IO_APICS) {
9095 + printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded "
9096 + "(found %d)\n", MAX_IO_APICS, nr_ioapics);
9097 + panic("Recompile kernel with bigger MAX_IO_APICS!\n");
9098 + }
9099 + if (!address) {
9100 + printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address"
9101 + " found in MADT table, skipping!\n");
9102 + return;
9103 + }
9104 +
9105 + idx = nr_ioapics++;
9106 +
9107 + mp_ioapics[idx].mpc_type = MP_IOAPIC;
9108 + mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE;
9109 + mp_ioapics[idx].mpc_apicaddr = address;
9110 +
9111 +#ifndef CONFIG_XEN
9112 + set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
9113 +#endif
9114 + if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && (boot_cpu_data.x86 < 15))
9115 + tmpid = io_apic_get_unique_id(idx, id);
9116 + else
9117 + tmpid = id;
9118 + if (tmpid == -1) {
9119 + nr_ioapics--;
9120 + return;
9121 + }
9122 + mp_ioapics[idx].mpc_apicid = tmpid;
9123 + mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx);
9124 +
9125 + /*
9126 + * Build basic GSI lookup table to facilitate gsi->io_apic lookups
9127 + * and to prevent reprogramming of IOAPIC pins (PCI GSIs).
9128 + */
9129 + mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid;
9130 + mp_ioapic_routing[idx].gsi_base = gsi_base;
9131 + mp_ioapic_routing[idx].gsi_end = gsi_base +
9132 + io_apic_get_redir_entries(idx);
9133 +
9134 + printk("IOAPIC[%d]: apic_id %d, version %d, address 0x%lx, "
9135 + "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid,
9136 + mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr,
9137 + mp_ioapic_routing[idx].gsi_base,
9138 + mp_ioapic_routing[idx].gsi_end);
9139 +
9140 + return;
9141 +}
9142 +
9143 +
9144 +void __init mp_override_legacy_irq (
9145 + u8 bus_irq,
9146 + u8 polarity,
9147 + u8 trigger,
9148 + u32 gsi)
9149 +{
9150 + struct mpc_config_intsrc intsrc;
9151 + int ioapic = -1;
9152 + int pin = -1;
9153 +
9154 + /*
9155 + * Convert 'gsi' to 'ioapic.pin'.
9156 + */
9157 + ioapic = mp_find_ioapic(gsi);
9158 + if (ioapic < 0)
9159 + return;
9160 + pin = gsi - mp_ioapic_routing[ioapic].gsi_base;
9161 +
9162 + /*
9163 + * TBD: This check is for faulty timer entries, where the override
9164 + * erroneously sets the trigger to level, resulting in a HUGE
9165 + * increase of timer interrupts!
9166 + */
9167 + if ((bus_irq == 0) && (trigger == 3))
9168 + trigger = 1;
9169 +
9170 + intsrc.mpc_type = MP_INTSRC;
9171 + intsrc.mpc_irqtype = mp_INT;
9172 + intsrc.mpc_irqflag = (trigger << 2) | polarity;
9173 + intsrc.mpc_srcbus = MP_ISA_BUS;
9174 + intsrc.mpc_srcbusirq = bus_irq; /* IRQ */
9175 + intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; /* APIC ID */
9176 + intsrc.mpc_dstirq = pin; /* INTIN# */
9177 +
9178 + Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, %d-%d\n",
9179 + intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3,
9180 + (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus,
9181 + intsrc.mpc_srcbusirq, intsrc.mpc_dstapic, intsrc.mpc_dstirq);
9182 +
9183 + mp_irqs[mp_irq_entries] = intsrc;
9184 + if (++mp_irq_entries == MAX_IRQ_SOURCES)
9185 + panic("Max # of irq sources exceeded!\n");
9186 +
9187 + return;
9188 +}
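Editorial aside, not part of the patch: mp_override_legacy_irq() packs the MP-table flag word as (trigger << 2) | polarity, and the Dprintk unpacks it the same way. A tiny standalone illustration; the trigger value 3 meaning "level" follows from the timer-entry comment above, while the polarity value here is just an example:

/* Illustrative only: pack and unpack an MP-table interrupt flag word the
 * same way mp_override_legacy_irq() above does. */
#include <stdio.h>

int main(void)
{
	unsigned int trigger = 3;	/* level, per the timer-entry comment above */
	unsigned int polarity = 1;	/* example value */
	unsigned int irqflag = (trigger << 2) | polarity;

	printf("mpc_irqflag = 0x%x (pol %u, trig %u)\n",
	       irqflag, irqflag & 3, (irqflag >> 2) & 3);
	return 0;
}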
9189 +
9190 +int es7000_plat;
9191 +
9192 +void __init mp_config_acpi_legacy_irqs (void)
9193 +{
9194 + struct mpc_config_intsrc intsrc;
9195 + int i = 0;
9196 + int ioapic = -1;
9197 +
9198 + /*
9199 + * Fabricate the legacy ISA bus (bus #31).
9200 + */
9201 + mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA;
9202 + Dprintk("Bus #%d is ISA\n", MP_ISA_BUS);
9203 +
9204 + /*
9205 + * Older generations of ES7000 have no legacy identity mappings
9206 + */
9207 + if (es7000_plat == 1)
9208 + return;
9209 +
9210 + /*
9211 + * Locate the IOAPIC that manages the ISA IRQs (0-15).
9212 + */
9213 + ioapic = mp_find_ioapic(0);
9214 + if (ioapic < 0)
9215 + return;
9216 +
9217 + intsrc.mpc_type = MP_INTSRC;
9218 + intsrc.mpc_irqflag = 0; /* Conforming */
9219 + intsrc.mpc_srcbus = MP_ISA_BUS;
9220 + intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid;
9221 +
9222 + /*
9223 + * Use the default configuration for the IRQs 0-15, unless
9224 + * overridden by (MADT) interrupt source override entries.
9225 + */
9226 + for (i = 0; i < 16; i++) {
9227 + int idx;
9228 +
9229 + for (idx = 0; idx < mp_irq_entries; idx++) {
9230 + struct mpc_config_intsrc *irq = mp_irqs + idx;
9231 +
9232 + /* Do we already have a mapping for this ISA IRQ? */
9233 + if (irq->mpc_srcbus == MP_ISA_BUS && irq->mpc_srcbusirq == i)
9234 + break;
9235 +
9236 + /* Do we already have a mapping for this IOAPIC pin */
9237 + if ((irq->mpc_dstapic == intsrc.mpc_dstapic) &&
9238 + (irq->mpc_dstirq == i))
9239 + break;
9240 + }
9241 +
9242 + if (idx != mp_irq_entries) {
9243 + printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i);
9244 + continue; /* IRQ already used */
9245 + }
9246 +
9247 + intsrc.mpc_irqtype = mp_INT;
9248 + intsrc.mpc_srcbusirq = i; /* Identity mapped */
9249 + intsrc.mpc_dstirq = i;
9250 +
9251 + Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, "
9252 + "%d-%d\n", intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3,
9253 + (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus,
9254 + intsrc.mpc_srcbusirq, intsrc.mpc_dstapic,
9255 + intsrc.mpc_dstirq);
9256 +
9257 + mp_irqs[mp_irq_entries] = intsrc;
9258 + if (++mp_irq_entries == MAX_IRQ_SOURCES)
9259 + panic("Max # of irq sources exceeded!\n");
9260 + }
9261 +}
9262 +
9263 +#define MAX_GSI_NUM 4096
9264 +
9265 +int mp_register_gsi (u32 gsi, int triggering, int polarity)
9266 +{
9267 + int ioapic = -1;
9268 + int ioapic_pin = 0;
9269 + int idx, bit = 0;
9270 + static int pci_irq = 16;
9271 + /*
9272 + * Mapping between Global System Interrupts, which
9273 + * represent all possible interrupts, and IRQs
9274 + * assigned to actual devices.
9275 + */
9276 + static int gsi_to_irq[MAX_GSI_NUM];
9277 +
9278 + /* Don't set up the ACPI SCI because it's already set up */
9279 + if (acpi_fadt.sci_int == gsi)
9280 + return gsi;
9281 +
9282 + ioapic = mp_find_ioapic(gsi);
9283 + if (ioapic < 0) {
9284 + printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi);
9285 + return gsi;
9286 + }
9287 +
9288 + ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_base;
9289 +
9290 + if (ioapic_renumber_irq)
9291 + gsi = ioapic_renumber_irq(ioapic, gsi);
9292 +
9293 + /*
9294 + * Avoid pin reprogramming. PRTs typically include entries
9295 + * with redundant pin->gsi mappings (but unique PCI devices);
9296 + * we only program the IOAPIC on the first.
9297 + */
9298 + bit = ioapic_pin % 32;
9299 + idx = (ioapic_pin < 32) ? 0 : (ioapic_pin / 32);
9300 + if (idx > 3) {
9301 + printk(KERN_ERR "Invalid reference to IOAPIC pin "
9302 + "%d-%d\n", mp_ioapic_routing[ioapic].apic_id,
9303 + ioapic_pin);
9304 + return gsi;
9305 + }
9306 + if ((1<<bit) & mp_ioapic_routing[ioapic].pin_programmed[idx]) {
9307 + Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n",
9308 + mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
9309 + return gsi_to_irq[gsi];
9310 + }
9311 +
9312 + mp_ioapic_routing[ioapic].pin_programmed[idx] |= (1<<bit);
9313 +
9314 + if (triggering == ACPI_LEVEL_SENSITIVE) {
9315 + /*
9316 + * For PCI devices assign IRQs in order, avoiding gaps
9317 + * due to unused I/O APIC pins.
9318 + */
9319 + int irq = gsi;
9320 + if (gsi < MAX_GSI_NUM) {
9321 + if (gsi > 15)
9322 + gsi = pci_irq++;
9323 + /*
9324 + * Don't assign IRQ used by ACPI SCI
9325 + */
9326 + if (gsi == acpi_fadt.sci_int)
9327 + gsi = pci_irq++;
9328 + gsi_to_irq[irq] = gsi;
9329 + } else {
9330 + printk(KERN_ERR "GSI %u is too high\n", gsi);
9331 + return gsi;
9332 + }
9333 + }
9334 +
9335 + io_apic_set_pci_routing(ioapic, ioapic_pin, gsi,
9336 + triggering == ACPI_EDGE_SENSITIVE ? 0 : 1,
9337 + polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
9338 + return gsi;
9339 +}
9340 +
9341 +#endif /* CONFIG_X86_IO_APIC */
9342 +#endif /* CONFIG_ACPI */
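Editorial aside, not part of the patch: mp_register_gsi() avoids reprogramming an IOAPIC pin by recording it in the four-word pin_programmed bitmap. A small standalone sketch of that word/bit arithmetic, using a hypothetical pin number:

/* Illustrative only: the pin_programmed bookkeeping in mp_register_gsi()
 * splits an IOAPIC pin number into a 32-bit-word index and a bit position,
 * so up to 128 pins (4 x u32) can be tracked per IOAPIC. */
#include <stdio.h>

int main(void)
{
	unsigned int pin = 42;			/* hypothetical IOAPIC pin */
	unsigned int bit = pin % 32;
	unsigned int idx = (pin < 32) ? 0 : (pin / 32);
	unsigned int pin_programmed[4] = { 0 };

	if (pin_programmed[idx] & (1u << bit))
		printf("pin %u already programmed\n", pin);
	else
		pin_programmed[idx] |= (1u << bit);

	printf("pin %u -> word %u, bit %u\n", pin, idx, bit);	/* word 1, bit 10 */
	return 0;
}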
9343 diff -Nur linux-2.6.16.33-noxen/arch/i386/kernel/pci-dma-xen.c linux-2.6.16.33/arch/i386/kernel/pci-dma-xen.c
9344 --- linux-2.6.16.33-noxen/arch/i386/kernel/pci-dma-xen.c 1970-01-01 00:00:00.000000000 +0000
9345 +++ linux-2.6.16.33/arch/i386/kernel/pci-dma-xen.c 2007-01-08 15:00:45.000000000 +0000
9346 @@ -0,0 +1,345 @@
9347 +/*
9348 + * Dynamic DMA mapping support.
9349 + *
9350 + * On i386 there is no hardware dynamic DMA address translation,
9351 + * so consistent alloc/free are merely page allocation/freeing.
9352 + * The rest of the dynamic DMA mapping interface is implemented
9353 + * in asm/pci.h.
9354 + */
9355 +
9356 +#include <linux/types.h>
9357 +#include <linux/mm.h>
9358 +#include <linux/string.h>
9359 +#include <linux/pci.h>
9360 +#include <linux/module.h>
9361 +#include <linux/version.h>
9362 +#include <asm/io.h>
9363 +#include <xen/balloon.h>
9364 +#include <asm/swiotlb.h>
9365 +#include <asm/tlbflush.h>
9366 +#include <asm-i386/mach-xen/asm/swiotlb.h>
9367 +#include <asm/bug.h>
9368 +
9369 +#ifdef __x86_64__
9370 +int iommu_merge __read_mostly = 0;
9371 +EXPORT_SYMBOL(iommu_merge);
9372 +
9373 +dma_addr_t bad_dma_address __read_mostly;
9374 +EXPORT_SYMBOL(bad_dma_address);
9375 +
9376 +/* This tells the BIO block layer to assume merging. Default to off
9377 + because we cannot guarantee merging later. */
9378 +int iommu_bio_merge __read_mostly = 0;
9379 +EXPORT_SYMBOL(iommu_bio_merge);
9380 +
9381 +__init int iommu_setup(char *p)
9382 +{
9383 + return 1;
9384 +}
9385 +#endif
9386 +
9387 +struct dma_coherent_mem {
9388 + void *virt_base;
9389 + u32 device_base;
9390 + int size;
9391 + int flags;
9392 + unsigned long *bitmap;
9393 +};
9394 +
9395 +#define IOMMU_BUG_ON(test) \
9396 +do { \
9397 + if (unlikely(test)) { \
9398 + printk(KERN_ALERT "Fatal DMA error! " \
9399 + "Please use 'swiotlb=force'\n"); \
9400 + BUG(); \
9401 + } \
9402 +} while (0)
9403 +
9404 +int
9405 +dma_map_sg(struct device *hwdev, struct scatterlist *sg, int nents,
9406 + enum dma_data_direction direction)
9407 +{
9408 + int i, rc;
9409 +
9410 + if (direction == DMA_NONE)
9411 + BUG();
9412 + WARN_ON(nents == 0 || sg[0].length == 0);
9413 +
9414 + if (swiotlb) {
9415 + rc = swiotlb_map_sg(hwdev, sg, nents, direction);
9416 + } else {
9417 + for (i = 0; i < nents; i++) {
9418 + sg[i].dma_address =
9419 + page_to_bus(sg[i].page) + sg[i].offset;
9420 + sg[i].dma_length = sg[i].length;
9421 + BUG_ON(!sg[i].page);
9422 + IOMMU_BUG_ON(address_needs_mapping(
9423 + hwdev, sg[i].dma_address));
9424 + }
9425 + rc = nents;
9426 + }
9427 +
9428 + flush_write_buffers();
9429 + return rc;
9430 +}
9431 +EXPORT_SYMBOL(dma_map_sg);
9432 +
9433 +void
9434 +dma_unmap_sg(struct device *hwdev, struct scatterlist *sg, int nents,
9435 + enum dma_data_direction direction)
9436 +{
9437 + BUG_ON(direction == DMA_NONE);
9438 + if (swiotlb)
9439 + swiotlb_unmap_sg(hwdev, sg, nents, direction);
9440 +}
9441 +EXPORT_SYMBOL(dma_unmap_sg);
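Editorial aside, not part of the patch: a hypothetical driver-side fragment showing how the scatter-gather helpers above are typically used. The function name and the device-programming step are made up, and the usual driver includes are assumed.

/* Illustrative fragment only; assumes a driver that already owns a
 * populated scatterlist.  Maps the list for device reads, hands the
 * resulting bus addresses to the hardware, then unmaps. */
#include <linux/pci.h>
#include <linux/dma-mapping.h>

static int example_xmit_sg(struct device *dev, struct scatterlist *sg, int nents)
{
	int i, mapped;

	mapped = dma_map_sg(dev, sg, nents, DMA_TO_DEVICE);
	if (mapped == 0)
		return -EIO;		/* nothing could be mapped */

	for (i = 0; i < mapped; i++) {
		/* program the device with sg[i].dma_address / sg[i].dma_length */
	}

	dma_unmap_sg(dev, sg, mapped, DMA_TO_DEVICE);
	return 0;
}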
9442 +
9443 +/*
9444 + * XXX This file is also used by xenLinux/ia64.
9445 + * "defined(__i386__) || defined (__x86_64__)" means "!defined(__ia64__)".
9446 + * This #if workaround should be removed once this file is merged back into
9447 + * i386's pci-dma or is moved to drivers/xen/core.
9448 + */
9449 +#if defined(__i386__) || defined(__x86_64__)
9450 +dma_addr_t
9451 +dma_map_page(struct device *dev, struct page *page, unsigned long offset,
9452 + size_t size, enum dma_data_direction direction)
9453 +{
9454 + dma_addr_t dma_addr;
9455 +
9456 + BUG_ON(direction == DMA_NONE);
9457 +
9458 + if (swiotlb) {
9459 + dma_addr = swiotlb_map_page(
9460 + dev, page, offset, size, direction);
9461 + } else {
9462 + dma_addr = page_to_bus(page) + offset;
9463 + IOMMU_BUG_ON(address_needs_mapping(dev, dma_addr));
9464 + }
9465 +
9466 + return dma_addr;
9467 +}
9468 +EXPORT_SYMBOL(dma_map_page);
9469 +
9470 +void
9471 +dma_unmap_page(struct device *dev, dma_addr_t dma_address, size_t size,
9472 + enum dma_data_direction direction)
9473 +{
9474 + BUG_ON(direction == DMA_NONE);
9475 + if (swiotlb)
9476 + swiotlb_unmap_page(dev, dma_address, size, direction);
9477 +}
9478 +EXPORT_SYMBOL(dma_unmap_page);
9479 +#endif /* defined(__i386__) || defined(__x86_64__) */
9480 +
9481 +int
9482 +dma_mapping_error(dma_addr_t dma_addr)
9483 +{
9484 + if (swiotlb)
9485 + return swiotlb_dma_mapping_error(dma_addr);
9486 + return 0;
9487 +}
9488 +EXPORT_SYMBOL(dma_mapping_error);
9489 +
9490 +int
9491 +dma_supported(struct device *dev, u64 mask)
9492 +{
9493 + if (swiotlb)
9494 + return swiotlb_dma_supported(dev, mask);
9495 + /*
9496 + * By default we'll BUG when an infeasible DMA is requested, and
9497 + * request swiotlb=force (see IOMMU_BUG_ON).
9498 + */
9499 + return 1;
9500 +}
9501 +EXPORT_SYMBOL(dma_supported);
9502 +
9503 +void *dma_alloc_coherent(struct device *dev, size_t size,
9504 + dma_addr_t *dma_handle, gfp_t gfp)
9505 +{
9506 + void *ret;
9507 + struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
9508 + unsigned int order = get_order(size);
9509 + unsigned long vstart;
9510 + /* ignore region specifiers */
9511 + gfp &= ~(__GFP_DMA | __GFP_HIGHMEM);
9512 +
9513 + if (mem) {
9514 + int page = bitmap_find_free_region(mem->bitmap, mem->size,
9515 + order);
9516 + if (page >= 0) {
9517 + *dma_handle = mem->device_base + (page << PAGE_SHIFT);
9518 + ret = mem->virt_base + (page << PAGE_SHIFT);
9519 + memset(ret, 0, size);
9520 + return ret;
9521 + }
9522 + if (mem->flags & DMA_MEMORY_EXCLUSIVE)
9523 + return NULL;
9524 + }
9525 +
9526 + if (dev == NULL || (dev->coherent_dma_mask < 0xffffffff))
9527 + gfp |= GFP_DMA;
9528 +
9529 + vstart = __get_free_pages(gfp, order);
9530 + ret = (void *)vstart;
9531 +
9532 + if (ret != NULL) {
9533 + if (xen_create_contiguous_region(vstart, order,
9534 + dma_bits) != 0) {
9535 + free_pages(vstart, order);
9536 + return NULL;
9537 + }
9538 + memset(ret, 0, size);
9539 + *dma_handle = virt_to_bus(ret);
9540 + }
9541 + return ret;
9542 +}
9543 +EXPORT_SYMBOL(dma_alloc_coherent);
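Editorial aside, not part of the patch: a hypothetical fragment using the coherent allocator above, for example for a one-page descriptor ring. The function name and the size are made up.

/* Illustrative fragment only.  Allocates a machine-contiguous buffer
 * (Xen re-assembles it via xen_create_contiguous_region() above) and
 * releases it again. */
#include <linux/device.h>
#include <linux/dma-mapping.h>

static int example_alloc_ring(struct device *dev)
{
	dma_addr_t ring_bus;
	void *ring = dma_alloc_coherent(dev, 4096, &ring_bus, GFP_KERNEL);

	if (!ring)
		return -ENOMEM;
	/* point the device at ring_bus, access 'ring' from the CPU ... */
	dma_free_coherent(dev, 4096, ring, ring_bus);
	return 0;
}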
9544 +
9545 +void dma_free_coherent(struct device *dev, size_t size,
9546 + void *vaddr, dma_addr_t dma_handle)
9547 +{
9548 + struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
9549 + int order = get_order(size);
9550 +
9551 + if (mem && vaddr >= mem->virt_base && vaddr < (mem->virt_base + (mem->size << PAGE_SHIFT))) {
9552 + int page = (vaddr - mem->virt_base) >> PAGE_SHIFT;
9553 +
9554 + bitmap_release_region(mem->bitmap, page, order);
9555 + } else {
9556 + xen_destroy_contiguous_region((unsigned long)vaddr, order);
9557 + free_pages((unsigned long)vaddr, order);
9558 + }
9559 +}
9560 +EXPORT_SYMBOL(dma_free_coherent);
9561 +
9562 +#ifdef ARCH_HAS_DMA_DECLARE_COHERENT_MEMORY
9563 +int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr,
9564 + dma_addr_t device_addr, size_t size, int flags)
9565 +{
9566 + void __iomem *mem_base;
9567 + int pages = size >> PAGE_SHIFT;
9568 + int bitmap_size = (pages + 31)/32;
9569 +
9570 + if ((flags & (DMA_MEMORY_MAP | DMA_MEMORY_IO)) == 0)
9571 + goto out;
9572 + if (!size)
9573 + goto out;
9574 + if (dev->dma_mem)
9575 + goto out;
9576 +
9577 + /* FIXME: this routine just ignores DMA_MEMORY_INCLUDES_CHILDREN */
9578 +
9579 + mem_base = ioremap(bus_addr, size);
9580 + if (!mem_base)
9581 + goto out;
9582 +
9583 + dev->dma_mem = kmalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL);
9584 + if (!dev->dma_mem)
9585 + goto out;
9586 + memset(dev->dma_mem, 0, sizeof(struct dma_coherent_mem));
9587 + dev->dma_mem->bitmap = kmalloc(bitmap_size, GFP_KERNEL);
9588 + if (!dev->dma_mem->bitmap)
9589 + goto free1_out;
9590 + memset(dev->dma_mem->bitmap, 0, bitmap_size);
9591 +
9592 + dev->dma_mem->virt_base = mem_base;
9593 + dev->dma_mem->device_base = device_addr;
9594 + dev->dma_mem->size = pages;
9595 + dev->dma_mem->flags = flags;
9596 +
9597 + if (flags & DMA_MEMORY_MAP)
9598 + return DMA_MEMORY_MAP;
9599 +
9600 + return DMA_MEMORY_IO;
9601 +
9602 + free1_out:
9603 + kfree(dev->dma_mem->bitmap);
9604 + out:
9605 + return 0;
9606 +}
9607 +EXPORT_SYMBOL(dma_declare_coherent_memory);
9608 +
9609 +void dma_release_declared_memory(struct device *dev)
9610 +{
9611 + struct dma_coherent_mem *mem = dev->dma_mem;
9612 +
9613 + if (!mem)
9614 + return;
9615 + dev->dma_mem = NULL;
9616 + iounmap(mem->virt_base);
9617 + kfree(mem->bitmap);
9618 + kfree(mem);
9619 +}
9620 +EXPORT_SYMBOL(dma_release_declared_memory);
9621 +
9622 +void *dma_mark_declared_memory_occupied(struct device *dev,
9623 + dma_addr_t device_addr, size_t size)
9624 +{
9625 + struct dma_coherent_mem *mem = dev->dma_mem;
9626 + int pages = (size + (device_addr & ~PAGE_MASK) + PAGE_SIZE - 1) >> PAGE_SHIFT;
9627 + int pos, err;
9628 +
9629 + if (!mem)
9630 + return ERR_PTR(-EINVAL);
9631 +
9632 + pos = (device_addr - mem->device_base) >> PAGE_SHIFT;
9633 + err = bitmap_allocate_region(mem->bitmap, pos, get_order(pages));
9634 + if (err != 0)
9635 + return ERR_PTR(err);
9636 + return mem->virt_base + (pos << PAGE_SHIFT);
9637 +}
9638 +EXPORT_SYMBOL(dma_mark_declared_memory_occupied);
9639 +#endif /* ARCH_HAS_DMA_DECLARE_COHERENT_MEMORY */
9640 +
9641 +dma_addr_t
9642 +dma_map_single(struct device *dev, void *ptr, size_t size,
9643 + enum dma_data_direction direction)
9644 +{
9645 + dma_addr_t dma;
9646 +
9647 + if (direction == DMA_NONE)
9648 + BUG();
9649 + WARN_ON(size == 0);
9650 +
9651 + if (swiotlb) {
9652 + dma = swiotlb_map_single(dev, ptr, size, direction);
9653 + } else {
9654 + dma = virt_to_bus(ptr);
9655 + IOMMU_BUG_ON(range_straddles_page_boundary(ptr, size));
9656 + IOMMU_BUG_ON(address_needs_mapping(dev, dma));
9657 + }
9658 +
9659 + flush_write_buffers();
9660 + return dma;
9661 +}
9662 +EXPORT_SYMBOL(dma_map_single);
9663 +
9664 +void
9665 +dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size,
9666 + enum dma_data_direction direction)
9667 +{
9668 + if (direction == DMA_NONE)
9669 + BUG();
9670 + if (swiotlb)
9671 + swiotlb_unmap_single(dev, dma_addr, size, direction);
9672 +}
9673 +EXPORT_SYMBOL(dma_unmap_single);
9674 +
9675 +void
9676 +dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size,
9677 + enum dma_data_direction direction)
9678 +{
9679 + if (swiotlb)
9680 + swiotlb_sync_single_for_cpu(dev, dma_handle, size, direction);
9681 +}
9682 +EXPORT_SYMBOL(dma_sync_single_for_cpu);
9683 +
9684 +void
9685 +dma_sync_single_for_device(struct device *dev, dma_addr_t dma_handle, size_t size,
9686 + enum dma_data_direction direction)
9687 +{
9688 + if (swiotlb)
9689 + swiotlb_sync_single_for_device(dev, dma_handle, size, direction);
9690 +}
9691 +EXPORT_SYMBOL(dma_sync_single_for_device);
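Editorial aside, not part of the patch: a hypothetical fragment combining the streaming single-buffer mapping with the sync helpers defined above; names are made up and the device transfer itself is elided. The sync calls only matter when swiotlb bounce buffering is active, which is exactly the case these wrappers handle.

/* Illustrative fragment only.  Streams one buffer to the device and reads
 * status back through the same mapping. */
#include <linux/device.h>
#include <linux/dma-mapping.h>

static void example_single_map(struct device *dev, void *buf, size_t len)
{
	dma_addr_t bus = dma_map_single(dev, buf, len, DMA_BIDIRECTIONAL);

	if (dma_mapping_error(bus))
		return;

	/* device DMA happens here ... */

	dma_sync_single_for_cpu(dev, bus, len, DMA_BIDIRECTIONAL);
	/* CPU may now look at buf ... */
	dma_sync_single_for_device(dev, bus, len, DMA_BIDIRECTIONAL);

	dma_unmap_single(dev, bus, len, DMA_BIDIRECTIONAL);
}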
9692 diff -Nur linux-2.6.16.33-noxen/arch/i386/kernel/process-xen.c linux-2.6.16.33/arch/i386/kernel/process-xen.c
9693 --- linux-2.6.16.33-noxen/arch/i386/kernel/process-xen.c 1970-01-01 00:00:00.000000000 +0000
9694 +++ linux-2.6.16.33/arch/i386/kernel/process-xen.c 2007-01-08 15:00:45.000000000 +0000
9695 @@ -0,0 +1,821 @@
9696 +/*
9697 + * linux/arch/i386/kernel/process.c
9698 + *
9699 + * Copyright (C) 1995 Linus Torvalds
9700 + *
9701 + * Pentium III FXSR, SSE support
9702 + * Gareth Hughes <gareth@valinux.com>, May 2000
9703 + */
9704 +
9705 +/*
9706 + * This file handles the architecture-dependent parts of process handling..
9707 + */
9708 +
9709 +#include <stdarg.h>
9710 +
9711 +#include <linux/cpu.h>
9712 +#include <linux/errno.h>
9713 +#include <linux/sched.h>
9714 +#include <linux/fs.h>
9715 +#include <linux/kernel.h>
9716 +#include <linux/mm.h>
9717 +#include <linux/elfcore.h>
9718 +#include <linux/smp.h>
9719 +#include <linux/smp_lock.h>
9720 +#include <linux/stddef.h>
9721 +#include <linux/slab.h>
9722 +#include <linux/vmalloc.h>
9723 +#include <linux/user.h>
9724 +#include <linux/a.out.h>
9725 +#include <linux/interrupt.h>
9726 +#include <linux/config.h>
9727 +#include <linux/utsname.h>
9728 +#include <linux/delay.h>
9729 +#include <linux/reboot.h>
9730 +#include <linux/init.h>
9731 +#include <linux/mc146818rtc.h>
9732 +#include <linux/module.h>
9733 +#include <linux/kallsyms.h>
9734 +#include <linux/ptrace.h>
9735 +#include <linux/random.h>
9736 +#include <linux/kprobes.h>
9737 +
9738 +#include <asm/uaccess.h>
9739 +#include <asm/pgtable.h>
9740 +#include <asm/system.h>
9741 +#include <asm/io.h>
9742 +#include <asm/ldt.h>
9743 +#include <asm/processor.h>
9744 +#include <asm/i387.h>
9745 +#include <asm/desc.h>
9746 +#include <asm/vm86.h>
9747 +#ifdef CONFIG_MATH_EMULATION
9748 +#include <asm/math_emu.h>
9749 +#endif
9750 +
9751 +#include <xen/interface/physdev.h>
9752 +#include <xen/interface/vcpu.h>
9753 +#include <xen/cpu_hotplug.h>
9754 +
9755 +#include <linux/err.h>
9756 +
9757 +#include <asm/tlbflush.h>
9758 +#include <asm/cpu.h>
9759 +
9760 +#include <asm/tlbflush.h>
9761 +#include <asm/cpu.h>
9762 +
9763 +asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
9764 +
9765 +static int hlt_counter;
9766 +
9767 +unsigned long boot_option_idle_override = 0;
9768 +EXPORT_SYMBOL(boot_option_idle_override);
9769 +
9770 +/*
9771 + * Return saved PC of a blocked thread.
9772 + */
9773 +unsigned long thread_saved_pc(struct task_struct *tsk)
9774 +{
9775 + return ((unsigned long *)tsk->thread.esp)[3];
9776 +}
9777 +
9778 +/*
9779 + * Power management idle function, if any.
9780 + */
9781 +void (*pm_idle)(void);
9782 +EXPORT_SYMBOL(pm_idle);
9783 +static DEFINE_PER_CPU(unsigned int, cpu_idle_state);
9784 +
9785 +void disable_hlt(void)
9786 +{
9787 + hlt_counter++;
9788 +}
9789 +
9790 +EXPORT_SYMBOL(disable_hlt);
9791 +
9792 +void enable_hlt(void)
9793 +{
9794 + hlt_counter--;
9795 +}
9796 +
9797 +EXPORT_SYMBOL(enable_hlt);
9798 +
9799 +/* XXX XEN doesn't use default_idle(), poll_idle(). Use xen_idle() instead. */
9800 +void xen_idle(void)
9801 +{
9802 + local_irq_disable();
9803 +
9804 + if (need_resched())
9805 + local_irq_enable();
9806 + else {
9807 + clear_thread_flag(TIF_POLLING_NRFLAG);
9808 + smp_mb__after_clear_bit();
9809 + safe_halt();
9810 + set_thread_flag(TIF_POLLING_NRFLAG);
9811 + }
9812 +}
9813 +#ifdef CONFIG_APM_MODULE
9814 +EXPORT_SYMBOL(default_idle);
9815 +#endif
9816 +
9817 +#ifdef CONFIG_HOTPLUG_CPU
9818 +extern cpumask_t cpu_initialized;
9819 +static inline void play_dead(void)
9820 +{
9821 + idle_task_exit();
9822 + local_irq_disable();
9823 + cpu_clear(smp_processor_id(), cpu_initialized);
9824 + preempt_enable_no_resched();
9825 + HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL);
9826 + cpu_bringup();
9827 +}
9828 +#else
9829 +static inline void play_dead(void)
9830 +{
9831 + BUG();
9832 +}
9833 +#endif /* CONFIG_HOTPLUG_CPU */
9834 +
9835 +/*
9836 + * The idle thread. There's no useful work to be
9837 + * done, so just try to conserve power and have a
9838 + * low exit latency (ie sit in a loop waiting for
9839 + * somebody to say that they'd like to reschedule)
9840 + */
9841 +void cpu_idle(void)
9842 +{
9843 + int cpu = smp_processor_id();
9844 +
9845 + set_thread_flag(TIF_POLLING_NRFLAG);
9846 +
9847 + /* endless idle loop with no priority at all */
9848 + while (1) {
9849 + while (!need_resched()) {
9850 +
9851 + if (__get_cpu_var(cpu_idle_state))
9852 + __get_cpu_var(cpu_idle_state) = 0;
9853 +
9854 + rmb();
9855 +
9856 + if (cpu_is_offline(cpu))
9857 + play_dead();
9858 +
9859 + __get_cpu_var(irq_stat).idle_timestamp = jiffies;
9860 + xen_idle();
9861 + }
9862 + preempt_enable_no_resched();
9863 + schedule();
9864 + preempt_disable();
9865 + }
9866 +}
9867 +
9868 +void cpu_idle_wait(void)
9869 +{
9870 + unsigned int cpu, this_cpu = get_cpu();
9871 + cpumask_t map;
9872 +
9873 + set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
9874 + put_cpu();
9875 +
9876 + cpus_clear(map);
9877 + for_each_online_cpu(cpu) {
9878 + per_cpu(cpu_idle_state, cpu) = 1;
9879 + cpu_set(cpu, map);
9880 + }
9881 +
9882 + __get_cpu_var(cpu_idle_state) = 0;
9883 +
9884 + wmb();
9885 + do {
9886 + ssleep(1);
9887 + for_each_online_cpu(cpu) {
9888 + if (cpu_isset(cpu, map) && !per_cpu(cpu_idle_state, cpu))
9889 + cpu_clear(cpu, map);
9890 + }
9891 + cpus_and(map, map, cpu_online_map);
9892 + } while (!cpus_empty(map));
9893 +}
9894 +EXPORT_SYMBOL_GPL(cpu_idle_wait);
9895 +
9896 +/* XXX XEN doesn't use mwait_idle(), select_idle_routine(), idle_setup(). */
9897 +/* Always use xen_idle() instead. */
9898 +void __devinit select_idle_routine(const struct cpuinfo_x86 *c) {}
9899 +
9900 +void show_regs(struct pt_regs * regs)
9901 +{
9902 + unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L;
9903 +
9904 + printk("\n");
9905 + printk("Pid: %d, comm: %20s\n", current->pid, current->comm);
9906 + printk("EIP: %04x:[<%08lx>] CPU: %d\n",0xffff & regs->xcs,regs->eip, smp_processor_id());
9907 + print_symbol("EIP is at %s\n", regs->eip);
9908 +
9909 + if (user_mode(regs))
9910 + printk(" ESP: %04x:%08lx",0xffff & regs->xss,regs->esp);
9911 + printk(" EFLAGS: %08lx %s (%s %.*s)\n",
9912 + regs->eflags, print_tainted(), system_utsname.release,
9913 + (int)strcspn(system_utsname.version, " "),
9914 + system_utsname.version);
9915 + printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n",
9916 + regs->eax,regs->ebx,regs->ecx,regs->edx);
9917 + printk("ESI: %08lx EDI: %08lx EBP: %08lx",
9918 + regs->esi, regs->edi, regs->ebp);
9919 + printk(" DS: %04x ES: %04x\n",
9920 + 0xffff & regs->xds,0xffff & regs->xes);
9921 +
9922 + cr0 = read_cr0();
9923 + cr2 = read_cr2();
9924 + cr3 = read_cr3();
9925 + cr4 = read_cr4_safe();
9926 + printk("CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", cr0, cr2, cr3, cr4);
9927 + show_trace(NULL, &regs->esp);
9928 +}
9929 +
9930 +/*
9931 + * This gets run with %ebx containing the
9932 + * function to call, and %edx containing
9933 + * the "args".
9934 + */
9935 +extern void kernel_thread_helper(void);
9936 +__asm__(".section .text\n"
9937 + ".align 4\n"
9938 + "kernel_thread_helper:\n\t"
9939 + "movl %edx,%eax\n\t"
9940 + "pushl %edx\n\t"
9941 + "call *%ebx\n\t"
9942 + "pushl %eax\n\t"
9943 + "call do_exit\n"
9944 + ".previous");
9945 +
9946 +/*
9947 + * Create a kernel thread
9948 + */
9949 +int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
9950 +{
9951 + struct pt_regs regs;
9952 +
9953 + memset(&regs, 0, sizeof(regs));
9954 +
9955 + regs.ebx = (unsigned long) fn;
9956 + regs.edx = (unsigned long) arg;
9957 +
9958 + regs.xds = __USER_DS;
9959 + regs.xes = __USER_DS;
9960 + regs.orig_eax = -1;
9961 + regs.eip = (unsigned long) kernel_thread_helper;
9962 + regs.xcs = GET_KERNEL_CS();
9963 + regs.eflags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2;
9964 +
9965 + /* Ok, create the new process.. */
9966 + return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, &regs, 0, NULL, NULL);
9967 +}
9968 +EXPORT_SYMBOL(kernel_thread);
9969 +
9970 +/*
9971 + * Free current thread data structures etc..
9972 + */
9973 +void exit_thread(void)
9974 +{
9975 + struct task_struct *tsk = current;
9976 + struct thread_struct *t = &tsk->thread;
9977 +
9978 + /*
9979 + * Remove function-return probe instances associated with this task
9980 + * and put them back on the free list. Do not insert an exit probe for
9981 + * this function, it will be disabled by kprobe_flush_task if you do.
9982 + */
9983 + kprobe_flush_task(tsk);
9984 +
9985 + /* The process may have allocated an io port bitmap... nuke it. */
9986 + if (unlikely(NULL != t->io_bitmap_ptr)) {
9987 + struct physdev_set_iobitmap set_iobitmap = { 0 };
9988 + HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap, &set_iobitmap);
9989 + kfree(t->io_bitmap_ptr);
9990 + t->io_bitmap_ptr = NULL;
9991 + }
9992 +}
9993 +
9994 +void flush_thread(void)
9995 +{
9996 + struct task_struct *tsk = current;
9997 +
9998 + memset(tsk->thread.debugreg, 0, sizeof(unsigned long)*8);
9999 + memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
10000 + /*
10001 + * Forget coprocessor state..
10002 + */
10003 + clear_fpu(tsk);
10004 + clear_used_math();
10005 +}
10006 +
10007 +void release_thread(struct task_struct *dead_task)
10008 +{
10009 + BUG_ON(dead_task->mm);
10010 + release_vm86_irqs(dead_task);
10011 +}
10012 +
10013 +/*
10014 + * This gets called before we allocate a new thread and copy
10015 + * the current task into it.
10016 + */
10017 +void prepare_to_copy(struct task_struct *tsk)
10018 +{
10019 + unlazy_fpu(tsk);
10020 +}
10021 +
10022 +int copy_thread(int nr, unsigned long clone_flags, unsigned long esp,
10023 + unsigned long unused,
10024 + struct task_struct * p, struct pt_regs * regs)
10025 +{
10026 + struct pt_regs * childregs;
10027 + struct task_struct *tsk;
10028 + int err;
10029 +
10030 + childregs = task_pt_regs(p);
10031 + *childregs = *regs;
10032 + childregs->eax = 0;
10033 + childregs->esp = esp;
10034 +
10035 + p->thread.esp = (unsigned long) childregs;
10036 + p->thread.esp0 = (unsigned long) (childregs+1);
10037 +
10038 + p->thread.eip = (unsigned long) ret_from_fork;
10039 +
10040 + savesegment(fs,p->thread.fs);
10041 + savesegment(gs,p->thread.gs);
10042 +
10043 + tsk = current;
10044 + if (unlikely(NULL != tsk->thread.io_bitmap_ptr)) {
10045 + p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
10046 + if (!p->thread.io_bitmap_ptr) {
10047 + p->thread.io_bitmap_max = 0;
10048 + return -ENOMEM;
10049 + }
10050 + memcpy(p->thread.io_bitmap_ptr, tsk->thread.io_bitmap_ptr,
10051 + IO_BITMAP_BYTES);
10052 + }
10053 +
10054 + /*
10055 + * Set a new TLS for the child thread?
10056 + */
10057 + if (clone_flags & CLONE_SETTLS) {
10058 + struct desc_struct *desc;
10059 + struct user_desc info;
10060 + int idx;
10061 +
10062 + err = -EFAULT;
10063 + if (copy_from_user(&info, (void __user *)childregs->esi, sizeof(info)))
10064 + goto out;
10065 + err = -EINVAL;
10066 + if (LDT_empty(&info))
10067 + goto out;
10068 +
10069 + idx = info.entry_number;
10070 + if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
10071 + goto out;
10072 +
10073 + desc = p->thread.tls_array + idx - GDT_ENTRY_TLS_MIN;
10074 + desc->a = LDT_entry_a(&info);
10075 + desc->b = LDT_entry_b(&info);
10076 + }
10077 +
10078 + p->thread.iopl = current->thread.iopl;
10079 +
10080 + err = 0;
10081 + out:
10082 + if (err && p->thread.io_bitmap_ptr) {
10083 + kfree(p->thread.io_bitmap_ptr);
10084 + p->thread.io_bitmap_max = 0;
10085 + }
10086 + return err;
10087 +}
10088 +
10089 +/*
10090 + * fill in the user structure for a core dump..
10091 + */
10092 +void dump_thread(struct pt_regs * regs, struct user * dump)
10093 +{
10094 + int i;
10095 +
10096 +/* changed the size calculations - should hopefully work better. lbt */
10097 + dump->magic = CMAGIC;
10098 + dump->start_code = 0;
10099 + dump->start_stack = regs->esp & ~(PAGE_SIZE - 1);
10100 + dump->u_tsize = ((unsigned long) current->mm->end_code) >> PAGE_SHIFT;
10101 + dump->u_dsize = ((unsigned long) (current->mm->brk + (PAGE_SIZE-1))) >> PAGE_SHIFT;
10102 + dump->u_dsize -= dump->u_tsize;
10103 + dump->u_ssize = 0;
10104 + for (i = 0; i < 8; i++)
10105 + dump->u_debugreg[i] = current->thread.debugreg[i];
10106 +
10107 + if (dump->start_stack < TASK_SIZE)
10108 + dump->u_ssize = ((unsigned long) (TASK_SIZE - dump->start_stack)) >> PAGE_SHIFT;
10109 +
10110 + dump->regs.ebx = regs->ebx;
10111 + dump->regs.ecx = regs->ecx;
10112 + dump->regs.edx = regs->edx;
10113 + dump->regs.esi = regs->esi;
10114 + dump->regs.edi = regs->edi;
10115 + dump->regs.ebp = regs->ebp;
10116 + dump->regs.eax = regs->eax;
10117 + dump->regs.ds = regs->xds;
10118 + dump->regs.es = regs->xes;
10119 + savesegment(fs,dump->regs.fs);
10120 + savesegment(gs,dump->regs.gs);
10121 + dump->regs.orig_eax = regs->orig_eax;
10122 + dump->regs.eip = regs->eip;
10123 + dump->regs.cs = regs->xcs;
10124 + dump->regs.eflags = regs->eflags;
10125 + dump->regs.esp = regs->esp;
10126 + dump->regs.ss = regs->xss;
10127 +
10128 + dump->u_fpvalid = dump_fpu (regs, &dump->i387);
10129 +}
10130 +EXPORT_SYMBOL(dump_thread);
10131 +
10132 +/*
10133 + * Capture the user space registers if the task is not running (in user space)
10134 + */
10135 +int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs)
10136 +{
10137 + struct pt_regs ptregs = *task_pt_regs(tsk);
10138 + ptregs.xcs &= 0xffff;
10139 + ptregs.xds &= 0xffff;
10140 + ptregs.xes &= 0xffff;
10141 + ptregs.xss &= 0xffff;
10142 +
10143 + elf_core_copy_regs(regs, &ptregs);
10144 +
10145 + return 1;
10146 +}
10147 +
10148 +/*
10149 + * This function selects if the context switch from prev to next
10150 + * has to tweak the TSC disable bit in the cr4.
10151 + */
10152 +static inline void disable_tsc(struct task_struct *prev_p,
10153 + struct task_struct *next_p)
10154 +{
10155 + struct thread_info *prev, *next;
10156 +
10157 + /*
10158 + * gcc should eliminate the ->thread_info dereference if
10159 + * has_secure_computing returns 0 at compile time (SECCOMP=n).
10160 + */
10161 + prev = task_thread_info(prev_p);
10162 + next = task_thread_info(next_p);
10163 +
10164 + if (has_secure_computing(prev) || has_secure_computing(next)) {
10165 + /* slow path here */
10166 + if (has_secure_computing(prev) &&
10167 + !has_secure_computing(next)) {
10168 + write_cr4(read_cr4() & ~X86_CR4_TSD);
10169 + } else if (!has_secure_computing(prev) &&
10170 + has_secure_computing(next))
10171 + write_cr4(read_cr4() | X86_CR4_TSD);
10172 + }
10173 +}
10174 +
10175 +/*
10176 + * switch_to(x,y) should switch tasks from x to y.
10177 + *
10178 + * We fsave/fwait so that an exception goes off at the right time
10179 + * (as a call from the fsave or fwait in effect) rather than to
10180 + * the wrong process. Lazy FP saving no longer makes any sense
10181 + * with modern CPUs, and this simplifies a lot of things (SMP
10182 + * and UP become the same).
10183 + *
10184 + * NOTE! We used to use the x86 hardware context switching. The
10185 + * reason for not using it any more becomes apparent when you
10186 + * try to recover gracefully from saved state that is no longer
10187 + * valid (stale segment register values in particular). With the
10188 + * hardware task-switch, there is no way to fix up bad state in
10189 + * a reasonable manner.
10190 + *
10191 + * The fact that Intel documents the hardware task-switching to
10192 + * be slow is a fairly red herring - this code is not noticeably
10193 + * faster. However, there _is_ some room for improvement here,
10194 + * so the performance issues may eventually be a valid point.
10195 + * More important, however, is the fact that this allows us much
10196 + * more flexibility.
10197 + *
10198 + * The return value (in %eax) will be the "prev" task after
10199 + * the task-switch, and shows up in ret_from_fork in entry.S,
10200 + * for example.
10201 + */
10202 +struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
10203 +{
10204 + struct thread_struct *prev = &prev_p->thread,
10205 + *next = &next_p->thread;
10206 + int cpu = smp_processor_id();
10207 +#ifndef CONFIG_X86_NO_TSS
10208 + struct tss_struct *tss = &per_cpu(init_tss, cpu);
10209 +#endif
10210 + struct physdev_set_iopl iopl_op;
10211 + struct physdev_set_iobitmap iobmp_op;
10212 + multicall_entry_t _mcl[8], *mcl = _mcl;
10213 +
10214 + /* XEN NOTE: FS/GS saved in switch_mm(), not here. */
10215 +
10216 + /*
10217 + * This is basically '__unlazy_fpu', except that we queue a
10218 + * multicall to indicate FPU task switch, rather than
10219 + * synchronously trapping to Xen.
10220 + */
10221 + if (prev_p->thread_info->status & TS_USEDFPU) {
10222 + __save_init_fpu(prev_p); /* _not_ save_init_fpu() */
10223 + mcl->op = __HYPERVISOR_fpu_taskswitch;
10224 + mcl->args[0] = 1;
10225 + mcl++;
10226 + }
10227 +#if 0 /* lazy fpu sanity check */
10228 + else BUG_ON(!(read_cr0() & 8));
10229 +#endif
10230 +
10231 + /*
10232 + * Reload esp0.
10233 + * This is load_esp0(tss, next) with a multicall.
10234 + */
10235 + mcl->op = __HYPERVISOR_stack_switch;
10236 + mcl->args[0] = __KERNEL_DS;
10237 + mcl->args[1] = next->esp0;
10238 + mcl++;
10239 +
10240 + /*
10241 + * Load the per-thread Thread-Local Storage descriptor.
10242 + * This is load_TLS(next, cpu) with multicalls.
10243 + */
10244 +#define C(i) do { \
10245 + if (unlikely(next->tls_array[i].a != prev->tls_array[i].a || \
10246 + next->tls_array[i].b != prev->tls_array[i].b)) { \
10247 + mcl->op = __HYPERVISOR_update_descriptor; \
10248 + *(u64 *)&mcl->args[0] = virt_to_machine( \
10249 + &get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i]);\
10250 + *(u64 *)&mcl->args[2] = *(u64 *)&next->tls_array[i]; \
10251 + mcl++; \
10252 + } \
10253 +} while (0)
10254 + C(0); C(1); C(2);
10255 +#undef C
10256 +
10257 + if (unlikely(prev->iopl != next->iopl)) {
10258 + iopl_op.iopl = (next->iopl == 0) ? 1 : (next->iopl >> 12) & 3;
10259 + mcl->op = __HYPERVISOR_physdev_op;
10260 + mcl->args[0] = PHYSDEVOP_set_iopl;
10261 + mcl->args[1] = (unsigned long)&iopl_op;
10262 + mcl++;
10263 + }
10264 +
10265 + if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr)) {
10266 + iobmp_op.bitmap = (char *)next->io_bitmap_ptr;
10267 + iobmp_op.nr_ports = next->io_bitmap_ptr ? IO_BITMAP_BITS : 0;
10268 + mcl->op = __HYPERVISOR_physdev_op;
10269 + mcl->args[0] = PHYSDEVOP_set_iobitmap;
10270 + mcl->args[1] = (unsigned long)&iobmp_op;
10271 + mcl++;
10272 + }
10273 +
10274 + (void)HYPERVISOR_multicall(_mcl, mcl - _mcl);
10275 +
10276 + /*
10277 + * Restore %fs and %gs if needed.
10278 + *
10279 + * Glibc normally makes %fs be zero, and %gs is one of
10280 + * the TLS segments.
10281 + */
10282 + if (unlikely(next->fs))
10283 + loadsegment(fs, next->fs);
10284 +
10285 + if (next->gs)
10286 + loadsegment(gs, next->gs);
10287 +
10288 + /*
10289 + * Now maybe reload the debug registers
10290 + */
10291 + if (unlikely(next->debugreg[7])) {
10292 + set_debugreg(next->debugreg[0], 0);
10293 + set_debugreg(next->debugreg[1], 1);
10294 + set_debugreg(next->debugreg[2], 2);
10295 + set_debugreg(next->debugreg[3], 3);
10296 + /* no 4 and 5 */
10297 + set_debugreg(next->debugreg[6], 6);
10298 + set_debugreg(next->debugreg[7], 7);
10299 + }
10300 +
10301 + disable_tsc(prev_p, next_p);
10302 +
10303 + return prev_p;
10304 +}
10305 +
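Editorial aside, not part of the patch: the _mcl[] batching in __switch_to() generalizes to any sequence of hypercalls. A minimal, hypothetical sketch that queues an FPU task switch and a stack switch and flushes both with one HYPERVISOR_multicall; it assumes the Xen hypercall headers already pulled in by this file, and the function name is made up.

/* Illustrative fragment only: queue two operations and issue them with a
 * single hypercall, mirroring the _mcl[] usage in __switch_to() above. */
static void example_multicall(unsigned long new_esp0)
{
	multicall_entry_t mcl[2], *m = mcl;

	m->op = __HYPERVISOR_fpu_taskswitch;	/* set TS so the next FPU use traps */
	m->args[0] = 1;
	m++;

	m->op = __HYPERVISOR_stack_switch;	/* the multicall form of load_esp0() */
	m->args[0] = __KERNEL_DS;
	m->args[1] = new_esp0;
	m++;

	(void)HYPERVISOR_multicall(mcl, m - mcl);
}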
10306 +asmlinkage int sys_fork(struct pt_regs regs)
10307 +{
10308 + return do_fork(SIGCHLD, regs.esp, &regs, 0, NULL, NULL);
10309 +}
10310 +
10311 +asmlinkage int sys_clone(struct pt_regs regs)
10312 +{
10313 + unsigned long clone_flags;
10314 + unsigned long newsp;
10315 + int __user *parent_tidptr, *child_tidptr;
10316 +
10317 + clone_flags = regs.ebx;
10318 + newsp = regs.ecx;
10319 + parent_tidptr = (int __user *)regs.edx;
10320 + child_tidptr = (int __user *)regs.edi;
10321 + if (!newsp)
10322 + newsp = regs.esp;
10323 + return do_fork(clone_flags, newsp, &regs, 0, parent_tidptr, child_tidptr);
10324 +}
10325 +
10326 +/*
10327 + * This is trivial, and on the face of it looks like it
10328 + * could equally well be done in user mode.
10329 + *
10330 + * Not so, for quite unobvious reasons - register pressure.
10331 + * In user mode vfork() cannot have a stack frame, and if
10332 + * done by calling the "clone()" system call directly, you
10333 + * do not have enough call-clobbered registers to hold all
10334 + * the information you need.
10335 + */
10336 +asmlinkage int sys_vfork(struct pt_regs regs)
10337 +{
10338 + return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.esp, &regs, 0, NULL, NULL);
10339 +}
10340 +
10341 +/*
10342 + * sys_execve() executes a new program.
10343 + */
10344 +asmlinkage int sys_execve(struct pt_regs regs)
10345 +{
10346 + int error;
10347 + char * filename;
10348 +
10349 + filename = getname((char __user *) regs.ebx);
10350 + error = PTR_ERR(filename);
10351 + if (IS_ERR(filename))
10352 + goto out;
10353 + error = do_execve(filename,
10354 + (char __user * __user *) regs.ecx,
10355 + (char __user * __user *) regs.edx,
10356 + &regs);
10357 + if (error == 0) {
10358 + task_lock(current);
10359 + current->ptrace &= ~PT_DTRACE;
10360 + task_unlock(current);
10361 + /* Make sure we don't return using sysenter.. */
10362 + set_thread_flag(TIF_IRET);
10363 + }
10364 + putname(filename);
10365 +out:
10366 + return error;
10367 +}
10368 +
10369 +#define top_esp (THREAD_SIZE - sizeof(unsigned long))
10370 +#define top_ebp (THREAD_SIZE - 2*sizeof(unsigned long))
10371 +
10372 +unsigned long get_wchan(struct task_struct *p)
10373 +{
10374 + unsigned long ebp, esp, eip;
10375 + unsigned long stack_page;
10376 + int count = 0;
10377 + if (!p || p == current || p->state == TASK_RUNNING)
10378 + return 0;
10379 + stack_page = (unsigned long)task_stack_page(p);
10380 + esp = p->thread.esp;
10381 + if (!stack_page || esp < stack_page || esp > top_esp+stack_page)
10382 + return 0;
10383 + /* include/asm-i386/system.h:switch_to() pushes ebp last. */
10384 + ebp = *(unsigned long *) esp;
10385 + do {
10386 + if (ebp < stack_page || ebp > top_ebp+stack_page)
10387 + return 0;
10388 + eip = *(unsigned long *) (ebp+4);
10389 + if (!in_sched_functions(eip))
10390 + return eip;
10391 + ebp = *(unsigned long *) ebp;
10392 + } while (count++ < 16);
10393 + return 0;
10394 +}
10395 +EXPORT_SYMBOL(get_wchan);
10396 +
10397 +/*
10398 + * sys_alloc_thread_area: get a yet unused TLS descriptor index.
10399 + */
10400 +static int get_free_idx(void)
10401 +{
10402 + struct thread_struct *t = &current->thread;
10403 + int idx;
10404 +
10405 + for (idx = 0; idx < GDT_ENTRY_TLS_ENTRIES; idx++)
10406 + if (desc_empty(t->tls_array + idx))
10407 + return idx + GDT_ENTRY_TLS_MIN;
10408 + return -ESRCH;
10409 +}
10410 +
10411 +/*
10412 + * Set a given TLS descriptor:
10413 + */
10414 +asmlinkage int sys_set_thread_area(struct user_desc __user *u_info)
10415 +{
10416 + struct thread_struct *t = &current->thread;
10417 + struct user_desc info;
10418 + struct desc_struct *desc;
10419 + int cpu, idx;
10420 +
10421 + if (copy_from_user(&info, u_info, sizeof(info)))
10422 + return -EFAULT;
10423 + idx = info.entry_number;
10424 +
10425 + /*
10426 + * index -1 means the kernel should try to find and
10427 + * allocate an empty descriptor:
10428 + */
10429 + if (idx == -1) {
10430 + idx = get_free_idx();
10431 + if (idx < 0)
10432 + return idx;
10433 + if (put_user(idx, &u_info->entry_number))
10434 + return -EFAULT;
10435 + }
10436 +
10437 + if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
10438 + return -EINVAL;
10439 +
10440 + desc = t->tls_array + idx - GDT_ENTRY_TLS_MIN;
10441 +
10442 + /*
10443 + * We must not get preempted while modifying the TLS.
10444 + */
10445 + cpu = get_cpu();
10446 +
10447 + if (LDT_empty(&info)) {
10448 + desc->a = 0;
10449 + desc->b = 0;
10450 + } else {
10451 + desc->a = LDT_entry_a(&info);
10452 + desc->b = LDT_entry_b(&info);
10453 + }
10454 + load_TLS(t, cpu);
10455 +
10456 + put_cpu();
10457 +
10458 + return 0;
10459 +}
10460 +
10461 +/*
10462 + * Get the current Thread-Local Storage area:
10463 + */
10464 +
10465 +#define GET_BASE(desc) ( \
10466 + (((desc)->a >> 16) & 0x0000ffff) | \
10467 + (((desc)->b << 16) & 0x00ff0000) | \
10468 + ( (desc)->b & 0xff000000) )
10469 +
10470 +#define GET_LIMIT(desc) ( \
10471 + ((desc)->a & 0x0ffff) | \
10472 + ((desc)->b & 0xf0000) )
10473 +
10474 +#define GET_32BIT(desc) (((desc)->b >> 22) & 1)
10475 +#define GET_CONTENTS(desc) (((desc)->b >> 10) & 3)
10476 +#define GET_WRITABLE(desc) (((desc)->b >> 9) & 1)
10477 +#define GET_LIMIT_PAGES(desc) (((desc)->b >> 23) & 1)
10478 +#define GET_PRESENT(desc) (((desc)->b >> 15) & 1)
10479 +#define GET_USEABLE(desc) (((desc)->b >> 20) & 1)
10480 +
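Editorial aside, not part of the patch: the GET_* macros above recover the base, limit and attribute bits from the two 32-bit halves of a GDT descriptor. A standalone check that decodes the flat 4GB code descriptor 0x00cf9a000000ffff, the same value used for gdt_cs in the relocate_kernel.S hunk later in this patch:

/* Illustrative only: decode a flat 4GB code descriptor the same way the
 * GET_BASE()/GET_LIMIT() macros above do. */
#include <stdio.h>

int main(void)
{
	/* low and high 32-bit halves of the descriptor 0x00cf9a000000ffff */
	unsigned int a = 0x0000ffff, b = 0x00cf9a00;

	unsigned int base  = ((a >> 16) & 0x0000ffff) |
			     ((b << 16) & 0x00ff0000) |
			     ( b        & 0xff000000);
	unsigned int limit = (a & 0x0ffff) | (b & 0xf0000);

	printf("base 0x%08x, limit 0x%05x, 32bit %u, present %u\n",
	       base, limit, (b >> 22) & 1, (b >> 15) & 1);
	/* prints: base 0x00000000, limit 0xfffff, 32bit 1, present 1 */
	return 0;
}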
10481 +asmlinkage int sys_get_thread_area(struct user_desc __user *u_info)
10482 +{
10483 + struct user_desc info;
10484 + struct desc_struct *desc;
10485 + int idx;
10486 +
10487 + if (get_user(idx, &u_info->entry_number))
10488 + return -EFAULT;
10489 + if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
10490 + return -EINVAL;
10491 +
10492 + memset(&info, 0, sizeof(info));
10493 +
10494 + desc = current->thread.tls_array + idx - GDT_ENTRY_TLS_MIN;
10495 +
10496 + info.entry_number = idx;
10497 + info.base_addr = GET_BASE(desc);
10498 + info.limit = GET_LIMIT(desc);
10499 + info.seg_32bit = GET_32BIT(desc);
10500 + info.contents = GET_CONTENTS(desc);
10501 + info.read_exec_only = !GET_WRITABLE(desc);
10502 + info.limit_in_pages = GET_LIMIT_PAGES(desc);
10503 + info.seg_not_present = !GET_PRESENT(desc);
10504 + info.useable = GET_USEABLE(desc);
10505 +
10506 + if (copy_to_user(u_info, &info, sizeof(info)))
10507 + return -EFAULT;
10508 + return 0;
10509 +}
10510 +
10511 +unsigned long arch_align_stack(unsigned long sp)
10512 +{
10513 + if (randomize_va_space)
10514 + sp -= get_random_int() % 8192;
10515 + return sp & ~0xf;
10516 +}
10517 diff -Nur linux-2.6.16.33-noxen/arch/i386/kernel/quirks-xen.c linux-2.6.16.33/arch/i386/kernel/quirks-xen.c
10518 --- linux-2.6.16.33-noxen/arch/i386/kernel/quirks-xen.c 1970-01-01 00:00:00.000000000 +0000
10519 +++ linux-2.6.16.33/arch/i386/kernel/quirks-xen.c 2007-01-08 15:00:45.000000000 +0000
10520 @@ -0,0 +1,48 @@
10521 +/*
10522 + * This file contains work-arounds for x86 and x86_64 platform bugs.
10523 + */
10524 +#include <linux/config.h>
10525 +#include <linux/pci.h>
10526 +#include <linux/irq.h>
10527 +
10528 +#if defined(CONFIG_X86_IO_APIC) && (defined(CONFIG_SMP) || defined(CONFIG_XEN)) && defined(CONFIG_PCI)
10529 +
10530 +static void __devinit quirk_intel_irqbalance(struct pci_dev *dev)
10531 +{
10532 + u8 config, rev;
10533 + u32 word;
10534 +
10535 + /* BIOS may enable hardware IRQ balancing for
10536 + * E7520/E7320/E7525(revision ID 0x9 and below)
10537 + * based platforms.
10538 + * Disable SW irqbalance/affinity on those platforms.
10539 + */
10540 + pci_read_config_byte(dev, PCI_CLASS_REVISION, &rev);
10541 + if (rev > 0x9)
10542 + return;
10543 +
10544 + printk(KERN_INFO "Intel E7520/7320/7525 detected.");
10545 +
10546 + /* enable access to config space*/
10547 + pci_read_config_byte(dev, 0xf4, &config);
10548 + pci_write_config_byte(dev, 0xf4, config|0x2);
10549 +
10550 + /* read xTPR register */
10551 + raw_pci_ops->read(0, 0, 0x40, 0x4c, 2, &word);
10552 +
10553 + if (!(word & (1 << 13))) {
10554 + dom0_op_t op;
10555 + printk(KERN_INFO "Disabling irq balancing and affinity\n");
10556 + op.cmd = DOM0_PLATFORM_QUIRK;
10557 + op.u.platform_quirk.quirk_id = QUIRK_NOIRQBALANCING;
10558 + (void)HYPERVISOR_dom0_op(&op);
10559 + }
10560 +
10561 + /* put back the original value for config space*/
10562 + if (!(config & 0x2))
10563 + pci_write_config_byte(dev, 0xf4, config);
10564 +}
10565 +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7320_MCH, quirk_intel_irqbalance);
10566 +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7525_MCH, quirk_intel_irqbalance);
10567 +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7520_MCH, quirk_intel_irqbalance);
10568 +#endif
10569 diff -Nur linux-2.6.16.33-noxen/arch/i386/kernel/relocate_kernel.S linux-2.6.16.33/arch/i386/kernel/relocate_kernel.S
10570 --- linux-2.6.16.33-noxen/arch/i386/kernel/relocate_kernel.S 2006-11-22 18:06:31.000000000 +0000
10571 +++ linux-2.6.16.33/arch/i386/kernel/relocate_kernel.S 2007-05-23 21:00:01.000000000 +0000
10572 @@ -7,16 +7,138 @@
10573 */
10574
10575 #include <linux/linkage.h>
10576 +#include <asm/page.h>
10577 +#include <asm/kexec.h>
10578 +
10579 +/*
10580 + * Must be relocatable PIC code callable as a C function
10581 + */
10582 +
10583 +#define PTR(x) (x << 2)
10584 +#define PAGE_ALIGNED (1 << PAGE_SHIFT)
10585 +#define PAGE_ATTR 0x63 /* _PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY */
10586 +#define PAE_PGD_ATTR 0x01 /* _PAGE_PRESENT */
10587 +
10588 + .text
10589 + .align PAGE_ALIGNED
10590 + .globl relocate_kernel
10591 +relocate_kernel:
10592 + movl 8(%esp), %ebp /* list of pages */
10593 +
10594 +#ifdef CONFIG_X86_PAE
10595 + /* map the control page at its virtual address */
10596 +
10597 + movl PTR(VA_PGD)(%ebp), %edi
10598 + movl PTR(VA_CONTROL_PAGE)(%ebp), %eax
10599 + andl $0xc0000000, %eax
10600 + shrl $27, %eax
10601 + addl %edi, %eax
10602 +
10603 + movl PTR(PA_PMD_0)(%ebp), %edx
10604 + orl $PAE_PGD_ATTR, %edx
10605 + movl %edx, (%eax)
10606 +
10607 + movl PTR(VA_PMD_0)(%ebp), %edi
10608 + movl PTR(VA_CONTROL_PAGE)(%ebp), %eax
10609 + andl $0x3fe00000, %eax
10610 + shrl $18, %eax
10611 + addl %edi, %eax
10612 +
10613 + movl PTR(PA_PTE_0)(%ebp), %edx
10614 + orl $PAGE_ATTR, %edx
10615 + movl %edx, (%eax)
10616 +
10617 + movl PTR(VA_PTE_0)(%ebp), %edi
10618 + movl PTR(VA_CONTROL_PAGE)(%ebp), %eax
10619 + andl $0x001ff000, %eax
10620 + shrl $9, %eax
10621 + addl %edi, %eax
10622 +
10623 + movl PTR(PA_CONTROL_PAGE)(%ebp), %edx
10624 + orl $PAGE_ATTR, %edx
10625 + movl %edx, (%eax)
10626 +
10627 + /* identity map the control page at its physical address */
10628 +
10629 + movl PTR(VA_PGD)(%ebp), %edi
10630 + movl PTR(PA_CONTROL_PAGE)(%ebp), %eax
10631 + andl $0xc0000000, %eax
10632 + shrl $27, %eax
10633 + addl %edi, %eax
10634 +
10635 + movl PTR(PA_PMD_1)(%ebp), %edx
10636 + orl $PAE_PGD_ATTR, %edx
10637 + movl %edx, (%eax)
10638 +
10639 + movl PTR(VA_PMD_1)(%ebp), %edi
10640 + movl PTR(PA_CONTROL_PAGE)(%ebp), %eax
10641 + andl $0x3fe00000, %eax
10642 + shrl $18, %eax
10643 + addl %edi, %eax
10644 +
10645 + movl PTR(PA_PTE_1)(%ebp), %edx
10646 + orl $PAGE_ATTR, %edx
10647 + movl %edx, (%eax)
10648 +
10649 + movl PTR(VA_PTE_1)(%ebp), %edi
10650 + movl PTR(PA_CONTROL_PAGE)(%ebp), %eax
10651 + andl $0x001ff000, %eax
10652 + shrl $9, %eax
10653 + addl %edi, %eax
10654 +
10655 + movl PTR(PA_CONTROL_PAGE)(%ebp), %edx
10656 + orl $PAGE_ATTR, %edx
10657 + movl %edx, (%eax)
10658 +#else
10659 + /* map the control page at its virtual address */
10660 +
10661 + movl PTR(VA_PGD)(%ebp), %edi
10662 + movl PTR(VA_CONTROL_PAGE)(%ebp), %eax
10663 + andl $0xffc00000, %eax
10664 + shrl $20, %eax
10665 + addl %edi, %eax
10666 +
10667 + movl PTR(PA_PTE_0)(%ebp), %edx
10668 + orl $PAGE_ATTR, %edx
10669 + movl %edx, (%eax)
10670 +
10671 + movl PTR(VA_PTE_0)(%ebp), %edi
10672 + movl PTR(VA_CONTROL_PAGE)(%ebp), %eax
10673 + andl $0x003ff000, %eax
10674 + shrl $10, %eax
10675 + addl %edi, %eax
10676 +
10677 + movl PTR(PA_CONTROL_PAGE)(%ebp), %edx
10678 + orl $PAGE_ATTR, %edx
10679 + movl %edx, (%eax)
10680 +
10681 + /* identity map the control page at its physical address */
10682 +
10683 + movl PTR(VA_PGD)(%ebp), %edi
10684 + movl PTR(PA_CONTROL_PAGE)(%ebp), %eax
10685 + andl $0xffc00000, %eax
10686 + shrl $20, %eax
10687 + addl %edi, %eax
10688 +
10689 + movl PTR(PA_PTE_1)(%ebp), %edx
10690 + orl $PAGE_ATTR, %edx
10691 + movl %edx, (%eax)
10692 +
10693 + movl PTR(VA_PTE_1)(%ebp), %edi
10694 + movl PTR(PA_CONTROL_PAGE)(%ebp), %eax
10695 + andl $0x003ff000, %eax
10696 + shrl $10, %eax
10697 + addl %edi, %eax
10698 +
10699 + movl PTR(PA_CONTROL_PAGE)(%ebp), %edx
10700 + orl $PAGE_ATTR, %edx
10701 + movl %edx, (%eax)
10702 +#endif
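Editorial aside, not part of the patch: in the PAE branch above, each and/shift pair converts the control page's virtual address into a byte offset into the corresponding table, i.e. the PGD/PMD/PTE index scaled by the 8-byte entry size. A standalone check of that arithmetic with a hypothetical address:

/* Illustrative only: reproduce the PAE offset arithmetic used above.
 * Each and/shift pair yields index * 8, a byte offset into a table of
 * 64-bit entries. */
#include <stdio.h>

int main(void)
{
	unsigned long va = 0xc0101000UL;	/* hypothetical control-page VA */

	unsigned long pgd_off = (va & 0xc0000000UL) >> 27;	/* bits 31:30, x8 */
	unsigned long pmd_off = (va & 0x3fe00000UL) >> 18;	/* bits 29:21, x8 */
	unsigned long pte_off = (va & 0x001ff000UL) >> 9;	/* bits 20:12, x8 */

	printf("pgd+0x%lx  pmd+0x%lx  pte+0x%lx\n", pgd_off, pmd_off, pte_off);
	/* for 0xc0101000: pgd+0x18  pmd+0x0  pte+0x808 */
	return 0;
}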
10703
10704 - /*
10705 - * Must be relocatable PIC code callable as a C function, that once
10706 - * it starts can not use the previous processes stack.
10707 - */
10708 - .globl relocate_new_kernel
10709 relocate_new_kernel:
10710 /* read the arguments and say goodbye to the stack */
10711 movl 4(%esp), %ebx /* page_list */
10712 - movl 8(%esp), %ebp /* reboot_code_buffer */
10713 + movl 8(%esp), %ebp /* list of pages */
10714 movl 12(%esp), %edx /* start address */
10715 movl 16(%esp), %ecx /* cpu_has_pae */
10716
10717 @@ -24,11 +146,57 @@
10718 pushl $0
10719 popfl
10720
10721 - /* set a new stack at the bottom of our page... */
10722 - lea 4096(%ebp), %esp
10723 + /* get physical address of control page now */
10724 + /* this is impossible after page table switch */
10725 + movl PTR(PA_CONTROL_PAGE)(%ebp), %edi
10726
10727 - /* store the parameters back on the stack */
10728 - pushl %edx /* store the start address */
10729 + /* switch to new set of page tables */
10730 + movl PTR(PA_PGD)(%ebp), %eax
10731 + movl %eax, %cr3
10732 +
10733 + /* setup idt */
10734 + movl %edi, %eax
10735 + addl $(idt_48 - relocate_kernel), %eax
10736 + lidtl (%eax)
10737 +
10738 + /* setup gdt */
10739 + movl %edi, %eax
10740 + addl $(gdt - relocate_kernel), %eax
10741 + movl %edi, %esi
10742 + addl $((gdt_48 - relocate_kernel) + 2), %esi
10743 + movl %eax, (%esi)
10744 +
10745 + movl %edi, %eax
10746 + addl $(gdt_48 - relocate_kernel), %eax
10747 + lgdtl (%eax)
10748 +
10749 + /* setup data segment registers */
10750 + mov $(gdt_ds - gdt), %eax
10751 + mov %eax, %ds
10752 + mov %eax, %es
10753 + mov %eax, %fs
10754 + mov %eax, %gs
10755 + mov %eax, %ss
10756 +
10757 + /* setup a new stack at the end of the physical control page */
10758 + lea 4096(%edi), %esp
10759 +
10760 + /* load new code segment and jump to identity mapped page */
10761 + movl %edi, %esi
10762 + xorl %eax, %eax
10763 + pushl %eax
10764 + pushl %esi
10765 + pushl %eax
10766 + movl $(gdt_cs - gdt), %eax
10767 + pushl %eax
10768 + movl %edi, %eax
10769 + addl $(identity_mapped - relocate_kernel),%eax
10770 + pushl %eax
10771 + iretl
10772 +
10773 +identity_mapped:
10774 + /* store the start address on the stack */
10775 + pushl %edx
10776
10777 /* Set cr0 to a known state:
10778 * 31 0 == Paging disabled
10779 @@ -113,8 +281,20 @@
10780 xorl %edi, %edi
10781 xorl %ebp, %ebp
10782 ret
10783 -relocate_new_kernel_end:
10784
10785 - .globl relocate_new_kernel_size
10786 -relocate_new_kernel_size:
10787 - .long relocate_new_kernel_end - relocate_new_kernel
10788 + .align 16
10789 +gdt:
10790 + .quad 0x0000000000000000 /* NULL descriptor */
10791 +gdt_cs:
10792 + .quad 0x00cf9a000000ffff /* kernel 4GB code at 0x00000000 */
10793 +gdt_ds:
10794 + .quad 0x00cf92000000ffff /* kernel 4GB data at 0x00000000 */
10795 +gdt_end:
10796 +
10797 +gdt_48:
10798 + .word gdt_end - gdt - 1 /* limit */
10799 + .long 0 /* base - filled in by code above */
10800 +
10801 +idt_48:
10802 + .word 0 /* limit */
10803 + .long 0 /* base */
10804 diff -Nur linux-2.6.16.33-noxen/arch/i386/kernel/setup-xen.c linux-2.6.16.33/arch/i386/kernel/setup-xen.c
10805 --- linux-2.6.16.33-noxen/arch/i386/kernel/setup-xen.c 1970-01-01 00:00:00.000000000 +0000
10806 +++ linux-2.6.16.33/arch/i386/kernel/setup-xen.c 2007-01-08 15:00:45.000000000 +0000
10807 @@ -0,0 +1,1892 @@
10808 +/*
10809 + * linux/arch/i386/kernel/setup.c
10810 + *
10811 + * Copyright (C) 1995 Linus Torvalds
10812 + *
10813 + * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
10814 + *
10815 + * Memory region support
10816 + * David Parsons <orc@pell.chi.il.us>, July-August 1999
10817 + *
10818 + * Added E820 sanitization routine (removes overlapping memory regions);
10819 + * Brian Moyle <bmoyle@mvista.com>, February 2001
10820 + *
10821 + * Moved CPU detection code to cpu/${cpu}.c
10822 + * Patrick Mochel <mochel@osdl.org>, March 2002
10823 + *
10824 + * Provisions for empty E820 memory regions (reported by certain BIOSes).
10825 + * Alex Achenbach <xela@slit.de>, December 2002.
10826 + *
10827 + */
10828 +
10829 +/*
10830 + * This file handles the architecture-dependent parts of initialization
10831 + */
10832 +
10833 +#include <linux/config.h>
10834 +#include <linux/sched.h>
10835 +#include <linux/mm.h>
10836 +#include <linux/mmzone.h>
10837 +#include <linux/tty.h>
10838 +#include <linux/ioport.h>
10839 +#include <linux/acpi.h>
10840 +#include <linux/apm_bios.h>
10841 +#include <linux/initrd.h>
10842 +#include <linux/bootmem.h>
10843 +#include <linux/seq_file.h>
10844 +#include <linux/console.h>
10845 +#include <linux/mca.h>
10846 +#include <linux/root_dev.h>
10847 +#include <linux/highmem.h>
10848 +#include <linux/module.h>
10849 +#include <linux/efi.h>
10850 +#include <linux/init.h>
10851 +#include <linux/edd.h>
10852 +#include <linux/nodemask.h>
10853 +#include <linux/kernel.h>
10854 +#include <linux/percpu.h>
10855 +#include <linux/notifier.h>
10856 +#include <linux/kexec.h>
10857 +#include <linux/crash_dump.h>
10858 +#include <linux/dmi.h>
10859 +
10860 +#include <video/edid.h>
10861 +
10862 +#include <asm/apic.h>
10863 +#include <asm/e820.h>
10864 +#include <asm/mpspec.h>
10865 +#include <asm/setup.h>
10866 +#include <asm/arch_hooks.h>
10867 +#include <asm/sections.h>
10868 +#include <asm/io_apic.h>
10869 +#include <asm/ist.h>
10870 +#include <asm/io.h>
10871 +#include <asm/hypervisor.h>
10872 +#include <xen/interface/physdev.h>
10873 +#include <xen/interface/memory.h>
10874 +#include <xen/features.h>
10875 +#include <xen/xencons.h>
10876 +#include "setup_arch_pre.h"
10877 +#include <bios_ebda.h>
10878 +
10879 +#ifdef CONFIG_XEN
10880 +#include <xen/interface/kexec.h>
10881 +#endif
10882 +
10883 +/* Forward Declaration. */
10884 +void __init find_max_pfn(void);
10885 +
10886 +static int xen_panic_event(struct notifier_block *, unsigned long, void *);
10887 +static struct notifier_block xen_panic_block = {
10888 + xen_panic_event, NULL, 0 /* try to go last */
10889 +};
10890 +
10891 +extern char hypercall_page[PAGE_SIZE];
10892 +EXPORT_SYMBOL(hypercall_page);
10893 +
10894 +int disable_pse __devinitdata = 0;
10895 +
10896 +/*
10897 + * Machine setup..
10898 + */
10899 +
10900 +#ifdef CONFIG_EFI
10901 +int efi_enabled = 0;
10902 +EXPORT_SYMBOL(efi_enabled);
10903 +#endif
10904 +
10905 +/* cpu data as detected by the assembly code in head.S */
10906 +struct cpuinfo_x86 new_cpu_data __initdata = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
10907 +/* common cpu data for all cpus */
10908 +struct cpuinfo_x86 boot_cpu_data __read_mostly = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
10909 +EXPORT_SYMBOL(boot_cpu_data);
10910 +
10911 +unsigned long mmu_cr4_features;
10912 +
10913 +#ifdef CONFIG_ACPI
10914 + int acpi_disabled = 0;
10915 +#else
10916 + int acpi_disabled = 1;
10917 +#endif
10918 +EXPORT_SYMBOL(acpi_disabled);
10919 +
10920 +#ifdef CONFIG_ACPI
10921 +int __initdata acpi_force = 0;
10922 +extern acpi_interrupt_flags acpi_sci_flags;
10923 +#endif
10924 +
10925 +/* for MCA, but anyone else can use it if they want */
10926 +unsigned int machine_id;
10927 +#ifdef CONFIG_MCA
10928 +EXPORT_SYMBOL(machine_id);
10929 +#endif
10930 +unsigned int machine_submodel_id;
10931 +unsigned int BIOS_revision;
10932 +unsigned int mca_pentium_flag;
10933 +
10934 +/* For PCI or other memory-mapped resources */
10935 +unsigned long pci_mem_start = 0x10000000;
10936 +#ifdef CONFIG_PCI
10937 +EXPORT_SYMBOL(pci_mem_start);
10938 +#endif
10939 +
10940 +/* Boot loader ID as an integer, for the benefit of proc_dointvec */
10941 +int bootloader_type;
10942 +
10943 +/* user-defined highmem size */
10944 +static unsigned int highmem_pages = -1;
10945 +
10946 +/*
10947 + * Setup options
10948 + */
10949 +struct drive_info_struct { char dummy[32]; } drive_info;
10950 +#if defined(CONFIG_BLK_DEV_IDE) || defined(CONFIG_BLK_DEV_HD) || \
10951 + defined(CONFIG_BLK_DEV_IDE_MODULE) || defined(CONFIG_BLK_DEV_HD_MODULE)
10952 +EXPORT_SYMBOL(drive_info);
10953 +#endif
10954 +struct screen_info screen_info;
10955 +EXPORT_SYMBOL(screen_info);
10956 +struct apm_info apm_info;
10957 +EXPORT_SYMBOL(apm_info);
10958 +struct sys_desc_table_struct {
10959 + unsigned short length;
10960 + unsigned char table[0];
10961 +};
10962 +struct edid_info edid_info;
10963 +EXPORT_SYMBOL_GPL(edid_info);
10964 +struct ist_info ist_info;
10965 +#if defined(CONFIG_X86_SPEEDSTEP_SMI) || \
10966 + defined(CONFIG_X86_SPEEDSTEP_SMI_MODULE)
10967 +EXPORT_SYMBOL(ist_info);
10968 +#endif
10969 +struct e820map e820;
10970 +#ifdef CONFIG_XEN
10971 +struct e820map machine_e820;
10972 +#endif
10973 +
10974 +extern void early_cpu_init(void);
10975 +extern void generic_apic_probe(char *);
10976 +extern int root_mountflags;
10977 +
10978 +unsigned long saved_videomode;
10979 +
10980 +#define RAMDISK_IMAGE_START_MASK 0x07FF
10981 +#define RAMDISK_PROMPT_FLAG 0x8000
10982 +#define RAMDISK_LOAD_FLAG 0x4000
10983 +
10984 +static char command_line[COMMAND_LINE_SIZE];
10985 +
10986 +unsigned char __initdata boot_params[PARAM_SIZE];
10987 +
10988 +static struct resource data_resource = {
10989 + .name = "Kernel data",
10990 + .start = 0,
10991 + .end = 0,
10992 + .flags = IORESOURCE_BUSY | IORESOURCE_MEM
10993 +};
10994 +
10995 +static struct resource code_resource = {
10996 + .name = "Kernel code",
10997 + .start = 0,
10998 + .end = 0,
10999 + .flags = IORESOURCE_BUSY | IORESOURCE_MEM
11000 +};
11001 +
11002 +static struct resource system_rom_resource = {
11003 + .name = "System ROM",
11004 + .start = 0xf0000,
11005 + .end = 0xfffff,
11006 + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
11007 +};
11008 +
11009 +static struct resource extension_rom_resource = {
11010 + .name = "Extension ROM",
11011 + .start = 0xe0000,
11012 + .end = 0xeffff,
11013 + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
11014 +};
11015 +
11016 +static struct resource adapter_rom_resources[] = { {
11017 + .name = "Adapter ROM",
11018 + .start = 0xc8000,
11019 + .end = 0,
11020 + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
11021 +}, {
11022 + .name = "Adapter ROM",
11023 + .start = 0,
11024 + .end = 0,
11025 + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
11026 +}, {
11027 + .name = "Adapter ROM",
11028 + .start = 0,
11029 + .end = 0,
11030 + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
11031 +}, {
11032 + .name = "Adapter ROM",
11033 + .start = 0,
11034 + .end = 0,
11035 + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
11036 +}, {
11037 + .name = "Adapter ROM",
11038 + .start = 0,
11039 + .end = 0,
11040 + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
11041 +}, {
11042 + .name = "Adapter ROM",
11043 + .start = 0,
11044 + .end = 0,
11045 + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
11046 +} };
11047 +
11048 +#define ADAPTER_ROM_RESOURCES \
11049 + (sizeof adapter_rom_resources / sizeof adapter_rom_resources[0])
11050 +
11051 +static struct resource video_rom_resource = {
11052 + .name = "Video ROM",
11053 + .start = 0xc0000,
11054 + .end = 0xc7fff,
11055 + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
11056 +};
11057 +
11058 +static struct resource video_ram_resource = {
11059 + .name = "Video RAM area",
11060 + .start = 0xa0000,
11061 + .end = 0xbffff,
11062 + .flags = IORESOURCE_BUSY | IORESOURCE_MEM
11063 +};
11064 +
11065 +static struct resource standard_io_resources[] = { {
11066 + .name = "dma1",
11067 + .start = 0x0000,
11068 + .end = 0x001f,
11069 + .flags = IORESOURCE_BUSY | IORESOURCE_IO
11070 +}, {
11071 + .name = "pic1",
11072 + .start = 0x0020,
11073 + .end = 0x0021,
11074 + .flags = IORESOURCE_BUSY | IORESOURCE_IO
11075 +}, {
11076 + .name = "timer0",
11077 + .start = 0x0040,
11078 + .end = 0x0043,
11079 + .flags = IORESOURCE_BUSY | IORESOURCE_IO
11080 +}, {
11081 + .name = "timer1",
11082 + .start = 0x0050,
11083 + .end = 0x0053,
11084 + .flags = IORESOURCE_BUSY | IORESOURCE_IO
11085 +}, {
11086 + .name = "keyboard",
11087 + .start = 0x0060,
11088 + .end = 0x006f,
11089 + .flags = IORESOURCE_BUSY | IORESOURCE_IO
11090 +}, {
11091 + .name = "dma page reg",
11092 + .start = 0x0080,
11093 + .end = 0x008f,
11094 + .flags = IORESOURCE_BUSY | IORESOURCE_IO
11095 +}, {
11096 + .name = "pic2",
11097 + .start = 0x00a0,
11098 + .end = 0x00a1,
11099 + .flags = IORESOURCE_BUSY | IORESOURCE_IO
11100 +}, {
11101 + .name = "dma2",
11102 + .start = 0x00c0,
11103 + .end = 0x00df,
11104 + .flags = IORESOURCE_BUSY | IORESOURCE_IO
11105 +}, {
11106 + .name = "fpu",
11107 + .start = 0x00f0,
11108 + .end = 0x00ff,
11109 + .flags = IORESOURCE_BUSY | IORESOURCE_IO
11110 +} };
11111 +
11112 +#define STANDARD_IO_RESOURCES \
11113 + (sizeof standard_io_resources / sizeof standard_io_resources[0])
11114 +
11115 +#define romsignature(x) (*(unsigned short *)(x) == 0xaa55)
11116 +
11117 +static int __init romchecksum(unsigned char *rom, unsigned long length)
11118 +{
11119 + unsigned char *p, sum = 0;
11120 +
11121 + for (p = rom; p < rom + length; p++)
11122 + sum += *p;
11123 + return sum == 0;
11124 +}
11125 +
11126 +static void __init probe_roms(void)
11127 +{
11128 + unsigned long start, length, upper;
11129 + unsigned char *rom;
11130 + int i;
11131 +
11132 +#ifdef CONFIG_XEN
11133 + /* Nothing to do if not running in dom0. */
11134 + if (!is_initial_xendomain())
11135 + return;
11136 +#endif
11137 +
11138 + /* video rom */
11139 + upper = adapter_rom_resources[0].start;
11140 + for (start = video_rom_resource.start; start < upper; start += 2048) {
11141 + rom = isa_bus_to_virt(start);
11142 + if (!romsignature(rom))
11143 + continue;
11144 +
11145 + video_rom_resource.start = start;
11146 +
11147 + /* 0 < length <= 0x7f * 512, historically */
11148 + length = rom[2] * 512;
11149 +
11150 + /* if checksum okay, trust length byte */
11151 + if (length && romchecksum(rom, length))
11152 + video_rom_resource.end = start + length - 1;
11153 +
11154 + request_resource(&iomem_resource, &video_rom_resource);
11155 + break;
11156 + }
11157 +
11158 + start = (video_rom_resource.end + 1 + 2047) & ~2047UL;
11159 + if (start < upper)
11160 + start = upper;
11161 +
11162 + /* system rom */
11163 + request_resource(&iomem_resource, &system_rom_resource);
11164 + upper = system_rom_resource.start;
11165 +
11166 + /* check for extension rom (ignore length byte!) */
11167 + rom = isa_bus_to_virt(extension_rom_resource.start);
11168 + if (romsignature(rom)) {
11169 + length = extension_rom_resource.end - extension_rom_resource.start + 1;
11170 + if (romchecksum(rom, length)) {
11171 + request_resource(&iomem_resource, &extension_rom_resource);
11172 + upper = extension_rom_resource.start;
11173 + }
11174 + }
11175 +
11176 + /* check for adapter roms on 2k boundaries */
11177 + for (i = 0; i < ADAPTER_ROM_RESOURCES && start < upper; start += 2048) {
11178 + rom = isa_bus_to_virt(start);
11179 + if (!romsignature(rom))
11180 + continue;
11181 +
11182 + /* 0 < length <= 0x7f * 512, historically */
11183 + length = rom[2] * 512;
11184 +
11185 + /* but accept any length that fits if checksum okay */
11186 + if (!length || start + length > upper || !romchecksum(rom, length))
11187 + continue;
11188 +
11189 + adapter_rom_resources[i].start = start;
11190 + adapter_rom_resources[i].end = start + length - 1;
11191 + request_resource(&iomem_resource, &adapter_rom_resources[i]);
11192 +
11193 + start = adapter_rom_resources[i++].end & ~2047UL;
11194 + }
11195 +}
11196 +
11197 +/*
11198 + * Point at the empty zero page to start with. We map the real shared_info
11199 + * page as soon as fixmap is up and running.
11200 + */
11201 +shared_info_t *HYPERVISOR_shared_info = (shared_info_t *)empty_zero_page;
11202 +EXPORT_SYMBOL(HYPERVISOR_shared_info);
11203 +
11204 +unsigned long *phys_to_machine_mapping;
11205 +unsigned long *pfn_to_mfn_frame_list_list, *pfn_to_mfn_frame_list[16];
11206 +EXPORT_SYMBOL(phys_to_machine_mapping);
11207 +
11208 +/* Raw start-of-day parameters from the hypervisor. */
11209 +start_info_t *xen_start_info;
11210 +EXPORT_SYMBOL(xen_start_info);
11211 +
11212 +static void __init add_memory_region(unsigned long long start,
11213 + unsigned long long size, int type)
11214 +{
11215 + int x;
11216 +
11217 + if (!efi_enabled) {
11218 + x = e820.nr_map;
11219 +
11220 + if (x == E820MAX) {
11221 + printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
11222 + return;
11223 + }
11224 +
11225 + e820.map[x].addr = start;
11226 + e820.map[x].size = size;
11227 + e820.map[x].type = type;
11228 + e820.nr_map++;
11229 + }
11230 +} /* add_memory_region */
11231 +
11232 +static void __init limit_regions(unsigned long long size)
11233 +{
11234 + unsigned long long current_addr = 0;
11235 + int i;
11236 +
11237 + if (efi_enabled) {
11238 + efi_memory_desc_t *md;
11239 + void *p;
11240 +
11241 + for (p = memmap.map, i = 0; p < memmap.map_end;
11242 + p += memmap.desc_size, i++) {
11243 + md = p;
11244 + current_addr = md->phys_addr + (md->num_pages << 12);
11245 + if (md->type == EFI_CONVENTIONAL_MEMORY) {
11246 + if (current_addr >= size) {
11247 + md->num_pages -=
11248 + (((current_addr-size) + PAGE_SIZE-1) >> PAGE_SHIFT);
11249 + memmap.nr_map = i + 1;
11250 + return;
11251 + }
11252 + }
11253 + }
11254 + }
11255 + for (i = 0; i < e820.nr_map; i++) {
11256 + current_addr = e820.map[i].addr + e820.map[i].size;
11257 + if (current_addr < size)
11258 + continue;
11259 +
11260 + if (e820.map[i].type != E820_RAM)
11261 + continue;
11262 +
11263 + if (e820.map[i].addr >= size) {
11264 + /*
11265 + * This region starts past the end of the
11266 + * requested size, skip it completely.
11267 + */
11268 + e820.nr_map = i;
11269 + } else {
11270 + e820.nr_map = i + 1;
11271 + e820.map[i].size -= current_addr - size;
11272 + }
11273 + return;
11274 + }
11275 +#ifdef CONFIG_XEN
11276 + if (i==e820.nr_map && current_addr < size) {
11277 + /*
11278 + * The e820 map finished before our requested size so
11279 + * extend the final entry to the requested address.
11280 + */
11281 + --i;
11282 + if (e820.map[i].type == E820_RAM)
11283 + e820.map[i].size -= current_addr - size;
11284 + else
11285 + add_memory_region(current_addr, size - current_addr, E820_RAM);
11286 + }
11287 +#endif
11288 +}
11289 +
11290 +#define E820_DEBUG 1
11291 +
11292 +static void __init print_memory_map(char *who)
11293 +{
11294 + int i;
11295 +
11296 + for (i = 0; i < e820.nr_map; i++) {
11297 + printk(" %s: %016Lx - %016Lx ", who,
11298 + e820.map[i].addr,
11299 + e820.map[i].addr + e820.map[i].size);
11300 + switch (e820.map[i].type) {
11301 + case E820_RAM: printk("(usable)\n");
11302 + break;
11303 + case E820_RESERVED:
11304 + printk("(reserved)\n");
11305 + break;
11306 + case E820_ACPI:
11307 + printk("(ACPI data)\n");
11308 + break;
11309 + case E820_NVS:
11310 + printk("(ACPI NVS)\n");
11311 + break;
11312 + default: printk("type %lu\n", e820.map[i].type);
11313 + break;
11314 + }
11315 + }
11316 +}
11317 +
11318 +/*
11319 + * Sanitize the BIOS e820 map.
11320 + *
11321 + * Some e820 responses include overlapping entries. The following
11322 + * replaces the original e820 map with a new one, removing overlaps.
11323 + *
11324 + */
11325 +struct change_member {
11326 + struct e820entry *pbios; /* pointer to original bios entry */
11327 + unsigned long long addr; /* address for this change point */
11328 +};
11329 +static struct change_member change_point_list[2*E820MAX] __initdata;
11330 +static struct change_member *change_point[2*E820MAX] __initdata;
11331 +static struct e820entry *overlap_list[E820MAX] __initdata;
11332 +static struct e820entry new_bios[E820MAX] __initdata;
11333 +
11334 +static int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
11335 +{
11336 + struct change_member *change_tmp;
11337 + unsigned long current_type, last_type;
11338 + unsigned long long last_addr;
11339 + int chgidx, still_changing;
11340 + int overlap_entries;
11341 + int new_bios_entry;
11342 + int old_nr, new_nr, chg_nr;
11343 + int i;
11344 +
11345 + /*
11346 + Visually we're performing the following (1,2,3,4 = memory types)...
11347 +
11348 + Sample memory map (w/overlaps):
11349 + ____22__________________
11350 + ______________________4_
11351 + ____1111________________
11352 + _44_____________________
11353 + 11111111________________
11354 + ____________________33__
11355 + ___________44___________
11356 + __________33333_________
11357 + ______________22________
11358 + ___________________2222_
11359 + _________111111111______
11360 + _____________________11_
11361 + _________________4______
11362 +
11363 + Sanitized equivalent (no overlap):
11364 + 1_______________________
11365 + _44_____________________
11366 + ___1____________________
11367 + ____22__________________
11368 + ______11________________
11369 + _________1______________
11370 + __________3_____________
11371 + ___________44___________
11372 + _____________33_________
11373 + _______________2________
11374 + ________________1_______
11375 + _________________4______
11376 + ___________________2____
11377 + ____________________33__
11378 + ______________________4_
11379 + */
11380 +
11381 + /* if there's only one memory region, don't bother */
11382 + if (*pnr_map < 2)
11383 + return -1;
11384 +
11385 + old_nr = *pnr_map;
11386 +
11387 + /* bail out if we find any unreasonable addresses in bios map */
11388 + for (i=0; i<old_nr; i++)
11389 + if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr)
11390 + return -1;
11391 +
11392 + /* create pointers for initial change-point information (for sorting) */
11393 + for (i=0; i < 2*old_nr; i++)
11394 + change_point[i] = &change_point_list[i];
11395 +
11396 + /* record all known change-points (starting and ending addresses),
11397 + omitting those that are for empty memory regions */
11398 + chgidx = 0;
11399 + for (i=0; i < old_nr; i++) {
11400 + if (biosmap[i].size != 0) {
11401 + change_point[chgidx]->addr = biosmap[i].addr;
11402 + change_point[chgidx++]->pbios = &biosmap[i];
11403 + change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size;
11404 + change_point[chgidx++]->pbios = &biosmap[i];
11405 + }
11406 + }
11407 + chg_nr = chgidx; /* true number of change-points */
11408 +
11409 + /* sort change-point list by memory addresses (low -> high) */
11410 + still_changing = 1;
11411 + while (still_changing) {
11412 + still_changing = 0;
11413 + for (i=1; i < chg_nr; i++) {
11414 + /* if <current_addr> > <last_addr>, swap */
11415 + /* or, if current=<start_addr> & last=<end_addr>, swap */
11416 + if ((change_point[i]->addr < change_point[i-1]->addr) ||
11417 + ((change_point[i]->addr == change_point[i-1]->addr) &&
11418 + (change_point[i]->addr == change_point[i]->pbios->addr) &&
11419 + (change_point[i-1]->addr != change_point[i-1]->pbios->addr))
11420 + )
11421 + {
11422 + change_tmp = change_point[i];
11423 + change_point[i] = change_point[i-1];
11424 + change_point[i-1] = change_tmp;
11425 + still_changing=1;
11426 + }
11427 + }
11428 + }
11429 +
11430 + /* create a new bios memory map, removing overlaps */
11431 + overlap_entries=0; /* number of entries in the overlap table */
11432 + new_bios_entry=0; /* index for creating new bios map entries */
11433 + last_type = 0; /* start with undefined memory type */
11434 + last_addr = 0; /* start with 0 as last starting address */
11435 +	/* loop through change-points, determining effect on the new bios map */
11436 + for (chgidx=0; chgidx < chg_nr; chgidx++)
11437 + {
11438 + /* keep track of all overlapping bios entries */
11439 + if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr)
11440 + {
11441 + /* add map entry to overlap list (> 1 entry implies an overlap) */
11442 + overlap_list[overlap_entries++]=change_point[chgidx]->pbios;
11443 + }
11444 + else
11445 + {
11446 + /* remove entry from list (order independent, so swap with last) */
11447 + for (i=0; i<overlap_entries; i++)
11448 + {
11449 + if (overlap_list[i] == change_point[chgidx]->pbios)
11450 + overlap_list[i] = overlap_list[overlap_entries-1];
11451 + }
11452 + overlap_entries--;
11453 + }
11454 + /* if there are overlapping entries, decide which "type" to use */
11455 + /* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */
11456 + current_type = 0;
11457 + for (i=0; i<overlap_entries; i++)
11458 + if (overlap_list[i]->type > current_type)
11459 + current_type = overlap_list[i]->type;
11460 + /* continue building up new bios map based on this information */
11461 + if (current_type != last_type) {
11462 + if (last_type != 0) {
11463 + new_bios[new_bios_entry].size =
11464 + change_point[chgidx]->addr - last_addr;
11465 + /* move forward only if the new size was non-zero */
11466 + if (new_bios[new_bios_entry].size != 0)
11467 + if (++new_bios_entry >= E820MAX)
11468 + break; /* no more space left for new bios entries */
11469 + }
11470 + if (current_type != 0) {
11471 + new_bios[new_bios_entry].addr = change_point[chgidx]->addr;
11472 + new_bios[new_bios_entry].type = current_type;
11473 + last_addr=change_point[chgidx]->addr;
11474 + }
11475 + last_type = current_type;
11476 + }
11477 + }
11478 + new_nr = new_bios_entry; /* retain count for new bios entries */
11479 +
11480 + /* copy new bios mapping into original location */
11481 + memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry));
11482 + *pnr_map = new_nr;
11483 +
11484 + return 0;
11485 +}
11486 +
11487 +/*
11488 + * Copy the BIOS e820 map into a safe place.
11489 + *
11490 + * Sanity-check it while we're at it..
11491 + *
11492 + * If we're lucky and live on a modern system, the setup code
11493 + * will have given us a memory map that we can use to properly
11494 + * set up memory. If we aren't, we'll fake a memory map.
11495 + *
11496 + * We check to see that the memory map contains at least 2 elements
11497 + * before we'll use it, because the detection code in setup.S may
11498 + * not be perfect and most every PC known to man has two memory
11499 + * regions: one from 0 to 640k, and one from 1mb up. (The IBM
11500 + * thinkpad 560x, for example, does not cooperate with the memory
11501 + * detection code.)
11502 + */
11503 +static int __init copy_e820_map(struct e820entry * biosmap, int nr_map)
11504 +{
11505 +#ifndef CONFIG_XEN
11506 + /* Only one memory region (or negative)? Ignore it */
11507 + if (nr_map < 2)
11508 + return -1;
11509 +#else
11510 + BUG_ON(nr_map < 1);
11511 +#endif
11512 +
11513 + do {
11514 + unsigned long long start = biosmap->addr;
11515 + unsigned long long size = biosmap->size;
11516 + unsigned long long end = start + size;
11517 + unsigned long type = biosmap->type;
11518 +
11519 + /* Overflow in 64 bits? Ignore the memory map. */
11520 + if (start > end)
11521 + return -1;
11522 +
11523 +#ifndef CONFIG_XEN
11524 + /*
11525 + * Some BIOSes claim RAM in the 640k - 1M region.
11526 + * Not right. Fix it up.
11527 + */
11528 + if (type == E820_RAM) {
11529 + if (start < 0x100000ULL && end > 0xA0000ULL) {
11530 + if (start < 0xA0000ULL)
11531 + add_memory_region(start, 0xA0000ULL-start, type);
11532 + if (end <= 0x100000ULL)
11533 + continue;
11534 + start = 0x100000ULL;
11535 + size = end - start;
11536 + }
11537 + }
11538 +#endif
11539 + add_memory_region(start, size, type);
11540 + } while (biosmap++,--nr_map);
11541 + return 0;
11542 +}
11543 +
11544 +#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
11545 +struct edd edd;
11546 +#ifdef CONFIG_EDD_MODULE
11547 +EXPORT_SYMBOL(edd);
11548 +#endif
11549 +/**
11550 + * copy_edd() - Copy the BIOS EDD information
11551 + * from boot_params into a safe place.
11552 + *
11553 + */
11554 +static inline void copy_edd(void)
11555 +{
11556 + memcpy(edd.mbr_signature, EDD_MBR_SIGNATURE, sizeof(edd.mbr_signature));
11557 + memcpy(edd.edd_info, EDD_BUF, sizeof(edd.edd_info));
11558 + edd.mbr_signature_nr = EDD_MBR_SIG_NR;
11559 + edd.edd_info_nr = EDD_NR;
11560 +}
11561 +#else
11562 +static inline void copy_edd(void)
11563 +{
11564 +}
11565 +#endif
11566 +
11567 +/*
11568 + * Do NOT EVER look at the BIOS memory size location.
11569 + * It does not work on many machines.
11570 + */
11571 +#define LOWMEMSIZE() (0x9f000)
11572 +
11573 +static void __init parse_cmdline_early (char ** cmdline_p)
11574 +{
11575 + char c = ' ', *to = command_line, *from = saved_command_line;
11576 + int len = 0, max_cmdline;
11577 + int userdef = 0;
11578 +
11579 + if ((max_cmdline = MAX_GUEST_CMDLINE) > COMMAND_LINE_SIZE)
11580 + max_cmdline = COMMAND_LINE_SIZE;
11581 + memcpy(saved_command_line, xen_start_info->cmd_line, max_cmdline);
11582 + /* Save unparsed command line copy for /proc/cmdline */
11583 + saved_command_line[max_cmdline-1] = '\0';
11584 +
11585 + for (;;) {
11586 + if (c != ' ')
11587 + goto next_char;
11588 + /*
11589 + * "mem=nopentium" disables the 4MB page tables.
11590 + * "mem=XXX[kKmM]" defines a memory region from HIGH_MEM
11591 + * to <mem>, overriding the bios size.
11592 + * "memmap=XXX[KkmM]@XXX[KkmM]" defines a memory region from
11593 + * <start> to <start>+<mem>, overriding the bios size.
11594 + *
11595 + * HPA tells me bootloaders need to parse mem=, so no new
11596 + * option should be mem= [also see Documentation/i386/boot.txt]
11597 + */
11598 + if (!memcmp(from, "mem=", 4)) {
11599 + if (to != command_line)
11600 + to--;
11601 + if (!memcmp(from+4, "nopentium", 9)) {
11602 + from += 9+4;
11603 + clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability);
11604 + disable_pse = 1;
11605 + } else {
11606 + /* If the user specifies memory size, we
11607 + * limit the BIOS-provided memory map to
11608 + * that size. exactmap can be used to specify
11609 + * the exact map. mem=number can be used to
11610 + * trim the existing memory map.
11611 + */
11612 + unsigned long long mem_size;
11613 +
11614 + mem_size = memparse(from+4, &from);
11615 + limit_regions(mem_size);
11616 + userdef=1;
11617 + }
11618 + }
11619 +
11620 + else if (!memcmp(from, "memmap=", 7)) {
11621 + if (to != command_line)
11622 + to--;
11623 + if (!memcmp(from+7, "exactmap", 8)) {
11624 +#ifdef CONFIG_CRASH_DUMP
11625 + /* If we are doing a crash dump, we
11626 + * still need to know the real mem
11627 + * size before original memory map is
11628 + * reset.
11629 + */
11630 + find_max_pfn();
11631 + saved_max_pfn = max_pfn;
11632 +#endif
11633 + from += 8+7;
11634 + e820.nr_map = 0;
11635 + userdef = 1;
11636 + } else {
11637 + /* If the user specifies memory size, we
11638 + * limit the BIOS-provided memory map to
11639 + * that size. exactmap can be used to specify
11640 + * the exact map. mem=number can be used to
11641 + * trim the existing memory map.
11642 + */
11643 + unsigned long long start_at, mem_size;
11644 +
11645 + mem_size = memparse(from+7, &from);
11646 + if (*from == '@') {
11647 + start_at = memparse(from+1, &from);
11648 + add_memory_region(start_at, mem_size, E820_RAM);
11649 + } else if (*from == '#') {
11650 + start_at = memparse(from+1, &from);
11651 + add_memory_region(start_at, mem_size, E820_ACPI);
11652 + } else if (*from == '$') {
11653 + start_at = memparse(from+1, &from);
11654 + add_memory_region(start_at, mem_size, E820_RESERVED);
11655 + } else {
11656 + limit_regions(mem_size);
11657 + userdef=1;
11658 + }
11659 + }
11660 + }
11661 +
11662 + else if (!memcmp(from, "noexec=", 7))
11663 + noexec_setup(from + 7);
11664 +
11665 +
11666 +#ifdef CONFIG_X86_MPPARSE
11667 + /*
11668 + * If the BIOS enumerates physical processors before logical,
11669 + * maxcpus=N at enumeration-time can be used to disable HT.
11670 + */
11671 + else if (!memcmp(from, "maxcpus=", 8)) {
11672 + extern unsigned int maxcpus;
11673 +
11674 + maxcpus = simple_strtoul(from + 8, NULL, 0);
11675 + }
11676 +#endif
11677 +
11678 +#ifdef CONFIG_ACPI
11679 + /* "acpi=off" disables both ACPI table parsing and interpreter */
11680 + else if (!memcmp(from, "acpi=off", 8)) {
11681 + disable_acpi();
11682 + }
11683 +
11684 + /* acpi=force to over-ride black-list */
11685 + else if (!memcmp(from, "acpi=force", 10)) {
11686 + acpi_force = 1;
11687 + acpi_ht = 1;
11688 + acpi_disabled = 0;
11689 + }
11690 +
11691 + /* acpi=strict disables out-of-spec workarounds */
11692 + else if (!memcmp(from, "acpi=strict", 11)) {
11693 + acpi_strict = 1;
11694 + }
11695 +
11696 + /* Limit ACPI just to boot-time to enable HT */
11697 + else if (!memcmp(from, "acpi=ht", 7)) {
11698 + if (!acpi_force)
11699 + disable_acpi();
11700 + acpi_ht = 1;
11701 + }
11702 +
11703 + /* "pci=noacpi" disable ACPI IRQ routing and PCI scan */
11704 + else if (!memcmp(from, "pci=noacpi", 10)) {
11705 + acpi_disable_pci();
11706 + }
11707 + /* "acpi=noirq" disables ACPI interrupt routing */
11708 + else if (!memcmp(from, "acpi=noirq", 10)) {
11709 + acpi_noirq_set();
11710 + }
11711 +
11712 + else if (!memcmp(from, "acpi_sci=edge", 13))
11713 + acpi_sci_flags.trigger = 1;
11714 +
11715 + else if (!memcmp(from, "acpi_sci=level", 14))
11716 + acpi_sci_flags.trigger = 3;
11717 +
11718 + else if (!memcmp(from, "acpi_sci=high", 13))
11719 + acpi_sci_flags.polarity = 1;
11720 +
11721 + else if (!memcmp(from, "acpi_sci=low", 12))
11722 + acpi_sci_flags.polarity = 3;
11723 +
11724 +#ifdef CONFIG_X86_IO_APIC
11725 + else if (!memcmp(from, "acpi_skip_timer_override", 24))
11726 + acpi_skip_timer_override = 1;
11727 +
11728 + if (!memcmp(from, "disable_timer_pin_1", 19))
11729 + disable_timer_pin_1 = 1;
11730 + if (!memcmp(from, "enable_timer_pin_1", 18))
11731 + disable_timer_pin_1 = -1;
11732 +
11733 + /* disable IO-APIC */
11734 + else if (!memcmp(from, "noapic", 6))
11735 + disable_ioapic_setup();
11736 +#endif /* CONFIG_X86_IO_APIC */
11737 +#endif /* CONFIG_ACPI */
11738 +
11739 +#ifdef CONFIG_X86_LOCAL_APIC
11740 + /* enable local APIC */
11741 + else if (!memcmp(from, "lapic", 5))
11742 + lapic_enable();
11743 +
11744 + /* disable local APIC */
11745 + else if (!memcmp(from, "nolapic", 6))
11746 + lapic_disable();
11747 +#endif /* CONFIG_X86_LOCAL_APIC */
11748 +
11749 +#ifdef CONFIG_KEXEC
11750 + /* crashkernel=size@addr specifies the location to reserve for
11751 + * a crash kernel. By reserving this memory we guarantee
11752 + * that linux never set's it up as a DMA target.
11753 + * Useful for holding code to do something appropriate
11754 + * after a kernel panic.
11755 + */
11756 + else if (!memcmp(from, "crashkernel=", 12)) {
11757 +#ifndef CONFIG_XEN
11758 + unsigned long size, base;
11759 + size = memparse(from+12, &from);
11760 + if (*from == '@') {
11761 + base = memparse(from+1, &from);
11762 + /* FIXME: Do I want a sanity check
11763 + * to validate the memory range?
11764 + */
11765 + crashk_res.start = base;
11766 + crashk_res.end = base + size - 1;
11767 + }
11768 +#else
11769 + printk("Ignoring crashkernel command line, "
11770 + "parameter will be supplied by xen\n");
11771 +#endif
11772 + }
11773 +#endif
11774 +#ifdef CONFIG_PROC_VMCORE
11775 + /* elfcorehdr= specifies the location of elf core header
11776 + * stored by the crashed kernel.
11777 + */
11778 + else if (!memcmp(from, "elfcorehdr=", 11))
11779 + elfcorehdr_addr = memparse(from+11, &from);
11780 +#endif
11781 +
11782 + /*
11783 + * highmem=size forces highmem to be exactly 'size' bytes.
11784 + * This works even on boxes that have no highmem otherwise.
11785 + * This also works to reduce highmem size on bigger boxes.
11786 + */
11787 + else if (!memcmp(from, "highmem=", 8))
11788 + highmem_pages = memparse(from+8, &from) >> PAGE_SHIFT;
11789 +
11790 + /*
11791 + * vmalloc=size forces the vmalloc area to be exactly 'size'
11792 + * bytes. This can be used to increase (or decrease) the
11793 + * vmalloc area - the default is 128m.
11794 + */
11795 + else if (!memcmp(from, "vmalloc=", 8))
11796 + __VMALLOC_RESERVE = memparse(from+8, &from);
11797 +
11798 + next_char:
11799 + c = *(from++);
11800 + if (!c)
11801 + break;
11802 + if (COMMAND_LINE_SIZE <= ++len)
11803 + break;
11804 + *(to++) = c;
11805 + }
11806 + *to = '\0';
11807 + *cmdline_p = command_line;
11808 + if (userdef) {
11809 + printk(KERN_INFO "user-defined physical RAM map:\n");
11810 + print_memory_map("user");
11811 + }
11812 +}
11813 +
11814 +/*
11815 + * Callback for efi_memory_walk.
11816 + */
11817 +static int __init
11818 +efi_find_max_pfn(unsigned long start, unsigned long end, void *arg)
11819 +{
11820 + unsigned long *max_pfn = arg, pfn;
11821 +
11822 + if (start < end) {
11823 + pfn = PFN_UP(end -1);
11824 + if (pfn > *max_pfn)
11825 + *max_pfn = pfn;
11826 + }
11827 + return 0;
11828 +}
11829 +
11830 +static int __init
11831 +efi_memory_present_wrapper(unsigned long start, unsigned long end, void *arg)
11832 +{
11833 + memory_present(0, start, end);
11834 + return 0;
11835 +}
11836 +
11837 +/*
11838 + * Find the highest page frame number we have available
11839 + */
11840 +void __init find_max_pfn(void)
11841 +{
11842 + int i;
11843 +
11844 + max_pfn = 0;
11845 + if (efi_enabled) {
11846 + efi_memmap_walk(efi_find_max_pfn, &max_pfn);
11847 + efi_memmap_walk(efi_memory_present_wrapper, NULL);
11848 + return;
11849 + }
11850 +
11851 + for (i = 0; i < e820.nr_map; i++) {
11852 + unsigned long start, end;
11853 + /* RAM? */
11854 + if (e820.map[i].type != E820_RAM)
11855 + continue;
11856 + start = PFN_UP(e820.map[i].addr);
11857 + end = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
11858 + if (start >= end)
11859 + continue;
11860 + if (end > max_pfn)
11861 + max_pfn = end;
11862 + memory_present(0, start, end);
11863 + }
11864 +}
11865 +
11866 +/*
11867 + * Determine low and high memory ranges:
11868 + */
11869 +unsigned long __init find_max_low_pfn(void)
11870 +{
11871 + unsigned long max_low_pfn;
11872 +
11873 + max_low_pfn = max_pfn;
11874 + if (max_low_pfn > MAXMEM_PFN) {
11875 + if (highmem_pages == -1)
11876 + highmem_pages = max_pfn - MAXMEM_PFN;
11877 + if (highmem_pages + MAXMEM_PFN < max_pfn)
11878 + max_pfn = MAXMEM_PFN + highmem_pages;
11879 + if (highmem_pages + MAXMEM_PFN > max_pfn) {
11880 + printk("only %luMB highmem pages available, ignoring highmem size of %uMB.\n", pages_to_mb(max_pfn - MAXMEM_PFN), pages_to_mb(highmem_pages));
11881 + highmem_pages = 0;
11882 + }
11883 + max_low_pfn = MAXMEM_PFN;
11884 +#ifndef CONFIG_HIGHMEM
11885 + /* Maximum memory usable is what is directly addressable */
11886 + printk(KERN_WARNING "Warning only %ldMB will be used.\n",
11887 + MAXMEM>>20);
11888 + if (max_pfn > MAX_NONPAE_PFN)
11889 + printk(KERN_WARNING "Use a PAE enabled kernel.\n");
11890 + else
11891 + printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n");
11892 + max_pfn = MAXMEM_PFN;
11893 +#else /* !CONFIG_HIGHMEM */
11894 +#ifndef CONFIG_X86_PAE
11895 + if (max_pfn > MAX_NONPAE_PFN) {
11896 + max_pfn = MAX_NONPAE_PFN;
11897 + printk(KERN_WARNING "Warning only 4GB will be used.\n");
11898 + printk(KERN_WARNING "Use a PAE enabled kernel.\n");
11899 + }
11900 +#endif /* !CONFIG_X86_PAE */
11901 +#endif /* !CONFIG_HIGHMEM */
11902 + } else {
11903 + if (highmem_pages == -1)
11904 + highmem_pages = 0;
11905 +#ifdef CONFIG_HIGHMEM
11906 + if (highmem_pages >= max_pfn) {
11907 + printk(KERN_ERR "highmem size specified (%uMB) is bigger than pages available (%luMB)!.\n", pages_to_mb(highmem_pages), pages_to_mb(max_pfn));
11908 + highmem_pages = 0;
11909 + }
11910 + if (highmem_pages) {
11911 + if (max_low_pfn-highmem_pages < 64*1024*1024/PAGE_SIZE){
11912 + printk(KERN_ERR "highmem size %uMB results in smaller than 64MB lowmem, ignoring it.\n", pages_to_mb(highmem_pages));
11913 + highmem_pages = 0;
11914 + }
11915 + max_low_pfn -= highmem_pages;
11916 + }
11917 +#else
11918 + if (highmem_pages)
11919 + printk(KERN_ERR "ignoring highmem size on non-highmem kernel!\n");
11920 +#endif
11921 + }
11922 + return max_low_pfn;
11923 +}
11924 +
11925 +/*
11926 + * Free all available memory for boot time allocation. Used
11927 + * as a callback function by efi_memory_walk()
11928 + */
11929 +
11930 +static int __init
11931 +free_available_memory(unsigned long start, unsigned long end, void *arg)
11932 +{
11933 + /* check max_low_pfn */
11934 + if (start >= ((max_low_pfn + 1) << PAGE_SHIFT))
11935 + return 0;
11936 + if (end >= ((max_low_pfn + 1) << PAGE_SHIFT))
11937 + end = (max_low_pfn + 1) << PAGE_SHIFT;
11938 + if (start < end)
11939 + free_bootmem(start, end - start);
11940 +
11941 + return 0;
11942 +}
11943 +/*
11944 + * Register fully available low RAM pages with the bootmem allocator.
11945 + */
11946 +static void __init register_bootmem_low_pages(unsigned long max_low_pfn)
11947 +{
11948 + int i;
11949 +
11950 + if (efi_enabled) {
11951 + efi_memmap_walk(free_available_memory, NULL);
11952 + return;
11953 + }
11954 + for (i = 0; i < e820.nr_map; i++) {
11955 + unsigned long curr_pfn, last_pfn, size;
11956 + /*
11957 + * Reserve usable low memory
11958 + */
11959 + if (e820.map[i].type != E820_RAM)
11960 + continue;
11961 + /*
11962 + * We are rounding up the start address of usable memory:
11963 + */
11964 + curr_pfn = PFN_UP(e820.map[i].addr);
11965 + if (curr_pfn >= max_low_pfn)
11966 + continue;
11967 + /*
11968 + * ... and at the end of the usable range downwards:
11969 + */
11970 + last_pfn = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
11971 +
11972 +#ifdef CONFIG_XEN
11973 + /*
11974 + * Truncate to the number of actual pages currently
11975 + * present.
11976 + */
11977 + if (last_pfn > xen_start_info->nr_pages)
11978 + last_pfn = xen_start_info->nr_pages;
11979 +#endif
11980 +
11981 + if (last_pfn > max_low_pfn)
11982 + last_pfn = max_low_pfn;
11983 +
11984 + /*
11985 + * .. finally, did all the rounding and playing
11986 + * around just make the area go away?
11987 + */
11988 + if (last_pfn <= curr_pfn)
11989 + continue;
11990 +
11991 + size = last_pfn - curr_pfn;
11992 + free_bootmem(PFN_PHYS(curr_pfn), PFN_PHYS(size));
11993 + }
11994 +}
11995 +
11996 +#ifndef CONFIG_XEN
11997 +/*
11998 + * workaround for Dell systems that neglect to reserve EBDA
11999 + */
12000 +static void __init reserve_ebda_region(void)
12001 +{
12002 + unsigned int addr;
12003 + addr = get_bios_ebda();
12004 + if (addr)
12005 + reserve_bootmem(addr, PAGE_SIZE);
12006 +}
12007 +#endif
12008 +
12009 +#ifndef CONFIG_NEED_MULTIPLE_NODES
12010 +void __init setup_bootmem_allocator(void);
12011 +static unsigned long __init setup_memory(void)
12012 +{
12013 + /*
12014 + * partially used pages are not usable - thus
12015 + * we are rounding upwards:
12016 + */
12017 + min_low_pfn = PFN_UP(__pa(xen_start_info->pt_base)) +
12018 + xen_start_info->nr_pt_frames;
12019 +
12020 + find_max_pfn();
12021 +
12022 + max_low_pfn = find_max_low_pfn();
12023 +
12024 +#ifdef CONFIG_HIGHMEM
12025 + highstart_pfn = highend_pfn = max_pfn;
12026 + if (max_pfn > max_low_pfn) {
12027 + highstart_pfn = max_low_pfn;
12028 + }
12029 + printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
12030 + pages_to_mb(highend_pfn - highstart_pfn));
12031 +#endif
12032 + printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
12033 + pages_to_mb(max_low_pfn));
12034 +
12035 + setup_bootmem_allocator();
12036 +
12037 + return max_low_pfn;
12038 +}
12039 +
12040 +void __init zone_sizes_init(void)
12041 +{
12042 + unsigned long zones_size[MAX_NR_ZONES] = {0, 0, 0};
12043 + unsigned int max_dma, low;
12044 +
12045 + /*
12046 + * XEN: Our notion of "DMA memory" is fake when running over Xen.
12047 + * We simply put all RAM in the DMA zone so that those drivers which
12048 + * needlessly specify GFP_DMA do not get starved of RAM unnecessarily.
12049 + * Those drivers that *do* require lowmem are screwed anyway when
12050 + * running over Xen!
12051 + */
12052 + max_dma = max_low_pfn;
12053 + low = max_low_pfn;
12054 +
12055 + if (low < max_dma)
12056 + zones_size[ZONE_DMA] = low;
12057 + else {
12058 + zones_size[ZONE_DMA] = max_dma;
12059 + zones_size[ZONE_NORMAL] = low - max_dma;
12060 +#ifdef CONFIG_HIGHMEM
12061 + zones_size[ZONE_HIGHMEM] = highend_pfn - low;
12062 +#endif
12063 + }
12064 + free_area_init(zones_size);
12065 +}
12066 +#else
12067 +extern unsigned long __init setup_memory(void);
12068 +extern void zone_sizes_init(void);
12069 +#endif /* !CONFIG_NEED_MULTIPLE_NODES */
12070 +
12071 +void __init setup_bootmem_allocator(void)
12072 +{
12073 + unsigned long bootmap_size;
12074 + /*
12075 + * Initialize the boot-time allocator (with low memory only):
12076 + */
12077 + bootmap_size = init_bootmem(min_low_pfn, max_low_pfn);
12078 +
12079 + register_bootmem_low_pages(max_low_pfn);
12080 +
12081 + /*
12082 + * Reserve the bootmem bitmap itself as well. We do this in two
12083 + * steps (first step was init_bootmem()) because this catches
12084 + * the (very unlikely) case of us accidentally initializing the
12085 + * bootmem allocator with an invalid RAM area.
12086 + */
12087 + reserve_bootmem(__PHYSICAL_START, (PFN_PHYS(min_low_pfn) +
12088 + bootmap_size + PAGE_SIZE-1) - (__PHYSICAL_START));
12089 +
12090 +#ifndef CONFIG_XEN
12091 + /*
12092 + * reserve physical page 0 - it's a special BIOS page on many boxes,
12093 + * enabling clean reboots, SMP operation, laptop functions.
12094 + */
12095 + reserve_bootmem(0, PAGE_SIZE);
12096 +
12097 + /* reserve EBDA region, it's a 4K region */
12098 + reserve_ebda_region();
12099 +
12100 + /* could be an AMD 768MPX chipset. Reserve a page before VGA to prevent
12101 +	   PCI prefetch into it (errata #56). Usually the page is reserved anyway,
12102 + unless you have no PS/2 mouse plugged in. */
12103 + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
12104 + boot_cpu_data.x86 == 6)
12105 + reserve_bootmem(0xa0000 - 4096, 4096);
12106 +
12107 +#ifdef CONFIG_SMP
12108 + /*
12109 + * But first pinch a few for the stack/trampoline stuff
12110 + * FIXME: Don't need the extra page at 4K, but need to fix
12111 + * trampoline before removing it. (see the GDT stuff)
12112 + */
12113 + reserve_bootmem(PAGE_SIZE, PAGE_SIZE);
12114 +#endif
12115 +#ifdef CONFIG_ACPI_SLEEP
12116 + /*
12117 + * Reserve low memory region for sleep support.
12118 + */
12119 + acpi_reserve_bootmem();
12120 +#endif
12121 +#endif /* !CONFIG_XEN */
12122 +
12123 +#ifdef CONFIG_BLK_DEV_INITRD
12124 + if (xen_start_info->mod_start) {
12125 + if (INITRD_START + INITRD_SIZE <= (max_low_pfn << PAGE_SHIFT)) {
12126 + /*reserve_bootmem(INITRD_START, INITRD_SIZE);*/
12127 + initrd_start = INITRD_START + PAGE_OFFSET;
12128 + initrd_end = initrd_start+INITRD_SIZE;
12129 + initrd_below_start_ok = 1;
12130 + }
12131 + else {
12132 + printk(KERN_ERR "initrd extends beyond end of memory "
12133 + "(0x%08lx > 0x%08lx)\ndisabling initrd\n",
12134 + INITRD_START + INITRD_SIZE,
12135 + max_low_pfn << PAGE_SHIFT);
12136 + initrd_start = 0;
12137 + }
12138 + }
12139 +#endif
12140 +#ifdef CONFIG_KEXEC
12141 +#ifdef CONFIG_XEN
12142 + xen_machine_kexec_setup_resources();
12143 +#else
12144 + if (crashk_res.start != crashk_res.end)
12145 + reserve_bootmem(crashk_res.start,
12146 + crashk_res.end - crashk_res.start + 1);
12147 +#endif
12148 +#endif
12149 +
12150 + if (!xen_feature(XENFEAT_auto_translated_physmap))
12151 + phys_to_machine_mapping =
12152 + (unsigned long *)xen_start_info->mfn_list;
12153 +}
12154 +
12155 +/*
12156 + * The node 0 pgdat is initialized before all of these because
12157 + * it's needed for bootmem. node>0 pgdats have their virtual
12158 + * space allocated before the pagetables are in place to access
12159 + * them, so they can't be cleared then.
12160 + *
12161 + * This should all compile down to nothing when NUMA is off.
12162 + */
12163 +void __init remapped_pgdat_init(void)
12164 +{
12165 + int nid;
12166 +
12167 + for_each_online_node(nid) {
12168 + if (nid != 0)
12169 + memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
12170 + }
12171 +}
12172 +
12173 +/*
12174 + * Request address space for all standard RAM and ROM resources
12175 + * and also for regions reported as reserved by the e820.
12176 + */
12177 +static void __init
12178 +legacy_init_iomem_resources(struct e820entry *e820, int nr_map,
12179 + struct resource *code_resource,
12180 + struct resource *data_resource)
12181 +{
12182 + int i;
12183 +
12184 + probe_roms();
12185 +
12186 + for (i = 0; i < nr_map; i++) {
12187 + struct resource *res;
12188 + if (e820[i].addr + e820[i].size > 0x100000000ULL)
12189 + continue;
12190 + res = alloc_bootmem_low(sizeof(struct resource));
12191 + switch (e820[i].type) {
12192 + case E820_RAM: res->name = "System RAM"; break;
12193 + case E820_ACPI: res->name = "ACPI Tables"; break;
12194 + case E820_NVS: res->name = "ACPI Non-volatile Storage"; break;
12195 + default: res->name = "reserved";
12196 + }
12197 + res->start = e820[i].addr;
12198 + res->end = res->start + e820[i].size - 1;
12199 + res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
12200 + request_resource(&iomem_resource, res);
12201 + if (e820[i].type == E820_RAM) {
12202 + /*
12203 + * We don't know which RAM region contains kernel data,
12204 + * so we try it repeatedly and let the resource manager
12205 + * test it.
12206 + */
12207 +#ifndef CONFIG_XEN
12208 + request_resource(res, code_resource);
12209 + request_resource(res, data_resource);
12210 +#endif
12211 +#ifdef CONFIG_KEXEC
12212 + if (crashk_res.start != crashk_res.end)
12213 + request_resource(res, &crashk_res);
12214 +#ifdef CONFIG_XEN
12215 + xen_machine_kexec_register_resources(res);
12216 +#endif
12217 +#endif
12218 + }
12219 + }
12220 +}
12221 +
12222 +/*
12223 + * Locate an unused range of the physical address space below 4G which
12224 + * can be used for PCI mappings.
12225 + */
12226 +static void __init
12227 +e820_setup_gap(struct e820entry *e820, int nr_map)
12228 +{
12229 + unsigned long gapstart, gapsize, round;
12230 + unsigned long long last;
12231 + int i;
12232 +
12233 + /*
12234 +	 * Search for the biggest gap in the low 32 bits of the e820
12235 + * memory space.
12236 + */
12237 + last = 0x100000000ull;
12238 + gapstart = 0x10000000;
12239 + gapsize = 0x400000;
12240 + i = nr_map;
12241 + while (--i >= 0) {
12242 + unsigned long long start = e820[i].addr;
12243 + unsigned long long end = start + e820[i].size;
12244 +
12245 + /*
12246 + * Since "last" is at most 4GB, we know we'll
12247 + * fit in 32 bits if this condition is true
12248 + */
12249 + if (last > end) {
12250 + unsigned long gap = last - end;
12251 +
12252 + if (gap > gapsize) {
12253 + gapsize = gap;
12254 + gapstart = end;
12255 + }
12256 + }
12257 + if (start < last)
12258 + last = start;
12259 + }
12260 +
12261 + /*
12262 + * See how much we want to round up: start off with
12263 + * rounding to the next 1MB area.
12264 + */
12265 + round = 0x100000;
12266 + while ((gapsize >> 4) > round)
12267 + round += round;
12268 + /* Fun with two's complement */
12269 + pci_mem_start = (gapstart + round) & -round;
12270 +
12271 + printk("Allocating PCI resources starting at %08lx (gap: %08lx:%08lx)\n",
12272 + pci_mem_start, gapstart, gapsize);
12273 +}
12274 +
12275 +/*
12276 + * Request address space for all standard resources
12277 + */
12278 +static void __init register_memory(void)
12279 +{
12280 +#ifdef CONFIG_XEN
12281 + struct xen_memory_map memmap;
12282 +#endif
12283 + int i;
12284 +
12285 + /* Nothing to do if not running in dom0. */
12286 + if (!is_initial_xendomain())
12287 + return;
12288 +
12289 +#ifdef CONFIG_XEN
12290 + memmap.nr_entries = E820MAX;
12291 + set_xen_guest_handle(memmap.buffer, machine_e820.map);
12292 +
12293 + if (HYPERVISOR_memory_op(XENMEM_machine_memory_map, &memmap))
12294 + BUG();
12295 + machine_e820.nr_map = memmap.nr_entries;
12296 +
12297 + legacy_init_iomem_resources(machine_e820.map, machine_e820.nr_map,
12298 + &code_resource, &data_resource);
12299 +#else
12300 + if (efi_enabled)
12301 + efi_initialize_iomem_resources(&code_resource, &data_resource);
12302 + else
12303 + legacy_init_iomem_resources(e820.map, e820.nr_map,
12304 + &code_resource, &data_resource);
12305 +#endif
12306 +
12307 + /* EFI systems may still have VGA */
12308 + request_resource(&iomem_resource, &video_ram_resource);
12309 +
12310 + /* request I/O space for devices used on all i[345]86 PCs */
12311 + for (i = 0; i < STANDARD_IO_RESOURCES; i++)
12312 + request_resource(&ioport_resource, &standard_io_resources[i]);
12313 +
12314 +#ifdef CONFIG_XEN
12315 + e820_setup_gap(machine_e820.map, machine_e820.nr_map);
12316 +#else
12317 + e820_setup_gap(e820.map, e820.nr_map);
12318 +#endif
12319 +}
12320 +
12321 +/* Use inline assembly to define this because the nops are defined
12322 + as inline assembly strings in the include files and we cannot
12323 + get them easily into strings. */
12324 +asm("\t.data\nintelnops: "
12325 + GENERIC_NOP1 GENERIC_NOP2 GENERIC_NOP3 GENERIC_NOP4 GENERIC_NOP5 GENERIC_NOP6
12326 + GENERIC_NOP7 GENERIC_NOP8);
12327 +asm("\t.data\nk8nops: "
12328 + K8_NOP1 K8_NOP2 K8_NOP3 K8_NOP4 K8_NOP5 K8_NOP6
12329 + K8_NOP7 K8_NOP8);
12330 +asm("\t.data\nk7nops: "
12331 + K7_NOP1 K7_NOP2 K7_NOP3 K7_NOP4 K7_NOP5 K7_NOP6
12332 + K7_NOP7 K7_NOP8);
12333 +
12334 +extern unsigned char intelnops[], k8nops[], k7nops[];
12335 +static unsigned char *intel_nops[ASM_NOP_MAX+1] = {
12336 + NULL,
12337 + intelnops,
12338 + intelnops + 1,
12339 + intelnops + 1 + 2,
12340 + intelnops + 1 + 2 + 3,
12341 + intelnops + 1 + 2 + 3 + 4,
12342 + intelnops + 1 + 2 + 3 + 4 + 5,
12343 + intelnops + 1 + 2 + 3 + 4 + 5 + 6,
12344 + intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
12345 +};
12346 +static unsigned char *k8_nops[ASM_NOP_MAX+1] = {
12347 + NULL,
12348 + k8nops,
12349 + k8nops + 1,
12350 + k8nops + 1 + 2,
12351 + k8nops + 1 + 2 + 3,
12352 + k8nops + 1 + 2 + 3 + 4,
12353 + k8nops + 1 + 2 + 3 + 4 + 5,
12354 + k8nops + 1 + 2 + 3 + 4 + 5 + 6,
12355 + k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
12356 +};
12357 +static unsigned char *k7_nops[ASM_NOP_MAX+1] = {
12358 + NULL,
12359 + k7nops,
12360 + k7nops + 1,
12361 + k7nops + 1 + 2,
12362 + k7nops + 1 + 2 + 3,
12363 + k7nops + 1 + 2 + 3 + 4,
12364 + k7nops + 1 + 2 + 3 + 4 + 5,
12365 + k7nops + 1 + 2 + 3 + 4 + 5 + 6,
12366 + k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
12367 +};
12368 +static struct nop {
12369 + int cpuid;
12370 + unsigned char **noptable;
12371 +} noptypes[] = {
12372 + { X86_FEATURE_K8, k8_nops },
12373 + { X86_FEATURE_K7, k7_nops },
12374 + { -1, NULL }
12375 +};
12376 +
12377 +/* Replace instructions with better alternatives for this CPU type.
12378 +
12379 + This runs before SMP is initialized to avoid SMP problems with
12380 +   self-modifying code. This implies that asymmetric systems where
12381 +   APs have fewer capabilities than the boot processor are not handled.
12382 + Tough. Make sure you disable such features by hand. */
12383 +void apply_alternatives(void *start, void *end)
12384 +{
12385 + struct alt_instr *a;
12386 + int diff, i, k;
12387 + unsigned char **noptable = intel_nops;
12388 + for (i = 0; noptypes[i].cpuid >= 0; i++) {
12389 + if (boot_cpu_has(noptypes[i].cpuid)) {
12390 + noptable = noptypes[i].noptable;
12391 + break;
12392 + }
12393 + }
12394 + for (a = start; (void *)a < end; a++) {
12395 + if (!boot_cpu_has(a->cpuid))
12396 + continue;
12397 + BUG_ON(a->replacementlen > a->instrlen);
12398 + memcpy(a->instr, a->replacement, a->replacementlen);
12399 + diff = a->instrlen - a->replacementlen;
12400 + /* Pad the rest with nops */
12401 + for (i = a->replacementlen; diff > 0; diff -= k, i += k) {
12402 + k = diff;
12403 + if (k > ASM_NOP_MAX)
12404 + k = ASM_NOP_MAX;
12405 + memcpy(a->instr + i, noptable[k], k);
12406 + }
12407 + }
12408 +}
12409 +
12410 +void __init alternative_instructions(void)
12411 +{
12412 + extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
12413 + apply_alternatives(__alt_instructions, __alt_instructions_end);
12414 +}
12415 +
12416 +static char * __init machine_specific_memory_setup(void);
12417 +
12418 +#ifdef CONFIG_MCA
12419 +static void set_mca_bus(int x)
12420 +{
12421 + MCA_bus = x;
12422 +}
12423 +#else
12424 +static void set_mca_bus(int x) { }
12425 +#endif
12426 +
12427 +/*
12428 + * Determine if we were loaded by an EFI loader. If so, then we have also been
12429 + * passed the efi memmap, systab, etc., so we should use these data structures
12430 + * for initialization. Note, the efi init code path is determined by the
12431 + * global efi_enabled. This allows the same kernel image to be used on existing
12432 + * systems (with a traditional BIOS) as well as on EFI systems.
12433 + */
12434 +void __init setup_arch(char **cmdline_p)
12435 +{
12436 + int i, j, k, fpp;
12437 + struct physdev_set_iopl set_iopl;
12438 + unsigned long max_low_pfn;
12439 +
12440 + /* Force a quick death if the kernel panics (not domain 0). */
12441 + extern int panic_timeout;
12442 + if (!panic_timeout && !is_initial_xendomain())
12443 + panic_timeout = 1;
12444 +
12445 + /* Register a call for panic conditions. */
12446 + notifier_chain_register(&panic_notifier_list, &xen_panic_block);
12447 +
12448 + HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments);
12449 + HYPERVISOR_vm_assist(VMASST_CMD_enable,
12450 + VMASST_TYPE_writable_pagetables);
12451 +
12452 + memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
12453 + early_cpu_init();
12454 +
12455 + /*
12456 + * FIXME: This isn't an official loader_type right
12457 + * now but does currently work with elilo.
12458 + * If we were configured as an EFI kernel, check to make
12459 + * sure that we were loaded correctly from elilo and that
12460 + * the system table is valid. If not, then initialize normally.
12461 + */
12462 +#ifdef CONFIG_EFI
12463 + if ((LOADER_TYPE == 0x50) && EFI_SYSTAB)
12464 + efi_enabled = 1;
12465 +#endif
12466 +
12467 + /* This must be initialized to UNNAMED_MAJOR for ipconfig to work
12468 + properly. Setting ROOT_DEV to default to /dev/ram0 breaks initrd.
12469 + */
12470 + ROOT_DEV = MKDEV(UNNAMED_MAJOR,0);
12471 + drive_info = DRIVE_INFO;
12472 + screen_info = SCREEN_INFO;
12473 + edid_info = EDID_INFO;
12474 + apm_info.bios = APM_BIOS_INFO;
12475 + ist_info = IST_INFO;
12476 + saved_videomode = VIDEO_MODE;
12477 + if( SYS_DESC_TABLE.length != 0 ) {
12478 + set_mca_bus(SYS_DESC_TABLE.table[3] & 0x2);
12479 + machine_id = SYS_DESC_TABLE.table[0];
12480 + machine_submodel_id = SYS_DESC_TABLE.table[1];
12481 + BIOS_revision = SYS_DESC_TABLE.table[2];
12482 + }
12483 + bootloader_type = LOADER_TYPE;
12484 +
12485 + if (is_initial_xendomain()) {
12486 + /* This is drawn from a dump from vgacon:startup in
12487 + * standard Linux. */
12488 + screen_info.orig_video_mode = 3;
12489 + screen_info.orig_video_isVGA = 1;
12490 + screen_info.orig_video_lines = 25;
12491 + screen_info.orig_video_cols = 80;
12492 + screen_info.orig_video_ega_bx = 3;
12493 + screen_info.orig_video_points = 16;
12494 + screen_info.orig_y = screen_info.orig_video_lines - 1;
12495 + if (xen_start_info->console.dom0.info_size >=
12496 + sizeof(struct dom0_vga_console_info)) {
12497 + const struct dom0_vga_console_info *info =
12498 + (struct dom0_vga_console_info *)(
12499 + (char *)xen_start_info +
12500 + xen_start_info->console.dom0.info_off);
12501 + dom0_init_screen_info(info);
12502 + }
12503 + xen_start_info->console.domU.mfn = 0;
12504 + xen_start_info->console.domU.evtchn = 0;
12505 + } else
12506 + screen_info.orig_video_isVGA = 0;
12507 +
12508 +#ifdef CONFIG_BLK_DEV_RAM
12509 + rd_image_start = RAMDISK_FLAGS & RAMDISK_IMAGE_START_MASK;
12510 + rd_prompt = ((RAMDISK_FLAGS & RAMDISK_PROMPT_FLAG) != 0);
12511 + rd_doload = ((RAMDISK_FLAGS & RAMDISK_LOAD_FLAG) != 0);
12512 +#endif
12513 +
12514 + setup_xen_features();
12515 +
12516 + ARCH_SETUP
12517 + if (efi_enabled)
12518 + efi_init();
12519 + else {
12520 + printk(KERN_INFO "BIOS-provided physical RAM map:\n");
12521 + print_memory_map(machine_specific_memory_setup());
12522 + }
12523 +
12524 + copy_edd();
12525 +
12526 + if (!MOUNT_ROOT_RDONLY)
12527 + root_mountflags &= ~MS_RDONLY;
12528 + init_mm.start_code = (unsigned long) _text;
12529 + init_mm.end_code = (unsigned long) _etext;
12530 + init_mm.end_data = (unsigned long) _edata;
12531 + init_mm.brk = (PFN_UP(__pa(xen_start_info->pt_base)) +
12532 + xen_start_info->nr_pt_frames) << PAGE_SHIFT;
12533 +
12534 + code_resource.start = virt_to_phys(_text);
12535 + code_resource.end = virt_to_phys(_etext)-1;
12536 + data_resource.start = virt_to_phys(_etext);
12537 + data_resource.end = virt_to_phys(_edata)-1;
12538 +
12539 + parse_cmdline_early(cmdline_p);
12540 +
12541 + max_low_pfn = setup_memory();
12542 +
12543 + /*
12544 + * NOTE: before this point _nobody_ is allowed to allocate
12545 + * any memory using the bootmem allocator. Although the
12546 +	 * allocator is now initialised, only the first 8Mb of the kernel
12547 + * virtual address space has been mapped. All allocations before
12548 + * paging_init() has completed must use the alloc_bootmem_low_pages()
12549 + * variant (which allocates DMA'able memory) and care must be taken
12550 + * not to exceed the 8Mb limit.
12551 + */
12552 +
12553 +#ifdef CONFIG_SMP
12554 + smp_alloc_memory(); /* AP processor realmode stacks in low memory*/
12555 +#endif
12556 + paging_init();
12557 + remapped_pgdat_init();
12558 + sparse_init();
12559 + zone_sizes_init();
12560 +
12561 +#ifdef CONFIG_X86_FIND_SMP_CONFIG
12562 + /*
12563 + * Find and reserve possible boot-time SMP configuration:
12564 + */
12565 + find_smp_config();
12566 +#endif
12567 +
12568 + /* Make sure we have a correctly sized P->M table. */
12569 + if (!xen_feature(XENFEAT_auto_translated_physmap)) {
12570 + phys_to_machine_mapping = alloc_bootmem_low_pages(
12571 + max_pfn * sizeof(unsigned long));
12572 + memset(phys_to_machine_mapping, ~0,
12573 + max_pfn * sizeof(unsigned long));
12574 + memcpy(phys_to_machine_mapping,
12575 + (unsigned long *)xen_start_info->mfn_list,
12576 + xen_start_info->nr_pages * sizeof(unsigned long));
12577 + free_bootmem(
12578 + __pa(xen_start_info->mfn_list),
12579 + PFN_PHYS(PFN_UP(xen_start_info->nr_pages *
12580 + sizeof(unsigned long))));
12581 +
12582 + /*
12583 + * Initialise the list of the frames that specify the list of
12584 + * frames that make up the p2m table. Used by save/restore
12585 + */
12586 + pfn_to_mfn_frame_list_list = alloc_bootmem_low_pages(PAGE_SIZE);
12587 + HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
12588 + virt_to_mfn(pfn_to_mfn_frame_list_list);
12589 +
12590 + fpp = PAGE_SIZE/sizeof(unsigned long);
12591 + for (i=0, j=0, k=-1; i< max_pfn; i+=fpp, j++) {
12592 + if ((j % fpp) == 0) {
12593 + k++;
12594 + BUG_ON(k>=16);
12595 + pfn_to_mfn_frame_list[k] =
12596 + alloc_bootmem_low_pages(PAGE_SIZE);
12597 + pfn_to_mfn_frame_list_list[k] =
12598 + virt_to_mfn(pfn_to_mfn_frame_list[k]);
12599 + j=0;
12600 + }
12601 + pfn_to_mfn_frame_list[k][j] =
12602 + virt_to_mfn(&phys_to_machine_mapping[i]);
12603 + }
12604 + HYPERVISOR_shared_info->arch.max_pfn = max_pfn;
12605 + }
12606 +
12607 + /*
12608 + * NOTE: at this point the bootmem allocator is fully available.
12609 + */
12610 +
12611 +#ifdef CONFIG_EARLY_PRINTK
12612 + {
12613 + char *s = strstr(*cmdline_p, "earlyprintk=");
12614 + if (s) {
12615 + extern void setup_early_printk(char *);
12616 +
12617 + setup_early_printk(strchr(s, '=') + 1);
12618 + printk("early console enabled\n");
12619 + }
12620 + }
12621 +#endif
12622 +
12623 + if (is_initial_xendomain())
12624 + dmi_scan_machine();
12625 +
12626 +#ifdef CONFIG_X86_GENERICARCH
12627 + generic_apic_probe(*cmdline_p);
12628 +#endif
12629 + if (efi_enabled)
12630 + efi_map_memmap();
12631 +
12632 + set_iopl.iopl = 1;
12633 + HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
12634 +
12635 +#ifdef CONFIG_X86_IO_APIC
12636 + check_acpi_pci(); /* Checks more than just ACPI actually */
12637 +#endif
12638 +
12639 +#ifdef CONFIG_ACPI
12640 + if (!is_initial_xendomain()) {
12641 + printk(KERN_INFO "ACPI in unprivileged domain disabled\n");
12642 + acpi_disabled = 1;
12643 + acpi_ht = 0;
12644 + }
12645 +
12646 + /*
12647 + * Parse the ACPI tables for possible boot-time SMP configuration.
12648 + */
12649 + acpi_boot_table_init();
12650 + acpi_boot_init();
12651 +
12652 +#if defined(CONFIG_SMP) && defined(CONFIG_X86_PC)
12653 + if (def_to_bigsmp)
12654 + printk(KERN_WARNING "More than 8 CPUs detected and "
12655 + "CONFIG_X86_PC cannot handle it.\nUse "
12656 + "CONFIG_X86_GENERICARCH or CONFIG_X86_BIGSMP.\n");
12657 +#endif
12658 +#endif
12659 +#ifdef CONFIG_X86_LOCAL_APIC
12660 + if (smp_found_config)
12661 + get_smp_config();
12662 +#endif
12663 +
12664 + register_memory();
12665 +
12666 + if (is_initial_xendomain()) {
12667 +#ifdef CONFIG_VT
12668 +#if defined(CONFIG_VGA_CONSOLE)
12669 + if (!efi_enabled ||
12670 + (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY))
12671 + conswitchp = &vga_con;
12672 +#elif defined(CONFIG_DUMMY_CONSOLE)
12673 + conswitchp = &dummy_con;
12674 +#endif
12675 +#endif
12676 + } else {
12677 +#if defined(CONFIG_VT) && defined(CONFIG_DUMMY_CONSOLE)
12678 + conswitchp = &dummy_con;
12679 +#endif
12680 + }
12681 + xencons_early_setup();
12682 +}
12683 +
12684 +static int
12685 +xen_panic_event(struct notifier_block *this, unsigned long event, void *ptr)
12686 +{
12687 + HYPERVISOR_shutdown(SHUTDOWN_crash);
12688 + /* we're never actually going to get here... */
12689 + return NOTIFY_DONE;
12690 +}
12691 +
12692 +#include "setup_arch_post.h"
12693 +/*
12694 + * Local Variables:
12695 + * mode:c
12696 + * c-file-style:"k&r"
12697 + * c-basic-offset:8
12698 + * End:
12699 + */
12700 diff -Nur linux-2.6.16.33-noxen/arch/i386/kernel/smp-xen.c linux-2.6.16.33/arch/i386/kernel/smp-xen.c
12701 --- linux-2.6.16.33-noxen/arch/i386/kernel/smp-xen.c 1970-01-01 00:00:00.000000000 +0000
12702 +++ linux-2.6.16.33/arch/i386/kernel/smp-xen.c 2007-01-08 15:00:45.000000000 +0000
12703 @@ -0,0 +1,618 @@
12704 +/*
12705 + * Intel SMP support routines.
12706 + *
12707 + * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
12708 + * (c) 1998-99, 2000 Ingo Molnar <mingo@redhat.com>
12709 + *
12710 + * This code is released under the GNU General Public License version 2 or
12711 + * later.
12712 + */
12713 +
12714 +#include <linux/init.h>
12715 +
12716 +#include <linux/mm.h>
12717 +#include <linux/delay.h>
12718 +#include <linux/spinlock.h>
12719 +#include <linux/smp_lock.h>
12720 +#include <linux/kernel_stat.h>
12721 +#include <linux/mc146818rtc.h>
12722 +#include <linux/cache.h>
12723 +#include <linux/interrupt.h>
12724 +#include <linux/cpu.h>
12725 +#include <linux/module.h>
12726 +
12727 +#include <asm/mtrr.h>
12728 +#include <asm/tlbflush.h>
12729 +#if 0
12730 +#include <mach_apic.h>
12731 +#endif
12732 +#include <xen/evtchn.h>
12733 +
12734 +/*
12735 + * Some notes on x86 processor bugs affecting SMP operation:
12736 + *
12737 + * Pentium, Pentium Pro, II, III (and all CPUs) have bugs.
12738 + * The Linux implications for SMP are handled as follows:
12739 + *
12740 + * Pentium III / [Xeon]
12741 + * None of the E1AP-E3AP errata are visible to the user.
12742 + *
12743 + * E1AP. see PII A1AP
12744 + * E2AP. see PII A2AP
12745 + * E3AP. see PII A3AP
12746 + *
12747 + * Pentium II / [Xeon]
12748 + * None of the A1AP-A3AP errata are visible to the user.
12749 + *
12750 + * A1AP. see PPro 1AP
12751 + * A2AP. see PPro 2AP
12752 + * A3AP. see PPro 7AP
12753 + *
12754 + * Pentium Pro
12755 + * None of 1AP-9AP errata are visible to the normal user,
12756 + * except occasional delivery of 'spurious interrupt' as trap #15.
12757 + * This is very rare and a non-problem.
12758 + *
12759 + * 1AP. Linux maps APIC as non-cacheable
12760 + * 2AP. worked around in hardware
12761 + * 3AP. fixed in C0 and above steppings microcode update.
12762 + * Linux does not use excessive STARTUP_IPIs.
12763 + * 4AP. worked around in hardware
12764 + * 5AP. symmetric IO mode (normal Linux operation) not affected.
12765 + * 'noapic' mode has vector 0xf filled out properly.
12766 + * 6AP. 'noapic' mode might be affected - fixed in later steppings
12767 + * 7AP. We do not assume writes to the LVT deasserting IRQs
12768 + * 8AP. We do not enable low power mode (deep sleep) during MP bootup
12769 + * 9AP. We do not use mixed mode
12770 + *
12771 + * Pentium
12772 + * There is a marginal case where REP MOVS on 100MHz SMP
12773 + * machines with B stepping processors can fail. XXX should provide
12774 + * an L1cache=Writethrough or L1cache=off option.
12775 + *
12776 + * B stepping CPUs may hang. There are hardware work arounds
12777 + * for this. We warn about it in case your board doesn't have the work
12778 + * arounds. Basically that's so I can tell anyone with a B stepping
12779 + * CPU and SMP problems "tough".
12780 + *
12781 + * Specific items [From Pentium Processor Specification Update]
12782 + *
12783 + * 1AP. Linux doesn't use remote read
12784 + * 2AP. Linux doesn't trust APIC errors
12785 + * 3AP. We work around this
12786 + * 4AP. Linux never generated 3 interrupts of the same priority
12787 + * to cause a lost local interrupt.
12788 + * 5AP. Remote read is never used
12789 + * 6AP. not affected - worked around in hardware
12790 + * 7AP. not affected - worked around in hardware
12791 + * 8AP. worked around in hardware - we get explicit CS errors if not
12792 + * 9AP. only 'noapic' mode affected. Might generate spurious
12793 + * interrupts, we log only the first one and count the
12794 + * rest silently.
12795 + * 10AP. not affected - worked around in hardware
12796 + * 11AP. Linux reads the APIC between writes to avoid this, as per
12797 + * the documentation. Make sure you preserve this as it affects
12798 + * the C stepping chips too.
12799 + * 12AP. not affected - worked around in hardware
12800 + * 13AP. not affected - worked around in hardware
12801 + * 14AP. we always deassert INIT during bootup
12802 + * 15AP. not affected - worked around in hardware
12803 + * 16AP. not affected - worked around in hardware
12804 + * 17AP. not affected - worked around in hardware
12805 + * 18AP. not affected - worked around in hardware
12806 + * 19AP. not affected - worked around in BIOS
12807 + *
12808 + * If this sounds worrying, believe me, these bugs are either ___RARE___,
12809 + * or are signal timing bugs worked around in hardware and there's
12810 + * about nothing of note with C stepping upwards.
12811 + */
12812 +
12813 +DEFINE_PER_CPU(struct tlb_state, cpu_tlbstate) ____cacheline_aligned = { &init_mm, 0, };
12814 +
12815 +/*
12816 + * the following functions deal with sending IPIs between CPUs.
12817 + *
12818 + * We use 'broadcast', CPU->CPU IPIs and self-IPIs too.
12819 + */
12820 +
12821 +static inline int __prepare_ICR (unsigned int shortcut, int vector)
12822 +{
12823 + return APIC_DM_FIXED | shortcut | vector | APIC_DEST_LOGICAL;
12824 +}
12825 +
12826 +static inline int __prepare_ICR2 (unsigned int mask)
12827 +{
12828 + return SET_APIC_DEST_FIELD(mask);
12829 +}
12830 +
12831 +DECLARE_PER_CPU(int, ipi_to_irq[NR_IPIS]);
12832 +
12833 +static inline void __send_IPI_one(unsigned int cpu, int vector)
12834 +{
12835 + int irq = per_cpu(ipi_to_irq, cpu)[vector];
12836 + BUG_ON(irq < 0);
12837 + notify_remote_via_irq(irq);
12838 +}
12839 +
12840 +void __send_IPI_shortcut(unsigned int shortcut, int vector)
12841 +{
12842 + int cpu;
12843 +
12844 + switch (shortcut) {
12845 + case APIC_DEST_SELF:
12846 + __send_IPI_one(smp_processor_id(), vector);
12847 + break;
12848 + case APIC_DEST_ALLBUT:
12849 + for (cpu = 0; cpu < NR_CPUS; ++cpu) {
12850 + if (cpu == smp_processor_id())
12851 + continue;
12852 + if (cpu_isset(cpu, cpu_online_map)) {
12853 + __send_IPI_one(cpu, vector);
12854 + }
12855 + }
12856 + break;
12857 + default:
12858 + printk("XXXXXX __send_IPI_shortcut %08x vector %d\n", shortcut,
12859 + vector);
12860 + break;
12861 + }
12862 +}
12863 +
12864 +void fastcall send_IPI_self(int vector)
12865 +{
12866 + __send_IPI_shortcut(APIC_DEST_SELF, vector);
12867 +}
12868 +
12869 +/*
12870 + * This is only used on smaller machines.
12871 + */
12872 +void send_IPI_mask_bitmask(cpumask_t mask, int vector)
12873 +{
12874 + unsigned long flags;
12875 + unsigned int cpu;
12876 +
12877 + local_irq_save(flags);
12878 + WARN_ON(cpus_addr(mask)[0] & ~cpus_addr(cpu_online_map)[0]);
12879 +
12880 + for (cpu = 0; cpu < NR_CPUS; ++cpu) {
12881 + if (cpu_isset(cpu, mask)) {
12882 + __send_IPI_one(cpu, vector);
12883 + }
12884 + }
12885 +
12886 + local_irq_restore(flags);
12887 +}
12888 +
12889 +void send_IPI_mask_sequence(cpumask_t mask, int vector)
12890 +{
12891 +
12892 + send_IPI_mask_bitmask(mask, vector);
12893 +}
12894 +
12895 +#include <mach_ipi.h> /* must come after the send_IPI functions above for inlining */
12896 +
12897 +#if 0 /* XEN */
12898 +/*
12899 + * Smarter SMP flushing macros.
12900 + * c/o Linus Torvalds.
12901 + *
12902 + * These mean you can really definitely utterly forget about
12903 + * writing to user space from interrupts. (It's not allowed anyway).
12904 + *
12905 + * Optimizations Manfred Spraul <manfred@colorfullife.com>
12906 + */
12907 +
12908 +static cpumask_t flush_cpumask;
12909 +static struct mm_struct * flush_mm;
12910 +static unsigned long flush_va;
12911 +static DEFINE_SPINLOCK(tlbstate_lock);
12912 +#define FLUSH_ALL 0xffffffff
12913 +
12914 +/*
12915 + * We cannot call mmdrop() because we are in interrupt context,
12916 + * instead update mm->cpu_vm_mask.
12917 + *
12918 + * We need to reload %cr3 since the page tables may be going
12919 + * away from under us..
12920 + */
12921 +static inline void leave_mm (unsigned long cpu)
12922 +{
12923 + if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK)
12924 + BUG();
12925 + cpu_clear(cpu, per_cpu(cpu_tlbstate, cpu).active_mm->cpu_vm_mask);
12926 + load_cr3(swapper_pg_dir);
12927 +}
12928 +
12929 +/*
12930 + *
12931 + * The flush IPI assumes that a thread switch happens in this order:
12932 + * [cpu0: the cpu that switches]
12933 + * 1) switch_mm() either 1a) or 1b)
12934 + * 1a) thread switch to a different mm
12935 + * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask);
12936 + * Stop ipi delivery for the old mm. This is not synchronized with
12937 + * the other cpus, but smp_invalidate_interrupt ignores flush ipis
12938 + * for the wrong mm, and in the worst case we perform a superfluous
12939 + * tlb flush.
12940 + * 1a2) set cpu_tlbstate to TLBSTATE_OK
12941 + * Now the smp_invalidate_interrupt won't call leave_mm if cpu0
12942 + * was in lazy tlb mode.
12943 + * 1a3) update cpu_tlbstate[].active_mm
12944 + * Now cpu0 accepts tlb flushes for the new mm.
12945 + * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask);
12946 + * Now the other cpus will send tlb flush ipis.
12947 + * 1a5) change cr3.
12948 + * 1b) thread switch without mm change
12949 + * cpu_tlbstate[].active_mm is correct, cpu0 already handles
12950 + * flush ipis.
12951 + * 1b1) set cpu_tlbstate to TLBSTATE_OK
12952 + * 1b2) test_and_set the cpu bit in cpu_vm_mask.
12953 + * Atomically set the bit [other cpus will start sending flush ipis],
12954 + * and test the bit.
12955 + * 1b3) if the bit was 0: leave_mm was called, flush the tlb.
12956 + * 2) switch %%esp, ie current
12957 + *
12958 + * The interrupt must handle 2 special cases:
12959 + * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm.
12960 + * - the cpu performs speculative tlb reads, i.e. even if the cpu only
12961 + * runs in kernel space, the cpu could load tlb entries for user space
12962 + * pages.
12963 + *
12964 + * The good news is that cpu_tlbstate is local to each cpu, no
12965 + * write/read ordering problems.
12966 + */
12967 +
12968 +/*
12969 + * TLB flush IPI:
12970 + *
12971 + * 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
12972 + * 2) Leave the mm if we are in the lazy tlb mode.
12973 + */
12974 +
12975 +irqreturn_t smp_invalidate_interrupt(int irq, void *dev_id,
12976 + struct pt_regs *regs)
12977 +{
12978 + unsigned long cpu;
12979 +
12980 + cpu = get_cpu();
12981 +
12982 + if (!cpu_isset(cpu, flush_cpumask))
12983 + goto out;
12984 + /*
12985 + * This was a BUG() but until someone can quote me the
12986 + * line from the Intel manual that guarantees an IPI to
12987 + * multiple CPUs is retried _only_ on the erroring CPUs,
12988 + * it's staying as a return
12989 + *
12990 + * BUG();
12991 + */
12992 +
12993 + if (flush_mm == per_cpu(cpu_tlbstate, cpu).active_mm) {
12994 + if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) {
12995 + if (flush_va == FLUSH_ALL)
12996 + local_flush_tlb();
12997 + else
12998 + __flush_tlb_one(flush_va);
12999 + } else
13000 + leave_mm(cpu);
13001 + }
13002 + smp_mb__before_clear_bit();
13003 + cpu_clear(cpu, flush_cpumask);
13004 + smp_mb__after_clear_bit();
13005 +out:
13006 + put_cpu_no_resched();
13007 +
13008 + return IRQ_HANDLED;
13009 +}
13010 +
13011 +static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm,
13012 + unsigned long va)
13013 +{
13014 + /*
13015 + * A couple of (to be removed) sanity checks:
13016 + *
13017 + * - current CPU must not be in mask
13018 + * - mask must exist :)
13019 + */
13020 + BUG_ON(cpus_empty(cpumask));
13021 + BUG_ON(cpu_isset(smp_processor_id(), cpumask));
13022 + BUG_ON(!mm);
13023 +
13024 + /* If a CPU which we ran on has gone down, OK. */
13025 + cpus_and(cpumask, cpumask, cpu_online_map);
13026 + if (cpus_empty(cpumask))
13027 + return;
13028 +
13029 + /*
13030 + * I'm not happy about this global shared spinlock in the
13031 + * MM hot path, but we'll see how contended it is.
13032 + * Temporarily this turns IRQs off, so that lockups are
13033 + * detected by the NMI watchdog.
13034 + */
13035 + spin_lock(&tlbstate_lock);
13036 +
13037 + flush_mm = mm;
13038 + flush_va = va;
13039 +#if NR_CPUS <= BITS_PER_LONG
13040 + atomic_set_mask(cpumask, &flush_cpumask);
13041 +#else
13042 + {
13043 + int k;
13044 + unsigned long *flush_mask = (unsigned long *)&flush_cpumask;
13045 + unsigned long *cpu_mask = (unsigned long *)&cpumask;
13046 + for (k = 0; k < BITS_TO_LONGS(NR_CPUS); ++k)
13047 + atomic_set_mask(cpu_mask[k], &flush_mask[k]);
13048 + }
13049 +#endif
13050 + /*
13051 + * We have to send the IPI only to
13052 + * CPUs affected.
13053 + */
13054 + send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR);
13055 +
13056 + while (!cpus_empty(flush_cpumask))
13057 + /* nothing. lockup detection does not belong here */
13058 + mb();
13059 +
13060 + flush_mm = NULL;
13061 + flush_va = 0;
13062 + spin_unlock(&tlbstate_lock);
13063 +}
13064 +
13065 +void flush_tlb_current_task(void)
13066 +{
13067 + struct mm_struct *mm = current->mm;
13068 + cpumask_t cpu_mask;
13069 +
13070 + preempt_disable();
13071 + cpu_mask = mm->cpu_vm_mask;
13072 + cpu_clear(smp_processor_id(), cpu_mask);
13073 +
13074 + local_flush_tlb();
13075 + if (!cpus_empty(cpu_mask))
13076 + flush_tlb_others(cpu_mask, mm, FLUSH_ALL);
13077 + preempt_enable();
13078 +}
13079 +
13080 +void flush_tlb_mm (struct mm_struct * mm)
13081 +{
13082 + cpumask_t cpu_mask;
13083 +
13084 + preempt_disable();
13085 + cpu_mask = mm->cpu_vm_mask;
13086 + cpu_clear(smp_processor_id(), cpu_mask);
13087 +
13088 + if (current->active_mm == mm) {
13089 + if (current->mm)
13090 + local_flush_tlb();
13091 + else
13092 + leave_mm(smp_processor_id());
13093 + }
13094 + if (!cpus_empty(cpu_mask))
13095 + flush_tlb_others(cpu_mask, mm, FLUSH_ALL);
13096 +
13097 + preempt_enable();
13098 +}
13099 +
13100 +void flush_tlb_page(struct vm_area_struct * vma, unsigned long va)
13101 +{
13102 + struct mm_struct *mm = vma->vm_mm;
13103 + cpumask_t cpu_mask;
13104 +
13105 + preempt_disable();
13106 + cpu_mask = mm->cpu_vm_mask;
13107 + cpu_clear(smp_processor_id(), cpu_mask);
13108 +
13109 + if (current->active_mm == mm) {
13110 + if(current->mm)
13111 + __flush_tlb_one(va);
13112 + else
13113 + leave_mm(smp_processor_id());
13114 + }
13115 +
13116 + if (!cpus_empty(cpu_mask))
13117 + flush_tlb_others(cpu_mask, mm, va);
13118 +
13119 + preempt_enable();
13120 +}
13121 +EXPORT_SYMBOL(flush_tlb_page);
13122 +
13123 +static void do_flush_tlb_all(void* info)
13124 +{
13125 + unsigned long cpu = smp_processor_id();
13126 +
13127 + __flush_tlb_all();
13128 + if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_LAZY)
13129 + leave_mm(cpu);
13130 +}
13131 +
13132 +void flush_tlb_all(void)
13133 +{
13134 + on_each_cpu(do_flush_tlb_all, NULL, 1, 1);
13135 +}
13136 +
13137 +#else
13138 +
13139 +irqreturn_t smp_invalidate_interrupt(int irq, void *dev_id,
13140 + struct pt_regs *regs)
13141 +{ return 0; }
13142 +void flush_tlb_current_task(void)
13143 +{ xen_tlb_flush_mask(&current->mm->cpu_vm_mask); }
13144 +void flush_tlb_mm(struct mm_struct * mm)
13145 +{ xen_tlb_flush_mask(&mm->cpu_vm_mask); }
13146 +void flush_tlb_page(struct vm_area_struct *vma, unsigned long va)
13147 +{ xen_invlpg_mask(&vma->vm_mm->cpu_vm_mask, va); }
13148 +EXPORT_SYMBOL(flush_tlb_page);
13149 +void flush_tlb_all(void)
13150 +{ xen_tlb_flush_all(); }
13151 +
13152 +#endif /* XEN */
13153 +
13154 +/*
13155 + * this function sends a 'reschedule' IPI to another CPU.
13156 + * it goes straight through and wastes no time serializing
13157 + * anything. Worst case is that we lose a reschedule ...
13158 + */
13159 +void smp_send_reschedule(int cpu)
13160 +{
13161 + WARN_ON(cpu_is_offline(cpu));
13162 + send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR);
13163 +}
13164 +
13165 +/*
13166 + * Structure and data for smp_call_function(). This is designed to minimise
13167 + * static memory requirements. It also looks cleaner.
13168 + */
13169 +static DEFINE_SPINLOCK(call_lock);
13170 +
13171 +struct call_data_struct {
13172 + void (*func) (void *info);
13173 + void *info;
13174 + atomic_t started;
13175 + atomic_t finished;
13176 + int wait;
13177 +};
13178 +
13179 +void lock_ipi_call_lock(void)
13180 +{
13181 + spin_lock_irq(&call_lock);
13182 +}
13183 +
13184 +void unlock_ipi_call_lock(void)
13185 +{
13186 + spin_unlock_irq(&call_lock);
13187 +}
13188 +
13189 +static struct call_data_struct * call_data;
13190 +
13191 +/*
13192 + * this function sends a 'generic call function' IPI to all other CPUs
13193 + * in the system.
13194 + */
13195 +
13196 +int smp_call_function (void (*func) (void *info), void *info, int nonatomic,
13197 + int wait)
13198 +/*
13199 + * [SUMMARY] Run a function on all other CPUs.
13200 + * <func> The function to run. This must be fast and non-blocking.
13201 + * <info> An arbitrary pointer to pass to the function.
13202 + * <nonatomic> currently unused.
13203 + * <wait> If true, wait (atomically) until function has completed on other CPUs.
13204 + * [RETURNS] 0 on success, else a negative status code. Does not return until
13205 + * remote CPUs are nearly ready to execute <<func>> or are or have executed.
13206 + * remote CPUs are nearly ready to execute <<func>> or have already executed it.
13207 + * You must not call this function with disabled interrupts or from a
13208 + * hardware interrupt handler or from a bottom half handler.
13209 + */
13210 +{
13211 + struct call_data_struct data;
13212 + int cpus;
13213 +
13214 + /* Holding any lock stops cpus from going down. */
13215 + spin_lock(&call_lock);
13216 + cpus = num_online_cpus() - 1;
13217 + if (!cpus) {
13218 + spin_unlock(&call_lock);
13219 + return 0;
13220 + }
13221 +
13222 + /* Can deadlock when called with interrupts disabled */
13223 + WARN_ON(irqs_disabled());
13224 +
13225 + data.func = func;
13226 + data.info = info;
13227 + atomic_set(&data.started, 0);
13228 + data.wait = wait;
13229 + if (wait)
13230 + atomic_set(&data.finished, 0);
13231 +
13232 + call_data = &data;
13233 + mb();
13234 +
13235 + /* Send a message to all other CPUs and wait for them to respond */
13236 + send_IPI_allbutself(CALL_FUNCTION_VECTOR);
13237 +
13238 + /* Wait for response */
13239 + while (atomic_read(&data.started) != cpus)
13240 + barrier();
13241 +
13242 + if (wait)
13243 + while (atomic_read(&data.finished) != cpus)
13244 + barrier();
13245 + spin_unlock(&call_lock);
13246 +
13247 + return 0;
13248 +}
13249 +EXPORT_SYMBOL(smp_call_function);
13250 +
13251 +static void stop_this_cpu (void * dummy)
13252 +{
13253 + /*
13254 + * Remove this CPU:
13255 + */
13256 + cpu_clear(smp_processor_id(), cpu_online_map);
13257 + local_irq_disable();
13258 +#if 0
13259 + disable_local_APIC();
13260 +#endif
13261 + if (cpu_data[smp_processor_id()].hlt_works_ok)
13262 + for (;;) halt();
13263 + for (;;);
13264 +}
13265 +
13266 +/*
13267 + * this function calls the 'stop' function on all other CPUs in the system.
13268 + */
13269 +
13270 +void smp_send_stop(void)
13271 +{
13272 + smp_call_function(stop_this_cpu, NULL, 1, 0);
13273 +
13274 + local_irq_disable();
13275 +#if 0
13276 + disable_local_APIC();
13277 +#endif
13278 + local_irq_enable();
13279 +}
13280 +
13281 +/*
13282 + * Reschedule callback. Nothing to do,
13283 + * all the work is done automatically when
13284 + * we return from the interrupt.
13285 + */
13286 +irqreturn_t smp_reschedule_interrupt(int irq, void *dev_id,
13287 + struct pt_regs *regs)
13288 +{
13289 +
13290 + return IRQ_HANDLED;
13291 +}
13292 +
13293 +#include <linux/kallsyms.h>
13294 +irqreturn_t smp_call_function_interrupt(int irq, void *dev_id,
13295 + struct pt_regs *regs)
13296 +{
13297 + void (*func) (void *info) = call_data->func;
13298 + void *info = call_data->info;
13299 + int wait = call_data->wait;
13300 +
13301 + /*
13302 + * Notify initiating CPU that I've grabbed the data and am
13303 + * about to execute the function
13304 + */
13305 + mb();
13306 + atomic_inc(&call_data->started);
13307 + /*
13308 + * At this point the info structure may be out of scope unless wait==1
13309 + */
13310 + irq_enter();
13311 + (*func)(info);
13312 + irq_exit();
13313 +
13314 + if (wait) {
13315 + mb();
13316 + atomic_inc(&call_data->finished);
13317 + }
13318 +
13319 + return IRQ_HANDLED;
13320 +}
13321 +
13322 diff -Nur linux-2.6.16.33-noxen/arch/i386/kernel/smpalts.c linux-2.6.16.33/arch/i386/kernel/smpalts.c
13323 --- linux-2.6.16.33-noxen/arch/i386/kernel/smpalts.c 1970-01-01 00:00:00.000000000 +0000
13324 +++ linux-2.6.16.33/arch/i386/kernel/smpalts.c 2007-01-08 15:00:45.000000000 +0000
13325 @@ -0,0 +1,85 @@
13326 +#include <linux/kernel.h>
13327 +#include <asm/system.h>
13328 +#include <asm/smp_alt.h>
13329 +#include <asm/processor.h>
13330 +#include <asm/string.h>
13331 +
13332 +struct smp_replacement_record {
13333 + unsigned char targ_size;
13334 + unsigned char smp1_size;
13335 + unsigned char smp2_size;
13336 + unsigned char up_size;
13337 + unsigned char feature;
13338 + unsigned char data[0];
13339 +};
13340 +
13341 +struct smp_alternative_record {
13342 + void *targ_start;
13343 + struct smp_replacement_record *repl;
13344 +};
13345 +
13346 +extern struct smp_alternative_record __start_smp_alternatives_table,
13347 + __stop_smp_alternatives_table;
13348 +extern unsigned long __init_begin, __init_end;
13349 +
13350 +void prepare_for_smp(void)
13351 +{
13352 + struct smp_alternative_record *r;
13353 + printk(KERN_INFO "Enabling SMP...\n");
13354 + for (r = &__start_smp_alternatives_table;
13355 + r != &__stop_smp_alternatives_table;
13356 + r++) {
13357 + BUG_ON(r->repl->targ_size < r->repl->smp1_size);
13358 + BUG_ON(r->repl->targ_size < r->repl->smp2_size);
13359 + BUG_ON(r->repl->targ_size < r->repl->up_size);
13360 + if (system_state == SYSTEM_RUNNING &&
13361 + r->targ_start >= (void *)&__init_begin &&
13362 + r->targ_start < (void *)&__init_end)
13363 + continue;
13364 + if (r->repl->feature != (unsigned char)-1 &&
13365 + boot_cpu_has(r->repl->feature)) {
13366 + memcpy(r->targ_start,
13367 + r->repl->data + r->repl->smp1_size,
13368 + r->repl->smp2_size);
13369 + memset(r->targ_start + r->repl->smp2_size,
13370 + 0x90,
13371 + r->repl->targ_size - r->repl->smp2_size);
13372 + } else {
13373 + memcpy(r->targ_start,
13374 + r->repl->data,
13375 + r->repl->smp1_size);
13376 + memset(r->targ_start + r->repl->smp1_size,
13377 + 0x90,
13378 + r->repl->targ_size - r->repl->smp1_size);
13379 + }
13380 + }
13381 + /* Paranoia */
13382 + asm volatile ("jmp 1f\n1:");
13383 + mb();
13384 +}
13385 +
13386 +void unprepare_for_smp(void)
13387 +{
13388 + struct smp_alternative_record *r;
13389 + printk(KERN_INFO "Disabling SMP...\n");
13390 + for (r = &__start_smp_alternatives_table;
13391 + r != &__stop_smp_alternatives_table;
13392 + r++) {
13393 + BUG_ON(r->repl->targ_size < r->repl->smp1_size);
13394 + BUG_ON(r->repl->targ_size < r->repl->smp2_size);
13395 + BUG_ON(r->repl->targ_size < r->repl->up_size);
13396 + if (system_state == SYSTEM_RUNNING &&
13397 + r->targ_start >= (void *)&__init_begin &&
13398 + r->targ_start < (void *)&__init_end)
13399 + continue;
13400 + memcpy(r->targ_start,
13401 + r->repl->data + r->repl->smp1_size + r->repl->smp2_size,
13402 + r->repl->up_size);
13403 + memset(r->targ_start + r->repl->up_size,
13404 + 0x90,
13405 + r->repl->targ_size - r->repl->up_size);
13406 + }
13407 + /* Paranoia */
13408 + asm volatile ("jmp 1f\n1:");
13409 + mb();
13410 +}
13411 diff -Nur linux-2.6.16.33-noxen/arch/i386/kernel/smpboot.c linux-2.6.16.33/arch/i386/kernel/smpboot.c
13412 --- linux-2.6.16.33-noxen/arch/i386/kernel/smpboot.c 2006-11-22 18:06:31.000000000 +0000
13413 +++ linux-2.6.16.33/arch/i386/kernel/smpboot.c 2007-01-08 15:00:45.000000000 +0000
13414 @@ -1218,6 +1218,11 @@
13415 if (max_cpus <= cpucount+1)
13416 continue;
13417
13418 +#ifdef CONFIG_SMP_ALTERNATIVES
13419 + if (kicked == 1)
13420 + prepare_for_smp();
13421 +#endif
13422 +
13423 if (((cpu = alloc_cpu_id()) <= 0) || do_boot_cpu(apicid, cpu))
13424 printk("CPU #%d not responding - cannot use it.\n",
13425 apicid);
13426 @@ -1396,6 +1401,11 @@
13427 return -EIO;
13428 }
13429
13430 +#ifdef CONFIG_SMP_ALTERNATIVES
13431 + if (num_online_cpus() == 1)
13432 + prepare_for_smp();
13433 +#endif
13434 +
13435 local_irq_enable();
13436 per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
13437 /* Unleash the CPU! */
13438 diff -Nur linux-2.6.16.33-noxen/arch/i386/kernel/swiotlb.c linux-2.6.16.33/arch/i386/kernel/swiotlb.c
13439 --- linux-2.6.16.33-noxen/arch/i386/kernel/swiotlb.c 1970-01-01 00:00:00.000000000 +0000
13440 +++ linux-2.6.16.33/arch/i386/kernel/swiotlb.c 2007-01-08 15:00:45.000000000 +0000
13441 @@ -0,0 +1,683 @@
13442 +/*
13443 + * Dynamic DMA mapping support.
13444 + *
13445 + * This implementation is a fallback for platforms that do not support
13446 + * I/O TLBs (aka DMA address translation hardware).
13447 + * Copyright (C) 2000 Asit Mallick <Asit.K.Mallick@intel.com>
13448 + * Copyright (C) 2000 Goutham Rao <goutham.rao@intel.com>
13449 + * Copyright (C) 2000, 2003 Hewlett-Packard Co
13450 + * David Mosberger-Tang <davidm@hpl.hp.com>
13451 + * Copyright (C) 2005 Keir Fraser <keir@xensource.com>
13452 + */
13453 +
13454 +#include <linux/cache.h>
13455 +#include <linux/mm.h>
13456 +#include <linux/module.h>
13457 +#include <linux/pci.h>
13458 +#include <linux/spinlock.h>
13459 +#include <linux/string.h>
13460 +#include <linux/types.h>
13461 +#include <linux/ctype.h>
13462 +#include <linux/init.h>
13463 +#include <linux/bootmem.h>
13464 +#include <linux/highmem.h>
13465 +#include <asm/io.h>
13466 +#include <asm/pci.h>
13467 +#include <asm/dma.h>
13468 +#include <asm/uaccess.h>
13469 +#include <xen/interface/memory.h>
13470 +
13471 +int swiotlb;
13472 +EXPORT_SYMBOL(swiotlb);
13473 +
13474 +#define OFFSET(val,align) ((unsigned long)((val) & ( (align) - 1)))
13475 +
13476 +#define SG_ENT_PHYS_ADDRESS(sg) (page_to_bus((sg)->page) + (sg)->offset)
13477 +
13478 +/*
13479 + * Maximum allowable number of contiguous slabs to map,
13480 + * must be a power of 2. What is the appropriate value ?
13481 + * The complexity of {map,unmap}_single is linearly dependent on this value.
13482 + */
13483 +#define IO_TLB_SEGSIZE 128
13484 +
13485 +/*
13486 + * log of the size of each IO TLB slab. The number of slabs is command line
13487 + * controllable.
13488 + */
13489 +#define IO_TLB_SHIFT 11
13490 +
13491 +/* Width of DMA addresses. 30 bits is a b44 limitation. */
13492 +#define DEFAULT_DMA_BITS 30
13493 +
13494 +static int swiotlb_force;
13495 +static char *iotlb_virt_start;
13496 +static unsigned long iotlb_nslabs;
13497 +
13498 +/*
13499 + * Used to do a quick range check in swiotlb_unmap_single and
13500 + * swiotlb_sync_single_*, to see if the memory was in fact allocated by this
13501 + * API.
13502 + */
13503 +static unsigned long iotlb_pfn_start, iotlb_pfn_end;
13504 +
13505 +/* Does the given dma address reside within the swiotlb aperture? */
13506 +static inline int in_swiotlb_aperture(dma_addr_t dev_addr)
13507 +{
13508 + unsigned long pfn = mfn_to_local_pfn(dev_addr >> PAGE_SHIFT);
13509 + return (pfn_valid(pfn)
13510 + && (pfn >= iotlb_pfn_start)
13511 + && (pfn < iotlb_pfn_end));
13512 +}
13513 +
13514 +/*
13515 + * When the IOMMU overflows we return a fallback buffer. This sets the size.
13516 + */
13517 +static unsigned long io_tlb_overflow = 32*1024;
13518 +
13519 +void *io_tlb_overflow_buffer;
13520 +
13521 +/*
13522 + * This is a free list describing the number of free entries available from
13523 + * each index
13524 + */
13525 +static unsigned int *io_tlb_list;
13526 +static unsigned int io_tlb_index;
13527 +
13528 +/*
13529 + * We need to save away the original address corresponding to a mapped entry
13530 + * for the sync operations.
13531 + */
13532 +static struct phys_addr {
13533 + struct page *page;
13534 + unsigned int offset;
13535 +} *io_tlb_orig_addr;
13536 +
13537 +/*
13538 + * Protect the above data structures in the map and unmap calls
13539 + */
13540 +static DEFINE_SPINLOCK(io_tlb_lock);
13541 +
13542 +unsigned int dma_bits = DEFAULT_DMA_BITS;
13543 +static int __init
13544 +setup_dma_bits(char *str)
13545 +{
13546 + dma_bits = simple_strtoul(str, NULL, 0);
13547 + return 0;
13548 +}
13549 +__setup("dma_bits=", setup_dma_bits);
13550 +
13551 +static int __init
13552 +setup_io_tlb_npages(char *str)
13553 +{
13554 + /* Unlike ia64, the size is the aperture in megabytes, not 'slabs'! */
13555 + if (isdigit(*str)) {
13556 + iotlb_nslabs = simple_strtoul(str, &str, 0) <<
13557 + (20 - IO_TLB_SHIFT);
13558 + iotlb_nslabs = ALIGN(iotlb_nslabs, IO_TLB_SEGSIZE);
13559 + /* Round up to power of two (xen_create_contiguous_region). */
13560 + while (iotlb_nslabs & (iotlb_nslabs-1))
13561 + iotlb_nslabs += iotlb_nslabs & ~(iotlb_nslabs-1);
13562 + }
13563 + if (*str == ',')
13564 + ++str;
13565 + /*
13566 + * NB. 'force' enables the swiotlb, but doesn't force its use for
13567 + * every DMA like it does on native Linux. 'off' forcibly disables
13568 + * use of the swiotlb.
13569 + */
13570 + if (!strcmp(str, "force"))
13571 + swiotlb_force = 1;
13572 + else if (!strcmp(str, "off"))
13573 + swiotlb_force = -1;
13574 + return 1;
13575 +}
13576 +__setup("swiotlb=", setup_io_tlb_npages);
13577 +/* make io_tlb_overflow tunable too? */
13578 +
13579 +/*
13580 + * Statically reserve bounce buffer space and initialize bounce buffer data
13581 + * structures for the software IO TLB used to implement the PCI DMA API.
13582 + */
13583 +void
13584 +swiotlb_init_with_default_size (size_t default_size)
13585 +{
13586 + unsigned long i, bytes;
13587 +
13588 + if (!iotlb_nslabs) {
13589 + iotlb_nslabs = (default_size >> IO_TLB_SHIFT);
13590 + iotlb_nslabs = ALIGN(iotlb_nslabs, IO_TLB_SEGSIZE);
13591 + /* Round up to power of two (xen_create_contiguous_region). */
13592 + while (iotlb_nslabs & (iotlb_nslabs-1))
13593 + iotlb_nslabs += iotlb_nslabs & ~(iotlb_nslabs-1);
13594 + }
13595 +
13596 + bytes = iotlb_nslabs * (1UL << IO_TLB_SHIFT);
13597 +
13598 + /*
13599 + * Get IO TLB memory from the low pages
13600 + */
13601 + iotlb_virt_start = alloc_bootmem_low_pages(bytes);
13602 + if (!iotlb_virt_start)
13603 + panic("Cannot allocate SWIOTLB buffer!\n"
13604 + "Use dom0_mem Xen boot parameter to reserve\n"
13605 + "some DMA memory (e.g., dom0_mem=-128M).\n");
13606 +
13607 + for (i = 0; i < iotlb_nslabs; i += IO_TLB_SEGSIZE) {
13608 + int rc = xen_create_contiguous_region(
13609 + (unsigned long)iotlb_virt_start + (i << IO_TLB_SHIFT),
13610 + get_order(IO_TLB_SEGSIZE << IO_TLB_SHIFT),
13611 + dma_bits);
13612 + BUG_ON(rc);
13613 + }
13614 +
13615 + /*
13616 + * Allocate and initialize the free list array. This array is used
13617 + * to find contiguous free memory regions of size up to IO_TLB_SEGSIZE.
13618 + */
13619 + io_tlb_list = alloc_bootmem(iotlb_nslabs * sizeof(int));
13620 + for (i = 0; i < iotlb_nslabs; i++)
13621 + io_tlb_list[i] = IO_TLB_SEGSIZE - OFFSET(i, IO_TLB_SEGSIZE);
13622 + io_tlb_index = 0;
13623 + io_tlb_orig_addr = alloc_bootmem(
13624 + iotlb_nslabs * sizeof(*io_tlb_orig_addr));
13625 +
13626 + /*
13627 + * Get the overflow emergency buffer
13628 + */
13629 + io_tlb_overflow_buffer = alloc_bootmem_low(io_tlb_overflow);
13630 +
13631 + iotlb_pfn_start = __pa(iotlb_virt_start) >> PAGE_SHIFT;
13632 + iotlb_pfn_end = iotlb_pfn_start + (bytes >> PAGE_SHIFT);
13633 +
13634 + printk(KERN_INFO "Software IO TLB enabled: \n"
13635 + " Aperture: %lu megabytes\n"
13636 + " Kernel range: 0x%016lx - 0x%016lx\n"
13637 + " Address size: %u bits\n",
13638 + bytes >> 20,
13639 + (unsigned long)iotlb_virt_start,
13640 + (unsigned long)iotlb_virt_start + bytes,
13641 + dma_bits);
13642 +}
13643 +
13644 +void
13645 +swiotlb_init(void)
13646 +{
13647 + long ram_end;
13648 + size_t defsz = 64 * (1 << 20); /* 64MB default size */
13649 +
13650 + if (swiotlb_force == 1) {
13651 + swiotlb = 1;
13652 + } else if ((swiotlb_force != -1) &&
13653 + is_running_on_xen() &&
13654 + is_initial_xendomain()) {
13655 + /* Domain 0 always has a swiotlb. */
13656 + ram_end = HYPERVISOR_memory_op(XENMEM_maximum_ram_page, NULL);
13657 + if (ram_end <= 0x7ffff)
13658 + defsz = 2 * (1 << 20); /* 2MB on <2GB systems. */
13659 + swiotlb = 1;
13660 + }
13661 +
13662 + if (swiotlb)
13663 + swiotlb_init_with_default_size(defsz);
13664 + else
13665 + printk(KERN_INFO "Software IO TLB disabled\n");
13666 +}
13667 +
13668 +/*
13669 + * We use __copy_to_user_inatomic to transfer to the host buffer because the
13670 + * buffer may be mapped read-only (e.g., in blkback driver) but lower-level
13671 + * drivers map the buffer for DMA_BIDIRECTIONAL access. This causes an
13672 + * unnecessary copy from the aperture to the host buffer, and a page fault.
13673 + */
13674 +static void
13675 +__sync_single(struct phys_addr buffer, char *dma_addr, size_t size, int dir)
13676 +{
13677 + if (PageHighMem(buffer.page)) {
13678 + size_t len, bytes;
13679 + char *dev, *host, *kmp;
13680 + len = size;
13681 + while (len != 0) {
13682 + if (((bytes = len) + buffer.offset) > PAGE_SIZE)
13683 + bytes = PAGE_SIZE - buffer.offset;
13684 + kmp = kmap_atomic(buffer.page, KM_SWIOTLB);
13685 + dev = dma_addr + size - len;
13686 + host = kmp + buffer.offset;
13687 + if (dir == DMA_FROM_DEVICE) {
13688 + if (__copy_to_user_inatomic(host, dev, bytes))
13689 + /* inaccessible */;
13690 + } else
13691 + memcpy(dev, host, bytes);
13692 + kunmap_atomic(kmp, KM_SWIOTLB);
13693 + len -= bytes;
13694 + buffer.page++;
13695 + buffer.offset = 0;
13696 + }
13697 + } else {
13698 + char *host = (char *)phys_to_virt(
13699 + page_to_pseudophys(buffer.page)) + buffer.offset;
13700 + if (dir == DMA_FROM_DEVICE) {
13701 + if (__copy_to_user_inatomic(host, dma_addr, size))
13702 + /* inaccessible */;
13703 + } else if (dir == DMA_TO_DEVICE)
13704 + memcpy(dma_addr, host, size);
13705 + }
13706 +}
13707 +
13708 +/*
13709 + * Allocates bounce buffer and returns its kernel virtual address.
13710 + */
13711 +static void *
13712 +map_single(struct device *hwdev, struct phys_addr buffer, size_t size, int dir)
13713 +{
13714 + unsigned long flags;
13715 + char *dma_addr;
13716 + unsigned int nslots, stride, index, wrap;
13717 + int i;
13718 +
13719 + /*
13720 + * For mappings greater than a page, we limit the stride (and
13721 + * hence alignment) to a page size.
13722 + */
13723 + nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
13724 + if (size > PAGE_SIZE)
13725 + stride = (1 << (PAGE_SHIFT - IO_TLB_SHIFT));
13726 + else
13727 + stride = 1;
13728 +
13729 + BUG_ON(!nslots);
13730 +
13731 + /*
13732 + * Find a suitable number of IO TLB entries that will fit this
13733 + * request and allocate a buffer from that IO TLB pool.
13734 + */
13735 + spin_lock_irqsave(&io_tlb_lock, flags);
13736 + {
13737 + wrap = index = ALIGN(io_tlb_index, stride);
13738 +
13739 + if (index >= iotlb_nslabs)
13740 + wrap = index = 0;
13741 +
13742 + do {
13743 + /*
13744 + * If we find a slot that indicates we have 'nslots'
13745 + * number of contiguous buffers, we allocate the
13746 + * buffers from that slot and mark the entries as '0'
13747 + * indicating unavailable.
13748 + */
13749 + if (io_tlb_list[index] >= nslots) {
13750 + int count = 0;
13751 +
13752 + for (i = index; i < (int)(index + nslots); i++)
13753 + io_tlb_list[i] = 0;
13754 + for (i = index - 1;
13755 + (OFFSET(i, IO_TLB_SEGSIZE) !=
13756 + IO_TLB_SEGSIZE -1) && io_tlb_list[i];
13757 + i--)
13758 + io_tlb_list[i] = ++count;
13759 + dma_addr = iotlb_virt_start +
13760 + (index << IO_TLB_SHIFT);
13761 +
13762 + /*
13763 + * Update the indices to avoid searching in
13764 + * the next round.
13765 + */
13766 + io_tlb_index =
13767 + ((index + nslots) < iotlb_nslabs
13768 + ? (index + nslots) : 0);
13769 +
13770 + goto found;
13771 + }
13772 + index += stride;
13773 + if (index >= iotlb_nslabs)
13774 + index = 0;
13775 + } while (index != wrap);
13776 +
13777 + spin_unlock_irqrestore(&io_tlb_lock, flags);
13778 + return NULL;
13779 + }
13780 + found:
13781 + spin_unlock_irqrestore(&io_tlb_lock, flags);
13782 +
13783 + /*
13784 + * Save away the mapping from the original address to the DMA address.
13785 + * This is needed when we sync the memory. Then we sync the buffer if
13786 + * needed.
13787 + */
13788 + io_tlb_orig_addr[index] = buffer;
13789 + if ((dir == DMA_TO_DEVICE) || (dir == DMA_BIDIRECTIONAL))
13790 + __sync_single(buffer, dma_addr, size, DMA_TO_DEVICE);
13791 +
13792 + return dma_addr;
13793 +}
13794 +
13795 +/*
13796 + * dma_addr is the kernel virtual address of the bounce buffer to unmap.
13797 + */
13798 +static void
13799 +unmap_single(struct device *hwdev, char *dma_addr, size_t size, int dir)
13800 +{
13801 + unsigned long flags;
13802 + int i, count, nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
13803 + int index = (dma_addr - iotlb_virt_start) >> IO_TLB_SHIFT;
13804 + struct phys_addr buffer = io_tlb_orig_addr[index];
13805 +
13806 + /*
13807 + * First, sync the memory before unmapping the entry
13808 + */
13809 + if ((dir == DMA_FROM_DEVICE) || (dir == DMA_BIDIRECTIONAL))
13810 + __sync_single(buffer, dma_addr, size, DMA_FROM_DEVICE);
13811 +
13812 + /*
13813 + * Return the buffer to the free list by setting the corresponding
13814 + * entries to indicate the number of contiguous entries available.
13815 + * While returning the entries to the free list, we merge the entries
13816 + * with slots below and above the pool being returned.
13817 + */
13818 + spin_lock_irqsave(&io_tlb_lock, flags);
13819 + {
13820 + count = ((index + nslots) < ALIGN(index + 1, IO_TLB_SEGSIZE) ?
13821 + io_tlb_list[index + nslots] : 0);
13822 + /*
13823 + * Step 1: return the slots to the free list, merging the
13824 + * slots with succeeding slots
13825 + */
13826 + for (i = index + nslots - 1; i >= index; i--)
13827 + io_tlb_list[i] = ++count;
13828 + /*
13829 + * Step 2: merge the returned slots with the preceding slots,
13830 + * if available (non zero)
13831 + */
13832 + for (i = index - 1;
13833 + (OFFSET(i, IO_TLB_SEGSIZE) !=
13834 + IO_TLB_SEGSIZE -1) && io_tlb_list[i];
13835 + i--)
13836 + io_tlb_list[i] = ++count;
13837 + }
13838 + spin_unlock_irqrestore(&io_tlb_lock, flags);
13839 +}
13840 +
13841 +static void
13842 +sync_single(struct device *hwdev, char *dma_addr, size_t size, int dir)
13843 +{
13844 + int index = (dma_addr - iotlb_virt_start) >> IO_TLB_SHIFT;
13845 + struct phys_addr buffer = io_tlb_orig_addr[index];
13846 + BUG_ON((dir != DMA_FROM_DEVICE) && (dir != DMA_TO_DEVICE));
13847 + __sync_single(buffer, dma_addr, size, dir);
13848 +}
13849 +
13850 +static void
13851 +swiotlb_full(struct device *dev, size_t size, int dir, int do_panic)
13852 +{
13853 + /*
13854 + * Ran out of IOMMU space for this operation. This is very bad.
13855 + * Unfortunately the drivers cannot handle this operation properly
13856 + * unless they check for pci_dma_mapping_error (most don't).
13857 + * When the mapping is small enough return a static buffer to limit
13858 + * the damage, or panic when the transfer is too big.
13859 + */
13860 + printk(KERN_ERR "PCI-DMA: Out of SW-IOMMU space for %lu bytes at "
13861 + "device %s\n", (unsigned long)size, dev ? dev->bus_id : "?");
13862 +
13863 + if (size > io_tlb_overflow && do_panic) {
13864 + if (dir == PCI_DMA_FROMDEVICE || dir == PCI_DMA_BIDIRECTIONAL)
13865 + panic("PCI-DMA: Memory would be corrupted\n");
13866 + if (dir == PCI_DMA_TODEVICE || dir == PCI_DMA_BIDIRECTIONAL)
13867 + panic("PCI-DMA: Random memory would be DMAed\n");
13868 + }
13869 +}
13870 +
13871 +/*
13872 + * Map a single buffer of the indicated size for DMA in streaming mode. The
13873 + * PCI address to use is returned.
13874 + *
13875 + * Once the device is given the dma address, the device owns this memory until
13876 + * either swiotlb_unmap_single or swiotlb_dma_sync_single is performed.
13877 + */
13878 +dma_addr_t
13879 +swiotlb_map_single(struct device *hwdev, void *ptr, size_t size, int dir)
13880 +{
13881 + dma_addr_t dev_addr = virt_to_bus(ptr);
13882 + void *map;
13883 + struct phys_addr buffer;
13884 +
13885 + BUG_ON(dir == DMA_NONE);
13886 +
13887 + /*
13888 + * If the pointer passed in happens to be in the device's DMA window,
13889 + * we can safely return the device addr and not worry about bounce
13890 + * buffering it.
13891 + */
13892 + if (!range_straddles_page_boundary(ptr, size) &&
13893 + !address_needs_mapping(hwdev, dev_addr))
13894 + return dev_addr;
13895 +
13896 + /*
13897 + * Oh well, have to allocate and map a bounce buffer.
13898 + */
13899 + buffer.page = virt_to_page(ptr);
13900 + buffer.offset = (unsigned long)ptr & ~PAGE_MASK;
13901 + map = map_single(hwdev, buffer, size, dir);
13902 + if (!map) {
13903 + swiotlb_full(hwdev, size, dir, 1);
13904 + map = io_tlb_overflow_buffer;
13905 + }
13906 +
13907 + dev_addr = virt_to_bus(map);
13908 + return dev_addr;
13909 +}
13910 +
13911 +/*
13912 + * Unmap a single streaming mode DMA translation. The dma_addr and size must
13913 + * match what was provided for in a previous swiotlb_map_single call. All
13914 + * other usages are undefined.
13915 + *
13916 + * After this call, reads by the cpu to the buffer are guaranteed to see
13917 + * whatever the device wrote there.
13918 + */
13919 +void
13920 +swiotlb_unmap_single(struct device *hwdev, dma_addr_t dev_addr, size_t size,
13921 + int dir)
13922 +{
13923 + BUG_ON(dir == DMA_NONE);
13924 + if (in_swiotlb_aperture(dev_addr))
13925 + unmap_single(hwdev, bus_to_virt(dev_addr), size, dir);
13926 +}
13927 +
13928 +/*
13929 + * Make physical memory consistent for a single streaming mode DMA translation
13930 + * after a transfer.
13931 + *
13932 + * If you perform a swiotlb_map_single() but wish to interrogate the buffer
13933 + * using the cpu, yet do not wish to tear down the PCI dma mapping, you must
13934 + * call this function before doing so. At the next point you give the PCI dma
13935 + * address back to the card, you must first perform a
13936 + * swiotlb_dma_sync_for_device, and then the device again owns the buffer
13937 + */
13938 +void
13939 +swiotlb_sync_single_for_cpu(struct device *hwdev, dma_addr_t dev_addr,
13940 + size_t size, int dir)
13941 +{
13942 + BUG_ON(dir == DMA_NONE);
13943 + if (in_swiotlb_aperture(dev_addr))
13944 + sync_single(hwdev, bus_to_virt(dev_addr), size, dir);
13945 +}
13946 +
13947 +void
13948 +swiotlb_sync_single_for_device(struct device *hwdev, dma_addr_t dev_addr,
13949 + size_t size, int dir)
13950 +{
13951 + BUG_ON(dir == DMA_NONE);
13952 + if (in_swiotlb_aperture(dev_addr))
13953 + sync_single(hwdev, bus_to_virt(dev_addr), size, dir);
13954 +}
13955 +
13956 +/*
13957 + * Map a set of buffers described by scatterlist in streaming mode for DMA.
13958 + * This is the scatter-gather version of the above swiotlb_map_single
13959 + * interface. Here the scatter gather list elements are each tagged with the
13960 + * appropriate dma address and length. They are obtained via
13961 + * sg_dma_{address,length}(SG).
13962 + *
13963 + * NOTE: An implementation may be able to use a smaller number of
13964 + * DMA address/length pairs than there are SG table elements.
13965 + * (for example via virtual mapping capabilities)
13966 + * The routine returns the number of addr/length pairs actually
13967 + * used, at most nents.
13968 + *
13969 + * Device ownership issues as mentioned above for swiotlb_map_single are the
13970 + * same here.
13971 + */
13972 +int
13973 +swiotlb_map_sg(struct device *hwdev, struct scatterlist *sg, int nelems,
13974 + int dir)
13975 +{
13976 + struct phys_addr buffer;
13977 + dma_addr_t dev_addr;
13978 + char *map;
13979 + int i;
13980 +
13981 + BUG_ON(dir == DMA_NONE);
13982 +
13983 + for (i = 0; i < nelems; i++, sg++) {
13984 + dev_addr = SG_ENT_PHYS_ADDRESS(sg);
13985 + if (address_needs_mapping(hwdev, dev_addr)) {
13986 + buffer.page = sg->page;
13987 + buffer.offset = sg->offset;
13988 + map = map_single(hwdev, buffer, sg->length, dir);
13989 + if (!map) {
13990 + /* Don't panic here, we expect map_sg users
13991 + to do proper error handling. */
13992 + swiotlb_full(hwdev, sg->length, dir, 0);
13993 + swiotlb_unmap_sg(hwdev, sg - i, i, dir);
13994 + sg[0].dma_length = 0;
13995 + return 0;
13996 + }
13997 + sg->dma_address = (dma_addr_t)virt_to_bus(map);
13998 + } else
13999 + sg->dma_address = dev_addr;
14000 + sg->dma_length = sg->length;
14001 + }
14002 + return nelems;
14003 +}
14004 +
14005 +/*
14006 + * Unmap a set of streaming mode DMA translations. Again, cpu read rules
14007 + * concerning calls here are the same as for swiotlb_unmap_single() above.
14008 + */
14009 +void
14010 +swiotlb_unmap_sg(struct device *hwdev, struct scatterlist *sg, int nelems,
14011 + int dir)
14012 +{
14013 + int i;
14014 +
14015 + BUG_ON(dir == DMA_NONE);
14016 +
14017 + for (i = 0; i < nelems; i++, sg++)
14018 + if (sg->dma_address != SG_ENT_PHYS_ADDRESS(sg))
14019 + unmap_single(hwdev,
14020 + (void *)bus_to_virt(sg->dma_address),
14021 + sg->dma_length, dir);
14022 +}
14023 +
14024 +/*
14025 + * Make physical memory consistent for a set of streaming mode DMA translations
14026 + * after a transfer.
14027 + *
14028 + * The same as swiotlb_sync_single_* but for a scatter-gather list, same rules
14029 + * and usage.
14030 + */
14031 +void
14032 +swiotlb_sync_sg_for_cpu(struct device *hwdev, struct scatterlist *sg,
14033 + int nelems, int dir)
14034 +{
14035 + int i;
14036 +
14037 + BUG_ON(dir == DMA_NONE);
14038 +
14039 + for (i = 0; i < nelems; i++, sg++)
14040 + if (sg->dma_address != SG_ENT_PHYS_ADDRESS(sg))
14041 + sync_single(hwdev,
14042 + (void *)bus_to_virt(sg->dma_address),
14043 + sg->dma_length, dir);
14044 +}
14045 +
14046 +void
14047 +swiotlb_sync_sg_for_device(struct device *hwdev, struct scatterlist *sg,
14048 + int nelems, int dir)
14049 +{
14050 + int i;
14051 +
14052 + BUG_ON(dir == DMA_NONE);
14053 +
14054 + for (i = 0; i < nelems; i++, sg++)
14055 + if (sg->dma_address != SG_ENT_PHYS_ADDRESS(sg))
14056 + sync_single(hwdev,
14057 + (void *)bus_to_virt(sg->dma_address),
14058 + sg->dma_length, dir);
14059 +}
14060 +
14061 +dma_addr_t
14062 +swiotlb_map_page(struct device *hwdev, struct page *page,
14063 + unsigned long offset, size_t size,
14064 + enum dma_data_direction direction)
14065 +{
14066 + struct phys_addr buffer;
14067 + dma_addr_t dev_addr;
14068 + char *map;
14069 +
14070 + dev_addr = page_to_bus(page) + offset;
14071 + if (address_needs_mapping(hwdev, dev_addr)) {
14072 + buffer.page = page;
14073 + buffer.offset = offset;
14074 + map = map_single(hwdev, buffer, size, direction);
14075 + if (!map) {
14076 + swiotlb_full(hwdev, size, direction, 1);
14077 + map = io_tlb_overflow_buffer;
14078 + }
14079 + dev_addr = (dma_addr_t)virt_to_bus(map);
14080 + }
14081 +
14082 + return dev_addr;
14083 +}
14084 +
14085 +void
14086 +swiotlb_unmap_page(struct device *hwdev, dma_addr_t dma_address,
14087 + size_t size, enum dma_data_direction direction)
14088 +{
14089 + BUG_ON(direction == DMA_NONE);
14090 + if (in_swiotlb_aperture(dma_address))
14091 + unmap_single(hwdev, bus_to_virt(dma_address), size, direction);
14092 +}
14093 +
14094 +int
14095 +swiotlb_dma_mapping_error(dma_addr_t dma_addr)
14096 +{
14097 + return (dma_addr == virt_to_bus(io_tlb_overflow_buffer));
14098 +}
14099 +
14100 +/*
14101 + * Return whether the given PCI device DMA address mask can be supported
14102 + * properly. For example, if your device can only drive the low 24-bits
14103 + * during PCI bus mastering, then you would pass 0x00ffffff as the mask to
14104 + * this function.
14105 + */
14106 +int
14107 +swiotlb_dma_supported (struct device *hwdev, u64 mask)
14108 +{
14109 + return (mask >= ((1UL << dma_bits) - 1));
14110 +}
14111 +
14112 +EXPORT_SYMBOL(swiotlb_init);
14113 +EXPORT_SYMBOL(swiotlb_map_single);
14114 +EXPORT_SYMBOL(swiotlb_unmap_single);
14115 +EXPORT_SYMBOL(swiotlb_map_sg);
14116 +EXPORT_SYMBOL(swiotlb_unmap_sg);
14117 +EXPORT_SYMBOL(swiotlb_sync_single_for_cpu);
14118 +EXPORT_SYMBOL(swiotlb_sync_single_for_device);
14119 +EXPORT_SYMBOL(swiotlb_sync_sg_for_cpu);
14120 +EXPORT_SYMBOL(swiotlb_sync_sg_for_device);
14121 +EXPORT_SYMBOL(swiotlb_map_page);
14122 +EXPORT_SYMBOL(swiotlb_unmap_page);
14123 +EXPORT_SYMBOL(swiotlb_dma_mapping_error);
14124 +EXPORT_SYMBOL(swiotlb_dma_supported);
14125 diff -Nur linux-2.6.16.33-noxen/arch/i386/kernel/sysenter.c linux-2.6.16.33/arch/i386/kernel/sysenter.c
14126 --- linux-2.6.16.33-noxen/arch/i386/kernel/sysenter.c 2006-11-22 18:06:31.000000000 +0000
14127 +++ linux-2.6.16.33/arch/i386/kernel/sysenter.c 2007-01-08 15:00:45.000000000 +0000
14128 @@ -13,16 +13,22 @@
14129 #include <linux/gfp.h>
14130 #include <linux/string.h>
14131 #include <linux/elf.h>
14132 +#include <linux/mm.h>
14133
14134 #include <asm/cpufeature.h>
14135 #include <asm/msr.h>
14136 #include <asm/pgtable.h>
14137 #include <asm/unistd.h>
14138
14139 +#ifdef CONFIG_XEN
14140 +#include <xen/interface/callback.h>
14141 +#endif
14142 +
14143 extern asmlinkage void sysenter_entry(void);
14144
14145 void enable_sep_cpu(void)
14146 {
14147 +#ifndef CONFIG_X86_NO_TSS
14148 int cpu = get_cpu();
14149 struct tss_struct *tss = &per_cpu(init_tss, cpu);
14150
14151 @@ -37,6 +43,7 @@
14152 wrmsr(MSR_IA32_SYSENTER_ESP, tss->esp1, 0);
14153 wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long) sysenter_entry, 0);
14154 put_cpu();
14155 +#endif
14156 }
14157
14158 /*
14159 @@ -45,23 +52,100 @@
14160 */
14161 extern const char vsyscall_int80_start, vsyscall_int80_end;
14162 extern const char vsyscall_sysenter_start, vsyscall_sysenter_end;
14163 +static void *syscall_page;
14164
14165 int __init sysenter_setup(void)
14166 {
14167 - void *page = (void *)get_zeroed_page(GFP_ATOMIC);
14168 + syscall_page = (void *)get_zeroed_page(GFP_ATOMIC);
14169
14170 - __set_fixmap(FIX_VSYSCALL, __pa(page), PAGE_READONLY_EXEC);
14171 +#ifdef CONFIG_XEN
14172 + if (boot_cpu_has(X86_FEATURE_SEP)) {
14173 + static struct callback_register __initdata sysenter = {
14174 + .type = CALLBACKTYPE_sysenter,
14175 + .address = { __KERNEL_CS, (unsigned long)sysenter_entry },
14176 + };
14177
14178 - if (!boot_cpu_has(X86_FEATURE_SEP)) {
14179 - memcpy(page,
14180 - &vsyscall_int80_start,
14181 - &vsyscall_int80_end - &vsyscall_int80_start);
14182 + if (HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter) < 0)
14183 + clear_bit(X86_FEATURE_SEP, boot_cpu_data.x86_capability);
14184 + }
14185 +#endif
14186 +
14187 + if (boot_cpu_has(X86_FEATURE_SEP)) {
14188 + memcpy(syscall_page,
14189 + &vsyscall_sysenter_start,
14190 + &vsyscall_sysenter_end - &vsyscall_sysenter_start);
14191 return 0;
14192 }
14193
14194 - memcpy(page,
14195 - &vsyscall_sysenter_start,
14196 - &vsyscall_sysenter_end - &vsyscall_sysenter_start);
14197 + memcpy(syscall_page,
14198 + &vsyscall_int80_start,
14199 + &vsyscall_int80_end - &vsyscall_int80_start);
14200 +
14201 + return 0;
14202 +}
14203 +
14204 +static struct page*
14205 +syscall_nopage(struct vm_area_struct *vma, unsigned long adr, int *type)
14206 +{
14207 + struct page *p = virt_to_page(adr - vma->vm_start + syscall_page);
14208 + get_page(p);
14209 + return p;
14210 +}
14211 +
14212 +/* Prevent VMA merging */
14213 +static void syscall_vma_close(struct vm_area_struct *vma)
14214 +{
14215 +}
14216 +
14217 +static struct vm_operations_struct syscall_vm_ops = {
14218 + .close = syscall_vma_close,
14219 + .nopage = syscall_nopage,
14220 +};
14221
14222 +/* Setup a VMA at program startup for the vsyscall page */
14223 +int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack)
14224 +{
14225 + struct vm_area_struct *vma;
14226 + struct mm_struct *mm = current->mm;
14227 + int ret;
14228 +
14229 + vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
14230 + if (!vma)
14231 + return -ENOMEM;
14232 +
14233 + memset(vma, 0, sizeof(struct vm_area_struct));
14234 + /* Could randomize here */
14235 + vma->vm_start = VSYSCALL_BASE;
14236 + vma->vm_end = VSYSCALL_BASE + PAGE_SIZE;
14237 + /* MAYWRITE to allow gdb to COW and set breakpoints */
14238 + vma->vm_flags = VM_READ|VM_EXEC|VM_MAYREAD|VM_MAYEXEC|VM_MAYWRITE;
14239 + vma->vm_flags |= mm->def_flags;
14240 + vma->vm_page_prot = protection_map[vma->vm_flags & 7];
14241 + vma->vm_ops = &syscall_vm_ops;
14242 + vma->vm_mm = mm;
14243 +
14244 + down_write(&mm->mmap_sem);
14245 + if ((ret = insert_vm_struct(mm, vma))) {
14246 + up_write(&mm->mmap_sem);
14247 + kmem_cache_free(vm_area_cachep, vma);
14248 + return ret;
14249 + }
14250 + mm->total_vm++;
14251 + up_write(&mm->mmap_sem);
14252 + return 0;
14253 +}
14254 +
14255 +struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
14256 +{
14257 + return NULL;
14258 +}
14259 +
14260 +int in_gate_area(struct task_struct *task, unsigned long addr)
14261 +{
14262 + return 0;
14263 +}
14264 +
14265 +int in_gate_area_no_task(unsigned long addr)
14266 +{
14267 return 0;
14268 }
14269 diff -Nur linux-2.6.16.33-noxen/arch/i386/kernel/time-xen.c linux-2.6.16.33/arch/i386/kernel/time-xen.c
14270 --- linux-2.6.16.33-noxen/arch/i386/kernel/time-xen.c 1970-01-01 00:00:00.000000000 +0000
14271 +++ linux-2.6.16.33/arch/i386/kernel/time-xen.c 2007-01-08 15:00:45.000000000 +0000
14272 @@ -0,0 +1,1121 @@
14273 +/*
14274 + * linux/arch/i386/kernel/time.c
14275 + *
14276 + * Copyright (C) 1991, 1992, 1995 Linus Torvalds
14277 + *
14278 + * This file contains the PC-specific time handling details:
14279 + * reading the RTC at bootup, etc..
14280 + * 1994-07-02 Alan Modra
14281 + * fixed set_rtc_mmss, fixed time.year for >= 2000, new mktime
14282 + * 1995-03-26 Markus Kuhn
14283 + * fixed 500 ms bug at call to set_rtc_mmss, fixed DS12887
14284 + * precision CMOS clock update
14285 + * 1996-05-03 Ingo Molnar
14286 + * fixed time warps in do_[slow|fast]_gettimeoffset()
14287 + * 1997-09-10 Updated NTP code according to technical memorandum Jan '96
14288 + * "A Kernel Model for Precision Timekeeping" by Dave Mills
14289 + * 1998-09-05 (Various)
14290 + * More robust do_fast_gettimeoffset() algorithm implemented
14291 + * (works with APM, Cyrix 6x86MX and Centaur C6),
14292 + * monotonic gettimeofday() with fast_get_timeoffset(),
14293 + * drift-proof precision TSC calibration on boot
14294 + * (C. Scott Ananian <cananian@alumni.princeton.edu>, Andrew D.
14295 + * Balsa <andrebalsa@altern.org>, Philip Gladstone <philip@raptor.com>;
14296 + * ported from 2.0.35 Jumbo-9 by Michael Krause <m.krause@tu-harburg.de>).
14297 + * 1998-12-16 Andrea Arcangeli
14298 + * Fixed Jumbo-9 code in 2.1.131: do_gettimeofday was missing 1 jiffy
14299 + * because it was not accounting lost_ticks.
14300 + * 1998-12-24 Copyright (C) 1998 Andrea Arcangeli
14301 + * Fixed a xtime SMP race (we need the xtime_lock rw spinlock to
14302 + * serialize accesses to xtime/lost_ticks).
14303 + */
14304 +
14305 +#include <linux/errno.h>
14306 +#include <linux/sched.h>
14307 +#include <linux/kernel.h>
14308 +#include <linux/param.h>
14309 +#include <linux/string.h>
14310 +#include <linux/mm.h>
14311 +#include <linux/interrupt.h>
14312 +#include <linux/time.h>
14313 +#include <linux/delay.h>
14314 +#include <linux/init.h>
14315 +#include <linux/smp.h>
14316 +#include <linux/module.h>
14317 +#include <linux/sysdev.h>
14318 +#include <linux/bcd.h>
14319 +#include <linux/efi.h>
14320 +#include <linux/mca.h>
14321 +#include <linux/sysctl.h>
14322 +#include <linux/percpu.h>
14323 +#include <linux/kernel_stat.h>
14324 +#include <linux/posix-timers.h>
14325 +
14326 +#include <asm/io.h>
14327 +#include <asm/smp.h>
14328 +#include <asm/irq.h>
14329 +#include <asm/msr.h>
14330 +#include <asm/delay.h>
14331 +#include <asm/mpspec.h>
14332 +#include <asm/uaccess.h>
14333 +#include <asm/processor.h>
14334 +#include <asm/timer.h>
14335 +#include <asm/sections.h>
14336 +
14337 +#include "mach_time.h"
14338 +
14339 +#include <linux/timex.h>
14340 +#include <linux/config.h>
14341 +
14342 +#include <asm/hpet.h>
14343 +
14344 +#include <asm/arch_hooks.h>
14345 +
14346 +#include <xen/evtchn.h>
14347 +#include <xen/interface/vcpu.h>
14348 +
14349 +#if defined (__i386__)
14350 +#include <asm/i8259.h>
14351 +#endif
14352 +
14353 +int pit_latch_buggy; /* extern */
14354 +
14355 +#if defined(__x86_64__)
14356 +unsigned long vxtime_hz = PIT_TICK_RATE;
14357 +struct vxtime_data __vxtime __section_vxtime; /* for vsyscalls */
14358 +volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES;
14359 +unsigned long __wall_jiffies __section_wall_jiffies = INITIAL_JIFFIES;
14360 +struct timespec __xtime __section_xtime;
14361 +struct timezone __sys_tz __section_sys_tz;
14362 +#endif
14363 +
14364 +unsigned int cpu_khz; /* Detected as we calibrate the TSC */
14365 +EXPORT_SYMBOL(cpu_khz);
14366 +
14367 +extern unsigned long wall_jiffies;
14368 +
14369 +DEFINE_SPINLOCK(rtc_lock);
14370 +EXPORT_SYMBOL(rtc_lock);
14371 +
14372 +#if defined (__i386__)
14373 +#include <asm/i8253.h>
14374 +#endif
14375 +
14376 +DEFINE_SPINLOCK(i8253_lock);
14377 +EXPORT_SYMBOL(i8253_lock);
14378 +
14379 +extern struct init_timer_opts timer_tsc_init;
14380 +extern struct timer_opts timer_tsc;
14381 +#define timer_none timer_tsc
14382 +struct timer_opts *cur_timer __read_mostly = &timer_tsc;
14383 +
14384 +/* These are periodically updated in shared_info, and then copied here. */
14385 +struct shadow_time_info {
14386 + u64 tsc_timestamp; /* TSC at last update of time vals. */
14387 + u64 system_timestamp; /* Time, in nanosecs, since boot. */
14388 + u32 tsc_to_nsec_mul;
14389 + u32 tsc_to_usec_mul;
14390 + int tsc_shift;
14391 + u32 version;
14392 +};
14393 +static DEFINE_PER_CPU(struct shadow_time_info, shadow_time);
14394 +static struct timespec shadow_tv;
14395 +static u32 shadow_tv_version;
14396 +
14397 +/* Keep track of last time we did processing/updating of jiffies and xtime. */
14398 +static u64 processed_system_time; /* System time (ns) at last processing. */
14399 +static DEFINE_PER_CPU(u64, processed_system_time);
14400 +
14401 +/* How much CPU time was spent blocked and how much was 'stolen'? */
14402 +static DEFINE_PER_CPU(u64, processed_stolen_time);
14403 +static DEFINE_PER_CPU(u64, processed_blocked_time);
14404 +
14405 +/* Current runstate of each CPU (updated automatically by the hypervisor). */
14406 +static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate);
14407 +
14408 +/* Must be signed, as it's compared with s64 quantities which can be -ve. */
14409 +#define NS_PER_TICK (1000000000LL/HZ)
14410 +
14411 +static inline void __normalize_time(time_t *sec, s64 *nsec)
14412 +{
14413 + while (*nsec >= NSEC_PER_SEC) {
14414 + (*nsec) -= NSEC_PER_SEC;
14415 + (*sec)++;
14416 + }
14417 + while (*nsec < 0) {
14418 + (*nsec) += NSEC_PER_SEC;
14419 + (*sec)--;
14420 + }
14421 +}
14422 +
14423 +/* Does this guest OS track Xen time, or set its wall clock independently? */
14424 +static int independent_wallclock = 0;
14425 +static int __init __independent_wallclock(char *str)
14426 +{
14427 + independent_wallclock = 1;
14428 + return 1;
14429 +}
14430 +__setup("independent_wallclock", __independent_wallclock);
14431 +
14432 +/* Permitted clock jitter, in nsecs, beyond which a warning will be printed. */
14433 +static unsigned long permitted_clock_jitter = 10000000UL; /* 10ms */
14434 +static int __init __permitted_clock_jitter(char *str)
14435 +{
14436 + permitted_clock_jitter = simple_strtoul(str, NULL, 0);
14437 + return 1;
14438 +}
14439 +__setup("permitted_clock_jitter=", __permitted_clock_jitter);
14440 +
14441 +int tsc_disable __devinitdata = 0;
14442 +
14443 +static void delay_tsc(unsigned long loops)
14444 +{
14445 + unsigned long bclock, now;
14446 +
14447 + rdtscl(bclock);
14448 + do {
14449 + rep_nop();
14450 + rdtscl(now);
14451 + } while ((now - bclock) < loops);
14452 +}
14453 +
14454 +struct timer_opts timer_tsc = {
14455 + .name = "tsc",
14456 + .delay = delay_tsc,
14457 +};
14458 +
14459 +/*
14460 + * Scale a 64-bit delta by shifting and multiplying by a 32-bit fraction,
14461 + * yielding a 64-bit result.
14462 + */
14463 +static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
14464 +{
14465 + u64 product;
14466 +#ifdef __i386__
14467 + u32 tmp1, tmp2;
14468 +#endif
14469 +
14470 + if (shift < 0)
14471 + delta >>= -shift;
14472 + else
14473 + delta <<= shift;
14474 +
14475 +#ifdef __i386__
14476 + __asm__ (
14477 + "mul %5 ; "
14478 + "mov %4,%%eax ; "
14479 + "mov %%edx,%4 ; "
14480 + "mul %5 ; "
14481 + "xor %5,%5 ; "
14482 + "add %4,%%eax ; "
14483 + "adc %5,%%edx ; "
14484 + : "=A" (product), "=r" (tmp1), "=r" (tmp2)
14485 + : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
14486 +#else
14487 + __asm__ (
14488 + "mul %%rdx ; shrd $32,%%rdx,%%rax"
14489 + : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
14490 +#endif
14491 +
14492 + return product;
14493 +}
14494 +
14495 +#if defined (__i386__)
14496 +int read_current_timer(unsigned long *timer_val)
14497 +{
14498 + rdtscl(*timer_val);
14499 + return 0;
14500 +}
14501 +#endif
14502 +
14503 +void init_cpu_khz(void)
14504 +{
14505 + u64 __cpu_khz = 1000000ULL << 32;
14506 + struct vcpu_time_info *info;
14507 + info = &HYPERVISOR_shared_info->vcpu_info[0].time;
14508 + do_div(__cpu_khz, info->tsc_to_system_mul);
14509 + if (info->tsc_shift < 0)
14510 + cpu_khz = __cpu_khz << -info->tsc_shift;
14511 + else
14512 + cpu_khz = __cpu_khz >> info->tsc_shift;
14513 +}
14514 +
14515 +static u64 get_nsec_offset(struct shadow_time_info *shadow)
14516 +{
14517 + u64 now, delta;
14518 + rdtscll(now);
14519 + delta = now - shadow->tsc_timestamp;
14520 + return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift);
14521 +}
14522 +
14523 +static unsigned long get_usec_offset(struct shadow_time_info *shadow)
14524 +{
14525 + u64 now, delta;
14526 + rdtscll(now);
14527 + delta = now - shadow->tsc_timestamp;
14528 + return scale_delta(delta, shadow->tsc_to_usec_mul, shadow->tsc_shift);
14529 +}
14530 +
14531 +static void __update_wallclock(time_t sec, long nsec)
14532 +{
14533 + long wtm_nsec, xtime_nsec;
14534 + time_t wtm_sec, xtime_sec;
14535 + u64 tmp, wc_nsec;
14536 +
14537 + /* Adjust wall-clock time base based on wall_jiffies ticks. */
14538 + wc_nsec = processed_system_time;
14539 + wc_nsec += sec * (u64)NSEC_PER_SEC;
14540 + wc_nsec += nsec;
14541 + wc_nsec -= (jiffies - wall_jiffies) * (u64)NS_PER_TICK;
14542 +
14543 + /* Split wallclock base into seconds and nanoseconds. */
14544 + tmp = wc_nsec;
14545 + xtime_nsec = do_div(tmp, 1000000000);
14546 + xtime_sec = (time_t)tmp;
14547 +
14548 + wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - xtime_sec);
14549 + wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - xtime_nsec);
14550 +
14551 + set_normalized_timespec(&xtime, xtime_sec, xtime_nsec);
14552 + set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
14553 +
14554 + ntp_clear();
14555 +}
14556 +
14557 +static void update_wallclock(void)
14558 +{
14559 + shared_info_t *s = HYPERVISOR_shared_info;
14560 +
14561 + do {
14562 + shadow_tv_version = s->wc_version;
14563 + rmb();
14564 + shadow_tv.tv_sec = s->wc_sec;
14565 + shadow_tv.tv_nsec = s->wc_nsec;
14566 + rmb();
14567 + } while ((s->wc_version & 1) | (shadow_tv_version ^ s->wc_version));
14568 +
14569 + if (!independent_wallclock)
14570 + __update_wallclock(shadow_tv.tv_sec, shadow_tv.tv_nsec);
14571 +}
14572 +
14573 +/*
14574 + * Reads a consistent set of time-base values from Xen, into a shadow data
14575 + * area.
14576 + */
14577 +static void get_time_values_from_xen(void)
14578 +{
14579 + shared_info_t *s = HYPERVISOR_shared_info;
14580 + struct vcpu_time_info *src;
14581 + struct shadow_time_info *dst;
14582 +
14583 + src = &s->vcpu_info[smp_processor_id()].time;
14584 + dst = &per_cpu(shadow_time, smp_processor_id());
14585 +
14586 + do {
14587 + dst->version = src->version;
14588 + rmb();
14589 + dst->tsc_timestamp = src->tsc_timestamp;
14590 + dst->system_timestamp = src->system_time;
14591 + dst->tsc_to_nsec_mul = src->tsc_to_system_mul;
14592 + dst->tsc_shift = src->tsc_shift;
14593 + rmb();
14594 + } while ((src->version & 1) | (dst->version ^ src->version));
14595 +
14596 + dst->tsc_to_usec_mul = dst->tsc_to_nsec_mul / 1000;
14597 +}
14598 +
14599 +static inline int time_values_up_to_date(int cpu)
14600 +{
14601 + struct vcpu_time_info *src;
14602 + struct shadow_time_info *dst;
14603 +
14604 + src = &HYPERVISOR_shared_info->vcpu_info[cpu].time;
14605 + dst = &per_cpu(shadow_time, cpu);
14606 +
14607 + rmb();
14608 + return (dst->version == src->version);
14609 +}
14610 +
14611 +/*
14612 + * This is a special lock that is owned by the CPU and holds the index
14613 + * register we are working with. It is required for NMI access to the
14614 + * CMOS/RTC registers. See include/asm-i386/mc146818rtc.h for details.
14615 + */
14616 +volatile unsigned long cmos_lock = 0;
14617 +EXPORT_SYMBOL(cmos_lock);
14618 +
14619 +/* Routines for accessing the CMOS RAM/RTC. */
14620 +unsigned char rtc_cmos_read(unsigned char addr)
14621 +{
14622 + unsigned char val;
14623 + lock_cmos_prefix(addr);
14624 + outb_p(addr, RTC_PORT(0));
14625 + val = inb_p(RTC_PORT(1));
14626 + lock_cmos_suffix(addr);
14627 + return val;
14628 +}
14629 +EXPORT_SYMBOL(rtc_cmos_read);
14630 +
14631 +void rtc_cmos_write(unsigned char val, unsigned char addr)
14632 +{
14633 + lock_cmos_prefix(addr);
14634 + outb_p(addr, RTC_PORT(0));
14635 + outb_p(val, RTC_PORT(1));
14636 + lock_cmos_suffix(addr);
14637 +}
14638 +EXPORT_SYMBOL(rtc_cmos_write);
14639 +
14640 +/*
14641 + * This version of gettimeofday has microsecond resolution
14642 + * and better than microsecond precision on fast x86 machines with TSC.
14643 + */
14644 +void do_gettimeofday(struct timeval *tv)
14645 +{
14646 + unsigned long seq;
14647 + unsigned long usec, sec;
14648 + unsigned long max_ntp_tick;
14649 + s64 nsec;
14650 + unsigned int cpu;
14651 + struct shadow_time_info *shadow;
14652 + u32 local_time_version;
14653 +
14654 + cpu = get_cpu();
14655 + shadow = &per_cpu(shadow_time, cpu);
14656 +
14657 + do {
14658 + unsigned long lost;
14659 +
14660 + local_time_version = shadow->version;
14661 + seq = read_seqbegin(&xtime_lock);
14662 +
14663 + usec = get_usec_offset(shadow);
14664 + lost = jiffies - wall_jiffies;
14665 +
14666 + /*
14667 + * If time_adjust is negative then NTP is slowing the clock
14668 + * so make sure not to go into next possible interval.
14669 + * Better to lose some accuracy than have time go backwards..
14670 + */
14671 + if (unlikely(time_adjust < 0)) {
14672 + max_ntp_tick = (USEC_PER_SEC / HZ) - tickadj;
14673 + usec = min(usec, max_ntp_tick);
14674 +
14675 + if (lost)
14676 + usec += lost * max_ntp_tick;
14677 + }
14678 + else if (unlikely(lost))
14679 + usec += lost * (USEC_PER_SEC / HZ);
14680 +
14681 + sec = xtime.tv_sec;
14682 + usec += (xtime.tv_nsec / NSEC_PER_USEC);
14683 +
14684 + nsec = shadow->system_timestamp - processed_system_time;
14685 + __normalize_time(&sec, &nsec);
14686 + usec += (long)nsec / NSEC_PER_USEC;
14687 +
14688 + if (unlikely(!time_values_up_to_date(cpu))) {
14689 + /*
14690 + * We may have blocked for a long time,
14691 + * rendering our calculations invalid
14692 + * (e.g. the time delta may have
14693 + * overflowed). Detect that and recalculate
14694 + * with fresh values.
14695 + */
14696 + get_time_values_from_xen();
14697 + continue;
14698 + }
14699 + } while (read_seqretry(&xtime_lock, seq) ||
14700 + (local_time_version != shadow->version));
14701 +
14702 + put_cpu();
14703 +
14704 + while (usec >= USEC_PER_SEC) {
14705 + usec -= USEC_PER_SEC;
14706 + sec++;
14707 + }
14708 +
14709 + tv->tv_sec = sec;
14710 + tv->tv_usec = usec;
14711 +}
14712 +
14713 +EXPORT_SYMBOL(do_gettimeofday);
14714 +
14715 +int do_settimeofday(struct timespec *tv)
14716 +{
14717 + time_t sec;
14718 + s64 nsec;
14719 + unsigned int cpu;
14720 + struct shadow_time_info *shadow;
14721 + dom0_op_t op;
14722 +
14723 + if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC)
14724 + return -EINVAL;
14725 +
14726 + cpu = get_cpu();
14727 + shadow = &per_cpu(shadow_time, cpu);
14728 +
14729 + write_seqlock_irq(&xtime_lock);
14730 +
14731 + /*
14732 + * Ensure we don't get blocked for so long that our time delta
14733 + * overflows. If that were to happen then our shadow time values
14734 + * would be stale, so we retry with fresh ones.
14735 + */
14736 + for (;;) {
14737 + nsec = tv->tv_nsec - get_nsec_offset(shadow);
14738 + if (time_values_up_to_date(cpu))
14739 + break;
14740 + get_time_values_from_xen();
14741 + }
14742 + sec = tv->tv_sec;
14743 + __normalize_time(&sec, &nsec);
14744 +
14745 + if (is_initial_xendomain() && !independent_wallclock) {
14746 + op.cmd = DOM0_SETTIME;
14747 + op.u.settime.secs = sec;
14748 + op.u.settime.nsecs = nsec;
14749 + op.u.settime.system_time = shadow->system_timestamp;
14750 + HYPERVISOR_dom0_op(&op);
14751 + update_wallclock();
14752 + } else if (independent_wallclock) {
14753 + nsec -= shadow->system_timestamp;
14754 + __normalize_time(&sec, &nsec);
14755 + __update_wallclock(sec, nsec);
14756 + }
14757 +
14758 + write_sequnlock_irq(&xtime_lock);
14759 +
14760 + put_cpu();
14761 +
14762 + clock_was_set();
14763 + return 0;
14764 +}
14765 +
14766 +EXPORT_SYMBOL(do_settimeofday);
14767 +
14768 +static void sync_xen_wallclock(unsigned long dummy);
14769 +static DEFINE_TIMER(sync_xen_wallclock_timer, sync_xen_wallclock, 0, 0);
14770 +static void sync_xen_wallclock(unsigned long dummy)
14771 +{
14772 + time_t sec;
14773 + s64 nsec;
14774 + dom0_op_t op;
14775 +
14776 + if (!ntp_synced() || independent_wallclock || !is_initial_xendomain())
14777 + return;
14778 +
14779 + write_seqlock_irq(&xtime_lock);
14780 +
14781 + sec = xtime.tv_sec;
14782 + nsec = xtime.tv_nsec + ((jiffies - wall_jiffies) * (u64)NS_PER_TICK);
14783 + __normalize_time(&sec, &nsec);
14784 +
14785 + op.cmd = DOM0_SETTIME;
14786 + op.u.settime.secs = sec;
14787 + op.u.settime.nsecs = nsec;
14788 + op.u.settime.system_time = processed_system_time;
14789 + HYPERVISOR_dom0_op(&op);
14790 +
14791 + update_wallclock();
14792 +
14793 + write_sequnlock_irq(&xtime_lock);
14794 +
14795 + /* Once per minute. */
14796 + mod_timer(&sync_xen_wallclock_timer, jiffies + 60*HZ);
14797 +}
14798 +
14799 +static int set_rtc_mmss(unsigned long nowtime)
14800 +{
14801 + int retval;
14802 +
14803 + WARN_ON(irqs_disabled());
14804 +
14805 + if (independent_wallclock || !is_initial_xendomain())
14806 + return 0;
14807 +
14808 + /* gets recalled with irq locally disabled */
14809 + spin_lock_irq(&rtc_lock);
14810 + if (efi_enabled)
14811 + retval = efi_set_rtc_mmss(nowtime);
14812 + else
14813 + retval = mach_set_rtc_mmss(nowtime);
14814 + spin_unlock_irq(&rtc_lock);
14815 +
14816 + return retval;
14817 +}
14818 +
14819 +/* monotonic_clock(): returns # of nanoseconds passed since time_init()
14820 + * Note: This function is required to return accurate
14821 + * time even in the absence of multiple timer ticks.
14822 + */
14823 +unsigned long long monotonic_clock(void)
14824 +{
14825 + int cpu = get_cpu();
14826 + struct shadow_time_info *shadow = &per_cpu(shadow_time, cpu);
14827 + u64 time;
14828 + u32 local_time_version;
14829 +
14830 + do {
14831 + local_time_version = shadow->version;
14832 + barrier();
14833 + time = shadow->system_timestamp + get_nsec_offset(shadow);
14834 + if (!time_values_up_to_date(cpu))
14835 + get_time_values_from_xen();
14836 + barrier();
14837 + } while (local_time_version != shadow->version);
14838 +
14839 + put_cpu();
14840 +
14841 + return time;
14842 +}
14843 +EXPORT_SYMBOL(monotonic_clock);
14844 +
14845 +unsigned long long sched_clock(void)
14846 +{
14847 + return monotonic_clock();
14848 +}
14849 +
14850 +#if defined(CONFIG_SMP) && defined(CONFIG_FRAME_POINTER)
14851 +unsigned long profile_pc(struct pt_regs *regs)
14852 +{
14853 + unsigned long pc = instruction_pointer(regs);
14854 +
14855 +#ifdef __x86_64__
14856 + /* Assume the lock function has either no stack frame or only a single word.
14857 + This checks if the address on the stack looks like a kernel text address.
14858 + There is a small window for false hits, but in that case the tick
14859 + is just accounted to the spinlock function.
14860 + Better would be to write these functions in assembler again
14861 + and check exactly. */
14862 + if (in_lock_functions(pc)) {
14863 + char *v = *(char **)regs->rsp;
14864 + if ((v >= _stext && v <= _etext) ||
14865 + (v >= _sinittext && v <= _einittext) ||
14866 + (v >= (char *)MODULES_VADDR && v <= (char *)MODULES_END))
14867 + return (unsigned long)v;
14868 + return ((unsigned long *)regs->rsp)[1];
14869 + }
14870 +#else
14871 + if (in_lock_functions(pc))
14872 + return *(unsigned long *)(regs->ebp + 4);
14873 +#endif
14874 +
14875 + return pc;
14876 +}
14877 +EXPORT_SYMBOL(profile_pc);
14878 +#endif
14879 +
14880 +irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
14881 +{
14882 + s64 delta, delta_cpu, stolen, blocked;
14883 + u64 sched_time;
14884 + int i, cpu = smp_processor_id();
14885 + struct shadow_time_info *shadow = &per_cpu(shadow_time, cpu);
14886 + struct vcpu_runstate_info *runstate = &per_cpu(runstate, cpu);
14887 +
14888 + write_seqlock(&xtime_lock);
14889 +
14890 + do {
14891 + get_time_values_from_xen();
14892 +
14893 + /* Obtain a consistent snapshot of elapsed wallclock cycles. */
14894 + delta = delta_cpu =
14895 + shadow->system_timestamp + get_nsec_offset(shadow);
14896 + delta -= processed_system_time;
14897 + delta_cpu -= per_cpu(processed_system_time, cpu);
14898 +
14899 + /*
14900 + * Obtain a consistent snapshot of stolen/blocked cycles. We
14901 + * can use state_entry_time to detect if we get preempted here.
14902 + */
14903 + do {
14904 + sched_time = runstate->state_entry_time;
14905 + barrier();
14906 + stolen = runstate->time[RUNSTATE_runnable] +
14907 + runstate->time[RUNSTATE_offline] -
14908 + per_cpu(processed_stolen_time, cpu);
14909 + blocked = runstate->time[RUNSTATE_blocked] -
14910 + per_cpu(processed_blocked_time, cpu);
14911 + barrier();
14912 + } while (sched_time != runstate->state_entry_time);
14913 + } while (!time_values_up_to_date(cpu));
14914 +
14915 + if ((unlikely(delta < -(s64)permitted_clock_jitter) ||
14916 + unlikely(delta_cpu < -(s64)permitted_clock_jitter))
14917 + && printk_ratelimit()) {
14918 + printk("Timer ISR/%d: Time went backwards: "
14919 + "delta=%lld delta_cpu=%lld shadow=%lld "
14920 + "off=%lld processed=%lld cpu_processed=%lld\n",
14921 + cpu, delta, delta_cpu, shadow->system_timestamp,
14922 + (s64)get_nsec_offset(shadow),
14923 + processed_system_time,
14924 + per_cpu(processed_system_time, cpu));
14925 + for (i = 0; i < num_online_cpus(); i++)
14926 + printk(" %d: %lld\n", i,
14927 + per_cpu(processed_system_time, i));
14928 + }
14929 +
14930 + /* System-wide jiffy work. */
14931 + while (delta >= NS_PER_TICK) {
14932 + delta -= NS_PER_TICK;
14933 + processed_system_time += NS_PER_TICK;
14934 + do_timer(regs);
14935 + }
14936 +
14937 + if (shadow_tv_version != HYPERVISOR_shared_info->wc_version) {
14938 + update_wallclock();
14939 + clock_was_set();
14940 + }
14941 +
14942 + write_sequnlock(&xtime_lock);
14943 +
14944 + /*
14945 + * Account stolen ticks.
14946 + * HACK: Passing NULL to account_steal_time()
14947 + * ensures that the ticks are accounted as stolen.
14948 + */
14949 + if ((stolen > 0) && (delta_cpu > 0)) {
14950 + delta_cpu -= stolen;
14951 + if (unlikely(delta_cpu < 0))
14952 + stolen += delta_cpu; /* clamp local-time progress */
14953 + do_div(stolen, NS_PER_TICK);
14954 + per_cpu(processed_stolen_time, cpu) += stolen * NS_PER_TICK;
14955 + per_cpu(processed_system_time, cpu) += stolen * NS_PER_TICK;
14956 + account_steal_time(NULL, (cputime_t)stolen);
14957 + }
14958 +
14959 + /*
14960 + * Account blocked ticks.
14961 + * HACK: Passing idle_task to account_steal_time()
14962 + * ensures that the ticks are accounted as idle/wait.
14963 + */
14964 + if ((blocked > 0) && (delta_cpu > 0)) {
14965 + delta_cpu -= blocked;
14966 + if (unlikely(delta_cpu < 0))
14967 + blocked += delta_cpu; /* clamp local-time progress */
14968 + do_div(blocked, NS_PER_TICK);
14969 + per_cpu(processed_blocked_time, cpu) += blocked * NS_PER_TICK;
14970 + per_cpu(processed_system_time, cpu) += blocked * NS_PER_TICK;
14971 + account_steal_time(idle_task(cpu), (cputime_t)blocked);
14972 + }
14973 +
14974 + /* Account user/system ticks. */
14975 + if (delta_cpu > 0) {
14976 + do_div(delta_cpu, NS_PER_TICK);
14977 + per_cpu(processed_system_time, cpu) += delta_cpu * NS_PER_TICK;
14978 + if (user_mode(regs))
14979 + account_user_time(current, (cputime_t)delta_cpu);
14980 + else
14981 + account_system_time(current, HARDIRQ_OFFSET,
14982 + (cputime_t)delta_cpu);
14983 + }
14984 +
14985 + /* Offlined for more than a few seconds? Avoid lockup warnings. */
14986 + if (stolen > 5*HZ)
14987 + touch_softlockup_watchdog();
14988 +
14989 + /* Local timer processing (see update_process_times()). */
14990 + run_local_timers();
14991 + if (rcu_pending(cpu))
14992 + rcu_check_callbacks(cpu, user_mode(regs));
14993 + scheduler_tick();
14994 + run_posix_cpu_timers(current);
14995 + profile_tick(CPU_PROFILING, regs);
14996 +
14997 + return IRQ_HANDLED;
14998 +}
14999 +
15000 +static void init_missing_ticks_accounting(int cpu)
15001 +{
15002 + struct vcpu_register_runstate_memory_area area;
15003 + struct vcpu_runstate_info *runstate = &per_cpu(runstate, cpu);
15004 +
15005 + memset(runstate, 0, sizeof(*runstate));
15006 +
15007 + area.addr.v = runstate;
15008 + HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area, cpu, &area);
15009 +
15010 + per_cpu(processed_blocked_time, cpu) =
15011 + runstate->time[RUNSTATE_blocked];
15012 + per_cpu(processed_stolen_time, cpu) =
15013 + runstate->time[RUNSTATE_runnable] +
15014 + runstate->time[RUNSTATE_offline];
15015 +}
15016 +
15017 +/* not static: needed by APM */
15018 +unsigned long get_cmos_time(void)
15019 +{
15020 + unsigned long retval;
15021 +
15022 + spin_lock(&rtc_lock);
15023 +
15024 + if (efi_enabled)
15025 + retval = efi_get_time();
15026 + else
15027 + retval = mach_get_cmos_time();
15028 +
15029 + spin_unlock(&rtc_lock);
15030 +
15031 + return retval;
15032 +}
15033 +EXPORT_SYMBOL(get_cmos_time);
15034 +
15035 +static void sync_cmos_clock(unsigned long dummy);
15036 +
15037 +static DEFINE_TIMER(sync_cmos_timer, sync_cmos_clock, 0, 0);
15038 +
15039 +static void sync_cmos_clock(unsigned long dummy)
15040 +{
15041 + struct timeval now, next;
15042 + int fail = 1;
15043 +
15044 + /*
15045 + * If we have an externally synchronized Linux clock, then update
15046 + * CMOS clock accordingly every ~11 minutes. Set_rtc_mmss() has to be
15047 + * called as close as possible to 500 ms before the new second starts.
15048 + * This code is run on a timer. If the clock is set, that timer
15049 + * may not expire at the correct time. Thus, we adjust...
15050 + */
15051 + if (!ntp_synced())
15052 + /*
15053 + * Not synced, exit, do not restart a timer (if one is
15054 + * running, let it run out).
15055 + */
15056 + return;
15057 +
15058 + do_gettimeofday(&now);
15059 + if (now.tv_usec >= USEC_AFTER - ((unsigned) TICK_SIZE) / 2 &&
15060 + now.tv_usec <= USEC_BEFORE + ((unsigned) TICK_SIZE) / 2)
15061 + fail = set_rtc_mmss(now.tv_sec);
15062 +
15063 + next.tv_usec = USEC_AFTER - now.tv_usec;
15064 + if (next.tv_usec <= 0)
15065 + next.tv_usec += USEC_PER_SEC;
15066 +
15067 + if (!fail)
15068 + next.tv_sec = 659;
15069 + else
15070 + next.tv_sec = 0;
15071 +
15072 + if (next.tv_usec >= USEC_PER_SEC) {
15073 + next.tv_sec++;
15074 + next.tv_usec -= USEC_PER_SEC;
15075 + }
15076 + mod_timer(&sync_cmos_timer, jiffies + timeval_to_jiffies(&next));
15077 +}
15078 +
15079 +void notify_arch_cmos_timer(void)
15080 +{
15081 + mod_timer(&sync_cmos_timer, jiffies + 1);
15082 + mod_timer(&sync_xen_wallclock_timer, jiffies + 1);
15083 +}
15084 +
15085 +static long clock_cmos_diff, sleep_start;
15086 +
15087 +static struct timer_opts *last_timer;
15088 +static int timer_suspend(struct sys_device *dev, pm_message_t state)
15089 +{
15090 + /*
15091 + * Estimate time zone so that set_time can update the clock
15092 + */
15093 + clock_cmos_diff = -get_cmos_time();
15094 + clock_cmos_diff += get_seconds();
15095 + sleep_start = get_cmos_time();
15096 + last_timer = cur_timer;
15097 + cur_timer = &timer_none;
15098 + if (last_timer->suspend)
15099 + last_timer->suspend(state);
15100 + return 0;
15101 +}
15102 +
15103 +static int timer_resume(struct sys_device *dev)
15104 +{
15105 + unsigned long flags;
15106 + unsigned long sec;
15107 + unsigned long sleep_length;
15108 +
15109 +#ifdef CONFIG_HPET_TIMER
15110 + if (is_hpet_enabled())
15111 + hpet_reenable();
15112 +#endif
15113 + sec = get_cmos_time() + clock_cmos_diff;
15114 + sleep_length = (get_cmos_time() - sleep_start) * HZ;
15115 + write_seqlock_irqsave(&xtime_lock, flags);
15116 + xtime.tv_sec = sec;
15117 + xtime.tv_nsec = 0;
15118 + jiffies_64 += sleep_length;
15119 + wall_jiffies += sleep_length;
15120 + write_sequnlock_irqrestore(&xtime_lock, flags);
15121 + if (last_timer->resume)
15122 + last_timer->resume();
15123 + cur_timer = last_timer;
15124 + last_timer = NULL;
15125 + touch_softlockup_watchdog();
15126 + return 0;
15127 +}
15128 +
15129 +static struct sysdev_class timer_sysclass = {
15130 + .resume = timer_resume,
15131 + .suspend = timer_suspend,
15132 + set_kset_name("timer"),
15133 +};
15134 +
15135 +
15136 +/* XXX this driverfs stuff should probably go elsewhere later -john */
15137 +static struct sys_device device_timer = {
15138 + .id = 0,
15139 + .cls = &timer_sysclass,
15140 +};
15141 +
15142 +static int time_init_device(void)
15143 +{
15144 + int error = sysdev_class_register(&timer_sysclass);
15145 + if (!error)
15146 + error = sysdev_register(&device_timer);
15147 + return error;
15148 +}
15149 +
15150 +device_initcall(time_init_device);
15151 +
15152 +#ifdef CONFIG_HPET_TIMER
15153 +extern void (*late_time_init)(void);
15154 +/* Duplicate of time_init() below, with hpet_enable part added */
15155 +static void __init hpet_time_init(void)
15156 +{
15157 + xtime.tv_sec = get_cmos_time();
15158 + xtime.tv_nsec = (INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ);
15159 + set_normalized_timespec(&wall_to_monotonic,
15160 + -xtime.tv_sec, -xtime.tv_nsec);
15161 +
15162 + if ((hpet_enable() >= 0) && hpet_use_timer) {
15163 + printk("Using HPET for base-timer\n");
15164 + }
15165 +
15166 + cur_timer = select_timer();
15167 + printk(KERN_INFO "Using %s for high-res timesource\n",cur_timer->name);
15168 +
15169 + time_init_hook();
15170 +}
15171 +#endif
15172 +
15173 +/* Dynamically-mapped IRQ. */
15174 +DEFINE_PER_CPU(int, timer_irq);
15175 +
15176 +extern void (*late_time_init)(void);
15177 +static void setup_cpu0_timer_irq(void)
15178 +{
15179 + per_cpu(timer_irq, 0) =
15180 + bind_virq_to_irqhandler(
15181 + VIRQ_TIMER,
15182 + 0,
15183 + timer_interrupt,
15184 + SA_INTERRUPT,
15185 + "timer0",
15186 + NULL);
15187 + BUG_ON(per_cpu(timer_irq, 0) < 0);
15188 +}
15189 +
15190 +void __init time_init(void)
15191 +{
15192 +#ifdef CONFIG_HPET_TIMER
15193 + if (is_hpet_capable()) {
15194 + /*
15195 + * HPET initialization needs to do memory-mapped io. So, let
15196 + * us do a late initialization after mem_init().
15197 + */
15198 + late_time_init = hpet_time_init;
15199 + return;
15200 + }
15201 +#endif
15202 + get_time_values_from_xen();
15203 +
15204 + processed_system_time = per_cpu(shadow_time, 0).system_timestamp;
15205 + per_cpu(processed_system_time, 0) = processed_system_time;
15206 + init_missing_ticks_accounting(0);
15207 +
15208 + update_wallclock();
15209 +
15210 + init_cpu_khz();
15211 + printk(KERN_INFO "Xen reported: %u.%03u MHz processor.\n",
15212 + cpu_khz / 1000, cpu_khz % 1000);
15213 +
15214 +#if defined(__x86_64__)
15215 + vxtime.mode = VXTIME_TSC;
15216 + vxtime.quot = (1000000L << 32) / vxtime_hz;
15217 + vxtime.tsc_quot = (1000L << 32) / cpu_khz;
15218 + sync_core();
15219 + rdtscll(vxtime.last_tsc);
15220 +#endif
15221 +
15222 + /* Cannot request_irq() until kmem is initialised. */
15223 + late_time_init = setup_cpu0_timer_irq;
15224 +}
15225 +
15226 +/* Convert jiffies to system time. */
15227 +u64 jiffies_to_st(unsigned long j)
15228 +{
15229 + unsigned long seq;
15230 + long delta;
15231 + u64 st;
15232 +
15233 + do {
15234 + seq = read_seqbegin(&xtime_lock);
15235 + delta = j - jiffies;
15236 + if (delta < 1) {
15237 + /* Triggers in some wrap-around cases, but that's okay:
15238 + * we just end up with a shorter timeout. */
15239 + st = processed_system_time + NS_PER_TICK;
15240 + } else if (((unsigned long)delta >> (BITS_PER_LONG-3)) != 0) {
15241 + /* Very long timeout means there is no pending timer.
15242 + * We indicate this to Xen by passing zero timeout. */
15243 + st = 0;
15244 + } else {
15245 + st = processed_system_time + delta * (u64)NS_PER_TICK;
15246 + }
15247 + } while (read_seqretry(&xtime_lock, seq));
15248 +
15249 + return st;
15250 +}
15251 +EXPORT_SYMBOL(jiffies_to_st);
15252 +
15253 +/*
15254 + * stop_hz_timer / start_hz_timer - enter/exit 'tickless mode' on an idle cpu
15255 + * These functions are based on implementations from arch/s390/kernel/time.c
15256 + */
15257 +static void stop_hz_timer(void)
15258 +{
15259 + unsigned int cpu = smp_processor_id();
15260 + unsigned long j;
15261 +
15262 + cpu_set(cpu, nohz_cpu_mask);
15263 +
15264 + /* See matching smp_mb in rcu_start_batch in rcupdate.c. These mbs */
15265 + /* ensure that if __rcu_pending (nested in rcu_needs_cpu) fetches a */
15266 + /* value of rcp->cur that matches rdp->quiescbatch and allows us to */
15267 + /* stop the hz timer then the cpumasks created for subsequent values */
15268 + /* of cur in rcu_start_batch are guaranteed to pick up the updated */
15269 + /* nohz_cpu_mask and so will not depend on this cpu. */
15270 +
15271 + smp_mb();
15272 +
15273 + /* Leave ourselves in tick mode if rcu or softirq or timer pending. */
15274 + if (rcu_needs_cpu(cpu) || local_softirq_pending() ||
15275 + (j = next_timer_interrupt(), time_before_eq(j, jiffies))) {
15276 + cpu_clear(cpu, nohz_cpu_mask);
15277 + j = jiffies + 1;
15278 + }
15279 +
15280 + if (HYPERVISOR_set_timer_op(jiffies_to_st(j)) != 0)
15281 + BUG();
15282 +}
15283 +
15284 +static void start_hz_timer(void)
15285 +{
15286 + cpu_clear(smp_processor_id(), nohz_cpu_mask);
15287 +}
15288 +
15289 +void safe_halt(void)
15290 +{
15291 + stop_hz_timer();
15292 + /* Blocking includes an implicit local_irq_enable(). */
15293 + HYPERVISOR_block();
15294 + start_hz_timer();
15295 +}
15296 +EXPORT_SYMBOL(safe_halt);
15297 +
15298 +void halt(void)
15299 +{
15300 + if (irqs_disabled())
15301 + HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL);
15302 +}
15303 +EXPORT_SYMBOL(halt);
15304 +
15305 +/* No locking required. We are only CPU running, and interrupts are off. */
15306 +void time_resume(void)
15307 +{
15308 + init_cpu_khz();
15309 +
15310 + get_time_values_from_xen();
15311 +
15312 + processed_system_time = per_cpu(shadow_time, 0).system_timestamp;
15313 + per_cpu(processed_system_time, 0) = processed_system_time;
15314 + init_missing_ticks_accounting(0);
15315 +
15316 + update_wallclock();
15317 +}
15318 +
15319 +#ifdef CONFIG_SMP
15320 +static char timer_name[NR_CPUS][15];
15321 +
15322 +int local_setup_timer(unsigned int cpu)
15323 +{
15324 + int seq, irq;
15325 +
15326 + BUG_ON(cpu == 0);
15327 +
15328 + do {
15329 + seq = read_seqbegin(&xtime_lock);
15330 + /* Use cpu0 timestamp: cpu's shadow is not initialised yet. */
15331 + per_cpu(processed_system_time, cpu) =
15332 + per_cpu(shadow_time, 0).system_timestamp;
15333 + init_missing_ticks_accounting(cpu);
15334 + } while (read_seqretry(&xtime_lock, seq));
15335 +
15336 + sprintf(timer_name[cpu], "timer%d", cpu);
15337 + irq = bind_virq_to_irqhandler(VIRQ_TIMER,
15338 + cpu,
15339 + timer_interrupt,
15340 + SA_INTERRUPT,
15341 + timer_name[cpu],
15342 + NULL);
15343 + if (irq < 0)
15344 + return irq;
15345 + per_cpu(timer_irq, cpu) = irq;
15346 +
15347 + return 0;
15348 +}
15349 +
15350 +void local_teardown_timer(unsigned int cpu)
15351 +{
15352 + BUG_ON(cpu == 0);
15353 + unbind_from_irqhandler(per_cpu(timer_irq, cpu), NULL);
15354 +}
15355 +#endif
15356 +
15357 +/*
15358 + * /proc/sys/xen: This really belongs in another file. It can stay here for
15359 + * now however.
15360 + */
15361 +static ctl_table xen_subtable[] = {
15362 + {
15363 + .ctl_name = 1,
15364 + .procname = "independent_wallclock",
15365 + .data = &independent_wallclock,
15366 + .maxlen = sizeof(independent_wallclock),
15367 + .mode = 0644,
15368 + .proc_handler = proc_dointvec
15369 + },
15370 + {
15371 + .ctl_name = 2,
15372 + .procname = "permitted_clock_jitter",
15373 + .data = &permitted_clock_jitter,
15374 + .maxlen = sizeof(permitted_clock_jitter),
15375 + .mode = 0644,
15376 + .proc_handler = proc_doulongvec_minmax
15377 + },
15378 + { 0 }
15379 +};
15380 +static ctl_table xen_table[] = {
15381 + {
15382 + .ctl_name = 123,
15383 + .procname = "xen",
15384 + .mode = 0555,
15385 + .child = xen_subtable},
15386 + { 0 }
15387 +};
15388 +static int __init xen_sysctl_init(void)
15389 +{
15390 + (void)register_sysctl_table(xen_table, 0);
15391 + return 0;
15392 +}
15393 +__initcall(xen_sysctl_init);
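
The per-CPU shadow_time copies used throughout time-xen.c describe the TSC rate as Xen's fixed-point pair: a TSC cycle delta is first shifted by tsc_shift and then multiplied by the 32.32 fraction tsc_to_nsec_mul (tsc_to_system_mul in the shared info), which is what the inline assembly in scale_delta() computes and what init_cpu_khz() inverts to recover cpu_khz. A minimal portable sketch of the same arithmetic, assuming a hypothetical 2400 MHz TSC and an illustrative mul_frac value rather than anything read from Xen:

    #include <stdint.h>
    #include <stdio.h>

    /* Portable equivalent of the i386/x86_64 asm in scale_delta():
     * shift the cycle delta, then keep the top 64 bits of the
     * 64x32-bit multiply by a 32.32 fixed-point fraction. */
    static uint64_t scale_delta(uint64_t delta, uint32_t mul_frac, int shift)
    {
        if (shift < 0)
            delta >>= -shift;
        else
            delta <<= shift;
        return (((delta & 0xffffffffULL) * mul_frac) >> 32) +
               ((delta >> 32) * mul_frac);
    }

    int main(void)
    {
        /* Hypothetical 2400 MHz TSC with tsc_shift == 0:
         * mul_frac = nanoseconds per cycle * 2^32 (illustrative value). */
        uint32_t mul_frac = (uint32_t)(1000000000.0 / 2400000000.0 * 4294967296.0);
        uint64_t cycles = 2400000000ULL;    /* about one second of cycles */

        printf("%llu cycles -> %llu ns\n",
               (unsigned long long)cycles,
               (unsigned long long)scale_delta(cycles, mul_frac, 0));
        return 0;
    }

Built as an ordinary userspace program (the CPU frequency and mul_frac above are made up for the demonstration), this prints roughly 1000000000 ns for one second's worth of cycles; do_gettimeofday() and monotonic_clock() apply the same conversion to the offset between the current TSC and the last shadow snapshot.
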
15394 diff -Nur linux-2.6.16.33-noxen/arch/i386/kernel/traps-xen.c linux-2.6.16.33/arch/i386/kernel/traps-xen.c
15395 --- linux-2.6.16.33-noxen/arch/i386/kernel/traps-xen.c 1970-01-01 00:00:00.000000000 +0000
15396 +++ linux-2.6.16.33/arch/i386/kernel/traps-xen.c 2007-01-08 15:00:45.000000000 +0000
15397 @@ -0,0 +1,1094 @@
15398 +/*
15399 + * linux/arch/i386/traps.c
15400 + *
15401 + * Copyright (C) 1991, 1992 Linus Torvalds
15402 + *
15403 + * Pentium III FXSR, SSE support
15404 + * Gareth Hughes <gareth@valinux.com>, May 2000
15405 + */
15406 +
15407 +/*
15408 + * 'Traps.c' handles hardware traps and faults after we have saved some
15409 + * state in 'asm.s'.
15410 + */
15411 +#include <linux/config.h>
15412 +#include <linux/sched.h>
15413 +#include <linux/kernel.h>
15414 +#include <linux/string.h>
15415 +#include <linux/errno.h>
15416 +#include <linux/timer.h>
15417 +#include <linux/mm.h>
15418 +#include <linux/init.h>
15419 +#include <linux/delay.h>
15420 +#include <linux/spinlock.h>
15421 +#include <linux/interrupt.h>
15422 +#include <linux/highmem.h>
15423 +#include <linux/kallsyms.h>
15424 +#include <linux/ptrace.h>
15425 +#include <linux/utsname.h>
15426 +#include <linux/kprobes.h>
15427 +#include <linux/kexec.h>
15428 +
15429 +#ifdef CONFIG_EISA
15430 +#include <linux/ioport.h>
15431 +#include <linux/eisa.h>
15432 +#endif
15433 +
15434 +#ifdef CONFIG_MCA
15435 +#include <linux/mca.h>
15436 +#endif
15437 +
15438 +#include <asm/processor.h>
15439 +#include <asm/system.h>
15440 +#include <asm/uaccess.h>
15441 +#include <asm/io.h>
15442 +#include <asm/atomic.h>
15443 +#include <asm/debugreg.h>
15444 +#include <asm/desc.h>
15445 +#include <asm/i387.h>
15446 +#include <asm/nmi.h>
15447 +
15448 +#include <asm/smp.h>
15449 +#include <asm/arch_hooks.h>
15450 +#include <asm/kdebug.h>
15451 +
15452 +#include <linux/module.h>
15453 +
15454 +#include "mach_traps.h"
15455 +
15456 +asmlinkage int system_call(void);
15457 +
15458 +struct desc_struct default_ldt[] = { { 0, 0 }, { 0, 0 }, { 0, 0 },
15459 + { 0, 0 }, { 0, 0 } };
15460 +
15461 +/* Do we ignore FPU interrupts ? */
15462 +char ignore_fpu_irq = 0;
15463 +
15464 +#ifndef CONFIG_X86_NO_IDT
15465 +/*
15466 + * The IDT has to be page-aligned to simplify the Pentium
15467 + * F0 0F bug workaround.. We have a special link segment
15468 + * for this.
15469 + */
15470 +struct desc_struct idt_table[256] __attribute__((__section__(".data.idt"))) = { {0, 0}, };
15471 +#endif
15472 +
15473 +asmlinkage void divide_error(void);
15474 +asmlinkage void debug(void);
15475 +asmlinkage void nmi(void);
15476 +asmlinkage void int3(void);
15477 +asmlinkage void overflow(void);
15478 +asmlinkage void bounds(void);
15479 +asmlinkage void invalid_op(void);
15480 +asmlinkage void device_not_available(void);
15481 +asmlinkage void coprocessor_segment_overrun(void);
15482 +asmlinkage void invalid_TSS(void);
15483 +asmlinkage void segment_not_present(void);
15484 +asmlinkage void stack_segment(void);
15485 +asmlinkage void general_protection(void);
15486 +asmlinkage void page_fault(void);
15487 +asmlinkage void coprocessor_error(void);
15488 +asmlinkage void simd_coprocessor_error(void);
15489 +asmlinkage void alignment_check(void);
15490 +#ifndef CONFIG_XEN
15491 +asmlinkage void spurious_interrupt_bug(void);
15492 +#else
15493 +asmlinkage void fixup_4gb_segment(void);
15494 +#endif
15495 +asmlinkage void machine_check(void);
15496 +
15497 +static int kstack_depth_to_print = 24;
15498 +struct notifier_block *i386die_chain;
15499 +static DEFINE_SPINLOCK(die_notifier_lock);
15500 +
15501 +int register_die_notifier(struct notifier_block *nb)
15502 +{
15503 + int err = 0;
15504 + unsigned long flags;
15505 + spin_lock_irqsave(&die_notifier_lock, flags);
15506 + err = notifier_chain_register(&i386die_chain, nb);
15507 + spin_unlock_irqrestore(&die_notifier_lock, flags);
15508 + return err;
15509 +}
15510 +EXPORT_SYMBOL(register_die_notifier);
15511 +
15512 +static inline int valid_stack_ptr(struct thread_info *tinfo, void *p)
15513 +{
15514 + return p > (void *)tinfo &&
15515 + p < (void *)tinfo + THREAD_SIZE - 3;
15516 +}
15517 +
15518 +static void print_addr_and_symbol(unsigned long addr, char *log_lvl)
15519 +{
15520 + printk(log_lvl);
15521 + printk(" [<%08lx>] ", addr);
15522 + print_symbol("%s", addr);
15523 + printk("\n");
15524 +}
15525 +
15526 +static inline unsigned long print_context_stack(struct thread_info *tinfo,
15527 + unsigned long *stack, unsigned long ebp,
15528 + char *log_lvl)
15529 +{
15530 + unsigned long addr;
15531 +
15532 +#ifdef CONFIG_FRAME_POINTER
15533 + while (valid_stack_ptr(tinfo, (void *)ebp)) {
15534 + addr = *(unsigned long *)(ebp + 4);
15535 + print_addr_and_symbol(addr, log_lvl);
15536 + ebp = *(unsigned long *)ebp;
15537 + }
15538 +#else
15539 + while (valid_stack_ptr(tinfo, stack)) {
15540 + addr = *stack++;
15541 + if (__kernel_text_address(addr))
15542 + print_addr_and_symbol(addr, log_lvl);
15543 + }
15544 +#endif
15545 + return ebp;
15546 +}
15547 +
15548 +static void show_trace_log_lvl(struct task_struct *task,
15549 + unsigned long *stack, char *log_lvl)
15550 +{
15551 + unsigned long ebp;
15552 +
15553 + if (!task)
15554 + task = current;
15555 +
15556 + if (task == current) {
15557 + /* Grab ebp right from our regs */
15558 + asm ("movl %%ebp, %0" : "=r" (ebp) : );
15559 + } else {
15560 + /* ebp is the last reg pushed by switch_to */
15561 + ebp = *(unsigned long *) task->thread.esp;
15562 + }
15563 +
15564 + while (1) {
15565 + struct thread_info *context;
15566 + context = (struct thread_info *)
15567 + ((unsigned long)stack & (~(THREAD_SIZE - 1)));
15568 + ebp = print_context_stack(context, stack, ebp, log_lvl);
15569 + stack = (unsigned long*)context->previous_esp;
15570 + if (!stack)
15571 + break;
15572 + printk(log_lvl);
15573 + printk(" =======================\n");
15574 + }
15575 +}
15576 +
15577 +void show_trace(struct task_struct *task, unsigned long * stack)
15578 +{
15579 + show_trace_log_lvl(task, stack, "");
15580 +}
15581 +
15582 +static void show_stack_log_lvl(struct task_struct *task, unsigned long *esp,
15583 + char *log_lvl)
15584 +{
15585 + unsigned long *stack;
15586 + int i;
15587 +
15588 + if (esp == NULL) {
15589 + if (task)
15590 + esp = (unsigned long*)task->thread.esp;
15591 + else
15592 + esp = (unsigned long *)&esp;
15593 + }
15594 +
15595 + stack = esp;
15596 + printk(log_lvl);
15597 + for(i = 0; i < kstack_depth_to_print; i++) {
15598 + if (kstack_end(stack))
15599 + break;
15600 + if (i && ((i % 8) == 0)) {
15601 + printk("\n");
15602 + printk(log_lvl);
15603 + printk(" ");
15604 + }
15605 + printk("%08lx ", *stack++);
15606 + }
15607 + printk("\n");
15608 + printk(log_lvl);
15609 + printk("Call Trace:\n");
15610 + show_trace_log_lvl(task, esp, log_lvl);
15611 +}
15612 +
15613 +void show_stack(struct task_struct *task, unsigned long *esp)
15614 +{
15615 + show_stack_log_lvl(task, esp, "");
15616 +}
15617 +
15618 +/*
15619 + * The architecture-independent dump_stack generator
15620 + */
15621 +void dump_stack(void)
15622 +{
15623 + unsigned long stack;
15624 +
15625 + show_trace(current, &stack);
15626 +}
15627 +
15628 +EXPORT_SYMBOL(dump_stack);
15629 +
15630 +void show_registers(struct pt_regs *regs)
15631 +{
15632 + int i;
15633 + int in_kernel = 1;
15634 + unsigned long esp;
15635 + unsigned short ss;
15636 +
15637 + esp = (unsigned long) (&regs->esp);
15638 + savesegment(ss, ss);
15639 + if (user_mode(regs)) {
15640 + in_kernel = 0;
15641 + esp = regs->esp;
15642 + ss = regs->xss & 0xffff;
15643 + }
15644 + print_modules();
15645 + printk(KERN_EMERG "CPU: %d\nEIP: %04x:[<%08lx>] %s VLI\n"
15646 + "EFLAGS: %08lx (%s %.*s) \n",
15647 + smp_processor_id(), 0xffff & regs->xcs, regs->eip,
15648 + print_tainted(), regs->eflags, system_utsname.release,
15649 + (int)strcspn(system_utsname.version, " "),
15650 + system_utsname.version);
15651 + print_symbol(KERN_EMERG "EIP is at %s\n", regs->eip);
15652 + printk(KERN_EMERG "eax: %08lx ebx: %08lx ecx: %08lx edx: %08lx\n",
15653 + regs->eax, regs->ebx, regs->ecx, regs->edx);
15654 + printk(KERN_EMERG "esi: %08lx edi: %08lx ebp: %08lx esp: %08lx\n",
15655 + regs->esi, regs->edi, regs->ebp, esp);
15656 + printk(KERN_EMERG "ds: %04x es: %04x ss: %04x\n",
15657 + regs->xds & 0xffff, regs->xes & 0xffff, ss);
15658 + printk(KERN_EMERG "Process %s (pid: %d, threadinfo=%p task=%p)",
15659 + current->comm, current->pid, current_thread_info(), current);
15660 + /*
15661 + * When in-kernel, we also print out the stack and code at the
15662 + * time of the fault..
15663 + */
15664 + if (in_kernel) {
15665 + u8 __user *eip;
15666 +
15667 + printk("\n" KERN_EMERG "Stack: ");
15668 + show_stack_log_lvl(NULL, (unsigned long *)esp, KERN_EMERG);
15669 +
15670 + printk(KERN_EMERG "Code: ");
15671 +
15672 + eip = (u8 __user *)regs->eip - 43;
15673 + for (i = 0; i < 64; i++, eip++) {
15674 + unsigned char c;
15675 +
15676 + if (eip < (u8 __user *)PAGE_OFFSET || __get_user(c, eip)) {
15677 + printk(" Bad EIP value.");
15678 + break;
15679 + }
15680 + if (eip == (u8 __user *)regs->eip)
15681 + printk("<%02x> ", c);
15682 + else
15683 + printk("%02x ", c);
15684 + }
15685 + }
15686 + printk("\n");
15687 +}
15688 +
15689 +static void handle_BUG(struct pt_regs *regs)
15690 +{
15691 + unsigned short ud2;
15692 + unsigned short line;
15693 + char *file;
15694 + char c;
15695 + unsigned long eip;
15696 +
15697 + eip = regs->eip;
15698 +
15699 + if (eip < PAGE_OFFSET)
15700 + goto no_bug;
15701 + if (__get_user(ud2, (unsigned short __user *)eip))
15702 + goto no_bug;
15703 + if (ud2 != 0x0b0f)
15704 + goto no_bug;
15705 + if (__get_user(line, (unsigned short __user *)(eip + 2)))
15706 + goto bug;
15707 + if (__get_user(file, (char * __user *)(eip + 4)) ||
15708 + (unsigned long)file < PAGE_OFFSET || __get_user(c, file))
15709 + file = "<bad filename>";
15710 +
15711 + printk(KERN_EMERG "------------[ cut here ]------------\n");
15712 + printk(KERN_EMERG "kernel BUG at %s:%d!\n", file, line);
15713 +
15714 +no_bug:
15715 + return;
15716 +
15717 + /* Here we know it was a BUG but file-n-line is unavailable */
15718 +bug:
15719 + printk(KERN_EMERG "Kernel BUG\n");
15720 +}
15721 +
15722 +/* This is gone through when something in the kernel
15723 + * has done something bad and is about to be terminated.
15724 +*/
15725 +void die(const char * str, struct pt_regs * regs, long err)
15726 +{
15727 + static struct {
15728 + spinlock_t lock;
15729 + u32 lock_owner;
15730 + int lock_owner_depth;
15731 + } die = {
15732 + .lock = SPIN_LOCK_UNLOCKED,
15733 + .lock_owner = -1,
15734 + .lock_owner_depth = 0
15735 + };
15736 + static int die_counter;
15737 + unsigned long flags;
15738 +
15739 + if (die.lock_owner != raw_smp_processor_id()) {
15740 + console_verbose();
15741 + spin_lock_irqsave(&die.lock, flags);
15742 + die.lock_owner = smp_processor_id();
15743 + die.lock_owner_depth = 0;
15744 + bust_spinlocks(1);
15745 + }
15746 + else
15747 + local_save_flags(flags);
15748 +
15749 + if (++die.lock_owner_depth < 3) {
15750 + int nl = 0;
15751 + handle_BUG(regs);
15752 + printk(KERN_EMERG "%s: %04lx [#%d]\n", str, err & 0xffff, ++die_counter);
15753 +#ifdef CONFIG_PREEMPT
15754 + printk(KERN_EMERG "PREEMPT ");
15755 + nl = 1;
15756 +#endif
15757 +#ifdef CONFIG_SMP
15758 + if (!nl)
15759 + printk(KERN_EMERG);
15760 + printk("SMP ");
15761 + nl = 1;
15762 +#endif
15763 +#ifdef CONFIG_DEBUG_PAGEALLOC
15764 + if (!nl)
15765 + printk(KERN_EMERG);
15766 + printk("DEBUG_PAGEALLOC");
15767 + nl = 1;
15768 +#endif
15769 + if (nl)
15770 + printk("\n");
15771 + notify_die(DIE_OOPS, (char *)str, regs, err, 255, SIGSEGV);
15772 + show_registers(regs);
15773 + } else
15774 + printk(KERN_EMERG "Recursive die() failure, output suppressed\n");
15775 +
15776 + bust_spinlocks(0);
15777 + die.lock_owner = -1;
15778 + spin_unlock_irqrestore(&die.lock, flags);
15779 +
15780 + if (kexec_should_crash(current))
15781 + crash_kexec(regs);
15782 +
15783 + if (in_interrupt())
15784 + panic("Fatal exception in interrupt");
15785 +
15786 + if (panic_on_oops) {
15787 + printk(KERN_EMERG "Fatal exception: panic in 5 seconds\n");
15788 + ssleep(5);
15789 + panic("Fatal exception");
15790 + }
15791 + do_exit(SIGSEGV);
15792 +}
15793 +
15794 +static inline void die_if_kernel(const char * str, struct pt_regs * regs, long err)
15795 +{
15796 + if (!user_mode_vm(regs))
15797 + die(str, regs, err);
15798 +}
15799 +
15800 +static void __kprobes do_trap(int trapnr, int signr, char *str, int vm86,
15801 + struct pt_regs * regs, long error_code,
15802 + siginfo_t *info)
15803 +{
15804 + struct task_struct *tsk = current;
15805 + tsk->thread.error_code = error_code;
15806 + tsk->thread.trap_no = trapnr;
15807 +
15808 + if (regs->eflags & VM_MASK) {
15809 + if (vm86)
15810 + goto vm86_trap;
15811 + goto trap_signal;
15812 + }
15813 +
15814 + if (!user_mode(regs))
15815 + goto kernel_trap;
15816 +
15817 + trap_signal: {
15818 + if (info)
15819 + force_sig_info(signr, info, tsk);
15820 + else
15821 + force_sig(signr, tsk);
15822 + return;
15823 + }
15824 +
15825 + kernel_trap: {
15826 + if (!fixup_exception(regs))
15827 + die(str, regs, error_code);
15828 + return;
15829 + }
15830 +
15831 + vm86_trap: {
15832 + int ret = handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, trapnr);
15833 + if (ret) goto trap_signal;
15834 + return;
15835 + }
15836 +}
15837 +
15838 +#define DO_ERROR(trapnr, signr, str, name) \
15839 +fastcall void do_##name(struct pt_regs * regs, long error_code) \
15840 +{ \
15841 + if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
15842 + == NOTIFY_STOP) \
15843 + return; \
15844 + do_trap(trapnr, signr, str, 0, regs, error_code, NULL); \
15845 +}
15846 +
15847 +#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
15848 +fastcall void do_##name(struct pt_regs * regs, long error_code) \
15849 +{ \
15850 + siginfo_t info; \
15851 + info.si_signo = signr; \
15852 + info.si_errno = 0; \
15853 + info.si_code = sicode; \
15854 + info.si_addr = (void __user *)siaddr; \
15855 + if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
15856 + == NOTIFY_STOP) \
15857 + return; \
15858 + do_trap(trapnr, signr, str, 0, regs, error_code, &info); \
15859 +}
15860 +
15861 +#define DO_VM86_ERROR(trapnr, signr, str, name) \
15862 +fastcall void do_##name(struct pt_regs * regs, long error_code) \
15863 +{ \
15864 + if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
15865 + == NOTIFY_STOP) \
15866 + return; \
15867 + do_trap(trapnr, signr, str, 1, regs, error_code, NULL); \
15868 +}
15869 +
15870 +#define DO_VM86_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
15871 +fastcall void do_##name(struct pt_regs * regs, long error_code) \
15872 +{ \
15873 + siginfo_t info; \
15874 + info.si_signo = signr; \
15875 + info.si_errno = 0; \
15876 + info.si_code = sicode; \
15877 + info.si_addr = (void __user *)siaddr; \
15878 + if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
15879 + == NOTIFY_STOP) \
15880 + return; \
15881 + do_trap(trapnr, signr, str, 1, regs, error_code, &info); \
15882 +}
15883 +
15884 +DO_VM86_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->eip)
15885 +#ifndef CONFIG_KPROBES
15886 +DO_VM86_ERROR( 3, SIGTRAP, "int3", int3)
15887 +#endif
15888 +DO_VM86_ERROR( 4, SIGSEGV, "overflow", overflow)
15889 +DO_VM86_ERROR( 5, SIGSEGV, "bounds", bounds)
15890 +DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->eip)
15891 +DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
15892 +DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
15893 +DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
15894 +DO_ERROR(12, SIGBUS, "stack segment", stack_segment)
15895 +DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0)
15896 +DO_ERROR_INFO(32, SIGSEGV, "iret exception", iret_error, ILL_BADSTK, 0)
15897 +
15898 +fastcall void __kprobes do_general_protection(struct pt_regs * regs,
15899 + long error_code)
15900 +{
15901 + current->thread.error_code = error_code;
15902 + current->thread.trap_no = 13;
15903 +
15904 + if (regs->eflags & VM_MASK)
15905 + goto gp_in_vm86;
15906 +
15907 + if (!user_mode(regs))
15908 + goto gp_in_kernel;
15909 +
15910 + current->thread.error_code = error_code;
15911 + current->thread.trap_no = 13;
15912 + force_sig(SIGSEGV, current);
15913 + return;
15914 +
15915 +gp_in_vm86:
15916 + local_irq_enable();
15917 + handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code);
15918 + return;
15919 +
15920 +gp_in_kernel:
15921 + if (!fixup_exception(regs)) {
15922 + if (notify_die(DIE_GPF, "general protection fault", regs,
15923 + error_code, 13, SIGSEGV) == NOTIFY_STOP)
15924 + return;
15925 + die("general protection fault", regs, error_code);
15926 + }
15927 +}
15928 +
15929 +static void mem_parity_error(unsigned char reason, struct pt_regs * regs)
15930 +{
15931 + printk(KERN_EMERG "Uhhuh. NMI received. Dazed and confused, but trying "
15932 + "to continue\n");
15933 + printk(KERN_EMERG "You probably have a hardware problem with your RAM "
15934 + "chips\n");
15935 +
15936 + /* Clear and disable the memory parity error line. */
15937 + clear_mem_error(reason);
15938 +}
15939 +
15940 +static void io_check_error(unsigned char reason, struct pt_regs * regs)
15941 +{
15942 + printk(KERN_EMERG "NMI: IOCK error (debug interrupt?)\n");
15943 + show_registers(regs);
15944 +
15945 + /* Re-enable the IOCK line, wait for a few seconds */
15946 + clear_io_check_error(reason);
15947 +}
15948 +
15949 +static void unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
15950 +{
15951 +#ifdef CONFIG_MCA
15952 + /* Might actually be able to figure out what the guilty party
15953 + * is. */
15954 + if( MCA_bus ) {
15955 + mca_handle_nmi();
15956 + return;
15957 + }
15958 +#endif
15959 + printk("Uhhuh. NMI received for unknown reason %02x on CPU %d.\n",
15960 + reason, smp_processor_id());
15961 + printk("Dazed and confused, but trying to continue\n");
15962 + printk("Do you have a strange power saving mode enabled?\n");
15963 +}
15964 +
15965 +static DEFINE_SPINLOCK(nmi_print_lock);
15966 +
15967 +void die_nmi (struct pt_regs *regs, const char *msg)
15968 +{
15969 + if (notify_die(DIE_NMIWATCHDOG, msg, regs, 0, 0, SIGINT) ==
15970 + NOTIFY_STOP)
15971 + return;
15972 +
15973 + spin_lock(&nmi_print_lock);
15974 + /*
15975 + * We are in trouble anyway, lets at least try
15976 + * to get a message out.
15977 + */
15978 + bust_spinlocks(1);
15979 + printk(KERN_EMERG "%s", msg);
15980 + printk(" on CPU%d, eip %08lx, registers:\n",
15981 + smp_processor_id(), regs->eip);
15982 + show_registers(regs);
15983 + printk(KERN_EMERG "console shuts up ...\n");
15984 + console_silent();
15985 + spin_unlock(&nmi_print_lock);
15986 + bust_spinlocks(0);
15987 +
15988 + /* If we are in kernel we are probably nested up pretty bad
15989 + * and might as well get out now while we still can.
15990 + */
15991 + if (!user_mode(regs)) {
15992 + current->thread.trap_no = 2;
15993 + crash_kexec(regs);
15994 + }
15995 +
15996 + do_exit(SIGSEGV);
15997 +}
15998 +
15999 +static void default_do_nmi(struct pt_regs * regs)
16000 +{
16001 + unsigned char reason = 0;
16002 +
16003 + /* Only the BSP gets external NMIs from the system. */
16004 + if (!smp_processor_id())
16005 + reason = get_nmi_reason();
16006 +
16007 + if (!(reason & 0xc0)) {
16008 + if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 0, SIGINT)
16009 + == NOTIFY_STOP)
16010 + return;
16011 +#ifdef CONFIG_X86_LOCAL_APIC
16012 + /*
16013 + * Ok, so this is none of the documented NMI sources,
16014 + * so it must be the NMI watchdog.
16015 + */
16016 + if (nmi_watchdog) {
16017 + nmi_watchdog_tick(regs);
16018 + return;
16019 + }
16020 +#endif
16021 + unknown_nmi_error(reason, regs);
16022 + return;
16023 + }
16024 + if (notify_die(DIE_NMI, "nmi", regs, reason, 0, SIGINT) == NOTIFY_STOP)
16025 + return;
16026 + if (reason & 0x80)
16027 + mem_parity_error(reason, regs);
16028 + if (reason & 0x40)
16029 + io_check_error(reason, regs);
16030 + /*
16031 + * Reassert NMI in case it became active meanwhile
16032 + * as it's edge-triggered.
16033 + */
16034 + reassert_nmi();
16035 +}
16036 +
16037 +static int dummy_nmi_callback(struct pt_regs * regs, int cpu)
16038 +{
16039 + return 0;
16040 +}
16041 +
16042 +static nmi_callback_t nmi_callback = dummy_nmi_callback;
16043 +
16044 +fastcall void do_nmi(struct pt_regs * regs, long error_code)
16045 +{
16046 + int cpu;
16047 +
16048 + nmi_enter();
16049 +
16050 + cpu = smp_processor_id();
16051 +
16052 + ++nmi_count(cpu);
16053 +
16054 + if (!rcu_dereference(nmi_callback)(regs, cpu))
16055 + default_do_nmi(regs);
16056 +
16057 + nmi_exit();
16058 +}
16059 +
16060 +void set_nmi_callback(nmi_callback_t callback)
16061 +{
16062 + rcu_assign_pointer(nmi_callback, callback);
16063 +}
16064 +EXPORT_SYMBOL_GPL(set_nmi_callback);
16065 +
16066 +void unset_nmi_callback(void)
16067 +{
16068 + nmi_callback = dummy_nmi_callback;
16069 +}
16070 +EXPORT_SYMBOL_GPL(unset_nmi_callback);
16071 +
16072 +#ifdef CONFIG_KPROBES
16073 +fastcall void __kprobes do_int3(struct pt_regs *regs, long error_code)
16074 +{
16075 + if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP)
16076 + == NOTIFY_STOP)
16077 + return;
16078 + /* This is an interrupt gate, because kprobes wants interrupts
16079 + disabled. Normal trap handlers don't. */
16080 + restore_interrupts(regs);
16081 + do_trap(3, SIGTRAP, "int3", 1, regs, error_code, NULL);
16082 +}
16083 +#endif
16084 +
16085 +/*
16086 + * Our handling of the processor debug registers is non-trivial.
16087 + * We do not clear them on entry and exit from the kernel. Therefore
16088 + * it is possible to get a watchpoint trap here from inside the kernel.
16089 + * However, the code in ./ptrace.c has ensured that the user can
16090 + * only set watchpoints on userspace addresses. Therefore the in-kernel
16091 + * watchpoint trap can only occur in code which is reading/writing
16092 + * from user space. Such code must not hold kernel locks (since it
16093 + * can equally take a page fault), therefore it is safe to call
16094 + * force_sig_info even though that claims and releases locks.
16095 + *
16096 + * Code in ./signal.c ensures that the debug control register
16097 + * is restored before we deliver any signal, and therefore that
16098 + * user code runs with the correct debug control register even though
16099 + * we clear it here.
16100 + *
16101 + * Being careful here means that we don't have to be as careful in a
16102 + * lot of more complicated places (task switching can be a bit lazy
16103 + * about restoring all the debug state, and ptrace doesn't have to
16104 + * find every occurrence of the TF bit that could be saved away even
16105 + * by user code)
16106 + */
16107 +fastcall void __kprobes do_debug(struct pt_regs * regs, long error_code)
16108 +{
16109 + unsigned int condition;
16110 + struct task_struct *tsk = current;
16111 +
16112 + get_debugreg(condition, 6);
16113 +
16114 + if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
16115 + SIGTRAP) == NOTIFY_STOP)
16116 + return;
16117 + /* It's safe to allow irq's after DR6 has been saved */
16118 + if (regs->eflags & X86_EFLAGS_IF)
16119 + local_irq_enable();
16120 +
16121 + /* Mask out spurious debug traps due to lazy DR7 setting */
16122 + if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) {
16123 + if (!tsk->thread.debugreg[7])
16124 + goto clear_dr7;
16125 + }
16126 +
16127 + if (regs->eflags & VM_MASK)
16128 + goto debug_vm86;
16129 +
16130 + /* Save debug status register where ptrace can see it */
16131 + tsk->thread.debugreg[6] = condition;
16132 +
16133 + /*
16134 + * Single-stepping through TF: make sure we ignore any events in
16135 + * kernel space (but re-enable TF when returning to user mode).
16136 + */
16137 + if (condition & DR_STEP) {
16138 + /*
16139 + * We already checked v86 mode above, so we can
16140 + * check for kernel mode by just checking the CPL
16141 + * of CS.
16142 + */
16143 + if (!user_mode(regs))
16144 + goto clear_TF_reenable;
16145 + }
16146 +
16147 + /* Ok, finally something we can handle */
16148 + send_sigtrap(tsk, regs, error_code);
16149 +
16150 + /* Disable additional traps. They'll be re-enabled when
16151 + * the signal is delivered.
16152 + */
16153 +clear_dr7:
16154 + set_debugreg(0, 7);
16155 + return;
16156 +
16157 +debug_vm86:
16158 + handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1);
16159 + return;
16160 +
16161 +clear_TF_reenable:
16162 + set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
16163 + regs->eflags &= ~TF_MASK;
16164 + return;
16165 +}
16166 +
16167 +/*
16168 + * Note that we play around with the 'TS' bit in an attempt to get
16169 + * the correct behaviour even in the presence of the asynchronous
16170 + * IRQ13 behaviour
16171 + */
16172 +void math_error(void __user *eip)
16173 +{
16174 + struct task_struct * task;
16175 + siginfo_t info;
16176 + unsigned short cwd, swd;
16177 +
16178 + /*
16179 + * Save the info for the exception handler and clear the error.
16180 + */
16181 + task = current;
16182 + save_init_fpu(task);
16183 + task->thread.trap_no = 16;
16184 + task->thread.error_code = 0;
16185 + info.si_signo = SIGFPE;
16186 + info.si_errno = 0;
16187 + info.si_code = __SI_FAULT;
16188 + info.si_addr = eip;
16189 + /*
16190 + * (~cwd & swd) will mask out exceptions that are not set to unmasked
16191 + * status. 0x3f is the exception bits in these regs, 0x200 is the
16192 + * C1 reg you need in case of a stack fault, 0x040 is the stack
16193 + * fault bit. We should only be taking one exception at a time,
16194 + * so if this combination doesn't produce any single exception,
16195 + * then we have a bad program that isn't synchronizing its FPU usage
16196 + * and it will suffer the consequences since we won't be able to
16197 + * fully reproduce the context of the exception
16198 + */
16199 + cwd = get_fpu_cwd(task);
16200 + swd = get_fpu_swd(task);
16201 + switch (swd & ~cwd & 0x3f) {
16202 + case 0x000: /* No unmasked exception */
16203 + return;
16204 + default: /* Multiple exceptions */
16205 + break;
16206 + case 0x001: /* Invalid Op */
16207 + /*
16208 + * swd & 0x240 == 0x040: Stack Underflow
16209 + * swd & 0x240 == 0x240: Stack Overflow
16210 + * User must clear the SF bit (0x40) if set
16211 + */
16212 + info.si_code = FPE_FLTINV;
16213 + break;
16214 + case 0x002: /* Denormalize */
16215 + case 0x010: /* Underflow */
16216 + info.si_code = FPE_FLTUND;
16217 + break;
16218 + case 0x004: /* Zero Divide */
16219 + info.si_code = FPE_FLTDIV;
16220 + break;
16221 + case 0x008: /* Overflow */
16222 + info.si_code = FPE_FLTOVF;
16223 + break;
16224 + case 0x020: /* Precision */
16225 + info.si_code = FPE_FLTRES;
16226 + break;
16227 + }
16228 + force_sig_info(SIGFPE, &info, task);
16229 +}
16230 +
16231 +fastcall void do_coprocessor_error(struct pt_regs * regs, long error_code)
16232 +{
16233 + ignore_fpu_irq = 1;
16234 + math_error((void __user *)regs->eip);
16235 +}
16236 +
16237 +static void simd_math_error(void __user *eip)
16238 +{
16239 + struct task_struct * task;
16240 + siginfo_t info;
16241 + unsigned short mxcsr;
16242 +
16243 + /*
16244 + * Save the info for the exception handler and clear the error.
16245 + */
16246 + task = current;
16247 + save_init_fpu(task);
16248 + task->thread.trap_no = 19;
16249 + task->thread.error_code = 0;
16250 + info.si_signo = SIGFPE;
16251 + info.si_errno = 0;
16252 + info.si_code = __SI_FAULT;
16253 + info.si_addr = eip;
16254 + /*
16255 + * The SIMD FPU exceptions are handled a little differently, as there
16256 + * is only a single status/control register. Thus, to determine which
16257 + * unmasked exception was caught we must mask the exception mask bits
16258 + * at 0x1f80, and then use these to mask the exception bits at 0x3f.
16259 + */
16260 + mxcsr = get_fpu_mxcsr(task);
16261 + switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) {
16262 + case 0x000:
16263 + default:
16264 + break;
16265 + case 0x001: /* Invalid Op */
16266 + info.si_code = FPE_FLTINV;
16267 + break;
16268 + case 0x002: /* Denormalize */
16269 + case 0x010: /* Underflow */
16270 + info.si_code = FPE_FLTUND;
16271 + break;
16272 + case 0x004: /* Zero Divide */
16273 + info.si_code = FPE_FLTDIV;
16274 + break;
16275 + case 0x008: /* Overflow */
16276 + info.si_code = FPE_FLTOVF;
16277 + break;
16278 + case 0x020: /* Precision */
16279 + info.si_code = FPE_FLTRES;
16280 + break;
16281 + }
16282 + force_sig_info(SIGFPE, &info, task);
16283 +}
16284 +
16285 +fastcall void do_simd_coprocessor_error(struct pt_regs * regs,
16286 + long error_code)
16287 +{
16288 + if (cpu_has_xmm) {
16289 + /* Handle SIMD FPU exceptions on PIII+ processors. */
16290 + ignore_fpu_irq = 1;
16291 + simd_math_error((void __user *)regs->eip);
16292 + } else {
16293 + /*
16294 + * Handle strange cache flush from user space exception
16295 + * in all other cases. This is undocumented behaviour.
16296 + */
16297 + if (regs->eflags & VM_MASK) {
16298 + handle_vm86_fault((struct kernel_vm86_regs *)regs,
16299 + error_code);
16300 + return;
16301 + }
16302 + current->thread.trap_no = 19;
16303 + current->thread.error_code = error_code;
16304 + die_if_kernel("cache flush denied", regs, error_code);
16305 + force_sig(SIGSEGV, current);
16306 + }
16307 +}
16308 +
16309 +#ifndef CONFIG_XEN
16310 +fastcall void do_spurious_interrupt_bug(struct pt_regs * regs,
16311 + long error_code)
16312 +{
16313 +#if 0
16314 + /* No need to warn about this any longer. */
16315 + printk("Ignoring P6 Local APIC Spurious Interrupt Bug...\n");
16316 +#endif
16317 +}
16318 +
16319 +fastcall void setup_x86_bogus_stack(unsigned char * stk)
16320 +{
16321 + unsigned long *switch16_ptr, *switch32_ptr;
16322 + struct pt_regs *regs;
16323 + unsigned long stack_top, stack_bot;
16324 + unsigned short iret_frame16_off;
16325 + int cpu = smp_processor_id();
16326 + /* reserve the space on 32bit stack for the magic switch16 pointer */
16327 + memmove(stk, stk + 8, sizeof(struct pt_regs));
16328 + switch16_ptr = (unsigned long *)(stk + sizeof(struct pt_regs));
16329 + regs = (struct pt_regs *)stk;
16330 + /* now the switch32 on 16bit stack */
16331 + stack_bot = (unsigned long)&per_cpu(cpu_16bit_stack, cpu);
16332 + stack_top = stack_bot + CPU_16BIT_STACK_SIZE;
16333 + switch32_ptr = (unsigned long *)(stack_top - 8);
16334 + iret_frame16_off = CPU_16BIT_STACK_SIZE - 8 - 20;
16335 + /* copy iret frame on 16bit stack */
16336 + memcpy((void *)(stack_bot + iret_frame16_off), &regs->eip, 20);
16337 + /* fill in the switch pointers */
16338 + switch16_ptr[0] = (regs->esp & 0xffff0000) | iret_frame16_off;
16339 + switch16_ptr[1] = __ESPFIX_SS;
16340 + switch32_ptr[0] = (unsigned long)stk + sizeof(struct pt_regs) +
16341 + 8 - CPU_16BIT_STACK_SIZE;
16342 + switch32_ptr[1] = __KERNEL_DS;
16343 +}
16344 +
16345 +fastcall unsigned char * fixup_x86_bogus_stack(unsigned short sp)
16346 +{
16347 + unsigned long *switch32_ptr;
16348 + unsigned char *stack16, *stack32;
16349 + unsigned long stack_top, stack_bot;
16350 + int len;
16351 + int cpu = smp_processor_id();
16352 + stack_bot = (unsigned long)&per_cpu(cpu_16bit_stack, cpu);
16353 + stack_top = stack_bot + CPU_16BIT_STACK_SIZE;
16354 + switch32_ptr = (unsigned long *)(stack_top - 8);
16355 + /* copy the data from 16bit stack to 32bit stack */
16356 + len = CPU_16BIT_STACK_SIZE - 8 - sp;
16357 + stack16 = (unsigned char *)(stack_bot + sp);
16358 + stack32 = (unsigned char *)
16359 + (switch32_ptr[0] + CPU_16BIT_STACK_SIZE - 8 - len);
16360 + memcpy(stack32, stack16, len);
16361 + return stack32;
16362 +}
16363 +#endif
16364 +
16365 +/*
16366 + * 'math_state_restore()' saves the current math information in the
16367 + * old math state array, and gets the new ones from the current task
16368 + *
16369 + * Careful.. There are problems with IBM-designed IRQ13 behaviour.
16370 + * Don't touch unless you *really* know how it works.
16371 + *
16372 + * Must be called with kernel preemption disabled (in this case,
16373 + * local interrupts are disabled at the call-site in entry.S).
16374 + */
16375 +asmlinkage void math_state_restore(struct pt_regs regs)
16376 +{
16377 + struct thread_info *thread = current_thread_info();
16378 + struct task_struct *tsk = thread->task;
16379 +
16380 + /* NB. 'clts' is done for us by Xen during virtual trap. */
16381 + if (!tsk_used_math(tsk))
16382 + init_fpu(tsk);
16383 + restore_fpu(tsk);
16384 + thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */
16385 +}
16386 +
16387 +#ifndef CONFIG_MATH_EMULATION
16388 +
16389 +asmlinkage void math_emulate(long arg)
16390 +{
16391 + printk(KERN_EMERG "math-emulation not enabled and no coprocessor found.\n");
16392 + printk(KERN_EMERG "killing %s.\n",current->comm);
16393 + force_sig(SIGFPE,current);
16394 + schedule();
16395 +}
16396 +
16397 +#endif /* CONFIG_MATH_EMULATION */
16398 +
16399 +#ifdef CONFIG_X86_F00F_BUG
16400 +void __init trap_init_f00f_bug(void)
16401 +{
16402 + __set_fixmap(FIX_F00F_IDT, __pa(&idt_table), PAGE_KERNEL_RO);
16403 +
16404 + /*
16405 + * Update the IDT descriptor and reload the IDT so that
16406 + * it uses the read-only mapped virtual address.
16407 + */
16408 + idt_descr.address = fix_to_virt(FIX_F00F_IDT);
16409 + load_idt(&idt_descr);
16410 +}
16411 +#endif
16412 +
16413 +
16414 +/*
16415 + * NB. All these are "trap gates" (i.e. events_mask isn't set) except
16416 + * for those that specify <dpl>|4 in the second field.
16417 + */
16418 +static trap_info_t trap_table[] = {
16419 + { 0, 0, __KERNEL_CS, (unsigned long)divide_error },
16420 + { 1, 0|4, __KERNEL_CS, (unsigned long)debug },
16421 + { 3, 3|4, __KERNEL_CS, (unsigned long)int3 },
16422 + { 4, 3, __KERNEL_CS, (unsigned long)overflow },
16423 + { 5, 0, __KERNEL_CS, (unsigned long)bounds },
16424 + { 6, 0, __KERNEL_CS, (unsigned long)invalid_op },
16425 + { 7, 0|4, __KERNEL_CS, (unsigned long)device_not_available },
16426 + { 9, 0, __KERNEL_CS, (unsigned long)coprocessor_segment_overrun },
16427 + { 10, 0, __KERNEL_CS, (unsigned long)invalid_TSS },
16428 + { 11, 0, __KERNEL_CS, (unsigned long)segment_not_present },
16429 + { 12, 0, __KERNEL_CS, (unsigned long)stack_segment },
16430 + { 13, 0, __KERNEL_CS, (unsigned long)general_protection },
16431 + { 14, 0|4, __KERNEL_CS, (unsigned long)page_fault },
16432 + { 15, 0, __KERNEL_CS, (unsigned long)fixup_4gb_segment },
16433 + { 16, 0, __KERNEL_CS, (unsigned long)coprocessor_error },
16434 + { 17, 0, __KERNEL_CS, (unsigned long)alignment_check },
16435 +#ifdef CONFIG_X86_MCE
16436 + { 18, 0, __KERNEL_CS, (unsigned long)machine_check },
16437 +#endif
16438 + { 19, 0, __KERNEL_CS, (unsigned long)simd_coprocessor_error },
16439 + { SYSCALL_VECTOR, 3, __KERNEL_CS, (unsigned long)system_call },
16440 + { 0, 0, 0, 0 }
16441 +};
16442 +
16443 +void __init trap_init(void)
16444 +{
16445 + HYPERVISOR_set_trap_table(trap_table);
16446 +
16447 + if (cpu_has_fxsr) {
16448 + /*
16449 + * Verify that the FXSAVE/FXRSTOR data will be 16-byte aligned.
16450 + * Generates a compile-time "error: zero width for bit-field" if
16451 + * the alignment is wrong.
16452 + */
16453 + struct fxsrAlignAssert {
16454 + int _:!(offsetof(struct task_struct,
16455 + thread.i387.fxsave) & 15);
16456 + };
16457 +
16458 + printk(KERN_INFO "Enabling fast FPU save and restore... ");
16459 + set_in_cr4(X86_CR4_OSFXSR);
16460 + printk("done.\n");
16461 + }
16462 + if (cpu_has_xmm) {
16463 + printk(KERN_INFO "Enabling unmasked SIMD FPU exception "
16464 + "support... ");
16465 + set_in_cr4(X86_CR4_OSXMMEXCPT);
16466 + printk("done.\n");
16467 + }
16468 +
16469 + /*
16470 + * Should be a barrier for any external CPU state.
16471 + */
16472 + cpu_init();
16473 +}
16474 +
16475 +void smp_trap_init(trap_info_t *trap_ctxt)
16476 +{
16477 + trap_info_t *t = trap_table;
16478 +
16479 + for (t = trap_table; t->address; t++) {
16480 + trap_ctxt[t->vector].flags = t->flags;
16481 + trap_ctxt[t->vector].cs = t->cs;
16482 + trap_ctxt[t->vector].address = t->address;
16483 + }
16484 +}
16485 +
16486 +static int __init kstack_setup(char *s)
16487 +{
16488 + kstack_depth_to_print = simple_strtoul(s, NULL, 0);
16489 + return 0;
16490 +}
16491 +__setup("kstack=", kstack_setup);
16492 diff -Nur linux-2.6.16.33-noxen/arch/i386/kernel/traps.c linux-2.6.16.33/arch/i386/kernel/traps.c
16493 --- linux-2.6.16.33-noxen/arch/i386/kernel/traps.c 2006-11-22 18:06:31.000000000 +0000
16494 +++ linux-2.6.16.33/arch/i386/kernel/traps.c 2007-01-08 15:00:45.000000000 +0000
16495 @@ -567,18 +567,11 @@
16496
16497 static void io_check_error(unsigned char reason, struct pt_regs * regs)
16498 {
16499 - unsigned long i;
16500 -
16501 printk(KERN_EMERG "NMI: IOCK error (debug interrupt?)\n");
16502 show_registers(regs);
16503
16504 /* Re-enable the IOCK line, wait for a few seconds */
16505 - reason = (reason & 0xf) | 8;
16506 - outb(reason, 0x61);
16507 - i = 2000;
16508 - while (--i) udelay(1000);
16509 - reason &= ~8;
16510 - outb(reason, 0x61);
16511 + clear_io_check_error(reason);
16512 }
16513
16514 static void unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
16515 diff -Nur linux-2.6.16.33-noxen/arch/i386/kernel/vm86.c linux-2.6.16.33/arch/i386/kernel/vm86.c
16516 --- linux-2.6.16.33-noxen/arch/i386/kernel/vm86.c 2006-11-22 18:06:31.000000000 +0000
16517 +++ linux-2.6.16.33/arch/i386/kernel/vm86.c 2007-01-08 15:00:45.000000000 +0000
16518 @@ -98,7 +98,9 @@
16519 struct pt_regs * FASTCALL(save_v86_state(struct kernel_vm86_regs * regs));
16520 struct pt_regs * fastcall save_v86_state(struct kernel_vm86_regs * regs)
16521 {
16522 +#ifndef CONFIG_X86_NO_TSS
16523 struct tss_struct *tss;
16524 +#endif
16525 struct pt_regs *ret;
16526 unsigned long tmp;
16527
16528 @@ -123,12 +125,16 @@
16529 do_exit(SIGSEGV);
16530 }
16531
16532 +#ifndef CONFIG_X86_NO_TSS
16533 tss = &per_cpu(init_tss, get_cpu());
16534 +#endif
16535 current->thread.esp0 = current->thread.saved_esp0;
16536 current->thread.sysenter_cs = __KERNEL_CS;
16537 load_esp0(tss, &current->thread);
16538 current->thread.saved_esp0 = 0;
16539 +#ifndef CONFIG_X86_NO_TSS
16540 put_cpu();
16541 +#endif
16542
16543 loadsegment(fs, current->thread.saved_fs);
16544 loadsegment(gs, current->thread.saved_gs);
16545 @@ -252,7 +258,9 @@
16546
16547 static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk)
16548 {
16549 +#ifndef CONFIG_X86_NO_TSS
16550 struct tss_struct *tss;
16551 +#endif
16552 long eax;
16553 /*
16554 * make sure the vm86() system call doesn't try to do anything silly
16555 @@ -297,12 +305,16 @@
16556 savesegment(fs, tsk->thread.saved_fs);
16557 savesegment(gs, tsk->thread.saved_gs);
16558
16559 +#ifndef CONFIG_X86_NO_TSS
16560 tss = &per_cpu(init_tss, get_cpu());
16561 +#endif
16562 tsk->thread.esp0 = (unsigned long) &info->VM86_TSS_ESP0;
16563 if (cpu_has_sep)
16564 tsk->thread.sysenter_cs = 0;
16565 load_esp0(tss, &tsk->thread);
16566 +#ifndef CONFIG_X86_NO_TSS
16567 put_cpu();
16568 +#endif
16569
16570 tsk->thread.screen_bitmap = info->screen_bitmap;
16571 if (info->flags & VM86_SCREEN_BITMAP)
16572 diff -Nur linux-2.6.16.33-noxen/arch/i386/kernel/vmlinux.lds.S linux-2.6.16.33/arch/i386/kernel/vmlinux.lds.S
16573 --- linux-2.6.16.33-noxen/arch/i386/kernel/vmlinux.lds.S 2006-11-22 18:06:31.000000000 +0000
16574 +++ linux-2.6.16.33/arch/i386/kernel/vmlinux.lds.S 2007-01-08 15:00:45.000000000 +0000
16575 @@ -12,6 +12,12 @@
16576 OUTPUT_ARCH(i386)
16577 ENTRY(phys_startup_32)
16578 jiffies = jiffies_64;
16579 +
16580 +PHDRS {
16581 + text PT_LOAD FLAGS(5); /* R_E */
16582 + data PT_LOAD FLAGS(7); /* RWE */
16583 + note PT_NOTE FLAGS(4); /* R__ */
16584 +}
16585 SECTIONS
16586 {
16587 . = __KERNEL_START;
16588 @@ -25,7 +31,7 @@
16589 KPROBES_TEXT
16590 *(.fixup)
16591 *(.gnu.warning)
16592 - } = 0x9090
16593 + } :text = 0x9090
16594
16595 _etext = .; /* End of text section */
16596
16597 @@ -34,13 +40,20 @@
16598 __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { *(__ex_table) }
16599 __stop___ex_table = .;
16600
16601 + . = ALIGN(16);
16602 + __start_smp_alternatives_table = .;
16603 + __smp_alternatives : AT(ADDR(__smp_alternatives) - LOAD_OFFSET) { *(__smp_alternatives) }
16604 + __stop_smp_alternatives_table = .;
16605 +
16606 + __smp_replacements : AT(ADDR(__smp_replacements) - LOAD_OFFSET) { *(__smp_replacements) }
16607 +
16608 RODATA
16609
16610 /* writeable */
16611 .data : AT(ADDR(.data) - LOAD_OFFSET) { /* Data */
16612 *(.data)
16613 CONSTRUCTORS
16614 - }
16615 + } :data
16616
16617 . = ALIGN(4096);
16618 __nosave_begin = .;
16619 @@ -147,4 +160,6 @@
16620 STABS_DEBUG
16621
16622 DWARF_DEBUG
16623 +
16624 + NOTES
16625 }
16626 diff -Nur linux-2.6.16.33-noxen/arch/i386/kernel/vsyscall-note-xen.S linux-2.6.16.33/arch/i386/kernel/vsyscall-note-xen.S
16627 --- linux-2.6.16.33-noxen/arch/i386/kernel/vsyscall-note-xen.S 1970-01-01 00:00:00.000000000 +0000
16628 +++ linux-2.6.16.33/arch/i386/kernel/vsyscall-note-xen.S 2007-01-08 15:00:45.000000000 +0000
16629 @@ -0,0 +1,32 @@
16630 +/*
16631 + * This supplies .note.* sections to go into the PT_NOTE inside the vDSO text.
16632 + * Here we can supply some information useful to userland.
16633 + * First we get the vanilla i386 note that supplies the kernel version info.
16634 + */
16635 +
16636 +#include "vsyscall-note.S"
16637 +
16638 +/*
16639 + * Now we add a special note telling glibc's dynamic linker a fake hardware
16640 + * flavor that it will use to choose the search path for libraries in the
16641 + * same way it uses real hardware capabilities like "mmx".
16642 + * We supply "nosegneg" as the fake capability, to indicate that we
16643 + * do not like negative offsets in instructions using segment overrides,
16644 + * since we implement those inefficiently. This makes it possible to
16645 + * install libraries optimized to avoid those access patterns in someplace
16646 + * like /lib/i686/tls/nosegneg. Note that an /etc/ld.so.conf.d/file
16647 + * corresponding to the bits here is needed to make ldconfig work right.
16648 + * It should contain:
16649 + * hwcap 0 nosegneg
16650 + * to match the mapping of bit to name that we give here.
16651 + */
16652 +#define NOTE_KERNELCAP_BEGIN(ncaps, mask) \
16653 + ASM_ELF_NOTE_BEGIN(".note.kernelcap", "a", "GNU", 2) \
16654 + .long ncaps, mask
16655 +#define NOTE_KERNELCAP(bit, name) \
16656 + .byte bit; .asciz name
16657 +#define NOTE_KERNELCAP_END ASM_ELF_NOTE_END
16658 +
16659 +NOTE_KERNELCAP_BEGIN(1, 1)
16660 +NOTE_KERNELCAP(1, "nosegneg") /* Change 1 back to 0 when glibc is fixed! */
16661 +NOTE_KERNELCAP_END
16662 diff -Nur linux-2.6.16.33-noxen/arch/i386/mach-xen/Makefile linux-2.6.16.33/arch/i386/mach-xen/Makefile
16663 --- linux-2.6.16.33-noxen/arch/i386/mach-xen/Makefile 1970-01-01 00:00:00.000000000 +0000
16664 +++ linux-2.6.16.33/arch/i386/mach-xen/Makefile 2007-01-08 15:00:45.000000000 +0000
16665 @@ -0,0 +1,5 @@
16666 +#
16667 +# Makefile for the linux kernel.
16668 +#
16669 +
16670 +obj-y := setup.o
16671 diff -Nur linux-2.6.16.33-noxen/arch/i386/mach-xen/setup.c linux-2.6.16.33/arch/i386/mach-xen/setup.c
16672 --- linux-2.6.16.33-noxen/arch/i386/mach-xen/setup.c 1970-01-01 00:00:00.000000000 +0000
16673 +++ linux-2.6.16.33/arch/i386/mach-xen/setup.c 2007-01-08 15:00:45.000000000 +0000
16674 @@ -0,0 +1,37 @@
16675 +/*
16676 + * Machine specific setup for generic
16677 + */
16678 +
16679 +#include <linux/config.h>
16680 +#include <linux/smp.h>
16681 +#include <linux/init.h>
16682 +#include <linux/interrupt.h>
16683 +#include <asm/acpi.h>
16684 +#include <asm/arch_hooks.h>
16685 +
16686 +#ifdef CONFIG_HOTPLUG_CPU
16687 +#define DEFAULT_SEND_IPI (1)
16688 +#else
16689 +#define DEFAULT_SEND_IPI (0)
16690 +#endif
16691 +
16692 +int no_broadcast=DEFAULT_SEND_IPI;
16693 +
16694 +static __init int no_ipi_broadcast(char *str)
16695 +{
16696 + get_option(&str, &no_broadcast);
16697 + printk ("Using %s mode\n", no_broadcast ? "No IPI Broadcast" :
16698 + "IPI Broadcast");
16699 + return 1;
16700 +}
16701 +
16702 +__setup("no_ipi_broadcast", no_ipi_broadcast);
16703 +
16704 +static int __init print_ipi_mode(void)
16705 +{
16706 + printk ("Using IPI %s mode\n", no_broadcast ? "No-Shortcut" :
16707 + "Shortcut");
16708 + return 0;
16709 +}
16710 +
16711 +late_initcall(print_ipi_mode);
16712 diff -Nur linux-2.6.16.33-noxen/arch/i386/mm/Makefile linux-2.6.16.33/arch/i386/mm/Makefile
16713 --- linux-2.6.16.33-noxen/arch/i386/mm/Makefile 2006-11-22 18:06:31.000000000 +0000
16714 +++ linux-2.6.16.33/arch/i386/mm/Makefile 2007-01-08 15:00:45.000000000 +0000
16715 @@ -8,3 +8,11 @@
16716 obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
16717 obj-$(CONFIG_HIGHMEM) += highmem.o
16718 obj-$(CONFIG_BOOT_IOREMAP) += boot_ioremap.o
16719 +
16720 +ifdef CONFIG_XEN
16721 +include $(srctree)/scripts/Makefile.xen
16722 +
16723 +obj-y += hypervisor.o
16724 +
16725 +obj-y := $(call cherrypickxen, $(obj-y))
16726 +endif
16727 diff -Nur linux-2.6.16.33-noxen/arch/i386/mm/fault-xen.c linux-2.6.16.33/arch/i386/mm/fault-xen.c
16728 --- linux-2.6.16.33-noxen/arch/i386/mm/fault-xen.c 1970-01-01 00:00:00.000000000 +0000
16729 +++ linux-2.6.16.33/arch/i386/mm/fault-xen.c 2007-01-08 15:00:45.000000000 +0000
16730 @@ -0,0 +1,662 @@
16731 +/*
16732 + * linux/arch/i386/mm/fault.c
16733 + *
16734 + * Copyright (C) 1995 Linus Torvalds
16735 + */
16736 +
16737 +#include <linux/signal.h>
16738 +#include <linux/sched.h>
16739 +#include <linux/kernel.h>
16740 +#include <linux/errno.h>
16741 +#include <linux/string.h>
16742 +#include <linux/types.h>
16743 +#include <linux/ptrace.h>
16744 +#include <linux/mman.h>
16745 +#include <linux/mm.h>
16746 +#include <linux/smp.h>
16747 +#include <linux/smp_lock.h>
16748 +#include <linux/interrupt.h>
16749 +#include <linux/init.h>
16750 +#include <linux/tty.h>
16751 +#include <linux/vt_kern.h> /* For unblank_screen() */
16752 +#include <linux/highmem.h>
16753 +#include <linux/module.h>
16754 +#include <linux/kprobes.h>
16755 +
16756 +#include <asm/system.h>
16757 +#include <asm/uaccess.h>
16758 +#include <asm/desc.h>
16759 +#include <asm/kdebug.h>
16760 +
16761 +extern void die(const char *,struct pt_regs *,long);
16762 +
16763 +/*
16764 + * Unlock any spinlocks which will prevent us from getting the
16765 + * message out
16766 + */
16767 +void bust_spinlocks(int yes)
16768 +{
16769 + int loglevel_save = console_loglevel;
16770 +
16771 + if (yes) {
16772 + oops_in_progress = 1;
16773 + return;
16774 + }
16775 +#ifdef CONFIG_VT
16776 + unblank_screen();
16777 +#endif
16778 + oops_in_progress = 0;
16779 + /*
16780 + * OK, the message is on the console. Now we call printk()
16781 + * without oops_in_progress set so that printk will give klogd
16782 + * a poke. Hold onto your hats...
16783 + */
16784 + console_loglevel = 15; /* NMI oopser may have shut the console up */
16785 + printk(" ");
16786 + console_loglevel = loglevel_save;
16787 +}
16788 +
16789 +/*
16790 + * Return EIP plus the CS segment base. The segment limit is also
16791 + * adjusted, clamped to the kernel/user address space (whichever is
16792 + * appropriate), and returned in *eip_limit.
16793 + *
16794 + * The segment is checked, because it might have been changed by another
16795 + * task between the original faulting instruction and here.
16796 + *
16797 + * If CS is no longer a valid code segment, or if EIP is beyond the
16798 + * limit, or if it is a kernel address when CS is not a kernel segment,
16799 + * then the returned value will be greater than *eip_limit.
16800 + *
16801 + * This is slow, but is very rarely executed.
16802 + */
16803 +static inline unsigned long get_segment_eip(struct pt_regs *regs,
16804 + unsigned long *eip_limit)
16805 +{
16806 + unsigned long eip = regs->eip;
16807 + unsigned seg = regs->xcs & 0xffff;
16808 + u32 seg_ar, seg_limit, base, *desc;
16809 +
16810 + /* The standard kernel/user address space limit. */
16811 + *eip_limit = (seg & 2) ? USER_DS.seg : KERNEL_DS.seg;
16812 +
16813 + /* Unlikely, but must come before segment checks. */
16814 + if (unlikely((regs->eflags & VM_MASK) != 0))
16815 + return eip + (seg << 4);
16816 +
16817 + /* By far the most common cases. */
16818 + if (likely(seg == __USER_CS || seg == GET_KERNEL_CS()))
16819 + return eip;
16820 +
16821 + /* Check the segment exists, is within the current LDT/GDT size,
16822 + that kernel/user (ring 0..3) has the appropriate privilege,
16823 + that it's a code segment, and get the limit. */
16824 + __asm__ ("larl %3,%0; lsll %3,%1"
16825 + : "=&r" (seg_ar), "=r" (seg_limit) : "0" (0), "rm" (seg));
16826 + if ((~seg_ar & 0x9800) || eip > seg_limit) {
16827 + *eip_limit = 0;
16828 + return 1; /* So that returned eip > *eip_limit. */
16829 + }
16830 +
16831 + /* Get the GDT/LDT descriptor base.
16832 + When you look for races in this code remember that
16833 + LDT and other horrors are only used in user space. */
16834 + if (seg & (1<<2)) {
16835 + /* Must lock the LDT while reading it. */
16836 + down(&current->mm->context.sem);
16837 + desc = current->mm->context.ldt;
16838 + desc = (void *)desc + (seg & ~7);
16839 + } else {
16840 + /* Must disable preemption while reading the GDT. */
16841 + desc = (u32 *)get_cpu_gdt_table(get_cpu());
16842 + desc = (void *)desc + (seg & ~7);
16843 + }
16844 +
16845 + /* Decode the code segment base from the descriptor */
16846 + base = get_desc_base((unsigned long *)desc);
16847 +
16848 + if (seg & (1<<2)) {
16849 + up(&current->mm->context.sem);
16850 + } else
16851 + put_cpu();
16852 +
16853 + /* Adjust EIP and segment limit, and clamp at the kernel limit.
16854 + It's legitimate for segments to wrap at 0xffffffff. */
16855 + seg_limit += base;
16856 + if (seg_limit < *eip_limit && seg_limit >= base)
16857 + *eip_limit = seg_limit;
16858 + return eip + base;
16859 +}
16860 +
16861 +/*
16862 + * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
16863 + * Check that here and ignore it.
16864 + */
16865 +static int __is_prefetch(struct pt_regs *regs, unsigned long addr)
16866 +{
16867 + unsigned long limit;
16868 + unsigned long instr = get_segment_eip (regs, &limit);
16869 + int scan_more = 1;
16870 + int prefetch = 0;
16871 + int i;
16872 +
16873 + for (i = 0; scan_more && i < 15; i++) {
16874 + unsigned char opcode;
16875 + unsigned char instr_hi;
16876 + unsigned char instr_lo;
16877 +
16878 + if (instr > limit)
16879 + break;
16880 + if (__get_user(opcode, (unsigned char __user *) instr))
16881 + break;
16882 +
16883 + instr_hi = opcode & 0xf0;
16884 + instr_lo = opcode & 0x0f;
16885 + instr++;
16886 +
16887 + switch (instr_hi) {
16888 + case 0x20:
16889 + case 0x30:
16890 + /* Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes. */
16891 + scan_more = ((instr_lo & 7) == 0x6);
16892 + break;
16893 +
16894 + case 0x60:
16895 + /* 0x64 thru 0x67 are valid prefixes in all modes. */
16896 + scan_more = (instr_lo & 0xC) == 0x4;
16897 + break;
16898 + case 0xF0:
16899 + /* 0xF0, 0xF2, and 0xF3 are valid prefixes */
16900 + scan_more = !instr_lo || (instr_lo>>1) == 1;
16901 + break;
16902 + case 0x00:
16903 + /* Prefetch instruction is 0x0F0D or 0x0F18 */
16904 + scan_more = 0;
16905 + if (instr > limit)
16906 + break;
16907 + if (__get_user(opcode, (unsigned char __user *) instr))
16908 + break;
16909 + prefetch = (instr_lo == 0xF) &&
16910 + (opcode == 0x0D || opcode == 0x18);
16911 + break;
16912 + default:
16913 + scan_more = 0;
16914 + break;
16915 + }
16916 + }
16917 + return prefetch;
16918 +}
16919 +
16920 +static inline int is_prefetch(struct pt_regs *regs, unsigned long addr,
16921 + unsigned long error_code)
16922 +{
16923 + if (unlikely(boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
16924 + boot_cpu_data.x86 >= 6)) {
16925 + /* Catch an obscure case of prefetch inside an NX page. */
16926 + if (nx_enabled && (error_code & 16))
16927 + return 0;
16928 + return __is_prefetch(regs, addr);
16929 + }
16930 + return 0;
16931 +}
16932 +
16933 +static noinline void force_sig_info_fault(int si_signo, int si_code,
16934 + unsigned long address, struct task_struct *tsk)
16935 +{
16936 + siginfo_t info;
16937 +
16938 + info.si_signo = si_signo;
16939 + info.si_errno = 0;
16940 + info.si_code = si_code;
16941 + info.si_addr = (void __user *)address;
16942 + force_sig_info(si_signo, &info, tsk);
16943 +}
16944 +
16945 +fastcall void do_invalid_op(struct pt_regs *, unsigned long);
16946 +
16947 +#ifdef CONFIG_X86_PAE
16948 +static void dump_fault_path(unsigned long address)
16949 +{
16950 + unsigned long *p, page;
16951 + unsigned long mfn;
16952 +
16953 + page = read_cr3();
16954 + p = (unsigned long *)__va(page);
16955 + p += (address >> 30) * 2;
16956 + printk(KERN_ALERT "%08lx -> *pde = %08lx:%08lx\n", page, p[1], p[0]);
16957 + if (p[0] & 1) {
16958 + mfn = (p[0] >> PAGE_SHIFT) | ((p[1] & 0x7) << 20);
16959 + page = mfn_to_pfn(mfn) << PAGE_SHIFT;
16960 + p = (unsigned long *)__va(page);
16961 + address &= 0x3fffffff;
16962 + p += (address >> 21) * 2;
16963 + printk(KERN_ALERT "%08lx -> *pme = %08lx:%08lx\n",
16964 + page, p[1], p[0]);
16965 +#ifndef CONFIG_HIGHPTE
16966 + if (p[0] & 1) {
16967 + mfn = (p[0] >> PAGE_SHIFT) | ((p[1] & 0x7) << 20);
16968 + page = mfn_to_pfn(mfn) << PAGE_SHIFT;
16969 + p = (unsigned long *) __va(page);
16970 + address &= 0x001fffff;
16971 + p += (address >> 12) * 2;
16972 + printk(KERN_ALERT "%08lx -> *pte = %08lx:%08lx\n",
16973 + page, p[1], p[0]);
16974 + }
16975 +#endif
16976 + }
16977 +}
16978 +#else
16979 +static void dump_fault_path(unsigned long address)
16980 +{
16981 + unsigned long page;
16982 +
16983 + page = read_cr3();
16984 + page = ((unsigned long *) __va(page))[address >> 22];
16985 + printk(KERN_ALERT "*pde = ma %08lx pa %08lx\n", page,
16986 + machine_to_phys(page));
16987 + /*
16988 + * We must not directly access the pte in the highpte
16989 + * case, the page table might be allocated in highmem.
16990 + * And lets rather not kmap-atomic the pte, just in case
16991 + * it's allocated already.
16992 + */
16993 +#ifndef CONFIG_HIGHPTE
16994 + if (page & 1) {
16995 + page &= PAGE_MASK;
16996 + address &= 0x003ff000;
16997 + page = machine_to_phys(page);
16998 + page = ((unsigned long *) __va(page))[address >> PAGE_SHIFT];
16999 + printk(KERN_ALERT "*pte = ma %08lx pa %08lx\n", page,
17000 + machine_to_phys(page));
17001 + }
17002 +#endif
17003 +}
17004 +#endif
17005 +
17006 +static int spurious_fault(struct pt_regs *regs,
17007 + unsigned long address,
17008 + unsigned long error_code)
17009 +{
17010 + pgd_t *pgd;
17011 + pud_t *pud;
17012 + pmd_t *pmd;
17013 + pte_t *pte;
17014 +
17015 + /* Reserved-bit violation or user access to kernel space? */
17016 + if (error_code & 0x0c)
17017 + return 0;
17018 +
17019 + pgd = init_mm.pgd + pgd_index(address);
17020 + if (!pgd_present(*pgd))
17021 + return 0;
17022 +
17023 + pud = pud_offset(pgd, address);
17024 + if (!pud_present(*pud))
17025 + return 0;
17026 +
17027 + pmd = pmd_offset(pud, address);
17028 + if (!pmd_present(*pmd))
17029 + return 0;
17030 +
17031 + pte = pte_offset_kernel(pmd, address);
17032 + if (!pte_present(*pte))
17033 + return 0;
17034 + if ((error_code & 0x02) && !pte_write(*pte))
17035 + return 0;
17036 +#ifdef CONFIG_X86_PAE
17037 + if ((error_code & 0x10) && (pte_val(*pte) & _PAGE_NX))
17038 + return 0;
17039 +#endif
17040 +
17041 + return 1;
17042 +}
17043 +
17044 +/*
17045 + * This routine handles page faults. It determines the address,
17046 + * and the problem, and then passes it off to one of the appropriate
17047 + * routines.
17048 + *
17049 + * error_code:
17050 + * bit 0 == 0 means no page found, 1 means protection fault
17051 + * bit 1 == 0 means read, 1 means write
17052 + * bit 2 == 0 means kernel, 1 means user-mode
17053 + */
17054 +fastcall void __kprobes do_page_fault(struct pt_regs *regs,
17055 + unsigned long error_code)
17056 +{
17057 + struct task_struct *tsk;
17058 + struct mm_struct *mm;
17059 + struct vm_area_struct * vma;
17060 + unsigned long address;
17061 + int write, si_code;
17062 +
17063 + /* get the address */
17064 + address = read_cr2();
17065 +
17066 + /* Set the "privileged fault" bit to something sane. */
17067 + error_code &= ~4;
17068 + error_code |= (regs->xcs & 2) << 1;
17069 + if (regs->eflags & X86_EFLAGS_VM)
17070 + error_code |= 4;
17071 +
17072 + if (notify_die(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
17073 + SIGSEGV) == NOTIFY_STOP)
17074 + return;
17075 + /* It's safe to allow irq's after cr2 has been saved */
17076 + if (regs->eflags & (X86_EFLAGS_IF|VM_MASK))
17077 + local_irq_enable();
17078 +
17079 + tsk = current;
17080 +
17081 + si_code = SEGV_MAPERR;
17082 +
17083 + /*
17084 + * We fault-in kernel-space virtual memory on-demand. The
17085 + * 'reference' page table is init_mm.pgd.
17086 + *
17087 + * NOTE! We MUST NOT take any locks for this case. We may
17088 + * be in an interrupt or a critical region, and should
17089 + * only copy the information from the master page table,
17090 + * nothing more.
17091 + *
17092 + * This verifies that the fault happens in kernel space
17093 + * (error_code & 4) == 0, and that the fault was not a
17094 + * protection error (error_code & 1) == 0.
17095 + */
17096 + if (unlikely(address >= TASK_SIZE)) {
17097 +#ifdef CONFIG_XEN
17098 + /* Faults in hypervisor area can never be patched up. */
17099 + if (address >= hypervisor_virt_start)
17100 + goto bad_area_nosemaphore;
17101 +#endif
17102 + if (!(error_code & 5))
17103 + goto vmalloc_fault;
17104 + /* Can take a spurious fault if mapping changes R/O -> R/W. */
17105 + if (spurious_fault(regs, address, error_code))
17106 + return;
17107 + /*
17108 + * Don't take the mm semaphore here. If we fixup a prefetch
17109 + * fault we could otherwise deadlock.
17110 + */
17111 + goto bad_area_nosemaphore;
17112 + }
17113 +
17114 + mm = tsk->mm;
17115 +
17116 + /*
17117 + * If we're in an interrupt, have no user context or are running in an
17118 + * atomic region then we must not take the fault..
17119 + */
17120 + if (in_atomic() || !mm)
17121 + goto bad_area_nosemaphore;
17122 +
17123 + /* When running in the kernel we expect faults to occur only to
17124 + * addresses in user space. All other faults represent errors in the
17125 + * kernel and should generate an OOPS. Unfortunately, in the case of an
17126 + * erroneous fault occurring in a code path which already holds mmap_sem
17127 + * we will deadlock attempting to validate the fault against the
17128 + * address space. Luckily the kernel only validly references user
17129 + * space from well defined areas of code, which are listed in the
17130 + * exceptions table.
17131 + *
17132 + * As the vast majority of faults will be valid we will only perform
17133 + * the source reference check when there is a possibility of a deadlock.
17134 + * Attempt to lock the address space, if we cannot we then validate the
17135 + * source. If this is invalid we can skip the address space check,
17136 + * thus avoiding the deadlock.
17137 + */
17138 + if (!down_read_trylock(&mm->mmap_sem)) {
17139 + if ((error_code & 4) == 0 &&
17140 + !search_exception_tables(regs->eip))
17141 + goto bad_area_nosemaphore;
17142 + down_read(&mm->mmap_sem);
17143 + }
17144 +
17145 + vma = find_vma(mm, address);
17146 + if (!vma)
17147 + goto bad_area;
17148 + if (vma->vm_start <= address)
17149 + goto good_area;
17150 + if (!(vma->vm_flags & VM_GROWSDOWN))
17151 + goto bad_area;
17152 + if (error_code & 4) {
17153 + /*
17154 + * accessing the stack below %esp is always a bug.
17155 + * The "+ 32" is there due to some instructions (like
17156 + * pusha) doing post-decrement on the stack and that
17157 + * doesn't show up until later..
17158 + */
17159 + if (address + 32 < regs->esp)
17160 + goto bad_area;
17161 + }
17162 + if (expand_stack(vma, address))
17163 + goto bad_area;
17164 +/*
17165 + * Ok, we have a good vm_area for this memory access, so
17166 + * we can handle it..
17167 + */
17168 +good_area:
17169 + si_code = SEGV_ACCERR;
17170 + write = 0;
17171 + switch (error_code & 3) {
17172 + default: /* 3: write, present */
17173 +#ifdef TEST_VERIFY_AREA
17174 + if (regs->cs == GET_KERNEL_CS())
17175 + printk("WP fault at %08lx\n", regs->eip);
17176 +#endif
17177 + /* fall through */
17178 + case 2: /* write, not present */
17179 + if (!(vma->vm_flags & VM_WRITE))
17180 + goto bad_area;
17181 + write++;
17182 + break;
17183 + case 1: /* read, present */
17184 + goto bad_area;
17185 + case 0: /* read, not present */
17186 + if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
17187 + goto bad_area;
17188 + }
17189 +
17190 + survive:
17191 + /*
17192 + * If for any reason at all we couldn't handle the fault,
17193 + * make sure we exit gracefully rather than endlessly redo
17194 + * the fault.
17195 + */
17196 + switch (handle_mm_fault(mm, vma, address, write)) {
17197 + case VM_FAULT_MINOR:
17198 + tsk->min_flt++;
17199 + break;
17200 + case VM_FAULT_MAJOR:
17201 + tsk->maj_flt++;
17202 + break;
17203 + case VM_FAULT_SIGBUS:
17204 + goto do_sigbus;
17205 + case VM_FAULT_OOM:
17206 + goto out_of_memory;
17207 + default:
17208 + BUG();
17209 + }
17210 +
17211 + /*
17212 + * Did it hit the DOS screen memory VA from vm86 mode?
17213 + */
17214 + if (regs->eflags & VM_MASK) {
17215 + unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT;
17216 + if (bit < 32)
17217 + tsk->thread.screen_bitmap |= 1 << bit;
17218 + }
17219 + up_read(&mm->mmap_sem);
17220 + return;
17221 +
17222 +/*
17223 + * Something tried to access memory that isn't in our memory map..
17224 + * Fix it, but check if it's kernel or user first..
17225 + */
17226 +bad_area:
17227 + up_read(&mm->mmap_sem);
17228 +
17229 +bad_area_nosemaphore:
17230 + /* User mode accesses just cause a SIGSEGV */
17231 + if (error_code & 4) {
17232 + /*
17233 + * Valid to do another page fault here because this one came
17234 + * from user space.
17235 + */
17236 + if (is_prefetch(regs, address, error_code))
17237 + return;
17238 +
17239 + tsk->thread.cr2 = address;
17240 + /* Kernel addresses are always protection faults */
17241 + tsk->thread.error_code = error_code | (address >= TASK_SIZE);
17242 + tsk->thread.trap_no = 14;
17243 + force_sig_info_fault(SIGSEGV, si_code, address, tsk);
17244 + return;
17245 + }
17246 +
17247 +#ifdef CONFIG_X86_F00F_BUG
17248 + /*
17249 + * Pentium F0 0F C7 C8 bug workaround.
17250 + */
17251 + if (boot_cpu_data.f00f_bug) {
17252 + unsigned long nr;
17253 +
17254 + nr = (address - idt_descr.address) >> 3;
17255 +
17256 + if (nr == 6) {
17257 + do_invalid_op(regs, 0);
17258 + return;
17259 + }
17260 + }
17261 +#endif
17262 +
17263 +no_context:
17264 + /* Are we prepared to handle this kernel fault? */
17265 + if (fixup_exception(regs))
17266 + return;
17267 +
17268 + /*
17269 + * Valid to do another page fault here, because if this fault
17270 + * had been triggered by is_prefetch fixup_exception would have
17271 + * handled it.
17272 + */
17273 + if (is_prefetch(regs, address, error_code))
17274 + return;
17275 +
17276 +/*
17277 + * Oops. The kernel tried to access some bad page. We'll have to
17278 + * terminate things with extreme prejudice.
17279 + */
17280 +
17281 + bust_spinlocks(1);
17282 +
17283 +#ifdef CONFIG_X86_PAE
17284 + if (error_code & 16) {
17285 + pte_t *pte = lookup_address(address);
17286 +
17287 + if (pte && pte_present(*pte) && !pte_exec_kernel(*pte))
17288 + printk(KERN_CRIT "kernel tried to execute NX-protected page - exploit attempt? (uid: %d)\n", current->uid);
17289 + }
17290 +#endif
17291 + if (address < PAGE_SIZE)
17292 + printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference");
17293 + else
17294 + printk(KERN_ALERT "Unable to handle kernel paging request");
17295 + printk(" at virtual address %08lx\n",address);
17296 + printk(KERN_ALERT " printing eip:\n");
17297 + printk("%08lx\n", regs->eip);
17298 + dump_fault_path(address);
17299 + tsk->thread.cr2 = address;
17300 + tsk->thread.trap_no = 14;
17301 + tsk->thread.error_code = error_code;
17302 + die("Oops", regs, error_code);
17303 + bust_spinlocks(0);
17304 + do_exit(SIGKILL);
17305 +
17306 +/*
17307 + * We ran out of memory, or some other thing happened to us that made
17308 + * us unable to handle the page fault gracefully.
17309 + */
17310 +out_of_memory:
17311 + up_read(&mm->mmap_sem);
17312 + if (tsk->pid == 1) {
17313 + yield();
17314 + down_read(&mm->mmap_sem);
17315 + goto survive;
17316 + }
17317 + printk("VM: killing process %s\n", tsk->comm);
17318 + if (error_code & 4)
17319 + do_exit(SIGKILL);
17320 + goto no_context;
17321 +
17322 +do_sigbus:
17323 + up_read(&mm->mmap_sem);
17324 +
17325 + /* Kernel mode? Handle exceptions or die */
17326 + if (!(error_code & 4))
17327 + goto no_context;
17328 +
17329 + /* User space => ok to do another page fault */
17330 + if (is_prefetch(regs, address, error_code))
17331 + return;
17332 +
17333 + tsk->thread.cr2 = address;
17334 + tsk->thread.error_code = error_code;
17335 + tsk->thread.trap_no = 14;
17336 + force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
17337 + return;
17338 +
17339 +vmalloc_fault:
17340 + {
17341 + /*
17342 + * Synchronize this task's top level page-table
17343 + * with the 'reference' page table.
17344 + *
17345 + * Do _not_ use "tsk" here. We might be inside
17346 + * an interrupt in the middle of a task switch..
17347 + */
17348 + int index = pgd_index(address);
17349 + unsigned long pgd_paddr;
17350 + pgd_t *pgd, *pgd_k;
17351 + pud_t *pud, *pud_k;
17352 + pmd_t *pmd, *pmd_k;
17353 + pte_t *pte_k;
17354 +
17355 + pgd_paddr = read_cr3();
17356 + pgd = index + (pgd_t *)__va(pgd_paddr);
17357 + pgd_k = init_mm.pgd + index;
17358 +
17359 + if (!pgd_present(*pgd_k))
17360 + goto no_context;
17361 +
17362 + /*
17363 + * set_pgd(pgd, *pgd_k); here would be useless on PAE
17364 + * and redundant with the set_pmd() on non-PAE. As would
17365 + * set_pud.
17366 + */
17367 +
17368 + pud = pud_offset(pgd, address);
17369 + pud_k = pud_offset(pgd_k, address);
17370 + if (!pud_present(*pud_k))
17371 + goto no_context;
17372 +
17373 + pmd = pmd_offset(pud, address);
17374 + pmd_k = pmd_offset(pud_k, address);
17375 + if (!pmd_present(*pmd_k))
17376 + goto no_context;
17377 +#ifndef CONFIG_XEN
17378 + set_pmd(pmd, *pmd_k);
17379 +#else
17380 + /*
17381 + * When running on Xen we must launder *pmd_k through
17382 + * pmd_val() to ensure that _PAGE_PRESENT is correctly set.
17383 + */
17384 + set_pmd(pmd, __pmd(pmd_val(*pmd_k)));
17385 +#endif
17386 +
17387 + pte_k = pte_offset_kernel(pmd_k, address);
17388 + if (!pte_present(*pte_k))
17389 + goto no_context;
17390 + return;
17391 + }
17392 +}
17393 diff -Nur linux-2.6.16.33-noxen/arch/i386/mm/highmem-xen.c linux-2.6.16.33/arch/i386/mm/highmem-xen.c
17394 --- linux-2.6.16.33-noxen/arch/i386/mm/highmem-xen.c 1970-01-01 00:00:00.000000000 +0000
17395 +++ linux-2.6.16.33/arch/i386/mm/highmem-xen.c 2007-01-08 15:00:45.000000000 +0000
17396 @@ -0,0 +1,133 @@
17397 +#include <linux/highmem.h>
17398 +#include <linux/module.h>
17399 +
17400 +void *kmap(struct page *page)
17401 +{
17402 + might_sleep();
17403 + if (!PageHighMem(page))
17404 + return page_address(page);
17405 + return kmap_high(page);
17406 +}
17407 +
17408 +void kunmap(struct page *page)
17409 +{
17410 + if (in_interrupt())
17411 + BUG();
17412 + if (!PageHighMem(page))
17413 + return;
17414 + kunmap_high(page);
17415 +}
17416 +
17417 +/*
17418 + * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because
17419 + * no global lock is needed and because the kmap code must perform a global TLB
17420 + * invalidation when the kmap pool wraps.
17421 + *
17422 + * However, when holding an atomic kmap it is not legal to sleep, so atomic
17423 + * kmaps are appropriate for short, tight code paths only.
17424 + */
17425 +static void *__kmap_atomic(struct page *page, enum km_type type, pgprot_t prot)
17426 +{
17427 + enum fixed_addresses idx;
17428 + unsigned long vaddr;
17429 +
17430 + /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */
17431 + inc_preempt_count();
17432 + if (!PageHighMem(page))
17433 + return page_address(page);
17434 +
17435 + idx = type + KM_TYPE_NR*smp_processor_id();
17436 + vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
17437 +#ifdef CONFIG_DEBUG_HIGHMEM
17438 + if (!pte_none(*(kmap_pte-idx)))
17439 + BUG();
17440 +#endif
17441 + set_pte_at_sync(&init_mm, vaddr, kmap_pte-idx, mk_pte(page, prot));
17442 +
17443 + return (void*) vaddr;
17444 +}
17445 +
17446 +void *kmap_atomic(struct page *page, enum km_type type)
17447 +{
17448 + return __kmap_atomic(page, type, kmap_prot);
17449 +}
17450 +
17451 +/* Same as kmap_atomic but with PAGE_KERNEL_RO page protection. */
17452 +void *kmap_atomic_pte(struct page *page, enum km_type type)
17453 +{
17454 + return __kmap_atomic(page, type, PAGE_KERNEL_RO);
17455 +}
17456 +
17457 +void kunmap_atomic(void *kvaddr, enum km_type type)
17458 +{
17459 +#if defined(CONFIG_DEBUG_HIGHMEM) || defined(CONFIG_XEN)
17460 + unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK;
17461 + enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id();
17462 +
17463 + if (vaddr < FIXADDR_START) { // FIXME
17464 + dec_preempt_count();
17465 + preempt_check_resched();
17466 + return;
17467 + }
17468 +#endif
17469 +
17470 +#if defined(CONFIG_DEBUG_HIGHMEM)
17471 + if (vaddr != __fix_to_virt(FIX_KMAP_BEGIN+idx))
17472 + BUG();
17473 +
17474 + /*
17475 + * force other mappings to Oops if they'll try to access
17476 + * this pte without first remapping it
17477 + */
17478 + pte_clear(&init_mm, vaddr, kmap_pte-idx);
17479 + __flush_tlb_one(vaddr);
17480 +#elif defined(CONFIG_XEN)
17481 + /*
17482 + * We must ensure there are no dangling pagetable references when
17483 + * returning memory to Xen (decrease_reservation).
17484 + * XXX TODO: We could make this faster by only zapping when
17485 + * kmap_flush_unused is called but that is trickier and more invasive.
17486 + */
17487 + pte_clear(&init_mm, vaddr, kmap_pte-idx);
17488 +#endif
17489 +
17490 + dec_preempt_count();
17491 + preempt_check_resched();
17492 +}
17493 +
17494 +/* This is the same as kmap_atomic() but can map memory that doesn't
17495 + * have a struct page associated with it.
17496 + */
17497 +void *kmap_atomic_pfn(unsigned long pfn, enum km_type type)
17498 +{
17499 + enum fixed_addresses idx;
17500 + unsigned long vaddr;
17501 +
17502 + inc_preempt_count();
17503 +
17504 + idx = type + KM_TYPE_NR*smp_processor_id();
17505 + vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
17506 + set_pte(kmap_pte-idx, pfn_pte(pfn, kmap_prot));
17507 + __flush_tlb_one(vaddr);
17508 +
17509 + return (void*) vaddr;
17510 +}
17511 +
17512 +struct page *kmap_atomic_to_page(void *ptr)
17513 +{
17514 + unsigned long idx, vaddr = (unsigned long)ptr;
17515 + pte_t *pte;
17516 +
17517 + if (vaddr < FIXADDR_START)
17518 + return virt_to_page(ptr);
17519 +
17520 + idx = virt_to_fix(vaddr);
17521 + pte = kmap_pte - (idx - FIX_KMAP_BEGIN);
17522 + return pte_page(*pte);
17523 +}
17524 +
17525 +EXPORT_SYMBOL(kmap);
17526 +EXPORT_SYMBOL(kunmap);
17527 +EXPORT_SYMBOL(kmap_atomic);
17528 +EXPORT_SYMBOL(kunmap_atomic);
17529 +EXPORT_SYMBOL(kmap_atomic_to_page);
17530 diff -Nur linux-2.6.16.33-noxen/arch/i386/mm/hypervisor.c linux-2.6.16.33/arch/i386/mm/hypervisor.c
17531 --- linux-2.6.16.33-noxen/arch/i386/mm/hypervisor.c 1970-01-01 00:00:00.000000000 +0000
17532 +++ linux-2.6.16.33/arch/i386/mm/hypervisor.c 2007-01-08 15:00:45.000000000 +0000
17533 @@ -0,0 +1,450 @@
17534 +/******************************************************************************
17535 + * mm/hypervisor.c
17536 + *
17537 + * Update page tables via the hypervisor.
17538 + *
17539 + * Copyright (c) 2002-2004, K A Fraser
17540 + *
17541 + * This program is free software; you can redistribute it and/or
17542 + * modify it under the terms of the GNU General Public License version 2
17543 + * as published by the Free Software Foundation; or, when distributed
17544 + * separately from the Linux kernel or incorporated into other
17545 + * software packages, subject to the following license:
17546 + *
17547 + * Permission is hereby granted, free of charge, to any person obtaining a copy
17548 + * of this source file (the "Software"), to deal in the Software without
17549 + * restriction, including without limitation the rights to use, copy, modify,
17550 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
17551 + * and to permit persons to whom the Software is furnished to do so, subject to
17552 + * the following conditions:
17553 + *
17554 + * The above copyright notice and this permission notice shall be included in
17555 + * all copies or substantial portions of the Software.
17556 + *
17557 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17558 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17559 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17560 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17561 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
17562 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
17563 + * IN THE SOFTWARE.
17564 + */
17565 +
17566 +#include <linux/config.h>
17567 +#include <linux/sched.h>
17568 +#include <linux/mm.h>
17569 +#include <linux/vmalloc.h>
17570 +#include <asm/page.h>
17571 +#include <asm/pgtable.h>
17572 +#include <asm/hypervisor.h>
17573 +#include <xen/balloon.h>
17574 +#include <xen/features.h>
17575 +#include <xen/interface/memory.h>
17576 +#include <linux/module.h>
17577 +#include <linux/percpu.h>
17578 +#include <asm/tlbflush.h>
17579 +
17580 +#ifdef CONFIG_X86_64
17581 +#define pmd_val_ma(v) (v).pmd
17582 +#else
17583 +#ifdef CONFIG_X86_PAE
17584 +# define pmd_val_ma(v) ((v).pmd)
17585 +# define pud_val_ma(v) ((v).pgd.pgd)
17586 +#else
17587 +# define pmd_val_ma(v) ((v).pud.pgd.pgd)
17588 +#endif
17589 +#endif
17590 +
17591 +void xen_l1_entry_update(pte_t *ptr, pte_t val)
17592 +{
17593 + mmu_update_t u;
17594 + u.ptr = virt_to_machine(ptr);
17595 + u.val = pte_val_ma(val);
17596 + BUG_ON(HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0);
17597 +}
17598 +
17599 +void xen_l2_entry_update(pmd_t *ptr, pmd_t val)
17600 +{
17601 + mmu_update_t u;
17602 + u.ptr = virt_to_machine(ptr);
17603 + u.val = pmd_val_ma(val);
17604 + BUG_ON(HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0);
17605 +}
17606 +
17607 +#ifdef CONFIG_X86_PAE
17608 +void xen_l3_entry_update(pud_t *ptr, pud_t val)
17609 +{
17610 + mmu_update_t u;
17611 + u.ptr = virt_to_machine(ptr);
17612 + u.val = pud_val_ma(val);
17613 + BUG_ON(HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0);
17614 +}
17615 +#endif
17616 +
17617 +#ifdef CONFIG_X86_64
17618 +void xen_l3_entry_update(pud_t *ptr, pud_t val)
17619 +{
17620 + mmu_update_t u;
17621 + u.ptr = virt_to_machine(ptr);
17622 + u.val = val.pud;
17623 + BUG_ON(HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0);
17624 +}
17625 +
17626 +void xen_l4_entry_update(pgd_t *ptr, pgd_t val)
17627 +{
17628 + mmu_update_t u;
17629 + u.ptr = virt_to_machine(ptr);
17630 + u.val = val.pgd;
17631 + BUG_ON(HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0);
17632 +}
17633 +#endif /* CONFIG_X86_64 */
17634 +
17635 +void xen_pt_switch(unsigned long ptr)
17636 +{
17637 + struct mmuext_op op;
17638 + op.cmd = MMUEXT_NEW_BASEPTR;
17639 + op.arg1.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT);
17640 + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
17641 +}
17642 +
17643 +void xen_new_user_pt(unsigned long ptr)
17644 +{
17645 + struct mmuext_op op;
17646 + op.cmd = MMUEXT_NEW_USER_BASEPTR;
17647 + op.arg1.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT);
17648 + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
17649 +}
17650 +
17651 +void xen_tlb_flush(void)
17652 +{
17653 + struct mmuext_op op;
17654 + op.cmd = MMUEXT_TLB_FLUSH_LOCAL;
17655 + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
17656 +}
17657 +EXPORT_SYMBOL(xen_tlb_flush);
17658 +
17659 +void xen_invlpg(unsigned long ptr)
17660 +{
17661 + struct mmuext_op op;
17662 + op.cmd = MMUEXT_INVLPG_LOCAL;
17663 + op.arg1.linear_addr = ptr & PAGE_MASK;
17664 + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
17665 +}
17666 +EXPORT_SYMBOL(xen_invlpg);
17667 +
17668 +#ifdef CONFIG_SMP
17669 +
17670 +void xen_tlb_flush_all(void)
17671 +{
17672 + struct mmuext_op op;
17673 + op.cmd = MMUEXT_TLB_FLUSH_ALL;
17674 + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
17675 +}
17676 +
17677 +void xen_tlb_flush_mask(cpumask_t *mask)
17678 +{
17679 + struct mmuext_op op;
17680 + if ( cpus_empty(*mask) )
17681 + return;
17682 + op.cmd = MMUEXT_TLB_FLUSH_MULTI;
17683 + op.arg2.vcpumask = mask->bits;
17684 + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
17685 +}
17686 +
17687 +void xen_invlpg_all(unsigned long ptr)
17688 +{
17689 + struct mmuext_op op;
17690 + op.cmd = MMUEXT_INVLPG_ALL;
17691 + op.arg1.linear_addr = ptr & PAGE_MASK;
17692 + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
17693 +}
17694 +
17695 +void xen_invlpg_mask(cpumask_t *mask, unsigned long ptr)
17696 +{
17697 + struct mmuext_op op;
17698 + if ( cpus_empty(*mask) )
17699 + return;
17700 + op.cmd = MMUEXT_INVLPG_MULTI;
17701 + op.arg1.linear_addr = ptr & PAGE_MASK;
17702 + op.arg2.vcpumask = mask->bits;
17703 + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
17704 +}
17705 +
17706 +#endif /* CONFIG_SMP */
17707 +
17708 +void xen_pgd_pin(unsigned long ptr)
17709 +{
17710 + struct mmuext_op op;
17711 +#ifdef CONFIG_X86_64
17712 + op.cmd = MMUEXT_PIN_L4_TABLE;
17713 +#elif defined(CONFIG_X86_PAE)
17714 + op.cmd = MMUEXT_PIN_L3_TABLE;
17715 +#else
17716 + op.cmd = MMUEXT_PIN_L2_TABLE;
17717 +#endif
17718 + op.arg1.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT);
17719 + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
17720 +}
17721 +
17722 +void xen_pgd_unpin(unsigned long ptr)
17723 +{
17724 + struct mmuext_op op;
17725 + op.cmd = MMUEXT_UNPIN_TABLE;
17726 + op.arg1.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT);
17727 + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
17728 +}
17729 +
17730 +void xen_set_ldt(unsigned long ptr, unsigned long len)
17731 +{
17732 + struct mmuext_op op;
17733 + op.cmd = MMUEXT_SET_LDT;
17734 + op.arg1.linear_addr = ptr;
17735 + op.arg2.nr_ents = len;
17736 + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
17737 +}
17738 +
17739 +/*
17740 + * Bitmap is indexed by page number. If bit is set, the page is part of a
17741 + * xen_create_contiguous_region() area of memory.
17742 + */
17743 +unsigned long *contiguous_bitmap;
17744 +
17745 +static void contiguous_bitmap_set(
17746 + unsigned long first_page, unsigned long nr_pages)
17747 +{
17748 + unsigned long start_off, end_off, curr_idx, end_idx;
17749 +
17750 + curr_idx = first_page / BITS_PER_LONG;
17751 + start_off = first_page & (BITS_PER_LONG-1);
17752 + end_idx = (first_page + nr_pages) / BITS_PER_LONG;
17753 + end_off = (first_page + nr_pages) & (BITS_PER_LONG-1);
17754 +
17755 + if (curr_idx == end_idx) {
17756 + contiguous_bitmap[curr_idx] |=
17757 + ((1UL<<end_off)-1) & -(1UL<<start_off);
17758 + } else {
17759 + contiguous_bitmap[curr_idx] |= -(1UL<<start_off);
17760 + while ( ++curr_idx < end_idx )
17761 + contiguous_bitmap[curr_idx] = ~0UL;
17762 + contiguous_bitmap[curr_idx] |= (1UL<<end_off)-1;
17763 + }
17764 +}
17765 +
17766 +static void contiguous_bitmap_clear(
17767 + unsigned long first_page, unsigned long nr_pages)
17768 +{
17769 + unsigned long start_off, end_off, curr_idx, end_idx;
17770 +
17771 + curr_idx = first_page / BITS_PER_LONG;
17772 + start_off = first_page & (BITS_PER_LONG-1);
17773 + end_idx = (first_page + nr_pages) / BITS_PER_LONG;
17774 + end_off = (first_page + nr_pages) & (BITS_PER_LONG-1);
17775 +
17776 + if (curr_idx == end_idx) {
17777 + contiguous_bitmap[curr_idx] &=
17778 + -(1UL<<end_off) | ((1UL<<start_off)-1);
17779 + } else {
17780 + contiguous_bitmap[curr_idx] &= (1UL<<start_off)-1;
17781 + while ( ++curr_idx != end_idx )
17782 + contiguous_bitmap[curr_idx] = 0;
17783 + contiguous_bitmap[curr_idx] &= -(1UL<<end_off);
17784 + }
17785 +}
17786 +
17787 +/* Protected by balloon_lock. */
17788 +#define MAX_CONTIG_ORDER 9 /* 2MB */
17789 +static unsigned long discontig_frames[1<<MAX_CONTIG_ORDER];
17790 +static multicall_entry_t cr_mcl[1<<MAX_CONTIG_ORDER];
17791 +
17792 +/* Ensure multi-page extents are contiguous in machine memory. */
17793 +int xen_create_contiguous_region(
17794 + unsigned long vstart, unsigned int order, unsigned int address_bits)
17795 +{
17796 + unsigned long *in_frames = discontig_frames, out_frame;
17797 + unsigned long frame, i, flags;
17798 + long rc;
17799 + int success;
17800 + struct xen_memory_exchange exchange = {
17801 + .in = {
17802 + .nr_extents = 1UL << order,
17803 + .extent_order = 0,
17804 + .domid = DOMID_SELF
17805 + },
17806 + .out = {
17807 + .nr_extents = 1,
17808 + .extent_order = order,
17809 + .address_bits = address_bits,
17810 + .domid = DOMID_SELF
17811 + }
17812 + };
17813 +
17814 + /*
17815 + * Currently an auto-translated guest will not perform I/O, nor will
17816 + * it require PAE page directories below 4GB. Therefore any calls to
17817 + * this function are redundant and can be ignored.
17818 + */
17819 + if (xen_feature(XENFEAT_auto_translated_physmap))
17820 + return 0;
17821 +
17822 + if (unlikely(order > MAX_CONTIG_ORDER))
17823 + return -ENOMEM;
17824 +
17825 + set_xen_guest_handle(exchange.in.extent_start, in_frames);
17826 + set_xen_guest_handle(exchange.out.extent_start, &out_frame);
17827 +
17828 + scrub_pages(vstart, 1 << order);
17829 +
17830 + balloon_lock(flags);
17831 +
17832 + /* 1. Zap current PTEs, remembering MFNs. */
17833 + for (i = 0; i < (1UL<<order); i++) {
17834 + in_frames[i] = pfn_to_mfn((__pa(vstart) >> PAGE_SHIFT) + i);
17835 + MULTI_update_va_mapping(cr_mcl + i, vstart + (i*PAGE_SIZE),
17836 + __pte_ma(0), 0);
17837 + set_phys_to_machine((__pa(vstart)>>PAGE_SHIFT)+i,
17838 + INVALID_P2M_ENTRY);
17839 + }
17840 + if (HYPERVISOR_multicall(cr_mcl, i))
17841 + BUG();
17842 +
17843 + /* 2. Get a new contiguous memory extent. */
17844 + out_frame = __pa(vstart) >> PAGE_SHIFT;
17845 + rc = HYPERVISOR_memory_op(XENMEM_exchange, &exchange);
17846 + success = (exchange.nr_exchanged == (1UL << order));
17847 + BUG_ON(!success && ((exchange.nr_exchanged != 0) || (rc == 0)));
17848 + BUG_ON(success && (rc != 0));
17849 +#ifdef CONFIG_XEN_COMPAT_030002
17850 + if (unlikely(rc == -ENOSYS)) {
17851 + /* Compatibility when XENMEM_exchange is unsupported. */
17852 + if (HYPERVISOR_memory_op(XENMEM_decrease_reservation,
17853 + &exchange.in) != (1UL << order))
17854 + BUG();
17855 + success = (HYPERVISOR_memory_op(XENMEM_populate_physmap,
17856 + &exchange.out) == 1);
17857 + if (!success) {
17858 + /* Couldn't get special memory: fall back to normal. */
17859 + for (i = 0; i < (1UL<<order); i++)
17860 + in_frames[i] = (__pa(vstart)>>PAGE_SHIFT) + i;
17861 + if (HYPERVISOR_memory_op(XENMEM_populate_physmap,
17862 + &exchange.in) != (1UL<<order))
17863 + BUG();
17864 + }
17865 + }
17866 +#endif
17867 +
17868 + /* 3. Map the new extent in place of old pages. */
17869 + for (i = 0; i < (1UL<<order); i++) {
17870 + frame = success ? (out_frame + i) : in_frames[i];
17871 + MULTI_update_va_mapping(cr_mcl + i, vstart + (i*PAGE_SIZE),
17872 + pfn_pte_ma(frame, PAGE_KERNEL), 0);
17873 + set_phys_to_machine((__pa(vstart)>>PAGE_SHIFT)+i, frame);
17874 + }
17875 +
17876 + cr_mcl[i - 1].args[MULTI_UVMFLAGS_INDEX] = order
17877 + ? UVMF_TLB_FLUSH|UVMF_ALL
17878 + : UVMF_INVLPG|UVMF_ALL;
17879 + if (HYPERVISOR_multicall(cr_mcl, i))
17880 + BUG();
17881 +
17882 + if (success)
17883 + contiguous_bitmap_set(__pa(vstart) >> PAGE_SHIFT,
17884 + 1UL << order);
17885 +
17886 + balloon_unlock(flags);
17887 +
17888 + return success ? 0 : -ENOMEM;
17889 +}
17890 +
17891 +void xen_destroy_contiguous_region(unsigned long vstart, unsigned int order)
17892 +{
17893 + unsigned long *out_frames = discontig_frames, in_frame;
17894 + unsigned long frame, i, flags;
17895 + long rc;
17896 + int success;
17897 + struct xen_memory_exchange exchange = {
17898 + .in = {
17899 + .nr_extents = 1,
17900 + .extent_order = order,
17901 + .domid = DOMID_SELF
17902 + },
17903 + .out = {
17904 + .nr_extents = 1UL << order,
17905 + .extent_order = 0,
17906 + .domid = DOMID_SELF
17907 + }
17908 + };
17909 +
17910 + if (xen_feature(XENFEAT_auto_translated_physmap) ||
17911 + !test_bit(__pa(vstart) >> PAGE_SHIFT, contiguous_bitmap))
17912 + return;
17913 +
17914 + if (unlikely(order > MAX_CONTIG_ORDER))
17915 + return;
17916 +
17917 + set_xen_guest_handle(exchange.in.extent_start, &in_frame);
17918 + set_xen_guest_handle(exchange.out.extent_start, out_frames);
17919 +
17920 + scrub_pages(vstart, 1 << order);
17921 +
17922 + balloon_lock(flags);
17923 +
17924 + contiguous_bitmap_clear(__pa(vstart) >> PAGE_SHIFT, 1UL << order);
17925 +
17926 + /* 1. Find start MFN of contiguous extent. */
17927 + in_frame = pfn_to_mfn(__pa(vstart) >> PAGE_SHIFT);
17928 +
17929 + /* 2. Zap current PTEs. */
17930 + for (i = 0; i < (1UL<<order); i++) {
17931 + MULTI_update_va_mapping(cr_mcl + i, vstart + (i*PAGE_SIZE),
17932 + __pte_ma(0), 0);
17933 + set_phys_to_machine((__pa(vstart)>>PAGE_SHIFT)+i,
17934 + INVALID_P2M_ENTRY);
17935 + out_frames[i] = (__pa(vstart) >> PAGE_SHIFT) + i;
17936 + }
17937 + if (HYPERVISOR_multicall(cr_mcl, i))
17938 + BUG();
17939 +
17940 + /* 3. Do the exchange for non-contiguous MFNs. */
17941 + rc = HYPERVISOR_memory_op(XENMEM_exchange, &exchange);
17942 + success = (exchange.nr_exchanged == 1);
17943 + BUG_ON(!success && ((exchange.nr_exchanged != 0) || (rc == 0)));
17944 + BUG_ON(success && (rc != 0));
17945 +#ifdef CONFIG_XEN_COMPAT_030002
17946 + if (unlikely(rc == -ENOSYS)) {
17947 + /* Compatibility when XENMEM_exchange is unsupported. */
17948 + if (HYPERVISOR_memory_op(XENMEM_decrease_reservation,
17949 + &exchange.in) != 1)
17950 + BUG();
17951 + if (HYPERVISOR_memory_op(XENMEM_populate_physmap,
17952 + &exchange.out) != (1UL << order))
17953 + BUG();
17954 + success = 1;
17955 + }
17956 +#endif
17957 +
17958 + /* 4. Map new pages in place of old pages. */
17959 + for (i = 0; i < (1UL<<order); i++) {
17960 + frame = success ? out_frames[i] : (in_frame + i);
17961 + MULTI_update_va_mapping(cr_mcl + i, vstart + (i*PAGE_SIZE),
17962 + pfn_pte_ma(frame, PAGE_KERNEL), 0);
17963 + set_phys_to_machine((__pa(vstart)>>PAGE_SHIFT)+i, frame);
17964 + }
17965 +
17966 + cr_mcl[i - 1].args[MULTI_UVMFLAGS_INDEX] = order
17967 + ? UVMF_TLB_FLUSH|UVMF_ALL
17968 + : UVMF_INVLPG|UVMF_ALL;
17969 + if (HYPERVISOR_multicall(cr_mcl, i))
17970 + BUG();
17971 +
17972 + balloon_unlock(flags);
17973 +}
17974 +
17975 +#ifdef __i386__
17976 +int write_ldt_entry(void *ldt, int entry, __u32 entry_a, __u32 entry_b)
17977 +{
17978 + __u32 *lp = (__u32 *)((char *)ldt + entry * 8);
17979 + maddr_t mach_lp = arbitrary_virt_to_machine(lp);
17980 + return HYPERVISOR_update_descriptor(
17981 + mach_lp, (u64)entry_a | ((u64)entry_b<<32));
17982 +}
17983 +#endif
17984 diff -Nur linux-2.6.16.33-noxen/arch/i386/mm/init-xen.c linux-2.6.16.33/arch/i386/mm/init-xen.c
17985 --- linux-2.6.16.33-noxen/arch/i386/mm/init-xen.c 1970-01-01 00:00:00.000000000 +0000
17986 +++ linux-2.6.16.33/arch/i386/mm/init-xen.c 2007-01-08 15:00:45.000000000 +0000
17987 @@ -0,0 +1,849 @@
17988 +/*
17989 + * linux/arch/i386/mm/init.c
17990 + *
17991 + * Copyright (C) 1995 Linus Torvalds
17992 + *
17993 + * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
17994 + */
17995 +
17996 +#include <linux/config.h>
17997 +#include <linux/module.h>
17998 +#include <linux/signal.h>
17999 +#include <linux/sched.h>
18000 +#include <linux/kernel.h>
18001 +#include <linux/errno.h>
18002 +#include <linux/string.h>
18003 +#include <linux/types.h>
18004 +#include <linux/ptrace.h>
18005 +#include <linux/mman.h>
18006 +#include <linux/mm.h>
18007 +#include <linux/hugetlb.h>
18008 +#include <linux/swap.h>
18009 +#include <linux/smp.h>
18010 +#include <linux/init.h>
18011 +#include <linux/highmem.h>
18012 +#include <linux/pagemap.h>
18013 +#include <linux/bootmem.h>
18014 +#include <linux/slab.h>
18015 +#include <linux/proc_fs.h>
18016 +#include <linux/efi.h>
18017 +#include <linux/memory_hotplug.h>
18018 +#include <linux/initrd.h>
18019 +#include <linux/dma-mapping.h>
18020 +#include <linux/scatterlist.h>
18021 +
18022 +#include <asm/processor.h>
18023 +#include <asm/system.h>
18024 +#include <asm/uaccess.h>
18025 +#include <asm/pgtable.h>
18026 +#include <asm/dma.h>
18027 +#include <asm/fixmap.h>
18028 +#include <asm/e820.h>
18029 +#include <asm/apic.h>
18030 +#include <asm/tlb.h>
18031 +#include <asm/tlbflush.h>
18032 +#include <asm/sections.h>
18033 +#include <asm/hypervisor.h>
18034 +#include <asm/swiotlb.h>
18035 +
18036 +extern unsigned long *contiguous_bitmap;
18037 +
18038 +unsigned int __VMALLOC_RESERVE = 128 << 20;
18039 +
18040 +DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
18041 +unsigned long highstart_pfn, highend_pfn;
18042 +
18043 +static int noinline do_test_wp_bit(void);
18044 +
18045 +/*
18046 + * Creates a middle page table and puts a pointer to it in the
18047 + * given global directory entry. This only returns the gd entry
18048 + * in non-PAE compilation mode, since the middle layer is folded.
18049 + */
18050 +static pmd_t * __init one_md_table_init(pgd_t *pgd)
18051 +{
18052 + pud_t *pud;
18053 + pmd_t *pmd_table;
18054 +
18055 +#ifdef CONFIG_X86_PAE
18056 + pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE);
18057 + make_lowmem_page_readonly(pmd_table, XENFEAT_writable_page_tables);
18058 + set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
18059 + pud = pud_offset(pgd, 0);
18060 + if (pmd_table != pmd_offset(pud, 0))
18061 + BUG();
18062 +#else
18063 + pud = pud_offset(pgd, 0);
18064 + pmd_table = pmd_offset(pud, 0);
18065 +#endif
18066 +
18067 + return pmd_table;
18068 +}
18069 +
18070 +/*
18071 + * Create a page table and place a pointer to it in a middle page
18072 + * directory entry.
18073 + */
18074 +static pte_t * __init one_page_table_init(pmd_t *pmd)
18075 +{
18076 + if (pmd_none(*pmd)) {
18077 + pte_t *page_table = (pte_t *) alloc_bootmem_low_pages(PAGE_SIZE);
18078 + make_lowmem_page_readonly(page_table,
18079 + XENFEAT_writable_page_tables);
18080 + set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE));
18081 + if (page_table != pte_offset_kernel(pmd, 0))
18082 + BUG();
18083 +
18084 + return page_table;
18085 + }
18086 +
18087 + return pte_offset_kernel(pmd, 0);
18088 +}
18089 +
18090 +/*
18091 + * This function initializes a certain range of kernel virtual memory
18092 + * with new bootmem page tables, wherever page tables are missing in
18093 + * the given range.
18094 + */
18095 +
18096 +/*
18097 + * NOTE: The pagetables are allocated contiguously in physical memory
18098 + * so we can cache the place of the first one and move around without
18099 + * checking the pgd every time.
18100 + */
18101 +static void __init page_table_range_init (unsigned long start, unsigned long end, pgd_t *pgd_base)
18102 +{
18103 + pgd_t *pgd;
18104 + pud_t *pud;
18105 + pmd_t *pmd;
18106 + int pgd_idx, pmd_idx;
18107 + unsigned long vaddr;
18108 +
18109 + vaddr = start;
18110 + pgd_idx = pgd_index(vaddr);
18111 + pmd_idx = pmd_index(vaddr);
18112 + pgd = pgd_base + pgd_idx;
18113 +
18114 + for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd++, pgd_idx++) {
18115 + if (pgd_none(*pgd))
18116 + one_md_table_init(pgd);
18117 + pud = pud_offset(pgd, vaddr);
18118 + pmd = pmd_offset(pud, vaddr);
18119 + for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end); pmd++, pmd_idx++) {
18120 + if (vaddr < hypervisor_virt_start && pmd_none(*pmd))
18121 + one_page_table_init(pmd);
18122 +
18123 + vaddr += PMD_SIZE;
18124 + }
18125 + pmd_idx = 0;
18126 + }
18127 +}
18128 +
18129 +static inline int is_kernel_text(unsigned long addr)
18130 +{
18131 + if (addr >= PAGE_OFFSET && addr <= (unsigned long)__init_end)
18132 + return 1;
18133 + return 0;
18134 +}
18135 +
18136 +/*
18137 + * This maps the physical memory to kernel virtual address space, a total
18138 + * of max_low_pfn pages, by creating page tables starting from address
18139 + * PAGE_OFFSET.
18140 + */
18141 +static void __init kernel_physical_mapping_init(pgd_t *pgd_base)
18142 +{
18143 + unsigned long pfn;
18144 + pgd_t *pgd;
18145 + pmd_t *pmd;
18146 + pte_t *pte;
18147 + int pgd_idx, pmd_idx, pte_ofs;
18148 +
18149 + unsigned long max_ram_pfn = xen_start_info->nr_pages;
18150 + if (max_ram_pfn > max_low_pfn)
18151 + max_ram_pfn = max_low_pfn;
18152 +
18153 + pgd_idx = pgd_index(PAGE_OFFSET);
18154 + pgd = pgd_base + pgd_idx;
18155 + pfn = 0;
18156 + pmd_idx = pmd_index(PAGE_OFFSET);
18157 + pte_ofs = pte_index(PAGE_OFFSET);
18158 +
18159 + for (; pgd_idx < PTRS_PER_PGD; pgd++, pgd_idx++) {
18160 +#ifdef CONFIG_XEN
18161 + /*
18165› 18162 + * Native Linux doesn't have PAE paging enabled yet at this
18166› 18163 + * point. When running as a Xen domain we are already in PAE
18167› 18164 + * mode, so we can't simply hook in an empty
18165 + * pmd. That would kill the mappings we are currently
18166 + * using ...
18167 + */
18168 + pmd = pmd_offset(pud_offset(pgd, PAGE_OFFSET), PAGE_OFFSET);
18169 +#else
18170 + pmd = one_md_table_init(pgd);
18171 +#endif
18172 + if (pfn >= max_low_pfn)
18173 + continue;
18174 + pmd += pmd_idx;
18175 + for (; pmd_idx < PTRS_PER_PMD && pfn < max_low_pfn; pmd++, pmd_idx++) {
18176 + unsigned int address = pfn * PAGE_SIZE + PAGE_OFFSET;
18177 + if (address >= hypervisor_virt_start)
18178 + continue;
18179 +
18180 + /* Map with big pages if possible, otherwise create normal page tables. */
18181 + if (cpu_has_pse) {
18182 + unsigned int address2 = (pfn + PTRS_PER_PTE - 1) * PAGE_SIZE + PAGE_OFFSET + PAGE_SIZE-1;
18183 +
18184 + if (is_kernel_text(address) || is_kernel_text(address2))
18185 + set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE_EXEC));
18186 + else
18187 + set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE));
18188 + pfn += PTRS_PER_PTE;
18189 + } else {
18190 + pte = one_page_table_init(pmd);
18191 +
18192 + pte += pte_ofs;
18193 + for (; pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn; pte++, pfn++, pte_ofs++) {
18194 + /* XEN: Only map initial RAM allocation. */
18195 + if ((pfn >= max_ram_pfn) || pte_present(*pte))
18196 + continue;
18197 + if (is_kernel_text(address))
18198 + set_pte(pte, pfn_pte(pfn, PAGE_KERNEL_EXEC));
18199 + else
18200 + set_pte(pte, pfn_pte(pfn, PAGE_KERNEL));
18201 + }
18202 + pte_ofs = 0;
18203 + }
18204 + }
18205 + pmd_idx = 0;
18206 + }
18207 +}
18208 +
18209 +#ifndef CONFIG_XEN
18210 +
18211 +static inline int page_kills_ppro(unsigned long pagenr)
18212 +{
18213 + if (pagenr >= 0x70000 && pagenr <= 0x7003F)
18214 + return 1;
18215 + return 0;
18216 +}
18217 +
18218 +#else
18219 +
18220 +#define page_kills_ppro(p) 0
18221 +
18222 +#endif
18223 +
18224 +extern int is_available_memory(efi_memory_desc_t *);
18225 +
18226 +int page_is_ram(unsigned long pagenr)
18227 +{
18228 + int i;
18229 + unsigned long addr, end;
18230 +
18231 + if (efi_enabled) {
18232 + efi_memory_desc_t *md;
18233 + void *p;
18234 +
18235 + for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
18236 + md = p;
18237 + if (!is_available_memory(md))
18238 + continue;
18239 + addr = (md->phys_addr+PAGE_SIZE-1) >> PAGE_SHIFT;
18240 + end = (md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT)) >> PAGE_SHIFT;
18241 +
18242 + if ((pagenr >= addr) && (pagenr < end))
18243 + return 1;
18244 + }
18245 + return 0;
18246 + }
18247 +
18248 + for (i = 0; i < e820.nr_map; i++) {
18249 +
18250 + if (e820.map[i].type != E820_RAM) /* not usable memory */
18251 + continue;
18252 + /*
18253 + * !!!FIXME!!! Some BIOSen report areas as RAM that
18254 + * are not. Notably the 640->1Mb area. We need a sanity
18255 + * check here.
18256 + */
18257 + addr = (e820.map[i].addr+PAGE_SIZE-1) >> PAGE_SHIFT;
18258 + end = (e820.map[i].addr+e820.map[i].size) >> PAGE_SHIFT;
18259 + if ((pagenr >= addr) && (pagenr < end))
18260 + return 1;
18261 + }
18262 + return 0;
18263 +}
18264 +
18265 +#ifdef CONFIG_HIGHMEM
18266 +pte_t *kmap_pte;
18267 +pgprot_t kmap_prot;
18268 +
18269 +#define kmap_get_fixmap_pte(vaddr) \
18270 + pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr), vaddr), (vaddr)), (vaddr))
18271 +
18272 +static void __init kmap_init(void)
18273 +{
18274 + unsigned long kmap_vstart;
18275 +
18276 + /* cache the first kmap pte */
18277 + kmap_vstart = __fix_to_virt(FIX_KMAP_BEGIN);
18278 + kmap_pte = kmap_get_fixmap_pte(kmap_vstart);
18279 +
18280 + kmap_prot = PAGE_KERNEL;
18281 +}
18282 +
18283 +static void __init permanent_kmaps_init(pgd_t *pgd_base)
18284 +{
18285 + pgd_t *pgd;
18286 + pud_t *pud;
18287 + pmd_t *pmd;
18288 + pte_t *pte;
18289 + unsigned long vaddr;
18290 +
18291 + vaddr = PKMAP_BASE;
18292 + page_table_range_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, pgd_base);
18293 +
18294 + pgd = swapper_pg_dir + pgd_index(vaddr);
18295 + pud = pud_offset(pgd, vaddr);
18296 + pmd = pmd_offset(pud, vaddr);
18297 + pte = pte_offset_kernel(pmd, vaddr);
18298 + pkmap_page_table = pte;
18299 +}
18300 +
18301 +static void __meminit free_new_highpage(struct page *page, int pfn)
18302 +{
18303 + set_page_count(page, 1);
18304 + if (pfn < xen_start_info->nr_pages)
18305 + __free_page(page);
18306 + totalhigh_pages++;
18307 +}
18308 +
18309 +void __init add_one_highpage_init(struct page *page, int pfn, int bad_ppro)
18310 +{
18311 + if (page_is_ram(pfn) && !(bad_ppro && page_kills_ppro(pfn))) {
18312 + ClearPageReserved(page);
18313 + free_new_highpage(page, pfn);
18314 + } else
18315 + SetPageReserved(page);
18316 +}
18317 +
18318 +static int add_one_highpage_hotplug(struct page *page, unsigned long pfn)
18319 +{
18320 + free_new_highpage(page, pfn);
18321 + totalram_pages++;
18322 +#ifdef CONFIG_FLATMEM
18323 + max_mapnr = max(pfn, max_mapnr);
18324 +#endif
18325 + num_physpages++;
18326 + return 0;
18327 +}
18328 +
18329 +/*
18330 + * Not currently handling the NUMA case.
18331 + * Assuming single node and all memory that
18332 + * has been added dynamically that would be
18333 + * onlined here is in HIGHMEM
18334 + */
18335 +void online_page(struct page *page)
18336 +{
18337 + ClearPageReserved(page);
18338 + add_one_highpage_hotplug(page, page_to_pfn(page));
18339 +}
18340 +
18341 +
18342 +#ifdef CONFIG_NUMA
18343 +extern void set_highmem_pages_init(int);
18344 +#else
18345 +static void __init set_highmem_pages_init(int bad_ppro)
18346 +{
18347 + int pfn;
18348 + for (pfn = highstart_pfn; pfn < highend_pfn; pfn++)
18349 + add_one_highpage_init(pfn_to_page(pfn), pfn, bad_ppro);
18350 + totalram_pages += totalhigh_pages;
18351 +}
18352 +#endif /* CONFIG_FLATMEM */
18353 +
18354 +#else
18355 +#define kmap_init() do { } while (0)
18356 +#define permanent_kmaps_init(pgd_base) do { } while (0)
18357 +#define set_highmem_pages_init(bad_ppro) do { } while (0)
18358 +#endif /* CONFIG_HIGHMEM */
18359 +
18360 +unsigned long long __PAGE_KERNEL = _PAGE_KERNEL;
18361 +EXPORT_SYMBOL(__PAGE_KERNEL);
18362 +unsigned long long __PAGE_KERNEL_EXEC = _PAGE_KERNEL_EXEC;
18363 +
18364 +#ifdef CONFIG_NUMA
18365 +extern void __init remap_numa_kva(void);
18366 +#else
18367 +#define remap_numa_kva() do {} while (0)
18368 +#endif
18369 +
18370 +pgd_t *swapper_pg_dir;
18371 +
18372 +static void __init pagetable_init (void)
18373 +{
18374 + unsigned long vaddr;
18375 + pgd_t *pgd_base = (pgd_t *)xen_start_info->pt_base;
18376 +
18377 + swapper_pg_dir = pgd_base;
18378 + init_mm.pgd = pgd_base;
18379 +
18380 + /* Enable PSE if available */
18381 + if (cpu_has_pse) {
18382 + set_in_cr4(X86_CR4_PSE);
18383 + }
18384 +
18385 + /* Enable PGE if available */
18386 + if (cpu_has_pge) {
18387 + set_in_cr4(X86_CR4_PGE);
18388 + __PAGE_KERNEL |= _PAGE_GLOBAL;
18389 + __PAGE_KERNEL_EXEC |= _PAGE_GLOBAL;
18390 + }
18391 +
18392 + kernel_physical_mapping_init(pgd_base);
18393 + remap_numa_kva();
18394 +
18395 + /*
18396 + * Fixed mappings, only the page table structure has to be
18397 + * created - mappings will be set by set_fixmap():
18398 + */
18399 + vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK;
18400 + page_table_range_init(vaddr, hypervisor_virt_start, pgd_base);
18401 +
18402 + permanent_kmaps_init(pgd_base);
18403 +}
18404 +
18405 +#ifdef CONFIG_SOFTWARE_SUSPEND
18406 +/*
18407 + * Swap suspend & friends need this for resume because things like the intel-agp
18408 + * driver might have split up a kernel 4MB mapping.
18409 + */
18410 +char __nosavedata swsusp_pg_dir[PAGE_SIZE]
18411 + __attribute__ ((aligned (PAGE_SIZE)));
18412 +
18413 +static inline void save_pg_dir(void)
18414 +{
18415 + memcpy(swsusp_pg_dir, swapper_pg_dir, PAGE_SIZE);
18416 +}
18417 +#else
18418 +static inline void save_pg_dir(void)
18419 +{
18420 +}
18421 +#endif
18422 +
18423 +void zap_low_mappings (void)
18424 +{
18425 + int i;
18426 +
18427 + save_pg_dir();
18428 +
18429 + /*
18430 + * Zap initial low-memory mappings.
18431 + *
18432 + * Note that "pgd_clear()" doesn't do it for
18433 + * us, because pgd_clear() is a no-op on i386.
18434 + */
18435 + for (i = 0; i < USER_PTRS_PER_PGD; i++)
18436 +#if defined(CONFIG_X86_PAE) && !defined(CONFIG_XEN)
18437 + set_pgd(swapper_pg_dir+i, __pgd(1 + __pa(empty_zero_page)));
18438 +#else
18439 + set_pgd(swapper_pg_dir+i, __pgd(0));
18440 +#endif
18441 + flush_tlb_all();
18442 +}
18443 +
18444 +static int disable_nx __initdata = 0;
18445 +u64 __supported_pte_mask __read_mostly = ~_PAGE_NX;
18446 +EXPORT_SYMBOL(__supported_pte_mask);
18447 +
18448 +/*
18449 + * noexec = on|off
18450 + *
18451 + * Control non executable mappings.
18452 + *
18453 + * on Enable
18454 + * off Disable
18455 + */
18456 +void __init noexec_setup(const char *str)
18457 +{
18458 + if (!strncmp(str, "on",2) && cpu_has_nx) {
18459 + __supported_pte_mask |= _PAGE_NX;
18460 + disable_nx = 0;
18461 + } else if (!strncmp(str,"off",3)) {
18462 + disable_nx = 1;
18463 + __supported_pte_mask &= ~_PAGE_NX;
18464 + }
18465 +}
18466 +
18467 +int nx_enabled = 0;
18468 +#ifdef CONFIG_X86_PAE
18469 +
18470 +static void __init set_nx(void)
18471 +{
18472 + unsigned int v[4], l, h;
18473 +
18474 + if (cpu_has_pae && (cpuid_eax(0x80000000) > 0x80000001)) {
18475 + cpuid(0x80000001, &v[0], &v[1], &v[2], &v[3]);
18476 + if ((v[3] & (1 << 20)) && !disable_nx) {
18477 + rdmsr(MSR_EFER, l, h);
18478 + l |= EFER_NX;
18479 + wrmsr(MSR_EFER, l, h);
18480 + nx_enabled = 1;
18481 + __supported_pte_mask |= _PAGE_NX;
18482 + }
18483 + }
18484 +}
18485 +
18486 +/*
18487 + * Enables/disables executability of a given kernel page and
18488 + * returns the previous setting.
18489 + */
18490 +int __init set_kernel_exec(unsigned long vaddr, int enable)
18491 +{
18492 + pte_t *pte;
18493 + int ret = 1;
18494 +
18495 + if (!nx_enabled)
18496 + goto out;
18497 +
18498 + pte = lookup_address(vaddr);
18499 + BUG_ON(!pte);
18500 +
18501 + if (!pte_exec_kernel(*pte))
18502 + ret = 0;
18503 +
18504 + if (enable)
18505 + pte->pte_high &= ~(1 << (_PAGE_BIT_NX - 32));
18506 + else
18507 + pte->pte_high |= 1 << (_PAGE_BIT_NX - 32);
18508 + __flush_tlb_all();
18509 +out:
18510 + return ret;
18511 +}
18512 +
18513 +#endif
18514 +
18515 +/*
18516 + * paging_init() sets up the page tables - note that the first 8MB are
18517 + * already mapped by head.S.
18518 + *
18519 + * This routine also unmaps the page at virtual kernel address 0, so
18520 + * that we can trap those pesky NULL-reference errors in the kernel.
18521 + */
18522 +void __init paging_init(void)
18523 +{
18524 + int i;
18525 +
18526 +#ifdef CONFIG_X86_PAE
18527 + set_nx();
18528 + if (nx_enabled)
18529 + printk("NX (Execute Disable) protection: active\n");
18530 +#endif
18531 +
18532 + pagetable_init();
18533 +
18534 +#if defined(CONFIG_X86_PAE) && !defined(CONFIG_XEN)
18535 + /*
18536 + * We will bail out later - printk doesn't work right now so
18537 + * the user would just see a hanging kernel.
18538 + * When running as a Xen domain we are already in PAE mode at
18539 + * this point.
18540 + */
18541 + if (cpu_has_pae)
18542 + set_in_cr4(X86_CR4_PAE);
18543 +#endif
18544 + __flush_tlb_all();
18545 +
18546 + kmap_init();
18547 +
18548 + /* Switch to the real shared_info page, and clear the
18549 + * dummy page. */
18550 + set_fixmap(FIX_SHARED_INFO, xen_start_info->shared_info);
18551 + HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO);
18552 + memset(empty_zero_page, 0, sizeof(empty_zero_page));
18553 +
18554 + /* Setup mapping of lower 1st MB */
18555 + for (i = 0; i < NR_FIX_ISAMAPS; i++)
18556 + if (is_initial_xendomain())
18557 + set_fixmap(FIX_ISAMAP_BEGIN - i, i * PAGE_SIZE);
18558 + else
18559 + __set_fixmap(FIX_ISAMAP_BEGIN - i,
18560 + virt_to_machine(empty_zero_page),
18561 + PAGE_KERNEL_RO);
18562 +}
18563 +
18564 +/*
18565 + * Test if the WP bit works in supervisor mode. It isn't supported on 386's
18566 + * and also on some strange 486's (NexGen etc.). All 586+'s are OK. This
18567 + * used to involve black magic jumps to work around some nasty CPU bugs,
18568 + * but fortunately the switch to using exceptions got rid of all that.
18569 + */
18570 +
18571 +static void __init test_wp_bit(void)
18572 +{
18573 + printk("Checking if this processor honours the WP bit even in supervisor mode... ");
18574 +
18575 + /* Any page-aligned address will do, the test is non-destructive */
18576 + __set_fixmap(FIX_WP_TEST, __pa(&swapper_pg_dir), PAGE_READONLY);
18577 + boot_cpu_data.wp_works_ok = do_test_wp_bit();
18578 + clear_fixmap(FIX_WP_TEST);
18579 +
18580 + if (!boot_cpu_data.wp_works_ok) {
18581 + printk("No.\n");
18582 +#ifdef CONFIG_X86_WP_WORKS_OK
18583 + panic("This kernel doesn't support CPU's with broken WP. Recompile it for a 386!");
18584 +#endif
18585 + } else {
18586 + printk("Ok.\n");
18587 + }
18588 +}
18589 +
18590 +static void __init set_max_mapnr_init(void)
18591 +{
18592 +#ifdef CONFIG_HIGHMEM
18593 + num_physpages = highend_pfn;
18594 +#else
18595 + num_physpages = max_low_pfn;
18596 +#endif
18597 +#ifdef CONFIG_FLATMEM
18598 + max_mapnr = num_physpages;
18599 +#endif
18600 +}
18601 +
18602 +static struct kcore_list kcore_mem, kcore_vmalloc;
18603 +
18604 +void __init mem_init(void)
18605 +{
18606 + extern int ppro_with_ram_bug(void);
18607 + int codesize, reservedpages, datasize, initsize;
18608 + int tmp;
18609 + int bad_ppro;
18610 + unsigned long pfn;
18611 +
18612 + contiguous_bitmap = alloc_bootmem_low_pages(
18613 + (max_low_pfn + 2*BITS_PER_LONG) >> 3);
18614 + BUG_ON(!contiguous_bitmap);
18615 + memset(contiguous_bitmap, 0, (max_low_pfn + 2*BITS_PER_LONG) >> 3);
18616 +
18617 +#if defined(CONFIG_SWIOTLB)
18618 + swiotlb_init();
18619 +#endif
18620 +
18621 +#ifdef CONFIG_FLATMEM
18622 + if (!mem_map)
18623 + BUG();
18624 +#endif
18625 +
18626 + bad_ppro = ppro_with_ram_bug();
18627 +
18628 +#ifdef CONFIG_HIGHMEM
18629 + /* check that fixmap and pkmap do not overlap */
18630 + if (PKMAP_BASE+LAST_PKMAP*PAGE_SIZE >= FIXADDR_START) {
18631 + printk(KERN_ERR "fixmap and kmap areas overlap - this will crash\n");
18632 + printk(KERN_ERR "pkstart: %lxh pkend: %lxh fixstart %lxh\n",
18633 + PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE, FIXADDR_START);
18634 + BUG();
18635 + }
18636 +#endif
18637 +
18638 + set_max_mapnr_init();
18639 +
18640 +#ifdef CONFIG_HIGHMEM
18641 + high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
18642 +#else
18643 + high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
18644 +#endif
18645 + printk("vmalloc area: %lx-%lx, maxmem %lx\n",
18646 + VMALLOC_START,VMALLOC_END,MAXMEM);
18647 + BUG_ON(VMALLOC_START > VMALLOC_END);
18648 +
18649 + /* this will put all low memory onto the freelists */
18650 + totalram_pages += free_all_bootmem();
18651 + /* XEN: init and count low-mem pages outside initial allocation. */
18652 + for (pfn = xen_start_info->nr_pages; pfn < max_low_pfn; pfn++) {
18653 + ClearPageReserved(pfn_to_page(pfn));
18654 + set_page_count(pfn_to_page(pfn), 1);
18655 + totalram_pages++;
18656 + }
18657 +
18658 + reservedpages = 0;
18659 + for (tmp = 0; tmp < max_low_pfn; tmp++)
18660 + /*
18661 + * Only count reserved RAM pages
18662 + */
18663 + if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp)))
18664 + reservedpages++;
18665 +
18666 + set_highmem_pages_init(bad_ppro);
18667 +
18668 + codesize = (unsigned long) &_etext - (unsigned long) &_text;
18669 + datasize = (unsigned long) &_edata - (unsigned long) &_etext;
18670 + initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin;
18671 +
18672 + kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
18673 + kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
18674 + VMALLOC_END-VMALLOC_START);
18675 +
18676 + printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, %dk reserved, %dk data, %dk init, %ldk highmem)\n",
18677 + (unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
18678 + num_physpages << (PAGE_SHIFT-10),
18679 + codesize >> 10,
18680 + reservedpages << (PAGE_SHIFT-10),
18681 + datasize >> 10,
18682 + initsize >> 10,
18683 + (unsigned long) (totalhigh_pages << (PAGE_SHIFT-10))
18684 + );
18685 +
18686 +#ifdef CONFIG_X86_PAE
18687 + if (!cpu_has_pae)
18688 + panic("cannot execute a PAE-enabled kernel on a PAE-less CPU!");
18689 +#endif
18690 + if (boot_cpu_data.wp_works_ok < 0)
18691 + test_wp_bit();
18692 +
18693 + /*
18694 + * Subtle. SMP is doing its boot stuff late (because it has to
18695 + * fork idle threads) - but it also needs low mappings for the
18696 + * protected-mode entry to work. We zap these entries only after
18697 + * the WP-bit has been tested.
18698 + */
18699 +#ifndef CONFIG_SMP
18700 + zap_low_mappings();
18701 +#endif
18702 +
18703 + set_bit(PG_pinned, &virt_to_page(init_mm.pgd)->flags);
18704 +}
18705 +
18706 +/*
18707 + * this is for the non-NUMA, single node SMP system case.
18708 + * Specifically, in the case of x86, we will always add
18709 + * memory to the highmem for now.
18710 + */
18711 +#ifndef CONFIG_NEED_MULTIPLE_NODES
18712 +int add_memory(u64 start, u64 size)
18713 +{
18714 + struct pglist_data *pgdata = &contig_page_data;
18715 + struct zone *zone = pgdata->node_zones + MAX_NR_ZONES-1;
18716 + unsigned long start_pfn = start >> PAGE_SHIFT;
18717 + unsigned long nr_pages = size >> PAGE_SHIFT;
18718 +
18719 + return __add_pages(zone, start_pfn, nr_pages);
18720 +}
18721 +
18722 +int remove_memory(u64 start, u64 size)
18723 +{
18724 + return -EINVAL;
18725 +}
18726 +#endif
18727 +
18728 +kmem_cache_t *pgd_cache;
18729 +kmem_cache_t *pmd_cache;
18730 +
18731 +void __init pgtable_cache_init(void)
18732 +{
18733 + if (PTRS_PER_PMD > 1) {
18734 + pmd_cache = kmem_cache_create("pmd",
18735 + PTRS_PER_PMD*sizeof(pmd_t),
18736 + PTRS_PER_PMD*sizeof(pmd_t),
18737 + 0,
18738 + pmd_ctor,
18739 + NULL);
18740 + if (!pmd_cache)
18741 + panic("pgtable_cache_init(): cannot create pmd cache");
18742 + }
18743 + pgd_cache = kmem_cache_create("pgd",
18744 +#ifndef CONFIG_XEN
18745 + PTRS_PER_PGD*sizeof(pgd_t),
18746 + PTRS_PER_PGD*sizeof(pgd_t),
18747 +#else
18748 + PAGE_SIZE,
18749 + PAGE_SIZE,
18750 +#endif
18751 + 0,
18752 + pgd_ctor,
18753 + PTRS_PER_PMD == 1 ? pgd_dtor : NULL);
18754 + if (!pgd_cache)
18755 + panic("pgtable_cache_init(): Cannot create pgd cache");
18756 +}
18757 +
18758 +/*
18759 + * This function cannot be __init, since exceptions don't work in that
18760 + * section. Put this after the callers, so that it cannot be inlined.
18761 + */
18762 +static int noinline do_test_wp_bit(void)
18763 +{
18764 + char tmp_reg;
18765 + int flag;
18766 +
18767 + __asm__ __volatile__(
18768 + " movb %0,%1 \n"
18769 + "1: movb %1,%0 \n"
18770 + " xorl %2,%2 \n"
18771 + "2: \n"
18772 + ".section __ex_table,\"a\"\n"
18773 + " .align 4 \n"
18774 + " .long 1b,2b \n"
18775 + ".previous \n"
18776 + :"=m" (*(char *)fix_to_virt(FIX_WP_TEST)),
18777 + "=q" (tmp_reg),
18778 + "=r" (flag)
18779 + :"2" (1)
18780 + :"memory");
18781 +
18782 + return flag;
18783 +}
18784 +
18785 +void free_initmem(void)
18786 +{
18787 + unsigned long addr;
18788 +
18789 + addr = (unsigned long)(&__init_begin);
18790 + for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) {
18791 + ClearPageReserved(virt_to_page(addr));
18792 + set_page_count(virt_to_page(addr), 1);
18793 + memset((void *)addr, 0xcc, PAGE_SIZE);
18794 + free_page(addr);
18795 + totalram_pages++;
18796 + }
18797 + printk (KERN_INFO "Freeing unused kernel memory: %dk freed\n", (__init_end - __init_begin) >> 10);
18798 +}
18799 +
18800 +#ifdef CONFIG_DEBUG_RODATA
18801 +
18802 +extern char __start_rodata, __end_rodata;
18803 +void mark_rodata_ro(void)
18804 +{
18805 + unsigned long addr = (unsigned long)&__start_rodata;
18806 +
18807 + for (; addr < (unsigned long)&__end_rodata; addr += PAGE_SIZE)
18808 + change_page_attr(virt_to_page(addr), 1, PAGE_KERNEL_RO);
18809 +
18810 + printk ("Write protecting the kernel read-only data: %luk\n",
18811 + (unsigned long)(&__end_rodata - &__start_rodata) >> 10);
18812 +
18813 + /*
18814 + * change_page_attr() requires a global_flush_tlb() call after it.
18815 + * We do this after the printk so that if something went wrong in the
18816 + * change, the printk gets out at least to give a better debug hint
18817 + * of who is the culprit.
18818 + */
18819 + global_flush_tlb();
18820 +}
18821 +#endif
18822 +
18823 +
18824 +#ifdef CONFIG_BLK_DEV_INITRD
18825 +void free_initrd_mem(unsigned long start, unsigned long end)
18826 +{
18827 + if (start < end)
18828 + printk (KERN_INFO "Freeing initrd memory: %ldk freed\n", (end - start) >> 10);
18829 + for (; start < end; start += PAGE_SIZE) {
18830 + ClearPageReserved(virt_to_page(start));
18831 + set_page_count(virt_to_page(start), 1);
18832 + free_page(start);
18833 + totalram_pages++;
18834 + }
18835 +}
18836 +#endif
18837 diff -Nur linux-2.6.16.33-noxen/arch/i386/mm/ioremap-xen.c linux-2.6.16.33/arch/i386/mm/ioremap-xen.c
18838 --- linux-2.6.16.33-noxen/arch/i386/mm/ioremap-xen.c 1970-01-01 00:00:00.000000000 +0000
18839 +++ linux-2.6.16.33/arch/i386/mm/ioremap-xen.c 2007-01-08 15:00:45.000000000 +0000
18840 @@ -0,0 +1,447 @@
18841 +/*
18842 + * arch/i386/mm/ioremap.c
18843 + *
18844 + * Re-map IO memory to kernel address space so that we can access it.
18845 + * This is needed for high PCI addresses that aren't mapped in the
18846 + * 640k-1MB IO memory area on PC's
18847 + *
18848 + * (C) Copyright 1995 1996 Linus Torvalds
18849 + */
18850 +
18851 +#include <linux/vmalloc.h>
18852 +#include <linux/init.h>
18853 +#include <linux/slab.h>
18854 +#include <linux/module.h>
18855 +#include <asm/io.h>
18856 +#include <asm/fixmap.h>
18857 +#include <asm/cacheflush.h>
18858 +#include <asm/tlbflush.h>
18859 +#include <asm/pgtable.h>
18860 +#include <asm/pgalloc.h>
18861 +
18862 +#define ISA_START_ADDRESS 0x0
18863 +#define ISA_END_ADDRESS 0x100000
18864 +
18865 +static int direct_remap_area_pte_fn(pte_t *pte,
18866 + struct page *pmd_page,
18867 + unsigned long address,
18868 + void *data)
18869 +{
18870 + mmu_update_t **v = (mmu_update_t **)data;
18871 +
18872 + BUG_ON(!pte_none(*pte));
18873 +
18874 + (*v)->ptr = ((u64)pfn_to_mfn(page_to_pfn(pmd_page)) <<
18875 + PAGE_SHIFT) | ((unsigned long)pte & ~PAGE_MASK);
18876 + (*v)++;
18877 +
18878 + return 0;
18879 +}
18880 +
18881 +static int __direct_remap_pfn_range(struct mm_struct *mm,
18882 + unsigned long address,
18883 + unsigned long mfn,
18884 + unsigned long size,
18885 + pgprot_t prot,
18886 + domid_t domid)
18887 +{
18888 + int rc;
18889 + unsigned long i, start_address;
18890 + mmu_update_t *u, *v, *w;
18891 +
18892 + u = v = w = (mmu_update_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT);
18893 + if (u == NULL)
18894 + return -ENOMEM;
18895 +
18896 + start_address = address;
18897 +
18898 + flush_cache_all();
18899 +
18900 + for (i = 0; i < size; i += PAGE_SIZE) {
18901 + if ((v - u) == (PAGE_SIZE / sizeof(mmu_update_t))) {
18902 + /* Flush a full batch after filling in the PTE ptrs. */
18903 + rc = apply_to_page_range(mm, start_address,
18904 + address - start_address,
18905 + direct_remap_area_pte_fn, &w);
18906 + if (rc)
18907 + goto out;
18908 + rc = -EFAULT;
18909 + if (HYPERVISOR_mmu_update(u, v - u, NULL, domid) < 0)
18910 + goto out;
18911 + v = w = u;
18912 + start_address = address;
18913 + }
18914 +
18915 + /*
18916 + * Fill in the machine address: PTE ptr is done later by
18917 + * __direct_remap_area_pages().
18918 + */
18919 + v->val = pte_val_ma(pfn_pte_ma(mfn, prot));
18920 +
18921 + mfn++;
18922 + address += PAGE_SIZE;
18923 + v++;
18924 + }
18925 +
18926 + if (v != u) {
18927 + /* Final batch. */
18928 + rc = apply_to_page_range(mm, start_address,
18929 + address - start_address,
18930 + direct_remap_area_pte_fn, &w);
18931 + if (rc)
18932 + goto out;
18933 + rc = -EFAULT;
18934 + if (unlikely(HYPERVISOR_mmu_update(u, v - u, NULL, domid) < 0))
18935 + goto out;
18936 + }
18937 +
18938 + rc = 0;
18939 +
18940 + out:
18941 + flush_tlb_all();
18942 +
18943 + free_page((unsigned long)u);
18944 +
18945 + return rc;
18946 +}
18947 +
18948 +int direct_remap_pfn_range(struct vm_area_struct *vma,
18949 + unsigned long address,
18950 + unsigned long mfn,
18951 + unsigned long size,
18952 + pgprot_t prot,
18953 + domid_t domid)
18954 +{
18955 + if (xen_feature(XENFEAT_auto_translated_physmap))
18956 + return remap_pfn_range(vma, address, mfn, size, prot);
18957 +
18958 + if (domid == DOMID_SELF)
18959 + return -EINVAL;
18960 +
18961 + vma->vm_flags |= VM_IO | VM_RESERVED;
18962 +
18963 + vma->vm_mm->context.has_foreign_mappings = 1;
18964 +
18965 + return __direct_remap_pfn_range(
18966 + vma->vm_mm, address, mfn, size, prot, domid);
18967 +}
18968 +EXPORT_SYMBOL(direct_remap_pfn_range);
18969 +
18970 +int direct_kernel_remap_pfn_range(unsigned long address,
18971 + unsigned long mfn,
18972 + unsigned long size,
18973 + pgprot_t prot,
18974 + domid_t domid)
18975 +{
18976 + return __direct_remap_pfn_range(
18977 + &init_mm, address, mfn, size, prot, domid);
18978 +}
18979 +EXPORT_SYMBOL(direct_kernel_remap_pfn_range);
18980 +
18981 +static int lookup_pte_fn(
18982 + pte_t *pte, struct page *pmd_page, unsigned long addr, void *data)
18983 +{
18984 + uint64_t *ptep = (uint64_t *)data;
18985 + if (ptep)
18986 + *ptep = ((uint64_t)pfn_to_mfn(page_to_pfn(pmd_page)) <<
18987 + PAGE_SHIFT) | ((unsigned long)pte & ~PAGE_MASK);
18988 + return 0;
18989 +}
18990 +
18991 +int create_lookup_pte_addr(struct mm_struct *mm,
18992 + unsigned long address,
18993 + uint64_t *ptep)
18994 +{
18995 + return apply_to_page_range(mm, address, PAGE_SIZE,
18996 + lookup_pte_fn, ptep);
18997 +}
18998 +
18999 +EXPORT_SYMBOL(create_lookup_pte_addr);
19000 +
19001 +static int noop_fn(
19002 + pte_t *pte, struct page *pmd_page, unsigned long addr, void *data)
19003 +{
19004 + return 0;
19005 +}
19006 +
19007 +int touch_pte_range(struct mm_struct *mm,
19008 + unsigned long address,
19009 + unsigned long size)
19010 +{
19011 + return apply_to_page_range(mm, address, size, noop_fn, NULL);
19012 +}
19013 +
19014 +EXPORT_SYMBOL(touch_pte_range);
19015 +
19016 +/*
19017 + * Does @address reside within a non-highmem page that is local to this virtual
19018 + * machine (i.e., not an I/O page, nor a memory page belonging to another VM).
19019 + * See the comment that accompanies mfn_to_local_pfn() in page.h to understand
19020 + * why this works.
19021 + */
19022 +static inline int is_local_lowmem(unsigned long address)
19023 +{
19024 + extern unsigned long max_low_pfn;
19025 + return (mfn_to_local_pfn(address >> PAGE_SHIFT) < max_low_pfn);
19026 +}
19027 +
19028 +/*
19029 + * Generic mapping function (not visible outside):
19030 + */
19031 +
19032 +/*
19033 + * Remap an arbitrary physical address space into the kernel virtual
19034 + * address space. Needed when the kernel wants to access high addresses
19035 + * directly.
19036 + *
19037 + * NOTE! We need to allow non-page-aligned mappings too: we will obviously
19038 + * have to convert them into an offset in a page-aligned mapping, but the
19039 + * caller shouldn't need to know that small detail.
19040 + */
19041 +void __iomem * __ioremap(unsigned long phys_addr, unsigned long size, unsigned long flags)
19042 +{
19043 + void __iomem * addr;
19044 + struct vm_struct * area;
19045 + unsigned long offset, last_addr;
19046 + domid_t domid = DOMID_IO;
19047 +
19048 + /* Don't allow wraparound or zero size */
19049 + last_addr = phys_addr + size - 1;
19050 + if (!size || last_addr < phys_addr)
19051 + return NULL;
19052 +
19053 + /*
19054 + * Don't remap the low PCI/ISA area, it's always mapped..
19055 + */
19056 + if (is_initial_xendomain() &&
19057 + phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS)
19058 + return (void __iomem *) isa_bus_to_virt(phys_addr);
19059 +
19060 + /*
19061 + * Don't allow anybody to remap normal RAM that we're using..
19062 + */
19063 + if (is_local_lowmem(phys_addr)) {
19064 + char *t_addr, *t_end;
19065 + struct page *page;
19066 +
19067 + t_addr = bus_to_virt(phys_addr);
19068 + t_end = t_addr + (size - 1);
19069 +
19070 + for(page = virt_to_page(t_addr); page <= virt_to_page(t_end); page++)
19071 + if(!PageReserved(page))
19072 + return NULL;
19073 +
19074 + domid = DOMID_SELF;
19075 + }
19076 +
19077 + /*
19078 + * Mappings have to be page-aligned
19079 + */
19080 + offset = phys_addr & ~PAGE_MASK;
19081 + phys_addr &= PAGE_MASK;
19082 + size = PAGE_ALIGN(last_addr+1) - phys_addr;
19083 +
19084 + /*
19085 + * Ok, go for it..
19086 + */
19087 + area = get_vm_area(size, VM_IOREMAP | (flags << 20));
19088 + if (!area)
19089 + return NULL;
19090 + area->phys_addr = phys_addr;
19091 + addr = (void __iomem *) area->addr;
19092 + flags |= _KERNPG_TABLE;
19093 + if (__direct_remap_pfn_range(&init_mm, (unsigned long)addr,
19094 + phys_addr>>PAGE_SHIFT,
19095 + size, __pgprot(flags), domid)) {
19096 + vunmap((void __force *) addr);
19097 + return NULL;
19098 + }
19099 + return (void __iomem *) (offset + (char __iomem *)addr);
19100 +}
19101 +EXPORT_SYMBOL(__ioremap);
19102 +
19103 +/**
19104 + * ioremap_nocache - map bus memory into CPU space
19105 + * @offset: bus address of the memory
19106 + * @size: size of the resource to map
19107 + *
19108 + * ioremap_nocache performs a platform specific sequence of operations to
19109 + * make bus memory CPU accessible via the readb/readw/readl/writeb/
19110 + * writew/writel functions and the other mmio helpers. The returned
19111 + * address is not guaranteed to be usable directly as a virtual
19112 + * address.
19113 + *
19114 + * This version of ioremap ensures that the memory is marked uncachable
19115 + * on the CPU as well as honouring existing caching rules from things like
19116 + * the PCI bus. Note that there are other caches and buffers on many
19117 + * busses. In particular driver authors should read up on PCI writes
19118 + *
19119 + * It's useful if some control registers are in such an area and
19120 + * write combining or read caching is not desirable:
19121 + *
19122 + * Must be freed with iounmap.
19123 + */
19124 +
19125 +void __iomem *ioremap_nocache (unsigned long phys_addr, unsigned long size)
19126 +{
19127 + unsigned long last_addr;
19128 + void __iomem *p = __ioremap(phys_addr, size, _PAGE_PCD);
19129 + if (!p)
19130 + return p;
19131 +
19132 + /* Guaranteed to be > phys_addr, as per __ioremap() */
19133 + last_addr = phys_addr + size - 1;
19134 +
19135 + if (is_local_lowmem(last_addr)) {
19136 + struct page *ppage = virt_to_page(bus_to_virt(phys_addr));
19137 + unsigned long npages;
19138 +
19139 + phys_addr &= PAGE_MASK;
19140 +
19141 + /* This might overflow and become zero.. */
19142 + last_addr = PAGE_ALIGN(last_addr);
19143 +
19144 + /* .. but that's ok, because modulo-2**n arithmetic will make
19145 + * the page-aligned "last - first" come out right.
19146 + */
19147 + npages = (last_addr - phys_addr) >> PAGE_SHIFT;
19148 +
19149 + if (change_page_attr(ppage, npages, PAGE_KERNEL_NOCACHE) < 0) {
19150 + iounmap(p);
19151 + p = NULL;
19152 + }
19153 + global_flush_tlb();
19154 + }
19155 +
19156 + return p;
19157 +}
19158 +EXPORT_SYMBOL(ioremap_nocache);
19159 +
19160 +/**
19161 + * iounmap - Free an IO remapping
19162 + * @addr: virtual address from ioremap_*
19163 + *
19164 + * Caller must ensure there is only one unmapping for the same pointer.
19165 + */
19166 +void iounmap(volatile void __iomem *addr)
19167 +{
19168 + struct vm_struct *p, *o;
19169 +
19170 + if ((void __force *)addr <= high_memory)
19171 + return;
19172 +
19173 + /*
19174 + * __ioremap special-cases the PCI/ISA range by not instantiating a
19175 + * vm_area and by simply returning an address into the kernel mapping
19176 + * of ISA space. So handle that here.
19177 + */
19178 + if ((unsigned long) addr >= fix_to_virt(FIX_ISAMAP_BEGIN))
19179 + return;
19180 +
19181 + addr = (volatile void __iomem *)(PAGE_MASK & (unsigned long __force)addr);
19182 +
19183 + /* Use the vm area unlocked, assuming the caller
19184 + ensures there isn't another iounmap for the same address
19185 + in parallel. Reuse of the virtual address is prevented by
19186 + leaving it in the global lists until we're done with it.
19187 + cpa takes care of the direct mappings. */
19188 + read_lock(&vmlist_lock);
19189 + for (p = vmlist; p; p = p->next) {
19190 + if (p->addr == addr)
19191 + break;
19192 + }
19193 + read_unlock(&vmlist_lock);
19194 +
19195 + if (!p) {
19196 + printk("iounmap: bad address %p\n", addr);
19197 + dump_stack();
19198 + return;
19199 + }
19200 +
19201 + /* Reset the direct mapping. Can block */
19202 + if ((p->flags >> 20) && is_local_lowmem(p->phys_addr)) {
19203 + /* p->size includes the guard page, but cpa doesn't like that */
19204 + change_page_attr(virt_to_page(bus_to_virt(p->phys_addr)),
19205 + (p->size - PAGE_SIZE) >> PAGE_SHIFT,
19206 + PAGE_KERNEL);
19207 + global_flush_tlb();
19208 + }
19209 +
19210 + /* Finally remove it */
19211 + o = remove_vm_area((void *)addr);
19212 + BUG_ON(p != o || o == NULL);
19213 + kfree(p);
19214 +}
19215 +EXPORT_SYMBOL(iounmap);
19216 +
19217 +#ifdef __i386__
19218 +
19219 +void __init *bt_ioremap(unsigned long phys_addr, unsigned long size)
19220 +{
19221 + unsigned long offset, last_addr;
19222 + unsigned int nrpages;
19223 + enum fixed_addresses idx;
19224 +
19225 + /* Don't allow wraparound or zero size */
19226 + last_addr = phys_addr + size - 1;
19227 + if (!size || last_addr < phys_addr)
19228 + return NULL;
19229 +
19230 + /*
19231 + * Don't remap the low PCI/ISA area, it's always mapped..
19232 + */
19233 + if (is_initial_xendomain() &&
19234 + phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS)
19235 + return isa_bus_to_virt(phys_addr);
19236 +
19237 + /*
19238 + * Mappings have to be page-aligned
19239 + */
19240 + offset = phys_addr & ~PAGE_MASK;
19241 + phys_addr &= PAGE_MASK;
19242 + size = PAGE_ALIGN(last_addr) - phys_addr;
19243 +
19244 + /*
19245 + * Mappings have to fit in the FIX_BTMAP area.
19246 + */
19247 + nrpages = size >> PAGE_SHIFT;
19248 + if (nrpages > NR_FIX_BTMAPS)
19249 + return NULL;
19250 +
19251 + /*
19252 + * Ok, go for it..
19253 + */
19254 + idx = FIX_BTMAP_BEGIN;
19255 + while (nrpages > 0) {
19256 + set_fixmap(idx, phys_addr);
19257 + phys_addr += PAGE_SIZE;
19258 + --idx;
19259 + --nrpages;
19260 + }
19261 + return (void*) (offset + fix_to_virt(FIX_BTMAP_BEGIN));
19262 +}
19263 +
19264 +void __init bt_iounmap(void *addr, unsigned long size)
19265 +{
19266 + unsigned long virt_addr;
19267 + unsigned long offset;
19268 + unsigned int nrpages;
19269 + enum fixed_addresses idx;
19270 +
19271 + virt_addr = (unsigned long)addr;
19272 + if (virt_addr < fix_to_virt(FIX_BTMAP_BEGIN))
19273 + return;
19274 + if (virt_addr >= fix_to_virt(FIX_ISAMAP_BEGIN))
19275 + return;
19276 + offset = virt_addr & ~PAGE_MASK;
19277 + nrpages = PAGE_ALIGN(offset + size - 1) >> PAGE_SHIFT;
19278 +
19279 + idx = FIX_BTMAP_BEGIN;
19280 + while (nrpages > 0) {
19281 + clear_fixmap(idx);
19282 + --idx;
19283 + --nrpages;
19284 + }
19285 +}
19286 +
19287 +#endif /* __i386__ */
19288 diff -Nur linux-2.6.16.33-noxen/arch/i386/mm/pageattr.c linux-2.6.16.33/arch/i386/mm/pageattr.c
19289 --- linux-2.6.16.33-noxen/arch/i386/mm/pageattr.c 2006-11-22 18:06:31.000000000 +0000
19290 +++ linux-2.6.16.33/arch/i386/mm/pageattr.c 2007-05-23 21:00:01.000000000 +0000
19291 @@ -78,7 +78,7 @@
19292 unsigned long flags;
19293
19294 set_pte_atomic(kpte, pte); /* change init_mm */
19295 - if (PTRS_PER_PMD > 1)
19296 + if (HAVE_SHARED_KERNEL_PMD)
19297 return;
19298
19299 spin_lock_irqsave(&pgd_lock, flags);
19300 diff -Nur linux-2.6.16.33-noxen/arch/i386/mm/pgtable-xen.c linux-2.6.16.33/arch/i386/mm/pgtable-xen.c
19301 --- linux-2.6.16.33-noxen/arch/i386/mm/pgtable-xen.c 1970-01-01 00:00:00.000000000 +0000
19302 +++ linux-2.6.16.33/arch/i386/mm/pgtable-xen.c 2007-01-08 15:00:45.000000000 +0000
19303 @@ -0,0 +1,707 @@
19304 +/*
19305 + * linux/arch/i386/mm/pgtable.c
19306 + */
19307 +
19308 +#include <linux/config.h>
19309 +#include <linux/sched.h>
19310 +#include <linux/kernel.h>
19311 +#include <linux/errno.h>
19312 +#include <linux/mm.h>
19313 +#include <linux/swap.h>
19314 +#include <linux/smp.h>
19315 +#include <linux/highmem.h>
19316 +#include <linux/slab.h>
19317 +#include <linux/pagemap.h>
19318 +#include <linux/spinlock.h>
19319 +#include <linux/module.h>
19320 +
19321 +#include <asm/system.h>
19322 +#include <asm/pgtable.h>
19323 +#include <asm/pgalloc.h>
19324 +#include <asm/fixmap.h>
19325 +#include <asm/e820.h>
19326 +#include <asm/tlb.h>
19327 +#include <asm/tlbflush.h>
19328 +#include <asm/io.h>
19329 +#include <asm/mmu_context.h>
19330 +
19331 +#include <xen/features.h>
19332 +#include <xen/foreign_page.h>
19333 +#include <asm/hypervisor.h>
19334 +
19335 +static void pgd_test_and_unpin(pgd_t *pgd);
19336 +
19337 +void show_mem(void)
19338 +{
19339 + int total = 0, reserved = 0;
19340 + int shared = 0, cached = 0;
19341 + int highmem = 0;
19342 + struct page *page;
19343 + pg_data_t *pgdat;
19344 + unsigned long i;
19345 + struct page_state ps;
19346 + unsigned long flags;
19347 +
19348 + printk(KERN_INFO "Mem-info:\n");
19349 + show_free_areas();
19350 + printk(KERN_INFO "Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
19351 + for_each_pgdat(pgdat) {
19352 + pgdat_resize_lock(pgdat, &flags);
19353 + for (i = 0; i < pgdat->node_spanned_pages; ++i) {
19354 + page = pgdat_page_nr(pgdat, i);
19355 + total++;
19356 + if (PageHighMem(page))
19357 + highmem++;
19358 + if (PageReserved(page))
19359 + reserved++;
19360 + else if (PageSwapCache(page))
19361 + cached++;
19362 + else if (page_count(page))
19363 + shared += page_count(page) - 1;
19364 + }
19365 + pgdat_resize_unlock(pgdat, &flags);
19366 + }
19367 + printk(KERN_INFO "%d pages of RAM\n", total);
19368 + printk(KERN_INFO "%d pages of HIGHMEM\n", highmem);
19369 + printk(KERN_INFO "%d reserved pages\n", reserved);
19370 + printk(KERN_INFO "%d pages shared\n", shared);
19371 + printk(KERN_INFO "%d pages swap cached\n", cached);
19372 +
19373 + get_page_state(&ps);
19374 + printk(KERN_INFO "%lu pages dirty\n", ps.nr_dirty);
19375 + printk(KERN_INFO "%lu pages writeback\n", ps.nr_writeback);
19376 + printk(KERN_INFO "%lu pages mapped\n", ps.nr_mapped);
19377 + printk(KERN_INFO "%lu pages slab\n", ps.nr_slab);
19378 + printk(KERN_INFO "%lu pages pagetables\n", ps.nr_page_table_pages);
19379 +}
19380 +
19381 +/*
19382 + * Associate a virtual page frame with a given physical page frame
19383 + * and protection flags for that frame.
19384 + */
19385 +static void set_pte_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
19386 +{
19387 + pgd_t *pgd;
19388 + pud_t *pud;
19389 + pmd_t *pmd;
19390 + pte_t *pte;
19391 +
19392 + pgd = swapper_pg_dir + pgd_index(vaddr);
19393 + if (pgd_none(*pgd)) {
19394 + BUG();
19395 + return;
19396 + }
19397 + pud = pud_offset(pgd, vaddr);
19398 + if (pud_none(*pud)) {
19399 + BUG();
19400 + return;
19401 + }
19402 + pmd = pmd_offset(pud, vaddr);
19403 + if (pmd_none(*pmd)) {
19404 + BUG();
19405 + return;
19406 + }
19407 + pte = pte_offset_kernel(pmd, vaddr);
19408 + if (pgprot_val(flags))
19409 + /* <pfn,flags> stored as-is, to permit clearing entries */
19410 + set_pte(pte, pfn_pte(pfn, flags));
19411 + else
19412 + pte_clear(&init_mm, vaddr, pte);
19413 +
19414 + /*
19415 + * It's enough to flush this one mapping.
19416 + * (PGE mappings get flushed as well)
19417 + */
19418 + __flush_tlb_one(vaddr);
19419 +}
19420 +
19421 +/*
19422 + * Associate a virtual page frame with a given physical page frame
19423 + * and protection flags for that frame.
19424 + */
19425 +static void set_pte_pfn_ma(unsigned long vaddr, unsigned long pfn,
19426 + pgprot_t flags)
19427 +{
19428 + pgd_t *pgd;
19429 + pud_t *pud;
19430 + pmd_t *pmd;
19431 + pte_t *pte;
19432 +
19433 + pgd = swapper_pg_dir + pgd_index(vaddr);
19434 + if (pgd_none(*pgd)) {
19435 + BUG();
19436 + return;
19437 + }
19438 + pud = pud_offset(pgd, vaddr);
19439 + if (pud_none(*pud)) {
19440 + BUG();
19441 + return;
19442 + }
19443 + pmd = pmd_offset(pud, vaddr);
19444 + if (pmd_none(*pmd)) {
19445 + BUG();
19446 + return;
19447 + }
19448 + pte = pte_offset_kernel(pmd, vaddr);
19449 + if (pgprot_val(flags))
19450 + /* <pfn,flags> stored as-is, to permit clearing entries */
19451 + set_pte(pte, pfn_pte_ma(pfn, flags));
19452 + else
19453 + pte_clear(&init_mm, vaddr, pte);
19454 +
19455 + /*
19456 + * It's enough to flush this one mapping.
19457 + * (PGE mappings get flushed as well)
19458 + */
19459 + __flush_tlb_one(vaddr);
19460 +}
19461 +
19462 +/*
19463 + * Associate a large virtual page frame with a given physical page frame
19464 + * and protection flags for that frame. pfn is for the base of the page,
19465 + * vaddr is what the page gets mapped to - both must be properly aligned.
19466 + * The pmd must already be instantiated. Assumes PAE mode.
19467 + */
19468 +void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
19469 +{
19470 + pgd_t *pgd;
19471 + pud_t *pud;
19472 + pmd_t *pmd;
19473 +
19474 + if (vaddr & (PMD_SIZE-1)) { /* vaddr is misaligned */
19475 + printk(KERN_WARNING "set_pmd_pfn: vaddr misaligned\n");
19476 + return; /* BUG(); */
19477 + }
19478 + if (pfn & (PTRS_PER_PTE-1)) { /* pfn is misaligned */
19479 + printk(KERN_WARNING "set_pmd_pfn: pfn misaligned\n");
19480 + return; /* BUG(); */
19481 + }
19482 + pgd = swapper_pg_dir + pgd_index(vaddr);
19483 + if (pgd_none(*pgd)) {
19484 + printk(KERN_WARNING "set_pmd_pfn: pgd_none\n");
19485 + return; /* BUG(); */
19486 + }
19487 + pud = pud_offset(pgd, vaddr);
19488 + pmd = pmd_offset(pud, vaddr);
19489 + set_pmd(pmd, pfn_pmd(pfn, flags));
19490 + /*
19491 + * It's enough to flush this one mapping.
19492 + * (PGE mappings get flushed as well)
19493 + */
19494 + __flush_tlb_one(vaddr);
19495 +}
19496 +
19497 +static int nr_fixmaps = 0;
19498 +unsigned long hypervisor_virt_start = HYPERVISOR_VIRT_START;
19499 +unsigned long __FIXADDR_TOP = (HYPERVISOR_VIRT_START - 2 * PAGE_SIZE);
19500 +EXPORT_SYMBOL(__FIXADDR_TOP);
19501 +
19502 +void __init set_fixaddr_top()
19503 +{
19504 + BUG_ON(nr_fixmaps > 0);
19505 + __FIXADDR_TOP = hypervisor_virt_start - 2 * PAGE_SIZE;
19506 +}
19507 +
19508 +void __set_fixmap (enum fixed_addresses idx, maddr_t phys, pgprot_t flags)
19509 +{
19510 + unsigned long address = __fix_to_virt(idx);
19511 +
19512 + if (idx >= __end_of_fixed_addresses) {
19513 + BUG();
19514 + return;
19515 + }
19516 + switch (idx) {
19517 + case FIX_WP_TEST:
19518 +#ifdef CONFIG_X86_F00F_BUG
19519 + case FIX_F00F_IDT:
19520 +#endif
19521 + set_pte_pfn(address, phys >> PAGE_SHIFT, flags);
19522 + break;
19523 + default:
19524 + set_pte_pfn_ma(address, phys >> PAGE_SHIFT, flags);
19525 + break;
19526 + }
19527 + nr_fixmaps++;
19528 +}
19529 +
19530 +pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
19531 +{
19532 + pte_t *pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO);
19533 + if (pte)
19534 + make_lowmem_page_readonly(pte, XENFEAT_writable_page_tables);
19535 + return pte;
19536 +}
19537 +
19538 +struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
19539 +{
19540 + struct page *pte;
19541 +
19542 +#ifdef CONFIG_HIGHPTE
19543 + pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0);
19544 +#else
19545 + pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
19546 + if (pte) {
19547 + SetPageForeign(pte, pte_free);
19548 + set_page_count(pte, 1);
19549 + }
19550 +#endif
19551 + return pte;
19552 +}
19553 +
19554 +void pte_free(struct page *pte)
19555 +{
19556 + unsigned long va = (unsigned long)__va(page_to_pfn(pte)<<PAGE_SHIFT);
19557 +
19558 + if (!pte_write(*virt_to_ptep(va)))
19559 + BUG_ON(HYPERVISOR_update_va_mapping(
19560 + va, pfn_pte(page_to_pfn(pte), PAGE_KERNEL), 0));
19561 +
19562 + ClearPageForeign(pte);
19563 + set_page_count(pte, 1);
19564 +
19565 + __free_page(pte);
19566 +}
19567 +
19568 +void pmd_ctor(void *pmd, kmem_cache_t *cache, unsigned long flags)
19569 +{
19570 + memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t));
19571 +}
19572 +
19573 +/*
19574 + * List of all pgd's needed for non-PAE so it can invalidate entries
19575 + * in both cached and uncached pgd's; not needed for PAE since the
19576 + * kernel pmd is shared. If PAE were not to share the pmd a similar
19577 + * tactic would be needed. This is essentially codepath-based locking
19578 + * against pageattr.c; it is the unique case in which a valid change
19579 + * of kernel pagetables can't be lazily synchronized by vmalloc faults.
19580 + * vmalloc faults work because attached pagetables are never freed.
19581 + * The locking scheme was chosen on the basis of manfred's
19582 + * recommendations and having no core impact whatsoever.
19583 + * -- wli
19584 + */
19585 +DEFINE_SPINLOCK(pgd_lock);
19586 +struct page *pgd_list;
19587 +
19588 +static inline void pgd_list_add(pgd_t *pgd)
19589 +{
19590 + struct page *page = virt_to_page(pgd);
19591 + page->index = (unsigned long)pgd_list;
19592 + if (pgd_list)
19593 + set_page_private(pgd_list, (unsigned long)&page->index);
19594 + pgd_list = page;
19595 + set_page_private(page, (unsigned long)&pgd_list);
19596 +}
19597 +
19598 +static inline void pgd_list_del(pgd_t *pgd)
19599 +{
19600 + struct page *next, **pprev, *page = virt_to_page(pgd);
19601 + next = (struct page *)page->index;
19602 + pprev = (struct page **)page_private(page);
19603 + *pprev = next;
19604 + if (next)
19605 + set_page_private(next, (unsigned long)pprev);
19606 +}
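/*
 * Illustrative sketch (not part of the patch): pgd_list_add()/pgd_list_del()
 * above keep a singly linked list in which each page also records the
 * address of the pointer that points at it (page->index holds the next
 * page, page_private() holds &prev->index or &pgd_list), so unlinking is
 * O(1) without walking from the head.  The toy structure below, with made-up
 * names, redoes the same linkage with ordinary fields so it can be run and
 * checked in user space.
 */
#include <assert.h>
#include <stddef.h>

struct node {
	struct node *next;	/* plays the role of page->index        */
	struct node **pprev;	/* plays the role of page_private(page) */
};

static struct node *list_head;	/* plays the role of pgd_list */

static void node_add(struct node *n)
{
	n->next = list_head;
	if (list_head)
		list_head->pprev = &n->next;
	list_head = n;
	n->pprev = &list_head;
}

static void node_del(struct node *n)
{
	*n->pprev = n->next;		/* unlink without knowing the head */
	if (n->next)
		n->next->pprev = n->pprev;
}

int main(void)
{
	struct node a = { 0 }, b = { 0 };

	node_add(&a);
	node_add(&b);			/* list: b -> a */
	node_del(&b);			/* O(1): head points at a again */
	assert(list_head == &a);
	node_del(&a);
	assert(list_head == NULL);
	return 0;
}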
19607 +
19608 +void pgd_ctor(void *pgd, kmem_cache_t *cache, unsigned long unused)
19609 +{
19610 + unsigned long flags;
19611 +
19612 + if (PTRS_PER_PMD > 1) {
19613 + if (HAVE_SHARED_KERNEL_PMD)
19614 + clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD,
19615 + swapper_pg_dir + USER_PTRS_PER_PGD,
19616 + KERNEL_PGD_PTRS);
19617 + } else {
19618 + spin_lock_irqsave(&pgd_lock, flags);
19619 + clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD,
19620 + swapper_pg_dir + USER_PTRS_PER_PGD,
19621 + KERNEL_PGD_PTRS);
19622 + memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
19623 + pgd_list_add(pgd);
19624 + spin_unlock_irqrestore(&pgd_lock, flags);
19625 + }
19626 +}
19627 +
19628 +/* never called when PTRS_PER_PMD > 1 */
19629 +void pgd_dtor(void *pgd, kmem_cache_t *cache, unsigned long unused)
19630 +{
19631 + unsigned long flags; /* can be called from interrupt context */
19632 +
19633 + spin_lock_irqsave(&pgd_lock, flags);
19634 + pgd_list_del(pgd);
19635 + spin_unlock_irqrestore(&pgd_lock, flags);
19636 +
19637 + pgd_test_and_unpin(pgd);
19638 +}
19639 +
19640 +pgd_t *pgd_alloc(struct mm_struct *mm)
19641 +{
19642 + int i;
19643 + pgd_t *pgd = kmem_cache_alloc(pgd_cache, GFP_KERNEL);
19644 + pmd_t **pmd;
19645 + unsigned long flags;
19646 +
19647 + pgd_test_and_unpin(pgd);
19648 +
19649 + if (PTRS_PER_PMD == 1 || !pgd)
19650 + return pgd;
19651 +
19652 + if (HAVE_SHARED_KERNEL_PMD) {
19653 + for (i = 0; i < USER_PTRS_PER_PGD; ++i) {
19654 + pmd_t *pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL);
19655 + if (!pmd)
19656 + goto out_oom;
19657 + set_pgd(&pgd[i], __pgd(1 + __pa(pmd)));
19658 + }
19659 + return pgd;
19660 + }
19661 +
19662 + /*
19663 + * We can race save/restore (if we sleep during a GFP_KERNEL memory
19664 + * allocation). We therefore store virtual addresses of pmds as they
19665 + * do not change across save/restore, and poke the machine addresses
19666 + * into the pgdir under the pgd_lock.
19667 + */
19668 + pmd = kmalloc(PTRS_PER_PGD * sizeof(pmd_t *), GFP_KERNEL);
19669 + if (!pmd) {
19670 + kmem_cache_free(pgd_cache, pgd);
19671 + return NULL;
19672 + }
19673 +
19674 + /* Allocate pmds, remember virtual addresses. */
19675 + for (i = 0; i < PTRS_PER_PGD; ++i) {
19676 + pmd[i] = kmem_cache_alloc(pmd_cache, GFP_KERNEL);
19677 + if (!pmd[i])
19678 + goto out_oom;
19679 + }
19680 +
19681 + spin_lock_irqsave(&pgd_lock, flags);
19682 +
19683 + /* Protect against save/restore: move below 4GB under pgd_lock. */
19684 + if (!xen_feature(XENFEAT_pae_pgdir_above_4gb)) {
19685 + int rc = xen_create_contiguous_region(
19686 + (unsigned long)pgd, 0, 32);
19687 + if (rc) {
19688 + spin_unlock_irqrestore(&pgd_lock, flags);
19689 + goto out_oom;
19690 + }
19691 + }
19692 +
19693 + /* Copy kernel pmd contents and write-protect the new pmds. */
19694 + for (i = USER_PTRS_PER_PGD; i < PTRS_PER_PGD; i++) {
19695 + unsigned long v = (unsigned long)i << PGDIR_SHIFT;
19696 + pgd_t *kpgd = pgd_offset_k(v);
19697 + pud_t *kpud = pud_offset(kpgd, v);
19698 + pmd_t *kpmd = pmd_offset(kpud, v);
19699 + memcpy(pmd[i], kpmd, PAGE_SIZE);
19700 + make_lowmem_page_readonly(
19701 + pmd[i], XENFEAT_writable_page_tables);
19702 + }
19703 +
19704 +	/* It is safe to poke machine addresses of pmds under the pgd_lock. */
19705 + for (i = 0; i < PTRS_PER_PGD; i++)
19706 + set_pgd(&pgd[i], __pgd(1 + __pa(pmd[i])));
19707 +
19708 + /* Ensure this pgd gets picked up and pinned on save/restore. */
19709 + pgd_list_add(pgd);
19710 +
19711 + spin_unlock_irqrestore(&pgd_lock, flags);
19712 +
19713 + kfree(pmd);
19714 +
19715 + return pgd;
19716 +
19717 +out_oom:
19718 + if (HAVE_SHARED_KERNEL_PMD) {
19719 + for (i--; i >= 0; i--)
19720 + kmem_cache_free(pmd_cache,
19721 + (void *)__va(pgd_val(pgd[i])-1));
19722 + } else {
19723 + for (i--; i >= 0; i--)
19724 + kmem_cache_free(pmd_cache, pmd[i]);
19725 + kfree(pmd);
19726 + }
19727 + kmem_cache_free(pgd_cache, pgd);
19728 + return NULL;
19729 +}
19730 +
19731 +void pgd_free(pgd_t *pgd)
19732 +{
19733 + int i;
19734 +
19735 + /*
19736 + * After this the pgd should not be pinned for the duration of this
19737 + * function's execution. We should never sleep and thus never race:
19738 + * 1. User pmds will not become write-protected under our feet due
19739 + * to a concurrent mm_pin_all().
19740 + * 2. The machine addresses in PGD entries will not become invalid
19741 + * due to a concurrent save/restore.
19742 + */
19743 + pgd_test_and_unpin(pgd);
19744 +
19745 + /* in the PAE case user pgd entries are overwritten before usage */
19746 + if (PTRS_PER_PMD > 1) {
19747 + for (i = 0; i < USER_PTRS_PER_PGD; ++i) {
19748 + pmd_t *pmd = (void *)__va(pgd_val(pgd[i])-1);
19749 + kmem_cache_free(pmd_cache, pmd);
19750 + }
19751 +
19752 + if (!HAVE_SHARED_KERNEL_PMD) {
19753 + unsigned long flags;
19754 + spin_lock_irqsave(&pgd_lock, flags);
19755 + pgd_list_del(pgd);
19756 + spin_unlock_irqrestore(&pgd_lock, flags);
19757 +
19758 + for (i = USER_PTRS_PER_PGD; i < PTRS_PER_PGD; i++) {
19759 + pmd_t *pmd = (void *)__va(pgd_val(pgd[i])-1);
19760 + make_lowmem_page_writable(
19761 + pmd, XENFEAT_writable_page_tables);
19762 + memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t));
19763 + kmem_cache_free(pmd_cache, pmd);
19764 + }
19765 +
19766 + if (!xen_feature(XENFEAT_pae_pgdir_above_4gb))
19767 + xen_destroy_contiguous_region(
19768 + (unsigned long)pgd, 0);
19769 + }
19770 + }
19771 +
19772 + /* in the non-PAE case, free_pgtables() clears user pgd entries */
19773 + kmem_cache_free(pgd_cache, pgd);
19774 +}
19775 +
19776 +void make_lowmem_page_readonly(void *va, unsigned int feature)
19777 +{
19778 + pte_t *pte;
19779 + int rc;
19780 +
19781 + if (xen_feature(feature))
19782 + return;
19783 +
19784 + pte = virt_to_ptep(va);
19785 + rc = HYPERVISOR_update_va_mapping(
19786 + (unsigned long)va, pte_wrprotect(*pte), 0);
19787 + BUG_ON(rc);
19788 +}
19789 +
19790 +void make_lowmem_page_writable(void *va, unsigned int feature)
19791 +{
19792 + pte_t *pte;
19793 + int rc;
19794 +
19795 + if (xen_feature(feature))
19796 + return;
19797 +
19798 + pte = virt_to_ptep(va);
19799 + rc = HYPERVISOR_update_va_mapping(
19800 + (unsigned long)va, pte_mkwrite(*pte), 0);
19801 + BUG_ON(rc);
19802 +}
19803 +
19804 +void make_page_readonly(void *va, unsigned int feature)
19805 +{
19806 + pte_t *pte;
19807 + int rc;
19808 +
19809 + if (xen_feature(feature))
19810 + return;
19811 +
19812 + pte = virt_to_ptep(va);
19813 + rc = HYPERVISOR_update_va_mapping(
19814 + (unsigned long)va, pte_wrprotect(*pte), 0);
19815 + if (rc) /* fallback? */
19816 + xen_l1_entry_update(pte, pte_wrprotect(*pte));
19817 + if ((unsigned long)va >= (unsigned long)high_memory) {
19818 + unsigned long pfn = pte_pfn(*pte);
19819 +#ifdef CONFIG_HIGHMEM
19820 + if (pfn >= highstart_pfn)
19821 + kmap_flush_unused(); /* flush stale writable kmaps */
19822 + else
19823 +#endif
19824 + make_lowmem_page_readonly(
19825 + phys_to_virt(pfn << PAGE_SHIFT), feature);
19826 + }
19827 +}
19828 +
19829 +void make_page_writable(void *va, unsigned int feature)
19830 +{
19831 + pte_t *pte;
19832 + int rc;
19833 +
19834 + if (xen_feature(feature))
19835 + return;
19836 +
19837 + pte = virt_to_ptep(va);
19838 + rc = HYPERVISOR_update_va_mapping(
19839 + (unsigned long)va, pte_mkwrite(*pte), 0);
19840 + if (rc) /* fallback? */
19841 + xen_l1_entry_update(pte, pte_mkwrite(*pte));
19842 + if ((unsigned long)va >= (unsigned long)high_memory) {
19843 + unsigned long pfn = pte_pfn(*pte);
19844 +#ifdef CONFIG_HIGHMEM
19845 + if (pfn < highstart_pfn)
19846 +#endif
19847 + make_lowmem_page_writable(
19848 + phys_to_virt(pfn << PAGE_SHIFT), feature);
19849 + }
19850 +}
19851 +
19852 +void make_pages_readonly(void *va, unsigned int nr, unsigned int feature)
19853 +{
19854 + if (xen_feature(feature))
19855 + return;
19856 +
19857 + while (nr-- != 0) {
19858 + make_page_readonly(va, feature);
19859 + va = (void *)((unsigned long)va + PAGE_SIZE);
19860 + }
19861 +}
19862 +
19863 +void make_pages_writable(void *va, unsigned int nr, unsigned int feature)
19864 +{
19865 + if (xen_feature(feature))
19866 + return;
19867 +
19868 + while (nr-- != 0) {
19869 + make_page_writable(va, feature);
19870 + va = (void *)((unsigned long)va + PAGE_SIZE);
19871 + }
19872 +}
19873 +
19874 +static inline void pgd_walk_set_prot(void *pt, pgprot_t flags)
19875 +{
19876 + struct page *page = virt_to_page(pt);
19877 + unsigned long pfn = page_to_pfn(page);
19878 +
19879 + if (PageHighMem(page))
19880 + return;
19881 + BUG_ON(HYPERVISOR_update_va_mapping(
19882 + (unsigned long)__va(pfn << PAGE_SHIFT),
19883 + pfn_pte(pfn, flags), 0));
19884 +}
19885 +
19886 +static void pgd_walk(pgd_t *pgd_base, pgprot_t flags)
19887 +{
19888 + pgd_t *pgd = pgd_base;
19889 + pud_t *pud;
19890 + pmd_t *pmd;
19891 + pte_t *pte;
19892 + int g, u, m;
19893 +
19894 + if (xen_feature(XENFEAT_auto_translated_physmap))
19895 + return;
19896 +
19897 + for (g = 0; g < USER_PTRS_PER_PGD; g++, pgd++) {
19898 + if (pgd_none(*pgd))
19899 + continue;
19900 + pud = pud_offset(pgd, 0);
19901 + if (PTRS_PER_PUD > 1) /* not folded */
19902 + pgd_walk_set_prot(pud,flags);
19903 + for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
19904 + if (pud_none(*pud))
19905 + continue;
19906 + pmd = pmd_offset(pud, 0);
19907 + if (PTRS_PER_PMD > 1) /* not folded */
19908 + pgd_walk_set_prot(pmd,flags);
19909 + for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
19910 + if (pmd_none(*pmd))
19911 + continue;
19912 + pte = pte_offset_kernel(pmd,0);
19913 + pgd_walk_set_prot(pte,flags);
19914 + }
19915 + }
19916 + }
19917 +
19918 + BUG_ON(HYPERVISOR_update_va_mapping(
19919 + (unsigned long)pgd_base,
19920 + pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags),
19921 + UVMF_TLB_FLUSH));
19922 +}
19923 +
19924 +static void __pgd_pin(pgd_t *pgd)
19925 +{
19926 + pgd_walk(pgd, PAGE_KERNEL_RO);
19927 + xen_pgd_pin(__pa(pgd));
19928 + set_bit(PG_pinned, &virt_to_page(pgd)->flags);
19929 +}
19930 +
19931 +static void __pgd_unpin(pgd_t *pgd)
19932 +{
19933 + xen_pgd_unpin(__pa(pgd));
19934 + pgd_walk(pgd, PAGE_KERNEL);
19935 + clear_bit(PG_pinned, &virt_to_page(pgd)->flags);
19936 +}
19937 +
19938 +static void pgd_test_and_unpin(pgd_t *pgd)
19939 +{
19940 + if (test_bit(PG_pinned, &virt_to_page(pgd)->flags))
19941 + __pgd_unpin(pgd);
19942 +}
19943 +
19944 +void mm_pin(struct mm_struct *mm)
19945 +{
19946 + if (xen_feature(XENFEAT_writable_page_tables))
19947 + return;
19948 + spin_lock(&mm->page_table_lock);
19949 + __pgd_pin(mm->pgd);
19950 + spin_unlock(&mm->page_table_lock);
19951 +}
19952 +
19953 +void mm_unpin(struct mm_struct *mm)
19954 +{
19955 + if (xen_feature(XENFEAT_writable_page_tables))
19956 + return;
19957 + spin_lock(&mm->page_table_lock);
19958 + __pgd_unpin(mm->pgd);
19959 + spin_unlock(&mm->page_table_lock);
19960 +}
19961 +
19962 +void mm_pin_all(void)
19963 +{
19964 + struct page *page;
19965 +
19966 + /* Only pgds on the pgd_list please: none hidden in the slab cache. */
19967 + kmem_cache_shrink(pgd_cache);
19968 +
19969 + if (xen_feature(XENFEAT_writable_page_tables))
19970 + return;
19971 +
19972 + for (page = pgd_list; page; page = (struct page *)page->index) {
19973 + if (!test_bit(PG_pinned, &page->flags))
19974 + __pgd_pin((pgd_t *)page_address(page));
19975 + }
19976 +}
19977 +
19978 +void _arch_dup_mmap(struct mm_struct *mm)
19979 +{
19980 + if (!test_bit(PG_pinned, &virt_to_page(mm->pgd)->flags))
19981 + mm_pin(mm);
19982 +}
19983 +
19984 +void _arch_exit_mmap(struct mm_struct *mm)
19985 +{
19986 + struct task_struct *tsk = current;
19987 +
19988 + task_lock(tsk);
19989 +
19990 + /*
19991 + * We aggressively remove defunct pgd from cr3. We execute unmap_vmas()
19992 + * *much* faster this way, as no tlb flushes means bigger wrpt batches.
19993 + */
19994 + if (tsk->active_mm == mm) {
19995 + tsk->active_mm = &init_mm;
19996 + atomic_inc(&init_mm.mm_count);
19997 +
19998 + switch_mm(mm, &init_mm, tsk);
19999 +
20000 + atomic_dec(&mm->mm_count);
20001 + BUG_ON(atomic_read(&mm->mm_count) == 0);
20002 + }
20003 +
20004 + task_unlock(tsk);
20005 +
20006 + if (test_bit(PG_pinned, &virt_to_page(mm->pgd)->flags) &&
20007 + (atomic_read(&mm->mm_count) == 1) &&
20008 + !mm->context.has_foreign_mappings)
20009 + mm_unpin(mm);
20010 +}
20011 diff -Nur linux-2.6.16.33-noxen/arch/i386/mm/pgtable.c linux-2.6.16.33/arch/i386/mm/pgtable.c
20012 --- linux-2.6.16.33-noxen/arch/i386/mm/pgtable.c 2006-11-22 18:06:31.000000000 +0000
20013 +++ linux-2.6.16.33/arch/i386/mm/pgtable.c 2007-01-08 15:00:45.000000000 +0000
20014 @@ -13,6 +13,7 @@
20015 #include <linux/slab.h>
20016 #include <linux/pagemap.h>
20017 #include <linux/spinlock.h>
20018 +#include <linux/module.h>
20019
20020 #include <asm/system.h>
20021 #include <asm/pgtable.h>
20022 @@ -138,6 +139,10 @@
20023 __flush_tlb_one(vaddr);
20024 }
20025
20026 +static int nr_fixmaps = 0;
20027 +unsigned long __FIXADDR_TOP = 0xfffff000;
20028 +EXPORT_SYMBOL(__FIXADDR_TOP);
20029 +
20030 void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t flags)
20031 {
20032 unsigned long address = __fix_to_virt(idx);
20033 @@ -147,6 +152,13 @@
20034 return;
20035 }
20036 set_pte_pfn(address, phys >> PAGE_SHIFT, flags);
20037 + nr_fixmaps++;
20038 +}
20039 +
20040 +void set_fixaddr_top(unsigned long top)
20041 +{
20042 + BUG_ON(nr_fixmaps > 0);
20043 + __FIXADDR_TOP = top - PAGE_SIZE;
20044 }
20045
20046 pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
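/*
 * Illustrative sketch (not part of the patch): fixmap slots are handed out
 * downwards from __FIXADDR_TOP, roughly as in include/asm-i386/fixmap.h:
 *   __fix_to_virt(x) = FIXADDR_TOP - (x << PAGE_SHIFT)
 * Once a slot has been established with __set_fixmap(), moving the top would
 * silently change the virtual address of every slot, which is why both
 * variants of set_fixaddr_top() above insist on nr_fixmaps == 0.  The native
 * default top (0xfffff000) is taken from the hunk above; the lowered Xen top
 * below is a hypothetical example value.
 */
#include <stdio.h>

#define PAGE_SHIFT 12UL

static unsigned long fix_to_virt(unsigned long top, unsigned long idx)
{
	return top - (idx << PAGE_SHIFT);
}

int main(void)
{
	unsigned long native_top = 0xfffff000UL;	/* default __FIXADDR_TOP   */
	unsigned long xen_top    = 0xfec00000UL;	/* hypothetical lowered top */

	/* The same fixmap index maps to different addresses once the top moves. */
	printf("idx 1: %#lx vs %#lx\n",
	       fix_to_virt(native_top, 1), fix_to_virt(xen_top, 1));
	return 0;
}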
20047 diff -Nur linux-2.6.16.33-noxen/arch/i386/oprofile/Makefile linux-2.6.16.33/arch/i386/oprofile/Makefile
20048 --- linux-2.6.16.33-noxen/arch/i386/oprofile/Makefile 2006-11-22 18:06:31.000000000 +0000
20049 +++ linux-2.6.16.33/arch/i386/oprofile/Makefile 2007-01-08 15:00:45.000000000 +0000
20050 @@ -6,7 +6,14 @@
20051 oprofilefs.o oprofile_stats.o \
20052 timer_int.o )
20053
20054 +ifdef CONFIG_XEN
20055 +XENOPROF_COMMON_OBJS = $(addprefix ../../../drivers/xen/xenoprof/, \
20056 + xenoprofile.o)
20057 +oprofile-y := $(DRIVER_OBJS) \
20058 + $(XENOPROF_COMMON_OBJS) xenoprof.o
20059 +else
20060 oprofile-y := $(DRIVER_OBJS) init.o backtrace.o
20061 oprofile-$(CONFIG_X86_LOCAL_APIC) += nmi_int.o op_model_athlon.o \
20062 op_model_ppro.o op_model_p4.o
20063 oprofile-$(CONFIG_X86_IO_APIC) += nmi_timer_int.o
20064 +endif
20065 diff -Nur linux-2.6.16.33-noxen/arch/i386/oprofile/xenoprof.c linux-2.6.16.33/arch/i386/oprofile/xenoprof.c
20066 --- linux-2.6.16.33-noxen/arch/i386/oprofile/xenoprof.c 1970-01-01 00:00:00.000000000 +0000
20067 +++ linux-2.6.16.33/arch/i386/oprofile/xenoprof.c 2007-01-08 15:00:45.000000000 +0000
20068 @@ -0,0 +1,179 @@
20069 +/**
20070 + * @file xenoprof.c
20071 + *
20072 + * @remark Copyright 2002 OProfile authors
20073 + * @remark Read the file COPYING
20074 + *
20075 + * @author John Levon <levon@movementarian.org>
20076 + *
20077 + * Modified by Aravind Menon and Jose Renato Santos for Xen
20078 + * These modifications are:
20079 + * Copyright (C) 2005 Hewlett-Packard Co.
20080 + *
20081 + * x86-specific part
20082 + * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp>
20083 + * VA Linux Systems Japan K.K.
20084 + */
20085 +
20086 +#include <linux/init.h>
20087 +#include <linux/oprofile.h>
20088 +#include <linux/sched.h>
20089 +#include <asm/pgtable.h>
20090 +
20091 +#include <xen/driver_util.h>
20092 +#include <xen/interface/xen.h>
20093 +#include <xen/interface/xenoprof.h>
20094 +#include <xen/xenoprof.h>
20095 +#include "op_counter.h"
20096 +
20097 +static unsigned int num_events = 0;
20098 +
20099 +void __init xenoprof_arch_init_counter(struct xenoprof_init *init)
20100 +{
20101 + num_events = init->num_events;
20102 + /* just in case - make sure we do not overflow event list
20103 + (i.e. counter_config list) */
20104 + if (num_events > OP_MAX_COUNTER) {
20105 + num_events = OP_MAX_COUNTER;
20106 + init->num_events = num_events;
20107 + }
20108 +}
20109 +
20110 +void xenoprof_arch_counter(void)
20111 +{
20112 + int i;
20113 + struct xenoprof_counter counter;
20114 +
20115 + for (i=0; i<num_events; i++) {
20116 + counter.ind = i;
20117 + counter.count = (uint64_t)counter_config[i].count;
20118 + counter.enabled = (uint32_t)counter_config[i].enabled;
20119 + counter.event = (uint32_t)counter_config[i].event;
20120 + counter.kernel = (uint32_t)counter_config[i].kernel;
20121 + counter.user = (uint32_t)counter_config[i].user;
20122 + counter.unit_mask = (uint64_t)counter_config[i].unit_mask;
20123 + HYPERVISOR_xenoprof_op(XENOPROF_counter,
20124 + &counter);
20125 + }
20126 +}
20127 +
20128 +void xenoprof_arch_start(void)
20129 +{
20130 + /* nothing */
20131 +}
20132 +
20133 +void xenoprof_arch_stop(void)
20134 +{
20135 + /* nothing */
20136 +}
20137 +
20138 +void xenoprof_arch_unmap_shared_buffer(struct xenoprof_shared_buffer * sbuf)
20139 +{
20140 + if (sbuf->buffer) {
20141 + vunmap(sbuf->buffer);
20142 + sbuf->buffer = NULL;
20143 + }
20144 +}
20145 +
20146 +int xenoprof_arch_map_shared_buffer(struct xenoprof_get_buffer * get_buffer,
20147 + struct xenoprof_shared_buffer * sbuf)
20148 +{
20149 + int npages, ret;
20150 + struct vm_struct *area;
20151 +
20152 + sbuf->buffer = NULL;
20153 + if ( (ret = HYPERVISOR_xenoprof_op(XENOPROF_get_buffer, get_buffer)) )
20154 + return ret;
20155 +
20156 + npages = (get_buffer->bufsize * get_buffer->nbuf - 1) / PAGE_SIZE + 1;
20157 +
20158 + area = alloc_vm_area(npages * PAGE_SIZE);
20159 + if (area == NULL)
20160 + return -ENOMEM;
20161 +
20162 + if ( (ret = direct_kernel_remap_pfn_range(
20163 + (unsigned long)area->addr,
20164 + get_buffer->buf_gmaddr >> PAGE_SHIFT,
20165 + npages * PAGE_SIZE, __pgprot(_KERNPG_TABLE),
20166 + DOMID_SELF)) ) {
20167 + vunmap(area->addr);
20168 + return ret;
20169 + }
20170 +
20171 + sbuf->buffer = area->addr;
20172 + return ret;
20173 +}
20174 +
20175 +int xenoprof_arch_set_passive(struct xenoprof_passive * pdomain,
20176 + struct xenoprof_shared_buffer * sbuf)
20177 +{
20178 + int ret;
20179 + int npages;
20180 + struct vm_struct *area;
20181 + pgprot_t prot = __pgprot(_KERNPG_TABLE);
20182 +
20183 + sbuf->buffer = NULL;
20184 + ret = HYPERVISOR_xenoprof_op(XENOPROF_set_passive, pdomain);
20185 + if (ret)
20186 + goto out;
20187 +
20188 + npages = (pdomain->bufsize * pdomain->nbuf - 1) / PAGE_SIZE + 1;
20189 +
20190 + area = alloc_vm_area(npages * PAGE_SIZE);
20191 + if (area == NULL) {
20192 + ret = -ENOMEM;
20193 + goto out;
20194 + }
20195 +
20196 + ret = direct_kernel_remap_pfn_range(
20197 + (unsigned long)area->addr,
20198 + pdomain->buf_gmaddr >> PAGE_SHIFT,
20199 + npages * PAGE_SIZE, prot, DOMID_SELF);
20200 + if (ret) {
20201 + vunmap(area->addr);
20202 + goto out;
20203 + }
20204 + sbuf->buffer = area->addr;
20205 +
20206 +out:
20207 + return ret;
20208 +}
20209 +
20210 +struct op_counter_config counter_config[OP_MAX_COUNTER];
20211 +
20212 +int xenoprof_create_files(struct super_block * sb, struct dentry * root)
20213 +{
20214 + unsigned int i;
20215 +
20216 + for (i = 0; i < num_events; ++i) {
20217 + struct dentry * dir;
20218 + char buf[2];
20219 +
20220 + snprintf(buf, 2, "%d", i);
20221 + dir = oprofilefs_mkdir(sb, root, buf);
20222 + oprofilefs_create_ulong(sb, dir, "enabled",
20223 + &counter_config[i].enabled);
20224 + oprofilefs_create_ulong(sb, dir, "event",
20225 + &counter_config[i].event);
20226 + oprofilefs_create_ulong(sb, dir, "count",
20227 + &counter_config[i].count);
20228 + oprofilefs_create_ulong(sb, dir, "unit_mask",
20229 + &counter_config[i].unit_mask);
20230 + oprofilefs_create_ulong(sb, dir, "kernel",
20231 + &counter_config[i].kernel);
20232 + oprofilefs_create_ulong(sb, dir, "user",
20233 + &counter_config[i].user);
20234 + }
20235 +
20236 + return 0;
20237 +}
20238 +
20239 +int __init oprofile_arch_init(struct oprofile_operations * ops)
20240 +{
20241 + return xenoprofile_init(ops);
20242 +}
20243 +
20244 +void oprofile_arch_exit(void)
20245 +{
20246 + xenoprofile_exit();
20247 +}
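/*
 * Illustrative sketch (not part of the patch): both shared-buffer mapping
 * helpers above size the vm_area with (bytes - 1) / PAGE_SIZE + 1, a
 * round-up division so a buffer that is not a multiple of PAGE_SIZE still
 * gets a final, partially used page.  Values below are examples only.
 */
#include <assert.h>

#define PAGE_SIZE 4096UL

static unsigned long pages_needed(unsigned long bytes)
{
	return (bytes - 1) / PAGE_SIZE + 1;	/* assumes bytes > 0 */
}

int main(void)
{
	assert(pages_needed(1)              == 1);	/* tiny buffer: one page   */
	assert(pages_needed(PAGE_SIZE)      == 1);	/* exact fit: still one    */
	assert(pages_needed(PAGE_SIZE + 1)  == 2);	/* spill-over: extra page  */
	assert(pages_needed(10 * PAGE_SIZE) == 10);
	return 0;
}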
20248 diff -Nur linux-2.6.16.33-noxen/arch/i386/pci/Makefile linux-2.6.16.33/arch/i386/pci/Makefile
20249 --- linux-2.6.16.33-noxen/arch/i386/pci/Makefile 2006-11-22 18:06:31.000000000 +0000
20250 +++ linux-2.6.16.33/arch/i386/pci/Makefile 2007-01-08 15:00:45.000000000 +0000
20251 @@ -4,6 +4,10 @@
20252 obj-$(CONFIG_PCI_MMCONFIG) += mmconfig.o direct.o
20253 obj-$(CONFIG_PCI_DIRECT) += direct.o
20254
20255 +# pcifront should be after pcbios.o, mmconfig.o, and direct.o as it should only
20256 +# take over if direct access to the PCI bus is unavailable
20257 +obj-$(CONFIG_XEN_PCIDEV_FRONTEND) += pcifront.o
20258 +
20259 pci-y := fixup.o
20260 pci-$(CONFIG_ACPI) += acpi.o
20261 pci-y += legacy.o irq.o
20262 @@ -12,3 +16,8 @@
20263 pci-$(CONFIG_X86_NUMAQ) := numa.o irq.o
20264
20265 obj-y += $(pci-y) common.o
20266 +
20267 +ifdef CONFIG_XEN
20268 +include $(srctree)/scripts/Makefile.xen
20269 +obj-y := $(call cherrypickxen, $(obj-y))
20270 +endif
20271 diff -Nur linux-2.6.16.33-noxen/arch/i386/pci/irq-xen.c linux-2.6.16.33/arch/i386/pci/irq-xen.c
20272 --- linux-2.6.16.33-noxen/arch/i386/pci/irq-xen.c 1970-01-01 00:00:00.000000000 +0000
20273 +++ linux-2.6.16.33/arch/i386/pci/irq-xen.c 2007-01-08 15:00:45.000000000 +0000
20274 @@ -0,0 +1,1204 @@
20275 +/*
20276 + * Low-Level PCI Support for PC -- Routing of Interrupts
20277 + *
20278 + * (c) 1999--2000 Martin Mares <mj@ucw.cz>
20279 + */
20280 +
20281 +#include <linux/config.h>
20282 +#include <linux/types.h>
20283 +#include <linux/kernel.h>
20284 +#include <linux/pci.h>
20285 +#include <linux/init.h>
20286 +#include <linux/slab.h>
20287 +#include <linux/interrupt.h>
20288 +#include <linux/dmi.h>
20289 +#include <asm/io.h>
20290 +#include <asm/smp.h>
20291 +#include <asm/io_apic.h>
20292 +#include <linux/irq.h>
20293 +#include <linux/acpi.h>
20294 +
20295 +#include "pci.h"
20296 +
20297 +#define PIRQ_SIGNATURE (('$' << 0) + ('P' << 8) + ('I' << 16) + ('R' << 24))
20298 +#define PIRQ_VERSION 0x0100
20299 +
20300 +static int broken_hp_bios_irq9;
20301 +static int acer_tm360_irqrouting;
20302 +
20303 +static struct irq_routing_table *pirq_table;
20304 +
20305 +static int pirq_enable_irq(struct pci_dev *dev);
20306 +
20307 +/*
20308 + * Never use: 0, 1, 2 (timer, keyboard, and cascade)
20309 + * Avoid using: 13, 14 and 15 (FP error and IDE).
20310 + * Penalize: 3, 4, 6, 7, 12 (known ISA uses: serial, floppy, parallel and mouse)
20311 + */
20312 +unsigned int pcibios_irq_mask = 0xfff8;
20313 +
20314 +static int pirq_penalty[16] = {
20315 + 1000000, 1000000, 1000000, 1000, 1000, 0, 1000, 1000,
20316 + 0, 0, 0, 0, 1000, 100000, 100000, 100000
20317 +};
20318 +
20319 +struct irq_router {
20320 + char *name;
20321 + u16 vendor, device;
20322 + int (*get)(struct pci_dev *router, struct pci_dev *dev, int pirq);
20323 + int (*set)(struct pci_dev *router, struct pci_dev *dev, int pirq, int new);
20324 +};
20325 +
20326 +struct irq_router_handler {
20327 + u16 vendor;
20328 + int (*probe)(struct irq_router *r, struct pci_dev *router, u16 device);
20329 +};
20330 +
20331 +int (*pcibios_enable_irq)(struct pci_dev *dev) = NULL;
20332 +void (*pcibios_disable_irq)(struct pci_dev *dev) = NULL;
20333 +
20334 +/*
20335 + * Check passed address for the PCI IRQ Routing Table signature
20336 + * and perform checksum verification.
20337 + */
20338 +
20339 +static inline struct irq_routing_table * pirq_check_routing_table(u8 *addr)
20340 +{
20341 + struct irq_routing_table *rt;
20342 + int i;
20343 + u8 sum;
20344 +
20345 + rt = (struct irq_routing_table *) addr;
20346 + if (rt->signature != PIRQ_SIGNATURE ||
20347 + rt->version != PIRQ_VERSION ||
20348 + rt->size % 16 ||
20349 + rt->size < sizeof(struct irq_routing_table))
20350 + return NULL;
20351 + sum = 0;
20352 + for (i=0; i < rt->size; i++)
20353 + sum += addr[i];
20354 + if (!sum) {
20355 + DBG(KERN_DEBUG "PCI: Interrupt Routing Table found at 0x%p\n", rt);
20356 + return rt;
20357 + }
20358 + return NULL;
20359 +}
20360 +
20361 +
20362 +
20363 +/*
20364 + * Search 0xf0000 -- 0xfffff for the PCI IRQ Routing Table.
20365 + */
20366 +
20367 +static struct irq_routing_table * __init pirq_find_routing_table(void)
20368 +{
20369 + u8 *addr;
20370 + struct irq_routing_table *rt;
20371 +
20372 +#ifdef CONFIG_XEN
20373 + if (!is_initial_xendomain())
20374 + return NULL;
20375 +#endif
20376 + if (pirq_table_addr) {
20377 + rt = pirq_check_routing_table((u8 *) isa_bus_to_virt(pirq_table_addr));
20378 + if (rt)
20379 + return rt;
20380 + printk(KERN_WARNING "PCI: PIRQ table NOT found at pirqaddr\n");
20381 + }
20382 + for(addr = (u8 *) isa_bus_to_virt(0xf0000); addr < (u8 *) isa_bus_to_virt(0x100000); addr += 16) {
20383 + rt = pirq_check_routing_table(addr);
20384 + if (rt)
20385 + return rt;
20386 + }
20387 +
20388 + return NULL;
20389 +}
20390 +
20391 +/*
20392 + * If we have an IRQ routing table, use it to search for peer host
20393 + * bridges. It's a gross hack, but since there is no other known
20394 + * way to get a list of buses, we have to go this way.
20395 + */
20396 +
20397 +static void __init pirq_peer_trick(void)
20398 +{
20399 + struct irq_routing_table *rt = pirq_table;
20400 + u8 busmap[256];
20401 + int i;
20402 + struct irq_info *e;
20403 +
20404 + memset(busmap, 0, sizeof(busmap));
20405 + for(i=0; i < (rt->size - sizeof(struct irq_routing_table)) / sizeof(struct irq_info); i++) {
20406 + e = &rt->slots[i];
20407 +#ifdef DEBUG
20408 + {
20409 + int j;
20410 + DBG(KERN_DEBUG "%02x:%02x slot=%02x", e->bus, e->devfn/8, e->slot);
20411 + for(j=0; j<4; j++)
20412 + DBG(" %d:%02x/%04x", j, e->irq[j].link, e->irq[j].bitmap);
20413 + DBG("\n");
20414 + }
20415 +#endif
20416 + busmap[e->bus] = 1;
20417 + }
20418 + for(i = 1; i < 256; i++) {
20419 + if (!busmap[i] || pci_find_bus(0, i))
20420 + continue;
20421 + if (pci_scan_bus(i, &pci_root_ops, NULL))
20422 + printk(KERN_INFO "PCI: Discovered primary peer bus %02x [IRQ]\n", i);
20423 + }
20424 + pcibios_last_bus = -1;
20425 +}
20426 +
20427 +/*
20428 + * Code for querying and setting of IRQ routes on various interrupt routers.
20429 + */
20430 +
20431 +void eisa_set_level_irq(unsigned int irq)
20432 +{
20433 + unsigned char mask = 1 << (irq & 7);
20434 + unsigned int port = 0x4d0 + (irq >> 3);
20435 + unsigned char val;
20436 + static u16 eisa_irq_mask;
20437 +
20438 + if (irq >= 16 || (1 << irq) & eisa_irq_mask)
20439 + return;
20440 +
20441 + eisa_irq_mask |= (1 << irq);
20442 + printk(KERN_DEBUG "PCI: setting IRQ %u as level-triggered\n", irq);
20443 + val = inb(port);
20444 + if (!(val & mask)) {
20445 + DBG(KERN_DEBUG " -> edge");
20446 + outb(val | mask, port);
20447 + }
20448 +}
20449 +
20450 +/*
20451 + * Common IRQ routing practice: nybbles in config space,
20452 + * offset by some magic constant.
20453 + */
20454 +static unsigned int read_config_nybble(struct pci_dev *router, unsigned offset, unsigned nr)
20455 +{
20456 + u8 x;
20457 + unsigned reg = offset + (nr >> 1);
20458 +
20459 + pci_read_config_byte(router, reg, &x);
20460 + return (nr & 1) ? (x >> 4) : (x & 0xf);
20461 +}
20462 +
20463 +static void write_config_nybble(struct pci_dev *router, unsigned offset, unsigned nr, unsigned int val)
20464 +{
20465 + u8 x;
20466 + unsigned reg = offset + (nr >> 1);
20467 +
20468 + pci_read_config_byte(router, reg, &x);
20469 + x = (nr & 1) ? ((x & 0x0f) | (val << 4)) : ((x & 0xf0) | val);
20470 + pci_write_config_byte(router, reg, x);
20471 +}
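/*
 * Illustrative sketch (not part of the patch): read_config_nybble() and
 * write_config_nybble() above pack two 4-bit PIRQ route values into each
 * config byte: entry nr lives at byte (offset + nr/2), in the high nibble
 * when nr is odd and the low nibble when nr is even.  The helpers below redo
 * the same bit twiddling on a plain in-memory byte array instead of PCI
 * config space, so the packing can be checked in isolation.
 */
#include <assert.h>

typedef unsigned char u8;

static unsigned int nybble_get(const u8 *space, unsigned offset, unsigned nr)
{
	u8 x = space[offset + (nr >> 1)];
	return (nr & 1) ? (x >> 4) : (x & 0xf);
}

static void nybble_set(u8 *space, unsigned offset, unsigned nr, unsigned val)
{
	u8 *p = &space[offset + (nr >> 1)];
	*p = (nr & 1) ? ((*p & 0x0f) | (val << 4)) : ((*p & 0xf0) | val);
}

int main(void)
{
	u8 cfg[256] = { 0 };

	nybble_set(cfg, 0x48, 0, 0x3);		/* entry 0 -> low nibble   */
	nybble_set(cfg, 0x48, 1, 0xa);		/* entry 1 -> high nibble  */
	assert(cfg[0x48] == 0xa3);		/* both share one byte     */
	assert(nybble_get(cfg, 0x48, 0) == 0x3);
	assert(nybble_get(cfg, 0x48, 1) == 0xa);
	return 0;
}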
20472 +
20473 +/*
20474 + * ALI pirq entries are damn ugly, and completely undocumented.
20475 + * This has been figured out from pirq tables, and it's not a pretty
20476 + * picture.
20477 + */
20478 +static int pirq_ali_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
20479 +{
20480 + static unsigned char irqmap[16] = { 0, 9, 3, 10, 4, 5, 7, 6, 1, 11, 0, 12, 0, 14, 0, 15 };
20481 +
20482 + return irqmap[read_config_nybble(router, 0x48, pirq-1)];
20483 +}
20484 +
20485 +static int pirq_ali_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
20486 +{
20487 + static unsigned char irqmap[16] = { 0, 8, 0, 2, 4, 5, 7, 6, 0, 1, 3, 9, 11, 0, 13, 15 };
20488 + unsigned int val = irqmap[irq];
20489 +
20490 + if (val) {
20491 + write_config_nybble(router, 0x48, pirq-1, val);
20492 + return 1;
20493 + }
20494 + return 0;
20495 +}
20496 +
20497 +/*
20498 + * The Intel PIIX4 pirq rules are fairly simple: "pirq" is
20499 + * just a pointer to the config space.
20500 + */
20501 +static int pirq_piix_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
20502 +{
20503 + u8 x;
20504 +
20505 + pci_read_config_byte(router, pirq, &x);
20506 + return (x < 16) ? x : 0;
20507 +}
20508 +
20509 +static int pirq_piix_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
20510 +{
20511 + pci_write_config_byte(router, pirq, irq);
20512 + return 1;
20513 +}
20514 +
20515 +/*
20516 + * The VIA pirq rules are nibble-based, like ALI,
20517 + * but without the ugly irq number munging.
20518 + * However, PIRQD is in the upper instead of lower 4 bits.
20519 + */
20520 +static int pirq_via_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
20521 +{
20522 + return read_config_nybble(router, 0x55, pirq == 4 ? 5 : pirq);
20523 +}
20524 +
20525 +static int pirq_via_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
20526 +{
20527 + write_config_nybble(router, 0x55, pirq == 4 ? 5 : pirq, irq);
20528 + return 1;
20529 +}
20530 +
20531 +/*
20532 + * The VIA pirq rules are nibble-based, like ALI,
20533 + * but without the ugly irq number munging.
20534 + * However, for the 82C586 the nibble map is different.
20535 + */
20536 +static int pirq_via586_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
20537 +{
20538 + static unsigned int pirqmap[4] = { 3, 2, 5, 1 };
20539 + return read_config_nybble(router, 0x55, pirqmap[pirq-1]);
20540 +}
20541 +
20542 +static int pirq_via586_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
20543 +{
20544 + static unsigned int pirqmap[4] = { 3, 2, 5, 1 };
20545 + write_config_nybble(router, 0x55, pirqmap[pirq-1], irq);
20546 + return 1;
20547 +}
20548 +
20549 +/*
20550 + * ITE 8330G pirq rules are nibble-based
20551 + * FIXME: pirqmap may be { 1, 0, 3, 2 },
20552 + * 2+3 are both mapped to irq 9 on my system
20553 + */
20554 +static int pirq_ite_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
20555 +{
20556 + static unsigned char pirqmap[4] = { 1, 0, 2, 3 };
20557 + return read_config_nybble(router,0x43, pirqmap[pirq-1]);
20558 +}
20559 +
20560 +static int pirq_ite_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
20561 +{
20562 + static unsigned char pirqmap[4] = { 1, 0, 2, 3 };
20563 + write_config_nybble(router, 0x43, pirqmap[pirq-1], irq);
20564 + return 1;
20565 +}
20566 +
20567 +/*
20568 + * OPTI: high four bits are nibble pointer..
20569 + * I wonder what the low bits do?
20570 + */
20571 +static int pirq_opti_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
20572 +{
20573 + return read_config_nybble(router, 0xb8, pirq >> 4);
20574 +}
20575 +
20576 +static int pirq_opti_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
20577 +{
20578 + write_config_nybble(router, 0xb8, pirq >> 4, irq);
20579 + return 1;
20580 +}
20581 +
20582 +/*
20583 + * Cyrix: nibble offset 0x5C
20584 + * 0x5C bits 7:4 is INTB bits 3:0 is INTA
20585 + * 0x5D bits 7:4 is INTD bits 3:0 is INTC
20586 + */
20587 +static int pirq_cyrix_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
20588 +{
20589 + return read_config_nybble(router, 0x5C, (pirq-1)^1);
20590 +}
20591 +
20592 +static int pirq_cyrix_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
20593 +{
20594 + write_config_nybble(router, 0x5C, (pirq-1)^1, irq);
20595 + return 1;
20596 +}
20597 +
20598 +/*
20599 + * PIRQ routing for SiS 85C503 router used in several SiS chipsets.
20600 + * We have to deal with the following issues here:
20601 + * - vendors have different ideas about the meaning of link values
20602 + * - some onboard devices (integrated in the chipset) have special
20603 + * links and are thus routed differently (i.e. not via PCI INTA-INTD)
20604 + * - different revision of the router have a different layout for
20605 + * the routing registers, particularly for the onchip devices
20606 + *
20607 + * For all routing registers the common thing is we have one byte
20608 + * per routeable link which is defined as:
20609 + * bit 7 IRQ mapping enabled (0) or disabled (1)
20610 + * bits [6:4] reserved (sometimes used for onchip devices)
20611 + * bits [3:0] IRQ to map to
20612 + * allowed: 3-7, 9-12, 14-15
20613 + * reserved: 0, 1, 2, 8, 13
20614 + *
20615 + * The config-space registers located at 0x41/0x42/0x43/0x44 are
20616 + * always used to route the normal PCI INT A/B/C/D respectively.
20617 + * Apparently there are systems implementing PCI routing table using
20618 + * link values 0x01-0x04 and others using 0x41-0x44 for PCI INTA..D.
20619 + * We try our best to handle both link mappings.
20620 + *
20621 + * Currently (2003-05-21) it appears most SiS chipsets follow the
20622 + * definition of routing registers from the SiS-5595 southbridge.
20623 + * According to the SiS 5595 datasheets the revision id's of the
20624 + * router (ISA-bridge) should be 0x01 or 0xb0.
20625 + *
20626 + * Furthermore we've also seen lspci dumps with revision 0x00 and 0xb1.
20627 + * Looks like these are used in a number of SiS 5xx/6xx/7xx chipsets.
20628 + * They seem to work with the current routing code. However there is
20629 + * some concern because of the two USB-OHCI HCs (original SiS 5595
20630 + * had only one). YMMV.
20631 + *
20632 + * Onchip routing for router rev-id 0x01/0xb0 and probably 0x00/0xb1:
20633 + *
20634 + * 0x61: IDEIRQ:
20635 + * bits [6:5] must be written 01
20636 + * bit 4 channel-select primary (0), secondary (1)
20637 + *
20638 + * 0x62: USBIRQ:
20639 + * bit 6 OHCI function disabled (0), enabled (1)
20640 + *
20641 + * 0x6a: ACPI/SCI IRQ: bits 4-6 reserved
20642 + *
20643 + * 0x7e: Data Acq. Module IRQ - bits 4-6 reserved
20644 + *
20645 + * We support USBIRQ (in addition to INTA-INTD) and keep the
20646 + * IDE, ACPI and DAQ routing untouched as set by the BIOS.
20647 + *
20648 + * Currently the only reported exception is the new SiS 65x chipset
20649 + * which includes the SiS 69x southbridge. Here we have the 85C503
20650 + * router revision 0x04 and there are changes in the register layout
20651 + * mostly related to the different USB HCs with USB 2.0 support.
20652 + *
20653 + * Onchip routing for router rev-id 0x04 (try-and-error observation)
20654 + *
20655 + * 0x60/0x61/0x62/0x63: 1xEHCI and 3xOHCI (companion) USB-HCs
20656 + * bit 6-4 are probably unused, not like 5595
20657 + */
20658 +
20659 +#define PIRQ_SIS_IRQ_MASK 0x0f
20660 +#define PIRQ_SIS_IRQ_DISABLE 0x80
20661 +#define PIRQ_SIS_USB_ENABLE 0x40
20662 +
20663 +static int pirq_sis_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
20664 +{
20665 + u8 x;
20666 + int reg;
20667 +
20668 + reg = pirq;
20669 + if (reg >= 0x01 && reg <= 0x04)
20670 + reg += 0x40;
20671 + pci_read_config_byte(router, reg, &x);
20672 + return (x & PIRQ_SIS_IRQ_DISABLE) ? 0 : (x & PIRQ_SIS_IRQ_MASK);
20673 +}
20674 +
20675 +static int pirq_sis_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
20676 +{
20677 + u8 x;
20678 + int reg;
20679 +
20680 + reg = pirq;
20681 + if (reg >= 0x01 && reg <= 0x04)
20682 + reg += 0x40;
20683 + pci_read_config_byte(router, reg, &x);
20684 + x &= ~(PIRQ_SIS_IRQ_MASK | PIRQ_SIS_IRQ_DISABLE);
20685 + x |= irq ? irq: PIRQ_SIS_IRQ_DISABLE;
20686 + pci_write_config_byte(router, reg, x);
20687 + return 1;
20688 +}
20689 +
20690 +
20691 +/*
20692 + * VLSI: nibble offset 0x74 - educated guess due to routing table and
20693 + * config space of VLSI 82C534 PCI-bridge/router (1004:0102)
20694 + * Tested on HP OmniBook 800 covering PIRQ 1, 2, 4, 8 for onboard
20695 + * devices, PIRQ 3 for non-pci(!) soundchip and (untested) PIRQ 6
20696 + * for the busbridge to the docking station.
20697 + */
20698 +
20699 +static int pirq_vlsi_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
20700 +{
20701 + if (pirq > 8) {
20702 + printk(KERN_INFO "VLSI router pirq escape (%d)\n", pirq);
20703 + return 0;
20704 + }
20705 + return read_config_nybble(router, 0x74, pirq-1);
20706 +}
20707 +
20708 +static int pirq_vlsi_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
20709 +{
20710 + if (pirq > 8) {
20711 + printk(KERN_INFO "VLSI router pirq escape (%d)\n", pirq);
20712 + return 0;
20713 + }
20714 + write_config_nybble(router, 0x74, pirq-1, irq);
20715 + return 1;
20716 +}
20717 +
20718 +/*
20719 + * ServerWorks: PCI interrupts mapped to system IRQ lines through Index
20720 + * and Redirect I/O registers (0x0c00 and 0x0c01). The Index register
20721 + * format is (PCIIRQ## | 0x10), e.g.: PCIIRQ10=0x1a. The Redirect
20722 + * register is a straight binary coding of desired PIC IRQ (low nibble).
20723 + *
20724 + * The 'link' value in the PIRQ table is already in the correct format
20725 + * for the Index register. There are some special index values:
20726 + * 0x00 for ACPI (SCI), 0x01 for USB, 0x02 for IDE0, 0x04 for IDE1,
20727 + * and 0x03 for SMBus.
20728 + */
20729 +static int pirq_serverworks_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
20730 +{
20731 + outb_p(pirq, 0xc00);
20732 + return inb(0xc01) & 0xf;
20733 +}
20734 +
20735 +static int pirq_serverworks_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
20736 +{
20737 + outb_p(pirq, 0xc00);
20738 + outb_p(irq, 0xc01);
20739 + return 1;
20740 +}
20741 +
20742 +/* Support for AMD756 PCI IRQ Routing
20743 + * Jhon H. Caicedo <jhcaiced@osso.org.co>
20744 + * Jun/21/2001 0.2.0 Release, fixed to use "nybble" functions... (jhcaiced)
20745 + * Jun/19/2001 Alpha Release 0.1.0 (jhcaiced)
20746 + * The AMD756 pirq rules are nibble-based
20747 + * offset 0x56 0-3 PIRQA 4-7 PIRQB
20748 + * offset 0x57 0-3 PIRQC 4-7 PIRQD
20749 + */
20750 +static int pirq_amd756_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
20751 +{
20752 + u8 irq;
20753 + irq = 0;
20754 + if (pirq <= 4)
20755 + {
20756 + irq = read_config_nybble(router, 0x56, pirq - 1);
20757 + }
20758 + printk(KERN_INFO "AMD756: dev %04x:%04x, router pirq : %d get irq : %2d\n",
20759 + dev->vendor, dev->device, pirq, irq);
20760 + return irq;
20761 +}
20762 +
20763 +static int pirq_amd756_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
20764 +{
20765 + printk(KERN_INFO "AMD756: dev %04x:%04x, router pirq : %d SET irq : %2d\n",
20766 + dev->vendor, dev->device, pirq, irq);
20767 + if (pirq <= 4)
20768 + {
20769 + write_config_nybble(router, 0x56, pirq - 1, irq);
20770 + }
20771 + return 1;
20772 +}
20773 +
20774 +#ifdef CONFIG_PCI_BIOS
20775 +
20776 +static int pirq_bios_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
20777 +{
20778 + struct pci_dev *bridge;
20779 + int pin = pci_get_interrupt_pin(dev, &bridge);
20780 + return pcibios_set_irq_routing(bridge, pin, irq);
20781 +}
20782 +
20783 +#endif
20784 +
20785 +static __init int intel_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
20786 +{
20787 + static struct pci_device_id pirq_440gx[] = {
20788 + { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82443GX_0) },
20789 + { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82443GX_2) },
20790 + { },
20791 + };
20792 +
20793 + /* 440GX has a proprietary PIRQ router -- don't use it */
20794 + if (pci_dev_present(pirq_440gx))
20795 + return 0;
20796 +
20797 + switch(device)
20798 + {
20799 + case PCI_DEVICE_ID_INTEL_82371FB_0:
20800 + case PCI_DEVICE_ID_INTEL_82371SB_0:
20801 + case PCI_DEVICE_ID_INTEL_82371AB_0:
20802 + case PCI_DEVICE_ID_INTEL_82371MX:
20803 + case PCI_DEVICE_ID_INTEL_82443MX_0:
20804 + case PCI_DEVICE_ID_INTEL_82801AA_0:
20805 + case PCI_DEVICE_ID_INTEL_82801AB_0:
20806 + case PCI_DEVICE_ID_INTEL_82801BA_0:
20807 + case PCI_DEVICE_ID_INTEL_82801BA_10:
20808 + case PCI_DEVICE_ID_INTEL_82801CA_0:
20809 + case PCI_DEVICE_ID_INTEL_82801CA_12:
20810 + case PCI_DEVICE_ID_INTEL_82801DB_0:
20811 + case PCI_DEVICE_ID_INTEL_82801E_0:
20812 + case PCI_DEVICE_ID_INTEL_82801EB_0:
20813 + case PCI_DEVICE_ID_INTEL_ESB_1:
20814 + case PCI_DEVICE_ID_INTEL_ICH6_0:
20815 + case PCI_DEVICE_ID_INTEL_ICH6_1:
20816 + case PCI_DEVICE_ID_INTEL_ICH7_0:
20817 + case PCI_DEVICE_ID_INTEL_ICH7_1:
20818 + case PCI_DEVICE_ID_INTEL_ICH7_30:
20819 + case PCI_DEVICE_ID_INTEL_ICH7_31:
20820 + case PCI_DEVICE_ID_INTEL_ESB2_0:
20821 + case PCI_DEVICE_ID_INTEL_ICH8_0:
20822 + case PCI_DEVICE_ID_INTEL_ICH8_1:
20823 + case PCI_DEVICE_ID_INTEL_ICH8_2:
20824 + case PCI_DEVICE_ID_INTEL_ICH8_3:
20825 + case PCI_DEVICE_ID_INTEL_ICH8_4:
20826 + r->name = "PIIX/ICH";
20827 + r->get = pirq_piix_get;
20828 + r->set = pirq_piix_set;
20829 + return 1;
20830 + }
20831 + return 0;
20832 +}
20833 +
20834 +static __init int via_router_probe(struct irq_router *r,
20835 + struct pci_dev *router, u16 device)
20836 +{
20837 + /* FIXME: We should move some of the quirk fixup stuff here */
20838 +
20839 + /*
20840 + * work arounds for some buggy BIOSes
20841 + */
20842 + if (device == PCI_DEVICE_ID_VIA_82C586_0) {
20843 + switch(router->device) {
20844 + case PCI_DEVICE_ID_VIA_82C686:
20845 + /*
20846 + * Asus k7m bios wrongly reports 82C686A
20847 + * as 586-compatible
20848 + */
20849 + device = PCI_DEVICE_ID_VIA_82C686;
20850 + break;
20851 + case PCI_DEVICE_ID_VIA_8235:
20852 + /**
20853 + * Asus a7v-x bios wrongly reports 8235
20854 + * as 586-compatible
20855 + */
20856 + device = PCI_DEVICE_ID_VIA_8235;
20857 + break;
20858 + }
20859 + }
20860 +
20861 + switch(device) {
20862 + case PCI_DEVICE_ID_VIA_82C586_0:
20863 + r->name = "VIA";
20864 + r->get = pirq_via586_get;
20865 + r->set = pirq_via586_set;
20866 + return 1;
20867 + case PCI_DEVICE_ID_VIA_82C596:
20868 + case PCI_DEVICE_ID_VIA_82C686:
20869 + case PCI_DEVICE_ID_VIA_8231:
20870 + case PCI_DEVICE_ID_VIA_8235:
20871 + /* FIXME: add new ones for 8233/5 */
20872 + r->name = "VIA";
20873 + r->get = pirq_via_get;
20874 + r->set = pirq_via_set;
20875 + return 1;
20876 + }
20877 + return 0;
20878 +}
20879 +
20880 +static __init int vlsi_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
20881 +{
20882 + switch(device)
20883 + {
20884 + case PCI_DEVICE_ID_VLSI_82C534:
20885 + r->name = "VLSI 82C534";
20886 + r->get = pirq_vlsi_get;
20887 + r->set = pirq_vlsi_set;
20888 + return 1;
20889 + }
20890 + return 0;
20891 +}
20892 +
20893 +
20894 +static __init int serverworks_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
20895 +{
20896 + switch(device)
20897 + {
20898 + case PCI_DEVICE_ID_SERVERWORKS_OSB4:
20899 + case PCI_DEVICE_ID_SERVERWORKS_CSB5:
20900 + r->name = "ServerWorks";
20901 + r->get = pirq_serverworks_get;
20902 + r->set = pirq_serverworks_set;
20903 + return 1;
20904 + }
20905 + return 0;
20906 +}
20907 +
20908 +static __init int sis_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
20909 +{
20910 + if (device != PCI_DEVICE_ID_SI_503)
20911 + return 0;
20912 +
20913 + r->name = "SIS";
20914 + r->get = pirq_sis_get;
20915 + r->set = pirq_sis_set;
20916 + return 1;
20917 +}
20918 +
20919 +static __init int cyrix_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
20920 +{
20921 + switch(device)
20922 + {
20923 + case PCI_DEVICE_ID_CYRIX_5520:
20924 + r->name = "NatSemi";
20925 + r->get = pirq_cyrix_get;
20926 + r->set = pirq_cyrix_set;
20927 + return 1;
20928 + }
20929 + return 0;
20930 +}
20931 +
20932 +static __init int opti_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
20933 +{
20934 + switch(device)
20935 + {
20936 + case PCI_DEVICE_ID_OPTI_82C700:
20937 + r->name = "OPTI";
20938 + r->get = pirq_opti_get;
20939 + r->set = pirq_opti_set;
20940 + return 1;
20941 + }
20942 + return 0;
20943 +}
20944 +
20945 +static __init int ite_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
20946 +{
20947 + switch(device)
20948 + {
20949 + case PCI_DEVICE_ID_ITE_IT8330G_0:
20950 + r->name = "ITE";
20951 + r->get = pirq_ite_get;
20952 + r->set = pirq_ite_set;
20953 + return 1;
20954 + }
20955 + return 0;
20956 +}
20957 +
20958 +static __init int ali_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
20959 +{
20960 + switch(device)
20961 + {
20962 + case PCI_DEVICE_ID_AL_M1533:
20963 + case PCI_DEVICE_ID_AL_M1563:
20964 + printk(KERN_DEBUG "PCI: Using ALI IRQ Router\n");
20965 + r->name = "ALI";
20966 + r->get = pirq_ali_get;
20967 + r->set = pirq_ali_set;
20968 + return 1;
20969 + }
20970 + return 0;
20971 +}
20972 +
20973 +static __init int amd_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
20974 +{
20975 + switch(device)
20976 + {
20977 + case PCI_DEVICE_ID_AMD_VIPER_740B:
20978 + r->name = "AMD756";
20979 + break;
20980 + case PCI_DEVICE_ID_AMD_VIPER_7413:
20981 + r->name = "AMD766";
20982 + break;
20983 + case PCI_DEVICE_ID_AMD_VIPER_7443:
20984 + r->name = "AMD768";
20985 + break;
20986 + default:
20987 + return 0;
20988 + }
20989 + r->get = pirq_amd756_get;
20990 + r->set = pirq_amd756_set;
20991 + return 1;
20992 +}
20993 +
20994 +static __initdata struct irq_router_handler pirq_routers[] = {
20995 + { PCI_VENDOR_ID_INTEL, intel_router_probe },
20996 + { PCI_VENDOR_ID_AL, ali_router_probe },
20997 + { PCI_VENDOR_ID_ITE, ite_router_probe },
20998 + { PCI_VENDOR_ID_VIA, via_router_probe },
20999 + { PCI_VENDOR_ID_OPTI, opti_router_probe },
21000 + { PCI_VENDOR_ID_SI, sis_router_probe },
21001 + { PCI_VENDOR_ID_CYRIX, cyrix_router_probe },
21002 + { PCI_VENDOR_ID_VLSI, vlsi_router_probe },
21003 + { PCI_VENDOR_ID_SERVERWORKS, serverworks_router_probe },
21004 + { PCI_VENDOR_ID_AMD, amd_router_probe },
21005 + /* Someone with docs needs to add the ATI Radeon IGP */
21006 + { 0, NULL }
21007 +};
21008 +static struct irq_router pirq_router;
21009 +static struct pci_dev *pirq_router_dev;
21010 +
21011 +
21012 +/*
21013 + * FIXME: should we have an option to say "generic for
21014 + * chipset" ?
21015 + */
21016 +
21017 +static void __init pirq_find_router(struct irq_router *r)
21018 +{
21019 + struct irq_routing_table *rt = pirq_table;
21020 + struct irq_router_handler *h;
21021 +
21022 +#ifdef CONFIG_PCI_BIOS
21023 + if (!rt->signature) {
21024 + printk(KERN_INFO "PCI: Using BIOS for IRQ routing\n");
21025 + r->set = pirq_bios_set;
21026 + r->name = "BIOS";
21027 + return;
21028 + }
21029 +#endif
21030 +
21031 + /* Default unless a driver reloads it */
21032 + r->name = "default";
21033 + r->get = NULL;
21034 + r->set = NULL;
21035 +
21036 + DBG(KERN_DEBUG "PCI: Attempting to find IRQ router for %04x:%04x\n",
21037 + rt->rtr_vendor, rt->rtr_device);
21038 +
21039 + pirq_router_dev = pci_find_slot(rt->rtr_bus, rt->rtr_devfn);
21040 + if (!pirq_router_dev) {
21041 + DBG(KERN_DEBUG "PCI: Interrupt router not found at "
21042 + "%02x:%02x\n", rt->rtr_bus, rt->rtr_devfn);
21043 + return;
21044 + }
21045 +
21046 + for( h = pirq_routers; h->vendor; h++) {
21047 + /* First look for a router match */
21048 + if (rt->rtr_vendor == h->vendor && h->probe(r, pirq_router_dev, rt->rtr_device))
21049 + break;
21050 + /* Fall back to a device match */
21051 + if (pirq_router_dev->vendor == h->vendor && h->probe(r, pirq_router_dev, pirq_router_dev->device))
21052 + break;
21053 + }
21054 + printk(KERN_INFO "PCI: Using IRQ router %s [%04x/%04x] at %s\n",
21055 + pirq_router.name,
21056 + pirq_router_dev->vendor,
21057 + pirq_router_dev->device,
21058 + pci_name(pirq_router_dev));
21059 +}
21060 +
21061 +static struct irq_info *pirq_get_info(struct pci_dev *dev)
21062 +{
21063 + struct irq_routing_table *rt = pirq_table;
21064 + int entries = (rt->size - sizeof(struct irq_routing_table)) / sizeof(struct irq_info);
21065 + struct irq_info *info;
21066 +
21067 + for (info = rt->slots; entries--; info++)
21068 + if (info->bus == dev->bus->number && PCI_SLOT(info->devfn) == PCI_SLOT(dev->devfn))
21069 + return info;
21070 + return NULL;
21071 +}
21072 +
21073 +static int pcibios_lookup_irq(struct pci_dev *dev, int assign)
21074 +{
21075 + u8 pin;
21076 + struct irq_info *info;
21077 + int i, pirq, newirq;
21078 + int irq = 0;
21079 + u32 mask;
21080 + struct irq_router *r = &pirq_router;
21081 + struct pci_dev *dev2 = NULL;
21082 + char *msg = NULL;
21083 +
21084 + /* Find IRQ pin */
21085 + pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
21086 + if (!pin) {
21087 + DBG(KERN_DEBUG " -> no interrupt pin\n");
21088 + return 0;
21089 + }
21090 + pin = pin - 1;
21091 +
21092 + /* Find IRQ routing entry */
21093 +
21094 + if (!pirq_table)
21095 + return 0;
21096 +
21097 + DBG(KERN_DEBUG "IRQ for %s[%c]", pci_name(dev), 'A' + pin);
21098 + info = pirq_get_info(dev);
21099 + if (!info) {
21100 + DBG(" -> not found in routing table\n" KERN_DEBUG);
21101 + return 0;
21102 + }
21103 + pirq = info->irq[pin].link;
21104 + mask = info->irq[pin].bitmap;
21105 + if (!pirq) {
21106 + DBG(" -> not routed\n" KERN_DEBUG);
21107 + return 0;
21108 + }
21109 + DBG(" -> PIRQ %02x, mask %04x, excl %04x", pirq, mask, pirq_table->exclusive_irqs);
21110 + mask &= pcibios_irq_mask;
21111 +
21112 + /* Work around broken HP Pavilion Notebooks which assign USB to
21113 + IRQ 9 even though it is actually wired to IRQ 11 */
21114 +
21115 + if (broken_hp_bios_irq9 && pirq == 0x59 && dev->irq == 9) {
21116 + dev->irq = 11;
21117 + pci_write_config_byte(dev, PCI_INTERRUPT_LINE, 11);
21118 + r->set(pirq_router_dev, dev, pirq, 11);
21119 + }
21120 +
21121 + /* same for Acer Travelmate 360, but with CB and irq 11 -> 10 */
21122 + if (acer_tm360_irqrouting && dev->irq == 11 && dev->vendor == PCI_VENDOR_ID_O2) {
21123 + pirq = 0x68;
21124 + mask = 0x400;
21125 + dev->irq = r->get(pirq_router_dev, dev, pirq);
21126 + pci_write_config_byte(dev, PCI_INTERRUPT_LINE, dev->irq);
21127 + }
21128 +
21129 + /*
21130 + * Find the best IRQ to assign: use the one
21131 + * reported by the device if possible.
21132 + */
21133 + newirq = dev->irq;
21134 + if (newirq && !((1 << newirq) & mask)) {
21135 + if ( pci_probe & PCI_USE_PIRQ_MASK) newirq = 0;
21136 + else printk("\n" KERN_WARNING
21137 + "PCI: IRQ %i for device %s doesn't match PIRQ mask "
21138 + "- try pci=usepirqmask\n" KERN_DEBUG, newirq,
21139 + pci_name(dev));
21140 + }
21141 + if (!newirq && assign) {
21142 + for (i = 0; i < 16; i++) {
21143 + if (!(mask & (1 << i)))
21144 + continue;
21145 + if (pirq_penalty[i] < pirq_penalty[newirq] && can_request_irq(i, SA_SHIRQ))
21146 + newirq = i;
21147 + }
21148 + }
21149 + DBG(" -> newirq=%d", newirq);
21150 +
21151 + /* Check if it is hardcoded */
21152 + if ((pirq & 0xf0) == 0xf0) {
21153 + irq = pirq & 0xf;
21154 + DBG(" -> hardcoded IRQ %d\n", irq);
21155 + msg = "Hardcoded";
21156 + } else if ( r->get && (irq = r->get(pirq_router_dev, dev, pirq)) && \
21157 + ((!(pci_probe & PCI_USE_PIRQ_MASK)) || ((1 << irq) & mask)) ) {
21158 + DBG(" -> got IRQ %d\n", irq);
21159 + msg = "Found";
21160 + } else if (newirq && r->set && (dev->class >> 8) != PCI_CLASS_DISPLAY_VGA) {
21161 + DBG(" -> assigning IRQ %d", newirq);
21162 + if (r->set(pirq_router_dev, dev, pirq, newirq)) {
21163 + eisa_set_level_irq(newirq);
21164 + DBG(" ... OK\n");
21165 + msg = "Assigned";
21166 + irq = newirq;
21167 + }
21168 + }
21169 +
21170 + if (!irq) {
21171 + DBG(" ... failed\n");
21172 + if (newirq && mask == (1 << newirq)) {
21173 + msg = "Guessed";
21174 + irq = newirq;
21175 + } else
21176 + return 0;
21177 + }
21178 + printk(KERN_INFO "PCI: %s IRQ %d for device %s\n", msg, irq, pci_name(dev));
21179 +
21180 + /* Update IRQ for all devices with the same pirq value */
21181 + while ((dev2 = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev2)) != NULL) {
21182 + pci_read_config_byte(dev2, PCI_INTERRUPT_PIN, &pin);
21183 + if (!pin)
21184 + continue;
21185 + pin--;
21186 + info = pirq_get_info(dev2);
21187 + if (!info)
21188 + continue;
21189 + if (info->irq[pin].link == pirq) {
21190 + /* We refuse to override the dev->irq information. Give a warning! */
21191 + if ( dev2->irq && dev2->irq != irq && \
21192 + (!(pci_probe & PCI_USE_PIRQ_MASK) || \
21193 + ((1 << dev2->irq) & mask)) ) {
21194 +#ifndef CONFIG_PCI_MSI
21195 + printk(KERN_INFO "IRQ routing conflict for %s, have irq %d, want irq %d\n",
21196 + pci_name(dev2), dev2->irq, irq);
21197 +#endif
21198 + continue;
21199 + }
21200 + dev2->irq = irq;
21201 + pirq_penalty[irq]++;
21202 + if (dev != dev2)
21203 + printk(KERN_INFO "PCI: Sharing IRQ %d with %s\n", irq, pci_name(dev2));
21204 + }
21205 + }
21206 + return 1;
21207 +}
21208 +
21209 +static void __init pcibios_fixup_irqs(void)
21210 +{
21211 + struct pci_dev *dev = NULL;
21212 + u8 pin;
21213 +
21214 + DBG(KERN_DEBUG "PCI: IRQ fixup\n");
21215 + while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
21216 + /*
21217 + * If the BIOS has set an out of range IRQ number, just ignore it.
21218 + * Also keep track of which IRQ's are already in use.
21219 + */
21220 + if (dev->irq >= 16) {
21221 + DBG(KERN_DEBUG "%s: ignoring bogus IRQ %d\n", pci_name(dev), dev->irq);
21222 + dev->irq = 0;
21223 + }
21224 + /* If the IRQ is already assigned to a PCI device, ignore its ISA use penalty */
21225 + if (pirq_penalty[dev->irq] >= 100 && pirq_penalty[dev->irq] < 100000)
21226 + pirq_penalty[dev->irq] = 0;
21227 + pirq_penalty[dev->irq]++;
21228 + }
21229 +
21230 + dev = NULL;
21231 + while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
21232 + pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
21233 +#ifdef CONFIG_X86_IO_APIC
21234 + /*
21235 + * Recalculate IRQ numbers if we use the I/O APIC.
21236 + */
21237 + if (io_apic_assign_pci_irqs)
21238 + {
21239 + int irq;
21240 +
21241 + if (pin) {
21242 + pin--; /* interrupt pins are numbered starting from 1 */
21243 + irq = IO_APIC_get_PCI_irq_vector(dev->bus->number, PCI_SLOT(dev->devfn), pin);
21244 + /*
21245 + * Busses behind bridges are typically not listed in the MP-table.
21246 + * In this case we have to look up the IRQ based on the parent bus,
21247 + * parent slot, and pin number. The SMP code detects such bridged
21248 + * busses itself so we should get into this branch reliably.
21249 + */
21250 + if (irq < 0 && dev->bus->parent) { /* go back to the bridge */
21251 + struct pci_dev * bridge = dev->bus->self;
21252 +
21253 + pin = (pin + PCI_SLOT(dev->devfn)) % 4;
21254 + irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number,
21255 + PCI_SLOT(bridge->devfn), pin);
21256 + if (irq >= 0)
21257 + printk(KERN_WARNING "PCI: using PPB %s[%c] to get irq %d\n",
21258 + pci_name(bridge), 'A' + pin, irq);
21259 + }
21260 + if (irq >= 0) {
21261 + if (use_pci_vector() &&
21262 + !platform_legacy_irq(irq))
21263 + irq = IO_APIC_VECTOR(irq);
21264 +
21265 + printk(KERN_INFO "PCI->APIC IRQ transform: %s[%c] -> IRQ %d\n",
21266 + pci_name(dev), 'A' + pin, irq);
21267 + dev->irq = irq;
21268 + }
21269 + }
21270 + }
21271 +#endif
21272 + /*
21273 + * Still no IRQ? Try to lookup one...
21274 + */
21275 + if (pin && !dev->irq)
21276 + pcibios_lookup_irq(dev, 0);
21277 + }
21278 +}
21279 +
21280 +/*
21281 + * Work around broken HP Pavilion Notebooks which assign USB to
21282 + * IRQ 9 even though it is actually wired to IRQ 11
21283 + */
21284 +static int __init fix_broken_hp_bios_irq9(struct dmi_system_id *d)
21285 +{
21286 + if (!broken_hp_bios_irq9) {
21287 + broken_hp_bios_irq9 = 1;
21288 + printk(KERN_INFO "%s detected - fixing broken IRQ routing\n", d->ident);
21289 + }
21290 + return 0;
21291 +}
21292 +
21293 +/*
21294 + * Work around broken Acer TravelMate 360 Notebooks which assign
21295 + * Cardbus to IRQ 11 even though it is actually wired to IRQ 10
21296 + */
21297 +static int __init fix_acer_tm360_irqrouting(struct dmi_system_id *d)
21298 +{
21299 + if (!acer_tm360_irqrouting) {
21300 + acer_tm360_irqrouting = 1;
21301 + printk(KERN_INFO "%s detected - fixing broken IRQ routing\n", d->ident);
21302 + }
21303 + return 0;
21304 +}
21305 +
21306 +static struct dmi_system_id __initdata pciirq_dmi_table[] = {
21307 + {
21308 + .callback = fix_broken_hp_bios_irq9,
21309 + .ident = "HP Pavilion N5400 Series Laptop",
21310 + .matches = {
21311 + DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
21312 + DMI_MATCH(DMI_BIOS_VERSION, "GE.M1.03"),
21313 + DMI_MATCH(DMI_PRODUCT_VERSION, "HP Pavilion Notebook Model GE"),
21314 + DMI_MATCH(DMI_BOARD_VERSION, "OmniBook N32N-736"),
21315 + },
21316 + },
21317 + {
21318 + .callback = fix_acer_tm360_irqrouting,
21319 + .ident = "Acer TravelMate 36x Laptop",
21320 + .matches = {
21321 + DMI_MATCH(DMI_SYS_VENDOR, "Acer"),
21322 + DMI_MATCH(DMI_PRODUCT_NAME, "TravelMate 360"),
21323 + },
21324 + },
21325 + { }
21326 +};
21327 +
21328 +static int __init pcibios_irq_init(void)
21329 +{
21330 + DBG(KERN_DEBUG "PCI: IRQ init\n");
21331 +
21332 + if (pcibios_enable_irq || raw_pci_ops == NULL)
21333 + return 0;
21334 +
21335 + dmi_check_system(pciirq_dmi_table);
21336 +
21337 + pirq_table = pirq_find_routing_table();
21338 +
21339 +#ifdef CONFIG_PCI_BIOS
21340 + if (!pirq_table && (pci_probe & PCI_BIOS_IRQ_SCAN))
21341 + pirq_table = pcibios_get_irq_routing_table();
21342 +#endif
21343 + if (pirq_table) {
21344 + pirq_peer_trick();
21345 + pirq_find_router(&pirq_router);
21346 + if (pirq_table->exclusive_irqs) {
21347 + int i;
21348 + for (i=0; i<16; i++)
21349 + if (!(pirq_table->exclusive_irqs & (1 << i)))
21350 + pirq_penalty[i] += 100;
21351 + }
21352 + /* If we're using the I/O APIC, avoid using the PCI IRQ routing table */
21353 + if (io_apic_assign_pci_irqs)
21354 + pirq_table = NULL;
21355 + }
21356 +
21357 + pcibios_enable_irq = pirq_enable_irq;
21358 +
21359 + pcibios_fixup_irqs();
21360 + return 0;
21361 +}
21362 +
21363 +subsys_initcall(pcibios_irq_init);
21364 +
21365 +
21366 +static void pirq_penalize_isa_irq(int irq, int active)
21367 +{
21368 + /*
21369 + * If any ISAPnP device reports an IRQ in its list of possible
21370 + * IRQ's, we try to avoid assigning it to PCI devices.
21371 + */
21372 + if (irq < 16) {
21373 + if (active)
21374 + pirq_penalty[irq] += 1000;
21375 + else
21376 + pirq_penalty[irq] += 100;
21377 + }
21378 +}
21379 +
21380 +void pcibios_penalize_isa_irq(int irq, int active)
21381 +{
21382 +#ifdef CONFIG_ACPI
21383 + if (!acpi_noirq)
21384 + acpi_penalize_isa_irq(irq, active);
21385 + else
21386 +#endif
21387 + pirq_penalize_isa_irq(irq, active);
21388 +}
21389 +
21390 +static int pirq_enable_irq(struct pci_dev *dev)
21391 +{
21392 + u8 pin;
21393 + struct pci_dev *temp_dev;
21394 +
21395 + pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
21396 + if (pin && !pcibios_lookup_irq(dev, 1) && !dev->irq) {
21397 + char *msg = "";
21398 +
21399 + pin--; /* interrupt pins are numbered starting from 1 */
21400 +
21401 + if (io_apic_assign_pci_irqs) {
21402 + int irq;
21403 +
21404 + irq = IO_APIC_get_PCI_irq_vector(dev->bus->number, PCI_SLOT(dev->devfn), pin);
21405 + /*
21406 + * Busses behind bridges are typically not listed in the MP-table.
21407 + * In this case we have to look up the IRQ based on the parent bus,
21408 + * parent slot, and pin number. The SMP code detects such bridged
21409 + * busses itself so we should get into this branch reliably.
21410 + */
21411 + temp_dev = dev;
21412 + while (irq < 0 && dev->bus->parent) { /* go back to the bridge */
21413 + struct pci_dev * bridge = dev->bus->self;
21414 +
21415 + pin = (pin + PCI_SLOT(dev->devfn)) % 4;
21416 + irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number,
21417 + PCI_SLOT(bridge->devfn), pin);
21418 + if (irq >= 0)
21419 + printk(KERN_WARNING "PCI: using PPB %s[%c] to get irq %d\n",
21420 + pci_name(bridge), 'A' + pin, irq);
21421 + dev = bridge;
21422 + }
21423 + dev = temp_dev;
21424 + if (irq >= 0) {
21425 +#ifdef CONFIG_PCI_MSI
21426 + if (!platform_legacy_irq(irq))
21427 + irq = IO_APIC_VECTOR(irq);
21428 +#endif
21429 + printk(KERN_INFO "PCI->APIC IRQ transform: %s[%c] -> IRQ %d\n",
21430 + pci_name(dev), 'A' + pin, irq);
21431 + dev->irq = irq;
21432 + return 0;
21433 + } else
21434 + msg = " Probably buggy MP table.";
21435 + } else if (pci_probe & PCI_BIOS_IRQ_SCAN)
21436 + msg = "";
21437 + else
21438 + msg = " Please try using pci=biosirq.";
21439 +
21440 + /* With IDE legacy devices the IRQ lookup failure is not a problem.. */
21441 + if (dev->class >> 8 == PCI_CLASS_STORAGE_IDE && !(dev->class & 0x5))
21442 + return 0;
21443 +
21444 + printk(KERN_WARNING "PCI: No IRQ known for interrupt pin %c of device %s.%s\n",
21445 + 'A' + pin, pci_name(dev), msg);
21446 + }
21447 + return 0;
21448 +}
21449 +
21450 +int pci_vector_resources(int last, int nr_released)
21451 +{
21452 + int count = nr_released;
21453 +
21454 + int next = last;
21455 + int offset = (last % 8);
21456 +
21457 + while (next < FIRST_SYSTEM_VECTOR) {
21458 + next += 8;
21459 +#ifdef CONFIG_X86_64
21460 + if (next == IA32_SYSCALL_VECTOR)
21461 + continue;
21462 +#else
21463 + if (next == SYSCALL_VECTOR)
21464 + continue;
21465 +#endif
21466 + count++;
21467 + if (next >= FIRST_SYSTEM_VECTOR) {
21468 + if (offset%8) {
21469 + next = FIRST_DEVICE_VECTOR + offset;
21470 + offset++;
21471 + continue;
21472 + }
21473 + count--;
21474 + }
21475 + }
21476 +
21477 + return count;
21478 +}
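
For reference, the link-routing code above always prefers the least-penalized interrupt line that the router's bitmask allows; pirq_penalty is raised for exclusive/ISA lines and for lines already in use. A minimal sketch of that selection rule (hypothetical helper, not part of the kernel or of this patch):

/* Pick the least-penalized IRQ out of a 16-bit allowance mask.
 * Returns -1 when the mask permits no IRQ at all. */
static int pick_least_penalized_irq(unsigned int mask,
                                    const unsigned int penalty[16])
{
	int i, best = -1;

	for (i = 0; i < 16; i++) {
		if (!(mask & (1 << i)))
			continue;
		if (best < 0 || penalty[i] < penalty[best])
			best = i;
	}
	return best;
}
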
21479 diff -Nur linux-2.6.16.33-noxen/arch/i386/pci/mmconfig.c linux-2.6.16.33/arch/i386/pci/mmconfig.c
21480 --- linux-2.6.16.33-noxen/arch/i386/pci/mmconfig.c 2006-11-22 18:06:31.000000000 +0000
21481 +++ linux-2.6.16.33/arch/i386/pci/mmconfig.c 2007-05-23 21:00:01.000000000 +0000
21482 @@ -12,14 +12,22 @@
21483 #include <linux/pci.h>
21484 #include <linux/init.h>
21485 #include <linux/acpi.h>
21486 +#include <asm/e820.h>
21487 #include "pci.h"
21488
21489 +/* aperture is up to 256MB but BIOS may reserve less */
21490 +#define MMCONFIG_APER_MIN (2 * 1024*1024)
21491 +#define MMCONFIG_APER_MAX (256 * 1024*1024)
21492 +
21493 +/* Assume systems with more busses have correct MCFG */
21494 +#define MAX_CHECK_BUS 16
21495 +
21496 #define mmcfg_virt_addr ((void __iomem *) fix_to_virt(FIX_PCIE_MCFG))
21497
21498 /* The base address of the last MMCONFIG device accessed */
21499 static u32 mmcfg_last_accessed_device;
21500
21501 -static DECLARE_BITMAP(fallback_slots, 32);
21502 +static DECLARE_BITMAP(fallback_slots, MAX_CHECK_BUS*32);
21503
21504 /*
21505 * Functions for accessing PCI configuration space with MMCONFIG accesses
21506 @@ -29,8 +37,8 @@
21507 int cfg_num = -1;
21508 struct acpi_table_mcfg_config *cfg;
21509
21510 - if (seg == 0 && bus == 0 &&
21511 - test_bit(PCI_SLOT(devfn), fallback_slots))
21512 + if (seg == 0 && bus < MAX_CHECK_BUS &&
21513 + test_bit(PCI_SLOT(devfn) + 32*bus, fallback_slots))
21514 return 0;
21515
21516 while (1) {
21517 @@ -74,8 +82,10 @@
21518 unsigned long flags;
21519 u32 base;
21520
21521 - if (!value || (bus > 255) || (devfn > 255) || (reg > 4095))
21522 + if ((bus > 255) || (devfn > 255) || (reg > 4095)) {
21523 + *value = -1;
21524 return -EINVAL;
21525 + }
21526
21527 base = get_base_addr(seg, bus, devfn);
21528 if (!base)
21529 @@ -146,30 +156,66 @@
21530 Normally this can be expressed in the MCFG by not listing them
21531 and assigning suitable _SEGs, but this isn't implemented in some BIOS.
21532 Instead try to discover all devices on bus 0 that are unreachable using MM
21533 - and fallback for them.
21534 - We only do this for bus 0/seg 0 */
21535 + and fallback for them. */
21536 static __init void unreachable_devices(void)
21537 {
21538 - int i;
21539 + int i, k;
21540 unsigned long flags;
21541
21542 - for (i = 0; i < 32; i++) {
21543 - u32 val1;
21544 - u32 addr;
21545 + for (k = 0; k < MAX_CHECK_BUS; k++) {
21546 + for (i = 0; i < 32; i++) {
21547 + u32 val1;
21548 + u32 addr;
21549 +
21550 + pci_conf1_read(0, k, PCI_DEVFN(i, 0), 0, 4, &val1);
21551 + if (val1 == 0xffffffff)
21552 + continue;
21553 +
21554 + /* Locking probably not needed, but safer */
21555 + spin_lock_irqsave(&pci_config_lock, flags);
21556 + addr = get_base_addr(0, k, PCI_DEVFN(i, 0));
21557 + if (addr != 0)
21558 + pci_exp_set_dev_base(addr, k, PCI_DEVFN(i, 0));
21559 + if (addr == 0 ||
21560 + readl((u32 __iomem *)mmcfg_virt_addr) != val1) {
21561 +				set_bit(i + 32*k, fallback_slots);
21562 + printk(KERN_NOTICE
21563 + "PCI: No mmconfig possible on %x:%x\n", k, i);
21564 + }
21565 + spin_unlock_irqrestore(&pci_config_lock, flags);
21566 + }
21567 + }
21568 +}
21569
21570 - pci_conf1_read(0, 0, PCI_DEVFN(i, 0), 0, 4, &val1);
21571 - if (val1 == 0xffffffff)
21572 +/* NB. Ripped from arch/i386/kernel/setup.c for this Xen bugfix patch. */
21573 +#ifdef CONFIG_XEN
21574 +extern struct e820map machine_e820;
21575 +#define e820 machine_e820
21576 +#endif
21577 +static int __init
21578 +e820_all_mapped(unsigned long s, unsigned long e, unsigned type)
21579 +{
21580 + u64 start = s;
21581 + u64 end = e;
21582 + int i;
21583 + for (i = 0; i < e820.nr_map; i++) {
21584 + struct e820entry *ei = &e820.map[i];
21585 + if (type && ei->type != type)
21586 continue;
21587 -
21588 - /* Locking probably not needed, but safer */
21589 - spin_lock_irqsave(&pci_config_lock, flags);
21590 - addr = get_base_addr(0, 0, PCI_DEVFN(i, 0));
21591 - if (addr != 0)
21592 - pci_exp_set_dev_base(addr, 0, PCI_DEVFN(i, 0));
21593 - if (addr == 0 || readl((u32 __iomem *)mmcfg_virt_addr) != val1)
21594 - set_bit(i, fallback_slots);
21595 - spin_unlock_irqrestore(&pci_config_lock, flags);
21596 +		/* does this region (at least partly) overlap the range being checked? */
21597 + if (ei->addr >= end || ei->addr + ei->size <= start)
21598 + continue;
21599 + /* if the region is at the beginning of <start,end> we move
21600 + * start to the end of the region since it's ok until there
21601 + */
21602 + if (ei->addr <= start)
21603 + start = ei->addr + ei->size;
21604 + /* if start is now at or beyond end, we're done, full
21605 + * coverage */
21606 + if (start >= end)
21607 + return 1; /* we're done */
21608 }
21609 + return 0;
21610 }
21611
21612 static int __init pci_mmcfg_init(void)
21613 @@ -183,6 +229,15 @@
21614 (pci_mmcfg_config[0].base_address == 0))
21615 goto out;
21616
21617 + if (!e820_all_mapped(pci_mmcfg_config[0].base_address,
21618 + pci_mmcfg_config[0].base_address + MMCONFIG_APER_MIN,
21619 + E820_RESERVED)) {
21620 + printk(KERN_ERR "PCI: BIOS Bug: MCFG area at %x is not E820-reserved\n",
21621 + pci_mmcfg_config[0].base_address);
21622 + printk(KERN_ERR "PCI: Not using MMCONFIG.\n");
21623 + goto out;
21624 + }
21625 +
21626 printk(KERN_INFO "PCI: Using MMCONFIG\n");
21627 raw_pci_ops = &pci_mmcfg;
21628 pci_probe = (pci_probe & ~PCI_PROBE_MASK) | PCI_PROBE_MMCONF;
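
For orientation, the MMCONFIG (ECAM) accessors above map one 4 KB configuration page per function inside an aperture of at most 256 MB; the usual offset layout within that aperture is sketched below (illustrative helper only, names are not from the patch):

/* ECAM layout: 1 MB per bus, 4 KB per (device, function) pair. */
static inline unsigned long ecam_offset(unsigned int bus,
                                        unsigned int devfn,
                                        unsigned int reg)
{
	return ((unsigned long)bus << 20) |
	       ((unsigned long)devfn << 12) |
	       (reg & 0xfff);
}
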
21629 diff -Nur linux-2.6.16.33-noxen/arch/i386/pci/pcifront.c linux-2.6.16.33/arch/i386/pci/pcifront.c
21630 --- linux-2.6.16.33-noxen/arch/i386/pci/pcifront.c 1970-01-01 00:00:00.000000000 +0000
21631 +++ linux-2.6.16.33/arch/i386/pci/pcifront.c 2007-01-08 15:00:45.000000000 +0000
21632 @@ -0,0 +1,55 @@
21633 +/*
21634 + * PCI Frontend Stub - puts some "dummy" functions into the Linux x86 PCI core
21635 + * to support the Xen PCI Frontend's operation
21636 + *
21637 + * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
21638 + */
21639 +#include <linux/module.h>
21640 +#include <linux/init.h>
21641 +#include <linux/pci.h>
21642 +#include <asm/acpi.h>
21643 +#include "pci.h"
21644 +
21645 +static int pcifront_enable_irq(struct pci_dev *dev)
21646 +{
21647 + u8 irq;
21648 + pci_read_config_byte(dev, PCI_INTERRUPT_LINE, &irq);
21649 + dev->irq = irq;
21650 +
21651 + return 0;
21652 +}
21653 +
21654 +extern u8 pci_cache_line_size;
21655 +
21656 +static int __init pcifront_x86_stub_init(void)
21657 +{
21658 + struct cpuinfo_x86 *c = &boot_cpu_data;
21659 +
21660 + /* Only install our method if we haven't found real hardware already */
21661 + if (raw_pci_ops)
21662 + return 0;
21663 +
21664 + printk(KERN_INFO "PCI: setting up Xen PCI frontend stub\n");
21665 +
21666 + /* Copied from arch/i386/pci/common.c */
21667 + pci_cache_line_size = 32 >> 2;
21668 + if (c->x86 >= 6 && c->x86_vendor == X86_VENDOR_AMD)
21669 + pci_cache_line_size = 64 >> 2; /* K7 & K8 */
21670 + else if (c->x86 > 6 && c->x86_vendor == X86_VENDOR_INTEL)
21671 + pci_cache_line_size = 128 >> 2; /* P4 */
21672 +
21673 + /* On x86, we need to disable the normal IRQ routing table and
21674 + * just ask the backend
21675 + */
21676 + pcibios_enable_irq = pcifront_enable_irq;
21677 + pcibios_disable_irq = NULL;
21678 +
21679 +#ifdef CONFIG_ACPI
21680 + /* Keep ACPI out of the picture */
21681 + acpi_noirq = 1;
21682 +#endif
21683 +
21684 + return 0;
21685 +}
21686 +
21687 +arch_initcall(pcifront_x86_stub_init);
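
The frontend stub above is reached through the ordinary device-enable path: on i386, pci_enable_device() ends in the pcibios_enable_irq hook, so a driver running in a PCI driver domain simply sees dev->irq filled from PCI_INTERRUPT_LINE. A hedged sketch of such a driver fragment (all names here are illustrative, not from the patch):

#include <linux/pci.h>
#include <linux/interrupt.h>

static irqreturn_t example_intr(int irq, void *dev_id, struct pt_regs *regs)
{
	return IRQ_HANDLED;
}

static int example_probe(struct pci_dev *dev, const struct pci_device_id *id)
{
	int rc = pci_enable_device(dev);	/* invokes pcifront_enable_irq */
	if (rc)
		return rc;
	/* dev->irq now carries the value the PCI backend placed in config space */
	return request_irq(dev->irq, example_intr, SA_SHIRQ, "example", dev);
}
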
21688 diff -Nur linux-2.6.16.33-noxen/arch/i386/power/Makefile linux-2.6.16.33/arch/i386/power/Makefile
21689 --- linux-2.6.16.33-noxen/arch/i386/power/Makefile 2006-11-22 18:06:31.000000000 +0000
21690 +++ linux-2.6.16.33/arch/i386/power/Makefile 2007-01-08 15:00:45.000000000 +0000
21691 @@ -1,2 +1,4 @@
21692 -obj-$(CONFIG_PM) += cpu.o
21693 +obj-$(CONFIG_PM_LEGACY) += cpu.o
21694 +obj-$(CONFIG_SOFTWARE_SUSPEND) += cpu.o
21695 +obj-$(CONFIG_ACPI_SLEEP) += cpu.o
21696 obj-$(CONFIG_SOFTWARE_SUSPEND) += swsusp.o
21697 diff -Nur linux-2.6.16.33-noxen/arch/ia64/Kconfig linux-2.6.16.33/arch/ia64/Kconfig
21698 --- linux-2.6.16.33-noxen/arch/ia64/Kconfig 2006-11-22 18:06:31.000000000 +0000
21699 +++ linux-2.6.16.33/arch/ia64/Kconfig 2007-01-08 15:00:45.000000000 +0000
21700 @@ -50,6 +50,34 @@
21701 bool
21702 default y
21703
21704 +config XEN
21705 + bool "Xen hypervisor support"
21706 + default y
21707 + help
21708 + Enable Xen hypervisor support. Resulting kernel runs
21709 + both as a guest OS on Xen and natively on hardware.
21710 +
21711 +config XEN_IA64_VDSO_PARAVIRT
21712 + bool
21713 + depends on XEN && !ITANIUM
21714 + default y
21715 + help
21716 + vDSO paravirtualization
21717 +
21718 +config XEN_IA64_EXPOSE_P2M
21719 +	bool "Xen/IA64 expose p2m table"
21720 + depends on XEN
21721 + default y
21722 + help
21723 +	  Expose the p2m table from Xen.
21724 +
21725 +config XEN_IA64_EXPOSE_P2M_USE_DTR
21726 + bool "Xen/IA64 map p2m table with dtr"
21727 + depends on XEN_IA64_EXPOSE_P2M
21728 + default y
21729 + help
21730 + use dtr to map the exposed p2m table
21731 +
21732 config SCHED_NO_NO_OMIT_FRAME_POINTER
21733 bool
21734 default y
21735 @@ -413,6 +441,21 @@
21736 bool
21737 default PCI
21738
21739 +config XEN_PCIDEV_FRONTEND
21740 + bool "Xen PCI Frontend"
21741 + depends on PCI && XEN
21742 + default y
21743 + help
21744 + The PCI device frontend driver allows the kernel to import arbitrary
21745 + PCI devices from a PCI backend to support PCI driver domains.
21746 +
21747 +config XEN_PCIDEV_FE_DEBUG
21748 + bool "Xen PCI Frontend Debugging"
21749 + depends on XEN_PCIDEV_FRONTEND
21750 + default n
21751 + help
21752 + Enables some debug statements within the PCI Frontend.
21753 +
21754 source "drivers/pci/Kconfig"
21755
21756 source "drivers/pci/hotplug/Kconfig"
21757 @@ -470,3 +513,32 @@
21758 source "security/Kconfig"
21759
21760 source "crypto/Kconfig"
21761 +
21762 +#
21763 +# override default values of drivers/xen/Kconfig
21764 +#
21765 +if XEN
21766 +config XEN_UTIL
21767 + default n
21768 +
21769 +config HAVE_ARCH_ALLOC_SKB
21770 + default y
21771 +
21772 +config HAVE_ARCH_DEV_ALLOC_SKB
21773 + default y
21774 +
21775 +config XEN_BALLOON
21776 + default y
21777 +
21778 +config XEN_SKBUFF
21779 + default y
21780 + depends on NET
21781 +
21782 +config XEN_REBOOT
21783 + default y
21784 +
21785 +config XEN_SMPBOOT
21786 + default n
21787 +endif
21788 +
21789 +source "drivers/xen/Kconfig"
21790 diff -Nur linux-2.6.16.33-noxen/arch/ia64/Makefile linux-2.6.16.33/arch/ia64/Makefile
21791 --- linux-2.6.16.33-noxen/arch/ia64/Makefile 2006-11-22 18:06:31.000000000 +0000
21792 +++ linux-2.6.16.33/arch/ia64/Makefile 2007-01-08 15:00:45.000000000 +0000
21793 @@ -42,6 +42,12 @@
21794 endif
21795
21796 CFLAGS += $(cflags-y)
21797 +
21798 +cppflags-$(CONFIG_XEN) += \
21799 + -D__XEN_INTERFACE_VERSION__=$(CONFIG_XEN_INTERFACE_VERSION)
21800 +
21801 +CPPFLAGS += $(cppflags-y)
21802 +
21803 head-y := arch/ia64/kernel/head.o arch/ia64/kernel/init_task.o
21804
21805 libs-y += arch/ia64/lib/
21806 @@ -52,9 +58,15 @@
21807 core-$(CONFIG_IA64_HP_ZX1) += arch/ia64/dig/
21808 core-$(CONFIG_IA64_HP_ZX1_SWIOTLB) += arch/ia64/dig/
21809 core-$(CONFIG_IA64_SGI_SN2) += arch/ia64/sn/
21810 +core-$(CONFIG_XEN) += arch/ia64/xen/
21811
21812 drivers-$(CONFIG_PCI) += arch/ia64/pci/
21813 +ifneq ($(CONFIG_XEN),y)
21814 drivers-$(CONFIG_IA64_HP_SIM) += arch/ia64/hp/sim/
21815 +endif
21816 +ifneq ($(CONFIG_IA64_GENERIC),y)
21817 +drivers-$(CONFIG_XEN) += arch/ia64/hp/sim/
21818 +endif
21819 drivers-$(CONFIG_IA64_HP_ZX1) += arch/ia64/hp/common/ arch/ia64/hp/zx1/
21820 drivers-$(CONFIG_IA64_HP_ZX1_SWIOTLB) += arch/ia64/hp/common/ arch/ia64/hp/zx1/
21821 drivers-$(CONFIG_IA64_GENERIC) += arch/ia64/hp/common/ arch/ia64/hp/zx1/ arch/ia64/hp/sim/ arch/ia64/sn/
21822 @@ -68,6 +80,8 @@
21823
21824 compressed: vmlinux.gz
21825
21826 +vmlinuz: vmlinux.gz
21827 +
21828 vmlinux.gz: vmlinux
21829 $(Q)$(MAKE) $(build)=$(boot) $@
21830
21831 @@ -82,8 +96,8 @@
21832 boot: lib/lib.a vmlinux
21833 $(Q)$(MAKE) $(build)=$(boot) $@
21834
21835 -install: vmlinux.gz
21836 - sh $(srctree)/arch/ia64/install.sh $(KERNELRELEASE) $< System.map "$(INSTALL_PATH)"
21837 +install:
21838 + -yes | sh $(srctree)/arch/ia64/install.sh $(KERNELRELEASE) vmlinux.gz System.map "$(INSTALL_PATH)"
21839
21840 define archhelp
21841 echo '* compressed - Build compressed kernel image'
21842 diff -Nur linux-2.6.16.33-noxen/arch/ia64/dig/setup.c linux-2.6.16.33/arch/ia64/dig/setup.c
21843 --- linux-2.6.16.33-noxen/arch/ia64/dig/setup.c 2006-11-22 18:06:31.000000000 +0000
21844 +++ linux-2.6.16.33/arch/ia64/dig/setup.c 2007-01-08 15:00:45.000000000 +0000
21845 @@ -25,6 +25,8 @@
21846 #include <asm/machvec.h>
21847 #include <asm/system.h>
21848
21849 +#include <xen/xencons.h>
21850 +
21851 void __init
21852 dig_setup (char **cmdline_p)
21853 {
21854 @@ -68,6 +70,21 @@
21855 screen_info.orig_video_mode = 3; /* XXX fake */
21856 screen_info.orig_video_isVGA = 1; /* XXX fake */
21857 screen_info.orig_video_ega_bx = 3; /* XXX fake */
21858 +#ifdef CONFIG_XEN
21859 + if (!is_running_on_xen() || !is_initial_xendomain())
21860 + return;
21861 +
21862 + if (xen_start_info->console.dom0.info_size >=
21863 + sizeof(struct dom0_vga_console_info)) {
21864 + const struct dom0_vga_console_info *info =
21865 + (struct dom0_vga_console_info *)(
21866 + (char *)xen_start_info +
21867 + xen_start_info->console.dom0.info_off);
21868 + dom0_init_screen_info(info);
21869 + }
21870 + xen_start_info->console.domU.mfn = 0;
21871 + xen_start_info->console.domU.evtchn = 0;
21872 +#endif
21873 }
21874
21875 void __init
21876 diff -Nur linux-2.6.16.33-noxen/arch/ia64/hp/sim/Makefile linux-2.6.16.33/arch/ia64/hp/sim/Makefile
21877 --- linux-2.6.16.33-noxen/arch/ia64/hp/sim/Makefile 2006-11-22 18:06:31.000000000 +0000
21878 +++ linux-2.6.16.33/arch/ia64/hp/sim/Makefile 2007-01-08 15:00:45.000000000 +0000
21879 @@ -14,3 +14,5 @@
21880 obj-$(CONFIG_HP_SIMSERIAL) += simserial.o
21881 obj-$(CONFIG_HP_SIMSERIAL_CONSOLE) += hpsim_console.o
21882 obj-$(CONFIG_HP_SIMSCSI) += simscsi.o
21883 +obj-$(CONFIG_XEN) += simserial.o
21884 +obj-$(CONFIG_XEN) += hpsim_console.o
21885 diff -Nur linux-2.6.16.33-noxen/arch/ia64/kernel/Makefile linux-2.6.16.33/arch/ia64/kernel/Makefile
21886 --- linux-2.6.16.33-noxen/arch/ia64/kernel/Makefile 2006-11-22 18:06:31.000000000 +0000
21887 +++ linux-2.6.16.33/arch/ia64/kernel/Makefile 2007-01-08 15:00:45.000000000 +0000
21888 @@ -44,7 +44,8 @@
21889 quiet_cmd_gate = GATE $@
21890 cmd_gate = $(CC) -nostdlib $(GATECFLAGS_$(@F)) -Wl,-T,$(filter-out FORCE,$^) -o $@
21891
21892 -GATECFLAGS_gate.so = -shared -s -Wl,-soname=linux-gate.so.1
21893 +GATECFLAGS_gate.so = -shared -s -Wl,-soname=linux-gate.so.1 \
21894 + $(call ld-option, -Wl$(comma)--hash-style=sysv)
21895 $(obj)/gate.so: $(obj)/gate.lds $(obj)/gate.o FORCE
21896 $(call if_changed,gate)
21897
21898 diff -Nur linux-2.6.16.33-noxen/arch/ia64/kernel/asm-offsets.c linux-2.6.16.33/arch/ia64/kernel/asm-offsets.c
21899 --- linux-2.6.16.33-noxen/arch/ia64/kernel/asm-offsets.c 2006-11-22 18:06:31.000000000 +0000
21900 +++ linux-2.6.16.33/arch/ia64/kernel/asm-offsets.c 2007-01-08 15:00:45.000000000 +0000
21901 @@ -261,4 +261,28 @@
21902 DEFINE(IA64_TIME_SOURCE_MMIO64, TIME_SOURCE_MMIO64);
21903 DEFINE(IA64_TIME_SOURCE_MMIO32, TIME_SOURCE_MMIO32);
21904 DEFINE(IA64_TIMESPEC_TV_NSEC_OFFSET, offsetof (struct timespec, tv_nsec));
21905 +
21906 +#ifdef CONFIG_XEN
21907 + BLANK();
21908 +
21909 +#define DEFINE_MAPPED_REG_OFS(sym, field) \
21910 + DEFINE(sym, (XMAPPEDREGS_OFS + offsetof(mapped_regs_t, field)))
21911 +
21912 + DEFINE_MAPPED_REG_OFS(XSI_PSR_I_ADDR_OFS, interrupt_mask_addr);
21913 + DEFINE_MAPPED_REG_OFS(XSI_IPSR_OFS, ipsr);
21914 + DEFINE_MAPPED_REG_OFS(XSI_IIP_OFS, iip);
21915 + DEFINE_MAPPED_REG_OFS(XSI_IFS_OFS, ifs);
21916 + DEFINE_MAPPED_REG_OFS(XSI_PRECOVER_IFS_OFS, precover_ifs);
21917 + DEFINE_MAPPED_REG_OFS(XSI_ISR_OFS, isr);
21918 + DEFINE_MAPPED_REG_OFS(XSI_IFA_OFS, ifa);
21919 + DEFINE_MAPPED_REG_OFS(XSI_IIPA_OFS, iipa);
21920 + DEFINE_MAPPED_REG_OFS(XSI_IIM_OFS, iim);
21921 + DEFINE_MAPPED_REG_OFS(XSI_IHA_OFS, iha);
21922 + DEFINE_MAPPED_REG_OFS(XSI_ITIR_OFS, itir);
21923 + DEFINE_MAPPED_REG_OFS(XSI_PSR_IC_OFS, interrupt_collection_enabled);
21924 + DEFINE_MAPPED_REG_OFS(XSI_INCOMPL_REGFR_OFS, incomplete_regframe);
21925 + DEFINE_MAPPED_REG_OFS(XSI_BANKNUM_OFS, banknum);
21926 + DEFINE_MAPPED_REG_OFS(XSI_BANK0_R16_OFS, bank0_regs[0]);
21927 + DEFINE_MAPPED_REG_OFS(XSI_BANK1_R16_OFS, bank1_regs[0]);
21928 +#endif /* CONFIG_XEN */
21929 }
21930 diff -Nur linux-2.6.16.33-noxen/arch/ia64/kernel/entry.S linux-2.6.16.33/arch/ia64/kernel/entry.S
21931 --- linux-2.6.16.33-noxen/arch/ia64/kernel/entry.S 2006-11-22 18:06:31.000000000 +0000
21932 +++ linux-2.6.16.33/arch/ia64/kernel/entry.S 2007-01-08 15:00:45.000000000 +0000
21933 @@ -181,7 +181,7 @@
21934 * called. The code starting at .map relies on this. The rest of the code
21935 * doesn't care about the interrupt masking status.
21936 */
21937 -GLOBAL_ENTRY(ia64_switch_to)
21938 +GLOBAL_ENTRY(__ia64_switch_to)
21939 .prologue
21940 alloc r16=ar.pfs,1,0,0,0
21941 DO_SAVE_SWITCH_STACK
21942 @@ -235,7 +235,7 @@
21943 ;;
21944 srlz.d
21945 br.cond.sptk .done
21946 -END(ia64_switch_to)
21947 +END(__ia64_switch_to)
21948
21949 /*
21950 * Note that interrupts are enabled during save_switch_stack and load_switch_stack. This
21951 @@ -376,7 +376,7 @@
21952 * - b7 holds address to return to
21953 * - must not touch r8-r11
21954 */
21955 -ENTRY(load_switch_stack)
21956 +GLOBAL_ENTRY(load_switch_stack)
21957 .prologue
21958 .altrp b7
21959
21960 @@ -511,7 +511,7 @@
21961 * because some system calls (such as ia64_execve) directly
21962 * manipulate ar.pfs.
21963 */
21964 -GLOBAL_ENTRY(ia64_trace_syscall)
21965 +GLOBAL_ENTRY(__ia64_trace_syscall)
21966 PT_REGS_UNWIND_INFO(0)
21967 /*
21968 * We need to preserve the scratch registers f6-f11 in case the system
21969 @@ -583,7 +583,7 @@
21970 (p6) mov r10=-1
21971 (p6) mov r8=r9
21972 br.cond.sptk .strace_save_retval
21973 -END(ia64_trace_syscall)
21974 +END(__ia64_trace_syscall)
21975
21976 /*
21977 * When traced and returning from sigreturn, we invoke syscall_trace but then
21978 @@ -602,7 +602,7 @@
21979 .ret4: br.cond.sptk ia64_leave_kernel
21980 END(ia64_strace_leave_kernel)
21981
21982 -GLOBAL_ENTRY(ia64_ret_from_clone)
21983 +GLOBAL_ENTRY(__ia64_ret_from_clone)
21984 PT_REGS_UNWIND_INFO(0)
21985 { /*
21986 * Some versions of gas generate bad unwind info if the first instruction of a
21987 @@ -628,7 +628,7 @@
21988 cmp.ne p6,p0=r2,r0
21989 (p6) br.cond.spnt .strace_check_retval
21990 ;; // added stop bits to prevent r8 dependency
21991 -END(ia64_ret_from_clone)
21992 +END(__ia64_ret_from_clone)
21993 // fall through
21994 GLOBAL_ENTRY(ia64_ret_from_syscall)
21995 PT_REGS_UNWIND_INFO(0)
21996 @@ -636,8 +636,11 @@
21997 adds r2=PT(R8)+16,sp // r2 = &pt_regs.r8
21998 mov r10=r0 // clear error indication in r10
21999 (p7) br.cond.spnt handle_syscall_error // handle potential syscall failure
22000 + ;;
22001 + // don't fall through, ia64_leave_syscall may be #define'd
22002 + br.cond.sptk.few ia64_leave_syscall
22003 + ;;
22004 END(ia64_ret_from_syscall)
22005 - // fall through
22006 /*
22007 * ia64_leave_syscall(): Same as ia64_leave_kernel, except that it doesn't
22008 * need to switch to bank 0 and doesn't restore the scratch registers.
22009 @@ -682,7 +685,7 @@
22010 * ar.csd: cleared
22011 * ar.ssd: cleared
22012 */
22013 -ENTRY(ia64_leave_syscall)
22014 +GLOBAL_ENTRY(__ia64_leave_syscall)
22015 PT_REGS_UNWIND_INFO(0)
22016 /*
22017 * work.need_resched etc. mustn't get changed by this CPU before it returns to
22018 @@ -790,7 +793,7 @@
22019 mov.m ar.ssd=r0 // M2 clear ar.ssd
22020 mov f11=f0 // F clear f11
22021 br.cond.sptk.many rbs_switch // B
22022 -END(ia64_leave_syscall)
22023 +END(__ia64_leave_syscall)
22024
22025 #ifdef CONFIG_IA32_SUPPORT
22026 GLOBAL_ENTRY(ia64_ret_from_ia32_execve)
22027 @@ -802,10 +805,13 @@
22028 st8.spill [r2]=r8 // store return value in slot for r8 and set unat bit
22029 .mem.offset 8,0
22030 st8.spill [r3]=r0 // clear error indication in slot for r10 and set unat bit
22031 + ;;
22032 + // don't fall through, ia64_leave_kernel may be #define'd
22033 + br.cond.sptk.few ia64_leave_kernel
22034 + ;;
22035 END(ia64_ret_from_ia32_execve)
22036 - // fall through
22037 #endif /* CONFIG_IA32_SUPPORT */
22038 -GLOBAL_ENTRY(ia64_leave_kernel)
22039 +GLOBAL_ENTRY(__ia64_leave_kernel)
22040 PT_REGS_UNWIND_INFO(0)
22041 /*
22042 * work.need_resched etc. mustn't get changed by this CPU before it returns to
22043 @@ -1150,7 +1156,7 @@
22044 ld8 r10=[r3]
22045 br.cond.sptk.many .work_processed_syscall // re-check
22046
22047 -END(ia64_leave_kernel)
22048 +END(__ia64_leave_kernel)
22049
22050 ENTRY(handle_syscall_error)
22051 /*
22052 @@ -1190,7 +1196,7 @@
22053 * be set up by the caller. We declare 8 input registers so the system call
22054 * args get preserved, in case we need to restart a system call.
22055 */
22056 -ENTRY(notify_resume_user)
22057 +GLOBAL_ENTRY(notify_resume_user)
22058 .prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(8)
22059 alloc loc1=ar.pfs,8,2,3,0 // preserve all eight input regs in case of syscall restart!
22060 mov r9=ar.unat
22061 @@ -1278,7 +1284,7 @@
22062 adds sp=16,sp
22063 ;;
22064 ld8 r9=[sp] // load new ar.unat
22065 - mov.sptk b7=r8,ia64_leave_kernel
22066 + mov.sptk b7=r8,__ia64_leave_kernel
22067 ;;
22068 mov ar.unat=r9
22069 br.many b7
22070 diff -Nur linux-2.6.16.33-noxen/arch/ia64/kernel/gate.S linux-2.6.16.33/arch/ia64/kernel/gate.S
22071 --- linux-2.6.16.33-noxen/arch/ia64/kernel/gate.S 2006-11-22 18:06:31.000000000 +0000
22072 +++ linux-2.6.16.33/arch/ia64/kernel/gate.S 2007-01-08 15:00:45.000000000 +0000
22073 @@ -14,6 +14,9 @@
22074 #include <asm/sigcontext.h>
22075 #include <asm/system.h>
22076 #include <asm/unistd.h>
22077 +#ifdef CONFIG_XEN_IA64_VDSO_PARAVIRT
22078 +# include <asm/privop.h>
22079 +#endif
22080
22081 /*
22082 * We can't easily refer to symbols inside the kernel. To avoid full runtime relocation,
22083 @@ -33,6 +36,52 @@
22084 [1:](pr)brl.cond.sptk 0; \
22085 .xdata4 ".data.patch.brl_fsys_bubble_down", 1b-.
22086
22087 +#ifdef CONFIG_XEN_IA64_VDSO_PARAVIRT
22088 + // The page in which hyperprivop lives must be pinned by ITR.
22089 +	// However, the vDSO area isn't pinned, so issuing a hyperprivop
22090 +	// from the vDSO page causes the trouble that Kevin pointed out:
22091 +	// after clearing vpsr.ic, the vcpu is pre-empted and the itlb
22092 +	// is flushed. When the vcpu gets the cpu again, a tlb miss fault occurs,
22093 +	// but it becomes a nested dtlb fault because vpsr.ic is off.
22094 + // To avoid such a situation, we jump into the kernel text area
22095 + // which is pinned, and then issue hyperprivop and return back
22096 + // to vDSO page.
22097 + // This is Dan Magenheimer's idea.
22098 +
22099 + // Currently is_running_on_xen() is defined as running_on_xen.
22100 +	// If is_running_on_xen() ever becomes a real function, this must be
22101 +	// updated accordingly.
22102 + .section ".data.patch.running_on_xen", "a"
22103 + .previous
22104 +#define LOAD_RUNNING_ON_XEN(reg) \
22105 +[1:] movl reg=0; \
22106 + .xdata4 ".data.patch.running_on_xen", 1b-.
22107 +
22108 + .section ".data.patch.brl_xen_rsm_be_i", "a"
22109 + .previous
22110 +#define BRL_COND_XEN_RSM_BE_I(pr) \
22111 +[1:](pr)brl.cond.sptk 0; \
22112 + .xdata4 ".data.patch.brl_xen_rsm_be_i", 1b-.
22113 +
22114 + .section ".data.patch.brl_xen_get_psr", "a"
22115 + .previous
22116 +#define BRL_COND_XEN_GET_PSR(pr) \
22117 +[1:](pr)brl.cond.sptk 0; \
22118 + .xdata4 ".data.patch.brl_xen_get_psr", 1b-.
22119 +
22120 + .section ".data.patch.brl_xen_ssm_i_0", "a"
22121 + .previous
22122 +#define BRL_COND_XEN_SSM_I_0(pr) \
22123 +[1:](pr)brl.cond.sptk 0; \
22124 + .xdata4 ".data.patch.brl_xen_ssm_i_0", 1b-.
22125 +
22126 + .section ".data.patch.brl_xen_ssm_i_1", "a"
22127 + .previous
22128 +#define BRL_COND_XEN_SSM_I_1(pr) \
22129 +[1:](pr)brl.cond.sptk 0; \
22130 + .xdata4 ".data.patch.brl_xen_ssm_i_1", 1b-.
22131 +#endif
22132 +
22133 GLOBAL_ENTRY(__kernel_syscall_via_break)
22134 .prologue
22135 .altrp b6
22136 @@ -77,7 +126,42 @@
22137 epc // B causes split-issue
22138 }
22139 ;;
22140 +#ifdef CONFIG_XEN_IA64_VDSO_PARAVIRT
22141 + // r20 = 1
22142 + // r22 = &vcpu->vcpu_info->evtchn_upcall_mask
22143 + // r23 = &vpsr.ic
22144 + // r24 = &vcpu->vcpu_info->evtchn_upcall_pending
22145 + // r25 = tmp
22146 + // r28 = &running_on_xen
22147 + // r30 = running_on_xen
22148 + // r31 = tmp
22149 + // p11 = tmp
22150 + // p12 = running_on_xen
22151 + // p13 = !running_on_xen
22152 + // p14 = tmp
22153 + // p15 = tmp
22154 +#define isXen p12
22155 +#define isRaw p13
22156 + LOAD_RUNNING_ON_XEN(r28)
22157 + movl r22=XSI_PSR_I_ADDR
22158 + ;;
22159 + ld8 r22=[r22]
22160 + ;;
22161 + movl r23=XSI_PSR_IC
22162 + adds r24=-1,r22
22163 + mov r20=1
22164 + ;;
22165 + ld4 r30=[r28]
22166 + ;;
22167 + cmp.ne isXen,isRaw=r0,r30
22168 + ;;
22169 +(isRaw) rsm psr.be | psr.i
22170 + BRL_COND_XEN_RSM_BE_I(isXen)
22171 + .global .vdso_rsm_be_i_ret
22172 +.vdso_rsm_be_i_ret:
22173 +#else
22174 rsm psr.be | psr.i // M2 (5 cyc to srlz.d)
22175 +#endif
22176 LOAD_FSYSCALL_TABLE(r14) // X
22177 ;;
22178 mov r16=IA64_KR(CURRENT) // M2 (12 cyc)
22179 @@ -85,7 +169,14 @@
22180 mov r19=NR_syscalls-1 // A
22181 ;;
22182 lfetch [r18] // M0|1
22183 +#ifdef CONFIG_XEN_IA64_VDSO_PARAVIRT
22184 +(isRaw) mov r29=psr
22185 + BRL_COND_XEN_GET_PSR(isXen)
22186 + .global .vdso_get_psr_ret
22187 +.vdso_get_psr_ret:
22188 +#else
22189 mov r29=psr // M2 (12 cyc)
22190 +#endif
22191 // If r17 is a NaT, p6 will be zero
22192 cmp.geu p6,p7=r19,r17 // A (sysnr > 0 && sysnr < 1024+NR_syscalls)?
22193 ;;
22194 @@ -99,9 +190,21 @@
22195 ;;
22196 nop.m 0
22197 (p6) tbit.z.unc p8,p0=r18,0 // I0 (dual-issues with "mov b7=r18"!)
22198 +#ifdef CONFIG_XEN_IA64_VDSO_PARAVIRT
22199 + ;;
22200 + // p14 = running_on_xen && p8
22201 + // p15 = !running_on_xen && p8
22202 +(p8) cmp.ne.unc p14,p15=r0,r30
22203 + ;;
22204 +(p15) ssm psr.i
22205 + BRL_COND_XEN_SSM_I_0(p14)
22206 + .global .vdso_ssm_i_0_ret
22207 +.vdso_ssm_i_0_ret:
22208 +#else
22209 nop.i 0
22210 ;;
22211 (p8) ssm psr.i
22212 +#endif
22213 (p6) mov b7=r18 // I0
22214 (p8) br.dptk.many b7 // B
22215
22216 @@ -122,9 +225,21 @@
22217 #else
22218 BRL_COND_FSYS_BUBBLE_DOWN(p6)
22219 #endif
22220 +#ifdef CONFIG_XEN_IA64_VDSO_PARAVIRT
22221 +(isRaw) ssm psr.i
22222 + BRL_COND_XEN_SSM_I_1(isXen)
22223 + .global .vdso_ssm_i_1_ret
22224 +.vdso_ssm_i_1_ret:
22225 +#else
22226 ssm psr.i
22227 +#endif
22228 mov r10=-1
22229 (p10) mov r8=EINVAL
22230 +#ifdef CONFIG_XEN_IA64_VDSO_PARAVIRT
22231 + dv_serialize_data // shut up gas warning.
22232 + // we know xen_hyper_ssm_i_0 or xen_hyper_ssm_i_1
22233 + // doesn't change p9 and p10
22234 +#endif
22235 (p9) mov r8=ENOSYS
22236 FSYS_RETURN
22237 END(__kernel_syscall_via_epc)
22238 diff -Nur linux-2.6.16.33-noxen/arch/ia64/kernel/gate.lds.S linux-2.6.16.33/arch/ia64/kernel/gate.lds.S
22239 --- linux-2.6.16.33-noxen/arch/ia64/kernel/gate.lds.S 2006-11-22 18:06:31.000000000 +0000
22240 +++ linux-2.6.16.33/arch/ia64/kernel/gate.lds.S 2007-01-08 15:00:45.000000000 +0000
22241 @@ -13,6 +13,7 @@
22242 . = GATE_ADDR + SIZEOF_HEADERS;
22243
22244 .hash : { *(.hash) } :readable
22245 + .gnu.hash : { *(.gnu.hash) }
22246 .dynsym : { *(.dynsym) }
22247 .dynstr : { *(.dynstr) }
22248 .gnu.version : { *(.gnu.version) }
22249 @@ -43,6 +44,28 @@
22250 __start_gate_brl_fsys_bubble_down_patchlist = .;
22251 *(.data.patch.brl_fsys_bubble_down)
22252 __end_gate_brl_fsys_bubble_down_patchlist = .;
22253 +
22254 +#ifdef CONFIG_XEN_IA64_VDSO_PARAVIRT
22255 + __start_gate_running_on_xen_patchlist = .;
22256 + *(.data.patch.running_on_xen)
22257 + __end_gate_running_on_xen_patchlist = .;
22258 +
22259 + __start_gate_brl_xen_rsm_be_i_patchlist = .;
22260 + *(.data.patch.brl_xen_rsm_be_i)
22261 + __end_gate_brl_xen_rsm_be_i_patchlist = .;
22262 +
22263 + __start_gate_brl_xen_get_psr_patchlist = .;
22264 + *(.data.patch.brl_xen_get_psr)
22265 + __end_gate_brl_xen_get_psr_patchlist = .;
22266 +
22267 + __start_gate_brl_xen_ssm_i_0_patchlist = .;
22268 + *(.data.patch.brl_xen_ssm_i_0)
22269 + __end_gate_brl_xen_ssm_i_0_patchlist = .;
22270 +
22271 + __start_gate_brl_xen_ssm_i_1_patchlist = .;
22272 + *(.data.patch.brl_xen_ssm_i_1)
22273 + __end_gate_brl_xen_ssm_i_1_patchlist = .;
22274 +#endif
22275 } :readable
22276 .IA_64.unwind_info : { *(.IA_64.unwind_info*) }
22277 .IA_64.unwind : { *(.IA_64.unwind*) } :readable :unwind
22278 diff -Nur linux-2.6.16.33-noxen/arch/ia64/kernel/head.S linux-2.6.16.33/arch/ia64/kernel/head.S
22279 --- linux-2.6.16.33-noxen/arch/ia64/kernel/head.S 2006-11-22 18:06:31.000000000 +0000
22280 +++ linux-2.6.16.33/arch/ia64/kernel/head.S 2007-01-08 15:00:45.000000000 +0000
22281 @@ -363,6 +363,12 @@
22282 ;;
22283 (isBP) st8 [r2]=r28 // save the address of the boot param area passed by the bootloader
22284
22285 +#ifdef CONFIG_XEN
22286 + // Note: isBP is used by the subprogram.
22287 + br.call.sptk.many rp=early_xen_setup
22288 + ;;
22289 +#endif
22290 +
22291 #ifdef CONFIG_SMP
22292 (isAP) br.call.sptk.many rp=start_secondary
22293 .ret0:
22294 diff -Nur linux-2.6.16.33-noxen/arch/ia64/kernel/iosapic.c linux-2.6.16.33/arch/ia64/kernel/iosapic.c
22295 --- linux-2.6.16.33-noxen/arch/ia64/kernel/iosapic.c 2006-11-22 18:06:31.000000000 +0000
22296 +++ linux-2.6.16.33/arch/ia64/kernel/iosapic.c 2007-01-08 15:00:45.000000000 +0000
22297 @@ -140,6 +140,75 @@
22298 static int iosapic_kmalloc_ok;
22299 static LIST_HEAD(free_rte_list);
22300
22301 +#ifdef CONFIG_XEN
22302 +#include <xen/interface/xen.h>
22303 +#include <xen/interface/physdev.h>
22304 +#include <asm/hypervisor.h>
22305 +static inline unsigned int xen_iosapic_read(char __iomem *iosapic, unsigned int reg)
22306 +{
22307 + struct physdev_apic apic_op;
22308 + int ret;
22309 +
22310 + apic_op.apic_physbase = (unsigned long)iosapic -
22311 + __IA64_UNCACHED_OFFSET;
22312 + apic_op.reg = reg;
22313 + ret = HYPERVISOR_physdev_op(PHYSDEVOP_apic_read, &apic_op);
22314 + if (ret)
22315 + return ret;
22316 + return apic_op.value;
22317 +}
22318 +
22319 +static inline void xen_iosapic_write(char __iomem *iosapic, unsigned int reg, u32 val)
22320 +{
22321 + struct physdev_apic apic_op;
22322 +
22323 + apic_op.apic_physbase = (unsigned long)iosapic -
22324 + __IA64_UNCACHED_OFFSET;
22325 + apic_op.reg = reg;
22326 + apic_op.value = val;
22327 + HYPERVISOR_physdev_op(PHYSDEVOP_apic_write, &apic_op);
22328 +}
22329 +
22330 +static inline unsigned int iosapic_read(char __iomem *iosapic, unsigned int reg)
22331 +{
22332 + if (!is_running_on_xen()) {
22333 + writel(reg, iosapic + IOSAPIC_REG_SELECT);
22334 + return readl(iosapic + IOSAPIC_WINDOW);
22335 + } else
22336 + return xen_iosapic_read(iosapic, reg);
22337 +}
22338 +
22339 +static inline void iosapic_write(char __iomem *iosapic, unsigned int reg, u32 val)
22340 +{
22341 + if (!is_running_on_xen()) {
22342 + writel(reg, iosapic + IOSAPIC_REG_SELECT);
22343 + writel(val, iosapic + IOSAPIC_WINDOW);
22344 + } else
22345 + xen_iosapic_write(iosapic, reg, val);
22346 +}
22347 +
22348 +int xen_assign_irq_vector(int irq)
22349 +{
22350 + struct physdev_irq irq_op;
22351 +
22352 + irq_op.irq = irq;
22353 + if (HYPERVISOR_physdev_op(PHYSDEVOP_alloc_irq_vector, &irq_op))
22354 + return -ENOSPC;
22355 +
22356 + return irq_op.vector;
22357 +}
22358 +
22359 +void xen_free_irq_vector(int vector)
22360 +{
22361 + struct physdev_irq irq_op;
22362 +
22363 + irq_op.vector = vector;
22364 + if (HYPERVISOR_physdev_op(PHYSDEVOP_free_irq_vector, &irq_op))
22365 +		printk(KERN_WARNING "%s: xen_free_irq_vector failed, vector=%d\n",
22366 + __FUNCTION__, vector);
22367 +}
22368 +#endif /* XEN */
22369 +
22370 /*
22371 * Find an IOSAPIC associated with a GSI
22372 */
22373 @@ -611,6 +680,9 @@
22374 iosapic_intr_info[vector].dmode = delivery;
22375 iosapic_intr_info[vector].trigger = trigger;
22376
22377 + if (is_running_on_xen())
22378 + return 0;
22379 +
22380 if (trigger == IOSAPIC_EDGE)
22381 irq_type = &irq_type_iosapic_edge;
22382 else
22383 @@ -953,6 +1025,9 @@
22384 }
22385
22386 pcat_compat = system_pcat_compat;
22387 + if (is_running_on_xen())
22388 + return;
22389 +
22390 if (pcat_compat) {
22391 /*
22392 * Disable the compatibility mode interrupts (8259 style), needs IN/OUT support
22393 diff -Nur linux-2.6.16.33-noxen/arch/ia64/kernel/irq_ia64.c linux-2.6.16.33/arch/ia64/kernel/irq_ia64.c
22394 --- linux-2.6.16.33-noxen/arch/ia64/kernel/irq_ia64.c 2006-11-22 18:06:31.000000000 +0000
22395 +++ linux-2.6.16.33/arch/ia64/kernel/irq_ia64.c 2007-01-08 15:00:45.000000000 +0000
22396 @@ -31,6 +31,9 @@
22397 #include <linux/smp_lock.h>
22398 #include <linux/threads.h>
22399 #include <linux/bitops.h>
22400 +#ifdef CONFIG_XEN
22401 +#include <linux/cpu.h>
22402 +#endif
22403
22404 #include <asm/delay.h>
22405 #include <asm/intrinsics.h>
22406 @@ -66,6 +69,13 @@
22407 assign_irq_vector (int irq)
22408 {
22409 int pos, vector;
22410 +
22411 +#ifdef CONFIG_XEN
22412 + if (is_running_on_xen()) {
22413 + extern int xen_assign_irq_vector(int);
22414 + return xen_assign_irq_vector(irq);
22415 + }
22416 +#endif
22417 again:
22418 pos = find_first_zero_bit(ia64_vector_mask, IA64_NUM_DEVICE_VECTORS);
22419 vector = IA64_FIRST_DEVICE_VECTOR + pos;
22420 @@ -84,6 +94,13 @@
22421 if (vector < IA64_FIRST_DEVICE_VECTOR || vector > IA64_LAST_DEVICE_VECTOR)
22422 return;
22423
22424 +#ifdef CONFIG_XEN
22425 + if (is_running_on_xen()) {
22426 + extern void xen_free_irq_vector(int);
22427 + xen_free_irq_vector(vector);
22428 + return;
22429 + }
22430 +#endif
22431 pos = vector - IA64_FIRST_DEVICE_VECTOR;
22432 if (!test_and_clear_bit(pos, ia64_vector_mask))
22433 printk(KERN_WARNING "%s: double free!\n", __FUNCTION__);
22434 @@ -224,12 +241,264 @@
22435 };
22436 #endif
22437
22438 +#ifdef CONFIG_XEN
22439 +#include <xen/evtchn.h>
22440 +#include <xen/interface/callback.h>
22441 +
22442 +static DEFINE_PER_CPU(int, timer_irq) = -1;
22443 +static DEFINE_PER_CPU(int, ipi_irq) = -1;
22444 +static DEFINE_PER_CPU(int, resched_irq) = -1;
22445 +static DEFINE_PER_CPU(int, cmc_irq) = -1;
22446 +static DEFINE_PER_CPU(int, cmcp_irq) = -1;
22447 +static DEFINE_PER_CPU(int, cpep_irq) = -1;
22448 +static char timer_name[NR_CPUS][15];
22449 +static char ipi_name[NR_CPUS][15];
22450 +static char resched_name[NR_CPUS][15];
22451 +static char cmc_name[NR_CPUS][15];
22452 +static char cmcp_name[NR_CPUS][15];
22453 +static char cpep_name[NR_CPUS][15];
22454 +
22455 +struct saved_irq {
22456 + unsigned int irq;
22457 + struct irqaction *action;
22458 +};
22459 +/* 16 should be a comfortably large value, since only a few percpu irqs
22460 + * are registered early.
22461 + */
22462 +#define MAX_LATE_IRQ 16
22463 +static struct saved_irq saved_percpu_irqs[MAX_LATE_IRQ];
22464 +static unsigned short late_irq_cnt = 0;
22465 +static unsigned short saved_irq_cnt = 0;
22466 +static int xen_slab_ready = 0;
22467 +
22468 +#ifdef CONFIG_SMP
22469 +/* Dummy stub. Though we could check RESCHEDULE_VECTOR before __do_IRQ,
22470 + * doing so ends up issuing several memory accesses on percpu data and
22471 + * thus adds unnecessary traffic to other paths.
22472 + */
22473 +static irqreturn_t
22474 +handle_reschedule(int irq, void *dev_id, struct pt_regs *regs)
22475 +{
22476 +
22477 + return IRQ_HANDLED;
22478 +}
22479 +
22480 +static struct irqaction resched_irqaction = {
22481 + .handler = handle_reschedule,
22482 + .flags = SA_INTERRUPT,
22483 + .name = "RESCHED"
22484 +};
22485 +#endif
22486 +
22487 +/*
22488 + * This is the Xen version of percpu irq registration, which needs to bind
22489 + * to the Xen-specific evtchn sub-system. One trick here is that the Xen
22490 + * evtchn binding interface depends on kmalloc, because the related
22491 + * port needs to be freed at device/cpu down. So we cache the
22492 + * registrations made on the BSP before slab is ready and deal with
22493 + * them later. Registrations that happen after slab is ready are
22494 + * hooked up to Xen evtchn immediately.
22495 + *
22496 + * FIXME: MCA is not supported so far, and thus the "nomca" boot param is
22497 + * required.
22498 + */
22499 +static void
22500 +xen_register_percpu_irq (unsigned int irq, struct irqaction *action, int save)
22501 +{
22502 + unsigned int cpu = smp_processor_id();
22503 + int ret = 0;
22504 +
22505 + if (xen_slab_ready) {
22506 + switch (irq) {
22507 + case IA64_TIMER_VECTOR:
22508 + sprintf(timer_name[cpu], "%s%d", action->name, cpu);
22509 + ret = bind_virq_to_irqhandler(VIRQ_ITC, cpu,
22510 + action->handler, action->flags,
22511 + timer_name[cpu], action->dev_id);
22512 + per_cpu(timer_irq,cpu) = ret;
22513 + printk(KERN_INFO "register VIRQ_ITC (%s) to xen irq (%d)\n", timer_name[cpu], ret);
22514 + break;
22515 + case IA64_IPI_RESCHEDULE:
22516 + sprintf(resched_name[cpu], "%s%d", action->name, cpu);
22517 + ret = bind_ipi_to_irqhandler(RESCHEDULE_VECTOR, cpu,
22518 + action->handler, action->flags,
22519 + resched_name[cpu], action->dev_id);
22520 + per_cpu(resched_irq,cpu) = ret;
22521 + printk(KERN_INFO "register RESCHEDULE_VECTOR (%s) to xen irq (%d)\n", resched_name[cpu], ret);
22522 + break;
22523 + case IA64_IPI_VECTOR:
22524 + sprintf(ipi_name[cpu], "%s%d", action->name, cpu);
22525 + ret = bind_ipi_to_irqhandler(IPI_VECTOR, cpu,
22526 + action->handler, action->flags,
22527 + ipi_name[cpu], action->dev_id);
22528 + per_cpu(ipi_irq,cpu) = ret;
22529 + printk(KERN_INFO "register IPI_VECTOR (%s) to xen irq (%d)\n", ipi_name[cpu], ret);
22530 + break;
22531 + case IA64_SPURIOUS_INT_VECTOR:
22532 + break;
22533 + case IA64_CMC_VECTOR:
22534 + sprintf(cmc_name[cpu], "%s%d", action->name, cpu);
22535 + ret = bind_virq_to_irqhandler(VIRQ_MCA_CMC, cpu,
22536 + action->handler,
22537 + action->flags,
22538 + cmc_name[cpu],
22539 + action->dev_id);
22540 + per_cpu(cmc_irq,cpu) = ret;
22541 + printk(KERN_INFO "register VIRQ_MCA_CMC (%s) to xen "
22542 + "irq (%d)\n", cmc_name[cpu], ret);
22543 + break;
22544 + case IA64_CMCP_VECTOR:
22545 + sprintf(cmcp_name[cpu], "%s%d", action->name, cpu);
22546 + ret = bind_ipi_to_irqhandler(CMCP_VECTOR, cpu,
22547 + action->handler,
22548 + action->flags,
22549 + cmcp_name[cpu],
22550 + action->dev_id);
22551 + per_cpu(cmcp_irq,cpu) = ret;
22552 + printk(KERN_INFO "register CMCP_VECTOR (%s) to xen "
22553 + "irq (%d)\n", cmcp_name[cpu], ret);
22554 + break;
22555 + case IA64_CPEP_VECTOR:
22556 + sprintf(cpep_name[cpu], "%s%d", action->name, cpu);
22557 + ret = bind_ipi_to_irqhandler(CPEP_VECTOR, cpu,
22558 + action->handler,
22559 + action->flags,
22560 + cpep_name[cpu],
22561 + action->dev_id);
22562 + per_cpu(cpep_irq,cpu) = ret;
22563 + printk(KERN_INFO "register CPEP_VECTOR (%s) to xen "
22564 + "irq (%d)\n", cpep_name[cpu], ret);
22565 + break;
22566 + case IA64_CPE_VECTOR:
22567 + printk(KERN_WARNING "register IA64_CPE_VECTOR "
22568 + "IGNORED\n");
22569 + break;
22570 + default:
22571 + printk(KERN_WARNING "Percpu irq %d is unsupported by xen!\n", irq);
22572 + break;
22573 + }
22574 + BUG_ON(ret < 0);
22575 + }
22576 +
22577 + /* For BSP, we cache registered percpu irqs, and then re-walk
22578 + * them when initializing APs
22579 + */
22580 + if (!cpu && save) {
22581 + BUG_ON(saved_irq_cnt == MAX_LATE_IRQ);
22582 + saved_percpu_irqs[saved_irq_cnt].irq = irq;
22583 + saved_percpu_irqs[saved_irq_cnt].action = action;
22584 + saved_irq_cnt++;
22585 + if (!xen_slab_ready)
22586 + late_irq_cnt++;
22587 + }
22588 +}
22589 +
22590 +static void
22591 +xen_bind_early_percpu_irq (void)
22592 +{
22593 + int i;
22594 +
22595 + xen_slab_ready = 1;
22596 + /* There's no race when accessing this cached array, since only
22597 +	 * the BSP goes through this step, and only briefly at boot
22598 + */
22599 + for (i = 0; i < late_irq_cnt; i++)
22600 + xen_register_percpu_irq(saved_percpu_irqs[i].irq,
22601 + saved_percpu_irqs[i].action, 0);
22602 +}
22603 +
22604 +/* FIXME: There's no obvious point at which to check whether slab is ready, so
22605 + * we use a hack here and piggyback on a late time-init hook.
22606 + */
22607 +extern void (*late_time_init)(void);
22608 +extern char xen_event_callback;
22609 +extern void xen_init_IRQ(void);
22610 +
22611 +#ifdef CONFIG_HOTPLUG_CPU
22612 +static int __devinit
22613 +unbind_evtchn_callback(struct notifier_block *nfb,
22614 + unsigned long action, void *hcpu)
22615 +{
22616 + unsigned int cpu = (unsigned long)hcpu;
22617 +
22618 + if (action == CPU_DEAD) {
22619 + /* Unregister evtchn. */
22620 + if (per_cpu(cpep_irq,cpu) >= 0) {
22621 + unbind_from_irqhandler(per_cpu(cpep_irq, cpu), NULL);
22622 + per_cpu(cpep_irq, cpu) = -1;
22623 + }
22624 + if (per_cpu(cmcp_irq,cpu) >= 0) {
22625 + unbind_from_irqhandler(per_cpu(cmcp_irq, cpu), NULL);
22626 + per_cpu(cmcp_irq, cpu) = -1;
22627 + }
22628 + if (per_cpu(cmc_irq,cpu) >= 0) {
22629 + unbind_from_irqhandler(per_cpu(cmc_irq, cpu), NULL);
22630 + per_cpu(cmc_irq, cpu) = -1;
22631 + }
22632 + if (per_cpu(ipi_irq,cpu) >= 0) {
22633 + unbind_from_irqhandler (per_cpu(ipi_irq, cpu), NULL);
22634 + per_cpu(ipi_irq, cpu) = -1;
22635 + }
22636 + if (per_cpu(resched_irq,cpu) >= 0) {
22637 + unbind_from_irqhandler (per_cpu(resched_irq, cpu),
22638 + NULL);
22639 + per_cpu(resched_irq, cpu) = -1;
22640 + }
22641 + if (per_cpu(timer_irq,cpu) >= 0) {
22642 + unbind_from_irqhandler (per_cpu(timer_irq, cpu), NULL);
22643 + per_cpu(timer_irq, cpu) = -1;
22644 + }
22645 + }
22646 + return NOTIFY_OK;
22647 +}
22648 +
22649 +static struct notifier_block unbind_evtchn_notifier = {
22650 + .notifier_call = unbind_evtchn_callback,
22651 + .priority = 0
22652 +};
22653 +#endif
22654 +
22655 +DECLARE_PER_CPU(int, ipi_to_irq[NR_IPIS]);
22656 +void xen_smp_intr_init(void)
22657 +{
22658 +#ifdef CONFIG_SMP
22659 + unsigned int cpu = smp_processor_id();
22660 + unsigned int i = 0;
22661 + struct callback_register event = {
22662 + .type = CALLBACKTYPE_event,
22663 + .address = (unsigned long)&xen_event_callback,
22664 + };
22665 +
22666 + if (cpu == 0) {
22667 + /* Initialization was already done for boot cpu. */
22668 +#ifdef CONFIG_HOTPLUG_CPU
22669 + /* Register the notifier only once. */
22670 + register_cpu_notifier(&unbind_evtchn_notifier);
22671 +#endif
22672 + return;
22673 + }
22674 +
22675 +	/* This should piggyback on setting up the vcpu guest context */
22676 + BUG_ON(HYPERVISOR_callback_op(CALLBACKOP_register, &event));
22677 +
22678 + for (i = 0; i < saved_irq_cnt; i++)
22679 + xen_register_percpu_irq(saved_percpu_irqs[i].irq,
22680 + saved_percpu_irqs[i].action, 0);
22681 +#endif /* CONFIG_SMP */
22682 +}
22683 +#endif /* CONFIG_XEN */
22684 +
22685 void
22686 register_percpu_irq (ia64_vector vec, struct irqaction *action)
22687 {
22688 irq_desc_t *desc;
22689 unsigned int irq;
22690
22691 +#ifdef CONFIG_XEN
22692 + if (is_running_on_xen())
22693 + return xen_register_percpu_irq(vec, action, 1);
22694 +#endif
22695 +
22696 for (irq = 0; irq < NR_IRQS; ++irq)
22697 if (irq_to_vector(irq) == vec) {
22698 desc = irq_descp(irq);
22699 @@ -243,6 +512,21 @@
22700 void __init
22701 init_IRQ (void)
22702 {
22703 +#ifdef CONFIG_XEN
22704 + /* Maybe put into platform_irq_init later */
22705 + if (is_running_on_xen()) {
22706 + struct callback_register event = {
22707 + .type = CALLBACKTYPE_event,
22708 + .address = (unsigned long)&xen_event_callback,
22709 + };
22710 + xen_init_IRQ();
22711 + BUG_ON(HYPERVISOR_callback_op(CALLBACKOP_register, &event));
22712 + late_time_init = xen_bind_early_percpu_irq;
22713 +#ifdef CONFIG_SMP
22714 + register_percpu_irq(IA64_IPI_RESCHEDULE, &resched_irqaction);
22715 +#endif /* CONFIG_SMP */
22716 + }
22717 +#endif /* CONFIG_XEN */
22718 register_percpu_irq(IA64_SPURIOUS_INT_VECTOR, NULL);
22719 #ifdef CONFIG_SMP
22720 register_percpu_irq(IA64_IPI_VECTOR, &ipi_irqaction);
22721 @@ -260,6 +544,45 @@
22722 unsigned long ipi_data;
22723 unsigned long phys_cpu_id;
22724
22725 +#ifdef CONFIG_XEN
22726 + if (is_running_on_xen()) {
22727 + int irq = -1;
22728 +
22729 +#ifdef CONFIG_SMP
22730 + /* TODO: we need to call vcpu_up here */
22731 + if (unlikely(vector == ap_wakeup_vector)) {
22732 + extern void xen_send_ipi (int cpu, int vec);
22733 + xen_send_ipi (cpu, vector);
22734 + //vcpu_prepare_and_up(cpu);
22735 + return;
22736 + }
22737 +#endif
22738 +
22739 + switch(vector) {
22740 + case IA64_IPI_VECTOR:
22741 + irq = per_cpu(ipi_to_irq, cpu)[IPI_VECTOR];
22742 + break;
22743 + case IA64_IPI_RESCHEDULE:
22744 + irq = per_cpu(ipi_to_irq, cpu)[RESCHEDULE_VECTOR];
22745 + break;
22746 + case IA64_CMCP_VECTOR:
22747 + irq = per_cpu(ipi_to_irq, cpu)[CMCP_VECTOR];
22748 + break;
22749 + case IA64_CPEP_VECTOR:
22750 + irq = per_cpu(ipi_to_irq, cpu)[CPEP_VECTOR];
22751 + break;
22752 + default:
22753 + printk(KERN_WARNING"Unsupported IPI type 0x%x\n", vector);
22754 + irq = 0;
22755 + break;
22756 + }
22757 +
22758 + BUG_ON(irq < 0);
22759 + notify_remote_via_irq(irq);
22760 + return;
22761 + }
22762 +#endif /* CONFIG_XEN */
22763 +
22764 #ifdef CONFIG_SMP
22765 phys_cpu_id = cpu_physical_id(cpu);
22766 #else
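
The boot-time handling in xen_register_percpu_irq() above is essentially a small deferred-registration queue: bindings requested before the slab allocator is ready are cached and replayed once the late_time_init hook fires. A stripped-down sketch of that pattern, assuming the kernel's struct irqaction (helper names are hypothetical):

#define MAX_DEFERRED 16

static struct {
	unsigned int irq;
	struct irqaction *action;
} deferred[MAX_DEFERRED];
static unsigned short nr_deferred;

static void defer_or_bind(unsigned int irq, struct irqaction *action,
			  int allocator_ready,
			  void (*bind)(unsigned int, struct irqaction *))
{
	if (allocator_ready) {
		bind(irq, action);		/* evtchn binding can kmalloc now */
		return;
	}
	BUG_ON(nr_deferred == MAX_DEFERRED);
	deferred[nr_deferred].irq = irq;	/* remember for the later replay */
	deferred[nr_deferred].action = action;
	nr_deferred++;
}

static void replay_deferred(void (*bind)(unsigned int, struct irqaction *))
{
	unsigned short i;

	for (i = 0; i < nr_deferred; i++)
		bind(deferred[i].irq, deferred[i].action);
}
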
22767 diff -Nur linux-2.6.16.33-noxen/arch/ia64/kernel/pal.S linux-2.6.16.33/arch/ia64/kernel/pal.S
22768 --- linux-2.6.16.33-noxen/arch/ia64/kernel/pal.S 2006-11-22 18:06:31.000000000 +0000
22769 +++ linux-2.6.16.33/arch/ia64/kernel/pal.S 2007-01-08 15:00:45.000000000 +0000
22770 @@ -16,6 +16,7 @@
22771 #include <asm/processor.h>
22772
22773 .data
22774 + .globl pal_entry_point
22775 pal_entry_point:
22776 data8 ia64_pal_default_handler
22777 .text
22778 @@ -53,7 +54,7 @@
22779 * in4 1 ==> clear psr.ic, 0 ==> don't clear psr.ic
22780 *
22781 */
22782 -GLOBAL_ENTRY(ia64_pal_call_static)
22783 +GLOBAL_ENTRY(__ia64_pal_call_static)
22784 .prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(5)
22785 alloc loc1 = ar.pfs,5,5,0,0
22786 movl loc2 = pal_entry_point
22787 @@ -90,7 +91,7 @@
22788 ;;
22789 srlz.d // seralize restoration of psr.l
22790 br.ret.sptk.many b0
22791 -END(ia64_pal_call_static)
22792 +END(__ia64_pal_call_static)
22793
22794 /*
22795 * Make a PAL call using the stacked registers calling convention.
22796 diff -Nur linux-2.6.16.33-noxen/arch/ia64/kernel/patch.c linux-2.6.16.33/arch/ia64/kernel/patch.c
22797 --- linux-2.6.16.33-noxen/arch/ia64/kernel/patch.c 2006-11-22 18:06:31.000000000 +0000
22798 +++ linux-2.6.16.33/arch/ia64/kernel/patch.c 2007-01-08 15:00:45.000000000 +0000
22799 @@ -184,6 +184,73 @@
22800 ia64_srlz_i();
22801 }
22802
22803 +#ifdef CONFIG_XEN_IA64_VDSO_PARAVIRT
22804 +extern char __start_gate_running_on_xen_patchlist[];
22805 +extern char __end_gate_running_on_xen_patchlist[];
22806 +
22807 +void
22808 +patch_running_on_xen(unsigned long start, unsigned long end)
22809 +{
22810 + extern int running_on_xen;
22811 + s32 *offp = (s32 *)start;
22812 + u64 ip;
22813 +
22814 + while (offp < (s32 *)end) {
22815 + ip = (u64)ia64_imva((char *)offp + *offp);
22816 + ia64_patch_imm64(ip, (u64)&running_on_xen);
22817 + ia64_fc((void *)ip);
22818 + ++offp;
22819 + }
22820 + ia64_sync_i();
22821 + ia64_srlz_i();
22822 +}
22823 +
22824 +static void
22825 +patch_brl_symaddr(unsigned long start, unsigned long end,
22826 + unsigned long symaddr)
22827 +{
22828 + s32 *offp = (s32 *)start;
22829 + u64 ip;
22830 +
22831 + while (offp < (s32 *)end) {
22832 + ip = (u64)offp + *offp;
22833 + ia64_patch_imm60((u64)ia64_imva((void *)ip),
22834 + (u64)(symaddr - (ip & -16)) / 16);
22835 + ia64_fc((void *)ip);
22836 + ++offp;
22837 + }
22838 + ia64_sync_i();
22839 + ia64_srlz_i();
22840 +}
22841 +
22842 +#define EXTERN_PATCHLIST(name) \
22843 + extern char __start_gate_brl_##name##_patchlist[]; \
22844 + extern char __end_gate_brl_##name##_patchlist[]; \
22845 + extern char name[]
22846 +
22847 +#define PATCH_BRL_SYMADDR(name) \
22848 + patch_brl_symaddr((unsigned long)__start_gate_brl_##name##_patchlist, \
22849 + (unsigned long)__end_gate_brl_##name##_patchlist, \
22850 + (unsigned long)name)
22851 +
22852 +static void
22853 +patch_brl_in_vdso(void)
22854 +{
22855 + EXTERN_PATCHLIST(xen_rsm_be_i);
22856 + EXTERN_PATCHLIST(xen_get_psr);
22857 + EXTERN_PATCHLIST(xen_ssm_i_0);
22858 + EXTERN_PATCHLIST(xen_ssm_i_1);
22859 +
22860 + PATCH_BRL_SYMADDR(xen_rsm_be_i);
22861 + PATCH_BRL_SYMADDR(xen_get_psr);
22862 + PATCH_BRL_SYMADDR(xen_ssm_i_0);
22863 + PATCH_BRL_SYMADDR(xen_ssm_i_1);
22864 +}
22865 +#else
22866 +#define patch_running_on_xen(start, end) do { } while (0)
22867 +#define patch_brl_in_vdso() do { } while (0)
22868 +#endif
22869 +
22870 void
22871 ia64_patch_gate (void)
22872 {
22873 @@ -192,6 +259,10 @@
22874
22875 patch_fsyscall_table(START(fsyscall), END(fsyscall));
22876 patch_brl_fsys_bubble_down(START(brl_fsys_bubble_down), END(brl_fsys_bubble_down));
22877 +#ifdef CONFIG_XEN
22878 + patch_running_on_xen(START(running_on_xen), END(running_on_xen));
22879 + patch_brl_in_vdso();
22880 +#endif
22881 ia64_patch_vtop(START(vtop), END(vtop));
22882 ia64_patch_mckinley_e9(START(mckinley_e9), END(mckinley_e9));
22883 }
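
The gate patch lists handled above store each entry as a 32-bit offset relative to the entry's own address, which is why the loops add *offp to offp before patching. A reduced sketch of that walk, assuming the kernel's s32/u64 types (the callback is hypothetical):

/* Visit every target recorded in a self-relative patch list. */
static void walk_patchlist(s32 *start, s32 *end, void (*apply)(u64 ip))
{
	s32 *offp;

	for (offp = start; offp < end; ++offp)
		apply((u64)offp + *offp);	/* entry address + stored offset */
}
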
22884 diff -Nur linux-2.6.16.33-noxen/arch/ia64/kernel/perfmon.c linux-2.6.16.33/arch/ia64/kernel/perfmon.c
22885 --- linux-2.6.16.33-noxen/arch/ia64/kernel/perfmon.c 2006-11-22 18:06:31.000000000 +0000
22886 +++ linux-2.6.16.33/arch/ia64/kernel/perfmon.c 2007-01-08 15:00:45.000000000 +0000
22887 @@ -53,6 +53,28 @@
22888 #include <asm/delay.h>
22889
22890 #ifdef CONFIG_PERFMON
22891 +#ifdef CONFIG_XEN
22892 +//#include <xen/xenoprof.h>
22893 +#include <xen/interface/xenoprof.h>
22894 +
22895 +static int xenoprof_is_primary = 0;
22896 +#define init_xenoprof_primary(is_primary) (xenoprof_is_primary = (is_primary))
22897 +#define is_xenoprof_primary() (xenoprof_is_primary)
22898 +#define XEN_NOT_SUPPORTED_YET \
22899 + do { \
22900 + if (is_running_on_xen()) { \
22901 + printk("%s is not supported yet under xen.\n", \
22902 + __func__); \
22903 + return -ENOSYS; \
22904 + } \
22905 + } while (0)
22906 +#else
22907 +#define init_xenoprof_primary(is_primary) do { } while (0)
22908 +#define is_xenoprof_primary() (0)
22909 +#define XEN_NOT_SUPPORTED_YET do { } while (0)
22910 +#define HYPERVISOR_perfmon_op(cmd, arg, count) do { } while (0)
22911 +#endif
22912 +
22913 /*
22914 * perfmon context state
22915 */
22916 @@ -1515,6 +1537,7 @@
22917 ssize_t ret;
22918 unsigned long flags;
22919 DECLARE_WAITQUEUE(wait, current);
22920 + XEN_NOT_SUPPORTED_YET;
22921 if (PFM_IS_FILE(filp) == 0) {
22922 printk(KERN_ERR "perfmon: pfm_poll: bad magic [%d]\n", current->pid);
22923 return -EINVAL;
22924 @@ -2113,6 +2136,15 @@
22925 */
22926 if (free_possible) pfm_context_free(ctx);
22927
22928 + if (is_running_on_xen()) {
22929 + if (is_xenoprof_primary()) {
22930 + int ret = HYPERVISOR_perfmon_op(PFM_DESTROY_CONTEXT,
22931 + NULL, 0);
22932 + if (ret)
22933 + printk("%s:%d PFM_DESTROY_CONTEXT hypercall "
22934 + "failed\n", __func__, __LINE__);
22935 + }
22936 + }
22937 return 0;
22938 }
22939
22940 @@ -2736,6 +2768,23 @@
22941 */
22942 pfm_reset_pmu_state(ctx);
22943
22944 + if (is_running_on_xen()) {
22945 + /*
22946 + * Kludge to obtain xenoprof.is_primary.
22947 + * XENOPROF_init on ia64 is a no-op, so it is safe to call it here.
22948 + */
22949 + struct xenoprof_init init;
22950 + ret = HYPERVISOR_xenoprof_op(XENOPROF_init, &init);
22951 + if (ret)
22952 + goto buffer_error;
22953 + init_xenoprof_primary(init.is_primary);
22954 +
22955 + if (is_xenoprof_primary()) {
22956 + ret = HYPERVISOR_perfmon_op(PFM_CREATE_CONTEXT, arg, 0);
22957 + if (ret)
22958 + goto buffer_error;
22959 + }
22960 + }
22961 return 0;
22962
22963 buffer_error:
22964 @@ -2872,6 +2921,12 @@
22965 pfm_reg_check_t wr_func;
22966 #define PFM_CHECK_PMC_PM(x, y, z) ((x)->ctx_fl_system ^ PMC_PM(y, z))
22967
22968 + if (is_running_on_xen()) {
22969 + if (is_xenoprof_primary())
22970 + return HYPERVISOR_perfmon_op(PFM_WRITE_PMCS,
22971 + arg, count);
22972 + return 0;
22973 + }
22974 state = ctx->ctx_state;
22975 is_loaded = state == PFM_CTX_LOADED ? 1 : 0;
22976 is_system = ctx->ctx_fl_system;
22977 @@ -3112,6 +3167,12 @@
22978 int ret = -EINVAL;
22979 pfm_reg_check_t wr_func;
22980
22981 + if (is_running_on_xen()) {
22982 + if (is_xenoprof_primary())
22983 + return HYPERVISOR_perfmon_op(PFM_WRITE_PMDS,
22984 + arg, count);
22985 + return 0;
22986 + }
22987
22988 state = ctx->ctx_state;
22989 is_loaded = state == PFM_CTX_LOADED ? 1 : 0;
22990 @@ -3309,6 +3370,7 @@
22991 int is_loaded, is_system, is_counting, expert_mode;
22992 int ret = -EINVAL;
22993 pfm_reg_check_t rd_func;
22994 + XEN_NOT_SUPPORTED_YET;
22995
22996 /*
22997 * access is possible when loaded only for
22998 @@ -3560,6 +3622,7 @@
22999 pfm_ovfl_ctrl_t rst_ctrl;
23000 int state, is_system;
23001 int ret = 0;
23002 + XEN_NOT_SUPPORTED_YET;
23003
23004 state = ctx->ctx_state;
23005 fmt = ctx->ctx_buf_fmt;
23006 @@ -3709,6 +3772,7 @@
23007 pfm_debug(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs)
23008 {
23009 unsigned int m = *(unsigned int *)arg;
23010 + XEN_NOT_SUPPORTED_YET;
23011
23012 pfm_sysctl.debug = m == 0 ? 0 : 1;
23013
23014 @@ -3979,6 +4043,8 @@
23015 {
23016 pfarg_features_t *req = (pfarg_features_t *)arg;
23017
23018 + if (is_running_on_xen())
23019 + return HYPERVISOR_perfmon_op(PFM_GET_FEATURES, &arg, 0);
23020 req->ft_version = PFM_VERSION;
23021 return 0;
23022 }
23023 @@ -3990,6 +4056,12 @@
23024 struct task_struct *task = PFM_CTX_TASK(ctx);
23025 int state, is_system;
23026
23027 + if (is_running_on_xen()) {
23028 + if (is_xenoprof_primary())
23029 + return HYPERVISOR_perfmon_op(PFM_STOP, NULL, 0);
23030 + return 0;
23031 + }
23032 +
23033 state = ctx->ctx_state;
23034 is_system = ctx->ctx_fl_system;
23035
23036 @@ -4078,6 +4150,11 @@
23037 struct pt_regs *tregs;
23038 int state, is_system;
23039
23040 + if (is_running_on_xen()) {
23041 + if (is_xenoprof_primary())
23042 + return HYPERVISOR_perfmon_op(PFM_START, NULL, 0);
23043 + return 0;
23044 + }
23045 state = ctx->ctx_state;
23046 is_system = ctx->ctx_fl_system;
23047
23048 @@ -4160,6 +4237,7 @@
23049 unsigned int cnum;
23050 int i;
23051 int ret = -EINVAL;
23052 + XEN_NOT_SUPPORTED_YET;
23053
23054 for (i = 0; i < count; i++, req++) {
23055
23056 @@ -4218,6 +4296,11 @@
23057 int ret = 0;
23058 int state, is_system, set_dbregs = 0;
23059
23060 + if (is_running_on_xen()) {
23061 + if (is_xenoprof_primary())
23062 + return HYPERVISOR_perfmon_op(PFM_LOAD_CONTEXT, arg, 0);
23063 + return 0;
23064 + }
23065 state = ctx->ctx_state;
23066 is_system = ctx->ctx_fl_system;
23067 /*
23068 @@ -4466,6 +4549,12 @@
23069 int prev_state, is_system;
23070 int ret;
23071
23072 + if (is_running_on_xen()) {
23073 + if (is_xenoprof_primary())
23074 + return HYPERVISOR_perfmon_op(PFM_UNLOAD_CONTEXT,
23075 + NULL, 0);
23076 + return 0;
23077 + }
23078 DPRINT(("ctx_state=%d task [%d]\n", ctx->ctx_state, task ? task->pid : -1));
23079
23080 prev_state = ctx->ctx_state;
23081 diff -Nur linux-2.6.16.33-noxen/arch/ia64/kernel/setup.c linux-2.6.16.33/arch/ia64/kernel/setup.c
23082 --- linux-2.6.16.33-noxen/arch/ia64/kernel/setup.c 2006-11-22 18:06:31.000000000 +0000
23083 +++ linux-2.6.16.33/arch/ia64/kernel/setup.c 2007-01-08 15:00:45.000000000 +0000
23084 @@ -61,6 +61,11 @@
23085 #include <asm/system.h>
23086 #include <asm/unistd.h>
23087 #include <asm/system.h>
23088 +#ifdef CONFIG_XEN
23089 +#include <asm/hypervisor.h>
23090 +#include <asm/xen/xencomm.h>
23091 +#endif
23092 +#include <linux/dma-mapping.h>
23093
23094 #if defined(CONFIG_SMP) && (IA64_CPU_SIZE > PAGE_SIZE)
23095 # error "struct cpuinfo_ia64 too big!"
23096 @@ -71,6 +76,20 @@
23097 EXPORT_SYMBOL(__per_cpu_offset);
23098 #endif
23099
23100 +#ifdef CONFIG_XEN
23101 +static int
23102 +xen_panic_event(struct notifier_block *this, unsigned long event, void *ptr)
23103 +{
23104 + HYPERVISOR_shutdown(SHUTDOWN_crash);
23105 + /* we're never actually going to get here... */
23106 + return NOTIFY_DONE;
23107 +}
23108 +
23109 +static struct notifier_block xen_panic_block = {
23110 + xen_panic_event, NULL, 0 /* try to go last */
23111 +};
23112 +#endif
23113 +
23114 extern void ia64_setup_printk_clock(void);
23115
23116 DEFINE_PER_CPU(struct cpuinfo_ia64, cpu_info);
23117 @@ -243,6 +262,14 @@
23118 rsvd_region[n].end = (unsigned long) ia64_imva(_end);
23119 n++;
23120
23121 +#ifdef CONFIG_XEN
23122 + if (is_running_on_xen()) {
23123 + rsvd_region[n].start = (unsigned long)__va((HYPERVISOR_shared_info->arch.start_info_pfn << PAGE_SHIFT));
23124 + rsvd_region[n].end = rsvd_region[n].start + PAGE_SIZE;
23125 + n++;
23126 + }
23127 +#endif
23128 +
23129 #ifdef CONFIG_BLK_DEV_INITRD
23130 if (ia64_boot_param->initrd_start) {
23131 rsvd_region[n].start = (unsigned long)__va(ia64_boot_param->initrd_start);
23132 @@ -260,6 +287,7 @@
23133 n++;
23134
23135 num_rsvd_regions = n;
23136 + BUG_ON(IA64_MAX_RSVD_REGIONS + 1 < n);
23137
23138 sort_regions(rsvd_region, num_rsvd_regions);
23139 }
23140 @@ -333,6 +361,16 @@
23141 {
23142 int earlycons = 0;
23143
23144 +#ifdef CONFIG_XEN
23145 +#ifndef CONFIG_IA64_HP_SIM
23146 + if (is_running_on_xen()) {
23147 + extern struct console hpsim_cons;
23148 + hpsim_cons.flags |= CON_BOOT;
23149 + register_console(&hpsim_cons);
23150 + earlycons++;
23151 + }
23152 +#endif
23153 +#endif
23154 #ifdef CONFIG_SERIAL_SGI_L1_CONSOLE
23155 {
23156 extern int sn_serial_console_early_setup(void);
23157 @@ -394,6 +432,17 @@
23158 {
23159 unw_init();
23160
23161 +#ifdef CONFIG_XEN
23162 + if (is_running_on_xen()) {
23163 + /* Must be done before any hypercall. */
23164 + xencomm_init();
23165 +
23166 + setup_xen_features();
23167 + /* Register a call for panic conditions. */
23168 + notifier_chain_register(&panic_notifier_list, &xen_panic_block);
23169 + }
23170 +#endif
23171 +
23172 ia64_patch_vtop((u64) __start___vtop_patchlist, (u64) __end___vtop_patchlist);
23173
23174 *cmdline_p = __va(ia64_boot_param->command_line);
23175 @@ -490,7 +539,26 @@
23176 conswitchp = &vga_con;
23177 # endif
23178 }
23179 +#ifdef CONFIG_XEN
23180 + if (is_running_on_xen()) {
23181 + shared_info_t *s = HYPERVISOR_shared_info;
23182 +
23183 + xen_start_info = __va(s->arch.start_info_pfn << PAGE_SHIFT);
23184 +
23185 + printk("Running on Xen! start_info_pfn=0x%lx nr_pages=%ld "
23186 + "flags=0x%x\n", s->arch.start_info_pfn,
23187 + xen_start_info->nr_pages, xen_start_info->flags);
23188 +
23189 + if (!is_initial_xendomain()) {
23190 +#if !defined(CONFIG_VT) || !defined(CONFIG_DUMMY_CONSOLE)
23191 + conswitchp = NULL;
23192 +#endif
23193 + }
23194 + }
23195 + xencons_early_setup();
23196 #endif
23197 +#endif
23198 +
23199
23200 /* enable IA-64 Machine Check Abort Handling unless disabled */
23201 if (!strstr(saved_command_line, "nomca"))
23202 @@ -498,6 +566,9 @@
23203
23204 platform_setup(cmdline_p);
23205 paging_init();
23206 +#ifdef CONFIG_XEN
23207 + contiguous_bitmap_init(max_pfn);
23208 +#endif
23209 }
23210
23211 /*
23212 @@ -882,6 +953,15 @@
23213 /* size of physical stacked register partition plus 8 bytes: */
23214 __get_cpu_var(ia64_phys_stacked_size_p8) = num_phys_stacked*8 + 8;
23215 platform_cpu_init();
23216 +
23217 +#ifdef CONFIG_XEN
23218 + /* This needs to be moved into platform_cpu_init() later */
23219 + if (is_running_on_xen()) {
23220 + extern void xen_smp_intr_init(void);
23221 + xen_smp_intr_init();
23222 + }
23223 +#endif
23224 +
23225 pm_idle = default_idle;
23226 }
23227
23228 diff -Nur linux-2.6.16.33-noxen/arch/ia64/oprofile/Makefile linux-2.6.16.33/arch/ia64/oprofile/Makefile
23229 --- linux-2.6.16.33-noxen/arch/ia64/oprofile/Makefile 2006-11-22 18:06:31.000000000 +0000
23230 +++ linux-2.6.16.33/arch/ia64/oprofile/Makefile 2007-01-08 15:00:45.000000000 +0000
23231 @@ -8,3 +8,7 @@
23232
23233 oprofile-y := $(DRIVER_OBJS) init.o backtrace.o
23234 oprofile-$(CONFIG_PERFMON) += perfmon.o
23235 +ifeq ($(CONFIG_XEN), y)
23236 +oprofile-$(CONFIG_PERFMON) += xenoprof.o \
23237 + ../../../drivers/xen/xenoprof/xenoprofile.o
23238 +endif
23239 diff -Nur linux-2.6.16.33-noxen/arch/ia64/oprofile/init.c linux-2.6.16.33/arch/ia64/oprofile/init.c
23240 --- linux-2.6.16.33-noxen/arch/ia64/oprofile/init.c 2006-11-22 18:06:31.000000000 +0000
23241 +++ linux-2.6.16.33/arch/ia64/oprofile/init.c 2007-01-08 15:00:45.000000000 +0000
23242 @@ -11,6 +11,7 @@
23243 #include <linux/oprofile.h>
23244 #include <linux/init.h>
23245 #include <linux/errno.h>
23246 +#include "oprofile_perfmon.h"
23247
23248 extern int perfmon_init(struct oprofile_operations * ops);
23249 extern void perfmon_exit(void);
23250 @@ -20,6 +21,13 @@
23251 {
23252 int ret = -ENODEV;
23253
23254 + if (is_running_on_xen()) {
23255 + ret = xen_perfmon_init();
23256 + if (ret)
23257 + return ret;
23258 + return xenoprofile_init(ops);
23259 + }
23260 +
23261 #ifdef CONFIG_PERFMON
23262 /* perfmon_init() can fail, but we have no way to report it */
23263 ret = perfmon_init(ops);
23264 @@ -32,6 +40,12 @@
23265
23266 void oprofile_arch_exit(void)
23267 {
23268 + if (is_running_on_xen()) {
23269 + xenoprofile_exit();
23270 + xen_perfmon_exit();
23271 + return;
23272 + }
23273 +
23274 #ifdef CONFIG_PERFMON
23275 perfmon_exit();
23276 #endif
23277 diff -Nur linux-2.6.16.33-noxen/arch/ia64/oprofile/oprofile_perfmon.h linux-2.6.16.33/arch/ia64/oprofile/oprofile_perfmon.h
23278 --- linux-2.6.16.33-noxen/arch/ia64/oprofile/oprofile_perfmon.h 1970-01-01 00:00:00.000000000 +0000
23279 +++ linux-2.6.16.33/arch/ia64/oprofile/oprofile_perfmon.h 2007-01-08 15:00:45.000000000 +0000
23280 @@ -0,0 +1,30 @@
23281 +#ifndef OPROFILE_PERFMON_H
23282 +#define OPROFILE_PERFMON_H
23283 +
23284 +#include <linux/config.h>
23285 +
23286 +#ifdef CONFIG_PERFMON
23287 +int __perfmon_init(void);
23288 +void __perfmon_exit(void);
23289 +int perfmon_start(void);
23290 +void perfmon_stop(void);
23291 +#else
23292 +#define __perfmon_init() (-ENOSYS)
23293 +#define __perfmon_exit() do {} while (0)
23294 +#endif /* CONFIG_PERFMON */
23295 +
23296 +#ifdef CONFIG_XEN
23297 +#define STATIC_IF_NO_XEN /* nothing */
23298 +#define xen_perfmon_init() __perfmon_init()
23299 +#define xen_perfmon_exit() __perfmon_exit()
23300 +extern int xenoprofile_init(struct oprofile_operations * ops);
23301 +extern void xenoprofile_exit(void);
23302 +#else
23303 +#define STATIC_IF_NO_XEN static
23304 +#define xen_perfmon_init() (-ENOSYS)
23305 +#define xen_perfmon_exit() do {} while (0)
23306 +#define xenoprofile_init() (-ENOSYS)
23307 +#define xenoprofile_exit() do {} while (0)
23308 +#endif /* CONFIG_XEN */
23309 +
23310 +#endif /* OPROFILE_PERFMON_H */
23311 diff -Nur linux-2.6.16.33-noxen/arch/ia64/oprofile/perfmon.c linux-2.6.16.33/arch/ia64/oprofile/perfmon.c
23312 --- linux-2.6.16.33-noxen/arch/ia64/oprofile/perfmon.c 2006-11-22 18:06:31.000000000 +0000
23313 +++ linux-2.6.16.33/arch/ia64/oprofile/perfmon.c 2007-01-08 15:00:45.000000000 +0000
23314 @@ -14,6 +14,7 @@
23315 #include <asm/perfmon.h>
23316 #include <asm/ptrace.h>
23317 #include <asm/errno.h>
23318 +#include "oprofile_perfmon.h"
23319
23320 static int allow_ints;
23321
23322 @@ -34,14 +35,16 @@
23323 }
23324
23325
23326 -static int perfmon_start(void)
23327 +STATIC_IF_NO_XEN
23328 +int perfmon_start(void)
23329 {
23330 allow_ints = 1;
23331 return 0;
23332 }
23333
23334
23335 -static void perfmon_stop(void)
23336 +STATIC_IF_NO_XEN
23337 +void perfmon_stop(void)
23338 {
23339 allow_ints = 0;
23340 }
23341 @@ -76,16 +79,35 @@
23342
23343 static int using_perfmon;
23344
23345 -int perfmon_init(struct oprofile_operations * ops)
23346 +STATIC_IF_NO_XEN
23347 +int __perfmon_init(void)
23348 {
23349 int ret = pfm_register_buffer_fmt(&oprofile_fmt);
23350 if (ret)
23351 return -ENODEV;
23352
23353 + using_perfmon = 1;
23354 + return 0;
23355 +}
23356 +
23357 +STATIC_IF_NO_XEN
23358 +void __perfmon_exit(void)
23359 +{
23360 + if (!using_perfmon)
23361 + return;
23362 +
23363 + pfm_unregister_buffer_fmt(oprofile_fmt.fmt_uuid);
23364 +}
23365 +
23366 +int perfmon_init(struct oprofile_operations * ops)
23367 +{
23368 + int ret = __perfmon_init();
23369 + if (ret)
23370 + return -ENODEV;
23371 +
23372 ops->cpu_type = get_cpu_type();
23373 ops->start = perfmon_start;
23374 ops->stop = perfmon_stop;
23375 - using_perfmon = 1;
23376 printk(KERN_INFO "oprofile: using perfmon.\n");
23377 return 0;
23378 }
23379 @@ -93,8 +115,5 @@
23380
23381 void perfmon_exit(void)
23382 {
23383 - if (!using_perfmon)
23384 - return;
23385 -
23386 - pfm_unregister_buffer_fmt(oprofile_fmt.fmt_uuid);
23387 + __perfmon_exit();
23388 }
23389 diff -Nur linux-2.6.16.33-noxen/arch/ia64/oprofile/xenoprof.c linux-2.6.16.33/arch/ia64/oprofile/xenoprof.c
23390 --- linux-2.6.16.33-noxen/arch/ia64/oprofile/xenoprof.c 1970-01-01 00:00:00.000000000 +0000
23391 +++ linux-2.6.16.33/arch/ia64/oprofile/xenoprof.c 2007-01-08 15:00:45.000000000 +0000
23392 @@ -0,0 +1,142 @@
23393 +/******************************************************************************
23394 + * xenoprof ia64 specific part
23395 + *
23396 + * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp>
23397 + * VA Linux Systems Japan K.K.
23398 + *
23399 + * This program is free software; you can redistribute it and/or modify
23400 + * it under the terms of the GNU General Public License as published by
23401 + * the Free Software Foundation; either version 2 of the License, or
23402 + * (at your option) any later version.
23403 + *
23404 + * This program is distributed in the hope that it will be useful,
23405 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
23406 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23407 + * GNU General Public License for more details.
23408 + *
23409 + * You should have received a copy of the GNU General Public License
23410 + * along with this program; if not, write to the Free Software
23411 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
23412 + *
23413 + */
23414 +#include <linux/init.h>
23415 +#include <linux/oprofile.h>
23416 +#include <linux/ioport.h>
23417 +
23418 +#include <xen/driver_util.h>
23419 +#include <xen/interface/xen.h>
23420 +#include <xen/interface/xenoprof.h>
23421 +#include <xen/xenoprof.h>
23422 +
23423 +#include "oprofile_perfmon.h"
23424 +
23425 +void __init xenoprof_arch_init_counter(struct xenoprof_init *init)
23426 +{
23427 + init->num_events = 0; /* perfmon manages the events. */
23428 +}
23429 +
23430 +void xenoprof_arch_counter(void)
23431 +{
23432 + /* nothing to do; perfmon handles it. */
23433 +}
23434 +
23435 +void xenoprof_arch_start(void)
23436 +{
23437 + perfmon_start();
23438 +}
23439 +
23440 +void xenoprof_arch_stop(void)
23441 +{
23442 + perfmon_stop();
23443 +}
23444 +
23445 +/* XXX move them to an appropriate header file. */
23446 +struct resource* xen_ia64_allocate_resource(unsigned long size);
23447 +void xen_ia64_release_resource(struct resource* res);
23448 +void xen_ia64_unmap_resource(struct resource* res);
23449 +
23450 +struct resource*
23451 +xenoprof_ia64_allocate_resource(int32_t max_samples)
23452 +{
23453 + unsigned long bufsize;
23454 +
23455 + /* XXX add hypercall to get bufsize? */
23456 + /* this value is taken from alloc_xenoprof_struct(). */
23457 +#if 0
23458 + bufsize = NR_CPUS * (sizeof(struct xenoprof_buf) +
23459 + (max_samples - 1) * sizeof(struct event_log));
23460 + bufsize = PAGE_ALIGN(bufsize) + PAGE_SIZE;
23461 +#else
23462 +#define MAX_OPROF_SHARED_PAGES 32
23463 + bufsize = (MAX_OPROF_SHARED_PAGES + 1) * PAGE_SIZE;
23464 +#endif
23465 + return xen_ia64_allocate_resource(bufsize);
23466 +}
23467 +
23468 +void xenoprof_arch_unmap_shared_buffer(struct xenoprof_shared_buffer* sbuf)
23469 +{
23470 + if (sbuf->buffer) {
23471 + xen_ia64_unmap_resource(sbuf->arch.res);
23472 + sbuf->buffer = NULL;
23473 + sbuf->arch.res = NULL;
23474 + }
23475 +}
23476 +
23477 +int xenoprof_arch_map_shared_buffer(struct xenoprof_get_buffer* get_buffer,
23478 + struct xenoprof_shared_buffer* sbuf)
23479 +{
23480 + int ret;
23481 + struct resource* res;
23482 +
23483 + sbuf->buffer = NULL;
23484 + sbuf->arch.res = NULL;
23485 +
23486 + res = xenoprof_ia64_allocate_resource(get_buffer->max_samples);
23487 + if (IS_ERR(res))
23488 + return PTR_ERR(res);
23489 +
23490 + get_buffer->buf_gmaddr = res->start;
23491 +
23492 + ret = HYPERVISOR_xenoprof_op(XENOPROF_get_buffer, get_buffer);
23493 + if (ret) {
23494 + xen_ia64_release_resource(res);
23495 + return ret;
23496 + }
23497 +
23498 + BUG_ON((res->end - res->start + 1) <
23499 + get_buffer->bufsize * get_buffer->nbuf);
23500 +
23501 + sbuf->buffer = __va(res->start);
23502 + sbuf->arch.res = res;
23503 +
23504 + return ret;
23505 +}
23506 +
23507 +int xenoprof_arch_set_passive(struct xenoprof_passive* pdomain,
23508 + struct xenoprof_shared_buffer* sbuf)
23509 +{
23510 + int ret;
23511 + struct resource* res;
23512 +
23513 + sbuf->buffer = NULL;
23514 + sbuf->arch.res = NULL;
23515 +
23516 + res = xenoprof_ia64_allocate_resource(pdomain->max_samples);
23517 + if (IS_ERR(res))
23518 + return PTR_ERR(res);
23519 +
23520 + pdomain->buf_gmaddr = res->start;
23521 +
23522 + ret = HYPERVISOR_xenoprof_op(XENOPROF_set_passive, pdomain);
23523 + if (ret) {
23524 + xen_ia64_release_resource(res);
23525 + return ret;
23526 + }
23527 +
23528 + BUG_ON((res->end - res->start + 1) < pdomain->bufsize * pdomain->nbuf);
23529 +
23530 + sbuf->buffer = __va(res->start);
23531 + sbuf->arch.res = res;
23532 +
23533 + return ret;
23534 +}
23535 diff -Nur linux-2.6.16.33-noxen/arch/ia64/xen/Makefile linux-2.6.16.33/arch/ia64/xen/Makefile
23536 --- linux-2.6.16.33-noxen/arch/ia64/xen/Makefile 1970-01-01 00:00:00.000000000 +0000
23537 +++ linux-2.6.16.33/arch/ia64/xen/Makefile 2007-01-08 15:00:45.000000000 +0000
23538 @@ -0,0 +1,9 @@
23539 +#
23540 +# Makefile for Xen components
23541 +#
23542 +
23543 +obj-y := hypercall.o xenivt.o xenentry.o xensetup.o xenpal.o xenhpski.o \
23544 + hypervisor.o pci-dma-xen.o util.o xencomm.o xcom_hcall.o \
23545 + xcom_mini.o xcom_privcmd.o mem.o
23546 +
23547 +pci-dma-xen-y := ../../i386/kernel/pci-dma-xen.o
23548 diff -Nur linux-2.6.16.33-noxen/arch/ia64/xen/hypercall.S linux-2.6.16.33/arch/ia64/xen/hypercall.S
23549 --- linux-2.6.16.33-noxen/arch/ia64/xen/hypercall.S 1970-01-01 00:00:00.000000000 +0000
23550 +++ linux-2.6.16.33/arch/ia64/xen/hypercall.S 2007-01-08 15:00:45.000000000 +0000
23551 @@ -0,0 +1,412 @@
23552 +/*
23553 + * Support routines for Xen hypercalls
23554 + *
23555 + * Copyright (C) 2005 Dan Magenheimer <dan.magenheimer@hp.com>
23556 + */
23557 +
23558 +#include <linux/config.h>
23559 +#include <asm/processor.h>
23560 +#include <asm/asmmacro.h>
23561 +
23562 +/* To clear vpsr.ic, vpsr.i needs to be cleared first */
23563 +#define XEN_CLEAR_PSR_IC \
23564 + mov r14=1; \
23565 + movl r15=XSI_PSR_I_ADDR; \
23566 + movl r2=XSI_PSR_IC; \
23567 + ;; \
23568 + ld8 r15=[r15]; \
23569 + ld4 r3=[r2]; \
23570 + ;; \
23571 + ld1 r16=[r15]; \
23572 + ;; \
23573 + st1 [r15]=r14; \
23574 + st4 [r2]=r0; \
23575 + ;;
23576 +
23577 +/* First restore vpsr.ic, and then vpsr.i */
23578 +#define XEN_RESTORE_PSR_IC \
23579 + st4 [r2]=r3; \
23580 + st1 [r15]=r16; \
23581 + ;;
23582 +
23583 +GLOBAL_ENTRY(xen_get_ivr)
23584 + movl r8=running_on_xen;;
23585 + ld4 r8=[r8];;
23586 + cmp.eq p7,p0=r8,r0;;
23587 +(p7) mov r8=cr.ivr;;
23588 +(p7) br.ret.sptk.many rp
23589 + ;;
23590 + XEN_CLEAR_PSR_IC
23591 + ;;
23592 + XEN_HYPER_GET_IVR
23593 + ;;
23594 + XEN_RESTORE_PSR_IC
23595 + ;;
23596 + br.ret.sptk.many rp
23597 + ;;
23598 +END(xen_get_ivr)
23599 +
23600 +GLOBAL_ENTRY(xen_get_tpr)
23601 + movl r8=running_on_xen;;
23602 + ld4 r8=[r8];;
23603 + cmp.eq p7,p0=r8,r0;;
23604 +(p7) mov r8=cr.tpr;;
23605 +(p7) br.ret.sptk.many rp
23606 + ;;
23607 + XEN_CLEAR_PSR_IC
23608 + ;;
23609 + XEN_HYPER_GET_TPR
23610 + ;;
23611 + XEN_RESTORE_PSR_IC
23612 + ;;
23613 + br.ret.sptk.many rp
23614 + ;;
23615 +END(xen_get_tpr)
23616 +
23617 +GLOBAL_ENTRY(xen_set_tpr)
23618 + movl r8=running_on_xen;;
23619 + ld4 r8=[r8];;
23620 + cmp.eq p7,p0=r8,r0;;
23621 +(p7) mov cr.tpr=r32;;
23622 +(p7) br.ret.sptk.many rp
23623 + ;;
23624 + mov r8=r32
23625 + ;;
23626 + XEN_CLEAR_PSR_IC
23627 + ;;
23628 + XEN_HYPER_SET_TPR
23629 + ;;
23630 + XEN_RESTORE_PSR_IC
23631 + ;;
23632 + br.ret.sptk.many rp
23633 + ;;
23634 +END(xen_set_tpr)
23635 +
23636 +GLOBAL_ENTRY(xen_eoi)
23637 + movl r8=running_on_xen;;
23638 + ld4 r8=[r8];;
23639 + cmp.eq p7,p0=r8,r0;;
23640 +(p7) mov cr.eoi=r0;;
23641 +(p7) br.ret.sptk.many rp
23642 + ;;
23643 + mov r8=r32
23644 + ;;
23645 + XEN_CLEAR_PSR_IC
23646 + ;;
23647 + XEN_HYPER_EOI
23648 + ;;
23649 + XEN_RESTORE_PSR_IC
23650 + ;;
23651 + br.ret.sptk.many rp
23652 + ;;
23653 +END(xen_eoi)
23654 +
23655 +GLOBAL_ENTRY(xen_thash)
23656 + movl r8=running_on_xen;;
23657 + ld4 r8=[r8];;
23658 + cmp.eq p7,p0=r8,r0;;
23659 +(p7) thash r8=r32;;
23660 +(p7) br.ret.sptk.many rp
23661 + ;;
23662 + mov r8=r32
23663 + ;;
23664 + XEN_CLEAR_PSR_IC
23665 + ;;
23666 + XEN_HYPER_THASH
23667 + ;;
23668 + XEN_RESTORE_PSR_IC
23669 + ;;
23670 + br.ret.sptk.many rp
23671 + ;;
23672 +END(xen_thash)
23673 +
23674 +GLOBAL_ENTRY(xen_set_itm)
23675 + movl r8=running_on_xen;;
23676 + ld4 r8=[r8];;
23677 + cmp.eq p7,p0=r8,r0;;
23678 +(p7) mov cr.itm=r32;;
23679 +(p7) br.ret.sptk.many rp
23680 + ;;
23681 + mov r8=r32
23682 + ;;
23683 + XEN_CLEAR_PSR_IC
23684 + ;;
23685 + XEN_HYPER_SET_ITM
23686 + ;;
23687 + XEN_RESTORE_PSR_IC
23688 + ;;
23689 + br.ret.sptk.many rp
23690 + ;;
23691 +END(xen_set_itm)
23692 +
23693 +GLOBAL_ENTRY(xen_ptcga)
23694 + movl r8=running_on_xen;;
23695 + ld4 r8=[r8];;
23696 + cmp.eq p7,p0=r8,r0;;
23697 +(p7) ptc.ga r32,r33;;
23698 +(p7) br.ret.sptk.many rp
23699 + ;;
23700 + mov r8=r32
23701 + mov r9=r33
23702 + ;;
23703 + XEN_CLEAR_PSR_IC
23704 + ;;
23705 + XEN_HYPER_PTC_GA
23706 + ;;
23707 + XEN_RESTORE_PSR_IC
23708 + ;;
23709 + br.ret.sptk.many rp
23710 + ;;
23711 +END(xen_ptcga)
23712 +
23713 +GLOBAL_ENTRY(xen_get_rr)
23714 + movl r8=running_on_xen;;
23715 + ld4 r8=[r8];;
23716 + cmp.eq p7,p0=r8,r0;;
23717 +(p7) mov r8=rr[r32];;
23718 +(p7) br.ret.sptk.many rp
23719 + ;;
23720 + mov r8=r32
23721 + ;;
23722 + XEN_CLEAR_PSR_IC
23723 + ;;
23724 + XEN_HYPER_GET_RR
23725 + ;;
23726 + XEN_RESTORE_PSR_IC
23727 + ;;
23728 + br.ret.sptk.many rp
23729 + ;;
23730 +END(xen_get_rr)
23731 +
23732 +GLOBAL_ENTRY(xen_set_rr)
23733 + movl r8=running_on_xen;;
23734 + ld4 r8=[r8];;
23735 + cmp.eq p7,p0=r8,r0;;
23736 +(p7) mov rr[r32]=r33;;
23737 +(p7) br.ret.sptk.many rp
23738 + ;;
23739 + mov r8=r32
23740 + mov r9=r33
23741 + ;;
23742 + XEN_CLEAR_PSR_IC
23743 + ;;
23744 + XEN_HYPER_SET_RR
23745 + ;;
23746 + XEN_RESTORE_PSR_IC
23747 + ;;
23748 + br.ret.sptk.many rp
23749 + ;;
23750 +END(xen_set_rr)
23751 +
23752 +GLOBAL_ENTRY(xen_set_kr)
23753 + movl r8=running_on_xen;;
23754 + ld4 r8=[r8];;
23755 + cmp.ne p7,p0=r8,r0;;
23756 +(p7) br.cond.spnt.few 1f;
23757 + ;;
23758 + cmp.eq p7,p0=r8,r0
23759 + adds r8=-1,r8;;
23760 +(p7) mov ar0=r9
23761 +(p7) br.ret.sptk.many rp;;
23762 + cmp.eq p7,p0=r8,r0
23763 + adds r8=-1,r8;;
23764 +(p7) mov ar1=r9
23765 +(p7) br.ret.sptk.many rp;;
23766 + cmp.eq p7,p0=r8,r0
23767 + adds r8=-1,r8;;
23768 +(p7) mov ar2=r9
23769 +(p7) br.ret.sptk.many rp;;
23770 + cmp.eq p7,p0=r8,r0
23771 + adds r8=-1,r8;;
23772 +(p7) mov ar3=r9
23773 +(p7) br.ret.sptk.many rp;;
23774 + cmp.eq p7,p0=r8,r0
23775 + adds r8=-1,r8;;
23776 +(p7) mov ar4=r9
23777 +(p7) br.ret.sptk.many rp;;
23778 + cmp.eq p7,p0=r8,r0
23779 + adds r8=-1,r8;;
23780 +(p7) mov ar5=r9
23781 +(p7) br.ret.sptk.many rp;;
23782 + cmp.eq p7,p0=r8,r0
23783 + adds r8=-1,r8;;
23784 +(p7) mov ar6=r9
23785 +(p7) br.ret.sptk.many rp;;
23786 + cmp.eq p7,p0=r8,r0
23787 + adds r8=-1,r8;;
23788 +(p7) mov ar7=r9
23789 +(p7) br.ret.sptk.many rp;;
23790 +
23791 +1: mov r8=r32
23792 + mov r9=r33
23793 + ;;
23794 + XEN_CLEAR_PSR_IC
23795 + ;;
23796 + XEN_HYPER_SET_KR
23797 + ;;
23798 + XEN_RESTORE_PSR_IC
23799 + ;;
23800 + br.ret.sptk.many rp
23801 +END(xen_set_kr)
23802 +
23803 +GLOBAL_ENTRY(xen_fc)
23804 + movl r8=running_on_xen;;
23805 + ld4 r8=[r8];;
23806 + cmp.eq p7,p0=r8,r0;;
23807 +(p7) fc r32;;
23808 +(p7) br.ret.sptk.many rp
23809 + ;;
23810 + mov r8=r32
23811 + ;;
23812 + XEN_CLEAR_PSR_IC
23813 + ;;
23814 + XEN_HYPER_FC
23815 + ;;
23816 + XEN_RESTORE_PSR_IC
23817 + ;;
23818 + br.ret.sptk.many rp
23819 +END(xen_fc)
23820 +
23821 +GLOBAL_ENTRY(xen_get_cpuid)
23822 + movl r8=running_on_xen;;
23823 + ld4 r8=[r8];;
23824 + cmp.eq p7,p0=r8,r0;;
23825 +(p7) mov r8=cpuid[r32];;
23826 +(p7) br.ret.sptk.many rp
23827 + ;;
23828 + mov r8=r32
23829 + ;;
23830 + XEN_CLEAR_PSR_IC
23831 + ;;
23832 + XEN_HYPER_GET_CPUID
23833 + ;;
23834 + XEN_RESTORE_PSR_IC
23835 + ;;
23836 + br.ret.sptk.many rp
23837 +END(xen_get_cpuid)
23838 +
23839 +GLOBAL_ENTRY(xen_get_pmd)
23840 + movl r8=running_on_xen;;
23841 + ld4 r8=[r8];;
23842 + cmp.eq p7,p0=r8,r0;;
23843 +(p7) mov r8=pmd[r32];;
23844 +(p7) br.ret.sptk.many rp
23845 + ;;
23846 + mov r8=r32
23847 + ;;
23848 + XEN_CLEAR_PSR_IC
23849 + ;;
23850 + XEN_HYPER_GET_PMD
23851 + ;;
23852 + XEN_RESTORE_PSR_IC
23853 + ;;
23854 + br.ret.sptk.many rp
23855 +END(xen_get_pmd)
23856 +
23857 +#ifdef CONFIG_IA32_SUPPORT
23858 +GLOBAL_ENTRY(xen_get_eflag)
23859 + movl r8=running_on_xen;;
23860 + ld4 r8=[r8];;
23861 + cmp.eq p7,p0=r8,r0;;
23862 +(p7) mov r8=ar24;;
23863 +(p7) br.ret.sptk.many rp
23864 + ;;
23865 + mov r8=r32
23866 + ;;
23867 + XEN_CLEAR_PSR_IC
23868 + ;;
23869 + XEN_HYPER_GET_EFLAG
23870 + ;;
23871 + XEN_RESTORE_PSR_IC
23872 + ;;
23873 + br.ret.sptk.many rp
23874 +END(xen_get_eflag)
23875 +
23876 +// some bits aren't set if pl!=0, see SDM vol1 3.1.8
23877 +GLOBAL_ENTRY(xen_set_eflag)
23878 + movl r8=running_on_xen;;
23879 + ld4 r8=[r8];;
23880 + cmp.eq p7,p0=r8,r0;;
23881 +(p7) mov ar24=r32
23882 +(p7) br.ret.sptk.many rp
23883 + ;;
23884 + mov r8=r32
23885 + ;;
23886 + XEN_CLEAR_PSR_IC
23887 + ;;
23888 + XEN_HYPER_SET_EFLAG
23889 + ;;
23890 + XEN_RESTORE_PSR_IC
23891 + ;;
23892 + br.ret.sptk.many rp
23893 +END(xen_set_eflag)
23894 +#endif
23895 +
23896 +GLOBAL_ENTRY(xen_send_ipi)
23897 + mov r14=r32
23898 + mov r15=r33
23899 + mov r2=0x400
23900 + break 0x1000
23901 + ;;
23902 + br.ret.sptk.many rp
23903 + ;;
23904 +END(xen_send_ipi)
23905 +
23906 +#ifdef CONFIG_XEN_IA64_VDSO_PARAVIRT
23907 +// These routines are specialized for the vDSO.
23908 +// In fsys mode, call and ret can't be used.
23909 +GLOBAL_ENTRY(xen_rsm_be_i)
23910 + st1 [r22]=r20
23911 + st4 [r23]=r0
23912 + XEN_HYPER_RSM_BE
23913 + st4 [r23]=r20
23914 + brl.cond.sptk .vdso_rsm_be_i_ret
23915 + ;;
23916 +END(xen_rsm_be_i)
23917 +
23918 +GLOBAL_ENTRY(xen_get_psr)
23919 + mov r31=r8
23920 + mov r25=IA64_PSR_IC
23921 + st4 [r23]=r0
23922 + XEN_HYPER_GET_PSR
23923 + ;;
23924 + st4 [r23]=r20
23925 + or r29=r8,r25 // vpsr.ic was cleared for hyperprivop
23926 + mov r8=r31
23927 + brl.cond.sptk .vdso_get_psr_ret
23928 + ;;
23929 +END(xen_get_psr)
23930 +
23931 + // see xen_ssm_i() in privop.h
23932 + // r22 = &vcpu->vcpu_info->evtchn_upcall_mask
23933 + // r23 = &vpsr.ic
23934 + // r24 = &vcpu->vcpu_info->evtchn_upcall_pending
23935 + // r25 = tmp
23936 + // r31 = tmp
23937 + // p11 = tmp
23938 + // p14 = tmp
23939 +#define XEN_SET_PSR_I \
23940 + ld1 r31=[r22]; \
23941 + ld1 r25=[r24]; \
23942 + ;; \
23943 + st1 [r22]=r0; \
23944 + cmp.ne.unc p14,p0=r0,r31; \
23945 + ;; \
23946 +(p14) cmp.ne.unc p11,p0=r0,r25; \
23947 + ;; \
23948 +(p11) st1 [r22]=r20; \
23949 +(p11) st4 [r23]=r0; \
23950 +(p11) XEN_HYPER_SSM_I;
23951 +
23952 +GLOBAL_ENTRY(xen_ssm_i_0)
23953 + XEN_SET_PSR_I
23954 + brl.cond.sptk .vdso_ssm_i_0_ret
23955 + ;;
23956 +END(xen_ssm_i_0)
23957 +
23958 +GLOBAL_ENTRY(xen_ssm_i_1)
23959 + XEN_SET_PSR_I
23960 + brl.cond.sptk .vdso_ssm_i_1_ret
23961 + ;;
23962 +END(xen_ssm_i_1)
23963 +#endif
23964 diff -Nur linux-2.6.16.33-noxen/arch/ia64/xen/hypervisor.c linux-2.6.16.33/arch/ia64/xen/hypervisor.c
23965 --- linux-2.6.16.33-noxen/arch/ia64/xen/hypervisor.c 1970-01-01 00:00:00.000000000 +0000
23966 +++ linux-2.6.16.33/arch/ia64/xen/hypervisor.c 2007-01-08 15:00:45.000000000 +0000
23967 @@ -0,0 +1,1104 @@
23968 +/******************************************************************************
23969 + * include/asm-ia64/shadow.h
23970 + *
23971 + * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp>
23972 + * VA Linux Systems Japan K.K.
23973 + *
23974 + * This program is free software; you can redistribute it and/or modify
23975 + * it under the terms of the GNU General Public License as published by
23976 + * the Free Software Foundation; either version 2 of the License, or
23977 + * (at your option) any later version.
23978 + *
23979 + * This program is distributed in the hope that it will be useful,
23980 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
23981 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23982 + * GNU General Public License for more details.
23983 + *
23984 + * You should have received a copy of the GNU General Public License
23985 + * along with this program; if not, write to the Free Software
23986 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
23987 + *
23988 + */
23989 +
23990 +//#include <linux/kernel.h>
23991 +#include <linux/spinlock.h>
23992 +#include <linux/bootmem.h>
23993 +#include <linux/module.h>
23994 +#include <linux/vmalloc.h>
23995 +#include <asm/page.h>
23996 +#include <asm/hypervisor.h>
23997 +#include <asm/hypercall.h>
23998 +#include <xen/interface/memory.h>
23999 +#include <xen/balloon.h>
24000 +
24001 +shared_info_t *HYPERVISOR_shared_info = (shared_info_t *)XSI_BASE;
24002 +EXPORT_SYMBOL(HYPERVISOR_shared_info);
24003 +
24004 +start_info_t *xen_start_info;
24005 +EXPORT_SYMBOL(xen_start_info);
24006 +
24007 +int running_on_xen;
24008 +EXPORT_SYMBOL(running_on_xen);
24009 +
24010 +#ifdef CONFIG_XEN_IA64_EXPOSE_P2M
24011 +static int p2m_expose_init(void);
24012 +#else
24013 +#define p2m_expose_init() (-ENOSYS)
24014 +#endif
24015 +
24016 +//XXX same as i386, x86_64 contiguous_bitmap_set(), contiguous_bitmap_clear()
24017 +// move those to lib/contiguous_bitmap?
24018 +//XXX discontigmem/sparsemem
24019 +
24020 +/*
24021 + * Bitmap is indexed by page number. If bit is set, the page is part of a
24022 + * xen_create_contiguous_region() area of memory.
24023 + */
24024 +unsigned long *contiguous_bitmap;
24025 +
24026 +void
24027 +contiguous_bitmap_init(unsigned long end_pfn)
24028 +{
24029 + unsigned long size = (end_pfn + 2 * BITS_PER_LONG) >> 3;
24030 + contiguous_bitmap = alloc_bootmem_low_pages(size);
24031 + BUG_ON(!contiguous_bitmap);
24032 + memset(contiguous_bitmap, 0, size);
24033 +}
24034 +
24035 +#if 0
24036 +int
24037 +contiguous_bitmap_test(void* p)
24038 +{
24039 + return test_bit(__pa(p) >> PAGE_SHIFT, contiguous_bitmap);
24040 +}
24041 +#endif
24042 +
24043 +static void contiguous_bitmap_set(
24044 + unsigned long first_page, unsigned long nr_pages)
24045 +{
24046 + unsigned long start_off, end_off, curr_idx, end_idx;
24047 +
24048 + curr_idx = first_page / BITS_PER_LONG;
24049 + start_off = first_page & (BITS_PER_LONG-1);
24050 + end_idx = (first_page + nr_pages) / BITS_PER_LONG;
24051 + end_off = (first_page + nr_pages) & (BITS_PER_LONG-1);
24052 +
24053 + if (curr_idx == end_idx) {
24054 + contiguous_bitmap[curr_idx] |=
24055 + ((1UL<<end_off)-1) & -(1UL<<start_off);
24056 + } else {
24057 + contiguous_bitmap[curr_idx] |= -(1UL<<start_off);
24058 + while ( ++curr_idx < end_idx )
24059 + contiguous_bitmap[curr_idx] = ~0UL;
24060 + contiguous_bitmap[curr_idx] |= (1UL<<end_off)-1;
24061 + }
24062 +}
24063 +
24064 +static void contiguous_bitmap_clear(
24065 + unsigned long first_page, unsigned long nr_pages)
24066 +{
24067 + unsigned long start_off, end_off, curr_idx, end_idx;
24068 +
24069 + curr_idx = first_page / BITS_PER_LONG;
24070 + start_off = first_page & (BITS_PER_LONG-1);
24071 + end_idx = (first_page + nr_pages) / BITS_PER_LONG;
24072 + end_off = (first_page + nr_pages) & (BITS_PER_LONG-1);
24073 +
24074 + if (curr_idx == end_idx) {
24075 + contiguous_bitmap[curr_idx] &=
24076 + -(1UL<<end_off) | ((1UL<<start_off)-1);
24077 + } else {
24078 + contiguous_bitmap[curr_idx] &= (1UL<<start_off)-1;
24079 + while ( ++curr_idx != end_idx )
24080 + contiguous_bitmap[curr_idx] = 0;
24081 + contiguous_bitmap[curr_idx] &= -(1UL<<end_off);
24082 + }
24083 +}
24084 +
24085 +// __xen_create_contiguous_region(), __xen_destroy_contiguous_region()
24086 +// are based on i386 xen_create_contiguous_region(),
24087 +// xen_destroy_contiguous_region()
24088 +
24089 +/* Protected by balloon_lock. */
24090 +#define MAX_CONTIG_ORDER 7
24091 +static unsigned long discontig_frames[1<<MAX_CONTIG_ORDER];
24092 +
24093 +/* Ensure multi-page extents are contiguous in machine memory. */
24094 +int
24095 +__xen_create_contiguous_region(unsigned long vstart,
24096 + unsigned int order, unsigned int address_bits)
24097 +{
24098 + unsigned long error = 0;
24099 + unsigned long gphys = __pa(vstart);
24100 + unsigned long start_gpfn = gphys >> PAGE_SHIFT;
24101 + unsigned long num_gpfn = 1 << order;
24102 + unsigned long i;
24103 + unsigned long flags;
24104 +
24105 + unsigned long *in_frames = discontig_frames, out_frame;
24106 + int success;
24107 + struct xen_memory_exchange exchange = {
24108 + .in = {
24109 + .nr_extents = num_gpfn,
24110 + .extent_order = 0,
24111 + .domid = DOMID_SELF
24112 + },
24113 + .out = {
24114 + .nr_extents = 1,
24115 + .extent_order = order,
24116 + .address_bits = address_bits,
24117 + .domid = DOMID_SELF
24118 + },
24119 + .nr_exchanged = 0
24120 + };
24121 +
24122 + if (unlikely(order > MAX_CONTIG_ORDER))
24123 + return -ENOMEM;
24124 +
24125 + set_xen_guest_handle(exchange.in.extent_start, in_frames);
24126 + set_xen_guest_handle(exchange.out.extent_start, &out_frame);
24127 +
24128 + scrub_pages(vstart, num_gpfn);
24129 +
24130 + balloon_lock(flags);
24131 +
24132 + /* Get a new contiguous memory extent. */
24133 + for (i = 0; i < num_gpfn; i++) {
24134 + in_frames[i] = start_gpfn + i;
24135 + }
24136 + out_frame = start_gpfn;
24137 + error = HYPERVISOR_memory_op(XENMEM_exchange, &exchange);
24138 + success = (exchange.nr_exchanged == num_gpfn);
24139 + BUG_ON(!success && ((exchange.nr_exchanged != 0) || (error == 0)));
24140 + BUG_ON(success && (error != 0));
24141 + if (unlikely(error == -ENOSYS)) {
24142 + /* Compatibility when XENMEM_exchange is unsupported. */
24143 + error = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
24144 + &exchange.in);
24145 + BUG_ON(error != num_gpfn);
24146 + error = HYPERVISOR_memory_op(XENMEM_populate_physmap,
24147 + &exchange.out);
24148 + if (error != 1) {
24149 + /* Couldn't get special memory: fall back to normal. */
24150 + for (i = 0; i < num_gpfn; i++) {
24151 + in_frames[i] = start_gpfn + i;
24152 + }
24153 + error = HYPERVISOR_memory_op(XENMEM_populate_physmap,
24154 + &exchange.in);
24155 + BUG_ON(error != num_gpfn);
24156 + success = 0;
24157 + } else
24158 + success = 1;
24159 + }
24160 + if (success)
24161 + contiguous_bitmap_set(start_gpfn, num_gpfn);
24162 +#if 0
24163 + if (success) {
24164 + unsigned long mfn;
24165 + unsigned long mfn_prev = ~0UL;
24166 + for (i = 0; i < num_gpfn; i++) {
24167 + mfn = pfn_to_mfn_for_dma(start_gpfn + i);
24168 + if (mfn_prev != ~0UL && mfn != mfn_prev + 1) {
24169 + xprintk("\n");
24170 + xprintk("%s:%d order %d "
24171 + "start 0x%lx bus 0x%lx "
24172 + "machine 0x%lx\n",
24173 + __func__, __LINE__, order,
24174 + vstart, virt_to_bus((void*)vstart),
24175 + phys_to_machine_for_dma(gphys));
24176 + xprintk("mfn: ");
24177 + for (i = 0; i < num_gpfn; i++) {
24178 + mfn = pfn_to_mfn_for_dma(
24179 + start_gpfn + i);
24180 + xprintk("0x%lx ", mfn);
24181 + }
24182 + xprintk("\n");
24183 + break;
24184 + }
24185 + mfn_prev = mfn;
24186 + }
24187 + }
24188 +#endif
24189 + balloon_unlock(flags);
24190 + return success? 0: -ENOMEM;
24191 +}
24192 +
24193 +void
24194 +__xen_destroy_contiguous_region(unsigned long vstart, unsigned int order)
24195 +{
24196 + unsigned long flags;
24197 + unsigned long error = 0;
24198 + unsigned long start_gpfn = __pa(vstart) >> PAGE_SHIFT;
24199 + unsigned long num_gpfn = 1UL << order;
24200 + unsigned long i;
24201 +
24202 + unsigned long *out_frames = discontig_frames, in_frame;
24203 + int success;
24204 + struct xen_memory_exchange exchange = {
24205 + .in = {
24206 + .nr_extents = 1,
24207 + .extent_order = order,
24208 + .domid = DOMID_SELF
24209 + },
24210 + .out = {
24211 + .nr_extents = num_gpfn,
24212 + .extent_order = 0,
24213 + .address_bits = 0,
24214 + .domid = DOMID_SELF
24215 + },
24216 + .nr_exchanged = 0
24217 + };
24218 +
24219 +
24220 + if (!test_bit(start_gpfn, contiguous_bitmap))
24221 + return;
24222 +
24223 + if (unlikely(order > MAX_CONTIG_ORDER))
24224 + return;
24225 +
24226 + set_xen_guest_handle(exchange.in.extent_start, &in_frame);
24227 + set_xen_guest_handle(exchange.out.extent_start, out_frames);
24228 +
24229 + scrub_pages(vstart, num_gpfn);
24230 +
24231 + balloon_lock(flags);
24232 +
24233 + contiguous_bitmap_clear(start_gpfn, num_gpfn);
24234 +
24235 + /* Do the exchange for non-contiguous MFNs. */
24236 + in_frame = start_gpfn;
24237 + for (i = 0; i < num_gpfn; i++) {
24238 + out_frames[i] = start_gpfn + i;
24239 + }
24240 + error = HYPERVISOR_memory_op(XENMEM_exchange, &exchange);
24241 + success = (exchange.nr_exchanged == 1);
24242 + BUG_ON(!success && ((exchange.nr_exchanged != 0) || (error == 0)));
24243 + BUG_ON(success && (error != 0));
24244 + if (unlikely(error == -ENOSYS)) {
24245 + /* Compatibility when XENMEM_exchange is unsupported. */
24246 + error = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
24247 + &exchange.in);
24248 + BUG_ON(error != 1);
24249 +
24250 + error = HYPERVISOR_memory_op(XENMEM_populate_physmap,
24251 + &exchange.out);
24252 + BUG_ON(error != num_gpfn);
24253 + }
24254 + balloon_unlock(flags);
24255 +}
24256 +
24257 +
24258 +///////////////////////////////////////////////////////////////////////////
24259 +// grant table hack
24260 +// cmd: GNTTABOP_xxx
24261 +
24262 +#include <linux/mm.h>
24263 +#include <xen/interface/xen.h>
24264 +#include <xen/gnttab.h>
24265 +
24266 +static void
24267 +gnttab_map_grant_ref_pre(struct gnttab_map_grant_ref *uop)
24268 +{
24269 + uint32_t flags;
24270 +
24271 + flags = uop->flags;
24272 +
24273 + if (flags & GNTMAP_host_map) {
24274 + if (flags & GNTMAP_application_map) {
24275 + xprintd("GNTMAP_application_map is not supported yet: flags 0x%x\n", flags);
24276 + BUG();
24277 + }
24278 + if (flags & GNTMAP_contains_pte) {
24279 + xprintd("GNTMAP_contains_pte is not supported yet flags 0x%x\n", flags);
24280 + BUG();
24281 + }
24282 + } else if (flags & GNTMAP_device_map) {
24283 + xprintd("GNTMAP_device_map is not supported yet 0x%x\n", flags);
24284 + BUG();//XXX not supported yet; this flag is actually unused.
24285 + } else {
24286 + BUG();
24287 + }
24288 +}
24289 +
24290 +int
24291 +HYPERVISOR_grant_table_op(unsigned int cmd, void *uop, unsigned int count)
24292 +{
24293 + if (cmd == GNTTABOP_map_grant_ref) {
24294 + unsigned int i;
24295 + for (i = 0; i < count; i++) {
24296 + gnttab_map_grant_ref_pre(
24297 + (struct gnttab_map_grant_ref*)uop + i);
24298 + }
24299 + }
24300 + return xencomm_mini_hypercall_grant_table_op(cmd, uop, count);
24301 +}
24302 +EXPORT_SYMBOL(HYPERVISOR_grant_table_op);
24303 +
24304 +///////////////////////////////////////////////////////////////////////////
24305 +// PageForeign(), SetPageForeign(), ClearPageForeign()
24306 +
24307 +struct address_space xen_ia64_foreign_dummy_mapping;
24308 +EXPORT_SYMBOL(xen_ia64_foreign_dummy_mapping);
24309 +
24310 +///////////////////////////////////////////////////////////////////////////
24311 +// foreign mapping
24312 +#include <linux/efi.h>
24313 +#include <asm/meminit.h> // for IA64_GRANULE_SIZE, GRANULEROUND{UP,DOWN}()
24314 +
24315 +static unsigned long privcmd_resource_min = 0;
24316 +// Xen/ia64 currently can handle pseudo physical address bits up to
24317 +// (PAGE_SHIFT * 3)
24318 +static unsigned long privcmd_resource_max = GRANULEROUNDDOWN((1UL << (PAGE_SHIFT * 3)) - 1);
24319 +static unsigned long privcmd_resource_align = IA64_GRANULE_SIZE;
24320 +
24321 +static unsigned long
24322 +md_end_addr(const efi_memory_desc_t *md)
24323 +{
24324 + return md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT);
24325 +}
24326 +
24327 +#define XEN_IA64_PRIVCMD_LEAST_GAP_SIZE (1024 * 1024 * 1024UL)
24328 +static int
24329 +xen_ia64_privcmd_check_size(unsigned long start, unsigned long end)
24330 +{
24331 + return (start < end &&
24332 + (end - start) > XEN_IA64_PRIVCMD_LEAST_GAP_SIZE);
24333 +}
24334 +
24335 +static int __init
24336 +xen_ia64_privcmd_init(void)
24337 +{
24338 + void *efi_map_start, *efi_map_end, *p;
24339 + u64 efi_desc_size;
24340 + efi_memory_desc_t *md;
24341 + unsigned long tmp_min;
24342 + unsigned long tmp_max;
24343 + unsigned long gap_size;
24344 + unsigned long prev_end;
24345 +
24346 + if (!is_running_on_xen())
24347 + return -1;
24348 +
24349 + efi_map_start = __va(ia64_boot_param->efi_memmap);
24350 + efi_map_end = efi_map_start + ia64_boot_param->efi_memmap_size;
24351 + efi_desc_size = ia64_boot_param->efi_memdesc_size;
24352 +
24353 + // first, check the highest address in use
24354 + for (p = efi_map_start; p < efi_map_end; p += efi_desc_size) {
24355 + // nothing
24356 + }
24357 + md = p - efi_desc_size;
24358 + privcmd_resource_min = GRANULEROUNDUP(md_end_addr(md));
24359 + if (xen_ia64_privcmd_check_size(privcmd_resource_min,
24360 + privcmd_resource_max)) {
24361 + goto out;
24362 + }
24363 +
24364 + // the highest address in use is too large; try to find the largest gap.
24365 + tmp_min = privcmd_resource_max;
24366 + tmp_max = 0;
24367 + gap_size = 0;
24368 + prev_end = 0;
24369 + for (p = efi_map_start;
24370 + p < efi_map_end - efi_desc_size;
24371 + p += efi_desc_size) {
24372 + unsigned long end;
24373 + efi_memory_desc_t* next;
24374 + unsigned long next_start;
24375 +
24376 + md = p;
24377 + end = md_end_addr(md);
24378 + if (end > privcmd_resource_max) {
24379 + break;
24380 + }
24381 + if (end < prev_end) {
24382 + // Work around the fact that Xen may pass
24383 + // incompletely sorted memory descriptors,
24384 + // e.g.
24385 + // [x, x + length]
24386 + // [x, x]
24387 + // where the order should be reversed.
24388 + continue;
24389 + }
24390 + next = p + efi_desc_size;
24391 + next_start = next->phys_addr;
24392 + if (next_start > privcmd_resource_max) {
24393 + next_start = privcmd_resource_max;
24394 + }
24395 + if (end < next_start && gap_size < (next_start - end)) {
24396 + tmp_min = end;
24397 + tmp_max = next_start;
24398 + gap_size = tmp_max - tmp_min;
24399 + }
24400 + prev_end = end;
24401 + }
24402 +
24403 + privcmd_resource_min = GRANULEROUNDUP(tmp_min);
24404 + if (xen_ia64_privcmd_check_size(privcmd_resource_min, tmp_max)) {
24405 + privcmd_resource_max = tmp_max;
24406 + goto out;
24407 + }
24408 +
24409 + privcmd_resource_min = tmp_min;
24410 + privcmd_resource_max = tmp_max;
24411 + if (!xen_ia64_privcmd_check_size(privcmd_resource_min,
24412 + privcmd_resource_max)) {
24413 + // No large enough gap was found.
24414 + // Go ahead anyway with a warning, hoping that no large
24415 + // region will be requested.
24416 + printk(KERN_WARNING "xen privcmd: large enough region for privcmd mmap is not found.\n");
24417 + }
24418 +
24419 +out:
24420 + printk(KERN_INFO "xen privcmd uses pseudo physical addr range [0x%lx, 0x%lx] (%ldMB)\n",
24421 + privcmd_resource_min, privcmd_resource_max,
24422 + (privcmd_resource_max - privcmd_resource_min) >> 20);
24423 + BUG_ON(privcmd_resource_min >= privcmd_resource_max);
24424 +
24425 + // XXX this should be somewhere appropriate
24426 + (void)p2m_expose_init();
24427 +
24428 + return 0;
24429 +}
24430 +late_initcall(xen_ia64_privcmd_init);
24431 +
24432 +struct xen_ia64_privcmd_entry {
24433 + atomic_t map_count;
24434 +#define INVALID_GPFN (~0UL)
24435 + unsigned long gpfn;
24436 +};
24437 +
24438 +struct xen_ia64_privcmd_range {
24439 + atomic_t ref_count;
24440 + unsigned long pgoff; // in PAGE_SIZE
24441 + struct resource* res;
24442 +
24443 + unsigned long num_entries;
24444 + struct xen_ia64_privcmd_entry entries[0];
24445 +};
24446 +
24447 +struct xen_ia64_privcmd_vma {
24448 + int is_privcmd_mmapped;
24449 + struct xen_ia64_privcmd_range* range;
24450 +
24451 + unsigned long num_entries;
24452 + struct xen_ia64_privcmd_entry* entries;
24453 +};
24454 +
24455 +static void
24456 +xen_ia64_privcmd_init_entry(struct xen_ia64_privcmd_entry* entry)
24457 +{
24458 + atomic_set(&entry->map_count, 0);
24459 + entry->gpfn = INVALID_GPFN;
24460 +}
24461 +
24462 +static int
24463 +xen_ia64_privcmd_entry_mmap(struct vm_area_struct* vma,
24464 + unsigned long addr,
24465 + struct xen_ia64_privcmd_range* privcmd_range,
24466 + int i,
24467 + unsigned long gmfn,
24468 + pgprot_t prot,
24469 + domid_t domid)
24470 +{
24471 + int error = 0;
24472 + struct xen_ia64_privcmd_entry* entry = &privcmd_range->entries[i];
24473 + unsigned long gpfn;
24474 + unsigned long flags;
24475 +
24476 + if ((addr & ~PAGE_MASK) != 0 || gmfn == INVALID_MFN) {
24477 + error = -EINVAL;
24478 + goto out;
24479 + }
24480 +
24481 + if (entry->gpfn != INVALID_GPFN) {
24482 + error = -EBUSY;
24483 + goto out;
24484 + }
24485 + gpfn = (privcmd_range->res->start >> PAGE_SHIFT) + i;
24486 +
24487 + flags = ASSIGN_writable;
24488 + if (pgprot_val(prot) == PROT_READ) {
24489 + flags = ASSIGN_readonly;
24490 + }
24491 + error = HYPERVISOR_add_physmap_with_gmfn(gpfn, gmfn, flags, domid);
24492 + if (error != 0) {
24493 + goto out;
24494 + }
24495 +
24496 + prot = vma->vm_page_prot;
24497 + error = remap_pfn_range(vma, addr, gpfn, 1 << PAGE_SHIFT, prot);
24498 + if (error != 0) {
24499 + error = HYPERVISOR_zap_physmap(gpfn, 0);
24500 + if (error) {
24501 + BUG();//XXX
24502 + }
24503 + } else {
24504 + atomic_inc(&entry->map_count);
24505 + entry->gpfn = gpfn;
24506 + }
24507 +
24508 +out:
24509 + return error;
24510 +}
24511 +
24512 +static void
24513 +xen_ia64_privcmd_entry_munmap(struct xen_ia64_privcmd_range* privcmd_range,
24514 + int i)
24515 +{
24516 + struct xen_ia64_privcmd_entry* entry = &privcmd_range->entries[i];
24517 + unsigned long gpfn = entry->gpfn;
24518 + //gpfn = (privcmd_range->res->start >> PAGE_SHIFT) +
24519 + // (vma->vm_pgoff - privcmd_range->pgoff);
24520 + int error;
24521 +
24522 + error = HYPERVISOR_zap_physmap(gpfn, 0);
24523 + if (error) {
24524 + BUG();//XXX
24525 + }
24526 + entry->gpfn = INVALID_GPFN;
24527 +}
24528 +
24529 +static void
24530 +xen_ia64_privcmd_entry_open(struct xen_ia64_privcmd_range* privcmd_range,
24531 + int i)
24532 +{
24533 + struct xen_ia64_privcmd_entry* entry = &privcmd_range->entries[i];
24534 + if (entry->gpfn != INVALID_GPFN) {
24535 + atomic_inc(&entry->map_count);
24536 + } else {
24537 + BUG_ON(atomic_read(&entry->map_count) != 0);
24538 + }
24539 +}
24540 +
24541 +static void
24542 +xen_ia64_privcmd_entry_close(struct xen_ia64_privcmd_range* privcmd_range,
24543 + int i)
24544 +{
24545 + struct xen_ia64_privcmd_entry* entry = &privcmd_range->entries[i];
24546 + if (entry->gpfn != INVALID_GPFN &&
24547 + atomic_dec_and_test(&entry->map_count)) {
24548 + xen_ia64_privcmd_entry_munmap(privcmd_range, i);
24549 + }
24550 +}
24551 +
24552 +static void xen_ia64_privcmd_vma_open(struct vm_area_struct* vma);
24553 +static void xen_ia64_privcmd_vma_close(struct vm_area_struct* vma);
24554 +
24555 +struct vm_operations_struct xen_ia64_privcmd_vm_ops = {
24556 + .open = &xen_ia64_privcmd_vma_open,
24557 + .close = &xen_ia64_privcmd_vma_close,
24558 +};
24559 +
24560 +static void
24561 +__xen_ia64_privcmd_vma_open(struct vm_area_struct* vma,
24562 + struct xen_ia64_privcmd_vma* privcmd_vma,
24563 + struct xen_ia64_privcmd_range* privcmd_range)
24564 +{
24565 + unsigned long entry_offset = vma->vm_pgoff - privcmd_range->pgoff;
24566 + unsigned long num_entries = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
24567 + unsigned long i;
24568 +
24569 + BUG_ON(entry_offset < 0);
24570 + BUG_ON(entry_offset + num_entries > privcmd_range->num_entries);
24571 +
24572 + privcmd_vma->range = privcmd_range;
24573 + privcmd_vma->num_entries = num_entries;
24574 + privcmd_vma->entries = &privcmd_range->entries[entry_offset];
24575 + vma->vm_private_data = privcmd_vma;
24576 + for (i = 0; i < privcmd_vma->num_entries; i++) {
24577 + xen_ia64_privcmd_entry_open(privcmd_range, entry_offset + i);
24578 + }
24579 +
24580 + vma->vm_private_data = privcmd_vma;
24581 + vma->vm_ops = &xen_ia64_privcmd_vm_ops;
24582 +}
24583 +
24584 +static void
24585 +xen_ia64_privcmd_vma_open(struct vm_area_struct* vma)
24586 +{
24587 + struct xen_ia64_privcmd_vma* old_privcmd_vma = (struct xen_ia64_privcmd_vma*)vma->vm_private_data;
24588 + struct xen_ia64_privcmd_vma* privcmd_vma = (struct xen_ia64_privcmd_vma*)vma->vm_private_data;
24589 + struct xen_ia64_privcmd_range* privcmd_range = privcmd_vma->range;
24590 +
24591 + atomic_inc(&privcmd_range->ref_count);
24592 + // vm_op->open() can't fail.
24593 + privcmd_vma = kmalloc(sizeof(*privcmd_vma), GFP_KERNEL | __GFP_NOFAIL);
24594 + // copy original value if necessary
24595 + privcmd_vma->is_privcmd_mmapped = old_privcmd_vma->is_privcmd_mmapped;
24596 +
24597 + __xen_ia64_privcmd_vma_open(vma, privcmd_vma, privcmd_range);
24598 +}
24599 +
24600 +static void
24601 +xen_ia64_privcmd_vma_close(struct vm_area_struct* vma)
24602 +{
24603 + struct xen_ia64_privcmd_vma* privcmd_vma =
24604 + (struct xen_ia64_privcmd_vma*)vma->vm_private_data;
24605 + struct xen_ia64_privcmd_range* privcmd_range = privcmd_vma->range;
24606 + unsigned long entry_offset = vma->vm_pgoff - privcmd_range->pgoff;
24607 + unsigned long i;
24608 +
24609 + for (i = 0; i < privcmd_vma->num_entries; i++) {
24610 + xen_ia64_privcmd_entry_close(privcmd_range, entry_offset + i);
24611 + }
24612 + vma->vm_private_data = NULL;
24613 + kfree(privcmd_vma);
24614 +
24615 + if (atomic_dec_and_test(&privcmd_range->ref_count)) {
24616 +#if 1
24617 + for (i = 0; i < privcmd_range->num_entries; i++) {
24618 + struct xen_ia64_privcmd_entry* entry =
24619 + &privcmd_range->entries[i];
24620 + BUG_ON(atomic_read(&entry->map_count) != 0);
24621 + BUG_ON(entry->gpfn != INVALID_GPFN);
24622 + }
24623 +#endif
24624 + release_resource(privcmd_range->res);
24625 + kfree(privcmd_range->res);
24626 + vfree(privcmd_range);
24627 + }
24628 +}
24629 +
24630 +int
24631 +privcmd_enforce_singleshot_mapping(struct vm_area_struct *vma)
24632 +{
24633 + struct xen_ia64_privcmd_vma* privcmd_vma =
24634 + (struct xen_ia64_privcmd_vma *)vma->vm_private_data;
24635 + return (xchg(&privcmd_vma->is_privcmd_mmapped, 1) == 0);
24636 +}
24637 +
24638 +int
24639 +privcmd_mmap(struct file * file, struct vm_area_struct * vma)
24640 +{
24641 + int error;
24642 + unsigned long size = vma->vm_end - vma->vm_start;
24643 + unsigned long num_entries = size >> PAGE_SHIFT;
24644 + struct xen_ia64_privcmd_range* privcmd_range = NULL;
24645 + struct xen_ia64_privcmd_vma* privcmd_vma = NULL;
24646 + struct resource* res = NULL;
24647 + unsigned long i;
24648 + BUG_ON(!is_running_on_xen());
24649 +
24650 + BUG_ON(file->private_data != NULL);
24651 +
24652 + error = -ENOMEM;
24653 + privcmd_range =
24654 + vmalloc(sizeof(*privcmd_range) +
24655 + sizeof(privcmd_range->entries[0]) * num_entries);
24656 + if (privcmd_range == NULL) {
24657 + goto out_enomem0;
24658 + }
24659 + privcmd_vma = kmalloc(sizeof(*privcmd_vma), GFP_KERNEL);
24660 + if (privcmd_vma == NULL) {
24661 + goto out_enomem1;
24662 + }
24663 + privcmd_vma->is_privcmd_mmapped = 0;
24664 +
24665 + res = kzalloc(sizeof(*res), GFP_KERNEL);
24666 + if (res == NULL) {
24667 + goto out_enomem1;
24668 + }
24669 + res->name = "Xen privcmd mmap";
24670 + error = allocate_resource(&iomem_resource, res, size,
24671 + privcmd_resource_min, privcmd_resource_max,
24672 + privcmd_resource_align, NULL, NULL);
24673 + if (error) {
24674 + goto out_enomem1;
24675 + }
24676 + privcmd_range->res = res;
24677 +
24678 + /* DONTCOPY is essential for Xen as copy_page_range is broken. */
24679 + vma->vm_flags |= VM_RESERVED | VM_IO | VM_DONTCOPY | VM_PFNMAP;
24680 +
24681 + atomic_set(&privcmd_range->ref_count, 1);
24682 + privcmd_range->pgoff = vma->vm_pgoff;
24683 + privcmd_range->num_entries = num_entries;
24684 + for (i = 0; i < privcmd_range->num_entries; i++) {
24685 + xen_ia64_privcmd_init_entry(&privcmd_range->entries[i]);
24686 + }
24687 +
24688 + __xen_ia64_privcmd_vma_open(vma, privcmd_vma, privcmd_range);
24689 + return 0;
24690 +
24691 +out_enomem1:
24692 + kfree(res);
24693 + kfree(privcmd_vma);
24694 +out_enomem0:
24695 + vfree(privcmd_range);
24696 + return error;
24697 +}
24698 +
24699 +int
24700 +direct_remap_pfn_range(struct vm_area_struct *vma,
24701 + unsigned long address, // process virtual address
24702 + unsigned long gmfn, // gmfn, gmfn + 1, ... gmfn + size/PAGE_SIZE
24703 + unsigned long size,
24704 + pgprot_t prot,
24705 + domid_t domid) // target domain
24706 +{
24707 + struct xen_ia64_privcmd_vma* privcmd_vma =
24708 + (struct xen_ia64_privcmd_vma*)vma->vm_private_data;
24709 + struct xen_ia64_privcmd_range* privcmd_range = privcmd_vma->range;
24710 + unsigned long entry_offset = vma->vm_pgoff - privcmd_range->pgoff;
24711 +
24712 + unsigned long i;
24713 + unsigned long offset;
24714 + int error = 0;
24715 + BUG_ON(!is_running_on_xen());
24716 +
24717 +#if 0
24718 + if (prot != vm->vm_page_prot) {
24719 + return -EINVAL;
24720 + }
24721 +#endif
24722 +
24723 + i = (address - vma->vm_start) >> PAGE_SHIFT;
24724 + for (offset = 0; offset < size; offset += PAGE_SIZE) {
24725 + error = xen_ia64_privcmd_entry_mmap(vma, (address + offset) & PAGE_MASK, privcmd_range, entry_offset + i, gmfn, prot, domid);
24726 + if (error != 0) {
24727 + break;
24728 + }
24729 +
24730 + i++;
24731 + gmfn++;
24732 + }
24733 +
24734 + return error;
24735 +}
24736 +
24737 +
24738 +/* Called after suspend, to resume time. */
24739 +void
24740 +time_resume(void)
24741 +{
24742 + extern void ia64_cpu_local_tick(void);
24743 +
24744 + /* Just trigger a tick. */
24745 + ia64_cpu_local_tick();
24746 +}
24747 +
24748 +///////////////////////////////////////////////////////////////////////////
24749 +// expose p2m table
24750 +#ifdef CONFIG_XEN_IA64_EXPOSE_P2M
24751 +#include <linux/cpu.h>
24752 +#include <asm/uaccess.h>
24753 +
24754 +int p2m_initialized __read_mostly = 0;
24755 +
24756 +unsigned long p2m_min_low_pfn __read_mostly;
24757 +unsigned long p2m_max_low_pfn __read_mostly;
24758 +unsigned long p2m_convert_min_pfn __read_mostly;
24759 +unsigned long p2m_convert_max_pfn __read_mostly;
24760 +
24761 +static struct resource p2m_resource = {
24762 + .name = "Xen p2m table",
24763 + .flags = IORESOURCE_MEM,
24764 +};
24765 +static unsigned long p2m_assign_start_pfn __read_mostly;
24766 +static unsigned long p2m_assign_end_pfn __read_mostly;
24767 +volatile const pte_t* p2m_pte __read_mostly;
24768 +
24769 +#define GRNULE_PFN PTRS_PER_PTE
24770 +static unsigned long p2m_granule_pfn __read_mostly = GRNULE_PFN;
24771 +
24772 +#define ROUNDDOWN(x, y) ((x) & ~((y) - 1))
24773 +#define ROUNDUP(x, y) (((x) + (y) - 1) & ~((y) - 1))
24774 +
24775 +#define P2M_PREFIX "Xen p2m: "
24776 +
24777 +static int xen_ia64_p2m_expose __read_mostly = 1;
24778 +module_param(xen_ia64_p2m_expose, int, 0);
24779 +MODULE_PARM_DESC(xen_ia64_p2m_expose,
24780 + "enable/disable xen/ia64 p2m exposure optimization\n");
24781 +
24782 +#ifdef CONFIG_XEN_IA64_EXPOSE_P2M_USE_DTR
24783 +static int xen_ia64_p2m_expose_use_dtr __read_mostly = 1;
24784 +module_param(xen_ia64_p2m_expose_use_dtr, int, 0);
24785 +MODULE_PARM_DESC(xen_ia64_p2m_expose_use_dtr,
24786 + "use/unuse dtr to map exposed p2m table\n");
24787 +
24788 +static const int p2m_page_shifts[] = {
24789 + _PAGE_SIZE_4K,
24790 + _PAGE_SIZE_8K,
24791 + _PAGE_SIZE_16K,
24792 + _PAGE_SIZE_64K,
24793 + _PAGE_SIZE_256K,
24794 + _PAGE_SIZE_1M,
24795 + _PAGE_SIZE_4M,
24796 + _PAGE_SIZE_16M,
24797 + _PAGE_SIZE_64M,
24798 + _PAGE_SIZE_256M,
24799 +};
24800 +
24801 +struct p2m_itr_arg {
24802 + unsigned long vaddr;
24803 + unsigned long pteval;
24804 + unsigned long log_page_size;
24805 +};
24806 +static struct p2m_itr_arg p2m_itr_arg __read_mostly;
24807 +
24808 +// This should be in asm-ia64/kregs.h
24809 +#define IA64_TR_P2M_TABLE 3
24810 +
24811 +static void
24812 +p2m_itr(void* info)
24813 +{
24814 + struct p2m_itr_arg* arg = (struct p2m_itr_arg*)info;
24815 + ia64_itr(0x2, IA64_TR_P2M_TABLE,
24816 + arg->vaddr, arg->pteval, arg->log_page_size);
24817 + ia64_srlz_d();
24818 +}
24819 +
24820 +static int
24821 +p2m_expose_dtr_call(struct notifier_block *self,
24822 + unsigned long event, void* ptr)
24823 +{
24824 + unsigned int cpu = (unsigned int)(long)ptr;
24825 + if (event != CPU_ONLINE)
24826 + return 0;
24827 +	if (p2m_initialized && xen_ia64_p2m_expose_use_dtr)
24828 + smp_call_function_single(cpu, &p2m_itr, &p2m_itr_arg, 1, 1);
24829 + return 0;
24830 +}
24831 +
24832 +static struct notifier_block p2m_expose_dtr_hotplug_notifier = {
24833 + .notifier_call = p2m_expose_dtr_call,
24834 + .next = NULL,
24835 + .priority = 0
24836 +};
24837 +#endif
24838 +
24839 +static int
24840 +p2m_expose_init(void)
24841 +{
24842 + unsigned long num_pfn;
24843 + unsigned long size = 0;
24844 + unsigned long p2m_size = 0;
24845 + unsigned long align = ~0UL;
24846 + int error = 0;
24847 +#ifdef CONFIG_XEN_IA64_EXPOSE_P2M_USE_DTR
24848 + int i;
24849 + unsigned long page_size;
24850 + unsigned long log_page_size = 0;
24851 +#endif
24852 +
24853 + if (!xen_ia64_p2m_expose)
24854 + return -ENOSYS;
24855 + if (p2m_initialized)
24856 + return 0;
24857 +
24858 +#ifdef CONFIG_XEN_IA64_EXPOSE_P2M_USE_DTR
24859 + error = register_cpu_notifier(&p2m_expose_dtr_hotplug_notifier);
24860 + if (error < 0)
24861 + return error;
24862 +#endif
24863 +
24864 + lock_cpu_hotplug();
24865 + if (p2m_initialized)
24866 + goto out;
24867 +
24868 +#ifdef CONFIG_DISCONTIGMEM
24869 + p2m_min_low_pfn = min_low_pfn;
24870 + p2m_max_low_pfn = max_low_pfn;
24871 +#else
24872 + p2m_min_low_pfn = 0;
24873 + p2m_max_low_pfn = max_pfn;
24874 +#endif
24875 +
24876 +#ifdef CONFIG_XEN_IA64_EXPOSE_P2M_USE_DTR
24877 + if (xen_ia64_p2m_expose_use_dtr) {
24878 + unsigned long granule_pfn = 0;
24879 + p2m_size = p2m_max_low_pfn - p2m_min_low_pfn;
24880 + for (i = 0;
24881 + i < sizeof(p2m_page_shifts)/sizeof(p2m_page_shifts[0]);
24882 + i++) {
24883 + log_page_size = p2m_page_shifts[i];
24884 + page_size = 1UL << log_page_size;
24885 + if (page_size < p2m_size)
24886 + continue;
24887 +
24888 + granule_pfn = max(page_size >> PAGE_SHIFT,
24889 + p2m_granule_pfn);
24890 + p2m_convert_min_pfn = ROUNDDOWN(p2m_min_low_pfn,
24891 + granule_pfn);
24892 + p2m_convert_max_pfn = ROUNDUP(p2m_max_low_pfn,
24893 + granule_pfn);
24894 + num_pfn = p2m_convert_max_pfn - p2m_convert_min_pfn;
24895 + size = num_pfn << PAGE_SHIFT;
24896 + p2m_size = num_pfn / PTRS_PER_PTE;
24897 + p2m_size = ROUNDUP(p2m_size, granule_pfn << PAGE_SHIFT);
24898 + if (p2m_size == page_size)
24899 + break;
24900 + }
24901 + if (p2m_size != page_size) {
24902 + printk(KERN_ERR "p2m_size != page_size\n");
24903 + error = -EINVAL;
24904 + goto out;
24905 + }
24906 + align = max(privcmd_resource_align, granule_pfn << PAGE_SHIFT);
24907 + } else
24908 +#endif
24909 + {
24910 + BUG_ON(p2m_granule_pfn & (p2m_granule_pfn - 1));
24911 + p2m_convert_min_pfn = ROUNDDOWN(p2m_min_low_pfn,
24912 + p2m_granule_pfn);
24913 + p2m_convert_max_pfn = ROUNDUP(p2m_max_low_pfn, p2m_granule_pfn);
24914 + num_pfn = p2m_convert_max_pfn - p2m_convert_min_pfn;
24915 + size = num_pfn << PAGE_SHIFT;
24916 + p2m_size = num_pfn / PTRS_PER_PTE;
24917 + p2m_size = ROUNDUP(p2m_size, p2m_granule_pfn << PAGE_SHIFT);
24918 + align = max(privcmd_resource_align,
24919 + p2m_granule_pfn << PAGE_SHIFT);
24920 + }
24921 +
24922 + // use privcmd region
24923 + error = allocate_resource(&iomem_resource, &p2m_resource, p2m_size,
24924 + privcmd_resource_min, privcmd_resource_max,
24925 + align, NULL, NULL);
24926 + if (error) {
24927 + printk(KERN_ERR P2M_PREFIX
24928 + "can't allocate region for p2m exposure "
24929 + "[0x%016lx, 0x%016lx) 0x%016lx\n",
24930 + p2m_convert_min_pfn, p2m_convert_max_pfn, p2m_size);
24931 + goto out;
24932 + }
24933 +
24934 + p2m_assign_start_pfn = p2m_resource.start >> PAGE_SHIFT;
24935 + p2m_assign_end_pfn = p2m_resource.end >> PAGE_SHIFT;
24936 +
24937 + error = HYPERVISOR_expose_p2m(p2m_convert_min_pfn,
24938 + p2m_assign_start_pfn,
24939 + size, p2m_granule_pfn);
24940 + if (error) {
24941 + printk(KERN_ERR P2M_PREFIX "failed expose p2m hypercall %d\n",
24942 + error);
24943 + printk(KERN_ERR P2M_PREFIX "conv 0x%016lx assign 0x%016lx "
24944 + "size 0x%016lx granule 0x%016lx\n",
24945 + p2m_convert_min_pfn, p2m_assign_start_pfn,
24946 +		       size, p2m_granule_pfn);
24947 + release_resource(&p2m_resource);
24948 + goto out;
24949 + }
24950 + p2m_pte = (volatile const pte_t*)pfn_to_kaddr(p2m_assign_start_pfn);
24951 +#ifdef CONFIG_XEN_IA64_EXPOSE_P2M_USE_DTR
24952 + if (xen_ia64_p2m_expose_use_dtr) {
24953 + p2m_itr_arg.vaddr = (unsigned long)__va(p2m_assign_start_pfn
24954 + << PAGE_SHIFT);
24955 + p2m_itr_arg.pteval = pte_val(pfn_pte(p2m_assign_start_pfn,
24956 + PAGE_KERNEL));
24957 + p2m_itr_arg.log_page_size = log_page_size;
24958 + smp_mb();
24959 + smp_call_function(&p2m_itr, &p2m_itr_arg, 1, 1);
24960 + p2m_itr(&p2m_itr_arg);
24961 + }
24962 +#endif
24963 + smp_mb();
24964 + p2m_initialized = 1;
24965 + printk(P2M_PREFIX "assign p2m table of [0x%016lx, 0x%016lx)\n",
24966 + p2m_convert_min_pfn << PAGE_SHIFT,
24967 + p2m_convert_max_pfn << PAGE_SHIFT);
24968 + printk(P2M_PREFIX "to [0x%016lx, 0x%016lx) (%ld KBytes)\n",
24969 + p2m_assign_start_pfn << PAGE_SHIFT,
24970 + p2m_assign_end_pfn << PAGE_SHIFT,
24971 + p2m_size / 1024);
24972 +out:
24973 + unlock_cpu_hotplug();
24974 + return error;
24975 +}
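
For readers tracing the arithmetic above: the granule alignment in p2m_expose_init() reduces to the ROUNDDOWN/ROUNDUP macros defined earlier in this file. A minimal standalone sketch follows (the macros are copied from the patch; the pfn values are invented for illustration):

    #include <stdio.h>

    #define ROUNDDOWN(x, y) ((x) & ~((y) - 1))
    #define ROUNDUP(x, y)   (((x) + (y) - 1) & ~((y) - 1))

    int main(void)
    {
            unsigned long min_pfn = 0x101;      /* hypothetical p2m_min_low_pfn */
            unsigned long max_pfn = 0x7f3;      /* hypothetical p2m_max_low_pfn */
            unsigned long granule_pfn = 0x100;  /* must be a power of two */

            /* Round the range outwards to granule boundaries, as the patch does. */
            unsigned long conv_min = ROUNDDOWN(min_pfn, granule_pfn); /* 0x100 */
            unsigned long conv_max = ROUNDUP(max_pfn, granule_pfn);   /* 0x800 */

            printf("aligned range [0x%lx, 0x%lx), %lu pfns\n",
                   conv_min, conv_max, conv_max - conv_min);
            return 0;
    }
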
24976 +
24977 +#ifdef notyet
24978 +void
24979 +p2m_expose_cleanup(void)
24980 +{
24981 + BUG_ON(!p2m_initialized);
24982 +#ifdef CONFIG_XEN_IA64_EXPOSE_P2M_USE_DTR
24983 + unregister_cpu_notifier(&p2m_expose_dtr_hotplug_notifier);
24984 +#endif
24985 + release_resource(&p2m_resource);
24986 +}
24987 +#endif
24988 +
24989 +//XXX inlinize?
24990 +unsigned long
24991 +p2m_phystomach(unsigned long gpfn)
24992 +{
24993 + volatile const pte_t* pte;
24994 + unsigned long mfn;
24995 + unsigned long pteval;
24996 +
24997 + if (!p2m_initialized ||
24998 + gpfn < p2m_min_low_pfn || gpfn > p2m_max_low_pfn
24999 + /* || !pfn_valid(gpfn) */)
25000 + return INVALID_MFN;
25001 + pte = p2m_pte + (gpfn - p2m_convert_min_pfn);
25002 +
25003 + mfn = INVALID_MFN;
25004 + if (likely(__get_user(pteval, (unsigned long __user *)pte) == 0 &&
25005 + pte_present(__pte(pteval)) &&
25006 + pte_pfn(__pte(pteval)) != (INVALID_MFN >> PAGE_SHIFT)))
25007 + mfn = (pteval & _PFN_MASK) >> PAGE_SHIFT;
25008 +
25009 + return mfn;
25010 +}
25011 +
25012 +EXPORT_SYMBOL_GPL(p2m_initialized);
25013 +EXPORT_SYMBOL_GPL(p2m_min_low_pfn);
25014 +EXPORT_SYMBOL_GPL(p2m_max_low_pfn);
25015 +EXPORT_SYMBOL_GPL(p2m_convert_min_pfn);
25016 +EXPORT_SYMBOL_GPL(p2m_convert_max_pfn);
25017 +EXPORT_SYMBOL_GPL(p2m_pte);
25018 +EXPORT_SYMBOL_GPL(p2m_phystomach);
25019 +#endif
25020 +
25021 +///////////////////////////////////////////////////////////////////////////
25022 +// for xenoprof
25023 +
25024 +struct resource*
25025 +xen_ia64_allocate_resource(unsigned long size)
25026 +{
25027 + struct resource* res;
25028 + int error;
25029 +
25030 + res = kmalloc(sizeof(*res), GFP_KERNEL);
25031 + if (res == NULL)
25032 + return ERR_PTR(-ENOMEM);
25033 +
25034 + res->name = "Xen";
25035 + res->flags = IORESOURCE_MEM;
25036 + error = allocate_resource(&iomem_resource, res, PAGE_ALIGN(size),
25037 + privcmd_resource_min, privcmd_resource_max,
25038 + IA64_GRANULE_SIZE, NULL, NULL);
25039 + if (error) {
25040 + kfree(res);
25041 + return ERR_PTR(error);
25042 + }
25043 + return res;
25044 +}
25045 +EXPORT_SYMBOL_GPL(xen_ia64_allocate_resource);
25046 +
25047 +void
25048 +xen_ia64_release_resource(struct resource* res)
25049 +{
25050 + release_resource(res);
25051 + kfree(res);
25052 +}
25053 +EXPORT_SYMBOL_GPL(xen_ia64_release_resource);
25054 +
25055 +void
25056 +xen_ia64_unmap_resource(struct resource* res)
25057 +{
25058 + unsigned long gpfn = res->start >> PAGE_SHIFT;
25059 + unsigned long nr_pages = (res->end - res->start) >> PAGE_SHIFT;
25060 + unsigned long i;
25061 +
25062 + for (i = 0; i < nr_pages; i++) {
25063 + int error = HYPERVISOR_zap_physmap(gpfn + i, 0);
25064 + if (error)
25065 + printk(KERN_ERR
25066 +		       "%s:%d zap_physmap failed %d gpfn %lx\n",
25067 + __func__, __LINE__, error, gpfn + i);
25068 + }
25069 + xen_ia64_release_resource(res);
25070 +}
25071 +EXPORT_SYMBOL_GPL(xen_ia64_unmap_resource);
25072 diff -Nur linux-2.6.16.33-noxen/arch/ia64/xen/mem.c linux-2.6.16.33/arch/ia64/xen/mem.c
25073 --- linux-2.6.16.33-noxen/arch/ia64/xen/mem.c 1970-01-01 00:00:00.000000000 +0000
25074 +++ linux-2.6.16.33/arch/ia64/xen/mem.c 2007-01-08 15:00:45.000000000 +0000
25075 @@ -0,0 +1,76 @@
25076 +/*
25077 + * Originally from linux/drivers/char/mem.c
25078 + *
25079 + * Copyright (C) 1991, 1992 Linus Torvalds
25080 + *
25081 + * Added devfs support.
25082 + * Jan-11-1998, C. Scott Ananian <cananian@alumni.princeton.edu>
25083 + * Shared /dev/zero mmaping support, Feb 2000, Kanoj Sarcar <kanoj@sgi.com>
25084 + */
25085 +/*
25086 + * taken from
25087 + * linux/drivers/char/mem.c and linux-2.6-xen-sparse/drivers/xen/char/mem.c.
25088 + * adjusted for IA64 and made transparent.
25089 + * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp>
25090 + * VA Linux Systems Japan K.K.
25091 + */
25092 +
25093 +#include <linux/config.h>
25094 +#include <linux/mm.h>
25095 +#include <linux/efi.h>
25096 +
25097 +/*
25098 + * Architectures vary in how they handle caching for addresses
25099 + * outside of main memory.
25100 + *
25101 + */
25102 +static inline int uncached_access(struct file *file, unsigned long addr)
25103 +{
25104 + /*
25105 + * On ia64, we ignore O_SYNC because we cannot tolerate memory attribute aliases.
25106 + */
25107 + return !(efi_mem_attributes(addr) & EFI_MEMORY_WB);
25108 +}
25109 +
25110 +int xen_mmap_mem(struct file * file, struct vm_area_struct * vma)
25111 +{
25112 + unsigned long addr = vma->vm_pgoff << PAGE_SHIFT;
25113 + size_t size = vma->vm_end - vma->vm_start;
25114 +
25115 +
25116 +#if 0
25117 + /*
25118 + *XXX FIXME: linux-2.6.16.29, linux-2.6.17
25119 + * valid_mmap_phys_addr_range() in linux/arch/ia64/kernel/efi.c
25120 +	 * fails its checks.
25121 +	 * The linux-2.6.18.1 version always returns 1.
25122 +	 * Its comment says:
25123 + *
25124 + * MMIO regions are often missing from the EFI memory map.
25125 + * We must allow mmap of them for programs like X, so we
25126 + * currently can't do any useful validation.
25127 + */
25128 + if (!valid_mmap_phys_addr_range(addr, &size))
25129 + return -EINVAL;
25130 + if (size < vma->vm_end - vma->vm_start)
25131 + return -EINVAL;
25132 +#endif
25133 +
25134 + if (is_running_on_xen()) {
25135 + unsigned long offset = HYPERVISOR_ioremap(addr, size);
25136 + if (IS_ERR_VALUE(offset))
25137 + return offset;
25138 + }
25139 +
25140 + if (uncached_access(file, vma->vm_pgoff << PAGE_SHIFT))
25141 + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
25142 +
25143 + /* Remap-pfn-range will mark the range VM_IO and VM_RESERVED */
25144 + if (remap_pfn_range(vma,
25145 + vma->vm_start,
25146 + vma->vm_pgoff,
25147 + size,
25148 + vma->vm_page_prot))
25149 + return -EAGAIN;
25150 + return 0;
25151 +}
25152 diff -Nur linux-2.6.16.33-noxen/arch/ia64/xen/util.c linux-2.6.16.33/arch/ia64/xen/util.c
25153 --- linux-2.6.16.33-noxen/arch/ia64/xen/util.c 1970-01-01 00:00:00.000000000 +0000
25154 +++ linux-2.6.16.33/arch/ia64/xen/util.c 2007-01-08 15:00:45.000000000 +0000
25155 @@ -0,0 +1,118 @@
25156 +/******************************************************************************
25157 + * arch/ia64/xen/util.c
25158 + * This file is the ia64 counterpart of drivers/xen/util.c
25159 + *
25160 + * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp>
25161 + * VA Linux Systems Japan K.K.
25162 + *
25163 + * This program is free software; you can redistribute it and/or modify
25164 + * it under the terms of the GNU General Public License as published by
25165 + * the Free Software Foundation; either version 2 of the License, or
25166 + * (at your option) any later version.
25167 + *
25168 + * This program is distributed in the hope that it will be useful,
25169 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
25170 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
25171 + * GNU General Public License for more details.
25172 + *
25173 + * You should have received a copy of the GNU General Public License
25174 + * along with this program; if not, write to the Free Software
25175 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
25176 + *
25177 + */
25178 +
25179 +#include <linux/config.h>
25180 +#include <linux/mm.h>
25181 +#include <linux/module.h>
25182 +#include <linux/slab.h>
25183 +#include <linux/vmalloc.h>
25184 +#include <asm/uaccess.h>
25185 +#include <xen/driver_util.h>
25186 +#include <xen/interface/memory.h>
25187 +#include <asm/hypercall.h>
25188 +
25189 +struct vm_struct *alloc_vm_area(unsigned long size)
25190 +{
25191 + int order;
25192 + unsigned long virt;
25193 + unsigned long nr_pages;
25194 + struct vm_struct* area;
25195 +
25196 + order = get_order(size);
25197 + virt = __get_free_pages(GFP_KERNEL, order);
25198 + if (virt == 0) {
25199 + goto err0;
25200 + }
25201 + nr_pages = 1 << order;
25202 + scrub_pages(virt, nr_pages);
25203 +
25204 + area = kmalloc(sizeof(*area), GFP_KERNEL);
25205 + if (area == NULL) {
25206 + goto err1;
25207 + }
25208 +
25209 +	area->flags = VM_IOREMAP; //XXX
25210 + area->addr = (void*)virt;
25211 + area->size = size;
25212 + area->pages = NULL; //XXX
25213 + area->nr_pages = nr_pages;
25214 + area->phys_addr = 0; /* xenbus_map_ring_valloc uses this field! */
25215 +
25216 + return area;
25217 +
25218 +err1:
25219 + free_pages(virt, order);
25220 +err0:
25221 + return NULL;
25222 +
25223 +}
25224 +EXPORT_SYMBOL_GPL(alloc_vm_area);
25225 +
25226 +void free_vm_area(struct vm_struct *area)
25227 +{
25228 + unsigned int order = get_order(area->size);
25229 + unsigned long i;
25230 + unsigned long phys_addr = __pa(area->addr);
25231 +
25232 +	// This area is used for foreign page mapping,
25233 +	// so the underlying machine pages may not be assigned.
25234 + for (i = 0; i < (1 << order); i++) {
25235 + unsigned long ret;
25236 + unsigned long gpfn = (phys_addr >> PAGE_SHIFT) + i;
25237 + struct xen_memory_reservation reservation = {
25238 + .nr_extents = 1,
25239 + .address_bits = 0,
25240 + .extent_order = 0,
25241 + .domid = DOMID_SELF
25242 + };
25243 + set_xen_guest_handle(reservation.extent_start, &gpfn);
25244 + ret = HYPERVISOR_memory_op(XENMEM_populate_physmap,
25245 + &reservation);
25246 + BUG_ON(ret != 1);
25247 + }
25248 + free_pages((unsigned long)area->addr, order);
25249 + kfree(area);
25250 +}
25251 +EXPORT_SYMBOL_GPL(free_vm_area);
25252 +
25253 +void lock_vm_area(struct vm_struct *area)
25254 +{
25255 + // nothing
25256 +}
25257 +EXPORT_SYMBOL_GPL(lock_vm_area);
25258 +
25259 +void unlock_vm_area(struct vm_struct *area)
25260 +{
25261 + // nothing
25262 +}
25263 +EXPORT_SYMBOL_GPL(unlock_vm_area);
25264 +
25265 +/*
25266 + * Local variables:
25267 + * c-file-style: "linux"
25268 + * indent-tabs-mode: t
25269 + * c-indent-level: 8
25270 + * c-basic-offset: 8
25271 + * tab-width: 8
25272 + * End:
25273 + */
25274 diff -Nur linux-2.6.16.33-noxen/arch/ia64/xen/xcom_hcall.c linux-2.6.16.33/arch/ia64/xen/xcom_hcall.c
25275 --- linux-2.6.16.33-noxen/arch/ia64/xen/xcom_hcall.c 1970-01-01 00:00:00.000000000 +0000
25276 +++ linux-2.6.16.33/arch/ia64/xen/xcom_hcall.c 2007-01-08 15:00:45.000000000 +0000
25277 @@ -0,0 +1,365 @@
25278 +/*
25279 + * This program is free software; you can redistribute it and/or modify
25280 + * it under the terms of the GNU General Public License as published by
25281 + * the Free Software Foundation; either version 2 of the License, or
25282 + * (at your option) any later version.
25283 + *
25284 + * This program is distributed in the hope that it will be useful,
25285 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
25286 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
25287 + * GNU General Public License for more details.
25288 + *
25289 + * You should have received a copy of the GNU General Public License
25290 + * along with this program; if not, write to the Free Software
25291 + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
25292 + *
25293 + * Tristan Gingold <tristan.gingold@bull.net>
25294 + */
25295 +#include <linux/types.h>
25296 +#include <linux/errno.h>
25297 +#include <linux/kernel.h>
25298 +#include <linux/gfp.h>
25299 +#include <linux/module.h>
25300 +#include <xen/interface/xen.h>
25301 +#include <xen/interface/dom0_ops.h>
25302 +#include <xen/interface/memory.h>
25303 +#include <xen/interface/xencomm.h>
25304 +#include <xen/interface/version.h>
25305 +#include <xen/interface/sched.h>
25306 +#include <xen/interface/event_channel.h>
25307 +#include <xen/interface/physdev.h>
25308 +#include <xen/interface/grant_table.h>
25309 +#include <xen/interface/callback.h>
25310 +#include <xen/interface/acm_ops.h>
25311 +#include <xen/interface/hvm/params.h>
25312 +#include <xen/interface/xenoprof.h>
25313 +#include <asm/hypercall.h>
25314 +#include <asm/page.h>
25315 +#include <asm/uaccess.h>
25316 +#include <asm/xen/xencomm.h>
25317 +#include <asm/perfmon.h>
25318 +
25319 +/* Xencomm notes:
25320 + * This file defines hypercalls to be used by xencomm. The hypercalls simply
25321 + * create inline descriptors for pointers and then call the raw arch hypercall
25322 + * xencomm_arch_hypercall_XXX
25323 + *
25324 + * If the arch wants to directly use these hypercalls, simply define macros
25325 + * in asm/hypercall.h, eg:
25326 + * #define HYPERVISOR_sched_op xencomm_hypercall_sched_op
25327 + *
25328 + * The arch may also define HYPERVISOR_xxx as a function and do more operations
25329 + * before/after doing the hypercall.
25330 + *
25331 + * Note: because only inline descriptors are created, these functions must only
25332 + * be called with parameters that reside in kernel memory.
25333 + */
25334 +
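
As a rough usage sketch of the pattern the note above describes: a hypothetical caller, assuming asm/hypercall.h defines HYPERVISOR_sched_op as xencomm_hypercall_sched_op; the structure and macro names follow the interfaces already used in this file, and the function itself is not part of the patch:

    /* Illustrative only, not part of the patch. */
    static int example_poll_one_port(evtchn_port_t port)
    {
            sched_poll_t poll = {
                    .nr_ports = 1,
                    .timeout  = 0,
            };

            /* The argument lives in kernel memory, as the note requires;
             * the wrapper replaces this handle with an inline xencomm
             * descriptor before issuing the raw arch hypercall. */
            set_xen_guest_handle(poll.ports, &port);

            return HYPERVISOR_sched_op(SCHEDOP_poll, &poll);
    }
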
25335 +int
25336 +xencomm_hypercall_console_io(int cmd, int count, char *str)
25337 +{
25338 + return xencomm_arch_hypercall_console_io
25339 + (cmd, count, xencomm_create_inline(str));
25340 +}
25341 +
25342 +int
25343 +xencomm_hypercall_event_channel_op(int cmd, void *op)
25344 +{
25345 + return xencomm_arch_hypercall_event_channel_op
25346 + (cmd, xencomm_create_inline(op));
25347 +}
25348 +
25349 +int
25350 +xencomm_hypercall_xen_version(int cmd, void *arg)
25351 +{
25352 + switch (cmd) {
25353 + case XENVER_version:
25354 + case XENVER_extraversion:
25355 + case XENVER_compile_info:
25356 + case XENVER_capabilities:
25357 + case XENVER_changeset:
25358 + case XENVER_platform_parameters:
25359 + case XENVER_pagesize:
25360 + case XENVER_get_features:
25361 + break;
25362 + default:
25363 + printk("%s: unknown version cmd %d\n", __func__, cmd);
25364 + return -ENOSYS;
25365 + }
25366 +
25367 + return xencomm_arch_hypercall_xen_version
25368 + (cmd, xencomm_create_inline(arg));
25369 +}
25370 +
25371 +int
25372 +xencomm_hypercall_physdev_op(int cmd, void *op)
25373 +{
25374 + return xencomm_arch_hypercall_physdev_op
25375 + (cmd, xencomm_create_inline(op));
25376 +}
25377 +
25378 +static void *
25379 +xencommize_grant_table_op(unsigned int cmd, void *op, unsigned int count)
25380 +{
25381 + switch (cmd) {
25382 + case GNTTABOP_map_grant_ref:
25383 + case GNTTABOP_unmap_grant_ref:
25384 + break;
25385 + case GNTTABOP_setup_table:
25386 + {
25387 + struct gnttab_setup_table *setup = op;
25388 + struct xencomm_handle *frame_list;
25389 +
25390 + frame_list = xencomm_create_inline
25391 + (xen_guest_handle(setup->frame_list));
25392 +
25393 + set_xen_guest_handle(setup->frame_list, (void *)frame_list);
25394 + break;
25395 + }
25396 + case GNTTABOP_dump_table:
25397 + case GNTTABOP_transfer:
25398 + case GNTTABOP_copy:
25399 + break;
25400 + default:
25401 + printk("%s: unknown grant table op %d\n", __func__, cmd);
25402 + BUG();
25403 + }
25404 +
25405 + return xencomm_create_inline(op);
25406 +}
25407 +
25408 +int
25409 +xencomm_hypercall_grant_table_op(unsigned int cmd, void *op, unsigned int count)
25410 +{
25411 + void *desc = xencommize_grant_table_op (cmd, op, count);
25412 +
25413 + return xencomm_arch_hypercall_grant_table_op(cmd, desc, count);
25414 +}
25415 +
25416 +int
25417 +xencomm_hypercall_sched_op(int cmd, void *arg)
25418 +{
25419 + switch (cmd) {
25420 + case SCHEDOP_yield:
25421 + case SCHEDOP_block:
25422 + case SCHEDOP_shutdown:
25423 + case SCHEDOP_remote_shutdown:
25424 + break;
25425 + case SCHEDOP_poll:
25426 + {
25427 + sched_poll_t *poll = arg;
25428 + struct xencomm_handle *ports;
25429 +
25430 + ports = xencomm_create_inline(xen_guest_handle(poll->ports));
25431 +
25432 + set_xen_guest_handle(poll->ports, (void *)ports);
25433 + break;
25434 + }
25435 + default:
25436 + printk("%s: unknown sched op %d\n", __func__, cmd);
25437 + return -ENOSYS;
25438 + }
25439 +
25440 + return xencomm_arch_hypercall_sched_op(cmd, xencomm_create_inline(arg));
25441 +}
25442 +
25443 +int
25444 +xencomm_hypercall_multicall(void *call_list, int nr_calls)
25445 +{
25446 + int i;
25447 + multicall_entry_t *mce;
25448 +
25449 + for (i = 0; i < nr_calls; i++) {
25450 + mce = (multicall_entry_t *)call_list + i;
25451 +
25452 + switch (mce->op) {
25453 + case __HYPERVISOR_update_va_mapping:
25454 + case __HYPERVISOR_mmu_update:
25455 + /* No-op on ia64. */
25456 + break;
25457 + case __HYPERVISOR_grant_table_op:
25458 + mce->args[1] = (unsigned long)xencommize_grant_table_op
25459 + (mce->args[0], (void *)mce->args[1],
25460 + mce->args[2]);
25461 + break;
25462 + case __HYPERVISOR_memory_op:
25463 + default:
25464 + printk("%s: unhandled multicall op entry op %lu\n",
25465 + __func__, mce->op);
25466 + return -ENOSYS;
25467 + }
25468 + }
25469 +
25470 + return xencomm_arch_hypercall_multicall
25471 + (xencomm_create_inline(call_list), nr_calls);
25472 +}
25473 +
25474 +int
25475 +xencomm_hypercall_callback_op(int cmd, void *arg)
25476 +{
25477 + switch (cmd)
25478 + {
25479 + case CALLBACKOP_register:
25480 + case CALLBACKOP_unregister:
25481 + break;
25482 + default:
25483 + printk("%s: unknown callback op %d\n", __func__, cmd);
25484 + return -ENOSYS;
25485 + }
25486 +
25487 + return xencomm_arch_hypercall_callback_op
25488 + (cmd, xencomm_create_inline(arg));
25489 +}
25490 +
25491 +static void
25492 +xencommize_memory_reservation (xen_memory_reservation_t *mop)
25493 +{
25494 + struct xencomm_handle *desc;
25495 +
25496 + desc = xencomm_create_inline(xen_guest_handle(mop->extent_start));
25497 + set_xen_guest_handle(mop->extent_start, (void *)desc);
25498 +}
25499 +
25500 +int
25501 +xencomm_hypercall_memory_op(unsigned int cmd, void *arg)
25502 +{
25503 + XEN_GUEST_HANDLE(xen_pfn_t) extent_start_va[2];
25504 + xen_memory_reservation_t *xmr = NULL, *xme_in = NULL, *xme_out = NULL;
25505 + int rc;
25506 +
25507 + switch (cmd) {
25508 + case XENMEM_increase_reservation:
25509 + case XENMEM_decrease_reservation:
25510 + case XENMEM_populate_physmap:
25511 + xmr = (xen_memory_reservation_t *)arg;
25512 + xen_guest_handle(extent_start_va[0]) =
25513 + xen_guest_handle(xmr->extent_start);
25514 + xencommize_memory_reservation((xen_memory_reservation_t *)arg);
25515 + break;
25516 +
25517 + case XENMEM_maximum_ram_page:
25518 + break;
25519 +
25520 + case XENMEM_exchange:
25521 + xme_in = &((xen_memory_exchange_t *)arg)->in;
25522 + xme_out = &((xen_memory_exchange_t *)arg)->out;
25523 + xen_guest_handle(extent_start_va[0]) =
25524 + xen_guest_handle(xme_in->extent_start);
25525 + xen_guest_handle(extent_start_va[1]) =
25526 + xen_guest_handle(xme_out->extent_start);
25527 + xencommize_memory_reservation
25528 + (&((xen_memory_exchange_t *)arg)->in);
25529 + xencommize_memory_reservation
25530 + (&((xen_memory_exchange_t *)arg)->out);
25531 + break;
25532 +
25533 + default:
25534 + printk("%s: unknown memory op %d\n", __func__, cmd);
25535 + return -ENOSYS;
25536 + }
25537 +
25538 + rc = xencomm_arch_hypercall_memory_op(cmd, xencomm_create_inline(arg));
25539 +
25540 + switch (cmd) {
25541 + case XENMEM_increase_reservation:
25542 + case XENMEM_decrease_reservation:
25543 + case XENMEM_populate_physmap:
25544 + xen_guest_handle(xmr->extent_start) =
25545 + xen_guest_handle(extent_start_va[0]);
25546 + break;
25547 +
25548 + case XENMEM_exchange:
25549 + xen_guest_handle(xme_in->extent_start) =
25550 + xen_guest_handle(extent_start_va[0]);
25551 + xen_guest_handle(xme_out->extent_start) =
25552 + xen_guest_handle(extent_start_va[1]);
25553 + break;
25554 + }
25555 +
25556 + return rc;
25557 +}
25558 +
25559 +unsigned long
25560 +xencomm_hypercall_hvm_op(int cmd, void *arg)
25561 +{
25562 + switch (cmd) {
25563 + case HVMOP_set_param:
25564 + case HVMOP_get_param:
25565 + break;
25566 + default:
25567 + printk("%s: unknown hvm op %d\n", __func__, cmd);
25568 + return -ENOSYS;
25569 + }
25570 +
25571 + return xencomm_arch_hypercall_hvm_op(cmd, xencomm_create_inline(arg));
25572 +}
25573 +
25574 +int
25575 +xencomm_hypercall_suspend(unsigned long srec)
25576 +{
25577 + struct sched_shutdown arg;
25578 +
25579 + arg.reason = SHUTDOWN_suspend;
25580 +
25581 + return xencomm_arch_hypercall_suspend(xencomm_create_inline(&arg));
25582 +}
25583 +
25584 +int
25585 +xencomm_hypercall_xenoprof_op(int op, void *arg)
25586 +{
25587 + switch (op) {
25588 + case XENOPROF_init:
25589 + case XENOPROF_set_active:
25590 + case XENOPROF_set_passive:
25591 + case XENOPROF_counter:
25592 + case XENOPROF_get_buffer:
25593 + break;
25594 +
25595 + case XENOPROF_reset_active_list:
25596 + case XENOPROF_reset_passive_list:
25597 + case XENOPROF_reserve_counters:
25598 + case XENOPROF_setup_events:
25599 + case XENOPROF_enable_virq:
25600 + case XENOPROF_start:
25601 + case XENOPROF_stop:
25602 + case XENOPROF_disable_virq:
25603 + case XENOPROF_release_counters:
25604 + case XENOPROF_shutdown:
25605 + return xencomm_arch_hypercall_xenoprof_op(op, arg);
25606 + break;
25607 +
25608 + default:
25609 + printk("%s: op %d isn't supported\n", __func__, op);
25610 + return -ENOSYS;
25611 + }
25612 + return xencomm_arch_hypercall_xenoprof_op(op,
25613 + xencomm_create_inline(arg));
25614 +}
25615 +
25616 +int
25617 +xencomm_hypercall_perfmon_op(unsigned long cmd, void* arg, unsigned long count)
25618 +{
25619 + switch (cmd) {
25620 + case PFM_GET_FEATURES:
25621 + case PFM_CREATE_CONTEXT:
25622 + case PFM_WRITE_PMCS:
25623 + case PFM_WRITE_PMDS:
25624 + case PFM_LOAD_CONTEXT:
25625 + break;
25626 +
25627 + case PFM_DESTROY_CONTEXT:
25628 + case PFM_UNLOAD_CONTEXT:
25629 + case PFM_START:
25630 + case PFM_STOP:
25631 + return xencomm_arch_hypercall_perfmon_op(cmd, arg, count);
25632 +
25633 + default:
25634 + printk("%s:%d cmd %ld isn't supported\n",
25635 +	       __func__, __LINE__, cmd);
25636 + BUG();
25637 + }
25638 +
25639 + return xencomm_arch_hypercall_perfmon_op(cmd,
25640 + xencomm_create_inline(arg),
25641 + count);
25642 +}
25643 diff -Nur linux-2.6.16.33-noxen/arch/ia64/xen/xcom_mini.c linux-2.6.16.33/arch/ia64/xen/xcom_mini.c
25644 --- linux-2.6.16.33-noxen/arch/ia64/xen/xcom_mini.c 1970-01-01 00:00:00.000000000 +0000
25645 +++ linux-2.6.16.33/arch/ia64/xen/xcom_mini.c 2007-01-08 15:00:45.000000000 +0000
25646 @@ -0,0 +1,417 @@
25647 +/*
25648 + * This program is free software; you can redistribute it and/or modify
25649 + * it under the terms of the GNU General Public License as published by
25650 + * the Free Software Foundation; either version 2 of the License, or
25651 + * (at your option) any later version.
25652 + *
25653 + * This program is distributed in the hope that it will be useful,
25654 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
25655 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
25656 + * GNU General Public License for more details.
25657 + *
25658 + * You should have received a copy of the GNU General Public License
25659 + * along with this program; if not, write to the Free Software
25660 + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
25661 + *
25662 + * Tristan Gingold <tristan.gingold@bull.net>
25663 + */
25664 +#include <linux/types.h>
25665 +#include <linux/errno.h>
25666 +#include <linux/kernel.h>
25667 +#include <linux/module.h>
25668 +#include <xen/interface/xen.h>
25669 +#include <xen/interface/dom0_ops.h>
25670 +#include <xen/interface/memory.h>
25671 +#include <xen/interface/xencomm.h>
25672 +#include <xen/interface/version.h>
25673 +#include <xen/interface/event_channel.h>
25674 +#include <xen/interface/physdev.h>
25675 +#include <xen/interface/grant_table.h>
25676 +#include <xen/interface/hvm/params.h>
25677 +#include <xen/interface/xenoprof.h>
25678 +#ifdef CONFIG_VMX_GUEST
25679 +#include <asm/hypervisor.h>
25680 +#else
25681 +#include <asm/hypercall.h>
25682 +#endif
25683 +#include <asm/xen/xencomm.h>
25684 +#include <asm/perfmon.h>
25685 +
25686 +int
25687 +xencomm_mini_hypercall_event_channel_op(int cmd, void *op)
25688 +{
25689 + struct xencomm_mini xc_area[2];
25690 + int nbr_area = 2;
25691 + struct xencomm_handle *desc;
25692 + int rc;
25693 +
25694 + rc = xencomm_create_mini(xc_area, &nbr_area,
25695 + op, sizeof(evtchn_op_t), &desc);
25696 + if (rc)
25697 + return rc;
25698 +
25699 + return xencomm_arch_hypercall_event_channel_op(cmd, desc);
25700 +}
25701 +EXPORT_SYMBOL(xencomm_mini_hypercall_event_channel_op);
25702 +
25703 +static int
25704 +xencommize_mini_grant_table_op(struct xencomm_mini *xc_area, int *nbr_area,
25705 + unsigned int cmd, void *op, unsigned int count,
25706 + struct xencomm_handle **desc)
25707 +{
25708 + struct xencomm_handle *desc1;
25709 + unsigned int argsize;
25710 + int rc;
25711 +
25712 + switch (cmd) {
25713 + case GNTTABOP_map_grant_ref:
25714 + argsize = sizeof(struct gnttab_map_grant_ref);
25715 + break;
25716 + case GNTTABOP_unmap_grant_ref:
25717 + argsize = sizeof(struct gnttab_unmap_grant_ref);
25718 + break;
25719 + case GNTTABOP_setup_table:
25720 + {
25721 + struct gnttab_setup_table *setup = op;
25722 +
25723 + argsize = sizeof(*setup);
25724 +
25725 + if (count != 1)
25726 + return -EINVAL;
25727 + rc = xencomm_create_mini
25728 + (xc_area, nbr_area,
25729 + xen_guest_handle(setup->frame_list),
25730 + setup->nr_frames
25731 + * sizeof(*xen_guest_handle(setup->frame_list)),
25732 + &desc1);
25733 + if (rc)
25734 + return rc;
25735 + set_xen_guest_handle(setup->frame_list, (void *)desc1);
25736 + break;
25737 + }
25738 + case GNTTABOP_dump_table:
25739 + argsize = sizeof(struct gnttab_dump_table);
25740 + break;
25741 + case GNTTABOP_transfer:
25742 + argsize = sizeof(struct gnttab_transfer);
25743 + break;
25744 + case GNTTABOP_copy:
25745 + argsize = sizeof(struct gnttab_copy);
25746 + break;
25747 + default:
25748 + printk("%s: unknown mini grant table op %d\n", __func__, cmd);
25749 + BUG();
25750 + }
25751 +
25752 + rc = xencomm_create_mini(xc_area, nbr_area, op, count * argsize, desc);
25753 + if (rc)
25754 + return rc;
25755 +
25756 + return 0;
25757 +}
25758 +
25759 +int
25760 +xencomm_mini_hypercall_grant_table_op(unsigned int cmd, void *op,
25761 + unsigned int count)
25762 +{
25763 + int rc;
25764 + struct xencomm_handle *desc;
25765 + int nbr_area = 2;
25766 + struct xencomm_mini xc_area[2];
25767 +
25768 + rc = xencommize_mini_grant_table_op(xc_area, &nbr_area,
25769 + cmd, op, count, &desc);
25770 + if (rc)
25771 + return rc;
25772 +
25773 + return xencomm_arch_hypercall_grant_table_op(cmd, desc, count);
25774 +}
25775 +EXPORT_SYMBOL(xencomm_mini_hypercall_grant_table_op);
25776 +
25777 +int
25778 +xencomm_mini_hypercall_multicall(void *call_list, int nr_calls)
25779 +{
25780 + int i;
25781 + multicall_entry_t *mce;
25782 + int nbr_area = 2 + nr_calls * 3;
25783 + struct xencomm_mini xc_area[nbr_area];
25784 + struct xencomm_handle *desc;
25785 + int rc;
25786 +
25787 + for (i = 0; i < nr_calls; i++) {
25788 + mce = (multicall_entry_t *)call_list + i;
25789 +
25790 + switch (mce->op) {
25791 + case __HYPERVISOR_update_va_mapping:
25792 + case __HYPERVISOR_mmu_update:
25793 + /* No-op on ia64. */
25794 + break;
25795 + case __HYPERVISOR_grant_table_op:
25796 + rc = xencommize_mini_grant_table_op
25797 + (xc_area, &nbr_area,
25798 + mce->args[0], (void *)mce->args[1],
25799 + mce->args[2], &desc);
25800 + if (rc)
25801 + return rc;
25802 + mce->args[1] = (unsigned long)desc;
25803 + break;
25804 + case __HYPERVISOR_memory_op:
25805 + default:
25806 + printk("%s: unhandled multicall op entry op %lu\n",
25807 + __func__, mce->op);
25808 + return -ENOSYS;
25809 + }
25810 + }
25811 +
25812 + rc = xencomm_create_mini(xc_area, &nbr_area, call_list,
25813 + nr_calls * sizeof(multicall_entry_t), &desc);
25814 + if (rc)
25815 + return rc;
25816 +
25817 + return xencomm_arch_hypercall_multicall(desc, nr_calls);
25818 +}
25819 +EXPORT_SYMBOL(xencomm_mini_hypercall_multicall);
25820 +
25821 +static int
25822 +xencommize_mini_memory_reservation(struct xencomm_mini *area, int *nbr_area,
25823 + xen_memory_reservation_t *mop)
25824 +{
25825 + struct xencomm_handle *desc;
25826 + int rc;
25827 +
25828 + rc = xencomm_create_mini
25829 + (area, nbr_area,
25830 + xen_guest_handle(mop->extent_start),
25831 + mop->nr_extents
25832 + * sizeof(*xen_guest_handle(mop->extent_start)),
25833 + &desc);
25834 + if (rc)
25835 + return rc;
25836 +
25837 + set_xen_guest_handle(mop->extent_start, (void *)desc);
25838 +
25839 + return 0;
25840 +}
25841 +
25842 +int
25843 +xencomm_mini_hypercall_memory_op(unsigned int cmd, void *arg)
25844 +{
25845 + int nbr_area = 4;
25846 + struct xencomm_mini xc_area[4];
25847 + struct xencomm_handle *desc;
25848 + int rc;
25849 + unsigned int argsize;
25850 +
25851 + switch (cmd) {
25852 + case XENMEM_increase_reservation:
25853 + case XENMEM_decrease_reservation:
25854 + case XENMEM_populate_physmap:
25855 + argsize = sizeof(xen_memory_reservation_t);
25856 + rc = xencommize_mini_memory_reservation
25857 + (xc_area, &nbr_area, (xen_memory_reservation_t *)arg);
25858 + if (rc)
25859 + return rc;
25860 + break;
25861 +
25862 + case XENMEM_maximum_ram_page:
25863 + argsize = 0;
25864 + break;
25865 +
25866 + case XENMEM_exchange:
25867 + argsize = sizeof(xen_memory_exchange_t);
25868 + rc = xencommize_mini_memory_reservation
25869 + (xc_area, &nbr_area,
25870 + &((xen_memory_exchange_t *)arg)->in);
25871 + if (rc)
25872 + return rc;
25873 + rc = xencommize_mini_memory_reservation
25874 + (xc_area, &nbr_area,
25875 + &((xen_memory_exchange_t *)arg)->out);
25876 + if (rc)
25877 + return rc;
25878 + break;
25879 +
25880 + case XENMEM_add_to_physmap:
25881 + argsize = sizeof (xen_add_to_physmap_t);
25882 + break;
25883 +
25884 + default:
25885 + printk("%s: unknown mini memory op %d\n", __func__, cmd);
25886 + return -ENOSYS;
25887 + }
25888 +
25889 + rc = xencomm_create_mini(xc_area, &nbr_area, arg, argsize, &desc);
25890 + if (rc)
25891 + return rc;
25892 +
25893 + return xencomm_arch_hypercall_memory_op(cmd, desc);
25894 +}
25895 +EXPORT_SYMBOL(xencomm_mini_hypercall_memory_op);
25896 +
25897 +unsigned long
25898 +xencomm_mini_hypercall_hvm_op(int cmd, void *arg)
25899 +{
25900 + struct xencomm_handle *desc;
25901 + int nbr_area = 2;
25902 + struct xencomm_mini xc_area[2];
25903 + unsigned int argsize;
25904 + int rc;
25905 +
25906 + switch (cmd) {
25907 + case HVMOP_get_param:
25908 + case HVMOP_set_param:
25909 + argsize = sizeof(xen_hvm_param_t);
25910 + break;
25911 + default:
25912 + printk("%s: unknown HVMOP %d\n", __func__, cmd);
25913 + return -EINVAL;
25914 + }
25915 +
25916 + rc = xencomm_create_mini(xc_area, &nbr_area, arg, argsize, &desc);
25917 + if (rc)
25918 + return rc;
25919 +
25920 + return xencomm_arch_hypercall_hvm_op(cmd, desc);
25921 +}
25922 +EXPORT_SYMBOL(xencomm_mini_hypercall_hvm_op);
25923 +
25924 +int
25925 +xencomm_mini_hypercall_xen_version(int cmd, void *arg)
25926 +{
25927 + struct xencomm_handle *desc;
25928 + int nbr_area = 2;
25929 + struct xencomm_mini xc_area[2];
25930 + unsigned int argsize;
25931 + int rc;
25932 +
25933 + switch (cmd) {
25934 + case XENVER_version:
25935 + /* do not actually pass an argument */
25936 + return xencomm_arch_hypercall_xen_version(cmd, 0);
25937 + case XENVER_extraversion:
25938 + argsize = sizeof(xen_extraversion_t);
25939 + break;
25940 + case XENVER_compile_info:
25941 + argsize = sizeof(xen_compile_info_t);
25942 + break;
25943 + case XENVER_capabilities:
25944 + argsize = sizeof(xen_capabilities_info_t);
25945 + break;
25946 + case XENVER_changeset:
25947 + argsize = sizeof(xen_changeset_info_t);
25948 + break;
25949 + case XENVER_platform_parameters:
25950 + argsize = sizeof(xen_platform_parameters_t);
25951 + break;
25952 + case XENVER_pagesize:
25953 + argsize = (arg == NULL) ? 0 : sizeof(void *);
25954 + break;
25955 + case XENVER_get_features:
25956 + argsize = (arg == NULL) ? 0 : sizeof(xen_feature_info_t);
25957 + break;
25958 +
25959 + default:
25960 + printk("%s: unknown version op %d\n", __func__, cmd);
25961 + return -ENOSYS;
25962 + }
25963 +
25964 + rc = xencomm_create_mini(xc_area, &nbr_area, arg, argsize, &desc);
25965 + if (rc)
25966 + return rc;
25967 +
25968 + return xencomm_arch_hypercall_xen_version(cmd, desc);
25969 +}
25970 +EXPORT_SYMBOL(xencomm_mini_hypercall_xen_version);
25971 +
25972 +int
25973 +xencomm_mini_hypercall_xenoprof_op(int op, void *arg)
25974 +{
25975 + unsigned int argsize;
25976 + struct xencomm_mini xc_area[2];
25977 + int nbr_area = 2;
25978 + struct xencomm_handle *desc;
25979 + int rc;
25980 +
25981 + switch (op) {
25982 + case XENOPROF_init:
25983 + argsize = sizeof(xenoprof_init_t);
25984 + break;
25985 + case XENOPROF_set_active:
25986 + argsize = sizeof(domid_t);
25987 + break;
25988 + case XENOPROF_set_passive:
25989 + argsize = sizeof(xenoprof_passive_t);
25990 + break;
25991 + case XENOPROF_counter:
25992 + argsize = sizeof(xenoprof_counter_t);
25993 + break;
25994 + case XENOPROF_get_buffer:
25995 + argsize = sizeof(xenoprof_get_buffer_t);
25996 + break;
25997 +
25998 + case XENOPROF_reset_active_list:
25999 + case XENOPROF_reset_passive_list:
26000 + case XENOPROF_reserve_counters:
26001 + case XENOPROF_setup_events:
26002 + case XENOPROF_enable_virq:
26003 + case XENOPROF_start:
26004 + case XENOPROF_stop:
26005 + case XENOPROF_disable_virq:
26006 + case XENOPROF_release_counters:
26007 + case XENOPROF_shutdown:
26008 + return xencomm_arch_hypercall_xenoprof_op(op, arg);
26009 +
26010 + default:
26011 + printk("%s: op %d isn't supported\n", __func__, op);
26012 + return -ENOSYS;
26013 + }
26014 + rc = xencomm_create_mini(xc_area, &nbr_area, arg, argsize, &desc);
26015 + if (rc)
26016 + return rc;
26017 + return xencomm_arch_hypercall_xenoprof_op(op, desc);
26018 +}
26019 +EXPORT_SYMBOL_GPL(xencomm_mini_hypercall_xenoprof_op);
26020 +
26021 +int
26022 +xencomm_mini_hypercall_perfmon_op(unsigned long cmd, void* arg,
26023 + unsigned long count)
26024 +{
26025 + unsigned int argsize;
26026 + struct xencomm_mini xc_area[2];
26027 + int nbr_area = 2;
26028 + struct xencomm_handle *desc;
26029 + int rc;
26030 +
26031 + switch (cmd) {
26032 + case PFM_GET_FEATURES:
26033 + argsize = sizeof(pfarg_features_t);
26034 + break;
26035 + case PFM_CREATE_CONTEXT:
26036 + argsize = sizeof(pfarg_context_t);
26037 + break;
26038 + case PFM_LOAD_CONTEXT:
26039 + argsize = sizeof(pfarg_load_t);
26040 + break;
26041 + case PFM_WRITE_PMCS:
26042 + case PFM_WRITE_PMDS:
26043 + argsize = sizeof(pfarg_reg_t) * count;
26044 + break;
26045 +
26046 + case PFM_DESTROY_CONTEXT:
26047 + case PFM_UNLOAD_CONTEXT:
26048 + case PFM_START:
26049 + case PFM_STOP:
26050 + return xencomm_arch_hypercall_perfmon_op(cmd, arg, count);
26051 +
26052 + default:
26053 + printk("%s:%d cmd %ld isn't supported\n",
26054 + __func__, __LINE__, cmd);
26055 + BUG();
26056 + }
26057 +
26058 + rc = xencomm_create_mini(xc_area, &nbr_area, arg, argsize, &desc);
26059 + if (rc)
26060 + return rc;
26061 + return xencomm_arch_hypercall_perfmon_op(cmd, desc, count);
26062 +}
26063 +EXPORT_SYMBOL_GPL(xencomm_mini_hypercall_perfmon_op);
26064 diff -Nur linux-2.6.16.33-noxen/arch/ia64/xen/xcom_privcmd.c linux-2.6.16.33/arch/ia64/xen/xcom_privcmd.c
26065 --- linux-2.6.16.33-noxen/arch/ia64/xen/xcom_privcmd.c 1970-01-01 00:00:00.000000000 +0000
26066 +++ linux-2.6.16.33/arch/ia64/xen/xcom_privcmd.c 2007-01-08 15:00:45.000000000 +0000
26067 @@ -0,0 +1,663 @@
26068 +/*
26069 + * This program is free software; you can redistribute it and/or modify
26070 + * it under the terms of the GNU General Public License as published by
26071 + * the Free Software Foundation; either version 2 of the License, or
26072 + * (at your option) any later version.
26073 + *
26074 + * This program is distributed in the hope that it will be useful,
26075 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
26076 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
26077 + * GNU General Public License for more details.
26078 + *
26079 + * You should have received a copy of the GNU General Public License
26080 + * along with this program; if not, write to the Free Software
26081 + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
26082 + *
26083 + * Authors: Hollis Blanchard <hollisb@us.ibm.com>
26084 + * Tristan Gingold <tristan.gingold@bull.net>
26085 + */
26086 +#include <linux/types.h>
26087 +#include <linux/errno.h>
26088 +#include <linux/kernel.h>
26089 +#include <linux/gfp.h>
26090 +#include <linux/module.h>
26091 +#include <xen/interface/xen.h>
26092 +#include <xen/interface/dom0_ops.h>
26093 +#define __XEN__
26094 +#include <xen/interface/domctl.h>
26095 +#include <xen/interface/sysctl.h>
26096 +#include <xen/interface/memory.h>
26097 +#include <xen/interface/version.h>
26098 +#include <xen/interface/event_channel.h>
26099 +#include <xen/interface/acm_ops.h>
26100 +#include <xen/interface/hvm/params.h>
26101 +#include <xen/public/privcmd.h>
26102 +#include <asm/hypercall.h>
26103 +#include <asm/page.h>
26104 +#include <asm/uaccess.h>
26105 +#include <asm/xen/xencomm.h>
26106 +
26107 +#define ROUND_DIV(v,s) (((v) + (s) - 1) / (s))
26108 +
26109 +static int
26110 +xencomm_privcmd_dom0_op(privcmd_hypercall_t *hypercall)
26111 +{
26112 + dom0_op_t kern_op;
26113 + dom0_op_t __user *user_op = (dom0_op_t __user *)hypercall->arg[0];
26114 + struct xencomm_handle *op_desc;
26115 + struct xencomm_handle *desc = NULL;
26116 + int ret = 0;
26117 +
26118 + if (copy_from_user(&kern_op, user_op, sizeof(dom0_op_t)))
26119 + return -EFAULT;
26120 +
26121 + if (kern_op.interface_version != DOM0_INTERFACE_VERSION)
26122 + return -EACCES;
26123 +
26124 + op_desc = xencomm_create_inline(&kern_op);
26125 +
26126 + switch (kern_op.cmd) {
26127 + default:
26128 + printk("%s: unknown dom0 cmd %d\n", __func__, kern_op.cmd);
26129 + return -ENOSYS;
26130 + }
26131 +
26132 + if (ret) {
26133 + /* error mapping the nested pointer */
26134 + return ret;
26135 + }
26136 +
26137 + ret = xencomm_arch_hypercall_dom0_op(op_desc);
26138 +
26139 + /* FIXME: should we restore the handle? */
26140 + if (copy_to_user(user_op, &kern_op, sizeof(dom0_op_t)))
26141 + ret = -EFAULT;
26142 +
26143 + if (desc)
26144 + xencomm_free(desc);
26145 + return ret;
26146 +}
26147 +
26148 +/*
26149 + * Temporarily disable the NUMA PHYSINFO code until the rest of the
26150 + * changes are upstream.
26151 + */
26152 +#undef IA64_NUMA_PHYSINFO
26153 +
26154 +static int
26155 +xencomm_privcmd_sysctl(privcmd_hypercall_t *hypercall)
26156 +{
26157 + xen_sysctl_t kern_op;
26158 + xen_sysctl_t __user *user_op;
26159 + struct xencomm_handle *op_desc;
26160 + struct xencomm_handle *desc = NULL;
26161 + struct xencomm_handle *desc1 = NULL;
26162 + int ret = 0;
26163 +
26164 + user_op = (xen_sysctl_t __user *)hypercall->arg[0];
26165 +
26166 + if (copy_from_user(&kern_op, user_op, sizeof(xen_sysctl_t)))
26167 + return -EFAULT;
26168 +
26169 + if (kern_op.interface_version != XEN_SYSCTL_INTERFACE_VERSION)
26170 + return -EACCES;
26171 +
26172 + op_desc = xencomm_create_inline(&kern_op);
26173 +
26174 + switch (kern_op.cmd) {
26175 + case XEN_SYSCTL_readconsole:
26176 + ret = xencomm_create(
26177 + xen_guest_handle(kern_op.u.readconsole.buffer),
26178 + kern_op.u.readconsole.count,
26179 + &desc, GFP_KERNEL);
26180 + set_xen_guest_handle(kern_op.u.readconsole.buffer,
26181 + (void *)desc);
26182 + break;
26183 + case XEN_SYSCTL_tbuf_op:
26184 +#ifndef IA64_NUMA_PHYSINFO
26185 + case XEN_SYSCTL_physinfo:
26186 +#endif
26187 + case XEN_SYSCTL_sched_id:
26188 + break;
26189 + case XEN_SYSCTL_perfc_op:
26190 + {
26191 + struct xencomm_handle *tmp_desc;
26192 + xen_sysctl_t tmp_op = {
26193 + .cmd = XEN_SYSCTL_perfc_op,
26194 + .interface_version = XEN_SYSCTL_INTERFACE_VERSION,
26195 + .u.perfc_op = {
26196 + .cmd = XEN_SYSCTL_PERFCOP_query,
26197 + // .desc.p = NULL,
26198 + // .val.p = NULL,
26199 + },
26200 + };
26201 +
26202 + if (xen_guest_handle(kern_op.u.perfc_op.desc) == NULL) {
26203 + if (xen_guest_handle(kern_op.u.perfc_op.val) != NULL)
26204 + return -EINVAL;
26205 + break;
26206 + }
26207 +
26208 + /* query the buffer size for xencomm */
26209 + tmp_desc = xencomm_create_inline(&tmp_op);
26210 + ret = xencomm_arch_hypercall_sysctl(tmp_desc);
26211 + if (ret)
26212 + return ret;
26213 +
26214 + ret = xencomm_create(xen_guest_handle(kern_op.u.perfc_op.desc),
26215 + tmp_op.u.perfc_op.nr_counters *
26216 + sizeof(xen_sysctl_perfc_desc_t),
26217 + &desc, GFP_KERNEL);
26218 + if (ret)
26219 + return ret;
26220 +
26221 + set_xen_guest_handle(kern_op.u.perfc_op.desc, (void *)desc);
26222 +
26223 + ret = xencomm_create(xen_guest_handle(kern_op.u.perfc_op.val),
26224 + tmp_op.u.perfc_op.nr_vals *
26225 + sizeof(xen_sysctl_perfc_val_t),
26226 + &desc1, GFP_KERNEL);
26227 + if (ret)
26228 + xencomm_free(desc);
26229 +
26230 + set_xen_guest_handle(kern_op.u.perfc_op.val, (void *)desc1);
26231 + break;
26232 + }
26233 + case XEN_SYSCTL_getdomaininfolist:
26234 + ret = xencomm_create(
26235 + xen_guest_handle(kern_op.u.getdomaininfolist.buffer),
26236 + kern_op.u.getdomaininfolist.max_domains *
26237 + sizeof(xen_domctl_getdomaininfo_t),
26238 + &desc, GFP_KERNEL);
26239 + set_xen_guest_handle(kern_op.u.getdomaininfolist.buffer,
26240 + (void *)desc);
26241 + break;
26242 +#ifdef IA64_NUMA_PHYSINFO
26243 + case XEN_SYSCTL_physinfo:
26244 + ret = xencomm_create(
26245 + xen_guest_handle(kern_op.u.physinfo.memory_chunks),
26246 + PUBLIC_MAXCHUNKS * sizeof(node_data_t),
26247 + &desc, GFP_KERNEL);
26248 + if (ret)
26249 + return ret;
26250 + set_xen_guest_handle(kern_op.u.physinfo.memory_chunks,
26251 + (void *)desc);
26252 +
26253 + ret = xencomm_create(
26254 + xen_guest_handle(kern_op.u.physinfo.cpu_to_node),
26255 + PUBLIC_MAX_NUMNODES * sizeof(u64),
26256 + &desc1, GFP_KERNEL);
26257 + if (ret)
26258 + xencomm_free(desc);
26259 + set_xen_guest_handle(kern_op.u.physinfo.cpu_to_node,
26260 + (void *)desc1);
26261 + break;
26262 +#endif
26263 + default:
26264 + printk("%s: unknown sysctl cmd %d\n", __func__, kern_op.cmd);
26265 + return -ENOSYS;
26266 + }
26267 +
26268 + if (ret) {
26269 + /* error mapping the nested pointer */
26270 + return ret;
26271 + }
26272 +
26273 + ret = xencomm_arch_hypercall_sysctl(op_desc);
26274 +
26275 + /* FIXME: should we restore the handles? */
26276 + if (copy_to_user(user_op, &kern_op, sizeof(xen_sysctl_t)))
26277 + ret = -EFAULT;
26278 +
26279 + if (desc)
26280 + xencomm_free(desc);
26281 + if (desc1)
26282 + xencomm_free(desc1);
26283 + return ret;
26284 +}
26285 +
26286 +static int
26287 +xencomm_privcmd_domctl(privcmd_hypercall_t *hypercall)
26288 +{
26289 + xen_domctl_t kern_op;
26290 + xen_domctl_t __user *user_op;
26291 + struct xencomm_handle *op_desc;
26292 + struct xencomm_handle *desc = NULL;
26293 + int ret = 0;
26294 +
26295 + user_op = (xen_domctl_t __user *)hypercall->arg[0];
26296 +
26297 + if (copy_from_user(&kern_op, user_op, sizeof(xen_domctl_t)))
26298 + return -EFAULT;
26299 +
26300 + if (kern_op.interface_version != XEN_DOMCTL_INTERFACE_VERSION)
26301 + return -EACCES;
26302 +
26303 + op_desc = xencomm_create_inline(&kern_op);
26304 +
26305 + switch (kern_op.cmd) {
26306 + case XEN_DOMCTL_createdomain:
26307 + case XEN_DOMCTL_destroydomain:
26308 + case XEN_DOMCTL_pausedomain:
26309 + case XEN_DOMCTL_unpausedomain:
26310 + case XEN_DOMCTL_getdomaininfo:
26311 + break;
26312 + case XEN_DOMCTL_getmemlist:
26313 + {
26314 + unsigned long nr_pages = kern_op.u.getmemlist.max_pfns;
26315 +
26316 + ret = xencomm_create(
26317 + xen_guest_handle(kern_op.u.getmemlist.buffer),
26318 + nr_pages * sizeof(unsigned long),
26319 + &desc, GFP_KERNEL);
26320 + set_xen_guest_handle(kern_op.u.getmemlist.buffer,
26321 + (void *)desc);
26322 + break;
26323 + }
26324 + case XEN_DOMCTL_getpageframeinfo:
26325 + break;
26326 + case XEN_DOMCTL_getpageframeinfo2:
26327 + ret = xencomm_create(
26328 + xen_guest_handle(kern_op.u.getpageframeinfo2.array),
26329 + kern_op.u.getpageframeinfo2.num,
26330 + &desc, GFP_KERNEL);
26331 + set_xen_guest_handle(kern_op.u.getpageframeinfo2.array,
26332 + (void *)desc);
26333 + break;
26334 + case XEN_DOMCTL_shadow_op:
26335 + ret = xencomm_create(
26336 + xen_guest_handle(kern_op.u.shadow_op.dirty_bitmap),
26337 + ROUND_DIV(kern_op.u.shadow_op.pages, 8),
26338 + &desc, GFP_KERNEL);
26339 + set_xen_guest_handle(kern_op.u.shadow_op.dirty_bitmap,
26340 + (void *)desc);
26341 + break;
26342 + case XEN_DOMCTL_max_mem:
26343 + break;
26344 + case XEN_DOMCTL_setvcpucontext:
26345 + case XEN_DOMCTL_getvcpucontext:
26346 + ret = xencomm_create(
26347 + xen_guest_handle(kern_op.u.vcpucontext.ctxt),
26348 + sizeof(vcpu_guest_context_t),
26349 + &desc, GFP_KERNEL);
26350 + set_xen_guest_handle(kern_op.u.vcpucontext.ctxt, (void *)desc);
26351 + break;
26352 + case XEN_DOMCTL_getvcpuinfo:
26353 + break;
26354 + case XEN_DOMCTL_setvcpuaffinity:
26355 + case XEN_DOMCTL_getvcpuaffinity:
26356 + ret = xencomm_create(
26357 + xen_guest_handle(kern_op.u.vcpuaffinity.cpumap.bitmap),
26358 + ROUND_DIV(kern_op.u.vcpuaffinity.cpumap.nr_cpus, 8),
26359 + &desc, GFP_KERNEL);
26360 + set_xen_guest_handle(kern_op.u.vcpuaffinity.cpumap.bitmap,
26361 + (void *)desc);
26362 + break;
26363 + case XEN_DOMCTL_max_vcpus:
26364 + case XEN_DOMCTL_scheduler_op:
26365 + case XEN_DOMCTL_setdomainhandle:
26366 + case XEN_DOMCTL_setdebugging:
26367 + case XEN_DOMCTL_irq_permission:
26368 + case XEN_DOMCTL_iomem_permission:
26369 + case XEN_DOMCTL_ioport_permission:
26370 + case XEN_DOMCTL_hypercall_init:
26371 + case XEN_DOMCTL_arch_setup:
26372 + case XEN_DOMCTL_settimeoffset:
26373 + break;
26374 + default:
26375 + printk("%s: unknown domctl cmd %d\n", __func__, kern_op.cmd);
26376 + return -ENOSYS;
26377 + }
26378 +
26379 + if (ret) {
26380 + /* error mapping the nested pointer */
26381 + return ret;
26382 + }
26383 +
26384 + ret = xencomm_arch_hypercall_domctl (op_desc);
26385 +
26386 + /* FIXME: should we restore the handle? */
26387 + if (copy_to_user(user_op, &kern_op, sizeof(xen_domctl_t)))
26388 + ret = -EFAULT;
26389 +
26390 + if (desc)
26391 + xencomm_free(desc);
26392 + return ret;
26393 +}
26394 +
26395 +static int
26396 +xencomm_privcmd_acm_op(privcmd_hypercall_t *hypercall)
26397 +{
26398 + int cmd = hypercall->arg[0];
26399 + void __user *arg = (void __user *)hypercall->arg[1];
26400 + struct xencomm_handle *op_desc;
26401 + struct xencomm_handle *desc = NULL;
26402 + int ret;
26403 +
26404 + switch (cmd) {
26405 + case ACMOP_getssid:
26406 + {
26407 + struct acm_getssid kern_arg;
26408 +
26409 + if (copy_from_user(&kern_arg, arg, sizeof (kern_arg)))
26410 + return -EFAULT;
26411 +
26412 + op_desc = xencomm_create_inline(&kern_arg);
26413 +
26414 + ret = xencomm_create(xen_guest_handle(kern_arg.ssidbuf),
26415 + kern_arg.ssidbuf_size, &desc, GFP_KERNEL);
26416 + if (ret)
26417 + return ret;
26418 +
26419 + set_xen_guest_handle(kern_arg.ssidbuf, (void *)desc);
26420 +
26421 + ret = xencomm_arch_hypercall_acm_op(cmd, op_desc);
26422 +
26423 + xencomm_free(desc);
26424 +
26425 + if (copy_to_user(arg, &kern_arg, sizeof (kern_arg)))
26426 + return -EFAULT;
26427 +
26428 + return ret;
26429 + }
26430 + default:
26431 + printk("%s: unknown acm_op cmd %d\n", __func__, cmd);
26432 + return -ENOSYS;
26433 + }
26434 +
26435 + return ret;
26436 +}
26437 +
26438 +static int
26439 +xencomm_privcmd_memory_op(privcmd_hypercall_t *hypercall)
26440 +{
26441 + const unsigned long cmd = hypercall->arg[0];
26442 + int ret = 0;
26443 +
26444 + switch (cmd) {
26445 + case XENMEM_increase_reservation:
26446 + case XENMEM_decrease_reservation:
26447 + case XENMEM_populate_physmap:
26448 + {
26449 + xen_memory_reservation_t kern_op;
26450 + xen_memory_reservation_t __user *user_op;
26451 + struct xencomm_handle *desc = NULL;
26452 + struct xencomm_handle *desc_op;
26453 +
26454 + user_op = (xen_memory_reservation_t __user *)hypercall->arg[1];
26455 + if (copy_from_user(&kern_op, user_op,
26456 + sizeof(xen_memory_reservation_t)))
26457 + return -EFAULT;
26458 + desc_op = xencomm_create_inline(&kern_op);
26459 +
26460 + if (xen_guest_handle(kern_op.extent_start)) {
26461 + void * addr;
26462 +
26463 + addr = xen_guest_handle(kern_op.extent_start);
26464 + ret = xencomm_create
26465 + (addr,
26466 + kern_op.nr_extents *
26467 + sizeof(*xen_guest_handle
26468 + (kern_op.extent_start)),
26469 + &desc, GFP_KERNEL);
26470 + if (ret)
26471 + return ret;
26472 + set_xen_guest_handle(kern_op.extent_start,
26473 + (void *)desc);
26474 + }
26475 +
26476 + ret = xencomm_arch_hypercall_memory_op(cmd, desc_op);
26477 +
26478 + if (desc)
26479 + xencomm_free(desc);
26480 +
26481 + if (ret != 0)
26482 + return ret;
26483 +
26484 + if (copy_to_user(user_op, &kern_op,
26485 + sizeof(xen_memory_reservation_t)))
26486 + return -EFAULT;
26487 +
26488 + return ret;
26489 + }
26490 + case XENMEM_translate_gpfn_list:
26491 + {
26492 + xen_translate_gpfn_list_t kern_op;
26493 + xen_translate_gpfn_list_t __user *user_op;
26494 + struct xencomm_handle *desc_gpfn = NULL;
26495 + struct xencomm_handle *desc_mfn = NULL;
26496 + struct xencomm_handle *desc_op;
26497 + void *addr;
26498 +
26499 + user_op = (xen_translate_gpfn_list_t __user *)
26500 + hypercall->arg[1];
26501 + if (copy_from_user(&kern_op, user_op,
26502 + sizeof(xen_translate_gpfn_list_t)))
26503 + return -EFAULT;
26504 + desc_op = xencomm_create_inline(&kern_op);
26505 +
26506 + if (kern_op.nr_gpfns) {
26507 + /* gpfn_list. */
26508 + addr = xen_guest_handle(kern_op.gpfn_list);
26509 +
26510 + ret = xencomm_create(addr, kern_op.nr_gpfns *
26511 + sizeof(*xen_guest_handle
26512 + (kern_op.gpfn_list)),
26513 + &desc_gpfn, GFP_KERNEL);
26514 + if (ret)
26515 + return ret;
26516 + set_xen_guest_handle(kern_op.gpfn_list,
26517 + (void *)desc_gpfn);
26518 +
26519 + /* mfn_list. */
26520 + addr = xen_guest_handle(kern_op.mfn_list);
26521 +
26522 + ret = xencomm_create(addr, kern_op.nr_gpfns *
26523 + sizeof(*xen_guest_handle
26524 + (kern_op.mfn_list)),
26525 + &desc_mfn, GFP_KERNEL);
26526 + if (ret)
26527 + return ret;
26528 + set_xen_guest_handle(kern_op.mfn_list,
26529 + (void *)desc_mfn);
26530 + }
26531 +
26532 + ret = xencomm_arch_hypercall_memory_op(cmd, desc_op);
26533 +
26534 + if (desc_gpfn)
26535 + xencomm_free(desc_gpfn);
26536 +
26537 + if (desc_mfn)
26538 + xencomm_free(desc_mfn);
26539 +
26540 + if (ret != 0)
26541 + return ret;
26542 +
26543 + return ret;
26544 + }
26545 + default:
26546 + printk("%s: unknown memory op %lu\n", __func__, cmd);
26547 + ret = -ENOSYS;
26548 + }
26549 + return ret;
26550 +}
26551 +
26552 +static int
26553 +xencomm_privcmd_xen_version(privcmd_hypercall_t *hypercall)
26554 +{
26555 + int cmd = hypercall->arg[0];
26556 + void __user *arg = (void __user *)hypercall->arg[1];
26557 + struct xencomm_handle *desc;
26558 + size_t argsize;
26559 + int rc;
26560 +
26561 + switch (cmd) {
26562 + case XENVER_version:
26563 + /* do not actually pass an argument */
26564 + return xencomm_arch_hypercall_xen_version(cmd, 0);
26565 + case XENVER_extraversion:
26566 + argsize = sizeof(xen_extraversion_t);
26567 + break;
26568 + case XENVER_compile_info:
26569 + argsize = sizeof(xen_compile_info_t);
26570 + break;
26571 + case XENVER_capabilities:
26572 + argsize = sizeof(xen_capabilities_info_t);
26573 + break;
26574 + case XENVER_changeset:
26575 + argsize = sizeof(xen_changeset_info_t);
26576 + break;
26577 + case XENVER_platform_parameters:
26578 + argsize = sizeof(xen_platform_parameters_t);
26579 + break;
26580 + case XENVER_pagesize:
26581 + argsize = (arg == NULL) ? 0 : sizeof(void *);
26582 + break;
26583 + case XENVER_get_features:
26584 + argsize = (arg == NULL) ? 0 : sizeof(xen_feature_info_t);
26585 + break;
26586 +
26587 + default:
26588 + printk("%s: unknown version op %d\n", __func__, cmd);
26589 + return -ENOSYS;
26590 + }
26591 +
26592 + rc = xencomm_create(arg, argsize, &desc, GFP_KERNEL);
26593 + if (rc)
26594 + return rc;
26595 +
26596 + rc = xencomm_arch_hypercall_xen_version(cmd, desc);
26597 +
26598 + xencomm_free(desc);
26599 +
26600 + return rc;
26601 +}
26602 +
26603 +static int
26604 +xencomm_privcmd_event_channel_op(privcmd_hypercall_t *hypercall)
26605 +{
26606 + int cmd = hypercall->arg[0];
26607 + struct xencomm_handle *desc;
26608 + unsigned int argsize;
26609 + int ret;
26610 +
26611 + switch (cmd) {
26612 + case EVTCHNOP_alloc_unbound:
26613 + argsize = sizeof(evtchn_alloc_unbound_t);
26614 + break;
26615 +
26616 + case EVTCHNOP_status:
26617 + argsize = sizeof(evtchn_status_t);
26618 + break;
26619 +
26620 + default:
26621 + printk("%s: unknown EVTCHNOP %d\n", __func__, cmd);
26622 + return -EINVAL;
26623 + }
26624 +
26625 + ret = xencomm_create((void *)hypercall->arg[1], argsize,
26626 + &desc, GFP_KERNEL);
26627 + if (ret)
26628 + return ret;
26629 +
26630 + ret = xencomm_arch_hypercall_event_channel_op(cmd, desc);
26631 +
26632 + xencomm_free(desc);
26633 + return ret;
26634 +}
26635 +
26636 +static int
26637 +xencomm_privcmd_hvm_op(privcmd_hypercall_t *hypercall)
26638 +{
26639 + int cmd = hypercall->arg[0];
26640 + struct xencomm_handle *desc;
26641 + unsigned int argsize;
26642 + int ret;
26643 +
26644 + switch (cmd) {
26645 + case HVMOP_get_param:
26646 + case HVMOP_set_param:
26647 + argsize = sizeof(xen_hvm_param_t);
26648 + break;
26649 + case HVMOP_set_pci_intx_level:
26650 + argsize = sizeof(xen_hvm_set_pci_intx_level_t);
26651 + break;
26652 + case HVMOP_set_isa_irq_level:
26653 + argsize = sizeof(xen_hvm_set_isa_irq_level_t);
26654 + break;
26655 + case HVMOP_set_pci_link_route:
26656 + argsize = sizeof(xen_hvm_set_pci_link_route_t);
26657 + break;
26658 +
26659 + default:
26660 + printk("%s: unknown HVMOP %d\n", __func__, cmd);
26661 + return -EINVAL;
26662 + }
26663 +
26664 + ret = xencomm_create((void *)hypercall->arg[1], argsize,
26665 + &desc, GFP_KERNEL);
26666 + if (ret)
26667 + return ret;
26668 +
26669 + ret = xencomm_arch_hypercall_hvm_op(cmd, desc);
26670 +
26671 + xencomm_free(desc);
26672 + return ret;
26673 +}
26674 +
26675 +static int
26676 +xencomm_privcmd_sched_op(privcmd_hypercall_t *hypercall)
26677 +{
26678 + int cmd = hypercall->arg[0];
26679 + struct xencomm_handle *desc;
26680 + unsigned int argsize;
26681 + int ret;
26682 +
26683 + switch (cmd) {
26684 + case SCHEDOP_remote_shutdown:
26685 + argsize = sizeof(sched_remote_shutdown_t);
26686 + break;
26687 + default:
26688 + printk("%s: unknown SCHEDOP %d\n", __func__, cmd);
26689 + return -EINVAL;
26690 + }
26691 +
26692 + ret = xencomm_create((void *)hypercall->arg[1], argsize,
26693 + &desc, GFP_KERNEL);
26694 + if (ret)
26695 + return ret;
26696 +
26697 + ret = xencomm_arch_hypercall_sched_op(cmd, desc);
26698 +
26699 + xencomm_free(desc);
26700 + return ret;
26701 +}
26702 +
26703 +int
26704 +privcmd_hypercall(privcmd_hypercall_t *hypercall)
26705 +{
26706 + switch (hypercall->op) {
26707 + case __HYPERVISOR_dom0_op:
26708 + return xencomm_privcmd_dom0_op(hypercall);
26709 + case __HYPERVISOR_domctl:
26710 + return xencomm_privcmd_domctl(hypercall);
26711 + case __HYPERVISOR_sysctl:
26712 + return xencomm_privcmd_sysctl(hypercall);
26713 + case __HYPERVISOR_acm_op:
26714 + return xencomm_privcmd_acm_op(hypercall);
26715 + case __HYPERVISOR_xen_version:
26716 + return xencomm_privcmd_xen_version(hypercall);
26717 + case __HYPERVISOR_memory_op:
26718 + return xencomm_privcmd_memory_op(hypercall);
26719 + case __HYPERVISOR_event_channel_op:
26720 + return xencomm_privcmd_event_channel_op(hypercall);
26721 + case __HYPERVISOR_hvm_op:
26722 + return xencomm_privcmd_hvm_op(hypercall);
26723 + case __HYPERVISOR_sched_op:
26724 + return xencomm_privcmd_sched_op(hypercall);
26725 + default:
26726 + printk("%s: unknown hcall (%ld)\n", __func__, hypercall->op);
26727 + return -ENOSYS;
26728 + }
26729 +}
26730 +
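Every privcmd handler above follows the same shape: copy the guest argument into kernel memory, wrap it (and any embedded guest handles) in xencomm descriptors so the hypervisor sees physical addresses rather than dom0-virtual ones, issue the arch hypercall, free the descriptors, and copy results back. A minimal sketch of the simple flat-argument case, modelled on xencomm_privcmd_xen_version() above; the function name, error handling, and header paths are illustrative assumptions, not part of the patch:

/* Sketch only: relies on the xencomm interfaces added by this patch
 * (xencomm_create, xencomm_free, xencomm_arch_hypercall_xen_version).
 * Header paths and the handler name are assumptions for illustration. */
#include <linux/errno.h>
#include <linux/gfp.h>
#include <asm/xen/xencomm.h>

static int example_flat_arg_handler(privcmd_hypercall_t *hypercall,
                                    size_t argsize)
{
        void __user *arg = (void __user *)hypercall->arg[1];
        struct xencomm_handle *desc;
        int rc;

        /* Describe the user buffer as a list of physical pages. */
        rc = xencomm_create((void *)arg, argsize, &desc, GFP_KERNEL);
        if (rc)
                return rc;

        /* The hypervisor reads/writes the buffer through the descriptor. */
        rc = xencomm_arch_hypercall_xen_version(hypercall->arg[0], desc);

        xencomm_free(desc);
        return rc;
}

The memory_op and translate_gpfn_list cases above extend this pattern by building additional descriptors for the arrays that guest handles point at, substituting each descriptor for the original handle before issuing the call.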
26731 diff -Nur linux-2.6.16.33-noxen/arch/ia64/xen/xencomm.c linux-2.6.16.33/arch/ia64/xen/xencomm.c
26732 --- linux-2.6.16.33-noxen/arch/ia64/xen/xencomm.c 1970-01-01 00:00:00.000000000 +0000
26733 +++ linux-2.6.16.33/arch/ia64/xen/xencomm.c 2007-01-08 15:00:45.000000000 +0000
26734 @@ -0,0 +1,263 @@
26735 +/*
26736 + * Copyright (C) 2006 Hollis Blanchard <hollisb@us.ibm.com>, IBM Corporation
26737 + *
26738 + * This program is free software; you can redistribute it and/or modify
26739 + * it under the terms of the GNU General Public License as published by
26740 + * the Free Software Foundation; either version 2 of the License, or
26741 + * (at your option) any later version.
26742 + *
26743 + * This program is distributed in the hope that it will be useful,
26744 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
26745 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
26746 + * GNU General Public License for more details.
26747 + *
26748 + * You should have received a copy of the GNU General Public License
26749 + * along with this program; if not, write to the Free Software
26750 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
26751 + */
26752 +
26753 +#include <linux/gfp.h>
26754 +#include <linux/mm.h>
26755 +#include <xen/interface/xen.h>
26756 +#include <asm/page.h>
26757 +
26758 +#ifdef HAVE_XEN_PLATFORM_COMPAT_H
26759 +#include <xen/platform-compat.h>
26760 +#endif
26761 +
26762 +#include <asm/xen/xencomm.h>
26763 +
26764 +static int xencomm_debug = 0;
26765 +
26766 +static unsigned long kernel_start_pa;
26767 +
26768 +void
26769 +xencomm_init (void)
26770 +{
26771 + kernel_start_pa = KERNEL_START - ia64_tpa(KERNEL_START);
26772 +}
26773 +
26774 +/* Translate virtual address to physical address. */
26775 +unsigned long
26776 +xencomm_vaddr_to_paddr(unsigned long vaddr)
26777 +{
26778 +#ifndef CONFIG_VMX_GUEST
26779 + struct page *page;
26780 + struct vm_area_struct *vma;
26781 +#endif
26782 +
26783 + if (vaddr == 0)
26784 + return 0;
26785 +
26786 +#ifdef __ia64__
26787 + if (REGION_NUMBER(vaddr) == 5) {
26788 + pgd_t *pgd;
26789 + pud_t *pud;
26790 + pmd_t *pmd;
26791 + pte_t *ptep;
26792 +
26793 + /* On ia64, TASK_SIZE refers to current. It is not initialized
26794 + during boot.
26795 + Furthermore the kernel is relocatable and __pa() doesn't
26796 + work on addresses. */
26797 + if (vaddr >= KERNEL_START
26798 + && vaddr < (KERNEL_START + KERNEL_TR_PAGE_SIZE)) {
26799 + return vaddr - kernel_start_pa;
26800 + }
26801 +
26802 + /* In kernel area -- virtually mapped. */
26803 + pgd = pgd_offset_k(vaddr);
26804 + if (pgd_none(*pgd) || pgd_bad(*pgd))
26805 + return ~0UL;
26806 +
26807 + pud = pud_offset(pgd, vaddr);
26808 + if (pud_none(*pud) || pud_bad(*pud))
26809 + return ~0UL;
26810 +
26811 + pmd = pmd_offset(pud, vaddr);
26812 + if (pmd_none(*pmd) || pmd_bad(*pmd))
26813 + return ~0UL;
26814 +
26815 + ptep = pte_offset_kernel(pmd, vaddr);
26816 + if (!ptep)
26817 + return ~0UL;
26818 +
26819 + return (pte_val(*ptep) & _PFN_MASK) | (vaddr & ~PAGE_MASK);
26820 + }
26821 +#endif
26822 +
26823 + if (vaddr > TASK_SIZE) {
26824 + /* kernel address */
26825 + return __pa(vaddr);
26826 + }
26827 +
26828 +
26829 +#ifdef CONFIG_VMX_GUEST
26830 + /* No privcmd within vmx guest. */
26831 + return ~0UL;
26832 +#else
26833 + /* XXX double-check (lack of) locking */
26834 + vma = find_extend_vma(current->mm, vaddr);
26835 + if (!vma)
26836 + return ~0UL;
26837 +
26838 + /* We assume the page is modified. */
26839 + page = follow_page(vma, vaddr, FOLL_WRITE | FOLL_TOUCH);
26840 + if (!page)
26841 + return ~0UL;
26842 +
26843 + return (page_to_pfn(page) << PAGE_SHIFT) | (vaddr & ~PAGE_MASK);
26844 +#endif
26845 +}
26846 +
26847 +static int
26848 +xencomm_init_desc(struct xencomm_desc *desc, void *buffer, unsigned long bytes)
26849 +{
26850 + unsigned long recorded = 0;
26851 + int i = 0;
26852 +
26853 + BUG_ON((buffer == NULL) && (bytes > 0));
26854 +
26855 + /* record the physical pages used */
26856 + if (buffer == NULL)
26857 + desc->nr_addrs = 0;
26858 +
26859 + while ((recorded < bytes) && (i < desc->nr_addrs)) {
26860 + unsigned long vaddr = (unsigned long)buffer + recorded;
26861 + unsigned long paddr;
26862 + int offset;
26863 + int chunksz;
26864 +
26865 + offset = vaddr % PAGE_SIZE; /* handle partial pages */
26866 + chunksz = min(PAGE_SIZE - offset, bytes - recorded);
26867 +
26868 + paddr = xencomm_vaddr_to_paddr(vaddr);
26869 + if (paddr == ~0UL) {
26870 + printk("%s: couldn't translate vaddr %lx\n",
26871 + __func__, vaddr);
26872 + return -EINVAL;
26873 + }
26874 +
26875 + desc->address[i++] = paddr;
26876 + recorded += chunksz;
26877 + }
26878 +
26879 + if (recorded < bytes) {
26880 + printk("%s: could only translate %ld of %ld bytes\n",
26881 + __func__, recorded, bytes);
26882 + return -ENOSPC;
26883 + }
26884 +
26885 + /* mark remaining addresses invalid (just for safety) */
26886 + while (i < desc->nr_addrs)
26887 + desc->address[i++] = XENCOMM_INVALID;
26888 +
26889 + desc->magic = XENCOMM_MAGIC;
26890 +
26891 + return 0;
26892 +}
26893 +
26894 +static struct xencomm_desc *
26895 +xencomm_alloc(gfp_t gfp_mask)
26896 +{
26897 + struct xencomm_desc *desc;
26898 +
26899 + desc = (struct xencomm_desc *)__get_free_page(gfp_mask);
26900 + if (desc == NULL)
26901 + panic("%s: page allocation failed\n", __func__);
26902 +
26903 + desc->nr_addrs = (PAGE_SIZE - sizeof(struct xencomm_desc)) /
26904 + sizeof(*desc->address);
26905 +
26906 + return desc;
26907 +}
26908 +
26909 +void
26910 +xencomm_free(struct xencomm_handle *desc)
26911 +{
26912 + if (desc)
26913 + free_page((unsigned long)__va(desc));
26914 +}
26915 +
26916 +int
26917 +xencomm_create(void *buffer, unsigned long bytes,
26918 + struct xencomm_handle **ret, gfp_t gfp_mask)
26919 +{
26920 + struct xencomm_desc *desc;
26921 + struct xencomm_handle *handle;
26922 + int rc;
26923 +
26924 + if (xencomm_debug)
26925 + printk("%s: %p[%ld]\n", __func__, buffer, bytes);
26926 +
26927 + if (buffer == NULL || bytes == 0) {
26928 + *ret = (struct xencomm_handle *)NULL;
26929 + return 0;
26930 + }
26931 +
26932 + desc = xencomm_alloc(gfp_mask);
26933 + if (!desc) {
26934 + printk("%s failure\n", "xencomm_alloc");
26935 + return -ENOMEM;
26936 + }
26937 + handle = (struct xencomm_handle *)__pa(desc);
26938 +
26939 + rc = xencomm_init_desc(desc, buffer, bytes);
26940 + if (rc) {
26941 + printk("%s failure: %d\n", "xencomm_init_desc", rc);
26942 + xencomm_free(handle);
26943 + return rc;
26944 + }
26945 +
26946 + *ret = handle;
26947 + return 0;
26948 +}
26949 +
26950 +/* "mini" routines, for stack-based communications: */
26951 +
26952 +static void *
26953 +xencomm_alloc_mini(struct xencomm_mini *area, int *nbr_area)
26954 +{
26955 + unsigned long base;
26956 + unsigned int pageoffset;
26957 +
26958 + while (*nbr_area >= 0) {
26959 + /* Allocate an area. */
26960 + (*nbr_area)--;
26961 +
26962 + base = (unsigned long)(area + *nbr_area);
26963 + pageoffset = base % PAGE_SIZE;
26964 +
26965 + /* If the area does not cross a page, use it. */
26966 + if ((PAGE_SIZE - pageoffset) >= sizeof(struct xencomm_mini))
26967 + return &area[*nbr_area];
26968 + }
26969 + /* No more area. */
26970 + return NULL;
26971 +}
26972 +
26973 +int
26974 +xencomm_create_mini(struct xencomm_mini *area, int *nbr_area,
26975 + void *buffer, unsigned long bytes,
26976 + struct xencomm_handle **ret)
26977 +{
26978 + struct xencomm_desc *desc;
26979 + int rc;
26980 + unsigned long res;
26981 +
26982 + desc = xencomm_alloc_mini(area, nbr_area);
26983 + if (!desc)
26984 + return -ENOMEM;
26985 + desc->nr_addrs = XENCOMM_MINI_ADDRS;
26986 +
26987 + rc = xencomm_init_desc(desc, buffer, bytes);
26988 + if (rc)
26989 + return rc;
26990 +
26991 + res = xencomm_vaddr_to_paddr((unsigned long)desc);
26992 + if (res == ~0UL)
26993 + return -EINVAL;
26994 +
26995 + *ret = (struct xencomm_handle*)res;
26996 + return 0;
26997 +}
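xencomm_create() above allocates a one-page descriptor and xencomm_init_desc() records one physical address per page the buffer touches, with the first and last entries possibly covering partial pages. A hedged sketch of the layout this implies and of the slot count it requires; the struct below is for illustration only, since the real definition comes from the Xen interface headers:

/* Sketch of the descriptor format assumed by xencomm_init_desc() above;
 * field names are an assumption for illustration, not the patch's header. */
#include <linux/types.h>
#include <asm/page.h>

struct xencomm_desc_sketch {
        u32 magic;              /* XENCOMM_MAGIC */
        u32 nr_addrs;           /* slots available in address[] */
        u64 address[];          /* one physical address per page-sized chunk */
};

/* Number of address[] slots a buffer needs: one per page it touches. */
static unsigned long xencomm_slots_needed(unsigned long vaddr,
                                          unsigned long bytes)
{
        unsigned long first = vaddr >> PAGE_SHIFT;
        unsigned long last  = (vaddr + bytes - 1) >> PAGE_SHIFT;

        return bytes ? (last - first + 1) : 0;
}

xencomm_create_mini() reuses the same init path with a caller-provided, stack-resident area, which is why xencomm_alloc_mini() only has to check that the chosen slot does not straddle a page boundary.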
26998 diff -Nur linux-2.6.16.33-noxen/arch/ia64/xen/xenentry.S linux-2.6.16.33/arch/ia64/xen/xenentry.S
26999 --- linux-2.6.16.33-noxen/arch/ia64/xen/xenentry.S 1970-01-01 00:00:00.000000000 +0000
27000 +++ linux-2.6.16.33/arch/ia64/xen/xenentry.S 2007-01-08 15:00:45.000000000 +0000
27001 @@ -0,0 +1,924 @@
27002 +/*
27003 + * ia64/xen/entry.S
27004 + *
27005 + * Alternate kernel routines for Xen. Heavily leveraged from
27006 + * ia64/kernel/entry.S
27007 + *
27008 + * Copyright (C) 2005 Hewlett-Packard Co
27009 + * Dan Magenheimer <dan.magenheimer@hp.com>
27010 + */
27011 +
27012 +#include <linux/config.h>
27013 +
27014 +#include <asm/asmmacro.h>
27015 +#include <asm/cache.h>
27016 +#include <asm/errno.h>
27017 +#include <asm/kregs.h>
27018 +#include <asm/asm-offsets.h>
27019 +#include <asm/pgtable.h>
27020 +#include <asm/percpu.h>
27021 +#include <asm/processor.h>
27022 +#include <asm/thread_info.h>
27023 +#include <asm/unistd.h>
27024 +
27025 +#ifdef CONFIG_XEN
27026 +#include "xenminstate.h"
27027 +#else
27028 +#include "minstate.h"
27029 +#endif
27030 +
27031 +/*
27032 + * prev_task <- ia64_switch_to(struct task_struct *next)
27033 + * With Ingo's new scheduler, interrupts are disabled when this routine gets
27034 + * called. The code starting at .map relies on this. The rest of the code
27035 + * doesn't care about the interrupt masking status.
27036 + */
27037 +#ifdef CONFIG_XEN
27038 +GLOBAL_ENTRY(xen_switch_to)
27039 + .prologue
27040 + alloc r16=ar.pfs,1,0,0,0
27041 + movl r22=running_on_xen;;
27042 + ld4 r22=[r22];;
27043 + cmp.eq p7,p0=r22,r0
27044 +(p7) br.cond.sptk.many __ia64_switch_to;;
27045 +#else
27046 +GLOBAL_ENTRY(ia64_switch_to)
27047 + .prologue
27048 + alloc r16=ar.pfs,1,0,0,0
27049 +#endif
27050 + DO_SAVE_SWITCH_STACK
27051 + .body
27052 +
27053 + adds r22=IA64_TASK_THREAD_KSP_OFFSET,r13
27054 + movl r25=init_task
27055 + mov r27=IA64_KR(CURRENT_STACK)
27056 + adds r21=IA64_TASK_THREAD_KSP_OFFSET,in0
27057 + dep r20=0,in0,61,3 // physical address of "next"
27058 + ;;
27059 + st8 [r22]=sp // save kernel stack pointer of old task
27060 + shr.u r26=r20,IA64_GRANULE_SHIFT
27061 + cmp.eq p7,p6=r25,in0
27062 + ;;
27063 +#ifdef CONFIG_XEN
27064 + movl r8=XSI_PSR_IC
27065 + ;;
27066 + st4 [r8]=r0 // force psr.ic off for hyperprivop(s)
27067 + ;;
27068 +#endif
27069 + /*
27070 + * If we've already mapped this task's page, we can skip doing it again.
27071 + */
27072 +(p6) cmp.eq p7,p6=r26,r27
27073 +(p6) br.cond.dpnt .map
27074 + ;;
27075 +.done:
27076 +#ifdef CONFIG_XEN
27077 + // psr.ic already off
27078 + // update "current" application register
27079 + mov r8=IA64_KR_CURRENT
27080 + mov r9=in0;;
27081 + XEN_HYPER_SET_KR
27082 + ld8 sp=[r21] // load kernel stack pointer of new task
27083 + movl r27=XSI_PSR_IC
27084 + mov r8=1
27085 + ;;
27086 + st4 [r27]=r8 // psr.ic back on
27087 +#else
27088 + ld8 sp=[r21] // load kernel stack pointer of new task
27089 + mov IA64_KR(CURRENT)=in0 // update "current" application register
27090 +#endif
27091 + mov r8=r13 // return pointer to previously running task
27092 + mov r13=in0 // set "current" pointer
27093 + ;;
27094 + DO_LOAD_SWITCH_STACK
27095 +
27096 +#ifdef CONFIG_SMP
27097 + sync.i // ensure "fc"s done by this CPU are visible on other CPUs
27098 +#endif
27099 + br.ret.sptk.many rp // boogie on out in new context
27100 +
27101 +.map:
27102 +#ifdef CONFIG_XEN
27103 + // psr.ic already off
27104 +#else
27105 + rsm psr.ic // interrupts (psr.i) are already disabled here
27106 +#endif
27107 + movl r25=PAGE_KERNEL
27108 + ;;
27109 + srlz.d
27110 + or r23=r25,r20 // construct PA | page properties
27111 + mov r25=IA64_GRANULE_SHIFT<<2
27112 + ;;
27113 +#ifdef CONFIG_XEN
27114 + movl r8=XSI_ITIR
27115 + ;;
27116 + st8 [r8]=r25
27117 + ;;
27118 + movl r8=XSI_IFA
27119 + ;;
27120 + st8 [r8]=in0 // VA of next task...
27121 + ;;
27122 + mov r25=IA64_TR_CURRENT_STACK
27123 + // remember last page we mapped...
27124 + mov r8=IA64_KR_CURRENT_STACK
27125 + mov r9=r26;;
27126 + XEN_HYPER_SET_KR;;
27127 +#else
27128 + mov cr.itir=r25
27129 + mov cr.ifa=in0 // VA of next task...
27130 + ;;
27131 + mov r25=IA64_TR_CURRENT_STACK
27132 + mov IA64_KR(CURRENT_STACK)=r26 // remember last page we mapped...
27133 +#endif
27134 + ;;
27135 + itr.d dtr[r25]=r23 // wire in new mapping...
27136 +#ifndef CONFIG_XEN
27137 + ssm psr.ic // reenable the psr.ic bit
27138 + ;;
27139 + srlz.d
27140 +#endif
27141 + br.cond.sptk .done
27142 +#ifdef CONFIG_XEN
27143 +END(xen_switch_to)
27144 +#else
27145 +END(ia64_switch_to)
27146 +#endif
27147 +
27148 + /*
27149 + * Invoke a system call, but do some tracing before and after the call.
27150 + * We MUST preserve the current register frame throughout this routine
27151 + * because some system calls (such as ia64_execve) directly
27152 + * manipulate ar.pfs.
27153 + */
27154 +#ifdef CONFIG_XEN
27155 +GLOBAL_ENTRY(xen_trace_syscall)
27156 + PT_REGS_UNWIND_INFO(0)
27157 + movl r16=running_on_xen;;
27158 + ld4 r16=[r16];;
27159 + cmp.eq p7,p0=r16,r0
27160 +(p7) br.cond.sptk.many __ia64_trace_syscall;;
27161 +#else
27162 +GLOBAL_ENTRY(ia64_trace_syscall)
27163 + PT_REGS_UNWIND_INFO(0)
27164 +#endif
27165 + /*
27166 + * We need to preserve the scratch registers f6-f11 in case the system
27167 + * call is sigreturn.
27168 + */
27169 + adds r16=PT(F6)+16,sp
27170 + adds r17=PT(F7)+16,sp
27171 + ;;
27172 + stf.spill [r16]=f6,32
27173 + stf.spill [r17]=f7,32
27174 + ;;
27175 + stf.spill [r16]=f8,32
27176 + stf.spill [r17]=f9,32
27177 + ;;
27178 + stf.spill [r16]=f10
27179 + stf.spill [r17]=f11
27180 + br.call.sptk.many rp=syscall_trace_enter // give parent a chance to catch syscall args
27181 + adds r16=PT(F6)+16,sp
27182 + adds r17=PT(F7)+16,sp
27183 + ;;
27184 + ldf.fill f6=[r16],32
27185 + ldf.fill f7=[r17],32
27186 + ;;
27187 + ldf.fill f8=[r16],32
27188 + ldf.fill f9=[r17],32
27189 + ;;
27190 + ldf.fill f10=[r16]
27191 + ldf.fill f11=[r17]
27192 + // the syscall number may have changed, so re-load it and re-calculate the
27193 + // syscall entry-point:
27194 + adds r15=PT(R15)+16,sp // r15 = &pt_regs.r15 (syscall #)
27195 + ;;
27196 + ld8 r15=[r15]
27197 + mov r3=NR_syscalls - 1
27198 + ;;
27199 + adds r15=-1024,r15
27200 + movl r16=sys_call_table
27201 + ;;
27202 + shladd r20=r15,3,r16 // r20 = sys_call_table + 8*(syscall-1024)
27203 + cmp.leu p6,p7=r15,r3
27204 + ;;
27205 +(p6) ld8 r20=[r20] // load address of syscall entry point
27206 +(p7) movl r20=sys_ni_syscall
27207 + ;;
27208 + mov b6=r20
27209 + br.call.sptk.many rp=b6 // do the syscall
27210 +.strace_check_retval:
27211 + cmp.lt p6,p0=r8,r0 // syscall failed?
27212 + adds r2=PT(R8)+16,sp // r2 = &pt_regs.r8
27213 + adds r3=PT(R10)+16,sp // r3 = &pt_regs.r10
27214 + mov r10=0
27215 +(p6) br.cond.sptk strace_error // syscall failed ->
27216 + ;; // avoid RAW on r10
27217 +.strace_save_retval:
27218 +.mem.offset 0,0; st8.spill [r2]=r8 // store return value in slot for r8
27219 +.mem.offset 8,0; st8.spill [r3]=r10 // clear error indication in slot for r10
27220 + br.call.sptk.many rp=syscall_trace_leave // give parent a chance to catch return value
27221 +.ret3:
27222 +(pUStk) cmp.eq.unc p6,p0=r0,r0 // p6 <- pUStk
27223 + br.cond.sptk .work_pending_syscall_end
27224 +
27225 +strace_error:
27226 + ld8 r3=[r2] // load pt_regs.r8
27227 + sub r9=0,r8 // negate return value to get errno value
27228 + ;;
27229 + cmp.ne p6,p0=r3,r0 // is pt_regs.r8!=0?
27230 + adds r3=16,r2 // r3=&pt_regs.r10
27231 + ;;
27232 +(p6) mov r10=-1
27233 +(p6) mov r8=r9
27234 + br.cond.sptk .strace_save_retval
27235 +#ifdef CONFIG_XEN
27236 +END(xen_trace_syscall)
27237 +#else
27238 +END(ia64_trace_syscall)
27239 +#endif
27240 +
27241 +#ifdef CONFIG_XEN
27242 +GLOBAL_ENTRY(xen_ret_from_clone)
27243 + PT_REGS_UNWIND_INFO(0)
27244 + movl r16=running_on_xen;;
27245 + ld4 r16=[r16];;
27246 + cmp.eq p7,p0=r16,r0
27247 +(p7) br.cond.sptk.many __ia64_ret_from_clone;;
27248 +#else
27249 +GLOBAL_ENTRY(ia64_ret_from_clone)
27250 + PT_REGS_UNWIND_INFO(0)
27251 +#endif
27252 +{ /*
27253 + * Some versions of gas generate bad unwind info if the first instruction of a
27254 + * procedure doesn't go into the first slot of a bundle. This is a workaround.
27255 + */
27256 + nop.m 0
27257 + nop.i 0
27258 + /*
27259 + * We need to call schedule_tail() to complete the scheduling process.
27260 + * Called by ia64_switch_to() after do_fork()->copy_thread(). r8 contains the
27261 + * address of the previously executing task.
27262 + */
27263 + br.call.sptk.many rp=ia64_invoke_schedule_tail
27264 +}
27265 +.ret8:
27266 + adds r2=TI_FLAGS+IA64_TASK_SIZE,r13
27267 + ;;
27268 + ld4 r2=[r2]
27269 + ;;
27270 + mov r8=0
27271 + and r2=_TIF_SYSCALL_TRACEAUDIT,r2
27272 + ;;
27273 + cmp.ne p6,p0=r2,r0
27274 +(p6) br.cond.spnt .strace_check_retval
27275 + ;; // added stop bits to prevent r8 dependency
27276 +#ifdef CONFIG_XEN
27277 + br.cond.sptk ia64_ret_from_syscall
27278 +END(xen_ret_from_clone)
27279 +#else
27280 +END(ia64_ret_from_clone)
27281 +#endif
27282 +/*
27283 + * ia64_leave_syscall(): Same as ia64_leave_kernel, except that it doesn't
27284 + * need to switch to bank 0 and doesn't restore the scratch registers.
27285 + * To avoid leaking kernel bits, the scratch registers are set to
27286 + * the following known-to-be-safe values:
27287 + *
27288 + * r1: restored (global pointer)
27289 + * r2: cleared
27290 + * r3: 1 (when returning to user-level)
27291 + * r8-r11: restored (syscall return value(s))
27292 + * r12: restored (user-level stack pointer)
27293 + * r13: restored (user-level thread pointer)
27294 + * r14: set to __kernel_syscall_via_epc
27295 + * r15: restored (syscall #)
27296 + * r16-r17: cleared
27297 + * r18: user-level b6
27298 + * r19: cleared
27299 + * r20: user-level ar.fpsr
27300 + * r21: user-level b0
27301 + * r22: cleared
27302 + * r23: user-level ar.bspstore
27303 + * r24: user-level ar.rnat
27304 + * r25: user-level ar.unat
27305 + * r26: user-level ar.pfs
27306 + * r27: user-level ar.rsc
27307 + * r28: user-level ip
27308 + * r29: user-level psr
27309 + * r30: user-level cfm
27310 + * r31: user-level pr
27311 + * f6-f11: cleared
27312 + * pr: restored (user-level pr)
27313 + * b0: restored (user-level rp)
27314 + * b6: restored
27315 + * b7: set to __kernel_syscall_via_epc
27316 + * ar.unat: restored (user-level ar.unat)
27317 + * ar.pfs: restored (user-level ar.pfs)
27318 + * ar.rsc: restored (user-level ar.rsc)
27319 + * ar.rnat: restored (user-level ar.rnat)
27320 + * ar.bspstore: restored (user-level ar.bspstore)
27321 + * ar.fpsr: restored (user-level ar.fpsr)
27322 + * ar.ccv: cleared
27323 + * ar.csd: cleared
27324 + * ar.ssd: cleared
27325 + */
27326 +#ifdef CONFIG_XEN
27327 +GLOBAL_ENTRY(xen_leave_syscall)
27328 + PT_REGS_UNWIND_INFO(0)
27329 + movl r22=running_on_xen;;
27330 + ld4 r22=[r22];;
27331 + cmp.eq p7,p0=r22,r0
27332 +(p7) br.cond.sptk.many __ia64_leave_syscall;;
27333 +#else
27334 +ENTRY(ia64_leave_syscall)
27335 + PT_REGS_UNWIND_INFO(0)
27336 +#endif
27337 + /*
27338 + * work.need_resched etc. mustn't get changed by this CPU before it returns to
27339 + * user- or fsys-mode, hence we disable interrupts early on.
27340 + *
27341 + * p6 controls whether current_thread_info()->flags needs to be check for
27342 + * extra work. We always check for extra work when returning to user-level.
27343 + * With CONFIG_PREEMPT, we also check for extra work when the preempt_count
27344 + * is 0. After extra work processing has been completed, execution
27345 + * resumes at .work_processed_syscall with p6 set to 1 if the extra-work-check
27346 + * needs to be redone.
27347 + */
27348 +#ifdef CONFIG_PREEMPT
27349 + rsm psr.i // disable interrupts
27350 + cmp.eq pLvSys,p0=r0,r0 // pLvSys=1: leave from syscall
27351 +(pKStk) adds r20=TI_PRE_COUNT+IA64_TASK_SIZE,r13
27352 + ;;
27353 + .pred.rel.mutex pUStk,pKStk
27354 +(pKStk) ld4 r21=[r20] // r21 <- preempt_count
27355 +(pUStk) mov r21=0 // r21 <- 0
27356 + ;;
27357 + cmp.eq p6,p0=r21,r0 // p6 <- pUStk || (preempt_count == 0)
27358 +#else /* !CONFIG_PREEMPT */
27359 +#ifdef CONFIG_XEN
27360 + movl r2=XSI_PSR_I_ADDR
27361 + mov r18=1
27362 + ;;
27363 + ld8 r2=[r2]
27364 + ;;
27365 +(pUStk) st1 [r2]=r18
27366 +#else
27367 +(pUStk) rsm psr.i
27368 +#endif
27369 + cmp.eq pLvSys,p0=r0,r0 // pLvSys=1: leave from syscall
27370 +(pUStk) cmp.eq.unc p6,p0=r0,r0 // p6 <- pUStk
27371 +#endif
27372 +.work_processed_syscall:
27373 + adds r2=PT(LOADRS)+16,r12
27374 + adds r3=PT(AR_BSPSTORE)+16,r12
27375 + adds r18=TI_FLAGS+IA64_TASK_SIZE,r13
27376 + ;;
27377 +(p6) ld4 r31=[r18] // load current_thread_info()->flags
27378 + ld8 r19=[r2],PT(B6)-PT(LOADRS) // load ar.rsc value for "loadrs"
27379 + nop.i 0
27380 + ;;
27381 + mov r16=ar.bsp // M2 get existing backing store pointer
27382 + ld8 r18=[r2],PT(R9)-PT(B6) // load b6
27383 +(p6) and r15=TIF_WORK_MASK,r31 // any work other than TIF_SYSCALL_TRACE?
27384 + ;;
27385 + ld8 r23=[r3],PT(R11)-PT(AR_BSPSTORE) // load ar.bspstore (may be garbage)
27386 +(p6) cmp4.ne.unc p6,p0=r15, r0 // any special work pending?
27387 +(p6) br.cond.spnt .work_pending_syscall
27388 + ;;
27389 + // start restoring the state saved on the kernel stack (struct pt_regs):
27390 + ld8 r9=[r2],PT(CR_IPSR)-PT(R9)
27391 + ld8 r11=[r3],PT(CR_IIP)-PT(R11)
27392 +(pNonSys) break 0 // bug check: we shouldn't be here if pNonSys is TRUE!
27393 + ;;
27394 + invala // M0|1 invalidate ALAT
27395 +#ifdef CONFIG_XEN
27396 + movl r28=XSI_PSR_I_ADDR
27397 + movl r29=XSI_PSR_IC
27398 + ;;
27399 + ld8 r28=[r28]
27400 + mov r30=1
27401 + ;;
27402 + st1 [r28]=r30
27403 + st4 [r29]=r0 // note: clears both vpsr.i and vpsr.ic!
27404 + ;;
27405 +#else
27406 + rsm psr.i | psr.ic // M2 turn off interrupts and interruption collection
27407 +#endif
27408 + cmp.eq p9,p0=r0,r0 // A set p9 to indicate that we should restore cr.ifs
27409 +
27410 + ld8 r29=[r2],16 // M0|1 load cr.ipsr
27411 + ld8 r28=[r3],16 // M0|1 load cr.iip
27412 + mov r22=r0 // A clear r22
27413 + ;;
27414 + ld8 r30=[r2],16 // M0|1 load cr.ifs
27415 + ld8 r25=[r3],16 // M0|1 load ar.unat
27416 +(pUStk) add r14=IA64_TASK_THREAD_ON_USTACK_OFFSET,r13
27417 + ;;
27418 + ld8 r26=[r2],PT(B0)-PT(AR_PFS) // M0|1 load ar.pfs
27419 +(pKStk) mov r22=psr // M2 read PSR now that interrupts are disabled
27420 + nop 0
27421 + ;;
27422 + ld8 r21=[r2],PT(AR_RNAT)-PT(B0) // M0|1 load b0
27423 + ld8 r27=[r3],PT(PR)-PT(AR_RSC) // M0|1 load ar.rsc
27424 + mov f6=f0 // F clear f6
27425 + ;;
27426 + ld8 r24=[r2],PT(AR_FPSR)-PT(AR_RNAT) // M0|1 load ar.rnat (may be garbage)
27427 + ld8 r31=[r3],PT(R1)-PT(PR) // M0|1 load predicates
27428 + mov f7=f0 // F clear f7
27429 + ;;
27430 + ld8 r20=[r2],PT(R12)-PT(AR_FPSR) // M0|1 load ar.fpsr
27431 + ld8.fill r1=[r3],16 // M0|1 load r1
27432 +(pUStk) mov r17=1 // A
27433 + ;;
27434 +(pUStk) st1 [r14]=r17 // M2|3
27435 + ld8.fill r13=[r3],16 // M0|1
27436 + mov f8=f0 // F clear f8
27437 + ;;
27438 + ld8.fill r12=[r2] // M0|1 restore r12 (sp)
27439 + ld8.fill r15=[r3] // M0|1 restore r15
27440 + mov b6=r18 // I0 restore b6
27441 +
27442 + addl r17=THIS_CPU(ia64_phys_stacked_size_p8),r0 // A
27443 + mov f9=f0 // F clear f9
27444 +(pKStk) br.cond.dpnt.many skip_rbs_switch // B
27445 +
27446 + srlz.d // M0 ensure interruption collection is off (for cover)
27447 + shr.u r18=r19,16 // I0|1 get byte size of existing "dirty" partition
27448 +#ifdef CONFIG_XEN
27449 + XEN_HYPER_COVER;
27450 +#else
27451 + cover // B add current frame into dirty partition & set cr.ifs
27452 +#endif
27453 + ;;
27454 +(pUStk) ld4 r17=[r17] // M0|1 r17 = cpu_data->phys_stacked_size_p8
27455 + mov r19=ar.bsp // M2 get new backing store pointer
27456 + mov f10=f0 // F clear f10
27457 +
27458 + nop.m 0
27459 + movl r14=__kernel_syscall_via_epc // X
27460 + ;;
27461 + mov.m ar.csd=r0 // M2 clear ar.csd
27462 + mov.m ar.ccv=r0 // M2 clear ar.ccv
27463 + mov b7=r14 // I0 clear b7 (hint with __kernel_syscall_via_epc)
27464 +
27465 + mov.m ar.ssd=r0 // M2 clear ar.ssd
27466 + mov f11=f0 // F clear f11
27467 + br.cond.sptk.many rbs_switch // B
27468 +#ifdef CONFIG_XEN
27469 +END(xen_leave_syscall)
27470 +#else
27471 +END(ia64_leave_syscall)
27472 +#endif
27473 +
27474 +#ifdef CONFIG_XEN
27475 +GLOBAL_ENTRY(xen_leave_kernel)
27476 + PT_REGS_UNWIND_INFO(0)
27477 + movl r22=running_on_xen;;
27478 + ld4 r22=[r22];;
27479 + cmp.eq p7,p0=r22,r0
27480 +(p7) br.cond.sptk.many __ia64_leave_kernel;;
27481 +#else
27482 +GLOBAL_ENTRY(ia64_leave_kernel)
27483 + PT_REGS_UNWIND_INFO(0)
27484 +#endif
27485 + /*
27486 + * work.need_resched etc. mustn't get changed by this CPU before it returns to
27487 + * user- or fsys-mode, hence we disable interrupts early on.
27488 + *
27489 + * p6 controls whether current_thread_info()->flags needs to be check for
27490 + * extra work. We always check for extra work when returning to user-level.
27491 + * With CONFIG_PREEMPT, we also check for extra work when the preempt_count
27492 + * is 0. After extra work processing has been completed, execution
27493 + * resumes at .work_processed_syscall with p6 set to 1 if the extra-work-check
27494 + * needs to be redone.
27495 + */
27496 +#ifdef CONFIG_PREEMPT
27497 + rsm psr.i // disable interrupts
27498 + cmp.eq p0,pLvSys=r0,r0 // pLvSys=0: leave from kernel
27499 +(pKStk) adds r20=TI_PRE_COUNT+IA64_TASK_SIZE,r13
27500 + ;;
27501 + .pred.rel.mutex pUStk,pKStk
27502 +(pKStk) ld4 r21=[r20] // r21 <- preempt_count
27503 +(pUStk) mov r21=0 // r21 <- 0
27504 + ;;
27505 + cmp.eq p6,p0=r21,r0 // p6 <- pUStk || (preempt_count == 0)
27506 +#else
27507 +#ifdef CONFIG_XEN
27508 +(pUStk) movl r17=XSI_PSR_I_ADDR
27509 +(pUStk) mov r31=1
27510 + ;;
27511 +(pUStk) ld8 r17=[r17]
27512 + ;;
27513 +(pUStk) st1 [r17]=r31
27514 + ;;
27515 +#else
27516 +(pUStk) rsm psr.i
27517 +#endif
27518 + cmp.eq p0,pLvSys=r0,r0 // pLvSys=0: leave from kernel
27519 +(pUStk) cmp.eq.unc p6,p0=r0,r0 // p6 <- pUStk
27520 +#endif
27521 +.work_processed_kernel:
27522 + adds r17=TI_FLAGS+IA64_TASK_SIZE,r13
27523 + ;;
27524 +(p6) ld4 r31=[r17] // load current_thread_info()->flags
27525 + adds r21=PT(PR)+16,r12
27526 + ;;
27527 +
27528 + lfetch [r21],PT(CR_IPSR)-PT(PR)
27529 + adds r2=PT(B6)+16,r12
27530 + adds r3=PT(R16)+16,r12
27531 + ;;
27532 + lfetch [r21]
27533 + ld8 r28=[r2],8 // load b6
27534 + adds r29=PT(R24)+16,r12
27535 +
27536 + ld8.fill r16=[r3],PT(AR_CSD)-PT(R16)
27537 + adds r30=PT(AR_CCV)+16,r12
27538 +(p6) and r19=TIF_WORK_MASK,r31 // any work other than TIF_SYSCALL_TRACE?
27539 + ;;
27540 + ld8.fill r24=[r29]
27541 + ld8 r15=[r30] // load ar.ccv
27542 +(p6) cmp4.ne.unc p6,p0=r19, r0 // any special work pending?
27543 + ;;
27544 + ld8 r29=[r2],16 // load b7
27545 + ld8 r30=[r3],16 // load ar.csd
27546 +(p6) br.cond.spnt .work_pending
27547 + ;;
27548 + ld8 r31=[r2],16 // load ar.ssd
27549 + ld8.fill r8=[r3],16
27550 + ;;
27551 + ld8.fill r9=[r2],16
27552 + ld8.fill r10=[r3],PT(R17)-PT(R10)
27553 + ;;
27554 + ld8.fill r11=[r2],PT(R18)-PT(R11)
27555 + ld8.fill r17=[r3],16
27556 + ;;
27557 + ld8.fill r18=[r2],16
27558 + ld8.fill r19=[r3],16
27559 + ;;
27560 + ld8.fill r20=[r2],16
27561 + ld8.fill r21=[r3],16
27562 + mov ar.csd=r30
27563 + mov ar.ssd=r31
27564 + ;;
27565 +#ifdef CONFIG_XEN
27566 + movl r23=XSI_PSR_I_ADDR
27567 + movl r22=XSI_PSR_IC
27568 + ;;
27569 + ld8 r23=[r23]
27570 + mov r25=1
27571 + ;;
27572 + st1 [r23]=r25
27573 + st4 [r22]=r0 // note: clears both vpsr.i and vpsr.ic!
27574 + ;;
27575 +#else
27576 + rsm psr.i | psr.ic // initiate turning off of interrupt and interruption collection
27577 +#endif
27578 + invala // invalidate ALAT
27579 + ;;
27580 + ld8.fill r22=[r2],24
27581 + ld8.fill r23=[r3],24
27582 + mov b6=r28
27583 + ;;
27584 + ld8.fill r25=[r2],16
27585 + ld8.fill r26=[r3],16
27586 + mov b7=r29
27587 + ;;
27588 + ld8.fill r27=[r2],16
27589 + ld8.fill r28=[r3],16
27590 + ;;
27591 + ld8.fill r29=[r2],16
27592 + ld8.fill r30=[r3],24
27593 + ;;
27594 + ld8.fill r31=[r2],PT(F9)-PT(R31)
27595 + adds r3=PT(F10)-PT(F6),r3
27596 + ;;
27597 + ldf.fill f9=[r2],PT(F6)-PT(F9)
27598 + ldf.fill f10=[r3],PT(F8)-PT(F10)
27599 + ;;
27600 + ldf.fill f6=[r2],PT(F7)-PT(F6)
27601 + ;;
27602 + ldf.fill f7=[r2],PT(F11)-PT(F7)
27603 + ldf.fill f8=[r3],32
27604 + ;;
27605 + srlz.d // ensure that inter. collection is off (VHPT is don't care, since text is pinned)
27606 + mov ar.ccv=r15
27607 + ;;
27608 + ldf.fill f11=[r2]
27609 +#ifdef CONFIG_XEN
27610 + ;;
27611 + // r16-r31 all now hold bank1 values
27612 + movl r2=XSI_BANK1_R16
27613 + movl r3=XSI_BANK1_R16+8
27614 + ;;
27615 +.mem.offset 0,0; st8.spill [r2]=r16,16
27616 +.mem.offset 8,0; st8.spill [r3]=r17,16
27617 + ;;
27618 +.mem.offset 0,0; st8.spill [r2]=r18,16
27619 +.mem.offset 8,0; st8.spill [r3]=r19,16
27620 + ;;
27621 +.mem.offset 0,0; st8.spill [r2]=r20,16
27622 +.mem.offset 8,0; st8.spill [r3]=r21,16
27623 + ;;
27624 +.mem.offset 0,0; st8.spill [r2]=r22,16
27625 +.mem.offset 8,0; st8.spill [r3]=r23,16
27626 + ;;
27627 +.mem.offset 0,0; st8.spill [r2]=r24,16
27628 +.mem.offset 8,0; st8.spill [r3]=r25,16
27629 + ;;
27630 +.mem.offset 0,0; st8.spill [r2]=r26,16
27631 +.mem.offset 8,0; st8.spill [r3]=r27,16
27632 + ;;
27633 +.mem.offset 0,0; st8.spill [r2]=r28,16
27634 +.mem.offset 8,0; st8.spill [r3]=r29,16
27635 + ;;
27636 +.mem.offset 0,0; st8.spill [r2]=r30,16
27637 +.mem.offset 8,0; st8.spill [r3]=r31,16
27638 + ;;
27639 + movl r2=XSI_BANKNUM;;
27640 + st4 [r2]=r0;
27641 +#else
27642 + bsw.0 // switch back to bank 0 (no stop bit required beforehand...)
27643 +#endif
27644 + ;;
27645 +(pUStk) mov r18=IA64_KR(CURRENT)// M2 (12 cycle read latency)
27646 + adds r16=PT(CR_IPSR)+16,r12
27647 + adds r17=PT(CR_IIP)+16,r12
27648 +
27649 +(pKStk) mov r22=psr // M2 read PSR now that interrupts are disabled
27650 + nop.i 0
27651 + nop.i 0
27652 + ;;
27653 + ld8 r29=[r16],16 // load cr.ipsr
27654 + ld8 r28=[r17],16 // load cr.iip
27655 + ;;
27656 + ld8 r30=[r16],16 // load cr.ifs
27657 + ld8 r25=[r17],16 // load ar.unat
27658 + ;;
27659 + ld8 r26=[r16],16 // load ar.pfs
27660 + ld8 r27=[r17],16 // load ar.rsc
27661 + cmp.eq p9,p0=r0,r0 // set p9 to indicate that we should restore cr.ifs
27662 + ;;
27663 + ld8 r24=[r16],16 // load ar.rnat (may be garbage)
27664 + ld8 r23=[r17],16 // load ar.bspstore (may be garbage)
27665 + ;;
27666 + ld8 r31=[r16],16 // load predicates
27667 + ld8 r21=[r17],16 // load b0
27668 + ;;
27669 + ld8 r19=[r16],16 // load ar.rsc value for "loadrs"
27670 + ld8.fill r1=[r17],16 // load r1
27671 + ;;
27672 + ld8.fill r12=[r16],16
27673 + ld8.fill r13=[r17],16
27674 +(pUStk) adds r18=IA64_TASK_THREAD_ON_USTACK_OFFSET,r18
27675 + ;;
27676 + ld8 r20=[r16],16 // ar.fpsr
27677 + ld8.fill r15=[r17],16
27678 + ;;
27679 + ld8.fill r14=[r16],16
27680 + ld8.fill r2=[r17]
27681 +(pUStk) mov r17=1
27682 + ;;
27683 + ld8.fill r3=[r16]
27684 +(pUStk) st1 [r18]=r17 // restore current->thread.on_ustack
27685 + shr.u r18=r19,16 // get byte size of existing "dirty" partition
27686 + ;;
27687 + mov r16=ar.bsp // get existing backing store pointer
27688 + addl r17=THIS_CPU(ia64_phys_stacked_size_p8),r0
27689 + ;;
27690 + ld4 r17=[r17] // r17 = cpu_data->phys_stacked_size_p8
27691 +(pKStk) br.cond.dpnt skip_rbs_switch
27692 +
27693 + /*
27694 + * Restore user backing store.
27695 + *
27696 + * NOTE: alloc, loadrs, and cover can't be predicated.
27697 + */
27698 +(pNonSys) br.cond.dpnt dont_preserve_current_frame
27699 +
27700 +#ifdef CONFIG_XEN
27701 + XEN_HYPER_COVER;
27702 +#else
27703 + cover // add current frame into dirty partition and set cr.ifs
27704 +#endif
27705 + ;;
27706 + mov r19=ar.bsp // get new backing store pointer
27707 +rbs_switch:
27708 + sub r16=r16,r18 // krbs = old bsp - size of dirty partition
27709 + cmp.ne p9,p0=r0,r0 // clear p9 to skip restore of cr.ifs
27710 + ;;
27711 + sub r19=r19,r16 // calculate total byte size of dirty partition
27712 + add r18=64,r18 // don't force in0-in7 into memory...
27713 + ;;
27714 + shl r19=r19,16 // shift size of dirty partition into loadrs position
27715 + ;;
27716 +dont_preserve_current_frame:
27717 + /*
27718 + * To prevent leaking bits between the kernel and user-space,
27719 + * we must clear the stacked registers in the "invalid" partition here.
27720 + * Not pretty, but at least it's fast (3.34 registers/cycle on Itanium,
27721 + * 5 registers/cycle on McKinley).
27722 + */
27723 +# define pRecurse p6
27724 +# define pReturn p7
27725 +#ifdef CONFIG_ITANIUM
27726 +# define Nregs 10
27727 +#else
27728 +# define Nregs 14
27729 +#endif
27730 + alloc loc0=ar.pfs,2,Nregs-2,2,0
27731 + shr.u loc1=r18,9 // RNaTslots <= floor(dirtySize / (64*8))
27732 + sub r17=r17,r18 // r17 = (physStackedSize + 8) - dirtySize
27733 + ;;
27734 + mov ar.rsc=r19 // load ar.rsc to be used for "loadrs"
27735 + shladd in0=loc1,3,r17
27736 + mov in1=0
27737 + ;;
27738 + TEXT_ALIGN(32)
27739 +rse_clear_invalid:
27740 +#ifdef CONFIG_ITANIUM
27741 + // cycle 0
27742 + { .mii
27743 + alloc loc0=ar.pfs,2,Nregs-2,2,0
27744 + cmp.lt pRecurse,p0=Nregs*8,in0 // if more than Nregs regs left to clear, (re)curse
27745 + add out0=-Nregs*8,in0
27746 +}{ .mfb
27747 + add out1=1,in1 // increment recursion count
27748 + nop.f 0
27749 + nop.b 0 // can't do br.call here because of alloc (WAW on CFM)
27750 + ;;
27751 +}{ .mfi // cycle 1
27752 + mov loc1=0
27753 + nop.f 0
27754 + mov loc2=0
27755 +}{ .mib
27756 + mov loc3=0
27757 + mov loc4=0
27758 +(pRecurse) br.call.sptk.many b0=rse_clear_invalid
27759 +
27760 +}{ .mfi // cycle 2
27761 + mov loc5=0
27762 + nop.f 0
27763 + cmp.ne pReturn,p0=r0,in1 // if recursion count != 0, we need to do a br.ret
27764 +}{ .mib
27765 + mov loc6=0
27766 + mov loc7=0
27767 +(pReturn) br.ret.sptk.many b0
27768 +}
27769 +#else /* !CONFIG_ITANIUM */
27770 + alloc loc0=ar.pfs,2,Nregs-2,2,0
27771 + cmp.lt pRecurse,p0=Nregs*8,in0 // if more than Nregs regs left to clear, (re)curse
27772 + add out0=-Nregs*8,in0
27773 + add out1=1,in1 // increment recursion count
27774 + mov loc1=0
27775 + mov loc2=0
27776 + ;;
27777 + mov loc3=0
27778 + mov loc4=0
27779 + mov loc5=0
27780 + mov loc6=0
27781 + mov loc7=0
27782 +(pRecurse) br.call.dptk.few b0=rse_clear_invalid
27783 + ;;
27784 + mov loc8=0
27785 + mov loc9=0
27786 + cmp.ne pReturn,p0=r0,in1 // if recursion count != 0, we need to do a br.ret
27787 + mov loc10=0
27788 + mov loc11=0
27789 +(pReturn) br.ret.dptk.many b0
27790 +#endif /* !CONFIG_ITANIUM */
27791 +# undef pRecurse
27792 +# undef pReturn
27793 + ;;
27794 + alloc r17=ar.pfs,0,0,0,0 // drop current register frame
27795 + ;;
27796 + loadrs
27797 + ;;
27798 +skip_rbs_switch:
27799 + mov ar.unat=r25 // M2
27800 +(pKStk) extr.u r22=r22,21,1 // I0 extract current value of psr.pp from r22
27801 +(pLvSys)mov r19=r0 // A clear r19 for leave_syscall, no-op otherwise
27802 + ;;
27803 +(pUStk) mov ar.bspstore=r23 // M2
27804 +(pKStk) dep r29=r22,r29,21,1 // I0 update ipsr.pp with psr.pp
27805 +(pLvSys)mov r16=r0 // A clear r16 for leave_syscall, no-op otherwise
27806 + ;;
27807 +#ifdef CONFIG_XEN
27808 + movl r25=XSI_IPSR
27809 + ;;
27810 + st8[r25]=r29,XSI_IFS_OFS-XSI_IPSR_OFS
27811 + ;;
27812 +#else
27813 + mov cr.ipsr=r29 // M2
27814 +#endif
27815 + mov ar.pfs=r26 // I0
27816 +(pLvSys)mov r17=r0 // A clear r17 for leave_syscall, no-op otherwise
27817 +
27818 +#ifdef CONFIG_XEN
27819 +(p9) st8 [r25]=r30
27820 + ;;
27821 + adds r25=XSI_IIP_OFS-XSI_IFS_OFS,r25
27822 + ;;
27823 +#else
27824 +(p9) mov cr.ifs=r30 // M2
27825 +#endif
27826 + mov b0=r21 // I0
27827 +(pLvSys)mov r18=r0 // A clear r18 for leave_syscall, no-op otherwise
27828 +
27829 + mov ar.fpsr=r20 // M2
27830 +#ifdef CONFIG_XEN
27831 + st8 [r25]=r28
27832 +#else
27833 + mov cr.iip=r28 // M2
27834 +#endif
27835 + nop 0
27836 + ;;
27837 +(pUStk) mov ar.rnat=r24 // M2 must happen with RSE in lazy mode
27838 + nop 0
27839 +(pLvSys)mov r2=r0
27840 +
27841 + mov ar.rsc=r27 // M2
27842 + mov pr=r31,-1 // I0
27843 +#ifdef CONFIG_XEN
27844 + ;;
27845 + XEN_HYPER_RFI;
27846 +#else
27847 + rfi // B
27848 +#endif
27849 +
27850 + /*
27851 + * On entry:
27852 + * r20 = &current->thread_info->pre_count (if CONFIG_PREEMPT)
27853 + * r31 = current->thread_info->flags
27854 + * On exit:
27855 + * p6 = TRUE if work-pending-check needs to be redone
27856 + */
27857 +.work_pending_syscall:
27858 + add r2=-8,r2
27859 + add r3=-8,r3
27860 + ;;
27861 + st8 [r2]=r8
27862 + st8 [r3]=r10
27863 +.work_pending:
27864 + tbit.nz p6,p0=r31,TIF_SIGDELAYED // signal delayed from MCA/INIT/NMI/PMI context?
27865 +(p6) br.cond.sptk.few .sigdelayed
27866 + ;;
27867 + tbit.z p6,p0=r31,TIF_NEED_RESCHED // current_thread_info()->need_resched==0?
27868 +(p6) br.cond.sptk.few .notify
27869 +#ifdef CONFIG_PREEMPT
27870 +(pKStk) dep r21=-1,r0,PREEMPT_ACTIVE_BIT,1
27871 + ;;
27872 +(pKStk) st4 [r20]=r21
27873 + ssm psr.i // enable interrupts
27874 +#endif
27875 + br.call.spnt.many rp=schedule
27876 +.ret9: cmp.eq p6,p0=r0,r0 // p6 <- 1
27877 +#ifdef CONFIG_XEN
27878 + movl r2=XSI_PSR_I_ADDR
27879 + mov r20=1
27880 + ;;
27881 + ld8 r2=[r2]
27882 + ;;
27883 + st1 [r2]=r20
27884 +#else
27885 + rsm psr.i // disable interrupts
27886 +#endif
27887 + ;;
27888 +#ifdef CONFIG_PREEMPT
27889 +(pKStk) adds r20=TI_PRE_COUNT+IA64_TASK_SIZE,r13
27890 + ;;
27891 +(pKStk) st4 [r20]=r0 // preempt_count() <- 0
27892 +#endif
27893 +(pLvSys)br.cond.sptk.few .work_pending_syscall_end
27894 + br.cond.sptk.many .work_processed_kernel // re-check
27895 +
27896 +.notify:
27897 +(pUStk) br.call.spnt.many rp=notify_resume_user
27898 +.ret10: cmp.ne p6,p0=r0,r0 // p6 <- 0
27899 +(pLvSys)br.cond.sptk.few .work_pending_syscall_end
27900 + br.cond.sptk.many .work_processed_kernel // don't re-check
27901 +
27902 +// There is a delayed signal that was detected in MCA/INIT/NMI/PMI context where
27903 +// it could not be delivered. Deliver it now. The signal might be for us and
27904 +// may set TIF_SIGPENDING, so redrive ia64_leave_* after processing the delayed
27905 +// signal.
27906 +
27907 +.sigdelayed:
27908 + br.call.sptk.many rp=do_sigdelayed
27909 + cmp.eq p6,p0=r0,r0 // p6 <- 1, always re-check
27910 +(pLvSys)br.cond.sptk.few .work_pending_syscall_end
27911 + br.cond.sptk.many .work_processed_kernel // re-check
27912 +
27913 +.work_pending_syscall_end:
27914 + adds r2=PT(R8)+16,r12
27915 + adds r3=PT(R10)+16,r12
27916 + ;;
27917 + ld8 r8=[r2]
27918 + ld8 r10=[r3]
27919 + br.cond.sptk.many .work_processed_syscall // re-check
27920 +
27921 +#ifdef CONFIG_XEN
27922 +END(xen_leave_kernel)
27923 +#else
27924 +END(ia64_leave_kernel)
27925 +#endif
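All of the xen_* entry points in this file open with the same guard: load the running_on_xen flag and branch to the native __ia64_* routine when it is zero, so a single kernel image runs both on bare metal and under the hypervisor; only past that guard do the paravirtualized paths use the XSI_* shared-info fields and hyperprivops in place of privileged instructions. Roughly, in C (a sketch only — the real test happens in assembly before any C calling convention applies, and xen_only_switch_to() is a hypothetical stand-in for the paravirtualized body):

#include <linux/sched.h>                 /* struct task_struct */

extern int running_on_xen;                            /* set during early boot */
extern struct task_struct *__ia64_switch_to(struct task_struct *next);
/* Hypothetical stand-in for the Xen-specific body of xen_switch_to above. */
extern struct task_struct *xen_only_switch_to(struct task_struct *next);

static struct task_struct *switch_to_dispatch(struct task_struct *next)
{
        if (!running_on_xen)
                return __ia64_switch_to(next);        /* bare-metal path */
        return xen_only_switch_to(next);              /* paravirtualized path */
}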
27926 diff -Nur linux-2.6.16.33-noxen/arch/ia64/xen/xenhpski.c linux-2.6.16.33/arch/ia64/xen/xenhpski.c
27927 --- linux-2.6.16.33-noxen/arch/ia64/xen/xenhpski.c 1970-01-01 00:00:00.000000000 +0000
27928 +++ linux-2.6.16.33/arch/ia64/xen/xenhpski.c 2007-01-08 15:00:45.000000000 +0000
27929 @@ -0,0 +1,19 @@
27930 +
27931 +extern unsigned long xen_get_cpuid(int);
27932 +
27933 +int
27934 +running_on_sim(void)
27935 +{
27936 + int i;
27937 + long cpuid[6];
27938 +
27939 + for (i = 0; i < 5; ++i)
27940 + cpuid[i] = xen_get_cpuid(i);
27941 + if ((cpuid[0] & 0xff) != 'H') return 0;
27942 + if ((cpuid[3] & 0xff) != 0x4) return 0;
27943 + if (((cpuid[3] >> 8) & 0xff) != 0x0) return 0;
27944 + if (((cpuid[3] >> 16) & 0xff) != 0x0) return 0;
27945 + if (((cpuid[3] >> 24) & 0x7) != 0x7) return 0;
27946 + return 1;
27947 +}
27948 +
27949 diff -Nur linux-2.6.16.33-noxen/arch/ia64/xen/xenivt.S linux-2.6.16.33/arch/ia64/xen/xenivt.S
27950 --- linux-2.6.16.33-noxen/arch/ia64/xen/xenivt.S 1970-01-01 00:00:00.000000000 +0000
27951 +++ linux-2.6.16.33/arch/ia64/xen/xenivt.S 2007-01-08 15:00:45.000000000 +0000
27952 @@ -0,0 +1,2180 @@
27953 +/*
27954 + * arch/ia64/xen/ivt.S
27955 + *
27956 + * Copyright (C) 2005 Hewlett-Packard Co
27957 + * Dan Magenheimer <dan.magenheimer@hp.com>
27958 + */
27959 +/*
27960 + * This file defines the interruption vector table used by the CPU.
27961 + * It does not include one entry per possible cause of interruption.
27962 + *
27963 + * The first 20 entries of the table contain 64 bundles each while the
27964 + * remaining 48 entries contain only 16 bundles each.
27965 + *
27966 + * The 64 bundles are used to allow inlining the whole handler for critical
27967 + * interruptions like TLB misses.
27968 + *
27969 + * For each entry, the comment is as follows:
27970 + *
27971 + * // 0x1c00 Entry 7 (size 64 bundles) Data Key Miss (12,51)
27972 + * entry offset ----/ / / / /
27973 + * entry number ---------/ / / /
27974 + * size of the entry -------------/ / /
27975 + * vector name -------------------------------------/ /
27976 + * interruptions triggering this vector ----------------------/
27977 + *
27978 + * The table is 32KB in size and must be aligned on 32KB boundary.
27979 + * (The CPU ignores the 15 lower bits of the address)
27980 + *
27981 + * Table is based upon EAS2.6 (Oct 1999)
27982 + */
27983 +
27984 +#include <linux/config.h>
27985 +
27986 +#include <asm/asmmacro.h>
27987 +#include <asm/break.h>
27988 +#include <asm/ia32.h>
27989 +#include <asm/kregs.h>
27990 +#include <asm/asm-offsets.h>
27991 +#include <asm/pgtable.h>
27992 +#include <asm/processor.h>
27993 +#include <asm/ptrace.h>
27994 +#include <asm/system.h>
27995 +#include <asm/thread_info.h>
27996 +#include <asm/unistd.h>
27997 +#include <asm/errno.h>
27998 +
27999 +#ifdef CONFIG_XEN
28000 +#define ia64_ivt xen_ivt
28001 +#endif
28002 +
28003 +#if 1
28004 +# define PSR_DEFAULT_BITS psr.ac
28005 +#else
28006 +# define PSR_DEFAULT_BITS 0
28007 +#endif
28008 +
28009 +#if 0
28010 + /*
28011 + * This lets you track the last eight faults that occurred on the CPU. Make sure ar.k2 isn't
28012 + * needed for something else before enabling this...
28013 + */
28014 +# define DBG_FAULT(i) mov r16=ar.k2;; shl r16=r16,8;; add r16=(i),r16;;mov ar.k2=r16
28015 +#else
28016 +# define DBG_FAULT(i)
28017 +#endif
28018 +
28019 +#define MINSTATE_VIRT /* needed by minstate.h */
28020 +#include "xenminstate.h"
28021 +
28022 +#define FAULT(n) \
28023 + mov r31=pr; \
28024 + mov r19=n;; /* prepare to save predicates */ \
28025 + br.sptk.many dispatch_to_fault_handler
28026 +
28027 + .section .text.ivt,"ax"
28028 +
28029 + .align 32768 // align on 32KB boundary
28030 + .global ia64_ivt
28031 +ia64_ivt:
28032 +/////////////////////////////////////////////////////////////////////////////////////////
28033 +// 0x0000 Entry 0 (size 64 bundles) VHPT Translation (8,20,47)
28034 +ENTRY(vhpt_miss)
28035 + DBG_FAULT(0)
28036 + /*
28037 + * The VHPT vector is invoked when the TLB entry for the virtual page table
28038 + * is missing. This happens only as a result of a previous
28039 + * (the "original") TLB miss, which may either be caused by an instruction
28040 + * fetch or a data access (or non-access).
28041 + *
28042 + * What we do here is normal TLB miss handling for the _original_ miss,
28043 + * followed by inserting the TLB entry for the virtual page table page
28044 + * that the VHPT walker was attempting to access. The latter gets
28045 + * inserted as long as page table entry above pte level have valid
28046 + * mappings for the faulting address. The TLB entry for the original
28047 + * miss gets inserted only if the pte entry indicates that the page is
28048 + * present.
28049 + *
28050 + * do_page_fault gets invoked in the following cases:
28051 + * - the faulting virtual address uses unimplemented address bits
28052 + * - the faulting virtual address has no valid page table mapping
28053 + */
28054 +#ifdef CONFIG_XEN
28055 + movl r16=XSI_IFA
28056 + ;;
28057 + ld8 r16=[r16]
28058 +#ifdef CONFIG_HUGETLB_PAGE
28059 + movl r18=PAGE_SHIFT
28060 + movl r25=XSI_ITIR
28061 + ;;
28062 + ld8 r25=[r25]
28063 +#endif
28064 + ;;
28065 +#else
28066 + mov r16=cr.ifa // get address that caused the TLB miss
28067 +#ifdef CONFIG_HUGETLB_PAGE
28068 + movl r18=PAGE_SHIFT
28069 + mov r25=cr.itir
28070 +#endif
28071 +#endif
28072 + ;;
28073 +#ifdef CONFIG_XEN
28074 + XEN_HYPER_RSM_PSR_DT;
28075 +#else
28076 + rsm psr.dt // use physical addressing for data
28077 +#endif
28078 + mov r31=pr // save the predicate registers
28079 + mov r19=IA64_KR(PT_BASE) // get page table base address
28080 + shl r21=r16,3 // shift bit 60 into sign bit
28081 + shr.u r17=r16,61 // get the region number into r17
28082 + ;;
28083 + shr.u r22=r21,3
28084 +#ifdef CONFIG_HUGETLB_PAGE
28085 + extr.u r26=r25,2,6
28086 + ;;
28087 + cmp.ne p8,p0=r18,r26
28088 + sub r27=r26,r18
28089 + ;;
28090 +(p8) dep r25=r18,r25,2,6
28091 +(p8) shr r22=r22,r27
28092 +#endif
28093 + ;;
28094 + cmp.eq p6,p7=5,r17 // is IFA pointing into to region 5?
28095 + shr.u r18=r22,PGDIR_SHIFT // get bottom portion of pgd index bit
28096 + ;;
28097 +(p7) dep r17=r17,r19,(PAGE_SHIFT-3),3 // put region number bits in place
28098 +
28099 + srlz.d
28100 + LOAD_PHYSICAL(p6, r19, swapper_pg_dir) // region 5 is rooted at swapper_pg_dir
28101 +
28102 + .pred.rel "mutex", p6, p7
28103 +(p6) shr.u r21=r21,PGDIR_SHIFT+PAGE_SHIFT
28104 +(p7) shr.u r21=r21,PGDIR_SHIFT+PAGE_SHIFT-3
28105 + ;;
28106 +(p6) dep r17=r18,r19,3,(PAGE_SHIFT-3) // r17=pgd_offset for region 5
28107 +(p7) dep r17=r18,r17,3,(PAGE_SHIFT-6) // r17=pgd_offset for region[0-4]
28108 + cmp.eq p7,p6=0,r21 // unused address bits all zeroes?
28109 +#ifdef CONFIG_PGTABLE_4
28110 + shr.u r28=r22,PUD_SHIFT // shift pud index into position
28111 +#else
28112 + shr.u r18=r22,PMD_SHIFT // shift pmd index into position
28113 +#endif
28114 + ;;
28115 + ld8 r17=[r17] // get *pgd (may be 0)
28116 + ;;
28117 +(p7) cmp.eq p6,p7=r17,r0 // was pgd_present(*pgd) == NULL?
28118 +#ifdef CONFIG_PGTABLE_4
28119 + dep r28=r28,r17,3,(PAGE_SHIFT-3) // r28=pud_offset(pgd,addr)
28120 + ;;
28121 + shr.u r18=r22,PMD_SHIFT // shift pmd index into position
28122 +(p7) ld8 r29=[r28] // get *pud (may be 0)
28123 + ;;
28124 +(p7) cmp.eq.or.andcm p6,p7=r29,r0 // was pud_present(*pud) == NULL?
28125 + dep r17=r18,r29,3,(PAGE_SHIFT-3) // r17=pmd_offset(pud,addr)
28126 +#else
28127 + dep r17=r18,r17,3,(PAGE_SHIFT-3) // r17=pmd_offset(pgd,addr)
28128 +#endif
28129 + ;;
28130 +(p7) ld8 r20=[r17] // get *pmd (may be 0)
28131 + shr.u r19=r22,PAGE_SHIFT // shift pte index into position
28132 + ;;
28133 +(p7) cmp.eq.or.andcm p6,p7=r20,r0 // was pmd_present(*pmd) == NULL?
28134 + dep r21=r19,r20,3,(PAGE_SHIFT-3) // r21=pte_offset(pmd,addr)
28135 + ;;
28136 +(p7) ld8 r18=[r21] // read *pte
28137 +#ifdef CONFIG_XEN
28138 + movl r19=XSI_ISR
28139 + ;;
28140 + ld8 r19=[r19]
28141 +#else
28142 + mov r19=cr.isr // cr.isr bit 32 tells us if this is an insn miss
28143 +#endif
28144 + ;;
28145 +(p7) tbit.z p6,p7=r18,_PAGE_P_BIT // page present bit cleared?
28146 +#ifdef CONFIG_XEN
28147 + movl r22=XSI_IHA
28148 + ;;
28149 + ld8 r22=[r22]
28150 +#else
28151 + mov r22=cr.iha // get the VHPT address that caused the TLB miss
28152 +#endif
28153 + ;; // avoid RAW on p7
28154 +(p7) tbit.nz.unc p10,p11=r19,32 // is it an instruction TLB miss?
28155 + dep r23=0,r20,0,PAGE_SHIFT // clear low bits to get page address
28156 + ;;
28157 +#ifdef CONFIG_XEN
28158 + mov r24=r8
28159 + mov r8=r18
28160 + ;;
28161 +(p10) XEN_HYPER_ITC_I
28162 + ;;
28163 +(p11) XEN_HYPER_ITC_D
28164 + ;;
28165 + mov r8=r24
28166 + ;;
28167 +#else
28168 +(p10) itc.i r18 // insert the instruction TLB entry
28169 +(p11) itc.d r18 // insert the data TLB entry
28170 +#endif
28171 +(p6) br.cond.spnt.many page_fault // handle bad address/page not present (page fault)
28172 +#ifdef CONFIG_XEN
28173 + movl r24=XSI_IFA
28174 + ;;
28175 + st8 [r24]=r22
28176 + ;;
28177 +#else
28178 + mov cr.ifa=r22
28179 +#endif
28180 +
28181 +#ifdef CONFIG_HUGETLB_PAGE
28182 +(p8) mov cr.itir=r25 // change to default page-size for VHPT
28183 +#endif
28184 +
28185 + /*
28186 + * Now compute and insert the TLB entry for the virtual page table. We never
28187 + * execute in a page table page so there is no need to set the exception deferral
28188 + * bit.
28189 + */
28190 + adds r24=__DIRTY_BITS_NO_ED|_PAGE_PL_0|_PAGE_AR_RW,r23
28191 + ;;
28192 +#ifdef CONFIG_XEN
28193 +(p7) mov r25=r8
28194 +(p7) mov r8=r24
28195 + ;;
28196 +(p7) XEN_HYPER_ITC_D
28197 + ;;
28198 +(p7) mov r8=r25
28199 + ;;
28200 +#else
28201 +(p7) itc.d r24
28202 +#endif
28203 + ;;
28204 +#ifdef CONFIG_SMP
28205 + /*
28206 + * Tell the assemblers dependency-violation checker that the above "itc" instructions
28207 + * cannot possibly affect the following loads:
28208 + */
28209 + dv_serialize_data
28210 +
28211 + /*
28212 + * Re-check pagetable entry. If they changed, we may have received a ptc.g
28213 + * between reading the pagetable and the "itc". If so, flush the entry we
28214 + * inserted and retry. At this point, we have:
28215 + *
28216 + * r28 = equivalent of pud_offset(pgd, ifa)
28217 + * r17 = equivalent of pmd_offset(pud, ifa)
28218 + * r21 = equivalent of pte_offset(pmd, ifa)
28219 + *
28220 + * r29 = *pud
28221 + * r20 = *pmd
28222 + * r18 = *pte
28223 + */
28224 + ld8 r25=[r21] // read *pte again
28225 + ld8 r26=[r17] // read *pmd again
28226 +#ifdef CONFIG_PGTABLE_4
28227 + ld8 r19=[r28] // read *pud again
28228 +#endif
28229 + cmp.ne p6,p7=r0,r0
28230 + ;;
28231 + cmp.ne.or.andcm p6,p7=r26,r20 // did *pmd change
28232 +#ifdef CONFIG_PGTABLE_4
28233 + cmp.ne.or.andcm p6,p7=r19,r29 // did *pud change
28234 +#endif
28235 + mov r27=PAGE_SHIFT<<2
28236 + ;;
28237 +(p6) ptc.l r22,r27 // purge PTE page translation
28238 +(p7) cmp.ne.or.andcm p6,p7=r25,r18 // did *pte change
28239 + ;;
28240 +(p6) ptc.l r16,r27 // purge translation
28241 +#endif
28242 +
28243 + mov pr=r31,-1 // restore predicate registers
28244 +#ifdef CONFIG_XEN
28245 + XEN_HYPER_RFI
28246 + dv_serialize_data
28247 +#else
28248 + rfi
28249 +#endif
28250 +END(vhpt_miss)
28251 +
28252 + .org ia64_ivt+0x400
28253 +/////////////////////////////////////////////////////////////////////////////////////////
28254 +// 0x0400 Entry 1 (size 64 bundles) ITLB (21)
28255 +ENTRY(itlb_miss)
28256 + DBG_FAULT(1)
28257 + /*
28258 + * The ITLB handler accesses the PTE via the virtually mapped linear
28259 + * page table. If a nested TLB miss occurs, we switch into physical
28260 + * mode, walk the page table, and then re-execute the PTE read and
28261 + * go on normally after that.
28262 + */
28263 +#ifdef CONFIG_XEN
28264 + movl r16=XSI_IFA
28265 + ;;
28266 + ld8 r16=[r16]
28267 +#else
28268 + mov r16=cr.ifa // get virtual address
28269 +#endif
28270 + mov r29=b0 // save b0
28271 + mov r31=pr // save predicates
28272 +.itlb_fault:
28273 +#ifdef CONFIG_XEN
28274 + movl r17=XSI_IHA
28275 + ;;
28276 + ld8 r17=[r17] // get virtual address of L3 PTE
28277 +#else
28278 + mov r17=cr.iha // get virtual address of PTE
28279 +#endif
28280 + movl r30=1f // load nested fault continuation point
28281 + ;;
28282 +1: ld8 r18=[r17] // read *pte
28283 + ;;
28284 + mov b0=r29
28285 + tbit.z p6,p0=r18,_PAGE_P_BIT // page present bit cleared?
28286 +(p6) br.cond.spnt page_fault
28287 + ;;
28288 +#ifdef CONFIG_XEN
28289 + mov r19=r8
28290 + mov r8=r18
28291 + ;;
28292 + XEN_HYPER_ITC_I
28293 + ;;
28294 + mov r8=r19
28295 +#else
28296 + itc.i r18
28297 +#endif
28298 + ;;
28299 +#ifdef CONFIG_SMP
28300 + /*
28301 + * Tell the assemblers dependency-violation checker that the above "itc" instructions
28302 + * cannot possibly affect the following loads:
28303 + */
28304 + dv_serialize_data
28305 +
28306 + ld8 r19=[r17] // read *pte again and see if same
28307 + mov r20=PAGE_SHIFT<<2 // setup page size for purge
28308 + ;;
28309 + cmp.ne p7,p0=r18,r19
28310 + ;;
28311 +(p7) ptc.l r16,r20
28312 +#endif
28313 + mov pr=r31,-1
28314 +#ifdef CONFIG_XEN
28315 + XEN_HYPER_RFI
28316 + dv_serialize_data
28317 +#else
28318 + rfi
28319 +#endif
28320 +END(itlb_miss)
28321 +
28322 + .org ia64_ivt+0x0800
28323 +/////////////////////////////////////////////////////////////////////////////////////////
28324 +// 0x0800 Entry 2 (size 64 bundles) DTLB (9,48)
28325 +ENTRY(dtlb_miss)
28326 + DBG_FAULT(2)
28327 + /*
28328 + * The DTLB handler accesses the PTE via the virtually mapped linear
28329 + * page table. If a nested TLB miss occurs, we switch into physical
28330 + * mode, walk the page table, and then re-execute the PTE read and
28331 + * go on normally after that.
28332 + */
28333 +#ifdef CONFIG_XEN
28334 + movl r16=XSI_IFA
28335 + ;;
28336 + ld8 r16=[r16]
28337 +#else
28338 + mov r16=cr.ifa // get virtual address
28339 +#endif
28340 + mov r29=b0 // save b0
28341 + mov r31=pr // save predicates
28342 +dtlb_fault:
28343 +#ifdef CONFIG_XEN
28344 + movl r17=XSI_IHA
28345 + ;;
28346 + ld8 r17=[r17] // get virtual address of L3 PTE
28347 +#else
28348 + mov r17=cr.iha // get virtual address of PTE
28349 +#endif
28350 + movl r30=1f // load nested fault continuation point
28351 + ;;
28352 +1: ld8 r18=[r17] // read *pte
28353 + ;;
28354 + mov b0=r29
28355 + tbit.z p6,p0=r18,_PAGE_P_BIT // page present bit cleared?
28356 +(p6) br.cond.spnt page_fault
28357 + ;;
28358 +#ifdef CONFIG_XEN
28359 + mov r19=r8
28360 + mov r8=r18
28361 + ;;
28362 + XEN_HYPER_ITC_D
28363 + ;;
28364 + mov r8=r19
28365 + ;;
28366 +#else
28367 + itc.d r18
28368 +#endif
28369 + ;;
28370 +#ifdef CONFIG_SMP
28371 + /*
28372 +	 * Tell the assembler's dependency-violation checker that the above "itc" instructions
28373 + * cannot possibly affect the following loads:
28374 + */
28375 + dv_serialize_data
28376 +
28377 + ld8 r19=[r17] // read *pte again and see if same
28378 + mov r20=PAGE_SHIFT<<2 // setup page size for purge
28379 + ;;
28380 + cmp.ne p7,p0=r18,r19
28381 + ;;
28382 +(p7) ptc.l r16,r20
28383 +#endif
28384 + mov pr=r31,-1
28385 +#ifdef CONFIG_XEN
28386 + XEN_HYPER_RFI
28387 + dv_serialize_data
28388 +#else
28389 + rfi
28390 +#endif
28391 +END(dtlb_miss)
28392 +
28393 + .org ia64_ivt+0x0c00
28394 +/////////////////////////////////////////////////////////////////////////////////////////
28395 +// 0x0c00 Entry 3 (size 64 bundles) Alt ITLB (19)
28396 +ENTRY(alt_itlb_miss)
28397 + DBG_FAULT(3)
28398 +#ifdef CONFIG_XEN
28399 + movl r31=XSI_IPSR
28400 + ;;
28401 + ld8 r21=[r31],XSI_IFA_OFS-XSI_IPSR_OFS // get ipsr, point to ifa
28402 + movl r17=PAGE_KERNEL
28403 + ;;
28404 + ld8 r16=[r31] // get ifa
28405 +#else
28406 + mov r16=cr.ifa // get address that caused the TLB miss
28407 + movl r17=PAGE_KERNEL
28408 + mov r21=cr.ipsr
28409 +#endif
28410 + movl r19=(((1 << IA64_MAX_PHYS_BITS) - 1) & ~0xfff)
28411 + mov r31=pr
28412 + ;;
28413 +#ifdef CONFIG_DISABLE_VHPT
28414 +	shr.u r22=r16,61			// get the region number into r22
28415 + ;;
28416 + cmp.gt p8,p0=6,r22 // user mode
28417 + ;;
28418 +#ifndef CONFIG_XEN
28419 +(p8) thash r17=r16
28420 + ;;
28421 +(p8) mov cr.iha=r17
28422 +#endif
28423 +(p8) mov r29=b0 // save b0
28424 +(p8) br.cond.dptk .itlb_fault
28425 +#endif
28426 + extr.u r23=r21,IA64_PSR_CPL0_BIT,2 // extract psr.cpl
28427 + and r19=r19,r16 // clear ed, reserved bits, and PTE control bits
28428 + shr.u r18=r16,57 // move address bit 61 to bit 4
28429 + ;;
28430 + andcm r18=0x10,r18 // bit 4=~address-bit(61)
28431 + cmp.ne p8,p0=r0,r23 // psr.cpl != 0?
28432 + or r19=r17,r19 // insert PTE control bits into r19
28433 + ;;
28434 + or r19=r19,r18 // set bit 4 (uncached) if the access was to region 6
28435 +(p8) br.cond.spnt page_fault
28436 + ;;
28437 +#ifdef CONFIG_XEN
28438 + mov r18=r8
28439 + mov r8=r19
28440 + ;;
28441 + XEN_HYPER_ITC_I
28442 + ;;
28443 + mov r8=r18
28444 + ;;
28445 + mov pr=r31,-1
28446 + ;;
28447 + XEN_HYPER_RFI;
28448 +#else
28449 + itc.i r19 // insert the TLB entry
28450 + mov pr=r31,-1
28451 + rfi
28452 +#endif
28453 +END(alt_itlb_miss)
28454 +
28455 + .org ia64_ivt+0x1000
28456 +/////////////////////////////////////////////////////////////////////////////////////////
28457 +// 0x1000 Entry 4 (size 64 bundles) Alt DTLB (7,46)
28458 +ENTRY(alt_dtlb_miss)
28459 + DBG_FAULT(4)
28460 +#ifdef CONFIG_XEN
28461 + movl r31=XSI_IPSR
28462 + ;;
28463 + ld8 r21=[r31],XSI_ISR_OFS-XSI_IPSR_OFS // get ipsr, point to isr
28464 + movl r17=PAGE_KERNEL
28465 + ;;
28466 + ld8 r20=[r31],XSI_IFA_OFS-XSI_ISR_OFS // get isr, point to ifa
28467 + movl r19=(((1 << IA64_MAX_PHYS_BITS) - 1) & ~0xfff)
28468 + ;;
28469 + ld8 r16=[r31] // get ifa
28470 +#else
28471 + mov r16=cr.ifa // get address that caused the TLB miss
28472 + movl r17=PAGE_KERNEL
28473 + mov r20=cr.isr
28474 + movl r19=(((1 << IA64_MAX_PHYS_BITS) - 1) & ~0xfff)
28475 + mov r21=cr.ipsr
28476 +#endif
28477 + mov r31=pr
28478 + ;;
28479 +#ifdef CONFIG_DISABLE_VHPT
28480 +	shr.u r22=r16,61			// get the region number into r22
28481 + ;;
28482 + cmp.gt p8,p0=6,r22 // access to region 0-5
28483 + ;;
28484 +#ifndef CONFIG_XEN
28485 +(p8) thash r17=r16
28486 + ;;
28487 +(p8) mov cr.iha=r17
28488 +#endif
28489 +(p8) mov r29=b0 // save b0
28490 +(p8) br.cond.dptk dtlb_fault
28491 +#endif
28492 + extr.u r23=r21,IA64_PSR_CPL0_BIT,2 // extract psr.cpl
28493 + and r22=IA64_ISR_CODE_MASK,r20 // get the isr.code field
28494 + tbit.nz p6,p7=r20,IA64_ISR_SP_BIT // is speculation bit on?
28495 + shr.u r18=r16,57 // move address bit 61 to bit 4
28496 + and r19=r19,r16 // clear ed, reserved bits, and PTE control bits
28497 + tbit.nz p9,p0=r20,IA64_ISR_NA_BIT // is non-access bit on?
28498 + ;;
28499 + andcm r18=0x10,r18 // bit 4=~address-bit(61)
28500 + cmp.ne p8,p0=r0,r23
28501 +(p9) cmp.eq.or.andcm p6,p7=IA64_ISR_CODE_LFETCH,r22 // check isr.code field
28502 +(p8) br.cond.spnt page_fault
28503 +
28504 + dep r21=-1,r21,IA64_PSR_ED_BIT,1
28505 + or r19=r19,r17 // insert PTE control bits into r19
28506 + ;;
28507 + or r19=r19,r18 // set bit 4 (uncached) if the access was to region 6
28508 +(p6) mov cr.ipsr=r21
28509 + ;;
28510 +#ifdef CONFIG_XEN
28511 +(p7) mov r18=r8
28512 +(p7) mov r8=r19
28513 + ;;
28514 +(p7) XEN_HYPER_ITC_D
28515 + ;;
28516 +(p7) mov r8=r18
28517 + ;;
28518 + mov pr=r31,-1
28519 + ;;
28520 + XEN_HYPER_RFI;
28521 +#else
28522 +(p7) itc.d r19 // insert the TLB entry
28523 + mov pr=r31,-1
28524 + rfi
28525 +#endif
28526 +END(alt_dtlb_miss)
28527 +
28528 + .org ia64_ivt+0x1400
28529 +/////////////////////////////////////////////////////////////////////////////////////////
28530 +// 0x1400 Entry 5 (size 64 bundles) Data nested TLB (6,45)
28531 +ENTRY(nested_dtlb_miss)
28532 + /*
28533 + * In the absence of kernel bugs, we get here when the virtually mapped linear
28534 + * page table is accessed non-speculatively (e.g., in the Dirty-bit, Instruction
28535 + * Access-bit, or Data Access-bit faults). If the DTLB entry for the virtual page
28536 + * table is missing, a nested TLB miss fault is triggered and control is
28537 + * transferred to this point. When this happens, we lookup the pte for the
28538 + * faulting address by walking the page table in physical mode and return to the
28539 + * continuation point passed in register r30 (or call page_fault if the address is
28540 + * not mapped).
28541 + *
28542 + * Input: r16: faulting address
28543 + * r29: saved b0
28544 + * r30: continuation address
28545 + * r31: saved pr
28546 + *
28547 + * Output: r17: physical address of PTE of faulting address
28548 + * r29: saved b0
28549 + * r30: continuation address
28550 + * r31: saved pr
28551 + *
28552 + * Clobbered: b0, r18, r19, r21, r22, psr.dt (cleared)
28553 + */
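+	/*
+	 * A rough C-style sketch of the walk done below (illustrative names
+	 * only; the real code runs with psr.dt off, i.e. on physical
+	 * addresses):
+	 *
+	 *	pgd = (region(addr) == 5) ? swapper_pg_dir
+	 *				  : IA64_KR(PT_BASE) + region bits;
+	 *	if (*pgd_entry(pgd, addr) == 0) goto page_fault;
+	 *	// with CONFIG_PGTABLE_4 an extra pud level is walked here
+	 *	if (*pmd_entry == 0) goto page_fault;
+	 *	r17 = &pte(addr);	// physical address of the PTE
+	 *	goto *r30;		// back to the continuation point
+	 */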
28554 +#ifdef CONFIG_XEN
28555 + XEN_HYPER_RSM_PSR_DT;
28556 +#else
28557 + rsm psr.dt // switch to using physical data addressing
28558 +#endif
28559 + mov r19=IA64_KR(PT_BASE) // get the page table base address
28560 + shl r21=r16,3 // shift bit 60 into sign bit
28561 +#ifdef CONFIG_XEN
28562 + movl r18=XSI_ITIR
28563 + ;;
28564 + ld8 r18=[r18]
28565 +#else
28566 + mov r18=cr.itir
28567 +#endif
28568 + ;;
28569 + shr.u r17=r16,61 // get the region number into r17
28570 + extr.u r18=r18,2,6 // get the faulting page size
28571 + ;;
28572 + cmp.eq p6,p7=5,r17 // is faulting address in region 5?
28573 + add r22=-PAGE_SHIFT,r18 // adjustment for hugetlb address
28574 + add r18=PGDIR_SHIFT-PAGE_SHIFT,r18
28575 + ;;
28576 + shr.u r22=r16,r22
28577 + shr.u r18=r16,r18
28578 +(p7) dep r17=r17,r19,(PAGE_SHIFT-3),3 // put region number bits in place
28579 +
28580 + srlz.d
28581 + LOAD_PHYSICAL(p6, r19, swapper_pg_dir) // region 5 is rooted at swapper_pg_dir
28582 +
28583 + .pred.rel "mutex", p6, p7
28584 +(p6) shr.u r21=r21,PGDIR_SHIFT+PAGE_SHIFT
28585 +(p7) shr.u r21=r21,PGDIR_SHIFT+PAGE_SHIFT-3
28586 + ;;
28587 +(p6) dep r17=r18,r19,3,(PAGE_SHIFT-3) // r17=pgd_offset for region 5
28588 +(p7) dep r17=r18,r17,3,(PAGE_SHIFT-6) // r17=pgd_offset for region[0-4]
28589 + cmp.eq p7,p6=0,r21 // unused address bits all zeroes?
28590 +#ifdef CONFIG_PGTABLE_4
28591 + shr.u r18=r22,PUD_SHIFT // shift pud index into position
28592 +#else
28593 + shr.u r18=r22,PMD_SHIFT // shift pmd index into position
28594 +#endif
28595 + ;;
28596 + ld8 r17=[r17] // get *pgd (may be 0)
28597 + ;;
28598 +(p7) cmp.eq p6,p7=r17,r0 // was pgd_present(*pgd) == NULL?
28599 + dep r17=r18,r17,3,(PAGE_SHIFT-3) // r17=p[u|m]d_offset(pgd,addr)
28600 + ;;
28601 +#ifdef CONFIG_PGTABLE_4
28602 +(p7) ld8 r17=[r17] // get *pud (may be 0)
28603 + shr.u r18=r22,PMD_SHIFT // shift pmd index into position
28604 + ;;
28605 +(p7) cmp.eq.or.andcm p6,p7=r17,r0 // was pud_present(*pud) == NULL?
28606 + dep r17=r18,r17,3,(PAGE_SHIFT-3) // r17=pmd_offset(pud,addr)
28607 + ;;
28608 +#endif
28609 +(p7) ld8 r17=[r17] // get *pmd (may be 0)
28610 + shr.u r19=r22,PAGE_SHIFT // shift pte index into position
28611 + ;;
28612 +(p7) cmp.eq.or.andcm p6,p7=r17,r0 // was pmd_present(*pmd) == NULL?
28613 + dep r17=r19,r17,3,(PAGE_SHIFT-3) // r17=pte_offset(pmd,addr);
28614 +(p6) br.cond.spnt page_fault
28615 + mov b0=r30
28616 + br.sptk.many b0 // return to continuation point
28617 +END(nested_dtlb_miss)
28618 +
28619 + .org ia64_ivt+0x1800
28620 +/////////////////////////////////////////////////////////////////////////////////////////
28621 +// 0x1800 Entry 6 (size 64 bundles) Instruction Key Miss (24)
28622 +ENTRY(ikey_miss)
28623 + DBG_FAULT(6)
28624 + FAULT(6)
28625 +END(ikey_miss)
28626 +
28627 + //-----------------------------------------------------------------------------------
28628 + // call do_page_fault (predicates are in r31, psr.dt may be off, r16 is faulting address)
28629 +ENTRY(page_fault)
28630 +#ifdef CONFIG_XEN
28631 + XEN_HYPER_SSM_PSR_DT
28632 +#else
28633 + ssm psr.dt
28634 + ;;
28635 + srlz.i
28636 +#endif
28637 + ;;
28638 + SAVE_MIN_WITH_COVER
28639 + alloc r15=ar.pfs,0,0,3,0
28640 +#ifdef CONFIG_XEN
28641 + movl r3=XSI_ISR
28642 + ;;
28643 + ld8 out1=[r3],XSI_IFA_OFS-XSI_ISR_OFS // get vcr.isr, point to ifa
28644 + ;;
28645 + ld8 out0=[r3] // get vcr.ifa
28646 + mov r14=1
28647 + ;;
28648 + add r3=XSI_PSR_IC_OFS-XSI_IFA_OFS, r3 // point to vpsr.ic
28649 + ;;
28650 + st4 [r3]=r14 // vpsr.ic = 1
28651 + adds r3=8,r2 // set up second base pointer
28652 + ;;
28653 +#else
28654 + mov out0=cr.ifa
28655 + mov out1=cr.isr
28656 + adds r3=8,r2 // set up second base pointer
28657 + ;;
28658 + ssm psr.ic | PSR_DEFAULT_BITS
28659 + ;;
28660 +	srlz.i					// guarantee that interruption collection is on
28661 + ;;
28662 +#endif
28663 +#ifdef CONFIG_XEN
28664 + br.cond.sptk.many xen_page_fault
28665 + ;;
28666 +done_xen_page_fault:
28667 +#endif
28668 +(p15) ssm psr.i // restore psr.i
28669 + movl r14=ia64_leave_kernel
28670 + ;;
28671 + SAVE_REST
28672 + mov rp=r14
28673 + ;;
28674 + adds out2=16,r12 // out2 = pointer to pt_regs
28675 + br.call.sptk.many b6=ia64_do_page_fault // ignore return address
28676 +END(page_fault)
28677 +
28678 + .org ia64_ivt+0x1c00
28679 +/////////////////////////////////////////////////////////////////////////////////////////
28680 +// 0x1c00 Entry 7 (size 64 bundles) Data Key Miss (12,51)
28681 +ENTRY(dkey_miss)
28682 + DBG_FAULT(7)
28683 + FAULT(7)
28684 +#ifdef CONFIG_XEN
28685 + // Leaving this code inline above results in an IVT section overflow
28686 + // There is no particular reason for this code to be here...
28687 +xen_page_fault:
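+	// If interrupts are to be restored (p15), re-enable event delivery by
+	// storing 0 to the event-mask byte whose address is kept at
+	// XSI_PSR_I_ADDR (0 here means "vpsr.i = 1", per the comments below);
+	// the ",-1" post-decrement leaves r3 pointing at the byte just before
+	// it, which (as in xen_event_callback) holds
+	// vcpu_info->evtchn_upcall_pending, so the next load can check for
+	// pending events before returning.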
28688 +(p15) movl r3=XSI_PSR_I_ADDR
28689 + ;;
28690 +(p15) ld8 r3=[r3]
28691 + ;;
28692 +(p15) st1 [r3]=r0,-1 // if (p15) vpsr.i = 1
28693 + mov r14=r0
28694 + ;;
28695 +(p15) ld1 r14=[r3] // if (pending_events)
28696 + adds r3=8,r2 // re-set up second base pointer
28697 + ;;
28698 +(p15) cmp.ne p15,p0=r14,r0
28699 + ;;
28700 + br.cond.sptk.many done_xen_page_fault
28701 + ;;
28702 +#endif
28703 +END(dkey_miss)
28704 +
28705 + .org ia64_ivt+0x2000
28706 +/////////////////////////////////////////////////////////////////////////////////////////
28707 +// 0x2000 Entry 8 (size 64 bundles) Dirty-bit (54)
28708 +ENTRY(dirty_bit)
28709 + DBG_FAULT(8)
28710 + /*
28711 + * What we do here is to simply turn on the dirty bit in the PTE. We need to
28712 + * update both the page-table and the TLB entry. To efficiently access the PTE,
28713 + * we address it through the virtual page table. Most likely, the TLB entry for
28714 + * the relevant virtual page table page is still present in the TLB so we can
28715 + * normally do this without additional TLB misses. In case the necessary virtual
28716 + * page table TLB entry isn't present, we take a nested TLB miss hit where we look
28717 + * up the physical address of the L3 PTE and then continue at label 1 below.
28718 + */
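+	/*
+	 * The SMP path below, as a rough C-style sketch (names such as
+	 * cmpxchg/itc_d/ptc_l are illustrative only):
+	 *
+	 *	old = *pte_p;
+	 *	if (old & _PAGE_P) {
+	 *		new = old | _PAGE_D | _PAGE_A;
+	 *		if (cmpxchg(pte_p, old, new) == old)
+	 *			itc_d(new);		// insert updated PTE
+	 *	}
+	 *	if (*pte_p != new)			// racing update?
+	 *		ptc_l(ifa, PAGE_SHIFT<<2);	// purge the stale entry
+	 */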
28719 +#ifdef CONFIG_XEN
28720 + movl r16=XSI_IFA
28721 + ;;
28722 + ld8 r16=[r16]
28723 + ;;
28724 +#else
28725 + mov r16=cr.ifa // get the address that caused the fault
28726 +#endif
28727 + movl r30=1f // load continuation point in case of nested fault
28728 + ;;
28729 +#ifdef CONFIG_XEN
28730 + mov r18=r8;
28731 + mov r8=r16;
28732 + XEN_HYPER_THASH;;
28733 + mov r17=r8;
28734 + mov r8=r18;;
28735 +#else
28736 + thash r17=r16 // compute virtual address of L3 PTE
28737 +#endif
28738 + mov r29=b0 // save b0 in case of nested fault
28739 + mov r31=pr // save pr
28740 +#ifdef CONFIG_SMP
28741 + mov r28=ar.ccv // save ar.ccv
28742 + ;;
28743 +1: ld8 r18=[r17]
28744 + ;; // avoid RAW on r18
28745 + mov ar.ccv=r18 // set compare value for cmpxchg
28746 + or r25=_PAGE_D|_PAGE_A,r18 // set the dirty and accessed bits
28747 + tbit.z p7,p6 = r18,_PAGE_P_BIT // Check present bit
28748 + ;;
28749 +(p6) cmpxchg8.acq r26=[r17],r25,ar.ccv // Only update if page is present
28750 + mov r24=PAGE_SHIFT<<2
28751 + ;;
28752 +(p6) cmp.eq p6,p7=r26,r18 // Only compare if page is present
28753 + ;;
28754 +#ifdef CONFIG_XEN
28755 +(p6) mov r18=r8
28756 +(p6) mov r8=r25
28757 + ;;
28758 +(p6) XEN_HYPER_ITC_D
28759 + ;;
28760 +(p6) mov r8=r18
28761 +#else
28762 +(p6) itc.d r25 // install updated PTE
28763 +#endif
28764 + ;;
28765 + /*
28766 +	 * Tell the assembler's dependency-violation checker that the above "itc" instructions
28767 + * cannot possibly affect the following loads:
28768 + */
28769 + dv_serialize_data
28770 +
28771 + ld8 r18=[r17] // read PTE again
28772 + ;;
28773 + cmp.eq p6,p7=r18,r25 // is it same as the newly installed
28774 + ;;
28775 +(p7) ptc.l r16,r24
28776 + mov b0=r29 // restore b0
28777 + mov ar.ccv=r28
28778 +#else
28779 + ;;
28780 +1: ld8 r18=[r17]
28781 + ;; // avoid RAW on r18
28782 + or r18=_PAGE_D|_PAGE_A,r18 // set the dirty and accessed bits
28783 + mov b0=r29 // restore b0
28784 + ;;
28785 + st8 [r17]=r18 // store back updated PTE
28786 + itc.d r18 // install updated PTE
28787 +#endif
28788 + mov pr=r31,-1 // restore pr
28789 +#ifdef CONFIG_XEN
28790 + XEN_HYPER_RFI
28791 + dv_serialize_data
28792 +#else
28793 + rfi
28794 +#endif
28795 +END(dirty_bit)
28796 +
28797 + .org ia64_ivt+0x2400
28798 +/////////////////////////////////////////////////////////////////////////////////////////
28799 +// 0x2400 Entry 9 (size 64 bundles) Instruction Access-bit (27)
28800 +ENTRY(iaccess_bit)
28801 + DBG_FAULT(9)
28802 + // Like Entry 8, except for instruction access
28803 +#ifdef CONFIG_XEN
28804 + movl r16=XSI_IFA
28805 + ;;
28806 + ld8 r16=[r16]
28807 + ;;
28808 +#else
28809 + mov r16=cr.ifa // get the address that caused the fault
28810 +#endif
28811 + movl r30=1f // load continuation point in case of nested fault
28812 + mov r31=pr // save predicates
28813 +#ifdef CONFIG_ITANIUM
28814 + /*
28815 + * Erratum 10 (IFA may contain incorrect address) has "NoFix" status.
28816 + */
28817 + mov r17=cr.ipsr
28818 + ;;
28819 + mov r18=cr.iip
28820 + tbit.z p6,p0=r17,IA64_PSR_IS_BIT // IA64 instruction set?
28821 + ;;
28822 +(p6) mov r16=r18 // if so, use cr.iip instead of cr.ifa
28823 +#endif /* CONFIG_ITANIUM */
28824 + ;;
28825 +#ifdef CONFIG_XEN
28826 + mov r18=r8;
28827 + mov r8=r16;
28828 + XEN_HYPER_THASH;;
28829 + mov r17=r8;
28830 + mov r8=r18;;
28831 +#else
28832 + thash r17=r16 // compute virtual address of L3 PTE
28833 +#endif
28834 +	mov r29=b0				// save b0 in case of nested fault
28835 +#ifdef CONFIG_SMP
28836 + mov r28=ar.ccv // save ar.ccv
28837 + ;;
28838 +1: ld8 r18=[r17]
28839 + ;;
28840 + mov ar.ccv=r18 // set compare value for cmpxchg
28841 + or r25=_PAGE_A,r18 // set the accessed bit
28842 + tbit.z p7,p6 = r18,_PAGE_P_BIT // Check present bit
28843 + ;;
28844 +(p6) cmpxchg8.acq r26=[r17],r25,ar.ccv // Only if page present
28845 + mov r24=PAGE_SHIFT<<2
28846 + ;;
28847 +(p6) cmp.eq p6,p7=r26,r18 // Only if page present
28848 + ;;
28849 +#ifdef CONFIG_XEN
28850 + mov r26=r8
28851 + mov r8=r25
28852 + ;;
28853 +(p6) XEN_HYPER_ITC_I
28854 + ;;
28855 + mov r8=r26
28856 + ;;
28857 +#else
28858 +(p6) itc.i r25 // install updated PTE
28859 +#endif
28860 + ;;
28861 + /*
28862 +	 * Tell the assembler's dependency-violation checker that the above "itc" instructions
28863 + * cannot possibly affect the following loads:
28864 + */
28865 + dv_serialize_data
28866 +
28867 + ld8 r18=[r17] // read PTE again
28868 + ;;
28869 + cmp.eq p6,p7=r18,r25 // is it same as the newly installed
28870 + ;;
28871 +(p7) ptc.l r16,r24
28872 + mov b0=r29 // restore b0
28873 + mov ar.ccv=r28
28874 +#else /* !CONFIG_SMP */
28875 + ;;
28876 +1: ld8 r18=[r17]
28877 + ;;
28878 + or r18=_PAGE_A,r18 // set the accessed bit
28879 + mov b0=r29 // restore b0
28880 + ;;
28881 + st8 [r17]=r18 // store back updated PTE
28882 + itc.i r18 // install updated PTE
28883 +#endif /* !CONFIG_SMP */
28884 + mov pr=r31,-1
28885 +#ifdef CONFIG_XEN
28886 + XEN_HYPER_RFI
28887 + dv_serialize_data
28888 +#else
28889 + rfi
28890 +#endif
28891 +END(iaccess_bit)
28892 +
28893 + .org ia64_ivt+0x2800
28894 +/////////////////////////////////////////////////////////////////////////////////////////
28895 +// 0x2800 Entry 10 (size 64 bundles) Data Access-bit (15,55)
28896 +ENTRY(daccess_bit)
28897 + DBG_FAULT(10)
28898 + // Like Entry 8, except for data access
28899 +#ifdef CONFIG_XEN
28900 + movl r16=XSI_IFA
28901 + ;;
28902 + ld8 r16=[r16]
28903 + ;;
28904 +#else
28905 + mov r16=cr.ifa // get the address that caused the fault
28906 +#endif
28907 + movl r30=1f // load continuation point in case of nested fault
28908 + ;;
28909 +#ifdef CONFIG_XEN
28910 + mov r18=r8
28911 + mov r8=r16
28912 + XEN_HYPER_THASH
28913 + ;;
28914 + mov r17=r8
28915 + mov r8=r18
28916 + ;;
28917 +#else
28918 + thash r17=r16 // compute virtual address of L3 PTE
28919 +#endif
28920 + mov r31=pr
28921 +	mov r29=b0				// save b0 in case of nested fault
28922 +#ifdef CONFIG_SMP
28923 + mov r28=ar.ccv // save ar.ccv
28924 + ;;
28925 +1: ld8 r18=[r17]
28926 + ;; // avoid RAW on r18
28927 + mov ar.ccv=r18 // set compare value for cmpxchg
28928 +	or r25=_PAGE_A,r18			// set the accessed bit
28929 + tbit.z p7,p6 = r18,_PAGE_P_BIT // Check present bit
28930 + ;;
28931 +(p6) cmpxchg8.acq r26=[r17],r25,ar.ccv // Only if page is present
28932 + mov r24=PAGE_SHIFT<<2
28933 + ;;
28934 +(p6) cmp.eq p6,p7=r26,r18 // Only if page is present
28935 + ;;
28936 +#ifdef CONFIG_XEN
28937 + mov r26=r8
28938 + mov r8=r25
28939 + ;;
28940 +(p6) XEN_HYPER_ITC_D
28941 + ;;
28942 + mov r8=r26
28943 + ;;
28944 +#else
28945 +(p6) itc.d r25 // install updated PTE
28946 +#endif
28947 + /*
28948 +	 * Tell the assembler's dependency-violation checker that the above "itc" instructions
28949 + * cannot possibly affect the following loads:
28950 + */
28951 + dv_serialize_data
28952 + ;;
28953 + ld8 r18=[r17] // read PTE again
28954 + ;;
28955 + cmp.eq p6,p7=r18,r25 // is it same as the newly installed
28956 + ;;
28957 +(p7) ptc.l r16,r24
28958 + mov ar.ccv=r28
28959 +#else
28960 + ;;
28961 +1: ld8 r18=[r17]
28962 + ;; // avoid RAW on r18
28963 + or r18=_PAGE_A,r18 // set the accessed bit
28964 + ;;
28965 + st8 [r17]=r18 // store back updated PTE
28966 + itc.d r18 // install updated PTE
28967 +#endif
28968 + mov b0=r29 // restore b0
28969 + mov pr=r31,-1
28970 +#ifdef CONFIG_XEN
28971 + XEN_HYPER_RFI
28972 + dv_serialize_data
28973 +#else
28974 + rfi
28975 +#endif
28976 +END(daccess_bit)
28977 +
28978 + .org ia64_ivt+0x2c00
28979 +/////////////////////////////////////////////////////////////////////////////////////////
28980 +// 0x2c00 Entry 11 (size 64 bundles) Break instruction (33)
28981 +ENTRY(break_fault)
28982 + /*
28983 + * The streamlined system call entry/exit paths only save/restore the initial part
28984 + * of pt_regs. This implies that the callers of system-calls must adhere to the
28985 + * normal procedure calling conventions.
28986 + *
28987 + * Registers to be saved & restored:
28988 + * CR registers: cr.ipsr, cr.iip, cr.ifs
28989 + * AR registers: ar.unat, ar.pfs, ar.rsc, ar.rnat, ar.bspstore, ar.fpsr
28990 + * others: pr, b0, b6, loadrs, r1, r11, r12, r13, r15
28991 + * Registers to be restored only:
28992 + * r8-r11: output value from the system call.
28993 + *
28994 + * During system call exit, scratch registers (including r15) are modified/cleared
28995 + * to prevent leaking bits from kernel to user level.
28996 + */
28997 + DBG_FAULT(11)
28998 + mov.m r16=IA64_KR(CURRENT) // M2 r16 <- current task (12 cyc)
28999 +#ifdef CONFIG_XEN
29000 + movl r22=XSI_IPSR
29001 + ;;
29002 + ld8 r29=[r22],XSI_IIM_OFS-XSI_IPSR_OFS // get ipsr, point to iip
29003 +#else
29004 + mov r29=cr.ipsr // M2 (12 cyc)
29005 +#endif
29006 + mov r31=pr // I0 (2 cyc)
29007 +
29008 +#ifdef CONFIG_XEN
29009 + ;;
29010 + ld8 r17=[r22],XSI_IIP_OFS-XSI_IIM_OFS
29011 +#else
29012 + mov r17=cr.iim // M2 (2 cyc)
29013 +#endif
29014 + mov.m r27=ar.rsc // M2 (12 cyc)
29015 + mov r18=__IA64_BREAK_SYSCALL // A
29016 +
29017 + mov.m ar.rsc=0 // M2
29018 + mov.m r21=ar.fpsr // M2 (12 cyc)
29019 + mov r19=b6 // I0 (2 cyc)
29020 + ;;
29021 + mov.m r23=ar.bspstore // M2 (12 cyc)
29022 + mov.m r24=ar.rnat // M2 (5 cyc)
29023 + mov.i r26=ar.pfs // I0 (2 cyc)
29024 +
29025 + invala // M0|1
29026 + nop.m 0 // M
29027 + mov r20=r1 // A save r1
29028 +
29029 + nop.m 0
29030 + movl r30=sys_call_table // X
29031 +
29032 +#ifdef CONFIG_XEN
29033 + ld8 r28=[r22]
29034 +#else
29035 + mov r28=cr.iip // M2 (2 cyc)
29036 +#endif
29037 + cmp.eq p0,p7=r18,r17 // I0 is this a system call?
29038 +(p7) br.cond.spnt non_syscall // B no ->
29039 + //
29040 + // From this point on, we are definitely on the syscall-path
29041 + // and we can use (non-banked) scratch registers.
29042 + //
29043 +///////////////////////////////////////////////////////////////////////
29044 + mov r1=r16 // A move task-pointer to "addl"-addressable reg
29045 + mov r2=r16 // A setup r2 for ia64_syscall_setup
29046 + add r9=TI_FLAGS+IA64_TASK_SIZE,r16 // A r9 = &current_thread_info()->flags
29047 +
29048 + adds r16=IA64_TASK_THREAD_ON_USTACK_OFFSET,r16
29049 + adds r15=-1024,r15 // A subtract 1024 from syscall number
29050 + mov r3=NR_syscalls - 1
29051 + ;;
29052 + ld1.bias r17=[r16] // M0|1 r17 = current->thread.on_ustack flag
29053 + ld4 r9=[r9] // M0|1 r9 = current_thread_info()->flags
29054 + extr.u r8=r29,41,2 // I0 extract ei field from cr.ipsr
29055 +
29056 + shladd r30=r15,3,r30 // A r30 = sys_call_table + 8*(syscall-1024)
29057 + addl r22=IA64_RBS_OFFSET,r1 // A compute base of RBS
29058 + cmp.leu p6,p7=r15,r3 // A syscall number in range?
29059 + ;;
29060 +
29061 + lfetch.fault.excl.nt1 [r22] // M0|1 prefetch RBS
29062 +(p6) ld8 r30=[r30] // M0|1 load address of syscall entry point
29063 + tnat.nz.or p7,p0=r15 // I0 is syscall nr a NaT?
29064 +
29065 + mov.m ar.bspstore=r22 // M2 switch to kernel RBS
29066 + cmp.eq p8,p9=2,r8 // A isr.ei==2?
29067 + ;;
29068 +
29069 +(p8) mov r8=0 // A clear ei to 0
29070 +(p7) movl r30=sys_ni_syscall // X
29071 +
29072 +(p8) adds r28=16,r28 // A switch cr.iip to next bundle
29073 +(p9) adds r8=1,r8 // A increment ei to next slot
29074 + nop.i 0
29075 + ;;
29076 +
29077 + mov.m r25=ar.unat // M2 (5 cyc)
29078 + dep r29=r8,r29,41,2 // I0 insert new ei into cr.ipsr
29079 + adds r15=1024,r15 // A restore original syscall number
29080 + //
29081 + // If any of the above loads miss in L1D, we'll stall here until
29082 + // the data arrives.
29083 + //
29084 +///////////////////////////////////////////////////////////////////////
29085 + st1 [r16]=r0 // M2|3 clear current->thread.on_ustack flag
29086 + mov b6=r30 // I0 setup syscall handler branch reg early
29087 + cmp.eq pKStk,pUStk=r0,r17 // A were we on kernel stacks already?
29088 +
29089 + and r9=_TIF_SYSCALL_TRACEAUDIT,r9 // A mask trace or audit
29090 + mov r18=ar.bsp // M2 (12 cyc)
29091 +(pKStk) br.cond.spnt .break_fixup // B we're already in kernel-mode -- fix up RBS
29092 + ;;
29093 +.back_from_break_fixup:
29094 +(pUStk) addl r1=IA64_STK_OFFSET-IA64_PT_REGS_SIZE,r1 // A compute base of memory stack
29095 + cmp.eq p14,p0=r9,r0 // A are syscalls being traced/audited?
29096 + br.call.sptk.many b7=ia64_syscall_setup // B
29097 +1:
29098 + mov ar.rsc=0x3 // M2 set eager mode, pl 0, LE, loadrs=0
29099 + nop 0
29100 +#ifdef CONFIG_XEN
29101 + mov r2=b0; br.call.sptk b0=xen_bsw1;; mov b0=r2;;
29102 +#else
29103 + bsw.1 // B (6 cyc) regs are saved, switch to bank 1
29104 +#endif
29105 + ;;
29106 +
29107 +#ifdef CONFIG_XEN
29108 + movl r16=XSI_PSR_IC
29109 + mov r3=1
29110 + ;;
29111 + st4 [r16]=r3,XSI_PSR_I_ADDR_OFS-XSI_PSR_IC_OFS // vpsr.ic = 1
29112 +#else
29113 + ssm psr.ic | PSR_DEFAULT_BITS // M2 now it's safe to re-enable intr.-collection
29114 +#endif
29115 + movl r3=ia64_ret_from_syscall // X
29116 + ;;
29117 +
29118 + srlz.i // M0 ensure interruption collection is on
29119 + mov rp=r3 // I0 set the real return addr
29120 +(p10) br.cond.spnt.many ia64_ret_from_syscall // B return if bad call-frame or r15 is a NaT
29121 +
29122 +#ifdef CONFIG_XEN
29123 +(p15) ld8 r16=[r16] // vpsr.i
29124 + ;;
29125 +(p15) st1 [r16]=r0,-1 // if (p15) vpsr.i = 1
29126 + mov r2=r0
29127 + ;;
29128 +(p15) ld1 r2=[r16] // if (pending_events)
29129 + ;;
29130 + cmp.ne p6,p0=r2,r0
29131 + ;;
29132 +(p6) ssm psr.i // do a real ssm psr.i
29133 +#else
29134 +(p15) ssm psr.i // M2 restore psr.i
29135 +#endif
29136 +(p14)	br.call.sptk.many b6=b6		// B    invoke syscall-handler (ignore return addr)
29137 + br.cond.spnt.many ia64_trace_syscall // B do syscall-tracing thingamagic
29138 + // NOT REACHED
29139 +///////////////////////////////////////////////////////////////////////
29140 + // On entry, we optimistically assumed that we're coming from user-space.
29141 + // For the rare cases where a system-call is done from within the kernel,
29142 + // we fix things up at this point:
29143 +.break_fixup:
29144 + add r1=-IA64_PT_REGS_SIZE,sp // A allocate space for pt_regs structure
29145 + mov ar.rnat=r24 // M2 restore kernel's AR.RNAT
29146 + ;;
29147 + mov ar.bspstore=r23 // M2 restore kernel's AR.BSPSTORE
29148 + br.cond.sptk .back_from_break_fixup
29149 +END(break_fault)
29150 +
29151 + .org ia64_ivt+0x3000
29152 +/////////////////////////////////////////////////////////////////////////////////////////
29153 +// 0x3000 Entry 12 (size 64 bundles) External Interrupt (4)
29154 +ENTRY(interrupt)
29155 + DBG_FAULT(12)
29156 + mov r31=pr // prepare to save predicates
29157 + ;;
29158 + SAVE_MIN_WITH_COVER // uses r31; defines r2 and r3
29159 +#ifdef CONFIG_XEN
29160 + movl r3=XSI_PSR_IC
29161 + mov r14=1
29162 + ;;
29163 + st4 [r3]=r14
29164 +#else
29165 + ssm psr.ic | PSR_DEFAULT_BITS
29166 +#endif
29167 + ;;
29168 + adds r3=8,r2 // set up second base pointer for SAVE_REST
29169 + srlz.i // ensure everybody knows psr.ic is back on
29170 + ;;
29171 + SAVE_REST
29172 + ;;
29173 + alloc r14=ar.pfs,0,0,2,0 // must be first in an insn group
29174 +#ifdef CONFIG_XEN
29175 + ;;
29176 + br.call.sptk.many rp=xen_get_ivr
29177 + ;;
29178 + mov out0=r8 // pass cr.ivr as first arg
29179 +#else
29180 + mov out0=cr.ivr // pass cr.ivr as first arg
29181 +#endif
29182 + add out1=16,sp // pass pointer to pt_regs as second arg
29183 + ;;
29184 + srlz.d // make sure we see the effect of cr.ivr
29185 + movl r14=ia64_leave_kernel
29186 + ;;
29187 + mov rp=r14
29188 + br.call.sptk.many b6=ia64_handle_irq
29189 +END(interrupt)
29190 +
29191 + .org ia64_ivt+0x3400
29192 +/////////////////////////////////////////////////////////////////////////////////////////
29193 +// 0x3400 Entry 13 (size 64 bundles) Reserved
29194 + DBG_FAULT(13)
29195 + FAULT(13)
29196 +
29197 + .org ia64_ivt+0x3800
29198 +/////////////////////////////////////////////////////////////////////////////////////////
29199 +// 0x3800 Entry 14 (size 64 bundles) Reserved
29200 + DBG_FAULT(14)
29201 + FAULT(14)
29202 +
29203 + /*
29204 + * There is no particular reason for this code to be here, other than that
29205 + * there happens to be space here that would go unused otherwise. If this
29206 +	 * fault ever gets "unreserved", simply move the following code to a more
29207 + * suitable spot...
29208 + *
29209 + * ia64_syscall_setup() is a separate subroutine so that it can
29210 + * allocate stacked registers so it can safely demine any
29211 + * potential NaT values from the input registers.
29212 + *
29213 + * On entry:
29214 + * - executing on bank 0 or bank 1 register set (doesn't matter)
29215 + * - r1: stack pointer
29216 + * - r2: current task pointer
29217 + * - r3: preserved
29218 + * - r11: original contents (saved ar.pfs to be saved)
29219 + * - r12: original contents (sp to be saved)
29220 + * - r13: original contents (tp to be saved)
29221 + * - r15: original contents (syscall # to be saved)
29222 + * - r18: saved bsp (after switching to kernel stack)
29223 + * - r19: saved b6
29224 + * - r20: saved r1 (gp)
29225 + * - r21: saved ar.fpsr
29226 + * - r22: kernel's register backing store base (krbs_base)
29227 + * - r23: saved ar.bspstore
29228 + * - r24: saved ar.rnat
29229 + * - r25: saved ar.unat
29230 + * - r26: saved ar.pfs
29231 + * - r27: saved ar.rsc
29232 + * - r28: saved cr.iip
29233 + * - r29: saved cr.ipsr
29234 + * - r31: saved pr
29235 + * - b0: original contents (to be saved)
29236 + * On exit:
29237 + * - p10: TRUE if syscall is invoked with more than 8 out
29238 + * registers or r15's Nat is true
29239 + * - r1: kernel's gp
29240 + * - r3: preserved (same as on entry)
29241 + * - r8: -EINVAL if p10 is true
29242 + * - r12: points to kernel stack
29243 + * - r13: points to current task
29244 + * - r14: preserved (same as on entry)
29245 + * - p13: preserved
29246 + * - p15: TRUE if interrupts need to be re-enabled
29247 + * - ar.fpsr: set to kernel settings
29248 + * - b6: preserved (same as on entry)
29249 + */
29250 +#ifndef CONFIG_XEN
29251 +GLOBAL_ENTRY(ia64_syscall_setup)
29252 +#if PT(B6) != 0
29253 +# error This code assumes that b6 is the first field in pt_regs.
29254 +#endif
29255 + st8 [r1]=r19 // save b6
29256 + add r16=PT(CR_IPSR),r1 // initialize first base pointer
29257 + add r17=PT(R11),r1 // initialize second base pointer
29258 + ;;
29259 + alloc r19=ar.pfs,8,0,0,0 // ensure in0-in7 are writable
29260 + st8 [r16]=r29,PT(AR_PFS)-PT(CR_IPSR) // save cr.ipsr
29261 + tnat.nz p8,p0=in0
29262 +
29263 + st8.spill [r17]=r11,PT(CR_IIP)-PT(R11) // save r11
29264 + tnat.nz p9,p0=in1
29265 +(pKStk) mov r18=r0 // make sure r18 isn't NaT
29266 + ;;
29267 +
29268 + st8 [r16]=r26,PT(CR_IFS)-PT(AR_PFS) // save ar.pfs
29269 + st8 [r17]=r28,PT(AR_UNAT)-PT(CR_IIP) // save cr.iip
29270 + mov r28=b0 // save b0 (2 cyc)
29271 + ;;
29272 +
29273 + st8 [r17]=r25,PT(AR_RSC)-PT(AR_UNAT) // save ar.unat
29274 + dep r19=0,r19,38,26 // clear all bits but 0..37 [I0]
29275 +(p8) mov in0=-1
29276 + ;;
29277 +
29278 + st8 [r16]=r19,PT(AR_RNAT)-PT(CR_IFS) // store ar.pfs.pfm in cr.ifs
29279 + extr.u r11=r19,7,7 // I0 // get sol of ar.pfs
29280 + and r8=0x7f,r19 // A // get sof of ar.pfs
29281 +
29282 + st8 [r17]=r27,PT(AR_BSPSTORE)-PT(AR_RSC)// save ar.rsc
29283 + tbit.nz p15,p0=r29,IA64_PSR_I_BIT // I0
29284 +(p9) mov in1=-1
29285 + ;;
29286 +
29287 +(pUStk) sub r18=r18,r22 // r18=RSE.ndirty*8
29288 + tnat.nz p10,p0=in2
29289 + add r11=8,r11
29290 + ;;
29291 +(pKStk) adds r16=PT(PR)-PT(AR_RNAT),r16 // skip over ar_rnat field
29292 +(pKStk) adds r17=PT(B0)-PT(AR_BSPSTORE),r17 // skip over ar_bspstore field
29293 + tnat.nz p11,p0=in3
29294 + ;;
29295 +(p10) mov in2=-1
29296 + tnat.nz p12,p0=in4 // [I0]
29297 +(p11) mov in3=-1
29298 + ;;
29299 +(pUStk) st8 [r16]=r24,PT(PR)-PT(AR_RNAT) // save ar.rnat
29300 +(pUStk) st8 [r17]=r23,PT(B0)-PT(AR_BSPSTORE) // save ar.bspstore
29301 + shl r18=r18,16 // compute ar.rsc to be used for "loadrs"
29302 + ;;
29303 + st8 [r16]=r31,PT(LOADRS)-PT(PR) // save predicates
29304 + st8 [r17]=r28,PT(R1)-PT(B0) // save b0
29305 + tnat.nz p13,p0=in5 // [I0]
29306 + ;;
29307 + st8 [r16]=r18,PT(R12)-PT(LOADRS) // save ar.rsc value for "loadrs"
29308 + st8.spill [r17]=r20,PT(R13)-PT(R1) // save original r1
29309 +(p12) mov in4=-1
29310 + ;;
29311 +
29312 +.mem.offset 0,0; st8.spill [r16]=r12,PT(AR_FPSR)-PT(R12) // save r12
29313 +.mem.offset 8,0; st8.spill [r17]=r13,PT(R15)-PT(R13) // save r13
29314 +(p13) mov in5=-1
29315 + ;;
29316 + st8 [r16]=r21,PT(R8)-PT(AR_FPSR) // save ar.fpsr
29317 + tnat.nz p13,p0=in6
29318 + cmp.lt p10,p9=r11,r8 // frame size can't be more than local+8
29319 + ;;
29320 + mov r8=1
29321 +(p9) tnat.nz p10,p0=r15
29322 + adds r12=-16,r1 // switch to kernel memory stack (with 16 bytes of scratch)
29323 +
29324 + st8.spill [r17]=r15 // save r15
29325 + tnat.nz p8,p0=in7
29326 + nop.i 0
29327 +
29328 + mov r13=r2 // establish `current'
29329 + movl r1=__gp // establish kernel global pointer
29330 + ;;
29331 + st8 [r16]=r8 // ensure pt_regs.r8 != 0 (see handle_syscall_error)
29332 +(p13) mov in6=-1
29333 +(p8) mov in7=-1
29334 +
29335 + cmp.eq pSys,pNonSys=r0,r0 // set pSys=1, pNonSys=0
29336 + movl r17=FPSR_DEFAULT
29337 + ;;
29338 + mov.m ar.fpsr=r17 // set ar.fpsr to kernel default value
29339 +(p10) mov r8=-EINVAL
29340 + br.ret.sptk.many b7
29341 +END(ia64_syscall_setup)
29342 +#endif
29343 +
29344 + .org ia64_ivt+0x3c00
29345 +/////////////////////////////////////////////////////////////////////////////////////////
29346 +// 0x3c00 Entry 15 (size 64 bundles) Reserved
29347 + DBG_FAULT(15)
29348 + FAULT(15)
29349 +
29350 + /*
29351 + * Squatting in this space ...
29352 + *
29353 + * This special case dispatcher for illegal operation faults allows preserved
29354 + * registers to be modified through a callback function (asm only) that is handed
29355 + * back from the fault handler in r8. Up to three arguments can be passed to the
29356 + * callback function by returning an aggregate with the callback as its first
29357 + * element, followed by the arguments.
29358 + */
29359 +ENTRY(dispatch_illegal_op_fault)
29360 + .prologue
29361 + .body
29362 + SAVE_MIN_WITH_COVER
29363 + ssm psr.ic | PSR_DEFAULT_BITS
29364 + ;;
29365 + srlz.i // guarantee that interruption collection is on
29366 + ;;
29367 +(p15) ssm psr.i // restore psr.i
29368 + adds r3=8,r2 // set up second base pointer for SAVE_REST
29369 + ;;
29370 + alloc r14=ar.pfs,0,0,1,0 // must be first in insn group
29371 + mov out0=ar.ec
29372 + ;;
29373 + SAVE_REST
29374 + PT_REGS_UNWIND_INFO(0)
29375 + ;;
29376 + br.call.sptk.many rp=ia64_illegal_op_fault
29377 +.ret0: ;;
29378 + alloc r14=ar.pfs,0,0,3,0 // must be first in insn group
29379 + mov out0=r9
29380 + mov out1=r10
29381 + mov out2=r11
29382 + movl r15=ia64_leave_kernel
29383 + ;;
29384 + mov rp=r15
29385 + mov b6=r8
29386 + ;;
29387 + cmp.ne p6,p0=0,r8
29388 +(p6) br.call.dpnt.many b6=b6 // call returns to ia64_leave_kernel
29389 + br.sptk.many ia64_leave_kernel
29390 +END(dispatch_illegal_op_fault)
29391 +
29392 + .org ia64_ivt+0x4000
29393 +/////////////////////////////////////////////////////////////////////////////////////////
29394 +// 0x4000 Entry 16 (size 64 bundles) Reserved
29395 + DBG_FAULT(16)
29396 + FAULT(16)
29397 +
29398 + .org ia64_ivt+0x4400
29399 +/////////////////////////////////////////////////////////////////////////////////////////
29400 +// 0x4400 Entry 17 (size 64 bundles) Reserved
29401 + DBG_FAULT(17)
29402 + FAULT(17)
29403 +
29404 +ENTRY(non_syscall)
29405 + mov ar.rsc=r27 // restore ar.rsc before SAVE_MIN_WITH_COVER
29406 + ;;
29407 + SAVE_MIN_WITH_COVER
29408 +	// fault ever gets "unreserved", simply move the following code to a more
29409 + // There is no particular reason for this code to be here, other than that
29410 + // there happens to be space here that would go unused otherwise. If this
29411 + // fault ever gets "unreserved", simply moved the following code to a more
29412 + // suitable spot...
29413 +
29414 + alloc r14=ar.pfs,0,0,2,0
29415 + mov out0=cr.iim
29416 + add out1=16,sp
29417 + adds r3=8,r2 // set up second base pointer for SAVE_REST
29418 +
29419 + ssm psr.ic | PSR_DEFAULT_BITS
29420 + ;;
29421 + srlz.i // guarantee that interruption collection is on
29422 + ;;
29423 +(p15) ssm psr.i // restore psr.i
29424 + movl r15=ia64_leave_kernel
29425 + ;;
29426 + SAVE_REST
29427 + mov rp=r15
29428 + ;;
29429 + br.call.sptk.many b6=ia64_bad_break // avoid WAW on CFM and ignore return addr
29430 +END(non_syscall)
29431 +
29432 + .org ia64_ivt+0x4800
29433 +/////////////////////////////////////////////////////////////////////////////////////////
29434 +// 0x4800 Entry 18 (size 64 bundles) Reserved
29435 + DBG_FAULT(18)
29436 + FAULT(18)
29437 +
29438 + /*
29439 + * There is no particular reason for this code to be here, other than that
29440 + * there happens to be space here that would go unused otherwise. If this
29441 +	 * fault ever gets "unreserved", simply move the following code to a more
29442 + * suitable spot...
29443 + */
29444 +
29445 +ENTRY(dispatch_unaligned_handler)
29446 + SAVE_MIN_WITH_COVER
29447 + ;;
29448 + alloc r14=ar.pfs,0,0,2,0 // now it's safe (must be first in insn group!)
29449 + mov out0=cr.ifa
29450 + adds out1=16,sp
29451 +
29452 + ssm psr.ic | PSR_DEFAULT_BITS
29453 + ;;
29454 + srlz.i // guarantee that interruption collection is on
29455 + ;;
29456 +(p15) ssm psr.i // restore psr.i
29457 + adds r3=8,r2 // set up second base pointer
29458 + ;;
29459 + SAVE_REST
29460 + movl r14=ia64_leave_kernel
29461 + ;;
29462 + mov rp=r14
29463 + br.sptk.many ia64_prepare_handle_unaligned
29464 +END(dispatch_unaligned_handler)
29465 +
29466 + .org ia64_ivt+0x4c00
29467 +/////////////////////////////////////////////////////////////////////////////////////////
29468 +// 0x4c00 Entry 19 (size 64 bundles) Reserved
29469 + DBG_FAULT(19)
29470 + FAULT(19)
29471 +
29472 + /*
29473 + * There is no particular reason for this code to be here, other than that
29474 + * there happens to be space here that would go unused otherwise. If this
29475 +	 * fault ever gets "unreserved", simply move the following code to a more
29476 + * suitable spot...
29477 + */
29478 +
29479 +ENTRY(dispatch_to_fault_handler)
29480 + /*
29481 + * Input:
29482 + * psr.ic: off
29483 + * r19: fault vector number (e.g., 24 for General Exception)
29484 + * r31: contains saved predicates (pr)
29485 + */
29486 + SAVE_MIN_WITH_COVER_R19
29487 + alloc r14=ar.pfs,0,0,5,0
29488 + mov out0=r15
29489 +#ifdef CONFIG_XEN
29490 + movl out1=XSI_ISR
29491 + ;;
29492 + adds out2=XSI_IFA-XSI_ISR,out1
29493 + adds out3=XSI_IIM-XSI_ISR,out1
29494 + adds out4=XSI_ITIR-XSI_ISR,out1
29495 + ;;
29496 + ld8 out1=[out1]
29497 + ld8 out2=[out2]
29498 +	ld8 out3=[out3]
29499 + ld8 out4=[out4]
29500 + ;;
29501 +#else
29502 + mov out1=cr.isr
29503 + mov out2=cr.ifa
29504 + mov out3=cr.iim
29505 + mov out4=cr.itir
29506 + ;;
29507 +#endif
29508 + ssm psr.ic | PSR_DEFAULT_BITS
29509 + ;;
29510 + srlz.i // guarantee that interruption collection is on
29511 + ;;
29512 +(p15) ssm psr.i // restore psr.i
29513 + adds r3=8,r2 // set up second base pointer for SAVE_REST
29514 + ;;
29515 + SAVE_REST
29516 + movl r14=ia64_leave_kernel
29517 + ;;
29518 + mov rp=r14
29519 + br.call.sptk.many b6=ia64_fault
29520 +END(dispatch_to_fault_handler)
29521 +
29522 +//
29523 +// --- End of long entries, Beginning of short entries
29524 +//
29525 +
29526 + .org ia64_ivt+0x5000
29527 +/////////////////////////////////////////////////////////////////////////////////////////
29528 +// 0x5000 Entry 20 (size 16 bundles) Page Not Present (10,22,49)
29529 +ENTRY(page_not_present)
29530 + DBG_FAULT(20)
29531 + mov r16=cr.ifa
29532 + rsm psr.dt
29533 + /*
29534 + * The Linux page fault handler doesn't expect non-present pages to be in
29535 + * the TLB. Flush the existing entry now, so we meet that expectation.
29536 + */
29537 + mov r17=PAGE_SHIFT<<2
29538 + ;;
29539 + ptc.l r16,r17
29540 + ;;
29541 + mov r31=pr
29542 + srlz.d
29543 + br.sptk.many page_fault
29544 +END(page_not_present)
29545 +
29546 + .org ia64_ivt+0x5100
29547 +/////////////////////////////////////////////////////////////////////////////////////////
29548 +// 0x5100 Entry 21 (size 16 bundles) Key Permission (13,25,52)
29549 +ENTRY(key_permission)
29550 + DBG_FAULT(21)
29551 + mov r16=cr.ifa
29552 + rsm psr.dt
29553 + mov r31=pr
29554 + ;;
29555 + srlz.d
29556 + br.sptk.many page_fault
29557 +END(key_permission)
29558 +
29559 + .org ia64_ivt+0x5200
29560 +/////////////////////////////////////////////////////////////////////////////////////////
29561 +// 0x5200 Entry 22 (size 16 bundles) Instruction Access Rights (26)
29562 +ENTRY(iaccess_rights)
29563 + DBG_FAULT(22)
29564 + mov r16=cr.ifa
29565 + rsm psr.dt
29566 + mov r31=pr
29567 + ;;
29568 + srlz.d
29569 + br.sptk.many page_fault
29570 +END(iaccess_rights)
29571 +
29572 + .org ia64_ivt+0x5300
29573 +/////////////////////////////////////////////////////////////////////////////////////////
29574 +// 0x5300 Entry 23 (size 16 bundles) Data Access Rights (14,53)
29575 +ENTRY(daccess_rights)
29576 + DBG_FAULT(23)
29577 +#ifdef CONFIG_XEN
29578 + movl r16=XSI_IFA
29579 + ;;
29580 + ld8 r16=[r16]
29581 + ;;
29582 + XEN_HYPER_RSM_PSR_DT
29583 +#else
29584 + mov r16=cr.ifa
29585 + rsm psr.dt
29586 +#endif
29587 + mov r31=pr
29588 + ;;
29589 + srlz.d
29590 + br.sptk.many page_fault
29591 +END(daccess_rights)
29592 +
29593 + .org ia64_ivt+0x5400
29594 +/////////////////////////////////////////////////////////////////////////////////////////
29595 +// 0x5400 Entry 24 (size 16 bundles) General Exception (5,32,34,36,38,39)
29596 +ENTRY(general_exception)
29597 + DBG_FAULT(24)
29598 + mov r16=cr.isr
29599 + mov r31=pr
29600 + ;;
29601 + cmp4.eq p6,p0=0,r16
29602 +(p6) br.sptk.many dispatch_illegal_op_fault
29603 + ;;
29604 + mov r19=24 // fault number
29605 + br.sptk.many dispatch_to_fault_handler
29606 +END(general_exception)
29607 +
29608 + .org ia64_ivt+0x5500
29609 +/////////////////////////////////////////////////////////////////////////////////////////
29610 +// 0x5500 Entry 25 (size 16 bundles) Disabled FP-Register (35)
29611 +ENTRY(disabled_fp_reg)
29612 + DBG_FAULT(25)
29613 + rsm psr.dfh // ensure we can access fph
29614 + ;;
29615 + srlz.d
29616 + mov r31=pr
29617 + mov r19=25
29618 + br.sptk.many dispatch_to_fault_handler
29619 +END(disabled_fp_reg)
29620 +
29621 + .org ia64_ivt+0x5600
29622 +/////////////////////////////////////////////////////////////////////////////////////////
29623 +// 0x5600 Entry 26 (size 16 bundles) Nat Consumption (11,23,37,50)
29624 +ENTRY(nat_consumption)
29625 + DBG_FAULT(26)
29626 +
29627 + mov r16=cr.ipsr
29628 + mov r17=cr.isr
29629 + mov r31=pr // save PR
29630 + ;;
29631 +	and r18=0xf,r17				// r18 = cr.isr.code{3:0}
29632 + tbit.z p6,p0=r17,IA64_ISR_NA_BIT
29633 + ;;
29634 + cmp.ne.or p6,p0=IA64_ISR_CODE_LFETCH,r18
29635 + dep r16=-1,r16,IA64_PSR_ED_BIT,1
29636 +(p6)	br.cond.spnt 1f		// branch if (cr.isr.na == 0 || cr.isr.code{3:0} != LFETCH)
29637 + ;;
29638 +	mov cr.ipsr=r16		// set cr.ipsr.ed
29639 + mov pr=r31,-1
29640 + ;;
29641 + rfi
29642 +
29643 +1: mov pr=r31,-1
29644 + ;;
29645 + FAULT(26)
29646 +END(nat_consumption)
29647 +
29648 + .org ia64_ivt+0x5700
29649 +/////////////////////////////////////////////////////////////////////////////////////////
29650 +// 0x5700 Entry 27 (size 16 bundles) Speculation (40)
29651 +ENTRY(speculation_vector)
29652 + DBG_FAULT(27)
29653 + /*
29654 + * A [f]chk.[as] instruction needs to take the branch to the recovery code but
29655 + * this part of the architecture is not implemented in hardware on some CPUs, such
29656 + * as Itanium. Thus, in general we need to emulate the behavior. IIM contains
29657 + * the relative target (not yet sign extended). So after sign extending it we
29658 + * simply add it to IIP. We also need to reset the EI field of the IPSR to zero,
29659 + * i.e., the slot to restart into.
29660 + *
29661 +	 * cr.iim contains zero_ext(imm21)
29662 + */
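+	/*
+	 * Worked example of the arithmetic below: imm21 counts 16-byte
+	 * bundles, so shifting left by 43 (= 64-21) puts the sign bit of
+	 * imm21 at bit 63, and the arithmetic shift right by 39 (= 43-4)
+	 * both sign-extends it and keeps a factor of 16, i.e.
+	 * r18 = sign_extend(imm21) * 16 bytes, which is then added to iip.
+	 * E.g. imm21 = 0x1ffffe (-2) yields r18 = -32, two bundles back.
+	 */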
29663 + mov r18=cr.iim
29664 + ;;
29665 + mov r17=cr.iip
29666 + shl r18=r18,43 // put sign bit in position (43=64-21)
29667 + ;;
29668 +
29669 + mov r16=cr.ipsr
29670 + shr r18=r18,39 // sign extend (39=43-4)
29671 + ;;
29672 +
29673 + add r17=r17,r18 // now add the offset
29674 + ;;
29675 + mov cr.iip=r17
29676 + dep r16=0,r16,41,2 // clear EI
29677 + ;;
29678 +
29679 + mov cr.ipsr=r16
29680 + ;;
29681 +
29682 +#ifdef CONFIG_XEN
29683 + XEN_HYPER_RFI;
29684 +#else
29685 + rfi // and go back
29686 +#endif
29687 +END(speculation_vector)
29688 +
29689 + .org ia64_ivt+0x5800
29690 +/////////////////////////////////////////////////////////////////////////////////////////
29691 +// 0x5800 Entry 28 (size 16 bundles) Reserved
29692 + DBG_FAULT(28)
29693 + FAULT(28)
29694 +
29695 + .org ia64_ivt+0x5900
29696 +/////////////////////////////////////////////////////////////////////////////////////////
29697 +// 0x5900 Entry 29 (size 16 bundles) Debug (16,28,56)
29698 +ENTRY(debug_vector)
29699 + DBG_FAULT(29)
29700 + FAULT(29)
29701 +END(debug_vector)
29702 +
29703 + .org ia64_ivt+0x5a00
29704 +/////////////////////////////////////////////////////////////////////////////////////////
29705 +// 0x5a00 Entry 30 (size 16 bundles) Unaligned Reference (57)
29706 +ENTRY(unaligned_access)
29707 + DBG_FAULT(30)
29708 + mov r31=pr // prepare to save predicates
29709 + ;;
29710 + br.sptk.many dispatch_unaligned_handler
29711 +END(unaligned_access)
29712 +
29713 + .org ia64_ivt+0x5b00
29714 +/////////////////////////////////////////////////////////////////////////////////////////
29715 +// 0x5b00 Entry 31 (size 16 bundles) Unsupported Data Reference (57)
29716 +ENTRY(unsupported_data_reference)
29717 + DBG_FAULT(31)
29718 + FAULT(31)
29719 +END(unsupported_data_reference)
29720 +
29721 + .org ia64_ivt+0x5c00
29722 +/////////////////////////////////////////////////////////////////////////////////////////
29723 +// 0x5c00 Entry 32 (size 16 bundles) Floating-Point Fault (64)
29724 +ENTRY(floating_point_fault)
29725 + DBG_FAULT(32)
29726 + FAULT(32)
29727 +END(floating_point_fault)
29728 +
29729 + .org ia64_ivt+0x5d00
29730 +/////////////////////////////////////////////////////////////////////////////////////////
29731 +// 0x5d00 Entry 33 (size 16 bundles) Floating Point Trap (66)
29732 +ENTRY(floating_point_trap)
29733 + DBG_FAULT(33)
29734 + FAULT(33)
29735 +END(floating_point_trap)
29736 +
29737 + .org ia64_ivt+0x5e00
29738 +/////////////////////////////////////////////////////////////////////////////////////////
29739 +// 0x5e00 Entry 34 (size 16 bundles) Lower Privilege Transfer Trap (66)
29740 +ENTRY(lower_privilege_trap)
29741 + DBG_FAULT(34)
29742 + FAULT(34)
29743 +END(lower_privilege_trap)
29744 +
29745 + .org ia64_ivt+0x5f00
29746 +/////////////////////////////////////////////////////////////////////////////////////////
29747 +// 0x5f00 Entry 35 (size 16 bundles) Taken Branch Trap (68)
29748 +ENTRY(taken_branch_trap)
29749 + DBG_FAULT(35)
29750 + FAULT(35)
29751 +END(taken_branch_trap)
29752 +
29753 + .org ia64_ivt+0x6000
29754 +/////////////////////////////////////////////////////////////////////////////////////////
29755 +// 0x6000 Entry 36 (size 16 bundles) Single Step Trap (69)
29756 +ENTRY(single_step_trap)
29757 + DBG_FAULT(36)
29758 + FAULT(36)
29759 +END(single_step_trap)
29760 +
29761 + .org ia64_ivt+0x6100
29762 +/////////////////////////////////////////////////////////////////////////////////////////
29763 +// 0x6100 Entry 37 (size 16 bundles) Reserved
29764 + DBG_FAULT(37)
29765 + FAULT(37)
29766 +
29767 + .org ia64_ivt+0x6200
29768 +/////////////////////////////////////////////////////////////////////////////////////////
29769 +// 0x6200 Entry 38 (size 16 bundles) Reserved
29770 + DBG_FAULT(38)
29771 + FAULT(38)
29772 +
29773 + .org ia64_ivt+0x6300
29774 +/////////////////////////////////////////////////////////////////////////////////////////
29775 +// 0x6300 Entry 39 (size 16 bundles) Reserved
29776 + DBG_FAULT(39)
29777 + FAULT(39)
29778 +
29779 + .org ia64_ivt+0x6400
29780 +/////////////////////////////////////////////////////////////////////////////////////////
29781 +// 0x6400 Entry 40 (size 16 bundles) Reserved
29782 + DBG_FAULT(40)
29783 + FAULT(40)
29784 +
29785 + .org ia64_ivt+0x6500
29786 +/////////////////////////////////////////////////////////////////////////////////////////
29787 +// 0x6500 Entry 41 (size 16 bundles) Reserved
29788 + DBG_FAULT(41)
29789 + FAULT(41)
29790 +
29791 + .org ia64_ivt+0x6600
29792 +/////////////////////////////////////////////////////////////////////////////////////////
29793 +// 0x6600 Entry 42 (size 16 bundles) Reserved
29794 + DBG_FAULT(42)
29795 + FAULT(42)
29796 +
29797 + .org ia64_ivt+0x6700
29798 +/////////////////////////////////////////////////////////////////////////////////////////
29799 +// 0x6700 Entry 43 (size 16 bundles) Reserved
29800 + DBG_FAULT(43)
29801 + FAULT(43)
29802 +
29803 + .org ia64_ivt+0x6800
29804 +/////////////////////////////////////////////////////////////////////////////////////////
29805 +// 0x6800 Entry 44 (size 16 bundles) Reserved
29806 + DBG_FAULT(44)
29807 + FAULT(44)
29808 +
29809 + .org ia64_ivt+0x6900
29810 +/////////////////////////////////////////////////////////////////////////////////////////
29811 +// 0x6900 Entry 45 (size 16 bundles) IA-32 Exception  (17,18,29,41,42,43,44,58,60,61,62,72,73,75,76,77)
29812 +ENTRY(ia32_exception)
29813 + DBG_FAULT(45)
29814 + FAULT(45)
29815 +END(ia32_exception)
29816 +
29817 + .org ia64_ivt+0x6a00
29818 +/////////////////////////////////////////////////////////////////////////////////////////
29819 +// 0x6a00 Entry 46 (size 16 bundles) IA-32 Intercept (30,31,59,70,71)
29820 +ENTRY(ia32_intercept)
29821 + DBG_FAULT(46)
29822 +#ifdef CONFIG_IA32_SUPPORT
29823 + mov r31=pr
29824 + mov r16=cr.isr
29825 + ;;
29826 + extr.u r17=r16,16,8 // get ISR.code
29827 + mov r18=ar.eflag
29828 + mov r19=cr.iim // old eflag value
29829 + ;;
29830 + cmp.ne p6,p0=2,r17
29831 +(p6) br.cond.spnt 1f // not a system flag fault
29832 + xor r16=r18,r19
29833 + ;;
29834 + extr.u r17=r16,18,1 // get the eflags.ac bit
29835 + ;;
29836 + cmp.eq p6,p0=0,r17
29837 +(p6) br.cond.spnt 1f // eflags.ac bit didn't change
29838 + ;;
29839 + mov pr=r31,-1 // restore predicate registers
29840 +#ifdef CONFIG_XEN
29841 + XEN_HYPER_RFI;
29842 +#else
29843 + rfi
29844 +#endif
29845 +
29846 +1:
29847 +#endif // CONFIG_IA32_SUPPORT
29848 + FAULT(46)
29849 +END(ia32_intercept)
29850 +
29851 + .org ia64_ivt+0x6b00
29852 +/////////////////////////////////////////////////////////////////////////////////////////
29853 +// 0x6b00 Entry 47 (size 16 bundles) IA-32 Interrupt (74)
29854 +ENTRY(ia32_interrupt)
29855 + DBG_FAULT(47)
29856 +#ifdef CONFIG_IA32_SUPPORT
29857 + mov r31=pr
29858 + br.sptk.many dispatch_to_ia32_handler
29859 +#else
29860 + FAULT(47)
29861 +#endif
29862 +END(ia32_interrupt)
29863 +
29864 + .org ia64_ivt+0x6c00
29865 +/////////////////////////////////////////////////////////////////////////////////////////
29866 +// 0x6c00 Entry 48 (size 16 bundles) Reserved
29867 + DBG_FAULT(48)
29868 + FAULT(48)
29869 +
29870 + .org ia64_ivt+0x6d00
29871 +/////////////////////////////////////////////////////////////////////////////////////////
29872 +// 0x6d00 Entry 49 (size 16 bundles) Reserved
29873 + DBG_FAULT(49)
29874 + FAULT(49)
29875 +
29876 + .org ia64_ivt+0x6e00
29877 +/////////////////////////////////////////////////////////////////////////////////////////
29878 +// 0x6e00 Entry 50 (size 16 bundles) Reserved
29879 + DBG_FAULT(50)
29880 + FAULT(50)
29881 +
29882 + .org ia64_ivt+0x6f00
29883 +/////////////////////////////////////////////////////////////////////////////////////////
29884 +// 0x6f00 Entry 51 (size 16 bundles) Reserved
29885 + DBG_FAULT(51)
29886 + FAULT(51)
29887 +
29888 + .org ia64_ivt+0x7000
29889 +/////////////////////////////////////////////////////////////////////////////////////////
29890 +// 0x7000 Entry 52 (size 16 bundles) Reserved
29891 + DBG_FAULT(52)
29892 + FAULT(52)
29893 +
29894 + .org ia64_ivt+0x7100
29895 +/////////////////////////////////////////////////////////////////////////////////////////
29896 +// 0x7100 Entry 53 (size 16 bundles) Reserved
29897 + DBG_FAULT(53)
29898 + FAULT(53)
29899 +
29900 + .org ia64_ivt+0x7200
29901 +/////////////////////////////////////////////////////////////////////////////////////////
29902 +// 0x7200 Entry 54 (size 16 bundles) Reserved
29903 + DBG_FAULT(54)
29904 + FAULT(54)
29905 +
29906 + .org ia64_ivt+0x7300
29907 +/////////////////////////////////////////////////////////////////////////////////////////
29908 +// 0x7300 Entry 55 (size 16 bundles) Reserved
29909 + DBG_FAULT(55)
29910 + FAULT(55)
29911 +
29912 + .org ia64_ivt+0x7400
29913 +/////////////////////////////////////////////////////////////////////////////////////////
29914 +// 0x7400 Entry 56 (size 16 bundles) Reserved
29915 + DBG_FAULT(56)
29916 + FAULT(56)
29917 +
29918 + .org ia64_ivt+0x7500
29919 +/////////////////////////////////////////////////////////////////////////////////////////
29920 +// 0x7500 Entry 57 (size 16 bundles) Reserved
29921 + DBG_FAULT(57)
29922 + FAULT(57)
29923 +
29924 + .org ia64_ivt+0x7600
29925 +/////////////////////////////////////////////////////////////////////////////////////////
29926 +// 0x7600 Entry 58 (size 16 bundles) Reserved
29927 + DBG_FAULT(58)
29928 + FAULT(58)
29929 +
29930 + .org ia64_ivt+0x7700
29931 +/////////////////////////////////////////////////////////////////////////////////////////
29932 +// 0x7700 Entry 59 (size 16 bundles) Reserved
29933 + DBG_FAULT(59)
29934 + FAULT(59)
29935 +
29936 + .org ia64_ivt+0x7800
29937 +/////////////////////////////////////////////////////////////////////////////////////////
29938 +// 0x7800 Entry 60 (size 16 bundles) Reserved
29939 + DBG_FAULT(60)
29940 + FAULT(60)
29941 +
29942 + .org ia64_ivt+0x7900
29943 +/////////////////////////////////////////////////////////////////////////////////////////
29944 +// 0x7900 Entry 61 (size 16 bundles) Reserved
29945 + DBG_FAULT(61)
29946 + FAULT(61)
29947 +
29948 + .org ia64_ivt+0x7a00
29949 +/////////////////////////////////////////////////////////////////////////////////////////
29950 +// 0x7a00 Entry 62 (size 16 bundles) Reserved
29951 + DBG_FAULT(62)
29952 + FAULT(62)
29953 +
29954 + .org ia64_ivt+0x7b00
29955 +/////////////////////////////////////////////////////////////////////////////////////////
29956 +// 0x7b00 Entry 63 (size 16 bundles) Reserved
29957 + DBG_FAULT(63)
29958 + FAULT(63)
29959 +
29960 + .org ia64_ivt+0x7c00
29961 +/////////////////////////////////////////////////////////////////////////////////////////
29962 +// 0x7c00 Entry 64 (size 16 bundles) Reserved
29963 + DBG_FAULT(64)
29964 + FAULT(64)
29965 +
29966 + .org ia64_ivt+0x7d00
29967 +/////////////////////////////////////////////////////////////////////////////////////////
29968 +// 0x7d00 Entry 65 (size 16 bundles) Reserved
29969 + DBG_FAULT(65)
29970 + FAULT(65)
29971 +
29972 + .org ia64_ivt+0x7e00
29973 +/////////////////////////////////////////////////////////////////////////////////////////
29974 +// 0x7e00 Entry 66 (size 16 bundles) Reserved
29975 + DBG_FAULT(66)
29976 + FAULT(66)
29977 +
29978 +#ifdef CONFIG_XEN
29979 + /*
29980 + * There is no particular reason for this code to be here, other than that
29981 + * there happens to be space here that would go unused otherwise. If this
29982 +	 * fault ever gets "unreserved", simply move the following code to a more
29983 + * suitable spot...
29984 + */
29985 +
29986 +GLOBAL_ENTRY(xen_bsw1)
29987 + /* FIXME: THIS CODE IS NOT NaT SAFE! */
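+	/*
+	 * Paravirtual stand-in for bsw.1 (see the call site in break_fault):
+	 * record in XSI_BANKNUM that register bank 1 is now in use, then
+	 * reload r16-r31 from the bank-1 save area starting at XSI_BANK1_R16
+	 * in the shared state area.
+	 */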
29988 + movl r30=XSI_BANKNUM;
29989 + mov r31=1;;
29990 + st4 [r30]=r31;
29991 + movl r30=XSI_BANK1_R16;
29992 + movl r31=XSI_BANK1_R16+8;;
29993 + ld8 r16=[r30],16; ld8 r17=[r31],16;;
29994 + ld8 r18=[r30],16; ld8 r19=[r31],16;;
29995 + ld8 r20=[r30],16; ld8 r21=[r31],16;;
29996 + ld8 r22=[r30],16; ld8 r23=[r31],16;;
29997 + ld8 r24=[r30],16; ld8 r25=[r31],16;;
29998 + ld8 r26=[r30],16; ld8 r27=[r31],16;;
29999 + ld8 r28=[r30],16; ld8 r29=[r31],16;;
30000 + ld8 r30=[r30]; ld8 r31=[r31];;
30001 + br.ret.sptk.many b0
30002 +END(xen_bsw1)
30003 +#endif
30004 +
30005 + .org ia64_ivt+0x7f00
30006 +/////////////////////////////////////////////////////////////////////////////////////////
30007 +// 0x7f00 Entry 67 (size 16 bundles) Reserved
30008 + DBG_FAULT(67)
30009 + FAULT(67)
30010 +
30011 +#ifdef CONFIG_IA32_SUPPORT
30012 +
30013 + /*
30014 + * There is no particular reason for this code to be here, other than that
30015 + * there happens to be space here that would go unused otherwise. If this
30016 + * fault ever gets "unreserved", simply move the following code to a more
30017 + * suitable spot...
30018 + */
30019 +
30020 + // IA32 interrupt entry point
30021 +
30022 +ENTRY(dispatch_to_ia32_handler)
30023 + SAVE_MIN
30024 + ;;
30025 + mov r14=cr.isr
30026 + ssm psr.ic | PSR_DEFAULT_BITS
30027 + ;;
30028 + srlz.i // guarantee that interruption collection is on
30029 + ;;
30030 +(p15) ssm psr.i
30031 + adds r3=8,r2 // Base pointer for SAVE_REST
30032 + ;;
30033 + SAVE_REST
30034 + ;;
30035 + mov r15=0x80
30036 + shr r14=r14,16 // Get interrupt number
30037 + ;;
30038 + cmp.ne p6,p0=r14,r15
30039 +(p6) br.call.dpnt.many b6=non_ia32_syscall
30040 +
30041 + adds r14=IA64_PT_REGS_R8_OFFSET + 16,sp // 16 byte hole per SW conventions
30042 + adds r15=IA64_PT_REGS_R1_OFFSET + 16,sp
30043 + ;;
30044 + cmp.eq pSys,pNonSys=r0,r0 // set pSys=1, pNonSys=0
30045 + ld8 r8=[r14] // get r8
30046 + ;;
30047 + st8 [r15]=r8 // save original EAX in r1 (IA32 procs don't use the GP)
30048 + ;;
30049 + alloc r15=ar.pfs,0,0,6,0 // must be first in an insn group
30050 + ;;
30051 + ld4 r8=[r14],8 // r8 == eax (syscall number)
30052 + mov r15=IA32_NR_syscalls
30053 + ;;
30054 + cmp.ltu.unc p6,p7=r8,r15
30055 + ld4 out1=[r14],8 // r9 == ecx
30056 + ;;
30057 + ld4 out2=[r14],8 // r10 == edx
30058 + ;;
30059 + ld4 out0=[r14] // r11 == ebx
30060 + adds r14=(IA64_PT_REGS_R13_OFFSET) + 16,sp
30061 + ;;
30062 + ld4 out5=[r14],PT(R14)-PT(R13) // r13 == ebp
30063 + ;;
30064 + ld4 out3=[r14],PT(R15)-PT(R14) // r14 == esi
30065 + adds r2=TI_FLAGS+IA64_TASK_SIZE,r13
30066 + ;;
30067 + ld4 out4=[r14] // r15 == edi
30068 + movl r16=ia32_syscall_table
30069 + ;;
30070 +(p6) shladd r16=r8,3,r16 // force ni_syscall if not valid syscall number
30071 + ld4 r2=[r2] // r2 = current_thread_info()->flags
30072 + ;;
30073 + ld8 r16=[r16]
30074 + and r2=_TIF_SYSCALL_TRACEAUDIT,r2 // mask trace or audit
30075 + ;;
30076 + mov b6=r16
30077 + movl r15=ia32_ret_from_syscall
30078 + cmp.eq p8,p0=r2,r0
30079 + ;;
30080 + mov rp=r15
30081 +(p8) br.call.sptk.many b6=b6
30082 + br.cond.sptk ia32_trace_syscall
30083 +
30084 +non_ia32_syscall:
30085 + alloc r15=ar.pfs,0,0,2,0
30086 + mov out0=r14 // interrupt #
30087 + add out1=16,sp // pointer to pt_regs
30088 + ;; // avoid WAW on CFM
30089 + br.call.sptk.many rp=ia32_bad_interrupt
30090 +.ret1: movl r15=ia64_leave_kernel
30091 + ;;
30092 + mov rp=r15
30093 + br.ret.sptk.many rp
30094 +END(dispatch_to_ia32_handler)
30095 +#endif /* CONFIG_IA32_SUPPORT */
30096 +
30097 +#ifdef CONFIG_XEN
30098 + .section .text,"ax"
30099 +GLOBAL_ENTRY(xen_event_callback)
30100 + mov r31=pr // prepare to save predicates
30101 + ;;
30102 + SAVE_MIN_WITH_COVER // uses r31; defines r2 and r3
30103 + ;;
30104 + movl r3=XSI_PSR_IC
30105 + mov r14=1
30106 + ;;
30107 + st4 [r3]=r14
30108 + ;;
30109 + adds r3=8,r2 // set up second base pointer for SAVE_REST
30110 + srlz.i // ensure everybody knows psr.ic is back on
30111 + ;;
30112 + SAVE_REST
30113 + ;;
30114 +1:
30115 + alloc r14=ar.pfs,0,0,1,0 // must be first in an insn group
30116 + add out0=16,sp // pass pointer to pt_regs as first arg
30117 + ;;
30118 + br.call.sptk.many b0=evtchn_do_upcall
30119 + ;;
30120 + movl r20=XSI_PSR_I_ADDR
30121 + ;;
30122 + ld8 r20=[r20]
30123 + ;;
30124 + adds r20=-1,r20 // vcpu_info->evtchn_upcall_pending
30125 + ;;
30126 + ld1 r20=[r20]
30127 + ;;
30128 + cmp.ne p6,p0=r20,r0 // if there are pending events,
30129 + (p6) br.spnt.few 1b // call evtchn_do_upcall again.
30130 + br.sptk.many ia64_leave_kernel
30131 +END(xen_event_callback)
30132 +#endif
30133 diff -Nur linux-2.6.16.33-noxen/arch/ia64/xen/xenminstate.h linux-2.6.16.33/arch/ia64/xen/xenminstate.h
30134 --- linux-2.6.16.33-noxen/arch/ia64/xen/xenminstate.h 1970-01-01 00:00:00.000000000 +0000
30135 +++ linux-2.6.16.33/arch/ia64/xen/xenminstate.h 2007-01-08 15:00:45.000000000 +0000
30136 @@ -0,0 +1,369 @@
30137 +#include <linux/config.h>
30138 +
30139 +#include <asm/cache.h>
30140 +
30141 +#ifdef CONFIG_XEN
30142 +#include "../kernel/entry.h"
30143 +#else
30144 +#include "entry.h"
30145 +#endif
30146 +
30147 +/*
30148 + * For ivt.s we want to access the stack virtually so we don't have to disable translation
30149 + * on interrupts.
30150 + *
30151 + * On entry:
30152 + * r1: pointer to current task (ar.k6)
30153 + */
30154 +#define MINSTATE_START_SAVE_MIN_VIRT \
30155 +(pUStk) mov ar.rsc=0; /* set enforced lazy mode, pl 0, little-endian, loadrs=0 */ \
30156 + ;; \
30157 +(pUStk) mov.m r24=ar.rnat; \
30158 +(pUStk) addl r22=IA64_RBS_OFFSET,r1; /* compute base of RBS */ \
30159 +(pKStk) mov r1=sp; /* get sp */ \
30160 + ;; \
30161 +(pUStk) lfetch.fault.excl.nt1 [r22]; \
30162 +(pUStk) addl r1=IA64_STK_OFFSET-IA64_PT_REGS_SIZE,r1; /* compute base of memory stack */ \
30163 +(pUStk) mov r23=ar.bspstore; /* save ar.bspstore */ \
30164 + ;; \
30165 +(pUStk) mov ar.bspstore=r22; /* switch to kernel RBS */ \
30166 +(pKStk) addl r1=-IA64_PT_REGS_SIZE,r1; /* if in kernel mode, use sp (r12) */ \
30167 + ;; \
30168 +(pUStk) mov r18=ar.bsp; \
30169 +(pUStk) mov ar.rsc=0x3; /* set eager mode, pl 0, little-endian, loadrs=0 */ \
30170 +
30171 +#define MINSTATE_END_SAVE_MIN_VIRT \
30172 + bsw.1; /* switch back to bank 1 (must be last in insn group) */ \
30173 + ;;
30174 +
30175 +/*
30176 + * For mca_asm.S we want to access the stack physically since the state is saved before we
30177 + * go virtual and don't want to destroy the iip or ipsr.
30178 + */
30179 +#define MINSTATE_START_SAVE_MIN_PHYS \
30180 +(pKStk) mov r3=IA64_KR(PER_CPU_DATA);; \
30181 +(pKStk) addl r3=THIS_CPU(ia64_mca_data),r3;; \
30182 +(pKStk) ld8 r3 = [r3];; \
30183 +(pKStk) addl r3=IA64_MCA_CPU_INIT_STACK_OFFSET,r3;; \
30184 +(pKStk) addl sp=IA64_STK_OFFSET-IA64_PT_REGS_SIZE,r3; \
30185 +(pUStk) mov ar.rsc=0; /* set enforced lazy mode, pl 0, little-endian, loadrs=0 */ \
30186 +(pUStk) addl r22=IA64_RBS_OFFSET,r1; /* compute base of register backing store */ \
30187 + ;; \
30188 +(pUStk) mov r24=ar.rnat; \
30189 +(pUStk) addl r1=IA64_STK_OFFSET-IA64_PT_REGS_SIZE,r1; /* compute base of memory stack */ \
30190 +(pUStk) mov r23=ar.bspstore; /* save ar.bspstore */ \
30191 +(pUStk) dep r22=-1,r22,61,3; /* compute kernel virtual addr of RBS */ \
30192 + ;; \
30193 +(pKStk) addl r1=-IA64_PT_REGS_SIZE,r1; /* if in kernel mode, use sp (r12) */ \
30194 +(pUStk) mov ar.bspstore=r22; /* switch to kernel RBS */ \
30195 + ;; \
30196 +(pUStk) mov r18=ar.bsp; \
30197 +(pUStk) mov ar.rsc=0x3; /* set eager mode, pl 0, little-endian, loadrs=0 */ \
30198 +
30199 +#define MINSTATE_END_SAVE_MIN_PHYS \
30200 + dep r12=-1,r12,61,3; /* make sp a kernel virtual address */ \
30201 + ;;
30202 +
30203 +#ifdef MINSTATE_VIRT
30204 +# define MINSTATE_GET_CURRENT(reg) mov reg=IA64_KR(CURRENT)
30205 +# define MINSTATE_START_SAVE_MIN MINSTATE_START_SAVE_MIN_VIRT
30206 +# define MINSTATE_END_SAVE_MIN MINSTATE_END_SAVE_MIN_VIRT
30207 +#endif
30208 +
30209 +#ifdef MINSTATE_PHYS
30210 +# define MINSTATE_GET_CURRENT(reg) mov reg=IA64_KR(CURRENT);; tpa reg=reg
30211 +# define MINSTATE_START_SAVE_MIN MINSTATE_START_SAVE_MIN_PHYS
30212 +# define MINSTATE_END_SAVE_MIN MINSTATE_END_SAVE_MIN_PHYS
30213 +#endif
30214 +
30215 +/*
30216 + * DO_SAVE_MIN switches to the kernel stacks (if necessary) and saves
30217 + * the minimum state necessary that allows us to turn psr.ic back
30218 + * on.
30219 + *
30220 + * Assumed state upon entry:
30221 + * psr.ic: off
30222 + * r31: contains saved predicates (pr)
30223 + *
30224 + * Upon exit, the state is as follows:
30225 + * psr.ic: off
30226 + * r2 = points to &pt_regs.r16
30227 + * r8 = contents of ar.ccv
30228 + * r9 = contents of ar.csd
30229 + * r10 = contents of ar.ssd
30230 + * r11 = FPSR_DEFAULT
30231 + * r12 = kernel sp (kernel virtual address)
30232 + * r13 = points to current task_struct (kernel virtual address)
30233 + * p15 = TRUE if psr.i is set in cr.ipsr
30234 + * predicate registers (other than p2, p3, and p15), b6, r3, r14, r15:
30235 + * preserved
30236 + * CONFIG_XEN note: p6/p7 are not preserved
30237 + *
30238 + * Note that psr.ic is NOT turned on by this macro. This is so that
30239 + * we can pass interruption state as arguments to a handler.
30240 + */
30241 +#ifdef CONFIG_XEN
30242 +#define DO_SAVE_MIN(COVER,SAVE_IFS,EXTRA) \
30243 + MINSTATE_GET_CURRENT(r16); /* M (or M;;I) */ \
30244 + mov r27=ar.rsc; /* M */ \
30245 + mov r20=r1; /* A */ \
30246 + mov r25=ar.unat; /* M */ \
30247 + /* mov r29=cr.ipsr; /* M */ \
30248 + movl r29=XSI_IPSR;; \
30249 + ld8 r29=[r29];; \
30250 + mov r26=ar.pfs; /* I */ \
30251 + /* mov r28=cr.iip; /* M */ \
30252 + movl r28=XSI_IIP;; \
30253 + ld8 r28=[r28];; \
30254 + mov r21=ar.fpsr; /* M */ \
30255 + COVER; /* B;; (or nothing) */ \
30256 + ;; \
30257 + adds r16=IA64_TASK_THREAD_ON_USTACK_OFFSET,r16; \
30258 + ;; \
30259 + ld1 r17=[r16]; /* load current->thread.on_ustack flag */ \
30260 + st1 [r16]=r0; /* clear current->thread.on_ustack flag */ \
30261 + adds r1=-IA64_TASK_THREAD_ON_USTACK_OFFSET,r16 \
30262 + /* switch from user to kernel RBS: */ \
30263 + ;; \
30264 + invala; /* M */ \
30265 + /* SAVE_IFS; /* see xen special handling below */ \
30266 + cmp.eq pKStk,pUStk=r0,r17; /* are we in kernel mode already? */ \
30267 + ;; \
30268 + MINSTATE_START_SAVE_MIN \
30269 + adds r17=2*L1_CACHE_BYTES,r1; /* really: biggest cache-line size */ \
30270 + adds r16=PT(CR_IPSR),r1; \
30271 + ;; \
30272 + lfetch.fault.excl.nt1 [r17],L1_CACHE_BYTES; \
30273 + st8 [r16]=r29; /* save cr.ipsr */ \
30274 + ;; \
30275 + lfetch.fault.excl.nt1 [r17]; \
30276 + tbit.nz p15,p0=r29,IA64_PSR_I_BIT; \
30277 + mov r29=b0 \
30278 + ;; \
30279 + adds r16=PT(R8),r1; /* initialize first base pointer */ \
30280 + adds r17=PT(R9),r1; /* initialize second base pointer */ \
30281 +(pKStk) mov r18=r0; /* make sure r18 isn't NaT */ \
30282 + ;; \
30283 +.mem.offset 0,0; st8.spill [r16]=r8,16; \
30284 +.mem.offset 8,0; st8.spill [r17]=r9,16; \
30285 + ;; \
30286 +.mem.offset 0,0; st8.spill [r16]=r10,24; \
30287 +.mem.offset 8,0; st8.spill [r17]=r11,24; \
30288 + ;; \
30289 + /* xen special handling for possibly lazy cover */ \
30290 + movl r8=XSI_INCOMPL_REGFR; \
30291 + ;; \
30292 + ld4 r30=[r8]; \
30293 + ;; \
30294 + /* set XSI_INCOMPL_REGFR 0 */ \
30295 + st4 [r8]=r0; \
30296 + cmp.eq p6,p7=r30,r0; \
30297 + ;; /* not sure if this stop bit is necessary */ \
30298 +(p6) adds r8=XSI_PRECOVER_IFS-XSI_INCOMPL_REGFR,r8; \
30299 +(p7) adds r8=XSI_IFS-XSI_INCOMPL_REGFR,r8; \
30300 + ;; \
30301 + ld8 r30=[r8]; \
30302 + ;; \
30303 + st8 [r16]=r28,16; /* save cr.iip */ \
30304 + st8 [r17]=r30,16; /* save cr.ifs */ \
30305 +(pUStk) sub r18=r18,r22; /* r18=RSE.ndirty*8 */ \
30306 + mov r8=ar.ccv; \
30307 + mov r9=ar.csd; \
30308 + mov r10=ar.ssd; \
30309 + movl r11=FPSR_DEFAULT; /* L-unit */ \
30310 + ;; \
30311 + st8 [r16]=r25,16; /* save ar.unat */ \
30312 + st8 [r17]=r26,16; /* save ar.pfs */ \
30313 + shl r18=r18,16; /* compute ar.rsc to be used for "loadrs" */ \
30314 + ;; \
30315 + st8 [r16]=r27,16; /* save ar.rsc */ \
30316 +(pUStk) st8 [r17]=r24,16; /* save ar.rnat */ \
30317 +(pKStk) adds r17=16,r17; /* skip over ar_rnat field */ \
30318 + ;; /* avoid RAW on r16 & r17 */ \
30319 +(pUStk) st8 [r16]=r23,16; /* save ar.bspstore */ \
30320 + st8 [r17]=r31,16; /* save predicates */ \
30321 +(pKStk) adds r16=16,r16; /* skip over ar_bspstore field */ \
30322 + ;; \
30323 + st8 [r16]=r29,16; /* save b0 */ \
30324 + st8 [r17]=r18,16; /* save ar.rsc value for "loadrs" */ \
30325 + cmp.eq pNonSys,pSys=r0,r0 /* initialize pSys=0, pNonSys=1 */ \
30326 + ;; \
30327 +.mem.offset 0,0; st8.spill [r16]=r20,16; /* save original r1 */ \
30328 +.mem.offset 8,0; st8.spill [r17]=r12,16; \
30329 + adds r12=-16,r1; /* switch to kernel memory stack (with 16 bytes of scratch) */ \
30330 + ;; \
30331 +.mem.offset 0,0; st8.spill [r16]=r13,16; \
30332 +.mem.offset 8,0; st8.spill [r17]=r21,16; /* save ar.fpsr */ \
30333 + mov r13=IA64_KR(CURRENT); /* establish `current' */ \
30334 + ;; \
30335 +.mem.offset 0,0; st8.spill [r16]=r15,16; \
30336 +.mem.offset 8,0; st8.spill [r17]=r14,16; \
30337 + ;; \
30338 +.mem.offset 0,0; st8.spill [r16]=r2,16; \
30339 +.mem.offset 8,0; st8.spill [r17]=r3,16; \
30340 + ;; \
30341 + EXTRA; \
30342 + mov r2=b0; br.call.sptk b0=xen_bsw1;; mov b0=r2; \
30343 + adds r2=IA64_PT_REGS_R16_OFFSET,r1; \
30344 + ;; \
30345 + movl r1=__gp; /* establish kernel global pointer */ \
30346 + ;; \
30347 + /* MINSTATE_END_SAVE_MIN */
30348 +#else
30349 +#define DO_SAVE_MIN(COVER,SAVE_IFS,EXTRA) \
30350 + MINSTATE_GET_CURRENT(r16); /* M (or M;;I) */ \
30351 + mov r27=ar.rsc; /* M */ \
30352 + mov r20=r1; /* A */ \
30353 + mov r25=ar.unat; /* M */ \
30354 + mov r29=cr.ipsr; /* M */ \
30355 + mov r26=ar.pfs; /* I */ \
30356 + mov r28=cr.iip; /* M */ \
30357 + mov r21=ar.fpsr; /* M */ \
30358 + COVER; /* B;; (or nothing) */ \
30359 + ;; \
30360 + adds r16=IA64_TASK_THREAD_ON_USTACK_OFFSET,r16; \
30361 + ;; \
30362 + ld1 r17=[r16]; /* load current->thread.on_ustack flag */ \
30363 + st1 [r16]=r0; /* clear current->thread.on_ustack flag */ \
30364 + adds r1=-IA64_TASK_THREAD_ON_USTACK_OFFSET,r16 \
30365 + /* switch from user to kernel RBS: */ \
30366 + ;; \
30367 + invala; /* M */ \
30368 + SAVE_IFS; \
30369 + cmp.eq pKStk,pUStk=r0,r17; /* are we in kernel mode already? */ \
30370 + ;; \
30371 + MINSTATE_START_SAVE_MIN \
30372 + adds r17=2*L1_CACHE_BYTES,r1; /* really: biggest cache-line size */ \
30373 + adds r16=PT(CR_IPSR),r1; \
30374 + ;; \
30375 + lfetch.fault.excl.nt1 [r17],L1_CACHE_BYTES; \
30376 + st8 [r16]=r29; /* save cr.ipsr */ \
30377 + ;; \
30378 + lfetch.fault.excl.nt1 [r17]; \
30379 + tbit.nz p15,p0=r29,IA64_PSR_I_BIT; \
30380 + mov r29=b0 \
30381 + ;; \
30382 + adds r16=PT(R8),r1; /* initialize first base pointer */ \
30383 + adds r17=PT(R9),r1; /* initialize second base pointer */ \
30384 +(pKStk) mov r18=r0; /* make sure r18 isn't NaT */ \
30385 + ;; \
30386 +.mem.offset 0,0; st8.spill [r16]=r8,16; \
30387 +.mem.offset 8,0; st8.spill [r17]=r9,16; \
30388 + ;; \
30389 +.mem.offset 0,0; st8.spill [r16]=r10,24; \
30390 +.mem.offset 8,0; st8.spill [r17]=r11,24; \
30391 + ;; \
30392 + st8 [r16]=r28,16; /* save cr.iip */ \
30393 + st8 [r17]=r30,16; /* save cr.ifs */ \
30394 +(pUStk) sub r18=r18,r22; /* r18=RSE.ndirty*8 */ \
30395 + mov r8=ar.ccv; \
30396 + mov r9=ar.csd; \
30397 + mov r10=ar.ssd; \
30398 + movl r11=FPSR_DEFAULT; /* L-unit */ \
30399 + ;; \
30400 + st8 [r16]=r25,16; /* save ar.unat */ \
30401 + st8 [r17]=r26,16; /* save ar.pfs */ \
30402 + shl r18=r18,16; /* compute ar.rsc to be used for "loadrs" */ \
30403 + ;; \
30404 + st8 [r16]=r27,16; /* save ar.rsc */ \
30405 +(pUStk) st8 [r17]=r24,16; /* save ar.rnat */ \
30406 +(pKStk) adds r17=16,r17; /* skip over ar_rnat field */ \
30407 + ;; /* avoid RAW on r16 & r17 */ \
30408 +(pUStk) st8 [r16]=r23,16; /* save ar.bspstore */ \
30409 + st8 [r17]=r31,16; /* save predicates */ \
30410 +(pKStk) adds r16=16,r16; /* skip over ar_bspstore field */ \
30411 + ;; \
30412 + st8 [r16]=r29,16; /* save b0 */ \
30413 + st8 [r17]=r18,16; /* save ar.rsc value for "loadrs" */ \
30414 + cmp.eq pNonSys,pSys=r0,r0 /* initialize pSys=0, pNonSys=1 */ \
30415 + ;; \
30416 +.mem.offset 0,0; st8.spill [r16]=r20,16; /* save original r1 */ \
30417 +.mem.offset 8,0; st8.spill [r17]=r12,16; \
30418 + adds r12=-16,r1; /* switch to kernel memory stack (with 16 bytes of scratch) */ \
30419 + ;; \
30420 +.mem.offset 0,0; st8.spill [r16]=r13,16; \
30421 +.mem.offset 8,0; st8.spill [r17]=r21,16; /* save ar.fpsr */ \
30422 + mov r13=IA64_KR(CURRENT); /* establish `current' */ \
30423 + ;; \
30424 +.mem.offset 0,0; st8.spill [r16]=r15,16; \
30425 +.mem.offset 8,0; st8.spill [r17]=r14,16; \
30426 + ;; \
30427 +.mem.offset 0,0; st8.spill [r16]=r2,16; \
30428 +.mem.offset 8,0; st8.spill [r17]=r3,16; \
30429 + adds r2=IA64_PT_REGS_R16_OFFSET,r1; \
30430 + ;; \
30431 + EXTRA; \
30432 + movl r1=__gp; /* establish kernel global pointer */ \
30433 + ;; \
30434 + MINSTATE_END_SAVE_MIN
30435 +#endif
30436 +
30437 +/*
30438 + * SAVE_REST saves the remainder of pt_regs (with psr.ic on).
30439 + *
30440 + * Assumed state upon entry:
30441 + * psr.ic: on
30442 + * r2: points to &pt_regs.r16
30443 + * r3: points to &pt_regs.r17
30444 + * r8: contents of ar.ccv
30445 + * r9: contents of ar.csd
30446 + * r10: contents of ar.ssd
30447 + * r11: FPSR_DEFAULT
30448 + *
30449 + * Registers r14 and r15 are guaranteed not to be touched by SAVE_REST.
30450 + */
30451 +#define SAVE_REST \
30452 +.mem.offset 0,0; st8.spill [r2]=r16,16; \
30453 +.mem.offset 8,0; st8.spill [r3]=r17,16; \
30454 + ;; \
30455 +.mem.offset 0,0; st8.spill [r2]=r18,16; \
30456 +.mem.offset 8,0; st8.spill [r3]=r19,16; \
30457 + ;; \
30458 +.mem.offset 0,0; st8.spill [r2]=r20,16; \
30459 +.mem.offset 8,0; st8.spill [r3]=r21,16; \
30460 + mov r18=b6; \
30461 + ;; \
30462 +.mem.offset 0,0; st8.spill [r2]=r22,16; \
30463 +.mem.offset 8,0; st8.spill [r3]=r23,16; \
30464 + mov r19=b7; \
30465 + ;; \
30466 +.mem.offset 0,0; st8.spill [r2]=r24,16; \
30467 +.mem.offset 8,0; st8.spill [r3]=r25,16; \
30468 + ;; \
30469 +.mem.offset 0,0; st8.spill [r2]=r26,16; \
30470 +.mem.offset 8,0; st8.spill [r3]=r27,16; \
30471 + ;; \
30472 +.mem.offset 0,0; st8.spill [r2]=r28,16; \
30473 +.mem.offset 8,0; st8.spill [r3]=r29,16; \
30474 + ;; \
30475 +.mem.offset 0,0; st8.spill [r2]=r30,16; \
30476 +.mem.offset 8,0; st8.spill [r3]=r31,32; \
30477 + ;; \
30478 + mov ar.fpsr=r11; /* M-unit */ \
30479 + st8 [r2]=r8,8; /* ar.ccv */ \
30480 + adds r24=PT(B6)-PT(F7),r3; \
30481 + ;; \
30482 + stf.spill [r2]=f6,32; \
30483 + stf.spill [r3]=f7,32; \
30484 + ;; \
30485 + stf.spill [r2]=f8,32; \
30486 + stf.spill [r3]=f9,32; \
30487 + ;; \
30488 + stf.spill [r2]=f10; \
30489 + stf.spill [r3]=f11; \
30490 + adds r25=PT(B7)-PT(F11),r3; \
30491 + ;; \
30492 + st8 [r24]=r18,16; /* b6 */ \
30493 + st8 [r25]=r19,16; /* b7 */ \
30494 + ;; \
30495 + st8 [r24]=r9; /* ar.csd */ \
30496 + st8 [r25]=r10; /* ar.ssd */ \
30497 + ;;
30498 +
30499 +#define SAVE_MIN_WITH_COVER DO_SAVE_MIN(cover, mov r30=cr.ifs,)
30500 +#define SAVE_MIN_WITH_COVER_R19 DO_SAVE_MIN(cover, mov r30=cr.ifs, mov r15=r19)
30501 +#ifdef CONFIG_XEN
30502 +#define SAVE_MIN break 0;; /* FIXME: non-cover version only for ia32 support? */
30503 +#else
30504 +#define SAVE_MIN DO_SAVE_MIN( , mov r30=r0, )
30505 +#endif
30506 diff -Nur linux-2.6.16.33-noxen/arch/ia64/xen/xenpal.S linux-2.6.16.33/arch/ia64/xen/xenpal.S
30507 --- linux-2.6.16.33-noxen/arch/ia64/xen/xenpal.S 1970-01-01 00:00:00.000000000 +0000
30508 +++ linux-2.6.16.33/arch/ia64/xen/xenpal.S 2007-01-08 15:00:45.000000000 +0000
30509 @@ -0,0 +1,76 @@
30510 +/*
30511 + * ia64/xen/xenpal.S
30512 + *
30513 + * Alternate PAL routines for Xen. Heavily leveraged from
30514 + * ia64/kernel/pal.S
30515 + *
30516 + * Copyright (C) 2005 Hewlett-Packard Co
30517 + * Dan Magenheimer <dan.magenheimer@hp.com>
30518 + */
30519 +
30520 +#include <asm/asmmacro.h>
30521 +#include <asm/processor.h>
30522 +
30523 +GLOBAL_ENTRY(xen_pal_call_static)
30524 + .prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(5)
30525 + alloc loc1 = ar.pfs,5,5,0,0
30526 +#ifdef CONFIG_XEN
30527 + movl r22=running_on_xen;;
30528 + ld4 r22=[r22];;
30529 + cmp.eq p7,p0=r22,r0
30530 +(p7) br.cond.spnt.many __ia64_pal_call_static;;
30531 +#endif
30532 + movl loc2 = pal_entry_point
30533 +1: {
30534 + mov r28 = in0
30535 + mov r29 = in1
30536 + mov r8 = ip
30537 + }
30538 + ;;
30539 + ld8 loc2 = [loc2] // loc2 <- entry point
30540 + tbit.nz p6,p7 = in4, 0
30541 + adds r8 = 1f-1b,r8
30542 + mov loc4=ar.rsc // save RSE configuration
30543 + ;;
30544 + mov ar.rsc=0 // put RSE in enforced lazy, LE mode
30545 + mov loc3 = psr
30546 + mov loc0 = rp
30547 + .body
30548 + mov r30 = in2
30549 +
30550 +#ifdef CONFIG_XEN
30551 + // this is low priority for paravirtualization, but is called
30552 + // from the idle loop so confuses privop counting
30553 + movl r31=XSI_PSR_IC
30554 + ;;
30555 +(p6) st4 [r31]=r0
30556 + ;;
30557 +(p7) adds r31=XSI_PSR_I_ADDR_OFS-XSI_PSR_IC_OFS,r31
30558 +(p7) mov r22=1
30559 + ;;
30560 +(p7) ld8 r31=[r31]
30561 + ;;
30562 +(p7) st1 [r31]=r22
30563 + ;;
30564 + mov r31 = in3
30565 + mov b7 = loc2
30566 + ;;
30567 +#else
30568 +(p6) rsm psr.i | psr.ic
30569 + mov r31 = in3
30570 + mov b7 = loc2
30571 +
30572 +(p7) rsm psr.i
30573 + ;;
30574 +(p6) srlz.i
30575 +#endif
30576 + mov rp = r8
30577 + br.cond.sptk.many b7
30578 +1: mov psr.l = loc3
30579 + mov ar.rsc = loc4 // restore RSE configuration
30580 + mov ar.pfs = loc1
30581 + mov rp = loc0
30582 + ;;
30583 + srlz.d // serialize restoration of psr.l
30584 + br.ret.sptk.many b0
30585 +END(xen_pal_call_static)
30586 diff -Nur linux-2.6.16.33-noxen/arch/ia64/xen/xensetup.S linux-2.6.16.33/arch/ia64/xen/xensetup.S
30587 --- linux-2.6.16.33-noxen/arch/ia64/xen/xensetup.S 1970-01-01 00:00:00.000000000 +0000
30588 +++ linux-2.6.16.33/arch/ia64/xen/xensetup.S 2007-01-08 15:00:45.000000000 +0000
30589 @@ -0,0 +1,54 @@
30590 +/*
30591 + * Support routines for Xen
30592 + *
30593 + * Copyright (C) 2005 Dan Magenheimer <dan.magenheimer@hp.com>
30594 + */
30595 +
30596 +#include <linux/config.h>
30597 +#include <asm/processor.h>
30598 +#include <asm/asmmacro.h>
30599 +
30600 +#define isBP p3 // are we the Bootstrap Processor?
30601 +
30602 + .text
30603 +GLOBAL_ENTRY(early_xen_setup)
30604 + mov r8=ar.rsc // Initialized in head.S
30605 +(isBP) movl r9=running_on_xen;;
30606 + extr.u r8=r8,2,2;; // Extract pl fields
30607 + cmp.eq p7,p0=r8,r0 // p7: !running on xen
30608 + mov r8=1 // booleanize.
30609 +(p7) br.ret.sptk.many rp;;
30610 +(isBP) st4 [r9]=r8
30611 + movl r10=xen_ivt;;
30612 +
30613 + mov cr.iva=r10
30614 +
30615 + /* Set xsi base. */
30616 +#define FW_HYPERCALL_SET_SHARED_INFO_VA 0x600
30617 +(isBP) mov r2=FW_HYPERCALL_SET_SHARED_INFO_VA
30618 +(isBP) movl r28=XSI_BASE;;
30619 +(isBP) break 0x1000;;
30620 +
30621 + br.ret.sptk.many rp
30622 + ;;
30623 +END(early_xen_setup)
30624 +
30625 +#include <xen/interface/xen.h>
30626 +
30627 +/* Stub for suspend.
30628 + Just force the stacked registers to be written in memory. */
30629 +GLOBAL_ENTRY(xencomm_arch_hypercall_suspend)
30630 + mov r15=r32
30631 + ;;
30632 + alloc r20=ar.pfs,0,0,0,0
30633 + mov r2=__HYPERVISOR_sched_op
30634 + ;;
30635 + /* We don't want to deal with RSE. */
30636 + flushrs
30637 + mov r14=2 // SCHEDOP_shutdown
30638 + ;;
30639 + break 0x1000
30640 + ;;
30641 + mov ar.pfs=r20
30642 + br.ret.sptk.many b0
30643 +END(xencomm_arch_hypercall_suspend)
30644 diff -Nur linux-2.6.16.33-noxen/arch/powerpc/kernel/machine_kexec_32.c linux-2.6.16.33/arch/powerpc/kernel/machine_kexec_32.c
30645 --- linux-2.6.16.33-noxen/arch/powerpc/kernel/machine_kexec_32.c 2006-11-22 18:06:31.000000000 +0000
30646 +++ linux-2.6.16.33/arch/powerpc/kernel/machine_kexec_32.c 2007-05-23 21:00:01.000000000 +0000
30647 @@ -30,8 +30,8 @@
30648 */
30649 void default_machine_kexec(struct kimage *image)
30650 {
30651 - const extern unsigned char relocate_new_kernel[];
30652 - const extern unsigned int relocate_new_kernel_size;
30653 + extern const unsigned char relocate_new_kernel[];
30654 + extern const unsigned int relocate_new_kernel_size;
30655 unsigned long page_list;
30656 unsigned long reboot_code_buffer, reboot_code_buffer_phys;
30657 relocate_new_kernel_t rnk;
30658 diff -Nur linux-2.6.16.33-noxen/arch/ppc/kernel/machine_kexec.c linux-2.6.16.33/arch/ppc/kernel/machine_kexec.c
30659 --- linux-2.6.16.33-noxen/arch/ppc/kernel/machine_kexec.c 2006-11-22 18:06:31.000000000 +0000
30660 +++ linux-2.6.16.33/arch/ppc/kernel/machine_kexec.c 2007-05-23 21:00:01.000000000 +0000
30661 @@ -25,8 +25,8 @@
30662 unsigned long reboot_code_buffer,
30663 unsigned long start_address) ATTRIB_NORET;
30664
30665 -const extern unsigned char relocate_new_kernel[];
30666 -const extern unsigned int relocate_new_kernel_size;
30667 +extern const unsigned char relocate_new_kernel[];
30668 +extern const unsigned int relocate_new_kernel_size;
30669
30670 void machine_shutdown(void)
30671 {
30672 diff -Nur linux-2.6.16.33-noxen/arch/s390/kernel/machine_kexec.c linux-2.6.16.33/arch/s390/kernel/machine_kexec.c
30673 --- linux-2.6.16.33-noxen/arch/s390/kernel/machine_kexec.c 2006-11-22 18:06:31.000000000 +0000
30674 +++ linux-2.6.16.33/arch/s390/kernel/machine_kexec.c 2007-05-23 21:00:01.000000000 +0000
30675 @@ -27,8 +27,8 @@
30676
30677 typedef void (*relocate_kernel_t) (kimage_entry_t *, unsigned long);
30678
30679 -const extern unsigned char relocate_kernel[];
30680 -const extern unsigned long long relocate_kernel_len;
30681 +extern const unsigned char relocate_kernel[];
30682 +extern const unsigned long long relocate_kernel_len;
30683
30684 int
30685 machine_kexec_prepare(struct kimage *image)
30686 diff -Nur linux-2.6.16.33-noxen/arch/sh/kernel/machine_kexec.c linux-2.6.16.33/arch/sh/kernel/machine_kexec.c
30687 --- linux-2.6.16.33-noxen/arch/sh/kernel/machine_kexec.c 2006-11-22 18:06:31.000000000 +0000
30688 +++ linux-2.6.16.33/arch/sh/kernel/machine_kexec.c 2007-05-23 21:00:01.000000000 +0000
30689 @@ -25,8 +25,8 @@
30690 unsigned long start_address,
30691 unsigned long vbr_reg) ATTRIB_NORET;
30692
30693 -const extern unsigned char relocate_new_kernel[];
30694 -const extern unsigned int relocate_new_kernel_size;
30695 +extern const unsigned char relocate_new_kernel[];
30696 +extern const unsigned int relocate_new_kernel_size;
30697 extern void *gdb_vbr_vector;
30698
30699 /*
30700 diff -Nur linux-2.6.16.33-noxen/arch/um/kernel/physmem.c linux-2.6.16.33/arch/um/kernel/physmem.c
30701 --- linux-2.6.16.33-noxen/arch/um/kernel/physmem.c 2006-11-22 18:06:31.000000000 +0000
30702 +++ linux-2.6.16.33/arch/um/kernel/physmem.c 2007-01-08 15:00:45.000000000 +0000
30703 @@ -225,7 +225,7 @@
30704 EXPORT_SYMBOL(physmem_remove_mapping);
30705 EXPORT_SYMBOL(physmem_subst_mapping);
30706
30707 -void arch_free_page(struct page *page, int order)
30708 +int arch_free_page(struct page *page, int order)
30709 {
30710 void *virt;
30711 int i;
30712 @@ -234,6 +234,8 @@
30713 virt = __va(page_to_phys(page + i));
30714 physmem_remove_mapping(virt);
30715 }
30716 +
30717 + return 0;
30718 }
30719
30720 int is_remapped(void *virt)
30721 diff -Nur linux-2.6.16.33-noxen/arch/x86_64/Kconfig linux-2.6.16.33/arch/x86_64/Kconfig
30722 --- linux-2.6.16.33-noxen/arch/x86_64/Kconfig 2006-11-22 18:06:31.000000000 +0000
30723 +++ linux-2.6.16.33/arch/x86_64/Kconfig 2007-01-08 15:00:45.000000000 +0000
30724 @@ -119,6 +119,22 @@
30725
30726 endchoice
30727
30728 +config X86_64_XEN
30729 + bool "Enable Xen compatible kernel"
30730 + select SWIOTLB
30731 + help
30732 + This option will compile a kernel compatible with the Xen hypervisor.
30733 +
30734 +config X86_NO_TSS
30735 + bool
30736 + depends on X86_64_XEN
30737 + default y
30738 +
30739 +config X86_NO_IDT
30740 + bool
30741 + depends on X86_64_XEN
30742 + default y
30743 +
30744 #
30745 # Define implied options from the CPU selection here
30746 #
30747 @@ -134,6 +150,7 @@
30748
30749 config X86_TSC
30750 bool
30751 + depends on !X86_64_XEN
30752 default y
30753
30754 config X86_GOOD_APIC
30755 @@ -176,7 +193,7 @@
30756
30757 config X86_HT
30758 bool
30759 - depends on SMP && !MK8
30760 + depends on SMP && !MK8 && !X86_64_XEN
30761 default y
30762
30763 config MATH_EMULATION
30764 @@ -190,14 +207,22 @@
30765
30766 config X86_IO_APIC
30767 bool
30768 + depends !XEN_UNPRIVILEGED_GUEST
30769 default y
30770
30771 +config X86_XEN_GENAPIC
30772 + bool
30773 + depends X86_64_XEN
30774 + default XEN_PRIVILEGED_GUEST || SMP
30775 +
30776 config X86_LOCAL_APIC
30777 bool
30778 + depends !XEN_UNPRIVILEGED_GUEST
30779 default y
30780
30781 config MTRR
30782 bool "MTRR (Memory Type Range Register) support"
30783 + depends on !XEN_UNPRIVILEGED_GUEST
30784 ---help---
30785 On Intel P6 family processors (Pentium Pro, Pentium II and later)
30786 the Memory Type Range Registers (MTRRs) may be used to control
30787 @@ -238,7 +263,7 @@
30788
30789 config SCHED_SMT
30790 bool "SMT (Hyperthreading) scheduler support"
30791 - depends on SMP
30792 + depends on SMP && !X86_64_XEN
30793 default n
30794 help
30795 SMT scheduler support improves the CPU scheduler's decision making
30796 @@ -250,7 +275,7 @@
30797
30798 config NUMA
30799 bool "Non Uniform Memory Access (NUMA) Support"
30800 - depends on SMP
30801 + depends on SMP && !X86_64_XEN
30802 help
30803 Enable NUMA (Non Uniform Memory Access) support. The kernel
30804 will try to allocate memory used by a CPU on the local memory
30805 @@ -305,7 +330,7 @@
30806
30807 config ARCH_SPARSEMEM_ENABLE
30808 def_bool y
30809 - depends on (NUMA || EXPERIMENTAL)
30810 + depends on (NUMA || EXPERIMENTAL) && !X86_64_XEN
30811
30812 config ARCH_MEMORY_PROBE
30813 def_bool y
30814 @@ -325,6 +350,7 @@
30815 int "Maximum number of CPUs (2-256)"
30816 range 2 256
30817 depends on SMP
30818 + default "16" if X86_64_XEN
30819 default "8"
30820 help
30821 This allows you to specify the maximum number of CPUs which this
30822 @@ -347,6 +373,7 @@
30823
30824 config HPET_TIMER
30825 bool
30826 + depends on !X86_64_XEN
30827 default y
30828 help
30829 Use the IA-PC HPET (High Precision Event Timer) to manage
30830 @@ -364,7 +391,7 @@
30831 bool "K8 GART IOMMU support"
30832 default y
30833 select SWIOTLB
30834 - depends on PCI
30835 + depends on PCI && !X86_64_XEN
30836 help
30837 Support the IOMMU. Needed to run systems with more than 3GB of memory
30838 properly with 32-bit PCI devices that do not support DAC (Double Address
30839 @@ -382,6 +409,7 @@
30840
30841 config X86_MCE
30842 bool "Machine check support" if EMBEDDED
30843 + depends on !X86_64_XEN
30844 default y
30845 help
30846 Include a machine check error handler to report hardware errors.
30847 @@ -407,7 +435,7 @@
30848
30849 config KEXEC
30850 bool "kexec system call (EXPERIMENTAL)"
30851 - depends on EXPERIMENTAL
30852 + depends on EXPERIMENTAL && !XEN_UNPRIVILEGED_GUEST
30853 help
30854 kexec is a system call that implements the ability to shutdown your
30855 current kernel, and to start another kernel. It is like a reboot
30856 @@ -490,8 +518,11 @@
30857 default y
30858
30859 menu "Power management options"
30860 + depends on !XEN_UNPRIVILEGED_GUEST
30861
30862 +if !X86_64_XEN
30863 source kernel/power/Kconfig
30864 +endif
30865
30866 source "drivers/acpi/Kconfig"
30867
30868 @@ -514,6 +545,21 @@
30869 bool "Support mmconfig PCI config space access"
30870 depends on PCI && ACPI
30871
30872 +config XEN_PCIDEV_FRONTEND
30873 + bool "Xen PCI Frontend"
30874 + depends on PCI && X86_64_XEN
30875 + default y
30876 + help
30877 + The PCI device frontend driver allows the kernel to import arbitrary
30878 + PCI devices from a PCI backend to support PCI driver domains.
30879 +
30880 +config XEN_PCIDEV_FE_DEBUG
30881 + bool "Xen PCI Frontend Debugging"
30882 + depends on XEN_PCIDEV_FRONTEND
30883 + default n
30884 + help
30885 + Enables some debug statements within the PCI Frontend.
30886 +
30887 config UNORDERED_IO
30888 bool "Unordered IO mapping access"
30889 depends on EXPERIMENTAL
30890 @@ -594,4 +640,6 @@
30891
30892 source "crypto/Kconfig"
30893
30894 +source "drivers/xen/Kconfig"
30895 +
30896 source "lib/Kconfig"
30897 diff -Nur linux-2.6.16.33-noxen/arch/x86_64/Makefile linux-2.6.16.33/arch/x86_64/Makefile
30898 --- linux-2.6.16.33-noxen/arch/x86_64/Makefile 2006-11-22 18:06:31.000000000 +0000
30899 +++ linux-2.6.16.33/arch/x86_64/Makefile 2007-01-08 15:00:45.000000000 +0000
30900 @@ -31,6 +31,10 @@
30901 cflags-$(CONFIG_MPSC) += $(call cc-option,-march=nocona)
30902 CFLAGS += $(cflags-y)
30903
30904 +cppflags-$(CONFIG_XEN) += \
30905 + -D__XEN_INTERFACE_VERSION__=$(CONFIG_XEN_INTERFACE_VERSION)
30906 +CPPFLAGS += $(cppflags-y)
30907 +
30908 CFLAGS += -m64
30909 CFLAGS += -mno-red-zone
30910 CFLAGS += -mcmodel=kernel
30911 @@ -70,6 +74,21 @@
30912 .PHONY: bzImage bzlilo install archmrproper \
30913 fdimage fdimage144 fdimage288 archclean
30914
30915 +ifdef CONFIG_XEN
30916 +CPPFLAGS := -Iinclude$(if $(KBUILD_SRC),2)/asm/mach-xen $(CPPFLAGS)
30917 +head-y := arch/x86_64/kernel/head-xen.o arch/x86_64/kernel/head64-xen.o arch/x86_64/kernel/init_task.o
30918 +LDFLAGS_vmlinux := -e _start
30919 +boot := arch/i386/boot-xen
30920 +.PHONY: vmlinuz
30921 +#Default target when executing "make"
30922 +all: vmlinuz
30923 +
30924 +vmlinuz: vmlinux
30925 + $(Q)$(MAKE) $(build)=$(boot) $@
30926 +
30927 +install:
30928 + $(Q)$(MAKE) $(build)=$(boot) XENGUEST=$(XENGUEST) $@
30929 +else
30930 #Default target when executing "make"
30931 all: bzImage
30932
30933 @@ -90,6 +109,7 @@
30934
30935 install:
30936 $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(BOOTIMAGE) $@
30937 +endif
30938
30939 archclean:
30940 $(Q)$(MAKE) $(clean)=$(boot)
30941 diff -Nur linux-2.6.16.33-noxen/arch/x86_64/ia32/Makefile linux-2.6.16.33/arch/x86_64/ia32/Makefile
30942 --- linux-2.6.16.33-noxen/arch/x86_64/ia32/Makefile 2006-11-22 18:06:31.000000000 +0000
30943 +++ linux-2.6.16.33/arch/x86_64/ia32/Makefile 2007-01-08 15:00:45.000000000 +0000
30944 @@ -23,9 +23,25 @@
30945 -Wl,-soname=linux-gate.so.1 -o $@ \
30946 -Wl,-T,$(filter-out FORCE,$^)
30947
30948 +$(obj)/vsyscall-int80.so \
30949 $(obj)/vsyscall-sysenter.so $(obj)/vsyscall-syscall.so: \
30950 $(obj)/vsyscall-%.so: $(src)/vsyscall.lds $(obj)/vsyscall-%.o FORCE
30951 $(call if_changed,syscall)
30952
30953 -AFLAGS_vsyscall-sysenter.o = -m32 -Wa,-32
30954 -AFLAGS_vsyscall-syscall.o = -m32 -Wa,-32
30955 +AFLAGS_vsyscall-sysenter.o = -m32 -Wa,-32 -Iarch/i386/kernel
30956 +AFLAGS_vsyscall-syscall.o = -m32 -Wa,-32 -Iarch/i386/kernel
30957 +
30958 +ifdef CONFIG_XEN
30959 +AFLAGS_vsyscall-int80.o = -m32 -Wa,-32 -Iarch/i386/kernel
30960 +CFLAGS_syscall32-xen.o += -DUSE_INT80
30961 +AFLAGS_syscall32_syscall-xen.o += -DUSE_INT80
30962 +
30963 +$(obj)/syscall32_syscall-xen.o: \
30964 + $(foreach F,int80 sysenter syscall,$(obj)/vsyscall-$F.so)
30965 +
30966 +targets := $(foreach F,int80 sysenter syscall,vsyscall-$F.o vsyscall-$F.so)
30967 +
30968 +include $(srctree)/scripts/Makefile.xen
30969 +
30970 +obj-y := $(call cherrypickxen, $(obj-y))
30971 +endif
30972 diff -Nur linux-2.6.16.33-noxen/arch/x86_64/ia32/ia32entry-xen.S linux-2.6.16.33/arch/x86_64/ia32/ia32entry-xen.S
30973 --- linux-2.6.16.33-noxen/arch/x86_64/ia32/ia32entry-xen.S 1970-01-01 00:00:00.000000000 +0000
30974 +++ linux-2.6.16.33/arch/x86_64/ia32/ia32entry-xen.S 2007-01-08 15:00:45.000000000 +0000
30975 @@ -0,0 +1,721 @@
30976 +/*
30977 + * Compatibility mode system call entry point for x86-64.
30978 + *
30979 + * Copyright 2000-2002 Andi Kleen, SuSE Labs.
30980 + */
30981 +
30982 +#include <asm/dwarf2.h>
30983 +#include <asm/calling.h>
30984 +#include <asm/asm-offsets.h>
30985 +#include <asm/current.h>
30986 +#include <asm/errno.h>
30987 +#include <asm/ia32_unistd.h>
30988 +#include <asm/thread_info.h>
30989 +#include <asm/segment.h>
30990 +#include <asm/vsyscall32.h>
30991 +#include <linux/linkage.h>
30992 +
30993 +#define __XEN_X86_64 1
30994 +
30995 + .macro IA32_ARG_FIXUP noebp=0
30996 + movl %edi,%r8d
30997 + .if \noebp
30998 + .else
30999 + movl %ebp,%r9d
31000 + .endif
31001 + xchg %ecx,%esi
31002 + movl %ebx,%edi
31003 + movl %edx,%edx /* zero extension */
31004 + .endm
31005 +
31006 + /* clobbers %eax */
31007 + .macro CLEAR_RREGS
31008 + xorl %eax,%eax
31009 + movq %rax,R11(%rsp)
31010 + movq %rax,R10(%rsp)
31011 + movq %rax,R9(%rsp)
31012 + movq %rax,R8(%rsp)
31013 + .endm
31014 +
31015 +#if defined (__XEN_X86_64)
31016 +#include "../kernel/xen_entry.S"
31017 +
31018 +#define __swapgs
31019 +#define __cli
31020 +#define __sti
31021 +#else
31022 +/*
31023 + * Use the native instructions
31024 + */
31025 +#define __swapgs swapgs
31026 +#define __cli cli
31027 +#define __sti sti
31028 +#endif
31029 +
31030 + .macro CFI_STARTPROC32 simple
31031 + CFI_STARTPROC \simple
31032 + CFI_UNDEFINED r8
31033 + CFI_UNDEFINED r9
31034 + CFI_UNDEFINED r10
31035 + CFI_UNDEFINED r11
31036 + CFI_UNDEFINED r12
31037 + CFI_UNDEFINED r13
31038 + CFI_UNDEFINED r14
31039 + CFI_UNDEFINED r15
31040 + .endm
31041 +
31042 +/*
31043 + * 32bit SYSENTER instruction entry.
31044 + *
31045 + * Arguments:
31046 + * %eax System call number.
31047 + * %ebx Arg1
31048 + * %ecx Arg2
31049 + * %edx Arg3
31050 + * %esi Arg4
31051 + * %edi Arg5
31052 + * %ebp user stack
31053 + * 0(%ebp) Arg6
31054 + *
31055 + * Interrupts off.
31056 + *
31057 + * This is purely a fast path. For anything complicated we use the int 0x80
31058 + * path below. Set up a complete hardware stack frame to share code
31059 + * with the int 0x80 path.
31060 + */
31061 +ENTRY(ia32_sysenter_target)
31062 + CFI_STARTPROC32 simple
31063 + CFI_DEF_CFA rsp,0
31064 + CFI_REGISTER rsp,rbp
31065 + __swapgs
31066 + movq %gs:pda_kernelstack, %rsp
31067 + addq $(PDA_STACKOFFSET),%rsp
31068 + XEN_UNBLOCK_EVENTS(%r11)
31069 + __sti
31070 + movl %ebp,%ebp /* zero extension */
31071 + pushq $__USER32_DS
31072 + CFI_ADJUST_CFA_OFFSET 8
31073 + /*CFI_REL_OFFSET ss,0*/
31074 + pushq %rbp
31075 + CFI_ADJUST_CFA_OFFSET 8
31076 + CFI_REL_OFFSET rsp,0
31077 + pushfq
31078 + CFI_ADJUST_CFA_OFFSET 8
31079 + /*CFI_REL_OFFSET rflags,0*/
31080 + movl $VSYSCALL32_SYSEXIT, %r10d
31081 + CFI_REGISTER rip,r10
31082 + pushq $__USER32_CS
31083 + CFI_ADJUST_CFA_OFFSET 8
31084 + /*CFI_REL_OFFSET cs,0*/
31085 + movl %eax, %eax
31086 + pushq %r10
31087 + CFI_ADJUST_CFA_OFFSET 8
31088 + CFI_REL_OFFSET rip,0
31089 + pushq %rax
31090 + CFI_ADJUST_CFA_OFFSET 8
31091 + cld
31092 + SAVE_ARGS 0,0,1
31093 + /* no need to do an access_ok check here because rbp has been
31094 + 32bit zero extended */
31095 +1: movl (%rbp),%r9d
31096 + .section __ex_table,"a"
31097 + .quad 1b,ia32_badarg
31098 + .previous
31099 + GET_THREAD_INFO(%r10)
31100 + orl $TS_COMPAT,threadinfo_status(%r10)
31101 + testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10)
31102 + CFI_REMEMBER_STATE
31103 + jnz sysenter_tracesys
31104 +sysenter_do_call:
31105 + cmpl $(IA32_NR_syscalls),%eax
31106 + jae ia32_badsys
31107 + IA32_ARG_FIXUP 1
31108 + call *ia32_sys_call_table(,%rax,8)
31109 + movq %rax,RAX-ARGOFFSET(%rsp)
31110 + GET_THREAD_INFO(%r10)
31111 + XEN_BLOCK_EVENTS(%r11)
31112 + __cli
31113 + testl $_TIF_ALLWORK_MASK,threadinfo_flags(%r10)
31114 + jnz int_ret_from_sys_call
31115 + andl $~TS_COMPAT,threadinfo_status(%r10)
31116 + /* clear IF, that popfq doesn't enable interrupts early */
31117 + andl $~0x200,EFLAGS-R11(%rsp)
31118 + RESTORE_ARGS 1,24,1,1,1,1
31119 + popfq
31120 + CFI_ADJUST_CFA_OFFSET -8
31121 + /*CFI_RESTORE rflags*/
31122 + popq %rcx /* User %esp */
31123 + CFI_ADJUST_CFA_OFFSET -8
31124 + CFI_REGISTER rsp,rcx
31125 + movl $VSYSCALL32_SYSEXIT,%edx /* User %eip */
31126 + CFI_REGISTER rip,rdx
31127 + __swapgs
31128 + XEN_UNBLOCK_EVENTS(%r11)
31129 + __sti /* sti only takes effect after the next instruction */
31130 + /* sysexit */
31131 + .byte 0xf, 0x35 /* TBD */
31132 +
31133 +sysenter_tracesys:
31134 + CFI_RESTORE_STATE
31135 + SAVE_REST
31136 + CLEAR_RREGS
31137 + movq $-ENOSYS,RAX(%rsp) /* really needed? */
31138 + movq %rsp,%rdi /* &pt_regs -> arg1 */
31139 + call syscall_trace_enter
31140 + LOAD_ARGS ARGOFFSET /* reload args from stack in case ptrace changed it */
31141 + RESTORE_REST
31142 + movl %ebp, %ebp
31143 + /* no need to do an access_ok check here because rbp has been
31144 + 32bit zero extended */
31145 +1: movl (%rbp),%r9d
31146 + .section __ex_table,"a"
31147 + .quad 1b,ia32_badarg
31148 + .previous
31149 + jmp sysenter_do_call
31150 + CFI_ENDPROC
31151 +
31152 +/*
31153 + * 32bit SYSCALL instruction entry.
31154 + *
31155 + * Arguments:
31156 + * %eax System call number.
31157 + * %ebx Arg1
31158 + * %ecx return EIP
31159 + * %edx Arg3
31160 + * %esi Arg4
31161 + * %edi Arg5
31162 + * %ebp Arg2 [note: not saved in the stack frame, should not be touched]
31163 + * %esp user stack
31164 + * 0(%esp) Arg6
31165 + *
31166 + * Interrupts off.
31167 + *
31168 + * This is purely a fast path. For anything complicated we use the int 0x80
31169 + * path below. Set up a complete hardware stack frame to share code
31170 + * with the int 0x80 path.
31171 + */
31172 +ENTRY(ia32_cstar_target)
31173 + CFI_STARTPROC32 simple
31174 + CFI_DEF_CFA rsp,0
31175 + CFI_REGISTER rip,rcx
31176 + /*CFI_REGISTER rflags,r11*/
31177 + __swapgs
31178 + movl %esp,%r8d
31179 + CFI_REGISTER rsp,r8
31180 + movq %gs:pda_kernelstack,%rsp
31181 + XEN_UNBLOCK_EVENTS(%r11)
31182 + __sti
31183 + SAVE_ARGS 8,1,1
31184 + movl %eax,%eax /* zero extension */
31185 + movq %rax,ORIG_RAX-ARGOFFSET(%rsp)
31186 + movq %rcx,RIP-ARGOFFSET(%rsp)
31187 + CFI_REL_OFFSET rip,RIP-ARGOFFSET
31188 + movq %rbp,RCX-ARGOFFSET(%rsp) /* this lies slightly to ptrace */
31189 + movl %ebp,%ecx
31190 + movq $__USER32_CS,CS-ARGOFFSET(%rsp)
31191 + movq $__USER32_DS,SS-ARGOFFSET(%rsp)
31192 + movq %r11,EFLAGS-ARGOFFSET(%rsp)
31193 + /*CFI_REL_OFFSET rflags,EFLAGS-ARGOFFSET*/
31194 + movq %r8,RSP-ARGOFFSET(%rsp)
31195 + CFI_REL_OFFSET rsp,RSP-ARGOFFSET
31196 + /* no need to do an access_ok check here because r8 has been
31197 + 32bit zero extended */
31198 + /* hardware stack frame is complete now */
31199 +1: movl (%r8),%r9d
31200 + .section __ex_table,"a"
31201 + .quad 1b,ia32_badarg
31202 + .previous
31203 + GET_THREAD_INFO(%r10)
31204 + orl $TS_COMPAT,threadinfo_status(%r10)
31205 + testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10)
31206 + CFI_REMEMBER_STATE
31207 + jnz cstar_tracesys
31208 +cstar_do_call:
31209 + cmpl $IA32_NR_syscalls,%eax
31210 + jae ia32_badsys
31211 + IA32_ARG_FIXUP 1
31212 + call *ia32_sys_call_table(,%rax,8)
31213 + movq %rax,RAX-ARGOFFSET(%rsp)
31214 + GET_THREAD_INFO(%r10)
31215 + XEN_BLOCK_EVENTS(%r11)
31216 + __cli
31217 + testl $_TIF_ALLWORK_MASK,threadinfo_flags(%r10)
31218 + jnz int_ret_from_sys_call
31219 + andl $~TS_COMPAT,threadinfo_status(%r10)
31220 + RESTORE_ARGS 1,-ARG_SKIP,1,1,1
31221 + movl RIP-ARGOFFSET(%rsp),%ecx
31222 + CFI_REGISTER rip,rcx
31223 + movl EFLAGS-ARGOFFSET(%rsp),%r11d
31224 + /*CFI_REGISTER rflags,r11*/
31225 + movl RSP-ARGOFFSET(%rsp),%esp
31226 + CFI_RESTORE rsp
31227 + __swapgs
31228 + sysretl /* TBD */
31229 +
31230 +cstar_tracesys:
31231 + CFI_RESTORE_STATE
31232 + SAVE_REST
31233 + CLEAR_RREGS
31234 + movq $-ENOSYS,RAX(%rsp) /* really needed? */
31235 + movq %rsp,%rdi /* &pt_regs -> arg1 */
31236 + call syscall_trace_enter
31237 + LOAD_ARGS ARGOFFSET /* reload args from stack in case ptrace changed it */
31238 + RESTORE_REST
31239 + movl RSP-ARGOFFSET(%rsp), %r8d
31240 + /* no need to do an access_ok check here because r8 has been
31241 + 32bit zero extended */
31242 +1: movl (%r8),%r9d
31243 + .section __ex_table,"a"
31244 + .quad 1b,ia32_badarg
31245 + .previous
31246 + jmp cstar_do_call
31247 +
31248 +ia32_badarg:
31249 + movq $-EFAULT,%rax
31250 + jmp ia32_sysret
31251 + CFI_ENDPROC
31252 +
31253 +/*
31254 + * Emulated IA32 system calls via int 0x80.
31255 + *
31256 + * Arguments:
31257 + * %eax System call number.
31258 + * %ebx Arg1
31259 + * %ecx Arg2
31260 + * %edx Arg3
31261 + * %esi Arg4
31262 + * %edi Arg5
31263 + * %ebp Arg6 [note: not saved in the stack frame, should not be touched]
31264 + *
31265 + * Notes:
31266 + * Uses the same stack frame as the x86-64 version.
31267 + * All registers except %eax must be saved (but ptrace may violate that)
31268 + * Arguments are zero extended. For system calls that want sign extension and
31269 + * take long arguments a wrapper is needed. Most calls can just be called
31270 + * directly.
31271 + * Assumes it is only called from user space and entered with interrupts off.
31272 + */
31273 +
31274 +ENTRY(ia32_syscall)
31275 + CFI_STARTPROC simple
31276 + CFI_DEF_CFA rsp,SS+8-RIP
31277 + /*CFI_REL_OFFSET ss,SS-RIP*/
31278 + CFI_REL_OFFSET rsp,RSP-RIP
31279 + /*CFI_REL_OFFSET rflags,EFLAGS-RIP*/
31280 + /*CFI_REL_OFFSET cs,CS-RIP*/
31281 + CFI_REL_OFFSET rip,RIP-RIP
31282 + __swapgs
31283 + XEN_UNBLOCK_EVENTS(%r11)
31284 + __sti
31285 + movq (%rsp),%rcx
31286 + movq 8(%rsp),%r11
31287 + addq $0x10,%rsp /* skip rcx and r11 */
31288 + movl %eax,%eax
31289 + pushq %rax
31290 + CFI_ADJUST_CFA_OFFSET 8
31291 + cld
31292 +/* 1: jmp 1b */
31293 + /* note the registers are not zero extended to the sf.
31294 + this could be a problem. */
31295 + SAVE_ARGS 0,0,1
31296 + GET_THREAD_INFO(%r10)
31297 + orl $TS_COMPAT,threadinfo_status(%r10)
31298 + testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10)
31299 + jnz ia32_tracesys
31300 +ia32_do_syscall:
31301 + cmpl $(IA32_NR_syscalls),%eax
31302 + jae ia32_badsys
31303 + IA32_ARG_FIXUP
31304 + call *ia32_sys_call_table(,%rax,8) # xxx: rip relative
31305 +ia32_sysret:
31306 + movq %rax,RAX-ARGOFFSET(%rsp)
31307 + jmp int_ret_from_sys_call
31308 +
31309 +ia32_tracesys:
31310 + SAVE_REST
31311 + movq $-ENOSYS,RAX(%rsp) /* really needed? */
31312 + movq %rsp,%rdi /* &pt_regs -> arg1 */
31313 + call syscall_trace_enter
31314 + LOAD_ARGS ARGOFFSET /* reload args from stack in case ptrace changed it */
31315 + RESTORE_REST
31316 + jmp ia32_do_syscall
31317 +
31318 +ia32_badsys:
31319 + movq $0,ORIG_RAX-ARGOFFSET(%rsp)
31320 + movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
31321 + jmp int_ret_from_sys_call
31322 +
31323 +ni_syscall:
31324 + movq %rax,%rdi
31325 + jmp sys32_ni_syscall
31326 +
31327 +quiet_ni_syscall:
31328 + movq $-ENOSYS,%rax
31329 + ret
31330 + CFI_ENDPROC
31331 +
31332 + .macro PTREGSCALL label, func, arg
31333 + .globl \label
31334 +\label:
31335 + leaq \func(%rip),%rax
31336 + leaq -ARGOFFSET+8(%rsp),\arg /* 8 for return address */
31337 + jmp ia32_ptregs_common
31338 + .endm
31339 +
31340 + CFI_STARTPROC32
31341 +
31342 + PTREGSCALL stub32_rt_sigreturn, sys32_rt_sigreturn, %rdi
31343 + PTREGSCALL stub32_sigreturn, sys32_sigreturn, %rdi
31344 + PTREGSCALL stub32_sigaltstack, sys32_sigaltstack, %rdx
31345 + PTREGSCALL stub32_sigsuspend, sys32_sigsuspend, %rcx
31346 + PTREGSCALL stub32_execve, sys32_execve, %rcx
31347 + PTREGSCALL stub32_fork, sys_fork, %rdi
31348 + PTREGSCALL stub32_clone, sys32_clone, %rdx
31349 + PTREGSCALL stub32_vfork, sys_vfork, %rdi
31350 + PTREGSCALL stub32_iopl, sys_iopl, %rsi
31351 + PTREGSCALL stub32_rt_sigsuspend, sys_rt_sigsuspend, %rdx
31352 +
31353 +ENTRY(ia32_ptregs_common)
31354 + popq %r11
31355 + CFI_ENDPROC
31356 + CFI_STARTPROC32 simple
31357 + CFI_DEF_CFA rsp,SS+8-ARGOFFSET
31358 + CFI_REL_OFFSET rax,RAX-ARGOFFSET
31359 + CFI_REL_OFFSET rcx,RCX-ARGOFFSET
31360 + CFI_REL_OFFSET rdx,RDX-ARGOFFSET
31361 + CFI_REL_OFFSET rsi,RSI-ARGOFFSET
31362 + CFI_REL_OFFSET rdi,RDI-ARGOFFSET
31363 + CFI_REL_OFFSET rip,RIP-ARGOFFSET
31364 +/* CFI_REL_OFFSET cs,CS-ARGOFFSET*/
31365 +/* CFI_REL_OFFSET rflags,EFLAGS-ARGOFFSET*/
31366 + CFI_REL_OFFSET rsp,RSP-ARGOFFSET
31367 +/* CFI_REL_OFFSET ss,SS-ARGOFFSET*/
31368 + SAVE_REST
31369 + call *%rax
31370 + RESTORE_REST
31371 + jmp ia32_sysret /* misbalances the return cache */
31372 + CFI_ENDPROC
31373 +
31374 + .section .rodata,"a"
31375 + .align 8
31376 + .globl ia32_sys_call_table
31377 +ia32_sys_call_table:
31378 + .quad sys_restart_syscall
31379 + .quad sys_exit
31380 + .quad stub32_fork
31381 + .quad sys_read
31382 + .quad sys_write
31383 + .quad compat_sys_open /* 5 */
31384 + .quad sys_close
31385 + .quad sys32_waitpid
31386 + .quad sys_creat
31387 + .quad sys_link
31388 + .quad sys_unlink /* 10 */
31389 + .quad stub32_execve
31390 + .quad sys_chdir
31391 + .quad compat_sys_time
31392 + .quad sys_mknod
31393 + .quad sys_chmod /* 15 */
31394 + .quad sys_lchown16
31395 + .quad quiet_ni_syscall /* old break syscall holder */
31396 + .quad sys_stat
31397 + .quad sys32_lseek
31398 + .quad sys_getpid /* 20 */
31399 + .quad compat_sys_mount /* mount */
31400 + .quad sys_oldumount /* old_umount */
31401 + .quad sys_setuid16
31402 + .quad sys_getuid16
31403 + .quad compat_sys_stime /* stime */ /* 25 */
31404 + .quad sys32_ptrace /* ptrace */
31405 + .quad sys_alarm
31406 + .quad sys_fstat /* (old)fstat */
31407 + .quad sys_pause
31408 + .quad compat_sys_utime /* 30 */
31409 + .quad quiet_ni_syscall /* old stty syscall holder */
31410 + .quad quiet_ni_syscall /* old gtty syscall holder */
31411 + .quad sys_access
31412 + .quad sys_nice
31413 + .quad quiet_ni_syscall /* 35 */ /* old ftime syscall holder */
31414 + .quad sys_sync
31415 + .quad sys32_kill
31416 + .quad sys_rename
31417 + .quad sys_mkdir
31418 + .quad sys_rmdir /* 40 */
31419 + .quad sys_dup
31420 + .quad sys32_pipe
31421 + .quad compat_sys_times
31422 + .quad quiet_ni_syscall /* old prof syscall holder */
31423 + .quad sys_brk /* 45 */
31424 + .quad sys_setgid16
31425 + .quad sys_getgid16
31426 + .quad sys_signal
31427 + .quad sys_geteuid16
31428 + .quad sys_getegid16 /* 50 */
31429 + .quad sys_acct
31430 + .quad sys_umount /* new_umount */
31431 + .quad quiet_ni_syscall /* old lock syscall holder */
31432 + .quad compat_sys_ioctl
31433 + .quad compat_sys_fcntl64 /* 55 */
31434 + .quad quiet_ni_syscall /* old mpx syscall holder */
31435 + .quad sys_setpgid
31436 + .quad quiet_ni_syscall /* old ulimit syscall holder */
31437 + .quad sys32_olduname
31438 + .quad sys_umask /* 60 */
31439 + .quad sys_chroot
31440 + .quad sys32_ustat
31441 + .quad sys_dup2
31442 + .quad sys_getppid
31443 + .quad sys_getpgrp /* 65 */
31444 + .quad sys_setsid
31445 + .quad sys32_sigaction
31446 + .quad sys_sgetmask
31447 + .quad sys_ssetmask
31448 + .quad sys_setreuid16 /* 70 */
31449 + .quad sys_setregid16
31450 + .quad stub32_sigsuspend
31451 + .quad compat_sys_sigpending
31452 + .quad sys_sethostname
31453 + .quad compat_sys_setrlimit /* 75 */
31454 + .quad compat_sys_old_getrlimit /* old_getrlimit */
31455 + .quad compat_sys_getrusage
31456 + .quad sys32_gettimeofday
31457 + .quad sys32_settimeofday
31458 + .quad sys_getgroups16 /* 80 */
31459 + .quad sys_setgroups16
31460 + .quad sys32_old_select
31461 + .quad sys_symlink
31462 + .quad sys_lstat
31463 + .quad sys_readlink /* 85 */
31464 +#ifdef CONFIG_IA32_AOUT
31465 + .quad sys_uselib
31466 +#else
31467 + .quad quiet_ni_syscall
31468 +#endif
31469 + .quad sys_swapon
31470 + .quad sys_reboot
31471 + .quad compat_sys_old_readdir
31472 + .quad sys32_mmap /* 90 */
31473 + .quad sys_munmap
31474 + .quad sys_truncate
31475 + .quad sys_ftruncate
31476 + .quad sys_fchmod
31477 + .quad sys_fchown16 /* 95 */
31478 + .quad sys_getpriority
31479 + .quad sys_setpriority
31480 + .quad quiet_ni_syscall /* old profil syscall holder */
31481 + .quad compat_sys_statfs
31482 + .quad compat_sys_fstatfs /* 100 */
31483 + .quad sys_ioperm
31484 + .quad compat_sys_socketcall
31485 + .quad sys_syslog
31486 + .quad compat_sys_setitimer
31487 + .quad compat_sys_getitimer /* 105 */
31488 + .quad compat_sys_newstat
31489 + .quad compat_sys_newlstat
31490 + .quad compat_sys_newfstat
31491 + .quad sys32_uname
31492 + .quad stub32_iopl /* 110 */
31493 + .quad sys_vhangup
31494 + .quad quiet_ni_syscall /* old "idle" system call */
31495 + .quad sys32_vm86_warning /* vm86old */
31496 + .quad compat_sys_wait4
31497 + .quad sys_swapoff /* 115 */
31498 + .quad sys32_sysinfo
31499 + .quad sys32_ipc
31500 + .quad sys_fsync
31501 + .quad stub32_sigreturn
31502 + .quad stub32_clone /* 120 */
31503 + .quad sys_setdomainname
31504 + .quad sys_uname
31505 + .quad sys_modify_ldt
31506 + .quad sys32_adjtimex
31507 + .quad sys32_mprotect /* 125 */
31508 + .quad compat_sys_sigprocmask
31509 + .quad quiet_ni_syscall /* create_module */
31510 + .quad sys_init_module
31511 + .quad sys_delete_module
31512 + .quad quiet_ni_syscall /* 130 get_kernel_syms */
31513 + .quad sys_quotactl
31514 + .quad sys_getpgid
31515 + .quad sys_fchdir
31516 + .quad quiet_ni_syscall /* bdflush */
31517 + .quad sys_sysfs /* 135 */
31518 + .quad sys_personality
31519 + .quad quiet_ni_syscall /* for afs_syscall */
31520 + .quad sys_setfsuid16
31521 + .quad sys_setfsgid16
31522 + .quad sys_llseek /* 140 */
31523 + .quad compat_sys_getdents
31524 + .quad compat_sys_select
31525 + .quad sys_flock
31526 + .quad sys_msync
31527 + .quad compat_sys_readv /* 145 */
31528 + .quad compat_sys_writev
31529 + .quad sys_getsid
31530 + .quad sys_fdatasync
31531 + .quad sys32_sysctl /* sysctl */
31532 + .quad sys_mlock /* 150 */
31533 + .quad sys_munlock
31534 + .quad sys_mlockall
31535 + .quad sys_munlockall
31536 + .quad sys_sched_setparam
31537 + .quad sys_sched_getparam /* 155 */
31538 + .quad sys_sched_setscheduler
31539 + .quad sys_sched_getscheduler
31540 + .quad sys_sched_yield
31541 + .quad sys_sched_get_priority_max
31542 + .quad sys_sched_get_priority_min /* 160 */
31543 + .quad sys_sched_rr_get_interval
31544 + .quad compat_sys_nanosleep
31545 + .quad sys_mremap
31546 + .quad sys_setresuid16
31547 + .quad sys_getresuid16 /* 165 */
31548 + .quad sys32_vm86_warning /* vm86 */
31549 + .quad quiet_ni_syscall /* query_module */
31550 + .quad sys_poll
31551 + .quad compat_sys_nfsservctl
31552 + .quad sys_setresgid16 /* 170 */
31553 + .quad sys_getresgid16
31554 + .quad sys_prctl
31555 + .quad stub32_rt_sigreturn
31556 + .quad sys32_rt_sigaction
31557 + .quad sys32_rt_sigprocmask /* 175 */
31558 + .quad sys32_rt_sigpending
31559 + .quad compat_sys_rt_sigtimedwait
31560 + .quad sys32_rt_sigqueueinfo
31561 + .quad stub32_rt_sigsuspend
31562 + .quad sys32_pread /* 180 */
31563 + .quad sys32_pwrite
31564 + .quad sys_chown16
31565 + .quad sys_getcwd
31566 + .quad sys_capget
31567 + .quad sys_capset
31568 + .quad stub32_sigaltstack
31569 + .quad sys32_sendfile
31570 + .quad quiet_ni_syscall /* streams1 */
31571 + .quad quiet_ni_syscall /* streams2 */
31572 + .quad stub32_vfork /* 190 */
31573 + .quad compat_sys_getrlimit
31574 + .quad sys32_mmap2
31575 + .quad sys32_truncate64
31576 + .quad sys32_ftruncate64
31577 + .quad sys32_stat64 /* 195 */
31578 + .quad sys32_lstat64
31579 + .quad sys32_fstat64
31580 + .quad sys_lchown
31581 + .quad sys_getuid
31582 + .quad sys_getgid /* 200 */
31583 + .quad sys_geteuid
31584 + .quad sys_getegid
31585 + .quad sys_setreuid
31586 + .quad sys_setregid
31587 + .quad sys_getgroups /* 205 */
31588 + .quad sys_setgroups
31589 + .quad sys_fchown
31590 + .quad sys_setresuid
31591 + .quad sys_getresuid
31592 + .quad sys_setresgid /* 210 */
31593 + .quad sys_getresgid
31594 + .quad sys_chown
31595 + .quad sys_setuid
31596 + .quad sys_setgid
31597 + .quad sys_setfsuid /* 215 */
31598 + .quad sys_setfsgid
31599 + .quad sys_pivot_root
31600 + .quad sys_mincore
31601 + .quad sys_madvise
31602 + .quad compat_sys_getdents64 /* 220 getdents64 */
31603 + .quad compat_sys_fcntl64
31604 + .quad quiet_ni_syscall /* tux */
31605 + .quad quiet_ni_syscall /* security */
31606 + .quad sys_gettid
31607 + .quad sys_readahead /* 225 */
31608 + .quad sys_setxattr
31609 + .quad sys_lsetxattr
31610 + .quad sys_fsetxattr
31611 + .quad sys_getxattr
31612 + .quad sys_lgetxattr /* 230 */
31613 + .quad sys_fgetxattr
31614 + .quad sys_listxattr
31615 + .quad sys_llistxattr
31616 + .quad sys_flistxattr
31617 + .quad sys_removexattr /* 235 */
31618 + .quad sys_lremovexattr
31619 + .quad sys_fremovexattr
31620 + .quad sys_tkill
31621 + .quad sys_sendfile64
31622 + .quad compat_sys_futex /* 240 */
31623 + .quad compat_sys_sched_setaffinity
31624 + .quad compat_sys_sched_getaffinity
31625 + .quad sys32_set_thread_area
31626 + .quad sys32_get_thread_area
31627 + .quad compat_sys_io_setup /* 245 */
31628 + .quad sys_io_destroy
31629 + .quad compat_sys_io_getevents
31630 + .quad compat_sys_io_submit
31631 + .quad sys_io_cancel
31632 + .quad sys_fadvise64 /* 250 */
31633 + .quad quiet_ni_syscall /* free_huge_pages */
31634 + .quad sys_exit_group
31635 + .quad sys32_lookup_dcookie
31636 + .quad sys_epoll_create
31637 + .quad sys_epoll_ctl /* 255 */
31638 + .quad sys_epoll_wait
31639 + .quad sys_remap_file_pages
31640 + .quad sys_set_tid_address
31641 + .quad compat_sys_timer_create
31642 + .quad compat_sys_timer_settime /* 260 */
31643 + .quad compat_sys_timer_gettime
31644 + .quad sys_timer_getoverrun
31645 + .quad sys_timer_delete
31646 + .quad compat_sys_clock_settime
31647 + .quad compat_sys_clock_gettime /* 265 */
31648 + .quad compat_sys_clock_getres
31649 + .quad compat_sys_clock_nanosleep
31650 + .quad compat_sys_statfs64
31651 + .quad compat_sys_fstatfs64
31652 + .quad sys_tgkill /* 270 */
31653 + .quad compat_sys_utimes
31654 + .quad sys32_fadvise64_64
31655 + .quad quiet_ni_syscall /* sys_vserver */
31656 + .quad sys_mbind
31657 + .quad compat_sys_get_mempolicy /* 275 */
31658 + .quad sys_set_mempolicy
31659 + .quad compat_sys_mq_open
31660 + .quad sys_mq_unlink
31661 + .quad compat_sys_mq_timedsend
31662 + .quad compat_sys_mq_timedreceive /* 280 */
31663 + .quad compat_sys_mq_notify
31664 + .quad compat_sys_mq_getsetattr
31665 + .quad compat_sys_kexec_load /* reserved for kexec */
31666 + .quad compat_sys_waitid
31667 + .quad quiet_ni_syscall /* 285: sys_altroot */
31668 + .quad sys_add_key
31669 + .quad sys_request_key
31670 + .quad sys_keyctl
31671 + .quad sys_ioprio_set
31672 + .quad sys_ioprio_get /* 290 */
31673 + .quad sys_inotify_init
31674 + .quad sys_inotify_add_watch
31675 + .quad sys_inotify_rm_watch
31676 + .quad sys_migrate_pages
31677 + .quad compat_sys_openat /* 295 */
31678 + .quad sys_mkdirat
31679 + .quad sys_mknodat
31680 + .quad sys_fchownat
31681 + .quad compat_sys_futimesat
31682 + .quad sys32_fstatat /* 300 */
31683 + .quad sys_unlinkat
31684 + .quad sys_renameat
31685 + .quad sys_linkat
31686 + .quad sys_symlinkat
31687 + .quad sys_readlinkat /* 305 */
31688 + .quad sys_fchmodat
31689 + .quad sys_faccessat
31690 + .quad sys_ni_syscall /* pselect6 for now */
31691 + .quad sys_ni_syscall /* ppoll for now */
31692 + .quad sys_unshare /* 310 */
31693 +ia32_syscall_end:
31694 + .rept IA32_NR_syscalls-(ia32_syscall_end-ia32_sys_call_table)/8
31695 + .quad ni_syscall
31696 + .endr
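One note on the table that ends here: the .rept/.endr directives pad every slot between the last explicit entry and IA32_NR_syscalls with the "not implemented" stub, so any 32-bit syscall number without a real handler ends up returning -ENOSYS. A rough C analogue of how such a padded table is consumed, as a hedged sketch only (the real dispatcher lives in the assembly above; the function and types below are made up for illustration and assume the kernel definitions of ENOSYS and IA32_NR_syscalls):

/* Illustrative sketch only -- not part of the patch. */
typedef long (*ia32_syscall_fn)(long, long, long, long, long, long);

static long dispatch_ia32_syscall(ia32_syscall_fn *table, unsigned int nr,
				  const long arg[6])
{
	/* Numbers beyond the table are rejected outright; numbers inside
	 * the table but past the last real entry hit the padding stub,
	 * which also returns -ENOSYS. */
	if (nr >= IA32_NR_syscalls)
		return -ENOSYS;
	return table[nr](arg[0], arg[1], arg[2], arg[3], arg[4], arg[5]);
}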
31697 diff -Nur linux-2.6.16.33-noxen/arch/x86_64/ia32/syscall32-xen.c linux-2.6.16.33/arch/x86_64/ia32/syscall32-xen.c
31698 --- linux-2.6.16.33-noxen/arch/x86_64/ia32/syscall32-xen.c 1970-01-01 00:00:00.000000000 +0000
31699 +++ linux-2.6.16.33/arch/x86_64/ia32/syscall32-xen.c 2007-01-08 15:00:45.000000000 +0000
31700 @@ -0,0 +1,128 @@
31701 +/* Copyright 2002,2003 Andi Kleen, SuSE Labs */
31702 +
31703 +/* vsyscall handling for 32bit processes. Map a stub page into it
31704 + on demand because 32bit cannot reach the kernel's fixmaps */
31705 +
31706 +#include <linux/mm.h>
31707 +#include <linux/string.h>
31708 +#include <linux/kernel.h>
31709 +#include <linux/gfp.h>
31710 +#include <linux/init.h>
31711 +#include <linux/stringify.h>
31712 +#include <linux/security.h>
31713 +#include <asm/proto.h>
31714 +#include <asm/tlbflush.h>
31715 +#include <asm/ia32_unistd.h>
31716 +
31717 +#ifdef USE_INT80
31718 +extern unsigned char syscall32_int80[], syscall32_int80_end[];
31719 +#endif
31720 +extern unsigned char syscall32_syscall[], syscall32_syscall_end[];
31721 +extern unsigned char syscall32_sysenter[], syscall32_sysenter_end[];
31722 +extern int sysctl_vsyscall32;
31723 +
31724 +char *syscall32_page;
31725 +#ifndef USE_INT80
31726 +static int use_sysenter = -1;
31727 +#endif
31728 +
31729 +static struct page *
31730 +syscall32_nopage(struct vm_area_struct *vma, unsigned long adr, int *type)
31731 +{
31732 + struct page *p = virt_to_page(adr - vma->vm_start + syscall32_page);
31733 + get_page(p);
31734 + return p;
31735 +}
31736 +
31737 +/* Prevent VMA merging */
31738 +static void syscall32_vma_close(struct vm_area_struct *vma)
31739 +{
31740 +}
31741 +
31742 +static struct vm_operations_struct syscall32_vm_ops = {
31743 + .close = syscall32_vma_close,
31744 + .nopage = syscall32_nopage,
31745 +};
31746 +
31747 +struct linux_binprm;
31748 +
31749 +/* Setup a VMA at program startup for the vsyscall page */
31750 +int syscall32_setup_pages(struct linux_binprm *bprm, int exstack)
31751 +{
31752 + int npages = (VSYSCALL32_END - VSYSCALL32_BASE) >> PAGE_SHIFT;
31753 + struct vm_area_struct *vma;
31754 + struct mm_struct *mm = current->mm;
31755 + int ret;
31756 +
31757 + vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
31758 + if (!vma)
31759 + return -ENOMEM;
31760 +
31761 + memset(vma, 0, sizeof(struct vm_area_struct));
31762 + /* Could randomize here */
31763 + vma->vm_start = VSYSCALL32_BASE;
31764 + vma->vm_end = VSYSCALL32_END;
31765 + /* MAYWRITE to allow gdb to COW and set breakpoints */
31766 + vma->vm_flags = VM_READ|VM_EXEC|VM_MAYREAD|VM_MAYEXEC|VM_MAYWRITE;
31767 + vma->vm_flags |= mm->def_flags;
31768 + vma->vm_page_prot = protection_map[vma->vm_flags & 7];
31769 + vma->vm_ops = &syscall32_vm_ops;
31770 + vma->vm_mm = mm;
31771 +
31772 + down_write(&mm->mmap_sem);
31773 + if ((ret = insert_vm_struct(mm, vma))) {
31774 + up_write(&mm->mmap_sem);
31775 + kmem_cache_free(vm_area_cachep, vma);
31776 + return ret;
31777 + }
31778 + mm->total_vm += npages;
31779 + up_write(&mm->mmap_sem);
31780 + return 0;
31781 +}
31782 +
31783 +static int __init init_syscall32(void)
31784 +{
31785 + syscall32_page = (void *)get_zeroed_page(GFP_KERNEL);
31786 + if (!syscall32_page)
31787 + panic("Cannot allocate syscall32 page");
31788 +
31789 +#ifdef USE_INT80
31790 + /*
31791 + * At this point we use int 0x80.
31792 + */
31793 + memcpy(syscall32_page, syscall32_int80,
31794 + syscall32_int80_end - syscall32_int80);
31795 +#else
31796 + if (use_sysenter > 0) {
31797 + memcpy(syscall32_page, syscall32_sysenter,
31798 + syscall32_sysenter_end - syscall32_sysenter);
31799 + } else {
31800 + memcpy(syscall32_page, syscall32_syscall,
31801 + syscall32_syscall_end - syscall32_syscall);
31802 + }
31803 +#endif
31804 + return 0;
31805 +}
31806 +
31807 +/*
31808 + * This must be done early in case we have an initrd containing 32-bit
31809 + * binaries (e.g., hotplug). This could be pushed upstream to arch/x86_64.
31810 + */
31811 +core_initcall(init_syscall32);
31812 +
31813 +/* May not be __init: called during resume */
31814 +void syscall32_cpu_init(void)
31815 +{
31816 +#ifndef USE_INT80
31817 + if (use_sysenter < 0)
31818 + use_sysenter = (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL);
31819 +
31820 + /* Load these always in case some future AMD CPU supports
31821 + SYSENTER from compat mode too. */
31822 + checking_wrmsrl(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
31823 + checking_wrmsrl(MSR_IA32_SYSENTER_ESP, 0ULL);
31824 + checking_wrmsrl(MSR_IA32_SYSENTER_EIP, (u64)ia32_sysenter_target);
31825 +
31826 + wrmsrl(MSR_CSTAR, ia32_cstar_target);
31827 +#endif
31828 +}
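The file above wires a single stub page into every 32-bit process: syscall32_setup_pages() inserts a VMA covering VSYSCALL32_BASE..VSYSCALL32_END, and the nopage handler hands back the shared syscall32_page on first touch. A quick way to observe the result from user space is to dump the process map from a 32-bit binary and look for the small anonymous read+exec mapping; the snippet below is an illustrative check only, not part of the patch, and assumes a kernel with this patch applied:

/* Illustrative user-space check -- not part of the patch. */
#include <stdio.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/self/maps", "r");

	if (!f)
		return 1;
	/* The stub set up by syscall32_setup_pages() appears as a small
	 * anonymous r-xp mapping near the top of the 32-bit address space. */
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}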
31829 diff -Nur linux-2.6.16.33-noxen/arch/x86_64/ia32/syscall32_syscall-xen.S linux-2.6.16.33/arch/x86_64/ia32/syscall32_syscall-xen.S
31830 --- linux-2.6.16.33-noxen/arch/x86_64/ia32/syscall32_syscall-xen.S 1970-01-01 00:00:00.000000000 +0000
31831 +++ linux-2.6.16.33/arch/x86_64/ia32/syscall32_syscall-xen.S 2007-01-08 15:00:45.000000000 +0000
31832 @@ -0,0 +1,28 @@
31833 +/* 32bit VDSOs mapped into user space. */
31834 +
31835 + .section ".init.data","aw"
31836 +
31837 +#ifdef USE_INT80
31838 +
31839 + .globl syscall32_int80
31840 + .globl syscall32_int80_end
31841 +
31842 +syscall32_int80:
31843 + .incbin "arch/x86_64/ia32/vsyscall-int80.so"
31844 +syscall32_int80_end:
31845 +
31846 +#endif
31847 +
31848 + .globl syscall32_syscall
31849 + .globl syscall32_syscall_end
31850 +
31851 +syscall32_syscall:
31852 + .incbin "arch/x86_64/ia32/vsyscall-syscall.so"
31853 +syscall32_syscall_end:
31854 +
31855 + .globl syscall32_sysenter
31856 + .globl syscall32_sysenter_end
31857 +
31858 +syscall32_sysenter:
31859 + .incbin "arch/x86_64/ia32/vsyscall-sysenter.so"
31860 +syscall32_sysenter_end:
31861 diff -Nur linux-2.6.16.33-noxen/arch/x86_64/ia32/vsyscall-int80.S linux-2.6.16.33/arch/x86_64/ia32/vsyscall-int80.S
31862 --- linux-2.6.16.33-noxen/arch/x86_64/ia32/vsyscall-int80.S 1970-01-01 00:00:00.000000000 +0000
31863 +++ linux-2.6.16.33/arch/x86_64/ia32/vsyscall-int80.S 2007-01-08 15:00:45.000000000 +0000
31864 @@ -0,0 +1,58 @@
31865 +/*
31866 + * Code for the vsyscall page. This version uses the old int $0x80 method.
31867 + *
31868 + * NOTE:
31869 + * 1) __kernel_vsyscall _must_ be first in this page.
31870 + * 2) there are alignment constraints on this stub, see vsyscall-sigreturn.S
31871 + * for details.
31872 + */
31873 +#include <asm/ia32_unistd.h>
31874 +#include <asm/asm-offsets.h>
31875 +
31876 + .code32
31877 + .text
31878 + .section .text.vsyscall,"ax"
31879 + .globl __kernel_vsyscall
31880 + .type __kernel_vsyscall,@function
31881 +__kernel_vsyscall:
31882 +.LSTART_vsyscall:
31883 + int $0x80
31884 + ret
31885 +.LEND_vsyscall:
31886 + .size __kernel_vsyscall,.-.LSTART_vsyscall
31887 + .previous
31888 +
31889 + .section .eh_frame,"a",@progbits
31890 +.LSTARTFRAME:
31891 + .long .LENDCIE-.LSTARTCIE
31892 +.LSTARTCIE:
31893 + .long 0 /* CIE ID */
31894 + .byte 1 /* Version number */
31895 + .string "zR" /* NUL-terminated augmentation string */
31896 + .uleb128 1 /* Code alignment factor */
31897 + .sleb128 -4 /* Data alignment factor */
31898 + .byte 8 /* Return address register column */
31899 + .uleb128 1 /* Augmentation value length */
31900 + .byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */
31901 + .byte 0x0c /* DW_CFA_def_cfa */
31902 + .uleb128 4
31903 + .uleb128 4
31904 + .byte 0x88 /* DW_CFA_offset, column 0x8 */
31905 + .uleb128 1
31906 + .align 4
31907 +.LENDCIE:
31908 +
31909 + .long .LENDFDE1-.LSTARTFDE1 /* Length FDE */
31910 +.LSTARTFDE1:
31911 + .long .LSTARTFDE1-.LSTARTFRAME /* CIE pointer */
31912 + .long .LSTART_vsyscall-. /* PC-relative start address */
31913 + .long .LEND_vsyscall-.LSTART_vsyscall
31914 + .uleb128 0 /* Augmentation length */
31915 + .align 4
31916 +.LENDFDE1:
31917 +
31918 +/*
31919 + * Get the common code for the sigreturn entry points.
31920 + */
31921 +#define SYSCALL_ENTER_KERNEL int $0x80
31922 +#include "vsyscall-sigreturn.S"
31923 diff -Nur linux-2.6.16.33-noxen/arch/x86_64/ia32/vsyscall-sigreturn.S linux-2.6.16.33/arch/x86_64/ia32/vsyscall-sigreturn.S
31924 --- linux-2.6.16.33-noxen/arch/x86_64/ia32/vsyscall-sigreturn.S 2006-11-22 18:06:31.000000000 +0000
31925 +++ linux-2.6.16.33/arch/x86_64/ia32/vsyscall-sigreturn.S 2007-01-08 15:00:45.000000000 +0000
31926 @@ -120,5 +120,5 @@
31927 .align 4
31928 .LENDFDE3:
31929
31930 -#include "../../i386/kernel/vsyscall-note.S"
31931 +#include <vsyscall-note.S>
31932
31933 diff -Nur linux-2.6.16.33-noxen/arch/x86_64/kernel/Makefile linux-2.6.16.33/arch/x86_64/kernel/Makefile
31934 --- linux-2.6.16.33-noxen/arch/x86_64/kernel/Makefile 2006-11-22 18:06:31.000000000 +0000
31935 +++ linux-2.6.16.33/arch/x86_64/kernel/Makefile 2007-01-08 15:00:45.000000000 +0000
31936 @@ -20,11 +20,13 @@
31937 obj-$(CONFIG_X86_CPUID) += cpuid.o
31938 obj-$(CONFIG_SMP) += smp.o smpboot.o trampoline.o
31939 obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o
31940 +obj-$(CONFIG_X86_XEN_GENAPIC) += genapic.o genapic_xen.o
31941 obj-$(CONFIG_X86_IO_APIC) += io_apic.o mpparse.o \
31942 genapic.o genapic_cluster.o genapic_flat.o
31943 obj-$(CONFIG_KEXEC) += machine_kexec.o relocate_kernel.o crash.o
31944 obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
31945 -obj-$(CONFIG_PM) += suspend.o
31946 +obj-$(CONFIG_SOFTWARE_SUSPEND) += suspend.o
31947 +obj-$(CONFIG_ACPI_SLEEP) += suspend.o
31948 obj-$(CONFIG_SOFTWARE_SUSPEND) += suspend_asm.o
31949 obj-$(CONFIG_CPU_FREQ) += cpufreq/
31950 obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
31951 @@ -51,3 +53,17 @@
31952 msr-$(subst m,y,$(CONFIG_X86_MSR)) += ../../i386/kernel/msr.o
31953 dmi_scan-y += ../../i386/kernel/dmi_scan.o
31954
31955 +ifdef CONFIG_XEN
31956 +time-y += ../../i386/kernel/time-xen.o
31957 +pci-dma-y += ../../i386/kernel/pci-dma-xen.o
31958 +microcode-$(subst m,y,$(CONFIG_MICROCODE)) := ../../i386/kernel/microcode-xen.o
31959 +quirks-y := ../../i386/kernel/quirks-xen.o
31960 +
31961 +n-obj-xen := i8259.o reboot.o i8237.o smpboot.o trampoline.o
31962 +
31963 +include $(srctree)/scripts/Makefile.xen
31964 +
31965 +obj-y := $(call filterxen, $(obj-y), $(n-obj-xen))
31966 +obj-y := $(call cherrypickxen, $(obj-y))
31967 +extra-y := $(call cherrypickxen, $(extra-y))
31968 +endif
31969 diff -Nur linux-2.6.16.33-noxen/arch/x86_64/kernel/acpi/Makefile linux-2.6.16.33/arch/x86_64/kernel/acpi/Makefile
31970 --- linux-2.6.16.33-noxen/arch/x86_64/kernel/acpi/Makefile 2006-11-22 18:06:31.000000000 +0000
31971 +++ linux-2.6.16.33/arch/x86_64/kernel/acpi/Makefile 2007-01-08 15:00:45.000000000 +0000
31972 @@ -7,3 +7,4 @@
31973 processor-y := ../../../i386/kernel/acpi/processor.o ../../../i386/kernel/acpi/cstate.o
31974 endif
31975
31976 +boot-$(CONFIG_XEN) := ../../../i386/kernel/acpi/boot-xen.o
31977 diff -Nur linux-2.6.16.33-noxen/arch/x86_64/kernel/apic-xen.c linux-2.6.16.33/arch/x86_64/kernel/apic-xen.c
31978 --- linux-2.6.16.33-noxen/arch/x86_64/kernel/apic-xen.c 1970-01-01 00:00:00.000000000 +0000
31979 +++ linux-2.6.16.33/arch/x86_64/kernel/apic-xen.c 2007-01-08 15:00:45.000000000 +0000
31980 @@ -0,0 +1,198 @@
31981 +/*
31982 + * Local APIC handling, local APIC timers
31983 + *
31984 + * (c) 1999, 2000 Ingo Molnar <mingo@redhat.com>
31985 + *
31986 + * Fixes
31987 + * Maciej W. Rozycki : Bits for genuine 82489DX APICs;
31988 + * thanks to Eric Gilmore
31989 + * and Rolf G. Tews
31990 + * for testing these extensively.
31991 + * Maciej W. Rozycki : Various updates and fixes.
31992 + * Mikael Pettersson : Power Management for UP-APIC.
31993 + * Pavel Machek and
31994 + * Mikael Pettersson : PM converted to driver model.
31995 + */
31996 +
31997 +#include <linux/config.h>
31998 +#include <linux/init.h>
31999 +
32000 +#include <linux/mm.h>
32001 +#include <linux/delay.h>
32002 +#include <linux/bootmem.h>
32003 +#include <linux/smp_lock.h>
32004 +#include <linux/interrupt.h>
32005 +#include <linux/mc146818rtc.h>
32006 +#include <linux/kernel_stat.h>
32007 +#include <linux/sysdev.h>
32008 +#include <linux/module.h>
32009 +
32010 +#include <asm/atomic.h>
32011 +#include <asm/smp.h>
32012 +#include <asm/mtrr.h>
32013 +#include <asm/mpspec.h>
32014 +#include <asm/desc.h>
32015 +#include <asm/arch_hooks.h>
32016 +#include <asm/hpet.h>
32017 +#include <asm/idle.h>
32018 +
32019 +int apic_verbosity;
32020 +
32021 +/*
32022 + * 'what should we do if we get a hw irq event on an illegal vector'.
32023 + * each architecture has to answer this themselves.
32024 + */
32025 +void ack_bad_irq(unsigned int irq)
32026 +{
32027 + printk("unexpected IRQ trap at vector %02x\n", irq);
32028 + /*
32029 + * Currently unexpected vectors happen only on SMP and APIC.
32030 + * We _must_ ack these because every local APIC has only N
32031 + * irq slots per priority level, and a 'hanging, unacked' IRQ
32032 + * holds up an irq slot - in excessive cases (when multiple
32033 + * unexpected vectors occur) that might lock up the APIC
32034 + * completely.
32035 + * But don't ack when the APIC is disabled. -AK
32036 + */
32037 + if (!disable_apic)
32038 + ack_APIC_irq();
32039 +}
32040 +
32041 +int setup_profiling_timer(unsigned int multiplier)
32042 +{
32043 + return -EINVAL;
32044 +}
32045 +
32046 +void smp_local_timer_interrupt(struct pt_regs *regs)
32047 +{
32048 + profile_tick(CPU_PROFILING, regs);
32049 +#ifndef CONFIG_XEN
32050 +#ifdef CONFIG_SMP
32051 + update_process_times(user_mode(regs));
32052 +#endif
32053 +#endif
32054 + /*
32055 + * We take the 'long' return path, and there every subsystem
32056 + * grabs the appropriate locks (kernel lock/ irq lock).
32057 + *
32058 + * we might want to decouple profiling from the 'long path',
32059 + * and do the profiling totally in assembly.
32060 + *
32061 + * Currently this isn't too much of an issue (performance wise),
32062 + * we can take more than 100K local irqs per second on a 100 MHz P5.
32063 + */
32064 +}
32065 +
32066 +/*
32067 + * Local APIC timer interrupt. This is the most natural way for doing
32068 + * local interrupts, but local timer interrupts can be emulated by
32069 + * broadcast interrupts too. [in case the hw doesn't support APIC timers]
32070 + *
32071 + * [ if a single-CPU system runs an SMP kernel then we call the local
32072 + * interrupt as well. Thus we cannot inline the local irq ... ]
32073 + */
32074 +void smp_apic_timer_interrupt(struct pt_regs *regs)
32075 +{
32076 + /*
32077 + * the NMI deadlock-detector uses this.
32078 + */
32079 + add_pda(apic_timer_irqs, 1);
32080 +
32081 + /*
32082 + * NOTE! We'd better ACK the irq immediately,
32083 + * because timer handling can be slow.
32084 + */
32085 + ack_APIC_irq();
32086 + /*
32087 + * update_process_times() expects us to have done irq_enter().
32088 + * Besides, if we don't, timer interrupts ignore the global
32089 + * interrupt lock, which is the WrongThing (tm) to do.
32090 + */
32091 + exit_idle();
32092 + irq_enter();
32093 + smp_local_timer_interrupt(regs);
32094 + irq_exit();
32095 +}
32096 +
32097 +/*
32098 + * This interrupt should _never_ happen with our APIC/SMP architecture
32099 + */
32100 +asmlinkage void smp_spurious_interrupt(void)
32101 +{
32102 + unsigned int v;
32103 + exit_idle();
32104 + irq_enter();
32105 + /*
32106 + * Check if this really is a spurious interrupt and ACK it
32107 + * if it is a vectored one. Just in case...
32108 + * Spurious interrupts should not be ACKed.
32109 + */
32110 + v = apic_read(APIC_ISR + ((SPURIOUS_APIC_VECTOR & ~0x1f) >> 1));
32111 + if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f)))
32112 + ack_APIC_irq();
32113 +
32114 +#if 0
32115 + static unsigned long last_warning;
32116 + static unsigned long skipped;
32117 +
32118 + /* see sw-dev-man vol 3, chapter 7.4.13.5 */
32119 + if (time_before(last_warning+30*HZ,jiffies)) {
32120 + printk(KERN_INFO "spurious APIC interrupt on CPU#%d, %ld skipped.\n",
32121 + smp_processor_id(), skipped);
32122 + last_warning = jiffies;
32123 + skipped = 0;
32124 + } else {
32125 + skipped++;
32126 + }
32127 +#endif
32128 + irq_exit();
32129 +}
32130 +
32131 +/*
32132 + * This interrupt should never happen with our APIC/SMP architecture
32133 + */
32134 +
32135 +asmlinkage void smp_error_interrupt(void)
32136 +{
32137 + unsigned int v, v1;
32138 +
32139 + exit_idle();
32140 + irq_enter();
32141 + /* First tickle the hardware, only then report what went on. -- REW */
32142 + v = apic_read(APIC_ESR);
32143 + apic_write(APIC_ESR, 0);
32144 + v1 = apic_read(APIC_ESR);
32145 + ack_APIC_irq();
32146 + atomic_inc(&irq_err_count);
32147 +
32148 + /* Here is what the APIC error bits mean:
32149 + 0: Send CS error
32150 + 1: Receive CS error
32151 + 2: Send accept error
32152 + 3: Receive accept error
32153 + 4: Reserved
32154 + 5: Send illegal vector
32155 + 6: Received illegal vector
32156 + 7: Illegal register address
32157 + */
32158 + printk (KERN_DEBUG "APIC error on CPU%d: %02x(%02x)\n",
32159 + smp_processor_id(), v , v1);
32160 + irq_exit();
32161 +}
32162 +
32163 +int disable_apic;
32164 +
32165 +/*
32166 + * This initializes the IO-APIC and APIC hardware if this is
32167 + * a UP kernel.
32168 + */
32169 +int __init APIC_init_uniprocessor (void)
32170 +{
32171 +#ifdef CONFIG_X86_IO_APIC
32172 + if (smp_found_config)
32173 + if (!skip_ioapic_setup && nr_ioapics)
32174 + setup_IO_APIC();
32175 +#endif
32176 +
32177 + return 0;
32178 +}
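For reference, the ISR probe in smp_spurious_interrupt() above relies on the local APIC's in-service register being a 256-bit bitmap split across eight 32-bit registers spaced 0x10 bytes apart, so (vector & ~0x1f) >> 1 selects the register and vector & 0x1f the bit. A hedged helper spelling out the same arithmetic (illustrative only; apic_read() and APIC_ISR are the kernel definitions already used above):

/* Illustrative sketch only -- not part of the patch. */
static inline int apic_vector_in_service(unsigned int vector)
{
	/* 32 vectors per register, registers 0x10 bytes apart:
	 * (vector & ~0x1f) >> 1 == (vector / 32) * 0x10. */
	unsigned int reg = APIC_ISR + ((vector & ~0x1f) >> 1);
	unsigned int bit = vector & 0x1f;

	return (apic_read(reg) >> bit) & 1;
}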
32179 diff -Nur linux-2.6.16.33-noxen/arch/x86_64/kernel/asm-offsets.c linux-2.6.16.33/arch/x86_64/kernel/asm-offsets.c
32180 --- linux-2.6.16.33-noxen/arch/x86_64/kernel/asm-offsets.c 2006-11-22 18:06:31.000000000 +0000
32181 +++ linux-2.6.16.33/arch/x86_64/kernel/asm-offsets.c 2007-01-08 15:00:45.000000000 +0000
32182 @@ -66,7 +66,9 @@
32183 DEFINE(pbe_address, offsetof(struct pbe, address));
32184 DEFINE(pbe_orig_address, offsetof(struct pbe, orig_address));
32185 DEFINE(pbe_next, offsetof(struct pbe, next));
32186 +#ifndef CONFIG_X86_NO_TSS
32187 BLANK();
32188 DEFINE(TSS_ist, offsetof(struct tss_struct, ist));
32189 +#endif
32190 return 0;
32191 }
32192 diff -Nur linux-2.6.16.33-noxen/arch/x86_64/kernel/crash.c linux-2.6.16.33/arch/x86_64/kernel/crash.c
32193 --- linux-2.6.16.33-noxen/arch/x86_64/kernel/crash.c 2006-11-22 18:06:31.000000000 +0000
32194 +++ linux-2.6.16.33/arch/x86_64/kernel/crash.c 2007-01-08 15:00:45.000000000 +0000
32195 @@ -92,6 +92,7 @@
32196 crash_save_this_cpu(regs, cpu);
32197 }
32198
32199 +#ifndef CONFIG_XEN
32200 #ifdef CONFIG_SMP
32201 static atomic_t waiting_for_crash_ipi;
32202
32203 @@ -156,6 +157,7 @@
32204 /* There are no cpus to shootdown */
32205 }
32206 #endif
32207 +#endif /* CONFIG_XEN */
32208
32209 void machine_crash_shutdown(struct pt_regs *regs)
32210 {
32211 @@ -173,6 +175,8 @@
32212
32213 /* Make a note of crashing cpu. Will be used in NMI callback.*/
32214 crashing_cpu = smp_processor_id();
32215 +
32216 +#ifndef CONFIG_XEN
32217 nmi_shootdown_cpus();
32218
32219 if(cpu_has_apic)
32220 @@ -181,6 +185,6 @@
32221 #if defined(CONFIG_X86_IO_APIC)
32222 disable_IO_APIC();
32223 #endif
32224 -
32225 +#endif /* CONFIG_XEN */
32226 crash_save_self(regs);
32227 }
32228 diff -Nur linux-2.6.16.33-noxen/arch/x86_64/kernel/e820-xen.c linux-2.6.16.33/arch/x86_64/kernel/e820-xen.c
32229 --- linux-2.6.16.33-noxen/arch/x86_64/kernel/e820-xen.c 1970-01-01 00:00:00.000000000 +0000
32230 +++ linux-2.6.16.33/arch/x86_64/kernel/e820-xen.c 2007-01-08 15:00:45.000000000 +0000
32231 @@ -0,0 +1,719 @@
32232 +/*
32233 + * Handle the memory map.
32234 + * The functions here do the job until bootmem takes over.
32235 + * $Id: e820.c,v 1.4 2002/09/19 19:25:32 ak Exp $
32236 + *
32237 + * Getting sanitize_e820_map() in sync with i386 version by applying change:
32238 + * - Provisions for empty E820 memory regions (reported by certain BIOSes).
32239 + * Alex Achenbach <xela@slit.de>, December 2002.
32240 + * Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
32241 + *
32242 + */
32243 +#include <linux/config.h>
32244 +#include <linux/kernel.h>
32245 +#include <linux/types.h>
32246 +#include <linux/init.h>
32247 +#include <linux/bootmem.h>
32248 +#include <linux/ioport.h>
32249 +#include <linux/string.h>
32250 +#include <linux/kexec.h>
32251 +#include <linux/module.h>
32252 +
32253 +#include <asm/page.h>
32254 +#include <asm/e820.h>
32255 +#include <asm/proto.h>
32256 +#include <asm/bootsetup.h>
32257 +#include <asm/sections.h>
32258 +#include <xen/interface/memory.h>
32259 +
32260 +/*
32261 + * PFN of last memory page.
32262 + */
32263 +unsigned long end_pfn;
32264 +EXPORT_SYMBOL(end_pfn);
32265 +
32266 +/*
32267 + * end_pfn only includes RAM, while end_pfn_map includes all e820 entries.
32268 + * The direct mapping extends to end_pfn_map, so that we can directly access
32269 + * apertures, ACPI and other tables without having to play with fixmaps.
32270 + */
32271 +unsigned long end_pfn_map;
32272 +
32273 +/*
32274 + * Last pfn which the user wants to use.
32275 + */
32276 +unsigned long end_user_pfn = MAXMEM>>PAGE_SHIFT;
32277 +
32278 +extern struct resource code_resource, data_resource;
32279 +
32280 +/* Check for some hardcoded bad areas that early boot is not allowed to touch */
32281 +static inline int bad_addr(unsigned long *addrp, unsigned long size)
32282 +{
32283 + unsigned long addr = *addrp, last = addr + size;
32284 +
32285 +#ifndef CONFIG_XEN
32286 + /* various gunk below that needed for SMP startup */
32287 + if (addr < 0x8000) {
32288 + *addrp = 0x8000;
32289 + return 1;
32290 + }
32291 +
32292 + /* direct mapping tables of the kernel */
32293 + if (last >= table_start<<PAGE_SHIFT && addr < table_end<<PAGE_SHIFT) {
32294 + *addrp = table_end << PAGE_SHIFT;
32295 + return 1;
32296 + }
32297 +
32298 + /* initrd */
32299 +#ifdef CONFIG_BLK_DEV_INITRD
32300 + if (LOADER_TYPE && INITRD_START && last >= INITRD_START &&
32301 + addr < INITRD_START+INITRD_SIZE) {
32302 + *addrp = INITRD_START + INITRD_SIZE;
32303 + return 1;
32304 + }
32305 +#endif
32306 + /* kernel code + 640k memory hole (later should not be needed, but
32307 + be paranoid for now) */
32308 + if (last >= 640*1024 && addr < __pa_symbol(&_end)) {
32309 + *addrp = __pa_symbol(&_end);
32310 + return 1;
32311 + }
32312 + /* XXX ramdisk image here? */
32313 +#else
32314 + if (last < (table_end<<PAGE_SHIFT)) {
32315 + *addrp = table_end << PAGE_SHIFT;
32316 + return 1;
32317 + }
32318 +#endif
32319 + return 0;
32320 +}
32321 +
32322 +#ifndef CONFIG_XEN
32323 +int __init e820_mapped(unsigned long start, unsigned long end, unsigned type)
32324 +{
32325 + int i;
32326 + for (i = 0; i < e820.nr_map; i++) {
32327 + struct e820entry *ei = &e820.map[i];
32328 + if (type && ei->type != type)
32329 + continue;
32330 + if (ei->addr >= end || ei->addr + ei->size <= start)
32331 + continue;
32332 + return 1;
32333 + }
32334 + return 0;
32335 +}
32336 +#endif
32337 +
32338 +/*
32339 + * Find a free area in a specific range.
32340 + */
32341 +unsigned long __init find_e820_area(unsigned long start, unsigned long end, unsigned size)
32342 +{
32343 + int i;
32344 + for (i = 0; i < e820.nr_map; i++) {
32345 + struct e820entry *ei = &e820.map[i];
32346 + unsigned long addr = ei->addr, last;
32347 + if (ei->type != E820_RAM)
32348 + continue;
32349 + if (addr < start)
32350 + addr = start;
32351 + if (addr > ei->addr + ei->size)
32352 + continue;
32353 + while (bad_addr(&addr, size) && addr+size < ei->addr + ei->size)
32354 + ;
32355 + last = addr + size;
32356 + if (last > ei->addr + ei->size)
32357 + continue;
32358 + if (last > end)
32359 + continue;
32360 + return addr;
32361 + }
32362 + return -1UL;
32363 +}
32364 +
32365 +/*
32366 + * Free bootmem based on the e820 table for a node.
32367 + */
32368 +void __init e820_bootmem_free(pg_data_t *pgdat, unsigned long start,unsigned long end)
32369 +{
32370 + int i;
32371 + for (i = 0; i < e820.nr_map; i++) {
32372 + struct e820entry *ei = &e820.map[i];
32373 + unsigned long last, addr;
32374 +
32375 + if (ei->type != E820_RAM ||
32376 + ei->addr+ei->size <= start ||
32377 + ei->addr >= end)
32378 + continue;
32379 +
32380 + addr = round_up(ei->addr, PAGE_SIZE);
32381 + if (addr < start)
32382 + addr = start;
32383 +
32384 + last = round_down(ei->addr + ei->size, PAGE_SIZE);
32385 + if (last >= end)
32386 + last = end;
32387 +
32388 + if (last > addr && last-addr >= PAGE_SIZE)
32389 + free_bootmem_node(pgdat, addr, last-addr);
32390 + }
32391 +}
32392 +
32393 +/*
32394 + * Find the highest page frame number we have available
32395 + */
32396 +unsigned long __init e820_end_of_ram(void)
32397 +{
32398 + int i;
32399 + unsigned long end_pfn = 0;
32400 +
32401 + for (i = 0; i < e820.nr_map; i++) {
32402 + struct e820entry *ei = &e820.map[i];
32403 + unsigned long start, end;
32404 +
32405 + start = round_up(ei->addr, PAGE_SIZE);
32406 + end = round_down(ei->addr + ei->size, PAGE_SIZE);
32407 + if (start >= end)
32408 + continue;
32409 + if (ei->type == E820_RAM) {
32410 + if (end > end_pfn<<PAGE_SHIFT)
32411 + end_pfn = end>>PAGE_SHIFT;
32412 + } else {
32413 + if (end > end_pfn_map<<PAGE_SHIFT)
32414 + end_pfn_map = end>>PAGE_SHIFT;
32415 + }
32416 + }
32417 +
32418 + if (end_pfn > end_pfn_map)
32419 + end_pfn_map = end_pfn;
32420 + if (end_pfn_map > MAXMEM>>PAGE_SHIFT)
32421 + end_pfn_map = MAXMEM>>PAGE_SHIFT;
32422 + if (end_pfn > end_user_pfn)
32423 + end_pfn = end_user_pfn;
32424 + if (end_pfn > end_pfn_map)
32425 + end_pfn = end_pfn_map;
32426 +
32427 + return end_pfn;
32428 +}
32429 +
32430 +/*
32431 + * Compute how much memory is missing in a range.
32432 + * Unlike the other functions in this file the arguments are in page numbers.
32433 + */
32434 +unsigned long __init
32435 +e820_hole_size(unsigned long start_pfn, unsigned long end_pfn)
32436 +{
32437 + unsigned long ram = 0;
32438 + unsigned long start = start_pfn << PAGE_SHIFT;
32439 + unsigned long end = end_pfn << PAGE_SHIFT;
32440 + int i;
32441 + for (i = 0; i < e820.nr_map; i++) {
32442 + struct e820entry *ei = &e820.map[i];
32443 + unsigned long last, addr;
32444 +
32445 + if (ei->type != E820_RAM ||
32446 + ei->addr+ei->size <= start ||
32447 + ei->addr >= end)
32448 + continue;
32449 +
32450 + addr = round_up(ei->addr, PAGE_SIZE);
32451 + if (addr < start)
32452 + addr = start;
32453 +
32454 + last = round_down(ei->addr + ei->size, PAGE_SIZE);
32455 + if (last >= end)
32456 + last = end;
32457 +
32458 + if (last > addr)
32459 + ram += last - addr;
32460 + }
32461 + return ((end - start) - ram) >> PAGE_SHIFT;
32462 +}
32463 +
32464 +/*
32465 + * Mark e820 reserved areas as busy for the resource manager.
32466 + */
32467 +void __init e820_reserve_resources(struct e820entry *e820, int nr_map)
32468 +{
32469 + int i;
32470 + for (i = 0; i < nr_map; i++) {
32471 + struct resource *res;
32472 + res = alloc_bootmem_low(sizeof(struct resource));
32473 + switch (e820[i].type) {
32474 + case E820_RAM: res->name = "System RAM"; break;
32475 + case E820_ACPI: res->name = "ACPI Tables"; break;
32476 + case E820_NVS: res->name = "ACPI Non-volatile Storage"; break;
32477 + default: res->name = "reserved";
32478 + }
32479 + res->start = e820[i].addr;
32480 + res->end = res->start + e820[i].size - 1;
32481 + res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
32482 + request_resource(&iomem_resource, res);
32483 + if (e820[i].type == E820_RAM) {
32484 + /*
32485 + * We don't know which RAM region contains kernel data,
32486 + * so we try it repeatedly and let the resource manager
32487 + * test it.
32488 + */
32489 +#ifndef CONFIG_XEN
32490 + request_resource(res, &code_resource);
32491 + request_resource(res, &data_resource);
32492 +#endif
32493 +#ifdef CONFIG_KEXEC
32494 + if (crashk_res.start != crashk_res.end)
32495 + request_resource(res, &crashk_res);
32496 +#ifdef CONFIG_XEN
32497 + xen_machine_kexec_register_resources(res);
32498 +#endif
32499 +#endif
32500 + }
32501 + }
32502 +}
32503 +
32504 +/*
32505 + * Add a memory region to the kernel e820 map.
32506 + */
32507 +void __init add_memory_region(unsigned long start, unsigned long size, int type)
32508 +{
32509 + int x = e820.nr_map;
32510 +
32511 + if (x == E820MAX) {
32512 + printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
32513 + return;
32514 + }
32515 +
32516 + e820.map[x].addr = start;
32517 + e820.map[x].size = size;
32518 + e820.map[x].type = type;
32519 + e820.nr_map++;
32520 +}
32521 +
32522 +void __init e820_print_map(char *who)
32523 +{
32524 + int i;
32525 +
32526 + for (i = 0; i < e820.nr_map; i++) {
32527 + printk(" %s: %016Lx - %016Lx ", who,
32528 + (unsigned long long) e820.map[i].addr,
32529 + (unsigned long long) (e820.map[i].addr + e820.map[i].size));
32530 + switch (e820.map[i].type) {
32531 + case E820_RAM: printk("(usable)\n");
32532 + break;
32533 + case E820_RESERVED:
32534 + printk("(reserved)\n");
32535 + break;
32536 + case E820_ACPI:
32537 + printk("(ACPI data)\n");
32538 + break;
32539 + case E820_NVS:
32540 + printk("(ACPI NVS)\n");
32541 + break;
32542 + default: printk("type %u\n", e820.map[i].type);
32543 + break;
32544 + }
32545 + }
32546 +}
32547 +
32548 +/*
32549 + * Sanitize the BIOS e820 map.
32550 + *
32551 + * Some e820 responses include overlapping entries. The following
32552 + * replaces the original e820 map with a new one, removing overlaps.
32553 + *
32554 + */
32555 +static int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
32556 +{
32557 + struct change_member {
32558 + struct e820entry *pbios; /* pointer to original bios entry */
32559 + unsigned long long addr; /* address for this change point */
32560 + };
32561 + static struct change_member change_point_list[2*E820MAX] __initdata;
32562 + static struct change_member *change_point[2*E820MAX] __initdata;
32563 + static struct e820entry *overlap_list[E820MAX] __initdata;
32564 + static struct e820entry new_bios[E820MAX] __initdata;
32565 + struct change_member *change_tmp;
32566 + unsigned long current_type, last_type;
32567 + unsigned long long last_addr;
32568 + int chgidx, still_changing;
32569 + int overlap_entries;
32570 + int new_bios_entry;
32571 + int old_nr, new_nr, chg_nr;
32572 + int i;
32573 +
32574 + /*
32575 + Visually we're performing the following (1,2,3,4 = memory types)...
32576 +
32577 + Sample memory map (w/overlaps):
32578 + ____22__________________
32579 + ______________________4_
32580 + ____1111________________
32581 + _44_____________________
32582 + 11111111________________
32583 + ____________________33__
32584 + ___________44___________
32585 + __________33333_________
32586 + ______________22________
32587 + ___________________2222_
32588 + _________111111111______
32589 + _____________________11_
32590 + _________________4______
32591 +
32592 + Sanitized equivalent (no overlap):
32593 + 1_______________________
32594 + _44_____________________
32595 + ___1____________________
32596 + ____22__________________
32597 + ______11________________
32598 + _________1______________
32599 + __________3_____________
32600 + ___________44___________
32601 + _____________33_________
32602 + _______________2________
32603 + ________________1_______
32604 + _________________4______
32605 + ___________________2____
32606 + ____________________33__
32607 + ______________________4_
32608 + */
32609 +
32610 + /* if there's only one memory region, don't bother */
32611 + if (*pnr_map < 2)
32612 + return -1;
32613 +
32614 + old_nr = *pnr_map;
32615 +
32616 + /* bail out if we find any unreasonable addresses in bios map */
32617 + for (i=0; i<old_nr; i++)
32618 + if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr)
32619 + return -1;
32620 +
32621 + /* create pointers for initial change-point information (for sorting) */
32622 + for (i=0; i < 2*old_nr; i++)
32623 + change_point[i] = &change_point_list[i];
32624 +
32625 + /* record all known change-points (starting and ending addresses),
32626 + omitting those that are for empty memory regions */
32627 + chgidx = 0;
32628 + for (i=0; i < old_nr; i++) {
32629 + if (biosmap[i].size != 0) {
32630 + change_point[chgidx]->addr = biosmap[i].addr;
32631 + change_point[chgidx++]->pbios = &biosmap[i];
32632 + change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size;
32633 + change_point[chgidx++]->pbios = &biosmap[i];
32634 + }
32635 + }
32636 + chg_nr = chgidx;
32637 +
32638 + /* sort change-point list by memory addresses (low -> high) */
32639 + still_changing = 1;
32640 + while (still_changing) {
32641 + still_changing = 0;
32642 + for (i=1; i < chg_nr; i++) {
32643 + /* if <current_addr> > <last_addr>, swap */
32644 + /* or, if current=<start_addr> & last=<end_addr>, swap */
32645 + if ((change_point[i]->addr < change_point[i-1]->addr) ||
32646 + ((change_point[i]->addr == change_point[i-1]->addr) &&
32647 + (change_point[i]->addr == change_point[i]->pbios->addr) &&
32648 + (change_point[i-1]->addr != change_point[i-1]->pbios->addr))
32649 + )
32650 + {
32651 + change_tmp = change_point[i];
32652 + change_point[i] = change_point[i-1];
32653 + change_point[i-1] = change_tmp;
32654 + still_changing=1;
32655 + }
32656 + }
32657 + }
32658 +
32659 + /* create a new bios memory map, removing overlaps */
32660 + overlap_entries=0; /* number of entries in the overlap table */
32661 + new_bios_entry=0; /* index for creating new bios map entries */
32662 + last_type = 0; /* start with undefined memory type */
32663 + last_addr = 0; /* start with 0 as last starting address */
32664 + /* loop through change-points, determining effect on the new bios map */
32665 + for (chgidx=0; chgidx < chg_nr; chgidx++)
32666 + {
32667 + /* keep track of all overlapping bios entries */
32668 + if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr)
32669 + {
32670 + /* add map entry to overlap list (> 1 entry implies an overlap) */
32671 + overlap_list[overlap_entries++]=change_point[chgidx]->pbios;
32672 + }
32673 + else
32674 + {
32675 + /* remove entry from list (order independent, so swap with last) */
32676 + for (i=0; i<overlap_entries; i++)
32677 + {
32678 + if (overlap_list[i] == change_point[chgidx]->pbios)
32679 + overlap_list[i] = overlap_list[overlap_entries-1];
32680 + }
32681 + overlap_entries--;
32682 + }
32683 + /* if there are overlapping entries, decide which "type" to use */
32684 + /* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */
32685 + current_type = 0;
32686 + for (i=0; i<overlap_entries; i++)
32687 + if (overlap_list[i]->type > current_type)
32688 + current_type = overlap_list[i]->type;
32689 + /* continue building up new bios map based on this information */
32690 + if (current_type != last_type) {
32691 + if (last_type != 0) {
32692 + new_bios[new_bios_entry].size =
32693 + change_point[chgidx]->addr - last_addr;
32694 + /* move forward only if the new size was non-zero */
32695 + if (new_bios[new_bios_entry].size != 0)
32696 + if (++new_bios_entry >= E820MAX)
32697 + break; /* no more space left for new bios entries */
32698 + }
32699 + if (current_type != 0) {
32700 + new_bios[new_bios_entry].addr = change_point[chgidx]->addr;
32701 + new_bios[new_bios_entry].type = current_type;
32702 + last_addr=change_point[chgidx]->addr;
32703 + }
32704 + last_type = current_type;
32705 + }
32706 + }
32707 + new_nr = new_bios_entry; /* retain count for new bios entries */
32708 +
32709 + /* copy new bios mapping into original location */
32710 + memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry));
32711 + *pnr_map = new_nr;
32712 +
32713 + return 0;
32714 +}
32715 +
32716 +/*
32717 + * Copy the BIOS e820 map into a safe place.
32718 + *
32719 + * Sanity-check it while we're at it..
32720 + *
32721 + * If we're lucky and live on a modern system, the setup code
32722 + * will have given us a memory map that we can use to properly
32723 + * set up memory. If we aren't, we'll fake a memory map.
32724 + *
32725 + * We check to see that the memory map contains at least 2 elements
32726 + * before we'll use it, because the detection code in setup.S may
32727 + * not be perfect and most every PC known to man has two memory
32728 + * regions: one from 0 to 640k, and one from 1mb up. (The IBM
32729 + * thinkpad 560x, for example, does not cooperate with the memory
32730 + * detection code.)
32731 + */
32732 +static int __init copy_e820_map(struct e820entry * biosmap, int nr_map)
32733 +{
32734 +#ifndef CONFIG_XEN
32735 + /* Only one memory region (or negative)? Ignore it */
32736 + if (nr_map < 2)
32737 + return -1;
32738 +#else
32739 + BUG_ON(nr_map < 1);
32740 +#endif
32741 +
32742 + do {
32743 + unsigned long start = biosmap->addr;
32744 + unsigned long size = biosmap->size;
32745 + unsigned long end = start + size;
32746 + unsigned long type = biosmap->type;
32747 +
32748 + /* Overflow in 64 bits? Ignore the memory map. */
32749 + if (start > end)
32750 + return -1;
32751 +
32752 +#ifndef CONFIG_XEN
32753 + /*
32754 + * Some BIOSes claim RAM in the 640k - 1M region.
32755 + * Not right. Fix it up.
32756 + *
32757 + * This should be removed on Hammer which is supposed to not
32758 + * have non e820 covered ISA mappings there, but I had some strange
32759 + * problems so it stays for now. -AK
32760 + */
32761 + if (type == E820_RAM) {
32762 + if (start < 0x100000ULL && end > 0xA0000ULL) {
32763 + if (start < 0xA0000ULL)
32764 + add_memory_region(start, 0xA0000ULL-start, type);
32765 + if (end <= 0x100000ULL)
32766 + continue;
32767 + start = 0x100000ULL;
32768 + size = end - start;
32769 + }
32770 + }
32771 +#endif
32772 +
32773 + add_memory_region(start, size, type);
32774 + } while (biosmap++,--nr_map);
32775 + return 0;
32776 +}
32777 +
32778 +#ifndef CONFIG_XEN
32779 +void __init setup_memory_region(void)
32780 +{
32781 + char *who = "BIOS-e820";
32782 +
32783 + /*
32784 + * Try to copy the BIOS-supplied E820-map.
32785 + *
32786 + * Otherwise fake a memory map; one section from 0k->640k,
32787 + * the next section from 1mb->appropriate_mem_k
32788 + */
32789 + sanitize_e820_map(E820_MAP, &E820_MAP_NR);
32790 + if (copy_e820_map(E820_MAP, E820_MAP_NR) < 0) {
32791 + unsigned long mem_size;
32792 +
32793 + /* compare results from other methods and take the greater */
32794 + if (ALT_MEM_K < EXT_MEM_K) {
32795 + mem_size = EXT_MEM_K;
32796 + who = "BIOS-88";
32797 + } else {
32798 + mem_size = ALT_MEM_K;
32799 + who = "BIOS-e801";
32800 + }
32801 +
32802 + e820.nr_map = 0;
32803 + add_memory_region(0, LOWMEMSIZE(), E820_RAM);
32804 + add_memory_region(HIGH_MEMORY, mem_size << 10, E820_RAM);
32805 + }
32806 + printk(KERN_INFO "BIOS-provided physical RAM map:\n");
32807 + e820_print_map(who);
32808 +}
32809 +
32810 +#else /* CONFIG_XEN */
32811 +
32812 +void __init setup_memory_region(void)
32813 +{
32814 + int rc;
32815 + struct xen_memory_map memmap;
32816 + /*
32817 + * This is rather large for a stack variable but this early in
32818 + * the boot process we know we have plenty of slack space.
32819 + */
32820 + struct e820entry map[E820MAX];
32821 +
32822 + memmap.nr_entries = E820MAX;
32823 + set_xen_guest_handle(memmap.buffer, map);
32824 +
32825 + rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap);
32826 + if ( rc == -ENOSYS ) {
32827 + memmap.nr_entries = 1;
32828 + map[0].addr = 0ULL;
32829 + map[0].size = xen_start_info->nr_pages << PAGE_SHIFT;
32830 + /* 8MB slack (to balance backend allocations). */
32831 + map[0].size += 8 << 20;
32832 + map[0].type = E820_RAM;
32833 + rc = 0;
32834 + }
32835 + BUG_ON(rc);
32836 +
32837 + sanitize_e820_map(map, (char *)&memmap.nr_entries);
32838 +
32839 + BUG_ON(copy_e820_map(map, (char)memmap.nr_entries) < 0);
32840 +
32841 + printk(KERN_INFO "BIOS-provided physical RAM map:\n");
32842 + e820_print_map("Xen");
32843 +}
32844 +#endif
32845 +
32846 +void __init parse_memopt(char *p, char **from)
32847 +{
32848 + int i;
32849 + unsigned long current_end;
32850 + unsigned long end;
32851 +
32852 + end_user_pfn = memparse(p, from);
32853 + end_user_pfn >>= PAGE_SHIFT;
32854 +
32855 + end = end_user_pfn<<PAGE_SHIFT;
32856 + i = e820.nr_map-1;
32857 + current_end = e820.map[i].addr + e820.map[i].size;
32858 +
32859 + if (current_end < end) {
32860 + /*
32861 + * The e820 map ends before our requested size so
32862 + * extend the final entry to the requested address.
32863 + */
32864 + if (e820.map[i].type == E820_RAM)
32865 + e820.map[i].size = end - e820.map[i].addr;
32866 + else
32867 + add_memory_region(current_end, end - current_end, E820_RAM);
32868 + }
32869 +}
32870 +
32871 +void __init parse_memmapopt(char *p, char **from)
32872 +{
32873 + unsigned long long start_at, mem_size;
32874 +
32875 + mem_size = memparse(p, from);
32876 + p = *from;
32877 + if (*p == '@') {
32878 + start_at = memparse(p+1, from);
32879 + add_memory_region(start_at, mem_size, E820_RAM);
32880 + } else if (*p == '#') {
32881 + start_at = memparse(p+1, from);
32882 + add_memory_region(start_at, mem_size, E820_ACPI);
32883 + } else if (*p == '$') {
32884 + start_at = memparse(p+1, from);
32885 + add_memory_region(start_at, mem_size, E820_RESERVED);
32886 + } else {
32887 + end_user_pfn = (mem_size >> PAGE_SHIFT);
32888 + }
32889 + p = *from;
32890 +}
32891 +
32892 +unsigned long pci_mem_start = 0xaeedbabe;
32893 +
32894 +/*
32895 + * Search for the biggest gap in the low 32 bits of the e820
32896 + * memory space. We pass this space to PCI so it can assign MMIO
32897 + * resources to hotplug or unconfigured devices.
32898 + * Hopefully the BIOS left enough free space.
32899 + */
32900 +__init void e820_setup_gap(struct e820entry *e820, int nr_map)
32901 +{
32902 + unsigned long gapstart, gapsize, round;
32903 + unsigned long last;
32904 + int i;
32905 + int found = 0;
32906 +
32907 + last = 0x100000000ull;
32908 + gapstart = 0x10000000;
32909 + gapsize = 0x400000;
32910 + i = nr_map;
32911 + while (--i >= 0) {
32912 + unsigned long long start = e820[i].addr;
32913 + unsigned long long end = start + e820[i].size;
32914 +
32915 + /*
32916 + * Since "last" is at most 4GB, we know we'll
32917 + * fit in 32 bits if this condition is true
32918 + */
32919 + if (last > end) {
32920 + unsigned long gap = last - end;
32921 +
32922 + if (gap > gapsize) {
32923 + gapsize = gap;
32924 + gapstart = end;
32925 + found = 1;
32926 + }
32927 + }
32928 + if (start < last)
32929 + last = start;
32930 + }
32931 +
32932 + if (!found) {
32933 + gapstart = (end_pfn << PAGE_SHIFT) + 1024*1024;
32934 + printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit address range\n"
32935 + KERN_ERR "PCI: Unassigned devices with 32bit resource registers may break!\n");
32936 + }
32937 +
32938 + /*
32939 + * See how much we want to round up: start off with
32940 + * rounding to the next 1MB area.
32941 + */
32942 + round = 0x100000;
32943 + while ((gapsize >> 4) > round)
32944 + round += round;
32945 + /* Fun with two's complement */
32946 + pci_mem_start = (gapstart + round) & -round;
32947 +
32948 + printk(KERN_INFO "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n",
32949 + pci_mem_start, gapstart, gapsize);
32950 +}
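The "fun with two's complement" at the end of e820_setup_gap() is the usual power-of-two round-up: -round is the mask ~(round - 1), so (gapstart + round) & -round yields a round-aligned address just above the gap start. A standalone illustration, not part of the patch and using made-up sample values:

/* Illustrative sketch only -- not part of the patch. */
#include <stdio.h>

int main(void)
{
	unsigned long gapstart = 0xcfef0000UL;	/* made-up gap start */
	unsigned long round    = 0x100000UL;	/* 1MB, a power of two */

	/* Same expression as in e820_setup_gap() above. */
	unsigned long pci_mem_start = (gapstart + round) & -round;

	printf("gap start %#lx -> PCI allocations from %#lx\n",
	       gapstart, pci_mem_start);
	return 0;
}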
32951 diff -Nur linux-2.6.16.33-noxen/arch/x86_64/kernel/early_printk-xen.c linux-2.6.16.33/arch/x86_64/kernel/early_printk-xen.c
32952 --- linux-2.6.16.33-noxen/arch/x86_64/kernel/early_printk-xen.c 1970-01-01 00:00:00.000000000 +0000
32953 +++ linux-2.6.16.33/arch/x86_64/kernel/early_printk-xen.c 2007-01-08 15:00:45.000000000 +0000
32954 @@ -0,0 +1,306 @@
32955 +#include <linux/config.h>
32956 +#include <linux/console.h>
32957 +#include <linux/kernel.h>
32958 +#include <linux/init.h>
32959 +#include <linux/string.h>
32960 +#include <linux/tty.h>
32961 +#include <asm/io.h>
32962 +#include <asm/processor.h>
32963 +#include <asm/fcntl.h>
32964 +
32965 +/* Simple VGA output */
32966 +
32967 +#ifdef __i386__
32968 +#include <asm/setup.h>
32969 +#define VGABASE (__ISA_IO_base + 0xb8000)
32970 +#else
32971 +#include <asm/bootsetup.h>
32972 +#define VGABASE ((void __iomem *)0xffffffff800b8000UL)
32973 +#endif
32974 +
32975 +#define MAX_YPOS max_ypos
32976 +#define MAX_XPOS max_xpos
32977 +
32978 +static int max_ypos = 25, max_xpos = 80;
32979 +
32980 +#ifndef CONFIG_XEN
32981 +static int current_ypos = 1, current_xpos = 0;
32982 +
32983 +static void early_vga_write(struct console *con, const char *str, unsigned n)
32984 +{
32985 + char c;
32986 + int i, k, j;
32987 +
32988 + while ((c = *str++) != '\0' && n-- > 0) {
32989 + if (current_ypos >= MAX_YPOS) {
32990 + /* scroll 1 line up */
32991 + for (k = 1, j = 0; k < MAX_YPOS; k++, j++) {
32992 + for (i = 0; i < MAX_XPOS; i++) {
32993 + writew(readw(VGABASE + 2*(MAX_XPOS*k + i)),
32994 + VGABASE + 2*(MAX_XPOS*j + i));
32995 + }
32996 + }
32997 + for (i = 0; i < MAX_XPOS; i++)
32998 + writew(0x720, VGABASE + 2*(MAX_XPOS*j + i));
32999 + current_ypos = MAX_YPOS-1;
33000 + }
33001 + if (c == '\n') {
33002 + current_xpos = 0;
33003 + current_ypos++;
33004 + } else if (c != '\r') {
33005 + writew(((0x7 << 8) | (unsigned short) c),
33006 + VGABASE + 2*(MAX_XPOS*current_ypos +
33007 + current_xpos++));
33008 + if (current_xpos >= MAX_XPOS) {
33009 + current_xpos = 0;
33010 + current_ypos++;
33011 + }
33012 + }
33013 + }
33014 +}
33015 +
33016 +static struct console early_vga_console = {
33017 + .name = "earlyvga",
33018 + .write = early_vga_write,
33019 + .flags = CON_PRINTBUFFER,
33020 + .index = -1,
33021 +};
33022 +
33023 +/* Serial functions loosely based on a similar package from Klaus P. Gerlicher */
33024 +
33025 +static int early_serial_base = 0x3f8; /* ttyS0 */
33026 +
33027 +#define XMTRDY 0x20
33028 +
33029 +#define DLAB 0x80
33030 +
33031 +#define TXR 0 /* Transmit register (WRITE) */
33032 +#define RXR 0 /* Receive register (READ) */
33033 +#define IER 1 /* Interrupt Enable */
33034 +#define IIR 2 /* Interrupt ID */
33035 +#define FCR 2 /* FIFO control */
33036 +#define LCR 3 /* Line control */
33037 +#define MCR 4 /* Modem control */
33038 +#define LSR 5 /* Line Status */
33039 +#define MSR 6 /* Modem Status */
33040 +#define DLL 0 /* Divisor Latch Low */
33041 +#define DLH 1 /* Divisor latch High */
33042 +
33043 +static int early_serial_putc(unsigned char ch)
33044 +{
33045 + unsigned timeout = 0xffff;
33046 + while ((inb(early_serial_base + LSR) & XMTRDY) == 0 && --timeout)
33047 + cpu_relax();
33048 + outb(ch, early_serial_base + TXR);
33049 + return timeout ? 0 : -1;
33050 +}
33051 +
33052 +static void early_serial_write(struct console *con, const char *s, unsigned n)
33053 +{
33054 + while (*s && n-- > 0) {
33055 + early_serial_putc(*s);
33056 + if (*s == '\n')
33057 + early_serial_putc('\r');
33058 + s++;
33059 + }
33060 +}
33061 +
33062 +#define DEFAULT_BAUD 9600
33063 +
33064 +static __init void early_serial_init(char *s)
33065 +{
33066 + unsigned char c;
33067 + unsigned divisor;
33068 + unsigned baud = DEFAULT_BAUD;
33069 + char *e;
33070 +
33071 + if (*s == ',')
33072 + ++s;
33073 +
33074 + if (*s) {
33075 + unsigned port;
33076 + if (!strncmp(s,"0x",2)) {
33077 + early_serial_base = simple_strtoul(s, &e, 16);
33078 + } else {
33079 + static int bases[] = { 0x3f8, 0x2f8 };
33080 +
33081 + if (!strncmp(s,"ttyS",4))
33082 + s += 4;
33083 + port = simple_strtoul(s, &e, 10);
33084 + if (port > 1 || s == e)
33085 + port = 0;
33086 + early_serial_base = bases[port];
33087 + }
33088 + s += strcspn(s, ",");
33089 + if (*s == ',')
33090 + s++;
33091 + }
33092 +
33093 + outb(0x3, early_serial_base + LCR); /* 8n1 */
33094 + outb(0, early_serial_base + IER); /* no interrupt */
33095 + outb(0, early_serial_base + FCR); /* no fifo */
33096 + outb(0x3, early_serial_base + MCR); /* DTR + RTS */
33097 +
33098 + if (*s) {
33099 + baud = simple_strtoul(s, &e, 0);
33100 + if (baud == 0 || s == e)
33101 + baud = DEFAULT_BAUD;
33102 + }
33103 +
33104 + divisor = 115200 / baud;
33105 + c = inb(early_serial_base + LCR);
33106 + outb(c | DLAB, early_serial_base + LCR);
33107 + outb(divisor & 0xff, early_serial_base + DLL);
33108 + outb((divisor >> 8) & 0xff, early_serial_base + DLH);
33109 + outb(c & ~DLAB, early_serial_base + LCR);
33110 +}
33111 +
33112 +#else /* CONFIG_XEN */
33113 +
33114 +#undef SCREEN_INFO
33115 +#define SCREEN_INFO screen_info
33116 +extern struct screen_info screen_info;
33117 +
33118 +static void
33119 +early_serial_write(struct console *con, const char *s, unsigned count)
33120 +{
33121 + int n;
33122 +
33123 + while (count > 0) {
33124 + n = HYPERVISOR_console_io(CONSOLEIO_write, count, (char *)s);
33125 + if (n <= 0)
33126 + break;
33127 + count -= n;
33128 + s += n;
33129 + }
33130 +}
33131 +
33132 +static __init void early_serial_init(char *s)
33133 +{
33134 +}
33135 +
33136 +/*
33137 + * No early VGA console on Xen, as we do not have convenient ISA-space
33138 + * mappings. Someone should fix this for domain 0. For now, use fake serial.
33139 + */
33140 +#define early_vga_console early_serial_console
33141 +
33142 +#endif
33143 +
33144 +static struct console early_serial_console = {
33145 + .name = "earlyser",
33146 + .write = early_serial_write,
33147 + .flags = CON_PRINTBUFFER,
33148 + .index = -1,
33149 +};
33150 +
33151 +/* Console interface to a host file on AMD's SimNow! */
33152 +
33153 +static int simnow_fd;
33154 +
33155 +enum {
33156 + MAGIC1 = 0xBACCD00A,
33157 + MAGIC2 = 0xCA110000,
33158 + XOPEN = 5,
33159 + XWRITE = 4,
33160 +};
33161 +
33162 +static noinline long simnow(long cmd, long a, long b, long c)
33163 +{
33164 + long ret;
33165 + asm volatile("cpuid" :
33166 + "=a" (ret) :
33167 + "b" (a), "c" (b), "d" (c), "0" (MAGIC1), "D" (cmd + MAGIC2));
33168 + return ret;
33169 +}
33170 +
33171 +void __init simnow_init(char *str)
33172 +{
33173 + char *fn = "klog";
33174 + if (*str == '=')
33175 + fn = ++str;
33176 + /* error ignored */
33177 + simnow_fd = simnow(XOPEN, (unsigned long)fn, O_WRONLY|O_APPEND|O_CREAT, 0644);
33178 +}
33179 +
33180 +static void simnow_write(struct console *con, const char *s, unsigned n)
33181 +{
33182 + simnow(XWRITE, simnow_fd, (unsigned long)s, n);
33183 +}
33184 +
33185 +static struct console simnow_console = {
33186 + .name = "simnow",
33187 + .write = simnow_write,
33188 + .flags = CON_PRINTBUFFER,
33189 + .index = -1,
33190 +};
33191 +
33192 +/* Direct interface for emergencies */
33193 +struct console *early_console = &early_vga_console;
33194 +static int early_console_initialized = 0;
33195 +
33196 +void early_printk(const char *fmt, ...)
33197 +{
33198 + char buf[512];
33199 + int n;
33200 + va_list ap;
33201 +
33202 + va_start(ap,fmt);
33203 + n = vscnprintf(buf,512,fmt,ap);
33204 + early_console->write(early_console,buf,n);
33205 + va_end(ap);
33206 +}
33207 +
33208 +static int __initdata keep_early;
33209 +
33210 +int __init setup_early_printk(char *opt)
33211 +{
33212 + char *space;
33213 + char buf[256];
33214 +
33215 + if (early_console_initialized)
33216 + return -1;
33217 +
33218 + strlcpy(buf,opt,sizeof(buf));
33219 + space = strchr(buf, ' ');
33220 + if (space)
33221 + *space = 0;
33222 +
33223 + if (strstr(buf,"keep"))
33224 + keep_early = 1;
33225 +
33226 + if (!strncmp(buf, "serial", 6)) {
33227 + early_serial_init(buf + 6);
33228 + early_console = &early_serial_console;
33229 + } else if (!strncmp(buf, "ttyS", 4)) {
33230 + early_serial_init(buf);
33231 + early_console = &early_serial_console;
33232 + } else if (!strncmp(buf, "vga", 3)
33233 + && SCREEN_INFO.orig_video_isVGA == 1) {
33234 + max_xpos = SCREEN_INFO.orig_video_cols;
33235 + max_ypos = SCREEN_INFO.orig_video_lines;
33236 + early_console = &early_vga_console;
33237 + } else if (!strncmp(buf, "simnow", 6)) {
33238 + simnow_init(buf + 6);
33239 + early_console = &simnow_console;
33240 + keep_early = 1;
33241 + }
33242 + early_console_initialized = 1;
33243 + register_console(early_console);
33244 + return 0;
33245 +}
33246 +
33247 +void __init disable_early_printk(void)
33248 +{
33249 + if (!early_console_initialized || !early_console)
33250 + return;
33251 + if (!keep_early) {
33252 + printk("disabling early console\n");
33253 + unregister_console(early_console);
33254 + early_console_initialized = 0;
33255 + } else {
33256 + printk("keeping early console\n");
33257 + }
33258 +}
33259 +
33260 +__setup("earlyprintk=", setup_early_printk);
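Note how the Xen variant of early_serial_write() above loops on HYPERVISOR_console_io(): the hypercall may consume only part of the buffer, so the remainder is retried until everything is written or an error comes back. The same pattern in ordinary user-space C, shown only as a comparison sketch and not as part of the patch:

/* Illustrative user-space analogue -- not part of the patch. */
#include <stddef.h>
#include <unistd.h>

static int write_all(int fd, const char *buf, size_t count)
{
	while (count > 0) {
		ssize_t n = write(fd, buf, count);

		if (n <= 0)
			return -1;	/* error or no forward progress */
		buf   += n;
		count -= n;
	}
	return 0;
}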
33261 diff -Nur linux-2.6.16.33-noxen/arch/x86_64/kernel/entry-xen.S linux-2.6.16.33/arch/x86_64/kernel/entry-xen.S
33262 --- linux-2.6.16.33-noxen/arch/x86_64/kernel/entry-xen.S 1970-01-01 00:00:00.000000000 +0000
33263 +++ linux-2.6.16.33/arch/x86_64/kernel/entry-xen.S 2007-01-08 15:00:45.000000000 +0000
33264 @@ -0,0 +1,1136 @@
33265 +/*
33266 + * linux/arch/x86_64/entry.S
33267 + *
33268 + * Copyright (C) 1991, 1992 Linus Torvalds
33269 + * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs
33270 + * Copyright (C) 2000 Pavel Machek <pavel@suse.cz>
33271 + *
33272 + * $Id$
33273 + *
33274 + * Jun Nakajima <jun.nakajima@intel.com>
33275 + * Asit Mallick <asit.k.mallick@intel.com>
33276 + * Modified for Xen
33277 + */
33278 +
33279 +/*
33280 + * entry.S contains the system-call and fault low-level handling routines.
33281 + *
33282 + * NOTE: This code handles signal-recognition, which happens every time
33283 + * after an interrupt and after each system call.
33284 + *
33285 + * Normal syscalls and interrupts don't save a full stack frame, this is
33286 + * only done for syscall tracing, signals or fork/exec et.al.
33287 + *
33288 + * A note on terminology:
33289 + * - top of stack: Architecture defined interrupt frame from SS to RIP
33290 + * at the top of the kernel process stack.
33291 + * - partial stack frame: partially saved registers up to R11.
33292 + * - full stack frame: Like partial stack frame, but all registers saved.
33293 + *
33294 + * TODO:
33295 + * - schedule it carefully for the final hardware.
33296 + */
33297 +
33298 +#define ASSEMBLY 1
33299 +#include <linux/config.h>
33300 +#ifdef CONFIG_DEBUG_INFO
33301 +#undef CONFIG_DEBUG_INFO
33302 +#endif
33303 +#include <linux/linkage.h>
33304 +#include <asm/segment.h>
33305 +#include <asm/smp.h>
33306 +#include <asm/cache.h>
33307 +#include <asm/errno.h>
33308 +#include <asm/dwarf2.h>
33309 +#include <asm/calling.h>
33310 +#include <asm/asm-offsets.h>
33311 +#include <asm/msr.h>
33312 +#include <asm/unistd.h>
33313 +#include <asm/thread_info.h>
33314 +#include <asm/hw_irq.h>
33315 +#include <asm/page.h>
33316 +#include <asm/errno.h>
33317 +#include <xen/interface/arch-x86_64.h>
33318 +#include <xen/interface/features.h>
33319 +
33320 +#include "irq_vectors.h"
33321 +
33322 +#include "xen_entry.S"
33323 +
33324 + .code64
33325 +
33326 +#ifndef CONFIG_PREEMPT
33327 +#define retint_kernel retint_restore_args
33328 +#endif
33329 +
33330 +NMI_MASK = 0x80000000
33331 +
33332 +/*
33333 + * C code is not supposed to know about undefined top of stack. Every time
33334 + * a C function with a pt_regs argument is called from the SYSCALL based
33335 + * fast path FIXUP_TOP_OF_STACK is needed.
33336 + * RESTORE_TOP_OF_STACK syncs the syscall state after any possible ptregs
33337 + * manipulation.
33338 + */
33339 +
33340 + /* %rsp:at FRAMEEND */
33341 + .macro FIXUP_TOP_OF_STACK tmp
33342 + movq $__USER_CS,CS(%rsp)
33343 + movq $-1,RCX(%rsp)
33344 + .endm
33345 +
33346 + .macro RESTORE_TOP_OF_STACK tmp,offset=0
33347 + .endm
33348 +
33349 + .macro FAKE_STACK_FRAME child_rip
33350 + /* push in order ss, rsp, eflags, cs, rip */
33351 + xorl %eax, %eax
33352 + pushq %rax /* ss */
33353 + CFI_ADJUST_CFA_OFFSET 8
33354 + /*CFI_REL_OFFSET ss,0*/
33355 + pushq %rax /* rsp */
33356 + CFI_ADJUST_CFA_OFFSET 8
33357 + CFI_REL_OFFSET rsp,0
33358 + pushq $(1<<9) /* eflags - interrupts on */
33359 + CFI_ADJUST_CFA_OFFSET 8
33360 + /*CFI_REL_OFFSET rflags,0*/
33361 + pushq $__KERNEL_CS /* cs */
33362 + CFI_ADJUST_CFA_OFFSET 8
33363 + /*CFI_REL_OFFSET cs,0*/
33364 + pushq \child_rip /* rip */
33365 + CFI_ADJUST_CFA_OFFSET 8
33366 + CFI_REL_OFFSET rip,0
33367 + pushq %rax /* orig rax */
33368 + CFI_ADJUST_CFA_OFFSET 8
33369 + .endm
33370 +
33371 + .macro UNFAKE_STACK_FRAME
33372 + addq $8*6, %rsp
33373 + CFI_ADJUST_CFA_OFFSET -(6*8)
33374 + .endm
33375 +
33376 + .macro CFI_DEFAULT_STACK start=1
33377 + .if \start
33378 + CFI_STARTPROC simple
33379 + CFI_DEF_CFA rsp,SS+8
33380 + .else
33381 + CFI_DEF_CFA_OFFSET SS+8
33382 + .endif
33383 + CFI_REL_OFFSET r15,R15
33384 + CFI_REL_OFFSET r14,R14
33385 + CFI_REL_OFFSET r13,R13
33386 + CFI_REL_OFFSET r12,R12
33387 + CFI_REL_OFFSET rbp,RBP
33388 + CFI_REL_OFFSET rbx,RBX
33389 + CFI_REL_OFFSET r11,R11
33390 + CFI_REL_OFFSET r10,R10
33391 + CFI_REL_OFFSET r9,R9
33392 + CFI_REL_OFFSET r8,R8
33393 + CFI_REL_OFFSET rax,RAX
33394 + CFI_REL_OFFSET rcx,RCX
33395 + CFI_REL_OFFSET rdx,RDX
33396 + CFI_REL_OFFSET rsi,RSI
33397 + CFI_REL_OFFSET rdi,RDI
33398 + CFI_REL_OFFSET rip,RIP
33399 + /*CFI_REL_OFFSET cs,CS*/
33400 + /*CFI_REL_OFFSET rflags,EFLAGS*/
33401 + CFI_REL_OFFSET rsp,RSP
33402 + /*CFI_REL_OFFSET ss,SS*/
33403 + .endm
33404 +
33405 + /*
33406 + * Must be consistent with the definition in arch-x86_64.h:
33407 + * struct iret_context {
33408 + * u64 rax, r11, rcx, flags, rip, cs, rflags, rsp, ss;
33409 + * };
33410 + * #define VGCF_IN_SYSCALL (1<<8)
33411 + */
33412 + .macro HYPERVISOR_IRET flag
33413 + testb $3,1*8(%rsp)
33414 + jnz 2f
33415 + testl $NMI_MASK,2*8(%rsp)
33416 + jnz 2f
33417 +
33418 + testb $1,(xen_features+XENFEAT_supervisor_mode_kernel)
33419 + jnz 1f
33420 +
33421 + /* Direct iret to kernel space. Correct CS and SS. */
33422 + orb $3,1*8(%rsp)
33423 + orb $3,4*8(%rsp)
33424 +1: iretq
33425 +
33426 +2: /* Slow iret via hypervisor. */
33427 + andl $~NMI_MASK, 16(%rsp)
33428 + pushq $\flag
33429 + jmp hypercall_page + (__HYPERVISOR_iret * 32)
33430 + .endm
33431 +
33432 + .macro SWITCH_TO_KERNEL ssoff,adjust=0
33433 + jc 1f
33434 + orb $1,\ssoff-\adjust+4(%rsp)
33435 +1:
33436 + .endm
33437 +
33438 +/*
33439 + * A newly forked process directly context switches into this.
33440 + */
33441 +/* rdi: prev */
33442 +ENTRY(ret_from_fork)
33443 + CFI_DEFAULT_STACK
33444 + call schedule_tail
33445 + GET_THREAD_INFO(%rcx)
33446 + testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%rcx)
33447 + jnz rff_trace
33448 +rff_action:
33449 + RESTORE_REST
33450 + testl $3,CS-ARGOFFSET(%rsp) # from kernel_thread?
33451 + je int_ret_from_sys_call
33452 + testl $_TIF_IA32,threadinfo_flags(%rcx)
33453 + jnz int_ret_from_sys_call
33454 + RESTORE_TOP_OF_STACK %rdi,ARGOFFSET
33455 + jmp ret_from_sys_call
33456 +rff_trace:
33457 + movq %rsp,%rdi
33458 + call syscall_trace_leave
33459 + GET_THREAD_INFO(%rcx)
33460 + jmp rff_action
33461 + CFI_ENDPROC
33462 +
33463 +/*
33464 + * System call entry. Up to 6 arguments in registers are supported.
33465 + *
33466 + * SYSCALL does not save anything on the stack and does not change the
33467 + * stack pointer.
33468 + */
33469 +
33470 +/*
33471 + * Register setup:
33472 + * rax system call number
33473 + * rdi arg0
33474 + * rcx return address for syscall/sysret, C arg3
33475 + * rsi arg1
33476 + * rdx arg2
33477 + * r10 arg3 (--> moved to rcx for C)
33478 + * r8 arg4
33479 + * r9 arg5
33480 + * r11 eflags for syscall/sysret, temporary for C
33481 + * r12-r15,rbp,rbx saved by C code, not touched.
33482 + *
33483 + * Interrupts are off on entry.
33484 + * Only called from user space.
33485 + *
33486 + * XXX if we had a free scratch register we could save the RSP into the stack frame
33487 + * and report it properly in ps. Unfortunately we don't have one.
33488 + *
33489 + * When the user can change the frame, always force IRET. That is because
33490 + * it deals with non-canonical addresses better. SYSRET has trouble
33491 + * with them due to bugs in both AMD and Intel CPUs.
33492 + */
33493 +
33494 +ENTRY(system_call)
33495 + CFI_STARTPROC simple
33496 + CFI_DEF_CFA rsp,0
33497 + CFI_REGISTER rip,rcx
33498 + /*CFI_REGISTER rflags,r11*/
33499 + SAVE_ARGS -8,0
33500 + movq %rax,ORIG_RAX-ARGOFFSET(%rsp)
33501 + XEN_UNBLOCK_EVENTS(%r11)
33502 + GET_THREAD_INFO(%rcx)
33503 + testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%rcx)
33504 + CFI_REMEMBER_STATE
33505 + jnz tracesys
33506 + cmpq $__NR_syscall_max,%rax
33507 + ja badsys
33508 + movq %r10,%rcx
33509 + call *sys_call_table(,%rax,8) # XXX: rip relative
33510 + movq %rax,RAX-ARGOFFSET(%rsp)
33511 +/*
33512 + * Syscall return path ending with SYSRET (fast path)
33513 + * Has incomplete stack frame and undefined top of stack.
33514 + */
33515 + .globl ret_from_sys_call
33516 +ret_from_sys_call:
33517 + movl $_TIF_ALLWORK_MASK,%edi
33518 + /* edi: flagmask */
33519 +sysret_check:
33520 + GET_THREAD_INFO(%rcx)
33521 + XEN_BLOCK_EVENTS(%rsi)
33522 + movl threadinfo_flags(%rcx),%edx
33523 + andl %edi,%edx
33524 + CFI_REMEMBER_STATE
33525 + jnz sysret_careful
33526 + XEN_UNBLOCK_EVENTS(%rsi)
33527 + CFI_REGISTER rip,rcx
33528 + RESTORE_ARGS 0,8,0
33529 + /*CFI_REGISTER rflags,r11*/
33530 + HYPERVISOR_IRET VGCF_IN_SYSCALL
33531 +
33532 + /* Handle reschedules */
33533 + /* edx: work, edi: workmask */
33534 +sysret_careful:
33535 + CFI_RESTORE_STATE
33536 + bt $TIF_NEED_RESCHED,%edx
33537 + jnc sysret_signal
33538 + XEN_UNBLOCK_EVENTS(%rsi)
33539 + pushq %rdi
33540 + CFI_ADJUST_CFA_OFFSET 8
33541 + call schedule
33542 + popq %rdi
33543 + CFI_ADJUST_CFA_OFFSET -8
33544 + jmp sysret_check
33545 +
33546 + /* Handle a signal */
33547 +sysret_signal:
33548 +/* sti */
33549 + XEN_UNBLOCK_EVENTS(%rsi)
33550 + testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx
33551 + jz 1f
33552 +
33553 + /* Really a signal */
33554 + /* edx: work flags (arg3) */
33555 + leaq do_notify_resume(%rip),%rax
33556 + leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1
33557 + xorl %esi,%esi # oldset -> arg2
33558 + call ptregscall_common
33559 +1: movl $_TIF_NEED_RESCHED,%edi
33560 + /* Use IRET because user could have changed frame. This
33561 + works because ptregscall_common has called FIXUP_TOP_OF_STACK. */
33562 + XEN_BLOCK_EVENTS(%rsi)
33563 + jmp int_with_check
33564 +
33565 +badsys:
33566 + movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
33567 + jmp ret_from_sys_call
33568 +
33569 + /* Do syscall tracing */
33570 +tracesys:
33571 + CFI_RESTORE_STATE
33572 + SAVE_REST
33573 + movq $-ENOSYS,RAX(%rsp)
33574 + FIXUP_TOP_OF_STACK %rdi
33575 + movq %rsp,%rdi
33576 + call syscall_trace_enter
33577 + LOAD_ARGS ARGOFFSET /* reload args from stack in case ptrace changed it */
33578 + RESTORE_REST
33579 + cmpq $__NR_syscall_max,%rax
33580 + ja 1f
33581 + movq %r10,%rcx /* fixup for C */
33582 + call *sys_call_table(,%rax,8)
33583 +1: movq %rax,RAX-ARGOFFSET(%rsp)
33584 + /* Use IRET because user could have changed frame */
33585 + jmp int_ret_from_sys_call
33586 + CFI_ENDPROC
33587 +
33588 +/*
33589 + * Syscall return path ending with IRET.
33590 + * Has correct top of stack, but partial stack frame.
33591 + */
33592 +ENTRY(int_ret_from_sys_call)
33593 + CFI_STARTPROC simple
33594 + CFI_DEF_CFA rsp,SS+8-ARGOFFSET
33595 + /*CFI_REL_OFFSET ss,SS-ARGOFFSET*/
33596 + CFI_REL_OFFSET rsp,RSP-ARGOFFSET
33597 + /*CFI_REL_OFFSET rflags,EFLAGS-ARGOFFSET*/
33598 + /*CFI_REL_OFFSET cs,CS-ARGOFFSET*/
33599 + CFI_REL_OFFSET rip,RIP-ARGOFFSET
33600 + CFI_REL_OFFSET rdx,RDX-ARGOFFSET
33601 + CFI_REL_OFFSET rcx,RCX-ARGOFFSET
33602 + CFI_REL_OFFSET rax,RAX-ARGOFFSET
33603 + CFI_REL_OFFSET rdi,RDI-ARGOFFSET
33604 + CFI_REL_OFFSET rsi,RSI-ARGOFFSET
33605 + CFI_REL_OFFSET r8,R8-ARGOFFSET
33606 + CFI_REL_OFFSET r9,R9-ARGOFFSET
33607 + CFI_REL_OFFSET r10,R10-ARGOFFSET
33608 + CFI_REL_OFFSET r11,R11-ARGOFFSET
33609 + XEN_BLOCK_EVENTS(%rsi)
33610 + testb $3,CS-ARGOFFSET(%rsp)
33611 + jnz 1f
33612 + /* Need to set the proper %ss (not NULL) for ring 3 iretq */
33613 + movl $__KERNEL_DS,SS-ARGOFFSET(%rsp)
33614 + jmp retint_restore_args # return from ring3 kernel
33615 +1:
33616 + movl $_TIF_ALLWORK_MASK,%edi
33617 + /* edi: mask to check */
33618 +int_with_check:
33619 + GET_THREAD_INFO(%rcx)
33620 + movl threadinfo_flags(%rcx),%edx
33621 + andl %edi,%edx
33622 + jnz int_careful
33623 + andl $~TS_COMPAT,threadinfo_status(%rcx)
33624 + jmp retint_restore_args
33625 +
33626 + /* Either reschedule or signal or syscall exit tracking needed. */
33627 + /* First do a reschedule test. */
33628 + /* edx: work, edi: workmask */
33629 +int_careful:
33630 + bt $TIF_NEED_RESCHED,%edx
33631 + jnc int_very_careful
33632 +/* sti */
33633 + XEN_UNBLOCK_EVENTS(%rsi)
33634 + pushq %rdi
33635 + CFI_ADJUST_CFA_OFFSET 8
33636 + call schedule
33637 + popq %rdi
33638 + CFI_ADJUST_CFA_OFFSET -8
33639 + XEN_BLOCK_EVENTS(%rsi)
33640 + jmp int_with_check
33641 +
33642 + /* handle signals and tracing -- both require a full stack frame */
33643 +int_very_careful:
33644 +/* sti */
33645 + XEN_UNBLOCK_EVENTS(%rsi)
33646 + SAVE_REST
33647 + /* Check for syscall exit trace */
33648 + testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edx
33649 + jz int_signal
33650 + pushq %rdi
33651 + CFI_ADJUST_CFA_OFFSET 8
33652 + leaq 8(%rsp),%rdi # &ptregs -> arg1
33653 + call syscall_trace_leave
33654 + popq %rdi
33655 + CFI_ADJUST_CFA_OFFSET -8
33656 + andl $~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edi
33657 + XEN_BLOCK_EVENTS(%rsi)
33658 + jmp int_restore_rest
33659 +
33660 +int_signal:
33661 + testl $(_TIF_NOTIFY_RESUME|_TIF_SIGPENDING|_TIF_SINGLESTEP),%edx
33662 + jz 1f
33663 + movq %rsp,%rdi # &ptregs -> arg1
33664 + xorl %esi,%esi # oldset -> arg2
33665 + call do_notify_resume
33666 +1: movl $_TIF_NEED_RESCHED,%edi
33667 +int_restore_rest:
33668 + RESTORE_REST
33669 + XEN_BLOCK_EVENTS(%rsi)
33670 + jmp int_with_check
33671 + CFI_ENDPROC
33672 +
33673 +/*
33674 + * Certain special system calls need to save a complete full stack frame.
33675 + */
33676 +
33677 + .macro PTREGSCALL label,func,arg
33678 + .globl \label
33679 +\label:
33680 + leaq \func(%rip),%rax
33681 + leaq -ARGOFFSET+8(%rsp),\arg /* 8 for return address */
33682 + jmp ptregscall_common
33683 + .endm
33684 +
33685 + CFI_STARTPROC
33686 +
33687 + PTREGSCALL stub_clone, sys_clone, %r8
33688 + PTREGSCALL stub_fork, sys_fork, %rdi
33689 + PTREGSCALL stub_vfork, sys_vfork, %rdi
33690 + PTREGSCALL stub_rt_sigsuspend, sys_rt_sigsuspend, %rdx
33691 + PTREGSCALL stub_sigaltstack, sys_sigaltstack, %rdx
33692 + PTREGSCALL stub_iopl, sys_iopl, %rsi
33693 +
33694 +ENTRY(ptregscall_common)
33695 + popq %r11
33696 + CFI_ADJUST_CFA_OFFSET -8
33697 + CFI_REGISTER rip, r11
33698 + SAVE_REST
33699 + movq %r11, %r15
33700 + CFI_REGISTER rip, r15
33701 + FIXUP_TOP_OF_STACK %r11
33702 + call *%rax
33703 + RESTORE_TOP_OF_STACK %r11
33704 + movq %r15, %r11
33705 + CFI_REGISTER rip, r11
33706 + RESTORE_REST
33707 + pushq %r11
33708 + CFI_ADJUST_CFA_OFFSET 8
33709 + CFI_REL_OFFSET rip, 0
33710 + ret
33711 + CFI_ENDPROC
33712 +
33713 +ENTRY(stub_execve)
33714 + CFI_STARTPROC
33715 + popq %r11
33716 + CFI_ADJUST_CFA_OFFSET -8
33717 + CFI_REGISTER rip, r11
33718 + SAVE_REST
33719 + FIXUP_TOP_OF_STACK %r11
33720 + call sys_execve
33721 + RESTORE_TOP_OF_STACK %r11
33722 + movq %rax,RAX(%rsp)
33723 + RESTORE_REST
33724 + jmp int_ret_from_sys_call
33725 + CFI_ENDPROC
33726 +
33727 +/*
33728 + * sigreturn is special because it needs to restore all registers on return.
33729 + * This cannot be done with SYSRET, so use the IRET return path instead.
33730 + */
33731 +ENTRY(stub_rt_sigreturn)
33732 + CFI_STARTPROC
33733 + addq $8, %rsp
33734 + CFI_ADJUST_CFA_OFFSET -8
33735 + SAVE_REST
33736 + movq %rsp,%rdi
33737 + FIXUP_TOP_OF_STACK %r11
33738 + call sys_rt_sigreturn
33739 + movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer
33740 + RESTORE_REST
33741 + jmp int_ret_from_sys_call
33742 + CFI_ENDPROC
33743 +
33744 +/*
33745 + * initial frame state for interrupts and exceptions
33746 + */
33747 + .macro _frame ref
33748 + CFI_STARTPROC simple
33749 + CFI_DEF_CFA rsp,SS+8-\ref
33750 + /*CFI_REL_OFFSET ss,SS-\ref*/
33751 + CFI_REL_OFFSET rsp,RSP-\ref
33752 + /*CFI_REL_OFFSET rflags,EFLAGS-\ref*/
33753 + /*CFI_REL_OFFSET cs,CS-\ref*/
33754 + CFI_REL_OFFSET rip,RIP-\ref
33755 + .endm
33756 +
33757 +/* initial frame state for interrupts (and exceptions without error code) */
33758 +#define INTR_FRAME _frame RIP
33759 +/* initial frame state for exceptions with error code (and interrupts with
33760 + vector already pushed) */
33761 +#define XCPT_FRAME _frame ORIG_RAX
33762 +
33763 +/*
33764 + * Interrupt exit.
33765 + *
33766 + */
33767 +
33768 +retint_check:
33769 + movl threadinfo_flags(%rcx),%edx
33770 + andl %edi,%edx
33771 + CFI_REMEMBER_STATE
33772 + jnz retint_careful
33773 +retint_restore_args:
33774 + movl EFLAGS-REST_SKIP(%rsp), %eax
33775 + shr $9, %eax # EAX[0] == IRET_EFLAGS.IF
33776 + XEN_GET_VCPU_INFO(%rsi)
33777 + andb evtchn_upcall_mask(%rsi),%al
33778 + andb $1,%al # EAX[0] == IRET_EFLAGS.IF & event_mask
33779 + jnz restore_all_enable_events # != 0 => enable event delivery
33780 + XEN_PUT_VCPU_INFO(%rsi)
33781 +
33782 + RESTORE_ARGS 0,8,0
33783 + HYPERVISOR_IRET 0
33784 +
33785 + /* edi: workmask, edx: work */
33786 +retint_careful:
33787 + CFI_RESTORE_STATE
33788 + bt $TIF_NEED_RESCHED,%edx
33789 + jnc retint_signal
33790 + XEN_UNBLOCK_EVENTS(%rsi)
33791 +/* sti */
33792 + pushq %rdi
33793 + CFI_ADJUST_CFA_OFFSET 8
33794 + call schedule
33795 + popq %rdi
33796 + CFI_ADJUST_CFA_OFFSET -8
33797 + GET_THREAD_INFO(%rcx)
33798 + XEN_BLOCK_EVENTS(%rsi)
33799 +/* cli */
33800 + jmp retint_check
33801 +
33802 +retint_signal:
33803 + testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx
33804 + jz retint_restore_args
33805 + XEN_UNBLOCK_EVENTS(%rsi)
33806 + SAVE_REST
33807 + movq $-1,ORIG_RAX(%rsp)
33808 + xorl %esi,%esi # oldset
33809 + movq %rsp,%rdi # &pt_regs
33810 + call do_notify_resume
33811 + RESTORE_REST
33812 + XEN_BLOCK_EVENTS(%rsi)
33813 + movl $_TIF_NEED_RESCHED,%edi
33814 + GET_THREAD_INFO(%rcx)
33815 + jmp retint_check
33816 +
33817 +#ifdef CONFIG_PREEMPT
33818 + /* Returning to kernel space. Check if we need preemption */
33819 + /* rcx: threadinfo. interrupts off. */
33820 + .p2align
33821 +retint_kernel:
33822 + cmpl $0,threadinfo_preempt_count(%rcx)
33823 + jnz retint_restore_args
33824 + bt $TIF_NEED_RESCHED,threadinfo_flags(%rcx)
33825 + jnc retint_restore_args
33826 + bt $9,EFLAGS-ARGOFFSET(%rsp) /* interrupts off? */
33827 + jnc retint_restore_args
33828 + call preempt_schedule_irq
33829 + jmp retint_kernel /* check again */
33830 +#endif
33831 + CFI_ENDPROC
33832 +
33833 +/*
33834 + * APIC interrupts.
33835 + */
33836 + .macro apicinterrupt num,func
33837 + INTR_FRAME
33838 + pushq $~(\num)
33839 + CFI_ADJUST_CFA_OFFSET 8
33840 + interrupt \func
33841 + jmp error_entry
33842 + CFI_ENDPROC
33843 + .endm
33844 +
33845 +#ifndef CONFIG_XEN
33846 +ENTRY(thermal_interrupt)
33847 + apicinterrupt THERMAL_APIC_VECTOR,smp_thermal_interrupt
33848 +
33849 +ENTRY(threshold_interrupt)
33850 + apicinterrupt THRESHOLD_APIC_VECTOR,mce_threshold_interrupt
33851 +
33852 +#ifdef CONFIG_SMP
33853 +ENTRY(reschedule_interrupt)
33854 + apicinterrupt RESCHEDULE_VECTOR,smp_reschedule_interrupt
33855 +
33856 + .macro INVALIDATE_ENTRY num
33857 +ENTRY(invalidate_interrupt\num)
33858 + apicinterrupt INVALIDATE_TLB_VECTOR_START+\num,smp_invalidate_interrupt
33859 + .endm
33860 +
33861 + INVALIDATE_ENTRY 0
33862 + INVALIDATE_ENTRY 1
33863 + INVALIDATE_ENTRY 2
33864 + INVALIDATE_ENTRY 3
33865 + INVALIDATE_ENTRY 4
33866 + INVALIDATE_ENTRY 5
33867 + INVALIDATE_ENTRY 6
33868 + INVALIDATE_ENTRY 7
33869 +
33870 +ENTRY(call_function_interrupt)
33871 + apicinterrupt CALL_FUNCTION_VECTOR,smp_call_function_interrupt
33872 +#endif
33873 +
33874 +#ifdef CONFIG_X86_LOCAL_APIC
33875 +ENTRY(apic_timer_interrupt)
33876 + apicinterrupt LOCAL_TIMER_VECTOR,smp_apic_timer_interrupt
33877 +
33878 +ENTRY(error_interrupt)
33879 + apicinterrupt ERROR_APIC_VECTOR,smp_error_interrupt
33880 +
33881 +ENTRY(spurious_interrupt)
33882 + apicinterrupt SPURIOUS_APIC_VECTOR,smp_spurious_interrupt
33883 +#endif
33884 +#endif /* !CONFIG_XEN */
33885 +
33886 +/*
33887 + * Exception entry points.
33888 + */
33889 + .macro zeroentry sym
33890 + INTR_FRAME
33891 + movq (%rsp),%rcx
33892 + movq 8(%rsp),%r11
33893 + addq $0x10,%rsp /* skip rcx and r11 */
33894 + pushq $0 /* push error code/oldrax */
33895 + CFI_ADJUST_CFA_OFFSET 8
33896 + pushq %rax /* push real oldrax to the rdi slot */
33897 + CFI_ADJUST_CFA_OFFSET 8
33898 + leaq \sym(%rip),%rax
33899 + jmp error_entry
33900 + CFI_ENDPROC
33901 + .endm
33902 +
33903 + .macro errorentry sym
33904 + XCPT_FRAME
33905 + movq (%rsp),%rcx
33906 + movq 8(%rsp),%r11
33907 + addq $0x10,%rsp /* rsp points to the error code */
33908 + pushq %rax
33909 + CFI_ADJUST_CFA_OFFSET 8
33910 + leaq \sym(%rip),%rax
33911 + jmp error_entry
33912 + CFI_ENDPROC
33913 + .endm
33914 +
33915 +#if 0 /* not XEN */
33916 + /* error code is on the stack already */
33917 + /* handle NMI like exceptions that can happen everywhere */
33918 + .macro paranoidentry sym, ist=0
33919 + movq (%rsp),%rcx
33920 + movq 8(%rsp),%r11
33921 + addq $0x10,%rsp /* skip rcx and r11 */
33922 + SAVE_ALL
33923 + cld
33924 +#if 0 /* not XEN */
33925 + movl $1,%ebx
33926 + movl $MSR_GS_BASE,%ecx
33927 + rdmsr
33928 + testl %edx,%edx
33929 + js 1f
33930 + swapgs
33931 + xorl %ebx,%ebx
33932 +1:
33933 +#endif
33934 + .if \ist
33935 + movq %gs:pda_data_offset, %rbp
33936 + .endif
33937 + movq %rsp,%rdi
33938 + movq ORIG_RAX(%rsp),%rsi
33939 + movq $-1,ORIG_RAX(%rsp)
33940 + .if \ist
33941 + subq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp)
33942 + .endif
33943 + call \sym
33944 + .if \ist
33945 + addq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp)
33946 + .endif
33947 +/* cli */
33948 + XEN_BLOCK_EVENTS(%rsi)
33949 + .endm
33950 +#endif
33951 +
33952 +/*
33953 + * Exception entry point. This expects an error code/orig_rax on the stack
33954 + * and the exception handler in %rax.
33955 + */
33956 +ENTRY(error_entry)
33957 + _frame RDI
33958 + /* rdi slot contains rax, oldrax contains error code */
33959 + cld
33960 + subq $14*8,%rsp
33961 + CFI_ADJUST_CFA_OFFSET (14*8)
33962 + movq %rsi,13*8(%rsp)
33963 + CFI_REL_OFFSET rsi,RSI
33964 + movq 14*8(%rsp),%rsi /* load rax from rdi slot */
33965 + movq %rdx,12*8(%rsp)
33966 + CFI_REL_OFFSET rdx,RDX
33967 + movq %rcx,11*8(%rsp)
33968 + CFI_REL_OFFSET rcx,RCX
33969 + movq %rsi,10*8(%rsp) /* store rax */
33970 + CFI_REL_OFFSET rax,RAX
33971 + movq %r8, 9*8(%rsp)
33972 + CFI_REL_OFFSET r8,R8
33973 + movq %r9, 8*8(%rsp)
33974 + CFI_REL_OFFSET r9,R9
33975 + movq %r10,7*8(%rsp)
33976 + CFI_REL_OFFSET r10,R10
33977 + movq %r11,6*8(%rsp)
33978 + CFI_REL_OFFSET r11,R11
33979 + movq %rbx,5*8(%rsp)
33980 + CFI_REL_OFFSET rbx,RBX
33981 + movq %rbp,4*8(%rsp)
33982 + CFI_REL_OFFSET rbp,RBP
33983 + movq %r12,3*8(%rsp)
33984 + CFI_REL_OFFSET r12,R12
33985 + movq %r13,2*8(%rsp)
33986 + CFI_REL_OFFSET r13,R13
33987 + movq %r14,1*8(%rsp)
33988 + CFI_REL_OFFSET r14,R14
33989 + movq %r15,(%rsp)
33990 + CFI_REL_OFFSET r15,R15
33991 +#if 0
33992 + cmpl $__KERNEL_CS,CS(%rsp)
33993 + je error_kernelspace
33994 +#endif
33995 +error_call_handler:
33996 + movq %rdi, RDI(%rsp)
33997 + movq %rsp,%rdi
33998 + movq ORIG_RAX(%rsp),%rsi # get error code
33999 + movq $-1,ORIG_RAX(%rsp)
34000 + call *%rax
34001 +error_exit:
34002 + RESTORE_REST
34003 +/* cli */
34004 + XEN_BLOCK_EVENTS(%rsi)
34005 + GET_THREAD_INFO(%rcx)
34006 + testb $3,CS-ARGOFFSET(%rsp)
34007 + jz retint_kernel
34008 + movl threadinfo_flags(%rcx),%edx
34009 + movl $_TIF_WORK_MASK,%edi
34010 + andl %edi,%edx
34011 + jnz retint_careful
34012 + jmp retint_restore_args
34013 +
34014 +error_kernelspace:
34015 + /*
34016 + * We need to re-write the logic here because we don't do iretq
34017 + * to return to user mode. It's still possible that we get trap/fault
34018 + * in the kernel (when accessing buffers pointed to by system calls,
34019 + * for example).
34020 + *
34021 + */
34022 +#if 0
34023 + incl %ebx
34024 + /* There are two places in the kernel that can potentially fault with
34025 + usergs. Handle them here. The exception handlers after
34026 + iret run with kernel gs again, so don't set the user space flag.
34027 + B stepping K8s sometimes report a truncated RIP for IRET
34028 + exceptions returning to compat mode. Check for these here too. */
34029 + leaq iret_label(%rip),%rbp
34030 + cmpq %rbp,RIP(%rsp)
34031 + je error_swapgs
34032 + movl %ebp,%ebp /* zero extend */
34033 + cmpq %rbp,RIP(%rsp)
34034 + je error_swapgs
34035 + cmpq $gs_change,RIP(%rsp)
34036 + je error_swapgs
34037 + jmp error_sti
34038 +#endif
34039 +
34040 +ENTRY(hypervisor_callback)
34041 + zeroentry do_hypervisor_callback
34042 +
34043 +/*
34044 + * Copied from arch/xen/i386/kernel/entry.S
34045 + */
34046 +# A note on the "critical region" in our callback handler.
34047 +# We want to avoid stacking callback handlers due to events occurring
34048 +# during handling of the last event. To do this, we keep events disabled
34049 +# until we've done all processing. HOWEVER, we must enable events before
34050 +# popping the stack frame (can't be done atomically) and so it would still
34051 +# be possible to get enough handler activations to overflow the stack.
34052 +# Although unlikely, bugs of that kind are hard to track down, so we'd
34053 +# like to avoid the possibility.
34054 +# So, on entry to the handler we detect whether we interrupted an
34055 +# existing activation in its critical region -- if so, we pop the current
34056 +# activation and restart the handler using the previous one.
34057 +ENTRY(do_hypervisor_callback) # do_hypervisor_callback(struct pt_regs *)
34058 +# Since we don't modify %rdi, evtchn_do_upcall(struct pt_regs *) will
34059 +# see the correct pointer to the pt_regs
34060 + movq %rdi, %rsp # we don't return, adjust the stack frame
34061 +11: movq %gs:pda_irqstackptr,%rax
34062 + incl %gs:pda_irqcount
34063 + cmovzq %rax,%rsp
34064 + pushq %rdi
34065 + call evtchn_do_upcall
34066 + popq %rsp
34067 + decl %gs:pda_irqcount
34068 + jmp error_exit
34069 +
34070 +#ifdef CONFIG_X86_LOCAL_APIC
34071 +KPROBE_ENTRY(nmi)
34072 + zeroentry do_nmi_callback
34073 +ENTRY(do_nmi_callback)
34074 + addq $8, %rsp
34075 + call do_nmi
34076 + orl $NMI_MASK,EFLAGS(%rsp)
34077 + RESTORE_REST
34078 + XEN_BLOCK_EVENTS(%rsi)
34079 + GET_THREAD_INFO(%rcx)
34080 + jmp retint_restore_args
34081 + .previous .text
34082 +#endif
34083 +
34084 + ALIGN
34085 +restore_all_enable_events:
34086 + XEN_UNBLOCK_EVENTS(%rsi) # %rsi is already set up...
34087 +
34088 +scrit: /**** START OF CRITICAL REGION ****/
34089 + XEN_TEST_PENDING(%rsi)
34090 + jnz 14f # process more events if necessary...
34091 + XEN_PUT_VCPU_INFO(%rsi)
34092 + RESTORE_ARGS 0,8,0
34093 + HYPERVISOR_IRET 0
34094 +
34095 +14: XEN_LOCKED_BLOCK_EVENTS(%rsi)
34096 + XEN_PUT_VCPU_INFO(%rsi)
34097 + SAVE_REST
34098 + movq %rsp,%rdi # set the argument again
34099 + jmp 11b
34100 +ecrit: /**** END OF CRITICAL REGION ****/
34101 +# At this point, unlike on x86-32, we don't do the fixup: it keeps the code
34102 +# simpler, and the stack frame is more complex on x86-64.
34103 +# When the kernel is interrupted in the critical section, it simply
34104 +# does an IRET, and everything is restored at that point,
34105 +# i.e. it resumes at the interrupted instruction with the same context.
34106 +
34107 +# Hypervisor uses this for application faults while it executes.
34108 +# We get here for two reasons:
34109 +# 1. Fault while reloading DS, ES, FS or GS
34110 +# 2. Fault while executing IRET
34111 +# Category 1 we do not need to fix up as Xen has already reloaded all segment
34112 +# registers that could be reloaded and zeroed the others.
34113 +# Category 2 we fix up by killing the current process. We cannot use the
34114 +# normal Linux return path in this case because if we use the IRET hypercall
34115 +# to pop the stack frame we end up in an infinite loop of failsafe callbacks.
34116 +# We distinguish between categories by comparing each saved segment register
34117 +# with its current contents: any discrepancy means we are in category 1.
34118 +ENTRY(failsafe_callback)
34119 + movw %ds,%cx
34120 + cmpw %cx,0x10(%rsp)
34121 + jne 1f
34122 + movw %es,%cx
34123 + cmpw %cx,0x18(%rsp)
34124 + jne 1f
34125 + movw %fs,%cx
34126 + cmpw %cx,0x20(%rsp)
34127 + jne 1f
34128 + movw %gs,%cx
34129 + cmpw %cx,0x28(%rsp)
34130 + jne 1f
34131 + /* All segments match their saved values => Category 2 (Bad IRET). */
34132 + movq (%rsp),%rcx
34133 + movq 8(%rsp),%r11
34134 + addq $0x30,%rsp
34135 + movq $-9999,%rdi /* better code? */
34136 + jmp do_exit
34137 +1: /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */
34138 + movq (%rsp),%rcx
34139 + movq 8(%rsp),%r11
34140 + addq $0x30,%rsp
34141 + pushq $0
34142 + SAVE_ALL
34143 + jmp error_exit
34144 +#if 0
34145 + .section __ex_table,"a"
34146 + .align 8
34147 + .quad gs_change,bad_gs
34148 + .previous
34149 + .section .fixup,"ax"
34150 + /* running with kernelgs */
34151 +bad_gs:
34152 +/* swapgs */ /* switch back to user gs */
34153 + xorl %eax,%eax
34154 + movl %eax,%gs
34155 + jmp 2b
34156 + .previous
34157 +#endif
34158 +
34159 +/*
34160 + * Create a kernel thread.
34161 + *
34162 + * C extern interface:
34163 + * extern long kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
34164 + *
34165 + * asm input arguments:
34166 + * rdi: fn, rsi: arg, rdx: flags
34167 + */
34168 +ENTRY(kernel_thread)
34169 + CFI_STARTPROC
34170 + FAKE_STACK_FRAME $child_rip
34171 + SAVE_ALL
34172 +
34173 + # rdi: flags, rsi: usp, rdx: will be &pt_regs
34174 + movq %rdx,%rdi
34175 + orq kernel_thread_flags(%rip),%rdi
34176 + movq $-1, %rsi
34177 + movq %rsp, %rdx
34178 +
34179 + xorl %r8d,%r8d
34180 + xorl %r9d,%r9d
34181 +
34182 + # clone now
34183 + call do_fork
34184 + movq %rax,RAX(%rsp)
34185 + xorl %edi,%edi
34186 +
34187 + /*
34188 + * It isn't worth checking for reschedule here,
34189 + * so internally to the x86_64 port you can rely on kernel_thread()
34190 + * not to reschedule the child before returning, this avoids the need
34191 + * of hacks for example to fork off the per-CPU idle tasks.
34192 + * [Hopefully no generic code relies on the reschedule -AK]
34193 + */
34194 + RESTORE_ALL
34195 + UNFAKE_STACK_FRAME
34196 + ret
34197 + CFI_ENDPROC
34198 +
34199 +
34200 +child_rip:
34201 + /*
34202 + * Here we are in the child and the registers are set as they were
34203 + * at kernel_thread() invocation in the parent.
34204 + */
34205 + movq %rdi, %rax
34206 + movq %rsi, %rdi
34207 + call *%rax
34208 + # exit
34209 + xorl %edi, %edi
34210 + call do_exit
34211 +
34212 +/*
34213 + * execve(). This function needs to use IRET, not SYSRET, to set up all state properly.
34214 + *
34215 + * C extern interface:
34216 + * extern long execve(char *name, char **argv, char **envp)
34217 + *
34218 + * asm input arguments:
34219 + * rdi: name, rsi: argv, rdx: envp
34220 + *
34221 + * We want to fall back into:
34222 + * extern long sys_execve(char *name, char **argv,char **envp, struct pt_regs regs)
34223 + *
34224 + * do_sys_execve asm fallback arguments:
34225 + * rdi: name, rsi: argv, rdx: envp, fake frame on the stack
34226 + */
34227 +ENTRY(execve)
34228 + CFI_STARTPROC
34229 + FAKE_STACK_FRAME $0
34230 + SAVE_ALL
34231 + call sys_execve
34232 + movq %rax, RAX(%rsp)
34233 + RESTORE_REST
34234 + testq %rax,%rax
34235 + jne 1f
34236 + jmp int_ret_from_sys_call
34237 +1: RESTORE_ARGS
34238 + UNFAKE_STACK_FRAME
34239 + ret
34240 + CFI_ENDPROC
34241 +
34242 +KPROBE_ENTRY(page_fault)
34243 + errorentry do_page_fault
34244 + .previous .text
34245 +
34246 +ENTRY(coprocessor_error)
34247 + zeroentry do_coprocessor_error
34248 +
34249 +ENTRY(simd_coprocessor_error)
34250 + zeroentry do_simd_coprocessor_error
34251 +
34252 +ENTRY(device_not_available)
34253 + zeroentry math_state_restore
34254 +
34255 + /* runs on exception stack */
34256 +KPROBE_ENTRY(debug)
34257 + INTR_FRAME
34258 +/* pushq $0
34259 + CFI_ADJUST_CFA_OFFSET 8 */
34260 + zeroentry do_debug
34261 +/* jmp paranoid_exit */
34262 + CFI_ENDPROC
34263 + .previous .text
34264 +
34265 +#if 0
34266 + /* runs on exception stack */
34267 +KPROBE_ENTRY(nmi)
34268 + INTR_FRAME
34269 + pushq $-1
34270 + CFI_ADJUST_CFA_OFFSET 8
34271 + paranoidentry do_nmi
34272 + /*
34273 + * "Paranoid" exit path from exception stack.
34274 + * Paranoid because this is used by NMIs and cannot take
34275 + * any kernel state for granted.
34276 + * We don't do kernel preemption checks here, because only
34277 + * NMI should be common and it does not enable IRQs and
34278 + * cannot get reschedule ticks.
34279 + */
34280 + /* ebx: no swapgs flag */
34281 +paranoid_exit:
34282 + testl %ebx,%ebx /* swapgs needed? */
34283 + jnz paranoid_restore
34284 + testl $3,CS(%rsp)
34285 + jnz paranoid_userspace
34286 +paranoid_swapgs:
34287 + swapgs
34288 +paranoid_restore:
34289 + RESTORE_ALL 8
34290 + iretq
34291 +paranoid_userspace:
34292 + GET_THREAD_INFO(%rcx)
34293 + movl threadinfo_flags(%rcx),%ebx
34294 + andl $_TIF_WORK_MASK,%ebx
34295 + jz paranoid_swapgs
34296 + movq %rsp,%rdi /* &pt_regs */
34297 + call sync_regs
34298 + movq %rax,%rsp /* switch stack for scheduling */
34299 + testl $_TIF_NEED_RESCHED,%ebx
34300 + jnz paranoid_schedule
34301 + movl %ebx,%edx /* arg3: thread flags */
34302 + sti
34303 + xorl %esi,%esi /* arg2: oldset */
34304 + movq %rsp,%rdi /* arg1: &pt_regs */
34305 + call do_notify_resume
34306 + cli
34307 + jmp paranoid_userspace
34308 +paranoid_schedule:
34309 + sti
34310 + call schedule
34311 + cli
34312 + jmp paranoid_userspace
34313 + CFI_ENDPROC
34314 + .previous .text
34315 +#endif
34316 +
34317 +KPROBE_ENTRY(int3)
34318 + INTR_FRAME
34319 +/* pushq $0
34320 + CFI_ADJUST_CFA_OFFSET 8 */
34321 + zeroentry do_int3
34322 +/* jmp paranoid_exit */
34323 + CFI_ENDPROC
34324 + .previous .text
34325 +
34326 +ENTRY(overflow)
34327 + zeroentry do_overflow
34328 +
34329 +ENTRY(bounds)
34330 + zeroentry do_bounds
34331 +
34332 +ENTRY(invalid_op)
34333 + zeroentry do_invalid_op
34334 +
34335 +ENTRY(coprocessor_segment_overrun)
34336 + zeroentry do_coprocessor_segment_overrun
34337 +
34338 +ENTRY(reserved)
34339 + zeroentry do_reserved
34340 +
34341 +#if 0
34342 + /* runs on exception stack */
34343 +ENTRY(double_fault)
34344 + XCPT_FRAME
34345 + paranoidentry do_double_fault
34346 + jmp paranoid_exit
34347 + CFI_ENDPROC
34348 +#endif
34349 +
34350 +ENTRY(invalid_TSS)
34351 + errorentry do_invalid_TSS
34352 +
34353 +ENTRY(segment_not_present)
34354 + errorentry do_segment_not_present
34355 +
34356 + /* runs on exception stack */
34357 +ENTRY(stack_segment)
34358 + XCPT_FRAME
34359 + errorentry do_stack_segment
34360 + CFI_ENDPROC
34361 +
34362 +KPROBE_ENTRY(general_protection)
34363 + errorentry do_general_protection
34364 + .previous .text
34365 +
34366 +ENTRY(alignment_check)
34367 + errorentry do_alignment_check
34368 +
34369 +ENTRY(divide_error)
34370 + zeroentry do_divide_error
34371 +
34372 +ENTRY(spurious_interrupt_bug)
34373 + zeroentry do_spurious_interrupt_bug
34374 +
34375 +#ifdef CONFIG_X86_MCE
34376 + /* runs on exception stack */
34377 +ENTRY(machine_check)
34378 + INTR_FRAME
34379 + pushq $0
34380 + CFI_ADJUST_CFA_OFFSET 8
34381 + paranoidentry do_machine_check
34382 + jmp paranoid_exit
34383 + CFI_ENDPROC
34384 +#endif
34385 +
34386 +ENTRY(call_softirq)
34387 + CFI_STARTPROC
34388 + movq %gs:pda_irqstackptr,%rax
34389 + movq %rsp,%rdx
34390 + CFI_DEF_CFA_REGISTER rdx
34391 + incl %gs:pda_irqcount
34392 + cmove %rax,%rsp
34393 + pushq %rdx
34394 + /*todo CFI_DEF_CFA_EXPRESSION ...*/
34395 + call __do_softirq
34396 + popq %rsp
34397 + CFI_DEF_CFA_REGISTER rsp
34398 + decl %gs:pda_irqcount
34399 + ret
34400 + CFI_ENDPROC
34401 diff -Nur linux-2.6.16.33-noxen/arch/x86_64/kernel/entry.S linux-2.6.16.33/arch/x86_64/kernel/entry.S
34402 --- linux-2.6.16.33-noxen/arch/x86_64/kernel/entry.S 2006-11-22 18:06:31.000000000 +0000
34403 +++ linux-2.6.16.33/arch/x86_64/kernel/entry.S 2007-05-23 21:00:01.000000000 +0000
34404 @@ -596,7 +596,7 @@
34405 */
34406 .macro apicinterrupt num,func
34407 INTR_FRAME
34408 - pushq $\num-256
34409 + pushq $~(\num)
34410 CFI_ADJUST_CFA_OFFSET 8
34411 interrupt \func
34412 jmp ret_from_intr
34413 diff -Nur linux-2.6.16.33-noxen/arch/x86_64/kernel/genapic-xen.c linux-2.6.16.33/arch/x86_64/kernel/genapic-xen.c
34414 --- linux-2.6.16.33-noxen/arch/x86_64/kernel/genapic-xen.c 1970-01-01 00:00:00.000000000 +0000
34415 +++ linux-2.6.16.33/arch/x86_64/kernel/genapic-xen.c 2007-01-08 15:00:45.000000000 +0000
34416 @@ -0,0 +1,144 @@
34417 +/*
34418 + * Copyright 2004 James Cleverdon, IBM.
34419 + * Subject to the GNU Public License, v.2
34420 + *
34421 + * Generic APIC sub-arch probe layer.
34422 + *
34423 + * Hacked for x86-64 by James Cleverdon from i386 architecture code by
34424 + * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and
34425 + * James Cleverdon.
34426 + */
34427 +#include <linux/config.h>
34428 +#include <linux/threads.h>
34429 +#include <linux/cpumask.h>
34430 +#include <linux/string.h>
34431 +#include <linux/kernel.h>
34432 +#include <linux/ctype.h>
34433 +#include <linux/init.h>
34434 +#include <linux/module.h>
34435 +
34436 +#include <asm/smp.h>
34437 +#include <asm/ipi.h>
34438 +
34439 +#if defined(CONFIG_ACPI)
34440 +#include <acpi/acpi_bus.h>
34441 +#endif
34442 +
34443 +/* which logical CPU number maps to which CPU (physical APIC ID) */
34444 +u8 x86_cpu_to_apicid[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICID };
34445 +EXPORT_SYMBOL(x86_cpu_to_apicid);
34446 +u8 x86_cpu_to_log_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
34447 +
34448 +extern struct genapic apic_cluster;
34449 +extern struct genapic apic_flat;
34450 +extern struct genapic apic_physflat;
34451 +
34452 +#ifndef CONFIG_XEN
34453 +struct genapic *genapic = &apic_flat;
34454 +#else
34455 +extern struct genapic apic_xen;
34456 +struct genapic *genapic = &apic_xen;
34457 +#endif
34458 +
34459 +
34460 +/*
34461 + * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode.
34462 + */
34463 +void __init clustered_apic_check(void)
34464 +{
34465 +#ifndef CONFIG_XEN
34466 + long i;
34467 + u8 clusters, max_cluster;
34468 + u8 id;
34469 + u8 cluster_cnt[NUM_APIC_CLUSTERS];
34470 + int max_apic = 0;
34471 +
34472 +#if defined(CONFIG_ACPI)
34473 + /*
34474 + * Some x86_64 machines use physical APIC mode regardless of how many
34475 + * procs/clusters are present (x86_64 ES7000 is an example).
34476 + */
34477 + if (acpi_fadt.revision > FADT2_REVISION_ID)
34478 + if (acpi_fadt.force_apic_physical_destination_mode) {
34479 + genapic = &apic_cluster;
34480 + goto print;
34481 + }
34482 +#endif
34483 +
34484 + memset(cluster_cnt, 0, sizeof(cluster_cnt));
34485 + for (i = 0; i < NR_CPUS; i++) {
34486 + id = bios_cpu_apicid[i];
34487 + if (id == BAD_APICID)
34488 + continue;
34489 + if (id > max_apic)
34490 + max_apic = id;
34491 + cluster_cnt[APIC_CLUSTERID(id)]++;
34492 + }
34493 +
34494 + /* Don't use clustered mode on AMD platforms. */
34495 + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
34496 + genapic = &apic_physflat;
34497 +#ifndef CONFIG_HOTPLUG_CPU
34498 + /* In the CPU hotplug case we cannot use broadcast mode
34499 + because that opens a race when a CPU is removed.
34500 + Stay at physflat mode in this case.
34501 + It is bad to do this unconditionally though. Once
34502 + we have ACPI platform support for CPU hotplug
34503 + we should detect hotplug capablity from ACPI tables and
34504 + only do this when really needed. -AK */
34505 + if (max_apic <= 8)
34506 + genapic = &apic_flat;
34507 +#endif
34508 + goto print;
34509 + }
34510 +
34511 + clusters = 0;
34512 + max_cluster = 0;
34513 +
34514 + for (i = 0; i < NUM_APIC_CLUSTERS; i++) {
34515 + if (cluster_cnt[i] > 0) {
34516 + ++clusters;
34517 + if (cluster_cnt[i] > max_cluster)
34518 + max_cluster = cluster_cnt[i];
34519 + }
34520 + }
34521 +
34522 + /*
34523 + * If we have clusters <= 1 and CPUs <= 8 in cluster 0, then flat mode,
34524 + * else if max_cluster <= 4 and cluster_cnt[15] == 0, clustered logical
34525 + * else physical mode.
34526 + * (We don't use lowest priority delivery + HW APIC IRQ steering, so
34527 + * can ignore the clustered logical case and go straight to physical.)
34528 + */
34529 + if (clusters <= 1 && max_cluster <= 8 && cluster_cnt[0] == max_cluster) {
34530 +#ifdef CONFIG_HOTPLUG_CPU
34531 + /* Don't use APIC shortcuts in CPU hotplug to avoid races */
34532 + genapic = &apic_physflat;
34533 +#else
34534 + genapic = &apic_flat;
34535 +#endif
34536 + } else
34537 + genapic = &apic_cluster;
34538 +
34539 +print:
34540 +#else
34541 + /* hardcode to xen apic functions */
34542 + genapic = &apic_xen;
34543 +#endif
34544 + printk(KERN_INFO "Setting APIC routing to %s\n", genapic->name);
34545 +}
34546 +
34547 +/* Same for both flat and clustered. */
34548 +
34549 +#ifdef CONFIG_XEN
34550 +extern void xen_send_IPI_shortcut(unsigned int shortcut, int vector, unsigned int dest);
34551 +#endif
34552 +
34553 +void send_IPI_self(int vector)
34554 +{
34555 +#ifndef CONFIG_XEN
34556 + __send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL);
34557 +#else
34558 + xen_send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL);
34559 +#endif
34560 +}
34561 diff -Nur linux-2.6.16.33-noxen/arch/x86_64/kernel/genapic_xen.c linux-2.6.16.33/arch/x86_64/kernel/genapic_xen.c
34562 --- linux-2.6.16.33-noxen/arch/x86_64/kernel/genapic_xen.c 1970-01-01 00:00:00.000000000 +0000
34563 +++ linux-2.6.16.33/arch/x86_64/kernel/genapic_xen.c 2007-01-08 15:00:45.000000000 +0000
34564 @@ -0,0 +1,162 @@
34565 +/*
34566 + * Copyright 2004 James Cleverdon, IBM.
34567 + * Subject to the GNU Public License, v.2
34568 + *
34569 + * Xen APIC subarch code. Maximum 8 CPUs, logical delivery.
34570 + *
34571 + * Hacked for x86-64 by James Cleverdon from i386 architecture code by
34572 + * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and
34573 + * James Cleverdon.
34574 + *
34575 + * Hacked to pieces for Xen by Chris Wright.
34576 + */
34577 +#include <linux/config.h>
34578 +#include <linux/threads.h>
34579 +#include <linux/cpumask.h>
34580 +#include <linux/string.h>
34581 +#include <linux/kernel.h>
34582 +#include <linux/ctype.h>
34583 +#include <linux/init.h>
34584 +#ifdef CONFIG_XEN_PRIVILEGED_GUEST
34585 +#include <asm/smp.h>
34586 +#include <asm/ipi.h>
34587 +#else
34588 +#include <asm/apic.h>
34589 +#include <asm/apicdef.h>
34590 +#include <asm/genapic.h>
34591 +#endif
34592 +#include <xen/evtchn.h>
34593 +
34594 +DECLARE_PER_CPU(int, ipi_to_irq[NR_IPIS]);
34595 +
34596 +static inline void __send_IPI_one(unsigned int cpu, int vector)
34597 +{
34598 + int irq = per_cpu(ipi_to_irq, cpu)[vector];
34599 + BUG_ON(irq < 0);
34600 + notify_remote_via_irq(irq);
34601 +}
34602 +
34603 +void xen_send_IPI_shortcut(unsigned int shortcut, int vector, unsigned int dest)
34604 +{
34605 + int cpu;
34606 +
34607 + switch (shortcut) {
34608 + case APIC_DEST_SELF:
34609 + __send_IPI_one(smp_processor_id(), vector);
34610 + break;
34611 + case APIC_DEST_ALLBUT:
34612 + for (cpu = 0; cpu < NR_CPUS; ++cpu) {
34613 + if (cpu == smp_processor_id())
34614 + continue;
34615 + if (cpu_isset(cpu, cpu_online_map)) {
34616 + __send_IPI_one(cpu, vector);
34617 + }
34618 + }
34619 + break;
34620 + case APIC_DEST_ALLINC:
34621 + for (cpu = 0; cpu < NR_CPUS; ++cpu) {
34622 + if (cpu_isset(cpu, cpu_online_map)) {
34623 + __send_IPI_one(cpu, vector);
34624 + }
34625 + }
34626 + break;
34627 + default:
34628 + printk("XXXXXX __send_IPI_shortcut %08x vector %d\n", shortcut,
34629 + vector);
34630 + break;
34631 + }
34632 +}
34633 +
34634 +static cpumask_t xen_target_cpus(void)
34635 +{
34636 + return cpu_online_map;
34637 +}
34638 +
34639 +/*
34640 + * Set up the logical destination ID.
34641 + * Do nothing, not called now.
34642 + */
34643 +static void xen_init_apic_ldr(void)
34644 +{
34645 + Dprintk("%s\n", __FUNCTION__);
34646 + return;
34647 +}
34648 +
34649 +static void xen_send_IPI_allbutself(int vector)
34650 +{
34651 + /*
34652 + * if there are no other CPUs in the system then
34653 + * we get an APIC send error if we try to broadcast.
34654 + * thus we have to avoid sending IPIs in this case.
34655 + */
34656 + Dprintk("%s\n", __FUNCTION__);
34657 + if (num_online_cpus() > 1)
34658 + xen_send_IPI_shortcut(APIC_DEST_ALLBUT, vector, APIC_DEST_LOGICAL);
34659 +}
34660 +
34661 +static void xen_send_IPI_all(int vector)
34662 +{
34663 + Dprintk("%s\n", __FUNCTION__);
34664 + xen_send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL);
34665 +}
34666 +
34667 +static void xen_send_IPI_mask(cpumask_t cpumask, int vector)
34668 +{
34669 + unsigned long mask = cpus_addr(cpumask)[0];
34670 + unsigned int cpu;
34671 + unsigned long flags;
34672 +
34673 + Dprintk("%s\n", __FUNCTION__);
34674 + local_irq_save(flags);
34675 + WARN_ON(mask & ~cpus_addr(cpu_online_map)[0]);
34676 +
34677 + for (cpu = 0; cpu < NR_CPUS; ++cpu) {
34678 + if (cpu_isset(cpu, cpumask)) {
34679 + __send_IPI_one(cpu, vector);
34680 + }
34681 + }
34682 + local_irq_restore(flags);
34683 +}
34684 +
34685 +#ifdef CONFIG_XEN_PRIVILEGED_GUEST
34686 +static int xen_apic_id_registered(void)
34687 +{
34688 + /* better be set */
34689 + Dprintk("%s\n", __FUNCTION__);
34690 + return physid_isset(smp_processor_id(), phys_cpu_present_map);
34691 +}
34692 +#endif
34693 +
34694 +static unsigned int xen_cpu_mask_to_apicid(cpumask_t cpumask)
34695 +{
34696 + Dprintk("%s\n", __FUNCTION__);
34697 + return cpus_addr(cpumask)[0] & APIC_ALL_CPUS;
34698 +}
34699 +
34700 +static unsigned int phys_pkg_id(int index_msb)
34701 +{
34702 + u32 ebx;
34703 +
34704 + Dprintk("%s\n", __FUNCTION__);
34705 + ebx = cpuid_ebx(1);
34706 + return ((ebx >> 24) & 0xFF) >> index_msb;
34707 +}
34708 +
34709 +struct genapic apic_xen = {
34710 + .name = "xen",
34711 +#ifdef CONFIG_XEN_PRIVILEGED_GUEST
34712 + .int_delivery_mode = dest_LowestPrio,
34713 +#endif
34714 + .int_dest_mode = (APIC_DEST_LOGICAL != 0),
34715 + .int_delivery_dest = APIC_DEST_LOGICAL | APIC_DM_LOWEST,
34716 + .target_cpus = xen_target_cpus,
34717 +#ifdef CONFIG_XEN_PRIVILEGED_GUEST
34718 + .apic_id_registered = xen_apic_id_registered,
34719 +#endif
34720 + .init_apic_ldr = xen_init_apic_ldr,
34721 + .send_IPI_all = xen_send_IPI_all,
34722 + .send_IPI_allbutself = xen_send_IPI_allbutself,
34723 + .send_IPI_mask = xen_send_IPI_mask,
34724 + .cpu_mask_to_apicid = xen_cpu_mask_to_apicid,
34725 + .phys_pkg_id = phys_pkg_id,
34726 +};
34727 diff -Nur linux-2.6.16.33-noxen/arch/x86_64/kernel/head-xen.S linux-2.6.16.33/arch/x86_64/kernel/head-xen.S
34728 --- linux-2.6.16.33-noxen/arch/x86_64/kernel/head-xen.S 1970-01-01 00:00:00.000000000 +0000
34729 +++ linux-2.6.16.33/arch/x86_64/kernel/head-xen.S 2007-01-08 15:00:45.000000000 +0000
34730 @@ -0,0 +1,183 @@
34731 +/*
34732 + * linux/arch/x86_64/kernel/head.S -- start in 32bit and switch to 64bit
34733 + *
34734 + * Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
34735 + * Copyright (C) 2000 Pavel Machek <pavel@suse.cz>
34736 + * Copyright (C) 2000 Karsten Keil <kkeil@suse.de>
34737 + * Copyright (C) 2001,2002 Andi Kleen <ak@suse.de>
34738 + *
34739 + * $Id: head.S,v 1.49 2002/03/19 17:39:25 ak Exp $
34740 + *
34741 + * Jun Nakajima <jun.nakajima@intel.com>
34742 + * Modified for Xen
34743 + */
34744 +
34745 +
34746 +#include <linux/linkage.h>
34747 +#include <linux/threads.h>
34748 +#include <linux/init.h>
34749 +#include <linux/elfnote.h>
34750 +#include <asm/desc.h>
34751 +#include <asm/segment.h>
34752 +#include <asm/page.h>
34753 +#include <asm/msr.h>
34754 +#include <asm/cache.h>
34755 +
34756 +#include <xen/interface/elfnote.h>
34757 +
34758 + .text
34759 + .code64
34760 +#define VIRT_ENTRY_OFFSET 0x0
34761 +.org VIRT_ENTRY_OFFSET
34762 + .globl startup_64
34763 +startup_64:
34764 +ENTRY(_start)
34765 + movq $(init_thread_union+THREAD_SIZE-8),%rsp
34766 + /* zero EFLAGS after setting rsp */
34767 + pushq $0
34768 + popfq
34769 +
34770 + /* rsi is pointer to startup info structure.
34771 + pass it to C */
34772 + movq %rsi,%rdi
34773 + jmp x86_64_start_kernel
34774 +
34775 +ENTRY(stext)
34776 +ENTRY(_stext)
34777 +
34778 + $page = 0
34779 +#define NEXT_PAGE(name) \
34780 + $page = $page + 1; \
34781 + .org $page * 0x1000; \
34782 + phys_/**/name = $page * 0x1000 + __PHYSICAL_START; \
34783 +ENTRY(name)
34784 +
34785 +NEXT_PAGE(init_level4_pgt)
34786 + /* This gets initialized in x86_64_start_kernel */
34787 + .fill 512,8,0
34788 +
34789 + /*
34790 + * We update two pgd entries to make kernel and user pgd consistent
34791 + * at pgd_populate(). It can be used for kernel modules. So we place
34792 + * this page here for those cases to avoid memory corruption.
34793 + * We also use this page to establish the initial mapping for
34794 + * vsyscall area.
34795 + */
34796 +NEXT_PAGE(init_level4_user_pgt)
34797 + .fill 512,8,0
34798 +
34799 +NEXT_PAGE(level3_kernel_pgt)
34800 + .fill 512,8,0
34801 +
34802 + /*
34803 + * This is used for vsyscall area mapping as we have a different
34804 + * level4 page table for user.
34805 + */
34806 +NEXT_PAGE(level3_user_pgt)
34807 + .fill 512,8,0
34808 +
34809 +NEXT_PAGE(level2_kernel_pgt)
34810 + .fill 512,8,0
34811 +
34812 +NEXT_PAGE(empty_zero_page)
34813 + .skip PAGE_SIZE
34814 +
34815 +NEXT_PAGE(hypercall_page)
34816 + .fill 512,8,0
34817 +
34818 +#undef NEXT_PAGE
34819 +
34820 + .data
34821 +
34822 + .align 16
34823 + .globl cpu_gdt_descr
34824 +cpu_gdt_descr:
34825 + .word gdt_end-cpu_gdt_table
34826 +gdt:
34827 + .quad cpu_gdt_table
34828 +#ifdef CONFIG_SMP
34829 + .rept NR_CPUS-1
34830 + .word 0
34831 + .quad 0
34832 + .endr
34833 +#endif
34834 +
34835 +/* We need valid kernel segments for data and code in long mode too
34836 + * IRET will check the segment types kkeil 2000/10/28
34837 + * Also sysret mandates a special GDT layout
34838 + */
34839 +
34840 + .section .data.page_aligned, "aw"
34841 + .align PAGE_SIZE
34842 +
34843 +/* The TLS descriptors are currently at a different place compared to i386.
34844 + Hopefully nobody expects them at a fixed place (Wine?) */
34845 +
34846 +ENTRY(cpu_gdt_table)
34847 + .quad 0x0000000000000000 /* NULL descriptor */
34848 + .quad 0x0 /* unused */
34849 + .quad 0x00af9a000000ffff /* __KERNEL_CS */
34850 + .quad 0x00cf92000000ffff /* __KERNEL_DS */
34851 + .quad 0x00cffa000000ffff /* __USER32_CS */
34852 + .quad 0x00cff2000000ffff /* __USER_DS, __USER32_DS */
34853 + .quad 0x00affa000000ffff /* __USER_CS */
34854 + .quad 0x00cf9a000000ffff /* __KERNEL32_CS */
34855 + .quad 0,0 /* TSS */
34856 + .quad 0,0 /* LDT */
34857 + .quad 0,0,0 /* three TLS descriptors */
34858 + .quad 0 /* unused */
34859 +gdt_end:
34860 + /* asm/segment.h:GDT_ENTRIES must match this */
34861 + /* This should be a multiple of the cache line size */
34862 + /* GDTs of other CPUs are now dynamically allocated */
34863 +
34864 + /* zero the remaining page */
34865 + .fill PAGE_SIZE / 8 - GDT_ENTRIES,8,0
34866 +
34867 +#ifdef CONFIG_XEN_COMPAT_030002
34868 +/*
34869 + * __xen_guest information
34870 + */
34871 +.macro utoh value
34872 + .if (\value) < 0 || (\value) >= 0x10
34873 + utoh (((\value)>>4)&0x0fffffffffffffff)
34874 + .endif
34875 + .if ((\value) & 0xf) < 10
34876 + .byte '0' + ((\value) & 0xf)
34877 + .else
34878 + .byte 'A' + ((\value) & 0xf) - 10
34879 + .endif
34880 +.endm
34881 +
34882 +.section __xen_guest
34883 + .ascii "GUEST_OS=linux,GUEST_VER=2.6"
34884 + .ascii ",XEN_VER=xen-3.0"
34885 + .ascii ",VIRT_BASE=0x"
34886 + utoh __START_KERNEL_map
34887 + .ascii ",ELF_PADDR_OFFSET=0x"
34888 + utoh __START_KERNEL_map
34889 + .ascii ",VIRT_ENTRY=0x"
34890 + utoh (__START_KERNEL_map + __PHYSICAL_START + VIRT_ENTRY_OFFSET)
34891 + .ascii ",HYPERCALL_PAGE=0x"
34892 + utoh (phys_hypercall_page >> PAGE_SHIFT)
34893 + .ascii ",FEATURES=writable_page_tables"
34894 + .ascii "|writable_descriptor_tables"
34895 + .ascii "|auto_translated_physmap"
34896 + .ascii "|supervisor_mode_kernel"
34897 + .ascii ",LOADER=generic"
34898 + .byte 0
34899 +#endif /* CONFIG_XEN_COMPAT_030002 */
34900 +
34901 + ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz, "linux")
34902 + ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz, "2.6")
34903 + ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz, "xen-3.0")
34904 + ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, .quad, __START_KERNEL_map)
34905 +#ifdef CONFIG_XEN_COMPAT_030002
34906 + ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET, .quad, __START_KERNEL_map)
34907 +#else
34908 + ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET, .quad, 0)
34909 +#endif /* !CONFIG_XEN_COMPAT_030002 */
34910 + ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .quad, startup_64)
34911 + ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .quad, hypercall_page)
34912 + ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz, "writable_page_tables|writable_descriptor_tables|auto_translated_physmap|pae_pgdir_above_4gb|supervisor_mode_kernel")
34913 + ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz, "generic")
34914 diff -Nur linux-2.6.16.33-noxen/arch/x86_64/kernel/head64-xen.c linux-2.6.16.33/arch/x86_64/kernel/head64-xen.c
34915 --- linux-2.6.16.33-noxen/arch/x86_64/kernel/head64-xen.c 1970-01-01 00:00:00.000000000 +0000
34916 +++ linux-2.6.16.33/arch/x86_64/kernel/head64-xen.c 2007-01-08 15:00:45.000000000 +0000
34917 @@ -0,0 +1,159 @@
34918 +/*
34919 + * linux/arch/x86_64/kernel/head64.c -- prepare to run common code
34920 + *
34921 + * Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
34922 + *
34923 + * $Id: head64.c,v 1.22 2001/07/06 14:28:20 ak Exp $
34924 + *
34925 + * Jun Nakajima <jun.nakajima@intel.com>
34926 + * Modified for Xen.
34927 + */
34928 +
34929 +#include <linux/init.h>
34930 +#include <linux/linkage.h>
34931 +#include <linux/types.h>
34932 +#include <linux/kernel.h>
34933 +#include <linux/string.h>
34934 +#include <linux/percpu.h>
34935 +#include <linux/module.h>
34936 +
34937 +#include <asm/processor.h>
34938 +#include <asm/proto.h>
34939 +#include <asm/smp.h>
34940 +#include <asm/bootsetup.h>
34941 +#include <asm/setup.h>
34942 +#include <asm/desc.h>
34943 +#include <asm/pgtable.h>
34944 +#include <asm/sections.h>
34945 +
34946 +unsigned long start_pfn;
34947 +
34948 +/* Don't add a printk in there. printk relies on the PDA which is not initialized
34949 + yet. */
34950 +#if 0
34951 +static void __init clear_bss(void)
34952 +{
34953 + memset(__bss_start, 0,
34954 + (unsigned long) __bss_stop - (unsigned long) __bss_start);
34955 +}
34956 +#endif
34957 +
34958 +#define NEW_CL_POINTER 0x228 /* Relative to real mode data */
34959 +#define OLD_CL_MAGIC_ADDR 0x90020
34960 +#define OLD_CL_MAGIC 0xA33F
34961 +#define OLD_CL_BASE_ADDR 0x90000
34962 +#define OLD_CL_OFFSET 0x90022
34963 +
34964 +extern char saved_command_line[];
34965 +
34966 +static void __init copy_bootdata(char *real_mode_data)
34967 +{
34968 +#ifndef CONFIG_XEN
34969 + int new_data;
34970 + char * command_line;
34971 +
34972 + memcpy(x86_boot_params, real_mode_data, BOOT_PARAM_SIZE);
34973 + new_data = *(int *) (x86_boot_params + NEW_CL_POINTER);
34974 + if (!new_data) {
34975 + if (OLD_CL_MAGIC != * (u16 *) OLD_CL_MAGIC_ADDR) {
34976 + printk("so old bootloader that it does not support commandline?!\n");
34977 + return;
34978 + }
34979 + new_data = OLD_CL_BASE_ADDR + * (u16 *) OLD_CL_OFFSET;
34980 + printk("old bootloader convention, maybe loadlin?\n");
34981 + }
34982 + command_line = (char *) ((u64)(new_data));
34983 + memcpy(saved_command_line, command_line, COMMAND_LINE_SIZE);
34984 +#else
34985 + int max_cmdline;
34986 +
34987 + if ((max_cmdline = MAX_GUEST_CMDLINE) > COMMAND_LINE_SIZE)
34988 + max_cmdline = COMMAND_LINE_SIZE;
34989 + memcpy(saved_command_line, xen_start_info->cmd_line, max_cmdline);
34990 + saved_command_line[max_cmdline-1] = '\0';
34991 +#endif
34992 + printk("Bootdata ok (command line is %s)\n", saved_command_line);
34993 +}
34994 +
34995 +static void __init setup_boot_cpu_data(void)
34996 +{
34997 + unsigned int dummy, eax;
34998 +
34999 + /* get vendor info */
35000 + cpuid(0, (unsigned int *)&boot_cpu_data.cpuid_level,
35001 + (unsigned int *)&boot_cpu_data.x86_vendor_id[0],
35002 + (unsigned int *)&boot_cpu_data.x86_vendor_id[8],
35003 + (unsigned int *)&boot_cpu_data.x86_vendor_id[4]);
35004 +
35005 + /* get cpu type */
35006 + cpuid(1, &eax, &dummy, &dummy,
35007 + (unsigned int *) &boot_cpu_data.x86_capability);
35008 + boot_cpu_data.x86 = (eax >> 8) & 0xf;
35009 + boot_cpu_data.x86_model = (eax >> 4) & 0xf;
35010 + boot_cpu_data.x86_mask = eax & 0xf;
35011 +}
35012 +
35013 +#include <xen/interface/memory.h>
35014 +unsigned long *machine_to_phys_mapping;
35015 +EXPORT_SYMBOL(machine_to_phys_mapping);
35016 +unsigned int machine_to_phys_order;
35017 +EXPORT_SYMBOL(machine_to_phys_order);
35018 +
35019 +void __init x86_64_start_kernel(char * real_mode_data)
35020 +{
35021 + struct xen_machphys_mapping mapping;
35022 + unsigned long machine_to_phys_nr_ents;
35023 + char *s;
35024 + int i;
35025 +
35026 + xen_start_info = (struct start_info *)real_mode_data;
35027 + if (!xen_feature(XENFEAT_auto_translated_physmap)) {
35028 + phys_to_machine_mapping =
35029 + (unsigned long *)xen_start_info->mfn_list;
35030 + start_pfn = (__pa(xen_start_info->pt_base) >> PAGE_SHIFT) +
35031 + xen_start_info->nr_pt_frames;
35032 + }
35033 +
35034 +
35035 + machine_to_phys_mapping = (unsigned long *)MACH2PHYS_VIRT_START;
35036 + machine_to_phys_nr_ents = MACH2PHYS_NR_ENTRIES;
35037 + if (HYPERVISOR_memory_op(XENMEM_machphys_mapping, &mapping) == 0) {
35038 + machine_to_phys_mapping = (unsigned long *)mapping.v_start;
35039 + machine_to_phys_nr_ents = mapping.max_mfn + 1;
35040 + }
35041 + while ((1UL << machine_to_phys_order) < machine_to_phys_nr_ents )
35042 + machine_to_phys_order++;
35043 +
35044 +#if 0
35045 + for (i = 0; i < 256; i++)
35046 + set_intr_gate(i, early_idt_handler);
35047 + asm volatile("lidt %0" :: "m" (idt_descr));
35048 +#endif
35049 +
35050 + for (i = 0; i < NR_CPUS; i++)
35051 + cpu_pda(i) = &boot_cpu_pda[i];
35052 +
35053 + pda_init(0);
35054 + copy_bootdata(real_mode_data);
35055 +#ifdef CONFIG_SMP
35056 + cpu_set(0, cpu_online_map);
35057 +#endif
35058 + s = strstr(saved_command_line, "earlyprintk=");
35059 + if (s != NULL)
35060 + setup_early_printk(strchr(s, '=') + 1);
35061 +#ifdef CONFIG_NUMA
35062 + s = strstr(saved_command_line, "numa=");
35063 + if (s != NULL)
35064 + numa_setup(s+5);
35065 +#endif
35066 +#ifdef CONFIG_X86_IO_APIC
35067 + if (strstr(saved_command_line, "disableapic"))
35068 + disable_apic = 1;
35069 +#endif
35070 + /* You need early console to see that */
35071 + if (__pa_symbol(&_end) >= KERNEL_TEXT_SIZE)
35072 + panic("Kernel too big for kernel mapping\n");
35073 +
35074 + setup_boot_cpu_data();
35075 + start_kernel();
35076 +}
35077 diff -Nur linux-2.6.16.33-noxen/arch/x86_64/kernel/init_task.c linux-2.6.16.33/arch/x86_64/kernel/init_task.c
35078 --- linux-2.6.16.33-noxen/arch/x86_64/kernel/init_task.c 2006-11-22 18:06:31.000000000 +0000
35079 +++ linux-2.6.16.33/arch/x86_64/kernel/init_task.c 2007-01-08 15:00:45.000000000 +0000
35080 @@ -37,6 +37,8 @@
35081 struct task_struct init_task = INIT_TASK(init_task);
35082
35083 EXPORT_SYMBOL(init_task);
35084 +
35085 +#ifndef CONFIG_X86_NO_TSS
35086 /*
35087 * per-CPU TSS segments. Threads are completely 'soft' on Linux,
35088 * no more per-task TSS's. The TSS size is kept cacheline-aligned
35089 @@ -45,5 +47,6 @@
35090 * on exact cacheline boundaries, to eliminate cacheline ping-pong.
35091 */
35092 DEFINE_PER_CPU(struct tss_struct, init_tss) ____cacheline_internodealigned_in_smp = INIT_TSS;
35093 +#endif
35094
35095 #define ALIGN_TO_4K __attribute__((section(".data.init_task")))
35096 diff -Nur linux-2.6.16.33-noxen/arch/x86_64/kernel/io_apic-xen.c linux-2.6.16.33/arch/x86_64/kernel/io_apic-xen.c
35097 --- linux-2.6.16.33-noxen/arch/x86_64/kernel/io_apic-xen.c 1970-01-01 00:00:00.000000000 +0000
35098 +++ linux-2.6.16.33/arch/x86_64/kernel/io_apic-xen.c 2007-01-08 15:00:45.000000000 +0000
35099 @@ -0,0 +1,2226 @@
35100 +/*
35101 + * Intel IO-APIC support for multi-Pentium hosts.
35102 + *
35103 + * Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar, Hajnalka Szabo
35104 + *
35105 + * Many thanks to Stig Venaas for trying out countless experimental
35106 + * patches and reporting/debugging problems patiently!
35107 + *
35108 + * (c) 1999, Multiple IO-APIC support, developed by
35109 + * Ken-ichi Yaku <yaku@css1.kbnes.nec.co.jp> and
35110 + * Hidemi Kishimoto <kisimoto@css1.kbnes.nec.co.jp>,
35111 + * further tested and cleaned up by Zach Brown <zab@redhat.com>
35112 + * and Ingo Molnar <mingo@redhat.com>
35113 + *
35114 + * Fixes
35115 + * Maciej W. Rozycki : Bits for genuine 82489DX APICs;
35116 + * thanks to Eric Gilmore
35117 + * and Rolf G. Tews
35118 + * for testing these extensively
35119 + * Paul Diefenbaugh : Added full ACPI support
35120 + */
35121 +
35122 +#include <linux/mm.h>
35123 +#include <linux/interrupt.h>
35124 +#include <linux/init.h>
35125 +#include <linux/delay.h>
35126 +#include <linux/sched.h>
35127 +#include <linux/config.h>
35128 +#include <linux/smp_lock.h>
35129 +#include <linux/mc146818rtc.h>
35130 +#include <linux/acpi.h>
35131 +#include <linux/sysdev.h>
35132 +#ifdef CONFIG_ACPI
35133 +#include <acpi/acpi_bus.h>
35134 +#endif
35135 +
35136 +#include <asm/io.h>
35137 +#include <asm/smp.h>
35138 +#include <asm/desc.h>
35139 +#include <asm/proto.h>
35140 +#include <asm/mach_apic.h>
35141 +#include <asm/acpi.h>
35142 +#include <asm/dma.h>
35143 +
35144 +#define __apicdebuginit __init
35145 +
35146 +int sis_apic_bug; /* not actually supported, dummy for compile */
35147 +
35148 +static int no_timer_check;
35149 +
35150 +int disable_timer_pin_1 __initdata;
35151 +
35152 +#ifndef CONFIG_XEN
35153 +int timer_over_8254 __initdata = 1;
35154 +
35155 +/* Where, if anywhere, the i8259 is connected in external int mode */
35156 +static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
35157 +#endif
35158 +
35159 +static DEFINE_SPINLOCK(ioapic_lock);
35160 +
35161 +/*
35162 + * # of IRQ routing registers
35163 + */
35164 +int nr_ioapic_registers[MAX_IO_APICS];
35165 +
35166 +/*
35167 + * Rough estimation of how many shared IRQs there are; this can
35168 + * be changed anytime.
35169 + */
35170 +#define MAX_PLUS_SHARED_IRQS NR_IRQ_VECTORS
35171 +#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS)
35172 +
35173 +/*
35174 + * This is performance-critical, we want to do it O(1)
35175 + *
35176 + * the indexing order of this array favors 1:1 mappings
35177 + * between pins and IRQs.
35178 + */
35179 +
35180 +static struct irq_pin_list {
35181 + short apic, pin, next;
35182 +} irq_2_pin[PIN_MAP_SIZE];
35183 +
35184 +int vector_irq[NR_VECTORS] __read_mostly = { [0 ... NR_VECTORS - 1] = -1};
35185 +#ifdef CONFIG_PCI_MSI
35186 +#define vector_to_irq(vector) \
35187 + (platform_legacy_irq(vector) ? vector : vector_irq[vector])
35188 +#else
35189 +#define vector_to_irq(vector) (vector)
35190 +#endif
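/*
 * vector_to_irq(): when CONFIG_PCI_MSI is enabled, non-legacy interrupts
 * are tracked by vector, so the macro maps a vector back to its IRQ via
 * vector_irq[]; legacy IRQs (and the !CONFIG_PCI_MSI build) use the
 * vector number unchanged.
 */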
35191 +
35192 +#ifdef CONFIG_XEN
35193 +
35194 +#include <xen/interface/xen.h>
35195 +#include <xen/interface/physdev.h>
35196 +
35197 +/* Fake i8259 */
35198 +#define make_8259A_irq(_irq) (io_apic_irqs &= ~(1UL<<(_irq)))
35199 +#define disable_8259A_irq(_irq) ((void)0)
35200 +#define i8259A_irq_pending(_irq) (0)
35201 +
35202 +unsigned long io_apic_irqs;
35203 +
35204 +static inline unsigned int xen_io_apic_read(unsigned int apic, unsigned int reg)
35205 +{
35206 + struct physdev_apic apic_op;
35207 + int ret;
35208 +
35209 + apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr;
35210 + apic_op.reg = reg;
35211 + ret = HYPERVISOR_physdev_op(PHYSDEVOP_apic_read, &apic_op);
35212 + if (ret)
35213 + return ret;
35214 + return apic_op.value;
35215 +}
35216 +
35217 +static inline void xen_io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)
35218 +{
35219 + struct physdev_apic apic_op;
35220 +
35221 + apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr;
35222 + apic_op.reg = reg;
35223 + apic_op.value = value;
35224 + HYPERVISOR_physdev_op(PHYSDEVOP_apic_write, &apic_op);
35225 +}
35226 +
35227 +#define io_apic_read(a,r) xen_io_apic_read(a,r)
35228 +#define io_apic_write(a,r,v) xen_io_apic_write(a,r,v)
35229 +
35230 +#define clear_IO_APIC() ((void)0)
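/*
 * Under Xen the kernel never touches the IO-APIC registers directly:
 * io_apic_read()/io_apic_write() are routed through the
 * PHYSDEVOP_apic_read/PHYSDEVOP_apic_write hypercalls above, and
 * clear_IO_APIC() is a no-op because the hypervisor keeps ownership of
 * the physical IO-APICs.
 */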
35231 +
35232 +#else
35233 +
35234 +#ifdef CONFIG_SMP
35235 +static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
35236 +{
35237 + unsigned long flags;
35238 + unsigned int dest;
35239 + cpumask_t tmp;
35240 +
35241 + cpus_and(tmp, mask, cpu_online_map);
35242 + if (cpus_empty(tmp))
35243 + tmp = TARGET_CPUS;
35244 +
35245 + cpus_and(mask, tmp, CPU_MASK_ALL);
35246 +
35247 + dest = cpu_mask_to_apicid(mask);
35248 +
35249 + /*
35250 + * Only the high 8 bits are valid.
35251 + */
35252 + dest = SET_APIC_LOGICAL_ID(dest);
35253 +
35254 + spin_lock_irqsave(&ioapic_lock, flags);
35255 + __DO_ACTION(1, = dest, )
35256 + set_irq_info(irq, mask);
35257 + spin_unlock_irqrestore(&ioapic_lock, flags);
35258 +}
35259 +#endif
35260 +
35261 +#endif /* !CONFIG_XEN */
35262 +
35263 +/*
35264 + * The common case is 1:1 IRQ<->pin mappings. Sometimes there are
35265 + * shared ISA-space IRQs, so we have to support them. We are super
35266 + * fast in the common case, and fast for shared ISA-space IRQs.
35267 + */
35268 +static void add_pin_to_irq(unsigned int irq, int apic, int pin)
35269 +{
35270 + static int first_free_entry = NR_IRQS;
35271 + struct irq_pin_list *entry = irq_2_pin + irq;
35272 +
35273 + BUG_ON(irq >= NR_IRQS);
35274 + while (entry->next)
35275 + entry = irq_2_pin + entry->next;
35276 +
35277 + if (entry->pin != -1) {
35278 + entry->next = first_free_entry;
35279 + entry = irq_2_pin + entry->next;
35280 + if (++first_free_entry >= PIN_MAP_SIZE)
35281 + panic("io_apic.c: ran out of irq_2_pin entries!");
35282 + }
35283 + entry->apic = apic;
35284 + entry->pin = pin;
35285 +}
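/*
 * irq_2_pin[] is a statically allocated linked list: entries 0..NR_IRQS-1
 * serve as per-IRQ list heads (pin == -1 means unused), and entries from
 * NR_IRQS upwards are handed out via first_free_entry and chained through
 * ->next when a single IRQ is routed to more than one IO-APIC pin.
 */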
35286 +
35287 +#ifndef CONFIG_XEN
35288 +#define __DO_ACTION(R, ACTION, FINAL) \
35289 + \
35290 +{ \
35291 + int pin; \
35292 + struct irq_pin_list *entry = irq_2_pin + irq; \
35293 + \
35294 + BUG_ON(irq >= NR_IRQS); \
35295 + for (;;) { \
35296 + unsigned int reg; \
35297 + pin = entry->pin; \
35298 + if (pin == -1) \
35299 + break; \
35300 + reg = io_apic_read(entry->apic, 0x10 + R + pin*2); \
35301 + reg ACTION; \
35302 + io_apic_modify(entry->apic, reg); \
35303 + if (!entry->next) \
35304 + break; \
35305 + entry = irq_2_pin + entry->next; \
35306 + } \
35307 + FINAL; \
35308 +}
35309 +
35310 +#define DO_ACTION(name,R,ACTION, FINAL) \
35311 + \
35312 + static void name##_IO_APIC_irq (unsigned int irq) \
35313 + __DO_ACTION(R, ACTION, FINAL)
35314 +
35315 +DO_ACTION( __mask, 0, |= 0x00010000, io_apic_sync(entry->apic) )
35316 + /* mask = 1 */
35317 +DO_ACTION( __unmask, 0, &= 0xfffeffff, )
35318 + /* mask = 0 */
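/*
 * The DO_ACTION() expansions above generate __mask_IO_APIC_irq() and
 * __unmask_IO_APIC_irq(): each walks the irq_2_pin chain for the IRQ,
 * reads the low dword of every pin's redirection entry (register
 * 0x10 + 2*pin), sets or clears the mask bit (bit 16) and writes it back
 * with io_apic_modify(); the FINAL argument (io_apic_sync() when masking)
 * runs once after the loop.
 */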
35319 +
35320 +static void mask_IO_APIC_irq (unsigned int irq)
35321 +{
35322 + unsigned long flags;
35323 +
35324 + spin_lock_irqsave(&ioapic_lock, flags);
35325 + __mask_IO_APIC_irq(irq);
35326 + spin_unlock_irqrestore(&ioapic_lock, flags);
35327 +}
35328 +
35329 +static void unmask_IO_APIC_irq (unsigned int irq)
35330 +{
35331 + unsigned long flags;
35332 +
35333 + spin_lock_irqsave(&ioapic_lock, flags);
35334 + __unmask_IO_APIC_irq(irq);
35335 + spin_unlock_irqrestore(&ioapic_lock, flags);
35336 +}
35337 +
35338 +static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
35339 +{
35340 + struct IO_APIC_route_entry entry;
35341 + unsigned long flags;
35342 +
35343 + /* Check delivery_mode to be sure we're not clearing an SMI pin */
35344 + spin_lock_irqsave(&ioapic_lock, flags);
35345 + *(((int*)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin);
35346 + *(((int*)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin);
35347 + spin_unlock_irqrestore(&ioapic_lock, flags);
35348 + if (entry.delivery_mode == dest_SMI)
35349 + return;
35350 + /*
35351 + * Disable it in the IO-APIC irq-routing table:
35352 + */
35353 + memset(&entry, 0, sizeof(entry));
35354 + entry.mask = 1;
35355 + spin_lock_irqsave(&ioapic_lock, flags);
35356 + io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry) + 0));
35357 + io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry) + 1));
35358 + spin_unlock_irqrestore(&ioapic_lock, flags);
35359 +}
35360 +
35361 +static void clear_IO_APIC (void)
35362 +{
35363 + int apic, pin;
35364 +
35365 + for (apic = 0; apic < nr_ioapics; apic++)
35366 + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++)
35367 + clear_IO_APIC_pin(apic, pin);
35368 +}
35369 +
35370 +#endif /* !CONFIG_XEN */
35371 +
35372 +static u8 gsi_2_irq[NR_IRQ_VECTORS] = { [0 ... NR_IRQ_VECTORS-1] = 0xFF };
35373 +
35374 +/*
35375 + * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to
35376 + * specific CPU-side IRQs.
35377 + */
35378 +
35379 +#define MAX_PIRQS 8
35380 +static int pirq_entries [MAX_PIRQS];
35381 +static int pirqs_enabled;
35382 +int skip_ioapic_setup;
35383 +int ioapic_force;
35384 +
35385 +/* dummy parsing: see setup.c */
35386 +
35387 +static int __init disable_ioapic_setup(char *str)
35388 +{
35389 + skip_ioapic_setup = 1;
35390 + return 1;
35391 +}
35392 +
35393 +static int __init enable_ioapic_setup(char *str)
35394 +{
35395 + ioapic_force = 1;
35396 + skip_ioapic_setup = 0;
35397 + return 1;
35398 +}
35399 +
35400 +__setup("noapic", disable_ioapic_setup);
35401 +__setup("apic", enable_ioapic_setup);
35402 +
35403 +#ifndef CONFIG_XEN
35404 +static int __init setup_disable_8254_timer(char *s)
35405 +{
35406 + timer_over_8254 = -1;
35407 + return 1;
35408 +}
35409 +static int __init setup_enable_8254_timer(char *s)
35410 +{
35411 + timer_over_8254 = 2;
35412 + return 1;
35413 +}
35414 +
35415 +__setup("disable_8254_timer", setup_disable_8254_timer);
35416 +__setup("enable_8254_timer", setup_enable_8254_timer);
35417 +#endif /* !CONFIG_XEN */
35418 +
35419 +#include <asm/pci-direct.h>
35420 +#include <linux/pci_ids.h>
35421 +#include <linux/pci.h>
35422 +
35423 +/* Temporary Hack. Nvidia and VIA boards currently only work with IO-APIC
35424 + off. Check for an Nvidia or VIA PCI bridge and turn it off.
35425 + Use pci direct infrastructure because this runs before the PCI subsystem.
35426 +
35427 +   Can be overridden with "apic"
35428 +
35429 + And another hack to disable the IOMMU on VIA chipsets.
35430 +
35431 + ... and others. Really should move this somewhere else.
35432 +
35433 + Kludge-O-Rama. */
35434 +void __init check_ioapic(void)
35435 +{
35436 + int num,slot,func;
35437 + /* Poor man's PCI discovery */
35438 + for (num = 0; num < 32; num++) {
35439 + for (slot = 0; slot < 32; slot++) {
35440 + for (func = 0; func < 8; func++) {
35441 + u32 class;
35442 + u32 vendor;
35443 + u8 type;
35444 + class = read_pci_config(num,slot,func,
35445 + PCI_CLASS_REVISION);
35446 + if (class == 0xffffffff)
35447 + break;
35448 +
35449 + if ((class >> 16) != PCI_CLASS_BRIDGE_PCI)
35450 + continue;
35451 +
35452 + vendor = read_pci_config(num, slot, func,
35453 + PCI_VENDOR_ID);
35454 + vendor &= 0xffff;
35455 + switch (vendor) {
35456 + case PCI_VENDOR_ID_VIA:
35457 +#ifdef CONFIG_GART_IOMMU
35458 + if ((end_pfn > MAX_DMA32_PFN ||
35459 + force_iommu) &&
35460 + !iommu_aperture_allowed) {
35461 + printk(KERN_INFO
35462 + "Looks like a VIA chipset. Disabling IOMMU. Overwrite with \"iommu=allowed\"\n");
35463 + iommu_aperture_disabled = 1;
35464 + }
35465 +#endif
35466 + return;
35467 + case PCI_VENDOR_ID_NVIDIA:
35468 +#ifdef CONFIG_ACPI
35469 + /* All timer overrides on Nvidia
35470 + seem to be wrong. Skip them. */
35471 + acpi_skip_timer_override = 1;
35472 + printk(KERN_INFO
35473 + "Nvidia board detected. Ignoring ACPI timer override.\n");
35474 +#endif
35475 + /* RED-PEN skip them on mptables too? */
35476 + return;
35477 + case PCI_VENDOR_ID_ATI:
35478 +
35479 + /* This should be actually default, but
35480 + for 2.6.16 let's do it for ATI only where
35481 + it's really needed. */
35482 +#ifndef CONFIG_XEN
35483 + if (timer_over_8254 == 1) {
35484 + timer_over_8254 = 0;
35485 + printk(KERN_INFO
35486 + "ATI board detected. Disabling timer routing over 8254.\n");
35487 + }
35488 +#endif
35489 + return;
35490 + }
35491 +
35492 +
35493 + /* No multi-function device? */
35494 + type = read_pci_config_byte(num,slot,func,
35495 + PCI_HEADER_TYPE);
35496 + if (!(type & 0x80))
35497 + break;
35498 + }
35499 + }
35500 + }
35501 +}
35502 +
35503 +static int __init ioapic_pirq_setup(char *str)
35504 +{
35505 + int i, max;
35506 + int ints[MAX_PIRQS+1];
35507 +
35508 + get_options(str, ARRAY_SIZE(ints), ints);
35509 +
35510 + for (i = 0; i < MAX_PIRQS; i++)
35511 + pirq_entries[i] = -1;
35512 +
35513 + pirqs_enabled = 1;
35514 + apic_printk(APIC_VERBOSE, "PIRQ redirection, working around broken MP-BIOS.\n");
35515 + max = MAX_PIRQS;
35516 + if (ints[0] < MAX_PIRQS)
35517 + max = ints[0];
35518 +
35519 + for (i = 0; i < max; i++) {
35520 + apic_printk(APIC_VERBOSE, "... PIRQ%d -> IRQ %d\n", i, ints[i+1]);
35521 + /*
35522 + * PIRQs are mapped upside down, usually.
35523 + */
35524 + pirq_entries[MAX_PIRQS-i-1] = ints[i+1];
35525 + }
35526 + return 1;
35527 +}
35528 +
35529 +__setup("pirq=", ioapic_pirq_setup);
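/*
 * The "pirq=" option fills pirq_entries[], which pin_2_irq() consults for
 * IO-APIC pins 16-23: a value of 0 disables that PIRQ, any other value
 * forces the given IRQ number. Note the reversed indexing above - the
 * first value on the command line lands in the last pirq_entries[] slot.
 */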
35530 +
35531 +/*
35532 + * Find the IRQ entry number of a certain pin.
35533 + */
35534 +static int find_irq_entry(int apic, int pin, int type)
35535 +{
35536 + int i;
35537 +
35538 + for (i = 0; i < mp_irq_entries; i++)
35539 + if (mp_irqs[i].mpc_irqtype == type &&
35540 + (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mpc_apicid ||
35541 + mp_irqs[i].mpc_dstapic == MP_APIC_ALL) &&
35542 + mp_irqs[i].mpc_dstirq == pin)
35543 + return i;
35544 +
35545 + return -1;
35546 +}
35547 +
35548 +#ifndef CONFIG_XEN
35549 +/*
35550 + * Find the pin to which IRQ[irq] (ISA) is connected
35551 + */
35552 +static int __init find_isa_irq_pin(int irq, int type)
35553 +{
35554 + int i;
35555 +
35556 + for (i = 0; i < mp_irq_entries; i++) {
35557 + int lbus = mp_irqs[i].mpc_srcbus;
35558 +
35559 + if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA ||
35560 + mp_bus_id_to_type[lbus] == MP_BUS_EISA ||
35561 + mp_bus_id_to_type[lbus] == MP_BUS_MCA) &&
35562 + (mp_irqs[i].mpc_irqtype == type) &&
35563 + (mp_irqs[i].mpc_srcbusirq == irq))
35564 +
35565 + return mp_irqs[i].mpc_dstirq;
35566 + }
35567 + return -1;
35568 +}
35569 +
35570 +static int __init find_isa_irq_apic(int irq, int type)
35571 +{
35572 + int i;
35573 +
35574 + for (i = 0; i < mp_irq_entries; i++) {
35575 + int lbus = mp_irqs[i].mpc_srcbus;
35576 +
35577 + if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA ||
35578 + mp_bus_id_to_type[lbus] == MP_BUS_EISA ||
35579 + mp_bus_id_to_type[lbus] == MP_BUS_MCA) &&
35580 + (mp_irqs[i].mpc_irqtype == type) &&
35581 + (mp_irqs[i].mpc_srcbusirq == irq))
35582 + break;
35583 + }
35584 + if (i < mp_irq_entries) {
35585 + int apic;
35586 + for(apic = 0; apic < nr_ioapics; apic++) {
35587 + if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic)
35588 + return apic;
35589 + }
35590 + }
35591 +
35592 + return -1;
35593 +}
35594 +#endif
35595 +
35596 +/*
35597 + * Find a specific PCI IRQ entry.
35598 + * Not an __init, possibly needed by modules
35599 + */
35600 +static int pin_2_irq(int idx, int apic, int pin);
35601 +
35602 +int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
35603 +{
35604 + int apic, i, best_guess = -1;
35605 +
35606 + apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n",
35607 + bus, slot, pin);
35608 + if (mp_bus_id_to_pci_bus[bus] == -1) {
35609 + apic_printk(APIC_VERBOSE, "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
35610 + return -1;
35611 + }
35612 + for (i = 0; i < mp_irq_entries; i++) {
35613 + int lbus = mp_irqs[i].mpc_srcbus;
35614 +
35615 + for (apic = 0; apic < nr_ioapics; apic++)
35616 + if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic ||
35617 + mp_irqs[i].mpc_dstapic == MP_APIC_ALL)
35618 + break;
35619 +
35620 + if ((mp_bus_id_to_type[lbus] == MP_BUS_PCI) &&
35621 + !mp_irqs[i].mpc_irqtype &&
35622 + (bus == lbus) &&
35623 + (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) {
35624 + int irq = pin_2_irq(i,apic,mp_irqs[i].mpc_dstirq);
35625 +
35626 + if (!(apic || IO_APIC_IRQ(irq)))
35627 + continue;
35628 +
35629 + if (pin == (mp_irqs[i].mpc_srcbusirq & 3))
35630 + return irq;
35631 + /*
35632 + * Use the first all-but-pin matching entry as a
35633 + * best-guess fuzzy result for broken mptables.
35634 + */
35635 + if (best_guess < 0)
35636 + best_guess = irq;
35637 + }
35638 + }
35639 + BUG_ON(best_guess >= NR_IRQS);
35640 + return best_guess;
35641 +}
35642 +
35643 +/*
35644 + * EISA Edge/Level control register, ELCR
35645 + */
35646 +static int EISA_ELCR(unsigned int irq)
35647 +{
35648 + if (irq < 16) {
35649 + unsigned int port = 0x4d0 + (irq >> 3);
35650 + return (inb(port) >> (irq & 7)) & 1;
35651 + }
35652 + apic_printk(APIC_VERBOSE, "Broken MPtable reports ISA irq %d\n", irq);
35653 + return 0;
35654 +}
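/*
 * The ELCR occupies two I/O ports: 0x4d0 covers IRQs 0-7 and 0x4d1 covers
 * IRQs 8-15, one bit per IRQ. A set bit means the IRQ is level-triggered,
 * which is why EISA_ELCR() feeds default_EISA_trigger() below.
 */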
35655 +
35656 +/* EISA interrupts are always polarity zero and can be edge or level
35657 + * triggered depending on the ELCR value. If an interrupt is listed as
35658 + * EISA conforming in the MP table, that means its trigger type must
35659 + * be read in from the ELCR */
35660 +
35661 +#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mpc_srcbusirq))
35662 +#define default_EISA_polarity(idx) (0)
35663 +
35664 +/* ISA interrupts are always polarity zero edge triggered,
35665 + * when listed as conforming in the MP table. */
35666 +
35667 +#define default_ISA_trigger(idx) (0)
35668 +#define default_ISA_polarity(idx) (0)
35669 +
35670 +/* PCI interrupts are always polarity one level triggered,
35671 + * when listed as conforming in the MP table. */
35672 +
35673 +#define default_PCI_trigger(idx) (1)
35674 +#define default_PCI_polarity(idx) (1)
35675 +
35676 +/* MCA interrupts are always polarity zero level triggered,
35677 + * when listed as conforming in the MP table. */
35678 +
35679 +#define default_MCA_trigger(idx) (1)
35680 +#define default_MCA_polarity(idx) (0)
35681 +
35682 +static int __init MPBIOS_polarity(int idx)
35683 +{
35684 + int bus = mp_irqs[idx].mpc_srcbus;
35685 + int polarity;
35686 +
35687 + /*
35688 + * Determine IRQ line polarity (high active or low active):
35689 + */
35690 + switch (mp_irqs[idx].mpc_irqflag & 3)
35691 + {
35692 + case 0: /* conforms, ie. bus-type dependent polarity */
35693 + {
35694 + switch (mp_bus_id_to_type[bus])
35695 + {
35696 + case MP_BUS_ISA: /* ISA pin */
35697 + {
35698 + polarity = default_ISA_polarity(idx);
35699 + break;
35700 + }
35701 + case MP_BUS_EISA: /* EISA pin */
35702 + {
35703 + polarity = default_EISA_polarity(idx);
35704 + break;
35705 + }
35706 + case MP_BUS_PCI: /* PCI pin */
35707 + {
35708 + polarity = default_PCI_polarity(idx);
35709 + break;
35710 + }
35711 + case MP_BUS_MCA: /* MCA pin */
35712 + {
35713 + polarity = default_MCA_polarity(idx);
35714 + break;
35715 + }
35716 + default:
35717 + {
35718 + printk(KERN_WARNING "broken BIOS!!\n");
35719 + polarity = 1;
35720 + break;
35721 + }
35722 + }
35723 + break;
35724 + }
35725 + case 1: /* high active */
35726 + {
35727 + polarity = 0;
35728 + break;
35729 + }
35730 + case 2: /* reserved */
35731 + {
35732 + printk(KERN_WARNING "broken BIOS!!\n");
35733 + polarity = 1;
35734 + break;
35735 + }
35736 + case 3: /* low active */
35737 + {
35738 + polarity = 1;
35739 + break;
35740 + }
35741 + default: /* invalid */
35742 + {
35743 + printk(KERN_WARNING "broken BIOS!!\n");
35744 + polarity = 1;
35745 + break;
35746 + }
35747 + }
35748 + return polarity;
35749 +}
35750 +
35751 +static int MPBIOS_trigger(int idx)
35752 +{
35753 + int bus = mp_irqs[idx].mpc_srcbus;
35754 + int trigger;
35755 +
35756 + /*
35757 + * Determine IRQ trigger mode (edge or level sensitive):
35758 + */
35759 + switch ((mp_irqs[idx].mpc_irqflag>>2) & 3)
35760 + {
35761 + case 0: /* conforms, ie. bus-type dependent */
35762 + {
35763 + switch (mp_bus_id_to_type[bus])
35764 + {
35765 + case MP_BUS_ISA: /* ISA pin */
35766 + {
35767 + trigger = default_ISA_trigger(idx);
35768 + break;
35769 + }
35770 + case MP_BUS_EISA: /* EISA pin */
35771 + {
35772 + trigger = default_EISA_trigger(idx);
35773 + break;
35774 + }
35775 + case MP_BUS_PCI: /* PCI pin */
35776 + {
35777 + trigger = default_PCI_trigger(idx);
35778 + break;
35779 + }
35780 + case MP_BUS_MCA: /* MCA pin */
35781 + {
35782 + trigger = default_MCA_trigger(idx);
35783 + break;
35784 + }
35785 + default:
35786 + {
35787 + printk(KERN_WARNING "broken BIOS!!\n");
35788 + trigger = 1;
35789 + break;
35790 + }
35791 + }
35792 + break;
35793 + }
35794 + case 1: /* edge */
35795 + {
35796 + trigger = 0;
35797 + break;
35798 + }
35799 + case 2: /* reserved */
35800 + {
35801 + printk(KERN_WARNING "broken BIOS!!\n");
35802 + trigger = 1;
35803 + break;
35804 + }
35805 + case 3: /* level */
35806 + {
35807 + trigger = 1;
35808 + break;
35809 + }
35810 + default: /* invalid */
35811 + {
35812 + printk(KERN_WARNING "broken BIOS!!\n");
35813 + trigger = 0;
35814 + break;
35815 + }
35816 + }
35817 + return trigger;
35818 +}
35819 +
35820 +static inline int irq_polarity(int idx)
35821 +{
35822 + return MPBIOS_polarity(idx);
35823 +}
35824 +
35825 +static inline int irq_trigger(int idx)
35826 +{
35827 + return MPBIOS_trigger(idx);
35828 +}
35829 +
35830 +static int next_irq = 16;
35831 +
35832 +/*
35833 + * gsi_irq_sharing -- Name overload! "irq" can be either a legacy IRQ
35834 + * in the range 0-15, a linux IRQ in the range 0-223, or a GSI number
35835 + * from ACPI, which can reach 800 in large boxen.
35836 + *
35837 + * Compact the sparse GSI space into a sequential IRQ series and reuse
35838 + * vectors if possible.
35839 + */
35840 +int gsi_irq_sharing(int gsi)
35841 +{
35842 + int i, tries, vector;
35843 +
35844 + BUG_ON(gsi >= NR_IRQ_VECTORS);
35845 +
35846 + if (platform_legacy_irq(gsi))
35847 + return gsi;
35848 +
35849 + if (gsi_2_irq[gsi] != 0xFF)
35850 + return (int)gsi_2_irq[gsi];
35851 +
35852 + tries = NR_IRQS;
35853 + try_again:
35854 + vector = assign_irq_vector(gsi);
35855 +
35856 + /*
35857 + * Sharing vectors means sharing IRQs, so scan irq_vectors for previous
35858 + * use of vector and if found, return that IRQ. However, we never want
35859 + * to share legacy IRQs, which usually have a different trigger mode
35860 + * than PCI.
35861 + */
35862 + for (i = 0; i < NR_IRQS; i++)
35863 + if (IO_APIC_VECTOR(i) == vector)
35864 + break;
35865 + if (platform_legacy_irq(i)) {
35866 + if (--tries >= 0) {
35867 + IO_APIC_VECTOR(i) = 0;
35868 + goto try_again;
35869 + }
35870 + panic("gsi_irq_sharing: didn't find an IRQ using vector 0x%02X for GSI %d", vector, gsi);
35871 + }
35872 + if (i < NR_IRQS) {
35873 + gsi_2_irq[gsi] = i;
35874 + printk(KERN_INFO "GSI %d sharing vector 0x%02X and IRQ %d\n",
35875 + gsi, vector, i);
35876 + return i;
35877 + }
35878 +
35879 + i = next_irq++;
35880 + BUG_ON(i >= NR_IRQS);
35881 + gsi_2_irq[gsi] = i;
35882 + IO_APIC_VECTOR(i) = vector;
35883 + printk(KERN_INFO "GSI %d assigned vector 0x%02X and IRQ %d\n",
35884 + gsi, vector, i);
35885 + return i;
35886 +}
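/*
 * gsi_irq_sharing() works by asking assign_irq_vector() for a vector for
 * the GSI: if some non-legacy IRQ already owns that vector, the GSI is
 * mapped onto it (true sharing); otherwise a fresh IRQ number is taken
 * from next_irq (which starts above the legacy range at 16). Either way
 * the result is cached in gsi_2_irq[] so repeated lookups are O(1).
 */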
35887 +
35888 +static int pin_2_irq(int idx, int apic, int pin)
35889 +{
35890 + int irq, i;
35891 + int bus = mp_irqs[idx].mpc_srcbus;
35892 +
35893 + /*
35894 + * Debugging check, we are in big trouble if this message pops up!
35895 + */
35896 + if (mp_irqs[idx].mpc_dstirq != pin)
35897 + printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n");
35898 +
35899 + switch (mp_bus_id_to_type[bus])
35900 + {
35901 + case MP_BUS_ISA: /* ISA pin */
35902 + case MP_BUS_EISA:
35903 + case MP_BUS_MCA:
35904 + {
35905 + irq = mp_irqs[idx].mpc_srcbusirq;
35906 + break;
35907 + }
35908 + case MP_BUS_PCI: /* PCI pin */
35909 + {
35910 + /*
35911 + * PCI IRQs are mapped in order
35912 + */
35913 + i = irq = 0;
35914 + while (i < apic)
35915 + irq += nr_ioapic_registers[i++];
35916 + irq += pin;
35917 + irq = gsi_irq_sharing(irq);
35918 + break;
35919 + }
35920 + default:
35921 + {
35922 + printk(KERN_ERR "unknown bus type %d.\n",bus);
35923 + irq = 0;
35924 + break;
35925 + }
35926 + }
35927 + BUG_ON(irq >= NR_IRQS);
35928 +
35929 + /*
35930 + * PCI IRQ command line redirection. Yes, limits are hardcoded.
35931 + */
35932 + if ((pin >= 16) && (pin <= 23)) {
35933 + if (pirq_entries[pin-16] != -1) {
35934 + if (!pirq_entries[pin-16]) {
35935 + apic_printk(APIC_VERBOSE, "disabling PIRQ%d\n", pin-16);
35936 + } else {
35937 + irq = pirq_entries[pin-16];
35938 + apic_printk(APIC_VERBOSE, "using PIRQ%d -> IRQ %d\n",
35939 + pin-16, irq);
35940 + }
35941 + }
35942 + }
35943 + BUG_ON(irq >= NR_IRQS);
35944 + return irq;
35945 +}
35946 +
35947 +static inline int IO_APIC_irq_trigger(int irq)
35948 +{
35949 + int apic, idx, pin;
35950 +
35951 + for (apic = 0; apic < nr_ioapics; apic++) {
35952 + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
35953 + idx = find_irq_entry(apic,pin,mp_INT);
35954 + if ((idx != -1) && (irq == pin_2_irq(idx,apic,pin)))
35955 + return irq_trigger(idx);
35956 + }
35957 + }
35958 + /*
35959 + * nonexistent IRQs are edge default
35960 + */
35961 + return 0;
35962 +}
35963 +
35964 +/* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. */
35965 +u8 irq_vector[NR_IRQ_VECTORS] __read_mostly;
35966 +
35967 +int assign_irq_vector(int irq)
35968 +{
35969 + struct physdev_irq irq_op;
35970 +
35971 + BUG_ON(irq != AUTO_ASSIGN && (unsigned)irq >= NR_IRQ_VECTORS);
35972 + if (irq != AUTO_ASSIGN && IO_APIC_VECTOR(irq) > 0)
35973 + return IO_APIC_VECTOR(irq);
35974 +
35975 + irq_op.irq = irq;
35976 + if (HYPERVISOR_physdev_op(PHYSDEVOP_alloc_irq_vector, &irq_op))
35977 + return -ENOSPC;
35978 +
35979 + vector_irq[irq_op.vector] = irq;
35980 + if (irq != AUTO_ASSIGN)
35981 + IO_APIC_VECTOR(irq) = irq_op.vector;
35982 +
35983 + return irq_op.vector;
35984 +}
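/*
 * In this Xen variant vector allocation is delegated to the hypervisor:
 * assign_irq_vector() issues a PHYSDEVOP_alloc_irq_vector hypercall
 * instead of picking from a local vector map, records the reverse mapping
 * in vector_irq[] and caches the result in IO_APIC_VECTOR(irq) so later
 * calls for the same IRQ return the same vector.
 */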
35985 +
35986 +extern void (*interrupt[NR_IRQS])(void);
35987 +#ifndef CONFIG_XEN
35988 +static struct hw_interrupt_type ioapic_level_type;
35989 +static struct hw_interrupt_type ioapic_edge_type;
35990 +
35991 +#define IOAPIC_AUTO -1
35992 +#define IOAPIC_EDGE 0
35993 +#define IOAPIC_LEVEL 1
35994 +
35995 +static inline void ioapic_register_intr(int irq, int vector, unsigned long trigger)
35996 +{
35997 + if (use_pci_vector() && !platform_legacy_irq(irq)) {
35998 + if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
35999 + trigger == IOAPIC_LEVEL)
36000 + irq_desc[vector].handler = &ioapic_level_type;
36001 + else
36002 + irq_desc[vector].handler = &ioapic_edge_type;
36003 + set_intr_gate(vector, interrupt[vector]);
36004 + } else {
36005 + if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
36006 + trigger == IOAPIC_LEVEL)
36007 + irq_desc[irq].handler = &ioapic_level_type;
36008 + else
36009 + irq_desc[irq].handler = &ioapic_edge_type;
36010 + set_intr_gate(vector, interrupt[irq]);
36011 + }
36012 +}
36013 +#else
36014 +#define ioapic_register_intr(_irq,_vector,_trigger) ((void)0)
36015 +#endif /* !CONFIG_XEN */
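/*
 * Under Xen ioapic_register_intr() is stubbed out: the edge/level
 * hw_interrupt_type handlers below are only meaningful for native IO-APIC
 * delivery, while a Xen guest's IRQs are expected to be driven through
 * the event-channel machinery instead.
 */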
36016 +
36017 +static void __init setup_IO_APIC_irqs(void)
36018 +{
36019 + struct IO_APIC_route_entry entry;
36020 + int apic, pin, idx, irq, first_notcon = 1, vector;
36021 + unsigned long flags;
36022 +
36023 + apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
36024 +
36025 + for (apic = 0; apic < nr_ioapics; apic++) {
36026 + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
36027 +
36028 + /*
36029 + * add it to the IO-APIC irq-routing table:
36030 + */
36031 + memset(&entry,0,sizeof(entry));
36032 +
36033 + entry.delivery_mode = INT_DELIVERY_MODE;
36034 + entry.dest_mode = INT_DEST_MODE;
36035 + entry.mask = 0; /* enable IRQ */
36036 + entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
36037 +
36038 + idx = find_irq_entry(apic,pin,mp_INT);
36039 + if (idx == -1) {
36040 + if (first_notcon) {
36041 + apic_printk(APIC_VERBOSE, KERN_DEBUG " IO-APIC (apicid-pin) %d-%d", mp_ioapics[apic].mpc_apicid, pin);
36042 + first_notcon = 0;
36043 + } else
36044 + apic_printk(APIC_VERBOSE, ", %d-%d", mp_ioapics[apic].mpc_apicid, pin);
36045 + continue;
36046 + }
36047 +
36048 + entry.trigger = irq_trigger(idx);
36049 + entry.polarity = irq_polarity(idx);
36050 +
36051 + if (irq_trigger(idx)) {
36052 + entry.trigger = 1;
36053 + entry.mask = 1;
36054 + entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
36055 + }
36056 +
36057 + irq = pin_2_irq(idx, apic, pin);
36058 + add_pin_to_irq(irq, apic, pin);
36059 +
36060 + if (/* !apic && */ !IO_APIC_IRQ(irq))
36061 + continue;
36062 +
36063 + if (IO_APIC_IRQ(irq)) {
36064 + vector = assign_irq_vector(irq);
36065 + entry.vector = vector;
36066 +
36067 + ioapic_register_intr(irq, vector, IOAPIC_AUTO);
36068 + if (!apic && (irq < 16))
36069 + disable_8259A_irq(irq);
36070 + }
36071 + spin_lock_irqsave(&ioapic_lock, flags);
36072 + io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1));
36073 + io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0));
36074 + set_native_irq_info(irq, TARGET_CPUS);
36075 + spin_unlock_irqrestore(&ioapic_lock, flags);
36076 + }
36077 + }
36078 +
36079 + if (!first_notcon)
36080 + apic_printk(APIC_VERBOSE," not connected.\n");
36081 +}
36082 +
36083 +#ifndef CONFIG_XEN
36084 +/*
36085 + * Set up the 8259A-master output pin as broadcast to all
36086 + * CPUs.
36087 + */
36088 +static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, int vector)
36089 +{
36090 + struct IO_APIC_route_entry entry;
36091 + unsigned long flags;
36092 +
36093 + memset(&entry,0,sizeof(entry));
36094 +
36095 + disable_8259A_irq(0);
36096 +
36097 + /* mask LVT0 */
36098 + apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
36099 +
36100 + /*
36101 + * We use logical delivery to get the timer IRQ
36102 + * to the first CPU.
36103 + */
36104 + entry.dest_mode = INT_DEST_MODE;
36105 + entry.mask = 0; /* unmask IRQ now */
36106 + entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
36107 + entry.delivery_mode = INT_DELIVERY_MODE;
36108 + entry.polarity = 0;
36109 + entry.trigger = 0;
36110 + entry.vector = vector;
36111 +
36112 + /*
36113 + * The timer IRQ doesn't have to know that behind the
36114 +	 * scenes we have an 8259A-master in AEOI mode ...
36115 + */
36116 + irq_desc[0].handler = &ioapic_edge_type;
36117 +
36118 + /*
36119 + * Add it to the IO-APIC irq-routing table:
36120 + */
36121 + spin_lock_irqsave(&ioapic_lock, flags);
36122 + io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1));
36123 + io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0));
36124 + spin_unlock_irqrestore(&ioapic_lock, flags);
36125 +
36126 + enable_8259A_irq(0);
36127 +}
36128 +
36129 +void __init UNEXPECTED_IO_APIC(void)
36130 +{
36131 +}
36132 +
36133 +void __apicdebuginit print_IO_APIC(void)
36134 +{
36135 + int apic, i;
36136 + union IO_APIC_reg_00 reg_00;
36137 + union IO_APIC_reg_01 reg_01;
36138 + union IO_APIC_reg_02 reg_02;
36139 + unsigned long flags;
36140 +
36141 + if (apic_verbosity == APIC_QUIET)
36142 + return;
36143 +
36144 + printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
36145 + for (i = 0; i < nr_ioapics; i++)
36146 + printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
36147 + mp_ioapics[i].mpc_apicid, nr_ioapic_registers[i]);
36148 +
36149 + /*
36150 + * We are a bit conservative about what we expect. We have to
36151 + * know about every hardware change ASAP.
36152 + */
36153 + printk(KERN_INFO "testing the IO APIC.......................\n");
36154 +
36155 + for (apic = 0; apic < nr_ioapics; apic++) {
36156 +
36157 + spin_lock_irqsave(&ioapic_lock, flags);
36158 + reg_00.raw = io_apic_read(apic, 0);
36159 + reg_01.raw = io_apic_read(apic, 1);
36160 + if (reg_01.bits.version >= 0x10)
36161 + reg_02.raw = io_apic_read(apic, 2);
36162 + spin_unlock_irqrestore(&ioapic_lock, flags);
36163 +
36164 + printk("\n");
36165 + printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid);
36166 + printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
36167 + printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID);
36168 + if (reg_00.bits.__reserved_1 || reg_00.bits.__reserved_2)
36169 + UNEXPECTED_IO_APIC();
36170 +
36171 + printk(KERN_DEBUG ".... register #01: %08X\n", *(int *)&reg_01);
36172 + printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries);
36173 + if ( (reg_01.bits.entries != 0x0f) && /* older (Neptune) boards */
36174 + (reg_01.bits.entries != 0x17) && /* typical ISA+PCI boards */
36175 + (reg_01.bits.entries != 0x1b) && /* Compaq Proliant boards */
36176 + (reg_01.bits.entries != 0x1f) && /* dual Xeon boards */
36177 + (reg_01.bits.entries != 0x22) && /* bigger Xeon boards */
36178 + (reg_01.bits.entries != 0x2E) &&
36179 + (reg_01.bits.entries != 0x3F) &&
36180 + (reg_01.bits.entries != 0x03)
36181 + )
36182 + UNEXPECTED_IO_APIC();
36183 +
36184 + printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ);
36185 + printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version);
36186 + if ( (reg_01.bits.version != 0x01) && /* 82489DX IO-APICs */
36187 + (reg_01.bits.version != 0x02) && /* 82801BA IO-APICs (ICH2) */
36188 + (reg_01.bits.version != 0x10) && /* oldest IO-APICs */
36189 + (reg_01.bits.version != 0x11) && /* Pentium/Pro IO-APICs */
36190 + (reg_01.bits.version != 0x13) && /* Xeon IO-APICs */
36191 + (reg_01.bits.version != 0x20) /* Intel P64H (82806 AA) */
36192 + )
36193 + UNEXPECTED_IO_APIC();
36194 + if (reg_01.bits.__reserved_1 || reg_01.bits.__reserved_2)
36195 + UNEXPECTED_IO_APIC();
36196 +
36197 + if (reg_01.bits.version >= 0x10) {
36198 + printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw);
36199 + printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration);
36200 + if (reg_02.bits.__reserved_1 || reg_02.bits.__reserved_2)
36201 + UNEXPECTED_IO_APIC();
36202 + }
36203 +
36204 + printk(KERN_DEBUG ".... IRQ redirection table:\n");
36205 +
36206 + printk(KERN_DEBUG " NR Log Phy Mask Trig IRR Pol"
36207 + " Stat Dest Deli Vect: \n");
36208 +
36209 + for (i = 0; i <= reg_01.bits.entries; i++) {
36210 + struct IO_APIC_route_entry entry;
36211 +
36212 + spin_lock_irqsave(&ioapic_lock, flags);
36213 + *(((int *)&entry)+0) = io_apic_read(apic, 0x10+i*2);
36214 + *(((int *)&entry)+1) = io_apic_read(apic, 0x11+i*2);
36215 + spin_unlock_irqrestore(&ioapic_lock, flags);
36216 +
36217 + printk(KERN_DEBUG " %02x %03X %02X ",
36218 + i,
36219 + entry.dest.logical.logical_dest,
36220 + entry.dest.physical.physical_dest
36221 + );
36222 +
36223 + printk("%1d %1d %1d %1d %1d %1d %1d %02X\n",
36224 + entry.mask,
36225 + entry.trigger,
36226 + entry.irr,
36227 + entry.polarity,
36228 + entry.delivery_status,
36229 + entry.dest_mode,
36230 + entry.delivery_mode,
36231 + entry.vector
36232 + );
36233 + }
36234 + }
36235 + if (use_pci_vector())
36236 + printk(KERN_INFO "Using vector-based indexing\n");
36237 + printk(KERN_DEBUG "IRQ to pin mappings:\n");
36238 + for (i = 0; i < NR_IRQS; i++) {
36239 + struct irq_pin_list *entry = irq_2_pin + i;
36240 + if (entry->pin < 0)
36241 + continue;
36242 + if (use_pci_vector() && !platform_legacy_irq(i))
36243 + printk(KERN_DEBUG "IRQ%d ", IO_APIC_VECTOR(i));
36244 + else
36245 + printk(KERN_DEBUG "IRQ%d ", i);
36246 + for (;;) {
36247 + printk("-> %d:%d", entry->apic, entry->pin);
36248 + if (!entry->next)
36249 + break;
36250 + entry = irq_2_pin + entry->next;
36251 + }
36252 + printk("\n");
36253 + }
36254 +
36255 + printk(KERN_INFO ".................................... done.\n");
36256 +
36257 + return;
36258 +}
36259 +
36260 +#if 0
36261 +
36262 +static __apicdebuginit void print_APIC_bitfield (int base)
36263 +{
36264 + unsigned int v;
36265 + int i, j;
36266 +
36267 + if (apic_verbosity == APIC_QUIET)
36268 + return;
36269 +
36270 + printk(KERN_DEBUG "0123456789abcdef0123456789abcdef\n" KERN_DEBUG);
36271 + for (i = 0; i < 8; i++) {
36272 + v = apic_read(base + i*0x10);
36273 + for (j = 0; j < 32; j++) {
36274 + if (v & (1<<j))
36275 + printk("1");
36276 + else
36277 + printk("0");
36278 + }
36279 + printk("\n");
36280 + }
36281 +}
36282 +
36283 +void __apicdebuginit print_local_APIC(void * dummy)
36284 +{
36285 + unsigned int v, ver, maxlvt;
36286 +
36287 + if (apic_verbosity == APIC_QUIET)
36288 + return;
36289 +
36290 + printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
36291 + smp_processor_id(), hard_smp_processor_id());
36292 + v = apic_read(APIC_ID);
36293 + printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, GET_APIC_ID(v));
36294 + v = apic_read(APIC_LVR);
36295 + printk(KERN_INFO "... APIC VERSION: %08x\n", v);
36296 + ver = GET_APIC_VERSION(v);
36297 + maxlvt = get_maxlvt();
36298 +
36299 + v = apic_read(APIC_TASKPRI);
36300 + printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK);
36301 +
36302 + v = apic_read(APIC_ARBPRI);
36303 + printk(KERN_DEBUG "... APIC ARBPRI: %08x (%02x)\n", v,
36304 + v & APIC_ARBPRI_MASK);
36305 + v = apic_read(APIC_PROCPRI);
36306 + printk(KERN_DEBUG "... APIC PROCPRI: %08x\n", v);
36307 +
36308 + v = apic_read(APIC_EOI);
36309 + printk(KERN_DEBUG "... APIC EOI: %08x\n", v);
36310 + v = apic_read(APIC_RRR);
36311 + printk(KERN_DEBUG "... APIC RRR: %08x\n", v);
36312 + v = apic_read(APIC_LDR);
36313 + printk(KERN_DEBUG "... APIC LDR: %08x\n", v);
36314 + v = apic_read(APIC_DFR);
36315 + printk(KERN_DEBUG "... APIC DFR: %08x\n", v);
36316 + v = apic_read(APIC_SPIV);
36317 + printk(KERN_DEBUG "... APIC SPIV: %08x\n", v);
36318 +
36319 + printk(KERN_DEBUG "... APIC ISR field:\n");
36320 + print_APIC_bitfield(APIC_ISR);
36321 + printk(KERN_DEBUG "... APIC TMR field:\n");
36322 + print_APIC_bitfield(APIC_TMR);
36323 + printk(KERN_DEBUG "... APIC IRR field:\n");
36324 + print_APIC_bitfield(APIC_IRR);
36325 +
36326 + v = apic_read(APIC_ESR);
36327 + printk(KERN_DEBUG "... APIC ESR: %08x\n", v);
36328 +
36329 + v = apic_read(APIC_ICR);
36330 + printk(KERN_DEBUG "... APIC ICR: %08x\n", v);
36331 + v = apic_read(APIC_ICR2);
36332 + printk(KERN_DEBUG "... APIC ICR2: %08x\n", v);
36333 +
36334 + v = apic_read(APIC_LVTT);
36335 + printk(KERN_DEBUG "... APIC LVTT: %08x\n", v);
36336 +
36337 + if (maxlvt > 3) { /* PC is LVT#4. */
36338 + v = apic_read(APIC_LVTPC);
36339 + printk(KERN_DEBUG "... APIC LVTPC: %08x\n", v);
36340 + }
36341 + v = apic_read(APIC_LVT0);
36342 + printk(KERN_DEBUG "... APIC LVT0: %08x\n", v);
36343 + v = apic_read(APIC_LVT1);
36344 + printk(KERN_DEBUG "... APIC LVT1: %08x\n", v);
36345 +
36346 + if (maxlvt > 2) { /* ERR is LVT#3. */
36347 + v = apic_read(APIC_LVTERR);
36348 + printk(KERN_DEBUG "... APIC LVTERR: %08x\n", v);
36349 + }
36350 +
36351 + v = apic_read(APIC_TMICT);
36352 + printk(KERN_DEBUG "... APIC TMICT: %08x\n", v);
36353 + v = apic_read(APIC_TMCCT);
36354 + printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v);
36355 + v = apic_read(APIC_TDCR);
36356 + printk(KERN_DEBUG "... APIC TDCR: %08x\n", v);
36357 + printk("\n");
36358 +}
36359 +
36360 +void print_all_local_APICs (void)
36361 +{
36362 + on_each_cpu(print_local_APIC, NULL, 1, 1);
36363 +}
36364 +
36365 +void __apicdebuginit print_PIC(void)
36366 +{
36367 + unsigned int v;
36368 + unsigned long flags;
36369 +
36370 + if (apic_verbosity == APIC_QUIET)
36371 + return;
36372 +
36373 + printk(KERN_DEBUG "\nprinting PIC contents\n");
36374 +
36375 + spin_lock_irqsave(&i8259A_lock, flags);
36376 +
36377 + v = inb(0xa1) << 8 | inb(0x21);
36378 + printk(KERN_DEBUG "... PIC IMR: %04x\n", v);
36379 +
36380 + v = inb(0xa0) << 8 | inb(0x20);
36381 + printk(KERN_DEBUG "... PIC IRR: %04x\n", v);
36382 +
36383 + outb(0x0b,0xa0);
36384 + outb(0x0b,0x20);
36385 + v = inb(0xa0) << 8 | inb(0x20);
36386 + outb(0x0a,0xa0);
36387 + outb(0x0a,0x20);
36388 +
36389 + spin_unlock_irqrestore(&i8259A_lock, flags);
36390 +
36391 + printk(KERN_DEBUG "... PIC ISR: %04x\n", v);
36392 +
36393 + v = inb(0x4d1) << 8 | inb(0x4d0);
36394 + printk(KERN_DEBUG "... PIC ELCR: %04x\n", v);
36395 +}
36396 +
36397 +#endif /* 0 */
36398 +
36399 +#else
36400 +void __init print_IO_APIC(void) { }
36401 +#endif /* !CONFIG_XEN */
36402 +
36403 +static void __init enable_IO_APIC(void)
36404 +{
36405 + union IO_APIC_reg_01 reg_01;
36406 +#ifndef CONFIG_XEN
36407 + int i8259_apic, i8259_pin;
36408 +#endif
36409 + int i, apic;
36410 + unsigned long flags;
36411 +
36412 + for (i = 0; i < PIN_MAP_SIZE; i++) {
36413 + irq_2_pin[i].pin = -1;
36414 + irq_2_pin[i].next = 0;
36415 + }
36416 + if (!pirqs_enabled)
36417 + for (i = 0; i < MAX_PIRQS; i++)
36418 + pirq_entries[i] = -1;
36419 +
36420 + /*
36421 + * The number of IO-APIC IRQ registers (== #pins):
36422 + */
36423 + for (apic = 0; apic < nr_ioapics; apic++) {
36424 + spin_lock_irqsave(&ioapic_lock, flags);
36425 + reg_01.raw = io_apic_read(apic, 1);
36426 + spin_unlock_irqrestore(&ioapic_lock, flags);
36427 + nr_ioapic_registers[apic] = reg_01.bits.entries+1;
36428 + }
36429 +#ifndef CONFIG_XEN
36430 + for(apic = 0; apic < nr_ioapics; apic++) {
36431 + int pin;
36432 + /* See if any of the pins is in ExtINT mode */
36433 + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
36434 + struct IO_APIC_route_entry entry;
36435 + spin_lock_irqsave(&ioapic_lock, flags);
36436 + *(((int *)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin);
36437 + *(((int *)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin);
36438 + spin_unlock_irqrestore(&ioapic_lock, flags);
36439 +
36440 +
36441 + /* If the interrupt line is enabled and in ExtInt mode
36442 + * I have found the pin where the i8259 is connected.
36443 + */
36444 + if ((entry.mask == 0) && (entry.delivery_mode == dest_ExtINT)) {
36445 + ioapic_i8259.apic = apic;
36446 + ioapic_i8259.pin = pin;
36447 + goto found_i8259;
36448 + }
36449 + }
36450 + }
36451 + found_i8259:
36452 +	/* See if the MP table has reported the ExtINT */
36453 + i8259_pin = find_isa_irq_pin(0, mp_ExtINT);
36454 + i8259_apic = find_isa_irq_apic(0, mp_ExtINT);
36455 + /* Trust the MP table if nothing is setup in the hardware */
36456 + if ((ioapic_i8259.pin == -1) && (i8259_pin >= 0)) {
36457 + printk(KERN_WARNING "ExtINT not setup in hardware but reported by MP table\n");
36458 + ioapic_i8259.pin = i8259_pin;
36459 + ioapic_i8259.apic = i8259_apic;
36460 + }
36461 + /* Complain if the MP table and the hardware disagree */
36462 + if (((ioapic_i8259.apic != i8259_apic) || (ioapic_i8259.pin != i8259_pin)) &&
36463 + (i8259_pin >= 0) && (ioapic_i8259.pin >= 0))
36464 + {
36465 + printk(KERN_WARNING "ExtINT in hardware and MP table differ\n");
36466 + }
36467 +#endif
36468 +
36469 + /*
36470 + * Do not trust the IO-APIC being empty at bootup
36471 + */
36472 + clear_IO_APIC();
36473 +}
36474 +
36475 +/*
36476 + * Not an __init, needed by the reboot code
36477 + */
36478 +void disable_IO_APIC(void)
36479 +{
36480 + /*
36481 + * Clear the IO-APIC before rebooting:
36482 + */
36483 + clear_IO_APIC();
36484 +
36485 +#ifndef CONFIG_XEN
36486 + /*
36487 + * If the i8259 is routed through an IOAPIC
36488 + * Put that IOAPIC in virtual wire mode
36489 + * so legacy interrupts can be delivered.
36490 + */
36491 + if (ioapic_i8259.pin != -1) {
36492 + struct IO_APIC_route_entry entry;
36493 + unsigned long flags;
36494 +
36495 + memset(&entry, 0, sizeof(entry));
36496 + entry.mask = 0; /* Enabled */
36497 + entry.trigger = 0; /* Edge */
36498 + entry.irr = 0;
36499 + entry.polarity = 0; /* High */
36500 + entry.delivery_status = 0;
36501 + entry.dest_mode = 0; /* Physical */
36502 + entry.delivery_mode = dest_ExtINT; /* ExtInt */
36503 + entry.vector = 0;
36504 + entry.dest.physical.physical_dest =
36505 + GET_APIC_ID(apic_read(APIC_ID));
36506 +
36507 + /*
36508 + * Add it to the IO-APIC irq-routing table:
36509 + */
36510 + spin_lock_irqsave(&ioapic_lock, flags);
36511 + io_apic_write(ioapic_i8259.apic, 0x11+2*ioapic_i8259.pin,
36512 + *(((int *)&entry)+1));
36513 + io_apic_write(ioapic_i8259.apic, 0x10+2*ioapic_i8259.pin,
36514 + *(((int *)&entry)+0));
36515 + spin_unlock_irqrestore(&ioapic_lock, flags);
36516 + }
36517 +
36518 + disconnect_bsp_APIC(ioapic_i8259.pin != -1);
36519 +#endif
36520 +}
36521 +
36522 +/*
36523 + * function to set the IO-APIC physical IDs based on the
36524 + * values stored in the MPC table.
36525 + *
36526 + * by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999
36527 + */
36528 +
36529 +#ifndef CONFIG_XEN
36530 +static void __init setup_ioapic_ids_from_mpc (void)
36531 +{
36532 + union IO_APIC_reg_00 reg_00;
36533 + int apic;
36534 + int i;
36535 + unsigned char old_id;
36536 + unsigned long flags;
36537 +
36538 + /*
36539 + * Set the IOAPIC ID to the value stored in the MPC table.
36540 + */
36541 + for (apic = 0; apic < nr_ioapics; apic++) {
36542 +
36543 + /* Read the register 0 value */
36544 + spin_lock_irqsave(&ioapic_lock, flags);
36545 + reg_00.raw = io_apic_read(apic, 0);
36546 + spin_unlock_irqrestore(&ioapic_lock, flags);
36547 +
36548 + old_id = mp_ioapics[apic].mpc_apicid;
36549 +
36550 +
36551 + printk(KERN_INFO "Using IO-APIC %d\n", mp_ioapics[apic].mpc_apicid);
36552 +
36553 +
36554 + /*
36555 + * We need to adjust the IRQ routing table
36556 + * if the ID changed.
36557 + */
36558 + if (old_id != mp_ioapics[apic].mpc_apicid)
36559 + for (i = 0; i < mp_irq_entries; i++)
36560 + if (mp_irqs[i].mpc_dstapic == old_id)
36561 + mp_irqs[i].mpc_dstapic
36562 + = mp_ioapics[apic].mpc_apicid;
36563 +
36564 + /*
36565 + * Read the right value from the MPC table and
36566 + * write it into the ID register.
36567 + */
36568 + apic_printk(APIC_VERBOSE,KERN_INFO "...changing IO-APIC physical APIC ID to %d ...",
36569 + mp_ioapics[apic].mpc_apicid);
36570 +
36571 + reg_00.bits.ID = mp_ioapics[apic].mpc_apicid;
36572 + spin_lock_irqsave(&ioapic_lock, flags);
36573 + io_apic_write(apic, 0, reg_00.raw);
36574 + spin_unlock_irqrestore(&ioapic_lock, flags);
36575 +
36576 + /*
36577 + * Sanity check
36578 + */
36579 + spin_lock_irqsave(&ioapic_lock, flags);
36580 + reg_00.raw = io_apic_read(apic, 0);
36581 + spin_unlock_irqrestore(&ioapic_lock, flags);
36582 + if (reg_00.bits.ID != mp_ioapics[apic].mpc_apicid)
36583 + printk("could not set ID!\n");
36584 + else
36585 + apic_printk(APIC_VERBOSE," ok.\n");
36586 + }
36587 +}
36588 +#else
36589 +static void __init setup_ioapic_ids_from_mpc(void) { }
36590 +#endif
36591 +
36592 +/*
36593 + * There is a nasty bug in some older SMP boards: their mptable lies
36594 + * about the timer IRQ. We do the following to work around the situation:
36595 + *
36596 + * - timer IRQ defaults to IO-APIC IRQ
36597 + * - if this function detects that timer IRQs are defunct, then we fall
36598 + * back to ISA timer IRQs
36599 + */
36600 +#ifndef CONFIG_XEN
36601 +static int __init timer_irq_works(void)
36602 +{
36603 + unsigned long t1 = jiffies;
36604 +
36605 + local_irq_enable();
36606 + /* Let ten ticks pass... */
36607 + mdelay((10 * 1000) / HZ);
36608 +
36609 + /*
36610 + * Expect a few ticks at least, to be sure some possible
36611 +	 * glue logic does not lock up after the first one or
36612 +	 * two ticks in a non-ExtINT mode. Also the local APIC
36613 + * might have cached one ExtINT interrupt. Finally, at
36614 + * least one tick may be lost due to delays.
36615 + */
36616 +
36617 + /* jiffies wrap? */
36618 + if (jiffies - t1 > 4)
36619 + return 1;
36620 + return 0;
36621 +}
36622 +
36623 +/*
36624 + * In the SMP+IOAPIC case it might happen that there is an unspecified
36625 + * number of pending IRQ events left unhandled. These cases are very rare,
36626 + * so we 'resend' these IRQs via IPIs, to the same CPU. It's much
36627 + * better to do it this way as thus we do not have to be aware of
36628 + * 'pending' interrupts in the IRQ path, except at this point.
36629 + */
36630 +/*
36631 + * Edge triggered needs to resend any interrupt
36632 + * that was delayed but this is now handled in the device
36633 + * independent code.
36634 + */
36635 +
36636 +/*
36637 + * Starting up an edge-triggered IO-APIC interrupt is
36638 + * nasty - we need to make sure that we get the edge.
36639 + * If it is already asserted for some reason, we need to
36640 + * return 1 to indicate that it was pending.
36641 + *
36642 + * This is not complete - we should be able to fake
36643 + * an edge even if it isn't on the 8259A...
36644 + */
36645 +
36646 +static unsigned int startup_edge_ioapic_irq(unsigned int irq)
36647 +{
36648 + int was_pending = 0;
36649 + unsigned long flags;
36650 +
36651 + spin_lock_irqsave(&ioapic_lock, flags);
36652 + if (irq < 16) {
36653 + disable_8259A_irq(irq);
36654 + if (i8259A_irq_pending(irq))
36655 + was_pending = 1;
36656 + }
36657 + __unmask_IO_APIC_irq(irq);
36658 + spin_unlock_irqrestore(&ioapic_lock, flags);
36659 +
36660 + return was_pending;
36661 +}
36662 +
36663 +/*
36664 + * Once we have recorded IRQ_PENDING already, we can mask the
36665 + * interrupt for real. This prevents IRQ storms from unhandled
36666 + * devices.
36667 + */
36668 +static void ack_edge_ioapic_irq(unsigned int irq)
36669 +{
36670 + move_irq(irq);
36671 + if ((irq_desc[irq].status & (IRQ_PENDING | IRQ_DISABLED))
36672 + == (IRQ_PENDING | IRQ_DISABLED))
36673 + mask_IO_APIC_irq(irq);
36674 + ack_APIC_irq();
36675 +}
36676 +
36677 +/*
36678 + * Level triggered interrupts can just be masked,
36679 + * and shutting down and starting up the interrupt
36680 + * is the same as enabling and disabling them -- except
36681 + * that startup needs to return a "was pending" value.
36682 + *
36683 + * Level triggered interrupts are special because we
36684 + * do not touch any IO-APIC register while handling
36685 + * them. We ack the APIC in the end-IRQ handler, not
36686 + * in the start-IRQ-handler. Protection against reentrance
36687 + * from the same interrupt is still provided, both by the
36688 + * generic IRQ layer and by the fact that an unacked local
36689 + * APIC does not accept IRQs.
36690 + */
36691 +static unsigned int startup_level_ioapic_irq (unsigned int irq)
36692 +{
36693 + unmask_IO_APIC_irq(irq);
36694 +
36695 + return 0; /* don't check for pending */
36696 +}
36697 +
36698 +static void end_level_ioapic_irq (unsigned int irq)
36699 +{
36700 + move_irq(irq);
36701 + ack_APIC_irq();
36702 +}
36703 +
36704 +#ifdef CONFIG_PCI_MSI
36705 +static unsigned int startup_edge_ioapic_vector(unsigned int vector)
36706 +{
36707 + int irq = vector_to_irq(vector);
36708 +
36709 + return startup_edge_ioapic_irq(irq);
36710 +}
36711 +
36712 +static void ack_edge_ioapic_vector(unsigned int vector)
36713 +{
36714 + int irq = vector_to_irq(vector);
36715 +
36716 + move_native_irq(vector);
36717 + ack_edge_ioapic_irq(irq);
36718 +}
36719 +
36720 +static unsigned int startup_level_ioapic_vector (unsigned int vector)
36721 +{
36722 + int irq = vector_to_irq(vector);
36723 +
36724 + return startup_level_ioapic_irq (irq);
36725 +}
36726 +
36727 +static void end_level_ioapic_vector (unsigned int vector)
36728 +{
36729 + int irq = vector_to_irq(vector);
36730 +
36731 + move_native_irq(vector);
36732 + end_level_ioapic_irq(irq);
36733 +}
36734 +
36735 +static void mask_IO_APIC_vector (unsigned int vector)
36736 +{
36737 + int irq = vector_to_irq(vector);
36738 +
36739 + mask_IO_APIC_irq(irq);
36740 +}
36741 +
36742 +static void unmask_IO_APIC_vector (unsigned int vector)
36743 +{
36744 + int irq = vector_to_irq(vector);
36745 +
36746 + unmask_IO_APIC_irq(irq);
36747 +}
36748 +
36749 +#ifdef CONFIG_SMP
36750 +static void set_ioapic_affinity_vector (unsigned int vector,
36751 + cpumask_t cpu_mask)
36752 +{
36753 + int irq = vector_to_irq(vector);
36754 +
36755 + set_native_irq_info(vector, cpu_mask);
36756 + set_ioapic_affinity_irq(irq, cpu_mask);
36757 +}
36758 +#endif // CONFIG_SMP
36759 +#endif // CONFIG_PCI_MSI
36760 +
36761 +/*
36762 + * Level and edge triggered IO-APIC interrupts need different handling,
36763 + * so we use two separate IRQ descriptors. Edge triggered IRQs can be
36764 + * handled with the level-triggered descriptor, but that one has slightly
36765 + * more overhead. Level-triggered interrupts cannot be handled with the
36766 + * edge-triggered handler, without risking IRQ storms and other ugly
36767 + * races.
36768 + */
36769 +
36770 +static struct hw_interrupt_type ioapic_edge_type __read_mostly = {
36771 + .typename = "IO-APIC-edge",
36772 + .startup = startup_edge_ioapic,
36773 + .shutdown = shutdown_edge_ioapic,
36774 + .enable = enable_edge_ioapic,
36775 + .disable = disable_edge_ioapic,
36776 + .ack = ack_edge_ioapic,
36777 + .end = end_edge_ioapic,
36778 +#ifdef CONFIG_SMP
36779 + .set_affinity = set_ioapic_affinity,
36780 +#endif
36781 +};
36782 +
36783 +static struct hw_interrupt_type ioapic_level_type __read_mostly = {
36784 + .typename = "IO-APIC-level",
36785 + .startup = startup_level_ioapic,
36786 + .shutdown = shutdown_level_ioapic,
36787 + .enable = enable_level_ioapic,
36788 + .disable = disable_level_ioapic,
36789 + .ack = mask_and_ack_level_ioapic,
36790 + .end = end_level_ioapic,
36791 +#ifdef CONFIG_SMP
36792 + .set_affinity = set_ioapic_affinity,
36793 +#endif
36794 +};
36795 +#endif /* !CONFIG_XEN */
36796 +
36797 +static inline void init_IO_APIC_traps(void)
36798 +{
36799 + int irq;
36800 +
36801 + /*
36802 + * NOTE! The local APIC isn't very good at handling
36803 + * multiple interrupts at the same interrupt level.
36804 + * As the interrupt level is determined by taking the
36805 + * vector number and shifting that right by 4, we
36806 + * want to spread these out a bit so that they don't
36807 + * all fall in the same interrupt level.
36808 + *
36809 + * Also, we've got to be careful not to trash gate
36810 + * 0x80, because int 0x80 is hm, kind of importantish. ;)
36811 + */
36812 + for (irq = 0; irq < NR_IRQS ; irq++) {
36813 + int tmp = irq;
36814 + if (use_pci_vector()) {
36815 + if (!platform_legacy_irq(tmp))
36816 + if ((tmp = vector_to_irq(tmp)) == -1)
36817 + continue;
36818 + }
36819 + if (IO_APIC_IRQ(tmp) && !IO_APIC_VECTOR(tmp)) {
36820 + /*
36821 + * Hmm.. We don't have an entry for this,
36822 + * so default to an old-fashioned 8259
36823 + * interrupt if we can..
36824 + */
36825 + if (irq < 16)
36826 + make_8259A_irq(irq);
36827 +#ifndef CONFIG_XEN
36828 + else
36829 + /* Strange. Oh, well.. */
36830 + irq_desc[irq].handler = &no_irq_type;
36831 +#endif
36832 + }
36833 + }
36834 +}
36835 +
36836 +#ifndef CONFIG_XEN
36837 +static void enable_lapic_irq (unsigned int irq)
36838 +{
36839 + unsigned long v;
36840 +
36841 + v = apic_read(APIC_LVT0);
36842 + apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED);
36843 +}
36844 +
36845 +static void disable_lapic_irq (unsigned int irq)
36846 +{
36847 + unsigned long v;
36848 +
36849 + v = apic_read(APIC_LVT0);
36850 + apic_write(APIC_LVT0, v | APIC_LVT_MASKED);
36851 +}
36852 +
36853 +static void ack_lapic_irq (unsigned int irq)
36854 +{
36855 + ack_APIC_irq();
36856 +}
36857 +
36858 +static void end_lapic_irq (unsigned int i) { /* nothing */ }
36859 +
36860 +static struct hw_interrupt_type lapic_irq_type __read_mostly = {
36861 + .typename = "local-APIC-edge",
36862 + .startup = NULL, /* startup_irq() not used for IRQ0 */
36863 + .shutdown = NULL, /* shutdown_irq() not used for IRQ0 */
36864 + .enable = enable_lapic_irq,
36865 + .disable = disable_lapic_irq,
36866 + .ack = ack_lapic_irq,
36867 + .end = end_lapic_irq,
36868 +};
36869 +
36870 +static void setup_nmi (void)
36871 +{
36872 + /*
36873 + * Dirty trick to enable the NMI watchdog ...
36874 + * We put the 8259A master into AEOI mode and
36875 + * unmask on all local APICs LVT0 as NMI.
36876 + *
36877 + * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire')
36878 + * is from Maciej W. Rozycki - so we do not have to EOI from
36879 + * the NMI handler or the timer interrupt.
36880 + */
36881 + printk(KERN_INFO "activating NMI Watchdog ...");
36882 +
36883 + enable_NMI_through_LVT0(NULL);
36884 +
36885 + printk(" done.\n");
36886 +}
36887 +
36888 +/*
36889 + * This looks a bit hackish but it's about the only way of sending
36890 + * a few INTA cycles to 8259As and any associated glue logic. ICR does
36891 + * not support the ExtINT mode, unfortunately. We need to send these
36892 + * cycles as some i82489DX-based boards have glue logic that keeps the
36893 + * 8259A interrupt line asserted until INTA. --macro
36894 + */
36895 +static inline void unlock_ExtINT_logic(void)
36896 +{
36897 + int apic, pin, i;
36898 + struct IO_APIC_route_entry entry0, entry1;
36899 + unsigned char save_control, save_freq_select;
36900 + unsigned long flags;
36901 +
36902 + pin = find_isa_irq_pin(8, mp_INT);
36903 + apic = find_isa_irq_apic(8, mp_INT);
36904 + if (pin == -1)
36905 + return;
36906 +
36907 + spin_lock_irqsave(&ioapic_lock, flags);
36908 + *(((int *)&entry0) + 1) = io_apic_read(apic, 0x11 + 2 * pin);
36909 + *(((int *)&entry0) + 0) = io_apic_read(apic, 0x10 + 2 * pin);
36910 + spin_unlock_irqrestore(&ioapic_lock, flags);
36911 + clear_IO_APIC_pin(apic, pin);
36912 +
36913 + memset(&entry1, 0, sizeof(entry1));
36914 +
36915 + entry1.dest_mode = 0; /* physical delivery */
36916 + entry1.mask = 0; /* unmask IRQ now */
36917 + entry1.dest.physical.physical_dest = hard_smp_processor_id();
36918 + entry1.delivery_mode = dest_ExtINT;
36919 + entry1.polarity = entry0.polarity;
36920 + entry1.trigger = 0;
36921 + entry1.vector = 0;
36922 +
36923 + spin_lock_irqsave(&ioapic_lock, flags);
36924 + io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry1) + 1));
36925 + io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry1) + 0));
36926 + spin_unlock_irqrestore(&ioapic_lock, flags);
36927 +
36928 + save_control = CMOS_READ(RTC_CONTROL);
36929 + save_freq_select = CMOS_READ(RTC_FREQ_SELECT);
36930 + CMOS_WRITE((save_freq_select & ~RTC_RATE_SELECT) | 0x6,
36931 + RTC_FREQ_SELECT);
36932 + CMOS_WRITE(save_control | RTC_PIE, RTC_CONTROL);
36933 +
36934 + i = 100;
36935 + while (i-- > 0) {
36936 + mdelay(10);
36937 + if ((CMOS_READ(RTC_INTR_FLAGS) & RTC_PF) == RTC_PF)
36938 + i -= 10;
36939 + }
36940 +
36941 + CMOS_WRITE(save_control, RTC_CONTROL);
36942 + CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT);
36943 + clear_IO_APIC_pin(apic, pin);
36944 +
36945 + spin_lock_irqsave(&ioapic_lock, flags);
36946 + io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry0) + 1));
36947 + io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry0) + 0));
36948 + spin_unlock_irqrestore(&ioapic_lock, flags);
36949 +}
36950 +
36951 +/*
36952 + * This code may look a bit paranoid, but it's supposed to cooperate with
36953 + * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ
36954 + * is so screwy. Thanks to Brian Perkins for testing/hacking this beast
36955 + * fanatically on his truly buggy board.
36956 + *
36957 + * FIXME: really need to revamp this for modern platforms only.
36958 + */
36959 +static inline void check_timer(void)
36960 +{
36961 + int apic1, pin1, apic2, pin2;
36962 + int vector;
36963 +
36964 + /*
36965 + * get/set the timer IRQ vector:
36966 + */
36967 + disable_8259A_irq(0);
36968 + vector = assign_irq_vector(0);
36969 + set_intr_gate(vector, interrupt[0]);
36970 +
36971 + /*
36972 + * Subtle, code in do_timer_interrupt() expects an AEOI
36973 + * mode for the 8259A whenever interrupts are routed
36974 + * through I/O APICs. Also IRQ0 has to be enabled in
36975 + * the 8259A which implies the virtual wire has to be
36976 + * disabled in the local APIC.
36977 + */
36978 + apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
36979 + init_8259A(1);
36980 + if (timer_over_8254 > 0)
36981 + enable_8259A_irq(0);
36982 +
36983 + pin1 = find_isa_irq_pin(0, mp_INT);
36984 + apic1 = find_isa_irq_apic(0, mp_INT);
36985 + pin2 = ioapic_i8259.pin;
36986 + apic2 = ioapic_i8259.apic;
36987 +
36988 + apic_printk(APIC_VERBOSE,KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n",
36989 + vector, apic1, pin1, apic2, pin2);
36990 +
36991 + if (pin1 != -1) {
36992 + /*
36993 + * Ok, does IRQ0 through the IOAPIC work?
36994 + */
36995 + unmask_IO_APIC_irq(0);
36996 + if (!no_timer_check && timer_irq_works()) {
36997 + nmi_watchdog_default();
36998 + if (nmi_watchdog == NMI_IO_APIC) {
36999 + disable_8259A_irq(0);
37000 + setup_nmi();
37001 + enable_8259A_irq(0);
37002 + }
37003 + if (disable_timer_pin_1 > 0)
37004 + clear_IO_APIC_pin(0, pin1);
37005 + return;
37006 + }
37007 + clear_IO_APIC_pin(apic1, pin1);
37008 + apic_printk(APIC_QUIET,KERN_ERR "..MP-BIOS bug: 8254 timer not "
37009 + "connected to IO-APIC\n");
37010 + }
37011 +
37012 + apic_printk(APIC_VERBOSE,KERN_INFO "...trying to set up timer (IRQ0) "
37013 + "through the 8259A ... ");
37014 + if (pin2 != -1) {
37015 + apic_printk(APIC_VERBOSE,"\n..... (found apic %d pin %d) ...",
37016 + apic2, pin2);
37017 + /*
37018 + * legacy devices should be connected to IO APIC #0
37019 + */
37020 + setup_ExtINT_IRQ0_pin(apic2, pin2, vector);
37021 + if (timer_irq_works()) {
37022 + printk("works.\n");
37023 + nmi_watchdog_default();
37024 + if (nmi_watchdog == NMI_IO_APIC) {
37025 + setup_nmi();
37026 + }
37027 + return;
37028 + }
37029 + /*
37030 + * Cleanup, just in case ...
37031 + */
37032 + clear_IO_APIC_pin(apic2, pin2);
37033 + }
37034 + printk(" failed.\n");
37035 +
37036 + if (nmi_watchdog == NMI_IO_APIC) {
37037 + printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n");
37038 + nmi_watchdog = 0;
37039 + }
37040 +
37041 + apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as Virtual Wire IRQ...");
37042 +
37043 + disable_8259A_irq(0);
37044 + irq_desc[0].handler = &lapic_irq_type;
37045 + apic_write(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */
37046 + enable_8259A_irq(0);
37047 +
37048 + if (timer_irq_works()) {
37049 + apic_printk(APIC_QUIET, " works.\n");
37050 + return;
37051 + }
37052 + apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector);
37053 + apic_printk(APIC_VERBOSE," failed.\n");
37054 +
37055 + apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as ExtINT IRQ...");
37056 +
37057 + init_8259A(0);
37058 + make_8259A_irq(0);
37059 + apic_write(APIC_LVT0, APIC_DM_EXTINT);
37060 +
37061 + unlock_ExtINT_logic();
37062 +
37063 + if (timer_irq_works()) {
37064 + apic_printk(APIC_VERBOSE," works.\n");
37065 + return;
37066 + }
37067 + apic_printk(APIC_VERBOSE," failed :(.\n");
37068 + panic("IO-APIC + timer doesn't work! Try using the 'noapic' kernel parameter\n");
37069 +}
37070 +#else
37071 +#define check_timer() ((void)0)
37072 +#endif /* !CONFIG_XEN */
37073 +
37074 +static int __init notimercheck(char *s)
37075 +{
37076 + no_timer_check = 1;
37077 + return 1;
37078 +}
37079 +__setup("no_timer_check", notimercheck);
37080 +
37081 +/*
37082 + *
37083 + * IRQ's that are handled by the PIC in the MPS IOAPIC case.
37084 + * - IRQ2 is the cascade IRQ, and cannot be an io-apic IRQ.
37085 + * Linux doesn't really care, as it's not actually used
37086 + * for any interrupt handling anyway.
37087 + */
37088 +#define PIC_IRQS (1<<2)
37089 +
37090 +void __init setup_IO_APIC(void)
37091 +{
37092 + enable_IO_APIC();
37093 +
37094 + if (acpi_ioapic)
37095 + io_apic_irqs = ~0; /* all IRQs go through IOAPIC */
37096 + else
37097 + io_apic_irqs = ~PIC_IRQS;
37098 +
37099 + apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n");
37100 +
37101 + /*
37102 + * Set up the IO-APIC IRQ routing table.
37103 + */
37104 + if (!acpi_ioapic)
37105 + setup_ioapic_ids_from_mpc();
37106 +#ifndef CONFIG_XEN
37107 + sync_Arb_IDs();
37108 +#endif /* !CONFIG_XEN */
37109 + setup_IO_APIC_irqs();
37110 + init_IO_APIC_traps();
37111 + check_timer();
37112 + if (!acpi_ioapic)
37113 + print_IO_APIC();
37114 +}
37115 +
37116 +struct sysfs_ioapic_data {
37117 + struct sys_device dev;
37118 + struct IO_APIC_route_entry entry[0];
37119 +};
37120 +static struct sysfs_ioapic_data * mp_ioapic_data[MAX_IO_APICS];
37121 +
37122 +static int ioapic_suspend(struct sys_device *dev, pm_message_t state)
37123 +{
37124 + struct IO_APIC_route_entry *entry;
37125 + struct sysfs_ioapic_data *data;
37126 + unsigned long flags;
37127 + int i;
37128 +
37129 + data = container_of(dev, struct sysfs_ioapic_data, dev);
37130 + entry = data->entry;
37131 + spin_lock_irqsave(&ioapic_lock, flags);
37132 + for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) {
37133 + *(((int *)entry) + 1) = io_apic_read(dev->id, 0x11 + 2 * i);
37134 + *(((int *)entry) + 0) = io_apic_read(dev->id, 0x10 + 2 * i);
37135 + }
37136 + spin_unlock_irqrestore(&ioapic_lock, flags);
37137 +
37138 + return 0;
37139 +}
37140 +
37141 +static int ioapic_resume(struct sys_device *dev)
37142 +{
37143 + struct IO_APIC_route_entry *entry;
37144 + struct sysfs_ioapic_data *data;
37145 + unsigned long flags;
37146 + union IO_APIC_reg_00 reg_00;
37147 + int i;
37148 +
37149 + data = container_of(dev, struct sysfs_ioapic_data, dev);
37150 + entry = data->entry;
37151 +
37152 + spin_lock_irqsave(&ioapic_lock, flags);
37153 + reg_00.raw = io_apic_read(dev->id, 0);
37154 + if (reg_00.bits.ID != mp_ioapics[dev->id].mpc_apicid) {
37155 + reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid;
37156 + io_apic_write(dev->id, 0, reg_00.raw);
37157 + }
37158 + for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) {
37159 + io_apic_write(dev->id, 0x11+2*i, *(((int *)entry)+1));
37160 + io_apic_write(dev->id, 0x10+2*i, *(((int *)entry)+0));
37161 + }
37162 + spin_unlock_irqrestore(&ioapic_lock, flags);
37163 +
37164 + return 0;
37165 +}
37166 +
37167 +static struct sysdev_class ioapic_sysdev_class = {
37168 + set_kset_name("ioapic"),
37169 + .suspend = ioapic_suspend,
37170 + .resume = ioapic_resume,
37171 +};
37172 +
37173 +static int __init ioapic_init_sysfs(void)
37174 +{
37175 + struct sys_device * dev;
37176 + int i, size, error = 0;
37177 +
37178 + error = sysdev_class_register(&ioapic_sysdev_class);
37179 + if (error)
37180 + return error;
37181 +
37182 + for (i = 0; i < nr_ioapics; i++ ) {
37183 + size = sizeof(struct sys_device) + nr_ioapic_registers[i]
37184 + * sizeof(struct IO_APIC_route_entry);
37185 + mp_ioapic_data[i] = kmalloc(size, GFP_KERNEL);
37186 + if (!mp_ioapic_data[i]) {
37187 + printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
37188 + continue;
37189 + }
37190 + memset(mp_ioapic_data[i], 0, size);
37191 + dev = &mp_ioapic_data[i]->dev;
37192 + dev->id = i;
37193 + dev->cls = &ioapic_sysdev_class;
37194 + error = sysdev_register(dev);
37195 + if (error) {
37196 + kfree(mp_ioapic_data[i]);
37197 + mp_ioapic_data[i] = NULL;
37198 + printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
37199 + continue;
37200 + }
37201 + }
37202 +
37203 + return 0;
37204 +}
37205 +
37206 +device_initcall(ioapic_init_sysfs);
37207 +
37208 +/* --------------------------------------------------------------------------
37209 + ACPI-based IOAPIC Configuration
37210 + -------------------------------------------------------------------------- */
37211 +
37212 +#ifdef CONFIG_ACPI
37213 +
37214 +#define IO_APIC_MAX_ID 0xFE
37215 +
37216 +int __init io_apic_get_version (int ioapic)
37217 +{
37218 + union IO_APIC_reg_01 reg_01;
37219 + unsigned long flags;
37220 +
37221 + spin_lock_irqsave(&ioapic_lock, flags);
37222 + reg_01.raw = io_apic_read(ioapic, 1);
37223 + spin_unlock_irqrestore(&ioapic_lock, flags);
37224 +
37225 + return reg_01.bits.version;
37226 +}
37227 +
37228 +
37229 +int __init io_apic_get_redir_entries (int ioapic)
37230 +{
37231 + union IO_APIC_reg_01 reg_01;
37232 + unsigned long flags;
37233 +
37234 + spin_lock_irqsave(&ioapic_lock, flags);
37235 + reg_01.raw = io_apic_read(ioapic, 1);
37236 + spin_unlock_irqrestore(&ioapic_lock, flags);
37237 +
37238 + return reg_01.bits.entries;
37239 +}
37240 +
37241 +
37242 +int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int active_high_low)
37243 +{
37244 + struct IO_APIC_route_entry entry;
37245 + unsigned long flags;
37246 +
37247 + if (!IO_APIC_IRQ(irq)) {
37248 + apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
37249 + ioapic);
37250 + return -EINVAL;
37251 + }
37252 +
37253 + /*
37254 + * Generate a PCI IRQ routing entry and program the IOAPIC accordingly.
37255 + * Note that we mask (disable) IRQs now -- these get enabled when the
37256 + * corresponding device driver registers for this IRQ.
37257 + */
37258 +
37259 + memset(&entry,0,sizeof(entry));
37260 +
37261 + entry.delivery_mode = INT_DELIVERY_MODE;
37262 + entry.dest_mode = INT_DEST_MODE;
37263 + entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
37264 + entry.trigger = edge_level;
37265 + entry.polarity = active_high_low;
37266 + entry.mask = 1; /* Disabled (masked) */
37267 +
37268 + irq = gsi_irq_sharing(irq);
37269 + /*
37270 + * IRQs < 16 are already in the irq_2_pin[] map
37271 + */
37272 + if (irq >= 16)
37273 + add_pin_to_irq(irq, ioapic, pin);
37274 +
37275 + entry.vector = assign_irq_vector(irq);
37276 +
37277 + apic_printk(APIC_VERBOSE,KERN_DEBUG "IOAPIC[%d]: Set PCI routing entry (%d-%d -> 0x%x -> "
37278 + "IRQ %d Mode:%i Active:%i)\n", ioapic,
37279 + mp_ioapics[ioapic].mpc_apicid, pin, entry.vector, irq,
37280 + edge_level, active_high_low);
37281 +
37282 + ioapic_register_intr(irq, entry.vector, edge_level);
37283 +
37284 + if (!ioapic && (irq < 16))
37285 + disable_8259A_irq(irq);
37286 +
37287 + spin_lock_irqsave(&ioapic_lock, flags);
37288 + io_apic_write(ioapic, 0x11+2*pin, *(((int *)&entry)+1));
37289 + io_apic_write(ioapic, 0x10+2*pin, *(((int *)&entry)+0));
37290 + set_native_irq_info(use_pci_vector() ? entry.vector : irq, TARGET_CPUS);
37291 + spin_unlock_irqrestore(&ioapic_lock, flags);
37292 +
37293 + return 0;
37294 +}
37295 +
37296 +#endif /* CONFIG_ACPI */
37297 +
37298 +
37299 +#ifndef CONFIG_XEN
37300 +/*
37301 + * This function currently is only a helper for the i386 smp boot process where
37302 + * we need to reprogram the ioredtbls to cater for the cpus which have come online
37303 + * so mask in all cases should simply be TARGET_CPUS
37304 + */
37305 +#ifdef CONFIG_SMP
37306 +void __init setup_ioapic_dest(void)
37307 +{
37308 + int pin, ioapic, irq, irq_entry;
37309 +
37310 + if (skip_ioapic_setup == 1)
37311 + return;
37312 +
37313 + for (ioapic = 0; ioapic < nr_ioapics; ioapic++) {
37314 + for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) {
37315 + irq_entry = find_irq_entry(ioapic, pin, mp_INT);
37316 + if (irq_entry == -1)
37317 + continue;
37318 + irq = pin_2_irq(irq_entry, ioapic, pin);
37319 + set_ioapic_affinity_irq(irq, TARGET_CPUS);
37320 + }
37321 +
37322 + }
37323 +}
37324 +#endif
37325 +#endif /* !CONFIG_XEN */
37326 diff -Nur linux-2.6.16.33-noxen/arch/x86_64/kernel/ioport-xen.c linux-2.6.16.33/arch/x86_64/kernel/ioport-xen.c
37327 --- linux-2.6.16.33-noxen/arch/x86_64/kernel/ioport-xen.c 1970-01-01 00:00:00.000000000 +0000
37328 +++ linux-2.6.16.33/arch/x86_64/kernel/ioport-xen.c 2007-01-08 15:00:45.000000000 +0000
37329 @@ -0,0 +1,99 @@
37330 +/*
37331 + * linux/arch/x86_64/kernel/ioport.c
37332 + *
37333 + * This contains the io-permission bitmap code - written by obz, with changes
37334 + * by Linus.
37335 + */
37336 +
37337 +#include <linux/sched.h>
37338 +#include <linux/kernel.h>
37339 +#include <linux/capability.h>
37340 +#include <linux/errno.h>
37341 +#include <linux/types.h>
37342 +#include <linux/ioport.h>
37343 +#include <linux/mm.h>
37344 +#include <linux/smp.h>
37345 +#include <linux/smp_lock.h>
37346 +#include <linux/stddef.h>
37347 +#include <linux/slab.h>
37348 +#include <linux/thread_info.h>
37349 +#include <xen/interface/physdev.h>
37350 +
37351 +/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */
37352 +static void set_bitmap(unsigned long *bitmap, unsigned int base, unsigned int extent, int new_value)
37353 +{
37354 + int i;
37355 +
37356 + if (new_value)
37357 + for (i = base; i < base + extent; i++)
37358 + __set_bit(i, bitmap);
37359 + else
37360 + for (i = base; i < base + extent; i++)
37361 + clear_bit(i, bitmap);
37362 +}
37363 +
37364 +/*
37365 + * this changes the io permissions bitmap in the current task.
37366 + */
37367 +asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
37368 +{
37369 + struct thread_struct * t = &current->thread;
37370 + unsigned long *bitmap;
37371 + struct physdev_set_iobitmap set_iobitmap;
37372 +
37373 + if ((from + num <= from) || (from + num > IO_BITMAP_BITS))
37374 + return -EINVAL;
37375 + if (turn_on && !capable(CAP_SYS_RAWIO))
37376 + return -EPERM;
37377 +
37378 + /*
37379 + * If it's the first ioperm() call in this thread's lifetime, set the
37380 + * IO bitmap up. ioperm() is much less timing critical than clone(),
37381 + * this is why we delay this operation until now:
37382 + */
37383 + if (!t->io_bitmap_ptr) {
37384 + bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
37385 + if (!bitmap)
37386 + return -ENOMEM;
37387 +
37388 + memset(bitmap, 0xff, IO_BITMAP_BYTES);
37389 + t->io_bitmap_ptr = bitmap;
37390 +
37391 + set_iobitmap.bitmap = (char *)bitmap;
37392 + set_iobitmap.nr_ports = IO_BITMAP_BITS;
37393 + HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap, &set_iobitmap);
37394 + }
37395 +
37396 + set_bitmap(t->io_bitmap_ptr, from, num, !turn_on);
37397 +
37398 + return 0;
37399 +}
37400 +
37401 +/*
37402 + * sys_iopl has to be used when you want to access the IO ports
37403 + * beyond the 0x3ff range: to get the full 65536 ports bitmapped
37404 + * you'd need 8kB of bitmaps/process, which is a bit excessive.
37405 + *
37406 + */
37407 +
37408 +asmlinkage long sys_iopl(unsigned int new_iopl, struct pt_regs *regs)
37409 +{
37410 + unsigned int old_iopl = current->thread.iopl;
37411 + struct physdev_set_iopl set_iopl;
37412 +
37413 + if (new_iopl > 3)
37414 + return -EINVAL;
37415 +
37416 + /* Need "raw I/O" privileges for direct port access. */
37417 + if ((new_iopl > old_iopl) && !capable(CAP_SYS_RAWIO))
37418 + return -EPERM;
37419 +
37420 + /* Change our version of the privilege levels. */
37421 + current->thread.iopl = new_iopl;
37422 +
37423 + /* Force the change at ring 0. */
37424 + set_iopl.iopl = (new_iopl == 0) ? 1 : new_iopl;
37425 + HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
37426 +
37427 + return 0;
37428 +}
37429 diff -Nur linux-2.6.16.33-noxen/arch/x86_64/kernel/irq-xen.c linux-2.6.16.33/arch/x86_64/kernel/irq-xen.c
37430 --- linux-2.6.16.33-noxen/arch/x86_64/kernel/irq-xen.c 1970-01-01 00:00:00.000000000 +0000
37431 +++ linux-2.6.16.33/arch/x86_64/kernel/irq-xen.c 2007-01-08 15:00:45.000000000 +0000
37432 @@ -0,0 +1,168 @@
37433 +/*
37434 + * linux/arch/x86_64/kernel/irq.c
37435 + *
37436 + * Copyright (C) 1992, 1998 Linus Torvalds, Ingo Molnar
37437 + *
37438 + * This file contains the lowest level x86_64-specific interrupt
37439 + * entry and irq statistics code. All the remaining irq logic is
37440 + * done by the generic kernel/irq/ code and in the
37441 + * x86_64-specific irq controller code. (e.g. i8259.c and
37442 + * io_apic.c.)
37443 + */
37444 +
37445 +#include <linux/kernel_stat.h>
37446 +#include <linux/interrupt.h>
37447 +#include <linux/seq_file.h>
37448 +#include <linux/module.h>
37449 +#include <linux/delay.h>
37450 +#include <asm/uaccess.h>
37451 +#include <asm/io_apic.h>
37452 +#include <asm/idle.h>
37453 +
37454 +atomic_t irq_err_count;
37455 +#ifdef CONFIG_X86_IO_APIC
37456 +#ifdef APIC_MISMATCH_DEBUG
37457 +atomic_t irq_mis_count;
37458 +#endif
37459 +#endif
37460 +
37461 +/*
37462 + * Generic, controller-independent functions:
37463 + */
37464 +
37465 +int show_interrupts(struct seq_file *p, void *v)
37466 +{
37467 + int i = *(loff_t *) v, j;
37468 + struct irqaction * action;
37469 + unsigned long flags;
37470 +
37471 + if (i == 0) {
37472 + seq_printf(p, " ");
37473 + for (j=0; j<NR_CPUS; j++)
37474 + if (cpu_online(j))
37475 + seq_printf(p, "CPU%d ",j);
37476 + seq_putc(p, '\n');
37477 + }
37478 +
37479 + if (i < NR_IRQS) {
37480 + spin_lock_irqsave(&irq_desc[i].lock, flags);
37481 + action = irq_desc[i].action;
37482 + if (!action)
37483 + goto skip;
37484 + seq_printf(p, "%3d: ",i);
37485 +#ifndef CONFIG_SMP
37486 + seq_printf(p, "%10u ", kstat_irqs(i));
37487 +#else
37488 + for (j=0; j<NR_CPUS; j++)
37489 + if (cpu_online(j))
37490 + seq_printf(p, "%10u ",
37491 + kstat_cpu(j).irqs[i]);
37492 +#endif
37493 + seq_printf(p, " %14s", irq_desc[i].handler->typename);
37494 +
37495 + seq_printf(p, " %s", action->name);
37496 + for (action=action->next; action; action = action->next)
37497 + seq_printf(p, ", %s", action->name);
37498 + seq_putc(p, '\n');
37499 +skip:
37500 + spin_unlock_irqrestore(&irq_desc[i].lock, flags);
37501 + } else if (i == NR_IRQS) {
37502 + seq_printf(p, "NMI: ");
37503 + for (j = 0; j < NR_CPUS; j++)
37504 + if (cpu_online(j))
37505 + seq_printf(p, "%10u ", cpu_pda(j)->__nmi_count);
37506 + seq_putc(p, '\n');
37507 +#ifdef CONFIG_X86_LOCAL_APIC
37508 + seq_printf(p, "LOC: ");
37509 + for (j = 0; j < NR_CPUS; j++)
37510 + if (cpu_online(j))
37511 + seq_printf(p, "%10u ", cpu_pda(j)->apic_timer_irqs);
37512 + seq_putc(p, '\n');
37513 +#endif
37514 + seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count));
37515 +#ifdef CONFIG_X86_IO_APIC
37516 +#ifdef APIC_MISMATCH_DEBUG
37517 + seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count));
37518 +#endif
37519 +#endif
37520 + }
37521 + return 0;
37522 +}
37523 +
37524 +/*
37525 + * do_IRQ handles all normal device IRQ's (the special
37526 + * SMP cross-CPU interrupts have their own specific
37527 + * handlers).
37528 + */
37529 +asmlinkage unsigned int do_IRQ(struct pt_regs *regs)
37530 +{
37531 + /* high bit used in ret_from_ code */
37532 + unsigned irq = ~regs->orig_rax;
37533 +
37534 + exit_idle();
37535 + irq_enter();
37536 +
37537 + __do_IRQ(irq, regs);
37538 + irq_exit();
37539 +
37540 + return 1;
37541 +}
37542 +
37543 +#ifdef CONFIG_HOTPLUG_CPU
37544 +void fixup_irqs(cpumask_t map)
37545 +{
37546 + unsigned int irq;
37547 + static int warned;
37548 +
37549 + for (irq = 0; irq < NR_IRQS; irq++) {
37550 + cpumask_t mask;
37551 + if (irq == 2)
37552 + continue;
37553 +
37554 + cpus_and(mask, irq_affinity[irq], map);
37555 + if (any_online_cpu(mask) == NR_CPUS) {
37556 + printk("Breaking affinity for irq %i\n", irq);
37557 + mask = map;
37558 + }
37559 + if (irq_desc[irq].handler->set_affinity)
37560 + irq_desc[irq].handler->set_affinity(irq, mask);
37561 + else if (irq_desc[irq].action && !(warned++))
37562 + printk("Cannot set affinity for irq %i\n", irq);
37563 + }
37564 +
37565 + /* That doesn't seem sufficient. Give it 1ms. */
37566 + local_irq_enable();
37567 + mdelay(1);
37568 + local_irq_disable();
37569 +}
37570 +#endif
37571 +
37572 +extern void call_softirq(void);
37573 +
37574 +asmlinkage void do_softirq(void)
37575 +{
37576 + __u32 pending;
37577 + unsigned long flags;
37578 +
37579 + if (in_interrupt())
37580 + return;
37581 +
37582 + local_irq_save(flags);
37583 + pending = local_softirq_pending();
37584 + /* Switch to interrupt stack */
37585 + if (pending)
37586 + call_softirq();
37587 + local_irq_restore(flags);
37588 +}
37589 +EXPORT_SYMBOL(do_softirq);
37590 +
37591 +#ifndef CONFIG_X86_LOCAL_APIC
37592 +/*
37593 + * 'what should we do if we get a hw irq event on an illegal vector'.
37594 + * each architecture has to answer this themselves.
37595 + */
37596 +void ack_bad_irq(unsigned int irq)
37597 +{
37598 + printk("unexpected IRQ trap at vector %02x\n", irq);
37599 +}
37600 +#endif
37601 diff -Nur linux-2.6.16.33-noxen/arch/x86_64/kernel/irq.c linux-2.6.16.33/arch/x86_64/kernel/irq.c
37602 --- linux-2.6.16.33-noxen/arch/x86_64/kernel/irq.c 2006-11-22 18:06:31.000000000 +0000
37603 +++ linux-2.6.16.33/arch/x86_64/kernel/irq.c 2007-05-23 21:00:01.000000000 +0000
37604 @@ -96,8 +96,8 @@
37605 */
37606 asmlinkage unsigned int do_IRQ(struct pt_regs *regs)
37607 {
37608 - /* high bits used in ret_from_ code */
37609 - unsigned irq = regs->orig_rax & 0xff;
37610 + /* high bit used in ret_from_ code */
37611 + unsigned irq = ~regs->orig_rax;
37612
37613 exit_idle();
37614 irq_enter();
37615 diff -Nur linux-2.6.16.33-noxen/arch/x86_64/kernel/ldt-xen.c linux-2.6.16.33/arch/x86_64/kernel/ldt-xen.c
37616 --- linux-2.6.16.33-noxen/arch/x86_64/kernel/ldt-xen.c 1970-01-01 00:00:00.000000000 +0000
37617 +++ linux-2.6.16.33/arch/x86_64/kernel/ldt-xen.c 2007-01-08 15:00:45.000000000 +0000
37618 @@ -0,0 +1,282 @@
37619 +/*
37620 + * linux/arch/x86_64/kernel/ldt.c
37621 + *
37622 + * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds
37623 + * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
37624 + * Copyright (C) 2002 Andi Kleen
37625 + *
37626 + * This handles calls from both 32bit and 64bit mode.
37627 + */
37628 +
37629 +#include <linux/errno.h>
37630 +#include <linux/sched.h>
37631 +#include <linux/string.h>
37632 +#include <linux/mm.h>
37633 +#include <linux/smp.h>
37634 +#include <linux/smp_lock.h>
37635 +#include <linux/vmalloc.h>
37636 +#include <linux/slab.h>
37637 +
37638 +#include <asm/uaccess.h>
37639 +#include <asm/system.h>
37640 +#include <asm/ldt.h>
37641 +#include <asm/desc.h>
37642 +#include <asm/proto.h>
37643 +#include <asm/pgalloc.h>
37644 +
37645 +#ifdef CONFIG_SMP /* avoids "defined but not used" warning */
37646 +static void flush_ldt(void *null)
37647 +{
37648 + if (current->active_mm)
37649 + load_LDT(&current->active_mm->context);
37650 +}
37651 +#endif
37652 +
37653 +static int alloc_ldt(mm_context_t *pc, unsigned mincount, int reload)
37654 +{
37655 + void *oldldt;
37656 + void *newldt;
37657 + unsigned oldsize;
37658 +
37659 + if (mincount <= (unsigned)pc->size)
37660 + return 0;
37661 + oldsize = pc->size;
37662 + mincount = (mincount+511)&(~511);
37663 + if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE)
37664 + newldt = vmalloc(mincount*LDT_ENTRY_SIZE);
37665 + else
37666 + newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL);
37667 +
37668 + if (!newldt)
37669 + return -ENOMEM;
37670 +
37671 + if (oldsize)
37672 + memcpy(newldt, pc->ldt, oldsize*LDT_ENTRY_SIZE);
37673 + oldldt = pc->ldt;
37674 + memset(newldt+oldsize*LDT_ENTRY_SIZE, 0, (mincount-oldsize)*LDT_ENTRY_SIZE);
37675 + wmb();
37676 + pc->ldt = newldt;
37677 + wmb();
37678 + pc->size = mincount;
37679 + wmb();
37680 + if (reload) {
37681 +#ifdef CONFIG_SMP
37682 + cpumask_t mask;
37683 +
37684 + preempt_disable();
37685 +#endif
37686 + make_pages_readonly(
37687 + pc->ldt,
37688 + (pc->size * LDT_ENTRY_SIZE) / PAGE_SIZE,
37689 + XENFEAT_writable_descriptor_tables);
37690 + load_LDT(pc);
37691 +#ifdef CONFIG_SMP
37692 + mask = cpumask_of_cpu(smp_processor_id());
37693 + if (!cpus_equal(current->mm->cpu_vm_mask, mask))
37694 + smp_call_function(flush_ldt, NULL, 1, 1);
37695 + preempt_enable();
37696 +#endif
37697 + }
37698 + if (oldsize) {
37699 + make_pages_writable(
37700 + oldldt,
37701 + (oldsize * LDT_ENTRY_SIZE) / PAGE_SIZE,
37702 + XENFEAT_writable_descriptor_tables);
37703 + if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE)
37704 + vfree(oldldt);
37705 + else
37706 + kfree(oldldt);
37707 + }
37708 + return 0;
37709 +}
37710 +
37711 +static inline int copy_ldt(mm_context_t *new, mm_context_t *old)
37712 +{
37713 + int err = alloc_ldt(new, old->size, 0);
37714 + if (err < 0)
37715 + return err;
37716 + memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE);
37717 + make_pages_readonly(
37718 + new->ldt,
37719 + (new->size * LDT_ENTRY_SIZE) / PAGE_SIZE,
37720 + XENFEAT_writable_descriptor_tables);
37721 + return 0;
37722 +}
37723 +
37724 +/*
37725 + * we do not have to muck with descriptors here, that is
37726 + * done in switch_mm() as needed.
37727 + */
37728 +int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
37729 +{
37730 + struct mm_struct * old_mm;
37731 + int retval = 0;
37732 +
37733 + memset(&mm->context, 0, sizeof(mm->context));
37734 + init_MUTEX(&mm->context.sem);
37735 + old_mm = current->mm;
37736 + if (old_mm && old_mm->context.size > 0) {
37737 + down(&old_mm->context.sem);
37738 + retval = copy_ldt(&mm->context, &old_mm->context);
37739 + up(&old_mm->context.sem);
37740 + }
37741 + if (retval == 0) {
37742 + spin_lock(&mm_unpinned_lock);
37743 + list_add(&mm->context.unpinned, &mm_unpinned);
37744 + spin_unlock(&mm_unpinned_lock);
37745 + }
37746 + return retval;
37747 +}
37748 +
37749 +/*
37750 + *
37751 + * Don't touch the LDT register - we're already in the next thread.
37752 + */
37753 +void destroy_context(struct mm_struct *mm)
37754 +{
37755 + if (mm->context.size) {
37756 + if (mm == current->active_mm)
37757 + clear_LDT();
37758 + make_pages_writable(
37759 + mm->context.ldt,
37760 + (mm->context.size * LDT_ENTRY_SIZE) / PAGE_SIZE,
37761 + XENFEAT_writable_descriptor_tables);
37762 + if (mm->context.size*LDT_ENTRY_SIZE > PAGE_SIZE)
37763 + vfree(mm->context.ldt);
37764 + else
37765 + kfree(mm->context.ldt);
37766 + mm->context.size = 0;
37767 + }
37768 + if (!mm->context.pinned) {
37769 + spin_lock(&mm_unpinned_lock);
37770 + list_del(&mm->context.unpinned);
37771 + spin_unlock(&mm_unpinned_lock);
37772 + }
37773 +}
37774 +
37775 +static int read_ldt(void __user * ptr, unsigned long bytecount)
37776 +{
37777 + int err;
37778 + unsigned long size;
37779 + struct mm_struct * mm = current->mm;
37780 +
37781 + if (!mm->context.size)
37782 + return 0;
37783 + if (bytecount > LDT_ENTRY_SIZE*LDT_ENTRIES)
37784 + bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES;
37785 +
37786 + down(&mm->context.sem);
37787 + size = mm->context.size*LDT_ENTRY_SIZE;
37788 + if (size > bytecount)
37789 + size = bytecount;
37790 +
37791 + err = 0;
37792 + if (copy_to_user(ptr, mm->context.ldt, size))
37793 + err = -EFAULT;
37794 + up(&mm->context.sem);
37795 + if (err < 0)
37796 + goto error_return;
37797 + if (size != bytecount) {
37798 + /* zero-fill the rest */
37799 + if (clear_user(ptr+size, bytecount-size) != 0) {
37800 + err = -EFAULT;
37801 + goto error_return;
37802 + }
37803 + }
37804 + return bytecount;
37805 +error_return:
37806 + return err;
37807 +}
37808 +
37809 +static int read_default_ldt(void __user * ptr, unsigned long bytecount)
37810 +{
37811 + /* Arbitrary number */
37812 + /* x86-64 default LDT is all zeros */
37813 + if (bytecount > 128)
37814 + bytecount = 128;
37815 + if (clear_user(ptr, bytecount))
37816 + return -EFAULT;
37817 + return bytecount;
37818 +}
37819 +
37820 +static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode)
37821 +{
37822 + struct task_struct *me = current;
37823 + struct mm_struct * mm = me->mm;
37824 + __u32 entry_1, entry_2, *lp;
37825 + unsigned long mach_lp;
37826 + int error;
37827 + struct user_desc ldt_info;
37828 +
37829 + error = -EINVAL;
37830 +
37831 + if (bytecount != sizeof(ldt_info))
37832 + goto out;
37833 + error = -EFAULT;
37834 + if (copy_from_user(&ldt_info, ptr, bytecount))
37835 + goto out;
37836 +
37837 + error = -EINVAL;
37838 + if (ldt_info.entry_number >= LDT_ENTRIES)
37839 + goto out;
37840 + if (ldt_info.contents == 3) {
37841 + if (oldmode)
37842 + goto out;
37843 + if (ldt_info.seg_not_present == 0)
37844 + goto out;
37845 + }
37846 +
37847 + down(&mm->context.sem);
37848 + if (ldt_info.entry_number >= (unsigned)mm->context.size) {
37849 + error = alloc_ldt(&current->mm->context, ldt_info.entry_number+1, 1);
37850 + if (error < 0)
37851 + goto out_unlock;
37852 + }
37853 +
37854 + lp = (__u32 *) ((ldt_info.entry_number << 3) + (char *) mm->context.ldt);
37855 + mach_lp = arbitrary_virt_to_machine(lp);
37856 +
37857 + /* Allow LDTs to be cleared by the user. */
37858 + if (ldt_info.base_addr == 0 && ldt_info.limit == 0) {
37859 + if (oldmode || LDT_empty(&ldt_info)) {
37860 + entry_1 = 0;
37861 + entry_2 = 0;
37862 + goto install;
37863 + }
37864 + }
37865 +
37866 + entry_1 = LDT_entry_a(&ldt_info);
37867 + entry_2 = LDT_entry_b(&ldt_info);
37868 + if (oldmode)
37869 + entry_2 &= ~(1 << 20);
37870 +
37871 + /* Install the new entry ... */
37872 +install:
37873 + error = HYPERVISOR_update_descriptor(mach_lp, (unsigned long)((entry_1 | (unsigned long) entry_2 << 32)));
37874 +
37875 +out_unlock:
37876 + up(&mm->context.sem);
37877 +out:
37878 + return error;
37879 +}
37880 +
37881 +asmlinkage int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount)
37882 +{
37883 + int ret = -ENOSYS;
37884 +
37885 + switch (func) {
37886 + case 0:
37887 + ret = read_ldt(ptr, bytecount);
37888 + break;
37889 + case 1:
37890 + ret = write_ldt(ptr, bytecount, 1);
37891 + break;
37892 + case 2:
37893 + ret = read_default_ldt(ptr, bytecount);
37894 + break;
37895 + case 0x11:
37896 + ret = write_ldt(ptr, bytecount, 0);
37897 + break;
37898 + }
37899 + return ret;
37900 +}
37901 diff -Nur linux-2.6.16.33-noxen/arch/x86_64/kernel/machine_kexec.c linux-2.6.16.33/arch/x86_64/kernel/machine_kexec.c
37902 --- linux-2.6.16.33-noxen/arch/x86_64/kernel/machine_kexec.c 2006-11-22 18:06:31.000000000 +0000
37903 +++ linux-2.6.16.33/arch/x86_64/kernel/machine_kexec.c 2007-01-08 15:00:45.000000000 +0000
37904 @@ -15,6 +15,113 @@
37905 #include <asm/mmu_context.h>
37906 #include <asm/io.h>
37907
37908 +#define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE)))
37909 +static u64 kexec_pgd[512] PAGE_ALIGNED;
37910 +static u64 kexec_pud0[512] PAGE_ALIGNED;
37911 +static u64 kexec_pmd0[512] PAGE_ALIGNED;
37912 +static u64 kexec_pte0[512] PAGE_ALIGNED;
37913 +static u64 kexec_pud1[512] PAGE_ALIGNED;
37914 +static u64 kexec_pmd1[512] PAGE_ALIGNED;
37915 +static u64 kexec_pte1[512] PAGE_ALIGNED;
37916 +
37917 +#ifdef CONFIG_XEN
37918 +
37919 +/* In the case of Xen, override hypervisor functions to be able to create
37920 + * a regular identity mapping page table...
37921 + */
37922 +
37923 +#include <xen/interface/kexec.h>
37924 +#include <xen/interface/memory.h>
37925 +
37926 +#define x__pmd(x) ((pmd_t) { (x) } )
37927 +#define x__pud(x) ((pud_t) { (x) } )
37928 +#define x__pgd(x) ((pgd_t) { (x) } )
37929 +
37930 +#define x_pmd_val(x) ((x).pmd)
37931 +#define x_pud_val(x) ((x).pud)
37932 +#define x_pgd_val(x) ((x).pgd)
37933 +
37934 +static inline void x_set_pmd(pmd_t *dst, pmd_t val)
37935 +{
37936 + x_pmd_val(*dst) = x_pmd_val(val);
37937 +}
37938 +
37939 +static inline void x_set_pud(pud_t *dst, pud_t val)
37940 +{
37941 + x_pud_val(*dst) = phys_to_machine(x_pud_val(val));
37942 +}
37943 +
37944 +static inline void x_pud_clear (pud_t *pud)
37945 +{
37946 + x_pud_val(*pud) = 0;
37947 +}
37948 +
37949 +static inline void x_set_pgd(pgd_t *dst, pgd_t val)
37950 +{
37951 + x_pgd_val(*dst) = phys_to_machine(x_pgd_val(val));
37952 +}
37953 +
37954 +static inline void x_pgd_clear (pgd_t * pgd)
37955 +{
37956 + x_pgd_val(*pgd) = 0;
37957 +}
37958 +
37959 +#define X__PAGE_KERNEL_LARGE_EXEC \
37960 + _PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_PSE
37961 +#define X_KERNPG_TABLE _PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY
37962 +
37963 +#define __ma(x) (pfn_to_mfn(__pa((x)) >> PAGE_SHIFT) << PAGE_SHIFT)
37964 +
37965 +#if PAGES_NR > KEXEC_XEN_NO_PAGES
37966 +#error PAGES_NR is greater than KEXEC_XEN_NO_PAGES - Xen support will break
37967 +#endif
37968 +
37969 +#if PA_CONTROL_PAGE != 0
37970 +#error PA_CONTROL_PAGE is non zero - Xen support will break
37971 +#endif
37972 +
37973 +void machine_kexec_setup_load_arg(xen_kexec_image_t *xki, struct kimage *image)
37974 +{
37975 + void *control_page;
37976 + void *table_page;
37977 +
37978 + memset(xki->page_list, 0, sizeof(xki->page_list));
37979 +
37980 + control_page = page_address(image->control_code_page) + PAGE_SIZE;
37981 + memcpy(control_page, relocate_kernel, PAGE_SIZE);
37982 +
37983 + table_page = page_address(image->control_code_page);
37984 +
37985 + xki->page_list[PA_CONTROL_PAGE] = __ma(control_page);
37986 + xki->page_list[PA_TABLE_PAGE] = __ma(table_page);
37987 +
37988 + xki->page_list[PA_PGD] = __ma(kexec_pgd);
37989 + xki->page_list[PA_PUD_0] = __ma(kexec_pud0);
37990 + xki->page_list[PA_PUD_1] = __ma(kexec_pud1);
37991 + xki->page_list[PA_PMD_0] = __ma(kexec_pmd0);
37992 + xki->page_list[PA_PMD_1] = __ma(kexec_pmd1);
37993 + xki->page_list[PA_PTE_0] = __ma(kexec_pte0);
37994 + xki->page_list[PA_PTE_1] = __ma(kexec_pte1);
37995 +}
37996 +
37997 +#else /* CONFIG_XEN */
37998 +
37999 +#define x__pmd(x) __pmd(x)
38000 +#define x__pud(x) __pud(x)
38001 +#define x__pgd(x) __pgd(x)
38002 +
38003 +#define x_set_pmd(x, y) set_pmd(x, y)
38004 +#define x_set_pud(x, y) set_pud(x, y)
38005 +#define x_set_pgd(x, y) set_pgd(x, y)
38006 +
38007 +#define x_pud_clear(x) pud_clear(x)
38008 +#define x_pgd_clear(x) pgd_clear(x)
38009 +
38010 +#define X__PAGE_KERNEL_LARGE_EXEC __PAGE_KERNEL_LARGE_EXEC
38011 +#define X_KERNPG_TABLE _KERNPG_TABLE
38012 +
38013 +#endif /* CONFIG_XEN */
38014 +
38015 static void init_level2_page(pmd_t *level2p, unsigned long addr)
38016 {
38017 unsigned long end_addr;
38018 @@ -22,7 +129,7 @@
38019 addr &= PAGE_MASK;
38020 end_addr = addr + PUD_SIZE;
38021 while (addr < end_addr) {
38022 - set_pmd(level2p++, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC));
38023 + x_set_pmd(level2p++, x__pmd(addr | X__PAGE_KERNEL_LARGE_EXEC));
38024 addr += PMD_SIZE;
38025 }
38026 }
38027 @@ -47,12 +154,12 @@
38028 }
38029 level2p = (pmd_t *)page_address(page);
38030 init_level2_page(level2p, addr);
38031 - set_pud(level3p++, __pud(__pa(level2p) | _KERNPG_TABLE));
38032 + x_set_pud(level3p++, x__pud(__pa(level2p) | X_KERNPG_TABLE));
38033 addr += PUD_SIZE;
38034 }
38035 /* clear the unused entries */
38036 while (addr < end_addr) {
38037 - pud_clear(level3p++);
38038 + x_pud_clear(level3p++);
38039 addr += PUD_SIZE;
38040 }
38041 out:
38042 @@ -83,12 +190,12 @@
38043 if (result) {
38044 goto out;
38045 }
38046 - set_pgd(level4p++, __pgd(__pa(level3p) | _KERNPG_TABLE));
38047 + x_set_pgd(level4p++, x__pgd(__pa(level3p) | X_KERNPG_TABLE));
38048 addr += PGDIR_SIZE;
38049 }
38050 /* clear the unused entries */
38051 while (addr < end_addr) {
38052 - pgd_clear(level4p++);
38053 + x_pgd_clear(level4p++);
38054 addr += PGDIR_SIZE;
38055 }
38056 out:
38057 @@ -99,77 +206,29 @@
38058 static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
38059 {
38060 pgd_t *level4p;
38061 - level4p = (pgd_t *)__va(start_pgtable);
38062 - return init_level4_page(image, level4p, 0, end_pfn << PAGE_SHIFT);
38063 -}
38064 -
38065 -static void set_idt(void *newidt, u16 limit)
38066 -{
38067 - struct desc_ptr curidt;
38068 + unsigned long x_end_pfn = end_pfn;
38069
38070 - /* x86-64 supports unaliged loads & stores */
38071 - curidt.size = limit;
38072 - curidt.address = (unsigned long)newidt;
38073 +#ifdef CONFIG_XEN
38074 + x_end_pfn = HYPERVISOR_memory_op(XENMEM_maximum_ram_page, NULL);
38075 +#endif
38076
38077 - __asm__ __volatile__ (
38078 - "lidtq %0\n"
38079 - : : "m" (curidt)
38080 - );
38081 -};
38082 -
38083 -
38084 -static void set_gdt(void *newgdt, u16 limit)
38085 -{
38086 - struct desc_ptr curgdt;
38087 -
38088 - /* x86-64 supports unaligned loads & stores */
38089 - curgdt.size = limit;
38090 - curgdt.address = (unsigned long)newgdt;
38091 -
38092 - __asm__ __volatile__ (
38093 - "lgdtq %0\n"
38094 - : : "m" (curgdt)
38095 - );
38096 -};
38097 -
38098 -static void load_segments(void)
38099 -{
38100 - __asm__ __volatile__ (
38101 - "\tmovl %0,%%ds\n"
38102 - "\tmovl %0,%%es\n"
38103 - "\tmovl %0,%%ss\n"
38104 - "\tmovl %0,%%fs\n"
38105 - "\tmovl %0,%%gs\n"
38106 - : : "a" (__KERNEL_DS) : "memory"
38107 - );
38108 + level4p = (pgd_t *)__va(start_pgtable);
38109 + return init_level4_page(image, level4p, 0, x_end_pfn << PAGE_SHIFT);
38110 }
38111
38112 -typedef NORET_TYPE void (*relocate_new_kernel_t)(unsigned long indirection_page,
38113 - unsigned long control_code_buffer,
38114 - unsigned long start_address,
38115 - unsigned long pgtable) ATTRIB_NORET;
38116 -
38117 -const extern unsigned char relocate_new_kernel[];
38118 -const extern unsigned long relocate_new_kernel_size;
38119 -
38120 int machine_kexec_prepare(struct kimage *image)
38121 {
38122 - unsigned long start_pgtable, control_code_buffer;
38123 + unsigned long start_pgtable;
38124 int result;
38125
38126 /* Calculate the offsets */
38127 start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT;
38128 - control_code_buffer = start_pgtable + PAGE_SIZE;
38129
38130 /* Setup the identity mapped 64bit page table */
38131 result = init_pgtable(image, start_pgtable);
38132 if (result)
38133 return result;
38134
38135 - /* Place the code in the reboot code buffer */
38136 - memcpy(__va(control_code_buffer), relocate_new_kernel,
38137 - relocate_new_kernel_size);
38138 -
38139 return 0;
38140 }
38141
38142 @@ -178,54 +237,43 @@
38143 return;
38144 }
38145
38146 +#ifndef CONFIG_XEN
38147 /*
38148 * Do not allocate memory (or fail in any way) in machine_kexec().
38149 * We are past the point of no return, committed to rebooting now.
38150 */
38151 NORET_TYPE void machine_kexec(struct kimage *image)
38152 {
38153 - unsigned long page_list;
38154 - unsigned long control_code_buffer;
38155 - unsigned long start_pgtable;
38156 - relocate_new_kernel_t rnk;
38157 + unsigned long page_list[PAGES_NR];
38158 + void *control_page;
38159
38160 /* Interrupts aren't acceptable while we reboot */
38161 local_irq_disable();
38162
38163 - /* Calculate the offsets */
38164 - page_list = image->head;
38165 - start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT;
38166 - control_code_buffer = start_pgtable + PAGE_SIZE;
38167 + control_page = page_address(image->control_code_page) + PAGE_SIZE;
38168 + memcpy(control_page, relocate_kernel, PAGE_SIZE);
38169 +
38170 + page_list[PA_CONTROL_PAGE] = __pa(control_page);
38171 + page_list[VA_CONTROL_PAGE] = (unsigned long)relocate_kernel;
38172 + page_list[PA_PGD] = __pa(kexec_pgd);
38173 + page_list[VA_PGD] = (unsigned long)kexec_pgd;
38174 + page_list[PA_PUD_0] = __pa(kexec_pud0);
38175 + page_list[VA_PUD_0] = (unsigned long)kexec_pud0;
38176 + page_list[PA_PMD_0] = __pa(kexec_pmd0);
38177 + page_list[VA_PMD_0] = (unsigned long)kexec_pmd0;
38178 + page_list[PA_PTE_0] = __pa(kexec_pte0);
38179 + page_list[VA_PTE_0] = (unsigned long)kexec_pte0;
38180 + page_list[PA_PUD_1] = __pa(kexec_pud1);
38181 + page_list[VA_PUD_1] = (unsigned long)kexec_pud1;
38182 + page_list[PA_PMD_1] = __pa(kexec_pmd1);
38183 + page_list[VA_PMD_1] = (unsigned long)kexec_pmd1;
38184 + page_list[PA_PTE_1] = __pa(kexec_pte1);
38185 + page_list[VA_PTE_1] = (unsigned long)kexec_pte1;
38186 +
38187 + page_list[PA_TABLE_PAGE] =
38188 + (unsigned long)__pa(page_address(image->control_code_page));
38189
38190 - /* Set the low half of the page table to my identity mapped
38191 - * page table for kexec. Leave the high half pointing at the
38192 - * kernel pages. Don't bother to flush the global pages
38193 - * as that will happen when I fully switch to my identity mapped
38194 - * page table anyway.
38195 - */
38196 - memcpy(__va(read_cr3()), __va(start_pgtable), PAGE_SIZE/2);
38197 - __flush_tlb();
38198 -
38199 -
38200 - /* The segment registers are funny things, they are
38201 - * automatically loaded from a table, in memory wherever you
38202 - * set them to a specific selector, but this table is never
38203 - * accessed again unless you set the segment to a different selector.
38204 - *
38205 - * The more common model are caches where the behide
38206 - * the scenes work is done, but is also dropped at arbitrary
38207 - * times.
38208 - *
38209 - * I take advantage of this here by force loading the
38210 - * segments, before I zap the gdt with an invalid value.
38211 - */
38212 - load_segments();
38213 - /* The gdt & idt are now invalid.
38214 - * If you want to load them you must set up your own idt & gdt.
38215 - */
38216 - set_gdt(phys_to_virt(0),0);
38217 - set_idt(phys_to_virt(0),0);
38218 - /* now call it */
38219 - rnk = (relocate_new_kernel_t) control_code_buffer;
38220 - (*rnk)(page_list, control_code_buffer, image->start, start_pgtable);
38221 + relocate_kernel((unsigned long)image->head, (unsigned long)page_list,
38222 + image->start);
38223 }
38224 +#endif
38225 diff -Nur linux-2.6.16.33-noxen/arch/x86_64/kernel/machine_kexec.c~ linux-2.6.16.33/arch/x86_64/kernel/machine_kexec.c~
38226 --- linux-2.6.16.33-noxen/arch/x86_64/kernel/machine_kexec.c~ 1970-01-01 00:00:00.000000000 +0000
38227 +++ linux-2.6.16.33/arch/x86_64/kernel/machine_kexec.c~ 2007-05-23 21:00:01.000000000 +0000
38228 @@ -0,0 +1,228 @@
38229 +/*
38230 + * machine_kexec.c - handle transition of Linux booting another kernel
38231 + * Copyright (C) 2002-2005 Eric Biederman <ebiederm@xmission.com>
38232 + *
38233 + * This source code is licensed under the GNU General Public License,
38234 + * Version 2. See the file COPYING for more details.
38235 + */
38236 +
38237 +#include <linux/mm.h>
38238 +#include <linux/kexec.h>
38239 +#include <linux/string.h>
38240 +#include <linux/reboot.h>
38241 +#include <asm/pgtable.h>
38242 +#include <asm/tlbflush.h>
38243 +#include <asm/mmu_context.h>
38244 +#include <asm/io.h>
38245 +
38246 +static void init_level2_page(pmd_t *level2p, unsigned long addr)
38247 +{
38248 + unsigned long end_addr;
38249 +
38250 + addr &= PAGE_MASK;
38251 + end_addr = addr + PUD_SIZE;
38252 + while (addr < end_addr) {
38253 + set_pmd(level2p++, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC));
38254 + addr += PMD_SIZE;
38255 + }
38256 +}
38257 +
38258 +static int init_level3_page(struct kimage *image, pud_t *level3p,
38259 + unsigned long addr, unsigned long last_addr)
38260 +{
38261 + unsigned long end_addr;
38262 + int result;
38263 +
38264 + result = 0;
38265 + addr &= PAGE_MASK;
38266 + end_addr = addr + PGDIR_SIZE;
38267 + while ((addr < last_addr) && (addr < end_addr)) {
38268 + struct page *page;
38269 + pmd_t *level2p;
38270 +
38271 + page = kimage_alloc_control_pages(image, 0);
38272 + if (!page) {
38273 + result = -ENOMEM;
38274 + goto out;
38275 + }
38276 + level2p = (pmd_t *)page_address(page);
38277 + init_level2_page(level2p, addr);
38278 + set_pud(level3p++, __pud(__pa(level2p) | _KERNPG_TABLE));
38279 + addr += PUD_SIZE;
38280 + }
38281 + /* clear the unused entries */
38282 + while (addr < end_addr) {
38283 + pud_clear(level3p++);
38284 + addr += PUD_SIZE;
38285 + }
38286 +out:
38287 + return result;
38288 +}
38289 +
38290 +
38291 +static int init_level4_page(struct kimage *image, pgd_t *level4p,
38292 + unsigned long addr, unsigned long last_addr)
38293 +{
38294 + unsigned long end_addr;
38295 + int result;
38296 +
38297 + result = 0;
38298 + addr &= PAGE_MASK;
38299 + end_addr = addr + (PTRS_PER_PGD * PGDIR_SIZE);
38300 + while ((addr < last_addr) && (addr < end_addr)) {
38301 + struct page *page;
38302 + pud_t *level3p;
38303 +
38304 + page = kimage_alloc_control_pages(image, 0);
38305 + if (!page) {
38306 + result = -ENOMEM;
38307 + goto out;
38308 + }
38309 + level3p = (pud_t *)page_address(page);
38310 + result = init_level3_page(image, level3p, addr, last_addr);
38311 + if (result) {
38312 + goto out;
38313 + }
38314 + set_pgd(level4p++, __pgd(__pa(level3p) | _KERNPG_TABLE));
38315 + addr += PGDIR_SIZE;
38316 + }
38317 + /* clear the unused entries */
38318 + while (addr < end_addr) {
38319 + pgd_clear(level4p++);
38320 + addr += PGDIR_SIZE;
38321 + }
38322 +out:
38323 + return result;
38324 +}
38325 +
38326 +
38327 +static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
38328 +{
38329 + pgd_t *level4p;
38330 + level4p = (pgd_t *)__va(start_pgtable);
38331 + return init_level4_page(image, level4p, 0, end_pfn << PAGE_SHIFT);
38332 +}
38333 +
38334 +static void set_idt(void *newidt, u16 limit)
38335 +{
38336 + struct desc_ptr curidt;
38337 +
38338 + /* x86-64 supports unaligned loads & stores */
38339 + curidt.size = limit;
38340 + curidt.address = (unsigned long)newidt;
38341 +
38342 + __asm__ __volatile__ (
38343 + "lidtq %0\n"
38344 + : : "m" (curidt)
38345 + );
38346 +};
38347 +
38348 +
38349 +static void set_gdt(void *newgdt, u16 limit)
38350 +{
38351 + struct desc_ptr curgdt;
38352 +
38353 + /* x86-64 supports unaligned loads & stores */
38354 + curgdt.size = limit;
38355 + curgdt.address = (unsigned long)newgdt;
38356 +
38357 + __asm__ __volatile__ (
38358 + "lgdtq %0\n"
38359 + : : "m" (curgdt)
38360 + );
38361 +};
38362 +
38363 +static void load_segments(void)
38364 +{
38365 + __asm__ __volatile__ (
38366 + "\tmovl %0,%%ds\n"
38367 + "\tmovl %0,%%es\n"
38368 + "\tmovl %0,%%ss\n"
38369 + "\tmovl %0,%%fs\n"
38370 + "\tmovl %0,%%gs\n"
38371 + : : "a" (__KERNEL_DS) : "memory"
38372 + );
38373 +}
38374 +
38375 +typedef NORET_TYPE void (*relocate_new_kernel_t)(unsigned long indirection_page,
38376 + unsigned long control_code_buffer,
38377 + unsigned long start_address,
38378 + unsigned long pgtable) ATTRIB_NORET;
38379 +
38380 +extern const unsigned char relocate_new_kernel[];
38381 +extern const unsigned long relocate_new_kernel_size;
38382 +
38383 +int machine_kexec_prepare(struct kimage *image)
38384 +{
38385 + unsigned long start_pgtable, control_code_buffer;
38386 + int result;
38387 +
38388 + /* Calculate the offsets */
38389 + start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT;
38390 + control_code_buffer = start_pgtable + PAGE_SIZE;
38391 +
38392 + /* Setup the identity mapped 64bit page table */
38393 + result = init_pgtable(image, start_pgtable);
38394 + if (result)
38395 + return result;
38396 +
38397 + /* Place the code in the reboot code buffer */
38398 + memcpy(__va(control_code_buffer), relocate_new_kernel,
38399 + relocate_new_kernel_size);
38400 +
38401 + return 0;
38402 +}
38403 +
38404 +void machine_kexec_cleanup(struct kimage *image)
38405 +{
38406 + return;
38407 +}
38408 +
38409 +/*
38410 + * Do not allocate memory (or fail in any way) in machine_kexec().
38411 + * We are past the point of no return, committed to rebooting now.
38412 + */
38413 +NORET_TYPE void machine_kexec(struct kimage *image)
38414 +{
38415 + unsigned long page_list;
38416 + unsigned long control_code_buffer;
38417 + unsigned long start_pgtable;
38418 + relocate_new_kernel_t rnk;
38419 +
38420 + /* Interrupts aren't acceptable while we reboot */
38421 + local_irq_disable();
38422 +
38423 + /* Calculate the offsets */
38424 + page_list = image->head;
38425 + start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT;
38426 + control_code_buffer = start_pgtable + PAGE_SIZE;
38427 +
38428 + /* Set the low half of the page table to my identity mapped
38429 + * page table for kexec. Leave the high half pointing at the
38430 + * kernel pages. Don't bother to flush the global pages
38431 + * as that will happen when I fully switch to my identity mapped
38432 + * page table anyway.
38433 + */
38434 + memcpy(__va(read_cr3()), __va(start_pgtable), PAGE_SIZE/2);
38435 + __flush_tlb();
38436 +
38437 +
38438 + /* The segment registers are funny things, they have both a
38439 + * visible and an invisible part. Whenever the visible part is
38440 + * set to a specific selector, the invisible part is loaded
38441 + * with from a table in memory. At no other time is the
38442 + * descriptor table in memory accessed.
38443 + *
38444 + * I take advantage of this here by force loading the
38445 + * segments, before I zap the gdt with an invalid value.
38446 + */
38447 + load_segments();
38448 + /* The gdt & idt are now invalid.
38449 + * If you want to load them you must set up your own idt & gdt.
38450 + */
38451 + set_gdt(phys_to_virt(0),0);
38452 + set_idt(phys_to_virt(0),0);
38453 + /* now call it */
38454 + rnk = (relocate_new_kernel_t) control_code_buffer;
38455 + (*rnk)(page_list, control_code_buffer, image->start, start_pgtable);
38456 +}
38457 diff -Nur linux-2.6.16.33-noxen/arch/x86_64/kernel/mpparse-xen.c linux-2.6.16.33/arch/x86_64/kernel/mpparse-xen.c
38458 --- linux-2.6.16.33-noxen/arch/x86_64/kernel/mpparse-xen.c 1970-01-01 00:00:00.000000000 +0000
38459 +++ linux-2.6.16.33/arch/x86_64/kernel/mpparse-xen.c 2007-01-08 15:00:45.000000000 +0000
38460 @@ -0,0 +1,1005 @@
38461 +/*
38462 + * Intel Multiprocessor Specification 1.1 and 1.4
38463 + * compliant MP-table parsing routines.
38464 + *
38465 + * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
38466 + * (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com>
38467 + *
38468 + * Fixes
38469 + * Erich Boleyn : MP v1.4 and additional changes.
38470 + * Alan Cox : Added EBDA scanning
38471 + * Ingo Molnar : various cleanups and rewrites
38472 + * Maciej W. Rozycki: Bits for default MP configurations
38473 + * Paul Diefenbaugh: Added full ACPI support
38474 + */
38475 +
38476 +#include <linux/mm.h>
38477 +#include <linux/init.h>
38478 +#include <linux/delay.h>
38479 +#include <linux/config.h>
38480 +#include <linux/bootmem.h>
38481 +#include <linux/smp_lock.h>
38482 +#include <linux/kernel_stat.h>
38483 +#include <linux/mc146818rtc.h>
38484 +#include <linux/acpi.h>
38485 +#include <linux/module.h>
38486 +
38487 +#include <asm/smp.h>
38488 +#include <asm/mtrr.h>
38489 +#include <asm/mpspec.h>
38490 +#include <asm/pgalloc.h>
38491 +#include <asm/io_apic.h>
38492 +#include <asm/proto.h>
38493 +#include <asm/acpi.h>
38494 +
38495 +/* Have we found an MP table */
38496 +int smp_found_config;
38497 +unsigned int __initdata maxcpus = NR_CPUS;
38498 +
38499 +int acpi_found_madt;
38500 +
38501 +/*
38502 + * Various Linux-internal data structures created from the
38503 + * MP-table.
38504 + */
38505 +unsigned char apic_version [MAX_APICS];
38506 +unsigned char mp_bus_id_to_type [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 };
38507 +int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 };
38508 +
38509 +static int mp_current_pci_id = 0;
38510 +/* I/O APIC entries */
38511 +struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS];
38512 +
38513 +/* # of MP IRQ source entries */
38514 +struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
38515 +
38516 +/* MP IRQ source entries */
38517 +int mp_irq_entries;
38518 +
38519 +int nr_ioapics;
38520 +int pic_mode;
38521 +unsigned long mp_lapic_addr = 0;
38522 +
38523 +
38524 +
38525 +/* Processor that is doing the boot up */
38526 +unsigned int boot_cpu_id = -1U;
38527 +/* Internal processor count */
38528 +unsigned int num_processors __initdata = 0;
38529 +
38530 +unsigned disabled_cpus __initdata;
38531 +
38532 +/* Bitmask of physically existing CPUs */
38533 +physid_mask_t phys_cpu_present_map = PHYSID_MASK_NONE;
38534 +
38535 +/* ACPI MADT entry parsing functions */
38536 +#ifdef CONFIG_ACPI
38537 +extern struct acpi_boot_flags acpi_boot;
38538 +#ifdef CONFIG_X86_LOCAL_APIC
38539 +extern int acpi_parse_lapic (acpi_table_entry_header *header);
38540 +extern int acpi_parse_lapic_addr_ovr (acpi_table_entry_header *header);
38541 +extern int acpi_parse_lapic_nmi (acpi_table_entry_header *header);
38542 +#endif /*CONFIG_X86_LOCAL_APIC*/
38543 +#ifdef CONFIG_X86_IO_APIC
38544 +extern int acpi_parse_ioapic (acpi_table_entry_header *header);
38545 +#endif /*CONFIG_X86_IO_APIC*/
38546 +#endif /*CONFIG_ACPI*/
38547 +
38548 +u8 bios_cpu_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
38549 +
38550 +
38551 +/*
38552 + * Intel MP BIOS table parsing routines:
38553 + */
38554 +
38555 +/*
38556 + * Checksum an MP configuration block.
38557 + */
38558 +
38559 +static int __init mpf_checksum(unsigned char *mp, int len)
38560 +{
38561 + int sum = 0;
38562 +
38563 + while (len--)
38564 + sum += *mp++;
38565 +
38566 + return sum & 0xFF;
38567 +}
38568 +
38569 +#ifndef CONFIG_XEN
38570 +static void __init MP_processor_info (struct mpc_config_processor *m)
38571 +{
38572 + int cpu;
38573 + unsigned char ver;
38574 + static int found_bsp=0;
38575 +
38576 + if (!(m->mpc_cpuflag & CPU_ENABLED)) {
38577 + disabled_cpus++;
38578 + return;
38579 + }
38580 +
38581 + printk(KERN_INFO "Processor #%d %d:%d APIC version %d\n",
38582 + m->mpc_apicid,
38583 + (m->mpc_cpufeature & CPU_FAMILY_MASK)>>8,
38584 + (m->mpc_cpufeature & CPU_MODEL_MASK)>>4,
38585 + m->mpc_apicver);
38586 +
38587 + if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
38588 + Dprintk(" Bootup CPU\n");
38589 + boot_cpu_id = m->mpc_apicid;
38590 + }
38591 + if (num_processors >= NR_CPUS) {
38592 + printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached."
38593 + " Processor ignored.\n", NR_CPUS);
38594 + return;
38595 + }
38596 +
38597 + cpu = num_processors++;
38598 +
38599 +#if MAX_APICS < 255
38600 + if ((int)m->mpc_apicid > MAX_APICS) {
38601 + printk(KERN_ERR "Processor #%d INVALID. (Max ID: %d).\n",
38602 + m->mpc_apicid, MAX_APICS);
38603 + return;
38604 + }
38605 +#endif
38606 + ver = m->mpc_apicver;
38607 +
38608 + physid_set(m->mpc_apicid, phys_cpu_present_map);
38609 + /*
38610 + * Validate version
38611 + */
38612 + if (ver == 0x0) {
38613 + printk(KERN_ERR "BIOS bug, APIC version is 0 for CPU#%d! fixing up to 0x10. (tell your hw vendor)\n", m->mpc_apicid);
38614 + ver = 0x10;
38615 + }
38616 + apic_version[m->mpc_apicid] = ver;
38617 + if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
38618 + /*
38619 + * bios_cpu_apicid is required to have processors listed
38620 + * in same order as logical cpu numbers. Hence the first
38621 + * entry is BSP, and so on.
38622 + */
38623 + cpu = 0;
38624 +
38625 + bios_cpu_apicid[0] = m->mpc_apicid;
38626 + x86_cpu_to_apicid[0] = m->mpc_apicid;
38627 + found_bsp = 1;
38628 + } else
38629 + cpu = num_processors - found_bsp;
38630 + bios_cpu_apicid[cpu] = m->mpc_apicid;
38631 + x86_cpu_to_apicid[cpu] = m->mpc_apicid;
38632 +
38633 + cpu_set(cpu, cpu_possible_map);
38634 + cpu_set(cpu, cpu_present_map);
38635 +}
38636 +#else
38637 +void __init MP_processor_info (struct mpc_config_processor *m)
38638 +{
38639 + num_processors++;
38640 +}
38641 +#endif /* CONFIG_XEN */
38642 +
38643 +static void __init MP_bus_info (struct mpc_config_bus *m)
38644 +{
38645 + char str[7];
38646 +
38647 + memcpy(str, m->mpc_bustype, 6);
38648 + str[6] = 0;
38649 + Dprintk("Bus #%d is %s\n", m->mpc_busid, str);
38650 +
38651 + if (strncmp(str, "ISA", 3) == 0) {
38652 + mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA;
38653 + } else if (strncmp(str, "EISA", 4) == 0) {
38654 + mp_bus_id_to_type[m->mpc_busid] = MP_BUS_EISA;
38655 + } else if (strncmp(str, "PCI", 3) == 0) {
38656 + mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI;
38657 + mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id;
38658 + mp_current_pci_id++;
38659 + } else if (strncmp(str, "MCA", 3) == 0) {
38660 + mp_bus_id_to_type[m->mpc_busid] = MP_BUS_MCA;
38661 + } else {
38662 + printk(KERN_ERR "Unknown bustype %s\n", str);
38663 + }
38664 +}
38665 +
38666 +static void __init MP_ioapic_info (struct mpc_config_ioapic *m)
38667 +{
38668 + if (!(m->mpc_flags & MPC_APIC_USABLE))
38669 + return;
38670 +
38671 + printk("I/O APIC #%d Version %d at 0x%X.\n",
38672 + m->mpc_apicid, m->mpc_apicver, m->mpc_apicaddr);
38673 + if (nr_ioapics >= MAX_IO_APICS) {
38674 + printk(KERN_ERR "Max # of I/O APICs (%d) exceeded (found %d).\n",
38675 + MAX_IO_APICS, nr_ioapics);
38676 + panic("Recompile kernel with bigger MAX_IO_APICS!.\n");
38677 + }
38678 + if (!m->mpc_apicaddr) {
38679 + printk(KERN_ERR "WARNING: bogus zero I/O APIC address"
38680 + " found in MP table, skipping!\n");
38681 + return;
38682 + }
38683 + mp_ioapics[nr_ioapics] = *m;
38684 + nr_ioapics++;
38685 +}
38686 +
38687 +static void __init MP_intsrc_info (struct mpc_config_intsrc *m)
38688 +{
38689 + mp_irqs [mp_irq_entries] = *m;
38690 + Dprintk("Int: type %d, pol %d, trig %d, bus %d,"
38691 + " IRQ %02x, APIC ID %x, APIC INT %02x\n",
38692 + m->mpc_irqtype, m->mpc_irqflag & 3,
38693 + (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus,
38694 + m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq);
38695 + if (++mp_irq_entries >= MAX_IRQ_SOURCES)
38696 + panic("Max # of irq sources exceeded!!\n");
38697 +}
38698 +
38699 +static void __init MP_lintsrc_info (struct mpc_config_lintsrc *m)
38700 +{
38701 + Dprintk("Lint: type %d, pol %d, trig %d, bus %d,"
38702 + " IRQ %02x, APIC ID %x, APIC LINT %02x\n",
38703 + m->mpc_irqtype, m->mpc_irqflag & 3,
38704 + (m->mpc_irqflag >> 2) &3, m->mpc_srcbusid,
38705 + m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint);
38706 + /*
38707 + * Well it seems all SMP boards in existence
38708 + * use ExtINT/LVT1 == LINT0 and
38709 + * NMI/LVT2 == LINT1 - the following check
38710 + * will show us if this assumptions is false.
38711 + * will show us if this assumption is false.
38712 + */
38713 + if ((m->mpc_irqtype == mp_ExtINT) &&
38714 + (m->mpc_destapiclint != 0))
38715 + BUG();
38716 + if ((m->mpc_irqtype == mp_NMI) &&
38717 + (m->mpc_destapiclint != 1))
38718 + BUG();
38719 +}
38720 +
38721 +/*
38722 + * Read/parse the MPC
38723 + */
38724 +
38725 +static int __init smp_read_mpc(struct mp_config_table *mpc)
38726 +{
38727 + char str[16];
38728 + int count=sizeof(*mpc);
38729 + unsigned char *mpt=((unsigned char *)mpc)+count;
38730 +
38731 + if (memcmp(mpc->mpc_signature,MPC_SIGNATURE,4)) {
38732 + printk("SMP mptable: bad signature [%c%c%c%c]!\n",
38733 + mpc->mpc_signature[0],
38734 + mpc->mpc_signature[1],
38735 + mpc->mpc_signature[2],
38736 + mpc->mpc_signature[3]);
38737 + return 0;
38738 + }
38739 + if (mpf_checksum((unsigned char *)mpc,mpc->mpc_length)) {
38740 + printk("SMP mptable: checksum error!\n");
38741 + return 0;
38742 + }
38743 + if (mpc->mpc_spec!=0x01 && mpc->mpc_spec!=0x04) {
38744 + printk(KERN_ERR "SMP mptable: bad table version (%d)!!\n",
38745 + mpc->mpc_spec);
38746 + return 0;
38747 + }
38748 + if (!mpc->mpc_lapic) {
38749 + printk(KERN_ERR "SMP mptable: null local APIC address!\n");
38750 + return 0;
38751 + }
38752 + memcpy(str,mpc->mpc_oem,8);
38753 + str[8]=0;
38754 + printk(KERN_INFO "OEM ID: %s ",str);
38755 +
38756 + memcpy(str,mpc->mpc_productid,12);
38757 + str[12]=0;
38758 + printk("Product ID: %s ",str);
38759 +
38760 + printk("APIC at: 0x%X\n",mpc->mpc_lapic);
38761 +
38762 + /* save the local APIC address, it might be non-default */
38763 + if (!acpi_lapic)
38764 + mp_lapic_addr = mpc->mpc_lapic;
38765 +
38766 + /*
38767 + * Now process the configuration blocks.
38768 + */
38769 + while (count < mpc->mpc_length) {
38770 + switch(*mpt) {
38771 + case MP_PROCESSOR:
38772 + {
38773 + struct mpc_config_processor *m=
38774 + (struct mpc_config_processor *)mpt;
38775 + if (!acpi_lapic)
38776 + MP_processor_info(m);
38777 + mpt += sizeof(*m);
38778 + count += sizeof(*m);
38779 + break;
38780 + }
38781 + case MP_BUS:
38782 + {
38783 + struct mpc_config_bus *m=
38784 + (struct mpc_config_bus *)mpt;
38785 + MP_bus_info(m);
38786 + mpt += sizeof(*m);
38787 + count += sizeof(*m);
38788 + break;
38789 + }
38790 + case MP_IOAPIC:
38791 + {
38792 + struct mpc_config_ioapic *m=
38793 + (struct mpc_config_ioapic *)mpt;
38794 + MP_ioapic_info(m);
38795 + mpt+=sizeof(*m);
38796 + count+=sizeof(*m);
38797 + break;
38798 + }
38799 + case MP_INTSRC:
38800 + {
38801 + struct mpc_config_intsrc *m=
38802 + (struct mpc_config_intsrc *)mpt;
38803 +
38804 + MP_intsrc_info(m);
38805 + mpt+=sizeof(*m);
38806 + count+=sizeof(*m);
38807 + break;
38808 + }
38809 + case MP_LINTSRC:
38810 + {
38811 + struct mpc_config_lintsrc *m=
38812 + (struct mpc_config_lintsrc *)mpt;
38813 + MP_lintsrc_info(m);
38814 + mpt+=sizeof(*m);
38815 + count+=sizeof(*m);
38816 + break;
38817 + }
38818 + }
38819 + }
38820 + clustered_apic_check();
38821 + if (!num_processors)
38822 + printk(KERN_ERR "SMP mptable: no processors registered!\n");
38823 + return num_processors;
38824 +}
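For illustration only: smp_read_mpc() above advances through variable-size records, stepping by sizeof(*m) for whichever entry type the leading byte announces. A stand-alone toy version of that walk follows; the per-type sizes (20 bytes for a processor entry, 8 bytes for the rest) follow the MP specification and should be treated as assumptions of this sketch, not as the patch's definitions.

#include <stdio.h>
#include <stdint.h>

/* Assumed entry sizes, indexed by type byte: processor, bus, ioapic, intsrc, lintsrc. */
static const int entry_size[] = { 20, 8, 8, 8, 8 };

static void walk(const uint8_t *tbl, int len)
{
	int off = 0;

	while (off < len) {
		uint8_t type = tbl[off];

		if (type >= sizeof(entry_size) / sizeof(entry_size[0]))
			break;	/* unknown type: a real parser would need its length */
		printf("entry type %u, %d bytes at offset %d\n", type, entry_size[type], off);
		off += entry_size[type];
	}
}

int main(void)
{
	uint8_t tbl[36] = {0};

	tbl[0]  = 0;	/* one processor entry (20 bytes) */
	tbl[20] = 1;	/* one bus entry (8 bytes) */
	tbl[28] = 2;	/* one I/O APIC entry (8 bytes) */
	walk(tbl, sizeof(tbl));
	return 0;
}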
38825 +
38826 +static int __init ELCR_trigger(unsigned int irq)
38827 +{
38828 + unsigned int port;
38829 +
38830 + port = 0x4d0 + (irq >> 3);
38831 + return (inb(port) >> (irq & 7)) & 1;
38832 +}
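ELCR_trigger() above reads the edge/level control registers of the two 8259 PICs: one bit per ISA IRQ, IRQs 0-7 in port 0x4d0 and IRQs 8-15 in port 0x4d1, with a set bit meaning level-triggered. A minimal user-space sketch of the same port/bit arithmetic, without the actual inb():

#include <stdio.h>

int main(void)
{
	for (int irq = 0; irq < 16; irq++) {
		unsigned int port = 0x4d0 + (irq >> 3);	/* low or high ELCR byte */
		unsigned int bit  = irq & 7;		/* bit within that byte */

		printf("IRQ %2d -> ELCR port 0x%03x, bit %u\n", irq, port, bit);
	}
	return 0;
}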
38833 +
38834 +static void __init construct_default_ioirq_mptable(int mpc_default_type)
38835 +{
38836 + struct mpc_config_intsrc intsrc;
38837 + int i;
38838 + int ELCR_fallback = 0;
38839 +
38840 + intsrc.mpc_type = MP_INTSRC;
38841 + intsrc.mpc_irqflag = 0; /* conforming */
38842 + intsrc.mpc_srcbus = 0;
38843 + intsrc.mpc_dstapic = mp_ioapics[0].mpc_apicid;
38844 +
38845 + intsrc.mpc_irqtype = mp_INT;
38846 +
38847 + /*
38848 + * If true, we have an ISA/PCI system with no IRQ entries
38849 + * in the MP table. To prevent the PCI interrupts from being set up
38850 + * incorrectly, we try to use the ELCR. The sanity check to see if
38851 + * there is good ELCR data is very simple - IRQ0, 1, 2 and 13 can
38852 + * never be level sensitive, so we simply see if the ELCR agrees.
38853 + * If it does, we assume it's valid.
38854 + */
38855 + if (mpc_default_type == 5) {
38856 + printk(KERN_INFO "ISA/PCI bus type with no IRQ information... falling back to ELCR\n");
38857 +
38858 + if (ELCR_trigger(0) || ELCR_trigger(1) || ELCR_trigger(2) || ELCR_trigger(13))
38859 + printk(KERN_ERR "ELCR contains invalid data... not using ELCR\n");
38860 + else {
38861 + printk(KERN_INFO "Using ELCR to identify PCI interrupts\n");
38862 + ELCR_fallback = 1;
38863 + }
38864 + }
38865 +
38866 + for (i = 0; i < 16; i++) {
38867 + switch (mpc_default_type) {
38868 + case 2:
38869 + if (i == 0 || i == 13)
38870 + continue; /* IRQ0 & IRQ13 not connected */
38871 + /* fall through */
38872 + default:
38873 + if (i == 2)
38874 + continue; /* IRQ2 is never connected */
38875 + }
38876 +
38877 + if (ELCR_fallback) {
38878 + /*
38879 + * If the ELCR indicates a level-sensitive interrupt, we
38880 + * copy that information over to the MP table in the
38881 + * irqflag field (level sensitive, active high polarity).
38882 + */
38883 + if (ELCR_trigger(i))
38884 + intsrc.mpc_irqflag = 13;
38885 + else
38886 + intsrc.mpc_irqflag = 0;
38887 + }
38888 +
38889 + intsrc.mpc_srcbusirq = i;
38890 + intsrc.mpc_dstirq = i ? i : 2; /* IRQ0 to INTIN2 */
38891 + MP_intsrc_info(&intsrc);
38892 + }
38893 +
38894 + intsrc.mpc_irqtype = mp_ExtINT;
38895 + intsrc.mpc_srcbusirq = 0;
38896 + intsrc.mpc_dstirq = 0; /* 8259A to INTIN0 */
38897 + MP_intsrc_info(&intsrc);
38898 +}
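As a rough, purely illustrative sketch of the default wiring built above (for the common non-type-2 configurations, ignoring the ELCR fallback): ISA IRQ i is routed to I/O APIC input i, except that IRQ 0 lands on INTIN2 and IRQ 2 gets no entry of its own.

#include <stdio.h>

int main(void)
{
	for (int irq = 0; irq < 16; irq++) {
		if (irq == 2) {
			printf("IRQ  2 -> no entry (INTIN2 carries IRQ 0)\n");
			continue;
		}
		printf("IRQ %2d -> INTIN%d\n", irq, irq ? irq : 2);
	}
	return 0;
}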
38899 +
38900 +static inline void __init construct_default_ISA_mptable(int mpc_default_type)
38901 +{
38902 + struct mpc_config_processor processor;
38903 + struct mpc_config_bus bus;
38904 + struct mpc_config_ioapic ioapic;
38905 + struct mpc_config_lintsrc lintsrc;
38906 + int linttypes[2] = { mp_ExtINT, mp_NMI };
38907 + int i;
38908 +
38909 + /*
38910 + * local APIC has default address
38911 + */
38912 + mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
38913 +
38914 + /*
38915 + * 2 CPUs, numbered 0 & 1.
38916 + */
38917 + processor.mpc_type = MP_PROCESSOR;
38918 + /* Either an integrated APIC or a discrete 82489DX. */
38919 + processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
38920 + processor.mpc_cpuflag = CPU_ENABLED;
38921 + processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
38922 + (boot_cpu_data.x86_model << 4) |
38923 + boot_cpu_data.x86_mask;
38924 + processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
38925 + processor.mpc_reserved[0] = 0;
38926 + processor.mpc_reserved[1] = 0;
38927 + for (i = 0; i < 2; i++) {
38928 + processor.mpc_apicid = i;
38929 + MP_processor_info(&processor);
38930 + }
38931 +
38932 + bus.mpc_type = MP_BUS;
38933 + bus.mpc_busid = 0;
38934 + switch (mpc_default_type) {
38935 + default:
38936 + printk(KERN_ERR "???\nUnknown standard configuration %d\n",
38937 + mpc_default_type);
38938 + /* fall through */
38939 + case 1:
38940 + case 5:
38941 + memcpy(bus.mpc_bustype, "ISA ", 6);
38942 + break;
38943 + case 2:
38944 + case 6:
38945 + case 3:
38946 + memcpy(bus.mpc_bustype, "EISA ", 6);
38947 + break;
38948 + case 4:
38949 + case 7:
38950 + memcpy(bus.mpc_bustype, "MCA ", 6);
38951 + }
38952 + MP_bus_info(&bus);
38953 + if (mpc_default_type > 4) {
38954 + bus.mpc_busid = 1;
38955 + memcpy(bus.mpc_bustype, "PCI ", 6);
38956 + MP_bus_info(&bus);
38957 + }
38958 +
38959 + ioapic.mpc_type = MP_IOAPIC;
38960 + ioapic.mpc_apicid = 2;
38961 + ioapic.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
38962 + ioapic.mpc_flags = MPC_APIC_USABLE;
38963 + ioapic.mpc_apicaddr = 0xFEC00000;
38964 + MP_ioapic_info(&ioapic);
38965 +
38966 + /*
38967 + * We set up most of the low 16 IO-APIC pins according to MPS rules.
38968 + */
38969 + construct_default_ioirq_mptable(mpc_default_type);
38970 +
38971 + lintsrc.mpc_type = MP_LINTSRC;
38972 + lintsrc.mpc_irqflag = 0; /* conforming */
38973 + lintsrc.mpc_srcbusid = 0;
38974 + lintsrc.mpc_srcbusirq = 0;
38975 + lintsrc.mpc_destapic = MP_APIC_ALL;
38976 + for (i = 0; i < 2; i++) {
38977 + lintsrc.mpc_irqtype = linttypes[i];
38978 + lintsrc.mpc_destapiclint = i;
38979 + MP_lintsrc_info(&lintsrc);
38980 + }
38981 +}
38982 +
38983 +static struct intel_mp_floating *mpf_found;
38984 +
38985 +/*
38986 + * Scan the memory blocks for an SMP configuration block.
38987 + */
38988 +void __init get_smp_config (void)
38989 +{
38990 + struct intel_mp_floating *mpf = mpf_found;
38991 +
38992 + /*
38993 + * ACPI supports both logical (e.g. Hyper-Threading) and physical
38994 + * processors, where MPS only supports physical.
38995 + */
38996 + if (acpi_lapic && acpi_ioapic) {
38997 + printk(KERN_INFO "Using ACPI (MADT) for SMP configuration information\n");
38998 + return;
38999 + }
39000 + else if (acpi_lapic)
39001 + printk(KERN_INFO "Using ACPI for processor (LAPIC) configuration information\n");
39002 +
39003 + printk("Intel MultiProcessor Specification v1.%d\n", mpf->mpf_specification);
39004 + if (mpf->mpf_feature2 & (1<<7)) {
39005 + printk(KERN_INFO " IMCR and PIC compatibility mode.\n");
39006 + pic_mode = 1;
39007 + } else {
39008 + printk(KERN_INFO " Virtual Wire compatibility mode.\n");
39009 + pic_mode = 0;
39010 + }
39011 +
39012 + /*
39013 + * Now see if we need to read further.
39014 + */
39015 + if (mpf->mpf_feature1 != 0) {
39016 +
39017 + printk(KERN_INFO "Default MP configuration #%d\n", mpf->mpf_feature1);
39018 + construct_default_ISA_mptable(mpf->mpf_feature1);
39019 +
39020 + } else if (mpf->mpf_physptr) {
39021 +
39022 + /*
39023 + * Read the physical hardware table. Anything here will
39024 + * override the defaults.
39025 + */
39026 + if (!smp_read_mpc(isa_bus_to_virt(mpf->mpf_physptr))) {
39027 + smp_found_config = 0;
39028 + printk(KERN_ERR "BIOS bug, MP table errors detected!...\n");
39029 + printk(KERN_ERR "... disabling SMP support. (tell your hw vendor)\n");
39030 + return;
39031 + }
39032 + /*
39033 + * If there are no explicit MP IRQ entries, then we are
39034 + * broken. We set up most of the low 16 IO-APIC pins to
39035 + * ISA defaults and hope it will work.
39036 + */
39037 + if (!mp_irq_entries) {
39038 + struct mpc_config_bus bus;
39039 +
39040 + printk(KERN_ERR "BIOS bug, no explicit IRQ entries, using default mptable. (tell your hw vendor)\n");
39041 +
39042 + bus.mpc_type = MP_BUS;
39043 + bus.mpc_busid = 0;
39044 + memcpy(bus.mpc_bustype, "ISA ", 6);
39045 + MP_bus_info(&bus);
39046 +
39047 + construct_default_ioirq_mptable(0);
39048 + }
39049 +
39050 + } else
39051 + BUG();
39052 +
39053 + printk(KERN_INFO "Processors: %d\n", num_processors);
39054 + /*
39055 + * Only use the first configuration found.
39056 + */
39057 +}
39058 +
39059 +static int __init smp_scan_config (unsigned long base, unsigned long length)
39060 +{
39061 + extern void __bad_mpf_size(void);
39062 + unsigned int *bp = isa_bus_to_virt(base);
39063 + struct intel_mp_floating *mpf;
39064 +
39065 + Dprintk("Scan SMP from %p for %ld bytes.\n", bp,length);
39066 + if (sizeof(*mpf) != 16)
39067 + __bad_mpf_size();
39068 +
39069 + while (length > 0) {
39070 + mpf = (struct intel_mp_floating *)bp;
39071 + if ((*bp == SMP_MAGIC_IDENT) &&
39072 + (mpf->mpf_length == 1) &&
39073 + !mpf_checksum((unsigned char *)bp, 16) &&
39074 + ((mpf->mpf_specification == 1)
39075 + || (mpf->mpf_specification == 4)) ) {
39076 +
39077 + smp_found_config = 1;
39078 + mpf_found = mpf;
39079 + return 1;
39080 + }
39081 + bp += 4;
39082 + length -= 16;
39083 + }
39084 + return 0;
39085 +}
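A self-contained sketch of the same search run against an in-memory buffer: look at every 16-byte boundary for the "_MP_" signature, a length field of 1 (in 16-byte units) and a byte checksum that sums to zero. The structure offsets and the checksum rule follow the MP specification and are assumptions of this sketch, not the kernel's mpf_checksum():

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* MP-spec style byte checksum: all bytes of the structure sum to 0 (mod 256). */
static int checksum_ok(const uint8_t *p, int len)
{
	uint8_t sum = 0;

	while (len--)
		sum += *p++;
	return sum == 0;
}

/* Look at every 16-byte boundary for a plausible floating pointer structure. */
static long scan_for_mpf(const uint8_t *buf, long len)
{
	for (long off = 0; off + 16 <= len; off += 16) {
		const uint8_t *p = buf + off;

		if (memcmp(p, "_MP_", 4) == 0 && p[8] == 1 && checksum_ok(p, 16))
			return off;
	}
	return -1;
}

int main(void)
{
	uint8_t buf[64] = {0};
	uint8_t sum = 0;

	memcpy(buf + 32, "_MP_", 4);	/* signature at offset 32 of the toy buffer */
	buf[32 + 8] = 1;		/* length: one 16-byte paragraph */
	for (int i = 0; i < 16; i++)
		if (i != 10)
			sum += buf[32 + i];
	buf[32 + 10] = (uint8_t)(0u - sum);	/* checksum byte (offset 10 per the spec) */

	printf("found at offset %ld\n", scan_for_mpf(buf, (long)sizeof(buf)));
	return 0;
}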
39086 +
39087 +void __init find_intel_smp (void)
39088 +{
39089 + unsigned int address;
39090 +
39091 + /*
39092 + * FIXME: Linux assumes you have 640K of base ram..
39093 + * this continues the error...
39094 + *
39095 + * 1) Scan the bottom 1K for a signature
39096 + * 2) Scan the top 1K of base RAM
39097 + * 3) Scan the 64K of bios
39098 + */
39099 + if (smp_scan_config(0x0,0x400) ||
39100 + smp_scan_config(639*0x400,0x400) ||
39101 + smp_scan_config(0xF0000,0x10000))
39102 + return;
39103 + /*
39104 + * If it is an SMP machine we should know now, unless the
39105 + * configuration is in an EISA/MCA bus machine with an
39106 + * extended bios data area.
39107 + *
39108 + * There is a real-mode segmented pointer pointing to the
39109 + * 4K EBDA area at 0x40E, calculate and scan it here.
39110 + *
39111 + * NOTE! There are Linux loaders that will corrupt the EBDA
39112 + * area, and as such this kind of SMP config may be less
39113 + * trustworthy, simply because the SMP table may have been
39114 + * stomped on during early boot. These loaders are buggy and
39115 + * should be fixed.
39116 + */
39117 +
39118 + address = *(unsigned short *)phys_to_virt(0x40E);
39119 + address <<= 4;
39120 + if (smp_scan_config(address, 0x1000))
39121 + return;
39122 +
39123 + /* If we have come this far, we did not find an MP table */
39124 + printk(KERN_INFO "No mptable found.\n");
39125 +}
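The EBDA probe above turns the real-mode segment stored at physical address 0x40E into a linear address by shifting it left four bits. A tiny illustration with a made-up segment value:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* 0x40E in the BIOS data area holds the EBDA real-mode segment. */
	uint16_t ebda_segment = 0x9fc0;			/* made-up example value */
	uint32_t ebda_phys = (uint32_t)ebda_segment << 4;

	printf("EBDA segment 0x%04x -> physical 0x%05x\n", ebda_segment, ebda_phys);
	return 0;
}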
39126 +
39127 +/*
39128 + * - Intel MP Configuration Table
39129 + */
39130 +void __init find_smp_config (void)
39131 +{
39132 +#ifdef CONFIG_X86_LOCAL_APIC
39133 + find_intel_smp();
39134 +#endif
39135 +}
39136 +
39137 +
39138 +/* --------------------------------------------------------------------------
39139 + ACPI-based MP Configuration
39140 + -------------------------------------------------------------------------- */
39141 +
39142 +#ifdef CONFIG_ACPI
39143 +
39144 +void __init mp_register_lapic_address (
39145 + u64 address)
39146 +{
39147 +#ifndef CONFIG_XEN
39148 + mp_lapic_addr = (unsigned long) address;
39149 +
39150 + set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr);
39151 +
39152 + if (boot_cpu_id == -1U)
39153 + boot_cpu_id = GET_APIC_ID(apic_read(APIC_ID));
39154 +
39155 + Dprintk("Boot CPU = %d\n", boot_cpu_physical_apicid);
39156 +#endif
39157 +}
39158 +
39159 +
39160 +void __init mp_register_lapic (
39161 + u8 id,
39162 + u8 enabled)
39163 +{
39164 + struct mpc_config_processor processor;
39165 + int boot_cpu = 0;
39166 +
39167 + if (id >= MAX_APICS) {
39168 + printk(KERN_WARNING "Processor #%d invalid (max %d)\n",
39169 + id, MAX_APICS);
39170 + return;
39171 + }
39172 +
39173 + if (id == boot_cpu_physical_apicid)
39174 + boot_cpu = 1;
39175 +
39176 +#ifndef CONFIG_XEN
39177 + processor.mpc_type = MP_PROCESSOR;
39178 + processor.mpc_apicid = id;
39179 + processor.mpc_apicver = GET_APIC_VERSION(apic_read(APIC_LVR));
39180 + processor.mpc_cpuflag = (enabled ? CPU_ENABLED : 0);
39181 + processor.mpc_cpuflag |= (boot_cpu ? CPU_BOOTPROCESSOR : 0);
39182 + processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
39183 + (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask;
39184 + processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
39185 + processor.mpc_reserved[0] = 0;
39186 + processor.mpc_reserved[1] = 0;
39187 +#endif
39188 +
39189 + MP_processor_info(&processor);
39190 +}
39191 +
39192 +#ifdef CONFIG_X86_IO_APIC
39193 +
39194 +#define MP_ISA_BUS 0
39195 +#define MP_MAX_IOAPIC_PIN 127
39196 +
39197 +static struct mp_ioapic_routing {
39198 + int apic_id;
39199 + int gsi_start;
39200 + int gsi_end;
39201 + u32 pin_programmed[4];
39202 +} mp_ioapic_routing[MAX_IO_APICS];
39203 +
39204 +
39205 +static int mp_find_ioapic (
39206 + int gsi)
39207 +{
39208 + int i = 0;
39209 +
39210 + /* Find the IOAPIC that manages this GSI. */
39211 + for (i = 0; i < nr_ioapics; i++) {
39212 + if ((gsi >= mp_ioapic_routing[i].gsi_start)
39213 + && (gsi <= mp_ioapic_routing[i].gsi_end))
39214 + return i;
39215 + }
39216 +
39217 + printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi);
39218 +
39219 + return -1;
39220 +}
39221 +
39222 +
39223 +void __init mp_register_ioapic (
39224 + u8 id,
39225 + u32 address,
39226 + u32 gsi_base)
39227 +{
39228 + int idx = 0;
39229 +
39230 + if (nr_ioapics >= MAX_IO_APICS) {
39231 + printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded "
39232 + "(found %d)\n", MAX_IO_APICS, nr_ioapics);
39233 + panic("Recompile kernel with bigger MAX_IO_APICS!\n");
39234 + }
39235 + if (!address) {
39236 + printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address"
39237 + " found in MADT table, skipping!\n");
39238 + return;
39239 + }
39240 +
39241 + idx = nr_ioapics++;
39242 +
39243 + mp_ioapics[idx].mpc_type = MP_IOAPIC;
39244 + mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE;
39245 + mp_ioapics[idx].mpc_apicaddr = address;
39246 +
39247 +#ifndef CONFIG_XEN
39248 + set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
39249 +#endif
39250 + mp_ioapics[idx].mpc_apicid = id;
39251 + mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx);
39252 +
39253 + /*
39254 + * Build basic IRQ lookup table to facilitate gsi->io_apic lookups
39255 + * and to prevent reprogramming of IOAPIC pins (PCI IRQs).
39256 + */
39257 + mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid;
39258 + mp_ioapic_routing[idx].gsi_start = gsi_base;
39259 + mp_ioapic_routing[idx].gsi_end = gsi_base +
39260 + io_apic_get_redir_entries(idx);
39261 +
39262 + printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, "
39263 + "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid,
39264 + mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr,
39265 + mp_ioapic_routing[idx].gsi_start,
39266 + mp_ioapic_routing[idx].gsi_end);
39267 +
39268 + return;
39269 +}
39270 +
39271 +
39272 +void __init mp_override_legacy_irq (
39273 + u8 bus_irq,
39274 + u8 polarity,
39275 + u8 trigger,
39276 + u32 gsi)
39277 +{
39278 + struct mpc_config_intsrc intsrc;
39279 + int ioapic = -1;
39280 + int pin = -1;
39281 +
39282 + /*
39283 + * Convert 'gsi' to 'ioapic.pin'.
39284 + */
39285 + ioapic = mp_find_ioapic(gsi);
39286 + if (ioapic < 0)
39287 + return;
39288 + pin = gsi - mp_ioapic_routing[ioapic].gsi_start;
39289 +
39290 + /*
39291 + * TBD: This check is for faulty timer entries, where the override
39292 + * erroneously sets the trigger to level, resulting in a HUGE
39293 + * increase of timer interrupts!
39294 + */
39295 + if ((bus_irq == 0) && (trigger == 3))
39296 + trigger = 1;
39297 +
39298 + intsrc.mpc_type = MP_INTSRC;
39299 + intsrc.mpc_irqtype = mp_INT;
39300 + intsrc.mpc_irqflag = (trigger << 2) | polarity;
39301 + intsrc.mpc_srcbus = MP_ISA_BUS;
39302 + intsrc.mpc_srcbusirq = bus_irq; /* IRQ */
39303 + intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; /* APIC ID */
39304 + intsrc.mpc_dstirq = pin; /* INTIN# */
39305 +
39306 + Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, %d-%d\n",
39307 + intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3,
39308 + (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus,
39309 + intsrc.mpc_srcbusirq, intsrc.mpc_dstapic, intsrc.mpc_dstirq);
39310 +
39311 + mp_irqs[mp_irq_entries] = intsrc;
39312 + if (++mp_irq_entries == MAX_IRQ_SOURCES)
39313 + panic("Max # of irq sources exceeded!\n");
39314 +
39315 + return;
39316 +}
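mpc_irqflag packs two 2-bit fields, polarity in bits 0-1 and trigger mode in bits 2-3, which is why the code above builds it as (trigger << 2) | polarity and the debug output unpacks it with & 3 and >> 2. A small sketch of that round trip (the sample values 3 = level and 1 = active high follow the MP-spec encoding):

#include <stdio.h>

static unsigned int mp_irqflag(unsigned int trigger, unsigned int polarity)
{
	return ((trigger & 3) << 2) | (polarity & 3);
}

int main(void)
{
	unsigned int flag = mp_irqflag(3, 1);	/* 3 = level triggered, 1 = active high */

	printf("irqflag=0x%x polarity=%u trigger=%u\n",
	       flag, flag & 3, (flag >> 2) & 3);
	return 0;
}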
39317 +
39318 +
39319 +void __init mp_config_acpi_legacy_irqs (void)
39320 +{
39321 + struct mpc_config_intsrc intsrc;
39322 + int i = 0;
39323 + int ioapic = -1;
39324 +
39325 + /*
39326 + * Fabricate the legacy ISA bus (bus #31).
39327 + */
39328 + mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA;
39329 + Dprintk("Bus #%d is ISA\n", MP_ISA_BUS);
39330 +
39331 + /*
39332 + * Locate the IOAPIC that manages the ISA IRQs (0-15).
39333 + */
39334 + ioapic = mp_find_ioapic(0);
39335 + if (ioapic < 0)
39336 + return;
39337 +
39338 + intsrc.mpc_type = MP_INTSRC;
39339 + intsrc.mpc_irqflag = 0; /* Conforming */
39340 + intsrc.mpc_srcbus = MP_ISA_BUS;
39341 + intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid;
39342 +
39343 + /*
39344 + * Use the default configuration for IRQs 0-15 unless overridden
39345 + * by (MADT) interrupt source override entries.
39346 + */
39347 + for (i = 0; i < 16; i++) {
39348 + int idx;
39349 +
39350 + for (idx = 0; idx < mp_irq_entries; idx++) {
39351 + struct mpc_config_intsrc *irq = mp_irqs + idx;
39352 +
39353 + /* Do we already have a mapping for this ISA IRQ? */
39354 + if (irq->mpc_srcbus == MP_ISA_BUS && irq->mpc_srcbusirq == i)
39355 + break;
39356 +
39357 + /* Do we already have a mapping for this IOAPIC pin? */
39358 + if ((irq->mpc_dstapic == intsrc.mpc_dstapic) &&
39359 + (irq->mpc_dstirq == i))
39360 + break;
39361 + }
39362 +
39363 + if (idx != mp_irq_entries) {
39364 + printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i);
39365 + continue; /* IRQ already used */
39366 + }
39367 +
39368 + intsrc.mpc_irqtype = mp_INT;
39369 + intsrc.mpc_srcbusirq = i; /* Identity mapped */
39370 + intsrc.mpc_dstirq = i;
39371 +
39372 + Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, "
39373 + "%d-%d\n", intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3,
39374 + (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus,
39375 + intsrc.mpc_srcbusirq, intsrc.mpc_dstapic,
39376 + intsrc.mpc_dstirq);
39377 +
39378 + mp_irqs[mp_irq_entries] = intsrc;
39379 + if (++mp_irq_entries == MAX_IRQ_SOURCES)
39380 + panic("Max # of irq sources exceeded!\n");
39381 + }
39382 +
39383 + return;
39384 +}
39385 +
39386 +#define MAX_GSI_NUM 4096
39387 +
39388 +int mp_register_gsi(u32 gsi, int triggering, int polarity)
39389 +{
39390 + int ioapic = -1;
39391 + int ioapic_pin = 0;
39392 + int idx, bit = 0;
39393 + static int pci_irq = 16;
39394 + /*
39395 + * Mapping between Global System Interrupts, which
39396 + * represent all possible interrupts, to the IRQs
39397 + * assigned to actual devices.
39398 + */
39399 + static int gsi_to_irq[MAX_GSI_NUM];
39400 +
39401 + if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC)
39402 + return gsi;
39403 +
39404 + /* Don't set up the ACPI SCI because it's already set up */
39405 + if (acpi_fadt.sci_int == gsi)
39406 + return gsi;
39407 +
39408 + ioapic = mp_find_ioapic(gsi);
39409 + if (ioapic < 0) {
39410 + printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi);
39411 + return gsi;
39412 + }
39413 +
39414 + ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_start;
39415 +
39416 + /*
39417 + * Avoid pin reprogramming. PRTs typically include entries
39418 + * with redundant pin->gsi mappings (but unique PCI devices);
39419 + * we only program the IOAPIC on the first.
39420 + */
39421 + bit = ioapic_pin % 32;
39422 + idx = (ioapic_pin < 32) ? 0 : (ioapic_pin / 32);
39423 + if (idx > 3) {
39424 + printk(KERN_ERR "Invalid reference to IOAPIC pin "
39425 + "%d-%d\n", mp_ioapic_routing[ioapic].apic_id,
39426 + ioapic_pin);
39427 + return gsi;
39428 + }
39429 + if ((1<<bit) & mp_ioapic_routing[ioapic].pin_programmed[idx]) {
39430 + Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n",
39431 + mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
39432 + return gsi_to_irq[gsi];
39433 + }
39434 +
39435 + mp_ioapic_routing[ioapic].pin_programmed[idx] |= (1<<bit);
39436 +
39437 + if (triggering == ACPI_LEVEL_SENSITIVE) {
39438 + /*
39439 + * For PCI devices assign IRQs in order, avoiding gaps
39440 + * due to unused I/O APIC pins.
39441 + */
39442 + int irq = gsi;
39443 + if (gsi < MAX_GSI_NUM) {
39444 + if (gsi > 15)
39445 + gsi = pci_irq++;
39446 + /*
39447 + * Don't assign IRQ used by ACPI SCI
39448 + */
39449 + if (gsi == acpi_fadt.sci_int)
39450 + gsi = pci_irq++;
39451 + gsi_to_irq[irq] = gsi;
39452 + } else {
39453 + printk(KERN_ERR "GSI %u is too high\n", gsi);
39454 + return gsi;
39455 + }
39456 + }
39457 +
39458 + io_apic_set_pci_routing(ioapic, ioapic_pin, gsi,
39459 + triggering == ACPI_EDGE_SENSITIVE ? 0 : 1,
39460 + polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
39461 + return gsi;
39462 +}
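The pin_programmed[] array above is simply a 128-bit bitmap (4 x 32 bits) that keeps a pin from being programmed twice. A stripped-down, user-space sketch of the test-and-set step, with hypothetical names:

#include <stdint.h>
#include <stdio.h>

#define MAX_PIN 127

static uint32_t pin_programmed[4];

/* Returns 1 if the pin had already been claimed, 0 if this call claimed it. */
static int test_and_set_pin(int pin)
{
	int idx, bit;

	if (pin < 0 || pin > MAX_PIN)
		return -1;
	idx = pin / 32;
	bit = pin % 32;
	if (pin_programmed[idx] & (1u << bit))
		return 1;
	pin_programmed[idx] |= 1u << bit;
	return 0;
}

int main(void)
{
	printf("first attempt on pin 42:  %d\n", test_and_set_pin(42));
	printf("second attempt on pin 42: %d\n", test_and_set_pin(42));
	return 0;
}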
39463 +
39464 +#endif /*CONFIG_X86_IO_APIC*/
39465 +#endif /*CONFIG_ACPI*/
39466 diff -Nur linux-2.6.16.33-noxen/arch/x86_64/kernel/pci-swiotlb-xen.c linux-2.6.16.33/arch/x86_64/kernel/pci-swiotlb-xen.c
39467 --- linux-2.6.16.33-noxen/arch/x86_64/kernel/pci-swiotlb-xen.c 1970-01-01 00:00:00.000000000 +0000
39468 +++ linux-2.6.16.33/arch/x86_64/kernel/pci-swiotlb-xen.c 2007-01-08 15:00:45.000000000 +0000
39469 @@ -0,0 +1,54 @@
39470 +/* Glue code to lib/swiotlb.c */
39471 +
39472 +#include <linux/pci.h>
39473 +#include <linux/cache.h>
39474 +#include <linux/module.h>
39475 +#include <asm/dma-mapping.h>
39476 +#include <asm/proto.h>
39477 +#include <asm/swiotlb.h>
39478 +#include <asm/dma.h>
39479 +
39480 +#if 0
39481 +int swiotlb __read_mostly;
39482 +EXPORT_SYMBOL(swiotlb);
39483 +#endif
39484 +
39485 +struct dma_mapping_ops swiotlb_dma_ops = {
39486 +#if 0
39487 + .mapping_error = swiotlb_dma_mapping_error,
39488 + .alloc_coherent = swiotlb_alloc_coherent,
39489 + .free_coherent = swiotlb_free_coherent,
39490 + .map_single = swiotlb_map_single,
39491 + .unmap_single = swiotlb_unmap_single,
39492 + .sync_single_for_cpu = swiotlb_sync_single_for_cpu,
39493 + .sync_single_for_device = swiotlb_sync_single_for_device,
39494 + .sync_single_range_for_cpu = swiotlb_sync_single_range_for_cpu,
39495 + .sync_single_range_for_device = swiotlb_sync_single_range_for_device,
39496 + .sync_sg_for_cpu = swiotlb_sync_sg_for_cpu,
39497 + .sync_sg_for_device = swiotlb_sync_sg_for_device,
39498 + .map_sg = swiotlb_map_sg,
39499 + .unmap_sg = swiotlb_unmap_sg,
39500 + .dma_supported = NULL,
39501 +#endif
39502 +};
39503 +
39504 +void pci_swiotlb_init(void)
39505 +{
39506 +#if 0
39507 + /* don't initialize swiotlb if iommu=off (no_iommu=1) */
39508 + if (!iommu_aperture && !no_iommu &&
39509 + (end_pfn > MAX_DMA32_PFN || force_iommu))
39510 + swiotlb = 1;
39511 + if (swiotlb) {
39512 + printk(KERN_INFO "PCI-DMA: Using software bounce buffering for IO (SWIOTLB)\n");
39513 + swiotlb_init();
39514 + dma_ops = &swiotlb_dma_ops;
39515 + }
39516 +#else
39517 + swiotlb_init();
39518 + if (swiotlb) {
39519 + printk(KERN_INFO "PCI-DMA: Using software bounce buffering for IO (SWIOTLB)\n");
39520 + dma_ops = &swiotlb_dma_ops;
39521 + }
39522 +#endif
39523 +}
39524 diff -Nur linux-2.6.16.33-noxen/arch/x86_64/kernel/process-xen.c linux-2.6.16.33/arch/x86_64/kernel/process-xen.c
39525 --- linux-2.6.16.33-noxen/arch/x86_64/kernel/process-xen.c 1970-01-01 00:00:00.000000000 +0000
39526 +++ linux-2.6.16.33/arch/x86_64/kernel/process-xen.c 2007-01-08 15:00:45.000000000 +0000
39527 @@ -0,0 +1,802 @@
39528 +/*
39529 + * linux/arch/x86-64/kernel/process.c
39530 + *
39531 + * Copyright (C) 1995 Linus Torvalds
39532 + *
39533 + * Pentium III FXSR, SSE support
39534 + * Gareth Hughes <gareth@valinux.com>, May 2000
39535 + *
39536 + * X86-64 port
39537 + * Andi Kleen.
39538 + *
39539 + * CPU hotplug support - ashok.raj@intel.com
39540 + * $Id: process.c,v 1.38 2002/01/15 10:08:03 ak Exp $
39541 + *
39542 + * Jun Nakajima <jun.nakajima@intel.com>
39543 + * Modified for Xen
39544 + */
39545 +
39546 +/*
39547 + * This file handles the architecture-dependent parts of process handling..
39548 + */
39549 +
39550 +#include <stdarg.h>
39551 +
39552 +#include <linux/cpu.h>
39553 +#include <linux/errno.h>
39554 +#include <linux/sched.h>
39555 +#include <linux/kernel.h>
39556 +#include <linux/mm.h>
39557 +#include <linux/elfcore.h>
39558 +#include <linux/smp.h>
39559 +#include <linux/slab.h>
39560 +#include <linux/user.h>
39561 +#include <linux/module.h>
39562 +#include <linux/a.out.h>
39563 +#include <linux/interrupt.h>
39564 +#include <linux/delay.h>
39565 +#include <linux/ptrace.h>
39566 +#include <linux/utsname.h>
39567 +#include <linux/random.h>
39568 +#include <linux/kprobes.h>
39569 +#include <linux/notifier.h>
39570 +
39571 +#include <asm/uaccess.h>
39572 +#include <asm/pgtable.h>
39573 +#include <asm/system.h>
39574 +#include <asm/io.h>
39575 +#include <asm/processor.h>
39576 +#include <asm/i387.h>
39577 +#include <asm/mmu_context.h>
39578 +#include <asm/pda.h>
39579 +#include <asm/prctl.h>
39580 +#include <asm/kdebug.h>
39581 +#include <xen/interface/dom0_ops.h>
39582 +#include <xen/interface/physdev.h>
39583 +#include <xen/interface/vcpu.h>
39584 +#include <asm/desc.h>
39585 +#include <asm/proto.h>
39586 +#include <asm/hardirq.h>
39587 +#include <asm/ia32.h>
39588 +#include <asm/idle.h>
39589 +
39590 +#include <xen/cpu_hotplug.h>
39591 +
39592 +asmlinkage extern void ret_from_fork(void);
39593 +
39594 +unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;
39595 +
39596 +unsigned long boot_option_idle_override = 0;
39597 +EXPORT_SYMBOL(boot_option_idle_override);
39598 +
39599 +/*
39600 + * Power management idle function, if any.
39601 + */
39602 +void (*pm_idle)(void);
39603 +static DEFINE_PER_CPU(unsigned int, cpu_idle_state);
39604 +
39605 +static struct notifier_block *idle_notifier;
39606 +static DEFINE_SPINLOCK(idle_notifier_lock);
39607 +
39608 +void idle_notifier_register(struct notifier_block *n)
39609 +{
39610 + unsigned long flags;
39611 + spin_lock_irqsave(&idle_notifier_lock, flags);
39612 + notifier_chain_register(&idle_notifier, n);
39613 + spin_unlock_irqrestore(&idle_notifier_lock, flags);
39614 +}
39615 +EXPORT_SYMBOL_GPL(idle_notifier_register);
39616 +
39617 +void idle_notifier_unregister(struct notifier_block *n)
39618 +{
39619 + unsigned long flags;
39620 + spin_lock_irqsave(&idle_notifier_lock, flags);
39621 + notifier_chain_unregister(&idle_notifier, n);
39622 + spin_unlock_irqrestore(&idle_notifier_lock, flags);
39623 +}
39624 +EXPORT_SYMBOL(idle_notifier_unregister);
39625 +
39626 +enum idle_state { CPU_IDLE, CPU_NOT_IDLE };
39627 +static DEFINE_PER_CPU(enum idle_state, idle_state) = CPU_NOT_IDLE;
39628 +
39629 +void enter_idle(void)
39630 +{
39631 + __get_cpu_var(idle_state) = CPU_IDLE;
39632 + notifier_call_chain(&idle_notifier, IDLE_START, NULL);
39633 +}
39634 +
39635 +static void __exit_idle(void)
39636 +{
39637 + __get_cpu_var(idle_state) = CPU_NOT_IDLE;
39638 + notifier_call_chain(&idle_notifier, IDLE_END, NULL);
39639 +}
39640 +
39641 +/* Called from interrupts to signify idle end */
39642 +void exit_idle(void)
39643 +{
39644 + if (current->pid | read_pda(irqcount))
39645 + return;
39646 + __exit_idle();
39647 +}
39648 +
39649 +/* XXX XEN doesn't use default_idle(), poll_idle(). Use xen_idle() instead. */
39650 +void xen_idle(void)
39651 +{
39652 + local_irq_disable();
39653 +
39654 + if (need_resched())
39655 + local_irq_enable();
39656 + else {
39657 + clear_thread_flag(TIF_POLLING_NRFLAG);
39658 + smp_mb__after_clear_bit();
39659 + safe_halt();
39660 + set_thread_flag(TIF_POLLING_NRFLAG);
39661 + }
39662 +}
39663 +
39664 +#ifdef CONFIG_HOTPLUG_CPU
39665 +static inline void play_dead(void)
39666 +{
39667 + idle_task_exit();
39668 + local_irq_disable();
39669 + cpu_clear(smp_processor_id(), cpu_initialized);
39670 + preempt_enable_no_resched();
39671 + HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL);
39672 + cpu_bringup();
39673 +}
39674 +#else
39675 +static inline void play_dead(void)
39676 +{
39677 + BUG();
39678 +}
39679 +#endif /* CONFIG_HOTPLUG_CPU */
39680 +
39681 +/*
39682 + * The idle thread. There's no useful work to be
39683 + * done, so just try to conserve power and have a
39684 + * low exit latency (ie sit in a loop waiting for
39685 + * somebody to say that they'd like to reschedule)
39686 + */
39687 +void cpu_idle (void)
39688 +{
39689 + set_thread_flag(TIF_POLLING_NRFLAG);
39690 +
39691 + /* endless idle loop with no priority at all */
39692 + while (1) {
39693 + while (!need_resched()) {
39694 + if (__get_cpu_var(cpu_idle_state))
39695 + __get_cpu_var(cpu_idle_state) = 0;
39696 + rmb();
39697 +
39698 + if (cpu_is_offline(smp_processor_id()))
39699 + play_dead();
39700 + enter_idle();
39701 + xen_idle();
39702 + __exit_idle();
39703 + }
39704 +
39705 + preempt_enable_no_resched();
39706 + schedule();
39707 + preempt_disable();
39708 + }
39709 +}
39710 +
39711 +void cpu_idle_wait(void)
39712 +{
39713 + unsigned int cpu, this_cpu = get_cpu();
39714 + cpumask_t map;
39715 +
39716 + set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
39717 + put_cpu();
39718 +
39719 + cpus_clear(map);
39720 + for_each_online_cpu(cpu) {
39721 + per_cpu(cpu_idle_state, cpu) = 1;
39722 + cpu_set(cpu, map);
39723 + }
39724 +
39725 + __get_cpu_var(cpu_idle_state) = 0;
39726 +
39727 + wmb();
39728 + do {
39729 + ssleep(1);
39730 + for_each_online_cpu(cpu) {
39731 + if (cpu_isset(cpu, map) &&
39732 + !per_cpu(cpu_idle_state, cpu))
39733 + cpu_clear(cpu, map);
39734 + }
39735 + cpus_and(map, map, cpu_online_map);
39736 + } while (!cpus_empty(map));
39737 +}
39738 +EXPORT_SYMBOL_GPL(cpu_idle_wait);
39739 +
39740 +/* XXX XEN doesn't use mwait_idle(), select_idle_routine(), idle_setup(). */
39741 +/* Always use xen_idle() instead. */
39742 +void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) {}
39743 +
39744 +/* Prints also some state that isn't saved in the pt_regs */
39745 +void __show_regs(struct pt_regs * regs)
39746 +{
39747 + unsigned long fs, gs, shadowgs;
39748 + unsigned int fsindex,gsindex;
39749 + unsigned int ds,cs,es;
39750 +
39751 + printk("\n");
39752 + print_modules();
39753 + printk("Pid: %d, comm: %.20s %s %s %.*s\n",
39754 + current->pid, current->comm, print_tainted(),
39755 + system_utsname.release,
39756 + (int)strcspn(system_utsname.version, " "),
39757 + system_utsname.version);
39758 + printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip);
39759 + printk_address(regs->rip);
39760 + printk("\nRSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->rsp,
39761 + regs->eflags);
39762 + printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
39763 + regs->rax, regs->rbx, regs->rcx);
39764 + printk("RDX: %016lx RSI: %016lx RDI: %016lx\n",
39765 + regs->rdx, regs->rsi, regs->rdi);
39766 + printk("RBP: %016lx R08: %016lx R09: %016lx\n",
39767 + regs->rbp, regs->r8, regs->r9);
39768 + printk("R10: %016lx R11: %016lx R12: %016lx\n",
39769 + regs->r10, regs->r11, regs->r12);
39770 + printk("R13: %016lx R14: %016lx R15: %016lx\n",
39771 + regs->r13, regs->r14, regs->r15);
39772 +
39773 + asm("mov %%ds,%0" : "=r" (ds));
39774 + asm("mov %%cs,%0" : "=r" (cs));
39775 + asm("mov %%es,%0" : "=r" (es));
39776 + asm("mov %%fs,%0" : "=r" (fsindex));
39777 + asm("mov %%gs,%0" : "=r" (gsindex));
39778 +
39779 + rdmsrl(MSR_FS_BASE, fs);
39780 + rdmsrl(MSR_GS_BASE, gs);
39781 + rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
39782 +
39783 + printk("FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
39784 + fs,fsindex,gs,gsindex,shadowgs);
39785 + printk("CS: %04x DS: %04x ES: %04x\n", cs, ds, es);
39786 +
39787 +}
39788 +
39789 +void show_regs(struct pt_regs *regs)
39790 +{
39791 + printk("CPU %d:", smp_processor_id());
39792 + __show_regs(regs);
39793 + show_trace(&regs->rsp);
39794 +}
39795 +
39796 +/*
39797 + * Free current thread data structures etc..
39798 + */
39799 +void exit_thread(void)
39800 +{
39801 + struct task_struct *me = current;
39802 + struct thread_struct *t = &me->thread;
39803 +
39804 + /*
39805 + * Remove function-return probe instances associated with this task
39806 + * and put them back on the free list. Do not insert an exit probe for
39807 + * this function, it will be disabled by kprobe_flush_task if you do.
39808 + */
39809 + kprobe_flush_task(me);
39810 +
39811 + if (me->thread.io_bitmap_ptr) {
39812 +#ifndef CONFIG_X86_NO_TSS
39813 + struct tss_struct *tss = &per_cpu(init_tss, get_cpu());
39814 +#endif
39815 +#ifdef CONFIG_XEN
39816 + struct physdev_set_iobitmap iobmp_op = { 0 };
39817 +#endif
39818 +
39819 + kfree(t->io_bitmap_ptr);
39820 + t->io_bitmap_ptr = NULL;
39821 + /*
39822 + * Careful, clear this in the TSS too:
39823 + */
39824 +#ifndef CONFIG_X86_NO_TSS
39825 + memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
39826 + put_cpu();
39827 +#endif
39828 +#ifdef CONFIG_XEN
39829 + HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap, &iobmp_op);
39830 +#endif
39831 + t->io_bitmap_max = 0;
39832 + }
39833 +}
39834 +
39835 +void load_gs_index(unsigned gs)
39836 +{
39837 + HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, gs);
39838 +}
39839 +
39840 +void flush_thread(void)
39841 +{
39842 + struct task_struct *tsk = current;
39843 + struct thread_info *t = current_thread_info();
39844 +
39845 + if (t->flags & _TIF_ABI_PENDING)
39846 + t->flags ^= (_TIF_ABI_PENDING | _TIF_IA32);
39847 +
39848 + tsk->thread.debugreg0 = 0;
39849 + tsk->thread.debugreg1 = 0;
39850 + tsk->thread.debugreg2 = 0;
39851 + tsk->thread.debugreg3 = 0;
39852 + tsk->thread.debugreg6 = 0;
39853 + tsk->thread.debugreg7 = 0;
39854 + memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
39855 + /*
39856 + * Forget coprocessor state..
39857 + */
39858 + clear_fpu(tsk);
39859 + clear_used_math();
39860 +}
39861 +
39862 +void release_thread(struct task_struct *dead_task)
39863 +{
39864 + if (dead_task->mm) {
39865 + if (dead_task->mm->context.size) {
39866 + printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
39867 + dead_task->comm,
39868 + dead_task->mm->context.ldt,
39869 + dead_task->mm->context.size);
39870 + BUG();
39871 + }
39872 + }
39873 +}
39874 +
39875 +static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
39876 +{
39877 + struct user_desc ud = {
39878 + .base_addr = addr,
39879 + .limit = 0xfffff,
39880 + .seg_32bit = 1,
39881 + .limit_in_pages = 1,
39882 + .useable = 1,
39883 + };
39884 + struct n_desc_struct *desc = (void *)t->thread.tls_array;
39885 + desc += tls;
39886 + desc->a = LDT_entry_a(&ud);
39887 + desc->b = LDT_entry_b(&ud);
39888 +}
39889 +
39890 +static inline u32 read_32bit_tls(struct task_struct *t, int tls)
39891 +{
39892 + struct desc_struct *desc = (void *)t->thread.tls_array;
39893 + desc += tls;
39894 + return desc->base0 |
39895 + (((u32)desc->base1) << 16) |
39896 + (((u32)desc->base2) << 24);
39897 +}
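read_32bit_tls() reassembles a 32-bit segment base that the descriptor format scatters across three fields (bits 0-15, 16-23 and 24-31). A stand-alone sketch of that split and reassembly; the struct and field names here are hypothetical, only the bit layout matches the x86 descriptor format:

#include <stdint.h>
#include <stdio.h>

struct tls_desc {
	uint16_t base0;		/* base bits  0..15 */
	uint8_t  base1;		/* base bits 16..23 */
	uint8_t  base2;		/* base bits 24..31 */
};

static struct tls_desc split_base(uint32_t base)
{
	struct tls_desc d = {
		.base0 = base & 0xffff,
		.base1 = (base >> 16) & 0xff,
		.base2 = (base >> 24) & 0xff,
	};
	return d;
}

static uint32_t join_base(struct tls_desc d)
{
	return d.base0 | ((uint32_t)d.base1 << 16) | ((uint32_t)d.base2 << 24);
}

int main(void)
{
	uint32_t base = 0x12345678;

	printf("round trip: 0x%x -> 0x%x\n", base, join_base(split_base(base)));
	return 0;
}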
39898 +
39899 +/*
39900 + * This gets called before we allocate a new thread and copy
39901 + * the current task into it.
39902 + */
39903 +void prepare_to_copy(struct task_struct *tsk)
39904 +{
39905 + unlazy_fpu(tsk);
39906 +}
39907 +
39908 +int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp,
39909 + unsigned long unused,
39910 + struct task_struct * p, struct pt_regs * regs)
39911 +{
39912 + int err;
39913 + struct pt_regs * childregs;
39914 + struct task_struct *me = current;
39915 +
39916 + childregs = ((struct pt_regs *)
39917 + (THREAD_SIZE + task_stack_page(p))) - 1;
39918 + *childregs = *regs;
39919 +
39920 + childregs->rax = 0;
39921 + childregs->rsp = rsp;
39922 + if (rsp == ~0UL)
39923 + childregs->rsp = (unsigned long)childregs;
39924 +
39925 + p->thread.rsp = (unsigned long) childregs;
39926 + p->thread.rsp0 = (unsigned long) (childregs+1);
39927 + p->thread.userrsp = me->thread.userrsp;
39928 +
39929 + set_tsk_thread_flag(p, TIF_FORK);
39930 +
39931 + p->thread.fs = me->thread.fs;
39932 + p->thread.gs = me->thread.gs;
39933 +
39934 + asm("mov %%gs,%0" : "=m" (p->thread.gsindex));
39935 + asm("mov %%fs,%0" : "=m" (p->thread.fsindex));
39936 + asm("mov %%es,%0" : "=m" (p->thread.es));
39937 + asm("mov %%ds,%0" : "=m" (p->thread.ds));
39938 +
39939 + if (unlikely(me->thread.io_bitmap_ptr != NULL)) {
39940 + p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
39941 + if (!p->thread.io_bitmap_ptr) {
39942 + p->thread.io_bitmap_max = 0;
39943 + return -ENOMEM;
39944 + }
39945 + memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
39946 + IO_BITMAP_BYTES);
39947 + }
39948 +
39949 + /*
39950 + * Set a new TLS for the child thread?
39951 + */
39952 + if (clone_flags & CLONE_SETTLS) {
39953 +#ifdef CONFIG_IA32_EMULATION
39954 + if (test_thread_flag(TIF_IA32))
39955 + err = ia32_child_tls(p, childregs);
39956 + else
39957 +#endif
39958 + err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
39959 + if (err)
39960 + goto out;
39961 + }
39962 + p->thread.iopl = current->thread.iopl;
39963 +
39964 + err = 0;
39965 +out:
39966 + if (err && p->thread.io_bitmap_ptr) {
39967 + kfree(p->thread.io_bitmap_ptr);
39968 + p->thread.io_bitmap_max = 0;
39969 + }
39970 + return err;
39971 +}
39972 +
39973 +static inline void __save_init_fpu( struct task_struct *tsk )
39974 +{
39975 + asm volatile( "rex64 ; fxsave %0 ; fnclex"
39976 + : "=m" (tsk->thread.i387.fxsave));
39977 + tsk->thread_info->status &= ~TS_USEDFPU;
39978 +}
39979 +
39980 +/*
39981 + * switch_to(x,y) should switch tasks from x to y.
39982 + *
39983 + * This could still be optimized:
39984 + * - fold all the options into a flag word and test it with a single test.
39985 + * - could test fs/gs bitsliced
39986 + *
39987 + * Kprobes not supported here. Set the probe on schedule instead.
39988 + */
39989 +__kprobes struct task_struct *
39990 +__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
39991 +{
39992 + struct thread_struct *prev = &prev_p->thread,
39993 + *next = &next_p->thread;
39994 + int cpu = smp_processor_id();
39995 +#ifndef CONFIG_X86_NO_TSS
39996 + struct tss_struct *tss = &per_cpu(init_tss, cpu);
39997 +#endif
39998 + struct physdev_set_iopl iopl_op;
39999 + struct physdev_set_iobitmap iobmp_op;
40000 + multicall_entry_t _mcl[8], *mcl = _mcl;
40001 +
40002 + /*
40003 + * This is basically '__unlazy_fpu', except that we queue a
40004 + * multicall to indicate FPU task switch, rather than
40005 + * synchronously trapping to Xen.
40006 + * This must be here to ensure both math_state_restore() and
40007 + * kernel_fpu_begin() work consistently.
40008 + * The AMD workaround requires it to be after DS reload, or
40009 + * after DS has been cleared, which we do in __prepare_arch_switch.
40010 + */
40011 + if (prev_p->thread_info->status & TS_USEDFPU) {
40012 + __save_init_fpu(prev_p); /* _not_ save_init_fpu() */
40013 + mcl->op = __HYPERVISOR_fpu_taskswitch;
40014 + mcl->args[0] = 1;
40015 + mcl++;
40016 + }
40017 +
40018 + /*
40019 + * Reload esp0, LDT and the page table pointer:
40020 + */
40021 + mcl->op = __HYPERVISOR_stack_switch;
40022 + mcl->args[0] = __KERNEL_DS;
40023 + mcl->args[1] = next->rsp0;
40024 + mcl++;
40025 +
40026 + /*
40027 + * Load the per-thread Thread-Local Storage descriptor.
40028 + * This is load_TLS(next, cpu) with multicalls.
40029 + */
40030 +#define C(i) do { \
40031 + if (unlikely(next->tls_array[i] != prev->tls_array[i])) { \
40032 + mcl->op = __HYPERVISOR_update_descriptor; \
40033 + mcl->args[0] = virt_to_machine( \
40034 + &cpu_gdt(cpu)[GDT_ENTRY_TLS_MIN + i]); \
40035 + mcl->args[1] = next->tls_array[i]; \
40036 + mcl++; \
40037 + } \
40038 +} while (0)
40039 + C(0); C(1); C(2);
40040 +#undef C
40041 +
40042 + if (unlikely(prev->iopl != next->iopl)) {
40043 + iopl_op.iopl = (next->iopl == 0) ? 1 : next->iopl;
40044 + mcl->op = __HYPERVISOR_physdev_op;
40045 + mcl->args[0] = PHYSDEVOP_set_iopl;
40046 + mcl->args[1] = (unsigned long)&iopl_op;
40047 + mcl++;
40048 + }
40049 +
40050 + if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr)) {
40051 + iobmp_op.bitmap = (char *)next->io_bitmap_ptr;
40052 + iobmp_op.nr_ports = next->io_bitmap_ptr ? IO_BITMAP_BITS : 0;
40053 + mcl->op = __HYPERVISOR_physdev_op;
40054 + mcl->args[0] = PHYSDEVOP_set_iobitmap;
40055 + mcl->args[1] = (unsigned long)&iobmp_op;
40056 + mcl++;
40057 + }
40058 +
40059 + (void)HYPERVISOR_multicall(_mcl, mcl - _mcl);
40060 + /*
40061 + * Switch DS and ES.
40062 + * This won't pick up thread selector changes, but I guess that is ok.
40063 + */
40064 + if (unlikely(next->es))
40065 + loadsegment(es, next->es);
40066 +
40067 + if (unlikely(next->ds))
40068 + loadsegment(ds, next->ds);
40069 +
40070 + /*
40071 + * Switch FS and GS.
40072 + */
40073 + if (unlikely(next->fsindex))
40074 + loadsegment(fs, next->fsindex);
40075 +
40076 + if (next->fs)
40077 + HYPERVISOR_set_segment_base(SEGBASE_FS, next->fs);
40078 +
40079 + if (unlikely(next->gsindex))
40080 + load_gs_index(next->gsindex);
40081 +
40082 + if (next->gs)
40083 + HYPERVISOR_set_segment_base(SEGBASE_GS_USER, next->gs);
40084 +
40085 + /*
40086 + * Switch the PDA context.
40087 + */
40088 + prev->userrsp = read_pda(oldrsp);
40089 + write_pda(oldrsp, next->userrsp);
40090 + write_pda(pcurrent, next_p);
40091 + write_pda(kernelstack,
40092 + task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET);
40093 +
40094 + /*
40095 + * Now maybe reload the debug registers
40096 + */
40097 + if (unlikely(next->debugreg7)) {
40098 + set_debugreg(next->debugreg0, 0);
40099 + set_debugreg(next->debugreg1, 1);
40100 + set_debugreg(next->debugreg2, 2);
40101 + set_debugreg(next->debugreg3, 3);
40102 + /* no 4 and 5 */
40103 + set_debugreg(next->debugreg6, 6);
40104 + set_debugreg(next->debugreg7, 7);
40105 + }
40106 +
40107 + return prev_p;
40108 +}
40109 +
40110 +/*
40111 + * sys_execve() executes a new program.
40112 + */
40113 +asmlinkage
40114 +long sys_execve(char __user *name, char __user * __user *argv,
40115 + char __user * __user *envp, struct pt_regs regs)
40116 +{
40117 + long error;
40118 + char * filename;
40119 +
40120 + filename = getname(name);
40121 + error = PTR_ERR(filename);
40122 + if (IS_ERR(filename))
40123 + return error;
40124 + error = do_execve(filename, argv, envp, &regs);
40125 + if (error == 0) {
40126 + task_lock(current);
40127 + current->ptrace &= ~PT_DTRACE;
40128 + task_unlock(current);
40129 + }
40130 + putname(filename);
40131 + return error;
40132 +}
40133 +
40134 +void set_personality_64bit(void)
40135 +{
40136 + /* inherit personality from parent */
40137 +
40138 + /* Make sure to be in 64bit mode */
40139 + clear_thread_flag(TIF_IA32);
40140 +
40141 + /* TBD: overwrites user setup. Should have two bits.
40142 + But 64bit processes have always behaved this way,
40143 + so it's not too bad. The main problem is just that
40144 + 32-bit children are affected again. */
40145 + current->personality &= ~READ_IMPLIES_EXEC;
40146 +}
40147 +
40148 +asmlinkage long sys_fork(struct pt_regs *regs)
40149 +{
40150 + return do_fork(SIGCHLD, regs->rsp, regs, 0, NULL, NULL);
40151 +}
40152 +
40153 +asmlinkage long
40154 +sys_clone(unsigned long clone_flags, unsigned long newsp,
40155 + void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
40156 +{
40157 + if (!newsp)
40158 + newsp = regs->rsp;
40159 + return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
40160 +}
40161 +
40162 +/*
40163 + * This is trivial, and on the face of it looks like it
40164 + * could equally well be done in user mode.
40165 + *
40166 + * Not so, for quite unobvious reasons - register pressure.
40167 + * In user mode vfork() cannot have a stack frame, and if
40168 + * done by calling the "clone()" system call directly, you
40169 + * do not have enough call-clobbered registers to hold all
40170 + * the information you need.
40171 + */
40172 +asmlinkage long sys_vfork(struct pt_regs *regs)
40173 +{
40174 + return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->rsp, regs, 0,
40175 + NULL, NULL);
40176 +}
40177 +
40178 +unsigned long get_wchan(struct task_struct *p)
40179 +{
40180 + unsigned long stack;
40181 + u64 fp,rip;
40182 + int count = 0;
40183 +
40184 + if (!p || p == current || p->state==TASK_RUNNING)
40185 + return 0;
40186 + stack = (unsigned long)task_stack_page(p);
40187 + if (p->thread.rsp < stack || p->thread.rsp > stack+THREAD_SIZE)
40188 + return 0;
40189 + fp = *(u64 *)(p->thread.rsp);
40190 + do {
40191 + if (fp < (unsigned long)stack ||
40192 + fp > (unsigned long)stack+THREAD_SIZE)
40193 + return 0;
40194 + rip = *(u64 *)(fp+8);
40195 + if (!in_sched_functions(rip))
40196 + return rip;
40197 + fp = *(u64 *)fp;
40198 + } while (count++ < 16);
40199 + return 0;
40200 +}
40201 +
40202 +long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
40203 +{
40204 + int ret = 0;
40205 + int doit = task == current;
40206 + int cpu;
40207 +
40208 + switch (code) {
40209 + case ARCH_SET_GS:
40210 + if (addr >= TASK_SIZE_OF(task))
40211 + return -EPERM;
40212 + cpu = get_cpu();
40213 + /* handle small bases via the GDT because that's faster to
40214 + switch. */
40215 + if (addr <= 0xffffffff) {
40216 + set_32bit_tls(task, GS_TLS, addr);
40217 + if (doit) {
40218 + load_TLS(&task->thread, cpu);
40219 + load_gs_index(GS_TLS_SEL);
40220 + }
40221 + task->thread.gsindex = GS_TLS_SEL;
40222 + task->thread.gs = 0;
40223 + } else {
40224 + task->thread.gsindex = 0;
40225 + task->thread.gs = addr;
40226 + if (doit) {
40227 + load_gs_index(0);
40228 + ret = HYPERVISOR_set_segment_base(
40229 + SEGBASE_GS_USER, addr);
40230 + }
40231 + }
40232 + put_cpu();
40233 + break;
40234 + case ARCH_SET_FS:
40235 + /* Not strictly needed for fs, but do it for symmetry
40236 + with gs */
40237 + if (addr >= TASK_SIZE_OF(task))
40238 + return -EPERM;
40239 + cpu = get_cpu();
40240 + /* handle small bases via the GDT because that's faster to
40241 + switch. */
40242 + if (addr <= 0xffffffff) {
40243 + set_32bit_tls(task, FS_TLS, addr);
40244 + if (doit) {
40245 + load_TLS(&task->thread, cpu);
40246 + asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL));
40247 + }
40248 + task->thread.fsindex = FS_TLS_SEL;
40249 + task->thread.fs = 0;
40250 + } else {
40251 + task->thread.fsindex = 0;
40252 + task->thread.fs = addr;
40253 + if (doit) {
40254 + /* set the selector to 0 to not confuse
40255 + __switch_to */
40256 + asm volatile("movl %0,%%fs" :: "r" (0));
40257 + ret = HYPERVISOR_set_segment_base(SEGBASE_FS,
40258 + addr);
40259 + }
40260 + }
40261 + put_cpu();
40262 + break;
40263 + case ARCH_GET_FS: {
40264 + unsigned long base;
40265 + if (task->thread.fsindex == FS_TLS_SEL)
40266 + base = read_32bit_tls(task, FS_TLS);
40267 + else if (doit)
40268 + rdmsrl(MSR_FS_BASE, base);
40269 + else
40270 + base = task->thread.fs;
40271 + ret = put_user(base, (unsigned long __user *)addr);
40272 + break;
40273 + }
40274 + case ARCH_GET_GS: {
40275 + unsigned long base;
40276 + if (task->thread.gsindex == GS_TLS_SEL)
40277 + base = read_32bit_tls(task, GS_TLS);
40278 + else if (doit)
40279 + rdmsrl(MSR_KERNEL_GS_BASE, base);
40280 + else
40281 + base = task->thread.gs;
40282 + ret = put_user(base, (unsigned long __user *)addr);
40283 + break;
40284 + }
40285 +
40286 + default:
40287 + ret = -EINVAL;
40288 + break;
40289 + }
40290 +
40291 + return ret;
40292 +}
40293 +
40294 +long sys_arch_prctl(int code, unsigned long addr)
40295 +{
40296 + return do_arch_prctl(current, code, addr);
40297 +}
40298 +
40299 +/*
40300 + * Capture the user space registers if the task is not running (in user space)
40301 + */
40302 +int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs)
40303 +{
40304 + struct pt_regs *pp, ptregs;
40305 +
40306 + pp = task_pt_regs(tsk);
40307 +
40308 + ptregs = *pp;
40309 + ptregs.cs &= 0xffff;
40310 + ptregs.ss &= 0xffff;
40311 +
40312 + elf_core_copy_regs(regs, &ptregs);
40313 +
40314 + boot_option_idle_override = 1;
40315 + return 1;
40316 +}
40317 +
40318 +unsigned long arch_align_stack(unsigned long sp)
40319 +{
40320 + if (randomize_va_space)
40321 + sp -= get_random_int() % 8192;
40322 + return sp & ~0xf;
40323 +}
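arch_align_stack() above shifts the starting stack pointer down by a random amount of up to 8 KiB and then rounds down to a 16-byte boundary. A user-space sketch of the same arithmetic, with rand() standing in for get_random_int():

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

static unsigned long align_stack(unsigned long sp, int randomize)
{
	if (randomize)
		sp -= (unsigned long)(rand() % 8192);	/* up to 8 KiB of jitter */
	return sp & ~0xfUL;				/* keep 16-byte ABI alignment */
}

int main(void)
{
	unsigned long sp = 0x7fffffffe000UL;

	srand((unsigned)time(NULL));
	printf("0x%lx -> 0x%lx\n", sp, align_stack(sp, 1));
	return 0;
}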
40324 +
40325 +#ifndef CONFIG_SMP
40326 +void _restore_vcpu(void)
40327 +{
40328 +}
40329 +#endif
40330 diff -Nur linux-2.6.16.33-noxen/arch/x86_64/kernel/relocate_kernel.S linux-2.6.16.33/arch/x86_64/kernel/relocate_kernel.S
40331 --- linux-2.6.16.33-noxen/arch/x86_64/kernel/relocate_kernel.S 2006-11-22 18:06:31.000000000 +0000
40332 +++ linux-2.6.16.33/arch/x86_64/kernel/relocate_kernel.S 2007-05-23 21:00:01.000000000 +0000
40333 @@ -7,31 +7,195 @@
40334 */
40335
40336 #include <linux/linkage.h>
40337 +#include <asm/page.h>
40338 +#include <asm/kexec.h>
40339
40340 - /*
40341 - * Must be relocatable PIC code callable as a C function, that once
40342 - * it starts can not use the previous processes stack.
40343 - */
40344 - .globl relocate_new_kernel
40345 +/*
40346 + * Must be relocatable PIC code callable as a C function
40347 + */
40348 +
40349 +#define PTR(x) (x << 3)
40350 +#define PAGE_ALIGNED (1 << PAGE_SHIFT)
40351 +#define PAGE_ATTR 0x63 /* _PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY */
40352 +
40353 + .text
40354 + .align PAGE_ALIGNED
40355 .code64
40356 + .globl relocate_kernel
40357 +relocate_kernel:
40358 + /* %rdi indirection_page
40359 + * %rsi page_list
40360 + * %rdx start address
40361 + */
40362 +
40363 + /* map the control page at its virtual address */
40364 +
40365 + movq $0x0000ff8000000000, %r10 /* mask */
40366 + mov $(39 - 3), %cl /* bits to shift */
40367 + movq PTR(VA_CONTROL_PAGE)(%rsi), %r11 /* address to map */
40368 +
40369 + movq %r11, %r9
40370 + andq %r10, %r9
40371 + shrq %cl, %r9
40372 +
40373 + movq PTR(VA_PGD)(%rsi), %r8
40374 + addq %r8, %r9
40375 + movq PTR(PA_PUD_0)(%rsi), %r8
40376 + orq $PAGE_ATTR, %r8
40377 + movq %r8, (%r9)
40378 +
40379 + shrq $9, %r10
40380 + sub $9, %cl
40381 +
40382 + movq %r11, %r9
40383 + andq %r10, %r9
40384 + shrq %cl, %r9
40385 +
40386 + movq PTR(VA_PUD_0)(%rsi), %r8
40387 + addq %r8, %r9
40388 + movq PTR(PA_PMD_0)(%rsi), %r8
40389 + orq $PAGE_ATTR, %r8
40390 + movq %r8, (%r9)
40391 +
40392 + shrq $9, %r10
40393 + sub $9, %cl
40394 +
40395 + movq %r11, %r9
40396 + andq %r10, %r9
40397 + shrq %cl, %r9
40398 +
40399 + movq PTR(VA_PMD_0)(%rsi), %r8
40400 + addq %r8, %r9
40401 + movq PTR(PA_PTE_0)(%rsi), %r8
40402 + orq $PAGE_ATTR, %r8
40403 + movq %r8, (%r9)
40404 +
40405 + shrq $9, %r10
40406 + sub $9, %cl
40407 +
40408 + movq %r11, %r9
40409 + andq %r10, %r9
40410 + shrq %cl, %r9
40411 +
40412 + movq PTR(VA_PTE_0)(%rsi), %r8
40413 + addq %r8, %r9
40414 + movq PTR(PA_CONTROL_PAGE)(%rsi), %r8
40415 + orq $PAGE_ATTR, %r8
40416 + movq %r8, (%r9)
40417 +
40418 + /* identity map the control page at its physical address */
40419 +
40420 + movq $0x0000ff8000000000, %r10 /* mask */
40421 + mov $(39 - 3), %cl /* bits to shift */
40422 + movq PTR(PA_CONTROL_PAGE)(%rsi), %r11 /* address to map */
40423 +
40424 + movq %r11, %r9
40425 + andq %r10, %r9
40426 + shrq %cl, %r9
40427 +
40428 + movq PTR(VA_PGD)(%rsi), %r8
40429 + addq %r8, %r9
40430 + movq PTR(PA_PUD_1)(%rsi), %r8
40431 + orq $PAGE_ATTR, %r8
40432 + movq %r8, (%r9)
40433 +
40434 + shrq $9, %r10
40435 + sub $9, %cl
40436 +
40437 + movq %r11, %r9
40438 + andq %r10, %r9
40439 + shrq %cl, %r9
40440 +
40441 + movq PTR(VA_PUD_1)(%rsi), %r8
40442 + addq %r8, %r9
40443 + movq PTR(PA_PMD_1)(%rsi), %r8
40444 + orq $PAGE_ATTR, %r8
40445 + movq %r8, (%r9)
40446 +
40447 + shrq $9, %r10
40448 + sub $9, %cl
40449 +
40450 + movq %r11, %r9
40451 + andq %r10, %r9
40452 + shrq %cl, %r9
40453 +
40454 + movq PTR(VA_PMD_1)(%rsi), %r8
40455 + addq %r8, %r9
40456 + movq PTR(PA_PTE_1)(%rsi), %r8
40457 + orq $PAGE_ATTR, %r8
40458 + movq %r8, (%r9)
40459 +
40460 + shrq $9, %r10
40461 + sub $9, %cl
40462 +
40463 + movq %r11, %r9
40464 + andq %r10, %r9
40465 + shrq %cl, %r9
40466 +
40467 + movq PTR(VA_PTE_1)(%rsi), %r8
40468 + addq %r8, %r9
40469 + movq PTR(PA_CONTROL_PAGE)(%rsi), %r8
40470 + orq $PAGE_ATTR, %r8
40471 + movq %r8, (%r9)
40472 +
40473 relocate_new_kernel:
40474 - /* %rdi page_list
40475 - * %rsi reboot_code_buffer
40476 + /* %rdi indirection_page
40477 + * %rsi page_list
40478 * %rdx start address
40479 - * %rcx page_table
40480 - * %r8 arg5
40481 - * %r9 arg6
40482 */
40483
40484 /* zero out flags, and disable interrupts */
40485 pushq $0
40486 popfq
40487
40488 - /* set a new stack at the bottom of our page... */
40489 - lea 4096(%rsi), %rsp
40490 -
40491 - /* store the parameters back on the stack */
40492 - pushq %rdx /* store the start address */
40493 + /* get physical address of control page now */
40494 + /* this is impossible after page table switch */
40495 + movq PTR(PA_CONTROL_PAGE)(%rsi), %r8
40496 +
40497 + /* get physical address of page table now too */
40498 + movq PTR(PA_TABLE_PAGE)(%rsi), %rcx
40499 +
40500 + /* switch to new set of page tables */
40501 + movq PTR(PA_PGD)(%rsi), %r9
40502 + movq %r9, %cr3
40503 +
40504 + /* setup idt */
40505 + movq %r8, %rax
40506 + addq $(idt_80 - relocate_kernel), %rax
40507 + lidtq (%rax)
40508 +
40509 + /* setup gdt */
40510 + movq %r8, %rax
40511 + addq $(gdt - relocate_kernel), %rax
40512 + movq %r8, %r9
40513 + addq $((gdt_80 - relocate_kernel) + 2), %r9
40514 + movq %rax, (%r9)
40515 +
40516 + movq %r8, %rax
40517 + addq $(gdt_80 - relocate_kernel), %rax
40518 + lgdtq (%rax)
40519 +
40520 + /* setup data segment registers */
40521 + xorl %eax, %eax
40522 + movl %eax, %ds
40523 + movl %eax, %es
40524 + movl %eax, %fs
40525 + movl %eax, %gs
40526 + movl %eax, %ss
40527 +
40528 + /* setup a new stack at the end of the physical control page */
40529 + lea 4096(%r8), %rsp
40530 +
40531 + /* load new code segment and jump to identity mapped page */
40532 + movq %r8, %rax
40533 + addq $(identity_mapped - relocate_kernel), %rax
40534 + pushq $(gdt_cs - gdt)
40535 + pushq %rax
40536 + lretq
40537 +
40538 +identity_mapped:
40539 + /* store the start address on the stack */
40540 + pushq %rdx
40541
40542 /* Set cr0 to a known state:
40543 * 31 1 == Paging enabled
40544 @@ -134,10 +298,19 @@
40545 xorq %r13, %r13
40546 xorq %r14, %r14
40547 xorq %r15, %r15
40548 -
40549 ret
40550 -relocate_new_kernel_end:
40551
40552 - .globl relocate_new_kernel_size
40553 -relocate_new_kernel_size:
40554 - .quad relocate_new_kernel_end - relocate_new_kernel
40555 + .align 16
40556 +gdt:
40557 + .quad 0x0000000000000000 /* NULL descriptor */
40558 +gdt_cs:
40559 + .quad 0x00af9a000000ffff
40560 +gdt_end:
40561 +
40562 +gdt_80:
40563 + .word gdt_end - gdt - 1 /* limit */
40564 + .quad 0 /* base - filled in by code above */
40565 +
40566 +idt_80:
40567 + .word 0 /* limit */
40568 + .quad 0 /* base */
40569 diff -Nur linux-2.6.16.33-noxen/arch/x86_64/kernel/setup-xen.c linux-2.6.16.33/arch/x86_64/kernel/setup-xen.c
40570 --- linux-2.6.16.33-noxen/arch/x86_64/kernel/setup-xen.c 1970-01-01 00:00:00.000000000 +0000
40571 +++ linux-2.6.16.33/arch/x86_64/kernel/setup-xen.c 2007-01-08 15:00:45.000000000 +0000
40572 @@ -0,0 +1,1672 @@
40573 +/*
40574 + * linux/arch/x86-64/kernel/setup.c
40575 + *
40576 + * Copyright (C) 1995 Linus Torvalds
40577 + *
40578 + * Nov 2001 Dave Jones <davej@suse.de>
40579 + * Forked from i386 setup code.
40580 + *
40581 + * $Id$
40582 + */
40583 +
40584 +/*
40585 + * This file handles the architecture-dependent parts of initialization
40586 + */
40587 +
40588 +#include <linux/errno.h>
40589 +#include <linux/sched.h>
40590 +#include <linux/kernel.h>
40591 +#include <linux/mm.h>
40592 +#include <linux/stddef.h>
40593 +#include <linux/unistd.h>
40594 +#include <linux/ptrace.h>
40595 +#include <linux/slab.h>
40596 +#include <linux/user.h>
40597 +#include <linux/a.out.h>
40598 +#include <linux/tty.h>
40599 +#include <linux/ioport.h>
40600 +#include <linux/delay.h>
40601 +#include <linux/config.h>
40602 +#include <linux/init.h>
40603 +#include <linux/initrd.h>
40604 +#include <linux/highmem.h>
40605 +#include <linux/bootmem.h>
40606 +#include <linux/module.h>
40607 +#include <asm/processor.h>
40608 +#include <linux/console.h>
40609 +#include <linux/seq_file.h>
40610 +#include <linux/crash_dump.h>
40611 +#include <linux/root_dev.h>
40612 +#include <linux/pci.h>
40613 +#include <linux/acpi.h>
40614 +#include <linux/kallsyms.h>
40615 +#include <linux/edd.h>
40616 +#include <linux/mmzone.h>
40617 +#include <linux/kexec.h>
40618 +#include <linux/cpufreq.h>
40619 +#include <linux/dmi.h>
40620 +#include <linux/dma-mapping.h>
40621 +
40622 +#include <asm/mtrr.h>
40623 +#include <asm/uaccess.h>
40624 +#include <asm/system.h>
40625 +#include <asm/io.h>
40626 +#include <asm/smp.h>
40627 +#include <asm/msr.h>
40628 +#include <asm/desc.h>
40629 +#include <video/edid.h>
40630 +#include <asm/e820.h>
40631 +#include <asm/dma.h>
40632 +#include <asm/mpspec.h>
40633 +#include <asm/mmu_context.h>
40634 +#include <asm/bootsetup.h>
40635 +#include <asm/proto.h>
40636 +#include <asm/setup.h>
40637 +#include <asm/mach_apic.h>
40638 +#include <asm/numa.h>
40639 +#include <asm/swiotlb.h>
40640 +#include <asm/sections.h>
40641 +#include <asm/gart-mapping.h>
40642 +#ifdef CONFIG_XEN
40643 +#include <linux/percpu.h>
40644 +#include <xen/interface/physdev.h>
40645 +#include "setup_arch_pre.h"
40646 +#include <asm/hypervisor.h>
40647 +#include <xen/interface/nmi.h>
40648 +#include <xen/features.h>
40649 +#include <xen/xencons.h>
40650 +#define PFN_UP(x) (((x) + PAGE_SIZE-1) >> PAGE_SHIFT)
40651 +#define PFN_PHYS(x) ((x) << PAGE_SHIFT)
40652 +#include <asm/mach-xen/setup_arch_post.h>
40653 +#include <xen/interface/memory.h>
40654 +
40655 +#ifdef CONFIG_XEN
40656 +#include <xen/interface/kexec.h>
40657 +#endif
40658 +
40659 +extern unsigned long start_pfn;
40660 +extern struct edid_info edid_info;
40661 +
40662 +shared_info_t *HYPERVISOR_shared_info = (shared_info_t *)empty_zero_page;
40663 +EXPORT_SYMBOL(HYPERVISOR_shared_info);
40664 +
40665 +extern char hypercall_page[PAGE_SIZE];
40666 +EXPORT_SYMBOL(hypercall_page);
40667 +
40668 +/* Allows setting of maximum possible memory size */
40669 +unsigned long xen_override_max_pfn;
40670 +
40671 +static int xen_panic_event(struct notifier_block *, unsigned long, void *);
40672 +static struct notifier_block xen_panic_block = {
40673 + xen_panic_event, NULL, 0 /* try to go last */
40674 +};
40675 +
40676 +unsigned long *phys_to_machine_mapping;
40677 +unsigned long *pfn_to_mfn_frame_list_list, *pfn_to_mfn_frame_list[512];
40678 +
40679 +EXPORT_SYMBOL(phys_to_machine_mapping);
40680 +
40681 +DEFINE_PER_CPU(multicall_entry_t, multicall_list[8]);
40682 +DEFINE_PER_CPU(int, nr_multicall_ents);
40683 +
40684 +/* Raw start-of-day parameters from the hypervisor. */
40685 +start_info_t *xen_start_info;
40686 +EXPORT_SYMBOL(xen_start_info);
40687 +#endif
40688 +
40689 +/*
40690 + * Machine setup..
40691 + */
40692 +
40693 +struct cpuinfo_x86 boot_cpu_data __read_mostly;
40694 +
40695 +unsigned long mmu_cr4_features;
40696 +
40697 +int acpi_disabled;
40698 +EXPORT_SYMBOL(acpi_disabled);
40699 +#ifdef CONFIG_ACPI
40700 +extern int __initdata acpi_ht;
40701 +extern acpi_interrupt_flags acpi_sci_flags;
40702 +int __initdata acpi_force = 0;
40703 +#endif
40704 +
40705 +int acpi_numa __initdata;
40706 +
40707 +/* Boot loader ID as an integer, for the benefit of proc_dointvec */
40708 +int bootloader_type;
40709 +
40710 +unsigned long saved_video_mode;
40711 +
40712 +/*
40713 + * Setup options
40714 + */
40715 +struct screen_info screen_info;
40716 +struct sys_desc_table_struct {
40717 + unsigned short length;
40718 + unsigned char table[0];
40719 +};
40720 +
40721 +struct edid_info edid_info;
40722 +struct e820map e820;
40723 +#ifdef CONFIG_XEN
40724 +struct e820map machine_e820;
40725 +#endif
40726 +
40727 +extern int root_mountflags;
40728 +
40729 +char command_line[COMMAND_LINE_SIZE];
40730 +
40731 +struct resource standard_io_resources[] = {
40732 + { .name = "dma1", .start = 0x00, .end = 0x1f,
40733 + .flags = IORESOURCE_BUSY | IORESOURCE_IO },
40734 + { .name = "pic1", .start = 0x20, .end = 0x21,
40735 + .flags = IORESOURCE_BUSY | IORESOURCE_IO },
40736 + { .name = "timer0", .start = 0x40, .end = 0x43,
40737 + .flags = IORESOURCE_BUSY | IORESOURCE_IO },
40738 + { .name = "timer1", .start = 0x50, .end = 0x53,
40739 + .flags = IORESOURCE_BUSY | IORESOURCE_IO },
40740 + { .name = "keyboard", .start = 0x60, .end = 0x6f,
40741 + .flags = IORESOURCE_BUSY | IORESOURCE_IO },
40742 + { .name = "dma page reg", .start = 0x80, .end = 0x8f,
40743 + .flags = IORESOURCE_BUSY | IORESOURCE_IO },
40744 + { .name = "pic2", .start = 0xa0, .end = 0xa1,
40745 + .flags = IORESOURCE_BUSY | IORESOURCE_IO },
40746 + { .name = "dma2", .start = 0xc0, .end = 0xdf,
40747 + .flags = IORESOURCE_BUSY | IORESOURCE_IO },
40748 + { .name = "fpu", .start = 0xf0, .end = 0xff,
40749 + .flags = IORESOURCE_BUSY | IORESOURCE_IO }
40750 +};
40751 +
40752 +#define STANDARD_IO_RESOURCES \
40753 + (sizeof standard_io_resources / sizeof standard_io_resources[0])
40754 +
40755 +#define IORESOURCE_RAM (IORESOURCE_BUSY | IORESOURCE_MEM)
40756 +
40757 +struct resource data_resource = {
40758 + .name = "Kernel data",
40759 + .start = 0,
40760 + .end = 0,
40761 + .flags = IORESOURCE_RAM,
40762 +};
40763 +struct resource code_resource = {
40764 + .name = "Kernel code",
40765 + .start = 0,
40766 + .end = 0,
40767 + .flags = IORESOURCE_RAM,
40768 +};
40769 +
40770 +#define IORESOURCE_ROM (IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM)
40771 +
40772 +static struct resource system_rom_resource = {
40773 + .name = "System ROM",
40774 + .start = 0xf0000,
40775 + .end = 0xfffff,
40776 + .flags = IORESOURCE_ROM,
40777 +};
40778 +
40779 +static struct resource extension_rom_resource = {
40780 + .name = "Extension ROM",
40781 + .start = 0xe0000,
40782 + .end = 0xeffff,
40783 + .flags = IORESOURCE_ROM,
40784 +};
40785 +
40786 +static struct resource adapter_rom_resources[] = {
40787 + { .name = "Adapter ROM", .start = 0xc8000, .end = 0,
40788 + .flags = IORESOURCE_ROM },
40789 + { .name = "Adapter ROM", .start = 0, .end = 0,
40790 + .flags = IORESOURCE_ROM },
40791 + { .name = "Adapter ROM", .start = 0, .end = 0,
40792 + .flags = IORESOURCE_ROM },
40793 + { .name = "Adapter ROM", .start = 0, .end = 0,
40794 + .flags = IORESOURCE_ROM },
40795 + { .name = "Adapter ROM", .start = 0, .end = 0,
40796 + .flags = IORESOURCE_ROM },
40797 + { .name = "Adapter ROM", .start = 0, .end = 0,
40798 + .flags = IORESOURCE_ROM }
40799 +};
40800 +
40801 +#define ADAPTER_ROM_RESOURCES \
40802 + (sizeof adapter_rom_resources / sizeof adapter_rom_resources[0])
40803 +
40804 +static struct resource video_rom_resource = {
40805 + .name = "Video ROM",
40806 + .start = 0xc0000,
40807 + .end = 0xc7fff,
40808 + .flags = IORESOURCE_ROM,
40809 +};
40810 +
40811 +static struct resource video_ram_resource = {
40812 + .name = "Video RAM area",
40813 + .start = 0xa0000,
40814 + .end = 0xbffff,
40815 + .flags = IORESOURCE_RAM,
40816 +};
40817 +
40818 +#define romsignature(x) (*(unsigned short *)(x) == 0xaa55)
40819 +
40820 +static int __init romchecksum(unsigned char *rom, unsigned long length)
40821 +{
40822 + unsigned char *p, sum = 0;
40823 +
40824 + for (p = rom; p < rom + length; p++)
40825 + sum += *p;
40826 + return sum == 0;
40827 +}
40828 +
40829 +static void __init probe_roms(void)
40830 +{
40831 + unsigned long start, length, upper;
40832 + unsigned char *rom;
40833 + int i;
40834 +
40835 +#ifdef CONFIG_XEN
40836 + /* Nothing to do if not running in dom0. */
40837 + if (!is_initial_xendomain())
40838 + return;
40839 +#endif
40840 +
40841 + /* video rom */
40842 + upper = adapter_rom_resources[0].start;
40843 + for (start = video_rom_resource.start; start < upper; start += 2048) {
40844 + rom = isa_bus_to_virt(start);
40845 + if (!romsignature(rom))
40846 + continue;
40847 +
40848 + video_rom_resource.start = start;
40849 +
40850 + /* 0 < length <= 0x7f * 512, historically */
40851 + length = rom[2] * 512;
40852 +
40853 + /* if checksum okay, trust length byte */
40854 + if (length && romchecksum(rom, length))
40855 + video_rom_resource.end = start + length - 1;
40856 +
40857 + request_resource(&iomem_resource, &video_rom_resource);
40858 + break;
40859 + }
40860 +
40861 + start = (video_rom_resource.end + 1 + 2047) & ~2047UL;
40862 + if (start < upper)
40863 + start = upper;
40864 +
40865 + /* system rom */
40866 + request_resource(&iomem_resource, &system_rom_resource);
40867 + upper = system_rom_resource.start;
40868 +
40869 + /* check for extension rom (ignore length byte!) */
40870 + rom = isa_bus_to_virt(extension_rom_resource.start);
40871 + if (romsignature(rom)) {
40872 + length = extension_rom_resource.end - extension_rom_resource.start + 1;
40873 + if (romchecksum(rom, length)) {
40874 + request_resource(&iomem_resource, &extension_rom_resource);
40875 + upper = extension_rom_resource.start;
40876 + }
40877 + }
40878 +
40879 + /* check for adapter roms on 2k boundaries */
40880 + for (i = 0; i < ADAPTER_ROM_RESOURCES && start < upper; start += 2048) {
40881 + rom = isa_bus_to_virt(start);
40882 + if (!romsignature(rom))
40883 + continue;
40884 +
40885 + /* 0 < length <= 0x7f * 512, historically */
40886 + length = rom[2] * 512;
40887 +
40888 + /* but accept any length that fits if checksum okay */
40889 + if (!length || start + length > upper || !romchecksum(rom, length))
40890 + continue;
40891 +
40892 + adapter_rom_resources[i].start = start;
40893 + adapter_rom_resources[i].end = start + length - 1;
40894 + request_resource(&iomem_resource, &adapter_rom_resources[i]);
40895 +
40896 + start = adapter_rom_resources[i++].end & ~2047UL;
40897 + }
40898 +}
40899 +
40900 +static __init void parse_cmdline_early (char ** cmdline_p)
40901 +{
40902 + char c = ' ', *to = command_line, *from = COMMAND_LINE;
40903 + int len = 0;
40904 + int userdef = 0;
40905 +
40906 + for (;;) {
40907 + if (c != ' ')
40908 + goto next_char;
40909 +
40910 +#ifdef CONFIG_SMP
40911 + /*
40912 + * If the BIOS enumerates physical processors before logical,
40913 + * maxcpus=N at enumeration-time can be used to disable HT.
40914 + */
40915 + else if (!memcmp(from, "maxcpus=", 8)) {
40916 + extern unsigned int maxcpus;
40917 +
40918 + maxcpus = simple_strtoul(from + 8, NULL, 0);
40919 + }
40920 +#endif
40921 +#ifdef CONFIG_ACPI
40922 + /* "acpi=off" disables both ACPI table parsing and interpreter init */
40923 + if (!memcmp(from, "acpi=off", 8))
40924 + disable_acpi();
40925 +
40926 + if (!memcmp(from, "acpi=force", 10)) {
40927 + /* add later when we do DMI horrors: */
40928 + acpi_force = 1;
40929 + acpi_disabled = 0;
40930 + }
40931 +
40932 + /* acpi=ht just means: do ACPI MADT parsing
40933 + at bootup, but don't enable the full ACPI interpreter */
40934 + if (!memcmp(from, "acpi=ht", 7)) {
40935 + if (!acpi_force)
40936 + disable_acpi();
40937 + acpi_ht = 1;
40938 + }
40939 + else if (!memcmp(from, "pci=noacpi", 10))
40940 + acpi_disable_pci();
40941 + else if (!memcmp(from, "acpi=noirq", 10))
40942 + acpi_noirq_set();
40943 +
40944 + else if (!memcmp(from, "acpi_sci=edge", 13))
40945 + acpi_sci_flags.trigger = 1;
40946 + else if (!memcmp(from, "acpi_sci=level", 14))
40947 + acpi_sci_flags.trigger = 3;
40948 + else if (!memcmp(from, "acpi_sci=high", 13))
40949 + acpi_sci_flags.polarity = 1;
40950 + else if (!memcmp(from, "acpi_sci=low", 12))
40951 + acpi_sci_flags.polarity = 3;
40952 +
40953 + /* acpi=strict disables out-of-spec workarounds */
40954 + else if (!memcmp(from, "acpi=strict", 11)) {
40955 + acpi_strict = 1;
40956 + }
40957 +#ifdef CONFIG_X86_IO_APIC
40958 + else if (!memcmp(from, "acpi_skip_timer_override", 24))
40959 + acpi_skip_timer_override = 1;
40960 +#endif
40961 +#endif
40962 +
40963 +#ifndef CONFIG_XEN
40964 + if (!memcmp(from, "nolapic", 7) ||
40965 + !memcmp(from, "disableapic", 11))
40966 + disable_apic = 1;
40967 +
40968 + /* Don't confuse with noapictimer */
40969 + if (!memcmp(from, "noapic", 6) &&
40970 + (from[6] == ' ' || from[6] == 0))
40971 + skip_ioapic_setup = 1;
40972 +
40973 + /* Make sure to not confuse with apic= */
40974 + if (!memcmp(from, "apic", 4) &&
40975 + (from[4] == ' ' || from[4] == 0)) {
40976 + skip_ioapic_setup = 0;
40977 + ioapic_force = 1;
40978 + }
40979 +#endif
40980 +
40981 + if (!memcmp(from, "mem=", 4))
40982 + parse_memopt(from+4, &from);
40983 +
40984 + if (!memcmp(from, "memmap=", 7)) {
40985 + /* exactmap option is for user-defined memory */
40986 + if (!memcmp(from+7, "exactmap", 8)) {
40987 +#ifdef CONFIG_CRASH_DUMP
40988 + /* If we are doing a crash dump, we
40989 + * still need to know the real mem
40990 + * size before original memory map is
40991 + * reset.
40992 + */
40993 + saved_max_pfn = e820_end_of_ram();
40994 +#endif
40995 + from += 8+7;
40996 + end_pfn_map = 0;
40997 + e820.nr_map = 0;
40998 + userdef = 1;
40999 + }
41000 + else {
41001 + parse_memmapopt(from+7, &from);
41002 + userdef = 1;
41003 + }
41004 + }
41005 +
41006 +#ifdef CONFIG_NUMA
41007 + if (!memcmp(from, "numa=", 5))
41008 + numa_setup(from+5);
41009 +#endif
41010 +
41011 + if (!memcmp(from,"iommu=",6)) {
41012 + iommu_setup(from+6);
41013 + }
41014 +
41015 + if (!memcmp(from,"oops=panic", 10))
41016 + panic_on_oops = 1;
41017 +
41018 + if (!memcmp(from, "noexec=", 7))
41019 + nonx_setup(from + 7);
41020 +
41021 +#ifdef CONFIG_KEXEC
41022 + /* crashkernel=size@addr specifies the location to reserve for
41023 + * a crash kernel. By reserving this memory we guarantee
41024 + * that Linux never sets it up as a DMA target.
41025 + * Useful for holding code to do something appropriate
41026 + * after a kernel panic.
41027 + */
41028 + else if (!memcmp(from, "crashkernel=", 12)) {
41029 +#ifndef CONFIG_XEN
41030 + unsigned long size, base;
41031 + size = memparse(from+12, &from);
41032 + if (*from == '@') {
41033 + base = memparse(from+1, &from);
41034 + /* FIXME: Do I want a sanity check
41035 + * to validate the memory range?
41036 + */
41037 + crashk_res.start = base;
41038 + crashk_res.end = base + size - 1;
41039 + }
41040 +#else
41041 + printk("Ignoring crashkernel command line, "
41042 + "parameter will be supplied by xen\n");
41043 +#endif
41044 + }
41045 +#endif
41046 +
41047 +#ifdef CONFIG_PROC_VMCORE
41048 + /* elfcorehdr= specifies the location of elf core header
41049 + * stored by the crashed kernel. This option will be passed
41050 + * by kexec loader to the capture kernel.
41051 + */
41052 + else if(!memcmp(from, "elfcorehdr=", 11))
41053 + elfcorehdr_addr = memparse(from+11, &from);
41054 +#endif
41055 +
41056 +#if defined(CONFIG_HOTPLUG_CPU) && !defined(CONFIG_XEN)
41057 + else if (!memcmp(from, "additional_cpus=", 16))
41058 + setup_additional_cpus(from+16);
41059 +#endif
41060 +
41061 + next_char:
41062 + c = *(from++);
41063 + if (!c)
41064 + break;
41065 + if (COMMAND_LINE_SIZE <= ++len)
41066 + break;
41067 + *(to++) = c;
41068 + }
41069 + if (userdef) {
41070 + printk(KERN_INFO "user-defined physical RAM map:\n");
41071 + e820_print_map("user");
41072 + }
41073 + *to = '\0';
41074 + *cmdline_p = command_line;
41075 +}
41076 +
41077 +#ifndef CONFIG_NUMA
41078 +static void __init
41079 +contig_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
41080 +{
41081 + unsigned long bootmap_size, bootmap;
41082 +
41083 + bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT;
41084 + bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size);
41085 + if (bootmap == -1L)
41086 + panic("Cannot find bootmem map of size %ld\n",bootmap_size);
41087 + bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, end_pfn);
41088 +#ifdef CONFIG_XEN
41089 + e820_bootmem_free(NODE_DATA(0), 0, xen_start_info->nr_pages<<PAGE_SHIFT);
41090 +#else
41091 + e820_bootmem_free(NODE_DATA(0), 0, end_pfn << PAGE_SHIFT);
41092 +#endif
41093 + reserve_bootmem(bootmap, bootmap_size);
41094 +}
41095 +#endif
41096 +
41097 +/* Use inline assembly to define this because the nops are defined
41098 + as inline assembly strings in the include files and we cannot
41099 + get them easily into strings. */
41100 +asm("\t.data\nk8nops: "
41101 + K8_NOP1 K8_NOP2 K8_NOP3 K8_NOP4 K8_NOP5 K8_NOP6
41102 + K8_NOP7 K8_NOP8);
41103 +
41104 +extern unsigned char k8nops[];
41105 +static unsigned char *k8_nops[ASM_NOP_MAX+1] = {
41106 + NULL,
41107 + k8nops,
41108 + k8nops + 1,
41109 + k8nops + 1 + 2,
41110 + k8nops + 1 + 2 + 3,
41111 + k8nops + 1 + 2 + 3 + 4,
41112 + k8nops + 1 + 2 + 3 + 4 + 5,
41113 + k8nops + 1 + 2 + 3 + 4 + 5 + 6,
41114 + k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
41115 +};
41116 +
41117 +extern char __vsyscall_0;
41118 +
41119 +/* Replace instructions with better alternatives for this CPU type.
41120 +
41121 + This runs before SMP is initialized to avoid SMP problems with
41122 +   self-modifying code. This implies that asymmetric systems where
41123 +   APs have fewer capabilities than the boot processor are not handled.
41124 + In this case boot with "noreplacement". */
41125 +void apply_alternatives(void *start, void *end)
41126 +{
41127 + struct alt_instr *a;
41128 + int diff, i, k;
41129 + for (a = start; (void *)a < end; a++) {
41130 + u8 *instr;
41131 +
41132 + if (!boot_cpu_has(a->cpuid))
41133 + continue;
41134 +
41135 + BUG_ON(a->replacementlen > a->instrlen);
41136 + instr = a->instr;
41137 + /* vsyscall code is not mapped yet. resolve it manually. */
41138 + if (instr >= (u8 *)VSYSCALL_START && instr < (u8*)VSYSCALL_END)
41139 + instr -= VSYSCALL_START - (unsigned long)&__vsyscall_0;
41140 + __inline_memcpy(instr, a->replacement, a->replacementlen);
41141 + diff = a->instrlen - a->replacementlen;
41142 +
41143 + /* Pad the rest with nops */
41144 + for (i = a->replacementlen; diff > 0; diff -= k, i += k) {
41145 + k = diff;
41146 + if (k > ASM_NOP_MAX)
41147 + k = ASM_NOP_MAX;
41148 + __inline_memcpy(instr + i, k8_nops[k], k);
41149 + }
41150 + }
41151 +}
41152 +
41153 +static int no_replacement __initdata = 0;
41154 +
41155 +void __init alternative_instructions(void)
41156 +{
41157 + extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
41158 + if (no_replacement)
41159 + return;
41160 + apply_alternatives(__alt_instructions, __alt_instructions_end);
41161 +}
41162 +
41163 +static int __init noreplacement_setup(char *s)
41164 +{
41165 + no_replacement = 1;
41166 + return 0;
41167 +}
41168 +
41169 +__setup("noreplacement", noreplacement_setup);
41170 +
41171 +#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
41172 +struct edd edd;
41173 +#ifdef CONFIG_EDD_MODULE
41174 +EXPORT_SYMBOL(edd);
41175 +#endif
41176 +/**
41177 + * copy_edd() - Copy the BIOS EDD information
41178 + * from boot_params into a safe place.
41179 + *
41180 + */
41181 +static inline void copy_edd(void)
41182 +{
41183 + memcpy(edd.mbr_signature, EDD_MBR_SIGNATURE, sizeof(edd.mbr_signature));
41184 + memcpy(edd.edd_info, EDD_BUF, sizeof(edd.edd_info));
41185 + edd.mbr_signature_nr = EDD_MBR_SIG_NR;
41186 + edd.edd_info_nr = EDD_NR;
41187 +}
41188 +#else
41189 +static inline void copy_edd(void)
41190 +{
41191 +}
41192 +#endif
41193 +
41194 +#ifndef CONFIG_XEN
41195 +#define EBDA_ADDR_POINTER 0x40E
41196 +static void __init reserve_ebda_region(void)
41197 +{
41198 + unsigned int addr;
41199 + /**
41200 + * there is a real-mode segmented pointer pointing to the
41201 + * 4K EBDA area at 0x40E
41202 + */
41203 + addr = *(unsigned short *)phys_to_virt(EBDA_ADDR_POINTER);
41204 + addr <<= 4;
41205 + if (addr)
41206 + reserve_bootmem_generic(addr, PAGE_SIZE);
41207 +}
41208 +#endif
41209 +
41210 +void __init setup_arch(char **cmdline_p)
41211 +{
41212 + unsigned long kernel_end;
41213 + struct xen_memory_map memmap;
41214 +
41215 +#ifdef CONFIG_XEN
41216 + /* Register a call for panic conditions. */
41217 + notifier_chain_register(&panic_notifier_list, &xen_panic_block);
41218 +
41219 + ROOT_DEV = MKDEV(RAMDISK_MAJOR,0);
41220 + kernel_end = 0; /* dummy */
41221 + screen_info = SCREEN_INFO;
41222 +
41223 + if (is_initial_xendomain()) {
41224 + /* This is drawn from a dump from vgacon:startup in
41225 + * standard Linux. */
41226 + screen_info.orig_video_mode = 3;
41227 + screen_info.orig_video_isVGA = 1;
41228 + screen_info.orig_video_lines = 25;
41229 + screen_info.orig_video_cols = 80;
41230 + screen_info.orig_video_ega_bx = 3;
41231 + screen_info.orig_video_points = 16;
41232 + screen_info.orig_y = screen_info.orig_video_lines - 1;
41233 + if (xen_start_info->console.dom0.info_size >=
41234 + sizeof(struct dom0_vga_console_info)) {
41235 + const struct dom0_vga_console_info *info =
41236 + (struct dom0_vga_console_info *)(
41237 + (char *)xen_start_info +
41238 + xen_start_info->console.dom0.info_off);
41239 + dom0_init_screen_info(info);
41240 + }
41241 + xen_start_info->console.domU.mfn = 0;
41242 + xen_start_info->console.domU.evtchn = 0;
41243 + } else
41244 + screen_info.orig_video_isVGA = 0;
41245 +
41246 + edid_info = EDID_INFO;
41247 + saved_video_mode = SAVED_VIDEO_MODE;
41248 + bootloader_type = LOADER_TYPE;
41249 +
41250 +#ifdef CONFIG_BLK_DEV_RAM
41251 + rd_image_start = RAMDISK_FLAGS & RAMDISK_IMAGE_START_MASK;
41252 + rd_prompt = ((RAMDISK_FLAGS & RAMDISK_PROMPT_FLAG) != 0);
41253 + rd_doload = ((RAMDISK_FLAGS & RAMDISK_LOAD_FLAG) != 0);
41254 +
41255 +
41256 +#endif
41257 +
41258 + setup_xen_features();
41259 +
41260 + HYPERVISOR_vm_assist(VMASST_CMD_enable,
41261 + VMASST_TYPE_writable_pagetables);
41262 +
41263 + ARCH_SETUP
41264 +#else
41265 + ROOT_DEV = old_decode_dev(ORIG_ROOT_DEV);
41266 + screen_info = SCREEN_INFO;
41267 + edid_info = EDID_INFO;
41268 + saved_video_mode = SAVED_VIDEO_MODE;
41269 + bootloader_type = LOADER_TYPE;
41270 +
41271 +#ifdef CONFIG_BLK_DEV_RAM
41272 + rd_image_start = RAMDISK_FLAGS & RAMDISK_IMAGE_START_MASK;
41273 + rd_prompt = ((RAMDISK_FLAGS & RAMDISK_PROMPT_FLAG) != 0);
41274 + rd_doload = ((RAMDISK_FLAGS & RAMDISK_LOAD_FLAG) != 0);
41275 +#endif
41276 +#endif /* !CONFIG_XEN */
41277 + setup_memory_region();
41278 + copy_edd();
41279 +
41280 + if (!MOUNT_ROOT_RDONLY)
41281 + root_mountflags &= ~MS_RDONLY;
41282 + init_mm.start_code = (unsigned long) &_text;
41283 + init_mm.end_code = (unsigned long) &_etext;
41284 + init_mm.end_data = (unsigned long) &_edata;
41285 + init_mm.brk = (unsigned long) &_end;
41286 +
41287 + code_resource.start = virt_to_phys(&_text);
41288 + code_resource.end = virt_to_phys(&_etext)-1;
41289 + data_resource.start = virt_to_phys(&_etext);
41290 + data_resource.end = virt_to_phys(&_edata)-1;
41291 +
41292 + parse_cmdline_early(cmdline_p);
41293 +
41294 + early_identify_cpu(&boot_cpu_data);
41295 +
41296 + /*
41297 + * partially used pages are not usable - thus
41298 + * we are rounding upwards:
41299 + */
41300 + end_pfn = e820_end_of_ram();
41301 +
41302 + check_efer();
41303 +
41304 + init_memory_mapping(0, (end_pfn_map << PAGE_SHIFT));
41305 +
41306 +#ifdef CONFIG_ACPI_NUMA
41307 + /*
41308 + * Parse SRAT to discover nodes.
41309 + */
41310 + acpi_numa_init();
41311 +#endif
41312 +
41313 +#ifdef CONFIG_NUMA
41314 + numa_initmem_init(0, end_pfn);
41315 +#else
41316 + contig_initmem_init(0, end_pfn);
41317 +#endif
41318 +
41319 + /* Reserve direct mapping */
41320 + reserve_bootmem_generic(table_start << PAGE_SHIFT,
41321 + (table_end - table_start) << PAGE_SHIFT);
41322 +
41323 + /* reserve kernel */
41324 + kernel_end = round_up(__pa_symbol(&_end),PAGE_SIZE);
41325 + reserve_bootmem_generic(HIGH_MEMORY, kernel_end - HIGH_MEMORY);
41326 +
41327 +#ifdef CONFIG_XEN
41328 + /* reserve physmap, start info and initial page tables */
41329 + reserve_bootmem(kernel_end, (table_start<<PAGE_SHIFT)-kernel_end);
41330 +#else
41331 + /*
41332 + * reserve physical page 0 - it's a special BIOS page on many boxes,
41333 + * enabling clean reboots, SMP operation, laptop functions.
41334 + */
41335 + reserve_bootmem_generic(0, PAGE_SIZE);
41336 +
41337 + /* reserve ebda region */
41338 + reserve_ebda_region();
41339 +#endif
41340 +
41341 +#ifdef CONFIG_SMP
41342 + /*
41343 + * But first pinch a few for the stack/trampoline stuff
41344 + * FIXME: Don't need the extra page at 4K, but need to fix
41345 + * trampoline before removing it. (see the GDT stuff)
41346 + */
41347 + reserve_bootmem_generic(PAGE_SIZE, PAGE_SIZE);
41348 +
41349 + /* Reserve SMP trampoline */
41350 + reserve_bootmem_generic(SMP_TRAMPOLINE_BASE, PAGE_SIZE);
41351 +#endif
41352 +
41353 +#ifdef CONFIG_ACPI_SLEEP
41354 + /*
41355 + * Reserve low memory region for sleep support.
41356 + */
41357 + acpi_reserve_bootmem();
41358 +#endif
41359 +#ifdef CONFIG_XEN
41360 +#ifdef CONFIG_BLK_DEV_INITRD
41361 + if (xen_start_info->mod_start) {
41362 + if (INITRD_START + INITRD_SIZE <= (end_pfn << PAGE_SHIFT)) {
41363 + /*reserve_bootmem_generic(INITRD_START, INITRD_SIZE);*/
41364 + initrd_start = INITRD_START + PAGE_OFFSET;
41365 + initrd_end = initrd_start+INITRD_SIZE;
41366 + initrd_below_start_ok = 1;
41367 + } else {
41368 + printk(KERN_ERR "initrd extends beyond end of memory "
41369 + "(0x%08lx > 0x%08lx)\ndisabling initrd\n",
41370 + (unsigned long)(INITRD_START + INITRD_SIZE),
41371 + (unsigned long)(end_pfn << PAGE_SHIFT));
41372 + initrd_start = 0;
41373 + }
41374 + }
41375 +#endif
41376 +#else /* CONFIG_XEN */
41377 +#ifdef CONFIG_BLK_DEV_INITRD
41378 + if (LOADER_TYPE && INITRD_START) {
41379 + if (INITRD_START + INITRD_SIZE <= (end_pfn << PAGE_SHIFT)) {
41380 + reserve_bootmem_generic(INITRD_START, INITRD_SIZE);
41381 + initrd_start =
41382 + INITRD_START ? INITRD_START + PAGE_OFFSET : 0;
41383 + initrd_end = initrd_start+INITRD_SIZE;
41384 + }
41385 + else {
41386 + printk(KERN_ERR "initrd extends beyond end of memory "
41387 + "(0x%08lx > 0x%08lx)\ndisabling initrd\n",
41388 + (unsigned long)(INITRD_START + INITRD_SIZE),
41389 + (unsigned long)(end_pfn << PAGE_SHIFT));
41390 + initrd_start = 0;
41391 + }
41392 + }
41393 +#endif
41394 +#endif /* !CONFIG_XEN */
41395 +#ifdef CONFIG_KEXEC
41396 +#ifdef CONFIG_XEN
41397 + xen_machine_kexec_setup_resources();
41398 +#else
41399 + if (crashk_res.start != crashk_res.end) {
41400 + reserve_bootmem(crashk_res.start,
41401 + crashk_res.end - crashk_res.start + 1);
41402 + }
41403 +#endif
41404 +#endif
41405 +
41406 + paging_init();
41407 +#ifdef CONFIG_X86_LOCAL_APIC
41408 + /*
41409 + * Find and reserve possible boot-time SMP configuration:
41410 + */
41411 + find_smp_config();
41412 +#endif
41413 +#ifdef CONFIG_XEN
41414 + {
41415 + int i, j, k, fpp;
41416 +
41417 + if (!xen_feature(XENFEAT_auto_translated_physmap)) {
41418 + /* Make sure we have a large enough P->M table. */
41419 + phys_to_machine_mapping = alloc_bootmem_pages(
41420 + end_pfn * sizeof(unsigned long));
41421 + memset(phys_to_machine_mapping, ~0,
41422 + end_pfn * sizeof(unsigned long));
41423 + memcpy(phys_to_machine_mapping,
41424 + (unsigned long *)xen_start_info->mfn_list,
41425 + xen_start_info->nr_pages * sizeof(unsigned long));
41426 + free_bootmem(
41427 + __pa(xen_start_info->mfn_list),
41428 + PFN_PHYS(PFN_UP(xen_start_info->nr_pages *
41429 + sizeof(unsigned long))));
41430 +
41431 + /*
41432 + * Initialise the list of the frames that specify the
41433 + * list of frames that make up the p2m table. Used by
41434 + * save/restore.
41435 + */
41436 + pfn_to_mfn_frame_list_list = alloc_bootmem_pages(PAGE_SIZE);
41437 + HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
41438 + virt_to_mfn(pfn_to_mfn_frame_list_list);
41439 +
41440 + fpp = PAGE_SIZE/sizeof(unsigned long);
41441 + for (i=0, j=0, k=-1; i< end_pfn; i+=fpp, j++) {
41442 + if ((j % fpp) == 0) {
41443 + k++;
41444 + BUG_ON(k>=fpp);
41445 + pfn_to_mfn_frame_list[k] =
41446 + alloc_bootmem_pages(PAGE_SIZE);
41447 + pfn_to_mfn_frame_list_list[k] =
41448 + virt_to_mfn(pfn_to_mfn_frame_list[k]);
41449 + j=0;
41450 + }
41451 + pfn_to_mfn_frame_list[k][j] =
41452 + virt_to_mfn(&phys_to_machine_mapping[i]);
41453 + }
41454 + HYPERVISOR_shared_info->arch.max_pfn = end_pfn;
41455 + }
41456 +
41457 + }
41458 +
41459 + if (!is_initial_xendomain()) {
41460 + acpi_disabled = 1;
41461 +#ifdef CONFIG_ACPI
41462 + acpi_ht = 0;
41463 +#endif
41464 + }
41465 +#endif
41466 +
41467 +#ifndef CONFIG_XEN
41468 + check_ioapic();
41469 +#endif
41470 +
41471 + zap_low_mappings(0);
41472 +
41473 +#ifdef CONFIG_ACPI
41474 + /*
41475 + * Initialize the ACPI boot-time table parser (gets the RSDP and SDT).
41476 + * Call this early for SRAT node setup.
41477 + */
41478 + acpi_boot_table_init();
41479 +
41480 + /*
41481 + * Read APIC and some other early information from ACPI tables.
41482 + */
41483 + acpi_boot_init();
41484 +#endif
41485 +
41486 + init_cpu_to_node();
41487 +
41488 +#ifdef CONFIG_X86_LOCAL_APIC
41489 + /*
41490 + * get boot-time SMP configuration:
41491 + */
41492 + if (smp_found_config)
41493 + get_smp_config();
41494 +#ifndef CONFIG_XEN
41495 + init_apic_mappings();
41496 +#endif
41497 +#endif
41498 +#if defined(CONFIG_XEN) && defined(CONFIG_SMP) && !defined(CONFIG_HOTPLUG_CPU)
41499 + prefill_possible_map();
41500 +#endif
41501 +
41502 + /*
41503 + * Request address space for all standard RAM and ROM resources
41504 + * and also for regions reported as reserved by the e820.
41505 + */
41506 + probe_roms();
41507 +#ifdef CONFIG_XEN
41508 + if (is_initial_xendomain()) {
41509 + memmap.nr_entries = E820MAX;
41510 + set_xen_guest_handle(memmap.buffer, machine_e820.map);
41511 +
41512 + if (HYPERVISOR_memory_op(XENMEM_machine_memory_map, &memmap))
41513 + BUG();
41514 + machine_e820.nr_map = memmap.nr_entries;
41515 +
41516 + e820_reserve_resources(machine_e820.map, machine_e820.nr_map);
41517 + }
41518 +#else
41519 + e820_reserve_resources(e820.map, e820.nr_map);
41520 +#endif
41521 +
41522 + request_resource(&iomem_resource, &video_ram_resource);
41523 +
41524 + {
41525 + unsigned i;
41526 + /* request I/O space for devices used on all i[345]86 PCs */
41527 + for (i = 0; i < STANDARD_IO_RESOURCES; i++)
41528 + request_resource(&ioport_resource, &standard_io_resources[i]);
41529 + }
41530 +
41531 +#ifdef CONFIG_XEN
41532 + if (is_initial_xendomain())
41533 + e820_setup_gap(machine_e820.map, machine_e820.nr_map);
41534 +#else
41535 + e820_setup_gap(e820.map, e820.nr_map);
41536 +#endif
41537 +
41538 +#ifdef CONFIG_GART_IOMMU
41539 + iommu_hole_init();
41540 +#endif
41541 +
41542 +#ifdef CONFIG_XEN
41543 + {
41544 + struct physdev_set_iopl set_iopl;
41545 +
41546 + set_iopl.iopl = 1;
41547 + HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
41548 +
41549 + if (is_initial_xendomain()) {
41550 +#ifdef CONFIG_VT
41551 +#if defined(CONFIG_VGA_CONSOLE)
41552 + conswitchp = &vga_con;
41553 +#elif defined(CONFIG_DUMMY_CONSOLE)
41554 + conswitchp = &dummy_con;
41555 +#endif
41556 +#endif
41557 + } else {
41558 +#if defined(CONFIG_VT) && defined(CONFIG_DUMMY_CONSOLE)
41559 + conswitchp = &dummy_con;
41560 +#endif
41561 + }
41562 + }
41563 + xencons_early_setup();
41564 +#else /* CONFIG_XEN */
41565 +
41566 +#ifdef CONFIG_VT
41567 +#if defined(CONFIG_VGA_CONSOLE)
41568 + conswitchp = &vga_con;
41569 +#elif defined(CONFIG_DUMMY_CONSOLE)
41570 + conswitchp = &dummy_con;
41571 +#endif
41572 +#endif
41573 +
41574 +#endif /* !CONFIG_XEN */
41575 +}
41576 +
41577 +#ifdef CONFIG_XEN
41578 +static int
41579 +xen_panic_event(struct notifier_block *this, unsigned long event, void *ptr)
41580 +{
41581 + HYPERVISOR_shutdown(SHUTDOWN_crash);
41582 + /* we're never actually going to get here... */
41583 + return NOTIFY_DONE;
41584 +}
41585 +#endif /* CONFIG_XEN */
41586 +
41587 +
41588 +static int __cpuinit get_model_name(struct cpuinfo_x86 *c)
41589 +{
41590 + unsigned int *v;
41591 +
41592 + if (c->extended_cpuid_level < 0x80000004)
41593 + return 0;
41594 +
41595 + v = (unsigned int *) c->x86_model_id;
41596 + cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]);
41597 + cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]);
41598 + cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]);
41599 + c->x86_model_id[48] = 0;
41600 + return 1;
41601 +}
41602 +
41603 +
41604 +static void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
41605 +{
41606 + unsigned int n, dummy, eax, ebx, ecx, edx;
41607 +
41608 + n = c->extended_cpuid_level;
41609 +
41610 + if (n >= 0x80000005) {
41611 + cpuid(0x80000005, &dummy, &ebx, &ecx, &edx);
41612 + printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n",
41613 + edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
41614 + c->x86_cache_size=(ecx>>24)+(edx>>24);
41615 + /* On K8 L1 TLB is inclusive, so don't count it */
41616 + c->x86_tlbsize = 0;
41617 + }
41618 +
41619 + if (n >= 0x80000006) {
41620 + cpuid(0x80000006, &dummy, &ebx, &ecx, &edx);
41621 + ecx = cpuid_ecx(0x80000006);
41622 + c->x86_cache_size = ecx >> 16;
41623 + c->x86_tlbsize += ((ebx >> 16) & 0xfff) + (ebx & 0xfff);
41624 +
41625 + printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n",
41626 + c->x86_cache_size, ecx & 0xFF);
41627 + }
41628 +
41629 + if (n >= 0x80000007)
41630 + cpuid(0x80000007, &dummy, &dummy, &dummy, &c->x86_power);
41631 + if (n >= 0x80000008) {
41632 + cpuid(0x80000008, &eax, &dummy, &dummy, &dummy);
41633 + c->x86_virt_bits = (eax >> 8) & 0xff;
41634 + c->x86_phys_bits = eax & 0xff;
41635 + }
41636 +}
41637 +
41638 +#ifdef CONFIG_NUMA
41639 +static int nearby_node(int apicid)
41640 +{
41641 + int i;
41642 + for (i = apicid - 1; i >= 0; i--) {
41643 + int node = apicid_to_node[i];
41644 + if (node != NUMA_NO_NODE && node_online(node))
41645 + return node;
41646 + }
41647 + for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) {
41648 + int node = apicid_to_node[i];
41649 + if (node != NUMA_NO_NODE && node_online(node))
41650 + return node;
41651 + }
41652 + return first_node(node_online_map); /* Shouldn't happen */
41653 +}
41654 +#endif
41655 +
41656 +/*
41657 + * On an AMD dual-core setup the lower bits of the APIC id distinguish the cores.
41658 + * Assumes number of cores is a power of two.
41659 + */
41660 +static void __init amd_detect_cmp(struct cpuinfo_x86 *c)
41661 +{
41662 +#ifdef CONFIG_SMP
41663 + int cpu = smp_processor_id();
41664 + unsigned bits;
41665 +#ifdef CONFIG_NUMA
41666 + int node = 0;
41667 + unsigned apicid = phys_proc_id[cpu];
41668 +#endif
41669 +
41670 + bits = 0;
41671 + while ((1 << bits) < c->x86_max_cores)
41672 + bits++;
41673 +
41674 + /* Low order bits define the core id (index of core in socket) */
41675 + cpu_core_id[cpu] = phys_proc_id[cpu] & ((1 << bits)-1);
41676 + /* Convert the APIC ID into the socket ID */
41677 + phys_proc_id[cpu] >>= bits;
41678 +
41679 +#ifdef CONFIG_NUMA
41680 + node = phys_proc_id[cpu];
41681 + if (apicid_to_node[apicid] != NUMA_NO_NODE)
41682 + node = apicid_to_node[apicid];
41683 + if (!node_online(node)) {
41684 + /* Two possibilities here:
41685 + - The CPU is missing memory and no node was created.
41686 + In that case try picking one from a nearby CPU
41687 + - The APIC IDs differ from the HyperTransport node IDs
41688 + which the K8 northbridge parsing fills in.
41689 + Assume they are all increased by a constant offset,
41690 + but in the same order as the HT nodeids.
41691 + If that doesn't result in a usable node fall back to the
41692 + path for the previous case. */
41693 + int ht_nodeid = apicid - (phys_proc_id[0] << bits);
41694 + if (ht_nodeid >= 0 &&
41695 + apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
41696 + node = apicid_to_node[ht_nodeid];
41697 + /* Pick a nearby node */
41698 + if (!node_online(node))
41699 + node = nearby_node(apicid);
41700 + }
41701 + numa_set_node(cpu, node);
41702 +
41703 + printk(KERN_INFO "CPU %d(%d) -> Node %d -> Core %d\n",
41704 + cpu, c->x86_max_cores, node, cpu_core_id[cpu]);
41705 +#endif
41706 +#endif
41707 +}
41708 +
41709 +static int __init init_amd(struct cpuinfo_x86 *c)
41710 +{
41711 + int r;
41712 + unsigned level;
41713 +
41714 +#ifdef CONFIG_SMP
41715 + unsigned long value;
41716 +
41717 + /*
41718 + * Disable TLB flush filter by setting HWCR.FFDIS on K8
41719 + * bit 6 of msr C001_0015
41720 + *
41721 + * Errata 63 for SH-B3 steppings
41722 + * Errata 122 for all steppings (F+ have it disabled by default)
41723 + */
41724 + if (c->x86 == 15) {
41725 + rdmsrl(MSR_K8_HWCR, value);
41726 + value |= 1 << 6;
41727 + wrmsrl(MSR_K8_HWCR, value);
41728 + }
41729 +#endif
41730 +
41731 + /* Bit 31 in normal CPUID used for nonstandard 3DNow ID;
41732 + 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway */
41733 + clear_bit(0*32+31, &c->x86_capability);
41734 +
41735 + /* On C+ stepping K8 rep microcode works well for copy/memset */
41736 + level = cpuid_eax(1);
41737 + if (c->x86 == 15 && ((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58))
41738 + set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability);
41739 +
41740 + /* Enable workaround for FXSAVE leak */
41741 + if (c->x86 >= 6)
41742 + set_bit(X86_FEATURE_FXSAVE_LEAK, &c->x86_capability);
41743 +
41744 + r = get_model_name(c);
41745 + if (!r) {
41746 + switch (c->x86) {
41747 + case 15:
41748 + /* Should distinguish Models here, but this is only
41749 + a fallback anyway. */
41750 + strcpy(c->x86_model_id, "Hammer");
41751 + break;
41752 + }
41753 + }
41754 + display_cacheinfo(c);
41755 +
41756 + /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */
41757 + if (c->x86_power & (1<<8))
41758 + set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability);
41759 +
41760 + if (c->extended_cpuid_level >= 0x80000008) {
41761 + c->x86_max_cores = (cpuid_ecx(0x80000008) & 0xff) + 1;
41762 + if (c->x86_max_cores & (c->x86_max_cores - 1))
41763 + c->x86_max_cores = 1;
41764 +
41765 + amd_detect_cmp(c);
41766 + }
41767 +
41768 + return r;
41769 +}
41770 +
41771 +static void __cpuinit detect_ht(struct cpuinfo_x86 *c)
41772 +{
41773 +#ifdef CONFIG_SMP
41774 + u32 eax, ebx, ecx, edx;
41775 + int index_msb, core_bits;
41776 + int cpu = smp_processor_id();
41777 +
41778 + cpuid(1, &eax, &ebx, &ecx, &edx);
41779 +
41780 + c->apicid = phys_pkg_id(0);
41781 +
41782 + if (!cpu_has(c, X86_FEATURE_HT) || cpu_has(c, X86_FEATURE_CMP_LEGACY))
41783 + return;
41784 +
41785 + smp_num_siblings = (ebx & 0xff0000) >> 16;
41786 +
41787 + if (smp_num_siblings == 1) {
41788 + printk(KERN_INFO "CPU: Hyper-Threading is disabled\n");
41789 + } else if (smp_num_siblings > 1 ) {
41790 +
41791 + if (smp_num_siblings > NR_CPUS) {
41792 + printk(KERN_WARNING "CPU: Unsupported number of siblings %d", smp_num_siblings);
41793 + smp_num_siblings = 1;
41794 + return;
41795 + }
41796 +
41797 + index_msb = get_count_order(smp_num_siblings);
41798 + phys_proc_id[cpu] = phys_pkg_id(index_msb);
41799 +
41800 + printk(KERN_INFO "CPU: Physical Processor ID: %d\n",
41801 + phys_proc_id[cpu]);
41802 +
41803 + smp_num_siblings = smp_num_siblings / c->x86_max_cores;
41804 +
41805 + index_msb = get_count_order(smp_num_siblings) ;
41806 +
41807 + core_bits = get_count_order(c->x86_max_cores);
41808 +
41809 + cpu_core_id[cpu] = phys_pkg_id(index_msb) &
41810 + ((1 << core_bits) - 1);
41811 +
41812 + if (c->x86_max_cores > 1)
41813 + printk(KERN_INFO "CPU: Processor Core ID: %d\n",
41814 + cpu_core_id[cpu]);
41815 + }
41816 +#endif
41817 +}
41818 +
41819 +/*
41820 + * find out the number of processor cores on the die
41821 + */
41822 +static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c)
41823 +{
41824 + unsigned int eax;
41825 +
41826 + if (c->cpuid_level < 4)
41827 + return 1;
41828 +
41829 + __asm__("cpuid"
41830 + : "=a" (eax)
41831 + : "0" (4), "c" (0)
41832 + : "bx", "dx");
41833 +
41834 + if (eax & 0x1f)
41835 + return ((eax >> 26) + 1);
41836 + else
41837 + return 1;
41838 +}
41839 +
41840 +static void srat_detect_node(void)
41841 +{
41842 +#ifdef CONFIG_NUMA
41843 + unsigned node;
41844 + int cpu = smp_processor_id();
41845 +
41846 + /* Don't do the funky fallback heuristics the AMD version employs
41847 + for now. */
41848 + node = apicid_to_node[hard_smp_processor_id()];
41849 + if (node == NUMA_NO_NODE)
41850 + node = 0;
41851 + numa_set_node(cpu, node);
41852 +
41853 + if (acpi_numa > 0)
41854 + printk(KERN_INFO "CPU %d -> Node %d\n", cpu, node);
41855 +#endif
41856 +}
41857 +
41858 +static void __cpuinit init_intel(struct cpuinfo_x86 *c)
41859 +{
41860 + /* Cache sizes */
41861 + unsigned n;
41862 +
41863 + init_intel_cacheinfo(c);
41864 + n = c->extended_cpuid_level;
41865 + if (n >= 0x80000008) {
41866 + unsigned eax = cpuid_eax(0x80000008);
41867 + c->x86_virt_bits = (eax >> 8) & 0xff;
41868 + c->x86_phys_bits = eax & 0xff;
41869 + /* CPUID workaround for Intel 0F34 CPU */
41870 + if (c->x86_vendor == X86_VENDOR_INTEL &&
41871 + c->x86 == 0xF && c->x86_model == 0x3 &&
41872 + c->x86_mask == 0x4)
41873 + c->x86_phys_bits = 36;
41874 + }
41875 +
41876 + if (c->x86 == 15)
41877 + c->x86_cache_alignment = c->x86_clflush_size * 2;
41878 + if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
41879 + (c->x86 == 0x6 && c->x86_model >= 0x0e))
41880 + set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability);
41881 + set_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
41882 + c->x86_max_cores = intel_num_cpu_cores(c);
41883 +
41884 + srat_detect_node();
41885 +}
41886 +
41887 +static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
41888 +{
41889 + char *v = c->x86_vendor_id;
41890 +
41891 + if (!strcmp(v, "AuthenticAMD"))
41892 + c->x86_vendor = X86_VENDOR_AMD;
41893 + else if (!strcmp(v, "GenuineIntel"))
41894 + c->x86_vendor = X86_VENDOR_INTEL;
41895 + else
41896 + c->x86_vendor = X86_VENDOR_UNKNOWN;
41897 +}
41898 +
41899 +struct cpu_model_info {
41900 + int vendor;
41901 + int family;
41902 + char *model_names[16];
41903 +};
41904 +
41905 +/* Do some early cpuid on the boot CPU to get some parameters that are
41906 + needed before check_bugs. Everything advanced is in identify_cpu
41907 + below. */
41908 +void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
41909 +{
41910 + u32 tfms;
41911 +
41912 + c->loops_per_jiffy = loops_per_jiffy;
41913 + c->x86_cache_size = -1;
41914 + c->x86_vendor = X86_VENDOR_UNKNOWN;
41915 + c->x86_model = c->x86_mask = 0; /* So far unknown... */
41916 + c->x86_vendor_id[0] = '\0'; /* Unset */
41917 + c->x86_model_id[0] = '\0'; /* Unset */
41918 + c->x86_clflush_size = 64;
41919 + c->x86_cache_alignment = c->x86_clflush_size;
41920 + c->x86_max_cores = 1;
41921 + c->extended_cpuid_level = 0;
41922 + memset(&c->x86_capability, 0, sizeof c->x86_capability);
41923 +
41924 + /* Get vendor name */
41925 + cpuid(0x00000000, (unsigned int *)&c->cpuid_level,
41926 + (unsigned int *)&c->x86_vendor_id[0],
41927 + (unsigned int *)&c->x86_vendor_id[8],
41928 + (unsigned int *)&c->x86_vendor_id[4]);
41929 +
41930 + get_cpu_vendor(c);
41931 +
41932 + /* Initialize the standard set of capabilities */
41933 + /* Note that the vendor-specific code below might override */
41934 +
41935 + /* Intel-defined flags: level 0x00000001 */
41936 + if (c->cpuid_level >= 0x00000001) {
41937 + __u32 misc;
41938 + cpuid(0x00000001, &tfms, &misc, &c->x86_capability[4],
41939 + &c->x86_capability[0]);
41940 + c->x86 = (tfms >> 8) & 0xf;
41941 + c->x86_model = (tfms >> 4) & 0xf;
41942 + c->x86_mask = tfms & 0xf;
41943 + if (c->x86 == 0xf)
41944 + c->x86 += (tfms >> 20) & 0xff;
41945 + if (c->x86 >= 0x6)
41946 + c->x86_model += ((tfms >> 16) & 0xF) << 4;
41947 + if (c->x86_capability[0] & (1<<19))
41948 + c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
41949 + } else {
41950 + /* Have CPUID level 0 only - unheard of */
41951 + c->x86 = 4;
41952 + }
41953 +
41954 +#ifdef CONFIG_SMP
41955 + phys_proc_id[smp_processor_id()] = (cpuid_ebx(1) >> 24) & 0xff;
41956 +#endif
41957 +}
41958 +
41959 +/*
41960 + * This does the hard work of actually picking apart the CPU stuff...
41961 + */
41962 +void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
41963 +{
41964 + int i;
41965 + u32 xlvl;
41966 +
41967 + early_identify_cpu(c);
41968 +
41969 + /* AMD-defined flags: level 0x80000001 */
41970 + xlvl = cpuid_eax(0x80000000);
41971 + c->extended_cpuid_level = xlvl;
41972 + if ((xlvl & 0xffff0000) == 0x80000000) {
41973 + if (xlvl >= 0x80000001) {
41974 + c->x86_capability[1] = cpuid_edx(0x80000001);
41975 + c->x86_capability[6] = cpuid_ecx(0x80000001);
41976 + }
41977 + if (xlvl >= 0x80000004)
41978 + get_model_name(c); /* Default name */
41979 + }
41980 +
41981 + /* Transmeta-defined flags: level 0x80860001 */
41982 + xlvl = cpuid_eax(0x80860000);
41983 + if ((xlvl & 0xffff0000) == 0x80860000) {
41984 + /* Don't set x86_cpuid_level here for now to not confuse. */
41985 + if (xlvl >= 0x80860001)
41986 + c->x86_capability[2] = cpuid_edx(0x80860001);
41987 + }
41988 +
41989 + /*
41990 + * Vendor-specific initialization. In this section we
41991 + * canonicalize the feature flags, meaning if there are
41992 + * features a certain CPU supports which CPUID doesn't
41993 + * tell us, CPUID claiming incorrect flags, or other bugs,
41994 + * we handle them here.
41995 + *
41996 + * At the end of this section, c->x86_capability better
41997 + * indicate the features this CPU genuinely supports!
41998 + */
41999 + switch (c->x86_vendor) {
42000 + case X86_VENDOR_AMD:
42001 + init_amd(c);
42002 + break;
42003 +
42004 + case X86_VENDOR_INTEL:
42005 + init_intel(c);
42006 + break;
42007 +
42008 + case X86_VENDOR_UNKNOWN:
42009 + default:
42010 + display_cacheinfo(c);
42011 + break;
42012 + }
42013 +
42014 + select_idle_routine(c);
42015 + detect_ht(c);
42016 +
42017 + /*
42018 + * On SMP, boot_cpu_data holds the common feature set between
42019 + * all CPUs; so make sure that we indicate which features are
42020 + * common between the CPUs. The first time this routine gets
42021 + * executed, c == &boot_cpu_data.
42022 + */
42023 + if (c != &boot_cpu_data) {
42024 + /* AND the already accumulated flags with these */
42025 + for (i = 0 ; i < NCAPINTS ; i++)
42026 + boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
42027 + }
42028 +
42029 +#ifdef CONFIG_X86_MCE
42030 + mcheck_init(c);
42031 +#endif
42032 + if (c == &boot_cpu_data)
42033 + mtrr_bp_init();
42034 + else
42035 + mtrr_ap_init();
42036 +#ifdef CONFIG_NUMA
42037 + numa_add_cpu(smp_processor_id());
42038 +#endif
42039 +}
42040 +
42041 +
42042 +void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
42043 +{
42044 + if (c->x86_model_id[0])
42045 + printk("%s", c->x86_model_id);
42046 +
42047 + if (c->x86_mask || c->cpuid_level >= 0)
42048 + printk(" stepping %02x\n", c->x86_mask);
42049 + else
42050 + printk("\n");
42051 +}
42052 +
42053 +/*
42054 + * Get CPU information for use by the procfs.
42055 + */
42056 +
42057 +static int show_cpuinfo(struct seq_file *m, void *v)
42058 +{
42059 + struct cpuinfo_x86 *c = v;
42060 +
42061 + /*
42062 + * These flag bits must match the definitions in <asm/cpufeature.h>.
42063 + * NULL means this bit is undefined or reserved; either way it doesn't
42064 + * have meaning as far as Linux is concerned. Note that it's important
42065 + * to realize there is a difference between this table and CPUID -- if
42066 + * applications want to get the raw CPUID data, they should access
42067 + * /dev/cpu/<cpu_nr>/cpuid instead.
42068 + */
42069 + static char *x86_cap_flags[] = {
42070 + /* Intel-defined */
42071 + "fpu", "vme", "de", "pse", "tsc", "msr", "pae", "mce",
42072 + "cx8", "apic", NULL, "sep", "mtrr", "pge", "mca", "cmov",
42073 + "pat", "pse36", "pn", "clflush", NULL, "dts", "acpi", "mmx",
42074 + "fxsr", "sse", "sse2", "ss", "ht", "tm", "ia64", NULL,
42075 +
42076 + /* AMD-defined */
42077 + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
42078 + NULL, NULL, NULL, "syscall", NULL, NULL, NULL, NULL,
42079 + NULL, NULL, NULL, NULL, "nx", NULL, "mmxext", NULL,
42080 + NULL, "fxsr_opt", "rdtscp", NULL, NULL, "lm", "3dnowext", "3dnow",
42081 +
42082 + /* Transmeta-defined */
42083 + "recovery", "longrun", NULL, "lrti", NULL, NULL, NULL, NULL,
42084 + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
42085 + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
42086 + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
42087 +
42088 + /* Other (Linux-defined) */
42089 + "cxmmx", NULL, "cyrix_arr", "centaur_mcr", NULL,
42090 + "constant_tsc", NULL, NULL,
42091 + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
42092 + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
42093 + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
42094 +
42095 + /* Intel-defined (#2) */
42096 + "pni", NULL, NULL, "monitor", "ds_cpl", "vmx", NULL, "est",
42097 + "tm2", NULL, "cid", NULL, NULL, "cx16", "xtpr", NULL,
42098 + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
42099 + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
42100 +
42101 + /* VIA/Cyrix/Centaur-defined */
42102 + NULL, NULL, "rng", "rng_en", NULL, NULL, "ace", "ace_en",
42103 + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
42104 + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
42105 + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
42106 +
42107 + /* AMD-defined (#2) */
42108 + "lahf_lm", "cmp_legacy", "svm", NULL, "cr8_legacy", NULL, NULL, NULL,
42109 + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
42110 + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
42111 + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
42112 + };
42113 + static char *x86_power_flags[] = {
42114 + "ts", /* temperature sensor */
42115 + "fid", /* frequency id control */
42116 + "vid", /* voltage id control */
42117 + "ttp", /* thermal trip */
42118 + "tm",
42119 + "stc",
42120 + NULL,
42121 + /* nothing */ /* constant_tsc - moved to flags */
42122 + };
42123 +
42124 +
42125 +#ifdef CONFIG_SMP
42126 + if (!cpu_online(c-cpu_data))
42127 + return 0;
42128 +#endif
42129 +
42130 + seq_printf(m,"processor\t: %u\n"
42131 + "vendor_id\t: %s\n"
42132 + "cpu family\t: %d\n"
42133 + "model\t\t: %d\n"
42134 + "model name\t: %s\n",
42135 + (unsigned)(c-cpu_data),
42136 + c->x86_vendor_id[0] ? c->x86_vendor_id : "unknown",
42137 + c->x86,
42138 + (int)c->x86_model,
42139 + c->x86_model_id[0] ? c->x86_model_id : "unknown");
42140 +
42141 + if (c->x86_mask || c->cpuid_level >= 0)
42142 + seq_printf(m, "stepping\t: %d\n", c->x86_mask);
42143 + else
42144 + seq_printf(m, "stepping\t: unknown\n");
42145 +
42146 + if (cpu_has(c,X86_FEATURE_TSC)) {
42147 + unsigned int freq = cpufreq_quick_get((unsigned)(c-cpu_data));
42148 + if (!freq)
42149 + freq = cpu_khz;
42150 + seq_printf(m, "cpu MHz\t\t: %u.%03u\n",
42151 + freq / 1000, (freq % 1000));
42152 + }
42153 +
42154 + /* Cache size */
42155 + if (c->x86_cache_size >= 0)
42156 + seq_printf(m, "cache size\t: %d KB\n", c->x86_cache_size);
42157 +
42158 +#ifdef CONFIG_SMP
42159 + if (smp_num_siblings * c->x86_max_cores > 1) {
42160 + int cpu = c - cpu_data;
42161 + seq_printf(m, "physical id\t: %d\n", phys_proc_id[cpu]);
42162 + seq_printf(m, "siblings\t: %d\n", cpus_weight(cpu_core_map[cpu]));
42163 + seq_printf(m, "core id\t\t: %d\n", cpu_core_id[cpu]);
42164 + seq_printf(m, "cpu cores\t: %d\n", c->booted_cores);
42165 + }
42166 +#endif
42167 +
42168 + seq_printf(m,
42169 + "fpu\t\t: yes\n"
42170 + "fpu_exception\t: yes\n"
42171 + "cpuid level\t: %d\n"
42172 + "wp\t\t: yes\n"
42173 + "flags\t\t:",
42174 + c->cpuid_level);
42175 +
42176 + {
42177 + int i;
42178 + for ( i = 0 ; i < 32*NCAPINTS ; i++ )
42179 + if ( test_bit(i, &c->x86_capability) &&
42180 + x86_cap_flags[i] != NULL )
42181 + seq_printf(m, " %s", x86_cap_flags[i]);
42182 + }
42183 +
42184 + seq_printf(m, "\nbogomips\t: %lu.%02lu\n",
42185 + c->loops_per_jiffy/(500000/HZ),
42186 + (c->loops_per_jiffy/(5000/HZ)) % 100);
42187 +
42188 + if (c->x86_tlbsize > 0)
42189 + seq_printf(m, "TLB size\t: %d 4K pages\n", c->x86_tlbsize);
42190 + seq_printf(m, "clflush size\t: %d\n", c->x86_clflush_size);
42191 + seq_printf(m, "cache_alignment\t: %d\n", c->x86_cache_alignment);
42192 +
42193 + seq_printf(m, "address sizes\t: %u bits physical, %u bits virtual\n",
42194 + c->x86_phys_bits, c->x86_virt_bits);
42195 +
42196 + seq_printf(m, "power management:");
42197 + {
42198 + unsigned i;
42199 + for (i = 0; i < 32; i++)
42200 + if (c->x86_power & (1 << i)) {
42201 + if (i < ARRAY_SIZE(x86_power_flags) &&
42202 + x86_power_flags[i])
42203 + seq_printf(m, "%s%s",
42204 + x86_power_flags[i][0]?" ":"",
42205 + x86_power_flags[i]);
42206 + else
42207 + seq_printf(m, " [%d]", i);
42208 + }
42209 + }
42210 +
42211 + seq_printf(m, "\n\n");
42212 +
42213 + return 0;
42214 +}
42215 +
42216 +static void *c_start(struct seq_file *m, loff_t *pos)
42217 +{
42218 + return *pos < NR_CPUS ? cpu_data + *pos : NULL;
42219 +}
42220 +
42221 +static void *c_next(struct seq_file *m, void *v, loff_t *pos)
42222 +{
42223 + ++*pos;
42224 + return c_start(m, pos);
42225 +}
42226 +
42227 +static void c_stop(struct seq_file *m, void *v)
42228 +{
42229 +}
42230 +
42231 +struct seq_operations cpuinfo_op = {
42232 + .start =c_start,
42233 + .next = c_next,
42234 + .stop = c_stop,
42235 + .show = show_cpuinfo,
42236 +};
42237 +
42238 +static int __init run_dmi_scan(void)
42239 +{
42240 + dmi_scan_machine();
42241 + return 0;
42242 +}
42243 +core_initcall(run_dmi_scan);
42244 +
42245 diff -Nur linux-2.6.16.33-noxen/arch/x86_64/kernel/setup64-xen.c linux-2.6.16.33/arch/x86_64/kernel/setup64-xen.c
42246 --- linux-2.6.16.33-noxen/arch/x86_64/kernel/setup64-xen.c 1970-01-01 00:00:00.000000000 +0000
42247 +++ linux-2.6.16.33/arch/x86_64/kernel/setup64-xen.c 2007-01-08 15:00:45.000000000 +0000
42248 @@ -0,0 +1,360 @@
42249 +/*
42250 + * X86-64 specific CPU setup.
42251 + * Copyright (C) 1995 Linus Torvalds
42252 + * Copyright 2001, 2002, 2003 SuSE Labs / Andi Kleen.
42253 + * See setup.c for older changelog.
42254 + * $Id: setup64.c,v 1.12 2002/03/21 10:09:17 ak Exp $
42255 + *
42256 + * Jun Nakajima <jun.nakajima@intel.com>
42257 + * Modified for Xen
42258 + *
42259 + */
42260 +#include <linux/config.h>
42261 +#include <linux/init.h>
42262 +#include <linux/kernel.h>
42263 +#include <linux/sched.h>
42264 +#include <linux/string.h>
42265 +#include <linux/bootmem.h>
42266 +#include <linux/bitops.h>
42267 +#include <linux/module.h>
42268 +#include <asm/bootsetup.h>
42269 +#include <asm/pda.h>
42270 +#include <asm/pgtable.h>
42271 +#include <asm/processor.h>
42272 +#include <asm/desc.h>
42273 +#include <asm/atomic.h>
42274 +#include <asm/mmu_context.h>
42275 +#include <asm/smp.h>
42276 +#include <asm/i387.h>
42277 +#include <asm/percpu.h>
42278 +#include <asm/proto.h>
42279 +#include <asm/sections.h>
42280 +#ifdef CONFIG_XEN
42281 +#include <asm/hypervisor.h>
42282 +#endif
42283 +
42284 +char x86_boot_params[BOOT_PARAM_SIZE] __initdata = {0,};
42285 +
42286 +cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
42287 +
42288 +struct x8664_pda *_cpu_pda[NR_CPUS] __read_mostly;
42289 +struct x8664_pda boot_cpu_pda[NR_CPUS] __cacheline_aligned;
42290 +
42291 +#ifndef CONFIG_X86_NO_IDT
42292 +struct desc_ptr idt_descr = { 256 * 16, (unsigned long) idt_table };
42293 +#endif
42294 +
42295 +char boot_cpu_stack[IRQSTACKSIZE] __attribute__((section(".bss.page_aligned")));
42296 +
42297 +unsigned long __supported_pte_mask __read_mostly = ~0UL;
42298 +static int do_not_nx __cpuinitdata = 0;
42299 +
42300 +/* noexec=on|off
42301 +Control non executable mappings for 64bit processes.
42302 +
42303 +on Enable(default)
42304 +off Disable
42305 +*/
42306 +int __init nonx_setup(char *str)
42307 +{
42308 + if (!strncmp(str, "on", 2)) {
42309 + __supported_pte_mask |= _PAGE_NX;
42310 + do_not_nx = 0;
42311 + } else if (!strncmp(str, "off", 3)) {
42312 + do_not_nx = 1;
42313 + __supported_pte_mask &= ~_PAGE_NX;
42314 + }
42315 + return 0;
42316 +}
42317 +__setup("noexec=", nonx_setup); /* parsed early actually */
42318 +
42319 +int force_personality32 = READ_IMPLIES_EXEC;
42320 +
42321 +/* noexec32=on|off
42322 +Control non-executable heap for 32bit processes.
42323 +To control the stack too use noexec=off
42324 +
42325 +on PROT_READ does not imply PROT_EXEC for 32bit processes
42326 +off PROT_READ implies PROT_EXEC (default)
42327 +*/
42328 +static int __init nonx32_setup(char *str)
42329 +{
42330 + if (!strcmp(str, "on"))
42331 + force_personality32 &= ~READ_IMPLIES_EXEC;
42332 + else if (!strcmp(str, "off"))
42333 + force_personality32 |= READ_IMPLIES_EXEC;
42334 + return 0;
42335 +}
42336 +__setup("noexec32=", nonx32_setup);
42337 +
42338 +/*
42339 + * Great future plan:
42340 + * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data.
42341 + * Always point %gs to its beginning
42342 + */
42343 +void __init setup_per_cpu_areas(void)
42344 +{
42345 + int i;
42346 + unsigned long size;
42347 +
42348 +#ifdef CONFIG_HOTPLUG_CPU
42349 + prefill_possible_map();
42350 +#endif
42351 +
42352 + /* Copy section for each CPU (we discard the original) */
42353 + size = ALIGN(__per_cpu_end - __per_cpu_start, SMP_CACHE_BYTES);
42354 +#ifdef CONFIG_MODULES
42355 + if (size < PERCPU_ENOUGH_ROOM)
42356 + size = PERCPU_ENOUGH_ROOM;
42357 +#endif
42358 +
42359 + for_each_cpu_mask (i, cpu_possible_map) {
42360 + char *ptr;
42361 +
42362 + if (!NODE_DATA(cpu_to_node(i))) {
42363 + printk("cpu with no node %d, num_online_nodes %d\n",
42364 + i, num_online_nodes());
42365 + ptr = alloc_bootmem(size);
42366 + } else {
42367 + ptr = alloc_bootmem_node(NODE_DATA(cpu_to_node(i)), size);
42368 + }
42369 + if (!ptr)
42370 + panic("Cannot allocate cpu data for CPU %d\n", i);
42371 + cpu_pda(i)->data_offset = ptr - __per_cpu_start;
42372 + memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
42373 + }
42374 +}
42375 +
42376 +#ifdef CONFIG_XEN
42377 +static void switch_pt(void)
42378 +{
42379 + xen_pt_switch(__pa(init_level4_pgt));
42380 + xen_new_user_pt(__pa(init_level4_user_pgt));
42381 +}
42382 +
42383 +void __cpuinit cpu_gdt_init(struct desc_ptr *gdt_descr)
42384 +{
42385 + unsigned long frames[16];
42386 + unsigned long va;
42387 + int f;
42388 +
42389 + for (va = gdt_descr->address, f = 0;
42390 + va < gdt_descr->address + gdt_descr->size;
42391 + va += PAGE_SIZE, f++) {
42392 + frames[f] = virt_to_mfn(va);
42393 + make_page_readonly(
42394 + (void *)va, XENFEAT_writable_descriptor_tables);
42395 + }
42396 + if (HYPERVISOR_set_gdt(frames, gdt_descr->size /
42397 + sizeof (struct desc_struct)))
42398 + BUG();
42399 +}
42400 +#else
42401 +static void switch_pt(void)
42402 +{
42403 + asm volatile("movq %0,%%cr3" :: "r" (__pa_symbol(&init_level4_pgt)));
42404 +}
42405 +
42406 +void __init cpu_gdt_init(struct desc_ptr *gdt_descr)
42407 +{
42408 + asm volatile("lgdt %0" :: "m" (*gdt_descr));
42409 + asm volatile("lidt %0" :: "m" (idt_descr));
42410 +}
42411 +#endif
42412 +
42413 +void pda_init(int cpu)
42414 +{
42415 + struct x8664_pda *pda = cpu_pda(cpu);
42416 +
42417 + /* Set up data that may be needed in __get_free_pages early */
42418 + asm volatile("movl %0,%%fs ; movl %0,%%gs" :: "r" (0));
42419 +#ifndef CONFIG_XEN
42420 + wrmsrl(MSR_GS_BASE, pda);
42421 +#else
42422 + HYPERVISOR_set_segment_base(SEGBASE_GS_KERNEL, (unsigned long)pda);
42423 +#endif
42424 + pda->cpunumber = cpu;
42425 + pda->irqcount = -1;
42426 + pda->kernelstack =
42427 + (unsigned long)stack_thread_info() - PDA_STACKOFFSET + THREAD_SIZE;
42428 + pda->active_mm = &init_mm;
42429 + pda->mmu_state = 0;
42430 +
42431 + if (cpu == 0) {
42432 +#ifdef CONFIG_XEN
42433 + xen_init_pt();
42434 +#endif
42435 + /* others are initialized in smpboot.c */
42436 + pda->pcurrent = &init_task;
42437 + pda->irqstackptr = boot_cpu_stack;
42438 + } else {
42439 + pda->irqstackptr = (char *)
42440 + __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER);
42441 + if (!pda->irqstackptr)
42442 + panic("cannot allocate irqstack for cpu %d", cpu);
42443 + }
42444 +
42445 + switch_pt();
42446 +
42447 + pda->irqstackptr += IRQSTACKSIZE-64;
42448 +}
42449 +
42450 +#ifndef CONFIG_X86_NO_TSS
42451 +char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]
42452 +__attribute__((section(".bss.page_aligned")));
42453 +#endif
42454 +
42455 +/* May not be marked __init: used by software suspend */
42456 +void syscall_init(void)
42457 +{
42458 +#ifndef CONFIG_XEN
42459 + /*
42460 + * LSTAR and STAR live in a somewhat strange symbiosis.
42461 + * They both write to the same internal register. STAR allows setting CS/DS
42462 + * but only a 32bit target. LSTAR sets the 64bit rip.
42463 + */
42464 + wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32);
42465 + wrmsrl(MSR_LSTAR, system_call);
42466 +
42467 + /* Flags to clear on syscall */
42468 + wrmsrl(MSR_SYSCALL_MASK, EF_TF|EF_DF|EF_IE|0x3000);
42469 +#endif
42470 +#ifdef CONFIG_IA32_EMULATION
42471 + syscall32_cpu_init ();
42472 +#endif
42473 +}
42474 +
42475 +void __cpuinit check_efer(void)
42476 +{
42477 + unsigned long efer;
42478 +
42479 + rdmsrl(MSR_EFER, efer);
42480 + if (!(efer & EFER_NX) || do_not_nx) {
42481 + __supported_pte_mask &= ~_PAGE_NX;
42482 + }
42483 +}
42484 +
42485 +/*
42486 + * cpu_init() initializes state that is per-CPU. Some data is already
42487 + * initialized (naturally) in the bootstrap process, such as the GDT
42488 + * and IDT. We reload them nevertheless; this function acts as a
42489 + * 'CPU state barrier', and nothing should get across.
42490 + * A lot of state is already set up in PDA init.
42491 + */
42492 +void __cpuinit cpu_init (void)
42493 +{
42494 + int cpu = stack_smp_processor_id();
42495 +#ifndef CONFIG_X86_NO_TSS
42496 + struct tss_struct *t = &per_cpu(init_tss, cpu);
42497 + unsigned long v;
42498 + char *estacks = NULL;
42499 + unsigned i;
42500 +#endif
42501 + struct task_struct *me;
42502 +
42503 + /* CPU 0 is initialised in head64.c */
42504 + if (cpu != 0) {
42505 + pda_init(cpu);
42506 + zap_low_mappings(cpu);
42507 + }
42508 +#ifndef CONFIG_X86_NO_TSS
42509 + else
42510 + estacks = boot_exception_stacks;
42511 +#endif
42512 +
42513 + me = current;
42514 +
42515 + if (cpu_test_and_set(cpu, cpu_initialized))
42516 + panic("CPU#%d already initialized!\n", cpu);
42517 +
42518 + printk("Initializing CPU#%d\n", cpu);
42519 +
42520 + clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
42521 +
42522 + /*
42523 + * Initialize the per-CPU GDT with the boot GDT,
42524 + * and set up the GDT descriptor:
42525 + */
42526 +#ifndef CONFIG_XEN
42527 + if (cpu)
42528 + memcpy(cpu_gdt(cpu), cpu_gdt_table, GDT_SIZE);
42529 +#endif
42530 +
42531 + cpu_gdt_descr[cpu].size = GDT_SIZE;
42532 + cpu_gdt_init(&cpu_gdt_descr[cpu]);
42533 +
42534 + memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8);
42535 + syscall_init();
42536 +
42537 + wrmsrl(MSR_FS_BASE, 0);
42538 + wrmsrl(MSR_KERNEL_GS_BASE, 0);
42539 + barrier();
42540 +
42541 + check_efer();
42542 +
42543 +#ifndef CONFIG_X86_NO_TSS
42544 + /*
42545 + * set up and load the per-CPU TSS
42546 + */
42547 + for (v = 0; v < N_EXCEPTION_STACKS; v++) {
42548 + if (cpu) {
42549 + static const unsigned int order[N_EXCEPTION_STACKS] = {
42550 + [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER,
42551 + [DEBUG_STACK - 1] = DEBUG_STACK_ORDER
42552 + };
42553 +
42554 + estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]);
42555 + if (!estacks)
42556 + panic("Cannot allocate exception stack %ld %d\n",
42557 + v, cpu);
42558 + }
42559 + switch (v + 1) {
42560 +#if DEBUG_STKSZ > EXCEPTION_STKSZ
42561 + case DEBUG_STACK:
42562 + cpu_pda[cpu].debugstack = (unsigned long)estacks;
42563 + estacks += DEBUG_STKSZ;
42564 + break;
42565 +#endif
42566 + default:
42567 + estacks += EXCEPTION_STKSZ;
42568 + break;
42569 + }
42570 + t->ist[v] = (unsigned long)estacks;
42571 + }
42572 +
42573 + t->io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
42574 + /*
42575 + * <= is required because the CPU will access up to
42576 + * 8 bits beyond the end of the IO permission bitmap.
42577 + */
42578 + for (i = 0; i <= IO_BITMAP_LONGS; i++)
42579 + t->io_bitmap[i] = ~0UL;
42580 +#endif
42581 +
42582 + atomic_inc(&init_mm.mm_count);
42583 + me->active_mm = &init_mm;
42584 + if (me->mm)
42585 + BUG();
42586 + enter_lazy_tlb(&init_mm, me);
42587 +
42588 +#ifndef CONFIG_X86_NO_TSS
42589 + set_tss_desc(cpu, t);
42590 +#endif
42591 +#ifndef CONFIG_XEN
42592 + load_TR_desc();
42593 +#endif
42594 + load_LDT(&init_mm.context);
42595 +
42596 + /*
42597 + * Clear all 6 debug registers:
42598 + */
42599 +
42600 + set_debug(0UL, 0);
42601 + set_debug(0UL, 1);
42602 + set_debug(0UL, 2);
42603 + set_debug(0UL, 3);
42604 + set_debug(0UL, 6);
42605 + set_debug(0UL, 7);
42606 +
42607 + fpu_init();
42608 +}
42609 diff -Nur linux-2.6.16.33-noxen/arch/x86_64/kernel/smp-xen.c linux-2.6.16.33/arch/x86_64/kernel/smp-xen.c
42610 --- linux-2.6.16.33-noxen/arch/x86_64/kernel/smp-xen.c 1970-01-01 00:00:00.000000000 +0000
42611 +++ linux-2.6.16.33/arch/x86_64/kernel/smp-xen.c 2007-01-08 15:00:45.000000000 +0000
42612 @@ -0,0 +1,596 @@
42613 +/*
42614 + * Intel SMP support routines.
42615 + *
42616 + * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
42617 + * (c) 1998-99, 2000 Ingo Molnar <mingo@redhat.com>
42618 + * (c) 2002,2003 Andi Kleen, SuSE Labs.
42619 + *
42620 + * This code is released under the GNU General Public License version 2 or
42621 + * later.
42622 + */
42623 +
42624 +#include <linux/init.h>
42625 +
42626 +#include <linux/mm.h>
42627 +#include <linux/delay.h>
42628 +#include <linux/spinlock.h>
42629 +#include <linux/smp_lock.h>
42630 +#include <linux/smp.h>
42631 +#include <linux/kernel_stat.h>
42632 +#include <linux/mc146818rtc.h>
42633 +#include <linux/interrupt.h>
42634 +
42635 +#include <asm/mtrr.h>
42636 +#include <asm/pgalloc.h>
42637 +#include <asm/tlbflush.h>
42638 +#include <asm/mach_apic.h>
42639 +#include <asm/mmu_context.h>
42640 +#include <asm/proto.h>
42641 +#include <asm/apicdef.h>
42642 +#include <asm/idle.h>
42643 +#ifdef CONFIG_XEN
42644 +#include <xen/evtchn.h>
42645 +#endif
42646 +
42647 +#ifndef CONFIG_XEN
42648 +/*
42649 + * Smarter SMP flushing macros.
42650 + * c/o Linus Torvalds.
42651 + *
42652 + * These mean you can really definitely utterly forget about
42653 + * writing to user space from interrupts. (It's not allowed anyway).
42654 + *
42655 + * Optimizations Manfred Spraul <manfred@colorfullife.com>
42656 + *
42657 + * More scalable flush, from Andi Kleen
42658 + *
42659 + * To avoid global state use 8 different call vectors.
42660 + * Each CPU uses a specific vector to trigger flushes on other
42661 + * CPUs. Depending on the received vector the target CPUs look into
42662 + * the right per cpu variable for the flush data.
42663 + *
42664 + * With more than 8 CPUs they are hashed to the 8 available
42665 + * vectors. The limited global vector space forces us to this right now.
42666 + * In future when interrupts are split into per CPU domains this could be
42667 + * fixed, at the cost of triggering multiple IPIs in some cases.
42668 + */
42669 +
42670 +union smp_flush_state {
42671 + struct {
42672 + cpumask_t flush_cpumask;
42673 + struct mm_struct *flush_mm;
42674 + unsigned long flush_va;
42675 +#define FLUSH_ALL -1ULL
42676 + spinlock_t tlbstate_lock;
42677 + };
42678 + char pad[SMP_CACHE_BYTES];
42679 +} ____cacheline_aligned;
42680 +
42681 +/* State is put into the per CPU data section, but padded
42682 + to a full cache line because other CPUs can access it and we don't
42683 + want false sharing in the per cpu data segment. */
42684 +static DEFINE_PER_CPU(union smp_flush_state, flush_state);
42685 +#endif
42686 +
42687 +/*
42688 + * We cannot call mmdrop() because we are in interrupt context,
42689 + * instead update mm->cpu_vm_mask.
42690 + */
42691 +static inline void leave_mm(unsigned long cpu)
42692 +{
42693 + if (read_pda(mmu_state) == TLBSTATE_OK)
42694 + BUG();
42695 + clear_bit(cpu, &read_pda(active_mm)->cpu_vm_mask);
42696 + load_cr3(swapper_pg_dir);
42697 +}
42698 +
42699 +#ifndef CONFIG_XEN
42700 +/*
42701 + *
42702 + * The flush IPI assumes that a thread switch happens in this order:
42703 + * [cpu0: the cpu that switches]
42704 + * 1) switch_mm() either 1a) or 1b)
42705 + * 1a) thread switch to a different mm
42706 + * 1a1) clear_bit(cpu, &old_mm->cpu_vm_mask);
42707 + * Stop ipi delivery for the old mm. This is not synchronized with
42708 + * the other cpus, but smp_invalidate_interrupt ignore flush ipis
42709 + * for the wrong mm, and in the worst case we perform a superfluous
42710 + * tlb flush.
42711 + * 1a2) set cpu mmu_state to TLBSTATE_OK
42712 + * Now the smp_invalidate_interrupt won't call leave_mm if cpu0
42713 + * was in lazy tlb mode.
42714 + * 1a3) update cpu active_mm
42715 + * Now cpu0 accepts tlb flushes for the new mm.
42716 + * 1a4) set_bit(cpu, &new_mm->cpu_vm_mask);
42717 + * Now the other cpus will send tlb flush ipis.
42718 + * 1a4) change cr3.
42719 + * 1b) thread switch without mm change
42720 + * cpu active_mm is correct, cpu0 already handles
42721 + * flush ipis.
42722 + * 1b1) set cpu mmu_state to TLBSTATE_OK
42723 + * 1b2) test_and_set the cpu bit in cpu_vm_mask.
42724 + * Atomically set the bit [other cpus will start sending flush ipis],
42725 + * and test the bit.
42726 + * 1b3) if the bit was 0: leave_mm was called, flush the tlb.
42727 + * 2) switch %%esp, ie current
42728 + *
42729 + * The interrupt must handle 2 special cases:
42730 + * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm.
42731 + * - the cpu performs speculative tlb reads, i.e. even if the cpu only
42732 + * runs in kernel space, the cpu could load tlb entries for user space
42733 + * pages.
42734 + *
42735 + * The good news is that cpu mmu_state is local to each cpu, no
42736 + * write/read ordering problems.
42737 + */
42738 +
42739 +/*
42740 + * TLB flush IPI:
42741 + *
42742 + * 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
42743 + * 2) Leave the mm if we are in the lazy tlb mode.
42744 + *
42745 + * Interrupts are disabled.
42746 + */
42747 +
42748 +asmlinkage void smp_invalidate_interrupt(struct pt_regs *regs)
42749 +{
42750 + int cpu;
42751 + int sender;
42752 + union smp_flush_state *f;
42753 +
42754 + cpu = smp_processor_id();
42755 + /*
42756 + * orig_rax contains the interrupt vector - 256.
42757 + * Use that to determine where the sender put the data.
42758 + */
42759 + sender = regs->orig_rax + 256 - INVALIDATE_TLB_VECTOR_START;
42760 + f = &per_cpu(flush_state, sender);
42761 +
42762 + if (!cpu_isset(cpu, f->flush_cpumask))
42763 + goto out;
42764 + /*
42765 + * This was a BUG() but until someone can quote me the
42766 + * line from the intel manual that guarantees an IPI to
42767 + * multiple CPUs is retried _only_ on the erroring CPUs,
42768 + * it's staying as a return
42769 + *
42770 + * BUG();
42771 + */
42772 +
42773 + if (f->flush_mm == read_pda(active_mm)) {
42774 + if (read_pda(mmu_state) == TLBSTATE_OK) {
42775 + if (f->flush_va == FLUSH_ALL)
42776 + local_flush_tlb();
42777 + else
42778 + __flush_tlb_one(f->flush_va);
42779 + } else
42780 + leave_mm(cpu);
42781 + }
42782 +out:
42783 + ack_APIC_irq();
42784 + cpu_clear(cpu, f->flush_cpumask);
42785 +}
42786 +
42787 +static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm,
42788 + unsigned long va)
42789 +{
42790 + int sender;
42791 + union smp_flush_state *f;
42792 +
42793 + /* Caller has disabled preemption */
42794 + sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS;
42795 + f = &per_cpu(flush_state, sender);
42796 +
42797 + /* Could avoid this lock when
42798 + num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is
42799 + probably not worth checking this for a cache-hot lock. */
42800 + spin_lock(&f->tlbstate_lock);
42801 +
42802 + f->flush_mm = mm;
42803 + f->flush_va = va;
42804 + cpus_or(f->flush_cpumask, cpumask, f->flush_cpumask);
42805 +
42806 + /*
42807 + * We have to send the IPI only to
42808 + * CPUs affected.
42809 + */
42810 + send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR_START + sender);
42811 +
42812 + while (!cpus_empty(f->flush_cpumask))
42813 + cpu_relax();
42814 +
42815 + f->flush_mm = NULL;
42816 + f->flush_va = 0;
42817 + spin_unlock(&f->tlbstate_lock);
42818 +}
42819 +
42820 +int __cpuinit init_smp_flush(void)
42821 +{
42822 + int i;
42823 + for_each_cpu_mask(i, cpu_possible_map) {
42824 + spin_lock_init(&per_cpu(flush_state.tlbstate_lock, i));
42825 + }
42826 + return 0;
42827 +}
42828 +
42829 +core_initcall(init_smp_flush);
42830 +
42831 +void flush_tlb_current_task(void)
42832 +{
42833 + struct mm_struct *mm = current->mm;
42834 + cpumask_t cpu_mask;
42835 +
42836 + preempt_disable();
42837 + cpu_mask = mm->cpu_vm_mask;
42838 + cpu_clear(smp_processor_id(), cpu_mask);
42839 +
42840 + local_flush_tlb();
42841 + if (!cpus_empty(cpu_mask))
42842 + flush_tlb_others(cpu_mask, mm, FLUSH_ALL);
42843 + preempt_enable();
42844 +}
42845 +
42846 +void flush_tlb_mm (struct mm_struct * mm)
42847 +{
42848 + cpumask_t cpu_mask;
42849 +
42850 + preempt_disable();
42851 + cpu_mask = mm->cpu_vm_mask;
42852 + cpu_clear(smp_processor_id(), cpu_mask);
42853 +
42854 + if (current->active_mm == mm) {
42855 + if (current->mm)
42856 + local_flush_tlb();
42857 + else
42858 + leave_mm(smp_processor_id());
42859 + }
42860 + if (!cpus_empty(cpu_mask))
42861 + flush_tlb_others(cpu_mask, mm, FLUSH_ALL);
42862 +
42863 + preempt_enable();
42864 +}
42865 +
42866 +void flush_tlb_page(struct vm_area_struct * vma, unsigned long va)
42867 +{
42868 + struct mm_struct *mm = vma->vm_mm;
42869 + cpumask_t cpu_mask;
42870 +
42871 + preempt_disable();
42872 + cpu_mask = mm->cpu_vm_mask;
42873 + cpu_clear(smp_processor_id(), cpu_mask);
42874 +
42875 + if (current->active_mm == mm) {
42876 + if(current->mm)
42877 + __flush_tlb_one(va);
42878 + else
42879 + leave_mm(smp_processor_id());
42880 + }
42881 +
42882 + if (!cpus_empty(cpu_mask))
42883 + flush_tlb_others(cpu_mask, mm, va);
42884 +
42885 + preempt_enable();
42886 +}
42887 +
42888 +static void do_flush_tlb_all(void* info)
42889 +{
42890 + unsigned long cpu = smp_processor_id();
42891 +
42892 + __flush_tlb_all();
42893 + if (read_pda(mmu_state) == TLBSTATE_LAZY)
42894 + leave_mm(cpu);
42895 +}
42896 +
42897 +void flush_tlb_all(void)
42898 +{
42899 + on_each_cpu(do_flush_tlb_all, NULL, 1, 1);
42900 +}
42901 +#else
42902 +asmlinkage void smp_invalidate_interrupt (void)
42903 +{ return; }
42904 +void flush_tlb_current_task(void)
42905 +{ xen_tlb_flush_mask(&current->mm->cpu_vm_mask); }
42906 +void flush_tlb_mm (struct mm_struct * mm)
42907 +{ xen_tlb_flush_mask(&mm->cpu_vm_mask); }
42908 +void flush_tlb_page(struct vm_area_struct * vma, unsigned long va)
42909 +{ xen_invlpg_mask(&vma->vm_mm->cpu_vm_mask, va); }
42910 +void flush_tlb_all(void)
42911 +{ xen_tlb_flush_all(); }
42912 +#endif /* Xen */
42913 +
42914 +/*
42915 + * this function sends a 'reschedule' IPI to another CPU.
42916 + * it goes straight through and wastes no time serializing
42917 + * anything. Worst case is that we lose a reschedule ...
42918 + */
42919 +
42920 +void smp_send_reschedule(int cpu)
42921 +{
42922 + send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR);
42923 +}
42924 +
42925 +/*
42926 + * Structure and data for smp_call_function(). This is designed to minimise
42927 + * static memory requirements. It also looks cleaner.
42928 + */
42929 +static DEFINE_SPINLOCK(call_lock);
42930 +
42931 +struct call_data_struct {
42932 + void (*func) (void *info);
42933 + void *info;
42934 + atomic_t started;
42935 + atomic_t finished;
42936 + int wait;
42937 +};
42938 +
42939 +static struct call_data_struct * call_data;
42940 +
42941 +void lock_ipi_call_lock(void)
42942 +{
42943 + spin_lock_irq(&call_lock);
42944 +}
42945 +
42946 +void unlock_ipi_call_lock(void)
42947 +{
42948 + spin_unlock_irq(&call_lock);
42949 +}
42950 +
42951 +/*
42952 + * this function sends a 'generic call function' IPI to one other CPU
42953 + * in the system.
42954 + *
42955 + * cpu is a standard Linux logical CPU number.
42956 + */
42957 +static void
42958 +__smp_call_function_single(int cpu, void (*func) (void *info), void *info,
42959 + int nonatomic, int wait)
42960 +{
42961 + struct call_data_struct data;
42962 + int cpus = 1;
42963 +
42964 + data.func = func;
42965 + data.info = info;
42966 + atomic_set(&data.started, 0);
42967 + data.wait = wait;
42968 + if (wait)
42969 + atomic_set(&data.finished, 0);
42970 +
42971 + call_data = &data;
42972 + wmb();
42973 + /* Send a message to all other CPUs and wait for them to respond */
42974 + send_IPI_mask(cpumask_of_cpu(cpu), CALL_FUNCTION_VECTOR);
42975 +
42976 + /* Wait for response */
42977 + while (atomic_read(&data.started) != cpus)
42978 + cpu_relax();
42979 +
42980 + if (!wait)
42981 + return;
42982 +
42983 + while (atomic_read(&data.finished) != cpus)
42984 + cpu_relax();
42985 +}
42986 +
42987 +/*
42988 + * smp_call_function_single - Run a function on another CPU
42989 + * @func: The function to run. This must be fast and non-blocking.
42990 + * @info: An arbitrary pointer to pass to the function.
42991 + * @nonatomic: Currently unused.
42992 + * @wait: If true, wait until function has completed on other CPUs.
42993 + *
42994 + * Returns 0 on success, else a negative status code.
42995 + *
42996 + * Does not return until the remote CPU is nearly ready to execute <func>
42997 + * or is executing or has already executed it.
42998 + */
42999 +
43000 +int smp_call_function_single (int cpu, void (*func) (void *info), void *info,
43001 + int nonatomic, int wait)
43002 +{
43003 + /* prevent preemption and reschedule on another processor */
43004 + int me = get_cpu();
43005 + if (cpu == me) {
43006 + WARN_ON(1);
43007 + put_cpu();
43008 + return -EBUSY;
43009 + }
43010 + spin_lock_bh(&call_lock);
43011 + __smp_call_function_single(cpu, func, info, nonatomic, wait);
43012 + spin_unlock_bh(&call_lock);
43013 + put_cpu();
43014 + return 0;
43015 +}
43016 +
43017 +/*
43018 + * this function sends a 'generic call function' IPI to all other CPUs
43019 + * in the system.
43020 + */
43021 +static void __smp_call_function (void (*func) (void *info), void *info,
43022 + int nonatomic, int wait)
43023 +{
43024 + struct call_data_struct data;
43025 + int cpus = num_online_cpus()-1;
43026 +
43027 + if (!cpus)
43028 + return;
43029 +
43030 + data.func = func;
43031 + data.info = info;
43032 + atomic_set(&data.started, 0);
43033 + data.wait = wait;
43034 + if (wait)
43035 + atomic_set(&data.finished, 0);
43036 +
43037 + call_data = &data;
43038 + wmb();
43039 + /* Send a message to all other CPUs and wait for them to respond */
43040 + send_IPI_allbutself(CALL_FUNCTION_VECTOR);
43041 +
43042 + /* Wait for response */
43043 + while (atomic_read(&data.started) != cpus)
43044 +#ifndef CONFIG_XEN
43045 + cpu_relax();
43046 +#else
43047 + barrier();
43048 +#endif
43049 +
43050 + if (!wait)
43051 + return;
43052 +
43053 + while (atomic_read(&data.finished) != cpus)
43054 +#ifndef CONFIG_XEN
43055 + cpu_relax();
43056 +#else
43057 + barrier();
43058 +#endif
43059 +}
43060 +
43061 +/*
43062 + * smp_call_function - run a function on all other CPUs.
43063 + * @func: The function to run. This must be fast and non-blocking.
43064 + * @info: An arbitrary pointer to pass to the function.
43065 + * @nonatomic: currently unused.
43066 + * @wait: If true, wait (atomically) until function has completed on other
43067 + * CPUs.
43068 + *
43069 + * Returns 0 on success, else a negative status code. Does not return until
43070 + * remote CPUs are nearly ready to execute func, or are executing or have already executed it.
43071 + *
43072 + * You must not call this function with disabled interrupts or from a
43073 + * hardware interrupt handler or from a bottom half handler.
43074 + * Actually there are a few legal cases, like panic.
43075 + */
43076 +int smp_call_function (void (*func) (void *info), void *info, int nonatomic,
43077 + int wait)
43078 +{
43079 + spin_lock(&call_lock);
43080 + __smp_call_function(func,info,nonatomic,wait);
43081 + spin_unlock(&call_lock);
43082 + return 0;
43083 +}
43084 +
43085 +void smp_stop_cpu(void)
43086 +{
43087 + unsigned long flags;
43088 + /*
43089 + * Remove this CPU:
43090 + */
43091 + cpu_clear(smp_processor_id(), cpu_online_map);
43092 + local_irq_save(flags);
43093 +#ifndef CONFIG_XEN
43094 + disable_local_APIC();
43095 +#endif
43096 + local_irq_restore(flags);
43097 +}
43098 +
43099 +static void smp_really_stop_cpu(void *dummy)
43100 +{
43101 + smp_stop_cpu();
43102 + for (;;)
43103 + halt();
43104 +}
43105 +
43106 +void smp_send_stop(void)
43107 +{
43108 + int nolock = 0;
43109 +#ifndef CONFIG_XEN
43110 + if (reboot_force)
43111 + return;
43112 +#endif
43113 + /* Don't deadlock on the call lock in panic */
43114 + if (!spin_trylock(&call_lock)) {
43115 + /* ignore locking because we have panicked anyway */
43116 + nolock = 1;
43117 + }
43118 + __smp_call_function(smp_really_stop_cpu, NULL, 0, 0);
43119 + if (!nolock)
43120 + spin_unlock(&call_lock);
43121 +
43122 + local_irq_disable();
43123 +#ifndef CONFIG_XEN
43124 + disable_local_APIC();
43125 +#endif
43126 + local_irq_enable();
43127 +}
43128 +
43129 +/*
43130 + * Reschedule call back. Nothing to do,
43131 + * all the work is done automatically when
43132 + * we return from the interrupt.
43133 + */
43134 +#ifndef CONFIG_XEN
43135 +asmlinkage void smp_reschedule_interrupt(void)
43136 +#else
43137 +asmlinkage irqreturn_t smp_reschedule_interrupt(void)
43138 +#endif
43139 +{
43140 +#ifndef CONFIG_XEN
43141 + ack_APIC_irq();
43142 +#else
43143 + return IRQ_HANDLED;
43144 +#endif
43145 +}
43146 +
43147 +#ifndef CONFIG_XEN
43148 +asmlinkage void smp_call_function_interrupt(void)
43149 +#else
43150 +asmlinkage irqreturn_t smp_call_function_interrupt(void)
43151 +#endif
43152 +{
43153 + void (*func) (void *info) = call_data->func;
43154 + void *info = call_data->info;
43155 + int wait = call_data->wait;
43156 +
43157 +#ifndef CONFIG_XEN
43158 + ack_APIC_irq();
43159 +#endif
43160 + /*
43161 + * Notify initiating CPU that I've grabbed the data and am
43162 + * about to execute the function
43163 + */
43164 + mb();
43165 + atomic_inc(&call_data->started);
43166 + /*
43167 + * At this point the info structure may be out of scope unless wait==1
43168 + */
43169 + exit_idle();
43170 + irq_enter();
43171 + (*func)(info);
43172 + irq_exit();
43173 + if (wait) {
43174 + mb();
43175 + atomic_inc(&call_data->finished);
43176 + }
43177 +#ifdef CONFIG_XEN
43178 + return IRQ_HANDLED;
43179 +#endif
43180 +}
43181 +
43182 +int safe_smp_processor_id(void)
43183 +{
43184 +#ifdef CONFIG_XEN
43185 + return smp_processor_id();
43186 +#else
43187 + int apicid, i;
43188 +
43189 + if (disable_apic)
43190 + return 0;
43191 +
43192 + apicid = hard_smp_processor_id();
43193 + if (x86_cpu_to_apicid[apicid] == apicid)
43194 + return apicid;
43195 +
43196 + for (i = 0; i < NR_CPUS; ++i) {
43197 + if (x86_cpu_to_apicid[i] == apicid)
43198 + return i;
43199 + }
43200 +
43201 + /* No entries in x86_cpu_to_apicid? Either no MPS|ACPI,
43202 + * or called too early. Either way, we must be CPU 0. */
43203 + if (x86_cpu_to_apicid[0] == BAD_APICID)
43204 + return 0;
43205 +
43206 + return 0; /* Should not happen */
43207 +#endif
43208 +}
43209 diff -Nur linux-2.6.16.33-noxen/arch/x86_64/kernel/smp.c linux-2.6.16.33/arch/x86_64/kernel/smp.c
43210 --- linux-2.6.16.33-noxen/arch/x86_64/kernel/smp.c 2006-11-22 18:06:31.000000000 +0000
43211 +++ linux-2.6.16.33/arch/x86_64/kernel/smp.c 2007-05-23 21:00:01.000000000 +0000
43212 @@ -135,10 +135,10 @@
43213
43214 cpu = smp_processor_id();
43215 /*
43216 - * orig_rax contains the interrupt vector - 256.
43217 + * orig_rax contains the negated interrupt vector.
43218 * Use that to determine where the sender put the data.
43219 */
43220 - sender = regs->orig_rax + 256 - INVALIDATE_TLB_VECTOR_START;
43221 + sender = ~regs->orig_rax - INVALIDATE_TLB_VECTOR_START;
43222 f = &per_cpu(flush_state, sender);
43223
43224 if (!cpu_isset(cpu, f->flush_cpumask))
43225 diff -Nur linux-2.6.16.33-noxen/arch/x86_64/kernel/traps-xen.c linux-2.6.16.33/arch/x86_64/kernel/traps-xen.c
43226 --- linux-2.6.16.33-noxen/arch/x86_64/kernel/traps-xen.c 1970-01-01 00:00:00.000000000 +0000
43227 +++ linux-2.6.16.33/arch/x86_64/kernel/traps-xen.c 2007-01-08 15:00:45.000000000 +0000
43228 @@ -0,0 +1,1028 @@
43229 +/*
43230 + * linux/arch/x86-64/traps.c
43231 + *
43232 + * Copyright (C) 1991, 1992 Linus Torvalds
43233 + * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
43234 + *
43235 + * Pentium III FXSR, SSE support
43236 + * Gareth Hughes <gareth@valinux.com>, May 2000
43237 + *
43238 + * $Id: traps.c,v 1.36 2002/03/24 11:09:10 ak Exp $
43239 + */
43240 +
43241 +/*
43242 + * 'Traps.c' handles hardware traps and faults after we have saved some
43243 + * state in 'entry.S'.
43244 + */
43245 +#include <linux/config.h>
43246 +#include <linux/sched.h>
43247 +#include <linux/kernel.h>
43248 +#include <linux/string.h>
43249 +#include <linux/errno.h>
43250 +#include <linux/ptrace.h>
43251 +#include <linux/timer.h>
43252 +#include <linux/mm.h>
43253 +#include <linux/init.h>
43254 +#include <linux/delay.h>
43255 +#include <linux/spinlock.h>
43256 +#include <linux/interrupt.h>
43257 +#include <linux/module.h>
43258 +#include <linux/moduleparam.h>
43259 +#include <linux/nmi.h>
43260 +#include <linux/kprobes.h>
43261 +#include <linux/kexec.h>
43262 +
43263 +#include <asm/system.h>
43264 +#include <asm/uaccess.h>
43265 +#include <asm/io.h>
43266 +#include <asm/atomic.h>
43267 +#include <asm/debugreg.h>
43268 +#include <asm/desc.h>
43269 +#include <asm/i387.h>
43270 +#include <asm/kdebug.h>
43271 +#include <asm/processor.h>
43272 +
43273 +#include <asm/smp.h>
43274 +#include <asm/pgalloc.h>
43275 +#include <asm/pda.h>
43276 +#include <asm/proto.h>
43277 +#include <asm/nmi.h>
43278 +
43279 +#ifndef CONFIG_X86_NO_IDT
43280 +extern struct gate_struct idt_table[256];
43281 +#endif
43282 +
43283 +asmlinkage void divide_error(void);
43284 +asmlinkage void debug(void);
43285 +asmlinkage void nmi(void);
43286 +asmlinkage void int3(void);
43287 +asmlinkage void overflow(void);
43288 +asmlinkage void bounds(void);
43289 +asmlinkage void invalid_op(void);
43290 +asmlinkage void device_not_available(void);
43291 +asmlinkage void double_fault(void);
43292 +asmlinkage void coprocessor_segment_overrun(void);
43293 +asmlinkage void invalid_TSS(void);
43294 +asmlinkage void segment_not_present(void);
43295 +asmlinkage void stack_segment(void);
43296 +asmlinkage void general_protection(void);
43297 +asmlinkage void page_fault(void);
43298 +asmlinkage void coprocessor_error(void);
43299 +asmlinkage void simd_coprocessor_error(void);
43300 +asmlinkage void reserved(void);
43301 +asmlinkage void alignment_check(void);
43302 +asmlinkage void machine_check(void);
43303 +asmlinkage void spurious_interrupt_bug(void);
43304 +
43305 +struct notifier_block *die_chain;
43306 +static DEFINE_SPINLOCK(die_notifier_lock);
43307 +
43308 +int register_die_notifier(struct notifier_block *nb)
43309 +{
43310 + int err = 0;
43311 + unsigned long flags;
43312 + spin_lock_irqsave(&die_notifier_lock, flags);
43313 + err = notifier_chain_register(&die_chain, nb);
43314 + spin_unlock_irqrestore(&die_notifier_lock, flags);
43315 + return err;
43316 +}
43317 +
43318 +static inline void conditional_sti(struct pt_regs *regs)
43319 +{
43320 + if (regs->eflags & X86_EFLAGS_IF)
43321 + local_irq_enable();
43322 +}
43323 +
43324 +static inline void preempt_conditional_sti(struct pt_regs *regs)
43325 +{
43326 + preempt_disable();
43327 + if (regs->eflags & X86_EFLAGS_IF)
43328 + local_irq_enable();
43329 +}
43330 +
43331 +static inline void preempt_conditional_cli(struct pt_regs *regs)
43332 +{
43333 + if (regs->eflags & X86_EFLAGS_IF)
43334 + local_irq_disable();
43335 + preempt_enable_no_resched();
43336 +}
43337 +
43338 +static int kstack_depth_to_print = 10;
43339 +
43340 +#ifdef CONFIG_KALLSYMS
43341 +#include <linux/kallsyms.h>
43342 +int printk_address(unsigned long address)
43343 +{
43344 + unsigned long offset = 0, symsize;
43345 + const char *symname;
43346 + char *modname;
43347 + char *delim = ":";
43348 + char namebuf[128];
43349 +
43350 + symname = kallsyms_lookup(address, &symsize, &offset, &modname, namebuf);
43351 + if (!symname)
43352 + return printk("[<%016lx>]", address);
43353 + if (!modname)
43354 + modname = delim = "";
43355 + return printk("<%016lx>{%s%s%s%s%+ld}",
43356 + address,delim,modname,delim,symname,offset);
43357 +}
43358 +#else
43359 +int printk_address(unsigned long address)
43360 +{
43361 + return printk("[<%016lx>]", address);
43362 +}
43363 +#endif
43364 +
43365 +static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
43366 + unsigned *usedp, const char **idp)
43367 +{
43368 +#ifndef CONFIG_X86_NO_TSS
43369 + static char ids[][8] = {
43370 + [DEBUG_STACK - 1] = "#DB",
43371 + [NMI_STACK - 1] = "NMI",
43372 + [DOUBLEFAULT_STACK - 1] = "#DF",
43373 + [STACKFAULT_STACK - 1] = "#SS",
43374 + [MCE_STACK - 1] = "#MC",
43375 +#if DEBUG_STKSZ > EXCEPTION_STKSZ
43376 + [N_EXCEPTION_STACKS ... N_EXCEPTION_STACKS + DEBUG_STKSZ / EXCEPTION_STKSZ - 2] = "#DB[?]"
43377 +#endif
43378 + };
43379 + unsigned k;
43380 +
43381 + for (k = 0; k < N_EXCEPTION_STACKS; k++) {
43382 + unsigned long end;
43383 +
43384 + switch (k + 1) {
43385 +#if DEBUG_STKSZ > EXCEPTION_STKSZ
43386 + case DEBUG_STACK:
43387 + end = cpu_pda(cpu)->debugstack + DEBUG_STKSZ;
43388 + break;
43389 +#endif
43390 + default:
43391 + end = per_cpu(init_tss, cpu).ist[k];
43392 + break;
43393 + }
43394 + if (stack >= end)
43395 + continue;
43396 + if (stack >= end - EXCEPTION_STKSZ) {
43397 + if (*usedp & (1U << k))
43398 + break;
43399 + *usedp |= 1U << k;
43400 + *idp = ids[k];
43401 + return (unsigned long *)end;
43402 + }
43403 +#if DEBUG_STKSZ > EXCEPTION_STKSZ
43404 + if (k == DEBUG_STACK - 1 && stack >= end - DEBUG_STKSZ) {
43405 + unsigned j = N_EXCEPTION_STACKS - 1;
43406 +
43407 + do {
43408 + ++j;
43409 + end -= EXCEPTION_STKSZ;
43410 + ids[j][4] = '1' + (j - N_EXCEPTION_STACKS);
43411 + } while (stack < end - EXCEPTION_STKSZ);
43412 + if (*usedp & (1U << j))
43413 + break;
43414 + *usedp |= 1U << j;
43415 + *idp = ids[j];
43416 + return (unsigned long *)end;
43417 + }
43418 +#endif
43419 + }
43420 +#endif
43421 + return NULL;
43422 +}
43423 +
43424 +/*
43425 + * x86-64 can have up to three kernel stacks:
43426 + * process stack
43427 + * interrupt stack
43428 + * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack
43429 + */
43430 +
43431 +void show_trace(unsigned long *stack)
43432 +{
43433 + const unsigned cpu = safe_smp_processor_id();
43434 + unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr;
43435 + int i;
43436 + unsigned used = 0;
43437 +
43438 + printk("\nCall Trace:");
43439 +
43440 +#define HANDLE_STACK(cond) \
43441 + do while (cond) { \
43442 + unsigned long addr = *stack++; \
43443 + if (kernel_text_address(addr)) { \
43444 + if (i > 50) { \
43445 + printk("\n "); \
43446 + i = 0; \
43447 + } \
43448 + else \
43449 + i += printk(" "); \
43450 + /* \
43451 + * If the address is either in the text segment of the \
43452 + * kernel, or in the region which contains vmalloc'ed \
43453 + * memory, it *may* be the address of a calling \
43454 + * routine; if so, print it so that someone tracing \
43455 + * down the cause of the crash will be able to figure \
43456 + * out the call path that was taken. \
43457 + */ \
43458 + i += printk_address(addr); \
43459 + } \
43460 + } while (0)
43461 +
43462 + for(i = 11; ; ) {
43463 + const char *id;
43464 + unsigned long *estack_end;
43465 + estack_end = in_exception_stack(cpu, (unsigned long)stack,
43466 + &used, &id);
43467 +
43468 + if (estack_end) {
43469 + i += printk(" <%s>", id);
43470 + HANDLE_STACK (stack < estack_end);
43471 + i += printk(" <EOE>");
43472 + stack = (unsigned long *) estack_end[-2];
43473 + continue;
43474 + }
43475 + if (irqstack_end) {
43476 + unsigned long *irqstack;
43477 + irqstack = irqstack_end -
43478 + (IRQSTACKSIZE - 64) / sizeof(*irqstack);
43479 +
43480 + if (stack >= irqstack && stack < irqstack_end) {
43481 + i += printk(" <IRQ>");
43482 + HANDLE_STACK (stack < irqstack_end);
43483 + stack = (unsigned long *) (irqstack_end[-1]);
43484 + irqstack_end = NULL;
43485 + i += printk(" <EOI>");
43486 + continue;
43487 + }
43488 + }
43489 + break;
43490 + }
43491 +
43492 + HANDLE_STACK (((long) stack & (THREAD_SIZE-1)) != 0);
43493 +#undef HANDLE_STACK
43494 + printk("\n");
43495 +}
43496 +
43497 +void show_stack(struct task_struct *tsk, unsigned long * rsp)
43498 +{
43499 + unsigned long *stack;
43500 + int i;
43501 + const int cpu = safe_smp_processor_id();
43502 + unsigned long *irqstack_end = (unsigned long *) (cpu_pda(cpu)->irqstackptr);
43503 + unsigned long *irqstack = (unsigned long *) (cpu_pda(cpu)->irqstackptr - IRQSTACKSIZE);
43504 +
43505 + // debugging aid: "show_stack(NULL, NULL);" prints the
43506 + // back trace for this cpu.
43507 +
43508 + if (rsp == NULL) {
43509 + if (tsk)
43510 + rsp = (unsigned long *)tsk->thread.rsp;
43511 + else
43512 + rsp = (unsigned long *)&rsp;
43513 + }
43514 +
43515 + stack = rsp;
43516 + for(i=0; i < kstack_depth_to_print; i++) {
43517 + if (stack >= irqstack && stack <= irqstack_end) {
43518 + if (stack == irqstack_end) {
43519 + stack = (unsigned long *) (irqstack_end[-1]);
43520 + printk(" <EOI> ");
43521 + }
43522 + } else {
43523 + if (((long) stack & (THREAD_SIZE-1)) == 0)
43524 + break;
43525 + }
43526 + if (i && ((i % 4) == 0))
43527 + printk("\n ");
43528 + printk("%016lx ", *stack++);
43529 + touch_nmi_watchdog();
43530 + }
43531 + show_trace((unsigned long *)rsp);
43532 +}
43533 +
43534 +/*
43535 + * The architecture-independent dump_stack generator
43536 + */
43537 +void dump_stack(void)
43538 +{
43539 + unsigned long dummy;
43540 + show_trace(&dummy);
43541 +}
43542 +
43543 +EXPORT_SYMBOL(dump_stack);
43544 +
43545 +void show_registers(struct pt_regs *regs)
43546 +{
43547 + int i;
43548 + int in_kernel = !user_mode(regs);
43549 + unsigned long rsp;
43550 + const int cpu = safe_smp_processor_id();
43551 + struct task_struct *cur = cpu_pda(cpu)->pcurrent;
43552 +
43553 + rsp = regs->rsp;
43554 +
43555 + printk("CPU %d ", cpu);
43556 + __show_regs(regs);
43557 + printk("Process %s (pid: %d, threadinfo %p, task %p)\n",
43558 + cur->comm, cur->pid, task_thread_info(cur), cur);
43559 +
43560 + /*
43561 + * When in-kernel, we also print out the stack and code at the
43562 + * time of the fault..
43563 + */
43564 + if (in_kernel) {
43565 +
43566 + printk("Stack: ");
43567 + show_stack(NULL, (unsigned long*)rsp);
43568 +
43569 + printk("\nCode: ");
43570 + if(regs->rip < PAGE_OFFSET)
43571 + goto bad;
43572 +
43573 + for(i=0;i<20;i++)
43574 + {
43575 + unsigned char c;
43576 + if(__get_user(c, &((unsigned char*)regs->rip)[i])) {
43577 +bad:
43578 + printk(" Bad RIP value.");
43579 + break;
43580 + }
43581 + printk("%02x ", c);
43582 + }
43583 + }
43584 + printk("\n");
43585 +}
43586 +
43587 +void handle_BUG(struct pt_regs *regs)
43588 +{
43589 + struct bug_frame f;
43590 + long len;
43591 + const char *prefix = "";
43592 +
43593 + if (user_mode(regs))
43594 + return;
43595 + if (__copy_from_user(&f, (const void __user *) regs->rip,
43596 + sizeof(struct bug_frame)))
43597 + return;
43598 + if (f.filename >= 0 ||
43599 + f.ud2[0] != 0x0f || f.ud2[1] != 0x0b)
43600 + return;
43601 + len = __strnlen_user((char *)(long)f.filename, PATH_MAX) - 1;
43602 + if (len < 0 || len >= PATH_MAX)
43603 + f.filename = (int)(long)"unmapped filename";
43604 + else if (len > 50) {
43605 + f.filename += len - 50;
43606 + prefix = "...";
43607 + }
43608 + printk("----------- [cut here ] --------- [please bite here ] ---------\n");
43609 + printk(KERN_ALERT "Kernel BUG at %s%.50s:%d\n", prefix, (char *)(long)f.filename, f.line);
43610 +}
43611 +
43612 +#ifdef CONFIG_BUG
43613 +void out_of_line_bug(void)
43614 +{
43615 + BUG();
43616 +}
43617 +#endif
43618 +
43619 +static DEFINE_SPINLOCK(die_lock);
43620 +static int die_owner = -1;
43621 +
43622 +unsigned __kprobes long oops_begin(void)
43623 +{
43624 + int cpu = safe_smp_processor_id();
43625 + unsigned long flags;
43626 +
43627 + /* racy, but better than risking deadlock. */
43628 + local_irq_save(flags);
43629 + if (!spin_trylock(&die_lock)) {
43630 + if (cpu == die_owner)
43631 + /* nested oops. should stop eventually */;
43632 + else
43633 + spin_lock(&die_lock);
43634 + }
43635 + die_owner = cpu;
43636 + console_verbose();
43637 + bust_spinlocks(1);
43638 + return flags;
43639 +}
43640 +
43641 +void __kprobes oops_end(unsigned long flags)
43642 +{
43643 + die_owner = -1;
43644 + bust_spinlocks(0);
43645 + spin_unlock_irqrestore(&die_lock, flags);
43646 + if (panic_on_oops)
43647 + panic("Oops");
43648 +}
43649 +
43650 +void __kprobes __die(const char * str, struct pt_regs * regs, long err)
43651 +{
43652 + static int die_counter;
43653 + printk(KERN_EMERG "%s: %04lx [%u] ", str, err & 0xffff,++die_counter);
43654 +#ifdef CONFIG_PREEMPT
43655 + printk("PREEMPT ");
43656 +#endif
43657 +#ifdef CONFIG_SMP
43658 + printk("SMP ");
43659 +#endif
43660 +#ifdef CONFIG_DEBUG_PAGEALLOC
43661 + printk("DEBUG_PAGEALLOC");
43662 +#endif
43663 + printk("\n");
43664 + notify_die(DIE_OOPS, str, regs, err, current->thread.trap_no, SIGSEGV);
43665 + show_registers(regs);
43666 + /* Executive summary in case the oops scrolled away */
43667 + printk(KERN_ALERT "RIP ");
43668 + printk_address(regs->rip);
43669 + printk(" RSP <%016lx>\n", regs->rsp);
43670 + if (kexec_should_crash(current))
43671 + crash_kexec(regs);
43672 +}
43673 +
43674 +void die(const char * str, struct pt_regs * regs, long err)
43675 +{
43676 + unsigned long flags = oops_begin();
43677 +
43678 + handle_BUG(regs);
43679 + __die(str, regs, err);
43680 + oops_end(flags);
43681 + do_exit(SIGSEGV);
43682 +}
43683 +
43684 +#ifdef CONFIG_X86_LOCAL_APIC
43685 +void __kprobes die_nmi(char *str, struct pt_regs *regs)
43686 +{
43687 + unsigned long flags = oops_begin();
43688 +
43689 + /*
43690 + * We are in trouble anyway, let's at least try
43691 + * to get a message out.
43692 + */
43693 + printk(str, safe_smp_processor_id());
43694 + show_registers(regs);
43695 + if (kexec_should_crash(current))
43696 + crash_kexec(regs);
43697 + if (panic_on_timeout || panic_on_oops)
43698 + panic("nmi watchdog");
43699 + printk("console shuts up ...\n");
43700 + oops_end(flags);
43701 + do_exit(SIGSEGV);
43702 +}
43703 +#endif
43704 +
43705 +static void __kprobes do_trap(int trapnr, int signr, char *str,
43706 + struct pt_regs * regs, long error_code,
43707 + siginfo_t *info)
43708 +{
43709 + struct task_struct *tsk = current;
43710 +
43711 + conditional_sti(regs);
43712 +
43713 + tsk->thread.error_code = error_code;
43714 + tsk->thread.trap_no = trapnr;
43715 +
43716 + if (user_mode(regs)) {
43717 + if (exception_trace && unhandled_signal(tsk, signr))
43718 + printk(KERN_INFO
43719 + "%s[%d] trap %s rip:%lx rsp:%lx error:%lx\n",
43720 + tsk->comm, tsk->pid, str,
43721 + regs->rip,regs->rsp,error_code);
43722 +
43723 + if (info)
43724 + force_sig_info(signr, info, tsk);
43725 + else
43726 + force_sig(signr, tsk);
43727 + return;
43728 + }
43729 +
43730 +
43731 + /* kernel trap */
43732 + {
43733 + const struct exception_table_entry *fixup;
43734 + fixup = search_exception_tables(regs->rip);
43735 + if (fixup) {
43736 + regs->rip = fixup->fixup;
43737 + } else
43738 + die(str, regs, error_code);
43739 + return;
43740 + }
43741 +}
43742 +
43743 +#define DO_ERROR(trapnr, signr, str, name) \
43744 +asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
43745 +{ \
43746 + if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
43747 + == NOTIFY_STOP) \
43748 + return; \
43749 + do_trap(trapnr, signr, str, regs, error_code, NULL); \
43750 +}
43751 +
43752 +#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
43753 +asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
43754 +{ \
43755 + siginfo_t info; \
43756 + info.si_signo = signr; \
43757 + info.si_errno = 0; \
43758 + info.si_code = sicode; \
43759 + info.si_addr = (void __user *)siaddr; \
43760 + if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
43761 + == NOTIFY_STOP) \
43762 + return; \
43763 + do_trap(trapnr, signr, str, regs, error_code, &info); \
43764 +}
43765 +
43766 +DO_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->rip)
43767 +DO_ERROR( 4, SIGSEGV, "overflow", overflow)
43768 +DO_ERROR( 5, SIGSEGV, "bounds", bounds)
43769 +DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->rip)
43770 +DO_ERROR( 7, SIGSEGV, "device not available", device_not_available)
43771 +DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
43772 +DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
43773 +DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
43774 +DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0)
43775 +DO_ERROR(18, SIGSEGV, "reserved", reserved)
43776 +DO_ERROR(12, SIGBUS, "stack segment", stack_segment)
43777 +
43778 +asmlinkage void do_double_fault(struct pt_regs * regs, long error_code)
43779 +{
43780 + static const char str[] = "double fault";
43781 + struct task_struct *tsk = current;
43782 +
43783 + /* Return not checked because double check cannot be ignored */
43784 + notify_die(DIE_TRAP, str, regs, error_code, 8, SIGSEGV);
43785 +
43786 + tsk->thread.error_code = error_code;
43787 + tsk->thread.trap_no = 8;
43788 +
43789 + /* This is always a kernel trap and never fixable (and thus must
43790 + never return). */
43791 + for (;;)
43792 + die(str, regs, error_code);
43793 +}
43794 +
43795 +asmlinkage void __kprobes do_general_protection(struct pt_regs * regs,
43796 + long error_code)
43797 +{
43798 + struct task_struct *tsk = current;
43799 +
43800 + conditional_sti(regs);
43801 +
43802 + tsk->thread.error_code = error_code;
43803 + tsk->thread.trap_no = 13;
43804 +
43805 + if (user_mode(regs)) {
43806 + if (exception_trace && unhandled_signal(tsk, SIGSEGV))
43807 + printk(KERN_INFO
43808 + "%s[%d] general protection rip:%lx rsp:%lx error:%lx\n",
43809 + tsk->comm, tsk->pid,
43810 + regs->rip,regs->rsp,error_code);
43811 +
43812 + force_sig(SIGSEGV, tsk);
43813 + return;
43814 + }
43815 +
43816 + /* kernel gp */
43817 + {
43818 + const struct exception_table_entry *fixup;
43819 + fixup = search_exception_tables(regs->rip);
43820 + if (fixup) {
43821 + regs->rip = fixup->fixup;
43822 + return;
43823 + }
43824 + if (notify_die(DIE_GPF, "general protection fault", regs,
43825 + error_code, 13, SIGSEGV) == NOTIFY_STOP)
43826 + return;
43827 + die("general protection fault", regs, error_code);
43828 + }
43829 +}
43830 +
43831 +static __kprobes void
43832 +mem_parity_error(unsigned char reason, struct pt_regs * regs)
43833 +{
43834 + printk("Uhhuh. NMI received. Dazed and confused, but trying to continue\n");
43835 + printk("You probably have a hardware problem with your RAM chips\n");
43836 +
43837 +#if 0 /* XEN */
43838 + /* Clear and disable the memory parity error line. */
43839 + reason = (reason & 0xf) | 4;
43840 + outb(reason, 0x61);
43841 +#endif /* XEN */
43842 +}
43843 +
43844 +static __kprobes void
43845 +io_check_error(unsigned char reason, struct pt_regs * regs)
43846 +{
43847 + printk("NMI: IOCK error (debug interrupt?)\n");
43848 + show_registers(regs);
43849 +
43850 +#if 0 /* XEN */
43851 + /* Re-enable the IOCK line, wait for a few seconds */
43852 + reason = (reason & 0xf) | 8;
43853 + outb(reason, 0x61);
43854 + mdelay(2000);
43855 + reason &= ~8;
43856 + outb(reason, 0x61);
43857 +#endif /* XEN */
43858 +}
43859 +
43860 +static __kprobes void
43861 +unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
43862 +{ printk("Uhhuh. NMI received for unknown reason %02x.\n", reason);
43863 + printk("Dazed and confused, but trying to continue\n");
43864 + printk("Do you have a strange power saving mode enabled?\n");
43865 +}
43866 +
43867 +/* Runs on IST stack. This code must keep interrupts off all the time.
43868 + Nested NMIs are prevented by the CPU. */
43869 +asmlinkage __kprobes void default_do_nmi(struct pt_regs *regs)
43870 +{
43871 + unsigned char reason = 0;
43872 + int cpu;
43873 +
43874 + cpu = smp_processor_id();
43875 +
43876 + /* Only the BSP gets external NMIs from the system. */
43877 + if (!cpu)
43878 + reason = get_nmi_reason();
43879 +
43880 + if (!(reason & 0xc0)) {
43881 + if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT)
43882 + == NOTIFY_STOP)
43883 + return;
43884 +#ifdef CONFIG_X86_LOCAL_APIC
43885 + /*
43886 + * Ok, so this is none of the documented NMI sources,
43887 + * so it must be the NMI watchdog.
43888 + */
43889 + if (nmi_watchdog > 0) {
43890 + nmi_watchdog_tick(regs,reason);
43891 + return;
43892 + }
43893 +#endif
43894 + unknown_nmi_error(reason, regs);
43895 + return;
43896 + }
43897 + if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
43898 + return;
43899 +
43900 + /* AK: following checks seem to be broken on modern chipsets. FIXME */
43901 +
43902 + if (reason & 0x80)
43903 + mem_parity_error(reason, regs);
43904 + if (reason & 0x40)
43905 + io_check_error(reason, regs);
43906 +}
43907 +
43908 +/* runs on IST stack. */
43909 +asmlinkage void __kprobes do_int3(struct pt_regs * regs, long error_code)
43910 +{
43911 + if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) == NOTIFY_STOP) {
43912 + return;
43913 + }
43914 + do_trap(3, SIGTRAP, "int3", regs, error_code, NULL);
43915 + return;
43916 +}
43917 +
43918 +/* Help handler running on IST stack to switch back to user stack
43919 + for scheduling or signal handling. The actual stack switch is done in
43920 + entry.S */
43921 +asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs)
43922 +{
43923 + struct pt_regs *regs = eregs;
43924 + /* Did already sync */
43925 + if (eregs == (struct pt_regs *)eregs->rsp)
43926 + ;
43927 + /* Exception from user space */
43928 + else if (user_mode(eregs))
43929 + regs = task_pt_regs(current);
43930 + /* Exception from kernel and interrupts are enabled. Move to
43931 + kernel process stack. */
43932 + else if (eregs->eflags & X86_EFLAGS_IF)
43933 + regs = (struct pt_regs *)(eregs->rsp -= sizeof(struct pt_regs));
43934 + if (eregs != regs)
43935 + *regs = *eregs;
43936 + return regs;
43937 +}
43938 +
43939 +/* runs on IST stack. */
43940 +asmlinkage void __kprobes do_debug(struct pt_regs * regs,
43941 + unsigned long error_code)
43942 +{
43943 + unsigned long condition;
43944 + struct task_struct *tsk = current;
43945 + siginfo_t info;
43946 +
43947 + get_debugreg(condition, 6);
43948 +
43949 + if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
43950 + SIGTRAP) == NOTIFY_STOP)
43951 + return;
43952 +
43953 + preempt_conditional_sti(regs);
43954 +
43955 + /* Mask out spurious debug traps due to lazy DR7 setting */
43956 + if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) {
43957 + if (!tsk->thread.debugreg7) {
43958 + goto clear_dr7;
43959 + }
43960 + }
43961 +
43962 + tsk->thread.debugreg6 = condition;
43963 +
43964 + /* Mask out spurious TF errors due to lazy TF clearing */
43965 + if (condition & DR_STEP) {
43966 + /*
43967 + * The TF error should be masked out only if the current
43968 + * process is not traced and if the TRAP flag has been set
43969 + * previously by a tracing process (condition detected by
43970 + * the PT_DTRACE flag); remember that the i386 TRAP flag
43971 + * can be modified by the process itself in user mode,
43972 + * allowing programs to debug themselves without the ptrace()
43973 + * interface.
43974 + */
43975 + if (!user_mode(regs))
43976 + goto clear_TF_reenable;
43977 + /*
43978 + * Was the TF flag set by a debugger? If so, clear it now,
43979 + * so that register information is correct.
43980 + */
43981 + if (tsk->ptrace & PT_DTRACE) {
43982 + regs->eflags &= ~TF_MASK;
43983 + tsk->ptrace &= ~PT_DTRACE;
43984 + }
43985 + }
43986 +
43987 + /* Ok, finally something we can handle */
43988 + tsk->thread.trap_no = 1;
43989 + tsk->thread.error_code = error_code;
43990 + info.si_signo = SIGTRAP;
43991 + info.si_errno = 0;
43992 + info.si_code = TRAP_BRKPT;
43993 + info.si_addr = user_mode(regs) ? (void __user *)regs->rip : NULL;
43994 + force_sig_info(SIGTRAP, &info, tsk);
43995 +
43996 +clear_dr7:
43997 + set_debugreg(0UL, 7);
43998 + preempt_conditional_cli(regs);
43999 + return;
44000 +
44001 +clear_TF_reenable:
44002 + set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
44003 + regs->eflags &= ~TF_MASK;
44004 + preempt_conditional_cli(regs);
44005 +}
44006 +
44007 +static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr)
44008 +{
44009 + const struct exception_table_entry *fixup;
44010 + fixup = search_exception_tables(regs->rip);
44011 + if (fixup) {
44012 + regs->rip = fixup->fixup;
44013 + return 1;
44014 + }
44015 + notify_die(DIE_GPF, str, regs, 0, trapnr, SIGFPE);
44016 + /* Illegal floating point operation in the kernel */
44017 + current->thread.trap_no = trapnr;
44018 + die(str, regs, 0);
44019 + return 0;
44020 +}
44021 +
44022 +/*
44023 + * Note that we play around with the 'TS' bit in an attempt to get
44024 + * the correct behaviour even in the presence of the asynchronous
44025 + * IRQ13 behaviour
44026 + */
44027 +asmlinkage void do_coprocessor_error(struct pt_regs *regs)
44028 +{
44029 + void __user *rip = (void __user *)(regs->rip);
44030 + struct task_struct * task;
44031 + siginfo_t info;
44032 + unsigned short cwd, swd;
44033 +
44034 + conditional_sti(regs);
44035 + if (!user_mode(regs) &&
44036 + kernel_math_error(regs, "kernel x87 math error", 16))
44037 + return;
44038 +
44039 + /*
44040 + * Save the info for the exception handler and clear the error.
44041 + */
44042 + task = current;
44043 + save_init_fpu(task);
44044 + task->thread.trap_no = 16;
44045 + task->thread.error_code = 0;
44046 + info.si_signo = SIGFPE;
44047 + info.si_errno = 0;
44048 + info.si_code = __SI_FAULT;
44049 + info.si_addr = rip;
44050 + /*
44051 + * (~cwd & swd) will mask out exceptions that are not set to unmasked
44052 + * status. 0x3f is the exception bits in these regs, 0x200 is the
44053 + * C1 reg you need in case of a stack fault, 0x040 is the stack
44054 + * fault bit. We should only be taking one exception at a time,
44055 + * so if this combination doesn't produce any single exception,
44056 + * then we have a bad program that isn't synchronizing its FPU usage
44057 + * and it will suffer the consequences since we won't be able to
44058 + * fully reproduce the context of the exception
44059 + */
44060 + cwd = get_fpu_cwd(task);
44061 + swd = get_fpu_swd(task);
44062 + switch (swd & ~cwd & 0x3f) {
44063 + case 0x000:
44064 + default:
44065 + break;
44066 + case 0x001: /* Invalid Op */
44067 + /*
44068 + * swd & 0x240 == 0x040: Stack Underflow
44069 + * swd & 0x240 == 0x240: Stack Overflow
44070 + * User must clear the SF bit (0x40) if set
44071 + */
44072 + info.si_code = FPE_FLTINV;
44073 + break;
44074 + case 0x002: /* Denormalize */
44075 + case 0x010: /* Underflow */
44076 + info.si_code = FPE_FLTUND;
44077 + break;
44078 + case 0x004: /* Zero Divide */
44079 + info.si_code = FPE_FLTDIV;
44080 + break;
44081 + case 0x008: /* Overflow */
44082 + info.si_code = FPE_FLTOVF;
44083 + break;
44084 + case 0x020: /* Precision */
44085 + info.si_code = FPE_FLTRES;
44086 + break;
44087 + }
44088 + force_sig_info(SIGFPE, &info, task);
44089 +}
44090 +
44091 +asmlinkage void bad_intr(void)
44092 +{
44093 + printk("bad interrupt");
44094 +}
44095 +
44096 +asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs)
44097 +{
44098 + void __user *rip = (void __user *)(regs->rip);
44099 + struct task_struct * task;
44100 + siginfo_t info;
44101 + unsigned short mxcsr;
44102 +
44103 + conditional_sti(regs);
44104 + if (!user_mode(regs) &&
44105 + kernel_math_error(regs, "kernel simd math error", 19))
44106 + return;
44107 +
44108 + /*
44109 + * Save the info for the exception handler and clear the error.
44110 + */
44111 + task = current;
44112 + save_init_fpu(task);
44113 + task->thread.trap_no = 19;
44114 + task->thread.error_code = 0;
44115 + info.si_signo = SIGFPE;
44116 + info.si_errno = 0;
44117 + info.si_code = __SI_FAULT;
44118 + info.si_addr = rip;
44119 + /*
44120 + * The SIMD FPU exceptions are handled a little differently, as there
44121 + * is only a single status/control register. Thus, to determine which
44122 + * unmasked exception was caught we must mask the exception mask bits
44123 + * at 0x1f80, and then use these to mask the exception bits at 0x3f.
44124 + */
44125 + mxcsr = get_fpu_mxcsr(task);
44126 + switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) {
44127 + case 0x000:
44128 + default:
44129 + break;
44130 + case 0x001: /* Invalid Op */
44131 + info.si_code = FPE_FLTINV;
44132 + break;
44133 + case 0x002: /* Denormalize */
44134 + case 0x010: /* Underflow */
44135 + info.si_code = FPE_FLTUND;
44136 + break;
44137 + case 0x004: /* Zero Divide */
44138 + info.si_code = FPE_FLTDIV;
44139 + break;
44140 + case 0x008: /* Overflow */
44141 + info.si_code = FPE_FLTOVF;
44142 + break;
44143 + case 0x020: /* Precision */
44144 + info.si_code = FPE_FLTRES;
44145 + break;
44146 + }
44147 + force_sig_info(SIGFPE, &info, task);
44148 +}
44149 +
44150 +asmlinkage void do_spurious_interrupt_bug(struct pt_regs * regs)
44151 +{
44152 +}
44153 +
44154 +#if 0
44155 +asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void)
44156 +{
44157 +}
44158 +#endif
44159 +
44160 +asmlinkage void __attribute__((weak)) mce_threshold_interrupt(void)
44161 +{
44162 +}
44163 +
44164 +/*
44165 + * 'math_state_restore()' saves the current math information in the
44166 + * old math state array, and gets the new ones from the current task
44167 + *
44168 + * Careful.. There are problems with IBM-designed IRQ13 behaviour.
44169 + * Don't touch unless you *really* know how it works.
44170 + */
44171 +asmlinkage void math_state_restore(void)
44172 +{
44173 + struct task_struct *me = current;
44174 + /* clts(); */ /* 'clts' is done for us by Xen during virtual trap. */
44175 +
44176 + if (!used_math())
44177 + init_fpu(me);
44178 + restore_fpu_checking(&me->thread.i387.fxsave);
44179 + task_thread_info(me)->status |= TS_USEDFPU;
44180 +}
44181 +
44182 +
44183 +/*
44184 + * NB. All these are "interrupt gates" (i.e. events_mask is set) because we
44185 + * specify <dpl>|4 in the second field.
44186 + */
44187 +static trap_info_t trap_table[] = {
44188 + { 0, 0|4, __KERNEL_CS, (unsigned long)divide_error },
44189 + { 1, 0|4, __KERNEL_CS, (unsigned long)debug },
44190 + { 3, 3|4, __KERNEL_CS, (unsigned long)int3 },
44191 + { 4, 3|4, __KERNEL_CS, (unsigned long)overflow },
44192 + { 5, 0|4, __KERNEL_CS, (unsigned long)bounds },
44193 + { 6, 0|4, __KERNEL_CS, (unsigned long)invalid_op },
44194 + { 7, 0|4, __KERNEL_CS, (unsigned long)device_not_available },
44195 + { 9, 0|4, __KERNEL_CS, (unsigned long)coprocessor_segment_overrun},
44196 + { 10, 0|4, __KERNEL_CS, (unsigned long)invalid_TSS },
44197 + { 11, 0|4, __KERNEL_CS, (unsigned long)segment_not_present },
44198 + { 12, 0|4, __KERNEL_CS, (unsigned long)stack_segment },
44199 + { 13, 0|4, __KERNEL_CS, (unsigned long)general_protection },
44200 + { 14, 0|4, __KERNEL_CS, (unsigned long)page_fault },
44201 + { 15, 0|4, __KERNEL_CS, (unsigned long)spurious_interrupt_bug },
44202 + { 16, 0|4, __KERNEL_CS, (unsigned long)coprocessor_error },
44203 + { 17, 0|4, __KERNEL_CS, (unsigned long)alignment_check },
44204 +#ifdef CONFIG_X86_MCE
44205 + { 18, 0|4, __KERNEL_CS, (unsigned long)machine_check },
44206 +#endif
44207 + { 19, 0|4, __KERNEL_CS, (unsigned long)simd_coprocessor_error },
44208 +#ifdef CONFIG_IA32_EMULATION
44209 + { IA32_SYSCALL_VECTOR, 3|4, __KERNEL_CS, (unsigned long)ia32_syscall},
44210 +#endif
44211 + { 0, 0, 0, 0 }
44212 +};
44213 +
44214 +void __init trap_init(void)
44215 +{
44216 + int ret;
44217 +
44218 + ret = HYPERVISOR_set_trap_table(trap_table);
44219 +
44220 + if (ret)
44221 + printk("HYPERVISOR_set_trap_table failed: error %d\n",
44222 + ret);
44223 +
44224 + /*
44225 + * Should be a barrier for any external CPU state.
44226 + */
44227 + cpu_init();
44228 +}
44229 +
44230 +void smp_trap_init(trap_info_t *trap_ctxt)
44231 +{
44232 + trap_info_t *t = trap_table;
44233 +
44234 + for (t = trap_table; t->address; t++) {
44235 + trap_ctxt[t->vector].flags = t->flags;
44236 + trap_ctxt[t->vector].cs = t->cs;
44237 + trap_ctxt[t->vector].address = t->address;
44238 + }
44239 +}
44240 +
44241 +
44242 +/* Actual parsing is done early in setup.c. */
44243 +static int __init oops_dummy(char *s)
44244 +{
44245 + panic_on_oops = 1;
44246 + return -1;
44247 +}
44248 +__setup("oops=", oops_dummy);
44249 +
44250 +static int __init kstack_setup(char *s)
44251 +{
44252 + kstack_depth_to_print = simple_strtoul(s,NULL,0);
44253 + return 0;
44254 +}
44255 +__setup("kstack=", kstack_setup);
44256 +
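A minimal user-space sketch of the (~cwd & swd) & 0x3f decoding that do_coprocessor_error() above performs on the x87 control/status words; illustrative only, not part of the patch, and the sample cwd/swd values are made up. The FPE_* constants are the standard ones from <signal.h>.

#include <signal.h>
#include <stdio.h>

/* Map unmasked x87 exception bits to the si_code the handler would report. */
static int fpu_si_code(unsigned short cwd, unsigned short swd)
{
	switch (swd & ~cwd & 0x3f) {	/* keep only unmasked exception bits */
	case 0x001: return FPE_FLTINV;	/* invalid operation / stack fault */
	case 0x002:			/* denormalized operand */
	case 0x010: return FPE_FLTUND;	/* underflow */
	case 0x004: return FPE_FLTDIV;	/* divide by zero */
	case 0x008: return FPE_FLTOVF;	/* overflow */
	case 0x020: return FPE_FLTRES;	/* inexact result (precision) */
	default:    return 0;		/* none, or several bits at once */
	}
}

int main(void)
{
	/* control word masks nothing, status word reports a zero divide */
	printf("si_code = %d\n", fpu_si_code(0x0000, 0x0004));
	return 0;
}
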
44257 diff -Nur linux-2.6.16.33-noxen/arch/x86_64/kernel/vmlinux.lds.S linux-2.6.16.33/arch/x86_64/kernel/vmlinux.lds.S
44258 --- linux-2.6.16.33-noxen/arch/x86_64/kernel/vmlinux.lds.S 2006-11-22 18:06:31.000000000 +0000
44259 +++ linux-2.6.16.33/arch/x86_64/kernel/vmlinux.lds.S 2007-05-23 21:00:01.000000000 +0000
44260 @@ -14,6 +14,13 @@
44261 OUTPUT_ARCH(i386:x86-64)
44262 ENTRY(phys_startup_64)
44263 jiffies_64 = jiffies;
44264 +PHDRS {
44265 + text PT_LOAD FLAGS(5); /* R_E */
44266 + data PT_LOAD FLAGS(7); /* RWE */
44267 + user PT_LOAD FLAGS(7); /* RWE */
44268 + data.init PT_LOAD FLAGS(7); /* RWE */
44269 + note PT_NOTE FLAGS(4); /* R__ */
44270 +}
44271 SECTIONS
44272 {
44273 . = __START_KERNEL;
44274 @@ -26,7 +33,7 @@
44275 KPROBES_TEXT
44276 *(.fixup)
44277 *(.gnu.warning)
44278 - } = 0x9090
44279 + } :text = 0x9090
44280 /* out-of-line lock text */
44281 .text.lock : AT(ADDR(.text.lock) - LOAD_OFFSET) { *(.text.lock) }
44282
44283 @@ -43,17 +50,10 @@
44284 .data : AT(ADDR(.data) - LOAD_OFFSET) {
44285 *(.data)
44286 CONSTRUCTORS
44287 - }
44288 + } :data
44289
44290 _edata = .; /* End of data section */
44291
44292 - __bss_start = .; /* BSS */
44293 - .bss : AT(ADDR(.bss) - LOAD_OFFSET) {
44294 - *(.bss.page_aligned)
44295 - *(.bss)
44296 - }
44297 - __bss_stop = .;
44298 -
44299 . = ALIGN(PAGE_SIZE);
44300 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
44301 .data.cacheline_aligned : AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) {
44302 @@ -75,7 +75,7 @@
44303 #define VVIRT(x) (ADDR(x) - VVIRT_OFFSET)
44304
44305 . = VSYSCALL_ADDR;
44306 - .vsyscall_0 : AT(VSYSCALL_PHYS_ADDR) { *(.vsyscall_0) }
44307 + .vsyscall_0 : AT(VSYSCALL_PHYS_ADDR) { *(.vsyscall_0) } :user
44308 __vsyscall_0 = VSYSCALL_VIRT_ADDR;
44309
44310 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
44311 @@ -118,7 +118,7 @@
44312 . = ALIGN(8192); /* init_task */
44313 .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) {
44314 *(.data.init_task)
44315 - }
44316 + }:data.init
44317
44318 . = ALIGN(4096);
44319 .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) {
44320 @@ -188,6 +188,14 @@
44321 . = ALIGN(4096);
44322 __nosave_end = .;
44323
44324 + __bss_start = .; /* BSS */
44325 + . = ALIGN(4096);
44326 + .bss : AT(ADDR(.bss) - LOAD_OFFSET) {
44327 + *(.bss.page_aligned)
44328 + *(.bss)
44329 + }
44330 + __bss_stop = .;
44331 +
44332 _end = . ;
44333
44334 /* Sections to be discarded */
44335 @@ -201,4 +209,6 @@
44336 STABS_DEBUG
44337
44338 DWARF_DEBUG
44339 +
44340 + NOTES
44341 }
44342 diff -Nur linux-2.6.16.33-noxen/arch/x86_64/kernel/vmlinux.lds.S~ linux-2.6.16.33/arch/x86_64/kernel/vmlinux.lds.S~
44343 --- linux-2.6.16.33-noxen/arch/x86_64/kernel/vmlinux.lds.S~ 1970-01-01 00:00:00.000000000 +0000
44344 +++ linux-2.6.16.33/arch/x86_64/kernel/vmlinux.lds.S~ 2007-05-23 21:00:01.000000000 +0000
44345 @@ -0,0 +1,213 @@
44346 +/* ld script to make x86-64 Linux kernel
44347 + * Written by Martin Mares <mj@atrey.karlin.mff.cuni.cz>;
44348 + */
44349 +
44350 +#define LOAD_OFFSET __START_KERNEL_map
44351 +
44352 +#include <asm-generic/vmlinux.lds.h>
44353 +#include <asm/page.h>
44354 +#include <linux/config.h>
44355 +
44356 +#undef i386 /* in case the preprocessor is a 32bit one */
44357 +
44358 +OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64", "elf64-x86-64")
44359 +OUTPUT_ARCH(i386:x86-64)
44360 +ENTRY(phys_startup_64)
44361 +jiffies_64 = jiffies;
44362 +PHDRS {
44363 + text PT_LOAD FLAGS(5); /* R_E */
44364 + data PT_LOAD FLAGS(7); /* RWE */
44365 + user PT_LOAD FLAGS(7); /* RWE */
44366 + note PT_NOTE FLAGS(4); /* R__ */
44367 +}
44368 +SECTIONS
44369 +{
44370 + . = __START_KERNEL;
44371 + phys_startup_64 = startup_64 - LOAD_OFFSET;
44372 + _text = .; /* Text and read-only data */
44373 + .text : AT(ADDR(.text) - LOAD_OFFSET) {
44374 + *(.text)
44375 + SCHED_TEXT
44376 + LOCK_TEXT
44377 + KPROBES_TEXT
44378 + *(.fixup)
44379 + *(.gnu.warning)
44380 + } :text = 0x9090
44381 + /* out-of-line lock text */
44382 + .text.lock : AT(ADDR(.text.lock) - LOAD_OFFSET) { *(.text.lock) }
44383 +
44384 + _etext = .; /* End of text section */
44385 +
44386 + . = ALIGN(16); /* Exception table */
44387 + __start___ex_table = .;
44388 + __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { *(__ex_table) }
44389 + __stop___ex_table = .;
44390 +
44391 + RODATA
44392 +
44393 + /* Data */
44394 + .data : AT(ADDR(.data) - LOAD_OFFSET) {
44395 + *(.data)
44396 + CONSTRUCTORS
44397 + } :data
44398 +
44399 + _edata = .; /* End of data section */
44400 +
44401 + . = ALIGN(PAGE_SIZE);
44402 + . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
44403 + .data.cacheline_aligned : AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) {
44404 + *(.data.cacheline_aligned)
44405 + }
44406 + . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
44407 + .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) {
44408 + *(.data.read_mostly)
44409 + }
44410 +
44411 +#define VSYSCALL_ADDR (-10*1024*1024)
44412 +#define VSYSCALL_PHYS_ADDR ((LOADADDR(.data.read_mostly) + SIZEOF(.data.read_mostly) + 4095) & ~(4095))
44413 +#define VSYSCALL_VIRT_ADDR ((ADDR(.data.read_mostly) + SIZEOF(.data.read_mostly) + 4095) & ~(4095))
44414 +
44415 +#define VLOAD_OFFSET (VSYSCALL_ADDR - VSYSCALL_PHYS_ADDR)
44416 +#define VLOAD(x) (ADDR(x) - VLOAD_OFFSET)
44417 +
44418 +#define VVIRT_OFFSET (VSYSCALL_ADDR - VSYSCALL_VIRT_ADDR)
44419 +#define VVIRT(x) (ADDR(x) - VVIRT_OFFSET)
44420 +
44421 + . = VSYSCALL_ADDR;
44422 + .vsyscall_0 : AT(VSYSCALL_PHYS_ADDR) { *(.vsyscall_0) } :user
44423 + __vsyscall_0 = VSYSCALL_VIRT_ADDR;
44424 +
44425 + . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
44426 + .xtime_lock : AT(VLOAD(.xtime_lock)) { *(.xtime_lock) }
44427 + xtime_lock = VVIRT(.xtime_lock);
44428 +
44429 + .vxtime : AT(VLOAD(.vxtime)) { *(.vxtime) }
44430 + vxtime = VVIRT(.vxtime);
44431 +
44432 + .wall_jiffies : AT(VLOAD(.wall_jiffies)) { *(.wall_jiffies) }
44433 + wall_jiffies = VVIRT(.wall_jiffies);
44434 +
44435 + .sys_tz : AT(VLOAD(.sys_tz)) { *(.sys_tz) }
44436 + sys_tz = VVIRT(.sys_tz);
44437 +
44438 + .sysctl_vsyscall : AT(VLOAD(.sysctl_vsyscall)) { *(.sysctl_vsyscall) }
44439 + sysctl_vsyscall = VVIRT(.sysctl_vsyscall);
44440 +
44441 + .xtime : AT(VLOAD(.xtime)) { *(.xtime) }
44442 + xtime = VVIRT(.xtime);
44443 +
44444 + . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
44445 + .jiffies : AT(VLOAD(.jiffies)) { *(.jiffies) }
44446 + jiffies = VVIRT(.jiffies);
44447 +
44448 + .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1)) { *(.vsyscall_1) }
44449 + .vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2)) { *(.vsyscall_2) }
44450 + .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3)) { *(.vsyscall_3) }
44451 +
44452 + . = VSYSCALL_VIRT_ADDR + 4096;
44453 +
44454 +#undef VSYSCALL_ADDR
44455 +#undef VSYSCALL_PHYS_ADDR
44456 +#undef VSYSCALL_VIRT_ADDR
44457 +#undef VLOAD_OFFSET
44458 +#undef VLOAD
44459 +#undef VVIRT_OFFSET
44460 +#undef VVIRT
44461 +
44462 + . = ALIGN(8192); /* init_task */
44463 + .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) {
44464 + *(.data.init_task)
44465 + } :data
44466 +
44467 + . = ALIGN(4096);
44468 + .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) {
44469 + *(.data.page_aligned)
44470 + }
44471 +
44472 + . = ALIGN(4096); /* Init code and data */
44473 + __init_begin = .;
44474 + .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) {
44475 + _sinittext = .;
44476 + *(.init.text)
44477 + _einittext = .;
44478 + }
44479 + __initdata_begin = .;
44480 + .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { *(.init.data) }
44481 + __initdata_end = .;
44482 + . = ALIGN(16);
44483 + __setup_start = .;
44484 + .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) { *(.init.setup) }
44485 + __setup_end = .;
44486 + __initcall_start = .;
44487 + .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) {
44488 + *(.initcall1.init)
44489 + *(.initcall2.init)
44490 + *(.initcall3.init)
44491 + *(.initcall4.init)
44492 + *(.initcall5.init)
44493 + *(.initcall6.init)
44494 + *(.initcall7.init)
44495 + }
44496 + __initcall_end = .;
44497 + __con_initcall_start = .;
44498 + .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) {
44499 + *(.con_initcall.init)
44500 + }
44501 + __con_initcall_end = .;
44502 + SECURITY_INIT
44503 + . = ALIGN(8);
44504 + __alt_instructions = .;
44505 + .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) {
44506 + *(.altinstructions)
44507 + }
44508 + __alt_instructions_end = .;
44509 + .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) {
44510 + *(.altinstr_replacement)
44511 + }
44512 + /* .exit.text is discard at runtime, not link time, to deal with references
44513 + from .altinstructions and .eh_frame */
44514 + .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { *(.exit.text) }
44515 + .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { *(.exit.data) }
44516 + . = ALIGN(4096);
44517 + __initramfs_start = .;
44518 + .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { *(.init.ramfs) }
44519 + __initramfs_end = .;
44520 + /* temporary here to work around NR_CPUS. If you see this comment in 2.6.17+
44521 + complain */
44522 + . = ALIGN(4096);
44523 + __init_end = .;
44524 + . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
44525 + __per_cpu_start = .;
44526 + .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) { *(.data.percpu) }
44527 + __per_cpu_end = .;
44528 +
44529 + . = ALIGN(4096);
44530 + __nosave_begin = .;
44531 + .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { *(.data.nosave) }
44532 + . = ALIGN(4096);
44533 + __nosave_end = .;
44534 +
44535 + __bss_start = .; /* BSS */
44536 + . = ALIGN(4096);
44537 + .bss : AT(ADDR(.bss) - LOAD_OFFSET) {
44538 + *(.bss.page_aligned)
44539 + *(.bss)
44540 + }
44541 + __bss_stop = .;
44542 +
44543 + _end = . ;
44544 +
44545 + /* Sections to be discarded */
44546 + /DISCARD/ : {
44547 + *(.exitcall.exit)
44548 +#ifndef CONFIG_UNWIND_INFO
44549 + *(.eh_frame)
44550 +#endif
44551 + }
44552 +
44553 + STABS_DEBUG
44554 +
44555 + DWARF_DEBUG
44556 +
44557 + NOTES
44558 +}
44559 diff -Nur linux-2.6.16.33-noxen/arch/x86_64/kernel/vsyscall-xen.c linux-2.6.16.33/arch/x86_64/kernel/vsyscall-xen.c
44560 --- linux-2.6.16.33-noxen/arch/x86_64/kernel/vsyscall-xen.c 1970-01-01 00:00:00.000000000 +0000
44561 +++ linux-2.6.16.33/arch/x86_64/kernel/vsyscall-xen.c 2007-01-08 15:00:45.000000000 +0000
44562 @@ -0,0 +1,239 @@
44563 +/*
44564 + * linux/arch/x86_64/kernel/vsyscall.c
44565 + *
44566 + * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE
44567 + * Copyright 2003 Andi Kleen, SuSE Labs.
44568 + *
44569 + * Thanks to hpa@transmeta.com for some useful hint.
44570 + * Special thanks to Ingo Molnar for his early experience with
44571 + * a different vsyscall implementation for Linux/IA32 and for the name.
44572 + *
44573 + * vsyscall 1 is located at -10Mbyte, vsyscall 2 is located
44574 + * at virtual address -10Mbyte+1024bytes etc... There are at max 4
44575 + * vsyscalls. One vsyscall can reserve more than 1 slot to avoid
44576 + * jumping out of line if necessary. We cannot add more with this
44577 + * mechanism because older kernels won't return -ENOSYS.
44578 + * If we want more than four we need a vDSO.
44579 + *
44580 + * Note: the concept clashes with user mode linux. If you use UML and
44581 + * want per guest time just set the kernel.vsyscall64 sysctl to 0.
44582 + */
44583 +
44584 +#include <linux/time.h>
44585 +#include <linux/init.h>
44586 +#include <linux/kernel.h>
44587 +#include <linux/timer.h>
44588 +#include <linux/seqlock.h>
44589 +#include <linux/jiffies.h>
44590 +#include <linux/sysctl.h>
44591 +
44592 +#include <asm/vsyscall.h>
44593 +#include <asm/pgtable.h>
44594 +#include <asm/page.h>
44595 +#include <asm/fixmap.h>
44596 +#include <asm/errno.h>
44597 +#include <asm/io.h>
44598 +
44599 +#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr)))
44600 +
44601 +int __sysctl_vsyscall __section_sysctl_vsyscall = 1;
44602 +seqlock_t __xtime_lock __section_xtime_lock = SEQLOCK_UNLOCKED;
44603 +
44604 +#include <asm/unistd.h>
44605 +
44606 +static __always_inline void timeval_normalize(struct timeval * tv)
44607 +{
44608 + time_t __sec;
44609 +
44610 + __sec = tv->tv_usec / 1000000;
44611 + if (__sec) {
44612 + tv->tv_usec %= 1000000;
44613 + tv->tv_sec += __sec;
44614 + }
44615 +}
44616 +
44617 +static __always_inline void do_vgettimeofday(struct timeval * tv)
44618 +{
44619 + long sequence, t;
44620 + unsigned long sec, usec;
44621 +
44622 + do {
44623 + sequence = read_seqbegin(&__xtime_lock);
44624 +
44625 + sec = __xtime.tv_sec;
44626 + usec = (__xtime.tv_nsec / 1000) +
44627 + (__jiffies - __wall_jiffies) * (1000000 / HZ);
44628 +
44629 + if (__vxtime.mode != VXTIME_HPET) {
44630 + t = get_cycles_sync();
44631 + if (t < __vxtime.last_tsc)
44632 + t = __vxtime.last_tsc;
44633 + usec += ((t - __vxtime.last_tsc) *
44634 + __vxtime.tsc_quot) >> 32;
44635 + /* See comment in x86_64 do_gettimeofday. */
44636 + } else {
44637 + usec += ((readl((void *)fix_to_virt(VSYSCALL_HPET) + 0xf0) -
44638 + __vxtime.last) * __vxtime.quot) >> 32;
44639 + }
44640 + } while (read_seqretry(&__xtime_lock, sequence));
44641 +
44642 + tv->tv_sec = sec + usec / 1000000;
44643 + tv->tv_usec = usec % 1000000;
44644 +}
44645 +
44646 +/* RED-PEN may want to readd seq locking, but then the variable should be write-once. */
44647 +static __always_inline void do_get_tz(struct timezone * tz)
44648 +{
44649 + *tz = __sys_tz;
44650 +}
44651 +
44652 +static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz)
44653 +{
44654 + int ret;
44655 + asm volatile("vsysc2: syscall"
44656 + : "=a" (ret)
44657 + : "0" (__NR_gettimeofday),"D" (tv),"S" (tz) : __syscall_clobber );
44658 + return ret;
44659 +}
44660 +
44661 +static __always_inline long time_syscall(long *t)
44662 +{
44663 + long secs;
44664 + asm volatile("vsysc1: syscall"
44665 + : "=a" (secs)
44666 + : "0" (__NR_time),"D" (t) : __syscall_clobber);
44667 + return secs;
44668 +}
44669 +
44670 +int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz)
44671 +{
44672 + if (unlikely(!__sysctl_vsyscall))
44673 + return gettimeofday(tv,tz);
44674 + if (tv)
44675 + do_vgettimeofday(tv);
44676 + if (tz)
44677 + do_get_tz(tz);
44678 + return 0;
44679 +}
44680 +
44681 +/* This will break when the xtime seconds get inaccurate, but that is
44682 + * unlikely */
44683 +time_t __vsyscall(1) vtime(time_t *t)
44684 +{
44685 + if (unlikely(!__sysctl_vsyscall))
44686 + return time_syscall(t);
44687 + else if (t)
44688 + *t = __xtime.tv_sec;
44689 + return __xtime.tv_sec;
44690 +}
44691 +
44692 +long __vsyscall(2) venosys_0(void)
44693 +{
44694 + return -ENOSYS;
44695 +}
44696 +
44697 +long __vsyscall(3) venosys_1(void)
44698 +{
44699 + return -ENOSYS;
44700 +}
44701 +
44702 +#ifdef CONFIG_SYSCTL
44703 +
44704 +#define SYSCALL 0x050f
44705 +#define NOP2 0x9090
44706 +
44707 +/*
44708 + * NOP out syscall in vsyscall page when not needed.
44709 + */
44710 +static int vsyscall_sysctl_change(ctl_table *ctl, int write, struct file * filp,
44711 + void __user *buffer, size_t *lenp, loff_t *ppos)
44712 +{
44713 + extern u16 vsysc1, vsysc2;
44714 + u16 *map1, *map2;
44715 + int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
44716 + if (!write)
44717 + return ret;
44718 + /* gcc has some trouble with __va(__pa()), so just do it this
44719 + way. */
44720 + map1 = ioremap(__pa_symbol(&vsysc1), 2);
44721 + if (!map1)
44722 + return -ENOMEM;
44723 + map2 = ioremap(__pa_symbol(&vsysc2), 2);
44724 + if (!map2) {
44725 + ret = -ENOMEM;
44726 + goto out;
44727 + }
44728 + if (!sysctl_vsyscall) {
44729 + *map1 = SYSCALL;
44730 + *map2 = SYSCALL;
44731 + } else {
44732 + *map1 = NOP2;
44733 + *map2 = NOP2;
44734 + }
44735 + iounmap(map2);
44736 +out:
44737 + iounmap(map1);
44738 + return ret;
44739 +}
44740 +
44741 +static int vsyscall_sysctl_nostrat(ctl_table *t, int __user *name, int nlen,
44742 + void __user *oldval, size_t __user *oldlenp,
44743 + void __user *newval, size_t newlen,
44744 + void **context)
44745 +{
44746 + return -ENOSYS;
44747 +}
44748 +
44749 +static ctl_table kernel_table2[] = {
44750 + { .ctl_name = 99, .procname = "vsyscall64",
44751 + .data = &sysctl_vsyscall, .maxlen = sizeof(int), .mode = 0644,
44752 + .strategy = vsyscall_sysctl_nostrat,
44753 + .proc_handler = vsyscall_sysctl_change },
44754 + { 0, }
44755 +};
44756 +
44757 +static ctl_table kernel_root_table2[] = {
44758 + { .ctl_name = CTL_KERN, .procname = "kernel", .mode = 0555,
44759 + .child = kernel_table2 },
44760 + { 0 },
44761 +};
44762 +
44763 +#endif
44764 +
44765 +static void __init map_vsyscall(void)
44766 +{
44767 + extern char __vsyscall_0;
44768 + unsigned long physaddr_page0 = __pa_symbol(&__vsyscall_0);
44769 +
44770 + __set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_page0, PAGE_KERNEL_VSYSCALL);
44771 +}
44772 +
44773 +#ifdef CONFIG_XEN
44774 +static void __init map_vsyscall_user(void)
44775 +{
44776 + extern void __set_fixmap_user(enum fixed_addresses, unsigned long, pgprot_t);
44777 + extern char __vsyscall_0;
44778 + unsigned long physaddr_page0 = __pa_symbol(&__vsyscall_0);
44779 +
44780 + __set_fixmap_user(VSYSCALL_FIRST_PAGE, physaddr_page0, PAGE_KERNEL_VSYSCALL);
44781 +}
44782 +#endif
44783 +
44784 +static int __init vsyscall_init(void)
44785 +{
44786 + BUG_ON(((unsigned long) &vgettimeofday !=
44787 + VSYSCALL_ADDR(__NR_vgettimeofday)));
44788 + BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime));
44789 + BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE)));
44790 + map_vsyscall();
44791 +#ifdef CONFIG_XEN
44792 + map_vsyscall_user();
44793 + sysctl_vsyscall = 0; /* disable vgettimeofday() */
44794 +#endif
44795 +#ifdef CONFIG_SYSCTL
44796 + register_sysctl_table(kernel_root_table2, 0);
44797 +#endif
44798 + return 0;
44799 +}
44800 +
44801 +__initcall(vsyscall_init);
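The do_vgettimeofday() routine above extrapolates microseconds from the TSC with a 32.32 fixed-point quotient: usec += ((t - last_tsc) * tsc_quot) >> 32. A user-space sketch of that arithmetic, illustrative only and not part of the patch; the clock rate and cycle delta below are made-up numbers.

#include <stdio.h>

int main(void)
{
	unsigned long long cpu_khz  = 2400000ULL;	/* assumed 2.4 GHz clock */
	unsigned long long tsc_quot = (1000ULL << 32) / cpu_khz;	/* us per cycle, 32.32 fixed point */
	unsigned long long delta    = 1200000ULL;	/* cycles elapsed since the last tick */
	unsigned long long usec     = (delta * tsc_quot) >> 32;

	printf("%llu microseconds\n", usec);	/* ~500 us at 2.4 GHz */
	return 0;
}
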
44802 diff -Nur linux-2.6.16.33-noxen/arch/x86_64/kernel/x8664_ksyms-xen.c linux-2.6.16.33/arch/x86_64/kernel/x8664_ksyms-xen.c
44803 --- linux-2.6.16.33-noxen/arch/x86_64/kernel/x8664_ksyms-xen.c 1970-01-01 00:00:00.000000000 +0000
44804 +++ linux-2.6.16.33/arch/x86_64/kernel/x8664_ksyms-xen.c 2007-01-08 15:00:45.000000000 +0000
44805 @@ -0,0 +1,163 @@
44806 +#include <linux/config.h>
44807 +#include <linux/module.h>
44808 +#include <linux/smp.h>
44809 +#include <linux/user.h>
44810 +#include <linux/sched.h>
44811 +#include <linux/in6.h>
44812 +#include <linux/interrupt.h>
44813 +#include <linux/smp_lock.h>
44814 +#include <linux/pm.h>
44815 +#include <linux/pci.h>
44816 +#include <linux/apm_bios.h>
44817 +#include <linux/kernel.h>
44818 +#include <linux/string.h>
44819 +#include <linux/syscalls.h>
44820 +#include <linux/tty.h>
44821 +
44822 +#include <asm/semaphore.h>
44823 +#include <asm/processor.h>
44824 +#include <asm/i387.h>
44825 +#include <asm/uaccess.h>
44826 +#include <asm/checksum.h>
44827 +#include <asm/io.h>
44828 +#include <asm/delay.h>
44829 +#include <asm/irq.h>
44830 +#include <asm/mmx.h>
44831 +#include <asm/desc.h>
44832 +#include <asm/pgtable.h>
44833 +#include <asm/pgalloc.h>
44834 +#include <asm/nmi.h>
44835 +#include <asm/kdebug.h>
44836 +#include <asm/unistd.h>
44837 +#include <asm/tlbflush.h>
44838 +#include <asm/kdebug.h>
44839 +
44840 +#ifdef CONFIG_SMP
44841 +extern void __write_lock_failed(rwlock_t *rw);
44842 +extern void __read_lock_failed(rwlock_t *rw);
44843 +#endif
44844 +
44845 +/* platform dependent support */
44846 +EXPORT_SYMBOL(boot_cpu_data);
44847 +//EXPORT_SYMBOL(dump_fpu);
44848 +EXPORT_SYMBOL(kernel_thread);
44849 +EXPORT_SYMBOL(pm_idle);
44850 +EXPORT_SYMBOL(pm_power_off);
44851 +
44852 +EXPORT_SYMBOL(__down_failed);
44853 +EXPORT_SYMBOL(__down_failed_interruptible);
44854 +EXPORT_SYMBOL(__down_failed_trylock);
44855 +EXPORT_SYMBOL(__up_wakeup);
44856 +/* Networking helper routines. */
44857 +EXPORT_SYMBOL(csum_partial_copy_nocheck);
44858 +EXPORT_SYMBOL(ip_compute_csum);
44859 +/* Delay loops */
44860 +EXPORT_SYMBOL(__udelay);
44861 +EXPORT_SYMBOL(__ndelay);
44862 +EXPORT_SYMBOL(__delay);
44863 +EXPORT_SYMBOL(__const_udelay);
44864 +
44865 +EXPORT_SYMBOL(__get_user_1);
44866 +EXPORT_SYMBOL(__get_user_2);
44867 +EXPORT_SYMBOL(__get_user_4);
44868 +EXPORT_SYMBOL(__get_user_8);
44869 +EXPORT_SYMBOL(__put_user_1);
44870 +EXPORT_SYMBOL(__put_user_2);
44871 +EXPORT_SYMBOL(__put_user_4);
44872 +EXPORT_SYMBOL(__put_user_8);
44873 +
44874 +EXPORT_SYMBOL(strncpy_from_user);
44875 +EXPORT_SYMBOL(__strncpy_from_user);
44876 +EXPORT_SYMBOL(clear_user);
44877 +EXPORT_SYMBOL(__clear_user);
44878 +EXPORT_SYMBOL(copy_user_generic);
44879 +EXPORT_SYMBOL(copy_from_user);
44880 +EXPORT_SYMBOL(copy_to_user);
44881 +EXPORT_SYMBOL(copy_in_user);
44882 +EXPORT_SYMBOL(strnlen_user);
44883 +
44884 +#ifdef CONFIG_PCI
44885 +EXPORT_SYMBOL(pci_mem_start);
44886 +#endif
44887 +
44888 +EXPORT_SYMBOL(copy_page);
44889 +EXPORT_SYMBOL(clear_page);
44890 +
44891 +EXPORT_SYMBOL(_cpu_pda);
44892 +#ifdef CONFIG_SMP
44893 +EXPORT_SYMBOL(__write_lock_failed);
44894 +EXPORT_SYMBOL(__read_lock_failed);
44895 +
44896 +EXPORT_SYMBOL(smp_call_function);
44897 +#endif
44898 +
44899 +#ifdef CONFIG_VT
44900 +EXPORT_SYMBOL(screen_info);
44901 +#endif
44902 +
44903 +EXPORT_SYMBOL(get_wchan);
44904 +
44905 +#ifdef CONFIG_X86_LOCAL_APIC
44906 +EXPORT_SYMBOL_GPL(set_nmi_callback);
44907 +EXPORT_SYMBOL_GPL(unset_nmi_callback);
44908 +#endif
44909 +
44910 +/* Export string functions. We normally rely on gcc builtin for most of these,
44911 + but gcc sometimes decides not to inline them. */
44912 +#undef memcpy
44913 +#undef memset
44914 +#undef memmove
44915 +#undef strlen
44916 +
44917 +extern void * memset(void *,int,__kernel_size_t);
44918 +extern size_t strlen(const char *);
44919 +extern void * memmove(void * dest,const void *src,size_t count);
44920 +extern void * memcpy(void *,const void *,__kernel_size_t);
44921 +extern void * __memcpy(void *,const void *,__kernel_size_t);
44922 +
44923 +EXPORT_SYMBOL(memset);
44924 +EXPORT_SYMBOL(strlen);
44925 +EXPORT_SYMBOL(memmove);
44926 +EXPORT_SYMBOL(memcpy);
44927 +EXPORT_SYMBOL(__memcpy);
44928 +
44929 +#ifdef CONFIG_RWSEM_XCHGADD_ALGORITHM
44930 +/* prototypes are wrong, these are assembly with custom calling functions */
44931 +extern void rwsem_down_read_failed_thunk(void);
44932 +extern void rwsem_wake_thunk(void);
44933 +extern void rwsem_downgrade_thunk(void);
44934 +extern void rwsem_down_write_failed_thunk(void);
44935 +EXPORT_SYMBOL(rwsem_down_read_failed_thunk);
44936 +EXPORT_SYMBOL(rwsem_wake_thunk);
44937 +EXPORT_SYMBOL(rwsem_downgrade_thunk);
44938 +EXPORT_SYMBOL(rwsem_down_write_failed_thunk);
44939 +#endif
44940 +
44941 +EXPORT_SYMBOL(empty_zero_page);
44942 +
44943 +EXPORT_SYMBOL(die_chain);
44944 +EXPORT_SYMBOL(register_die_notifier);
44945 +
44946 +#ifdef CONFIG_SMP
44947 +EXPORT_SYMBOL(cpu_sibling_map);
44948 +EXPORT_SYMBOL(smp_num_siblings);
44949 +#endif
44950 +
44951 +extern void do_softirq_thunk(void);
44952 +EXPORT_SYMBOL(do_softirq_thunk);
44953 +
44954 +#ifdef CONFIG_BUG
44955 +EXPORT_SYMBOL(out_of_line_bug);
44956 +#endif
44957 +
44958 +EXPORT_SYMBOL(init_level4_pgt);
44959 +
44960 +extern unsigned long __supported_pte_mask;
44961 +EXPORT_SYMBOL(__supported_pte_mask);
44962 +
44963 +#ifdef CONFIG_SMP
44964 +EXPORT_SYMBOL(flush_tlb_page);
44965 +#endif
44966 +
44967 +EXPORT_SYMBOL(load_gs_index);
44968 +
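The EXPORT_SYMBOL list above is what lets loadable modules link against these routines at load time. As a sketch, illustrative only and not part of the patch (the module name and messages are invented), a minimal module built against a 2.6.16-era tree could use two of the exported string helpers like this:

#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/string.h>

static int __init ksyms_demo_init(void)
{
	char buf[16];

	memcpy(buf, "xen", 4);		/* may resolve via EXPORT_SYMBOL(memcpy) */
	printk(KERN_INFO "ksyms_demo: %s, len %zu\n", buf, strlen(buf));
	return 0;
}

static void __exit ksyms_demo_exit(void)
{
	printk(KERN_INFO "ksyms_demo: unloaded\n");
}

module_init(ksyms_demo_init);
module_exit(ksyms_demo_exit);
MODULE_LICENSE("GPL");
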
44969 diff -Nur linux-2.6.16.33-noxen/arch/x86_64/kernel/xen_entry.S linux-2.6.16.33/arch/x86_64/kernel/xen_entry.S
44970 --- linux-2.6.16.33-noxen/arch/x86_64/kernel/xen_entry.S 1970-01-01 00:00:00.000000000 +0000
44971 +++ linux-2.6.16.33/arch/x86_64/kernel/xen_entry.S 2007-01-08 15:00:45.000000000 +0000
44972 @@ -0,0 +1,40 @@
44973 +/*
44974 + * Copied from arch/xen/i386/kernel/entry.S
44975 + */
44976 +/* Offsets into shared_info_t. */
44977 +#define evtchn_upcall_pending /* 0 */
44978 +#define evtchn_upcall_mask 1
44979 +
44980 +#define sizeof_vcpu_shift 6
44981 +
44982 +#ifdef CONFIG_SMP
44983 +//#define preempt_disable(reg) incl threadinfo_preempt_count(reg)
44984 +//#define preempt_enable(reg) decl threadinfo_preempt_count(reg)
44985 +#define preempt_disable(reg)
44986 +#define preempt_enable(reg)
44987 +#define XEN_GET_VCPU_INFO(reg) preempt_disable(%rbp) ; \
44988 + movq %gs:pda_cpunumber,reg ; \
44989 + shl $32, reg ; \
44990 + shr $32-sizeof_vcpu_shift,reg ; \
44991 + addq HYPERVISOR_shared_info,reg
44992 +#define XEN_PUT_VCPU_INFO(reg) preempt_enable(%rbp) ; \
44993 +#define XEN_PUT_VCPU_INFO_fixup .byte 0xff,0xff,0xff
44994 +#else
44995 +#define XEN_GET_VCPU_INFO(reg) movq HYPERVISOR_shared_info,reg
44996 +#define XEN_PUT_VCPU_INFO(reg)
44997 +#define XEN_PUT_VCPU_INFO_fixup
44998 +#endif
44999 +
45000 +#define XEN_LOCKED_BLOCK_EVENTS(reg) movb $1,evtchn_upcall_mask(reg)
45001 +#define XEN_LOCKED_UNBLOCK_EVENTS(reg) movb $0,evtchn_upcall_mask(reg)
45002 +#define XEN_BLOCK_EVENTS(reg) XEN_GET_VCPU_INFO(reg) ; \
45003 + XEN_LOCKED_BLOCK_EVENTS(reg) ; \
45004 + XEN_PUT_VCPU_INFO(reg)
45005 +#define XEN_UNBLOCK_EVENTS(reg) XEN_GET_VCPU_INFO(reg) ; \
45006 + XEN_LOCKED_UNBLOCK_EVENTS(reg) ; \
45007 + XEN_PUT_VCPU_INFO(reg)
45008 +#define XEN_TEST_PENDING(reg) testb $0xFF,evtchn_upcall_pending(reg)
45009 +
45010 +VGCF_IN_SYSCALL = (1<<8)
45011 +
45012 +
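The assembler macros above only ever touch two bytes of the per-VCPU shared-info area: evtchn_upcall_mask to block or unblock event delivery, and evtchn_upcall_pending to test for pending events. Roughly the same logic in C, against a simplified stand-in structure (illustrative only, not part of the patch; the field names follow the Xen public headers, the struct itself is hypothetical):

#include <stdio.h>

struct fake_vcpu_info {
	unsigned char evtchn_upcall_pending;	/* offset 0, as #defined above */
	unsigned char evtchn_upcall_mask;	/* offset 1, as #defined above */
};

static void block_events(struct fake_vcpu_info *v)
{
	v->evtchn_upcall_mask = 1;		/* XEN_LOCKED_BLOCK_EVENTS */
}

static void unblock_events(struct fake_vcpu_info *v)
{
	v->evtchn_upcall_mask = 0;		/* XEN_LOCKED_UNBLOCK_EVENTS */
}

static int events_pending(const struct fake_vcpu_info *v)
{
	return v->evtchn_upcall_pending != 0;	/* XEN_TEST_PENDING */
}

int main(void)
{
	struct fake_vcpu_info v = { .evtchn_upcall_pending = 1 };

	block_events(&v);
	printf("pending=%d mask=%d\n", events_pending(&v), v.evtchn_upcall_mask);
	unblock_events(&v);
	return 0;
}
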
45013 diff -Nur linux-2.6.16.33-noxen/arch/x86_64/mm/Makefile linux-2.6.16.33/arch/x86_64/mm/Makefile
45014 --- linux-2.6.16.33-noxen/arch/x86_64/mm/Makefile 2006-11-22 18:06:31.000000000 +0000
45015 +++ linux-2.6.16.33/arch/x86_64/mm/Makefile 2007-01-08 15:00:45.000000000 +0000
45016 @@ -9,3 +9,13 @@
45017 obj-$(CONFIG_ACPI_NUMA) += srat.o
45018
45019 hugetlbpage-y = ../../i386/mm/hugetlbpage.o
45020 +
45021 +ifdef CONFIG_XEN
45022 +include $(srctree)/scripts/Makefile.xen
45023 +
45024 +ioremap-y += ../../i386/mm/ioremap-xen.o
45025 +hypervisor-y += ../../i386/mm/hypervisor.o
45026 +obj-y += hypervisor.o
45027 +
45028 +obj-y := $(call cherrypickxen, $(obj-y))
45029 +endif
45030 diff -Nur linux-2.6.16.33-noxen/arch/x86_64/mm/fault-xen.c linux-2.6.16.33/arch/x86_64/mm/fault-xen.c
45031 --- linux-2.6.16.33-noxen/arch/x86_64/mm/fault-xen.c 1970-01-01 00:00:00.000000000 +0000
45032 +++ linux-2.6.16.33/arch/x86_64/mm/fault-xen.c 2007-01-08 15:00:45.000000000 +0000
45033 @@ -0,0 +1,641 @@
45034 +/*
45035 + * linux/arch/x86-64/mm/fault.c
45036 + *
45037 + * Copyright (C) 1995 Linus Torvalds
45038 + * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs.
45039 + */
45040 +
45041 +#include <linux/config.h>
45042 +#include <linux/signal.h>
45043 +#include <linux/sched.h>
45044 +#include <linux/kernel.h>
45045 +#include <linux/errno.h>
45046 +#include <linux/string.h>
45047 +#include <linux/types.h>
45048 +#include <linux/ptrace.h>
45049 +#include <linux/mman.h>
45050 +#include <linux/mm.h>
45051 +#include <linux/smp.h>
45052 +#include <linux/smp_lock.h>
45053 +#include <linux/interrupt.h>
45054 +#include <linux/init.h>
45055 +#include <linux/tty.h>
45056 +#include <linux/vt_kern.h> /* For unblank_screen() */
45057 +#include <linux/compiler.h>
45058 +#include <linux/module.h>
45059 +#include <linux/kprobes.h>
45060 +
45061 +#include <asm/system.h>
45062 +#include <asm/uaccess.h>
45063 +#include <asm/pgalloc.h>
45064 +#include <asm/smp.h>
45065 +#include <asm/tlbflush.h>
45066 +#include <asm/proto.h>
45067 +#include <asm/kdebug.h>
45068 +#include <asm-generic/sections.h>
45069 +
45070 +/* Page fault error code bits */
45071 +#define PF_PROT (1<<0) /* or no page found */
45072 +#define PF_WRITE (1<<1)
45073 +#define PF_USER (1<<2)
45074 +#define PF_RSVD (1<<3)
45075 +#define PF_INSTR (1<<4)
45076 +
45077 +void bust_spinlocks(int yes)
45078 +{
45079 + int loglevel_save = console_loglevel;
45080 + if (yes) {
45081 + oops_in_progress = 1;
45082 + } else {
45083 +#ifdef CONFIG_VT
45084 + unblank_screen();
45085 +#endif
45086 + oops_in_progress = 0;
45087 + /*
45088 + * OK, the message is on the console. Now we call printk()
45089 + * without oops_in_progress set so that printk will give klogd
45090 + * a poke. Hold onto your hats...
45091 + */
45092 + console_loglevel = 15; /* NMI oopser may have shut the console up */
45093 + printk(" ");
45094 + console_loglevel = loglevel_save;
45095 + }
45096 +}
45097 +
45098 +/* Sometimes the CPU reports invalid exceptions on prefetch.
45099 + Check that here and ignore.
45100 + Opcode checker based on code by Richard Brunner */
45101 +static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr,
45102 + unsigned long error_code)
45103 +{
45104 + unsigned char *instr;
45105 + int scan_more = 1;
45106 + int prefetch = 0;
45107 + unsigned char *max_instr;
45108 +
45109 + /* If it was an exec fault, ignore it */
45110 + if (error_code & PF_INSTR)
45111 + return 0;
45112 +
45113 + instr = (unsigned char *)convert_rip_to_linear(current, regs);
45114 + max_instr = instr + 15;
45115 +
45116 + if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
45117 + return 0;
45118 +
45119 + while (scan_more && instr < max_instr) {
45120 + unsigned char opcode;
45121 + unsigned char instr_hi;
45122 + unsigned char instr_lo;
45123 +
45124 + if (__get_user(opcode, instr))
45125 + break;
45126 +
45127 + instr_hi = opcode & 0xf0;
45128 + instr_lo = opcode & 0x0f;
45129 + instr++;
45130 +
45131 + switch (instr_hi) {
45132 + case 0x20:
45133 + case 0x30:
45134 + /* Values 0x26,0x2E,0x36,0x3E are valid x86
45135 + prefixes. In long mode, the CPU will signal
45136 + invalid opcode if some of these prefixes are
45137 + present so we will never get here anyway */
45138 + scan_more = ((instr_lo & 7) == 0x6);
45139 + break;
45140 +
45141 + case 0x40:
45142 + /* In AMD64 long mode, 0x40 to 0x4F are valid REX prefixes
45143 + Need to figure out under what instruction mode the
45144 + instruction was issued ... */
45145 + /* Could check the LDT for lm, but for now it's good
45146 + enough to assume that long mode only uses well known
45147 + segments or kernel. */
45148 + scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS);
45149 + break;
45150 +
45151 + case 0x60:
45152 + /* 0x64 thru 0x67 are valid prefixes in all modes. */
45153 + scan_more = (instr_lo & 0xC) == 0x4;
45154 + break;
45155 + case 0xF0:
45156 + /* 0xF0, 0xF2, and 0xF3 are valid prefixes in all modes. */
45157 + scan_more = !instr_lo || (instr_lo>>1) == 1;
45158 + break;
45159 + case 0x00:
45160 + /* Prefetch instruction is 0x0F0D or 0x0F18 */
45161 + scan_more = 0;
45162 + if (__get_user(opcode, instr))
45163 + break;
45164 + prefetch = (instr_lo == 0xF) &&
45165 + (opcode == 0x0D || opcode == 0x18);
45166 + break;
45167 + default:
45168 + scan_more = 0;
45169 + break;
45170 + }
45171 + }
45172 + return prefetch;
45173 +}
45174 +
45175 +static int bad_address(void *p)
45176 +{
45177 + unsigned long dummy;
45178 + return __get_user(dummy, (unsigned long *)p);
45179 +}
45180 +
45181 +void dump_pagetable(unsigned long address)
45182 +{
45183 + pgd_t *pgd;
45184 + pud_t *pud;
45185 + pmd_t *pmd;
45186 + pte_t *pte;
45187 +
45188 + pgd = __va(read_cr3() & PHYSICAL_PAGE_MASK);
45189 + pgd += pgd_index(address);
45190 + if (bad_address(pgd)) goto bad;
45191 + printk("PGD %lx ", pgd_val(*pgd));
45192 + if (!pgd_present(*pgd)) goto ret;
45193 +
45194 + pud = __pud_offset_k((pud_t *)pgd_page(*pgd), address);
45195 + if (bad_address(pud)) goto bad;
45196 + printk("PUD %lx ", pud_val(*pud));
45197 + if (!pud_present(*pud)) goto ret;
45198 +
45199 + pmd = pmd_offset(pud, address);
45200 + if (bad_address(pmd)) goto bad;
45201 + printk("PMD %lx ", pmd_val(*pmd));
45202 + if (!pmd_present(*pmd)) goto ret;
45203 +
45204 + pte = pte_offset_kernel(pmd, address);
45205 + if (bad_address(pte)) goto bad;
45206 + printk("PTE %lx", pte_val(*pte));
45207 +ret:
45208 + printk("\n");
45209 + return;
45210 +bad:
45211 + printk("BAD\n");
45212 +}
45213 +
45214 +static const char errata93_warning[] =
45215 +KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
45216 +KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
45217 +KERN_ERR "******* Please consider a BIOS update.\n"
45218 +KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";
45219 +
45220 +/* Workaround for K8 erratum #93 & buggy BIOS.
45221 + BIOS SMM functions are required to use a specific workaround
45222 + to avoid corruption of the 64bit RIP register on C stepping K8.
45223 + A lot of BIOS that didn't get tested properly miss this.
45224 + The OS sees this as a page fault with the upper 32bits of RIP cleared.
45225 + Try to work around it here.
45226 + Note we only handle faults in kernel here. */
45227 +
45228 +static int is_errata93(struct pt_regs *regs, unsigned long address)
45229 +{
45230 + static int warned;
45231 + if (address != regs->rip)
45232 + return 0;
45233 + if ((address >> 32) != 0)
45234 + return 0;
45235 + address |= 0xffffffffUL << 32;
45236 + if ((address >= (u64)_stext && address <= (u64)_etext) ||
45237 + (address >= MODULES_VADDR && address <= MODULES_END)) {
45238 + if (!warned) {
45239 + printk(errata93_warning);
45240 + warned = 1;
45241 + }
45242 + regs->rip = address;
45243 + return 1;
45244 + }
45245 + return 0;
45246 +}
45247 +
45248 +int unhandled_signal(struct task_struct *tsk, int sig)
45249 +{
45250 + if (tsk->pid == 1)
45251 + return 1;
45252 + if (tsk->ptrace & PT_PTRACED)
45253 + return 0;
45254 + return (tsk->sighand->action[sig-1].sa.sa_handler == SIG_IGN) ||
45255 + (tsk->sighand->action[sig-1].sa.sa_handler == SIG_DFL);
45256 +}
45257 +
45258 +static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
45259 + unsigned long error_code)
45260 +{
45261 + unsigned long flags = oops_begin();
45262 + struct task_struct *tsk;
45263 +
45264 + printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
45265 + current->comm, address);
45266 + dump_pagetable(address);
45267 + tsk = current;
45268 + tsk->thread.cr2 = address;
45269 + tsk->thread.trap_no = 14;
45270 + tsk->thread.error_code = error_code;
45271 + __die("Bad pagetable", regs, error_code);
45272 + oops_end(flags);
45273 + do_exit(SIGKILL);
45274 +}
45275 +
45276 +/*
45277 + * Handle a fault on the vmalloc area
45278 + *
45279 + * This assumes no large pages in there.
45280 + */
45281 +static int vmalloc_fault(unsigned long address)
45282 +{
45283 + pgd_t *pgd, *pgd_ref;
45284 + pud_t *pud, *pud_ref;
45285 + pmd_t *pmd, *pmd_ref;
45286 + pte_t *pte, *pte_ref;
45287 +
45288 + /* Copy kernel mappings over when needed. This can also
45289 + happen within a race in page table update. In the latter
45290 + case just flush. */
45291 +
45292 + /* On Xen the line below does not always work. Needs investigating! */
45293 + /*pgd = pgd_offset(current->mm ?: &init_mm, address);*/
45294 + pgd = __va(read_cr3() & PHYSICAL_PAGE_MASK);
45295 + pgd += pgd_index(address);
45296 + pgd_ref = pgd_offset_k(address);
45297 + if (pgd_none(*pgd_ref))
45298 + return -1;
45299 + if (pgd_none(*pgd))
45300 + set_pgd(pgd, *pgd_ref);
45301 +
45302 + /* Below here mismatches are bugs because these lower tables
45303 + are shared */
45304 +
45305 + pud = pud_offset(pgd, address);
45306 + pud_ref = pud_offset(pgd_ref, address);
45307 + if (pud_none(*pud_ref))
45308 + return -1;
45309 + if (pud_none(*pud) || pud_page(*pud) != pud_page(*pud_ref))
45310 + BUG();
45311 + pmd = pmd_offset(pud, address);
45312 + pmd_ref = pmd_offset(pud_ref, address);
45313 + if (pmd_none(*pmd_ref))
45314 + return -1;
45315 + if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
45316 + BUG();
45317 + pte_ref = pte_offset_kernel(pmd_ref, address);
45318 + if (!pte_present(*pte_ref))
45319 + return -1;
45320 + pte = pte_offset_kernel(pmd, address);
45321 + /* Don't use pte_page here, because the mappings can point
45322 + outside mem_map, and the NUMA hash lookup cannot handle
45323 + that. */
45324 + if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
45325 + BUG();
45326 + return 0;
45327 +}
45328 +
45329 +int page_fault_trace = 0;
45330 +int exception_trace = 1;
45331 +
45332 +
45333 +#define MEM_VERBOSE 1
45334 +
45335 +#ifdef MEM_VERBOSE
45336 +#define MEM_LOG(_f, _a...) \
45337 + printk("fault.c:[%d]-> " _f "\n", \
45338 + __LINE__ , ## _a )
45339 +#else
45340 +#define MEM_LOG(_f, _a...) ((void)0)
45341 +#endif
45342 +
45343 +static int spurious_fault(struct pt_regs *regs,
45344 + unsigned long address,
45345 + unsigned long error_code)
45346 +{
45347 + pgd_t *pgd;
45348 + pud_t *pud;
45349 + pmd_t *pmd;
45350 + pte_t *pte;
45351 +
45352 +#ifdef CONFIG_XEN
45353 + /* Faults in hypervisor area are never spurious. */
45354 + if ((address >= HYPERVISOR_VIRT_START) &&
45355 + (address < HYPERVISOR_VIRT_END))
45356 + return 0;
45357 +#endif
45358 +
45359 + /* Reserved-bit violation or user access to kernel space? */
45360 + if (error_code & (PF_RSVD|PF_USER))
45361 + return 0;
45362 +
45363 + pgd = init_mm.pgd + pgd_index(address);
45364 + if (!pgd_present(*pgd))
45365 + return 0;
45366 +
45367 + pud = pud_offset(pgd, address);
45368 + if (!pud_present(*pud))
45369 + return 0;
45370 +
45371 + pmd = pmd_offset(pud, address);
45372 + if (!pmd_present(*pmd))
45373 + return 0;
45374 +
45375 + pte = pte_offset_kernel(pmd, address);
45376 + if (!pte_present(*pte))
45377 + return 0;
45378 + if ((error_code & PF_WRITE) && !pte_write(*pte))
45379 + return 0;
45380 + if ((error_code & PF_INSTR) && (pte_val(*pte) & _PAGE_NX))
45381 + return 0;
45382 +
45383 + return 1;
45384 +}
45385 +
45386 +/*
45387 + * This routine handles page faults. It determines the address,
45388 + * and the problem, and then passes it off to one of the appropriate
45389 + * routines.
45390 + */
45391 +asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
45392 + unsigned long error_code)
45393 +{
45394 + struct task_struct *tsk;
45395 + struct mm_struct *mm;
45396 + struct vm_area_struct * vma;
45397 + unsigned long address;
45398 + const struct exception_table_entry *fixup;
45399 + int write;
45400 + unsigned long flags;
45401 + siginfo_t info;
45402 +
45403 + if (!user_mode(regs))
45404 + error_code &= ~PF_USER; /* means kernel */
45405 +
45406 + /* get the address */
45407 + address = HYPERVISOR_shared_info->vcpu_info[
45408 + smp_processor_id()].arch.cr2;
45409 + if (notify_die(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
45410 + SIGSEGV) == NOTIFY_STOP)
45411 + return;
45412 +
45413 + if (likely(regs->eflags & X86_EFLAGS_IF))
45414 + local_irq_enable();
45415 +
45416 + if (unlikely(page_fault_trace))
45417 + printk("pagefault rip:%lx rsp:%lx cs:%lu ss:%lu address %lx error %lx\n",
45418 + regs->rip,regs->rsp,regs->cs,regs->ss,address,error_code);
45419 +
45420 + tsk = current;
45421 + mm = tsk->mm;
45422 + info.si_code = SEGV_MAPERR;
45423 +
45424 +
45425 + /*
45426 + * We fault-in kernel-space virtual memory on-demand. The
45427 + * 'reference' page table is init_mm.pgd.
45428 + *
45429 + * NOTE! We MUST NOT take any locks for this case. We may
45430 + * be in an interrupt or a critical region, and should
45431 + * only copy the information from the master page table,
45432 + * nothing more.
45433 + *
45434 + * This verifies that the fault happens in kernel space
45435 + * (error_code & 4) == 0, and that the fault was not a
45436 + * protection error (error_code & 9) == 0.
45437 + */
45438 + if (unlikely(address >= TASK_SIZE64)) {
45439 + /*
45440 + * Don't check for the module range here: its PML4
45441 + * is always initialized because it's shared with the main
45442 + * kernel text. Only vmalloc may need PML4 syncups.
45443 + */
45444 + if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
45445 + ((address >= VMALLOC_START && address < VMALLOC_END))) {
45446 + if (vmalloc_fault(address) < 0)
45447 + goto bad_area_nosemaphore;
45448 + return;
45449 + }
45450 + /* Can take a spurious fault if mapping changes R/O -> R/W. */
45451 + if (spurious_fault(regs, address, error_code))
45452 + return;
45453 + /*
45454 + * Don't take the mm semaphore here. If we fixup a prefetch
45455 + * fault we could otherwise deadlock.
45456 + */
45457 + goto bad_area_nosemaphore;
45458 + }
45459 +
45460 + if (unlikely(error_code & PF_RSVD))
45461 + pgtable_bad(address, regs, error_code);
45462 +
45463 + /*
45464 + * If we're in an interrupt or have no user
45465 + * context, we must not take the fault..
45466 + */
45467 + if (unlikely(in_atomic() || !mm))
45468 + goto bad_area_nosemaphore;
45469 +
45470 + again:
45471 + /* When running in the kernel we expect faults to occur only to
45472 + * addresses in user space. All other faults represent errors in the
45473 + * kernel and should generate an OOPS. Unfortunately, in the case of an
45474 + * erroneous fault occurring in a code path which already holds mmap_sem
45475 + * we will deadlock attempting to validate the fault against the
45476 + * address space. Luckily the kernel only validly references user
45477 + * space from well defined areas of code, which are listed in the
45478 + * exceptions table.
45479 + *
45480 + * As the vast majority of faults will be valid we will only perform
45481 + * the source reference check when there is a possibility of a deadlock.
45482 + * Attempt to lock the address space, if we cannot we then validate the
45483 + * source. If this is invalid we can skip the address space check,
45484 + * thus avoiding the deadlock.
45485 + */
45486 + if (!down_read_trylock(&mm->mmap_sem)) {
45487 + if ((error_code & PF_USER) == 0 &&
45488 + !search_exception_tables(regs->rip))
45489 + goto bad_area_nosemaphore;
45490 + down_read(&mm->mmap_sem);
45491 + }
45492 +
45493 + vma = find_vma(mm, address);
45494 + if (!vma)
45495 + goto bad_area;
45496 + if (likely(vma->vm_start <= address))
45497 + goto good_area;
45498 + if (!(vma->vm_flags & VM_GROWSDOWN))
45499 + goto bad_area;
45500 + if (error_code & 4) {
45501 + // XXX: align red zone size with ABI
45502 + if (address + 128 < regs->rsp)
45503 + goto bad_area;
45504 + }
45505 + if (expand_stack(vma, address))
45506 + goto bad_area;
45507 +/*
45508 + * Ok, we have a good vm_area for this memory access, so
45509 + * we can handle it..
45510 + */
45511 +good_area:
45512 + info.si_code = SEGV_ACCERR;
45513 + write = 0;
45514 + switch (error_code & (PF_PROT|PF_WRITE)) {
45515 + default: /* 3: write, present */
45516 + /* fall through */
45517 + case PF_WRITE: /* write, not present */
45518 + if (!(vma->vm_flags & VM_WRITE))
45519 + goto bad_area;
45520 + write++;
45521 + break;
45522 + case PF_PROT: /* read, present */
45523 + goto bad_area;
45524 + case 0: /* read, not present */
45525 + if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
45526 + goto bad_area;
45527 + }
45528 +
45529 + /*
45530 + * If for any reason at all we couldn't handle the fault,
45531 + * make sure we exit gracefully rather than endlessly redo
45532 + * the fault.
45533 + */
45534 + switch (handle_mm_fault(mm, vma, address, write)) {
45535 + case VM_FAULT_MINOR:
45536 + tsk->min_flt++;
45537 + break;
45538 + case VM_FAULT_MAJOR:
45539 + tsk->maj_flt++;
45540 + break;
45541 + case VM_FAULT_SIGBUS:
45542 + goto do_sigbus;
45543 + default:
45544 + goto out_of_memory;
45545 + }
45546 +
45547 + up_read(&mm->mmap_sem);
45548 + return;
45549 +
45550 +/*
45551 + * Something tried to access memory that isn't in our memory map..
45552 + * Fix it, but check if it's kernel or user first..
45553 + */
45554 +bad_area:
45555 + up_read(&mm->mmap_sem);
45556 +
45557 +bad_area_nosemaphore:
45558 + /* User mode accesses just cause a SIGSEGV */
45559 + if (error_code & PF_USER) {
45560 + if (is_prefetch(regs, address, error_code))
45561 + return;
45562 +
45563 + /* Work around K8 erratum #100 K8 in compat mode
45564 + occasionally jumps to illegal addresses >4GB. We
45565 + catch this here in the page fault handler because
45566 + these addresses are not reachable. Just detect this
45567 + case and return. Any code segment in LDT is
45568 + compatibility mode. */
45569 + if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) &&
45570 + (address >> 32))
45571 + return;
45572 +
45573 + if (exception_trace && unhandled_signal(tsk, SIGSEGV)) {
45574 + printk(
45575 + "%s%s[%d]: segfault at %016lx rip %016lx rsp %016lx error %lx\n",
45576 + tsk->pid > 1 ? KERN_INFO : KERN_EMERG,
45577 + tsk->comm, tsk->pid, address, regs->rip,
45578 + regs->rsp, error_code);
45579 + }
45580 +
45581 + tsk->thread.cr2 = address;
45582 + /* Kernel addresses are always protection faults */
45583 + tsk->thread.error_code = error_code | (address >= TASK_SIZE);
45584 + tsk->thread.trap_no = 14;
45585 + info.si_signo = SIGSEGV;
45586 + info.si_errno = 0;
45587 + /* info.si_code has been set above */
45588 + info.si_addr = (void __user *)address;
45589 + force_sig_info(SIGSEGV, &info, tsk);
45590 + return;
45591 + }
45592 +
45593 +no_context:
45594 +
45595 + /* Are we prepared to handle this kernel fault? */
45596 + fixup = search_exception_tables(regs->rip);
45597 + if (fixup) {
45598 + regs->rip = fixup->fixup;
45599 + return;
45600 + }
45601 +
45602 + /*
45603 + * Hall of shame of CPU/BIOS bugs.
45604 + */
45605 +
45606 + if (is_prefetch(regs, address, error_code))
45607 + return;
45608 +
45609 + if (is_errata93(regs, address))
45610 + return;
45611 +
45612 +/*
45613 + * Oops. The kernel tried to access some bad page. We'll have to
45614 + * terminate things with extreme prejudice.
45615 + */
45616 +
45617 + flags = oops_begin();
45618 +
45619 + if (address < PAGE_SIZE)
45620 + printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference");
45621 + else
45622 + printk(KERN_ALERT "Unable to handle kernel paging request");
45623 + printk(" at %016lx RIP: \n" KERN_ALERT,address);
45624 + printk_address(regs->rip);
45625 + printk("\n");
45626 + dump_pagetable(address);
45627 + tsk->thread.cr2 = address;
45628 + tsk->thread.trap_no = 14;
45629 + tsk->thread.error_code = error_code;
45630 + __die("Oops", regs, error_code);
45631 + /* Executive summary in case the body of the oops scrolled away */
45632 + printk(KERN_EMERG "CR2: %016lx\n", address);
45633 + oops_end(flags);
45634 + do_exit(SIGKILL);
45635 +
45636 +/*
45637 + * We ran out of memory, or some other thing happened to us that made
45638 + * us unable to handle the page fault gracefully.
45639 + */
45640 +out_of_memory:
45641 + up_read(&mm->mmap_sem);
45642 + if (current->pid == 1) {
45643 + yield();
45644 + goto again;
45645 + }
45646 + printk("VM: killing process %s\n", tsk->comm);
45647 + if (error_code & 4)
45648 + do_exit(SIGKILL);
45649 + goto no_context;
45650 +
45651 +do_sigbus:
45652 + up_read(&mm->mmap_sem);
45653 +
45654 + /* Kernel mode? Handle exceptions or die */
45655 + if (!(error_code & PF_USER))
45656 + goto no_context;
45657 +
45658 + tsk->thread.cr2 = address;
45659 + tsk->thread.error_code = error_code;
45660 + tsk->thread.trap_no = 14;
45661 + info.si_signo = SIGBUS;
45662 + info.si_errno = 0;
45663 + info.si_code = BUS_ADRERR;
45664 + info.si_addr = (void __user *)address;
45665 + force_sig_info(SIGBUS, &info, tsk);
45666 + return;
45667 +}
45668 +
45669 +static int __init enable_pagefaulttrace(char *str)
45670 +{
45671 + page_fault_trace = 1;
45672 + return 0;
45673 +}
45674 +__setup("pagefaulttrace", enable_pagefaulttrace);
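fault-xen.c above branches on the hardware page-fault error-code bits PF_PROT, PF_WRITE, PF_USER, PF_RSVD and PF_INSTR. A small stand-alone decoder for those bits, illustrative only and not part of the patch:

#include <stdio.h>

#define PF_PROT  (1 << 0)	/* 0: not-present page, 1: protection violation */
#define PF_WRITE (1 << 1)	/* 0: read access,      1: write access */
#define PF_USER  (1 << 2)	/* 0: kernel mode,      1: user mode */
#define PF_RSVD  (1 << 3)	/* reserved bit set in a paging entry */
#define PF_INSTR (1 << 4)	/* fault on an instruction fetch */

static void decode(unsigned long error_code)
{
	printf("%s-mode %s, %s%s%s\n",
	       error_code & PF_USER  ? "user" : "kernel",
	       error_code & PF_WRITE ? "write" : "read",
	       error_code & PF_PROT  ? "protection violation" : "page not present",
	       error_code & PF_RSVD  ? ", reserved bit set" : "",
	       error_code & PF_INSTR ? ", instruction fetch" : "");
}

int main(void)
{
	decode(PF_USER | PF_WRITE);	/* user-mode write to a missing page */
	decode(PF_PROT | PF_INSTR);	/* kernel fetch from a protected page */
	return 0;
}
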
45675 diff -Nur linux-2.6.16.33-noxen/arch/x86_64/mm/init-xen.c linux-2.6.16.33/arch/x86_64/mm/init-xen.c
45676 --- linux-2.6.16.33-noxen/arch/x86_64/mm/init-xen.c 1970-01-01 00:00:00.000000000 +0000
45677 +++ linux-2.6.16.33/arch/x86_64/mm/init-xen.c 2007-01-08 15:00:45.000000000 +0000
45678 @@ -0,0 +1,1162 @@
45679 +/*
45680 + * linux/arch/x86_64/mm/init.c
45681 + *
45682 + * Copyright (C) 1995 Linus Torvalds
45683 + * Copyright (C) 2000 Pavel Machek <pavel@suse.cz>
45684 + * Copyright (C) 2002,2003 Andi Kleen <ak@suse.de>
45685 + *
45686 + * Jun Nakajima <jun.nakajima@intel.com>
45687 + * Modified for Xen.
45688 + */
45689 +
45690 +#include <linux/config.h>
45691 +#include <linux/signal.h>
45692 +#include <linux/sched.h>
45693 +#include <linux/kernel.h>
45694 +#include <linux/errno.h>
45695 +#include <linux/string.h>
45696 +#include <linux/types.h>
45697 +#include <linux/ptrace.h>
45698 +#include <linux/mman.h>
45699 +#include <linux/mm.h>
45700 +#include <linux/swap.h>
45701 +#include <linux/smp.h>
45702 +#include <linux/init.h>
45703 +#include <linux/pagemap.h>
45704 +#include <linux/bootmem.h>
45705 +#include <linux/proc_fs.h>
45706 +#include <linux/pci.h>
45707 +#include <linux/dma-mapping.h>
45708 +#include <linux/module.h>
45709 +#include <linux/memory_hotplug.h>
45710 +
45711 +#include <asm/processor.h>
45712 +#include <asm/system.h>
45713 +#include <asm/uaccess.h>
45714 +#include <asm/pgtable.h>
45715 +#include <asm/pgalloc.h>
45716 +#include <asm/dma.h>
45717 +#include <asm/fixmap.h>
45718 +#include <asm/e820.h>
45719 +#include <asm/apic.h>
45720 +#include <asm/tlb.h>
45721 +#include <asm/mmu_context.h>
45722 +#include <asm/proto.h>
45723 +#include <asm/smp.h>
45724 +#include <asm/sections.h>
45725 +#include <asm/dma-mapping.h>
45726 +#include <asm/swiotlb.h>
45727 +
45728 +#include <xen/features.h>
45729 +
45730 +#ifndef Dprintk
45731 +#define Dprintk(x...)
45732 +#endif
45733 +
45734 +struct dma_mapping_ops* dma_ops;
45735 +EXPORT_SYMBOL(dma_ops);
45736 +
45737 +#ifdef CONFIG_XEN_COMPAT_030002
45738 +unsigned int __kernel_page_user;
45739 +EXPORT_SYMBOL(__kernel_page_user);
45740 +#endif
45741 +
45742 +extern unsigned long *contiguous_bitmap;
45743 +
45744 +static unsigned long dma_reserve __initdata;
45745 +
45746 +DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
45747 +extern unsigned long start_pfn;
45748 +
45749 +/*
45750 + * Use this until direct mapping is established, i.e. before __va() is
45751 + * available in init_memory_mapping().
45752 + */
45753 +
45754 +#define addr_to_page(addr, page) \
45755 + (addr) &= PHYSICAL_PAGE_MASK; \
45756 + (page) = ((unsigned long *) ((unsigned long) \
45757 + (((mfn_to_pfn((addr) >> PAGE_SHIFT)) << PAGE_SHIFT) + \
45758 + __START_KERNEL_map)))
45759 +
45760 +static void early_make_page_readonly(void *va, unsigned int feature)
45761 +{
45762 + unsigned long addr, _va = (unsigned long)va;
45763 + pte_t pte, *ptep;
45764 + unsigned long *page = (unsigned long *) init_level4_pgt;
45765 +
45766 + if (xen_feature(feature))
45767 + return;
45768 +
45769 + addr = (unsigned long) page[pgd_index(_va)];
45770 + addr_to_page(addr, page);
45771 +
45772 + addr = page[pud_index(_va)];
45773 + addr_to_page(addr, page);
45774 +
45775 + addr = page[pmd_index(_va)];
45776 + addr_to_page(addr, page);
45777 +
45778 + ptep = (pte_t *) &page[pte_index(_va)];
45779 +
45780 + pte.pte = ptep->pte & ~_PAGE_RW;
45781 + if (HYPERVISOR_update_va_mapping(_va, pte, 0))
45782 + BUG();
45783 +}
45784 +
45785 +void make_page_readonly(void *va, unsigned int feature)
45786 +{
45787 + pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t pte, *ptep;
45788 + unsigned long addr = (unsigned long) va;
45789 +
45790 + if (xen_feature(feature))
45791 + return;
45792 +
45793 + pgd = pgd_offset_k(addr);
45794 + pud = pud_offset(pgd, addr);
45795 + pmd = pmd_offset(pud, addr);
45796 + ptep = pte_offset_kernel(pmd, addr);
45797 +
45798 + pte.pte = ptep->pte & ~_PAGE_RW;
45799 + if (HYPERVISOR_update_va_mapping(addr, pte, 0))
45800 + xen_l1_entry_update(ptep, pte); /* fallback */
45801 +
45802 + if ((addr >= VMALLOC_START) && (addr < VMALLOC_END))
45803 + make_page_readonly(__va(pte_pfn(pte) << PAGE_SHIFT), feature);
45804 +}
45805 +
45806 +void make_page_writable(void *va, unsigned int feature)
45807 +{
45808 + pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t pte, *ptep;
45809 + unsigned long addr = (unsigned long) va;
45810 +
45811 + if (xen_feature(feature))
45812 + return;
45813 +
45814 + pgd = pgd_offset_k(addr);
45815 + pud = pud_offset(pgd, addr);
45816 + pmd = pmd_offset(pud, addr);
45817 + ptep = pte_offset_kernel(pmd, addr);
45818 +
45819 + pte.pte = ptep->pte | _PAGE_RW;
45820 + if (HYPERVISOR_update_va_mapping(addr, pte, 0))
45821 + xen_l1_entry_update(ptep, pte); /* fallback */
45822 +
45823 + if ((addr >= VMALLOC_START) && (addr < VMALLOC_END))
45824 + make_page_writable(__va(pte_pfn(pte) << PAGE_SHIFT), feature);
45825 +}
45826 +
45827 +void make_pages_readonly(void *va, unsigned nr, unsigned int feature)
45828 +{
45829 + if (xen_feature(feature))
45830 + return;
45831 +
45832 + while (nr-- != 0) {
45833 + make_page_readonly(va, feature);
45834 + va = (void*)((unsigned long)va + PAGE_SIZE);
45835 + }
45836 +}
45837 +
45838 +void make_pages_writable(void *va, unsigned nr, unsigned int feature)
45839 +{
45840 + if (xen_feature(feature))
45841 + return;
45842 +
45843 + while (nr-- != 0) {
45844 + make_page_writable(va, feature);
45845 + va = (void*)((unsigned long)va + PAGE_SIZE);
45846 + }
45847 +}
45848 +
45849 +/*
45850 + * NOTE: pagetable_init allocates all the fixmap pagetables contiguously
45851 + * in physical space so we can cache the place of the first one and move
45852 + * around without checking the pgd every time.
45853 + */
45854 +
45855 +void show_mem(void)
45856 +{
45857 + long i, total = 0, reserved = 0;
45858 + long shared = 0, cached = 0;
45859 + pg_data_t *pgdat;
45860 + struct page *page;
45861 +
45862 + printk(KERN_INFO "Mem-info:\n");
45863 + show_free_areas();
45864 + printk(KERN_INFO "Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
45865 +
45866 + for_each_pgdat(pgdat) {
45867 + for (i = 0; i < pgdat->node_spanned_pages; ++i) {
45868 + page = pfn_to_page(pgdat->node_start_pfn + i);
45869 + total++;
45870 + if (PageReserved(page))
45871 + reserved++;
45872 + else if (PageSwapCache(page))
45873 + cached++;
45874 + else if (page_count(page))
45875 + shared += page_count(page) - 1;
45876 + }
45877 + }
45878 + printk(KERN_INFO "%lu pages of RAM\n", total);
45879 + printk(KERN_INFO "%lu reserved pages\n",reserved);
45880 + printk(KERN_INFO "%lu pages shared\n",shared);
45881 + printk(KERN_INFO "%lu pages swap cached\n",cached);
45882 +}
45883 +
45884 +/* References to section boundaries */
45885 +
45886 +int after_bootmem;
45887 +
45888 +static void *spp_getpage(void)
45889 +{
45890 + void *ptr;
45891 + if (after_bootmem)
45892 + ptr = (void *) get_zeroed_page(GFP_ATOMIC);
45893 + else
45894 + ptr = alloc_bootmem_pages(PAGE_SIZE);
45895 + if (!ptr || ((unsigned long)ptr & ~PAGE_MASK))
45896 + panic("set_pte_phys: cannot allocate page data %s\n", after_bootmem?"after bootmem":"");
45897 +
45898 + Dprintk("spp_getpage %p\n", ptr);
45899 + return ptr;
45900 +}
45901 +
45902 +#define pgd_offset_u(address) (pgd_t *)(init_level4_user_pgt + pgd_index(address))
45903 +
45904 +static inline pud_t *pud_offset_u(unsigned long address)
45905 +{
45906 + pud_t *pud = level3_user_pgt;
45907 +
45908 + return pud + pud_index(address);
45909 +}
45910 +
45911 +static void set_pte_phys(unsigned long vaddr,
45912 + unsigned long phys, pgprot_t prot, int user_mode)
45913 +{
45914 + pgd_t *pgd;
45915 + pud_t *pud;
45916 + pmd_t *pmd;
45917 + pte_t *pte, new_pte;
45918 +
45919 + Dprintk("set_pte_phys %lx to %lx\n", vaddr, phys);
45920 +
45921 + pgd = (user_mode ? pgd_offset_u(vaddr) : pgd_offset_k(vaddr));
45922 + if (pgd_none(*pgd)) {
45923 + printk("PGD FIXMAP MISSING, it should be setup in head.S!\n");
45924 + return;
45925 + }
45926 + pud = (user_mode ? pud_offset_u(vaddr) : pud_offset(pgd, vaddr));
45927 + if (pud_none(*pud)) {
45928 + pmd = (pmd_t *) spp_getpage();
45929 + make_page_readonly(pmd, XENFEAT_writable_page_tables);
45930 + set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
45931 + if (pmd != pmd_offset(pud, 0)) {
45932 + printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pud,0));
45933 + return;
45934 + }
45935 + }
45936 + pmd = pmd_offset(pud, vaddr);
45937 + if (pmd_none(*pmd)) {
45938 + pte = (pte_t *) spp_getpage();
45939 + make_page_readonly(pte, XENFEAT_writable_page_tables);
45940 + set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
45941 + if (pte != pte_offset_kernel(pmd, 0)) {
45942 + printk("PAGETABLE BUG #02!\n");
45943 + return;
45944 + }
45945 + }
45946 + if (pgprot_val(prot))
45947 + new_pte = pfn_pte(phys >> PAGE_SHIFT, prot);
45948 + else
45949 + new_pte = __pte(0);
45950 +
45951 + pte = pte_offset_kernel(pmd, vaddr);
45952 + if (!pte_none(*pte) &&
45953 + pte_val(*pte) != (pte_val(new_pte) & __supported_pte_mask))
45954 + pte_ERROR(*pte);
45955 + set_pte(pte, new_pte);
45956 +
45957 + /*
45958 + * It's enough to flush this one mapping.
45959 + * (PGE mappings get flushed as well)
45960 + */
45961 + __flush_tlb_one(vaddr);
45962 +}
45963 +
45964 +static void set_pte_phys_ma(unsigned long vaddr,
45965 + unsigned long phys, pgprot_t prot)
45966 +{
45967 + pgd_t *pgd;
45968 + pud_t *pud;
45969 + pmd_t *pmd;
45970 + pte_t *pte, new_pte;
45971 +
45972 + Dprintk("set_pte_phys %lx to %lx\n", vaddr, phys);
45973 +
45974 + pgd = pgd_offset_k(vaddr);
45975 + if (pgd_none(*pgd)) {
45976 + printk("PGD FIXMAP MISSING, it should be setup in head.S!\n");
45977 + return;
45978 + }
45979 + pud = pud_offset(pgd, vaddr);
45980 + if (pud_none(*pud)) {
45981 +
45982 + pmd = (pmd_t *) spp_getpage();
45983 + make_page_readonly(pmd, XENFEAT_writable_page_tables);
45984 +
45985 + set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
45986 +
45987 + if (pmd != pmd_offset(pud, 0)) {
45988 + printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pud,0));
45989 + return;
45990 + }
45991 + }
45992 + pmd = pmd_offset(pud, vaddr);
45993 +
45994 + if (pmd_none(*pmd)) {
45995 + pte = (pte_t *) spp_getpage();
45996 + make_page_readonly(pte, XENFEAT_writable_page_tables);
45997 +
45998 + set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
45999 + if (pte != pte_offset_kernel(pmd, 0)) {
46000 + printk("PAGETABLE BUG #02!\n");
46001 + return;
46002 + }
46003 + }
46004 +
46005 + new_pte = pfn_pte_ma(phys >> PAGE_SHIFT, prot);
46006 + pte = pte_offset_kernel(pmd, vaddr);
46007 +
46008 + /*
46009 + * Note that the pte page is already RO, thus we want to use
46010 + * xen_l1_entry_update(), not set_pte().
46011 + */
46012 + xen_l1_entry_update(pte,
46013 + pfn_pte_ma(phys >> PAGE_SHIFT, prot));
46014 +
46015 + /*
46016 + * It's enough to flush this one mapping.
46017 + * (PGE mappings get flushed as well)
46018 + */
46019 + __flush_tlb_one(vaddr);
46020 +}
46021 +
46022 +#define SET_FIXMAP_KERNEL 0
46023 +#define SET_FIXMAP_USER 1
46024 +
46025 +/* NOTE: this is meant to be run only at boot */
46026 +void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
46027 +{
46028 + unsigned long address = __fix_to_virt(idx);
46029 +
46030 + if (idx >= __end_of_fixed_addresses) {
46031 + printk("Invalid __set_fixmap\n");
46032 + return;
46033 + }
46034 + switch (idx) {
46035 + case VSYSCALL_FIRST_PAGE:
46036 + set_pte_phys(address, phys, prot, SET_FIXMAP_KERNEL);
46037 + break;
46038 + default:
46039 + set_pte_phys_ma(address, phys, prot);
46040 + break;
46041 + }
46042 +}
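For reference, a minimal usage sketch of the fixmap path above (illustrative only; the real call appears in paging_init() further down in this file): most fixmap slots receive a machine address and are therefore routed through set_pte_phys_ma(), while VSYSCALL_FIRST_PAGE is given a pseudo-physical address and handled by set_pte_phys().

	/* Illustrative only: map the hypervisor shared-info frame at its
	 * fixmap slot.  xen_start_info->shared_info is a machine address,
	 * so __set_fixmap() takes the set_pte_phys_ma() path. */
	set_fixmap(FIX_SHARED_INFO, xen_start_info->shared_info);
	HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO);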
46043 +
46044 +/*
46045 + * At this point it only supports the vsyscall area.
46046 + */
46047 +void __set_fixmap_user (enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
46048 +{
46049 + unsigned long address = __fix_to_virt(idx);
46050 +
46051 + if (idx >= __end_of_fixed_addresses) {
46052 + printk("Invalid __set_fixmap\n");
46053 + return;
46054 + }
46055 +
46056 + set_pte_phys(address, phys, prot, SET_FIXMAP_USER);
46057 +}
46058 +
46059 +unsigned long __initdata table_start, table_end;
46060 +
46061 +unsigned long get_machine_pfn(unsigned long addr)
46062 +{
46063 + pud_t* pud = pud_offset_k(NULL, addr);
46064 + pmd_t* pmd = pmd_offset(pud, addr);
46065 + pte_t *pte = pte_offset_kernel(pmd, addr);
46066 +
46067 + return pte_mfn(*pte);
46068 +}
46069 +
46070 +static __meminit void *alloc_static_page(unsigned long *phys)
46071 +{
46072 + unsigned long va = (start_pfn << PAGE_SHIFT) + __START_KERNEL_map;
46073 +
46074 + if (after_bootmem) {
46075 + void *adr = (void *)get_zeroed_page(GFP_ATOMIC);
46076 +
46077 + *phys = __pa(adr);
46078 + return adr;
46079 + }
46080 +
46081 + *phys = start_pfn << PAGE_SHIFT;
46082 + start_pfn++;
46083 + memset((void *)va, 0, PAGE_SIZE);
46084 + return (void *)va;
46085 +}
46086 +
46087 +#define PTE_SIZE PAGE_SIZE
46088 +
46089 +static inline void __set_pte(pte_t *dst, pte_t val)
46090 +{
46091 + *dst = val;
46092 +}
46093 +
46094 +static inline int make_readonly(unsigned long paddr)
46095 +{
46096 + int readonly = 0;
46097 +
46098 + /* Make new page tables read-only. */
46099 + if (!xen_feature(XENFEAT_writable_page_tables)
46100 + && (paddr >= (table_start << PAGE_SHIFT))
46101 + && (paddr < (table_end << PAGE_SHIFT)))
46102 + readonly = 1;
46103 + /* Make old page tables read-only. */
46104 + if (!xen_feature(XENFEAT_writable_page_tables)
46105 + && (paddr >= (xen_start_info->pt_base - __START_KERNEL_map))
46106 + && (paddr < (start_pfn << PAGE_SHIFT)))
46107 + readonly = 1;
46108 +
46109 + /*
46110 + * No need for writable mapping of kernel image. This also ensures that
46111 + * page and descriptor tables embedded inside don't have writable
46112 + * mappings.
46113 + */
46114 + if ((paddr >= __pa_symbol(&_text)) && (paddr < __pa_symbol(&_end)))
46115 + readonly = 1;
46116 +
46117 + return readonly;
46118 +}
46119 +
46120 +static void __meminit
46121 +phys_pmd_init(pmd_t *pmd, unsigned long address, unsigned long end)
46122 +{
46123 + int i, k;
46124 +
46125 + for (i = 0; i < PTRS_PER_PMD; pmd++, i++) {
46126 + unsigned long pte_phys;
46127 + pte_t *pte, *pte_save;
46128 +
46129 + if (address >= end) {
46130 + for (; i < PTRS_PER_PMD; i++, pmd++)
46131 + set_pmd(pmd, __pmd(0));
46132 + break;
46133 + }
46134 + pte = alloc_static_page(&pte_phys);
46135 + pte_save = pte;
46136 + for (k = 0; k < PTRS_PER_PTE; pte++, k++, address += PTE_SIZE) {
46137 + if ((address >= end) ||
46138 + ((address >> PAGE_SHIFT) >=
46139 + xen_start_info->nr_pages)) {
46140 + __set_pte(pte, __pte(0));
46141 + continue;
46142 + }
46143 + if (make_readonly(address)) {
46144 + __set_pte(pte,
46145 + __pte(address | (_KERNPG_TABLE & ~_PAGE_RW)));
46146 + continue;
46147 + }
46148 + __set_pte(pte, __pte(address | _KERNPG_TABLE));
46149 + }
46150 + pte = pte_save;
46151 + early_make_page_readonly(pte, XENFEAT_writable_page_tables);
46152 + set_pmd(pmd, __pmd(pte_phys | _KERNPG_TABLE));
46153 + }
46154 +}
46155 +
46156 +static void __meminit
46157 +phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end)
46158 +{
46159 + pmd_t *pmd = pmd_offset(pud, (unsigned long)__va(address));
46160 +
46161 + if (pmd_none(*pmd)) {
46162 + spin_lock(&init_mm.page_table_lock);
46163 + phys_pmd_init(pmd, address, end);
46164 + spin_unlock(&init_mm.page_table_lock);
46165 + __flush_tlb_all();
46166 + }
46167 +}
46168 +
46169 +static void __meminit phys_pud_init(pud_t *pud, unsigned long address, unsigned long end)
46170 +{
46171 + long i = pud_index(address);
46172 +
46173 + pud = pud + i;
46174 +
46175 + if (after_bootmem && pud_val(*pud)) {
46176 + phys_pmd_update(pud, address, end);
46177 + return;
46178 + }
46179 +
46180 + for (; i < PTRS_PER_PUD; pud++, i++) {
46181 + unsigned long paddr, pmd_phys;
46182 + pmd_t *pmd;
46183 +
46184 + paddr = (address & PGDIR_MASK) + i*PUD_SIZE;
46185 + if (paddr >= end)
46186 + break;
46187 +
46188 + pmd = alloc_static_page(&pmd_phys);
46189 + early_make_page_readonly(pmd, XENFEAT_writable_page_tables);
46190 + spin_lock(&init_mm.page_table_lock);
46191 + set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE));
46192 + phys_pmd_init(pmd, paddr, end);
46193 + spin_unlock(&init_mm.page_table_lock);
46194 + }
46195 + __flush_tlb();
46196 +}
46197 +
46198 +void __init xen_init_pt(void)
46199 +{
46200 + unsigned long addr, *page;
46201 +
46202 + memset((void *)init_level4_pgt, 0, PAGE_SIZE);
46203 + memset((void *)level3_kernel_pgt, 0, PAGE_SIZE);
46204 + memset((void *)level2_kernel_pgt, 0, PAGE_SIZE);
46205 +
46206 + /* Find the initial pte page that was built for us. */
46207 + page = (unsigned long *)xen_start_info->pt_base;
46208 + addr = page[pgd_index(__START_KERNEL_map)];
46209 + addr_to_page(addr, page);
46210 + addr = page[pud_index(__START_KERNEL_map)];
46211 + addr_to_page(addr, page);
46212 +
46213 +#ifdef CONFIG_XEN_COMPAT_030002
46214 + /* On Xen 3.0.2 and older we may need to explicitly specify _PAGE_USER
46215 + in kernel PTEs. We check that here. */
46216 + if (HYPERVISOR_xen_version(XENVER_version, NULL) <= 0x30000) {
46217 + unsigned long *pg;
46218 + pte_t pte;
46219 +
46220 + /* Mess with the initial mapping of page 0. It's not needed. */
46221 + BUILD_BUG_ON(__START_KERNEL <= __START_KERNEL_map);
46222 + addr = page[pmd_index(__START_KERNEL_map)];
46223 + addr_to_page(addr, pg);
46224 + pte.pte = pg[pte_index(__START_KERNEL_map)];
46225 + BUG_ON(!(pte.pte & _PAGE_PRESENT));
46226 +
46227 + /* If _PAGE_USER isn't set, we obviously do not need it. */
46228 + if (pte.pte & _PAGE_USER) {
46229 + /* _PAGE_USER is needed, but is it set implicitly? */
46230 + pte.pte &= ~_PAGE_USER;
46231 + if ((HYPERVISOR_update_va_mapping(__START_KERNEL_map,
46232 + pte, 0) != 0) ||
46233 + !(pg[pte_index(__START_KERNEL_map)] & _PAGE_USER))
46234 + /* We need to explicitly specify _PAGE_USER. */
46235 + __kernel_page_user = _PAGE_USER;
46236 + }
46237 + }
46238 +#endif
46239 +
46240 + /* Construct mapping of initial pte page in our own directories. */
46241 + init_level4_pgt[pgd_index(__START_KERNEL_map)] =
46242 + mk_kernel_pgd(__pa_symbol(level3_kernel_pgt));
46243 + level3_kernel_pgt[pud_index(__START_KERNEL_map)] =
46244 + __pud(__pa_symbol(level2_kernel_pgt) |
46245 + _KERNPG_TABLE);
46246 + memcpy((void *)level2_kernel_pgt, page, PAGE_SIZE);
46247 +
46248 + early_make_page_readonly(init_level4_pgt,
46249 + XENFEAT_writable_page_tables);
46250 + early_make_page_readonly(init_level4_user_pgt,
46251 + XENFEAT_writable_page_tables);
46252 + early_make_page_readonly(level3_kernel_pgt,
46253 + XENFEAT_writable_page_tables);
46254 + early_make_page_readonly(level3_user_pgt,
46255 + XENFEAT_writable_page_tables);
46256 + early_make_page_readonly(level2_kernel_pgt,
46257 + XENFEAT_writable_page_tables);
46258 +
46259 + xen_pgd_pin(__pa_symbol(init_level4_pgt));
46260 + xen_pgd_pin(__pa_symbol(init_level4_user_pgt));
46261 +
46262 + set_pgd((pgd_t *)(init_level4_user_pgt + 511),
46263 + mk_kernel_pgd(__pa_symbol(level3_user_pgt)));
46264 +}
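To summarise what xen_init_pt() above builds, a sketch of the resulting hierarchy (assuming the usual 4-level x86-64 layout; the indices are the ones used in the code):

	init_level4_pgt[pgd_index(__START_KERNEL_map)]
	    -> level3_kernel_pgt[pud_index(__START_KERNEL_map)]
	        -> level2_kernel_pgt   (contents copied from the pmd page the
	                                domain builder provided)
	            -> pte pages built by the domain builder, reused as-is

	init_level4_user_pgt[511] -> level3_user_pgt   (user alias for vsyscall)

All of these pages are made read-only and the two level-4 tables are then pinned, which Xen requires before they can be installed as a page-table base.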
46265 +
46266 +void __init extend_init_mapping(unsigned long tables_space)
46267 +{
46268 + unsigned long va = __START_KERNEL_map;
46269 + unsigned long phys, addr, *pte_page;
46270 + pmd_t *pmd;
46271 + pte_t *pte, new_pte;
46272 + unsigned long *page = (unsigned long *)init_level4_pgt;
46273 +
46274 + addr = page[pgd_index(va)];
46275 + addr_to_page(addr, page);
46276 + addr = page[pud_index(va)];
46277 + addr_to_page(addr, page);
46278 +
46279 + /* Kill mapping of low 1MB. */
46280 + while (va < (unsigned long)&_text) {
46281 + HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0);
46282 + va += PAGE_SIZE;
46283 + }
46284 +
46285 + /* Ensure init mappings cover kernel text/data and initial tables. */
46286 + while (va < (__START_KERNEL_map
46287 + + (start_pfn << PAGE_SHIFT)
46288 + + tables_space)) {
46289 + pmd = (pmd_t *)&page[pmd_index(va)];
46290 + if (pmd_none(*pmd)) {
46291 + pte_page = alloc_static_page(&phys);
46292 + early_make_page_readonly(
46293 + pte_page, XENFEAT_writable_page_tables);
46294 + set_pmd(pmd, __pmd(phys | _KERNPG_TABLE));
46295 + } else {
46296 + addr = page[pmd_index(va)];
46297 + addr_to_page(addr, pte_page);
46298 + }
46299 + pte = (pte_t *)&pte_page[pte_index(va)];
46300 + if (pte_none(*pte)) {
46301 + new_pte = pfn_pte(
46302 + (va - __START_KERNEL_map) >> PAGE_SHIFT,
46303 + __pgprot(_KERNPG_TABLE));
46304 + xen_l1_entry_update(pte, new_pte);
46305 + }
46306 + va += PAGE_SIZE;
46307 + }
46308 +
46309 + /* Finally, blow away any spurious initial mappings. */
46310 + while (1) {
46311 + pmd = (pmd_t *)&page[pmd_index(va)];
46312 + if (pmd_none(*pmd))
46313 + break;
46314 + HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0);
46315 + va += PAGE_SIZE;
46316 + }
46317 +}
46318 +
46319 +static void __init find_early_table_space(unsigned long end)
46320 +{
46321 + unsigned long puds, pmds, ptes, tables;
46322 +
46323 + puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
46324 + pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
46325 + ptes = (end + PTE_SIZE - 1) >> PAGE_SHIFT;
46326 +
46327 + tables = round_up(puds * 8, PAGE_SIZE) +
46328 + round_up(pmds * 8, PAGE_SIZE) +
46329 + round_up(ptes * 8, PAGE_SIZE);
46330 +
46331 + extend_init_mapping(tables);
46332 +
46333 + table_start = start_pfn;
46334 + table_end = table_start + (tables>>PAGE_SHIFT);
46335 +
46336 + early_printk("kernel direct mapping tables up to %lx @ %lx-%lx\n",
46337 + end, table_start << PAGE_SHIFT, table_end << PAGE_SHIFT);
46338 +}
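As a rough worked example of the sizing arithmetic in find_early_table_space() (illustrative numbers only, assuming end = 4 GiB, PUD_SIZE = 1 GiB, PMD_SIZE = 2 MiB and 4 KiB pages):

	puds =       4  ->  round_up(4 * 8, PAGE_SIZE)        =    4096 bytes
	pmds =    2048  ->  round_up(2048 * 8, PAGE_SIZE)      =   16384 bytes
	ptes = 1048576  ->  round_up(1048576 * 8, PAGE_SIZE)   = 8388608 bytes

i.e. tables comes to a little over 8 MiB, roughly 0.2% of the range being mapped, all carved out of the pages immediately following start_pfn.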
46339 +
46340 +/* Setup the direct mapping of the physical memory at PAGE_OFFSET.
46341 + This runs before bootmem is initialized and gets pages directly from the
46342 + physical memory. To access them they are temporarily mapped. */
46343 +void __meminit init_memory_mapping(unsigned long start, unsigned long end)
46344 +{
46345 + unsigned long next;
46346 +
46347 + Dprintk("init_memory_mapping\n");
46348 +
46349 + /*
46350 + * Find space for the kernel direct mapping tables.
46351 + * Later we should allocate these tables in the local node of the memory
46352 + * mapped. Unfortunately this is done currently before the nodes are
46353 + * discovered.
46354 + */
46355 + if (!after_bootmem)
46356 + find_early_table_space(end);
46357 +
46358 + start = (unsigned long)__va(start);
46359 + end = (unsigned long)__va(end);
46360 +
46361 + for (; start < end; start = next) {
46362 + unsigned long pud_phys;
46363 + pgd_t *pgd = pgd_offset_k(start);
46364 + pud_t *pud;
46365 +
46366 + if (after_bootmem) {
46367 + pud = pud_offset_k(pgd, __PAGE_OFFSET);
46368 + make_page_readonly(pud, XENFEAT_writable_page_tables);
46369 + pud_phys = __pa(pud);
46370 + } else {
46371 + pud = alloc_static_page(&pud_phys);
46372 + early_make_page_readonly(pud, XENFEAT_writable_page_tables);
46373 + }
46374 + next = start + PGDIR_SIZE;
46375 + if (next > end)
46376 + next = end;
46377 + phys_pud_init(pud, __pa(start), __pa(next));
46378 + if (!after_bootmem)
46379 + set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys));
46380 + }
46381 +
46382 + if (!after_bootmem) {
46383 + BUG_ON(start_pfn != table_end);
46384 +
46385 + /* Re-vector virtual addresses pointing into the initial
46386 + mapping to the just-established permanent ones. */
46387 + xen_start_info = __va(__pa(xen_start_info));
46388 + xen_start_info->pt_base = (unsigned long)
46389 + __va(__pa(xen_start_info->pt_base));
46390 + if (!xen_feature(XENFEAT_auto_translated_physmap)) {
46391 + phys_to_machine_mapping =
46392 + __va(__pa(xen_start_info->mfn_list));
46393 + xen_start_info->mfn_list = (unsigned long)
46394 + phys_to_machine_mapping;
46395 + }
46396 + if (xen_start_info->mod_start)
46397 + xen_start_info->mod_start = (unsigned long)
46398 + __va(__pa(xen_start_info->mod_start));
46399 +
46400 + /* Destroy the Xen-created mappings beyond the kernel image as
46401 + * well as the temporary mappings created above. Prevents
46402 + * overlap with modules area (if init mapping is very big).
46403 + */
46404 + start = PAGE_ALIGN((unsigned long)_end);
46405 + end = __START_KERNEL_map + (table_end << PAGE_SHIFT);
46406 + for (; start < end; start += PAGE_SIZE)
46407 + WARN_ON(HYPERVISOR_update_va_mapping(
46408 + start, __pte_ma(0), 0));
46409 + }
46410 +
46411 + __flush_tlb_all();
46412 +}
46413 +
46414 +void __cpuinit zap_low_mappings(int cpu)
46415 +{
46416 + /* this is not required for Xen */
46417 +#if 0
46418 + swap_low_mappings();
46419 +#endif
46420 +}
46421 +
46422 +/* Compute zone sizes for the DMA and DMA32 zones in a node. */
46423 +__init void
46424 +size_zones(unsigned long *z, unsigned long *h,
46425 + unsigned long start_pfn, unsigned long end_pfn)
46426 +{
46427 + int i;
46428 +#ifndef CONFIG_XEN
46429 + unsigned long w;
46430 +#endif
46431 +
46432 + for (i = 0; i < MAX_NR_ZONES; i++)
46433 + z[i] = 0;
46434 +
46435 +#ifndef CONFIG_XEN
46436 + if (start_pfn < MAX_DMA_PFN)
46437 + z[ZONE_DMA] = MAX_DMA_PFN - start_pfn;
46438 + if (start_pfn < MAX_DMA32_PFN) {
46439 + unsigned long dma32_pfn = MAX_DMA32_PFN;
46440 + if (dma32_pfn > end_pfn)
46441 + dma32_pfn = end_pfn;
46442 + z[ZONE_DMA32] = dma32_pfn - start_pfn;
46443 + }
46444 + z[ZONE_NORMAL] = end_pfn - start_pfn;
46445 +
46446 + /* Remove lower zones from higher ones. */
46447 + w = 0;
46448 + for (i = 0; i < MAX_NR_ZONES; i++) {
46449 + if (z[i])
46450 + z[i] -= w;
46451 + w += z[i];
46452 + }
46453 +
46454 + /* Compute holes */
46455 + w = start_pfn;
46456 + for (i = 0; i < MAX_NR_ZONES; i++) {
46457 + unsigned long s = w;
46458 + w += z[i];
46459 + h[i] = e820_hole_size(s, w);
46460 + }
46461 +
46462 + /* Add the space needed for mem_map to the holes too. */
46463 + for (i = 0; i < MAX_NR_ZONES; i++)
46464 + h[i] += (z[i] * sizeof(struct page)) / PAGE_SIZE;
46465 +
46466 + /* The 16MB DMA zone has the kernel and other misc mappings.
46467 + Account them too */
46468 + if (h[ZONE_DMA]) {
46469 + h[ZONE_DMA] += dma_reserve;
46470 + if (h[ZONE_DMA] >= z[ZONE_DMA]) {
46471 + printk(KERN_WARNING
46472 + "Kernel too large and filling up ZONE_DMA?\n");
46473 + h[ZONE_DMA] = z[ZONE_DMA];
46474 + }
46475 + }
46476 +#else
46477 + z[ZONE_DMA] = end_pfn;
46478 + for (i = 0; i < MAX_NR_ZONES; i++)
46479 + h[i] = 0;
46480 +#endif
46481 +}
46482 +
46483 +#ifndef CONFIG_NUMA
46484 +void __init paging_init(void)
46485 +{
46486 + unsigned long zones[MAX_NR_ZONES], holes[MAX_NR_ZONES];
46487 + int i;
46488 +
46489 + memory_present(0, 0, end_pfn);
46490 + sparse_init();
46491 + size_zones(zones, holes, 0, end_pfn);
46492 + free_area_init_node(0, NODE_DATA(0), zones,
46493 + __pa(PAGE_OFFSET) >> PAGE_SHIFT, holes);
46494 +
46495 + /* Switch to the real shared_info page, and clear the
46496 + * dummy page. */
46497 + set_fixmap(FIX_SHARED_INFO, xen_start_info->shared_info);
46498 + HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO);
46499 + memset(empty_zero_page, 0, sizeof(empty_zero_page));
46500 +
46501 + init_mm.context.pinned = 1;
46502 +
46503 + /* Setup mapping of lower 1st MB */
46504 + for (i = 0; i < NR_FIX_ISAMAPS; i++)
46505 + if (is_initial_xendomain())
46506 + set_fixmap(FIX_ISAMAP_BEGIN - i, i * PAGE_SIZE);
46507 + else
46508 + __set_fixmap(FIX_ISAMAP_BEGIN - i,
46509 + virt_to_mfn(empty_zero_page) << PAGE_SHIFT,
46510 + PAGE_KERNEL_RO);
46511 +}
46512 +#endif
46513 +
46514 +/* Unmap a kernel mapping if it exists. This is useful to avoid prefetches
46515 + from the CPU leading to inconsistent cache lines. address and size
46516 + must be aligned to 2MB boundaries.
46517 + Does nothing when the mapping doesn't exist. */
46518 +void __init clear_kernel_mapping(unsigned long address, unsigned long size)
46519 +{
46520 + unsigned long end = address + size;
46521 +
46522 + BUG_ON(address & ~LARGE_PAGE_MASK);
46523 + BUG_ON(size & ~LARGE_PAGE_MASK);
46524 +
46525 + for (; address < end; address += LARGE_PAGE_SIZE) {
46526 + pgd_t *pgd = pgd_offset_k(address);
46527 + pud_t *pud;
46528 + pmd_t *pmd;
46529 + if (pgd_none(*pgd))
46530 + continue;
46531 + pud = pud_offset(pgd, address);
46532 + if (pud_none(*pud))
46533 + continue;
46534 + pmd = pmd_offset(pud, address);
46535 + if (!pmd || pmd_none(*pmd))
46536 + continue;
46537 + if (0 == (pmd_val(*pmd) & _PAGE_PSE)) {
46538 + /* Could handle this, but it should not happen currently. */
46539 + printk(KERN_ERR
46540 + "clear_kernel_mapping: mapping has been split. will leak memory\n");
46541 + pmd_ERROR(*pmd);
46542 + }
46543 + set_pmd(pmd, __pmd(0));
46544 + }
46545 + __flush_tlb_all();
46546 +}
46547 +
46548 +/*
46549 + * Memory hotplug specific functions
46550 + * These are only for non-NUMA machines right now.
46551 + */
46552 +#ifdef CONFIG_MEMORY_HOTPLUG
46553 +
46554 +void online_page(struct page *page)
46555 +{
46556 + ClearPageReserved(page);
46557 + set_page_count(page, 1);
46558 + __free_page(page);
46559 + totalram_pages++;
46560 + num_physpages++;
46561 +}
46562 +
46563 +int add_memory(u64 start, u64 size)
46564 +{
46565 + struct pglist_data *pgdat = NODE_DATA(0);
46566 + struct zone *zone = pgdat->node_zones + MAX_NR_ZONES-2;
46567 + unsigned long start_pfn = start >> PAGE_SHIFT;
46568 + unsigned long nr_pages = size >> PAGE_SHIFT;
46569 + int ret;
46570 +
46571 + ret = __add_pages(zone, start_pfn, nr_pages);
46572 + if (ret)
46573 + goto error;
46574 +
46575 + init_memory_mapping(start, (start + size -1));
46576 +
46577 + return ret;
46578 +error:
46579 + printk("%s: Problem encountered in __add_pages!\n", __func__);
46580 + return ret;
46581 +}
46582 +EXPORT_SYMBOL_GPL(add_memory);
46583 +
46584 +int remove_memory(u64 start, u64 size)
46585 +{
46586 + return -EINVAL;
46587 +}
46588 +EXPORT_SYMBOL_GPL(remove_memory);
46589 +
46590 +#endif
46591 +
46592 +static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules,
46593 + kcore_vsyscall;
46594 +
46595 +void __init mem_init(void)
46596 +{
46597 + long codesize, reservedpages, datasize, initsize;
46598 + unsigned long pfn;
46599 +
46600 + contiguous_bitmap = alloc_bootmem_low_pages(
46601 + (end_pfn + 2*BITS_PER_LONG) >> 3);
46602 + BUG_ON(!contiguous_bitmap);
46603 + memset(contiguous_bitmap, 0, (end_pfn + 2*BITS_PER_LONG) >> 3);
46604 +
46605 +#if defined(CONFIG_SWIOTLB)
46606 + pci_swiotlb_init();
46607 +#endif
46608 + no_iommu_init();
46609 +
46610 + /* How many end-of-memory variables you have, grandma! */
46611 + max_low_pfn = end_pfn;
46612 + max_pfn = end_pfn;
46613 + num_physpages = end_pfn;
46614 + high_memory = (void *) __va(end_pfn * PAGE_SIZE);
46615 +
46616 + /* clear the zero-page */
46617 + memset(empty_zero_page, 0, PAGE_SIZE);
46618 +
46619 + reservedpages = 0;
46620 +
46621 + /* this will put all low memory onto the freelists */
46622 +#ifdef CONFIG_NUMA
46623 + totalram_pages = numa_free_all_bootmem();
46624 +#else
46625 + totalram_pages = free_all_bootmem();
46626 +#endif
46627 + /* XEN: init and count pages outside initial allocation. */
46628 + for (pfn = xen_start_info->nr_pages; pfn < max_pfn; pfn++) {
46629 + ClearPageReserved(pfn_to_page(pfn));
46630 + set_page_count(pfn_to_page(pfn), 1);
46631 + totalram_pages++;
46632 + }
46633 + reservedpages = end_pfn - totalram_pages - e820_hole_size(0, end_pfn);
46634 +
46635 + after_bootmem = 1;
46636 +
46637 + codesize = (unsigned long) &_etext - (unsigned long) &_text;
46638 + datasize = (unsigned long) &_edata - (unsigned long) &_etext;
46639 + initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin;
46640 +
46641 + /* Register memory areas for /proc/kcore */
46642 + kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
46643 + kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
46644 + VMALLOC_END-VMALLOC_START);
46645 + kclist_add(&kcore_kernel, &_stext, _end - _stext);
46646 + kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_LEN);
46647 + kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START,
46648 + VSYSCALL_END - VSYSCALL_START);
46649 +
46650 + printk("Memory: %luk/%luk available (%ldk kernel code, %ldk reserved, %ldk data, %ldk init)\n",
46651 + (unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
46652 + end_pfn << (PAGE_SHIFT-10),
46653 + codesize >> 10,
46654 + reservedpages << (PAGE_SHIFT-10),
46655 + datasize >> 10,
46656 + initsize >> 10);
46657 +
46658 +#ifndef CONFIG_XEN
46659 +#ifdef CONFIG_SMP
46660 + /*
46661 + * Sync boot_level4_pgt mappings with the init_level4_pgt
46662 + * except for the low identity mappings which are already zapped
46663 + * in init_level4_pgt. This sync-up is essential for AP's bringup
46664 + */
46665 + memcpy(boot_level4_pgt+1, init_level4_pgt+1, (PTRS_PER_PGD-1)*sizeof(pgd_t));
46666 +#endif
46667 +#endif
46668 +}
46669 +
46670 +void free_initmem(void)
46671 +{
46672 +#ifdef __DO_LATER__
46673 + /*
46674 + * Some pages can be pinned, but some are not. Unpinning such pages
46675 + * triggers BUG().
46676 + */
46677 + unsigned long addr;
46678 +
46679 + addr = (unsigned long)(&__init_begin);
46680 + for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) {
46681 + ClearPageReserved(virt_to_page(addr));
46682 + set_page_count(virt_to_page(addr), 1);
46683 + memset((void *)(addr & ~(PAGE_SIZE-1)), 0xcc, PAGE_SIZE);
46684 + make_page_writable(
46685 + __va(__pa(addr)), XENFEAT_writable_page_tables);
46686 + /*
46687 + * Make pages from __PAGE_OFFSET address as well
46688 + */
46689 + make_page_writable(
46690 + (void *)addr, XENFEAT_writable_page_tables);
46691 + free_page(addr);
46692 + totalram_pages++;
46693 + }
46694 + memset(__initdata_begin, 0xba, __initdata_end - __initdata_begin);
46695 + printk ("Freeing unused kernel memory: %luk freed\n", (__init_end - __init_begin) >> 10);
46696 +#endif
46697 +}
46698 +
46699 +#ifdef CONFIG_DEBUG_RODATA
46700 +
46701 +extern char __start_rodata, __end_rodata;
46702 +void mark_rodata_ro(void)
46703 +{
46704 + unsigned long addr = (unsigned long)&__start_rodata;
46705 +
46706 + for (; addr < (unsigned long)&__end_rodata; addr += PAGE_SIZE)
46707 + change_page_attr_addr(addr, 1, PAGE_KERNEL_RO);
46708 +
46709 + printk ("Write protecting the kernel read-only data: %luk\n",
46710 + (&__end_rodata - &__start_rodata) >> 10);
46711 +
46712 + /*
46713 + * change_page_attr_addr() requires a global_flush_tlb() call after it.
46714 + * We do this after the printk so that if something went wrong in the
46715 + * change, the printk gets out at least to give a better debug hint
46716 + * of who is the culprit.
46717 + */
46718 + global_flush_tlb();
46719 +}
46720 +#endif
46721 +
46722 +#ifdef CONFIG_BLK_DEV_INITRD
46723 +void free_initrd_mem(unsigned long start, unsigned long end)
46724 +{
46725 + if (start >= end)
46726 + return;
46727 + printk ("Freeing initrd memory: %ldk freed\n", (end - start) >> 10);
46728 + for (; start < end; start += PAGE_SIZE) {
46729 + ClearPageReserved(virt_to_page(start));
46730 + set_page_count(virt_to_page(start), 1);
46731 + free_page(start);
46732 + totalram_pages++;
46733 + }
46734 +}
46735 +#endif
46736 +
46737 +void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
46738 +{
46739 + /* Should check here against the e820 map to avoid double free */
46740 +#ifdef CONFIG_NUMA
46741 + int nid = phys_to_nid(phys);
46742 + reserve_bootmem_node(NODE_DATA(nid), phys, len);
46743 +#else
46744 + reserve_bootmem(phys, len);
46745 +#endif
46746 + if (phys+len <= MAX_DMA_PFN*PAGE_SIZE)
46747 + dma_reserve += len / PAGE_SIZE;
46748 +}
46749 +
46750 +int kern_addr_valid(unsigned long addr)
46751 +{
46752 + unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
46753 + pgd_t *pgd;
46754 + pud_t *pud;
46755 + pmd_t *pmd;
46756 + pte_t *pte;
46757 +
46758 + if (above != 0 && above != -1UL)
46759 + return 0;
46760 +
46761 + pgd = pgd_offset_k(addr);
46762 + if (pgd_none(*pgd))
46763 + return 0;
46764 +
46765 + pud = pud_offset_k(pgd, addr);
46766 + if (pud_none(*pud))
46767 + return 0;
46768 +
46769 + pmd = pmd_offset(pud, addr);
46770 + if (pmd_none(*pmd))
46771 + return 0;
46772 + if (pmd_large(*pmd))
46773 + return pfn_valid(pmd_pfn(*pmd));
46774 +
46775 + pte = pte_offset_kernel(pmd, addr);
46776 + if (pte_none(*pte))
46777 + return 0;
46778 + return pfn_valid(pte_pfn(*pte));
46779 +}
46780 +
46781 +#ifdef CONFIG_SYSCTL
46782 +#include <linux/sysctl.h>
46783 +
46784 +extern int exception_trace, page_fault_trace;
46785 +
46786 +static ctl_table debug_table2[] = {
46787 + { 99, "exception-trace", &exception_trace, sizeof(int), 0644, NULL,
46788 + proc_dointvec },
46789 + { 0, }
46790 +};
46791 +
46792 +static ctl_table debug_root_table2[] = {
46793 + { .ctl_name = CTL_DEBUG, .procname = "debug", .mode = 0555,
46794 + .child = debug_table2 },
46795 + { 0 },
46796 +};
46797 +
46798 +static __init int x8664_sysctl_init(void)
46799 +{
46800 + register_sysctl_table(debug_root_table2, 1);
46801 + return 0;
46802 +}
46803 +__initcall(x8664_sysctl_init);
46804 +#endif
46805 +
46806 +/* A pseudo VMA to allow ptrace access for the vsyscall page. This only
46807 + covers the 64bit vsyscall page now. 32bit has a real VMA now and does
46808 + not need special handling anymore. */
46809 +
46810 +static struct vm_area_struct gate_vma = {
46811 + .vm_start = VSYSCALL_START,
46812 + .vm_end = VSYSCALL_END,
46813 + .vm_page_prot = PAGE_READONLY
46814 +};
46815 +
46816 +struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
46817 +{
46818 +#ifdef CONFIG_IA32_EMULATION
46819 + if (test_tsk_thread_flag(tsk, TIF_IA32))
46820 + return NULL;
46821 +#endif
46822 + return &gate_vma;
46823 +}
46824 +
46825 +int in_gate_area(struct task_struct *task, unsigned long addr)
46826 +{
46827 + struct vm_area_struct *vma = get_gate_vma(task);
46828 + if (!vma)
46829 + return 0;
46830 + return (addr >= vma->vm_start) && (addr < vma->vm_end);
46831 +}
46832 +
46833 +/* Use this when you have no reliable task/vma, typically from interrupt
46834 + * context. It is less reliable than using the task's vma and may give
46835 + * false positives.
46836 + */
46837 +int in_gate_area_no_task(unsigned long addr)
46838 +{
46839 + return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END);
46840 +}
46841 diff -Nur linux-2.6.16.33-noxen/arch/x86_64/mm/pageattr-xen.c linux-2.6.16.33/arch/x86_64/mm/pageattr-xen.c
46842 --- linux-2.6.16.33-noxen/arch/x86_64/mm/pageattr-xen.c 1970-01-01 00:00:00.000000000 +0000
46843 +++ linux-2.6.16.33/arch/x86_64/mm/pageattr-xen.c 2007-01-08 15:00:45.000000000 +0000
46844 @@ -0,0 +1,408 @@
46845 +/*
46846 + * Copyright 2002 Andi Kleen, SuSE Labs.
46847 + * Thanks to Ben LaHaise for precious feedback.
46848 + */
46849 +
46850 +#include <linux/config.h>
46851 +#include <linux/mm.h>
46852 +#include <linux/sched.h>
46853 +#include <linux/highmem.h>
46854 +#include <linux/module.h>
46855 +#include <linux/slab.h>
46856 +#include <asm/uaccess.h>
46857 +#include <asm/processor.h>
46858 +#include <asm/tlbflush.h>
46859 +#include <asm/io.h>
46860 +
46861 +#ifdef CONFIG_XEN
46862 +#include <asm/pgalloc.h>
46863 +#include <asm/mmu_context.h>
46864 +
46865 +LIST_HEAD(mm_unpinned);
46866 +DEFINE_SPINLOCK(mm_unpinned_lock);
46867 +
46868 +static inline void mm_walk_set_prot(void *pt, pgprot_t flags)
46869 +{
46870 + struct page *page = virt_to_page(pt);
46871 + unsigned long pfn = page_to_pfn(page);
46872 +
46873 + BUG_ON(HYPERVISOR_update_va_mapping(
46874 + (unsigned long)__va(pfn << PAGE_SHIFT),
46875 + pfn_pte(pfn, flags), 0));
46876 +}
46877 +
46878 +static void mm_walk(struct mm_struct *mm, pgprot_t flags)
46879 +{
46880 + pgd_t *pgd;
46881 + pud_t *pud;
46882 + pmd_t *pmd;
46883 + pte_t *pte;
46884 + int g,u,m;
46885 +
46886 + pgd = mm->pgd;
46887 + /*
46888 + * Cannot iterate up to USER_PTRS_PER_PGD as these pagetables may not
46889 + * be the 'current' task's pagetables (e.g., current may be 32-bit,
46890 + * but the pagetables may be for a 64-bit task).
46891 + * Subtracting 1 from TASK_SIZE64 means the loop limit is correct
46892 + * regardless of whether TASK_SIZE64 is a multiple of PGDIR_SIZE.
46893 + */
46894 + for (g = 0; g <= ((TASK_SIZE64-1) / PGDIR_SIZE); g++, pgd++) {
46895 + if (pgd_none(*pgd))
46896 + continue;
46897 + pud = pud_offset(pgd, 0);
46898 + if (PTRS_PER_PUD > 1) /* not folded */
46899 + mm_walk_set_prot(pud,flags);
46900 + for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
46901 + if (pud_none(*pud))
46902 + continue;
46903 + pmd = pmd_offset(pud, 0);
46904 + if (PTRS_PER_PMD > 1) /* not folded */
46905 + mm_walk_set_prot(pmd,flags);
46906 + for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
46907 + if (pmd_none(*pmd))
46908 + continue;
46909 + pte = pte_offset_kernel(pmd,0);
46910 + mm_walk_set_prot(pte,flags);
46911 + }
46912 + }
46913 + }
46914 +}
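A quick check of the loop bound used in mm_walk() above (illustrative arithmetic, not part of the patch), with PGDIR_SIZE = 512 GiB on x86-64:

	TASK_SIZE64 == 256 * PGDIR_SIZE:   TASK_SIZE64 / PGDIR_SIZE       == 256  (one entry too many)
	                                   (TASK_SIZE64 - 1) / PGDIR_SIZE == 255  (last entry actually covered)
	TASK_SIZE64 not a multiple:        both divisions give the same index

so the "- 1" keeps the "g <= ..." limit correct in either case.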
46915 +
46916 +void mm_pin(struct mm_struct *mm)
46917 +{
46918 + if (xen_feature(XENFEAT_writable_page_tables))
46919 + return;
46920 +
46921 + spin_lock(&mm->page_table_lock);
46922 +
46923 + mm_walk(mm, PAGE_KERNEL_RO);
46924 + BUG_ON(HYPERVISOR_update_va_mapping(
46925 + (unsigned long)mm->pgd,
46926 + pfn_pte(virt_to_phys(mm->pgd)>>PAGE_SHIFT, PAGE_KERNEL_RO),
46927 + UVMF_TLB_FLUSH));
46928 + BUG_ON(HYPERVISOR_update_va_mapping(
46929 + (unsigned long)__user_pgd(mm->pgd),
46930 + pfn_pte(virt_to_phys(__user_pgd(mm->pgd))>>PAGE_SHIFT, PAGE_KERNEL_RO),
46931 + UVMF_TLB_FLUSH));
46932 + xen_pgd_pin(__pa(mm->pgd)); /* kernel */
46933 + xen_pgd_pin(__pa(__user_pgd(mm->pgd))); /* user */
46934 + mm->context.pinned = 1;
46935 + spin_lock(&mm_unpinned_lock);
46936 + list_del(&mm->context.unpinned);
46937 + spin_unlock(&mm_unpinned_lock);
46938 +
46939 + spin_unlock(&mm->page_table_lock);
46940 +}
46941 +
46942 +void mm_unpin(struct mm_struct *mm)
46943 +{
46944 + if (xen_feature(XENFEAT_writable_page_tables))
46945 + return;
46946 +
46947 + spin_lock(&mm->page_table_lock);
46948 +
46949 + xen_pgd_unpin(__pa(mm->pgd));
46950 + xen_pgd_unpin(__pa(__user_pgd(mm->pgd)));
46951 + BUG_ON(HYPERVISOR_update_va_mapping(
46952 + (unsigned long)mm->pgd,
46953 + pfn_pte(virt_to_phys(mm->pgd)>>PAGE_SHIFT, PAGE_KERNEL), 0));
46954 + BUG_ON(HYPERVISOR_update_va_mapping(
46955 + (unsigned long)__user_pgd(mm->pgd),
46956 + pfn_pte(virt_to_phys(__user_pgd(mm->pgd))>>PAGE_SHIFT, PAGE_KERNEL), 0));
46957 + mm_walk(mm, PAGE_KERNEL);
46958 + xen_tlb_flush();
46959 + mm->context.pinned = 0;
46960 + spin_lock(&mm_unpinned_lock);
46961 + list_add(&mm->context.unpinned, &mm_unpinned);
46962 + spin_unlock(&mm_unpinned_lock);
46963 +
46964 + spin_unlock(&mm->page_table_lock);
46965 +}
46966 +
46967 +void mm_pin_all(void)
46968 +{
46969 + if (xen_feature(XENFEAT_writable_page_tables))
46970 + return;
46971 +
46972 + while (!list_empty(&mm_unpinned))
46973 + mm_pin(list_entry(mm_unpinned.next, struct mm_struct,
46974 + context.unpinned));
46975 +}
46976 +
46977 +void _arch_dup_mmap(struct mm_struct *mm)
46978 +{
46979 + if (!mm->context.pinned)
46980 + mm_pin(mm);
46981 +}
46982 +
46983 +void _arch_exit_mmap(struct mm_struct *mm)
46984 +{
46985 + struct task_struct *tsk = current;
46986 +
46987 + task_lock(tsk);
46988 +
46989 + /*
46990 + * We aggressively remove defunct pgd from cr3. We execute unmap_vmas()
46991 + * *much* faster this way, as no tlb flushes means bigger wrpt batches.
46992 + */
46993 + if ( tsk->active_mm == mm )
46994 + {
46995 + tsk->active_mm = &init_mm;
46996 + atomic_inc(&init_mm.mm_count);
46997 +
46998 + switch_mm(mm, &init_mm, tsk);
46999 +
47000 + atomic_dec(&mm->mm_count);
47001 + BUG_ON(atomic_read(&mm->mm_count) == 0);
47002 + }
47003 +
47004 + task_unlock(tsk);
47005 +
47006 + if ( mm->context.pinned && (atomic_read(&mm->mm_count) == 1) &&
47007 + !mm->context.has_foreign_mappings )
47008 + mm_unpin(mm);
47009 +}
47010 +
47011 +void pte_free(struct page *pte)
47012 +{
47013 + unsigned long va = (unsigned long)__va(page_to_pfn(pte)<<PAGE_SHIFT);
47014 +
47015 + if (!pte_write(*virt_to_ptep(va)))
47016 + BUG_ON(HYPERVISOR_update_va_mapping(
47017 + va, pfn_pte(page_to_pfn(pte), PAGE_KERNEL), 0));
47018 + __free_page(pte);
47019 +}
47020 +#endif /* CONFIG_XEN */
47021 +
47022 +static inline pte_t *lookup_address(unsigned long address)
47023 +{
47024 + pgd_t *pgd = pgd_offset_k(address);
47025 + pud_t *pud;
47026 + pmd_t *pmd;
47027 + pte_t *pte;
47028 + if (pgd_none(*pgd))
47029 + return NULL;
47030 + pud = pud_offset(pgd, address);
47031 + if (!pud_present(*pud))
47032 + return NULL;
47033 + pmd = pmd_offset(pud, address);
47034 + if (!pmd_present(*pmd))
47035 + return NULL;
47036 + if (pmd_large(*pmd))
47037 + return (pte_t *)pmd;
47038 + pte = pte_offset_kernel(pmd, address);
47039 + if (pte && !pte_present(*pte))
47040 + pte = NULL;
47041 + return pte;
47042 +}
47043 +
47044 +static struct page *split_large_page(unsigned long address, pgprot_t prot,
47045 + pgprot_t ref_prot)
47046 +{
47047 + int i;
47048 + unsigned long addr;
47049 + struct page *base = alloc_pages(GFP_KERNEL, 0);
47050 + pte_t *pbase;
47051 + if (!base)
47052 + return NULL;
47053 + address = __pa(address);
47054 + addr = address & LARGE_PAGE_MASK;
47055 + pbase = (pte_t *)page_address(base);
47056 + for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) {
47057 + pbase[i] = pfn_pte(addr >> PAGE_SHIFT,
47058 + addr == address ? prot : ref_prot);
47059 + }
47060 + return base;
47061 +}
47062 +
47063 +
47064 +static void flush_kernel_map(void *address)
47065 +{
47066 + if (0 && address && cpu_has_clflush) {
47067 + /* is this worth it? */
47068 + int i;
47069 + for (i = 0; i < PAGE_SIZE; i += boot_cpu_data.x86_clflush_size)
47070 + asm volatile("clflush (%0)" :: "r" (address + i));
47071 + } else
47072 + asm volatile("wbinvd":::"memory");
47073 + if (address)
47074 + __flush_tlb_one(address);
47075 + else
47076 + __flush_tlb_all();
47077 +}
47078 +
47079 +
47080 +static inline void flush_map(unsigned long address)
47081 +{
47082 + on_each_cpu(flush_kernel_map, (void *)address, 1, 1);
47083 +}
47084 +
47085 +struct deferred_page {
47086 + struct deferred_page *next;
47087 + struct page *fpage;
47088 + unsigned long address;
47089 +};
47090 +static struct deferred_page *df_list; /* protected by init_mm.mmap_sem */
47091 +
47092 +static inline void save_page(unsigned long address, struct page *fpage)
47093 +{
47094 + struct deferred_page *df;
47095 + df = kmalloc(sizeof(struct deferred_page), GFP_KERNEL);
47096 + if (!df) {
47097 + flush_map(address);
47098 + __free_page(fpage);
47099 + } else {
47100 + df->next = df_list;
47101 + df->fpage = fpage;
47102 + df->address = address;
47103 + df_list = df;
47104 + }
47105 +}
47106 +
47107 +/*
47108 + * No more special protections in this 2/4MB area - revert to a
47109 + * large page again.
47110 + */
47111 +static void revert_page(unsigned long address, pgprot_t ref_prot)
47112 +{
47113 + pgd_t *pgd;
47114 + pud_t *pud;
47115 + pmd_t *pmd;
47116 + pte_t large_pte;
47117 +
47118 + pgd = pgd_offset_k(address);
47119 + BUG_ON(pgd_none(*pgd));
47120 + pud = pud_offset(pgd,address);
47121 + BUG_ON(pud_none(*pud));
47122 + pmd = pmd_offset(pud, address);
47123 + BUG_ON(pmd_val(*pmd) & _PAGE_PSE);
47124 + pgprot_val(ref_prot) |= _PAGE_PSE;
47125 + large_pte = mk_pte_phys(__pa(address) & LARGE_PAGE_MASK, ref_prot);
47126 + set_pte((pte_t *)pmd, large_pte);
47127 +}
47128 +
47129 +static int
47130 +__change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot,
47131 + pgprot_t ref_prot)
47132 +{
47133 + pte_t *kpte;
47134 + struct page *kpte_page;
47135 + unsigned kpte_flags;
47136 + pgprot_t ref_prot2;
47137 + kpte = lookup_address(address);
47138 + if (!kpte) return 0;
47139 + kpte_page = virt_to_page(((unsigned long)kpte) & PAGE_MASK);
47140 + kpte_flags = pte_val(*kpte);
47141 + if (pgprot_val(prot) != pgprot_val(ref_prot)) {
47142 + if ((kpte_flags & _PAGE_PSE) == 0) {
47143 + set_pte(kpte, pfn_pte(pfn, prot));
47144 + } else {
47145 + /*
47146 + * split_large_page will take the reference for this change_page_attr
47147 + * on the split page.
47148 + */
47149 +
47150 + struct page *split;
47151 + ref_prot2 = __pgprot(pgprot_val(pte_pgprot(*lookup_address(address))) & ~(1<<_PAGE_BIT_PSE));
47152 +
47153 + split = split_large_page(address, prot, ref_prot2);
47154 + if (!split)
47155 + return -ENOMEM;
47156 + set_pte(kpte,mk_pte(split, ref_prot2));
47157 + kpte_page = split;
47158 + }
47159 + get_page(kpte_page);
47160 + } else if ((kpte_flags & _PAGE_PSE) == 0) {
47161 + set_pte(kpte, pfn_pte(pfn, ref_prot));
47162 + __put_page(kpte_page);
47163 + } else
47164 + BUG();
47165 +
47166 + /* on x86-64 the direct mapping set at boot is not using 4k pages */
47167 + /*
47168 + * ..., but the XEN guest kernels (currently) do:
47169 + * If the pte was reserved, it means it was created at boot
47170 + * time (not via split_large_page) and in turn we must not
47171 + * replace it with a large page.
47172 + */
47173 +#ifndef CONFIG_XEN
47174 + BUG_ON(PageReserved(kpte_page));
47175 +#else
47176 + if (!PageReserved(kpte_page))
47177 +#endif
47178 + switch (page_count(kpte_page)) {
47179 + case 1:
47180 + save_page(address, kpte_page);
47181 + revert_page(address, ref_prot);
47182 + break;
47183 + case 0:
47184 + BUG(); /* memleak and failed 2M page regeneration */
47185 + }
47186 + return 0;
47187 +}
47188 +
47189 +/*
47190 + * Change the page attributes of a page in the linear mapping.
47191 + *
47192 + * This should be used when a page is mapped with a different caching policy
47193 + * than write-back somewhere - some CPUs do not like it when mappings with
47194 + * different caching policies exist. This changes the page attributes of
47195 + * the kernel linear mapping too.
47196 + *
47197 + * The caller needs to ensure that there are no conflicting mappings elsewhere.
47198 + * This function only deals with the kernel linear map.
47199 + *
47200 + * Caller must call global_flush_tlb() after this.
47201 + */
47202 +int change_page_attr_addr(unsigned long address, int numpages, pgprot_t prot)
47203 +{
47204 + int err = 0;
47205 + int i;
47206 +
47207 + down_write(&init_mm.mmap_sem);
47208 + for (i = 0; i < numpages; i++, address += PAGE_SIZE) {
47209 + unsigned long pfn = __pa(address) >> PAGE_SHIFT;
47210 +
47211 + err = __change_page_attr(address, pfn, prot, PAGE_KERNEL);
47212 + if (err)
47213 + break;
47214 + /* Handle kernel mapping too which aliases part of the
47215 + * lowmem */
47216 + if (__pa(address) < KERNEL_TEXT_SIZE) {
47217 + unsigned long addr2;
47218 + pgprot_t prot2 = prot;
47219 + addr2 = __START_KERNEL_map + __pa(address);
47220 + pgprot_val(prot2) &= ~_PAGE_NX;
47221 + err = __change_page_attr(addr2, pfn, prot2, PAGE_KERNEL_EXEC);
47222 + }
47223 + }
47224 + up_write(&init_mm.mmap_sem);
47225 + return err;
47226 +}
47227 +
47228 +/* Don't call this for MMIO areas that may not have a mem_map entry */
47229 +int change_page_attr(struct page *page, int numpages, pgprot_t prot)
47230 +{
47231 + unsigned long addr = (unsigned long)page_address(page);
47232 + return change_page_attr_addr(addr, numpages, prot);
47233 +}
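For orientation, a minimal usage sketch of this interface (illustrative only; mark_rodata_ro() in init-xen.c above does essentially the same thing): change the protection of a range of kernel pages, then flush, since the comment above notes the change only takes full effect after global_flush_tlb().

	/* Illustrative only: write-protect one page of the kernel linear
	 * mapping.  'addr' is assumed to be a page-aligned kernel address. */
	static void example_protect_page(unsigned long addr)
	{
		change_page_attr_addr(addr, 1, PAGE_KERNEL_RO);
		global_flush_tlb();	/* required after change_page_attr*() */
	}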
47234 +
47235 +void global_flush_tlb(void)
47236 +{
47237 + struct deferred_page *df, *next_df;
47238 +
47239 + down_read(&init_mm.mmap_sem);
47240 + df = xchg(&df_list, NULL);
47241 + up_read(&init_mm.mmap_sem);
47242 + flush_map((df && !df->next) ? df->address : 0);
47243 + for (; df; df = next_df) {
47244 + next_df = df->next;
47245 + if (df->fpage)
47246 + __free_page(df->fpage);
47247 + kfree(df);
47248 + }
47249 +}
47250 +
47251 +EXPORT_SYMBOL(change_page_attr);
47252 +EXPORT_SYMBOL(global_flush_tlb);
47253 diff -Nur linux-2.6.16.33-noxen/arch/x86_64/oprofile/Makefile linux-2.6.16.33/arch/x86_64/oprofile/Makefile
47254 --- linux-2.6.16.33-noxen/arch/x86_64/oprofile/Makefile 2006-11-22 18:06:31.000000000 +0000
47255 +++ linux-2.6.16.33/arch/x86_64/oprofile/Makefile 2007-01-08 15:00:45.000000000 +0000
47256 @@ -11,9 +11,15 @@
47257 oprofilefs.o oprofile_stats.o \
47258 timer_int.o )
47259
47260 +ifdef CONFIG_XEN
47261 +XENOPROF_COMMON_OBJS = $(addprefix ../../../drivers/xen/xenoprof/, \
47262 + xenoprofile.o)
47263 +OPROFILE-y := xenoprof.o
47264 +else
47265 OPROFILE-y := init.o backtrace.o
47266 OPROFILE-$(CONFIG_X86_LOCAL_APIC) += nmi_int.o op_model_athlon.o op_model_p4.o \
47267 op_model_ppro.o
47268 OPROFILE-$(CONFIG_X86_IO_APIC) += nmi_timer_int.o
47269 -
47270 -oprofile-y = $(DRIVER_OBJS) $(addprefix ../../i386/oprofile/, $(OPROFILE-y))
47271 +endif
47272 +oprofile-y = $(DRIVER_OBJS) $(XENOPROF_COMMON_OBJS) \
47273 + $(addprefix ../../i386/oprofile/, $(OPROFILE-y))
47274 diff -Nur linux-2.6.16.33-noxen/arch/x86_64/pci/Makefile linux-2.6.16.33/arch/x86_64/pci/Makefile
47275 --- linux-2.6.16.33-noxen/arch/x86_64/pci/Makefile 2006-11-22 18:06:31.000000000 +0000
47276 +++ linux-2.6.16.33/arch/x86_64/pci/Makefile 2007-01-08 15:00:45.000000000 +0000
47277 @@ -15,10 +15,22 @@
47278
47279 obj-$(CONFIG_NUMA) += k8-bus.o
47280
47281 +# pcifront should be after mmconfig.o and direct.o as it should only
47282 +# take over if direct access to the PCI bus is unavailable
47283 +obj-$(CONFIG_XEN_PCIDEV_FRONTEND) += pcifront.o
47284 +
47285 direct-y += ../../i386/pci/direct.o
47286 acpi-y += ../../i386/pci/acpi.o
47287 +pcifront-y += ../../i386/pci/pcifront.o
47288 legacy-y += ../../i386/pci/legacy.o
47289 irq-y += ../../i386/pci/irq.o
47290 common-y += ../../i386/pci/common.o
47291 fixup-y += ../../i386/pci/fixup.o
47292 i386-y += ../../i386/pci/i386.o
47293 +
47294 +ifdef CONFIG_XEN
47295 +irq-y := ../../i386/pci/irq-xen.o
47296 +include $(srctree)/scripts/Makefile.xen
47297 +
47298 +obj-y := $(call cherrypickxen, $(obj-y))
47299 +endif
47300 diff -Nur linux-2.6.16.33-noxen/arch/x86_64/pci/mmconfig.c linux-2.6.16.33/arch/x86_64/pci/mmconfig.c
47301 --- linux-2.6.16.33-noxen/arch/x86_64/pci/mmconfig.c 2006-11-22 18:06:31.000000000 +0000
47302 +++ linux-2.6.16.33/arch/x86_64/pci/mmconfig.c 2007-05-23 21:00:01.000000000 +0000
47303 @@ -9,11 +9,19 @@
47304 #include <linux/init.h>
47305 #include <linux/acpi.h>
47306 #include <linux/bitmap.h>
47307 +#include <asm/e820.h>
47308 +
47309 #include "pci.h"
47310
47311 -#define MMCONFIG_APER_SIZE (256*1024*1024)
47312 +/* aperture is up to 256MB but BIOS may reserve less */
47313 +#define MMCONFIG_APER_MIN (2 * 1024*1024)
47314 +#define MMCONFIG_APER_MAX (256 * 1024*1024)
47315 +
47316 +/* Verify the first 16 busses. We assume that systems with more busses
47317 + get MCFG right. */
47318 +#define MAX_CHECK_BUS 16
47319
47320 -static DECLARE_BITMAP(fallback_slots, 32);
47321 +static DECLARE_BITMAP(fallback_slots, 32*MAX_CHECK_BUS);
47322
47323 /* Static virtual mapping of the MMCONFIG aperture */
47324 struct mmcfg_virt {
47325 @@ -55,7 +63,8 @@
47326 static char __iomem *pci_dev_base(unsigned int seg, unsigned int bus, unsigned int devfn)
47327 {
47328 char __iomem *addr;
47329 - if (seg == 0 && bus == 0 && test_bit(PCI_SLOT(devfn), &fallback_slots))
47330 + if (seg == 0 && bus < MAX_CHECK_BUS &&
47331 + test_bit(32*bus + PCI_SLOT(devfn), fallback_slots))
47332 return NULL;
47333 addr = get_virt(seg, bus);
47334 if (!addr)
47335 @@ -69,8 +78,10 @@
47336 char __iomem *addr;
47337
47338 /* Why do we have this when nobody checks it. How about a BUG()!? -AK */
47339 - if (unlikely(!value || (bus > 255) || (devfn > 255) || (reg > 4095)))
47340 + if (unlikely((bus > 255) || (devfn > 255) || (reg > 4095))) {
47341 + *value = -1;
47342 return -EINVAL;
47343 + }
47344
47345 addr = pci_dev_base(seg, bus, devfn);
47346 if (!addr)
47347 @@ -129,23 +140,56 @@
47348 Normally this can be expressed in the MCFG by not listing them
47349 and assigning suitable _SEGs, but this isn't implemented in some BIOS.
47350 Instead try to discover all devices on bus 0 that are unreachable using MM
47351 - and fallback for them.
47352 - We only do this for bus 0/seg 0 */
47353 + and fallback for them. */
47354 static __init void unreachable_devices(void)
47355 {
47356 - int i;
47357 - for (i = 0; i < 32; i++) {
47358 - u32 val1;
47359 - char __iomem *addr;
47360 + int i, k;
47361 + /* Use the max bus number from ACPI here? */
47362 + for (k = 0; k < MAX_CHECK_BUS; k++) {
47363 + for (i = 0; i < 32; i++) {
47364 + u32 val1;
47365 + char __iomem *addr;
47366 +
47367 + pci_conf1_read(0, k, PCI_DEVFN(i,0), 0, 4, &val1);
47368 + if (val1 == 0xffffffff)
47369 + continue;
47370 + addr = pci_dev_base(0, k, PCI_DEVFN(i, 0));
47371 + if (addr == NULL|| readl(addr) != val1) {
47372 + set_bit(i + 32*k, fallback_slots);
47373 + printk(KERN_NOTICE
47374 + "PCI: No mmconfig possible on device %x:%x\n",
47375 + k, i);
47376 + }
47377 + }
47378 + }
47379 +}
47380
47381 - pci_conf1_read(0, 0, PCI_DEVFN(i,0), 0, 4, &val1);
47382 - if (val1 == 0xffffffff)
47383 +/* NB. Ripped from arch/x86_64/kernel/e820.c for this Xen bugfix patch. */
47384 +#ifdef CONFIG_XEN
47385 +extern struct e820map machine_e820;
47386 +#define e820 machine_e820
47387 +#endif
47388 +static int __init e820_all_mapped(unsigned long start, unsigned long end, unsigned type)
47389 +{
47390 + int i;
47391 + for (i = 0; i < e820.nr_map; i++) {
47392 + struct e820entry *ei = &e820.map[i];
47393 + if (type && ei->type != type)
47394 continue;
47395 - addr = pci_dev_base(0, 0, PCI_DEVFN(i, 0));
47396 - if (addr == NULL|| readl(addr) != val1) {
47397 - set_bit(i, &fallback_slots);
47398 - }
47399 + /* is the region (partly) overlapping the current region? */
47400 + if (ei->addr >= end || ei->addr + ei->size <= start)
47401 + continue;
47402 +
47403 + /* if the region is at the beginning of <start,end> we move
47404 + * start to the end of the region since it's ok until there
47405 + */
47406 + if (ei->addr <= start)
47407 + start = ei->addr + ei->size;
47408 + /* if start is now at or beyond end, we're done, full coverage */
47409 + if (start >= end)
47410 + return 1; /* we're done */
47411 }
47412 + return 0;
47413 }
47414
47415 static int __init pci_mmcfg_init(void)
47416 @@ -161,6 +205,15 @@
47417 (pci_mmcfg_config[0].base_address == 0))
47418 return 0;
47419
47420 + if (!e820_all_mapped(pci_mmcfg_config[0].base_address,
47421 + pci_mmcfg_config[0].base_address + MMCONFIG_APER_MIN,
47422 + E820_RESERVED)) {
47423 + printk(KERN_ERR "PCI: BIOS Bug: MCFG area at %x is not E820-reserved\n",
47424 + pci_mmcfg_config[0].base_address);
47425 + printk(KERN_ERR "PCI: Not using MMCONFIG.\n");
47426 + return 0;
47427 + }
47428 +
47429 /* RED-PEN i386 doesn't do _nocache right now */
47430 pci_mmcfg_virt = kmalloc(sizeof(*pci_mmcfg_virt) * pci_mmcfg_config_num, GFP_KERNEL);
47431 if (pci_mmcfg_virt == NULL) {
47432 @@ -169,7 +222,8 @@
47433 }
47434 for (i = 0; i < pci_mmcfg_config_num; ++i) {
47435 pci_mmcfg_virt[i].cfg = &pci_mmcfg_config[i];
47436 - pci_mmcfg_virt[i].virt = ioremap_nocache(pci_mmcfg_config[i].base_address, MMCONFIG_APER_SIZE);
47437 + pci_mmcfg_virt[i].virt = ioremap_nocache(pci_mmcfg_config[i].base_address,
47438 + MMCONFIG_APER_MAX);
47439 if (!pci_mmcfg_virt[i].virt) {
47440 printk("PCI: Cannot map mmconfig aperture for segment %d\n",
47441 pci_mmcfg_config[i].pci_segment_group_number);
47442 diff -Nur linux-2.6.16.33-noxen/drivers/Makefile linux-2.6.16.33/drivers/Makefile
47443 --- linux-2.6.16.33-noxen/drivers/Makefile 2006-11-22 18:06:31.000000000 +0000
47444 +++ linux-2.6.16.33/drivers/Makefile 2007-01-08 15:00:45.000000000 +0000
47445 @@ -34,6 +34,7 @@
47446 obj-$(CONFIG_NUBUS) += nubus/
47447 obj-$(CONFIG_ATM) += atm/
47448 obj-$(CONFIG_PPC_PMAC) += macintosh/
47449 +obj-$(CONFIG_XEN) += xen/
47450 obj-$(CONFIG_IDE) += ide/
47451 obj-$(CONFIG_FC4) += fc4/
47452 obj-$(CONFIG_SCSI) += scsi/
47453 diff -Nur linux-2.6.16.33-noxen/drivers/acpi/Kconfig linux-2.6.16.33/drivers/acpi/Kconfig
47454 --- linux-2.6.16.33-noxen/drivers/acpi/Kconfig 2006-11-22 18:06:31.000000000 +0000
47455 +++ linux-2.6.16.33/drivers/acpi/Kconfig 2007-01-08 15:00:45.000000000 +0000
47456 @@ -46,7 +46,7 @@
47457
47458 config ACPI_SLEEP
47459 bool "Sleep States"
47460 - depends on X86 && (!SMP || SUSPEND_SMP)
47461 + depends on X86 && (!SMP || SUSPEND_SMP) && !XEN
47462 depends on PM
47463 default y
47464 ---help---
47465 @@ -287,6 +287,7 @@
47466 config X86_PM_TIMER
47467 bool "Power Management Timer Support" if EMBEDDED
47468 depends on X86
47469 + depends on !XEN
47470 default y
47471 help
47472 The Power Management Timer is available on all ACPI-capable,
47473 diff -Nur linux-2.6.16.33-noxen/drivers/acpi/tables.c linux-2.6.16.33/drivers/acpi/tables.c
47474 --- linux-2.6.16.33-noxen/drivers/acpi/tables.c 2006-11-22 18:06:31.000000000 +0000
47475 +++ linux-2.6.16.33/drivers/acpi/tables.c 2007-01-08 15:00:45.000000000 +0000
47476 @@ -572,6 +572,11 @@
47477 *
47478 * result: sdt_entry[] is initialized
47479 */
47480 +#if defined(CONFIG_X86_XEN) || defined(CONFIG_X86_64_XEN)
47481 +#define acpi_rsdp_phys_to_va(rsdp_phys) isa_bus_to_virt(rsdp_phys)
47482 +#else
47483 +#define acpi_rsdp_phys_to_va(rsdp_phys) __va(rsdp_phys)
47484 +#endif
47485
47486 int __init acpi_table_init(void)
47487 {
47488 @@ -587,7 +592,7 @@
47489 return -ENODEV;
47490 }
47491
47492 - rsdp = (struct acpi_table_rsdp *)__va(rsdp_phys);
47493 + rsdp = (struct acpi_table_rsdp *)acpi_rsdp_phys_to_va(rsdp_phys);
47494 if (!rsdp) {
47495 printk(KERN_WARNING PREFIX "Unable to map RSDP\n");
47496 return -ENODEV;
47497 diff -Nur linux-2.6.16.33-noxen/drivers/base/bus.c linux-2.6.16.33/drivers/base/bus.c
47498 --- linux-2.6.16.33-noxen/drivers/base/bus.c 2006-11-22 18:06:31.000000000 +0000
47499 +++ linux-2.6.16.33/drivers/base/bus.c 2007-05-23 21:00:01.000000000 +0000
47500 @@ -188,6 +188,11 @@
47501 up(&dev->sem);
47502 if (dev->parent)
47503 up(&dev->parent->sem);
47504 +
47505 + if (err > 0) /* success */
47506 + err = count;
47507 + else if (err == 0) /* driver didn't accept device */
47508 + err = -ENODEV;
47509 }
47510 put_device(dev);
47511 put_bus(bus);
47512 diff -Nur linux-2.6.16.33-noxen/drivers/block/aoe/aoenet.c linux-2.6.16.33/drivers/block/aoe/aoenet.c
47513 --- linux-2.6.16.33-noxen/drivers/block/aoe/aoenet.c 2006-11-22 18:06:31.000000000 +0000
47514 +++ linux-2.6.16.33/drivers/block/aoe/aoenet.c 2007-05-23 21:00:01.000000000 +0000
47515 @@ -95,9 +95,8 @@
47516 static struct sk_buff *
47517 skb_check(struct sk_buff *skb)
47518 {
47519 - if (skb_is_nonlinear(skb))
47520 if ((skb = skb_share_check(skb, GFP_ATOMIC)))
47521 - if (skb_linearize(skb, GFP_ATOMIC) < 0) {
47522 + if (skb_linearize(skb)) {
47523 dev_kfree_skb(skb);
47524 return NULL;
47525 }
47526 diff -Nur linux-2.6.16.33-noxen/drivers/char/mem.c linux-2.6.16.33/drivers/char/mem.c
47527 --- linux-2.6.16.33-noxen/drivers/char/mem.c 2006-11-22 18:06:31.000000000 +0000
47528 +++ linux-2.6.16.33/drivers/char/mem.c 2007-01-08 15:00:45.000000000 +0000
47529 @@ -108,6 +108,7 @@
47530 }
47531 #endif
47532
47533 +#ifndef ARCH_HAS_DEV_MEM
47534 /*
47535 * This funcion reads the *physical* memory. The f_pos points directly to the
47536 * memory location.
47537 @@ -232,6 +233,7 @@
47538 *ppos += written;
47539 return written;
47540 }
47541 +#endif
47542
47543 #ifndef __HAVE_PHYS_MEM_ACCESS_PROT
47544 static pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
47545 @@ -773,6 +775,7 @@
47546 #define open_kmem open_mem
47547 #define open_oldmem open_mem
47548
47549 +#ifndef ARCH_HAS_DEV_MEM
47550 static struct file_operations mem_fops = {
47551 .llseek = memory_lseek,
47552 .read = read_mem,
47553 @@ -780,6 +783,9 @@
47554 .mmap = mmap_mem,
47555 .open = open_mem,
47556 };
47557 +#else
47558 +extern struct file_operations mem_fops;
47559 +#endif
47560
47561 static struct file_operations kmem_fops = {
47562 .llseek = memory_lseek,
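
The mem.c hunks wrap the generic /dev/mem read/write path and its mem_fops in #ifndef ARCH_HAS_DEV_MEM, so an architecture port (presumably the Xen memory code elsewhere in this patch) can define that symbol and supply its own mem_fops, which the character-device table then reaches through the extern declaration. A small stand-alone sketch of the same preprocessor override pattern; dev_mem_backend() is an invented stand-in, not a kernel symbol.

#include <stdio.h>

/* Define (or build with -DARCH_HAS_DEV_MEM) to let another object file
 * provide the implementation instead of the generic one below. */
/* #define ARCH_HAS_DEV_MEM */

#ifndef ARCH_HAS_DEV_MEM
/* generic implementation, compiled only when no override exists */
static const char *dev_mem_backend(void) { return "generic"; }
#else
/* only the declaration is visible here, exactly like the
 * "extern struct file_operations mem_fops;" in the hunk above */
extern const char *dev_mem_backend(void);
#endif

int main(void)
{
        printf("backend: %s\n", dev_mem_backend());
        return 0;
}
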
47563 diff -Nur linux-2.6.16.33-noxen/drivers/char/tpm/Kconfig linux-2.6.16.33/drivers/char/tpm/Kconfig
47564 --- linux-2.6.16.33-noxen/drivers/char/tpm/Kconfig 2006-11-22 18:06:31.000000000 +0000
47565 +++ linux-2.6.16.33/drivers/char/tpm/Kconfig 2007-01-08 15:00:45.000000000 +0000
47566 @@ -20,9 +20,18 @@
47567 Note: For more TPM drivers enable CONFIG_PNP, CONFIG_ACPI
47568 and CONFIG_PNPACPI.
47569
47570 +config TCG_TIS
47571 + tristate "TPM Interface Specification 1.2 Interface"
47572 + depends on TCG_TPM
47573 + ---help---
47574 + If you have a TPM security chip that is compliant with the
47575 + TCG TIS 1.2 TPM specification say Yes and it will be accessible
47576 + from within Linux. To compile this driver as a module, choose
47577 + M here; the module will be called tpm_tis.
47578 +
47579 config TCG_NSC
47580 tristate "National Semiconductor TPM Interface"
47581 - depends on TCG_TPM
47582 + depends on TCG_TPM && PNPACPI
47583 ---help---
47584 If you have a TPM security chip from National Semicondutor
47585 say Yes and it will be accessible from within Linux. To
47586 @@ -49,5 +58,13 @@
47587 Further information on this driver and the supported hardware
47588 can be found at http://www.prosec.rub.de/tpm
47589
47590 -endmenu
47591 +config TCG_XEN
47592 + tristate "XEN TPM Interface"
47593 + depends on TCG_TPM && XEN
47594 + ---help---
47595 + If you want to make TPM support available to a Xen user domain,
47596 + say Yes and it will be accessible from within Linux.
47597 + To compile this driver as a module, choose M here; the module
47598 + will be called tpm_xenu.
47599
47600 +endmenu
47601 diff -Nur linux-2.6.16.33-noxen/drivers/char/tpm/Makefile linux-2.6.16.33/drivers/char/tpm/Makefile
47602 --- linux-2.6.16.33-noxen/drivers/char/tpm/Makefile 2006-11-22 18:06:31.000000000 +0000
47603 +++ linux-2.6.16.33/drivers/char/tpm/Makefile 2007-01-08 15:00:45.000000000 +0000
47604 @@ -5,6 +5,9 @@
47605 ifdef CONFIG_ACPI
47606 obj-$(CONFIG_TCG_TPM) += tpm_bios.o
47607 endif
47608 +obj-$(CONFIG_TCG_TIS) += tpm_tis.o
47609 obj-$(CONFIG_TCG_NSC) += tpm_nsc.o
47610 obj-$(CONFIG_TCG_ATMEL) += tpm_atmel.o
47611 obj-$(CONFIG_TCG_INFINEON) += tpm_infineon.o
47612 +obj-$(CONFIG_TCG_XEN) += tpm_xenu.o
47613 +tpm_xenu-y = tpm_xen.o tpm_vtpm.o
47614 diff -Nur linux-2.6.16.33-noxen/drivers/char/tpm/tpm.c linux-2.6.16.33/drivers/char/tpm/tpm.c
47615 --- linux-2.6.16.33-noxen/drivers/char/tpm/tpm.c 2006-11-22 18:06:31.000000000 +0000
47616 +++ linux-2.6.16.33/drivers/char/tpm/tpm.c 2007-01-08 15:00:45.000000000 +0000
47617 @@ -30,14 +30,295 @@
47618
47619 enum tpm_const {
47620 TPM_MINOR = 224, /* officially assigned */
47621 +#ifndef CONFIG_XEN
47622 TPM_BUFSIZE = 2048,
47623 +#endif
47624 TPM_NUM_DEVICES = 256,
47625 - TPM_NUM_MASK_ENTRIES = TPM_NUM_DEVICES / (8 * sizeof(int))
47626 };
47627
47628 +enum tpm_duration {
47629 + TPM_SHORT = 0,
47630 + TPM_MEDIUM = 1,
47631 + TPM_LONG = 2,
47632 + TPM_UNDEFINED,
47633 +};
47634 +
47635 +#define TPM_MAX_ORDINAL 243
47636 +#define TPM_MAX_PROTECTED_ORDINAL 12
47637 +#define TPM_PROTECTED_ORDINAL_MASK 0xFF
47638 +
47639 static LIST_HEAD(tpm_chip_list);
47640 static DEFINE_SPINLOCK(driver_lock);
47641 -static int dev_mask[TPM_NUM_MASK_ENTRIES];
47642 +static DECLARE_BITMAP(dev_mask, TPM_NUM_DEVICES);
47643 +
47644 +/*
47645 + * Array with one entry per ordinal defining the maximum amount
47646 + * of time the chip could take to return the result. The ordinal
47647 + * designation of short, medium or long is defined in a table in
47648 + * TCG Specification TPM Main Part 2 TPM Structures Section 17. The
47649 + * values of the SHORT, MEDIUM, and LONG durations are retrieved
47650 + * from the chip during initialization with a call to tpm_get_timeouts.
47651 + */
47652 +static const u8 tpm_protected_ordinal_duration[TPM_MAX_PROTECTED_ORDINAL] = {
47653 + TPM_UNDEFINED, /* 0 */
47654 + TPM_UNDEFINED,
47655 + TPM_UNDEFINED,
47656 + TPM_UNDEFINED,
47657 + TPM_UNDEFINED,
47658 + TPM_UNDEFINED, /* 5 */
47659 + TPM_UNDEFINED,
47660 + TPM_UNDEFINED,
47661 + TPM_UNDEFINED,
47662 + TPM_UNDEFINED,
47663 + TPM_SHORT, /* 10 */
47664 + TPM_SHORT,
47665 +};
47666 +
47667 +static const u8 tpm_ordinal_duration[TPM_MAX_ORDINAL] = {
47668 + TPM_UNDEFINED, /* 0 */
47669 + TPM_UNDEFINED,
47670 + TPM_UNDEFINED,
47671 + TPM_UNDEFINED,
47672 + TPM_UNDEFINED,
47673 + TPM_UNDEFINED, /* 5 */
47674 + TPM_UNDEFINED,
47675 + TPM_UNDEFINED,
47676 + TPM_UNDEFINED,
47677 + TPM_UNDEFINED,
47678 + TPM_SHORT, /* 10 */
47679 + TPM_SHORT,
47680 + TPM_MEDIUM,
47681 + TPM_LONG,
47682 + TPM_LONG,
47683 + TPM_MEDIUM, /* 15 */
47684 + TPM_SHORT,
47685 + TPM_SHORT,
47686 + TPM_MEDIUM,
47687 + TPM_LONG,
47688 + TPM_SHORT, /* 20 */
47689 + TPM_SHORT,
47690 + TPM_MEDIUM,
47691 + TPM_MEDIUM,
47692 + TPM_MEDIUM,
47693 + TPM_SHORT, /* 25 */
47694 + TPM_SHORT,
47695 + TPM_MEDIUM,
47696 + TPM_SHORT,
47697 + TPM_SHORT,
47698 + TPM_MEDIUM, /* 30 */
47699 + TPM_LONG,
47700 + TPM_MEDIUM,
47701 + TPM_SHORT,
47702 + TPM_SHORT,
47703 + TPM_SHORT, /* 35 */
47704 + TPM_MEDIUM,
47705 + TPM_MEDIUM,
47706 + TPM_UNDEFINED,
47707 + TPM_UNDEFINED,
47708 + TPM_MEDIUM, /* 40 */
47709 + TPM_LONG,
47710 + TPM_MEDIUM,
47711 + TPM_SHORT,
47712 + TPM_SHORT,
47713 + TPM_SHORT, /* 45 */
47714 + TPM_SHORT,
47715 + TPM_SHORT,
47716 + TPM_SHORT,
47717 + TPM_LONG,
47718 + TPM_MEDIUM, /* 50 */
47719 + TPM_MEDIUM,
47720 + TPM_UNDEFINED,
47721 + TPM_UNDEFINED,
47722 + TPM_UNDEFINED,
47723 + TPM_UNDEFINED, /* 55 */
47724 + TPM_UNDEFINED,
47725 + TPM_UNDEFINED,
47726 + TPM_UNDEFINED,
47727 + TPM_UNDEFINED,
47728 + TPM_MEDIUM, /* 60 */
47729 + TPM_MEDIUM,
47730 + TPM_MEDIUM,
47731 + TPM_SHORT,
47732 + TPM_SHORT,
47733 + TPM_MEDIUM, /* 65 */
47734 + TPM_UNDEFINED,
47735 + TPM_UNDEFINED,
47736 + TPM_UNDEFINED,
47737 + TPM_UNDEFINED,
47738 + TPM_SHORT, /* 70 */
47739 + TPM_SHORT,
47740 + TPM_UNDEFINED,
47741 + TPM_UNDEFINED,
47742 + TPM_UNDEFINED,
47743 + TPM_UNDEFINED, /* 75 */
47744 + TPM_UNDEFINED,
47745 + TPM_UNDEFINED,
47746 + TPM_UNDEFINED,
47747 + TPM_UNDEFINED,
47748 + TPM_LONG, /* 80 */
47749 + TPM_UNDEFINED,
47750 + TPM_MEDIUM,
47751 + TPM_LONG,
47752 + TPM_SHORT,
47753 + TPM_UNDEFINED, /* 85 */
47754 + TPM_UNDEFINED,
47755 + TPM_UNDEFINED,
47756 + TPM_UNDEFINED,
47757 + TPM_UNDEFINED,
47758 + TPM_SHORT, /* 90 */
47759 + TPM_SHORT,
47760 + TPM_SHORT,
47761 + TPM_SHORT,
47762 + TPM_SHORT,
47763 + TPM_UNDEFINED, /* 95 */
47764 + TPM_UNDEFINED,
47765 + TPM_UNDEFINED,
47766 + TPM_UNDEFINED,
47767 + TPM_UNDEFINED,
47768 + TPM_MEDIUM, /* 100 */
47769 + TPM_SHORT,
47770 + TPM_SHORT,
47771 + TPM_UNDEFINED,
47772 + TPM_UNDEFINED,
47773 + TPM_UNDEFINED, /* 105 */
47774 + TPM_UNDEFINED,
47775 + TPM_UNDEFINED,
47776 + TPM_UNDEFINED,
47777 + TPM_UNDEFINED,
47778 + TPM_SHORT, /* 110 */
47779 + TPM_SHORT,
47780 + TPM_SHORT,
47781 + TPM_SHORT,
47782 + TPM_SHORT,
47783 + TPM_SHORT, /* 115 */
47784 + TPM_SHORT,
47785 + TPM_SHORT,
47786 + TPM_UNDEFINED,
47787 + TPM_UNDEFINED,
47788 + TPM_LONG, /* 120 */
47789 + TPM_LONG,
47790 + TPM_MEDIUM,
47791 + TPM_UNDEFINED,
47792 + TPM_SHORT,
47793 + TPM_SHORT, /* 125 */
47794 + TPM_SHORT,
47795 + TPM_LONG,
47796 + TPM_SHORT,
47797 + TPM_SHORT,
47798 + TPM_SHORT, /* 130 */
47799 + TPM_MEDIUM,
47800 + TPM_UNDEFINED,
47801 + TPM_SHORT,
47802 + TPM_MEDIUM,
47803 + TPM_UNDEFINED, /* 135 */
47804 + TPM_UNDEFINED,
47805 + TPM_UNDEFINED,
47806 + TPM_UNDEFINED,
47807 + TPM_UNDEFINED,
47808 + TPM_SHORT, /* 140 */
47809 + TPM_SHORT,
47810 + TPM_UNDEFINED,
47811 + TPM_UNDEFINED,
47812 + TPM_UNDEFINED,
47813 + TPM_UNDEFINED, /* 145 */
47814 + TPM_UNDEFINED,
47815 + TPM_UNDEFINED,
47816 + TPM_UNDEFINED,
47817 + TPM_UNDEFINED,
47818 + TPM_SHORT, /* 150 */
47819 + TPM_MEDIUM,
47820 + TPM_MEDIUM,
47821 + TPM_SHORT,
47822 + TPM_SHORT,
47823 + TPM_UNDEFINED, /* 155 */
47824 + TPM_UNDEFINED,
47825 + TPM_UNDEFINED,
47826 + TPM_UNDEFINED,
47827 + TPM_UNDEFINED,
47828 + TPM_SHORT, /* 160 */
47829 + TPM_SHORT,
47830 + TPM_SHORT,
47831 + TPM_SHORT,
47832 + TPM_UNDEFINED,
47833 + TPM_UNDEFINED, /* 165 */
47834 + TPM_UNDEFINED,
47835 + TPM_UNDEFINED,
47836 + TPM_UNDEFINED,
47837 + TPM_UNDEFINED,
47838 + TPM_LONG, /* 170 */
47839 + TPM_UNDEFINED,
47840 + TPM_UNDEFINED,
47841 + TPM_UNDEFINED,
47842 + TPM_UNDEFINED,
47843 + TPM_UNDEFINED, /* 175 */
47844 + TPM_UNDEFINED,
47845 + TPM_UNDEFINED,
47846 + TPM_UNDEFINED,
47847 + TPM_UNDEFINED,
47848 + TPM_MEDIUM, /* 180 */
47849 + TPM_SHORT,
47850 + TPM_MEDIUM,
47851 + TPM_MEDIUM,
47852 + TPM_MEDIUM,
47853 + TPM_MEDIUM, /* 185 */
47854 + TPM_SHORT,
47855 + TPM_UNDEFINED,
47856 + TPM_UNDEFINED,
47857 + TPM_UNDEFINED,
47858 + TPM_UNDEFINED, /* 190 */
47859 + TPM_UNDEFINED,
47860 + TPM_UNDEFINED,
47861 + TPM_UNDEFINED,
47862 + TPM_UNDEFINED,
47863 + TPM_UNDEFINED, /* 195 */
47864 + TPM_UNDEFINED,
47865 + TPM_UNDEFINED,
47866 + TPM_UNDEFINED,
47867 + TPM_UNDEFINED,
47868 + TPM_SHORT, /* 200 */
47869 + TPM_UNDEFINED,
47870 + TPM_UNDEFINED,
47871 + TPM_UNDEFINED,
47872 + TPM_SHORT,
47873 + TPM_SHORT, /* 205 */
47874 + TPM_SHORT,
47875 + TPM_SHORT,
47876 + TPM_SHORT,
47877 + TPM_SHORT,
47878 + TPM_MEDIUM, /* 210 */
47879 + TPM_UNDEFINED,
47880 + TPM_MEDIUM,
47881 + TPM_MEDIUM,
47882 + TPM_MEDIUM,
47883 + TPM_UNDEFINED, /* 215 */
47884 + TPM_MEDIUM,
47885 + TPM_UNDEFINED,
47886 + TPM_UNDEFINED,
47887 + TPM_SHORT,
47888 + TPM_SHORT, /* 220 */
47889 + TPM_SHORT,
47890 + TPM_SHORT,
47891 + TPM_SHORT,
47892 + TPM_SHORT,
47893 + TPM_UNDEFINED, /* 225 */
47894 + TPM_UNDEFINED,
47895 + TPM_UNDEFINED,
47896 + TPM_UNDEFINED,
47897 + TPM_UNDEFINED,
47898 + TPM_SHORT, /* 230 */
47899 + TPM_LONG,
47900 + TPM_MEDIUM,
47901 + TPM_UNDEFINED,
47902 + TPM_UNDEFINED,
47903 + TPM_UNDEFINED, /* 235 */
47904 + TPM_UNDEFINED,
47905 + TPM_UNDEFINED,
47906 + TPM_UNDEFINED,
47907 + TPM_UNDEFINED,
47908 + TPM_SHORT, /* 240 */
47909 + TPM_UNDEFINED,
47910 + TPM_MEDIUM,
47911 +};
47912
47913 static void user_reader_timeout(unsigned long ptr)
47914 {
47915 @@ -46,28 +327,58 @@
47916 schedule_work(&chip->work);
47917 }
47918
47919 -static void timeout_work(void * ptr)
47920 +static void timeout_work(void *ptr)
47921 {
47922 struct tpm_chip *chip = ptr;
47923
47924 down(&chip->buffer_mutex);
47925 atomic_set(&chip->data_pending, 0);
47926 +#ifndef CONFIG_XEN
47927 memset(chip->data_buffer, 0, TPM_BUFSIZE);
47928 +#else
47929 + memset(chip->data_buffer, 0, get_chip_buffersize(chip));
47930 +#endif
47931 up(&chip->buffer_mutex);
47932 }
47933
47934 /*
47935 + * Returns max number of jiffies to wait
47936 + */
47937 +unsigned long tpm_calc_ordinal_duration(struct tpm_chip *chip,
47938 + u32 ordinal)
47939 +{
47940 + int duration_idx = TPM_UNDEFINED;
47941 + int duration = 0;
47942 +
47943 + if (ordinal < TPM_MAX_ORDINAL)
47944 + duration_idx = tpm_ordinal_duration[ordinal];
47945 + else if ((ordinal & TPM_PROTECTED_ORDINAL_MASK) <
47946 + TPM_MAX_PROTECTED_ORDINAL)
47947 + duration_idx =
47948 + tpm_protected_ordinal_duration[ordinal &
47949 + TPM_PROTECTED_ORDINAL_MASK];
47950 +
47951 + if (duration_idx != TPM_UNDEFINED)
47952 + duration = chip->vendor.duration[duration_idx];
47953 + if (duration <= 0)
47954 + return 2 * 60 * HZ;
47955 + else
47956 + return duration;
47957 +}
47958 +EXPORT_SYMBOL_GPL(tpm_calc_ordinal_duration);
47959 +
47960 +/*
47961 * Internal kernel interface to transmit TPM commands
47962 */
47963 static ssize_t tpm_transmit(struct tpm_chip *chip, const char *buf,
47964 size_t bufsiz)
47965 {
47966 ssize_t rc;
47967 - u32 count;
47968 + u32 count, ordinal;
47969 unsigned long stop;
47970
47971 count = be32_to_cpu(*((__be32 *) (buf + 2)));
47972 -
47973 + ordinal = be32_to_cpu(*((__be32 *) (buf + 6)));
47974 if (count == 0)
47975 return -ENODATA;
47976 if (count > bufsiz) {
47977 @@ -78,21 +389,23 @@
47978
47979 down(&chip->tpm_mutex);
47980
47981 - if ((rc = chip->vendor->send(chip, (u8 *) buf, count)) < 0) {
47982 + if ((rc = chip->vendor.send(chip, (u8 *) buf, count)) < 0) {
47983 dev_err(chip->dev,
47984 "tpm_transmit: tpm_send: error %zd\n", rc);
47985 goto out;
47986 }
47987
47988 - stop = jiffies + 2 * 60 * HZ;
47989 + if (chip->vendor.irq)
47990 + goto out_recv;
47991 +
47992 + stop = jiffies + tpm_calc_ordinal_duration(chip, ordinal);
47993 do {
47994 - u8 status = chip->vendor->status(chip);
47995 - if ((status & chip->vendor->req_complete_mask) ==
47996 - chip->vendor->req_complete_val) {
47997 + u8 status = chip->vendor.status(chip);
47998 + if ((status & chip->vendor.req_complete_mask) ==
47999 + chip->vendor.req_complete_val)
48000 goto out_recv;
48001 - }
48002
48003 - if ((status == chip->vendor->req_canceled)) {
48004 + if ((status == chip->vendor.req_canceled)) {
48005 dev_err(chip->dev, "Operation Canceled\n");
48006 rc = -ECANCELED;
48007 goto out;
48008 @@ -102,14 +415,13 @@
48009 rmb();
48010 } while (time_before(jiffies, stop));
48011
48012 -
48013 - chip->vendor->cancel(chip);
48014 + chip->vendor.cancel(chip);
48015 dev_err(chip->dev, "Operation Timed out\n");
48016 rc = -ETIME;
48017 goto out;
48018
48019 out_recv:
48020 - rc = chip->vendor->recv(chip, (u8 *) buf, bufsiz);
48021 + rc = chip->vendor.recv(chip, (u8 *) buf, bufsiz);
48022 if (rc < 0)
48023 dev_err(chip->dev,
48024 "tpm_transmit: tpm_recv: error %zd\n", rc);
48025 @@ -119,17 +431,247 @@
48026 }
48027
48028 #define TPM_DIGEST_SIZE 20
48029 -#define CAP_PCR_RESULT_SIZE 18
48030 -static const u8 cap_pcr[] = {
48031 +#define TPM_ERROR_SIZE 10
48032 +#define TPM_RET_CODE_IDX 6
48033 +#define TPM_GET_CAP_RET_SIZE_IDX 10
48034 +#define TPM_GET_CAP_RET_UINT32_1_IDX 14
48035 +#define TPM_GET_CAP_RET_UINT32_2_IDX 18
48036 +#define TPM_GET_CAP_RET_UINT32_3_IDX 22
48037 +#define TPM_GET_CAP_RET_UINT32_4_IDX 26
48038 +#define TPM_GET_CAP_PERM_DISABLE_IDX 16
48039 +#define TPM_GET_CAP_PERM_INACTIVE_IDX 18
48040 +#define TPM_GET_CAP_RET_BOOL_1_IDX 14
48041 +#define TPM_GET_CAP_TEMP_INACTIVE_IDX 16
48042 +
48043 +#define TPM_CAP_IDX 13
48044 +#define TPM_CAP_SUBCAP_IDX 21
48045 +
48046 +enum tpm_capabilities {
48047 + TPM_CAP_FLAG = 4,
48048 + TPM_CAP_PROP = 5,
48049 +};
48050 +
48051 +enum tpm_sub_capabilities {
48052 + TPM_CAP_PROP_PCR = 0x1,
48053 + TPM_CAP_PROP_MANUFACTURER = 0x3,
48054 + TPM_CAP_FLAG_PERM = 0x8,
48055 + TPM_CAP_FLAG_VOL = 0x9,
48056 + TPM_CAP_PROP_OWNER = 0x11,
48057 + TPM_CAP_PROP_TIS_TIMEOUT = 0x15,
48058 + TPM_CAP_PROP_TIS_DURATION = 0x20,
48059 +};
48060 +
48061 +/*
48062 + * This is a semi generic GetCapability command for use
48063 + * with the capability type TPM_CAP_PROP or TPM_CAP_FLAG
48064 + * and their associated sub_capabilities.
48065 + */
48066 +
48067 +static const u8 tpm_cap[] = {
48068 0, 193, /* TPM_TAG_RQU_COMMAND */
48069 0, 0, 0, 22, /* length */
48070 0, 0, 0, 101, /* TPM_ORD_GetCapability */
48071 - 0, 0, 0, 5,
48072 - 0, 0, 0, 4,
48073 - 0, 0, 1, 1
48074 + 0, 0, 0, 0, /* TPM_CAP_<TYPE> */
48075 + 0, 0, 0, 4, /* TPM_CAP_SUB_<TYPE> size */
48076 + 0, 0, 1, 0 /* TPM_CAP_SUB_<TYPE> */
48077 };
48078
48079 -#define READ_PCR_RESULT_SIZE 30
48080 +static ssize_t transmit_cmd(struct tpm_chip *chip, u8 *data, int len,
48081 + char *desc)
48082 +{
48083 + int err;
48084 +
48085 + len = tpm_transmit(chip, data, len);
48086 + if (len < 0)
48087 + return len;
48088 + if (len == TPM_ERROR_SIZE) {
48089 + err = be32_to_cpu(*((__be32 *) (data + TPM_RET_CODE_IDX)));
48090 + dev_dbg(chip->dev, "A TPM error (%d) occurred %s\n", err, desc);
48091 + return err;
48092 + }
48093 + return 0;
48094 +}
48095 +
48096 +void tpm_gen_interrupt(struct tpm_chip *chip)
48097 +{
48098 + u8 data[max_t(int, ARRAY_SIZE(tpm_cap), 30)];
48099 + ssize_t rc;
48100 +
48101 + memcpy(data, tpm_cap, sizeof(tpm_cap));
48102 + data[TPM_CAP_IDX] = TPM_CAP_PROP;
48103 + data[TPM_CAP_SUBCAP_IDX] = TPM_CAP_PROP_TIS_TIMEOUT;
48104 +
48105 + rc = transmit_cmd(chip, data, sizeof(data),
48106 + "attempting to determine the timeouts");
48107 +}
48108 +EXPORT_SYMBOL_GPL(tpm_gen_interrupt);
48109 +
48110 +void tpm_get_timeouts(struct tpm_chip *chip)
48111 +{
48112 + u8 data[max_t(int, ARRAY_SIZE(tpm_cap), 30)];
48113 + ssize_t rc;
48114 + u32 timeout;
48115 +
48116 + memcpy(data, tpm_cap, sizeof(tpm_cap));
48117 + data[TPM_CAP_IDX] = TPM_CAP_PROP;
48118 + data[TPM_CAP_SUBCAP_IDX] = TPM_CAP_PROP_TIS_TIMEOUT;
48119 +
48120 + rc = transmit_cmd(chip, data, sizeof(data),
48121 + "attempting to determine the timeouts");
48122 + if (rc)
48123 + goto duration;
48124 +
48125 + if (be32_to_cpu(*((__be32 *) (data + TPM_GET_CAP_RET_SIZE_IDX)))
48126 + != 4 * sizeof(u32))
48127 + goto duration;
48128 +
48129 + /* Don't overwrite default if value is 0 */
48130 + timeout =
48131 + be32_to_cpu(*((__be32 *) (data + TPM_GET_CAP_RET_UINT32_1_IDX)));
48132 + if (timeout)
48133 + chip->vendor.timeout_a = msecs_to_jiffies(timeout);
48134 + timeout =
48135 + be32_to_cpu(*((__be32 *) (data + TPM_GET_CAP_RET_UINT32_2_IDX)));
48136 + if (timeout)
48137 + chip->vendor.timeout_b = msecs_to_jiffies(timeout);
48138 + timeout =
48139 + be32_to_cpu(*((__be32 *) (data + TPM_GET_CAP_RET_UINT32_3_IDX)));
48140 + if (timeout)
48141 + chip->vendor.timeout_c = msecs_to_jiffies(timeout);
48142 + timeout =
48143 + be32_to_cpu(*((__be32 *) (data + TPM_GET_CAP_RET_UINT32_4_IDX)));
48144 + if (timeout)
48145 + chip->vendor.timeout_d = msecs_to_jiffies(timeout);
48146 +
48147 +duration:
48148 + memcpy(data, tpm_cap, sizeof(tpm_cap));
48149 + data[TPM_CAP_IDX] = TPM_CAP_PROP;
48150 + data[TPM_CAP_SUBCAP_IDX] = TPM_CAP_PROP_TIS_DURATION;
48151 +
48152 + rc = transmit_cmd(chip, data, sizeof(data),
48153 + "attempting to determine the durations");
48154 + if (rc)
48155 + return;
48156 +
48157 + if (be32_to_cpu(*((__be32 *) (data + TPM_GET_CAP_RET_SIZE_IDX)))
48158 + != 3 * sizeof(u32))
48159 + return;
48160 +
48161 + chip->vendor.duration[TPM_SHORT] =
48162 + msecs_to_jiffies(be32_to_cpu
48163 + (*((__be32 *) (data +
48164 + TPM_GET_CAP_RET_UINT32_1_IDX))));
48165 + chip->vendor.duration[TPM_MEDIUM] =
48166 + msecs_to_jiffies(be32_to_cpu
48167 + (*((__be32 *) (data +
48168 + TPM_GET_CAP_RET_UINT32_2_IDX))));
48169 + chip->vendor.duration[TPM_LONG] =
48170 + msecs_to_jiffies(be32_to_cpu
48171 + (*((__be32 *) (data +
48172 + TPM_GET_CAP_RET_UINT32_3_IDX))));
48173 +}
48174 +EXPORT_SYMBOL_GPL(tpm_get_timeouts);
48175 +
48176 +void tpm_continue_selftest(struct tpm_chip *chip)
48177 +{
48178 + u8 data[] = {
48179 + 0, 193, /* TPM_TAG_RQU_COMMAND */
48180 + 0, 0, 0, 10, /* length */
48181 + 0, 0, 0, 83, /* TPM_ORD_GetCapability */
48182 + };
48183 +
48184 + tpm_transmit(chip, data, sizeof(data));
48185 +}
48186 +EXPORT_SYMBOL_GPL(tpm_continue_selftest);
48187 +
48188 +ssize_t tpm_show_enabled(struct device * dev, struct device_attribute * attr,
48189 + char *buf)
48190 +{
48191 + u8 data[max_t(int, ARRAY_SIZE(tpm_cap), 35)];
48192 + ssize_t rc;
48193 +
48194 + struct tpm_chip *chip = dev_get_drvdata(dev);
48195 + if (chip == NULL)
48196 + return -ENODEV;
48197 +
48198 + memcpy(data, tpm_cap, sizeof(tpm_cap));
48199 + data[TPM_CAP_IDX] = TPM_CAP_FLAG;
48200 + data[TPM_CAP_SUBCAP_IDX] = TPM_CAP_FLAG_PERM;
48201 +
48202 + rc = transmit_cmd(chip, data, sizeof(data),
48203 + "attempting to determine the permanent state");
48204 + if (rc)
48205 + return 0;
48206 + return sprintf(buf, "%d\n", !data[TPM_GET_CAP_PERM_DISABLE_IDX]);
48207 +}
48208 +EXPORT_SYMBOL_GPL(tpm_show_enabled);
48209 +
48210 +ssize_t tpm_show_active(struct device * dev, struct device_attribute * attr,
48211 + char *buf)
48212 +{
48213 + u8 data[max_t(int, ARRAY_SIZE(tpm_cap), 35)];
48214 + ssize_t rc;
48215 +
48216 + struct tpm_chip *chip = dev_get_drvdata(dev);
48217 + if (chip == NULL)
48218 + return -ENODEV;
48219 +
48220 + memcpy(data, tpm_cap, sizeof(tpm_cap));
48221 + data[TPM_CAP_IDX] = TPM_CAP_FLAG;
48222 + data[TPM_CAP_SUBCAP_IDX] = TPM_CAP_FLAG_PERM;
48223 +
48224 + rc = transmit_cmd(chip, data, sizeof(data),
48225 + "attempting to determine the permanent state");
48226 + if (rc)
48227 + return 0;
48228 + return sprintf(buf, "%d\n", !data[TPM_GET_CAP_PERM_INACTIVE_IDX]);
48229 +}
48230 +EXPORT_SYMBOL_GPL(tpm_show_active);
48231 +
48232 +ssize_t tpm_show_owned(struct device * dev, struct device_attribute * attr,
48233 + char *buf)
48234 +{
48235 + u8 data[sizeof(tpm_cap)];
48236 + ssize_t rc;
48237 +
48238 + struct tpm_chip *chip = dev_get_drvdata(dev);
48239 + if (chip == NULL)
48240 + return -ENODEV;
48241 +
48242 + memcpy(data, tpm_cap, sizeof(tpm_cap));
48243 + data[TPM_CAP_IDX] = TPM_CAP_PROP;
48244 + data[TPM_CAP_SUBCAP_IDX] = TPM_CAP_PROP_OWNER;
48245 +
48246 + rc = transmit_cmd(chip, data, sizeof(data),
48247 + "attempting to determine the owner state");
48248 + if (rc)
48249 + return 0;
48250 + return sprintf(buf, "%d\n", data[TPM_GET_CAP_RET_BOOL_1_IDX]);
48251 +}
48252 +EXPORT_SYMBOL_GPL(tpm_show_owned);
48253 +
48254 +ssize_t tpm_show_temp_deactivated(struct device * dev,
48255 + struct device_attribute * attr, char *buf)
48256 +{
48257 + u8 data[sizeof(tpm_cap)];
48258 + ssize_t rc;
48259 +
48260 + struct tpm_chip *chip = dev_get_drvdata(dev);
48261 + if (chip == NULL)
48262 + return -ENODEV;
48263 +
48264 + memcpy(data, tpm_cap, sizeof(tpm_cap));
48265 + data[TPM_CAP_IDX] = TPM_CAP_FLAG;
48266 + data[TPM_CAP_SUBCAP_IDX] = TPM_CAP_FLAG_VOL;
48267 +
48268 + rc = transmit_cmd(chip, data, sizeof(data),
48269 + "attempting to determine the temporary state");
48270 + if (rc)
48271 + return 0;
48272 + return sprintf(buf, "%d\n", data[TPM_GET_CAP_TEMP_INACTIVE_IDX]);
48273 +}
48274 +EXPORT_SYMBOL_GPL(tpm_show_temp_deactivated);
48275 +
48276 static const u8 pcrread[] = {
48277 0, 193, /* TPM_TAG_RQU_COMMAND */
48278 0, 0, 0, 14, /* length */
48279 @@ -140,8 +682,8 @@
48280 ssize_t tpm_show_pcrs(struct device *dev, struct device_attribute *attr,
48281 char *buf)
48282 {
48283 - u8 data[READ_PCR_RESULT_SIZE];
48284 - ssize_t len;
48285 + u8 data[max_t(int, max(ARRAY_SIZE(tpm_cap), ARRAY_SIZE(pcrread)), 30)];
48286 + ssize_t rc;
48287 int i, j, num_pcrs;
48288 __be32 index;
48289 char *str = buf;
48290 @@ -150,29 +692,24 @@
48291 if (chip == NULL)
48292 return -ENODEV;
48293
48294 - memcpy(data, cap_pcr, sizeof(cap_pcr));
48295 - if ((len = tpm_transmit(chip, data, sizeof(data)))
48296 - < CAP_PCR_RESULT_SIZE) {
48297 - dev_dbg(chip->dev, "A TPM error (%d) occurred "
48298 - "attempting to determine the number of PCRS\n",
48299 - be32_to_cpu(*((__be32 *) (data + 6))));
48300 + memcpy(data, tpm_cap, sizeof(tpm_cap));
48301 + data[TPM_CAP_IDX] = TPM_CAP_PROP;
48302 + data[TPM_CAP_SUBCAP_IDX] = TPM_CAP_PROP_PCR;
48303 +
48304 + rc = transmit_cmd(chip, data, sizeof(data),
48305 + "attempting to determine the number of PCRS");
48306 + if (rc)
48307 return 0;
48308 - }
48309
48310 num_pcrs = be32_to_cpu(*((__be32 *) (data + 14)));
48311 -
48312 for (i = 0; i < num_pcrs; i++) {
48313 memcpy(data, pcrread, sizeof(pcrread));
48314 index = cpu_to_be32(i);
48315 memcpy(data + 10, &index, 4);
48316 - if ((len = tpm_transmit(chip, data, sizeof(data)))
48317 - < READ_PCR_RESULT_SIZE){
48318 - dev_dbg(chip->dev, "A TPM error (%d) occurred"
48319 - " attempting to read PCR %d of %d\n",
48320 - be32_to_cpu(*((__be32 *) (data + 6))),
48321 - i, num_pcrs);
48322 + rc = transmit_cmd(chip, data, sizeof(data),
48323 + "attempting to read a PCR");
48324 + if (rc)
48325 goto out;
48326 - }
48327 str += sprintf(str, "PCR-%02d: ", i);
48328 for (j = 0; j < TPM_DIGEST_SIZE; j++)
48329 str += sprintf(str, "%02X ", *(data + 10 + j));
48330 @@ -194,7 +731,7 @@
48331 char *buf)
48332 {
48333 u8 *data;
48334 - ssize_t len;
48335 + ssize_t err;
48336 int i, rc;
48337 char *str = buf;
48338
48339 @@ -208,14 +745,10 @@
48340
48341 memcpy(data, readpubek, sizeof(readpubek));
48342
48343 - if ((len = tpm_transmit(chip, data, READ_PUBEK_RESULT_SIZE)) <
48344 - READ_PUBEK_RESULT_SIZE) {
48345 - dev_dbg(chip->dev, "A TPM error (%d) occurred "
48346 - "attempting to read the PUBEK\n",
48347 - be32_to_cpu(*((__be32 *) (data + 6))));
48348 - rc = 0;
48349 + err = transmit_cmd(chip, data, READ_PUBEK_RESULT_SIZE,
48350 + "attempting to read the PUBEK");
48351 + if (err)
48352 goto out;
48353 - }
48354
48355 /*
48356 ignore header 10 bytes
48357 @@ -245,36 +778,68 @@
48358 if ((i + 1) % 16 == 0)
48359 str += sprintf(str, "\n");
48360 }
48361 - rc = str - buf;
48362 out:
48363 + rc = str - buf;
48364 kfree(data);
48365 return rc;
48366 }
48367 EXPORT_SYMBOL_GPL(tpm_show_pubek);
48368
48369 -#define CAP_VER_RESULT_SIZE 18
48370 +#define CAP_VERSION_1_1 6
48371 +#define CAP_VERSION_1_2 0x1A
48372 +#define CAP_VERSION_IDX 13
48373 static const u8 cap_version[] = {
48374 0, 193, /* TPM_TAG_RQU_COMMAND */
48375 0, 0, 0, 18, /* length */
48376 0, 0, 0, 101, /* TPM_ORD_GetCapability */
48377 - 0, 0, 0, 6,
48378 + 0, 0, 0, 0,
48379 0, 0, 0, 0
48380 };
48381
48382 -#define CAP_MANUFACTURER_RESULT_SIZE 18
48383 -static const u8 cap_manufacturer[] = {
48384 - 0, 193, /* TPM_TAG_RQU_COMMAND */
48385 - 0, 0, 0, 22, /* length */
48386 - 0, 0, 0, 101, /* TPM_ORD_GetCapability */
48387 - 0, 0, 0, 5,
48388 - 0, 0, 0, 4,
48389 - 0, 0, 1, 3
48390 -};
48391 -
48392 ssize_t tpm_show_caps(struct device *dev, struct device_attribute *attr,
48393 char *buf)
48394 {
48395 - u8 data[sizeof(cap_manufacturer)];
48396 + u8 data[max_t(int, max(ARRAY_SIZE(tpm_cap), ARRAY_SIZE(cap_version)), 30)];
48397 + ssize_t rc;
48398 + char *str = buf;
48399 +
48400 + struct tpm_chip *chip = dev_get_drvdata(dev);
48401 + if (chip == NULL)
48402 + return -ENODEV;
48403 +
48404 + memcpy(data, tpm_cap, sizeof(tpm_cap));
48405 + data[TPM_CAP_IDX] = TPM_CAP_PROP;
48406 + data[TPM_CAP_SUBCAP_IDX] = TPM_CAP_PROP_MANUFACTURER;
48407 +
48408 + rc = transmit_cmd(chip, data, sizeof(data),
48409 + "attempting to determine the manufacturer");
48410 + if (rc)
48411 + return 0;
48412 +
48413 + str += sprintf(str, "Manufacturer: 0x%x\n",
48414 + be32_to_cpu(*((__be32 *) (data + TPM_GET_CAP_RET_UINT32_1_IDX))));
48415 +
48416 + memcpy(data, cap_version, sizeof(cap_version));
48417 + data[CAP_VERSION_IDX] = CAP_VERSION_1_1;
48418 + rc = transmit_cmd(chip, data, sizeof(data),
48419 + "attempting to determine the 1.1 version");
48420 + if (rc)
48421 + goto out;
48422 +
48423 + str += sprintf(str,
48424 + "TCG version: %d.%d\nFirmware version: %d.%d\n",
48425 + (int) data[14], (int) data[15], (int) data[16],
48426 + (int) data[17]);
48427 +
48428 +out:
48429 + return str - buf;
48430 +}
48431 +EXPORT_SYMBOL_GPL(tpm_show_caps);
48432 +
48433 +ssize_t tpm_show_caps_1_2(struct device * dev,
48434 + struct device_attribute * attr, char *buf)
48435 +{
48436 + u8 data[max_t(int, max(ARRAY_SIZE(tpm_cap), ARRAY_SIZE(cap_version)), 30)];
48437 ssize_t len;
48438 char *str = buf;
48439
48440 @@ -282,29 +847,40 @@
48441 if (chip == NULL)
48442 return -ENODEV;
48443
48444 - memcpy(data, cap_manufacturer, sizeof(cap_manufacturer));
48445 + memcpy(data, tpm_cap, sizeof(tpm_cap));
48446 + data[TPM_CAP_IDX] = TPM_CAP_PROP;
48447 + data[TPM_CAP_SUBCAP_IDX] = TPM_CAP_PROP_MANUFACTURER;
48448
48449 - if ((len = tpm_transmit(chip, data, sizeof(data))) <
48450 - CAP_MANUFACTURER_RESULT_SIZE)
48451 - return len;
48452 + if ((len = tpm_transmit(chip, data, sizeof(data))) <=
48453 + TPM_ERROR_SIZE) {
48454 + dev_dbg(chip->dev, "A TPM error (%d) occurred "
48455 + "attempting to determine the manufacturer\n",
48456 + be32_to_cpu(*((__be32 *) (data + TPM_RET_CODE_IDX))));
48457 + return 0;
48458 + }
48459
48460 str += sprintf(str, "Manufacturer: 0x%x\n",
48461 - be32_to_cpu(*((__be32 *) (data + 14))));
48462 + be32_to_cpu(*((__be32 *) (data + TPM_GET_CAP_RET_UINT32_1_IDX))));
48463
48464 memcpy(data, cap_version, sizeof(cap_version));
48465 + data[CAP_VERSION_IDX] = CAP_VERSION_1_2;
48466
48467 - if ((len = tpm_transmit(chip, data, sizeof(data))) <
48468 - CAP_VER_RESULT_SIZE)
48469 - return len;
48470 -
48471 - str +=
48472 - sprintf(str, "TCG version: %d.%d\nFirmware version: %d.%d\n",
48473 - (int) data[14], (int) data[15], (int) data[16],
48474 - (int) data[17]);
48475 + if ((len = tpm_transmit(chip, data, sizeof(data))) <=
48476 + TPM_ERROR_SIZE) {
48477 + dev_err(chip->dev, "A TPM error (%d) occurred "
48478 + "attempting to determine the 1.2 version\n",
48479 + be32_to_cpu(*((__be32 *) (data + TPM_RET_CODE_IDX))));
48480 + goto out;
48481 + }
48482 + str += sprintf(str,
48483 + "TCG version: %d.%d\nFirmware version: %d.%d\n",
48484 + (int) data[16], (int) data[17], (int) data[18],
48485 + (int) data[19]);
48486
48487 +out:
48488 return str - buf;
48489 }
48490 -EXPORT_SYMBOL_GPL(tpm_show_caps);
48491 +EXPORT_SYMBOL_GPL(tpm_show_caps_1_2);
48492
48493 ssize_t tpm_store_cancel(struct device *dev, struct device_attribute *attr,
48494 const char *buf, size_t count)
48495 @@ -313,7 +889,7 @@
48496 if (chip == NULL)
48497 return 0;
48498
48499 - chip->vendor->cancel(chip);
48500 + chip->vendor.cancel(chip);
48501 return count;
48502 }
48503 EXPORT_SYMBOL_GPL(tpm_store_cancel);
48504 @@ -329,7 +905,7 @@
48505 spin_lock(&driver_lock);
48506
48507 list_for_each_entry(pos, &tpm_chip_list, list) {
48508 - if (pos->vendor->miscdev.minor == minor) {
48509 + if (pos->vendor.miscdev.minor == minor) {
48510 chip = pos;
48511 break;
48512 }
48513 @@ -351,7 +927,12 @@
48514
48515 spin_unlock(&driver_lock);
48516
48517 +#ifndef CONFIG_XEN
48518 chip->data_buffer = kmalloc(TPM_BUFSIZE * sizeof(u8), GFP_KERNEL);
48519 +#else
48520 + chip->data_buffer = kmalloc(get_chip_buffersize(chip) * sizeof(u8),
48521 + GFP_KERNEL);
48522 +#endif
48523 if (chip->data_buffer == NULL) {
48524 chip->num_opens--;
48525 put_device(chip->dev);
48526 @@ -387,7 +968,7 @@
48527 EXPORT_SYMBOL_GPL(tpm_release);
48528
48529 ssize_t tpm_write(struct file *file, const char __user *buf,
48530 - size_t size, loff_t * off)
48531 + size_t size, loff_t *off)
48532 {
48533 struct tpm_chip *chip = file->private_data;
48534 int in_size = size, out_size;
48535 @@ -399,8 +980,13 @@
48536
48537 down(&chip->buffer_mutex);
48538
48539 +#ifndef CONFIG_XEN
48540 if (in_size > TPM_BUFSIZE)
48541 in_size = TPM_BUFSIZE;
48542 +#else
48543 + if (in_size > get_chip_buffersize(chip))
48544 + in_size = get_chip_buffersize(chip);
48545 +#endif
48546
48547 if (copy_from_user
48548 (chip->data_buffer, (void __user *) buf, in_size)) {
48549 @@ -409,9 +995,17 @@
48550 }
48551
48552 /* atomic tpm command send and result receive */
48553 +#ifndef CONFIG_XEN
48554 out_size = tpm_transmit(chip, chip->data_buffer, TPM_BUFSIZE);
48555 +#else
48556 + out_size = tpm_transmit(chip, chip->data_buffer,
48557 + get_chip_buffersize(chip));
48558 +#endif
48559
48560 atomic_set(&chip->data_pending, out_size);
48561 +#ifdef CONFIG_XEN
48562 + atomic_set(&chip->data_position, 0);
48563 +#endif
48564 up(&chip->buffer_mutex);
48565
48566 /* Set a timeout by which the reader must come claim the result */
48567 @@ -419,29 +1013,59 @@
48568
48569 return in_size;
48570 }
48571 -
48572 EXPORT_SYMBOL_GPL(tpm_write);
48573
48574 -ssize_t tpm_read(struct file * file, char __user *buf,
48575 - size_t size, loff_t * off)
48576 +ssize_t tpm_read(struct file *file, char __user *buf,
48577 + size_t size, loff_t *off)
48578 {
48579 struct tpm_chip *chip = file->private_data;
48580 int ret_size;
48581 +#ifdef CONFIG_XEN
48582 + int pos, pending = 0;
48583 +#endif
48584
48585 +#ifndef CONFIG_XEN
48586 del_singleshot_timer_sync(&chip->user_read_timer);
48587 flush_scheduled_work();
48588 +#endif
48589 ret_size = atomic_read(&chip->data_pending);
48590 +#ifndef CONFIG_XEN
48591 atomic_set(&chip->data_pending, 0);
48592 +#endif
48593 if (ret_size > 0) { /* relay data */
48594 if (size < ret_size)
48595 ret_size = size;
48596
48597 +#ifdef CONFIG_XEN
48598 + pos = atomic_read(&chip->data_position);
48599 +#endif
48600 down(&chip->buffer_mutex);
48601 +#ifndef CONFIG_XEN
48602 if (copy_to_user(buf, chip->data_buffer, ret_size))
48603 +#else
48604 + if (copy_to_user(buf, &chip->data_buffer[pos], ret_size)) {
48605 +#endif
48606 ret_size = -EFAULT;
48607 +#ifdef CONFIG_XEN
48608 + } else {
48609 + pending = atomic_read(&chip->data_pending) - ret_size;
48610 + if ( pending ) {
48611 + atomic_set(&chip->data_pending, pending);
48612 + atomic_set(&chip->data_position,
48613 + pos+ret_size);
48614 + }
48615 + }
48616 +#endif
48617 up(&chip->buffer_mutex);
48618 }
48619
48620 +#ifdef CONFIG_XEN
48621 + if ( ret_size <= 0 || pending == 0 ) {
48622 + atomic_set(&chip->data_pending, 0);
48623 + del_singleshot_timer_sync(&chip->user_read_timer);
48624 + flush_scheduled_work();
48625 + }
48626 +#endif
48627 return ret_size;
48628 }
48629 EXPORT_SYMBOL_GPL(tpm_read);
48630 @@ -462,14 +1086,13 @@
48631 spin_unlock(&driver_lock);
48632
48633 dev_set_drvdata(dev, NULL);
48634 - misc_deregister(&chip->vendor->miscdev);
48635 - kfree(chip->vendor->miscdev.name);
48636 + misc_deregister(&chip->vendor.miscdev);
48637 + kfree(chip->vendor.miscdev.name);
48638
48639 - sysfs_remove_group(&dev->kobj, chip->vendor->attr_group);
48640 + sysfs_remove_group(&dev->kobj, chip->vendor.attr_group);
48641 tpm_bios_log_teardown(chip->bios_dir);
48642
48643 - dev_mask[chip->dev_num / TPM_NUM_MASK_ENTRIES ] &=
48644 - ~(1 << (chip->dev_num % TPM_NUM_MASK_ENTRIES));
48645 + clear_bit(chip->dev_num, dev_mask);
48646
48647 kfree(chip);
48648
48649 @@ -520,18 +1143,18 @@
48650 * upon errant exit from this function specific probe function should call
48651 * pci_disable_device
48652 */
48653 -int tpm_register_hardware(struct device *dev, struct tpm_vendor_specific *entry)
48654 +struct tpm_chip *tpm_register_hardware(struct device *dev, const struct tpm_vendor_specific
48655 + *entry)
48656 {
48657 #define DEVNAME_SIZE 7
48658
48659 char *devname;
48660 struct tpm_chip *chip;
48661 - int i, j;
48662
48663 /* Driver specific per-device data */
48664 chip = kzalloc(sizeof(*chip), GFP_KERNEL);
48665 if (chip == NULL)
48666 - return -ENOMEM;
48667 + return NULL;
48668
48669 init_MUTEX(&chip->buffer_mutex);
48670 init_MUTEX(&chip->tpm_mutex);
48671 @@ -543,45 +1166,37 @@
48672 chip->user_read_timer.function = user_reader_timeout;
48673 chip->user_read_timer.data = (unsigned long) chip;
48674
48675 - chip->vendor = entry;
48676 + memcpy(&chip->vendor, entry, sizeof(struct tpm_vendor_specific));
48677
48678 - chip->dev_num = -1;
48679 -
48680 - for (i = 0; i < TPM_NUM_MASK_ENTRIES; i++)
48681 - for (j = 0; j < 8 * sizeof(int); j++)
48682 - if ((dev_mask[i] & (1 << j)) == 0) {
48683 - chip->dev_num =
48684 - i * TPM_NUM_MASK_ENTRIES + j;
48685 - dev_mask[i] |= 1 << j;
48686 - goto dev_num_search_complete;
48687 - }
48688 + chip->dev_num = find_first_zero_bit(dev_mask, TPM_NUM_DEVICES);
48689
48690 -dev_num_search_complete:
48691 - if (chip->dev_num < 0) {
48692 + if (chip->dev_num >= TPM_NUM_DEVICES) {
48693 dev_err(dev, "No available tpm device numbers\n");
48694 kfree(chip);
48695 - return -ENODEV;
48696 + return NULL;
48697 } else if (chip->dev_num == 0)
48698 - chip->vendor->miscdev.minor = TPM_MINOR;
48699 + chip->vendor.miscdev.minor = TPM_MINOR;
48700 else
48701 - chip->vendor->miscdev.minor = MISC_DYNAMIC_MINOR;
48702 + chip->vendor.miscdev.minor = MISC_DYNAMIC_MINOR;
48703 +
48704 + set_bit(chip->dev_num, dev_mask);
48705
48706 devname = kmalloc(DEVNAME_SIZE, GFP_KERNEL);
48707 scnprintf(devname, DEVNAME_SIZE, "%s%d", "tpm", chip->dev_num);
48708 - chip->vendor->miscdev.name = devname;
48709 + chip->vendor.miscdev.name = devname;
48710
48711 - chip->vendor->miscdev.dev = dev;
48712 + chip->vendor.miscdev.dev = dev;
48713 chip->dev = get_device(dev);
48714
48715 - if (misc_register(&chip->vendor->miscdev)) {
48716 + if (misc_register(&chip->vendor.miscdev)) {
48717 dev_err(chip->dev,
48718 "unable to misc_register %s, minor %d\n",
48719 - chip->vendor->miscdev.name,
48720 - chip->vendor->miscdev.minor);
48721 + chip->vendor.miscdev.name,
48722 + chip->vendor.miscdev.minor);
48723 put_device(dev);
48724 + clear_bit(chip->dev_num, dev_mask);
48725 kfree(chip);
48726 - dev_mask[i] &= !(1 << j);
48727 - return -ENODEV;
48728 + return NULL;
48729 }
48730
48731 spin_lock(&driver_lock);
48732 @@ -592,11 +1207,11 @@
48733
48734 spin_unlock(&driver_lock);
48735
48736 - sysfs_create_group(&dev->kobj, chip->vendor->attr_group);
48737 + sysfs_create_group(&dev->kobj, chip->vendor.attr_group);
48738
48739 chip->bios_dir = tpm_bios_log_setup(devname);
48740
48741 - return 0;
48742 + return chip;
48743 }
48744 EXPORT_SYMBOL_GPL(tpm_register_hardware);
48745
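
The duration tables and tpm_calc_ordinal_duration() added to tpm.c above let tpm_transmit() size its polling window to the command it just sent, instead of always waiting the old fixed two minutes; that value survives only as the fallback when the ordinal is unknown or the chip never reported a duration via tpm_get_timeouts(). A simplified user-space sketch of the lookup, using milliseconds where the driver uses jiffies, arbitrary sample durations, a three-entry table instead of the full 243-entry one, and omitting the protected-ordinal path.

#include <stdio.h>

enum tpm_duration { TPM_SHORT = 0, TPM_MEDIUM, TPM_LONG, TPM_UNDEFINED };

/* durations the chip reported at init time (cf. tpm_get_timeouts);
 * the millisecond values here are arbitrary samples */
static const unsigned long duration_ms[3] = { 750, 2000, 30000 };

/* tiny stand-in for tpm_ordinal_duration[]; index is the command ordinal */
static const unsigned char ordinal_duration[3] = { TPM_UNDEFINED, TPM_SHORT, TPM_LONG };

static unsigned long calc_ordinal_duration_ms(unsigned int ordinal)
{
        int idx = TPM_UNDEFINED;
        unsigned long d = 0;

        if (ordinal < sizeof(ordinal_duration) / sizeof(ordinal_duration[0]))
                idx = ordinal_duration[ordinal];
        if (idx != TPM_UNDEFINED)
                d = duration_ms[idx];
        if (d == 0)                     /* unknown ordinal or unreported duration */
                return 2 * 60 * 1000;   /* old fixed two-minute timeout */
        return d;
}

int main(void)
{
        printf("%lu %lu %lu\n",
               calc_ordinal_duration_ms(0),   /* undefined -> 120000 */
               calc_ordinal_duration_ms(1),   /* short     -> 750    */
               calc_ordinal_duration_ms(2));  /* long      -> 30000  */
        return 0;
}
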
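
The same file also replaces the hand-rolled dev_mask index arithmetic with a bitmap (DECLARE_BITMAP, find_first_zero_bit, set_bit, clear_bit) when handing out tpm device numbers. A compact sketch of that allocation pattern in plain C, with a 32-entry mask instead of the driver's 256 and without the kernel bitmap helpers.

#include <stdio.h>

#define NUM_DEVICES 32              /* the driver allows 256; 32 fits one word here */

static unsigned long dev_mask;      /* bit n set => device number n is taken */

static int alloc_dev_num(void)
{
        for (int n = 0; n < NUM_DEVICES; n++)
                if (!(dev_mask & (1UL << n))) {     /* find_first_zero_bit() */
                        dev_mask |= 1UL << n;       /* set_bit() */
                        return n;
                }
        return -1;                                  /* no free device number */
}

static void free_dev_num(int n)
{
        dev_mask &= ~(1UL << n);                    /* clear_bit() */
}

int main(void)
{
        int a = alloc_dev_num();    /* 0: gets the static TPM_MINOR in the driver */
        int b = alloc_dev_num();    /* 1: would get a dynamic minor */
        free_dev_num(a);
        printf("%d %d %d\n", a, b, alloc_dev_num());  /* prints: 0 1 0 */
        return 0;
}
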
48746 diff -Nur linux-2.6.16.33-noxen/drivers/char/tpm/tpm.h linux-2.6.16.33/drivers/char/tpm/tpm.h
48747 --- linux-2.6.16.33-noxen/drivers/char/tpm/tpm.h 2006-11-22 18:06:31.000000000 +0000
48748 +++ linux-2.6.16.33/drivers/char/tpm/tpm.h 2007-01-08 15:00:45.000000000 +0000
48749 @@ -24,6 +24,14 @@
48750 #include <linux/fs.h>
48751 #include <linux/miscdevice.h>
48752 #include <linux/platform_device.h>
48753 +#include <linux/io.h>
48754 +
48755 +#ifdef CONFIG_XEN
48756 +enum tpm_bufsize {
48757 + TPM_MIN_BUFFERSIZE = 2048,
48758 + TPM_MAX_BUFFERSIZE = 64 * 1024,
48759 +};
48760 +#endif
48761
48762 enum tpm_timeout {
48763 TPM_TIMEOUT = 5, /* msecs */
48764 @@ -41,18 +49,33 @@
48765 char *);
48766 extern ssize_t tpm_show_caps(struct device *, struct device_attribute *attr,
48767 char *);
48768 +extern ssize_t tpm_show_caps_1_2(struct device *, struct device_attribute *attr,
48769 + char *);
48770 extern ssize_t tpm_store_cancel(struct device *, struct device_attribute *attr,
48771 const char *, size_t);
48772 +extern ssize_t tpm_show_enabled(struct device *, struct device_attribute *attr,
48773 + char *);
48774 +extern ssize_t tpm_show_active(struct device *, struct device_attribute *attr,
48775 + char *);
48776 +extern ssize_t tpm_show_owned(struct device *, struct device_attribute *attr,
48777 + char *);
48778 +extern ssize_t tpm_show_temp_deactivated(struct device *,
48779 + struct device_attribute *attr, char *);
48780
48781 struct tpm_chip;
48782
48783 struct tpm_vendor_specific {
48784 - u8 req_complete_mask;
48785 - u8 req_complete_val;
48786 - u8 req_canceled;
48787 + const u8 req_complete_mask;
48788 + const u8 req_complete_val;
48789 + const u8 req_canceled;
48790 +#ifdef CONFIG_XEN
48791 + u32 buffersize;
48792 +#endif
48793 void __iomem *iobase; /* ioremapped address */
48794 unsigned long base; /* TPM base address */
48795
48796 + int irq;
48797 +
48798 int region_size;
48799 int have_region;
48800
48801 @@ -62,6 +85,13 @@
48802 u8 (*status) (struct tpm_chip *);
48803 struct miscdevice miscdev;
48804 struct attribute_group *attr_group;
48805 + struct list_head list;
48806 + int locality;
48807 + unsigned long timeout_a, timeout_b, timeout_c, timeout_d; /* jiffies */
48808 + unsigned long duration[3]; /* jiffies */
48809 +
48810 + wait_queue_head_t read_queue;
48811 + wait_queue_head_t int_queue;
48812 };
48813
48814 struct tpm_chip {
48815 @@ -74,19 +104,27 @@
48816 /* Data passed to and from the tpm via the read/write calls */
48817 u8 *data_buffer;
48818 atomic_t data_pending;
48819 +#ifdef CONFIG_XEN
48820 + atomic_t data_position;
48821 +#endif
48822 struct semaphore buffer_mutex;
48823
48824 struct timer_list user_read_timer; /* user needs to claim result */
48825 struct work_struct work;
48826 struct semaphore tpm_mutex; /* tpm is processing */
48827
48828 - struct tpm_vendor_specific *vendor;
48829 + struct tpm_vendor_specific vendor;
48830
48831 struct dentry **bios_dir;
48832
48833 struct list_head list;
48834 +#ifdef CONFIG_XEN
48835 + void *priv;
48836 +#endif
48837 };
48838
48839 +#define to_tpm_chip(n) container_of(n, struct tpm_chip, vendor)
48840 +
48841 static inline int tpm_read_index(int base, int index)
48842 {
48843 outb(index, base);
48844 @@ -99,8 +137,35 @@
48845 outb(value & 0xFF, base+1);
48846 }
48847
48848 -extern int tpm_register_hardware(struct device *,
48849 - struct tpm_vendor_specific *);
48850 +#ifdef CONFIG_XEN
48851 +static inline u32 get_chip_buffersize(struct tpm_chip *chip)
48852 +{
48853 + u32 size = chip->vendor.buffersize;
48854 + if (size > TPM_MAX_BUFFERSIZE) {
48855 + return TPM_MAX_BUFFERSIZE;
48856 + } else if (size < TPM_MIN_BUFFERSIZE) {
48857 + return TPM_MIN_BUFFERSIZE;
48858 + }
48859 + return size;
48860 +}
48861 +
48862 +static inline void *chip_get_private(const struct tpm_chip *chip)
48863 +{
48864 + return chip->priv;
48865 +}
48866 +
48867 +static inline void chip_set_private(struct tpm_chip *chip, void *priv)
48868 +{
48869 + chip->priv = priv;
48870 +}
48871 +#endif
48872 +
48873 +extern void tpm_get_timeouts(struct tpm_chip *);
48874 +extern void tpm_gen_interrupt(struct tpm_chip *);
48875 +extern void tpm_continue_selftest(struct tpm_chip *);
48876 +extern unsigned long tpm_calc_ordinal_duration(struct tpm_chip *, u32);
48877 +extern struct tpm_chip* tpm_register_hardware(struct device *,
48878 + const struct tpm_vendor_specific *);
48879 extern int tpm_open(struct inode *, struct file *);
48880 extern int tpm_release(struct inode *, struct file *);
48881 extern ssize_t tpm_write(struct file *, const char __user *, size_t,
48882 @@ -114,7 +179,7 @@
48883 extern struct dentry ** tpm_bios_log_setup(char *);
48884 extern void tpm_bios_log_teardown(struct dentry **);
48885 #else
48886 -static inline struct dentry* tpm_bios_log_setup(char *name)
48887 +static inline struct dentry ** tpm_bios_log_setup(char *name)
48888 {
48889 return NULL;
48890 }
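
Under CONFIG_XEN the vendor data now carries a buffersize negotiated with the backend, and get_chip_buffersize() clamps it into the [TPM_MIN_BUFFERSIZE, TPM_MAX_BUFFERSIZE] window from the header above before it is used for the data buffer allocation and for tpm_transmit(). A stand-alone sketch of that clamp with the same bounds; clamp_buffersize() is an illustrative name.

#include <stdio.h>

enum { TPM_MIN_BUFFERSIZE = 2048, TPM_MAX_BUFFERSIZE = 64 * 1024 };

/* mirrors get_chip_buffersize(): out-of-range values fall back to the bounds */
static unsigned int clamp_buffersize(unsigned int size)
{
        if (size > TPM_MAX_BUFFERSIZE)
                return TPM_MAX_BUFFERSIZE;
        if (size < TPM_MIN_BUFFERSIZE)
                return TPM_MIN_BUFFERSIZE;
        return size;
}

int main(void)
{
        printf("%u %u %u\n",
               clamp_buffersize(0),        /* -> 2048  */
               clamp_buffersize(4096),     /* -> 4096  */
               clamp_buffersize(1 << 20)); /* -> 65536 */
        return 0;
}
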
48891 diff -Nur linux-2.6.16.33-noxen/drivers/char/tpm/tpm_atmel.c linux-2.6.16.33/drivers/char/tpm/tpm_atmel.c
48892 --- linux-2.6.16.33-noxen/drivers/char/tpm/tpm_atmel.c 2006-11-22 18:06:31.000000000 +0000
48893 +++ linux-2.6.16.33/drivers/char/tpm/tpm_atmel.c 2007-05-23 21:00:01.000000000 +0000
48894 @@ -47,12 +47,12 @@
48895 return -EIO;
48896
48897 for (i = 0; i < 6; i++) {
48898 - status = ioread8(chip->vendor->iobase + 1);
48899 + status = ioread8(chip->vendor.iobase + 1);
48900 if ((status & ATML_STATUS_DATA_AVAIL) == 0) {
48901 dev_err(chip->dev, "error reading header\n");
48902 return -EIO;
48903 }
48904 - *buf++ = ioread8(chip->vendor->iobase);
48905 + *buf++ = ioread8(chip->vendor.iobase);
48906 }
48907
48908 /* size of the data received */
48909 @@ -63,7 +63,7 @@
48910 dev_err(chip->dev,
48911 "Recv size(%d) less than available space\n", size);
48912 for (; i < size; i++) { /* clear the waiting data anyway */
48913 - status = ioread8(chip->vendor->iobase + 1);
48914 + status = ioread8(chip->vendor.iobase + 1);
48915 if ((status & ATML_STATUS_DATA_AVAIL) == 0) {
48916 dev_err(chip->dev, "error reading data\n");
48917 return -EIO;
48918 @@ -74,16 +74,16 @@
48919
48920 /* read all the data available */
48921 for (; i < size; i++) {
48922 - status = ioread8(chip->vendor->iobase + 1);
48923 + status = ioread8(chip->vendor.iobase + 1);
48924 if ((status & ATML_STATUS_DATA_AVAIL) == 0) {
48925 dev_err(chip->dev, "error reading data\n");
48926 return -EIO;
48927 }
48928 - *buf++ = ioread8(chip->vendor->iobase);
48929 + *buf++ = ioread8(chip->vendor.iobase);
48930 }
48931
48932 /* make sure data available is gone */
48933 - status = ioread8(chip->vendor->iobase + 1);
48934 + status = ioread8(chip->vendor.iobase + 1);
48935
48936 if (status & ATML_STATUS_DATA_AVAIL) {
48937 dev_err(chip->dev, "data available is stuck\n");
48938 @@ -100,7 +100,7 @@
48939 dev_dbg(chip->dev, "tpm_atml_send:\n");
48940 for (i = 0; i < count; i++) {
48941 dev_dbg(chip->dev, "%d 0x%x(%d)\n", i, buf[i], buf[i]);
48942 - iowrite8(buf[i], chip->vendor->iobase);
48943 + iowrite8(buf[i], chip->vendor.iobase);
48944 }
48945
48946 return count;
48947 @@ -108,12 +108,12 @@
48948
48949 static void tpm_atml_cancel(struct tpm_chip *chip)
48950 {
48951 - iowrite8(ATML_STATUS_ABORT, chip->vendor->iobase + 1);
48952 + iowrite8(ATML_STATUS_ABORT, chip->vendor.iobase + 1);
48953 }
48954
48955 static u8 tpm_atml_status(struct tpm_chip *chip)
48956 {
48957 - return ioread8(chip->vendor->iobase + 1);
48958 + return ioread8(chip->vendor.iobase + 1);
48959 }
48960
48961 static struct file_operations atmel_ops = {
48962 @@ -140,7 +140,7 @@
48963
48964 static struct attribute_group atmel_attr_grp = { .attrs = atmel_attrs };
48965
48966 -static struct tpm_vendor_specific tpm_atmel = {
48967 +static const struct tpm_vendor_specific tpm_atmel = {
48968 .recv = tpm_atml_recv,
48969 .send = tpm_atml_send,
48970 .cancel = tpm_atml_cancel,
48971 @@ -159,10 +159,10 @@
48972 struct tpm_chip *chip = dev_get_drvdata(&pdev->dev);
48973
48974 if (chip) {
48975 - if (chip->vendor->have_region)
48976 - atmel_release_region(chip->vendor->base,
48977 - chip->vendor->region_size);
48978 - atmel_put_base_addr(chip->vendor);
48979 + if (chip->vendor.have_region)
48980 + atmel_release_region(chip->vendor.base,
48981 + chip->vendor.region_size);
48982 + atmel_put_base_addr(chip->vendor.iobase);
48983 tpm_remove_hardware(chip->dev);
48984 platform_device_unregister(pdev);
48985 }
48986 @@ -179,18 +179,22 @@
48987 static int __init init_atmel(void)
48988 {
48989 int rc = 0;
48990 + void __iomem *iobase = NULL;
48991 + int have_region, region_size;
48992 + unsigned long base;
48993 + struct tpm_chip *chip;
48994
48995 driver_register(&atml_drv);
48996
48997 - if ((tpm_atmel.iobase = atmel_get_base_addr(&tpm_atmel)) == NULL) {
48998 + if ((iobase = atmel_get_base_addr(&base, &region_size)) == NULL) {
48999 rc = -ENODEV;
49000 goto err_unreg_drv;
49001 }
49002
49003 - tpm_atmel.have_region =
49004 + have_region =
49005 (atmel_request_region
49006 - (tpm_atmel.base, tpm_atmel.region_size,
49007 - "tpm_atmel0") == NULL) ? 0 : 1;
49008 + (tpm_atmel.base, region_size, "tpm_atmel0") == NULL) ? 0 : 1;
49009 +
49010
49011 if (IS_ERR
49012 (pdev =
49013 @@ -199,17 +203,25 @@
49014 goto err_rel_reg;
49015 }
49016
49017 - if ((rc = tpm_register_hardware(&pdev->dev, &tpm_atmel)) < 0)
49018 + if (!(chip = tpm_register_hardware(&pdev->dev, &tpm_atmel))) {
49019 + rc = -ENODEV;
49020 goto err_unreg_dev;
49021 + }
49022 +
49023 + chip->vendor.iobase = iobase;
49024 + chip->vendor.base = base;
49025 + chip->vendor.have_region = have_region;
49026 + chip->vendor.region_size = region_size;
49027 +
49028 return 0;
49029
49030 err_unreg_dev:
49031 platform_device_unregister(pdev);
49032 err_rel_reg:
49033 - atmel_put_base_addr(&tpm_atmel);
49034 - if (tpm_atmel.have_region)
49035 - atmel_release_region(tpm_atmel.base,
49036 - tpm_atmel.region_size);
49037 + atmel_put_base_addr(iobase);
49038 + if (have_region)
49039 + atmel_release_region(base,
49040 + region_size);
49041 err_unreg_drv:
49042 driver_unregister(&atml_drv);
49043 return rc;
49044 diff -Nur linux-2.6.16.33-noxen/drivers/char/tpm/tpm_atmel.h linux-2.6.16.33/drivers/char/tpm/tpm_atmel.h
49045 --- linux-2.6.16.33-noxen/drivers/char/tpm/tpm_atmel.h 2006-11-22 18:06:31.000000000 +0000
49046 +++ linux-2.6.16.33/drivers/char/tpm/tpm_atmel.h 2007-05-23 21:00:01.000000000 +0000
49047 @@ -28,13 +28,12 @@
49048 #define atmel_request_region request_mem_region
49049 #define atmel_release_region release_mem_region
49050
49051 -static inline void atmel_put_base_addr(struct tpm_vendor_specific
49052 - *vendor)
49053 +static inline void atmel_put_base_addr(void __iomem *iobase)
49054 {
49055 - iounmap(vendor->iobase);
49056 + iounmap(iobase);
49057 }
49058
49059 -static void __iomem * atmel_get_base_addr(struct tpm_vendor_specific *vendor)
49060 +static void __iomem * atmel_get_base_addr(unsigned long *base, int *region_size)
49061 {
49062 struct device_node *dn;
49063 unsigned long address, size;
49064 @@ -71,9 +70,9 @@
49065 else
49066 size = reg[naddrc];
49067
49068 - vendor->base = address;
49069 - vendor->region_size = size;
49070 - return ioremap(vendor->base, vendor->region_size);
49071 + *base = address;
49072 + *region_size = size;
49073 + return ioremap(*base, *region_size);
49074 }
49075 #else
49076 #define atmel_getb(chip, offset) inb(chip->vendor->base + offset)
49077 @@ -106,14 +105,12 @@
49078 return 0;
49079 }
49080
49081 -static inline void atmel_put_base_addr(struct tpm_vendor_specific
49082 - *vendor)
49083 +static inline void atmel_put_base_addr(void __iomem *iobase)
49084 {
49085 }
49086
49087 /* Determine where to talk to device */
49088 -static void __iomem * atmel_get_base_addr(struct tpm_vendor_specific
49089 - *vendor)
49090 +static void __iomem * atmel_get_base_addr(unsigned long *base, int *region_size)
49091 {
49092 int lo, hi;
49093
49094 @@ -123,9 +120,9 @@
49095 lo = tpm_read_index(TPM_ADDR, TPM_ATMEL_BASE_ADDR_LO);
49096 hi = tpm_read_index(TPM_ADDR, TPM_ATMEL_BASE_ADDR_HI);
49097
49098 - vendor->base = (hi << 8) | lo;
49099 - vendor->region_size = 2;
49100 + *base = (hi << 8) | lo;
49101 + *region_size = 2;
49102
49103 - return ioport_map(vendor->base, vendor->region_size);
49104 + return ioport_map(*base, *region_size);
49105 }
49106 #endif
49107 diff -Nur linux-2.6.16.33-noxen/drivers/char/tpm/tpm_bios.c linux-2.6.16.33/drivers/char/tpm/tpm_bios.c
49108 --- linux-2.6.16.33-noxen/drivers/char/tpm/tpm_bios.c 2006-11-22 18:06:31.000000000 +0000
49109 +++ linux-2.6.16.33/drivers/char/tpm/tpm_bios.c 2007-05-23 21:00:01.000000000 +0000
49110 @@ -29,6 +29,11 @@
49111 #define MAX_TEXT_EVENT 1000 /* Max event string length */
49112 #define ACPI_TCPA_SIG "TCPA" /* 0x41504354 /'TCPA' */
49113
49114 +enum bios_platform_class {
49115 + BIOS_CLIENT = 0x00,
49116 + BIOS_SERVER = 0x01,
49117 +};
49118 +
49119 struct tpm_bios_log {
49120 void *bios_event_log;
49121 void *bios_event_log_end;
49122 @@ -36,9 +41,18 @@
49123
49124 struct acpi_tcpa {
49125 struct acpi_table_header hdr;
49126 - u16 reserved;
49127 - u32 log_max_len __attribute__ ((packed));
49128 - u32 log_start_addr __attribute__ ((packed));
49129 + u16 platform_class;
49130 + union {
49131 + struct client_hdr {
49132 + u32 log_max_len __attribute__ ((packed));
49133 + u64 log_start_addr __attribute__ ((packed));
49134 + } client;
49135 + struct server_hdr {
49136 + u16 reserved;
49137 + u64 log_max_len __attribute__ ((packed));
49138 + u64 log_start_addr __attribute__ ((packed));
49139 + } server;
49140 + };
49141 };
49142
49143 struct tcpa_event {
49144 @@ -91,6 +105,12 @@
49145 "Non-Host Info"
49146 };
49147
49148 +struct tcpa_pc_event {
49149 + u32 event_id;
49150 + u32 event_size;
49151 + u8 event_data[0];
49152 +};
49153 +
49154 enum tcpa_pc_event_ids {
49155 SMBIOS = 1,
49156 BIS_CERT,
49157 @@ -100,14 +120,15 @@
49158 NVRAM,
49159 OPTION_ROM_EXEC,
49160 OPTION_ROM_CONFIG,
49161 - OPTION_ROM_MICROCODE,
49162 + OPTION_ROM_MICROCODE = 10,
49163 S_CRTM_VERSION,
49164 S_CRTM_CONTENTS,
49165 POST_CONTENTS,
49166 + HOST_TABLE_OF_DEVICES,
49167 };
49168
49169 static const char* tcpa_pc_event_id_strings[] = {
49170 - ""
49171 + "",
49172 "SMBIOS",
49173 "BIS Certificate",
49174 "POST BIOS ",
49175 @@ -116,10 +137,12 @@
49176 "NVRAM",
49177 "Option ROM",
49178 "Option ROM config",
49179 - "Option ROM microcode",
49180 + "",
49181 + "Option ROM microcode ",
49182 "S-CRTM Version",
49183 - "S-CRTM Contents",
49184 - "S-CRTM POST Contents",
49185 + "S-CRTM Contents ",
49186 + "POST Contents ",
49187 + "Table of Devices",
49188 };
49189
49190 /* returns pointer to start of pos. entry of tcg log */
49191 @@ -191,7 +214,7 @@
49192 const char *name = "";
49193 char data[40] = "";
49194 int i, n_len = 0, d_len = 0;
49195 - u32 event_id;
49196 + struct tcpa_pc_event *pc_event;
49197
49198 switch(event->event_type) {
49199 case PREBOOT:
49200 @@ -220,31 +243,32 @@
49201 }
49202 break;
49203 case EVENT_TAG:
49204 - event_id = be32_to_cpu(*((u32 *)event_entry));
49205 + pc_event = (struct tcpa_pc_event *)event_entry;
49206
49207 /* ToDo Row data -> Base64 */
49208
49209 - switch (event_id) {
49210 + switch (pc_event->event_id) {
49211 case SMBIOS:
49212 case BIS_CERT:
49213 case CMOS:
49214 case NVRAM:
49215 case OPTION_ROM_EXEC:
49216 case OPTION_ROM_CONFIG:
49217 - case OPTION_ROM_MICROCODE:
49218 case S_CRTM_VERSION:
49219 - case S_CRTM_CONTENTS:
49220 - case POST_CONTENTS:
49221 - name = tcpa_pc_event_id_strings[event_id];
49222 + name = tcpa_pc_event_id_strings[pc_event->event_id];
49223 n_len = strlen(name);
49224 break;
49225 + /* hash data */
49226 case POST_BIOS_ROM:
49227 case ESCD:
49228 - name = tcpa_pc_event_id_strings[event_id];
49229 + case OPTION_ROM_MICROCODE:
49230 + case S_CRTM_CONTENTS:
49231 + case POST_CONTENTS:
49232 + name = tcpa_pc_event_id_strings[pc_event->event_id];
49233 n_len = strlen(name);
49234 for (i = 0; i < 20; i++)
49235 - d_len += sprintf(data, "%02x",
49236 - event_entry[8 + i]);
49237 + d_len += sprintf(&data[2*i], "%02x",
49238 + pc_event->event_data[i]);
49239 break;
49240 default:
49241 break;
49242 @@ -260,52 +284,13 @@
49243
49244 static int tpm_binary_bios_measurements_show(struct seq_file *m, void *v)
49245 {
49246 + struct tcpa_event *event = v;
49247 + char *data = v;
49248 + int i;
49249
49250 - char *eventname;
49251 - char data[4];
49252 - u32 help;
49253 - int i, len;
49254 - struct tcpa_event *event = (struct tcpa_event *) v;
49255 - unsigned char *event_entry =
49256 - (unsigned char *) (v + sizeof(struct tcpa_event));
49257 -
49258 - eventname = kmalloc(MAX_TEXT_EVENT, GFP_KERNEL);
49259 - if (!eventname) {
49260 - printk(KERN_ERR "%s: ERROR - No Memory for event name\n ",
49261 - __func__);
49262 - return -ENOMEM;
49263 - }
49264 -
49265 - /* 1st: PCR used is in little-endian format (4 bytes) */
49266 - help = le32_to_cpu(event->pcr_index);
49267 - memcpy(data, &help, 4);
49268 - for (i = 0; i < 4; i++)
49269 - seq_putc(m, data[i]);
49270 -
49271 - /* 2nd: SHA1 (20 bytes) */
49272 - for (i = 0; i < 20; i++)
49273 - seq_putc(m, event->pcr_value[i]);
49274 -
49275 - /* 3rd: event type identifier (4 bytes) */
49276 - help = le32_to_cpu(event->event_type);
49277 - memcpy(data, &help, 4);
49278 - for (i = 0; i < 4; i++)
49279 + for (i = 0; i < sizeof(struct tcpa_event) + event->event_size; i++)
49280 seq_putc(m, data[i]);
49281
49282 - len = 0;
49283 -
49284 - len += get_event_name(eventname, event, event_entry);
49285 -
49286 - /* 4th: filename <= 255 + \'0' delimiter */
49287 - if (len > TCG_EVENT_NAME_LEN_MAX)
49288 - len = TCG_EVENT_NAME_LEN_MAX;
49289 -
49290 - for (i = 0; i < len; i++)
49291 - seq_putc(m, eventname[i]);
49292 -
49293 - /* 5th: delimiter */
49294 - seq_putc(m, '\0');
49295 -
49296 return 0;
49297 }
49298
49299 @@ -353,6 +338,7 @@
49300 /* 4th: eventname <= max + \'0' delimiter */
49301 seq_printf(m, " %s\n", eventname);
49302
49303 + kfree(eventname);
49304 return 0;
49305 }
49306
49307 @@ -376,6 +362,7 @@
49308 struct acpi_tcpa *buff;
49309 acpi_status status;
49310 struct acpi_table_header *virt;
49311 + u64 len, start;
49312
49313 if (log->bios_event_log != NULL) {
49314 printk(KERN_ERR
49315 @@ -396,27 +383,37 @@
49316 return -EIO;
49317 }
49318
49319 - if (buff->log_max_len == 0) {
49320 + switch(buff->platform_class) {
49321 + case BIOS_SERVER:
49322 + len = buff->server.log_max_len;
49323 + start = buff->server.log_start_addr;
49324 + break;
49325 + case BIOS_CLIENT:
49326 + default:
49327 + len = buff->client.log_max_len;
49328 + start = buff->client.log_start_addr;
49329 + break;
49330 + }
49331 + if (!len) {
49332 printk(KERN_ERR "%s: ERROR - TCPA log area empty\n", __func__);
49333 return -EIO;
49334 }
49335
49336 /* malloc EventLog space */
49337 - log->bios_event_log = kmalloc(buff->log_max_len, GFP_KERNEL);
49338 + log->bios_event_log = kmalloc(len, GFP_KERNEL);
49339 if (!log->bios_event_log) {
49340 - printk
49341 - ("%s: ERROR - Not enough Memory for BIOS measurements\n",
49342 - __func__);
49343 + printk("%s: ERROR - Not enough Memory for BIOS measurements\n",
49344 + __func__);
49345 return -ENOMEM;
49346 }
49347
49348 - log->bios_event_log_end = log->bios_event_log + buff->log_max_len;
49349 + log->bios_event_log_end = log->bios_event_log + len;
49350
49351 - acpi_os_map_memory(buff->log_start_addr, buff->log_max_len, (void *) &virt);
49352 + acpi_os_map_memory(start, len, (void *) &virt);
49353
49354 - memcpy(log->bios_event_log, virt, buff->log_max_len);
49355 + memcpy(log->bios_event_log, virt, len);
49356
49357 - acpi_os_unmap_memory(virt, buff->log_max_len);
49358 + acpi_os_unmap_memory(virt, len);
49359 return 0;
49360 }
49361
49362 diff -Nur linux-2.6.16.33-noxen/drivers/char/tpm/tpm_infineon.c linux-2.6.16.33/drivers/char/tpm/tpm_infineon.c
49363 --- linux-2.6.16.33-noxen/drivers/char/tpm/tpm_infineon.c 2006-11-22 18:06:31.000000000 +0000
49364 +++ linux-2.6.16.33/drivers/char/tpm/tpm_infineon.c 2007-05-23 21:00:01.000000000 +0000
49365 @@ -15,6 +15,7 @@
49366 * License.
49367 */
49368
49369 +#include <linux/init.h>
49370 #include <linux/pnp.h>
49371 #include "tpm.h"
49372
49373 @@ -104,7 +105,7 @@
49374
49375 if (clear_wrfifo) {
49376 for (i = 0; i < 4096; i++) {
49377 - status = inb(chip->vendor->base + WRFIFO);
49378 + status = inb(chip->vendor.base + WRFIFO);
49379 if (status == 0xff) {
49380 if (check == 5)
49381 break;
49382 @@ -124,8 +125,8 @@
49383 */
49384 i = 0;
49385 do {
49386 - status = inb(chip->vendor->base + RDFIFO);
49387 - status = inb(chip->vendor->base + STAT);
49388 + status = inb(chip->vendor.base + RDFIFO);
49389 + status = inb(chip->vendor.base + STAT);
49390 i++;
49391 if (i == TPM_MAX_TRIES)
49392 return -EIO;
49393 @@ -138,7 +139,7 @@
49394 int status;
49395 int i;
49396 for (i = 0; i < TPM_MAX_TRIES; i++) {
49397 - status = inb(chip->vendor->base + STAT);
49398 + status = inb(chip->vendor.base + STAT);
49399 /* check the status-register if wait_for_bit is set */
49400 if (status & 1 << wait_for_bit)
49401 break;
49402 @@ -157,7 +158,7 @@
49403 static void wait_and_send(struct tpm_chip *chip, u8 sendbyte)
49404 {
49405 wait(chip, STAT_XFE);
49406 - outb(sendbyte, chip->vendor->base + WRFIFO);
49407 + outb(sendbyte, chip->vendor.base + WRFIFO);
49408 }
49409
49410 /* Note: WTX means Waiting-Time-Extension. Whenever the TPM needs more
49411 @@ -204,7 +205,7 @@
49412 ret = wait(chip, STAT_RDA);
49413 if (ret)
49414 return -EIO;
49415 - buf[i] = inb(chip->vendor->base + RDFIFO);
49416 + buf[i] = inb(chip->vendor.base + RDFIFO);
49417 }
49418
49419 if (buf[0] != TPM_VL_VER) {
49420 @@ -219,7 +220,7 @@
49421
49422 for (i = 0; i < size; i++) {
49423 wait(chip, STAT_RDA);
49424 - buf[i] = inb(chip->vendor->base + RDFIFO);
49425 + buf[i] = inb(chip->vendor.base + RDFIFO);
49426 }
49427
49428 if ((size == 0x6D00) && (buf[1] == 0x80)) {
49429 @@ -268,7 +269,7 @@
49430 u8 count_high, count_low, count_4, count_3, count_2, count_1;
49431
49432 /* Disabling Reset, LP and IRQC */
49433 - outb(RESET_LP_IRQC_DISABLE, chip->vendor->base + CMD);
49434 + outb(RESET_LP_IRQC_DISABLE, chip->vendor.base + CMD);
49435
49436 ret = empty_fifo(chip, 1);
49437 if (ret) {
49438 @@ -319,7 +320,7 @@
49439
49440 static u8 tpm_inf_status(struct tpm_chip *chip)
49441 {
49442 - return inb(chip->vendor->base + STAT);
49443 + return inb(chip->vendor.base + STAT);
49444 }
49445
49446 static DEVICE_ATTR(pubek, S_IRUGO, tpm_show_pubek, NULL);
49447 @@ -346,7 +347,7 @@
49448 .release = tpm_release,
49449 };
49450
49451 -static struct tpm_vendor_specific tpm_inf = {
49452 +static const struct tpm_vendor_specific tpm_inf = {
49453 .recv = tpm_inf_recv,
49454 .send = tpm_inf_send,
49455 .cancel = tpm_inf_cancel,
49456 @@ -375,6 +376,7 @@
49457 int version[2];
49458 int productid[2];
49459 char chipname[20];
49460 + struct tpm_chip *chip;
49461
49462 /* read IO-ports through PnP */
49463 if (pnp_port_valid(dev, 0) && pnp_port_valid(dev, 1) &&
49464 @@ -395,14 +397,13 @@
49465 goto err_last;
49466 }
49467 /* publish my base address and request region */
49468 - tpm_inf.base = TPM_INF_BASE;
49469 if (request_region
49470 - (tpm_inf.base, TPM_INF_PORT_LEN, "tpm_infineon0") == NULL) {
49471 + (TPM_INF_BASE, TPM_INF_PORT_LEN, "tpm_infineon0") == NULL) {
49472 rc = -EINVAL;
49473 goto err_last;
49474 }
49475 - if (request_region(TPM_INF_ADDR, TPM_INF_ADDR_LEN,
49476 - "tpm_infineon0") == NULL) {
49477 + if (request_region
49478 + (TPM_INF_ADDR, TPM_INF_ADDR_LEN, "tpm_infineon0") == NULL) {
49479 rc = -EINVAL;
49480 goto err_last;
49481 }
49482 @@ -442,9 +443,9 @@
49483
49484 /* configure TPM with IO-ports */
49485 outb(IOLIMH, TPM_INF_ADDR);
49486 - outb(((tpm_inf.base >> 8) & 0xff), TPM_INF_DATA);
49487 + outb(((TPM_INF_BASE >> 8) & 0xff), TPM_INF_DATA);
49488 outb(IOLIML, TPM_INF_ADDR);
49489 - outb((tpm_inf.base & 0xff), TPM_INF_DATA);
49490 + outb((TPM_INF_BASE & 0xff), TPM_INF_DATA);
49491
49492 /* control if IO-ports are set correctly */
49493 outb(IOLIMH, TPM_INF_ADDR);
49494 @@ -452,10 +453,10 @@
49495 outb(IOLIML, TPM_INF_ADDR);
49496 iol = inb(TPM_INF_DATA);
49497
49498 - if ((ioh << 8 | iol) != tpm_inf.base) {
49499 + if ((ioh << 8 | iol) != TPM_INF_BASE) {
49500 dev_err(&dev->dev,
49501 - "Could not set IO-ports to 0x%lx\n",
49502 - tpm_inf.base);
49503 + "Could not set IO-ports to 0x%x\n",
49504 + TPM_INF_BASE);
49505 rc = -EIO;
49506 goto err_release_region;
49507 }
49508 @@ -466,15 +467,15 @@
49509 outb(DISABLE_REGISTER_PAIR, TPM_INF_ADDR);
49510
49511 /* disable RESET, LP and IRQC */
49512 - outb(RESET_LP_IRQC_DISABLE, tpm_inf.base + CMD);
49513 + outb(RESET_LP_IRQC_DISABLE, TPM_INF_BASE + CMD);
49514
49515 /* Finally, we're done, print some infos */
49516 dev_info(&dev->dev, "TPM found: "
49517 "config base 0x%x, "
49518 "io base 0x%x, "
49519 - "chip version %02x%02x, "
49520 - "vendor id %x%x (Infineon), "
49521 - "product id %02x%02x"
49522 + "chip version 0x%02x%02x, "
49523 + "vendor id 0x%x%x (Infineon), "
49524 + "product id 0x%02x%02x"
49525 "%s\n",
49526 TPM_INF_ADDR,
49527 TPM_INF_BASE,
49528 @@ -482,11 +483,10 @@
49529 vendorid[0], vendorid[1],
49530 productid[0], productid[1], chipname);
49531
49532 - rc = tpm_register_hardware(&dev->dev, &tpm_inf);
49533 - if (rc < 0) {
49534 - rc = -ENODEV;
49535 + if (!(chip = tpm_register_hardware(&dev->dev, &tpm_inf))) {
49536 goto err_release_region;
49537 }
49538 + chip->vendor.base = TPM_INF_BASE;
49539 return 0;
49540 } else {
49541 rc = -ENODEV;
49542 @@ -494,7 +494,7 @@
49543 }
49544
49545 err_release_region:
49546 - release_region(tpm_inf.base, TPM_INF_PORT_LEN);
49547 + release_region(TPM_INF_BASE, TPM_INF_PORT_LEN);
49548 release_region(TPM_INF_ADDR, TPM_INF_ADDR_LEN);
49549
49550 err_last:
49551 @@ -506,7 +506,8 @@
49552 struct tpm_chip *chip = pnp_get_drvdata(dev);
49553
49554 if (chip) {
49555 - release_region(chip->vendor->base, TPM_INF_PORT_LEN);
49556 + release_region(TPM_INF_BASE, TPM_INF_PORT_LEN);
49557 + release_region(TPM_INF_ADDR, TPM_INF_ADDR_LEN);
49558 tpm_remove_hardware(chip->dev);
49559 }
49560 }
49561 @@ -520,7 +521,7 @@
49562 },
49563 .id_table = tpm_pnp_tbl,
49564 .probe = tpm_inf_pnp_probe,
49565 - .remove = tpm_inf_pnp_remove,
49566 + .remove = __devexit_p(tpm_inf_pnp_remove),
49567 };
49568
49569 static int __init init_inf(void)
49570 @@ -538,5 +539,5 @@
49571
49572 MODULE_AUTHOR("Marcel Selhorst <selhorst@crypto.rub.de>");
49573 MODULE_DESCRIPTION("Driver for Infineon TPM SLD 9630 TT 1.1 / SLB 9635 TT 1.2");
49574 -MODULE_VERSION("1.7");
49575 +MODULE_VERSION("1.8");
49576 MODULE_LICENSE("GPL");
49577 diff -Nur linux-2.6.16.33-noxen/drivers/char/tpm/tpm_nsc.c linux-2.6.16.33/drivers/char/tpm/tpm_nsc.c
49578 --- linux-2.6.16.33-noxen/drivers/char/tpm/tpm_nsc.c 2006-11-22 18:06:31.000000000 +0000
49579 +++ linux-2.6.16.33/drivers/char/tpm/tpm_nsc.c 2007-05-23 21:00:01.000000000 +0000
49580 @@ -71,7 +71,7 @@
49581 unsigned long stop;
49582
49583 /* status immediately available check */
49584 - *data = inb(chip->vendor->base + NSC_STATUS);
49585 + *data = inb(chip->vendor.base + NSC_STATUS);
49586 if ((*data & mask) == val)
49587 return 0;
49588
49589 @@ -79,7 +79,7 @@
49590 stop = jiffies + 10 * HZ;
49591 do {
49592 msleep(TPM_TIMEOUT);
49593 - *data = inb(chip->vendor->base + 1);
49594 + *data = inb(chip->vendor.base + 1);
49595 if ((*data & mask) == val)
49596 return 0;
49597 }
49598 @@ -94,9 +94,9 @@
49599 unsigned long stop;
49600
49601 /* status immediately available check */
49602 - status = inb(chip->vendor->base + NSC_STATUS);
49603 + status = inb(chip->vendor.base + NSC_STATUS);
49604 if (status & NSC_STATUS_OBF)
49605 - status = inb(chip->vendor->base + NSC_DATA);
49606 + status = inb(chip->vendor.base + NSC_DATA);
49607 if (status & NSC_STATUS_RDY)
49608 return 0;
49609
49610 @@ -104,9 +104,9 @@
49611 stop = jiffies + 100;
49612 do {
49613 msleep(TPM_TIMEOUT);
49614 - status = inb(chip->vendor->base + NSC_STATUS);
49615 + status = inb(chip->vendor.base + NSC_STATUS);
49616 if (status & NSC_STATUS_OBF)
49617 - status = inb(chip->vendor->base + NSC_DATA);
49618 + status = inb(chip->vendor.base + NSC_DATA);
49619 if (status & NSC_STATUS_RDY)
49620 return 0;
49621 }
49622 @@ -132,7 +132,7 @@
49623 return -EIO;
49624 }
49625 if ((data =
49626 - inb(chip->vendor->base + NSC_DATA)) != NSC_COMMAND_NORMAL) {
49627 + inb(chip->vendor.base + NSC_DATA)) != NSC_COMMAND_NORMAL) {
49628 dev_err(chip->dev, "not in normal mode (0x%x)\n",
49629 data);
49630 return -EIO;
49631 @@ -148,7 +148,7 @@
49632 }
49633 if (data & NSC_STATUS_F0)
49634 break;
49635 - *p = inb(chip->vendor->base + NSC_DATA);
49636 + *p = inb(chip->vendor.base + NSC_DATA);
49637 }
49638
49639 if ((data & NSC_STATUS_F0) == 0 &&
49640 @@ -156,7 +156,7 @@
49641 dev_err(chip->dev, "F0 not set\n");
49642 return -EIO;
49643 }
49644 - if ((data = inb(chip->vendor->base + NSC_DATA)) != NSC_COMMAND_EOC) {
49645 + if ((data = inb(chip->vendor.base + NSC_DATA)) != NSC_COMMAND_EOC) {
49646 dev_err(chip->dev,
49647 "expected end of command(0x%x)\n", data);
49648 return -EIO;
49649 @@ -182,7 +182,7 @@
49650 * fix it. Not sure why this is needed, we followed the flow
49651 * chart in the manual to the letter.
49652 */
49653 - outb(NSC_COMMAND_CANCEL, chip->vendor->base + NSC_COMMAND);
49654 + outb(NSC_COMMAND_CANCEL, chip->vendor.base + NSC_COMMAND);
49655
49656 if (nsc_wait_for_ready(chip) != 0)
49657 return -EIO;
49658 @@ -192,7 +192,7 @@
49659 return -EIO;
49660 }
49661
49662 - outb(NSC_COMMAND_NORMAL, chip->vendor->base + NSC_COMMAND);
49663 + outb(NSC_COMMAND_NORMAL, chip->vendor.base + NSC_COMMAND);
49664 if (wait_for_stat(chip, NSC_STATUS_IBR, NSC_STATUS_IBR, &data) < 0) {
49665 dev_err(chip->dev, "IBR timeout\n");
49666 return -EIO;
49667 @@ -204,26 +204,26 @@
49668 "IBF timeout (while writing data)\n");
49669 return -EIO;
49670 }
49671 - outb(buf[i], chip->vendor->base + NSC_DATA);
49672 + outb(buf[i], chip->vendor.base + NSC_DATA);
49673 }
49674
49675 if (wait_for_stat(chip, NSC_STATUS_IBF, 0, &data) < 0) {
49676 dev_err(chip->dev, "IBF timeout\n");
49677 return -EIO;
49678 }
49679 - outb(NSC_COMMAND_EOC, chip->vendor->base + NSC_COMMAND);
49680 + outb(NSC_COMMAND_EOC, chip->vendor.base + NSC_COMMAND);
49681
49682 return count;
49683 }
49684
49685 static void tpm_nsc_cancel(struct tpm_chip *chip)
49686 {
49687 - outb(NSC_COMMAND_CANCEL, chip->vendor->base + NSC_COMMAND);
49688 + outb(NSC_COMMAND_CANCEL, chip->vendor.base + NSC_COMMAND);
49689 }
49690
49691 static u8 tpm_nsc_status(struct tpm_chip *chip)
49692 {
49693 - return inb(chip->vendor->base + NSC_STATUS);
49694 + return inb(chip->vendor.base + NSC_STATUS);
49695 }
49696
49697 static struct file_operations nsc_ops = {
49698 @@ -250,7 +250,7 @@
49699
49700 static struct attribute_group nsc_attr_grp = { .attrs = nsc_attrs };
49701
49702 -static struct tpm_vendor_specific tpm_nsc = {
49703 +static const struct tpm_vendor_specific tpm_nsc = {
49704 .recv = tpm_nsc_recv,
49705 .send = tpm_nsc_send,
49706 .cancel = tpm_nsc_cancel,
49707 @@ -268,7 +268,7 @@
49708 {
49709 struct tpm_chip *chip = dev_get_drvdata(dev);
49710 if ( chip ) {
49711 - release_region(chip->vendor->base, 2);
49712 + release_region(chip->vendor.base, 2);
49713 tpm_remove_hardware(chip->dev);
49714 }
49715 }
49716 @@ -286,7 +286,8 @@
49717 int rc = 0;
49718 int lo, hi;
49719 int nscAddrBase = TPM_ADDR;
49720 -
49721 + struct tpm_chip *chip;
49722 + unsigned long base;
49723
49724 /* verify that it is a National part (SID) */
49725 if (tpm_read_index(TPM_ADDR, NSC_SID_INDEX) != 0xEF) {
49726 @@ -300,7 +301,7 @@
49727
49728 hi = tpm_read_index(nscAddrBase, TPM_NSC_BASE0_HI);
49729 lo = tpm_read_index(nscAddrBase, TPM_NSC_BASE0_LO);
49730 - tpm_nsc.base = (hi<<8) | lo;
49731 + base = (hi<<8) | lo;
49732
49733 /* enable the DPM module */
49734 tpm_write_index(nscAddrBase, NSC_LDC_INDEX, 0x01);
49735 @@ -320,13 +321,15 @@
49736 if ((rc = platform_device_register(pdev)) < 0)
49737 goto err_free_dev;
49738
49739 - if (request_region(tpm_nsc.base, 2, "tpm_nsc0") == NULL ) {
49740 + if (request_region(base, 2, "tpm_nsc0") == NULL ) {
49741 rc = -EBUSY;
49742 goto err_unreg_dev;
49743 }
49744
49745 - if ((rc = tpm_register_hardware(&pdev->dev, &tpm_nsc)) < 0)
49746 + if (!(chip = tpm_register_hardware(&pdev->dev, &tpm_nsc))) {
49747 + rc = -ENODEV;
49748 goto err_rel_reg;
49749 + }
49750
49751 dev_dbg(&pdev->dev, "NSC TPM detected\n");
49752 dev_dbg(&pdev->dev,
49753 @@ -361,10 +364,12 @@
49754 "NSC TPM revision %d\n",
49755 tpm_read_index(nscAddrBase, 0x27) & 0x1F);
49756
49757 + chip->vendor.base = base;
49758 +
49759 return 0;
49760
49761 err_rel_reg:
49762 - release_region(tpm_nsc.base, 2);
49763 + release_region(base, 2);
49764 err_unreg_dev:
49765 platform_device_unregister(pdev);
49766 err_free_dev:
49767 diff -Nur linux-2.6.16.33-noxen/drivers/char/tpm/tpm_tis.c linux-2.6.16.33/drivers/char/tpm/tpm_tis.c
49768 --- linux-2.6.16.33-noxen/drivers/char/tpm/tpm_tis.c 1970-01-01 00:00:00.000000000 +0000
49769 +++ linux-2.6.16.33/drivers/char/tpm/tpm_tis.c 2007-05-23 21:00:01.000000000 +0000
49770 @@ -0,0 +1,665 @@
49771 +/*
49772 + * Copyright (C) 2005, 2006 IBM Corporation
49773 + *
49774 + * Authors:
49775 + * Leendert van Doorn <leendert@watson.ibm.com>
49776 + * Kylene Hall <kjhall@us.ibm.com>
49777 + *
49778 + * Device driver for TCG/TCPA TPM (trusted platform module).
49779 + * Specifications at www.trustedcomputinggroup.org
49780 + *
49781 + * This device driver implements the TPM interface as defined in
49782 + * the TCG TPM Interface Spec version 1.2, revision 1.0.
49783 + *
49784 + * This program is free software; you can redistribute it and/or
49785 + * modify it under the terms of the GNU General Public License as
49786 + * published by the Free Software Foundation, version 2 of the
49787 + * License.
49788 + */
49789 +#include <linux/init.h>
49790 +#include <linux/module.h>
49791 +#include <linux/moduleparam.h>
49792 +#include <linux/pnp.h>
49793 +#include <linux/interrupt.h>
49794 +#include <linux/wait.h>
49795 +#include "tpm.h"
49796 +
49797 +#define TPM_HEADER_SIZE 10
49798 +
49799 +enum tis_access {
49800 + TPM_ACCESS_VALID = 0x80,
49801 + TPM_ACCESS_ACTIVE_LOCALITY = 0x20,
49802 + TPM_ACCESS_REQUEST_PENDING = 0x04,
49803 + TPM_ACCESS_REQUEST_USE = 0x02,
49804 +};
49805 +
49806 +enum tis_status {
49807 + TPM_STS_VALID = 0x80,
49808 + TPM_STS_COMMAND_READY = 0x40,
49809 + TPM_STS_GO = 0x20,
49810 + TPM_STS_DATA_AVAIL = 0x10,
49811 + TPM_STS_DATA_EXPECT = 0x08,
49812 +};
49813 +
49814 +enum tis_int_flags {
49815 + TPM_GLOBAL_INT_ENABLE = 0x80000000,
49816 + TPM_INTF_BURST_COUNT_STATIC = 0x100,
49817 + TPM_INTF_CMD_READY_INT = 0x080,
49818 + TPM_INTF_INT_EDGE_FALLING = 0x040,
49819 + TPM_INTF_INT_EDGE_RISING = 0x020,
49820 + TPM_INTF_INT_LEVEL_LOW = 0x010,
49821 + TPM_INTF_INT_LEVEL_HIGH = 0x008,
49822 + TPM_INTF_LOCALITY_CHANGE_INT = 0x004,
49823 + TPM_INTF_STS_VALID_INT = 0x002,
49824 + TPM_INTF_DATA_AVAIL_INT = 0x001,
49825 +};
49826 +
49827 +enum tis_defaults {
49828 + TIS_MEM_BASE = 0xFED40000,
49829 + TIS_MEM_LEN = 0x5000,
49830 + TIS_SHORT_TIMEOUT = 750, /* ms */
49831 + TIS_LONG_TIMEOUT = 2000, /* 2 sec */
49832 +};
49833 +
49834 +#define TPM_ACCESS(l) (0x0000 | ((l) << 12))
49835 +#define TPM_INT_ENABLE(l) (0x0008 | ((l) << 12))
49836 +#define TPM_INT_VECTOR(l) (0x000C | ((l) << 12))
49837 +#define TPM_INT_STATUS(l) (0x0010 | ((l) << 12))
49838 +#define TPM_INTF_CAPS(l) (0x0014 | ((l) << 12))
49839 +#define TPM_STS(l) (0x0018 | ((l) << 12))
49840 +#define TPM_DATA_FIFO(l) (0x0024 | ((l) << 12))
49841 +
49842 +#define TPM_DID_VID(l) (0x0F00 | ((l) << 12))
49843 +#define TPM_RID(l) (0x0F04 | ((l) << 12))
49844 +
49845 +static LIST_HEAD(tis_chips);
49846 +static DEFINE_SPINLOCK(tis_lock);
49847 +
49848 +static int check_locality(struct tpm_chip *chip, int l)
49849 +{
49850 + if ((ioread8(chip->vendor.iobase + TPM_ACCESS(l)) &
49851 + (TPM_ACCESS_ACTIVE_LOCALITY | TPM_ACCESS_VALID)) ==
49852 + (TPM_ACCESS_ACTIVE_LOCALITY | TPM_ACCESS_VALID))
49853 + return chip->vendor.locality = l;
49854 +
49855 + return -1;
49856 +}
49857 +
49858 +static void release_locality(struct tpm_chip *chip, int l, int force)
49859 +{
49860 + if (force || (ioread8(chip->vendor.iobase + TPM_ACCESS(l)) &
49861 + (TPM_ACCESS_REQUEST_PENDING | TPM_ACCESS_VALID)) ==
49862 + (TPM_ACCESS_REQUEST_PENDING | TPM_ACCESS_VALID))
49863 + iowrite8(TPM_ACCESS_ACTIVE_LOCALITY,
49864 + chip->vendor.iobase + TPM_ACCESS(l));
49865 +}
49866 +
49867 +static int request_locality(struct tpm_chip *chip, int l)
49868 +{
49869 + unsigned long stop;
49870 + long rc;
49871 +
49872 + if (check_locality(chip, l) >= 0)
49873 + return l;
49874 +
49875 + iowrite8(TPM_ACCESS_REQUEST_USE,
49876 + chip->vendor.iobase + TPM_ACCESS(l));
49877 +
49878 + if (chip->vendor.irq) {
49879 + rc = wait_event_interruptible_timeout(chip->vendor.int_queue,
49880 + (check_locality
49881 + (chip, l) >= 0),
49882 + chip->vendor.timeout_a);
49883 + if (rc > 0)
49884 + return l;
49885 +
49886 + } else {
49887 + /* wait for burstcount */
49888 + stop = jiffies + chip->vendor.timeout_a;
49889 + do {
49890 + if (check_locality(chip, l) >= 0)
49891 + return l;
49892 + msleep(TPM_TIMEOUT);
49893 + }
49894 + while (time_before(jiffies, stop));
49895 + }
49896 + return -1;
49897 +}
49898 +
49899 +static u8 tpm_tis_status(struct tpm_chip *chip)
49900 +{
49901 + return ioread8(chip->vendor.iobase +
49902 + TPM_STS(chip->vendor.locality));
49903 +}
49904 +
49905 +static void tpm_tis_ready(struct tpm_chip *chip)
49906 +{
49907 + /* this causes the current command to be aborted */
49908 + iowrite8(TPM_STS_COMMAND_READY,
49909 + chip->vendor.iobase + TPM_STS(chip->vendor.locality));
49910 +}
49911 +
49912 +static int get_burstcount(struct tpm_chip *chip)
49913 +{
49914 + unsigned long stop;
49915 + int burstcnt;
49916 +
49917 + /* wait for burstcount */
49918 + /* which timeout value, spec has 2 answers (c & d) */
49919 + stop = jiffies + chip->vendor.timeout_d;
49920 + do {
49921 + burstcnt = ioread8(chip->vendor.iobase +
49922 + TPM_STS(chip->vendor.locality) + 1);
49923 + burstcnt += ioread8(chip->vendor.iobase +
49924 + TPM_STS(chip->vendor.locality) +
49925 + 2) << 8;
49926 + if (burstcnt)
49927 + return burstcnt;
49928 + msleep(TPM_TIMEOUT);
49929 + } while (time_before(jiffies, stop));
49930 + return -EBUSY;
49931 +}
49932 +
49933 +static int wait_for_stat(struct tpm_chip *chip, u8 mask, unsigned long timeout,
49934 + wait_queue_head_t *queue)
49935 +{
49936 + unsigned long stop;
49937 + long rc;
49938 + u8 status;
49939 +
49940 + /* check current status */
49941 + status = tpm_tis_status(chip);
49942 + if ((status & mask) == mask)
49943 + return 0;
49944 +
49945 + if (chip->vendor.irq) {
49946 + rc = wait_event_interruptible_timeout(*queue,
49947 + ((tpm_tis_status
49948 + (chip) & mask) ==
49949 + mask), timeout);
49950 + if (rc > 0)
49951 + return 0;
49952 + } else {
49953 + stop = jiffies + timeout;
49954 + do {
49955 + msleep(TPM_TIMEOUT);
49956 + status = tpm_tis_status(chip);
49957 + if ((status & mask) == mask)
49958 + return 0;
49959 + } while (time_before(jiffies, stop));
49960 + }
49961 + return -ETIME;
49962 +}
49963 +
49964 +static int recv_data(struct tpm_chip *chip, u8 *buf, size_t count)
49965 +{
49966 + int size = 0, burstcnt;
49967 + while (size < count &&
49968 + wait_for_stat(chip,
49969 + TPM_STS_DATA_AVAIL | TPM_STS_VALID,
49970 + chip->vendor.timeout_c,
49971 + &chip->vendor.read_queue)
49972 + == 0) {
49973 + burstcnt = get_burstcount(chip);
49974 + for (; burstcnt > 0 && size < count; burstcnt--)
49975 + buf[size++] = ioread8(chip->vendor.iobase +
49976 + TPM_DATA_FIFO(chip->vendor.
49977 + locality));
49978 + }
49979 + return size;
49980 +}
49981 +
49982 +static int tpm_tis_recv(struct tpm_chip *chip, u8 *buf, size_t count)
49983 +{
49984 + int size = 0;
49985 + int expected, status;
49986 +
49987 + if (count < TPM_HEADER_SIZE) {
49988 + size = -EIO;
49989 + goto out;
49990 + }
49991 +
49992 + /* read first 10 bytes, including tag, paramsize, and result */
49993 + if ((size =
49994 + recv_data(chip, buf, TPM_HEADER_SIZE)) < TPM_HEADER_SIZE) {
49995 + dev_err(chip->dev, "Unable to read header\n");
49996 + goto out;
49997 + }
49998 +
49999 + expected = be32_to_cpu(*(__be32 *) (buf + 2));
50000 + if (expected > count) {
50001 + size = -EIO;
50002 + goto out;
50003 + }
50004 +
50005 + if ((size +=
50006 + recv_data(chip, &buf[TPM_HEADER_SIZE],
50007 + expected - TPM_HEADER_SIZE)) < expected) {
50008 + dev_err(chip->dev, "Unable to read remainder of result\n");
50009 + size = -ETIME;
50010 + goto out;
50011 + }
50012 +
50013 + wait_for_stat(chip, TPM_STS_VALID, chip->vendor.timeout_c,
50014 + &chip->vendor.int_queue);
50015 + status = tpm_tis_status(chip);
50016 + if (status & TPM_STS_DATA_AVAIL) { /* retry? */
50017 + dev_err(chip->dev, "Error left over data\n");
50018 + size = -EIO;
50019 + goto out;
50020 + }
50021 +
50022 +out:
50023 + tpm_tis_ready(chip);
50024 + release_locality(chip, chip->vendor.locality, 0);
50025 + return size;
50026 +}
50027 +
50028 +/*
50029 + * If interrupts are used (signaled by an irq set in the vendor structure)
50030 + * tpm.c can skip polling for the data to be available as the interrupt is
50031 + * waited for here
50032 + */
50033 +static int tpm_tis_send(struct tpm_chip *chip, u8 *buf, size_t len)
50034 +{
50035 + int rc, status, burstcnt;
50036 + size_t count = 0;
50037 + u32 ordinal;
50038 +
50039 + if (request_locality(chip, 0) < 0)
50040 + return -EBUSY;
50041 +
50042 + status = tpm_tis_status(chip);
50043 + if ((status & TPM_STS_COMMAND_READY) == 0) {
50044 + tpm_tis_ready(chip);
50045 + if (wait_for_stat
50046 + (chip, TPM_STS_COMMAND_READY, chip->vendor.timeout_b,
50047 + &chip->vendor.int_queue) < 0) {
50048 + rc = -ETIME;
50049 + goto out_err;
50050 + }
50051 + }
50052 +
50053 + while (count < len - 1) {
50054 + burstcnt = get_burstcount(chip);
50055 + for (; burstcnt > 0 && count < len - 1; burstcnt--) {
50056 + iowrite8(buf[count], chip->vendor.iobase +
50057 + TPM_DATA_FIFO(chip->vendor.locality));
50058 + count++;
50059 + }
50060 +
50061 + wait_for_stat(chip, TPM_STS_VALID, chip->vendor.timeout_c,
50062 + &chip->vendor.int_queue);
50063 + status = tpm_tis_status(chip);
50064 + if ((status & TPM_STS_DATA_EXPECT) == 0) {
50065 + rc = -EIO;
50066 + goto out_err;
50067 + }
50068 + }
50069 +
50070 + /* write last byte */
50071 + iowrite8(buf[count],
50072 + chip->vendor.iobase +
50073 + TPM_DATA_FIFO(chip->vendor.locality));
50074 + wait_for_stat(chip, TPM_STS_VALID, chip->vendor.timeout_c,
50075 + &chip->vendor.int_queue);
50076 + status = tpm_tis_status(chip);
50077 + if ((status & TPM_STS_DATA_EXPECT) != 0) {
50078 + rc = -EIO;
50079 + goto out_err;
50080 + }
50081 +
50082 + /* go and do it */
50083 + iowrite8(TPM_STS_GO,
50084 + chip->vendor.iobase + TPM_STS(chip->vendor.locality));
50085 +
50086 + if (chip->vendor.irq) {
50087 + ordinal = be32_to_cpu(*((__be32 *) (buf + 6)));
50088 + if (wait_for_stat
50089 + (chip, TPM_STS_DATA_AVAIL | TPM_STS_VALID,
50090 + tpm_calc_ordinal_duration(chip, ordinal),
50091 + &chip->vendor.read_queue) < 0) {
50092 + rc = -ETIME;
50093 + goto out_err;
50094 + }
50095 + }
50096 + return len;
50097 +out_err:
50098 + tpm_tis_ready(chip);
50099 + release_locality(chip, chip->vendor.locality, 0);
50100 + return rc;
50101 +}
50102 +
50103 +static struct file_operations tis_ops = {
50104 + .owner = THIS_MODULE,
50105 + .llseek = no_llseek,
50106 + .open = tpm_open,
50107 + .read = tpm_read,
50108 + .write = tpm_write,
50109 + .release = tpm_release,
50110 +};
50111 +
50112 +static DEVICE_ATTR(pubek, S_IRUGO, tpm_show_pubek, NULL);
50113 +static DEVICE_ATTR(pcrs, S_IRUGO, tpm_show_pcrs, NULL);
50114 +static DEVICE_ATTR(enabled, S_IRUGO, tpm_show_enabled, NULL);
50115 +static DEVICE_ATTR(active, S_IRUGO, tpm_show_active, NULL);
50116 +static DEVICE_ATTR(owned, S_IRUGO, tpm_show_owned, NULL);
50117 +static DEVICE_ATTR(temp_deactivated, S_IRUGO, tpm_show_temp_deactivated,
50118 + NULL);
50119 +static DEVICE_ATTR(caps, S_IRUGO, tpm_show_caps_1_2, NULL);
50120 +static DEVICE_ATTR(cancel, S_IWUSR | S_IWGRP, NULL, tpm_store_cancel);
50121 +
50122 +static struct attribute *tis_attrs[] = {
50123 + &dev_attr_pubek.attr,
50124 + &dev_attr_pcrs.attr,
50125 + &dev_attr_enabled.attr,
50126 + &dev_attr_active.attr,
50127 + &dev_attr_owned.attr,
50128 + &dev_attr_temp_deactivated.attr,
50129 + &dev_attr_caps.attr,
50130 + &dev_attr_cancel.attr, NULL,
50131 +};
50132 +
50133 +static struct attribute_group tis_attr_grp = {
50134 + .attrs = tis_attrs
50135 +};
50136 +
50137 +static struct tpm_vendor_specific tpm_tis = {
50138 + .status = tpm_tis_status,
50139 + .recv = tpm_tis_recv,
50140 + .send = tpm_tis_send,
50141 + .cancel = tpm_tis_ready,
50142 + .req_complete_mask = TPM_STS_DATA_AVAIL | TPM_STS_VALID,
50143 + .req_complete_val = TPM_STS_DATA_AVAIL | TPM_STS_VALID,
50144 + .req_canceled = TPM_STS_COMMAND_READY,
50145 + .attr_group = &tis_attr_grp,
50146 + .miscdev = {
50147 + .fops = &tis_ops,},
50148 +};
50149 +
50150 +static irqreturn_t tis_int_probe(int irq, void *dev_id, struct pt_regs *regs)
50151 +{
50152 + struct tpm_chip *chip = (struct tpm_chip *) dev_id;
50153 + u32 interrupt;
50154 +
50155 + interrupt = ioread32(chip->vendor.iobase +
50156 + TPM_INT_STATUS(chip->vendor.locality));
50157 +
50158 + if (interrupt == 0)
50159 + return IRQ_NONE;
50160 +
50161 + chip->vendor.irq = irq;
50162 +
50163 + /* Clear interrupts handled with TPM_EOI */
50164 + iowrite32(interrupt,
50165 + chip->vendor.iobase +
50166 + TPM_INT_STATUS(chip->vendor.locality));
50167 + return IRQ_HANDLED;
50168 +}
50169 +
50170 +static irqreturn_t tis_int_handler(int irq, void *dev_id, struct pt_regs *regs)
50171 +{
50172 + struct tpm_chip *chip = (struct tpm_chip *) dev_id;
50173 + u32 interrupt;
50174 + int i;
50175 +
50176 + interrupt = ioread32(chip->vendor.iobase +
50177 + TPM_INT_STATUS(chip->vendor.locality));
50178 +
50179 + if (interrupt == 0)
50180 + return IRQ_NONE;
50181 +
50182 + if (interrupt & TPM_INTF_DATA_AVAIL_INT)
50183 + wake_up_interruptible(&chip->vendor.read_queue);
50184 + if (interrupt & TPM_INTF_LOCALITY_CHANGE_INT)
50185 + for (i = 0; i < 5; i++)
50186 + if (check_locality(chip, i) >= 0)
50187 + break;
50188 + if (interrupt &
50189 + (TPM_INTF_LOCALITY_CHANGE_INT | TPM_INTF_STS_VALID_INT |
50190 + TPM_INTF_CMD_READY_INT))
50191 + wake_up_interruptible(&chip->vendor.int_queue);
50192 +
50193 + /* Clear interrupts handled with TPM_EOI */
50194 + iowrite32(interrupt,
50195 + chip->vendor.iobase +
50196 + TPM_INT_STATUS(chip->vendor.locality));
50197 + return IRQ_HANDLED;
50198 +}
50199 +
50200 +static int interrupts = 1;
50201 +module_param(interrupts, bool, 0444);
50202 +MODULE_PARM_DESC(interrupts, "Enable interrupts");
50203 +
50204 +static int __devinit tpm_tis_pnp_init(struct pnp_dev *pnp_dev,
50205 + const struct pnp_device_id *pnp_id)
50206 +{
50207 + u32 vendor, intfcaps, intmask;
50208 + int rc, i;
50209 + unsigned long start, len;
50210 + struct tpm_chip *chip;
50211 +
50212 + start = pnp_mem_start(pnp_dev, 0);
50213 + len = pnp_mem_len(pnp_dev, 0);
50214 +
50215 + if (!start)
50216 + start = TIS_MEM_BASE;
50217 + if (!len)
50218 + len = TIS_MEM_LEN;
50219 +
50220 + if (!(chip = tpm_register_hardware(&pnp_dev->dev, &tpm_tis)))
50221 + return -ENODEV;
50222 +
50223 + chip->vendor.iobase = ioremap(start, len);
50224 + if (!chip->vendor.iobase) {
50225 + rc = -EIO;
50226 + goto out_err;
50227 + }
50228 +
50229 + vendor = ioread32(chip->vendor.iobase + TPM_DID_VID(0));
50230 +
50231 + /* Default timeouts */
50232 + chip->vendor.timeout_a = msecs_to_jiffies(TIS_SHORT_TIMEOUT);
50233 + chip->vendor.timeout_b = msecs_to_jiffies(TIS_LONG_TIMEOUT);
50234 + chip->vendor.timeout_c = msecs_to_jiffies(TIS_SHORT_TIMEOUT);
50235 + chip->vendor.timeout_d = msecs_to_jiffies(TIS_SHORT_TIMEOUT);
50236 +
50237 + dev_info(&pnp_dev->dev,
50238 + "1.2 TPM (device-id 0x%X, rev-id %d)\n",
50239 + vendor >> 16, ioread8(chip->vendor.iobase + TPM_RID(0)));
50240 +
50241 + /* Figure out the capabilities */
50242 + intfcaps =
50243 + ioread32(chip->vendor.iobase +
50244 + TPM_INTF_CAPS(chip->vendor.locality));
50245 + dev_dbg(&pnp_dev->dev, "TPM interface capabilities (0x%x):\n",
50246 + intfcaps);
50247 + if (intfcaps & TPM_INTF_BURST_COUNT_STATIC)
50248 + dev_dbg(&pnp_dev->dev, "\tBurst Count Static\n");
50249 + if (intfcaps & TPM_INTF_CMD_READY_INT)
50250 + dev_dbg(&pnp_dev->dev, "\tCommand Ready Int Support\n");
50251 + if (intfcaps & TPM_INTF_INT_EDGE_FALLING)
50252 + dev_dbg(&pnp_dev->dev, "\tInterrupt Edge Falling\n");
50253 + if (intfcaps & TPM_INTF_INT_EDGE_RISING)
50254 + dev_dbg(&pnp_dev->dev, "\tInterrupt Edge Rising\n");
50255 + if (intfcaps & TPM_INTF_INT_LEVEL_LOW)
50256 + dev_dbg(&pnp_dev->dev, "\tInterrupt Level Low\n");
50257 + if (intfcaps & TPM_INTF_INT_LEVEL_HIGH)
50258 + dev_dbg(&pnp_dev->dev, "\tInterrupt Level High\n");
50259 + if (intfcaps & TPM_INTF_LOCALITY_CHANGE_INT)
50260 + dev_dbg(&pnp_dev->dev, "\tLocality Change Int Support\n");
50261 + if (intfcaps & TPM_INTF_STS_VALID_INT)
50262 + dev_dbg(&pnp_dev->dev, "\tSts Valid Int Support\n");
50263 + if (intfcaps & TPM_INTF_DATA_AVAIL_INT)
50264 + dev_dbg(&pnp_dev->dev, "\tData Avail Int Support\n");
50265 +
50266 + if (request_locality(chip, 0) != 0) {
50267 + rc = -ENODEV;
50268 + goto out_err;
50269 + }
50270 +
50271 + /* INTERRUPT Setup */
50272 + init_waitqueue_head(&chip->vendor.read_queue);
50273 + init_waitqueue_head(&chip->vendor.int_queue);
50274 +
50275 + intmask =
50276 + ioread32(chip->vendor.iobase +
50277 + TPM_INT_ENABLE(chip->vendor.locality));
50278 +
50279 + intmask |= TPM_INTF_CMD_READY_INT
50280 + | TPM_INTF_LOCALITY_CHANGE_INT | TPM_INTF_DATA_AVAIL_INT
50281 + | TPM_INTF_STS_VALID_INT;
50282 +
50283 + iowrite32(intmask,
50284 + chip->vendor.iobase +
50285 + TPM_INT_ENABLE(chip->vendor.locality));
50286 + if (interrupts) {
50287 + chip->vendor.irq =
50288 + ioread8(chip->vendor.iobase +
50289 + TPM_INT_VECTOR(chip->vendor.locality));
50290 +
50291 + for (i = 3; i < 16 && chip->vendor.irq == 0; i++) {
50292 + iowrite8(i, chip->vendor.iobase +
50293 + TPM_INT_VECTOR(chip->vendor.locality));
50294 + if (request_irq
50295 + (i, tis_int_probe, SA_SHIRQ,
50296 + chip->vendor.miscdev.name, chip) != 0) {
50297 + dev_info(chip->dev,
50298 + "Unable to request irq: %d for probe\n",
50299 + i);
50300 + continue;
50301 + }
50302 +
50303 + /* Clear all existing */
50304 + iowrite32(ioread32
50305 + (chip->vendor.iobase +
50306 + TPM_INT_STATUS(chip->vendor.locality)),
50307 + chip->vendor.iobase +
50308 + TPM_INT_STATUS(chip->vendor.locality));
50309 +
50310 + /* Turn on */
50311 + iowrite32(intmask | TPM_GLOBAL_INT_ENABLE,
50312 + chip->vendor.iobase +
50313 + TPM_INT_ENABLE(chip->vendor.locality));
50314 +
50315 + /* Generate Interrupts */
50316 + tpm_gen_interrupt(chip);
50317 +
50318 + /* Turn off */
50319 + iowrite32(intmask,
50320 + chip->vendor.iobase +
50321 + TPM_INT_ENABLE(chip->vendor.locality));
50322 + free_irq(i, chip);
50323 + }
50324 + }
50325 + if (chip->vendor.irq) {
50326 + iowrite8(chip->vendor.irq,
50327 + chip->vendor.iobase +
50328 + TPM_INT_VECTOR(chip->vendor.locality));
50329 + if (request_irq
50330 + (chip->vendor.irq, tis_int_handler, SA_SHIRQ,
50331 + chip->vendor.miscdev.name, chip) != 0) {
50332 + dev_info(chip->dev,
50333 + "Unable to request irq: %d for use\n",
50334 + chip->vendor.irq);
50335 + chip->vendor.irq = 0;
50336 + } else {
50337 + /* Clear all existing */
50338 + iowrite32(ioread32
50339 + (chip->vendor.iobase +
50340 + TPM_INT_STATUS(chip->vendor.locality)),
50341 + chip->vendor.iobase +
50342 + TPM_INT_STATUS(chip->vendor.locality));
50343 +
50344 + /* Turn on */
50345 + iowrite32(intmask | TPM_GLOBAL_INT_ENABLE,
50346 + chip->vendor.iobase +
50347 + TPM_INT_ENABLE(chip->vendor.locality));
50348 + }
50349 + }
50350 +
50351 + INIT_LIST_HEAD(&chip->vendor.list);
50352 + spin_lock(&tis_lock);
50353 + list_add(&chip->vendor.list, &tis_chips);
50354 + spin_unlock(&tis_lock);
50355 +
50356 + tpm_get_timeouts(chip);
50357 + tpm_continue_selftest(chip);
50358 +
50359 + return 0;
50360 +out_err:
50361 + if (chip->vendor.iobase)
50362 + iounmap(chip->vendor.iobase);
50363 + tpm_remove_hardware(chip->dev);
50364 + return rc;
50365 +}
50366 +
50367 +static int tpm_tis_pnp_suspend(struct pnp_dev *dev, pm_message_t msg)
50368 +{
50369 + return tpm_pm_suspend(&dev->dev, msg);
50370 +}
50371 +
50372 +static int tpm_tis_pnp_resume(struct pnp_dev *dev)
50373 +{
50374 + return tpm_pm_resume(&dev->dev);
50375 +}
50376 +
50377 +static struct pnp_device_id tpm_pnp_tbl[] __devinitdata = {
50378 + {"PNP0C31", 0}, /* TPM */
50379 + {"ATM1200", 0}, /* Atmel */
50380 + {"IFX0102", 0}, /* Infineon */
50381 + {"BCM0101", 0}, /* Broadcom */
50382 + {"NSC1200", 0}, /* National */
50383 + /* Add new here */
50384 + {"", 0}, /* User Specified */
50385 + {"", 0} /* Terminator */
50386 +};
50387 +
50388 +static struct pnp_driver tis_pnp_driver = {
50389 + .name = "tpm_tis",
50390 + .id_table = tpm_pnp_tbl,
50391 + .probe = tpm_tis_pnp_init,
50392 + .suspend = tpm_tis_pnp_suspend,
50393 + .resume = tpm_tis_pnp_resume,
50394 +};
50395 +
50396 +#define TIS_HID_USR_IDX sizeof(tpm_pnp_tbl)/sizeof(struct pnp_device_id) -2
50397 +module_param_string(hid, tpm_pnp_tbl[TIS_HID_USR_IDX].id,
50398 + sizeof(tpm_pnp_tbl[TIS_HID_USR_IDX].id), 0444);
50399 +MODULE_PARM_DESC(hid, "Set additional specific HID for this driver to probe");
50400 +
50401 +static int __init init_tis(void)
50402 +{
50403 + return pnp_register_driver(&tis_pnp_driver);
50404 +}
50405 +
50406 +static void __exit cleanup_tis(void)
50407 +{
50408 + struct tpm_vendor_specific *i, *j;
50409 + struct tpm_chip *chip;
50410 + spin_lock(&tis_lock);
50411 + list_for_each_entry_safe(i, j, &tis_chips, list) {
50412 + chip = to_tpm_chip(i);
50413 + iowrite32(~TPM_GLOBAL_INT_ENABLE &
50414 + ioread32(chip->vendor.iobase +
50415 + TPM_INT_ENABLE(chip->vendor.
50416 + locality)),
50417 + chip->vendor.iobase +
50418 + TPM_INT_ENABLE(chip->vendor.locality));
50419 + release_locality(chip, chip->vendor.locality, 1);
50420 + if (chip->vendor.irq)
50421 + free_irq(chip->vendor.irq, chip);
50422 + iounmap(i->iobase);
50423 + list_del(&i->list);
50424 + tpm_remove_hardware(chip->dev);
50425 + }
50426 + spin_unlock(&tis_lock);
50427 + pnp_unregister_driver(&tis_pnp_driver);
50428 +}
50429 +
50430 +module_init(init_tis);
50431 +module_exit(cleanup_tis);
50432 +MODULE_AUTHOR("Leendert van Doorn (leendert@watson.ibm.com)");
50433 +MODULE_DESCRIPTION("TPM Driver");
50434 +MODULE_VERSION("2.0");
50435 +MODULE_LICENSE("GPL");
50436 diff -Nur linux-2.6.16.33-noxen/drivers/char/tpm/tpm_vtpm.c linux-2.6.16.33/drivers/char/tpm/tpm_vtpm.c
50437 --- linux-2.6.16.33-noxen/drivers/char/tpm/tpm_vtpm.c 1970-01-01 00:00:00.000000000 +0000
50438 +++ linux-2.6.16.33/drivers/char/tpm/tpm_vtpm.c 2007-01-08 15:00:45.000000000 +0000
50439 @@ -0,0 +1,547 @@
50440 +/*
50441 + * Copyright (C) 2006 IBM Corporation
50442 + *
50443 + * Authors:
50444 + * Stefan Berger <stefanb@us.ibm.com>
50445 + *
50446 + * Generic device driver part for device drivers in a virtualized
50447 + * environment.
50448 + *
50449 + * This program is free software; you can redistribute it and/or
50450 + * modify it under the terms of the GNU General Public License as
50451 + * published by the Free Software Foundation, version 2 of the
50452 + * License.
50453 + *
50454 + */
50455 +
50456 +#include <asm/uaccess.h>
50457 +#include <linux/list.h>
50458 +#include <linux/device.h>
50459 +#include <linux/interrupt.h>
50460 +#include <linux/platform_device.h>
50461 +#include "tpm.h"
50462 +#include "tpm_vtpm.h"
50463 +
50464 +/* read status bits */
50465 +enum {
50466 + STATUS_BUSY = 0x01,
50467 + STATUS_DATA_AVAIL = 0x02,
50468 + STATUS_READY = 0x04
50469 +};
50470 +
50471 +struct transmission {
50472 + struct list_head next;
50473 +
50474 + unsigned char *request;
50475 + size_t request_len;
50476 + size_t request_buflen;
50477 +
50478 + unsigned char *response;
50479 + size_t response_len;
50480 + size_t response_buflen;
50481 +
50482 + unsigned int flags;
50483 +};
50484 +
50485 +enum {
50486 + TRANSMISSION_FLAG_WAS_QUEUED = 0x1
50487 +};
50488 +
50489 +
50490 +enum {
50491 + DATAEX_FLAG_QUEUED_ONLY = 0x1
50492 +};
50493 +
50494 +
50495 +/* local variables */
50496 +
50497 +/* local function prototypes */
50498 +static int _vtpm_send_queued(struct tpm_chip *chip);
50499 +
50500 +
50501 +/* =============================================================
50502 + * Some utility functions
50503 + * =============================================================
50504 + */
50505 +static void vtpm_state_init(struct vtpm_state *vtpms)
50506 +{
50507 + vtpms->current_request = NULL;
50508 + spin_lock_init(&vtpms->req_list_lock);
50509 + init_waitqueue_head(&vtpms->req_wait_queue);
50510 + INIT_LIST_HEAD(&vtpms->queued_requests);
50511 +
50512 + vtpms->current_response = NULL;
50513 + spin_lock_init(&vtpms->resp_list_lock);
50514 + init_waitqueue_head(&vtpms->resp_wait_queue);
50515 +
50516 + vtpms->disconnect_time = jiffies;
50517 +}
50518 +
50519 +
50520 +static inline struct transmission *transmission_alloc(void)
50521 +{
50522 + return kzalloc(sizeof(struct transmission), GFP_ATOMIC);
50523 +}
50524 +
50525 +static unsigned char *
50526 +transmission_set_req_buffer(struct transmission *t,
50527 + unsigned char *buffer, size_t len)
50528 +{
50529 + if (t->request_buflen < len) {
50530 + kfree(t->request);
50531 + t->request = kmalloc(len, GFP_KERNEL);
50532 + if (!t->request) {
50533 + t->request_buflen = 0;
50534 + return NULL;
50535 + }
50536 + t->request_buflen = len;
50537 + }
50538 +
50539 + memcpy(t->request, buffer, len);
50540 + t->request_len = len;
50541 +
50542 + return t->request;
50543 +}
50544 +
50545 +static unsigned char *
50546 +transmission_set_res_buffer(struct transmission *t,
50547 + const unsigned char *buffer, size_t len)
50548 +{
50549 + if (t->response_buflen < len) {
50550 + kfree(t->response);
50551 + t->response = kmalloc(len, GFP_ATOMIC);
50552 + if (!t->response) {
50553 + t->response_buflen = 0;
50554 + return NULL;
50555 + }
50556 + t->response_buflen = len;
50557 + }
50558 +
50559 + memcpy(t->response, buffer, len);
50560 + t->response_len = len;
50561 +
50562 + return t->response;
50563 +}
50564 +
50565 +static inline void transmission_free(struct transmission *t)
50566 +{
50567 + kfree(t->request);
50568 + kfree(t->response);
50569 + kfree(t);
50570 +}
50571 +
50572 +/* =============================================================
50573 + * Interface with the lower layer driver
50574 + * =============================================================
50575 + */
50576 +/*
50577 + * Lower layer uses this function to make a response available.
50578 + */
50579 +int vtpm_vd_recv(const struct tpm_chip *chip,
50580 + const unsigned char *buffer, size_t count,
50581 + void *ptr)
50582 +{
50583 + unsigned long flags;
50584 + int ret_size = 0;
50585 + struct transmission *t;
50586 + struct vtpm_state *vtpms;
50587 +
50588 + vtpms = (struct vtpm_state *)chip_get_private(chip);
50589 +
50590 + /*
50591 + * The list with requests must contain one request
50592 + * only and the element there must be the one that
50593 + * was passed to me from the front-end.
50594 + */
50595 + spin_lock_irqsave(&vtpms->resp_list_lock, flags);
50596 + if (vtpms->current_request != ptr) {
50597 + spin_unlock_irqrestore(&vtpms->resp_list_lock, flags);
50598 + return 0;
50599 + }
50600 +
50601 + if ((t = vtpms->current_request)) {
50602 + transmission_free(t);
50603 + vtpms->current_request = NULL;
50604 + }
50605 +
50606 + t = transmission_alloc();
50607 + if (t) {
50608 + if (!transmission_set_res_buffer(t, buffer, count)) {
50609 + transmission_free(t);
50610 + spin_unlock_irqrestore(&vtpms->resp_list_lock, flags);
50611 + return -ENOMEM;
50612 + }
50613 + ret_size = count;
50614 + vtpms->current_response = t;
50615 + wake_up_interruptible(&vtpms->resp_wait_queue);
50616 + }
50617 + spin_unlock_irqrestore(&vtpms->resp_list_lock, flags);
50618 +
50619 + return ret_size;
50620 +}
50621 +
50622 +
50623 +/*
50624 + * Lower layer indicates its status (connected/disconnected)
50625 + */
50626 +void vtpm_vd_status(const struct tpm_chip *chip, u8 vd_status)
50627 +{
50628 + struct vtpm_state *vtpms;
50629 +
50630 + vtpms = (struct vtpm_state *)chip_get_private(chip);
50631 +
50632 + vtpms->vd_status = vd_status;
50633 + if ((vtpms->vd_status & TPM_VD_STATUS_CONNECTED) == 0) {
50634 + vtpms->disconnect_time = jiffies;
50635 + }
50636 +}
50637 +
50638 +/* =============================================================
50639 + * Interface with the generic TPM driver
50640 + * =============================================================
50641 + */
50642 +static int vtpm_recv(struct tpm_chip *chip, u8 *buf, size_t count)
50643 +{
50644 + int rc = 0;
50645 + unsigned long flags;
50646 + struct vtpm_state *vtpms;
50647 +
50648 + vtpms = (struct vtpm_state *)chip_get_private(chip);
50649 +
50650 + /*
50651 + * Check if the previous operation only queued the command
50652 + * In this case there won't be a response, so I just
50653 + * return from here and reset that flag. In any other
50654 + * case I should receive a response from the back-end.
50655 + */
50656 + spin_lock_irqsave(&vtpms->resp_list_lock, flags);
50657 + if ((vtpms->flags & DATAEX_FLAG_QUEUED_ONLY) != 0) {
50658 + vtpms->flags &= ~DATAEX_FLAG_QUEUED_ONLY;
50659 + spin_unlock_irqrestore(&vtpms->resp_list_lock, flags);
50660 + /*
50661 + * The first few commands (measurements) must be
50662 + * queued since it might not be possible to talk to the
50663 + * TPM yet.
50664 + * Return a response of up to 30 '0's.
50665 + */
50666 +
50667 + count = min_t(size_t, count, 30);
50668 + memset(buf, 0x0, count);
50669 + return count;
50670 + }
50671 + /*
50672 + * Check whether something is in the responselist and if
50673 + * there's nothing in the list wait for something to appear.
50674 + */
50675 +
50676 + if (!vtpms->current_response) {
50677 + spin_unlock_irqrestore(&vtpms->resp_list_lock, flags);
50678 + interruptible_sleep_on_timeout(&vtpms->resp_wait_queue,
50679 + 1000);
50680 + spin_lock_irqsave(&vtpms->resp_list_lock ,flags);
50681 + }
50682 +
50683 + if (vtpms->current_response) {
50684 + struct transmission *t = vtpms->current_response;
50685 + vtpms->current_response = NULL;
50686 + rc = min(count, t->response_len);
50687 + memcpy(buf, t->response, rc);
50688 + transmission_free(t);
50689 + }
50690 +
50691 + spin_unlock_irqrestore(&vtpms->resp_list_lock, flags);
50692 + return rc;
50693 +}
50694 +
50695 +static int vtpm_send(struct tpm_chip *chip, u8 *buf, size_t count)
50696 +{
50697 + int rc = 0;
50698 + unsigned long flags;
50699 + struct transmission *t = transmission_alloc();
50700 + struct vtpm_state *vtpms;
50701 +
50702 + vtpms = (struct vtpm_state *)chip_get_private(chip);
50703 +
50704 + if (!t)
50705 + return -ENOMEM;
50706 + /*
50707 + * If there's a current request, it must be the
50708 + * previous request that has timed out.
50709 + */
50710 + spin_lock_irqsave(&vtpms->req_list_lock, flags);
50711 + if (vtpms->current_request != NULL) {
50712 + printk("WARNING: Sending although there is a request outstanding.\n"
50713 + " Previous request must have timed out.\n");
50714 + transmission_free(vtpms->current_request);
50715 + vtpms->current_request = NULL;
50716 + }
50717 + spin_unlock_irqrestore(&vtpms->req_list_lock, flags);
50718 +
50719 + /*
50720 + * Queue the packet if the driver below is not
50721 + * ready yet, or there is any packet already
50722 + * in the queue.
50723 + * If the driver below is ready, unqueue all
50724 + * packets first before sending our current
50725 + * packet.
50726 + * For each unqueued packet, except for the
50727 + * last (=current) packet, call the function
50728 + * tpm_xen_recv to wait for the response to come
50729 + * back.
50730 + */
50731 + if ((vtpms->vd_status & TPM_VD_STATUS_CONNECTED) == 0) {
50732 + if (time_after(jiffies,
50733 + vtpms->disconnect_time + HZ * 10)) {
50734 + rc = -ENOENT;
50735 + } else {
50736 + goto queue_it;
50737 + }
50738 + } else {
50739 + /*
50740 + * Send all queued packets.
50741 + */
50742 + if (_vtpm_send_queued(chip) == 0) {
50743 +
50744 + vtpms->current_request = t;
50745 +
50746 + rc = vtpm_vd_send(vtpms->tpm_private,
50747 + buf,
50748 + count,
50749 + t);
50750 + /*
50751 + * The generic TPM driver will call
50752 + * the function to receive the response.
50753 + */
50754 + if (rc < 0) {
50755 + vtpms->current_request = NULL;
50756 + goto queue_it;
50757 + }
50758 + } else {
50759 +queue_it:
50760 + if (!transmission_set_req_buffer(t, buf, count)) {
50761 + transmission_free(t);
50762 + rc = -ENOMEM;
50763 + goto exit;
50764 + }
50765 + /*
50766 + * An error occurred. Don't even try
50767 + * to send the current request. Just
50768 + * queue it.
50769 + */
50770 + spin_lock_irqsave(&vtpms->req_list_lock, flags);
50771 + vtpms->flags |= DATAEX_FLAG_QUEUED_ONLY;
50772 + list_add_tail(&t->next, &vtpms->queued_requests);
50773 + spin_unlock_irqrestore(&vtpms->req_list_lock, flags);
50774 + }
50775 + }
50776 +
50777 +exit:
50778 + return rc;
50779 +}
50780 +
50781 +
50782 +/*
50783 + * Send all queued requests.
50784 + */
50785 +static int _vtpm_send_queued(struct tpm_chip *chip)
50786 +{
50787 + int rc;
50788 + int error = 0;
50789 + long flags;
50790 + unsigned char buffer[1];
50791 + struct vtpm_state *vtpms;
50792 + vtpms = (struct vtpm_state *)chip_get_private(chip);
50793 +
50794 + spin_lock_irqsave(&vtpms->req_list_lock, flags);
50795 +
50796 + while (!list_empty(&vtpms->queued_requests)) {
50797 + /*
50798 + * Need to dequeue them.
50799 + * Read the result into a dummy buffer.
50800 + */
50801 + struct transmission *qt = (struct transmission *)
50802 + vtpms->queued_requests.next;
50803 + list_del(&qt->next);
50804 + vtpms->current_request = qt;
50805 + spin_unlock_irqrestore(&vtpms->req_list_lock, flags);
50806 +
50807 + rc = vtpm_vd_send(vtpms->tpm_private,
50808 + qt->request,
50809 + qt->request_len,
50810 + qt);
50811 +
50812 + if (rc < 0) {
50813 + spin_lock_irqsave(&vtpms->req_list_lock, flags);
50814 + if ((qt = vtpms->current_request) != NULL) {
50815 + /*
50816 + * requeue it at the beginning
50817 + * of the list
50818 + */
50819 + list_add(&qt->next,
50820 + &vtpms->queued_requests);
50821 + }
50822 + vtpms->current_request = NULL;
50823 + error = 1;
50824 + break;
50825 + }
50826 + /*
50827 + * After this point qt is not valid anymore!
50828 + * It is freed when the front-end is delivering
50829 + * the data by calling tpm_recv
50830 + */
50831 + /*
50832 + * Receive response into provided dummy buffer
50833 + */
50834 + rc = vtpm_recv(chip, buffer, sizeof(buffer));
50835 + spin_lock_irqsave(&vtpms->req_list_lock, flags);
50836 + }
50837 +
50838 + spin_unlock_irqrestore(&vtpms->req_list_lock, flags);
50839 +
50840 + return error;
50841 +}
50842 +
50843 +static void vtpm_cancel(struct tpm_chip *chip)
50844 +{
50845 + unsigned long flags;
50846 + struct vtpm_state *vtpms = (struct vtpm_state *)chip_get_private(chip);
50847 +
50848 + spin_lock_irqsave(&vtpms->resp_list_lock,flags);
50849 +
50850 + if (!vtpms->current_response && vtpms->current_request) {
50851 + spin_unlock_irqrestore(&vtpms->resp_list_lock, flags);
50852 + interruptible_sleep_on(&vtpms->resp_wait_queue);
50853 + spin_lock_irqsave(&vtpms->resp_list_lock,flags);
50854 + }
50855 +
50856 + if (vtpms->current_response) {
50857 + struct transmission *t = vtpms->current_response;
50858 + vtpms->current_response = NULL;
50859 + transmission_free(t);
50860 + }
50861 +
50862 + spin_unlock_irqrestore(&vtpms->resp_list_lock,flags);
50863 +}
50864 +
50865 +static u8 vtpm_status(struct tpm_chip *chip)
50866 +{
50867 + u8 rc = 0;
50868 + unsigned long flags;
50869 + struct vtpm_state *vtpms;
50870 +
50871 + vtpms = (struct vtpm_state *)chip_get_private(chip);
50872 +
50873 + spin_lock_irqsave(&vtpms->resp_list_lock, flags);
50874 + /*
50875 + * Data are available if:
50876 + * - there's a current response
50877 + * - the last packet was queued only (this is fake, but necessary to
50878 + * get the generic TPM layer to call the receive function.)
50879 + */
50880 + if (vtpms->current_response ||
50881 + 0 != (vtpms->flags & DATAEX_FLAG_QUEUED_ONLY)) {
50882 + rc = STATUS_DATA_AVAIL;
50883 + } else if (!vtpms->current_response && !vtpms->current_request) {
50884 + rc = STATUS_READY;
50885 + }
50886 +
50887 + spin_unlock_irqrestore(&vtpms->resp_list_lock, flags);
50888 + return rc;
50889 +}
50890 +
50891 +static struct file_operations vtpm_ops = {
50892 + .owner = THIS_MODULE,
50893 + .llseek = no_llseek,
50894 + .open = tpm_open,
50895 + .read = tpm_read,
50896 + .write = tpm_write,
50897 + .release = tpm_release,
50898 +};
50899 +
50900 +static DEVICE_ATTR(pubek, S_IRUGO, tpm_show_pubek, NULL);
50901 +static DEVICE_ATTR(pcrs, S_IRUGO, tpm_show_pcrs, NULL);
50902 +static DEVICE_ATTR(enabled, S_IRUGO, tpm_show_enabled, NULL);
50903 +static DEVICE_ATTR(active, S_IRUGO, tpm_show_active, NULL);
50904 +static DEVICE_ATTR(owned, S_IRUGO, tpm_show_owned, NULL);
50905 +static DEVICE_ATTR(temp_deactivated, S_IRUGO, tpm_show_temp_deactivated,
50906 + NULL);
50907 +static DEVICE_ATTR(caps, S_IRUGO, tpm_show_caps, NULL);
50908 +static DEVICE_ATTR(cancel, S_IWUSR |S_IWGRP, NULL, tpm_store_cancel);
50909 +
50910 +static struct attribute *vtpm_attrs[] = {
50911 + &dev_attr_pubek.attr,
50912 + &dev_attr_pcrs.attr,
50913 + &dev_attr_enabled.attr,
50914 + &dev_attr_active.attr,
50915 + &dev_attr_owned.attr,
50916 + &dev_attr_temp_deactivated.attr,
50917 + &dev_attr_caps.attr,
50918 + &dev_attr_cancel.attr,
50919 + NULL,
50920 +};
50921 +
50922 +static struct attribute_group vtpm_attr_grp = { .attrs = vtpm_attrs };
50923 +
50924 +#define TPM_LONG_TIMEOUT (10 * 60 * HZ)
50925 +
50926 +static struct tpm_vendor_specific tpm_vtpm = {
50927 + .recv = vtpm_recv,
50928 + .send = vtpm_send,
50929 + .cancel = vtpm_cancel,
50930 + .status = vtpm_status,
50931 + .req_complete_mask = STATUS_BUSY | STATUS_DATA_AVAIL,
50932 + .req_complete_val = STATUS_DATA_AVAIL,
50933 + .req_canceled = STATUS_READY,
50934 + .attr_group = &vtpm_attr_grp,
50935 + .miscdev = {
50936 + .fops = &vtpm_ops,
50937 + },
50938 + .duration = {
50939 + TPM_LONG_TIMEOUT,
50940 + TPM_LONG_TIMEOUT,
50941 + TPM_LONG_TIMEOUT,
50942 + },
50943 +};
50944 +
50945 +struct tpm_chip *init_vtpm(struct device *dev,
50946 + struct tpm_virtual_device *tvd,
50947 + struct tpm_private *tp)
50948 +{
50949 + long rc;
50950 + struct tpm_chip *chip;
50951 + struct vtpm_state *vtpms;
50952 +
50953 + vtpms = kzalloc(sizeof(struct vtpm_state), GFP_KERNEL);
50954 + if (!vtpms)
50955 + return ERR_PTR(-ENOMEM);
50956 +
50957 + vtpm_state_init(vtpms);
50958 + vtpms->tpmvd = tvd;
50959 + vtpms->tpm_private = tp;
50960 +
50961 + if (tvd)
50962 + tpm_vtpm.buffersize = tvd->max_tx_size;
50963 +
50964 + chip = tpm_register_hardware(dev, &tpm_vtpm);
50965 + if (!chip) {
50966 + rc = -ENODEV;
50967 + goto err_free_mem;
50968 + }
50969 +
50970 + chip_set_private(chip, vtpms);
50971 +
50972 + return chip;
50973 +
50974 +err_free_mem:
50975 + kfree(vtpms);
50976 +
50977 + return ERR_PTR(rc);
50978 +}
50979 +
50980 +void cleanup_vtpm(struct device *dev)
50981 +{
50982 + struct tpm_chip *chip = dev_get_drvdata(dev);
50983 + struct vtpm_state *vtpms = (struct vtpm_state*)chip_get_private(chip);
50984 + tpm_remove_hardware(dev);
50985 + kfree(vtpms);
50986 +}
50987 diff -Nur linux-2.6.16.33-noxen/drivers/char/tpm/tpm_vtpm.h linux-2.6.16.33/drivers/char/tpm/tpm_vtpm.h
50988 --- linux-2.6.16.33-noxen/drivers/char/tpm/tpm_vtpm.h 1970-01-01 00:00:00.000000000 +0000
50989 +++ linux-2.6.16.33/drivers/char/tpm/tpm_vtpm.h 2007-01-08 15:00:45.000000000 +0000
50990 @@ -0,0 +1,68 @@
50991 +#ifndef TPM_VTPM_H
50992 +#define TPM_VTPM_H
50993 +
50994 +struct tpm_chip;
50995 +struct tpm_private;
50996 +
50997 +struct tpm_virtual_device {
50998 + /*
50999 + * This field indicates the maximum size the driver can
51000 + * transfer in one chunk. It is filled in by the front-end
51001 + * driver and should be propagated to the generic tpm driver
51002 + * for allocation of buffers.
51003 + */
51004 + unsigned int max_tx_size;
51005 +};
51006 +
51007 +struct vtpm_state {
51008 + struct transmission *current_request;
51009 + spinlock_t req_list_lock;
51010 + wait_queue_head_t req_wait_queue;
51011 +
51012 + struct list_head queued_requests;
51013 +
51014 + struct transmission *current_response;
51015 + spinlock_t resp_list_lock;
51016 + wait_queue_head_t resp_wait_queue; // processes waiting for responses
51017 +
51018 + u8 vd_status;
51019 + u8 flags;
51020 +
51021 + unsigned long disconnect_time;
51022 +
51023 + struct tpm_virtual_device *tpmvd;
51024 +
51025 + /*
51026 + * The following is a private structure of the underlying
51027 + * driver. It is passed as parameter in the send function.
51028 + */
51029 + struct tpm_private *tpm_private;
51030 +};
51031 +
51032 +
51033 +enum vdev_status {
51034 + TPM_VD_STATUS_DISCONNECTED = 0x0,
51035 + TPM_VD_STATUS_CONNECTED = 0x1
51036 +};
51037 +
51038 +/* this function is called from tpm_vtpm.c */
51039 +int vtpm_vd_send(struct tpm_private * tp,
51040 + const u8 * buf, size_t count, void *ptr);
51041 +
51042 +/* these functions are offered by tpm_vtpm.c */
51043 +struct tpm_chip *init_vtpm(struct device *,
51044 + struct tpm_virtual_device *,
51045 + struct tpm_private *);
51046 +void cleanup_vtpm(struct device *);
51047 +int vtpm_vd_recv(const struct tpm_chip* chip,
51048 + const unsigned char *buffer, size_t count, void *ptr);
51049 +void vtpm_vd_status(const struct tpm_chip *, u8 status);
51050 +
51051 +static inline struct tpm_private *tpm_private_from_dev(struct device *dev)
51052 +{
51053 + struct tpm_chip *chip = dev_get_drvdata(dev);
51054 + struct vtpm_state *vtpms = chip_get_private(chip);
51055 + return vtpms->tpm_private;
51056 +}
51057 +
51058 +#endif
51059 diff -Nur linux-2.6.16.33-noxen/drivers/char/tpm/tpm_xen.c linux-2.6.16.33/drivers/char/tpm/tpm_xen.c
51060 --- linux-2.6.16.33-noxen/drivers/char/tpm/tpm_xen.c 1970-01-01 00:00:00.000000000 +0000
51061 +++ linux-2.6.16.33/drivers/char/tpm/tpm_xen.c 2007-01-08 15:00:45.000000000 +0000
51062 @@ -0,0 +1,760 @@
51063 +/*
51064 + * Copyright (c) 2005, IBM Corporation
51065 + *
51066 + * Author: Stefan Berger, stefanb@us.ibm.com
51067 + * Grant table support: Mahadevan Gomathisankaran
51068 + *
51069 + * This code has been derived from drivers/xen/netfront/netfront.c
51070 + *
51071 + * Copyright (c) 2002-2004, K A Fraser
51072 + *
51073 + * This program is free software; you can redistribute it and/or
51074 + * modify it under the terms of the GNU General Public License version 2
51075 + * as published by the Free Software Foundation; or, when distributed
51076 + * separately from the Linux kernel or incorporated into other
51077 + * software packages, subject to the following license:
51078 + *
51079 + * Permission is hereby granted, free of charge, to any person obtaining a copy
51080 + * of this source file (the "Software"), to deal in the Software without
51081 + * restriction, including without limitation the rights to use, copy, modify,
51082 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
51083 + * and to permit persons to whom the Software is furnished to do so, subject to
51084 + * the following conditions:
51085 + *
51086 + * The above copyright notice and this permission notice shall be included in
51087 + * all copies or substantial portions of the Software.
51088 + *
51089 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
51090 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
51091 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
51092 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
51093 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
51094 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
51095 + * IN THE SOFTWARE.
51096 + */
51097 +
51098 +#include <linux/errno.h>
51099 +#include <linux/err.h>
51100 +#include <linux/interrupt.h>
51101 +#include <linux/mutex.h>
51102 +#include <asm/uaccess.h>
51103 +#include <xen/evtchn.h>
51104 +#include <xen/interface/grant_table.h>
51105 +#include <xen/interface/io/tpmif.h>
51106 +#include <xen/gnttab.h>
51107 +#include <xen/xenbus.h>
51108 +#include "tpm.h"
51109 +#include "tpm_vtpm.h"
51110 +
51111 +#undef DEBUG
51112 +
51113 +/* local structures */
51114 +struct tpm_private {
51115 + struct tpm_chip *chip;
51116 +
51117 + tpmif_tx_interface_t *tx;
51118 + atomic_t refcnt;
51119 + unsigned int evtchn;
51120 + unsigned int irq;
51121 + u8 is_connected;
51122 + u8 is_suspended;
51123 +
51124 + spinlock_t tx_lock;
51125 +
51126 + struct tx_buffer *tx_buffers[TPMIF_TX_RING_SIZE];
51127 +
51128 + atomic_t tx_busy;
51129 + void *tx_remember;
51130 +
51131 + domid_t backend_id;
51132 + wait_queue_head_t wait_q;
51133 +
51134 + struct xenbus_device *dev;
51135 + int ring_ref;
51136 +};
51137 +
51138 +struct tx_buffer {
51139 + unsigned int size; /* available space in data */
51140 + unsigned int len; /* used space in data */
51141 + unsigned char *data; /* pointer to a page */
51142 +};
51143 +
51144 +
51145 +/* locally visible variables */
51146 +static grant_ref_t gref_head;
51147 +static struct tpm_private *my_priv;
51148 +
51149 +/* local function prototypes */
51150 +static irqreturn_t tpmif_int(int irq,
51151 + void *tpm_priv,
51152 + struct pt_regs *ptregs);
51153 +static void tpmif_rx_action(unsigned long unused);
51154 +static int tpmif_connect(struct xenbus_device *dev,
51155 + struct tpm_private *tp,
51156 + domid_t domid);
51157 +static DECLARE_TASKLET(tpmif_rx_tasklet, tpmif_rx_action, 0);
51158 +static int tpmif_allocate_tx_buffers(struct tpm_private *tp);
51159 +static void tpmif_free_tx_buffers(struct tpm_private *tp);
51160 +static void tpmif_set_connected_state(struct tpm_private *tp,
51161 + u8 newstate);
51162 +static int tpm_xmit(struct tpm_private *tp,
51163 + const u8 * buf, size_t count, int userbuffer,
51164 + void *remember);
51165 +static void destroy_tpmring(struct tpm_private *tp);
51166 +void __exit tpmif_exit(void);
51167 +
51168 +#define DPRINTK(fmt, args...) \
51169 + pr_debug("xen_tpm_fr (%s:%d) " fmt, __FUNCTION__, __LINE__, ##args)
51170 +#define IPRINTK(fmt, args...) \
51171 + printk(KERN_INFO "xen_tpm_fr: " fmt, ##args)
51172 +#define WPRINTK(fmt, args...) \
51173 + printk(KERN_WARNING "xen_tpm_fr: " fmt, ##args)
51174 +
51175 +#define GRANT_INVALID_REF 0
51176 +
51177 +
51178 +static inline int
51179 +tx_buffer_copy(struct tx_buffer *txb, const u8 * src, int len,
51180 + int isuserbuffer)
51181 +{
51182 + int copied = len;
51183 +
51184 + if (len > txb->size) {
51185 + copied = txb->size;
51186 + }
51187 + if (isuserbuffer) {
51188 + if (copy_from_user(txb->data, src, copied))
51189 + return -EFAULT;
51190 + } else {
51191 + memcpy(txb->data, src, copied);
51192 + }
51193 + txb->len = len;
51194 + return copied;
51195 +}
51196 +
51197 +static inline struct tx_buffer *tx_buffer_alloc(void)
51198 +{
51199 + struct tx_buffer *txb = kzalloc(sizeof (struct tx_buffer),
51200 + GFP_KERNEL);
51201 +
51202 + if (txb) {
51203 + txb->len = 0;
51204 + txb->size = PAGE_SIZE;
51205 + txb->data = (unsigned char *)__get_free_page(GFP_KERNEL);
51206 + if (txb->data == NULL) {
51207 + kfree(txb);
51208 + txb = NULL;
51209 + }
51210 + }
51211 + return txb;
51212 +}
51213 +
51214 +
51215 +static inline void tx_buffer_free(struct tx_buffer *txb)
51216 +{
51217 + if (txb) {
51218 + free_page((long)txb->data);
51219 + kfree(txb);
51220 + }
51221 +}
51222 +
51223 +/**************************************************************
51224 + Utility function for the tpm_private structure
51225 +**************************************************************/
51226 +static inline void tpm_private_init(struct tpm_private *tp)
51227 +{
51228 + spin_lock_init(&tp->tx_lock);
51229 + init_waitqueue_head(&tp->wait_q);
51230 + atomic_set(&tp->refcnt, 1);
51231 +}
51232 +
51233 +static inline void tpm_private_put(void)
51234 +{
51235 + if (atomic_dec_and_test(&my_priv->refcnt)) {
51236 + tpmif_free_tx_buffers(my_priv);
51237 + kfree(my_priv);
51238 + my_priv = NULL;
51239 + }
51240 +}
51241 +
51242 +static struct tpm_private *tpm_private_get(void)
51243 +{
51244 + int err;
51245 + if (!my_priv) {
51246 + my_priv = kzalloc(sizeof(struct tpm_private), GFP_KERNEL);
51247 + if (my_priv) {
51248 + tpm_private_init(my_priv);
51249 + err = tpmif_allocate_tx_buffers(my_priv);
51250 + if (err < 0) {
51251 + tpm_private_put();
51252 + }
51253 + }
51254 + } else {
51255 + atomic_inc(&my_priv->refcnt);
51256 + }
51257 + return my_priv;
51258 +}
51259 +
51260 +/**************************************************************
51261 +
51262 + The interface to let the tpm plugin register its callback
51263 + function and send data to another partition using this module
51264 +
51265 +**************************************************************/
51266 +
51267 +static DEFINE_MUTEX(suspend_lock);
51268 +/*
51269 + * Send data via this module by calling this function
51270 + */
51271 +int vtpm_vd_send(struct tpm_private *tp,
51272 + const u8 * buf, size_t count, void *ptr)
51273 +{
51274 + int sent;
51275 +
51276 + mutex_lock(&suspend_lock);
51277 + sent = tpm_xmit(tp, buf, count, 0, ptr);
51278 + mutex_unlock(&suspend_lock);
51279 +
51280 + return sent;
51281 +}
51282 +
51283 +/**************************************************************
51284 + XENBUS support code
51285 +**************************************************************/
51286 +
51287 +static int setup_tpmring(struct xenbus_device *dev,
51288 + struct tpm_private *tp)
51289 +{
51290 + tpmif_tx_interface_t *sring;
51291 + int err;
51292 +
51293 + tp->ring_ref = GRANT_INVALID_REF;
51294 +
51295 + sring = (void *)__get_free_page(GFP_KERNEL);
51296 + if (!sring) {
51297 + xenbus_dev_fatal(dev, -ENOMEM, "allocating shared ring");
51298 + return -ENOMEM;
51299 + }
51300 + tp->tx = sring;
51301 +
51302 + err = xenbus_grant_ring(dev, virt_to_mfn(tp->tx));
51303 + if (err < 0) {
51304 + free_page((unsigned long)sring);
51305 + tp->tx = NULL;
51306 + xenbus_dev_fatal(dev, err, "allocating grant reference");
51307 + goto fail;
51308 + }
51309 + tp->ring_ref = err;
51310 +
51311 + err = tpmif_connect(dev, tp, dev->otherend_id);
51312 + if (err)
51313 + goto fail;
51314 +
51315 + return 0;
51316 +fail:
51317 + destroy_tpmring(tp);
51318 + return err;
51319 +}
51320 +
51321 +
51322 +static void destroy_tpmring(struct tpm_private *tp)
51323 +{
51324 + tpmif_set_connected_state(tp, 0);
51325 +
51326 + if (tp->ring_ref != GRANT_INVALID_REF) {
51327 + gnttab_end_foreign_access(tp->ring_ref, 0,
51328 + (unsigned long)tp->tx);
51329 + tp->ring_ref = GRANT_INVALID_REF;
51330 + tp->tx = NULL;
51331 + }
51332 +
51333 + if (tp->irq)
51334 + unbind_from_irqhandler(tp->irq, tp);
51335 +
51336 + tp->evtchn = tp->irq = 0;
51337 +}
51338 +
51339 +
51340 +static int talk_to_backend(struct xenbus_device *dev,
51341 + struct tpm_private *tp)
51342 +{
51343 + const char *message = NULL;
51344 + int err;
51345 + struct xenbus_transaction xbt;
51346 +
51347 + err = setup_tpmring(dev, tp);
51348 + if (err) {
51349 + xenbus_dev_fatal(dev, err, "setting up ring");
51350 + goto out;
51351 + }
51352 +
51353 +again:
51354 + err = xenbus_transaction_start(&xbt);
51355 + if (err) {
51356 + xenbus_dev_fatal(dev, err, "starting transaction");
51357 + goto destroy_tpmring;
51358 + }
51359 +
51360 + err = xenbus_printf(xbt, dev->nodename,
51361 + "ring-ref","%u", tp->ring_ref);
51362 + if (err) {
51363 + message = "writing ring-ref";
51364 + goto abort_transaction;
51365 + }
51366 +
51367 + err = xenbus_printf(xbt, dev->nodename,
51368 + "event-channel", "%u", tp->evtchn);
51369 + if (err) {
51370 + message = "writing event-channel";
51371 + goto abort_transaction;
51372 + }
51373 +
51374 + err = xenbus_transaction_end(xbt, 0);
51375 + if (err == -EAGAIN)
51376 + goto again;
51377 + if (err) {
51378 + xenbus_dev_fatal(dev, err, "completing transaction");
51379 + goto destroy_tpmring;
51380 + }
51381 +
51382 + xenbus_switch_state(dev, XenbusStateConnected);
51383 +
51384 + return 0;
51385 +
51386 +abort_transaction:
51387 + xenbus_transaction_end(xbt, 1);
51388 + if (message)
51389 + xenbus_dev_error(dev, err, "%s", message);
51390 +destroy_tpmring:
51391 + destroy_tpmring(tp);
51392 +out:
51393 + return err;
51394 +}
51395 +
51396 +/**
51397 + * Callback received when the backend's state changes.
51398 + */
51399 +static void backend_changed(struct xenbus_device *dev,
51400 + enum xenbus_state backend_state)
51401 +{
51402 + struct tpm_private *tp = tpm_private_from_dev(&dev->dev);
51403 + DPRINTK("\n");
51404 +
51405 + switch (backend_state) {
51406 + case XenbusStateInitialising:
51407 + case XenbusStateInitWait:
51408 + case XenbusStateInitialised:
51409 + case XenbusStateUnknown:
51410 + break;
51411 +
51412 + case XenbusStateConnected:
51413 + tpmif_set_connected_state(tp, 1);
51414 + break;
51415 +
51416 + case XenbusStateClosing:
51417 + tpmif_set_connected_state(tp, 0);
51418 + xenbus_frontend_closed(dev);
51419 + break;
51420 +
51421 + case XenbusStateClosed:
51422 + tpmif_set_connected_state(tp, 0);
51423 + if (tp->is_suspended == 0)
51424 + device_unregister(&dev->dev);
51425 + xenbus_frontend_closed(dev);
51426 + break;
51427 + }
51428 +}
51429 +
51430 +static struct tpm_virtual_device tvd = {
51431 + .max_tx_size = PAGE_SIZE * TPMIF_TX_RING_SIZE,
51432 +};
51433 +
51434 +static int tpmfront_probe(struct xenbus_device *dev,
51435 + const struct xenbus_device_id *id)
51436 +{
51437 + int err;
51438 + int handle;
51439 + struct tpm_private *tp = tpm_private_get();
51440 +
51441 + if (!tp)
51442 + return -ENOMEM;
51443 +
51444 + tp->chip = init_vtpm(&dev->dev, &tvd, tp);
51445 +
51446 + if (IS_ERR(tp->chip)) {
51447 + return PTR_ERR(tp->chip);
51448 + }
51449 +
51450 + err = xenbus_scanf(XBT_NIL, dev->nodename,
51451 + "handle", "%i", &handle);
51452 + if (XENBUS_EXIST_ERR(err))
51453 + return err;
51454 +
51455 + if (err < 0) {
51456 + xenbus_dev_fatal(dev,err,"reading virtual-device");
51457 + return err;
51458 + }
51459 +
51460 + tp->dev = dev;
51461 +
51462 + err = talk_to_backend(dev, tp);
51463 + if (err) {
51464 + tpm_private_put();
51465 + return err;
51466 + }
51467 + return 0;
51468 +}
51469 +
51470 +
51471 +static int tpmfront_remove(struct xenbus_device *dev)
51472 +{
51473 + struct tpm_private *tp = tpm_private_from_dev(&dev->dev);
51474 + destroy_tpmring(tp);
51475 + cleanup_vtpm(&dev->dev);
51476 + return 0;
51477 +}
51478 +
51479 +static int tpmfront_suspend(struct xenbus_device *dev)
51480 +{
51481 + struct tpm_private *tp = tpm_private_from_dev(&dev->dev);
51482 + u32 ctr;
51483 + /* lock, so no app can send */
51484 + mutex_lock(&suspend_lock);
51485 + tp->is_suspended = 1;
51486 +
51487 + for (ctr = 0; atomic_read(&tp->tx_busy) && ctr <= 300; ctr++) {
51488 + if ((ctr % 10) == 0)
51489 + printk("TPM-FE [INFO]: Waiting for outstanding "
51490 + "request.\n");
51491 + /*
51492 + * Wait for a request to be responded to.
51493 + */
51494 + interruptible_sleep_on_timeout(&tp->wait_q, 100);
51495 + }
51496 + xenbus_switch_state(dev, XenbusStateClosing);
51497 +
51498 + if (atomic_read(&tp->tx_busy)) {
51499 + /*
51500 + * A temporary work-around.
51501 + */
51502 + printk("TPM-FE [WARNING]: Resetting busy flag.");
51503 + atomic_set(&tp->tx_busy, 0);
51504 + }
51505 +
51506 + return 0;
51507 +}
51508 +
51509 +static int tpmfront_resume(struct xenbus_device *dev)
51510 +{
51511 + struct tpm_private *tp = tpm_private_from_dev(&dev->dev);
51512 + destroy_tpmring(tp);
51513 + return talk_to_backend(dev, tp);
51514 +}
51515 +
51516 +static int tpmif_connect(struct xenbus_device *dev,
51517 + struct tpm_private *tp,
51518 + domid_t domid)
51519 +{
51520 + int err;
51521 +
51522 + tp->backend_id = domid;
51523 +
51524 + err = xenbus_alloc_evtchn(dev, &tp->evtchn);
51525 + if (err)
51526 + return err;
51527 +
51528 + err = bind_evtchn_to_irqhandler(tp->evtchn,
51529 + tpmif_int, SA_SAMPLE_RANDOM, "tpmif",
51530 + tp);
51531 + if (err <= 0) {
51532 + WPRINTK("bind_evtchn_to_irqhandler failed (err=%d)\n", err);
51533 + return err;
51534 + }
51535 +
51536 + tp->irq = err;
51537 + return 0;
51538 +}
51539 +
51540 +static struct xenbus_device_id tpmfront_ids[] = {
51541 + { "vtpm" },
51542 + { "" }
51543 +};
51544 +
51545 +static struct xenbus_driver tpmfront = {
51546 + .name = "vtpm",
51547 + .owner = THIS_MODULE,
51548 + .ids = tpmfront_ids,
51549 + .probe = tpmfront_probe,
51550 + .remove = tpmfront_remove,
51551 + .resume = tpmfront_resume,
51552 + .otherend_changed = backend_changed,
51553 + .suspend = tpmfront_suspend,
51554 +};
51555 +
51556 +static void __init init_tpm_xenbus(void)
51557 +{
51558 + xenbus_register_frontend(&tpmfront);
51559 +}
51560 +
51561 +static void __exit exit_tpm_xenbus(void)
51562 +{
51563 + xenbus_unregister_driver(&tpmfront);
51564 +}
51565 +
51566 +static int tpmif_allocate_tx_buffers(struct tpm_private *tp)
51567 +{
51568 + unsigned int i;
51569 +
51570 + for (i = 0; i < TPMIF_TX_RING_SIZE; i++) {
51571 + tp->tx_buffers[i] = tx_buffer_alloc();
51572 + if (!tp->tx_buffers[i]) {
51573 + tpmif_free_tx_buffers(tp);
51574 + return -ENOMEM;
51575 + }
51576 + }
51577 + return 0;
51578 +}
51579 +
51580 +static void tpmif_free_tx_buffers(struct tpm_private *tp)
51581 +{
51582 + unsigned int i;
51583 +
51584 + for (i = 0; i < TPMIF_TX_RING_SIZE; i++) {
51585 + tx_buffer_free(tp->tx_buffers[i]);
51586 + }
51587 +}
51588 +
51589 +static void tpmif_rx_action(unsigned long priv)
51590 +{
51591 + struct tpm_private *tp = (struct tpm_private *)priv;
51592 +
51593 + int i = 0;
51594 + unsigned int received;
51595 + unsigned int offset = 0;
51596 + u8 *buffer;
51597 + tpmif_tx_request_t *tx;
51598 + tx = &tp->tx->ring[i].req;
51599 +
51600 + atomic_set(&tp->tx_busy, 0);
51601 + wake_up_interruptible(&tp->wait_q);
51602 +
51603 + received = tx->size;
51604 +
51605 + buffer = kmalloc(received, GFP_ATOMIC);
51606 + if (NULL == buffer) {
51607 + goto exit;
51608 + }
51609 +
51610 + for (i = 0; i < TPMIF_TX_RING_SIZE && offset < received; i++) {
51611 + struct tx_buffer *txb = tp->tx_buffers[i];
51612 + tpmif_tx_request_t *tx;
51613 + unsigned int tocopy;
51614 +
51615 + tx = &tp->tx->ring[i].req;
51616 + tocopy = tx->size;
51617 + if (tocopy > PAGE_SIZE) {
51618 + tocopy = PAGE_SIZE;
51619 + }
51620 +
51621 + memcpy(&buffer[offset], txb->data, tocopy);
51622 +
51623 + gnttab_release_grant_reference(&gref_head, tx->ref);
51624 +
51625 + offset += tocopy;
51626 + }
51627 +
51628 + vtpm_vd_recv(tp->chip, buffer, received, tp->tx_remember);
51629 + kfree(buffer);
51630 +
51631 +exit:
51632 +
51633 + return;
51634 +}
51635 +
51636 +
51637 +static irqreturn_t tpmif_int(int irq, void *tpm_priv, struct pt_regs *ptregs)
51638 +{
51639 + struct tpm_private *tp = tpm_priv;
51640 + unsigned long flags;
51641 +
51642 + spin_lock_irqsave(&tp->tx_lock, flags);
51643 + tpmif_rx_tasklet.data = (unsigned long)tp;
51644 + tasklet_schedule(&tpmif_rx_tasklet);
51645 + spin_unlock_irqrestore(&tp->tx_lock, flags);
51646 +
51647 + return IRQ_HANDLED;
51648 +}
51649 +
51650 +
51651 +static int tpm_xmit(struct tpm_private *tp,
51652 + const u8 * buf, size_t count, int isuserbuffer,
51653 + void *remember)
51654 +{
51655 + tpmif_tx_request_t *tx;
51656 + TPMIF_RING_IDX i;
51657 + unsigned int offset = 0;
51658 +
51659 + spin_lock_irq(&tp->tx_lock);
51660 +
51661 + if (unlikely(atomic_read(&tp->tx_busy))) {
51662 + printk("tpm_xmit: There's an outstanding request/response "
51663 + "on the way!\n");
51664 + spin_unlock_irq(&tp->tx_lock);
51665 + return -EBUSY;
51666 + }
51667 +
51668 + if (tp->is_connected != 1) {
51669 + spin_unlock_irq(&tp->tx_lock);
51670 + return -EIO;
51671 + }
51672 +
51673 + for (i = 0; count > 0 && i < TPMIF_TX_RING_SIZE; i++) {
51674 + struct tx_buffer *txb = tp->tx_buffers[i];
51675 + int copied;
51676 +
51677 + if (NULL == txb) {
51678 + DPRINTK("txb (i=%d) is NULL. buffers initilized?\n"
51679 + "Not transmitting anything!\n", i);
51680 + spin_unlock_irq(&tp->tx_lock);
51681 + return -EFAULT;
51682 + }
51683 + copied = tx_buffer_copy(txb, &buf[offset], count,
51684 + isuserbuffer);
51685 + if (copied < 0) {
51686 + /* An error occurred */
51687 + spin_unlock_irq(&tp->tx_lock);
51688 + return copied;
51689 + }
51690 + count -= copied;
51691 + offset += copied;
51692 +
51693 + tx = &tp->tx->ring[i].req;
51694 +
51695 + tx->addr = virt_to_machine(txb->data);
51696 + tx->size = txb->len;
51697 +
51698 + DPRINTK("First 4 characters sent by TPM-FE are 0x%02x 0x%02x 0x%02x 0x%02x\n",
51699 + txb->data[0],txb->data[1],txb->data[2],txb->data[3]);
51700 +
51701 + /* get the granttable reference for this page */
51702 + tx->ref = gnttab_claim_grant_reference(&gref_head);
51703 +
51704 + if (-ENOSPC == tx->ref) {
51705 + spin_unlock_irq(&tp->tx_lock);
51706 + DPRINTK(" Grant table claim reference failed in func:%s line:%d file:%s\n", __FUNCTION__, __LINE__, __FILE__);
51707 + return -ENOSPC;
51708 + }
51709 + gnttab_grant_foreign_access_ref( tx->ref,
51710 + tp->backend_id,
51711 + (tx->addr >> PAGE_SHIFT),
51712 + 0 /*RW*/);
51713 + wmb();
51714 + }
51715 +
51716 + atomic_set(&tp->tx_busy, 1);
51717 + tp->tx_remember = remember;
51718 +
51719 + mb();
51720 +
51721 + DPRINTK("Notifying backend via event channel %d\n",
51722 + tp->evtchn);
51723 +
51724 + notify_remote_via_irq(tp->irq);
51725 +
51726 + spin_unlock_irq(&tp->tx_lock);
51727 + return offset;
51728 +}
51729 +
51730 +
51731 +static void tpmif_notify_upperlayer(struct tpm_private *tp)
51732 +{
51733 + /*
51734 + * Notify upper layer about the state of the connection
51735 + * to the BE.
51736 + */
51737 + if (tp->is_connected) {
51738 + vtpm_vd_status(tp->chip, TPM_VD_STATUS_CONNECTED);
51739 + } else {
51740 + vtpm_vd_status(tp->chip, TPM_VD_STATUS_DISCONNECTED);
51741 + }
51742 +}
51743 +
51744 +
51745 +static void tpmif_set_connected_state(struct tpm_private *tp, u8 is_connected)
51746 +{
51747 + /*
51748 + * Don't notify upper layer if we are in suspend mode and
51749 + * should disconnect - the assumption is that we will resume.
51750 + * The mutex keeps apps from sending.
51751 + */
51752 + if (is_connected == 0 && tp->is_suspended == 1) {
51753 + return;
51754 + }
51755 +
51756 + /*
51757 + * Unlock the mutex if we are connected again
51758 + * after being suspended - now resuming.
51759 + * This also removes the suspend state.
51760 + */
51761 + if (is_connected == 1 && tp->is_suspended == 1) {
51762 + tp->is_suspended = 0;
51763 + /* unlock, so apps can resume sending */
51764 + mutex_unlock(&suspend_lock);
51765 + }
51766 +
51767 + if (is_connected != tp->is_connected) {
51768 + tp->is_connected = is_connected;
51769 + tpmif_notify_upperlayer(tp);
51770 + }
51771 +}
51772 +
51773 +
51774 +
51775 +/* =================================================================
51776 + * Initialization function.
51777 + * =================================================================
51778 + */
51779 +
51780 +
51781 +static int __init tpmif_init(void)
51782 +{
51783 + long rc = 0;
51784 + struct tpm_private *tp;
51785 +
51786 + if (is_initial_xendomain())
51787 + return -EPERM;
51788 +
51789 + tp = tpm_private_get();
51790 + if (!tp) {
51791 + rc = -ENOMEM;
51792 + goto failexit;
51793 + }
51794 +
51795 + IPRINTK("Initialising the vTPM driver.\n");
51796 + if (gnttab_alloc_grant_references(TPMIF_TX_RING_SIZE,
51797 + &gref_head) < 0) {
51798 + rc = -EFAULT;
51799 + goto gnttab_alloc_failed;
51800 + }
51801 +
51802 + init_tpm_xenbus();
51803 + return 0;
51804 +
51805 +gnttab_alloc_failed:
51806 + tpm_private_put();
51807 +failexit:
51808 +
51809 + return (int)rc;
51810 +}
51811 +
51812 +
51813 +void __exit tpmif_exit(void)
51814 +{
51815 + exit_tpm_xenbus();
51816 + tpm_private_put();
51817 + gnttab_free_grant_references(gref_head);
51818 +}
51819 +
51820 +module_init(tpmif_init);
51821 +
51822 +MODULE_LICENSE("Dual BSD/GPL");
51823 diff -Nur linux-2.6.16.33-noxen/drivers/char/tty_io.c linux-2.6.16.33/drivers/char/tty_io.c
51824 --- linux-2.6.16.33-noxen/drivers/char/tty_io.c 2006-11-22 18:06:31.000000000 +0000
51825 +++ linux-2.6.16.33/drivers/char/tty_io.c 2007-01-08 15:00:45.000000000 +0000
51826 @@ -132,6 +132,8 @@
51827 vt.c for deeply disgusting hack reasons */
51828 DECLARE_MUTEX(tty_sem);
51829
51830 +int console_use_vt = 1;
51831 +
51832 #ifdef CONFIG_UNIX98_PTYS
51833 extern struct tty_driver *ptm_driver; /* Unix98 pty masters; for /dev/ptmx */
51834 extern int pty_limit; /* Config limit on Unix98 ptys */
51835 @@ -2054,7 +2056,7 @@
51836 goto got_driver;
51837 }
51838 #ifdef CONFIG_VT
51839 - if (device == MKDEV(TTY_MAJOR,0)) {
51840 + if (console_use_vt && (device == MKDEV(TTY_MAJOR,0))) {
51841 extern struct tty_driver *console_driver;
51842 driver = console_driver;
51843 index = fg_console;
51844 @@ -3245,6 +3247,8 @@
51845 #endif
51846
51847 #ifdef CONFIG_VT
51848 + if (!console_use_vt)
51849 + goto out_vt;
51850 cdev_init(&vc0_cdev, &console_fops);
51851 if (cdev_add(&vc0_cdev, MKDEV(TTY_MAJOR, 0), 1) ||
51852 register_chrdev_region(MKDEV(TTY_MAJOR, 0), 1, "/dev/vc/0") < 0)
51853 @@ -3253,6 +3257,7 @@
51854 class_device_create(tty_class, NULL, MKDEV(TTY_MAJOR, 0), NULL, "tty0");
51855
51856 vty_init();
51857 + out_vt:
51858 #endif
51859 return 0;
51860 }
51861 diff -Nur linux-2.6.16.33-noxen/drivers/firmware/Kconfig linux-2.6.16.33/drivers/firmware/Kconfig
51862 --- linux-2.6.16.33-noxen/drivers/firmware/Kconfig 2006-11-22 18:06:31.000000000 +0000
51863 +++ linux-2.6.16.33/drivers/firmware/Kconfig 2007-01-08 15:00:45.000000000 +0000
51864 @@ -8,7 +8,7 @@
51865 config EDD
51866 tristate "BIOS Enhanced Disk Drive calls determine boot disk (EXPERIMENTAL)"
51867 depends on EXPERIMENTAL
51868 - depends on !IA64
51869 + depends on !IA64 && !XEN
51870 help
51871 Say Y or M here if you want to enable BIOS Enhanced Disk Drive
51872 Services real mode BIOS calls to determine which disk
51873 diff -Nur linux-2.6.16.33-noxen/drivers/ide/ide-lib.c linux-2.6.16.33/drivers/ide/ide-lib.c
51874 --- linux-2.6.16.33-noxen/drivers/ide/ide-lib.c 2006-11-22 18:06:31.000000000 +0000
51875 +++ linux-2.6.16.33/drivers/ide/ide-lib.c 2007-05-23 21:00:01.000000000 +0000
51876 @@ -410,10 +410,10 @@
51877 {
51878 u64 addr = BLK_BOUNCE_HIGH; /* dma64_addr_t */
51879
51880 - if (!PCI_DMA_BUS_IS_PHYS) {
51881 - addr = BLK_BOUNCE_ANY;
51882 - } else if (on && drive->media == ide_disk) {
51883 - if (HWIF(drive)->pci_dev)
51884 + if (on && drive->media == ide_disk) {
51885 + if (!PCI_DMA_BUS_IS_PHYS)
51886 + addr = BLK_BOUNCE_ANY;
51887 + else if (HWIF(drive)->pci_dev)
51888 addr = HWIF(drive)->pci_dev->dma_mask;
51889 }
51890
51891 diff -Nur linux-2.6.16.33-noxen/drivers/infiniband/ulp/ipoib/ipoib_multicast.c linux-2.6.16.33/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
51892 --- linux-2.6.16.33-noxen/drivers/infiniband/ulp/ipoib/ipoib_multicast.c 2006-11-22 18:06:31.000000000 +0000
51893 +++ linux-2.6.16.33/drivers/infiniband/ulp/ipoib/ipoib_multicast.c 2007-05-23 21:00:01.000000000 +0000
51894 @@ -821,7 +821,8 @@
51895
51896 ipoib_mcast_stop_thread(dev, 0);
51897
51898 - spin_lock_irqsave(&dev->xmit_lock, flags);
51899 + local_irq_save(flags);
51900 + netif_tx_lock(dev);
51901 spin_lock(&priv->lock);
51902
51903 /*
51904 @@ -896,7 +897,8 @@
51905 }
51906
51907 spin_unlock(&priv->lock);
51908 - spin_unlock_irqrestore(&dev->xmit_lock, flags);
51909 + netif_tx_unlock(dev);
51910 + local_irq_restore(flags);
51911
51912 /* We have to cancel outside of the spinlock */
51913 list_for_each_entry_safe(mcast, tmcast, &remove_list, list) {
51914 diff -Nur linux-2.6.16.33-noxen/drivers/media/dvb/dvb-core/dvb_net.c linux-2.6.16.33/drivers/media/dvb/dvb-core/dvb_net.c
51915 --- linux-2.6.16.33-noxen/drivers/media/dvb/dvb-core/dvb_net.c 2006-11-22 18:06:31.000000000 +0000
51916 +++ linux-2.6.16.33/drivers/media/dvb/dvb-core/dvb_net.c 2007-05-23 21:00:01.000000000 +0000
51917 @@ -1053,7 +1053,7 @@
51918
51919 dvb_net_feed_stop(dev);
51920 priv->rx_mode = RX_MODE_UNI;
51921 - spin_lock_bh(&dev->xmit_lock);
51922 + netif_tx_lock_bh(dev);
51923
51924 if (dev->flags & IFF_PROMISC) {
51925 dprintk("%s: promiscuous mode\n", dev->name);
51926 @@ -1078,7 +1078,7 @@
51927 }
51928 }
51929
51930 - spin_unlock_bh(&dev->xmit_lock);
51931 + netif_tx_unlock_bh(dev);
51932 dvb_net_feed_start(dev);
51933 }
51934
51935 diff -Nur linux-2.6.16.33-noxen/drivers/net/8139cp.c linux-2.6.16.33/drivers/net/8139cp.c
51936 --- linux-2.6.16.33-noxen/drivers/net/8139cp.c 2006-11-22 18:06:31.000000000 +0000
51937 +++ linux-2.6.16.33/drivers/net/8139cp.c 2007-05-23 21:00:01.000000000 +0000
51938 @@ -794,7 +794,7 @@
51939 entry = cp->tx_head;
51940 eor = (entry == (CP_TX_RING_SIZE - 1)) ? RingEnd : 0;
51941 if (dev->features & NETIF_F_TSO)
51942 - mss = skb_shinfo(skb)->tso_size;
51943 + mss = skb_shinfo(skb)->gso_size;
51944
51945 if (skb_shinfo(skb)->nr_frags == 0) {
51946 struct cp_desc *txd = &cp->tx_ring[entry];
51947 diff -Nur linux-2.6.16.33-noxen/drivers/net/bnx2.c linux-2.6.16.33/drivers/net/bnx2.c
51948 --- linux-2.6.16.33-noxen/drivers/net/bnx2.c 2006-11-22 18:06:31.000000000 +0000
51949 +++ linux-2.6.16.33/drivers/net/bnx2.c 2007-05-23 21:00:01.000000000 +0000
51950 @@ -1593,7 +1593,7 @@
51951 skb = tx_buf->skb;
51952 #ifdef BCM_TSO
51953 /* partial BD completions possible with TSO packets */
51954 - if (skb_shinfo(skb)->tso_size) {
51955 + if (skb_is_gso(skb)) {
51956 u16 last_idx, last_ring_idx;
51957
51958 last_idx = sw_cons +
51959 @@ -1948,7 +1948,7 @@
51960 return 1;
51961 }
51962
51963 -/* Called with rtnl_lock from vlan functions and also dev->xmit_lock
51964 +/* Called with rtnl_lock from vlan functions and also netif_tx_lock
51965 * from set_multicast.
51966 */
51967 static void
51968 @@ -4403,7 +4403,7 @@
51969 }
51970 #endif
51971
51972 -/* Called with dev->xmit_lock.
51973 +/* Called with netif_tx_lock.
51974 * hard_start_xmit is pseudo-lockless - a lock is only required when
51975 * the tx queue is full. This way, we get the benefit of lockless
51976 * operations most of the time without the complexities to handle
51977 @@ -4441,7 +4441,7 @@
51978 (TX_BD_FLAGS_VLAN_TAG | (vlan_tx_tag_get(skb) << 16));
51979 }
51980 #ifdef BCM_TSO
51981 - if ((mss = skb_shinfo(skb)->tso_size) &&
51982 + if ((mss = skb_shinfo(skb)->gso_size) &&
51983 (skb->len > (bp->dev->mtu + ETH_HLEN))) {
51984 u32 tcp_opt_len, ip_tcp_len;
51985
51986 diff -Nur linux-2.6.16.33-noxen/drivers/net/bonding/bond_main.c linux-2.6.16.33/drivers/net/bonding/bond_main.c
51987 --- linux-2.6.16.33-noxen/drivers/net/bonding/bond_main.c 2006-11-22 18:06:31.000000000 +0000
51988 +++ linux-2.6.16.33/drivers/net/bonding/bond_main.c 2007-05-23 21:00:01.000000000 +0000
51989 @@ -1145,8 +1145,7 @@
51990 }
51991
51992 #define BOND_INTERSECT_FEATURES \
51993 - (NETIF_F_SG|NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM|\
51994 - NETIF_F_TSO|NETIF_F_UFO)
51995 + (NETIF_F_SG | NETIF_F_ALL_CSUM | NETIF_F_TSO | NETIF_F_UFO)
51996
51997 /*
51998 * Compute the common dev->feature set available to all slaves. Some
51999 @@ -1164,9 +1163,7 @@
52000 features &= (slave->dev->features & BOND_INTERSECT_FEATURES);
52001
52002 if ((features & NETIF_F_SG) &&
52003 - !(features & (NETIF_F_IP_CSUM |
52004 - NETIF_F_NO_CSUM |
52005 - NETIF_F_HW_CSUM)))
52006 + !(features & NETIF_F_ALL_CSUM))
52007 features &= ~NETIF_F_SG;
52008
52009 /*
52010 @@ -4147,7 +4144,7 @@
52011 */
52012 bond_dev->features |= NETIF_F_VLAN_CHALLENGED;
52013
52014 - /* don't acquire bond device's xmit_lock when
52015 + /* don't acquire bond device's netif_tx_lock when
52016 * transmitting */
52017 bond_dev->features |= NETIF_F_LLTX;
52018
52019 diff -Nur linux-2.6.16.33-noxen/drivers/net/chelsio/sge.c linux-2.6.16.33/drivers/net/chelsio/sge.c
52020 --- linux-2.6.16.33-noxen/drivers/net/chelsio/sge.c 2006-11-22 18:06:31.000000000 +0000
52021 +++ linux-2.6.16.33/drivers/net/chelsio/sge.c 2007-05-23 21:00:01.000000000 +0000
52022 @@ -1419,7 +1419,7 @@
52023 struct cpl_tx_pkt *cpl;
52024
52025 #ifdef NETIF_F_TSO
52026 - if (skb_shinfo(skb)->tso_size) {
52027 + if (skb_is_gso(skb)) {
52028 int eth_type;
52029 struct cpl_tx_pkt_lso *hdr;
52030
52031 @@ -1434,7 +1434,7 @@
52032 hdr->ip_hdr_words = skb->nh.iph->ihl;
52033 hdr->tcp_hdr_words = skb->h.th->doff;
52034 hdr->eth_type_mss = htons(MK_ETH_TYPE_MSS(eth_type,
52035 - skb_shinfo(skb)->tso_size));
52036 + skb_shinfo(skb)->gso_size));
52037 hdr->len = htonl(skb->len - sizeof(*hdr));
52038 cpl = (struct cpl_tx_pkt *)hdr;
52039 sge->stats.tx_lso_pkts++;
52040 diff -Nur linux-2.6.16.33-noxen/drivers/net/e1000/e1000_main.c linux-2.6.16.33/drivers/net/e1000/e1000_main.c
52041 --- linux-2.6.16.33-noxen/drivers/net/e1000/e1000_main.c 2006-11-22 18:06:31.000000000 +0000
52042 +++ linux-2.6.16.33/drivers/net/e1000/e1000_main.c 2007-05-23 21:00:01.000000000 +0000
52043 @@ -2526,7 +2526,7 @@
52044 uint8_t ipcss, ipcso, tucss, tucso, hdr_len;
52045 int err;
52046
52047 - if (skb_shinfo(skb)->tso_size) {
52048 + if (skb_is_gso(skb)) {
52049 if (skb_header_cloned(skb)) {
52050 err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
52051 if (err)
52052 @@ -2534,7 +2534,7 @@
52053 }
52054
52055 hdr_len = ((skb->h.raw - skb->data) + (skb->h.th->doff << 2));
52056 - mss = skb_shinfo(skb)->tso_size;
52057 + mss = skb_shinfo(skb)->gso_size;
52058 if (skb->protocol == ntohs(ETH_P_IP)) {
52059 skb->nh.iph->tot_len = 0;
52060 skb->nh.iph->check = 0;
52061 @@ -2651,7 +2651,7 @@
52062 * tso gets written back prematurely before the data is fully
52063 * DMAd to the controller */
52064 if (!skb->data_len && tx_ring->last_tx_tso &&
52065 - !skb_shinfo(skb)->tso_size) {
52066 + !skb_is_gso(skb)) {
52067 tx_ring->last_tx_tso = 0;
52068 size -= 4;
52069 }
52070 @@ -2893,7 +2893,7 @@
52071 }
52072
52073 #ifdef NETIF_F_TSO
52074 - mss = skb_shinfo(skb)->tso_size;
52075 + mss = skb_shinfo(skb)->gso_size;
52076 /* The controller does a simple calculation to
52077 * make sure there is enough room in the FIFO before
52078 * initiating the DMA for each buffer. The calc is:
52079 @@ -2934,8 +2934,7 @@
52080
52081 #ifdef NETIF_F_TSO
52082 /* Controller Erratum workaround */
52083 - if (!skb->data_len && tx_ring->last_tx_tso &&
52084 - !skb_shinfo(skb)->tso_size)
52085 + if (!skb->data_len && tx_ring->last_tx_tso && !skb_is_gso(skb))
52086 count++;
52087 #endif
52088
52089 diff -Nur linux-2.6.16.33-noxen/drivers/net/forcedeth.c linux-2.6.16.33/drivers/net/forcedeth.c
52090 --- linux-2.6.16.33-noxen/drivers/net/forcedeth.c 2006-11-22 18:06:31.000000000 +0000
52091 +++ linux-2.6.16.33/drivers/net/forcedeth.c 2007-05-23 21:00:01.000000000 +0000
52092 @@ -482,9 +482,9 @@
52093 * critical parts:
52094 * - rx is (pseudo-) lockless: it relies on the single-threading provided
52095 * by the arch code for interrupts.
52096 - * - tx setup is lockless: it relies on dev->xmit_lock. Actual submission
52097 + * - tx setup is lockless: it relies on netif_tx_lock. Actual submission
52098 * needs dev->priv->lock :-(
52099 - * - set_multicast_list: preparation lockless, relies on dev->xmit_lock.
52100 + * - set_multicast_list: preparation lockless, relies on netif_tx_lock.
52101 */
52102
52103 /* in dev: base, irq */
52104 @@ -1016,7 +1016,7 @@
52105
52106 /*
52107 * nv_start_xmit: dev->hard_start_xmit function
52108 - * Called with dev->xmit_lock held.
52109 + * Called with netif_tx_lock held.
52110 */
52111 static int nv_start_xmit(struct sk_buff *skb, struct net_device *dev)
52112 {
52113 @@ -1105,8 +1105,8 @@
52114 np->tx_skbuff[nr] = skb;
52115
52116 #ifdef NETIF_F_TSO
52117 - if (skb_shinfo(skb)->tso_size)
52118 - tx_flags_extra = NV_TX2_TSO | (skb_shinfo(skb)->tso_size << NV_TX2_TSO_SHIFT);
52119 + if (skb_is_gso(skb))
52120 + tx_flags_extra = NV_TX2_TSO | (skb_shinfo(skb)->gso_size << NV_TX2_TSO_SHIFT);
52121 else
52122 #endif
52123 tx_flags_extra = (skb->ip_summed == CHECKSUM_HW ? (NV_TX2_CHECKSUM_L3|NV_TX2_CHECKSUM_L4) : 0);
52124 @@ -1203,7 +1203,7 @@
52125
52126 /*
52127 * nv_tx_timeout: dev->tx_timeout function
52128 - * Called with dev->xmit_lock held.
52129 + * Called with netif_tx_lock held.
52130 */
52131 static void nv_tx_timeout(struct net_device *dev)
52132 {
52133 @@ -1524,7 +1524,7 @@
52134 * Changing the MTU is a rare event, it shouldn't matter.
52135 */
52136 disable_irq(dev->irq);
52137 - spin_lock_bh(&dev->xmit_lock);
52138 + netif_tx_lock_bh(dev);
52139 spin_lock(&np->lock);
52140 /* stop engines */
52141 nv_stop_rx(dev);
52142 @@ -1559,7 +1559,7 @@
52143 nv_start_rx(dev);
52144 nv_start_tx(dev);
52145 spin_unlock(&np->lock);
52146 - spin_unlock_bh(&dev->xmit_lock);
52147 + netif_tx_unlock_bh(dev);
52148 enable_irq(dev->irq);
52149 }
52150 return 0;
52151 @@ -1594,7 +1594,7 @@
52152 memcpy(dev->dev_addr, macaddr->sa_data, ETH_ALEN);
52153
52154 if (netif_running(dev)) {
52155 - spin_lock_bh(&dev->xmit_lock);
52156 + netif_tx_lock_bh(dev);
52157 spin_lock_irq(&np->lock);
52158
52159 /* stop rx engine */
52160 @@ -1606,7 +1606,7 @@
52161 /* restart rx engine */
52162 nv_start_rx(dev);
52163 spin_unlock_irq(&np->lock);
52164 - spin_unlock_bh(&dev->xmit_lock);
52165 + netif_tx_unlock_bh(dev);
52166 } else {
52167 nv_copy_mac_to_hw(dev);
52168 }
52169 @@ -1615,7 +1615,7 @@
52170
52171 /*
52172 * nv_set_multicast: dev->set_multicast function
52173 - * Called with dev->xmit_lock held.
52174 + * Called with netif_tx_lock held.
52175 */
52176 static void nv_set_multicast(struct net_device *dev)
52177 {
52178 diff -Nur linux-2.6.16.33-noxen/drivers/net/hamradio/6pack.c linux-2.6.16.33/drivers/net/hamradio/6pack.c
52179 --- linux-2.6.16.33-noxen/drivers/net/hamradio/6pack.c 2006-11-22 18:06:31.000000000 +0000
52180 +++ linux-2.6.16.33/drivers/net/hamradio/6pack.c 2007-05-23 21:00:01.000000000 +0000
52181 @@ -308,9 +308,9 @@
52182 {
52183 struct sockaddr_ax25 *sa = addr;
52184
52185 - spin_lock_irq(&dev->xmit_lock);
52186 + netif_tx_lock_bh(dev);
52187 memcpy(dev->dev_addr, &sa->sax25_call, AX25_ADDR_LEN);
52188 - spin_unlock_irq(&dev->xmit_lock);
52189 + netif_tx_unlock_bh(dev);
52190
52191 return 0;
52192 }
52193 @@ -767,9 +767,9 @@
52194 break;
52195 }
52196
52197 - spin_lock_irq(&dev->xmit_lock);
52198 + netif_tx_lock_bh(dev);
52199 memcpy(dev->dev_addr, &addr, AX25_ADDR_LEN);
52200 - spin_unlock_irq(&dev->xmit_lock);
52201 + netif_tx_unlock_bh(dev);
52202
52203 err = 0;
52204 break;
52205 diff -Nur linux-2.6.16.33-noxen/drivers/net/hamradio/mkiss.c linux-2.6.16.33/drivers/net/hamradio/mkiss.c
52206 --- linux-2.6.16.33-noxen/drivers/net/hamradio/mkiss.c 2006-11-22 18:06:31.000000000 +0000
52207 +++ linux-2.6.16.33/drivers/net/hamradio/mkiss.c 2007-05-23 21:00:01.000000000 +0000
52208 @@ -357,9 +357,9 @@
52209 {
52210 struct sockaddr_ax25 *sa = addr;
52211
52212 - spin_lock_irq(&dev->xmit_lock);
52213 + netif_tx_lock_bh(dev);
52214 memcpy(dev->dev_addr, &sa->sax25_call, AX25_ADDR_LEN);
52215 - spin_unlock_irq(&dev->xmit_lock);
52216 + netif_tx_unlock_bh(dev);
52217
52218 return 0;
52219 }
52220 @@ -886,9 +886,9 @@
52221 break;
52222 }
52223
52224 - spin_lock_irq(&dev->xmit_lock);
52225 + netif_tx_lock_bh(dev);
52226 memcpy(dev->dev_addr, addr, AX25_ADDR_LEN);
52227 - spin_unlock_irq(&dev->xmit_lock);
52228 + netif_tx_unlock_bh(dev);
52229
52230 err = 0;
52231 break;
52232 diff -Nur linux-2.6.16.33-noxen/drivers/net/ifb.c linux-2.6.16.33/drivers/net/ifb.c
52233 --- linux-2.6.16.33-noxen/drivers/net/ifb.c 2006-11-22 18:06:31.000000000 +0000
52234 +++ linux-2.6.16.33/drivers/net/ifb.c 2007-05-23 21:00:01.000000000 +0000
52235 @@ -76,13 +76,13 @@
52236 dp->st_task_enter++;
52237 if ((skb = skb_peek(&dp->tq)) == NULL) {
52238 dp->st_txq_refl_try++;
52239 - if (spin_trylock(&_dev->xmit_lock)) {
52240 + if (netif_tx_trylock(_dev)) {
52241 dp->st_rxq_enter++;
52242 while ((skb = skb_dequeue(&dp->rq)) != NULL) {
52243 skb_queue_tail(&dp->tq, skb);
52244 dp->st_rx2tx_tran++;
52245 }
52246 - spin_unlock(&_dev->xmit_lock);
52247 + netif_tx_unlock(_dev);
52248 } else {
52249 /* reschedule */
52250 dp->st_rxq_notenter++;
52251 @@ -110,7 +110,7 @@
52252 }
52253 }
52254
52255 - if (spin_trylock(&_dev->xmit_lock)) {
52256 + if (netif_tx_trylock(_dev)) {
52257 dp->st_rxq_check++;
52258 if ((skb = skb_peek(&dp->rq)) == NULL) {
52259 dp->tasklet_pending = 0;
52260 @@ -118,10 +118,10 @@
52261 netif_wake_queue(_dev);
52262 } else {
52263 dp->st_rxq_rsch++;
52264 - spin_unlock(&_dev->xmit_lock);
52265 + netif_tx_unlock(_dev);
52266 goto resched;
52267 }
52268 - spin_unlock(&_dev->xmit_lock);
52269 + netif_tx_unlock(_dev);
52270 } else {
52271 resched:
52272 dp->tasklet_pending = 1;
52273 diff -Nur linux-2.6.16.33-noxen/drivers/net/irda/vlsi_ir.c linux-2.6.16.33/drivers/net/irda/vlsi_ir.c
52274 --- linux-2.6.16.33-noxen/drivers/net/irda/vlsi_ir.c 2006-11-22 18:06:31.000000000 +0000
52275 +++ linux-2.6.16.33/drivers/net/irda/vlsi_ir.c 2007-05-23 21:00:01.000000000 +0000
52276 @@ -959,7 +959,7 @@
52277 || (now.tv_sec==ready.tv_sec && now.tv_usec>=ready.tv_usec))
52278 break;
52279 udelay(100);
52280 - /* must not sleep here - we are called under xmit_lock! */
52281 + /* must not sleep here - called under netif_tx_lock! */
52282 }
52283 }
52284
52285 diff -Nur linux-2.6.16.33-noxen/drivers/net/ixgb/ixgb_main.c linux-2.6.16.33/drivers/net/ixgb/ixgb_main.c
52286 --- linux-2.6.16.33-noxen/drivers/net/ixgb/ixgb_main.c 2006-11-22 18:06:31.000000000 +0000
52287 +++ linux-2.6.16.33/drivers/net/ixgb/ixgb_main.c 2007-05-23 21:00:01.000000000 +0000
52288 @@ -1163,7 +1163,7 @@
52289 uint16_t ipcse, tucse, mss;
52290 int err;
52291
52292 - if(likely(skb_shinfo(skb)->tso_size)) {
52293 + if (likely(skb_is_gso(skb))) {
52294 if (skb_header_cloned(skb)) {
52295 err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
52296 if (err)
52297 @@ -1171,7 +1171,7 @@
52298 }
52299
52300 hdr_len = ((skb->h.raw - skb->data) + (skb->h.th->doff << 2));
52301 - mss = skb_shinfo(skb)->tso_size;
52302 + mss = skb_shinfo(skb)->gso_size;
52303 skb->nh.iph->tot_len = 0;
52304 skb->nh.iph->check = 0;
52305 skb->h.th->check = ~csum_tcpudp_magic(skb->nh.iph->saddr,
52306 diff -Nur linux-2.6.16.33-noxen/drivers/net/loopback.c linux-2.6.16.33/drivers/net/loopback.c
52307 --- linux-2.6.16.33-noxen/drivers/net/loopback.c 2006-11-22 18:06:31.000000000 +0000
52308 +++ linux-2.6.16.33/drivers/net/loopback.c 2007-05-23 21:00:01.000000000 +0000
52309 @@ -74,7 +74,7 @@
52310 struct iphdr *iph = skb->nh.iph;
52311 struct tcphdr *th = (struct tcphdr*)(skb->nh.raw + (iph->ihl * 4));
52312 unsigned int doffset = (iph->ihl + th->doff) * 4;
52313 - unsigned int mtu = skb_shinfo(skb)->tso_size + doffset;
52314 + unsigned int mtu = skb_shinfo(skb)->gso_size + doffset;
52315 unsigned int offset = 0;
52316 u32 seq = ntohl(th->seq);
52317 u16 id = ntohs(iph->id);
52318 @@ -139,7 +139,7 @@
52319 #endif
52320
52321 #ifdef LOOPBACK_TSO
52322 - if (skb_shinfo(skb)->tso_size) {
52323 + if (skb_is_gso(skb)) {
52324 BUG_ON(skb->protocol != htons(ETH_P_IP));
52325 BUG_ON(skb->nh.iph->protocol != IPPROTO_TCP);
52326
52327 diff -Nur linux-2.6.16.33-noxen/drivers/net/mv643xx_eth.c linux-2.6.16.33/drivers/net/mv643xx_eth.c
52328 --- linux-2.6.16.33-noxen/drivers/net/mv643xx_eth.c 2006-11-22 18:06:31.000000000 +0000
52329 +++ linux-2.6.16.33/drivers/net/mv643xx_eth.c 2007-05-23 21:00:01.000000000 +0000
52330 @@ -1107,7 +1107,7 @@
52331
52332 #ifdef MV643XX_CHECKSUM_OFFLOAD_TX
52333 if (has_tiny_unaligned_frags(skb)) {
52334 - if ((skb_linearize(skb, GFP_ATOMIC) != 0)) {
52335 + if (__skb_linearize(skb)) {
52336 stats->tx_dropped++;
52337 printk(KERN_DEBUG "%s: failed to linearize tiny "
52338 "unaligned fragment\n", dev->name);
52339 diff -Nur linux-2.6.16.33-noxen/drivers/net/natsemi.c linux-2.6.16.33/drivers/net/natsemi.c
52340 --- linux-2.6.16.33-noxen/drivers/net/natsemi.c 2006-11-22 18:06:31.000000000 +0000
52341 +++ linux-2.6.16.33/drivers/net/natsemi.c 2007-05-23 21:00:01.000000000 +0000
52342 @@ -323,12 +323,12 @@
52343 The rx process only runs in the interrupt handler. Access from outside
52344 the interrupt handler is only permitted after disable_irq().
52345
52346 -The rx process usually runs under the dev->xmit_lock. If np->intr_tx_reap
52347 +The rx process usually runs under the netif_tx_lock. If np->intr_tx_reap
52348 is set, then access is permitted under spin_lock_irq(&np->lock).
52349
52350 Thus configuration functions that want to access everything must call
52351 disable_irq(dev->irq);
52352 - spin_lock_bh(dev->xmit_lock);
52353 + netif_tx_lock_bh(dev);
52354 spin_lock_irq(&np->lock);
52355
52356 IV. Notes
52357 diff -Nur linux-2.6.16.33-noxen/drivers/net/r8169.c linux-2.6.16.33/drivers/net/r8169.c
52358 --- linux-2.6.16.33-noxen/drivers/net/r8169.c 2006-11-22 18:06:31.000000000 +0000
52359 +++ linux-2.6.16.33/drivers/net/r8169.c 2007-05-23 21:00:01.000000000 +0000
52360 @@ -2171,7 +2171,7 @@
52361 static inline u32 rtl8169_tso_csum(struct sk_buff *skb, struct net_device *dev)
52362 {
52363 if (dev->features & NETIF_F_TSO) {
52364 - u32 mss = skb_shinfo(skb)->tso_size;
52365 + u32 mss = skb_shinfo(skb)->gso_size;
52366
52367 if (mss)
52368 return LargeSend | ((mss & MSSMask) << MSSShift);
52369 diff -Nur linux-2.6.16.33-noxen/drivers/net/s2io.c linux-2.6.16.33/drivers/net/s2io.c
52370 --- linux-2.6.16.33-noxen/drivers/net/s2io.c 2006-11-22 18:06:31.000000000 +0000
52371 +++ linux-2.6.16.33/drivers/net/s2io.c 2007-05-23 21:00:01.000000000 +0000
52372 @@ -3522,8 +3522,8 @@
52373 txdp->Control_1 = 0;
52374 txdp->Control_2 = 0;
52375 #ifdef NETIF_F_TSO
52376 - mss = skb_shinfo(skb)->tso_size;
52377 - if (mss) {
52378 + mss = skb_shinfo(skb)->gso_size;
52379 + if (skb_shinfo(skb)->gso_type == SKB_GSO_TCPV4) {
52380 txdp->Control_1 |= TXD_TCP_LSO_EN;
52381 txdp->Control_1 |= TXD_TCP_LSO_MSS(mss);
52382 }
52383 @@ -3543,10 +3543,10 @@
52384 }
52385
52386 frg_len = skb->len - skb->data_len;
52387 - if (skb_shinfo(skb)->ufo_size) {
52388 + if (skb_shinfo(skb)->gso_type == SKB_GSO_UDPV4) {
52389 int ufo_size;
52390
52391 - ufo_size = skb_shinfo(skb)->ufo_size;
52392 + ufo_size = skb_shinfo(skb)->gso_size;
52393 ufo_size &= ~7;
52394 txdp->Control_1 |= TXD_UFO_EN;
52395 txdp->Control_1 |= TXD_UFO_MSS(ufo_size);
52396 @@ -3572,7 +3572,7 @@
52397 txdp->Host_Control = (unsigned long) skb;
52398 txdp->Control_1 |= TXD_BUFFER0_SIZE(frg_len);
52399
52400 - if (skb_shinfo(skb)->ufo_size)
52401 + if (skb_shinfo(skb)->gso_type == SKB_GSO_UDPV4)
52402 txdp->Control_1 |= TXD_UFO_EN;
52403
52404 frg_cnt = skb_shinfo(skb)->nr_frags;
52405 @@ -3587,12 +3587,12 @@
52406 (sp->pdev, frag->page, frag->page_offset,
52407 frag->size, PCI_DMA_TODEVICE);
52408 txdp->Control_1 = TXD_BUFFER0_SIZE(frag->size);
52409 - if (skb_shinfo(skb)->ufo_size)
52410 + if (skb_shinfo(skb)->gso_type == SKB_GSO_UDPV4)
52411 txdp->Control_1 |= TXD_UFO_EN;
52412 }
52413 txdp->Control_1 |= TXD_GATHER_CODE_LAST;
52414
52415 - if (skb_shinfo(skb)->ufo_size)
52416 + if (skb_shinfo(skb)->gso_type == SKB_GSO_UDPV4)
52417 frg_cnt++; /* as Txd0 was used for inband header */
52418
52419 tx_fifo = mac_control->tx_FIFO_start[queue];
52420 @@ -3606,7 +3606,7 @@
52421 if (mss)
52422 val64 |= TX_FIFO_SPECIAL_FUNC;
52423 #endif
52424 - if (skb_shinfo(skb)->ufo_size)
52425 + if (skb_shinfo(skb)->gso_type == SKB_GSO_UDPV4)
52426 val64 |= TX_FIFO_SPECIAL_FUNC;
52427 writeq(val64, &tx_fifo->List_Control);
52428
52429 diff -Nur linux-2.6.16.33-noxen/drivers/net/sky2.c linux-2.6.16.33/drivers/net/sky2.c
52430 --- linux-2.6.16.33-noxen/drivers/net/sky2.c 2006-11-22 18:06:31.000000000 +0000
52431 +++ linux-2.6.16.33/drivers/net/sky2.c 2007-05-23 21:00:01.000000000 +0000
52432 @@ -1141,7 +1141,7 @@
52433 count = sizeof(dma_addr_t) / sizeof(u32);
52434 count += skb_shinfo(skb)->nr_frags * count;
52435
52436 - if (skb_shinfo(skb)->tso_size)
52437 + if (skb_is_gso(skb))
52438 ++count;
52439
52440 if (skb->ip_summed == CHECKSUM_HW)
52441 @@ -1213,7 +1213,7 @@
52442 }
52443
52444 /* Check for TCP Segmentation Offload */
52445 - mss = skb_shinfo(skb)->tso_size;
52446 + mss = skb_shinfo(skb)->gso_size;
52447 if (mss != 0) {
52448 /* just drop the packet if non-linear expansion fails */
52449 if (skb_header_cloned(skb) &&
52450 diff -Nur linux-2.6.16.33-noxen/drivers/net/sky2.c~ linux-2.6.16.33/drivers/net/sky2.c~
52451 --- linux-2.6.16.33-noxen/drivers/net/sky2.c~ 1970-01-01 00:00:00.000000000 +0000
52452 +++ linux-2.6.16.33/drivers/net/sky2.c~ 2007-05-23 21:00:01.000000000 +0000
52453 @@ -0,0 +1,3425 @@
52454 +/*
52455 + * New driver for Marvell Yukon 2 chipset.
52456 + * Based on earlier sk98lin, and skge driver.
52457 + *
52458 + * This driver intentionally does not support all the features
52459 + * of the original driver such as link fail-over and link management because
52460 + * those should be done at higher levels.
52461 + *
52462 + * Copyright (C) 2005 Stephen Hemminger <shemminger@osdl.org>
52463 + *
52464 + * This program is free software; you can redistribute it and/or modify
52465 + * it under the terms of the GNU General Public License as published by
52466 + * the Free Software Foundation; either version 2 of the License, or
52467 + * (at your option) any later version.
52468 + *
52469 + * This program is distributed in the hope that it will be useful,
52470 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
52471 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
52472 + * GNU General Public License for more details.
52473 + *
52474 + * You should have received a copy of the GNU General Public License
52475 + * along with this program; if not, write to the Free Software
52476 + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
52477 + */
52478 +
52479 +#include <linux/config.h>
52480 +#include <linux/crc32.h>
52481 +#include <linux/kernel.h>
52482 +#include <linux/version.h>
52483 +#include <linux/module.h>
52484 +#include <linux/netdevice.h>
52485 +#include <linux/dma-mapping.h>
52486 +#include <linux/etherdevice.h>
52487 +#include <linux/ethtool.h>
52488 +#include <linux/pci.h>
52489 +#include <linux/ip.h>
52490 +#include <linux/tcp.h>
52491 +#include <linux/in.h>
52492 +#include <linux/delay.h>
52493 +#include <linux/workqueue.h>
52494 +#include <linux/if_vlan.h>
52495 +#include <linux/prefetch.h>
52496 +#include <linux/mii.h>
52497 +
52498 +#include <asm/irq.h>
52499 +
52500 +#if defined(CONFIG_VLAN_8021Q) || defined(CONFIG_VLAN_8021Q_MODULE)
52501 +#define SKY2_VLAN_TAG_USED 1
52502 +#endif
52503 +
52504 +#include "sky2.h"
52505 +
52506 +#define DRV_NAME "sky2"
52507 +#define DRV_VERSION "0.15"
52508 +#define PFX DRV_NAME " "
52509 +
52510 +/*
52511 + * The Yukon II chipset takes 64 bit command blocks (called list elements)
52512 + * that are organized into three (receive, transmit, status) different rings
52513 + * similar to Tigon3. A transmit can require several elements;
52514 + * a receive requires one (or two if using 64 bit dma).
52515 + */
52516 +
52517 +#define is_ec_a1(hw) \
52518 + unlikely((hw)->chip_id == CHIP_ID_YUKON_EC && \
52519 + (hw)->chip_rev == CHIP_REV_YU_EC_A1)
52520 +
52521 +#define RX_LE_SIZE 512
52522 +#define RX_LE_BYTES (RX_LE_SIZE*sizeof(struct sky2_rx_le))
52523 +#define RX_MAX_PENDING (RX_LE_SIZE/2 - 2)
52524 +#define RX_DEF_PENDING RX_MAX_PENDING
52525 +#define RX_SKB_ALIGN 8
52526 +
52527 +#define TX_RING_SIZE 512
52528 +#define TX_DEF_PENDING (TX_RING_SIZE - 1)
52529 +#define TX_MIN_PENDING 64
52530 +#define MAX_SKB_TX_LE (4 + (sizeof(dma_addr_t)/sizeof(u32))*MAX_SKB_FRAGS)
52531 +
52532 +#define STATUS_RING_SIZE 2048 /* 2 ports * (TX + 2*RX) */
52533 +#define STATUS_LE_BYTES (STATUS_RING_SIZE*sizeof(struct sky2_status_le))
52534 +#define ETH_JUMBO_MTU 9000
52535 +#define TX_WATCHDOG (5 * HZ)
52536 +#define NAPI_WEIGHT 64
52537 +#define PHY_RETRIES 1000
52538 +
52539 +static const u32 default_msg =
52540 + NETIF_MSG_DRV | NETIF_MSG_PROBE | NETIF_MSG_LINK
52541 + | NETIF_MSG_TIMER | NETIF_MSG_TX_ERR | NETIF_MSG_RX_ERR
52542 + | NETIF_MSG_IFUP | NETIF_MSG_IFDOWN;
52543 +
52544 +static int debug = -1; /* defaults above */
52545 +module_param(debug, int, 0);
52546 +MODULE_PARM_DESC(debug, "Debug level (0=none,...,16=all)");
52547 +
52548 +static int copybreak __read_mostly = 256;
52549 +module_param(copybreak, int, 0);
52550 +MODULE_PARM_DESC(copybreak, "Receive copy threshold");
52551 +
52552 +static const struct pci_device_id sky2_id_table[] = {
52553 + { PCI_DEVICE(PCI_VENDOR_ID_SYSKONNECT, 0x9000) },
52554 + { PCI_DEVICE(PCI_VENDOR_ID_SYSKONNECT, 0x9E00) },
52555 + { PCI_DEVICE(PCI_VENDOR_ID_DLINK, 0x4b00) },
52556 + { PCI_DEVICE(PCI_VENDOR_ID_DLINK, 0x4b01) },
52557 + { PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x4340) },
52558 + { PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x4341) },
52559 + { PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x4342) },
52560 + { PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x4343) },
52561 + { PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x4344) },
52562 + { PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x4345) },
52563 + { PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x4346) },
52564 + { PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x4347) },
52565 + { PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x4350) },
52566 + { PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x4351) },
52567 + { PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x4352) },
52568 + { PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x4360) },
52569 + { PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x4361) },
52570 + { PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x4362) },
52571 + { PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x4363) },
52572 + { 0 }
52573 +};
52574 +
52575 +MODULE_DEVICE_TABLE(pci, sky2_id_table);
52576 +
52577 +/* Avoid conditionals by using array */
52578 +static const unsigned txqaddr[] = { Q_XA1, Q_XA2 };
52579 +static const unsigned rxqaddr[] = { Q_R1, Q_R2 };
52580 +
52581 +/* This driver supports yukon2 chipset only */
52582 +static const char *yukon2_name[] = {
52583 + "XL", /* 0xb3 */
52584 + "EC Ultra", /* 0xb4 */
52585 + "UNKNOWN", /* 0xb5 */
52586 + "EC", /* 0xb6 */
52587 + "FE", /* 0xb7 */
52588 +};
52589 +
52590 +/* Access to external PHY */
52591 +static int gm_phy_write(struct sky2_hw *hw, unsigned port, u16 reg, u16 val)
52592 +{
52593 + int i;
52594 +
52595 + gma_write16(hw, port, GM_SMI_DATA, val);
52596 + gma_write16(hw, port, GM_SMI_CTRL,
52597 + GM_SMI_CT_PHY_AD(PHY_ADDR_MARV) | GM_SMI_CT_REG_AD(reg));
52598 +
52599 + for (i = 0; i < PHY_RETRIES; i++) {
52600 + if (!(gma_read16(hw, port, GM_SMI_CTRL) & GM_SMI_CT_BUSY))
52601 + return 0;
52602 + udelay(1);
52603 + }
52604 +
52605 + printk(KERN_WARNING PFX "%s: phy write timeout\n", hw->dev[port]->name);
52606 + return -ETIMEDOUT;
52607 +}
52608 +
52609 +static int __gm_phy_read(struct sky2_hw *hw, unsigned port, u16 reg, u16 *val)
52610 +{
52611 + int i;
52612 +
52613 + gma_write16(hw, port, GM_SMI_CTRL, GM_SMI_CT_PHY_AD(PHY_ADDR_MARV)
52614 + | GM_SMI_CT_REG_AD(reg) | GM_SMI_CT_OP_RD);
52615 +
52616 + for (i = 0; i < PHY_RETRIES; i++) {
52617 + if (gma_read16(hw, port, GM_SMI_CTRL) & GM_SMI_CT_RD_VAL) {
52618 + *val = gma_read16(hw, port, GM_SMI_DATA);
52619 + return 0;
52620 + }
52621 +
52622 + udelay(1);
52623 + }
52624 +
52625 + return -ETIMEDOUT;
52626 +}
52627 +
52628 +static u16 gm_phy_read(struct sky2_hw *hw, unsigned port, u16 reg)
52629 +{
52630 + u16 v;
52631 +
52632 + if (__gm_phy_read(hw, port, reg, &v) != 0)
52633 + printk(KERN_WARNING PFX "%s: phy read timeout\n", hw->dev[port]->name);
52634 + return v;
52635 +}
52636 +
52637 +static int sky2_set_power_state(struct sky2_hw *hw, pci_power_t state)
52638 +{
52639 + u16 power_control;
52640 + u32 reg1;
52641 + int vaux;
52642 + int ret = 0;
52643 +
52644 + pr_debug("sky2_set_power_state %d\n", state);
52645 + sky2_write8(hw, B2_TST_CTRL1, TST_CFG_WRITE_ON);
52646 +
52647 + power_control = sky2_pci_read16(hw, hw->pm_cap + PCI_PM_PMC);
52648 + vaux = (sky2_read16(hw, B0_CTST) & Y2_VAUX_AVAIL) &&
52649 + (power_control & PCI_PM_CAP_PME_D3cold);
52650 +
52651 + power_control = sky2_pci_read16(hw, hw->pm_cap + PCI_PM_CTRL);
52652 +
52653 + power_control |= PCI_PM_CTRL_PME_STATUS;
52654 + power_control &= ~(PCI_PM_CTRL_STATE_MASK);
52655 +
52656 + switch (state) {
52657 + case PCI_D0:
52658 + /* switch power to VCC (WA for VAUX problem) */
52659 + sky2_write8(hw, B0_POWER_CTRL,
52660 + PC_VAUX_ENA | PC_VCC_ENA | PC_VAUX_OFF | PC_VCC_ON);
52661 +
52662 + /* disable Core Clock Division, */
52663 + sky2_write32(hw, B2_Y2_CLK_CTRL, Y2_CLK_DIV_DIS);
52664 +
52665 + if (hw->chip_id == CHIP_ID_YUKON_XL && hw->chip_rev > 1)
52666 + /* enable bits are inverted */
52667 + sky2_write8(hw, B2_Y2_CLK_GATE,
52668 + Y2_PCI_CLK_LNK1_DIS | Y2_COR_CLK_LNK1_DIS |
52669 + Y2_CLK_GAT_LNK1_DIS | Y2_PCI_CLK_LNK2_DIS |
52670 + Y2_COR_CLK_LNK2_DIS | Y2_CLK_GAT_LNK2_DIS);
52671 + else
52672 + sky2_write8(hw, B2_Y2_CLK_GATE, 0);
52673 +
52674 + /* Turn off phy power saving */
52675 + reg1 = sky2_pci_read32(hw, PCI_DEV_REG1);
52676 + reg1 &= ~(PCI_Y2_PHY1_POWD | PCI_Y2_PHY2_POWD);
52677 +
52678 + /* looks like this XL is back asswards .. */
52679 + if (hw->chip_id == CHIP_ID_YUKON_XL && hw->chip_rev > 1) {
52680 + reg1 |= PCI_Y2_PHY1_COMA;
52681 + if (hw->ports > 1)
52682 + reg1 |= PCI_Y2_PHY2_COMA;
52683 + }
52684 +
52685 + if (hw->chip_id == CHIP_ID_YUKON_EC_U) {
52686 + sky2_pci_write32(hw, PCI_DEV_REG3, 0);
52687 + reg1 = sky2_pci_read32(hw, PCI_DEV_REG4);
52688 + reg1 &= P_ASPM_CONTROL_MSK;
52689 + sky2_pci_write32(hw, PCI_DEV_REG4, reg1);
52690 + sky2_pci_write32(hw, PCI_DEV_REG5, 0);
52691 + }
52692 +
52693 + sky2_pci_write32(hw, PCI_DEV_REG1, reg1);
52694 +
52695 + break;
52696 +
52697 + case PCI_D3hot:
52698 + case PCI_D3cold:
52699 + /* Turn on phy power saving */
52700 + reg1 = sky2_pci_read32(hw, PCI_DEV_REG1);
52701 + if (hw->chip_id == CHIP_ID_YUKON_XL && hw->chip_rev > 1)
52702 + reg1 &= ~(PCI_Y2_PHY1_POWD | PCI_Y2_PHY2_POWD);
52703 + else
52704 + reg1 |= (PCI_Y2_PHY1_POWD | PCI_Y2_PHY2_POWD);
52705 + sky2_pci_write32(hw, PCI_DEV_REG1, reg1);
52706 +
52707 + if (hw->chip_id == CHIP_ID_YUKON_XL && hw->chip_rev > 1)
52708 + sky2_write8(hw, B2_Y2_CLK_GATE, 0);
52709 + else
52710 + /* enable bits are inverted */
52711 + sky2_write8(hw, B2_Y2_CLK_GATE,
52712 + Y2_PCI_CLK_LNK1_DIS | Y2_COR_CLK_LNK1_DIS |
52713 + Y2_CLK_GAT_LNK1_DIS | Y2_PCI_CLK_LNK2_DIS |
52714 + Y2_COR_CLK_LNK2_DIS | Y2_CLK_GAT_LNK2_DIS);
52715 +
52716 + /* switch power to VAUX */
52717 + if (vaux && state != PCI_D3cold)
52718 + sky2_write8(hw, B0_POWER_CTRL,
52719 + (PC_VAUX_ENA | PC_VCC_ENA |
52720 + PC_VAUX_ON | PC_VCC_OFF));
52721 + break;
52722 + default:
52723 + printk(KERN_ERR PFX "Unknown power state %d\n", state);
52724 + ret = -1;
52725 + }
52726 +
52727 + sky2_pci_write16(hw, hw->pm_cap + PCI_PM_CTRL, power_control);
52728 + sky2_write8(hw, B2_TST_CTRL1, TST_CFG_WRITE_OFF);
52729 + return ret;
52730 +}
52731 +
52732 +static void sky2_phy_reset(struct sky2_hw *hw, unsigned port)
52733 +{
52734 + u16 reg;
52735 +
52736 + /* disable all GMAC IRQ's */
52737 + sky2_write8(hw, SK_REG(port, GMAC_IRQ_MSK), 0);
52738 + /* disable PHY IRQs */
52739 + gm_phy_write(hw, port, PHY_MARV_INT_MASK, 0);
52740 +
52741 + gma_write16(hw, port, GM_MC_ADDR_H1, 0); /* clear MC hash */
52742 + gma_write16(hw, port, GM_MC_ADDR_H2, 0);
52743 + gma_write16(hw, port, GM_MC_ADDR_H3, 0);
52744 + gma_write16(hw, port, GM_MC_ADDR_H4, 0);
52745 +
52746 + reg = gma_read16(hw, port, GM_RX_CTRL);
52747 + reg |= GM_RXCR_UCF_ENA | GM_RXCR_MCF_ENA;
52748 + gma_write16(hw, port, GM_RX_CTRL, reg);
52749 +}
52750 +
52751 +static void sky2_phy_init(struct sky2_hw *hw, unsigned port)
52752 +{
52753 + struct sky2_port *sky2 = netdev_priv(hw->dev[port]);
52754 + u16 ctrl, ct1000, adv, pg, ledctrl, ledover;
52755 +
52756 + if (sky2->autoneg == AUTONEG_ENABLE && hw->chip_id != CHIP_ID_YUKON_XL) {
52757 + u16 ectrl = gm_phy_read(hw, port, PHY_MARV_EXT_CTRL);
52758 +
52759 + ectrl &= ~(PHY_M_EC_M_DSC_MSK | PHY_M_EC_S_DSC_MSK |
52760 + PHY_M_EC_MAC_S_MSK);
52761 + ectrl |= PHY_M_EC_MAC_S(MAC_TX_CLK_25_MHZ);
52762 +
52763 + if (hw->chip_id == CHIP_ID_YUKON_EC)
52764 + ectrl |= PHY_M_EC_DSC_2(2) | PHY_M_EC_DOWN_S_ENA;
52765 + else
52766 + ectrl |= PHY_M_EC_M_DSC(2) | PHY_M_EC_S_DSC(3);
52767 +
52768 + gm_phy_write(hw, port, PHY_MARV_EXT_CTRL, ectrl);
52769 + }
52770 +
52771 + ctrl = gm_phy_read(hw, port, PHY_MARV_PHY_CTRL);
52772 + if (sky2_is_copper(hw)) {
52773 + if (hw->chip_id == CHIP_ID_YUKON_FE) {
52774 + /* enable automatic crossover */
52775 + ctrl |= PHY_M_PC_MDI_XMODE(PHY_M_PC_ENA_AUTO) >> 1;
52776 + } else {
52777 + /* disable energy detect */
52778 + ctrl &= ~PHY_M_PC_EN_DET_MSK;
52779 +
52780 + /* enable automatic crossover */
52781 + ctrl |= PHY_M_PC_MDI_XMODE(PHY_M_PC_ENA_AUTO);
52782 +
52783 + if (sky2->autoneg == AUTONEG_ENABLE &&
52784 + hw->chip_id == CHIP_ID_YUKON_XL) {
52785 + ctrl &= ~PHY_M_PC_DSC_MSK;
52786 + ctrl |= PHY_M_PC_DSC(2) | PHY_M_PC_DOWN_S_ENA;
52787 + }
52788 + }
52789 + } else {
52790 + /* workaround for deviation #4.88 (CRC errors) */
52791 + /* disable Automatic Crossover */
52792 +
52793 + ctrl &= ~PHY_M_PC_MDIX_MSK;
52794 + }
52795 +
52796 + gm_phy_write(hw, port, PHY_MARV_PHY_CTRL, ctrl);
52797 +
52798 + /* special setup for PHY 88E1112 Fiber */
52799 + if (hw->chip_id == CHIP_ID_YUKON_XL && !sky2_is_copper(hw)) {
52800 + pg = gm_phy_read(hw, port, PHY_MARV_EXT_ADR);
52801 +
52802 + /* Fiber: select 1000BASE-X only mode MAC Specific Ctrl Reg. */
52803 + gm_phy_write(hw, port, PHY_MARV_EXT_ADR, 2);
52804 + ctrl = gm_phy_read(hw, port, PHY_MARV_PHY_CTRL);
52805 + ctrl &= ~PHY_M_MAC_MD_MSK;
52806 + ctrl |= PHY_M_MAC_MODE_SEL(PHY_M_MAC_MD_1000BX);
52807 + gm_phy_write(hw, port, PHY_MARV_PHY_CTRL, ctrl);
52808 +
52809 + if (hw->pmd_type == 'P') {
52810 + /* select page 1 to access Fiber registers */
52811 + gm_phy_write(hw, port, PHY_MARV_EXT_ADR, 1);
52812 +
52813 + /* for SFP-module set SIGDET polarity to low */
52814 + ctrl = gm_phy_read(hw, port, PHY_MARV_PHY_CTRL);
52815 + ctrl |= PHY_M_FIB_SIGD_POL;
52816 + gm_phy_write(hw, port, PHY_MARV_CTRL, ctrl);
52817 + }
52818 +
52819 + gm_phy_write(hw, port, PHY_MARV_EXT_ADR, pg);
52820 + }
52821 +
52822 + ctrl = gm_phy_read(hw, port, PHY_MARV_CTRL);
52823 + if (sky2->autoneg == AUTONEG_DISABLE)
52824 + ctrl &= ~PHY_CT_ANE;
52825 + else
52826 + ctrl |= PHY_CT_ANE;
52827 +
52828 + ctrl |= PHY_CT_RESET;
52829 + gm_phy_write(hw, port, PHY_MARV_CTRL, ctrl);
52830 +
52831 + ctrl = 0;
52832 + ct1000 = 0;
52833 + adv = PHY_AN_CSMA;
52834 +
52835 + if (sky2->autoneg == AUTONEG_ENABLE) {
52836 + if (sky2_is_copper(hw)) {
52837 + if (sky2->advertising & ADVERTISED_1000baseT_Full)
52838 + ct1000 |= PHY_M_1000C_AFD;
52839 + if (sky2->advertising & ADVERTISED_1000baseT_Half)
52840 + ct1000 |= PHY_M_1000C_AHD;
52841 + if (sky2->advertising & ADVERTISED_100baseT_Full)
52842 + adv |= PHY_M_AN_100_FD;
52843 + if (sky2->advertising & ADVERTISED_100baseT_Half)
52844 + adv |= PHY_M_AN_100_HD;
52845 + if (sky2->advertising & ADVERTISED_10baseT_Full)
52846 + adv |= PHY_M_AN_10_FD;
52847 + if (sky2->advertising & ADVERTISED_10baseT_Half)
52848 + adv |= PHY_M_AN_10_HD;
52849 + } else { /* special defines for FIBER (88E1040S only) */
52850 + if (sky2->advertising & ADVERTISED_1000baseT_Full)
52851 + adv |= PHY_M_AN_1000X_AFD;
52852 + if (sky2->advertising & ADVERTISED_1000baseT_Half)
52853 + adv |= PHY_M_AN_1000X_AHD;
52854 + }
52855 +
52856 + /* Set Flow-control capabilities */
52857 + if (sky2->tx_pause && sky2->rx_pause)
52858 + adv |= PHY_AN_PAUSE_CAP; /* symmetric */
52859 + else if (sky2->rx_pause && !sky2->tx_pause)
52860 + adv |= PHY_AN_PAUSE_ASYM | PHY_AN_PAUSE_CAP;
52861 + else if (!sky2->rx_pause && sky2->tx_pause)
52862 + adv |= PHY_AN_PAUSE_ASYM; /* local */
52863 +
52864 + /* Restart Auto-negotiation */
52865 + ctrl |= PHY_CT_ANE | PHY_CT_RE_CFG;
52866 + } else {
52867 + /* forced speed/duplex settings */
52868 + ct1000 = PHY_M_1000C_MSE;
52869 +
52870 + if (sky2->duplex == DUPLEX_FULL)
52871 + ctrl |= PHY_CT_DUP_MD;
52872 +
52873 + switch (sky2->speed) {
52874 + case SPEED_1000:
52875 + ctrl |= PHY_CT_SP1000;
52876 + break;
52877 + case SPEED_100:
52878 + ctrl |= PHY_CT_SP100;
52879 + break;
52880 + }
52881 +
52882 + ctrl |= PHY_CT_RESET;
52883 + }
52884 +
52885 + if (hw->chip_id != CHIP_ID_YUKON_FE)
52886 + gm_phy_write(hw, port, PHY_MARV_1000T_CTRL, ct1000);
52887 +
52888 + gm_phy_write(hw, port, PHY_MARV_AUNE_ADV, adv);
52889 + gm_phy_write(hw, port, PHY_MARV_CTRL, ctrl);
52890 +
52891 + /* Setup Phy LED's */
52892 + ledctrl = PHY_M_LED_PULS_DUR(PULS_170MS);
52893 + ledover = 0;
52894 +
52895 + switch (hw->chip_id) {
52896 + case CHIP_ID_YUKON_FE:
52897 + /* on 88E3082 these bits are at 11..9 (shifted left) */
52898 + ledctrl |= PHY_M_LED_BLINK_RT(BLINK_84MS) << 1;
52899 +
52900 + ctrl = gm_phy_read(hw, port, PHY_MARV_FE_LED_PAR);
52901 +
52902 + /* delete ACT LED control bits */
52903 + ctrl &= ~PHY_M_FELP_LED1_MSK;
52904 + /* change ACT LED control to blink mode */
52905 + ctrl |= PHY_M_FELP_LED1_CTRL(LED_PAR_CTRL_ACT_BL);
52906 + gm_phy_write(hw, port, PHY_MARV_FE_LED_PAR, ctrl);
52907 + break;
52908 +
52909 + case CHIP_ID_YUKON_XL:
52910 + pg = gm_phy_read(hw, port, PHY_MARV_EXT_ADR);
52911 +
52912 + /* select page 3 to access LED control register */
52913 + gm_phy_write(hw, port, PHY_MARV_EXT_ADR, 3);
52914 +
52915 + /* set LED Function Control register */
52916 + gm_phy_write(hw, port, PHY_MARV_PHY_CTRL, (PHY_M_LEDC_LOS_CTRL(1) | /* LINK/ACT */
52917 + PHY_M_LEDC_INIT_CTRL(7) | /* 10 Mbps */
52918 + PHY_M_LEDC_STA1_CTRL(7) | /* 100 Mbps */
52919 + PHY_M_LEDC_STA0_CTRL(7))); /* 1000 Mbps */
52920 +
52921 + /* set Polarity Control register */
52922 + gm_phy_write(hw, port, PHY_MARV_PHY_STAT,
52923 + (PHY_M_POLC_LS1_P_MIX(4) |
52924 + PHY_M_POLC_IS0_P_MIX(4) |
52925 + PHY_M_POLC_LOS_CTRL(2) |
52926 + PHY_M_POLC_INIT_CTRL(2) |
52927 + PHY_M_POLC_STA1_CTRL(2) |
52928 + PHY_M_POLC_STA0_CTRL(2)));
52929 +
52930 + /* restore page register */
52931 + gm_phy_write(hw, port, PHY_MARV_EXT_ADR, pg);
52932 + break;
52933 +
52934 + default:
52935 + /* set Tx LED (LED_TX) to blink mode on Rx OR Tx activity */
52936 + ledctrl |= PHY_M_LED_BLINK_RT(BLINK_84MS) | PHY_M_LEDC_TX_CTRL;
52937 + /* turn off the Rx LED (LED_RX) */
52938 + ledover |= PHY_M_LED_MO_RX(MO_LED_OFF);
52939 + }
52940 +
52941 + if (hw->chip_id == CHIP_ID_YUKON_EC_U && hw->chip_rev >= 2) {
52942 + /* apply fixes in PHY AFE */
52943 + gm_phy_write(hw, port, 22, 255);
52944 + /* increase differential signal amplitude in 10BASE-T */
52945 + gm_phy_write(hw, port, 24, 0xaa99);
52946 + gm_phy_write(hw, port, 23, 0x2011);
52947 +
52948 + /* fix for IEEE A/B Symmetry failure in 1000BASE-T */
52949 + gm_phy_write(hw, port, 24, 0xa204);
52950 + gm_phy_write(hw, port, 23, 0x2002);
52951 +
52952 + /* set page register to 0 */
52953 + gm_phy_write(hw, port, 22, 0);
52954 + } else {
52955 + gm_phy_write(hw, port, PHY_MARV_LED_CTRL, ledctrl);
52956 +
52957 + if (sky2->autoneg == AUTONEG_DISABLE || sky2->speed == SPEED_100) {
52958 + /* turn on 100 Mbps LED (LED_LINK100) */
52959 + ledover |= PHY_M_LED_MO_100(MO_LED_ON);
52960 + }
52961 +
52962 + if (ledover)
52963 + gm_phy_write(hw, port, PHY_MARV_LED_OVER, ledover);
52964 +
52965 + }
52966 + /* Enable phy interrupt on auto-negotiation complete (or link up) */
52967 + if (sky2->autoneg == AUTONEG_ENABLE)
52968 + gm_phy_write(hw, port, PHY_MARV_INT_MASK, PHY_M_IS_AN_COMPL);
52969 + else
52970 + gm_phy_write(hw, port, PHY_MARV_INT_MASK, PHY_M_DEF_MSK);
52971 +}
52972 +
52973 +/* Force a renegotiation */
52974 +static void sky2_phy_reinit(struct sky2_port *sky2)
52975 +{
52976 + down(&sky2->phy_sema);
52977 + sky2_phy_init(sky2->hw, sky2->port);
52978 + up(&sky2->phy_sema);
52979 +}
52980 +
52981 +static void sky2_mac_init(struct sky2_hw *hw, unsigned port)
52982 +{
52983 + struct sky2_port *sky2 = netdev_priv(hw->dev[port]);
52984 + u16 reg;
52985 + int i;
52986 + const u8 *addr = hw->dev[port]->dev_addr;
52987 +
52988 + sky2_write32(hw, SK_REG(port, GPHY_CTRL), GPC_RST_SET);
52989 + sky2_write32(hw, SK_REG(port, GPHY_CTRL), GPC_RST_CLR|GPC_ENA_PAUSE);
52990 +
52991 + sky2_write8(hw, SK_REG(port, GMAC_CTRL), GMC_RST_CLR);
52992 +
52993 + if (hw->chip_id == CHIP_ID_YUKON_XL && hw->chip_rev == 0 && port == 1) {
52994 + /* WA DEV_472 -- looks like crossed wires on port 2 */
52995 + /* clear GMAC 1 Control reset */
52996 + sky2_write8(hw, SK_REG(0, GMAC_CTRL), GMC_RST_CLR);
52997 + do {
52998 + sky2_write8(hw, SK_REG(1, GMAC_CTRL), GMC_RST_SET);
52999 + sky2_write8(hw, SK_REG(1, GMAC_CTRL), GMC_RST_CLR);
53000 + } while (gm_phy_read(hw, 1, PHY_MARV_ID0) != PHY_MARV_ID0_VAL ||
53001 + gm_phy_read(hw, 1, PHY_MARV_ID1) != PHY_MARV_ID1_Y2 ||
53002 + gm_phy_read(hw, 1, PHY_MARV_INT_MASK) != 0);
53003 + }
53004 +
53005 + if (sky2->autoneg == AUTONEG_DISABLE) {
53006 + reg = gma_read16(hw, port, GM_GP_CTRL);
53007 + reg |= GM_GPCR_AU_ALL_DIS;
53008 + gma_write16(hw, port, GM_GP_CTRL, reg);
53009 + gma_read16(hw, port, GM_GP_CTRL);
53010 +
53011 + switch (sky2->speed) {
53012 + case SPEED_1000:
53013 + reg &= ~GM_GPCR_SPEED_100;
53014 + reg |= GM_GPCR_SPEED_1000;
53015 + break;
53016 + case SPEED_100:
53017 + reg &= ~GM_GPCR_SPEED_1000;
53018 + reg |= GM_GPCR_SPEED_100;
53019 + break;
53020 + case SPEED_10:
53021 + reg &= ~(GM_GPCR_SPEED_1000 | GM_GPCR_SPEED_100);
53022 + break;
53023 + }
53024 +
53025 + if (sky2->duplex == DUPLEX_FULL)
53026 + reg |= GM_GPCR_DUP_FULL;
53027 + } else
53028 + reg = GM_GPCR_SPEED_1000 | GM_GPCR_SPEED_100 | GM_GPCR_DUP_FULL;
53029 +
53030 + if (!sky2->tx_pause && !sky2->rx_pause) {
53031 + sky2_write32(hw, SK_REG(port, GMAC_CTRL), GMC_PAUSE_OFF);
53032 + reg |=
53033 + GM_GPCR_FC_TX_DIS | GM_GPCR_FC_RX_DIS | GM_GPCR_AU_FCT_DIS;
53034 + } else if (sky2->tx_pause && !sky2->rx_pause) {
53035 + /* disable Rx flow-control */
53036 + reg |= GM_GPCR_FC_RX_DIS | GM_GPCR_AU_FCT_DIS;
53037 + }
53038 +
53039 + gma_write16(hw, port, GM_GP_CTRL, reg);
53040 +
53041 + sky2_read16(hw, SK_REG(port, GMAC_IRQ_SRC));
53042 +
53043 + down(&sky2->phy_sema);
53044 + sky2_phy_init(hw, port);
53045 + up(&sky2->phy_sema);
53046 +
53047 + /* MIB clear */
53048 + reg = gma_read16(hw, port, GM_PHY_ADDR);
53049 + gma_write16(hw, port, GM_PHY_ADDR, reg | GM_PAR_MIB_CLR);
53050 +
53051 + for (i = GM_MIB_CNT_BASE; i <= GM_MIB_CNT_END; i += 4)
53052 + gma_read16(hw, port, i);
53053 + gma_write16(hw, port, GM_PHY_ADDR, reg);
53054 +
53055 + /* transmit control */
53056 + gma_write16(hw, port, GM_TX_CTRL, TX_COL_THR(TX_COL_DEF));
53057 +
53058 + /* receive control reg: unicast + multicast + no FCS */
53059 + gma_write16(hw, port, GM_RX_CTRL,
53060 + GM_RXCR_UCF_ENA | GM_RXCR_CRC_DIS | GM_RXCR_MCF_ENA);
53061 +
53062 + /* transmit flow control */
53063 + gma_write16(hw, port, GM_TX_FLOW_CTRL, 0xffff);
53064 +
53065 + /* transmit parameter */
53066 + gma_write16(hw, port, GM_TX_PARAM,
53067 + TX_JAM_LEN_VAL(TX_JAM_LEN_DEF) |
53068 + TX_JAM_IPG_VAL(TX_JAM_IPG_DEF) |
53069 + TX_IPG_JAM_DATA(TX_IPG_JAM_DEF) |
53070 + TX_BACK_OFF_LIM(TX_BOF_LIM_DEF));
53071 +
53072 + /* serial mode register */
53073 + reg = DATA_BLIND_VAL(DATA_BLIND_DEF) |
53074 + GM_SMOD_VLAN_ENA | IPG_DATA_VAL(IPG_DATA_DEF);
53075 +
53076 + if (hw->dev[port]->mtu > ETH_DATA_LEN)
53077 + reg |= GM_SMOD_JUMBO_ENA;
53078 +
53079 + gma_write16(hw, port, GM_SERIAL_MODE, reg);
53080 +
53081 + /* virtual address for data */
53082 + gma_set_addr(hw, port, GM_SRC_ADDR_2L, addr);
53083 +
53084 + /* physical address: used for pause frames */
53085 + gma_set_addr(hw, port, GM_SRC_ADDR_1L, addr);
53086 +
53087 + /* ignore counter overflows */
53088 + gma_write16(hw, port, GM_TX_IRQ_MSK, 0);
53089 + gma_write16(hw, port, GM_RX_IRQ_MSK, 0);
53090 + gma_write16(hw, port, GM_TR_IRQ_MSK, 0);
53091 +
53092 + /* Configure Rx MAC FIFO */
53093 + sky2_write8(hw, SK_REG(port, RX_GMF_CTRL_T), GMF_RST_CLR);
53094 + sky2_write32(hw, SK_REG(port, RX_GMF_CTRL_T),
53095 + GMF_OPER_ON | GMF_RX_F_FL_ON);
53096 +
53097 + /* Flush Rx MAC FIFO on any flow control or error */
53098 + sky2_write16(hw, SK_REG(port, RX_GMF_FL_MSK), GMR_FS_ANY_ERR);
53099 +
53100 + /* Set threshold to 0xa (64 bytes)
53101 + * ASF disabled so no need to do WA dev #4.30
53102 + */
53103 + sky2_write16(hw, SK_REG(port, RX_GMF_FL_THR), RX_GMF_FL_THR_DEF);
53104 +
53105 + /* Configure Tx MAC FIFO */
53106 + sky2_write8(hw, SK_REG(port, TX_GMF_CTRL_T), GMF_RST_CLR);
53107 + sky2_write16(hw, SK_REG(port, TX_GMF_CTRL_T), GMF_OPER_ON);
53108 +
53109 + if (hw->chip_id == CHIP_ID_YUKON_EC_U) {
53110 + sky2_write8(hw, SK_REG(port, RX_GMF_LP_THR), 768/8);
53111 + sky2_write8(hw, SK_REG(port, RX_GMF_UP_THR), 1024/8);
53112 + if (hw->dev[port]->mtu > ETH_DATA_LEN) {
53113 + /* set Tx GMAC FIFO Almost Empty Threshold */
53114 + sky2_write32(hw, SK_REG(port, TX_GMF_AE_THR), 0x180);
53115 + /* Disable Store & Forward mode for TX */
53116 + sky2_write32(hw, SK_REG(port, TX_GMF_CTRL_T), TX_STFW_DIS);
53117 + }
53118 + }
53119 +
53120 +}
53121 +
53122 +/* Assign Ram Buffer allocation.
53123 + * start and end are in units of 4k bytes
53124 + * ram registers are in units of 64bit words
53125 + */
53126 +static void sky2_ramset(struct sky2_hw *hw, u16 q, u8 startk, u8 endk)
53127 +{
53128 + u32 start, end;
53129 +
53130 + start = startk * 4096/8;
53131 + end = (endk * 4096/8) - 1;
53132 +
53133 + sky2_write8(hw, RB_ADDR(q, RB_CTRL), RB_RST_CLR);
53134 + sky2_write32(hw, RB_ADDR(q, RB_START), start);
53135 + sky2_write32(hw, RB_ADDR(q, RB_END), end);
53136 + sky2_write32(hw, RB_ADDR(q, RB_WP), start);
53137 + sky2_write32(hw, RB_ADDR(q, RB_RP), start);
53138 +
53139 + if (q == Q_R1 || q == Q_R2) {
53140 + u32 space = (endk - startk) * 4096/8;
53141 + u32 tp = space - space/4;
53142 +
53143 +	/* On receive queues set the thresholds
53144 + * give receiver priority when > 3/4 full
53145 + * send pause when down to 2K
53146 + */
53147 + sky2_write32(hw, RB_ADDR(q, RB_RX_UTHP), tp);
53148 + sky2_write32(hw, RB_ADDR(q, RB_RX_LTHP), space/2);
53149 +
53150 + tp = space - 2048/8;
53151 + sky2_write32(hw, RB_ADDR(q, RB_RX_UTPP), tp);
53152 + sky2_write32(hw, RB_ADDR(q, RB_RX_LTPP), space/4);
53153 + } else {
53154 +		/* Enable store & forward on Tx queues because
53155 + * Tx FIFO is only 1K on Yukon
53156 + */
53157 + sky2_write8(hw, RB_ADDR(q, RB_CTRL), RB_ENA_STFWD);
53158 + }
53159 +
53160 + sky2_write8(hw, RB_ADDR(q, RB_CTRL), RB_ENA_OP_MD);
53161 + sky2_read8(hw, RB_ADDR(q, RB_CTRL));
53162 +}
53163 +
53164 +/* Setup Bus Memory Interface */
53165 +static void sky2_qset(struct sky2_hw *hw, u16 q)
53166 +{
53167 + sky2_write32(hw, Q_ADDR(q, Q_CSR), BMU_CLR_RESET);
53168 + sky2_write32(hw, Q_ADDR(q, Q_CSR), BMU_OPER_INIT);
53169 + sky2_write32(hw, Q_ADDR(q, Q_CSR), BMU_FIFO_OP_ON);
53170 + sky2_write32(hw, Q_ADDR(q, Q_WM), BMU_WM_DEFAULT);
53171 +}
53172 +
53173 +/* Setup prefetch unit registers. This is the interface between
53174 + * hardware and driver list elements
53175 + */
53176 +static void sky2_prefetch_init(struct sky2_hw *hw, u32 qaddr,
53177 + u64 addr, u32 last)
53178 +{
53179 + sky2_write32(hw, Y2_QADDR(qaddr, PREF_UNIT_CTRL), PREF_UNIT_RST_SET);
53180 + sky2_write32(hw, Y2_QADDR(qaddr, PREF_UNIT_CTRL), PREF_UNIT_RST_CLR);
53181 + sky2_write32(hw, Y2_QADDR(qaddr, PREF_UNIT_ADDR_HI), addr >> 32);
53182 + sky2_write32(hw, Y2_QADDR(qaddr, PREF_UNIT_ADDR_LO), (u32) addr);
53183 + sky2_write16(hw, Y2_QADDR(qaddr, PREF_UNIT_LAST_IDX), last);
53184 + sky2_write32(hw, Y2_QADDR(qaddr, PREF_UNIT_CTRL), PREF_UNIT_OP_ON);
53185 +
53186 + sky2_read32(hw, Y2_QADDR(qaddr, PREF_UNIT_CTRL));
53187 +}
53188 +
53189 +static inline struct sky2_tx_le *get_tx_le(struct sky2_port *sky2)
53190 +{
53191 + struct sky2_tx_le *le = sky2->tx_le + sky2->tx_prod;
53192 +
53193 + sky2->tx_prod = (sky2->tx_prod + 1) % TX_RING_SIZE;
53194 + return le;
53195 +}
53196 +
53197 +/*
53198 + * This is workaround code taken from the SysKonnect sk98lin driver
53199 + * to deal with a chip bug on Yukon EC rev 0 in the wraparound case.
53200 + */
53201 +static void sky2_put_idx(struct sky2_hw *hw, unsigned q,
53202 + u16 idx, u16 *last, u16 size)
53203 +{
53204 + wmb();
53205 + if (is_ec_a1(hw) && idx < *last) {
53206 + u16 hwget = sky2_read16(hw, Y2_QADDR(q, PREF_UNIT_GET_IDX));
53207 +
53208 + if (hwget == 0) {
53209 + /* Start prefetching again */
53210 + sky2_write8(hw, Y2_QADDR(q, PREF_UNIT_FIFO_WM), 0xe0);
53211 + goto setnew;
53212 + }
53213 +
53214 + if (hwget == size - 1) {
53215 + /* set watermark to one list element */
53216 + sky2_write8(hw, Y2_QADDR(q, PREF_UNIT_FIFO_WM), 8);
53217 +
53218 + /* set put index to first list element */
53219 + sky2_write16(hw, Y2_QADDR(q, PREF_UNIT_PUT_IDX), 0);
53220 + } else /* have hardware go to end of list */
53221 + sky2_write16(hw, Y2_QADDR(q, PREF_UNIT_PUT_IDX),
53222 + size - 1);
53223 + } else {
53224 +setnew:
53225 + sky2_write16(hw, Y2_QADDR(q, PREF_UNIT_PUT_IDX), idx);
53226 + }
53227 + *last = idx;
53228 + mmiowb();
53229 +}
53230 +
53231 +
53232 +static inline struct sky2_rx_le *sky2_next_rx(struct sky2_port *sky2)
53233 +{
53234 + struct sky2_rx_le *le = sky2->rx_le + sky2->rx_put;
53235 + sky2->rx_put = (sky2->rx_put + 1) % RX_LE_SIZE;
53236 + return le;
53237 +}
53238 +
53239 +/* Return high part of DMA address (could be 32 or 64 bit) */
53240 +static inline u32 high32(dma_addr_t a)
53241 +{
53242 + return sizeof(a) > sizeof(u32) ? (a >> 16) >> 16 : 0;
53243 +}
53244 +
53245 +/* Build description to hardware about buffer */
53246 +static void sky2_rx_add(struct sky2_port *sky2, dma_addr_t map)
53247 +{
53248 + struct sky2_rx_le *le;
53249 + u32 hi = high32(map);
53250 + u16 len = sky2->rx_bufsize;
53251 +
53252 + if (sky2->rx_addr64 != hi) {
53253 + le = sky2_next_rx(sky2);
53254 + le->addr = cpu_to_le32(hi);
53255 + le->ctrl = 0;
53256 + le->opcode = OP_ADDR64 | HW_OWNER;
53257 + sky2->rx_addr64 = high32(map + len);
53258 + }
53259 +
53260 + le = sky2_next_rx(sky2);
53261 + le->addr = cpu_to_le32((u32) map);
53262 + le->length = cpu_to_le16(len);
53263 + le->ctrl = 0;
53264 + le->opcode = OP_PACKET | HW_OWNER;
53265 +}
53266 +
53267 +
53268 +/* Tell the chip where to start the receive checksum.
53269 + * It actually has two checksums, but set both the same to avoid possible byte
53270 + * order problems.
53271 + */
53272 +static void rx_set_checksum(struct sky2_port *sky2)
53273 +{
53274 + struct sky2_rx_le *le;
53275 +
53276 + le = sky2_next_rx(sky2);
53277 + le->addr = (ETH_HLEN << 16) | ETH_HLEN;
53278 + le->ctrl = 0;
53279 + le->opcode = OP_TCPSTART | HW_OWNER;
53280 +
53281 + sky2_write32(sky2->hw,
53282 + Q_ADDR(rxqaddr[sky2->port], Q_CSR),
53283 + sky2->rx_csum ? BMU_ENA_RX_CHKSUM : BMU_DIS_RX_CHKSUM);
53284 +
53285 +}
53286 +
53287 +/*
53288 + * The RX Stop command will not work for Yukon-2 if the BMU does not
53289 + * reach the end of packet and since we can't make sure that we have
53290 + * incoming data, we must reset the BMU while it is not doing a DMA
53291 + * transfer. Since it is possible that the RX path is still active,
53292 + * the RX RAM buffer will be stopped first, so any possible incoming
53293 + * data will not trigger a DMA. After the RAM buffer is stopped, the
53294 + * BMU is polled until any DMA in progress is ended and only then it
53295 + * will be reset.
53296 + */
53297 +static void sky2_rx_stop(struct sky2_port *sky2)
53298 +{
53299 + struct sky2_hw *hw = sky2->hw;
53300 + unsigned rxq = rxqaddr[sky2->port];
53301 + int i;
53302 +
53303 + /* disable the RAM Buffer receive queue */
53304 + sky2_write8(hw, RB_ADDR(rxq, RB_CTRL), RB_DIS_OP_MD);
53305 +
53306 + for (i = 0; i < 0xffff; i++)
53307 + if (sky2_read8(hw, RB_ADDR(rxq, Q_RSL))
53308 + == sky2_read8(hw, RB_ADDR(rxq, Q_RL)))
53309 + goto stopped;
53310 +
53311 + printk(KERN_WARNING PFX "%s: receiver stop failed\n",
53312 + sky2->netdev->name);
53313 +stopped:
53314 + sky2_write32(hw, Q_ADDR(rxq, Q_CSR), BMU_RST_SET | BMU_FIFO_RST);
53315 +
53316 + /* reset the Rx prefetch unit */
53317 + sky2_write32(hw, Y2_QADDR(rxq, PREF_UNIT_CTRL), PREF_UNIT_RST_SET);
53318 +}
53319 +
53320 +/* Clean out receive buffer area, assumes receiver hardware stopped */
53321 +static void sky2_rx_clean(struct sky2_port *sky2)
53322 +{
53323 + unsigned i;
53324 +
53325 + memset(sky2->rx_le, 0, RX_LE_BYTES);
53326 + for (i = 0; i < sky2->rx_pending; i++) {
53327 + struct ring_info *re = sky2->rx_ring + i;
53328 +
53329 + if (re->skb) {
53330 + pci_unmap_single(sky2->hw->pdev,
53331 + re->mapaddr, sky2->rx_bufsize,
53332 + PCI_DMA_FROMDEVICE);
53333 + kfree_skb(re->skb);
53334 + re->skb = NULL;
53335 + }
53336 + }
53337 +}
53338 +
53339 +/* Basic MII support */
53340 +static int sky2_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
53341 +{
53342 + struct mii_ioctl_data *data = if_mii(ifr);
53343 + struct sky2_port *sky2 = netdev_priv(dev);
53344 + struct sky2_hw *hw = sky2->hw;
53345 + int err = -EOPNOTSUPP;
53346 +
53347 + if (!netif_running(dev))
53348 + return -ENODEV; /* Phy still in reset */
53349 +
53350 + switch(cmd) {
53351 + case SIOCGMIIPHY:
53352 + data->phy_id = PHY_ADDR_MARV;
53353 +
53354 + /* fallthru */
53355 + case SIOCGMIIREG: {
53356 + u16 val = 0;
53357 +
53358 + down(&sky2->phy_sema);
53359 + err = __gm_phy_read(hw, sky2->port, data->reg_num & 0x1f, &val);
53360 + up(&sky2->phy_sema);
53361 +
53362 + data->val_out = val;
53363 + break;
53364 + }
53365 +
53366 + case SIOCSMIIREG:
53367 + if (!capable(CAP_NET_ADMIN))
53368 + return -EPERM;
53369 +
53370 + down(&sky2->phy_sema);
53371 + err = gm_phy_write(hw, sky2->port, data->reg_num & 0x1f,
53372 + data->val_in);
53373 + up(&sky2->phy_sema);
53374 + break;
53375 + }
53376 + return err;
53377 +}
53378 +
53379 +#ifdef SKY2_VLAN_TAG_USED
53380 +static void sky2_vlan_rx_register(struct net_device *dev, struct vlan_group *grp)
53381 +{
53382 + struct sky2_port *sky2 = netdev_priv(dev);
53383 + struct sky2_hw *hw = sky2->hw;
53384 + u16 port = sky2->port;
53385 +
53386 + spin_lock_bh(&sky2->tx_lock);
53387 +
53388 + sky2_write32(hw, SK_REG(port, RX_GMF_CTRL_T), RX_VLAN_STRIP_ON);
53389 + sky2_write32(hw, SK_REG(port, TX_GMF_CTRL_T), TX_VLAN_TAG_ON);
53390 + sky2->vlgrp = grp;
53391 +
53392 + spin_unlock_bh(&sky2->tx_lock);
53393 +}
53394 +
53395 +static void sky2_vlan_rx_kill_vid(struct net_device *dev, unsigned short vid)
53396 +{
53397 + struct sky2_port *sky2 = netdev_priv(dev);
53398 + struct sky2_hw *hw = sky2->hw;
53399 + u16 port = sky2->port;
53400 +
53401 + spin_lock_bh(&sky2->tx_lock);
53402 +
53403 + sky2_write32(hw, SK_REG(port, RX_GMF_CTRL_T), RX_VLAN_STRIP_OFF);
53404 + sky2_write32(hw, SK_REG(port, TX_GMF_CTRL_T), TX_VLAN_TAG_OFF);
53405 + if (sky2->vlgrp)
53406 + sky2->vlgrp->vlan_devices[vid] = NULL;
53407 +
53408 + spin_unlock_bh(&sky2->tx_lock);
53409 +}
53410 +#endif
53411 +
53412 +/*
53413 + * It appears the hardware has a bug in the FIFO logic that
53414 + * causes it to hang if the FIFO gets overrun and the receive buffer
53415 + * is not aligned. Also dev_alloc_skb() won't align properly if slab
53416 + * debugging is enabled.
53417 + */
53418 +static inline struct sk_buff *sky2_alloc_skb(unsigned int size, gfp_t gfp_mask)
53419 +{
53420 + struct sk_buff *skb;
53421 +
53422 + skb = __dev_alloc_skb(size + RX_SKB_ALIGN, gfp_mask);
53423 + if (likely(skb)) {
53424 + unsigned long p = (unsigned long) skb->data;
53425 + skb_reserve(skb,
53426 + ((p + RX_SKB_ALIGN - 1) & ~(RX_SKB_ALIGN - 1)) - p);
53427 + }
53428 +
53429 + return skb;
53430 +}
53431 +
53432 +/*
53433 + * Allocate and set up the receiver buffer pool.
53434 + * In the case of 64-bit DMA, there are twice as many list elements
53435 + * available as ring entries,
53436 + * and we need to reserve one list element so we don't wrap around.
53437 + */
53438 +static int sky2_rx_start(struct sky2_port *sky2)
53439 +{
53440 + struct sky2_hw *hw = sky2->hw;
53441 + unsigned rxq = rxqaddr[sky2->port];
53442 + int i;
53443 +
53444 + sky2->rx_put = sky2->rx_next = 0;
53445 + sky2_qset(hw, rxq);
53446 +
53447 + if (hw->chip_id == CHIP_ID_YUKON_EC_U && hw->chip_rev >= 2) {
53448 + /* MAC Rx RAM Read is controlled by hardware */
53449 + sky2_write32(hw, Q_ADDR(rxq, Q_F), F_M_RX_RAM_DIS);
53450 + }
53451 +
53452 + sky2_prefetch_init(hw, rxq, sky2->rx_le_map, RX_LE_SIZE - 1);
53453 +
53454 + rx_set_checksum(sky2);
53455 + for (i = 0; i < sky2->rx_pending; i++) {
53456 + struct ring_info *re = sky2->rx_ring + i;
53457 +
53458 + re->skb = sky2_alloc_skb(sky2->rx_bufsize, GFP_KERNEL);
53459 + if (!re->skb)
53460 + goto nomem;
53461 +
53462 + re->mapaddr = pci_map_single(hw->pdev, re->skb->data,
53463 + sky2->rx_bufsize, PCI_DMA_FROMDEVICE);
53464 + sky2_rx_add(sky2, re->mapaddr);
53465 + }
53466 +
53467 + /* Truncate oversize frames */
53468 + sky2_write16(hw, SK_REG(sky2->port, RX_GMF_TR_THR), sky2->rx_bufsize - 8);
53469 + sky2_write32(hw, SK_REG(sky2->port, RX_GMF_CTRL_T), RX_TRUNC_ON);
53470 +
53471 + /* Tell chip about available buffers */
53472 + sky2_write16(hw, Y2_QADDR(rxq, PREF_UNIT_PUT_IDX), sky2->rx_put);
53473 + sky2->rx_last_put = sky2_read16(hw, Y2_QADDR(rxq, PREF_UNIT_PUT_IDX));
53474 + return 0;
53475 +nomem:
53476 + sky2_rx_clean(sky2);
53477 + return -ENOMEM;
53478 +}
53479 +
53480 +/* Bring up network interface. */
53481 +static int sky2_up(struct net_device *dev)
53482 +{
53483 + struct sky2_port *sky2 = netdev_priv(dev);
53484 + struct sky2_hw *hw = sky2->hw;
53485 + unsigned port = sky2->port;
53486 + u32 ramsize, rxspace;
53487 + int err = -ENOMEM;
53488 +
53489 + if (netif_msg_ifup(sky2))
53490 + printk(KERN_INFO PFX "%s: enabling interface\n", dev->name);
53491 +
53492 + /* must be power of 2 */
53493 + sky2->tx_le = pci_alloc_consistent(hw->pdev,
53494 + TX_RING_SIZE *
53495 + sizeof(struct sky2_tx_le),
53496 + &sky2->tx_le_map);
53497 + if (!sky2->tx_le)
53498 + goto err_out;
53499 +
53500 + sky2->tx_ring = kcalloc(TX_RING_SIZE, sizeof(struct tx_ring_info),
53501 + GFP_KERNEL);
53502 + if (!sky2->tx_ring)
53503 + goto err_out;
53504 + sky2->tx_prod = sky2->tx_cons = 0;
53505 +
53506 + sky2->rx_le = pci_alloc_consistent(hw->pdev, RX_LE_BYTES,
53507 + &sky2->rx_le_map);
53508 + if (!sky2->rx_le)
53509 + goto err_out;
53510 + memset(sky2->rx_le, 0, RX_LE_BYTES);
53511 +
53512 + sky2->rx_ring = kcalloc(sky2->rx_pending, sizeof(struct ring_info),
53513 + GFP_KERNEL);
53514 + if (!sky2->rx_ring)
53515 + goto err_out;
53516 +
53517 + sky2_mac_init(hw, port);
53518 +
53519 + /* Determine available ram buffer space (in 4K blocks).
53520 + * Note: not sure about the FE setting below yet
53521 + */
53522 + if (hw->chip_id == CHIP_ID_YUKON_FE)
53523 + ramsize = 4;
53524 + else
53525 + ramsize = sky2_read8(hw, B2_E_0);
53526 +
53527 + /* Give transmitter one third (rounded up) */
53528 + rxspace = ramsize - (ramsize + 2) / 3;
53529 +
53530 + sky2_ramset(hw, rxqaddr[port], 0, rxspace);
53531 + sky2_ramset(hw, txqaddr[port], rxspace, ramsize);
53532 +
53533 + /* Make sure SyncQ is disabled */
53534 + sky2_write8(hw, RB_ADDR(port == 0 ? Q_XS1 : Q_XS2, RB_CTRL),
53535 + RB_RST_SET);
53536 +
53537 + sky2_qset(hw, txqaddr[port]);
53538 +
53539 + /* Set almost empty threshold */
53540 + if (hw->chip_id == CHIP_ID_YUKON_EC_U && hw->chip_rev == 1)
53541 + sky2_write16(hw, Q_ADDR(txqaddr[port], Q_AL), 0x1a0);
53542 +
53543 + sky2_prefetch_init(hw, txqaddr[port], sky2->tx_le_map,
53544 + TX_RING_SIZE - 1);
53545 +
53546 + err = sky2_rx_start(sky2);
53547 + if (err)
53548 + goto err_out;
53549 +
53550 + /* Enable interrupts from phy/mac for port */
53551 + spin_lock_irq(&hw->hw_lock);
53552 + hw->intr_mask |= (port == 0) ? Y2_IS_PORT_1 : Y2_IS_PORT_2;
53553 + sky2_write32(hw, B0_IMSK, hw->intr_mask);
53554 + spin_unlock_irq(&hw->hw_lock);
53555 + return 0;
53556 +
53557 +err_out:
53558 + if (sky2->rx_le) {
53559 + pci_free_consistent(hw->pdev, RX_LE_BYTES,
53560 + sky2->rx_le, sky2->rx_le_map);
53561 + sky2->rx_le = NULL;
53562 + }
53563 + if (sky2->tx_le) {
53564 + pci_free_consistent(hw->pdev,
53565 + TX_RING_SIZE * sizeof(struct sky2_tx_le),
53566 + sky2->tx_le, sky2->tx_le_map);
53567 + sky2->tx_le = NULL;
53568 + }
53569 + kfree(sky2->tx_ring);
53570 + kfree(sky2->rx_ring);
53571 +
53572 + sky2->tx_ring = NULL;
53573 + sky2->rx_ring = NULL;
53574 + return err;
53575 +}
53576 +
53577 +/* Modular subtraction in ring */
53578 +static inline int tx_dist(unsigned tail, unsigned head)
53579 +{
53580 + return (head - tail) % TX_RING_SIZE;
53581 +}
53582 +
53583 +/* Number of list elements available for next tx */
53584 +static inline int tx_avail(const struct sky2_port *sky2)
53585 +{
53586 + return sky2->tx_pending - tx_dist(sky2->tx_cons, sky2->tx_prod);
53587 +}
53588 +
53589 +/* Estimate of number of transmit list elements required */
53590 +static unsigned tx_le_req(const struct sk_buff *skb)
53591 +{
53592 + unsigned count;
53593 +
53594 + count = sizeof(dma_addr_t) / sizeof(u32);
53595 + count += skb_shinfo(skb)->nr_frags * count;
53596 +
53597 + if (skb_shinfo(skb)->gso_size)
53598 + ++count;
53599 +
53600 + if (skb->ip_summed == CHECKSUM_HW)
53601 + ++count;
53602 +
53603 + return count;
53604 +}
53605 +
53606 +/*
53607 + * Put one packet in ring for transmit.
53608 + * A single packet can generate multiple list elements, and
53609 + * the number of ring elements will probably be less than the number
53610 + * of list elements used.
53611 + *
53612 + * No BH disabling for tx_lock here (like tg3)
53613 + */
53614 +static int sky2_xmit_frame(struct sk_buff *skb, struct net_device *dev)
53615 +{
53616 + struct sky2_port *sky2 = netdev_priv(dev);
53617 + struct sky2_hw *hw = sky2->hw;
53618 + struct sky2_tx_le *le = NULL;
53619 + struct tx_ring_info *re;
53620 + unsigned i, len;
53621 + int avail;
53622 + dma_addr_t mapping;
53623 + u32 addr64;
53624 + u16 mss;
53625 + u8 ctrl;
53626 +
53627 + /* No BH disabling for tx_lock here. We are running in BH disabled
53628 + * context and TX reclaim runs via poll inside of a software
53629 +	 * interrupt, and no related locks are taken in IRQ processing.
53630 + */
53631 + if (!spin_trylock(&sky2->tx_lock))
53632 + return NETDEV_TX_LOCKED;
53633 +
53634 + if (unlikely(tx_avail(sky2) < tx_le_req(skb))) {
53635 + /* There is a known but harmless race with lockless tx
53636 + * and netif_stop_queue.
53637 + */
53638 + if (!netif_queue_stopped(dev)) {
53639 + netif_stop_queue(dev);
53640 + if (net_ratelimit())
53641 + printk(KERN_WARNING PFX "%s: ring full when queue awake!\n",
53642 + dev->name);
53643 + }
53644 + spin_unlock(&sky2->tx_lock);
53645 +
53646 + return NETDEV_TX_BUSY;
53647 + }
53648 +
53649 + if (unlikely(netif_msg_tx_queued(sky2)))
53650 + printk(KERN_DEBUG "%s: tx queued, slot %u, len %d\n",
53651 + dev->name, sky2->tx_prod, skb->len);
53652 +
53653 + len = skb_headlen(skb);
53654 + mapping = pci_map_single(hw->pdev, skb->data, len, PCI_DMA_TODEVICE);
53655 + addr64 = high32(mapping);
53656 +
53657 + re = sky2->tx_ring + sky2->tx_prod;
53658 +
53659 + /* Send high bits if changed or crosses boundary */
53660 + if (addr64 != sky2->tx_addr64 || high32(mapping + len) != sky2->tx_addr64) {
53661 + le = get_tx_le(sky2);
53662 + le->tx.addr = cpu_to_le32(addr64);
53663 + le->ctrl = 0;
53664 + le->opcode = OP_ADDR64 | HW_OWNER;
53665 + sky2->tx_addr64 = high32(mapping + len);
53666 + }
53667 +
53668 + /* Check for TCP Segmentation Offload */
53669 + mss = skb_shinfo(skb)->gso_size;
53670 + if (mss != 0) {
53671 + /* just drop the packet if non-linear expansion fails */
53672 + if (skb_header_cloned(skb) &&
53673 + pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) {
53674 + dev_kfree_skb_any(skb);
53675 + goto out_unlock;
53676 + }
53677 +
53678 + mss += ((skb->h.th->doff - 5) * 4); /* TCP options */
53679 + mss += (skb->nh.iph->ihl * 4) + sizeof(struct tcphdr);
53680 + mss += ETH_HLEN;
53681 + }
53682 +
53683 + if (mss != sky2->tx_last_mss) {
53684 + le = get_tx_le(sky2);
53685 + le->tx.tso.size = cpu_to_le16(mss);
53686 + le->tx.tso.rsvd = 0;
53687 + le->opcode = OP_LRGLEN | HW_OWNER;
53688 + le->ctrl = 0;
53689 + sky2->tx_last_mss = mss;
53690 + }
53691 +
53692 + ctrl = 0;
53693 +#ifdef SKY2_VLAN_TAG_USED
53694 + /* Add VLAN tag, can piggyback on LRGLEN or ADDR64 */
53695 + if (sky2->vlgrp && vlan_tx_tag_present(skb)) {
53696 + if (!le) {
53697 + le = get_tx_le(sky2);
53698 + le->tx.addr = 0;
53699 + le->opcode = OP_VLAN|HW_OWNER;
53700 + le->ctrl = 0;
53701 + } else
53702 + le->opcode |= OP_VLAN;
53703 + le->length = cpu_to_be16(vlan_tx_tag_get(skb));
53704 + ctrl |= INS_VLAN;
53705 + }
53706 +#endif
53707 +
53708 + /* Handle TCP checksum offload */
53709 + if (skb->ip_summed == CHECKSUM_HW) {
53710 + u16 hdr = skb->h.raw - skb->data;
53711 + u16 offset = hdr + skb->csum;
53712 +
53713 + ctrl = CALSUM | WR_SUM | INIT_SUM | LOCK_SUM;
53714 + if (skb->nh.iph->protocol == IPPROTO_UDP)
53715 + ctrl |= UDPTCP;
53716 +
53717 + le = get_tx_le(sky2);
53718 + le->tx.csum.start = cpu_to_le16(hdr);
53719 + le->tx.csum.offset = cpu_to_le16(offset);
53720 + le->length = 0; /* initial checksum value */
53721 + le->ctrl = 1; /* one packet */
53722 + le->opcode = OP_TCPLISW | HW_OWNER;
53723 + }
53724 +
53725 + le = get_tx_le(sky2);
53726 + le->tx.addr = cpu_to_le32((u32) mapping);
53727 + le->length = cpu_to_le16(len);
53728 + le->ctrl = ctrl;
53729 + le->opcode = mss ? (OP_LARGESEND | HW_OWNER) : (OP_PACKET | HW_OWNER);
53730 +
53731 + /* Record the transmit mapping info */
53732 + re->skb = skb;
53733 + pci_unmap_addr_set(re, mapaddr, mapping);
53734 +
53735 + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
53736 + skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
53737 + struct tx_ring_info *fre;
53738 +
53739 + mapping = pci_map_page(hw->pdev, frag->page, frag->page_offset,
53740 + frag->size, PCI_DMA_TODEVICE);
53741 + addr64 = high32(mapping);
53742 + if (addr64 != sky2->tx_addr64) {
53743 + le = get_tx_le(sky2);
53744 + le->tx.addr = cpu_to_le32(addr64);
53745 + le->ctrl = 0;
53746 + le->opcode = OP_ADDR64 | HW_OWNER;
53747 + sky2->tx_addr64 = addr64;
53748 + }
53749 +
53750 + le = get_tx_le(sky2);
53751 + le->tx.addr = cpu_to_le32((u32) mapping);
53752 + le->length = cpu_to_le16(frag->size);
53753 + le->ctrl = ctrl;
53754 + le->opcode = OP_BUFFER | HW_OWNER;
53755 +
53756 + fre = sky2->tx_ring
53757 + + ((re - sky2->tx_ring) + i + 1) % TX_RING_SIZE;
53758 + pci_unmap_addr_set(fre, mapaddr, mapping);
53759 + }
53760 +
53761 + re->idx = sky2->tx_prod;
53762 + le->ctrl |= EOP;
53763 +
53764 + avail = tx_avail(sky2);
53765 + if (mss != 0 || avail < TX_MIN_PENDING) {
53766 + le->ctrl |= FRC_STAT;
53767 + if (avail <= MAX_SKB_TX_LE)
53768 + netif_stop_queue(dev);
53769 + }
53770 +
53771 + sky2_put_idx(hw, txqaddr[sky2->port], sky2->tx_prod,
53772 + &sky2->tx_last_put, TX_RING_SIZE);
53773 +
53774 +out_unlock:
53775 + spin_unlock(&sky2->tx_lock);
53776 +
53777 + dev->trans_start = jiffies;
53778 + return NETDEV_TX_OK;
53779 +}
53780 +
53781 +/*
53782 + * Free ring elements starting at tx_cons until "done"
53783 + *
53784 + * NB: the hardware will tell us about partial completion of multi-part
53785 + * buffers; these are deferred until completion.
53786 + */
53787 +static void sky2_tx_complete(struct sky2_port *sky2, u16 done)
53788 +{
53789 + struct net_device *dev = sky2->netdev;
53790 + struct pci_dev *pdev = sky2->hw->pdev;
53791 + u16 nxt, put;
53792 + unsigned i;
53793 +
53794 + BUG_ON(done >= TX_RING_SIZE);
53795 +
53796 + if (unlikely(netif_msg_tx_done(sky2)))
53797 + printk(KERN_DEBUG "%s: tx done, up to %u\n",
53798 + dev->name, done);
53799 +
53800 + for (put = sky2->tx_cons; put != done; put = nxt) {
53801 + struct tx_ring_info *re = sky2->tx_ring + put;
53802 + struct sk_buff *skb = re->skb;
53803 +
53804 + nxt = re->idx;
53805 + BUG_ON(nxt >= TX_RING_SIZE);
53806 + prefetch(sky2->tx_ring + nxt);
53807 +
53808 + /* Check for partial status */
53809 + if (tx_dist(put, done) < tx_dist(put, nxt))
53810 + break;
53811 +
53812 + skb = re->skb;
53813 + pci_unmap_single(pdev, pci_unmap_addr(re, mapaddr),
53814 + skb_headlen(skb), PCI_DMA_TODEVICE);
53815 +
53816 + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
53817 + struct tx_ring_info *fre;
53818 + fre = sky2->tx_ring + (put + i + 1) % TX_RING_SIZE;
53819 + pci_unmap_page(pdev, pci_unmap_addr(fre, mapaddr),
53820 + skb_shinfo(skb)->frags[i].size,
53821 + PCI_DMA_TODEVICE);
53822 + }
53823 +
53824 + dev_kfree_skb_any(skb);
53825 + }
53826 +
53827 + sky2->tx_cons = put;
53828 + if (netif_queue_stopped(dev) && tx_avail(sky2) > MAX_SKB_TX_LE)
53829 + netif_wake_queue(dev);
53830 +}
53831 +
53832 +/* Clean up all untransmitted buffers; assumes the transmitter is not running */
53833 +static void sky2_tx_clean(struct sky2_port *sky2)
53834 +{
53835 + spin_lock_bh(&sky2->tx_lock);
53836 + sky2_tx_complete(sky2, sky2->tx_prod);
53837 + spin_unlock_bh(&sky2->tx_lock);
53838 +}
53839 +
53840 +/* Network shutdown */
53841 +static int sky2_down(struct net_device *dev)
53842 +{
53843 + struct sky2_port *sky2 = netdev_priv(dev);
53844 + struct sky2_hw *hw = sky2->hw;
53845 + unsigned port = sky2->port;
53846 + u16 ctrl;
53847 +
53848 + /* Never really got started! */
53849 + if (!sky2->tx_le)
53850 + return 0;
53851 +
53852 + if (netif_msg_ifdown(sky2))
53853 + printk(KERN_INFO PFX "%s: disabling interface\n", dev->name);
53854 +
53855 + /* Stop more packets from being queued */
53856 + netif_stop_queue(dev);
53857 +
53858 + /* Disable port IRQ */
53859 + spin_lock_irq(&hw->hw_lock);
53860 + hw->intr_mask &= ~((sky2->port == 0) ? Y2_IS_IRQ_PHY1 : Y2_IS_IRQ_PHY2);
53861 + sky2_write32(hw, B0_IMSK, hw->intr_mask);
53862 + spin_unlock_irq(&hw->hw_lock);
53863 +
53864 + flush_scheduled_work();
53865 +
53866 + sky2_phy_reset(hw, port);
53867 +
53868 + /* Stop transmitter */
53869 + sky2_write32(hw, Q_ADDR(txqaddr[port], Q_CSR), BMU_STOP);
53870 + sky2_read32(hw, Q_ADDR(txqaddr[port], Q_CSR));
53871 +
53872 + sky2_write32(hw, RB_ADDR(txqaddr[port], RB_CTRL),
53873 + RB_RST_SET | RB_DIS_OP_MD);
53874 +
53875 + ctrl = gma_read16(hw, port, GM_GP_CTRL);
53876 + ctrl &= ~(GM_GPCR_TX_ENA | GM_GPCR_RX_ENA);
53877 + gma_write16(hw, port, GM_GP_CTRL, ctrl);
53878 +
53879 + sky2_write8(hw, SK_REG(port, GPHY_CTRL), GPC_RST_SET);
53880 +
53881 + /* Workaround shared GMAC reset */
53882 + if (!(hw->chip_id == CHIP_ID_YUKON_XL && hw->chip_rev == 0
53883 + && port == 0 && hw->dev[1] && netif_running(hw->dev[1])))
53884 + sky2_write8(hw, SK_REG(port, GMAC_CTRL), GMC_RST_SET);
53885 +
53886 + /* Disable Force Sync bit and Enable Alloc bit */
53887 + sky2_write8(hw, SK_REG(port, TXA_CTRL),
53888 + TXA_DIS_FSYNC | TXA_DIS_ALLOC | TXA_STOP_RC);
53889 +
53890 + /* Stop Interval Timer and Limit Counter of Tx Arbiter */
53891 + sky2_write32(hw, SK_REG(port, TXA_ITI_INI), 0L);
53892 + sky2_write32(hw, SK_REG(port, TXA_LIM_INI), 0L);
53893 +
53894 + /* Reset the PCI FIFO of the async Tx queue */
53895 + sky2_write32(hw, Q_ADDR(txqaddr[port], Q_CSR),
53896 + BMU_RST_SET | BMU_FIFO_RST);
53897 +
53898 + /* Reset the Tx prefetch units */
53899 + sky2_write32(hw, Y2_QADDR(txqaddr[port], PREF_UNIT_CTRL),
53900 + PREF_UNIT_RST_SET);
53901 +
53902 + sky2_write32(hw, RB_ADDR(txqaddr[port], RB_CTRL), RB_RST_SET);
53903 +
53904 + sky2_rx_stop(sky2);
53905 +
53906 + sky2_write8(hw, SK_REG(port, RX_GMF_CTRL_T), GMF_RST_SET);
53907 + sky2_write8(hw, SK_REG(port, TX_GMF_CTRL_T), GMF_RST_SET);
53908 +
53909 + /* turn off LED's */
53910 + sky2_write16(hw, B0_Y2LED, LED_STAT_OFF);
53911 +
53912 + synchronize_irq(hw->pdev->irq);
53913 +
53914 + sky2_tx_clean(sky2);
53915 + sky2_rx_clean(sky2);
53916 +
53917 + pci_free_consistent(hw->pdev, RX_LE_BYTES,
53918 + sky2->rx_le, sky2->rx_le_map);
53919 + kfree(sky2->rx_ring);
53920 +
53921 + pci_free_consistent(hw->pdev,
53922 + TX_RING_SIZE * sizeof(struct sky2_tx_le),
53923 + sky2->tx_le, sky2->tx_le_map);
53924 + kfree(sky2->tx_ring);
53925 +
53926 + sky2->tx_le = NULL;
53927 + sky2->rx_le = NULL;
53928 +
53929 + sky2->rx_ring = NULL;
53930 + sky2->tx_ring = NULL;
53931 +
53932 + return 0;
53933 +}
53934 +
53935 +static u16 sky2_phy_speed(const struct sky2_hw *hw, u16 aux)
53936 +{
53937 + if (!sky2_is_copper(hw))
53938 + return SPEED_1000;
53939 +
53940 + if (hw->chip_id == CHIP_ID_YUKON_FE)
53941 + return (aux & PHY_M_PS_SPEED_100) ? SPEED_100 : SPEED_10;
53942 +
53943 + switch (aux & PHY_M_PS_SPEED_MSK) {
53944 + case PHY_M_PS_SPEED_1000:
53945 + return SPEED_1000;
53946 + case PHY_M_PS_SPEED_100:
53947 + return SPEED_100;
53948 + default:
53949 + return SPEED_10;
53950 + }
53951 +}
53952 +
53953 +static void sky2_link_up(struct sky2_port *sky2)
53954 +{
53955 + struct sky2_hw *hw = sky2->hw;
53956 + unsigned port = sky2->port;
53957 + u16 reg;
53958 +
53959 + /* Enable Transmit FIFO Underrun */
53960 + sky2_write8(hw, SK_REG(port, GMAC_IRQ_MSK), GMAC_DEF_MSK);
53961 +
53962 + reg = gma_read16(hw, port, GM_GP_CTRL);
53963 + if (sky2->autoneg == AUTONEG_DISABLE) {
53964 + reg |= GM_GPCR_AU_ALL_DIS;
53965 +
53966 + /* Is write/read necessary? Copied from sky2_mac_init */
53967 + gma_write16(hw, port, GM_GP_CTRL, reg);
53968 + gma_read16(hw, port, GM_GP_CTRL);
53969 +
53970 + switch (sky2->speed) {
53971 + case SPEED_1000:
53972 + reg &= ~GM_GPCR_SPEED_100;
53973 + reg |= GM_GPCR_SPEED_1000;
53974 + break;
53975 + case SPEED_100:
53976 + reg &= ~GM_GPCR_SPEED_1000;
53977 + reg |= GM_GPCR_SPEED_100;
53978 + break;
53979 + case SPEED_10:
53980 + reg &= ~(GM_GPCR_SPEED_1000 | GM_GPCR_SPEED_100);
53981 + break;
53982 + }
53983 + } else
53984 + reg &= ~GM_GPCR_AU_ALL_DIS;
53985 +
53986 + if (sky2->duplex == DUPLEX_FULL || sky2->autoneg == AUTONEG_ENABLE)
53987 + reg |= GM_GPCR_DUP_FULL;
53988 +
53989 + /* enable Rx/Tx */
53990 + reg |= GM_GPCR_RX_ENA | GM_GPCR_TX_ENA;
53991 + gma_write16(hw, port, GM_GP_CTRL, reg);
53992 + gma_read16(hw, port, GM_GP_CTRL);
53993 +
53994 + gm_phy_write(hw, port, PHY_MARV_INT_MASK, PHY_M_DEF_MSK);
53995 +
53996 + netif_carrier_on(sky2->netdev);
53997 + netif_wake_queue(sky2->netdev);
53998 +
53999 + /* Turn on link LED */
54000 + sky2_write8(hw, SK_REG(port, LNK_LED_REG),
54001 + LINKLED_ON | LINKLED_BLINK_OFF | LINKLED_LINKSYNC_OFF);
54002 +
54003 + if (hw->chip_id == CHIP_ID_YUKON_XL) {
54004 + u16 pg = gm_phy_read(hw, port, PHY_MARV_EXT_ADR);
54005 +
54006 + gm_phy_write(hw, port, PHY_MARV_EXT_ADR, 3);
54007 + gm_phy_write(hw, port, PHY_MARV_PHY_CTRL, PHY_M_LEDC_LOS_CTRL(1) | /* LINK/ACT */
54008 + PHY_M_LEDC_INIT_CTRL(sky2->speed ==
54009 + SPEED_10 ? 7 : 0) |
54010 + PHY_M_LEDC_STA1_CTRL(sky2->speed ==
54011 + SPEED_100 ? 7 : 0) |
54012 + PHY_M_LEDC_STA0_CTRL(sky2->speed ==
54013 + SPEED_1000 ? 7 : 0));
54014 + gm_phy_write(hw, port, PHY_MARV_EXT_ADR, pg);
54015 + }
54016 +
54017 + if (netif_msg_link(sky2))
54018 + printk(KERN_INFO PFX
54019 + "%s: Link is up at %d Mbps, %s duplex, flow control %s\n",
54020 + sky2->netdev->name, sky2->speed,
54021 + sky2->duplex == DUPLEX_FULL ? "full" : "half",
54022 + (sky2->tx_pause && sky2->rx_pause) ? "both" :
54023 + sky2->tx_pause ? "tx" : sky2->rx_pause ? "rx" : "none");
54024 +}
54025 +
54026 +static void sky2_link_down(struct sky2_port *sky2)
54027 +{
54028 + struct sky2_hw *hw = sky2->hw;
54029 + unsigned port = sky2->port;
54030 + u16 reg;
54031 +
54032 + gm_phy_write(hw, port, PHY_MARV_INT_MASK, 0);
54033 +
54034 + reg = gma_read16(hw, port, GM_GP_CTRL);
54035 + reg &= ~(GM_GPCR_RX_ENA | GM_GPCR_TX_ENA);
54036 + gma_write16(hw, port, GM_GP_CTRL, reg);
54037 + gma_read16(hw, port, GM_GP_CTRL); /* PCI post */
54038 +
54039 + if (sky2->rx_pause && !sky2->tx_pause) {
54040 + /* restore Asymmetric Pause bit */
54041 + gm_phy_write(hw, port, PHY_MARV_AUNE_ADV,
54042 + gm_phy_read(hw, port, PHY_MARV_AUNE_ADV)
54043 + | PHY_M_AN_ASP);
54044 + }
54045 +
54046 + netif_carrier_off(sky2->netdev);
54047 + netif_stop_queue(sky2->netdev);
54048 +
54049 +	/* Turn off link LED */
54050 + sky2_write8(hw, SK_REG(port, LNK_LED_REG), LINKLED_OFF);
54051 +
54052 + if (netif_msg_link(sky2))
54053 + printk(KERN_INFO PFX "%s: Link is down.\n", sky2->netdev->name);
54054 + sky2_phy_init(hw, port);
54055 +}
54056 +
54057 +static int sky2_autoneg_done(struct sky2_port *sky2, u16 aux)
54058 +{
54059 + struct sky2_hw *hw = sky2->hw;
54060 + unsigned port = sky2->port;
54061 + u16 lpa;
54062 +
54063 + lpa = gm_phy_read(hw, port, PHY_MARV_AUNE_LP);
54064 +
54065 + if (lpa & PHY_M_AN_RF) {
54066 + printk(KERN_ERR PFX "%s: remote fault", sky2->netdev->name);
54067 + return -1;
54068 + }
54069 +
54070 + if (hw->chip_id != CHIP_ID_YUKON_FE &&
54071 + gm_phy_read(hw, port, PHY_MARV_1000T_STAT) & PHY_B_1000S_MSF) {
54072 + printk(KERN_ERR PFX "%s: master/slave fault",
54073 + sky2->netdev->name);
54074 + return -1;
54075 + }
54076 +
54077 + if (!(aux & PHY_M_PS_SPDUP_RES)) {
54078 + printk(KERN_ERR PFX "%s: speed/duplex mismatch",
54079 + sky2->netdev->name);
54080 + return -1;
54081 + }
54082 +
54083 + sky2->duplex = (aux & PHY_M_PS_FULL_DUP) ? DUPLEX_FULL : DUPLEX_HALF;
54084 +
54085 + sky2->speed = sky2_phy_speed(hw, aux);
54086 +
54087 + /* Pause bits are offset (9..8) */
54088 + if (hw->chip_id == CHIP_ID_YUKON_XL)
54089 + aux >>= 6;
54090 +
54091 + sky2->rx_pause = (aux & PHY_M_PS_RX_P_EN) != 0;
54092 + sky2->tx_pause = (aux & PHY_M_PS_TX_P_EN) != 0;
54093 +
54094 + if ((sky2->tx_pause || sky2->rx_pause)
54095 + && !(sky2->speed < SPEED_1000 && sky2->duplex == DUPLEX_HALF))
54096 + sky2_write8(hw, SK_REG(port, GMAC_CTRL), GMC_PAUSE_ON);
54097 + else
54098 + sky2_write8(hw, SK_REG(port, GMAC_CTRL), GMC_PAUSE_OFF);
54099 +
54100 + return 0;
54101 +}
54102 +
54103 +/*
54104 + * Interrupts from the PHY are handled outside of interrupt context
54105 + * because accessing PHY registers requires a spin wait, which might
54106 + * cause excess interrupt latency.
54107 + */
54108 +static void sky2_phy_task(void *arg)
54109 +{
54110 + struct sky2_port *sky2 = arg;
54111 + struct sky2_hw *hw = sky2->hw;
54112 + u16 istatus, phystat;
54113 +
54114 + down(&sky2->phy_sema);
54115 + istatus = gm_phy_read(hw, sky2->port, PHY_MARV_INT_STAT);
54116 + phystat = gm_phy_read(hw, sky2->port, PHY_MARV_PHY_STAT);
54117 +
54118 + if (netif_msg_intr(sky2))
54119 + printk(KERN_INFO PFX "%s: phy interrupt status 0x%x 0x%x\n",
54120 + sky2->netdev->name, istatus, phystat);
54121 +
54122 + if (istatus & PHY_M_IS_AN_COMPL) {
54123 + if (sky2_autoneg_done(sky2, phystat) == 0)
54124 + sky2_link_up(sky2);
54125 + goto out;
54126 + }
54127 +
54128 + if (istatus & PHY_M_IS_LSP_CHANGE)
54129 + sky2->speed = sky2_phy_speed(hw, phystat);
54130 +
54131 + if (istatus & PHY_M_IS_DUP_CHANGE)
54132 + sky2->duplex =
54133 + (phystat & PHY_M_PS_FULL_DUP) ? DUPLEX_FULL : DUPLEX_HALF;
54134 +
54135 + if (istatus & PHY_M_IS_LST_CHANGE) {
54136 + if (phystat & PHY_M_PS_LINK_UP)
54137 + sky2_link_up(sky2);
54138 + else
54139 + sky2_link_down(sky2);
54140 + }
54141 +out:
54142 + up(&sky2->phy_sema);
54143 +
54144 + spin_lock_irq(&hw->hw_lock);
54145 + hw->intr_mask |= (sky2->port == 0) ? Y2_IS_IRQ_PHY1 : Y2_IS_IRQ_PHY2;
54146 + sky2_write32(hw, B0_IMSK, hw->intr_mask);
54147 + spin_unlock_irq(&hw->hw_lock);
54148 +}
54149 +
54150 +
54151 +/* Transmit timeout is only called if we are running, carrier is up
54152 + * and tx queue is full (stopped).
54153 + */
54154 +static void sky2_tx_timeout(struct net_device *dev)
54155 +{
54156 + struct sky2_port *sky2 = netdev_priv(dev);
54157 + struct sky2_hw *hw = sky2->hw;
54158 + unsigned txq = txqaddr[sky2->port];
54159 + u16 ridx;
54160 +
54161 +	/* Maybe we just missed a status interrupt */
54162 + spin_lock(&sky2->tx_lock);
54163 + ridx = sky2_read16(hw,
54164 + sky2->port == 0 ? STAT_TXA1_RIDX : STAT_TXA2_RIDX);
54165 + sky2_tx_complete(sky2, ridx);
54166 + spin_unlock(&sky2->tx_lock);
54167 +
54168 + if (!netif_queue_stopped(dev)) {
54169 + if (net_ratelimit())
54170 + pr_info(PFX "transmit interrupt missed? recovered\n");
54171 + return;
54172 + }
54173 +
54174 + if (netif_msg_timer(sky2))
54175 + printk(KERN_ERR PFX "%s: tx timeout\n", dev->name);
54176 +
54177 + sky2_write32(hw, Q_ADDR(txq, Q_CSR), BMU_STOP);
54178 + sky2_write32(hw, Y2_QADDR(txq, PREF_UNIT_CTRL), PREF_UNIT_RST_SET);
54179 +
54180 + sky2_tx_clean(sky2);
54181 +
54182 + sky2_qset(hw, txq);
54183 + sky2_prefetch_init(hw, txq, sky2->tx_le_map, TX_RING_SIZE - 1);
54184 +}
54185 +
54186 +
54187 +#define roundup(x, y) ((((x)+((y)-1))/(y))*(y))
54188 +/* Want the receive buffer size to be a multiple of 64 bits
54189 + * and to include room for the VLAN tag and truncation
54190 + */
54191 +static inline unsigned sky2_buf_size(int mtu)
54192 +{
54193 + return roundup(mtu + ETH_HLEN + VLAN_HLEN, 8) + 8;
54194 +}
54195 +
54196 +static int sky2_change_mtu(struct net_device *dev, int new_mtu)
54197 +{
54198 + struct sky2_port *sky2 = netdev_priv(dev);
54199 + struct sky2_hw *hw = sky2->hw;
54200 + int err;
54201 + u16 ctl, mode;
54202 +
54203 + if (new_mtu < ETH_ZLEN || new_mtu > ETH_JUMBO_MTU)
54204 + return -EINVAL;
54205 +
54206 + if (hw->chip_id == CHIP_ID_YUKON_EC_U && new_mtu > ETH_DATA_LEN)
54207 + return -EINVAL;
54208 +
54209 + if (!netif_running(dev)) {
54210 + dev->mtu = new_mtu;
54211 + return 0;
54212 + }
54213 +
54214 + sky2_write32(hw, B0_IMSK, 0);
54215 +
54216 + dev->trans_start = jiffies; /* prevent tx timeout */
54217 + netif_stop_queue(dev);
54218 + netif_poll_disable(hw->dev[0]);
54219 +
54220 + ctl = gma_read16(hw, sky2->port, GM_GP_CTRL);
54221 + gma_write16(hw, sky2->port, GM_GP_CTRL, ctl & ~GM_GPCR_RX_ENA);
54222 + sky2_rx_stop(sky2);
54223 + sky2_rx_clean(sky2);
54224 +
54225 + dev->mtu = new_mtu;
54226 + sky2->rx_bufsize = sky2_buf_size(new_mtu);
54227 + mode = DATA_BLIND_VAL(DATA_BLIND_DEF) |
54228 + GM_SMOD_VLAN_ENA | IPG_DATA_VAL(IPG_DATA_DEF);
54229 +
54230 + if (dev->mtu > ETH_DATA_LEN)
54231 + mode |= GM_SMOD_JUMBO_ENA;
54232 +
54233 + gma_write16(hw, sky2->port, GM_SERIAL_MODE, mode);
54234 +
54235 + sky2_write8(hw, RB_ADDR(rxqaddr[sky2->port], RB_CTRL), RB_ENA_OP_MD);
54236 +
54237 + err = sky2_rx_start(sky2);
54238 + sky2_write32(hw, B0_IMSK, hw->intr_mask);
54239 +
54240 + if (err)
54241 + dev_close(dev);
54242 + else {
54243 + gma_write16(hw, sky2->port, GM_GP_CTRL, ctl);
54244 +
54245 + netif_poll_enable(hw->dev[0]);
54246 + netif_wake_queue(dev);
54247 + }
54248 +
54249 + return err;
54250 +}
54251 +
54252 +/*
54253 + * Receive one packet.
54254 + * For small packets or errors, just reuse the existing skb.
54255 + * For larger packets, get a new buffer.
54256 + */
54257 +static struct sk_buff *sky2_receive(struct sky2_port *sky2,
54258 + u16 length, u32 status)
54259 +{
54260 + struct ring_info *re = sky2->rx_ring + sky2->rx_next;
54261 + struct sk_buff *skb = NULL;
54262 +
54263 + if (unlikely(netif_msg_rx_status(sky2)))
54264 + printk(KERN_DEBUG PFX "%s: rx slot %u status 0x%x len %d\n",
54265 + sky2->netdev->name, sky2->rx_next, status, length);
54266 +
54267 + sky2->rx_next = (sky2->rx_next + 1) % sky2->rx_pending;
54268 + prefetch(sky2->rx_ring + sky2->rx_next);
54269 +
54270 + if (status & GMR_FS_ANY_ERR)
54271 + goto error;
54272 +
54273 + if (!(status & GMR_FS_RX_OK))
54274 + goto resubmit;
54275 +
54276 + if (length > sky2->netdev->mtu + ETH_HLEN)
54277 + goto oversize;
54278 +
54279 + if (length < copybreak) {
54280 + skb = dev_alloc_skb(length + 2);
54281 + if (!skb)
54282 + goto resubmit;
54283 +
54284 + skb_reserve(skb, 2);
54285 + pci_dma_sync_single_for_cpu(sky2->hw->pdev, re->mapaddr,
54286 + length, PCI_DMA_FROMDEVICE);
54287 + memcpy(skb->data, re->skb->data, length);
54288 + skb->ip_summed = re->skb->ip_summed;
54289 + skb->csum = re->skb->csum;
54290 + pci_dma_sync_single_for_device(sky2->hw->pdev, re->mapaddr,
54291 + length, PCI_DMA_FROMDEVICE);
54292 + } else {
54293 + struct sk_buff *nskb;
54294 +
54295 + nskb = sky2_alloc_skb(sky2->rx_bufsize, GFP_ATOMIC);
54296 + if (!nskb)
54297 + goto resubmit;
54298 +
54299 + skb = re->skb;
54300 + re->skb = nskb;
54301 + pci_unmap_single(sky2->hw->pdev, re->mapaddr,
54302 + sky2->rx_bufsize, PCI_DMA_FROMDEVICE);
54303 + prefetch(skb->data);
54304 +
54305 + re->mapaddr = pci_map_single(sky2->hw->pdev, nskb->data,
54306 + sky2->rx_bufsize, PCI_DMA_FROMDEVICE);
54307 + }
54308 +
54309 + skb_put(skb, length);
54310 +resubmit:
54311 + re->skb->ip_summed = CHECKSUM_NONE;
54312 + sky2_rx_add(sky2, re->mapaddr);
54313 +
54314 + /* Tell receiver about new buffers. */
54315 + sky2_put_idx(sky2->hw, rxqaddr[sky2->port], sky2->rx_put,
54316 + &sky2->rx_last_put, RX_LE_SIZE);
54317 +
54318 + return skb;
54319 +
54320 +oversize:
54321 + ++sky2->net_stats.rx_over_errors;
54322 + goto resubmit;
54323 +
54324 +error:
54325 + ++sky2->net_stats.rx_errors;
54326 +
54327 + if (netif_msg_rx_err(sky2) && net_ratelimit())
54328 + printk(KERN_INFO PFX "%s: rx error, status 0x%x length %d\n",
54329 + sky2->netdev->name, status, length);
54330 +
54331 + if (status & (GMR_FS_LONG_ERR | GMR_FS_UN_SIZE))
54332 + sky2->net_stats.rx_length_errors++;
54333 + if (status & GMR_FS_FRAGMENT)
54334 + sky2->net_stats.rx_frame_errors++;
54335 + if (status & GMR_FS_CRC_ERR)
54336 + sky2->net_stats.rx_crc_errors++;
54337 + if (status & GMR_FS_RX_FF_OV)
54338 + sky2->net_stats.rx_fifo_errors++;
54339 +
54340 + goto resubmit;
54341 +}
54342 +
54343 +/*
54344 + * Check for transmit complete
54345 + */
54346 +#define TX_NO_STATUS 0xffff
54347 +
54348 +static void sky2_tx_check(struct sky2_hw *hw, int port, u16 last)
54349 +{
54350 + if (last != TX_NO_STATUS) {
54351 + struct net_device *dev = hw->dev[port];
54352 + if (dev && netif_running(dev)) {
54353 + struct sky2_port *sky2 = netdev_priv(dev);
54354 +
54355 + spin_lock(&sky2->tx_lock);
54356 + sky2_tx_complete(sky2, last);
54357 + spin_unlock(&sky2->tx_lock);
54358 + }
54359 + }
54360 +}
54361 +
54362 +/*
54363 + * Both ports share the same status interrupt, so there is only
54364 + * one poll routine.
54365 + */
54366 +static int sky2_poll(struct net_device *dev0, int *budget)
54367 +{
54368 + struct sky2_hw *hw = ((struct sky2_port *) netdev_priv(dev0))->hw;
54369 + unsigned int to_do = min(dev0->quota, *budget);
54370 + unsigned int work_done = 0;
54371 + u16 hwidx;
54372 + u16 tx_done[2] = { TX_NO_STATUS, TX_NO_STATUS };
54373 +
54374 + sky2_write32(hw, STAT_CTRL, SC_STAT_CLR_IRQ);
54375 +
54376 + /*
54377 + * Kick the STAT_LEV_TIMER_CTRL timer.
54378 + * This fixes my hangs on Yukon-EC (0xb6) rev 1.
54379 + * The if clause is there to start the timer only if it has been
54380 + * configured correctly and not been disabled via ethtool.
54381 + */
54382 + if (sky2_read8(hw, STAT_LEV_TIMER_CTRL) == TIM_START) {
54383 + sky2_write8(hw, STAT_LEV_TIMER_CTRL, TIM_STOP);
54384 + sky2_write8(hw, STAT_LEV_TIMER_CTRL, TIM_START);
54385 + }
54386 +
54387 + hwidx = sky2_read16(hw, STAT_PUT_IDX);
54388 + BUG_ON(hwidx >= STATUS_RING_SIZE);
54389 + rmb();
54390 +
54391 + while (hwidx != hw->st_idx) {
54392 + struct sky2_status_le *le = hw->st_le + hw->st_idx;
54393 + struct net_device *dev;
54394 + struct sky2_port *sky2;
54395 + struct sk_buff *skb;
54396 + u32 status;
54397 + u16 length;
54398 +
54399 + le = hw->st_le + hw->st_idx;
54400 + hw->st_idx = (hw->st_idx + 1) % STATUS_RING_SIZE;
54401 + prefetch(hw->st_le + hw->st_idx);
54402 +
54403 + BUG_ON(le->link >= 2);
54404 + dev = hw->dev[le->link];
54405 + if (dev == NULL || !netif_running(dev))
54406 + continue;
54407 +
54408 + sky2 = netdev_priv(dev);
54409 + status = le32_to_cpu(le->status);
54410 + length = le16_to_cpu(le->length);
54411 +
54412 + switch (le->opcode & ~HW_OWNER) {
54413 + case OP_RXSTAT:
54414 + skb = sky2_receive(sky2, length, status);
54415 + if (!skb)
54416 + break;
54417 +
54418 + skb->dev = dev;
54419 + skb->protocol = eth_type_trans(skb, dev);
54420 + dev->last_rx = jiffies;
54421 +
54422 +#ifdef SKY2_VLAN_TAG_USED
54423 + if (sky2->vlgrp && (status & GMR_FS_VLAN)) {
54424 + vlan_hwaccel_receive_skb(skb,
54425 + sky2->vlgrp,
54426 + be16_to_cpu(sky2->rx_tag));
54427 + } else
54428 +#endif
54429 + netif_receive_skb(skb);
54430 +
54431 + if (++work_done >= to_do)
54432 + goto exit_loop;
54433 + break;
54434 +
54435 +#ifdef SKY2_VLAN_TAG_USED
54436 + case OP_RXVLAN:
54437 + sky2->rx_tag = length;
54438 + break;
54439 +
54440 + case OP_RXCHKSVLAN:
54441 + sky2->rx_tag = length;
54442 + /* fall through */
54443 +#endif
54444 + case OP_RXCHKS:
54445 + skb = sky2->rx_ring[sky2->rx_next].skb;
54446 + skb->ip_summed = CHECKSUM_HW;
54447 + skb->csum = le16_to_cpu(status);
54448 + break;
54449 +
54450 + case OP_TXINDEXLE:
54451 + /* TX index reports status for both ports */
54452 + tx_done[0] = status & 0xffff;
54453 + tx_done[1] = ((status >> 24) & 0xff)
54454 + | (u16)(length & 0xf) << 8;
54455 + break;
54456 +
54457 + default:
54458 + if (net_ratelimit())
54459 + printk(KERN_WARNING PFX
54460 + "unknown status opcode 0x%x\n", le->opcode);
54461 + break;
54462 + }
54463 + }
54464 +
54465 +exit_loop:
54466 + sky2_tx_check(hw, 0, tx_done[0]);
54467 + sky2_tx_check(hw, 1, tx_done[1]);
54468 +
54469 + if (sky2_read8(hw, STAT_TX_TIMER_CTRL) == TIM_START) {
54470 + sky2_write8(hw, STAT_TX_TIMER_CTRL, TIM_STOP);
54471 + sky2_write8(hw, STAT_TX_TIMER_CTRL, TIM_START);
54472 + }
54473 +
54474 + if (likely(work_done < to_do)) {
54475 + spin_lock_irq(&hw->hw_lock);
54476 + __netif_rx_complete(dev0);
54477 +
54478 + hw->intr_mask |= Y2_IS_STAT_BMU;
54479 + sky2_write32(hw, B0_IMSK, hw->intr_mask);
54480 + spin_unlock_irq(&hw->hw_lock);
54481 +
54482 + return 0;
54483 + } else {
54484 + *budget -= work_done;
54485 + dev0->quota -= work_done;
54486 + return 1;
54487 + }
54488 +}
54489 +
54490 +static void sky2_hw_error(struct sky2_hw *hw, unsigned port, u32 status)
54491 +{
54492 + struct net_device *dev = hw->dev[port];
54493 +
54494 + if (net_ratelimit())
54495 + printk(KERN_INFO PFX "%s: hw error interrupt status 0x%x\n",
54496 + dev->name, status);
54497 +
54498 + if (status & Y2_IS_PAR_RD1) {
54499 + if (net_ratelimit())
54500 + printk(KERN_ERR PFX "%s: ram data read parity error\n",
54501 + dev->name);
54502 + /* Clear IRQ */
54503 + sky2_write16(hw, RAM_BUFFER(port, B3_RI_CTRL), RI_CLR_RD_PERR);
54504 + }
54505 +
54506 + if (status & Y2_IS_PAR_WR1) {
54507 + if (net_ratelimit())
54508 + printk(KERN_ERR PFX "%s: ram data write parity error\n",
54509 + dev->name);
54510 +
54511 + sky2_write16(hw, RAM_BUFFER(port, B3_RI_CTRL), RI_CLR_WR_PERR);
54512 + }
54513 +
54514 + if (status & Y2_IS_PAR_MAC1) {
54515 + if (net_ratelimit())
54516 + printk(KERN_ERR PFX "%s: MAC parity error\n", dev->name);
54517 + sky2_write8(hw, SK_REG(port, TX_GMF_CTRL_T), GMF_CLI_TX_PE);
54518 + }
54519 +
54520 + if (status & Y2_IS_PAR_RX1) {
54521 + if (net_ratelimit())
54522 + printk(KERN_ERR PFX "%s: RX parity error\n", dev->name);
54523 + sky2_write32(hw, Q_ADDR(rxqaddr[port], Q_CSR), BMU_CLR_IRQ_PAR);
54524 + }
54525 +
54526 + if (status & Y2_IS_TCP_TXA1) {
54527 + if (net_ratelimit())
54528 + printk(KERN_ERR PFX "%s: TCP segmentation error\n",
54529 + dev->name);
54530 + sky2_write32(hw, Q_ADDR(txqaddr[port], Q_CSR), BMU_CLR_IRQ_TCP);
54531 + }
54532 +}
54533 +
54534 +static void sky2_hw_intr(struct sky2_hw *hw)
54535 +{
54536 + u32 status = sky2_read32(hw, B0_HWE_ISRC);
54537 +
54538 + if (status & Y2_IS_TIST_OV)
54539 + sky2_write8(hw, GMAC_TI_ST_CTRL, GMT_ST_CLR_IRQ);
54540 +
54541 + if (status & (Y2_IS_MST_ERR | Y2_IS_IRQ_STAT)) {
54542 + u16 pci_err;
54543 +
54544 + pci_err = sky2_pci_read16(hw, PCI_STATUS);
54545 + if (net_ratelimit())
54546 + printk(KERN_ERR PFX "%s: pci hw error (0x%x)\n",
54547 + pci_name(hw->pdev), pci_err);
54548 +
54549 + sky2_write8(hw, B2_TST_CTRL1, TST_CFG_WRITE_ON);
54550 + sky2_pci_write16(hw, PCI_STATUS,
54551 + pci_err | PCI_STATUS_ERROR_BITS);
54552 + sky2_write8(hw, B2_TST_CTRL1, TST_CFG_WRITE_OFF);
54553 + }
54554 +
54555 + if (status & Y2_IS_PCI_EXP) {
54556 + /* PCI-Express uncorrectable Error occurred */
54557 + u32 pex_err;
54558 +
54559 + pex_err = sky2_pci_read32(hw, PEX_UNC_ERR_STAT);
54560 +
54561 + if (net_ratelimit())
54562 + printk(KERN_ERR PFX "%s: pci express error (0x%x)\n",
54563 + pci_name(hw->pdev), pex_err);
54564 +
54565 + /* clear the interrupt */
54566 + sky2_write32(hw, B2_TST_CTRL1, TST_CFG_WRITE_ON);
54567 + sky2_pci_write32(hw, PEX_UNC_ERR_STAT,
54568 + 0xffffffffUL);
54569 + sky2_write32(hw, B2_TST_CTRL1, TST_CFG_WRITE_OFF);
54570 +
54571 + if (pex_err & PEX_FATAL_ERRORS) {
54572 + u32 hwmsk = sky2_read32(hw, B0_HWE_IMSK);
54573 + hwmsk &= ~Y2_IS_PCI_EXP;
54574 + sky2_write32(hw, B0_HWE_IMSK, hwmsk);
54575 + }
54576 + }
54577 +
54578 + if (status & Y2_HWE_L1_MASK)
54579 + sky2_hw_error(hw, 0, status);
54580 + status >>= 8;
54581 + if (status & Y2_HWE_L1_MASK)
54582 + sky2_hw_error(hw, 1, status);
54583 +}
54584 +
54585 +static void sky2_mac_intr(struct sky2_hw *hw, unsigned port)
54586 +{
54587 + struct net_device *dev = hw->dev[port];
54588 + struct sky2_port *sky2 = netdev_priv(dev);
54589 + u8 status = sky2_read8(hw, SK_REG(port, GMAC_IRQ_SRC));
54590 +
54591 + if (netif_msg_intr(sky2))
54592 + printk(KERN_INFO PFX "%s: mac interrupt status 0x%x\n",
54593 + dev->name, status);
54594 +
54595 + if (status & GM_IS_RX_FF_OR) {
54596 + ++sky2->net_stats.rx_fifo_errors;
54597 + sky2_write8(hw, SK_REG(port, RX_GMF_CTRL_T), GMF_CLI_RX_FO);
54598 + }
54599 +
54600 + if (status & GM_IS_TX_FF_UR) {
54601 + ++sky2->net_stats.tx_fifo_errors;
54602 + sky2_write8(hw, SK_REG(port, TX_GMF_CTRL_T), GMF_CLI_TX_FU);
54603 + }
54604 +}
54605 +
54606 +static void sky2_phy_intr(struct sky2_hw *hw, unsigned port)
54607 +{
54608 + struct net_device *dev = hw->dev[port];
54609 + struct sky2_port *sky2 = netdev_priv(dev);
54610 +
54611 + hw->intr_mask &= ~(port == 0 ? Y2_IS_IRQ_PHY1 : Y2_IS_IRQ_PHY2);
54612 + sky2_write32(hw, B0_IMSK, hw->intr_mask);
54613 +
54614 + schedule_work(&sky2->phy_task);
54615 +}
54616 +
54617 +static irqreturn_t sky2_intr(int irq, void *dev_id, struct pt_regs *regs)
54618 +{
54619 + struct sky2_hw *hw = dev_id;
54620 + struct net_device *dev0 = hw->dev[0];
54621 + u32 status;
54622 +
54623 + status = sky2_read32(hw, B0_Y2_SP_ISRC2);
54624 + if (status == 0 || status == ~0)
54625 + return IRQ_NONE;
54626 +
54627 + spin_lock(&hw->hw_lock);
54628 + if (status & Y2_IS_HW_ERR)
54629 + sky2_hw_intr(hw);
54630 +
54631 + /* Do NAPI for Rx and Tx status */
54632 + if (status & Y2_IS_STAT_BMU) {
54633 + hw->intr_mask &= ~Y2_IS_STAT_BMU;
54634 + sky2_write32(hw, B0_IMSK, hw->intr_mask);
54635 +
54636 + if (likely(__netif_rx_schedule_prep(dev0))) {
54637 + prefetch(&hw->st_le[hw->st_idx]);
54638 + __netif_rx_schedule(dev0);
54639 + }
54640 + }
54641 +
54642 + if (status & Y2_IS_IRQ_PHY1)
54643 + sky2_phy_intr(hw, 0);
54644 +
54645 + if (status & Y2_IS_IRQ_PHY2)
54646 + sky2_phy_intr(hw, 1);
54647 +
54648 + if (status & Y2_IS_IRQ_MAC1)
54649 + sky2_mac_intr(hw, 0);
54650 +
54651 + if (status & Y2_IS_IRQ_MAC2)
54652 + sky2_mac_intr(hw, 1);
54653 +
54654 + sky2_write32(hw, B0_Y2_SP_ICR, 2);
54655 +
54656 + spin_unlock(&hw->hw_lock);
54657 +
54658 + return IRQ_HANDLED;
54659 +}
54660 +
54661 +#ifdef CONFIG_NET_POLL_CONTROLLER
54662 +static void sky2_netpoll(struct net_device *dev)
54663 +{
54664 + struct sky2_port *sky2 = netdev_priv(dev);
54665 +
54666 + sky2_intr(sky2->hw->pdev->irq, sky2->hw, NULL);
54667 +}
54668 +#endif
54669 +
54670 +/* Chip internal frequency for clock calculations */
54671 +static inline u32 sky2_mhz(const struct sky2_hw *hw)
54672 +{
54673 + switch (hw->chip_id) {
54674 + case CHIP_ID_YUKON_EC:
54675 + case CHIP_ID_YUKON_EC_U:
54676 + return 125; /* 125 MHz */
54677 + case CHIP_ID_YUKON_FE:
54678 + return 100; /* 100 MHz */
54679 + default: /* YUKON_XL */
54680 + return 156; /* 156 MHz */
54681 + }
54682 +}
54683 +
54684 +static inline u32 sky2_us2clk(const struct sky2_hw *hw, u32 us)
54685 +{
54686 + return sky2_mhz(hw) * us;
54687 +}
54688 +
54689 +static inline u32 sky2_clk2us(const struct sky2_hw *hw, u32 clk)
54690 +{
54691 + return clk / sky2_mhz(hw);
54692 +}
54693 +
54694 +
54695 +static int sky2_reset(struct sky2_hw *hw)
54696 +{
54697 + u16 status;
54698 + u8 t8;
54699 + int i;
54700 +
54701 + sky2_write8(hw, B0_CTST, CS_RST_CLR);
54702 +
54703 + hw->chip_id = sky2_read8(hw, B2_CHIP_ID);
54704 + if (hw->chip_id < CHIP_ID_YUKON_XL || hw->chip_id > CHIP_ID_YUKON_FE) {
54705 + printk(KERN_ERR PFX "%s: unsupported chip type 0x%x\n",
54706 + pci_name(hw->pdev), hw->chip_id);
54707 + return -EOPNOTSUPP;
54708 + }
54709 +
54710 + /* disable ASF */
54711 + if (hw->chip_id <= CHIP_ID_YUKON_EC) {
54712 + sky2_write8(hw, B28_Y2_ASF_STAT_CMD, Y2_ASF_RESET);
54713 + sky2_write16(hw, B0_CTST, Y2_ASF_DISABLE);
54714 + }
54715 +
54716 + /* do a SW reset */
54717 + sky2_write8(hw, B0_CTST, CS_RST_SET);
54718 + sky2_write8(hw, B0_CTST, CS_RST_CLR);
54719 +
54720 + /* clear PCI errors, if any */
54721 + status = sky2_pci_read16(hw, PCI_STATUS);
54722 +
54723 + sky2_write8(hw, B2_TST_CTRL1, TST_CFG_WRITE_ON);
54724 + sky2_pci_write16(hw, PCI_STATUS, status | PCI_STATUS_ERROR_BITS);
54725 +
54726 +
54727 + sky2_write8(hw, B0_CTST, CS_MRST_CLR);
54728 +
54729 + /* clear any PEX errors */
54730 + if (pci_find_capability(hw->pdev, PCI_CAP_ID_EXP))
54731 + sky2_pci_write32(hw, PEX_UNC_ERR_STAT, 0xffffffffUL);
54732 +
54733 +
54734 + hw->pmd_type = sky2_read8(hw, B2_PMD_TYP);
54735 + hw->ports = 1;
54736 + t8 = sky2_read8(hw, B2_Y2_HW_RES);
54737 + if ((t8 & CFG_DUAL_MAC_MSK) == CFG_DUAL_MAC_MSK) {
54738 + if (!(sky2_read8(hw, B2_Y2_CLK_GATE) & Y2_STATUS_LNK2_INAC))
54739 + ++hw->ports;
54740 + }
54741 + hw->chip_rev = (sky2_read8(hw, B2_MAC_CFG) & CFG_CHIP_R_MSK) >> 4;
54742 +
54743 + sky2_set_power_state(hw, PCI_D0);
54744 +
54745 + for (i = 0; i < hw->ports; i++) {
54746 + sky2_write8(hw, SK_REG(i, GMAC_LINK_CTRL), GMLC_RST_SET);
54747 + sky2_write8(hw, SK_REG(i, GMAC_LINK_CTRL), GMLC_RST_CLR);
54748 + }
54749 +
54750 + sky2_write8(hw, B2_TST_CTRL1, TST_CFG_WRITE_OFF);
54751 +
54752 + /* Clear I2C IRQ noise */
54753 + sky2_write32(hw, B2_I2C_IRQ, 1);
54754 +
54755 + /* turn off hardware timer (unused) */
54756 + sky2_write8(hw, B2_TI_CTRL, TIM_STOP);
54757 + sky2_write8(hw, B2_TI_CTRL, TIM_CLR_IRQ);
54758 +
54759 + sky2_write8(hw, B0_Y2LED, LED_STAT_ON);
54760 +
54761 + /* Turn off descriptor polling */
54762 + sky2_write32(hw, B28_DPT_CTRL, DPT_STOP);
54763 +
54764 + /* Turn off receive timestamp */
54765 + sky2_write8(hw, GMAC_TI_ST_CTRL, GMT_ST_STOP);
54766 + sky2_write8(hw, GMAC_TI_ST_CTRL, GMT_ST_CLR_IRQ);
54767 +
54768 + /* enable the Tx Arbiters */
54769 + for (i = 0; i < hw->ports; i++)
54770 + sky2_write8(hw, SK_REG(i, TXA_CTRL), TXA_ENA_ARB);
54771 +
54772 + /* Initialize ram interface */
54773 + for (i = 0; i < hw->ports; i++) {
54774 + sky2_write8(hw, RAM_BUFFER(i, B3_RI_CTRL), RI_RST_CLR);
54775 +
54776 + sky2_write8(hw, RAM_BUFFER(i, B3_RI_WTO_R1), SK_RI_TO_53);
54777 + sky2_write8(hw, RAM_BUFFER(i, B3_RI_WTO_XA1), SK_RI_TO_53);
54778 + sky2_write8(hw, RAM_BUFFER(i, B3_RI_WTO_XS1), SK_RI_TO_53);
54779 + sky2_write8(hw, RAM_BUFFER(i, B3_RI_RTO_R1), SK_RI_TO_53);
54780 + sky2_write8(hw, RAM_BUFFER(i, B3_RI_RTO_XA1), SK_RI_TO_53);
54781 + sky2_write8(hw, RAM_BUFFER(i, B3_RI_RTO_XS1), SK_RI_TO_53);
54782 + sky2_write8(hw, RAM_BUFFER(i, B3_RI_WTO_R2), SK_RI_TO_53);
54783 + sky2_write8(hw, RAM_BUFFER(i, B3_RI_WTO_XA2), SK_RI_TO_53);
54784 + sky2_write8(hw, RAM_BUFFER(i, B3_RI_WTO_XS2), SK_RI_TO_53);
54785 + sky2_write8(hw, RAM_BUFFER(i, B3_RI_RTO_R2), SK_RI_TO_53);
54786 + sky2_write8(hw, RAM_BUFFER(i, B3_RI_RTO_XA2), SK_RI_TO_53);
54787 + sky2_write8(hw, RAM_BUFFER(i, B3_RI_RTO_XS2), SK_RI_TO_53);
54788 + }
54789 +
54790 + sky2_write32(hw, B0_HWE_IMSK, Y2_HWE_ALL_MASK);
54791 +
54792 + for (i = 0; i < hw->ports; i++)
54793 + sky2_phy_reset(hw, i);
54794 +
54795 + memset(hw->st_le, 0, STATUS_LE_BYTES);
54796 + hw->st_idx = 0;
54797 +
54798 + sky2_write32(hw, STAT_CTRL, SC_STAT_RST_SET);
54799 + sky2_write32(hw, STAT_CTRL, SC_STAT_RST_CLR);
54800 +
54801 + sky2_write32(hw, STAT_LIST_ADDR_LO, hw->st_dma);
54802 + sky2_write32(hw, STAT_LIST_ADDR_HI, (u64) hw->st_dma >> 32);
54803 +
54804 + /* Set the list last index */
54805 + sky2_write16(hw, STAT_LAST_IDX, STATUS_RING_SIZE - 1);
54806 +
54807 + /* These status setup values are copied from SysKonnect's driver */
54808 + if (is_ec_a1(hw)) {
54809 + /* WA for dev. #4.3 */
54810 + sky2_write16(hw, STAT_TX_IDX_TH, 0xfff); /* Tx Threshold */
54811 +
54812 + /* set Status-FIFO watermark */
54813 + sky2_write8(hw, STAT_FIFO_WM, 0x21); /* WA for dev. #4.18 */
54814 +
54815 + /* set Status-FIFO ISR watermark */
54816 + sky2_write8(hw, STAT_FIFO_ISR_WM, 0x07); /* WA for dev. #4.18 */
54817 + sky2_write32(hw, STAT_TX_TIMER_INI, sky2_us2clk(hw, 10000));
54818 + } else {
54819 + sky2_write16(hw, STAT_TX_IDX_TH, 10);
54820 + sky2_write8(hw, STAT_FIFO_WM, 16);
54821 +
54822 + /* set Status-FIFO ISR watermark */
54823 + if (hw->chip_id == CHIP_ID_YUKON_XL && hw->chip_rev == 0)
54824 + sky2_write8(hw, STAT_FIFO_ISR_WM, 4);
54825 + else
54826 + sky2_write8(hw, STAT_FIFO_ISR_WM, 16);
54827 +
54828 + sky2_write32(hw, STAT_TX_TIMER_INI, sky2_us2clk(hw, 1000));
54829 + sky2_write32(hw, STAT_ISR_TIMER_INI, sky2_us2clk(hw, 7));
54830 + }
54831 +
54832 + /* enable status unit */
54833 + sky2_write32(hw, STAT_CTRL, SC_STAT_OP_ON);
54834 +
54835 + sky2_write8(hw, STAT_TX_TIMER_CTRL, TIM_START);
54836 + sky2_write8(hw, STAT_LEV_TIMER_CTRL, TIM_START);
54837 + sky2_write8(hw, STAT_ISR_TIMER_CTRL, TIM_START);
54838 +
54839 + return 0;
54840 +}
54841 +
54842 +static u32 sky2_supported_modes(const struct sky2_hw *hw)
54843 +{
54844 + if (sky2_is_copper(hw)) {
54845 + u32 modes = SUPPORTED_10baseT_Half
54846 + | SUPPORTED_10baseT_Full
54847 + | SUPPORTED_100baseT_Half
54848 + | SUPPORTED_100baseT_Full
54849 + | SUPPORTED_Autoneg | SUPPORTED_TP;
54850 +
54851 + if (hw->chip_id != CHIP_ID_YUKON_FE)
54852 + modes |= SUPPORTED_1000baseT_Half
54853 + | SUPPORTED_1000baseT_Full;
54854 + return modes;
54855 + } else
54856 + return SUPPORTED_1000baseT_Half
54857 + | SUPPORTED_1000baseT_Full
54858 + | SUPPORTED_Autoneg
54859 + | SUPPORTED_FIBRE;
54860 +}
54861 +
54862 +static int sky2_get_settings(struct net_device *dev, struct ethtool_cmd *ecmd)
54863 +{
54864 + struct sky2_port *sky2 = netdev_priv(dev);
54865 + struct sky2_hw *hw = sky2->hw;
54866 +
54867 + ecmd->transceiver = XCVR_INTERNAL;
54868 + ecmd->supported = sky2_supported_modes(hw);
54869 + ecmd->phy_address = PHY_ADDR_MARV;
54870 + if (sky2_is_copper(hw)) {
54871 + ecmd->supported = SUPPORTED_10baseT_Half
54872 + | SUPPORTED_10baseT_Full
54873 + | SUPPORTED_100baseT_Half
54874 + | SUPPORTED_100baseT_Full
54875 + | SUPPORTED_1000baseT_Half
54876 + | SUPPORTED_1000baseT_Full
54877 + | SUPPORTED_Autoneg | SUPPORTED_TP;
54878 + ecmd->port = PORT_TP;
54879 + ecmd->speed = sky2->speed;
54880 + } else {
54881 + ecmd->speed = SPEED_1000;
54882 + ecmd->port = PORT_FIBRE;
54883 + }
54884 +
54885 + ecmd->advertising = sky2->advertising;
54886 + ecmd->autoneg = sky2->autoneg;
54887 + ecmd->duplex = sky2->duplex;
54888 + return 0;
54889 +}
54890 +
54891 +static int sky2_set_settings(struct net_device *dev, struct ethtool_cmd *ecmd)
54892 +{
54893 + struct sky2_port *sky2 = netdev_priv(dev);
54894 + const struct sky2_hw *hw = sky2->hw;
54895 + u32 supported = sky2_supported_modes(hw);
54896 +
54897 + if (ecmd->autoneg == AUTONEG_ENABLE) {
54898 + ecmd->advertising = supported;
54899 + sky2->duplex = -1;
54900 + sky2->speed = -1;
54901 + } else {
54902 + u32 setting;
54903 +
54904 + switch (ecmd->speed) {
54905 + case SPEED_1000:
54906 + if (ecmd->duplex == DUPLEX_FULL)
54907 + setting = SUPPORTED_1000baseT_Full;
54908 + else if (ecmd->duplex == DUPLEX_HALF)
54909 + setting = SUPPORTED_1000baseT_Half;
54910 + else
54911 + return -EINVAL;
54912 + break;
54913 + case SPEED_100:
54914 + if (ecmd->duplex == DUPLEX_FULL)
54915 + setting = SUPPORTED_100baseT_Full;
54916 + else if (ecmd->duplex == DUPLEX_HALF)
54917 + setting = SUPPORTED_100baseT_Half;
54918 + else
54919 + return -EINVAL;
54920 + break;
54921 +
54922 + case SPEED_10:
54923 + if (ecmd->duplex == DUPLEX_FULL)
54924 + setting = SUPPORTED_10baseT_Full;
54925 + else if (ecmd->duplex == DUPLEX_HALF)
54926 + setting = SUPPORTED_10baseT_Half;
54927 + else
54928 + return -EINVAL;
54929 + break;
54930 + default:
54931 + return -EINVAL;
54932 + }
54933 +
54934 + if ((setting & supported) == 0)
54935 + return -EINVAL;
54936 +
54937 + sky2->speed = ecmd->speed;
54938 + sky2->duplex = ecmd->duplex;
54939 + }
54940 +
54941 + sky2->autoneg = ecmd->autoneg;
54942 + sky2->advertising = ecmd->advertising;
54943 +
54944 + if (netif_running(dev))
54945 + sky2_phy_reinit(sky2);
54946 +
54947 + return 0;
54948 +}
54949 +
54950 +static void sky2_get_drvinfo(struct net_device *dev,
54951 + struct ethtool_drvinfo *info)
54952 +{
54953 + struct sky2_port *sky2 = netdev_priv(dev);
54954 +
54955 + strcpy(info->driver, DRV_NAME);
54956 + strcpy(info->version, DRV_VERSION);
54957 + strcpy(info->fw_version, "N/A");
54958 + strcpy(info->bus_info, pci_name(sky2->hw->pdev));
54959 +}
54960 +
54961 +static const struct sky2_stat {
54962 + char name[ETH_GSTRING_LEN];
54963 + u16 offset;
54964 +} sky2_stats[] = {
54965 + { "tx_bytes", GM_TXO_OK_HI },
54966 + { "rx_bytes", GM_RXO_OK_HI },
54967 + { "tx_broadcast", GM_TXF_BC_OK },
54968 + { "rx_broadcast", GM_RXF_BC_OK },
54969 + { "tx_multicast", GM_TXF_MC_OK },
54970 + { "rx_multicast", GM_RXF_MC_OK },
54971 + { "tx_unicast", GM_TXF_UC_OK },
54972 + { "rx_unicast", GM_RXF_UC_OK },
54973 + { "tx_mac_pause", GM_TXF_MPAUSE },
54974 + { "rx_mac_pause", GM_RXF_MPAUSE },
54975 + { "collisions", GM_TXF_SNG_COL },
54976 + { "late_collision", GM_TXF_LAT_COL },
54977 + { "aborted", GM_TXF_ABO_COL },
54978 + { "multi_collisions", GM_TXF_MUL_COL },
54979 + { "fifo_underrun", GM_TXE_FIFO_UR },
54980 + { "fifo_overflow", GM_RXE_FIFO_OV },
54981 + { "rx_toolong", GM_RXF_LNG_ERR },
54982 + { "rx_jabber", GM_RXF_JAB_PKT },
54983 + { "rx_runt", GM_RXE_FRAG },
54984 + { "rx_too_long", GM_RXF_LNG_ERR },
54985 + { "rx_fcs_error", GM_RXF_FCS_ERR },
54986 +};
54987 +
54988 +static u32 sky2_get_rx_csum(struct net_device *dev)
54989 +{
54990 + struct sky2_port *sky2 = netdev_priv(dev);
54991 +
54992 + return sky2->rx_csum;
54993 +}
54994 +
54995 +static int sky2_set_rx_csum(struct net_device *dev, u32 data)
54996 +{
54997 + struct sky2_port *sky2 = netdev_priv(dev);
54998 +
54999 + sky2->rx_csum = data;
55000 +
55001 + sky2_write32(sky2->hw, Q_ADDR(rxqaddr[sky2->port], Q_CSR),
55002 + data ? BMU_ENA_RX_CHKSUM : BMU_DIS_RX_CHKSUM);
55003 +
55004 + return 0;
55005 +}
55006 +
55007 +static u32 sky2_get_msglevel(struct net_device *netdev)
55008 +{
55009 + struct sky2_port *sky2 = netdev_priv(netdev);
55010 + return sky2->msg_enable;
55011 +}
55012 +
55013 +static int sky2_nway_reset(struct net_device *dev)
55014 +{
55015 + struct sky2_port *sky2 = netdev_priv(dev);
55016 +
55017 + if (sky2->autoneg != AUTONEG_ENABLE)
55018 + return -EINVAL;
55019 +
55020 + sky2_phy_reinit(sky2);
55021 +
55022 + return 0;
55023 +}
55024 +
55025 +static void sky2_phy_stats(struct sky2_port *sky2, u64 * data, unsigned count)
55026 +{
55027 + struct sky2_hw *hw = sky2->hw;
55028 + unsigned port = sky2->port;
55029 + int i;
55030 +
55031 + data[0] = (u64) gma_read32(hw, port, GM_TXO_OK_HI) << 32
55032 + | (u64) gma_read32(hw, port, GM_TXO_OK_LO);
55033 + data[1] = (u64) gma_read32(hw, port, GM_RXO_OK_HI) << 32
55034 + | (u64) gma_read32(hw, port, GM_RXO_OK_LO);
55035 +
55036 + for (i = 2; i < count; i++)
55037 + data[i] = (u64) gma_read32(hw, port, sky2_stats[i].offset);
55038 +}
55039 +
55040 +static void sky2_set_msglevel(struct net_device *netdev, u32 value)
55041 +{
55042 + struct sky2_port *sky2 = netdev_priv(netdev);
55043 + sky2->msg_enable = value;
55044 +}
55045 +
55046 +static int sky2_get_stats_count(struct net_device *dev)
55047 +{
55048 + return ARRAY_SIZE(sky2_stats);
55049 +}
55050 +
55051 +static void sky2_get_ethtool_stats(struct net_device *dev,
55052 + struct ethtool_stats *stats, u64 * data)
55053 +{
55054 + struct sky2_port *sky2 = netdev_priv(dev);
55055 +
55056 + sky2_phy_stats(sky2, data, ARRAY_SIZE(sky2_stats));
55057 +}
55058 +
55059 +static void sky2_get_strings(struct net_device *dev, u32 stringset, u8 * data)
55060 +{
55061 + int i;
55062 +
55063 + switch (stringset) {
55064 + case ETH_SS_STATS:
55065 + for (i = 0; i < ARRAY_SIZE(sky2_stats); i++)
55066 + memcpy(data + i * ETH_GSTRING_LEN,
55067 + sky2_stats[i].name, ETH_GSTRING_LEN);
55068 + break;
55069 + }
55070 +}
55071 +
55072 +/* Use hardware MIB variables for critical path statistics and
55073 + * transmit feedback not reported at interrupt.
55074 + * Other errors are accounted for in interrupt handler.
55075 + */
55076 +static struct net_device_stats *sky2_get_stats(struct net_device *dev)
55077 +{
55078 + struct sky2_port *sky2 = netdev_priv(dev);
55079 + u64 data[13];
55080 +
55081 + sky2_phy_stats(sky2, data, ARRAY_SIZE(data));
55082 +
55083 + sky2->net_stats.tx_bytes = data[0];
55084 + sky2->net_stats.rx_bytes = data[1];
55085 + sky2->net_stats.tx_packets = data[2] + data[4] + data[6];
55086 + sky2->net_stats.rx_packets = data[3] + data[5] + data[7];
55087 + sky2->net_stats.multicast = data[5] + data[7];
55088 + sky2->net_stats.collisions = data[10];
55089 + sky2->net_stats.tx_aborted_errors = data[12];
55090 +
55091 + return &sky2->net_stats;
55092 +}
55093 +
55094 +static int sky2_set_mac_address(struct net_device *dev, void *p)
55095 +{
55096 + struct sky2_port *sky2 = netdev_priv(dev);
55097 + struct sky2_hw *hw = sky2->hw;
55098 + unsigned port = sky2->port;
55099 + const struct sockaddr *addr = p;
55100 +
55101 + if (!is_valid_ether_addr(addr->sa_data))
55102 + return -EADDRNOTAVAIL;
55103 +
55104 + memcpy(dev->dev_addr, addr->sa_data, ETH_ALEN);
55105 + memcpy_toio(hw->regs + B2_MAC_1 + port * 8,
55106 + dev->dev_addr, ETH_ALEN);
55107 + memcpy_toio(hw->regs + B2_MAC_2 + port * 8,
55108 + dev->dev_addr, ETH_ALEN);
55109 +
55110 + /* virtual address for data */
55111 + gma_set_addr(hw, port, GM_SRC_ADDR_2L, dev->dev_addr);
55112 +
55113 + /* physical address: used for pause frames */
55114 + gma_set_addr(hw, port, GM_SRC_ADDR_1L, dev->dev_addr);
55115 +
55116 + return 0;
55117 +}
55118 +
55119 +static void sky2_set_multicast(struct net_device *dev)
55120 +{
55121 + struct sky2_port *sky2 = netdev_priv(dev);
55122 + struct sky2_hw *hw = sky2->hw;
55123 + unsigned port = sky2->port;
55124 + struct dev_mc_list *list = dev->mc_list;
55125 + u16 reg;
55126 + u8 filter[8];
55127 +
55128 + memset(filter, 0, sizeof(filter));
55129 +
55130 + reg = gma_read16(hw, port, GM_RX_CTRL);
55131 + reg |= GM_RXCR_UCF_ENA;
55132 +
55133 + if (dev->flags & IFF_PROMISC) /* promiscuous */
55134 + reg &= ~(GM_RXCR_UCF_ENA | GM_RXCR_MCF_ENA);
55135 + else if ((dev->flags & IFF_ALLMULTI) || dev->mc_count > 16) /* all multicast */
55136 + memset(filter, 0xff, sizeof(filter));
55137 + else if (dev->mc_count == 0) /* no multicast */
55138 + reg &= ~GM_RXCR_MCF_ENA;
55139 + else {
55140 + int i;
55141 + reg |= GM_RXCR_MCF_ENA;
55142 +
55143 + for (i = 0; list && i < dev->mc_count; i++, list = list->next) {
55144 + u32 bit = ether_crc(ETH_ALEN, list->dmi_addr) & 0x3f;
55145 + filter[bit / 8] |= 1 << (bit % 8);
55146 + }
55147 + }
55148 +
55149 + gma_write16(hw, port, GM_MC_ADDR_H1,
55150 + (u16) filter[0] | ((u16) filter[1] << 8));
55151 + gma_write16(hw, port, GM_MC_ADDR_H2,
55152 + (u16) filter[2] | ((u16) filter[3] << 8));
55153 + gma_write16(hw, port, GM_MC_ADDR_H3,
55154 + (u16) filter[4] | ((u16) filter[5] << 8));
55155 + gma_write16(hw, port, GM_MC_ADDR_H4,
55156 + (u16) filter[6] | ((u16) filter[7] << 8));
55157 +
55158 + gma_write16(hw, port, GM_RX_CTRL, reg);
55159 +}
55160 +
55161 +/* Can have one global because blinking is controlled by
55162 + * ethtool and that is always under RTNL mutex
55163 + */
55164 +static void sky2_led(struct sky2_hw *hw, unsigned port, int on)
55165 +{
55166 + u16 pg;
55167 +
55168 + switch (hw->chip_id) {
55169 + case CHIP_ID_YUKON_XL:
55170 + pg = gm_phy_read(hw, port, PHY_MARV_EXT_ADR);
55171 + gm_phy_write(hw, port, PHY_MARV_EXT_ADR, 3);
55172 + gm_phy_write(hw, port, PHY_MARV_PHY_CTRL,
55173 + on ? (PHY_M_LEDC_LOS_CTRL(1) |
55174 + PHY_M_LEDC_INIT_CTRL(7) |
55175 + PHY_M_LEDC_STA1_CTRL(7) |
55176 + PHY_M_LEDC_STA0_CTRL(7))
55177 + : 0);
55178 +
55179 + gm_phy_write(hw, port, PHY_MARV_EXT_ADR, pg);
55180 + break;
55181 +
55182 + default:
55183 + gm_phy_write(hw, port, PHY_MARV_LED_CTRL, 0);
55184 + gm_phy_write(hw, port, PHY_MARV_LED_OVER,
55185 + on ? PHY_M_LED_MO_DUP(MO_LED_ON) |
55186 + PHY_M_LED_MO_10(MO_LED_ON) |
55187 + PHY_M_LED_MO_100(MO_LED_ON) |
55188 + PHY_M_LED_MO_1000(MO_LED_ON) |
55189 + PHY_M_LED_MO_RX(MO_LED_ON)
55190 + : PHY_M_LED_MO_DUP(MO_LED_OFF) |
55191 + PHY_M_LED_MO_10(MO_LED_OFF) |
55192 + PHY_M_LED_MO_100(MO_LED_OFF) |
55193 + PHY_M_LED_MO_1000(MO_LED_OFF) |
55194 + PHY_M_LED_MO_RX(MO_LED_OFF));
55195 +
55196 + }
55197 +}
55198 +
55199 +/* blink LEDs for finding board */
55200 +static int sky2_phys_id(struct net_device *dev, u32 data)
55201 +{
55202 + struct sky2_port *sky2 = netdev_priv(dev);
55203 + struct sky2_hw *hw = sky2->hw;
55204 + unsigned port = sky2->port;
55205 + u16 ledctrl, ledover = 0;
55206 + long ms;
55207 + int interrupted;
55208 + int onoff = 1;
55209 +
55210 + if (!data || data > (u32) (MAX_SCHEDULE_TIMEOUT / HZ))
55211 + ms = jiffies_to_msecs(MAX_SCHEDULE_TIMEOUT);
55212 + else
55213 + ms = data * 1000;
55214 +
55215 + /* save initial values */
55216 + down(&sky2->phy_sema);
55217 + if (hw->chip_id == CHIP_ID_YUKON_XL) {
55218 + u16 pg = gm_phy_read(hw, port, PHY_MARV_EXT_ADR);
55219 + gm_phy_write(hw, port, PHY_MARV_EXT_ADR, 3);
55220 + ledctrl = gm_phy_read(hw, port, PHY_MARV_PHY_CTRL);
55221 + gm_phy_write(hw, port, PHY_MARV_EXT_ADR, pg);
55222 + } else {
55223 + ledctrl = gm_phy_read(hw, port, PHY_MARV_LED_CTRL);
55224 + ledover = gm_phy_read(hw, port, PHY_MARV_LED_OVER);
55225 + }
55226 +
55227 + interrupted = 0;
55228 + while (!interrupted && ms > 0) {
55229 + sky2_led(hw, port, onoff);
55230 + onoff = !onoff;
55231 +
55232 + up(&sky2->phy_sema);
55233 + interrupted = msleep_interruptible(250);
55234 + down(&sky2->phy_sema);
55235 +
55236 + ms -= 250;
55237 + }
55238 +
55239 + /* resume regularly scheduled programming */
55240 + if (hw->chip_id == CHIP_ID_YUKON_XL) {
55241 + u16 pg = gm_phy_read(hw, port, PHY_MARV_EXT_ADR);
55242 + gm_phy_write(hw, port, PHY_MARV_EXT_ADR, 3);
55243 + gm_phy_write(hw, port, PHY_MARV_PHY_CTRL, ledctrl);
55244 + gm_phy_write(hw, port, PHY_MARV_EXT_ADR, pg);
55245 + } else {
55246 + gm_phy_write(hw, port, PHY_MARV_LED_CTRL, ledctrl);
55247 + gm_phy_write(hw, port, PHY_MARV_LED_OVER, ledover);
55248 + }
55249 + up(&sky2->phy_sema);
55250 +
55251 + return 0;
55252 +}
55253 +
55254 +static void sky2_get_pauseparam(struct net_device *dev,
55255 + struct ethtool_pauseparam *ecmd)
55256 +{
55257 + struct sky2_port *sky2 = netdev_priv(dev);
55258 +
55259 + ecmd->tx_pause = sky2->tx_pause;
55260 + ecmd->rx_pause = sky2->rx_pause;
55261 + ecmd->autoneg = sky2->autoneg;
55262 +}
55263 +
55264 +static int sky2_set_pauseparam(struct net_device *dev,
55265 + struct ethtool_pauseparam *ecmd)
55266 +{
55267 + struct sky2_port *sky2 = netdev_priv(dev);
55268 + int err = 0;
55269 +
55270 + sky2->autoneg = ecmd->autoneg;
55271 + sky2->tx_pause = ecmd->tx_pause != 0;
55272 + sky2->rx_pause = ecmd->rx_pause != 0;
55273 +
55274 + sky2_phy_reinit(sky2);
55275 +
55276 + return err;
55277 +}
55278 +
55279 +#ifdef CONFIG_PM
55280 +static void sky2_get_wol(struct net_device *dev, struct ethtool_wolinfo *wol)
55281 +{
55282 + struct sky2_port *sky2 = netdev_priv(dev);
55283 +
55284 + wol->supported = WAKE_MAGIC;
55285 + wol->wolopts = sky2->wol ? WAKE_MAGIC : 0;
55286 +}
55287 +
55288 +static int sky2_set_wol(struct net_device *dev, struct ethtool_wolinfo *wol)
55289 +{
55290 + struct sky2_port *sky2 = netdev_priv(dev);
55291 + struct sky2_hw *hw = sky2->hw;
55292 +
55293 + if (wol->wolopts != WAKE_MAGIC && wol->wolopts != 0)
55294 + return -EOPNOTSUPP;
55295 +
55296 + sky2->wol = wol->wolopts == WAKE_MAGIC;
55297 +
55298 + if (sky2->wol) {
55299 + memcpy_toio(hw->regs + WOL_MAC_ADDR, dev->dev_addr, ETH_ALEN);
55300 +
55301 + sky2_write16(hw, WOL_CTRL_STAT,
55302 + WOL_CTL_ENA_PME_ON_MAGIC_PKT |
55303 + WOL_CTL_ENA_MAGIC_PKT_UNIT);
55304 + } else
55305 + sky2_write16(hw, WOL_CTRL_STAT, WOL_CTL_DEFAULT);
55306 +
55307 + return 0;
55308 +}
55309 +#endif
55310 +
55311 +static int sky2_get_coalesce(struct net_device *dev,
55312 + struct ethtool_coalesce *ecmd)
55313 +{
55314 + struct sky2_port *sky2 = netdev_priv(dev);
55315 + struct sky2_hw *hw = sky2->hw;
55316 +
55317 + if (sky2_read8(hw, STAT_TX_TIMER_CTRL) == TIM_STOP)
55318 + ecmd->tx_coalesce_usecs = 0;
55319 + else {
55320 + u32 clks = sky2_read32(hw, STAT_TX_TIMER_INI);
55321 + ecmd->tx_coalesce_usecs = sky2_clk2us(hw, clks);
55322 + }
55323 + ecmd->tx_max_coalesced_frames = sky2_read16(hw, STAT_TX_IDX_TH);
55324 +
55325 + if (sky2_read8(hw, STAT_LEV_TIMER_CTRL) == TIM_STOP)
55326 + ecmd->rx_coalesce_usecs = 0;
55327 + else {
55328 + u32 clks = sky2_read32(hw, STAT_LEV_TIMER_INI);
55329 + ecmd->rx_coalesce_usecs = sky2_clk2us(hw, clks);
55330 + }
55331 + ecmd->rx_max_coalesced_frames = sky2_read8(hw, STAT_FIFO_WM);
55332 +
55333 + if (sky2_read8(hw, STAT_ISR_TIMER_CTRL) == TIM_STOP)
55334 + ecmd->rx_coalesce_usecs_irq = 0;
55335 + else {
55336 + u32 clks = sky2_read32(hw, STAT_ISR_TIMER_INI);
55337 + ecmd->rx_coalesce_usecs_irq = sky2_clk2us(hw, clks);
55338 + }
55339 +
55340 + ecmd->rx_max_coalesced_frames_irq = sky2_read8(hw, STAT_FIFO_ISR_WM);
55341 +
55342 + return 0;
55343 +}
55344 +
55345 +/* Note: this affects both ports */
55346 +static int sky2_set_coalesce(struct net_device *dev,
55347 + struct ethtool_coalesce *ecmd)
55348 +{
55349 + struct sky2_port *sky2 = netdev_priv(dev);
55350 + struct sky2_hw *hw = sky2->hw;
55351 + const u32 tmin = sky2_clk2us(hw, 1);
55352 + const u32 tmax = 5000;
55353 +
55354 + if (ecmd->tx_coalesce_usecs != 0 &&
55355 + (ecmd->tx_coalesce_usecs < tmin || ecmd->tx_coalesce_usecs > tmax))
55356 + return -EINVAL;
55357 +
55358 + if (ecmd->rx_coalesce_usecs != 0 &&
55359 + (ecmd->rx_coalesce_usecs < tmin || ecmd->rx_coalesce_usecs > tmax))
55360 + return -EINVAL;
55361 +
55362 + if (ecmd->rx_coalesce_usecs_irq != 0 &&
55363 + (ecmd->rx_coalesce_usecs_irq < tmin || ecmd->rx_coalesce_usecs_irq > tmax))
55364 + return -EINVAL;
55365 +
55366 + if (ecmd->tx_max_coalesced_frames >= TX_RING_SIZE-1)
55367 + return -EINVAL;
55368 + if (ecmd->rx_max_coalesced_frames > RX_MAX_PENDING)
55369 + return -EINVAL;
55370 + if (ecmd->rx_max_coalesced_frames_irq > RX_MAX_PENDING)
55371 + return -EINVAL;
55372 +
55373 + if (ecmd->tx_coalesce_usecs == 0)
55374 + sky2_write8(hw, STAT_TX_TIMER_CTRL, TIM_STOP);
55375 + else {
55376 + sky2_write32(hw, STAT_TX_TIMER_INI,
55377 + sky2_us2clk(hw, ecmd->tx_coalesce_usecs));
55378 + sky2_write8(hw, STAT_TX_TIMER_CTRL, TIM_START);
55379 + }
55380 + sky2_write16(hw, STAT_TX_IDX_TH, ecmd->tx_max_coalesced_frames);
55381 +
55382 + if (ecmd->rx_coalesce_usecs == 0)
55383 + sky2_write8(hw, STAT_LEV_TIMER_CTRL, TIM_STOP);
55384 + else {
55385 + sky2_write32(hw, STAT_LEV_TIMER_INI,
55386 + sky2_us2clk(hw, ecmd->rx_coalesce_usecs));
55387 + sky2_write8(hw, STAT_LEV_TIMER_CTRL, TIM_START);
55388 + }
55389 + sky2_write8(hw, STAT_FIFO_WM, ecmd->rx_max_coalesced_frames);
55390 +
55391 + if (ecmd->rx_coalesce_usecs_irq == 0)
55392 + sky2_write8(hw, STAT_ISR_TIMER_CTRL, TIM_STOP);
55393 + else {
55394 + sky2_write32(hw, STAT_ISR_TIMER_INI,
55395 + sky2_us2clk(hw, ecmd->rx_coalesce_usecs_irq));
55396 + sky2_write8(hw, STAT_ISR_TIMER_CTRL, TIM_START);
55397 + }
55398 + sky2_write8(hw, STAT_FIFO_ISR_WM, ecmd->rx_max_coalesced_frames_irq);
55399 + return 0;
55400 +}
55401 +
55402 +static void sky2_get_ringparam(struct net_device *dev,
55403 + struct ethtool_ringparam *ering)
55404 +{
55405 + struct sky2_port *sky2 = netdev_priv(dev);
55406 +
55407 + ering->rx_max_pending = RX_MAX_PENDING;
55408 + ering->rx_mini_max_pending = 0;
55409 + ering->rx_jumbo_max_pending = 0;
55410 + ering->tx_max_pending = TX_RING_SIZE - 1;
55411 +
55412 + ering->rx_pending = sky2->rx_pending;
55413 + ering->rx_mini_pending = 0;
55414 + ering->rx_jumbo_pending = 0;
55415 + ering->tx_pending = sky2->tx_pending;
55416 +}
55417 +
55418 +static int sky2_set_ringparam(struct net_device *dev,
55419 + struct ethtool_ringparam *ering)
55420 +{
55421 + struct sky2_port *sky2 = netdev_priv(dev);
55422 + int err = 0;
55423 +
55424 + if (ering->rx_pending > RX_MAX_PENDING ||
55425 + ering->rx_pending < 8 ||
55426 + ering->tx_pending < MAX_SKB_TX_LE ||
55427 + ering->tx_pending > TX_RING_SIZE - 1)
55428 + return -EINVAL;
55429 +
55430 + if (netif_running(dev))
55431 + sky2_down(dev);
55432 +
55433 + sky2->rx_pending = ering->rx_pending;
55434 + sky2->tx_pending = ering->tx_pending;
55435 +
55436 + if (netif_running(dev)) {
55437 + err = sky2_up(dev);
55438 + if (err)
55439 + dev_close(dev);
55440 + else
55441 + sky2_set_multicast(dev);
55442 + }
55443 +
55444 + return err;
55445 +}
55446 +
55447 +static int sky2_get_regs_len(struct net_device *dev)
55448 +{
55449 + return 0x4000;
55450 +}
55451 +
55452 +/*
55453 + * Returns copy of control register region
55454 + * Note: access to the RAM address register set will cause timeouts.
55455 + */
55456 +static void sky2_get_regs(struct net_device *dev, struct ethtool_regs *regs,
55457 + void *p)
55458 +{
55459 + const struct sky2_port *sky2 = netdev_priv(dev);
55460 + const void __iomem *io = sky2->hw->regs;
55461 +
55462 + BUG_ON(regs->len < B3_RI_WTO_R1);
55463 + regs->version = 1;
55464 + memset(p, 0, regs->len);
55465 +
55466 + memcpy_fromio(p, io, B3_RAM_ADDR);
55467 +
55468 + memcpy_fromio(p + B3_RI_WTO_R1,
55469 + io + B3_RI_WTO_R1,
55470 + regs->len - B3_RI_WTO_R1);
55471 +}
55472 +
55473 +static struct ethtool_ops sky2_ethtool_ops = {
55474 + .get_settings = sky2_get_settings,
55475 + .set_settings = sky2_set_settings,
55476 + .get_drvinfo = sky2_get_drvinfo,
55477 + .get_msglevel = sky2_get_msglevel,
55478 + .set_msglevel = sky2_set_msglevel,
55479 + .nway_reset = sky2_nway_reset,
55480 + .get_regs_len = sky2_get_regs_len,
55481 + .get_regs = sky2_get_regs,
55482 + .get_link = ethtool_op_get_link,
55483 + .get_sg = ethtool_op_get_sg,
55484 + .set_sg = ethtool_op_set_sg,
55485 + .get_tx_csum = ethtool_op_get_tx_csum,
55486 + .set_tx_csum = ethtool_op_set_tx_csum,
55487 + .get_tso = ethtool_op_get_tso,
55488 + .set_tso = ethtool_op_set_tso,
55489 + .get_rx_csum = sky2_get_rx_csum,
55490 + .set_rx_csum = sky2_set_rx_csum,
55491 + .get_strings = sky2_get_strings,
55492 + .get_coalesce = sky2_get_coalesce,
55493 + .set_coalesce = sky2_set_coalesce,
55494 + .get_ringparam = sky2_get_ringparam,
55495 + .set_ringparam = sky2_set_ringparam,
55496 + .get_pauseparam = sky2_get_pauseparam,
55497 + .set_pauseparam = sky2_set_pauseparam,
55498 +#ifdef CONFIG_PM
55499 + .get_wol = sky2_get_wol,
55500 + .set_wol = sky2_set_wol,
55501 +#endif
55502 + .phys_id = sky2_phys_id,
55503 + .get_stats_count = sky2_get_stats_count,
55504 + .get_ethtool_stats = sky2_get_ethtool_stats,
55505 + .get_perm_addr = ethtool_op_get_perm_addr,
55506 +};
55507 +
55508 +/* Initialize network device */
55509 +static __devinit struct net_device *sky2_init_netdev(struct sky2_hw *hw,
55510 + unsigned port, int highmem)
55511 +{
55512 + struct sky2_port *sky2;
55513 + struct net_device *dev = alloc_etherdev(sizeof(*sky2));
55514 +
55515 + if (!dev) {
55516 + printk(KERN_ERR "sky2 etherdev alloc failed\n");
55517 + return NULL;
55518 + }
55519 +
55520 + SET_MODULE_OWNER(dev);
55521 + SET_NETDEV_DEV(dev, &hw->pdev->dev);
55522 + dev->irq = hw->pdev->irq;
55523 + dev->open = sky2_up;
55524 + dev->stop = sky2_down;
55525 + dev->do_ioctl = sky2_ioctl;
55526 + dev->hard_start_xmit = sky2_xmit_frame;
55527 + dev->get_stats = sky2_get_stats;
55528 + dev->set_multicast_list = sky2_set_multicast;
55529 + dev->set_mac_address = sky2_set_mac_address;
55530 + dev->change_mtu = sky2_change_mtu;
55531 + SET_ETHTOOL_OPS(dev, &sky2_ethtool_ops);
55532 + dev->tx_timeout = sky2_tx_timeout;
55533 + dev->watchdog_timeo = TX_WATCHDOG;
55534 + if (port == 0)
55535 + dev->poll = sky2_poll;
55536 + dev->weight = NAPI_WEIGHT;
55537 +#ifdef CONFIG_NET_POLL_CONTROLLER
55538 + dev->poll_controller = sky2_netpoll;
55539 +#endif
55540 +
55541 + sky2 = netdev_priv(dev);
55542 + sky2->netdev = dev;
55543 + sky2->hw = hw;
55544 + sky2->msg_enable = netif_msg_init(debug, default_msg);
55545 +
55546 + spin_lock_init(&sky2->tx_lock);
55547 + /* Auto speed and flow control */
55548 + sky2->autoneg = AUTONEG_ENABLE;
55549 + sky2->tx_pause = 1;
55550 + sky2->rx_pause = 1;
55551 + sky2->duplex = -1;
55552 + sky2->speed = -1;
55553 + sky2->advertising = sky2_supported_modes(hw);
55554 +
55555 + /* Receive checksum disabled for Yukon XL
55556 + * because of observed problems with incorrect
55557 + * values when multiple packets are received in one interrupt
55558 + */
55559 + sky2->rx_csum = (hw->chip_id != CHIP_ID_YUKON_XL);
55560 +
55561 + INIT_WORK(&sky2->phy_task, sky2_phy_task, sky2);
55562 + init_MUTEX(&sky2->phy_sema);
55563 + sky2->tx_pending = TX_DEF_PENDING;
55564 + sky2->rx_pending = is_ec_a1(hw) ? 8 : RX_DEF_PENDING;
55565 + sky2->rx_bufsize = sky2_buf_size(ETH_DATA_LEN);
55566 +
55567 + hw->dev[port] = dev;
55568 +
55569 + sky2->port = port;
55570 +
55571 + dev->features |= NETIF_F_LLTX;
55572 + if (hw->chip_id != CHIP_ID_YUKON_EC_U)
55573 + dev->features |= NETIF_F_TSO;
55574 + if (highmem)
55575 + dev->features |= NETIF_F_HIGHDMA;
55576 + dev->features |= NETIF_F_IP_CSUM | NETIF_F_SG;
55577 +
55578 +#ifdef SKY2_VLAN_TAG_USED
55579 + dev->features |= NETIF_F_HW_VLAN_TX | NETIF_F_HW_VLAN_RX;
55580 + dev->vlan_rx_register = sky2_vlan_rx_register;
55581 + dev->vlan_rx_kill_vid = sky2_vlan_rx_kill_vid;
55582 +#endif
55583 +
55584 + /* read the mac address */
55585 + memcpy_fromio(dev->dev_addr, hw->regs + B2_MAC_1 + port * 8, ETH_ALEN);
55586 + memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
55587 +
55588 + /* device is off until link detection */
55589 + netif_carrier_off(dev);
55590 + netif_stop_queue(dev);
55591 +
55592 + return dev;
55593 +}
55594 +
55595 +static void __devinit sky2_show_addr(struct net_device *dev)
55596 +{
55597 + const struct sky2_port *sky2 = netdev_priv(dev);
55598 +
55599 + if (netif_msg_probe(sky2))
55600 + printk(KERN_INFO PFX "%s: addr %02x:%02x:%02x:%02x:%02x:%02x\n",
55601 + dev->name,
55602 + dev->dev_addr[0], dev->dev_addr[1], dev->dev_addr[2],
55603 + dev->dev_addr[3], dev->dev_addr[4], dev->dev_addr[5]);
55604 +}
55605 +
55606 +static int __devinit sky2_probe(struct pci_dev *pdev,
55607 + const struct pci_device_id *ent)
55608 +{
55609 + struct net_device *dev, *dev1 = NULL;
55610 + struct sky2_hw *hw;
55611 + int err, pm_cap, using_dac = 0;
55612 +
55613 + err = pci_enable_device(pdev);
55614 + if (err) {
55615 + printk(KERN_ERR PFX "%s cannot enable PCI device\n",
55616 + pci_name(pdev));
55617 + goto err_out;
55618 + }
55619 +
55620 + err = pci_request_regions(pdev, DRV_NAME);
55621 + if (err) {
55622 + printk(KERN_ERR PFX "%s cannot obtain PCI resources\n",
55623 + pci_name(pdev));
55624 + goto err_out;
55625 + }
55626 +
55627 + pci_set_master(pdev);
55628 +
55629 + /* Find power-management capability. */
55630 + pm_cap = pci_find_capability(pdev, PCI_CAP_ID_PM);
55631 + if (pm_cap == 0) {
55632 + printk(KERN_ERR PFX "Cannot find PowerManagement capability, "
55633 + "aborting.\n");
55634 + err = -EIO;
55635 + goto err_out_free_regions;
55636 + }
55637 +
55638 + if (sizeof(dma_addr_t) > sizeof(u32) &&
55639 + !(err = pci_set_dma_mask(pdev, DMA_64BIT_MASK))) {
55640 + using_dac = 1;
55641 + err = pci_set_consistent_dma_mask(pdev, DMA_64BIT_MASK);
55642 + if (err < 0) {
55643 + printk(KERN_ERR PFX "%s unable to obtain 64 bit DMA "
55644 + "for consistent allocations\n", pci_name(pdev));
55645 + goto err_out_free_regions;
55646 + }
55647 +
55648 + } else {
55649 + err = pci_set_dma_mask(pdev, DMA_32BIT_MASK);
55650 + if (err) {
55651 + printk(KERN_ERR PFX "%s no usable DMA configuration\n",
55652 + pci_name(pdev));
55653 + goto err_out_free_regions;
55654 + }
55655 + }
55656 +
55657 + err = -ENOMEM;
55658 + hw = kzalloc(sizeof(*hw), GFP_KERNEL);
55659 + if (!hw) {
55660 + printk(KERN_ERR PFX "%s: cannot allocate hardware struct\n",
55661 + pci_name(pdev));
55662 + goto err_out_free_regions;
55663 + }
55664 +
55665 + hw->pdev = pdev;
55666 +
55667 + hw->regs = ioremap_nocache(pci_resource_start(pdev, 0), 0x4000);
55668 + if (!hw->regs) {
55669 + printk(KERN_ERR PFX "%s: cannot map device registers\n",
55670 + pci_name(pdev));
55671 + goto err_out_free_hw;
55672 + }
55673 + hw->pm_cap = pm_cap;
55674 + spin_lock_init(&hw->hw_lock);
55675 +
55676 +#ifdef __BIG_ENDIAN
55677 + /* byte swap descriptors in hardware */
55678 + {
55679 + u32 reg;
55680 +
55681 + reg = sky2_pci_read32(hw, PCI_DEV_REG2);
55682 + reg |= PCI_REV_DESC;
55683 + sky2_pci_write32(hw, PCI_DEV_REG2, reg);
55684 + }
55685 +#endif
55686 +
55687 + /* ring for status responses */
55688 + hw->st_le = pci_alloc_consistent(hw->pdev, STATUS_LE_BYTES,
55689 + &hw->st_dma);
55690 + if (!hw->st_le)
55691 + goto err_out_iounmap;
55692 +
55693 + err = sky2_reset(hw);
55694 + if (err)
55695 + goto err_out_iounmap;
55696 +
55697 + printk(KERN_INFO PFX "v%s addr 0x%lx irq %d Yukon-%s (0x%x) rev %d\n",
55698 + DRV_VERSION, pci_resource_start(pdev, 0), pdev->irq,
55699 + yukon2_name[hw->chip_id - CHIP_ID_YUKON_XL],
55700 + hw->chip_id, hw->chip_rev);
55701 +
55702 + dev = sky2_init_netdev(hw, 0, using_dac);
55703 + if (!dev)
55704 + goto err_out_free_pci;
55705 +
55706 + err = register_netdev(dev);
55707 + if (err) {
55708 + printk(KERN_ERR PFX "%s: cannot register net device\n",
55709 + pci_name(pdev));
55710 + goto err_out_free_netdev;
55711 + }
55712 +
55713 + sky2_show_addr(dev);
55714 +
55715 + if (hw->ports > 1 && (dev1 = sky2_init_netdev(hw, 1, using_dac))) {
55716 + if (register_netdev(dev1) == 0)
55717 + sky2_show_addr(dev1);
55718 + else {
55719 + /* Failure to register second port need not be fatal */
55720 + printk(KERN_WARNING PFX
55721 + "register of second port failed\n");
55722 + hw->dev[1] = NULL;
55723 + free_netdev(dev1);
55724 + }
55725 + }
55726 +
55727 + err = request_irq(pdev->irq, sky2_intr, SA_SHIRQ, DRV_NAME, hw);
55728 + if (err) {
55729 + printk(KERN_ERR PFX "%s: cannot assign irq %d\n",
55730 + pci_name(pdev), pdev->irq);
55731 + goto err_out_unregister;
55732 + }
55733 +
55734 + hw->intr_mask = Y2_IS_BASE;
55735 + sky2_write32(hw, B0_IMSK, hw->intr_mask);
55736 +
55737 + pci_set_drvdata(pdev, hw);
55738 +
55739 + return 0;
55740 +
55741 +err_out_unregister:
55742 + if (dev1) {
55743 + unregister_netdev(dev1);
55744 + free_netdev(dev1);
55745 + }
55746 + unregister_netdev(dev);
55747 +err_out_free_netdev:
55748 + free_netdev(dev);
55749 +err_out_free_pci:
55750 + sky2_write8(hw, B0_CTST, CS_RST_SET);
55751 + pci_free_consistent(hw->pdev, STATUS_LE_BYTES, hw->st_le, hw->st_dma);
55752 +err_out_iounmap:
55753 + iounmap(hw->regs);
55754 +err_out_free_hw:
55755 + kfree(hw);
55756 +err_out_free_regions:
55757 + pci_release_regions(pdev);
55758 + pci_disable_device(pdev);
55759 +err_out:
55760 + return err;
55761 +}
55762 +
55763 +static void __devexit sky2_remove(struct pci_dev *pdev)
55764 +{
55765 + struct sky2_hw *hw = pci_get_drvdata(pdev);
55766 + struct net_device *dev0, *dev1;
55767 +
55768 + if (!hw)
55769 + return;
55770 +
55771 + dev0 = hw->dev[0];
55772 + dev1 = hw->dev[1];
55773 + if (dev1)
55774 + unregister_netdev(dev1);
55775 + unregister_netdev(dev0);
55776 +
55777 + sky2_write32(hw, B0_IMSK, 0);
55778 + sky2_set_power_state(hw, PCI_D3hot);
55779 + sky2_write16(hw, B0_Y2LED, LED_STAT_OFF);
55780 + sky2_write8(hw, B0_CTST, CS_RST_SET);
55781 + sky2_read8(hw, B0_CTST);
55782 +
55783 + free_irq(pdev->irq, hw);
55784 + pci_free_consistent(pdev, STATUS_LE_BYTES, hw->st_le, hw->st_dma);
55785 + pci_release_regions(pdev);
55786 + pci_disable_device(pdev);
55787 +
55788 + if (dev1)
55789 + free_netdev(dev1);
55790 + free_netdev(dev0);
55791 + iounmap(hw->regs);
55792 + kfree(hw);
55793 +
55794 + pci_set_drvdata(pdev, NULL);
55795 +}
55796 +
55797 +#ifdef CONFIG_PM
55798 +static int sky2_suspend(struct pci_dev *pdev, pm_message_t state)
55799 +{
55800 + struct sky2_hw *hw = pci_get_drvdata(pdev);
55801 + int i;
55802 +
55803 + for (i = 0; i < 2; i++) {
55804 + struct net_device *dev = hw->dev[i];
55805 +
55806 + if (dev) {
55807 + if (!netif_running(dev))
55808 + continue;
55809 +
55810 + sky2_down(dev);
55811 + netif_device_detach(dev);
55812 + }
55813 + }
55814 +
55815 + return sky2_set_power_state(hw, pci_choose_state(pdev, state));
55816 +}
55817 +
55818 +static int sky2_resume(struct pci_dev *pdev)
55819 +{
55820 + struct sky2_hw *hw = pci_get_drvdata(pdev);
55821 + int i, err;
55822 +
55823 + pci_restore_state(pdev);
55824 + pci_enable_wake(pdev, PCI_D0, 0);
55825 + err = sky2_set_power_state(hw, PCI_D0);
55826 + if (err)
55827 + goto out;
55828 +
55829 + err = sky2_reset(hw);
55830 + if (err)
55831 + goto out;
55832 +
55833 + for (i = 0; i < 2; i++) {
55834 + struct net_device *dev = hw->dev[i];
55835 + if (dev && netif_running(dev)) {
55836 + netif_device_attach(dev);
55837 + err = sky2_up(dev);
55838 + if (err) {
55839 + printk(KERN_ERR PFX "%s: could not up: %d\n",
55840 + dev->name, err);
55841 + dev_close(dev);
55842 + break;
55843 + }
55844 + }
55845 + }
55846 +out:
55847 + return err;
55848 +}
55849 +#endif
55850 +
55851 +static struct pci_driver sky2_driver = {
55852 + .name = DRV_NAME,
55853 + .id_table = sky2_id_table,
55854 + .probe = sky2_probe,
55855 + .remove = __devexit_p(sky2_remove),
55856 +#ifdef CONFIG_PM
55857 + .suspend = sky2_suspend,
55858 + .resume = sky2_resume,
55859 +#endif
55860 +};
55861 +
55862 +static int __init sky2_init_module(void)
55863 +{
55864 + return pci_register_driver(&sky2_driver);
55865 +}
55866 +
55867 +static void __exit sky2_cleanup_module(void)
55868 +{
55869 + pci_unregister_driver(&sky2_driver);
55870 +}
55871 +
55872 +module_init(sky2_init_module);
55873 +module_exit(sky2_cleanup_module);
55874 +
55875 +MODULE_DESCRIPTION("Marvell Yukon 2 Gigabit Ethernet driver");
55876 +MODULE_AUTHOR("Stephen Hemminger <shemminger@osdl.org>");
55877 +MODULE_LICENSE("GPL");
55878 +MODULE_VERSION(DRV_VERSION);
55879 diff -Nur linux-2.6.16.33-noxen/drivers/net/tg3.c linux-2.6.16.33/drivers/net/tg3.c
55880 --- linux-2.6.16.33-noxen/drivers/net/tg3.c 2006-11-22 18:06:31.000000000 +0000
55881 +++ linux-2.6.16.33/drivers/net/tg3.c 2007-05-23 21:00:01.000000000 +0000
55882 @@ -3664,7 +3664,7 @@
55883 #if TG3_TSO_SUPPORT != 0
55884 mss = 0;
55885 if (skb->len > (tp->dev->mtu + ETH_HLEN) &&
55886 - (mss = skb_shinfo(skb)->tso_size) != 0) {
55887 + (mss = skb_shinfo(skb)->gso_size) != 0) {
55888 int tcp_opt_len, ip_tcp_len;
55889
55890 if (skb_header_cloned(skb) &&
55891 diff -Nur linux-2.6.16.33-noxen/drivers/net/tulip/winbond-840.c linux-2.6.16.33/drivers/net/tulip/winbond-840.c
55892 --- linux-2.6.16.33-noxen/drivers/net/tulip/winbond-840.c 2006-11-22 18:06:31.000000000 +0000
55893 +++ linux-2.6.16.33/drivers/net/tulip/winbond-840.c 2007-05-23 21:00:01.000000000 +0000
55894 @@ -1605,11 +1605,11 @@
55895 * - get_stats:
55896 * spin_lock_irq(np->lock), doesn't touch hw if not present
55897 * - hard_start_xmit:
55898 - * netif_stop_queue + spin_unlock_wait(&dev->xmit_lock);
55899 + * synchronize_irq + netif_tx_disable;
55900 * - tx_timeout:
55901 - * netif_device_detach + spin_unlock_wait(&dev->xmit_lock);
55902 + * netif_device_detach + netif_tx_disable;
55903 * - set_multicast_list
55904 - * netif_device_detach + spin_unlock_wait(&dev->xmit_lock);
55905 + * netif_device_detach + netif_tx_disable;
55906 * - interrupt handler
55907 * doesn't touch hw if not present, synchronize_irq waits for
55908 * running instances of the interrupt handler.
55909 @@ -1635,11 +1635,10 @@
55910 netif_device_detach(dev);
55911 update_csr6(dev, 0);
55912 iowrite32(0, ioaddr + IntrEnable);
55913 - netif_stop_queue(dev);
55914 spin_unlock_irq(&np->lock);
55915
55916 - spin_unlock_wait(&dev->xmit_lock);
55917 synchronize_irq(dev->irq);
55918 + netif_tx_disable(dev);
55919
55920 np->stats.rx_missed_errors += ioread32(ioaddr + RxMissed) & 0xffff;
55921
55922 diff -Nur linux-2.6.16.33-noxen/drivers/net/typhoon.c linux-2.6.16.33/drivers/net/typhoon.c
55923 --- linux-2.6.16.33-noxen/drivers/net/typhoon.c 2006-11-22 18:06:31.000000000 +0000
55924 +++ linux-2.6.16.33/drivers/net/typhoon.c 2007-05-23 21:00:01.000000000 +0000
55925 @@ -340,7 +340,7 @@
55926 #endif
55927
55928 #if defined(NETIF_F_TSO)
55929 -#define skb_tso_size(x) (skb_shinfo(x)->tso_size)
55930 +#define skb_tso_size(x) (skb_shinfo(x)->gso_size)
55931 #define TSO_NUM_DESCRIPTORS 2
55932 #define TSO_OFFLOAD_ON TYPHOON_OFFLOAD_TCP_SEGMENT
55933 #else
55934 @@ -805,7 +805,7 @@
55935 * If problems develop with TSO, check this first.
55936 */
55937 numDesc = skb_shinfo(skb)->nr_frags + 1;
55938 - if(skb_tso_size(skb))
55939 + if (skb_is_gso(skb))
55940 numDesc++;
55941
55942 /* When checking for free space in the ring, we need to also
55943 @@ -845,7 +845,7 @@
55944 TYPHOON_TX_PF_VLAN_TAG_SHIFT);
55945 }
55946
55947 - if(skb_tso_size(skb)) {
55948 + if (skb_is_gso(skb)) {
55949 first_txd->processFlags |= TYPHOON_TX_PF_TCP_SEGMENT;
55950 first_txd->numDesc++;
55951
55952 diff -Nur linux-2.6.16.33-noxen/drivers/net/via-velocity.c linux-2.6.16.33/drivers/net/via-velocity.c
55953 --- linux-2.6.16.33-noxen/drivers/net/via-velocity.c 2006-11-22 18:06:31.000000000 +0000
55954 +++ linux-2.6.16.33/drivers/net/via-velocity.c 2007-05-23 21:00:01.000000000 +0000
55955 @@ -1905,6 +1905,13 @@
55956
55957 int pktlen = skb->len;
55958
55959 +#ifdef VELOCITY_ZERO_COPY_SUPPORT
55960 + if (skb_shinfo(skb)->nr_frags > 6 && __skb_linearize(skb)) {
55961 + kfree_skb(skb);
55962 + return 0;
55963 + }
55964 +#endif
55965 +
55966 spin_lock_irqsave(&vptr->lock, flags);
55967
55968 index = vptr->td_curr[qnum];
55969 @@ -1920,8 +1927,6 @@
55970 */
55971 if (pktlen < ETH_ZLEN) {
55972 /* Cannot occur until ZC support */
55973 - if(skb_linearize(skb, GFP_ATOMIC))
55974 - return 0;
55975 pktlen = ETH_ZLEN;
55976 memcpy(tdinfo->buf, skb->data, skb->len);
55977 memset(tdinfo->buf + skb->len, 0, ETH_ZLEN - skb->len);
55978 @@ -1939,7 +1944,6 @@
55979 int nfrags = skb_shinfo(skb)->nr_frags;
55980 tdinfo->skb = skb;
55981 if (nfrags > 6) {
55982 - skb_linearize(skb, GFP_ATOMIC);
55983 memcpy(tdinfo->buf, skb->data, skb->len);
55984 tdinfo->skb_dma[0] = tdinfo->buf_dma;
55985 td_ptr->tdesc0.pktsize =
55986 diff -Nur linux-2.6.16.33-noxen/drivers/net/wireless/orinoco.c linux-2.6.16.33/drivers/net/wireless/orinoco.c
55987 --- linux-2.6.16.33-noxen/drivers/net/wireless/orinoco.c 2006-11-22 18:06:31.000000000 +0000
55988 +++ linux-2.6.16.33/drivers/net/wireless/orinoco.c 2007-05-23 21:00:01.000000000 +0000
55989 @@ -1835,7 +1835,9 @@
55990 /* Set promiscuity / multicast*/
55991 priv->promiscuous = 0;
55992 priv->mc_count = 0;
55993 - __orinoco_set_multicast_list(dev); /* FIXME: what about the xmit_lock */
55994 +
55995 + /* FIXME: what about netif_tx_lock */
55996 + __orinoco_set_multicast_list(dev);
55997
55998 return 0;
55999 }
56000 diff -Nur linux-2.6.16.33-noxen/drivers/oprofile/buffer_sync.c linux-2.6.16.33/drivers/oprofile/buffer_sync.c
56001 --- linux-2.6.16.33-noxen/drivers/oprofile/buffer_sync.c 2006-11-22 18:06:31.000000000 +0000
56002 +++ linux-2.6.16.33/drivers/oprofile/buffer_sync.c 2007-05-23 21:00:01.000000000 +0000
56003 @@ -6,6 +6,10 @@
56004 *
56005 * @author John Levon <levon@movementarian.org>
56006 *
56007 + * Modified by Aravind Menon for Xen
56008 + * These modifications are:
56009 + * Copyright (C) 2005 Hewlett-Packard Co.
56010 + *
56011 * This is the core of the buffer management. Each
56012 * CPU buffer is processed and entered into the
56013 * global event buffer. Such processing is necessary
56014 @@ -38,6 +42,7 @@
56015 static DEFINE_SPINLOCK(task_mortuary);
56016 static void process_task_mortuary(void);
56017
56018 +static int cpu_current_domain[NR_CPUS];
56019
56020 /* Take ownership of the task struct and place it on the
56021 * list for processing. Only after two full buffer syncs
56022 @@ -146,6 +151,11 @@
56023 int sync_start(void)
56024 {
56025 int err;
56026 + int i;
56027 +
56028 + for (i = 0; i < NR_CPUS; i++) {
56029 + cpu_current_domain[i] = COORDINATOR_DOMAIN;
56030 + }
56031
56032 start_cpu_work();
56033
56034 @@ -275,15 +285,31 @@
56035 last_cookie = INVALID_COOKIE;
56036 }
56037
56038 -static void add_kernel_ctx_switch(unsigned int in_kernel)
56039 +static void add_cpu_mode_switch(unsigned int cpu_mode)
56040 {
56041 add_event_entry(ESCAPE_CODE);
56042 - if (in_kernel)
56043 - add_event_entry(KERNEL_ENTER_SWITCH_CODE);
56044 - else
56045 - add_event_entry(KERNEL_EXIT_SWITCH_CODE);
56046 + switch (cpu_mode) {
56047 + case CPU_MODE_USER:
56048 + add_event_entry(USER_ENTER_SWITCH_CODE);
56049 + break;
56050 + case CPU_MODE_KERNEL:
56051 + add_event_entry(KERNEL_ENTER_SWITCH_CODE);
56052 + break;
56053 + case CPU_MODE_XEN:
56054 + add_event_entry(XEN_ENTER_SWITCH_CODE);
56055 + break;
56056 + default:
56057 + break;
56058 + }
56059 }
56060 -
56061 +
56062 +static void add_domain_switch(unsigned long domain_id)
56063 +{
56064 + add_event_entry(ESCAPE_CODE);
56065 + add_event_entry(DOMAIN_SWITCH_CODE);
56066 + add_event_entry(domain_id);
56067 +}
56068 +
56069 static void
56070 add_user_ctx_switch(struct task_struct const * task, unsigned long cookie)
56071 {
56072 @@ -348,9 +374,9 @@
56073 * for later lookup from userspace.
56074 */
56075 static int
56076 -add_sample(struct mm_struct * mm, struct op_sample * s, int in_kernel)
56077 +add_sample(struct mm_struct * mm, struct op_sample * s, int cpu_mode)
56078 {
56079 - if (in_kernel) {
56080 + if (cpu_mode >= CPU_MODE_KERNEL) {
56081 add_sample_entry(s->eip, s->event);
56082 return 1;
56083 } else if (mm) {
56084 @@ -496,15 +522,21 @@
56085 struct mm_struct *mm = NULL;
56086 struct task_struct * new;
56087 unsigned long cookie = 0;
56088 - int in_kernel = 1;
56089 + int cpu_mode = 1;
56090 unsigned int i;
56091 sync_buffer_state state = sb_buffer_start;
56092 unsigned long available;
56093 + int domain_switch = 0;
56094
56095 down(&buffer_sem);
56096
56097 add_cpu_switch(cpu);
56098
56099 + /* We need to assign the first samples in this CPU buffer to the
56100 + same domain that we were processing at the last sync_buffer */
56101 + if (cpu_current_domain[cpu] != COORDINATOR_DOMAIN) {
56102 + add_domain_switch(cpu_current_domain[cpu]);
56103 + }
56104 /* Remember, only we can modify tail_pos */
56105
56106 available = get_slots(cpu_buf);
56107 @@ -512,16 +544,18 @@
56108 for (i = 0; i < available; ++i) {
56109 struct op_sample * s = &cpu_buf->buffer[cpu_buf->tail_pos];
56110
56111 - if (is_code(s->eip)) {
56112 - if (s->event <= CPU_IS_KERNEL) {
56113 - /* kernel/userspace switch */
56114 - in_kernel = s->event;
56115 + if (is_code(s->eip) && !domain_switch) {
56116 + if (s->event <= CPU_MODE_XEN) {
56117 + /* xen/kernel/userspace switch */
56118 + cpu_mode = s->event;
56119 if (state == sb_buffer_start)
56120 state = sb_sample_start;
56121 - add_kernel_ctx_switch(s->event);
56122 + add_cpu_mode_switch(s->event);
56123 } else if (s->event == CPU_TRACE_BEGIN) {
56124 state = sb_bt_start;
56125 add_trace_begin();
56126 + } else if (s->event == CPU_DOMAIN_SWITCH) {
56127 + domain_switch = 1;
56128 } else {
56129 struct mm_struct * oldmm = mm;
56130
56131 @@ -535,11 +569,21 @@
56132 add_user_ctx_switch(new, cookie);
56133 }
56134 } else {
56135 - if (state >= sb_bt_start &&
56136 - !add_sample(mm, s, in_kernel)) {
56137 - if (state == sb_bt_start) {
56138 - state = sb_bt_ignore;
56139 - atomic_inc(&oprofile_stats.bt_lost_no_mapping);
56140 + if (domain_switch) {
56141 + cpu_current_domain[cpu] = s->eip;
56142 + add_domain_switch(s->eip);
56143 + domain_switch = 0;
56144 + } else {
56145 + if (cpu_current_domain[cpu] !=
56146 + COORDINATOR_DOMAIN) {
56147 + add_sample_entry(s->eip, s->event);
56148 + }
56149 + else if (state >= sb_bt_start &&
56150 + !add_sample(mm, s, cpu_mode)) {
56151 + if (state == sb_bt_start) {
56152 + state = sb_bt_ignore;
56153 + atomic_inc(&oprofile_stats.bt_lost_no_mapping);
56154 + }
56155 }
56156 }
56157 }
56158 @@ -548,6 +592,11 @@
56159 }
56160 release_mm(mm);
56161
56162 + /* We reset domain to COORDINATOR at each CPU switch */
56163 + if (cpu_current_domain[cpu] != COORDINATOR_DOMAIN) {
56164 + add_domain_switch(COORDINATOR_DOMAIN);
56165 + }
56166 +
56167 mark_done(cpu);
56168
56169 up(&buffer_sem);
56170 diff -Nur linux-2.6.16.33-noxen/drivers/oprofile/cpu_buffer.c linux-2.6.16.33/drivers/oprofile/cpu_buffer.c
56171 --- linux-2.6.16.33-noxen/drivers/oprofile/cpu_buffer.c 2006-11-22 18:06:31.000000000 +0000
56172 +++ linux-2.6.16.33/drivers/oprofile/cpu_buffer.c 2007-05-23 21:00:01.000000000 +0000
56173 @@ -6,6 +6,10 @@
56174 *
56175 * @author John Levon <levon@movementarian.org>
56176 *
56177 + * Modified by Aravind Menon for Xen
56178 + * These modifications are:
56179 + * Copyright (C) 2005 Hewlett-Packard Co.
56180 + *
56181 * Each CPU has a local buffer that stores PC value/event
56182 * pairs. We also log context switches when we notice them.
56183 * Eventually each CPU's buffer is processed into the global
56184 @@ -34,6 +38,8 @@
56185 #define DEFAULT_TIMER_EXPIRE (HZ / 10)
56186 static int work_enabled;
56187
56188 +static int32_t current_domain = COORDINATOR_DOMAIN;
56189 +
56190 void free_cpu_buffers(void)
56191 {
56192 int i;
56193 @@ -58,7 +64,7 @@
56194 goto fail;
56195
56196 b->last_task = NULL;
56197 - b->last_is_kernel = -1;
56198 + b->last_cpu_mode = -1;
56199 b->tracing = 0;
56200 b->buffer_size = buffer_size;
56201 b->tail_pos = 0;
56202 @@ -114,7 +120,7 @@
56203 * collected will populate the buffer with proper
56204 * values to initialize the buffer
56205 */
56206 - cpu_buf->last_is_kernel = -1;
56207 + cpu_buf->last_cpu_mode = -1;
56208 cpu_buf->last_task = NULL;
56209 }
56210
56211 @@ -164,13 +170,13 @@
56212 * because of the head/tail separation of the writer and reader
56213 * of the CPU buffer.
56214 *
56215 - * is_kernel is needed because on some architectures you cannot
56216 + * cpu_mode is needed because on some architectures you cannot
56217 * tell if you are in kernel or user space simply by looking at
56218 - * pc. We tag this in the buffer by generating kernel enter/exit
56219 - * events whenever is_kernel changes
56220 + * pc. We tag this in the buffer by generating kernel/user (and xen)
56221 + * enter events whenever cpu_mode changes
56222 */
56223 static int log_sample(struct oprofile_cpu_buffer * cpu_buf, unsigned long pc,
56224 - int is_kernel, unsigned long event)
56225 + int cpu_mode, unsigned long event)
56226 {
56227 struct task_struct * task;
56228
56229 @@ -181,18 +187,18 @@
56230 return 0;
56231 }
56232
56233 - is_kernel = !!is_kernel;
56234 -
56235 task = current;
56236
56237 /* notice a switch from user->kernel or vice versa */
56238 - if (cpu_buf->last_is_kernel != is_kernel) {
56239 - cpu_buf->last_is_kernel = is_kernel;
56240 - add_code(cpu_buf, is_kernel);
56241 + if (cpu_buf->last_cpu_mode != cpu_mode) {
56242 + cpu_buf->last_cpu_mode = cpu_mode;
56243 + add_code(cpu_buf, cpu_mode);
56244 }
56245 -
56246 +
56247 /* notice a task switch */
56248 - if (cpu_buf->last_task != task) {
56249 + /* if not processing other domain samples */
56250 + if ((cpu_buf->last_task != task) &&
56251 + (current_domain == COORDINATOR_DOMAIN)) {
56252 cpu_buf->last_task = task;
56253 add_code(cpu_buf, (unsigned long)task);
56254 }
56255 @@ -269,6 +275,25 @@
56256 add_sample(cpu_buf, pc, 0);
56257 }
56258
56259 +int oprofile_add_domain_switch(int32_t domain_id)
56260 +{
56261 + struct oprofile_cpu_buffer * cpu_buf = &cpu_buffer[smp_processor_id()];
56262 +
56263 + /* should have space for switching into and out of domain
56264 + (2 slots each) plus one sample and one cpu mode switch */
56265 + if (((nr_available_slots(cpu_buf) < 6) &&
56266 + (domain_id != COORDINATOR_DOMAIN)) ||
56267 + (nr_available_slots(cpu_buf) < 2))
56268 + return 0;
56269 +
56270 + add_code(cpu_buf, CPU_DOMAIN_SWITCH);
56271 + add_sample(cpu_buf, domain_id, 0);
56272 +
56273 + current_domain = domain_id;
56274 +
56275 + return 1;
56276 +}
56277 +
56278 /*
56279 * This serves to avoid cpu buffer overflow, and makes sure
56280 * the task mortuary progresses
56281 diff -Nur linux-2.6.16.33-noxen/drivers/oprofile/cpu_buffer.h linux-2.6.16.33/drivers/oprofile/cpu_buffer.h
56282 --- linux-2.6.16.33-noxen/drivers/oprofile/cpu_buffer.h 2006-11-22 18:06:31.000000000 +0000
56283 +++ linux-2.6.16.33/drivers/oprofile/cpu_buffer.h 2007-05-23 21:00:01.000000000 +0000
56284 @@ -36,7 +36,7 @@
56285 volatile unsigned long tail_pos;
56286 unsigned long buffer_size;
56287 struct task_struct * last_task;
56288 - int last_is_kernel;
56289 + int last_cpu_mode;
56290 int tracing;
56291 struct op_sample * buffer;
56292 unsigned long sample_received;
56293 @@ -51,7 +51,10 @@
56294 void cpu_buffer_reset(struct oprofile_cpu_buffer * cpu_buf);
56295
56296 /* transient events for the CPU buffer -> event buffer */
56297 -#define CPU_IS_KERNEL 1
56298 -#define CPU_TRACE_BEGIN 2
56299 +#define CPU_MODE_USER 0
56300 +#define CPU_MODE_KERNEL 1
56301 +#define CPU_MODE_XEN 2
56302 +#define CPU_TRACE_BEGIN 3
56303 +#define CPU_DOMAIN_SWITCH 4
56304
56305 #endif /* OPROFILE_CPU_BUFFER_H */
56306 diff -Nur linux-2.6.16.33-noxen/drivers/oprofile/event_buffer.h linux-2.6.16.33/drivers/oprofile/event_buffer.h
56307 --- linux-2.6.16.33-noxen/drivers/oprofile/event_buffer.h 2006-11-22 18:06:31.000000000 +0000
56308 +++ linux-2.6.16.33/drivers/oprofile/event_buffer.h 2007-05-23 21:00:01.000000000 +0000
56309 @@ -29,15 +29,20 @@
56310 #define CPU_SWITCH_CODE 2
56311 #define COOKIE_SWITCH_CODE 3
56312 #define KERNEL_ENTER_SWITCH_CODE 4
56313 -#define KERNEL_EXIT_SWITCH_CODE 5
56314 +#define USER_ENTER_SWITCH_CODE 5
56315 #define MODULE_LOADED_CODE 6
56316 #define CTX_TGID_CODE 7
56317 #define TRACE_BEGIN_CODE 8
56318 #define TRACE_END_CODE 9
56319 +#define XEN_ENTER_SWITCH_CODE 10
56320 +#define DOMAIN_SWITCH_CODE 11
56321
56322 #define INVALID_COOKIE ~0UL
56323 #define NO_COOKIE 0UL
56324
56325 +/* Constant used to refer to coordinator domain (Xen) */
56326 +#define COORDINATOR_DOMAIN -1
56327 +
56328 /* add data to the event buffer */
56329 void add_event_entry(unsigned long data);
56330
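
For context, the codes above extend oprofile's event-buffer escape protocol: XEN_ENTER_SWITCH_CODE marks a switch of sample context into the hypervisor, and DOMAIN_SWITCH_CODE announces which domain the following samples belong to, with COORDINATOR_DOMAIN (-1) standing for the coordinating domain. The sketch below is illustrative only and not part of the patch; the constants simply mirror event_buffer.h and the surrounding buffer framing is simplified away.

/* Illustrative sketch only: decode the escape codes defined above.  The
 * constants mirror event_buffer.h; the buffer framing is simplified away. */
#include <stdint.h>
#include <stdio.h>

#define KERNEL_ENTER_SWITCH_CODE  4
#define USER_ENTER_SWITCH_CODE    5
#define XEN_ENTER_SWITCH_CODE    10
#define DOMAIN_SWITCH_CODE       11
#define COORDINATOR_DOMAIN       (-1)

static void decode_escape(unsigned long code, unsigned long payload)
{
	switch (code) {
	case KERNEL_ENTER_SWITCH_CODE:
		puts("following samples were taken in kernel mode");
		break;
	case USER_ENTER_SWITCH_CODE:
		puts("following samples were taken in user mode");
		break;
	case XEN_ENTER_SWITCH_CODE:
		puts("following samples were taken in the hypervisor");
		break;
	case DOMAIN_SWITCH_CODE:
		if ((int32_t)payload == COORDINATOR_DOMAIN)
			puts("back to the coordinator domain");
		else
			printf("following samples belong to domain %ld\n",
			       (long)(int32_t)payload);
		break;
	default:
		break;	/* other codes are unchanged by this patch */
	}
}

int main(void)
{
	decode_escape(DOMAIN_SWITCH_CODE, 3);	/* e.g. a passive domU */
	decode_escape(XEN_ENTER_SWITCH_CODE, 0);
	return 0;
}
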
56331 diff -Nur linux-2.6.16.33-noxen/drivers/oprofile/oprof.c linux-2.6.16.33/drivers/oprofile/oprof.c
56332 --- linux-2.6.16.33-noxen/drivers/oprofile/oprof.c 2006-11-22 18:06:31.000000000 +0000
56333 +++ linux-2.6.16.33/drivers/oprofile/oprof.c 2007-05-23 21:00:01.000000000 +0000
56334 @@ -5,6 +5,10 @@
56335 * @remark Read the file COPYING
56336 *
56337 * @author John Levon <levon@movementarian.org>
56338 + *
56339 + * Modified by Aravind Menon for Xen
56340 + * These modifications are:
56341 + * Copyright (C) 2005 Hewlett-Packard Co.
56342 */
56343
56344 #include <linux/kernel.h>
56345 @@ -19,7 +23,7 @@
56346 #include "cpu_buffer.h"
56347 #include "buffer_sync.h"
56348 #include "oprofile_stats.h"
56349 -
56350 +
56351 struct oprofile_operations oprofile_ops;
56352
56353 unsigned long oprofile_started;
56354 @@ -33,6 +37,32 @@
56355 */
56356 static int timer = 0;
56357
56358 +int oprofile_set_active(int active_domains[], unsigned int adomains)
56359 +{
56360 + int err;
56361 +
56362 + if (!oprofile_ops.set_active)
56363 + return -EINVAL;
56364 +
56365 + down(&start_sem);
56366 + err = oprofile_ops.set_active(active_domains, adomains);
56367 + up(&start_sem);
56368 + return err;
56369 +}
56370 +
56371 +int oprofile_set_passive(int passive_domains[], unsigned int pdomains)
56372 +{
56373 + int err;
56374 +
56375 + if (!oprofile_ops.set_passive)
56376 + return -EINVAL;
56377 +
56378 + down(&start_sem);
56379 + err = oprofile_ops.set_passive(passive_domains, pdomains);
56380 + up(&start_sem);
56381 + return err;
56382 +}
56383 +
56384 int oprofile_setup(void)
56385 {
56386 int err;
56387 diff -Nur linux-2.6.16.33-noxen/drivers/oprofile/oprof.h linux-2.6.16.33/drivers/oprofile/oprof.h
56388 --- linux-2.6.16.33-noxen/drivers/oprofile/oprof.h 2006-11-22 18:06:31.000000000 +0000
56389 +++ linux-2.6.16.33/drivers/oprofile/oprof.h 2007-05-23 21:00:01.000000000 +0000
56390 @@ -35,5 +35,8 @@
56391 void oprofile_timer_init(struct oprofile_operations * ops);
56392
56393 int oprofile_set_backtrace(unsigned long depth);
56394 +
56395 +int oprofile_set_active(int active_domains[], unsigned int adomains);
56396 +int oprofile_set_passive(int passive_domains[], unsigned int pdomains);
56397
56398 #endif /* OPROF_H */
56399 diff -Nur linux-2.6.16.33-noxen/drivers/oprofile/oprofile_files.c linux-2.6.16.33/drivers/oprofile/oprofile_files.c
56400 --- linux-2.6.16.33-noxen/drivers/oprofile/oprofile_files.c 2006-11-22 18:06:31.000000000 +0000
56401 +++ linux-2.6.16.33/drivers/oprofile/oprofile_files.c 2007-05-23 21:00:01.000000000 +0000
56402 @@ -5,15 +5,21 @@
56403 * @remark Read the file COPYING
56404 *
56405 * @author John Levon <levon@movementarian.org>
56406 + *
56407 + * Modified by Aravind Menon for Xen
56408 + * These modifications are:
56409 + * Copyright (C) 2005 Hewlett-Packard Co.
56410 */
56411
56412 #include <linux/fs.h>
56413 #include <linux/oprofile.h>
56414 +#include <asm/uaccess.h>
56415 +#include <linux/ctype.h>
56416
56417 #include "event_buffer.h"
56418 #include "oprofile_stats.h"
56419 #include "oprof.h"
56420 -
56421 +
56422 unsigned long fs_buffer_size = 131072;
56423 unsigned long fs_cpu_buffer_size = 8192;
56424 unsigned long fs_buffer_watershed = 32768; /* FIXME: tune */
56425 @@ -117,11 +123,202 @@
56426 static struct file_operations dump_fops = {
56427 .write = dump_write,
56428 };
56429 -
56430 +
56431 +#define TMPBUFSIZE 512
56432 +
56433 +static unsigned int adomains = 0;
56434 +static int active_domains[MAX_OPROF_DOMAINS + 1];
56435 +static DEFINE_MUTEX(adom_mutex);
56436 +
56437 +static ssize_t adomain_write(struct file * file, char const __user * buf,
56438 + size_t count, loff_t * offset)
56439 +{
56440 + char *tmpbuf;
56441 + char *startp, *endp;
56442 + int i;
56443 + unsigned long val;
56444 + ssize_t retval = count;
56445 +
56446 + if (*offset)
56447 + return -EINVAL;
56448 + if (count > TMPBUFSIZE - 1)
56449 + return -EINVAL;
56450 +
56451 + if (!(tmpbuf = kmalloc(TMPBUFSIZE, GFP_KERNEL)))
56452 + return -ENOMEM;
56453 +
56454 + if (copy_from_user(tmpbuf, buf, count)) {
56455 + kfree(tmpbuf);
56456 + return -EFAULT;
56457 + }
56458 + tmpbuf[count] = 0;
56459 +
56460 + mutex_lock(&adom_mutex);
56461 +
56462 + startp = tmpbuf;
56463 + /* Parse one more than MAX_OPROF_DOMAINS, for easy error checking */
56464 + for (i = 0; i <= MAX_OPROF_DOMAINS; i++) {
56465 + val = simple_strtoul(startp, &endp, 0);
56466 + if (endp == startp)
56467 + break;
56468 + while (ispunct(*endp) || isspace(*endp))
56469 + endp++;
56470 + active_domains[i] = val;
56471 + if (active_domains[i] != val)
56472 + /* Overflow, force error below */
56473 + i = MAX_OPROF_DOMAINS + 1;
56474 + startp = endp;
56475 + }
56476 + /* Force error on trailing junk */
56477 + adomains = *startp ? MAX_OPROF_DOMAINS + 1 : i;
56478 +
56479 + kfree(tmpbuf);
56480 +
56481 + if (adomains > MAX_OPROF_DOMAINS
56482 + || oprofile_set_active(active_domains, adomains)) {
56483 + adomains = 0;
56484 + retval = -EINVAL;
56485 + }
56486 +
56487 + mutex_unlock(&adom_mutex);
56488 + return retval;
56489 +}
56490 +
56491 +static ssize_t adomain_read(struct file * file, char __user * buf,
56492 + size_t count, loff_t * offset)
56493 +{
56494 + char * tmpbuf;
56495 + size_t len;
56496 + int i;
56497 + ssize_t retval;
56498 +
56499 + if (!(tmpbuf = kmalloc(TMPBUFSIZE, GFP_KERNEL)))
56500 + return -ENOMEM;
56501 +
56502 + mutex_lock(&adom_mutex);
56503 +
56504 + len = 0;
56505 + for (i = 0; i < adomains; i++)
56506 + len += snprintf(tmpbuf + len,
56507 + len < TMPBUFSIZE ? TMPBUFSIZE - len : 0,
56508 + "%u ", active_domains[i]);
56509 + WARN_ON(len > TMPBUFSIZE);
56510 + if (len != 0 && len <= TMPBUFSIZE)
56511 + tmpbuf[len-1] = '\n';
56512 +
56513 + mutex_unlock(&adom_mutex);
56514 +
56515 + retval = simple_read_from_buffer(buf, count, offset, tmpbuf, len);
56516 +
56517 + kfree(tmpbuf);
56518 + return retval;
56519 +}
56520 +
56521 +
56522 +static struct file_operations active_domain_ops = {
56523 + .read = adomain_read,
56524 + .write = adomain_write,
56525 +};
56526 +
56527 +static unsigned int pdomains = 0;
56528 +static int passive_domains[MAX_OPROF_DOMAINS];
56529 +static DEFINE_MUTEX(pdom_mutex);
56530 +
56531 +static ssize_t pdomain_write(struct file * file, char const __user * buf,
56532 + size_t count, loff_t * offset)
56533 +{
56534 + char *tmpbuf;
56535 + char *startp, *endp;
56536 + int i;
56537 + unsigned long val;
56538 + ssize_t retval = count;
56539 +
56540 + if (*offset)
56541 + return -EINVAL;
56542 + if (count > TMPBUFSIZE - 1)
56543 + return -EINVAL;
56544 +
56545 + if (!(tmpbuf = kmalloc(TMPBUFSIZE, GFP_KERNEL)))
56546 + return -ENOMEM;
56547 +
56548 + if (copy_from_user(tmpbuf, buf, count)) {
56549 + kfree(tmpbuf);
56550 + return -EFAULT;
56551 + }
56552 + tmpbuf[count] = 0;
56553 +
56554 + mutex_lock(&pdom_mutex);
56555 +
56556 + startp = tmpbuf;
56557 + /* Parse one more than MAX_OPROF_DOMAINS, for easy error checking */
56558 + for (i = 0; i <= MAX_OPROF_DOMAINS; i++) {
56559 + val = simple_strtoul(startp, &endp, 0);
56560 + if (endp == startp)
56561 + break;
56562 + while (ispunct(*endp) || isspace(*endp))
56563 + endp++;
56564 + passive_domains[i] = val;
56565 + if (passive_domains[i] != val)
56566 + /* Overflow, force error below */
56567 + i = MAX_OPROF_DOMAINS + 1;
56568 + startp = endp;
56569 + }
56570 + /* Force error on trailing junk */
56571 + pdomains = *startp ? MAX_OPROF_DOMAINS + 1 : i;
56572 +
56573 + kfree(tmpbuf);
56574 +
56575 + if (pdomains > MAX_OPROF_DOMAINS
56576 + || oprofile_set_passive(passive_domains, pdomains)) {
56577 + pdomains = 0;
56578 + retval = -EINVAL;
56579 + }
56580 +
56581 + mutex_unlock(&pdom_mutex);
56582 + return retval;
56583 +}
56584 +
56585 +static ssize_t pdomain_read(struct file * file, char __user * buf,
56586 + size_t count, loff_t * offset)
56587 +{
56588 + char * tmpbuf;
56589 + size_t len;
56590 + int i;
56591 + ssize_t retval;
56592 +
56593 + if (!(tmpbuf = kmalloc(TMPBUFSIZE, GFP_KERNEL)))
56594 + return -ENOMEM;
56595 +
56596 + mutex_lock(&pdom_mutex);
56597 +
56598 + len = 0;
56599 + for (i = 0; i < pdomains; i++)
56600 + len += snprintf(tmpbuf + len,
56601 + len < TMPBUFSIZE ? TMPBUFSIZE - len : 0,
56602 + "%u ", passive_domains[i]);
56603 + WARN_ON(len > TMPBUFSIZE);
56604 + if (len != 0 && len <= TMPBUFSIZE)
56605 + tmpbuf[len-1] = '\n';
56606 +
56607 + mutex_unlock(&pdom_mutex);
56608 +
56609 + retval = simple_read_from_buffer(buf, count, offset, tmpbuf, len);
56610 +
56611 + kfree(tmpbuf);
56612 + return retval;
56613 +}
56614 +
56615 +static struct file_operations passive_domain_ops = {
56616 + .read = pdomain_read,
56617 + .write = pdomain_write,
56618 +};
56619 +
56620 void oprofile_create_files(struct super_block * sb, struct dentry * root)
56621 {
56622 oprofilefs_create_file(sb, root, "enable", &enable_fops);
56623 oprofilefs_create_file_perm(sb, root, "dump", &dump_fops, 0666);
56624 + oprofilefs_create_file(sb, root, "active_domains", &active_domain_ops);
56625 + oprofilefs_create_file(sb, root, "passive_domains", &passive_domain_ops);
56626 oprofilefs_create_file(sb, root, "buffer", &event_buffer_fops);
56627 oprofilefs_create_ulong(sb, root, "buffer_size", &fs_buffer_size);
56628 oprofilefs_create_ulong(sb, root, "buffer_watershed", &fs_buffer_watershed);
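
The active_domains and passive_domains nodes created above accept a list of domain IDs separated by whitespace or punctuation; more than MAX_OPROF_DOMAINS entries or trailing junk makes the write fail with -EINVAL. A minimal userspace sketch follows; it is illustrative only and not part of the patch, it assumes oprofilefs is mounted at /dev/oprofile and that a guest with domain ID 1 exists, and the write also requires the Xen oprofile driver to provide set_active.

/* Illustrative only: select domain 1 as an active profiling target and
 * read the setting back.  The mount point /dev/oprofile is an assumption
 * (oprofilefs is conventionally mounted there). */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	const char path[] = "/dev/oprofile/active_domains";
	char out[64];
	ssize_t n;
	int fd;

	fd = open(path, O_WRONLY);
	if (fd < 0) {
		perror("open for write");
		return 1;
	}
	if (write(fd, "1\n", 2) != 2)	/* parsed by adomain_write() */
		perror("write");
	close(fd);

	fd = open(path, O_RDONLY);
	if (fd < 0) {
		perror("open for read");
		return 1;
	}
	n = read(fd, out, sizeof(out) - 1);	/* served by adomain_read() */
	if (n > 0) {
		out[n] = '\0';
		printf("active_domains: %s", out);
	}
	close(fd);
	return 0;
}
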
56629 diff -Nur linux-2.6.16.33-noxen/drivers/pci/Kconfig linux-2.6.16.33/drivers/pci/Kconfig
56630 --- linux-2.6.16.33-noxen/drivers/pci/Kconfig 2006-11-22 18:06:31.000000000 +0000
56631 +++ linux-2.6.16.33/drivers/pci/Kconfig 2007-01-08 15:00:45.000000000 +0000
56632 @@ -5,6 +5,7 @@
56633 bool "Message Signaled Interrupts (MSI and MSI-X)"
56634 depends on PCI
56635 depends on (X86_LOCAL_APIC && X86_IO_APIC) || IA64
56636 + depends on !XEN
56637 help
56638 This allows device drivers to enable MSI (Message Signaled
56639 Interrupts). Message Signaled Interrupts enable a device to
56640 diff -Nur linux-2.6.16.33-noxen/drivers/s390/net/qeth_eddp.c linux-2.6.16.33/drivers/s390/net/qeth_eddp.c
56641 --- linux-2.6.16.33-noxen/drivers/s390/net/qeth_eddp.c 2006-11-22 18:06:31.000000000 +0000
56642 +++ linux-2.6.16.33/drivers/s390/net/qeth_eddp.c 2007-05-23 21:00:01.000000000 +0000
56643 @@ -421,7 +421,7 @@
56644 }
56645 tcph = eddp->skb->h.th;
56646 while (eddp->skb_offset < eddp->skb->len) {
56647 - data_len = min((int)skb_shinfo(eddp->skb)->tso_size,
56648 + data_len = min((int)skb_shinfo(eddp->skb)->gso_size,
56649 (int)(eddp->skb->len - eddp->skb_offset));
56650 /* prepare qdio hdr */
56651 if (eddp->qh.hdr.l2.id == QETH_HEADER_TYPE_LAYER2){
56652 @@ -516,20 +516,20 @@
56653
56654 QETH_DBF_TEXT(trace, 5, "eddpcanp");
56655 /* can we put multiple skbs in one page? */
56656 - skbs_per_page = PAGE_SIZE / (skb_shinfo(skb)->tso_size + hdr_len);
56657 + skbs_per_page = PAGE_SIZE / (skb_shinfo(skb)->gso_size + hdr_len);
56658 if (skbs_per_page > 1){
56659 - ctx->num_pages = (skb_shinfo(skb)->tso_segs + 1) /
56660 + ctx->num_pages = (skb_shinfo(skb)->gso_segs + 1) /
56661 skbs_per_page + 1;
56662 ctx->elements_per_skb = 1;
56663 } else {
56664 /* no -> how many elements per skb? */
56665 - ctx->elements_per_skb = (skb_shinfo(skb)->tso_size + hdr_len +
56666 + ctx->elements_per_skb = (skb_shinfo(skb)->gso_size + hdr_len +
56667 PAGE_SIZE) >> PAGE_SHIFT;
56668 ctx->num_pages = ctx->elements_per_skb *
56669 - (skb_shinfo(skb)->tso_segs + 1);
56670 + (skb_shinfo(skb)->gso_segs + 1);
56671 }
56672 ctx->num_elements = ctx->elements_per_skb *
56673 - (skb_shinfo(skb)->tso_segs + 1);
56674 + (skb_shinfo(skb)->gso_segs + 1);
56675 }
56676
56677 static inline struct qeth_eddp_context *
56678 diff -Nur linux-2.6.16.33-noxen/drivers/s390/net/qeth_main.c linux-2.6.16.33/drivers/s390/net/qeth_main.c
56679 --- linux-2.6.16.33-noxen/drivers/s390/net/qeth_main.c 2006-11-22 18:06:31.000000000 +0000
56680 +++ linux-2.6.16.33/drivers/s390/net/qeth_main.c 2007-05-23 21:00:01.000000000 +0000
56681 @@ -4454,7 +4454,7 @@
56682 queue = card->qdio.out_qs
56683 [qeth_get_priority_queue(card, skb, ipv, cast_type)];
56684
56685 - if (skb_shinfo(skb)->tso_size)
56686 + if (skb_is_gso(skb))
56687 large_send = card->options.large_send;
56688
56689 /*are we able to do TSO ? If so ,prepare and send it from here */
56690 @@ -4501,8 +4501,7 @@
56691 card->stats.tx_packets++;
56692 card->stats.tx_bytes += skb->len;
56693 #ifdef CONFIG_QETH_PERF_STATS
56694 - if (skb_shinfo(skb)->tso_size &&
56695 - !(large_send == QETH_LARGE_SEND_NO)) {
56696 + if (skb_is_gso(skb) && !(large_send == QETH_LARGE_SEND_NO)) {
56697 card->perf_stats.large_send_bytes += skb->len;
56698 card->perf_stats.large_send_cnt++;
56699 }
56700 diff -Nur linux-2.6.16.33-noxen/drivers/s390/net/qeth_tso.h linux-2.6.16.33/drivers/s390/net/qeth_tso.h
56701 --- linux-2.6.16.33-noxen/drivers/s390/net/qeth_tso.h 2006-11-22 18:06:31.000000000 +0000
56702 +++ linux-2.6.16.33/drivers/s390/net/qeth_tso.h 2007-05-23 21:00:01.000000000 +0000
56703 @@ -51,7 +51,7 @@
56704 hdr->ext.hdr_version = 1;
56705 hdr->ext.hdr_len = 28;
56706 /*insert non-fix values */
56707 - hdr->ext.mss = skb_shinfo(skb)->tso_size;
56708 + hdr->ext.mss = skb_shinfo(skb)->gso_size;
56709 hdr->ext.dg_hdr_len = (__u16)(iph->ihl*4 + tcph->doff*4);
56710 hdr->ext.payload_len = (__u16)(skb->len - hdr->ext.dg_hdr_len -
56711 sizeof(struct qeth_hdr_tso));
56712 diff -Nur linux-2.6.16.33-noxen/drivers/serial/Kconfig linux-2.6.16.33/drivers/serial/Kconfig
56713 --- linux-2.6.16.33-noxen/drivers/serial/Kconfig 2006-11-22 18:06:31.000000000 +0000
56714 +++ linux-2.6.16.33/drivers/serial/Kconfig 2007-01-08 15:00:45.000000000 +0000
56715 @@ -11,6 +11,7 @@
56716 config SERIAL_8250
56717 tristate "8250/16550 and compatible serial support"
56718 depends on (BROKEN || !SPARC)
56719 + depends on !XEN_DISABLE_SERIAL
56720 select SERIAL_CORE
56721 ---help---
56722 This selects whether you want to include the driver for the standard
56723 diff -Nur linux-2.6.16.33-noxen/drivers/video/Kconfig linux-2.6.16.33/drivers/video/Kconfig
56724 --- linux-2.6.16.33-noxen/drivers/video/Kconfig 2006-11-22 18:06:31.000000000 +0000
56725 +++ linux-2.6.16.33/drivers/video/Kconfig 2007-01-08 15:00:45.000000000 +0000
56726 @@ -495,7 +495,7 @@
56727
56728 config VIDEO_SELECT
56729 bool
56730 - depends on (FB = y) && X86
56731 + depends on (FB = y) && X86 && !XEN
56732 default y
56733
56734 config FB_SGIVW
56735 diff -Nur linux-2.6.16.33-noxen/drivers/xen/Kconfig linux-2.6.16.33/drivers/xen/Kconfig
56736 --- linux-2.6.16.33-noxen/drivers/xen/Kconfig 1970-01-01 00:00:00.000000000 +0000
56737 +++ linux-2.6.16.33/drivers/xen/Kconfig 2007-01-08 15:00:45.000000000 +0000
56738 @@ -0,0 +1,283 @@
56739 +#
56740 +# This Kconfig describes xen options
56741 +#
56742 +
56743 +mainmenu "Xen Configuration"
56744 +
56745 +config XEN
56746 + bool
56747 + default y if X86_XEN || X86_64_XEN
56748 + help
56749 + This is the Linux Xen port.
56750 +
56751 +if XEN
56752 +config XEN_INTERFACE_VERSION
56753 + hex
56754 + default 0x00030203
56755 +
56756 +menu "XEN"
56757 +
56758 +config XEN_PRIVILEGED_GUEST
56759 + bool "Privileged Guest (domain 0)"
56760 + depends XEN
56761 + default n
56762 + help
56763 + Support for privileged operation (domain 0)
56764 +
56765 +config XEN_UNPRIVILEGED_GUEST
56766 + bool
56767 + default !XEN_PRIVILEGED_GUEST
56768 +
56769 +config XEN_PRIVCMD
56770 + bool
56771 + depends on PROC_FS
56772 + default y
56773 +
56774 +config XEN_XENBUS_DEV
56775 + bool
56776 + depends on PROC_FS
56777 + default y
56778 +
56779 +config XEN_BACKEND
56780 + tristate "Backend driver support"
56781 + default y
56782 + help
56783 + Support for backend device drivers that provide I/O services
56784 + to other virtual machines.
56785 +
56786 +config XEN_BLKDEV_BACKEND
56787 + tristate "Block-device backend driver"
56788 + depends on XEN_BACKEND
56789 + default y
56790 + help
56791 + The block-device backend driver allows the kernel to export its
56792 + block devices to other guests via a high-performance shared-memory
56793 + interface.
56794 +
56795 +config XEN_BLKDEV_TAP
56796 + tristate "Block-device tap backend driver"
56797 + depends on XEN_BACKEND
56798 + default XEN_PRIVILEGED_GUEST
56799 + help
56800 + The block tap driver is an alternative to the block back driver
56801 + and allows VM block requests to be redirected to userspace through
56802 + a device interface. The tap allows user-space development of
56803 + high-performance block backends, where disk images may be implemented
56804 + as files, in memory, or on other hosts across the network. This
56805 + driver can safely coexist with the existing blockback driver.
56806 +
56807 +config XEN_NETDEV_BACKEND
56808 + tristate "Network-device backend driver"
56809 + depends on XEN_BACKEND && NET
56810 + default y
56811 + help
56812 + The network-device backend driver allows the kernel to export its
56813 + network devices to other guests via a high-performance shared-memory
56814 + interface.
56815 +
56816 +config XEN_NETDEV_PIPELINED_TRANSMITTER
56817 + bool "Pipelined transmitter (DANGEROUS)"
56818 + depends on XEN_NETDEV_BACKEND
56819 + default n
56820 + help
56821 + If the net backend is a dumb domain, such as a transparent Ethernet
56822 + bridge with no local IP interface, it is safe to say Y here to get
56823 + slightly lower network overhead.
56824 + If the backend has a local IP interface; or may be doing smart things
56825 + like reassembling packets to perform firewall filtering; or if you
56826 + are unsure; or if you experience network hangs when this option is
56827 + enabled; then you must say N here.
56828 +
56829 +config XEN_NETDEV_LOOPBACK
56830 + tristate "Network-device loopback driver"
56831 + depends on XEN_NETDEV_BACKEND
56832 + default y
56833 + help
56834 + A two-interface loopback device to emulate a local netfront-netback
56835 + connection.
56836 +
56837 +config XEN_PCIDEV_BACKEND
56838 + tristate "PCI-device backend driver"
56839 + depends on PCI && XEN_BACKEND
56840 + default XEN_PRIVILEGED_GUEST
56841 + help
56842 + The PCI device backend driver allows the kernel to export arbitrary
56843 + PCI devices to other guests. If you select this to be a module, you
56844 + will need to make sure no other driver has bound to the device(s)
56845 + you want to make visible to other guests.
56846 +
56847 +choice
56848 + prompt "PCI Backend Mode"
56849 + depends on XEN_PCIDEV_BACKEND
56850 + default XEN_PCIDEV_BACKEND_VPCI
56851 +
56852 +config XEN_PCIDEV_BACKEND_VPCI
56853 + bool "Virtual PCI"
56854 + ---help---
56855 + This PCI Backend hides the true PCI topology and makes the frontend
56856 + think there is a single PCI bus with only the exported devices on it.
56857 + For example, a device at 03:05.0 will be re-assigned to 00:00.0. A
56858 + second device at 02:1a.1 will be re-assigned to 00:01.1.
56859 +
56860 +config XEN_PCIDEV_BACKEND_PASS
56861 + bool "Passthrough"
56862 + ---help---
56863 + This PCI Backend provides a real view of the PCI topology to the
56864 + frontend (for example, a device at 06:01.b will still appear at
56865 + 06:01.b to the frontend). This is similar to how Xen 2.0.x exposed
56866 + PCI devices to its driver domains. This may be required for drivers
56867 + which depend on finding their hardware in certain bus/slot
56868 + locations.
56869 +
56870 +config XEN_PCIDEV_BACKEND_SLOT
56871 + bool "Slot"
56872 + ---help---
56873 + This PCI Backend hides the true PCI topology and makes the frontend
56874 + think there is a single PCI bus with only the exported devices on it.
56875 + Contrary to the virtual PCI backend, a function becomes a new slot.
56876 + For example, a device at 03:05.2 will be re-assigned to 00:00.0. A
56877 + second device at 02:1a.1 will be re-assigned to 00:01.0.
56878 +
56879 +endchoice
56880 +
56881 +config XEN_PCIDEV_BE_DEBUG
56882 + bool "PCI Backend Debugging"
56883 + depends on XEN_PCIDEV_BACKEND
56884 + default n
56885 +
56886 +config XEN_TPMDEV_BACKEND
56887 + tristate "TPM-device backend driver"
56888 + depends on XEN_BACKEND
56889 + default n
56890 + help
56891 + The TPM-device backend driver
56892 +
56893 +config XEN_BLKDEV_FRONTEND
56894 + tristate "Block-device frontend driver"
56895 + depends on XEN
56896 + default y
56897 + help
56898 + The block-device frontend driver allows the kernel to access block
56899 + devices mounted within another guest OS. Unless you are building a
56900 + dedicated device-driver domain, or your master control domain
56901 + (domain 0), you almost certainly want to say Y here.
56902 +
56903 +config XEN_NETDEV_FRONTEND
56904 + tristate "Network-device frontend driver"
56905 + depends on XEN && NET
56906 + default y
56907 + help
56908 + The network-device frontend driver allows the kernel to access
56909 + network interfaces within another guest OS. Unless you are building a
56910 + dedicated device-driver domain, or your master control domain
56911 + (domain 0), you almost certainly want to say Y here.
56912 +
56913 +config XEN_FRAMEBUFFER
56914 + tristate "Framebuffer-device frontend driver"
56915 + depends on XEN && FB
56916 + select FB_CFB_FILLRECT
56917 + select FB_CFB_COPYAREA
56918 + select FB_CFB_IMAGEBLIT
56919 + default y
56920 + help
56921 + The framebuffer-device frontend driver allows the kernel to create a
56922 + virtual framebuffer. This framebuffer can be viewed in another
56923 + domain. Unless this domain has access to a real video card, you
56924 + probably want to say Y here.
56925 +
56926 +config XEN_KEYBOARD
56927 + tristate "Keyboard-device frontend driver"
56928 + depends on XEN && XEN_FRAMEBUFFER && INPUT
56929 + default y
56930 + help
56931 + The keyboard-device frontend driver allows the kernel to create a
56932 + virtual keyboard. This keyboard can then be driven by another
56933 + domain. If you've said Y to CONFIG_XEN_FRAMEBUFFER, you probably
56934 + want to say Y here.
56935 +
56936 +config XEN_SCRUB_PAGES
56937 + bool "Scrub memory before freeing it to Xen"
56938 + default y
56939 + help
56940 + Erase memory contents before freeing it back to Xen's global
56941 + pool. This ensures that any secrets contained within that
56942 + memory (e.g., private keys) cannot be found by other guests that
56943 + may be running on the machine. Most people will want to say Y here.
56944 + If security is not a concern then you may increase performance by
56945 + saying N.
56946 +
56947 +config XEN_DISABLE_SERIAL
56948 + bool "Disable serial port drivers"
56949 + default y
56950 + help
56951 + Disable serial port drivers, allowing the Xen console driver
56952 + to provide a serial console at ttyS0.
56953 +
56954 +config XEN_SYSFS
56955 + tristate "Export Xen attributes in sysfs"
56956 + depends on SYSFS
56957 + default y
56958 + help
56959 + Xen hypervisor attributes will show up under /sys/hypervisor/.
56960 +
56961 +choice
56962 + prompt "Xen version compatibility"
56963 + default XEN_COMPAT_030002_AND_LATER
56964 +
56965 + config XEN_COMPAT_030002_AND_LATER
56966 + bool "3.0.2 and later"
56967 +
56968 + config XEN_COMPAT_LATEST_ONLY
56969 + bool "no compatibility code"
56970 +
56971 +endchoice
56972 +
56973 +config XEN_COMPAT_030002
56974 + bool
56975 + default XEN_COMPAT_030002_AND_LATER
56976 +
56977 +endmenu
56978 +
56979 +config HAVE_ARCH_ALLOC_SKB
56980 + bool
56981 + default y
56982 +
56983 +config HAVE_ARCH_DEV_ALLOC_SKB
56984 + bool
56985 + default y
56986 +
56987 +config HAVE_IRQ_IGNORE_UNHANDLED
56988 + bool
56989 + default y
56990 +
56991 +config NO_IDLE_HZ
56992 + bool
56993 + default y
56994 +
56995 +config XEN_UTIL
56996 + bool
56997 + default y
56998 +
56999 +config XEN_BALLOON
57000 + bool
57001 + default y
57002 +
57003 +config XEN_DEVMEM
57004 + bool
57005 + default y
57006 +
57007 +config XEN_SKBUFF
57008 + bool
57009 + default y
57010 + depends on NET
57011 +
57012 +config XEN_REBOOT
57013 + bool
57014 + default y
57015 +
57016 +config XEN_SMPBOOT
57017 + bool
57018 + default y
57019 + depends on SMP
57020 +
57021 +endif
57022 diff -Nur linux-2.6.16.33-noxen/drivers/xen/Makefile linux-2.6.16.33/drivers/xen/Makefile
57023 --- linux-2.6.16.33-noxen/drivers/xen/Makefile 1970-01-01 00:00:00.000000000 +0000
57024 +++ linux-2.6.16.33/drivers/xen/Makefile 2007-01-08 15:00:45.000000000 +0000
57025 @@ -0,0 +1,19 @@
57026 +obj-y += core/
57027 +obj-y += console/
57028 +obj-y += evtchn/
57029 +obj-y += privcmd/
57030 +obj-y += xenbus/
57031 +
57032 +obj-$(CONFIG_XEN_UTIL) += util.o
57033 +obj-$(CONFIG_XEN_BALLOON) += balloon/
57034 +obj-$(CONFIG_XEN_DEVMEM) += char/
57035 +obj-$(CONFIG_XEN_BLKDEV_BACKEND) += blkback/
57036 +obj-$(CONFIG_XEN_BLKDEV_TAP) += blktap/
57037 +obj-$(CONFIG_XEN_NETDEV_BACKEND) += netback/
57038 +obj-$(CONFIG_XEN_TPMDEV_BACKEND) += tpmback/
57039 +obj-$(CONFIG_XEN_BLKDEV_FRONTEND) += blkfront/
57040 +obj-$(CONFIG_XEN_NETDEV_FRONTEND) += netfront/
57041 +obj-$(CONFIG_XEN_PCIDEV_BACKEND) += pciback/
57042 +obj-$(CONFIG_XEN_PCIDEV_FRONTEND) += pcifront/
57043 +obj-$(CONFIG_XEN_FRAMEBUFFER) += fbfront/
57044 +obj-$(CONFIG_XEN_KEYBOARD) += fbfront/
57045 diff -Nur linux-2.6.16.33-noxen/drivers/xen/balloon/Makefile linux-2.6.16.33/drivers/xen/balloon/Makefile
57046 --- linux-2.6.16.33-noxen/drivers/xen/balloon/Makefile 1970-01-01 00:00:00.000000000 +0000
57047 +++ linux-2.6.16.33/drivers/xen/balloon/Makefile 2007-01-08 15:00:45.000000000 +0000
57048 @@ -0,0 +1,2 @@
57049 +
57050 +obj-y := balloon.o sysfs.o
57051 diff -Nur linux-2.6.16.33-noxen/drivers/xen/balloon/balloon.c linux-2.6.16.33/drivers/xen/balloon/balloon.c
57052 --- linux-2.6.16.33-noxen/drivers/xen/balloon/balloon.c 1970-01-01 00:00:00.000000000 +0000
57053 +++ linux-2.6.16.33/drivers/xen/balloon/balloon.c 2007-01-08 15:00:45.000000000 +0000
57054 @@ -0,0 +1,625 @@
57055 +/******************************************************************************
57056 + * balloon.c
57057 + *
57058 + * Xen balloon driver - enables returning/claiming memory to/from Xen.
57059 + *
57060 + * Copyright (c) 2003, B Dragovic
57061 + * Copyright (c) 2003-2004, M Williamson, K Fraser
57062 + * Copyright (c) 2005 Dan M. Smith, IBM Corporation
57063 + *
57064 + * This program is free software; you can redistribute it and/or
57065 + * modify it under the terms of the GNU General Public License version 2
57066 + * as published by the Free Software Foundation; or, when distributed
57067 + * separately from the Linux kernel or incorporated into other
57068 + * software packages, subject to the following license:
57069 + *
57070 + * Permission is hereby granted, free of charge, to any person obtaining a copy
57071 + * of this source file (the "Software"), to deal in the Software without
57072 + * restriction, including without limitation the rights to use, copy, modify,
57073 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
57074 + * and to permit persons to whom the Software is furnished to do so, subject to
57075 + * the following conditions:
57076 + *
57077 + * The above copyright notice and this permission notice shall be included in
57078 + * all copies or substantial portions of the Software.
57079 + *
57080 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
57081 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
57082 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
57083 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
57084 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
57085 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
57086 + * IN THE SOFTWARE.
57087 + */
57088 +
57089 +#include <linux/config.h>
57090 +#include <linux/kernel.h>
57091 +#include <linux/module.h>
57092 +#include <linux/sched.h>
57093 +#include <linux/errno.h>
57094 +#include <linux/mm.h>
57095 +#include <linux/mman.h>
57096 +#include <linux/smp_lock.h>
57097 +#include <linux/pagemap.h>
57098 +#include <linux/bootmem.h>
57099 +#include <linux/highmem.h>
57100 +#include <linux/vmalloc.h>
57101 +#include <xen/xen_proc.h>
57102 +#include <asm/hypervisor.h>
57103 +#include <xen/balloon.h>
57104 +#include <xen/interface/memory.h>
57105 +#include <asm/pgalloc.h>
57106 +#include <asm/pgtable.h>
57107 +#include <asm/uaccess.h>
57108 +#include <asm/tlb.h>
57109 +#include <linux/list.h>
57110 +#include <xen/xenbus.h>
57111 +#include "common.h"
57112 +
57113 +#ifdef CONFIG_PROC_FS
57114 +static struct proc_dir_entry *balloon_pde;
57115 +#endif
57116 +
57117 +static DECLARE_MUTEX(balloon_mutex);
57118 +
57119 +/*
57120 + * Protects atomic reservation decrease/increase against concurrent increases.
57121 + * Also protects non-atomic updates of current_pages and driver_pages, and
57122 + * balloon lists.
57123 + */
57124 +DEFINE_SPINLOCK(balloon_lock);
57125 +
57126 +struct balloon_stats balloon_stats;
57127 +
57128 +/* We increase/decrease in batches which fit in a page */
57129 +static unsigned long frame_list[PAGE_SIZE / sizeof(unsigned long)];
57130 +
57131 +/* VM /proc information for memory */
57132 +extern unsigned long totalram_pages;
57133 +
57134 +/* List of ballooned pages, threaded through the mem_map array. */
57135 +static LIST_HEAD(ballooned_pages);
57136 +
57137 +/* Main work function, always executed in process context. */
57138 +static void balloon_process(void *unused);
57139 +static DECLARE_WORK(balloon_worker, balloon_process, NULL);
57140 +static struct timer_list balloon_timer;
57141 +
57142 +/* When ballooning out (allocating memory to return to Xen) we don't really
57143 + want the kernel to try too hard since that can trigger the oom killer. */
57144 +#define GFP_BALLOON \
57145 + (GFP_HIGHUSER | __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC)
57146 +
57147 +#define PAGE_TO_LIST(p) (&(p)->lru)
57148 +#define LIST_TO_PAGE(l) list_entry((l), struct page, lru)
57149 +#define UNLIST_PAGE(p) \
57150 + do { \
57151 + list_del(PAGE_TO_LIST(p)); \
57152 + PAGE_TO_LIST(p)->next = NULL; \
57153 + PAGE_TO_LIST(p)->prev = NULL; \
57154 + } while(0)
57155 +
57156 +#define IPRINTK(fmt, args...) \
57157 + printk(KERN_INFO "xen_mem: " fmt, ##args)
57158 +#define WPRINTK(fmt, args...) \
57159 + printk(KERN_WARNING "xen_mem: " fmt, ##args)
57160 +
57161 +/* balloon_append: add the given page to the balloon. */
57162 +static void balloon_append(struct page *page)
57163 +{
57164 + /* Lowmem is re-populated first, so highmem pages go at list tail. */
57165 + if (PageHighMem(page)) {
57166 + list_add_tail(PAGE_TO_LIST(page), &ballooned_pages);
57167 + bs.balloon_high++;
57168 + } else {
57169 + list_add(PAGE_TO_LIST(page), &ballooned_pages);
57170 + bs.balloon_low++;
57171 + }
57172 +}
57173 +
57174 +/* balloon_retrieve: rescue a page from the balloon, if it is not empty. */
57175 +static struct page *balloon_retrieve(void)
57176 +{
57177 + struct page *page;
57178 +
57179 + if (list_empty(&ballooned_pages))
57180 + return NULL;
57181 +
57182 + page = LIST_TO_PAGE(ballooned_pages.next);
57183 + UNLIST_PAGE(page);
57184 +
57185 + if (PageHighMem(page))
57186 + bs.balloon_high--;
57187 + else
57188 + bs.balloon_low--;
57189 +
57190 + return page;
57191 +}
57192 +
57193 +static struct page *balloon_first_page(void)
57194 +{
57195 + if (list_empty(&ballooned_pages))
57196 + return NULL;
57197 + return LIST_TO_PAGE(ballooned_pages.next);
57198 +}
57199 +
57200 +static struct page *balloon_next_page(struct page *page)
57201 +{
57202 + struct list_head *next = PAGE_TO_LIST(page)->next;
57203 + if (next == &ballooned_pages)
57204 + return NULL;
57205 + return LIST_TO_PAGE(next);
57206 +}
57207 +
57208 +static void balloon_alarm(unsigned long unused)
57209 +{
57210 + schedule_work(&balloon_worker);
57211 +}
57212 +
57213 +static unsigned long current_target(void)
57214 +{
57215 + unsigned long target = min(bs.target_pages, bs.hard_limit);
57216 + if (target > (bs.current_pages + bs.balloon_low + bs.balloon_high))
57217 + target = bs.current_pages + bs.balloon_low + bs.balloon_high;
57218 + return target;
57219 +}
57220 +
57221 +static int increase_reservation(unsigned long nr_pages)
57222 +{
57223 + unsigned long pfn, i, flags;
57224 + struct page *page;
57225 + long rc;
57226 + struct xen_memory_reservation reservation = {
57227 + .address_bits = 0,
57228 + .extent_order = 0,
57229 + .domid = DOMID_SELF
57230 + };
57231 +
57232 + if (nr_pages > ARRAY_SIZE(frame_list))
57233 + nr_pages = ARRAY_SIZE(frame_list);
57234 +
57235 + balloon_lock(flags);
57236 +
57237 + page = balloon_first_page();
57238 + for (i = 0; i < nr_pages; i++) {
57239 + BUG_ON(page == NULL);
57240 + frame_list[i] = page_to_pfn(page);
57241 + page = balloon_next_page(page);
57242 + }
57243 +
57244 + set_xen_guest_handle(reservation.extent_start, frame_list);
57245 + reservation.nr_extents = nr_pages;
57246 + rc = HYPERVISOR_memory_op(
57247 + XENMEM_populate_physmap, &reservation);
57248 + if (rc < nr_pages) {
57249 + if (rc > 0) {
57250 + int ret;
57251 +
57252 + /* We hit the Xen hard limit: reprobe. */
57253 + reservation.nr_extents = rc;
57254 + ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
57255 + &reservation);
57256 + BUG_ON(ret != rc);
57257 + }
57258 + if (rc >= 0)
57259 + bs.hard_limit = (bs.current_pages + rc -
57260 + bs.driver_pages);
57261 + goto out;
57262 + }
57263 +
57264 + for (i = 0; i < nr_pages; i++) {
57265 + page = balloon_retrieve();
57266 + BUG_ON(page == NULL);
57267 +
57268 + pfn = page_to_pfn(page);
57269 + BUG_ON(!xen_feature(XENFEAT_auto_translated_physmap) &&
57270 + phys_to_machine_mapping_valid(pfn));
57271 +
57272 + set_phys_to_machine(pfn, frame_list[i]);
57273 +
57274 + /* Link back into the page tables if not highmem. */
57275 + if (pfn < max_low_pfn) {
57276 + int ret;
57277 + ret = HYPERVISOR_update_va_mapping(
57278 + (unsigned long)__va(pfn << PAGE_SHIFT),
57279 + pfn_pte_ma(frame_list[i], PAGE_KERNEL),
57280 + 0);
57281 + BUG_ON(ret);
57282 + }
57283 +
57284 + /* Relinquish the page back to the allocator. */
57285 + ClearPageReserved(page);
57286 + set_page_count(page, 1);
57287 + __free_page(page);
57288 + }
57289 +
57290 + bs.current_pages += nr_pages;
57291 + totalram_pages = bs.current_pages;
57292 +
57293 + out:
57294 + balloon_unlock(flags);
57295 +
57296 + return 0;
57297 +}
57298 +
57299 +static int decrease_reservation(unsigned long nr_pages)
57300 +{
57301 + unsigned long pfn, i, flags;
57302 + struct page *page;
57303 + void *v;
57304 + int need_sleep = 0;
57305 + int ret;
57306 + struct xen_memory_reservation reservation = {
57307 + .address_bits = 0,
57308 + .extent_order = 0,
57309 + .domid = DOMID_SELF
57310 + };
57311 +
57312 + if (nr_pages > ARRAY_SIZE(frame_list))
57313 + nr_pages = ARRAY_SIZE(frame_list);
57314 +
57315 + for (i = 0; i < nr_pages; i++) {
57316 + if ((page = alloc_page(GFP_BALLOON)) == NULL) {
57317 + nr_pages = i;
57318 + need_sleep = 1;
57319 + break;
57320 + }
57321 +
57322 + pfn = page_to_pfn(page);
57323 + frame_list[i] = pfn_to_mfn(pfn);
57324 +
57325 + if (!PageHighMem(page)) {
57326 + v = phys_to_virt(pfn << PAGE_SHIFT);
57327 + scrub_pages(v, 1);
57328 + ret = HYPERVISOR_update_va_mapping(
57329 + (unsigned long)v, __pte_ma(0), 0);
57330 + BUG_ON(ret);
57331 + }
57332 +#ifdef CONFIG_XEN_SCRUB_PAGES
57333 + else {
57334 + v = kmap(page);
57335 + scrub_pages(v, 1);
57336 + kunmap(page);
57337 + }
57338 +#endif
57339 + }
57340 +
57341 + /* Ensure that ballooned highmem pages don't have kmaps. */
57342 + kmap_flush_unused();
57343 + flush_tlb_all();
57344 +
57345 + balloon_lock(flags);
57346 +
57347 + /* No more mappings: invalidate P2M and add to balloon. */
57348 + for (i = 0; i < nr_pages; i++) {
57349 + pfn = mfn_to_pfn(frame_list[i]);
57350 + set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
57351 + balloon_append(pfn_to_page(pfn));
57352 + }
57353 +
57354 + set_xen_guest_handle(reservation.extent_start, frame_list);
57355 + reservation.nr_extents = nr_pages;
57356 + ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation);
57357 + BUG_ON(ret != nr_pages);
57358 +
57359 + bs.current_pages -= nr_pages;
57360 + totalram_pages = bs.current_pages;
57361 +
57362 + balloon_unlock(flags);
57363 +
57364 + return need_sleep;
57365 +}
57366 +
57367 +/*
57368 + * We avoid multiple worker processes conflicting via the balloon mutex.
57369 + * We may of course race updates of the target counts (which are protected
57370 + * by the balloon lock), or with changes to the Xen hard limit, but we will
57371 + * recover from these in time.
57372 + */
57373 +static void balloon_process(void *unused)
57374 +{
57375 + int need_sleep = 0;
57376 + long credit;
57377 +
57378 + down(&balloon_mutex);
57379 +
57380 + do {
57381 + credit = current_target() - bs.current_pages;
57382 + if (credit > 0)
57383 + need_sleep = (increase_reservation(credit) != 0);
57384 + if (credit < 0)
57385 + need_sleep = (decrease_reservation(-credit) != 0);
57386 +
57387 +#ifndef CONFIG_PREEMPT
57388 + if (need_resched())
57389 + schedule();
57390 +#endif
57391 + } while ((credit != 0) && !need_sleep);
57392 +
57393 + /* Schedule more work if there is some still to be done. */
57394 + if (current_target() != bs.current_pages)
57395 + mod_timer(&balloon_timer, jiffies + HZ);
57396 +
57397 + up(&balloon_mutex);
57398 +}
57399 +
57400 +/* Resets the Xen limit, sets new target, and kicks off processing. */
57401 +void balloon_set_new_target(unsigned long target)
57402 +{
57403 + /* No need for lock. Not read-modify-write updates. */
57404 + bs.hard_limit = ~0UL;
57405 + bs.target_pages = target;
57406 + schedule_work(&balloon_worker);
57407 +}
57408 +
57409 +static struct xenbus_watch target_watch =
57410 +{
57411 + .node = "memory/target"
57412 +};
57413 +
57414 +/* React to a change in the target key */
57415 +static void watch_target(struct xenbus_watch *watch,
57416 + const char **vec, unsigned int len)
57417 +{
57418 + unsigned long long new_target;
57419 + int err;
57420 +
57421 + err = xenbus_scanf(XBT_NIL, "memory", "target", "%llu", &new_target);
57422 + if (err != 1) {
57423 + /* This is ok (for domain0 at least) - so just return */
57424 + return;
57425 + }
57426 +
57427 + /* The given memory/target value is in KiB, so it needs converting to
57428 + * pages. PAGE_SHIFT converts bytes to pages, hence PAGE_SHIFT - 10.
57429 + */
57430 + balloon_set_new_target(new_target >> (PAGE_SHIFT - 10));
57431 +}
57432 +
57433 +static int balloon_init_watcher(struct notifier_block *notifier,
57434 + unsigned long event,
57435 + void *data)
57436 +{
57437 + int err;
57438 +
57439 + err = register_xenbus_watch(&target_watch);
57440 + if (err)
57441 + printk(KERN_ERR "Failed to set balloon watcher\n");
57442 +
57443 + return NOTIFY_DONE;
57444 +}
57445 +
57446 +#ifdef CONFIG_PROC_FS
57447 +static int balloon_write(struct file *file, const char __user *buffer,
57448 + unsigned long count, void *data)
57449 +{
57450 + char memstring[64], *endchar;
57451 + unsigned long long target_bytes;
57452 +
57453 + if (!capable(CAP_SYS_ADMIN))
57454 + return -EPERM;
57455 +
57456 + if (count <= 1)
57457 + return -EBADMSG; /* runt */
57458 + if (count > sizeof(memstring))
57459 + return -EFBIG; /* too long */
57460 +
57461 + if (copy_from_user(memstring, buffer, count))
57462 + return -EFAULT;
57463 + memstring[sizeof(memstring)-1] = '\0';
57464 +
57465 + target_bytes = memparse(memstring, &endchar);
57466 + balloon_set_new_target(target_bytes >> PAGE_SHIFT);
57467 +
57468 + return count;
57469 +}
57470 +
57471 +static int balloon_read(char *page, char **start, off_t off,
57472 + int count, int *eof, void *data)
57473 +{
57474 + int len;
57475 +
57476 + len = sprintf(
57477 + page,
57478 + "Current allocation: %8lu kB\n"
57479 + "Requested target: %8lu kB\n"
57480 + "Low-mem balloon: %8lu kB\n"
57481 + "High-mem balloon: %8lu kB\n"
57482 + "Driver pages: %8lu kB\n"
57483 + "Xen hard limit: ",
57484 + PAGES2KB(bs.current_pages), PAGES2KB(bs.target_pages),
57485 + PAGES2KB(bs.balloon_low), PAGES2KB(bs.balloon_high),
57486 + PAGES2KB(bs.driver_pages));
57487 +
57488 + if (bs.hard_limit != ~0UL)
57489 + len += sprintf(page + len, "%8lu kB\n",
57490 + PAGES2KB(bs.hard_limit));
57491 + else
57492 + len += sprintf(page + len, " ??? kB\n");
57493 +
57494 + *eof = 1;
57495 + return len;
57496 +}
57497 +#endif
57498 +
57499 +static struct notifier_block xenstore_notifier;
57500 +
57501 +static int __init balloon_init(void)
57502 +{
57503 + unsigned long pfn;
57504 + struct page *page;
57505 +
57506 + if (!is_running_on_xen())
57507 + return -ENODEV;
57508 +
57509 + IPRINTK("Initialising balloon driver.\n");
57510 +
57511 + bs.current_pages = min(xen_start_info->nr_pages, max_pfn);
57512 + totalram_pages = bs.current_pages;
57513 + bs.target_pages = bs.current_pages;
57514 + bs.balloon_low = 0;
57515 + bs.balloon_high = 0;
57516 + bs.driver_pages = 0UL;
57517 + bs.hard_limit = ~0UL;
57518 +
57519 + init_timer(&balloon_timer);
57520 + balloon_timer.data = 0;
57521 + balloon_timer.function = balloon_alarm;
57522 +
57523 +#ifdef CONFIG_PROC_FS
57524 + if ((balloon_pde = create_xen_proc_entry("balloon", 0644)) == NULL) {
57525 + WPRINTK("Unable to create /proc/xen/balloon.\n");
57526 + return -1;
57527 + }
57528 +
57529 + balloon_pde->read_proc = balloon_read;
57530 + balloon_pde->write_proc = balloon_write;
57531 +#endif
57532 + balloon_sysfs_init();
57533 +
57534 + /* Initialise the balloon with excess memory space. */
57535 + for (pfn = xen_start_info->nr_pages; pfn < max_pfn; pfn++) {
57536 + page = pfn_to_page(pfn);
57537 + if (!PageReserved(page))
57538 + balloon_append(page);
57539 + }
57540 +
57541 + target_watch.callback = watch_target;
57542 + xenstore_notifier.notifier_call = balloon_init_watcher;
57543 +
57544 + register_xenstore_notifier(&xenstore_notifier);
57545 +
57546 + return 0;
57547 +}
57548 +
57549 +subsys_initcall(balloon_init);
57550 +
57551 +void balloon_update_driver_allowance(long delta)
57552 +{
57553 + unsigned long flags;
57554 +
57555 + balloon_lock(flags);
57556 + bs.driver_pages += delta;
57557 + balloon_unlock(flags);
57558 +}
57559 +
57560 +static int dealloc_pte_fn(
57561 + pte_t *pte, struct page *pmd_page, unsigned long addr, void *data)
57562 +{
57563 + unsigned long mfn = pte_mfn(*pte);
57564 + int ret;
57565 + struct xen_memory_reservation reservation = {
57566 + .nr_extents = 1,
57567 + .extent_order = 0,
57568 + .domid = DOMID_SELF
57569 + };
57570 + set_xen_guest_handle(reservation.extent_start, &mfn);
57571 + set_pte_at(&init_mm, addr, pte, __pte_ma(0));
57572 + set_phys_to_machine(__pa(addr) >> PAGE_SHIFT, INVALID_P2M_ENTRY);
57573 + ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation);
57574 + BUG_ON(ret != 1);
57575 + return 0;
57576 +}
57577 +
57578 +struct page **alloc_empty_pages_and_pagevec(int nr_pages)
57579 +{
57580 + unsigned long vaddr, flags;
57581 + struct page *page, **pagevec;
57582 + int i, ret;
57583 +
57584 + pagevec = kmalloc(sizeof(page) * nr_pages, GFP_KERNEL);
57585 + if (pagevec == NULL)
57586 + return NULL;
57587 +
57588 + for (i = 0; i < nr_pages; i++) {
57589 + page = pagevec[i] = alloc_page(GFP_KERNEL);
57590 + if (page == NULL)
57591 + goto err;
57592 +
57593 + vaddr = (unsigned long)page_address(page);
57594 +
57595 + scrub_pages(vaddr, 1);
57596 +
57597 + balloon_lock(flags);
57598 +
57599 + if (xen_feature(XENFEAT_auto_translated_physmap)) {
57600 + unsigned long gmfn = page_to_pfn(page);
57601 + struct xen_memory_reservation reservation = {
57602 + .nr_extents = 1,
57603 + .extent_order = 0,
57604 + .domid = DOMID_SELF
57605 + };
57606 + set_xen_guest_handle(reservation.extent_start, &gmfn);
57607 + ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
57608 + &reservation);
57609 + if (ret == 1)
57610 + ret = 0; /* success */
57611 + } else {
57612 + ret = apply_to_page_range(&init_mm, vaddr, PAGE_SIZE,
57613 + dealloc_pte_fn, NULL);
57614 + }
57615 +
57616 + if (ret != 0) {
57617 + balloon_unlock(flags);
57618 + __free_page(page);
57619 + goto err;
57620 + }
57621 +
57622 + totalram_pages = --bs.current_pages;
57623 +
57624 + balloon_unlock(flags);
57625 + }
57626 +
57627 + out:
57628 + schedule_work(&balloon_worker);
57629 + flush_tlb_all();
57630 + return pagevec;
57631 +
57632 + err:
57633 + balloon_lock(flags);
57634 + while (--i >= 0)
57635 + balloon_append(pagevec[i]);
57636 + balloon_unlock(flags);
57637 + kfree(pagevec);
57638 + pagevec = NULL;
57639 + goto out;
57640 +}
57641 +
57642 +void free_empty_pages_and_pagevec(struct page **pagevec, int nr_pages)
57643 +{
57644 + unsigned long flags;
57645 + int i;
57646 +
57647 + if (pagevec == NULL)
57648 + return;
57649 +
57650 + balloon_lock(flags);
57651 + for (i = 0; i < nr_pages; i++) {
57652 + BUG_ON(page_count(pagevec[i]) != 1);
57653 + balloon_append(pagevec[i]);
57654 + }
57655 + balloon_unlock(flags);
57656 +
57657 + kfree(pagevec);
57658 +
57659 + schedule_work(&balloon_worker);
57660 +}
57661 +
57662 +void balloon_release_driver_page(struct page *page)
57663 +{
57664 + unsigned long flags;
57665 +
57666 + balloon_lock(flags);
57667 + balloon_append(page);
57668 + bs.driver_pages--;
57669 + balloon_unlock(flags);
57670 +
57671 + schedule_work(&balloon_worker);
57672 +}
57673 +
57674 +EXPORT_SYMBOL_GPL(balloon_update_driver_allowance);
57675 +EXPORT_SYMBOL_GPL(alloc_empty_pages_and_pagevec);
57676 +EXPORT_SYMBOL_GPL(free_empty_pages_and_pagevec);
57677 +EXPORT_SYMBOL_GPL(balloon_release_driver_page);
57678 +
57679 +MODULE_LICENSE("Dual BSD/GPL");
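
balloon_write() above hands the written string to memparse(), so the balloon target can be given with a size suffix such as "512M", which is then converted to a page count for balloon_set_new_target(). The sketch below is illustrative only and not part of the patch; it assumes the proc node created by create_xen_proc_entry("balloon", ...) appears as /proc/xen/balloon, and the write requires CAP_SYS_ADMIN.

/* Illustrative only: ask the balloon driver to target 512 MiB for this
 * domain.  The path /proc/xen/balloon is an assumption about where the
 * xen proc entry is exposed; the target string is parsed by memparse(). */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/xen/balloon", "w");

	if (!f) {
		perror("fopen /proc/xen/balloon");
		return 1;
	}
	/* "512M" -> 512 MiB -> balloon_set_new_target(bytes >> PAGE_SHIFT) */
	if (fputs("512M", f) == EOF)
		perror("fputs");
	fclose(f);
	return 0;
}
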
57680 diff -Nur linux-2.6.16.33-noxen/drivers/xen/balloon/common.h linux-2.6.16.33/drivers/xen/balloon/common.h
57681 --- linux-2.6.16.33-noxen/drivers/xen/balloon/common.h 1970-01-01 00:00:00.000000000 +0000
57682 +++ linux-2.6.16.33/drivers/xen/balloon/common.h 2007-01-08 15:00:45.000000000 +0000
57683 @@ -0,0 +1,58 @@
57684 +/******************************************************************************
57685 + * balloon/common.h
57686 + *
57687 + * This program is free software; you can redistribute it and/or
57688 + * modify it under the terms of the GNU General Public License version 2
57689 + * as published by the Free Software Foundation; or, when distributed
57690 + * separately from the Linux kernel or incorporated into other
57691 + * software packages, subject to the following license:
57692 + *
57693 + * Permission is hereby granted, free of charge, to any person obtaining a copy
57694 + * of this source file (the "Software"), to deal in the Software without
57695 + * restriction, including without limitation the rights to use, copy, modify,
57696 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
57697 + * and to permit persons to whom the Software is furnished to do so, subject to
57698 + * the following conditions:
57699 + *
57700 + * The above copyright notice and this permission notice shall be included in
57701 + * all copies or substantial portions of the Software.
57702 + *
57703 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
57704 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
57705 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
57706 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
57707 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
57708 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
57709 + * IN THE SOFTWARE.
57710 + */
57711 +
57712 +#ifndef __XEN_BALLOON_COMMON_H__
57713 +#define __XEN_BALLOON_COMMON_H__
57714 +
57715 +#define PAGES2KB(_p) ((_p)<<(PAGE_SHIFT-10))
57716 +
57717 +struct balloon_stats {
57718 + /* We aim for 'current allocation' == 'target allocation'. */
57719 + unsigned long current_pages;
57720 + unsigned long target_pages;
57721 + /* We may hit the hard limit in Xen. If we do then we remember it. */
57722 + unsigned long hard_limit;
57723 + /*
57724 + * Drivers may alter the memory reservation independently, but they
57725 + * must inform the balloon driver so we avoid hitting the hard limit.
57726 + */
57727 + unsigned long driver_pages;
57728 + /* Number of pages in high- and low-memory balloons. */
57729 + unsigned long balloon_low;
57730 + unsigned long balloon_high;
57731 +};
57732 +
57733 +extern struct balloon_stats balloon_stats;
57734 +#define bs balloon_stats
57735 +
57736 +int balloon_sysfs_init(void);
57737 +void balloon_sysfs_exit(void);
57738 +
57739 +void balloon_set_new_target(unsigned long target);
57740 +
57741 +#endif /* __XEN_BALLOON_COMMON_H__ */
57742 diff -Nur linux-2.6.16.33-noxen/drivers/xen/balloon/sysfs.c linux-2.6.16.33/drivers/xen/balloon/sysfs.c
57743 --- linux-2.6.16.33-noxen/drivers/xen/balloon/sysfs.c 1970-01-01 00:00:00.000000000 +0000
57744 +++ linux-2.6.16.33/drivers/xen/balloon/sysfs.c 2007-01-08 15:00:45.000000000 +0000
57745 @@ -0,0 +1,165 @@
57746 +/******************************************************************************
57747 + * balloon/sysfs.c
57748 + *
57749 + * Xen balloon driver - sysfs interfaces.
57750 + *
57751 + * This program is free software; you can redistribute it and/or
57752 + * modify it under the terms of the GNU General Public License version 2
57753 + * as published by the Free Software Foundation; or, when distributed
57754 + * separately from the Linux kernel or incorporated into other
57755 + * software packages, subject to the following license:
57756 + *
57757 + * Permission is hereby granted, free of charge, to any person obtaining a copy
57758 + * of this source file (the "Software"), to deal in the Software without
57759 + * restriction, including without limitation the rights to use, copy, modify,
57760 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
57761 + * and to permit persons to whom the Software is furnished to do so, subject to
57762 + * the following conditions:
57763 + *
57764 + * The above copyright notice and this permission notice shall be included in
57765 + * all copies or substantial portions of the Software.
57766 + *
57767 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
57768 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
57769 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
57770 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
57771 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
57772 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
57773 + * IN THE SOFTWARE.
57774 + */
57775 +
57776 +#include <linux/config.h>
57777 +#include <linux/capability.h>
57778 +#include <linux/stat.h>
57779 +#include <linux/sysdev.h>
57780 +#include "common.h"
57781 +
57782 +#define BALLOON_CLASS_NAME "memory"
57783 +
57784 +#define BALLOON_SHOW(name, format, args...) \
57785 + static ssize_t show_##name(struct sys_device *dev, \
57786 + char *buf) \
57787 + { \
57788 + return sprintf(buf, format, ##args); \
57789 + } \
57790 + static SYSDEV_ATTR(name, S_IRUGO, show_##name, NULL)
57791 +
57792 +BALLOON_SHOW(current_kb, "%lu\n", PAGES2KB(bs.current_pages));
57793 +BALLOON_SHOW(low_kb, "%lu\n", PAGES2KB(bs.balloon_low));
57794 +BALLOON_SHOW(high_kb, "%lu\n", PAGES2KB(bs.balloon_high));
57795 +BALLOON_SHOW(hard_limit_kb,
57796 + (bs.hard_limit!=~0UL) ? "%lu\n" : "???\n",
57797 + (bs.hard_limit!=~0UL) ? PAGES2KB(bs.hard_limit) : 0);
57798 +BALLOON_SHOW(driver_kb, "%lu\n", PAGES2KB(bs.driver_pages));
57799 +
57800 +static ssize_t show_target_kb(struct sys_device *dev, char *buf)
57801 +{
57802 + return sprintf(buf, "%lu\n", PAGES2KB(bs.target_pages));
57803 +}
57804 +
57805 +static ssize_t store_target_kb(struct sys_device *dev,
57806 + const char *buf,
57807 + size_t count)
57808 +{
57809 + char memstring[64], *endchar;
57810 + unsigned long long target_bytes;
57811 +
57812 + if (!capable(CAP_SYS_ADMIN))
57813 + return -EPERM;
57814 +
57815 + if (count <= 1)
57816 + return -EBADMSG; /* runt */
57817 + if (count > sizeof(memstring))
57818 + return -EFBIG; /* too long */
57819 + strcpy(memstring, buf);
57820 +
57821 + target_bytes = memparse(memstring, &endchar);
57822 + balloon_set_new_target(target_bytes >> PAGE_SHIFT);
57823 +
57824 + return count;
57825 +}
57826 +
57827 +static SYSDEV_ATTR(target_kb, S_IRUGO | S_IWUSR,
57828 + show_target_kb, store_target_kb);
57829 +
57830 +static struct sysdev_attribute *balloon_attrs[] = {
57831 + &attr_target_kb,
57832 +};
57833 +
57834 +static struct attribute *balloon_info_attrs[] = {
57835 + &attr_current_kb.attr,
57836 + &attr_low_kb.attr,
57837 + &attr_high_kb.attr,
57838 + &attr_hard_limit_kb.attr,
57839 + &attr_driver_kb.attr,
57840 + NULL
57841 +};
57842 +
57843 +static struct attribute_group balloon_info_group = {
57844 + .name = "info",
57845 + .attrs = balloon_info_attrs,
57846 +};
57847 +
57848 +static struct sysdev_class balloon_sysdev_class = {
57849 + set_kset_name(BALLOON_CLASS_NAME),
57850 +};
57851 +
57852 +static struct sys_device balloon_sysdev;
57853 +
57854 +static int register_balloon(struct sys_device *sysdev)
57855 +{
57856 + int i, error;
57857 +
57858 + error = sysdev_class_register(&balloon_sysdev_class);
57859 + if (error)
57860 + return error;
57861 +
57862 + sysdev->id = 0;
57863 + sysdev->cls = &balloon_sysdev_class;
57864 +
57865 + error = sysdev_register(sysdev);
57866 + if (error) {
57867 + sysdev_class_unregister(&balloon_sysdev_class);
57868 + return error;
57869 + }
57870 +
57871 + for (i = 0; i < ARRAY_SIZE(balloon_attrs); i++) {
57872 + error = sysdev_create_file(sysdev, balloon_attrs[i]);
57873 + if (error)
57874 + goto fail;
57875 + }
57876 +
57877 + error = sysfs_create_group(&sysdev->kobj, &balloon_info_group);
57878 + if (error)
57879 + goto fail;
57880 +
57881 + return 0;
57882 +
57883 + fail:
57884 + while (--i >= 0)
57885 + sysdev_remove_file(sysdev, balloon_attrs[i]);
57886 + sysdev_unregister(sysdev);
57887 + sysdev_class_unregister(&balloon_sysdev_class);
57888 + return error;
57889 +}
57890 +
57891 +static void unregister_balloon(struct sys_device *sysdev)
57892 +{
57893 + int i;
57894 +
57895 + sysfs_remove_group(&sysdev->kobj, &balloon_info_group);
57896 + for (i = 0; i < ARRAY_SIZE(balloon_attrs); i++)
57897 + sysdev_remove_file(sysdev, balloon_attrs[i]);
57898 + sysdev_unregister(sysdev);
57899 + sysdev_class_unregister(&balloon_sysdev_class);
57900 +}
57901 +
57902 +int balloon_sysfs_init(void)
57903 +{
57904 + return register_balloon(&balloon_sysdev);
57905 +}
57906 +
57907 +void balloon_sysfs_exit(void)
57908 +{
57909 + unregister_balloon(&balloon_sysdev);
57910 +}
57911 diff -Nur linux-2.6.16.33-noxen/drivers/xen/blkback/Makefile linux-2.6.16.33/drivers/xen/blkback/Makefile
57912 --- linux-2.6.16.33-noxen/drivers/xen/blkback/Makefile 1970-01-01 00:00:00.000000000 +0000
57913 +++ linux-2.6.16.33/drivers/xen/blkback/Makefile 2007-01-08 15:00:45.000000000 +0000
57914 @@ -0,0 +1,3 @@
57915 +obj-$(CONFIG_XEN_BLKDEV_BACKEND) := blkbk.o
57916 +
57917 +blkbk-y := blkback.o xenbus.o interface.o vbd.o
57918 diff -Nur linux-2.6.16.33-noxen/drivers/xen/blkback/blkback.c linux-2.6.16.33/drivers/xen/blkback/blkback.c
57919 --- linux-2.6.16.33-noxen/drivers/xen/blkback/blkback.c 1970-01-01 00:00:00.000000000 +0000
57920 +++ linux-2.6.16.33/drivers/xen/blkback/blkback.c 2007-01-08 15:00:45.000000000 +0000
57921 @@ -0,0 +1,580 @@
57922 +/******************************************************************************
57923 + * arch/xen/drivers/blkif/backend/main.c
57924 + *
57925 + * Back-end of the driver for virtual block devices. This portion of the
57926 + * driver exports a 'unified' block-device interface that can be accessed
57927 + * by any operating system that implements a compatible front end. A
57928 + * reference front-end implementation can be found in:
57929 + * arch/xen/drivers/blkif/frontend
57930 + *
57931 + * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
57932 + * Copyright (c) 2005, Christopher Clark
57933 + *
57934 + * This program is free software; you can redistribute it and/or
57935 + * modify it under the terms of the GNU General Public License version 2
57936 + * as published by the Free Software Foundation; or, when distributed
57937 + * separately from the Linux kernel or incorporated into other
57938 + * software packages, subject to the following license:
57939 + *
57940 + * Permission is hereby granted, free of charge, to any person obtaining a copy
57941 + * of this source file (the "Software"), to deal in the Software without
57942 + * restriction, including without limitation the rights to use, copy, modify,
57943 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
57944 + * and to permit persons to whom the Software is furnished to do so, subject to
57945 + * the following conditions:
57946 + *
57947 + * The above copyright notice and this permission notice shall be included in
57948 + * all copies or substantial portions of the Software.
57949 + *
57950 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
57951 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
57952 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
57953 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
57954 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
57955 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
57956 + * IN THE SOFTWARE.
57957 + */
57958 +
57959 +#include <linux/spinlock.h>
57960 +#include <linux/kthread.h>
57961 +#include <linux/list.h>
57962 +#include <xen/balloon.h>
57963 +#include <asm/hypervisor.h>
57964 +#include "common.h"
57965 +
57966 +/*
57967 + * These are rather arbitrary. They are fairly large because adjacent requests
57968 + * pulled from a communication ring are quite likely to end up being part of
57969 + * the same scatter/gather request at the disc.
57970 + *
57971 + * ** TRY INCREASING 'blkif_reqs' IF WRITE SPEEDS SEEM TOO LOW **
57972 + *
57973 + * This will increase the chances of being able to write whole tracks.
57974 + * 64 should be enough to keep us competitive with Linux.
57975 + */
57976 +static int blkif_reqs = 64;
57977 +module_param_named(reqs, blkif_reqs, int, 0);
57978 +MODULE_PARM_DESC(reqs, "Number of blkback requests to allocate");
57979 +
57980 +/* Run-time switchable: /sys/module/blkback/parameters/ */
57981 +static unsigned int log_stats = 0;
57982 +static unsigned int debug_lvl = 0;
57983 +module_param(log_stats, int, 0644);
57984 +module_param(debug_lvl, int, 0644);
57985 +
57986 +/*
57987 + * Each outstanding request that we've passed to the lower device layers has a
57988 + * 'pending_req' allocated to it. Each buffer_head that completes decrements
57989 + * the pendcnt towards zero. When it hits zero, the specified domain has a
57990 + * response queued for it, with the saved 'id' passed back.
57991 + */
57992 +typedef struct {
57993 + blkif_t *blkif;
57994 + unsigned long id;
57995 + int nr_pages;
57996 + atomic_t pendcnt;
57997 + unsigned short operation;
57998 + int status;
57999 + struct list_head free_list;
58000 +} pending_req_t;
58001 +
58002 +static pending_req_t *pending_reqs;
58003 +static struct list_head pending_free;
58004 +static DEFINE_SPINLOCK(pending_free_lock);
58005 +static DECLARE_WAIT_QUEUE_HEAD(pending_free_wq);
58006 +
58007 +#define BLKBACK_INVALID_HANDLE (~0)
58008 +
58009 +static struct page **pending_pages;
58010 +static grant_handle_t *pending_grant_handles;
58011 +
58012 +static inline int vaddr_pagenr(pending_req_t *req, int seg)
58013 +{
58014 + return (req - pending_reqs) * BLKIF_MAX_SEGMENTS_PER_REQUEST + seg;
58015 +}
58016 +
58017 +static inline unsigned long vaddr(pending_req_t *req, int seg)
58018 +{
58019 + unsigned long pfn = page_to_pfn(pending_pages[vaddr_pagenr(req, seg)]);
58020 + return (unsigned long)pfn_to_kaddr(pfn);
58021 +}
58022 +
58023 +#define pending_handle(_req, _seg) \
58024 + (pending_grant_handles[vaddr_pagenr(_req, _seg)])
58025 +
58026 +
58027 +static int do_block_io_op(blkif_t *blkif);
58028 +static void dispatch_rw_block_io(blkif_t *blkif,
58029 + blkif_request_t *req,
58030 + pending_req_t *pending_req);
58031 +static void make_response(blkif_t *blkif, unsigned long id,
58032 + unsigned short op, int st);
58033 +
58034 +/******************************************************************
58035 + * misc small helpers
58036 + */
58037 +static pending_req_t* alloc_req(void)
58038 +{
58039 + pending_req_t *req = NULL;
58040 + unsigned long flags;
58041 +
58042 + spin_lock_irqsave(&pending_free_lock, flags);
58043 + if (!list_empty(&pending_free)) {
58044 + req = list_entry(pending_free.next, pending_req_t, free_list);
58045 + list_del(&req->free_list);
58046 + }
58047 + spin_unlock_irqrestore(&pending_free_lock, flags);
58048 + return req;
58049 +}
58050 +
58051 +static void free_req(pending_req_t *req)
58052 +{
58053 + unsigned long flags;
58054 + int was_empty;
58055 +
58056 + spin_lock_irqsave(&pending_free_lock, flags);
58057 + was_empty = list_empty(&pending_free);
58058 + list_add(&req->free_list, &pending_free);
58059 + spin_unlock_irqrestore(&pending_free_lock, flags);
58060 + if (was_empty)
58061 + wake_up(&pending_free_wq);
58062 +}
58063 +
58064 +static void unplug_queue(blkif_t *blkif)
58065 +{
58066 + if (blkif->plug == NULL)
58067 + return;
58068 + if (blkif->plug->unplug_fn)
58069 + blkif->plug->unplug_fn(blkif->plug);
58070 + blk_put_queue(blkif->plug);
58071 + blkif->plug = NULL;
58072 +}
58073 +
58074 +static void plug_queue(blkif_t *blkif, struct bio *bio)
58075 +{
58076 + request_queue_t *q = bdev_get_queue(bio->bi_bdev);
58077 +
58078 + if (q == blkif->plug)
58079 + return;
58080 + unplug_queue(blkif);
58081 + blk_get_queue(q);
58082 + blkif->plug = q;
58083 +}
58084 +
58085 +static void fast_flush_area(pending_req_t *req)
58086 +{
58087 + struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
58088 + unsigned int i, invcount = 0;
58089 + grant_handle_t handle;
58090 + int ret;
58091 +
58092 + for (i = 0; i < req->nr_pages; i++) {
58093 + handle = pending_handle(req, i);
58094 + if (handle == BLKBACK_INVALID_HANDLE)
58095 + continue;
58096 + gnttab_set_unmap_op(&unmap[i], vaddr(req, i), GNTMAP_host_map,
58097 + handle);
58098 + pending_handle(req, i) = BLKBACK_INVALID_HANDLE;
58099 + invcount++;
58100 + }
58101 +
58102 + ret = HYPERVISOR_grant_table_op(
58103 + GNTTABOP_unmap_grant_ref, unmap, invcount);
58104 + BUG_ON(ret);
58105 +}
58106 +
58107 +/******************************************************************
58108 + * SCHEDULER FUNCTIONS
58109 + */
58110 +
58111 +static void print_stats(blkif_t *blkif)
58112 +{
58113 + printk(KERN_DEBUG "%s: oo %3d | rd %4d | wr %4d | br %4d\n",
58114 + current->comm, blkif->st_oo_req,
58115 + blkif->st_rd_req, blkif->st_wr_req, blkif->st_br_req);
58116 + blkif->st_print = jiffies + msecs_to_jiffies(10 * 1000);
58117 + blkif->st_rd_req = 0;
58118 + blkif->st_wr_req = 0;
58119 + blkif->st_oo_req = 0;
58120 +}
58121 +
58122 +int blkif_schedule(void *arg)
58123 +{
58124 + blkif_t *blkif = arg;
58125 +
58126 + blkif_get(blkif);
58127 +
58128 + if (debug_lvl)
58129 + printk(KERN_DEBUG "%s: started\n", current->comm);
58130 +
58131 + while (!kthread_should_stop()) {
58132 + wait_event_interruptible(
58133 + blkif->wq,
58134 + blkif->waiting_reqs || kthread_should_stop());
58135 + wait_event_interruptible(
58136 + pending_free_wq,
58137 + !list_empty(&pending_free) || kthread_should_stop());
58138 +
58139 + blkif->waiting_reqs = 0;
58140 + smp_mb(); /* clear flag *before* checking for work */
58141 +
58142 + if (do_block_io_op(blkif))
58143 + blkif->waiting_reqs = 1;
58144 + unplug_queue(blkif);
58145 +
58146 + if (log_stats && time_after(jiffies, blkif->st_print))
58147 + print_stats(blkif);
58148 + }
58149 +
58150 + if (log_stats)
58151 + print_stats(blkif);
58152 + if (debug_lvl)
58153 + printk(KERN_DEBUG "%s: exiting\n", current->comm);
58154 +
58155 + blkif->xenblkd = NULL;
58156 + blkif_put(blkif);
58157 +
58158 + return 0;
58159 +}
58160 +
58161 +/******************************************************************
58162 + * COMPLETION CALLBACK -- Called as bh->b_end_io()
58163 + */
58164 +
58165 +static void __end_block_io_op(pending_req_t *pending_req, int error)
58166 +{
58167 + /* An error fails the entire request. */
58168 + if ((pending_req->operation == BLKIF_OP_WRITE_BARRIER) &&
58169 + (error == -EOPNOTSUPP)) {
58170 + DPRINTK("blkback: write barrier op failed, not supported\n");
58171 + blkback_barrier(XBT_NIL, pending_req->blkif->be, 0);
58172 + pending_req->status = BLKIF_RSP_EOPNOTSUPP;
58173 + } else if (error) {
58174 + DPRINTK("Buffer not up-to-date at end of operation, "
58175 + "error=%d\n", error);
58176 + pending_req->status = BLKIF_RSP_ERROR;
58177 + }
58178 +
58179 + if (atomic_dec_and_test(&pending_req->pendcnt)) {
58180 + fast_flush_area(pending_req);
58181 + make_response(pending_req->blkif, pending_req->id,
58182 + pending_req->operation, pending_req->status);
58183 + blkif_put(pending_req->blkif);
58184 + free_req(pending_req);
58185 + }
58186 +}
58187 +
58188 +static int end_block_io_op(struct bio *bio, unsigned int done, int error)
58189 +{
58190 + if (bio->bi_size != 0)
58191 + return 1;
58192 + __end_block_io_op(bio->bi_private, error);
58193 + bio_put(bio);
58194 + return error;
58195 +}
58196 +
58197 +
58198 +/******************************************************************************
58199 + * NOTIFICATION FROM GUEST OS.
58200 + */
58201 +
58202 +static void blkif_notify_work(blkif_t *blkif)
58203 +{
58204 + blkif->waiting_reqs = 1;
58205 + wake_up(&blkif->wq);
58206 +}
58207 +
58208 +irqreturn_t blkif_be_int(int irq, void *dev_id, struct pt_regs *regs)
58209 +{
58210 + blkif_notify_work(dev_id);
58211 + return IRQ_HANDLED;
58212 +}
58213 +
58214 +
58215 +
58216 +/******************************************************************
58217 + * DOWNWARD CALLS -- These interface with the block-device layer proper.
58218 + */
58219 +
58220 +static int do_block_io_op(blkif_t *blkif)
58221 +{
58222 + blkif_back_ring_t *blk_ring = &blkif->blk_ring;
58223 + blkif_request_t req;
58224 + pending_req_t *pending_req;
58225 + RING_IDX rc, rp;
58226 + int more_to_do = 0;
58227 +
58228 + rc = blk_ring->req_cons;
58229 + rp = blk_ring->sring->req_prod;
58230 + rmb(); /* Ensure we see queued requests up to 'rp'. */
58231 +
58232 + while ((rc != rp) && !RING_REQUEST_CONS_OVERFLOW(blk_ring, rc)) {
58233 +
58234 + pending_req = alloc_req();
58235 + if (NULL == pending_req) {
58236 + blkif->st_oo_req++;
58237 + more_to_do = 1;
58238 + break;
58239 + }
58240 +
58241 + memcpy(&req, RING_GET_REQUEST(blk_ring, rc), sizeof(req));
58242 + blk_ring->req_cons = ++rc; /* before make_response() */
58243 +
58244 + switch (req.operation) {
58245 + case BLKIF_OP_READ:
58246 + blkif->st_rd_req++;
58247 + dispatch_rw_block_io(blkif, &req, pending_req);
58248 + break;
58249 + case BLKIF_OP_WRITE_BARRIER:
58250 + blkif->st_br_req++;
58251 + /* fall through */
58252 + case BLKIF_OP_WRITE:
58253 + blkif->st_wr_req++;
58254 + dispatch_rw_block_io(blkif, &req, pending_req);
58255 + break;
58256 + default:
58257 + DPRINTK("error: unknown block io operation [%d]\n",
58258 + req.operation);
58259 + make_response(blkif, req.id, req.operation,
58260 + BLKIF_RSP_ERROR);
58261 + free_req(pending_req);
58262 + break;
58263 + }
58264 + }
58265 + return more_to_do;
58266 +}
58267 +
58268 +static void dispatch_rw_block_io(blkif_t *blkif,
58269 + blkif_request_t *req,
58270 + pending_req_t *pending_req)
58271 +{
58272 + extern void ll_rw_block(int rw, int nr, struct buffer_head * bhs[]);
58273 + struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST];
58274 + struct phys_req preq;
58275 + struct {
58276 + unsigned long buf; unsigned int nsec;
58277 + } seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
58278 + unsigned int nseg;
58279 + struct bio *bio = NULL, *biolist[BLKIF_MAX_SEGMENTS_PER_REQUEST];
58280 + int ret, i, nbio = 0;
58281 + int operation;
58282 +
58283 + switch (req->operation) {
58284 + case BLKIF_OP_READ:
58285 + operation = READ;
58286 + break;
58287 + case BLKIF_OP_WRITE:
58288 + operation = WRITE;
58289 + break;
58290 + case BLKIF_OP_WRITE_BARRIER:
58291 + operation = WRITE_BARRIER;
58292 + break;
58293 + default:
58294 + operation = 0; /* make gcc happy */
58295 + BUG();
58296 + }
58297 +
58298 + /* Check that number of segments is sane. */
58299 + nseg = req->nr_segments;
58300 + if (unlikely(nseg == 0) ||
58301 + unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) {
58302 + DPRINTK("Bad number of segments in request (%d)\n", nseg);
58303 + goto fail_response;
58304 + }
58305 +
58306 + preq.dev = req->handle;
58307 + preq.sector_number = req->sector_number;
58308 + preq.nr_sects = 0;
58309 +
58310 + pending_req->blkif = blkif;
58311 + pending_req->id = req->id;
58312 + pending_req->operation = req->operation;
58313 + pending_req->status = BLKIF_RSP_OKAY;
58314 + pending_req->nr_pages = nseg;
58315 +
58316 + for (i = 0; i < nseg; i++) {
58317 + uint32_t flags;
58318 +
58319 + seg[i].nsec = req->seg[i].last_sect -
58320 + req->seg[i].first_sect + 1;
58321 +
58322 + if ((req->seg[i].last_sect >= (PAGE_SIZE >> 9)) ||
58323 + (req->seg[i].last_sect < req->seg[i].first_sect))
58324 + goto fail_response;
58325 + preq.nr_sects += seg[i].nsec;
58326 +
58327 + flags = GNTMAP_host_map;
58328 + if (operation != READ)
58329 + flags |= GNTMAP_readonly;
58330 + gnttab_set_map_op(&map[i], vaddr(pending_req, i), flags,
58331 + req->seg[i].gref, blkif->domid);
58332 + }
58333 +
58334 + ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, map, nseg);
58335 + BUG_ON(ret);
58336 +
58337 + for (i = 0; i < nseg; i++) {
58338 + if (unlikely(map[i].status != 0)) {
58339 + DPRINTK("invalid buffer -- could not remap it\n");
58340 + map[i].handle = BLKBACK_INVALID_HANDLE;
58341 + ret |= 1;
58342 + }
58343 +
58344 + pending_handle(pending_req, i) = map[i].handle;
58345 +
58346 + if (ret)
58347 + continue;
58348 +
58349 + set_phys_to_machine(__pa(vaddr(
58350 + pending_req, i)) >> PAGE_SHIFT,
58351 + FOREIGN_FRAME(map[i].dev_bus_addr >> PAGE_SHIFT));
58352 + seg[i].buf = map[i].dev_bus_addr |
58353 + (req->seg[i].first_sect << 9);
58354 + }
58355 +
58356 + if (ret)
58357 + goto fail_flush;
58358 +
58359 + if (vbd_translate(&preq, blkif, operation) != 0) {
58360 + DPRINTK("access denied: %s of [%llu,%llu] on dev=%04x\n",
58361 + operation == READ ? "read" : "write",
58362 + preq.sector_number,
58363 + preq.sector_number + preq.nr_sects, preq.dev);
58364 + goto fail_flush;
58365 + }
58366 +
58367 + for (i = 0; i < nseg; i++) {
58368 + if (((int)preq.sector_number|(int)seg[i].nsec) &
58369 + ((bdev_hardsect_size(preq.bdev) >> 9) - 1)) {
58370 + DPRINTK("Misaligned I/O request from domain %d",
58371 + blkif->domid);
58372 + goto fail_put_bio;
58373 + }
58374 +
58375 + while ((bio == NULL) ||
58376 + (bio_add_page(bio,
58377 + virt_to_page(vaddr(pending_req, i)),
58378 + seg[i].nsec << 9,
58379 + seg[i].buf & ~PAGE_MASK) == 0)) {
58380 + bio = biolist[nbio++] = bio_alloc(GFP_KERNEL, nseg-i);
58381 + if (unlikely(bio == NULL))
58382 + goto fail_put_bio;
58383 +
58384 + bio->bi_bdev = preq.bdev;
58385 + bio->bi_private = pending_req;
58386 + bio->bi_end_io = end_block_io_op;
58387 + bio->bi_sector = preq.sector_number;
58388 + }
58389 +
58390 + preq.sector_number += seg[i].nsec;
58391 + }
58392 +
58393 + plug_queue(blkif, bio);
58394 + atomic_set(&pending_req->pendcnt, nbio);
58395 + blkif_get(blkif);
58396 +
58397 + for (i = 0; i < nbio; i++)
58398 + submit_bio(operation, biolist[i]);
58399 +
58400 + return;
58401 +
58402 + fail_put_bio:
58403 + for (i = 0; i < (nbio-1); i++)
58404 + bio_put(biolist[i]);
58405 + fail_flush:
58406 + fast_flush_area(pending_req);
58407 + fail_response:
58408 + make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR);
58409 + free_req(pending_req);
58410 +}
58411 +
58412 +
58413 +
58414 +/******************************************************************
58415 + * MISCELLANEOUS SETUP / TEARDOWN / DEBUGGING
58416 + */
58417 +
58418 +
58419 +static void make_response(blkif_t *blkif, unsigned long id,
58420 + unsigned short op, int st)
58421 +{
58422 + blkif_response_t *resp;
58423 + unsigned long flags;
58424 + blkif_back_ring_t *blk_ring = &blkif->blk_ring;
58425 + int more_to_do = 0;
58426 + int notify;
58427 +
58428 + spin_lock_irqsave(&blkif->blk_ring_lock, flags);
58429 +
58430 + /* Place on the response ring for the relevant domain. */
58431 + resp = RING_GET_RESPONSE(blk_ring, blk_ring->rsp_prod_pvt);
58432 + resp->id = id;
58433 + resp->operation = op;
58434 + resp->status = st;
58435 + blk_ring->rsp_prod_pvt++;
58436 + RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(blk_ring, notify);
58437 +
58438 + if (blk_ring->rsp_prod_pvt == blk_ring->req_cons) {
58439 + /*
58440 + * Tail check for pending requests. Allows frontend to avoid
58441 +		 * notifications if requests are already in flight (lowers
58442 + * overheads and promotes batching).
58443 + */
58444 + RING_FINAL_CHECK_FOR_REQUESTS(blk_ring, more_to_do);
58445 +
58446 + } else if (RING_HAS_UNCONSUMED_REQUESTS(blk_ring)) {
58447 + more_to_do = 1;
58448 +
58449 + }
58450 + spin_unlock_irqrestore(&blkif->blk_ring_lock, flags);
58451 +
58452 + if (more_to_do)
58453 + blkif_notify_work(blkif);
58454 + if (notify)
58455 + notify_remote_via_irq(blkif->irq);
58456 +}
58457 +
58458 +static int __init blkif_init(void)
58459 +{
58460 + int i, mmap_pages;
58461 +
58462 + if (!is_running_on_xen())
58463 + return -ENODEV;
58464 +
58465 + mmap_pages = blkif_reqs * BLKIF_MAX_SEGMENTS_PER_REQUEST;
58466 +
58467 + pending_reqs = kmalloc(sizeof(pending_reqs[0]) *
58468 + blkif_reqs, GFP_KERNEL);
58469 + pending_grant_handles = kmalloc(sizeof(pending_grant_handles[0]) *
58470 + mmap_pages, GFP_KERNEL);
58471 + pending_pages = alloc_empty_pages_and_pagevec(mmap_pages);
58472 +
58473 + if (!pending_reqs || !pending_grant_handles || !pending_pages)
58474 + goto out_of_memory;
58475 +
58476 + for (i = 0; i < mmap_pages; i++)
58477 + pending_grant_handles[i] = BLKBACK_INVALID_HANDLE;
58478 +
58479 + blkif_interface_init();
58480 +
58481 +	memset(pending_reqs, 0, sizeof(pending_reqs[0]) * blkif_reqs);
58482 + INIT_LIST_HEAD(&pending_free);
58483 +
58484 + for (i = 0; i < blkif_reqs; i++)
58485 + list_add_tail(&pending_reqs[i].free_list, &pending_free);
58486 +
58487 + blkif_xenbus_init();
58488 +
58489 + return 0;
58490 +
58491 + out_of_memory:
58492 + kfree(pending_reqs);
58493 + kfree(pending_grant_handles);
58494 + free_empty_pages_and_pagevec(pending_pages, mmap_pages);
58495 + printk("%s: out of memory\n", __FUNCTION__);
58496 + return -ENOMEM;
58497 +}
58498 +
58499 +module_init(blkif_init);
58500 +
58501 +MODULE_LICENSE("Dual BSD/GPL");
58502 diff -Nur linux-2.6.16.33-noxen/drivers/xen/blkback/common.h linux-2.6.16.33/drivers/xen/blkback/common.h
58503 --- linux-2.6.16.33-noxen/drivers/xen/blkback/common.h 1970-01-01 00:00:00.000000000 +0000
58504 +++ linux-2.6.16.33/drivers/xen/blkback/common.h 2007-01-08 15:00:45.000000000 +0000
58505 @@ -0,0 +1,139 @@
58506 +/*
58507 + * This program is free software; you can redistribute it and/or
58508 + * modify it under the terms of the GNU General Public License version 2
58509 + * as published by the Free Software Foundation; or, when distributed
58510 + * separately from the Linux kernel or incorporated into other
58511 + * software packages, subject to the following license:
58512 + *
58513 + * Permission is hereby granted, free of charge, to any person obtaining a copy
58514 + * of this source file (the "Software"), to deal in the Software without
58515 + * restriction, including without limitation the rights to use, copy, modify,
58516 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
58517 + * and to permit persons to whom the Software is furnished to do so, subject to
58518 + * the following conditions:
58519 + *
58520 + * The above copyright notice and this permission notice shall be included in
58521 + * all copies or substantial portions of the Software.
58522 + *
58523 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
58524 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
58525 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
58526 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
58527 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
58528 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
58529 + * IN THE SOFTWARE.
58530 + */
58531 +
58532 +#ifndef __BLKIF__BACKEND__COMMON_H__
58533 +#define __BLKIF__BACKEND__COMMON_H__
58534 +
58535 +#include <linux/config.h>
58536 +#include <linux/version.h>
58537 +#include <linux/module.h>
58538 +#include <linux/interrupt.h>
58539 +#include <linux/slab.h>
58540 +#include <linux/blkdev.h>
58541 +#include <linux/vmalloc.h>
58542 +#include <linux/wait.h>
58543 +#include <asm/io.h>
58544 +#include <asm/setup.h>
58545 +#include <asm/pgalloc.h>
58546 +#include <xen/evtchn.h>
58547 +#include <asm/hypervisor.h>
58548 +#include <xen/interface/io/blkif.h>
58549 +#include <xen/interface/io/ring.h>
58550 +#include <xen/gnttab.h>
58551 +#include <xen/driver_util.h>
58552 +#include <xen/xenbus.h>
58553 +
58554 +#define DPRINTK(_f, _a...) \
58555 + pr_debug("(file=%s, line=%d) " _f, \
58556 + __FILE__ , __LINE__ , ## _a )
58557 +
58558 +struct vbd {
58559 + blkif_vdev_t handle; /* what the domain refers to this vbd as */
58560 + unsigned char readonly; /* Non-zero -> read-only */
58561 + unsigned char type; /* VDISK_xxx */
58562 + u32 pdevice; /* phys device that this vbd maps to */
58563 + struct block_device *bdev;
58564 +};
58565 +
58566 +struct backend_info;
58567 +
58568 +typedef struct blkif_st {
58569 + /* Unique identifier for this interface. */
58570 + domid_t domid;
58571 + unsigned int handle;
58572 + /* Physical parameters of the comms window. */
58573 + unsigned int evtchn;
58574 + unsigned int irq;
58575 + /* Comms information. */
58576 + blkif_back_ring_t blk_ring;
58577 + struct vm_struct *blk_ring_area;
58578 + /* The VBD attached to this interface. */
58579 + struct vbd vbd;
58580 + /* Back pointer to the backend_info. */
58581 + struct backend_info *be;
58582 + /* Private fields. */
58583 + spinlock_t blk_ring_lock;
58584 + atomic_t refcnt;
58585 +
58586 + wait_queue_head_t wq;
58587 + struct task_struct *xenblkd;
58588 + unsigned int waiting_reqs;
58589 + request_queue_t *plug;
58590 +
58591 + /* statistics */
58592 + unsigned long st_print;
58593 + int st_rd_req;
58594 + int st_wr_req;
58595 + int st_oo_req;
58596 + int st_br_req;
58597 +
58598 + wait_queue_head_t waiting_to_free;
58599 +
58600 + grant_handle_t shmem_handle;
58601 + grant_ref_t shmem_ref;
58602 +} blkif_t;
58603 +
58604 +blkif_t *blkif_alloc(domid_t domid);
58605 +void blkif_disconnect(blkif_t *blkif);
58606 +void blkif_free(blkif_t *blkif);
58607 +int blkif_map(blkif_t *blkif, unsigned long shared_page, unsigned int evtchn);
58608 +
58609 +#define blkif_get(_b) (atomic_inc(&(_b)->refcnt))
58610 +#define blkif_put(_b) \
58611 + do { \
58612 + if (atomic_dec_and_test(&(_b)->refcnt)) \
58613 + wake_up(&(_b)->waiting_to_free);\
58614 + } while (0)
58615 +
58616 +/* Create a vbd. */
58617 +int vbd_create(blkif_t *blkif, blkif_vdev_t vdevice, unsigned major,
58618 + unsigned minor, int readonly);
58619 +void vbd_free(struct vbd *vbd);
58620 +
58621 +unsigned long long vbd_size(struct vbd *vbd);
58622 +unsigned int vbd_info(struct vbd *vbd);
58623 +unsigned long vbd_secsize(struct vbd *vbd);
58624 +
58625 +struct phys_req {
58626 + unsigned short dev;
58627 + unsigned short nr_sects;
58628 + struct block_device *bdev;
58629 + blkif_sector_t sector_number;
58630 +};
58631 +
58632 +int vbd_translate(struct phys_req *req, blkif_t *blkif, int operation);
58633 +
58634 +void blkif_interface_init(void);
58635 +
58636 +void blkif_xenbus_init(void);
58637 +
58638 +irqreturn_t blkif_be_int(int irq, void *dev_id, struct pt_regs *regs);
58639 +int blkif_schedule(void *arg);
58640 +
58641 +int blkback_barrier(struct xenbus_transaction xbt,
58642 + struct backend_info *be, int state);
58643 +
58644 +#endif /* __BLKIF__BACKEND__COMMON_H__ */
58645 diff -Nur linux-2.6.16.33-noxen/drivers/xen/blkback/interface.c linux-2.6.16.33/drivers/xen/blkback/interface.c
58646 --- linux-2.6.16.33-noxen/drivers/xen/blkback/interface.c 1970-01-01 00:00:00.000000000 +0000
58647 +++ linux-2.6.16.33/drivers/xen/blkback/interface.c 2007-01-08 15:00:45.000000000 +0000
58648 @@ -0,0 +1,171 @@
58649 +/******************************************************************************
58650 + * arch/xen/drivers/blkif/backend/interface.c
58651 + *
58652 + * Block-device interface management.
58653 + *
58654 + * Copyright (c) 2004, Keir Fraser
58655 + *
58656 + * This program is free software; you can redistribute it and/or
58657 + * modify it under the terms of the GNU General Public License version 2
58658 + * as published by the Free Software Foundation; or, when distributed
58659 + * separately from the Linux kernel or incorporated into other
58660 + * software packages, subject to the following license:
58661 + *
58662 + * Permission is hereby granted, free of charge, to any person obtaining a copy
58663 + * of this source file (the "Software"), to deal in the Software without
58664 + * restriction, including without limitation the rights to use, copy, modify,
58665 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
58666 + * and to permit persons to whom the Software is furnished to do so, subject to
58667 + * the following conditions:
58668 + *
58669 + * The above copyright notice and this permission notice shall be included in
58670 + * all copies or substantial portions of the Software.
58671 + *
58672 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
58673 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
58674 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
58675 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
58676 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
58677 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
58678 + * IN THE SOFTWARE.
58679 + */
58680 +
58681 +#include "common.h"
58682 +#include <xen/evtchn.h>
58683 +#include <linux/kthread.h>
58684 +
58685 +static kmem_cache_t *blkif_cachep;
58686 +
58687 +blkif_t *blkif_alloc(domid_t domid)
58688 +{
58689 + blkif_t *blkif;
58690 +
58691 + blkif = kmem_cache_alloc(blkif_cachep, GFP_KERNEL);
58692 + if (!blkif)
58693 + return ERR_PTR(-ENOMEM);
58694 +
58695 + memset(blkif, 0, sizeof(*blkif));
58696 + blkif->domid = domid;
58697 + spin_lock_init(&blkif->blk_ring_lock);
58698 + atomic_set(&blkif->refcnt, 1);
58699 + init_waitqueue_head(&blkif->wq);
58700 + blkif->st_print = jiffies;
58701 + init_waitqueue_head(&blkif->waiting_to_free);
58702 +
58703 + return blkif;
58704 +}
58705 +
58706 +static int map_frontend_page(blkif_t *blkif, unsigned long shared_page)
58707 +{
58708 + struct gnttab_map_grant_ref op;
58709 + int ret;
58710 +
58711 + gnttab_set_map_op(&op, (unsigned long)blkif->blk_ring_area->addr,
58712 + GNTMAP_host_map, shared_page, blkif->domid);
58713 +
58714 + lock_vm_area(blkif->blk_ring_area);
58715 + ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1);
58716 + unlock_vm_area(blkif->blk_ring_area);
58717 + BUG_ON(ret);
58718 +
58719 + if (op.status) {
58720 + DPRINTK(" Grant table operation failure !\n");
58721 + return op.status;
58722 + }
58723 +
58724 + blkif->shmem_ref = shared_page;
58725 + blkif->shmem_handle = op.handle;
58726 +
58727 + return 0;
58728 +}
58729 +
58730 +static void unmap_frontend_page(blkif_t *blkif)
58731 +{
58732 + struct gnttab_unmap_grant_ref op;
58733 + int ret;
58734 +
58735 + gnttab_set_unmap_op(&op, (unsigned long)blkif->blk_ring_area->addr,
58736 + GNTMAP_host_map, blkif->shmem_handle);
58737 +
58738 + lock_vm_area(blkif->blk_ring_area);
58739 + ret = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1);
58740 + unlock_vm_area(blkif->blk_ring_area);
58741 + BUG_ON(ret);
58742 +}
58743 +
58744 +int blkif_map(blkif_t *blkif, unsigned long shared_page, unsigned int evtchn)
58745 +{
58746 + blkif_sring_t *sring;
58747 + int err;
58748 + struct evtchn_bind_interdomain bind_interdomain;
58749 +
58750 + /* Already connected through? */
58751 + if (blkif->irq)
58752 + return 0;
58753 +
58754 + if ( (blkif->blk_ring_area = alloc_vm_area(PAGE_SIZE)) == NULL )
58755 + return -ENOMEM;
58756 +
58757 + err = map_frontend_page(blkif, shared_page);
58758 + if (err) {
58759 + free_vm_area(blkif->blk_ring_area);
58760 + return err;
58761 + }
58762 +
58763 + bind_interdomain.remote_dom = blkif->domid;
58764 + bind_interdomain.remote_port = evtchn;
58765 +
58766 + err = HYPERVISOR_event_channel_op(EVTCHNOP_bind_interdomain,
58767 + &bind_interdomain);
58768 + if (err) {
58769 + unmap_frontend_page(blkif);
58770 + free_vm_area(blkif->blk_ring_area);
58771 + return err;
58772 + }
58773 +
58774 + blkif->evtchn = bind_interdomain.local_port;
58775 +
58776 + sring = (blkif_sring_t *)blkif->blk_ring_area->addr;
58777 + BACK_RING_INIT(&blkif->blk_ring, sring, PAGE_SIZE);
58778 +
58779 + blkif->irq = bind_evtchn_to_irqhandler(
58780 + blkif->evtchn, blkif_be_int, 0, "blkif-backend", blkif);
58781 +
58782 + return 0;
58783 +}
58784 +
58785 +void blkif_disconnect(blkif_t *blkif)
58786 +{
58787 + if (blkif->xenblkd) {
58788 + kthread_stop(blkif->xenblkd);
58789 + blkif->xenblkd = NULL;
58790 + }
58791 +
58792 + atomic_dec(&blkif->refcnt);
58793 + wait_event(blkif->waiting_to_free, atomic_read(&blkif->refcnt) == 0);
58794 + atomic_inc(&blkif->refcnt);
58795 +
58796 + if (blkif->irq) {
58797 + unbind_from_irqhandler(blkif->irq, blkif);
58798 + blkif->irq = 0;
58799 + }
58800 +
58801 + if (blkif->blk_ring.sring) {
58802 + unmap_frontend_page(blkif);
58803 + free_vm_area(blkif->blk_ring_area);
58804 + blkif->blk_ring.sring = NULL;
58805 + }
58806 +}
58807 +
58808 +void blkif_free(blkif_t *blkif)
58809 +{
58810 + if (!atomic_dec_and_test(&blkif->refcnt))
58811 + BUG();
58812 + kmem_cache_free(blkif_cachep, blkif);
58813 +}
58814 +
58815 +void __init blkif_interface_init(void)
58816 +{
58817 + blkif_cachep = kmem_cache_create("blkif_cache", sizeof(blkif_t),
58818 + 0, 0, NULL, NULL);
58819 +}
58820 diff -Nur linux-2.6.16.33-noxen/drivers/xen/blkback/vbd.c linux-2.6.16.33/drivers/xen/blkback/vbd.c
58821 --- linux-2.6.16.33-noxen/drivers/xen/blkback/vbd.c 1970-01-01 00:00:00.000000000 +0000
58822 +++ linux-2.6.16.33/drivers/xen/blkback/vbd.c 2007-01-08 15:00:45.000000000 +0000
58823 @@ -0,0 +1,118 @@
58824 +/******************************************************************************
58825 + * blkback/vbd.c
58826 + *
58827 + * Routines for managing virtual block devices (VBDs).
58828 + *
58829 + * Copyright (c) 2003-2005, Keir Fraser & Steve Hand
58830 + *
58831 + * This program is free software; you can redistribute it and/or
58832 + * modify it under the terms of the GNU General Public License version 2
58833 + * as published by the Free Software Foundation; or, when distributed
58834 + * separately from the Linux kernel or incorporated into other
58835 + * software packages, subject to the following license:
58836 + *
58837 + * Permission is hereby granted, free of charge, to any person obtaining a copy
58838 + * of this source file (the "Software"), to deal in the Software without
58839 + * restriction, including without limitation the rights to use, copy, modify,
58840 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
58841 + * and to permit persons to whom the Software is furnished to do so, subject to
58842 + * the following conditions:
58843 + *
58844 + * The above copyright notice and this permission notice shall be included in
58845 + * all copies or substantial portions of the Software.
58846 + *
58847 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
58848 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
58849 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
58850 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
58851 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
58852 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
58853 + * IN THE SOFTWARE.
58854 + */
58855 +
58856 +#include "common.h"
58857 +
58858 +#define vbd_sz(_v) ((_v)->bdev->bd_part ? \
58859 + (_v)->bdev->bd_part->nr_sects : (_v)->bdev->bd_disk->capacity)
58860 +
58861 +unsigned long long vbd_size(struct vbd *vbd)
58862 +{
58863 + return vbd_sz(vbd);
58864 +}
58865 +
58866 +unsigned int vbd_info(struct vbd *vbd)
58867 +{
58868 + return vbd->type | (vbd->readonly?VDISK_READONLY:0);
58869 +}
58870 +
58871 +unsigned long vbd_secsize(struct vbd *vbd)
58872 +{
58873 + return bdev_hardsect_size(vbd->bdev);
58874 +}
58875 +
58876 +int vbd_create(blkif_t *blkif, blkif_vdev_t handle, unsigned major,
58877 + unsigned minor, int readonly)
58878 +{
58879 + struct vbd *vbd;
58880 + struct block_device *bdev;
58881 +
58882 + vbd = &blkif->vbd;
58883 + vbd->handle = handle;
58884 + vbd->readonly = readonly;
58885 + vbd->type = 0;
58886 +
58887 + vbd->pdevice = MKDEV(major, minor);
58888 +
58889 + bdev = open_by_devnum(vbd->pdevice,
58890 + vbd->readonly ? FMODE_READ : FMODE_WRITE);
58891 +
58892 + if (IS_ERR(bdev)) {
58893 +		DPRINTK("vbd_create: device %08x could not be opened.\n",
58894 + vbd->pdevice);
58895 + return -ENOENT;
58896 + }
58897 +
58898 + vbd->bdev = bdev;
58899 +
58900 + if (vbd->bdev->bd_disk == NULL) {
58901 +		DPRINTK("vbd_create: device %08x doesn't exist.\n",
58902 + vbd->pdevice);
58903 + vbd_free(vbd);
58904 + return -ENOENT;
58905 + }
58906 +
58907 + if (vbd->bdev->bd_disk->flags & GENHD_FL_CD)
58908 + vbd->type |= VDISK_CDROM;
58909 + if (vbd->bdev->bd_disk->flags & GENHD_FL_REMOVABLE)
58910 + vbd->type |= VDISK_REMOVABLE;
58911 +
58912 + DPRINTK("Successful creation of handle=%04x (dom=%u)\n",
58913 + handle, blkif->domid);
58914 + return 0;
58915 +}
58916 +
58917 +void vbd_free(struct vbd *vbd)
58918 +{
58919 + if (vbd->bdev)
58920 + blkdev_put(vbd->bdev);
58921 + vbd->bdev = NULL;
58922 +}
58923 +
58924 +int vbd_translate(struct phys_req *req, blkif_t *blkif, int operation)
58925 +{
58926 + struct vbd *vbd = &blkif->vbd;
58927 + int rc = -EACCES;
58928 +
58929 + if ((operation != READ) && vbd->readonly)
58930 + goto out;
58931 +
58932 + if (unlikely((req->sector_number + req->nr_sects) > vbd_sz(vbd)))
58933 + goto out;
58934 +
58935 + req->dev = vbd->pdevice;
58936 + req->bdev = vbd->bdev;
58937 + rc = 0;
58938 +
58939 + out:
58940 + return rc;
58941 +}
58942 diff -Nur linux-2.6.16.33-noxen/drivers/xen/blkback/xenbus.c linux-2.6.16.33/drivers/xen/blkback/xenbus.c
58943 --- linux-2.6.16.33-noxen/drivers/xen/blkback/xenbus.c 1970-01-01 00:00:00.000000000 +0000
58944 +++ linux-2.6.16.33/drivers/xen/blkback/xenbus.c 2007-01-08 15:00:45.000000000 +0000
58945 @@ -0,0 +1,485 @@
58946 +/* Xenbus code for blkif backend
58947 + Copyright (C) 2005 Rusty Russell <rusty@rustcorp.com.au>
58948 + Copyright (C) 2005 XenSource Ltd
58949 +
58950 + This program is free software; you can redistribute it and/or modify
58951 + it under the terms of the GNU General Public License as published by
58952 + the Free Software Foundation; either version 2 of the License, or
58953 + (at your option) any later version.
58954 +
58955 + This program is distributed in the hope that it will be useful,
58956 + but WITHOUT ANY WARRANTY; without even the implied warranty of
58957 + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
58958 + GNU General Public License for more details.
58959 +
58960 + You should have received a copy of the GNU General Public License
58961 + along with this program; if not, write to the Free Software
58962 + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
58963 +*/
58964 +
58965 +#include <stdarg.h>
58966 +#include <linux/module.h>
58967 +#include <linux/kthread.h>
58968 +#include "common.h"
58969 +
58970 +#undef DPRINTK
58971 +#define DPRINTK(fmt, args...) \
58972 + pr_debug("blkback/xenbus (%s:%d) " fmt ".\n", \
58973 + __FUNCTION__, __LINE__, ##args)
58974 +
58975 +struct backend_info
58976 +{
58977 + struct xenbus_device *dev;
58978 + blkif_t *blkif;
58979 + struct xenbus_watch backend_watch;
58980 + unsigned major;
58981 + unsigned minor;
58982 + char *mode;
58983 +};
58984 +
58985 +static void connect(struct backend_info *);
58986 +static int connect_ring(struct backend_info *);
58987 +static void backend_changed(struct xenbus_watch *, const char **,
58988 + unsigned int);
58989 +
58990 +static void update_blkif_status(blkif_t *blkif)
58991 +{
58992 + int err;
58993 +
58994 + /* Not ready to connect? */
58995 + if (!blkif->irq || !blkif->vbd.bdev)
58996 + return;
58997 +
58998 + /* Already connected? */
58999 + if (blkif->be->dev->state == XenbusStateConnected)
59000 + return;
59001 +
59002 + /* Attempt to connect: exit if we fail to. */
59003 + connect(blkif->be);
59004 + if (blkif->be->dev->state != XenbusStateConnected)
59005 + return;
59006 +
59007 + blkif->xenblkd = kthread_run(blkif_schedule, blkif,
59008 + "xvd %d %02x:%02x",
59009 + blkif->domid,
59010 + blkif->be->major, blkif->be->minor);
59011 + if (IS_ERR(blkif->xenblkd)) {
59012 + err = PTR_ERR(blkif->xenblkd);
59013 + blkif->xenblkd = NULL;
59014 + xenbus_dev_error(blkif->be->dev, err, "start xenblkd");
59015 + }
59016 +}
59017 +
59018 +
59019 +/****************************************************************
59020 + * sysfs interface for VBD I/O requests
59021 + */
59022 +
59023 +#define VBD_SHOW(name, format, args...) \
59024 + static ssize_t show_##name(struct device *_dev, \
59025 + struct device_attribute *attr, \
59026 + char *buf) \
59027 + { \
59028 + struct xenbus_device *dev = to_xenbus_device(_dev); \
59029 + struct backend_info *be = dev->dev.driver_data; \
59030 + \
59031 + return sprintf(buf, format, ##args); \
59032 + } \
59033 + DEVICE_ATTR(name, S_IRUGO, show_##name, NULL)
59034 +
59035 +VBD_SHOW(oo_req, "%d\n", be->blkif->st_oo_req);
59036 +VBD_SHOW(rd_req, "%d\n", be->blkif->st_rd_req);
59037 +VBD_SHOW(wr_req, "%d\n", be->blkif->st_wr_req);
59038 +VBD_SHOW(br_req, "%d\n", be->blkif->st_br_req);
59039 +
59040 +static struct attribute *vbdstat_attrs[] = {
59041 + &dev_attr_oo_req.attr,
59042 + &dev_attr_rd_req.attr,
59043 + &dev_attr_wr_req.attr,
59044 + &dev_attr_br_req.attr,
59045 + NULL
59046 +};
59047 +
59048 +static struct attribute_group vbdstat_group = {
59049 + .name = "statistics",
59050 + .attrs = vbdstat_attrs,
59051 +};
59052 +
59053 +VBD_SHOW(physical_device, "%x:%x\n", be->major, be->minor);
59054 +VBD_SHOW(mode, "%s\n", be->mode);
59055 +
59056 +int xenvbd_sysfs_addif(struct xenbus_device *dev)
59057 +{
59058 + int error;
59059 +
59060 + error = device_create_file(&dev->dev, &dev_attr_physical_device);
59061 + if (error)
59062 + goto fail1;
59063 +
59064 + error = device_create_file(&dev->dev, &dev_attr_mode);
59065 + if (error)
59066 + goto fail2;
59067 +
59068 + error = sysfs_create_group(&dev->dev.kobj, &vbdstat_group);
59069 + if (error)
59070 + goto fail3;
59071 +
59072 + return 0;
59073 +
59074 +fail3: sysfs_remove_group(&dev->dev.kobj, &vbdstat_group);
59075 +fail2: device_remove_file(&dev->dev, &dev_attr_mode);
59076 +fail1: device_remove_file(&dev->dev, &dev_attr_physical_device);
59077 + return error;
59078 +}
59079 +
59080 +void xenvbd_sysfs_delif(struct xenbus_device *dev)
59081 +{
59082 + sysfs_remove_group(&dev->dev.kobj, &vbdstat_group);
59083 + device_remove_file(&dev->dev, &dev_attr_mode);
59084 + device_remove_file(&dev->dev, &dev_attr_physical_device);
59085 +}
59086 +
59087 +static int blkback_remove(struct xenbus_device *dev)
59088 +{
59089 + struct backend_info *be = dev->dev.driver_data;
59090 +
59091 + DPRINTK("");
59092 +
59093 + if (be->backend_watch.node) {
59094 + unregister_xenbus_watch(&be->backend_watch);
59095 + kfree(be->backend_watch.node);
59096 + be->backend_watch.node = NULL;
59097 + }
59098 +
59099 + if (be->blkif) {
59100 + blkif_disconnect(be->blkif);
59101 + vbd_free(&be->blkif->vbd);
59102 + blkif_free(be->blkif);
59103 + be->blkif = NULL;
59104 + }
59105 +
59106 + if (be->major || be->minor)
59107 + xenvbd_sysfs_delif(dev);
59108 +
59109 + kfree(be);
59110 + dev->dev.driver_data = NULL;
59111 + return 0;
59112 +}
59113 +
59114 +int blkback_barrier(struct xenbus_transaction xbt,
59115 + struct backend_info *be, int state)
59116 +{
59117 + struct xenbus_device *dev = be->dev;
59118 + int err;
59119 +
59120 + err = xenbus_printf(xbt, dev->nodename, "feature-barrier",
59121 + "%d", state);
59122 + if (err)
59123 + xenbus_dev_fatal(dev, err, "writing feature-barrier");
59124 +
59125 + return err;
59126 +}
59127 +
59128 +/**
59129 + * Entry point to this code when a new device is created. Allocate the basic
59130 + * structures, and watch the store waiting for the hotplug scripts to tell us
59131 + * the device's physical major and minor numbers. Switch to InitWait.
59132 + */
59133 +static int blkback_probe(struct xenbus_device *dev,
59134 + const struct xenbus_device_id *id)
59135 +{
59136 + int err;
59137 + struct backend_info *be = kzalloc(sizeof(struct backend_info),
59138 + GFP_KERNEL);
59139 + if (!be) {
59140 + xenbus_dev_fatal(dev, -ENOMEM,
59141 + "allocating backend structure");
59142 + return -ENOMEM;
59143 + }
59144 + be->dev = dev;
59145 + dev->dev.driver_data = be;
59146 +
59147 + be->blkif = blkif_alloc(dev->otherend_id);
59148 + if (IS_ERR(be->blkif)) {
59149 + err = PTR_ERR(be->blkif);
59150 + be->blkif = NULL;
59151 + xenbus_dev_fatal(dev, err, "creating block interface");
59152 + goto fail;
59153 + }
59154 +
59155 + /* setup back pointer */
59156 + be->blkif->be = be;
59157 +
59158 + err = xenbus_watch_path2(dev, dev->nodename, "physical-device",
59159 + &be->backend_watch, backend_changed);
59160 + if (err)
59161 + goto fail;
59162 +
59163 + err = xenbus_switch_state(dev, XenbusStateInitWait);
59164 + if (err)
59165 + goto fail;
59166 +
59167 + return 0;
59168 +
59169 +fail:
59170 + DPRINTK("failed");
59171 + blkback_remove(dev);
59172 + return err;
59173 +}
59174 +
59175 +
59176 +/**
59177 + * Callback received when the hotplug scripts have placed the physical-device
59178 + * node. Read it and the mode node, and create a vbd. If the frontend is
59179 + * ready, connect.
59180 + */
59181 +static void backend_changed(struct xenbus_watch *watch,
59182 + const char **vec, unsigned int len)
59183 +{
59184 + int err;
59185 + unsigned major;
59186 + unsigned minor;
59187 + struct backend_info *be
59188 + = container_of(watch, struct backend_info, backend_watch);
59189 + struct xenbus_device *dev = be->dev;
59190 +
59191 + DPRINTK("");
59192 +
59193 + err = xenbus_scanf(XBT_NIL, dev->nodename, "physical-device", "%x:%x",
59194 + &major, &minor);
59195 + if (XENBUS_EXIST_ERR(err)) {
59196 + /* Since this watch will fire once immediately after it is
59197 + registered, we expect this. Ignore it, and wait for the
59198 + hotplug scripts. */
59199 + return;
59200 + }
59201 + if (err != 2) {
59202 + xenbus_dev_fatal(dev, err, "reading physical-device");
59203 + return;
59204 + }
59205 +
59206 + if ((be->major || be->minor) &&
59207 + ((be->major != major) || (be->minor != minor))) {
59208 + printk(KERN_WARNING
59209 + "blkback: changing physical device (from %x:%x to "
59210 + "%x:%x) not supported.\n", be->major, be->minor,
59211 + major, minor);
59212 + return;
59213 + }
59214 +
59215 + be->mode = xenbus_read(XBT_NIL, dev->nodename, "mode", NULL);
59216 + if (IS_ERR(be->mode)) {
59217 + err = PTR_ERR(be->mode);
59218 + be->mode = NULL;
59219 + xenbus_dev_fatal(dev, err, "reading mode");
59220 + return;
59221 + }
59222 +
59223 + if (be->major == 0 && be->minor == 0) {
59224 + /* Front end dir is a number, which is used as the handle. */
59225 +
59226 + char *p = strrchr(dev->otherend, '/') + 1;
59227 + long handle = simple_strtoul(p, NULL, 0);
59228 +
59229 + be->major = major;
59230 + be->minor = minor;
59231 +
59232 + err = vbd_create(be->blkif, handle, major, minor,
59233 + (NULL == strchr(be->mode, 'w')));
59234 + if (err) {
59235 + be->major = be->minor = 0;
59236 + xenbus_dev_fatal(dev, err, "creating vbd structure");
59237 + return;
59238 + }
59239 +
59240 + err = xenvbd_sysfs_addif(dev);
59241 + if (err) {
59242 + vbd_free(&be->blkif->vbd);
59243 + be->major = be->minor = 0;
59244 + xenbus_dev_fatal(dev, err, "creating sysfs entries");
59245 + return;
59246 + }
59247 +
59248 + /* We're potentially connected now */
59249 + update_blkif_status(be->blkif);
59250 + }
59251 +}
59252 +
59253 +
59254 +/**
59255 + * Callback received when the frontend's state changes.
59256 + */
59257 +static void frontend_changed(struct xenbus_device *dev,
59258 + enum xenbus_state frontend_state)
59259 +{
59260 + struct backend_info *be = dev->dev.driver_data;
59261 + int err;
59262 +
59263 + DPRINTK("%s", xenbus_strstate(frontend_state));
59264 +
59265 + switch (frontend_state) {
59266 + case XenbusStateInitialising:
59267 + if (dev->state == XenbusStateClosed) {
59268 + printk("%s: %s: prepare for reconnect\n",
59269 + __FUNCTION__, dev->nodename);
59270 + xenbus_switch_state(dev, XenbusStateInitWait);
59271 + }
59272 + break;
59273 +
59274 + case XenbusStateInitialised:
59275 + case XenbusStateConnected:
59276 + /* Ensure we connect even when two watches fire in
59277 +		   close succession and we miss the intermediate value
59278 + of frontend_state. */
59279 + if (dev->state == XenbusStateConnected)
59280 + break;
59281 +
59282 + err = connect_ring(be);
59283 + if (err)
59284 + break;
59285 + update_blkif_status(be->blkif);
59286 + break;
59287 +
59288 + case XenbusStateClosing:
59289 + blkif_disconnect(be->blkif);
59290 + xenbus_switch_state(dev, XenbusStateClosing);
59291 + break;
59292 +
59293 + case XenbusStateClosed:
59294 + xenbus_switch_state(dev, XenbusStateClosed);
59295 + if (xenbus_dev_is_online(dev))
59296 + break;
59297 + /* fall through if not online */
59298 + case XenbusStateUnknown:
59299 + device_unregister(&dev->dev);
59300 + break;
59301 +
59302 + default:
59303 + xenbus_dev_fatal(dev, -EINVAL, "saw state %d at frontend",
59304 + frontend_state);
59305 + break;
59306 + }
59307 +}
59308 +
59309 +
59310 +/* ** Connection ** */
59311 +
59312 +
59313 +/**
59314 + * Write the physical details regarding the block device to the store, and
59315 + * switch to Connected state.
59316 + */
59317 +static void connect(struct backend_info *be)
59318 +{
59319 + struct xenbus_transaction xbt;
59320 + int err;
59321 + struct xenbus_device *dev = be->dev;
59322 +
59323 + DPRINTK("%s", dev->otherend);
59324 +
59325 + /* Supply the information about the device the frontend needs */
59326 +again:
59327 + err = xenbus_transaction_start(&xbt);
59328 + if (err) {
59329 + xenbus_dev_fatal(dev, err, "starting transaction");
59330 + return;
59331 + }
59332 +
59333 + err = blkback_barrier(xbt, be, 1);
59334 + if (err)
59335 + goto abort;
59336 +
59337 + err = xenbus_printf(xbt, dev->nodename, "sectors", "%llu",
59338 + vbd_size(&be->blkif->vbd));
59339 + if (err) {
59340 + xenbus_dev_fatal(dev, err, "writing %s/sectors",
59341 + dev->nodename);
59342 + goto abort;
59343 + }
59344 +
59345 + /* FIXME: use a typename instead */
59346 + err = xenbus_printf(xbt, dev->nodename, "info", "%u",
59347 + vbd_info(&be->blkif->vbd));
59348 + if (err) {
59349 + xenbus_dev_fatal(dev, err, "writing %s/info",
59350 + dev->nodename);
59351 + goto abort;
59352 + }
59353 + err = xenbus_printf(xbt, dev->nodename, "sector-size", "%lu",
59354 + vbd_secsize(&be->blkif->vbd));
59355 + if (err) {
59356 + xenbus_dev_fatal(dev, err, "writing %s/sector-size",
59357 + dev->nodename);
59358 + goto abort;
59359 + }
59360 +
59361 + err = xenbus_transaction_end(xbt, 0);
59362 + if (err == -EAGAIN)
59363 + goto again;
59364 + if (err)
59365 + xenbus_dev_fatal(dev, err, "ending transaction");
59366 +
59367 + err = xenbus_switch_state(dev, XenbusStateConnected);
59368 + if (err)
59369 + xenbus_dev_fatal(dev, err, "switching to Connected state",
59370 + dev->nodename);
59371 +
59372 + return;
59373 + abort:
59374 + xenbus_transaction_end(xbt, 1);
59375 +}
59376 +
59377 +
59378 +static int connect_ring(struct backend_info *be)
59379 +{
59380 + struct xenbus_device *dev = be->dev;
59381 + unsigned long ring_ref;
59382 + unsigned int evtchn;
59383 + int err;
59384 +
59385 + DPRINTK("%s", dev->otherend);
59386 +
59387 + err = xenbus_gather(XBT_NIL, dev->otherend, "ring-ref", "%lu", &ring_ref,
59388 + "event-channel", "%u", &evtchn, NULL);
59389 + if (err) {
59390 + xenbus_dev_fatal(dev, err,
59391 + "reading %s/ring-ref and event-channel",
59392 + dev->otherend);
59393 + return err;
59394 + }
59395 +
59396 + /* Map the shared frame, irq etc. */
59397 + err = blkif_map(be->blkif, ring_ref, evtchn);
59398 + if (err) {
59399 + xenbus_dev_fatal(dev, err, "mapping ring-ref %lu port %u",
59400 + ring_ref, evtchn);
59401 + return err;
59402 + }
59403 +
59404 + return 0;
59405 +}
59406 +
59407 +
59408 +/* ** Driver Registration ** */
59409 +
59410 +
59411 +static struct xenbus_device_id blkback_ids[] = {
59412 + { "vbd" },
59413 + { "" }
59414 +};
59415 +
59416 +
59417 +static struct xenbus_driver blkback = {
59418 + .name = "vbd",
59419 + .owner = THIS_MODULE,
59420 + .ids = blkback_ids,
59421 + .probe = blkback_probe,
59422 + .remove = blkback_remove,
59423 + .otherend_changed = frontend_changed
59424 +};
59425 +
59426 +
59427 +void blkif_xenbus_init(void)
59428 +{
59429 + xenbus_register_backend(&blkback);
59430 +}
59431 diff -Nur linux-2.6.16.33-noxen/drivers/xen/blkfront/Makefile linux-2.6.16.33/drivers/xen/blkfront/Makefile
59432 --- linux-2.6.16.33-noxen/drivers/xen/blkfront/Makefile 1970-01-01 00:00:00.000000000 +0000
59433 +++ linux-2.6.16.33/drivers/xen/blkfront/Makefile 2007-01-08 15:00:45.000000000 +0000
59434 @@ -0,0 +1,5 @@
59435 +
59436 +obj-$(CONFIG_XEN_BLKDEV_FRONTEND) := xenblk.o
59437 +
59438 +xenblk-objs := blkfront.o vbd.o
59439 +
59440 diff -Nur linux-2.6.16.33-noxen/drivers/xen/blkfront/blkfront.c linux-2.6.16.33/drivers/xen/blkfront/blkfront.c
59441 --- linux-2.6.16.33-noxen/drivers/xen/blkfront/blkfront.c 1970-01-01 00:00:00.000000000 +0000
59442 +++ linux-2.6.16.33/drivers/xen/blkfront/blkfront.c 2007-01-08 15:00:45.000000000 +0000
59443 @@ -0,0 +1,891 @@
59444 +/******************************************************************************
59445 + * blkfront.c
59446 + *
59447 + * XenLinux virtual block-device driver.
59448 + *
59449 + * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
59450 + * Modifications by Mark A. Williamson are (c) Intel Research Cambridge
59451 + * Copyright (c) 2004, Christian Limpach
59452 + * Copyright (c) 2004, Andrew Warfield
59453 + * Copyright (c) 2005, Christopher Clark
59454 + * Copyright (c) 2005, XenSource Ltd
59455 + *
59456 + * This program is free software; you can redistribute it and/or
59457 + * modify it under the terms of the GNU General Public License version 2
59458 + * as published by the Free Software Foundation; or, when distributed
59459 + * separately from the Linux kernel or incorporated into other
59460 + * software packages, subject to the following license:
59461 + *
59462 + * Permission is hereby granted, free of charge, to any person obtaining a copy
59463 + * of this source file (the "Software"), to deal in the Software without
59464 + * restriction, including without limitation the rights to use, copy, modify,
59465 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
59466 + * and to permit persons to whom the Software is furnished to do so, subject to
59467 + * the following conditions:
59468 + *
59469 + * The above copyright notice and this permission notice shall be included in
59470 + * all copies or substantial portions of the Software.
59471 + *
59472 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
59473 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
59474 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
59475 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
59476 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
59477 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
59478 + * IN THE SOFTWARE.
59479 + */
59480 +
59481 +#include <linux/version.h>
59482 +#include "block.h"
59483 +#include <linux/cdrom.h>
59484 +#include <linux/sched.h>
59485 +#include <linux/interrupt.h>
59486 +#include <scsi/scsi.h>
59487 +#include <xen/evtchn.h>
59488 +#include <xen/xenbus.h>
59489 +#include <xen/interface/grant_table.h>
59490 +#include <xen/gnttab.h>
59491 +#include <asm/hypervisor.h>
59492 +#include <asm/maddr.h>
59493 +
59494 +#ifdef HAVE_XEN_PLATFORM_COMPAT_H
59495 +#include <xen/platform-compat.h>
59496 +#endif
59497 +
59498 +#define BLKIF_STATE_DISCONNECTED 0
59499 +#define BLKIF_STATE_CONNECTED 1
59500 +#define BLKIF_STATE_SUSPENDED 2
59501 +
59502 +#define MAXIMUM_OUTSTANDING_BLOCK_REQS \
59503 + (BLKIF_MAX_SEGMENTS_PER_REQUEST * BLK_RING_SIZE)
59504 +#define GRANT_INVALID_REF 0
59505 +
59506 +static void connect(struct blkfront_info *);
59507 +static void blkfront_closing(struct xenbus_device *);
59508 +static int blkfront_remove(struct xenbus_device *);
59509 +static int talk_to_backend(struct xenbus_device *, struct blkfront_info *);
59510 +static int setup_blkring(struct xenbus_device *, struct blkfront_info *);
59511 +
59512 +static void kick_pending_request_queues(struct blkfront_info *);
59513 +
59514 +static irqreturn_t blkif_int(int irq, void *dev_id, struct pt_regs *ptregs);
59515 +static void blkif_restart_queue(void *arg);
59516 +static void blkif_recover(struct blkfront_info *);
59517 +static void blkif_completion(struct blk_shadow *);
59518 +static void blkif_free(struct blkfront_info *, int);
59519 +
59520 +
59521 +/**
59522 + * Entry point to this code when a new device is created. Allocate the basic
59523 + * structures and the ring buffer for communication with the backend, and
59524 + * inform the backend of the appropriate details for those. Switch to
59525 + * Initialised state.
59526 + */
59527 +static int blkfront_probe(struct xenbus_device *dev,
59528 + const struct xenbus_device_id *id)
59529 +{
59530 + int err, vdevice, i;
59531 + struct blkfront_info *info;
59532 +
59533 + /* FIXME: Use dynamic device id if this is not set. */
59534 + err = xenbus_scanf(XBT_NIL, dev->nodename,
59535 + "virtual-device", "%i", &vdevice);
59536 + if (err != 1) {
59537 + xenbus_dev_fatal(dev, err, "reading virtual-device");
59538 + return err;
59539 + }
59540 +
59541 + info = kzalloc(sizeof(*info), GFP_KERNEL);
59542 + if (!info) {
59543 + xenbus_dev_fatal(dev, -ENOMEM, "allocating info structure");
59544 + return -ENOMEM;
59545 + }
59546 +
59547 + info->xbdev = dev;
59548 + info->vdevice = vdevice;
59549 + info->connected = BLKIF_STATE_DISCONNECTED;
59550 + INIT_WORK(&info->work, blkif_restart_queue, (void *)info);
59551 +
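+	/* Chain the shadow entries into a free list: each free entry's
+	 * req.id holds the index of the next free slot and a sentinel
+	 * value terminates the list (see GET_ID_FROM_FREELIST below). */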
59552 + for (i = 0; i < BLK_RING_SIZE; i++)
59553 + info->shadow[i].req.id = i+1;
59554 + info->shadow[BLK_RING_SIZE-1].req.id = 0x0fffffff;
59555 +
59556 + /* Front end dir is a number, which is used as the id. */
59557 + info->handle = simple_strtoul(strrchr(dev->nodename,'/')+1, NULL, 0);
59558 + dev->dev.driver_data = info;
59559 +
59560 + err = talk_to_backend(dev, info);
59561 + if (err) {
59562 + kfree(info);
59563 + dev->dev.driver_data = NULL;
59564 + return err;
59565 + }
59566 +
59567 + return 0;
59568 +}
59569 +
59570 +
59571 +/**
59572 + * We are reconnecting to the backend, due to a suspend/resume, or a backend
59573 + * driver restart. We tear down our blkif structure and recreate it, but
59574 + * leave the device-layer structures intact so that this is transparent to the
59575 + * rest of the kernel.
59576 + */
59577 +static int blkfront_resume(struct xenbus_device *dev)
59578 +{
59579 + struct blkfront_info *info = dev->dev.driver_data;
59580 + int err;
59581 +
59582 + DPRINTK("blkfront_resume: %s\n", dev->nodename);
59583 +
59584 + blkif_free(info, info->connected == BLKIF_STATE_CONNECTED);
59585 +
59586 + err = talk_to_backend(dev, info);
59587 + if (info->connected == BLKIF_STATE_SUSPENDED && !err)
59588 + blkif_recover(info);
59589 +
59590 + return err;
59591 +}
59592 +
59593 +
59594 +/* Common code used when first setting up, and when resuming. */
59595 +static int talk_to_backend(struct xenbus_device *dev,
59596 + struct blkfront_info *info)
59597 +{
59598 + const char *message = NULL;
59599 + struct xenbus_transaction xbt;
59600 + int err;
59601 +
59602 + /* Create shared ring, alloc event channel. */
59603 + err = setup_blkring(dev, info);
59604 + if (err)
59605 + goto out;
59606 +
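+	/* Publish ring-ref and event-channel in a single xenbus
+	 * transaction, retrying the whole transaction if it raced with
+	 * another writer and ended with -EAGAIN. */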
59607 +again:
59608 + err = xenbus_transaction_start(&xbt);
59609 + if (err) {
59610 + xenbus_dev_fatal(dev, err, "starting transaction");
59611 + goto destroy_blkring;
59612 + }
59613 +
59614 + err = xenbus_printf(xbt, dev->nodename,
59615 + "ring-ref","%u", info->ring_ref);
59616 + if (err) {
59617 + message = "writing ring-ref";
59618 + goto abort_transaction;
59619 + }
59620 + err = xenbus_printf(xbt, dev->nodename,
59621 + "event-channel", "%u", info->evtchn);
59622 + if (err) {
59623 + message = "writing event-channel";
59624 + goto abort_transaction;
59625 + }
59626 +
59627 + err = xenbus_transaction_end(xbt, 0);
59628 + if (err) {
59629 + if (err == -EAGAIN)
59630 + goto again;
59631 + xenbus_dev_fatal(dev, err, "completing transaction");
59632 + goto destroy_blkring;
59633 + }
59634 +
59635 + xenbus_switch_state(dev, XenbusStateInitialised);
59636 +
59637 + return 0;
59638 +
59639 + abort_transaction:
59640 + xenbus_transaction_end(xbt, 1);
59641 + if (message)
59642 + xenbus_dev_fatal(dev, err, "%s", message);
59643 + destroy_blkring:
59644 + blkif_free(info, 0);
59645 + out:
59646 + return err;
59647 +}
59648 +
59649 +
59650 +static int setup_blkring(struct xenbus_device *dev,
59651 + struct blkfront_info *info)
59652 +{
59653 + blkif_sring_t *sring;
59654 + int err;
59655 +
59656 + info->ring_ref = GRANT_INVALID_REF;
59657 +
59658 + sring = (blkif_sring_t *)__get_free_page(GFP_KERNEL);
59659 + if (!sring) {
59660 + xenbus_dev_fatal(dev, -ENOMEM, "allocating shared ring");
59661 + return -ENOMEM;
59662 + }
59663 + SHARED_RING_INIT(sring);
59664 + FRONT_RING_INIT(&info->ring, sring, PAGE_SIZE);
59665 +
59666 + err = xenbus_grant_ring(dev, virt_to_mfn(info->ring.sring));
59667 + if (err < 0) {
59668 + free_page((unsigned long)sring);
59669 + info->ring.sring = NULL;
59670 + goto fail;
59671 + }
59672 + info->ring_ref = err;
59673 +
59674 + err = xenbus_alloc_evtchn(dev, &info->evtchn);
59675 + if (err)
59676 + goto fail;
59677 +
59678 + err = bind_evtchn_to_irqhandler(
59679 + info->evtchn, blkif_int, SA_SAMPLE_RANDOM, "blkif", info);
59680 + if (err <= 0) {
59681 + xenbus_dev_fatal(dev, err,
59682 + "bind_evtchn_to_irqhandler failed");
59683 + goto fail;
59684 + }
59685 + info->irq = err;
59686 +
59687 + return 0;
59688 +fail:
59689 + blkif_free(info, 0);
59690 + return err;
59691 +}
59692 +
59693 +
59694 +/**
59695 + * Callback received when the backend's state changes.
59696 + */
59697 +static void backend_changed(struct xenbus_device *dev,
59698 + enum xenbus_state backend_state)
59699 +{
59700 + struct blkfront_info *info = dev->dev.driver_data;
59701 + struct block_device *bd;
59702 +
59703 + DPRINTK("blkfront:backend_changed.\n");
59704 +
59705 + switch (backend_state) {
59706 + case XenbusStateInitialising:
59707 + case XenbusStateInitWait:
59708 + case XenbusStateInitialised:
59709 + case XenbusStateUnknown:
59710 + case XenbusStateClosed:
59711 + break;
59712 +
59713 + case XenbusStateConnected:
59714 + connect(info);
59715 + break;
59716 +
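+	/* The backend has started closing: tear the device down only if
+	 * nobody still holds it open, otherwise report it as busy. */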
59717 + case XenbusStateClosing:
59718 + bd = bdget(info->dev);
59719 + if (bd == NULL)
59720 + xenbus_dev_fatal(dev, -ENODEV, "bdget failed");
59721 +
59722 + down(&bd->bd_sem);
59723 + if (info->users > 0)
59724 + xenbus_dev_error(dev, -EBUSY,
59725 + "Device in use; refusing to close");
59726 + else
59727 + blkfront_closing(dev);
59728 + up(&bd->bd_sem);
59729 + bdput(bd);
59730 + break;
59731 + }
59732 +}
59733 +
59734 +
59735 +/* ** Connection ** */
59736 +
59737 +
59738 +/*
59739 + * Invoked when the backend is finally 'ready' (and has provided
59740 + * the details about the physical device - #sectors, size, etc).
59741 + */
59742 +static void connect(struct blkfront_info *info)
59743 +{
59744 + unsigned long long sectors;
59745 + unsigned long sector_size;
59746 + unsigned int binfo;
59747 + int err;
59748 +
59749 + if ((info->connected == BLKIF_STATE_CONNECTED) ||
59750 + (info->connected == BLKIF_STATE_SUSPENDED) )
59751 + return;
59752 +
59753 + DPRINTK("blkfront.c:connect:%s.\n", info->xbdev->otherend);
59754 +
59755 + err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
59756 + "sectors", "%Lu", &sectors,
59757 + "info", "%u", &binfo,
59758 + "sector-size", "%lu", &sector_size,
59759 + NULL);
59760 + if (err) {
59761 + xenbus_dev_fatal(info->xbdev, err,
59762 + "reading backend fields at %s",
59763 + info->xbdev->otherend);
59764 + return;
59765 + }
59766 +
59767 + err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
59768 + "feature-barrier", "%lu", &info->feature_barrier,
59769 + NULL);
59770 + if (err)
59771 + info->feature_barrier = 0;
59772 +
59773 + err = xlvbd_add(sectors, info->vdevice, binfo, sector_size, info);
59774 + if (err) {
59775 + xenbus_dev_fatal(info->xbdev, err, "xlvbd_add at %s",
59776 + info->xbdev->otherend);
59777 + return;
59778 + }
59779 +
59780 + (void)xenbus_switch_state(info->xbdev, XenbusStateConnected);
59781 +
59782 + /* Kick pending requests. */
59783 + spin_lock_irq(&blkif_io_lock);
59784 + info->connected = BLKIF_STATE_CONNECTED;
59785 + kick_pending_request_queues(info);
59786 + spin_unlock_irq(&blkif_io_lock);
59787 +
59788 + add_disk(info->gd);
59789 +}
59790 +
59791 +/**
59792 + * Handle the change of state of the backend to Closing. We must delete our
59793 + * device-layer structures now, to ensure that writes are flushed through to
59794 + * the backend. Once this is done, we can switch to Closed in
59795 + * acknowledgement.
59796 + */
59797 +static void blkfront_closing(struct xenbus_device *dev)
59798 +{
59799 + struct blkfront_info *info = dev->dev.driver_data;
59800 + unsigned long flags;
59801 +
59802 + DPRINTK("blkfront_closing: %s removed\n", dev->nodename);
59803 +
59804 + if (info->rq == NULL)
59805 + goto out;
59806 +
59807 + spin_lock_irqsave(&blkif_io_lock, flags);
59808 + /* No more blkif_request(). */
59809 + blk_stop_queue(info->rq);
59810 + /* No more gnttab callback work. */
59811 + gnttab_cancel_free_callback(&info->callback);
59812 + spin_unlock_irqrestore(&blkif_io_lock, flags);
59813 +
59814 + /* Flush gnttab callback work. Must be done with no locks held. */
59815 + flush_scheduled_work();
59816 +
59817 + xlvbd_del(info);
59818 +
59819 + out:
59820 + xenbus_frontend_closed(dev);
59821 +}
59822 +
59823 +
59824 +static int blkfront_remove(struct xenbus_device *dev)
59825 +{
59826 + struct blkfront_info *info = dev->dev.driver_data;
59827 +
59828 + DPRINTK("blkfront_remove: %s removed\n", dev->nodename);
59829 +
59830 + blkif_free(info, 0);
59831 +
59832 + kfree(info);
59833 +
59834 + return 0;
59835 +}
59836 +
59837 +
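+/*
+ * The shadow array doubles as a free list of request slots: shadow_free
+ * indexes the first free slot and each free slot's req.id points at the
+ * next one, so ids can be handed out and recycled in O(1).
+ */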
59838 +static inline int GET_ID_FROM_FREELIST(
59839 + struct blkfront_info *info)
59840 +{
59841 + unsigned long free = info->shadow_free;
59842 + BUG_ON(free > BLK_RING_SIZE);
59843 + info->shadow_free = info->shadow[free].req.id;
59844 + info->shadow[free].req.id = 0x0fffffee; /* debug */
59845 + return free;
59846 +}
59847 +
59848 +static inline void ADD_ID_TO_FREELIST(
59849 + struct blkfront_info *info, unsigned long id)
59850 +{
59851 + info->shadow[id].req.id = info->shadow_free;
59852 + info->shadow[id].request = 0;
59853 + info->shadow_free = id;
59854 +}
59855 +
59856 +static inline void flush_requests(struct blkfront_info *info)
59857 +{
59858 + int notify;
59859 +
59860 + RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&info->ring, notify);
59861 +
59862 + if (notify)
59863 + notify_remote_via_irq(info->irq);
59864 +}
59865 +
59866 +static void kick_pending_request_queues(struct blkfront_info *info)
59867 +{
59868 + if (!RING_FULL(&info->ring)) {
59869 + /* Re-enable calldowns. */
59870 + blk_start_queue(info->rq);
59871 + /* Kick things off immediately. */
59872 + do_blkif_request(info->rq);
59873 + }
59874 +}
59875 +
59876 +static void blkif_restart_queue(void *arg)
59877 +{
59878 + struct blkfront_info *info = (struct blkfront_info *)arg;
59879 + spin_lock_irq(&blkif_io_lock);
59880 + if (info->connected == BLKIF_STATE_CONNECTED)
59881 + kick_pending_request_queues(info);
59882 + spin_unlock_irq(&blkif_io_lock);
59883 +}
59884 +
59885 +static void blkif_restart_queue_callback(void *arg)
59886 +{
59887 + struct blkfront_info *info = (struct blkfront_info *)arg;
59888 + schedule_work(&info->work);
59889 +}
59890 +
59891 +int blkif_open(struct inode *inode, struct file *filep)
59892 +{
59893 + struct blkfront_info *info = inode->i_bdev->bd_disk->private_data;
59894 + info->users++;
59895 + return 0;
59896 +}
59897 +
59898 +
59899 +int blkif_release(struct inode *inode, struct file *filep)
59900 +{
59901 + struct blkfront_info *info = inode->i_bdev->bd_disk->private_data;
59902 + info->users--;
59903 + if (info->users == 0) {
59904 + /* Check whether we have been instructed to close. We will
59905 + have ignored this request initially, as the device was
59906 + still mounted. */
59907 + struct xenbus_device * dev = info->xbdev;
59908 + enum xenbus_state state = xenbus_read_driver_state(dev->otherend);
59909 +
59910 + if (state == XenbusStateClosing)
59911 + blkfront_closing(dev);
59912 + }
59913 + return 0;
59914 +}
59915 +
59916 +
59917 +int blkif_ioctl(struct inode *inode, struct file *filep,
59918 + unsigned command, unsigned long argument)
59919 +{
59920 + int i;
59921 +
59922 + DPRINTK_IOCTL("command: 0x%x, argument: 0x%lx, dev: 0x%04x\n",
59923 + command, (long)argument, inode->i_rdev);
59924 +
59925 + switch (command) {
59926 +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,16)
59927 + case HDIO_GETGEO: {
59928 + struct block_device *bd = inode->i_bdev;
59929 + struct hd_geometry geo;
59930 + int ret;
59931 +
59932 + if (!argument)
59933 + return -EINVAL;
59934 +
59935 + geo.start = get_start_sect(bd);
59936 + ret = blkif_getgeo(bd, &geo);
59937 + if (ret)
59938 + return ret;
59939 +
59940 + if (copy_to_user((struct hd_geometry __user *)argument, &geo,
59941 + sizeof(geo)))
59942 + return -EFAULT;
59943 +
59944 + return 0;
59945 + }
59946 +#endif
59947 + case CDROMMULTISESSION:
59948 + DPRINTK("FIXME: support multisession CDs later\n");
59949 + for (i = 0; i < sizeof(struct cdrom_multisession); i++)
59950 + if (put_user(0, (char __user *)(argument + i)))
59951 + return -EFAULT;
59952 + return 0;
59953 +
59954 + default:
59955 + /*printk(KERN_ALERT "ioctl %08x not supported by Xen blkdev\n",
59956 + command);*/
59957 + return -EINVAL; /* same return as native Linux */
59958 + }
59959 +
59960 + return 0;
59961 +}
59962 +
59963 +
59964 +int blkif_getgeo(struct block_device *bd, struct hd_geometry *hg)
59965 +{
59966 + /* We don't have real geometry info, but let's at least return
59967 + values consistent with the size of the device */
59968 + sector_t nsect = get_capacity(bd->bd_disk);
59969 + sector_t cylinders = nsect;
59970 +
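+	/* Advertise the traditional maximal CHS geometry (255 heads, 63
+	 * sectors per track) and derive the cylinder count from the
+	 * capacity, clamping it if the device is too large to represent. */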
59971 + hg->heads = 0xff;
59972 + hg->sectors = 0x3f;
59973 + sector_div(cylinders, hg->heads * hg->sectors);
59974 + hg->cylinders = cylinders;
59975 + if ((sector_t)(hg->cylinders + 1) * hg->heads * hg->sectors < nsect)
59976 + hg->cylinders = 0xffff;
59977 + return 0;
59978 +}
59979 +
59980 +
59981 +/*
59982 + * blkif_queue_request
59983 + *
59984 + * Translate a struct request into a slot on the shared ring: pick a
59985 + * free shadow entry, grant the backend access to each bio segment and
59986 + * record the request so it can be completed (or reissued) later.
59987 + *
59988 + * Returns 0 on success, or 1 if the device is not connected or grant
59989 + * references are exhausted; the caller then requeues the request.
59990 + */
59991 +static int blkif_queue_request(struct request *req)
59992 +{
59993 + struct blkfront_info *info = req->rq_disk->private_data;
59994 + unsigned long buffer_mfn;
59995 + blkif_request_t *ring_req;
59996 + struct bio *bio;
59997 + struct bio_vec *bvec;
59998 + int idx;
59999 + unsigned long id;
60000 + unsigned int fsect, lsect;
60001 + int ref;
60002 + grant_ref_t gref_head;
60003 +
60004 + if (unlikely(info->connected != BLKIF_STATE_CONNECTED))
60005 + return 1;
60006 +
60007 + if (gnttab_alloc_grant_references(
60008 + BLKIF_MAX_SEGMENTS_PER_REQUEST, &gref_head) < 0) {
60009 + gnttab_request_free_callback(
60010 + &info->callback,
60011 + blkif_restart_queue_callback,
60012 + info,
60013 + BLKIF_MAX_SEGMENTS_PER_REQUEST);
60014 + return 1;
60015 + }
60016 +
60017 + /* Fill out a communications ring structure. */
60018 + ring_req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt);
60019 + id = GET_ID_FROM_FREELIST(info);
60020 + info->shadow[id].request = (unsigned long)req;
60021 +
60022 + ring_req->id = id;
60023 + ring_req->sector_number = (blkif_sector_t)req->sector;
60024 + ring_req->handle = info->handle;
60025 +
60026 + ring_req->operation = rq_data_dir(req) ?
60027 + BLKIF_OP_WRITE : BLKIF_OP_READ;
60028 + if (blk_barrier_rq(req))
60029 + ring_req->operation = BLKIF_OP_WRITE_BARRIER;
60030 +
60031 + ring_req->nr_segments = 0;
60032 + rq_for_each_bio (bio, req) {
60033 + bio_for_each_segment (bvec, bio, idx) {
60034 + BUG_ON(ring_req->nr_segments
60035 + == BLKIF_MAX_SEGMENTS_PER_REQUEST);
60036 + buffer_mfn = page_to_phys(bvec->bv_page) >> PAGE_SHIFT;
60037 + fsect = bvec->bv_offset >> 9;
60038 + lsect = fsect + (bvec->bv_len >> 9) - 1;
60039 + /* install a grant reference. */
60040 + ref = gnttab_claim_grant_reference(&gref_head);
60041 + BUG_ON(ref == -ENOSPC);
60042 +
60043 + gnttab_grant_foreign_access_ref(
60044 + ref,
60045 + info->xbdev->otherend_id,
60046 + buffer_mfn,
60047 + rq_data_dir(req) );
60048 +
60049 + info->shadow[id].frame[ring_req->nr_segments] =
60050 + mfn_to_pfn(buffer_mfn);
60051 +
60052 + ring_req->seg[ring_req->nr_segments] =
60053 + (struct blkif_request_segment) {
60054 + .gref = ref,
60055 + .first_sect = fsect,
60056 + .last_sect = lsect };
60057 +
60058 + ring_req->nr_segments++;
60059 + }
60060 + }
60061 +
60062 + info->ring.req_prod_pvt++;
60063 +
60064 + /* Keep a private copy so we can reissue requests when recovering. */
60065 + info->shadow[id].req = *ring_req;
60066 +
60067 + gnttab_free_grant_references(gref_head);
60068 +
60069 + return 0;
60070 +}
60071 +
60072 +/*
60073 + * do_blkif_request
60074 + * read or write blocks; requests are taken from the block layer queue
60075 + */
60076 +void do_blkif_request(request_queue_t *rq)
60077 +{
60078 + struct blkfront_info *info = NULL;
60079 + struct request *req;
60080 + int queued;
60081 +
60082 + DPRINTK("Entered do_blkif_request\n");
60083 +
60084 + queued = 0;
60085 +
60086 + while ((req = elv_next_request(rq)) != NULL) {
60087 + info = req->rq_disk->private_data;
60088 + if (!blk_fs_request(req)) {
60089 + end_request(req, 0);
60090 + continue;
60091 + }
60092 +
60093 + if (RING_FULL(&info->ring))
60094 + goto wait;
60095 +
60096 + DPRINTK("do_blk_req %p: cmd %p, sec %lx, "
60097 + "(%u/%li) buffer:%p [%s]\n",
60098 + req, req->cmd, req->sector, req->current_nr_sectors,
60099 + req->nr_sectors, req->buffer,
60100 + rq_data_dir(req) ? "write" : "read");
60101 +
60102 +
60103 + blkdev_dequeue_request(req);
60104 + if (blkif_queue_request(req)) {
60105 + blk_requeue_request(rq, req);
60106 + wait:
60107 + /* Avoid pointless unplugs. */
60108 + blk_stop_queue(rq);
60109 + break;
60110 + }
60111 +
60112 + queued++;
60113 + }
60114 +
60115 + if (queued != 0)
60116 + flush_requests(info);
60117 +}
60118 +
60119 +
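+/*
+ * Event-channel interrupt handler: walk the response ring, complete the
+ * corresponding block-layer requests, recycle their shadow slots and
+ * restart the request queue now that ring space may be available.
+ */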
60120 +static irqreturn_t blkif_int(int irq, void *dev_id, struct pt_regs *ptregs)
60121 +{
60122 + struct request *req;
60123 + blkif_response_t *bret;
60124 + RING_IDX i, rp;
60125 + unsigned long flags;
60126 + struct blkfront_info *info = (struct blkfront_info *)dev_id;
60127 + int uptodate;
60128 +
60129 + spin_lock_irqsave(&blkif_io_lock, flags);
60130 +
60131 + if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) {
60132 + spin_unlock_irqrestore(&blkif_io_lock, flags);
60133 + return IRQ_HANDLED;
60134 + }
60135 +
60136 + again:
60137 + rp = info->ring.sring->rsp_prod;
60138 + rmb(); /* Ensure we see queued responses up to 'rp'. */
60139 +
60140 + for (i = info->ring.rsp_cons; i != rp; i++) {
60141 + unsigned long id;
60142 + int ret;
60143 +
60144 + bret = RING_GET_RESPONSE(&info->ring, i);
60145 + id = bret->id;
60146 + req = (struct request *)info->shadow[id].request;
60147 +
60148 + blkif_completion(&info->shadow[id]);
60149 +
60150 + ADD_ID_TO_FREELIST(info, id);
60151 +
60152 + uptodate = (bret->status == BLKIF_RSP_OKAY);
60153 + switch (bret->operation) {
60154 + case BLKIF_OP_WRITE_BARRIER:
60155 + if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) {
60156 + printk("blkfront: %s: write barrier op failed\n",
60157 + info->gd->disk_name);
60158 + uptodate = -EOPNOTSUPP;
60159 + info->feature_barrier = 0;
60160 + xlvbd_barrier(info);
60161 + }
60162 + /* fall through */
60163 + case BLKIF_OP_READ:
60164 + case BLKIF_OP_WRITE:
60165 + if (unlikely(bret->status != BLKIF_RSP_OKAY))
60166 + DPRINTK("Bad return from blkdev data "
60167 + "request: %x\n", bret->status);
60168 +
60169 + ret = end_that_request_first(req, uptodate,
60170 + req->hard_nr_sectors);
60171 + BUG_ON(ret);
60172 + end_that_request_last(req, uptodate);
60173 + break;
60174 + default:
60175 + BUG();
60176 + }
60177 + }
60178 +
60179 + info->ring.rsp_cons = i;
60180 +
60181 + if (i != info->ring.req_prod_pvt) {
60182 + int more_to_do;
60183 + RING_FINAL_CHECK_FOR_RESPONSES(&info->ring, more_to_do);
60184 + if (more_to_do)
60185 + goto again;
60186 + } else
60187 + info->ring.sring->rsp_event = i + 1;
60188 +
60189 + kick_pending_request_queues(info);
60190 +
60191 + spin_unlock_irqrestore(&blkif_io_lock, flags);
60192 +
60193 + return IRQ_HANDLED;
60194 +}
60195 +
60196 +static void blkif_free(struct blkfront_info *info, int suspend)
60197 +{
60198 + /* Prevent new requests being issued until we fix things up. */
60199 + spin_lock_irq(&blkif_io_lock);
60200 + info->connected = suspend ?
60201 + BLKIF_STATE_SUSPENDED : BLKIF_STATE_DISCONNECTED;
60202 + /* No more blkif_request(). */
60203 + if (info->rq)
60204 + blk_stop_queue(info->rq);
60205 + /* No more gnttab callback work. */
60206 + gnttab_cancel_free_callback(&info->callback);
60207 + spin_unlock_irq(&blkif_io_lock);
60208 +
60209 + /* Flush gnttab callback work. Must be done with no locks held. */
60210 + flush_scheduled_work();
60211 +
60212 + /* Free resources associated with old device channel. */
60213 + if (info->ring_ref != GRANT_INVALID_REF) {
60214 + gnttab_end_foreign_access(info->ring_ref, 0,
60215 + (unsigned long)info->ring.sring);
60216 + info->ring_ref = GRANT_INVALID_REF;
60217 + info->ring.sring = NULL;
60218 + }
60219 + if (info->irq)
60220 + unbind_from_irqhandler(info->irq, info);
60221 + info->evtchn = info->irq = 0;
60222 +
60223 +}
60224 +
60225 +static void blkif_completion(struct blk_shadow *s)
60226 +{
60227 + int i;
60228 + for (i = 0; i < s->req.nr_segments; i++)
60229 + gnttab_end_foreign_access(s->req.seg[i].gref, 0, 0UL);
60230 +}
60231 +
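+/*
+ * Rebuild ring state after a suspend/resume: copy the old shadow state
+ * aside, reset the free list, then requeue every request that was still
+ * in flight, re-granting its segments to the (possibly new) backend.
+ */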
60232 +static void blkif_recover(struct blkfront_info *info)
60233 +{
60234 + int i;
60235 + blkif_request_t *req;
60236 + struct blk_shadow *copy;
60237 + int j;
60238 +
60239 + /* Stage 1: Make a safe copy of the shadow state. */
60240 + copy = kmalloc(sizeof(info->shadow), GFP_KERNEL | __GFP_NOFAIL);
60241 + memcpy(copy, info->shadow, sizeof(info->shadow));
60242 +
60243 + /* Stage 2: Set up free list. */
60244 + memset(&info->shadow, 0, sizeof(info->shadow));
60245 + for (i = 0; i < BLK_RING_SIZE; i++)
60246 + info->shadow[i].req.id = i+1;
60247 + info->shadow_free = info->ring.req_prod_pvt;
60248 + info->shadow[BLK_RING_SIZE-1].req.id = 0x0fffffff;
60249 +
60250 + /* Stage 3: Find pending requests and requeue them. */
60251 + for (i = 0; i < BLK_RING_SIZE; i++) {
60252 + /* Not in use? */
60253 + if (copy[i].request == 0)
60254 + continue;
60255 +
60256 + /* Grab a request slot and copy shadow state into it. */
60257 + req = RING_GET_REQUEST(
60258 + &info->ring, info->ring.req_prod_pvt);
60259 + *req = copy[i].req;
60260 +
60261 + /* We get a new request id, and must reset the shadow state. */
60262 + req->id = GET_ID_FROM_FREELIST(info);
60263 + memcpy(&info->shadow[req->id], &copy[i], sizeof(copy[i]));
60264 +
60265 + /* Rewrite any grant references invalidated by susp/resume. */
60266 + for (j = 0; j < req->nr_segments; j++)
60267 + gnttab_grant_foreign_access_ref(
60268 + req->seg[j].gref,
60269 + info->xbdev->otherend_id,
60270 + pfn_to_mfn(info->shadow[req->id].frame[j]),
60271 + rq_data_dir(
60272 + (struct request *)
60273 + info->shadow[req->id].request));
60274 + info->shadow[req->id].req = *req;
60275 +
60276 + info->ring.req_prod_pvt++;
60277 + }
60278 +
60279 + kfree(copy);
60280 +
60281 + (void)xenbus_switch_state(info->xbdev, XenbusStateConnected);
60282 +
60283 + spin_lock_irq(&blkif_io_lock);
60284 +
60285 + /* Now safe for us to use the shared ring */
60286 + info->connected = BLKIF_STATE_CONNECTED;
60287 +
60288 + /* Send off requeued requests */
60289 + flush_requests(info);
60290 +
60291 + /* Kick any other new requests queued since we resumed */
60292 + kick_pending_request_queues(info);
60293 +
60294 + spin_unlock_irq(&blkif_io_lock);
60295 +}
60296 +
60297 +
60298 +/* ** Driver Registration ** */
60299 +
60300 +
60301 +static struct xenbus_device_id blkfront_ids[] = {
60302 + { "vbd" },
60303 + { "" }
60304 +};
60305 +
60306 +
60307 +static struct xenbus_driver blkfront = {
60308 + .name = "vbd",
60309 + .owner = THIS_MODULE,
60310 + .ids = blkfront_ids,
60311 + .probe = blkfront_probe,
60312 + .remove = blkfront_remove,
60313 + .resume = blkfront_resume,
60314 + .otherend_changed = backend_changed,
60315 +};
60316 +
60317 +
60318 +static int __init xlblk_init(void)
60319 +{
60320 + if (!is_running_on_xen())
60321 + return -ENODEV;
60322 +
60323 + return xenbus_register_frontend(&blkfront);
60324 +}
60325 +module_init(xlblk_init);
60326 +
60327 +
60328 +static void xlblk_exit(void)
60329 +{
60330 + return xenbus_unregister_driver(&blkfront);
60331 +}
60332 +module_exit(xlblk_exit);
60333 +
60334 +MODULE_LICENSE("Dual BSD/GPL");
60335 diff -Nur linux-2.6.16.33-noxen/drivers/xen/blkfront/block.h linux-2.6.16.33/drivers/xen/blkfront/block.h
60336 --- linux-2.6.16.33-noxen/drivers/xen/blkfront/block.h 1970-01-01 00:00:00.000000000 +0000
60337 +++ linux-2.6.16.33/drivers/xen/blkfront/block.h 2007-01-08 15:00:45.000000000 +0000
60338 @@ -0,0 +1,158 @@
60339 +/******************************************************************************
60340 + * block.h
60341 + *
60342 + * Shared definitions between all levels of XenLinux Virtual block devices.
60343 + *
60344 + * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
60345 + * Modifications by Mark A. Williamson are (c) Intel Research Cambridge
60346 + * Copyright (c) 2004-2005, Christian Limpach
60347 + *
60348 + * This program is free software; you can redistribute it and/or
60349 + * modify it under the terms of the GNU General Public License version 2
60350 + * as published by the Free Software Foundation; or, when distributed
60351 + * separately from the Linux kernel or incorporated into other
60352 + * software packages, subject to the following license:
60353 + *
60354 + * Permission is hereby granted, free of charge, to any person obtaining a copy
60355 + * of this source file (the "Software"), to deal in the Software without
60356 + * restriction, including without limitation the rights to use, copy, modify,
60357 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
60358 + * and to permit persons to whom the Software is furnished to do so, subject to
60359 + * the following conditions:
60360 + *
60361 + * The above copyright notice and this permission notice shall be included in
60362 + * all copies or substantial portions of the Software.
60363 + *
60364 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
60365 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
60366 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
60367 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
60368 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
60369 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
60370 + * IN THE SOFTWARE.
60371 + */
60372 +
60373 +#ifndef __XEN_DRIVERS_BLOCK_H__
60374 +#define __XEN_DRIVERS_BLOCK_H__
60375 +
60376 +#include <linux/config.h>
60377 +#include <linux/version.h>
60378 +#include <linux/module.h>
60379 +#include <linux/kernel.h>
60380 +#include <linux/sched.h>
60381 +#include <linux/slab.h>
60382 +#include <linux/string.h>
60383 +#include <linux/errno.h>
60384 +#include <linux/fs.h>
60385 +#include <linux/hdreg.h>
60386 +#include <linux/blkdev.h>
60387 +#include <linux/major.h>
60388 +#include <linux/devfs_fs_kernel.h>
60389 +#include <asm/hypervisor.h>
60390 +#include <xen/xenbus.h>
60391 +#include <xen/gnttab.h>
60392 +#include <xen/interface/xen.h>
60393 +#include <xen/interface/io/blkif.h>
60394 +#include <xen/interface/io/ring.h>
60395 +#include <asm/io.h>
60396 +#include <asm/atomic.h>
60397 +#include <asm/uaccess.h>
60398 +
60399 +#if 1
60400 +#define IPRINTK(fmt, args...) \
60401 + printk(KERN_INFO "xen_blk: " fmt, ##args)
60402 +#else
60403 +#define IPRINTK(fmt, args...) ((void)0)
60404 +#endif
60405 +
60406 +#if 1
60407 +#define WPRINTK(fmt, args...) \
60408 + printk(KERN_WARNING "xen_blk: " fmt, ##args)
60409 +#else
60410 +#define WPRINTK(fmt, args...) ((void)0)
60411 +#endif
60412 +
60413 +#define DPRINTK(_f, _a...) pr_debug(_f, ## _a)
60414 +
60415 +#if 0
60416 +#define DPRINTK_IOCTL(_f, _a...) printk(KERN_ALERT _f, ## _a)
60417 +#else
60418 +#define DPRINTK_IOCTL(_f, _a...) ((void)0)
60419 +#endif
60420 +
60421 +struct xlbd_type_info
60422 +{
60423 + int partn_shift;
60424 + int disks_per_major;
60425 + char *devname;
60426 + char *diskname;
60427 +};
60428 +
60429 +struct xlbd_major_info
60430 +{
60431 + int major;
60432 + int index;
60433 + int usage;
60434 + struct xlbd_type_info *type;
60435 +};
60436 +
60437 +struct blk_shadow {
60438 + blkif_request_t req;
60439 + unsigned long request;
60440 + unsigned long frame[BLKIF_MAX_SEGMENTS_PER_REQUEST];
60441 +};
60442 +
60443 +#define BLK_RING_SIZE __RING_SIZE((blkif_sring_t *)0, PAGE_SIZE)
60444 +
60445 +/*
60446 + * We have one of these per vbd, whether ide, scsi or 'other'. They
60447 + * hang in private_data off the gendisk structure. We may end up
60448 + * putting all kinds of interesting stuff here :-)
60449 + */
60450 +struct blkfront_info
60451 +{
60452 + struct xenbus_device *xbdev;
60453 + dev_t dev;
60454 + struct gendisk *gd;
60455 + int vdevice;
60456 + blkif_vdev_t handle;
60457 + int connected;
60458 + int ring_ref;
60459 + blkif_front_ring_t ring;
60460 + unsigned int evtchn, irq;
60461 + struct xlbd_major_info *mi;
60462 + request_queue_t *rq;
60463 + struct work_struct work;
60464 + struct gnttab_free_callback callback;
60465 + struct blk_shadow shadow[BLK_RING_SIZE];
60466 + unsigned long shadow_free;
60467 + int feature_barrier;
60468 +
60469 + /**
60470 + * The number of people holding this device open. We won't allow a
60471 + * hot-unplug unless this is 0.
60472 + */
60473 + int users;
60474 +};
60475 +
60476 +extern spinlock_t blkif_io_lock;
60477 +
60478 +extern int blkif_open(struct inode *inode, struct file *filep);
60479 +extern int blkif_release(struct inode *inode, struct file *filep);
60480 +extern int blkif_ioctl(struct inode *inode, struct file *filep,
60481 + unsigned command, unsigned long argument);
60482 +extern int blkif_getgeo(struct block_device *, struct hd_geometry *);
60483 +extern int blkif_check(dev_t dev);
60484 +extern int blkif_revalidate(dev_t dev);
60485 +extern void do_blkif_request (request_queue_t *rq);
60486 +
60487 +/* Virtual block-device subsystem. */
60488 +/* Note that xlvbd_add doesn't call add_disk for you: you're expected
60489 + to call add_disk on info->gd once the disk is properly connected
60490 + up. */
60491 +int xlvbd_add(blkif_sector_t capacity, int device,
60492 + u16 vdisk_info, u16 sector_size, struct blkfront_info *info);
60493 +void xlvbd_del(struct blkfront_info *info);
60494 +int xlvbd_barrier(struct blkfront_info *info);
60495 +
60496 +#endif /* __XEN_DRIVERS_BLOCK_H__ */
60497 diff -Nur linux-2.6.16.33-noxen/drivers/xen/blkfront/vbd.c linux-2.6.16.33/drivers/xen/blkfront/vbd.c
60498 --- linux-2.6.16.33-noxen/drivers/xen/blkfront/vbd.c 1970-01-01 00:00:00.000000000 +0000
60499 +++ linux-2.6.16.33/drivers/xen/blkfront/vbd.c 2007-01-08 15:00:45.000000000 +0000
60500 @@ -0,0 +1,375 @@
60501 +/******************************************************************************
60502 + * vbd.c
60503 + *
60504 + * XenLinux virtual block-device driver (xvd).
60505 + *
60506 + * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
60507 + * Modifications by Mark A. Williamson are (c) Intel Research Cambridge
60508 + * Copyright (c) 2004-2005, Christian Limpach
60509 + *
60510 + * This program is free software; you can redistribute it and/or
60511 + * modify it under the terms of the GNU General Public License version 2
60512 + * as published by the Free Software Foundation; or, when distributed
60513 + * separately from the Linux kernel or incorporated into other
60514 + * software packages, subject to the following license:
60515 + *
60516 + * Permission is hereby granted, free of charge, to any person obtaining a copy
60517 + * of this source file (the "Software"), to deal in the Software without
60518 + * restriction, including without limitation the rights to use, copy, modify,
60519 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
60520 + * and to permit persons to whom the Software is furnished to do so, subject to
60521 + * the following conditions:
60522 + *
60523 + * The above copyright notice and this permission notice shall be included in
60524 + * all copies or substantial portions of the Software.
60525 + *
60526 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
60527 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
60528 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
60529 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
60530 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
60531 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
60532 + * IN THE SOFTWARE.
60533 + */
60534 +
60535 +#include "block.h"
60536 +#include <linux/blkdev.h>
60537 +#include <linux/list.h>
60538 +
60539 +#ifdef HAVE_XEN_PLATFORM_COMPAT_H
60540 +#include <xen/platform-compat.h>
60541 +#endif
60542 +
60543 +#define BLKIF_MAJOR(dev) ((dev)>>8)
60544 +#define BLKIF_MINOR(dev) ((dev) & 0xff)
60545 +
60546 +/*
60547 + * For convenience we distinguish between ide, scsi and 'other' (i.e.,
60548 + * potentially combinations of the two) in the naming scheme and in a few other
60549 + * places.
60550 + */
60551 +
60552 +#define NUM_IDE_MAJORS 10
60553 +#define NUM_SCSI_MAJORS 17
60554 +#define NUM_VBD_MAJORS 1
60555 +
60556 +static struct xlbd_type_info xlbd_ide_type = {
60557 + .partn_shift = 6,
60558 + .disks_per_major = 2,
60559 + .devname = "ide",
60560 + .diskname = "hd",
60561 +};
60562 +
60563 +static struct xlbd_type_info xlbd_scsi_type = {
60564 + .partn_shift = 4,
60565 + .disks_per_major = 16,
60566 + .devname = "sd",
60567 + .diskname = "sd",
60568 +};
60569 +
60570 +static struct xlbd_type_info xlbd_vbd_type = {
60571 + .partn_shift = 4,
60572 + .disks_per_major = 16,
60573 + .devname = "xvd",
60574 + .diskname = "xvd",
60575 +};
60576 +
60577 +static struct xlbd_major_info *major_info[NUM_IDE_MAJORS + NUM_SCSI_MAJORS +
60578 + NUM_VBD_MAJORS];
60579 +
60580 +#define XLBD_MAJOR_IDE_START 0
60581 +#define XLBD_MAJOR_SCSI_START (NUM_IDE_MAJORS)
60582 +#define XLBD_MAJOR_VBD_START (NUM_IDE_MAJORS + NUM_SCSI_MAJORS)
60583 +
60584 +#define XLBD_MAJOR_IDE_RANGE XLBD_MAJOR_IDE_START ... XLBD_MAJOR_SCSI_START - 1
60585 +#define XLBD_MAJOR_SCSI_RANGE XLBD_MAJOR_SCSI_START ... XLBD_MAJOR_VBD_START - 1
60586 +#define XLBD_MAJOR_VBD_RANGE XLBD_MAJOR_VBD_START ... XLBD_MAJOR_VBD_START + NUM_VBD_MAJORS - 1
60587 +
60588 +/* Information about our VBDs. */
60589 +#define MAX_VBDS 64
60590 +static LIST_HEAD(vbds_list);
60591 +
60592 +static struct block_device_operations xlvbd_block_fops =
60593 +{
60594 + .owner = THIS_MODULE,
60595 + .open = blkif_open,
60596 + .release = blkif_release,
60597 + .ioctl = blkif_ioctl,
60598 +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,16)
60599 + .getgeo = blkif_getgeo
60600 +#endif
60601 +};
60602 +
60603 +DEFINE_SPINLOCK(blkif_io_lock);
60604 +
60605 +static struct xlbd_major_info *
60606 +xlbd_alloc_major_info(int major, int minor, int index)
60607 +{
60608 + struct xlbd_major_info *ptr;
60609 +
60610 + ptr = kzalloc(sizeof(struct xlbd_major_info), GFP_KERNEL);
60611 + if (ptr == NULL)
60612 + return NULL;
60613 +
60614 + ptr->major = major;
60615 +
60616 + switch (index) {
60617 + case XLBD_MAJOR_IDE_RANGE:
60618 + ptr->type = &xlbd_ide_type;
60619 + ptr->index = index - XLBD_MAJOR_IDE_START;
60620 + break;
60621 + case XLBD_MAJOR_SCSI_RANGE:
60622 + ptr->type = &xlbd_scsi_type;
60623 + ptr->index = index - XLBD_MAJOR_SCSI_START;
60624 + break;
60625 + case XLBD_MAJOR_VBD_RANGE:
60626 + ptr->type = &xlbd_vbd_type;
60627 + ptr->index = index - XLBD_MAJOR_VBD_START;
60628 + break;
60629 + }
60630 +
60631 + printk("Registering block device major %i\n", ptr->major);
60632 + if (register_blkdev(ptr->major, ptr->type->devname)) {
60633 + WPRINTK("can't get major %d with name %s\n",
60634 + ptr->major, ptr->type->devname);
60635 + kfree(ptr);
60636 + return NULL;
60637 + }
60638 +
60639 + devfs_mk_dir(ptr->type->devname);
60640 + major_info[index] = ptr;
60641 + return ptr;
60642 +}
60643 +
60644 +static struct xlbd_major_info *
60645 +xlbd_get_major_info(int vdevice)
60646 +{
60647 + struct xlbd_major_info *mi;
60648 + int major, minor, index;
60649 +
60650 + major = BLKIF_MAJOR(vdevice);
60651 + minor = BLKIF_MINOR(vdevice);
60652 +
60653 + switch (major) {
60654 + case IDE0_MAJOR: index = 0; break;
60655 + case IDE1_MAJOR: index = 1; break;
60656 + case IDE2_MAJOR: index = 2; break;
60657 + case IDE3_MAJOR: index = 3; break;
60658 + case IDE4_MAJOR: index = 4; break;
60659 + case IDE5_MAJOR: index = 5; break;
60660 + case IDE6_MAJOR: index = 6; break;
60661 + case IDE7_MAJOR: index = 7; break;
60662 + case IDE8_MAJOR: index = 8; break;
60663 + case IDE9_MAJOR: index = 9; break;
60664 + case SCSI_DISK0_MAJOR: index = 10; break;
60665 + case SCSI_DISK1_MAJOR ... SCSI_DISK7_MAJOR:
60666 + index = 11 + major - SCSI_DISK1_MAJOR;
60667 + break;
60668 + case SCSI_DISK8_MAJOR ... SCSI_DISK15_MAJOR:
60669 + index = 18 + major - SCSI_DISK8_MAJOR;
60670 + break;
60671 + case SCSI_CDROM_MAJOR: index = 26; break;
60672 + default: index = 27; break;
60673 + }
60674 +
60675 + mi = ((major_info[index] != NULL) ? major_info[index] :
60676 + xlbd_alloc_major_info(major, minor, index));
60677 + if (mi)
60678 + mi->usage++;
60679 + return mi;
60680 +}
60681 +
60682 +static void
60683 +xlbd_put_major_info(struct xlbd_major_info *mi)
60684 +{
60685 + mi->usage--;
60686 + /* XXX: release major if 0 */
60687 +}
60688 +
60689 +static int
60690 +xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size)
60691 +{
60692 + request_queue_t *rq;
60693 +
60694 + rq = blk_init_queue(do_blkif_request, &blkif_io_lock);
60695 + if (rq == NULL)
60696 + return -1;
60697 +
60698 +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,10)
60699 + elevator_init(rq, "noop");
60700 +#else
60701 + elevator_init(rq, &elevator_noop);
60702 +#endif
60703 +
60704 + /* Hard sector size and max sectors impersonate the equiv. hardware. */
60705 + blk_queue_hardsect_size(rq, sector_size);
60706 + blk_queue_max_sectors(rq, 512);
60707 +
60708 + /* Each segment in a request is up to an aligned page in size. */
60709 + blk_queue_segment_boundary(rq, PAGE_SIZE - 1);
60710 + blk_queue_max_segment_size(rq, PAGE_SIZE);
60711 +
60712 + /* Ensure a merged request will fit in a single I/O ring slot. */
60713 + blk_queue_max_phys_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST);
60714 + blk_queue_max_hw_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST);
60715 +
60716 + /* Make sure buffer addresses are sector-aligned. */
60717 + blk_queue_dma_alignment(rq, 511);
60718 +
60719 + gd->queue = rq;
60720 +
60721 + return 0;
60722 +}
60723 +
60724 +static int
60725 +xlvbd_alloc_gendisk(int minor, blkif_sector_t capacity, int vdevice,
60726 + u16 vdisk_info, u16 sector_size,
60727 + struct blkfront_info *info)
60728 +{
60729 + struct gendisk *gd;
60730 + struct xlbd_major_info *mi;
60731 + int nr_minors = 1;
60732 + int err = -ENODEV;
60733 + unsigned int offset;
60734 +
60735 + BUG_ON(info->gd != NULL);
60736 + BUG_ON(info->mi != NULL);
60737 + BUG_ON(info->rq != NULL);
60738 +
60739 + mi = xlbd_get_major_info(vdevice);
60740 + if (mi == NULL)
60741 + goto out;
60742 + info->mi = mi;
60743 +
60744 + if ((minor & ((1 << mi->type->partn_shift) - 1)) == 0)
60745 + nr_minors = 1 << mi->type->partn_shift;
60746 +
60747 + gd = alloc_disk(nr_minors);
60748 + if (gd == NULL)
60749 + goto out;
60750 +
60751 + offset = mi->index * mi->type->disks_per_major +
60752 + (minor >> mi->type->partn_shift);
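+	/* Build the disk name in the usual Linux style: offsets 0-25 get a
+	 * single letter suffix ('a'..'z'), larger offsets two letters; if
+	 * this minor is a partition, its number is appended as well. */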
60753 + if (nr_minors > 1) {
60754 + if (offset < 26) {
60755 + sprintf(gd->disk_name, "%s%c",
60756 + mi->type->diskname, 'a' + offset );
60757 + }
60758 + else {
60759 + sprintf(gd->disk_name, "%s%c%c",
60760 + mi->type->diskname,
60761 + 'a' + ((offset/26)-1), 'a' + (offset%26) );
60762 + }
60763 + }
60764 + else {
60765 + if (offset < 26) {
60766 + sprintf(gd->disk_name, "%s%c%d",
60767 + mi->type->diskname,
60768 + 'a' + offset,
60769 + minor & ((1 << mi->type->partn_shift) - 1));
60770 + }
60771 + else {
60772 + sprintf(gd->disk_name, "%s%c%c%d",
60773 + mi->type->diskname,
60774 + 'a' + ((offset/26)-1), 'a' + (offset%26),
60775 + minor & ((1 << mi->type->partn_shift) - 1));
60776 + }
60777 + }
60778 +
60779 + gd->major = mi->major;
60780 + gd->first_minor = minor;
60781 + gd->fops = &xlvbd_block_fops;
60782 + gd->private_data = info;
60783 + gd->driverfs_dev = &(info->xbdev->dev);
60784 + set_capacity(gd, capacity);
60785 +
60786 + if (xlvbd_init_blk_queue(gd, sector_size)) {
60787 + del_gendisk(gd);
60788 + goto out;
60789 + }
60790 +
60791 + info->rq = gd->queue;
60792 + info->gd = gd;
60793 +
60794 + if (info->feature_barrier)
60795 + xlvbd_barrier(info);
60796 +
60797 + if (vdisk_info & VDISK_READONLY)
60798 + set_disk_ro(gd, 1);
60799 +
60800 + if (vdisk_info & VDISK_REMOVABLE)
60801 + gd->flags |= GENHD_FL_REMOVABLE;
60802 +
60803 + if (vdisk_info & VDISK_CDROM)
60804 + gd->flags |= GENHD_FL_CD;
60805 +
60806 + return 0;
60807 +
60808 + out:
60809 + if (mi)
60810 + xlbd_put_major_info(mi);
60811 + info->mi = NULL;
60812 + return err;
60813 +}
60814 +
60815 +int
60816 +xlvbd_add(blkif_sector_t capacity, int vdevice, u16 vdisk_info,
60817 + u16 sector_size, struct blkfront_info *info)
60818 +{
60819 + struct block_device *bd;
60820 + int err = 0;
60821 +
60822 + info->dev = MKDEV(BLKIF_MAJOR(vdevice), BLKIF_MINOR(vdevice));
60823 +
60824 + bd = bdget(info->dev);
60825 + if (bd == NULL)
60826 + return -ENODEV;
60827 +
60828 + err = xlvbd_alloc_gendisk(BLKIF_MINOR(vdevice), capacity, vdevice,
60829 + vdisk_info, sector_size, info);
60830 +
60831 + bdput(bd);
60832 + return err;
60833 +}
60834 +
60835 +void
60836 +xlvbd_del(struct blkfront_info *info)
60837 +{
60838 + if (info->mi == NULL)
60839 + return;
60840 +
60841 + BUG_ON(info->gd == NULL);
60842 + del_gendisk(info->gd);
60843 + put_disk(info->gd);
60844 + info->gd = NULL;
60845 +
60846 + xlbd_put_major_info(info->mi);
60847 + info->mi = NULL;
60848 +
60849 + BUG_ON(info->rq == NULL);
60850 + blk_cleanup_queue(info->rq);
60851 + info->rq = NULL;
60852 +}
60853 +
60854 +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,16)
60855 +int
60856 +xlvbd_barrier(struct blkfront_info *info)
60857 +{
60858 + int err;
60859 +
60860 + err = blk_queue_ordered(info->rq,
60861 + info->feature_barrier ? QUEUE_ORDERED_DRAIN : QUEUE_ORDERED_NONE, NULL);
60862 + if (err)
60863 + return err;
60864 + printk("blkfront: %s: barriers %s\n",
60865 + info->gd->disk_name, info->feature_barrier ? "enabled" : "disabled");
60866 + return 0;
60867 +}
60868 +#else
60869 +int
60870 +xlvbd_barrier(struct blkfront_info *info)
60871 +{
60872 + printk("blkfront: %s: barriers disabled\n", info->gd->disk_name);
60873 + return -ENOSYS;
60874 +}
60875 +#endif
60876 diff -Nur linux-2.6.16.33-noxen/drivers/xen/blktap/Makefile linux-2.6.16.33/drivers/xen/blktap/Makefile
60877 --- linux-2.6.16.33-noxen/drivers/xen/blktap/Makefile 1970-01-01 00:00:00.000000000 +0000
60878 +++ linux-2.6.16.33/drivers/xen/blktap/Makefile 2007-01-08 15:00:45.000000000 +0000
60879 @@ -0,0 +1,3 @@
60880 +LINUXINCLUDE += -I../xen/include/public/io
60881 +obj-y := xenbus.o interface.o blktap.o
60882 +
60883 diff -Nur linux-2.6.16.33-noxen/drivers/xen/blktap/blktap.c linux-2.6.16.33/drivers/xen/blktap/blktap.c
60884 --- linux-2.6.16.33-noxen/drivers/xen/blktap/blktap.c 1970-01-01 00:00:00.000000000 +0000
60885 +++ linux-2.6.16.33/drivers/xen/blktap/blktap.c 2007-01-08 15:00:45.000000000 +0000
60886 @@ -0,0 +1,1517 @@
60887 +/******************************************************************************
60888 + * drivers/xen/blktap/blktap.c
60889 + *
60890 + * Back-end driver for user level virtual block devices. This portion of the
60891 + * driver exports a 'unified' block-device interface that can be accessed
60892 + * by any operating system that implements a compatible front end. Requests
60893 + * are remapped to a user-space memory region.
60894 + *
60895 + * Based on the blkback driver code.
60896 + *
60897 + * Copyright (c) 2004-2005, Andrew Warfield and Julian Chesterfield
60898 + *
60899 + * Clean ups and fix ups:
60900 + * Copyright (c) 2006, Steven Rostedt - Red Hat, Inc.
60901 + *
60902 + * This program is free software; you can redistribute it and/or
60903 + * modify it under the terms of the GNU General Public License version 2
60904 + * as published by the Free Software Foundation; or, when distributed
60905 + * separately from the Linux kernel or incorporated into other
60906 + * software packages, subject to the following license:
60907 + *
60908 + * Permission is hereby granted, free of charge, to any person obtaining a copy
60909 + * of this source file (the "Software"), to deal in the Software without
60910 + * restriction, including without limitation the rights to use, copy, modify,
60911 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
60912 + * and to permit persons to whom the Software is furnished to do so, subject to
60913 + * the following conditions:
60914 + *
60915 + * The above copyright notice and this permission notice shall be included in
60916 + * all copies or substantial portions of the Software.
60917 + *
60918 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
60919 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
60920 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
60921 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
60922 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
60923 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
60924 + * IN THE SOFTWARE.
60925 + */
60926 +
60927 +#include <linux/spinlock.h>
60928 +#include <linux/kthread.h>
60929 +#include <linux/list.h>
60930 +#include <asm/hypervisor.h>
60931 +#include "common.h"
60932 +#include <xen/balloon.h>
60933 +#include <linux/kernel.h>
60934 +#include <linux/fs.h>
60935 +#include <linux/mm.h>
60936 +#include <linux/errno.h>
60937 +#include <linux/major.h>
60938 +#include <linux/gfp.h>
60939 +#include <linux/poll.h>
60940 +#include <asm/tlbflush.h>
60941 +#include <linux/devfs_fs_kernel.h>
60942 +
60943 +#define MAX_TAP_DEV 256 /*the maximum number of tapdisk ring devices */
60944 +#define MAX_DEV_NAME 100   /*the max length of a tapdisk ring device name e.g. blktap0 */
60945 +
60946 +
60947 +struct class *xen_class;
60948 +EXPORT_SYMBOL_GPL(xen_class);
60949 +
60950 +/*
60951 + * Setup the xen class. This should probably go in another file, but
60952 + * since blktap is the only user of it so far, it gets to keep it.
60953 + */
60954 +int setup_xen_class(void)
60955 +{
60956 + int ret;
60957 +
60958 + if (xen_class)
60959 + return 0;
60960 +
60961 + xen_class = class_create(THIS_MODULE, "xen");
60962 + if ((ret = IS_ERR(xen_class))) {
60963 + xen_class = NULL;
60964 + return ret;
60965 + }
60966 +
60967 + return 0;
60968 +}
60969 +
60970 +/*
60971 + * The maximum number of requests that can be outstanding at any time
60972 + * is determined by
60973 + *
60974 + * [mmap_alloc * MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST]
60975 + *
60976 + * where mmap_alloc < MAX_DYNAMIC_MEM.
60977 + *
60978 + * TODO:
60979 + * mmap_alloc is initialised to 2 and should be adjustable on the fly via
60980 + * sysfs.
60981 + */
60982 +#define BLK_RING_SIZE __RING_SIZE((blkif_sring_t *)0, PAGE_SIZE)
60983 +#define MAX_DYNAMIC_MEM BLK_RING_SIZE
60984 +#define MAX_PENDING_REQS BLK_RING_SIZE
60985 +#define MMAP_PAGES (MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST)
60986 +#define MMAP_VADDR(_start, _req,_seg) \
60987 + (_start + \
60988 + ((_req) * BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE) + \
60989 + ((_seg) * PAGE_SIZE))
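+/* As a rough example: with 4 KiB pages the blkif ring typically holds 32
+ * requests, so at BLKIF_MAX_SEGMENTS_PER_REQUEST (11) segments each, one
+ * ring's mmap area spans 32 * 11 = 352 pages, laid out per-request and
+ * then per-segment as MMAP_VADDR computes. */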
60990 +static int blkif_reqs = MAX_PENDING_REQS;
60991 +static int mmap_pages = MMAP_PAGES;
60992 +
60993 +#define RING_PAGES 1 /* BLKTAP - immediately before the mmap area, we
60994 + * have a bunch of pages reserved for shared
60995 + * memory rings.
60996 + */
60997 +
60998 +/*Data struct handed back to userspace for tapdisk device to VBD mapping*/
60999 +typedef struct domid_translate {
61000 + unsigned short domid;
61001 + unsigned short busid;
61002 +} domid_translate_t ;
61003 +
61004 +/*Data struct associated with each of the tapdisk devices*/
61005 +typedef struct tap_blkif {
61006 + struct vm_area_struct *vma; /*Shared memory area */
61007 + unsigned long rings_vstart; /*Kernel memory mapping */
61008 + unsigned long user_vstart; /*User memory mapping */
61009 + unsigned long dev_inuse; /*One process opens device at a time. */
61010 + unsigned long dev_pending; /*In process of being opened */
61011 + unsigned long ring_ok; /*make this ring->state */
61012 + blkif_front_ring_t ufe_ring; /*Rings up to user space. */
61013 + wait_queue_head_t wait; /*for poll */
61014 + unsigned long mode; /*current switching mode */
61015 + int minor; /*Minor number for tapdisk device */
61016 + pid_t pid; /*tapdisk process id */
61017 + enum { RUNNING, CLEANSHUTDOWN } status; /*Detect a clean userspace
61018 + shutdown */
61019 + unsigned long *idx_map; /*Record the user ring id to kern
61020 + [req id, idx] tuple */
61021 + blkif_t *blkif; /*Associate blkif with tapdev */
61022 + struct domid_translate trans; /*Translation from domid to bus. */
61023 +} tap_blkif_t;
61024 +
61025 +static struct tap_blkif *tapfds[MAX_TAP_DEV];
61026 +static int blktap_next_minor;
61027 +
61028 +static int __init set_blkif_reqs(char *str)
61029 +{
61030 + get_option(&str, &blkif_reqs);
61031 + return 1;
61032 +}
61033 +__setup("blkif_reqs=", set_blkif_reqs);
61034 +
61035 +/* Run-time switchable: /sys/module/blktap/parameters/ */
61036 +static unsigned int log_stats = 0;
61037 +static unsigned int debug_lvl = 0;
61038 +module_param(log_stats, int, 0644);
61039 +module_param(debug_lvl, int, 0644);
61040 +
61041 +/*
61042 + * Each outstanding request that we've passed to the lower device layers has a
61043 + * 'pending_req' allocated to it. Each buffer_head that completes decrements
61044 + * the pendcnt towards zero. When it hits zero, the specified domain has a
61045 + * response queued for it, with the saved 'id' passed back.
61046 + */
61047 +typedef struct {
61048 + blkif_t *blkif;
61049 + unsigned long id;
61050 + unsigned short mem_idx;
61051 + int nr_pages;
61052 + atomic_t pendcnt;
61053 + unsigned short operation;
61054 + int status;
61055 + struct list_head free_list;
61056 + int inuse;
61057 +} pending_req_t;
61058 +
61059 +static pending_req_t *pending_reqs[MAX_PENDING_REQS];
61060 +static struct list_head pending_free;
61061 +static DEFINE_SPINLOCK(pending_free_lock);
61062 +static DECLARE_WAIT_QUEUE_HEAD (pending_free_wq);
61063 +static int alloc_pending_reqs;
61064 +
61065 +typedef unsigned int PEND_RING_IDX;
61066 +
61067 +static inline int MASK_PEND_IDX(int i) {
61068 + return (i & (MAX_PENDING_REQS-1));
61069 +}
61070 +
61071 +static inline unsigned int RTN_PEND_IDX(pending_req_t *req, int idx) {
61072 + return (req - pending_reqs[idx]);
61073 +}
61074 +
61075 +#define NR_PENDING_REQS (MAX_PENDING_REQS - pending_prod + pending_cons)
61076 +
61077 +#define BLKBACK_INVALID_HANDLE (~0)
61078 +
61079 +static struct page **foreign_pages[MAX_DYNAMIC_MEM];
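+/* Map an (mmap area, request, segment) triple to the kernel virtual
+ * address of the corresponding foreign page. */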
61080 +static inline unsigned long idx_to_kaddr(
61081 + unsigned int mmap_idx, unsigned int req_idx, unsigned int sg_idx)
61082 +{
61083 + unsigned int arr_idx = req_idx*BLKIF_MAX_SEGMENTS_PER_REQUEST + sg_idx;
61084 + unsigned long pfn = page_to_pfn(foreign_pages[mmap_idx][arr_idx]);
61085 + return (unsigned long)pfn_to_kaddr(pfn);
61086 +}
61087 +
61088 +static unsigned short mmap_alloc = 0;
61089 +static unsigned short mmap_lock = 0;
61090 +static unsigned short mmap_inuse = 0;
61091 +
61092 +/******************************************************************
61093 + * GRANT HANDLES
61094 + */
61095 +
61096 +/* When using grant tables to map a frame for device access, the
61097 + * handle returned must be used to unmap the frame. This is needed to
61098 + * drop the ref count on the frame.
61099 + */
61100 +struct grant_handle_pair
61101 +{
61102 + grant_handle_t kernel;
61103 + grant_handle_t user;
61104 +};
61105 +#define INVALID_GRANT_HANDLE 0xFFFF
61106 +
61107 +static struct grant_handle_pair
61108 + pending_grant_handles[MAX_DYNAMIC_MEM][MMAP_PAGES];
61109 +#define pending_handle(_id, _idx, _i) \
61110 + (pending_grant_handles[_id][((_idx) * BLKIF_MAX_SEGMENTS_PER_REQUEST) \
61111 + + (_i)])
61112 +
61113 +
61114 +static int blktap_read_ufe_ring(tap_blkif_t *info); /*local prototypes*/
61115 +
61116 +#define BLKTAP_MINOR 0 /*/dev/xen/blktap has a dynamic major */
61117 +#define BLKTAP_DEV_DIR "/dev/xen"
61118 +
61119 +static int blktap_major;
61120 +
61121 +/* blktap IOCTLs: */
61122 +#define BLKTAP_IOCTL_KICK_FE 1
61123 +#define BLKTAP_IOCTL_KICK_BE 2 /* currently unused */
61124 +#define BLKTAP_IOCTL_SETMODE 3
61125 +#define BLKTAP_IOCTL_SENDPID 4
61126 +#define BLKTAP_IOCTL_NEWINTF 5
61127 +#define BLKTAP_IOCTL_MINOR 6
61128 +#define BLKTAP_IOCTL_MAJOR 7
61129 +#define BLKTAP_QUERY_ALLOC_REQS 8
61130 +#define BLKTAP_IOCTL_FREEINTF 9
61131 +#define BLKTAP_IOCTL_PRINT_IDXS 100
61132 +
61133 +/* blktap switching modes: (Set with BLKTAP_IOCTL_SETMODE) */
61134 +#define BLKTAP_MODE_PASSTHROUGH 0x00000000 /* default */
61135 +#define BLKTAP_MODE_INTERCEPT_FE 0x00000001
61136 +#define BLKTAP_MODE_INTERCEPT_BE 0x00000002 /* unimp. */
61137 +
61138 +#define BLKTAP_MODE_INTERPOSE \
61139 + (BLKTAP_MODE_INTERCEPT_FE | BLKTAP_MODE_INTERCEPT_BE)
61140 +
61141 +
61142 +static inline int BLKTAP_MODE_VALID(unsigned long arg)
61143 +{
61144 + return ((arg == BLKTAP_MODE_PASSTHROUGH ) ||
61145 + (arg == BLKTAP_MODE_INTERCEPT_FE) ||
61146 + (arg == BLKTAP_MODE_INTERPOSE ));
61147 +}
61148 +
61149 +/* Requests passing through the tap to userspace are re-assigned an ID.
61150 + * We must record a mapping between the BE [IDX,ID] tuple and the userspace
61151 + * ring ID.
61152 + */
61153 +
61154 +static inline unsigned long MAKE_ID(domid_t fe_dom, PEND_RING_IDX idx)
61155 +{
61156 + return ((fe_dom << 16) | MASK_PEND_IDX(idx));
61157 +}
61158 +
61159 +extern inline PEND_RING_IDX ID_TO_IDX(unsigned long id)
61160 +{
61161 + return (PEND_RING_IDX)(id & 0x0000ffff);
61162 +}
61163 +
61164 +extern inline int ID_TO_MIDX(unsigned long id)
61165 +{
61166 + return (int)(id >> 16);
61167 +}
61168 +
61169 +#define INVALID_REQ 0xdead0000
61170 +
61171 +/*TODO: Convert to a free list*/
61172 +static inline int GET_NEXT_REQ(unsigned long *idx_map)
61173 +{
61174 + int i;
61175 + for (i = 0; i < MAX_PENDING_REQS; i++)
61176 + if (idx_map[i] == INVALID_REQ)
61177 + return i;
61178 +
61179 + return INVALID_REQ;
61180 +}
61181 +
61182 +
61183 +#define BLKTAP_INVALID_HANDLE(_g) \
61184 + (((_g->kernel) == INVALID_GRANT_HANDLE) && \
61185 + ((_g->user) == INVALID_GRANT_HANDLE))
61186 +
61187 +#define BLKTAP_INVALIDATE_HANDLE(_g) do { \
61188 + (_g)->kernel = INVALID_GRANT_HANDLE; (_g)->user = INVALID_GRANT_HANDLE; \
61189 + } while(0)
61190 +
61191 +
61192 +/******************************************************************
61193 + * BLKTAP VM OPS
61194 + */
61195 +
61196 +static struct page *blktap_nopage(struct vm_area_struct *vma,
61197 + unsigned long address,
61198 + int *type)
61199 +{
61200 + /*
61201 + * if the page has not been mapped in by the driver then return
61202 + * NOPAGE_SIGBUS to the domain.
61203 + */
61204 +
61205 + return NOPAGE_SIGBUS;
61206 +}
61207 +
61208 +struct vm_operations_struct blktap_vm_ops = {
61209 + nopage: blktap_nopage,
61210 +};
61211 +
61212 +/******************************************************************
61213 + * BLKTAP FILE OPS
61214 + */
61215 +
61216 +/*Function Declarations*/
61217 +static tap_blkif_t *get_next_free_dev(void);
61218 +static int blktap_open(struct inode *inode, struct file *filp);
61219 +static int blktap_release(struct inode *inode, struct file *filp);
61220 +static int blktap_mmap(struct file *filp, struct vm_area_struct *vma);
61221 +static int blktap_ioctl(struct inode *inode, struct file *filp,
61222 + unsigned int cmd, unsigned long arg);
61223 +static unsigned int blktap_poll(struct file *file, poll_table *wait);
61224 +
61225 +static struct file_operations blktap_fops = {
61226 + .owner = THIS_MODULE,
61227 + .poll = blktap_poll,
61228 + .ioctl = blktap_ioctl,
61229 + .open = blktap_open,
61230 + .release = blktap_release,
61231 + .mmap = blktap_mmap,
61232 +};
61233 +
61234 +
61235 +static tap_blkif_t *get_next_free_dev(void)
61236 +{
61237 + tap_blkif_t *info;
61238 + int minor;
61239 +
61240 + /*
61241 + * This is called only from the ioctl, which
61242 + * means we should always have interrupts enabled.
61243 + */
61244 + BUG_ON(irqs_disabled());
61245 +
61246 + spin_lock_irq(&pending_free_lock);
61247 +
61248 + /* tapfds[0] is always NULL */
61249 +
61250 + for (minor = 1; minor < blktap_next_minor; minor++) {
61251 + info = tapfds[minor];
61252 + /* we could have failed a previous attempt. */
61253 +		if (!info || ((info->dev_inuse == 0) &&
61254 +			      (info->dev_pending == 0))) {
61255 +			if (info)
61256 +				info->dev_pending = 1;
61257 +			goto found;
61258 +		}
61259 + }
61260 + info = NULL;
61261 + minor = -1;
61262 +
61263 + /*
61264 +	 * We didn't find a free device. If we can still allocate
61265 + * more, then we grab the next device minor that is
61266 + * available. This is done while we are still under
61267 + * the protection of the pending_free_lock.
61268 + */
61269 + if (blktap_next_minor < MAX_TAP_DEV)
61270 + minor = blktap_next_minor++;
61271 +found:
61272 + spin_unlock_irq(&pending_free_lock);
61273 +
61274 + if (!info && minor > 0) {
61275 + info = kzalloc(sizeof(*info), GFP_KERNEL);
61276 + if (unlikely(!info)) {
61277 + /*
61278 + * If we failed here, try to put back
61279 + * the next minor number. But if one
61280 + * was just taken, then we just lose this
61281 + * minor. We can try to allocate this
61282 + * minor again later.
61283 + */
61284 + spin_lock_irq(&pending_free_lock);
61285 + if (blktap_next_minor == minor+1)
61286 + blktap_next_minor--;
61287 + spin_unlock_irq(&pending_free_lock);
61288 + goto out;
61289 + }
61290 +
61291 + info->minor = minor;
61292 + /*
61293 + * Make sure that we have a minor before others can
61294 + * see us.
61295 + */
61296 + wmb();
61297 + tapfds[minor] = info;
61298 +
61299 + class_device_create(xen_class, NULL,
61300 + MKDEV(blktap_major, minor), NULL,
61301 + "blktap%d", minor);
61302 + devfs_mk_cdev(MKDEV(blktap_major, minor),
61303 + S_IFCHR|S_IRUGO|S_IWUSR, "xen/blktap%d", minor);
61304 + }
61305 +
61306 +out:
61307 + return info;
61308 +}
61309 +
61310 +int dom_to_devid(domid_t domid, int xenbus_id, blkif_t *blkif)
61311 +{
61312 + tap_blkif_t *info;
61313 + int i;
61314 +
61315 + for (i = 1; i < blktap_next_minor; i++) {
61316 + info = tapfds[i];
61317 + if ( info &&
61318 + (info->trans.domid == domid) &&
61319 + (info->trans.busid == xenbus_id) ) {
61320 + info->blkif = blkif;
61321 + info->status = RUNNING;
61322 + return i;
61323 + }
61324 + }
61325 + return -1;
61326 +}
61327 +
61328 +void signal_tapdisk(int idx)
61329 +{
61330 + tap_blkif_t *info;
61331 + struct task_struct *ptask;
61332 +
61333 + info = tapfds[idx];
61334 + if ((idx < 0) || (idx > MAX_TAP_DEV) || !info)
61335 + return;
61336 +
61337 + if (info->pid > 0) {
61338 + ptask = find_task_by_pid(info->pid);
61339 + if (ptask)
61340 + info->status = CLEANSHUTDOWN;
61341 + }
61342 + info->blkif = NULL;
61343 +
61344 + return;
61345 +}
61346 +
61347 +static int blktap_open(struct inode *inode, struct file *filp)
61348 +{
61349 + blkif_sring_t *sring;
61350 + int idx = iminor(inode) - BLKTAP_MINOR;
61351 + tap_blkif_t *info;
61352 + int i;
61353 +
61354 + /* ctrl device, treat differently */
61355 + if (!idx)
61356 + return 0;
61357 +
61358 + info = tapfds[idx];
61359 +
61360 + if ((idx < 0) || (idx > MAX_TAP_DEV) || !info) {
61361 + WPRINTK("Unable to open device /dev/xen/blktap%d\n",
61362 + idx);
61363 + return -ENODEV;
61364 + }
61365 +
61366 + DPRINTK("Opening device /dev/xen/blktap%d\n",idx);
61367 +
61368 + /*Only one process can access device at a time*/
61369 + if (test_and_set_bit(0, &info->dev_inuse))
61370 + return -EBUSY;
61371 +
61372 + info->dev_pending = 0;
61373 +
61374 + /* Allocate the fe ring. */
61375 + sring = (blkif_sring_t *)get_zeroed_page(GFP_KERNEL);
61376 + if (sring == NULL)
61377 + goto fail_nomem;
61378 +
61379 + SetPageReserved(virt_to_page(sring));
61380 +
61381 + SHARED_RING_INIT(sring);
61382 + FRONT_RING_INIT(&info->ufe_ring, sring, PAGE_SIZE);
61383 +
61384 + filp->private_data = info;
61385 + info->vma = NULL;
61386 +
61387 + info->idx_map = kmalloc(sizeof(unsigned long) * MAX_PENDING_REQS,
61388 + GFP_KERNEL);
61389 +
61390 + if (idx > 0) {
61391 + init_waitqueue_head(&info->wait);
61392 + for (i = 0; i < MAX_PENDING_REQS; i++)
61393 + info->idx_map[i] = INVALID_REQ;
61394 + }
61395 +
61396 + DPRINTK("Tap open: device /dev/xen/blktap%d\n",idx);
61397 + return 0;
61398 +
61399 + fail_nomem:
61400 + return -ENOMEM;
61401 +}
61402 +
61403 +static int blktap_release(struct inode *inode, struct file *filp)
61404 +{
61405 + tap_blkif_t *info = filp->private_data;
61406 +
61407 + /* check for control device */
61408 + if (!info)
61409 + return 0;
61410 +
61411 + info->dev_inuse = 0;
61412 + DPRINTK("Freeing device [/dev/xen/blktap%d]\n",info->minor);
61413 +
61414 + /* Free the ring page. */
61415 + ClearPageReserved(virt_to_page(info->ufe_ring.sring));
61416 + free_page((unsigned long) info->ufe_ring.sring);
61417 +
61418 + /* Clear any active mappings and free foreign map table */
61419 + if (info->vma) {
61420 + zap_page_range(
61421 + info->vma, info->vma->vm_start,
61422 + info->vma->vm_end - info->vma->vm_start, NULL);
61423 + info->vma = NULL;
61424 + }
61425 +
61426 + if ( (info->status != CLEANSHUTDOWN) && (info->blkif != NULL) ) {
61427 + if (info->blkif->xenblkd != NULL) {
61428 + kthread_stop(info->blkif->xenblkd);
61429 + info->blkif->xenblkd = NULL;
61430 + }
61431 + info->status = CLEANSHUTDOWN;
61432 + }
61433 + return 0;
61434 +}
61435 +
61436 +
61437 +/* Note on mmap:
61438 + * We need to map pages to user space in a way that will allow the block
61439 + * subsystem set up direct IO to them. This couldn't be done before, because
61440 + * subsystem to set up direct IO to them. This couldn't be done before, because
61441 + * physical address when the page belongs to another domain.
61442 + *
61443 + * My first approach was to map the page in to kernel memory, add an entry
61444 + * for it in the physical frame list (using alloc_lomem_region as in blkback)
61445 + * and then attempt to map that page up to user space. This is disallowed
61446 + * by xen though, which realizes that we don't really own the machine frame
61447 + * underlying the physical page.
61448 + *
61449 + * The new approach is to provide explicit support for this in xen linux.
61450 + * The VMA now has a flag, VM_FOREIGN, to indicate that it contains pages
61451 + * mapped from other vms. vma->vm_private_data is set up as a mapping
61452 + * from pages to actual page structs. There is a new clause in get_user_pages
61453 + * that does the right thing for this sort of mapping.
61454 + */
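+
+/*
+ * Resulting layout of the mapped region: the first RING_PAGES pages
+ * (rings_vstart) hold the shared ufe ring; the remaining mmap_pages pages
+ * begin at user_vstart and are addressed per request and segment via
+ * MMAP_VADDR(user_vstart, usr_idx, seg) when requests are dispatched.
+ */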
61455 +static int blktap_mmap(struct file *filp, struct vm_area_struct *vma)
61456 +{
61457 + int size;
61458 + struct page **map;
61459 + int i;
61460 + tap_blkif_t *info = filp->private_data;
61461 +
61462 + if (info == NULL) {
61463 + WPRINTK("blktap: mmap, retrieving idx failed\n");
61464 + return -ENOMEM;
61465 + }
61466 +
61467 + vma->vm_flags |= VM_RESERVED;
61468 + vma->vm_ops = &blktap_vm_ops;
61469 +
61470 + size = vma->vm_end - vma->vm_start;
61471 + if (size != ((mmap_pages + RING_PAGES) << PAGE_SHIFT)) {
61472 + WPRINTK("you _must_ map exactly %d pages!\n",
61473 + mmap_pages + RING_PAGES);
61474 + return -EAGAIN;
61475 + }
61476 +
61477 + size >>= PAGE_SHIFT;
61478 + info->rings_vstart = vma->vm_start;
61479 + info->user_vstart = info->rings_vstart + (RING_PAGES << PAGE_SHIFT);
61480 +
61481 + /* Map the ring pages to the start of the region and reserve it. */
61482 + if (remap_pfn_range(vma, vma->vm_start,
61483 + __pa(info->ufe_ring.sring) >> PAGE_SHIFT,
61484 + PAGE_SIZE, vma->vm_page_prot)) {
61485 + WPRINTK("Mapping user ring failed!\n");
61486 + goto fail;
61487 + }
61488 +
61489 + /* Mark this VM as containing foreign pages, and set up mappings. */
61490 + map = kzalloc(((vma->vm_end - vma->vm_start) >> PAGE_SHIFT)
61491 +		      * sizeof(struct page *),
61492 + GFP_KERNEL);
61493 + if (map == NULL) {
61494 + WPRINTK("Couldn't alloc VM_FOREIGN map.\n");
61495 + goto fail;
61496 + }
61497 +
61498 + for (i = 0; i < ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT); i++)
61499 + map[i] = NULL;
61500 +
61501 + vma->vm_private_data = map;
61502 + vma->vm_flags |= VM_FOREIGN;
61503 +
61504 + info->vma = vma;
61505 + info->ring_ok = 1;
61506 + return 0;
61507 + fail:
61508 + /* Clear any active mappings. */
61509 + zap_page_range(vma, vma->vm_start,
61510 + vma->vm_end - vma->vm_start, NULL);
61511 +
61512 + return -ENOMEM;
61513 +}
61514 +
61515 +
61516 +static int blktap_ioctl(struct inode *inode, struct file *filp,
61517 + unsigned int cmd, unsigned long arg)
61518 +{
61519 + tap_blkif_t *info = filp->private_data;
61520 +
61521 + switch(cmd) {
61522 + case BLKTAP_IOCTL_KICK_FE:
61523 + {
61524 + /* There are fe messages to process. */
61525 + return blktap_read_ufe_ring(info);
61526 + }
61527 + case BLKTAP_IOCTL_SETMODE:
61528 + {
61529 + if (info) {
61530 + if (BLKTAP_MODE_VALID(arg)) {
61531 + info->mode = arg;
61532 + /* XXX: may need to flush rings here. */
61533 + DPRINTK("blktap: set mode to %lx\n",
61534 + arg);
61535 + return 0;
61536 + }
61537 + }
61538 + return 0;
61539 + }
61540 + case BLKTAP_IOCTL_PRINT_IDXS:
61541 + {
61542 + if (info) {
61543 + printk("User Rings: \n-----------\n");
61544 + printk("UF: rsp_cons: %2d, req_prod_prv: %2d "
61545 + "| req_prod: %2d, rsp_prod: %2d\n",
61546 + info->ufe_ring.rsp_cons,
61547 + info->ufe_ring.req_prod_pvt,
61548 + info->ufe_ring.sring->req_prod,
61549 + info->ufe_ring.sring->rsp_prod);
61550 + }
61551 + return 0;
61552 + }
61553 + case BLKTAP_IOCTL_SENDPID:
61554 + {
61555 + if (info) {
61556 + info->pid = (pid_t)arg;
61557 + DPRINTK("blktap: pid received %d\n",
61558 + info->pid);
61559 + }
61560 + return 0;
61561 + }
61562 + case BLKTAP_IOCTL_NEWINTF:
61563 + {
61564 + uint64_t val = (uint64_t)arg;
61565 + domid_translate_t *tr = (domid_translate_t *)&val;
61566 +
61567 + DPRINTK("NEWINTF Req for domid %d and bus id %d\n",
61568 + tr->domid, tr->busid);
61569 + info = get_next_free_dev();
61570 + if (!info) {
61571 + WPRINTK("Error initialising /dev/xen/blktap - "
61572 + "No more devices\n");
61573 + return -1;
61574 + }
61575 + info->trans.domid = tr->domid;
61576 + info->trans.busid = tr->busid;
61577 + return info->minor;
61578 + }
61579 + case BLKTAP_IOCTL_FREEINTF:
61580 + {
61581 + unsigned long dev = arg;
61582 + unsigned long flags;
61583 +
61584 +		info = (dev < MAX_TAP_DEV) ? tapfds[dev] : NULL;
61585 +
61586 +		if (!info)
61587 + return 0; /* should this be an error? */
61588 +
61589 + spin_lock_irqsave(&pending_free_lock, flags);
61590 + if (info->dev_pending)
61591 + info->dev_pending = 0;
61592 + spin_unlock_irqrestore(&pending_free_lock, flags);
61593 +
61594 + return 0;
61595 + }
61596 + case BLKTAP_IOCTL_MINOR:
61597 + {
61598 + unsigned long dev = arg;
61599 +
61600 +		info = (dev < MAX_TAP_DEV) ? tapfds[dev] : NULL;
61601 +
61602 +		if (!info)
61603 + return -EINVAL;
61604 +
61605 + return info->minor;
61606 + }
61607 + case BLKTAP_IOCTL_MAJOR:
61608 + return blktap_major;
61609 +
61610 + case BLKTAP_QUERY_ALLOC_REQS:
61611 + {
61612 + WPRINTK("BLKTAP_QUERY_ALLOC_REQS ioctl: %d/%d\n",
61613 + alloc_pending_reqs, blkif_reqs);
61614 +		return (alloc_pending_reqs * 100) / blkif_reqs;
61615 + }
61616 + }
61617 + return -ENOIOCTLCMD;
61618 +}
61619 +
61620 +static unsigned int blktap_poll(struct file *filp, poll_table *wait)
61621 +{
61622 + tap_blkif_t *info = filp->private_data;
61623 +
61624 + /* do not work on the control device */
61625 + if (!info)
61626 + return 0;
61627 +
61628 + poll_wait(filp, &info->wait, wait);
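+	/*
+	 * Push any requests queued on the private end of the ufe ring by
+	 * dispatch_rw_block_io() out to the shared ring, so userspace sees
+	 * them as soon as it polls.
+	 */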
61629 + if (info->ufe_ring.req_prod_pvt != info->ufe_ring.sring->req_prod) {
61630 + RING_PUSH_REQUESTS(&info->ufe_ring);
61631 + return POLLIN | POLLRDNORM;
61632 + }
61633 + return 0;
61634 +}
61635 +
61636 +void blktap_kick_user(int idx)
61637 +{
61638 + tap_blkif_t *info;
61639 +
61640 + info = tapfds[idx];
61641 +
61642 + if ((idx < 0) || (idx > MAX_TAP_DEV) || !info)
61643 + return;
61644 +
61645 + wake_up_interruptible(&info->wait);
61646 +
61647 + return;
61648 +}
61649 +
61650 +static int do_block_io_op(blkif_t *blkif);
61651 +static void dispatch_rw_block_io(blkif_t *blkif,
61652 + blkif_request_t *req,
61653 + pending_req_t *pending_req);
61654 +static void make_response(blkif_t *blkif, unsigned long id,
61655 + unsigned short op, int st);
61656 +
61657 +/******************************************************************
61658 + * misc small helpers
61659 + */
61660 +static int req_increase(void)
61661 +{
61662 + int i, j;
61663 +
61664 + if (mmap_alloc >= MAX_PENDING_REQS || mmap_lock)
61665 + return -EINVAL;
61666 +
61667 + pending_reqs[mmap_alloc] = kzalloc(sizeof(pending_req_t)
61668 + * blkif_reqs, GFP_KERNEL);
61669 + foreign_pages[mmap_alloc] = alloc_empty_pages_and_pagevec(mmap_pages);
61670 +
61671 + if (!pending_reqs[mmap_alloc] || !foreign_pages[mmap_alloc])
61672 + goto out_of_memory;
61673 +
61674 + DPRINTK("%s: reqs=%d, pages=%d\n",
61675 + __FUNCTION__, blkif_reqs, mmap_pages);
61676 +
61677 + for (i = 0; i < MAX_PENDING_REQS; i++) {
61678 + list_add_tail(&pending_reqs[mmap_alloc][i].free_list,
61679 + &pending_free);
61680 + pending_reqs[mmap_alloc][i].mem_idx = mmap_alloc;
61681 + for (j = 0; j < BLKIF_MAX_SEGMENTS_PER_REQUEST; j++)
61682 + BLKTAP_INVALIDATE_HANDLE(&pending_handle(mmap_alloc,
61683 + i, j));
61684 + }
61685 +
61686 + mmap_alloc++;
61687 + DPRINTK("# MMAPs increased to %d\n",mmap_alloc);
61688 + return 0;
61689 +
61690 + out_of_memory:
61691 + free_empty_pages_and_pagevec(foreign_pages[mmap_alloc], mmap_pages);
61692 + kfree(pending_reqs[mmap_alloc]);
61693 + WPRINTK("%s: out of memory\n", __FUNCTION__);
61694 + return -ENOMEM;
61695 +}
61696 +
61697 +static void mmap_req_del(int mmap)
61698 +{
61699 + BUG_ON(!spin_is_locked(&pending_free_lock));
61700 +
61701 + kfree(pending_reqs[mmap]);
61702 + pending_reqs[mmap] = NULL;
61703 +
61704 +	free_empty_pages_and_pagevec(foreign_pages[mmap], mmap_pages);
61705 + foreign_pages[mmap] = NULL;
61706 +
61707 + mmap_lock = 0;
61708 + DPRINTK("# MMAPs decreased to %d\n",mmap_alloc);
61709 + mmap_alloc--;
61710 +}
61711 +
61712 +static pending_req_t* alloc_req(void)
61713 +{
61714 + pending_req_t *req = NULL;
61715 + unsigned long flags;
61716 +
61717 + spin_lock_irqsave(&pending_free_lock, flags);
61718 +
61719 + if (!list_empty(&pending_free)) {
61720 + req = list_entry(pending_free.next, pending_req_t, free_list);
61721 + list_del(&req->free_list);
61722 + }
61723 +
61724 + if (req) {
61725 + req->inuse = 1;
61726 + alloc_pending_reqs++;
61727 + }
61728 + spin_unlock_irqrestore(&pending_free_lock, flags);
61729 +
61730 + return req;
61731 +}
61732 +
61733 +static void free_req(pending_req_t *req)
61734 +{
61735 + unsigned long flags;
61736 + int was_empty;
61737 +
61738 + spin_lock_irqsave(&pending_free_lock, flags);
61739 +
61740 + alloc_pending_reqs--;
61741 + req->inuse = 0;
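+	/*
+	 * While a shrink is in progress (mmap_lock set), requests belonging
+	 * to the newest batch are not returned to the free list; the batch
+	 * is torn down once its last in-flight request completes.
+	 */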
61742 + if (mmap_lock && (req->mem_idx == mmap_alloc-1)) {
61743 + mmap_inuse--;
61744 + if (mmap_inuse == 0) mmap_req_del(mmap_alloc-1);
61745 + spin_unlock_irqrestore(&pending_free_lock, flags);
61746 + return;
61747 + }
61748 + was_empty = list_empty(&pending_free);
61749 + list_add(&req->free_list, &pending_free);
61750 +
61751 + spin_unlock_irqrestore(&pending_free_lock, flags);
61752 +
61753 + if (was_empty)
61754 + wake_up(&pending_free_wq);
61755 +}
61756 +
61757 +static void fast_flush_area(pending_req_t *req, int k_idx, int u_idx,
61758 + int tapidx)
61759 +{
61760 + struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST*2];
61761 + unsigned int i, invcount = 0;
61762 + struct grant_handle_pair *khandle;
61763 + uint64_t ptep;
61764 + int ret, mmap_idx;
61765 + unsigned long kvaddr, uvaddr;
61766 + tap_blkif_t *info;
61767 +
61768 +
61769 + info = tapfds[tapidx];
61770 +
61771 + if ((tapidx < 0) || (tapidx > MAX_TAP_DEV) || !info) {
61772 + WPRINTK("fast_flush: Couldn't get info!\n");
61773 + return;
61774 + }
61775 +
61776 + if (info->vma != NULL &&
61777 + xen_feature(XENFEAT_auto_translated_physmap)) {
61778 + down_write(&info->vma->vm_mm->mmap_sem);
61779 + zap_page_range(info->vma,
61780 + MMAP_VADDR(info->user_vstart, u_idx, 0),
61781 + req->nr_pages << PAGE_SHIFT, NULL);
61782 + up_write(&info->vma->vm_mm->mmap_sem);
61783 + }
61784 +
61785 + mmap_idx = req->mem_idx;
61786 +
61787 + for (i = 0; i < req->nr_pages; i++) {
61788 + kvaddr = idx_to_kaddr(mmap_idx, k_idx, i);
61789 + uvaddr = MMAP_VADDR(info->user_vstart, u_idx, i);
61790 +
61791 + khandle = &pending_handle(mmap_idx, k_idx, i);
61792 +
61793 + if (khandle->kernel != INVALID_GRANT_HANDLE) {
61794 + gnttab_set_unmap_op(&unmap[invcount],
61795 + idx_to_kaddr(mmap_idx, k_idx, i),
61796 + GNTMAP_host_map, khandle->kernel);
61797 + invcount++;
61798 + }
61799 +
61800 + if (khandle->user != INVALID_GRANT_HANDLE) {
61801 + BUG_ON(xen_feature(XENFEAT_auto_translated_physmap));
61802 + if (create_lookup_pte_addr(
61803 + info->vma->vm_mm,
61804 + MMAP_VADDR(info->user_vstart, u_idx, i),
61805 + &ptep) !=0) {
61806 + WPRINTK("Couldn't get a pte addr!\n");
61807 + return;
61808 + }
61809 +
61810 + gnttab_set_unmap_op(&unmap[invcount], ptep,
61811 + GNTMAP_host_map
61812 + | GNTMAP_application_map
61813 + | GNTMAP_contains_pte,
61814 + khandle->user);
61815 + invcount++;
61816 + }
61817 +
61818 + BLKTAP_INVALIDATE_HANDLE(khandle);
61819 + }
61820 + ret = HYPERVISOR_grant_table_op(
61821 + GNTTABOP_unmap_grant_ref, unmap, invcount);
61822 + BUG_ON(ret);
61823 +
61824 + if (info->vma != NULL && !xen_feature(XENFEAT_auto_translated_physmap))
61825 + zap_page_range(info->vma,
61826 + MMAP_VADDR(info->user_vstart, u_idx, 0),
61827 + req->nr_pages << PAGE_SHIFT, NULL);
61828 +}
61829 +
61830 +/******************************************************************
61831 + * SCHEDULER FUNCTIONS
61832 + */
61833 +
61834 +static void print_stats(blkif_t *blkif)
61835 +{
61836 + printk(KERN_DEBUG "%s: oo %3d | rd %4d | wr %4d\n",
61837 + current->comm, blkif->st_oo_req,
61838 + blkif->st_rd_req, blkif->st_wr_req);
61839 + blkif->st_print = jiffies + msecs_to_jiffies(10 * 1000);
61840 + blkif->st_rd_req = 0;
61841 + blkif->st_wr_req = 0;
61842 + blkif->st_oo_req = 0;
61843 +}
61844 +
61845 +int tap_blkif_schedule(void *arg)
61846 +{
61847 + blkif_t *blkif = arg;
61848 +
61849 + blkif_get(blkif);
61850 +
61851 + if (debug_lvl)
61852 + printk(KERN_DEBUG "%s: started\n", current->comm);
61853 +
61854 + while (!kthread_should_stop()) {
61855 + wait_event_interruptible(
61856 + blkif->wq,
61857 + blkif->waiting_reqs || kthread_should_stop());
61858 + wait_event_interruptible(
61859 + pending_free_wq,
61860 + !list_empty(&pending_free) || kthread_should_stop());
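+		/*
+		 * The waits above return when there is queued work and at
+		 * least one free pending_req, or when the thread is asked
+		 * to stop.
+		 */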
61861 +
61862 + blkif->waiting_reqs = 0;
61863 + smp_mb(); /* clear flag *before* checking for work */
61864 +
61865 + if (do_block_io_op(blkif))
61866 + blkif->waiting_reqs = 1;
61867 +
61868 + if (log_stats && time_after(jiffies, blkif->st_print))
61869 + print_stats(blkif);
61870 + }
61871 +
61872 + if (log_stats)
61873 + print_stats(blkif);
61874 + if (debug_lvl)
61875 + printk(KERN_DEBUG "%s: exiting\n", current->comm);
61876 +
61877 + blkif->xenblkd = NULL;
61878 + blkif_put(blkif);
61879 +
61880 + return 0;
61881 +}
61882 +
61883 +/******************************************************************
61884 + * COMPLETION CALLBACK -- Called by user level ioctl()
61885 + */
61886 +
61887 +static int blktap_read_ufe_ring(tap_blkif_t *info)
61888 +{
61889 + /* This is called to read responses from the UFE ring. */
61890 + RING_IDX i, j, rp;
61891 + blkif_response_t *resp;
61892 + blkif_t *blkif=NULL;
61893 + int pending_idx, usr_idx, mmap_idx;
61894 + pending_req_t *pending_req;
61895 +
61896 + if (!info)
61897 + return 0;
61898 +
61899 + /* We currently only forward packets in INTERCEPT_FE mode. */
61900 + if (!(info->mode & BLKTAP_MODE_INTERCEPT_FE))
61901 + return 0;
61902 +
61903 + /* for each outstanding message on the UFEring */
61904 + rp = info->ufe_ring.sring->rsp_prod;
61905 + rmb();
61906 +
61907 + for (i = info->ufe_ring.rsp_cons; i != rp; i++) {
61908 + blkif_response_t res;
61909 + resp = RING_GET_RESPONSE(&info->ufe_ring, i);
61910 + memcpy(&res, resp, sizeof(res));
61911 + mb(); /* rsp_cons read by RING_FULL() in do_block_io_op(). */
61912 + ++info->ufe_ring.rsp_cons;
61913 +
61914 + /*retrieve [usr_idx] to [mmap_idx,pending_idx] mapping*/
61915 + usr_idx = (int)res.id;
61916 + pending_idx = MASK_PEND_IDX(ID_TO_IDX(info->idx_map[usr_idx]));
61917 + mmap_idx = ID_TO_MIDX(info->idx_map[usr_idx]);
61918 +
61919 +		if ((mmap_idx >= mmap_alloc) ||
61920 +		    (ID_TO_IDX(info->idx_map[usr_idx]) >= MAX_PENDING_REQS)) {
61921 +			WPRINTK("Incorrect req map [%d], "
61922 +				"internal map [%d,%d (%d)]\n", usr_idx, mmap_idx,
61923 +				ID_TO_IDX(info->idx_map[usr_idx]),
61924 +				MASK_PEND_IDX(ID_TO_IDX(info->idx_map[usr_idx])));
61925 +			continue;
61926 +		}
61927 +
61928 + pending_req = &pending_reqs[mmap_idx][pending_idx];
61929 + blkif = pending_req->blkif;
61930 +
61931 + for (j = 0; j < pending_req->nr_pages; j++) {
61932 +
61933 + unsigned long kvaddr, uvaddr;
61934 + struct page **map = info->vma->vm_private_data;
61935 + struct page *pg;
61936 + int offset;
61937 +
61938 + uvaddr = MMAP_VADDR(info->user_vstart, usr_idx, j);
61939 + kvaddr = idx_to_kaddr(mmap_idx, pending_idx, j);
61940 +
61941 + pg = pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT);
61942 + ClearPageReserved(pg);
61943 + offset = (uvaddr - info->vma->vm_start)
61944 + >> PAGE_SHIFT;
61945 + map[offset] = NULL;
61946 + }
61947 + fast_flush_area(pending_req, pending_idx, usr_idx, info->minor);
61948 + info->idx_map[usr_idx] = INVALID_REQ;
61949 + make_response(blkif, pending_req->id, res.operation,
61950 + res.status);
61951 + blkif_put(pending_req->blkif);
61952 + free_req(pending_req);
61953 + }
61954 +
61955 + return 0;
61956 +}
61957 +
61958 +
61959 +/******************************************************************************
61960 + * NOTIFICATION FROM GUEST OS.
61961 + */
61962 +
61963 +static void blkif_notify_work(blkif_t *blkif)
61964 +{
61965 + blkif->waiting_reqs = 1;
61966 + wake_up(&blkif->wq);
61967 +}
61968 +
61969 +irqreturn_t tap_blkif_be_int(int irq, void *dev_id, struct pt_regs *regs)
61970 +{
61971 + blkif_notify_work(dev_id);
61972 + return IRQ_HANDLED;
61973 +}
61974 +
61975 +
61976 +
61977 +/******************************************************************
61978 + * DOWNWARD CALLS -- These interface with the block-device layer proper.
61979 + */
61980 +static int print_dbug = 1;
61981 +static int do_block_io_op(blkif_t *blkif)
61982 +{
61983 + blkif_back_ring_t *blk_ring = &blkif->blk_ring;
61984 + blkif_request_t req;
61985 + pending_req_t *pending_req;
61986 + RING_IDX rc, rp;
61987 + int more_to_do = 0;
61988 + tap_blkif_t *info;
61989 +
61990 + rc = blk_ring->req_cons;
61991 + rp = blk_ring->sring->req_prod;
61992 + rmb(); /* Ensure we see queued requests up to 'rp'. */
61993 +
61994 + /*Check blkif has corresponding UE ring*/
61995 + if (blkif->dev_num < 0) {
61996 + /*oops*/
61997 + if (print_dbug) {
61998 + WPRINTK("Corresponding UE "
61999 + "ring does not exist!\n");
62000 + print_dbug = 0; /*We only print this message once*/
62001 + }
62002 + return 0;
62003 + }
62004 +
62005 + info = tapfds[blkif->dev_num];
62006 +
62007 + if (blkif->dev_num > MAX_TAP_DEV || !info || !info->dev_inuse) {
62008 + if (print_dbug) {
62009 + WPRINTK("Can't get UE info!\n");
62010 + print_dbug = 0;
62011 + }
62012 + return 0;
62013 + }
62014 +
62015 + while (rc != rp) {
62016 +
62017 + if (RING_FULL(&info->ufe_ring)) {
62018 + WPRINTK("RING_FULL! More to do\n");
62019 + more_to_do = 1;
62020 + break;
62021 + }
62022 +
62023 + if (RING_REQUEST_CONS_OVERFLOW(blk_ring, rc)) {
62024 + WPRINTK("RING_REQUEST_CONS_OVERFLOW!"
62025 + " More to do\n");
62026 + more_to_do = 1;
62027 + break;
62028 + }
62029 +
62030 + pending_req = alloc_req();
62031 + if (NULL == pending_req) {
62032 + blkif->st_oo_req++;
62033 + more_to_do = 1;
62034 + break;
62035 + }
62036 +
62037 + memcpy(&req, RING_GET_REQUEST(blk_ring, rc), sizeof(req));
62038 + blk_ring->req_cons = ++rc; /* before make_response() */
62039 +
62040 + switch (req.operation) {
62041 + case BLKIF_OP_READ:
62042 + blkif->st_rd_req++;
62043 + dispatch_rw_block_io(blkif, &req, pending_req);
62044 + break;
62045 +
62046 + case BLKIF_OP_WRITE:
62047 + blkif->st_wr_req++;
62048 + dispatch_rw_block_io(blkif, &req, pending_req);
62049 + break;
62050 +
62051 + default:
62052 + WPRINTK("unknown operation [%d]\n",
62053 + req.operation);
62054 + make_response(blkif, req.id, req.operation,
62055 + BLKIF_RSP_ERROR);
62056 + free_req(pending_req);
62057 + break;
62058 + }
62059 + }
62060 +
62061 + blktap_kick_user(blkif->dev_num);
62062 +
62063 + return more_to_do;
62064 +}
62065 +
62066 +static void dispatch_rw_block_io(blkif_t *blkif,
62067 + blkif_request_t *req,
62068 + pending_req_t *pending_req)
62069 +{
62070 + extern void ll_rw_block(int rw, int nr, struct buffer_head * bhs[]);
62071 + int op, operation = (req->operation == BLKIF_OP_WRITE) ? WRITE : READ;
62072 + struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST*2];
62073 + unsigned int nseg;
62074 + int ret, i;
62075 + tap_blkif_t *info;
62076 + uint64_t sector;
62077 + blkif_request_t *target;
62078 + int pending_idx = RTN_PEND_IDX(pending_req,pending_req->mem_idx);
62079 + int usr_idx;
62080 + uint16_t mmap_idx = pending_req->mem_idx;
62081 +
62082 + if (blkif->dev_num < 0 || blkif->dev_num > MAX_TAP_DEV)
62083 + goto fail_response;
62084 +
62085 + info = tapfds[blkif->dev_num];
62086 + if (info == NULL)
62087 + goto fail_response;
62088 +
62089 + /* Check we have space on user ring - should never fail. */
62090 + usr_idx = GET_NEXT_REQ(info->idx_map);
62091 + if (usr_idx == INVALID_REQ) {
62092 + BUG();
62093 + goto fail_response;
62094 + }
62095 +
62096 + /* Check that number of segments is sane. */
62097 + nseg = req->nr_segments;
62098 + if ( unlikely(nseg == 0) ||
62099 + unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST) ) {
62100 + WPRINTK("Bad number of segments in request (%d)\n", nseg);
62101 + goto fail_response;
62102 + }
62103 +
62104 + /* Make sure userspace is ready. */
62105 + if (!info->ring_ok) {
62106 + WPRINTK("blktap: ring not ready for requests!\n");
62107 + goto fail_response;
62108 + }
62109 +
62110 + if (RING_FULL(&info->ufe_ring)) {
62111 +		WPRINTK("blktap: fe_ring is full, can't add request; "
62112 +			"IO will be dropped. %d %d\n",
62113 + RING_SIZE(&info->ufe_ring),
62114 + RING_SIZE(&blkif->blk_ring));
62115 + goto fail_response;
62116 + }
62117 +
62118 + pending_req->blkif = blkif;
62119 + pending_req->id = req->id;
62120 + pending_req->operation = operation;
62121 + pending_req->status = BLKIF_RSP_OKAY;
62122 + pending_req->nr_pages = nseg;
62123 + op = 0;
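+	/*
+	 * For each segment, queue up to two grant-map operations: one maps
+	 * the granted frame into the kernel at idx_to_kaddr() so blktap can
+	 * track the page, and (unless auto-translation is in use) a second
+	 * one, flagged GNTMAP_contains_pte, installs the frame directly in
+	 * the user PTE for MMAP_VADDR(user_vstart, usr_idx, i) so that
+	 * tapdisk can access it.
+	 */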
62124 + for (i = 0; i < nseg; i++) {
62125 + unsigned long uvaddr;
62126 + unsigned long kvaddr;
62127 + uint64_t ptep;
62128 + uint32_t flags;
62129 +
62130 + uvaddr = MMAP_VADDR(info->user_vstart, usr_idx, i);
62131 + kvaddr = idx_to_kaddr(mmap_idx, pending_idx, i);
62132 +
62133 + sector = req->sector_number + ((PAGE_SIZE / 512) * i);
62134 + if( (blkif->sectors > 0) && (sector >= blkif->sectors) ) {
62135 +			WPRINTK("BLKTAP: Sector request greater "
62136 +				"than size\n");
62137 +			WPRINTK("BLKTAP: %s request sector "
62138 +				"[%llu,%llu], Total [%llu]\n",
62139 + (req->operation ==
62140 + BLKIF_OP_WRITE ? "WRITE" : "READ"),
62141 + (long long unsigned) sector,
62142 + (long long unsigned) sector>>9,
62143 + (long long unsigned) blkif->sectors);
62144 + }
62145 +
62146 + flags = GNTMAP_host_map;
62147 + if (operation == WRITE)
62148 + flags |= GNTMAP_readonly;
62149 + gnttab_set_map_op(&map[op], kvaddr, flags,
62150 + req->seg[i].gref, blkif->domid);
62151 + op++;
62152 +
62153 + if (!xen_feature(XENFEAT_auto_translated_physmap)) {
62154 + /* Now map it to user. */
62155 + ret = create_lookup_pte_addr(info->vma->vm_mm,
62156 + uvaddr, &ptep);
62157 + if (ret) {
62158 + WPRINTK("Couldn't get a pte addr!\n");
62159 + goto fail_flush;
62160 + }
62161 +
62162 + flags = GNTMAP_host_map | GNTMAP_application_map
62163 + | GNTMAP_contains_pte;
62164 + if (operation == WRITE)
62165 + flags |= GNTMAP_readonly;
62166 + gnttab_set_map_op(&map[op], ptep, flags,
62167 + req->seg[i].gref, blkif->domid);
62168 + op++;
62169 + }
62170 + }
62171 +
62172 + ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, map, op);
62173 + BUG_ON(ret);
62174 +
62175 + if (!xen_feature(XENFEAT_auto_translated_physmap)) {
62176 + for (i = 0; i < (nseg*2); i+=2) {
62177 + unsigned long uvaddr;
62178 + unsigned long kvaddr;
62179 + unsigned long offset;
62180 + struct page *pg;
62181 +
62182 + uvaddr = MMAP_VADDR(info->user_vstart, usr_idx, i/2);
62183 + kvaddr = idx_to_kaddr(mmap_idx, pending_idx, i/2);
62184 +
62185 + if (unlikely(map[i].status != 0)) {
62186 + WPRINTK("invalid kernel buffer -- "
62187 + "could not remap it\n");
62188 + ret |= 1;
62189 + map[i].handle = INVALID_GRANT_HANDLE;
62190 + }
62191 +
62192 + if (unlikely(map[i+1].status != 0)) {
62193 + WPRINTK("invalid user buffer -- "
62194 + "could not remap it\n");
62195 + ret |= 1;
62196 + map[i+1].handle = INVALID_GRANT_HANDLE;
62197 + }
62198 +
62199 + pending_handle(mmap_idx, pending_idx, i/2).kernel
62200 + = map[i].handle;
62201 + pending_handle(mmap_idx, pending_idx, i/2).user
62202 + = map[i+1].handle;
62203 +
62204 + if (ret)
62205 + continue;
62206 +
62207 + set_phys_to_machine(__pa(kvaddr) >> PAGE_SHIFT,
62208 + FOREIGN_FRAME(map[i].dev_bus_addr
62209 + >> PAGE_SHIFT));
62210 + offset = (uvaddr - info->vma->vm_start) >> PAGE_SHIFT;
62211 + pg = pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT);
62212 + ((struct page **)info->vma->vm_private_data)[offset] =
62213 + pg;
62214 + }
62215 + } else {
62216 + for (i = 0; i < nseg; i++) {
62217 + unsigned long uvaddr;
62218 + unsigned long kvaddr;
62219 + unsigned long offset;
62220 + struct page *pg;
62221 +
62222 + uvaddr = MMAP_VADDR(info->user_vstart, usr_idx, i);
62223 + kvaddr = idx_to_kaddr(mmap_idx, pending_idx, i);
62224 +
62225 + if (unlikely(map[i].status != 0)) {
62226 + WPRINTK("invalid kernel buffer -- "
62227 + "could not remap it\n");
62228 + ret |= 1;
62229 + map[i].handle = INVALID_GRANT_HANDLE;
62230 + }
62231 +
62232 + pending_handle(mmap_idx, pending_idx, i).kernel
62233 + = map[i].handle;
62234 +
62235 + if (ret)
62236 + continue;
62237 +
62238 + offset = (uvaddr - info->vma->vm_start) >> PAGE_SHIFT;
62239 + pg = pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT);
62240 + ((struct page **)info->vma->vm_private_data)[offset] =
62241 + pg;
62242 + }
62243 + }
62244 +
62245 + if (ret)
62246 + goto fail_flush;
62247 +
62248 + if (xen_feature(XENFEAT_auto_translated_physmap))
62249 + down_write(&info->vma->vm_mm->mmap_sem);
62250 + /* Mark mapped pages as reserved: */
62251 + for (i = 0; i < req->nr_segments; i++) {
62252 + unsigned long kvaddr;
62253 + struct page *pg;
62254 +
62255 + kvaddr = idx_to_kaddr(mmap_idx, pending_idx, i);
62256 + pg = pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT);
62257 + SetPageReserved(pg);
62258 + if (xen_feature(XENFEAT_auto_translated_physmap)) {
62259 + ret = vm_insert_page(info->vma,
62260 + MMAP_VADDR(info->user_vstart,
62261 + usr_idx, i), pg);
62262 + if (ret) {
62263 + up_write(&info->vma->vm_mm->mmap_sem);
62264 + goto fail_flush;
62265 + }
62266 + }
62267 + }
62268 + if (xen_feature(XENFEAT_auto_translated_physmap))
62269 + up_write(&info->vma->vm_mm->mmap_sem);
62270 +
62271 + /*record [mmap_idx,pending_idx] to [usr_idx] mapping*/
62272 + info->idx_map[usr_idx] = MAKE_ID(mmap_idx, pending_idx);
62273 +
62274 + blkif_get(blkif);
62275 + /* Finally, write the request message to the user ring. */
62276 + target = RING_GET_REQUEST(&info->ufe_ring,
62277 + info->ufe_ring.req_prod_pvt);
62278 + memcpy(target, req, sizeof(*req));
62279 + target->id = usr_idx;
62280 + wmb(); /* blktap_poll() reads req_prod_pvt asynchronously */
62281 + info->ufe_ring.req_prod_pvt++;
62282 + return;
62283 +
62284 + fail_flush:
62285 + WPRINTK("Reached Fail_flush\n");
62286 + fast_flush_area(pending_req, pending_idx, usr_idx, blkif->dev_num);
62287 + fail_response:
62288 + make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR);
62289 + free_req(pending_req);
62290 +}
62291 +
62292 +
62293 +
62294 +/******************************************************************
62295 + * MISCELLANEOUS SETUP / TEARDOWN / DEBUGGING
62296 + */
62297 +
62298 +
62299 +static void make_response(blkif_t *blkif, unsigned long id,
62300 + unsigned short op, int st)
62301 +{
62302 + blkif_response_t *resp;
62303 + unsigned long flags;
62304 + blkif_back_ring_t *blk_ring = &blkif->blk_ring;
62305 + int more_to_do = 0;
62306 + int notify;
62307 +
62308 + spin_lock_irqsave(&blkif->blk_ring_lock, flags);
62309 + /* Place on the response ring for the relevant domain. */
62310 + resp = RING_GET_RESPONSE(blk_ring, blk_ring->rsp_prod_pvt);
62311 + resp->id = id;
62312 + resp->operation = op;
62313 + resp->status = st;
62314 + blk_ring->rsp_prod_pvt++;
62315 + RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(blk_ring, notify);
62316 +
62317 + if (blk_ring->rsp_prod_pvt == blk_ring->req_cons) {
62318 + /*
62319 + * Tail check for pending requests. Allows frontend to avoid
62320 + * notifications if requests are already in flight (lower
62321 + * overheads and promotes batching).
62322 + */
62323 + RING_FINAL_CHECK_FOR_REQUESTS(blk_ring, more_to_do);
62324 + } else if (RING_HAS_UNCONSUMED_REQUESTS(blk_ring)) {
62325 + more_to_do = 1;
62326 +
62327 + }
62328 + spin_unlock_irqrestore(&blkif->blk_ring_lock, flags);
62329 + if (more_to_do)
62330 + blkif_notify_work(blkif);
62331 + if (notify)
62332 + notify_remote_via_irq(blkif->irq);
62333 +}
62334 +
62335 +static int __init blkif_init(void)
62336 +{
62337 + int i,ret,blktap_dir;
62338 +
62339 + if (!is_running_on_xen())
62340 + return -ENODEV;
62341 +
62342 + INIT_LIST_HEAD(&pending_free);
62343 + for(i = 0; i < 2; i++) {
62344 + ret = req_increase();
62345 + if (ret)
62346 + break;
62347 + }
62348 + if (i == 0)
62349 + return ret;
62350 +
62351 + tap_blkif_interface_init();
62352 +
62353 + alloc_pending_reqs = 0;
62354 +
62355 + tap_blkif_xenbus_init();
62356 +
62357 + /* Dynamically allocate a major for this device */
62358 + ret = register_chrdev(0, "blktap", &blktap_fops);
62359 + blktap_dir = devfs_mk_dir(NULL, "xen", 0, NULL);
62360 +
62361 + if ( (ret < 0)||(blktap_dir < 0) ) {
62362 + WPRINTK("Couldn't register /dev/xen/blktap\n");
62363 + return -ENOMEM;
62364 + }
62365 +
62366 + blktap_major = ret;
62367 +
62368 + /* tapfds[0] is always NULL */
62369 + blktap_next_minor++;
62370 +
62371 +	ret = devfs_mk_cdev(MKDEV(blktap_major, 0),
62372 +			    S_IFCHR|S_IRUGO|S_IWUSR, "xen/blktap%d", 0);
62373 +
62374 + if(ret != 0)
62375 + return -ENOMEM;
62376 +
62377 +	DPRINTK("Created misc_dev [/dev/xen/blktap%d]\n", 0);
62378 +
62379 + /* Make sure the xen class exists */
62380 + if (!setup_xen_class()) {
62381 + /*
62382 + * This will allow udev to create the blktap ctrl device.
62383 + * We only want to create blktap0 first. We don't want
62384 + * to flood the sysfs system with needless blktap devices.
62385 + * We only create the device when a request of a new device is
62386 + * made.
62387 + */
62388 + class_device_create(xen_class, NULL,
62389 + MKDEV(blktap_major, 0), NULL,
62390 + "blktap0");
62391 + } else {
62392 + /* this is bad, but not fatal */
62393 + WPRINTK("blktap: sysfs xen_class not created\n");
62394 + }
62395 +
62396 + DPRINTK("Blktap device successfully created\n");
62397 +
62398 + return 0;
62399 +}
62400 +
62401 +module_init(blkif_init);
62402 +
62403 +MODULE_LICENSE("Dual BSD/GPL");
62404 diff -Nur linux-2.6.16.33-noxen/drivers/xen/blktap/common.h linux-2.6.16.33/drivers/xen/blktap/common.h
62405 --- linux-2.6.16.33-noxen/drivers/xen/blktap/common.h 1970-01-01 00:00:00.000000000 +0000
62406 +++ linux-2.6.16.33/drivers/xen/blktap/common.h 2007-01-08 15:00:45.000000000 +0000
62407 @@ -0,0 +1,121 @@
62408 +/*
62409 + * This program is free software; you can redistribute it and/or
62410 + * modify it under the terms of the GNU General Public License version 2
62411 + * as published by the Free Software Foundation; or, when distributed
62412 + * separately from the Linux kernel or incorporated into other
62413 + * software packages, subject to the following license:
62414 + *
62415 + * Permission is hereby granted, free of charge, to any person obtaining a copy
62416 + * of this source file (the "Software"), to deal in the Software without
62417 + * restriction, including without limitation the rights to use, copy, modify,
62418 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
62419 + * and to permit persons to whom the Software is furnished to do so, subject to
62420 + * the following conditions:
62421 + *
62422 + * The above copyright notice and this permission notice shall be included in
62423 + * all copies or substantial portions of the Software.
62424 + *
62425 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
62426 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
62427 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
62428 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
62429 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
62430 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
62431 + * IN THE SOFTWARE.
62432 + */
62433 +
62434 +#ifndef __BLKIF__BACKEND__COMMON_H__
62435 +#define __BLKIF__BACKEND__COMMON_H__
62436 +
62437 +#include <linux/config.h>
62438 +#include <linux/version.h>
62439 +#include <linux/module.h>
62440 +#include <linux/interrupt.h>
62441 +#include <linux/slab.h>
62442 +#include <linux/blkdev.h>
62443 +#include <linux/vmalloc.h>
62444 +#include <asm/io.h>
62445 +#include <asm/setup.h>
62446 +#include <asm/pgalloc.h>
62447 +#include <xen/evtchn.h>
62448 +#include <asm/hypervisor.h>
62449 +#include <xen/interface/io/blkif.h>
62450 +#include <xen/interface/io/ring.h>
62451 +#include <xen/gnttab.h>
62452 +#include <xen/driver_util.h>
62453 +
62454 +#define DPRINTK(_f, _a...) pr_debug("(file=%s, line=%d) " _f, \
62455 + __FILE__ , __LINE__ , ## _a )
62456 +
62457 +#define WPRINTK(fmt, args...) printk(KERN_WARNING "blk_tap: " fmt, ##args)
62458 +
62459 +struct backend_info;
62460 +
62461 +typedef struct blkif_st {
62462 + /* Unique identifier for this interface. */
62463 + domid_t domid;
62464 + unsigned int handle;
62465 + /* Physical parameters of the comms window. */
62466 + unsigned int evtchn;
62467 + unsigned int irq;
62468 + /* Comms information. */
62469 + blkif_back_ring_t blk_ring;
62470 + struct vm_struct *blk_ring_area;
62471 + /* Back pointer to the backend_info. */
62472 + struct backend_info *be;
62473 + /* Private fields. */
62474 + spinlock_t blk_ring_lock;
62475 + atomic_t refcnt;
62476 +
62477 + wait_queue_head_t wq;
62478 + struct task_struct *xenblkd;
62479 + unsigned int waiting_reqs;
62480 + request_queue_t *plug;
62481 +
62482 + /* statistics */
62483 + unsigned long st_print;
62484 + int st_rd_req;
62485 + int st_wr_req;
62486 + int st_oo_req;
62487 +
62488 + wait_queue_head_t waiting_to_free;
62489 +
62490 + grant_handle_t shmem_handle;
62491 + grant_ref_t shmem_ref;
62492 +
62493 + int dev_num;
62494 + uint64_t sectors;
62495 +} blkif_t;
62496 +
62497 +blkif_t *tap_alloc_blkif(domid_t domid);
62498 +void tap_blkif_free(blkif_t *blkif);
62499 +int tap_blkif_map(blkif_t *blkif, unsigned long shared_page,
62500 + unsigned int evtchn);
62501 +void tap_blkif_unmap(blkif_t *blkif);
62502 +
62503 +#define blkif_get(_b) (atomic_inc(&(_b)->refcnt))
62504 +#define blkif_put(_b) \
62505 + do { \
62506 + if (atomic_dec_and_test(&(_b)->refcnt)) \
62507 + wake_up(&(_b)->waiting_to_free);\
62508 + } while (0)
62509 +
62510 +
62511 +struct phys_req {
62512 + unsigned short dev;
62513 + unsigned short nr_sects;
62514 + struct block_device *bdev;
62515 + blkif_sector_t sector_number;
62516 +};
62517 +
62518 +void tap_blkif_interface_init(void);
62519 +
62520 +void tap_blkif_xenbus_init(void);
62521 +
62522 +irqreturn_t tap_blkif_be_int(int irq, void *dev_id, struct pt_regs *regs);
62523 +int tap_blkif_schedule(void *arg);
62524 +
62525 +int dom_to_devid(domid_t domid, int xenbus_id, blkif_t *blkif);
62526 +void signal_tapdisk(int idx);
62527 +
62528 +#endif /* __BLKIF__BACKEND__COMMON_H__ */
62529 diff -Nur linux-2.6.16.33-noxen/drivers/xen/blktap/interface.c linux-2.6.16.33/drivers/xen/blktap/interface.c
62530 --- linux-2.6.16.33-noxen/drivers/xen/blktap/interface.c 1970-01-01 00:00:00.000000000 +0000
62531 +++ linux-2.6.16.33/drivers/xen/blktap/interface.c 2007-01-08 15:00:45.000000000 +0000
62532 @@ -0,0 +1,164 @@
62533 +/******************************************************************************
62534 + * drivers/xen/blktap/interface.c
62535 + *
62536 + * Block-device interface management.
62537 + *
62538 + * Copyright (c) 2004, Keir Fraser
62539 + *
62540 + * This program is free software; you can redistribute it and/or
62541 + * modify it under the terms of the GNU General Public License version 2
62542 + * as published by the Free Software Foundation; or, when distributed
62543 + * separately from the Linux kernel or incorporated into other
62544 + * software packages, subject to the following license:
62545 + *
62546 + * Permission is hereby granted, free of charge, to any person obtaining a copy
62547 + * of this source file (the "Software"), to deal in the Software without
62548 + * restriction, including without limitation the rights to use, copy, modify,
62549 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
62550 + * and to permit persons to whom the Software is furnished to do so, subject to
62551 + * the following conditions:
62552 + *
62553 + * The above copyright notice and this permission notice shall be included in
62554 + * all copies or substantial portions of the Software.
62555 + *
62556 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
62557 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
62558 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
62559 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
62560 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
62561 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
62562 + * IN THE SOFTWARE.
62563 +
62564 + */
62565 +
62566 +#include "common.h"
62567 +#include <xen/evtchn.h>
62568 +
62569 +static kmem_cache_t *blkif_cachep;
62570 +
62571 +blkif_t *tap_alloc_blkif(domid_t domid)
62572 +{
62573 + blkif_t *blkif;
62574 +
62575 + blkif = kmem_cache_alloc(blkif_cachep, GFP_KERNEL);
62576 + if (!blkif)
62577 + return ERR_PTR(-ENOMEM);
62578 +
62579 + memset(blkif, 0, sizeof(*blkif));
62580 + blkif->domid = domid;
62581 + spin_lock_init(&blkif->blk_ring_lock);
62582 + atomic_set(&blkif->refcnt, 1);
62583 + init_waitqueue_head(&blkif->wq);
62584 + blkif->st_print = jiffies;
62585 + init_waitqueue_head(&blkif->waiting_to_free);
62586 +
62587 + return blkif;
62588 +}
62589 +
62590 +static int map_frontend_page(blkif_t *blkif, unsigned long shared_page)
62591 +{
62592 + struct gnttab_map_grant_ref op;
62593 + int ret;
62594 +
62595 + gnttab_set_map_op(&op, (unsigned long)blkif->blk_ring_area->addr,
62596 + GNTMAP_host_map, shared_page, blkif->domid);
62597 +
62598 + lock_vm_area(blkif->blk_ring_area);
62599 + ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1);
62600 + unlock_vm_area(blkif->blk_ring_area);
62601 + BUG_ON(ret);
62602 +
62603 + if (op.status) {
62604 + DPRINTK(" Grant table operation failure !\n");
62605 + return op.status;
62606 + }
62607 +
62608 + blkif->shmem_ref = shared_page;
62609 + blkif->shmem_handle = op.handle;
62610 +
62611 + return 0;
62612 +}
62613 +
62614 +static void unmap_frontend_page(blkif_t *blkif)
62615 +{
62616 + struct gnttab_unmap_grant_ref op;
62617 + int ret;
62618 +
62619 + gnttab_set_unmap_op(&op, (unsigned long)blkif->blk_ring_area->addr,
62620 + GNTMAP_host_map, blkif->shmem_handle);
62621 +
62622 + lock_vm_area(blkif->blk_ring_area);
62623 + ret = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1);
62624 + unlock_vm_area(blkif->blk_ring_area);
62625 + BUG_ON(ret);
62626 +}
62627 +
62628 +int tap_blkif_map(blkif_t *blkif, unsigned long shared_page,
62629 + unsigned int evtchn)
62630 +{
62631 + blkif_sring_t *sring;
62632 + int err;
62633 + struct evtchn_bind_interdomain bind_interdomain;
62634 +
62635 + /* Already connected through? */
62636 + if (blkif->irq)
62637 + return 0;
62638 +
62639 + if ( (blkif->blk_ring_area = alloc_vm_area(PAGE_SIZE)) == NULL )
62640 + return -ENOMEM;
62641 +
62642 + err = map_frontend_page(blkif, shared_page);
62643 + if (err) {
62644 + free_vm_area(blkif->blk_ring_area);
62645 + return err;
62646 + }
62647 +
62648 + bind_interdomain.remote_dom = blkif->domid;
62649 + bind_interdomain.remote_port = evtchn;
62650 +
62651 + err = HYPERVISOR_event_channel_op(EVTCHNOP_bind_interdomain,
62652 + &bind_interdomain);
62653 + if (err) {
62654 + unmap_frontend_page(blkif);
62655 + free_vm_area(blkif->blk_ring_area);
62656 + return err;
62657 + }
62658 +
62659 + blkif->evtchn = bind_interdomain.local_port;
62660 +
62661 + sring = (blkif_sring_t *)blkif->blk_ring_area->addr;
62662 + BACK_RING_INIT(&blkif->blk_ring, sring, PAGE_SIZE);
62663 +
62664 + blkif->irq = bind_evtchn_to_irqhandler(
62665 + blkif->evtchn, tap_blkif_be_int, 0, "blkif-backend", blkif);
62666 +
62667 + return 0;
62668 +}
62669 +
62670 +void tap_blkif_unmap(blkif_t *blkif)
62671 +{
62672 + if (blkif->irq) {
62673 + unbind_from_irqhandler(blkif->irq, blkif);
62674 + blkif->irq = 0;
62675 + }
62676 + if (blkif->blk_ring.sring) {
62677 + unmap_frontend_page(blkif);
62678 + free_vm_area(blkif->blk_ring_area);
62679 + blkif->blk_ring.sring = NULL;
62680 + }
62681 +}
62682 +
62683 +void tap_blkif_free(blkif_t *blkif)
62684 +{
62685 + atomic_dec(&blkif->refcnt);
62686 + wait_event(blkif->waiting_to_free, atomic_read(&blkif->refcnt) == 0);
62687 +
62688 + tap_blkif_unmap(blkif);
62689 + kmem_cache_free(blkif_cachep, blkif);
62690 +}
62691 +
62692 +void __init tap_blkif_interface_init(void)
62693 +{
62694 + blkif_cachep = kmem_cache_create("blktapif_cache", sizeof(blkif_t),
62695 + 0, 0, NULL, NULL);
62696 +}
62697 diff -Nur linux-2.6.16.33-noxen/drivers/xen/blktap/xenbus.c linux-2.6.16.33/drivers/xen/blktap/xenbus.c
62698 --- linux-2.6.16.33-noxen/drivers/xen/blktap/xenbus.c 1970-01-01 00:00:00.000000000 +0000
62699 +++ linux-2.6.16.33/drivers/xen/blktap/xenbus.c 2007-01-08 15:00:45.000000000 +0000
62700 @@ -0,0 +1,366 @@
62701 +/* drivers/xen/blktap/xenbus.c
62702 + *
62703 + * Xenbus code for blktap
62704 + *
62705 + * Copyright (c) 2004-2005, Andrew Warfield and Julian Chesterfield
62706 + *
62707 + * Based on the blkback xenbus code:
62708 + *
62709 + * Copyright (C) 2005 Rusty Russell <rusty@rustcorp.com.au>
62710 + * Copyright (C) 2005 XenSource Ltd
62711 + *
62712 + * This program is free software; you can redistribute it and/or
62713 + * modify it under the terms of the GNU General Public License version 2
62714 + * as published by the Free Software Foundation; or, when distributed
62715 + * separately from the Linux kernel or incorporated into other
62716 + * software packages, subject to the following license:
62717 + *
62718 + * Permission is hereby granted, free of charge, to any person obtaining a copy
62719 + * of this source file (the "Software"), to deal in the Software without
62720 + * restriction, including without limitation the rights to use, copy, modify,
62721 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
62722 + * and to permit persons to whom the Software is furnished to do so, subject to
62723 + * the following conditions:
62724 + *
62725 + * The above copyright notice and this permission notice shall be included in
62726 + * all copies or substantial portions of the Software.
62727 + *
62728 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
62729 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
62730 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
62731 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
62732 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
62733 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
62734 + * IN THE SOFTWARE.
62735 + */
62736 +
62737 +#include <stdarg.h>
62738 +#include <linux/module.h>
62739 +#include <linux/kthread.h>
62740 +#include <xen/xenbus.h>
62741 +#include "common.h"
62742 +
62743 +
62744 +struct backend_info
62745 +{
62746 + struct xenbus_device *dev;
62747 + blkif_t *blkif;
62748 + struct xenbus_watch backend_watch;
62749 + int xenbus_id;
62750 +};
62751 +
62752 +
62753 +static void connect(struct backend_info *);
62754 +static int connect_ring(struct backend_info *);
62755 +static int blktap_remove(struct xenbus_device *dev);
62756 +static int blktap_probe(struct xenbus_device *dev,
62757 + const struct xenbus_device_id *id);
62758 +static void tap_backend_changed(struct xenbus_watch *, const char **,
62759 + unsigned int);
62760 +static void tap_frontend_changed(struct xenbus_device *dev,
62761 + enum xenbus_state frontend_state);
62762 +
62763 +static int strsep_len(const char *str, char c, unsigned int len)
62764 +{
62765 + unsigned int i;
62766 +
62767 + for (i = 0; str[i]; i++)
62768 + if (str[i] == c) {
62769 + if (len == 0)
62770 + return i;
62771 + len--;
62772 + }
62773 + return (len == 0) ? i : -ERANGE;
62774 +}
62775 +
62776 +static long get_id(const char *str)
62777 +{
62778 + int len,end;
62779 + const char *ptr;
62780 + char *tptr, num[10];
62781 +
62782 + len = strsep_len(str, '/', 2);
62783 + end = strlen(str);
62784 + if ( (len < 0) || (end < 0) ) return -1;
62785 +
62786 + ptr = str + len + 1;
62787 + strncpy(num,ptr,end - len);
62788 + tptr = num + (end - (len + 1));
62789 + *tptr = '\0';
62790 + DPRINTK("Get_id called for %s (%s)\n",str,num);
62791 +
62792 + return simple_strtol(num, NULL, 10);
62793 +}
62794 +
62795 +static void tap_update_blkif_status(blkif_t *blkif)
62796 +{
62797 + int err;
62798 +
62799 + /* Not ready to connect? */
62800 + if(!blkif->irq || !blkif->sectors) {
62801 + return;
62802 + }
62803 +
62804 + /* Already connected? */
62805 + if (blkif->be->dev->state == XenbusStateConnected)
62806 + return;
62807 +
62808 + /* Attempt to connect: exit if we fail to. */
62809 + connect(blkif->be);
62810 + if (blkif->be->dev->state != XenbusStateConnected)
62811 + return;
62812 +
62813 + blkif->xenblkd = kthread_run(tap_blkif_schedule, blkif,
62814 + "xvd %d",
62815 + blkif->domid);
62816 +
62817 + if (IS_ERR(blkif->xenblkd)) {
62818 + err = PTR_ERR(blkif->xenblkd);
62819 + blkif->xenblkd = NULL;
62820 + xenbus_dev_fatal(blkif->be->dev, err, "start xenblkd");
62821 + WPRINTK("Error starting thread\n");
62822 + }
62823 +}
62824 +
62825 +static int blktap_remove(struct xenbus_device *dev)
62826 +{
62827 + struct backend_info *be = dev->dev.driver_data;
62828 +
62829 + if (be->backend_watch.node) {
62830 + unregister_xenbus_watch(&be->backend_watch);
62831 + kfree(be->backend_watch.node);
62832 + be->backend_watch.node = NULL;
62833 + }
62834 + if (be->blkif) {
62835 + if (be->blkif->xenblkd)
62836 + kthread_stop(be->blkif->xenblkd);
62837 + signal_tapdisk(be->blkif->dev_num);
62838 + tap_blkif_free(be->blkif);
62839 + be->blkif = NULL;
62840 + }
62841 + kfree(be);
62842 + dev->dev.driver_data = NULL;
62843 + return 0;
62844 +}
62845 +
62846 +/**
62847 + * Entry point to this code when a new device is created. Allocate
62848 + * the basic structures, and watch the store waiting for the
62849 + * user-space program to tell us the physical device info. Switch to
62850 + * InitWait.
62851 + */
62852 +static int blktap_probe(struct xenbus_device *dev,
62853 + const struct xenbus_device_id *id)
62854 +{
62855 + int err;
62856 + struct backend_info *be = kzalloc(sizeof(struct backend_info),
62857 + GFP_KERNEL);
62858 + if (!be) {
62859 + xenbus_dev_fatal(dev, -ENOMEM,
62860 + "allocating backend structure");
62861 + return -ENOMEM;
62862 + }
62863 +
62864 + be->dev = dev;
62865 + dev->dev.driver_data = be;
62866 + be->xenbus_id = get_id(dev->nodename);
62867 +
62868 + be->blkif = tap_alloc_blkif(dev->otherend_id);
62869 + if (IS_ERR(be->blkif)) {
62870 + err = PTR_ERR(be->blkif);
62871 + be->blkif = NULL;
62872 + xenbus_dev_fatal(dev, err, "creating block interface");
62873 + goto fail;
62874 + }
62875 +
62876 + /* setup back pointer */
62877 + be->blkif->be = be;
62878 + be->blkif->sectors = 0;
62879 +
62880 + /* set a watch on disk info, waiting for userspace to update details*/
62881 + err = xenbus_watch_path2(dev, dev->nodename, "info",
62882 + &be->backend_watch, tap_backend_changed);
62883 + if (err)
62884 + goto fail;
62885 +
62886 + err = xenbus_switch_state(dev, XenbusStateInitWait);
62887 + if (err)
62888 + goto fail;
62889 + return 0;
62890 +
62891 +fail:
62892 + DPRINTK("blktap probe failed\n");
62893 + blktap_remove(dev);
62894 + return err;
62895 +}
62896 +
62897 +
62898 +/**
62899 + * Callback received when the user space code has placed the device
62900 + * information in xenstore.
62901 + */
62902 +static void tap_backend_changed(struct xenbus_watch *watch,
62903 + const char **vec, unsigned int len)
62904 +{
62905 + int err;
62906 + unsigned long info;
62907 + struct backend_info *be
62908 + = container_of(watch, struct backend_info, backend_watch);
62909 + struct xenbus_device *dev = be->dev;
62910 +
62911 + /**
62912 +	 * Check to see whether the userspace code has opened the
62913 +	 * image and written the sector and disk info to
62914 +	 * xenstore.
62915 + */
62916 + err = xenbus_gather(XBT_NIL, dev->nodename, "info", "%lu", &info,
62917 + NULL);
62918 + if (err) {
62919 + xenbus_dev_error(dev, err, "getting info");
62920 + return;
62921 + }
62922 +
62923 + DPRINTK("Userspace update on disk info, %lu\n",info);
62924 +
62925 + err = xenbus_gather(XBT_NIL, dev->nodename, "sectors", "%llu",
62926 + &be->blkif->sectors, NULL);
62927 +
62928 + /* Associate tap dev with domid*/
62929 + be->blkif->dev_num = dom_to_devid(be->blkif->domid, be->xenbus_id,
62930 + be->blkif);
62931 + DPRINTK("Thread started for domid [%d], connecting disk\n",
62932 + be->blkif->dev_num);
62933 +
62934 + tap_update_blkif_status(be->blkif);
62935 +}
62936 +
62937 +/**
62938 + * Callback received when the frontend's state changes.
62939 + */
62940 +static void tap_frontend_changed(struct xenbus_device *dev,
62941 + enum xenbus_state frontend_state)
62942 +{
62943 + struct backend_info *be = dev->dev.driver_data;
62944 + int err;
62945 +
62946 + DPRINTK("\n");
62947 +
62948 + switch (frontend_state) {
62949 + case XenbusStateInitialising:
62950 + if (dev->state == XenbusStateClosed) {
62951 + printk("%s: %s: prepare for reconnect\n",
62952 + __FUNCTION__, dev->nodename);
62953 + xenbus_switch_state(dev, XenbusStateInitWait);
62954 + }
62955 + break;
62956 +
62957 + case XenbusStateInitialised:
62958 + case XenbusStateConnected:
62959 + /* Ensure we connect even when two watches fire in
62960 +		   close succession and we miss the intermediate value
62961 + of frontend_state. */
62962 + if (dev->state == XenbusStateConnected)
62963 + break;
62964 +
62965 + err = connect_ring(be);
62966 + if (err)
62967 + break;
62968 + tap_update_blkif_status(be->blkif);
62969 + break;
62970 +
62971 + case XenbusStateClosing:
62972 + if (be->blkif->xenblkd) {
62973 + kthread_stop(be->blkif->xenblkd);
62974 + be->blkif->xenblkd = NULL;
62975 + }
62976 + xenbus_switch_state(dev, XenbusStateClosing);
62977 + break;
62978 +
62979 + case XenbusStateClosed:
62980 + xenbus_switch_state(dev, XenbusStateClosed);
62981 + if (xenbus_dev_is_online(dev))
62982 + break;
62983 + /* fall through if not online */
62984 + case XenbusStateUnknown:
62985 + device_unregister(&dev->dev);
62986 + break;
62987 +
62988 + default:
62989 + xenbus_dev_fatal(dev, -EINVAL, "saw state %d at frontend",
62990 + frontend_state);
62991 + break;
62992 + }
62993 +}
62994 +
62995 +
62996 +/**
62997 + * Switch to Connected state.
62998 + */
62999 +static void connect(struct backend_info *be)
63000 +{
63001 + int err;
63002 +
63003 + struct xenbus_device *dev = be->dev;
63004 +
63005 + err = xenbus_switch_state(dev, XenbusStateConnected);
63006 + if (err)
63007 +		xenbus_dev_fatal(dev, err, "%s: switching to Connected state",
63008 +				 dev->nodename);
63009 +
63010 + return;
63011 +}
63012 +
63013 +
63014 +static int connect_ring(struct backend_info *be)
63015 +{
63016 + struct xenbus_device *dev = be->dev;
63017 + unsigned long ring_ref;
63018 + unsigned int evtchn;
63019 + int err;
63020 +
63021 + DPRINTK("%s\n", dev->otherend);
63022 +
63023 + err = xenbus_gather(XBT_NIL, dev->otherend, "ring-ref", "%lu",
63024 + &ring_ref, "event-channel", "%u", &evtchn, NULL);
63025 + if (err) {
63026 + xenbus_dev_fatal(dev, err,
63027 + "reading %s/ring-ref and event-channel",
63028 + dev->otherend);
63029 + return err;
63030 + }
63031 +
63032 + /* Map the shared frame, irq etc. */
63033 + err = tap_blkif_map(be->blkif, ring_ref, evtchn);
63034 + if (err) {
63035 + xenbus_dev_fatal(dev, err, "mapping ring-ref %lu port %u",
63036 + ring_ref, evtchn);
63037 + return err;
63038 + }
63039 +
63040 + return 0;
63041 +}
63042 +
63043 +
63044 +/* ** Driver Registration ** */
63045 +
63046 +
63047 +static struct xenbus_device_id blktap_ids[] = {
63048 + { "tap" },
63049 + { "" }
63050 +};
63051 +
63052 +
63053 +static struct xenbus_driver blktap = {
63054 + .name = "tap",
63055 + .owner = THIS_MODULE,
63056 + .ids = blktap_ids,
63057 + .probe = blktap_probe,
63058 + .remove = blktap_remove,
63059 + .otherend_changed = tap_frontend_changed
63060 +};
63061 +
63062 +
63063 +void tap_blkif_xenbus_init(void)
63064 +{
63065 + xenbus_register_backend(&blktap);
63066 +}
63067 diff -Nur linux-2.6.16.33-noxen/drivers/xen/char/Makefile linux-2.6.16.33/drivers/xen/char/Makefile
63068 --- linux-2.6.16.33-noxen/drivers/xen/char/Makefile 1970-01-01 00:00:00.000000000 +0000
63069 +++ linux-2.6.16.33/drivers/xen/char/Makefile 2007-01-08 15:00:45.000000000 +0000
63070 @@ -0,0 +1,2 @@
63071 +
63072 +obj-y := mem.o
63073 diff -Nur linux-2.6.16.33-noxen/drivers/xen/char/mem.c linux-2.6.16.33/drivers/xen/char/mem.c
63074 --- linux-2.6.16.33-noxen/drivers/xen/char/mem.c 1970-01-01 00:00:00.000000000 +0000
63075 +++ linux-2.6.16.33/drivers/xen/char/mem.c 2007-01-08 15:00:45.000000000 +0000
63076 @@ -0,0 +1,205 @@
63077 +/*
63078 + * Originally from linux/drivers/char/mem.c
63079 + *
63080 + * Copyright (C) 1991, 1992 Linus Torvalds
63081 + *
63082 + * Added devfs support.
63083 + * Jan-11-1998, C. Scott Ananian <cananian@alumni.princeton.edu>
63084 + * Shared /dev/zero mmaping support, Feb 2000, Kanoj Sarcar <kanoj@sgi.com>
63085 + */
63086 +
63087 +#include <linux/config.h>
63088 +#include <linux/mm.h>
63089 +#include <linux/miscdevice.h>
63090 +#include <linux/slab.h>
63091 +#include <linux/vmalloc.h>
63092 +#include <linux/mman.h>
63093 +#include <linux/random.h>
63094 +#include <linux/init.h>
63095 +#include <linux/raw.h>
63096 +#include <linux/tty.h>
63097 +#include <linux/capability.h>
63098 +#include <linux/smp_lock.h>
63099 +#include <linux/devfs_fs_kernel.h>
63100 +#include <linux/ptrace.h>
63101 +#include <linux/device.h>
63102 +#include <asm/pgalloc.h>
63103 +#include <asm/uaccess.h>
63104 +#include <asm/io.h>
63105 +#include <asm/hypervisor.h>
63106 +
63107 +#ifndef ARCH_HAS_VALID_PHYS_ADDR_RANGE
63108 +static inline int valid_phys_addr_range(unsigned long addr, size_t *count)
63109 +{
63110 + return 1;
63111 +}
63112 +#endif
63113 +
63114 +/*
63115 + * This function reads the *physical* memory. The f_pos points directly to the
63116 + * memory location.
63117 + */
63118 +static ssize_t read_mem(struct file * file, char __user * buf,
63119 + size_t count, loff_t *ppos)
63120 +{
63121 + unsigned long p = *ppos, ignored;
63122 + ssize_t read = 0, sz;
63123 + void __iomem *v;
63124 +
63125 + if (!valid_phys_addr_range(p, &count))
63126 + return -EFAULT;
63127 +
63128 + while (count > 0) {
63129 + /*
63130 + * Handle first page in case it's not aligned
63131 + */
63132 + if (-p & (PAGE_SIZE - 1))
63133 + sz = -p & (PAGE_SIZE - 1);
63134 + else
63135 + sz = PAGE_SIZE;
63136 +
63137 + sz = min_t(unsigned long, sz, count);
63138 +
63139 + v = xlate_dev_mem_ptr(p, sz);
63140 + if (IS_ERR(v) || v == NULL) {
63141 + /*
63142 + * Some programs (e.g., dmidecode) groove off into
63143 + * weird RAM areas where no tables can possibly exist
63144 + * (because Xen will have stomped on them!). These
63145 + * programs get rather upset if we let them know that
63146 + * Xen failed their access, so we fake out a read of
63147 + * all zeroes.
63148 + */
63149 + if (clear_user(buf, count))
63150 + return -EFAULT;
63151 + read += count;
63152 + break;
63153 + }
63154 +
63155 + ignored = copy_to_user(buf, v, sz);
63156 + xlate_dev_mem_ptr_unmap(v);
63157 + if (ignored)
63158 + return -EFAULT;
63159 + buf += sz;
63160 + p += sz;
63161 + count -= sz;
63162 + read += sz;
63163 + }
63164 +
63165 + *ppos += read;
63166 + return read;
63167 +}
63168 +
63169 +static ssize_t write_mem(struct file * file, const char __user * buf,
63170 + size_t count, loff_t *ppos)
63171 +{
63172 + unsigned long p = *ppos, ignored;
63173 + ssize_t written = 0, sz;
63174 + void __iomem *v;
63175 +
63176 + if (!valid_phys_addr_range(p, &count))
63177 + return -EFAULT;
63178 +
63179 + while (count > 0) {
63180 + /*
63181 + * Handle first page in case it's not aligned
63182 + */
63183 + if (-p & (PAGE_SIZE - 1))
63184 + sz = -p & (PAGE_SIZE - 1);
63185 + else
63186 + sz = PAGE_SIZE;
63187 +
63188 + sz = min_t(unsigned long, sz, count);
63189 +
63190 + v = xlate_dev_mem_ptr(p, sz);
63191 + if (v == NULL)
63192 + break;
63193 + if (IS_ERR(v)) {
63194 + if (written == 0)
63195 + return PTR_ERR(v);
63196 + break;
63197 + }
63198 +
63199 + ignored = copy_from_user(v, buf, sz);
63200 + xlate_dev_mem_ptr_unmap(v);
63201 + if (ignored) {
63202 + written += sz - ignored;
63203 + if (written)
63204 + break;
63205 + return -EFAULT;
63206 + }
63207 + buf += sz;
63208 + p += sz;
63209 + count -= sz;
63210 + written += sz;
63211 + }
63212 +
63213 + *ppos += written;
63214 + return written;
63215 +}
63216 +
63217 +#ifndef ARCH_HAS_DEV_MEM_MMAP_MEM
63218 +static inline int uncached_access(struct file *file)
63219 +{
63220 + if (file->f_flags & O_SYNC)
63221 + return 1;
63222 + /* Xen sets correct MTRR type on non-RAM for us. */
63223 + return 0;
63224 +}
63225 +
63226 +static int xen_mmap_mem(struct file * file, struct vm_area_struct * vma)
63227 +{
63228 + size_t size = vma->vm_end - vma->vm_start;
63229 +
63230 + if (uncached_access(file))
63231 + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
63232 +
63233 + /* We want to return the real error code, not EAGAIN. */
63234 + return direct_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
63235 + size, vma->vm_page_prot, DOMID_IO);
63236 +}
63237 +#endif
63238 +
63239 +/*
63240 + * The memory devices use the full 32/64 bits of the offset, and so we cannot
63241 + * check against negative addresses: they are ok. The return value is weird,
63242 + * though, in that case (0).
63243 + *
63244 + * also note that seeking relative to the "end of file" isn't supported:
63245 + * it has no meaning, so it returns -EINVAL.
63246 + */
63247 +static loff_t memory_lseek(struct file * file, loff_t offset, int orig)
63248 +{
63249 + loff_t ret;
63250 +
63251 + mutex_lock(&file->f_dentry->d_inode->i_mutex);
63252 + switch (orig) {
63253 + case 0:
63254 + file->f_pos = offset;
63255 + ret = file->f_pos;
63256 + force_successful_syscall_return();
63257 + break;
63258 + case 1:
63259 + file->f_pos += offset;
63260 + ret = file->f_pos;
63261 + force_successful_syscall_return();
63262 + break;
63263 + default:
63264 + ret = -EINVAL;
63265 + }
63266 + mutex_unlock(&file->f_dentry->d_inode->i_mutex);
63267 + return ret;
63268 +}
63269 +
63270 +static int open_mem(struct inode * inode, struct file * filp)
63271 +{
63272 + return capable(CAP_SYS_RAWIO) ? 0 : -EPERM;
63273 +}
63274 +
63275 +struct file_operations mem_fops = {
63276 + .llseek = memory_lseek,
63277 + .read = read_mem,
63278 + .write = write_mem,
63279 + .mmap = xen_mmap_mem,
63280 + .open = open_mem,
63281 +};
63282 diff -Nur linux-2.6.16.33-noxen/drivers/xen/console/Makefile linux-2.6.16.33/drivers/xen/console/Makefile
63283 --- linux-2.6.16.33-noxen/drivers/xen/console/Makefile 1970-01-01 00:00:00.000000000 +0000
63284 +++ linux-2.6.16.33/drivers/xen/console/Makefile 2007-01-08 15:00:45.000000000 +0000
63285 @@ -0,0 +1,2 @@
63286 +
63287 +obj-y := console.o xencons_ring.o
63288 diff -Nur linux-2.6.16.33-noxen/drivers/xen/console/console.c linux-2.6.16.33/drivers/xen/console/console.c
63289 --- linux-2.6.16.33-noxen/drivers/xen/console/console.c 1970-01-01 00:00:00.000000000 +0000
63290 +++ linux-2.6.16.33/drivers/xen/console/console.c 2007-01-08 15:00:45.000000000 +0000
63291 @@ -0,0 +1,718 @@
63292 +/******************************************************************************
63293 + * console.c
63294 + *
63295 + * Virtual console driver.
63296 + *
63297 + * Copyright (c) 2002-2004, K A Fraser.
63298 + *
63299 + * This program is free software; you can redistribute it and/or
63300 + * modify it under the terms of the GNU General Public License version 2
63301 + * as published by the Free Software Foundation; or, when distributed
63302 + * separately from the Linux kernel or incorporated into other
63303 + * software packages, subject to the following license:
63304 + *
63305 + * Permission is hereby granted, free of charge, to any person obtaining a copy
63306 + * of this source file (the "Software"), to deal in the Software without
63307 + * restriction, including without limitation the rights to use, copy, modify,
63308 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
63309 + * and to permit persons to whom the Software is furnished to do so, subject to
63310 + * the following conditions:
63311 + *
63312 + * The above copyright notice and this permission notice shall be included in
63313 + * all copies or substantial portions of the Software.
63314 + *
63315 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
63316 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
63317 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
63318 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
63319 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
63320 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
63321 + * IN THE SOFTWARE.
63322 + */
63323 +
63324 +#include <linux/config.h>
63325 +#include <linux/version.h>
63326 +#include <linux/module.h>
63327 +#include <linux/errno.h>
63328 +#include <linux/signal.h>
63329 +#include <linux/sched.h>
63330 +#include <linux/interrupt.h>
63331 +#include <linux/tty.h>
63332 +#include <linux/tty_flip.h>
63333 +#include <linux/serial.h>
63334 +#include <linux/major.h>
63335 +#include <linux/ptrace.h>
63336 +#include <linux/ioport.h>
63337 +#include <linux/mm.h>
63338 +#include <linux/slab.h>
63339 +#include <linux/init.h>
63340 +#include <linux/console.h>
63341 +#include <linux/bootmem.h>
63342 +#include <linux/sysrq.h>
63343 +#include <linux/screen_info.h>
63344 +#include <asm/io.h>
63345 +#include <asm/irq.h>
63346 +#include <asm/uaccess.h>
63347 +#include <xen/interface/xen.h>
63348 +#include <xen/interface/event_channel.h>
63349 +#include <asm/hypervisor.h>
63350 +#include <xen/evtchn.h>
63351 +#include <xen/xenbus.h>
63352 +#include <xen/xencons.h>
63353 +
63354 +/*
63355 + * Modes:
63356 + * 'xencons=off' [XC_OFF]: Console is disabled.
63357 + * 'xencons=tty' [XC_TTY]: Console attached to '/dev/tty[0-9]+'.
63358 + * 'xencons=ttyS' [XC_SERIAL]: Console attached to '/dev/ttyS[0-9]+'.
63359 + * 'xencons=xvc' [XC_XVC]: Console attached to '/dev/xvc0'.
63360 + * default: DOM0 -> XC_SERIAL ; all others -> XC_TTY.
63361 + *
63362 + * NB. In mode XC_TTY, we create dummy consoles for tty2-63. This suppresses
63363 + * warnings from standard distro startup scripts.
63364 + */
63365 +static enum {
63366 + XC_OFF, XC_TTY, XC_SERIAL, XC_XVC
63367 +} xc_mode;
63368 +static int xc_num = -1;
63369 +
63370 +/* /dev/xvc0 device number allocated by lanana.org. */
63371 +#define XEN_XVC_MAJOR 204
63372 +#define XEN_XVC_MINOR 191
63373 +
63374 +#ifdef CONFIG_MAGIC_SYSRQ
63375 +static unsigned long sysrq_requested;
63376 +extern int sysrq_enabled;
63377 +#endif
63378 +
63379 +void xencons_early_setup(void)
63380 +{
63381 + extern int console_use_vt;
63382 +
63383 + if (is_initial_xendomain()) {
63384 + xc_mode = XC_SERIAL;
63385 + } else {
63386 + xc_mode = XC_TTY;
63387 + console_use_vt = 0;
63388 + }
63389 +}
63390 +
63391 +static int __init xencons_setup(char *str)
63392 +{
63393 + char *q;
63394 + int n;
63395 + extern int console_use_vt;
63396 +
63397 + console_use_vt = 1;
63398 + if (!strncmp(str, "ttyS", 4)) {
63399 + xc_mode = XC_SERIAL;
63400 + str += 4;
63401 + } else if (!strncmp(str, "tty", 3)) {
63402 + xc_mode = XC_TTY;
63403 + str += 3;
63404 + console_use_vt = 0;
63405 + } else if (!strncmp(str, "xvc", 3)) {
63406 + xc_mode = XC_XVC;
63407 + str += 3;
63408 + } else if (!strncmp(str, "off", 3)) {
63409 + xc_mode = XC_OFF;
63410 + str += 3;
63411 + }
63412 +
63413 + n = simple_strtol(str, &q, 10);
63414 + if (q != str)
63415 + xc_num = n;
63416 +
63417 + return 1;
63418 +}
63419 +__setup("xencons=", xencons_setup);
63420 +
63421 +/* The kernel and user-land drivers share a common transmit buffer. */
63422 +static unsigned int wbuf_size = 4096;
63423 +#define WBUF_MASK(_i) ((_i)&(wbuf_size-1))
63424 +static char *wbuf;
63425 +static unsigned int wc, wp; /* write_cons, write_prod */
63426 +
63427 +static int __init xencons_bufsz_setup(char *str)
63428 +{
63429 + unsigned int goal;
63430 + goal = simple_strtoul(str, NULL, 0);
63431 + if (goal) {
63432 + goal = roundup_pow_of_two(goal);
63433 + if (wbuf_size < goal)
63434 + wbuf_size = goal;
63435 + }
63436 + return 1;
63437 +}
63438 +__setup("xencons_bufsz=", xencons_bufsz_setup);
63439 +
63440 +/* This lock protects accesses to the common transmit buffer. */
63441 +static DEFINE_SPINLOCK(xencons_lock);
63442 +
63443 +/* Common transmit-kick routine. */
63444 +static void __xencons_tx_flush(void);
63445 +
63446 +static struct tty_driver *xencons_driver;
63447 +
63448 +/******************** Kernel console driver ********************************/
63449 +
63450 +static void kcons_write(struct console *c, const char *s, unsigned int count)
63451 +{
63452 + int i = 0;
63453 + unsigned long flags;
63454 +
63455 + spin_lock_irqsave(&xencons_lock, flags);
63456 +
63457 + while (i < count) {
63458 + for (; i < count; i++) {
63459 + if ((wp - wc) >= (wbuf_size - 1))
63460 + break;
63461 + if ((wbuf[WBUF_MASK(wp++)] = s[i]) == '\n')
63462 + wbuf[WBUF_MASK(wp++)] = '\r';
63463 + }
63464 +
63465 + __xencons_tx_flush();
63466 + }
63467 +
63468 + spin_unlock_irqrestore(&xencons_lock, flags);
63469 +}
63470 +
63471 +static void kcons_write_dom0(struct console *c, const char *s, unsigned int count)
63472 +{
63473 +
63474 + while (count > 0) {
63475 + int rc;
63476 + rc = HYPERVISOR_console_io( CONSOLEIO_write, count, (char *)s);
63477 + if (rc <= 0)
63478 + break;
63479 + count -= rc;
63480 + s += rc;
63481 + }
63482 +}
63483 +
63484 +static struct tty_driver *kcons_device(struct console *c, int *index)
63485 +{
63486 + *index = 0;
63487 + return xencons_driver;
63488 +}
63489 +
63490 +static struct console kcons_info = {
63491 + .device = kcons_device,
63492 + .flags = CON_PRINTBUFFER | CON_ENABLED,
63493 + .index = -1,
63494 +};
63495 +
63496 +static int __init xen_console_init(void)
63497 +{
63498 + if (!is_running_on_xen())
63499 + goto out;
63500 +
63501 + if (is_initial_xendomain()) {
63502 + kcons_info.write = kcons_write_dom0;
63503 + } else {
63504 + if (!xen_start_info->console.domU.evtchn)
63505 + goto out;
63506 + kcons_info.write = kcons_write;
63507 + }
63508 +
63509 + switch (xc_mode) {
63510 + case XC_XVC:
63511 + strcpy(kcons_info.name, "xvc");
63512 + if (xc_num == -1)
63513 + xc_num = 0;
63514 + break;
63515 +
63516 + case XC_SERIAL:
63517 + strcpy(kcons_info.name, "ttyS");
63518 + if (xc_num == -1)
63519 + xc_num = 0;
63520 + break;
63521 +
63522 + case XC_TTY:
63523 + strcpy(kcons_info.name, "tty");
63524 + if (xc_num == -1)
63525 + xc_num = 1;
63526 + break;
63527 +
63528 + default:
63529 + goto out;
63530 + }
63531 +
63532 + wbuf = alloc_bootmem(wbuf_size);
63533 +
63534 + register_console(&kcons_info);
63535 +
63536 + out:
63537 + return 0;
63538 +}
63539 +console_initcall(xen_console_init);
63540 +
63541 +/*** Useful function for console debugging -- goes straight to Xen. ***/
63542 +asmlinkage int xprintk(const char *fmt, ...)
63543 +{
63544 + va_list args;
63545 + int printk_len;
63546 + static char printk_buf[1024];
63547 +
63548 + /* Emit the output into the temporary buffer */
63549 + va_start(args, fmt);
63550 + printk_len = vsnprintf(printk_buf, sizeof(printk_buf), fmt, args);
63551 + va_end(args);
63552 +
63553 + /* Send the processed output directly to Xen. */
63554 + kcons_write_dom0(NULL, printk_buf, printk_len);
63555 +
63556 + return 0;
63557 +}
63558 +
63559 +/*** Forcibly flush console data before dying. ***/
63560 +void xencons_force_flush(void)
63561 +{
63562 + int sz;
63563 +
63564 + /* Emergency console is synchronous, so there's nothing to flush. */
63565 + if (!is_running_on_xen() ||
63566 + is_initial_xendomain() ||
63567 + !xen_start_info->console.domU.evtchn)
63568 + return;
63569 +
63570 + /* Spin until console data is flushed through to the daemon. */
63571 + while (wc != wp) {
63572 + int sent = 0;
63573 + if ((sz = wp - wc) == 0)
63574 + continue;
63575 + sent = xencons_ring_send(&wbuf[WBUF_MASK(wc)], sz);
63576 + if (sent > 0)
63577 + wc += sent;
63578 + }
63579 +}
63580 +
63581 +
63582 +void dom0_init_screen_info(const struct dom0_vga_console_info *info)
63583 +{
63584 + switch (info->video_type) {
63585 + case XEN_VGATYPE_TEXT_MODE_3:
63586 + screen_info.orig_video_mode = 3;
63587 + screen_info.orig_video_ega_bx = 3;
63588 + screen_info.orig_video_isVGA = 1;
63589 + screen_info.orig_video_lines = info->u.text_mode_3.rows;
63590 + screen_info.orig_video_cols = info->u.text_mode_3.columns;
63591 + screen_info.orig_x = info->u.text_mode_3.cursor_x;
63592 + screen_info.orig_y = info->u.text_mode_3.cursor_y;
63593 + screen_info.orig_video_points =
63594 + info->u.text_mode_3.font_height;
63595 + break;
63596 + case XEN_VGATYPE_VESA_LFB:
63597 + screen_info.orig_video_isVGA = VIDEO_TYPE_VLFB;
63598 + screen_info.lfb_width = info->u.vesa_lfb.width;
63599 + screen_info.lfb_height = info->u.vesa_lfb.height;
63600 + screen_info.lfb_depth = info->u.vesa_lfb.bits_per_pixel;
63601 + screen_info.lfb_base = info->u.vesa_lfb.lfb_base;
63602 + screen_info.lfb_size = info->u.vesa_lfb.lfb_size;
63603 + screen_info.lfb_linelength = info->u.vesa_lfb.bytes_per_line;
63604 + screen_info.red_size = info->u.vesa_lfb.red_size;
63605 + screen_info.red_pos = info->u.vesa_lfb.red_pos;
63606 + screen_info.green_size = info->u.vesa_lfb.green_size;
63607 + screen_info.green_pos = info->u.vesa_lfb.green_pos;
63608 + screen_info.blue_size = info->u.vesa_lfb.blue_size;
63609 + screen_info.blue_pos = info->u.vesa_lfb.blue_pos;
63610 + screen_info.rsvd_size = info->u.vesa_lfb.rsvd_size;
63611 + screen_info.rsvd_pos = info->u.vesa_lfb.rsvd_pos;
63612 + break;
63613 + }
63614 +}
63615 +
63616 +
63617 +/******************** User-space console driver (/dev/console) ************/
63618 +
63619 +#define DRV(_d) (_d)
63620 +#define DUMMY_TTY(_tty) ((xc_mode == XC_TTY) && \
63621 + ((_tty)->index != (xc_num - 1)))
63622 +
63623 +static struct termios *xencons_termios[MAX_NR_CONSOLES];
63624 +static struct termios *xencons_termios_locked[MAX_NR_CONSOLES];
63625 +static struct tty_struct *xencons_tty;
63626 +static int xencons_priv_irq;
63627 +static char x_char;
63628 +
63629 +void xencons_rx(char *buf, unsigned len, struct pt_regs *regs)
63630 +{
63631 + int i;
63632 + unsigned long flags;
63633 +
63634 + spin_lock_irqsave(&xencons_lock, flags);
63635 + if (xencons_tty == NULL)
63636 + goto out;
63637 +
63638 + for (i = 0; i < len; i++) {
63639 +#ifdef CONFIG_MAGIC_SYSRQ
63640 + if (sysrq_enabled) {
63641 + if (buf[i] == '\x0f') { /* ^O */
63642 + sysrq_requested = jiffies;
63643 + continue; /* don't print the sysrq key */
63644 + } else if (sysrq_requested) {
63645 + unsigned long sysrq_timeout =
63646 + sysrq_requested + HZ*2;
63647 + sysrq_requested = 0;
63648 + if (time_before(jiffies, sysrq_timeout)) {
63649 + spin_unlock_irqrestore(
63650 + &xencons_lock, flags);
63651 + handle_sysrq(
63652 + buf[i], regs, xencons_tty);
63653 + spin_lock_irqsave(
63654 + &xencons_lock, flags);
63655 + continue;
63656 + }
63657 + }
63658 + }
63659 +#endif
63660 + tty_insert_flip_char(xencons_tty, buf[i], 0);
63661 + }
63662 + tty_flip_buffer_push(xencons_tty);
63663 +
63664 + out:
63665 + spin_unlock_irqrestore(&xencons_lock, flags);
63666 +}
63667 +
63668 +static void __xencons_tx_flush(void)
63669 +{
63670 + int sent, sz, work_done = 0;
63671 +
63672 + if (x_char) {
63673 + if (is_initial_xendomain())
63674 + kcons_write_dom0(NULL, &x_char, 1);
63675 + else
63676 + while (x_char)
63677 + if (xencons_ring_send(&x_char, 1) == 1)
63678 + break;
63679 + x_char = 0;
63680 + work_done = 1;
63681 + }
63682 +
63683 + while (wc != wp) {
63684 + sz = wp - wc;
63685 + if (sz > (wbuf_size - WBUF_MASK(wc)))
63686 + sz = wbuf_size - WBUF_MASK(wc);
63687 + if (is_initial_xendomain()) {
63688 + kcons_write_dom0(NULL, &wbuf[WBUF_MASK(wc)], sz);
63689 + wc += sz;
63690 + } else {
63691 + sent = xencons_ring_send(&wbuf[WBUF_MASK(wc)], sz);
63692 + if (sent == 0)
63693 + break;
63694 + wc += sent;
63695 + }
63696 + work_done = 1;
63697 + }
63698 +
63699 + if (work_done && (xencons_tty != NULL)) {
63700 + wake_up_interruptible(&xencons_tty->write_wait);
63701 + if ((xencons_tty->flags & (1 << TTY_DO_WRITE_WAKEUP)) &&
63702 + (xencons_tty->ldisc.write_wakeup != NULL))
63703 + (xencons_tty->ldisc.write_wakeup)(xencons_tty);
63704 + }
63705 +}
63706 +
63707 +void xencons_tx(void)
63708 +{
63709 + unsigned long flags;
63710 +
63711 + spin_lock_irqsave(&xencons_lock, flags);
63712 + __xencons_tx_flush();
63713 + spin_unlock_irqrestore(&xencons_lock, flags);
63714 +}
63715 +
63716 +/* Privileged receive callback and transmit kicker. */
63717 +static irqreturn_t xencons_priv_interrupt(int irq, void *dev_id,
63718 + struct pt_regs *regs)
63719 +{
63720 + static char rbuf[16];
63721 + int l;
63722 +
63723 + while ((l = HYPERVISOR_console_io(CONSOLEIO_read, 16, rbuf)) > 0)
63724 + xencons_rx(rbuf, l, regs);
63725 +
63726 + xencons_tx();
63727 +
63728 + return IRQ_HANDLED;
63729 +}
63730 +
63731 +static int xencons_write_room(struct tty_struct *tty)
63732 +{
63733 + return wbuf_size - (wp - wc);
63734 +}
63735 +
63736 +static int xencons_chars_in_buffer(struct tty_struct *tty)
63737 +{
63738 + return wp - wc;
63739 +}
63740 +
63741 +static void xencons_send_xchar(struct tty_struct *tty, char ch)
63742 +{
63743 + unsigned long flags;
63744 +
63745 + if (DUMMY_TTY(tty))
63746 + return;
63747 +
63748 + spin_lock_irqsave(&xencons_lock, flags);
63749 + x_char = ch;
63750 + __xencons_tx_flush();
63751 + spin_unlock_irqrestore(&xencons_lock, flags);
63752 +}
63753 +
63754 +static void xencons_throttle(struct tty_struct *tty)
63755 +{
63756 + if (DUMMY_TTY(tty))
63757 + return;
63758 +
63759 + if (I_IXOFF(tty))
63760 + xencons_send_xchar(tty, STOP_CHAR(tty));
63761 +}
63762 +
63763 +static void xencons_unthrottle(struct tty_struct *tty)
63764 +{
63765 + if (DUMMY_TTY(tty))
63766 + return;
63767 +
63768 + if (I_IXOFF(tty)) {
63769 + if (x_char != 0)
63770 + x_char = 0;
63771 + else
63772 + xencons_send_xchar(tty, START_CHAR(tty));
63773 + }
63774 +}
63775 +
63776 +static void xencons_flush_buffer(struct tty_struct *tty)
63777 +{
63778 + unsigned long flags;
63779 +
63780 + if (DUMMY_TTY(tty))
63781 + return;
63782 +
63783 + spin_lock_irqsave(&xencons_lock, flags);
63784 + wc = wp = 0;
63785 + spin_unlock_irqrestore(&xencons_lock, flags);
63786 +}
63787 +
63788 +static inline int __xencons_put_char(int ch)
63789 +{
63790 + char _ch = (char)ch;
63791 + if ((wp - wc) == wbuf_size)
63792 + return 0;
63793 + wbuf[WBUF_MASK(wp++)] = _ch;
63794 + return 1;
63795 +}
63796 +
63797 +static int xencons_write(
63798 + struct tty_struct *tty,
63799 + const unsigned char *buf,
63800 + int count)
63801 +{
63802 + int i;
63803 + unsigned long flags;
63804 +
63805 + if (DUMMY_TTY(tty))
63806 + return count;
63807 +
63808 + spin_lock_irqsave(&xencons_lock, flags);
63809 +
63810 + for (i = 0; i < count; i++)
63811 + if (!__xencons_put_char(buf[i]))
63812 + break;
63813 +
63814 + if (i != 0)
63815 + __xencons_tx_flush();
63816 +
63817 + spin_unlock_irqrestore(&xencons_lock, flags);
63818 +
63819 + return i;
63820 +}
63821 +
63822 +static void xencons_put_char(struct tty_struct *tty, u_char ch)
63823 +{
63824 + unsigned long flags;
63825 +
63826 + if (DUMMY_TTY(tty))
63827 + return;
63828 +
63829 + spin_lock_irqsave(&xencons_lock, flags);
63830 + (void)__xencons_put_char(ch);
63831 + spin_unlock_irqrestore(&xencons_lock, flags);
63832 +}
63833 +
63834 +static void xencons_flush_chars(struct tty_struct *tty)
63835 +{
63836 + unsigned long flags;
63837 +
63838 + if (DUMMY_TTY(tty))
63839 + return;
63840 +
63841 + spin_lock_irqsave(&xencons_lock, flags);
63842 + __xencons_tx_flush();
63843 + spin_unlock_irqrestore(&xencons_lock, flags);
63844 +}
63845 +
63846 +static void xencons_wait_until_sent(struct tty_struct *tty, int timeout)
63847 +{
63848 + unsigned long orig_jiffies = jiffies;
63849 +
63850 + if (DUMMY_TTY(tty))
63851 + return;
63852 +
63853 + while (DRV(tty->driver)->chars_in_buffer(tty)) {
63854 + set_current_state(TASK_INTERRUPTIBLE);
63855 + schedule_timeout(1);
63856 + if (signal_pending(current))
63857 + break;
63858 + if (timeout && time_after(jiffies, orig_jiffies + timeout))
63859 + break;
63860 + }
63861 +
63862 + set_current_state(TASK_RUNNING);
63863 +}
63864 +
63865 +static int xencons_open(struct tty_struct *tty, struct file *filp)
63866 +{
63867 + unsigned long flags;
63868 +
63869 + if (DUMMY_TTY(tty))
63870 + return 0;
63871 +
63872 + spin_lock_irqsave(&xencons_lock, flags);
63873 + tty->driver_data = NULL;
63874 + if (xencons_tty == NULL)
63875 + xencons_tty = tty;
63876 + __xencons_tx_flush();
63877 + spin_unlock_irqrestore(&xencons_lock, flags);
63878 +
63879 + return 0;
63880 +}
63881 +
63882 +static void xencons_close(struct tty_struct *tty, struct file *filp)
63883 +{
63884 + unsigned long flags;
63885 +
63886 + if (DUMMY_TTY(tty))
63887 + return;
63888 +
63889 + down(&tty_sem);
63890 +
63891 + if (tty->count != 1) {
63892 + up(&tty_sem);
63893 + return;
63894 + }
63895 +
63896 + /* Prevent other threads from re-opening this tty. */
63897 + set_bit(TTY_CLOSING, &tty->flags);
63898 + up(&tty_sem);
63899 +
63900 + tty->closing = 1;
63901 + tty_wait_until_sent(tty, 0);
63902 + if (DRV(tty->driver)->flush_buffer != NULL)
63903 + DRV(tty->driver)->flush_buffer(tty);
63904 + if (tty->ldisc.flush_buffer != NULL)
63905 + tty->ldisc.flush_buffer(tty);
63906 + tty->closing = 0;
63907 + spin_lock_irqsave(&xencons_lock, flags);
63908 + xencons_tty = NULL;
63909 + spin_unlock_irqrestore(&xencons_lock, flags);
63910 +}
63911 +
63912 +static struct tty_operations xencons_ops = {
63913 + .open = xencons_open,
63914 + .close = xencons_close,
63915 + .write = xencons_write,
63916 + .write_room = xencons_write_room,
63917 + .put_char = xencons_put_char,
63918 + .flush_chars = xencons_flush_chars,
63919 + .chars_in_buffer = xencons_chars_in_buffer,
63920 + .send_xchar = xencons_send_xchar,
63921 + .flush_buffer = xencons_flush_buffer,
63922 + .throttle = xencons_throttle,
63923 + .unthrottle = xencons_unthrottle,
63924 + .wait_until_sent = xencons_wait_until_sent,
63925 +};
63926 +
63927 +static int __init xencons_init(void)
63928 +{
63929 + int rc;
63930 +
63931 + if (!is_running_on_xen())
63932 + return -ENODEV;
63933 +
63934 + if (xc_mode == XC_OFF)
63935 + return 0;
63936 +
63937 + if (!is_initial_xendomain()) {
63938 + rc = xencons_ring_init();
63939 + if (rc)
63940 + return rc;
63941 + }
63942 +
63943 + xencons_driver = alloc_tty_driver((xc_mode == XC_TTY) ?
63944 + MAX_NR_CONSOLES : 1);
63945 + if (xencons_driver == NULL)
63946 + return -ENOMEM;
63947 +
63948 + DRV(xencons_driver)->name = "xencons";
63949 + DRV(xencons_driver)->major = TTY_MAJOR;
63950 + DRV(xencons_driver)->type = TTY_DRIVER_TYPE_SERIAL;
63951 + DRV(xencons_driver)->subtype = SERIAL_TYPE_NORMAL;
63952 + DRV(xencons_driver)->init_termios = tty_std_termios;
63953 + DRV(xencons_driver)->flags =
63954 + TTY_DRIVER_REAL_RAW |
63955 + TTY_DRIVER_RESET_TERMIOS;
63956 + DRV(xencons_driver)->termios = xencons_termios;
63957 + DRV(xencons_driver)->termios_locked = xencons_termios_locked;
63958 +
63959 + switch (xc_mode) {
63960 + case XC_XVC:
63961 + DRV(xencons_driver)->name = "xvc";
63962 + DRV(xencons_driver)->major = XEN_XVC_MAJOR;
63963 + DRV(xencons_driver)->minor_start = XEN_XVC_MINOR;
63964 + DRV(xencons_driver)->name_base = xc_num;
63965 + break;
63966 + case XC_SERIAL:
63967 + DRV(xencons_driver)->name = "ttyS";
63968 + DRV(xencons_driver)->minor_start = 64 + xc_num;
63969 + DRV(xencons_driver)->name_base = xc_num;
63970 + break;
63971 + default:
63972 + DRV(xencons_driver)->name = "tty";
63973 + DRV(xencons_driver)->minor_start = 1;
63974 + DRV(xencons_driver)->name_base = 1;
63975 + break;
63976 + }
63977 +
63978 + tty_set_operations(xencons_driver, &xencons_ops);
63979 +
63980 + if ((rc = tty_register_driver(DRV(xencons_driver))) != 0) {
63981 + printk("WARNING: Failed to register Xen virtual "
63982 + "console driver as '%s%d'\n",
63983 + DRV(xencons_driver)->name,
63984 + DRV(xencons_driver)->name_base);
63985 + put_tty_driver(xencons_driver);
63986 + xencons_driver = NULL;
63987 + return rc;
63988 + }
63989 +
63990 + if (is_initial_xendomain()) {
63991 + xencons_priv_irq = bind_virq_to_irqhandler(
63992 + VIRQ_CONSOLE,
63993 + 0,
63994 + xencons_priv_interrupt,
63995 + 0,
63996 + "console",
63997 + NULL);
63998 + BUG_ON(xencons_priv_irq < 0);
63999 + }
64000 +
64001 + printk("Xen virtual console successfully installed as %s%d\n",
64002 + DRV(xencons_driver)->name, xc_num);
64003 +
64004 + return 0;
64005 +}
64006 +
64007 +module_init(xencons_init);
64008 +
64009 +MODULE_LICENSE("Dual BSD/GPL");
64010 diff -Nur linux-2.6.16.33-noxen/drivers/xen/console/xencons_ring.c linux-2.6.16.33/drivers/xen/console/xencons_ring.c
64011 --- linux-2.6.16.33-noxen/drivers/xen/console/xencons_ring.c 1970-01-01 00:00:00.000000000 +0000
64012 +++ linux-2.6.16.33/drivers/xen/console/xencons_ring.c 2007-01-08 15:00:45.000000000 +0000
64013 @@ -0,0 +1,143 @@
64014 +/*
64015 + * This program is free software; you can redistribute it and/or
64016 + * modify it under the terms of the GNU General Public License version 2
64017 + * as published by the Free Software Foundation; or, when distributed
64018 + * separately from the Linux kernel or incorporated into other
64019 + * software packages, subject to the following license:
64020 + *
64021 + * Permission is hereby granted, free of charge, to any person obtaining a copy
64022 + * of this source file (the "Software"), to deal in the Software without
64023 + * restriction, including without limitation the rights to use, copy, modify,
64024 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
64025 + * and to permit persons to whom the Software is furnished to do so, subject to
64026 + * the following conditions:
64027 + *
64028 + * The above copyright notice and this permission notice shall be included in
64029 + * all copies or substantial portions of the Software.
64030 + *
64031 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
64032 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
64033 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
64034 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
64035 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
64036 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
64037 + * IN THE SOFTWARE.
64038 + */
64039 +
64040 +#include <linux/version.h>
64041 +#include <linux/module.h>
64042 +#include <linux/errno.h>
64043 +#include <linux/signal.h>
64044 +#include <linux/sched.h>
64045 +#include <linux/interrupt.h>
64046 +#include <linux/tty.h>
64047 +#include <linux/tty_flip.h>
64048 +#include <linux/serial.h>
64049 +#include <linux/major.h>
64050 +#include <linux/ptrace.h>
64051 +#include <linux/ioport.h>
64052 +#include <linux/mm.h>
64053 +#include <linux/slab.h>
64054 +
64055 +#include <asm/hypervisor.h>
64056 +#include <xen/evtchn.h>
64057 +#include <xen/xencons.h>
64058 +#include <linux/wait.h>
64059 +#include <linux/interrupt.h>
64060 +#include <linux/sched.h>
64061 +#include <linux/err.h>
64062 +#include <xen/interface/io/console.h>
64063 +
64064 +static int xencons_irq;
64065 +
64066 +static inline struct xencons_interface *xencons_interface(void)
64067 +{
64068 + return mfn_to_virt(xen_start_info->console.domU.mfn);
64069 +}
64070 +
64071 +static inline void notify_daemon(void)
64072 +{
64073 + /* Use evtchn: this is called early, before irq is set up. */
64074 + notify_remote_via_evtchn(xen_start_info->console.domU.evtchn);
64075 +}
64076 +
64077 +int xencons_ring_send(const char *data, unsigned len)
64078 +{
64079 + int sent = 0;
64080 + struct xencons_interface *intf = xencons_interface();
64081 + XENCONS_RING_IDX cons, prod;
64082 +
64083 + cons = intf->out_cons;
64084 + prod = intf->out_prod;
64085 + mb();
64086 + BUG_ON((prod - cons) > sizeof(intf->out));
64087 +
64088 + while ((sent < len) && ((prod - cons) < sizeof(intf->out)))
64089 + intf->out[MASK_XENCONS_IDX(prod++, intf->out)] = data[sent++];
64090 +
64091 + wmb();
64092 + intf->out_prod = prod;
64093 +
64094 + notify_daemon();
64095 +
64096 + return sent;
64097 +}
64098 +
64099 +static irqreturn_t handle_input(int irq, void *unused, struct pt_regs *regs)
64100 +{
64101 + struct xencons_interface *intf = xencons_interface();
64102 + XENCONS_RING_IDX cons, prod;
64103 +
64104 + cons = intf->in_cons;
64105 + prod = intf->in_prod;
64106 + mb();
64107 + BUG_ON((prod - cons) > sizeof(intf->in));
64108 +
64109 + while (cons != prod) {
64110 + xencons_rx(intf->in+MASK_XENCONS_IDX(cons,intf->in), 1, regs);
64111 + cons++;
64112 + }
64113 +
64114 + mb();
64115 + intf->in_cons = cons;
64116 +
64117 + notify_daemon();
64118 +
64119 + xencons_tx();
64120 +
64121 + return IRQ_HANDLED;
64122 +}
64123 +
64124 +int xencons_ring_init(void)
64125 +{
64126 + int irq;
64127 +
64128 + if (xencons_irq)
64129 + unbind_from_irqhandler(xencons_irq, NULL);
64130 + xencons_irq = 0;
64131 +
64132 + if (!is_running_on_xen() ||
64133 + is_initial_xendomain() ||
64134 + !xen_start_info->console.domU.evtchn)
64135 + return -ENODEV;
64136 +
64137 + irq = bind_evtchn_to_irqhandler(
64138 + xen_start_info->console.domU.evtchn,
64139 + handle_input, 0, "xencons", NULL);
64140 + if (irq < 0) {
64141 + printk(KERN_ERR "XEN console request irq failed %i\n", irq);
64142 + return irq;
64143 + }
64144 +
64145 + xencons_irq = irq;
64146 +
64147 + /* In case we have in-flight data after save/restore... */
64148 + notify_daemon();
64149 +
64150 + return 0;
64151 +}
64152 +
64153 +void xencons_resume(void)
64154 +{
64155 + (void)xencons_ring_init();
64156 +}
64157 diff -Nur linux-2.6.16.33-noxen/drivers/xen/core/Makefile linux-2.6.16.33/drivers/xen/core/Makefile
64158 --- linux-2.6.16.33-noxen/drivers/xen/core/Makefile 1970-01-01 00:00:00.000000000 +0000
64159 +++ linux-2.6.16.33/drivers/xen/core/Makefile 2007-01-08 15:00:45.000000000 +0000
64160 @@ -0,0 +1,14 @@
64161 +#
64162 +# Makefile for the linux kernel.
64163 +#
64164 +
64165 +obj-y := evtchn.o gnttab.o features.o
64166 +
64167 +obj-$(CONFIG_PROC_FS) += xen_proc.o
64168 +obj-$(CONFIG_SYSFS) += hypervisor_sysfs.o
64169 +obj-$(CONFIG_HOTPLUG_CPU) += cpu_hotplug.o
64170 +obj-$(CONFIG_XEN_SYSFS) += xen_sysfs.o
64171 +obj-$(CONFIG_XEN_SKBUFF) += skbuff.o
64172 +obj-$(CONFIG_XEN_REBOOT) += reboot.o machine_reboot.o
64173 +obj-$(CONFIG_XEN_SMPBOOT) += smpboot.o
64174 +obj-$(CONFIG_KEXEC) += machine_kexec.o
64175 diff -Nur linux-2.6.16.33-noxen/drivers/xen/core/cpu_hotplug.c linux-2.6.16.33/drivers/xen/core/cpu_hotplug.c
64176 --- linux-2.6.16.33-noxen/drivers/xen/core/cpu_hotplug.c 1970-01-01 00:00:00.000000000 +0000
64177 +++ linux-2.6.16.33/drivers/xen/core/cpu_hotplug.c 2007-01-08 15:00:45.000000000 +0000
64178 @@ -0,0 +1,188 @@
64179 +#include <linux/config.h>
64180 +#include <linux/init.h>
64181 +#include <linux/kernel.h>
64182 +#include <linux/sched.h>
64183 +#include <linux/notifier.h>
64184 +#include <linux/cpu.h>
64185 +#include <xen/cpu_hotplug.h>
64186 +#include <xen/xenbus.h>
64187 +
64188 +/*
64189 + * Set of CPUs that remote admin software will allow us to bring online.
64190 + * Notified to us via xenbus.
64191 + */
64192 +static cpumask_t xenbus_allowed_cpumask;
64193 +
64194 +/* Set of CPUs that local admin will allow us to bring online. */
64195 +static cpumask_t local_allowed_cpumask = CPU_MASK_ALL;
64196 +
64197 +static int local_cpu_hotplug_request(void)
64198 +{
64199 + /*
64200 + * We assume a CPU hotplug request comes from local admin if it is made
64201 + * via a userspace process (i.e., one with a real mm_struct).
64202 + */
64203 + return (current->mm != NULL);
64204 +}
64205 +
64206 +static void vcpu_hotplug(unsigned int cpu)
64207 +{
64208 + int err;
64209 + char dir[32], state[32];
64210 +
64211 + if ((cpu >= NR_CPUS) || !cpu_possible(cpu))
64212 + return;
64213 +
64214 + sprintf(dir, "cpu/%d", cpu);
64215 + err = xenbus_scanf(XBT_NIL, dir, "availability", "%s", state);
64216 + if (err != 1) {
64217 + printk(KERN_ERR "XENBUS: Unable to read cpu state\n");
64218 + return;
64219 + }
64220 +
64221 + if (strcmp(state, "online") == 0) {
64222 + cpu_set(cpu, xenbus_allowed_cpumask);
64223 + (void)cpu_up(cpu);
64224 + } else if (strcmp(state, "offline") == 0) {
64225 + cpu_clear(cpu, xenbus_allowed_cpumask);
64226 + (void)cpu_down(cpu);
64227 + } else {
64228 + printk(KERN_ERR "XENBUS: unknown state(%s) on CPU%d\n",
64229 + state, cpu);
64230 + }
64231 +}
64232 +
64233 +static void handle_vcpu_hotplug_event(
64234 + struct xenbus_watch *watch, const char **vec, unsigned int len)
64235 +{
64236 + int cpu;
64237 + char *cpustr;
64238 + const char *node = vec[XS_WATCH_PATH];
64239 +
64240 + if ((cpustr = strstr(node, "cpu/")) != NULL) {
64241 + sscanf(cpustr, "cpu/%d", &cpu);
64242 + vcpu_hotplug(cpu);
64243 + }
64244 +}
64245 +
64246 +static int smpboot_cpu_notify(struct notifier_block *notifier,
64247 + unsigned long action, void *hcpu)
64248 +{
64249 + int cpu = (long)hcpu;
64250 +
64251 + /*
64252 + * We do this in a callback notifier rather than __cpu_disable()
64253 + * because local_cpu_hotplug_request() does not work in the latter
64254 + * as it's always executed from within a stopmachine kthread.
64255 + */
64256 + if ((action == CPU_DOWN_PREPARE) && local_cpu_hotplug_request())
64257 + cpu_clear(cpu, local_allowed_cpumask);
64258 +
64259 + return NOTIFY_OK;
64260 +}
64261 +
64262 +static int setup_cpu_watcher(struct notifier_block *notifier,
64263 + unsigned long event, void *data)
64264 +{
64265 + int i;
64266 +
64267 + static struct xenbus_watch cpu_watch = {
64268 + .node = "cpu",
64269 + .callback = handle_vcpu_hotplug_event,
64270 + .flags = XBWF_new_thread };
64271 + (void)register_xenbus_watch(&cpu_watch);
64272 +
64273 + if (!is_initial_xendomain()) {
64274 + for_each_cpu(i)
64275 + vcpu_hotplug(i);
64276 + printk(KERN_INFO "Brought up %ld CPUs\n",
64277 + (long)num_online_cpus());
64278 + }
64279 +
64280 + return NOTIFY_DONE;
64281 +}
64282 +
64283 +static int __init setup_vcpu_hotplug_event(void)
64284 +{
64285 + static struct notifier_block hotplug_cpu = {
64286 + .notifier_call = smpboot_cpu_notify };
64287 + static struct notifier_block xsn_cpu = {
64288 + .notifier_call = setup_cpu_watcher };
64289 +
64290 + if (!is_running_on_xen())
64291 + return -ENODEV;
64292 +
64293 + register_cpu_notifier(&hotplug_cpu);
64294 + register_xenstore_notifier(&xsn_cpu);
64295 +
64296 + return 0;
64297 +}
64298 +
64299 +arch_initcall(setup_vcpu_hotplug_event);
64300 +
64301 +int smp_suspend(void)
64302 +{
64303 + int i, err;
64304 +
64305 + lock_cpu_hotplug();
64306 +
64307 + /*
64308 + * Take all other CPUs offline. We hold the hotplug mutex to
64309 + * avoid other processes bringing up CPUs under our feet.
64310 + */
64311 + while (num_online_cpus() > 1) {
64312 + unlock_cpu_hotplug();
64313 + for_each_online_cpu(i) {
64314 + if (i == 0)
64315 + continue;
64316 + err = cpu_down(i);
64317 + if (err) {
64318 + printk(KERN_CRIT "Failed to take all CPUs "
64319 + "down: %d.\n", err);
64320 + for_each_cpu(i)
64321 + vcpu_hotplug(i);
64322 + return err;
64323 + }
64324 + }
64325 + lock_cpu_hotplug();
64326 + }
64327 +
64328 + return 0;
64329 +}
64330 +
64331 +void smp_resume(void)
64332 +{
64333 + int cpu;
64334 +
64335 + for_each_cpu(cpu)
64336 + cpu_initialize_context(cpu);
64337 +
64338 + unlock_cpu_hotplug();
64339 +
64340 + for_each_cpu(cpu)
64341 + vcpu_hotplug(cpu);
64342 +}
64343 +
64344 +int cpu_up_check(unsigned int cpu)
64345 +{
64346 + int rc = 0;
64347 +
64348 + if (local_cpu_hotplug_request()) {
64349 + cpu_set(cpu, local_allowed_cpumask);
64350 + if (!cpu_isset(cpu, xenbus_allowed_cpumask)) {
64351 + printk("%s: attempt to bring up CPU %u disallowed by "
64352 + "remote admin.\n", __FUNCTION__, cpu);
64353 + rc = -EBUSY;
64354 + }
64355 + } else if (!cpu_isset(cpu, local_allowed_cpumask) ||
64356 + !cpu_isset(cpu, xenbus_allowed_cpumask)) {
64357 + rc = -EBUSY;
64358 + }
64359 +
64360 + return rc;
64361 +}
64362 +
64363 +void init_xenbus_allowed_cpumask(void)
64364 +{
64365 + xenbus_allowed_cpumask = cpu_present_map;
64366 +}
64367 diff -Nur linux-2.6.16.33-noxen/drivers/xen/core/evtchn.c linux-2.6.16.33/drivers/xen/core/evtchn.c
64368 --- linux-2.6.16.33-noxen/drivers/xen/core/evtchn.c 1970-01-01 00:00:00.000000000 +0000
64369 +++ linux-2.6.16.33/drivers/xen/core/evtchn.c 2007-01-08 15:00:45.000000000 +0000
64370 @@ -0,0 +1,872 @@
64371 +/******************************************************************************
64372 + * evtchn.c
64373 + *
64374 + * Communication via Xen event channels.
64375 + *
64376 + * Copyright (c) 2002-2005, K A Fraser
64377 + *
64378 + * This program is free software; you can redistribute it and/or
64379 + * modify it under the terms of the GNU General Public License version 2
64380 + * as published by the Free Software Foundation; or, when distributed
64381 + * separately from the Linux kernel or incorporated into other
64382 + * software packages, subject to the following license:
64383 + *
64384 + * Permission is hereby granted, free of charge, to any person obtaining a copy
64385 + * of this source file (the "Software"), to deal in the Software without
64386 + * restriction, including without limitation the rights to use, copy, modify,
64387 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
64388 + * and to permit persons to whom the Software is furnished to do so, subject to
64389 + * the following conditions:
64390 + *
64391 + * The above copyright notice and this permission notice shall be included in
64392 + * all copies or substantial portions of the Software.
64393 + *
64394 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
64395 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
64396 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
64397 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
64398 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
64399 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
64400 + * IN THE SOFTWARE.
64401 + */
64402 +
64403 +#include <linux/config.h>
64404 +#include <linux/module.h>
64405 +#include <linux/irq.h>
64406 +#include <linux/interrupt.h>
64407 +#include <linux/sched.h>
64408 +#include <linux/kernel_stat.h>
64409 +#include <linux/version.h>
64410 +#include <asm/atomic.h>
64411 +#include <asm/system.h>
64412 +#include <asm/ptrace.h>
64413 +#include <asm/synch_bitops.h>
64414 +#include <xen/evtchn.h>
64415 +#include <xen/interface/event_channel.h>
64416 +#include <xen/interface/physdev.h>
64417 +#include <asm/hypervisor.h>
64418 +#include <linux/mc146818rtc.h> /* RTC_IRQ */
64419 +
64420 +/*
64421 + * This lock protects updates to the following mapping and reference-count
64422 + * arrays. The lock does not need to be acquired to read the mapping tables.
64423 + */
64424 +static DEFINE_SPINLOCK(irq_mapping_update_lock);
64425 +
64426 +/* IRQ <-> event-channel mappings. */
64427 +static int evtchn_to_irq[NR_EVENT_CHANNELS] = {
64428 + [0 ... NR_EVENT_CHANNELS-1] = -1 };
64429 +
64430 +/* Packed IRQ information: binding type, sub-type index, and event channel. */
64431 +static u32 irq_info[NR_IRQS];
64432 +
64433 +/* Binding types. */
64434 +enum { IRQT_UNBOUND, IRQT_PIRQ, IRQT_VIRQ, IRQT_IPI, IRQT_EVTCHN };
64435 +
64436 +/* Constructor for packed IRQ information. */
64437 +static inline u32 mk_irq_info(u32 type, u32 index, u32 evtchn)
64438 +{
64439 + return ((type << 24) | (index << 16) | evtchn);
64440 +}
64441 +
64442 +/* Convenient shorthand for packed representation of an unbound IRQ. */
64443 +#define IRQ_UNBOUND mk_irq_info(IRQT_UNBOUND, 0, 0)
64444 +
64445 +/*
64446 + * Accessors for packed IRQ information.
64447 + */
64448 +
64449 +static inline unsigned int evtchn_from_irq(int irq)
64450 +{
64451 + return (u16)(irq_info[irq]);
64452 +}
64453 +
64454 +static inline unsigned int index_from_irq(int irq)
64455 +{
64456 + return (u8)(irq_info[irq] >> 16);
64457 +}
64458 +
64459 +static inline unsigned int type_from_irq(int irq)
64460 +{
64461 + return (u8)(irq_info[irq] >> 24);
64462 +}
64463 +
64464 +/* IRQ <-> VIRQ mapping. */
64465 +DEFINE_PER_CPU(int, virq_to_irq[NR_VIRQS]) = {[0 ... NR_VIRQS-1] = -1};
64466 +
64467 +/* IRQ <-> IPI mapping. */
64468 +#ifndef NR_IPIS
64469 +#define NR_IPIS 1
64470 +#endif
64471 +DEFINE_PER_CPU(int, ipi_to_irq[NR_IPIS]) = {[0 ... NR_IPIS-1] = -1};
64472 +
64473 +/* Reference counts for bindings to IRQs. */
64474 +static int irq_bindcount[NR_IRQS];
64475 +
64476 +/* Bitmap indicating which PIRQs require Xen to be notified on unmask. */
64477 +static unsigned long pirq_needs_eoi[NR_PIRQS/sizeof(unsigned long)];
64478 +
64479 +#ifdef CONFIG_SMP
64480 +
64481 +static u8 cpu_evtchn[NR_EVENT_CHANNELS];
64482 +static unsigned long cpu_evtchn_mask[NR_CPUS][NR_EVENT_CHANNELS/BITS_PER_LONG];
64483 +
64484 +static inline unsigned long active_evtchns(unsigned int cpu, shared_info_t *sh,
64485 + unsigned int idx)
64486 +{
64487 + return (sh->evtchn_pending[idx] &
64488 + cpu_evtchn_mask[cpu][idx] &
64489 + ~sh->evtchn_mask[idx]);
64490 +}
64491 +
64492 +static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu)
64493 +{
64494 + int irq = evtchn_to_irq[chn];
64495 +
64496 + BUG_ON(irq == -1);
64497 + set_native_irq_info(irq, cpumask_of_cpu(cpu));
64498 +
64499 + clear_bit(chn, (unsigned long *)cpu_evtchn_mask[cpu_evtchn[chn]]);
64500 + set_bit(chn, (unsigned long *)cpu_evtchn_mask[cpu]);
64501 + cpu_evtchn[chn] = cpu;
64502 +}
64503 +
64504 +static void init_evtchn_cpu_bindings(void)
64505 +{
64506 + int i;
64507 +
64508 + /* By default all event channels notify CPU#0. */
64509 + for (i = 0; i < NR_IRQS; i++)
64510 + set_native_irq_info(i, cpumask_of_cpu(0));
64511 +
64512 + memset(cpu_evtchn, 0, sizeof(cpu_evtchn));
64513 + memset(cpu_evtchn_mask[0], ~0, sizeof(cpu_evtchn_mask[0]));
64514 +}
64515 +
64516 +static inline unsigned int cpu_from_evtchn(unsigned int evtchn)
64517 +{
64518 + return cpu_evtchn[evtchn];
64519 +}
64520 +
64521 +#else
64522 +
64523 +static inline unsigned long active_evtchns(unsigned int cpu, shared_info_t *sh,
64524 + unsigned int idx)
64525 +{
64526 + return (sh->evtchn_pending[idx] & ~sh->evtchn_mask[idx]);
64527 +}
64528 +
64529 +static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu)
64530 +{
64531 +}
64532 +
64533 +static void init_evtchn_cpu_bindings(void)
64534 +{
64535 +}
64536 +
64537 +static inline unsigned int cpu_from_evtchn(unsigned int evtchn)
64538 +{
64539 + return 0;
64540 +}
64541 +
64542 +#endif
64543 +
64544 +/* Upcall to generic IRQ layer. */
64545 +#ifdef CONFIG_X86
64546 +extern fastcall unsigned int do_IRQ(struct pt_regs *regs);
64547 +void __init xen_init_IRQ(void);
64548 +void __init init_IRQ(void)
64549 +{
64550 + irq_ctx_init(0);
64551 + xen_init_IRQ();
64552 +}
64553 +#if defined (__i386__)
64554 +static inline void exit_idle(void) {}
64555 +#define IRQ_REG orig_eax
64556 +#elif defined (__x86_64__)
64557 +#include <asm/idle.h>
64558 +#define IRQ_REG orig_rax
64559 +#endif
64560 +#define do_IRQ(irq, regs) do { \
64561 + (regs)->IRQ_REG = ~(irq); \
64562 + do_IRQ((regs)); \
64563 +} while (0)
64564 +#endif
64565 +
64566 +/* Xen will never allocate port zero for any purpose. */
64567 +#define VALID_EVTCHN(chn) ((chn) != 0)
64568 +
64569 +/*
64570 + * Force a proper event-channel callback from Xen after clearing the
64571 + * callback mask. We do this in a very simple manner, by making a call
64572 + * down into Xen. The pending flag will be checked by Xen on return.
64573 + */
64574 +void force_evtchn_callback(void)
64575 +{
64576 + (void)HYPERVISOR_xen_version(0, NULL);
64577 +}
64578 +/* Not a GPL symbol: used in ubiquitous macros, so too restrictive. */
64579 +EXPORT_SYMBOL(force_evtchn_callback);
64580 +
64581 +/* NB. Interrupts are disabled on entry. */
64582 +asmlinkage void evtchn_do_upcall(struct pt_regs *regs)
64583 +{
64584 + unsigned long l1, l2;
64585 + unsigned int l1i, l2i, port;
64586 + int irq, cpu = smp_processor_id();
64587 + shared_info_t *s = HYPERVISOR_shared_info;
64588 + vcpu_info_t *vcpu_info = &s->vcpu_info[cpu];
64589 +
64590 + vcpu_info->evtchn_upcall_pending = 0;
64591 +
64592 +#ifndef CONFIG_X86 /* No need for a barrier -- XCHG is a barrier on x86. */
64593 + /* Clear master pending flag /before/ clearing selector flag. */
64594 + rmb();
64595 +#endif
64596 + l1 = xchg(&vcpu_info->evtchn_pending_sel, 0);
64597 + while (l1 != 0) {
64598 + l1i = __ffs(l1);
64599 + l1 &= ~(1UL << l1i);
64600 +
64601 + while ((l2 = active_evtchns(cpu, s, l1i)) != 0) {
64602 + l2i = __ffs(l2);
64603 +
64604 + port = (l1i * BITS_PER_LONG) + l2i;
64605 + if ((irq = evtchn_to_irq[port]) != -1)
64606 + do_IRQ(irq, regs);
64607 + else {
64608 + exit_idle();
64609 + evtchn_device_upcall(port);
64610 + }
64611 + }
64612 + }
64613 +}
64614 +
64615 +static int find_unbound_irq(void)
64616 +{
64617 + static int warned;
64618 + int dynirq, irq;
64619 +
64620 + for (dynirq = 0; dynirq < NR_DYNIRQS; dynirq++) {
64621 + irq = dynirq_to_irq(dynirq);
64622 + if (irq_bindcount[irq] == 0)
64623 + return irq;
64624 + }
64625 +
64626 + if (!warned) {
64627 + warned = 1;
64628 + printk(KERN_WARNING "No available IRQ to bind to: "
64629 + "increase NR_DYNIRQS.\n");
64630 + }
64631 +
64632 + return -ENOSPC;
64633 +}
64634 +
64635 +static int bind_evtchn_to_irq(unsigned int evtchn)
64636 +{
64637 + int irq;
64638 +
64639 + spin_lock(&irq_mapping_update_lock);
64640 +
64641 + if ((irq = evtchn_to_irq[evtchn]) == -1) {
64642 + if ((irq = find_unbound_irq()) < 0)
64643 + goto out;
64644 +
64645 + evtchn_to_irq[evtchn] = irq;
64646 + irq_info[irq] = mk_irq_info(IRQT_EVTCHN, 0, evtchn);
64647 + }
64648 +
64649 + irq_bindcount[irq]++;
64650 +
64651 + out:
64652 + spin_unlock(&irq_mapping_update_lock);
64653 + return irq;
64654 +}
64655 +
64656 +static int bind_virq_to_irq(unsigned int virq, unsigned int cpu)
64657 +{
64658 + struct evtchn_bind_virq bind_virq;
64659 + int evtchn, irq;
64660 +
64661 + spin_lock(&irq_mapping_update_lock);
64662 +
64663 + if ((irq = per_cpu(virq_to_irq, cpu)[virq]) == -1) {
64664 + if ((irq = find_unbound_irq()) < 0)
64665 + goto out;
64666 +
64667 + bind_virq.virq = virq;
64668 + bind_virq.vcpu = cpu;
64669 + if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq,
64670 + &bind_virq) != 0)
64671 + BUG();
64672 + evtchn = bind_virq.port;
64673 +
64674 + evtchn_to_irq[evtchn] = irq;
64675 + irq_info[irq] = mk_irq_info(IRQT_VIRQ, virq, evtchn);
64676 +
64677 + per_cpu(virq_to_irq, cpu)[virq] = irq;
64678 +
64679 + bind_evtchn_to_cpu(evtchn, cpu);
64680 + }
64681 +
64682 + irq_bindcount[irq]++;
64683 +
64684 + out:
64685 + spin_unlock(&irq_mapping_update_lock);
64686 + return irq;
64687 +}
64688 +
64689 +static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu)
64690 +{
64691 + struct evtchn_bind_ipi bind_ipi;
64692 + int evtchn, irq;
64693 +
64694 + spin_lock(&irq_mapping_update_lock);
64695 +
64696 + if ((irq = per_cpu(ipi_to_irq, cpu)[ipi]) == -1) {
64697 + if ((irq = find_unbound_irq()) < 0)
64698 + goto out;
64699 +
64700 + bind_ipi.vcpu = cpu;
64701 + if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi,
64702 + &bind_ipi) != 0)
64703 + BUG();
64704 + evtchn = bind_ipi.port;
64705 +
64706 + evtchn_to_irq[evtchn] = irq;
64707 + irq_info[irq] = mk_irq_info(IRQT_IPI, ipi, evtchn);
64708 +
64709 + per_cpu(ipi_to_irq, cpu)[ipi] = irq;
64710 +
64711 + bind_evtchn_to_cpu(evtchn, cpu);
64712 + }
64713 +
64714 + irq_bindcount[irq]++;
64715 +
64716 + out:
64717 + spin_unlock(&irq_mapping_update_lock);
64718 + return irq;
64719 +}
64720 +
64721 +static void unbind_from_irq(unsigned int irq)
64722 +{
64723 + struct evtchn_close close;
64724 + int evtchn = evtchn_from_irq(irq);
64725 +
64726 + spin_lock(&irq_mapping_update_lock);
64727 +
64728 + if ((--irq_bindcount[irq] == 0) && VALID_EVTCHN(evtchn)) {
64729 + close.port = evtchn;
64730 + if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0)
64731 + BUG();
64732 +
64733 + switch (type_from_irq(irq)) {
64734 + case IRQT_VIRQ:
64735 + per_cpu(virq_to_irq, cpu_from_evtchn(evtchn))
64736 + [index_from_irq(irq)] = -1;
64737 + break;
64738 + case IRQT_IPI:
64739 + per_cpu(ipi_to_irq, cpu_from_evtchn(evtchn))
64740 + [index_from_irq(irq)] = -1;
64741 + break;
64742 + default:
64743 + break;
64744 + }
64745 +
64746 + /* Closed ports are implicitly re-bound to VCPU0. */
64747 + bind_evtchn_to_cpu(evtchn, 0);
64748 +
64749 + evtchn_to_irq[evtchn] = -1;
64750 + irq_info[irq] = IRQ_UNBOUND;
64751 + }
64752 +
64753 + spin_unlock(&irq_mapping_update_lock);
64754 +}
64755 +
64756 +int bind_evtchn_to_irqhandler(
64757 + unsigned int evtchn,
64758 + irqreturn_t (*handler)(int, void *, struct pt_regs *),
64759 + unsigned long irqflags,
64760 + const char *devname,
64761 + void *dev_id)
64762 +{
64763 + unsigned int irq;
64764 + int retval;
64765 +
64766 + irq = bind_evtchn_to_irq(evtchn);
64767 + if (irq < 0)
64768 + return irq;
64769 +
64770 + retval = request_irq(irq, handler, irqflags, devname, dev_id);
64771 + if (retval != 0) {
64772 + unbind_from_irq(irq);
64773 + return retval;
64774 + }
64775 +
64776 + return irq;
64777 +}
64778 +EXPORT_SYMBOL_GPL(bind_evtchn_to_irqhandler);
64779 +
64780 +int bind_virq_to_irqhandler(
64781 + unsigned int virq,
64782 + unsigned int cpu,
64783 + irqreturn_t (*handler)(int, void *, struct pt_regs *),
64784 + unsigned long irqflags,
64785 + const char *devname,
64786 + void *dev_id)
64787 +{
64788 + unsigned int irq;
64789 + int retval;
64790 +
64791 + irq = bind_virq_to_irq(virq, cpu);
64792 + if (irq < 0)
64793 + return irq;
64794 +
64795 + retval = request_irq(irq, handler, irqflags, devname, dev_id);
64796 + if (retval != 0) {
64797 + unbind_from_irq(irq);
64798 + return retval;
64799 + }
64800 +
64801 + return irq;
64802 +}
64803 +EXPORT_SYMBOL_GPL(bind_virq_to_irqhandler);
64804 +
64805 +int bind_ipi_to_irqhandler(
64806 + unsigned int ipi,
64807 + unsigned int cpu,
64808 + irqreturn_t (*handler)(int, void *, struct pt_regs *),
64809 + unsigned long irqflags,
64810 + const char *devname,
64811 + void *dev_id)
64812 +{
64813 + unsigned int irq;
64814 + int retval;
64815 +
64816 + irq = bind_ipi_to_irq(ipi, cpu);
64817 + if (irq < 0)
64818 + return irq;
64819 +
64820 + retval = request_irq(irq, handler, irqflags, devname, dev_id);
64821 + if (retval != 0) {
64822 + unbind_from_irq(irq);
64823 + return retval;
64824 + }
64825 +
64826 + return irq;
64827 +}
64828 +EXPORT_SYMBOL_GPL(bind_ipi_to_irqhandler);
64829 +
64830 +void unbind_from_irqhandler(unsigned int irq, void *dev_id)
64831 +{
64832 + free_irq(irq, dev_id);
64833 + unbind_from_irq(irq);
64834 +}
64835 +EXPORT_SYMBOL_GPL(unbind_from_irqhandler);
64836 +
64837 +/* Rebind an evtchn so that it gets delivered to a specific cpu */
64838 +static void rebind_irq_to_cpu(unsigned irq, unsigned tcpu)
64839 +{
64840 + struct evtchn_bind_vcpu bind_vcpu;
64841 + int evtchn = evtchn_from_irq(irq);
64842 +
64843 + if (!VALID_EVTCHN(evtchn))
64844 + return;
64845 +
64846 + /* Send future instances of this interrupt to other vcpu. */
64847 + bind_vcpu.port = evtchn;
64848 + bind_vcpu.vcpu = tcpu;
64849 +
64850 + /*
64851 + * If this fails, it usually just indicates that we're dealing with a
64852 + * virq or IPI channel, which don't actually need to be rebound. Ignore
64853 + * it, but don't do the xenlinux-level rebind in that case.
64854 + */
64855 + if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_vcpu, &bind_vcpu) >= 0)
64856 + bind_evtchn_to_cpu(evtchn, tcpu);
64857 +}
64858 +
64859 +
64860 +static void set_affinity_irq(unsigned irq, cpumask_t dest)
64861 +{
64862 + unsigned tcpu = first_cpu(dest);
64863 + rebind_irq_to_cpu(irq, tcpu);
64864 +}
64865 +
64866 +/*
64867 + * Interface to generic handling in irq.c
64868 + */
64869 +
64870 +static unsigned int startup_dynirq(unsigned int irq)
64871 +{
64872 + int evtchn = evtchn_from_irq(irq);
64873 +
64874 + if (VALID_EVTCHN(evtchn))
64875 + unmask_evtchn(evtchn);
64876 + return 0;
64877 +}
64878 +
64879 +static void shutdown_dynirq(unsigned int irq)
64880 +{
64881 + int evtchn = evtchn_from_irq(irq);
64882 +
64883 + if (VALID_EVTCHN(evtchn))
64884 + mask_evtchn(evtchn);
64885 +}
64886 +
64887 +static void enable_dynirq(unsigned int irq)
64888 +{
64889 + int evtchn = evtchn_from_irq(irq);
64890 +
64891 + if (VALID_EVTCHN(evtchn))
64892 + unmask_evtchn(evtchn);
64893 +}
64894 +
64895 +static void disable_dynirq(unsigned int irq)
64896 +{
64897 + int evtchn = evtchn_from_irq(irq);
64898 +
64899 + if (VALID_EVTCHN(evtchn))
64900 + mask_evtchn(evtchn);
64901 +}
64902 +
64903 +static void ack_dynirq(unsigned int irq)
64904 +{
64905 + int evtchn = evtchn_from_irq(irq);
64906 +
64907 + move_native_irq(irq);
64908 +
64909 + if (VALID_EVTCHN(evtchn)) {
64910 + mask_evtchn(evtchn);
64911 + clear_evtchn(evtchn);
64912 + }
64913 +}
64914 +
64915 +static void end_dynirq(unsigned int irq)
64916 +{
64917 + int evtchn = evtchn_from_irq(irq);
64918 +
64919 + if (VALID_EVTCHN(evtchn) && !(irq_desc[irq].status & IRQ_DISABLED))
64920 + unmask_evtchn(evtchn);
64921 +}
64922 +
64923 +static struct hw_interrupt_type dynirq_type = {
64924 + "Dynamic-irq",
64925 + startup_dynirq,
64926 + shutdown_dynirq,
64927 + enable_dynirq,
64928 + disable_dynirq,
64929 + ack_dynirq,
64930 + end_dynirq,
64931 + set_affinity_irq
64932 +};
64933 +
64934 +static inline void pirq_unmask_notify(int pirq)
64935 +{
64936 + struct physdev_eoi eoi = { .irq = pirq };
64937 + if (unlikely(test_bit(pirq, &pirq_needs_eoi[0])))
64938 + (void)HYPERVISOR_physdev_op(PHYSDEVOP_eoi, &eoi);
64939 +}
64940 +
64941 +static inline void pirq_query_unmask(int pirq)
64942 +{
64943 + struct physdev_irq_status_query irq_status;
64944 + irq_status.irq = pirq;
64945 + (void)HYPERVISOR_physdev_op(PHYSDEVOP_irq_status_query, &irq_status);
64946 + clear_bit(pirq, &pirq_needs_eoi[0]);
64947 + if (irq_status.flags & XENIRQSTAT_needs_eoi)
64948 + set_bit(pirq, &pirq_needs_eoi[0]);
64949 +}
64950 +
64951 +/*
64952 + * On startup, if there is no action associated with the IRQ then we are
64953 + * probing. In this case we should not share with others as it will confuse us.
64954 + */
64955 +#define probing_irq(_irq) (irq_desc[(_irq)].action == NULL)
64956 +
64957 +static unsigned int startup_pirq(unsigned int irq)
64958 +{
64959 + struct evtchn_bind_pirq bind_pirq;
64960 + int evtchn = evtchn_from_irq(irq);
64961 +
64962 + if (VALID_EVTCHN(evtchn))
64963 + goto out;
64964 +
64965 + bind_pirq.pirq = irq;
64966 + /* NB. We are happy to share unless we are probing. */
64967 + bind_pirq.flags = probing_irq(irq) ? 0 : BIND_PIRQ__WILL_SHARE;
64968 + if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_pirq, &bind_pirq) != 0) {
64969 + if (!probing_irq(irq))
64970 + printk(KERN_INFO "Failed to obtain physical IRQ %d\n",
64971 + irq);
64972 + return 0;
64973 + }
64974 + evtchn = bind_pirq.port;
64975 +
64976 + pirq_query_unmask(irq_to_pirq(irq));
64977 +
64978 + evtchn_to_irq[evtchn] = irq;
64979 + bind_evtchn_to_cpu(evtchn, 0);
64980 + irq_info[irq] = mk_irq_info(IRQT_PIRQ, irq, evtchn);
64981 +
64982 + out:
64983 + unmask_evtchn(evtchn);
64984 + pirq_unmask_notify(irq_to_pirq(irq));
64985 +
64986 + return 0;
64987 +}
64988 +
64989 +static void shutdown_pirq(unsigned int irq)
64990 +{
64991 + struct evtchn_close close;
64992 + int evtchn = evtchn_from_irq(irq);
64993 +
64994 + if (!VALID_EVTCHN(evtchn))
64995 + return;
64996 +
64997 + mask_evtchn(evtchn);
64998 +
64999 + close.port = evtchn;
65000 + if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0)
65001 + BUG();
65002 +
65003 + bind_evtchn_to_cpu(evtchn, 0);
65004 + evtchn_to_irq[evtchn] = -1;
65005 + irq_info[irq] = IRQ_UNBOUND;
65006 +}
65007 +
65008 +static void enable_pirq(unsigned int irq)
65009 +{
65010 + int evtchn = evtchn_from_irq(irq);
65011 +
65012 + if (VALID_EVTCHN(evtchn)) {
65013 + unmask_evtchn(evtchn);
65014 + pirq_unmask_notify(irq_to_pirq(irq));
65015 + }
65016 +}
65017 +
65018 +static void disable_pirq(unsigned int irq)
65019 +{
65020 + int evtchn = evtchn_from_irq(irq);
65021 +
65022 + if (VALID_EVTCHN(evtchn))
65023 + mask_evtchn(evtchn);
65024 +}
65025 +
65026 +static void ack_pirq(unsigned int irq)
65027 +{
65028 + int evtchn = evtchn_from_irq(irq);
65029 +
65030 + move_native_irq(irq);
65031 +
65032 + if (VALID_EVTCHN(evtchn)) {
65033 + mask_evtchn(evtchn);
65034 + clear_evtchn(evtchn);
65035 + }
65036 +}
65037 +
65038 +static void end_pirq(unsigned int irq)
65039 +{
65040 + int evtchn = evtchn_from_irq(irq);
65041 +
65042 + if (VALID_EVTCHN(evtchn) && !(irq_desc[irq].status & IRQ_DISABLED)) {
65043 + unmask_evtchn(evtchn);
65044 + pirq_unmask_notify(irq_to_pirq(irq));
65045 + }
65046 +}
65047 +
65048 +static struct hw_interrupt_type pirq_type = {
65049 + "Phys-irq",
65050 + startup_pirq,
65051 + shutdown_pirq,
65052 + enable_pirq,
65053 + disable_pirq,
65054 + ack_pirq,
65055 + end_pirq,
65056 + set_affinity_irq
65057 +};
65058 +
65059 +int irq_ignore_unhandled(unsigned int irq)
65060 +{
65061 + struct physdev_irq_status_query irq_status = { .irq = irq };
65062 +
65063 + if (!is_running_on_xen())
65064 + return 0;
65065 +
65066 + (void)HYPERVISOR_physdev_op(PHYSDEVOP_irq_status_query, &irq_status);
65067 + return !!(irq_status.flags & XENIRQSTAT_shared);
65068 +}
65069 +
65070 +void resend_irq_on_evtchn(struct hw_interrupt_type *h, unsigned int i)
65071 +{
65072 + int evtchn = evtchn_from_irq(i);
65073 + shared_info_t *s = HYPERVISOR_shared_info;
65074 + if (!VALID_EVTCHN(evtchn))
65075 + return;
65076 + BUG_ON(!synch_test_bit(evtchn, &s->evtchn_mask[0]));
65077 + synch_set_bit(evtchn, &s->evtchn_pending[0]);
65078 +}
65079 +
65080 +void notify_remote_via_irq(int irq)
65081 +{
65082 + int evtchn = evtchn_from_irq(irq);
65083 +
65084 + if (VALID_EVTCHN(evtchn))
65085 + notify_remote_via_evtchn(evtchn);
65086 +}
65087 +EXPORT_SYMBOL_GPL(notify_remote_via_irq);
65088 +
65089 +void mask_evtchn(int port)
65090 +{
65091 + shared_info_t *s = HYPERVISOR_shared_info;
65092 + synch_set_bit(port, &s->evtchn_mask[0]);
65093 +}
65094 +EXPORT_SYMBOL_GPL(mask_evtchn);
65095 +
65096 +void unmask_evtchn(int port)
65097 +{
65098 + shared_info_t *s = HYPERVISOR_shared_info;
65099 + unsigned int cpu = smp_processor_id();
65100 + vcpu_info_t *vcpu_info = &s->vcpu_info[cpu];
65101 +
65102 + BUG_ON(!irqs_disabled());
65103 +
65104 + /* Slow path (hypercall) if this is a non-local port. */
65105 + if (unlikely(cpu != cpu_from_evtchn(port))) {
65106 + struct evtchn_unmask unmask = { .port = port };
65107 + (void)HYPERVISOR_event_channel_op(EVTCHNOP_unmask, &unmask);
65108 + return;
65109 + }
65110 +
65111 + synch_clear_bit(port, &s->evtchn_mask[0]);
65112 +
65113 + /*
65114 + * The following is basically the equivalent of 'hw_resend_irq'. Just
65115 + * like a real IO-APIC we 'lose the interrupt edge' if the channel is
65116 + * masked.
65117 + */
65118 + if (synch_test_bit(port, &s->evtchn_pending[0]) &&
65119 + !synch_test_and_set_bit(port / BITS_PER_LONG,
65120 + &vcpu_info->evtchn_pending_sel))
65121 + vcpu_info->evtchn_upcall_pending = 1;
65122 +}
65123 +EXPORT_SYMBOL_GPL(unmask_evtchn);
65124 +
65125 +void irq_resume(void)
65126 +{
65127 + struct evtchn_bind_virq bind_virq;
65128 + struct evtchn_bind_ipi bind_ipi;
65129 + int cpu, pirq, virq, ipi, irq, evtchn;
65130 +
65131 + init_evtchn_cpu_bindings();
65132 +
65133 + /* New event-channel space is not 'live' yet. */
65134 + for (evtchn = 0; evtchn < NR_EVENT_CHANNELS; evtchn++)
65135 + mask_evtchn(evtchn);
65136 +
65137 + /* Check that no PIRQs are still bound. */
65138 + for (pirq = 0; pirq < NR_PIRQS; pirq++)
65139 + BUG_ON(irq_info[pirq_to_irq(pirq)] != IRQ_UNBOUND);
65140 +
65141 + /* Secondary CPUs must have no VIRQ or IPI bindings. */
65142 + for_each_possible_cpu(cpu) {
65143 + if (cpu == 0)
65144 + continue;
65145 + for (virq = 0; virq < NR_VIRQS; virq++)
65146 + BUG_ON(per_cpu(virq_to_irq, cpu)[virq] != -1);
65147 + for (ipi = 0; ipi < NR_IPIS; ipi++)
65148 + BUG_ON(per_cpu(ipi_to_irq, cpu)[ipi] != -1);
65149 + }
65150 +
65151 + /* No IRQ <-> event-channel mappings. */
65152 + for (irq = 0; irq < NR_IRQS; irq++)
65153 + irq_info[irq] &= ~0xFFFF; /* zap event-channel binding */
65154 + for (evtchn = 0; evtchn < NR_EVENT_CHANNELS; evtchn++)
65155 + evtchn_to_irq[evtchn] = -1;
65156 +
65157 + /* Primary CPU: rebind VIRQs automatically. */
65158 + for (virq = 0; virq < NR_VIRQS; virq++) {
65159 + if ((irq = per_cpu(virq_to_irq, 0)[virq]) == -1)
65160 + continue;
65161 +
65162 + BUG_ON(irq_info[irq] != mk_irq_info(IRQT_VIRQ, virq, 0));
65163 +
65164 + /* Get a new binding from Xen. */
65165 + bind_virq.virq = virq;
65166 + bind_virq.vcpu = 0;
65167 + if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq,
65168 + &bind_virq) != 0)
65169 + BUG();
65170 + evtchn = bind_virq.port;
65171 +
65172 + /* Record the new mapping. */
65173 + evtchn_to_irq[evtchn] = irq;
65174 + irq_info[irq] = mk_irq_info(IRQT_VIRQ, virq, evtchn);
65175 +
65176 + /* Ready for use. */
65177 + unmask_evtchn(evtchn);
65178 + }
65179 +
65180 + /* Primary CPU: rebind IPIs automatically. */
65181 + for (ipi = 0; ipi < NR_IPIS; ipi++) {
65182 + if ((irq = per_cpu(ipi_to_irq, 0)[ipi]) == -1)
65183 + continue;
65184 +
65185 + BUG_ON(irq_info[irq] != mk_irq_info(IRQT_IPI, ipi, 0));
65186 +
65187 + /* Get a new binding from Xen. */
65188 + bind_ipi.vcpu = 0;
65189 + if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi,
65190 + &bind_ipi) != 0)
65191 + BUG();
65192 + evtchn = bind_ipi.port;
65193 +
65194 + /* Record the new mapping. */
65195 + evtchn_to_irq[evtchn] = irq;
65196 + irq_info[irq] = mk_irq_info(IRQT_IPI, ipi, evtchn);
65197 +
65198 + /* Ready for use. */
65199 + unmask_evtchn(evtchn);
65200 + }
65201 +}
65202 +
65203 +void __init xen_init_IRQ(void)
65204 +{
65205 + int i;
65206 +
65207 + init_evtchn_cpu_bindings();
65208 +
65209 + /* No event channels are 'live' right now. */
65210 + for (i = 0; i < NR_EVENT_CHANNELS; i++)
65211 + mask_evtchn(i);
65212 +
65213 + /* No IRQ -> event-channel mappings. */
65214 + for (i = 0; i < NR_IRQS; i++)
65215 + irq_info[i] = IRQ_UNBOUND;
65216 +
65217 + /* Dynamic IRQ space is currently unbound. Zero the refcnts. */
65218 + for (i = 0; i < NR_DYNIRQS; i++) {
65219 + irq_bindcount[dynirq_to_irq(i)] = 0;
65220 +
65221 + irq_desc[dynirq_to_irq(i)].status = IRQ_DISABLED;
65222 + irq_desc[dynirq_to_irq(i)].action = NULL;
65223 + irq_desc[dynirq_to_irq(i)].depth = 1;
65224 + irq_desc[dynirq_to_irq(i)].handler = &dynirq_type;
65225 + }
65226 +
65227 + /* Phys IRQ space is statically bound (1:1 mapping). Nail refcnts. */
65228 + for (i = 0; i < NR_PIRQS; i++) {
65229 + irq_bindcount[pirq_to_irq(i)] = 1;
65230 +
65231 +#ifdef RTC_IRQ
65232 + /* If not domain 0, force our RTC driver to fail its probe. */
65233 + if ((i == RTC_IRQ) && !is_initial_xendomain())
65234 + continue;
65235 +#endif
65236 +
65237 + irq_desc[pirq_to_irq(i)].status = IRQ_DISABLED;
65238 + irq_desc[pirq_to_irq(i)].action = NULL;
65239 + irq_desc[pirq_to_irq(i)].depth = 1;
65240 + irq_desc[pirq_to_irq(i)].handler = &pirq_type;
65241 + }
65242 +}
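The helpers above (bind_evtchn_to_irqhandler(), notify_remote_via_irq(), unbind_from_irqhandler()) form the interface split drivers use to wire an event channel to a Linux interrupt handler. A minimal sketch of that usage follows; it is illustrative only, the handler and names are hypothetical, and a real frontend would read the remote port from xenstore.

#include <linux/interrupt.h>
#include <xen/evtchn.h>

/* Hypothetical frontend handler; signature as expected by the binding above. */
static irqreturn_t example_interrupt(int irq, void *dev_id, struct pt_regs *regs)
{
	/* Process the notification from the remote domain here. */
	return IRQ_HANDLED;
}

static int example_connect(unsigned int evtchn)
{
	int irq;

	/* Bind the port to a dynamic IRQ and install the handler. */
	irq = bind_evtchn_to_irqhandler(evtchn, example_interrupt,
					SA_INTERRUPT, "example-frontend", NULL);
	if (irq < 0)
		return irq;

	/* Kick the remote end; resolves to notify_remote_via_evtchn(). */
	notify_remote_via_irq(irq);

	/* Teardown later: unbind_from_irqhandler(irq, NULL); */
	return irq;
}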
65243 diff -Nur linux-2.6.16.33-noxen/drivers/xen/core/features.c linux-2.6.16.33/drivers/xen/core/features.c
65244 --- linux-2.6.16.33-noxen/drivers/xen/core/features.c 1970-01-01 00:00:00.000000000 +0000
65245 +++ linux-2.6.16.33/drivers/xen/core/features.c 2007-01-08 15:00:45.000000000 +0000
65246 @@ -0,0 +1,34 @@
65247 +/******************************************************************************
65248 + * features.c
65249 + *
65250 + * Xen feature flags.
65251 + *
65252 + * Copyright (c) 2006, Ian Campbell, XenSource Inc.
65253 + */
65254 +#include <linux/types.h>
65255 +#include <linux/cache.h>
65256 +#include <linux/module.h>
65257 +#include <asm/hypervisor.h>
65258 +#include <xen/features.h>
65259 +
65260 +#ifdef HAVE_XEN_PLATFORM_COMPAT_H
65261 +#include <xen/platform-compat.h>
65262 +#endif
65263 +
65264 +u8 xen_features[XENFEAT_NR_SUBMAPS * 32] __read_mostly;
65265 +/* Not a GPL symbol: used in ubiquitous macros, so too restrictive. */
65266 +EXPORT_SYMBOL(xen_features);
65267 +
65268 +void setup_xen_features(void)
65269 +{
65270 + xen_feature_info_t fi;
65271 + int i, j;
65272 +
65273 + for (i = 0; i < XENFEAT_NR_SUBMAPS; i++) {
65274 + fi.submap_idx = i;
65275 + if (HYPERVISOR_xen_version(XENVER_get_features, &fi) < 0)
65276 + break;
65277 +		for (j = 0; j < 32; j++)
65278 +			xen_features[i * 32 + j] = !!(fi.submap & (1 << j));
65279 + }
65280 +}
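The feature map filled in by setup_xen_features() is consulted through the xen_feature() accessor from <xen/features.h>, as done later in this patch (for example to refuse suspend under an auto-translated physmap). A minimal sketch, assuming only that accessor:

#include <linux/kernel.h>
#include <xen/features.h>

static int __init example_feature_check(void)
{
	/* Flag is populated by setup_xen_features() at boot. */
	if (xen_feature(XENFEAT_auto_translated_physmap))
		printk(KERN_INFO "auto-translated physmap is in effect\n");
	return 0;
}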
65281 diff -Nur linux-2.6.16.33-noxen/drivers/xen/core/gnttab.c linux-2.6.16.33/drivers/xen/core/gnttab.c
65282 --- linux-2.6.16.33-noxen/drivers/xen/core/gnttab.c 1970-01-01 00:00:00.000000000 +0000
65283 +++ linux-2.6.16.33/drivers/xen/core/gnttab.c 2007-01-08 15:00:45.000000000 +0000
65284 @@ -0,0 +1,488 @@
65285 +/******************************************************************************
65286 + * gnttab.c
65287 + *
65288 + * Granting foreign access to our memory reservation.
65289 + *
65290 + * Copyright (c) 2005, Christopher Clark
65291 + * Copyright (c) 2004-2005, K A Fraser
65292 + *
65293 + * This program is free software; you can redistribute it and/or
65294 + * modify it under the terms of the GNU General Public License version 2
65295 + * as published by the Free Software Foundation; or, when distributed
65296 + * separately from the Linux kernel or incorporated into other
65297 + * software packages, subject to the following license:
65298 + *
65299 + * Permission is hereby granted, free of charge, to any person obtaining a copy
65300 + * of this source file (the "Software"), to deal in the Software without
65301 + * restriction, including without limitation the rights to use, copy, modify,
65302 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
65303 + * and to permit persons to whom the Software is furnished to do so, subject to
65304 + * the following conditions:
65305 + *
65306 + * The above copyright notice and this permission notice shall be included in
65307 + * all copies or substantial portions of the Software.
65308 + *
65309 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
65310 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
65311 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
65312 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
65313 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
65314 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
65315 + * IN THE SOFTWARE.
65316 + */
65317 +
65318 +#include <linux/config.h>
65319 +#include <linux/module.h>
65320 +#include <linux/sched.h>
65321 +#include <linux/mm.h>
65322 +#include <linux/vmalloc.h>
65323 +#include <xen/interface/xen.h>
65324 +#include <xen/gnttab.h>
65325 +#include <asm/pgtable.h>
65326 +#include <asm/uaccess.h>
65327 +#include <asm/synch_bitops.h>
65328 +#include <asm/io.h>
65329 +#include <xen/interface/memory.h>
65330 +
65331 +#ifdef HAVE_XEN_PLATFORM_COMPAT_H
65332 +#include <xen/platform-compat.h>
65333 +#endif
65334 +
65335 +/* External tools reserve first few grant table entries. */
65336 +#define NR_RESERVED_ENTRIES 8
65337 +
65338 +#define NR_GRANT_ENTRIES \
65339 + (NR_GRANT_FRAMES * PAGE_SIZE / sizeof(struct grant_entry))
65340 +#define GNTTAB_LIST_END (NR_GRANT_ENTRIES + 1)
65341 +
65342 +static grant_ref_t gnttab_list[NR_GRANT_ENTRIES];
65343 +static int gnttab_free_count;
65344 +static grant_ref_t gnttab_free_head;
65345 +static DEFINE_SPINLOCK(gnttab_list_lock);
65346 +
65347 +static struct grant_entry *shared;
65348 +
65349 +static struct gnttab_free_callback *gnttab_free_callback_list;
65350 +
65351 +static int get_free_entries(int count)
65352 +{
65353 + unsigned long flags;
65354 + int ref;
65355 + grant_ref_t head;
65356 + spin_lock_irqsave(&gnttab_list_lock, flags);
65357 + if (gnttab_free_count < count) {
65358 + spin_unlock_irqrestore(&gnttab_list_lock, flags);
65359 + return -1;
65360 + }
65361 + ref = head = gnttab_free_head;
65362 + gnttab_free_count -= count;
65363 + while (count-- > 1)
65364 + head = gnttab_list[head];
65365 + gnttab_free_head = gnttab_list[head];
65366 + gnttab_list[head] = GNTTAB_LIST_END;
65367 + spin_unlock_irqrestore(&gnttab_list_lock, flags);
65368 + return ref;
65369 +}
65370 +
65371 +#define get_free_entry() get_free_entries(1)
65372 +
65373 +static void do_free_callbacks(void)
65374 +{
65375 + struct gnttab_free_callback *callback, *next;
65376 +
65377 + callback = gnttab_free_callback_list;
65378 + gnttab_free_callback_list = NULL;
65379 +
65380 + while (callback != NULL) {
65381 + next = callback->next;
65382 + if (gnttab_free_count >= callback->count) {
65383 + callback->next = NULL;
65384 + callback->fn(callback->arg);
65385 + } else {
65386 + callback->next = gnttab_free_callback_list;
65387 + gnttab_free_callback_list = callback;
65388 + }
65389 + callback = next;
65390 + }
65391 +}
65392 +
65393 +static inline void check_free_callbacks(void)
65394 +{
65395 + if (unlikely(gnttab_free_callback_list))
65396 + do_free_callbacks();
65397 +}
65398 +
65399 +static void put_free_entry(grant_ref_t ref)
65400 +{
65401 + unsigned long flags;
65402 + spin_lock_irqsave(&gnttab_list_lock, flags);
65403 + gnttab_list[ref] = gnttab_free_head;
65404 + gnttab_free_head = ref;
65405 + gnttab_free_count++;
65406 + check_free_callbacks();
65407 + spin_unlock_irqrestore(&gnttab_list_lock, flags);
65408 +}
65409 +
65410 +/*
65411 + * Public grant-issuing interface functions
65412 + */
65413 +
65414 +int gnttab_grant_foreign_access(domid_t domid, unsigned long frame,
65415 + int readonly)
65416 +{
65417 + int ref;
65418 +
65419 + if (unlikely((ref = get_free_entry()) == -1))
65420 + return -ENOSPC;
65421 +
65422 + shared[ref].frame = frame;
65423 + shared[ref].domid = domid;
65424 + wmb();
65425 + shared[ref].flags = GTF_permit_access | (readonly ? GTF_readonly : 0);
65426 +
65427 + return ref;
65428 +}
65429 +EXPORT_SYMBOL_GPL(gnttab_grant_foreign_access);
65430 +
65431 +void gnttab_grant_foreign_access_ref(grant_ref_t ref, domid_t domid,
65432 + unsigned long frame, int readonly)
65433 +{
65434 + shared[ref].frame = frame;
65435 + shared[ref].domid = domid;
65436 + wmb();
65437 + shared[ref].flags = GTF_permit_access | (readonly ? GTF_readonly : 0);
65438 +}
65439 +EXPORT_SYMBOL_GPL(gnttab_grant_foreign_access_ref);
65440 +
65441 +
65442 +int gnttab_query_foreign_access(grant_ref_t ref)
65443 +{
65444 + u16 nflags;
65445 +
65446 + nflags = shared[ref].flags;
65447 +
65448 + return (nflags & (GTF_reading|GTF_writing));
65449 +}
65450 +EXPORT_SYMBOL_GPL(gnttab_query_foreign_access);
65451 +
65452 +int gnttab_end_foreign_access_ref(grant_ref_t ref, int readonly)
65453 +{
65454 + u16 flags, nflags;
65455 +
65456 + nflags = shared[ref].flags;
65457 + do {
65458 + if ((flags = nflags) & (GTF_reading|GTF_writing)) {
65459 + printk(KERN_ALERT "WARNING: g.e. still in use!\n");
65460 + return 0;
65461 + }
65462 + } while ((nflags = synch_cmpxchg_subword(&shared[ref].flags, flags, 0)) !=
65463 + flags);
65464 +
65465 + return 1;
65466 +}
65467 +EXPORT_SYMBOL_GPL(gnttab_end_foreign_access_ref);
65468 +
65469 +void gnttab_end_foreign_access(grant_ref_t ref, int readonly,
65470 + unsigned long page)
65471 +{
65472 + if (gnttab_end_foreign_access_ref(ref, readonly)) {
65473 + put_free_entry(ref);
65474 + if (page != 0)
65475 + free_page(page);
65476 + } else {
65477 + /* XXX This needs to be fixed so that the ref and page are
65478 + placed on a list to be freed up later. */
65479 + printk(KERN_WARNING
65480 + "WARNING: leaking g.e. and page still in use!\n");
65481 + }
65482 +}
65483 +EXPORT_SYMBOL_GPL(gnttab_end_foreign_access);
65484 +
65485 +int gnttab_grant_foreign_transfer(domid_t domid, unsigned long pfn)
65486 +{
65487 + int ref;
65488 +
65489 + if (unlikely((ref = get_free_entry()) == -1))
65490 + return -ENOSPC;
65491 + gnttab_grant_foreign_transfer_ref(ref, domid, pfn);
65492 +
65493 + return ref;
65494 +}
65495 +EXPORT_SYMBOL_GPL(gnttab_grant_foreign_transfer);
65496 +
65497 +void gnttab_grant_foreign_transfer_ref(grant_ref_t ref, domid_t domid,
65498 + unsigned long pfn)
65499 +{
65500 + shared[ref].frame = pfn;
65501 + shared[ref].domid = domid;
65502 + wmb();
65503 + shared[ref].flags = GTF_accept_transfer;
65504 +}
65505 +EXPORT_SYMBOL_GPL(gnttab_grant_foreign_transfer_ref);
65506 +
65507 +unsigned long gnttab_end_foreign_transfer_ref(grant_ref_t ref)
65508 +{
65509 + unsigned long frame;
65510 + u16 flags;
65511 +
65512 + /*
65513 +	 * If the transfer has not even started yet, try to reclaim the grant
65514 + * reference and return failure (== 0).
65515 + */
65516 + while (!((flags = shared[ref].flags) & GTF_transfer_committed)) {
65517 + if (synch_cmpxchg_subword(&shared[ref].flags, flags, 0) == flags)
65518 + return 0;
65519 + cpu_relax();
65520 + }
65521 +
65522 + /* If a transfer is in progress then wait until it is completed. */
65523 + while (!(flags & GTF_transfer_completed)) {
65524 + flags = shared[ref].flags;
65525 + cpu_relax();
65526 + }
65527 +
65528 + /* Read the frame number /after/ reading completion status. */
65529 + rmb();
65530 + frame = shared[ref].frame;
65531 + BUG_ON(frame == 0);
65532 +
65533 + return frame;
65534 +}
65535 +EXPORT_SYMBOL_GPL(gnttab_end_foreign_transfer_ref);
65536 +
65537 +unsigned long gnttab_end_foreign_transfer(grant_ref_t ref)
65538 +{
65539 + unsigned long frame = gnttab_end_foreign_transfer_ref(ref);
65540 + put_free_entry(ref);
65541 + return frame;
65542 +}
65543 +EXPORT_SYMBOL_GPL(gnttab_end_foreign_transfer);
65544 +
65545 +void gnttab_free_grant_reference(grant_ref_t ref)
65546 +{
65547 + put_free_entry(ref);
65548 +}
65549 +EXPORT_SYMBOL_GPL(gnttab_free_grant_reference);
65550 +
65551 +void gnttab_free_grant_references(grant_ref_t head)
65552 +{
65553 + grant_ref_t ref;
65554 + unsigned long flags;
65555 + int count = 1;
65556 + if (head == GNTTAB_LIST_END)
65557 + return;
65558 + spin_lock_irqsave(&gnttab_list_lock, flags);
65559 + ref = head;
65560 + while (gnttab_list[ref] != GNTTAB_LIST_END) {
65561 + ref = gnttab_list[ref];
65562 + count++;
65563 + }
65564 + gnttab_list[ref] = gnttab_free_head;
65565 + gnttab_free_head = head;
65566 + gnttab_free_count += count;
65567 + check_free_callbacks();
65568 + spin_unlock_irqrestore(&gnttab_list_lock, flags);
65569 +}
65570 +EXPORT_SYMBOL_GPL(gnttab_free_grant_references);
65571 +
65572 +int gnttab_alloc_grant_references(u16 count, grant_ref_t *head)
65573 +{
65574 + int h = get_free_entries(count);
65575 +
65576 + if (h == -1)
65577 + return -ENOSPC;
65578 +
65579 + *head = h;
65580 +
65581 + return 0;
65582 +}
65583 +EXPORT_SYMBOL_GPL(gnttab_alloc_grant_references);
65584 +
65585 +int gnttab_empty_grant_references(const grant_ref_t *private_head)
65586 +{
65587 + return (*private_head == GNTTAB_LIST_END);
65588 +}
65589 +EXPORT_SYMBOL_GPL(gnttab_empty_grant_references);
65590 +
65591 +int gnttab_claim_grant_reference(grant_ref_t *private_head)
65592 +{
65593 + grant_ref_t g = *private_head;
65594 + if (unlikely(g == GNTTAB_LIST_END))
65595 + return -ENOSPC;
65596 + *private_head = gnttab_list[g];
65597 + return g;
65598 +}
65599 +EXPORT_SYMBOL_GPL(gnttab_claim_grant_reference);
65600 +
65601 +void gnttab_release_grant_reference(grant_ref_t *private_head,
65602 + grant_ref_t release)
65603 +{
65604 + gnttab_list[release] = *private_head;
65605 + *private_head = release;
65606 +}
65607 +EXPORT_SYMBOL_GPL(gnttab_release_grant_reference);
65608 +
65609 +void gnttab_request_free_callback(struct gnttab_free_callback *callback,
65610 + void (*fn)(void *), void *arg, u16 count)
65611 +{
65612 + unsigned long flags;
65613 + spin_lock_irqsave(&gnttab_list_lock, flags);
65614 + if (callback->next)
65615 + goto out;
65616 + callback->fn = fn;
65617 + callback->arg = arg;
65618 + callback->count = count;
65619 + callback->next = gnttab_free_callback_list;
65620 + gnttab_free_callback_list = callback;
65621 + check_free_callbacks();
65622 +out:
65623 + spin_unlock_irqrestore(&gnttab_list_lock, flags);
65624 +}
65625 +EXPORT_SYMBOL_GPL(gnttab_request_free_callback);
65626 +
65627 +void gnttab_cancel_free_callback(struct gnttab_free_callback *callback)
65628 +{
65629 + struct gnttab_free_callback **pcb;
65630 + unsigned long flags;
65631 +
65632 + spin_lock_irqsave(&gnttab_list_lock, flags);
65633 + for (pcb = &gnttab_free_callback_list; *pcb; pcb = &(*pcb)->next) {
65634 + if (*pcb == callback) {
65635 + *pcb = callback->next;
65636 + break;
65637 + }
65638 + }
65639 + spin_unlock_irqrestore(&gnttab_list_lock, flags);
65640 +}
65641 +EXPORT_SYMBOL_GPL(gnttab_cancel_free_callback);
65642 +
65643 +#ifdef CONFIG_XEN
65644 +
65645 +#ifndef __ia64__
65646 +static int map_pte_fn(pte_t *pte, struct page *pmd_page,
65647 + unsigned long addr, void *data)
65648 +{
65649 + unsigned long **frames = (unsigned long **)data;
65650 +
65651 + set_pte_at(&init_mm, addr, pte, pfn_pte_ma((*frames)[0], PAGE_KERNEL));
65652 + (*frames)++;
65653 + return 0;
65654 +}
65655 +
65656 +static int unmap_pte_fn(pte_t *pte, struct page *pmd_page,
65657 + unsigned long addr, void *data)
65658 +{
65659 +
65660 + set_pte_at(&init_mm, addr, pte, __pte(0));
65661 + return 0;
65662 +}
65663 +#endif
65664 +
65665 +int gnttab_resume(void)
65666 +{
65667 + struct gnttab_setup_table setup;
65668 + unsigned long frames[NR_GRANT_FRAMES];
65669 + int rc;
65670 +#ifndef __ia64__
65671 + void *pframes = frames;
65672 + struct vm_struct *area;
65673 +#endif
65674 +
65675 + setup.dom = DOMID_SELF;
65676 + setup.nr_frames = NR_GRANT_FRAMES;
65677 + set_xen_guest_handle(setup.frame_list, frames);
65678 +
65679 + rc = HYPERVISOR_grant_table_op(GNTTABOP_setup_table, &setup, 1);
65680 + if (rc == -ENOSYS)
65681 + return -ENOSYS;
65682 +
65683 + BUG_ON(rc || setup.status);
65684 +
65685 +#ifndef __ia64__
65686 + if (shared == NULL) {
65687 + area = get_vm_area(PAGE_SIZE * NR_GRANT_FRAMES, VM_IOREMAP);
65688 + BUG_ON(area == NULL);
65689 + shared = area->addr;
65690 + }
65691 + rc = apply_to_page_range(&init_mm, (unsigned long)shared,
65692 + PAGE_SIZE * NR_GRANT_FRAMES,
65693 + map_pte_fn, &pframes);
65694 + BUG_ON(rc);
65695 +#else
65696 + shared = __va(frames[0] << PAGE_SHIFT);
65697 + printk("grant table at %p\n", shared);
65698 +#endif
65699 +
65700 + return 0;
65701 +}
65702 +
65703 +int gnttab_suspend(void)
65704 +{
65705 +#ifndef __ia64__
65706 + apply_to_page_range(&init_mm, (unsigned long)shared,
65707 + PAGE_SIZE * NR_GRANT_FRAMES,
65708 + unmap_pte_fn, NULL);
65709 +#endif
65710 + return 0;
65711 +}
65712 +
65713 +#else /* !CONFIG_XEN */
65714 +
65715 +#include <platform-pci.h>
65716 +
65717 +int gnttab_resume(void)
65718 +{
65719 + unsigned long frames;
65720 + struct xen_add_to_physmap xatp;
65721 + unsigned int i;
65722 +
65723 + frames = alloc_xen_mmio(PAGE_SIZE * NR_GRANT_FRAMES);
65724 +
65725 + for (i = 0; i < NR_GRANT_FRAMES; i++) {
65726 + xatp.domid = DOMID_SELF;
65727 + xatp.idx = i;
65728 + xatp.space = XENMAPSPACE_grant_table;
65729 + xatp.gpfn = (frames >> PAGE_SHIFT) + i;
65730 + if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp))
65731 + BUG();
65732 + }
65733 +
65734 + shared = ioremap(frames, PAGE_SIZE * NR_GRANT_FRAMES);
65735 + if (shared == NULL) {
65736 +		printk("Failed to ioremap gnttab shared frames\n");
65737 + return -1;
65738 + }
65739 +
65740 + return 0;
65741 +}
65742 +
65743 +int gnttab_suspend(void)
65744 +{
65745 + iounmap(shared);
65746 + return 0;
65747 +}
65748 +
65749 +#endif /* !CONFIG_XEN */
65750 +
65751 +int __init gnttab_init(void)
65752 +{
65753 + int i;
65754 +
65755 + if (!is_running_on_xen())
65756 + return -ENODEV;
65757 +
65758 + if (gnttab_resume() < 0)
65759 + return -ENODEV;
65760 +
65761 + for (i = NR_RESERVED_ENTRIES; i < NR_GRANT_ENTRIES; i++)
65762 + gnttab_list[i] = i + 1;
65763 + gnttab_free_count = NR_GRANT_ENTRIES - NR_RESERVED_ENTRIES;
65764 + gnttab_free_head = NR_RESERVED_ENTRIES;
65765 +
65766 + printk("Grant table initialized\n");
65767 + return 0;
65768 +}
65769 +
65770 +#ifdef CONFIG_XEN
65771 +core_initcall(gnttab_init);
65772 +#endif
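A frontend normally brackets each frame it shares with gnttab_grant_foreign_access() and gnttab_end_foreign_access() from the API above. The following sketch is illustrative only: the backend domid and page are hypothetical, and virt_to_mfn() is assumed to be available from the Xen asm headers as used elsewhere in this patch.

#include <xen/gnttab.h>
#include <asm/hypervisor.h>

static int example_grant_page(domid_t otherend_id, void *page_vaddr)
{
	/* Grant the backend read-only access to this frame. */
	int ref = gnttab_grant_foreign_access(otherend_id,
					      virt_to_mfn(page_vaddr), 1);
	if (ref < 0)
		return ref;	/* -ENOSPC: no free grant entries */

	/* ... pass 'ref' to the backend (e.g. in a ring request) ... */

	/* Revoke access; a page argument of 0 keeps the page allocated. */
	gnttab_end_foreign_access(ref, 1, 0);
	return 0;
}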
65773 diff -Nur linux-2.6.16.33-noxen/drivers/xen/core/hypervisor_sysfs.c linux-2.6.16.33/drivers/xen/core/hypervisor_sysfs.c
65774 --- linux-2.6.16.33-noxen/drivers/xen/core/hypervisor_sysfs.c 1970-01-01 00:00:00.000000000 +0000
65775 +++ linux-2.6.16.33/drivers/xen/core/hypervisor_sysfs.c 2007-01-08 15:00:45.000000000 +0000
65776 @@ -0,0 +1,60 @@
65777 +/*
65778 + * copyright (c) 2006 IBM Corporation
65779 + * Authored by: Mike D. Day <ncmike@us.ibm.com>
65780 + *
65781 + * This program is free software; you can redistribute it and/or modify
65782 + * it under the terms of the GNU General Public License version 2 as
65783 + * published by the Free Software Foundation.
65784 + */
65785 +
65786 +#include <linux/config.h>
65787 +#include <linux/kernel.h>
65788 +#include <linux/module.h>
65789 +#include <linux/kobject.h>
65790 +#include <xen/hypervisor_sysfs.h>
65791 +
65792 +decl_subsys(hypervisor, NULL, NULL);
65793 +
65794 +static ssize_t hyp_sysfs_show(struct kobject *kobj,
65795 + struct attribute *attr,
65796 + char *buffer)
65797 +{
65798 + struct hyp_sysfs_attr *hyp_attr;
65799 + hyp_attr = container_of(attr, struct hyp_sysfs_attr, attr);
65800 + if (hyp_attr->show)
65801 + return hyp_attr->show(hyp_attr, buffer);
65802 + return 0;
65803 +}
65804 +
65805 +static ssize_t hyp_sysfs_store(struct kobject *kobj,
65806 + struct attribute *attr,
65807 + const char *buffer,
65808 + size_t len)
65809 +{
65810 + struct hyp_sysfs_attr *hyp_attr;
65811 + hyp_attr = container_of(attr, struct hyp_sysfs_attr, attr);
65812 + if (hyp_attr->store)
65813 + return hyp_attr->store(hyp_attr, buffer, len);
65814 + return 0;
65815 +}
65816 +
65817 +struct sysfs_ops hyp_sysfs_ops = {
65818 + .show = hyp_sysfs_show,
65819 + .store = hyp_sysfs_store,
65820 +};
65821 +
65822 +static struct kobj_type hyp_sysfs_kobj_type = {
65823 + .sysfs_ops = &hyp_sysfs_ops,
65824 +};
65825 +
65826 +static int __init hypervisor_subsys_init(void)
65827 +{
65828 + if (!is_running_on_xen())
65829 + return -ENODEV;
65830 +
65831 + hypervisor_subsys.kset.kobj.ktype = &hyp_sysfs_kobj_type;
65832 + return subsystem_register(&hypervisor_subsys);
65833 +}
65834 +
65835 +device_initcall(hypervisor_subsys_init);
65836 +EXPORT_SYMBOL_GPL(hypervisor_subsys);
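Entries under /sys/hypervisor plug into the show/store dispatch above through struct hyp_sysfs_attr. A rough sketch of a read-only attribute, assuming the layout implied by the container_of() calls (an embedded struct attribute named attr plus show/store callbacks) as declared in <xen/hypervisor_sysfs.h>; the attribute name and contents are hypothetical.

#include <linux/kernel.h>
#include <linux/sysfs.h>
#include <xen/hypervisor_sysfs.h>

static ssize_t example_show(struct hyp_sysfs_attr *attr, char *buffer)
{
	return sprintf(buffer, "example\n");
}

static struct hyp_sysfs_attr example_attr = {
	.attr = { .name = "example", .mode = 0444 },
	.show = example_show,
};

static int __init example_attr_init(void)
{
	if (!is_running_on_xen())
		return -ENODEV;
	/* Attach to the subsystem kobject registered above. */
	return sysfs_create_file(&hypervisor_subsys.kset.kobj,
				 &example_attr.attr);
}
device_initcall(example_attr_init);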
65837 diff -Nur linux-2.6.16.33-noxen/drivers/xen/core/machine_kexec.c linux-2.6.16.33/drivers/xen/core/machine_kexec.c
65838 --- linux-2.6.16.33-noxen/drivers/xen/core/machine_kexec.c 1970-01-01 00:00:00.000000000 +0000
65839 +++ linux-2.6.16.33/drivers/xen/core/machine_kexec.c 2007-01-08 15:00:45.000000000 +0000
65840 @@ -0,0 +1,190 @@
65841 +/*
65842 + * drivers/xen/core/machine_kexec.c
65843 + * handle transition of Linux booting another kernel
65844 + */
65845 +
65846 +#include <linux/kexec.h>
65847 +#include <xen/interface/kexec.h>
65848 +#include <linux/mm.h>
65849 +#include <linux/bootmem.h>
65850 +#include <asm/hypercall.h>
65851 +
65852 +extern void machine_kexec_setup_load_arg(xen_kexec_image_t *xki,
65853 + struct kimage *image);
65854 +
65855 +int xen_max_nr_phys_cpus;
65856 +struct resource xen_hypervisor_res;
65857 +struct resource *xen_phys_cpus;
65858 +
65859 +void xen_machine_kexec_setup_resources(void)
65860 +{
65861 + xen_kexec_range_t range;
65862 + struct resource *res;
65863 + int k = 0;
65864 +
65865 + if (!is_initial_xendomain())
65866 + return;
65867 +
65868 + /* determine maximum number of physical cpus */
65869 +
65870 + while (1) {
65871 + memset(&range, 0, sizeof(range));
65872 + range.range = KEXEC_RANGE_MA_CPU;
65873 + range.nr = k;
65874 +
65875 +		if (HYPERVISOR_kexec_op(KEXEC_CMD_kexec_get_range, &range))
65876 + break;
65877 +
65878 + k++;
65879 + }
65880 +
65881 + if (k == 0)
65882 + return;
65883 +
65884 + xen_max_nr_phys_cpus = k;
65885 +
65886 + /* allocate xen_phys_cpus */
65887 +
65888 + xen_phys_cpus = alloc_bootmem_low(k * sizeof(struct resource));
65889 + BUG_ON(xen_phys_cpus == NULL);
65890 +
65891 + /* fill in xen_phys_cpus with per-cpu crash note information */
65892 +
65893 + for (k = 0; k < xen_max_nr_phys_cpus; k++) {
65894 + memset(&range, 0, sizeof(range));
65895 + range.range = KEXEC_RANGE_MA_CPU;
65896 + range.nr = k;
65897 +
65898 + if (HYPERVISOR_kexec_op(KEXEC_CMD_kexec_get_range, &range))
65899 + goto err;
65900 +
65901 + res = xen_phys_cpus + k;
65902 +
65903 + memset(res, 0, sizeof(*res));
65904 + res->name = "Crash note";
65905 + res->start = range.start;
65906 + res->end = range.start + range.size - 1;
65907 + res->flags = IORESOURCE_BUSY | IORESOURCE_MEM;
65908 + }
65909 +
65910 + /* fill in xen_hypervisor_res with hypervisor machine address range */
65911 +
65912 + memset(&range, 0, sizeof(range));
65913 + range.range = KEXEC_RANGE_MA_XEN;
65914 +
65915 + if (HYPERVISOR_kexec_op(KEXEC_CMD_kexec_get_range, &range))
65916 + goto err;
65917 +
65918 + xen_hypervisor_res.name = "Hypervisor code and data";
65919 + xen_hypervisor_res.start = range.start;
65920 + xen_hypervisor_res.end = range.start + range.size - 1;
65921 + xen_hypervisor_res.flags = IORESOURCE_BUSY | IORESOURCE_MEM;
65922 +
65923 + /* fill in crashk_res if range is reserved by hypervisor */
65924 +
65925 + memset(&range, 0, sizeof(range));
65926 + range.range = KEXEC_RANGE_MA_CRASH;
65927 +
65928 + if (HYPERVISOR_kexec_op(KEXEC_CMD_kexec_get_range, &range))
65929 + return;
65930 +
65931 + if (range.size) {
65932 + crashk_res.start = range.start;
65933 + crashk_res.end = range.start + range.size - 1;
65934 + }
65935 +
65936 + return;
65937 +
65938 + err:
65939 + /*
65940 + * It isn't possible to free xen_phys_cpus this early in the
65941 +	 * amount is small, we leak the memory.
65942 + * amount is small we leak the memory.
65943 + */
65944 + xen_max_nr_phys_cpus = 0;
65945 + return;
65946 +}
65947 +
65948 +void xen_machine_kexec_register_resources(struct resource *res)
65949 +{
65950 + int k;
65951 +
65952 + request_resource(res, &xen_hypervisor_res);
65953 +
65954 + for (k = 0; k < xen_max_nr_phys_cpus; k++)
65955 + request_resource(&xen_hypervisor_res, xen_phys_cpus + k);
65956 +
65957 +}
65958 +
65959 +static void setup_load_arg(xen_kexec_image_t *xki, struct kimage *image)
65960 +{
65961 + machine_kexec_setup_load_arg(xki, image);
65962 +
65963 + xki->indirection_page = image->head;
65964 + xki->start_address = image->start;
65965 +}
65966 +
65967 +/*
65968 + * Load the image into xen so xen can kdump itself
65969 + * This might have been done in prepare, but prepare
65970 + * is currently called too early. It might make sense
65971 + * to move prepare, but for now, just add an extra hook.
65972 + */
65973 +int xen_machine_kexec_load(struct kimage *image)
65974 +{
65975 + xen_kexec_load_t xkl;
65976 +
65977 + memset(&xkl, 0, sizeof(xkl));
65978 + xkl.type = image->type;
65979 + setup_load_arg(&xkl.image, image);
65980 + return HYPERVISOR_kexec_op(KEXEC_CMD_kexec_load, &xkl);
65981 +}
65982 +
65983 +/*
65984 + * Unload the image that was stored by machine_kexec_load()
65985 + * This might have been done in machine_kexec_cleanup() but it
65986 + * is called too late, and it's possible xen could try to kdump
65987 + * using resources that have been freed.
65988 + */
65989 +void xen_machine_kexec_unload(struct kimage *image)
65990 +{
65991 + xen_kexec_load_t xkl;
65992 +
65993 + memset(&xkl, 0, sizeof(xkl));
65994 + xkl.type = image->type;
65995 + HYPERVISOR_kexec_op(KEXEC_CMD_kexec_unload, &xkl);
65996 +}
65997 +
65998 +/*
65999 + * Do not allocate memory (or fail in any way) in machine_kexec().
66000 + * We are past the point of no return, committed to rebooting now.
66001 + *
66002 + * This has the hypervisor move to the preferred reboot CPU,
66003 + * stop all CPUs and kexec. That is, it combines machine_shutdown()
66004 + * and machine_kexec() in Linux kexec terms.
66005 + */
66006 +NORET_TYPE void machine_kexec(struct kimage *image)
66007 +{
66008 + xen_kexec_exec_t xke;
66009 +
66010 + memset(&xke, 0, sizeof(xke));
66011 + xke.type = image->type;
66012 + HYPERVISOR_kexec_op(KEXEC_CMD_kexec, &xke);
66013 + panic("KEXEC_CMD_kexec hypercall should not return\n");
66014 +}
66015 +
66016 +void machine_shutdown(void)
66017 +{
66018 + /* do nothing */
66019 +}
66020 +
66021 +
66022 +/*
66023 + * Local variables:
66024 + * c-file-style: "linux"
66025 + * indent-tabs-mode: t
66026 + * c-indent-level: 8
66027 + * c-basic-offset: 8
66028 + * tab-width: 8
66029 + * End:
66030 + */
66031 diff -Nur linux-2.6.16.33-noxen/drivers/xen/core/machine_reboot.c linux-2.6.16.33/drivers/xen/core/machine_reboot.c
66032 --- linux-2.6.16.33-noxen/drivers/xen/core/machine_reboot.c 1970-01-01 00:00:00.000000000 +0000
66033 +++ linux-2.6.16.33/drivers/xen/core/machine_reboot.c 2007-01-08 15:00:45.000000000 +0000
66034 @@ -0,0 +1,185 @@
66035 +#define __KERNEL_SYSCALLS__
66036 +#include <linux/version.h>
66037 +#include <linux/kernel.h>
66038 +#include <linux/mm.h>
66039 +#include <linux/unistd.h>
66040 +#include <linux/module.h>
66041 +#include <linux/reboot.h>
66042 +#include <linux/sysrq.h>
66043 +#include <linux/stringify.h>
66044 +#include <asm/irq.h>
66045 +#include <asm/mmu_context.h>
66046 +#include <xen/evtchn.h>
66047 +#include <asm/hypervisor.h>
66048 +#include <xen/interface/dom0_ops.h>
66049 +#include <xen/xenbus.h>
66050 +#include <linux/cpu.h>
66051 +#include <linux/kthread.h>
66052 +#include <xen/gnttab.h>
66053 +#include <xen/xencons.h>
66054 +#include <xen/cpu_hotplug.h>
66055 +
66056 +#if defined(__i386__) || defined(__x86_64__)
66057 +
66058 +/*
66059 + * Power off function, if any
66060 + */
66061 +void (*pm_power_off)(void);
66062 +EXPORT_SYMBOL(pm_power_off);
66063 +
66064 +void machine_emergency_restart(void)
66065 +{
66066 + /* We really want to get pending console data out before we die. */
66067 + xencons_force_flush();
66068 + HYPERVISOR_shutdown(SHUTDOWN_reboot);
66069 +}
66070 +
66071 +void machine_restart(char * __unused)
66072 +{
66073 + machine_emergency_restart();
66074 +}
66075 +
66076 +void machine_halt(void)
66077 +{
66078 + machine_power_off();
66079 +}
66080 +
66081 +void machine_power_off(void)
66082 +{
66083 + /* We really want to get pending console data out before we die. */
66084 + xencons_force_flush();
66085 + if (pm_power_off)
66086 + pm_power_off();
66087 + HYPERVISOR_shutdown(SHUTDOWN_poweroff);
66088 +}
66089 +
66090 +int reboot_thru_bios = 0; /* for dmi_scan.c */
66091 +EXPORT_SYMBOL(machine_restart);
66092 +EXPORT_SYMBOL(machine_halt);
66093 +EXPORT_SYMBOL(machine_power_off);
66094 +
66095 +/* Ensure we run on the idle task page tables so that we will
66096 + switch page tables before running user space. This is needed
66097 + on architectures with separate kernel and user page tables
66098 + because the user page table pointer is not saved/restored. */
66099 +static void switch_idle_mm(void)
66100 +{
66101 + struct mm_struct *mm = current->active_mm;
66102 +
66103 + if (mm == &init_mm)
66104 + return;
66105 +
66106 + atomic_inc(&init_mm.mm_count);
66107 + switch_mm(mm, &init_mm, current);
66108 + current->active_mm = &init_mm;
66109 + mmdrop(mm);
66110 +}
66111 +
66112 +static void pre_suspend(void)
66113 +{
66114 + HYPERVISOR_shared_info = (shared_info_t *)empty_zero_page;
66115 + clear_fixmap(FIX_SHARED_INFO);
66116 +
66117 + xen_start_info->store_mfn = mfn_to_pfn(xen_start_info->store_mfn);
66118 + xen_start_info->console.domU.mfn =
66119 + mfn_to_pfn(xen_start_info->console.domU.mfn);
66120 +}
66121 +
66122 +static void post_suspend(void)
66123 +{
66124 + int i, j, k, fpp;
66125 + extern unsigned long max_pfn;
66126 + extern unsigned long *pfn_to_mfn_frame_list_list;
66127 + extern unsigned long *pfn_to_mfn_frame_list[];
66128 +
66129 + set_fixmap(FIX_SHARED_INFO, xen_start_info->shared_info);
66130 +
66131 + HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO);
66132 +
66133 + memset(empty_zero_page, 0, PAGE_SIZE);
66134 +
66135 + HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
66136 + virt_to_mfn(pfn_to_mfn_frame_list_list);
66137 +
66138 + fpp = PAGE_SIZE/sizeof(unsigned long);
66139 + for (i = 0, j = 0, k = -1; i < max_pfn; i += fpp, j++) {
66140 + if ((j % fpp) == 0) {
66141 + k++;
66142 + pfn_to_mfn_frame_list_list[k] =
66143 + virt_to_mfn(pfn_to_mfn_frame_list[k]);
66144 + j = 0;
66145 + }
66146 + pfn_to_mfn_frame_list[k][j] =
66147 + virt_to_mfn(&phys_to_machine_mapping[i]);
66148 + }
66149 + HYPERVISOR_shared_info->arch.max_pfn = max_pfn;
66150 +}
66151 +
66152 +#else /* !(defined(__i386__) || defined(__x86_64__)) */
66153 +
66154 +#define switch_idle_mm() ((void)0)
66155 +#define mm_pin_all() ((void)0)
66156 +#define pre_suspend() ((void)0)
66157 +#define post_suspend() ((void)0)
66158 +
66159 +#endif
66160 +
66161 +int __xen_suspend(void)
66162 +{
66163 + int err;
66164 +
66165 + extern void time_resume(void);
66166 +
66167 + BUG_ON(smp_processor_id() != 0);
66168 + BUG_ON(in_interrupt());
66169 +
66170 +#if defined(__i386__) || defined(__x86_64__)
66171 + if (xen_feature(XENFEAT_auto_translated_physmap)) {
66172 + printk(KERN_WARNING "Cannot suspend in "
66173 + "auto_translated_physmap mode.\n");
66174 + return -EOPNOTSUPP;
66175 + }
66176 +#endif
66177 +
66178 + err = smp_suspend();
66179 + if (err)
66180 + return err;
66181 +
66182 + xenbus_suspend();
66183 +
66184 + preempt_disable();
66185 +
66186 + mm_pin_all();
66187 + local_irq_disable();
66188 + preempt_enable();
66189 +
66190 + gnttab_suspend();
66191 +
66192 + pre_suspend();
66193 +
66194 + /*
66195 + * We'll stop somewhere inside this hypercall. When it returns,
66196 + * we'll start resuming after the restore.
66197 + */
66198 + HYPERVISOR_suspend(virt_to_mfn(xen_start_info));
66199 +
66200 + post_suspend();
66201 +
66202 + gnttab_resume();
66203 +
66204 + irq_resume();
66205 +
66206 + time_resume();
66207 +
66208 + switch_idle_mm();
66209 +
66210 + local_irq_enable();
66211 +
66212 + xencons_resume();
66213 +
66214 + xenbus_resume();
66215 +
66216 + smp_resume();
66217 +
66218 + return err;
66219 +}
66220 diff -Nur linux-2.6.16.33-noxen/drivers/xen/core/reboot.c linux-2.6.16.33/drivers/xen/core/reboot.c
66221 --- linux-2.6.16.33-noxen/drivers/xen/core/reboot.c 1970-01-01 00:00:00.000000000 +0000
66222 +++ linux-2.6.16.33/drivers/xen/core/reboot.c 2007-01-08 15:00:45.000000000 +0000
66223 @@ -0,0 +1,220 @@
66224 +#define __KERNEL_SYSCALLS__
66225 +#include <linux/version.h>
66226 +#include <linux/kernel.h>
66227 +#include <linux/unistd.h>
66228 +#include <linux/module.h>
66229 +#include <linux/reboot.h>
66230 +#include <linux/sysrq.h>
66231 +#include <asm/hypervisor.h>
66232 +#include <xen/xenbus.h>
66233 +#include <linux/kthread.h>
66234 +
66235 +MODULE_LICENSE("Dual BSD/GPL");
66236 +
66237 +#define SHUTDOWN_INVALID -1
66238 +#define SHUTDOWN_POWEROFF 0
66239 +#define SHUTDOWN_SUSPEND 2
66240 +/* Code 3 is SHUTDOWN_CRASH, which we don't use because the domain can only
66241 + * report a crash, not be instructed to crash!
66242 + * HALT is the same as POWEROFF, as far as we're concerned. The tools use
66243 + * the distinction when we return the reason code to them.
66244 + */
66245 +#define SHUTDOWN_HALT 4
66246 +
66247 +/* Ignore multiple shutdown requests. */
66248 +static int shutting_down = SHUTDOWN_INVALID;
66249 +
66250 +static void __shutdown_handler(void *unused);
66251 +static DECLARE_WORK(shutdown_work, __shutdown_handler, NULL);
66252 +
66253 +#ifdef CONFIG_XEN
66254 +int __xen_suspend(void);
66255 +#else
66256 +#define __xen_suspend() (void)0
66257 +#endif
66258 +
66259 +static int shutdown_process(void *__unused)
66260 +{
66261 + static char *envp[] = { "HOME=/", "TERM=linux",
66262 + "PATH=/sbin:/usr/sbin:/bin:/usr/bin", NULL };
66263 + static char *poweroff_argv[] = { "/sbin/poweroff", NULL };
66264 +
66265 + extern asmlinkage long sys_reboot(int magic1, int magic2,
66266 + unsigned int cmd, void *arg);
66267 +
66268 + if ((shutting_down == SHUTDOWN_POWEROFF) ||
66269 + (shutting_down == SHUTDOWN_HALT)) {
66270 + if (call_usermodehelper("/sbin/poweroff", poweroff_argv, envp, 0) < 0) {
66271 +#ifdef CONFIG_XEN
66272 + sys_reboot(LINUX_REBOOT_MAGIC1,
66273 + LINUX_REBOOT_MAGIC2,
66274 + LINUX_REBOOT_CMD_POWER_OFF,
66275 + NULL);
66276 +#endif /* CONFIG_XEN */
66277 + }
66278 + }
66279 +
66280 + shutting_down = SHUTDOWN_INVALID; /* could try again */
66281 +
66282 + return 0;
66283 +}
66284 +
66285 +static int xen_suspend(void *__unused)
66286 +{
66287 + __xen_suspend();
66288 + shutting_down = SHUTDOWN_INVALID;
66289 + return 0;
66290 +}
66291 +
66292 +static int kthread_create_on_cpu(int (*f)(void *arg),
66293 + void *arg,
66294 + const char *name,
66295 + int cpu)
66296 +{
66297 + struct task_struct *p;
66298 + p = kthread_create(f, arg, name);
66299 + if (IS_ERR(p))
66300 + return PTR_ERR(p);
66301 + kthread_bind(p, cpu);
66302 + wake_up_process(p);
66303 + return 0;
66304 +}
66305 +
66306 +static void __shutdown_handler(void *unused)
66307 +{
66308 + int err;
66309 +
66310 + if (shutting_down != SHUTDOWN_SUSPEND)
66311 + err = kernel_thread(shutdown_process, NULL,
66312 + CLONE_FS | CLONE_FILES);
66313 + else
66314 + err = kthread_create_on_cpu(xen_suspend, NULL, "suspend", 0);
66315 +
66316 + if (err < 0) {
66317 + printk(KERN_WARNING "Error creating shutdown process (%d): "
66318 + "retrying...\n", -err);
66319 + schedule_delayed_work(&shutdown_work, HZ/2);
66320 + }
66321 +}
66322 +
66323 +static void shutdown_handler(struct xenbus_watch *watch,
66324 + const char **vec, unsigned int len)
66325 +{
66326 + char *str;
66327 + struct xenbus_transaction xbt;
66328 + int err;
66329 +
66330 + if (shutting_down != SHUTDOWN_INVALID)
66331 + return;
66332 +
66333 + again:
66334 + err = xenbus_transaction_start(&xbt);
66335 + if (err)
66336 + return;
66337 + str = (char *)xenbus_read(xbt, "control", "shutdown", NULL);
66338 + /* Ignore read errors and empty reads. */
66339 + if (XENBUS_IS_ERR_READ(str)) {
66340 + xenbus_transaction_end(xbt, 1);
66341 + return;
66342 + }
66343 +
66344 + xenbus_write(xbt, "control", "shutdown", "");
66345 +
66346 + err = xenbus_transaction_end(xbt, 0);
66347 + if (err == -EAGAIN) {
66348 + kfree(str);
66349 + goto again;
66350 + }
66351 +
66352 + if (strcmp(str, "poweroff") == 0)
66353 + shutting_down = SHUTDOWN_POWEROFF;
66354 + else if (strcmp(str, "reboot") == 0)
66355 + kill_proc(1, SIGINT, 1); /* interrupt init */
66356 + else if (strcmp(str, "suspend") == 0)
66357 + shutting_down = SHUTDOWN_SUSPEND;
66358 + else if (strcmp(str, "halt") == 0)
66359 + shutting_down = SHUTDOWN_HALT;
66360 + else {
66361 + printk("Ignoring shutdown request: %s\n", str);
66362 + shutting_down = SHUTDOWN_INVALID;
66363 + }
66364 +
66365 + if (shutting_down != SHUTDOWN_INVALID)
66366 + schedule_work(&shutdown_work);
66367 +
66368 + kfree(str);
66369 +}
66370 +
66371 +static void sysrq_handler(struct xenbus_watch *watch, const char **vec,
66372 + unsigned int len)
66373 +{
66374 + char sysrq_key = '\0';
66375 + struct xenbus_transaction xbt;
66376 + int err;
66377 +
66378 + again:
66379 + err = xenbus_transaction_start(&xbt);
66380 + if (err)
66381 + return;
66382 + if (!xenbus_scanf(xbt, "control", "sysrq", "%c", &sysrq_key)) {
66383 + printk(KERN_ERR "Unable to read sysrq code in "
66384 + "control/sysrq\n");
66385 + xenbus_transaction_end(xbt, 1);
66386 + return;
66387 + }
66388 +
66389 + if (sysrq_key != '\0')
66390 + xenbus_printf(xbt, "control", "sysrq", "%c", '\0');
66391 +
66392 + err = xenbus_transaction_end(xbt, 0);
66393 + if (err == -EAGAIN)
66394 + goto again;
66395 +
66396 +#ifdef CONFIG_MAGIC_SYSRQ
66397 + if (sysrq_key != '\0')
66398 + handle_sysrq(sysrq_key, NULL, NULL);
66399 +#endif
66400 +}
66401 +
66402 +static struct xenbus_watch shutdown_watch = {
66403 + .node = "control/shutdown",
66404 + .callback = shutdown_handler
66405 +};
66406 +
66407 +static struct xenbus_watch sysrq_watch = {
66408 +	.node = "control/sysrq",
66409 + .callback = sysrq_handler
66410 +};
66411 +
66412 +static int setup_shutdown_watcher(struct notifier_block *notifier,
66413 + unsigned long event,
66414 + void *data)
66415 +{
66416 + int err;
66417 +
66418 + err = register_xenbus_watch(&shutdown_watch);
66419 + if (err)
66420 + printk(KERN_ERR "Failed to set shutdown watcher\n");
66421 + else
66422 + xenbus_write(XBT_NIL, "control", "feature-reboot", "1");
66423 +
66424 + err = register_xenbus_watch(&sysrq_watch);
66425 + if (err)
66426 + printk(KERN_ERR "Failed to set sysrq watcher\n");
66427 + else
66428 + xenbus_write(XBT_NIL, "control", "feature-sysrq", "1");
66429 +
66430 + return NOTIFY_DONE;
66431 +}
66432 +
66433 +static int __init setup_shutdown_event(void)
66434 +{
66435 + static struct notifier_block xenstore_notifier = {
66436 + .notifier_call = setup_shutdown_watcher
66437 + };
66438 + register_xenstore_notifier(&xenstore_notifier);
66439 +
66440 + return 0;
66441 +}
66442 +
66443 +subsys_initcall(setup_shutdown_event);
66444 diff -Nur linux-2.6.16.33-noxen/drivers/xen/core/skbuff.c linux-2.6.16.33/drivers/xen/core/skbuff.c
66445 --- linux-2.6.16.33-noxen/drivers/xen/core/skbuff.c 1970-01-01 00:00:00.000000000 +0000
66446 +++ linux-2.6.16.33/drivers/xen/core/skbuff.c 2007-01-08 15:00:45.000000000 +0000
66447 @@ -0,0 +1,145 @@
66448 +
66449 +#include <linux/config.h>
66450 +#include <linux/module.h>
66451 +#include <linux/version.h>
66452 +#include <linux/kernel.h>
66453 +#include <linux/sched.h>
66454 +#include <linux/slab.h>
66455 +#include <linux/netdevice.h>
66456 +#include <linux/inetdevice.h>
66457 +#include <linux/etherdevice.h>
66458 +#include <linux/skbuff.h>
66459 +#include <linux/init.h>
66460 +#include <asm/io.h>
66461 +#include <asm/page.h>
66462 +#include <asm/hypervisor.h>
66463 +
66464 +/* Referenced in netback.c. */
66465 +/*static*/ kmem_cache_t *skbuff_cachep;
66466 +EXPORT_SYMBOL(skbuff_cachep);
66467 +
66468 +/* Allow up to 64kB or page-sized packets (whichever is greater). */
66469 +#if PAGE_SHIFT < 16
66470 +#define MAX_SKBUFF_ORDER (16 - PAGE_SHIFT)
66471 +#else
66472 +#define MAX_SKBUFF_ORDER 0
66473 +#endif
66474 +static kmem_cache_t *skbuff_order_cachep[MAX_SKBUFF_ORDER + 1];
66475 +
66476 +static struct {
66477 + int size;
66478 + kmem_cache_t *cachep;
66479 +} skbuff_small[] = { { 512, NULL }, { 2048, NULL } };
66480 +
66481 +struct sk_buff *__alloc_skb(unsigned int length, gfp_t gfp_mask,
66482 + int fclone)
66483 +{
66484 + int order, i;
66485 + kmem_cache_t *cachep;
66486 +
66487 + length = SKB_DATA_ALIGN(length) + sizeof(struct skb_shared_info);
66488 +
66489 + if (length <= skbuff_small[ARRAY_SIZE(skbuff_small)-1].size) {
66490 + for (i = 0; skbuff_small[i].size < length; i++)
66491 + continue;
66492 + cachep = skbuff_small[i].cachep;
66493 + } else {
66494 + order = get_order(length);
66495 + if (order > MAX_SKBUFF_ORDER) {
66496 + printk(KERN_ALERT "Attempt to allocate order %d "
66497 + "skbuff. Increase MAX_SKBUFF_ORDER.\n", order);
66498 + return NULL;
66499 + }
66500 + cachep = skbuff_order_cachep[order];
66501 + }
66502 +
66503 + length -= sizeof(struct skb_shared_info);
66504 +
66505 + return alloc_skb_from_cache(cachep, length, gfp_mask, fclone);
66506 +}
66507 +
66508 +struct sk_buff *__dev_alloc_skb(unsigned int length, gfp_t gfp_mask)
66509 +{
66510 + struct sk_buff *skb;
66511 + int order;
66512 +
66513 + length = SKB_DATA_ALIGN(length + 16);
66514 + order = get_order(length + sizeof(struct skb_shared_info));
66515 + if (order > MAX_SKBUFF_ORDER) {
66516 + printk(KERN_ALERT "Attempt to allocate order %d skbuff. "
66517 + "Increase MAX_SKBUFF_ORDER.\n", order);
66518 + return NULL;
66519 + }
66520 +
66521 + skb = alloc_skb_from_cache(
66522 + skbuff_order_cachep[order], length, gfp_mask, 0);
66523 + if (skb != NULL)
66524 + skb_reserve(skb, 16);
66525 +
66526 + return skb;
66527 +}
66528 +
66529 +static void skbuff_ctor(void *buf, kmem_cache_t *cachep, unsigned long unused)
66530 +{
66531 + int order = 0;
66532 +
66533 + while (skbuff_order_cachep[order] != cachep)
66534 + order++;
66535 +
66536 + /* Do our best to allocate contiguous memory but fall back to IOMMU. */
66537 + if (order != 0)
66538 + (void)xen_create_contiguous_region(
66539 + (unsigned long)buf, order, 0);
66540 +
66541 + scrub_pages(buf, 1 << order);
66542 +}
66543 +
66544 +static void skbuff_dtor(void *buf, kmem_cache_t *cachep, unsigned long unused)
66545 +{
66546 + int order = 0;
66547 +
66548 + while (skbuff_order_cachep[order] != cachep)
66549 + order++;
66550 +
66551 + if (order != 0)
66552 + xen_destroy_contiguous_region((unsigned long)buf, order);
66553 +}
66554 +
66555 +static int __init skbuff_init(void)
66556 +{
66557 + static char name[MAX_SKBUFF_ORDER + 1][20];
66558 + static char small_name[ARRAY_SIZE(skbuff_small)][20];
66559 + unsigned long size;
66560 + int i, order;
66561 +
66562 + for (i = 0; i < ARRAY_SIZE(skbuff_small); i++) {
66563 + size = skbuff_small[i].size;
66564 + sprintf(small_name[i], "xen-skb-%lu", size);
66565 + /*
66566 + * No ctor/dtor: objects do not span page boundaries, and they
66567 + * are only used on transmit path so no need for scrubbing.
66568 + */
66569 + skbuff_small[i].cachep = kmem_cache_create(
66570 + small_name[i], size, size, 0, NULL, NULL);
66571 + }
66572 +
66573 + for (order = 0; order <= MAX_SKBUFF_ORDER; order++) {
66574 + size = PAGE_SIZE << order;
66575 + sprintf(name[order], "xen-skb-%lu", size);
66576 + if (is_running_on_xen() && is_initial_xendomain())
66577 + skbuff_order_cachep[order] = kmem_cache_create(
66578 + name[order], size, size, 0,
66579 + skbuff_ctor, skbuff_dtor);
66580 + else
66581 + skbuff_order_cachep[order] = kmem_cache_create(
66582 + name[order], size, size, 0, NULL, NULL);
66583 +
66584 + }
66585 +
66586 + skbuff_cachep = skbuff_order_cachep[0];
66587 +
66588 + return 0;
66589 +}
66590 +core_initcall(skbuff_init);
66591 +
66592 +EXPORT_SYMBOL(__dev_alloc_skb);
66593 diff -Nur linux-2.6.16.33-noxen/drivers/xen/core/smpboot.c linux-2.6.16.33/drivers/xen/core/smpboot.c
66594 --- linux-2.6.16.33-noxen/drivers/xen/core/smpboot.c 1970-01-01 00:00:00.000000000 +0000
66595 +++ linux-2.6.16.33/drivers/xen/core/smpboot.c 2007-01-08 15:00:45.000000000 +0000
66596 @@ -0,0 +1,459 @@
66597 +/*
66598 + * Xen SMP booting functions
66599 + *
66600 + * See arch/i386/kernel/smpboot.c for copyright and credits for derived
66601 + * portions of this file.
66602 + */
66603 +
66604 +#include <linux/module.h>
66605 +#include <linux/config.h>
66606 +#include <linux/init.h>
66607 +#include <linux/kernel.h>
66608 +#include <linux/mm.h>
66609 +#include <linux/sched.h>
66610 +#include <linux/kernel_stat.h>
66611 +#include <linux/smp_lock.h>
66612 +#include <linux/irq.h>
66613 +#include <linux/bootmem.h>
66614 +#include <linux/notifier.h>
66615 +#include <linux/cpu.h>
66616 +#include <linux/percpu.h>
66617 +#include <asm/desc.h>
66618 +#include <asm/arch_hooks.h>
66619 +#include <asm/pgalloc.h>
66620 +#include <xen/evtchn.h>
66621 +#include <xen/interface/vcpu.h>
66622 +#include <xen/cpu_hotplug.h>
66623 +#include <xen/xenbus.h>
66624 +
66625 +#ifdef CONFIG_SMP_ALTERNATIVES
66626 +#include <asm/smp_alt.h>
66627 +#endif
66628 +
66629 +extern irqreturn_t smp_reschedule_interrupt(int, void *, struct pt_regs *);
66630 +extern irqreturn_t smp_call_function_interrupt(int, void *, struct pt_regs *);
66631 +
66632 +extern int local_setup_timer(unsigned int cpu);
66633 +extern void local_teardown_timer(unsigned int cpu);
66634 +
66635 +extern void hypervisor_callback(void);
66636 +extern void failsafe_callback(void);
66637 +extern void system_call(void);
66638 +extern void smp_trap_init(trap_info_t *);
66639 +
66640 +/* Number of siblings per CPU package */
66641 +int smp_num_siblings = 1;
66642 +int phys_proc_id[NR_CPUS]; /* Package ID of each logical CPU */
66643 +EXPORT_SYMBOL(phys_proc_id);
66644 +int cpu_core_id[NR_CPUS]; /* Core ID of each logical CPU */
66645 +EXPORT_SYMBOL(cpu_core_id);
66646 +
66647 +cpumask_t cpu_online_map;
66648 +EXPORT_SYMBOL(cpu_online_map);
66649 +cpumask_t cpu_possible_map;
66650 +EXPORT_SYMBOL(cpu_possible_map);
66651 +
66652 +struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned;
66653 +EXPORT_SYMBOL(cpu_data);
66654 +
66655 +#ifdef CONFIG_HOTPLUG_CPU
66656 +DEFINE_PER_CPU(int, cpu_state) = { 0 };
66657 +#endif
66658 +
66659 +static DEFINE_PER_CPU(int, resched_irq);
66660 +static DEFINE_PER_CPU(int, callfunc_irq);
66661 +static char resched_name[NR_CPUS][15];
66662 +static char callfunc_name[NR_CPUS][15];
66663 +
66664 +u8 cpu_2_logical_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
66665 +
66666 +void *xquad_portio;
66667 +
66668 +cpumask_t cpu_sibling_map[NR_CPUS] __cacheline_aligned;
66669 +cpumask_t cpu_core_map[NR_CPUS] __cacheline_aligned;
66670 +EXPORT_SYMBOL(cpu_core_map);
66671 +
66672 +#if defined(__i386__)
66673 +u8 x86_cpu_to_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = 0xff };
66674 +EXPORT_SYMBOL(x86_cpu_to_apicid);
66675 +#elif !defined(CONFIG_X86_IO_APIC)
66676 +unsigned int maxcpus = NR_CPUS;
66677 +#endif
66678 +
66679 +void __init prefill_possible_map(void)
66680 +{
66681 + int i, rc;
66682 +
66683 + if (!cpus_empty(cpu_possible_map))
66684 + return;
66685 +
66686 + for (i = 0; i < NR_CPUS; i++) {
66687 + rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL);
66688 + if (rc >= 0)
66689 + cpu_set(i, cpu_possible_map);
66690 + }
66691 +}
66692 +
66693 +void __init smp_alloc_memory(void)
66694 +{
66695 +}
66696 +
66697 +static inline void
66698 +set_cpu_sibling_map(int cpu)
66699 +{
66700 + phys_proc_id[cpu] = cpu;
66701 + cpu_core_id[cpu] = 0;
66702 +
66703 + cpu_sibling_map[cpu] = cpumask_of_cpu(cpu);
66704 + cpu_core_map[cpu] = cpumask_of_cpu(cpu);
66705 +
66706 + cpu_data[cpu].booted_cores = 1;
66707 +}
66708 +
66709 +static void
66710 +remove_siblinginfo(int cpu)
66711 +{
66712 + phys_proc_id[cpu] = BAD_APICID;
66713 + cpu_core_id[cpu] = BAD_APICID;
66714 +
66715 + cpus_clear(cpu_sibling_map[cpu]);
66716 + cpus_clear(cpu_core_map[cpu]);
66717 +
66718 + cpu_data[cpu].booted_cores = 0;
66719 +}
66720 +
66721 +static int xen_smp_intr_init(unsigned int cpu)
66722 +{
66723 + int rc;
66724 +
66725 + per_cpu(resched_irq, cpu) = per_cpu(callfunc_irq, cpu) = -1;
66726 +
66727 + sprintf(resched_name[cpu], "resched%d", cpu);
66728 + rc = bind_ipi_to_irqhandler(RESCHEDULE_VECTOR,
66729 + cpu,
66730 + smp_reschedule_interrupt,
66731 + SA_INTERRUPT,
66732 + resched_name[cpu],
66733 + NULL);
66734 + if (rc < 0)
66735 + goto fail;
66736 + per_cpu(resched_irq, cpu) = rc;
66737 +
66738 + sprintf(callfunc_name[cpu], "callfunc%d", cpu);
66739 + rc = bind_ipi_to_irqhandler(CALL_FUNCTION_VECTOR,
66740 + cpu,
66741 + smp_call_function_interrupt,
66742 + SA_INTERRUPT,
66743 + callfunc_name[cpu],
66744 + NULL);
66745 + if (rc < 0)
66746 + goto fail;
66747 + per_cpu(callfunc_irq, cpu) = rc;
66748 +
66749 + if ((cpu != 0) && ((rc = local_setup_timer(cpu)) != 0))
66750 + goto fail;
66751 +
66752 + return 0;
66753 +
66754 + fail:
66755 + if (per_cpu(resched_irq, cpu) >= 0)
66756 + unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL);
66757 + if (per_cpu(callfunc_irq, cpu) >= 0)
66758 + unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL);
66759 + return rc;
66760 +}
66761 +
66762 +#ifdef CONFIG_HOTPLUG_CPU
66763 +static void xen_smp_intr_exit(unsigned int cpu)
66764 +{
66765 + if (cpu != 0)
66766 + local_teardown_timer(cpu);
66767 +
66768 + unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL);
66769 + unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL);
66770 +}
66771 +#endif
66772 +
66773 +void cpu_bringup(void)
66774 +{
66775 + cpu_init();
66776 + touch_softlockup_watchdog();
66777 + preempt_disable();
66778 + local_irq_enable();
66779 +}
66780 +
66781 +static void cpu_bringup_and_idle(void)
66782 +{
66783 + cpu_bringup();
66784 + cpu_idle();
66785 +}
66786 +
66787 +void cpu_initialize_context(unsigned int cpu)
66788 +{
66789 + vcpu_guest_context_t ctxt;
66790 + struct task_struct *idle = idle_task(cpu);
66791 +#ifdef __x86_64__
66792 + struct desc_ptr *gdt_descr = &cpu_gdt_descr[cpu];
66793 +#else
66794 + struct Xgt_desc_struct *gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
66795 +#endif
66796 +
66797 + if (cpu == 0)
66798 + return;
66799 +
66800 + memset(&ctxt, 0, sizeof(ctxt));
66801 +
66802 + ctxt.flags = VGCF_IN_KERNEL;
66803 + ctxt.user_regs.ds = __USER_DS;
66804 + ctxt.user_regs.es = __USER_DS;
66805 + ctxt.user_regs.fs = 0;
66806 + ctxt.user_regs.gs = 0;
66807 + ctxt.user_regs.ss = __KERNEL_DS;
66808 + ctxt.user_regs.eip = (unsigned long)cpu_bringup_and_idle;
66809 + ctxt.user_regs.eflags = X86_EFLAGS_IF | 0x1000; /* IOPL_RING1 */
66810 +
66811 + memset(&ctxt.fpu_ctxt, 0, sizeof(ctxt.fpu_ctxt));
66812 +
66813 + smp_trap_init(ctxt.trap_ctxt);
66814 +
66815 + ctxt.ldt_ents = 0;
66816 +
66817 + ctxt.gdt_frames[0] = virt_to_mfn(gdt_descr->address);
66818 + ctxt.gdt_ents = gdt_descr->size / 8;
66819 +
66820 +#ifdef __i386__
66821 + ctxt.user_regs.cs = __KERNEL_CS;
66822 + ctxt.user_regs.esp = idle->thread.esp0 - sizeof(struct pt_regs);
66823 +
66824 + ctxt.kernel_ss = __KERNEL_DS;
66825 + ctxt.kernel_sp = idle->thread.esp0;
66826 +
66827 + ctxt.event_callback_cs = __KERNEL_CS;
66828 + ctxt.event_callback_eip = (unsigned long)hypervisor_callback;
66829 + ctxt.failsafe_callback_cs = __KERNEL_CS;
66830 + ctxt.failsafe_callback_eip = (unsigned long)failsafe_callback;
66831 +
66832 + ctxt.ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(swapper_pg_dir));
66833 +#else /* __x86_64__ */
66834 + ctxt.user_regs.cs = __KERNEL_CS;
66835 + ctxt.user_regs.esp = idle->thread.rsp0 - sizeof(struct pt_regs);
66836 +
66837 + ctxt.kernel_ss = __KERNEL_DS;
66838 + ctxt.kernel_sp = idle->thread.rsp0;
66839 +
66840 + ctxt.event_callback_eip = (unsigned long)hypervisor_callback;
66841 + ctxt.failsafe_callback_eip = (unsigned long)failsafe_callback;
66842 + ctxt.syscall_callback_eip = (unsigned long)system_call;
66843 +
66844 + ctxt.ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(init_level4_pgt));
66845 +
66846 + ctxt.gs_base_kernel = (unsigned long)(cpu_pda(cpu));
66847 +#endif
66848 +
66849 + BUG_ON(HYPERVISOR_vcpu_op(VCPUOP_initialise, cpu, &ctxt));
66850 +}
66851 +
66852 +void __init smp_prepare_cpus(unsigned int max_cpus)
66853 +{
66854 + int cpu;
66855 + struct task_struct *idle;
66856 +#ifdef __x86_64__
66857 + struct desc_ptr *gdt_descr;
66858 +#else
66859 + struct Xgt_desc_struct *gdt_descr;
66860 +#endif
66861 +
66862 + boot_cpu_data.apicid = 0;
66863 + cpu_data[0] = boot_cpu_data;
66864 +
66865 + cpu_2_logical_apicid[0] = 0;
66866 + x86_cpu_to_apicid[0] = 0;
66867 +
66868 + current_thread_info()->cpu = 0;
66869 +
66870 + for (cpu = 0; cpu < NR_CPUS; cpu++) {
66871 + cpus_clear(cpu_sibling_map[cpu]);
66872 + cpus_clear(cpu_core_map[cpu]);
66873 + }
66874 +
66875 + set_cpu_sibling_map(0);
66876 +
66877 + if (xen_smp_intr_init(0))
66878 + BUG();
66879 +
66880 + /* Restrict the possible_map according to max_cpus. */
66881 + while ((num_possible_cpus() > 1) && (num_possible_cpus() > max_cpus)) {
66882 + for (cpu = NR_CPUS-1; !cpu_isset(cpu, cpu_possible_map); cpu--)
66883 + continue;
66884 + cpu_clear(cpu, cpu_possible_map);
66885 + }
66886 +
66887 + for_each_cpu (cpu) {
66888 + if (cpu == 0)
66889 + continue;
66890 +
66891 +#ifdef __x86_64__
66892 + gdt_descr = &cpu_gdt_descr[cpu];
66893 +#else
66894 + gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
66895 +#endif
66896 + gdt_descr->address = get_zeroed_page(GFP_KERNEL);
66897 + if (unlikely(!gdt_descr->address)) {
66898 + printk(KERN_CRIT "CPU%d failed to allocate GDT\n",
66899 + cpu);
66900 + continue;
66901 + }
66902 + gdt_descr->size = GDT_SIZE;
66903 + memcpy((void *)gdt_descr->address, cpu_gdt_table, GDT_SIZE);
66904 + make_page_readonly(
66905 + (void *)gdt_descr->address,
66906 + XENFEAT_writable_descriptor_tables);
66907 +
66908 + cpu_data[cpu] = boot_cpu_data;
66909 + cpu_data[cpu].apicid = cpu;
66910 +
66911 + cpu_2_logical_apicid[cpu] = cpu;
66912 + x86_cpu_to_apicid[cpu] = cpu;
66913 +
66914 + idle = fork_idle(cpu);
66915 + if (IS_ERR(idle))
66916 + panic("failed fork for CPU %d", cpu);
66917 +
66918 +#ifdef __x86_64__
66919 + cpu_pda(cpu)->pcurrent = idle;
66920 + cpu_pda(cpu)->cpunumber = cpu;
66921 + clear_ti_thread_flag(idle->thread_info, TIF_FORK);
66922 +#endif
66923 +
66924 + irq_ctx_init(cpu);
66925 +
66926 +#ifdef CONFIG_HOTPLUG_CPU
66927 + if (is_initial_xendomain())
66928 + cpu_set(cpu, cpu_present_map);
66929 +#else
66930 + cpu_set(cpu, cpu_present_map);
66931 +#endif
66932 +
66933 + cpu_initialize_context(cpu);
66934 + }
66935 +
66936 + init_xenbus_allowed_cpumask();
66937 +
66938 +#ifdef CONFIG_X86_IO_APIC
66939 + /*
66940 + * Here we can be sure that there is an IO-APIC in the system. Let's
66941 + * go and set it up:
66942 + */
66943 + if (!skip_ioapic_setup && nr_ioapics)
66944 + setup_IO_APIC();
66945 +#endif
66946 +}
66947 +
66948 +void __devinit smp_prepare_boot_cpu(void)
66949 +{
66950 + prefill_possible_map();
66951 + cpu_present_map = cpumask_of_cpu(0);
66952 + cpu_online_map = cpumask_of_cpu(0);
66953 +}
66954 +
66955 +#ifdef CONFIG_HOTPLUG_CPU
66956 +
66957 +/*
66958 + * Initialize cpu_present_map late to skip SMP boot code in init/main.c.
66959 + * But do it early enough to catch critical for_each_present_cpu() loops
66960 + * in i386-specific code.
66961 + */
66962 +static int __init initialize_cpu_present_map(void)
66963 +{
66964 + cpu_present_map = cpu_possible_map;
66965 + return 0;
66966 +}
66967 +core_initcall(initialize_cpu_present_map);
66968 +
66969 +int __cpu_disable(void)
66970 +{
66971 + cpumask_t map = cpu_online_map;
66972 + int cpu = smp_processor_id();
66973 +
66974 + if (cpu == 0)
66975 + return -EBUSY;
66976 +
66977 + remove_siblinginfo(cpu);
66978 +
66979 + cpu_clear(cpu, map);
66980 + fixup_irqs(map);
66981 + cpu_clear(cpu, cpu_online_map);
66982 +
66983 + return 0;
66984 +}
66985 +
66986 +void __cpu_die(unsigned int cpu)
66987 +{
66988 + while (HYPERVISOR_vcpu_op(VCPUOP_is_up, cpu, NULL)) {
66989 + current->state = TASK_UNINTERRUPTIBLE;
66990 + schedule_timeout(HZ/10);
66991 + }
66992 +
66993 + xen_smp_intr_exit(cpu);
66994 +
66995 +#ifdef CONFIG_SMP_ALTERNATIVES
66996 + if (num_online_cpus() == 1)
66997 + unprepare_for_smp();
66998 +#endif
66999 +}
67000 +
67001 +#else /* !CONFIG_HOTPLUG_CPU */
67002 +
67003 +int __cpu_disable(void)
67004 +{
67005 + return -ENOSYS;
67006 +}
67007 +
67008 +void __cpu_die(unsigned int cpu)
67009 +{
67010 + BUG();
67011 +}
67012 +
67013 +#endif /* CONFIG_HOTPLUG_CPU */
67014 +
67015 +int __devinit __cpu_up(unsigned int cpu)
67016 +{
67017 + int rc;
67018 +
67019 + rc = cpu_up_check(cpu);
67020 + if (rc)
67021 + return rc;
67022 +
67023 +#ifdef CONFIG_SMP_ALTERNATIVES
67024 + if (num_online_cpus() == 1)
67025 + prepare_for_smp();
67026 +#endif
67027 +
67028 + /* This must be done before setting cpu_online_map */
67029 + set_cpu_sibling_map(cpu);
67030 + wmb();
67031 +
67032 + rc = xen_smp_intr_init(cpu);
67033 + if (rc) {
67034 + remove_siblinginfo(cpu);
67035 + return rc;
67036 + }
67037 +
67038 + cpu_set(cpu, cpu_online_map);
67039 +
67040 + rc = HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL);
67041 + BUG_ON(rc);
67042 +
67043 + return 0;
67044 +}
67045 +
67046 +void __init smp_cpus_done(unsigned int max_cpus)
67047 +{
67048 +}
67049 +
67050 +#ifndef CONFIG_X86_LOCAL_APIC
67051 +int setup_profiling_timer(unsigned int multiplier)
67052 +{
67053 + return -EINVAL;
67054 +}
67055 +#endif
67056 diff -Nur linux-2.6.16.33-noxen/drivers/xen/core/xen_proc.c linux-2.6.16.33/drivers/xen/core/xen_proc.c
67057 --- linux-2.6.16.33-noxen/drivers/xen/core/xen_proc.c 1970-01-01 00:00:00.000000000 +0000
67058 +++ linux-2.6.16.33/drivers/xen/core/xen_proc.c 2007-01-08 15:00:45.000000000 +0000
67059 @@ -0,0 +1,19 @@
67060 +
67061 +#include <linux/config.h>
67062 +#include <linux/proc_fs.h>
67063 +#include <xen/xen_proc.h>
67064 +
67065 +static struct proc_dir_entry *xen_base;
67066 +
67067 +struct proc_dir_entry *create_xen_proc_entry(const char *name, mode_t mode)
67068 +{
67069 + if ( xen_base == NULL )
67070 + if ( (xen_base = proc_mkdir("xen", &proc_root)) == NULL )
67071 + panic("Couldn't create /proc/xen");
67072 + return create_proc_entry(name, mode, xen_base);
67073 +}
67074 +
67075 +void remove_xen_proc_entry(const char *name)
67076 +{
67077 + remove_proc_entry(name, xen_base);
67078 +}
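
Editorial note: the two helpers added in xen_proc.c above give the other Xen drivers in this patch one place to create and remove entries under /proc/xen. A minimal sketch of a consumer follows, assuming the 2.6.16-era procfs API (the read_proc callback on proc_dir_entry); the entry name "example" and its contents are invented for illustration and are not part of the patch.

/* Hypothetical in-kernel user of create_xen_proc_entry()/remove_xen_proc_entry(). */
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <xen/xen_proc.h>

static int example_read(char *page, char **start, off_t off,
                        int count, int *eof, void *data)
{
        *eof = 1;
        return sprintf(page, "hello from /proc/xen/example\n");
}

static int __init example_init(void)
{
        struct proc_dir_entry *entry;

        /* Creates /proc/xen/example, making /proc/xen on first use. */
        entry = create_xen_proc_entry("example", 0400);
        if (entry == NULL)
                return -ENOMEM;
        entry->read_proc = example_read;
        return 0;
}

static void __exit example_exit(void)
{
        remove_xen_proc_entry("example");
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");
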
67079 diff -Nur linux-2.6.16.33-noxen/drivers/xen/core/xen_sysfs.c linux-2.6.16.33/drivers/xen/core/xen_sysfs.c
67080 --- linux-2.6.16.33-noxen/drivers/xen/core/xen_sysfs.c 1970-01-01 00:00:00.000000000 +0000
67081 +++ linux-2.6.16.33/drivers/xen/core/xen_sysfs.c 2007-01-08 15:00:45.000000000 +0000
67082 @@ -0,0 +1,379 @@
67083 +/*
67084 + * copyright (c) 2006 IBM Corporation
67085 + * Authored by: Mike D. Day <ncmike@us.ibm.com>
67086 + *
67087 + * This program is free software; you can redistribute it and/or modify
67088 + * it under the terms of the GNU General Public License version 2 as
67089 + * published by the Free Software Foundation.
67090 + */
67091 +
67092 +#include <linux/config.h>
67093 +#include <linux/err.h>
67094 +#include <linux/kernel.h>
67095 +#include <linux/module.h>
67096 +#include <linux/init.h>
67097 +#include <asm/hypervisor.h>
67098 +#include <xen/features.h>
67099 +#include <xen/hypervisor_sysfs.h>
67100 +#include <xen/xenbus.h>
67101 +
67102 +MODULE_LICENSE("GPL");
67103 +MODULE_AUTHOR("Mike D. Day <ncmike@us.ibm.com>");
67104 +
67105 +static ssize_t type_show(struct hyp_sysfs_attr *attr, char *buffer)
67106 +{
67107 + return sprintf(buffer, "xen\n");
67108 +}
67109 +
67110 +HYPERVISOR_ATTR_RO(type);
67111 +
67112 +static int __init xen_sysfs_type_init(void)
67113 +{
67114 + return sysfs_create_file(&hypervisor_subsys.kset.kobj, &type_attr.attr);
67115 +}
67116 +
67117 +static void xen_sysfs_type_destroy(void)
67118 +{
67119 + sysfs_remove_file(&hypervisor_subsys.kset.kobj, &type_attr.attr);
67120 +}
67121 +
67122 +/* xen version attributes */
67123 +static ssize_t major_show(struct hyp_sysfs_attr *attr, char *buffer)
67124 +{
67125 + int version = HYPERVISOR_xen_version(XENVER_version, NULL);
67126 + if (version)
67127 + return sprintf(buffer, "%d\n", version >> 16);
67128 + return -ENODEV;
67129 +}
67130 +
67131 +HYPERVISOR_ATTR_RO(major);
67132 +
67133 +static ssize_t minor_show(struct hyp_sysfs_attr *attr, char *buffer)
67134 +{
67135 + int version = HYPERVISOR_xen_version(XENVER_version, NULL);
67136 + if (version)
67137 + return sprintf(buffer, "%d\n", version & 0xff);
67138 + return -ENODEV;
67139 +}
67140 +
67141 +HYPERVISOR_ATTR_RO(minor);
67142 +
67143 +static ssize_t extra_show(struct hyp_sysfs_attr *attr, char *buffer)
67144 +{
67145 + int ret = -ENOMEM;
67146 + char *extra;
67147 +
67148 + extra = kmalloc(XEN_EXTRAVERSION_LEN, GFP_KERNEL);
67149 + if (extra) {
67150 + ret = HYPERVISOR_xen_version(XENVER_extraversion, extra);
67151 + if (!ret)
67152 + ret = sprintf(buffer, "%s\n", extra);
67153 + kfree(extra);
67154 + }
67155 +
67156 + return ret;
67157 +}
67158 +
67159 +HYPERVISOR_ATTR_RO(extra);
67160 +
67161 +static struct attribute *version_attrs[] = {
67162 + &major_attr.attr,
67163 + &minor_attr.attr,
67164 + &extra_attr.attr,
67165 + NULL
67166 +};
67167 +
67168 +static struct attribute_group version_group = {
67169 + .name = "version",
67170 + .attrs = version_attrs,
67171 +};
67172 +
67173 +static int __init xen_sysfs_version_init(void)
67174 +{
67175 + return sysfs_create_group(&hypervisor_subsys.kset.kobj,
67176 + &version_group);
67177 +}
67178 +
67179 +static void xen_sysfs_version_destroy(void)
67180 +{
67181 + sysfs_remove_group(&hypervisor_subsys.kset.kobj, &version_group);
67182 +}
67183 +
67184 +/* UUID */
67185 +
67186 +static ssize_t uuid_show(struct hyp_sysfs_attr *attr, char *buffer)
67187 +{
67188 + char *vm, *val;
67189 + int ret;
67190 +
67191 + vm = xenbus_read(XBT_NIL, "vm", "", NULL);
67192 + if (IS_ERR(vm))
67193 + return PTR_ERR(vm);
67194 + val = xenbus_read(XBT_NIL, vm, "uuid", NULL);
67195 + kfree(vm);
67196 + if (IS_ERR(val))
67197 + return PTR_ERR(val);
67198 + ret = sprintf(buffer, "%s\n", val);
67199 + kfree(val);
67200 + return ret;
67201 +}
67202 +
67203 +HYPERVISOR_ATTR_RO(uuid);
67204 +
67205 +static int __init xen_sysfs_uuid_init(void)
67206 +{
67207 + return sysfs_create_file(&hypervisor_subsys.kset.kobj, &uuid_attr.attr);
67208 +}
67209 +
67210 +static void xen_sysfs_uuid_destroy(void)
67211 +{
67212 + sysfs_remove_file(&hypervisor_subsys.kset.kobj, &uuid_attr.attr);
67213 +}
67214 +
67215 +/* xen compilation attributes */
67216 +
67217 +static ssize_t compiler_show(struct hyp_sysfs_attr *attr, char *buffer)
67218 +{
67219 + int ret = -ENOMEM;
67220 + struct xen_compile_info *info;
67221 +
67222 + info = kmalloc(sizeof(struct xen_compile_info), GFP_KERNEL);
67223 + if (info) {
67224 + ret = HYPERVISOR_xen_version(XENVER_compile_info, info);
67225 + if (!ret)
67226 + ret = sprintf(buffer, "%s\n", info->compiler);
67227 + kfree(info);
67228 + }
67229 +
67230 + return ret;
67231 +}
67232 +
67233 +HYPERVISOR_ATTR_RO(compiler);
67234 +
67235 +static ssize_t compiled_by_show(struct hyp_sysfs_attr *attr, char *buffer)
67236 +{
67237 + int ret = -ENOMEM;
67238 + struct xen_compile_info *info;
67239 +
67240 + info = kmalloc(sizeof(struct xen_compile_info), GFP_KERNEL);
67241 + if (info) {
67242 + ret = HYPERVISOR_xen_version(XENVER_compile_info, info);
67243 + if (!ret)
67244 + ret = sprintf(buffer, "%s\n", info->compile_by);
67245 + kfree(info);
67246 + }
67247 +
67248 + return ret;
67249 +}
67250 +
67251 +HYPERVISOR_ATTR_RO(compiled_by);
67252 +
67253 +static ssize_t compile_date_show(struct hyp_sysfs_attr *attr, char *buffer)
67254 +{
67255 + int ret = -ENOMEM;
67256 + struct xen_compile_info *info;
67257 +
67258 + info = kmalloc(sizeof(struct xen_compile_info), GFP_KERNEL);
67259 + if (info) {
67260 + ret = HYPERVISOR_xen_version(XENVER_compile_info, info);
67261 + if (!ret)
67262 + ret = sprintf(buffer, "%s\n", info->compile_date);
67263 + kfree(info);
67264 + }
67265 +
67266 + return ret;
67267 +}
67268 +
67269 +HYPERVISOR_ATTR_RO(compile_date);
67270 +
67271 +static struct attribute *xen_compile_attrs[] = {
67272 + &compiler_attr.attr,
67273 + &compiled_by_attr.attr,
67274 + &compile_date_attr.attr,
67275 + NULL
67276 +};
67277 +
67278 +static struct attribute_group xen_compilation_group = {
67279 + .name = "compilation",
67280 + .attrs = xen_compile_attrs,
67281 +};
67282 +
67283 +int __init static xen_compilation_init(void)
67284 +{
67285 + return sysfs_create_group(&hypervisor_subsys.kset.kobj,
67286 + &xen_compilation_group);
67287 +}
67288 +
67289 +static void xen_compilation_destroy(void)
67290 +{
67291 + sysfs_remove_group(&hypervisor_subsys.kset.kobj,
67292 + &xen_compilation_group);
67293 +}
67294 +
67295 +/* xen properties info */
67296 +
67297 +static ssize_t capabilities_show(struct hyp_sysfs_attr *attr, char *buffer)
67298 +{
67299 + int ret = -ENOMEM;
67300 + char *caps;
67301 +
67302 + caps = kmalloc(XEN_CAPABILITIES_INFO_LEN, GFP_KERNEL);
67303 + if (caps) {
67304 + ret = HYPERVISOR_xen_version(XENVER_capabilities, caps);
67305 + if (!ret)
67306 + ret = sprintf(buffer, "%s\n", caps);
67307 + kfree(caps);
67308 + }
67309 +
67310 + return ret;
67311 +}
67312 +
67313 +HYPERVISOR_ATTR_RO(capabilities);
67314 +
67315 +static ssize_t changeset_show(struct hyp_sysfs_attr *attr, char *buffer)
67316 +{
67317 + int ret = -ENOMEM;
67318 + char *cset;
67319 +
67320 + cset = kmalloc(XEN_CHANGESET_INFO_LEN, GFP_KERNEL);
67321 + if (cset) {
67322 + ret = HYPERVISOR_xen_version(XENVER_changeset, cset);
67323 + if (!ret)
67324 + ret = sprintf(buffer, "%s\n", cset);
67325 + kfree(cset);
67326 + }
67327 +
67328 + return ret;
67329 +}
67330 +
67331 +HYPERVISOR_ATTR_RO(changeset);
67332 +
67333 +static ssize_t virtual_start_show(struct hyp_sysfs_attr *attr, char *buffer)
67334 +{
67335 + int ret = -ENOMEM;
67336 + struct xen_platform_parameters *parms;
67337 +
67338 + parms = kmalloc(sizeof(struct xen_platform_parameters), GFP_KERNEL);
67339 + if (parms) {
67340 + ret = HYPERVISOR_xen_version(XENVER_platform_parameters,
67341 + parms);
67342 + if (!ret)
67343 + ret = sprintf(buffer, "%lx\n", parms->virt_start);
67344 + kfree(parms);
67345 + }
67346 +
67347 + return ret;
67348 +}
67349 +
67350 +HYPERVISOR_ATTR_RO(virtual_start);
67351 +
67352 +static ssize_t pagesize_show(struct hyp_sysfs_attr *attr, char *buffer)
67353 +{
67354 + int ret;
67355 +
67356 + ret = HYPERVISOR_xen_version(XENVER_pagesize, NULL);
67357 + if (ret > 0)
67358 + ret = sprintf(buffer, "%x\n", ret);
67359 +
67360 + return ret;
67361 +}
67362 +
67363 +HYPERVISOR_ATTR_RO(pagesize);
67364 +
67365 +/* eventually there will be several more features to export */
67366 +static ssize_t xen_feature_show(int index, char *buffer)
67367 +{
67368 + int ret = -ENOMEM;
67369 + struct xen_feature_info *info;
67370 +
67371 + info = kmalloc(sizeof(struct xen_feature_info), GFP_KERNEL);
67372 + if (info) {
67373 + info->submap_idx = index;
67374 + ret = HYPERVISOR_xen_version(XENVER_get_features, info);
67375 + if (!ret)
67376 + ret = sprintf(buffer, "%d\n", info->submap);
67377 + kfree(info);
67378 + }
67379 +
67380 + return ret;
67381 +}
67382 +
67383 +static ssize_t writable_pt_show(struct hyp_sysfs_attr *attr, char *buffer)
67384 +{
67385 + return xen_feature_show(XENFEAT_writable_page_tables, buffer);
67386 +}
67387 +
67388 +HYPERVISOR_ATTR_RO(writable_pt);
67389 +
67390 +static struct attribute *xen_properties_attrs[] = {
67391 + &capabilities_attr.attr,
67392 + &changeset_attr.attr,
67393 + &virtual_start_attr.attr,
67394 + &pagesize_attr.attr,
67395 + &writable_pt_attr.attr,
67396 + NULL
67397 +};
67398 +
67399 +static struct attribute_group xen_properties_group = {
67400 + .name = "properties",
67401 + .attrs = xen_properties_attrs,
67402 +};
67403 +
67404 +static int __init xen_properties_init(void)
67405 +{
67406 + return sysfs_create_group(&hypervisor_subsys.kset.kobj,
67407 + &xen_properties_group);
67408 +}
67409 +
67410 +static void xen_properties_destroy(void)
67411 +{
67412 + sysfs_remove_group(&hypervisor_subsys.kset.kobj,
67413 + &xen_properties_group);
67414 +}
67415 +
67416 +static int __init hyper_sysfs_init(void)
67417 +{
67418 + int ret;
67419 +
67420 + if (!is_running_on_xen())
67421 + return -ENODEV;
67422 +
67423 + ret = xen_sysfs_type_init();
67424 + if (ret)
67425 + goto out;
67426 + ret = xen_sysfs_version_init();
67427 + if (ret)
67428 + goto version_out;
67429 + ret = xen_compilation_init();
67430 + if (ret)
67431 + goto comp_out;
67432 + ret = xen_sysfs_uuid_init();
67433 + if (ret)
67434 + goto uuid_out;
67435 + ret = xen_properties_init();
67436 + if (!ret)
67437 + goto out;
67438 +
67439 + xen_sysfs_uuid_destroy();
67440 +uuid_out:
67441 + xen_compilation_destroy();
67442 +comp_out:
67443 + xen_sysfs_version_destroy();
67444 +version_out:
67445 + xen_sysfs_type_destroy();
67446 +out:
67447 + return ret;
67448 +}
67449 +
67450 +static void hyper_sysfs_exit(void)
67451 +{
67452 + xen_properties_destroy();
67453 + xen_compilation_destroy();
67454 + xen_sysfs_uuid_destroy();
67455 + xen_sysfs_version_destroy();
67456 + xen_sysfs_type_destroy();
67457 +
67458 +}
67459 +
67460 +module_init(hyper_sysfs_init);
67461 +module_exit(hyper_sysfs_exit);
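
Editorial note: the attribute groups registered by xen_sysfs.c above hang off the generic hypervisor sysfs subsystem, which is conventionally exposed at /sys/hypervisor in a running guest. A minimal userspace sketch that dumps them follows; the exact paths are an assumption based on that convention and are not stated in the patch itself.

/* Hypothetical reader for the sysfs attributes registered above. */
#include <stdio.h>

static void show(const char *path)
{
        char buf[128];
        FILE *f = fopen(path, "r");

        if (f == NULL)
                return;                 /* attribute absent on this system */
        if (fgets(buf, sizeof(buf), f) != NULL)
                printf("%-42s %s", path, buf);
        fclose(f);
}

int main(void)
{
        show("/sys/hypervisor/type");
        show("/sys/hypervisor/uuid");
        show("/sys/hypervisor/version/major");
        show("/sys/hypervisor/version/minor");
        show("/sys/hypervisor/version/extra");
        show("/sys/hypervisor/compilation/compiler");
        show("/sys/hypervisor/properties/capabilities");
        show("/sys/hypervisor/properties/pagesize");
        return 0;
}
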
67462 diff -Nur linux-2.6.16.33-noxen/drivers/xen/evtchn/Makefile linux-2.6.16.33/drivers/xen/evtchn/Makefile
67463 --- linux-2.6.16.33-noxen/drivers/xen/evtchn/Makefile 1970-01-01 00:00:00.000000000 +0000
67464 +++ linux-2.6.16.33/drivers/xen/evtchn/Makefile 2007-01-08 15:00:45.000000000 +0000
67465 @@ -0,0 +1,2 @@
67466 +
67467 +obj-y := evtchn.o
67468 diff -Nur linux-2.6.16.33-noxen/drivers/xen/evtchn/evtchn.c linux-2.6.16.33/drivers/xen/evtchn/evtchn.c
67469 --- linux-2.6.16.33-noxen/drivers/xen/evtchn/evtchn.c 1970-01-01 00:00:00.000000000 +0000
67470 +++ linux-2.6.16.33/drivers/xen/evtchn/evtchn.c 2007-01-08 15:00:45.000000000 +0000
67471 @@ -0,0 +1,457 @@
67472 +/******************************************************************************
67473 + * evtchn.c
67474 + *
67475 + * Driver for receiving and demuxing event-channel signals.
67476 + *
67477 + * Copyright (c) 2004-2005, K A Fraser
67478 + * Multi-process extensions Copyright (c) 2004, Steven Smith
67479 + *
67480 + * This program is free software; you can redistribute it and/or
67481 + * modify it under the terms of the GNU General Public License version 2
67482 + * as published by the Free Software Foundation; or, when distributed
67483 + * separately from the Linux kernel or incorporated into other
67484 + * software packages, subject to the following license:
67485 + *
67486 + * Permission is hereby granted, free of charge, to any person obtaining a copy
67487 + * of this source file (the "Software"), to deal in the Software without
67488 + * restriction, including without limitation the rights to use, copy, modify,
67489 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
67490 + * and to permit persons to whom the Software is furnished to do so, subject to
67491 + * the following conditions:
67492 + *
67493 + * The above copyright notice and this permission notice shall be included in
67494 + * all copies or substantial portions of the Software.
67495 + *
67496 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
67497 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
67498 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
67499 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
67500 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
67501 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
67502 + * IN THE SOFTWARE.
67503 + */
67504 +
67505 +#include <linux/config.h>
67506 +#include <linux/module.h>
67507 +#include <linux/kernel.h>
67508 +#include <linux/sched.h>
67509 +#include <linux/slab.h>
67510 +#include <linux/string.h>
67511 +#include <linux/errno.h>
67512 +#include <linux/fs.h>
67513 +#include <linux/errno.h>
67514 +#include <linux/miscdevice.h>
67515 +#include <linux/major.h>
67516 +#include <linux/proc_fs.h>
67517 +#include <linux/stat.h>
67518 +#include <linux/poll.h>
67519 +#include <linux/irq.h>
67520 +#include <linux/init.h>
67521 +#include <linux/gfp.h>
67522 +#include <xen/evtchn.h>
67523 +#include <xen/public/evtchn.h>
67524 +
67525 +struct per_user_data {
67526 + /* Notification ring, accessed via /dev/xen/evtchn. */
67527 +#define EVTCHN_RING_SIZE (PAGE_SIZE / sizeof(evtchn_port_t))
67528 +#define EVTCHN_RING_MASK(_i) ((_i)&(EVTCHN_RING_SIZE-1))
67529 + evtchn_port_t *ring;
67530 + unsigned int ring_cons, ring_prod, ring_overflow;
67531 +
67532 + /* Processes wait on this queue when ring is empty. */
67533 + wait_queue_head_t evtchn_wait;
67534 + struct fasync_struct *evtchn_async_queue;
67535 +};
67536 +
67537 +/* Who's bound to each port? */
67538 +static struct per_user_data *port_user[NR_EVENT_CHANNELS];
67539 +static spinlock_t port_user_lock;
67540 +
67541 +void evtchn_device_upcall(int port)
67542 +{
67543 + struct per_user_data *u;
67544 +
67545 + spin_lock(&port_user_lock);
67546 +
67547 + mask_evtchn(port);
67548 + clear_evtchn(port);
67549 +
67550 + if ((u = port_user[port]) != NULL) {
67551 + if ((u->ring_prod - u->ring_cons) < EVTCHN_RING_SIZE) {
67552 + u->ring[EVTCHN_RING_MASK(u->ring_prod)] = port;
67553 + if (u->ring_cons == u->ring_prod++) {
67554 + wake_up_interruptible(&u->evtchn_wait);
67555 + kill_fasync(&u->evtchn_async_queue,
67556 + SIGIO, POLL_IN);
67557 + }
67558 + } else {
67559 + u->ring_overflow = 1;
67560 + }
67561 + }
67562 +
67563 + spin_unlock(&port_user_lock);
67564 +}
67565 +
67566 +static ssize_t evtchn_read(struct file *file, char __user *buf,
67567 + size_t count, loff_t *ppos)
67568 +{
67569 + int rc;
67570 + unsigned int c, p, bytes1 = 0, bytes2 = 0;
67571 + struct per_user_data *u = file->private_data;
67572 +
67573 + /* Whole number of ports. */
67574 + count &= ~(sizeof(evtchn_port_t)-1);
67575 +
67576 + if (count == 0)
67577 + return 0;
67578 +
67579 + if (count > PAGE_SIZE)
67580 + count = PAGE_SIZE;
67581 +
67582 + for (;;) {
67583 + if (u->ring_overflow)
67584 + return -EFBIG;
67585 +
67586 + if ((c = u->ring_cons) != (p = u->ring_prod))
67587 + break;
67588 +
67589 + if (file->f_flags & O_NONBLOCK)
67590 + return -EAGAIN;
67591 +
67592 + rc = wait_event_interruptible(
67593 + u->evtchn_wait, u->ring_cons != u->ring_prod);
67594 + if (rc)
67595 + return rc;
67596 + }
67597 +
67598 + /* Byte lengths of two chunks. Chunk split (if any) is at ring wrap. */
67599 + if (((c ^ p) & EVTCHN_RING_SIZE) != 0) {
67600 + bytes1 = (EVTCHN_RING_SIZE - EVTCHN_RING_MASK(c)) *
67601 + sizeof(evtchn_port_t);
67602 + bytes2 = EVTCHN_RING_MASK(p) * sizeof(evtchn_port_t);
67603 + } else {
67604 + bytes1 = (p - c) * sizeof(evtchn_port_t);
67605 + bytes2 = 0;
67606 + }
67607 +
67608 + /* Truncate chunks according to caller's maximum byte count. */
67609 + if (bytes1 > count) {
67610 + bytes1 = count;
67611 + bytes2 = 0;
67612 + } else if ((bytes1 + bytes2) > count) {
67613 + bytes2 = count - bytes1;
67614 + }
67615 +
67616 + if (copy_to_user(buf, &u->ring[EVTCHN_RING_MASK(c)], bytes1) ||
67617 + ((bytes2 != 0) &&
67618 + copy_to_user(&buf[bytes1], &u->ring[0], bytes2)))
67619 + return -EFAULT;
67620 +
67621 + u->ring_cons += (bytes1 + bytes2) / sizeof(evtchn_port_t);
67622 +
67623 + return bytes1 + bytes2;
67624 +}
67625 +
67626 +static ssize_t evtchn_write(struct file *file, const char __user *buf,
67627 + size_t count, loff_t *ppos)
67628 +{
67629 + int rc, i;
67630 + evtchn_port_t *kbuf = (evtchn_port_t *)__get_free_page(GFP_KERNEL);
67631 + struct per_user_data *u = file->private_data;
67632 +
67633 + if (kbuf == NULL)
67634 + return -ENOMEM;
67635 +
67636 + /* Whole number of ports. */
67637 + count &= ~(sizeof(evtchn_port_t)-1);
67638 +
67639 + if (count == 0) {
67640 + rc = 0;
67641 + goto out;
67642 + }
67643 +
67644 + if (count > PAGE_SIZE)
67645 + count = PAGE_SIZE;
67646 +
67647 + if (copy_from_user(kbuf, buf, count) != 0) {
67648 + rc = -EFAULT;
67649 + goto out;
67650 + }
67651 +
67652 + spin_lock_irq(&port_user_lock);
67653 + for (i = 0; i < (count/sizeof(evtchn_port_t)); i++)
67654 + if ((kbuf[i] < NR_EVENT_CHANNELS) && (port_user[kbuf[i]] == u))
67655 + unmask_evtchn(kbuf[i]);
67656 + spin_unlock_irq(&port_user_lock);
67657 +
67658 + rc = count;
67659 +
67660 + out:
67661 + free_page((unsigned long)kbuf);
67662 + return rc;
67663 +}
67664 +
67665 +static void evtchn_bind_to_user(struct per_user_data *u, int port)
67666 +{
67667 + spin_lock_irq(&port_user_lock);
67668 + BUG_ON(port_user[port] != NULL);
67669 + port_user[port] = u;
67670 + unmask_evtchn(port);
67671 + spin_unlock_irq(&port_user_lock);
67672 +}
67673 +
67674 +static int evtchn_ioctl(struct inode *inode, struct file *file,
67675 + unsigned int cmd, unsigned long arg)
67676 +{
67677 + int rc;
67678 + struct per_user_data *u = file->private_data;
67679 + void __user *uarg = (void __user *) arg;
67680 +
67681 + switch (cmd) {
67682 + case IOCTL_EVTCHN_BIND_VIRQ: {
67683 + struct ioctl_evtchn_bind_virq bind;
67684 + struct evtchn_bind_virq bind_virq;
67685 +
67686 + rc = -EFAULT;
67687 + if (copy_from_user(&bind, uarg, sizeof(bind)))
67688 + break;
67689 +
67690 + bind_virq.virq = bind.virq;
67691 + bind_virq.vcpu = 0;
67692 + rc = HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq,
67693 + &bind_virq);
67694 + if (rc != 0)
67695 + break;
67696 +
67697 + rc = bind_virq.port;
67698 + evtchn_bind_to_user(u, rc);
67699 + break;
67700 + }
67701 +
67702 + case IOCTL_EVTCHN_BIND_INTERDOMAIN: {
67703 + struct ioctl_evtchn_bind_interdomain bind;
67704 + struct evtchn_bind_interdomain bind_interdomain;
67705 +
67706 + rc = -EFAULT;
67707 + if (copy_from_user(&bind, uarg, sizeof(bind)))
67708 + break;
67709 +
67710 + bind_interdomain.remote_dom = bind.remote_domain;
67711 + bind_interdomain.remote_port = bind.remote_port;
67712 + rc = HYPERVISOR_event_channel_op(EVTCHNOP_bind_interdomain,
67713 + &bind_interdomain);
67714 + if (rc != 0)
67715 + break;
67716 +
67717 + rc = bind_interdomain.local_port;
67718 + evtchn_bind_to_user(u, rc);
67719 + break;
67720 + }
67721 +
67722 + case IOCTL_EVTCHN_BIND_UNBOUND_PORT: {
67723 + struct ioctl_evtchn_bind_unbound_port bind;
67724 + struct evtchn_alloc_unbound alloc_unbound;
67725 +
67726 + rc = -EFAULT;
67727 + if (copy_from_user(&bind, uarg, sizeof(bind)))
67728 + break;
67729 +
67730 + alloc_unbound.dom = DOMID_SELF;
67731 + alloc_unbound.remote_dom = bind.remote_domain;
67732 + rc = HYPERVISOR_event_channel_op(EVTCHNOP_alloc_unbound,
67733 + &alloc_unbound);
67734 + if (rc != 0)
67735 + break;
67736 +
67737 + rc = alloc_unbound.port;
67738 + evtchn_bind_to_user(u, rc);
67739 + break;
67740 + }
67741 +
67742 + case IOCTL_EVTCHN_UNBIND: {
67743 + struct ioctl_evtchn_unbind unbind;
67744 + struct evtchn_close close;
67745 + int ret;
67746 +
67747 + rc = -EFAULT;
67748 + if (copy_from_user(&unbind, uarg, sizeof(unbind)))
67749 + break;
67750 +
67751 + rc = -EINVAL;
67752 + if (unbind.port >= NR_EVENT_CHANNELS)
67753 + break;
67754 +
67755 + spin_lock_irq(&port_user_lock);
67756 +
67757 + rc = -ENOTCONN;
67758 + if (port_user[unbind.port] != u) {
67759 + spin_unlock_irq(&port_user_lock);
67760 + break;
67761 + }
67762 +
67763 + port_user[unbind.port] = NULL;
67764 + mask_evtchn(unbind.port);
67765 +
67766 + spin_unlock_irq(&port_user_lock);
67767 +
67768 + close.port = unbind.port;
67769 + ret = HYPERVISOR_event_channel_op(EVTCHNOP_close, &close);
67770 + BUG_ON(ret);
67771 +
67772 + rc = 0;
67773 + break;
67774 + }
67775 +
67776 + case IOCTL_EVTCHN_NOTIFY: {
67777 + struct ioctl_evtchn_notify notify;
67778 +
67779 + rc = -EFAULT;
67780 + if (copy_from_user(&notify, uarg, sizeof(notify)))
67781 + break;
67782 +
67783 + if (notify.port >= NR_EVENT_CHANNELS) {
67784 + rc = -EINVAL;
67785 + } else if (port_user[notify.port] != u) {
67786 + rc = -ENOTCONN;
67787 + } else {
67788 + notify_remote_via_evtchn(notify.port);
67789 + rc = 0;
67790 + }
67791 + break;
67792 + }
67793 +
67794 + case IOCTL_EVTCHN_RESET: {
67795 + /* Initialise the ring to empty. Clear errors. */
67796 + spin_lock_irq(&port_user_lock);
67797 + u->ring_cons = u->ring_prod = u->ring_overflow = 0;
67798 + spin_unlock_irq(&port_user_lock);
67799 + rc = 0;
67800 + break;
67801 + }
67802 +
67803 + default:
67804 + rc = -ENOSYS;
67805 + break;
67806 + }
67807 +
67808 + return rc;
67809 +}
67810 +
67811 +static unsigned int evtchn_poll(struct file *file, poll_table *wait)
67812 +{
67813 + unsigned int mask = POLLOUT | POLLWRNORM;
67814 + struct per_user_data *u = file->private_data;
67815 +
67816 + poll_wait(file, &u->evtchn_wait, wait);
67817 + if (u->ring_cons != u->ring_prod)
67818 + mask |= POLLIN | POLLRDNORM;
67819 + if (u->ring_overflow)
67820 + mask = POLLERR;
67821 + return mask;
67822 +}
67823 +
67824 +static int evtchn_fasync(int fd, struct file *filp, int on)
67825 +{
67826 + struct per_user_data *u = filp->private_data;
67827 + return fasync_helper(fd, filp, on, &u->evtchn_async_queue);
67828 +}
67829 +
67830 +static int evtchn_open(struct inode *inode, struct file *filp)
67831 +{
67832 + struct per_user_data *u;
67833 +
67834 + if ((u = kmalloc(sizeof(*u), GFP_KERNEL)) == NULL)
67835 + return -ENOMEM;
67836 +
67837 + memset(u, 0, sizeof(*u));
67838 + init_waitqueue_head(&u->evtchn_wait);
67839 +
67840 + u->ring = (evtchn_port_t *)__get_free_page(GFP_KERNEL);
67841 + if (u->ring == NULL) {
67842 + kfree(u);
67843 + return -ENOMEM;
67844 + }
67845 +
67846 + filp->private_data = u;
67847 +
67848 + return 0;
67849 +}
67850 +
67851 +static int evtchn_release(struct inode *inode, struct file *filp)
67852 +{
67853 + int i;
67854 + struct per_user_data *u = filp->private_data;
67855 + struct evtchn_close close;
67856 +
67857 + spin_lock_irq(&port_user_lock);
67858 +
67859 + free_page((unsigned long)u->ring);
67860 +
67861 + for (i = 0; i < NR_EVENT_CHANNELS; i++) {
67862 + int ret;
67863 + if (port_user[i] != u)
67864 + continue;
67865 +
67866 + port_user[i] = NULL;
67867 + mask_evtchn(i);
67868 +
67869 + close.port = i;
67870 + ret = HYPERVISOR_event_channel_op(EVTCHNOP_close, &close);
67871 + BUG_ON(ret);
67872 + }
67873 +
67874 + spin_unlock_irq(&port_user_lock);
67875 +
67876 + kfree(u);
67877 +
67878 + return 0;
67879 +}
67880 +
67881 +static struct file_operations evtchn_fops = {
67882 + .owner = THIS_MODULE,
67883 + .read = evtchn_read,
67884 + .write = evtchn_write,
67885 + .ioctl = evtchn_ioctl,
67886 + .poll = evtchn_poll,
67887 + .fasync = evtchn_fasync,
67888 + .open = evtchn_open,
67889 + .release = evtchn_release,
67890 +};
67891 +
67892 +static struct miscdevice evtchn_miscdev = {
67893 + .minor = MISC_DYNAMIC_MINOR,
67894 + .name = "evtchn",
67895 + .fops = &evtchn_fops,
67896 +};
67897 +
67898 +static int __init evtchn_init(void)
67899 +{
67900 + int err;
67901 +
67902 + if (!is_running_on_xen())
67903 + return -ENODEV;
67904 +
67905 + spin_lock_init(&port_user_lock);
67906 + memset(port_user, 0, sizeof(port_user));
67907 +
67908 + /* Create '/dev/misc/evtchn'. */
67909 + err = misc_register(&evtchn_miscdev);
67910 + if (err != 0) {
67911 + printk(KERN_ALERT "Could not register /dev/misc/evtchn\n");
67912 + return err;
67913 + }
67914 +
67915 + printk("Event-channel device installed.\n");
67916 +
67917 + return 0;
67918 +}
67919 +
67920 +static void evtchn_cleanup(void)
67921 +{
67922 + misc_deregister(&evtchn_miscdev);
67923 +}
67924 +
67925 +module_init(evtchn_init);
67926 +module_exit(evtchn_cleanup);
67927 +
67928 +MODULE_LICENSE("Dual BSD/GPL");
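
Editorial note: the evtchn device above implements a small userspace protocol: an ioctl() binds a port to the calling file handle, read() returns whole evtchn_port_t values for ports that have been signalled (the driver masks a port when it queues it), and write() takes an array of ports to unmask them again. A minimal sketch of a client follows; the device node path and the assumption that the ioctl definitions from xen/public/evtchn.h are importable from userspace both depend on the local installation and are not guaranteed by the patch.

/* Hypothetical userspace client of the event-channel device. */
#include <stdio.h>
#include <stdint.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <xen/public/evtchn.h>   /* ioctl_evtchn_* definitions (assumed install path) */

int main(void)
{
        struct ioctl_evtchn_bind_unbound_port bind = { .remote_domain = 0 };
        uint32_t port;           /* evtchn_port_t is a 32-bit port number */
        int fd, rc;

        fd = open("/dev/xen/evtchn", O_RDWR);   /* may be /dev/misc/evtchn instead */
        if (fd < 0) {
                perror("open");
                return 1;
        }

        /* Allocate an unbound port for dom0 to connect to; returns the local port. */
        rc = ioctl(fd, IOCTL_EVTCHN_BIND_UNBOUND_PORT, &bind);
        if (rc < 0) {
                perror("ioctl");
                return 1;
        }
        printf("bound local port %d\n", rc);

        /* read() blocks until a bound port fires and returns whole port values. */
        if (read(fd, &port, sizeof(port)) == sizeof(port)) {
                printf("event on port %u\n", port);
                /* Writing the port back unmasks it so further events can arrive. */
                write(fd, &port, sizeof(port));
        }

        close(fd);
        return 0;
}
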
67929 diff -Nur linux-2.6.16.33-noxen/drivers/xen/fbfront/Makefile linux-2.6.16.33/drivers/xen/fbfront/Makefile
67930 --- linux-2.6.16.33-noxen/drivers/xen/fbfront/Makefile 1970-01-01 00:00:00.000000000 +0000
67931 +++ linux-2.6.16.33/drivers/xen/fbfront/Makefile 2007-01-08 15:00:45.000000000 +0000
67932 @@ -0,0 +1,2 @@
67933 +obj-$(CONFIG_XEN_FRAMEBUFFER) := xenfb.o
67934 +obj-$(CONFIG_XEN_KEYBOARD) += xenkbd.o
67935 diff -Nur linux-2.6.16.33-noxen/drivers/xen/fbfront/xenfb.c linux-2.6.16.33/drivers/xen/fbfront/xenfb.c
67936 --- linux-2.6.16.33-noxen/drivers/xen/fbfront/xenfb.c 1970-01-01 00:00:00.000000000 +0000
67937 +++ linux-2.6.16.33/drivers/xen/fbfront/xenfb.c 2007-01-08 15:00:45.000000000 +0000
67938 @@ -0,0 +1,750 @@
67939 +/*
67940 + * linux/drivers/video/xenfb.c -- Xen para-virtual frame buffer device
67941 + *
67942 + * Copyright (C) 2005-2006 Anthony Liguori <aliguori@us.ibm.com>
67943 + * Copyright (C) 2006 Red Hat, Inc., Markus Armbruster <armbru@redhat.com>
67944 + *
67945 + * Based on linux/drivers/video/q40fb.c
67946 + *
67947 + * This file is subject to the terms and conditions of the GNU General Public
67948 + * License. See the file COPYING in the main directory of this archive for
67949 + * more details.
67950 + */
67951 +
67952 +/*
67953 + * TODO:
67954 + *
67955 + * Switch to grant tables when they become capable of dealing with the
67956 + * frame buffer.
67957 + */
67958 +
67959 +#include <linux/kernel.h>
67960 +#include <linux/errno.h>
67961 +#include <linux/fb.h>
67962 +#include <linux/module.h>
67963 +#include <linux/vmalloc.h>
67964 +#include <linux/mm.h>
67965 +#include <asm/hypervisor.h>
67966 +#include <xen/evtchn.h>
67967 +#include <xen/interface/io/fbif.h>
67968 +#include <xen/xenbus.h>
67969 +#include <linux/kthread.h>
67970 +
67971 +struct xenfb_mapping
67972 +{
67973 + struct list_head link;
67974 + struct vm_area_struct *vma;
67975 + atomic_t map_refs;
67976 + int faults;
67977 + struct xenfb_info *info;
67978 +};
67979 +
67980 +struct xenfb_info
67981 +{
67982 + struct task_struct *kthread;
67983 + wait_queue_head_t wq;
67984 +
67985 + unsigned char *fb;
67986 + struct fb_info *fb_info;
67987 + struct timer_list refresh;
67988 + int dirty;
67989 + int x1, y1, x2, y2; /* dirty rectangle,
67990 + protected by dirty_lock */
67991 + spinlock_t dirty_lock;
67992 + struct mutex mm_lock;
67993 + int nr_pages;
67994 + struct page **pages;
67995 + struct list_head mappings; /* protected by mm_lock */
67996 +
67997 + unsigned evtchn;
67998 + int irq;
67999 + struct xenfb_page *page;
68000 + unsigned long *mfns;
68001 + int update_wanted; /* XENFB_TYPE_UPDATE wanted */
68002 +
68003 + struct xenbus_device *xbdev;
68004 +};
68005 +
68006 +/*
68007 + * How the locks work together
68008 + *
68009 + * There are two locks: spinlock dirty_lock protecting the dirty
68010 + * rectangle, and mutex mm_lock protecting mappings.
68011 + *
68012 + * The problem is that dirty rectangle and mappings aren't
68013 + * independent: the dirty rectangle must cover all faulted pages in
68014 + * mappings. We need to prove that our locking maintains this
68015 + * invariant.
68016 + *
68017 + * There are several kinds of critical regions:
68018 + *
68019 + * 1. Holding only dirty_lock: xenfb_refresh(). May run in
68020 + * interrupts. Extends the dirty rectangle. Trivially preserves
68021 + * invariant.
68022 + *
68023 + * 2. Holding only mm_lock: xenfb_mmap() and xenfb_vm_close(). Touch
68024 + * only mappings. The former creates unfaulted pages. Preserves
68025 + * invariant. The latter removes pages. Preserves invariant.
68026 + *
68027 + * 3. Holding both locks: xenfb_vm_nopage(). Extends the dirty
68028 + * rectangle and updates mappings consistently. Preserves
68029 + * invariant.
68030 + *
68031 + * 4. The ugliest one: xenfb_update_screen(). Clear the dirty
68032 + * rectangle and update mappings consistently.
68033 + *
68034 + * We can't simply hold both locks, because zap_page_range() cannot
68035 + * be called with a spinlock held.
68036 + *
68037 + * Therefore, we first clear the dirty rectangle with both locks
68038 + * held. Then we unlock dirty_lock and update the mappings.
68039 + * Critical regions that hold only dirty_lock may interfere with
68040 + * that. This can only be region 1: xenfb_refresh(). But that
68041 + * just extends the dirty rectangle, which can't harm the
68042 + * invariant.
68043 + *
68044 + * But FIXME: the invariant is too weak. It misses that the fault
68045 + * record in mappings must be consistent with the mapping of pages in
68046 + * the associated address space! do_no_page() updates the PTE after
68047 + * xenfb_vm_nopage() returns, i.e. outside the critical region. This
68048 + * allows the following race:
68049 + *
68050 + * X writes to some address in the Xen frame buffer
68051 + * Fault - call do_no_page()
68052 + * call xenfb_vm_nopage()
68053 + * grab mm_lock
68054 + * map->faults++;
68055 + * release mm_lock
68056 + * return back to do_no_page()
68057 + * (preempted, or SMP)
68058 + * Xen worker thread runs.
68059 + * grab mm_lock
68060 + * look at mappings
68061 + * find this mapping, zaps its pages (but page not in pte yet)
68062 + * clear map->faults
68063 + * releases mm_lock
68064 + * (back to X process)
68065 + * put page in X's pte
68066 + *
68067 + * Oh well, we won't be updating the writes to this page anytime soon.
68068 + */
68069 +
68070 +static int xenfb_fps = 20;
68071 +static unsigned long xenfb_mem_len = XENFB_WIDTH * XENFB_HEIGHT * XENFB_DEPTH / 8;
68072 +
68073 +static int xenfb_remove(struct xenbus_device *);
68074 +static void xenfb_init_shared_page(struct xenfb_info *);
68075 +static int xenfb_connect_backend(struct xenbus_device *, struct xenfb_info *);
68076 +static void xenfb_disconnect_backend(struct xenfb_info *);
68077 +
68078 +static void xenfb_do_update(struct xenfb_info *info,
68079 + int x, int y, int w, int h)
68080 +{
68081 + union xenfb_out_event event;
68082 + __u32 prod;
68083 +
68084 + event.type = XENFB_TYPE_UPDATE;
68085 + event.update.x = x;
68086 + event.update.y = y;
68087 + event.update.width = w;
68088 + event.update.height = h;
68089 +
68090 + prod = info->page->out_prod;
68091 + /* caller ensures !xenfb_queue_full() */
68092 + mb(); /* ensure ring space available */
68093 + XENFB_OUT_RING_REF(info->page, prod) = event;
68094 + wmb(); /* ensure ring contents visible */
68095 + info->page->out_prod = prod + 1;
68096 +
68097 + notify_remote_via_evtchn(info->evtchn);
68098 +}
68099 +
68100 +static int xenfb_queue_full(struct xenfb_info *info)
68101 +{
68102 + __u32 cons, prod;
68103 +
68104 + prod = info->page->out_prod;
68105 + cons = info->page->out_cons;
68106 + return prod - cons == XENFB_OUT_RING_LEN;
68107 +}
68108 +
68109 +static void xenfb_update_screen(struct xenfb_info *info)
68110 +{
68111 + unsigned long flags;
68112 + int y1, y2, x1, x2;
68113 + struct xenfb_mapping *map;
68114 +
68115 + if (!info->update_wanted)
68116 + return;
68117 + if (xenfb_queue_full(info))
68118 + return;
68119 +
68120 + mutex_lock(&info->mm_lock);
68121 +
68122 + spin_lock_irqsave(&info->dirty_lock, flags);
68123 + y1 = info->y1;
68124 + y2 = info->y2;
68125 + x1 = info->x1;
68126 + x2 = info->x2;
68127 + info->x1 = info->y1 = INT_MAX;
68128 + info->x2 = info->y2 = 0;
68129 + spin_unlock_irqrestore(&info->dirty_lock, flags);
68130 +
68131 + list_for_each_entry(map, &info->mappings, link) {
68132 + if (!map->faults)
68133 + continue;
68134 + zap_page_range(map->vma, map->vma->vm_start,
68135 + map->vma->vm_end - map->vma->vm_start, NULL);
68136 + map->faults = 0;
68137 + }
68138 +
68139 + mutex_unlock(&info->mm_lock);
68140 +
68141 + xenfb_do_update(info, x1, y1, x2 - x1, y2 - y1);
68142 +}
68143 +
68144 +static int xenfb_thread(void *data)
68145 +{
68146 + struct xenfb_info *info = data;
68147 +
68148 + while (!kthread_should_stop()) {
68149 + if (info->dirty) {
68150 + info->dirty = 0;
68151 + xenfb_update_screen(info);
68152 + }
68153 + wait_event_interruptible(info->wq,
68154 + kthread_should_stop() || info->dirty);
68155 + try_to_freeze();
68156 + }
68157 + return 0;
68158 +}
68159 +
68160 +static int xenfb_setcolreg(unsigned regno, unsigned red, unsigned green,
68161 + unsigned blue, unsigned transp,
68162 + struct fb_info *info)
68163 +{
68164 + u32 v;
68165 +
68166 + if (regno > info->cmap.len)
68167 + return 1;
68168 +
68169 + red >>= (16 - info->var.red.length);
68170 + green >>= (16 - info->var.green.length);
68171 + blue >>= (16 - info->var.blue.length);
68172 +
68173 + v = (red << info->var.red.offset) |
68174 + (green << info->var.green.offset) |
68175 + (blue << info->var.blue.offset);
68176 +
68177 + /* FIXME is this sane? check against xxxfb_setcolreg()! */
68178 + switch (info->var.bits_per_pixel) {
68179 + case 16:
68180 + case 24:
68181 + case 32:
68182 + ((u32 *)info->pseudo_palette)[regno] = v;
68183 + break;
68184 + }
68185 +
68186 + return 0;
68187 +}
68188 +
68189 +static void xenfb_timer(unsigned long data)
68190 +{
68191 + struct xenfb_info *info = (struct xenfb_info *)data;
68192 + info->dirty = 1;
68193 + wake_up(&info->wq);
68194 +}
68195 +
68196 +static void __xenfb_refresh(struct xenfb_info *info,
68197 + int x1, int y1, int w, int h)
68198 +{
68199 + int y2, x2;
68200 +
68201 + y2 = y1 + h;
68202 + x2 = x1 + w;
68203 +
68204 + if (info->y1 > y1)
68205 + info->y1 = y1;
68206 + if (info->y2 < y2)
68207 + info->y2 = y2;
68208 + if (info->x1 > x1)
68209 + info->x1 = x1;
68210 + if (info->x2 < x2)
68211 + info->x2 = x2;
68212 +
68213 + if (timer_pending(&info->refresh))
68214 + return;
68215 +
68216 + mod_timer(&info->refresh, jiffies + HZ/xenfb_fps);
68217 +}
68218 +
68219 +static void xenfb_refresh(struct xenfb_info *info,
68220 + int x1, int y1, int w, int h)
68221 +{
68222 + unsigned long flags;
68223 +
68224 + spin_lock_irqsave(&info->dirty_lock, flags);
68225 + __xenfb_refresh(info, x1, y1, w, h);
68226 + spin_unlock_irqrestore(&info->dirty_lock, flags);
68227 +}
68228 +
68229 +static void xenfb_fillrect(struct fb_info *p, const struct fb_fillrect *rect)
68230 +{
68231 + struct xenfb_info *info = p->par;
68232 +
68233 + cfb_fillrect(p, rect);
68234 + xenfb_refresh(info, rect->dx, rect->dy, rect->width, rect->height);
68235 +}
68236 +
68237 +static void xenfb_imageblit(struct fb_info *p, const struct fb_image *image)
68238 +{
68239 + struct xenfb_info *info = p->par;
68240 +
68241 + cfb_imageblit(p, image);
68242 + xenfb_refresh(info, image->dx, image->dy, image->width, image->height);
68243 +}
68244 +
68245 +static void xenfb_copyarea(struct fb_info *p, const struct fb_copyarea *area)
68246 +{
68247 + struct xenfb_info *info = p->par;
68248 +
68249 + cfb_copyarea(p, area);
68250 + xenfb_refresh(info, area->dx, area->dy, area->width, area->height);
68251 +}
68252 +
68253 +static void xenfb_vm_open(struct vm_area_struct *vma)
68254 +{
68255 + struct xenfb_mapping *map = vma->vm_private_data;
68256 + atomic_inc(&map->map_refs);
68257 +}
68258 +
68259 +static void xenfb_vm_close(struct vm_area_struct *vma)
68260 +{
68261 + struct xenfb_mapping *map = vma->vm_private_data;
68262 + struct xenfb_info *info = map->info;
68263 +
68264 + mutex_lock(&info->mm_lock);
68265 + if (atomic_dec_and_test(&map->map_refs)) {
68266 + list_del(&map->link);
68267 + kfree(map);
68268 + }
68269 + mutex_unlock(&info->mm_lock);
68270 +}
68271 +
68272 +static struct page *xenfb_vm_nopage(struct vm_area_struct *vma,
68273 + unsigned long vaddr, int *type)
68274 +{
68275 + struct xenfb_mapping *map = vma->vm_private_data;
68276 + struct xenfb_info *info = map->info;
68277 + int pgnr = (vaddr - vma->vm_start) >> PAGE_SHIFT;
68278 + unsigned long flags;
68279 + struct page *page;
68280 + int y1, y2;
68281 +
68282 + if (pgnr >= info->nr_pages)
68283 + return NOPAGE_SIGBUS;
68284 +
68285 + mutex_lock(&info->mm_lock);
68286 + spin_lock_irqsave(&info->dirty_lock, flags);
68287 + page = info->pages[pgnr];
68288 + get_page(page);
68289 + map->faults++;
68290 +
68291 + y1 = pgnr * PAGE_SIZE / info->fb_info->fix.line_length;
68292 + y2 = (pgnr * PAGE_SIZE + PAGE_SIZE - 1) / info->fb_info->fix.line_length;
68293 + if (y2 > info->fb_info->var.yres)
68294 + y2 = info->fb_info->var.yres;
68295 + __xenfb_refresh(info, 0, y1, info->fb_info->var.xres, y2 - y1);
68296 + spin_unlock_irqrestore(&info->dirty_lock, flags);
68297 + mutex_unlock(&info->mm_lock);
68298 +
68299 + if (type)
68300 + *type = VM_FAULT_MINOR;
68301 +
68302 + return page;
68303 +}
68304 +
68305 +static struct vm_operations_struct xenfb_vm_ops = {
68306 + .open = xenfb_vm_open,
68307 + .close = xenfb_vm_close,
68308 + .nopage = xenfb_vm_nopage,
68309 +};
68310 +
68311 +static int xenfb_mmap(struct fb_info *fb_info, struct vm_area_struct *vma)
68312 +{
68313 + struct xenfb_info *info = fb_info->par;
68314 + struct xenfb_mapping *map;
68315 + int map_pages;
68316 +
68317 + if (!(vma->vm_flags & VM_WRITE))
68318 + return -EINVAL;
68319 + if (!(vma->vm_flags & VM_SHARED))
68320 + return -EINVAL;
68321 + if (vma->vm_pgoff != 0)
68322 + return -EINVAL;
68323 +
68324 + map_pages = (vma->vm_end - vma->vm_start + PAGE_SIZE-1) >> PAGE_SHIFT;
68325 + if (map_pages > info->nr_pages)
68326 + return -EINVAL;
68327 +
68328 + map = kzalloc(sizeof(*map), GFP_KERNEL);
68329 + if (map == NULL)
68330 + return -ENOMEM;
68331 +
68332 + map->vma = vma;
68333 + map->faults = 0;
68334 + map->info = info;
68335 + atomic_set(&map->map_refs, 1);
68336 +
68337 + mutex_lock(&info->mm_lock);
68338 + list_add(&map->link, &info->mappings);
68339 + mutex_unlock(&info->mm_lock);
68340 +
68341 + vma->vm_ops = &xenfb_vm_ops;
68342 + vma->vm_flags |= (VM_DONTEXPAND | VM_RESERVED);
68343 + vma->vm_private_data = map;
68344 +
68345 + return 0;
68346 +}
68347 +
68348 +static struct fb_ops xenfb_fb_ops = {
68349 + .owner = THIS_MODULE,
68350 + .fb_setcolreg = xenfb_setcolreg,
68351 + .fb_fillrect = xenfb_fillrect,
68352 + .fb_copyarea = xenfb_copyarea,
68353 + .fb_imageblit = xenfb_imageblit,
68354 + .fb_mmap = xenfb_mmap,
68355 +};
68356 +
68357 +static irqreturn_t xenfb_event_handler(int rq, void *dev_id,
68358 + struct pt_regs *regs)
68359 +{
68360 + /*
68361 + * No in events recognized, simply ignore them all.
68362 + * If you need to recognize some, see xenkbd's input_handler()
68363 + * for how to do that.
68364 + */
68365 + struct xenfb_info *info = dev_id;
68366 + struct xenfb_page *page = info->page;
68367 +
68368 + if (page->in_cons != page->in_prod) {
68369 + info->page->in_cons = info->page->in_prod;
68370 + notify_remote_via_evtchn(info->evtchn);
68371 + }
68372 + return IRQ_HANDLED;
68373 +}
68374 +
68375 +static unsigned long vmalloc_to_mfn(void *address)
68376 +{
68377 + return pfn_to_mfn(vmalloc_to_pfn(address));
68378 +}
68379 +
68380 +static int __devinit xenfb_probe(struct xenbus_device *dev,
68381 + const struct xenbus_device_id *id)
68382 +{
68383 + struct xenfb_info *info;
68384 + struct fb_info *fb_info;
68385 + int ret;
68386 +
68387 + info = kzalloc(sizeof(*info), GFP_KERNEL);
68388 + if (info == NULL) {
68389 + xenbus_dev_fatal(dev, -ENOMEM, "allocating info structure");
68390 + return -ENOMEM;
68391 + }
68392 + dev->dev.driver_data = info;
68393 + info->xbdev = dev;
68394 + info->irq = -1;
68395 + info->x1 = info->y1 = INT_MAX;
68396 + spin_lock_init(&info->dirty_lock);
68397 + mutex_init(&info->mm_lock);
68398 + init_waitqueue_head(&info->wq);
68399 + init_timer(&info->refresh);
68400 + info->refresh.function = xenfb_timer;
68401 + info->refresh.data = (unsigned long)info;
68402 + INIT_LIST_HEAD(&info->mappings);
68403 +
68404 + info->fb = vmalloc(xenfb_mem_len);
68405 + if (info->fb == NULL)
68406 + goto error_nomem;
68407 + memset(info->fb, 0, xenfb_mem_len);
68408 +
68409 + info->nr_pages = (xenfb_mem_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
68410 +
68411 + info->pages = kmalloc(sizeof(struct page *) * info->nr_pages,
68412 + GFP_KERNEL);
68413 + if (info->pages == NULL)
68414 + goto error_nomem;
68415 +
68416 + info->mfns = vmalloc(sizeof(unsigned long) * info->nr_pages);
68417 + if (!info->mfns)
68418 + goto error_nomem;
68419 +
68420 + /* set up shared page */
68421 + info->page = (void *)__get_free_page(GFP_KERNEL);
68422 + if (!info->page)
68423 + goto error_nomem;
68424 +
68425 + xenfb_init_shared_page(info);
68426 +
68427 + fb_info = framebuffer_alloc(sizeof(u32) * 256, NULL);
68428 + /* see fishy hackery below */
68429 + if (fb_info == NULL)
68430 + goto error_nomem;
68431 +
68432 + /* FIXME fishy hackery */
68433 + fb_info->pseudo_palette = fb_info->par;
68434 + fb_info->par = info;
68435 + /* /FIXME */
68436 + fb_info->screen_base = info->fb;
68437 +
68438 + fb_info->fbops = &xenfb_fb_ops;
68439 + fb_info->var.xres_virtual = fb_info->var.xres = info->page->width;
68440 + fb_info->var.yres_virtual = fb_info->var.yres = info->page->height;
68441 + fb_info->var.bits_per_pixel = info->page->depth;
68442 +
68443 + fb_info->var.red = (struct fb_bitfield){16, 8, 0};
68444 + fb_info->var.green = (struct fb_bitfield){8, 8, 0};
68445 + fb_info->var.blue = (struct fb_bitfield){0, 8, 0};
68446 +
68447 + fb_info->var.activate = FB_ACTIVATE_NOW;
68448 + fb_info->var.height = -1;
68449 + fb_info->var.width = -1;
68450 + fb_info->var.vmode = FB_VMODE_NONINTERLACED;
68451 +
68452 + fb_info->fix.visual = FB_VISUAL_TRUECOLOR;
68453 + fb_info->fix.line_length = info->page->line_length;
68454 + fb_info->fix.smem_start = 0;
68455 + fb_info->fix.smem_len = xenfb_mem_len;
68456 + strcpy(fb_info->fix.id, "xen");
68457 + fb_info->fix.type = FB_TYPE_PACKED_PIXELS;
68458 + fb_info->fix.accel = FB_ACCEL_NONE;
68459 +
68460 + fb_info->flags = FBINFO_FLAG_DEFAULT;
68461 +
68462 + ret = fb_alloc_cmap(&fb_info->cmap, 256, 0);
68463 + if (ret < 0) {
68464 + framebuffer_release(fb_info);
68465 + xenbus_dev_fatal(dev, ret, "fb_alloc_cmap");
68466 + goto error;
68467 + }
68468 +
68469 + ret = register_framebuffer(fb_info);
68470 + if (ret) {
68471 + fb_dealloc_cmap(&info->fb_info->cmap);
68472 + framebuffer_release(fb_info);
68473 + xenbus_dev_fatal(dev, ret, "register_framebuffer");
68474 + goto error;
68475 + }
68476 + info->fb_info = fb_info;
68477 +
68478 + /* FIXME should this be delayed until backend XenbusStateConnected? */
68479 + info->kthread = kthread_run(xenfb_thread, info, "xenfb thread");
68480 + if (IS_ERR(info->kthread)) {
68481 + ret = PTR_ERR(info->kthread);
68482 + info->kthread = NULL;
68483 + xenbus_dev_fatal(dev, ret, "xenfb_thread");
68484 + goto error;
68485 + }
68486 +
68487 + ret = xenfb_connect_backend(dev, info);
68488 + if (ret < 0)
68489 + goto error;
68490 +
68491 + return 0;
68492 +
68493 + error_nomem:
68494 + ret = -ENOMEM;
68495 + xenbus_dev_fatal(dev, ret, "allocating device memory");
68496 + error:
68497 + xenfb_remove(dev);
68498 + return ret;
68499 +}
68500 +
68501 +static int xenfb_resume(struct xenbus_device *dev)
68502 +{
68503 + struct xenfb_info *info = dev->dev.driver_data;
68504 +
68505 + xenfb_disconnect_backend(info);
68506 + xenfb_init_shared_page(info);
68507 + return xenfb_connect_backend(dev, info);
68508 +}
68509 +
68510 +static int xenfb_remove(struct xenbus_device *dev)
68511 +{
68512 + struct xenfb_info *info = dev->dev.driver_data;
68513 +
68514 + del_timer(&info->refresh);
68515 + if (info->kthread)
68516 + kthread_stop(info->kthread);
68517 + xenfb_disconnect_backend(info);
68518 + if (info->fb_info) {
68519 + unregister_framebuffer(info->fb_info);
68520 + fb_dealloc_cmap(&info->fb_info->cmap);
68521 + framebuffer_release(info->fb_info);
68522 + }
68523 + free_page((unsigned long)info->page);
68524 + vfree(info->mfns);
68525 + kfree(info->pages);
68526 + vfree(info->fb);
68527 + kfree(info);
68528 +
68529 + return 0;
68530 +}
68531 +
68532 +static void xenfb_init_shared_page(struct xenfb_info *info)
68533 +{
68534 + int i;
68535 +
68536 + for (i = 0; i < info->nr_pages; i++)
68537 + info->pages[i] = vmalloc_to_page(info->fb + i * PAGE_SIZE);
68538 +
68539 + for (i = 0; i < info->nr_pages; i++)
68540 + info->mfns[i] = vmalloc_to_mfn(info->fb + i * PAGE_SIZE);
68541 +
68542 + info->page->pd[0] = vmalloc_to_mfn(info->mfns);
68543 + info->page->pd[1] = 0;
68544 + info->page->width = XENFB_WIDTH;
68545 + info->page->height = XENFB_HEIGHT;
68546 + info->page->depth = XENFB_DEPTH;
68547 + info->page->line_length = (info->page->depth / 8) * info->page->width;
68548 + info->page->mem_length = xenfb_mem_len;
68549 + info->page->in_cons = info->page->in_prod = 0;
68550 + info->page->out_cons = info->page->out_prod = 0;
68551 +}
68552 +
68553 +static int xenfb_connect_backend(struct xenbus_device *dev,
68554 + struct xenfb_info *info)
68555 +{
68556 + int ret;
68557 + struct xenbus_transaction xbt;
68558 +
68559 + ret = xenbus_alloc_evtchn(dev, &info->evtchn);
68560 + if (ret)
68561 + return ret;
68562 + ret = bind_evtchn_to_irqhandler(info->evtchn, xenfb_event_handler,
68563 + 0, "xenfb", info);
68564 + if (ret < 0) {
68565 + xenbus_free_evtchn(dev, info->evtchn);
68566 + xenbus_dev_fatal(dev, ret, "bind_evtchn_to_irqhandler");
68567 + return ret;
68568 + }
68569 + info->irq = ret;
68570 +
68571 + again:
68572 + ret = xenbus_transaction_start(&xbt);
68573 + if (ret) {
68574 + xenbus_dev_fatal(dev, ret, "starting transaction");
68575 + return ret;
68576 + }
68577 + ret = xenbus_printf(xbt, dev->nodename, "page-ref", "%lu",
68578 + virt_to_mfn(info->page));
68579 + if (ret)
68580 + goto error_xenbus;
68581 + ret = xenbus_printf(xbt, dev->nodename, "event-channel", "%u",
68582 + info->evtchn);
68583 + if (ret)
68584 + goto error_xenbus;
68585 + ret = xenbus_printf(xbt, dev->nodename, "feature-update", "1");
68586 + if (ret)
68587 + goto error_xenbus;
68588 + ret = xenbus_transaction_end(xbt, 0);
68589 + if (ret) {
68590 + if (ret == -EAGAIN)
68591 + goto again;
68592 + xenbus_dev_fatal(dev, ret, "completing transaction");
68593 + return ret;
68594 + }
68595 +
68596 + xenbus_switch_state(dev, XenbusStateInitialised);
68597 + return 0;
68598 +
68599 + error_xenbus:
68600 + xenbus_transaction_end(xbt, 1);
68601 + xenbus_dev_fatal(dev, ret, "writing xenstore");
68602 + return ret;
68603 +}
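
Both frontends in this patch (xenfb above, xenkbd below) use the same xenstore handshake: allocate an unbound event channel, bind it to an irq handler, then publish the shared page's machine frame and the event channel inside a transaction that is restarted whenever xenbus_transaction_end() returns -EAGAIN. A condensed sketch of that retry idiom, using only the xenbus calls already seen above and a hypothetical helper name, is:

	/* Sketch only: publish a shared page and event channel to the backend. */
	static int publish_shared_page(struct xenbus_device *dev,
				       void *page, unsigned int evtchn)
	{
		struct xenbus_transaction xbt;
		int ret;

	 again:
		ret = xenbus_transaction_start(&xbt);
		if (ret)
			return ret;
		ret = xenbus_printf(xbt, dev->nodename, "page-ref", "%lu",
				    virt_to_mfn(page));
		if (!ret)
			ret = xenbus_printf(xbt, dev->nodename, "event-channel",
					    "%u", evtchn);
		if (ret) {
			xenbus_transaction_end(xbt, 1);	/* abort */
			return ret;
		}
		ret = xenbus_transaction_end(xbt, 0);	/* commit */
		if (ret == -EAGAIN)
			goto again;			/* raced with another writer */
		return ret;
	}
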
68604 +
68605 +static void xenfb_disconnect_backend(struct xenfb_info *info)
68606 +{
68607 + if (info->irq >= 0)
68608 + unbind_from_irqhandler(info->irq, info);
68609 + info->irq = -1;
68610 +}
68611 +
68612 +static void xenfb_backend_changed(struct xenbus_device *dev,
68613 + enum xenbus_state backend_state)
68614 +{
68615 + struct xenfb_info *info = dev->dev.driver_data;
68616 + int val;
68617 +
68618 + switch (backend_state) {
68619 + case XenbusStateInitialising:
68620 + case XenbusStateInitialised:
68621 + case XenbusStateUnknown:
68622 + case XenbusStateClosed:
68623 + break;
68624 +
68625 + case XenbusStateInitWait:
68626 + InitWait:
68627 + xenbus_switch_state(dev, XenbusStateConnected);
68628 + break;
68629 +
68630 + case XenbusStateConnected:
68631 + /*
68632 + * Work around xenbus race condition: If backend goes
68633 + * through InitWait to Connected fast enough, we can
68634 + * get Connected twice here.
68635 + */
68636 + if (dev->state != XenbusStateConnected)
68637 + goto InitWait; /* no InitWait seen yet, fudge it */
68638 +
68639 + if (xenbus_scanf(XBT_NIL, info->xbdev->otherend,
68640 + "request-update", "%d", &val) < 0)
68641 + val = 0;
68642 + if (val)
68643 + info->update_wanted = 1;
68644 + break;
68645 +
68646 + case XenbusStateClosing:
68647 + 		/* FIXME is this safe in any dev->state? */
68648 + xenbus_frontend_closed(dev);
68649 + break;
68650 + }
68651 +}
68652 +
68653 +static struct xenbus_device_id xenfb_ids[] = {
68654 + { "vfb" },
68655 + { "" }
68656 +};
68657 +
68658 +static struct xenbus_driver xenfb = {
68659 + .name = "vfb",
68660 + .owner = THIS_MODULE,
68661 + .ids = xenfb_ids,
68662 + .probe = xenfb_probe,
68663 + .remove = xenfb_remove,
68664 + .resume = xenfb_resume,
68665 + .otherend_changed = xenfb_backend_changed,
68666 +};
68667 +
68668 +static int __init xenfb_init(void)
68669 +{
68670 + if (!is_running_on_xen())
68671 + return -ENODEV;
68672 +
68673 + /* Nothing to do if running in dom0. */
68674 + if (is_initial_xendomain())
68675 + return -ENODEV;
68676 +
68677 + return xenbus_register_frontend(&xenfb);
68678 +}
68679 +
68680 +static void __exit xenfb_cleanup(void)
68681 +{
68682 + return xenbus_unregister_driver(&xenfb);
68683 +}
68684 +
68685 +module_init(xenfb_init);
68686 +module_exit(xenfb_cleanup);
68687 +
68688 +MODULE_LICENSE("GPL");
68689 diff -Nur linux-2.6.16.33-noxen/drivers/xen/fbfront/xenkbd.c linux-2.6.16.33/drivers/xen/fbfront/xenkbd.c
68690 --- linux-2.6.16.33-noxen/drivers/xen/fbfront/xenkbd.c 1970-01-01 00:00:00.000000000 +0000
68691 +++ linux-2.6.16.33/drivers/xen/fbfront/xenkbd.c 2007-01-08 15:00:45.000000000 +0000
68692 @@ -0,0 +1,300 @@
68693 +/*
68694 + * linux/drivers/input/keyboard/xenkbd.c -- Xen para-virtual input device
68695 + *
68696 + * Copyright (C) 2005 Anthony Liguori <aliguori@us.ibm.com>
68697 + * Copyright (C) 2006 Red Hat, Inc., Markus Armbruster <armbru@redhat.com>
68698 + *
68699 + * Based on linux/drivers/input/mouse/sermouse.c
68700 + *
68701 + * This file is subject to the terms and conditions of the GNU General Public
68702 + * License. See the file COPYING in the main directory of this archive for
68703 + * more details.
68704 + */
68705 +
68706 +/*
68707 + * TODO:
68708 + *
68709 + * Switch to grant tables together with xenfb.c.
68710 + */
68711 +
68712 +#include <linux/kernel.h>
68713 +#include <linux/errno.h>
68714 +#include <linux/module.h>
68715 +#include <linux/input.h>
68716 +#include <asm/hypervisor.h>
68717 +#include <xen/evtchn.h>
68718 +#include <xen/interface/io/fbif.h>
68719 +#include <xen/interface/io/kbdif.h>
68720 +#include <xen/xenbus.h>
68721 +
68722 +struct xenkbd_info
68723 +{
68724 + struct input_dev *dev;
68725 + struct xenkbd_page *page;
68726 + unsigned evtchn;
68727 + int irq;
68728 + struct xenbus_device *xbdev;
68729 +};
68730 +
68731 +static int xenkbd_remove(struct xenbus_device *);
68732 +static int xenkbd_connect_backend(struct xenbus_device *, struct xenkbd_info *);
68733 +static void xenkbd_disconnect_backend(struct xenkbd_info *);
68734 +
68735 +/*
68736 + * Note: if you need to send out events, see xenfb_do_update() for how
68737 + * to do that.
68738 + */
68739 +
68740 +static irqreturn_t input_handler(int irq, void *dev_id, struct pt_regs *regs)
68741 +{
68742 + struct xenkbd_info *info = dev_id;
68743 + struct xenkbd_page *page = info->page;
68744 + __u32 cons, prod;
68745 +
68746 + prod = page->in_prod;
68747 + if (prod == page->out_cons)
68748 + return IRQ_HANDLED;
68749 + rmb(); /* ensure we see ring contents up to prod */
68750 + for (cons = page->in_cons; cons != prod; cons++) {
68751 + union xenkbd_in_event *event;
68752 + event = &XENKBD_IN_RING_REF(page, cons);
68753 +
68754 + switch (event->type) {
68755 + case XENKBD_TYPE_MOTION:
68756 + input_report_rel(info->dev, REL_X, event->motion.rel_x);
68757 + input_report_rel(info->dev, REL_Y, event->motion.rel_y);
68758 + break;
68759 + case XENKBD_TYPE_KEY:
68760 + input_report_key(info->dev, event->key.keycode, event->key.pressed);
68761 + break;
68762 + case XENKBD_TYPE_POS:
68763 + input_report_abs(info->dev, ABS_X, event->pos.abs_x);
68764 + input_report_abs(info->dev, ABS_Y, event->pos.abs_y);
68765 + break;
68766 + }
68767 + }
68768 + input_sync(info->dev);
68769 + mb(); /* ensure we got ring contents */
68770 + page->in_cons = cons;
68771 + notify_remote_via_evtchn(info->evtchn);
68772 +
68773 + return IRQ_HANDLED;
68774 +}
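
The handler above only drains the IN ring (backend to frontend). The note at the top of this file defers the producer direction to xenfb_do_update(); the same pattern on the keyboard device would look roughly like the sketch below, assuming kbdif.h defines union xenkbd_out_event, XENKBD_OUT_RING_LEN and XENKBD_OUT_RING_REF analogously to the IN-ring macros used above (hypothetical helper, not part of the patch):

	/* Sketch: queue one event on the OUT ring and kick the backend. */
	static void xenkbd_send_event_sketch(struct xenkbd_info *info,
					     union xenkbd_out_event *event)
	{
		struct xenkbd_page *page = info->page;
		__u32 prod = page->out_prod;

		if (prod - page->out_cons == XENKBD_OUT_RING_LEN)
			return;			/* ring full, drop the event */
		XENKBD_OUT_RING_REF(page, prod) = *event;
		wmb();				/* write event before updating index */
		page->out_prod = prod + 1;
		notify_remote_via_evtchn(info->evtchn);
	}
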
68775 +
68776 +int __devinit xenkbd_probe(struct xenbus_device *dev,
68777 + const struct xenbus_device_id *id)
68778 +{
68779 + int ret, i;
68780 + struct xenkbd_info *info;
68781 + struct input_dev *input_dev;
68782 +
68783 + info = kzalloc(sizeof(*info), GFP_KERNEL);
68784 + if (!info) {
68785 + xenbus_dev_fatal(dev, -ENOMEM, "allocating info structure");
68786 + return -ENOMEM;
68787 + }
68788 + dev->dev.driver_data = info;
68789 + info->xbdev = dev;
68790 +
68791 + info->page = (void *)__get_free_page(GFP_KERNEL);
68792 + if (!info->page)
68793 + goto error_nomem;
68794 + info->page->in_cons = info->page->in_prod = 0;
68795 + info->page->out_cons = info->page->out_prod = 0;
68796 +
68797 + input_dev = input_allocate_device();
68798 + if (!input_dev)
68799 + goto error_nomem;
68800 +
68801 + input_dev->evbit[0] = BIT(EV_KEY) | BIT(EV_REL) | BIT(EV_ABS);
68802 + input_dev->keybit[LONG(BTN_MOUSE)]
68803 + = BIT(BTN_LEFT) | BIT(BTN_MIDDLE) | BIT(BTN_RIGHT);
68804 + /* TODO additional buttons */
68805 + input_dev->relbit[0] = BIT(REL_X) | BIT(REL_Y);
68806 +
68807 + /* FIXME not sure this is quite right */
68808 + for (i = 0; i < 256; i++)
68809 + set_bit(i, input_dev->keybit);
68810 +
68811 + input_dev->name = "Xen Virtual Keyboard/Mouse";
68812 +
68813 + input_set_abs_params(input_dev, ABS_X, 0, XENFB_WIDTH, 0, 0);
68814 + input_set_abs_params(input_dev, ABS_Y, 0, XENFB_HEIGHT, 0, 0);
68815 +
68816 + ret = input_register_device(input_dev);
68817 + if (ret) {
68818 + input_free_device(input_dev);
68819 + xenbus_dev_fatal(dev, ret, "input_register_device");
68820 + goto error;
68821 + }
68822 + info->dev = input_dev;
68823 +
68824 + ret = xenkbd_connect_backend(dev, info);
68825 + if (ret < 0)
68826 + goto error;
68827 +
68828 + return 0;
68829 +
68830 + error_nomem:
68831 + ret = -ENOMEM;
68832 + xenbus_dev_fatal(dev, ret, "allocating device memory");
68833 + error:
68834 + xenkbd_remove(dev);
68835 + return ret;
68836 +}
68837 +
68838 +static int xenkbd_resume(struct xenbus_device *dev)
68839 +{
68840 + struct xenkbd_info *info = dev->dev.driver_data;
68841 +
68842 + xenkbd_disconnect_backend(info);
68843 + return xenkbd_connect_backend(dev, info);
68844 +}
68845 +
68846 +static int xenkbd_remove(struct xenbus_device *dev)
68847 +{
68848 + struct xenkbd_info *info = dev->dev.driver_data;
68849 +
68850 + xenkbd_disconnect_backend(info);
68851 + input_unregister_device(info->dev);
68852 + free_page((unsigned long)info->page);
68853 + kfree(info);
68854 + return 0;
68855 +}
68856 +
68857 +static int xenkbd_connect_backend(struct xenbus_device *dev,
68858 + struct xenkbd_info *info)
68859 +{
68860 + int ret;
68861 + struct xenbus_transaction xbt;
68862 +
68863 + ret = xenbus_alloc_evtchn(dev, &info->evtchn);
68864 + if (ret)
68865 + return ret;
68866 + ret = bind_evtchn_to_irqhandler(info->evtchn, input_handler, 0,
68867 + "xenkbd", info);
68868 + if (ret < 0) {
68869 + xenbus_free_evtchn(dev, info->evtchn);
68870 + xenbus_dev_fatal(dev, ret, "bind_evtchn_to_irqhandler");
68871 + return ret;
68872 + }
68873 + info->irq = ret;
68874 +
68875 + again:
68876 + ret = xenbus_transaction_start(&xbt);
68877 + if (ret) {
68878 + xenbus_dev_fatal(dev, ret, "starting transaction");
68879 + return ret;
68880 + }
68881 + ret = xenbus_printf(xbt, dev->nodename, "page-ref", "%lu",
68882 + virt_to_mfn(info->page));
68883 + if (ret)
68884 + goto error_xenbus;
68885 + ret = xenbus_printf(xbt, dev->nodename, "event-channel", "%u",
68886 + info->evtchn);
68887 + if (ret)
68888 + goto error_xenbus;
68889 + ret = xenbus_transaction_end(xbt, 0);
68890 + if (ret) {
68891 + if (ret == -EAGAIN)
68892 + goto again;
68893 + xenbus_dev_fatal(dev, ret, "completing transaction");
68894 + return ret;
68895 + }
68896 +
68897 + xenbus_switch_state(dev, XenbusStateInitialised);
68898 + return 0;
68899 +
68900 + error_xenbus:
68901 + xenbus_transaction_end(xbt, 1);
68902 + xenbus_dev_fatal(dev, ret, "writing xenstore");
68903 + return ret;
68904 +}
68905 +
68906 +static void xenkbd_disconnect_backend(struct xenkbd_info *info)
68907 +{
68908 + if (info->irq >= 0)
68909 + unbind_from_irqhandler(info->irq, info);
68910 + info->irq = -1;
68911 +}
68912 +
68913 +static void xenkbd_backend_changed(struct xenbus_device *dev,
68914 + enum xenbus_state backend_state)
68915 +{
68916 + struct xenkbd_info *info = dev->dev.driver_data;
68917 + int ret, val;
68918 +
68919 + switch (backend_state) {
68920 + case XenbusStateInitialising:
68921 + case XenbusStateInitialised:
68922 + case XenbusStateUnknown:
68923 + case XenbusStateClosed:
68924 + break;
68925 +
68926 + case XenbusStateInitWait:
68927 + InitWait:
68928 + ret = xenbus_scanf(XBT_NIL, info->xbdev->otherend,
68929 + "feature-abs-pointer", "%d", &val);
68930 + if (ret < 0)
68931 + val = 0;
68932 + if (val) {
68933 + ret = xenbus_printf(XBT_NIL, info->xbdev->nodename,
68934 + "request-abs-pointer", "1");
68935 + if (ret)
68936 + ; /* FIXME */
68937 + }
68938 + xenbus_switch_state(dev, XenbusStateConnected);
68939 + break;
68940 +
68941 + case XenbusStateConnected:
68942 + /*
68943 + * Work around xenbus race condition: If backend goes
68944 + * through InitWait to Connected fast enough, we can
68945 + * get Connected twice here.
68946 + */
68947 + if (dev->state != XenbusStateConnected)
68948 + goto InitWait; /* no InitWait seen yet, fudge it */
68949 + break;
68950 +
68951 + case XenbusStateClosing:
68952 + xenbus_frontend_closed(dev);
68953 + break;
68954 + }
68955 +}
68956 +
68957 +static struct xenbus_device_id xenkbd_ids[] = {
68958 + { "vkbd" },
68959 + { "" }
68960 +};
68961 +
68962 +static struct xenbus_driver xenkbd = {
68963 + .name = "vkbd",
68964 + .owner = THIS_MODULE,
68965 + .ids = xenkbd_ids,
68966 + .probe = xenkbd_probe,
68967 + .remove = xenkbd_remove,
68968 + .resume = xenkbd_resume,
68969 + .otherend_changed = xenkbd_backend_changed,
68970 +};
68971 +
68972 +static int __init xenkbd_init(void)
68973 +{
68974 + if (!is_running_on_xen())
68975 + return -ENODEV;
68976 +
68977 + /* Nothing to do if running in dom0. */
68978 + if (is_initial_xendomain())
68979 + return -ENODEV;
68980 +
68981 + return xenbus_register_frontend(&xenkbd);
68982 +}
68983 +
68984 +static void __exit xenkbd_cleanup(void)
68985 +{
68986 + return xenbus_unregister_driver(&xenkbd);
68987 +}
68988 +
68989 +module_init(xenkbd_init);
68990 +module_exit(xenkbd_cleanup);
68991 +
68992 +MODULE_LICENSE("GPL");
68993 diff -Nur linux-2.6.16.33-noxen/drivers/xen/netback/Makefile linux-2.6.16.33/drivers/xen/netback/Makefile
68994 --- linux-2.6.16.33-noxen/drivers/xen/netback/Makefile 1970-01-01 00:00:00.000000000 +0000
68995 +++ linux-2.6.16.33/drivers/xen/netback/Makefile 2007-01-08 15:00:45.000000000 +0000
68996 @@ -0,0 +1,5 @@
68997 +obj-$(CONFIG_XEN_NETDEV_BACKEND) := netbk.o
68998 +obj-$(CONFIG_XEN_NETDEV_LOOPBACK) += netloop.o
68999 +
69000 +netbk-y := netback.o xenbus.o interface.o
69001 +netloop-y := loopback.o
69002 diff -Nur linux-2.6.16.33-noxen/drivers/xen/netback/common.h linux-2.6.16.33/drivers/xen/netback/common.h
69003 --- linux-2.6.16.33-noxen/drivers/xen/netback/common.h 1970-01-01 00:00:00.000000000 +0000
69004 +++ linux-2.6.16.33/drivers/xen/netback/common.h 2007-01-08 15:00:45.000000000 +0000
69005 @@ -0,0 +1,146 @@
69006 +/******************************************************************************
69007 + * arch/xen/drivers/netif/backend/common.h
69008 + *
69009 + * This program is free software; you can redistribute it and/or
69010 + * modify it under the terms of the GNU General Public License version 2
69011 + * as published by the Free Software Foundation; or, when distributed
69012 + * separately from the Linux kernel or incorporated into other
69013 + * software packages, subject to the following license:
69014 + *
69015 + * Permission is hereby granted, free of charge, to any person obtaining a copy
69016 + * of this source file (the "Software"), to deal in the Software without
69017 + * restriction, including without limitation the rights to use, copy, modify,
69018 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
69019 + * and to permit persons to whom the Software is furnished to do so, subject to
69020 + * the following conditions:
69021 + *
69022 + * The above copyright notice and this permission notice shall be included in
69023 + * all copies or substantial portions of the Software.
69024 + *
69025 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
69026 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
69027 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
69028 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
69029 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
69030 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
69031 + * IN THE SOFTWARE.
69032 + */
69033 +
69034 +#ifndef __NETIF__BACKEND__COMMON_H__
69035 +#define __NETIF__BACKEND__COMMON_H__
69036 +
69037 +#include <linux/config.h>
69038 +#include <linux/version.h>
69039 +#include <linux/module.h>
69040 +#include <linux/interrupt.h>
69041 +#include <linux/slab.h>
69042 +#include <linux/ip.h>
69043 +#include <linux/in.h>
69044 +#include <linux/netdevice.h>
69045 +#include <linux/etherdevice.h>
69046 +#include <linux/wait.h>
69047 +#include <xen/evtchn.h>
69048 +#include <xen/interface/io/netif.h>
69049 +#include <asm/io.h>
69050 +#include <asm/pgalloc.h>
69051 +#include <xen/interface/grant_table.h>
69052 +#include <xen/gnttab.h>
69053 +#include <xen/driver_util.h>
69054 +
69055 +#define DPRINTK(_f, _a...) \
69056 + pr_debug("(file=%s, line=%d) " _f, \
69057 + __FILE__ , __LINE__ , ## _a )
69058 +#define IPRINTK(fmt, args...) \
69059 + printk(KERN_INFO "xen_net: " fmt, ##args)
69060 +#define WPRINTK(fmt, args...) \
69061 + printk(KERN_WARNING "xen_net: " fmt, ##args)
69062 +
69063 +typedef struct netif_st {
69064 + /* Unique identifier for this interface. */
69065 + domid_t domid;
69066 + unsigned int handle;
69067 +
69068 + u8 fe_dev_addr[6];
69069 +
69070 + /* Physical parameters of the comms window. */
69071 + grant_handle_t tx_shmem_handle;
69072 + grant_ref_t tx_shmem_ref;
69073 + grant_handle_t rx_shmem_handle;
69074 + grant_ref_t rx_shmem_ref;
69075 + unsigned int evtchn;
69076 + unsigned int irq;
69077 +
69078 + /* The shared rings and indexes. */
69079 + netif_tx_back_ring_t tx;
69080 + netif_rx_back_ring_t rx;
69081 + struct vm_struct *tx_comms_area;
69082 + struct vm_struct *rx_comms_area;
69083 +
69084 + /* Set of features that can be turned on in dev->features. */
69085 + int features;
69086 +
69087 + /* Internal feature information. */
69088 + int can_queue:1; /* can queue packets for receiver? */
69089 + int copying_receiver:1; /* copy packets to receiver? */
69090 +
69091 + /* Allow netif_be_start_xmit() to peek ahead in the rx request ring. */
69092 + RING_IDX rx_req_cons_peek;
69093 +
69094 + /* Transmit shaping: allow 'credit_bytes' every 'credit_usec'. */
69095 + unsigned long credit_bytes;
69096 + unsigned long credit_usec;
69097 + unsigned long remaining_credit;
69098 + struct timer_list credit_timeout;
69099 +
69100 + /* Enforce draining of the transmit queue. */
69101 + struct timer_list tx_queue_timeout;
69102 +
69103 + /* Miscellaneous private stuff. */
69104 + struct list_head list; /* scheduling list */
69105 + atomic_t refcnt;
69106 + struct net_device *dev;
69107 + struct net_device_stats stats;
69108 +
69109 + wait_queue_head_t waiting_to_free;
69110 +} netif_t;
69111 +
69112 +#define NET_TX_RING_SIZE __RING_SIZE((netif_tx_sring_t *)0, PAGE_SIZE)
69113 +#define NET_RX_RING_SIZE __RING_SIZE((netif_rx_sring_t *)0, PAGE_SIZE)
69114 +
69115 +void netif_disconnect(netif_t *netif);
69116 +
69117 +netif_t *netif_alloc(domid_t domid, unsigned int handle);
69118 +int netif_map(netif_t *netif, unsigned long tx_ring_ref,
69119 + unsigned long rx_ring_ref, unsigned int evtchn);
69120 +
69121 +#define netif_get(_b) (atomic_inc(&(_b)->refcnt))
69122 +#define netif_put(_b) \
69123 + do { \
69124 + if ( atomic_dec_and_test(&(_b)->refcnt) ) \
69125 + wake_up(&(_b)->waiting_to_free); \
69126 + } while (0)
69127 +
69128 +void netif_xenbus_init(void);
69129 +
69130 +#define netif_schedulable(dev) (netif_running(dev) && netif_carrier_ok(dev))
69131 +
69132 +void netif_schedule_work(netif_t *netif);
69133 +void netif_deschedule_work(netif_t *netif);
69134 +
69135 +int netif_be_start_xmit(struct sk_buff *skb, struct net_device *dev);
69136 +struct net_device_stats *netif_be_get_stats(struct net_device *dev);
69137 +irqreturn_t netif_be_int(int irq, void *dev_id, struct pt_regs *regs);
69138 +
69139 +static inline int netbk_can_queue(struct net_device *dev)
69140 +{
69141 + netif_t *netif = netdev_priv(dev);
69142 + return netif->can_queue;
69143 +}
69144 +
69145 +static inline int netbk_can_sg(struct net_device *dev)
69146 +{
69147 + netif_t *netif = netdev_priv(dev);
69148 + return netif->features & NETIF_F_SG;
69149 +}
69150 +
69151 +#endif /* __NETIF__BACKEND__COMMON_H__ */
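
netif_get()/netif_put() pair with the waiting_to_free wait queue above: netif_disconnect() (interface.c, below) drops the initial reference and then sleeps until the count hits zero, so any code that keeps a netif_t in flight across deferred work must hold its own reference. A minimal sketch of the intended usage, with a hypothetical function name, is:

	/* Sketch: hold a reference across deferred work so that
	 * netif_disconnect()'s wait_event() cannot complete early. */
	static void example_deferred_work(netif_t *netif)
	{
		netif_get(netif);	/* pin the interface */
		/* ... queue work that still touches 'netif' ... */
		/* later, once that work has finished: */
		netif_put(netif);	/* may wake waiting_to_free */
	}
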
69152 diff -Nur linux-2.6.16.33-noxen/drivers/xen/netback/interface.c linux-2.6.16.33/drivers/xen/netback/interface.c
69153 --- linux-2.6.16.33-noxen/drivers/xen/netback/interface.c 1970-01-01 00:00:00.000000000 +0000
69154 +++ linux-2.6.16.33/drivers/xen/netback/interface.c 2007-01-08 15:00:45.000000000 +0000
69155 @@ -0,0 +1,349 @@
69156 +/******************************************************************************
69157 + * arch/xen/drivers/netif/backend/interface.c
69158 + *
69159 + * Network-device interface management.
69160 + *
69161 + * Copyright (c) 2004-2005, Keir Fraser
69162 + *
69163 + * This program is free software; you can redistribute it and/or
69164 + * modify it under the terms of the GNU General Public License version 2
69165 + * as published by the Free Software Foundation; or, when distributed
69166 + * separately from the Linux kernel or incorporated into other
69167 + * software packages, subject to the following license:
69168 + *
69169 + * Permission is hereby granted, free of charge, to any person obtaining a copy
69170 + * of this source file (the "Software"), to deal in the Software without
69171 + * restriction, including without limitation the rights to use, copy, modify,
69172 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
69173 + * and to permit persons to whom the Software is furnished to do so, subject to
69174 + * the following conditions:
69175 + *
69176 + * The above copyright notice and this permission notice shall be included in
69177 + * all copies or substantial portions of the Software.
69178 + *
69179 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
69180 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
69181 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
69182 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
69183 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
69184 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
69185 + * IN THE SOFTWARE.
69186 + */
69187 +
69188 +#include "common.h"
69189 +#include <linux/ethtool.h>
69190 +#include <linux/rtnetlink.h>
69191 +
69192 +/*
69193 + * Module parameter 'queue_length':
69194 + *
69195 + * Enables queuing in the network stack when a client has run out of receive
69196 + * descriptors. Although this feature can improve receive bandwidth by avoiding
69197 + * packet loss, it can also result in packets sitting in the 'tx_queue' for
69198 + * an unbounded time. This is bad if those packets hold onto foreign resources.
69199 + * For example, consider a packet that holds onto resources belonging to the
69200 + * guest for which it is queued (e.g., a packet received on vif1.0, destined for
69201 + * vif1.1, which is not activated in the guest): in this situation the guest
69202 + * will never be destroyed, unless vif1.1 is taken down. To avoid this, we
69203 + * run a timer (tx_queue_timeout) to drain the queue when the interface is
69204 + * blocked.
69205 + */
69206 +static unsigned long netbk_queue_length = 32;
69207 +module_param_named(queue_length, netbk_queue_length, ulong, 0);
69208 +
69209 +static void __netif_up(netif_t *netif)
69210 +{
69211 + enable_irq(netif->irq);
69212 + netif_schedule_work(netif);
69213 +}
69214 +
69215 +static void __netif_down(netif_t *netif)
69216 +{
69217 + disable_irq(netif->irq);
69218 + netif_deschedule_work(netif);
69219 +}
69220 +
69221 +static int net_open(struct net_device *dev)
69222 +{
69223 + netif_t *netif = netdev_priv(dev);
69224 + if (netif_carrier_ok(dev))
69225 + __netif_up(netif);
69226 + return 0;
69227 +}
69228 +
69229 +static int net_close(struct net_device *dev)
69230 +{
69231 + netif_t *netif = netdev_priv(dev);
69232 + if (netif_carrier_ok(dev))
69233 + __netif_down(netif);
69234 + return 0;
69235 +}
69236 +
69237 +static int netbk_change_mtu(struct net_device *dev, int mtu)
69238 +{
69239 + int max = netbk_can_sg(dev) ? 65535 - ETH_HLEN : ETH_DATA_LEN;
69240 +
69241 + if (mtu > max)
69242 + return -EINVAL;
69243 + dev->mtu = mtu;
69244 + return 0;
69245 +}
69246 +
69247 +static int netbk_set_sg(struct net_device *dev, u32 data)
69248 +{
69249 + if (data) {
69250 + netif_t *netif = netdev_priv(dev);
69251 +
69252 + if (!(netif->features & NETIF_F_SG))
69253 + return -ENOSYS;
69254 + }
69255 +
69256 + return ethtool_op_set_sg(dev, data);
69257 +}
69258 +
69259 +static int netbk_set_tso(struct net_device *dev, u32 data)
69260 +{
69261 + if (data) {
69262 + netif_t *netif = netdev_priv(dev);
69263 +
69264 + if (!(netif->features & NETIF_F_TSO))
69265 + return -ENOSYS;
69266 + }
69267 +
69268 + return ethtool_op_set_tso(dev, data);
69269 +}
69270 +
69271 +static struct ethtool_ops network_ethtool_ops =
69272 +{
69273 + .get_tx_csum = ethtool_op_get_tx_csum,
69274 + .set_tx_csum = ethtool_op_set_tx_csum,
69275 + .get_sg = ethtool_op_get_sg,
69276 + .set_sg = netbk_set_sg,
69277 + .get_tso = ethtool_op_get_tso,
69278 + .set_tso = netbk_set_tso,
69279 + .get_link = ethtool_op_get_link,
69280 +};
69281 +
69282 +netif_t *netif_alloc(domid_t domid, unsigned int handle)
69283 +{
69284 + int err = 0;
69285 + struct net_device *dev;
69286 + netif_t *netif;
69287 + char name[IFNAMSIZ] = {};
69288 +
69289 + snprintf(name, IFNAMSIZ - 1, "vif%u.%u", domid, handle);
69290 + dev = alloc_netdev(sizeof(netif_t), name, ether_setup);
69291 + if (dev == NULL) {
69292 + DPRINTK("Could not create netif: out of memory\n");
69293 + return ERR_PTR(-ENOMEM);
69294 + }
69295 +
69296 + netif_carrier_off(dev);
69297 +
69298 + netif = netdev_priv(dev);
69299 + memset(netif, 0, sizeof(*netif));
69300 + netif->domid = domid;
69301 + netif->handle = handle;
69302 + atomic_set(&netif->refcnt, 1);
69303 + init_waitqueue_head(&netif->waiting_to_free);
69304 + netif->dev = dev;
69305 +
69306 + netif->credit_bytes = netif->remaining_credit = ~0UL;
69307 + netif->credit_usec = 0UL;
69308 + init_timer(&netif->credit_timeout);
69309 + /* Initialize 'expires' now: it's used to track the credit window. */
69310 + netif->credit_timeout.expires = jiffies;
69311 +
69312 + init_timer(&netif->tx_queue_timeout);
69313 +
69314 + dev->hard_start_xmit = netif_be_start_xmit;
69315 + dev->get_stats = netif_be_get_stats;
69316 + dev->open = net_open;
69317 + dev->stop = net_close;
69318 + dev->change_mtu = netbk_change_mtu;
69319 + dev->features = NETIF_F_IP_CSUM;
69320 +
69321 + SET_ETHTOOL_OPS(dev, &network_ethtool_ops);
69322 +
69323 + dev->tx_queue_len = netbk_queue_length;
69324 +
69325 + /*
69326 + * Initialise a dummy MAC address. We choose the numerically
69327 + * largest non-broadcast address to prevent the address getting
69328 + * stolen by an Ethernet bridge for STP purposes.
69329 + * (FE:FF:FF:FF:FF:FF)
69330 + */
69331 + memset(dev->dev_addr, 0xFF, ETH_ALEN);
69332 + dev->dev_addr[0] &= ~0x01;
69333 +
69334 + rtnl_lock();
69335 + err = register_netdevice(dev);
69336 + rtnl_unlock();
69337 + if (err) {
69338 + DPRINTK("Could not register new net device %s: err=%d\n",
69339 + dev->name, err);
69340 + free_netdev(dev);
69341 + return ERR_PTR(err);
69342 + }
69343 +
69344 + DPRINTK("Successfully created netif\n");
69345 + return netif;
69346 +}
69347 +
69348 +static int map_frontend_pages(
69349 + netif_t *netif, grant_ref_t tx_ring_ref, grant_ref_t rx_ring_ref)
69350 +{
69351 + struct gnttab_map_grant_ref op;
69352 + int ret;
69353 +
69354 + gnttab_set_map_op(&op, (unsigned long)netif->tx_comms_area->addr,
69355 + GNTMAP_host_map, tx_ring_ref, netif->domid);
69356 +
69357 + lock_vm_area(netif->tx_comms_area);
69358 + ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1);
69359 + unlock_vm_area(netif->tx_comms_area);
69360 + BUG_ON(ret);
69361 +
69362 + if (op.status) {
69363 + DPRINTK(" Gnttab failure mapping tx_ring_ref!\n");
69364 + return op.status;
69365 + }
69366 +
69367 + netif->tx_shmem_ref = tx_ring_ref;
69368 + netif->tx_shmem_handle = op.handle;
69369 +
69370 + gnttab_set_map_op(&op, (unsigned long)netif->rx_comms_area->addr,
69371 + GNTMAP_host_map, rx_ring_ref, netif->domid);
69372 +
69373 + lock_vm_area(netif->rx_comms_area);
69374 + ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1);
69375 + unlock_vm_area(netif->rx_comms_area);
69376 + BUG_ON(ret);
69377 +
69378 + if (op.status) {
69379 + DPRINTK(" Gnttab failure mapping rx_ring_ref!\n");
69380 + return op.status;
69381 + }
69382 +
69383 + netif->rx_shmem_ref = rx_ring_ref;
69384 + netif->rx_shmem_handle = op.handle;
69385 +
69386 + return 0;
69387 +}
69388 +
69389 +static void unmap_frontend_pages(netif_t *netif)
69390 +{
69391 + struct gnttab_unmap_grant_ref op;
69392 + int ret;
69393 +
69394 + gnttab_set_unmap_op(&op, (unsigned long)netif->tx_comms_area->addr,
69395 + GNTMAP_host_map, netif->tx_shmem_handle);
69396 +
69397 + lock_vm_area(netif->tx_comms_area);
69398 + ret = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1);
69399 + unlock_vm_area(netif->tx_comms_area);
69400 + BUG_ON(ret);
69401 +
69402 + gnttab_set_unmap_op(&op, (unsigned long)netif->rx_comms_area->addr,
69403 + GNTMAP_host_map, netif->rx_shmem_handle);
69404 +
69405 + lock_vm_area(netif->rx_comms_area);
69406 + ret = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1);
69407 + unlock_vm_area(netif->rx_comms_area);
69408 + BUG_ON(ret);
69409 +}
69410 +
69411 +int netif_map(netif_t *netif, unsigned long tx_ring_ref,
69412 + unsigned long rx_ring_ref, unsigned int evtchn)
69413 +{
69414 + int err = -ENOMEM;
69415 + netif_tx_sring_t *txs;
69416 + netif_rx_sring_t *rxs;
69417 + struct evtchn_bind_interdomain bind_interdomain;
69418 +
69419 + /* Already connected through? */
69420 + if (netif->irq)
69421 + return 0;
69422 +
69423 + netif->tx_comms_area = alloc_vm_area(PAGE_SIZE);
69424 + if (netif->tx_comms_area == NULL)
69425 + return -ENOMEM;
69426 + netif->rx_comms_area = alloc_vm_area(PAGE_SIZE);
69427 + if (netif->rx_comms_area == NULL)
69428 + goto err_rx;
69429 +
69430 + err = map_frontend_pages(netif, tx_ring_ref, rx_ring_ref);
69431 + if (err)
69432 + goto err_map;
69433 +
69434 + bind_interdomain.remote_dom = netif->domid;
69435 + bind_interdomain.remote_port = evtchn;
69436 +
69437 + err = HYPERVISOR_event_channel_op(EVTCHNOP_bind_interdomain,
69438 + &bind_interdomain);
69439 + if (err)
69440 + goto err_hypervisor;
69441 +
69442 + netif->evtchn = bind_interdomain.local_port;
69443 +
69444 + netif->irq = bind_evtchn_to_irqhandler(
69445 + netif->evtchn, netif_be_int, 0, netif->dev->name, netif);
69446 + disable_irq(netif->irq);
69447 +
69448 + txs = (netif_tx_sring_t *)netif->tx_comms_area->addr;
69449 + BACK_RING_INIT(&netif->tx, txs, PAGE_SIZE);
69450 +
69451 + rxs = (netif_rx_sring_t *)
69452 + ((char *)netif->rx_comms_area->addr);
69453 + BACK_RING_INIT(&netif->rx, rxs, PAGE_SIZE);
69454 +
69455 + netif->rx_req_cons_peek = 0;
69456 +
69457 + netif_get(netif);
69458 +
69459 + rtnl_lock();
69460 + netif_carrier_on(netif->dev);
69461 + if (netif_running(netif->dev))
69462 + __netif_up(netif);
69463 + rtnl_unlock();
69464 +
69465 + return 0;
69466 +err_hypervisor:
69467 + unmap_frontend_pages(netif);
69468 +err_map:
69469 + free_vm_area(netif->rx_comms_area);
69470 +err_rx:
69471 + free_vm_area(netif->tx_comms_area);
69472 + return err;
69473 +}
69474 +
69475 +void netif_disconnect(netif_t *netif)
69476 +{
69477 + if (netif_carrier_ok(netif->dev)) {
69478 + rtnl_lock();
69479 + netif_carrier_off(netif->dev);
69480 + if (netif_running(netif->dev))
69481 + __netif_down(netif);
69482 + rtnl_unlock();
69483 + netif_put(netif);
69484 + }
69485 +
69486 + atomic_dec(&netif->refcnt);
69487 + wait_event(netif->waiting_to_free, atomic_read(&netif->refcnt) == 0);
69488 +
69489 + del_timer_sync(&netif->credit_timeout);
69490 + del_timer_sync(&netif->tx_queue_timeout);
69491 +
69492 + if (netif->irq)
69493 + unbind_from_irqhandler(netif->irq, netif);
69494 +
69495 + unregister_netdev(netif->dev);
69496 +
69497 + if (netif->tx.sring) {
69498 + unmap_frontend_pages(netif);
69499 + free_vm_area(netif->tx_comms_area);
69500 + free_vm_area(netif->rx_comms_area);
69501 + }
69502 +
69503 + free_netdev(netif->dev);
69504 +}
69505 diff -Nur linux-2.6.16.33-noxen/drivers/xen/netback/loopback.c linux-2.6.16.33/drivers/xen/netback/loopback.c
69506 --- linux-2.6.16.33-noxen/drivers/xen/netback/loopback.c 1970-01-01 00:00:00.000000000 +0000
69507 +++ linux-2.6.16.33/drivers/xen/netback/loopback.c 2007-01-08 15:00:45.000000000 +0000
69508 @@ -0,0 +1,321 @@
69509 +/******************************************************************************
69510 + * netback/loopback.c
69511 + *
69512 + * A two-interface loopback device to emulate a local netfront-netback
69513 + * connection. This ensures that local packet delivery looks identical
69514 + * to inter-domain delivery. Most importantly, packets delivered locally
69515 + * originating from other domains will get *copied* when they traverse this
69516 + * driver. This prevents unbounded delays in socket-buffer queues from
69517 + * causing the netback driver to "seize up".
69518 + *
69519 + * This driver creates a symmetric pair of loopback interfaces with names
69520 + * vif0.0 and veth0. The intention is that 'vif0.0' is bound to an Ethernet
69521 + * bridge, just like a proper netback interface, while a local IP interface
69522 + * is configured on 'veth0'.
69523 + *
69524 + * As with a real netback interface, vif0.0 is configured with a suitable
69525 + * dummy MAC address. No default is provided for veth0: a reasonable strategy
69526 + * is to transfer eth0's MAC address to veth0, and give eth0 a dummy address
69527 + * (to avoid confusing the Etherbridge).
69528 + *
69529 + * Copyright (c) 2005 K A Fraser
69530 + *
69531 + * This program is free software; you can redistribute it and/or
69532 + * modify it under the terms of the GNU General Public License version 2
69533 + * as published by the Free Software Foundation; or, when distributed
69534 + * separately from the Linux kernel or incorporated into other
69535 + * software packages, subject to the following license:
69536 + *
69537 + * Permission is hereby granted, free of charge, to any person obtaining a copy
69538 + * of this source file (the "Software"), to deal in the Software without
69539 + * restriction, including without limitation the rights to use, copy, modify,
69540 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
69541 + * and to permit persons to whom the Software is furnished to do so, subject to
69542 + * the following conditions:
69543 + *
69544 + * The above copyright notice and this permission notice shall be included in
69545 + * all copies or substantial portions of the Software.
69546 + *
69547 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
69548 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
69549 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
69550 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
69551 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
69552 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
69553 + * IN THE SOFTWARE.
69554 + */
69555 +
69556 +#include <linux/config.h>
69557 +#include <linux/module.h>
69558 +#include <linux/netdevice.h>
69559 +#include <linux/inetdevice.h>
69560 +#include <linux/etherdevice.h>
69561 +#include <linux/skbuff.h>
69562 +#include <linux/ethtool.h>
69563 +#include <net/dst.h>
69564 +#include <net/xfrm.h> /* secpath_reset() */
69565 +#include <asm/hypervisor.h> /* is_initial_xendomain() */
69566 +
69567 +static int nloopbacks = -1;
69568 +module_param(nloopbacks, int, 0);
69569 +MODULE_PARM_DESC(nloopbacks, "Number of netback-loopback devices to create");
69570 +
69571 +struct net_private {
69572 + struct net_device *loopback_dev;
69573 + struct net_device_stats stats;
69574 +};
69575 +
69576 +static int loopback_open(struct net_device *dev)
69577 +{
69578 + struct net_private *np = netdev_priv(dev);
69579 + memset(&np->stats, 0, sizeof(np->stats));
69580 + netif_start_queue(dev);
69581 + return 0;
69582 +}
69583 +
69584 +static int loopback_close(struct net_device *dev)
69585 +{
69586 + netif_stop_queue(dev);
69587 + return 0;
69588 +}
69589 +
69590 +#ifdef CONFIG_X86
69591 +static int is_foreign(unsigned long pfn)
69592 +{
69593 + /* NB. Play it safe for auto-translation mode. */
69594 + return (xen_feature(XENFEAT_auto_translated_physmap) ||
69595 + (phys_to_machine_mapping[pfn] & FOREIGN_FRAME_BIT));
69596 +}
69597 +#else
69598 +/* How to detect a foreign mapping? Play it safe. */
69599 +#define is_foreign(pfn) (1)
69600 +#endif
69601 +
69602 +static int skb_remove_foreign_references(struct sk_buff *skb)
69603 +{
69604 + struct page *page;
69605 + unsigned long pfn;
69606 + int i, off;
69607 + char *vaddr;
69608 +
69609 + BUG_ON(skb_shinfo(skb)->frag_list);
69610 +
69611 + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
69612 + pfn = page_to_pfn(skb_shinfo(skb)->frags[i].page);
69613 + if (!is_foreign(pfn))
69614 + continue;
69615 +
69616 + page = alloc_page(GFP_ATOMIC | __GFP_NOWARN);
69617 + if (unlikely(!page))
69618 + return 0;
69619 +
69620 + vaddr = kmap_skb_frag(&skb_shinfo(skb)->frags[i]);
69621 + off = skb_shinfo(skb)->frags[i].page_offset;
69622 + memcpy(page_address(page) + off,
69623 + vaddr + off,
69624 + skb_shinfo(skb)->frags[i].size);
69625 + kunmap_skb_frag(vaddr);
69626 +
69627 + put_page(skb_shinfo(skb)->frags[i].page);
69628 + skb_shinfo(skb)->frags[i].page = page;
69629 + }
69630 +
69631 + return 1;
69632 +}
69633 +
69634 +static int loopback_start_xmit(struct sk_buff *skb, struct net_device *dev)
69635 +{
69636 + struct net_private *np = netdev_priv(dev);
69637 +
69638 + if (!skb_remove_foreign_references(skb)) {
69639 + np->stats.tx_dropped++;
69640 + dev_kfree_skb(skb);
69641 + return 0;
69642 + }
69643 +
69644 + dst_release(skb->dst);
69645 + skb->dst = NULL;
69646 +
69647 + skb_orphan(skb);
69648 +
69649 + np->stats.tx_bytes += skb->len;
69650 + np->stats.tx_packets++;
69651 +
69652 + /* Switch to loopback context. */
69653 + dev = np->loopback_dev;
69654 + np = netdev_priv(dev);
69655 +
69656 + np->stats.rx_bytes += skb->len;
69657 + np->stats.rx_packets++;
69658 +
69659 + if (skb->ip_summed == CHECKSUM_HW) {
69660 + /* Defer checksum calculation. */
69661 + skb->proto_csum_blank = 1;
69662 + /* Must be a local packet: assert its integrity. */
69663 + skb->proto_data_valid = 1;
69664 + }
69665 +
69666 + skb->ip_summed = skb->proto_data_valid ?
69667 + CHECKSUM_UNNECESSARY : CHECKSUM_NONE;
69668 +
69669 + skb->pkt_type = PACKET_HOST; /* overridden by eth_type_trans() */
69670 + skb->protocol = eth_type_trans(skb, dev);
69671 + skb->dev = dev;
69672 + dev->last_rx = jiffies;
69673 +
69674 + /* Flush netfilter context: rx'ed skbuffs not expected to have any. */
69675 + nf_reset(skb);
69676 + secpath_reset(skb);
69677 +
69678 + netif_rx(skb);
69679 +
69680 + return 0;
69681 +}
69682 +
69683 +static struct net_device_stats *loopback_get_stats(struct net_device *dev)
69684 +{
69685 + struct net_private *np = netdev_priv(dev);
69686 + return &np->stats;
69687 +}
69688 +
69689 +static struct ethtool_ops network_ethtool_ops =
69690 +{
69691 + .get_tx_csum = ethtool_op_get_tx_csum,
69692 + .set_tx_csum = ethtool_op_set_tx_csum,
69693 + .get_sg = ethtool_op_get_sg,
69694 + .set_sg = ethtool_op_set_sg,
69695 + .get_tso = ethtool_op_get_tso,
69696 + .set_tso = ethtool_op_set_tso,
69697 + .get_link = ethtool_op_get_link,
69698 +};
69699 +
69700 +/*
69701 + * Nothing to do here. Virtual interface is point-to-point and the
69702 + * physical interface is probably promiscuous anyway.
69703 + */
69704 +static void loopback_set_multicast_list(struct net_device *dev)
69705 +{
69706 +}
69707 +
69708 +static void loopback_construct(struct net_device *dev, struct net_device *lo)
69709 +{
69710 + struct net_private *np = netdev_priv(dev);
69711 +
69712 + np->loopback_dev = lo;
69713 +
69714 + dev->open = loopback_open;
69715 + dev->stop = loopback_close;
69716 + dev->hard_start_xmit = loopback_start_xmit;
69717 + dev->get_stats = loopback_get_stats;
69718 + dev->set_multicast_list = loopback_set_multicast_list;
69719 + dev->change_mtu = NULL; /* allow arbitrary mtu */
69720 +
69721 + dev->tx_queue_len = 0;
69722 +
69723 + dev->features = (NETIF_F_HIGHDMA |
69724 + NETIF_F_LLTX |
69725 + NETIF_F_TSO |
69726 + NETIF_F_SG |
69727 + NETIF_F_IP_CSUM);
69728 +
69729 + SET_ETHTOOL_OPS(dev, &network_ethtool_ops);
69730 +
69731 + /*
69732 + * We do not set a jumbo MTU on the interface. Otherwise the network
69733 + * stack will try to send large packets that will get dropped by the
69734 + * Ethernet bridge (unless the physical Ethernet interface is
69735 + * configured to transfer jumbo packets). If a larger MTU is desired
69736 + * then the system administrator can specify it using the 'ifconfig'
69737 + * command.
69738 + */
69739 + /*dev->mtu = 16*1024;*/
69740 +}
69741 +
69742 +static int __init make_loopback(int i)
69743 +{
69744 + struct net_device *dev1, *dev2;
69745 + char dev_name[IFNAMSIZ];
69746 + int err = -ENOMEM;
69747 +
69748 + sprintf(dev_name, "vif0.%d", i);
69749 + dev1 = alloc_netdev(sizeof(struct net_private), dev_name, ether_setup);
69750 + if (!dev1)
69751 + return err;
69752 +
69753 + sprintf(dev_name, "veth%d", i);
69754 + dev2 = alloc_netdev(sizeof(struct net_private), dev_name, ether_setup);
69755 + if (!dev2)
69756 + goto fail_netdev2;
69757 +
69758 + loopback_construct(dev1, dev2);
69759 + loopback_construct(dev2, dev1);
69760 +
69761 + /*
69762 + * Initialise a dummy MAC address for the 'dummy backend' interface. We
69763 + * choose the numerically largest non-broadcast address to prevent the
69764 + * address getting stolen by an Ethernet bridge for STP purposes.
69765 + */
69766 + memset(dev1->dev_addr, 0xFF, ETH_ALEN);
69767 + dev1->dev_addr[0] &= ~0x01;
69768 +
69769 + if ((err = register_netdev(dev1)) != 0)
69770 + goto fail;
69771 +
69772 + if ((err = register_netdev(dev2)) != 0) {
69773 + unregister_netdev(dev1);
69774 + goto fail;
69775 + }
69776 +
69777 + return 0;
69778 +
69779 + fail:
69780 + free_netdev(dev2);
69781 + fail_netdev2:
69782 + free_netdev(dev1);
69783 + return err;
69784 +}
69785 +
69786 +static void __exit clean_loopback(int i)
69787 +{
69788 + struct net_device *dev1, *dev2;
69789 + char dev_name[IFNAMSIZ];
69790 +
69791 + sprintf(dev_name, "vif0.%d", i);
69792 + dev1 = dev_get_by_name(dev_name);
69793 + sprintf(dev_name, "veth%d", i);
69794 + dev2 = dev_get_by_name(dev_name);
69795 + if (dev1 && dev2) {
69796 + unregister_netdev(dev2);
69797 + unregister_netdev(dev1);
69798 + free_netdev(dev2);
69799 + free_netdev(dev1);
69800 + }
69801 +}
69802 +
69803 +static int __init loopback_init(void)
69804 +{
69805 + int i, err = 0;
69806 +
69807 + if (nloopbacks == -1)
69808 + nloopbacks = is_initial_xendomain() ? 4 : 0;
69809 +
69810 + for (i = 0; i < nloopbacks; i++)
69811 + if ((err = make_loopback(i)) != 0)
69812 + break;
69813 +
69814 + return err;
69815 +}
69816 +
69817 +module_init(loopback_init);
69818 +
69819 +static void __exit loopback_exit(void)
69820 +{
69821 + int i;
69822 +
69823 + for (i = nloopbacks; i-- > 0; )
69824 + clean_loopback(i);
69825 +}
69826 +
69827 +module_exit(loopback_exit);
69828 +
69829 +MODULE_LICENSE("Dual BSD/GPL");
69830 diff -Nur linux-2.6.16.33-noxen/drivers/xen/netback/netback.c linux-2.6.16.33/drivers/xen/netback/netback.c
69831 --- linux-2.6.16.33-noxen/drivers/xen/netback/netback.c 1970-01-01 00:00:00.000000000 +0000
69832 +++ linux-2.6.16.33/drivers/xen/netback/netback.c 2007-01-08 15:00:45.000000000 +0000
69833 @@ -0,0 +1,1523 @@
69834 +/******************************************************************************
69835 + * drivers/xen/netback/netback.c
69836 + *
69837 + * Back-end of the driver for virtual network devices. This portion of the
69838 + * driver exports a 'unified' network-device interface that can be accessed
69839 + * by any operating system that implements a compatible front end. A
69840 + * reference front-end implementation can be found in:
69841 + * drivers/xen/netfront/netfront.c
69842 + *
69843 + * Copyright (c) 2002-2005, K A Fraser
69844 + *
69845 + * This program is free software; you can redistribute it and/or
69846 + * modify it under the terms of the GNU General Public License version 2
69847 + * as published by the Free Software Foundation; or, when distributed
69848 + * separately from the Linux kernel or incorporated into other
69849 + * software packages, subject to the following license:
69850 + *
69851 + * Permission is hereby granted, free of charge, to any person obtaining a copy
69852 + * of this source file (the "Software"), to deal in the Software without
69853 + * restriction, including without limitation the rights to use, copy, modify,
69854 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
69855 + * and to permit persons to whom the Software is furnished to do so, subject to
69856 + * the following conditions:
69857 + *
69858 + * The above copyright notice and this permission notice shall be included in
69859 + * all copies or substantial portions of the Software.
69860 + *
69861 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
69862 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
69863 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
69864 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
69865 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
69866 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
69867 + * IN THE SOFTWARE.
69868 + */
69869 +
69870 +#include "common.h"
69871 +#include <xen/balloon.h>
69872 +#include <xen/interface/memory.h>
69873 +
69874 +/*#define NETBE_DEBUG_INTERRUPT*/
69875 +
69876 +struct netbk_rx_meta {
69877 + skb_frag_t frag;
69878 + int id;
69879 + int copy:1;
69880 +};
69881 +
69882 +static void netif_idx_release(u16 pending_idx);
69883 +static void netif_page_release(struct page *page);
69884 +static void make_tx_response(netif_t *netif,
69885 + netif_tx_request_t *txp,
69886 + s8 st);
69887 +static netif_rx_response_t *make_rx_response(netif_t *netif,
69888 + u16 id,
69889 + s8 st,
69890 + u16 offset,
69891 + u16 size,
69892 + u16 flags);
69893 +
69894 +static void net_tx_action(unsigned long unused);
69895 +static DECLARE_TASKLET(net_tx_tasklet, net_tx_action, 0);
69896 +
69897 +static void net_rx_action(unsigned long unused);
69898 +static DECLARE_TASKLET(net_rx_tasklet, net_rx_action, 0);
69899 +
69900 +static struct timer_list net_timer;
69901 +
69902 +#define MAX_PENDING_REQS 256
69903 +
69904 +static struct sk_buff_head rx_queue;
69905 +
69906 +static struct page **mmap_pages;
69907 +static inline unsigned long idx_to_kaddr(unsigned int idx)
69908 +{
69909 + return (unsigned long)pfn_to_kaddr(page_to_pfn(mmap_pages[idx]));
69910 +}
69911 +
69912 +#define PKT_PROT_LEN 64
69913 +
69914 +static struct pending_tx_info {
69915 + netif_tx_request_t req;
69916 + netif_t *netif;
69917 +} pending_tx_info[MAX_PENDING_REQS];
69918 +static u16 pending_ring[MAX_PENDING_REQS];
69919 +typedef unsigned int PEND_RING_IDX;
69920 +#define MASK_PEND_IDX(_i) ((_i)&(MAX_PENDING_REQS-1))
69921 +static PEND_RING_IDX pending_prod, pending_cons;
69922 +#define NR_PENDING_REQS (MAX_PENDING_REQS - pending_prod + pending_cons)
69923 +
69924 +/* Freed TX SKBs get batched on this ring before return to pending_ring. */
69925 +static u16 dealloc_ring[MAX_PENDING_REQS];
69926 +static PEND_RING_IDX dealloc_prod, dealloc_cons;
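
pending_ring[] acts as a ring of free indices into pending_tx_info[] and mmap_pages[]: an index is taken at pending_cons when a guest TX request is grant-mapped and handed back at pending_prod once the mapping is torn down, so NR_PENDING_REQS counts requests currently in flight. A sketch of the two sides, with hypothetical helper names (the real logic lives in net_tx_action() and the dealloc path further down this file), is:

	/* Sketch: how the free-index ring is cycled. */
	static inline u16 example_get_pending_idx(void)
	{
		/* take a free index; callers check NR_PENDING_REQS first */
		return pending_ring[MASK_PEND_IDX(pending_cons++)];
	}

	static inline void example_put_pending_idx(u16 pending_idx)
	{
		/* return the index once its grant mapping has been unmapped */
		pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx;
	}
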
69927 +
69928 +static struct sk_buff_head tx_queue;
69929 +
69930 +static grant_handle_t grant_tx_handle[MAX_PENDING_REQS];
69931 +static gnttab_unmap_grant_ref_t tx_unmap_ops[MAX_PENDING_REQS];
69932 +static gnttab_map_grant_ref_t tx_map_ops[MAX_PENDING_REQS];
69933 +
69934 +static struct list_head net_schedule_list;
69935 +static spinlock_t net_schedule_list_lock;
69936 +
69937 +#define MAX_MFN_ALLOC 64
69938 +static unsigned long mfn_list[MAX_MFN_ALLOC];
69939 +static unsigned int alloc_index = 0;
69940 +
69941 +static inline unsigned long alloc_mfn(void)
69942 +{
69943 + return mfn_list[--alloc_index];
69944 +}
69945 +
69946 +static int check_mfn(int nr)
69947 +{
69948 + struct xen_memory_reservation reservation = {
69949 + .extent_order = 0,
69950 + .domid = DOMID_SELF
69951 + };
69952 +
69953 + if (likely(alloc_index >= nr))
69954 + return 0;
69955 +
69956 + set_xen_guest_handle(reservation.extent_start, mfn_list + alloc_index);
69957 + reservation.nr_extents = MAX_MFN_ALLOC - alloc_index;
69958 + alloc_index += HYPERVISOR_memory_op(XENMEM_increase_reservation,
69959 + &reservation);
69960 +
69961 + return alloc_index >= nr ? 0 : -ENOMEM;
69962 +}
69963 +
69964 +static inline void maybe_schedule_tx_action(void)
69965 +{
69966 + smp_mb();
69967 + if ((NR_PENDING_REQS < (MAX_PENDING_REQS/2)) &&
69968 + !list_empty(&net_schedule_list))
69969 + tasklet_schedule(&net_tx_tasklet);
69970 +}
69971 +
69972 +/*
69973 + * A gross way of confirming the origin of an skb data page. The slab
69974 + * allocator abuses a field in the page struct to cache the kmem_cache_t ptr.
69975 + */
69976 +static inline int is_xen_skb(struct sk_buff *skb)
69977 +{
69978 + extern kmem_cache_t *skbuff_cachep;
69979 + kmem_cache_t *cp = (kmem_cache_t *)virt_to_page(skb->head)->lru.next;
69980 + return (cp == skbuff_cachep);
69981 +}
69982 +
69983 +/*
69984 + * We can flip without copying the packet unless:
69985 + * 1. The data is not allocated from our special cache; or
69986 + * 2. The main data area is shared; or
69987 + * 3. One or more fragments are shared; or
69988 + * 4. There are chained fragments.
69989 + */
69990 +static inline int is_flippable_skb(struct sk_buff *skb)
69991 +{
69992 + int frag;
69993 +
69994 + if (!is_xen_skb(skb) || skb_cloned(skb))
69995 + return 0;
69996 +
69997 + for (frag = 0; frag < skb_shinfo(skb)->nr_frags; frag++) {
69998 + if (page_count(skb_shinfo(skb)->frags[frag].page) > 1)
69999 + return 0;
70000 + }
70001 +
70002 + if (skb_shinfo(skb)->frag_list != NULL)
70003 + return 0;
70004 +
70005 + return 1;
70006 +}
70007 +
70008 +static struct sk_buff *netbk_copy_skb(struct sk_buff *skb)
70009 +{
70010 + struct skb_shared_info *ninfo;
70011 + struct sk_buff *nskb;
70012 + unsigned long offset;
70013 + int ret;
70014 + int len;
70015 + int headlen;
70016 +
70017 + BUG_ON(skb_shinfo(skb)->frag_list != NULL);
70018 +
70019 + nskb = alloc_skb(SKB_MAX_HEAD(0), GFP_ATOMIC | __GFP_NOWARN);
70020 + if (unlikely(!nskb))
70021 + goto err;
70022 +
70023 + skb_reserve(nskb, 16 + NET_IP_ALIGN);
70024 + headlen = nskb->end - nskb->data;
70025 + if (headlen > skb_headlen(skb))
70026 + headlen = skb_headlen(skb);
70027 + ret = skb_copy_bits(skb, 0, __skb_put(nskb, headlen), headlen);
70028 + BUG_ON(ret);
70029 +
70030 + ninfo = skb_shinfo(nskb);
70031 + ninfo->gso_size = skb_shinfo(skb)->gso_size;
70032 + ninfo->gso_type = skb_shinfo(skb)->gso_type;
70033 +
70034 + offset = headlen;
70035 + len = skb->len - headlen;
70036 +
70037 + nskb->len = skb->len;
70038 + nskb->data_len = len;
70039 + nskb->truesize += len;
70040 +
70041 + while (len) {
70042 + struct page *page;
70043 + int copy;
70044 + int zero;
70045 +
70046 + if (unlikely(ninfo->nr_frags >= MAX_SKB_FRAGS)) {
70047 + dump_stack();
70048 + goto err_free;
70049 + }
70050 +
70051 + copy = len >= PAGE_SIZE ? PAGE_SIZE : len;
70052 + zero = len >= PAGE_SIZE ? 0 : __GFP_ZERO;
70053 +
70054 + page = alloc_page(GFP_ATOMIC | __GFP_NOWARN | zero);
70055 + if (unlikely(!page))
70056 + goto err_free;
70057 +
70058 + ret = skb_copy_bits(skb, offset, page_address(page), copy);
70059 + BUG_ON(ret);
70060 +
70061 + ninfo->frags[ninfo->nr_frags].page = page;
70062 + ninfo->frags[ninfo->nr_frags].page_offset = 0;
70063 + ninfo->frags[ninfo->nr_frags].size = copy;
70064 + ninfo->nr_frags++;
70065 +
70066 + offset += copy;
70067 + len -= copy;
70068 + }
70069 +
70070 + offset = nskb->data - skb->data;
70071 +
70072 + nskb->h.raw = skb->h.raw + offset;
70073 + nskb->nh.raw = skb->nh.raw + offset;
70074 + nskb->mac.raw = skb->mac.raw + offset;
70075 +
70076 + return nskb;
70077 +
70078 + err_free:
70079 + kfree_skb(nskb);
70080 + err:
70081 + return NULL;
70082 +}
70083 +
70084 +static inline int netbk_max_required_rx_slots(netif_t *netif)
70085 +{
70086 + if (netif->features & (NETIF_F_SG|NETIF_F_TSO))
70087 + return MAX_SKB_FRAGS + 2; /* header + extra_info + frags */
70088 + return 1; /* all in one */
70089 +}
70090 +
70091 +static inline int netbk_queue_full(netif_t *netif)
70092 +{
70093 + RING_IDX peek = netif->rx_req_cons_peek;
70094 + RING_IDX needed = netbk_max_required_rx_slots(netif);
70095 +
70096 + return ((netif->rx.sring->req_prod - peek) < needed) ||
70097 + ((netif->rx.rsp_prod_pvt + NET_RX_RING_SIZE - peek) < needed);
70098 +}
70099 +
70100 +static void tx_queue_callback(unsigned long data)
70101 +{
70102 + netif_t *netif = (netif_t *)data;
70103 + if (netif_schedulable(netif->dev))
70104 + netif_wake_queue(netif->dev);
70105 +}
70106 +
70107 +int netif_be_start_xmit(struct sk_buff *skb, struct net_device *dev)
70108 +{
70109 + netif_t *netif = netdev_priv(dev);
70110 +
70111 + BUG_ON(skb->dev != dev);
70112 +
70113 + /* Drop the packet if the target domain has no receive buffers. */
70114 + if (unlikely(!netif_schedulable(dev) || netbk_queue_full(netif)))
70115 + goto drop;
70116 +
70117 + /*
70118 + * Copy the packet here if it's destined for a flipping interface
70119 + * but isn't flippable (e.g. extra references to data).
70120 + */
70121 + if (!netif->copying_receiver && !is_flippable_skb(skb)) {
70122 + struct sk_buff *nskb = netbk_copy_skb(skb);
70123 + if ( unlikely(nskb == NULL) )
70124 + goto drop;
70125 + /* Copy only the header fields we use in this driver. */
70126 + nskb->dev = skb->dev;
70127 + nskb->ip_summed = skb->ip_summed;
70128 + nskb->proto_data_valid = skb->proto_data_valid;
70129 + dev_kfree_skb(skb);
70130 + skb = nskb;
70131 + }
70132 +
70133 + netif->rx_req_cons_peek += skb_shinfo(skb)->nr_frags + 1 +
70134 + !!skb_shinfo(skb)->gso_size;
70135 + netif_get(netif);
70136 +
70137 + if (netbk_can_queue(dev) && netbk_queue_full(netif)) {
70138 + netif->rx.sring->req_event = netif->rx_req_cons_peek +
70139 + netbk_max_required_rx_slots(netif);
70140 + mb(); /* request notification /then/ check & stop the queue */
70141 + if (netbk_queue_full(netif)) {
70142 + netif_stop_queue(dev);
70143 + /*
70144 + * Schedule 500ms timeout to restart the queue, thus
70145 + * ensuring that an inactive queue will be drained.
70146 + 			 * Packets will be dropped immediately until more
70147 + * receive buffers become available (see
70148 + * netbk_queue_full() check above).
70149 + */
70150 + netif->tx_queue_timeout.data = (unsigned long)netif;
70151 + netif->tx_queue_timeout.function = tx_queue_callback;
70152 + __mod_timer(&netif->tx_queue_timeout, jiffies + HZ/2);
70153 + }
70154 + }
70155 +
70156 + skb_queue_tail(&rx_queue, skb);
70157 + tasklet_schedule(&net_rx_tasklet);
70158 +
70159 + return 0;
70160 +
70161 + drop:
70162 + netif->stats.tx_dropped++;
70163 + dev_kfree_skb(skb);
70164 + return 0;
70165 +}
70166 +
70167 +#if 0
70168 +static void xen_network_done_notify(void)
70169 +{
70170 + static struct net_device *eth0_dev = NULL;
70171 + if (unlikely(eth0_dev == NULL))
70172 + eth0_dev = __dev_get_by_name("eth0");
70173 + netif_rx_schedule(eth0_dev);
70174 +}
70175 +/*
70176 + * Add following to poll() function in NAPI driver (Tigon3 is example):
70177 + * if ( xen_network_done() )
70178 + * tg3_enable_ints(tp);
70179 + */
70180 +int xen_network_done(void)
70181 +{
70182 + return skb_queue_empty(&rx_queue);
70183 +}
70184 +#endif
70185 +
70186 +struct netrx_pending_operations {
70187 + unsigned trans_prod, trans_cons;
70188 + unsigned mmu_prod, mmu_cons;
70189 + unsigned mcl_prod, mcl_cons;
70190 + unsigned copy_prod, copy_cons;
70191 + unsigned meta_prod, meta_cons;
70192 + mmu_update_t *mmu;
70193 + gnttab_transfer_t *trans;
70194 + gnttab_copy_t *copy;
70195 + multicall_entry_t *mcl;
70196 + struct netbk_rx_meta *meta;
70197 +};
70198 +
70199 +/* Set up the grant operations for this fragment. If it's a flipping
70200 + interface, we also set up the unmap request from here. */
70201 +static u16 netbk_gop_frag(netif_t *netif, struct netbk_rx_meta *meta,
70202 + int i, struct netrx_pending_operations *npo,
70203 + struct page *page, unsigned long size,
70204 + unsigned long offset)
70205 +{
70206 + mmu_update_t *mmu;
70207 + gnttab_transfer_t *gop;
70208 + gnttab_copy_t *copy_gop;
70209 + multicall_entry_t *mcl;
70210 + netif_rx_request_t *req;
70211 + unsigned long old_mfn, new_mfn;
70212 +
70213 + old_mfn = virt_to_mfn(page_address(page));
70214 +
70215 + req = RING_GET_REQUEST(&netif->rx, netif->rx.req_cons + i);
70216 + if (netif->copying_receiver) {
70217 + /* The fragment needs to be copied rather than
70218 + flipped. */
70219 + meta->copy = 1;
70220 + copy_gop = npo->copy + npo->copy_prod++;
70221 + copy_gop->flags = GNTCOPY_dest_gref;
70222 + if (PageForeign(page)) {
70223 + struct pending_tx_info *src_pend =
70224 + &pending_tx_info[page->index];
70225 + copy_gop->source.domid = src_pend->netif->domid;
70226 + copy_gop->source.u.ref = src_pend->req.gref;
70227 + copy_gop->flags |= GNTCOPY_source_gref;
70228 + } else {
70229 + copy_gop->source.domid = DOMID_SELF;
70230 + copy_gop->source.u.gmfn = old_mfn;
70231 + }
70232 + copy_gop->source.offset = offset;
70233 + copy_gop->dest.domid = netif->domid;
70234 + copy_gop->dest.offset = 0;
70235 + copy_gop->dest.u.ref = req->gref;
70236 + copy_gop->len = size;
70237 + } else {
70238 + meta->copy = 0;
70239 + if (!xen_feature(XENFEAT_auto_translated_physmap)) {
70240 + new_mfn = alloc_mfn();
70241 +
70242 + /*
70243 + * Set the new P2M table entry before
70244 + * reassigning the old data page. Heed the
70245 + * comment in pgtable-2level.h:pte_page(). :-)
70246 + */
70247 + set_phys_to_machine(page_to_pfn(page), new_mfn);
70248 +
70249 + mcl = npo->mcl + npo->mcl_prod++;
70250 + MULTI_update_va_mapping(mcl,
70251 + (unsigned long)page_address(page),
70252 + pfn_pte_ma(new_mfn, PAGE_KERNEL),
70253 + 0);
70254 +
70255 + mmu = npo->mmu + npo->mmu_prod++;
70256 + mmu->ptr = ((maddr_t)new_mfn << PAGE_SHIFT) |
70257 + MMU_MACHPHYS_UPDATE;
70258 + mmu->val = page_to_pfn(page);
70259 + }
70260 +
70261 + gop = npo->trans + npo->trans_prod++;
70262 + gop->mfn = old_mfn;
70263 + gop->domid = netif->domid;
70264 + gop->ref = req->gref;
70265 + }
70266 + return req->id;
70267 +}
70268 +
70269 +static void netbk_gop_skb(struct sk_buff *skb,
70270 + struct netrx_pending_operations *npo)
70271 +{
70272 + netif_t *netif = netdev_priv(skb->dev);
70273 + int nr_frags = skb_shinfo(skb)->nr_frags;
70274 + int i;
70275 + int extra;
70276 + struct netbk_rx_meta *head_meta, *meta;
70277 +
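+ /*
+ * The head meta entry covers the linear header, so its unused frag
+ * fields are borrowed to stash the GSO type and size. 'extra' counts
+ * the header slot plus an extra_info slot when gso_size is non-zero.
+ */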
70278 + head_meta = npo->meta + npo->meta_prod++;
70279 + head_meta->frag.page_offset = skb_shinfo(skb)->gso_type;
70280 + head_meta->frag.size = skb_shinfo(skb)->gso_size;
70281 + extra = !!head_meta->frag.size + 1;
70282 +
70283 + for (i = 0; i < nr_frags; i++) {
70284 + meta = npo->meta + npo->meta_prod++;
70285 + meta->frag = skb_shinfo(skb)->frags[i];
70286 + meta->id = netbk_gop_frag(netif, meta, i + extra, npo,
70287 + meta->frag.page,
70288 + meta->frag.size,
70289 + meta->frag.page_offset);
70290 + }
70291 +
70292 + /*
70293 + * This must occur at the end to ensure that we don't trash
70294 + * skb_shinfo until we're done.
70295 + */
70296 + head_meta->id = netbk_gop_frag(netif, head_meta, 0, npo,
70297 + virt_to_page(skb->data),
70298 + skb_headlen(skb),
70299 + offset_in_page(skb->data));
70300 +
70301 + netif->rx.req_cons += nr_frags + extra;
70302 +}
70303 +
70304 +static inline void netbk_free_pages(int nr_frags, struct netbk_rx_meta *meta)
70305 +{
70306 + int i;
70307 +
70308 + for (i = 0; i < nr_frags; i++)
70309 + put_page(meta[i].frag.page);
70310 +}
70311 +
70312 +/* This is a twin to netbk_gop_skb. Assume that netbk_gop_skb was
70313 + used to set up the operations on the top of
70314 + netrx_pending_operations, which have since been done. Check that
70315 + they didn't give any errors and advance over them. */
70316 +static int netbk_check_gop(int nr_frags, domid_t domid,
70317 + struct netrx_pending_operations *npo)
70318 +{
70319 + multicall_entry_t *mcl;
70320 + gnttab_transfer_t *gop;
70321 + gnttab_copy_t *copy_op;
70322 + int status = NETIF_RSP_OKAY;
70323 + int i;
70324 +
70325 + for (i = 0; i <= nr_frags; i++) {
70326 + if (npo->meta[npo->meta_cons + i].copy) {
70327 + copy_op = npo->copy + npo->copy_cons++;
70328 + if (copy_op->status != GNTST_okay) {
70329 + DPRINTK("Bad status %d from copy to DOM%d.\n",
70330 + copy_op->status, domid);
70331 + status = NETIF_RSP_ERROR;
70332 + }
70333 + } else {
70334 + if (!xen_feature(XENFEAT_auto_translated_physmap)) {
70335 + mcl = npo->mcl + npo->mcl_cons++;
70336 + /* The update_va_mapping() must not fail. */
70337 + BUG_ON(mcl->result != 0);
70338 + }
70339 +
70340 + gop = npo->trans + npo->trans_cons++;
70341 + /* Check the reassignment error code. */
70342 + if (gop->status != 0) {
70343 + DPRINTK("Bad status %d from grant transfer to DOM%u\n",
70344 + gop->status, domid);
70345 + /*
70346 + * Page no longer belongs to us unless
70347 + * GNTST_bad_page, but that should be
70348 + * a fatal error anyway.
70349 + */
70350 + BUG_ON(gop->status == GNTST_bad_page);
70351 + status = NETIF_RSP_ERROR;
70352 + }
70353 + }
70354 + }
70355 +
70356 + return status;
70357 +}
70358 +
70359 +static void netbk_add_frag_responses(netif_t *netif, int status,
70360 + struct netbk_rx_meta *meta, int nr_frags)
70361 +{
70362 + int i;
70363 + unsigned long offset;
70364 +
70365 + for (i = 0; i < nr_frags; i++) {
70366 + int id = meta[i].id;
70367 + int flags = (i == nr_frags - 1) ? 0 : NETRXF_more_data;
70368 +
70369 + if (meta[i].copy)
70370 + offset = 0;
70371 + else
70372 + offset = meta[i].frag.page_offset;
70373 + make_rx_response(netif, id, status, offset,
70374 + meta[i].frag.size, flags);
70375 + }
70376 +}
70377 +
70378 +static void net_rx_action(unsigned long unused)
70379 +{
70380 + netif_t *netif = NULL;
70381 + s8 status;
70382 + u16 id, irq, flags;
70383 + netif_rx_response_t *resp;
70384 + multicall_entry_t *mcl;
70385 + struct sk_buff_head rxq;
70386 + struct sk_buff *skb;
70387 + int notify_nr = 0;
70388 + int ret;
70389 + int nr_frags;
70390 + int count;
70391 + unsigned long offset;
70392 +
70393 + /*
70394 + * Putting hundreds of bytes on the stack is considered rude.
70395 + * Static works because a tasklet can only be on one CPU at any time.
70396 + */
70397 + static multicall_entry_t rx_mcl[NET_RX_RING_SIZE+3];
70398 + static mmu_update_t rx_mmu[NET_RX_RING_SIZE];
70399 + static gnttab_transfer_t grant_trans_op[NET_RX_RING_SIZE];
70400 + static gnttab_copy_t grant_copy_op[NET_RX_RING_SIZE];
70401 + static unsigned char rx_notify[NR_IRQS];
70402 + static u16 notify_list[NET_RX_RING_SIZE];
70403 + static struct netbk_rx_meta meta[NET_RX_RING_SIZE];
70404 +
70405 + struct netrx_pending_operations npo = {
70406 + mmu: rx_mmu,
70407 + trans: grant_trans_op,
70408 + copy: grant_copy_op,
70409 + mcl: rx_mcl,
70410 + meta: meta};
70411 +
70412 + skb_queue_head_init(&rxq);
70413 +
70414 + count = 0;
70415 +
70416 + while ((skb = skb_dequeue(&rx_queue)) != NULL) {
70417 + nr_frags = skb_shinfo(skb)->nr_frags;
70418 + *(int *)skb->cb = nr_frags;
70419 +
70420 + if (!xen_feature(XENFEAT_auto_translated_physmap) &&
70421 + check_mfn(nr_frags + 1)) {
70422 + /* Memory squeeze? Back off for an arbitrary while. */
70423 + if (net_ratelimit())
70424 + WPRINTK("Memory squeeze in netback "
70425 + "driver.\n");
70426 + mod_timer(&net_timer, jiffies + HZ);
70427 + skb_queue_head(&rx_queue, skb);
70428 + break;
70429 + }
70430 +
70431 + netbk_gop_skb(skb, &npo);
70432 +
70433 + count += nr_frags + 1;
70434 +
70435 + __skb_queue_tail(&rxq, skb);
70436 +
70437 + /* Filled the batch queue? */
70438 + if (count + MAX_SKB_FRAGS >= NET_RX_RING_SIZE)
70439 + break;
70440 + }
70441 +
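+ /*
+ * Page-flipping path: let the final update_va_mapping multicall carry
+ * a global TLB flush, then append the batched machine-to-physical
+ * updates as a further mmu_update multicall.
+ */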
70442 + if (npo.mcl_prod &&
70443 + !xen_feature(XENFEAT_auto_translated_physmap)) {
70444 + mcl = npo.mcl + npo.mcl_prod++;
70445 +
70446 + BUG_ON(mcl[-1].op != __HYPERVISOR_update_va_mapping);
70447 + mcl[-1].args[MULTI_UVMFLAGS_INDEX] = UVMF_TLB_FLUSH|UVMF_ALL;
70448 +
70449 + mcl->op = __HYPERVISOR_mmu_update;
70450 + mcl->args[0] = (unsigned long)rx_mmu;
70451 + mcl->args[1] = npo.mmu_prod;
70452 + mcl->args[2] = 0;
70453 + mcl->args[3] = DOMID_SELF;
70454 + }
70455 +
70456 + if (npo.trans_prod) {
70457 + mcl = npo.mcl + npo.mcl_prod++;
70458 + mcl->op = __HYPERVISOR_grant_table_op;
70459 + mcl->args[0] = GNTTABOP_transfer;
70460 + mcl->args[1] = (unsigned long)grant_trans_op;
70461 + mcl->args[2] = npo.trans_prod;
70462 + }
70463 +
70464 + if (npo.copy_prod) {
70465 + mcl = npo.mcl + npo.mcl_prod++;
70466 + mcl->op = __HYPERVISOR_grant_table_op;
70467 + mcl->args[0] = GNTTABOP_copy;
70468 + mcl->args[1] = (unsigned long)grant_copy_op;
70469 + mcl->args[2] = npo.copy_prod;
70470 + }
70471 +
70472 + /* Nothing to do? */
70473 + if (!npo.mcl_prod)
70474 + return;
70475 +
70476 + BUG_ON(npo.copy_prod > NET_RX_RING_SIZE);
70477 + BUG_ON(npo.mmu_prod > NET_RX_RING_SIZE);
70478 + BUG_ON(npo.trans_prod > NET_RX_RING_SIZE);
70479 + BUG_ON(npo.mcl_prod > NET_RX_RING_SIZE+3);
70480 + BUG_ON(npo.meta_prod > NET_RX_RING_SIZE);
70481 +
70482 + ret = HYPERVISOR_multicall(npo.mcl, npo.mcl_prod);
70483 + BUG_ON(ret != 0);
70484 +
70485 + while ((skb = __skb_dequeue(&rxq)) != NULL) {
70486 + nr_frags = *(int *)skb->cb;
70487 +
70488 + netif = netdev_priv(skb->dev);
70489 + /* We can't rely on skb_release_data to release the
70490 + pages used by fragments for us, since it tries to
70491 + touch the pages in the fraglist. If we're in
70492 + flipping mode, that doesn't work. In copying mode,
70493 + we still have access to all of the pages, and so
70494 + it's safe to let release_data deal with it. */
70495 + /* (Freeing the fragments is safe since we copy
70496 + non-linear skbs destined for flipping interfaces) */
70497 + if (!netif->copying_receiver) {
70498 + atomic_set(&(skb_shinfo(skb)->dataref), 1);
70499 + skb_shinfo(skb)->frag_list = NULL;
70500 + skb_shinfo(skb)->nr_frags = 0;
70501 + netbk_free_pages(nr_frags, meta + npo.meta_cons + 1);
70502 + }
70503 +
70504 + netif->stats.tx_bytes += skb->len;
70505 + netif->stats.tx_packets++;
70506 +
70507 + status = netbk_check_gop(nr_frags, netif->domid, &npo);
70508 +
70509 + id = meta[npo.meta_cons].id;
70510 + flags = nr_frags ? NETRXF_more_data : 0;
70511 +
70512 + if (skb->ip_summed == CHECKSUM_HW) /* local packet? */
70513 + flags |= NETRXF_csum_blank | NETRXF_data_validated;
70514 + else if (skb->proto_data_valid) /* remote but checksummed? */
70515 + flags |= NETRXF_data_validated;
70516 +
70517 + if (meta[npo.meta_cons].copy)
70518 + offset = 0;
70519 + else
70520 + offset = offset_in_page(skb->data);
70521 + resp = make_rx_response(netif, id, status, offset,
70522 + skb_headlen(skb), flags);
70523 +
70524 + if (meta[npo.meta_cons].frag.size) {
70525 + struct netif_extra_info *gso =
70526 + (struct netif_extra_info *)
70527 + RING_GET_RESPONSE(&netif->rx,
70528 + netif->rx.rsp_prod_pvt++);
70529 +
70530 + resp->flags |= NETRXF_extra_info;
70531 +
70532 + gso->u.gso.size = meta[npo.meta_cons].frag.size;
70533 + gso->u.gso.type = XEN_NETIF_GSO_TYPE_TCPV4;
70534 + gso->u.gso.pad = 0;
70535 + gso->u.gso.features = 0;
70536 +
70537 + gso->type = XEN_NETIF_EXTRA_TYPE_GSO;
70538 + gso->flags = 0;
70539 + }
70540 +
70541 + netbk_add_frag_responses(netif, status,
70542 + meta + npo.meta_cons + 1,
70543 + nr_frags);
70544 +
70545 + RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&netif->rx, ret);
70546 + irq = netif->irq;
70547 + if (ret && !rx_notify[irq]) {
70548 + rx_notify[irq] = 1;
70549 + notify_list[notify_nr++] = irq;
70550 + }
70551 +
70552 + if (netif_queue_stopped(netif->dev) &&
70553 + netif_schedulable(netif->dev) &&
70554 + !netbk_queue_full(netif))
70555 + netif_wake_queue(netif->dev);
70556 +
70557 + netif_put(netif);
70558 + dev_kfree_skb(skb);
70559 + npo.meta_cons += nr_frags + 1;
70560 + }
70561 +
70562 + while (notify_nr != 0) {
70563 + irq = notify_list[--notify_nr];
70564 + rx_notify[irq] = 0;
70565 + notify_remote_via_irq(irq);
70566 + }
70567 +
70568 + /* More work to do? */
70569 + if (!skb_queue_empty(&rx_queue) && !timer_pending(&net_timer))
70570 + tasklet_schedule(&net_rx_tasklet);
70571 +#if 0
70572 + else
70573 + xen_network_done_notify();
70574 +#endif
70575 +}
70576 +
70577 +static void net_alarm(unsigned long unused)
70578 +{
70579 + tasklet_schedule(&net_rx_tasklet);
70580 +}
70581 +
70582 +struct net_device_stats *netif_be_get_stats(struct net_device *dev)
70583 +{
70584 + netif_t *netif = netdev_priv(dev);
70585 + return &netif->stats;
70586 +}
70587 +
70588 +static int __on_net_schedule_list(netif_t *netif)
70589 +{
70590 + return netif->list.next != NULL;
70591 +}
70592 +
70593 +static void remove_from_net_schedule_list(netif_t *netif)
70594 +{
70595 + spin_lock_irq(&net_schedule_list_lock);
70596 + if (likely(__on_net_schedule_list(netif))) {
70597 + list_del(&netif->list);
70598 + netif->list.next = NULL;
70599 + netif_put(netif);
70600 + }
70601 + spin_unlock_irq(&net_schedule_list_lock);
70602 +}
70603 +
70604 +static void add_to_net_schedule_list_tail(netif_t *netif)
70605 +{
70606 + if (__on_net_schedule_list(netif))
70607 + return;
70608 +
70609 + spin_lock_irq(&net_schedule_list_lock);
70610 + if (!__on_net_schedule_list(netif) &&
70611 + likely(netif_schedulable(netif->dev))) {
70612 + list_add_tail(&netif->list, &net_schedule_list);
70613 + netif_get(netif);
70614 + }
70615 + spin_unlock_irq(&net_schedule_list_lock);
70616 +}
70617 +
70618 +/*
70619 + * Note on CONFIG_XEN_NETDEV_PIPELINED_TRANSMITTER:
70620 + * If this driver is pipelining transmit requests then we can be very
70621 + * aggressive in avoiding new-packet notifications -- frontend only needs to
70622 + * send a notification if there are no outstanding unreceived responses.
70623 + * If we may be buffering transmit requests for any reason then we must be rather
70624 + * more conservative and treat this as the final check for pending work.
70625 + */
70626 +void netif_schedule_work(netif_t *netif)
70627 +{
70628 + int more_to_do;
70629 +
70630 +#ifdef CONFIG_XEN_NETDEV_PIPELINED_TRANSMITTER
70631 + more_to_do = RING_HAS_UNCONSUMED_REQUESTS(&netif->tx);
70632 +#else
70633 + RING_FINAL_CHECK_FOR_REQUESTS(&netif->tx, more_to_do);
70634 +#endif
70635 +
70636 + if (more_to_do) {
70637 + add_to_net_schedule_list_tail(netif);
70638 + maybe_schedule_tx_action();
70639 + }
70640 +}
70641 +
70642 +void netif_deschedule_work(netif_t *netif)
70643 +{
70644 + remove_from_net_schedule_list(netif);
70645 +}
70646 +
70647 +
70648 +static void tx_add_credit(netif_t *netif)
70649 +{
70650 + unsigned long max_burst, max_credit;
70651 +
70652 + /*
70653 + * Allow a burst big enough to transmit a jumbo packet of up to 128kB.
70654 + * Otherwise the interface can seize up due to insufficient credit.
70655 + */
70656 + max_burst = RING_GET_REQUEST(&netif->tx, netif->tx.req_cons)->size;
70657 + max_burst = min(max_burst, 131072UL);
70658 + max_burst = max(max_burst, netif->credit_bytes);
70659 +
70660 + /* Take care that adding a new chunk of credit doesn't wrap to zero. */
70661 + max_credit = netif->remaining_credit + netif->credit_bytes;
70662 + if (max_credit < netif->remaining_credit)
70663 + max_credit = ULONG_MAX; /* wrapped: clamp to ULONG_MAX */
70664 +
70665 + netif->remaining_credit = min(max_credit, max_burst);
70666 +}
70667 +
70668 +static void tx_credit_callback(unsigned long data)
70669 +{
70670 + netif_t *netif = (netif_t *)data;
70671 + tx_add_credit(netif);
70672 + netif_schedule_work(netif);
70673 +}
70674 +
70675 +inline static void net_tx_action_dealloc(void)
70676 +{
70677 + gnttab_unmap_grant_ref_t *gop;
70678 + u16 pending_idx;
70679 + PEND_RING_IDX dc, dp;
70680 + netif_t *netif;
70681 + int ret;
70682 +
70683 + dc = dealloc_cons;
70684 + dp = dealloc_prod;
70685 +
70686 + /* Ensure we see all indexes enqueued by netif_idx_release(). */
70687 + smp_rmb();
70688 +
70689 + /*
70690 + * Free up any grants we have finished using
70691 + */
70692 + gop = tx_unmap_ops;
70693 + while (dc != dp) {
70694 + pending_idx = dealloc_ring[MASK_PEND_IDX(dc++)];
70695 + gnttab_set_unmap_op(gop, idx_to_kaddr(pending_idx),
70696 + GNTMAP_host_map,
70697 + grant_tx_handle[pending_idx]);
70698 + gop++;
70699 + }
70700 + ret = HYPERVISOR_grant_table_op(
70701 + GNTTABOP_unmap_grant_ref, tx_unmap_ops, gop - tx_unmap_ops);
70702 + BUG_ON(ret);
70703 +
70704 + while (dealloc_cons != dp) {
70705 + pending_idx = dealloc_ring[MASK_PEND_IDX(dealloc_cons++)];
70706 +
70707 + netif = pending_tx_info[pending_idx].netif;
70708 +
70709 + make_tx_response(netif, &pending_tx_info[pending_idx].req,
70710 + NETIF_RSP_OKAY);
70711 +
70712 + pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx;
70713 +
70714 + netif_put(netif);
70715 + }
70716 +}
70717 +
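+/*
+ * Respond with an error for the offending request and for any further
+ * requests belonging to the same packet, advancing tx.req_cons to 'end'.
+ */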
70718 +static void netbk_tx_err(netif_t *netif, netif_tx_request_t *txp, RING_IDX end)
70719 +{
70720 + RING_IDX cons = netif->tx.req_cons;
70721 +
70722 + do {
70723 + make_tx_response(netif, txp, NETIF_RSP_ERROR);
70724 + if (cons >= end)
70725 + break;
70726 + txp = RING_GET_REQUEST(&netif->tx, cons++);
70727 + } while (1);
70728 + netif->tx.req_cons = cons;
70729 + netif_schedule_work(netif);
70730 + netif_put(netif);
70731 +}
70732 +
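+/*
+ * Walk the NETTXF_more_data chain following 'first', copying each fragment
+ * request into txp[] and sanity-checking it. Returns the fragment count,
+ * or the negated count on error so the caller can still skip past them.
+ */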
70733 +static int netbk_count_requests(netif_t *netif, netif_tx_request_t *first,
70734 + netif_tx_request_t *txp, int work_to_do)
70735 +{
70736 + RING_IDX cons = netif->tx.req_cons;
70737 + int frags = 0;
70738 +
70739 + if (!(first->flags & NETTXF_more_data))
70740 + return 0;
70741 +
70742 + do {
70743 + if (frags >= work_to_do) {
70744 + DPRINTK("Need more frags\n");
70745 + return -frags;
70746 + }
70747 +
70748 + if (unlikely(frags >= MAX_SKB_FRAGS)) {
70749 + DPRINTK("Too many frags\n");
70750 + return -frags;
70751 + }
70752 +
70753 + memcpy(txp, RING_GET_REQUEST(&netif->tx, cons + frags),
70754 + sizeof(*txp));
70755 + if (txp->size > first->size) {
70756 + DPRINTK("Frags galore\n");
70757 + return -frags;
70758 + }
70759 +
70760 + first->size -= txp->size;
70761 + frags++;
70762 +
70763 + if (unlikely((txp->offset + txp->size) > PAGE_SIZE)) {
70764 + DPRINTK("txp->offset: %x, size: %u\n",
70765 + txp->offset, txp->size);
70766 + return -frags;
70767 + }
70768 + } while ((txp++)->flags & NETTXF_more_data);
70769 +
70770 + return frags;
70771 +}
70772 +
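+/*
+ * Queue a grant-map operation for each remaining fragment request and
+ * record its pending_tx_info entry. The pending index is stashed in
+ * frag->page until netbk_fill_frags() swaps in the real page pointer.
+ */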
70773 +static gnttab_map_grant_ref_t *netbk_get_requests(netif_t *netif,
70774 + struct sk_buff *skb,
70775 + netif_tx_request_t *txp,
70776 + gnttab_map_grant_ref_t *mop)
70777 +{
70778 + struct skb_shared_info *shinfo = skb_shinfo(skb);
70779 + skb_frag_t *frags = shinfo->frags;
70780 + unsigned long pending_idx = *((u16 *)skb->data);
70781 + int i, start;
70782 +
70783 + /* Skip first skb fragment if it is on same page as header fragment. */
70784 + start = ((unsigned long)shinfo->frags[0].page == pending_idx);
70785 +
70786 + for (i = start; i < shinfo->nr_frags; i++, txp++) {
70787 + pending_idx = pending_ring[MASK_PEND_IDX(pending_cons++)];
70788 +
70789 + gnttab_set_map_op(mop++, idx_to_kaddr(pending_idx),
70790 + GNTMAP_host_map | GNTMAP_readonly,
70791 + txp->gref, netif->domid);
70792 +
70793 + memcpy(&pending_tx_info[pending_idx].req, txp, sizeof(*txp));
70794 + netif_get(netif);
70795 + pending_tx_info[pending_idx].netif = netif;
70796 + frags[i].page = (void *)pending_idx;
70797 + }
70798 +
70799 + return mop;
70800 +}
70801 +
70802 +static int netbk_tx_check_mop(struct sk_buff *skb,
70803 + gnttab_map_grant_ref_t **mopp)
70804 +{
70805 + gnttab_map_grant_ref_t *mop = *mopp;
70806 + int pending_idx = *((u16 *)skb->data);
70807 + netif_t *netif = pending_tx_info[pending_idx].netif;
70808 + netif_tx_request_t *txp;
70809 + struct skb_shared_info *shinfo = skb_shinfo(skb);
70810 + int nr_frags = shinfo->nr_frags;
70811 + int i, err, start;
70812 +
70813 + /* Check status of header. */
70814 + err = mop->status;
70815 + if (unlikely(err)) {
70816 + txp = &pending_tx_info[pending_idx].req;
70817 + make_tx_response(netif, txp, NETIF_RSP_ERROR);
70818 + pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx;
70819 + netif_put(netif);
70820 + } else {
70821 + set_phys_to_machine(
70822 + __pa(idx_to_kaddr(pending_idx)) >> PAGE_SHIFT,
70823 + FOREIGN_FRAME(mop->dev_bus_addr >> PAGE_SHIFT));
70824 + grant_tx_handle[pending_idx] = mop->handle;
70825 + }
70826 +
70827 + /* Skip first skb fragment if it is on same page as header fragment. */
70828 + start = ((unsigned long)shinfo->frags[0].page == pending_idx);
70829 +
70830 + for (i = start; i < nr_frags; i++) {
70831 + int j, newerr;
70832 +
70833 + pending_idx = (unsigned long)shinfo->frags[i].page;
70834 +
70835 + /* Check error status: if okay then remember grant handle. */
70836 + newerr = (++mop)->status;
70837 + if (likely(!newerr)) {
70838 + set_phys_to_machine(
70839 + __pa(idx_to_kaddr(pending_idx))>>PAGE_SHIFT,
70840 + FOREIGN_FRAME(mop->dev_bus_addr>>PAGE_SHIFT));
70841 + grant_tx_handle[pending_idx] = mop->handle;
70842 + /* Had a previous error? Invalidate this fragment. */
70843 + if (unlikely(err))
70844 + netif_idx_release(pending_idx);
70845 + continue;
70846 + }
70847 +
70848 + /* Error on this fragment: respond to client with an error. */
70849 + txp = &pending_tx_info[pending_idx].req;
70850 + make_tx_response(netif, txp, NETIF_RSP_ERROR);
70851 + pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx;
70852 + netif_put(netif);
70853 +
70854 + /* Not the first error? Preceding frags already invalidated. */
70855 + if (err)
70856 + continue;
70857 +
70858 + /* First error: invalidate header and preceding fragments. */
70859 + pending_idx = *((u16 *)skb->data);
70860 + netif_idx_release(pending_idx);
70861 + for (j = start; j < i; j++) {
70862 + pending_idx = (unsigned long)shinfo->frags[j].page;
70863 + netif_idx_release(pending_idx);
70864 + }
70865 +
70866 + /* Remember the error: invalidate all subsequent fragments. */
70867 + err = newerr;
70868 + }
70869 +
70870 + *mopp = mop + 1;
70871 + return err;
70872 +}
70873 +
70874 +static void netbk_fill_frags(struct sk_buff *skb)
70875 +{
70876 + struct skb_shared_info *shinfo = skb_shinfo(skb);
70877 + int nr_frags = shinfo->nr_frags;
70878 + int i;
70879 +
70880 + for (i = 0; i < nr_frags; i++) {
70881 + skb_frag_t *frag = shinfo->frags + i;
70882 + netif_tx_request_t *txp;
70883 + unsigned long pending_idx;
70884 +
70885 + pending_idx = (unsigned long)frag->page;
70886 + txp = &pending_tx_info[pending_idx].req;
70887 + frag->page = virt_to_page(idx_to_kaddr(pending_idx));
70888 + frag->size = txp->size;
70889 + frag->page_offset = txp->offset;
70890 +
70891 + skb->len += txp->size;
70892 + skb->data_len += txp->size;
70893 + skb->truesize += txp->size;
70894 + }
70895 +}
70896 +
70897 +int netbk_get_extras(netif_t *netif, struct netif_extra_info *extras,
70898 + int work_to_do)
70899 +{
70900 + struct netif_extra_info extra;
70901 + RING_IDX cons = netif->tx.req_cons;
70902 +
70903 + do {
70904 + if (unlikely(work_to_do-- <= 0)) {
70905 + DPRINTK("Missing extra info\n");
70906 + return -EBADR;
70907 + }
70908 +
70909 + memcpy(&extra, RING_GET_REQUEST(&netif->tx, cons),
70910 + sizeof(extra));
70911 + if (unlikely(!extra.type ||
70912 + extra.type >= XEN_NETIF_EXTRA_TYPE_MAX)) {
70913 + netif->tx.req_cons = ++cons;
70914 + DPRINTK("Invalid extra type: %d\n", extra.type);
70915 + return -EINVAL;
70916 + }
70917 +
70918 + memcpy(&extras[extra.type - 1], &extra, sizeof(extra));
70919 + netif->tx.req_cons = ++cons;
70920 + } while (extra.flags & XEN_NETIF_EXTRA_FLAG_MORE);
70921 +
70922 + return work_to_do;
70923 +}
70924 +
70925 +static int netbk_set_skb_gso(struct sk_buff *skb, struct netif_extra_info *gso)
70926 +{
70927 + if (!gso->u.gso.size) {
70928 + DPRINTK("GSO size must not be zero.\n");
70929 + return -EINVAL;
70930 + }
70931 +
70932 + /* Currently only TCPv4 S.O. is supported. */
70933 + if (gso->u.gso.type != XEN_NETIF_GSO_TYPE_TCPV4) {
70934 + DPRINTK("Bad GSO type %d.\n", gso->u.gso.type);
70935 + return -EINVAL;
70936 + }
70937 +
70938 + skb_shinfo(skb)->gso_size = gso->u.gso.size;
70939 + skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
70940 +
70941 + /* Header must be checked, and gso_segs computed. */
70942 + skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
70943 + skb_shinfo(skb)->gso_segs = 0;
70944 +
70945 + return 0;
70946 +}
70947 +
70948 +/* Called after netfront has transmitted */
70949 +static void net_tx_action(unsigned long unused)
70950 +{
70951 + struct list_head *ent;
70952 + struct sk_buff *skb;
70953 + netif_t *netif;
70954 + netif_tx_request_t txreq;
70955 + netif_tx_request_t txfrags[MAX_SKB_FRAGS];
70956 + struct netif_extra_info extras[XEN_NETIF_EXTRA_TYPE_MAX - 1];
70957 + u16 pending_idx;
70958 + RING_IDX i;
70959 + gnttab_map_grant_ref_t *mop;
70960 + unsigned int data_len;
70961 + int ret, work_to_do;
70962 +
70963 + if (dealloc_cons != dealloc_prod)
70964 + net_tx_action_dealloc();
70965 +
70966 + mop = tx_map_ops;
70967 + while (((NR_PENDING_REQS + MAX_SKB_FRAGS) < MAX_PENDING_REQS) &&
70968 + !list_empty(&net_schedule_list)) {
70969 + /* Get a netif from the list with work to do. */
70970 + ent = net_schedule_list.next;
70971 + netif = list_entry(ent, netif_t, list);
70972 + netif_get(netif);
70973 + remove_from_net_schedule_list(netif);
70974 +
70975 + RING_FINAL_CHECK_FOR_REQUESTS(&netif->tx, work_to_do);
70976 + if (!work_to_do) {
70977 + netif_put(netif);
70978 + continue;
70979 + }
70980 +
70981 + i = netif->tx.req_cons;
70982 + rmb(); /* Ensure that we see the request before we copy it. */
70983 + memcpy(&txreq, RING_GET_REQUEST(&netif->tx, i), sizeof(txreq));
70984 +
70985 + /* Credit-based scheduling. */
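+ /*
+ * If the request exceeds the remaining credit, top the credit up when
+ * a full credit_usec period has elapsed; if the packet is still too
+ * big, arm credit_timeout to retry once the next period ends.
+ */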
70986 + if (txreq.size > netif->remaining_credit) {
70987 + unsigned long now = jiffies;
70988 + unsigned long next_credit =
70989 + netif->credit_timeout.expires +
70990 + msecs_to_jiffies(netif->credit_usec / 1000);
70991 +
70992 + /* Timer could already be pending in rare cases. */
70993 + if (timer_pending(&netif->credit_timeout)) {
70994 + netif_put(netif);
70995 + continue;
70996 + }
70997 +
70998 + /* Passed the point where we can replenish credit? */
70999 + if (time_after_eq(now, next_credit)) {
71000 + netif->credit_timeout.expires = now;
71001 + tx_add_credit(netif);
71002 + }
71003 +
71004 + /* Still too big to send right now? Set a callback. */
71005 + if (txreq.size > netif->remaining_credit) {
71006 + netif->credit_timeout.data =
71007 + (unsigned long)netif;
71008 + netif->credit_timeout.function =
71009 + tx_credit_callback;
71010 + __mod_timer(&netif->credit_timeout,
71011 + next_credit);
71012 + netif_put(netif);
71013 + continue;
71014 + }
71015 + }
71016 + netif->remaining_credit -= txreq.size;
71017 +
71018 + work_to_do--;
71019 + netif->tx.req_cons = ++i;
71020 +
71021 + memset(extras, 0, sizeof(extras));
71022 + if (txreq.flags & NETTXF_extra_info) {
71023 + work_to_do = netbk_get_extras(netif, extras,
71024 + work_to_do);
71025 + i = netif->tx.req_cons;
71026 + if (unlikely(work_to_do < 0)) {
71027 + netbk_tx_err(netif, &txreq, i);
71028 + continue;
71029 + }
71030 + }
71031 +
71032 + ret = netbk_count_requests(netif, &txreq, txfrags, work_to_do);
71033 + if (unlikely(ret < 0)) {
71034 + netbk_tx_err(netif, &txreq, i - ret);
71035 + continue;
71036 + }
71037 + i += ret;
71038 +
71039 + if (unlikely(txreq.size < ETH_HLEN)) {
71040 + DPRINTK("Bad packet size: %d\n", txreq.size);
71041 + netbk_tx_err(netif, &txreq, i);
71042 + continue;
71043 + }
71044 +
71045 + /* The request must not cross a page boundary, as the payload mustn't fragment. */
71046 + if (unlikely((txreq.offset + txreq.size) > PAGE_SIZE)) {
71047 + DPRINTK("txreq.offset: %x, size: %u, end: %lu\n",
71048 + txreq.offset, txreq.size,
71049 + (txreq.offset &~PAGE_MASK) + txreq.size);
71050 + netbk_tx_err(netif, &txreq, i);
71051 + continue;
71052 + }
71053 +
71054 + pending_idx = pending_ring[MASK_PEND_IDX(pending_cons)];
71055 +
71056 + data_len = (txreq.size > PKT_PROT_LEN &&
71057 + ret < MAX_SKB_FRAGS) ?
71058 + PKT_PROT_LEN : txreq.size;
71059 +
71060 + skb = alloc_skb(data_len + 16 + NET_IP_ALIGN,
71061 + GFP_ATOMIC | __GFP_NOWARN);
71062 + if (unlikely(skb == NULL)) {
71063 + DPRINTK("Can't allocate a skb in start_xmit.\n");
71064 + netbk_tx_err(netif, &txreq, i);
71065 + break;
71066 + }
71067 +
71068 + /* Packets passed to netif_rx() must have some headroom. */
71069 + skb_reserve(skb, 16 + NET_IP_ALIGN);
71070 +
71071 + if (extras[XEN_NETIF_EXTRA_TYPE_GSO - 1].type) {
71072 + struct netif_extra_info *gso;
71073 + gso = &extras[XEN_NETIF_EXTRA_TYPE_GSO - 1];
71074 +
71075 + if (netbk_set_skb_gso(skb, gso)) {
71076 + kfree_skb(skb);
71077 + netbk_tx_err(netif, &txreq, i);
71078 + continue;
71079 + }
71080 + }
71081 +
71082 + gnttab_set_map_op(mop, idx_to_kaddr(pending_idx),
71083 + GNTMAP_host_map | GNTMAP_readonly,
71084 + txreq.gref, netif->domid);
71085 + mop++;
71086 +
71087 + memcpy(&pending_tx_info[pending_idx].req,
71088 + &txreq, sizeof(txreq));
71089 + pending_tx_info[pending_idx].netif = netif;
71090 + *((u16 *)skb->data) = pending_idx;
71091 +
71092 + __skb_put(skb, data_len);
71093 +
71094 + skb_shinfo(skb)->nr_frags = ret;
71095 + if (data_len < txreq.size) {
71096 + skb_shinfo(skb)->nr_frags++;
71097 + skb_shinfo(skb)->frags[0].page =
71098 + (void *)(unsigned long)pending_idx;
71099 + } else {
71100 + /* Discriminate from any valid pending_idx value. */
71101 + skb_shinfo(skb)->frags[0].page = (void *)~0UL;
71102 + }
71103 +
71104 + __skb_queue_tail(&tx_queue, skb);
71105 +
71106 + pending_cons++;
71107 +
71108 + mop = netbk_get_requests(netif, skb, txfrags, mop);
71109 +
71110 + netif->tx.req_cons = i;
71111 + netif_schedule_work(netif);
71112 +
71113 + if ((mop - tx_map_ops) >= ARRAY_SIZE(tx_map_ops))
71114 + break;
71115 + }
71116 +
71117 + if (mop == tx_map_ops)
71118 + return;
71119 +
71120 + ret = HYPERVISOR_grant_table_op(
71121 + GNTTABOP_map_grant_ref, tx_map_ops, mop - tx_map_ops);
71122 + BUG_ON(ret);
71123 +
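+ /*
+ * The grant maps have completed. Walk the queued skbs: check each
+ * map result, copy the protocol headers into the linear area, fill
+ * in the fragment pages and hand the packet to the network stack.
+ */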
71124 + mop = tx_map_ops;
71125 + while ((skb = __skb_dequeue(&tx_queue)) != NULL) {
71126 + netif_tx_request_t *txp;
71127 +
71128 + pending_idx = *((u16 *)skb->data);
71129 + netif = pending_tx_info[pending_idx].netif;
71130 + txp = &pending_tx_info[pending_idx].req;
71131 +
71132 + /* Check the remap error code. */
71133 + if (unlikely(netbk_tx_check_mop(skb, &mop))) {
71134 + printk(KERN_ALERT "#### netback grant fails\n");
71135 + skb_shinfo(skb)->nr_frags = 0;
71136 + kfree_skb(skb);
71137 + continue;
71138 + }
71139 +
71140 + data_len = skb->len;
71141 + memcpy(skb->data,
71142 + (void *)(idx_to_kaddr(pending_idx)|txp->offset),
71143 + data_len);
71144 + if (data_len < txp->size) {
71145 + /* Append the packet payload as a fragment. */
71146 + txp->offset += data_len;
71147 + txp->size -= data_len;
71148 + } else {
71149 + /* Schedule a response immediately. */
71150 + netif_idx_release(pending_idx);
71151 + }
71152 +
71153 + /*
71154 + * Old frontends do not assert data_validated but we
71155 + * can infer it from csum_blank so test both flags.
71156 + */
71157 + if (txp->flags & (NETTXF_data_validated|NETTXF_csum_blank)) {
71158 + skb->ip_summed = CHECKSUM_UNNECESSARY;
71159 + skb->proto_data_valid = 1;
71160 + } else {
71161 + skb->ip_summed = CHECKSUM_NONE;
71162 + skb->proto_data_valid = 0;
71163 + }
71164 + skb->proto_csum_blank = !!(txp->flags & NETTXF_csum_blank);
71165 +
71166 + netbk_fill_frags(skb);
71167 +
71168 + skb->dev = netif->dev;
71169 + skb->protocol = eth_type_trans(skb, skb->dev);
71170 +
71171 + netif->stats.rx_bytes += skb->len;
71172 + netif->stats.rx_packets++;
71173 +
71174 + netif_rx(skb);
71175 + netif->dev->last_rx = jiffies;
71176 + }
71177 +}
71178 +
71179 +static void netif_idx_release(u16 pending_idx)
71180 +{
71181 + static DEFINE_SPINLOCK(_lock);
71182 + unsigned long flags;
71183 +
71184 + spin_lock_irqsave(&_lock, flags);
71185 + dealloc_ring[MASK_PEND_IDX(dealloc_prod)] = pending_idx;
71186 + /* Sync with net_tx_action_dealloc: insert idx /then/ incr producer. */
71187 + smp_wmb();
71188 + dealloc_prod++;
71189 + spin_unlock_irqrestore(&_lock, flags);
71190 +
71191 + tasklet_schedule(&net_tx_tasklet);
71192 +}
71193 +
71194 +static void netif_page_release(struct page *page)
71195 +{
71196 + /* Ready for next use. */
71197 + set_page_count(page, 1);
71198 +
71199 + netif_idx_release(page->index);
71200 +}
71201 +
71202 +irqreturn_t netif_be_int(int irq, void *dev_id, struct pt_regs *regs)
71203 +{
71204 + netif_t *netif = dev_id;
71205 +
71206 + add_to_net_schedule_list_tail(netif);
71207 + maybe_schedule_tx_action();
71208 +
71209 + if (netif_schedulable(netif->dev) && !netbk_queue_full(netif))
71210 + netif_wake_queue(netif->dev);
71211 +
71212 + return IRQ_HANDLED;
71213 +}
71214 +
71215 +static void make_tx_response(netif_t *netif,
71216 + netif_tx_request_t *txp,
71217 + s8 st)
71218 +{
71219 + RING_IDX i = netif->tx.rsp_prod_pvt;
71220 + netif_tx_response_t *resp;
71221 + int notify;
71222 +
71223 + resp = RING_GET_RESPONSE(&netif->tx, i);
71224 + resp->id = txp->id;
71225 + resp->status = st;
71226 +
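+ /*
+ * An extra_info request consumed a ring slot of its own, so emit a
+ * NETIF_RSP_NULL response to keep requests and responses in step.
+ */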
71227 + if (txp->flags & NETTXF_extra_info)
71228 + RING_GET_RESPONSE(&netif->tx, ++i)->status = NETIF_RSP_NULL;
71229 +
71230 + netif->tx.rsp_prod_pvt = ++i;
71231 + RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&netif->tx, notify);
71232 + if (notify)
71233 + notify_remote_via_irq(netif->irq);
71234 +
71235 +#ifdef CONFIG_XEN_NETDEV_PIPELINED_TRANSMITTER
71236 + if (i == netif->tx.req_cons) {
71237 + int more_to_do;
71238 + RING_FINAL_CHECK_FOR_REQUESTS(&netif->tx, more_to_do);
71239 + if (more_to_do)
71240 + add_to_net_schedule_list_tail(netif);
71241 + }
71242 +#endif
71243 +}
71244 +
71245 +static netif_rx_response_t *make_rx_response(netif_t *netif,
71246 + u16 id,
71247 + s8 st,
71248 + u16 offset,
71249 + u16 size,
71250 + u16 flags)
71251 +{
71252 + RING_IDX i = netif->rx.rsp_prod_pvt;
71253 + netif_rx_response_t *resp;
71254 +
71255 + resp = RING_GET_RESPONSE(&netif->rx, i);
71256 + resp->offset = offset;
71257 + resp->flags = flags;
71258 + resp->id = id;
71259 + resp->status = (s16)size;
71260 + if (st < 0)
71261 + resp->status = (s16)st;
71262 +
71263 + netif->rx.rsp_prod_pvt = ++i;
71264 +
71265 + return resp;
71266 +}
71267 +
71268 +#ifdef NETBE_DEBUG_INTERRUPT
71269 +static irqreturn_t netif_be_dbg(int irq, void *dev_id, struct pt_regs *regs)
71270 +{
71271 + struct list_head *ent;
71272 + netif_t *netif;
71273 + int i = 0;
71274 +
71275 + printk(KERN_ALERT "netif_schedule_list:\n");
71276 + spin_lock_irq(&net_schedule_list_lock);
71277 +
71278 + list_for_each (ent, &net_schedule_list) {
71279 + netif = list_entry(ent, netif_t, list);
71280 + printk(KERN_ALERT " %d: private(rx_req_cons=%08x "
71281 + "rx_resp_prod=%08x\n",
71282 + i, netif->rx.req_cons, netif->rx.rsp_prod_pvt);
71283 + printk(KERN_ALERT " tx_req_cons=%08x tx_resp_prod=%08x)\n",
71284 + netif->tx.req_cons, netif->tx.rsp_prod_pvt);
71285 + printk(KERN_ALERT " shared(rx_req_prod=%08x "
71286 + "rx_resp_prod=%08x\n",
71287 + netif->rx.sring->req_prod, netif->rx.sring->rsp_prod);
71288 + printk(KERN_ALERT " rx_event=%08x tx_req_prod=%08x\n",
71289 + netif->rx.sring->rsp_event, netif->tx.sring->req_prod);
71290 + printk(KERN_ALERT " tx_resp_prod=%08x, tx_event=%08x)\n",
71291 + netif->tx.sring->rsp_prod, netif->tx.sring->rsp_event);
71292 + i++;
71293 + }
71294 +
71295 + spin_unlock_irq(&net_schedule_list_lock);
71296 + printk(KERN_ALERT " ** End of netif_schedule_list **\n");
71297 +
71298 + return IRQ_HANDLED;
71299 +}
71300 +#endif
71301 +
71302 +static int __init netback_init(void)
71303 +{
71304 + int i;
71305 + struct page *page;
71306 +
71307 + if (!is_running_on_xen())
71308 + return -ENODEV;
71309 +
71310 + /* We can increase reservation by this much in net_rx_action(). */
71311 + balloon_update_driver_allowance(NET_RX_RING_SIZE);
71312 +
71313 + skb_queue_head_init(&rx_queue);
71314 + skb_queue_head_init(&tx_queue);
71315 +
71316 + init_timer(&net_timer);
71317 + net_timer.data = 0;
71318 + net_timer.function = net_alarm;
71319 +
71320 + mmap_pages = alloc_empty_pages_and_pagevec(MAX_PENDING_REQS);
71321 + if (mmap_pages == NULL) {
71322 + printk("%s: out of memory\n", __FUNCTION__);
71323 + return -ENOMEM;
71324 + }
71325 +
71326 + for (i = 0; i < MAX_PENDING_REQS; i++) {
71327 + page = mmap_pages[i];
71328 + SetPageForeign(page, netif_page_release);
71329 + page->index = i;
71330 + }
71331 +
71332 + pending_cons = 0;
71333 + pending_prod = MAX_PENDING_REQS;
71334 + for (i = 0; i < MAX_PENDING_REQS; i++)
71335 + pending_ring[i] = i;
71336 +
71337 + spin_lock_init(&net_schedule_list_lock);
71338 + INIT_LIST_HEAD(&net_schedule_list);
71339 +
71340 + netif_xenbus_init();
71341 +
71342 +#ifdef NETBE_DEBUG_INTERRUPT
71343 + (void)bind_virq_to_irqhandler(VIRQ_DEBUG,
71344 + 0,
71345 + netif_be_dbg,
71346 + SA_SHIRQ,
71347 + "net-be-dbg",
71348 + &netif_be_dbg);
71349 +#endif
71350 +
71351 + return 0;
71352 +}
71353 +
71354 +module_init(netback_init);
71355 +
71356 +MODULE_LICENSE("Dual BSD/GPL");
71357 diff -Nur linux-2.6.16.33-noxen/drivers/xen/netback/xenbus.c linux-2.6.16.33/drivers/xen/netback/xenbus.c
71358 --- linux-2.6.16.33-noxen/drivers/xen/netback/xenbus.c 1970-01-01 00:00:00.000000000 +0000
71359 +++ linux-2.6.16.33/drivers/xen/netback/xenbus.c 2007-01-08 15:00:45.000000000 +0000
71360 @@ -0,0 +1,450 @@
71361 +/* Xenbus code for netif backend
71362 + Copyright (C) 2005 Rusty Russell <rusty@rustcorp.com.au>
71363 + Copyright (C) 2005 XenSource Ltd
71364 +
71365 + This program is free software; you can redistribute it and/or modify
71366 + it under the terms of the GNU General Public License as published by
71367 + the Free Software Foundation; either version 2 of the License, or
71368 + (at your option) any later version.
71369 +
71370 + This program is distributed in the hope that it will be useful,
71371 + but WITHOUT ANY WARRANTY; without even the implied warranty of
71372 + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
71373 + GNU General Public License for more details.
71374 +
71375 + You should have received a copy of the GNU General Public License
71376 + along with this program; if not, write to the Free Software
71377 + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
71378 +*/
71379 +
71380 +#include <stdarg.h>
71381 +#include <linux/module.h>
71382 +#include <xen/xenbus.h>
71383 +#include "common.h"
71384 +
71385 +#if 0
71386 +#undef DPRINTK
71387 +#define DPRINTK(fmt, args...) \
71388 + printk("netback/xenbus (%s:%d) " fmt ".\n", __FUNCTION__, __LINE__, ##args)
71389 +#endif
71390 +
71391 +struct backend_info {
71392 + struct xenbus_device *dev;
71393 + netif_t *netif;
71394 + enum xenbus_state frontend_state;
71395 +};
71396 +
71397 +static int connect_rings(struct backend_info *);
71398 +static void connect(struct backend_info *);
71399 +static void backend_create_netif(struct backend_info *be);
71400 +
71401 +static int netback_remove(struct xenbus_device *dev)
71402 +{
71403 + struct backend_info *be = dev->dev.driver_data;
71404 +
71405 + if (be->netif) {
71406 + netif_disconnect(be->netif);
71407 + be->netif = NULL;
71408 + }
71409 + kfree(be);
71410 + dev->dev.driver_data = NULL;
71411 + return 0;
71412 +}
71413 +
71414 +
71415 +/**
71416 + * Entry point to this code when a new device is created. Allocate the basic
71417 + * structures and switch to InitWait.
71418 + */
71419 +static int netback_probe(struct xenbus_device *dev,
71420 + const struct xenbus_device_id *id)
71421 +{
71422 + const char *message;
71423 + struct xenbus_transaction xbt;
71424 + int err;
71425 + struct backend_info *be = kzalloc(sizeof(struct backend_info),
71426 + GFP_KERNEL);
71427 + if (!be) {
71428 + xenbus_dev_fatal(dev, -ENOMEM,
71429 + "allocating backend structure");
71430 + return -ENOMEM;
71431 + }
71432 +
71433 + be->dev = dev;
71434 + dev->dev.driver_data = be;
71435 +
71436 + do {
71437 + err = xenbus_transaction_start(&xbt);
71438 + if (err) {
71439 + xenbus_dev_fatal(dev, err, "starting transaction");
71440 + goto fail;
71441 + }
71442 +
71443 + err = xenbus_printf(xbt, dev->nodename, "feature-sg", "%d", 1);
71444 + if (err) {
71445 + message = "writing feature-sg";
71446 + goto abort_transaction;
71447 + }
71448 +
71449 + err = xenbus_printf(xbt, dev->nodename, "feature-gso-tcpv4",
71450 + "%d", 1);
71451 + if (err) {
71452 + message = "writing feature-gso-tcpv4";
71453 + goto abort_transaction;
71454 + }
71455 +
71456 + /* We support rx-copy path. */
71457 + err = xenbus_printf(xbt, dev->nodename,
71458 + "feature-rx-copy", "%d", 1);
71459 + if (err) {
71460 + message = "writing feature-rx-copy";
71461 + goto abort_transaction;
71462 + }
71463 +
71464 + /*
71465 + * We don't support the rx-flip path (except for old guests that don't
71466 + * grok this feature flag).
71467 + */
71468 + err = xenbus_printf(xbt, dev->nodename,
71469 + "feature-rx-flip", "%d", 0);
71470 + if (err) {
71471 + message = "writing feature-rx-flip";
71472 + goto abort_transaction;
71473 + }
71474 +
71475 + err = xenbus_transaction_end(xbt, 0);
71476 + } while (err == -EAGAIN);
71477 +
71478 + if (err) {
71479 + xenbus_dev_fatal(dev, err, "completing transaction");
71480 + goto fail;
71481 + }
71482 +
71483 + err = xenbus_switch_state(dev, XenbusStateInitWait);
71484 + if (err)
71485 + goto fail;
71486 +
71487 + /* This kicks hotplug scripts, so do it immediately. */
71488 + backend_create_netif(be);
71489 +
71490 + return 0;
71491 +
71492 +abort_transaction:
71493 + xenbus_transaction_end(xbt, 1);
71494 + xenbus_dev_fatal(dev, err, "%s", message);
71495 +fail:
71496 + DPRINTK("failed");
71497 + netback_remove(dev);
71498 + return err;
71499 +}
71500 +
71501 +
71502 +/**
71503 + * Handle the creation of the hotplug script environment. We add the script
71504 + * and vif variables to the environment, for the benefit of the vif-* hotplug
71505 + * scripts.
71506 + */
71507 +static int netback_uevent(struct xenbus_device *xdev, char **envp,
71508 + int num_envp, char *buffer, int buffer_size)
71509 +{
71510 + struct backend_info *be = xdev->dev.driver_data;
71511 + netif_t *netif = be->netif;
71512 + int i = 0, length = 0;
71513 + char *val;
71514 +
71515 + DPRINTK("netback_uevent");
71516 +
71517 + val = xenbus_read(XBT_NIL, xdev->nodename, "script", NULL);
71518 + if (IS_ERR(val)) {
71519 + int err = PTR_ERR(val);
71520 + xenbus_dev_fatal(xdev, err, "reading script");
71521 + return err;
71522 + }
71523 + else {
71524 + add_uevent_var(envp, num_envp, &i, buffer, buffer_size,
71525 + &length, "script=%s", val);
71526 + kfree(val);
71527 + }
71528 +
71529 + add_uevent_var(envp, num_envp, &i, buffer, buffer_size, &length,
71530 + "vif=%s", netif->dev->name);
71531 +
71532 + envp[i] = NULL;
71533 +
71534 + return 0;
71535 +}
71536 +
71537 +
71538 +static void backend_create_netif(struct backend_info *be)
71539 +{
71540 + int err;
71541 + long handle;
71542 + struct xenbus_device *dev = be->dev;
71543 +
71544 + if (be->netif != NULL)
71545 + return;
71546 +
71547 + err = xenbus_scanf(XBT_NIL, dev->nodename, "handle", "%li", &handle);
71548 + if (err != 1) {
71549 + xenbus_dev_fatal(dev, err, "reading handle");
71550 + return;
71551 + }
71552 +
71553 + be->netif = netif_alloc(dev->otherend_id, handle);
71554 + if (IS_ERR(be->netif)) {
71555 + err = PTR_ERR(be->netif);
71556 + be->netif = NULL;
71557 + xenbus_dev_fatal(dev, err, "creating interface");
71558 + return;
71559 + }
71560 +
71561 + kobject_uevent(&dev->dev.kobj, KOBJ_ONLINE);
71562 +}
71563 +
71564 +
71565 +/**
71566 + * Callback received when the frontend's state changes.
71567 + */
71568 +static void frontend_changed(struct xenbus_device *dev,
71569 + enum xenbus_state frontend_state)
71570 +{
71571 + struct backend_info *be = dev->dev.driver_data;
71572 +
71573 + DPRINTK("%s", xenbus_strstate(frontend_state));
71574 +
71575 + be->frontend_state = frontend_state;
71576 +
71577 + switch (frontend_state) {
71578 + case XenbusStateInitialising:
71579 + if (dev->state == XenbusStateClosed) {
71580 + printk("%s: %s: prepare for reconnect\n",
71581 + __FUNCTION__, dev->nodename);
71582 + if (be->netif) {
71583 + netif_disconnect(be->netif);
71584 + be->netif = NULL;
71585 + }
71586 + xenbus_switch_state(dev, XenbusStateInitWait);
71587 + }
71588 + break;
71589 +
71590 + case XenbusStateInitialised:
71591 + break;
71592 +
71593 + case XenbusStateConnected:
71594 + backend_create_netif(be);
71595 + if (be->netif)
71596 + connect(be);
71597 + break;
71598 +
71599 + case XenbusStateClosing:
71600 + xenbus_switch_state(dev, XenbusStateClosing);
71601 + break;
71602 +
71603 + case XenbusStateClosed:
71604 + xenbus_switch_state(dev, XenbusStateClosed);
71605 + if (xenbus_dev_is_online(dev))
71606 + break;
71607 + /* fall through if not online */
71608 + case XenbusStateUnknown:
71609 + if (be->netif != NULL)
71610 + kobject_uevent(&dev->dev.kobj, KOBJ_OFFLINE);
71611 + device_unregister(&dev->dev);
71612 + break;
71613 +
71614 + default:
71615 + xenbus_dev_fatal(dev, -EINVAL, "saw state %d at frontend",
71616 + frontend_state);
71617 + break;
71618 + }
71619 +}
71620 +
71621 +
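+/*
+ * Read the optional xenstore "rate" node, expected as "<bytes>,<usec>":
+ * grant <bytes> bytes of transmit credit every <usec> microseconds.
+ * Anything else leaves the interface unlimited.
+ */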
71622 +static void xen_net_read_rate(struct xenbus_device *dev,
71623 + unsigned long *bytes, unsigned long *usec)
71624 +{
71625 + char *s, *e;
71626 + unsigned long b, u;
71627 + char *ratestr;
71628 +
71629 + /* Default to unlimited bandwidth. */
71630 + *bytes = ~0UL;
71631 + *usec = 0;
71632 +
71633 + ratestr = xenbus_read(XBT_NIL, dev->nodename, "rate", NULL);
71634 + if (IS_ERR(ratestr))
71635 + return;
71636 +
71637 + s = ratestr;
71638 + b = simple_strtoul(s, &e, 10);
71639 + if ((s == e) || (*e != ','))
71640 + goto fail;
71641 +
71642 + s = e + 1;
71643 + u = simple_strtoul(s, &e, 10);
71644 + if ((s == e) || (*e != '\0'))
71645 + goto fail;
71646 +
71647 + *bytes = b;
71648 + *usec = u;
71649 +
71650 + kfree(ratestr);
71651 + return;
71652 +
71653 + fail:
71654 + WPRINTK("Failed to parse network rate limit. Traffic unlimited.\n");
71655 + kfree(ratestr);
71656 +}
71657 +
71658 +static int xen_net_read_mac(struct xenbus_device *dev, u8 mac[])
71659 +{
71660 + char *s, *e, *macstr;
71661 + int i;
71662 +
71663 + macstr = s = xenbus_read(XBT_NIL, dev->nodename, "mac", NULL);
71664 + if (IS_ERR(macstr))
71665 + return PTR_ERR(macstr);
71666 +
71667 + for (i = 0; i < ETH_ALEN; i++) {
71668 + mac[i] = simple_strtoul(s, &e, 16);
71669 + if ((s == e) || (*e != ((i == ETH_ALEN-1) ? '\0' : ':'))) {
71670 + kfree(macstr);
71671 + return -ENOENT;
71672 + }
71673 + s = e+1;
71674 + }
71675 +
71676 + kfree(macstr);
71677 + return 0;
71678 +}
71679 +
71680 +static void connect(struct backend_info *be)
71681 +{
71682 + int err;
71683 + struct xenbus_device *dev = be->dev;
71684 +
71685 + err = connect_rings(be);
71686 + if (err)
71687 + return;
71688 +
71689 + err = xen_net_read_mac(dev, be->netif->fe_dev_addr);
71690 + if (err) {
71691 + xenbus_dev_fatal(dev, err, "parsing %s/mac", dev->nodename);
71692 + return;
71693 + }
71694 +
71695 + xen_net_read_rate(dev, &be->netif->credit_bytes,
71696 + &be->netif->credit_usec);
71697 + be->netif->remaining_credit = be->netif->credit_bytes;
71698 +
71699 + xenbus_switch_state(dev, XenbusStateConnected);
71700 +
71701 + /* May not get a kick from the frontend, so start the tx_queue now. */
71702 + if (!netbk_can_queue(be->netif->dev))
71703 + netif_wake_queue(be->netif->dev);
71704 +}
71705 +
71706 +
71707 +static int connect_rings(struct backend_info *be)
71708 +{
71709 + struct xenbus_device *dev = be->dev;
71710 + unsigned long tx_ring_ref, rx_ring_ref;
71711 + unsigned int evtchn, rx_copy;
71712 + int err;
71713 + int val;
71714 +
71715 + DPRINTK("");
71716 +
71717 + err = xenbus_gather(XBT_NIL, dev->otherend,
71718 + "tx-ring-ref", "%lu", &tx_ring_ref,
71719 + "rx-ring-ref", "%lu", &rx_ring_ref,
71720 + "event-channel", "%u", &evtchn, NULL);
71721 + if (err) {
71722 + xenbus_dev_fatal(dev, err,
71723 + "reading %s/ring-ref and event-channel",
71724 + dev->otherend);
71725 + return err;
71726 + }
71727 +
71728 + err = xenbus_scanf(XBT_NIL, dev->otherend, "request-rx-copy", "%u",
71729 + &rx_copy);
71730 + if (err == -ENOENT) {
71731 + err = 0;
71732 + rx_copy = 0;
71733 + }
71734 + if (err < 0) {
71735 + xenbus_dev_fatal(dev, err, "reading %s/request-rx-copy",
71736 + dev->otherend);
71737 + return err;
71738 + }
71739 + be->netif->copying_receiver = !!rx_copy;
71740 +
71741 + if (be->netif->dev->tx_queue_len != 0) {
71742 + if (xenbus_scanf(XBT_NIL, dev->otherend,
71743 + "feature-rx-notify", "%d", &val) < 0)
71744 + val = 0;
71745 + if (val)
71746 + be->netif->can_queue = 1;
71747 + else
71748 + /* Must be non-zero for pfifo_fast to work. */
71749 + be->netif->dev->tx_queue_len = 1;
71750 + }
71751 +
71752 + if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-sg", "%d", &val) < 0)
71753 + val = 0;
71754 + if (val) {
71755 + be->netif->features |= NETIF_F_SG;
71756 + be->netif->dev->features |= NETIF_F_SG;
71757 + }
71758 +
71759 + if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-gso-tcpv4", "%d",
71760 + &val) < 0)
71761 + val = 0;
71762 + if (val) {
71763 + be->netif->features |= NETIF_F_TSO;
71764 + be->netif->dev->features |= NETIF_F_TSO;
71765 + }
71766 +
71767 + if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-no-csum-offload",
71768 + "%d", &val) < 0)
71769 + val = 0;
71770 + if (val) {
71771 + be->netif->features &= ~NETIF_F_IP_CSUM;
71772 + be->netif->dev->features &= ~NETIF_F_IP_CSUM;
71773 + }
71774 +
71775 + /* Map the shared frame, irq etc. */
71776 + err = netif_map(be->netif, tx_ring_ref, rx_ring_ref, evtchn);
71777 + if (err) {
71778 + xenbus_dev_fatal(dev, err,
71779 + "mapping shared-frames %lu/%lu port %u",
71780 + tx_ring_ref, rx_ring_ref, evtchn);
71781 + return err;
71782 + }
71783 + return 0;
71784 +}
71785 +
71786 +
71787 +/* ** Driver Registration ** */
71788 +
71789 +
71790 +static struct xenbus_device_id netback_ids[] = {
71791 + { "vif" },
71792 + { "" }
71793 +};
71794 +
71795 +
71796 +static struct xenbus_driver netback = {
71797 + .name = "vif",
71798 + .owner = THIS_MODULE,
71799 + .ids = netback_ids,
71800 + .probe = netback_probe,
71801 + .remove = netback_remove,
71802 + .uevent = netback_uevent,
71803 + .otherend_changed = frontend_changed,
71804 +};
71805 +
71806 +
71807 +void netif_xenbus_init(void)
71808 +{
71809 + xenbus_register_backend(&netback);
71810 +}
71811 diff -Nur linux-2.6.16.33-noxen/drivers/xen/netfront/Makefile linux-2.6.16.33/drivers/xen/netfront/Makefile
71812 --- linux-2.6.16.33-noxen/drivers/xen/netfront/Makefile 1970-01-01 00:00:00.000000000 +0000
71813 +++ linux-2.6.16.33/drivers/xen/netfront/Makefile 2007-01-08 15:00:45.000000000 +0000
71814 @@ -0,0 +1,4 @@
71815 +
71816 +obj-$(CONFIG_XEN_NETDEV_FRONTEND) := xennet.o
71817 +
71818 +xennet-objs := netfront.o
71819 diff -Nur linux-2.6.16.33-noxen/drivers/xen/netfront/netfront.c linux-2.6.16.33/drivers/xen/netfront/netfront.c
71820 --- linux-2.6.16.33-noxen/drivers/xen/netfront/netfront.c 1970-01-01 00:00:00.000000000 +0000
71821 +++ linux-2.6.16.33/drivers/xen/netfront/netfront.c 2007-01-08 15:00:45.000000000 +0000
71822 @@ -0,0 +1,2114 @@
71823 +/******************************************************************************
71824 + * Virtual network driver for conversing with remote driver backends.
71825 + *
71826 + * Copyright (c) 2002-2005, K A Fraser
71827 + * Copyright (c) 2005, XenSource Ltd
71828 + *
71829 + * This program is free software; you can redistribute it and/or
71830 + * modify it under the terms of the GNU General Public License version 2
71831 + * as published by the Free Software Foundation; or, when distributed
71832 + * separately from the Linux kernel or incorporated into other
71833 + * software packages, subject to the following license:
71834 + *
71835 + * Permission is hereby granted, free of charge, to any person obtaining a copy
71836 + * of this source file (the "Software"), to deal in the Software without
71837 + * restriction, including without limitation the rights to use, copy, modify,
71838 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
71839 + * and to permit persons to whom the Software is furnished to do so, subject to
71840 + * the following conditions:
71841 + *
71842 + * The above copyright notice and this permission notice shall be included in
71843 + * all copies or substantial portions of the Software.
71844 + *
71845 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
71846 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
71847 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
71848 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
71849 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
71850 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
71851 + * IN THE SOFTWARE.
71852 + */
71853 +
71854 +#include <linux/config.h>
71855 +#include <linux/module.h>
71856 +#include <linux/version.h>
71857 +#include <linux/kernel.h>
71858 +#include <linux/sched.h>
71859 +#include <linux/slab.h>
71860 +#include <linux/string.h>
71861 +#include <linux/errno.h>
71862 +#include <linux/netdevice.h>
71863 +#include <linux/inetdevice.h>
71864 +#include <linux/etherdevice.h>
71865 +#include <linux/skbuff.h>
71866 +#include <linux/init.h>
71867 +#include <linux/bitops.h>
71868 +#include <linux/ethtool.h>
71869 +#include <linux/in.h>
71870 +#include <linux/if_ether.h>
71871 +#include <linux/io.h>
71872 +#include <linux/moduleparam.h>
71873 +#include <net/sock.h>
71874 +#include <net/pkt_sched.h>
71875 +#include <net/arp.h>
71876 +#include <net/route.h>
71877 +#include <asm/uaccess.h>
71878 +#include <xen/evtchn.h>
71879 +#include <xen/xenbus.h>
71880 +#include <xen/interface/io/netif.h>
71881 +#include <xen/interface/memory.h>
71882 +#include <xen/balloon.h>
71883 +#include <asm/page.h>
71884 +#include <asm/maddr.h>
71885 +#include <asm/uaccess.h>
71886 +#include <xen/interface/grant_table.h>
71887 +#include <xen/gnttab.h>
71888 +
71889 +#ifdef HAVE_XEN_PLATFORM_COMPAT_H
71890 +#include <xen/platform-compat.h>
71891 +#endif
71892 +
71893 +/*
71894 + * Mutually-exclusive module options to select receive data path:
71895 + * rx_copy : Packets are copied by network backend into local memory
71896 + * rx_flip : Page containing packet data is transferred to our ownership
71897 + * For fully-virtualised guests there is no option - copying must be used.
71898 + * For paravirtualised guests, flipping is the default.
71899 + */
71900 +#ifdef CONFIG_XEN
71901 +static int MODPARM_rx_copy = 0;
71902 +module_param_named(rx_copy, MODPARM_rx_copy, bool, 0);
71903 +MODULE_PARM_DESC(rx_copy, "Copy packets from network card (rather than flip)");
71904 +static int MODPARM_rx_flip = 0;
71905 +module_param_named(rx_flip, MODPARM_rx_flip, bool, 0);
71906 +MODULE_PARM_DESC(rx_flip, "Flip packets from network card (rather than copy)");
71907 +#else
71908 +static const int MODPARM_rx_copy = 1;
71909 +static const int MODPARM_rx_flip = 0;
71910 +#endif
71911 +
71912 +#define RX_COPY_THRESHOLD 256
71913 +
71914 +/* If we don't have GSO, fake things up so that we never try to use it. */
71915 +#if defined(NETIF_F_GSO)
71916 +#define HAVE_GSO 1
71917 +#define HAVE_TSO 1 /* TSO is a subset of GSO */
71918 +static inline void dev_disable_gso_features(struct net_device *dev)
71919 +{
71920 + /* Turn off all GSO bits except ROBUST. */
71921 + dev->features &= (1 << NETIF_F_GSO_SHIFT) - 1;
71922 + dev->features |= NETIF_F_GSO_ROBUST;
71923 +}
71924 +#elif defined(NETIF_F_TSO)
71925 +#define HAVE_TSO 1
71926 +
71927 +/* Some older kernels cannot cope with incorrect checksums,
71928 + * particularly in netfilter. I'm not sure there is 100% correlation
71929 + * with the presence of NETIF_F_TSO but it appears to be a good first
71930 + * approximation.
71931 + */
71932 +#define HAVE_NO_CSUM_OFFLOAD 1
71933 +
71934 +#define gso_size tso_size
71935 +#define gso_segs tso_segs
71936 +static inline void dev_disable_gso_features(struct net_device *dev)
71937 +{
71938 + /* Turn off all TSO bits. */
71939 + dev->features &= ~NETIF_F_TSO;
71940 +}
71941 +static inline int skb_is_gso(const struct sk_buff *skb)
71942 +{
71943 + return skb_shinfo(skb)->tso_size;
71944 +}
71945 +static inline int skb_gso_ok(struct sk_buff *skb, int features)
71946 +{
71947 + return (features & NETIF_F_TSO);
71948 +}
71949 +
71950 +static inline int netif_needs_gso(struct net_device *dev, struct sk_buff *skb)
71951 +{
71952 + return skb_is_gso(skb) &&
71953 + (!skb_gso_ok(skb, dev->features) ||
71954 + unlikely(skb->ip_summed != CHECKSUM_HW));
71955 +}
71956 +#else
71957 +#define netif_needs_gso(dev, skb) 0
71958 +#define dev_disable_gso_features(dev) ((void)0)
71959 +#endif
71960 +
71961 +#define GRANT_INVALID_REF 0
71962 +
71963 +#define NET_TX_RING_SIZE __RING_SIZE((struct netif_tx_sring *)0, PAGE_SIZE)
71964 +#define NET_RX_RING_SIZE __RING_SIZE((struct netif_rx_sring *)0, PAGE_SIZE)
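+/*
+ * __RING_SIZE gives the number of slots that fit in one shared page,
+ * rounded down to a power of two, which is why xennet_rxidx() below can
+ * mask ring indexes with (NET_RX_RING_SIZE - 1).
+ */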
71965 +
71966 +struct netfront_info {
71967 + struct list_head list;
71968 + struct net_device *netdev;
71969 +
71970 + struct net_device_stats stats;
71971 +
71972 + struct netif_tx_front_ring tx;
71973 + struct netif_rx_front_ring rx;
71974 +
71975 + spinlock_t tx_lock;
71976 + spinlock_t rx_lock;
71977 +
71978 + unsigned int evtchn, irq;
71979 + unsigned int copying_receiver;
71980 +
71981 + /* Receive-ring batched refills. */
71982 +#define RX_MIN_TARGET 8
71983 +#define RX_DFL_MIN_TARGET 64
71984 +#define RX_MAX_TARGET min_t(int, NET_RX_RING_SIZE, 256)
71985 + unsigned rx_min_target, rx_max_target, rx_target;
71986 + struct sk_buff_head rx_batch;
71987 +
71988 + struct timer_list rx_refill_timer;
71989 +
71990 + /*
71991 + * {tx,rx}_skbs store outstanding skbuffs. The first entry in tx_skbs
71992 + * is an index into a chain of free entries.
71993 + */
71994 + struct sk_buff *tx_skbs[NET_TX_RING_SIZE+1];
71995 + struct sk_buff *rx_skbs[NET_RX_RING_SIZE];
71996 +
71997 +#define TX_MAX_TARGET min_t(int, NET_RX_RING_SIZE, 256)
71998 + grant_ref_t gref_tx_head;
71999 + grant_ref_t grant_tx_ref[NET_TX_RING_SIZE + 1];
72000 + grant_ref_t gref_rx_head;
72001 + grant_ref_t grant_rx_ref[NET_RX_RING_SIZE];
72002 +
72003 + struct xenbus_device *xbdev;
72004 + int tx_ring_ref;
72005 + int rx_ring_ref;
72006 + u8 mac[ETH_ALEN];
72007 +
72008 + unsigned long rx_pfn_array[NET_RX_RING_SIZE];
72009 + struct multicall_entry rx_mcl[NET_RX_RING_SIZE+1];
72010 + struct mmu_update rx_mmu[NET_RX_RING_SIZE];
72011 +};
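+/*
+ * Lock ordering note: where both locks are needed, tx_lock is taken (with
+ * IRQs disabled) before rx_lock, as in network_connect() and
+ * netif_disconnect_backend() below.
+ */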
72012 +
72013 +struct netfront_rx_info {
72014 + struct netif_rx_response rx;
72015 + struct netif_extra_info extras[XEN_NETIF_EXTRA_TYPE_MAX - 1];
72016 +};
72017 +
72018 +/*
72019 + * Access macros for acquiring and freeing slots in tx_skbs[].
72020 + */
72021 +
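+/*
+ * tx_skbs[0] is the head of the free list: a free entry stores the index of
+ * the next free entry cast to a pointer (always below PAGE_OFFSET), while an
+ * in-use entry holds a real skb pointer (at or above PAGE_OFFSET); see the
+ * recovery comment in network_connect().
+ */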
72022 +static inline void add_id_to_freelist(struct sk_buff **list, unsigned short id)
72023 +{
72024 + list[id] = list[0];
72025 + list[0] = (void *)(unsigned long)id;
72026 +}
72027 +
72028 +static inline unsigned short get_id_from_freelist(struct sk_buff **list)
72029 +{
72030 + unsigned int id = (unsigned int)(unsigned long)list[0];
72031 + list[0] = list[id];
72032 + return id;
72033 +}
72034 +
72035 +static inline int xennet_rxidx(RING_IDX idx)
72036 +{
72037 + return idx & (NET_RX_RING_SIZE - 1);
72038 +}
72039 +
72040 +static inline struct sk_buff *xennet_get_rx_skb(struct netfront_info *np,
72041 + RING_IDX ri)
72042 +{
72043 + int i = xennet_rxidx(ri);
72044 + struct sk_buff *skb = np->rx_skbs[i];
72045 + np->rx_skbs[i] = NULL;
72046 + return skb;
72047 +}
72048 +
72049 +static inline grant_ref_t xennet_get_rx_ref(struct netfront_info *np,
72050 + RING_IDX ri)
72051 +{
72052 + int i = xennet_rxidx(ri);
72053 + grant_ref_t ref = np->grant_rx_ref[i];
72054 + np->grant_rx_ref[i] = GRANT_INVALID_REF;
72055 + return ref;
72056 +}
72057 +
72058 +#define DPRINTK(fmt, args...) \
72059 + pr_debug("netfront (%s:%d) " fmt, \
72060 + __FUNCTION__, __LINE__, ##args)
72061 +#define IPRINTK(fmt, args...) \
72062 + printk(KERN_INFO "netfront: " fmt, ##args)
72063 +#define WPRINTK(fmt, args...) \
72064 + printk(KERN_WARNING "netfront: " fmt, ##args)
72065 +
72066 +static int setup_device(struct xenbus_device *, struct netfront_info *);
72067 +static struct net_device *create_netdev(struct xenbus_device *);
72068 +
72069 +static void end_access(int, void *);
72070 +static void netif_disconnect_backend(struct netfront_info *);
72071 +
72072 +static int network_connect(struct net_device *);
72073 +static void network_tx_buf_gc(struct net_device *);
72074 +static void network_alloc_rx_buffers(struct net_device *);
72075 +static int send_fake_arp(struct net_device *);
72076 +
72077 +static irqreturn_t netif_int(int irq, void *dev_id, struct pt_regs *ptregs);
72078 +
72079 +#ifdef CONFIG_SYSFS
72080 +static int xennet_sysfs_addif(struct net_device *netdev);
72081 +static void xennet_sysfs_delif(struct net_device *netdev);
72082 +#else /* !CONFIG_SYSFS */
72083 +#define xennet_sysfs_addif(dev) (0)
72084 +#define xennet_sysfs_delif(dev) do { } while(0)
72085 +#endif
72086 +
72087 +static inline int xennet_can_sg(struct net_device *dev)
72088 +{
72089 + return dev->features & NETIF_F_SG;
72090 +}
72091 +
72092 +/**
72093 + * Entry point to this code when a new device is created. Allocate the basic
72094 + * structures and the ring buffers for communication with the backend, and
72095 + * inform the backend of the appropriate details for those.
72096 + */
72097 +static int __devinit netfront_probe(struct xenbus_device *dev,
72098 + const struct xenbus_device_id *id)
72099 +{
72100 + int err;
72101 + struct net_device *netdev;
72102 + struct netfront_info *info;
72103 +
72104 + netdev = create_netdev(dev);
72105 + if (IS_ERR(netdev)) {
72106 + err = PTR_ERR(netdev);
72107 + xenbus_dev_fatal(dev, err, "creating netdev");
72108 + return err;
72109 + }
72110 +
72111 + info = netdev_priv(netdev);
72112 + dev->dev.driver_data = info;
72113 +
72114 + err = register_netdev(info->netdev);
72115 + if (err) {
72116 + printk(KERN_WARNING "%s: register_netdev err=%d\n",
72117 + __FUNCTION__, err);
72118 + goto fail;
72119 + }
72120 +
72121 + err = xennet_sysfs_addif(info->netdev);
72122 + if (err) {
72123 + unregister_netdev(info->netdev);
72124 + printk(KERN_WARNING "%s: add sysfs failed err=%d\n",
72125 + __FUNCTION__, err);
72126 + goto fail;
72127 + }
72128 +
72129 + return 0;
72130 +
72131 + fail:
72132 + free_netdev(netdev);
72133 + dev->dev.driver_data = NULL;
72134 + return err;
72135 +}
72136 +
72137 +static int __devexit netfront_remove(struct xenbus_device *dev)
72138 +{
72139 + struct netfront_info *info = dev->dev.driver_data;
72140 +
72141 + DPRINTK("%s\n", dev->nodename);
72142 +
72143 + netif_disconnect_backend(info);
72144 +
72145 + del_timer_sync(&info->rx_refill_timer);
72146 +
72147 + xennet_sysfs_delif(info->netdev);
72148 +
72149 + unregister_netdev(info->netdev);
72150 +
72151 + free_netdev(info->netdev);
72152 +
72153 + return 0;
72154 +}
72155 +
72156 +/**
72157 + * We are reconnecting to the backend, due to a suspend/resume, or a backend
72158 + * driver restart. We tear down our netif structure and recreate it, but
72159 + * leave the device-layer structures intact so that this is transparent to the
72160 + * rest of the kernel.
72161 + */
72162 +static int netfront_resume(struct xenbus_device *dev)
72163 +{
72164 + struct netfront_info *info = dev->dev.driver_data;
72165 +
72166 + DPRINTK("%s\n", dev->nodename);
72167 +
72168 + netif_disconnect_backend(info);
72169 + return 0;
72170 +}
72171 +
72172 +static int xen_net_read_mac(struct xenbus_device *dev, u8 mac[])
72173 +{
72174 + char *s, *e, *macstr;
72175 + int i;
72176 +
72177 + macstr = s = xenbus_read(XBT_NIL, dev->nodename, "mac", NULL);
72178 + if (IS_ERR(macstr))
72179 + return PTR_ERR(macstr);
72180 +
72181 + for (i = 0; i < ETH_ALEN; i++) {
72182 + mac[i] = simple_strtoul(s, &e, 16);
72183 + if ((s == e) || (*e != ((i == ETH_ALEN-1) ? '\0' : ':'))) {
72184 + kfree(macstr);
72185 + return -ENOENT;
72186 + }
72187 + s = e+1;
72188 + }
72189 +
72190 + kfree(macstr);
72191 + return 0;
72192 +}
72193 +
72194 +/* Common code used when first setting up, and when resuming. */
72195 +static int talk_to_backend(struct xenbus_device *dev,
72196 + struct netfront_info *info)
72197 +{
72198 + const char *message;
72199 + struct xenbus_transaction xbt;
72200 + int err;
72201 +
72202 + err = xen_net_read_mac(dev, info->mac);
72203 + if (err) {
72204 + xenbus_dev_fatal(dev, err, "parsing %s/mac", dev->nodename);
72205 + goto out;
72206 + }
72207 +
72208 + /* Create shared ring, alloc event channel. */
72209 + err = setup_device(dev, info);
72210 + if (err)
72211 + goto out;
72212 +
72213 +again:
72214 + err = xenbus_transaction_start(&xbt);
72215 + if (err) {
72216 + xenbus_dev_fatal(dev, err, "starting transaction");
72217 + goto destroy_ring;
72218 + }
72219 +
72220 + err = xenbus_printf(xbt, dev->nodename, "tx-ring-ref","%u",
72221 + info->tx_ring_ref);
72222 + if (err) {
72223 + message = "writing tx ring-ref";
72224 + goto abort_transaction;
72225 + }
72226 + err = xenbus_printf(xbt, dev->nodename, "rx-ring-ref","%u",
72227 + info->rx_ring_ref);
72228 + if (err) {
72229 + message = "writing rx ring-ref";
72230 + goto abort_transaction;
72231 + }
72232 + err = xenbus_printf(xbt, dev->nodename,
72233 + "event-channel", "%u", info->evtchn);
72234 + if (err) {
72235 + message = "writing event-channel";
72236 + goto abort_transaction;
72237 + }
72238 +
72239 + err = xenbus_printf(xbt, dev->nodename, "request-rx-copy", "%u",
72240 + info->copying_receiver);
72241 + if (err) {
72242 + message = "writing request-rx-copy";
72243 + goto abort_transaction;
72244 + }
72245 +
72246 + err = xenbus_printf(xbt, dev->nodename, "feature-rx-notify", "%d", 1);
72247 + if (err) {
72248 + message = "writing feature-rx-notify";
72249 + goto abort_transaction;
72250 + }
72251 +
72252 +#ifdef HAVE_NO_CSUM_OFFLOAD
72253 + err = xenbus_printf(xbt, dev->nodename, "feature-no-csum-offload", "%d", 1);
72254 + if (err) {
72255 + message = "writing feature-no-csum-offload";
72256 + goto abort_transaction;
72257 + }
72258 +#endif
72259 +
72260 + err = xenbus_printf(xbt, dev->nodename, "feature-sg", "%d", 1);
72261 + if (err) {
72262 + message = "writing feature-sg";
72263 + goto abort_transaction;
72264 + }
72265 +
72266 +#ifdef HAVE_TSO
72267 + err = xenbus_printf(xbt, dev->nodename, "feature-gso-tcpv4", "%d", 1);
72268 + if (err) {
72269 + message = "writing feature-gso-tcpv4";
72270 + goto abort_transaction;
72271 + }
72272 +#endif
72273 +
72274 + err = xenbus_transaction_end(xbt, 0);
72275 + if (err) {
72276 + if (err == -EAGAIN)
72277 + goto again;
72278 + xenbus_dev_fatal(dev, err, "completing transaction");
72279 + goto destroy_ring;
72280 + }
72281 +
72282 + return 0;
72283 +
72284 + abort_transaction:
72285 + xenbus_transaction_end(xbt, 1);
72286 + xenbus_dev_fatal(dev, err, "%s", message);
72287 + destroy_ring:
72288 + netif_disconnect_backend(info);
72289 + out:
72290 + return err;
72291 +}
72292 +
72293 +static int setup_device(struct xenbus_device *dev, struct netfront_info *info)
72294 +{
72295 + struct netif_tx_sring *txs;
72296 + struct netif_rx_sring *rxs;
72297 + int err;
72298 + struct net_device *netdev = info->netdev;
72299 +
72300 + info->tx_ring_ref = GRANT_INVALID_REF;
72301 + info->rx_ring_ref = GRANT_INVALID_REF;
72302 + info->rx.sring = NULL;
72303 + info->tx.sring = NULL;
72304 + info->irq = 0;
72305 +
72306 + txs = (struct netif_tx_sring *)get_zeroed_page(GFP_KERNEL);
72307 + if (!txs) {
72308 + err = -ENOMEM;
72309 + xenbus_dev_fatal(dev, err, "allocating tx ring page");
72310 + goto fail;
72311 + }
72312 + SHARED_RING_INIT(txs);
72313 + FRONT_RING_INIT(&info->tx, txs, PAGE_SIZE);
72314 +
72315 + err = xenbus_grant_ring(dev, virt_to_mfn(txs));
72316 + if (err < 0) {
72317 + free_page((unsigned long)txs);
72318 + goto fail;
72319 + }
72320 + info->tx_ring_ref = err;
72321 +
72322 + rxs = (struct netif_rx_sring *)get_zeroed_page(GFP_KERNEL);
72323 + if (!rxs) {
72324 + err = -ENOMEM;
72325 + xenbus_dev_fatal(dev, err, "allocating rx ring page");
72326 + goto fail;
72327 + }
72328 + SHARED_RING_INIT(rxs);
72329 + FRONT_RING_INIT(&info->rx, rxs, PAGE_SIZE);
72330 +
72331 + err = xenbus_grant_ring(dev, virt_to_mfn(rxs));
72332 + if (err < 0) {
72333 + free_page((unsigned long)rxs);
72334 + goto fail;
72335 + }
72336 + info->rx_ring_ref = err;
72337 +
72338 + err = xenbus_alloc_evtchn(dev, &info->evtchn);
72339 + if (err)
72340 + goto fail;
72341 +
72342 + memcpy(netdev->dev_addr, info->mac, ETH_ALEN);
72343 + err = bind_evtchn_to_irqhandler(info->evtchn, netif_int,
72344 + SA_SAMPLE_RANDOM, netdev->name,
72345 + netdev);
72346 + if (err < 0)
72347 + goto fail;
72348 + info->irq = err;
72349 + return 0;
72350 +
72351 + fail:
72352 + return err;
72353 +}
72354 +
72355 +/**
72356 + * Callback received when the backend's state changes.
72357 + */
72358 +static void backend_changed(struct xenbus_device *dev,
72359 + enum xenbus_state backend_state)
72360 +{
72361 + struct netfront_info *np = dev->dev.driver_data;
72362 + struct net_device *netdev = np->netdev;
72363 +
72364 + DPRINTK("%s\n", xenbus_strstate(backend_state));
72365 +
72366 + switch (backend_state) {
72367 + case XenbusStateInitialising:
72368 + case XenbusStateInitialised:
72369 + case XenbusStateConnected:
72370 + case XenbusStateUnknown:
72371 + case XenbusStateClosed:
72372 + break;
72373 +
72374 + case XenbusStateInitWait:
72375 + if (dev->state != XenbusStateInitialising)
72376 + break;
72377 + if (network_connect(netdev) != 0)
72378 + break;
72379 + xenbus_switch_state(dev, XenbusStateConnected);
72380 + (void)send_fake_arp(netdev);
72381 + break;
72382 +
72383 + case XenbusStateClosing:
72384 + xenbus_frontend_closed(dev);
72385 + break;
72386 + }
72387 +}
72388 +
72389 +/** Send a packet on a net device to encourage switches to learn the
72390 + * MAC. We send a fake (gratuitous) ARP reply.
72391 + *
72392 + * @param dev device
72393 + * @return 0 on success, error code otherwise
72394 + */
72395 +static int send_fake_arp(struct net_device *dev)
72396 +{
72397 + struct sk_buff *skb;
72398 + u32 src_ip, dst_ip;
72399 +
72400 + dst_ip = INADDR_BROADCAST;
72401 + src_ip = inet_select_addr(dev, dst_ip, RT_SCOPE_LINK);
72402 +
72403 + /* No IP? Then nothing to do. */
72404 + if (src_ip == 0)
72405 + return 0;
72406 +
72407 + skb = arp_create(ARPOP_REPLY, ETH_P_ARP,
72408 + dst_ip, dev, src_ip,
72409 + /*dst_hw*/ NULL, /*src_hw*/ NULL,
72410 + /*target_hw*/ dev->dev_addr);
72411 + if (skb == NULL)
72412 + return -ENOMEM;
72413 +
72414 + return dev_queue_xmit(skb);
72415 +}
72416 +
72417 +static int network_open(struct net_device *dev)
72418 +{
72419 + struct netfront_info *np = netdev_priv(dev);
72420 +
72421 + memset(&np->stats, 0, sizeof(np->stats));
72422 +
72423 + spin_lock(&np->rx_lock);
72424 + if (netif_carrier_ok(dev)) {
72425 + network_alloc_rx_buffers(dev);
72426 + np->rx.sring->rsp_event = np->rx.rsp_cons + 1;
72427 + if (RING_HAS_UNCONSUMED_RESPONSES(&np->rx))
72428 + netif_rx_schedule(dev);
72429 + }
72430 + spin_unlock(&np->rx_lock);
72431 +
72432 + netif_start_queue(dev);
72433 +
72434 + return 0;
72435 +}
72436 +
72437 +static inline int netfront_tx_slot_available(struct netfront_info *np)
72438 +{
72439 + return RING_FREE_REQUESTS(&np->tx) >= MAX_SKB_FRAGS + 2;
72440 +}
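+/* The "+ 2" above appears to budget one ring slot per page fragment plus
+ * one for the linear header and one for a GSO extra-info request. */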
72441 +
72442 +static inline void network_maybe_wake_tx(struct net_device *dev)
72443 +{
72444 + struct netfront_info *np = netdev_priv(dev);
72445 +
72446 + if (unlikely(netif_queue_stopped(dev)) &&
72447 + netfront_tx_slot_available(np) &&
72448 + likely(netif_running(dev)))
72449 + netif_wake_queue(dev);
72450 +}
72451 +
72452 +static void network_tx_buf_gc(struct net_device *dev)
72453 +{
72454 + RING_IDX cons, prod;
72455 + unsigned short id;
72456 + struct netfront_info *np = netdev_priv(dev);
72457 + struct sk_buff *skb;
72458 +
72459 + BUG_ON(!netif_carrier_ok(dev));
72460 +
72461 + do {
72462 + prod = np->tx.sring->rsp_prod;
72463 + rmb(); /* Ensure we see responses up to 'rp'. */
72464 +
72465 + for (cons = np->tx.rsp_cons; cons != prod; cons++) {
72466 + struct netif_tx_response *txrsp;
72467 +
72468 + txrsp = RING_GET_RESPONSE(&np->tx, cons);
72469 + if (txrsp->status == NETIF_RSP_NULL)
72470 + continue;
72471 +
72472 + id = txrsp->id;
72473 + skb = np->tx_skbs[id];
72474 + if (unlikely(gnttab_query_foreign_access(
72475 + np->grant_tx_ref[id]) != 0)) {
72476 + printk(KERN_ALERT "network_tx_buf_gc: warning "
72477 + "-- grant still in use by backend "
72478 + "domain.\n");
72479 + BUG();
72480 + }
72481 + gnttab_end_foreign_access_ref(
72482 + np->grant_tx_ref[id], GNTMAP_readonly);
72483 + gnttab_release_grant_reference(
72484 + &np->gref_tx_head, np->grant_tx_ref[id]);
72485 + np->grant_tx_ref[id] = GRANT_INVALID_REF;
72486 + add_id_to_freelist(np->tx_skbs, id);
72487 + dev_kfree_skb_irq(skb);
72488 + }
72489 +
72490 + np->tx.rsp_cons = prod;
72491 +
72492 + /*
72493 + * Set a new event, then check for race with update of tx_cons.
72494 + * Note that it is essential to schedule a callback, no matter
72495 + * how few buffers are pending. Even if there is space in the
72496 + * transmit ring, higher layers may be blocked because too much
72497 + * data is outstanding: in such cases notification from Xen is
72498 + * likely to be the only kick that we'll get.
72499 + */
72500 + np->tx.sring->rsp_event =
72501 + prod + ((np->tx.sring->req_prod - prod) >> 1) + 1;
72502 + mb();
72503 + } while ((cons == prod) && (prod != np->tx.sring->rsp_prod));
72504 +
72505 + network_maybe_wake_tx(dev);
72506 +}
72507 +
72508 +static void rx_refill_timeout(unsigned long data)
72509 +{
72510 + struct net_device *dev = (struct net_device *)data;
72511 + netif_rx_schedule(dev);
72512 +}
72513 +
72514 +static void network_alloc_rx_buffers(struct net_device *dev)
72515 +{
72516 + unsigned short id;
72517 + struct netfront_info *np = netdev_priv(dev);
72518 + struct sk_buff *skb;
72519 + struct page *page;
72520 + int i, batch_target, notify;
72521 + RING_IDX req_prod = np->rx.req_prod_pvt;
72522 + struct xen_memory_reservation reservation;
72523 + grant_ref_t ref;
72524 + unsigned long pfn;
72525 + void *vaddr;
72526 + int nr_flips;
72527 + netif_rx_request_t *req;
72528 +
72529 + if (unlikely(!netif_carrier_ok(dev)))
72530 + return;
72531 +
72532 + /*
72533 + * Allocate skbuffs greedily, even though we batch updates to the
72534 + * receive ring. This creates a less bursty demand on the memory
72535 + * allocator, so should reduce the chance of failed allocation requests
72536 + * both for ourselves and for other kernel subsystems.
72537 + */
72538 + batch_target = np->rx_target - (req_prod - np->rx.rsp_cons);
72539 + for (i = skb_queue_len(&np->rx_batch); i < batch_target; i++) {
72540 + /*
72541 + * Allocate an skb and a page. Do not use __dev_alloc_skb as
72542 + * that will allocate page-sized buffers which is not
72543 + * necessary here.
72544 + * 16 bytes added as necessary headroom for netif_receive_skb.
72545 + */
72546 + skb = alloc_skb(RX_COPY_THRESHOLD + 16 + NET_IP_ALIGN,
72547 + GFP_ATOMIC | __GFP_NOWARN);
72548 + if (unlikely(!skb))
72549 + goto no_skb;
72550 +
72551 + page = alloc_page(GFP_ATOMIC | __GFP_NOWARN);
72552 + if (!page) {
72553 + kfree_skb(skb);
72554 +no_skb:
72555 + /* Any skbuffs queued for refill? Force them out. */
72556 + if (i != 0)
72557 + goto refill;
72558 + /* Could not allocate any skbuffs. Try again later. */
72559 + mod_timer(&np->rx_refill_timer,
72560 + jiffies + (HZ/10));
72561 + break;
72562 + }
72563 +
72564 + skb_reserve(skb, 16 + NET_IP_ALIGN); /* mimic dev_alloc_skb() */
72565 + skb_shinfo(skb)->frags[0].page = page;
72566 + skb_shinfo(skb)->nr_frags = 1;
72567 + __skb_queue_tail(&np->rx_batch, skb);
72568 + }
72569 +
72570 + /* Is the batch large enough to be worthwhile? */
72571 + if (i < (np->rx_target/2)) {
72572 + if (req_prod > np->rx.sring->req_prod)
72573 + goto push;
72574 + return;
72575 + }
72576 +
72577 + /* Adjust our fill target if we risked running out of buffers. */
72578 + if (((req_prod - np->rx.sring->rsp_prod) < (np->rx_target / 4)) &&
72579 + ((np->rx_target *= 2) > np->rx_max_target))
72580 + np->rx_target = np->rx_max_target;
72581 +
72582 + refill:
72583 + for (nr_flips = i = 0; ; i++) {
72584 + if ((skb = __skb_dequeue(&np->rx_batch)) == NULL)
72585 + break;
72586 +
72587 + skb->dev = dev;
72588 +
72589 + id = xennet_rxidx(req_prod + i);
72590 +
72591 + BUG_ON(np->rx_skbs[id]);
72592 + np->rx_skbs[id] = skb;
72593 +
72594 + ref = gnttab_claim_grant_reference(&np->gref_rx_head);
72595 + BUG_ON((signed short)ref < 0);
72596 + np->grant_rx_ref[id] = ref;
72597 +
72598 + pfn = page_to_pfn(skb_shinfo(skb)->frags[0].page);
72599 + vaddr = page_address(skb_shinfo(skb)->frags[0].page);
72600 +
72601 + req = RING_GET_REQUEST(&np->rx, req_prod + i);
72602 + if (!np->copying_receiver) {
72603 + gnttab_grant_foreign_transfer_ref(ref,
72604 + np->xbdev->otherend_id,
72605 + pfn);
72606 + np->rx_pfn_array[nr_flips] = pfn_to_mfn(pfn);
72607 + if (!xen_feature(XENFEAT_auto_translated_physmap)) {
72608 + /* Remove this page before passing
72609 + * back to Xen. */
72610 + set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
72611 + MULTI_update_va_mapping(np->rx_mcl+i,
72612 + (unsigned long)vaddr,
72613 + __pte(0), 0);
72614 + }
72615 + nr_flips++;
72616 + } else {
72617 + gnttab_grant_foreign_access_ref(ref,
72618 + np->xbdev->otherend_id,
72619 + pfn_to_mfn(pfn),
72620 + 0);
72621 + }
72622 +
72623 + req->id = id;
72624 + req->gref = ref;
72625 + }
72626 +
72627 + if (nr_flips != 0) {
72628 + /* Tell the balloon driver what is going on. */
72629 + balloon_update_driver_allowance(i);
72630 +
72631 + set_xen_guest_handle(reservation.extent_start,
72632 + np->rx_pfn_array);
72633 + reservation.nr_extents = nr_flips;
72634 + reservation.extent_order = 0;
72635 + reservation.address_bits = 0;
72636 + reservation.domid = DOMID_SELF;
72637 +
72638 + if (!xen_feature(XENFEAT_auto_translated_physmap)) {
72639 + /* After all PTEs have been zapped, flush the TLB. */
72640 + np->rx_mcl[i-1].args[MULTI_UVMFLAGS_INDEX] =
72641 + UVMF_TLB_FLUSH|UVMF_ALL;
72642 +
72643 + /* Give away a batch of pages. */
72644 + np->rx_mcl[i].op = __HYPERVISOR_memory_op;
72645 + np->rx_mcl[i].args[0] = XENMEM_decrease_reservation;
72646 + np->rx_mcl[i].args[1] = (unsigned long)&reservation;
72647 +
72648 + /* Zap PTEs and give away pages in one big
72649 + * multicall. */
72650 + (void)HYPERVISOR_multicall(np->rx_mcl, i+1);
72651 +
72652 + /* Check return status of HYPERVISOR_memory_op(). */
72653 + if (unlikely(np->rx_mcl[i].result != i))
72654 + panic("Unable to reduce memory reservation\n");
72655 + } else {
72656 + if (HYPERVISOR_memory_op(XENMEM_decrease_reservation,
72657 + &reservation) != i)
72658 + panic("Unable to reduce memory reservation\n");
72659 + }
72660 + } else {
72661 + wmb();
72662 + }
72663 +
72664 + /* Above is a suitable barrier to ensure backend will see requests. */
72665 + np->rx.req_prod_pvt = req_prod + i;
72666 + push:
72667 + RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&np->rx, notify);
72668 + if (notify)
72669 + notify_remote_via_irq(np->irq);
72670 +}
72671 +
72672 +static void xennet_make_frags(struct sk_buff *skb, struct net_device *dev,
72673 + struct netif_tx_request *tx)
72674 +{
72675 + struct netfront_info *np = netdev_priv(dev);
72676 + char *data = skb->data;
72677 + unsigned long mfn;
72678 + RING_IDX prod = np->tx.req_prod_pvt;
72679 + int frags = skb_shinfo(skb)->nr_frags;
72680 + unsigned int offset = offset_in_page(data);
72681 + unsigned int len = skb_headlen(skb);
72682 + unsigned int id;
72683 + grant_ref_t ref;
72684 + int i;
72685 +
72686 + while (len > PAGE_SIZE - offset) {
72687 + tx->size = PAGE_SIZE - offset;
72688 + tx->flags |= NETTXF_more_data;
72689 + len -= tx->size;
72690 + data += tx->size;
72691 + offset = 0;
72692 +
72693 + id = get_id_from_freelist(np->tx_skbs);
72694 + np->tx_skbs[id] = skb_get(skb);
72695 + tx = RING_GET_REQUEST(&np->tx, prod++);
72696 + tx->id = id;
72697 + ref = gnttab_claim_grant_reference(&np->gref_tx_head);
72698 + BUG_ON((signed short)ref < 0);
72699 +
72700 + mfn = virt_to_mfn(data);
72701 + gnttab_grant_foreign_access_ref(ref, np->xbdev->otherend_id,
72702 + mfn, GNTMAP_readonly);
72703 +
72704 + tx->gref = np->grant_tx_ref[id] = ref;
72705 + tx->offset = offset;
72706 + tx->size = len;
72707 + tx->flags = 0;
72708 + }
72709 +
72710 + for (i = 0; i < frags; i++) {
72711 + skb_frag_t *frag = skb_shinfo(skb)->frags + i;
72712 +
72713 + tx->flags |= NETTXF_more_data;
72714 +
72715 + id = get_id_from_freelist(np->tx_skbs);
72716 + np->tx_skbs[id] = skb_get(skb);
72717 + tx = RING_GET_REQUEST(&np->tx, prod++);
72718 + tx->id = id;
72719 + ref = gnttab_claim_grant_reference(&np->gref_tx_head);
72720 + BUG_ON((signed short)ref < 0);
72721 +
72722 + mfn = pfn_to_mfn(page_to_pfn(frag->page));
72723 + gnttab_grant_foreign_access_ref(ref, np->xbdev->otherend_id,
72724 + mfn, GNTMAP_readonly);
72725 +
72726 + tx->gref = np->grant_tx_ref[id] = ref;
72727 + tx->offset = frag->page_offset;
72728 + tx->size = frag->size;
72729 + tx->flags = 0;
72730 + }
72731 +
72732 + np->tx.req_prod_pvt = prod;
72733 +}
72734 +
72735 +static int network_start_xmit(struct sk_buff *skb, struct net_device *dev)
72736 +{
72737 + unsigned short id;
72738 + struct netfront_info *np = netdev_priv(dev);
72739 + struct netif_tx_request *tx;
72740 + struct netif_extra_info *extra;
72741 + char *data = skb->data;
72742 + RING_IDX i;
72743 + grant_ref_t ref;
72744 + unsigned long mfn;
72745 + int notify;
72746 + int frags = skb_shinfo(skb)->nr_frags;
72747 + unsigned int offset = offset_in_page(data);
72748 + unsigned int len = skb_headlen(skb);
72749 +
72750 + frags += (offset + len + PAGE_SIZE - 1) / PAGE_SIZE;
72751 + if (unlikely(frags > MAX_SKB_FRAGS + 1)) {
72752 + printk(KERN_ALERT "xennet: skb rides the rocket: %d frags\n",
72753 + frags);
72754 + dump_stack();
72755 + goto drop;
72756 + }
72757 +
72758 + spin_lock_irq(&np->tx_lock);
72759 +
72760 + if (unlikely(!netif_carrier_ok(dev) ||
72761 + (frags > 1 && !xennet_can_sg(dev)) ||
72762 + netif_needs_gso(dev, skb))) {
72763 + spin_unlock_irq(&np->tx_lock);
72764 + goto drop;
72765 + }
72766 +
72767 + i = np->tx.req_prod_pvt;
72768 +
72769 + id = get_id_from_freelist(np->tx_skbs);
72770 + np->tx_skbs[id] = skb;
72771 +
72772 + tx = RING_GET_REQUEST(&np->tx, i);
72773 +
72774 + tx->id = id;
72775 + ref = gnttab_claim_grant_reference(&np->gref_tx_head);
72776 + BUG_ON((signed short)ref < 0);
72777 + mfn = virt_to_mfn(data);
72778 + gnttab_grant_foreign_access_ref(
72779 + ref, np->xbdev->otherend_id, mfn, GNTMAP_readonly);
72780 + tx->gref = np->grant_tx_ref[id] = ref;
72781 + tx->offset = offset;
72782 + tx->size = len;
72783 +
72784 + tx->flags = 0;
72785 + extra = NULL;
72786 +
72787 + if (skb->ip_summed == CHECKSUM_HW) /* local packet? */
72788 + tx->flags |= NETTXF_csum_blank | NETTXF_data_validated;
72789 +#ifdef CONFIG_XEN
72790 + if (skb->proto_data_valid) /* remote but checksummed? */
72791 + tx->flags |= NETTXF_data_validated;
72792 +#endif
72793 +
72794 +#ifdef HAVE_TSO
72795 + if (skb_shinfo(skb)->gso_size) {
72796 + struct netif_extra_info *gso = (struct netif_extra_info *)
72797 + RING_GET_REQUEST(&np->tx, ++i);
72798 +
72799 + if (extra)
72800 + extra->flags |= XEN_NETIF_EXTRA_FLAG_MORE;
72801 + else
72802 + tx->flags |= NETTXF_extra_info;
72803 +
72804 + gso->u.gso.size = skb_shinfo(skb)->gso_size;
72805 + gso->u.gso.type = XEN_NETIF_GSO_TYPE_TCPV4;
72806 + gso->u.gso.pad = 0;
72807 + gso->u.gso.features = 0;
72808 +
72809 + gso->type = XEN_NETIF_EXTRA_TYPE_GSO;
72810 + gso->flags = 0;
72811 + extra = gso;
72812 + }
72813 +#endif
72814 +
72815 + np->tx.req_prod_pvt = i + 1;
72816 +
72817 + xennet_make_frags(skb, dev, tx);
72818 + tx->size = skb->len;
72819 +
72820 + RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&np->tx, notify);
72821 + if (notify)
72822 + notify_remote_via_irq(np->irq);
72823 +
72824 + network_tx_buf_gc(dev);
72825 +
72826 + if (!netfront_tx_slot_available(np))
72827 + netif_stop_queue(dev);
72828 +
72829 + spin_unlock_irq(&np->tx_lock);
72830 +
72831 + np->stats.tx_bytes += skb->len;
72832 + np->stats.tx_packets++;
72833 +
72834 + return 0;
72835 +
72836 + drop:
72837 + np->stats.tx_dropped++;
72838 + dev_kfree_skb(skb);
72839 + return 0;
72840 +}
72841 +
72842 +static irqreturn_t netif_int(int irq, void *dev_id, struct pt_regs *ptregs)
72843 +{
72844 + struct net_device *dev = dev_id;
72845 + struct netfront_info *np = netdev_priv(dev);
72846 + unsigned long flags;
72847 +
72848 + spin_lock_irqsave(&np->tx_lock, flags);
72849 +
72850 + if (likely(netif_carrier_ok(dev))) {
72851 + network_tx_buf_gc(dev);
72852 + /* Under tx_lock: protects access to rx shared-ring indexes. */
72853 + if (RING_HAS_UNCONSUMED_RESPONSES(&np->rx))
72854 + netif_rx_schedule(dev);
72855 + }
72856 +
72857 + spin_unlock_irqrestore(&np->tx_lock, flags);
72858 +
72859 + return IRQ_HANDLED;
72860 +}
72861 +
72862 +static void xennet_move_rx_slot(struct netfront_info *np, struct sk_buff *skb,
72863 + grant_ref_t ref)
72864 +{
72865 + int new = xennet_rxidx(np->rx.req_prod_pvt);
72866 +
72867 + BUG_ON(np->rx_skbs[new]);
72868 + np->rx_skbs[new] = skb;
72869 + np->grant_rx_ref[new] = ref;
72870 + RING_GET_REQUEST(&np->rx, np->rx.req_prod_pvt)->id = new;
72871 + RING_GET_REQUEST(&np->rx, np->rx.req_prod_pvt)->gref = ref;
72872 + np->rx.req_prod_pvt++;
72873 +}
72874 +
72875 +int xennet_get_extras(struct netfront_info *np,
72876 + struct netif_extra_info *extras, RING_IDX rp)
72877 +
72878 +{
72879 + struct netif_extra_info *extra;
72880 + RING_IDX cons = np->rx.rsp_cons;
72881 + int err = 0;
72882 +
72883 + do {
72884 + struct sk_buff *skb;
72885 + grant_ref_t ref;
72886 +
72887 + if (unlikely(cons + 1 == rp)) {
72888 + if (net_ratelimit())
72889 + WPRINTK("Missing extra info\n");
72890 + err = -EBADR;
72891 + break;
72892 + }
72893 +
72894 + extra = (struct netif_extra_info *)
72895 + RING_GET_RESPONSE(&np->rx, ++cons);
72896 +
72897 + if (unlikely(!extra->type ||
72898 + extra->type >= XEN_NETIF_EXTRA_TYPE_MAX)) {
72899 + if (net_ratelimit())
72900 + WPRINTK("Invalid extra type: %d\n",
72901 + extra->type);
72902 + err = -EINVAL;
72903 + } else {
72904 + memcpy(&extras[extra->type - 1], extra,
72905 + sizeof(*extra));
72906 + }
72907 +
72908 + skb = xennet_get_rx_skb(np, cons);
72909 + ref = xennet_get_rx_ref(np, cons);
72910 + xennet_move_rx_slot(np, skb, ref);
72911 + } while (extra->flags & XEN_NETIF_EXTRA_FLAG_MORE);
72912 +
72913 + np->rx.rsp_cons = cons;
72914 + return err;
72915 +}
72916 +
72917 +static int xennet_get_responses(struct netfront_info *np,
72918 + struct netfront_rx_info *rinfo, RING_IDX rp,
72919 + struct sk_buff_head *list,
72920 + int *pages_flipped_p)
72921 +{
72922 + int pages_flipped = *pages_flipped_p;
72923 + struct mmu_update *mmu;
72924 + struct multicall_entry *mcl;
72925 + struct netif_rx_response *rx = &rinfo->rx;
72926 + struct netif_extra_info *extras = rinfo->extras;
72927 + RING_IDX cons = np->rx.rsp_cons;
72928 + struct sk_buff *skb = xennet_get_rx_skb(np, cons);
72929 + grant_ref_t ref = xennet_get_rx_ref(np, cons);
72930 + int max = MAX_SKB_FRAGS + (rx->status <= RX_COPY_THRESHOLD);
72931 + int frags = 1;
72932 + int err = 0;
72933 + unsigned long ret;
72934 +
72935 + if (rx->flags & NETRXF_extra_info) {
72936 + err = xennet_get_extras(np, extras, rp);
72937 + cons = np->rx.rsp_cons;
72938 + }
72939 +
72940 + for (;;) {
72941 + unsigned long mfn;
72942 +
72943 + if (unlikely(rx->status < 0 ||
72944 + rx->offset + rx->status > PAGE_SIZE)) {
72945 + if (net_ratelimit())
72946 + WPRINTK("rx->offset: %x, size: %u\n",
72947 + rx->offset, rx->status);
72948 + xennet_move_rx_slot(np, skb, ref);
72949 + err = -EINVAL;
72950 + goto next;
72951 + }
72952 +
72953 + /*
72954 + * This definitely indicates a bug, either in this driver or in
72955 + * the backend driver. In future this should flag the bad
72956 + * situation to the system controller to reboot the backend.
72957 + */
72958 + if (ref == GRANT_INVALID_REF) {
72959 + if (net_ratelimit())
72960 + WPRINTK("Bad rx response id %d.\n", rx->id);
72961 + err = -EINVAL;
72962 + goto next;
72963 + }
72964 +
72965 + if (!np->copying_receiver) {
72966 + /* Memory pressure, insufficient buffer
72967 + * headroom, ... */
72968 + if (!(mfn = gnttab_end_foreign_transfer_ref(ref))) {
72969 + if (net_ratelimit())
72970 + WPRINTK("Unfulfilled rx req "
72971 + "(id=%d, st=%d).\n",
72972 + rx->id, rx->status);
72973 + xennet_move_rx_slot(np, skb, ref);
72974 + err = -ENOMEM;
72975 + goto next;
72976 + }
72977 +
72978 + if (!xen_feature(XENFEAT_auto_translated_physmap)) {
72979 + /* Remap the page. */
72980 + struct page *page =
72981 + skb_shinfo(skb)->frags[0].page;
72982 + unsigned long pfn = page_to_pfn(page);
72983 + void *vaddr = page_address(page);
72984 +
72985 + mcl = np->rx_mcl + pages_flipped;
72986 + mmu = np->rx_mmu + pages_flipped;
72987 +
72988 + MULTI_update_va_mapping(mcl,
72989 + (unsigned long)vaddr,
72990 + pfn_pte_ma(mfn,
72991 + PAGE_KERNEL),
72992 + 0);
72993 + mmu->ptr = ((maddr_t)mfn << PAGE_SHIFT)
72994 + | MMU_MACHPHYS_UPDATE;
72995 + mmu->val = pfn;
72996 +
72997 + set_phys_to_machine(pfn, mfn);
72998 + }
72999 + pages_flipped++;
73000 + } else {
73001 + ret = gnttab_end_foreign_access_ref(ref, 0);
73002 + BUG_ON(!ret);
73003 + }
73004 +
73005 + gnttab_release_grant_reference(&np->gref_rx_head, ref);
73006 +
73007 + __skb_queue_tail(list, skb);
73008 +
73009 +next:
73010 + if (!(rx->flags & NETRXF_more_data))
73011 + break;
73012 +
73013 + if (cons + frags == rp) {
73014 + if (net_ratelimit())
73015 + WPRINTK("Need more frags\n");
73016 + err = -ENOENT;
73017 + break;
73018 + }
73019 +
73020 + rx = RING_GET_RESPONSE(&np->rx, cons + frags);
73021 + skb = xennet_get_rx_skb(np, cons + frags);
73022 + ref = xennet_get_rx_ref(np, cons + frags);
73023 + frags++;
73024 + }
73025 +
73026 + if (unlikely(frags > max)) {
73027 + if (net_ratelimit())
73028 + WPRINTK("Too many frags\n");
73029 + err = -E2BIG;
73030 + }
73031 +
73032 + if (unlikely(err))
73033 + np->rx.rsp_cons = cons + frags;
73034 +
73035 + *pages_flipped_p = pages_flipped;
73036 +
73037 + return err;
73038 +}
73039 +
73040 +static RING_IDX xennet_fill_frags(struct netfront_info *np,
73041 + struct sk_buff *skb,
73042 + struct sk_buff_head *list)
73043 +{
73044 + struct skb_shared_info *shinfo = skb_shinfo(skb);
73045 + int nr_frags = shinfo->nr_frags;
73046 + RING_IDX cons = np->rx.rsp_cons;
73047 + skb_frag_t *frag = shinfo->frags + nr_frags;
73048 + struct sk_buff *nskb;
73049 +
73050 + while ((nskb = __skb_dequeue(list))) {
73051 + struct netif_rx_response *rx =
73052 + RING_GET_RESPONSE(&np->rx, ++cons);
73053 +
73054 + frag->page = skb_shinfo(nskb)->frags[0].page;
73055 + frag->page_offset = rx->offset;
73056 + frag->size = rx->status;
73057 +
73058 + skb->data_len += rx->status;
73059 +
73060 + skb_shinfo(nskb)->nr_frags = 0;
73061 + kfree_skb(nskb);
73062 +
73063 + frag++;
73064 + nr_frags++;
73065 + }
73066 +
73067 + shinfo->nr_frags = nr_frags;
73068 + return cons;
73069 +}
73070 +
73071 +static int xennet_set_skb_gso(struct sk_buff *skb,
73072 + struct netif_extra_info *gso)
73073 +{
73074 + if (!gso->u.gso.size) {
73075 + if (net_ratelimit())
73076 + WPRINTK("GSO size must not be zero.\n");
73077 + return -EINVAL;
73078 + }
73079 +
73080 + /* Currently only TCPv4 S.O. is supported. */
73081 + if (gso->u.gso.type != XEN_NETIF_GSO_TYPE_TCPV4) {
73082 + if (net_ratelimit())
73083 + WPRINTK("Bad GSO type %d.\n", gso->u.gso.type);
73084 + return -EINVAL;
73085 + }
73086 +
73087 +#ifdef HAVE_TSO
73088 + skb_shinfo(skb)->gso_size = gso->u.gso.size;
73089 +#ifdef HAVE_GSO
73090 + skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
73091 +
73092 + /* Header must be checked, and gso_segs computed. */
73093 + skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
73094 +#endif
73095 + skb_shinfo(skb)->gso_segs = 0;
73096 +
73097 + return 0;
73098 +#else
73099 + if (net_ratelimit())
73100 + WPRINTK("GSO unsupported by this kernel.\n");
73101 + return -EINVAL;
73102 +#endif
73103 +}
73104 +
73105 +static int netif_poll(struct net_device *dev, int *pbudget)
73106 +{
73107 + struct netfront_info *np = netdev_priv(dev);
73108 + struct sk_buff *skb;
73109 + struct netfront_rx_info rinfo;
73110 + struct netif_rx_response *rx = &rinfo.rx;
73111 + struct netif_extra_info *extras = rinfo.extras;
73112 + RING_IDX i, rp;
73113 + struct multicall_entry *mcl;
73114 + int work_done, budget, more_to_do = 1;
73115 + struct sk_buff_head rxq;
73116 + struct sk_buff_head errq;
73117 + struct sk_buff_head tmpq;
73118 + unsigned long flags;
73119 + unsigned int len;
73120 + int pages_flipped = 0;
73121 + int err;
73122 +
73123 + spin_lock(&np->rx_lock);
73124 +
73125 + if (unlikely(!netif_carrier_ok(dev))) {
73126 + spin_unlock(&np->rx_lock);
73127 + return 0;
73128 + }
73129 +
73130 + skb_queue_head_init(&rxq);
73131 + skb_queue_head_init(&errq);
73132 + skb_queue_head_init(&tmpq);
73133 +
73134 + if ((budget = *pbudget) > dev->quota)
73135 + budget = dev->quota;
73136 + rp = np->rx.sring->rsp_prod;
73137 + rmb(); /* Ensure we see queued responses up to 'rp'. */
73138 +
73139 + i = np->rx.rsp_cons;
73140 + work_done = 0;
73141 + while ((i != rp) && (work_done < budget)) {
73142 + memcpy(rx, RING_GET_RESPONSE(&np->rx, i), sizeof(*rx));
73143 + memset(extras, 0, sizeof(rinfo.extras));
73144 +
73145 + err = xennet_get_responses(np, &rinfo, rp, &tmpq,
73146 + &pages_flipped);
73147 +
73148 + if (unlikely(err)) {
73149 +err:
73150 + while ((skb = __skb_dequeue(&tmpq)))
73151 + __skb_queue_tail(&errq, skb);
73152 + np->stats.rx_errors++;
73153 + i = np->rx.rsp_cons;
73154 + continue;
73155 + }
73156 +
73157 + skb = __skb_dequeue(&tmpq);
73158 +
73159 + if (extras[XEN_NETIF_EXTRA_TYPE_GSO - 1].type) {
73160 + struct netif_extra_info *gso;
73161 + gso = &extras[XEN_NETIF_EXTRA_TYPE_GSO - 1];
73162 +
73163 + if (unlikely(xennet_set_skb_gso(skb, gso))) {
73164 + __skb_queue_head(&tmpq, skb);
73165 + np->rx.rsp_cons += skb_queue_len(&tmpq);
73166 + goto err;
73167 + }
73168 + }
73169 +
73170 + skb->nh.raw = (void *)skb_shinfo(skb)->frags[0].page;
73171 + skb->h.raw = skb->nh.raw + rx->offset;
73172 +
73173 + len = rx->status;
73174 + if (len > RX_COPY_THRESHOLD)
73175 + len = RX_COPY_THRESHOLD;
73176 + skb_put(skb, len);
73177 +
73178 + if (rx->status > len) {
73179 + skb_shinfo(skb)->frags[0].page_offset =
73180 + rx->offset + len;
73181 + skb_shinfo(skb)->frags[0].size = rx->status - len;
73182 + skb->data_len = rx->status - len;
73183 + } else {
73184 + skb_shinfo(skb)->frags[0].page = NULL;
73185 + skb_shinfo(skb)->nr_frags = 0;
73186 + }
73187 +
73188 + i = xennet_fill_frags(np, skb, &tmpq);
73189 +
73190 + /*
73191 + * Truesize must approximate the size of true data plus
73192 + * any supervisor overheads. Adding hypervisor overheads
73193 + * has been shown to significantly reduce achievable
73194 + * bandwidth with the default receive buffer size. It is
73195 + * therefore not wise to account for it here.
73196 + *
73197 + * After alloc_skb(RX_COPY_THRESHOLD), truesize is set to
73198 + * RX_COPY_THRESHOLD + the supervisor overheads. Here, we
73199 + * add the size of the data pulled in xennet_fill_frags().
73200 + *
73201 + * We also adjust for any unused space in the main data
73202 + * area by subtracting (RX_COPY_THRESHOLD - len). This is
73203 + * especially important with drivers which split incoming
73204 + * packets into header and data, using only 66 bytes of
73205 + * the main data area (see the e1000 driver for example.)
73206 + * On such systems, without this last adjustment, our
73207 + * achievable receive throughput using the standard receive
73208 + * buffer size was cut by 25%(!!!).
73209 + */
73210 + skb->truesize += skb->data_len - (RX_COPY_THRESHOLD - len);
73211 + skb->len += skb->data_len;
73212 +
73213 + /*
73214 + * Old backends do not assert data_validated but we
73215 + * can infer it from csum_blank so test both flags.
73216 + */
73217 + if (rx->flags & (NETRXF_data_validated|NETRXF_csum_blank))
73218 + skb->ip_summed = CHECKSUM_UNNECESSARY;
73219 + else
73220 + skb->ip_summed = CHECKSUM_NONE;
73221 +#ifdef CONFIG_XEN
73222 + skb->proto_data_valid = (skb->ip_summed != CHECKSUM_NONE);
73223 + skb->proto_csum_blank = !!(rx->flags & NETRXF_csum_blank);
73224 +#endif
73225 + np->stats.rx_packets++;
73226 + np->stats.rx_bytes += skb->len;
73227 +
73228 + __skb_queue_tail(&rxq, skb);
73229 +
73230 + np->rx.rsp_cons = ++i;
73231 + work_done++;
73232 + }
73233 +
73234 + if (pages_flipped) {
73235 + /* Some pages are no longer absent... */
73236 + balloon_update_driver_allowance(-pages_flipped);
73237 +
73238 + /* Do all the remapping work and M2P updates. */
73239 + if (!xen_feature(XENFEAT_auto_translated_physmap)) {
73240 + mcl = np->rx_mcl + pages_flipped;
73241 + mcl->op = __HYPERVISOR_mmu_update;
73242 + mcl->args[0] = (unsigned long)np->rx_mmu;
73243 + mcl->args[1] = pages_flipped;
73244 + mcl->args[2] = 0;
73245 + mcl->args[3] = DOMID_SELF;
73246 + (void)HYPERVISOR_multicall(np->rx_mcl,
73247 + pages_flipped + 1);
73248 + }
73249 + }
73250 +
73251 + while ((skb = __skb_dequeue(&errq)))
73252 + kfree_skb(skb);
73253 +
73254 + while ((skb = __skb_dequeue(&rxq)) != NULL) {
73255 + struct page *page = (struct page *)skb->nh.raw;
73256 + void *vaddr = page_address(page);
73257 +
73258 + memcpy(skb->data, vaddr + (skb->h.raw - skb->nh.raw),
73259 + skb_headlen(skb));
73260 +
73261 + if (page != skb_shinfo(skb)->frags[0].page)
73262 + __free_page(page);
73263 +
73264 + /* Ethernet work: Delayed to here as it peeks the header. */
73265 + skb->protocol = eth_type_trans(skb, dev);
73266 +
73267 + /* Pass it up. */
73268 + netif_receive_skb(skb);
73269 + dev->last_rx = jiffies;
73270 + }
73271 +
73272 + /* If we get a callback with very few responses, reduce fill target. */
73273 + /* NB. Note exponential increase, linear decrease. */
73274 + if (((np->rx.req_prod_pvt - np->rx.sring->rsp_prod) >
73275 + ((3*np->rx_target) / 4)) &&
73276 + (--np->rx_target < np->rx_min_target))
73277 + np->rx_target = np->rx_min_target;
73278 +
73279 + network_alloc_rx_buffers(dev);
73280 +
73281 + *pbudget -= work_done;
73282 + dev->quota -= work_done;
73283 +
73284 + if (work_done < budget) {
73285 + local_irq_save(flags);
73286 +
73287 + RING_FINAL_CHECK_FOR_RESPONSES(&np->rx, more_to_do);
73288 + if (!more_to_do)
73289 + __netif_rx_complete(dev);
73290 +
73291 + local_irq_restore(flags);
73292 + }
73293 +
73294 + spin_unlock(&np->rx_lock);
73295 +
73296 + return more_to_do;
73297 +}
73298 +
73299 +static void netif_release_tx_bufs(struct netfront_info *np)
73300 +{
73301 + struct sk_buff *skb;
73302 + int i;
73303 +
73304 + for (i = 1; i <= NET_TX_RING_SIZE; i++) {
73305 + if ((unsigned long)np->tx_skbs[i] < PAGE_OFFSET)
73306 + continue;
73307 +
73308 + skb = np->tx_skbs[i];
73309 + gnttab_end_foreign_access_ref(
73310 + np->grant_tx_ref[i], GNTMAP_readonly);
73311 + gnttab_release_grant_reference(
73312 + &np->gref_tx_head, np->grant_tx_ref[i]);
73313 + np->grant_tx_ref[i] = GRANT_INVALID_REF;
73314 + add_id_to_freelist(np->tx_skbs, i);
73315 + dev_kfree_skb_irq(skb);
73316 + }
73317 +}
73318 +
73319 +static void netif_release_rx_bufs(struct netfront_info *np)
73320 +{
73321 + struct mmu_update *mmu = np->rx_mmu;
73322 + struct multicall_entry *mcl = np->rx_mcl;
73323 + struct sk_buff_head free_list;
73324 + struct sk_buff *skb;
73325 + unsigned long mfn;
73326 + int xfer = 0, noxfer = 0, unused = 0;
73327 + int id, ref;
73328 +
73329 + if (np->copying_receiver) {
73330 + printk("%s: fix me for copying receiver.\n", __FUNCTION__);
73331 + return;
73332 + }
73333 +
73334 + skb_queue_head_init(&free_list);
73335 +
73336 + spin_lock(&np->rx_lock);
73337 +
73338 + for (id = 0; id < NET_RX_RING_SIZE; id++) {
73339 + if ((ref = np->grant_rx_ref[id]) == GRANT_INVALID_REF) {
73340 + unused++;
73341 + continue;
73342 + }
73343 +
73344 + skb = np->rx_skbs[id];
73345 + mfn = gnttab_end_foreign_transfer_ref(ref);
73346 + gnttab_release_grant_reference(&np->gref_rx_head, ref);
73347 + np->grant_rx_ref[id] = GRANT_INVALID_REF;
73348 + add_id_to_freelist(np->rx_skbs, id);
73349 +
73350 + if (0 == mfn) {
73351 + struct page *page = skb_shinfo(skb)->frags[0].page;
73352 + balloon_release_driver_page(page);
73353 + skb_shinfo(skb)->nr_frags = 0;
73354 + dev_kfree_skb(skb);
73355 + noxfer++;
73356 + continue;
73357 + }
73358 +
73359 + if (!xen_feature(XENFEAT_auto_translated_physmap)) {
73360 + /* Remap the page. */
73361 + struct page *page = skb_shinfo(skb)->frags[0].page;
73362 + unsigned long pfn = page_to_pfn(page);
73363 + void *vaddr = page_address(page);
73364 +
73365 + MULTI_update_va_mapping(mcl, (unsigned long)vaddr,
73366 + pfn_pte_ma(mfn, PAGE_KERNEL),
73367 + 0);
73368 + mcl++;
73369 + mmu->ptr = ((maddr_t)mfn << PAGE_SHIFT)
73370 + | MMU_MACHPHYS_UPDATE;
73371 + mmu->val = pfn;
73372 + mmu++;
73373 +
73374 + set_phys_to_machine(pfn, mfn);
73375 + }
73376 + __skb_queue_tail(&free_list, skb);
73377 + xfer++;
73378 + }
73379 +
73380 + printk("%s: %d xfer, %d noxfer, %d unused\n",
73381 + __FUNCTION__, xfer, noxfer, unused);
73382 +
73383 + if (xfer) {
73384 + /* Some pages are no longer absent... */
73385 + balloon_update_driver_allowance(-xfer);
73386 +
73387 + if (!xen_feature(XENFEAT_auto_translated_physmap)) {
73388 + /* Do all the remapping work and M2P updates. */
73389 + mcl->op = __HYPERVISOR_mmu_update;
73390 + mcl->args[0] = (unsigned long)np->rx_mmu;
73391 + mcl->args[1] = mmu - np->rx_mmu;
73392 + mcl->args[2] = 0;
73393 + mcl->args[3] = DOMID_SELF;
73394 + mcl++;
73395 + HYPERVISOR_multicall(np->rx_mcl, mcl - np->rx_mcl);
73396 + }
73397 + }
73398 +
73399 + while ((skb = __skb_dequeue(&free_list)) != NULL)
73400 + dev_kfree_skb(skb);
73401 +
73402 + spin_unlock(&np->rx_lock);
73403 +}
73404 +
73405 +static int network_close(struct net_device *dev)
73406 +{
73407 + struct netfront_info *np = netdev_priv(dev);
73408 + netif_stop_queue(np->netdev);
73409 + return 0;
73410 +}
73411 +
73412 +
73413 +static struct net_device_stats *network_get_stats(struct net_device *dev)
73414 +{
73415 + struct netfront_info *np = netdev_priv(dev);
73416 + return &np->stats;
73417 +}
73418 +
73419 +static int xennet_change_mtu(struct net_device *dev, int mtu)
73420 +{
73421 + int max = xennet_can_sg(dev) ? 65535 - ETH_HLEN : ETH_DATA_LEN;
73422 +
73423 + if (mtu > max)
73424 + return -EINVAL;
73425 + dev->mtu = mtu;
73426 + return 0;
73427 +}
73428 +
73429 +static int xennet_set_sg(struct net_device *dev, u32 data)
73430 +{
73431 + if (data) {
73432 + struct netfront_info *np = netdev_priv(dev);
73433 + int val;
73434 +
73435 + if (xenbus_scanf(XBT_NIL, np->xbdev->otherend, "feature-sg",
73436 + "%d", &val) < 0)
73437 + val = 0;
73438 + if (!val)
73439 + return -ENOSYS;
73440 + } else if (dev->mtu > ETH_DATA_LEN)
73441 + dev->mtu = ETH_DATA_LEN;
73442 +
73443 + return ethtool_op_set_sg(dev, data);
73444 +}
73445 +
73446 +static int xennet_set_tso(struct net_device *dev, u32 data)
73447 +{
73448 +#ifdef HAVE_TSO
73449 + if (data) {
73450 + struct netfront_info *np = netdev_priv(dev);
73451 + int val;
73452 +
73453 + if (xenbus_scanf(XBT_NIL, np->xbdev->otherend,
73454 + "feature-gso-tcpv4", "%d", &val) < 0)
73455 + val = 0;
73456 + if (!val)
73457 + return -ENOSYS;
73458 + }
73459 +
73460 + return ethtool_op_set_tso(dev, data);
73461 +#else
73462 + return -ENOSYS;
73463 +#endif
73464 +}
73465 +
73466 +static void xennet_set_features(struct net_device *dev)
73467 +{
73468 + dev_disable_gso_features(dev);
73469 + xennet_set_sg(dev, 0);
73470 +
73471 + /* We need checksum offload to enable scatter/gather and TSO. */
73472 + if (!(dev->features & NETIF_F_IP_CSUM))
73473 + return;
73474 +
73475 + if (xennet_set_sg(dev, 1))
73476 + return;
73477 +
73478 + /* Before 2.6.9 TSO seems to be unreliable so do not enable it
73479 + * on older kernels.
73480 + */
73481 +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,9)
73482 + xennet_set_tso(dev, 1);
73483 +#endif
73484 +
73485 +}
73486 +
73487 +static int network_connect(struct net_device *dev)
73488 +{
73489 + struct netfront_info *np = netdev_priv(dev);
73490 + int i, requeue_idx, err;
73491 + struct sk_buff *skb;
73492 + grant_ref_t ref;
73493 + netif_rx_request_t *req;
73494 + unsigned int feature_rx_copy, feature_rx_flip;
73495 +
73496 + err = xenbus_scanf(XBT_NIL, np->xbdev->otherend,
73497 + "feature-rx-copy", "%u", &feature_rx_copy);
73498 + if (err != 1)
73499 + feature_rx_copy = 0;
73500 + err = xenbus_scanf(XBT_NIL, np->xbdev->otherend,
73501 + "feature-rx-flip", "%u", &feature_rx_flip);
73502 + if (err != 1)
73503 + feature_rx_flip = 1;
73504 +
73505 + /*
73506 + * Copy packets on receive path if:
73507 + * (a) This was requested by the user, and the backend supports it; or
73508 + * (b) Flipping was requested, but this is unsupported by the backend.
73509 + */
73510 + np->copying_receiver = ((MODPARM_rx_copy && feature_rx_copy) ||
73511 + (MODPARM_rx_flip && !feature_rx_flip));
73512 +
73513 + err = talk_to_backend(np->xbdev, np);
73514 + if (err)
73515 + return err;
73516 +
73517 + xennet_set_features(dev);
73518 +
73519 + IPRINTK("device %s has %sing receive path.\n",
73520 + dev->name, np->copying_receiver ? "copy" : "flipp");
73521 +
73522 + spin_lock_irq(&np->tx_lock);
73523 + spin_lock(&np->rx_lock);
73524 +
73525 + /*
73526 + * Recovery procedure:
73527 + * NB. Freelist index entries are always going to be less than
73528 + * PAGE_OFFSET, whereas pointers to skbs will always be equal to or
73529 + * greater than PAGE_OFFSET: we use this property to distinguish
73530 + * them.
73531 + */
73532 +
73533 + /* Step 1: Discard all pending TX packet fragments. */
73534 + netif_release_tx_bufs(np);
73535 +
73536 + /* Step 2: Rebuild the RX buffer freelist and the RX ring itself. */
73537 + for (requeue_idx = 0, i = 0; i < NET_RX_RING_SIZE; i++) {
73538 + if (!np->rx_skbs[i])
73539 + continue;
73540 +
73541 + skb = np->rx_skbs[requeue_idx] = xennet_get_rx_skb(np, i);
73542 + ref = np->grant_rx_ref[requeue_idx] = xennet_get_rx_ref(np, i);
73543 + req = RING_GET_REQUEST(&np->rx, requeue_idx);
73544 +
73545 + if (!np->copying_receiver) {
73546 + gnttab_grant_foreign_transfer_ref(
73547 + ref, np->xbdev->otherend_id,
73548 + page_to_pfn(skb_shinfo(skb)->frags->page));
73549 + } else {
73550 + gnttab_grant_foreign_access_ref(
73551 + ref, np->xbdev->otherend_id,
73552 + pfn_to_mfn(page_to_pfn(skb_shinfo(skb)->
73553 + frags->page)),
73554 + 0);
73555 + }
73556 + req->gref = ref;
73557 + req->id = requeue_idx;
73558 +
73559 + requeue_idx++;
73560 + }
73561 +
73562 + np->rx.req_prod_pvt = requeue_idx;
73563 +
73564 + /*
73565 + * Step 3: All public and private state should now be sane. Get
73566 + * ready to start sending and receiving packets and give the driver
73567 + * domain a kick because we've probably just requeued some
73568 + * packets.
73569 + */
73570 + netif_carrier_on(dev);
73571 + notify_remote_via_irq(np->irq);
73572 + network_tx_buf_gc(dev);
73573 + network_alloc_rx_buffers(dev);
73574 +
73575 + spin_unlock(&np->rx_lock);
73576 + spin_unlock_irq(&np->tx_lock);
73577 +
73578 + return 0;
73579 +}
73580 +
73581 +static void netif_uninit(struct net_device *dev)
73582 +{
73583 + struct netfront_info *np = netdev_priv(dev);
73584 + netif_release_tx_bufs(np);
73585 + netif_release_rx_bufs(np);
73586 + gnttab_free_grant_references(np->gref_tx_head);
73587 + gnttab_free_grant_references(np->gref_rx_head);
73588 +}
73589 +
73590 +static struct ethtool_ops network_ethtool_ops =
73591 +{
73592 + .get_tx_csum = ethtool_op_get_tx_csum,
73593 + .set_tx_csum = ethtool_op_set_tx_csum,
73594 + .get_sg = ethtool_op_get_sg,
73595 + .set_sg = xennet_set_sg,
73596 + .get_tso = ethtool_op_get_tso,
73597 + .set_tso = xennet_set_tso,
73598 + .get_link = ethtool_op_get_link,
73599 +};
73600 +
73601 +#ifdef CONFIG_SYSFS
73602 +static ssize_t show_rxbuf_min(struct class_device *cd, char *buf)
73603 +{
73604 + struct net_device *netdev = container_of(cd, struct net_device,
73605 + class_dev);
73606 + struct netfront_info *info = netdev_priv(netdev);
73607 +
73608 + return sprintf(buf, "%u\n", info->rx_min_target);
73609 +}
73610 +
73611 +static ssize_t store_rxbuf_min(struct class_device *cd,
73612 + const char *buf, size_t len)
73613 +{
73614 + struct net_device *netdev = container_of(cd, struct net_device,
73615 + class_dev);
73616 + struct netfront_info *np = netdev_priv(netdev);
73617 + char *endp;
73618 + unsigned long target;
73619 +
73620 + if (!capable(CAP_NET_ADMIN))
73621 + return -EPERM;
73622 +
73623 + target = simple_strtoul(buf, &endp, 0);
73624 + if (endp == buf)
73625 + return -EBADMSG;
73626 +
73627 + if (target < RX_MIN_TARGET)
73628 + target = RX_MIN_TARGET;
73629 + if (target > RX_MAX_TARGET)
73630 + target = RX_MAX_TARGET;
73631 +
73632 + spin_lock(&np->rx_lock);
73633 + if (target > np->rx_max_target)
73634 + np->rx_max_target = target;
73635 + np->rx_min_target = target;
73636 + if (target > np->rx_target)
73637 + np->rx_target = target;
73638 +
73639 + network_alloc_rx_buffers(netdev);
73640 +
73641 + spin_unlock(&np->rx_lock);
73642 + return len;
73643 +}
73644 +
73645 +static ssize_t show_rxbuf_max(struct class_device *cd, char *buf)
73646 +{
73647 + struct net_device *netdev = container_of(cd, struct net_device,
73648 + class_dev);
73649 + struct netfront_info *info = netdev_priv(netdev);
73650 +
73651 + return sprintf(buf, "%u\n", info->rx_max_target);
73652 +}
73653 +
73654 +static ssize_t store_rxbuf_max(struct class_device *cd,
73655 + const char *buf, size_t len)
73656 +{
73657 + struct net_device *netdev = container_of(cd, struct net_device,
73658 + class_dev);
73659 + struct netfront_info *np = netdev_priv(netdev);
73660 + char *endp;
73661 + unsigned long target;
73662 +
73663 + if (!capable(CAP_NET_ADMIN))
73664 + return -EPERM;
73665 +
73666 + target = simple_strtoul(buf, &endp, 0);
73667 + if (endp == buf)
73668 + return -EBADMSG;
73669 +
73670 + if (target < RX_MIN_TARGET)
73671 + target = RX_MIN_TARGET;
73672 + if (target > RX_MAX_TARGET)
73673 + target = RX_MAX_TARGET;
73674 +
73675 + spin_lock(&np->rx_lock);
73676 + if (target < np->rx_min_target)
73677 + np->rx_min_target = target;
73678 + np->rx_max_target = target;
73679 + if (target < np->rx_target)
73680 + np->rx_target = target;
73681 +
73682 + network_alloc_rx_buffers(netdev);
73683 +
73684 + spin_unlock(&np->rx_lock);
73685 + return len;
73686 +}
73687 +
73688 +static ssize_t show_rxbuf_cur(struct class_device *cd, char *buf)
73689 +{
73690 + struct net_device *netdev = container_of(cd, struct net_device,
73691 + class_dev);
73692 + struct netfront_info *info = netdev_priv(netdev);
73693 +
73694 + return sprintf(buf, "%u\n", info->rx_target);
73695 +}
73696 +
73697 +static const struct class_device_attribute xennet_attrs[] = {
73698 + __ATTR(rxbuf_min, S_IRUGO|S_IWUSR, show_rxbuf_min, store_rxbuf_min),
73699 + __ATTR(rxbuf_max, S_IRUGO|S_IWUSR, show_rxbuf_max, store_rxbuf_max),
73700 + __ATTR(rxbuf_cur, S_IRUGO, show_rxbuf_cur, NULL),
73701 +};
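+/*
+ * These class-device attributes typically appear as
+ * /sys/class/net/<ifname>/rxbuf_min, rxbuf_max and rxbuf_cur.
+ */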
73702 +
73703 +static int xennet_sysfs_addif(struct net_device *netdev)
73704 +{
73705 + int i;
73706 + int error = 0;
73707 +
73708 + for (i = 0; i < ARRAY_SIZE(xennet_attrs); i++) {
73709 + error = class_device_create_file(&netdev->class_dev,
73710 + &xennet_attrs[i]);
73711 + if (error)
73712 + goto fail;
73713 + }
73714 + return 0;
73715 +
73716 + fail:
73717 + while (--i >= 0)
73718 + class_device_remove_file(&netdev->class_dev,
73719 + &xennet_attrs[i]);
73720 + return error;
73721 +}
73722 +
73723 +static void xennet_sysfs_delif(struct net_device *netdev)
73724 +{
73725 + int i;
73726 +
73727 + for (i = 0; i < ARRAY_SIZE(xennet_attrs); i++) {
73728 + class_device_remove_file(&netdev->class_dev,
73729 + &xennet_attrs[i]);
73730 + }
73731 +}
73732 +
73733 +#endif /* CONFIG_SYSFS */
73734 +
73735 +
73736 +/*
73737 + * Nothing to do here. Virtual interface is point-to-point and the
73738 + * physical interface is probably promiscuous anyway.
73739 + */
73740 +static void network_set_multicast_list(struct net_device *dev)
73741 +{
73742 +}
73743 +
73744 +static struct net_device * __devinit create_netdev(struct xenbus_device *dev)
73745 +{
73746 + int i, err = 0;
73747 + struct net_device *netdev = NULL;
73748 + struct netfront_info *np = NULL;
73749 +
73750 + netdev = alloc_etherdev(sizeof(struct netfront_info));
73751 + if (!netdev) {
73752 + printk(KERN_WARNING "%s> alloc_etherdev failed.\n",
73753 + __FUNCTION__);
73754 + return ERR_PTR(-ENOMEM);
73755 + }
73756 +
73757 + np = netdev_priv(netdev);
73758 + np->xbdev = dev;
73759 +
73760 + netif_carrier_off(netdev);
73761 +
73762 + spin_lock_init(&np->tx_lock);
73763 + spin_lock_init(&np->rx_lock);
73764 +
73765 + skb_queue_head_init(&np->rx_batch);
73766 + np->rx_target = RX_DFL_MIN_TARGET;
73767 + np->rx_min_target = RX_DFL_MIN_TARGET;
73768 + np->rx_max_target = RX_MAX_TARGET;
73769 +
73770 + init_timer(&np->rx_refill_timer);
73771 + np->rx_refill_timer.data = (unsigned long)netdev;
73772 + np->rx_refill_timer.function = rx_refill_timeout;
73773 +
73774 + /* Initialise {tx,rx}_skbs as a free chain containing every entry. */
73775 + for (i = 0; i <= NET_TX_RING_SIZE; i++) {
73776 + np->tx_skbs[i] = (void *)((unsigned long) i+1);
73777 + np->grant_tx_ref[i] = GRANT_INVALID_REF;
73778 + }
73779 +
73780 + for (i = 0; i < NET_RX_RING_SIZE; i++) {
73781 + np->rx_skbs[i] = NULL;
73782 + np->grant_rx_ref[i] = GRANT_INVALID_REF;
73783 + }
73784 +
73785 + /* A grant for every tx ring slot */
73786 + if (gnttab_alloc_grant_references(TX_MAX_TARGET,
73787 + &np->gref_tx_head) < 0) {
73788 + printk(KERN_ALERT "#### netfront can't alloc tx grant refs\n");
73789 + err = -ENOMEM;
73790 + goto exit;
73791 + }
73792 + /* A grant for every rx ring slot */
73793 + if (gnttab_alloc_grant_references(RX_MAX_TARGET,
73794 + &np->gref_rx_head) < 0) {
73795 + printk(KERN_ALERT "#### netfront can't alloc rx grant refs\n");
73796 + err = -ENOMEM;
73797 + goto exit_free_tx;
73798 + }
73799 +
73800 + netdev->open = network_open;
73801 + netdev->hard_start_xmit = network_start_xmit;
73802 + netdev->stop = network_close;
73803 + netdev->get_stats = network_get_stats;
73804 + netdev->poll = netif_poll;
73805 + netdev->set_multicast_list = network_set_multicast_list;
73806 + netdev->uninit = netif_uninit;
73807 + netdev->change_mtu = xennet_change_mtu;
73808 + netdev->weight = 64;
73809 + netdev->features = NETIF_F_IP_CSUM;
73810 +
73811 + SET_ETHTOOL_OPS(netdev, &network_ethtool_ops);
73812 + SET_MODULE_OWNER(netdev);
73813 + SET_NETDEV_DEV(netdev, &dev->dev);
73814 +
73815 + np->netdev = netdev;
73816 + return netdev;
73817 +
73818 + exit_free_tx:
73819 + gnttab_free_grant_references(np->gref_tx_head);
73820 + exit:
73821 + free_netdev(netdev);
73822 + return ERR_PTR(err);
73823 +}
73824 +
73825 +/*
73826 + * We use this notifier to send out a fake ARP reply to reset switches and
73827 + * router ARP caches when an IP interface is brought up on a VIF.
73828 + */
73829 +static int
73830 +inetdev_notify(struct notifier_block *this, unsigned long event, void *ptr)
73831 +{
73832 + struct in_ifaddr *ifa = (struct in_ifaddr *)ptr;
73833 + struct net_device *dev = ifa->ifa_dev->dev;
73834 +
73835 + /* UP event and is it one of our devices? */
73836 + if (event == NETDEV_UP && dev->open == network_open)
73837 + (void)send_fake_arp(dev);
73838 +
73839 + return NOTIFY_DONE;
73840 +}
73841 +
73842 +
73843 +static void netif_disconnect_backend(struct netfront_info *info)
73844 +{
73845 + /* Stop old i/f to prevent errors whilst we rebuild the state. */
73846 + spin_lock_irq(&info->tx_lock);
73847 + spin_lock(&info->rx_lock);
73848 + netif_carrier_off(info->netdev);
73849 + spin_unlock(&info->rx_lock);
73850 + spin_unlock_irq(&info->tx_lock);
73851 +
73852 + if (info->irq)
73853 + unbind_from_irqhandler(info->irq, info->netdev);
73854 + info->evtchn = info->irq = 0;
73855 +
73856 + end_access(info->tx_ring_ref, info->tx.sring);
73857 + end_access(info->rx_ring_ref, info->rx.sring);
73858 + info->tx_ring_ref = GRANT_INVALID_REF;
73859 + info->rx_ring_ref = GRANT_INVALID_REF;
73860 + info->tx.sring = NULL;
73861 + info->rx.sring = NULL;
73862 +}
73863 +
73864 +
73865 +static void end_access(int ref, void *page)
73866 +{
73867 + if (ref != GRANT_INVALID_REF)
73868 + gnttab_end_foreign_access(ref, 0, (unsigned long)page);
73869 +}
73870 +
73871 +
73872 +/* ** Driver registration ** */
73873 +
73874 +
73875 +static struct xenbus_device_id netfront_ids[] = {
73876 + { "vif" },
73877 + { "" }
73878 +};
73879 +
73880 +
73881 +static struct xenbus_driver netfront = {
73882 + .name = "vif",
73883 + .owner = THIS_MODULE,
73884 + .ids = netfront_ids,
73885 + .probe = netfront_probe,
73886 + .remove = __devexit_p(netfront_remove),
73887 + .resume = netfront_resume,
73888 + .otherend_changed = backend_changed,
73889 +};
73890 +
73891 +
73892 +static struct notifier_block notifier_inetdev = {
73893 + .notifier_call = inetdev_notify,
73894 + .next = NULL,
73895 + .priority = 0
73896 +};
73897 +
73898 +static int __init netif_init(void)
73899 +{
73900 + if (!is_running_on_xen())
73901 + return -ENODEV;
73902 +
73903 +#ifdef CONFIG_XEN
73904 + if (MODPARM_rx_flip && MODPARM_rx_copy) {
73905 + WPRINTK("Cannot specify both rx_copy and rx_flip.\n");
73906 + return -EINVAL;
73907 + }
73908 +
73909 + if (!MODPARM_rx_flip && !MODPARM_rx_copy)
73910 + MODPARM_rx_flip = 1; /* Default is to flip. */
73911 +#endif
73912 +
73913 + if (is_initial_xendomain())
73914 + return 0;
73915 +
73916 + IPRINTK("Initialising virtual ethernet driver.\n");
73917 +
73918 + (void)register_inetaddr_notifier(&notifier_inetdev);
73919 +
73920 + return xenbus_register_frontend(&netfront);
73921 +}
73922 +module_init(netif_init);
73923 +
73924 +
73925 +static void __exit netif_exit(void)
73926 +{
73927 + if (is_initial_xendomain())
73928 + return;
73929 +
73930 + unregister_inetaddr_notifier(&notifier_inetdev);
73931 +
73932 + return xenbus_unregister_driver(&netfront);
73933 +}
73934 +module_exit(netif_exit);
73935 +
73936 +MODULE_LICENSE("Dual BSD/GPL");
73937 diff -Nur linux-2.6.16.33-noxen/drivers/xen/pciback/Makefile linux-2.6.16.33/drivers/xen/pciback/Makefile
73938 --- linux-2.6.16.33-noxen/drivers/xen/pciback/Makefile 1970-01-01 00:00:00.000000000 +0000
73939 +++ linux-2.6.16.33/drivers/xen/pciback/Makefile 2007-01-08 15:00:45.000000000 +0000
73940 @@ -0,0 +1,15 @@
73941 +obj-$(CONFIG_XEN_PCIDEV_BACKEND) += pciback.o
73942 +
73943 +pciback-y := pci_stub.o pciback_ops.o xenbus.o
73944 +pciback-y += conf_space.o conf_space_header.o \
73945 + conf_space_capability.o \
73946 + conf_space_capability_vpd.o \
73947 + conf_space_capability_pm.o \
73948 + conf_space_quirks.o
73949 +pciback-$(CONFIG_XEN_PCIDEV_BACKEND_VPCI) += vpci.o
73950 +pciback-$(CONFIG_XEN_PCIDEV_BACKEND_SLOT) += slot.o
73951 +pciback-$(CONFIG_XEN_PCIDEV_BACKEND_PASS) += passthrough.o
73952 +
73953 +ifeq ($(CONFIG_XEN_PCIDEV_BE_DEBUG),y)
73954 +EXTRA_CFLAGS += -DDEBUG
73955 +endif
73956 diff -Nur linux-2.6.16.33-noxen/drivers/xen/pciback/conf_space.c linux-2.6.16.33/drivers/xen/pciback/conf_space.c
73957 --- linux-2.6.16.33-noxen/drivers/xen/pciback/conf_space.c 1970-01-01 00:00:00.000000000 +0000
73958 +++ linux-2.6.16.33/drivers/xen/pciback/conf_space.c 2007-01-08 15:00:45.000000000 +0000
73959 @@ -0,0 +1,425 @@
73960 +/*
73961 + * PCI Backend - Functions for creating a virtual configuration space for
73962 + * exported PCI Devices.
73963 + * It's dangerous to allow PCI Driver Domains to change their
73964 + * device's resources (memory, i/o ports, interrupts). We need to
73965 + * restrict changes to certain PCI Configuration registers:
73966 + * BARs, INTERRUPT_PIN, most registers in the header...
73967 + *
73968 + * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
73969 + */
73970 +
73971 +#include <linux/kernel.h>
73972 +#include <linux/pci.h>
73973 +#include "pciback.h"
73974 +#include "conf_space.h"
73975 +#include "conf_space_quirks.h"
73976 +
73977 +#define DEFINE_PCI_CONFIG(op,size,type) \
73978 +int pciback_##op##_config_##size \
73979 +(struct pci_dev *dev, int offset, type value, void *data) \
73980 +{ \
73981 + return pci_##op##_config_##size (dev, offset, value); \
73982 +}
73983 +
73984 +DEFINE_PCI_CONFIG(read, byte, u8 *)
73985 +DEFINE_PCI_CONFIG(read, word, u16 *)
73986 +DEFINE_PCI_CONFIG(read, dword, u32 *)
73987 +
73988 +DEFINE_PCI_CONFIG(write, byte, u8)
73989 +DEFINE_PCI_CONFIG(write, word, u16)
73990 +DEFINE_PCI_CONFIG(write, dword, u32)
73991 +
73992 +static int conf_space_read(struct pci_dev *dev,
73993 + struct config_field_entry *entry, int offset,
73994 + u32 * value)
73995 +{
73996 + int ret = 0;
73997 + struct config_field *field = entry->field;
73998 +
73999 + *value = 0;
74000 +
74001 + switch (field->size) {
74002 + case 1:
74003 + if (field->u.b.read)
74004 + ret = field->u.b.read(dev, offset, (u8 *) value,
74005 + entry->data);
74006 + break;
74007 + case 2:
74008 + if (field->u.w.read)
74009 + ret = field->u.w.read(dev, offset, (u16 *) value,
74010 + entry->data);
74011 + break;
74012 + case 4:
74013 + if (field->u.dw.read)
74014 + ret = field->u.dw.read(dev, offset, value, entry->data);
74015 + break;
74016 + }
74017 + return ret;
74018 +}
74019 +
74020 +static int conf_space_write(struct pci_dev *dev,
74021 + struct config_field_entry *entry, int offset,
74022 + u32 value)
74023 +{
74024 + int ret = 0;
74025 + struct config_field *field = entry->field;
74026 +
74027 + switch (field->size) {
74028 + case 1:
74029 + if (field->u.b.write)
74030 + ret = field->u.b.write(dev, offset, (u8) value,
74031 + entry->data);
74032 + break;
74033 + case 2:
74034 + if (field->u.w.write)
74035 + ret = field->u.w.write(dev, offset, (u16) value,
74036 + entry->data);
74037 + break;
74038 + case 4:
74039 + if (field->u.dw.write)
74040 + ret = field->u.dw.write(dev, offset, value,
74041 + entry->data);
74042 + break;
74043 + }
74044 + return ret;
74045 +}
74046 +
74047 +static inline u32 get_mask(int size)
74048 +{
74049 + if (size == 1)
74050 + return 0xff;
74051 + else if (size == 2)
74052 + return 0xffff;
74053 + else
74054 + return 0xffffffff;
74055 +}
74056 +
74057 +static inline int valid_request(int offset, int size)
74058 +{
74059 +	/* Validate request (no unaligned requests) */
74060 + if ((size == 1 || size == 2 || size == 4) && (offset % size) == 0)
74061 + return 1;
74062 + return 0;
74063 +}
74064 +
74065 +static inline u32 merge_value(u32 val, u32 new_val, u32 new_val_mask,
74066 + int offset)
74067 +{
74068 + if (offset >= 0) {
74069 + new_val_mask <<= (offset * 8);
74070 + new_val <<= (offset * 8);
74071 + } else {
74072 + new_val_mask >>= (offset * -8);
74073 + new_val >>= (offset * -8);
74074 + }
74075 + val = (val & ~new_val_mask) | (new_val & new_val_mask);
74076 +
74077 + return val;
74078 +}
74079 +
74080 +static int pcibios_err_to_errno(int err)
74081 +{
74082 + switch (err) {
74083 + case PCIBIOS_SUCCESSFUL:
74084 + return XEN_PCI_ERR_success;
74085 + case PCIBIOS_DEVICE_NOT_FOUND:
74086 + return XEN_PCI_ERR_dev_not_found;
74087 + case PCIBIOS_BAD_REGISTER_NUMBER:
74088 + return XEN_PCI_ERR_invalid_offset;
74089 + case PCIBIOS_FUNC_NOT_SUPPORTED:
74090 + return XEN_PCI_ERR_not_implemented;
74091 + case PCIBIOS_SET_FAILED:
74092 + return XEN_PCI_ERR_access_denied;
74093 + }
74094 + return err;
74095 +}
74096 +
74097 +int pciback_config_read(struct pci_dev *dev, int offset, int size,
74098 + u32 * ret_val)
74099 +{
74100 + int err = 0;
74101 + struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
74102 + struct config_field_entry *cfg_entry;
74103 + struct config_field *field;
74104 + int req_start, req_end, field_start, field_end;
74105 + /* if read fails for any reason, return 0 (as if device didn't respond) */
74106 + u32 value = 0, tmp_val;
74107 +
74108 + if (unlikely(verbose_request))
74109 + printk(KERN_DEBUG "pciback: %s: read %d bytes at 0x%x\n",
74110 + pci_name(dev), size, offset);
74111 +
74112 + if (!valid_request(offset, size)) {
74113 + err = XEN_PCI_ERR_invalid_offset;
74114 + goto out;
74115 + }
74116 +
74117 + /* Get the real value first, then modify as appropriate */
74118 + switch (size) {
74119 + case 1:
74120 + err = pci_read_config_byte(dev, offset, (u8 *) & value);
74121 + break;
74122 + case 2:
74123 + err = pci_read_config_word(dev, offset, (u16 *) & value);
74124 + break;
74125 + case 4:
74126 + err = pci_read_config_dword(dev, offset, &value);
74127 + break;
74128 + }
74129 +
74130 + list_for_each_entry(cfg_entry, &dev_data->config_fields, list) {
74131 + field = cfg_entry->field;
74132 +
74133 + req_start = offset;
74134 + req_end = offset + size;
74135 + field_start = OFFSET(cfg_entry);
74136 + field_end = OFFSET(cfg_entry) + field->size;
74137 +
74138 + if ((req_start >= field_start && req_start < field_end)
74139 + || (req_end > field_start && req_end <= field_end)) {
74140 + err = conf_space_read(dev, cfg_entry, field_start,
74141 + &tmp_val);
74142 + if (err)
74143 + goto out;
74144 +
74145 + value = merge_value(value, tmp_val,
74146 + get_mask(field->size),
74147 + field_start - req_start);
74148 + }
74149 + }
74150 +
74151 + out:
74152 + if (unlikely(verbose_request))
74153 + printk(KERN_DEBUG "pciback: %s: read %d bytes at 0x%x = %x\n",
74154 + pci_name(dev), size, offset, value);
74155 +
74156 + *ret_val = value;
74157 + return pcibios_err_to_errno(err);
74158 +}
74159 +
74160 +int pciback_config_write(struct pci_dev *dev, int offset, int size, u32 value)
74161 +{
74162 + int err = 0, handled = 0;
74163 + struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
74164 + struct config_field_entry *cfg_entry;
74165 + struct config_field *field;
74166 + u32 tmp_val;
74167 + int req_start, req_end, field_start, field_end;
74168 +
74169 + if (unlikely(verbose_request))
74170 + printk(KERN_DEBUG
74171 + "pciback: %s: write request %d bytes at 0x%x = %x\n",
74172 + pci_name(dev), size, offset, value);
74173 +
74174 + if (!valid_request(offset, size))
74175 + return XEN_PCI_ERR_invalid_offset;
74176 +
74177 + list_for_each_entry(cfg_entry, &dev_data->config_fields, list) {
74178 + field = cfg_entry->field;
74179 +
74180 + req_start = offset;
74181 + req_end = offset + size;
74182 + field_start = OFFSET(cfg_entry);
74183 + field_end = OFFSET(cfg_entry) + field->size;
74184 +
74185 + if ((req_start >= field_start && req_start < field_end)
74186 + || (req_end > field_start && req_end <= field_end)) {
74187 + tmp_val = 0;
74188 +
74189 + err = pciback_config_read(dev, field_start,
74190 + field->size, &tmp_val);
74191 + if (err)
74192 + break;
74193 +
74194 + tmp_val = merge_value(tmp_val, value, get_mask(size),
74195 + req_start - field_start);
74196 +
74197 + err = conf_space_write(dev, cfg_entry, field_start,
74198 + tmp_val);
74199 +
74200 + /* handled is set true here, but not every byte
74201 + * may have been written! Properly detecting if
74202 + * every byte is handled is unnecessary as the
74203 + * flag is used to detect devices that need
74204 + * special helpers to work correctly.
74205 + */
74206 + handled = 1;
74207 + }
74208 + }
74209 +
74210 + if (!handled && !err) {
74211 +		/* By default, anything not specifically handled above is
74212 + * read-only. The permissive flag changes this behavior so
74213 + * that anything not specifically handled above is writable.
74214 + * This means that some fields may still be read-only because
74215 + * they have entries in the config_field list that intercept
74216 + * the write and do nothing. */
74217 + if (dev_data->permissive) {
74218 + switch (size) {
74219 + case 1:
74220 + err = pci_write_config_byte(dev, offset,
74221 + (u8) value);
74222 + break;
74223 + case 2:
74224 + err = pci_write_config_word(dev, offset,
74225 + (u16) value);
74226 + break;
74227 + case 4:
74228 + err = pci_write_config_dword(dev, offset,
74229 + (u32) value);
74230 + break;
74231 + }
74232 + } else if (!dev_data->warned_on_write) {
74233 + dev_data->warned_on_write = 1;
74234 + dev_warn(&dev->dev, "Driver tried to write to a "
74235 + "read-only configuration space field at offset "
74236 + "0x%x, size %d. This may be harmless, but if "
74237 + "you have problems with your device:\n"
74238 + "1) see permissive attribute in sysfs\n"
74239 + "2) report problems to the xen-devel "
74240 + "mailing list along with details of your "
74241 + "device obtained from lspci.\n", offset, size);
74242 + }
74243 + }
74244 +
74245 + return pcibios_err_to_errno(err);
74246 +}
74247 +
74248 +void pciback_config_free_dyn_fields(struct pci_dev *dev)
74249 +{
74250 + struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
74251 + struct config_field_entry *cfg_entry, *t;
74252 + struct config_field *field;
74253 +
74254 + dev_dbg(&dev->dev,
74255 +		"freeing dynamically allocated virtual configuration space fields\n");
74256 +
74257 + list_for_each_entry_safe(cfg_entry, t, &dev_data->config_fields, list) {
74258 + field = cfg_entry->field;
74259 +
74260 + if (field->clean) {
74261 + field->clean(field);
74262 +
74263 + if (cfg_entry->data)
74264 + kfree(cfg_entry->data);
74265 +
74266 + list_del(&cfg_entry->list);
74267 + kfree(cfg_entry);
74268 + }
74269 +
74270 + }
74271 +}
74272 +
74273 +void pciback_config_reset_dev(struct pci_dev *dev)
74274 +{
74275 + struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
74276 + struct config_field_entry *cfg_entry;
74277 + struct config_field *field;
74278 +
74279 + dev_dbg(&dev->dev, "resetting virtual configuration space\n");
74280 +
74281 + list_for_each_entry(cfg_entry, &dev_data->config_fields, list) {
74282 + field = cfg_entry->field;
74283 +
74284 + if (field->reset)
74285 + field->reset(dev, OFFSET(cfg_entry), cfg_entry->data);
74286 + }
74287 +}
74288 +
74289 +void pciback_config_free_dev(struct pci_dev *dev)
74290 +{
74291 + struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
74292 + struct config_field_entry *cfg_entry, *t;
74293 + struct config_field *field;
74294 +
74295 +	dev_dbg(&dev->dev, "freeing virtual configuration space fields\n");
74296 +
74297 + list_for_each_entry_safe(cfg_entry, t, &dev_data->config_fields, list) {
74298 + list_del(&cfg_entry->list);
74299 +
74300 + field = cfg_entry->field;
74301 +
74302 + if (field->release)
74303 + field->release(dev, OFFSET(cfg_entry), cfg_entry->data);
74304 +
74305 + kfree(cfg_entry);
74306 + }
74307 +}
74308 +
74309 +int pciback_config_add_field_offset(struct pci_dev *dev,
74310 + struct config_field *field,
74311 + unsigned int offset)
74312 +{
74313 + int err = 0;
74314 + struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
74315 + struct config_field_entry *cfg_entry;
74316 + void *tmp;
74317 +
74318 + /* silently ignore duplicate fields */
74319 + if (pciback_field_is_dup(dev, field->offset))
74320 + goto out;
74321 +
74322 + cfg_entry = kmalloc(sizeof(*cfg_entry), GFP_KERNEL);
74323 + if (!cfg_entry) {
74324 + err = -ENOMEM;
74325 + goto out;
74326 + }
74327 +
74328 + cfg_entry->data = NULL;
74329 + cfg_entry->field = field;
74330 + cfg_entry->base_offset = offset;
74331 +
74332 + if (field->init) {
74333 + tmp = field->init(dev, OFFSET(cfg_entry));
74334 +
74335 + if (IS_ERR(tmp)) {
74336 + err = PTR_ERR(tmp);
74337 + goto out;
74338 + }
74339 +
74340 + cfg_entry->data = tmp;
74341 + }
74342 +
74343 + dev_dbg(&dev->dev, "added config field at offset 0x%02x\n",
74344 + OFFSET(cfg_entry));
74345 + list_add_tail(&cfg_entry->list, &dev_data->config_fields);
74346 +
74347 + out:
74348 + if (err)
74349 + kfree(cfg_entry);
74350 +
74351 + return err;
74352 +}
74353 +
74354 +/* This sets up the device's virtual configuration space to keep track of
74355 + * certain registers (like the base address registers (BARs)) so that we can
74356 + * keep the client from manipulating them directly.
74357 + */
74358 +int pciback_config_init_dev(struct pci_dev *dev)
74359 +{
74360 + int err = 0;
74361 + struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
74362 +
74363 + dev_dbg(&dev->dev, "initializing virtual configuration space\n");
74364 +
74365 + INIT_LIST_HEAD(&dev_data->config_fields);
74366 +
74367 + err = pciback_config_header_add_fields(dev);
74368 + if (err)
74369 + goto out;
74370 +
74371 + err = pciback_config_capability_add_fields(dev);
74372 + if (err)
74373 + goto out;
74374 +
74375 + err = pciback_config_quirks_init(dev);
74376 +
74377 + out:
74378 + return err;
74379 +}
74380 +
74381 +int pciback_config_init(void)
74382 +{
74383 + return pciback_config_capability_init();
74384 +}
74385 diff -Nur linux-2.6.16.33-noxen/drivers/xen/pciback/conf_space.h linux-2.6.16.33/drivers/xen/pciback/conf_space.h
74386 --- linux-2.6.16.33-noxen/drivers/xen/pciback/conf_space.h 1970-01-01 00:00:00.000000000 +0000
74387 +++ linux-2.6.16.33/drivers/xen/pciback/conf_space.h 2007-01-08 15:00:45.000000000 +0000
74388 @@ -0,0 +1,126 @@
74389 +/*
74390 + * PCI Backend - Common data structures for overriding the configuration space
74391 + *
74392 + * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
74393 + */
74394 +
74395 +#ifndef __XEN_PCIBACK_CONF_SPACE_H__
74396 +#define __XEN_PCIBACK_CONF_SPACE_H__
74397 +
74398 +#include <linux/list.h>
74399 +#include <linux/err.h>
74400 +
74401 +/* conf_field_init can return an errno in a ptr with ERR_PTR() */
74402 +typedef void *(*conf_field_init) (struct pci_dev * dev, int offset);
74403 +typedef void (*conf_field_reset) (struct pci_dev * dev, int offset, void *data);
74404 +typedef void (*conf_field_free) (struct pci_dev * dev, int offset, void *data);
74405 +
74406 +typedef int (*conf_dword_write) (struct pci_dev * dev, int offset, u32 value,
74407 + void *data);
74408 +typedef int (*conf_word_write) (struct pci_dev * dev, int offset, u16 value,
74409 + void *data);
74410 +typedef int (*conf_byte_write) (struct pci_dev * dev, int offset, u8 value,
74411 + void *data);
74412 +typedef int (*conf_dword_read) (struct pci_dev * dev, int offset, u32 * value,
74413 + void *data);
74414 +typedef int (*conf_word_read) (struct pci_dev * dev, int offset, u16 * value,
74415 + void *data);
74416 +typedef int (*conf_byte_read) (struct pci_dev * dev, int offset, u8 * value,
74417 + void *data);
74418 +
74419 +/* These are the fields within the configuration space which we
74420 + * are interested in intercepting reads/writes to and changing their
74421 + * values.
74422 + */
74423 +struct config_field {
74424 + unsigned int offset;
74425 + unsigned int size;
74426 + unsigned int mask;
74427 + conf_field_init init;
74428 + conf_field_reset reset;
74429 + conf_field_free release;
74430 + void (*clean) (struct config_field * field);
74431 + union {
74432 + struct {
74433 + conf_dword_write write;
74434 + conf_dword_read read;
74435 + } dw;
74436 + struct {
74437 + conf_word_write write;
74438 + conf_word_read read;
74439 + } w;
74440 + struct {
74441 + conf_byte_write write;
74442 + conf_byte_read read;
74443 + } b;
74444 + } u;
74445 + struct list_head list;
74446 +};
74447 +
74448 +struct config_field_entry {
74449 + struct list_head list;
74450 + struct config_field *field;
74451 + unsigned int base_offset;
74452 + void *data;
74453 +};
74454 +
74455 +#define OFFSET(cfg_entry) ((cfg_entry)->base_offset+(cfg_entry)->field->offset)
74456 +
74457 +/* Add fields to a device - the add_fields macro expects to get a pointer to
74458 + * the first entry in an array (of which the ending is marked by size==0)
74459 + */
74460 +int pciback_config_add_field_offset(struct pci_dev *dev,
74461 + struct config_field *field,
74462 + unsigned int offset);
74463 +
74464 +static inline int pciback_config_add_field(struct pci_dev *dev,
74465 + struct config_field *field)
74466 +{
74467 + return pciback_config_add_field_offset(dev, field, 0);
74468 +}
74469 +
74470 +static inline int pciback_config_add_fields(struct pci_dev *dev,
74471 + struct config_field *field)
74472 +{
74473 + int i, err = 0;
74474 + for (i = 0; field[i].size != 0; i++) {
74475 + err = pciback_config_add_field(dev, &field[i]);
74476 + if (err)
74477 + break;
74478 + }
74479 + return err;
74480 +}
74481 +
74482 +static inline int pciback_config_add_fields_offset(struct pci_dev *dev,
74483 + struct config_field *field,
74484 + unsigned int offset)
74485 +{
74486 + int i, err = 0;
74487 + for (i = 0; field[i].size != 0; i++) {
74488 + err = pciback_config_add_field_offset(dev, &field[i], offset);
74489 + if (err)
74490 + break;
74491 + }
74492 + return err;
74493 +}
74494 +
74495 +/* Read/Write the real configuration space */
74496 +int pciback_read_config_byte(struct pci_dev *dev, int offset, u8 * value,
74497 + void *data);
74498 +int pciback_read_config_word(struct pci_dev *dev, int offset, u16 * value,
74499 + void *data);
74500 +int pciback_read_config_dword(struct pci_dev *dev, int offset, u32 * value,
74501 + void *data);
74502 +int pciback_write_config_byte(struct pci_dev *dev, int offset, u8 value,
74503 + void *data);
74504 +int pciback_write_config_word(struct pci_dev *dev, int offset, u16 value,
74505 + void *data);
74506 +int pciback_write_config_dword(struct pci_dev *dev, int offset, u32 value,
74507 + void *data);
74508 +
74509 +int pciback_config_capability_init(void);
74510 +
74511 +int pciback_config_header_add_fields(struct pci_dev *dev);
74512 +int pciback_config_capability_add_fields(struct pci_dev *dev);
74513 +
74514 +#endif /* __XEN_PCIBACK_CONF_SPACE_H__ */
74515 diff -Nur linux-2.6.16.33-noxen/drivers/xen/pciback/conf_space_capability.c linux-2.6.16.33/drivers/xen/pciback/conf_space_capability.c
74516 --- linux-2.6.16.33-noxen/drivers/xen/pciback/conf_space_capability.c 1970-01-01 00:00:00.000000000 +0000
74517 +++ linux-2.6.16.33/drivers/xen/pciback/conf_space_capability.c 2007-01-08 15:00:45.000000000 +0000
74518 @@ -0,0 +1,71 @@
74519 +/*
74520 + * PCI Backend - Handles the virtual fields found on the capability lists
74521 + * in the configuration space.
74522 + *
74523 + * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
74524 + */
74525 +
74526 +#include <linux/kernel.h>
74527 +#include <linux/pci.h>
74528 +#include "pciback.h"
74529 +#include "conf_space.h"
74530 +#include "conf_space_capability.h"
74531 +
74532 +static LIST_HEAD(capabilities);
74533 +
74534 +static struct config_field caplist_header[] = {
74535 + {
74536 + .offset = PCI_CAP_LIST_ID,
74537 + .size = 2, /* encompass PCI_CAP_LIST_ID & PCI_CAP_LIST_NEXT */
74538 + .u.w.read = pciback_read_config_word,
74539 + .u.w.write = NULL,
74540 + },
74541 + {
74542 + .size = 0,
74543 + },
74544 +};
74545 +
74546 +static inline void register_capability(struct pciback_config_capability *cap)
74547 +{
74548 + list_add_tail(&cap->cap_list, &capabilities);
74549 +}
74550 +
74551 +int pciback_config_capability_add_fields(struct pci_dev *dev)
74552 +{
74553 + int err = 0;
74554 + struct pciback_config_capability *cap;
74555 + int cap_offset;
74556 +
74557 + list_for_each_entry(cap, &capabilities, cap_list) {
74558 + cap_offset = pci_find_capability(dev, cap->capability);
74559 + if (cap_offset) {
74560 + dev_dbg(&dev->dev, "Found capability 0x%x at 0x%x\n",
74561 + cap->capability, cap_offset);
74562 +
74563 + err = pciback_config_add_fields_offset(dev,
74564 + caplist_header,
74565 + cap_offset);
74566 + if (err)
74567 + goto out;
74568 + err = pciback_config_add_fields_offset(dev,
74569 + cap->fields,
74570 + cap_offset);
74571 + if (err)
74572 + goto out;
74573 + }
74574 + }
74575 +
74576 + out:
74577 + return err;
74578 +}
74579 +
74580 +extern struct pciback_config_capability pciback_config_capability_vpd;
74581 +extern struct pciback_config_capability pciback_config_capability_pm;
74582 +
74583 +int pciback_config_capability_init(void)
74584 +{
74585 + register_capability(&pciback_config_capability_vpd);
74586 + register_capability(&pciback_config_capability_pm);
74587 +
74588 + return 0;
74589 +}
74590 diff -Nur linux-2.6.16.33-noxen/drivers/xen/pciback/conf_space_capability.h linux-2.6.16.33/drivers/xen/pciback/conf_space_capability.h
74591 --- linux-2.6.16.33-noxen/drivers/xen/pciback/conf_space_capability.h 1970-01-01 00:00:00.000000000 +0000
74592 +++ linux-2.6.16.33/drivers/xen/pciback/conf_space_capability.h 2007-01-08 15:00:45.000000000 +0000
74593 @@ -0,0 +1,23 @@
74594 +/*
74595 + * PCI Backend - Data structures for special overlays for structures on
74596 + * the capability list.
74597 + *
74598 + * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
74599 + */
74600 +
74601 +#ifndef __PCIBACK_CONFIG_CAPABILITY_H__
74602 +#define __PCIBACK_CONFIG_CAPABILITY_H__
74603 +
74604 +#include <linux/pci.h>
74605 +#include <linux/list.h>
74606 +
74607 +struct pciback_config_capability {
74608 + struct list_head cap_list;
74609 +
74610 + int capability;
74611 +
74612 + /* If the device has the capability found above, add these fields */
74613 + struct config_field *fields;
74614 +};
74615 +
74616 +#endif
74617 diff -Nur linux-2.6.16.33-noxen/drivers/xen/pciback/conf_space_capability_pm.c linux-2.6.16.33/drivers/xen/pciback/conf_space_capability_pm.c
74618 --- linux-2.6.16.33-noxen/drivers/xen/pciback/conf_space_capability_pm.c 1970-01-01 00:00:00.000000000 +0000
74619 +++ linux-2.6.16.33/drivers/xen/pciback/conf_space_capability_pm.c 2007-01-08 15:00:45.000000000 +0000
74620 @@ -0,0 +1,113 @@
74621 +/*
74622 + * PCI Backend - Configuration space overlay for power management
74623 + *
74624 + * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
74625 + */
74626 +
74627 +#include <linux/pci.h>
74628 +#include "conf_space.h"
74629 +#include "conf_space_capability.h"
74630 +
74631 +static int pm_caps_read(struct pci_dev *dev, int offset, u16 *value,
74632 + void *data)
74633 +{
74634 + int err;
74635 + u16 real_value;
74636 +
74637 + err = pci_read_config_word(dev, offset, &real_value);
74638 + if (err)
74639 + goto out;
74640 +
74641 + *value = real_value & ~PCI_PM_CAP_PME_MASK;
74642 +
74643 + out:
74644 + return err;
74645 +}
74646 +
74647 +/* PM_OK_BITS specifies the bits that the driver domain is allowed to change.
74648 + * Can't allow driver domain to enable PMEs - they're shared */
74649 +#define PM_OK_BITS (PCI_PM_CTRL_PME_STATUS|PCI_PM_CTRL_DATA_SEL_MASK)
74650 +
74651 +static int pm_ctrl_write(struct pci_dev *dev, int offset, u16 new_value,
74652 + void *data)
74653 +{
74654 + int err;
74655 + u16 cur_value;
74656 + pci_power_t new_state;
74657 +
74658 + /* Handle setting power state separately */
74659 + new_state = (pci_power_t)(new_value & PCI_PM_CTRL_STATE_MASK);
74660 +
74661 + err = pci_read_config_word(dev, offset, &cur_value);
74662 + if (err)
74663 + goto out;
74664 +
74665 + new_value &= PM_OK_BITS;
74666 + if ((cur_value & PM_OK_BITS) != new_value) {
74667 + new_value = (cur_value & ~PM_OK_BITS) | new_value;
74668 + err = pci_write_config_word(dev, offset, new_value);
74669 + if (err)
74670 + goto out;
74671 + }
74672 +
74673 + /* Let pci core handle the power management change */
74674 + dev_dbg(&dev->dev, "set power state to %x\n", new_state);
74675 + err = pci_set_power_state(dev, new_state);
74676 + if (err)
74677 + err = PCIBIOS_SET_FAILED;
74678 +
74679 + out:
74680 + return err;
74681 +}
74682 +
74683 +/* Ensure PMEs are disabled */
74684 +static void *pm_ctrl_init(struct pci_dev *dev, int offset)
74685 +{
74686 + int err;
74687 + u16 value;
74688 +
74689 + err = pci_read_config_word(dev, offset, &value);
74690 + if (err)
74691 + goto out;
74692 +
74693 + if (value & PCI_PM_CTRL_PME_ENABLE) {
74694 + value &= ~PCI_PM_CTRL_PME_ENABLE;
74695 + err = pci_write_config_word(dev, offset, value);
74696 + }
74697 +
74698 + out:
74699 + return ERR_PTR(err);
74700 +}
74701 +
74702 +static struct config_field caplist_pm[] = {
74703 + {
74704 + .offset = PCI_PM_PMC,
74705 + .size = 2,
74706 + .u.w.read = pm_caps_read,
74707 + },
74708 + {
74709 + .offset = PCI_PM_CTRL,
74710 + .size = 2,
74711 + .init = pm_ctrl_init,
74712 + .u.w.read = pciback_read_config_word,
74713 + .u.w.write = pm_ctrl_write,
74714 + },
74715 + {
74716 + .offset = PCI_PM_PPB_EXTENSIONS,
74717 + .size = 1,
74718 + .u.b.read = pciback_read_config_byte,
74719 + },
74720 + {
74721 + .offset = PCI_PM_DATA_REGISTER,
74722 + .size = 1,
74723 + .u.b.read = pciback_read_config_byte,
74724 + },
74725 + {
74726 + .size = 0,
74727 + },
74728 +};
74729 +
74730 +struct pciback_config_capability pciback_config_capability_pm = {
74731 + .capability = PCI_CAP_ID_PM,
74732 + .fields = caplist_pm,
74733 +};
74734 diff -Nur linux-2.6.16.33-noxen/drivers/xen/pciback/conf_space_capability_vpd.c linux-2.6.16.33/drivers/xen/pciback/conf_space_capability_vpd.c
74735 --- linux-2.6.16.33-noxen/drivers/xen/pciback/conf_space_capability_vpd.c 1970-01-01 00:00:00.000000000 +0000
74736 +++ linux-2.6.16.33/drivers/xen/pciback/conf_space_capability_vpd.c 2007-01-08 15:00:45.000000000 +0000
74737 @@ -0,0 +1,42 @@
74738 +/*
74739 + * PCI Backend - Configuration space overlay for Vital Product Data
74740 + *
74741 + * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
74742 + */
74743 +
74744 +#include <linux/pci.h>
74745 +#include "conf_space.h"
74746 +#include "conf_space_capability.h"
74747 +
74748 +static int vpd_address_write(struct pci_dev *dev, int offset, u16 value,
74749 + void *data)
74750 +{
74751 + /* Disallow writes to the vital product data */
74752 + if (value & PCI_VPD_ADDR_F)
74753 + return PCIBIOS_SET_FAILED;
74754 + else
74755 + return pci_write_config_word(dev, offset, value);
74756 +}
74757 +
74758 +static struct config_field caplist_vpd[] = {
74759 + {
74760 + .offset = PCI_VPD_ADDR,
74761 + .size = 2,
74762 + .u.w.read = pciback_read_config_word,
74763 + .u.w.write = vpd_address_write,
74764 + },
74765 + {
74766 + .offset = PCI_VPD_DATA,
74767 + .size = 4,
74768 + .u.dw.read = pciback_read_config_dword,
74769 + .u.dw.write = NULL,
74770 + },
74771 + {
74772 + .size = 0,
74773 + },
74774 +};
74775 +
74776 +struct pciback_config_capability pciback_config_capability_vpd = {
74777 + .capability = PCI_CAP_ID_VPD,
74778 + .fields = caplist_vpd,
74779 +};
74780 diff -Nur linux-2.6.16.33-noxen/drivers/xen/pciback/conf_space_header.c linux-2.6.16.33/drivers/xen/pciback/conf_space_header.c
74781 --- linux-2.6.16.33-noxen/drivers/xen/pciback/conf_space_header.c 1970-01-01 00:00:00.000000000 +0000
74782 +++ linux-2.6.16.33/drivers/xen/pciback/conf_space_header.c 2007-01-08 15:00:45.000000000 +0000
74783 @@ -0,0 +1,299 @@
74784 +/*
74785 + * PCI Backend - Handles the virtual fields in the configuration space headers.
74786 + *
74787 + * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
74788 + */
74789 +
74790 +#include <linux/kernel.h>
74791 +#include <linux/pci.h>
74792 +#include "pciback.h"
74793 +#include "conf_space.h"
74794 +
74795 +struct pci_bar_info {
74796 + u32 val;
74797 + u32 len_val;
74798 + int which;
74799 +};
74800 +
74801 +#define is_enable_cmd(value) ((value)&(PCI_COMMAND_MEMORY|PCI_COMMAND_IO))
74802 +#define is_master_cmd(value) ((value)&PCI_COMMAND_MASTER)
74803 +
74804 +static int command_write(struct pci_dev *dev, int offset, u16 value, void *data)
74805 +{
74806 + if (!dev->is_enabled && is_enable_cmd(value)) {
74807 + if (unlikely(verbose_request))
74808 + printk(KERN_DEBUG "pciback: %s: enable\n",
74809 + pci_name(dev));
74810 + pci_enable_device(dev);
74811 + } else if (dev->is_enabled && !is_enable_cmd(value)) {
74812 + if (unlikely(verbose_request))
74813 + printk(KERN_DEBUG "pciback: %s: disable\n",
74814 + pci_name(dev));
74815 + pci_disable_device(dev);
74816 + }
74817 +
74818 + if (!dev->is_busmaster && is_master_cmd(value)) {
74819 + if (unlikely(verbose_request))
74820 + printk(KERN_DEBUG "pciback: %s: set bus master\n",
74821 + pci_name(dev));
74822 + pci_set_master(dev);
74823 + }
74824 +
74825 + if (value & PCI_COMMAND_INVALIDATE) {
74826 + if (unlikely(verbose_request))
74827 + printk(KERN_DEBUG
74828 + "pciback: %s: enable memory-write-invalidate\n",
74829 + pci_name(dev));
74830 + pci_set_mwi(dev);
74831 + }
74832 +
74833 + return pci_write_config_word(dev, offset, value);
74834 +}
74835 +
74836 +static int rom_write(struct pci_dev *dev, int offset, u32 value, void *data)
74837 +{
74838 + struct pci_bar_info *bar = data;
74839 +
74840 + if (unlikely(!bar)) {
74841 + printk(KERN_WARNING "pciback: driver data not found for %s\n",
74842 + pci_name(dev));
74843 + return XEN_PCI_ERR_op_failed;
74844 + }
74845 +
74846 + /* A write to obtain the length must happen as a 32-bit write.
74847 + * This does not (yet) support writing individual bytes
74848 + */
74849 + if (value == ~PCI_ROM_ADDRESS_ENABLE)
74850 + bar->which = 1;
74851 + else
74852 + bar->which = 0;
74853 +
74854 + /* Do we need to support enabling/disabling the rom address here? */
74855 +
74856 + return 0;
74857 +}
74858 +
74859 +/* For the BARs, only allow writes which write ~0 or
74860 + * the correct resource information
74861 + * (Needed for when the driver probes the resource usage)
74862 + */
74863 +static int bar_write(struct pci_dev *dev, int offset, u32 value, void *data)
74864 +{
74865 + struct pci_bar_info *bar = data;
74866 +
74867 + if (unlikely(!bar)) {
74868 + printk(KERN_WARNING "pciback: driver data not found for %s\n",
74869 + pci_name(dev));
74870 + return XEN_PCI_ERR_op_failed;
74871 + }
74872 +
74873 + /* A write to obtain the length must happen as a 32-bit write.
74874 + * This does not (yet) support writing individual bytes
74875 + */
74876 + if (value == ~0)
74877 + bar->which = 1;
74878 + else
74879 + bar->which = 0;
74880 +
74881 + return 0;
74882 +}
74883 +
74884 +static int bar_read(struct pci_dev *dev, int offset, u32 * value, void *data)
74885 +{
74886 + struct pci_bar_info *bar = data;
74887 +
74888 + if (unlikely(!bar)) {
74889 + printk(KERN_WARNING "pciback: driver data not found for %s\n",
74890 + pci_name(dev));
74891 + return XEN_PCI_ERR_op_failed;
74892 + }
74893 +
74894 + *value = bar->which ? bar->len_val : bar->val;
74895 +
74896 + return 0;
74897 +}
74898 +
74899 +static inline void read_dev_bar(struct pci_dev *dev,
74900 + struct pci_bar_info *bar_info, int offset,
74901 + u32 len_mask)
74902 +{
74903 + pci_read_config_dword(dev, offset, &bar_info->val);
74904 + pci_write_config_dword(dev, offset, len_mask);
74905 + pci_read_config_dword(dev, offset, &bar_info->len_val);
74906 + pci_write_config_dword(dev, offset, bar_info->val);
74907 +}
74908 +
74909 +static void *bar_init(struct pci_dev *dev, int offset)
74910 +{
74911 + struct pci_bar_info *bar = kmalloc(sizeof(*bar), GFP_KERNEL);
74912 +
74913 + if (!bar)
74914 + return ERR_PTR(-ENOMEM);
74915 +
74916 + read_dev_bar(dev, bar, offset, ~0);
74917 + bar->which = 0;
74918 +
74919 + return bar;
74920 +}
74921 +
74922 +static void *rom_init(struct pci_dev *dev, int offset)
74923 +{
74924 + struct pci_bar_info *bar = kmalloc(sizeof(*bar), GFP_KERNEL);
74925 +
74926 + if (!bar)
74927 + return ERR_PTR(-ENOMEM);
74928 +
74929 + read_dev_bar(dev, bar, offset, ~PCI_ROM_ADDRESS_ENABLE);
74930 + bar->which = 0;
74931 +
74932 + return bar;
74933 +}
74934 +
74935 +static void bar_reset(struct pci_dev *dev, int offset, void *data)
74936 +{
74937 + struct pci_bar_info *bar = data;
74938 +
74939 + bar->which = 0;
74940 +}
74941 +
74942 +static void bar_release(struct pci_dev *dev, int offset, void *data)
74943 +{
74944 + kfree(data);
74945 +}
74946 +
74947 +static int interrupt_read(struct pci_dev *dev, int offset, u8 * value,
74948 + void *data)
74949 +{
74950 + *value = (u8) dev->irq;
74951 +
74952 + return 0;
74953 +}
74954 +
74955 +static int bist_write(struct pci_dev *dev, int offset, u8 value, void *data)
74956 +{
74957 + u8 cur_value;
74958 + int err;
74959 +
74960 + err = pci_read_config_byte(dev, offset, &cur_value);
74961 + if (err)
74962 + goto out;
74963 +
74964 + if ((cur_value & ~PCI_BIST_START) == (value & ~PCI_BIST_START)
74965 + || value == PCI_BIST_START)
74966 + err = pci_write_config_byte(dev, offset, value);
74967 +
74968 + out:
74969 + return err;
74970 +}
74971 +
74972 +static struct config_field header_common[] = {
74973 + {
74974 + .offset = PCI_COMMAND,
74975 + .size = 2,
74976 + .u.w.read = pciback_read_config_word,
74977 + .u.w.write = command_write,
74978 + },
74979 + {
74980 + .offset = PCI_INTERRUPT_LINE,
74981 + .size = 1,
74982 + .u.b.read = interrupt_read,
74983 + },
74984 + {
74985 + .offset = PCI_INTERRUPT_PIN,
74986 + .size = 1,
74987 + .u.b.read = pciback_read_config_byte,
74988 + },
74989 + {
74990 + /* Any side effects of letting driver domain control cache line? */
74991 + .offset = PCI_CACHE_LINE_SIZE,
74992 + .size = 1,
74993 + .u.b.read = pciback_read_config_byte,
74994 + .u.b.write = pciback_write_config_byte,
74995 + },
74996 + {
74997 + .offset = PCI_LATENCY_TIMER,
74998 + .size = 1,
74999 + .u.b.read = pciback_read_config_byte,
75000 + },
75001 + {
75002 + .offset = PCI_BIST,
75003 + .size = 1,
75004 + .u.b.read = pciback_read_config_byte,
75005 + .u.b.write = bist_write,
75006 + },
75007 + {
75008 + .size = 0,
75009 + },
75010 +};
75011 +
75012 +#define CFG_FIELD_BAR(reg_offset) \
75013 + { \
75014 + .offset = reg_offset, \
75015 + .size = 4, \
75016 + .init = bar_init, \
75017 + .reset = bar_reset, \
75018 + .release = bar_release, \
75019 + .u.dw.read = bar_read, \
75020 + .u.dw.write = bar_write, \
75021 + }
75022 +
75023 +#define CFG_FIELD_ROM(reg_offset) \
75024 + { \
75025 + .offset = reg_offset, \
75026 + .size = 4, \
75027 + .init = rom_init, \
75028 + .reset = bar_reset, \
75029 + .release = bar_release, \
75030 + .u.dw.read = bar_read, \
75031 + .u.dw.write = rom_write, \
75032 + }
75033 +
75034 +static struct config_field header_0[] = {
75035 + CFG_FIELD_BAR(PCI_BASE_ADDRESS_0),
75036 + CFG_FIELD_BAR(PCI_BASE_ADDRESS_1),
75037 + CFG_FIELD_BAR(PCI_BASE_ADDRESS_2),
75038 + CFG_FIELD_BAR(PCI_BASE_ADDRESS_3),
75039 + CFG_FIELD_BAR(PCI_BASE_ADDRESS_4),
75040 + CFG_FIELD_BAR(PCI_BASE_ADDRESS_5),
75041 + CFG_FIELD_ROM(PCI_ROM_ADDRESS),
75042 + {
75043 + .size = 0,
75044 + },
75045 +};
75046 +
75047 +static struct config_field header_1[] = {
75048 + CFG_FIELD_BAR(PCI_BASE_ADDRESS_0),
75049 + CFG_FIELD_BAR(PCI_BASE_ADDRESS_1),
75050 + CFG_FIELD_ROM(PCI_ROM_ADDRESS1),
75051 + {
75052 + .size = 0,
75053 + },
75054 +};
75055 +
75056 +int pciback_config_header_add_fields(struct pci_dev *dev)
75057 +{
75058 + int err;
75059 +
75060 + err = pciback_config_add_fields(dev, header_common);
75061 + if (err)
75062 + goto out;
75063 +
75064 + switch (dev->hdr_type) {
75065 + case PCI_HEADER_TYPE_NORMAL:
75066 + err = pciback_config_add_fields(dev, header_0);
75067 + break;
75068 +
75069 + case PCI_HEADER_TYPE_BRIDGE:
75070 + err = pciback_config_add_fields(dev, header_1);
75071 + break;
75072 +
75073 + default:
75074 + err = -EINVAL;
75075 + printk(KERN_ERR "pciback: %s: Unsupported header type %d!\n",
75076 + pci_name(dev), dev->hdr_type);
75077 + break;
75078 + }
75079 +
75080 + out:
75081 + return err;
75082 +}
75083 diff -Nur linux-2.6.16.33-noxen/drivers/xen/pciback/conf_space_quirks.c linux-2.6.16.33/drivers/xen/pciback/conf_space_quirks.c
75084 --- linux-2.6.16.33-noxen/drivers/xen/pciback/conf_space_quirks.c 1970-01-01 00:00:00.000000000 +0000
75085 +++ linux-2.6.16.33/drivers/xen/pciback/conf_space_quirks.c 2007-01-08 15:00:45.000000000 +0000
75086 @@ -0,0 +1,128 @@
75087 +/*
75088 + * PCI Backend - Handle special overlays for broken devices.
75089 + *
75090 + * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
75091 + * Author: Chris Bookholt <hap10@epoch.ncsc.mil>
75092 + */
75093 +
75094 +#include <linux/kernel.h>
75095 +#include <linux/pci.h>
75096 +#include "pciback.h"
75097 +#include "conf_space.h"
75098 +#include "conf_space_quirks.h"
75099 +
75100 +LIST_HEAD(pciback_quirks);
75101 +
75102 +struct pciback_config_quirk *pciback_find_quirk(struct pci_dev *dev)
75103 +{
75104 + struct pciback_config_quirk *tmp_quirk;
75105 +
75106 + list_for_each_entry(tmp_quirk, &pciback_quirks, quirks_list)
75107 + if (pci_match_id(&tmp_quirk->devid, dev))
75108 + goto out;
75109 + tmp_quirk = NULL;
75110 + printk(KERN_DEBUG
75111 + "quirk didn't match any device pciback knows about\n");
75112 + out:
75113 + return tmp_quirk;
75114 +}
75115 +
75116 +static inline void register_quirk(struct pciback_config_quirk *quirk)
75117 +{
75118 + list_add_tail(&quirk->quirks_list, &pciback_quirks);
75119 +}
75120 +
75121 +int pciback_field_is_dup(struct pci_dev *dev, int reg)
75122 +{
75123 + int ret = 0;
75124 + struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
75125 + struct config_field *field;
75126 + struct config_field_entry *cfg_entry;
75127 +
75128 + list_for_each_entry(cfg_entry, &dev_data->config_fields, list) {
75129 + field = cfg_entry->field;
75130 + if (field->offset == reg) {
75131 + ret = 1;
75132 + break;
75133 + }
75134 + }
75135 + return ret;
75136 +}
75137 +
75138 +int pciback_config_quirks_add_field(struct pci_dev *dev, struct config_field
75139 + *field)
75140 +{
75141 + int err = 0;
75142 +
75143 + switch (field->size) {
75144 + case 1:
75145 + field->u.b.read = pciback_read_config_byte;
75146 + field->u.b.write = pciback_write_config_byte;
75147 + break;
75148 + case 2:
75149 + field->u.w.read = pciback_read_config_word;
75150 + field->u.w.write = pciback_write_config_word;
75151 + break;
75152 + case 4:
75153 + field->u.dw.read = pciback_read_config_dword;
75154 + field->u.dw.write = pciback_write_config_dword;
75155 + break;
75156 + default:
75157 + err = -EINVAL;
75158 + goto out;
75159 + }
75160 +
75161 + pciback_config_add_field(dev, field);
75162 +
75163 + out:
75164 + return err;
75165 +}
75166 +
75167 +int pciback_config_quirks_init(struct pci_dev *dev)
75168 +{
75169 + struct pciback_config_quirk *quirk;
75170 + int ret = 0;
75171 +
75172 + quirk = kzalloc(sizeof(*quirk), GFP_ATOMIC);
75173 + if (!quirk) {
75174 + ret = -ENOMEM;
75175 + goto out;
75176 + }
75177 +
75178 + quirk->devid.vendor = dev->vendor;
75179 + quirk->devid.device = dev->device;
75180 + quirk->devid.subvendor = dev->subsystem_vendor;
75181 + quirk->devid.subdevice = dev->subsystem_device;
75182 + quirk->devid.class = 0;
75183 + quirk->devid.class_mask = 0;
75184 + quirk->devid.driver_data = 0UL;
75185 +
75186 + quirk->pdev = dev;
75187 +
75188 + register_quirk(quirk);
75189 + out:
75190 + return ret;
75191 +}
75192 +
75193 +void pciback_config_field_free(struct config_field *field)
75194 +{
75195 + kfree(field);
75196 +}
75197 +
75198 +int pciback_config_quirk_release(struct pci_dev *dev)
75199 +{
75200 + struct pciback_config_quirk *quirk;
75201 + int ret = 0;
75202 +
75203 + quirk = pciback_find_quirk(dev);
75204 + if (!quirk) {
75205 + ret = -ENXIO;
75206 + goto out;
75207 + }
75208 +
75209 + list_del(&quirk->quirks_list);
75210 + kfree(quirk);
75211 +
75212 + out:
75213 + return ret;
75214 +}
75215 diff -Nur linux-2.6.16.33-noxen/drivers/xen/pciback/conf_space_quirks.h linux-2.6.16.33/drivers/xen/pciback/conf_space_quirks.h
75216 --- linux-2.6.16.33-noxen/drivers/xen/pciback/conf_space_quirks.h 1970-01-01 00:00:00.000000000 +0000
75217 +++ linux-2.6.16.33/drivers/xen/pciback/conf_space_quirks.h 2007-01-08 15:00:45.000000000 +0000
75218 @@ -0,0 +1,35 @@
75219 +/*
75220 + * PCI Backend - Data structures for special overlays for broken devices.
75221 + *
75222 + * Ryan Wilson <hap9@epoch.ncsc.mil>
75223 + * Chris Bookholt <hap10@epoch.ncsc.mil>
75224 + */
75225 +
75226 +#ifndef __XEN_PCIBACK_CONF_SPACE_QUIRKS_H__
75227 +#define __XEN_PCIBACK_CONF_SPACE_QUIRKS_H__
75228 +
75229 +#include <linux/pci.h>
75230 +#include <linux/list.h>
75231 +
75232 +struct pciback_config_quirk {
75233 + struct list_head quirks_list;
75234 + struct pci_device_id devid;
75235 + struct pci_dev *pdev;
75236 +};
75237 +
75238 +struct pciback_config_quirk *pciback_find_quirk(struct pci_dev *dev);
75239 +
75240 +int pciback_config_quirks_add_field(struct pci_dev *dev, struct config_field
75241 + *field);
75242 +
75243 +int pciback_config_quirks_remove_field(struct pci_dev *dev, int reg);
75244 +
75245 +int pciback_config_quirks_init(struct pci_dev *dev);
75246 +
75247 +void pciback_config_field_free(struct config_field *field);
75248 +
75249 +int pciback_config_quirk_release(struct pci_dev *dev);
75250 +
75251 +int pciback_field_is_dup(struct pci_dev *dev, int reg);
75252 +
75253 +#endif
75254 diff -Nur linux-2.6.16.33-noxen/drivers/xen/pciback/passthrough.c linux-2.6.16.33/drivers/xen/pciback/passthrough.c
75255 --- linux-2.6.16.33-noxen/drivers/xen/pciback/passthrough.c 1970-01-01 00:00:00.000000000 +0000
75256 +++ linux-2.6.16.33/drivers/xen/pciback/passthrough.c 2007-01-08 15:00:45.000000000 +0000
75257 @@ -0,0 +1,157 @@
75258 +/*
75259 + * PCI Backend - Provides restricted access to the real PCI bus topology
75260 + * to the frontend
75261 + *
75262 + * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
75263 + */
75264 +
75265 +#include <linux/list.h>
75266 +#include <linux/pci.h>
75267 +#include <linux/spinlock.h>
75268 +#include "pciback.h"
75269 +
75270 +struct passthrough_dev_data {
75271 + /* Access to dev_list must be protected by lock */
75272 + struct list_head dev_list;
75273 + spinlock_t lock;
75274 +};
75275 +
75276 +struct pci_dev *pciback_get_pci_dev(struct pciback_device *pdev,
75277 + unsigned int domain, unsigned int bus,
75278 + unsigned int devfn)
75279 +{
75280 + struct passthrough_dev_data *dev_data = pdev->pci_dev_data;
75281 + struct pci_dev_entry *dev_entry;
75282 + struct pci_dev *dev = NULL;
75283 + unsigned long flags;
75284 +
75285 + spin_lock_irqsave(&dev_data->lock, flags);
75286 +
75287 + list_for_each_entry(dev_entry, &dev_data->dev_list, list) {
75288 + if (domain == (unsigned int)pci_domain_nr(dev_entry->dev->bus)
75289 + && bus == (unsigned int)dev_entry->dev->bus->number
75290 + && devfn == dev_entry->dev->devfn) {
75291 + dev = dev_entry->dev;
75292 + break;
75293 + }
75294 + }
75295 +
75296 + spin_unlock_irqrestore(&dev_data->lock, flags);
75297 +
75298 + return dev;
75299 +}
75300 +
75301 +int pciback_add_pci_dev(struct pciback_device *pdev, struct pci_dev *dev)
75302 +{
75303 + struct passthrough_dev_data *dev_data = pdev->pci_dev_data;
75304 + struct pci_dev_entry *dev_entry;
75305 + unsigned long flags;
75306 +
75307 + dev_entry = kmalloc(sizeof(*dev_entry), GFP_KERNEL);
75308 + if (!dev_entry)
75309 + return -ENOMEM;
75310 + dev_entry->dev = dev;
75311 +
75312 + spin_lock_irqsave(&dev_data->lock, flags);
75313 + list_add_tail(&dev_entry->list, &dev_data->dev_list);
75314 + spin_unlock_irqrestore(&dev_data->lock, flags);
75315 +
75316 + return 0;
75317 +}
75318 +
75319 +void pciback_release_pci_dev(struct pciback_device *pdev, struct pci_dev *dev)
75320 +{
75321 + struct passthrough_dev_data *dev_data = pdev->pci_dev_data;
75322 + struct pci_dev_entry *dev_entry, *t;
75323 + struct pci_dev *found_dev = NULL;
75324 + unsigned long flags;
75325 +
75326 + spin_lock_irqsave(&dev_data->lock, flags);
75327 +
75328 + list_for_each_entry_safe(dev_entry, t, &dev_data->dev_list, list) {
75329 + if (dev_entry->dev == dev) {
75330 + list_del(&dev_entry->list);
75331 + found_dev = dev_entry->dev;
75332 + kfree(dev_entry);
75333 + }
75334 + }
75335 +
75336 + spin_unlock_irqrestore(&dev_data->lock, flags);
75337 +
75338 + if (found_dev)
75339 + pcistub_put_pci_dev(found_dev);
75340 +}
75341 +
75342 +int pciback_init_devices(struct pciback_device *pdev)
75343 +{
75344 + struct passthrough_dev_data *dev_data;
75345 +
75346 + dev_data = kmalloc(sizeof(*dev_data), GFP_KERNEL);
75347 + if (!dev_data)
75348 + return -ENOMEM;
75349 +
75350 + spin_lock_init(&dev_data->lock);
75351 +
75352 + INIT_LIST_HEAD(&dev_data->dev_list);
75353 +
75354 + pdev->pci_dev_data = dev_data;
75355 +
75356 + return 0;
75357 +}
75358 +
75359 +int pciback_publish_pci_roots(struct pciback_device *pdev,
75360 + publish_pci_root_cb publish_root_cb)
75361 +{
75362 + int err = 0;
75363 + struct passthrough_dev_data *dev_data = pdev->pci_dev_data;
75364 + struct pci_dev_entry *dev_entry, *e;
75365 + struct pci_dev *dev;
75366 + int found;
75367 + unsigned int domain, bus;
75368 +
75369 + spin_lock(&dev_data->lock);
75370 +
75371 + list_for_each_entry(dev_entry, &dev_data->dev_list, list) {
75372 + /* Only publish this device as a root if none of its
75373 + * parent bridges are exported
75374 + */
75375 + found = 0;
75376 + dev = dev_entry->dev->bus->self;
75377 + for (; !found && dev != NULL; dev = dev->bus->self) {
75378 + list_for_each_entry(e, &dev_data->dev_list, list) {
75379 + if (dev == e->dev) {
75380 + found = 1;
75381 + break;
75382 + }
75383 + }
75384 + }
75385 +
75386 + domain = (unsigned int)pci_domain_nr(dev_entry->dev->bus);
75387 + bus = (unsigned int)dev_entry->dev->bus->number;
75388 +
75389 + if (!found) {
75390 + err = publish_root_cb(pdev, domain, bus);
75391 + if (err)
75392 + break;
75393 + }
75394 + }
75395 +
75396 + spin_unlock(&dev_data->lock);
75397 +
75398 + return err;
75399 +}
75400 +
75401 +void pciback_release_devices(struct pciback_device *pdev)
75402 +{
75403 + struct passthrough_dev_data *dev_data = pdev->pci_dev_data;
75404 + struct pci_dev_entry *dev_entry, *t;
75405 +
75406 + list_for_each_entry_safe(dev_entry, t, &dev_data->dev_list, list) {
75407 + list_del(&dev_entry->list);
75408 + pcistub_put_pci_dev(dev_entry->dev);
75409 + kfree(dev_entry);
75410 + }
75411 +
75412 + kfree(dev_data);
75413 + pdev->pci_dev_data = NULL;
75414 +}
75415 diff -Nur linux-2.6.16.33-noxen/drivers/xen/pciback/pci_stub.c linux-2.6.16.33/drivers/xen/pciback/pci_stub.c
75416 --- linux-2.6.16.33-noxen/drivers/xen/pciback/pci_stub.c 1970-01-01 00:00:00.000000000 +0000
75417 +++ linux-2.6.16.33/drivers/xen/pciback/pci_stub.c 2007-01-08 15:00:45.000000000 +0000
75418 @@ -0,0 +1,916 @@
75419 +/*
75420 + * PCI Stub Driver - Grabs devices in backend to be exported later
75421 + *
75422 + * Ryan Wilson <hap9@epoch.ncsc.mil>
75423 + * Chris Bookholt <hap10@epoch.ncsc.mil>
75424 + */
75425 +#include <linux/module.h>
75426 +#include <linux/init.h>
75427 +#include <linux/list.h>
75428 +#include <linux/spinlock.h>
75429 +#include <linux/kref.h>
75430 +#include <asm/atomic.h>
75431 +#include "pciback.h"
75432 +#include "conf_space.h"
75433 +#include "conf_space_quirks.h"
75434 +
75435 +static char *pci_devs_to_hide = NULL;
75436 +module_param_named(hide, pci_devs_to_hide, charp, 0444);
75437 +
75438 +struct pcistub_device_id {
75439 + struct list_head slot_list;
75440 + int domain;
75441 + unsigned char bus;
75442 + unsigned int devfn;
75443 +};
75444 +static LIST_HEAD(pcistub_device_ids);
75445 +static DEFINE_SPINLOCK(device_ids_lock);
75446 +
75447 +struct pcistub_device {
75448 + struct kref kref;
75449 + struct list_head dev_list;
75450 + spinlock_t lock;
75451 +
75452 + struct pci_dev *dev;
75453 + struct pciback_device *pdev; /* non-NULL if struct pci_dev is in use */
75454 +};
75455 +
75456 +/* Access to pcistub_devices & seized_devices lists and the initialize_devices
75457 + * flag must be locked with pcistub_devices_lock
75458 + */
75459 +static DEFINE_SPINLOCK(pcistub_devices_lock);
75460 +static LIST_HEAD(pcistub_devices);
75461 +
75462 +/* wait for device_initcall before initializing our devices
75463 + * (see pcistub_init_devices_late)
75464 + */
75465 +static int initialize_devices = 0;
75466 +static LIST_HEAD(seized_devices);
75467 +
75468 +static struct pcistub_device *pcistub_device_alloc(struct pci_dev *dev)
75469 +{
75470 + struct pcistub_device *psdev;
75471 +
75472 + dev_dbg(&dev->dev, "pcistub_device_alloc\n");
75473 +
75474 + psdev = kzalloc(sizeof(*psdev), GFP_ATOMIC);
75475 + if (!psdev)
75476 + return NULL;
75477 +
75478 + psdev->dev = pci_dev_get(dev);
75479 + if (!psdev->dev) {
75480 + kfree(psdev);
75481 + return NULL;
75482 + }
75483 +
75484 + kref_init(&psdev->kref);
75485 + spin_lock_init(&psdev->lock);
75486 +
75487 + return psdev;
75488 +}
75489 +
75490 +/* Don't call this directly as it's called by pcistub_device_put */
75491 +static void pcistub_device_release(struct kref *kref)
75492 +{
75493 + struct pcistub_device *psdev;
75494 +
75495 + psdev = container_of(kref, struct pcistub_device, kref);
75496 +
75497 + dev_dbg(&psdev->dev->dev, "pcistub_device_release\n");
75498 +
75499 + /* Clean-up the device */
75500 + pciback_reset_device(psdev->dev);
75501 + pciback_config_free_dyn_fields(psdev->dev);
75502 + pciback_config_free_dev(psdev->dev);
75503 + kfree(pci_get_drvdata(psdev->dev));
75504 + pci_set_drvdata(psdev->dev, NULL);
75505 +
75506 + pci_dev_put(psdev->dev);
75507 +
75508 + kfree(psdev);
75509 +}
75510 +
75511 +static inline void pcistub_device_get(struct pcistub_device *psdev)
75512 +{
75513 + kref_get(&psdev->kref);
75514 +}
75515 +
75516 +static inline void pcistub_device_put(struct pcistub_device *psdev)
75517 +{
75518 + kref_put(&psdev->kref, pcistub_device_release);
75519 +}
75520 +
75521 +static struct pcistub_device *pcistub_device_find(int domain, int bus,
75522 + int slot, int func)
75523 +{
75524 + struct pcistub_device *psdev = NULL;
75525 + unsigned long flags;
75526 +
75527 + spin_lock_irqsave(&pcistub_devices_lock, flags);
75528 +
75529 + list_for_each_entry(psdev, &pcistub_devices, dev_list) {
75530 + if (psdev->dev != NULL
75531 + && domain == pci_domain_nr(psdev->dev->bus)
75532 + && bus == psdev->dev->bus->number
75533 + && PCI_DEVFN(slot, func) == psdev->dev->devfn) {
75534 + pcistub_device_get(psdev);
75535 + goto out;
75536 + }
75537 + }
75538 +
75539 + /* didn't find it */
75540 + psdev = NULL;
75541 +
75542 + out:
75543 + spin_unlock_irqrestore(&pcistub_devices_lock, flags);
75544 + return psdev;
75545 +}
75546 +
75547 +static struct pci_dev *pcistub_device_get_pci_dev(struct pciback_device *pdev,
75548 + struct pcistub_device *psdev)
75549 +{
75550 + struct pci_dev *pci_dev = NULL;
75551 + unsigned long flags;
75552 +
75553 + pcistub_device_get(psdev);
75554 +
75555 + spin_lock_irqsave(&psdev->lock, flags);
75556 + if (!psdev->pdev) {
75557 + psdev->pdev = pdev;
75558 + pci_dev = psdev->dev;
75559 + }
75560 + spin_unlock_irqrestore(&psdev->lock, flags);
75561 +
75562 + if (!pci_dev)
75563 + pcistub_device_put(psdev);
75564 +
75565 + return pci_dev;
75566 +}
75567 +
75568 +struct pci_dev *pcistub_get_pci_dev_by_slot(struct pciback_device *pdev,
75569 + int domain, int bus,
75570 + int slot, int func)
75571 +{
75572 + struct pcistub_device *psdev;
75573 + struct pci_dev *found_dev = NULL;
75574 + unsigned long flags;
75575 +
75576 + spin_lock_irqsave(&pcistub_devices_lock, flags);
75577 +
75578 + list_for_each_entry(psdev, &pcistub_devices, dev_list) {
75579 + if (psdev->dev != NULL
75580 + && domain == pci_domain_nr(psdev->dev->bus)
75581 + && bus == psdev->dev->bus->number
75582 + && PCI_DEVFN(slot, func) == psdev->dev->devfn) {
75583 + found_dev = pcistub_device_get_pci_dev(pdev, psdev);
75584 + break;
75585 + }
75586 + }
75587 +
75588 + spin_unlock_irqrestore(&pcistub_devices_lock, flags);
75589 + return found_dev;
75590 +}
75591 +
75592 +struct pci_dev *pcistub_get_pci_dev(struct pciback_device *pdev,
75593 + struct pci_dev *dev)
75594 +{
75595 + struct pcistub_device *psdev;
75596 + struct pci_dev *found_dev = NULL;
75597 + unsigned long flags;
75598 +
75599 + spin_lock_irqsave(&pcistub_devices_lock, flags);
75600 +
75601 + list_for_each_entry(psdev, &pcistub_devices, dev_list) {
75602 + if (psdev->dev == dev) {
75603 + found_dev = pcistub_device_get_pci_dev(pdev, psdev);
75604 + break;
75605 + }
75606 + }
75607 +
75608 + spin_unlock_irqrestore(&pcistub_devices_lock, flags);
75609 + return found_dev;
75610 +}
75611 +
75612 +void pcistub_put_pci_dev(struct pci_dev *dev)
75613 +{
75614 + struct pcistub_device *psdev, *found_psdev = NULL;
75615 + unsigned long flags;
75616 +
75617 + spin_lock_irqsave(&pcistub_devices_lock, flags);
75618 +
75619 + list_for_each_entry(psdev, &pcistub_devices, dev_list) {
75620 + if (psdev->dev == dev) {
75621 + found_psdev = psdev;
75622 + break;
75623 + }
75624 + }
75625 +
75626 + spin_unlock_irqrestore(&pcistub_devices_lock, flags);
75627 +
75628 + /* Clean up our device
75629 + * (so it's ready for the next domain)
75630 + */
75631 + pciback_reset_device(found_psdev->dev);
75632 + pciback_config_free_dyn_fields(found_psdev->dev);
75633 + pciback_config_reset_dev(found_psdev->dev);
75634 +
75635 + spin_lock_irqsave(&found_psdev->lock, flags);
75636 + found_psdev->pdev = NULL;
75637 + spin_unlock_irqrestore(&found_psdev->lock, flags);
75638 +
75639 + pcistub_device_put(found_psdev);
75640 +}
75641 +
75642 +static int __devinit pcistub_match_one(struct pci_dev *dev,
75643 + struct pcistub_device_id *pdev_id)
75644 +{
75645 + /* Match the specified device by domain, bus, slot, func and also if
75646 + * any of the device's parent bridges match.
75647 + */
75648 + for (; dev != NULL; dev = dev->bus->self) {
75649 + if (pci_domain_nr(dev->bus) == pdev_id->domain
75650 + && dev->bus->number == pdev_id->bus
75651 + && dev->devfn == pdev_id->devfn)
75652 + return 1;
75653 +
75654 + /* Sometimes topmost bridge links to itself. */
75655 + if (dev == dev->bus->self)
75656 + break;
75657 + }
75658 +
75659 + return 0;
75660 +}
75661 +
75662 +static int __devinit pcistub_match(struct pci_dev *dev)
75663 +{
75664 + struct pcistub_device_id *pdev_id;
75665 + unsigned long flags;
75666 + int found = 0;
75667 +
75668 + spin_lock_irqsave(&device_ids_lock, flags);
75669 + list_for_each_entry(pdev_id, &pcistub_device_ids, slot_list) {
75670 + if (pcistub_match_one(dev, pdev_id)) {
75671 + found = 1;
75672 + break;
75673 + }
75674 + }
75675 + spin_unlock_irqrestore(&device_ids_lock, flags);
75676 +
75677 + return found;
75678 +}
75679 +
75680 +static int __devinit pcistub_init_device(struct pci_dev *dev)
75681 +{
75682 + struct pciback_dev_data *dev_data;
75683 + int err = 0;
75684 +
75685 + dev_dbg(&dev->dev, "initializing...\n");
75686 +
75687 + /* The PCI backend is not intended to be a module (or to work with
75688 + * removable PCI devices yet). If it were, pciback_config_free()
75689 + * would need to be called somewhere to free the memory allocated
75690 + * here and then to call kfree(pci_get_drvdata(psdev->dev)).
75691 + */
75692 + dev_data = kzalloc(sizeof(*dev_data), GFP_ATOMIC);
75693 + if (!dev_data) {
75694 + err = -ENOMEM;
75695 + goto out;
75696 + }
75697 + pci_set_drvdata(dev, dev_data);
75698 +
75699 + dev_dbg(&dev->dev, "initializing config\n");
75700 + err = pciback_config_init_dev(dev);
75701 + if (err)
75702 + goto out;
75703 +
75704 + /* HACK: Force device (& ACPI) to determine what IRQ it's on - we
75705 + * must do this here because pcibios_enable_device may specify
75706 + * the pci device's true irq (and possibly its other resources)
75707 + * if they differ from what's in the configuration space.
75708 + * This makes the assumption that the device's resources won't
75709 + * change after this point (otherwise this code may break!)
75710 + */
75711 + dev_dbg(&dev->dev, "enabling device\n");
75712 + err = pci_enable_device(dev);
75713 + if (err)
75714 + goto config_release;
75715 +
75716 + /* Now disable the device (this also ensures some private device
75717 + * data is set up before we export)
75718 + */
75719 + dev_dbg(&dev->dev, "reset device\n");
75720 + pciback_reset_device(dev);
75721 +
75722 + return 0;
75723 +
75724 + config_release:
75725 + pciback_config_free_dev(dev);
75726 +
75727 + out:
75728 + pci_set_drvdata(dev, NULL);
75729 + kfree(dev_data);
75730 + return err;
75731 +}
75732 +
75733 +/*
75734 + * Because some initialization still happens on
75735 + * devices during fs_initcall, we need to defer
75736 + * full initialization of our devices until
75737 + * device_initcall.
75738 + */
75739 +static int __init pcistub_init_devices_late(void)
75740 +{
75741 + struct pcistub_device *psdev;
75742 + unsigned long flags;
75743 + int err = 0;
75744 +
75745 + pr_debug("pciback: pcistub_init_devices_late\n");
75746 +
75747 + spin_lock_irqsave(&pcistub_devices_lock, flags);
75748 +
75749 + while (!list_empty(&seized_devices)) {
75750 + psdev = container_of(seized_devices.next,
75751 + struct pcistub_device, dev_list);
75752 + list_del(&psdev->dev_list);
75753 +
75754 + spin_unlock_irqrestore(&pcistub_devices_lock, flags);
75755 +
75756 + err = pcistub_init_device(psdev->dev);
75757 + if (err) {
75758 + dev_err(&psdev->dev->dev,
75759 + "error %d initializing device\n", err);
75760 + kfree(psdev);
75761 + psdev = NULL;
75762 + }
75763 +
75764 + spin_lock_irqsave(&pcistub_devices_lock, flags);
75765 +
75766 + if (psdev)
75767 + list_add_tail(&psdev->dev_list, &pcistub_devices);
75768 + }
75769 +
75770 + initialize_devices = 1;
75771 +
75772 + spin_unlock_irqrestore(&pcistub_devices_lock, flags);
75773 +
75774 + return 0;
75775 +}
75776 +
75777 +static int __devinit pcistub_seize(struct pci_dev *dev)
75778 +{
75779 + struct pcistub_device *psdev;
75780 + unsigned long flags;
75781 + int err = 0;
75782 +
75783 + psdev = pcistub_device_alloc(dev);
75784 + if (!psdev)
75785 + return -ENOMEM;
75786 +
75787 + spin_lock_irqsave(&pcistub_devices_lock, flags);
75788 +
75789 + if (initialize_devices) {
75790 + spin_unlock_irqrestore(&pcistub_devices_lock, flags);
75791 +
75792 + /* don't want irqs disabled when calling pcistub_init_device */
75793 + err = pcistub_init_device(psdev->dev);
75794 +
75795 + spin_lock_irqsave(&pcistub_devices_lock, flags);
75796 +
75797 + if (!err)
75798 + list_add(&psdev->dev_list, &pcistub_devices);
75799 + } else {
75800 + dev_dbg(&dev->dev, "deferring initialization\n");
75801 + list_add(&psdev->dev_list, &seized_devices);
75802 + }
75803 +
75804 + spin_unlock_irqrestore(&pcistub_devices_lock, flags);
75805 +
75806 + if (err)
75807 + pcistub_device_put(psdev);
75808 +
75809 + return err;
75810 +}
75811 +
75812 +static int __devinit pcistub_probe(struct pci_dev *dev,
75813 + const struct pci_device_id *id)
75814 +{
75815 + int err = 0;
75816 +
75817 + dev_dbg(&dev->dev, "probing...\n");
75818 +
75819 + if (pcistub_match(dev)) {
75820 +
75821 + if (dev->hdr_type != PCI_HEADER_TYPE_NORMAL
75822 + && dev->hdr_type != PCI_HEADER_TYPE_BRIDGE) {
75823 + dev_err(&dev->dev, "can't export pci devices that "
75824 + "don't have a normal (0) or bridge (1) "
75825 + "header type!\n");
75826 + err = -ENODEV;
75827 + goto out;
75828 + }
75829 +
75830 + dev_info(&dev->dev, "seizing device\n");
75831 + err = pcistub_seize(dev);
75832 + } else
75833 + /* Didn't find the device */
75834 + err = -ENODEV;
75835 +
75836 + out:
75837 + return err;
75838 +}
75839 +
75840 +static void pcistub_remove(struct pci_dev *dev)
75841 +{
75842 + struct pcistub_device *psdev, *found_psdev = NULL;
75843 + unsigned long flags;
75844 +
75845 + dev_dbg(&dev->dev, "removing\n");
75846 +
75847 + spin_lock_irqsave(&pcistub_devices_lock, flags);
75848 +
75849 + pciback_config_quirk_release(dev);
75850 +
75851 + list_for_each_entry(psdev, &pcistub_devices, dev_list) {
75852 + if (psdev->dev == dev) {
75853 + found_psdev = psdev;
75854 + break;
75855 + }
75856 + }
75857 +
75858 + spin_unlock_irqrestore(&pcistub_devices_lock, flags);
75859 +
75860 + if (found_psdev) {
75861 + dev_dbg(&dev->dev, "found device to remove - in use? %p\n",
75862 + found_psdev->pdev);
75863 +
75864 + if (found_psdev->pdev) {
75865 + printk(KERN_WARNING "pciback: ****** removing device "
75866 + "%s while still in-use! ******\n",
75867 + pci_name(found_psdev->dev));
75868 + printk(KERN_WARNING "pciback: ****** driver domain may "
75869 + "still access this device's i/o resources!\n");
75870 + printk(KERN_WARNING "pciback: ****** shutdown driver "
75871 + "domain before binding device\n");
75872 + printk(KERN_WARNING "pciback: ****** to other drivers "
75873 + "or domains\n");
75874 +
75875 + pciback_release_pci_dev(found_psdev->pdev,
75876 + found_psdev->dev);
75877 + }
75878 +
75879 + spin_lock_irqsave(&pcistub_devices_lock, flags);
75880 + list_del(&found_psdev->dev_list);
75881 + spin_unlock_irqrestore(&pcistub_devices_lock, flags);
75882 +
75883 + /* the final put for releasing from the list */
75884 + pcistub_device_put(found_psdev);
75885 + }
75886 +}
75887 +
75888 +static struct pci_device_id pcistub_ids[] = {
75889 + {
75890 + .vendor = PCI_ANY_ID,
75891 + .device = PCI_ANY_ID,
75892 + .subvendor = PCI_ANY_ID,
75893 + .subdevice = PCI_ANY_ID,
75894 + },
75895 + {0,},
75896 +};
75897 +
75898 +/*
75899 + * Note: There is no MODULE_DEVICE_TABLE entry here because this isn't
75900 + * for a normal device. I don't want it to be loaded automatically.
75901 + */
75902 +
75903 +static struct pci_driver pciback_pci_driver = {
75904 + .name = "pciback",
75905 + .id_table = pcistub_ids,
75906 + .probe = pcistub_probe,
75907 + .remove = pcistub_remove,
75908 +};
75909 +
75910 +static inline int str_to_slot(const char *buf, int *domain, int *bus,
75911 + int *slot, int *func)
75912 +{
75913 + int err;
75914 +
75915 + err = sscanf(buf, " %x:%x:%x.%x", domain, bus, slot, func);
75916 + if (err == 4)
75917 + return 0;
75918 + else if (err < 0)
75919 + return -EINVAL;
75920 +
75921 + /* try again without domain */
75922 + *domain = 0;
75923 + err = sscanf(buf, " %x:%x.%x", bus, slot, func);
75924 + if (err == 3)
75925 + return 0;
75926 +
75927 + return -EINVAL;
75928 +}
75929 +
75930 +static inline int str_to_quirk(const char *buf, int *domain, int *bus, int
75931 + *slot, int *func, int *reg, int *size, int *mask)
75932 +{
75933 + int err;
75934 +
75935 + err =
75936 + sscanf(buf, " %04x:%02x:%02x.%1x-%08x:%1x:%08x", domain, bus, slot,
75937 + func, reg, size, mask);
75938 + if (err == 7)
75939 + return 0;
75940 + return -EINVAL;
75941 +}
75942 +
75943 +static int pcistub_device_id_add(int domain, int bus, int slot, int func)
75944 +{
75945 + struct pcistub_device_id *pci_dev_id;
75946 + unsigned long flags;
75947 +
75948 + pci_dev_id = kmalloc(sizeof(*pci_dev_id), GFP_KERNEL);
75949 + if (!pci_dev_id)
75950 + return -ENOMEM;
75951 +
75952 + pci_dev_id->domain = domain;
75953 + pci_dev_id->bus = bus;
75954 + pci_dev_id->devfn = PCI_DEVFN(slot, func);
75955 +
75956 + pr_debug("pciback: wants to seize %04x:%02x:%02x.%01x\n",
75957 + domain, bus, slot, func);
75958 +
75959 + spin_lock_irqsave(&device_ids_lock, flags);
75960 + list_add_tail(&pci_dev_id->slot_list, &pcistub_device_ids);
75961 + spin_unlock_irqrestore(&device_ids_lock, flags);
75962 +
75963 + return 0;
75964 +}
75965 +
75966 +static int pcistub_device_id_remove(int domain, int bus, int slot, int func)
75967 +{
75968 + struct pcistub_device_id *pci_dev_id, *t;
75969 + int devfn = PCI_DEVFN(slot, func);
75970 + int err = -ENOENT;
75971 + unsigned long flags;
75972 +
75973 + spin_lock_irqsave(&device_ids_lock, flags);
75974 + list_for_each_entry_safe(pci_dev_id, t, &pcistub_device_ids, slot_list) {
75975 +
75976 + if (pci_dev_id->domain == domain
75977 + && pci_dev_id->bus == bus && pci_dev_id->devfn == devfn) {
75978 + /* Don't break; here because it's possible the same
75979 + * slot could be in the list more than once
75980 + */
75981 + list_del(&pci_dev_id->slot_list);
75982 + kfree(pci_dev_id);
75983 +
75984 + err = 0;
75985 +
75986 + pr_debug("pciback: removed %04x:%02x:%02x.%01x from "
75987 + "seize list\n", domain, bus, slot, func);
75988 + }
75989 + }
75990 + spin_unlock_irqrestore(&device_ids_lock, flags);
75991 +
75992 + return err;
75993 +}
75994 +
75995 +static int pcistub_reg_add(int domain, int bus, int slot, int func, int reg,
75996 + int size, int mask)
75997 +{
75998 + int err = 0;
75999 + struct pcistub_device *psdev;
76000 + struct pci_dev *dev;
76001 + struct config_field *field;
76002 +
76003 + psdev = pcistub_device_find(domain, bus, slot, func);
76004 + if (!psdev || !psdev->dev) {
76005 + err = -ENODEV;
76006 + goto out;
76007 + }
76008 + dev = psdev->dev;
76009 +
76010 + /* check for duplicate field */
76011 + if (pciback_field_is_dup(dev, reg))
76012 + goto out;
76013 +
76014 + field = kzalloc(sizeof(*field), GFP_ATOMIC);
76015 + if (!field) {
76016 + err = -ENOMEM;
76017 + goto out;
76018 + }
76019 +
76020 + field->offset = reg;
76021 + field->size = size;
76022 + field->mask = mask;
76023 + field->init = NULL;
76024 + field->reset = NULL;
76025 + field->release = NULL;
76026 + field->clean = pciback_config_field_free;
76027 +
76028 + err = pciback_config_quirks_add_field(dev, field);
76029 + if (err)
76030 + kfree(field);
76031 + out:
76032 + return err;
76033 +}
76034 +
76035 +static ssize_t pcistub_slot_add(struct device_driver *drv, const char *buf,
76036 + size_t count)
76037 +{
76038 + int domain, bus, slot, func;
76039 + int err;
76040 +
76041 + err = str_to_slot(buf, &domain, &bus, &slot, &func);
76042 + if (err)
76043 + goto out;
76044 +
76045 + err = pcistub_device_id_add(domain, bus, slot, func);
76046 +
76047 + out:
76048 + if (!err)
76049 + err = count;
76050 + return err;
76051 +}
76052 +
76053 +DRIVER_ATTR(new_slot, S_IWUSR, NULL, pcistub_slot_add);
76054 +
76055 +static ssize_t pcistub_slot_remove(struct device_driver *drv, const char *buf,
76056 + size_t count)
76057 +{
76058 + int domain, bus, slot, func;
76059 + int err;
76060 +
76061 + err = str_to_slot(buf, &domain, &bus, &slot, &func);
76062 + if (err)
76063 + goto out;
76064 +
76065 + err = pcistub_device_id_remove(domain, bus, slot, func);
76066 +
76067 + out:
76068 + if (!err)
76069 + err = count;
76070 + return err;
76071 +}
76072 +
76073 +DRIVER_ATTR(remove_slot, S_IWUSR, NULL, pcistub_slot_remove);
76074 +
76075 +static ssize_t pcistub_slot_show(struct device_driver *drv, char *buf)
76076 +{
76077 + struct pcistub_device_id *pci_dev_id;
76078 + size_t count = 0;
76079 + unsigned long flags;
76080 +
76081 + spin_lock_irqsave(&device_ids_lock, flags);
76082 + list_for_each_entry(pci_dev_id, &pcistub_device_ids, slot_list) {
76083 + if (count >= PAGE_SIZE)
76084 + break;
76085 +
76086 + count += scnprintf(buf + count, PAGE_SIZE - count,
76087 + "%04x:%02x:%02x.%01x\n",
76088 + pci_dev_id->domain, pci_dev_id->bus,
76089 + PCI_SLOT(pci_dev_id->devfn),
76090 + PCI_FUNC(pci_dev_id->devfn));
76091 + }
76092 + spin_unlock_irqrestore(&device_ids_lock, flags);
76093 +
76094 + return count;
76095 +}
76096 +
76097 +DRIVER_ATTR(slots, S_IRUSR, pcistub_slot_show, NULL);
76098 +
76099 +static ssize_t pcistub_quirk_add(struct device_driver *drv, const char *buf,
76100 + size_t count)
76101 +{
76102 + int domain, bus, slot, func, reg, size, mask;
76103 + int err;
76104 +
76105 + err = str_to_quirk(buf, &domain, &bus, &slot, &func, &reg, &size,
76106 + &mask);
76107 + if (err)
76108 + goto out;
76109 +
76110 + err = pcistub_reg_add(domain, bus, slot, func, reg, size, mask);
76111 +
76112 + out:
76113 + if (!err)
76114 + err = count;
76115 + return err;
76116 +}
76117 +
76118 +static ssize_t pcistub_quirk_show(struct device_driver *drv, char *buf)
76119 +{
76120 + int count = 0;
76121 + unsigned long flags;
76122 + extern struct list_head pciback_quirks;
76123 + struct pciback_config_quirk *quirk;
76124 + struct pciback_dev_data *dev_data;
76125 + struct config_field *field;
76126 + struct config_field_entry *cfg_entry;
76127 +
76128 + spin_lock_irqsave(&device_ids_lock, flags);
76129 + list_for_each_entry(quirk, &pciback_quirks, quirks_list) {
76130 + if (count >= PAGE_SIZE)
76131 + goto out;
76132 +
76133 + count += scnprintf(buf + count, PAGE_SIZE - count,
76134 + "%02x:%02x.%01x\n\t%04x:%04x:%04x:%04x\n",
76135 + quirk->pdev->bus->number,
76136 + PCI_SLOT(quirk->pdev->devfn),
76137 + PCI_FUNC(quirk->pdev->devfn),
76138 + quirk->devid.vendor, quirk->devid.device,
76139 + quirk->devid.subvendor,
76140 + quirk->devid.subdevice);
76141 +
76142 + dev_data = pci_get_drvdata(quirk->pdev);
76143 +
76144 + list_for_each_entry(cfg_entry, &dev_data->config_fields, list) {
76145 + field = cfg_entry->field;
76146 + if (count >= PAGE_SIZE)
76147 + goto out;
76148 +
76149 + count += scnprintf(buf + count, PAGE_SIZE -
76150 + count, "\t\t%08x:%01x:%08x\n",
76151 + field->offset, field->size,
76152 + field->mask);
76153 + }
76154 + }
76155 +
76156 + out:
76157 + spin_unlock_irqrestore(&device_ids_lock, flags);
76158 +
76159 + return count;
76160 +}
76161 +
76162 +DRIVER_ATTR(quirks, S_IRUSR | S_IWUSR, pcistub_quirk_show, pcistub_quirk_add);
76163 +
76164 +static ssize_t permissive_add(struct device_driver *drv, const char *buf,
76165 + size_t count)
76166 +{
76167 + int domain, bus, slot, func;
76168 + int err;
76169 + struct pcistub_device *psdev;
76170 + struct pciback_dev_data *dev_data;
76171 + err = str_to_slot(buf, &domain, &bus, &slot, &func);
76172 + if (err)
76173 + goto out;
76174 + psdev = pcistub_device_find(domain, bus, slot, func);
76175 + if (!psdev) {
76176 + err = -ENODEV;
76177 + goto out;
76178 + }
76179 + if (!psdev->dev) {
76180 + err = -ENODEV;
76181 + goto release;
76182 + }
76183 + dev_data = pci_get_drvdata(psdev->dev);
76184 + /* the driver data for a device should never be null at this point */
76185 + if (!dev_data) {
76186 + err = -ENXIO;
76187 + goto release;
76188 + }
76189 + if (!dev_data->permissive) {
76190 + dev_data->permissive = 1;
76191 + /* Let user know that what they're doing could be unsafe */
76192 + dev_warn(&psdev->dev->dev,
76193 + "enabling permissive mode configuration space accesses!\n");
76194 + dev_warn(&psdev->dev->dev,
76195 + "permissive mode is potentially unsafe!\n");
76196 + }
76197 + release:
76198 + pcistub_device_put(psdev);
76199 + out:
76200 + if (!err)
76201 + err = count;
76202 + return err;
76203 +}
76204 +
76205 +static ssize_t permissive_show(struct device_driver *drv, char *buf)
76206 +{
76207 + struct pcistub_device *psdev;
76208 + struct pciback_dev_data *dev_data;
76209 + size_t count = 0;
76210 + unsigned long flags;
76211 + spin_lock_irqsave(&pcistub_devices_lock, flags);
76212 + list_for_each_entry(psdev, &pcistub_devices, dev_list) {
76213 + if (count >= PAGE_SIZE)
76214 + break;
76215 + if (!psdev->dev)
76216 + continue;
76217 + dev_data = pci_get_drvdata(psdev->dev);
76218 + if (!dev_data || !dev_data->permissive)
76219 + continue;
76220 + count +=
76221 + scnprintf(buf + count, PAGE_SIZE - count, "%s\n",
76222 + pci_name(psdev->dev));
76223 + }
76224 + spin_unlock_irqrestore(&pcistub_devices_lock, flags);
76225 + return count;
76226 +}
76227 +
76228 +DRIVER_ATTR(permissive, S_IRUSR | S_IWUSR, permissive_show, permissive_add);
76229 +
76230 +static int __init pcistub_init(void)
76231 +{
76232 + int pos = 0;
76233 + int err = 0;
76234 + int domain, bus, slot, func;
76235 + int parsed;
76236 +
76237 + if (pci_devs_to_hide && *pci_devs_to_hide) {
76238 + do {
76239 + parsed = 0;
76240 +
76241 + err = sscanf(pci_devs_to_hide + pos,
76242 + " (%x:%x:%x.%x) %n",
76243 + &domain, &bus, &slot, &func, &parsed);
76244 + if (err != 4) {
76245 + domain = 0;
76246 + err = sscanf(pci_devs_to_hide + pos,
76247 + " (%x:%x.%x) %n",
76248 + &bus, &slot, &func, &parsed);
76249 + if (err != 3)
76250 + goto parse_error;
76251 + }
76252 +
76253 + err = pcistub_device_id_add(domain, bus, slot, func);
76254 + if (err)
76255 + goto out;
76256 +
76257 + /* if parsed<=0, we've reached the end of the string */
76258 + pos += parsed;
76259 + } while (parsed > 0 && pci_devs_to_hide[pos]);
76260 + }
76261 +
76262 + /* If we're the first PCI Device Driver to register, we're the
76263 + * first one to get offered PCI devices as they become
76264 + * available (and thus we can be the first to grab them)
76265 + */
76266 + err = pci_register_driver(&pciback_pci_driver);
76267 + if (err < 0)
76268 + goto out;
76269 +
76270 + driver_create_file(&pciback_pci_driver.driver, &driver_attr_new_slot);
76271 + driver_create_file(&pciback_pci_driver.driver,
76272 + &driver_attr_remove_slot);
76273 + driver_create_file(&pciback_pci_driver.driver, &driver_attr_slots);
76274 + driver_create_file(&pciback_pci_driver.driver, &driver_attr_quirks);
76275 + driver_create_file(&pciback_pci_driver.driver, &driver_attr_permissive);
76276 +
76277 + out:
76278 + return err;
76279 +
76280 + parse_error:
76281 + printk(KERN_ERR "pciback: Error parsing pci_devs_to_hide at \"%s\"\n",
76282 + pci_devs_to_hide + pos);
76283 + return -EINVAL;
76284 +}
76285 +
76286 +#ifndef MODULE
76287 +/*
76288 + * fs_initcall happens before device_initcall
76289 + * so pciback *should* get called first (b/c we
76290 + * want to suck up any device before other drivers
76291 + * get a chance by being the first pci device
76292 + * driver to register)
76293 + */
76294 +fs_initcall(pcistub_init);
76295 +#endif
76296 +
76297 +static int __init pciback_init(void)
76298 +{
76299 + int err;
76300 +
76301 + err = pciback_config_init();
76302 + if (err)
76303 + return err;
76304 +
76305 +#ifdef MODULE
76306 + err = pcistub_init();
76307 + if (err < 0)
76308 + return err;
76309 +#endif
76310 +
76311 + pcistub_init_devices_late();
76312 + pciback_xenbus_register();
76313 +
76314 + return 0;
76315 +}
76316 +
76317 +static void __exit pciback_cleanup(void)
76318 +{
76319 + pciback_xenbus_unregister();
76320 +
76321 + driver_remove_file(&pciback_pci_driver.driver, &driver_attr_new_slot);
76322 + driver_remove_file(&pciback_pci_driver.driver,
76323 + &driver_attr_remove_slot);
76324 + driver_remove_file(&pciback_pci_driver.driver, &driver_attr_slots);
76325 + driver_remove_file(&pciback_pci_driver.driver, &driver_attr_quirks);
76326 + driver_remove_file(&pciback_pci_driver.driver, &driver_attr_permissive);
76327 +
76328 + pci_unregister_driver(&pciback_pci_driver);
76329 +}
76330 +
76331 +module_init(pciback_init);
76332 +module_exit(pciback_cleanup);
76333 +
76334 +MODULE_LICENSE("Dual BSD/GPL");
76335 diff -Nur linux-2.6.16.33-noxen/drivers/xen/pciback/pciback.h linux-2.6.16.33/drivers/xen/pciback/pciback.h
76336 --- linux-2.6.16.33-noxen/drivers/xen/pciback/pciback.h 1970-01-01 00:00:00.000000000 +0000
76337 +++ linux-2.6.16.33/drivers/xen/pciback/pciback.h 2007-01-08 15:00:45.000000000 +0000
76338 @@ -0,0 +1,93 @@
76339 +/*
76340 + * PCI Backend Common Data Structures & Function Declarations
76341 + *
76342 + * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
76343 + */
76344 +#ifndef __XEN_PCIBACK_H__
76345 +#define __XEN_PCIBACK_H__
76346 +
76347 +#include <linux/pci.h>
76348 +#include <linux/interrupt.h>
76349 +#include <xen/xenbus.h>
76350 +#include <linux/list.h>
76351 +#include <linux/spinlock.h>
76352 +#include <linux/workqueue.h>
76353 +#include <asm/atomic.h>
76354 +#include <xen/interface/io/pciif.h>
76355 +
76356 +struct pci_dev_entry {
76357 + struct list_head list;
76358 + struct pci_dev *dev;
76359 +};
76360 +
76361 +#define _PDEVF_op_active (0)
76362 +#define PDEVF_op_active (1<<(_PDEVF_op_active))
76363 +
76364 +struct pciback_device {
76365 + void *pci_dev_data;
76366 + spinlock_t dev_lock;
76367 +
76368 + struct xenbus_device *xdev;
76369 +
76370 + struct xenbus_watch be_watch;
76371 + u8 be_watching;
76372 +
76373 + int evtchn_irq;
76374 +
76375 + struct vm_struct *sh_area;
76376 + struct xen_pci_sharedinfo *sh_info;
76377 +
76378 + unsigned long flags;
76379 +
76380 + struct work_struct op_work;
76381 +};
76382 +
76383 +struct pciback_dev_data {
76384 + struct list_head config_fields;
76385 + int permissive;
76386 + int warned_on_write;
76387 +};
76388 +
76389 +/* Get/Put PCI Devices that are hidden from the PCI Backend Domain */
76390 +struct pci_dev *pcistub_get_pci_dev_by_slot(struct pciback_device *pdev,
76391 + int domain, int bus,
76392 + int slot, int func);
76393 +struct pci_dev *pcistub_get_pci_dev(struct pciback_device *pdev,
76394 + struct pci_dev *dev);
76395 +void pcistub_put_pci_dev(struct pci_dev *dev);
76396 +
76397 +/* Ensure a device is turned off or reset */
76398 +void pciback_reset_device(struct pci_dev *pdev);
76399 +
76400 +/* Access a virtual configuration space for a PCI device */
76401 +int pciback_config_init(void);
76402 +int pciback_config_init_dev(struct pci_dev *dev);
76403 +void pciback_config_free_dyn_fields(struct pci_dev *dev);
76404 +void pciback_config_reset_dev(struct pci_dev *dev);
76405 +void pciback_config_free_dev(struct pci_dev *dev);
76406 +int pciback_config_read(struct pci_dev *dev, int offset, int size,
76407 + u32 * ret_val);
76408 +int pciback_config_write(struct pci_dev *dev, int offset, int size, u32 value);
76409 +
76410 +/* Handle requests for specific devices from the frontend */
76411 +typedef int (*publish_pci_root_cb) (struct pciback_device * pdev,
76412 + unsigned int domain, unsigned int bus);
76413 +int pciback_add_pci_dev(struct pciback_device *pdev, struct pci_dev *dev);
76414 +void pciback_release_pci_dev(struct pciback_device *pdev, struct pci_dev *dev);
76415 +struct pci_dev *pciback_get_pci_dev(struct pciback_device *pdev,
76416 + unsigned int domain, unsigned int bus,
76417 + unsigned int devfn);
76418 +int pciback_init_devices(struct pciback_device *pdev);
76419 +int pciback_publish_pci_roots(struct pciback_device *pdev,
76420 + publish_pci_root_cb cb);
76421 +void pciback_release_devices(struct pciback_device *pdev);
76422 +
76423 +/* Handles events from front-end */
76424 +irqreturn_t pciback_handle_event(int irq, void *dev_id, struct pt_regs *regs);
76425 +void pciback_do_op(void *data);
76426 +
76427 +int pciback_xenbus_register(void);
76428 +void pciback_xenbus_unregister(void);
76429 +
76430 +extern int verbose_request;
76431 +#endif
76432 diff -Nur linux-2.6.16.33-noxen/drivers/xen/pciback/pciback_ops.c linux-2.6.16.33/drivers/xen/pciback/pciback_ops.c
76433 --- linux-2.6.16.33-noxen/drivers/xen/pciback/pciback_ops.c 1970-01-01 00:00:00.000000000 +0000
76434 +++ linux-2.6.16.33/drivers/xen/pciback/pciback_ops.c 2007-01-08 15:00:45.000000000 +0000
76435 @@ -0,0 +1,95 @@
76436 +/*
76437 + * PCI Backend Operations - respond to PCI requests from Frontend
76438 + *
76439 + * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
76440 + */
76441 +#include <linux/module.h>
76442 +#include <asm/bitops.h>
76443 +#include <xen/evtchn.h>
76444 +#include "pciback.h"
76445 +
76446 +int verbose_request = 0;
76447 +module_param(verbose_request, int, 0644);
76448 +
76449 +/* Ensure a device is "turned off" and ready to be exported.
76450 + * (Also see pciback_config_reset to ensure virtual configuration space is
76451 + * ready to be re-exported)
76452 + */
76453 +void pciback_reset_device(struct pci_dev *dev)
76454 +{
76455 + u16 cmd;
76456 +
76457 + /* Disable devices (but not bridges) */
76458 + if (dev->hdr_type == PCI_HEADER_TYPE_NORMAL) {
76459 + pci_disable_device(dev);
76460 +
76461 + pci_write_config_word(dev, PCI_COMMAND, 0);
76462 +
76463 + dev->is_enabled = 0;
76464 + dev->is_busmaster = 0;
76465 + } else {
76466 + pci_read_config_word(dev, PCI_COMMAND, &cmd);
76467 + if (cmd & (PCI_COMMAND_INVALIDATE)) {
76468 + cmd &= ~(PCI_COMMAND_INVALIDATE);
76469 + pci_write_config_word(dev, PCI_COMMAND, cmd);
76470 +
76471 + dev->is_busmaster = 0;
76472 + }
76473 + }
76474 +}
76475 +
76476 +static inline void test_and_schedule_op(struct pciback_device *pdev)
76477 +{
76478 + /* Check that frontend is requesting an operation and that we are not
76479 + * already processing a request */
76480 + if (test_bit(_XEN_PCIF_active, (unsigned long *)&pdev->sh_info->flags)
76481 + && !test_and_set_bit(_PDEVF_op_active, &pdev->flags))
76482 + schedule_work(&pdev->op_work);
76483 +}
76484 +
76485 +/* Performing the configuration space reads/writes must not be done in atomic
76486 + * context because some of the pci_* functions can sleep (mostly due to ACPI
76487 + * use of semaphores). This function is intended to be called from a work
76488 + * queue in process context taking a struct pciback_device as a parameter */
76489 +void pciback_do_op(void *data)
76490 +{
76491 + struct pciback_device *pdev = data;
76492 + struct pci_dev *dev;
76493 + struct xen_pci_op *op = &pdev->sh_info->op;
76494 +
76495 + dev = pciback_get_pci_dev(pdev, op->domain, op->bus, op->devfn);
76496 +
76497 + if (dev == NULL)
76498 + op->err = XEN_PCI_ERR_dev_not_found;
76499 + else if (op->cmd == XEN_PCI_OP_conf_read)
76500 + op->err = pciback_config_read(dev, op->offset, op->size,
76501 + &op->value);
76502 + else if (op->cmd == XEN_PCI_OP_conf_write)
76503 + op->err = pciback_config_write(dev, op->offset, op->size,
76504 + op->value);
76505 + else
76506 + op->err = XEN_PCI_ERR_not_implemented;
76507 +
76508 + /* Tell the driver domain that we're done. */
76509 + wmb();
76510 + clear_bit(_XEN_PCIF_active, (unsigned long *)&pdev->sh_info->flags);
76511 + notify_remote_via_irq(pdev->evtchn_irq);
76512 +
76513 + /* Mark that we're done. */
76514 + smp_mb__before_clear_bit(); /* /after/ clearing PCIF_active */
76515 + clear_bit(_PDEVF_op_active, &pdev->flags);
76516 + smp_mb__after_clear_bit(); /* /before/ final check for work */
76517 +
76518 + /* Check to see if the driver domain tried to start another request in
76519 + * between clearing _XEN_PCIF_active and clearing _PDEVF_op_active. */
76520 + test_and_schedule_op(pdev);
76521 +}
76522 +
76523 +irqreturn_t pciback_handle_event(int irq, void *dev_id, struct pt_regs *regs)
76524 +{
76525 + struct pciback_device *pdev = dev_id;
76526 +
76527 + test_and_schedule_op(pdev);
76528 +
76529 + return IRQ_HANDLED;
76530 +}
76531 diff -Nur linux-2.6.16.33-noxen/drivers/xen/pciback/slot.c linux-2.6.16.33/drivers/xen/pciback/slot.c
76532 --- linux-2.6.16.33-noxen/drivers/xen/pciback/slot.c 1970-01-01 00:00:00.000000000 +0000
76533 +++ linux-2.6.16.33/drivers/xen/pciback/slot.c 2007-01-08 15:00:45.000000000 +0000
76534 @@ -0,0 +1,151 @@
76535 +/*
76536 + * PCI Backend - Provides a Virtual PCI bus (with real devices)
76537 + * to the frontend
76538 + *
76539 + * Author: Ryan Wilson <hap9@epoch.ncsc.mil> (vpci.c)
76540 + * Author: Tristan Gingold <tristan.gingold@bull.net>, from vpci.c
76541 + */
76542 +
76543 +#include <linux/list.h>
76544 +#include <linux/slab.h>
76545 +#include <linux/pci.h>
76546 +#include <linux/spinlock.h>
76547 +#include "pciback.h"
76548 +
76549 +/* There are at most 32 slots in a pci bus. */
76550 +#define PCI_SLOT_MAX 32
76551 +
76552 +#define PCI_BUS_NBR 2
76553 +
76554 +struct slot_dev_data {
75555 + /* Access to slots must be protected by lock */
76556 + struct pci_dev *slots[PCI_BUS_NBR][PCI_SLOT_MAX];
76557 + spinlock_t lock;
76558 +};
76559 +
76560 +struct pci_dev *pciback_get_pci_dev(struct pciback_device *pdev,
76561 + unsigned int domain, unsigned int bus,
76562 + unsigned int devfn)
76563 +{
76564 + struct pci_dev *dev = NULL;
76565 + struct slot_dev_data *slot_dev = pdev->pci_dev_data;
76566 + unsigned long flags;
76567 +
76568 + if (domain != 0 || PCI_FUNC(devfn) != 0)
76569 + return NULL;
76570 +
76571 + if (PCI_SLOT(devfn) >= PCI_SLOT_MAX || bus >= PCI_BUS_NBR)
76572 + return NULL;
76573 +
76574 + spin_lock_irqsave(&slot_dev->lock, flags);
76575 + dev = slot_dev->slots[bus][PCI_SLOT(devfn)];
76576 + spin_unlock_irqrestore(&slot_dev->lock, flags);
76577 +
76578 + return dev;
76579 +}
76580 +
76581 +int pciback_add_pci_dev(struct pciback_device *pdev, struct pci_dev *dev)
76582 +{
76583 + int err = 0, slot, bus;
76584 + struct slot_dev_data *slot_dev = pdev->pci_dev_data;
76585 + unsigned long flags;
76586 +
76587 + if ((dev->class >> 24) == PCI_BASE_CLASS_BRIDGE) {
76588 + err = -EFAULT;
76589 + xenbus_dev_fatal(pdev->xdev, err,
76590 + "Can't export bridges on the virtual PCI bus");
76591 + goto out;
76592 + }
76593 +
76594 + spin_lock_irqsave(&slot_dev->lock, flags);
76595 +
76596 + /* Assign to a new slot on the virtual PCI bus */
76597 + for (bus = 0; bus < PCI_BUS_NBR; bus++)
76598 + for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
76599 + if (slot_dev->slots[bus][slot] == NULL) {
76600 + printk(KERN_INFO
76601 + "pciback: slot: %s: assign to virtual slot %d, bus %d\n",
76602 + pci_name(dev), slot, bus);
76603 + slot_dev->slots[bus][slot] = dev;
76604 + goto unlock;
76605 + }
76606 + }
76607 +
76608 + err = -ENOMEM;
76609 + xenbus_dev_fatal(pdev->xdev, err,
76610 + "No more space on root virtual PCI bus");
76611 +
76612 + unlock:
76613 + spin_unlock_irqrestore(&slot_dev->lock, flags);
76614 + out:
76615 + return err;
76616 +}
76617 +
76618 +void pciback_release_pci_dev(struct pciback_device *pdev, struct pci_dev *dev)
76619 +{
76620 + int slot, bus;
76621 + struct slot_dev_data *slot_dev = pdev->pci_dev_data;
76622 + struct pci_dev *found_dev = NULL;
76623 + unsigned long flags;
76624 +
76625 + spin_lock_irqsave(&slot_dev->lock, flags);
76626 +
76627 + for (bus = 0; bus < PCI_BUS_NBR; bus++)
76628 + for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
76629 + if (slot_dev->slots[bus][slot] == dev) {
76630 + slot_dev->slots[bus][slot] = NULL;
76631 + found_dev = dev;
76632 + goto out;
76633 + }
76634 + }
76635 +
76636 + out:
76637 + spin_unlock_irqrestore(&slot_dev->lock, flags);
76638 +
76639 + if (found_dev)
76640 + pcistub_put_pci_dev(found_dev);
76641 +}
76642 +
76643 +int pciback_init_devices(struct pciback_device *pdev)
76644 +{
76645 + int slot, bus;
76646 + struct slot_dev_data *slot_dev;
76647 +
76648 + slot_dev = kmalloc(sizeof(*slot_dev), GFP_KERNEL);
76649 + if (!slot_dev)
76650 + return -ENOMEM;
76651 +
76652 + spin_lock_init(&slot_dev->lock);
76653 +
76654 + for (bus = 0; bus < PCI_BUS_NBR; bus++)
76655 + for (slot = 0; slot < PCI_SLOT_MAX; slot++)
76656 + slot_dev->slots[bus][slot] = NULL;
76657 +
76658 + pdev->pci_dev_data = slot_dev;
76659 +
76660 + return 0;
76661 +}
76662 +
76663 +int pciback_publish_pci_roots(struct pciback_device *pdev,
76664 + publish_pci_root_cb publish_cb)
76665 +{
76666 + /* The Virtual PCI bus has only one root */
76667 + return publish_cb(pdev, 0, 0);
76668 +}
76669 +
76670 +void pciback_release_devices(struct pciback_device *pdev)
76671 +{
76672 + int slot, bus;
76673 + struct slot_dev_data *slot_dev = pdev->pci_dev_data;
76674 + struct pci_dev *dev;
76675 +
76676 + for (bus = 0; bus < PCI_BUS_NBR; bus++)
76677 + for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
76678 + dev = slot_dev->slots[bus][slot];
76679 + if (dev != NULL)
76680 + pcistub_put_pci_dev(dev);
76681 + }
76682 +
76683 + kfree(slot_dev);
76684 + pdev->pci_dev_data = NULL;
76685 +}
76686 diff -Nur linux-2.6.16.33-noxen/drivers/xen/pciback/vpci.c linux-2.6.16.33/drivers/xen/pciback/vpci.c
76687 --- linux-2.6.16.33-noxen/drivers/xen/pciback/vpci.c 1970-01-01 00:00:00.000000000 +0000
76688 +++ linux-2.6.16.33/drivers/xen/pciback/vpci.c 2007-01-08 15:00:45.000000000 +0000
76689 @@ -0,0 +1,204 @@
76690 +/*
76691 + * PCI Backend - Provides a Virtual PCI bus (with real devices)
76692 + * to the frontend
76693 + *
76694 + * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
76695 + */
76696 +
76697 +#include <linux/list.h>
76698 +#include <linux/slab.h>
76699 +#include <linux/pci.h>
76700 +#include <linux/spinlock.h>
76701 +#include "pciback.h"
76702 +
76703 +#define PCI_SLOT_MAX 32
76704 +
76705 +struct vpci_dev_data {
76706 + /* Access to dev_list must be protected by lock */
76707 + struct list_head dev_list[PCI_SLOT_MAX];
76708 + spinlock_t lock;
76709 +};
76710 +
76711 +static inline struct list_head *list_first(struct list_head *head)
76712 +{
76713 + return head->next;
76714 +}
76715 +
76716 +struct pci_dev *pciback_get_pci_dev(struct pciback_device *pdev,
76717 + unsigned int domain, unsigned int bus,
76718 + unsigned int devfn)
76719 +{
76720 + struct pci_dev_entry *entry;
76721 + struct pci_dev *dev = NULL;
76722 + struct vpci_dev_data *vpci_dev = pdev->pci_dev_data;
76723 + unsigned long flags;
76724 +
76725 + if (domain != 0 || bus != 0)
76726 + return NULL;
76727 +
76728 + if (PCI_SLOT(devfn) < PCI_SLOT_MAX) {
76729 + spin_lock_irqsave(&vpci_dev->lock, flags);
76730 +
76731 + list_for_each_entry(entry,
76732 + &vpci_dev->dev_list[PCI_SLOT(devfn)],
76733 + list) {
76734 + if (PCI_FUNC(entry->dev->devfn) == PCI_FUNC(devfn)) {
76735 + dev = entry->dev;
76736 + break;
76737 + }
76738 + }
76739 +
76740 + spin_unlock_irqrestore(&vpci_dev->lock, flags);
76741 + }
76742 + return dev;
76743 +}
76744 +
76745 +static inline int match_slot(struct pci_dev *l, struct pci_dev *r)
76746 +{
76747 + if (pci_domain_nr(l->bus) == pci_domain_nr(r->bus)
76748 + && l->bus == r->bus && PCI_SLOT(l->devfn) == PCI_SLOT(r->devfn))
76749 + return 1;
76750 +
76751 + return 0;
76752 +}
76753 +
76754 +int pciback_add_pci_dev(struct pciback_device *pdev, struct pci_dev *dev)
76755 +{
76756 + int err = 0, slot;
76757 + struct pci_dev_entry *t, *dev_entry;
76758 + struct vpci_dev_data *vpci_dev = pdev->pci_dev_data;
76759 + unsigned long flags;
76760 +
76761 + if ((dev->class >> 24) == PCI_BASE_CLASS_BRIDGE) {
76762 + err = -EFAULT;
76763 + xenbus_dev_fatal(pdev->xdev, err,
76764 + "Can't export bridges on the virtual PCI bus");
76765 + goto out;
76766 + }
76767 +
76768 + dev_entry = kmalloc(sizeof(*dev_entry), GFP_KERNEL);
76769 + if (!dev_entry) {
76770 + err = -ENOMEM;
76771 + xenbus_dev_fatal(pdev->xdev, err,
76772 + "Error adding entry to virtual PCI bus");
76773 + goto out;
76774 + }
76775 +
76776 + dev_entry->dev = dev;
76777 +
76778 + spin_lock_irqsave(&vpci_dev->lock, flags);
76779 +
76780 + /* Keep multi-function devices together on the virtual PCI bus */
76781 + for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
76782 + if (!list_empty(&vpci_dev->dev_list[slot])) {
76783 + t = list_entry(list_first(&vpci_dev->dev_list[slot]),
76784 + struct pci_dev_entry, list);
76785 +
76786 + if (match_slot(dev, t->dev)) {
76787 + pr_info("pciback: vpci: %s: "
76788 + "assign to virtual slot %d func %d\n",
76789 + pci_name(dev), slot,
76790 + PCI_FUNC(dev->devfn));
76791 + list_add_tail(&dev_entry->list,
76792 + &vpci_dev->dev_list[slot]);
76793 + goto unlock;
76794 + }
76795 + }
76796 + }
76797 +
76798 + /* Assign to a new slot on the virtual PCI bus */
76799 + for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
76800 + if (list_empty(&vpci_dev->dev_list[slot])) {
76801 + printk(KERN_INFO
76802 + "pciback: vpci: %s: assign to virtual slot %d\n",
76803 + pci_name(dev), slot);
76804 + list_add_tail(&dev_entry->list,
76805 + &vpci_dev->dev_list[slot]);
76806 + goto unlock;
76807 + }
76808 + }
76809 +
76810 + err = -ENOMEM;
76811 + xenbus_dev_fatal(pdev->xdev, err,
76812 + "No more space on root virtual PCI bus");
76813 +
76814 + unlock:
76815 + spin_unlock_irqrestore(&vpci_dev->lock, flags);
76816 + out:
76817 + return err;
76818 +}
76819 +
76820 +void pciback_release_pci_dev(struct pciback_device *pdev, struct pci_dev *dev)
76821 +{
76822 + int slot;
76823 + struct vpci_dev_data *vpci_dev = pdev->pci_dev_data;
76824 + struct pci_dev *found_dev = NULL;
76825 + unsigned long flags;
76826 +
76827 + spin_lock_irqsave(&vpci_dev->lock, flags);
76828 +
76829 + for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
76830 + struct pci_dev_entry *e, *tmp;
76831 + list_for_each_entry_safe(e, tmp, &vpci_dev->dev_list[slot],
76832 + list) {
76833 + if (e->dev == dev) {
76834 + list_del(&e->list);
76835 + found_dev = e->dev;
76836 + kfree(e);
76837 + goto out;
76838 + }
76839 + }
76840 + }
76841 +
76842 + out:
76843 + spin_unlock_irqrestore(&vpci_dev->lock, flags);
76844 +
76845 + if (found_dev)
76846 + pcistub_put_pci_dev(found_dev);
76847 +}
76848 +
76849 +int pciback_init_devices(struct pciback_device *pdev)
76850 +{
76851 + int slot;
76852 + struct vpci_dev_data *vpci_dev;
76853 +
76854 + vpci_dev = kmalloc(sizeof(*vpci_dev), GFP_KERNEL);
76855 + if (!vpci_dev)
76856 + return -ENOMEM;
76857 +
76858 + spin_lock_init(&vpci_dev->lock);
76859 +
76860 + for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
76861 + INIT_LIST_HEAD(&vpci_dev->dev_list[slot]);
76862 + }
76863 +
76864 + pdev->pci_dev_data = vpci_dev;
76865 +
76866 + return 0;
76867 +}
76868 +
76869 +int pciback_publish_pci_roots(struct pciback_device *pdev,
76870 + publish_pci_root_cb publish_cb)
76871 +{
76872 + /* The Virtual PCI bus has only one root */
76873 + return publish_cb(pdev, 0, 0);
76874 +}
76875 +
76876 +void pciback_release_devices(struct pciback_device *pdev)
76877 +{
76878 + int slot;
76879 + struct vpci_dev_data *vpci_dev = pdev->pci_dev_data;
76880 +
76881 + for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
76882 + struct pci_dev_entry *e, *tmp;
76883 + list_for_each_entry_safe(e, tmp, &vpci_dev->dev_list[slot],
76884 + list) {
76885 + list_del(&e->list);
76886 + pcistub_put_pci_dev(e->dev);
76887 + kfree(e);
76888 + }
76889 + }
76890 +
76891 + kfree(vpci_dev);
76892 + pdev->pci_dev_data = NULL;
76893 +}
76894 diff -Nur linux-2.6.16.33-noxen/drivers/xen/pciback/xenbus.c linux-2.6.16.33/drivers/xen/pciback/xenbus.c
76895 --- linux-2.6.16.33-noxen/drivers/xen/pciback/xenbus.c 1970-01-01 00:00:00.000000000 +0000
76896 +++ linux-2.6.16.33/drivers/xen/pciback/xenbus.c 2007-01-08 15:00:45.000000000 +0000
76897 @@ -0,0 +1,458 @@
76898 +/*
76899 + * PCI Backend Xenbus Setup - handles setup with frontend and xend
76900 + *
76901 + * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
76902 + */
76903 +#include <linux/module.h>
76904 +#include <linux/init.h>
76905 +#include <linux/list.h>
76906 +#include <linux/vmalloc.h>
76907 +#include <xen/xenbus.h>
76908 +#include <xen/evtchn.h>
76909 +#include "pciback.h"
76910 +
76911 +#define INVALID_EVTCHN_IRQ (-1)
76912 +
76913 +static struct pciback_device *alloc_pdev(struct xenbus_device *xdev)
76914 +{
76915 + struct pciback_device *pdev;
76916 +
76917 + pdev = kzalloc(sizeof(struct pciback_device), GFP_KERNEL);
76918 + if (pdev == NULL)
76919 + goto out;
76920 + dev_dbg(&xdev->dev, "allocated pdev @ 0x%p\n", pdev);
76921 +
76922 + pdev->xdev = xdev;
76923 + xdev->dev.driver_data = pdev;
76924 +
76925 + spin_lock_init(&pdev->dev_lock);
76926 +
76927 + pdev->sh_area = NULL;
76928 + pdev->sh_info = NULL;
76929 + pdev->evtchn_irq = INVALID_EVTCHN_IRQ;
76930 + pdev->be_watching = 0;
76931 +
76932 + INIT_WORK(&pdev->op_work, pciback_do_op, pdev);
76933 +
76934 + if (pciback_init_devices(pdev)) {
76935 + kfree(pdev);
76936 + pdev = NULL;
76937 + }
76938 + out:
76939 + return pdev;
76940 +}
76941 +
76942 +static void free_pdev(struct pciback_device *pdev)
76943 +{
76944 + if (pdev->be_watching)
76945 + unregister_xenbus_watch(&pdev->be_watch);
76946 +
76947 + /* Ensure the guest can't trigger our handler before removing devices */
76948 + if (pdev->evtchn_irq != INVALID_EVTCHN_IRQ)
76949 + unbind_from_irqhandler(pdev->evtchn_irq, pdev);
76950 +
76951 + /* If the driver domain started an op, make sure we complete it or
76952 + * delete it before releasing the shared memory */
76953 + cancel_delayed_work(&pdev->op_work);
76954 + flush_scheduled_work();
76955 +
76956 + if (pdev->sh_info)
76957 + xenbus_unmap_ring_vfree(pdev->xdev, pdev->sh_area);
76958 +
76959 + pciback_release_devices(pdev);
76960 +
76961 + pdev->xdev->dev.driver_data = NULL;
76962 + pdev->xdev = NULL;
76963 +
76964 + kfree(pdev);
76965 +}
76966 +
76967 +static int pciback_do_attach(struct pciback_device *pdev, int gnt_ref,
76968 + int remote_evtchn)
76969 +{
76970 + int err = 0;
76971 + int evtchn;
76972 + struct vm_struct *area;
76973 +
76974 + dev_dbg(&pdev->xdev->dev,
76975 + "Attaching to frontend resources - gnt_ref=%d evtchn=%d\n",
76976 + gnt_ref, remote_evtchn);
76977 +
76978 + area = xenbus_map_ring_valloc(pdev->xdev, gnt_ref);
76979 + if (IS_ERR(area)) {
76980 + err = PTR_ERR(area);
76981 + goto out;
76982 + }
76983 + pdev->sh_area = area;
76984 + pdev->sh_info = area->addr;
76985 +
76986 + err = xenbus_bind_evtchn(pdev->xdev, remote_evtchn, &evtchn);
76987 + if (err)
76988 + goto out;
76989 +
76990 + err = bind_evtchn_to_irqhandler(evtchn, pciback_handle_event,
76991 + SA_SAMPLE_RANDOM, "pciback", pdev);
76992 + if (err < 0) {
76993 + xenbus_dev_fatal(pdev->xdev, err,
76994 + "Error binding event channel to IRQ");
76995 + goto out;
76996 + }
76997 + pdev->evtchn_irq = err;
76998 + err = 0;
76999 +
77000 + dev_dbg(&pdev->xdev->dev, "Attached!\n");
77001 + out:
77002 + return err;
77003 +}
77004 +
77005 +static int pciback_attach(struct pciback_device *pdev)
77006 +{
77007 + int err = 0;
77008 + int gnt_ref, remote_evtchn;
77009 + char *magic = NULL;
77010 +
77011 + spin_lock(&pdev->dev_lock);
77012 +
77013 + /* Make sure we only do this setup once */
77014 + if (xenbus_read_driver_state(pdev->xdev->nodename) !=
77015 + XenbusStateInitialised)
77016 + goto out;
77017 +
77018 + /* Wait for frontend to state that it has published the configuration */
77019 + if (xenbus_read_driver_state(pdev->xdev->otherend) !=
77020 + XenbusStateInitialised)
77021 + goto out;
77022 +
77023 + dev_dbg(&pdev->xdev->dev, "Reading frontend config\n");
77024 +
77025 + err = xenbus_gather(XBT_NIL, pdev->xdev->otherend,
77026 + "pci-op-ref", "%u", &gnt_ref,
77027 + "event-channel", "%u", &remote_evtchn,
77028 + "magic", NULL, &magic, NULL);
77029 + if (err) {
77030 + /* If configuration didn't get read correctly, wait longer */
77031 + xenbus_dev_fatal(pdev->xdev, err,
77032 + "Error reading configuration from frontend");
77033 + goto out;
77034 + }
77035 +
77036 + if (magic == NULL || strcmp(magic, XEN_PCI_MAGIC) != 0) {
77037 + xenbus_dev_fatal(pdev->xdev, -EFAULT,
77038 + "version mismatch (%s/%s) with pcifront - "
77039 + "halting pciback",
77040 + magic, XEN_PCI_MAGIC);
77041 + goto out;
77042 + }
77043 +
77044 + err = pciback_do_attach(pdev, gnt_ref, remote_evtchn);
77045 + if (err)
77046 + goto out;
77047 +
77048 + dev_dbg(&pdev->xdev->dev, "Connecting...\n");
77049 +
77050 + err = xenbus_switch_state(pdev->xdev, XenbusStateConnected);
77051 + if (err)
77052 + xenbus_dev_fatal(pdev->xdev, err,
77053 + "Error switching to connected state!");
77054 +
77055 + dev_dbg(&pdev->xdev->dev, "Connected? %d\n", err);
77056 + out:
77057 + spin_unlock(&pdev->dev_lock);
77058 +
77059 + if (magic)
77060 + kfree(magic);
77061 +
77062 + return err;
77063 +}
77064 +
77065 +static void pciback_frontend_changed(struct xenbus_device *xdev,
77066 + enum xenbus_state fe_state)
77067 +{
77068 + struct pciback_device *pdev = xdev->dev.driver_data;
77069 +
77070 + dev_dbg(&xdev->dev, "fe state changed %d\n", fe_state);
77071 +
77072 + switch (fe_state) {
77073 + case XenbusStateInitialised:
77074 + pciback_attach(pdev);
77075 + break;
77076 +
77077 + case XenbusStateClosing:
77078 + xenbus_switch_state(xdev, XenbusStateClosing);
77079 + break;
77080 +
77081 + case XenbusStateUnknown:
77082 + case XenbusStateClosed:
77083 + dev_dbg(&xdev->dev, "frontend is gone! unregister device\n");
77084 + device_unregister(&xdev->dev);
77085 + break;
77086 +
77087 + default:
77088 + break;
77089 + }
77090 +}
77091 +
77092 +static int pciback_publish_pci_root(struct pciback_device *pdev,
77093 + unsigned int domain, unsigned int bus)
77094 +{
77095 + unsigned int d, b;
77096 + int i, root_num, len, err;
77097 + char str[64];
77098 +
77099 + dev_dbg(&pdev->xdev->dev, "Publishing pci roots\n");
77100 +
77101 + err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename,
77102 + "root_num", "%d", &root_num);
77103 + if (err == 0 || err == -ENOENT)
77104 + root_num = 0;
77105 + else if (err < 0)
77106 + goto out;
77107 +
77108 + /* Verify that we haven't already published this pci root */
77109 + for (i = 0; i < root_num; i++) {
77110 + len = snprintf(str, sizeof(str), "root-%d", i);
77111 + if (unlikely(len >= (sizeof(str) - 1))) {
77112 + err = -ENOMEM;
77113 + goto out;
77114 + }
77115 +
77116 + err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename,
77117 + str, "%x:%x", &d, &b);
77118 + if (err < 0)
77119 + goto out;
77120 + if (err != 2) {
77121 + err = -EINVAL;
77122 + goto out;
77123 + }
77124 +
77125 + if (d == domain && b == bus) {
77126 + err = 0;
77127 + goto out;
77128 + }
77129 + }
77130 +
77131 + len = snprintf(str, sizeof(str), "root-%d", root_num);
77132 + if (unlikely(len >= (sizeof(str) - 1))) {
77133 + err = -ENOMEM;
77134 + goto out;
77135 + }
77136 +
77137 + dev_dbg(&pdev->xdev->dev, "writing root %d at %04x:%02x\n",
77138 + root_num, domain, bus);
77139 +
77140 + err = xenbus_printf(XBT_NIL, pdev->xdev->nodename, str,
77141 + "%04x:%02x", domain, bus);
77142 + if (err)
77143 + goto out;
77144 +
77145 + err = xenbus_printf(XBT_NIL, pdev->xdev->nodename,
77146 + "root_num", "%d", (root_num + 1));
77147 +
77148 + out:
77149 + return err;
77150 +}
77151 +
77152 +static int pciback_export_device(struct pciback_device *pdev,
77153 + int domain, int bus, int slot, int func)
77154 +{
77155 + struct pci_dev *dev;
77156 + int err = 0;
77157 +
77158 + dev_dbg(&pdev->xdev->dev, "exporting dom %x bus %x slot %x func %x\n",
77159 + domain, bus, slot, func);
77160 +
77161 + dev = pcistub_get_pci_dev_by_slot(pdev, domain, bus, slot, func);
77162 + if (!dev) {
77163 + err = -EINVAL;
77164 + xenbus_dev_fatal(pdev->xdev, err,
77165 + "Couldn't locate PCI device "
77166 + "(%04x:%02x:%02x.%01x)! "
77167 + "perhaps already in-use?",
77168 + domain, bus, slot, func);
77169 + goto out;
77170 + }
77171 +
77172 + err = pciback_add_pci_dev(pdev, dev);
77173 + if (err)
77174 + goto out;
77175 +
77176 + /* TODO: It'd be nice to export a bridge and have all of its children
77177 + * get exported with it. This may be best done in xend (which will
77178 + * have to calculate resource usage anyway) but we probably want to
77179 + * put something in here to ensure that if a bridge gets given to a
77180 + * driver domain, that all devices under that bridge are not given
77181 + * to other driver domains (as he who controls the bridge can disable
77182 + * it and stop the other devices from working).
77183 + */
77184 + out:
77185 + return err;
77186 +}
77187 +
77188 +static int pciback_setup_backend(struct pciback_device *pdev)
77189 +{
77190 + /* Get configuration from xend (if available now) */
77191 + int domain, bus, slot, func;
77192 + int err = 0;
77193 + int i, num_devs;
77194 + char dev_str[64];
77195 +
77196 + spin_lock(&pdev->dev_lock);
77197 +
77198 + /* It's possible we could get the call to setup twice, so make sure
77199 + * we're not already connected.
77200 + */
77201 + if (xenbus_read_driver_state(pdev->xdev->nodename) !=
77202 + XenbusStateInitWait)
77203 + goto out;
77204 +
77205 + dev_dbg(&pdev->xdev->dev, "getting be setup\n");
77206 +
77207 + err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename, "num_devs", "%d",
77208 + &num_devs);
77209 + if (err != 1) {
77210 + if (err >= 0)
77211 + err = -EINVAL;
77212 + xenbus_dev_fatal(pdev->xdev, err,
77213 + "Error reading number of devices");
77214 + goto out;
77215 + }
77216 +
77217 + for (i = 0; i < num_devs; i++) {
77218 + int l = snprintf(dev_str, sizeof(dev_str), "dev-%d", i);
77219 + if (unlikely(l >= (sizeof(dev_str) - 1))) {
77220 + err = -ENOMEM;
77221 + xenbus_dev_fatal(pdev->xdev, err,
77222 + "String overflow while reading "
77223 + "configuration");
77224 + goto out;
77225 + }
77226 +
77227 + err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename, dev_str,
77228 + "%x:%x:%x.%x", &domain, &bus, &slot, &func);
77229 + if (err < 0) {
77230 + xenbus_dev_fatal(pdev->xdev, err,
77231 + "Error reading device configuration");
77232 + goto out;
77233 + }
77234 + if (err != 4) {
77235 + err = -EINVAL;
77236 + xenbus_dev_fatal(pdev->xdev, err,
77237 + "Error parsing pci device "
77238 + "configuration");
77239 + goto out;
77240 + }
77241 +
77242 + err = pciback_export_device(pdev, domain, bus, slot, func);
77243 + if (err)
77244 + goto out;
77245 + }
77246 +
77247 + err = pciback_publish_pci_roots(pdev, pciback_publish_pci_root);
77248 + if (err) {
77249 + xenbus_dev_fatal(pdev->xdev, err,
77250 + "Error while publish PCI root buses "
77251 + "for frontend");
77252 + goto out;
77253 + }
77254 +
77255 + err = xenbus_switch_state(pdev->xdev, XenbusStateInitialised);
77256 + if (err)
77257 + xenbus_dev_fatal(pdev->xdev, err,
77258 + "Error switching to initialised state!");
77259 +
77260 + out:
77261 + spin_unlock(&pdev->dev_lock);
77262 +
77263 + if (!err)
77264 + /* see if pcifront is already configured (if not, we'll wait) */
77265 + pciback_attach(pdev);
77266 +
77267 + return err;
77268 +}
77269 +
77270 +static void pciback_be_watch(struct xenbus_watch *watch,
77271 + const char **vec, unsigned int len)
77272 +{
77273 + struct pciback_device *pdev =
77274 + container_of(watch, struct pciback_device, be_watch);
77275 +
77276 + switch (xenbus_read_driver_state(pdev->xdev->nodename)) {
77277 + case XenbusStateInitWait:
77278 + pciback_setup_backend(pdev);
77279 + break;
77280 +
77281 + default:
77282 + break;
77283 + }
77284 +}
77285 +
77286 +static int pciback_xenbus_probe(struct xenbus_device *dev,
77287 + const struct xenbus_device_id *id)
77288 +{
77289 + int err = 0;
77290 + struct pciback_device *pdev = alloc_pdev(dev);
77291 +
77292 + if (pdev == NULL) {
77293 + err = -ENOMEM;
77294 + xenbus_dev_fatal(dev, err,
77295 + "Error allocating pciback_device struct");
77296 + goto out;
77297 + }
77298 +
77299 + /* wait for xend to configure us */
77300 + err = xenbus_switch_state(dev, XenbusStateInitWait);
77301 + if (err)
77302 + goto out;
77303 +
77304 + /* watch the backend node for backend configuration information */
77305 + err = xenbus_watch_path(dev, dev->nodename, &pdev->be_watch,
77306 + pciback_be_watch);
77307 + if (err)
77308 + goto out;
77309 + pdev->be_watching = 1;
77310 +
77311 + /* We need to force a call to our callback here in case
77312 + * xend already configured us!
77313 + */
77314 + pciback_be_watch(&pdev->be_watch, NULL, 0);
77315 +
77316 + out:
77317 + return err;
77318 +}
77319 +
77320 +static int pciback_xenbus_remove(struct xenbus_device *dev)
77321 +{
77322 + struct pciback_device *pdev = dev->dev.driver_data;
77323 +
77324 + if (pdev != NULL)
77325 + free_pdev(pdev);
77326 +
77327 + return 0;
77328 +}
77329 +
77330 +static struct xenbus_device_id xenpci_ids[] = {
77331 + {"pci"},
77332 + {{0}},
77333 +};
77334 +
77335 +static struct xenbus_driver xenbus_pciback_driver = {
77336 + .name = "pciback",
77337 + .owner = THIS_MODULE,
77338 + .ids = xenpci_ids,
77339 + .probe = pciback_xenbus_probe,
77340 + .remove = pciback_xenbus_remove,
77341 + .otherend_changed = pciback_frontend_changed,
77342 +};
77343 +
77344 +int __init pciback_xenbus_register(void)
77345 +{
77346 + if (!is_running_on_xen())
77347 + return -ENODEV;
77348 +
77349 + return xenbus_register_backend(&xenbus_pciback_driver);
77350 +}
77351 +
77352 +void __exit pciback_xenbus_unregister(void)
77353 +{
77354 + xenbus_unregister_driver(&xenbus_pciback_driver);
77355 +}
77356 diff -Nur linux-2.6.16.33-noxen/drivers/xen/pcifront/Makefile linux-2.6.16.33/drivers/xen/pcifront/Makefile
77357 --- linux-2.6.16.33-noxen/drivers/xen/pcifront/Makefile 1970-01-01 00:00:00.000000000 +0000
77358 +++ linux-2.6.16.33/drivers/xen/pcifront/Makefile 2007-01-08 15:00:45.000000000 +0000
77359 @@ -0,0 +1,7 @@
77360 +obj-y += pcifront.o
77361 +
77362 +pcifront-y := pci_op.o xenbus.o pci.o
77363 +
77364 +ifeq ($(CONFIG_XEN_PCIDEV_FE_DEBUG),y)
77365 +EXTRA_CFLAGS += -DDEBUG
77366 +endif
77367 diff -Nur linux-2.6.16.33-noxen/drivers/xen/pcifront/pci.c linux-2.6.16.33/drivers/xen/pcifront/pci.c
77368 --- linux-2.6.16.33-noxen/drivers/xen/pcifront/pci.c 1970-01-01 00:00:00.000000000 +0000
77369 +++ linux-2.6.16.33/drivers/xen/pcifront/pci.c 2007-01-08 15:00:45.000000000 +0000
77370 @@ -0,0 +1,46 @@
77371 +/*
77372 + * PCI Frontend Operations - ensure only one PCI frontend runs at a time
77373 + *
77374 + * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
77375 + */
77376 +#include <linux/module.h>
77377 +#include <linux/init.h>
77378 +#include <linux/pci.h>
77379 +#include <linux/spinlock.h>
77380 +#include "pcifront.h"
77381 +
77382 +DEFINE_SPINLOCK(pcifront_dev_lock);
77383 +static struct pcifront_device *pcifront_dev = NULL;
77384 +
77385 +int pcifront_connect(struct pcifront_device *pdev)
77386 +{
77387 + int err = 0;
77388 +
77389 + spin_lock(&pcifront_dev_lock);
77390 +
77391 + if (!pcifront_dev) {
77392 + dev_info(&pdev->xdev->dev, "Installing PCI frontend\n");
77393 + pcifront_dev = pdev;
77394 + }
77395 + else {
77396 + dev_err(&pdev->xdev->dev, "PCI frontend already installed!\n");
77397 + err = -EEXIST;
77398 + }
77399 +
77400 + spin_unlock(&pcifront_dev_lock);
77401 +
77402 + return err;
77403 +}
77404 +
77405 +void pcifront_disconnect(struct pcifront_device *pdev)
77406 +{
77407 + spin_lock(&pcifront_dev_lock);
77408 +
77409 + if (pdev == pcifront_dev) {
77410 + dev_info(&pdev->xdev->dev,
77411 + "Disconnecting PCI Frontend Buses\n");
77412 + pcifront_dev = NULL;
77413 + }
77414 +
77415 + spin_unlock(&pcifront_dev_lock);
77416 +}
77417 diff -Nur linux-2.6.16.33-noxen/drivers/xen/pcifront/pci_op.c linux-2.6.16.33/drivers/xen/pcifront/pci_op.c
77418 --- linux-2.6.16.33-noxen/drivers/xen/pcifront/pci_op.c 1970-01-01 00:00:00.000000000 +0000
77419 +++ linux-2.6.16.33/drivers/xen/pcifront/pci_op.c 2007-01-08 15:00:45.000000000 +0000
77420 @@ -0,0 +1,273 @@
77421 +/*
77422 + * PCI Frontend Operations - Communicates with the backend
77423 + *
77424 + * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
77425 + */
77426 +#include <linux/module.h>
77427 +#include <linux/version.h>
77428 +#include <linux/init.h>
77429 +#include <linux/pci.h>
77430 +#include <linux/spinlock.h>
77431 +#include <linux/time.h>
77432 +#include <xen/evtchn.h>
77433 +#include "pcifront.h"
77434 +
77435 +static int verbose_request = 0;
77436 +module_param(verbose_request, int, 0644);
77437 +
77438 +static int errno_to_pcibios_err(int errno)
77439 +{
77440 + switch (errno) {
77441 + case XEN_PCI_ERR_success:
77442 + return PCIBIOS_SUCCESSFUL;
77443 +
77444 + case XEN_PCI_ERR_dev_not_found:
77445 + return PCIBIOS_DEVICE_NOT_FOUND;
77446 +
77447 + case XEN_PCI_ERR_invalid_offset:
77448 + case XEN_PCI_ERR_op_failed:
77449 + return PCIBIOS_BAD_REGISTER_NUMBER;
77450 +
77451 + case XEN_PCI_ERR_not_implemented:
77452 + return PCIBIOS_FUNC_NOT_SUPPORTED;
77453 +
77454 + case XEN_PCI_ERR_access_denied:
77455 + return PCIBIOS_SET_FAILED;
77456 + }
77457 + return errno;
77458 +}
77459 +
77460 +static int do_pci_op(struct pcifront_device *pdev, struct xen_pci_op *op)
77461 +{
77462 + int err = 0;
77463 + struct xen_pci_op *active_op = &pdev->sh_info->op;
77464 + unsigned long irq_flags;
77465 + evtchn_port_t port = pdev->evtchn;
77466 + nsec_t ns, ns_timeout;
77467 + struct timeval tv;
77468 +
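+ /*
+  * Hand the request to the backend through the shared info page: copy
+  * the op in, flag it active, kick the event channel, then poll until
+  * the backend clears the active bit (or we time out) and copy the
+  * completed op back out.
+  */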
77469 + spin_lock_irqsave(&pdev->sh_info_lock, irq_flags);
77470 +
77471 + memcpy(active_op, op, sizeof(struct xen_pci_op));
77472 +
77473 + /* Go */
77474 + wmb();
77475 + set_bit(_XEN_PCIF_active, (unsigned long *)&pdev->sh_info->flags);
77476 + notify_remote_via_evtchn(port);
77477 +
77478 + /*
77479 + * We set a poll timeout of 3 seconds but give up on return after
77480 + * 2 seconds. It is better to time out too late rather than too early
77481 + * (in the latter case we end up continually re-executing poll() with a
77482 + * timeout in the past). 1s difference gives plenty of slack for error.
77483 + */
77484 + do_gettimeofday(&tv);
77485 + ns_timeout = timeval_to_ns(&tv) + 2 * (nsec_t)NSEC_PER_SEC;
77486 +
77487 + clear_evtchn(port);
77488 +
77489 + while (test_bit(_XEN_PCIF_active,
77490 + (unsigned long *)&pdev->sh_info->flags)) {
77491 + if (HYPERVISOR_poll(&port, 1, jiffies + 3*HZ))
77492 + BUG();
77493 + clear_evtchn(port);
77494 + do_gettimeofday(&tv);
77495 + ns = timeval_to_ns(&tv);
77496 + if (ns > ns_timeout) {
77497 + dev_err(&pdev->xdev->dev,
77498 + "pciback not responding!!!\n");
77499 + clear_bit(_XEN_PCIF_active,
77500 + (unsigned long *)&pdev->sh_info->flags);
77501 + err = XEN_PCI_ERR_dev_not_found;
77502 + goto out;
77503 + }
77504 + }
77505 +
77506 + memcpy(op, active_op, sizeof(struct xen_pci_op));
77507 +
77508 + err = op->err;
77509 + out:
77510 + spin_unlock_irqrestore(&pdev->sh_info_lock, irq_flags);
77511 + return err;
77512 +}
77513 +
77514 +/* Access to this function is spinlocked in drivers/pci/access.c */
77515 +static int pcifront_bus_read(struct pci_bus *bus, unsigned int devfn,
77516 + int where, int size, u32 * val)
77517 +{
77518 + int err = 0;
77519 + struct xen_pci_op op = {
77520 + .cmd = XEN_PCI_OP_conf_read,
77521 + .domain = pci_domain_nr(bus),
77522 + .bus = bus->number,
77523 + .devfn = devfn,
77524 + .offset = where,
77525 + .size = size,
77526 + };
77527 + struct pcifront_sd *sd = bus->sysdata;
77528 + struct pcifront_device *pdev = pcifront_get_pdev(sd);
77529 +
77530 + if (verbose_request)
77531 + dev_info(&pdev->xdev->dev,
77532 + "read dev=%04x:%02x:%02x.%01x - offset %x size %d\n",
77533 + pci_domain_nr(bus), bus->number, PCI_SLOT(devfn),
77534 + PCI_FUNC(devfn), where, size);
77535 +
77536 + err = do_pci_op(pdev, &op);
77537 +
77538 + if (likely(!err)) {
77539 + if (verbose_request)
77540 + dev_info(&pdev->xdev->dev, "read got back value %x\n",
77541 + op.value);
77542 +
77543 + *val = op.value;
77544 + } else if (err == -ENODEV) {
77545 + /* No device here, pretend that it just returned 0 */
77546 + err = 0;
77547 + *val = 0;
77548 + }
77549 +
77550 + return errno_to_pcibios_err(err);
77551 +}
77552 +
77553 +/* Access to this function is spinlocked in drivers/pci/access.c */
77554 +static int pcifront_bus_write(struct pci_bus *bus, unsigned int devfn,
77555 + int where, int size, u32 val)
77556 +{
77557 + struct xen_pci_op op = {
77558 + .cmd = XEN_PCI_OP_conf_write,
77559 + .domain = pci_domain_nr(bus),
77560 + .bus = bus->number,
77561 + .devfn = devfn,
77562 + .offset = where,
77563 + .size = size,
77564 + .value = val,
77565 + };
77566 + struct pcifront_sd *sd = bus->sysdata;
77567 + struct pcifront_device *pdev = pcifront_get_pdev(sd);
77568 +
77569 + if (verbose_request)
77570 + dev_info(&pdev->xdev->dev,
77571 + "write dev=%04x:%02x:%02x.%01x - "
77572 + "offset %x size %d val %x\n",
77573 + pci_domain_nr(bus), bus->number,
77574 + PCI_SLOT(devfn), PCI_FUNC(devfn), where, size, val);
77575 +
77576 + return errno_to_pcibios_err(do_pci_op(pdev, &op));
77577 +}
77578 +
77579 +struct pci_ops pcifront_bus_ops = {
77580 + .read = pcifront_bus_read,
77581 + .write = pcifront_bus_write,
77582 +};
77583 +
77584 +/* Claim resources for the PCI frontend as-is, backend won't allow changes */
77585 +static void pcifront_claim_resource(struct pci_dev *dev, void *data)
77586 +{
77587 + struct pcifront_device *pdev = data;
77588 + int i;
77589 + struct resource *r;
77590 +
77591 + for (i = 0; i < PCI_NUM_RESOURCES; i++) {
77592 + r = &dev->resource[i];
77593 +
77594 + if (!r->parent && r->start && r->flags) {
77595 + dev_dbg(&pdev->xdev->dev, "claiming resource %s/%d\n",
77596 + pci_name(dev), i);
77597 + pci_claim_resource(dev, i);
77598 + }
77599 + }
77600 +}
77601 +
77602 +int pcifront_scan_root(struct pcifront_device *pdev,
77603 + unsigned int domain, unsigned int bus)
77604 +{
77605 + struct pci_bus *b;
77606 + struct pcifront_sd *sd = NULL;
77607 + struct pci_bus_entry *bus_entry = NULL;
77608 + int err = 0;
77609 +
77610 +#ifndef CONFIG_PCI_DOMAINS
77611 + if (domain != 0) {
77612 + dev_err(&pdev->xdev->dev,
77613 + "PCI Root in non-zero PCI Domain! domain=%d\n", domain);
77614 + dev_err(&pdev->xdev->dev,
77615 + "Please compile with CONFIG_PCI_DOMAINS\n");
77616 + err = -EINVAL;
77617 + goto err_out;
77618 + }
77619 +#endif
77620 +
77621 + dev_info(&pdev->xdev->dev, "Creating PCI Frontend Bus %04x:%02x\n",
77622 + domain, bus);
77623 +
77624 + bus_entry = kmalloc(sizeof(*bus_entry), GFP_KERNEL);
77625 + sd = kmalloc(sizeof(*sd), GFP_KERNEL);
77626 + if (!bus_entry || !sd) {
77627 + err = -ENOMEM;
77628 + goto err_out;
77629 + }
77630 + pcifront_init_sd(sd, domain, pdev);
77631 +
77632 + b = pci_scan_bus_parented(&pdev->xdev->dev, bus,
77633 + &pcifront_bus_ops, sd);
77634 + if (!b) {
77635 + dev_err(&pdev->xdev->dev,
77636 + "Error creating PCI Frontend Bus!\n");
77637 + err = -ENOMEM;
77638 + goto err_out;
77639 + }
77640 + bus_entry->bus = b;
77641 +
77642 + list_add(&bus_entry->list, &pdev->root_buses);
77643 +
77644 + /* Claim resources before going "live" with our devices */
77645 + pci_walk_bus(b, pcifront_claim_resource, pdev);
77646 +
77647 + pci_bus_add_devices(b);
77648 +
77649 + return 0;
77650 +
77651 + err_out:
77652 + kfree(bus_entry);
77653 + kfree(sd);
77654 +
77655 + return err;
77656 +}
77657 +
77658 +static void free_root_bus_devs(struct pci_bus *bus)
77659 +{
77660 + struct pci_dev *dev;
77661 +
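+ /* Walk the device list without holding pci_bus_lock across the actual removal. */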
77662 + spin_lock(&pci_bus_lock);
77663 + while (!list_empty(&bus->devices)) {
77664 + dev = container_of(bus->devices.next, struct pci_dev, bus_list);
77665 + spin_unlock(&pci_bus_lock);
77666 +
77667 + dev_dbg(&dev->dev, "removing device\n");
77668 + pci_remove_bus_device(dev);
77669 +
77670 + spin_lock(&pci_bus_lock);
77671 + }
77672 + spin_unlock(&pci_bus_lock);
77673 +}
77674 +
77675 +void pcifront_free_roots(struct pcifront_device *pdev)
77676 +{
77677 + struct pci_bus_entry *bus_entry, *t;
77678 +
77679 + dev_dbg(&pdev->xdev->dev, "cleaning up root buses\n");
77680 +
77681 + list_for_each_entry_safe(bus_entry, t, &pdev->root_buses, list) {
77682 + list_del(&bus_entry->list);
77683 +
77684 + free_root_bus_devs(bus_entry->bus);
77685 +
77686 + kfree(bus_entry->bus->sysdata);
77687 +
77688 + device_unregister(bus_entry->bus->bridge);
77689 + pci_remove_bus(bus_entry->bus);
77690 +
77691 + kfree(bus_entry);
77692 + }
77693 +}
77694 diff -Nur linux-2.6.16.33-noxen/drivers/xen/pcifront/pcifront.h linux-2.6.16.33/drivers/xen/pcifront/pcifront.h
77695 --- linux-2.6.16.33-noxen/drivers/xen/pcifront/pcifront.h 1970-01-01 00:00:00.000000000 +0000
77696 +++ linux-2.6.16.33/drivers/xen/pcifront/pcifront.h 2007-01-08 15:00:45.000000000 +0000
77697 @@ -0,0 +1,40 @@
77698 +/*
77699 + * PCI Frontend - Common data structures & function declarations
77700 + *
77701 + * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
77702 + */
77703 +#ifndef __XEN_PCIFRONT_H__
77704 +#define __XEN_PCIFRONT_H__
77705 +
77706 +#include <linux/spinlock.h>
77707 +#include <linux/pci.h>
77708 +#include <xen/xenbus.h>
77709 +#include <xen/interface/io/pciif.h>
77710 +#include <xen/pcifront.h>
77711 +
77712 +struct pci_bus_entry {
77713 + struct list_head list;
77714 + struct pci_bus *bus;
77715 +};
77716 +
77717 +struct pcifront_device {
77718 + struct xenbus_device *xdev;
77719 + struct list_head root_buses;
77720 + spinlock_t dev_lock;
77721 +
77722 + int evtchn;
77723 + int gnt_ref;
77724 +
77725 + /* Lock this when doing any operations in sh_info */
77726 + spinlock_t sh_info_lock;
77727 + struct xen_pci_sharedinfo *sh_info;
77728 +};
77729 +
77730 +int pcifront_connect(struct pcifront_device *pdev);
77731 +void pcifront_disconnect(struct pcifront_device *pdev);
77732 +
77733 +int pcifront_scan_root(struct pcifront_device *pdev,
77734 + unsigned int domain, unsigned int bus);
77735 +void pcifront_free_roots(struct pcifront_device *pdev);
77736 +
77737 +#endif /* __XEN_PCIFRONT_H__ */
77738 diff -Nur linux-2.6.16.33-noxen/drivers/xen/pcifront/xenbus.c linux-2.6.16.33/drivers/xen/pcifront/xenbus.c
77739 --- linux-2.6.16.33-noxen/drivers/xen/pcifront/xenbus.c 1970-01-01 00:00:00.000000000 +0000
77740 +++ linux-2.6.16.33/drivers/xen/pcifront/xenbus.c 2007-01-08 15:00:45.000000000 +0000
77741 @@ -0,0 +1,295 @@
77742 +/*
77743 + * PCI Frontend Xenbus Setup - handles setup with backend (imports page/evtchn)
77744 + *
77745 + * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
77746 + */
77747 +#include <linux/module.h>
77748 +#include <linux/init.h>
77749 +#include <linux/mm.h>
77750 +#include <xen/xenbus.h>
77751 +#include <xen/gnttab.h>
77752 +#include "pcifront.h"
77753 +
77754 +#define INVALID_GRANT_REF (0)
77755 +#define INVALID_EVTCHN (-1)
77756 +
77757 +static struct pcifront_device *alloc_pdev(struct xenbus_device *xdev)
77758 +{
77759 + struct pcifront_device *pdev;
77760 +
77761 + pdev = kmalloc(sizeof(struct pcifront_device), GFP_KERNEL);
77762 + if (pdev == NULL)
77763 + goto out;
77764 +
77765 + pdev->sh_info =
77766 + (struct xen_pci_sharedinfo *)__get_free_page(GFP_KERNEL);
77767 + if (pdev->sh_info == NULL) {
77768 + kfree(pdev);
77769 + pdev = NULL;
77770 + goto out;
77771 + }
77772 + pdev->sh_info->flags = 0;
77773 +
77774 + xdev->dev.driver_data = pdev;
77775 + pdev->xdev = xdev;
77776 +
77777 + INIT_LIST_HEAD(&pdev->root_buses);
77778 +
77779 + spin_lock_init(&pdev->dev_lock);
77780 + spin_lock_init(&pdev->sh_info_lock);
77781 +
77782 + pdev->evtchn = INVALID_EVTCHN;
77783 + pdev->gnt_ref = INVALID_GRANT_REF;
77784 +
77785 + dev_dbg(&xdev->dev, "Allocated pdev @ 0x%p pdev->sh_info @ 0x%p\n",
77786 + pdev, pdev->sh_info);
77787 + out:
77788 + return pdev;
77789 +}
77790 +
77791 +static void free_pdev(struct pcifront_device *pdev)
77792 +{
77793 + dev_dbg(&pdev->xdev->dev, "freeing pdev @ 0x%p\n", pdev);
77794 +
77795 + pcifront_free_roots(pdev);
77796 +
77797 + if (pdev->evtchn != INVALID_EVTCHN)
77798 + xenbus_free_evtchn(pdev->xdev, pdev->evtchn);
77799 +
77800 + if (pdev->gnt_ref != INVALID_GRANT_REF)
77801 + gnttab_end_foreign_access(pdev->gnt_ref, 0,
77802 + (unsigned long)pdev->sh_info);
77803 +
77804 + pdev->xdev->dev.driver_data = NULL;
77805 +
77806 + kfree(pdev);
77807 +}
77808 +
77809 +static int pcifront_publish_info(struct pcifront_device *pdev)
77810 +{
77811 + int err = 0;
77812 + struct xenbus_transaction trans;
77813 +
77814 + err = xenbus_grant_ring(pdev->xdev, virt_to_mfn(pdev->sh_info));
77815 + if (err < 0)
77816 + goto out;
77817 +
77818 + pdev->gnt_ref = err;
77819 +
77820 + err = xenbus_alloc_evtchn(pdev->xdev, &pdev->evtchn);
77821 + if (err)
77822 + goto out;
77823 +
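+ /*
+  * Publish the grant reference and event channel in xenstore inside a
+  * transaction; the whole transaction is retried if it ends -EAGAIN.
+  */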
77824 + do_publish:
77825 + err = xenbus_transaction_start(&trans);
77826 + if (err) {
77827 + xenbus_dev_fatal(pdev->xdev, err,
77828 + "Error writing configuration for backend "
77829 + "(start transaction)");
77830 + goto out;
77831 + }
77832 +
77833 + err = xenbus_printf(trans, pdev->xdev->nodename,
77834 + "pci-op-ref", "%u", pdev->gnt_ref);
77835 + if (!err)
77836 + err = xenbus_printf(trans, pdev->xdev->nodename,
77837 + "event-channel", "%u", pdev->evtchn);
77838 + if (!err)
77839 + err = xenbus_printf(trans, pdev->xdev->nodename,
77840 + "magic", XEN_PCI_MAGIC);
77841 +
77842 + if (err) {
77843 + xenbus_transaction_end(trans, 1);
77844 + xenbus_dev_fatal(pdev->xdev, err,
77845 + "Error writing configuration for backend");
77846 + goto out;
77847 + } else {
77848 + err = xenbus_transaction_end(trans, 0);
77849 + if (err == -EAGAIN)
77850 + goto do_publish;
77851 + else if (err) {
77852 + xenbus_dev_fatal(pdev->xdev, err,
77853 + "Error completing transaction "
77854 + "for backend");
77855 + goto out;
77856 + }
77857 + }
77858 +
77859 + xenbus_switch_state(pdev->xdev, XenbusStateInitialised);
77860 +
77861 + dev_dbg(&pdev->xdev->dev, "publishing successful!\n");
77862 +
77863 + out:
77864 + return err;
77865 +}
77866 +
77867 +static int pcifront_try_connect(struct pcifront_device *pdev)
77868 +{
77869 + int err = -EFAULT;
77870 + int i, num_roots, len;
77871 + char str[64];
77872 + unsigned int domain, bus;
77873 +
77874 + spin_lock(&pdev->dev_lock);
77875 +
77876 + /* Only connect once */
77877 + if (xenbus_read_driver_state(pdev->xdev->nodename) !=
77878 + XenbusStateInitialised)
77879 + goto out;
77880 +
77881 + err = pcifront_connect(pdev);
77882 + if (err) {
77883 + xenbus_dev_fatal(pdev->xdev, err,
77884 + "Error connecting PCI Frontend");
77885 + goto out;
77886 + }
77887 +
77888 + err = xenbus_scanf(XBT_NIL, pdev->xdev->otherend,
77889 + "root_num", "%d", &num_roots);
77890 + if (err == -ENOENT) {
77891 + xenbus_dev_error(pdev->xdev, err,
77892 + "No PCI Roots found, trying 0000:00");
77893 + err = pcifront_scan_root(pdev, 0, 0);
77894 + num_roots = 0;
77895 + } else if (err != 1) {
77896 + if (err == 0)
77897 + err = -EINVAL;
77898 + xenbus_dev_fatal(pdev->xdev, err,
77899 + "Error reading number of PCI roots");
77900 + goto out;
77901 + }
77902 +
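+ /* Each root-N node names one root bus as "domain:bus" in hex. */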
77903 + for (i = 0; i < num_roots; i++) {
77904 + len = snprintf(str, sizeof(str), "root-%d", i);
77905 + if (unlikely(len >= (sizeof(str) - 1))) {
77906 + err = -ENOMEM;
77907 + goto out;
77908 + }
77909 +
77910 + err = xenbus_scanf(XBT_NIL, pdev->xdev->otherend, str,
77911 + "%x:%x", &domain, &bus);
77912 + if (err != 2) {
77913 + if (err >= 0)
77914 + err = -EINVAL;
77915 + xenbus_dev_fatal(pdev->xdev, err,
77916 + "Error reading PCI root %d", i);
77917 + goto out;
77918 + }
77919 +
77920 + err = pcifront_scan_root(pdev, domain, bus);
77921 + if (err) {
77922 + xenbus_dev_fatal(pdev->xdev, err,
77923 + "Error scanning PCI root %04x:%02x",
77924 + domain, bus);
77925 + goto out;
77926 + }
77927 + }
77928 +
77929 + err = xenbus_switch_state(pdev->xdev, XenbusStateConnected);
77930 + if (err)
77931 + goto out;
77932 +
77933 + out:
77934 + spin_unlock(&pdev->dev_lock);
77935 + return err;
77936 +}
77937 +
77938 +static int pcifront_try_disconnect(struct pcifront_device *pdev)
77939 +{
77940 + int err = 0;
77941 + enum xenbus_state prev_state;
77942 +
77943 + spin_lock(&pdev->dev_lock);
77944 +
77945 + prev_state = xenbus_read_driver_state(pdev->xdev->nodename);
77946 +
77947 + if (prev_state < XenbusStateClosing)
77948 + err = xenbus_switch_state(pdev->xdev, XenbusStateClosing);
77949 +
77950 + if (!err && prev_state == XenbusStateConnected)
77951 + pcifront_disconnect(pdev);
77952 +
77953 + spin_unlock(&pdev->dev_lock);
77954 +
77955 + return err;
77956 +}
77957 +
77958 +static void pcifront_backend_changed(struct xenbus_device *xdev,
77959 + enum xenbus_state be_state)
77960 +{
77961 + struct pcifront_device *pdev = xdev->dev.driver_data;
77962 +
77963 + switch (be_state) {
77964 + case XenbusStateClosing:
77965 + dev_warn(&xdev->dev, "backend going away!\n");
77966 + pcifront_try_disconnect(pdev);
77967 + break;
77968 +
77969 + case XenbusStateUnknown:
77970 + case XenbusStateClosed:
77971 + dev_warn(&xdev->dev, "backend went away!\n");
77972 + pcifront_try_disconnect(pdev);
77973 +
77974 + device_unregister(&pdev->xdev->dev);
77975 + break;
77976 +
77977 + case XenbusStateConnected:
77978 + pcifront_try_connect(pdev);
77979 + break;
77980 +
77981 + default:
77982 + break;
77983 + }
77984 +}
77985 +
77986 +static int pcifront_xenbus_probe(struct xenbus_device *xdev,
77987 + const struct xenbus_device_id *id)
77988 +{
77989 + int err = 0;
77990 + struct pcifront_device *pdev = alloc_pdev(xdev);
77991 +
77992 + if (pdev == NULL) {
77993 + err = -ENOMEM;
77994 + xenbus_dev_fatal(xdev, err,
77995 + "Error allocating pcifront_device struct");
77996 + goto out;
77997 + }
77998 +
77999 + err = pcifront_publish_info(pdev);
78000 +
78001 + out:
78002 + return err;
78003 +}
78004 +
78005 +static int pcifront_xenbus_remove(struct xenbus_device *xdev)
78006 +{
78007 + if (xdev->dev.driver_data)
78008 + free_pdev(xdev->dev.driver_data);
78009 +
78010 + return 0;
78011 +}
78012 +
78013 +static struct xenbus_device_id xenpci_ids[] = {
78014 + {"pci"},
78015 + {{0}},
78016 +};
78017 +
78018 +static struct xenbus_driver xenbus_pcifront_driver = {
78019 + .name = "pcifront",
78020 + .owner = THIS_MODULE,
78021 + .ids = xenpci_ids,
78022 + .probe = pcifront_xenbus_probe,
78023 + .remove = pcifront_xenbus_remove,
78024 + .otherend_changed = pcifront_backend_changed,
78025 +};
78026 +
78027 +static int __init pcifront_init(void)
78028 +{
78029 + if (!is_running_on_xen())
78030 + return -ENODEV;
78031 +
78032 + return xenbus_register_frontend(&xenbus_pcifront_driver);
78033 +}
78034 +
78035 +/* Initialize after the Xen PCI Frontend Stub is initialized */
78036 +subsys_initcall(pcifront_init);
78037 diff -Nur linux-2.6.16.33-noxen/drivers/xen/privcmd/Makefile linux-2.6.16.33/drivers/xen/privcmd/Makefile
78038 --- linux-2.6.16.33-noxen/drivers/xen/privcmd/Makefile 1970-01-01 00:00:00.000000000 +0000
78039 +++ linux-2.6.16.33/drivers/xen/privcmd/Makefile 2007-01-08 15:00:45.000000000 +0000
78040 @@ -0,0 +1,2 @@
78041 +
78042 +obj-$(CONFIG_XEN_PRIVCMD) := privcmd.o
78043 diff -Nur linux-2.6.16.33-noxen/drivers/xen/privcmd/privcmd.c linux-2.6.16.33/drivers/xen/privcmd/privcmd.c
78044 --- linux-2.6.16.33-noxen/drivers/xen/privcmd/privcmd.c 1970-01-01 00:00:00.000000000 +0000
78045 +++ linux-2.6.16.33/drivers/xen/privcmd/privcmd.c 2007-01-08 15:00:45.000000000 +0000
78046 @@ -0,0 +1,286 @@
78047 +/******************************************************************************
78048 + * privcmd.c
78049 + *
78050 + * Interface to privileged domain-0 commands.
78051 + *
78052 + * Copyright (c) 2002-2004, K A Fraser, B Dragovic
78053 + */
78054 +
78055 +#include <linux/config.h>
78056 +#include <linux/kernel.h>
78057 +#include <linux/sched.h>
78058 +#include <linux/slab.h>
78059 +#include <linux/string.h>
78060 +#include <linux/errno.h>
78061 +#include <linux/mm.h>
78062 +#include <linux/mman.h>
78063 +#include <linux/swap.h>
78064 +#include <linux/smp_lock.h>
78065 +#include <linux/highmem.h>
78066 +#include <linux/pagemap.h>
78067 +#include <linux/seq_file.h>
78068 +#include <linux/kthread.h>
78069 +#include <asm/hypervisor.h>
78070 +
78071 +#include <asm/pgalloc.h>
78072 +#include <asm/pgtable.h>
78073 +#include <asm/uaccess.h>
78074 +#include <asm/tlb.h>
78075 +#include <asm/hypervisor.h>
78076 +#include <xen/public/privcmd.h>
78077 +#include <xen/interface/xen.h>
78078 +#include <xen/interface/dom0_ops.h>
78079 +#include <xen/xen_proc.h>
78080 +
78081 +static struct proc_dir_entry *privcmd_intf;
78082 +static struct proc_dir_entry *capabilities_intf;
78083 +
78084 +#ifndef HAVE_ARCH_PRIVCMD_MMAP
78085 +static int privcmd_enforce_singleshot_mapping(struct vm_area_struct *vma);
78086 +#endif
78087 +
78088 +static int privcmd_ioctl(struct inode *inode, struct file *file,
78089 + unsigned int cmd, unsigned long data)
78090 +{
78091 + int ret = -ENOSYS;
78092 + void __user *udata = (void __user *) data;
78093 +
78094 + switch (cmd) {
78095 + case IOCTL_PRIVCMD_HYPERCALL: {
78096 + privcmd_hypercall_t hypercall;
78097 +
78098 + if (copy_from_user(&hypercall, udata, sizeof(hypercall)))
78099 + return -EFAULT;
78100 +
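+ /*
+  * The hypercall page holds one 32-byte stub per hypercall, so the op
+  * number is bounds-checked against PAGE_SIZE >> 5 and shifted left by
+  * 5 to index into the page.
+  */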
78101 +#if defined(__i386__)
78102 + if (hypercall.op >= (PAGE_SIZE >> 5))
78103 + break;
78104 + __asm__ __volatile__ (
78105 + "pushl %%ebx; pushl %%ecx; pushl %%edx; "
78106 + "pushl %%esi; pushl %%edi; "
78107 + "movl 8(%%eax),%%ebx ;"
78108 + "movl 16(%%eax),%%ecx ;"
78109 + "movl 24(%%eax),%%edx ;"
78110 + "movl 32(%%eax),%%esi ;"
78111 + "movl 40(%%eax),%%edi ;"
78112 + "movl (%%eax),%%eax ;"
78113 + "shll $5,%%eax ;"
78114 + "addl $hypercall_page,%%eax ;"
78115 + "call *%%eax ;"
78116 + "popl %%edi; popl %%esi; popl %%edx; "
78117 + "popl %%ecx; popl %%ebx"
78118 + : "=a" (ret) : "0" (&hypercall) : "memory" );
78119 +#elif defined (__x86_64__)
78120 + if (hypercall.op < (PAGE_SIZE >> 5)) {
78121 + long ign1, ign2, ign3;
78122 + __asm__ __volatile__ (
78123 + "movq %8,%%r10; movq %9,%%r8;"
78124 + "shll $5,%%eax ;"
78125 + "addq $hypercall_page,%%rax ;"
78126 + "call *%%rax"
78127 + : "=a" (ret), "=D" (ign1),
78128 + "=S" (ign2), "=d" (ign3)
78129 + : "0" ((unsigned int)hypercall.op),
78130 + "1" (hypercall.arg[0]),
78131 + "2" (hypercall.arg[1]),
78132 + "3" (hypercall.arg[2]),
78133 + "g" (hypercall.arg[3]),
78134 + "g" (hypercall.arg[4])
78135 + : "r8", "r10", "memory" );
78136 + }
78137 +#elif defined (__ia64__)
78138 + ret = privcmd_hypercall(&hypercall);
78139 +#endif
78140 + }
78141 + break;
78142 +
78143 + case IOCTL_PRIVCMD_MMAP: {
78144 + privcmd_mmap_t mmapcmd;
78145 + privcmd_mmap_entry_t msg;
78146 + privcmd_mmap_entry_t __user *p;
78147 + struct mm_struct *mm = current->mm;
78148 + struct vm_area_struct *vma;
78149 + unsigned long va;
78150 + int i, rc;
78151 +
78152 + if (!is_initial_xendomain())
78153 + return -EPERM;
78154 +
78155 + if (copy_from_user(&mmapcmd, udata, sizeof(mmapcmd)))
78156 + return -EFAULT;
78157 +
78158 + p = mmapcmd.entry;
78159 + if (copy_from_user(&msg, p, sizeof(msg)))
78160 + return -EFAULT;
78161 +
78162 + down_read(&mm->mmap_sem);
78163 +
78164 + vma = find_vma(mm, msg.va);
78165 + rc = -EINVAL;
78166 + if (!vma || (msg.va != vma->vm_start) ||
78167 + !privcmd_enforce_singleshot_mapping(vma))
78168 + goto mmap_out;
78169 +
78170 + va = vma->vm_start;
78171 +
78172 + for (i = 0; i < mmapcmd.num; i++) {
78173 + rc = -EFAULT;
78174 + if (copy_from_user(&msg, p, sizeof(msg)))
78175 + goto mmap_out;
78176 +
78177 + /* Do not allow range to wrap the address space. */
78178 + rc = -EINVAL;
78179 + if ((msg.npages > (LONG_MAX >> PAGE_SHIFT)) ||
78180 + ((unsigned long)(msg.npages << PAGE_SHIFT) >= -va))
78181 + goto mmap_out;
78182 +
78183 + /* Range chunks must be contiguous in va space. */
78184 + if ((msg.va != va) ||
78185 + ((msg.va+(msg.npages<<PAGE_SHIFT)) > vma->vm_end))
78186 + goto mmap_out;
78187 +
78188 + if ((rc = direct_remap_pfn_range(
78189 + vma,
78190 + msg.va & PAGE_MASK,
78191 + msg.mfn,
78192 + msg.npages << PAGE_SHIFT,
78193 + vma->vm_page_prot,
78194 + mmapcmd.dom)) < 0)
78195 + goto mmap_out;
78196 +
78197 + p++;
78198 + va += msg.npages << PAGE_SHIFT;
78199 + }
78200 +
78201 + rc = 0;
78202 +
78203 + mmap_out:
78204 + up_read(&mm->mmap_sem);
78205 + ret = rc;
78206 + }
78207 + break;
78208 +
78209 + case IOCTL_PRIVCMD_MMAPBATCH: {
78210 + privcmd_mmapbatch_t m;
78211 + struct mm_struct *mm = current->mm;
78212 + struct vm_area_struct *vma;
78213 + xen_pfn_t __user *p;
78214 + unsigned long addr, mfn, nr_pages;
78215 + int i;
78216 +
78217 + if (!is_initial_xendomain())
78218 + return -EPERM;
78219 +
78220 + if (copy_from_user(&m, udata, sizeof(m)))
78221 + return -EFAULT;
78222 +
78223 + nr_pages = m.num;
78224 + if ((m.num <= 0) || (nr_pages > (LONG_MAX >> PAGE_SHIFT)))
78225 + return -EINVAL;
78226 +
78227 + down_read(&mm->mmap_sem);
78228 +
78229 + vma = find_vma(mm, m.addr);
78230 + if (!vma ||
78231 + (m.addr != vma->vm_start) ||
78232 + ((m.addr + (nr_pages << PAGE_SHIFT)) != vma->vm_end) ||
78233 + !privcmd_enforce_singleshot_mapping(vma)) {
78234 + up_read(&mm->mmap_sem);
78235 + return -EINVAL;
78236 + }
78237 +
78238 + p = m.arr;
78239 + addr = m.addr;
78240 + for (i = 0; i < nr_pages; i++, addr += PAGE_SIZE, p++) {
78241 + if (get_user(mfn, p)) {
78242 + up_read(&mm->mmap_sem);
78243 + return -EFAULT;
78244 + }
78245 +
78246 + ret = direct_remap_pfn_range(vma, addr & PAGE_MASK,
78247 + mfn, PAGE_SIZE,
78248 + vma->vm_page_prot, m.dom);
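+ /* Report a failed mapping back to user space by marking the MFN slot. */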
78249 + if (ret < 0)
78250 + put_user(0xF0000000 | mfn, p);
78251 + }
78252 +
78253 + up_read(&mm->mmap_sem);
78254 + ret = 0;
78255 + }
78256 + break;
78257 +
78258 + default:
78259 + ret = -EINVAL;
78260 + break;
78261 + }
78262 +
78263 + return ret;
78264 +}
78265 +
78266 +#ifndef HAVE_ARCH_PRIVCMD_MMAP
78267 +static struct page *privcmd_nopage(struct vm_area_struct *vma,
78268 + unsigned long address,
78269 + int *type)
78270 +{
78271 + return NOPAGE_SIGBUS;
78272 +}
78273 +
78274 +static struct vm_operations_struct privcmd_vm_ops = {
78275 + .nopage = privcmd_nopage
78276 +};
78277 +
78278 +static int privcmd_mmap(struct file * file, struct vm_area_struct * vma)
78279 +{
78280 + /* Unsupported for auto-translate guests. */
78281 + if (xen_feature(XENFEAT_auto_translated_physmap))
78282 + return -ENOSYS;
78283 +
78284 + /* DONTCOPY is essential for Xen as copy_page_range is broken. */
78285 + vma->vm_flags |= VM_RESERVED | VM_IO | VM_DONTCOPY;
78286 + vma->vm_ops = &privcmd_vm_ops;
78287 + vma->vm_private_data = NULL;
78288 +
78289 + return 0;
78290 +}
78291 +
78292 +static int privcmd_enforce_singleshot_mapping(struct vm_area_struct *vma)
78293 +{
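+ /* Atomically claim the VMA: only the first caller sees NULL here. */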
78294 + return (xchg(&vma->vm_private_data, (void *)1) == NULL);
78295 +}
78296 +#endif
78297 +
78298 +static struct file_operations privcmd_file_ops = {
78299 + .ioctl = privcmd_ioctl,
78300 + .mmap = privcmd_mmap,
78301 +};
78302 +
78303 +static int capabilities_read(char *page, char **start, off_t off,
78304 + int count, int *eof, void *data)
78305 +{
78306 + int len = 0;
78307 + *page = 0;
78308 +
78309 + if (is_initial_xendomain())
78310 + len = sprintf( page, "control_d\n" );
78311 +
78312 + *eof = 1;
78313 + return len;
78314 +}
78315 +
78316 +static int __init privcmd_init(void)
78317 +{
78318 + if (!is_running_on_xen())
78319 + return -ENODEV;
78320 +
78321 + privcmd_intf = create_xen_proc_entry("privcmd", 0400);
78322 + if (privcmd_intf != NULL)
78323 + privcmd_intf->proc_fops = &privcmd_file_ops;
78324 +
78325 + capabilities_intf = create_xen_proc_entry("capabilities", 0400 );
78326 + if (capabilities_intf != NULL)
78327 + capabilities_intf->read_proc = capabilities_read;
78328 +
78329 + return 0;
78330 +}
78331 +
78332 +__initcall(privcmd_init);
78333 diff -Nur linux-2.6.16.33-noxen/drivers/xen/tpmback/Makefile linux-2.6.16.33/drivers/xen/tpmback/Makefile
78334 --- linux-2.6.16.33-noxen/drivers/xen/tpmback/Makefile 1970-01-01 00:00:00.000000000 +0000
78335 +++ linux-2.6.16.33/drivers/xen/tpmback/Makefile 2007-01-08 15:00:45.000000000 +0000
78336 @@ -0,0 +1,4 @@
78337 +
78338 +obj-$(CONFIG_XEN_TPMDEV_BACKEND) += tpmbk.o
78339 +
78340 +tpmbk-y += tpmback.o interface.o xenbus.o
78341 diff -Nur linux-2.6.16.33-noxen/drivers/xen/tpmback/common.h linux-2.6.16.33/drivers/xen/tpmback/common.h
78342 --- linux-2.6.16.33-noxen/drivers/xen/tpmback/common.h 1970-01-01 00:00:00.000000000 +0000
78343 +++ linux-2.6.16.33/drivers/xen/tpmback/common.h 2007-01-08 15:00:45.000000000 +0000
78344 @@ -0,0 +1,87 @@
78345 +/******************************************************************************
78346 + * drivers/xen/tpmback/common.h
78347 + */
78348 +
78349 +#ifndef __NETIF__BACKEND__COMMON_H__
78350 +#define __NETIF__BACKEND__COMMON_H__
78351 +
78352 +#include <linux/config.h>
78353 +#include <linux/version.h>
78354 +#include <linux/module.h>
78355 +#include <linux/interrupt.h>
78356 +#include <linux/slab.h>
78357 +#include <xen/evtchn.h>
78358 +#include <xen/driver_util.h>
78359 +#include <xen/interface/grant_table.h>
78360 +#include <xen/interface/io/tpmif.h>
78361 +#include <asm/io.h>
78362 +#include <asm/pgalloc.h>
78363 +
78364 +#define DPRINTK(_f, _a...) \
78365 + pr_debug("(file=%s, line=%d) " _f, \
78366 + __FILE__ , __LINE__ , ## _a )
78367 +
78368 +struct backend_info;
78369 +
78370 +typedef struct tpmif_st {
78371 + struct list_head tpmif_list;
78372 + /* Unique identifier for this interface. */
78373 + domid_t domid;
78374 + unsigned int handle;
78375 +
78376 + /* Physical parameters of the comms window. */
78377 + unsigned int evtchn;
78378 + unsigned int irq;
78379 +
78380 + /* The shared rings and indexes. */
78381 + tpmif_tx_interface_t *tx;
78382 + struct vm_struct *tx_area;
78383 +
78384 + /* Miscellaneous private stuff. */
78385 + enum { DISCONNECTED, DISCONNECTING, CONNECTED } status;
78386 + int active;
78387 +
78388 + struct tpmif_st *hash_next;
78389 + struct list_head list; /* scheduling list */
78390 + atomic_t refcnt;
78391 +
78392 + struct backend_info *bi;
78393 +
78394 + grant_handle_t shmem_handle;
78395 + grant_ref_t shmem_ref;
78396 + struct page **mmap_pages;
78397 +
78398 + char devname[20];
78399 +} tpmif_t;
78400 +
78401 +void tpmif_disconnect_complete(tpmif_t * tpmif);
78402 +tpmif_t *tpmif_find(domid_t domid, struct backend_info *bi);
78403 +void tpmif_interface_init(void);
78404 +void tpmif_interface_exit(void);
78405 +void tpmif_schedule_work(tpmif_t * tpmif);
78406 +void tpmif_deschedule_work(tpmif_t * tpmif);
78407 +void tpmif_xenbus_init(void);
78408 +void tpmif_xenbus_exit(void);
78409 +int tpmif_map(tpmif_t *tpmif, unsigned long shared_page, unsigned int evtchn);
78410 +irqreturn_t tpmif_be_int(int irq, void *dev_id, struct pt_regs *regs);
78411 +
78412 +long int tpmback_get_instance(struct backend_info *bi);
78413 +
78414 +int vtpm_release_packets(tpmif_t * tpmif, int send_msgs);
78415 +
78416 +
78417 +#define tpmif_get(_b) (atomic_inc(&(_b)->refcnt))
78418 +#define tpmif_put(_b) \
78419 + do { \
78420 + if (atomic_dec_and_test(&(_b)->refcnt)) \
78421 + tpmif_disconnect_complete(_b); \
78422 + } while (0)
78423 +
78424 +extern int num_frontends;
78425 +
78426 +static inline unsigned long idx_to_kaddr(tpmif_t *t, unsigned int idx)
78427 +{
78428 + return (unsigned long)pfn_to_kaddr(page_to_pfn(t->mmap_pages[idx]));
78429 +}
78430 +
78431 +#endif /* __NETIF__BACKEND__COMMON_H__ */
78432 diff -Nur linux-2.6.16.33-noxen/drivers/xen/tpmback/interface.c linux-2.6.16.33/drivers/xen/tpmback/interface.c
78433 --- linux-2.6.16.33-noxen/drivers/xen/tpmback/interface.c 1970-01-01 00:00:00.000000000 +0000
78434 +++ linux-2.6.16.33/drivers/xen/tpmback/interface.c 2007-01-08 15:00:45.000000000 +0000
78435 @@ -0,0 +1,182 @@
78436 + /*****************************************************************************
78437 + * drivers/xen/tpmback/interface.c
78438 + *
78439 + * Virtual TPM interface management.
78440 + *
78441 + * Copyright (c) 2005, IBM Corporation
78442 + *
78443 + * Author: Stefan Berger, stefanb@us.ibm.com
78444 + *
78445 + * This code has been derived from drivers/xen/netback/interface.c
78446 + * Copyright (c) 2004, Keir Fraser
78447 + */
78448 +
78449 +#include "common.h"
78450 +#include <xen/balloon.h>
78451 +#include <xen/gnttab.h>
78452 +
78453 +static kmem_cache_t *tpmif_cachep;
78454 +int num_frontends = 0;
78455 +
78456 +LIST_HEAD(tpmif_list);
78457 +
78458 +static tpmif_t *alloc_tpmif(domid_t domid, struct backend_info *bi)
78459 +{
78460 + tpmif_t *tpmif;
78461 +
78462 + tpmif = kmem_cache_alloc(tpmif_cachep, GFP_KERNEL);
78463 + if (tpmif == NULL)
78464 + goto out_of_memory;
78465 +
78466 + memset(tpmif, 0, sizeof (*tpmif));
78467 + tpmif->domid = domid;
78468 + tpmif->status = DISCONNECTED;
78469 + tpmif->bi = bi;
78470 + snprintf(tpmif->devname, sizeof(tpmif->devname), "tpmif%d", domid);
78471 + atomic_set(&tpmif->refcnt, 1);
78472 +
78473 + tpmif->mmap_pages = alloc_empty_pages_and_pagevec(TPMIF_TX_RING_SIZE);
78474 + if (tpmif->mmap_pages == NULL)
78475 + goto out_of_memory;
78476 +
78477 + list_add(&tpmif->tpmif_list, &tpmif_list);
78478 + num_frontends++;
78479 +
78480 + return tpmif;
78481 +
78482 + out_of_memory:
78483 + if (tpmif != NULL)
78484 + kmem_cache_free(tpmif_cachep, tpmif);
78485 + printk("%s: out of memory\n", __FUNCTION__);
78486 + return ERR_PTR(-ENOMEM);
78487 +}
78488 +
78489 +static void free_tpmif(tpmif_t * tpmif)
78490 +{
78491 + num_frontends--;
78492 + list_del(&tpmif->tpmif_list);
78493 + free_empty_pages_and_pagevec(tpmif->mmap_pages, TPMIF_TX_RING_SIZE);
78494 + kmem_cache_free(tpmif_cachep, tpmif);
78495 +}
78496 +
78497 +tpmif_t *tpmif_find(domid_t domid, struct backend_info *bi)
78498 +{
78499 + tpmif_t *tpmif;
78500 +
78501 + list_for_each_entry(tpmif, &tpmif_list, tpmif_list) {
78502 + if (tpmif->bi == bi) {
78503 + if (tpmif->domid == domid) {
78504 + tpmif_get(tpmif);
78505 + return tpmif;
78506 + } else {
78507 + return ERR_PTR(-EEXIST);
78508 + }
78509 + }
78510 + }
78511 +
78512 + return alloc_tpmif(domid, bi);
78513 +}
78514 +
78515 +static int map_frontend_page(tpmif_t *tpmif, unsigned long shared_page)
78516 +{
78517 + int ret;
78518 + struct gnttab_map_grant_ref op;
78519 +
78520 + gnttab_set_map_op(&op, (unsigned long)tpmif->tx_area->addr,
78521 + GNTMAP_host_map, shared_page, tpmif->domid);
78522 +
78523 + lock_vm_area(tpmif->tx_area);
78524 + ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1);
78525 + unlock_vm_area(tpmif->tx_area);
78526 + BUG_ON(ret);
78527 +
78528 + if (op.status) {
78529 + DPRINTK(" Grant table operation failure !\n");
78530 + return op.status;
78531 + }
78532 +
78533 + tpmif->shmem_ref = shared_page;
78534 + tpmif->shmem_handle = op.handle;
78535 +
78536 + return 0;
78537 +}
78538 +
78539 +static void unmap_frontend_page(tpmif_t *tpmif)
78540 +{
78541 + struct gnttab_unmap_grant_ref op;
78542 + int ret;
78543 +
78544 + gnttab_set_unmap_op(&op, (unsigned long)tpmif->tx_area->addr,
78545 + GNTMAP_host_map, tpmif->shmem_handle);
78546 +
78547 + lock_vm_area(tpmif->tx_area);
78548 + ret = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1);
78549 + unlock_vm_area(tpmif->tx_area);
78550 + BUG_ON(ret);
78551 +}
78552 +
78553 +int tpmif_map(tpmif_t *tpmif, unsigned long shared_page, unsigned int evtchn)
78554 +{
78555 + int err;
78556 + struct evtchn_bind_interdomain bind_interdomain;
78557 +
78558 + if (tpmif->irq) {
78559 + return 0;
78560 + }
78561 +
78562 + if ((tpmif->tx_area = alloc_vm_area(PAGE_SIZE)) == NULL)
78563 + return -ENOMEM;
78564 +
78565 + err = map_frontend_page(tpmif, shared_page);
78566 + if (err) {
78567 + free_vm_area(tpmif->tx_area);
78568 + return err;
78569 + }
78570 +
78571 +
78572 + bind_interdomain.remote_dom = tpmif->domid;
78573 + bind_interdomain.remote_port = evtchn;
78574 +
78575 + err = HYPERVISOR_event_channel_op(EVTCHNOP_bind_interdomain,
78576 + &bind_interdomain);
78577 + if (err) {
78578 + unmap_frontend_page(tpmif);
78579 + free_vm_area(tpmif->tx_area);
78580 + return err;
78581 + }
78582 +
78583 + tpmif->evtchn = bind_interdomain.local_port;
78584 +
78585 + tpmif->tx = (tpmif_tx_interface_t *)tpmif->tx_area->addr;
78586 +
78587 + tpmif->irq = bind_evtchn_to_irqhandler(
78588 + tpmif->evtchn, tpmif_be_int, 0, tpmif->devname, tpmif);
78589 + tpmif->shmem_ref = shared_page;
78590 + tpmif->active = 1;
78591 +
78592 + return 0;
78593 +}
78594 +
78595 +void tpmif_disconnect_complete(tpmif_t *tpmif)
78596 +{
78597 + if (tpmif->irq)
78598 + unbind_from_irqhandler(tpmif->irq, tpmif);
78599 +
78600 + if (tpmif->tx) {
78601 + unmap_frontend_page(tpmif);
78602 + free_vm_area(tpmif->tx_area);
78603 + }
78604 +
78605 + free_tpmif(tpmif);
78606 +}
78607 +
78608 +void __init tpmif_interface_init(void)
78609 +{
78610 + tpmif_cachep = kmem_cache_create("tpmif_cache", sizeof (tpmif_t),
78611 + 0, 0, NULL, NULL);
78612 +}
78613 +
78614 +void __exit tpmif_interface_exit(void)
78615 +{
78616 + kmem_cache_destroy(tpmif_cachep);
78617 +}
78618 diff -Nur linux-2.6.16.33-noxen/drivers/xen/tpmback/tpmback.c linux-2.6.16.33/drivers/xen/tpmback/tpmback.c
78619 --- linux-2.6.16.33-noxen/drivers/xen/tpmback/tpmback.c 1970-01-01 00:00:00.000000000 +0000
78620 +++ linux-2.6.16.33/drivers/xen/tpmback/tpmback.c 2007-01-08 15:00:45.000000000 +0000
78621 @@ -0,0 +1,944 @@
78622 +/******************************************************************************
78623 + * drivers/xen/tpmback/tpmback.c
78624 + *
78625 + * Copyright (c) 2005, IBM Corporation
78626 + *
78627 + * Author: Stefan Berger, stefanb@us.ibm.com
78628 + * Grant table support: Mahadevan Gomathisankaran
78629 + *
78630 + * This code has been derived from drivers/xen/netback/netback.c
78631 + * Copyright (c) 2002-2004, K A Fraser
78632 + *
78633 + */
78634 +
78635 +#include "common.h"
78636 +#include <xen/evtchn.h>
78637 +
78638 +#include <linux/types.h>
78639 +#include <linux/list.h>
78640 +#include <linux/miscdevice.h>
78641 +#include <linux/poll.h>
78642 +#include <asm/uaccess.h>
78643 +#include <xen/xenbus.h>
78644 +#include <xen/interface/grant_table.h>
78645 +#include <xen/gnttab.h>
78646 +
78647 +/* local data structures */
78648 +struct data_exchange {
78649 + struct list_head pending_pak;
78650 + struct list_head current_pak;
78651 + unsigned int copied_so_far;
78652 + u8 has_opener:1;
78653 + u8 aborted:1;
78654 + rwlock_t pak_lock; // protects all of the previous fields
78655 + wait_queue_head_t wait_queue;
78656 +};
78657 +
78658 +struct vtpm_resp_hdr {
78659 + uint32_t instance_no;
78660 + uint16_t tag_no;
78661 + uint32_t len_no;
78662 + uint32_t ordinal_no;
78663 +} __attribute__ ((packed));
78664 +
78665 +struct packet {
78666 + struct list_head next;
78667 + unsigned int data_len;
78668 + u8 *data_buffer;
78669 + tpmif_t *tpmif;
78670 + u32 tpm_instance;
78671 + u8 req_tag;
78672 + u32 last_read;
78673 + u8 flags;
78674 + struct timer_list processing_timer;
78675 +};
78676 +
78677 +enum {
78678 + PACKET_FLAG_DISCARD_RESPONSE = 1,
78679 +};
78680 +
78681 +/* local variables */
78682 +static struct data_exchange dataex;
78683 +
78684 +/* local function prototypes */
78685 +static int _packet_write(struct packet *pak,
78686 + const char *data, size_t size, int userbuffer);
78687 +static void processing_timeout(unsigned long ptr);
78688 +static int packet_read_shmem(struct packet *pak,
78689 + tpmif_t * tpmif,
78690 + u32 offset,
78691 + char *buffer, int isuserbuffer, u32 left);
78692 +static int vtpm_queue_packet(struct packet *pak);
78693 +
78694 +/***************************************************************
78695 + Buffer copying for user and kernel space buffers.
78696 +***************************************************************/
78697 +static inline int copy_from_buffer(void *to,
78698 + const void *from, unsigned long size,
78699 + int isuserbuffer)
78700 +{
78701 + if (isuserbuffer) {
78702 + if (copy_from_user(to, (void __user *)from, size))
78703 + return -EFAULT;
78704 + } else {
78705 + memcpy(to, from, size);
78706 + }
78707 + return 0;
78708 +}
78709 +
78710 +static inline int copy_to_buffer(void *to,
78711 + const void *from, unsigned long size,
78712 + int isuserbuffer)
78713 +{
78714 + if (isuserbuffer) {
78715 + if (copy_to_user((void __user *)to, from, size))
78716 + return -EFAULT;
78717 + } else {
78718 + memcpy(to, from, size);
78719 + }
78720 + return 0;
78721 +}
78722 +
78723 +
78724 +static void dataex_init(struct data_exchange *dataex)
78725 +{
78726 + INIT_LIST_HEAD(&dataex->pending_pak);
78727 + INIT_LIST_HEAD(&dataex->current_pak);
78728 + dataex->has_opener = 0;
78729 + rwlock_init(&dataex->pak_lock);
78730 + init_waitqueue_head(&dataex->wait_queue);
78731 +}
78732 +
78733 +/***************************************************************
78734 + Packet-related functions
78735 +***************************************************************/
78736 +
78737 +static struct packet *packet_find_instance(struct list_head *head,
78738 + u32 tpm_instance)
78739 +{
78740 + struct packet *pak;
78741 + struct list_head *p;
78742 +
78743 + /*
78744 + * traverse the list of packets and return the first
78745 + * one with the given instance number
78746 + */
78747 + list_for_each(p, head) {
78748 + pak = list_entry(p, struct packet, next);
78749 +
78750 + if (pak->tpm_instance == tpm_instance) {
78751 + return pak;
78752 + }
78753 + }
78754 + return NULL;
78755 +}
78756 +
78757 +static struct packet *packet_find_packet(struct list_head *head, void *packet)
78758 +{
78759 + struct packet *pak;
78760 + struct list_head *p;
78761 +
78762 + /*
78763 + * traverse the list of packets and return the one
78764 + * matching the given packet pointer
78765 + */
78766 + list_for_each(p, head) {
78767 + pak = list_entry(p, struct packet, next);
78768 +
78769 + if (pak == packet) {
78770 + return pak;
78771 + }
78772 + }
78773 + return NULL;
78774 +}
78775 +
78776 +static struct packet *packet_alloc(tpmif_t * tpmif,
78777 + u32 size, u8 req_tag, u8 flags)
78778 +{
78779 + struct packet *pak = NULL;
78780 + pak = kzalloc(sizeof (struct packet), GFP_ATOMIC);
78781 + if (NULL != pak) {
78782 + if (tpmif) {
78783 + pak->tpmif = tpmif;
78784 + pak->tpm_instance = tpmback_get_instance(tpmif->bi);
78785 + tpmif_get(tpmif);
78786 + }
78787 + pak->data_len = size;
78788 + pak->req_tag = req_tag;
78789 + pak->last_read = 0;
78790 + pak->flags = flags;
78791 +
78792 + /*
78793 + * cannot do tpmif_get(tpmif); bad things happen
78794 + * on the last tpmif_put()
78795 + */
78796 + init_timer(&pak->processing_timer);
78797 + pak->processing_timer.function = processing_timeout;
78798 + pak->processing_timer.data = (unsigned long)pak;
78799 + }
78800 + return pak;
78801 +}
78802 +
78803 +static void inline packet_reset(struct packet *pak)
78804 +{
78805 + pak->last_read = 0;
78806 +}
78807 +
78808 +static void packet_free(struct packet *pak)
78809 +{
78810 + if (timer_pending(&pak->processing_timer)) {
78811 + BUG();
78812 + }
78813 +
78814 + if (pak->tpmif)
78815 + tpmif_put(pak->tpmif);
78816 + kfree(pak->data_buffer);
78817 + /*
78818 + * cannot do tpmif_put(pak->tpmif); bad things happen
78819 + * on the last tpmif_put()
78820 + */
78821 + kfree(pak);
78822 +}
78823 +
78824 +
78825 +/*
78826 + * Write data to the shared memory and send it to the FE.
78827 + */
78828 +static int packet_write(struct packet *pak,
78829 + const char *data, size_t size, int isuserbuffer)
78830 +{
78831 + int rc = 0;
78832 +
78833 + if (0 != (pak->flags & PACKET_FLAG_DISCARD_RESPONSE)) {
78834 + /* Don't send a response to this packet. Just acknowledge it. */
78835 + rc = size;
78836 + } else {
78837 + rc = _packet_write(pak, data, size, isuserbuffer);
78838 + }
78839 +
78840 + return rc;
78841 +}
78842 +
78843 +int _packet_write(struct packet *pak,
78844 + const char *data, size_t size, int isuserbuffer)
78845 +{
78846 + /*
78847 + * Write into the shared memory pages directly
78848 + * and send it to the front end.
78849 + */
78850 + tpmif_t *tpmif = pak->tpmif;
78851 + grant_handle_t handle;
78852 + int rc = 0;
78853 + unsigned int i = 0;
78854 + unsigned int offset = 0;
78855 +
78856 + if (tpmif == NULL) {
78857 + return -EFAULT;
78858 + }
78859 +
78860 + if (tpmif->status == DISCONNECTED) {
78861 + return size;
78862 + }
78863 +
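+ /*
+  * Map each page granted by the frontend, copy up to a page of data
+  * into it, then unmap it again before moving to the next ring slot.
+  */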
78864 + while (offset < size && i < TPMIF_TX_RING_SIZE) {
78865 + unsigned int tocopy;
78866 + struct gnttab_map_grant_ref map_op;
78867 + struct gnttab_unmap_grant_ref unmap_op;
78868 + tpmif_tx_request_t *tx;
78869 +
78870 + tx = &tpmif->tx->ring[i].req;
78871 +
78872 + if (0 == tx->addr) {
78873 + DPRINTK("ERROR: Buffer for outgoing packet NULL?! i=%d\n", i);
78874 + return 0;
78875 + }
78876 +
78877 + gnttab_set_map_op(&map_op, idx_to_kaddr(tpmif, i),
78878 + GNTMAP_host_map, tx->ref, tpmif->domid);
78879 +
78880 + if (unlikely(HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
78881 + &map_op, 1))) {
78882 + BUG();
78883 + }
78884 +
78885 + handle = map_op.handle;
78886 +
78887 + if (map_op.status) {
78888 + DPRINTK(" Grant table operation failure !\n");
78889 + return 0;
78890 + }
78891 +
78892 + tocopy = min_t(size_t, size - offset, PAGE_SIZE);
78893 +
78894 + if (copy_from_buffer((void *)(idx_to_kaddr(tpmif, i) |
78895 + (tx->addr & ~PAGE_MASK)),
78896 + &data[offset], tocopy, isuserbuffer)) {
78897 + tpmif_put(tpmif);
78898 + return -EFAULT;
78899 + }
78900 + tx->size = tocopy;
78901 +
78902 + gnttab_set_unmap_op(&unmap_op, idx_to_kaddr(tpmif, i),
78903 + GNTMAP_host_map, handle);
78904 +
78905 + if (unlikely
78906 + (HYPERVISOR_grant_table_op
78907 + (GNTTABOP_unmap_grant_ref, &unmap_op, 1))) {
78908 + BUG();
78909 + }
78910 +
78911 + offset += tocopy;
78912 + i++;
78913 + }
78914 +
78915 + rc = offset;
78916 + DPRINTK("Notifying frontend via irq %d\n", tpmif->irq);
78917 + notify_remote_via_irq(tpmif->irq);
78918 +
78919 + return rc;
78920 +}
78921 +
78922 +/*
78923 + * Read data from the shared memory and copy it directly into the
78924 + * provided buffer. Advance the read_last indicator which tells
78925 + * how many bytes have already been read.
78926 + */
78927 +static int packet_read(struct packet *pak, size_t numbytes,
78928 + char *buffer, size_t buffersize, int isuserbuffer)
78929 +{
78930 + tpmif_t *tpmif = pak->tpmif;
78931 +
78932 + /*
78933 + * Read 'numbytes' of data from the buffer. The first 4
78934 + * bytes are the instance number in network byte order,
78935 + * after that come the data from the shared memory buffer.
78936 + */
78937 + u32 to_copy;
78938 + u32 offset = 0;
78939 + u32 room_left = buffersize;
78940 +
78941 + if (pak->last_read < 4) {
78942 + /*
78943 + * copy the instance number into the buffer
78944 + */
78945 + u32 instance_no = htonl(pak->tpm_instance);
78946 + u32 last_read = pak->last_read;
78947 +
78948 + to_copy = min_t(size_t, 4 - last_read, numbytes);
78949 +
78950 + if (copy_to_buffer(&buffer[0],
78951 + &(((u8 *) & instance_no)[last_read]),
78952 + to_copy, isuserbuffer)) {
78953 + return -EFAULT;
78954 + }
78955 +
78956 + pak->last_read += to_copy;
78957 + offset += to_copy;
78958 + room_left -= to_copy;
78959 + }
78960 +
78961 + /*
78962 + * If the packet has a data buffer appended, read from it...
78963 + */
78964 +
78965 + if (room_left > 0) {
78966 + if (pak->data_buffer) {
78967 + u32 to_copy = min_t(u32, pak->data_len - offset, room_left);
78968 + u32 last_read = pak->last_read - 4;
78969 +
78970 + if (copy_to_buffer(&buffer[offset],
78971 + &pak->data_buffer[last_read],
78972 + to_copy, isuserbuffer)) {
78973 + return -EFAULT;
78974 + }
78975 + pak->last_read += to_copy;
78976 + offset += to_copy;
78977 + } else {
78978 + offset = packet_read_shmem(pak,
78979 + tpmif,
78980 + offset,
78981 + buffer,
78982 + isuserbuffer, room_left);
78983 + }
78984 + }
78985 + return offset;
78986 +}
78987 +
78988 +static int packet_read_shmem(struct packet *pak,
78989 + tpmif_t * tpmif,
78990 + u32 offset, char *buffer, int isuserbuffer,
78991 + u32 room_left)
78992 +{
78993 + u32 last_read = pak->last_read - 4;
78994 + u32 i = (last_read / PAGE_SIZE);
78995 + u32 pg_offset = last_read & (PAGE_SIZE - 1);
78996 + u32 to_copy;
78997 + grant_handle_t handle;
78998 +
78999 + tpmif_tx_request_t *tx;
79000 +
79001 + tx = &tpmif->tx->ring[0].req;
79002 + /*
79003 + * Start copying data at the page with index 'index'
79004 + * and within that page at offset 'offset'.
79005 + * Copy a maximum of 'room_left' bytes.
79006 + */
79007 + to_copy = min_t(u32, PAGE_SIZE - pg_offset, room_left);
79008 + while (to_copy > 0) {
79009 + void *src;
79010 + struct gnttab_map_grant_ref map_op;
79011 + struct gnttab_unmap_grant_ref unmap_op;
79012 +
79013 + tx = &tpmif->tx->ring[i].req;
79014 +
79015 + gnttab_set_map_op(&map_op, idx_to_kaddr(tpmif, i),
79016 + GNTMAP_host_map, tx->ref, tpmif->domid);
79017 +
79018 + if (unlikely(HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
79019 + &map_op, 1))) {
79020 + BUG();
79021 + }
79022 +
79023 + if (map_op.status) {
79024 + DPRINTK(" Grant table operation failure !\n");
79025 + return -EFAULT;
79026 + }
79027 +
79028 + handle = map_op.handle;
79029 +
79030 + if (to_copy > tx->size) {
79031 + /*
79032 + * User requests more than what's available
79033 + */
79034 + to_copy = min_t(u32, tx->size, to_copy);
79035 + }
79036 +
79037 + DPRINTK("Copying from mapped memory at %08lx\n",
79038 + (unsigned long)(idx_to_kaddr(tpmif, i) |
79039 + (tx->addr & ~PAGE_MASK)));
79040 +
79041 + src = (void *)(idx_to_kaddr(tpmif, i) |
79042 + ((tx->addr & ~PAGE_MASK) + pg_offset));
79043 + if (copy_to_buffer(&buffer[offset],
79044 + src, to_copy, isuserbuffer)) {
79045 + return -EFAULT;
79046 + }
79047 +
79048 + DPRINTK("Data from TPM-FE of domain %d are %d %d %d %d\n",
79049 + tpmif->domid, buffer[offset], buffer[offset + 1],
79050 + buffer[offset + 2], buffer[offset + 3]);
79051 +
79052 + gnttab_set_unmap_op(&unmap_op, idx_to_kaddr(tpmif, i),
79053 + GNTMAP_host_map, handle);
79054 +
79055 + if (unlikely
79056 + (HYPERVISOR_grant_table_op
79057 + (GNTTABOP_unmap_grant_ref, &unmap_op, 1))) {
79058 + BUG();
79059 + }
79060 +
79061 + offset += to_copy;
79062 + pg_offset = 0;
79063 + last_read += to_copy;
79064 + room_left -= to_copy;
79065 +
79066 + to_copy = min_t(u32, PAGE_SIZE, room_left);
79067 + i++;
79068 + } /* while (to_copy > 0) */
79069 + /*
79070 + * Adjust the last_read pointer
79071 + */
79072 + pak->last_read = last_read + 4;
79073 + return offset;
79074 +}
79075 +
79076 +/* ============================================================
79077 + * The file layer for reading data from this device
79078 + * ============================================================
79079 + */
79080 +static int vtpm_op_open(struct inode *inode, struct file *f)
79081 +{
79082 + int rc = 0;
79083 + unsigned long flags;
79084 +
79085 + write_lock_irqsave(&dataex.pak_lock, flags);
79086 + if (dataex.has_opener == 0) {
79087 + dataex.has_opener = 1;
79088 + } else {
79089 + rc = -EPERM;
79090 + }
79091 + write_unlock_irqrestore(&dataex.pak_lock, flags);
79092 + return rc;
79093 +}
79094 +
79095 +static ssize_t vtpm_op_read(struct file *file,
79096 + char __user * data, size_t size, loff_t * offset)
79097 +{
79098 + int ret_size = -ENODATA;
79099 + struct packet *pak = NULL;
79100 + unsigned long flags;
79101 +
79102 + write_lock_irqsave(&dataex.pak_lock, flags);
79103 + if (dataex.aborted) {
79104 + dataex.aborted = 0;
79105 + dataex.copied_so_far = 0;
79106 + write_unlock_irqrestore(&dataex.pak_lock, flags);
79107 + return -EIO;
79108 + }
79109 +
79110 + if (list_empty(&dataex.pending_pak)) {
79111 + write_unlock_irqrestore(&dataex.pak_lock, flags);
79112 + wait_event_interruptible(dataex.wait_queue,
79113 + !list_empty(&dataex.pending_pak));
79114 + write_lock_irqsave(&dataex.pak_lock, flags);
79115 + dataex.copied_so_far = 0;
79116 + }
79117 +
79118 + if (!list_empty(&dataex.pending_pak)) {
79119 + unsigned int left;
79120 +
79121 + pak = list_entry(dataex.pending_pak.next, struct packet, next);
79122 + left = pak->data_len - dataex.copied_so_far;
79123 + list_del(&pak->next);
79124 + write_unlock_irqrestore(&dataex.pak_lock, flags);
79125 +
79126 + DPRINTK("size given by app: %d, available: %d\n", size, left);
79127 +
79128 + ret_size = min_t(size_t, size, left);
79129 +
79130 + ret_size = packet_read(pak, ret_size, data, size, 1);
79131 +
79132 + write_lock_irqsave(&dataex.pak_lock, flags);
79133 +
79134 + if (ret_size < 0) {
79135 + del_singleshot_timer_sync(&pak->processing_timer);
79136 + packet_free(pak);
79137 + dataex.copied_so_far = 0;
79138 + } else {
79139 + DPRINTK("Copied %d bytes to user buffer\n", ret_size);
79140 +
79141 + dataex.copied_so_far += ret_size;
79142 + if (dataex.copied_so_far >= pak->data_len + 4) {
79143 + DPRINTK("All data from this packet given to app.\n");
79144 + /* All data given to app */
79145 +
79146 + del_singleshot_timer_sync(&pak->
79147 + processing_timer);
79148 + list_add_tail(&pak->next, &dataex.current_pak);
79149 + /*
79150 + * The more frontends that are handled at the same time,
79151 + * the more time we give the TPM to process the request.
79152 + */
79153 + mod_timer(&pak->processing_timer,
79154 + jiffies + (num_frontends * 60 * HZ));
79155 + dataex.copied_so_far = 0;
79156 + } else {
79157 + list_add(&pak->next, &dataex.pending_pak);
79158 + }
79159 + }
79160 + }
79161 + write_unlock_irqrestore(&dataex.pak_lock, flags);
79162 +
79163 + DPRINTK("Returning result from read to app: %d\n", ret_size);
79164 +
79165 + return ret_size;
79166 +}
79167 +
79168 +/*
79169 + * Write operation - only works after a previous read operation!
79170 + */
79171 +static ssize_t vtpm_op_write(struct file *file,
79172 + const char __user * data, size_t size,
79173 + loff_t * offset)
79174 +{
79175 + struct packet *pak;
79176 + int rc = 0;
79177 + unsigned int off = 4;
79178 + unsigned long flags;
79179 + struct vtpm_resp_hdr vrh;
79180 +
79181 + /*
79182 + * Minimum required packet size is:
79183 + * 4 bytes for instance number
79184 + * 2 bytes for tag
79185 + * 4 bytes for paramSize
79186 + * 4 bytes for the ordinal
79187 + * sum: 14 bytes
79188 + */
79189 + if (size < sizeof (vrh))
79190 + return -EFAULT;
79191 +
79192 + if (copy_from_user(&vrh, data, sizeof (vrh)))
79193 + return -EFAULT;
79194 +
79195 + /* malformed packet? */
79196 + if ((off + ntohl(vrh.len_no)) != size)
79197 + return -EFAULT;
79198 +
79199 + write_lock_irqsave(&dataex.pak_lock, flags);
79200 + pak = packet_find_instance(&dataex.current_pak,
79201 + ntohl(vrh.instance_no));
79202 +
79203 + if (pak == NULL) {
79204 + write_unlock_irqrestore(&dataex.pak_lock, flags);
79205 + DPRINTK(KERN_ALERT "No associated packet! (inst=%d)\n",
79206 + ntohl(vrh.instance_no));
79207 + return -EFAULT;
79208 + }
79209 +
79210 + del_singleshot_timer_sync(&pak->processing_timer);
79211 + list_del(&pak->next);
79212 +
79213 + write_unlock_irqrestore(&dataex.pak_lock, flags);
79214 +
79215 + /*
79216 + * The first 'off' bytes are the instance number - skip them.
79217 + */
79218 + size -= off;
79219 +
79220 + rc = packet_write(pak, &data[off], size, 1);
79221 +
79222 + if (rc > 0) {
79223 + /* I neglected the first 4 bytes */
79224 + rc += off;
79225 + }
79226 + packet_free(pak);
79227 + return rc;
79228 +}
79229 +
79230 +static int vtpm_op_release(struct inode *inode, struct file *file)
79231 +{
79232 + unsigned long flags;
79233 +
79234 + vtpm_release_packets(NULL, 1);
79235 + write_lock_irqsave(&dataex.pak_lock, flags);
79236 + dataex.has_opener = 0;
79237 + write_unlock_irqrestore(&dataex.pak_lock, flags);
79238 + return 0;
79239 +}
79240 +
79241 +static unsigned int vtpm_op_poll(struct file *file,
79242 + struct poll_table_struct *pts)
79243 +{
79244 + unsigned int flags = POLLOUT | POLLWRNORM;
79245 +
79246 + poll_wait(file, &dataex.wait_queue, pts);
79247 + if (!list_empty(&dataex.pending_pak)) {
79248 + flags |= POLLIN | POLLRDNORM;
79249 + }
79250 + return flags;
79251 +}
79252 +
79253 +static struct file_operations vtpm_ops = {
79254 + .owner = THIS_MODULE,
79255 + .llseek = no_llseek,
79256 + .open = vtpm_op_open,
79257 + .read = vtpm_op_read,
79258 + .write = vtpm_op_write,
79259 + .release = vtpm_op_release,
79260 + .poll = vtpm_op_poll,
79261 +};
79262 +
79263 +static struct miscdevice vtpms_miscdevice = {
79264 + .minor = 225,
79265 + .name = "vtpm",
79266 + .fops = &vtpm_ops,
79267 +};
79268 +
79269 +/***************************************************************
79270 + Utility functions
79271 +***************************************************************/
79272 +
79273 +static int tpm_send_fail_message(struct packet *pak, u8 req_tag)
79274 +{
79275 + int rc;
79276 + static const unsigned char tpm_error_message_fail[] = {
79277 + 0x00, 0x00,
79278 + 0x00, 0x00, 0x00, 0x0a,
79279 + 0x00, 0x00, 0x00, 0x09 /* TPM_FAIL */
79280 + };
79281 + unsigned char buffer[sizeof (tpm_error_message_fail)];
79282 +
79283 + memcpy(buffer, tpm_error_message_fail,
79284 + sizeof (tpm_error_message_fail));
79285 + /*
79286 + * Insert the right response tag depending on the given tag
79287 + * All response tags are '+3' to the request tag.
79288 + */
79289 + buffer[1] = req_tag + 3;
79290 +
79291 + /*
79292 + * Write the data to shared memory and notify the front-end
79293 + */
79294 + rc = packet_write(pak, buffer, sizeof (buffer), 0);
79295 +
79296 + return rc;
79297 +}
79298 +
79299 +static int _vtpm_release_packets(struct list_head *head,
79300 + tpmif_t * tpmif, int send_msgs)
79301 +{
79302 + int aborted = 0;
79303 + int c = 0;
79304 + struct packet *pak;
79305 + struct list_head *pos, *tmp;
79306 +
79307 + list_for_each_safe(pos, tmp, head) {
79308 + pak = list_entry(pos, struct packet, next);
79309 + c += 1;
79310 +
79311 + if (tpmif == NULL || pak->tpmif == tpmif) {
79312 + int can_send = 0;
79313 +
79314 + del_singleshot_timer_sync(&pak->processing_timer);
79315 + list_del(&pak->next);
79316 +
79317 + if (pak->tpmif && pak->tpmif->status == CONNECTED) {
79318 + can_send = 1;
79319 + }
79320 +
79321 + if (send_msgs && can_send) {
79322 + tpm_send_fail_message(pak, pak->req_tag);
79323 + }
79324 + packet_free(pak);
79325 + if (c == 1)
79326 + aborted = 1;
79327 + }
79328 + }
79329 + return aborted;
79330 +}
79331 +
79332 +int vtpm_release_packets(tpmif_t * tpmif, int send_msgs)
79333 +{
79334 + unsigned long flags;
79335 +
79336 + write_lock_irqsave(&dataex.pak_lock, flags);
79337 +
79338 + dataex.aborted = _vtpm_release_packets(&dataex.pending_pak,
79339 + tpmif,
79340 + send_msgs);
79341 + _vtpm_release_packets(&dataex.current_pak, tpmif, send_msgs);
79342 +
79343 + write_unlock_irqrestore(&dataex.pak_lock, flags);
79344 + return 0;
79345 +}
79346 +
79347 +static int vtpm_queue_packet(struct packet *pak)
79348 +{
79349 + int rc = 0;
79350 +
79351 + if (dataex.has_opener) {
79352 + unsigned long flags;
79353 +
79354 + write_lock_irqsave(&dataex.pak_lock, flags);
79355 + list_add_tail(&pak->next, &dataex.pending_pak);
79356 + /* give the TPM some time to pick up the request */
79357 + mod_timer(&pak->processing_timer, jiffies + (30 * HZ));
79358 + write_unlock_irqrestore(&dataex.pak_lock, flags);
79359 +
79360 + wake_up_interruptible(&dataex.wait_queue);
79361 + } else {
79362 + rc = -EFAULT;
79363 + }
79364 + return rc;
79365 +}
79366 +
79367 +static int vtpm_receive(tpmif_t * tpmif, u32 size)
79368 +{
79369 + int rc = 0;
79370 + unsigned char buffer[10];
79371 + __be32 *native_size;
79372 + struct packet *pak = packet_alloc(tpmif, size, 0, 0);
79373 +
79374 + if (!pak)
79375 + return -ENOMEM;
79376 + /*
79377 + * Read 10 bytes from the received buffer to test its
79378 + * content for validity.
79379 + */
79380 + if (sizeof (buffer) != packet_read(pak,
79381 + sizeof (buffer), buffer,
79382 + sizeof (buffer), 0)) {
79383 + goto failexit;
79384 + }
79385 + /*
79386 + * Reset the packet read pointer so we can read all its
79387 + * contents again.
79388 + */
79389 + packet_reset(pak);
79390 +
79391 + native_size = (__force __be32 *) (&buffer[4 + 2]);
79392 + /*
79393 + * Verify that the size of the packet is correct
79394 + * as indicated and that there's actually someone reading packets.
79395 + * The minimum size of the packet is '10' for tag, size indicator
79396 + * and ordinal.
79397 + */
79398 + if (size < 10 ||
79399 + be32_to_cpu(*native_size) != size ||
79400 + 0 == dataex.has_opener || tpmif->status != CONNECTED) {
79401 + rc = -EINVAL;
79402 + goto failexit;
79403 + } else {
79404 + rc = vtpm_queue_packet(pak);
79405 + if (rc < 0)
79406 + goto failexit;
79407 + }
79408 + return 0;
79409 +
79410 + failexit:
79411 + if (pak) {
79412 + tpm_send_fail_message(pak, buffer[4 + 1]);
79413 + packet_free(pak);
79414 + }
79415 + return rc;
79416 +}
79417 +
79418 +/*
79419 + * Timeout function that gets invoked when a packet has not been processed
79420 + * during the timeout period.
79421 + * The packet must be on a list when this function is invoked. This
79422 + * also means that once it is taken off a list, the timer must be
79423 + * destroyed as well.
79424 + */
79425 +static void processing_timeout(unsigned long ptr)
79426 +{
79427 + struct packet *pak = (struct packet *)ptr;
79428 + unsigned long flags;
79429 +
79430 + write_lock_irqsave(&dataex.pak_lock, flags);
79431 + /*
79432 + * Check whether the packet is still on one
79433 + * of the lists.
79434 + */
79435 + if (pak == packet_find_packet(&dataex.pending_pak, pak) ||
79436 + pak == packet_find_packet(&dataex.current_pak, pak)) {
79437 + if ((pak->flags & PACKET_FLAG_DISCARD_RESPONSE) == 0) {
79438 + tpm_send_fail_message(pak, pak->req_tag);
79439 + }
79440 + /* discard future responses */
79441 + pak->flags |= PACKET_FLAG_DISCARD_RESPONSE;
79442 + }
79443 +
79444 + write_unlock_irqrestore(&dataex.pak_lock, flags);
79445 +}
79446 +
79447 +static void tpm_tx_action(unsigned long unused);
79448 +static DECLARE_TASKLET(tpm_tx_tasklet, tpm_tx_action, 0);
79449 +
79450 +static struct list_head tpm_schedule_list;
79451 +static spinlock_t tpm_schedule_list_lock;
79452 +
79453 +static inline void maybe_schedule_tx_action(void)
79454 +{
79455 + smp_mb();
79456 + tasklet_schedule(&tpm_tx_tasklet);
79457 +}
79458 +
79459 +static inline int __on_tpm_schedule_list(tpmif_t * tpmif)
79460 +{
79461 + return tpmif->list.next != NULL;
79462 +}
79463 +
79464 +static void remove_from_tpm_schedule_list(tpmif_t * tpmif)
79465 +{
79466 + spin_lock_irq(&tpm_schedule_list_lock);
79467 + if (likely(__on_tpm_schedule_list(tpmif))) {
79468 + list_del(&tpmif->list);
79469 + tpmif->list.next = NULL;
79470 + tpmif_put(tpmif);
79471 + }
79472 + spin_unlock_irq(&tpm_schedule_list_lock);
79473 +}
79474 +
79475 +static void add_to_tpm_schedule_list_tail(tpmif_t * tpmif)
79476 +{
79477 + if (__on_tpm_schedule_list(tpmif))
79478 + return;
79479 +
79480 + spin_lock_irq(&tpm_schedule_list_lock);
79481 + if (!__on_tpm_schedule_list(tpmif) && tpmif->active) {
79482 + list_add_tail(&tpmif->list, &tpm_schedule_list);
79483 + tpmif_get(tpmif);
79484 + }
79485 + spin_unlock_irq(&tpm_schedule_list_lock);
79486 +}
79487 +
79488 +void tpmif_schedule_work(tpmif_t * tpmif)
79489 +{
79490 + add_to_tpm_schedule_list_tail(tpmif);
79491 + maybe_schedule_tx_action();
79492 +}
79493 +
79494 +void tpmif_deschedule_work(tpmif_t * tpmif)
79495 +{
79496 + remove_from_tpm_schedule_list(tpmif);
79497 +}
79498 +
79499 +static void tpm_tx_action(unsigned long unused)
79500 +{
79501 + struct list_head *ent;
79502 + tpmif_t *tpmif;
79503 + tpmif_tx_request_t *tx;
79504 +
79505 + DPRINTK("%s: Getting data from front-end(s)!\n", __FUNCTION__);
79506 +
79507 + while (!list_empty(&tpm_schedule_list)) {
79508 + /* Get a tpmif from the list with work to do. */
79509 + ent = tpm_schedule_list.next;
79510 + tpmif = list_entry(ent, tpmif_t, list);
79511 + tpmif_get(tpmif);
79512 + remove_from_tpm_schedule_list(tpmif);
79513 +
79514 + tx = &tpmif->tx->ring[0].req;
79515 +
79516 + /* pass it up */
79517 + vtpm_receive(tpmif, tx->size);
79518 +
79519 + tpmif_put(tpmif);
79520 + }
79521 +}
79522 +
79523 +irqreturn_t tpmif_be_int(int irq, void *dev_id, struct pt_regs *regs)
79524 +{
79525 + tpmif_t *tpmif = (tpmif_t *) dev_id;
79526 +
79527 + add_to_tpm_schedule_list_tail(tpmif);
79528 + maybe_schedule_tx_action();
79529 + return IRQ_HANDLED;
79530 +}
79531 +
79532 +static int __init tpmback_init(void)
79533 +{
79534 + int rc;
79535 +
79536 + if ((rc = misc_register(&vtpms_miscdevice)) != 0) {
79537 + printk(KERN_ALERT
79538 + "Could not register misc device for TPM BE.\n");
79539 + return rc;
79540 + }
79541 +
79542 + dataex_init(&dataex);
79543 +
79544 + spin_lock_init(&tpm_schedule_list_lock);
79545 + INIT_LIST_HEAD(&tpm_schedule_list);
79546 +
79547 + tpmif_interface_init();
79548 + tpmif_xenbus_init();
79549 +
79550 + printk(KERN_ALERT "Successfully initialized TPM backend driver.\n");
79551 +
79552 + return 0;
79553 +}
79554 +
79555 +module_init(tpmback_init);
79556 +
79557 +void __exit tpmback_exit(void)
79558 +{
79559 + vtpm_release_packets(NULL, 0);
79560 + tpmif_xenbus_exit();
79561 + tpmif_interface_exit();
79562 + misc_deregister(&vtpms_miscdevice);
79563 +}
79564 +
79565 +MODULE_LICENSE("Dual BSD/GPL");
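
As an illustrative sketch (not taken from this patch): the misc device registered above ("vtpm", minor 225) hands each guest request to a userspace TPM emulator as a 4-byte instance number followed by the TPM command, and vtpm_op_write() expects the emulator to write back the same 4-byte instance prefix followed by a TPM response whose internal length field satisfies the off + ntohl(vrh.len_no) == size check. A minimal consumer could look roughly like the following; handle_tpm_command() is a hypothetical stand-in for the real emulator logic.

/* Sketch only -- not part of the patch. Mirrors the framing expected by
 * vtpm_op_read()/vtpm_op_write() above. */
#include <fcntl.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>

/* hypothetical emulator entry point */
size_t handle_tpm_command(const uint8_t *req, size_t req_len,
                          uint8_t *resp, size_t resp_max);

static void vtpm_serve(void)
{
	uint8_t req[4096], resp[4096];
	ssize_t n;
	size_t rlen;
	int fd = open("/dev/vtpm", O_RDWR);

	if (fd < 0)
		return;

	for (;;) {
		n = read(fd, req, sizeof(req));	/* [instance | TPM request] */
		if (n <= 4)
			continue;
		memcpy(resp, req, 4);		/* echo the instance number back */
		rlen = handle_tpm_command(req + 4, n - 4,
					  resp + 4, sizeof(resp) - 4);
		write(fd, resp, 4 + rlen);	/* [instance | TPM response] */
	}
}
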
79566 diff -Nur linux-2.6.16.33-noxen/drivers/xen/tpmback/xenbus.c linux-2.6.16.33/drivers/xen/tpmback/xenbus.c
79567 --- linux-2.6.16.33-noxen/drivers/xen/tpmback/xenbus.c 1970-01-01 00:00:00.000000000 +0000
79568 +++ linux-2.6.16.33/drivers/xen/tpmback/xenbus.c 2007-01-08 15:00:45.000000000 +0000
79569 @@ -0,0 +1,289 @@
79570 +/* Xenbus code for tpmif backend
79571 + Copyright (C) 2005 IBM Corporation
79572 + Copyright (C) 2005 Rusty Russell <rusty@rustcorp.com.au>
79573 +
79574 + This program is free software; you can redistribute it and/or modify
79575 + it under the terms of the GNU General Public License as published by
79576 + the Free Software Foundation; either version 2 of the License, or
79577 + (at your option) any later version.
79578 +
79579 + This program is distributed in the hope that it will be useful,
79580 + but WITHOUT ANY WARRANTY; without even the implied warranty of
79581 + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
79582 + GNU General Public License for more details.
79583 +
79584 + You should have received a copy of the GNU General Public License
79585 + along with this program; if not, write to the Free Software
79586 + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
79587 +*/
79588 +#include <stdarg.h>
79589 +#include <linux/module.h>
79590 +#include <xen/xenbus.h>
79591 +#include "common.h"
79592 +
79593 +struct backend_info
79594 +{
79595 + struct xenbus_device *dev;
79596 +
79597 + /* our communications channel */
79598 + tpmif_t *tpmif;
79599 +
79600 + long int frontend_id;
79601 + long int instance; // instance of TPM
79602 + u8 is_instance_set; // whether instance number has been set
79603 +
79604 + /* watch front end for changes */
79605 + struct xenbus_watch backend_watch;
79606 +};
79607 +
79608 +static void maybe_connect(struct backend_info *be);
79609 +static void connect(struct backend_info *be);
79610 +static int connect_ring(struct backend_info *be);
79611 +static void backend_changed(struct xenbus_watch *watch,
79612 + const char **vec, unsigned int len);
79613 +static void frontend_changed(struct xenbus_device *dev,
79614 + enum xenbus_state frontend_state);
79615 +
79616 +long int tpmback_get_instance(struct backend_info *bi)
79617 +{
79618 + long int res = -1;
79619 + if (bi && bi->is_instance_set)
79620 + res = bi->instance;
79621 + return res;
79622 +}
79623 +
79624 +static int tpmback_remove(struct xenbus_device *dev)
79625 +{
79626 + struct backend_info *be = dev->dev.driver_data;
79627 +
79628 + if (!be) return 0;
79629 +
79630 + if (be->backend_watch.node) {
79631 + unregister_xenbus_watch(&be->backend_watch);
79632 + kfree(be->backend_watch.node);
79633 + be->backend_watch.node = NULL;
79634 + }
79635 + if (be->tpmif) {
79636 + be->tpmif->bi = NULL;
79637 + vtpm_release_packets(be->tpmif, 0);
79638 + tpmif_put(be->tpmif);
79639 + be->tpmif = NULL;
79640 + }
79641 + kfree(be);
79642 + dev->dev.driver_data = NULL;
79643 + return 0;
79644 +}
79645 +
79646 +static int tpmback_probe(struct xenbus_device *dev,
79647 + const struct xenbus_device_id *id)
79648 +{
79649 + int err;
79650 + struct backend_info *be = kzalloc(sizeof(struct backend_info),
79651 + GFP_KERNEL);
79652 +
79653 + if (!be) {
79654 + xenbus_dev_fatal(dev, -ENOMEM,
79655 + "allocating backend structure");
79656 + return -ENOMEM;
79657 + }
79658 +
79659 + be->is_instance_set = 0;
79660 + be->dev = dev;
79661 + dev->dev.driver_data = be;
79662 +
79663 + err = xenbus_watch_path2(dev, dev->nodename,
79664 + "instance", &be->backend_watch,
79665 + backend_changed);
79666 + if (err) {
79667 + goto fail;
79668 + }
79669 +
79670 + err = xenbus_switch_state(dev, XenbusStateInitWait);
79671 + if (err) {
79672 + goto fail;
79673 + }
79674 + return 0;
79675 +fail:
79676 + tpmback_remove(dev);
79677 + return err;
79678 +}
79679 +
79680 +
79681 +static void backend_changed(struct xenbus_watch *watch,
79682 + const char **vec, unsigned int len)
79683 +{
79684 + int err;
79685 + long instance;
79686 + struct backend_info *be
79687 + = container_of(watch, struct backend_info, backend_watch);
79688 + struct xenbus_device *dev = be->dev;
79689 +
79690 + err = xenbus_scanf(XBT_NIL, dev->nodename,
79691 + "instance","%li", &instance);
79692 + if (XENBUS_EXIST_ERR(err)) {
79693 + return;
79694 + }
79695 +
79696 + if (err != 1) {
79697 + xenbus_dev_fatal(dev, err, "reading instance");
79698 + return;
79699 + }
79700 +
79701 + if (be->is_instance_set == 0) {
79702 + be->instance = instance;
79703 + be->is_instance_set = 1;
79704 + }
79705 +}
79706 +
79707 +
79708 +static void frontend_changed(struct xenbus_device *dev,
79709 + enum xenbus_state frontend_state)
79710 +{
79711 + struct backend_info *be = dev->dev.driver_data;
79712 + int err;
79713 +
79714 + switch (frontend_state) {
79715 + case XenbusStateInitialising:
79716 + case XenbusStateInitialised:
79717 + break;
79718 +
79719 + case XenbusStateConnected:
79720 + err = connect_ring(be);
79721 + if (err) {
79722 + return;
79723 + }
79724 + maybe_connect(be);
79725 + break;
79726 +
79727 + case XenbusStateClosing:
79728 + be->instance = -1;
79729 + xenbus_switch_state(dev, XenbusStateClosing);
79730 + break;
79731 +
79732 + case XenbusStateUnknown: /* keep it here */
79733 + case XenbusStateClosed:
79734 + xenbus_switch_state(dev, XenbusStateClosed);
79735 + device_unregister(&be->dev->dev);
79736 + tpmback_remove(dev);
79737 + break;
79738 +
79739 + default:
79740 + xenbus_dev_fatal(dev, -EINVAL,
79741 + "saw state %d at frontend",
79742 + frontend_state);
79743 + break;
79744 + }
79745 +}
79746 +
79747 +
79748 +
79749 +static void maybe_connect(struct backend_info *be)
79750 +{
79751 + if (be->tpmif == NULL || be->tpmif->status == CONNECTED)
79752 + return;
79753 +
79754 + connect(be);
79755 +}
79756 +
79757 +
79758 +static void connect(struct backend_info *be)
79759 +{
79760 + struct xenbus_transaction xbt;
79761 + int err;
79762 + struct xenbus_device *dev = be->dev;
79763 + unsigned long ready = 1;
79764 +
79765 +again:
79766 + err = xenbus_transaction_start(&xbt);
79767 + if (err) {
79768 + xenbus_dev_fatal(be->dev, err, "starting transaction");
79769 + return;
79770 + }
79771 +
79772 + err = xenbus_printf(xbt, be->dev->nodename,
79773 + "ready", "%lu", ready);
79774 + if (err) {
79775 + xenbus_dev_fatal(be->dev, err, "writing 'ready'");
79776 + goto abort;
79777 + }
79778 +
79779 + err = xenbus_transaction_end(xbt, 0);
79780 + if (err == -EAGAIN)
79781 + goto again;
79782 + if (err)
79783 + xenbus_dev_fatal(be->dev, err, "end of transaction");
79784 +
79785 + err = xenbus_switch_state(dev, XenbusStateConnected);
79786 + if (!err)
79787 + be->tpmif->status = CONNECTED;
79788 + return;
79789 +abort:
79790 + xenbus_transaction_end(xbt, 1);
79791 +}
79792 +
79793 +
79794 +static int connect_ring(struct backend_info *be)
79795 +{
79796 + struct xenbus_device *dev = be->dev;
79797 + unsigned long ring_ref;
79798 + unsigned int evtchn;
79799 + int err;
79800 +
79801 + err = xenbus_gather(XBT_NIL, dev->otherend,
79802 + "ring-ref", "%lu", &ring_ref,
79803 + "event-channel", "%u", &evtchn, NULL);
79804 + if (err) {
79805 + xenbus_dev_error(dev, err,
79806 + "reading %s/ring-ref and event-channel",
79807 + dev->otherend);
79808 + return err;
79809 + }
79810 +
79811 + if (!be->tpmif) {
79812 + be->tpmif = tpmif_find(dev->otherend_id, be);
79813 + if (IS_ERR(be->tpmif)) {
79814 + err = PTR_ERR(be->tpmif);
79815 + be->tpmif = NULL;
79816 + xenbus_dev_fatal(dev, err, "creating vtpm interface");
79817 + return err;
79818 + }
79819 + }
79820 +
79821 + if (be->tpmif != NULL) {
79822 + err = tpmif_map(be->tpmif, ring_ref, evtchn);
79823 + if (err) {
79824 + xenbus_dev_error(dev, err,
79825 + "mapping shared-frame %lu port %u",
79826 + ring_ref, evtchn);
79827 + return err;
79828 + }
79829 + }
79830 + return 0;
79831 +}
79832 +
79833 +
79834 +static struct xenbus_device_id tpmback_ids[] = {
79835 + { "vtpm" },
79836 + { "" }
79837 +};
79838 +
79839 +
79840 +static struct xenbus_driver tpmback = {
79841 + .name = "vtpm",
79842 + .owner = THIS_MODULE,
79843 + .ids = tpmback_ids,
79844 + .probe = tpmback_probe,
79845 + .remove = tpmback_remove,
79846 + .otherend_changed = frontend_changed,
79847 +};
79848 +
79849 +
79850 +void tpmif_xenbus_init(void)
79851 +{
79852 + xenbus_register_backend(&tpmback);
79853 +}
79854 +
79855 +void tpmif_xenbus_exit(void)
79856 +{
79857 + xenbus_unregister_driver(&tpmback);
79858 +}
79859 diff -Nur linux-2.6.16.33-noxen/drivers/xen/util.c linux-2.6.16.33/drivers/xen/util.c
79860 --- linux-2.6.16.33-noxen/drivers/xen/util.c 1970-01-01 00:00:00.000000000 +0000
79861 +++ linux-2.6.16.33/drivers/xen/util.c 2007-01-08 15:00:45.000000000 +0000
79862 @@ -0,0 +1,70 @@
79863 +#include <linux/config.h>
79864 +#include <linux/mm.h>
79865 +#include <linux/module.h>
79866 +#include <linux/slab.h>
79867 +#include <linux/vmalloc.h>
79868 +#include <asm/uaccess.h>
79869 +#include <xen/driver_util.h>
79870 +
79871 +static int f(pte_t *pte, struct page *pmd_page, unsigned long addr, void *data)
79872 +{
79873 + /* apply_to_page_range() does all the hard work. */
79874 + return 0;
79875 +}
79876 +
79877 +struct vm_struct *alloc_vm_area(unsigned long size)
79878 +{
79879 + struct vm_struct *area;
79880 +
79881 + area = get_vm_area(size, VM_IOREMAP);
79882 + if (area == NULL)
79883 + return NULL;
79884 +
79885 + /*
79886 + * This ensures that page tables are constructed for this region
79887 + * of kernel virtual address space and mapped into init_mm.
79888 + */
79889 + if (apply_to_page_range(&init_mm, (unsigned long)area->addr,
79890 + area->size, f, NULL)) {
79891 + free_vm_area(area);
79892 + return NULL;
79893 + }
79894 +
79895 + return area;
79896 +}
79897 +EXPORT_SYMBOL_GPL(alloc_vm_area);
79898 +
79899 +void free_vm_area(struct vm_struct *area)
79900 +{
79901 + struct vm_struct *ret;
79902 + ret = remove_vm_area(area->addr);
79903 + BUG_ON(ret != area);
79904 + kfree(area);
79905 +}
79906 +EXPORT_SYMBOL_GPL(free_vm_area);
79907 +
79908 +void lock_vm_area(struct vm_struct *area)
79909 +{
79910 + unsigned long i;
79911 + char c;
79912 +
79913 + /*
79914 + * Prevent context switch to a lazy mm that doesn't have this area
79915 + * mapped into its page tables.
79916 + */
79917 + preempt_disable();
79918 +
79919 + /*
79920 + * Ensure that the page tables are mapped into the current mm. The
79921 + * page-fault path will copy the page directory pointers from init_mm.
79922 + */
79923 + for (i = 0; i < area->size; i += PAGE_SIZE)
79924 + (void)__get_user(c, (char __user *)area->addr + i);
79925 +}
79926 +EXPORT_SYMBOL_GPL(lock_vm_area);
79927 +
79928 +void unlock_vm_area(struct vm_struct *area)
79929 +{
79930 + preempt_enable();
79931 +}
79932 +EXPORT_SYMBOL_GPL(unlock_vm_area);
79933 diff -Nur linux-2.6.16.33-noxen/drivers/xen/xenbus/Makefile linux-2.6.16.33/drivers/xen/xenbus/Makefile
79934 --- linux-2.6.16.33-noxen/drivers/xen/xenbus/Makefile 1970-01-01 00:00:00.000000000 +0000
79935 +++ linux-2.6.16.33/drivers/xen/xenbus/Makefile 2007-01-08 15:00:45.000000000 +0000
79936 @@ -0,0 +1,9 @@
79937 +obj-y += xenbus_client.o xenbus_comms.o xenbus_xs.o xenbus_probe.o
79938 +obj-$(CONFIG_XEN_BACKEND) += xenbus_be.o
79939 +
79940 +xenbus_be-objs =
79941 +xenbus_be-objs += xenbus_backend_client.o
79942 +
79943 +xenbus-$(CONFIG_XEN_BACKEND) += xenbus_probe_backend.o
79944 +obj-y += $(xenbus-y) $(xenbus-m)
79945 +obj-$(CONFIG_XEN_XENBUS_DEV) += xenbus_dev.o
79946 diff -Nur linux-2.6.16.33-noxen/drivers/xen/xenbus/xenbus_backend_client.c linux-2.6.16.33/drivers/xen/xenbus/xenbus_backend_client.c
79947 --- linux-2.6.16.33-noxen/drivers/xen/xenbus/xenbus_backend_client.c 1970-01-01 00:00:00.000000000 +0000
79948 +++ linux-2.6.16.33/drivers/xen/xenbus/xenbus_backend_client.c 2007-01-08 15:00:45.000000000 +0000
79949 @@ -0,0 +1,147 @@
79950 +/******************************************************************************
79951 + * Backend-client-facing interface for the Xenbus driver. In other words, the
79952 + * interface between the Xenbus and the device-specific code in the backend
79953 + * driver.
79954 + *
79955 + * Copyright (C) 2005-2006 XenSource Ltd
79956 + *
79957 + * This program is free software; you can redistribute it and/or
79958 + * modify it under the terms of the GNU General Public License version 2
79959 + * as published by the Free Software Foundation; or, when distributed
79960 + * separately from the Linux kernel or incorporated into other
79961 + * software packages, subject to the following license:
79962 + *
79963 + * Permission is hereby granted, free of charge, to any person obtaining a copy
79964 + * of this source file (the "Software"), to deal in the Software without
79965 + * restriction, including without limitation the rights to use, copy, modify,
79966 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
79967 + * and to permit persons to whom the Software is furnished to do so, subject to
79968 + * the following conditions:
79969 + *
79970 + * The above copyright notice and this permission notice shall be included in
79971 + * all copies or substantial portions of the Software.
79972 + *
79973 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
79974 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
79975 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
79976 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
79977 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
79978 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
79979 + * IN THE SOFTWARE.
79980 + */
79981 +
79982 +#include <linux/err.h>
79983 +#include <xen/gnttab.h>
79984 +#include <xen/xenbus.h>
79985 +#include <xen/driver_util.h>
79986 +
79987 +/* Based on Rusty Russell's skeleton driver's map_page */
79988 +struct vm_struct *xenbus_map_ring_valloc(struct xenbus_device *dev, int gnt_ref)
79989 +{
79990 + struct gnttab_map_grant_ref op;
79991 + struct vm_struct *area;
79992 +
79993 + area = alloc_vm_area(PAGE_SIZE);
79994 + if (!area)
79995 + return ERR_PTR(-ENOMEM);
79996 +
79997 + gnttab_set_map_op(&op, (unsigned long)area->addr, GNTMAP_host_map,
79998 + gnt_ref, dev->otherend_id);
79999 +
80000 + lock_vm_area(area);
80001 + BUG_ON(HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1));
80002 + unlock_vm_area(area);
80003 +
80004 + if (op.status != GNTST_okay) {
80005 + free_vm_area(area);
80006 + xenbus_dev_fatal(dev, op.status,
80007 + "mapping in shared page %d from domain %d",
80008 + gnt_ref, dev->otherend_id);
80009 + BUG_ON(!IS_ERR(ERR_PTR(op.status)));
80010 + return ERR_PTR(op.status);
80011 + }
80012 +
80013 + /* Stuff the handle in an unused field */
80014 + area->phys_addr = (unsigned long)op.handle;
80015 +
80016 + return area;
80017 +}
80018 +EXPORT_SYMBOL_GPL(xenbus_map_ring_valloc);
80019 +
80020 +
80021 +int xenbus_map_ring(struct xenbus_device *dev, int gnt_ref,
80022 + grant_handle_t *handle, void *vaddr)
80023 +{
80024 + struct gnttab_map_grant_ref op;
80025 +
80026 + gnttab_set_map_op(&op, (unsigned long)vaddr, GNTMAP_host_map,
80027 + gnt_ref, dev->otherend_id);
80028 + BUG_ON(HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1));
80029 +
80030 + if (op.status != GNTST_okay) {
80031 + xenbus_dev_fatal(dev, op.status,
80032 + "mapping in shared page %d from domain %d",
80033 + gnt_ref, dev->otherend_id);
80034 + } else
80035 + *handle = op.handle;
80036 +
80037 + return op.status;
80038 +}
80039 +EXPORT_SYMBOL_GPL(xenbus_map_ring);
80040 +
80041 +
80042 +/* Based on Rusty Russell's skeleton driver's unmap_page */
80043 +int xenbus_unmap_ring_vfree(struct xenbus_device *dev, struct vm_struct *area)
80044 +{
80045 + struct gnttab_unmap_grant_ref op;
80046 +
80047 + gnttab_set_unmap_op(&op, (unsigned long)area->addr, GNTMAP_host_map,
80048 + (grant_handle_t)area->phys_addr);
80049 +
80050 + lock_vm_area(area);
80051 + BUG_ON(HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1));
80052 + unlock_vm_area(area);
80053 +
80054 + if (op.status == GNTST_okay)
80055 + free_vm_area(area);
80056 + else
80057 + xenbus_dev_error(dev, op.status,
80058 + "unmapping page at handle %d error %d",
80059 + (int16_t)area->phys_addr, op.status);
80060 +
80061 + return op.status;
80062 +}
80063 +EXPORT_SYMBOL_GPL(xenbus_unmap_ring_vfree);
80064 +
80065 +
80066 +int xenbus_unmap_ring(struct xenbus_device *dev,
80067 + grant_handle_t handle, void *vaddr)
80068 +{
80069 + struct gnttab_unmap_grant_ref op;
80070 +
80071 + gnttab_set_unmap_op(&op, (unsigned long)vaddr, GNTMAP_host_map,
80072 + handle);
80073 + BUG_ON(HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1));
80074 +
80075 + if (op.status != GNTST_okay)
80076 + xenbus_dev_error(dev, op.status,
80077 + "unmapping page at handle %d error %d",
80078 + handle, op.status);
80079 +
80080 + return op.status;
80081 +}
80082 +EXPORT_SYMBOL_GPL(xenbus_unmap_ring);
80083 +
80084 +int xenbus_dev_is_online(struct xenbus_device *dev)
80085 +{
80086 + int rc, val;
80087 +
80088 + rc = xenbus_scanf(XBT_NIL, dev->nodename, "online", "%d", &val);
80089 + if (rc != 1)
80090 + val = 0; /* no online node present */
80091 +
80092 + return val;
80093 +}
80094 +EXPORT_SYMBOL_GPL(xenbus_dev_is_online);
80095 +
80096 +MODULE_LICENSE("Dual BSD/GPL");
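
For orientation, a backend such as the tpmback driver earlier in this patch pairs the two helpers above with an interdomain event channel: map the frontend's ring page with xenbus_map_ring_valloc(), bind the event channel, and undo the mapping with xenbus_unmap_ring_vfree() on failure or teardown. The following is a hedged sketch rather than code from the patch; the my_* names are hypothetical and bind_interdomain_evtchn_to_irqhandler() is assumed from the event-channel code elsewhere in this patch.

/* Sketch only -- not part of the patch. */
#include <linux/err.h>
#include <linux/interrupt.h>
#include <linux/ptrace.h>
#include <linux/vmalloc.h>
#include <xen/evtchn.h>
#include <xen/xenbus.h>

struct my_backend {
	struct vm_struct *ring_area;
	void *sring;
	int irq;
};

static irqreturn_t my_intr(int irq, void *dev_id, struct pt_regs *regs)
{
	/* the frontend kicked the ring; real code would schedule work here */
	return IRQ_HANDLED;
}

static int my_map_ring(struct xenbus_device *dev, struct my_backend *be,
		       unsigned long ring_ref, unsigned int evtchn)
{
	struct vm_struct *area = xenbus_map_ring_valloc(dev, ring_ref);

	if (IS_ERR(area))
		return PTR_ERR(area);
	be->ring_area = area;
	be->sring = area->addr;		/* shared ring lives in the mapped page */

	be->irq = bind_interdomain_evtchn_to_irqhandler(dev->otherend_id,
							evtchn, my_intr, 0,
							"my-backend", be);
	if (be->irq < 0) {
		xenbus_unmap_ring_vfree(dev, area);
		return be->irq;
	}
	return 0;
}
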
80097 diff -Nur linux-2.6.16.33-noxen/drivers/xen/xenbus/xenbus_client.c linux-2.6.16.33/drivers/xen/xenbus/xenbus_client.c
80098 --- linux-2.6.16.33-noxen/drivers/xen/xenbus/xenbus_client.c 1970-01-01 00:00:00.000000000 +0000
80099 +++ linux-2.6.16.33/drivers/xen/xenbus/xenbus_client.c 2007-01-08 15:00:45.000000000 +0000
80100 @@ -0,0 +1,304 @@
80101 +/******************************************************************************
80102 + * Client-facing interface for the Xenbus driver. In other words, the
80103 + * interface between the Xenbus and the device-specific code, be it the
80104 + * frontend or the backend of that driver.
80105 + *
80106 + * Copyright (C) 2005 XenSource Ltd
80107 + *
80108 + * This program is free software; you can redistribute it and/or
80109 + * modify it under the terms of the GNU General Public License version 2
80110 + * as published by the Free Software Foundation; or, when distributed
80111 + * separately from the Linux kernel or incorporated into other
80112 + * software packages, subject to the following license:
80113 + *
80114 + * Permission is hereby granted, free of charge, to any person obtaining a copy
80115 + * of this source file (the "Software"), to deal in the Software without
80116 + * restriction, including without limitation the rights to use, copy, modify,
80117 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
80118 + * and to permit persons to whom the Software is furnished to do so, subject to
80119 + * the following conditions:
80120 + *
80121 + * The above copyright notice and this permission notice shall be included in
80122 + * all copies or substantial portions of the Software.
80123 + *
80124 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
80125 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
80126 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
80127 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
80128 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
80129 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
80130 + * IN THE SOFTWARE.
80131 + */
80132 +
80133 +#include <xen/evtchn.h>
80134 +#include <xen/gnttab.h>
80135 +#include <xen/xenbus.h>
80136 +#include <xen/driver_util.h>
80137 +
80138 +#ifdef HAVE_XEN_PLATFORM_COMPAT_H
80139 +#include <xen/platform-compat.h>
80140 +#endif
80141 +
80142 +#define DPRINTK(fmt, args...) \
80143 + pr_debug("xenbus_client (%s:%d) " fmt ".\n", __FUNCTION__, __LINE__, ##args)
80144 +
80145 +char *xenbus_strstate(enum xenbus_state state)
80146 +{
80147 + static char *name[] = {
80148 + [ XenbusStateUnknown ] = "Unknown",
80149 + [ XenbusStateInitialising ] = "Initialising",
80150 + [ XenbusStateInitWait ] = "InitWait",
80151 + [ XenbusStateInitialised ] = "Initialised",
80152 + [ XenbusStateConnected ] = "Connected",
80153 + [ XenbusStateClosing ] = "Closing",
80154 + [ XenbusStateClosed ] = "Closed",
80155 + };
80156 + return (state < ARRAY_SIZE(name)) ? name[state] : "INVALID";
80157 +}
80158 +
80159 +int xenbus_watch_path(struct xenbus_device *dev, const char *path,
80160 + struct xenbus_watch *watch,
80161 + void (*callback)(struct xenbus_watch *,
80162 + const char **, unsigned int))
80163 +{
80164 + int err;
80165 +
80166 + watch->node = path;
80167 + watch->callback = callback;
80168 +
80169 + err = register_xenbus_watch(watch);
80170 +
80171 + if (err) {
80172 + watch->node = NULL;
80173 + watch->callback = NULL;
80174 + xenbus_dev_fatal(dev, err, "adding watch on %s", path);
80175 + }
80176 +
80177 + return err;
80178 +}
80179 +EXPORT_SYMBOL_GPL(xenbus_watch_path);
80180 +
80181 +
80182 +int xenbus_watch_path2(struct xenbus_device *dev, const char *path,
80183 + const char *path2, struct xenbus_watch *watch,
80184 + void (*callback)(struct xenbus_watch *,
80185 + const char **, unsigned int))
80186 +{
80187 + int err;
80188 + char *state = kasprintf(GFP_KERNEL, "%s/%s", path, path2);
80189 + if (!state) {
80190 + xenbus_dev_fatal(dev, -ENOMEM, "allocating path for watch");
80191 + return -ENOMEM;
80192 + }
80193 + err = xenbus_watch_path(dev, state, watch, callback);
80194 +
80195 + if (err)
80196 + kfree(state);
80197 + return err;
80198 +}
80199 +EXPORT_SYMBOL_GPL(xenbus_watch_path2);
80200 +
80201 +
80202 +int xenbus_switch_state(struct xenbus_device *dev, enum xenbus_state state)
80203 +{
80204 + /* We check whether the state is currently set to the given value, and
80205 + if not, then the state is set. We don't want to unconditionally
80206 + write the given state, because we don't want to fire watches
80207 + unnecessarily. Furthermore, if the node has gone, we don't write
80208 + to it, as the device will be tearing down, and we don't want to
80209 + resurrect that directory.
80210 +
80211 + Note that, because of this cached value of our state, this function
80212 + will not work inside a Xenstore transaction (something it used to
80213 + attempt) because dev->state would not get reset if
80214 + the transaction was aborted.
80215 +
80216 + */
80217 +
80218 + int current_state;
80219 + int err;
80220 +
80221 + if (state == dev->state)
80222 + return 0;
80223 +
80224 + err = xenbus_scanf(XBT_NIL, dev->nodename, "state", "%d",
80225 + &current_state);
80226 + if (err != 1)
80227 + return 0;
80228 +
80229 + err = xenbus_printf(XBT_NIL, dev->nodename, "state", "%d", state);
80230 + if (err) {
80231 + if (state != XenbusStateClosing) /* Avoid looping */
80232 + xenbus_dev_fatal(dev, err, "writing new state");
80233 + return err;
80234 + }
80235 +
80236 + dev->state = state;
80237 +
80238 + return 0;
80239 +}
80240 +EXPORT_SYMBOL_GPL(xenbus_switch_state);
80241 +
80242 +int xenbus_frontend_closed(struct xenbus_device *dev)
80243 +{
80244 + xenbus_switch_state(dev, XenbusStateClosed);
80245 + complete(&dev->down);
80246 + return 0;
80247 +}
80248 +EXPORT_SYMBOL_GPL(xenbus_frontend_closed);
80249 +
80250 +/**
80251 + * Return the path to the error node for the given device, or NULL on failure.
80252 + * If the value returned is non-NULL, it is the caller's responsibility to kfree it.
80253 + */
80254 +static char *error_path(struct xenbus_device *dev)
80255 +{
80256 + return kasprintf(GFP_KERNEL, "error/%s", dev->nodename);
80257 +}
80258 +
80259 +
80260 +void _dev_error(struct xenbus_device *dev, int err, const char *fmt,
80261 + va_list ap)
80262 +{
80263 + int ret;
80264 + unsigned int len;
80265 + char *printf_buffer = NULL, *path_buffer = NULL;
80266 +
80267 +#define PRINTF_BUFFER_SIZE 4096
80268 + printf_buffer = kmalloc(PRINTF_BUFFER_SIZE, GFP_KERNEL);
80269 + if (printf_buffer == NULL)
80270 + goto fail;
80271 +
80272 + len = sprintf(printf_buffer, "%i ", -err);
80273 + ret = vsnprintf(printf_buffer+len, PRINTF_BUFFER_SIZE-len, fmt, ap);
80274 +
80275 + BUG_ON(len + ret > PRINTF_BUFFER_SIZE-1);
80276 +
80277 + dev_err(&dev->dev, "%s\n", printf_buffer);
80278 +
80279 + path_buffer = error_path(dev);
80280 +
80281 + if (path_buffer == NULL) {
80282 + printk("xenbus: failed to write error node for %s (%s)\n",
80283 + dev->nodename, printf_buffer);
80284 + goto fail;
80285 + }
80286 +
80287 + if (xenbus_write(XBT_NIL, path_buffer, "error", printf_buffer) != 0) {
80288 + printk("xenbus: failed to write error node for %s (%s)\n",
80289 + dev->nodename, printf_buffer);
80290 + goto fail;
80291 + }
80292 +
80293 +fail:
80294 + if (printf_buffer)
80295 + kfree(printf_buffer);
80296 + if (path_buffer)
80297 + kfree(path_buffer);
80298 +}
80299 +
80300 +
80301 +void xenbus_dev_error(struct xenbus_device *dev, int err, const char *fmt,
80302 + ...)
80303 +{
80304 + va_list ap;
80305 +
80306 + va_start(ap, fmt);
80307 + _dev_error(dev, err, fmt, ap);
80308 + va_end(ap);
80309 +}
80310 +EXPORT_SYMBOL_GPL(xenbus_dev_error);
80311 +
80312 +
80313 +void xenbus_dev_fatal(struct xenbus_device *dev, int err, const char *fmt,
80314 + ...)
80315 +{
80316 + va_list ap;
80317 +
80318 + va_start(ap, fmt);
80319 + _dev_error(dev, err, fmt, ap);
80320 + va_end(ap);
80321 +
80322 + xenbus_switch_state(dev, XenbusStateClosing);
80323 +}
80324 +EXPORT_SYMBOL_GPL(xenbus_dev_fatal);
80325 +
80326 +
80327 +int xenbus_grant_ring(struct xenbus_device *dev, unsigned long ring_mfn)
80328 +{
80329 + int err = gnttab_grant_foreign_access(dev->otherend_id, ring_mfn, 0);
80330 + if (err < 0)
80331 + xenbus_dev_fatal(dev, err, "granting access to ring page");
80332 + return err;
80333 +}
80334 +EXPORT_SYMBOL_GPL(xenbus_grant_ring);
80335 +
80336 +
80337 +int xenbus_alloc_evtchn(struct xenbus_device *dev, int *port)
80338 +{
80339 + struct evtchn_alloc_unbound alloc_unbound;
80340 + int err;
80341 +
80342 + alloc_unbound.dom = DOMID_SELF;
80343 + alloc_unbound.remote_dom = dev->otherend_id;
80344 +
80345 + err = HYPERVISOR_event_channel_op(EVTCHNOP_alloc_unbound,
80346 + &alloc_unbound);
80347 + if (err)
80348 + xenbus_dev_fatal(dev, err, "allocating event channel");
80349 + else
80350 + *port = alloc_unbound.port;
80351 +
80352 + return err;
80353 +}
80354 +EXPORT_SYMBOL_GPL(xenbus_alloc_evtchn);
80355 +
80356 +
80357 +int xenbus_bind_evtchn(struct xenbus_device *dev, int remote_port, int *port)
80358 +{
80359 + struct evtchn_bind_interdomain bind_interdomain;
80360 + int err;
80361 +
80362 + bind_interdomain.remote_dom = dev->otherend_id;
80363 + bind_interdomain.remote_port = remote_port;
80364 +
80365 + err = HYPERVISOR_event_channel_op(EVTCHNOP_bind_interdomain,
80366 + &bind_interdomain);
80367 + if (err)
80368 + xenbus_dev_fatal(dev, err,
80369 + "binding to event channel %d from domain %d",
80370 + remote_port, dev->otherend_id);
80371 + else
80372 + *port = bind_interdomain.local_port;
80373 +
80374 + return err;
80375 +}
80376 +EXPORT_SYMBOL_GPL(xenbus_bind_evtchn);
80377 +
80378 +
80379 +int xenbus_free_evtchn(struct xenbus_device *dev, int port)
80380 +{
80381 + struct evtchn_close close;
80382 + int err;
80383 +
80384 + close.port = port;
80385 +
80386 + err = HYPERVISOR_event_channel_op(EVTCHNOP_close, &close);
80387 + if (err)
80388 + xenbus_dev_error(dev, err, "freeing event channel %d", port);
80389 +
80390 + return err;
80391 +}
80392 +EXPORT_SYMBOL_GPL(xenbus_free_evtchn);
80393 +
80394 +
80395 +enum xenbus_state xenbus_read_driver_state(const char *path)
80396 +{
80397 + enum xenbus_state result;
80398 + int err = xenbus_gather(XBT_NIL, path, "state", "%d", &result, NULL);
80399 + if (err)
80400 + result = XenbusStateUnknown;
80401 +
80402 + return result;
80403 +}
80404 +EXPORT_SYMBOL_GPL(xenbus_read_driver_state);
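
The helpers above are what a frontend uses for the handshake that connect_ring() in the tpmback xenbus code reads from the other end: grant the ring page, allocate an unbound event channel, publish "ring-ref" and "event-channel" under its own node, and advance the xenbus state. A hedged sketch follows, not code from the patch; the my_* name is hypothetical and error unwinding is abbreviated.

/* Sketch only -- not part of the patch. */
#include <linux/mm.h>
#include <asm/page.h>
#include <xen/xenbus.h>

static int my_publish_ring(struct xenbus_device *dev)
{
	void *page = (void *)__get_free_page(GFP_KERNEL);
	int ring_ref, evtchn, err;

	if (!page)
		return -ENOMEM;

	err = xenbus_grant_ring(dev, virt_to_mfn(page));
	if (err < 0)
		return err;
	ring_ref = err;				/* grant reference for the shared page */

	err = xenbus_alloc_evtchn(dev, &evtchn);	/* unbound channel to otherend */
	if (err)
		return err;

	err = xenbus_printf(XBT_NIL, dev->nodename, "ring-ref", "%d", ring_ref);
	if (err)
		return err;
	err = xenbus_printf(XBT_NIL, dev->nodename, "event-channel", "%d", evtchn);
	if (err)
		return err;

	return xenbus_switch_state(dev, XenbusStateInitialised);
}
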
80405 diff -Nur linux-2.6.16.33-noxen/drivers/xen/xenbus/xenbus_comms.c linux-2.6.16.33/drivers/xen/xenbus/xenbus_comms.c
80406 --- linux-2.6.16.33-noxen/drivers/xen/xenbus/xenbus_comms.c 1970-01-01 00:00:00.000000000 +0000
80407 +++ linux-2.6.16.33/drivers/xen/xenbus/xenbus_comms.c 2007-01-08 15:00:45.000000000 +0000
80408 @@ -0,0 +1,210 @@
80409 +/******************************************************************************
80410 + * xenbus_comms.c
80411 + *
80412 + * Low-level code to talk to Xen Store: ring buffer and event channel.
80413 + *
80414 + * Copyright (C) 2005 Rusty Russell, IBM Corporation
80415 + *
80416 + * This program is free software; you can redistribute it and/or
80417 + * modify it under the terms of the GNU General Public License version 2
80418 + * as published by the Free Software Foundation; or, when distributed
80419 + * separately from the Linux kernel or incorporated into other
80420 + * software packages, subject to the following license:
80421 + *
80422 + * Permission is hereby granted, free of charge, to any person obtaining a copy
80423 + * of this source file (the "Software"), to deal in the Software without
80424 + * restriction, including without limitation the rights to use, copy, modify,
80425 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
80426 + * and to permit persons to whom the Software is furnished to do so, subject to
80427 + * the following conditions:
80428 + *
80429 + * The above copyright notice and this permission notice shall be included in
80430 + * all copies or substantial portions of the Software.
80431 + *
80432 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
80433 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
80434 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
80435 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
80436 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
80437 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
80438 + * IN THE SOFTWARE.
80439 + */
80440 +
80441 +#include <linux/wait.h>
80442 +#include <linux/interrupt.h>
80443 +#include <linux/sched.h>
80444 +#include <linux/err.h>
80445 +#include <linux/ptrace.h>
80446 +#include <xen/evtchn.h>
80447 +#include <xen/xenbus.h>
80448 +
80449 +#include <asm/hypervisor.h>
80450 +
80451 +#include "xenbus_comms.h"
80452 +
80453 +#ifdef HAVE_XEN_PLATFORM_COMPAT_H
80454 +#include <xen/platform-compat.h>
80455 +#endif
80456 +
80457 +static int xenbus_irq;
80458 +
80459 +extern void xenbus_probe(void *);
80460 +extern int xenstored_ready;
80461 +static DECLARE_WORK(probe_work, xenbus_probe, NULL);
80462 +
80463 +static DECLARE_WAIT_QUEUE_HEAD(xb_waitq);
80464 +
80465 +static irqreturn_t wake_waiting(int irq, void *unused, struct pt_regs *regs)
80466 +{
80467 + if (unlikely(xenstored_ready == 0)) {
80468 + xenstored_ready = 1;
80469 + schedule_work(&probe_work);
80470 + }
80471 +
80472 + wake_up(&xb_waitq);
80473 + return IRQ_HANDLED;
80474 +}
80475 +
80476 +static int check_indexes(XENSTORE_RING_IDX cons, XENSTORE_RING_IDX prod)
80477 +{
80478 + return ((prod - cons) <= XENSTORE_RING_SIZE);
80479 +}
80480 +
80481 +static void *get_output_chunk(XENSTORE_RING_IDX cons,
80482 + XENSTORE_RING_IDX prod,
80483 + char *buf, uint32_t *len)
80484 +{
80485 + *len = XENSTORE_RING_SIZE - MASK_XENSTORE_IDX(prod);
80486 + if ((XENSTORE_RING_SIZE - (prod - cons)) < *len)
80487 + *len = XENSTORE_RING_SIZE - (prod - cons);
80488 + return buf + MASK_XENSTORE_IDX(prod);
80489 +}
80490 +
80491 +static const void *get_input_chunk(XENSTORE_RING_IDX cons,
80492 + XENSTORE_RING_IDX prod,
80493 + const char *buf, uint32_t *len)
80494 +{
80495 + *len = XENSTORE_RING_SIZE - MASK_XENSTORE_IDX(cons);
80496 + if ((prod - cons) < *len)
80497 + *len = prod - cons;
80498 + return buf + MASK_XENSTORE_IDX(cons);
80499 +}
80500 +
80501 +int xb_write(const void *data, unsigned len)
80502 +{
80503 + struct xenstore_domain_interface *intf = xen_store_interface;
80504 + XENSTORE_RING_IDX cons, prod;
80505 + int rc;
80506 +
80507 + while (len != 0) {
80508 + void *dst;
80509 + unsigned int avail;
80510 +
80511 + rc = wait_event_interruptible(
80512 + xb_waitq,
80513 + (intf->req_prod - intf->req_cons) !=
80514 + XENSTORE_RING_SIZE);
80515 + if (rc < 0)
80516 + return rc;
80517 +
80518 + /* Read indexes, then verify. */
80519 + cons = intf->req_cons;
80520 + prod = intf->req_prod;
80521 + mb();
80522 + if (!check_indexes(cons, prod)) {
80523 + intf->req_cons = intf->req_prod = 0;
80524 + return -EIO;
80525 + }
80526 +
80527 + dst = get_output_chunk(cons, prod, intf->req, &avail);
80528 + if (avail == 0)
80529 + continue;
80530 + if (avail > len)
80531 + avail = len;
80532 +
80533 + memcpy(dst, data, avail);
80534 + data += avail;
80535 + len -= avail;
80536 +
80537 + /* Other side must not see new header until data is there. */
80538 + wmb();
80539 + intf->req_prod += avail;
80540 +
80541 + /* This implies mb() before other side sees interrupt. */
80542 + notify_remote_via_evtchn(xen_store_evtchn);
80543 + }
80544 +
80545 + return 0;
80546 +}
80547 +
80548 +int xb_read(void *data, unsigned len)
80549 +{
80550 + struct xenstore_domain_interface *intf = xen_store_interface;
80551 + XENSTORE_RING_IDX cons, prod;
80552 + int rc;
80553 +
80554 + while (len != 0) {
80555 + unsigned int avail;
80556 + const char *src;
80557 +
80558 + rc = wait_event_interruptible(
80559 + xb_waitq,
80560 + intf->rsp_cons != intf->rsp_prod);
80561 + if (rc < 0)
80562 + return rc;
80563 +
80564 + /* Read indexes, then verify. */
80565 + cons = intf->rsp_cons;
80566 + prod = intf->rsp_prod;
80567 + mb();
80568 + if (!check_indexes(cons, prod)) {
80569 + intf->rsp_cons = intf->rsp_prod = 0;
80570 + return -EIO;
80571 + }
80572 +
80573 + src = get_input_chunk(cons, prod, intf->rsp, &avail);
80574 + if (avail == 0)
80575 + continue;
80576 + if (avail > len)
80577 + avail = len;
80578 +
80579 + /* We must read header before we read data. */
80580 + rmb();
80581 +
80582 + memcpy(data, src, avail);
80583 + data += avail;
80584 + len -= avail;
80585 +
80586 + /* Other side must not see free space until we've copied out */
80587 + mb();
80588 + intf->rsp_cons += avail;
80589 +
80590 + pr_debug("Finished read of %i bytes (%i to go)\n", avail, len);
80591 +
80592 + /* Implies mb(): they will see new header. */
80593 + notify_remote_via_evtchn(xen_store_evtchn);
80594 + }
80595 +
80596 + return 0;
80597 +}
80598 +
80599 +/* Set up the interrupt handler for the store event channel. */
80600 +int xb_init_comms(void)
80601 +{
80602 + int err;
80603 +
80604 + if (xenbus_irq)
80605 + unbind_from_irqhandler(xenbus_irq, &xb_waitq);
80606 +
80607 + err = bind_evtchn_to_irqhandler(
80608 + xen_store_evtchn, wake_waiting,
80609 + 0, "xenbus", &xb_waitq);
80610 + if (err <= 0) {
80611 + printk(KERN_ERR "XENBUS request irq failed %i\n", err);
80612 + return err;
80613 + }
80614 +
80615 + xenbus_irq = err;
80616 +
80617 + return 0;
80618 +}
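
The chunk helpers above always hand back a contiguous span of the ring, clamped both at the wrap point and by the amount of used or free space. Below is a standalone restatement with worked values, assuming XENSTORE_RING_SIZE is 1024 as defined in the Xen public header xs_wire.h.

/* Sketch only -- not part of the patch; userspace re-statement of the clamping. */
#include <assert.h>
#include <stdint.h>

#define XENSTORE_RING_SIZE 1024u	/* assumed value from xs_wire.h */
#define MASK_XENSTORE_IDX(idx) ((idx) & (XENSTORE_RING_SIZE - 1))

/* mirrors the arithmetic of get_output_chunk() above */
static uint32_t output_chunk_len(uint32_t cons, uint32_t prod)
{
	uint32_t len = XENSTORE_RING_SIZE - MASK_XENSTORE_IDX(prod);

	if (XENSTORE_RING_SIZE - (prod - cons) < len)
		len = XENSTORE_RING_SIZE - (prod - cons);
	return len;
}

int main(void)
{
	/* 40 bytes queued, producer 16 bytes past a wrap: 1008 bytes to the
	 * wrap point, but only 984 bytes of free space, so 984 is returned. */
	assert(output_chunk_len(1000, 1040) == 984);
	/* empty ring with the producer at the start: the whole buffer is usable */
	assert(output_chunk_len(0, 0) == XENSTORE_RING_SIZE);
	return 0;
}
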
80619 diff -Nur linux-2.6.16.33-noxen/drivers/xen/xenbus/xenbus_comms.h linux-2.6.16.33/drivers/xen/xenbus/xenbus_comms.h
80620 --- linux-2.6.16.33-noxen/drivers/xen/xenbus/xenbus_comms.h 1970-01-01 00:00:00.000000000 +0000
80621 +++ linux-2.6.16.33/drivers/xen/xenbus/xenbus_comms.h 2007-01-08 15:00:45.000000000 +0000
80622 @@ -0,0 +1,44 @@
80623 +/*
80624 + * Private include for xenbus communications.
80625 + *
80626 + * Copyright (C) 2005 Rusty Russell, IBM Corporation
80627 + *
80628 + * This program is free software; you can redistribute it and/or
80629 + * modify it under the terms of the GNU General Public License version 2
80630 + * as published by the Free Software Foundation; or, when distributed
80631 + * separately from the Linux kernel or incorporated into other
80632 + * software packages, subject to the following license:
80633 + *
80634 + * Permission is hereby granted, free of charge, to any person obtaining a copy
80635 + * of this source file (the "Software"), to deal in the Software without
80636 + * restriction, including without limitation the rights to use, copy, modify,
80637 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
80638 + * and to permit persons to whom the Software is furnished to do so, subject to
80639 + * the following conditions:
80640 + *
80641 + * The above copyright notice and this permission notice shall be included in
80642 + * all copies or substantial portions of the Software.
80643 + *
80644 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
80645 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
80646 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
80647 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
80648 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
80649 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
80650 + * IN THE SOFTWARE.
80651 + */
80652 +
80653 +#ifndef _XENBUS_COMMS_H
80654 +#define _XENBUS_COMMS_H
80655 +
80656 +int xs_init(void);
80657 +int xb_init_comms(void);
80658 +
80659 +/* Low level routines. */
80660 +int xb_write(const void *data, unsigned len);
80661 +int xb_read(void *data, unsigned len);
80662 +int xs_input_avail(void);
80663 +extern struct xenstore_domain_interface *xen_store_interface;
80664 +extern int xen_store_evtchn;
80665 +
80666 +#endif /* _XENBUS_COMMS_H */
80667 diff -Nur linux-2.6.16.33-noxen/drivers/xen/xenbus/xenbus_dev.c linux-2.6.16.33/drivers/xen/xenbus/xenbus_dev.c
80668 --- linux-2.6.16.33-noxen/drivers/xen/xenbus/xenbus_dev.c 1970-01-01 00:00:00.000000000 +0000
80669 +++ linux-2.6.16.33/drivers/xen/xenbus/xenbus_dev.c 2007-01-08 15:00:45.000000000 +0000
80670 @@ -0,0 +1,362 @@
80671 +/*
80672 + * xenbus_dev.c
80673 + *
80674 + * Driver giving user-space access to the kernel's xenbus connection
80675 + * to xenstore.
80676 + *
80677 + * Copyright (c) 2005, Christian Limpach
80678 + * Copyright (c) 2005, Rusty Russell, IBM Corporation
80679 + *
80680 + * This program is free software; you can redistribute it and/or
80681 + * modify it under the terms of the GNU General Public License version 2
80682 + * as published by the Free Software Foundation; or, when distributed
80683 + * separately from the Linux kernel or incorporated into other
80684 + * software packages, subject to the following license:
80685 + *
80686 + * Permission is hereby granted, free of charge, to any person obtaining a copy
80687 + * of this source file (the "Software"), to deal in the Software without
80688 + * restriction, including without limitation the rights to use, copy, modify,
80689 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
80690 + * and to permit persons to whom the Software is furnished to do so, subject to
80691 + * the following conditions:
80692 + *
80693 + * The above copyright notice and this permission notice shall be included in
80694 + * all copies or substantial portions of the Software.
80695 + *
80696 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
80697 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
80698 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
80699 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
80700 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
80701 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
80702 + * IN THE SOFTWARE.
80703 + */
80704 +
80705 +#include <linux/config.h>
80706 +#include <linux/kernel.h>
80707 +#include <linux/errno.h>
80708 +#include <linux/uio.h>
80709 +#include <linux/notifier.h>
80710 +#include <linux/wait.h>
80711 +#include <linux/fs.h>
80712 +#include <linux/poll.h>
80713 +#include <linux/mutex.h>
80714 +
80715 +#include "xenbus_comms.h"
80716 +
80717 +#include <asm/uaccess.h>
80718 +#include <asm/hypervisor.h>
80719 +#include <xen/xenbus.h>
80720 +#include <xen/xen_proc.h>
80721 +#include <asm/hypervisor.h>
80722 +
80723 +#ifdef HAVE_XEN_PLATFORM_COMPAT_H
80724 +#include <xen/platform-compat.h>
80725 +#endif
80726 +
80727 +struct xenbus_dev_transaction {
80728 + struct list_head list;
80729 + struct xenbus_transaction handle;
80730 +};
80731 +
80732 +struct xenbus_dev_data {
80733 + /* In-progress transaction. */
80734 + struct list_head transactions;
80735 +
80736 + /* Active watches. */
80737 + struct list_head watches;
80738 +
80739 + /* Partial request. */
80740 + unsigned int len;
80741 + union {
80742 + struct xsd_sockmsg msg;
80743 + char buffer[PAGE_SIZE];
80744 + } u;
80745 +
80746 + /* Response queue. */
80747 +#define MASK_READ_IDX(idx) ((idx)&(PAGE_SIZE-1))
80748 + char read_buffer[PAGE_SIZE];
80749 + unsigned int read_cons, read_prod;
80750 + wait_queue_head_t read_waitq;
80751 +
80752 + struct mutex reply_mutex;
80753 +};
80754 +
80755 +static struct proc_dir_entry *xenbus_dev_intf;
80756 +
80757 +static ssize_t xenbus_dev_read(struct file *filp,
80758 + char __user *ubuf,
80759 + size_t len, loff_t *ppos)
80760 +{
80761 + struct xenbus_dev_data *u = filp->private_data;
80762 + int i;
80763 +
80764 + if (wait_event_interruptible(u->read_waitq,
80765 + u->read_prod != u->read_cons))
80766 + return -EINTR;
80767 +
80768 + for (i = 0; i < len; i++) {
80769 + if (u->read_cons == u->read_prod)
80770 + break;
80771 + put_user(u->read_buffer[MASK_READ_IDX(u->read_cons)], ubuf+i);
80772 + u->read_cons++;
80773 + }
80774 +
80775 + return i;
80776 +}
80777 +
80778 +static void queue_reply(struct xenbus_dev_data *u,
80779 + char *data, unsigned int len)
80780 +{
80781 + int i;
80782 +
80783 + mutex_lock(&u->reply_mutex);
80784 +
80785 + for (i = 0; i < len; i++, u->read_prod++)
80786 + u->read_buffer[MASK_READ_IDX(u->read_prod)] = data[i];
80787 +
80788 + BUG_ON((u->read_prod - u->read_cons) > sizeof(u->read_buffer));
80789 +
80790 + mutex_unlock(&u->reply_mutex);
80791 +
80792 + wake_up(&u->read_waitq);
80793 +}
80794 +
80795 +struct watch_adapter
80796 +{
80797 + struct list_head list;
80798 + struct xenbus_watch watch;
80799 + struct xenbus_dev_data *dev_data;
80800 + char *token;
80801 +};
80802 +
80803 +static void free_watch_adapter (struct watch_adapter *watch)
80804 +{
80805 + kfree(watch->watch.node);
80806 + kfree(watch->token);
80807 + kfree(watch);
80808 +}
80809 +
80810 +static void watch_fired(struct xenbus_watch *watch,
80811 + const char **vec,
80812 + unsigned int len)
80813 +{
80814 + struct watch_adapter *adap =
80815 + container_of(watch, struct watch_adapter, watch);
80816 + struct xsd_sockmsg hdr;
80817 + const char *path, *token;
80818 + int path_len, tok_len, body_len;
80819 +
80820 + path = vec[XS_WATCH_PATH];
80821 + token = adap->token;
80822 +
80823 + path_len = strlen(path) + 1;
80824 + tok_len = strlen(token) + 1;
80825 + body_len = path_len + tok_len;
80826 +
80827 + hdr.type = XS_WATCH_EVENT;
80828 + hdr.len = body_len;
80829 +
80830 + queue_reply(adap->dev_data, (char *)&hdr, sizeof(hdr));
80831 + queue_reply(adap->dev_data, (char *)path, path_len);
80832 + queue_reply(adap->dev_data, (char *)token, tok_len);
80833 +}
80834 +
80835 +static LIST_HEAD(watch_list);
80836 +
80837 +static ssize_t xenbus_dev_write(struct file *filp,
80838 + const char __user *ubuf,
80839 + size_t len, loff_t *ppos)
80840 +{
80841 + struct xenbus_dev_data *u = filp->private_data;
80842 + struct xenbus_dev_transaction *trans = NULL;
80843 + uint32_t msg_type;
80844 + void *reply;
80845 + char *path, *token;
80846 + struct watch_adapter *watch, *tmp_watch;
80847 + int err;
80848 +
80849 + if ((len + u->len) > sizeof(u->u.buffer))
80850 + return -EINVAL;
80851 +
80852 + if (copy_from_user(u->u.buffer + u->len, ubuf, len) != 0)
80853 + return -EFAULT;
80854 +
80855 + u->len += len;
80856 + if (u->len < (sizeof(u->u.msg) + u->u.msg.len))
80857 + return len;
80858 +
80859 + msg_type = u->u.msg.type;
80860 +
80861 + switch (msg_type) {
80862 + case XS_TRANSACTION_START:
80863 + case XS_TRANSACTION_END:
80864 + case XS_DIRECTORY:
80865 + case XS_READ:
80866 + case XS_GET_PERMS:
80867 + case XS_RELEASE:
80868 + case XS_GET_DOMAIN_PATH:
80869 + case XS_WRITE:
80870 + case XS_MKDIR:
80871 + case XS_RM:
80872 + case XS_SET_PERMS:
80873 + if (msg_type == XS_TRANSACTION_START) {
80874 + trans = kmalloc(sizeof(*trans), GFP_KERNEL);
80875 + if (!trans)
80876 + return -ENOMEM;
80877 + }
80878 +
80879 + reply = xenbus_dev_request_and_reply(&u->u.msg);
80880 + if (IS_ERR(reply)) {
80881 + kfree(trans);
80882 + return PTR_ERR(reply);
80883 + }
80884 +
80885 + if (msg_type == XS_TRANSACTION_START) {
80886 + trans->handle.id = simple_strtoul(reply, NULL, 0);
80887 + list_add(&trans->list, &u->transactions);
80888 + } else if (msg_type == XS_TRANSACTION_END) {
80889 + list_for_each_entry(trans, &u->transactions, list)
80890 + if (trans->handle.id == u->u.msg.tx_id)
80891 + break;
80892 + BUG_ON(&trans->list == &u->transactions);
80893 + list_del(&trans->list);
80894 + kfree(trans);
80895 + }
80896 + queue_reply(u, (char *)&u->u.msg, sizeof(u->u.msg));
80897 + queue_reply(u, (char *)reply, u->u.msg.len);
80898 + kfree(reply);
80899 + break;
80900 +
80901 + case XS_WATCH:
80902 + case XS_UNWATCH:
80903 + path = u->u.buffer + sizeof(u->u.msg);
80904 + token = memchr(path, 0, u->u.msg.len);
80905 + if (token == NULL)
80906 + return -EILSEQ;
80907 + token++;
80908 +
80909 + if (msg_type == XS_WATCH) {
80910 + static const char * XS_WATCH_RESP = "OK";
80911 + struct xsd_sockmsg hdr;
80912 +
80913 + watch = kmalloc(sizeof(*watch), GFP_KERNEL);
80914 + watch->watch.node = kmalloc(strlen(path)+1,
80915 + GFP_KERNEL);
80916 + strcpy((char *)watch->watch.node, path);
80917 + watch->watch.callback = watch_fired;
80918 + watch->token = kmalloc(strlen(token)+1, GFP_KERNEL);
80919 + strcpy(watch->token, token);
80920 + watch->dev_data = u;
80921 +
80922 + err = register_xenbus_watch(&watch->watch);
80923 + if (err) {
80924 + free_watch_adapter(watch);
80925 + return err;
80926 + }
80927 +
80928 + list_add(&watch->list, &u->watches);
80929 +
80930 + hdr.type = XS_WATCH;
80931 + hdr.len = strlen(XS_WATCH_RESP) + 1;
80932 + queue_reply(u, (char *)&hdr, sizeof(hdr));
80933 + queue_reply(u, (char *)XS_WATCH_RESP, hdr.len);
80934 + } else {
80935 + list_for_each_entry_safe(watch, tmp_watch,
80936 + &u->watches, list) {
80937 + if (!strcmp(watch->token, token) &&
80938 + !strcmp(watch->watch.node, path))
80939 +
80940 + {
80941 + unregister_xenbus_watch(&watch->watch);
80942 + list_del(&watch->list);
80943 + free_watch_adapter(watch);
80944 + break;
80945 + }
80946 + }
80947 + }
80948 +
80949 + break;
80950 +
80951 + default:
80952 + return -EINVAL;
80953 + }
80954 +
80955 + u->len = 0;
80956 + return len;
80957 +}
80958 +
80959 +static int xenbus_dev_open(struct inode *inode, struct file *filp)
80960 +{
80961 + struct xenbus_dev_data *u;
80962 +
80963 + if (xen_store_evtchn == 0)
80964 + return -ENOENT;
80965 +
80966 + nonseekable_open(inode, filp);
80967 +
80968 + u = kzalloc(sizeof(*u), GFP_KERNEL);
80969 + if (u == NULL)
80970 + return -ENOMEM;
80971 +
80972 + INIT_LIST_HEAD(&u->transactions);
80973 + INIT_LIST_HEAD(&u->watches);
80974 + init_waitqueue_head(&u->read_waitq);
80975 +
80976 + mutex_init(&u->reply_mutex);
80977 +
80978 + filp->private_data = u;
80979 +
80980 + return 0;
80981 +}
80982 +
80983 +static int xenbus_dev_release(struct inode *inode, struct file *filp)
80984 +{
80985 + struct xenbus_dev_data *u = filp->private_data;
80986 + struct xenbus_dev_transaction *trans, *tmp;
80987 + struct watch_adapter *watch, *tmp_watch;
80988 +
80989 + list_for_each_entry_safe(trans, tmp, &u->transactions, list) {
80990 + xenbus_transaction_end(trans->handle, 1);
80991 + list_del(&trans->list);
80992 + kfree(trans);
80993 + }
80994 +
80995 + list_for_each_entry_safe(watch, tmp_watch, &u->watches, list) {
80996 + unregister_xenbus_watch(&watch->watch);
80997 + list_del(&watch->list);
80998 + free_watch_adapter(watch);
80999 + }
81000 +
81001 + kfree(u);
81002 +
81003 + return 0;
81004 +}
81005 +
81006 +static unsigned int xenbus_dev_poll(struct file *file, poll_table *wait)
81007 +{
81008 + struct xenbus_dev_data *u = file->private_data;
81009 +
81010 + poll_wait(file, &u->read_waitq, wait);
81011 + if (u->read_cons != u->read_prod)
81012 + return POLLIN | POLLRDNORM;
81013 + return 0;
81014 +}
81015 +
81016 +static struct file_operations xenbus_dev_file_ops = {
81017 + .read = xenbus_dev_read,
81018 + .write = xenbus_dev_write,
81019 + .open = xenbus_dev_open,
81020 + .release = xenbus_dev_release,
81021 + .poll = xenbus_dev_poll,
81022 +};
81023 +
81024 +int __init
81025 +xenbus_dev_init(void)
81026 +{
81027 + xenbus_dev_intf = create_xen_proc_entry("xenbus", 0400);
81028 + if (xenbus_dev_intf)
81029 + xenbus_dev_intf->proc_fops = &xenbus_dev_file_ops;
81030 +
81031 + return 0;
81032 +}
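
For orientation, the /proc/xen/xenbus node created above speaks exactly the framing that xenbus_dev_write() parses: a struct xsd_sockmsg header followed by hdr.len payload bytes, with replies queued back in the same format. The following user-space sketch is not part of the patch; it assumes struct xsd_sockmsg and XS_READ as shipped in Xen's xs_wire.h header, the key path is purely illustrative, and error handling is abbreviated.

/* Sketch only: read one xenstore key through /proc/xen/xenbus. */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>
#include <xen/io/xs_wire.h>

int main(void)
{
	const char *path = "/local/domain/0/name";	/* illustrative key */
	struct xsd_sockmsg hdr = { .type = XS_READ, .req_id = 0, .tx_id = 0 };
	char buf[256];
	ssize_t n;
	int fd;

	fd = open("/proc/xen/xenbus", O_RDWR);
	if (fd < 0)
		return 1;

	/* A request is the header followed by hdr.len bytes of payload. */
	hdr.len = strlen(path) + 1;
	write(fd, &hdr, sizeof(hdr));
	write(fd, path, hdr.len);

	/* The reply is queued with the same framing: header, then payload. */
	read(fd, &hdr, sizeof(hdr));
	n = read(fd, buf, hdr.len < sizeof(buf) - 1 ? hdr.len : sizeof(buf) - 1);
	if (n > 0)
		printf("%.*s\n", (int)n, buf);

	close(fd);
	return 0;
}
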
81033 diff -Nur linux-2.6.16.33-noxen/drivers/xen/xenbus/xenbus_probe.c linux-2.6.16.33/drivers/xen/xenbus/xenbus_probe.c
81034 --- linux-2.6.16.33-noxen/drivers/xen/xenbus/xenbus_probe.c 1970-01-01 00:00:00.000000000 +0000
81035 +++ linux-2.6.16.33/drivers/xen/xenbus/xenbus_probe.c 2007-01-08 15:00:45.000000000 +0000
81036 @@ -0,0 +1,1017 @@
81037 +/******************************************************************************
81038 + * Talks to Xen Store to figure out what devices we have.
81039 + *
81040 + * Copyright (C) 2005 Rusty Russell, IBM Corporation
81041 + * Copyright (C) 2005 Mike Wray, Hewlett-Packard
81042 + * Copyright (C) 2005, 2006 XenSource Ltd
81043 + *
81044 + * This program is free software; you can redistribute it and/or
81045 + * modify it under the terms of the GNU General Public License version 2
81046 + * as published by the Free Software Foundation; or, when distributed
81047 + * separately from the Linux kernel or incorporated into other
81048 + * software packages, subject to the following license:
81049 + *
81050 + * Permission is hereby granted, free of charge, to any person obtaining a copy
81051 + * of this source file (the "Software"), to deal in the Software without
81052 + * restriction, including without limitation the rights to use, copy, modify,
81053 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
81054 + * and to permit persons to whom the Software is furnished to do so, subject to
81055 + * the following conditions:
81056 + *
81057 + * The above copyright notice and this permission notice shall be included in
81058 + * all copies or substantial portions of the Software.
81059 + *
81060 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
81061 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
81062 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
81063 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
81064 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
81065 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
81066 + * IN THE SOFTWARE.
81067 + */
81068 +
81069 +#define DPRINTK(fmt, args...) \
81070 + pr_debug("xenbus_probe (%s:%d) " fmt ".\n", \
81071 + __FUNCTION__, __LINE__, ##args)
81072 +
81073 +#include <linux/kernel.h>
81074 +#include <linux/err.h>
81075 +#include <linux/string.h>
81076 +#include <linux/ctype.h>
81077 +#include <linux/fcntl.h>
81078 +#include <linux/mm.h>
81079 +#include <linux/notifier.h>
81080 +#include <linux/kthread.h>
81081 +#include <linux/mutex.h>
81082 +
81083 +#include <asm/io.h>
81084 +#include <asm/page.h>
81085 +#include <asm/maddr.h>
81086 +#include <asm/pgtable.h>
81087 +#include <asm/hypervisor.h>
81088 +#include <xen/xenbus.h>
81089 +#include <xen/xen_proc.h>
81090 +#include <xen/evtchn.h>
81091 +#include <xen/features.h>
81092 +#include <xen/hvm.h>
81093 +
81094 +#include "xenbus_comms.h"
81095 +#include "xenbus_probe.h"
81096 +
81097 +#ifdef HAVE_XEN_PLATFORM_COMPAT_H
81098 +#include <xen/platform-compat.h>
81099 +#endif
81100 +
81101 +int xen_store_evtchn;
81102 +struct xenstore_domain_interface *xen_store_interface;
81103 +static unsigned long xen_store_mfn;
81104 +
81105 +extern struct mutex xenwatch_mutex;
81106 +
81107 +static struct notifier_block *xenstore_chain;
81108 +
81109 +static void wait_for_devices(struct xenbus_driver *xendrv);
81110 +
81111 +static int xenbus_probe_frontend(const char *type, const char *name);
81112 +
81113 +static void xenbus_dev_shutdown(struct device *_dev);
81114 +
81115 +/* If something in array of ids matches this device, return it. */
81116 +static const struct xenbus_device_id *
81117 +match_device(const struct xenbus_device_id *arr, struct xenbus_device *dev)
81118 +{
81119 + for (; *arr->devicetype != '\0'; arr++) {
81120 + if (!strcmp(arr->devicetype, dev->devicetype))
81121 + return arr;
81122 + }
81123 + return NULL;
81124 +}
81125 +
81126 +int xenbus_match(struct device *_dev, struct device_driver *_drv)
81127 +{
81128 + struct xenbus_driver *drv = to_xenbus_driver(_drv);
81129 +
81130 + if (!drv->ids)
81131 + return 0;
81132 +
81133 + return match_device(drv->ids, to_xenbus_device(_dev)) != NULL;
81134 +}
81135 +
81136 +/* device/<type>/<id> => <type>-<id> */
81137 +static int frontend_bus_id(char bus_id[BUS_ID_SIZE], const char *nodename)
81138 +{
81139 + nodename = strchr(nodename, '/');
81140 + if (!nodename || strlen(nodename + 1) >= BUS_ID_SIZE) {
81141 + printk(KERN_WARNING "XENBUS: bad frontend %s\n", nodename);
81142 + return -EINVAL;
81143 + }
81144 +
81145 + strlcpy(bus_id, nodename + 1, BUS_ID_SIZE);
81146 + if (!strchr(bus_id, '/')) {
81147 + printk(KERN_WARNING "XENBUS: bus_id %s no slash\n", bus_id);
81148 + return -EINVAL;
81149 + }
81150 + *strchr(bus_id, '/') = '-';
81151 + return 0;
81152 +}
81153 +
81154 +
81155 +static void free_otherend_details(struct xenbus_device *dev)
81156 +{
81157 + kfree(dev->otherend);
81158 + dev->otherend = NULL;
81159 +}
81160 +
81161 +
81162 +static void free_otherend_watch(struct xenbus_device *dev)
81163 +{
81164 + if (dev->otherend_watch.node) {
81165 + unregister_xenbus_watch(&dev->otherend_watch);
81166 + kfree(dev->otherend_watch.node);
81167 + dev->otherend_watch.node = NULL;
81168 + }
81169 +}
81170 +
81171 +
81172 +int read_otherend_details(struct xenbus_device *xendev,
81173 + char *id_node, char *path_node)
81174 +{
81175 + int err = xenbus_gather(XBT_NIL, xendev->nodename,
81176 + id_node, "%i", &xendev->otherend_id,
81177 + path_node, NULL, &xendev->otherend,
81178 + NULL);
81179 + if (err) {
81180 + xenbus_dev_fatal(xendev, err,
81181 + "reading other end details from %s",
81182 + xendev->nodename);
81183 + return err;
81184 + }
81185 + if (strlen(xendev->otherend) == 0 ||
81186 + !xenbus_exists(XBT_NIL, xendev->otherend, "")) {
81187 + xenbus_dev_fatal(xendev, -ENOENT,
81188 + "unable to read other end from %s. "
81189 + "missing or inaccessible.",
81190 + xendev->nodename);
81191 + free_otherend_details(xendev);
81192 + return -ENOENT;
81193 + }
81194 +
81195 + return 0;
81196 +}
81197 +
81198 +
81199 +static int read_backend_details(struct xenbus_device *xendev)
81200 +{
81201 + return read_otherend_details(xendev, "backend-id", "backend");
81202 +}
81203 +
81204 +
81205 +/* Bus type for frontend drivers. */
81206 +static struct xen_bus_type xenbus_frontend = {
81207 + .root = "device",
81208 + .levels = 2, /* device/type/<id> */
81209 + .get_bus_id = frontend_bus_id,
81210 + .probe = xenbus_probe_frontend,
81211 + .bus = {
81212 + .name = "xen",
81213 + .match = xenbus_match,
81214 +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,16)
81215 + .probe = xenbus_dev_probe,
81216 + .remove = xenbus_dev_remove,
81217 + .shutdown = xenbus_dev_shutdown,
81218 +#endif
81219 + },
81220 + .dev = {
81221 + .bus_id = "xen",
81222 + },
81223 +};
81224 +
81225 +static void otherend_changed(struct xenbus_watch *watch,
81226 + const char **vec, unsigned int len)
81227 +{
81228 + struct xenbus_device *dev =
81229 + container_of(watch, struct xenbus_device, otherend_watch);
81230 + struct xenbus_driver *drv = to_xenbus_driver(dev->dev.driver);
81231 + enum xenbus_state state;
81232 +
81233 + /* Protect us against watches firing on old details when the otherend
81234 + details change, say immediately after a resume. */
81235 + if (!dev->otherend ||
81236 + strncmp(dev->otherend, vec[XS_WATCH_PATH],
81237 + strlen(dev->otherend))) {
81238 + DPRINTK("Ignoring watch at %s", vec[XS_WATCH_PATH]);
81239 + return;
81240 + }
81241 +
81242 + state = xenbus_read_driver_state(dev->otherend);
81243 +
81244 + DPRINTK("state is %d (%s), %s, %s", state, xenbus_strstate(state),
81245 + dev->otherend_watch.node, vec[XS_WATCH_PATH]);
81246 +
81247 + /*
81248 + * Ignore xenbus transitions during shutdown. This prevents us doing
81249 + * work that can fail e.g., when the rootfs is gone.
81250 + */
81251 + if (system_state > SYSTEM_RUNNING) {
81252 + struct xen_bus_type *bus =
81253 + container_of(dev->dev.bus, struct xen_bus_type, bus);
81254 + /* If we're frontend, drive the state machine to Closed. */
81255 + /* This should cause the backend to release our resources. */
81256 + if ((bus == &xenbus_frontend) && (state == XenbusStateClosing))
81257 + xenbus_frontend_closed(dev);
81258 + return;
81259 + }
81260 +
81261 + if (drv->otherend_changed)
81262 + drv->otherend_changed(dev, state);
81263 +}
81264 +
81265 +
81266 +static int talk_to_otherend(struct xenbus_device *dev)
81267 +{
81268 + struct xenbus_driver *drv = to_xenbus_driver(dev->dev.driver);
81269 +
81270 + free_otherend_watch(dev);
81271 + free_otherend_details(dev);
81272 +
81273 + return drv->read_otherend_details(dev);
81274 +}
81275 +
81276 +
81277 +static int watch_otherend(struct xenbus_device *dev)
81278 +{
81279 + return xenbus_watch_path2(dev, dev->otherend, "state",
81280 + &dev->otherend_watch, otherend_changed);
81281 +}
81282 +
81283 +
81284 +int xenbus_dev_probe(struct device *_dev)
81285 +{
81286 + struct xenbus_device *dev = to_xenbus_device(_dev);
81287 + struct xenbus_driver *drv = to_xenbus_driver(_dev->driver);
81288 + const struct xenbus_device_id *id;
81289 + int err;
81290 +
81291 + DPRINTK("%s", dev->nodename);
81292 +
81293 + if (!drv->probe) {
81294 + err = -ENODEV;
81295 + goto fail;
81296 + }
81297 +
81298 + id = match_device(drv->ids, dev);
81299 + if (!id) {
81300 + err = -ENODEV;
81301 + goto fail;
81302 + }
81303 +
81304 + err = talk_to_otherend(dev);
81305 + if (err) {
81306 + printk(KERN_WARNING
81307 + "xenbus_probe: talk_to_otherend on %s failed.\n",
81308 + dev->nodename);
81309 + return err;
81310 + }
81311 +
81312 + err = drv->probe(dev, id);
81313 + if (err)
81314 + goto fail;
81315 +
81316 + err = watch_otherend(dev);
81317 + if (err) {
81318 + printk(KERN_WARNING
81319 + "xenbus_probe: watch_otherend on %s failed.\n",
81320 + dev->nodename);
81321 + return err;
81322 + }
81323 +
81324 + return 0;
81325 +fail:
81326 + xenbus_dev_error(dev, err, "xenbus_dev_probe on %s", dev->nodename);
81327 + xenbus_switch_state(dev, XenbusStateClosed);
81328 + return -ENODEV;
81329 +}
81330 +
81331 +int xenbus_dev_remove(struct device *_dev)
81332 +{
81333 + struct xenbus_device *dev = to_xenbus_device(_dev);
81334 + struct xenbus_driver *drv = to_xenbus_driver(_dev->driver);
81335 +
81336 + DPRINTK("%s", dev->nodename);
81337 +
81338 + free_otherend_watch(dev);
81339 + free_otherend_details(dev);
81340 +
81341 + if (drv->remove)
81342 + drv->remove(dev);
81343 +
81344 + xenbus_switch_state(dev, XenbusStateClosed);
81345 + return 0;
81346 +}
81347 +
81348 +static void xenbus_dev_shutdown(struct device *_dev)
81349 +{
81350 + struct xenbus_device *dev = to_xenbus_device(_dev);
81351 + unsigned long timeout = 5*HZ;
81352 +
81353 + DPRINTK("%s", dev->nodename);
81354 +
81355 + get_device(&dev->dev);
81356 + if (dev->state != XenbusStateConnected) {
81357 + printk(KERN_INFO "%s: %s: %s != Connected, skipping\n", __FUNCTION__,
81358 + dev->nodename, xenbus_strstate(dev->state));
81359 + goto out;
81360 + }
81361 + xenbus_switch_state(dev, XenbusStateClosing);
81362 + timeout = wait_for_completion_timeout(&dev->down, timeout);
81363 + if (!timeout)
81364 + printk(KERN_INFO "%s: %s timeout closing device\n", __FUNCTION__, dev->nodename);
81365 + out:
81366 + put_device(&dev->dev);
81367 +}
81368 +
81369 +int xenbus_register_driver_common(struct xenbus_driver *drv,
81370 + struct xen_bus_type *bus)
81371 +{
81372 + int ret;
81373 +
81374 + drv->driver.name = drv->name;
81375 + drv->driver.bus = &bus->bus;
81376 +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,10)
81377 + drv->driver.owner = drv->owner;
81378 +#endif
81379 +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,16)
81380 + drv->driver.probe = xenbus_dev_probe;
81381 + drv->driver.remove = xenbus_dev_remove;
81382 + drv->driver.shutdown = xenbus_dev_shutdown;
81383 +#endif
81384 +
81385 + mutex_lock(&xenwatch_mutex);
81386 + ret = driver_register(&drv->driver);
81387 + mutex_unlock(&xenwatch_mutex);
81388 + return ret;
81389 +}
81390 +
81391 +int xenbus_register_frontend(struct xenbus_driver *drv)
81392 +{
81393 + int ret;
81394 +
81395 + drv->read_otherend_details = read_backend_details;
81396 +
81397 + ret = xenbus_register_driver_common(drv, &xenbus_frontend);
81398 + if (ret)
81399 + return ret;
81400 +
81401 + /* If this driver is loaded as a module wait for devices to attach. */
81402 + wait_for_devices(drv);
81403 +
81404 + return 0;
81405 +}
81406 +EXPORT_SYMBOL_GPL(xenbus_register_frontend);
81407 +
81408 +void xenbus_unregister_driver(struct xenbus_driver *drv)
81409 +{
81410 + driver_unregister(&drv->driver);
81411 +}
81412 +EXPORT_SYMBOL_GPL(xenbus_unregister_driver);
81413 +
81414 +struct xb_find_info
81415 +{
81416 + struct xenbus_device *dev;
81417 + const char *nodename;
81418 +};
81419 +
81420 +static int cmp_dev(struct device *dev, void *data)
81421 +{
81422 + struct xenbus_device *xendev = to_xenbus_device(dev);
81423 + struct xb_find_info *info = data;
81424 +
81425 + if (!strcmp(xendev->nodename, info->nodename)) {
81426 + info->dev = xendev;
81427 + get_device(dev);
81428 + return 1;
81429 + }
81430 + return 0;
81431 +}
81432 +
81433 +struct xenbus_device *xenbus_device_find(const char *nodename,
81434 + struct bus_type *bus)
81435 +{
81436 + struct xb_find_info info = { .dev = NULL, .nodename = nodename };
81437 +
81438 + bus_for_each_dev(bus, NULL, &info, cmp_dev);
81439 + return info.dev;
81440 +}
81441 +
81442 +static int cleanup_dev(struct device *dev, void *data)
81443 +{
81444 + struct xenbus_device *xendev = to_xenbus_device(dev);
81445 + struct xb_find_info *info = data;
81446 + int len = strlen(info->nodename);
81447 +
81448 + DPRINTK("%s", info->nodename);
81449 +
81450 + /* Match the info->nodename path, or any subdirectory of that path. */
81451 + if (strncmp(xendev->nodename, info->nodename, len))
81452 + return 0;
81453 +
81454 + /* If the node name is longer, ensure it really is a subdirectory. */
81455 + if ((strlen(xendev->nodename) > len) && (xendev->nodename[len] != '/'))
81456 + return 0;
81457 +
81458 + info->dev = xendev;
81459 + get_device(dev);
81460 + return 1;
81461 +}
81462 +
81463 +static void xenbus_cleanup_devices(const char *path, struct bus_type *bus)
81464 +{
81465 + struct xb_find_info info = { .nodename = path };
81466 +
81467 + do {
81468 + info.dev = NULL;
81469 + bus_for_each_dev(bus, NULL, &info, cleanup_dev);
81470 + if (info.dev) {
81471 + device_unregister(&info.dev->dev);
81472 + put_device(&info.dev->dev);
81473 + }
81474 + } while (info.dev);
81475 +}
81476 +
81477 +static void xenbus_dev_release(struct device *dev)
81478 +{
81479 + if (dev)
81480 + kfree(to_xenbus_device(dev));
81481 +}
81482 +
81483 +static ssize_t xendev_show_nodename(struct device *dev,
81484 +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,13)
81485 + struct device_attribute *attr,
81486 +#endif
81487 + char *buf)
81488 +{
81489 + return sprintf(buf, "%s\n", to_xenbus_device(dev)->nodename);
81490 +}
81491 +DEVICE_ATTR(nodename, S_IRUSR | S_IRGRP | S_IROTH, xendev_show_nodename, NULL);
81492 +
81493 +static ssize_t xendev_show_devtype(struct device *dev,
81494 +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,13)
81495 + struct device_attribute *attr,
81496 +#endif
81497 + char *buf)
81498 +{
81499 + return sprintf(buf, "%s\n", to_xenbus_device(dev)->devicetype);
81500 +}
81501 +DEVICE_ATTR(devtype, S_IRUSR | S_IRGRP | S_IROTH, xendev_show_devtype, NULL);
81502 +
81503 +
81504 +int xenbus_probe_node(struct xen_bus_type *bus,
81505 + const char *type,
81506 + const char *nodename)
81507 +{
81508 + int err;
81509 + struct xenbus_device *xendev;
81510 + size_t stringlen;
81511 + char *tmpstring;
81512 +
81513 + enum xenbus_state state = xenbus_read_driver_state(nodename);
81514 +
81515 + if (state != XenbusStateInitialising) {
81516 + /* Device is not new, so ignore it. This can happen if a
81517 + device is going away after switching to Closed. */
81518 + return 0;
81519 + }
81520 +
81521 + stringlen = strlen(nodename) + 1 + strlen(type) + 1;
81522 + xendev = kzalloc(sizeof(*xendev) + stringlen, GFP_KERNEL);
81523 + if (!xendev)
81524 + return -ENOMEM;
81525 +
81526 + xendev->state = XenbusStateInitialising;
81527 +
81528 + /* Copy the strings into the extra space. */
81529 +
81530 + tmpstring = (char *)(xendev + 1);
81531 + strcpy(tmpstring, nodename);
81532 + xendev->nodename = tmpstring;
81533 +
81534 + tmpstring += strlen(tmpstring) + 1;
81535 + strcpy(tmpstring, type);
81536 + xendev->devicetype = tmpstring;
81537 + init_completion(&xendev->down);
81538 +
81539 + xendev->dev.parent = &bus->dev;
81540 + xendev->dev.bus = &bus->bus;
81541 + xendev->dev.release = xenbus_dev_release;
81542 +
81543 + err = bus->get_bus_id(xendev->dev.bus_id, xendev->nodename);
81544 + if (err)
81545 + goto fail;
81546 +
81547 + /* Register with generic device framework. */
81548 + err = device_register(&xendev->dev);
81549 + if (err)
81550 + goto fail;
81551 +
81552 + device_create_file(&xendev->dev, &dev_attr_nodename);
81553 + device_create_file(&xendev->dev, &dev_attr_devtype);
81554 +
81555 + return 0;
81556 +fail:
81557 + kfree(xendev);
81558 + return err;
81559 +}
81560 +
81561 +/* device/<typename>/<name> */
81562 +static int xenbus_probe_frontend(const char *type, const char *name)
81563 +{
81564 + char *nodename;
81565 + int err;
81566 +
81567 + nodename = kasprintf(GFP_KERNEL, "%s/%s/%s", xenbus_frontend.root, type, name);
81568 + if (!nodename)
81569 + return -ENOMEM;
81570 +
81571 + DPRINTK("%s", nodename);
81572 +
81573 + err = xenbus_probe_node(&xenbus_frontend, type, nodename);
81574 + kfree(nodename);
81575 + return err;
81576 +}
81577 +
81578 +static int xenbus_probe_device_type(struct xen_bus_type *bus, const char *type)
81579 +{
81580 + int err = 0;
81581 + char **dir;
81582 + unsigned int dir_n = 0;
81583 + int i;
81584 +
81585 + dir = xenbus_directory(XBT_NIL, bus->root, type, &dir_n);
81586 + if (IS_ERR(dir))
81587 + return PTR_ERR(dir);
81588 +
81589 + for (i = 0; i < dir_n; i++) {
81590 + err = bus->probe(type, dir[i]);
81591 + if (err)
81592 + break;
81593 + }
81594 + kfree(dir);
81595 + return err;
81596 +}
81597 +
81598 +int xenbus_probe_devices(struct xen_bus_type *bus)
81599 +{
81600 + int err = 0;
81601 + char **dir;
81602 + unsigned int i, dir_n;
81603 +
81604 + dir = xenbus_directory(XBT_NIL, bus->root, "", &dir_n);
81605 + if (IS_ERR(dir))
81606 + return PTR_ERR(dir);
81607 +
81608 + for (i = 0; i < dir_n; i++) {
81609 + err = xenbus_probe_device_type(bus, dir[i]);
81610 + if (err)
81611 + break;
81612 + }
81613 + kfree(dir);
81614 + return err;
81615 +}
81616 +
81617 +static unsigned int char_count(const char *str, char c)
81618 +{
81619 + unsigned int i, ret = 0;
81620 +
81621 + for (i = 0; str[i]; i++)
81622 + if (str[i] == c)
81623 + ret++;
81624 + return ret;
81625 +}
81626 +
81627 +static int strsep_len(const char *str, char c, unsigned int len)
81628 +{
81629 + unsigned int i;
81630 +
81631 + for (i = 0; str[i]; i++)
81632 + if (str[i] == c) {
81633 + if (len == 0)
81634 + return i;
81635 + len--;
81636 + }
81637 + return (len == 0) ? i : -ERANGE;
81638 +}
81639 +
81640 +void dev_changed(const char *node, struct xen_bus_type *bus)
81641 +{
81642 + int exists, rootlen;
81643 + struct xenbus_device *dev;
81644 + char type[BUS_ID_SIZE];
81645 + const char *p, *root;
81646 +
81647 + if (char_count(node, '/') < 2)
81648 + return;
81649 +
81650 + exists = xenbus_exists(XBT_NIL, node, "");
81651 + if (!exists) {
81652 + xenbus_cleanup_devices(node, &bus->bus);
81653 + return;
81654 + }
81655 +
81656 + /* backend/<type>/... or device/<type>/... */
81657 + p = strchr(node, '/') + 1;
81658 + snprintf(type, BUS_ID_SIZE, "%.*s", (int)strcspn(p, "/"), p);
81659 + type[BUS_ID_SIZE-1] = '\0';
81660 +
81661 + rootlen = strsep_len(node, '/', bus->levels);
81662 + if (rootlen < 0)
81663 + return;
81664 + root = kasprintf(GFP_KERNEL, "%.*s", rootlen, node);
81665 + if (!root)
81666 + return;
81667 +
81668 + dev = xenbus_device_find(root, &bus->bus);
81669 + if (!dev)
81670 + xenbus_probe_node(bus, type, root);
81671 + else
81672 + put_device(&dev->dev);
81673 +
81674 + kfree(root);
81675 +}
81676 +
81677 +static void frontend_changed(struct xenbus_watch *watch,
81678 + const char **vec, unsigned int len)
81679 +{
81680 + DPRINTK("");
81681 +
81682 + dev_changed(vec[XS_WATCH_PATH], &xenbus_frontend);
81683 +}
81684 +
81685 +/* We watch for devices appearing and vanishing. */
81686 +static struct xenbus_watch fe_watch = {
81687 + .node = "device",
81688 + .callback = frontend_changed,
81689 +};
81690 +
81691 +static int suspend_dev(struct device *dev, void *data)
81692 +{
81693 + int err = 0;
81694 + struct xenbus_driver *drv;
81695 + struct xenbus_device *xdev;
81696 +
81697 + DPRINTK("");
81698 +
81699 + if (dev->driver == NULL)
81700 + return 0;
81701 + drv = to_xenbus_driver(dev->driver);
81702 + xdev = container_of(dev, struct xenbus_device, dev);
81703 + if (drv->suspend)
81704 + err = drv->suspend(xdev);
81705 + if (err)
81706 + printk(KERN_WARNING
81707 + "xenbus: suspend %s failed: %i\n", dev->bus_id, err);
81708 + return 0;
81709 +}
81710 +
81711 +static int resume_dev(struct device *dev, void *data)
81712 +{
81713 + int err;
81714 + struct xenbus_driver *drv;
81715 + struct xenbus_device *xdev;
81716 +
81717 + DPRINTK("");
81718 +
81719 + if (dev->driver == NULL)
81720 + return 0;
81721 +
81722 + drv = to_xenbus_driver(dev->driver);
81723 + xdev = container_of(dev, struct xenbus_device, dev);
81724 +
81725 + err = talk_to_otherend(xdev);
81726 + if (err) {
81727 + printk(KERN_WARNING
81728 + "xenbus: resume (talk_to_otherend) %s failed: %i\n",
81729 + dev->bus_id, err);
81730 + return err;
81731 + }
81732 +
81733 + xdev->state = XenbusStateInitialising;
81734 +
81735 + if (drv->resume) {
81736 + err = drv->resume(xdev);
81737 + if (err) {
81738 + printk(KERN_WARNING
81739 + "xenbus: resume %s failed: %i\n",
81740 + dev->bus_id, err);
81741 + return err;
81742 + }
81743 + }
81744 +
81745 + err = watch_otherend(xdev);
81746 + if (err) {
81747 + printk(KERN_WARNING
81748 + "xenbus_probe: resume (watch_otherend) %s failed: "
81749 + "%d.\n", dev->bus_id, err);
81750 + return err;
81751 + }
81752 +
81753 + return 0;
81754 +}
81755 +
81756 +void xenbus_suspend(void)
81757 +{
81758 + DPRINTK("");
81759 +
81760 + bus_for_each_dev(&xenbus_frontend.bus, NULL, NULL, suspend_dev);
81761 + xenbus_backend_suspend(suspend_dev);
81762 + xs_suspend();
81763 +}
81764 +EXPORT_SYMBOL_GPL(xenbus_suspend);
81765 +
81766 +void xenbus_resume(void)
81767 +{
81768 + xb_init_comms();
81769 + xs_resume();
81770 + bus_for_each_dev(&xenbus_frontend.bus, NULL, NULL, resume_dev);
81771 + xenbus_backend_resume(resume_dev);
81772 +}
81773 +EXPORT_SYMBOL_GPL(xenbus_resume);
81774 +
81775 +
81776 +/* A flag to determine if xenstored is 'ready' (i.e. has started) */
81777 +int xenstored_ready = 0;
81778 +
81779 +
81780 +int register_xenstore_notifier(struct notifier_block *nb)
81781 +{
81782 + int ret = 0;
81783 +
81784 + if (xenstored_ready > 0)
81785 + ret = nb->notifier_call(nb, 0, NULL);
81786 + else
81787 + notifier_chain_register(&xenstore_chain, nb);
81788 +
81789 + return ret;
81790 +}
81791 +EXPORT_SYMBOL_GPL(register_xenstore_notifier);
81792 +
81793 +void unregister_xenstore_notifier(struct notifier_block *nb)
81794 +{
81795 + notifier_chain_unregister(&xenstore_chain, nb);
81796 +}
81797 +EXPORT_SYMBOL_GPL(unregister_xenstore_notifier);
81798 +
81799 +
81800 +void xenbus_probe(void *unused)
81801 +{
81802 + BUG_ON((xenstored_ready <= 0));
81803 +
81804 + /* Enumerate devices in xenstore and watch for changes. */
81805 + xenbus_probe_devices(&xenbus_frontend);
81806 + register_xenbus_watch(&fe_watch);
81807 + xenbus_backend_probe_and_watch();
81808 +
81809 + /* Notify others that xenstore is up */
81810 + notifier_call_chain(&xenstore_chain, 0, NULL);
81811 +}
81812 +
81813 +
81814 +#if defined(CONFIG_PROC_FS) && defined(CONFIG_XEN_PRIVILEGED_GUEST)
81815 +static struct file_operations xsd_kva_fops;
81816 +static struct proc_dir_entry *xsd_kva_intf;
81817 +static struct proc_dir_entry *xsd_port_intf;
81818 +
81819 +static int xsd_kva_mmap(struct file *file, struct vm_area_struct *vma)
81820 +{
81821 + size_t size = vma->vm_end - vma->vm_start;
81822 +
81823 + if ((size > PAGE_SIZE) || (vma->vm_pgoff != 0))
81824 + return -EINVAL;
81825 +
81826 + if (remap_pfn_range(vma, vma->vm_start, mfn_to_pfn(xen_store_mfn),
81827 + size, vma->vm_page_prot))
81828 + return -EAGAIN;
81829 +
81830 + return 0;
81831 +}
81832 +
81833 +static int xsd_kva_read(char *page, char **start, off_t off,
81834 + int count, int *eof, void *data)
81835 +{
81836 + int len;
81837 +
81838 + len = sprintf(page, "0x%p", xen_store_interface);
81839 + *eof = 1;
81840 + return len;
81841 +}
81842 +
81843 +static int xsd_port_read(char *page, char **start, off_t off,
81844 + int count, int *eof, void *data)
81845 +{
81846 + int len;
81847 +
81848 + len = sprintf(page, "%d", xen_store_evtchn);
81849 + *eof = 1;
81850 + return len;
81851 +}
81852 +#endif
81853 +
81854 +static int __init xenbus_probe_init(void)
81855 +{
81856 + int err = 0;
81857 + unsigned long page = 0;
81858 +
81859 + DPRINTK("");
81860 +
81861 + if (!is_running_on_xen())
81862 + return -ENODEV;
81863 +
81864 + /* Register ourselves with the kernel bus subsystem */
81865 + bus_register(&xenbus_frontend.bus);
81866 + xenbus_backend_bus_register();
81867 +
81868 + /*
81869 + * Domain0 doesn't have a store_evtchn or store_mfn yet.
81870 + */
81871 + if (is_initial_xendomain()) {
81872 + struct evtchn_alloc_unbound alloc_unbound;
81873 +
81874 + /* Allocate page. */
81875 + page = get_zeroed_page(GFP_KERNEL);
81876 + if (!page)
81877 + return -ENOMEM;
81878 +
81879 + xen_store_mfn = xen_start_info->store_mfn =
81880 + pfn_to_mfn(virt_to_phys((void *)page) >>
81881 + PAGE_SHIFT);
81882 +
81883 + /* Next allocate a local port which xenstored can bind to */
81884 + alloc_unbound.dom = DOMID_SELF;
81885 + alloc_unbound.remote_dom = 0;
81886 +
81887 + err = HYPERVISOR_event_channel_op(EVTCHNOP_alloc_unbound,
81888 + &alloc_unbound);
81889 + if (err == -ENOSYS)
81890 + goto err;
81891 + BUG_ON(err);
81892 + xen_store_evtchn = xen_start_info->store_evtchn =
81893 + alloc_unbound.port;
81894 +
81895 +#if defined(CONFIG_PROC_FS) && defined(CONFIG_XEN_PRIVILEGED_GUEST)
81896 + /* And finally publish the above info in /proc/xen */
81897 + xsd_kva_intf = create_xen_proc_entry("xsd_kva", 0600);
81898 + if (xsd_kva_intf) {
81899 + memcpy(&xsd_kva_fops, xsd_kva_intf->proc_fops,
81900 + sizeof(xsd_kva_fops));
81901 + xsd_kva_fops.mmap = xsd_kva_mmap;
81902 + xsd_kva_intf->proc_fops = &xsd_kva_fops;
81903 + xsd_kva_intf->read_proc = xsd_kva_read;
81904 + }
81905 + xsd_port_intf = create_xen_proc_entry("xsd_port", 0400);
81906 + if (xsd_port_intf)
81907 + xsd_port_intf->read_proc = xsd_port_read;
81908 +#endif
81909 + xen_store_interface = mfn_to_virt(xen_store_mfn);
81910 + } else {
81911 + xenstored_ready = 1;
81912 +#ifdef CONFIG_XEN
81913 + xen_store_evtchn = xen_start_info->store_evtchn;
81914 + xen_store_mfn = xen_start_info->store_mfn;
81915 + xen_store_interface = mfn_to_virt(xen_store_mfn);
81916 +#else
81917 + xen_store_evtchn = hvm_get_parameter(HVM_PARAM_STORE_EVTCHN);
81918 + xen_store_mfn = hvm_get_parameter(HVM_PARAM_STORE_PFN);
81919 + xen_store_interface = ioremap(xen_store_mfn << PAGE_SHIFT,
81920 + PAGE_SIZE);
81921 +#endif
81922 + }
81923 +
81924 +
81925 + xenbus_dev_init();
81926 +
81927 + /* Initialize the interface to xenstore. */
81928 + err = xs_init();
81929 + if (err) {
81930 + printk(KERN_WARNING
81931 + "XENBUS: Error initializing xenstore comms: %i\n", err);
81932 + goto err;
81933 + }
81934 +
81935 + /* Register ourselves with the kernel device subsystem */
81936 + device_register(&xenbus_frontend.dev);
81937 + xenbus_backend_device_register();
81938 +
81939 + if (!is_initial_xendomain())
81940 + xenbus_probe(NULL);
81941 +
81942 + return 0;
81943 +
81944 + err:
81945 + if (page)
81946 + free_page(page);
81947 +
81948 + /*
81949 + * Do not unregister the xenbus front/backend buses here. The buses
81950 + * must exist because front/backend drivers will use them when they are
81951 + * registered.
81952 + */
81953 +
81954 + return err;
81955 +}
81956 +
81957 +postcore_initcall(xenbus_probe_init);
81958 +
81959 +MODULE_LICENSE("Dual BSD/GPL");
81960 +
81961 +
81962 +static int is_disconnected_device(struct device *dev, void *data)
81963 +{
81964 + struct xenbus_device *xendev = to_xenbus_device(dev);
81965 + struct device_driver *drv = data;
81966 +
81967 + /*
81968 + * A device with no driver will never connect. We care only about
81969 + * devices which should currently be in the process of connecting.
81970 + */
81971 + if (!dev->driver)
81972 + return 0;
81973 +
81974 + /* Is this search limited to a particular driver? */
81975 + if (drv && (dev->driver != drv))
81976 + return 0;
81977 +
81978 + return (xendev->state != XenbusStateConnected);
81979 +}
81980 +
81981 +static int exists_disconnected_device(struct device_driver *drv)
81982 +{
81983 + return bus_for_each_dev(&xenbus_frontend.bus, NULL, drv,
81984 + is_disconnected_device);
81985 +}
81986 +
81987 +static int print_device_status(struct device *dev, void *data)
81988 +{
81989 + struct xenbus_device *xendev = to_xenbus_device(dev);
81990 + struct device_driver *drv = data;
81991 +
81992 + /* Is this operation limited to a particular driver? */
81993 + if (drv && (dev->driver != drv))
81994 + return 0;
81995 +
81996 + if (!dev->driver) {
81997 + /* Information only: is this too noisy? */
81998 + printk(KERN_INFO "XENBUS: Device with no driver: %s\n",
81999 + xendev->nodename);
82000 + } else if (xendev->state != XenbusStateConnected) {
82001 + printk(KERN_WARNING "XENBUS: Timeout connecting "
82002 + "to device: %s (state %d)\n",
82003 + xendev->nodename, xendev->state);
82004 + }
82005 +
82006 + return 0;
82007 +}
82008 +
82009 +/* We only wait for device setup after most initcalls have run. */
82010 +static int ready_to_wait_for_devices;
82011 +
82012 +/*
82013 + * On a 10 second timeout, wait for all devices currently configured. We need
82014 + * to do this to guarantee that the filesystems and / or network devices
82015 + * needed for boot are available, before we can allow the boot to proceed.
82016 + *
82017 + * This needs to be on a late_initcall, to happen after the frontend device
82018 + * drivers have been initialised, but before the root fs is mounted.
82019 + *
82020 + * A possible improvement here would be to have the tools add a per-device
82021 + * flag to the store entry, indicating whether it is needed at boot time.
82022 + * This would allow people who knew what they were doing to accelerate their
82023 + * boot slightly, but of course needs tools or manual intervention to set up
82024 + * those flags correctly.
82025 + */
82026 +static void wait_for_devices(struct xenbus_driver *xendrv)
82027 +{
82028 + unsigned long timeout = jiffies + 10*HZ;
82029 + struct device_driver *drv = xendrv ? &xendrv->driver : NULL;
82030 +
82031 + if (!ready_to_wait_for_devices || !is_running_on_xen())
82032 + return;
82033 +
82034 + while (exists_disconnected_device(drv)) {
82035 + if (time_after(jiffies, timeout))
82036 + break;
82037 + schedule_timeout_interruptible(HZ/10);
82038 + }
82039 +
82040 + bus_for_each_dev(&xenbus_frontend.bus, NULL, drv,
82041 + print_device_status);
82042 +}
82043 +
82044 +#ifndef MODULE
82045 +static int __init boot_wait_for_devices(void)
82046 +{
82047 + ready_to_wait_for_devices = 1;
82048 + wait_for_devices(NULL);
82049 + return 0;
82050 +}
82051 +
82052 +late_initcall(boot_wait_for_devices);
82053 +#endif
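
To make the registration path above concrete, a frontend driver built against this file looks roughly as follows. This is a sketch only: the "demofront" name and the "demo" device type are invented, and a real driver would also set up its rings and event channels in the probe routine.

#include <linux/module.h>
#include <xen/xenbus.h>

/* Matches xenstore nodes of the form device/demo/<id>; the empty entry
 * terminates the table, as match_device() above expects. */
static const struct xenbus_device_id demofront_ids[] = {
	{ "demo" },
	{ "" }
};

static int demofront_probe(struct xenbus_device *dev,
			   const struct xenbus_device_id *id)
{
	/* Allocate per-device state and publish ring/event-channel details
	 * in xenstore here, then tell the backend we are ready. */
	return xenbus_switch_state(dev, XenbusStateInitialised);
}

/* Invoked via otherend_changed() above whenever the backend's state node
 * changes; mirror the backend through the shutdown states. */
static void demofront_otherend_changed(struct xenbus_device *dev,
				       enum xenbus_state backend_state)
{
	switch (backend_state) {
	case XenbusStateConnected:
		xenbus_switch_state(dev, XenbusStateConnected);
		break;
	case XenbusStateClosing:
		xenbus_switch_state(dev, XenbusStateClosing);
		break;
	case XenbusStateClosed:
		xenbus_switch_state(dev, XenbusStateClosed);
		break;
	default:
		break;
	}
}

static struct xenbus_driver demofront_driver = {
	.name             = "demofront",
	.owner            = THIS_MODULE,
	.ids              = demofront_ids,
	.probe            = demofront_probe,
	.otherend_changed = demofront_otherend_changed,
};

static int __init demofront_init(void)
{
	return xenbus_register_frontend(&demofront_driver);
}
module_init(demofront_init);
MODULE_LICENSE("GPL");
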
82054 diff -Nur linux-2.6.16.33-noxen/drivers/xen/xenbus/xenbus_probe.h linux-2.6.16.33/drivers/xen/xenbus/xenbus_probe.h
82055 --- linux-2.6.16.33-noxen/drivers/xen/xenbus/xenbus_probe.h 1970-01-01 00:00:00.000000000 +0000
82056 +++ linux-2.6.16.33/drivers/xen/xenbus/xenbus_probe.h 2007-01-08 15:00:45.000000000 +0000
82057 @@ -0,0 +1,74 @@
82058 +/******************************************************************************
82059 + * xenbus_probe.h
82060 + *
82061 + * Talks to Xen Store to figure out what devices we have.
82062 + *
82063 + * Copyright (C) 2005 Rusty Russell, IBM Corporation
82064 + * Copyright (C) 2005 XenSource Ltd.
82065 + *
82066 + * This program is free software; you can redistribute it and/or
82067 + * modify it under the terms of the GNU General Public License version 2
82068 + * as published by the Free Software Foundation; or, when distributed
82069 + * separately from the Linux kernel or incorporated into other
82070 + * software packages, subject to the following license:
82071 + *
82072 + * Permission is hereby granted, free of charge, to any person obtaining a copy
82073 + * of this source file (the "Software"), to deal in the Software without
82074 + * restriction, including without limitation the rights to use, copy, modify,
82075 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
82076 + * and to permit persons to whom the Software is furnished to do so, subject to
82077 + * the following conditions:
82078 + *
82079 + * The above copyright notice and this permission notice shall be included in
82080 + * all copies or substantial portions of the Software.
82081 + *
82082 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
82083 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
82084 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
82085 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
82086 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
82087 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
82088 + * IN THE SOFTWARE.
82089 + */
82090 +
82091 +#ifndef _XENBUS_PROBE_H
82092 +#define _XENBUS_PROBE_H
82093 +
82094 +#if defined(CONFIG_XEN_BACKEND) || defined(CONFIG_XEN_BACKEND_MODULE)
82095 +extern void xenbus_backend_suspend(int (*fn)(struct device *, void *));
82096 +extern void xenbus_backend_resume(int (*fn)(struct device *, void *));
82097 +extern void xenbus_backend_probe_and_watch(void);
82098 +extern void xenbus_backend_bus_register(void);
82099 +extern void xenbus_backend_device_register(void);
82100 +#else
82101 +static inline void xenbus_backend_suspend(int (*fn)(struct device *, void *)) {}
82102 +static inline void xenbus_backend_resume(int (*fn)(struct device *, void *)) {}
82103 +static inline void xenbus_backend_probe_and_watch(void) {}
82104 +static inline void xenbus_backend_bus_register(void) {}
82105 +static inline void xenbus_backend_device_register(void) {}
82106 +#endif
82107 +
82108 +struct xen_bus_type
82109 +{
82110 + char *root;
82111 + unsigned int levels;
82112 + int (*get_bus_id)(char bus_id[BUS_ID_SIZE], const char *nodename);
82113 + int (*probe)(const char *type, const char *dir);
82114 + struct bus_type bus;
82115 + struct device dev;
82116 +};
82117 +
82118 +extern int xenbus_match(struct device *_dev, struct device_driver *_drv);
82119 +extern int xenbus_dev_probe(struct device *_dev);
82120 +extern int xenbus_dev_remove(struct device *_dev);
82121 +extern int xenbus_register_driver_common(struct xenbus_driver *drv,
82122 + struct xen_bus_type *bus);
82123 +extern int xenbus_probe_node(struct xen_bus_type *bus,
82124 + const char *type,
82125 + const char *nodename);
82126 +extern int xenbus_probe_devices(struct xen_bus_type *bus);
82127 +
82128 +extern void dev_changed(const char *node, struct xen_bus_type *bus);
82129 +
82130 +#endif
82131 +
82132 diff -Nur linux-2.6.16.33-noxen/drivers/xen/xenbus/xenbus_probe_backend.c linux-2.6.16.33/drivers/xen/xenbus/xenbus_probe_backend.c
82133 --- linux-2.6.16.33-noxen/drivers/xen/xenbus/xenbus_probe_backend.c 1970-01-01 00:00:00.000000000 +0000
82134 +++ linux-2.6.16.33/drivers/xen/xenbus/xenbus_probe_backend.c 2007-01-08 15:00:45.000000000 +0000
82135 @@ -0,0 +1,271 @@
82136 +/******************************************************************************
82137 + * Talks to Xen Store to figure out what devices we have (backend half).
82138 + *
82139 + * Copyright (C) 2005 Rusty Russell, IBM Corporation
82140 + * Copyright (C) 2005 Mike Wray, Hewlett-Packard
82141 + * Copyright (C) 2005, 2006 XenSource Ltd
82142 + *
82143 + * This program is free software; you can redistribute it and/or
82144 + * modify it under the terms of the GNU General Public License version 2
82145 + * as published by the Free Software Foundation; or, when distributed
82146 + * separately from the Linux kernel or incorporated into other
82147 + * software packages, subject to the following license:
82148 + *
82149 + * Permission is hereby granted, free of charge, to any person obtaining a copy
82150 + * of this source file (the "Software"), to deal in the Software without
82151 + * restriction, including without limitation the rights to use, copy, modify,
82152 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
82153 + * and to permit persons to whom the Software is furnished to do so, subject to
82154 + * the following conditions:
82155 + *
82156 + * The above copyright notice and this permission notice shall be included in
82157 + * all copies or substantial portions of the Software.
82158 + *
82159 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
82160 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
82161 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
82162 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
82163 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
82164 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
82165 + * IN THE SOFTWARE.
82166 + */
82167 +
82168 +#define DPRINTK(fmt, args...) \
82169 + pr_debug("xenbus_probe (%s:%d) " fmt ".\n", \
82170 + __FUNCTION__, __LINE__, ##args)
82171 +
82172 +#include <linux/kernel.h>
82173 +#include <linux/err.h>
82174 +#include <linux/string.h>
82175 +#include <linux/ctype.h>
82176 +#include <linux/fcntl.h>
82177 +#include <linux/mm.h>
82178 +#include <linux/notifier.h>
82179 +#include <linux/kthread.h>
82180 +
82181 +#include <asm/io.h>
82182 +#include <asm/page.h>
82183 +#include <asm/maddr.h>
82184 +#include <asm/pgtable.h>
82185 +#include <asm/hypervisor.h>
82186 +#include <xen/xenbus.h>
82187 +#include <xen/xen_proc.h>
82188 +#include <xen/evtchn.h>
82189 +#include <xen/features.h>
82190 +#include <xen/hvm.h>
82191 +
82192 +#include "xenbus_comms.h"
82193 +#include "xenbus_probe.h"
82194 +
82195 +#ifdef HAVE_XEN_PLATFORM_COMPAT_H
82196 +#include <xen/platform-compat.h>
82197 +#endif
82198 +
82199 +static int xenbus_uevent_backend(struct device *dev, char **envp,
82200 + int num_envp, char *buffer, int buffer_size);
82201 +static int xenbus_probe_backend(const char *type, const char *domid);
82202 +
82203 +extern int read_otherend_details(struct xenbus_device *xendev,
82204 + char *id_node, char *path_node);
82205 +
82206 +static int read_frontend_details(struct xenbus_device *xendev)
82207 +{
82208 + return read_otherend_details(xendev, "frontend-id", "frontend");
82209 +}
82210 +
82211 +/* backend/<type>/<fe-uuid>/<id> => <type>-<fe-domid>-<id> */
82212 +static int backend_bus_id(char bus_id[BUS_ID_SIZE], const char *nodename)
82213 +{
82214 + int domid, err;
82215 + const char *devid, *type, *frontend;
82216 + unsigned int typelen;
82217 +
82218 + type = strchr(nodename, '/');
82219 + if (!type)
82220 + return -EINVAL;
82221 + type++;
82222 + typelen = strcspn(type, "/");
82223 + if (!typelen || type[typelen] != '/')
82224 + return -EINVAL;
82225 +
82226 + devid = strrchr(nodename, '/') + 1;
82227 +
82228 + err = xenbus_gather(XBT_NIL, nodename, "frontend-id", "%i", &domid,
82229 + "frontend", NULL, &frontend,
82230 + NULL);
82231 + if (err)
82232 + return err;
82233 + if (strlen(frontend) == 0)
82234 + err = -ERANGE;
82235 + if (!err && !xenbus_exists(XBT_NIL, frontend, ""))
82236 + err = -ENOENT;
82237 + kfree(frontend);
82238 +
82239 + if (err)
82240 + return err;
82241 +
82242 + if (snprintf(bus_id, BUS_ID_SIZE,
82243 + "%.*s-%i-%s", typelen, type, domid, devid) >= BUS_ID_SIZE)
82244 + return -ENOSPC;
82245 + return 0;
82246 +}
82247 +
82248 +static struct xen_bus_type xenbus_backend = {
82249 + .root = "backend",
82250 + .levels = 3, /* backend/type/<frontend>/<id> */
82251 + .get_bus_id = backend_bus_id,
82252 + .probe = xenbus_probe_backend,
82253 + .bus = {
82254 + .name = "xen-backend",
82255 + .match = xenbus_match,
82256 + .probe = xenbus_dev_probe,
82257 + .remove = xenbus_dev_remove,
82258 +// .shutdown = xenbus_dev_shutdown,
82259 + .uevent = xenbus_uevent_backend,
82260 + },
82261 + .dev = {
82262 + .bus_id = "xen-backend",
82263 + },
82264 +};
82265 +
82266 +static int xenbus_uevent_backend(struct device *dev, char **envp,
82267 + int num_envp, char *buffer, int buffer_size)
82268 +{
82269 + struct xenbus_device *xdev;
82270 + struct xenbus_driver *drv;
82271 + int i = 0;
82272 + int length = 0;
82273 +
82274 + DPRINTK("");
82275 +
82276 + if (dev == NULL)
82277 + return -ENODEV;
82278 +
82279 + xdev = to_xenbus_device(dev);
82280 + if (xdev == NULL)
82281 + return -ENODEV;
82282 +
82283 + /* stuff we want to pass to /sbin/hotplug */
82284 + add_uevent_var(envp, num_envp, &i, buffer, buffer_size, &length,
82285 + "XENBUS_TYPE=%s", xdev->devicetype);
82286 +
82287 + add_uevent_var(envp, num_envp, &i, buffer, buffer_size, &length,
82288 + "XENBUS_PATH=%s", xdev->nodename);
82289 +
82290 + add_uevent_var(envp, num_envp, &i, buffer, buffer_size, &length,
82291 + "XENBUS_BASE_PATH=%s", xenbus_backend.root);
82292 +
82293 + /* terminate, set to next free slot, shrink available space */
82294 + envp[i] = NULL;
82295 + envp = &envp[i];
82296 + num_envp -= i;
82297 + buffer = &buffer[length];
82298 + buffer_size -= length;
82299 +
82300 + if (dev->driver) {
82301 + drv = to_xenbus_driver(dev->driver);
82302 + if (drv && drv->uevent)
82303 + return drv->uevent(xdev, envp, num_envp, buffer,
82304 + buffer_size);
82305 + }
82306 +
82307 + return 0;
82308 +}
82309 +
82310 +int xenbus_register_backend(struct xenbus_driver *drv)
82311 +{
82312 + drv->read_otherend_details = read_frontend_details;
82313 +
82314 + return xenbus_register_driver_common(drv, &xenbus_backend);
82315 +}
82316 +EXPORT_SYMBOL_GPL(xenbus_register_backend);
82317 +
82318 +/* backend/<typename>/<frontend-uuid>/<name> */
82319 +static int xenbus_probe_backend_unit(const char *dir,
82320 + const char *type,
82321 + const char *name)
82322 +{
82323 + char *nodename;
82324 + int err;
82325 +
82326 + nodename = kasprintf(GFP_KERNEL, "%s/%s", dir, name);
82327 + if (!nodename)
82328 + return -ENOMEM;
82329 +
82330 + DPRINTK("%s\n", nodename);
82331 +
82332 + err = xenbus_probe_node(&xenbus_backend, type, nodename);
82333 + kfree(nodename);
82334 + return err;
82335 +}
82336 +
82337 +/* backend/<typename>/<frontend-domid> */
82338 +static int xenbus_probe_backend(const char *type, const char *domid)
82339 +{
82340 + char *nodename;
82341 + int err = 0;
82342 + char **dir;
82343 + unsigned int i, dir_n = 0;
82344 +
82345 + DPRINTK("");
82346 +
82347 + nodename = kasprintf(GFP_KERNEL, "%s/%s/%s", xenbus_backend.root, type, domid);
82348 + if (!nodename)
82349 + return -ENOMEM;
82350 +
82351 + dir = xenbus_directory(XBT_NIL, nodename, "", &dir_n);
82352 + if (IS_ERR(dir)) {
82353 + kfree(nodename);
82354 + return PTR_ERR(dir);
82355 + }
82356 +
82357 + for (i = 0; i < dir_n; i++) {
82358 + err = xenbus_probe_backend_unit(nodename, type, dir[i]);
82359 + if (err)
82360 + break;
82361 + }
82362 + kfree(dir);
82363 + kfree(nodename);
82364 + return err;
82365 +}
82366 +
82367 +static void backend_changed(struct xenbus_watch *watch,
82368 + const char **vec, unsigned int len)
82369 +{
82370 + DPRINTK("");
82371 +
82372 + dev_changed(vec[XS_WATCH_PATH], &xenbus_backend);
82373 +}
82374 +
82375 +static struct xenbus_watch be_watch = {
82376 + .node = "backend",
82377 + .callback = backend_changed,
82378 +};
82379 +
82380 +void xenbus_backend_suspend(int (*fn)(struct device *, void *))
82381 +{
82382 + DPRINTK("");
82383 + bus_for_each_dev(&xenbus_backend.bus, NULL, NULL, fn);
82384 +}
82385 +
82386 +void xenbus_backend_resume(int (*fn)(struct device *, void *))
82387 +{
82388 + DPRINTK("");
82389 + bus_for_each_dev(&xenbus_backend.bus, NULL, NULL, fn);
82390 +}
82391 +
82392 +void xenbus_backend_probe_and_watch(void)
82393 +{
82394 + xenbus_probe_devices(&xenbus_backend);
82395 + register_xenbus_watch(&be_watch);
82396 +}
82397 +
82398 +void xenbus_backend_bus_register(void)
82399 +{
82400 + bus_register(&xenbus_backend.bus);
82401 +}
82402 +
82403 +void xenbus_backend_device_register(void)
82404 +{
82405 + device_register(&xenbus_backend.dev);
82406 +}
82407 diff -Nur linux-2.6.16.33-noxen/drivers/xen/xenbus/xenbus_xs.c linux-2.6.16.33/drivers/xen/xenbus/xenbus_xs.c
82408 --- linux-2.6.16.33-noxen/drivers/xen/xenbus/xenbus_xs.c 1970-01-01 00:00:00.000000000 +0000
82409 +++ linux-2.6.16.33/drivers/xen/xenbus/xenbus_xs.c 2007-01-08 15:00:45.000000000 +0000
82410 @@ -0,0 +1,859 @@
82411 +/******************************************************************************
82412 + * xenbus_xs.c
82413 + *
82414 + * This is the kernel equivalent of the "xs" library. We don't need everything
82415 + * and we use xenbus_comms for communication.
82416 + *
82417 + * Copyright (C) 2005 Rusty Russell, IBM Corporation
82418 + *
82419 + * This program is free software; you can redistribute it and/or
82420 + * modify it under the terms of the GNU General Public License version 2
82421 + * as published by the Free Software Foundation; or, when distributed
82422 + * separately from the Linux kernel or incorporated into other
82423 + * software packages, subject to the following license:
82424 + *
82425 + * Permission is hereby granted, free of charge, to any person obtaining a copy
82426 + * of this source file (the "Software"), to deal in the Software without
82427 + * restriction, including without limitation the rights to use, copy, modify,
82428 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
82429 + * and to permit persons to whom the Software is furnished to do so, subject to
82430 + * the following conditions:
82431 + *
82432 + * The above copyright notice and this permission notice shall be included in
82433 + * all copies or substantial portions of the Software.
82434 + *
82435 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
82436 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
82437 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
82438 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
82439 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
82440 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
82441 + * IN THE SOFTWARE.
82442 + */
82443 +
82444 +#include <linux/unistd.h>
82445 +#include <linux/errno.h>
82446 +#include <linux/types.h>
82447 +#include <linux/uio.h>
82448 +#include <linux/kernel.h>
82449 +#include <linux/string.h>
82450 +#include <linux/err.h>
82451 +#include <linux/slab.h>
82452 +#include <linux/fcntl.h>
82453 +#include <linux/kthread.h>
82454 +#include <linux/rwsem.h>
82455 +#include <linux/module.h>
82456 +#include <linux/mutex.h>
82457 +#include <xen/xenbus.h>
82458 +#include "xenbus_comms.h"
82459 +
82460 +#ifdef HAVE_XEN_PLATFORM_COMPAT_H
82461 +#include <xen/platform-compat.h>
82462 +#endif
82463 +
82464 +struct xs_stored_msg {
82465 + struct list_head list;
82466 +
82467 + struct xsd_sockmsg hdr;
82468 +
82469 + union {
82470 + /* Queued replies. */
82471 + struct {
82472 + char *body;
82473 + } reply;
82474 +
82475 + /* Queued watch events. */
82476 + struct {
82477 + struct xenbus_watch *handle;
82478 + char **vec;
82479 + unsigned int vec_size;
82480 + } watch;
82481 + } u;
82482 +};
82483 +
82484 +struct xs_handle {
82485 + /* A list of replies. Currently only one will ever be outstanding. */
82486 + struct list_head reply_list;
82487 + spinlock_t reply_lock;
82488 + wait_queue_head_t reply_waitq;
82489 +
82490 + /* One request at a time. */
82491 + struct mutex request_mutex;
82492 +
82493 + /* Protect transactions against save/restore. */
82494 + struct rw_semaphore suspend_mutex;
82495 +};
82496 +
82497 +static struct xs_handle xs_state;
82498 +
82499 +/* List of registered watches, and a lock to protect it. */
82500 +static LIST_HEAD(watches);
82501 +static DEFINE_SPINLOCK(watches_lock);
82502 +
82503 +/* List of pending watch callback events, and a lock to protect it. */
82504 +static LIST_HEAD(watch_events);
82505 +static DEFINE_SPINLOCK(watch_events_lock);
82506 +
82507 +/*
82508 + * Details of the xenwatch callback kernel thread. The thread waits on the
82509 + * watch_events_waitq for work to do (queued on watch_events list). When it
82510 + * wakes up it acquires the xenwatch_mutex before reading the list and
82511 + * carrying out work.
82512 + */
82513 +static pid_t xenwatch_pid;
82514 +/* static */ DEFINE_MUTEX(xenwatch_mutex);
82515 +static DECLARE_WAIT_QUEUE_HEAD(watch_events_waitq);
82516 +
82517 +static int get_error(const char *errorstring)
82518 +{
82519 + unsigned int i;
82520 +
82521 + for (i = 0; strcmp(errorstring, xsd_errors[i].errstring) != 0; i++) {
82522 + if (i == ARRAY_SIZE(xsd_errors) - 1) {
82523 + printk(KERN_WARNING
82524 + "XENBUS xen store gave: unknown error %s",
82525 + errorstring);
82526 + return EINVAL;
82527 + }
82528 + }
82529 + return xsd_errors[i].errnum;
82530 +}
82531 +
82532 +static void *read_reply(enum xsd_sockmsg_type *type, unsigned int *len)
82533 +{
82534 + struct xs_stored_msg *msg;
82535 + char *body;
82536 +
82537 + spin_lock(&xs_state.reply_lock);
82538 +
82539 + while (list_empty(&xs_state.reply_list)) {
82540 + spin_unlock(&xs_state.reply_lock);
82541 + /* XXX FIXME: Avoid synchronous wait for response here. */
82542 + wait_event(xs_state.reply_waitq,
82543 + !list_empty(&xs_state.reply_list));
82544 + spin_lock(&xs_state.reply_lock);
82545 + }
82546 +
82547 + msg = list_entry(xs_state.reply_list.next,
82548 + struct xs_stored_msg, list);
82549 + list_del(&msg->list);
82550 +
82551 + spin_unlock(&xs_state.reply_lock);
82552 +
82553 + *type = msg->hdr.type;
82554 + if (len)
82555 + *len = msg->hdr.len;
82556 + body = msg->u.reply.body;
82557 +
82558 + kfree(msg);
82559 +
82560 + return body;
82561 +}
82562 +
82563 +/* Emergency write. */
82564 +void xenbus_debug_write(const char *str, unsigned int count)
82565 +{
82566 + struct xsd_sockmsg msg = { 0 };
82567 +
82568 + msg.type = XS_DEBUG;
82569 + msg.len = sizeof("print") + count + 1;
82570 +
82571 + mutex_lock(&xs_state.request_mutex);
82572 + xb_write(&msg, sizeof(msg));
82573 + xb_write("print", sizeof("print"));
82574 + xb_write(str, count);
82575 + xb_write("", 1);
82576 + mutex_unlock(&xs_state.request_mutex);
82577 +}
82578 +
82579 +void *xenbus_dev_request_and_reply(struct xsd_sockmsg *msg)
82580 +{
82581 + void *ret;
82582 + struct xsd_sockmsg req_msg = *msg;
82583 + int err;
82584 +
82585 + if (req_msg.type == XS_TRANSACTION_START)
82586 + down_read(&xs_state.suspend_mutex);
82587 +
82588 + mutex_lock(&xs_state.request_mutex);
82589 +
82590 + err = xb_write(msg, sizeof(*msg) + msg->len);
82591 + if (err) {
82592 + msg->type = XS_ERROR;
82593 + ret = ERR_PTR(err);
82594 + } else
82595 + ret = read_reply(&msg->type, &msg->len);
82596 +
82597 + mutex_unlock(&xs_state.request_mutex);
82598 +
82599 + if ((req_msg.type == XS_TRANSACTION_END) ||
82600 + ((req_msg.type == XS_TRANSACTION_START) &&
82601 + (msg->type == XS_ERROR)))
82602 + up_read(&xs_state.suspend_mutex);
82603 +
82604 + return ret;
82605 +}
82606 +
82607 +/* Send message to xs, get kmalloc'ed reply. ERR_PTR() on error. */
82608 +static void *xs_talkv(struct xenbus_transaction t,
82609 + enum xsd_sockmsg_type type,
82610 + const struct kvec *iovec,
82611 + unsigned int num_vecs,
82612 + unsigned int *len)
82613 +{
82614 + struct xsd_sockmsg msg;
82615 + void *ret = NULL;
82616 + unsigned int i;
82617 + int err;
82618 +
82619 + msg.tx_id = t.id;
82620 + msg.req_id = 0;
82621 + msg.type = type;
82622 + msg.len = 0;
82623 + for (i = 0; i < num_vecs; i++)
82624 + msg.len += iovec[i].iov_len;
82625 +
82626 + mutex_lock(&xs_state.request_mutex);
82627 +
82628 + err = xb_write(&msg, sizeof(msg));
82629 + if (err) {
82630 + mutex_unlock(&xs_state.request_mutex);
82631 + return ERR_PTR(err);
82632 + }
82633 +
82634 + for (i = 0; i < num_vecs; i++) {
82635 + err = xb_write(iovec[i].iov_base, iovec[i].iov_len);
82636 + if (err) {
82637 + mutex_unlock(&xs_state.request_mutex);
82638 + return ERR_PTR(err);
82639 + }
82640 + }
82641 +
82642 + ret = read_reply(&msg.type, len);
82643 +
82644 + mutex_unlock(&xs_state.request_mutex);
82645 +
82646 + if (IS_ERR(ret))
82647 + return ret;
82648 +
82649 + if (msg.type == XS_ERROR) {
82650 + err = get_error(ret);
82651 + kfree(ret);
82652 + return ERR_PTR(-err);
82653 + }
82654 +
82655 + if (msg.type != type) {
82656 + if (printk_ratelimit())
82657 + printk(KERN_WARNING
82658 + "XENBUS unexpected type [%d], expected [%d]\n",
82659 + msg.type, type);
82660 + kfree(ret);
82661 + return ERR_PTR(-EINVAL);
82662 + }
82663 + return ret;
82664 +}
82665 +
82666 +/* Simplified version of xs_talkv: single message. */
82667 +static void *xs_single(struct xenbus_transaction t,
82668 + enum xsd_sockmsg_type type,
82669 + const char *string,
82670 + unsigned int *len)
82671 +{
82672 + struct kvec iovec;
82673 +
82674 + iovec.iov_base = (void *)string;
82675 + iovec.iov_len = strlen(string) + 1;
82676 + return xs_talkv(t, type, &iovec, 1, len);
82677 +}
82678 +
82679 +/* Many commands only need an ack, don't care what it says. */
82680 +static int xs_error(char *reply)
82681 +{
82682 + if (IS_ERR(reply))
82683 + return PTR_ERR(reply);
82684 + kfree(reply);
82685 + return 0;
82686 +}
82687 +
82688 +static unsigned int count_strings(const char *strings, unsigned int len)
82689 +{
82690 + unsigned int num;
82691 + const char *p;
82692 +
82693 + for (p = strings, num = 0; p < strings + len; p += strlen(p) + 1)
82694 + num++;
82695 +
82696 + return num;
82697 +}
82698 +
82699 +/* Return the path to dir with /name appended. Buffer must be kfree()'ed. */
82700 +static char *join(const char *dir, const char *name)
82701 +{
82702 + char *buffer;
82703 +
82704 + if (strlen(name) == 0)
82705 + buffer = kasprintf(GFP_KERNEL, "%s", dir);
82706 + else
82707 + buffer = kasprintf(GFP_KERNEL, "%s/%s", dir, name);
82708 + return (!buffer) ? ERR_PTR(-ENOMEM) : buffer;
82709 +}
82710 +
82711 +static char **split(char *strings, unsigned int len, unsigned int *num)
82712 +{
82713 + char *p, **ret;
82714 +
82715 + /* Count the strings. */
82716 + *num = count_strings(strings, len);
82717 +
82718 + /* Transfer to one big alloc for easy freeing. */
82719 + ret = kmalloc(*num * sizeof(char *) + len, GFP_KERNEL);
82720 + if (!ret) {
82721 + kfree(strings);
82722 + return ERR_PTR(-ENOMEM);
82723 + }
82724 + memcpy(&ret[*num], strings, len);
82725 + kfree(strings);
82726 +
82727 + strings = (char *)&ret[*num];
82728 + for (p = strings, *num = 0; p < strings + len; p += strlen(p) + 1)
82729 + ret[(*num)++] = p;
82730 +
82731 + return ret;
82732 +}
82733 +
82734 +char **xenbus_directory(struct xenbus_transaction t,
82735 + const char *dir, const char *node, unsigned int *num)
82736 +{
82737 + char *strings, *path;
82738 + unsigned int len;
82739 +
82740 + path = join(dir, node);
82741 + if (IS_ERR(path))
82742 + return (char **)path;
82743 +
82744 + strings = xs_single(t, XS_DIRECTORY, path, &len);
82745 + kfree(path);
82746 + if (IS_ERR(strings))
82747 + return (char **)strings;
82748 +
82749 + return split(strings, len, num);
82750 +}
82751 +EXPORT_SYMBOL_GPL(xenbus_directory);
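The directory call above returns a single kmalloc'ed vector whose string entries live in the same allocation (see split() above), so one kfree() releases everything. A minimal sketch of the calling convention; the "device"/"vif" path is purely illustrative:

static void example_list_entries(void)
{
	char **dir;
	unsigned int i, n;

	dir = xenbus_directory(XBT_NIL, "device", "vif", &n);
	if (IS_ERR(dir))
		return;

	for (i = 0; i < n; i++)
		printk(KERN_DEBUG "entry %u: %s\n", i, dir[i]);

	kfree(dir);	/* the vector and its strings are one allocation */
}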
82752 +
82753 +/* Check if a path exists. Return 1 if it does. */
82754 +int xenbus_exists(struct xenbus_transaction t,
82755 + const char *dir, const char *node)
82756 +{
82757 + char **d;
82758 + int dir_n;
82759 +
82760 + d = xenbus_directory(t, dir, node, &dir_n);
82761 + if (IS_ERR(d))
82762 + return 0;
82763 + kfree(d);
82764 + return 1;
82765 +}
82766 +EXPORT_SYMBOL_GPL(xenbus_exists);
82767 +
82768 +/* Get the value of a single file.
82769 + * Returns a kmalloc'ed value: call kfree() on it after use.
82770 + * len indicates length in bytes.
82771 + */
82772 +void *xenbus_read(struct xenbus_transaction t,
82773 + const char *dir, const char *node, unsigned int *len)
82774 +{
82775 + char *path;
82776 + void *ret;
82777 +
82778 + path = join(dir, node);
82779 + if (IS_ERR(path))
82780 + return (void *)path;
82781 +
82782 + ret = xs_single(t, XS_READ, path, len);
82783 + kfree(path);
82784 + return ret;
82785 +}
82786 +EXPORT_SYMBOL_GPL(xenbus_read);
82787 +
82788 +/* Write the value of a single file.
82789 + * Returns -err on failure.
82790 + */
82791 +int xenbus_write(struct xenbus_transaction t,
82792 + const char *dir, const char *node, const char *string)
82793 +{
82794 + const char *path;
82795 + struct kvec iovec[2];
82796 + int ret;
82797 +
82798 + path = join(dir, node);
82799 + if (IS_ERR(path))
82800 + return PTR_ERR(path);
82801 +
82802 + iovec[0].iov_base = (void *)path;
82803 + iovec[0].iov_len = strlen(path) + 1;
82804 + iovec[1].iov_base = (void *)string;
82805 + iovec[1].iov_len = strlen(string);
82806 +
82807 + ret = xs_error(xs_talkv(t, XS_WRITE, iovec, ARRAY_SIZE(iovec), NULL));
82808 + kfree(path);
82809 + return ret;
82810 +}
82811 +EXPORT_SYMBOL_GPL(xenbus_write);
82812 +
82813 +/* Create a new directory. */
82814 +int xenbus_mkdir(struct xenbus_transaction t,
82815 + const char *dir, const char *node)
82816 +{
82817 + char *path;
82818 + int ret;
82819 +
82820 + path = join(dir, node);
82821 + if (IS_ERR(path))
82822 + return PTR_ERR(path);
82823 +
82824 + ret = xs_error(xs_single(t, XS_MKDIR, path, NULL));
82825 + kfree(path);
82826 + return ret;
82827 +}
82828 +EXPORT_SYMBOL_GPL(xenbus_mkdir);
82829 +
82830 +/* Destroy a file or directory (directories must be empty). */
82831 +int xenbus_rm(struct xenbus_transaction t, const char *dir, const char *node)
82832 +{
82833 + char *path;
82834 + int ret;
82835 +
82836 + path = join(dir, node);
82837 + if (IS_ERR(path))
82838 + return PTR_ERR(path);
82839 +
82840 + ret = xs_error(xs_single(t, XS_RM, path, NULL));
82841 + kfree(path);
82842 + return ret;
82843 +}
82844 +EXPORT_SYMBOL_GPL(xenbus_rm);
82845 +
82846 +/* Start a transaction: changes by others will not be seen during this
82847 + * transaction, and changes will not be visible to others until end.
82848 + */
82849 +int xenbus_transaction_start(struct xenbus_transaction *t)
82850 +{
82851 + char *id_str;
82852 +
82853 + down_read(&xs_state.suspend_mutex);
82854 +
82855 + id_str = xs_single(XBT_NIL, XS_TRANSACTION_START, "", NULL);
82856 + if (IS_ERR(id_str)) {
82857 + up_read(&xs_state.suspend_mutex);
82858 + return PTR_ERR(id_str);
82859 + }
82860 +
82861 + t->id = simple_strtoul(id_str, NULL, 0);
82862 + kfree(id_str);
82863 + return 0;
82864 +}
82865 +EXPORT_SYMBOL_GPL(xenbus_transaction_start);
82866 +
82867 +/* End a transaction.
82868 + * If abort is true, the transaction is discarded instead of committed.
82869 + */
82870 +int xenbus_transaction_end(struct xenbus_transaction t, int abort)
82871 +{
82872 + char abortstr[2];
82873 + int err;
82874 +
82875 + if (abort)
82876 + strcpy(abortstr, "F");
82877 + else
82878 + strcpy(abortstr, "T");
82879 +
82880 + err = xs_error(xs_single(t, XS_TRANSACTION_END, abortstr, NULL));
82881 +
82882 + up_read(&xs_state.suspend_mutex);
82883 +
82884 + return err;
82885 +}
82886 +EXPORT_SYMBOL_GPL(xenbus_transaction_end);
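The two transaction helpers above are meant to bracket multi-step xenstore updates. A sketch of the usual calling pattern, assuming the conventional retry when the end call fails with -EAGAIN because another writer raced with the transaction; the path and value are made up:

static int example_set_state(void)
{
	struct xenbus_transaction xbt;
	int err;

again:
	err = xenbus_transaction_start(&xbt);
	if (err)
		return err;

	err = xenbus_write(xbt, "device/vif/0", "state", "4");
	if (err) {
		xenbus_transaction_end(xbt, 1);	/* abort */
		return err;
	}

	err = xenbus_transaction_end(xbt, 0);	/* commit */
	if (err == -EAGAIN)			/* raced with another writer */
		goto again;
	return err;
}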
82887 +
82888 +/* Single read and scanf: returns -errno or num scanned. */
82889 +int xenbus_scanf(struct xenbus_transaction t,
82890 + const char *dir, const char *node, const char *fmt, ...)
82891 +{
82892 + va_list ap;
82893 + int ret;
82894 + char *val;
82895 +
82896 + val = xenbus_read(t, dir, node, NULL);
82897 + if (IS_ERR(val))
82898 + return PTR_ERR(val);
82899 +
82900 + va_start(ap, fmt);
82901 + ret = vsscanf(val, fmt, ap);
82902 + va_end(ap);
82903 + kfree(val);
82904 + /* Distinctive errno. */
82905 + if (ret == 0)
82906 + return -ERANGE;
82907 + return ret;
82908 +}
82909 +EXPORT_SYMBOL_GPL(xenbus_scanf);
82910 +
82911 +/* Single printf and write: returns -errno or 0. */
82912 +int xenbus_printf(struct xenbus_transaction t,
82913 + const char *dir, const char *node, const char *fmt, ...)
82914 +{
82915 + va_list ap;
82916 + int ret;
82917 +#define PRINTF_BUFFER_SIZE 4096
82918 + char *printf_buffer;
82919 +
82920 + printf_buffer = kmalloc(PRINTF_BUFFER_SIZE, GFP_KERNEL);
82921 + if (printf_buffer == NULL)
82922 + return -ENOMEM;
82923 +
82924 + va_start(ap, fmt);
82925 + ret = vsnprintf(printf_buffer, PRINTF_BUFFER_SIZE, fmt, ap);
82926 + va_end(ap);
82927 +
82928 + BUG_ON(ret > PRINTF_BUFFER_SIZE-1);
82929 + ret = xenbus_write(t, dir, node, printf_buffer);
82930 +
82931 + kfree(printf_buffer);
82932 +
82933 + return ret;
82934 +}
82935 +EXPORT_SYMBOL_GPL(xenbus_printf);
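xenbus_scanf and xenbus_printf wrap a single read or write in format conversion so callers do not have to handle the strings by hand. A short sketch; the vbd path and node names are hypothetical:

static void example_typed_io(void)
{
	int state;

	if (xenbus_scanf(XBT_NIL, "device/vbd/768", "state", "%d", &state) != 1)
		state = 0;	/* error or nothing parsed: treat as unknown */
	printk(KERN_DEBUG "backend state is %d\n", state);

	xenbus_printf(XBT_NIL, "device/vbd/768", "feature-flush", "%d", 1);
}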
82936 +
82937 +/* Takes tuples of names, scanf-style args, and void **, NULL terminated. */
82938 +int xenbus_gather(struct xenbus_transaction t, const char *dir, ...)
82939 +{
82940 + va_list ap;
82941 + const char *name;
82942 + int ret = 0;
82943 +
82944 + va_start(ap, dir);
82945 + while (ret == 0 && (name = va_arg(ap, char *)) != NULL) {
82946 + const char *fmt = va_arg(ap, char *);
82947 + void *result = va_arg(ap, void *);
82948 + char *p;
82949 +
82950 + p = xenbus_read(t, dir, name, NULL);
82951 + if (IS_ERR(p)) {
82952 + ret = PTR_ERR(p);
82953 + break;
82954 + }
82955 + if (fmt) {
82956 + if (sscanf(p, fmt, result) == 0)
82957 + ret = -EINVAL;
82958 + kfree(p);
82959 + } else
82960 + *(char **)result = p;
82961 + }
82962 + va_end(ap);
82963 + return ret;
82964 +}
82965 +EXPORT_SYMBOL_GPL(xenbus_gather);
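The NULL-terminated tuple convention is easiest to see at a call site: each tuple is a node name, a scanf-style format (or NULL to receive the raw kmalloc'ed string), and a result pointer. A sketch with made-up node names:

static int example_gather(void)
{
	char *mac;
	unsigned long ring_ref;
	int err;

	err = xenbus_gather(XBT_NIL, "device/vif/0",
			    "mac", NULL, &mac,
			    "ring-ref", "%lu", &ring_ref,
			    NULL);
	if (err)
		return err;

	printk(KERN_DEBUG "mac %s, ring-ref %lu\n", mac, ring_ref);
	kfree(mac);	/* NULL-format results are kmalloc'ed for the caller */
	return 0;
}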
82966 +
82967 +static int xs_watch(const char *path, const char *token)
82968 +{
82969 + struct kvec iov[2];
82970 +
82971 + iov[0].iov_base = (void *)path;
82972 + iov[0].iov_len = strlen(path) + 1;
82973 + iov[1].iov_base = (void *)token;
82974 + iov[1].iov_len = strlen(token) + 1;
82975 +
82976 + return xs_error(xs_talkv(XBT_NIL, XS_WATCH, iov,
82977 + ARRAY_SIZE(iov), NULL));
82978 +}
82979 +
82980 +static int xs_unwatch(const char *path, const char *token)
82981 +{
82982 + struct kvec iov[2];
82983 +
82984 + iov[0].iov_base = (char *)path;
82985 + iov[0].iov_len = strlen(path) + 1;
82986 + iov[1].iov_base = (char *)token;
82987 + iov[1].iov_len = strlen(token) + 1;
82988 +
82989 + return xs_error(xs_talkv(XBT_NIL, XS_UNWATCH, iov,
82990 + ARRAY_SIZE(iov), NULL));
82991 +}
82992 +
82993 +static struct xenbus_watch *find_watch(const char *token)
82994 +{
82995 + struct xenbus_watch *i, *cmp;
82996 +
82997 + cmp = (void *)simple_strtoul(token, NULL, 16);
82998 +
82999 + list_for_each_entry(i, &watches, list)
83000 + if (i == cmp)
83001 + return i;
83002 +
83003 + return NULL;
83004 +}
83005 +
83006 +/* Register callback to watch this node. */
83007 +int register_xenbus_watch(struct xenbus_watch *watch)
83008 +{
83009 + /* Pointer in ascii is the token. */
83010 + char token[sizeof(watch) * 2 + 1];
83011 + int err;
83012 +
83013 + sprintf(token, "%lX", (long)watch);
83014 +
83015 + down_read(&xs_state.suspend_mutex);
83016 +
83017 + spin_lock(&watches_lock);
83018 + BUG_ON(find_watch(token));
83019 + list_add(&watch->list, &watches);
83020 + spin_unlock(&watches_lock);
83021 +
83022 + err = xs_watch(watch->node, token);
83023 +
83024 + /* Ignore errors due to multiple registration. */
83025 + if ((err != 0) && (err != -EEXIST)) {
83026 + spin_lock(&watches_lock);
83027 + list_del(&watch->list);
83028 + spin_unlock(&watches_lock);
83029 + }
83030 +
83031 + up_read(&xs_state.suspend_mutex);
83032 +
83033 + return err;
83034 +}
83035 +EXPORT_SYMBOL_GPL(register_xenbus_watch);
83036 +
83037 +void unregister_xenbus_watch(struct xenbus_watch *watch)
83038 +{
83039 + struct xs_stored_msg *msg, *tmp;
83040 + char token[sizeof(watch) * 2 + 1];
83041 + int err;
83042 +
83043 + sprintf(token, "%lX", (long)watch);
83044 +
83045 + down_read(&xs_state.suspend_mutex);
83046 +
83047 + spin_lock(&watches_lock);
83048 + BUG_ON(!find_watch(token));
83049 + list_del(&watch->list);
83050 + spin_unlock(&watches_lock);
83051 +
83052 + err = xs_unwatch(watch->node, token);
83053 + if (err)
83054 + printk(KERN_WARNING
83055 + "XENBUS Failed to release watch %s: %i\n",
83056 + watch->node, err);
83057 +
83058 + up_read(&xs_state.suspend_mutex);
83059 +
83060 + /* Cancel pending watch events. */
83061 + spin_lock(&watch_events_lock);
83062 + list_for_each_entry_safe(msg, tmp, &watch_events, list) {
83063 + if (msg->u.watch.handle != watch)
83064 + continue;
83065 + list_del(&msg->list);
83066 + kfree(msg->u.watch.vec);
83067 + kfree(msg);
83068 + }
83069 + spin_unlock(&watch_events_lock);
83070 +
83071 + /* Flush any currently-executing callback, unless we are it. :-) */
83072 + if (current->pid != xenwatch_pid) {
83073 + mutex_lock(&xenwatch_mutex);
83074 + mutex_unlock(&xenwatch_mutex);
83075 + }
83076 +}
83077 +EXPORT_SYMBOL_GPL(unregister_xenbus_watch);
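Putting the two watch calls together: a watch is a caller-owned struct carrying a node path and a callback. The callback signature below is inferred from how xenwatch_handle_callback invokes it, and XS_WATCH_PATH is the wire-protocol index of the firing path within vec; the watched path itself is only an example:

static void example_changed(struct xenbus_watch *watch,
			    const char **vec, unsigned int len)
{
	printk(KERN_DEBUG "xenstore node %s changed\n", vec[XS_WATCH_PATH]);
}

static struct xenbus_watch example_watch = {
	.node		= "device/vif/0/state",
	.callback	= example_changed,
};

static int example_arm_watch(void)
{
	return register_xenbus_watch(&example_watch);
}

Tearing the watch down again with unregister_xenbus_watch(&example_watch) also discards any events still queued for it, as the loop above shows.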
83078 +
83079 +void xs_suspend(void)
83080 +{
83081 + struct xenbus_watch *watch;
83082 + char token[sizeof(watch) * 2 + 1];
83083 +
83084 + down_write(&xs_state.suspend_mutex);
83085 +
83086 + /* No need for watches_lock: the suspend_mutex is sufficient. */
83087 + list_for_each_entry(watch, &watches, list) {
83088 + sprintf(token, "%lX", (long)watch);
83089 + xs_unwatch(watch->node, token);
83090 + }
83091 +
83092 + mutex_lock(&xs_state.request_mutex);
83093 +}
83094 +
83095 +void xs_resume(void)
83096 +{
83097 + struct xenbus_watch *watch;
83098 + char token[sizeof(watch) * 2 + 1];
83099 +
83100 + mutex_unlock(&xs_state.request_mutex);
83101 +
83102 + /* No need for watches_lock: the suspend_mutex is sufficient. */
83103 + list_for_each_entry(watch, &watches, list) {
83104 + sprintf(token, "%lX", (long)watch);
83105 + xs_watch(watch->node, token);
83106 + }
83107 +
83108 + up_write(&xs_state.suspend_mutex);
83109 +}
83110 +
83111 +static int xenwatch_handle_callback(void *data)
83112 +{
83113 + struct xs_stored_msg *msg = data;
83114 +
83115 + msg->u.watch.handle->callback(msg->u.watch.handle,
83116 + (const char **)msg->u.watch.vec,
83117 + msg->u.watch.vec_size);
83118 +
83119 + kfree(msg->u.watch.vec);
83120 + kfree(msg);
83121 +
83122 + /* Kill this kthread if we were spawned just for this callback. */
83123 + if (current->pid != xenwatch_pid)
83124 + do_exit(0);
83125 +
83126 + return 0;
83127 +}
83128 +
83129 +static int xenwatch_thread(void *unused)
83130 +{
83131 + struct list_head *ent;
83132 + struct xs_stored_msg *msg;
83133 +
83134 + for (;;) {
83135 + wait_event_interruptible(watch_events_waitq,
83136 + !list_empty(&watch_events));
83137 +
83138 + if (kthread_should_stop())
83139 + break;
83140 +
83141 + mutex_lock(&xenwatch_mutex);
83142 +
83143 + spin_lock(&watch_events_lock);
83144 + ent = watch_events.next;
83145 + if (ent != &watch_events)
83146 + list_del(ent);
83147 + spin_unlock(&watch_events_lock);
83148 +
83149 + if (ent != &watch_events) {
83150 + msg = list_entry(ent, struct xs_stored_msg, list);
83151 + if (msg->u.watch.handle->flags & XBWF_new_thread)
83152 + kthread_run(xenwatch_handle_callback,
83153 + msg, "xenwatch_cb");
83154 + else
83155 + xenwatch_handle_callback(msg);
83156 + }
83157 +
83158 + mutex_unlock(&xenwatch_mutex);
83159 + }
83160 +
83161 + return 0;
83162 +}
83163 +
83164 +static int process_msg(void)
83165 +{
83166 + struct xs_stored_msg *msg;
83167 + char *body;
83168 + int err;
83169 +
83170 + msg = kmalloc(sizeof(*msg), GFP_KERNEL);
83171 + if (msg == NULL)
83172 + return -ENOMEM;
83173 +
83174 + err = xb_read(&msg->hdr, sizeof(msg->hdr));
83175 + if (err) {
83176 + kfree(msg);
83177 + return err;
83178 + }
83179 +
83180 + body = kmalloc(msg->hdr.len + 1, GFP_KERNEL);
83181 + if (body == NULL) {
83182 + kfree(msg);
83183 + return -ENOMEM;
83184 + }
83185 +
83186 + err = xb_read(body, msg->hdr.len);
83187 + if (err) {
83188 + kfree(body);
83189 + kfree(msg);
83190 + return err;
83191 + }
83192 + body[msg->hdr.len] = '\0';
83193 +
83194 + if (msg->hdr.type == XS_WATCH_EVENT) {
83195 + msg->u.watch.vec = split(body, msg->hdr.len,
83196 + &msg->u.watch.vec_size);
83197 + if (IS_ERR(msg->u.watch.vec)) {
83198 + kfree(msg);
83199 + return PTR_ERR(msg->u.watch.vec);
83200 + }
83201 +
83202 + spin_lock(&watches_lock);
83203 + msg->u.watch.handle = find_watch(
83204 + msg->u.watch.vec[XS_WATCH_TOKEN]);
83205 + if (msg->u.watch.handle != NULL) {
83206 + spin_lock(&watch_events_lock);
83207 + list_add_tail(&msg->list, &watch_events);
83208 + wake_up(&watch_events_waitq);
83209 + spin_unlock(&watch_events_lock);
83210 + } else {
83211 + kfree(msg->u.watch.vec);
83212 + kfree(msg);
83213 + }
83214 + spin_unlock(&watches_lock);
83215 + } else {
83216 + msg->u.reply.body = body;
83217 + spin_lock(&xs_state.reply_lock);
83218 + list_add_tail(&msg->list, &xs_state.reply_list);
83219 + spin_unlock(&xs_state.reply_lock);
83220 + wake_up(&xs_state.reply_waitq);
83221 + }
83222 +
83223 + return 0;
83224 +}
83225 +
83226 +static int xenbus_thread(void *unused)
83227 +{
83228 + int err;
83229 +
83230 + for (;;) {
83231 + err = process_msg();
83232 + if (err)
83233 + printk(KERN_WARNING "XENBUS error %d while reading "
83234 + "message\n", err);
83235 + if (kthread_should_stop())
83236 + break;
83237 + }
83238 +
83239 + return 0;
83240 +}
83241 +
83242 +int xs_init(void)
83243 +{
83244 + int err;
83245 + struct task_struct *task;
83246 +
83247 + INIT_LIST_HEAD(&xs_state.reply_list);
83248 + spin_lock_init(&xs_state.reply_lock);
83249 + init_waitqueue_head(&xs_state.reply_waitq);
83250 +
83251 + mutex_init(&xs_state.request_mutex);
83252 + init_rwsem(&xs_state.suspend_mutex);
83253 +
83254 + /* Initialize the shared memory rings to talk to xenstored */
83255 + err = xb_init_comms();
83256 + if (err)
83257 + return err;
83258 +
83259 + task = kthread_run(xenwatch_thread, NULL, "xenwatch");
83260 + if (IS_ERR(task))
83261 + return PTR_ERR(task);
83262 + xenwatch_pid = task->pid;
83263 +
83264 + task = kthread_run(xenbus_thread, NULL, "xenbus");
83265 + if (IS_ERR(task))
83266 + return PTR_ERR(task);
83267 +
83268 + return 0;
83269 +}
83270 diff -Nur linux-2.6.16.33-noxen/drivers/xen/xenoprof/xenoprofile.c linux-2.6.16.33/drivers/xen/xenoprof/xenoprofile.c
83271 --- linux-2.6.16.33-noxen/drivers/xen/xenoprof/xenoprofile.c 1970-01-01 00:00:00.000000000 +0000
83272 +++ linux-2.6.16.33/drivers/xen/xenoprof/xenoprofile.c 2007-01-08 15:00:45.000000000 +0000
83273 @@ -0,0 +1,500 @@
83274 +/**
83275 + * @file xenoprofile.c
83276 + *
83277 + * @remark Copyright 2002 OProfile authors
83278 + * @remark Read the file COPYING
83279 + *
83280 + * @author John Levon <levon@movementarian.org>
83281 + *
83282 + * Modified by Aravind Menon and Jose Renato Santos for Xen
83283 + * These modifications are:
83284 + * Copyright (C) 2005 Hewlett-Packard Co.
83285 + *
83286 + * Separated out arch-generic part
83287 + * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp>
83288 + * VA Linux Systems Japan K.K.
83289 + */
83290 +
83291 +#include <linux/init.h>
83292 +#include <linux/notifier.h>
83293 +#include <linux/smp.h>
83294 +#include <linux/oprofile.h>
83295 +#include <linux/sysdev.h>
83296 +#include <linux/slab.h>
83297 +#include <linux/interrupt.h>
83298 +#include <linux/vmalloc.h>
83299 +#include <asm/pgtable.h>
83300 +#include <xen/evtchn.h>
83301 +#include <xen/xenoprof.h>
83302 +#include <xen/driver_util.h>
83303 +#include <xen/interface/xen.h>
83304 +#include <xen/interface/xenoprof.h>
83305 +#include "../../../drivers/oprofile/cpu_buffer.h"
83306 +#include "../../../drivers/oprofile/event_buffer.h"
83307 +
83308 +#define MAX_XENOPROF_SAMPLES 16
83309 +
83310 +/* sample buffers shared with Xen */
83311 +xenoprof_buf_t * xenoprof_buf[MAX_VIRT_CPUS];
83312 +/* Shared buffer area */
83313 +struct xenoprof_shared_buffer shared_buffer;
83314 +
83315 +/* Passive sample buffers shared with Xen */
83316 +xenoprof_buf_t *p_xenoprof_buf[MAX_OPROF_DOMAINS][MAX_VIRT_CPUS];
83317 +/* Passive shared buffer area */
83318 +struct xenoprof_shared_buffer p_shared_buffer[MAX_OPROF_DOMAINS];
83319 +
83320 +static int xenoprof_start(void);
83321 +static void xenoprof_stop(void);
83322 +
83323 +static int xenoprof_enabled = 0;
83324 +static int xenoprof_is_primary = 0;
83325 +static int active_defined;
83326 +
83327 +/* Number of buffers in shared area (one per VCPU) */
83328 +int nbuf;
83329 +/* Mappings of VIRQ_XENOPROF to irq number (per cpu) */
83330 +int ovf_irq[NR_CPUS];
83331 +/* cpu model type string - copied from Xen memory space on XENOPROF_init command */
83332 +char cpu_type[XENOPROF_CPU_TYPE_SIZE];
83333 +
83334 +#ifdef CONFIG_PM
83335 +
83336 +static int xenoprof_suspend(struct sys_device * dev, pm_message_t state)
83337 +{
83338 + if (xenoprof_enabled == 1)
83339 + xenoprof_stop();
83340 + return 0;
83341 +}
83342 +
83343 +
83344 +static int xenoprof_resume(struct sys_device * dev)
83345 +{
83346 + if (xenoprof_enabled == 1)
83347 + xenoprof_start();
83348 + return 0;
83349 +}
83350 +
83351 +
83352 +static struct sysdev_class oprofile_sysclass = {
83353 + set_kset_name("oprofile"),
83354 + .resume = xenoprof_resume,
83355 + .suspend = xenoprof_suspend
83356 +};
83357 +
83358 +
83359 +static struct sys_device device_oprofile = {
83360 + .id = 0,
83361 + .cls = &oprofile_sysclass,
83362 +};
83363 +
83364 +
83365 +static int __init init_driverfs(void)
83366 +{
83367 + int error;
83368 + if (!(error = sysdev_class_register(&oprofile_sysclass)))
83369 + error = sysdev_register(&device_oprofile);
83370 + return error;
83371 +}
83372 +
83373 +
83374 +static void exit_driverfs(void)
83375 +{
83376 + sysdev_unregister(&device_oprofile);
83377 + sysdev_class_unregister(&oprofile_sysclass);
83378 +}
83379 +
83380 +#else
83381 +#define init_driverfs() do { } while (0)
83382 +#define exit_driverfs() do { } while (0)
83383 +#endif /* CONFIG_PM */
83384 +
83385 +unsigned long long oprofile_samples = 0;
83386 +unsigned long long p_oprofile_samples = 0;
83387 +
83388 +unsigned int pdomains;
83389 +struct xenoprof_passive passive_domains[MAX_OPROF_DOMAINS];
83390 +
83391 +static void xenoprof_add_pc(xenoprof_buf_t *buf, int is_passive)
83392 +{
83393 + int head, tail, size;
83394 +
83395 + head = buf->event_head;
83396 + tail = buf->event_tail;
83397 + size = buf->event_size;
83398 +
83399 + if (tail > head) {
83400 + while (tail < size) {
83401 + oprofile_add_pc(buf->event_log[tail].eip,
83402 + buf->event_log[tail].mode,
83403 + buf->event_log[tail].event);
83404 + if (!is_passive)
83405 + oprofile_samples++;
83406 + else
83407 + p_oprofile_samples++;
83408 + tail++;
83409 + }
83410 + tail = 0;
83411 + }
83412 + while (tail < head) {
83413 + oprofile_add_pc(buf->event_log[tail].eip,
83414 + buf->event_log[tail].mode,
83415 + buf->event_log[tail].event);
83416 + if (!is_passive)
83417 + oprofile_samples++;
83418 + else
83419 + p_oprofile_samples++;
83420 + tail++;
83421 + }
83422 +
83423 + buf->event_tail = tail;
83424 +}
83425 +
83426 +static void xenoprof_handle_passive(void)
83427 +{
83428 + int i, j;
83429 + int flag_domain, flag_switch = 0;
83430 +
83431 + for (i = 0; i < pdomains; i++) {
83432 + flag_domain = 0;
83433 + for (j = 0; j < passive_domains[i].nbuf; j++) {
83434 + xenoprof_buf_t *buf = p_xenoprof_buf[i][j];
83435 + if (buf->event_head == buf->event_tail)
83436 + continue;
83437 + if (!flag_domain) {
83438 + if (!oprofile_add_domain_switch(passive_domains[i].
83439 + domain_id))
83440 + goto done;
83441 + flag_domain = 1;
83442 + }
83443 + xenoprof_add_pc(buf, 1);
83444 + flag_switch = 1;
83445 + }
83446 + }
83447 +done:
83448 + if (flag_switch)
83449 + oprofile_add_domain_switch(COORDINATOR_DOMAIN);
83450 +}
83451 +
83452 +static irqreturn_t
83453 +xenoprof_ovf_interrupt(int irq, void * dev_id, struct pt_regs * regs)
83454 +{
83455 + struct xenoprof_buf * buf;
83456 + int cpu;
83457 + static unsigned long flag;
83458 +
83459 + cpu = smp_processor_id();
83460 + buf = xenoprof_buf[cpu];
83461 +
83462 + xenoprof_add_pc(buf, 0);
83463 +
83464 + if (xenoprof_is_primary && !test_and_set_bit(0, &flag)) {
83465 + xenoprof_handle_passive();
83466 + smp_mb__before_clear_bit();
83467 + clear_bit(0, &flag);
83468 + }
83469 +
83470 + return IRQ_HANDLED;
83471 +}
83472 +
83473 +
83474 +static void unbind_virq(void)
83475 +{
83476 + int i;
83477 +
83478 + for_each_online_cpu(i) {
83479 + if (ovf_irq[i] >= 0) {
83480 + unbind_from_irqhandler(ovf_irq[i], NULL);
83481 + ovf_irq[i] = -1;
83482 + }
83483 + }
83484 +}
83485 +
83486 +
83487 +static int bind_virq(void)
83488 +{
83489 + int i, result;
83490 +
83491 + for_each_online_cpu(i) {
83492 + result = bind_virq_to_irqhandler(VIRQ_XENOPROF,
83493 + i,
83494 + xenoprof_ovf_interrupt,
83495 + SA_INTERRUPT,
83496 + "xenoprof",
83497 + NULL);
83498 +
83499 + if (result < 0) {
83500 + unbind_virq();
83501 + return result;
83502 + }
83503 +
83504 + ovf_irq[i] = result;
83505 + }
83506 +
83507 + return 0;
83508 +}
83509 +
83510 +
83511 +static void unmap_passive_list(void)
83512 +{
83513 + int i;
83514 + for (i = 0; i < pdomains; i++)
83515 + xenoprof_arch_unmap_shared_buffer(&p_shared_buffer[i]);
83516 + pdomains = 0;
83517 +}
83518 +
83519 +
83520 +static int map_xenoprof_buffer(int max_samples)
83521 +{
83522 + struct xenoprof_get_buffer get_buffer;
83523 + struct xenoprof_buf *buf;
83524 + int ret, i;
83525 +
83526 + if ( shared_buffer.buffer )
83527 + return 0;
83528 +
83529 + get_buffer.max_samples = max_samples;
83530 + ret = xenoprof_arch_map_shared_buffer(&get_buffer, &shared_buffer);
83531 + if (ret)
83532 + return ret;
83533 + nbuf = get_buffer.nbuf;
83534 +
83535 + for (i=0; i< nbuf; i++) {
83536 + buf = (struct xenoprof_buf*)
83537 + &shared_buffer.buffer[i * get_buffer.bufsize];
83538 + BUG_ON(buf->vcpu_id >= MAX_VIRT_CPUS);
83539 + xenoprof_buf[buf->vcpu_id] = buf;
83540 + }
83541 +
83542 + return 0;
83543 +}
83544 +
83545 +
83546 +static int xenoprof_setup(void)
83547 +{
83548 + int ret;
83549 +
83550 + if ( (ret = map_xenoprof_buffer(MAX_XENOPROF_SAMPLES)) )
83551 + return ret;
83552 +
83553 + if ( (ret = bind_virq()) )
83554 + return ret;
83555 +
83556 + if (xenoprof_is_primary) {
83557 + /* Define dom0 as an active domain if not done yet */
83558 + if (!active_defined) {
83559 + domid_t domid;
83560 + ret = HYPERVISOR_xenoprof_op(XENOPROF_reset_active_list, NULL);
83561 + if (ret)
83562 + goto err;
83563 + domid = 0;
83564 + ret = HYPERVISOR_xenoprof_op(XENOPROF_set_active, &domid);
83565 + if (ret)
83566 + goto err;
83567 + active_defined = 1;
83568 + }
83569 +
83570 + ret = HYPERVISOR_xenoprof_op(XENOPROF_reserve_counters, NULL);
83571 + if (ret)
83572 + goto err;
83573 + xenoprof_arch_counter();
83574 + ret = HYPERVISOR_xenoprof_op(XENOPROF_setup_events, NULL);
83575 +
83576 + if (ret)
83577 + goto err;
83578 + }
83579 +
83580 + ret = HYPERVISOR_xenoprof_op(XENOPROF_enable_virq, NULL);
83581 + if (ret)
83582 + goto err;
83583 +
83584 + xenoprof_enabled = 1;
83585 + return 0;
83586 + err:
83587 + unbind_virq();
83588 + return ret;
83589 +}
83590 +
83591 +
83592 +static void xenoprof_shutdown(void)
83593 +{
83594 + xenoprof_enabled = 0;
83595 +
83596 + HYPERVISOR_xenoprof_op(XENOPROF_disable_virq, NULL);
83597 +
83598 + if (xenoprof_is_primary) {
83599 + HYPERVISOR_xenoprof_op(XENOPROF_release_counters, NULL);
83600 + active_defined = 0;
83601 + }
83602 +
83603 + unbind_virq();
83604 +
83605 + xenoprof_arch_unmap_shared_buffer(&shared_buffer);
83606 + if (xenoprof_is_primary)
83607 + unmap_passive_list();
83608 +}
83609 +
83610 +
83611 +static int xenoprof_start(void)
83612 +{
83613 + int ret = 0;
83614 +
83615 + if (xenoprof_is_primary)
83616 + ret = HYPERVISOR_xenoprof_op(XENOPROF_start, NULL);
83617 + if (!ret)
83618 + xenoprof_arch_start();
83619 + return ret;
83620 +}
83621 +
83622 +
83623 +static void xenoprof_stop(void)
83624 +{
83625 + if (xenoprof_is_primary)
83626 + HYPERVISOR_xenoprof_op(XENOPROF_stop, NULL);
83627 + xenoprof_arch_stop();
83628 +}
83629 +
83630 +
83631 +static int xenoprof_set_active(int * active_domains,
83632 + unsigned int adomains)
83633 +{
83634 + int ret = 0;
83635 + int i;
83636 + int set_dom0 = 0;
83637 + domid_t domid;
83638 +
83639 + if (!xenoprof_is_primary)
83640 + return 0;
83641 +
83642 + if (adomains > MAX_OPROF_DOMAINS)
83643 + return -E2BIG;
83644 +
83645 + ret = HYPERVISOR_xenoprof_op(XENOPROF_reset_active_list, NULL);
83646 + if (ret)
83647 + return ret;
83648 +
83649 + for (i=0; i<adomains; i++) {
83650 + domid = active_domains[i];
83651 + if (domid != active_domains[i]) {
83652 + ret = -EINVAL;
83653 + goto out;
83654 + }
83655 + ret = HYPERVISOR_xenoprof_op(XENOPROF_set_active, &domid);
83656 + if (ret)
83657 + goto out;
83658 + if (active_domains[i] == 0)
83659 + set_dom0 = 1;
83660 + }
83661 + /* dom0 must always be active but may not be in the list */
83662 + if (!set_dom0) {
83663 + domid = 0;
83664 + ret = HYPERVISOR_xenoprof_op(XENOPROF_set_active, &domid);
83665 + }
83666 +
83667 +out:
83668 + if (ret)
83669 + HYPERVISOR_xenoprof_op(XENOPROF_reset_active_list, NULL);
83670 + active_defined = !ret;
83671 + return ret;
83672 +}
83673 +
83674 +static int xenoprof_set_passive(int * p_domains,
83675 + unsigned int pdoms)
83676 +{
83677 + int ret;
83678 + int i, j;
83679 + struct xenoprof_buf *buf;
83680 +
83681 + if (!xenoprof_is_primary)
83682 + return 0;
83683 +
83684 + if (pdoms > MAX_OPROF_DOMAINS)
83685 + return -E2BIG;
83686 +
83687 + ret = HYPERVISOR_xenoprof_op(XENOPROF_reset_passive_list, NULL);
83688 + if (ret)
83689 + return ret;
83690 + unmap_passive_list();
83691 +
83692 + for (i = 0; i < pdoms; i++) {
83693 + passive_domains[i].domain_id = p_domains[i];
83694 + passive_domains[i].max_samples = 2048;
83695 + ret = xenoprof_arch_set_passive(&passive_domains[i],
83696 + &p_shared_buffer[i]);
83697 + if (ret)
83698 + goto out;
83699 + for (j = 0; j < passive_domains[i].nbuf; j++) {
83700 + buf = (struct xenoprof_buf *)
83701 + &p_shared_buffer[i].buffer[j * passive_domains[i].bufsize];
83702 + BUG_ON(buf->vcpu_id >= MAX_VIRT_CPUS);
83703 + p_xenoprof_buf[i][buf->vcpu_id] = buf;
83704 + }
83705 + }
83706 +
83707 + pdomains = pdoms;
83708 + return 0;
83709 +
83710 +out:
83711 + for (j = 0; j < i; j++)
83712 + xenoprof_arch_unmap_shared_buffer(&p_shared_buffer[j]);
83713 +
83714 + return ret;
83715 +}
83716 +
83717 +struct oprofile_operations xenoprof_ops = {
83718 +#ifdef HAVE_XENOPROF_CREATE_FILES
83719 + .create_files = xenoprof_create_files,
83720 +#endif
83721 + .set_active = xenoprof_set_active,
83722 + .set_passive = xenoprof_set_passive,
83723 + .setup = xenoprof_setup,
83724 + .shutdown = xenoprof_shutdown,
83725 + .start = xenoprof_start,
83726 + .stop = xenoprof_stop
83727 +};
83728 +
83729 +
83730 +/* in order to get driverfs right */
83731 +static int using_xenoprof;
83732 +
83733 +int __init xenoprofile_init(struct oprofile_operations * ops)
83734 +{
83735 + struct xenoprof_init init;
83736 + int ret, i;
83737 +
83738 + ret = HYPERVISOR_xenoprof_op(XENOPROF_init, &init);
83739 + if (!ret) {
83740 + xenoprof_arch_init_counter(&init);
83741 + xenoprof_is_primary = init.is_primary;
83742 +
83743 + /* cpu_type is detected by Xen */
83744 + cpu_type[XENOPROF_CPU_TYPE_SIZE-1] = 0;
83745 + strncpy(cpu_type, init.cpu_type, XENOPROF_CPU_TYPE_SIZE - 1);
83746 + xenoprof_ops.cpu_type = cpu_type;
83747 +
83748 + init_driverfs();
83749 + using_xenoprof = 1;
83750 + *ops = xenoprof_ops;
83751 +
83752 + for (i=0; i<NR_CPUS; i++)
83753 + ovf_irq[i] = -1;
83754 +
83755 + active_defined = 0;
83756 + }
83757 + printk(KERN_INFO "%s: ret %d, events %d, xenoprof_is_primary %d\n",
83758 + __func__, ret, init.num_events, xenoprof_is_primary);
83759 + return ret;
83760 +}
83761 +
83762 +
83763 +void xenoprofile_exit(void)
83764 +{
83765 + if (using_xenoprof)
83766 + exit_driverfs();
83767 +
83768 + xenoprof_arch_unmap_shared_buffer(&shared_buffer);
83769 + if (xenoprof_is_primary) {
83770 + unmap_passive_list();
83771 + HYPERVISOR_xenoprof_op(XENOPROF_shutdown, NULL);
83772 + }
83773 +}
83774 diff -Nur linux-2.6.16.33-noxen/fs/Kconfig linux-2.6.16.33/fs/Kconfig
83775 --- linux-2.6.16.33-noxen/fs/Kconfig 2006-11-22 18:06:31.000000000 +0000
83776 +++ linux-2.6.16.33/fs/Kconfig 2007-01-08 15:00:45.000000000 +0000
83777 @@ -841,6 +841,7 @@
83778 config HUGETLBFS
83779 bool "HugeTLB file system support"
83780 depends X86 || IA64 || PPC64 || SPARC64 || SUPERH || BROKEN
83781 + depends !XEN
83782
83783 config HUGETLB_PAGE
83784 def_bool HUGETLBFS
83785 diff -Nur linux-2.6.16.33-noxen/fs/aio.c linux-2.6.16.33/fs/aio.c
83786 --- linux-2.6.16.33-noxen/fs/aio.c 2006-11-22 18:06:31.000000000 +0000
83787 +++ linux-2.6.16.33/fs/aio.c 2007-05-23 21:00:01.000000000 +0000
83788 @@ -34,6 +34,11 @@
83789 #include <asm/uaccess.h>
83790 #include <asm/mmu_context.h>
83791
83792 +#ifdef CONFIG_EPOLL
83793 +#include <linux/poll.h>
83794 +#include <linux/eventpoll.h>
83795 +#endif
83796 +
83797 #if DEBUG > 1
83798 #define dprintk printk
83799 #else
83800 @@ -1016,6 +1021,10 @@
83801 if (waitqueue_active(&ctx->wait))
83802 wake_up(&ctx->wait);
83803
83804 +#ifdef CONFIG_EPOLL
83805 + if (ctx->file && waitqueue_active(&ctx->poll_wait))
83806 + wake_up(&ctx->poll_wait);
83807 +#endif
83808 if (ret)
83809 put_ioctx(ctx);
83810
83811 @@ -1025,6 +1034,8 @@
83812 /* aio_read_evt
83813 * Pull an event off of the ioctx's event ring. Returns the number of
83814 * events fetched (0 or 1 ;-)
83815 + * If the ent parameter is 0, this just returns the number of events
83816 + * that would be fetched.
83817 * FIXME: make this use cmpxchg.
83818 * TODO: make the ringbuffer user mmap()able (requires FIXME).
83819 */
83820 @@ -1047,13 +1058,18 @@
83821
83822 head = ring->head % info->nr;
83823 if (head != ring->tail) {
83824 - struct io_event *evp = aio_ring_event(info, head, KM_USER1);
83825 - *ent = *evp;
83826 - head = (head + 1) % info->nr;
83827 - smp_mb(); /* finish reading the event before updatng the head */
83828 - ring->head = head;
83829 - ret = 1;
83830 - put_aio_ring_event(evp, KM_USER1);
83831 + if (ent) { /* event requested */
83832 + struct io_event *evp =
83833 + aio_ring_event(info, head, KM_USER1);
83834 + *ent = *evp;
83835 + head = (head + 1) % info->nr;
83836 + /* finish reading the event before updating the head */
83837 + smp_mb();
83838 + ring->head = head;
83839 + ret = 1;
83840 + put_aio_ring_event(evp, KM_USER1);
83841 + } else /* only need to know availability */
83842 + ret = 1;
83843 }
83844 spin_unlock(&info->ring_lock);
83845
83846 @@ -1236,9 +1252,78 @@
83847
83848 aio_cancel_all(ioctx);
83849 wait_for_all_aios(ioctx);
83850 +#ifdef CONFIG_EPOLL
83851 + /* forget the poll file, but it's up to the user to close it */
83852 + if (ioctx->file) {
83853 + ioctx->file->private_data = 0;
83854 + ioctx->file = 0;
83855 + }
83856 +#endif
83857 put_ioctx(ioctx); /* once for the lookup */
83858 }
83859
83860 +#ifdef CONFIG_EPOLL
83861 +
83862 +static int aio_queue_fd_close(struct inode *inode, struct file *file)
83863 +{
83864 + struct kioctx *ioctx = file->private_data;
83865 + if (ioctx) {
83866 + file->private_data = 0;
83867 + spin_lock_irq(&ioctx->ctx_lock);
83868 + ioctx->file = 0;
83869 + spin_unlock_irq(&ioctx->ctx_lock);
83870 + }
83871 + return 0;
83872 +}
83873 +
83874 +static unsigned int aio_queue_fd_poll(struct file *file, poll_table *wait)
83875 +{ unsigned int pollflags = 0;
83876 + struct kioctx *ioctx = file->private_data;
83877 +
83878 + if (ioctx) {
83879 +
83880 + spin_lock_irq(&ioctx->ctx_lock);
83881 + /* Insert inside our poll wait queue */
83882 + poll_wait(file, &ioctx->poll_wait, wait);
83883 +
83884 + /* Check our condition */
83885 + if (aio_read_evt(ioctx, 0))
83886 + pollflags = POLLIN | POLLRDNORM;
83887 + spin_unlock_irq(&ioctx->ctx_lock);
83888 + }
83889 +
83890 + return pollflags;
83891 +}
83892 +
83893 +static struct file_operations aioq_fops = {
83894 + .release = aio_queue_fd_close,
83895 + .poll = aio_queue_fd_poll
83896 +};
83897 +
83898 +/* make_aio_fd:
83899 + * Create a file descriptor that can be used to poll the event queue.
83900 + * Based and piggybacked on the excellent epoll code.
83901 + */
83902 +
83903 +static int make_aio_fd(struct kioctx *ioctx)
83904 +{
83905 + int error, fd;
83906 + struct inode *inode;
83907 + struct file *file;
83908 +
83909 + error = ep_getfd(&fd, &inode, &file, NULL, &aioq_fops);
83910 + if (error)
83911 + return error;
83912 +
83913 + /* associate the file with the IO context */
83914 + file->private_data = ioctx;
83915 + ioctx->file = file;
83916 + init_waitqueue_head(&ioctx->poll_wait);
83917 + return fd;
83918 +}
83919 +#endif
83920 +
83921 +
83922 /* sys_io_setup:
83923 * Create an aio_context capable of receiving at least nr_events.
83924 * ctxp must not point to an aio_context that already exists, and
83925 @@ -1251,18 +1336,30 @@
83926 * resources are available. May fail with -EFAULT if an invalid
83927 * pointer is passed for ctxp. Will fail with -ENOSYS if not
83928 * implemented.
83929 + *
83930 + * To request a selectable fd, the user context has to be initialized
83931 + * to 1, instead of 0, and the return value is the fd.
83932 + * This keeps the system call compatible, since a non-zero value
83933 + * was not allowed so far.
83934 */
83935 asmlinkage long sys_io_setup(unsigned nr_events, aio_context_t __user *ctxp)
83936 {
83937 struct kioctx *ioctx = NULL;
83938 unsigned long ctx;
83939 long ret;
83940 + int make_fd = 0;
83941
83942 ret = get_user(ctx, ctxp);
83943 if (unlikely(ret))
83944 goto out;
83945
83946 ret = -EINVAL;
83947 +#ifdef CONFIG_EPOLL
83948 + if (ctx == 1) {
83949 + make_fd = 1;
83950 + ctx = 0;
83951 + }
83952 +#endif
83953 if (unlikely(ctx || nr_events == 0)) {
83954 pr_debug("EINVAL: io_setup: ctx %lu nr_events %u\n",
83955 ctx, nr_events);
83956 @@ -1273,8 +1370,12 @@
83957 ret = PTR_ERR(ioctx);
83958 if (!IS_ERR(ioctx)) {
83959 ret = put_user(ioctx->user_id, ctxp);
83960 - if (!ret)
83961 - return 0;
83962 +#ifdef CONFIG_EPOLL
83963 + if (make_fd && ret >= 0)
83964 + ret = make_aio_fd(ioctx);
83965 +#endif
83966 + if (ret >= 0)
83967 + return ret;
83968
83969 get_ioctx(ioctx); /* io_destroy() expects us to hold a ref */
83970 io_destroy(ioctx);
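For what the sys_io_setup change above means in practice: passing a non-zero initial value through ctxp requests the pollable descriptor, the real context id is still written back through ctxp, and the syscall's return value is the new fd. A rough user-space sketch, not part of the patch; the raw syscall and the <linux/aio_abi.h> types are assumptions:

#include <unistd.h>
#include <sys/syscall.h>
#include <linux/aio_abi.h>

static long setup_pollable_aio(aio_context_t *ctx)
{
	*ctx = 1;	/* non-zero input asks for the pollable fd */

	/* On success *ctx is overwritten with the usual context id and the
	 * return value is a file descriptor that poll()/epoll() report
	 * readable whenever completed events are queued in the ring. */
	return syscall(__NR_io_setup, 128, ctx);
}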
83971 diff -Nur linux-2.6.16.33-noxen/fs/eventpoll.c linux-2.6.16.33/fs/eventpoll.c
83972 --- linux-2.6.16.33-noxen/fs/eventpoll.c 2006-11-22 18:06:31.000000000 +0000
83973 +++ linux-2.6.16.33/fs/eventpoll.c 2007-05-23 21:00:01.000000000 +0000
83974 @@ -235,8 +235,6 @@
83975
83976 static void ep_poll_safewake_init(struct poll_safewake *psw);
83977 static void ep_poll_safewake(struct poll_safewake *psw, wait_queue_head_t *wq);
83978 -static int ep_getfd(int *efd, struct inode **einode, struct file **efile,
83979 - struct eventpoll *ep);
83980 static int ep_alloc(struct eventpoll **pep);
83981 static void ep_free(struct eventpoll *ep);
83982 static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd);
83983 @@ -266,7 +264,7 @@
83984 static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
83985 int maxevents, long timeout);
83986 static int eventpollfs_delete_dentry(struct dentry *dentry);
83987 -static struct inode *ep_eventpoll_inode(void);
83988 +static struct inode *ep_eventpoll_inode(struct file_operations *fops);
83989 static struct super_block *eventpollfs_get_sb(struct file_system_type *fs_type,
83990 int flags, const char *dev_name,
83991 void *data);
83992 @@ -525,7 +523,7 @@
83993 * Creates all the items needed to setup an eventpoll file. That is,
83994 * a file structure, and inode and a free file descriptor.
83995 */
83996 - error = ep_getfd(&fd, &inode, &file, ep);
83997 + error = ep_getfd(&fd, &inode, &file, ep, &eventpoll_fops);
83998 if (error)
83999 goto eexit_2;
84000
84001 @@ -710,8 +708,8 @@
84002 /*
84003 * Creates the file descriptor to be used by the epoll interface.
84004 */
84005 -static int ep_getfd(int *efd, struct inode **einode, struct file **efile,
84006 - struct eventpoll *ep)
84007 +int ep_getfd(int *efd, struct inode **einode, struct file **efile,
84008 + struct eventpoll *ep, struct file_operations *fops)
84009 {
84010 struct qstr this;
84011 char name[32];
84012 @@ -727,7 +725,7 @@
84013 goto eexit_1;
84014
84015 /* Allocates an inode from the eventpoll file system */
84016 - inode = ep_eventpoll_inode();
84017 + inode = ep_eventpoll_inode(fops);
84018 error = PTR_ERR(inode);
84019 if (IS_ERR(inode))
84020 goto eexit_2;
84021 @@ -758,7 +756,7 @@
84022
84023 file->f_pos = 0;
84024 file->f_flags = O_RDONLY;
84025 - file->f_op = &eventpoll_fops;
84026 + file->f_op = fops;
84027 file->f_mode = FMODE_READ;
84028 file->f_version = 0;
84029 file->private_data = ep;
84030 @@ -1574,7 +1572,7 @@
84031 }
84032
84033
84034 -static struct inode *ep_eventpoll_inode(void)
84035 +static struct inode *ep_eventpoll_inode(struct file_operations *fops)
84036 {
84037 int error = -ENOMEM;
84038 struct inode *inode = new_inode(eventpoll_mnt->mnt_sb);
84039 @@ -1582,7 +1580,7 @@
84040 if (!inode)
84041 goto eexit_1;
84042
84043 - inode->i_fop = &eventpoll_fops;
84044 + inode->i_fop = fops;
84045
84046 /*
84047 * Mark the inode dirty from the very beginning,
84048 diff -Nur linux-2.6.16.33-noxen/fs/proc/proc_misc.c linux-2.6.16.33/fs/proc/proc_misc.c
84049 --- linux-2.6.16.33-noxen/fs/proc/proc_misc.c 2006-11-22 18:06:31.000000000 +0000
84050 +++ linux-2.6.16.33/fs/proc/proc_misc.c 2007-05-23 21:00:01.000000000 +0000
84051 @@ -433,7 +433,7 @@
84052 (unsigned long long)cputime64_to_clock_t(irq),
84053 (unsigned long long)cputime64_to_clock_t(softirq),
84054 (unsigned long long)cputime64_to_clock_t(steal));
84055 - for_each_online_cpu(i) {
84056 + for_each_cpu(i) {
84057
84058 /* Copy values here to work around gcc-2.95.3, gcc-2.96 */
84059 user = kstat_cpu(i).cpustat.user;
84060 diff -Nur linux-2.6.16.33-noxen/include/asm-generic/vmlinux.lds.h linux-2.6.16.33/include/asm-generic/vmlinux.lds.h
84061 --- linux-2.6.16.33-noxen/include/asm-generic/vmlinux.lds.h 2006-11-22 18:06:31.000000000 +0000
84062 +++ linux-2.6.16.33/include/asm-generic/vmlinux.lds.h 2007-05-23 21:00:01.000000000 +0000
84063 @@ -152,3 +152,6 @@
84064 .stab.index 0 : { *(.stab.index) } \
84065 .stab.indexstr 0 : { *(.stab.indexstr) } \
84066 .comment 0 : { *(.comment) }
84067 +
84068 +#define NOTES \
84069 + .notes : { *(.note.*) } :note
84070 diff -Nur linux-2.6.16.33-noxen/include/asm-i386/a.out.h linux-2.6.16.33/include/asm-i386/a.out.h
84071 --- linux-2.6.16.33-noxen/include/asm-i386/a.out.h 2006-11-22 18:06:31.000000000 +0000
84072 +++ linux-2.6.16.33/include/asm-i386/a.out.h 2007-01-08 15:00:45.000000000 +0000
84073 @@ -19,7 +19,7 @@
84074
84075 #ifdef __KERNEL__
84076
84077 -#define STACK_TOP TASK_SIZE
84078 +#define STACK_TOP (TASK_SIZE - 3*PAGE_SIZE)
84079
84080 #endif
84081
84082 diff -Nur linux-2.6.16.33-noxen/include/asm-i386/apic.h linux-2.6.16.33/include/asm-i386/apic.h
84083 --- linux-2.6.16.33-noxen/include/asm-i386/apic.h 2006-11-22 18:06:31.000000000 +0000
84084 +++ linux-2.6.16.33/include/asm-i386/apic.h 2007-01-08 15:00:45.000000000 +0000
84085 @@ -132,10 +132,12 @@
84086
84087 extern int disable_timer_pin_1;
84088
84089 +#ifndef CONFIG_XEN
84090 void smp_send_timer_broadcast_ipi(struct pt_regs *regs);
84091 void switch_APIC_timer_to_ipi(void *cpumask);
84092 void switch_ipi_to_APIC_timer(void *cpumask);
84093 #define ARCH_APICTIMER_STOPS_ON_C3 1
84094 +#endif
84095
84096 extern int timer_over_8254;
84097
84098 diff -Nur linux-2.6.16.33-noxen/include/asm-i386/atomic.h linux-2.6.16.33/include/asm-i386/atomic.h
84099 --- linux-2.6.16.33-noxen/include/asm-i386/atomic.h 2006-11-22 18:06:31.000000000 +0000
84100 +++ linux-2.6.16.33/include/asm-i386/atomic.h 2007-01-08 15:00:45.000000000 +0000
84101 @@ -4,18 +4,13 @@
84102 #include <linux/config.h>
84103 #include <linux/compiler.h>
84104 #include <asm/processor.h>
84105 +#include <asm/smp_alt.h>
84106
84107 /*
84108 * Atomic operations that C can't guarantee us. Useful for
84109 * resource counting etc..
84110 */
84111
84112 -#ifdef CONFIG_SMP
84113 -#define LOCK "lock ; "
84114 -#else
84115 -#define LOCK ""
84116 -#endif
84117 -
84118 /*
84119 * Make sure gcc doesn't try to be clever and move things around
84120 * on us. We need to use _exactly_ the address the user gave us,
84121 diff -Nur linux-2.6.16.33-noxen/include/asm-i386/bitops.h linux-2.6.16.33/include/asm-i386/bitops.h
84122 --- linux-2.6.16.33-noxen/include/asm-i386/bitops.h 2006-11-22 18:06:31.000000000 +0000
84123 +++ linux-2.6.16.33/include/asm-i386/bitops.h 2007-01-08 15:00:45.000000000 +0000
84124 @@ -7,6 +7,7 @@
84125
84126 #include <linux/config.h>
84127 #include <linux/compiler.h>
84128 +#include <asm/smp_alt.h>
84129
84130 /*
84131 * These have to be done with inline assembly: that way the bit-setting
84132 @@ -16,12 +17,6 @@
84133 * bit 0 is the LSB of addr; bit 32 is the LSB of (addr+1).
84134 */
84135
84136 -#ifdef CONFIG_SMP
84137 -#define LOCK_PREFIX "lock ; "
84138 -#else
84139 -#define LOCK_PREFIX ""
84140 -#endif
84141 -
84142 #define ADDR (*(volatile long *) addr)
84143
84144 /**
84145 @@ -41,7 +36,7 @@
84146 */
84147 static inline void set_bit(int nr, volatile unsigned long * addr)
84148 {
84149 - __asm__ __volatile__( LOCK_PREFIX
84150 + __asm__ __volatile__( LOCK
84151 "btsl %1,%0"
84152 :"+m" (ADDR)
84153 :"Ir" (nr));
84154 @@ -76,7 +71,7 @@
84155 */
84156 static inline void clear_bit(int nr, volatile unsigned long * addr)
84157 {
84158 - __asm__ __volatile__( LOCK_PREFIX
84159 + __asm__ __volatile__( LOCK
84160 "btrl %1,%0"
84161 :"+m" (ADDR)
84162 :"Ir" (nr));
84163 @@ -121,7 +116,7 @@
84164 */
84165 static inline void change_bit(int nr, volatile unsigned long * addr)
84166 {
84167 - __asm__ __volatile__( LOCK_PREFIX
84168 + __asm__ __volatile__( LOCK
84169 "btcl %1,%0"
84170 :"+m" (ADDR)
84171 :"Ir" (nr));
84172 @@ -140,7 +135,7 @@
84173 {
84174 int oldbit;
84175
84176 - __asm__ __volatile__( LOCK_PREFIX
84177 + __asm__ __volatile__( LOCK
84178 "btsl %2,%1\n\tsbbl %0,%0"
84179 :"=r" (oldbit),"+m" (ADDR)
84180 :"Ir" (nr) : "memory");
84181 @@ -180,7 +175,7 @@
84182 {
84183 int oldbit;
84184
84185 - __asm__ __volatile__( LOCK_PREFIX
84186 + __asm__ __volatile__( LOCK
84187 "btrl %2,%1\n\tsbbl %0,%0"
84188 :"=r" (oldbit),"+m" (ADDR)
84189 :"Ir" (nr) : "memory");
84190 @@ -231,7 +226,7 @@
84191 {
84192 int oldbit;
84193
84194 - __asm__ __volatile__( LOCK_PREFIX
84195 + __asm__ __volatile__( LOCK
84196 "btcl %2,%1\n\tsbbl %0,%0"
84197 :"=r" (oldbit),"+m" (ADDR)
84198 :"Ir" (nr) : "memory");
84199 diff -Nur linux-2.6.16.33-noxen/include/asm-i386/elf.h linux-2.6.16.33/include/asm-i386/elf.h
84200 --- linux-2.6.16.33-noxen/include/asm-i386/elf.h 2006-11-22 18:06:31.000000000 +0000
84201 +++ linux-2.6.16.33/include/asm-i386/elf.h 2007-01-08 15:00:45.000000000 +0000
84202 @@ -129,11 +129,16 @@
84203 #define ELF_CORE_COPY_FPREGS(tsk, elf_fpregs) dump_task_fpu(tsk, elf_fpregs)
84204 #define ELF_CORE_COPY_XFPREGS(tsk, elf_xfpregs) dump_task_extended_fpu(tsk, elf_xfpregs)
84205
84206 -#define VSYSCALL_BASE (__fix_to_virt(FIX_VSYSCALL))
84207 +#define VSYSCALL_BASE (PAGE_OFFSET - 2*PAGE_SIZE)
84208 #define VSYSCALL_EHDR ((const struct elfhdr *) VSYSCALL_BASE)
84209 #define VSYSCALL_ENTRY ((unsigned long) &__kernel_vsyscall)
84210 extern void __kernel_vsyscall;
84211
84212 +#define ARCH_HAS_SETUP_ADDITIONAL_PAGES
84213 +struct linux_binprm;
84214 +extern int arch_setup_additional_pages(struct linux_binprm *bprm,
84215 + int executable_stack);
84216 +
84217 #define ARCH_DLINFO \
84218 do { \
84219 NEW_AUX_ENT(AT_SYSINFO, VSYSCALL_ENTRY); \
84220 diff -Nur linux-2.6.16.33-noxen/include/asm-i386/fixmap.h linux-2.6.16.33/include/asm-i386/fixmap.h
84221 --- linux-2.6.16.33-noxen/include/asm-i386/fixmap.h 2006-11-22 18:06:31.000000000 +0000
84222 +++ linux-2.6.16.33/include/asm-i386/fixmap.h 2007-01-08 15:00:45.000000000 +0000
84223 @@ -20,7 +20,7 @@
84224 * Leave one empty page between vmalloc'ed areas and
84225 * the start of the fixmap.
84226 */
84227 -#define __FIXADDR_TOP 0xfffff000
84228 +extern unsigned long __FIXADDR_TOP;
84229
84230 #ifndef __ASSEMBLY__
84231 #include <linux/kernel.h>
84232 @@ -52,7 +52,6 @@
84233 */
84234 enum fixed_addresses {
84235 FIX_HOLE,
84236 - FIX_VSYSCALL,
84237 #ifdef CONFIG_X86_LOCAL_APIC
84238 FIX_APIC_BASE, /* local (CPU) APIC) -- required for SMP or not */
84239 #endif
84240 @@ -95,6 +94,8 @@
84241 extern void __set_fixmap (enum fixed_addresses idx,
84242 unsigned long phys, pgprot_t flags);
84243
84244 +extern void set_fixaddr_top(unsigned long top);
84245 +
84246 #define set_fixmap(idx, phys) \
84247 __set_fixmap(idx, phys, PAGE_KERNEL)
84248 /*
84249 @@ -116,14 +117,6 @@
84250 #define __fix_to_virt(x) (FIXADDR_TOP - ((x) << PAGE_SHIFT))
84251 #define __virt_to_fix(x) ((FIXADDR_TOP - ((x)&PAGE_MASK)) >> PAGE_SHIFT)
84252
84253 -/*
84254 - * This is the range that is readable by user mode, and things
84255 - * acting like user mode such as get_user_pages.
84256 - */
84257 -#define FIXADDR_USER_START (__fix_to_virt(FIX_VSYSCALL))
84258 -#define FIXADDR_USER_END (FIXADDR_USER_START + PAGE_SIZE)
84259 -
84260 -
84261 extern void __this_fixmap_does_not_exist(void);
84262
84263 /*
84264 diff -Nur linux-2.6.16.33-noxen/include/asm-i386/futex.h linux-2.6.16.33/include/asm-i386/futex.h
84265 --- linux-2.6.16.33-noxen/include/asm-i386/futex.h 2006-11-22 18:06:31.000000000 +0000
84266 +++ linux-2.6.16.33/include/asm-i386/futex.h 2007-01-08 15:00:45.000000000 +0000
84267 @@ -28,7 +28,7 @@
84268 "1: movl %2, %0\n\
84269 movl %0, %3\n" \
84270 insn "\n" \
84271 -"2: " LOCK_PREFIX "cmpxchgl %3, %2\n\
84272 +"2: " LOCK "cmpxchgl %3, %2\n\
84273 jnz 1b\n\
84274 3: .section .fixup,\"ax\"\n\
84275 4: mov %5, %1\n\
84276 @@ -68,7 +68,7 @@
84277 #endif
84278 switch (op) {
84279 case FUTEX_OP_ADD:
84280 - __futex_atomic_op1(LOCK_PREFIX "xaddl %0, %2", ret,
84281 + __futex_atomic_op1(LOCK "xaddl %0, %2", ret,
84282 oldval, uaddr, oparg);
84283 break;
84284 case FUTEX_OP_OR:
84285 diff -Nur linux-2.6.16.33-noxen/include/asm-i386/kexec.h linux-2.6.16.33/include/asm-i386/kexec.h
84286 --- linux-2.6.16.33-noxen/include/asm-i386/kexec.h 2006-11-22 18:06:31.000000000 +0000
84287 +++ linux-2.6.16.33/include/asm-i386/kexec.h 2007-01-08 15:00:45.000000000 +0000
84288 @@ -1,6 +1,26 @@
84289 #ifndef _I386_KEXEC_H
84290 #define _I386_KEXEC_H
84291
84292 +#define PA_CONTROL_PAGE 0
84293 +#define VA_CONTROL_PAGE 1
84294 +#define PA_PGD 2
84295 +#define VA_PGD 3
84296 +#define PA_PTE_0 4
84297 +#define VA_PTE_0 5
84298 +#define PA_PTE_1 6
84299 +#define VA_PTE_1 7
84300 +#ifdef CONFIG_X86_PAE
84301 +#define PA_PMD_0 8
84302 +#define VA_PMD_0 9
84303 +#define PA_PMD_1 10
84304 +#define VA_PMD_1 11
84305 +#define PAGES_NR 12
84306 +#else
84307 +#define PAGES_NR 8
84308 +#endif
84309 +
84310 +#ifndef __ASSEMBLY__
84311 +
84312 #include <asm/fixmap.h>
84313 #include <asm/ptrace.h>
84314 #include <asm/string.h>
84315 @@ -72,5 +92,26 @@
84316 newregs->eip = (unsigned long)current_text_addr();
84317 }
84318 }
84319 +asmlinkage NORET_TYPE void
84320 +relocate_kernel(unsigned long indirection_page,
84321 + unsigned long control_page,
84322 + unsigned long start_address,
84323 + unsigned int has_pae) ATTRIB_NORET;
84324 +
84325 +
84326 +/* Under Xen we need to work with machine addresses. These macros give the
84327 + * machine address of a certain page to the generic kexec code instead of
84328 + * the pseudo physical address which would be given by the default macros.
84329 + */
84330 +
84331 +#ifdef CONFIG_XEN
84332 +#define KEXEC_ARCH_HAS_PAGE_MACROS
84333 +#define kexec_page_to_pfn(page) pfn_to_mfn(page_to_pfn(page))
84334 +#define kexec_pfn_to_page(pfn) pfn_to_page(mfn_to_pfn(pfn))
84335 +#define kexec_virt_to_phys(addr) virt_to_machine(addr)
84336 +#define kexec_phys_to_virt(addr) phys_to_virt(machine_to_phys(addr))
84337 +#endif
84338 +
84339 +#endif /* __ASSEMBLY__ */
84340
84341 #endif /* _I386_KEXEC_H */
84342 diff -Nur linux-2.6.16.33-noxen/include/asm-i386/mach-default/mach_traps.h linux-2.6.16.33/include/asm-i386/mach-default/mach_traps.h
84343 --- linux-2.6.16.33-noxen/include/asm-i386/mach-default/mach_traps.h 2006-11-22 18:06:31.000000000 +0000
84344 +++ linux-2.6.16.33/include/asm-i386/mach-default/mach_traps.h 2007-01-08 15:00:45.000000000 +0000
84345 @@ -15,6 +15,18 @@
84346 outb(reason, 0x61);
84347 }
84348
84349 +static inline void clear_io_check_error(unsigned char reason)
84350 +{
84351 + unsigned long i;
84352 +
84353 + reason = (reason & 0xf) | 8;
84354 + outb(reason, 0x61);
84355 + i = 2000;
84356 + while (--i) udelay(1000);
84357 + reason &= ~8;
84358 + outb(reason, 0x61);
84359 +}
84360 +
84361 static inline unsigned char get_nmi_reason(void)
84362 {
84363 return inb(0x61);
84364 diff -Nur linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/agp.h linux-2.6.16.33/include/asm-i386/mach-xen/asm/agp.h
84365 --- linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/agp.h 1970-01-01 00:00:00.000000000 +0000
84366 +++ linux-2.6.16.33/include/asm-i386/mach-xen/asm/agp.h 2007-01-08 15:00:45.000000000 +0000
84367 @@ -0,0 +1,37 @@
84368 +#ifndef AGP_H
84369 +#define AGP_H 1
84370 +
84371 +#include <asm/pgtable.h>
84372 +#include <asm/cacheflush.h>
84373 +#include <asm/system.h>
84374 +
84375 +/*
84376 + * Functions to keep the agpgart mappings coherent with the MMU.
84377 + * The GART gives the CPU a physical alias of pages in memory. The alias region is
84378 + * mapped uncacheable. Make sure there are no conflicting mappings
84379 + * with different cacheability attributes for the same page. This avoids
84380 + * data corruption on some CPUs.
84381 + */
84382 +
84383 +int map_page_into_agp(struct page *page);
84384 +int unmap_page_from_agp(struct page *page);
84385 +#define flush_agp_mappings() global_flush_tlb()
84386 +
84387 +/* Could use CLFLUSH here if the cpu supports it. But then it would
84388 + need to be called for each cacheline of the whole page so it may not be
84389 + worth it. Would need a page for it. */
84390 +#define flush_agp_cache() wbinvd()
84391 +
84392 +/* Convert a physical address to an address suitable for the GART. */
84393 +#define phys_to_gart(x) phys_to_machine(x)
84394 +#define gart_to_phys(x) machine_to_phys(x)
84395 +
84396 +/* GATT allocation. Returns/accepts GATT kernel virtual address. */
84397 +#define alloc_gatt_pages(order) ({ \
84398 + char *_t; dma_addr_t _d; \
84399 + _t = dma_alloc_coherent(NULL,PAGE_SIZE<<(order),&_d,GFP_KERNEL); \
84400 + _t; })
84401 +#define free_gatt_pages(table, order) \
84402 + dma_free_coherent(NULL,PAGE_SIZE<<(order),(table),virt_to_bus(table))
84403 +
84404 +#endif
84405 diff -Nur linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/desc.h linux-2.6.16.33/include/asm-i386/mach-xen/asm/desc.h
84406 --- linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/desc.h 1970-01-01 00:00:00.000000000 +0000
84407 +++ linux-2.6.16.33/include/asm-i386/mach-xen/asm/desc.h 2007-01-08 15:00:45.000000000 +0000
84408 @@ -0,0 +1,164 @@
84409 +#ifndef __ARCH_DESC_H
84410 +#define __ARCH_DESC_H
84411 +
84412 +#include <asm/ldt.h>
84413 +#include <asm/segment.h>
84414 +
84415 +#define CPU_16BIT_STACK_SIZE 1024
84416 +
84417 +#ifndef __ASSEMBLY__
84418 +
84419 +#include <linux/preempt.h>
84420 +#include <linux/smp.h>
84421 +
84422 +#include <asm/mmu.h>
84423 +
84424 +extern struct desc_struct cpu_gdt_table[GDT_ENTRIES];
84425 +
84426 +DECLARE_PER_CPU(unsigned char, cpu_16bit_stack[CPU_16BIT_STACK_SIZE]);
84427 +
84428 +struct Xgt_desc_struct {
84429 + unsigned short size;
84430 + unsigned long address __attribute__((packed));
84431 + unsigned short pad;
84432 +} __attribute__ ((packed));
84433 +
84434 +extern struct Xgt_desc_struct idt_descr;
84435 +DECLARE_PER_CPU(struct Xgt_desc_struct, cpu_gdt_descr);
84436 +
84437 +
84438 +static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
84439 +{
84440 + return (struct desc_struct *)per_cpu(cpu_gdt_descr, cpu).address;
84441 +}
84442 +
84443 +#define load_TR_desc() __asm__ __volatile__("ltr %w0"::"q" (GDT_ENTRY_TSS*8))
84444 +#define load_LDT_desc() __asm__ __volatile__("lldt %w0"::"q" (GDT_ENTRY_LDT*8))
84445 +
84446 +#define load_gdt(dtr) __asm__ __volatile("lgdt %0"::"m" (*dtr))
84447 +#define load_idt(dtr) __asm__ __volatile("lidt %0"::"m" (*dtr))
84448 +#define load_tr(tr) __asm__ __volatile("ltr %0"::"mr" (tr))
84449 +#define load_ldt(ldt) __asm__ __volatile("lldt %0"::"mr" (ldt))
84450 +
84451 +#define store_gdt(dtr) __asm__ ("sgdt %0":"=m" (*dtr))
84452 +#define store_idt(dtr) __asm__ ("sidt %0":"=m" (*dtr))
84453 +#define store_tr(tr) __asm__ ("str %0":"=mr" (tr))
84454 +#define store_ldt(ldt) __asm__ ("sldt %0":"=mr" (ldt))
84455 +
84456 +/*
84457 + * This is the ldt that every process will get unless we need
84458 + * something other than this.
84459 + */
84460 +extern struct desc_struct default_ldt[];
84461 +extern void set_intr_gate(unsigned int irq, void * addr);
84462 +
84463 +#define _set_tssldt_desc(n,addr,limit,type) \
84464 +__asm__ __volatile__ ("movw %w3,0(%2)\n\t" \
84465 + "movw %w1,2(%2)\n\t" \
84466 + "rorl $16,%1\n\t" \
84467 + "movb %b1,4(%2)\n\t" \
84468 + "movb %4,5(%2)\n\t" \
84469 + "movb $0,6(%2)\n\t" \
84470 + "movb %h1,7(%2)\n\t" \
84471 + "rorl $16,%1" \
84472 + : "=m"(*(n)) : "q" (addr), "r"(n), "ir"(limit), "i"(type))
84473 +
84474 +#ifndef CONFIG_X86_NO_TSS
84475 +static inline void __set_tss_desc(unsigned int cpu, unsigned int entry, void *addr)
84476 +{
84477 + _set_tssldt_desc(&get_cpu_gdt_table(cpu)[entry], (int)addr,
84478 + offsetof(struct tss_struct, __cacheline_filler) - 1, 0x89);
84479 +}
84480 +
84481 +#define set_tss_desc(cpu,addr) __set_tss_desc(cpu, GDT_ENTRY_TSS, addr)
84482 +#endif
84483 +
84484 +static inline void set_ldt_desc(unsigned int cpu, void *addr, unsigned int size)
84485 +{
84486 + _set_tssldt_desc(&get_cpu_gdt_table(cpu)[GDT_ENTRY_LDT], (int)addr, ((size << 3)-1), 0x82);
84487 +}
84488 +
84489 +#define LDT_entry_a(info) \
84490 + ((((info)->base_addr & 0x0000ffff) << 16) | ((info)->limit & 0x0ffff))
84491 +
84492 +#define LDT_entry_b(info) \
84493 + (((info)->base_addr & 0xff000000) | \
84494 + (((info)->base_addr & 0x00ff0000) >> 16) | \
84495 + ((info)->limit & 0xf0000) | \
84496 + (((info)->read_exec_only ^ 1) << 9) | \
84497 + ((info)->contents << 10) | \
84498 + (((info)->seg_not_present ^ 1) << 15) | \
84499 + ((info)->seg_32bit << 22) | \
84500 + ((info)->limit_in_pages << 23) | \
84501 + ((info)->useable << 20) | \
84502 + 0x7000)
84503 +
84504 +#define LDT_empty(info) (\
84505 + (info)->base_addr == 0 && \
84506 + (info)->limit == 0 && \
84507 + (info)->contents == 0 && \
84508 + (info)->read_exec_only == 1 && \
84509 + (info)->seg_32bit == 0 && \
84510 + (info)->limit_in_pages == 0 && \
84511 + (info)->seg_not_present == 1 && \
84512 + (info)->useable == 0 )
84513 +
84514 +extern int write_ldt_entry(void *ldt, int entry, __u32 entry_a, __u32 entry_b);
84515 +
84516 +#if TLS_SIZE != 24
84517 +# error update this code.
84518 +#endif
84519 +
84520 +static inline void load_TLS(struct thread_struct *t, unsigned int cpu)
84521 +{
84522 +#define C(i) HYPERVISOR_update_descriptor(virt_to_machine(&get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i]), *(u64 *)&t->tls_array[i])
84523 + C(0); C(1); C(2);
84524 +#undef C
84525 +}
84526 +
84527 +static inline void clear_LDT(void)
84528 +{
84529 + int cpu = get_cpu();
84530 +
84531 + /*
84532 + * NB. We load the default_ldt for lcall7/27 handling on demand, as
84533 + * it slows down context switching. No one uses it anyway.
84534 + */
84535 + cpu = cpu; /* XXX avoid compiler warning */
84536 + xen_set_ldt(0UL, 0);
84537 + put_cpu();
84538 +}
84539 +
84540 +/*
84541 + * load one particular LDT into the current CPU
84542 + */
84543 +static inline void load_LDT_nolock(mm_context_t *pc, int cpu)
84544 +{
84545 + void *segments = pc->ldt;
84546 + int count = pc->size;
84547 +
84548 + if (likely(!count))
84549 + segments = NULL;
84550 +
84551 + xen_set_ldt((unsigned long)segments, count);
84552 +}
84553 +
84554 +static inline void load_LDT(mm_context_t *pc)
84555 +{
84556 + int cpu = get_cpu();
84557 + load_LDT_nolock(pc, cpu);
84558 + put_cpu();
84559 +}
84560 +
84561 +static inline unsigned long get_desc_base(unsigned long *desc)
84562 +{
84563 + unsigned long base;
84564 + base = ((desc[0] >> 16) & 0x0000ffff) |
84565 + ((desc[1] << 16) & 0x00ff0000) |
84566 + (desc[1] & 0xff000000);
84567 + return base;
84568 +}
84569 +
84570 +#endif /* !__ASSEMBLY__ */
84571 +
84572 +#endif
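The LDT_entry_a()/LDT_entry_b() macros above pack a user-supplied segment description into the two 32-bit words of an x86 descriptor: the low word mixes base[15:0] and limit[15:0], the high word carries base[31:16], limit[19:16] and the access and flag bits. Below is a minimal userspace sketch of the same bit layout; the struct is only a local stand-in for the kernel's user_desc, not the real <asm/ldt.h> definition.

/* Userspace sketch of the LDT_entry_a/LDT_entry_b bit packing above. */
#include <stdio.h>
#include <stdint.h>

struct fake_user_desc {
	uint32_t base_addr;
	uint32_t limit;
	unsigned read_exec_only:1, contents:2, seg_not_present:1;
	unsigned seg_32bit:1, limit_in_pages:1, useable:1;
};

static uint32_t entry_a(const struct fake_user_desc *d)
{
	return ((d->base_addr & 0x0000ffff) << 16) | (d->limit & 0x0ffff);
}

static uint32_t entry_b(const struct fake_user_desc *d)
{
	return (d->base_addr & 0xff000000) |
	       ((d->base_addr & 0x00ff0000) >> 16) |
	       (d->limit & 0xf0000) |
	       ((d->read_exec_only ^ 1) << 9) |
	       (d->contents << 10) |
	       ((d->seg_not_present ^ 1) << 15) |
	       (d->seg_32bit << 22) |
	       (d->limit_in_pages << 23) |
	       (d->useable << 20) |
	       0x7000;	/* S=1 (code/data segment), DPL=3 */
}

int main(void)
{
	/* a flat 32-bit data segment with a 4 GiB, page-granular limit */
	struct fake_user_desc d = {
		.base_addr = 0, .limit = 0xfffff,
		.contents = 0, .read_exec_only = 0,
		.seg_32bit = 1, .limit_in_pages = 1,
		.seg_not_present = 0, .useable = 1,
	};
	printf("entry_a=%08x entry_b=%08x\n",
	       (unsigned)entry_a(&d), (unsigned)entry_b(&d));
	return 0;
}

Under Xen the resulting descriptor is not written to the GDT directly; as load_TLS() above shows, it is handed to the hypervisor through HYPERVISOR_update_descriptor().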
84573 diff -Nur linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/dma-mapping.h linux-2.6.16.33/include/asm-i386/mach-xen/asm/dma-mapping.h
84574 --- linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/dma-mapping.h 1970-01-01 00:00:00.000000000 +0000
84575 +++ linux-2.6.16.33/include/asm-i386/mach-xen/asm/dma-mapping.h 2007-01-08 15:00:45.000000000 +0000
84576 @@ -0,0 +1,152 @@
84577 +#ifndef _ASM_I386_DMA_MAPPING_H
84578 +#define _ASM_I386_DMA_MAPPING_H
84579 +
84580 +/*
84581 + * IOMMU interface. See Documentation/DMA-mapping.txt and DMA-API.txt for
84582 + * documentation.
84583 + */
84584 +
84585 +#include <linux/config.h>
84586 +#include <linux/mm.h>
84587 +#include <asm/cache.h>
84588 +#include <asm/io.h>
84589 +#include <asm/scatterlist.h>
84590 +#include <asm/swiotlb.h>
84591 +
84592 +static inline int
84593 +address_needs_mapping(struct device *hwdev, dma_addr_t addr)
84594 +{
84595 + dma_addr_t mask = 0xffffffff;
84596 + /* If the device has a mask, use it, otherwise default to 32 bits */
84597 + if (hwdev && hwdev->dma_mask)
84598 + mask = *hwdev->dma_mask;
84599 + return (addr & ~mask) != 0;
84600 +}
84601 +
84602 +static inline int
84603 +range_straddles_page_boundary(void *p, size_t size)
84604 +{
84605 + extern unsigned long *contiguous_bitmap;
84606 + return (((((unsigned long)p & ~PAGE_MASK) + size) > PAGE_SIZE) &&
84607 + !test_bit(__pa(p) >> PAGE_SHIFT, contiguous_bitmap));
84608 +}
84609 +
84610 +#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f)
84611 +#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h)
84612 +
84613 +void *dma_alloc_coherent(struct device *dev, size_t size,
84614 + dma_addr_t *dma_handle, gfp_t flag);
84615 +
84616 +void dma_free_coherent(struct device *dev, size_t size,
84617 + void *vaddr, dma_addr_t dma_handle);
84618 +
84619 +extern dma_addr_t
84620 +dma_map_single(struct device *dev, void *ptr, size_t size,
84621 + enum dma_data_direction direction);
84622 +
84623 +extern void
84624 +dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size,
84625 + enum dma_data_direction direction);
84626 +
84627 +extern int dma_map_sg(struct device *hwdev, struct scatterlist *sg,
84628 + int nents, enum dma_data_direction direction);
84629 +extern void dma_unmap_sg(struct device *hwdev, struct scatterlist *sg,
84630 + int nents, enum dma_data_direction direction);
84631 +
84632 +extern dma_addr_t
84633 +dma_map_page(struct device *dev, struct page *page, unsigned long offset,
84634 + size_t size, enum dma_data_direction direction);
84635 +
84636 +extern void
84637 +dma_unmap_page(struct device *dev, dma_addr_t dma_address, size_t size,
84638 + enum dma_data_direction direction);
84639 +
84640 +extern void
84641 +dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size,
84642 + enum dma_data_direction direction);
84643 +
84644 +extern void
84645 +dma_sync_single_for_device(struct device *dev, dma_addr_t dma_handle, size_t size,
84646 + enum dma_data_direction direction);
84647 +
84648 +static inline void
84649 +dma_sync_single_range_for_cpu(struct device *dev, dma_addr_t dma_handle,
84650 + unsigned long offset, size_t size,
84651 + enum dma_data_direction direction)
84652 +{
84653 + dma_sync_single_for_cpu(dev, dma_handle+offset, size, direction);
84654 +}
84655 +
84656 +static inline void
84657 +dma_sync_single_range_for_device(struct device *dev, dma_addr_t dma_handle,
84658 + unsigned long offset, size_t size,
84659 + enum dma_data_direction direction)
84660 +{
84661 + dma_sync_single_for_device(dev, dma_handle+offset, size, direction);
84662 +}
84663 +
84664 +static inline void
84665 +dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems,
84666 + enum dma_data_direction direction)
84667 +{
84668 + if (swiotlb)
84669 + swiotlb_sync_sg_for_cpu(dev,sg,nelems,direction);
84670 + flush_write_buffers();
84671 +}
84672 +
84673 +static inline void
84674 +dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems,
84675 + enum dma_data_direction direction)
84676 +{
84677 + if (swiotlb)
84678 + swiotlb_sync_sg_for_device(dev,sg,nelems,direction);
84679 + flush_write_buffers();
84680 +}
84681 +
84682 +extern int
84683 +dma_mapping_error(dma_addr_t dma_addr);
84684 +
84685 +extern int
84686 +dma_supported(struct device *dev, u64 mask);
84687 +
84688 +static inline int
84689 +dma_set_mask(struct device *dev, u64 mask)
84690 +{
84691 + if(!dev->dma_mask || !dma_supported(dev, mask))
84692 + return -EIO;
84693 +
84694 + *dev->dma_mask = mask;
84695 +
84696 + return 0;
84697 +}
84698 +
84699 +static inline int
84700 +dma_get_cache_alignment(void)
84701 +{
84702 + /* no easy way to get cache size on all x86, so return the
84703 + * maximum possible, to be safe */
84704 + return (1 << INTERNODE_CACHE_SHIFT);
84705 +}
84706 +
84707 +#define dma_is_consistent(d) (1)
84708 +
84709 +static inline void
84710 +dma_cache_sync(void *vaddr, size_t size,
84711 + enum dma_data_direction direction)
84712 +{
84713 + flush_write_buffers();
84714 +}
84715 +
84716 +#define ARCH_HAS_DMA_DECLARE_COHERENT_MEMORY
84717 +extern int
84718 +dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr,
84719 + dma_addr_t device_addr, size_t size, int flags);
84720 +
84721 +extern void
84722 +dma_release_declared_memory(struct device *dev);
84723 +
84724 +extern void *
84725 +dma_mark_declared_memory_occupied(struct device *dev,
84726 + dma_addr_t device_addr, size_t size);
84727 +
84728 +#endif
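address_needs_mapping() and range_straddles_page_boundary() above are the two tests that decide whether a buffer can be handed to a device as-is or has to be bounced (the Xen configuration selects swiotlb for exactly this reason). Below is a small userspace sketch of the same arithmetic, assuming a 4 KiB page size and leaving out the contiguous_bitmap lookup.

/* Userspace sketch of the two checks used above. */
#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

#define PAGE_SIZE 4096UL
#define PAGE_MASK (~(PAGE_SIZE - 1))

static int address_needs_mapping(uint64_t dma_mask, uint64_t addr)
{
	/* an address "needs mapping" if any bit above the mask is set */
	return (addr & ~dma_mask) != 0;
}

static int range_straddles_page_boundary(uintptr_t p, size_t size)
{
	/* true if [p, p+size) does not fit inside one page */
	return ((p & ~PAGE_MASK) + size) > PAGE_SIZE;
}

int main(void)
{
	uint64_t mask32 = 0xffffffffULL;	/* default 32-bit mask */

	printf("%d\n", address_needs_mapping(mask32, 0x1ffffffffULL)); /* 1 */
	printf("%d\n", address_needs_mapping(mask32, 0x7fffffffULL));  /* 0 */
	printf("%d\n", range_straddles_page_boundary(0x1ff8, 16));     /* 1 */
	printf("%d\n", range_straddles_page_boundary(0x1000, 4096));   /* 0 */
	return 0;
}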
84729 diff -Nur linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/fixmap.h linux-2.6.16.33/include/asm-i386/mach-xen/asm/fixmap.h
84730 --- linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/fixmap.h 1970-01-01 00:00:00.000000000 +0000
84731 +++ linux-2.6.16.33/include/asm-i386/mach-xen/asm/fixmap.h 2007-01-08 15:00:45.000000000 +0000
84732 @@ -0,0 +1,155 @@
84733 +/*
84734 + * fixmap.h: compile-time virtual memory allocation
84735 + *
84736 + * This file is subject to the terms and conditions of the GNU General Public
84737 + * License. See the file "COPYING" in the main directory of this archive
84738 + * for more details.
84739 + *
84740 + * Copyright (C) 1998 Ingo Molnar
84741 + *
84742 + * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
84743 + */
84744 +
84745 +#ifndef _ASM_FIXMAP_H
84746 +#define _ASM_FIXMAP_H
84747 +
84748 +#include <linux/config.h>
84749 +
84750 +/* used by vmalloc.c, vsyscall.lds.S.
84751 + *
84752 + * Leave one empty page between vmalloc'ed areas and
84753 + * the start of the fixmap.
84754 + */
84755 +extern unsigned long __FIXADDR_TOP;
84756 +
84757 +#ifndef __ASSEMBLY__
84758 +#include <linux/kernel.h>
84759 +#include <asm/acpi.h>
84760 +#include <asm/apicdef.h>
84761 +#include <asm/page.h>
84762 +#ifdef CONFIG_HIGHMEM
84763 +#include <linux/threads.h>
84764 +#include <asm/kmap_types.h>
84765 +#endif
84766 +
84767 +/*
84768 + * Here we define all the compile-time 'special' virtual
84769 + * addresses. The point is to have a constant address at
84770 + * compile time, but to set the physical address only
84771 + * in the boot process. We allocate these special addresses
84772 + * from the end of virtual memory (0xfffff000) backwards.
84773 + * Also this lets us do fail-safe vmalloc(), we
84774 + * can guarantee that these special addresses and
84775 + * vmalloc()-ed addresses never overlap.
84776 + *
84777 + * these 'compile-time allocated' memory buffers are
84778 + * fixed-size 4k pages. (or larger if used with an increment
84779 + * highger than 1) use fixmap_set(idx,phys) to associate
84780 + * physical memory with fixmap indices.
84781 + *
84782 + * TLB entries of such buffers will not be flushed across
84783 + * task switches.
84784 + */
84785 +enum fixed_addresses {
84786 + FIX_HOLE,
84787 +#ifdef CONFIG_X86_LOCAL_APIC
84788 + FIX_APIC_BASE, /* local (CPU) APIC -- required for SMP or not */
84789 +#endif
84790 +#ifdef CONFIG_X86_IO_APIC
84791 + FIX_IO_APIC_BASE_0,
84792 + FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS-1,
84793 +#endif
84794 +#ifdef CONFIG_X86_VISWS_APIC
84795 + FIX_CO_CPU, /* Cobalt timer */
84796 + FIX_CO_APIC, /* Cobalt APIC Redirection Table */
84797 + FIX_LI_PCIA, /* Lithium PCI Bridge A */
84798 + FIX_LI_PCIB, /* Lithium PCI Bridge B */
84799 +#endif
84800 +#ifdef CONFIG_X86_F00F_BUG
84801 + FIX_F00F_IDT, /* Virtual mapping for IDT */
84802 +#endif
84803 +#ifdef CONFIG_X86_CYCLONE_TIMER
84804 + FIX_CYCLONE_TIMER, /*cyclone timer register*/
84805 +#endif
84806 +#ifdef CONFIG_HIGHMEM
84807 + FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */
84808 + FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1,
84809 +#endif
84810 +#ifdef CONFIG_ACPI
84811 + FIX_ACPI_BEGIN,
84812 + FIX_ACPI_END = FIX_ACPI_BEGIN + FIX_ACPI_PAGES - 1,
84813 +#endif
84814 +#ifdef CONFIG_PCI_MMCONFIG
84815 + FIX_PCIE_MCFG,
84816 +#endif
84817 + FIX_SHARED_INFO,
84818 +#define NR_FIX_ISAMAPS 256
84819 + FIX_ISAMAP_END,
84820 + FIX_ISAMAP_BEGIN = FIX_ISAMAP_END + NR_FIX_ISAMAPS - 1,
84821 + __end_of_permanent_fixed_addresses,
84822 + /* temporary boot-time mappings, used before ioremap() is functional */
84823 +#define NR_FIX_BTMAPS 16
84824 + FIX_BTMAP_END = __end_of_permanent_fixed_addresses,
84825 + FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS - 1,
84826 + FIX_WP_TEST,
84827 + __end_of_fixed_addresses
84828 +};
84829 +
84830 +extern void __set_fixmap(enum fixed_addresses idx,
84831 + maddr_t phys, pgprot_t flags);
84832 +
84833 +extern void set_fixaddr_top(void);
84834 +
84835 +#define set_fixmap(idx, phys) \
84836 + __set_fixmap(idx, phys, PAGE_KERNEL)
84837 +/*
84838 + * Some hardware wants to get fixmapped without caching.
84839 + */
84840 +#define set_fixmap_nocache(idx, phys) \
84841 + __set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE)
84842 +
84843 +#define clear_fixmap(idx) \
84844 + __set_fixmap(idx, 0, __pgprot(0))
84845 +
84846 +#define FIXADDR_TOP ((unsigned long)__FIXADDR_TOP)
84847 +
84848 +#define __FIXADDR_SIZE (__end_of_permanent_fixed_addresses << PAGE_SHIFT)
84849 +#define __FIXADDR_BOOT_SIZE (__end_of_fixed_addresses << PAGE_SHIFT)
84850 +#define FIXADDR_START (FIXADDR_TOP - __FIXADDR_SIZE)
84851 +#define FIXADDR_BOOT_START (FIXADDR_TOP - __FIXADDR_BOOT_SIZE)
84852 +
84853 +#define __fix_to_virt(x) (FIXADDR_TOP - ((x) << PAGE_SHIFT))
84854 +#define __virt_to_fix(x) ((FIXADDR_TOP - ((x)&PAGE_MASK)) >> PAGE_SHIFT)
84855 +
84856 +extern void __this_fixmap_does_not_exist(void);
84857 +
84858 +/*
84859 + * 'index to address' translation. If anyone tries to use the idx
84860 + * directly without translation, we catch the bug with a NULL-dereference
84861 + * kernel oops. Illegal ranges of incoming indices are caught too.
84862 + */
84863 +static __always_inline unsigned long fix_to_virt(const unsigned int idx)
84864 +{
84865 + /*
84866 + * this branch gets completely eliminated after inlining,
84867 + * except when someone tries to use fixaddr indices in an
84868 + * illegal way. (such as mixing up address types or using
84869 + * out-of-range indices).
84870 + *
84871 + * If it doesn't get removed, the linker will complain
84872 + * loudly with a reasonably clear error message..
84873 + */
84874 + if (idx >= __end_of_fixed_addresses)
84875 + __this_fixmap_does_not_exist();
84876 +
84877 + return __fix_to_virt(idx);
84878 +}
84879 +
84880 +static inline unsigned long virt_to_fix(const unsigned long vaddr)
84881 +{
84882 + BUG_ON(vaddr >= FIXADDR_TOP || vaddr < FIXADDR_START);
84883 + return __virt_to_fix(vaddr);
84884 +}
84885 +
84886 +#endif /* !__ASSEMBLY__ */
84887 +#endif
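fix_to_virt() above hands out compile-time-constant virtual addresses, one page per index, counting down from FIXADDR_TOP (which, in this Xen variant, is a variable rather than a constant). A quick sketch of the index/address arithmetic follows; the FIXADDR_TOP value is assumed purely for illustration.

/* Sketch of the fixmap index <-> address arithmetic. */
#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)
#define PAGE_MASK	(~(PAGE_SIZE - 1))
#define FIXADDR_TOP	0xfffe0000UL	/* assumed, not the real runtime value */

#define __fix_to_virt(x)	(FIXADDR_TOP - ((x) << PAGE_SHIFT))
#define __virt_to_fix(x)	((FIXADDR_TOP - ((x) & PAGE_MASK)) >> PAGE_SHIFT)

int main(void)
{
	unsigned idx;

	/* index 0 sits at FIXADDR_TOP, each further index one page lower */
	for (idx = 0; idx < 4; idx++)
		printf("idx %u -> %#lx\n", idx, __fix_to_virt(idx));
	printf("back to idx: %lu\n", __virt_to_fix(__fix_to_virt(3)));
	return 0;
}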
84888 diff -Nur linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/floppy.h linux-2.6.16.33/include/asm-i386/mach-xen/asm/floppy.h
84889 --- linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/floppy.h 1970-01-01 00:00:00.000000000 +0000
84890 +++ linux-2.6.16.33/include/asm-i386/mach-xen/asm/floppy.h 2007-01-08 15:00:45.000000000 +0000
84891 @@ -0,0 +1,147 @@
84892 +/*
84893 + * Architecture specific parts of the Floppy driver
84894 + *
84895 + * This file is subject to the terms and conditions of the GNU General Public
84896 + * License. See the file "COPYING" in the main directory of this archive
84897 + * for more details.
84898 + *
84899 + * Copyright (C) 1995
84900 + *
84901 + * Modifications for Xen are Copyright (c) 2004, Keir Fraser.
84902 + */
84903 +#ifndef __ASM_XEN_I386_FLOPPY_H
84904 +#define __ASM_XEN_I386_FLOPPY_H
84905 +
84906 +#include <linux/vmalloc.h>
84907 +
84908 +/* XEN: Hit DMA paths on the head. This trick from asm-m68k/floppy.h. */
84909 +#include <asm/dma.h>
84910 +#undef MAX_DMA_ADDRESS
84911 +#define MAX_DMA_ADDRESS 0
84912 +#define CROSS_64KB(a,s) (0)
84913 +
84914 +#define fd_inb(port) inb_p(port)
84915 +#define fd_outb(value,port) outb_p(value,port)
84916 +
84917 +#define fd_request_dma() (0)
84918 +#define fd_free_dma() ((void)0)
84919 +#define fd_enable_irq() enable_irq(FLOPPY_IRQ)
84920 +#define fd_disable_irq() disable_irq(FLOPPY_IRQ)
84921 +#define fd_free_irq() free_irq(FLOPPY_IRQ, NULL)
84922 +#define fd_get_dma_residue() (virtual_dma_count + virtual_dma_residue)
84923 +#define fd_dma_setup(addr, size, mode, io) vdma_dma_setup(addr, size, mode, io)
84924 +/*
84925 + * Do not use vmalloc/vfree: floppy_release_irq_and_dma() gets called from
84926 + * softirq context via motor_off_callback. A generic bug we happen to trigger.
84927 + */
84928 +#define fd_dma_mem_alloc(size) __get_free_pages(GFP_KERNEL, get_order(size))
84929 +#define fd_dma_mem_free(addr, size) free_pages(addr, get_order(size))
84930 +
84931 +static int virtual_dma_count;
84932 +static int virtual_dma_residue;
84933 +static char *virtual_dma_addr;
84934 +static int virtual_dma_mode;
84935 +static int doing_pdma;
84936 +
84937 +static irqreturn_t floppy_hardint(int irq, void *dev_id, struct pt_regs * regs)
84938 +{
84939 + register unsigned char st;
84940 + register int lcount;
84941 + register char *lptr;
84942 +
84943 + if (!doing_pdma)
84944 + return floppy_interrupt(irq, dev_id, regs);
84945 +
84946 + st = 1;
84947 + for(lcount=virtual_dma_count, lptr=virtual_dma_addr;
84948 + lcount; lcount--, lptr++) {
84949 + st=inb(virtual_dma_port+4) & 0xa0 ;
84950 + if(st != 0xa0)
84951 + break;
84952 + if(virtual_dma_mode)
84953 + outb_p(*lptr, virtual_dma_port+5);
84954 + else
84955 + *lptr = inb_p(virtual_dma_port+5);
84956 + }
84957 + virtual_dma_count = lcount;
84958 + virtual_dma_addr = lptr;
84959 + st = inb(virtual_dma_port+4);
84960 +
84961 + if(st == 0x20)
84962 + return IRQ_HANDLED;
84963 + if(!(st & 0x20)) {
84964 + virtual_dma_residue += virtual_dma_count;
84965 + virtual_dma_count=0;
84966 + doing_pdma = 0;
84967 + floppy_interrupt(irq, dev_id, regs);
84968 + return IRQ_HANDLED;
84969 + }
84970 + return IRQ_HANDLED;
84971 +}
84972 +
84973 +static void fd_disable_dma(void)
84974 +{
84975 + doing_pdma = 0;
84976 + virtual_dma_residue += virtual_dma_count;
84977 + virtual_dma_count=0;
84978 +}
84979 +
84980 +static int fd_request_irq(void)
84981 +{
84982 + return request_irq(FLOPPY_IRQ, floppy_hardint,SA_INTERRUPT,
84983 + "floppy", NULL);
84984 +}
84985 +
84986 +static int vdma_dma_setup(char *addr, unsigned long size, int mode, int io)
84987 +{
84988 + doing_pdma = 1;
84989 + virtual_dma_port = io;
84990 + virtual_dma_mode = (mode == DMA_MODE_WRITE);
84991 + virtual_dma_addr = addr;
84992 + virtual_dma_count = size;
84993 + virtual_dma_residue = 0;
84994 + return 0;
84995 +}
84996 +
84997 +/* XEN: This trick to force 'virtual DMA' is from include/asm-m68k/floppy.h. */
84998 +#define FDC1 xen_floppy_init()
84999 +static int FDC2 = -1;
85000 +
85001 +static int xen_floppy_init(void)
85002 +{
85003 + use_virtual_dma = 1;
85004 + can_use_virtual_dma = 1;
85005 + return 0x3f0;
85006 +}
85007 +
85008 +/*
85009 + * Floppy types are stored in the rtc's CMOS RAM and so rtc_lock
85010 + * is needed to prevent corrupted CMOS RAM in case "insmod floppy"
85011 + * coincides with another rtc CMOS user. Paul G.
85012 + */
85013 +#define FLOPPY0_TYPE ({ \
85014 + unsigned long flags; \
85015 + unsigned char val; \
85016 + spin_lock_irqsave(&rtc_lock, flags); \
85017 + val = (CMOS_READ(0x10) >> 4) & 15; \
85018 + spin_unlock_irqrestore(&rtc_lock, flags); \
85019 + val; \
85020 +})
85021 +
85022 +#define FLOPPY1_TYPE ({ \
85023 + unsigned long flags; \
85024 + unsigned char val; \
85025 + spin_lock_irqsave(&rtc_lock, flags); \
85026 + val = CMOS_READ(0x10) & 15; \
85027 + spin_unlock_irqrestore(&rtc_lock, flags); \
85028 + val; \
85029 +})
85030 +
85031 +#define N_FDC 2
85032 +#define N_DRIVE 8
85033 +
85034 +#define FLOPPY_MOTOR_MASK 0xf0
85035 +
85036 +#define EXTRA_FLOPPY_PARAMS
85037 +
85038 +#endif /* __ASM_XEN_I386_FLOPPY_H */
85039 diff -Nur linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/highmem.h linux-2.6.16.33/include/asm-i386/mach-xen/asm/highmem.h
85040 --- linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/highmem.h 1970-01-01 00:00:00.000000000 +0000
85041 +++ linux-2.6.16.33/include/asm-i386/mach-xen/asm/highmem.h 2007-01-08 15:00:45.000000000 +0000
85042 @@ -0,0 +1,81 @@
85043 +/*
85044 + * highmem.h: virtual kernel memory mappings for high memory
85045 + *
85046 + * Used in CONFIG_HIGHMEM systems for memory pages which
85047 + * are not addressable by direct kernel virtual addresses.
85048 + *
85049 + * Copyright (C) 1999 Gerhard Wichert, Siemens AG
85050 + * Gerhard.Wichert@pdb.siemens.de
85051 + *
85052 + *
85053 + * Redesigned the x86 32-bit VM architecture to deal with
85054 + * up to 16 Terabyte physical memory. With current x86 CPUs
85055 + * we now support up to 64 Gigabytes physical RAM.
85056 + *
85057 + * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
85058 + */
85059 +
85060 +#ifndef _ASM_HIGHMEM_H
85061 +#define _ASM_HIGHMEM_H
85062 +
85063 +#ifdef __KERNEL__
85064 +
85065 +#include <linux/config.h>
85066 +#include <linux/interrupt.h>
85067 +#include <linux/threads.h>
85068 +#include <asm/kmap_types.h>
85069 +#include <asm/tlbflush.h>
85070 +
85071 +/* declarations for highmem.c */
85072 +extern unsigned long highstart_pfn, highend_pfn;
85073 +
85074 +extern pte_t *kmap_pte;
85075 +extern pgprot_t kmap_prot;
85076 +extern pte_t *pkmap_page_table;
85077 +
85078 +/*
85079 + * Right now we initialize only a single pte table. It can be extended
85080 + * easily; subsequent pte tables have to be allocated in one physical
85081 + * chunk of RAM.
85082 + */
85083 +#ifdef CONFIG_X86_PAE
85084 +#define LAST_PKMAP 512
85085 +#else
85086 +#define LAST_PKMAP 1024
85087 +#endif
85088 +/*
85089 + * Ordering is:
85090 + *
85091 + * FIXADDR_TOP
85092 + * fixed_addresses
85093 + * FIXADDR_START
85094 + * temp fixed addresses
85095 + * FIXADDR_BOOT_START
85096 + * Persistent kmap area
85097 + * PKMAP_BASE
85098 + * VMALLOC_END
85099 + * Vmalloc area
85100 + * VMALLOC_START
85101 + * high_memory
85102 + */
85103 +#define PKMAP_BASE ( (FIXADDR_BOOT_START - PAGE_SIZE*(LAST_PKMAP + 1)) & PMD_MASK )
85104 +#define LAST_PKMAP_MASK (LAST_PKMAP-1)
85105 +#define PKMAP_NR(virt) ((virt-PKMAP_BASE) >> PAGE_SHIFT)
85106 +#define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT))
85107 +
85108 +extern void * FASTCALL(kmap_high(struct page *page));
85109 +extern void FASTCALL(kunmap_high(struct page *page));
85110 +
85111 +void *kmap(struct page *page);
85112 +void kunmap(struct page *page);
85113 +void *kmap_atomic(struct page *page, enum km_type type);
85114 +void *kmap_atomic_pte(struct page *page, enum km_type type);
85115 +void kunmap_atomic(void *kvaddr, enum km_type type);
85116 +void *kmap_atomic_pfn(unsigned long pfn, enum km_type type);
85117 +struct page *kmap_atomic_to_page(void *ptr);
85118 +
85119 +#define flush_cache_kmaps() do { } while (0)
85120 +
85121 +#endif /* __KERNEL__ */
85122 +
85123 +#endif /* _ASM_HIGHMEM_H */
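The persistent-kmap window declared above is LAST_PKMAP pages starting at PKMAP_BASE; PKMAP_NR() and PKMAP_ADDR() convert between a pkmap slot and its virtual address. Below is a sketch of that round trip with assumed values (non-PAE, so 1024 slots; the PKMAP_BASE used here is illustrative only).

/* Round trip of PKMAP_NR/PKMAP_ADDR with assumed, illustrative values. */
#include <stdio.h>

#define PAGE_SHIFT	12
#define LAST_PKMAP	1024			/* non-PAE */
#define PKMAP_BASE	0xff800000UL		/* assumed for the example */

#define PKMAP_NR(virt)	(((virt) - PKMAP_BASE) >> PAGE_SHIFT)
#define PKMAP_ADDR(nr)	(PKMAP_BASE + ((unsigned long)(nr) << PAGE_SHIFT))

int main(void)
{
	unsigned long addr = PKMAP_ADDR(5);

	printf("slot 5 -> %#lx -> slot %lu\n", addr, PKMAP_NR(addr));
	printf("window: %#lx .. %#lx\n",
	       PKMAP_ADDR(0), PKMAP_ADDR(LAST_PKMAP) - 1);
	return 0;
}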
85124 diff -Nur linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/hw_irq.h linux-2.6.16.33/include/asm-i386/mach-xen/asm/hw_irq.h
85125 --- linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/hw_irq.h 1970-01-01 00:00:00.000000000 +0000
85126 +++ linux-2.6.16.33/include/asm-i386/mach-xen/asm/hw_irq.h 2007-01-08 15:00:45.000000000 +0000
85127 @@ -0,0 +1,77 @@
85128 +#ifndef _ASM_HW_IRQ_H
85129 +#define _ASM_HW_IRQ_H
85130 +
85131 +/*
85132 + * linux/include/asm/hw_irq.h
85133 + *
85134 + * (C) 1992, 1993 Linus Torvalds, (C) 1997 Ingo Molnar
85135 + *
85136 + * moved some of the old arch/i386/kernel/irq.h to here. VY
85137 + *
85138 + * IRQ/IPI changes taken from work by Thomas Radke
85139 + * <tomsoft@informatik.tu-chemnitz.de>
85140 + */
85141 +
85142 +#include <linux/config.h>
85143 +#include <linux/profile.h>
85144 +#include <asm/atomic.h>
85145 +#include <asm/irq.h>
85146 +#include <asm/sections.h>
85147 +
85148 +struct hw_interrupt_type;
85149 +
85150 +/*
85151 + * Various low-level irq details needed by irq.c, process.c,
85152 + * time.c, io_apic.c and smp.c
85153 + *
85154 + * Interrupt entry/exit code at both C and assembly level
85155 + */
85156 +
85157 +extern u8 irq_vector[NR_IRQ_VECTORS];
85158 +#define IO_APIC_VECTOR(irq) (irq_vector[irq])
85159 +#define AUTO_ASSIGN -1
85160 +
85161 +extern void (*interrupt[NR_IRQS])(void);
85162 +
85163 +#ifdef CONFIG_SMP
85164 +fastcall void reschedule_interrupt(void);
85165 +fastcall void invalidate_interrupt(void);
85166 +fastcall void call_function_interrupt(void);
85167 +#endif
85168 +
85169 +#ifdef CONFIG_X86_LOCAL_APIC
85170 +fastcall void apic_timer_interrupt(void);
85171 +fastcall void error_interrupt(void);
85172 +fastcall void spurious_interrupt(void);
85173 +fastcall void thermal_interrupt(struct pt_regs *);
85174 +#define platform_legacy_irq(irq) ((irq) < 16)
85175 +#endif
85176 +
85177 +void disable_8259A_irq(unsigned int irq);
85178 +void enable_8259A_irq(unsigned int irq);
85179 +int i8259A_irq_pending(unsigned int irq);
85180 +void make_8259A_irq(unsigned int irq);
85181 +void init_8259A(int aeoi);
85182 +void FASTCALL(send_IPI_self(int vector));
85183 +void init_VISWS_APIC_irqs(void);
85184 +void setup_IO_APIC(void);
85185 +void disable_IO_APIC(void);
85186 +void print_IO_APIC(void);
85187 +int IO_APIC_get_PCI_irq_vector(int bus, int slot, int fn);
85188 +void send_IPI(int dest, int vector);
85189 +void setup_ioapic_dest(void);
85190 +
85191 +extern unsigned long io_apic_irqs;
85192 +
85193 +extern atomic_t irq_err_count;
85194 +extern atomic_t irq_mis_count;
85195 +
85196 +#define IO_APIC_IRQ(x) (((x) >= 16) || ((1<<(x)) & io_apic_irqs))
85197 +
85198 +extern void resend_irq_on_evtchn(struct hw_interrupt_type *h, unsigned int i);
85199 +static inline void hw_resend_irq(struct hw_interrupt_type *h, unsigned int i)
85200 +{
85201 + resend_irq_on_evtchn(h, i);
85202 +}
85203 +
85204 +#endif /* _ASM_HW_IRQ_H */
85205 diff -Nur linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/hypercall.h linux-2.6.16.33/include/asm-i386/mach-xen/asm/hypercall.h
85206 --- linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/hypercall.h 1970-01-01 00:00:00.000000000 +0000
85207 +++ linux-2.6.16.33/include/asm-i386/mach-xen/asm/hypercall.h 2007-01-08 15:00:45.000000000 +0000
85208 @@ -0,0 +1,407 @@
85209 +/******************************************************************************
85210 + * hypercall.h
85211 + *
85212 + * Linux-specific hypervisor handling.
85213 + *
85214 + * Copyright (c) 2002-2004, K A Fraser
85215 + *
85216 + * This program is free software; you can redistribute it and/or
85217 + * modify it under the terms of the GNU General Public License version 2
85218 + * as published by the Free Software Foundation; or, when distributed
85219 + * separately from the Linux kernel or incorporated into other
85220 + * software packages, subject to the following license:
85221 + *
85222 + * Permission is hereby granted, free of charge, to any person obtaining a copy
85223 + * of this source file (the "Software"), to deal in the Software without
85224 + * restriction, including without limitation the rights to use, copy, modify,
85225 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
85226 + * and to permit persons to whom the Software is furnished to do so, subject to
85227 + * the following conditions:
85228 + *
85229 + * The above copyright notice and this permission notice shall be included in
85230 + * all copies or substantial portions of the Software.
85231 + *
85232 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
85233 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
85234 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
85235 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
85236 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
85237 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
85238 + * IN THE SOFTWARE.
85239 + */
85240 +
85241 +#ifndef __HYPERCALL_H__
85242 +#define __HYPERCALL_H__
85243 +
85244 +#include <linux/string.h> /* memcpy() */
85245 +
85246 +#ifndef __HYPERVISOR_H__
85247 +# error "please don't include this file directly"
85248 +#endif
85249 +
85250 +#define __STR(x) #x
85251 +#define STR(x) __STR(x)
85252 +
85253 +#ifdef CONFIG_XEN
85254 +#define HYPERCALL_STR(name) \
85255 + "call hypercall_page + ("STR(__HYPERVISOR_##name)" * 32)"
85256 +#else
85257 +#define HYPERCALL_STR(name) \
85258 + "mov hypercall_stubs,%%eax; " \
85259 + "add $("STR(__HYPERVISOR_##name)" * 32),%%eax; " \
85260 + "call *%%eax"
85261 +#endif
85262 +
85263 +#define _hypercall0(type, name) \
85264 +({ \
85265 + long __res; \
85266 + asm volatile ( \
85267 + HYPERCALL_STR(name) \
85268 + : "=a" (__res) \
85269 + : \
85270 + : "memory" ); \
85271 + (type)__res; \
85272 +})
85273 +
85274 +#define _hypercall1(type, name, a1) \
85275 +({ \
85276 + long __res, __ign1; \
85277 + asm volatile ( \
85278 + HYPERCALL_STR(name) \
85279 + : "=a" (__res), "=b" (__ign1) \
85280 + : "1" ((long)(a1)) \
85281 + : "memory" ); \
85282 + (type)__res; \
85283 +})
85284 +
85285 +#define _hypercall2(type, name, a1, a2) \
85286 +({ \
85287 + long __res, __ign1, __ign2; \
85288 + asm volatile ( \
85289 + HYPERCALL_STR(name) \
85290 + : "=a" (__res), "=b" (__ign1), "=c" (__ign2) \
85291 + : "1" ((long)(a1)), "2" ((long)(a2)) \
85292 + : "memory" ); \
85293 + (type)__res; \
85294 +})
85295 +
85296 +#define _hypercall3(type, name, a1, a2, a3) \
85297 +({ \
85298 + long __res, __ign1, __ign2, __ign3; \
85299 + asm volatile ( \
85300 + HYPERCALL_STR(name) \
85301 + : "=a" (__res), "=b" (__ign1), "=c" (__ign2), \
85302 + "=d" (__ign3) \
85303 + : "1" ((long)(a1)), "2" ((long)(a2)), \
85304 + "3" ((long)(a3)) \
85305 + : "memory" ); \
85306 + (type)__res; \
85307 +})
85308 +
85309 +#define _hypercall4(type, name, a1, a2, a3, a4) \
85310 +({ \
85311 + long __res, __ign1, __ign2, __ign3, __ign4; \
85312 + asm volatile ( \
85313 + HYPERCALL_STR(name) \
85314 + : "=a" (__res), "=b" (__ign1), "=c" (__ign2), \
85315 + "=d" (__ign3), "=S" (__ign4) \
85316 + : "1" ((long)(a1)), "2" ((long)(a2)), \
85317 + "3" ((long)(a3)), "4" ((long)(a4)) \
85318 + : "memory" ); \
85319 + (type)__res; \
85320 +})
85321 +
85322 +#define _hypercall5(type, name, a1, a2, a3, a4, a5) \
85323 +({ \
85324 + long __res, __ign1, __ign2, __ign3, __ign4, __ign5; \
85325 + asm volatile ( \
85326 + HYPERCALL_STR(name) \
85327 + : "=a" (__res), "=b" (__ign1), "=c" (__ign2), \
85328 + "=d" (__ign3), "=S" (__ign4), "=D" (__ign5) \
85329 + : "1" ((long)(a1)), "2" ((long)(a2)), \
85330 + "3" ((long)(a3)), "4" ((long)(a4)), \
85331 + "5" ((long)(a5)) \
85332 + : "memory" ); \
85333 + (type)__res; \
85334 +})
85335 +
85336 +static inline int
85337 +HYPERVISOR_set_trap_table(
85338 + trap_info_t *table)
85339 +{
85340 + return _hypercall1(int, set_trap_table, table);
85341 +}
85342 +
85343 +static inline int
85344 +HYPERVISOR_mmu_update(
85345 + mmu_update_t *req, int count, int *success_count, domid_t domid)
85346 +{
85347 + return _hypercall4(int, mmu_update, req, count, success_count, domid);
85348 +}
85349 +
85350 +static inline int
85351 +HYPERVISOR_mmuext_op(
85352 + struct mmuext_op *op, int count, int *success_count, domid_t domid)
85353 +{
85354 + return _hypercall4(int, mmuext_op, op, count, success_count, domid);
85355 +}
85356 +
85357 +static inline int
85358 +HYPERVISOR_set_gdt(
85359 + unsigned long *frame_list, int entries)
85360 +{
85361 + return _hypercall2(int, set_gdt, frame_list, entries);
85362 +}
85363 +
85364 +static inline int
85365 +HYPERVISOR_stack_switch(
85366 + unsigned long ss, unsigned long esp)
85367 +{
85368 + return _hypercall2(int, stack_switch, ss, esp);
85369 +}
85370 +
85371 +static inline int
85372 +HYPERVISOR_set_callbacks(
85373 + unsigned long event_selector, unsigned long event_address,
85374 + unsigned long failsafe_selector, unsigned long failsafe_address)
85375 +{
85376 + return _hypercall4(int, set_callbacks,
85377 + event_selector, event_address,
85378 + failsafe_selector, failsafe_address);
85379 +}
85380 +
85381 +static inline int
85382 +HYPERVISOR_fpu_taskswitch(
85383 + int set)
85384 +{
85385 + return _hypercall1(int, fpu_taskswitch, set);
85386 +}
85387 +
85388 +static inline int
85389 +HYPERVISOR_sched_op_compat(
85390 + int cmd, unsigned long arg)
85391 +{
85392 + return _hypercall2(int, sched_op_compat, cmd, arg);
85393 +}
85394 +
85395 +static inline int
85396 +HYPERVISOR_sched_op(
85397 + int cmd, void *arg)
85398 +{
85399 + return _hypercall2(int, sched_op, cmd, arg);
85400 +}
85401 +
85402 +static inline long
85403 +HYPERVISOR_set_timer_op(
85404 + u64 timeout)
85405 +{
85406 + unsigned long timeout_hi = (unsigned long)(timeout>>32);
85407 + unsigned long timeout_lo = (unsigned long)timeout;
85408 + return _hypercall2(long, set_timer_op, timeout_lo, timeout_hi);
85409 +}
85410 +
85411 +static inline int
85412 +HYPERVISOR_dom0_op(
85413 + dom0_op_t *dom0_op)
85414 +{
85415 + dom0_op->interface_version = DOM0_INTERFACE_VERSION;
85416 + return _hypercall1(int, dom0_op, dom0_op);
85417 +}
85418 +
85419 +static inline int
85420 +HYPERVISOR_set_debugreg(
85421 + int reg, unsigned long value)
85422 +{
85423 + return _hypercall2(int, set_debugreg, reg, value);
85424 +}
85425 +
85426 +static inline unsigned long
85427 +HYPERVISOR_get_debugreg(
85428 + int reg)
85429 +{
85430 + return _hypercall1(unsigned long, get_debugreg, reg);
85431 +}
85432 +
85433 +static inline int
85434 +HYPERVISOR_update_descriptor(
85435 + u64 ma, u64 desc)
85436 +{
85437 + return _hypercall4(int, update_descriptor, ma, ma>>32, desc, desc>>32);
85438 +}
85439 +
85440 +static inline int
85441 +HYPERVISOR_memory_op(
85442 + unsigned int cmd, void *arg)
85443 +{
85444 + return _hypercall2(int, memory_op, cmd, arg);
85445 +}
85446 +
85447 +static inline int
85448 +HYPERVISOR_multicall(
85449 + void *call_list, int nr_calls)
85450 +{
85451 + return _hypercall2(int, multicall, call_list, nr_calls);
85452 +}
85453 +
85454 +static inline int
85455 +HYPERVISOR_update_va_mapping(
85456 + unsigned long va, pte_t new_val, unsigned long flags)
85457 +{
85458 + unsigned long pte_hi = 0;
85459 +#ifdef CONFIG_X86_PAE
85460 + pte_hi = new_val.pte_high;
85461 +#endif
85462 + return _hypercall4(int, update_va_mapping, va,
85463 + new_val.pte_low, pte_hi, flags);
85464 +}
85465 +
85466 +static inline int
85467 +HYPERVISOR_event_channel_op(
85468 + int cmd, void *arg)
85469 +{
85470 + int rc = _hypercall2(int, event_channel_op, cmd, arg);
85471 +
85472 +#ifdef CONFIG_XEN_COMPAT_030002
85473 + if (unlikely(rc == -ENOSYS)) {
85474 + struct evtchn_op op;
85475 + op.cmd = cmd;
85476 + memcpy(&op.u, arg, sizeof(op.u));
85477 + rc = _hypercall1(int, event_channel_op_compat, &op);
85478 + memcpy(arg, &op.u, sizeof(op.u));
85479 + }
85480 +#endif
85481 +
85482 + return rc;
85483 +}
85484 +
85485 +static inline int
85486 +HYPERVISOR_acm_op(
85487 + int cmd, void *arg)
85488 +{
85489 + return _hypercall2(int, acm_op, cmd, arg);
85490 +}
85491 +
85492 +static inline int
85493 +HYPERVISOR_xen_version(
85494 + int cmd, void *arg)
85495 +{
85496 + return _hypercall2(int, xen_version, cmd, arg);
85497 +}
85498 +
85499 +static inline int
85500 +HYPERVISOR_console_io(
85501 + int cmd, int count, char *str)
85502 +{
85503 + return _hypercall3(int, console_io, cmd, count, str);
85504 +}
85505 +
85506 +static inline int
85507 +HYPERVISOR_physdev_op(
85508 + int cmd, void *arg)
85509 +{
85510 + int rc = _hypercall2(int, physdev_op, cmd, arg);
85511 +
85512 +#ifdef CONFIG_XEN_COMPAT_030002
85513 + if (unlikely(rc == -ENOSYS)) {
85514 + struct physdev_op op;
85515 + op.cmd = cmd;
85516 + memcpy(&op.u, arg, sizeof(op.u));
85517 + rc = _hypercall1(int, physdev_op_compat, &op);
85518 + memcpy(arg, &op.u, sizeof(op.u));
85519 + }
85520 +#endif
85521 +
85522 + return rc;
85523 +}
85524 +
85525 +static inline int
85526 +HYPERVISOR_grant_table_op(
85527 + unsigned int cmd, void *uop, unsigned int count)
85528 +{
85529 + return _hypercall3(int, grant_table_op, cmd, uop, count);
85530 +}
85531 +
85532 +static inline int
85533 +HYPERVISOR_update_va_mapping_otherdomain(
85534 + unsigned long va, pte_t new_val, unsigned long flags, domid_t domid)
85535 +{
85536 + unsigned long pte_hi = 0;
85537 +#ifdef CONFIG_X86_PAE
85538 + pte_hi = new_val.pte_high;
85539 +#endif
85540 + return _hypercall5(int, update_va_mapping_otherdomain, va,
85541 + new_val.pte_low, pte_hi, flags, domid);
85542 +}
85543 +
85544 +static inline int
85545 +HYPERVISOR_vm_assist(
85546 + unsigned int cmd, unsigned int type)
85547 +{
85548 + return _hypercall2(int, vm_assist, cmd, type);
85549 +}
85550 +
85551 +static inline int
85552 +HYPERVISOR_vcpu_op(
85553 + int cmd, int vcpuid, void *extra_args)
85554 +{
85555 + return _hypercall3(int, vcpu_op, cmd, vcpuid, extra_args);
85556 +}
85557 +
85558 +static inline int
85559 +HYPERVISOR_suspend(
85560 + unsigned long srec)
85561 +{
85562 + struct sched_shutdown sched_shutdown = {
85563 + .reason = SHUTDOWN_suspend
85564 + };
85565 +
85566 + int rc = _hypercall3(int, sched_op, SCHEDOP_shutdown,
85567 + &sched_shutdown, srec);
85568 +
85569 +#ifdef CONFIG_XEN_COMPAT_030002
85570 + if (rc == -ENOSYS)
85571 + rc = _hypercall3(int, sched_op_compat, SCHEDOP_shutdown,
85572 + SHUTDOWN_suspend, srec);
85573 +#endif
85574 +
85575 + return rc;
85576 +}
85577 +
85578 +static inline int
85579 +HYPERVISOR_nmi_op(
85580 + unsigned long op, void *arg)
85581 +{
85582 + return _hypercall2(int, nmi_op, op, arg);
85583 +}
85584 +
85585 +static inline unsigned long
85586 +HYPERVISOR_hvm_op(
85587 + int op, void *arg)
85588 +{
85589 + return _hypercall2(unsigned long, hvm_op, op, arg);
85590 +}
85591 +
85592 +static inline int
85593 +HYPERVISOR_callback_op(
85594 + int cmd, void *arg)
85595 +{
85596 + return _hypercall2(int, callback_op, cmd, arg);
85597 +}
85598 +
85599 +static inline int
85600 +HYPERVISOR_xenoprof_op(
85601 + int op, void *arg)
85602 +{
85603 + return _hypercall2(int, xenoprof_op, op, arg);
85604 +}
85605 +
85606 +static inline int
85607 +HYPERVISOR_kexec_op(
85608 + unsigned long op, void *args)
85609 +{
85610 + return _hypercall2(int, kexec_op, op, args);
85611 +}
85612 +
85613 +
85614 +
85615 +#endif /* __HYPERCALL_H__ */
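The _hypercallN macros above implement the i386 hypercall convention: arguments go in EBX, ECX, EDX, ESI and EDI, the result comes back in EAX, and 64-bit quantities are split into 32-bit halves before the call, as HYPERVISOR_set_timer_op() and HYPERVISOR_update_descriptor() show. Below is a userspace sketch of that split, with a stub standing in for the real two-argument hypercall (no hypercall is, or can be, made here).

/* Sketch of how HYPERVISOR_set_timer_op splits its 64-bit timeout into
 * the two 32-bit register arguments of a 2-argument hypercall. */
#include <stdio.h>
#include <stdint.h>

static long fake_hypercall2(uint32_t lo, uint32_t hi)
{
	/* stands in for _hypercall2(long, set_timer_op, lo, hi) */
	printf("ebx=%#x ecx=%#x\n", (unsigned)lo, (unsigned)hi);
	return 0;
}

static long set_timer_op(uint64_t timeout)
{
	/* on i386 "unsigned long" is 32 bits; uint32_t makes that explicit */
	uint32_t timeout_hi = (uint32_t)(timeout >> 32);
	uint32_t timeout_lo = (uint32_t)timeout;

	return fake_hypercall2(timeout_lo, timeout_hi);
}

int main(void)
{
	return (int)set_timer_op(0x123456789abcdef0ULL);
}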
85616 diff -Nur linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/hypervisor.h linux-2.6.16.33/include/asm-i386/mach-xen/asm/hypervisor.h
85617 --- linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/hypervisor.h 1970-01-01 00:00:00.000000000 +0000
85618 +++ linux-2.6.16.33/include/asm-i386/mach-xen/asm/hypervisor.h 2007-01-08 15:00:45.000000000 +0000
85619 @@ -0,0 +1,246 @@
85620 +/******************************************************************************
85621 + * hypervisor.h
85622 + *
85623 + * Linux-specific hypervisor handling.
85624 + *
85625 + * Copyright (c) 2002-2004, K A Fraser
85626 + *
85627 + * This program is free software; you can redistribute it and/or
85628 + * modify it under the terms of the GNU General Public License version 2
85629 + * as published by the Free Software Foundation; or, when distributed
85630 + * separately from the Linux kernel or incorporated into other
85631 + * software packages, subject to the following license:
85632 + *
85633 + * Permission is hereby granted, free of charge, to any person obtaining a copy
85634 + * of this source file (the "Software"), to deal in the Software without
85635 + * restriction, including without limitation the rights to use, copy, modify,
85636 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
85637 + * and to permit persons to whom the Software is furnished to do so, subject to
85638 + * the following conditions:
85639 + *
85640 + * The above copyright notice and this permission notice shall be included in
85641 + * all copies or substantial portions of the Software.
85642 + *
85643 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
85644 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
85645 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
85646 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
85647 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
85648 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
85649 + * IN THE SOFTWARE.
85650 + */
85651 +
85652 +#ifndef __HYPERVISOR_H__
85653 +#define __HYPERVISOR_H__
85654 +
85655 +#include <linux/config.h>
85656 +#include <linux/types.h>
85657 +#include <linux/kernel.h>
85658 +#include <linux/version.h>
85659 +#include <linux/errno.h>
85660 +#include <xen/interface/xen.h>
85661 +#include <xen/interface/dom0_ops.h>
85662 +#include <xen/interface/event_channel.h>
85663 +#include <xen/interface/physdev.h>
85664 +#include <xen/interface/sched.h>
85665 +#include <xen/interface/nmi.h>
85666 +#include <asm/ptrace.h>
85667 +#include <asm/page.h>
85668 +#if defined(__i386__)
85669 +# ifdef CONFIG_X86_PAE
85670 +# include <asm-generic/pgtable-nopud.h>
85671 +# else
85672 +# include <asm-generic/pgtable-nopmd.h>
85673 +# endif
85674 +#endif
85675 +
85676 +extern shared_info_t *HYPERVISOR_shared_info;
85677 +
85678 +#ifdef CONFIG_X86_32
85679 +extern unsigned long hypervisor_virt_start;
85680 +#endif
85681 +
85682 +/* arch/xen/i386/kernel/setup.c */
85683 +extern start_info_t *xen_start_info;
85684 +#ifdef CONFIG_XEN_PRIVILEGED_GUEST
85685 +#define is_initial_xendomain() (xen_start_info->flags & SIF_INITDOMAIN)
85686 +#else
85687 +#define is_initial_xendomain() 0
85688 +#endif
85689 +
85690 +/* arch/xen/kernel/evtchn.c */
85691 +/* Force a proper event-channel callback from Xen. */
85692 +void force_evtchn_callback(void);
85693 +
85694 +/* arch/xen/kernel/process.c */
85695 +void xen_cpu_idle (void);
85696 +
85697 +/* arch/xen/i386/kernel/hypervisor.c */
85698 +void do_hypervisor_callback(struct pt_regs *regs);
85699 +
85700 +/* arch/xen/i386/mm/hypervisor.c */
85701 +/*
85702 + * NB. ptr values should be PHYSICAL, not MACHINE. 'vals' should already
85703 + * be MACHINE addresses.
85704 + */
85705 +
85706 +void xen_pt_switch(unsigned long ptr);
85707 +void xen_new_user_pt(unsigned long ptr); /* x86_64 only */
85708 +void xen_load_gs(unsigned int selector); /* x86_64 only */
85709 +void xen_tlb_flush(void);
85710 +void xen_invlpg(unsigned long ptr);
85711 +
85712 +void xen_l1_entry_update(pte_t *ptr, pte_t val);
85713 +void xen_l2_entry_update(pmd_t *ptr, pmd_t val);
85714 +void xen_l3_entry_update(pud_t *ptr, pud_t val); /* x86_64/PAE */
85715 +void xen_l4_entry_update(pgd_t *ptr, pgd_t val); /* x86_64 only */
85716 +void xen_pgd_pin(unsigned long ptr);
85717 +void xen_pgd_unpin(unsigned long ptr);
85718 +
85719 +void xen_set_ldt(unsigned long ptr, unsigned long bytes);
85720 +
85721 +#ifdef CONFIG_SMP
85722 +#include <linux/cpumask.h>
85723 +void xen_tlb_flush_all(void);
85724 +void xen_invlpg_all(unsigned long ptr);
85725 +void xen_tlb_flush_mask(cpumask_t *mask);
85726 +void xen_invlpg_mask(cpumask_t *mask, unsigned long ptr);
85727 +#endif
85728 +
85729 +/* Returns zero on success else negative errno. */
85730 +int xen_create_contiguous_region(
85731 + unsigned long vstart, unsigned int order, unsigned int address_bits);
85732 +void xen_destroy_contiguous_region(
85733 + unsigned long vstart, unsigned int order);
85734 +
85735 +/* Turn jiffies into Xen system time. */
85736 +u64 jiffies_to_st(unsigned long jiffies);
85737 +
85738 +#include <asm/hypercall.h>
85739 +
85740 +#if defined(CONFIG_X86_64)
85741 +#define MULTI_UVMFLAGS_INDEX 2
85742 +#define MULTI_UVMDOMID_INDEX 3
85743 +#else
85744 +#define MULTI_UVMFLAGS_INDEX 3
85745 +#define MULTI_UVMDOMID_INDEX 4
85746 +#endif
85747 +
85748 +#define is_running_on_xen() 1
85749 +
85750 +static inline int
85751 +HYPERVISOR_yield(
85752 + void)
85753 +{
85754 + int rc = HYPERVISOR_sched_op(SCHEDOP_yield, NULL);
85755 +
85756 +#ifdef CONFIG_XEN_COMPAT_030002
85757 + if (rc == -ENOSYS)
85758 + rc = HYPERVISOR_sched_op_compat(SCHEDOP_yield, 0);
85759 +#endif
85760 +
85761 + return rc;
85762 +}
85763 +
85764 +static inline int
85765 +HYPERVISOR_block(
85766 + void)
85767 +{
85768 + int rc = HYPERVISOR_sched_op(SCHEDOP_block, NULL);
85769 +
85770 +#ifdef CONFIG_XEN_COMPAT_030002
85771 + if (rc == -ENOSYS)
85772 + rc = HYPERVISOR_sched_op_compat(SCHEDOP_block, 0);
85773 +#endif
85774 +
85775 + return rc;
85776 +}
85777 +
85778 +static inline int
85779 +HYPERVISOR_shutdown(
85780 + unsigned int reason)
85781 +{
85782 + struct sched_shutdown sched_shutdown = {
85783 + .reason = reason
85784 + };
85785 +
85786 + int rc = HYPERVISOR_sched_op(SCHEDOP_shutdown, &sched_shutdown);
85787 +
85788 +#ifdef CONFIG_XEN_COMPAT_030002
85789 + if (rc == -ENOSYS)
85790 + rc = HYPERVISOR_sched_op_compat(SCHEDOP_shutdown, reason);
85791 +#endif
85792 +
85793 + return rc;
85794 +}
85795 +
85796 +static inline int
85797 +HYPERVISOR_poll(
85798 + evtchn_port_t *ports, unsigned int nr_ports, u64 timeout)
85799 +{
85800 + int rc;
85801 + struct sched_poll sched_poll = {
85802 + .nr_ports = nr_ports,
85803 + .timeout = jiffies_to_st(timeout)
85804 + };
85805 + set_xen_guest_handle(sched_poll.ports, ports);
85806 +
85807 + rc = HYPERVISOR_sched_op(SCHEDOP_poll, &sched_poll);
85808 +#ifdef CONFIG_XEN_COMPAT_030002
85809 + if (rc == -ENOSYS)
85810 + rc = HYPERVISOR_sched_op_compat(SCHEDOP_yield, 0);
85811 +#endif
85812 +
85813 + return rc;
85814 +}
85815 +
85816 +static inline void
85817 +MULTI_update_va_mapping(
85818 + multicall_entry_t *mcl, unsigned long va,
85819 + pte_t new_val, unsigned long flags)
85820 +{
85821 + mcl->op = __HYPERVISOR_update_va_mapping;
85822 + mcl->args[0] = va;
85823 +#if defined(CONFIG_X86_64)
85824 + mcl->args[1] = new_val.pte;
85825 +#elif defined(CONFIG_X86_PAE)
85826 + mcl->args[1] = new_val.pte_low;
85827 + mcl->args[2] = new_val.pte_high;
85828 +#else
85829 + mcl->args[1] = new_val.pte_low;
85830 + mcl->args[2] = 0;
85831 +#endif
85832 + mcl->args[MULTI_UVMFLAGS_INDEX] = flags;
85833 +}
85834 +
85835 +static inline void
85836 +MULTI_grant_table_op(multicall_entry_t *mcl, unsigned int cmd,
85837 + void *uop, unsigned int count)
85838 +{
85839 + mcl->op = __HYPERVISOR_grant_table_op;
85840 + mcl->args[0] = cmd;
85841 + mcl->args[1] = (unsigned long)uop;
85842 + mcl->args[2] = count;
85843 +}
85844 +
85845 +static inline void
85846 +MULTI_update_va_mapping_otherdomain(
85847 + multicall_entry_t *mcl, unsigned long va,
85848 + pte_t new_val, unsigned long flags, domid_t domid)
85849 +{
85850 + mcl->op = __HYPERVISOR_update_va_mapping_otherdomain;
85851 + mcl->args[0] = va;
85852 +#if defined(CONFIG_X86_64)
85853 + mcl->args[1] = new_val.pte;
85854 +#elif defined(CONFIG_X86_PAE)
85855 + mcl->args[1] = new_val.pte_low;
85856 + mcl->args[2] = new_val.pte_high;
85857 +#else
85858 + mcl->args[1] = new_val.pte_low;
85859 + mcl->args[2] = 0;
85860 +#endif
85861 + mcl->args[MULTI_UVMFLAGS_INDEX] = flags;
85862 + mcl->args[MULTI_UVMDOMID_INDEX] = domid;
85863 +}
85864 +
85865 +#endif /* __HYPERVISOR_H__ */
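HYPERVISOR_yield(), HYPERVISOR_block(), HYPERVISOR_shutdown() and HYPERVISOR_poll() above all follow the same pattern: try the current sched_op interface first and, when CONFIG_XEN_COMPAT_030002 is set and the hypervisor answers -ENOSYS, retry through the older sched_op_compat ABI. Here is a standalone sketch of that fallback, with hypothetical stubs in place of the hypercalls.

/* "Try the new interface, fall back to the 3.0.2-era one on -ENOSYS"
 * pattern used by the sched_op wrappers above.  Both stubs below are
 * stand-ins, not real hypercalls. */
#include <stdio.h>
#include <errno.h>

static int new_sched_op(int cmd, void *arg)
{
	(void)cmd; (void)arg;
	return -ENOSYS;		/* pretend we run on an old hypervisor */
}

static int compat_sched_op(int cmd, unsigned long arg)
{
	(void)cmd; (void)arg;
	return 0;
}

static int do_yield(void)
{
	int rc = new_sched_op(/* SCHEDOP_yield */ 0, NULL);

	if (rc == -ENOSYS)	/* old hypervisor: retry the compat ABI */
		rc = compat_sched_op(0, 0);
	return rc;
}

int main(void)
{
	printf("yield rc=%d\n", do_yield());
	return 0;
}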
85866 diff -Nur linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/io.h linux-2.6.16.33/include/asm-i386/mach-xen/asm/io.h
85867 --- linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/io.h 1970-01-01 00:00:00.000000000 +0000
85868 +++ linux-2.6.16.33/include/asm-i386/mach-xen/asm/io.h 2007-01-08 15:00:45.000000000 +0000
85869 @@ -0,0 +1,403 @@
85870 +#ifndef _ASM_IO_H
85871 +#define _ASM_IO_H
85872 +
85873 +#include <linux/config.h>
85874 +#include <linux/string.h>
85875 +#include <linux/compiler.h>
85876 +
85877 +/*
85878 + * This file contains the definitions for the x86 IO instructions
85879 + * inb/inw/inl/outb/outw/outl and the "string versions" of the same
85880 + * (insb/insw/insl/outsb/outsw/outsl). You can also use "pausing"
85881 + * versions of the single-IO instructions (inb_p/inw_p/..).
85882 + *
85883 + * This file is not meant to be obfuscating: it's just complicated
85884 + * to (a) handle it all in a way that makes gcc able to optimize it
85885 + * as well as possible and (b) to avoid writing the same thing
85886 + * over and over again with slight variations and possibly making a
85887 + * mistake somewhere.
85888 + */
85889 +
85890 +/*
85891 + * Thanks to James van Artsdalen for a better timing-fix than
85892 + * the two short jumps: using outb's to a nonexistent port seems
85893 + * to guarantee better timings even on fast machines.
85894 + *
85895 + * On the other hand, I'd like to be sure of a non-existent port:
85896 + * I feel a bit unsafe about using 0x80 (should be safe, though)
85897 + *
85898 + * Linus
85899 + */
85900 +
85901 + /*
85902 + * Bit simplified and optimized by Jan Hubicka
85903 + * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999.
85904 + *
85905 + * isa_memset_io, isa_memcpy_fromio, isa_memcpy_toio added,
85906 + * isa_read[wl] and isa_write[wl] fixed
85907 + * - Arnaldo Carvalho de Melo <acme@conectiva.com.br>
85908 + */
85909 +
85910 +#define IO_SPACE_LIMIT 0xffff
85911 +
85912 +#define XQUAD_PORTIO_BASE 0xfe400000
85913 +#define XQUAD_PORTIO_QUAD 0x40000 /* 256k per quad. */
85914 +
85915 +#ifdef __KERNEL__
85916 +
85917 +#include <asm-generic/iomap.h>
85918 +
85919 +#include <linux/vmalloc.h>
85920 +#include <asm/fixmap.h>
85921 +
85922 +/*
85923 + * Convert a physical pointer to a virtual kernel pointer for /dev/mem
85924 + * access
85925 + */
85926 +#define xlate_dev_mem_ptr(p, sz) ioremap(p, sz)
85927 +#define xlate_dev_mem_ptr_unmap(p) iounmap(p)
85928 +
85929 +/*
85930 + * Convert a virtual cached pointer to an uncached pointer
85931 + */
85932 +#define xlate_dev_kmem_ptr(p) p
85933 +
85934 +/**
85935 + * virt_to_phys - map virtual addresses to physical
85936 + * @address: address to remap
85937 + *
85938 + * The returned physical address is the physical (CPU) mapping for
85939 + * the memory address given. It is only valid to use this function on
85940 + * addresses directly mapped or allocated via kmalloc.
85941 + *
85942 + * This function does not give bus mappings for DMA transfers. In
85943 + * almost all conceivable cases a device driver should not be using
85944 + * this function
85945 + */
85946 +
85947 +static inline unsigned long virt_to_phys(volatile void * address)
85948 +{
85949 + return __pa(address);
85950 +}
85951 +
85952 +/**
85953 + * phys_to_virt - map physical address to virtual
85954 + * @address: address to remap
85955 + *
85956 + * The returned virtual address is a current CPU mapping for
85957 + * the memory address given. It is only valid to use this function on
85958 + * addresses that have a kernel mapping
85959 + *
85960 + * This function does not handle bus mappings for DMA transfers. In
85961 + * almost all conceivable cases a device driver should not be using
85962 + * this function
85963 + */
85964 +
85965 +static inline void * phys_to_virt(unsigned long address)
85966 +{
85967 + return __va(address);
85968 +}
85969 +
85970 +/*
85971 + * Change "struct page" to physical address.
85972 + */
85973 +#define page_to_pseudophys(page) ((dma_addr_t)page_to_pfn(page) << PAGE_SHIFT)
85974 +#define page_to_phys(page) (phys_to_machine(page_to_pseudophys(page)))
85975 +#define page_to_bus(page) (phys_to_machine(page_to_pseudophys(page)))
85976 +
85977 +#define bio_to_pseudophys(bio) (page_to_pseudophys(bio_page((bio))) + \
85978 + (unsigned long) bio_offset((bio)))
85979 +#define bvec_to_pseudophys(bv) (page_to_pseudophys((bv)->bv_page) + \
85980 + (unsigned long) (bv)->bv_offset)
85981 +
85982 +#define BIOVEC_PHYS_MERGEABLE(vec1, vec2) \
85983 + (((bvec_to_phys((vec1)) + (vec1)->bv_len) == bvec_to_phys((vec2))) && \
85984 + ((bvec_to_pseudophys((vec1)) + (vec1)->bv_len) == \
85985 + bvec_to_pseudophys((vec2))))
85986 +
85987 +extern void __iomem * __ioremap(unsigned long offset, unsigned long size, unsigned long flags);
85988 +
85989 +/**
85990 + * ioremap - map bus memory into CPU space
85991 + * @offset: bus address of the memory
85992 + * @size: size of the resource to map
85993 + *
85994 + * ioremap performs a platform specific sequence of operations to
85995 + * make bus memory CPU accessible via the readb/readw/readl/writeb/
85996 + * writew/writel functions and the other mmio helpers. The returned
85997 + * address is not guaranteed to be usable directly as a virtual
85998 + * address.
85999 + */
86000 +
86001 +static inline void __iomem * ioremap(unsigned long offset, unsigned long size)
86002 +{
86003 + return __ioremap(offset, size, 0);
86004 +}
86005 +
86006 +extern void __iomem * ioremap_nocache(unsigned long offset, unsigned long size);
86007 +extern void iounmap(volatile void __iomem *addr);
86008 +
86009 +/*
86010 + * bt_ioremap() and bt_iounmap() are for temporary early boot-time
86011 + * mappings, before the real ioremap() is functional.
86012 + * A boot-time mapping is currently limited to at most 16 pages.
86013 + */
86014 +extern void *bt_ioremap(unsigned long offset, unsigned long size);
86015 +extern void bt_iounmap(void *addr, unsigned long size);
86016 +
86017 +/* Use early IO mappings for DMI because it's initialized early */
86018 +#define dmi_ioremap bt_ioremap
86019 +#define dmi_iounmap bt_iounmap
86020 +#define dmi_alloc alloc_bootmem
86021 +
86022 +/*
86023 + * ISA I/O bus memory addresses are 1:1 with the physical address.
86024 + */
86025 +#define isa_virt_to_bus(_x) isa_virt_to_bus_is_UNSUPPORTED->x
86026 +#define isa_page_to_bus(_x) isa_page_to_bus_is_UNSUPPORTED->x
86027 +#define isa_bus_to_virt(_x) (void *)(__fix_to_virt(FIX_ISAMAP_BEGIN) + (_x))
86028 +
86029 +/*
86030 + * However PCI ones are not necessarily 1:1 and therefore these interfaces
86031 + * are forbidden in portable PCI drivers.
86032 + *
86033 + * Allow them on x86 for legacy drivers, though.
86034 + */
86035 +#define virt_to_bus(_x) phys_to_machine(__pa(_x))
86036 +#define bus_to_virt(_x) __va(machine_to_phys(_x))
86037 +
86038 +/*
86039 + * readX/writeX() are used to access memory mapped devices. On some
86040 + * architectures the memory mapped IO stuff needs to be accessed
86041 + * differently. On the x86 architecture, we just read/write the
86042 + * memory location directly.
86043 + */
86044 +
86045 +static inline unsigned char readb(const volatile void __iomem *addr)
86046 +{
86047 + return *(volatile unsigned char __force *) addr;
86048 +}
86049 +static inline unsigned short readw(const volatile void __iomem *addr)
86050 +{
86051 + return *(volatile unsigned short __force *) addr;
86052 +}
86053 +static inline unsigned int readl(const volatile void __iomem *addr)
86054 +{
86055 + return *(volatile unsigned int __force *) addr;
86056 +}
86057 +#define readb_relaxed(addr) readb(addr)
86058 +#define readw_relaxed(addr) readw(addr)
86059 +#define readl_relaxed(addr) readl(addr)
86060 +#define __raw_readb readb
86061 +#define __raw_readw readw
86062 +#define __raw_readl readl
86063 +
86064 +static inline void writeb(unsigned char b, volatile void __iomem *addr)
86065 +{
86066 + *(volatile unsigned char __force *) addr = b;
86067 +}
86068 +static inline void writew(unsigned short b, volatile void __iomem *addr)
86069 +{
86070 + *(volatile unsigned short __force *) addr = b;
86071 +}
86072 +static inline void writel(unsigned int b, volatile void __iomem *addr)
86073 +{
86074 + *(volatile unsigned int __force *) addr = b;
86075 +}
86076 +#define __raw_writeb writeb
86077 +#define __raw_writew writew
86078 +#define __raw_writel writel
86079 +
86080 +#define mmiowb()
86081 +
86082 +static inline void memset_io(volatile void __iomem *addr, unsigned char val, int count)
86083 +{
86084 + memset((void __force *) addr, val, count);
86085 +}
86086 +static inline void memcpy_fromio(void *dst, const volatile void __iomem *src, int count)
86087 +{
86088 + __memcpy(dst, (void __force *) src, count);
86089 +}
86090 +static inline void memcpy_toio(volatile void __iomem *dst, const void *src, int count)
86091 +{
86092 + __memcpy((void __force *) dst, src, count);
86093 +}
86094 +
86095 +/*
86096 + * ISA space is 'always mapped' on a typical x86 system, no need to
86097 + * explicitly ioremap() it. The fact that the ISA IO space is mapped
86098 + * to PAGE_OFFSET is pure coincidence - it does not mean ISA values
86099 + * are physical addresses. The following constant pointer can be
86100 + * used as the IO-area pointer (it can be iounmapped as well, so the
86101 + * analogy with PCI is quite large):
86102 + */
86103 +#define __ISA_IO_base ((char __iomem *)(fix_to_virt(FIX_ISAMAP_BEGIN)))
86104 +
86105 +#define isa_readb(a) readb(__ISA_IO_base + (a))
86106 +#define isa_readw(a) readw(__ISA_IO_base + (a))
86107 +#define isa_readl(a) readl(__ISA_IO_base + (a))
86108 +#define isa_writeb(b,a) writeb(b,__ISA_IO_base + (a))
86109 +#define isa_writew(w,a) writew(w,__ISA_IO_base + (a))
86110 +#define isa_writel(l,a) writel(l,__ISA_IO_base + (a))
86111 +#define isa_memset_io(a,b,c) memset_io(__ISA_IO_base + (a),(b),(c))
86112 +#define isa_memcpy_fromio(a,b,c) memcpy_fromio((a),__ISA_IO_base + (b),(c))
86113 +#define isa_memcpy_toio(a,b,c) memcpy_toio(__ISA_IO_base + (a),(b),(c))
86114 +
86115 +
86116 +/*
86117 + * Again, i386 does not require memory-mapped IO specific functions.
86118 + */
86119 +
86120 +#define eth_io_copy_and_sum(a,b,c,d) eth_copy_and_sum((a),(void __force *)(b),(c),(d))
86121 +#define isa_eth_io_copy_and_sum(a,b,c,d) eth_copy_and_sum((a),(void __force *)(__ISA_IO_base + (b)),(c),(d))
86122 +
86123 +/**
86124 + * check_signature - find BIOS signatures
86125 + * @io_addr: mmio address to check
86126 + * @signature: signature block
86127 + * @length: length of signature
86128 + *
86129 + * Perform a signature comparison with the mmio address io_addr. This
86130 + * address should have been obtained by ioremap.
86131 + * Returns 1 on a match.
86132 + */
86133 +
86134 +static inline int check_signature(volatile void __iomem * io_addr,
86135 + const unsigned char *signature, int length)
86136 +{
86137 + int retval = 0;
86138 + do {
86139 + if (readb(io_addr) != *signature)
86140 + goto out;
86141 + io_addr++;
86142 + signature++;
86143 + length--;
86144 + } while (length);
86145 + retval = 1;
86146 +out:
86147 + return retval;
86148 +}
86149 +
86150 +/*
86151 + * Cache management
86152 + *
86153 + * This is needed for two cases
86154 + * 1. Out of order aware processors
86155 + * 2. Accidentally out of order processors (PPro errata #51)
86156 + */
86157 +
86158 +#if defined(CONFIG_X86_OOSTORE) || defined(CONFIG_X86_PPRO_FENCE)
86159 +
86160 +static inline void flush_write_buffers(void)
86161 +{
86162 + __asm__ __volatile__ ("lock; addl $0,0(%%esp)": : :"memory");
86163 +}
86164 +
86165 +#define dma_cache_inv(_start,_size) flush_write_buffers()
86166 +#define dma_cache_wback(_start,_size) flush_write_buffers()
86167 +#define dma_cache_wback_inv(_start,_size) flush_write_buffers()
86168 +
86169 +#else
86170 +
86171 +/* Nothing to do */
86172 +
86173 +#define dma_cache_inv(_start,_size) do { } while (0)
86174 +#define dma_cache_wback(_start,_size) do { } while (0)
86175 +#define dma_cache_wback_inv(_start,_size) do { } while (0)
86176 +#define flush_write_buffers()
86177 +
86178 +#endif
86179 +
86180 +#endif /* __KERNEL__ */
86181 +
86182 +#ifdef SLOW_IO_BY_JUMPING
86183 +#define __SLOW_DOWN_IO "jmp 1f; 1: jmp 1f; 1:"
86184 +#else
86185 +#define __SLOW_DOWN_IO "outb %%al,$0x80;"
86186 +#endif
86187 +
86188 +static inline void slow_down_io(void) {
86189 + __asm__ __volatile__(
86190 + __SLOW_DOWN_IO
86191 +#ifdef REALLY_SLOW_IO
86192 + __SLOW_DOWN_IO __SLOW_DOWN_IO __SLOW_DOWN_IO
86193 +#endif
86194 + : : );
86195 +}
86196 +
86197 +#ifdef CONFIG_X86_NUMAQ
86198 +extern void *xquad_portio; /* Where the IO area was mapped */
86199 +#define XQUAD_PORT_ADDR(port, quad) (xquad_portio + (XQUAD_PORTIO_QUAD*quad) + port)
86200 +#define __BUILDIO(bwl,bw,type) \
86201 +static inline void out##bwl##_quad(unsigned type value, int port, int quad) { \
86202 + if (xquad_portio) \
86203 + write##bwl(value, XQUAD_PORT_ADDR(port, quad)); \
86204 + else \
86205 + out##bwl##_local(value, port); \
86206 +} \
86207 +static inline void out##bwl(unsigned type value, int port) { \
86208 + out##bwl##_quad(value, port, 0); \
86209 +} \
86210 +static inline unsigned type in##bwl##_quad(int port, int quad) { \
86211 + if (xquad_portio) \
86212 + return read##bwl(XQUAD_PORT_ADDR(port, quad)); \
86213 + else \
86214 + return in##bwl##_local(port); \
86215 +} \
86216 +static inline unsigned type in##bwl(int port) { \
86217 + return in##bwl##_quad(port, 0); \
86218 +}
86219 +#else
86220 +#define __BUILDIO(bwl,bw,type) \
86221 +static inline void out##bwl(unsigned type value, int port) { \
86222 + out##bwl##_local(value, port); \
86223 +} \
86224 +static inline unsigned type in##bwl(int port) { \
86225 + return in##bwl##_local(port); \
86226 +}
86227 +#endif
86228 +
86229 +
86230 +#define BUILDIO(bwl,bw,type) \
86231 +static inline void out##bwl##_local(unsigned type value, int port) { \
86232 + __asm__ __volatile__("out" #bwl " %" #bw "0, %w1" : : "a"(value), "Nd"(port)); \
86233 +} \
86234 +static inline unsigned type in##bwl##_local(int port) { \
86235 + unsigned type value; \
86236 + __asm__ __volatile__("in" #bwl " %w1, %" #bw "0" : "=a"(value) : "Nd"(port)); \
86237 + return value; \
86238 +} \
86239 +static inline void out##bwl##_local_p(unsigned type value, int port) { \
86240 + out##bwl##_local(value, port); \
86241 + slow_down_io(); \
86242 +} \
86243 +static inline unsigned type in##bwl##_local_p(int port) { \
86244 + unsigned type value = in##bwl##_local(port); \
86245 + slow_down_io(); \
86246 + return value; \
86247 +} \
86248 +__BUILDIO(bwl,bw,type) \
86249 +static inline void out##bwl##_p(unsigned type value, int port) { \
86250 + out##bwl(value, port); \
86251 + slow_down_io(); \
86252 +} \
86253 +static inline unsigned type in##bwl##_p(int port) { \
86254 + unsigned type value = in##bwl(port); \
86255 + slow_down_io(); \
86256 + return value; \
86257 +} \
86258 +static inline void outs##bwl(int port, const void *addr, unsigned long count) { \
86259 + __asm__ __volatile__("rep; outs" #bwl : "+S"(addr), "+c"(count) : "d"(port)); \
86260 +} \
86261 +static inline void ins##bwl(int port, void *addr, unsigned long count) { \
86262 + __asm__ __volatile__("rep; ins" #bwl : "+D"(addr), "+c"(count) : "d"(port)); \
86263 +}
86264 +
86265 +BUILDIO(b,b,char)
86266 +BUILDIO(w,w,short)
86267 +BUILDIO(l,,int)
86268 +
86269 +/* We will be supplying our own /dev/mem implementation */
86270 +#define ARCH_HAS_DEV_MEM
86271 +
86272 +#endif
86273 diff -Nur linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/kmap_types.h linux-2.6.16.33/include/asm-i386/mach-xen/asm/kmap_types.h
86274 --- linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/kmap_types.h 1970-01-01 00:00:00.000000000 +0000
86275 +++ linux-2.6.16.33/include/asm-i386/mach-xen/asm/kmap_types.h 2007-01-08 15:00:45.000000000 +0000
86276 @@ -0,0 +1,32 @@
86277 +#ifndef _ASM_KMAP_TYPES_H
86278 +#define _ASM_KMAP_TYPES_H
86279 +
86280 +#include <linux/config.h>
86281 +
86282 +#ifdef CONFIG_DEBUG_HIGHMEM
86283 +# define D(n) __KM_FENCE_##n ,
86284 +#else
86285 +# define D(n)
86286 +#endif
86287 +
86288 +enum km_type {
86289 +D(0) KM_BOUNCE_READ,
86290 +D(1) KM_SKB_SUNRPC_DATA,
86291 +D(2) KM_SKB_DATA_SOFTIRQ,
86292 +D(3) KM_USER0,
86293 +D(4) KM_USER1,
86294 +D(5) KM_BIO_SRC_IRQ,
86295 +D(6) KM_BIO_DST_IRQ,
86296 +D(7) KM_PTE0,
86297 +D(8) KM_PTE1,
86298 +D(9) KM_IRQ0,
86299 +D(10) KM_IRQ1,
86300 +D(11) KM_SOFTIRQ0,
86301 +D(12) KM_SOFTIRQ1,
86302 +D(13) KM_SWIOTLB,
86303 +D(14) KM_TYPE_NR
86304 +};
86305 +
86306 +#undef D
86307 +
86308 +#endif
86309 diff -Nur linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/maddr.h linux-2.6.16.33/include/asm-i386/mach-xen/asm/maddr.h
86310 --- linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/maddr.h 1970-01-01 00:00:00.000000000 +0000
86311 +++ linux-2.6.16.33/include/asm-i386/mach-xen/asm/maddr.h 2007-01-08 15:00:45.000000000 +0000
86312 @@ -0,0 +1,176 @@
86313 +#ifndef _I386_MADDR_H
86314 +#define _I386_MADDR_H
86315 +
86316 +#include <xen/features.h>
86317 +#include <xen/interface/xen.h>
86318 +
86319 +/**** MACHINE <-> PHYSICAL CONVERSION MACROS ****/
86320 +#define INVALID_P2M_ENTRY (~0UL)
86321 +#define FOREIGN_FRAME_BIT (1UL<<31)
86322 +#define FOREIGN_FRAME(m) ((m) | FOREIGN_FRAME_BIT)
86323 +
86324 +/* Definitions for machine and pseudophysical addresses. */
86325 +#ifdef CONFIG_X86_PAE
86326 +typedef unsigned long long paddr_t;
86327 +typedef unsigned long long maddr_t;
86328 +#else
86329 +typedef unsigned long paddr_t;
86330 +typedef unsigned long maddr_t;
86331 +#endif
86332 +
86333 +#ifdef CONFIG_XEN
86334 +
86335 +extern unsigned long *phys_to_machine_mapping;
86336 +
86337 +#undef machine_to_phys_mapping
86338 +extern unsigned long *machine_to_phys_mapping;
86339 +extern unsigned int machine_to_phys_order;
86340 +
86341 +static inline unsigned long pfn_to_mfn(unsigned long pfn)
86342 +{
86343 + if (xen_feature(XENFEAT_auto_translated_physmap))
86344 + return pfn;
86345 + return phys_to_machine_mapping[(unsigned int)(pfn)] &
86346 + ~FOREIGN_FRAME_BIT;
86347 +}
86348 +
86349 +static inline int phys_to_machine_mapping_valid(unsigned long pfn)
86350 +{
86351 + if (xen_feature(XENFEAT_auto_translated_physmap))
86352 + return 1;
86353 + return (phys_to_machine_mapping[pfn] != INVALID_P2M_ENTRY);
86354 +}
86355 +
86356 +static inline unsigned long mfn_to_pfn(unsigned long mfn)
86357 +{
86358 + extern unsigned long max_mapnr;
86359 + unsigned long pfn;
86360 +
86361 + if (xen_feature(XENFEAT_auto_translated_physmap))
86362 + return mfn;
86363 +
86364 + if (unlikely((mfn >> machine_to_phys_order) != 0))
86365 + return max_mapnr;
86366 +
86367 + /* The array access can fail (e.g., device space beyond end of RAM). */
86368 + asm (
86369 + "1: movl %1,%0\n"
86370 + "2:\n"
86371 + ".section .fixup,\"ax\"\n"
86372 + "3: movl %2,%0\n"
86373 + " jmp 2b\n"
86374 + ".previous\n"
86375 + ".section __ex_table,\"a\"\n"
86376 + " .align 4\n"
86377 + " .long 1b,3b\n"
86378 + ".previous"
86379 + : "=r" (pfn)
86380 + : "m" (machine_to_phys_mapping[mfn]), "m" (max_mapnr) );
86381 +
86382 + return pfn;
86383 +}
86384 +
86385 +/*
86386 + * We detect special mappings in one of two ways:
86387 + * 1. If the MFN is an I/O page then Xen will set the m2p entry
86388 + * to be outside our maximum possible pseudophys range.
86389 + * 2. If the MFN belongs to a different domain then we will certainly
86390 + * not have MFN in our p2m table. Conversely, if the page is ours,
86391 + * then we'll have p2m(m2p(MFN))==MFN.
86392 + * If we detect a special mapping then it doesn't have a 'struct page'.
86393 + * We force !pfn_valid() by returning an out-of-range pointer.
86394 + *
86395 + * NB. These checks require that, for any MFN that is not in our reservation,
86396 + * there is no PFN such that p2m(PFN) == MFN. Otherwise we can get confused if
86397 + * we are foreign-mapping the MFN, and the other domain has m2p(MFN) == PFN.
86398 + * Yikes! Various places must poke in INVALID_P2M_ENTRY for safety.
86399 + *
86400 + * NB2. When deliberately mapping foreign pages into the p2m table, you *must*
86401 + * use FOREIGN_FRAME(). This will cause pte_pfn() to choke on it, as we
86402 + * require. In all the cases we care about, the FOREIGN_FRAME bit is
86403 + * masked (e.g., pfn_to_mfn()) so behaviour there is correct.
86404 + */
86405 +static inline unsigned long mfn_to_local_pfn(unsigned long mfn)
86406 +{
86407 + extern unsigned long max_mapnr;
86408 + unsigned long pfn = mfn_to_pfn(mfn);
86409 + if ((pfn < max_mapnr)
86410 + && !xen_feature(XENFEAT_auto_translated_physmap)
86411 + && (phys_to_machine_mapping[pfn] != mfn))
86412 + return max_mapnr; /* force !pfn_valid() */
86413 + return pfn;
86414 +}
86415 +
86416 +static inline void set_phys_to_machine(unsigned long pfn, unsigned long mfn)
86417 +{
86418 + if (xen_feature(XENFEAT_auto_translated_physmap)) {
86419 + BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
86420 + return;
86421 + }
86422 + phys_to_machine_mapping[pfn] = mfn;
86423 +}
86424 +
86425 +static inline maddr_t phys_to_machine(paddr_t phys)
86426 +{
86427 + maddr_t machine = pfn_to_mfn(phys >> PAGE_SHIFT);
86428 + machine = (machine << PAGE_SHIFT) | (phys & ~PAGE_MASK);
86429 + return machine;
86430 +}
86431 +
86432 +static inline paddr_t machine_to_phys(maddr_t machine)
86433 +{
86434 + paddr_t phys = mfn_to_pfn(machine >> PAGE_SHIFT);
86435 + phys = (phys << PAGE_SHIFT) | (machine & ~PAGE_MASK);
86436 + return phys;
86437 +}
86438 +
86439 +static inline paddr_t pte_machine_to_phys(maddr_t machine)
86440 +{
86441 + /*
86442 + * In PAE mode, the NX bit needs to be dealt with in the value
86443 + * passed to mfn_to_pfn(). On x86_64, we need to mask it off,
86444 + * but for i386 the conversion to ulong for the argument will
86445 + * clip it off.
86446 + */
86447 + paddr_t phys = mfn_to_pfn(machine >> PAGE_SHIFT);
86448 + phys = (phys << PAGE_SHIFT) | (machine & ~PHYSICAL_PAGE_MASK);
86449 + return phys;
86450 +}
86451 +
86452 +#else /* !CONFIG_XEN */
86453 +
86454 +#define pfn_to_mfn(pfn) (pfn)
86455 +#define mfn_to_pfn(mfn) (mfn)
86456 +#define mfn_to_local_pfn(mfn) (mfn)
86457 +#define set_phys_to_machine(pfn, mfn) BUG_ON((pfn) != (mfn))
86458 +#define phys_to_machine_mapping_valid(pfn) (1)
86459 +#define phys_to_machine(phys) ((maddr_t)(phys))
86460 +#define machine_to_phys(mach) ((paddr_t)(mach))
86461 +#define pte_machine_to_phys(mach) ((paddr_t)(mach))
86462 +
86463 +#endif /* !CONFIG_XEN */
86464 +
86465 +/* VIRT <-> MACHINE conversion */
86466 +#define virt_to_machine(v) (phys_to_machine(__pa(v)))
86467 +#define virt_to_mfn(v) (pfn_to_mfn(__pa(v) >> PAGE_SHIFT))
86468 +#define mfn_to_virt(m) (__va(mfn_to_pfn(m) << PAGE_SHIFT))
86469 +
86470 +#ifdef CONFIG_X86_PAE
86471 +static inline pte_t pfn_pte_ma(unsigned long page_nr, pgprot_t pgprot)
86472 +{
86473 + pte_t pte;
86474 +
86475 + pte.pte_high = (page_nr >> (32 - PAGE_SHIFT)) | \
86476 + (pgprot_val(pgprot) >> 32);
86477 + pte.pte_high &= (__supported_pte_mask >> 32);
86478 + pte.pte_low = ((page_nr << PAGE_SHIFT) | pgprot_val(pgprot)) & \
86479 + __supported_pte_mask;
86480 + return pte;
86481 +}
86482 +#else
86483 +#define pfn_pte_ma(pfn, prot) __pte_ma(((pfn) << PAGE_SHIFT) | pgprot_val(prot))
86484 +#endif
86485 +
86486 +#define __pte_ma(x) ((pte_t) { (x) } )
86487 +
86488 +#endif /* _I386_MADDR_H */
86489 diff -Nur linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/mmu.h linux-2.6.16.33/include/asm-i386/mach-xen/asm/mmu.h
86490 --- linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/mmu.h 1970-01-01 00:00:00.000000000 +0000
86491 +++ linux-2.6.16.33/include/asm-i386/mach-xen/asm/mmu.h 2007-01-08 15:00:45.000000000 +0000
86492 @@ -0,0 +1,28 @@
86493 +#ifndef __i386_MMU_H
86494 +#define __i386_MMU_H
86495 +
86496 +#include <asm/semaphore.h>
86497 +/*
86498 + * The i386 doesn't have a mmu context, but
86499 + * we put the segment information here.
86500 + *
86501 + * cpu_vm_mask is used to optimize ldt flushing.
86502 + */
86503 +typedef struct {
86504 + int size;
86505 + struct semaphore sem;
86506 + void *ldt;
86507 +#ifdef CONFIG_XEN
86508 + int has_foreign_mappings;
86509 +#endif
86510 +} mm_context_t;
86511 +
86512 +/* mm/memory.c:exit_mmap hook */
86513 +extern void _arch_exit_mmap(struct mm_struct *mm);
86514 +#define arch_exit_mmap(_mm) _arch_exit_mmap(_mm)
86515 +
86516 +/* kernel/fork.c:dup_mmap hook */
86517 +extern void _arch_dup_mmap(struct mm_struct *mm);
86518 +#define arch_dup_mmap(mm, oldmm) ((void)(oldmm), _arch_dup_mmap(mm))
86519 +
86520 +#endif
86521 diff -Nur linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/mmu_context.h linux-2.6.16.33/include/asm-i386/mach-xen/asm/mmu_context.h
86522 --- linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/mmu_context.h 1970-01-01 00:00:00.000000000 +0000
86523 +++ linux-2.6.16.33/include/asm-i386/mach-xen/asm/mmu_context.h 2007-01-08 15:00:45.000000000 +0000
86524 @@ -0,0 +1,109 @@
86525 +#ifndef __I386_SCHED_H
86526 +#define __I386_SCHED_H
86527 +
86528 +#include <linux/config.h>
86529 +#include <asm/desc.h>
86530 +#include <asm/atomic.h>
86531 +#include <asm/pgalloc.h>
86532 +#include <asm/tlbflush.h>
86533 +
86534 +/*
86535 + * Used for LDT copy/destruction.
86536 + */
86537 +int init_new_context(struct task_struct *tsk, struct mm_struct *mm);
86538 +void destroy_context(struct mm_struct *mm);
86539 +
86540 +
86541 +static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
86542 +{
86543 +#if 0 /* XEN: no lazy tlb */
86544 + unsigned cpu = smp_processor_id();
86545 + if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK)
86546 + per_cpu(cpu_tlbstate, cpu).state = TLBSTATE_LAZY;
86547 +#endif
86548 +}
86549 +
86550 +#define prepare_arch_switch(next) __prepare_arch_switch()
86551 +
86552 +static inline void __prepare_arch_switch(void)
86553 +{
86554 + /*
86555 + * Save away %fs and %gs. No need to save %es and %ds, as those
86556 + * are always kernel segments while inside the kernel. Must
86557 + * happen before reload of cr3/ldt (i.e., not in __switch_to).
86558 + */
86559 + asm volatile ( "mov %%fs,%0 ; mov %%gs,%1"
86560 + : "=m" (current->thread.fs),
86561 + "=m" (current->thread.gs));
86562 + asm volatile ( "movl %0,%%fs ; movl %0,%%gs"
86563 + : : "r" (0) );
86564 +}
86565 +
86566 +extern void mm_pin(struct mm_struct *mm);
86567 +extern void mm_unpin(struct mm_struct *mm);
86568 +void mm_pin_all(void);
86569 +
86570 +static inline void switch_mm(struct mm_struct *prev,
86571 + struct mm_struct *next,
86572 + struct task_struct *tsk)
86573 +{
86574 + int cpu = smp_processor_id();
86575 + struct mmuext_op _op[2], *op = _op;
86576 +
86577 + if (likely(prev != next)) {
86578 + BUG_ON(!xen_feature(XENFEAT_writable_page_tables) &&
86579 + !test_bit(PG_pinned, &virt_to_page(next->pgd)->flags));
86580 +
86581 + /* stop flush ipis for the previous mm */
86582 + cpu_clear(cpu, prev->cpu_vm_mask);
86583 +#if 0 /* XEN: no lazy tlb */
86584 + per_cpu(cpu_tlbstate, cpu).state = TLBSTATE_OK;
86585 + per_cpu(cpu_tlbstate, cpu).active_mm = next;
86586 +#endif
86587 + cpu_set(cpu, next->cpu_vm_mask);
86588 +
86589 + /* Re-load page tables: load_cr3(next->pgd) */
86590 + op->cmd = MMUEXT_NEW_BASEPTR;
86591 + op->arg1.mfn = pfn_to_mfn(__pa(next->pgd) >> PAGE_SHIFT);
86592 + op++;
86593 +
86594 + /*
86595 + * load the LDT, if the LDT is different:
86596 + */
86597 + if (unlikely(prev->context.ldt != next->context.ldt)) {
86598 + /* load_LDT_nolock(&next->context, cpu) */
86599 + op->cmd = MMUEXT_SET_LDT;
86600 + op->arg1.linear_addr = (unsigned long)next->context.ldt;
86601 + op->arg2.nr_ents = next->context.size;
86602 + op++;
86603 + }
86604 +
86605 + BUG_ON(HYPERVISOR_mmuext_op(_op, op-_op, NULL, DOMID_SELF));
86606 + }
86607 +#if 0 /* XEN: no lazy tlb */
86608 + else {
86609 + per_cpu(cpu_tlbstate, cpu).state = TLBSTATE_OK;
86610 + BUG_ON(per_cpu(cpu_tlbstate, cpu).active_mm != next);
86611 +
86612 + if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) {
86613 + /* We were in lazy tlb mode and leave_mm disabled
86614 + * tlb flush IPI delivery. We must reload %cr3.
86615 + */
86616 + load_cr3(next->pgd);
86617 + load_LDT_nolock(&next->context, cpu);
86618 + }
86619 + }
86620 +#endif
86621 +}
86622 +
86623 +#define deactivate_mm(tsk, mm) \
86624 + asm("movl %0,%%fs ; movl %0,%%gs": :"r" (0))
86625 +
86626 +static inline void activate_mm(struct mm_struct *prev, struct mm_struct *next)
86627 +{
86628 + if (!test_bit(PG_pinned, &virt_to_page(next->pgd)->flags))
86629 + mm_pin(next);
86630 + switch_mm(prev, next, NULL);
86631 +}
86632 +
86633 +#endif
86634 diff -Nur linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/page.h linux-2.6.16.33/include/asm-i386/mach-xen/asm/page.h
86635 --- linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/page.h 1970-01-01 00:00:00.000000000 +0000
86636 +++ linux-2.6.16.33/include/asm-i386/mach-xen/asm/page.h 2007-01-08 15:00:45.000000000 +0000
86637 @@ -0,0 +1,227 @@
86638 +#ifndef _I386_PAGE_H
86639 +#define _I386_PAGE_H
86640 +
86641 +/* PAGE_SHIFT determines the page size */
86642 +#define PAGE_SHIFT 12
86643 +#define PAGE_SIZE (1UL << PAGE_SHIFT)
86644 +#define PAGE_MASK (~(PAGE_SIZE-1))
86645 +
86646 +#ifdef CONFIG_X86_PAE
86647 +#define __PHYSICAL_MASK_SHIFT 36
86648 +#define __PHYSICAL_MASK ((1ULL << __PHYSICAL_MASK_SHIFT) - 1)
86649 +#define PHYSICAL_PAGE_MASK (~((1ULL << PAGE_SHIFT) - 1) & __PHYSICAL_MASK)
86650 +#else
86651 +#define __PHYSICAL_MASK_SHIFT 32
86652 +#define __PHYSICAL_MASK (~0UL)
86653 +#define PHYSICAL_PAGE_MASK (PAGE_MASK & __PHYSICAL_MASK)
86654 +#endif
86655 +
86656 +#define LARGE_PAGE_MASK (~(LARGE_PAGE_SIZE-1))
86657 +#define LARGE_PAGE_SIZE (1UL << PMD_SHIFT)
86658 +
86659 +#ifdef __KERNEL__
86660 +#ifndef __ASSEMBLY__
86661 +
86662 +#include <linux/config.h>
86663 +#include <linux/string.h>
86664 +#include <linux/types.h>
86665 +#include <linux/kernel.h>
86666 +#include <asm/bug.h>
86667 +#include <xen/interface/xen.h>
86668 +#include <xen/features.h>
86669 +#include <xen/foreign_page.h>
86670 +
86671 +#define arch_free_page(_page,_order) \
86672 +({ int foreign = PageForeign(_page); \
86673 + if (foreign) \
86674 + (PageForeignDestructor(_page))(_page); \
86675 + foreign; \
86676 +})
86677 +#define HAVE_ARCH_FREE_PAGE
86678 +
86679 +#ifdef CONFIG_XEN_SCRUB_PAGES
86680 +#define scrub_pages(_p,_n) memset((void *)(_p), 0, (_n) << PAGE_SHIFT)
86681 +#else
86682 +#define scrub_pages(_p,_n) ((void)0)
86683 +#endif
86684 +
86685 +#ifdef CONFIG_X86_USE_3DNOW
86686 +
86687 +#include <asm/mmx.h>
86688 +
86689 +#define clear_page(page) mmx_clear_page((void *)(page))
86690 +#define copy_page(to,from) mmx_copy_page(to,from)
86691 +
86692 +#else
86693 +
86694 +#define alloc_zeroed_user_highpage(vma, vaddr) alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, vaddr)
86695 +#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
86696 +
86697 +/*
86698 + * On older X86 processors it's not a win to use MMX here it seems.
86699 + * Maybe the K6-III ?
86700 + */
86701 +
86702 +#define clear_page(page) memset((void *)(page), 0, PAGE_SIZE)
86703 +#define copy_page(to,from) memcpy((void *)(to), (void *)(from), PAGE_SIZE)
86704 +
86705 +#endif
86706 +
86707 +#define clear_user_page(page, vaddr, pg) clear_page(page)
86708 +#define copy_user_page(to, from, vaddr, pg) copy_page(to, from)
86709 +
86710 +/*
86711 + * These are used to make use of C type-checking..
86712 + */
86713 +extern int nx_enabled;
86714 +#ifdef CONFIG_X86_PAE
86715 +extern unsigned long long __supported_pte_mask;
86716 +typedef struct { unsigned long pte_low, pte_high; } pte_t;
86717 +typedef struct { unsigned long long pmd; } pmd_t;
86718 +typedef struct { unsigned long long pgd; } pgd_t;
86719 +typedef struct { unsigned long long pgprot; } pgprot_t;
86720 +#define pgprot_val(x) ((x).pgprot)
86721 +#include <asm/maddr.h>
86722 +#define __pte(x) ({ unsigned long long _x = (x); \
86723 + if (_x & 1) _x = phys_to_machine(_x); \
86724 + ((pte_t) {(unsigned long)(_x), (unsigned long)(_x>>32)}); })
86725 +#define __pgd(x) ({ unsigned long long _x = (x); \
86726 + (((_x)&1) ? ((pgd_t) {phys_to_machine(_x)}) : ((pgd_t) {(_x)})); })
86727 +#define __pmd(x) ({ unsigned long long _x = (x); \
86728 + (((_x)&1) ? ((pmd_t) {phys_to_machine(_x)}) : ((pmd_t) {(_x)})); })
86729 +static inline unsigned long long pte_val(pte_t x)
86730 +{
86731 + unsigned long long ret;
86732 +
86733 + if (x.pte_low) {
86734 + ret = x.pte_low | (unsigned long long)x.pte_high << 32;
86735 + ret = pte_machine_to_phys(ret) | 1;
86736 + } else {
86737 + ret = 0;
86738 + }
86739 + return ret;
86740 +}
86741 +static inline unsigned long long pmd_val(pmd_t x)
86742 +{
86743 + unsigned long long ret = x.pmd;
86744 + if (ret) ret = pte_machine_to_phys(ret) | 1;
86745 + return ret;
86746 +}
86747 +static inline unsigned long long pgd_val(pgd_t x)
86748 +{
86749 + unsigned long long ret = x.pgd;
86750 + if (ret) ret = pte_machine_to_phys(ret) | 1;
86751 + return ret;
86752 +}
86753 +static inline unsigned long long pte_val_ma(pte_t x)
86754 +{
86755 + return (unsigned long long)x.pte_high << 32 | x.pte_low;
86756 +}
86757 +#define HPAGE_SHIFT 21
86758 +#else
86759 +typedef struct { unsigned long pte_low; } pte_t;
86760 +typedef struct { unsigned long pgd; } pgd_t;
86761 +typedef struct { unsigned long pgprot; } pgprot_t;
86762 +#define pgprot_val(x) ((x).pgprot)
86763 +#include <asm/maddr.h>
86764 +#define boot_pte_t pte_t /* or would you rather have a typedef */
86765 +#define pte_val(x) (((x).pte_low & 1) ? \
86766 + pte_machine_to_phys((x).pte_low) : \
86767 + (x).pte_low)
86768 +#define pte_val_ma(x) ((x).pte_low)
86769 +#define __pte(x) ({ unsigned long _x = (x); \
86770 + (((_x)&1) ? ((pte_t) {phys_to_machine(_x)}) : ((pte_t) {(_x)})); })
86771 +#define __pgd(x) ({ unsigned long _x = (x); \
86772 + (((_x)&1) ? ((pgd_t) {phys_to_machine(_x)}) : ((pgd_t) {(_x)})); })
86773 +static inline unsigned long pgd_val(pgd_t x)
86774 +{
86775 + unsigned long ret = x.pgd;
86776 + if (ret) ret = pte_machine_to_phys(ret) | 1;
86777 + return ret;
86778 +}
86779 +#define HPAGE_SHIFT 22
86780 +#endif
86781 +#define PTE_MASK PAGE_MASK
86782 +
86783 +#ifdef CONFIG_HUGETLB_PAGE
86784 +#define HPAGE_SIZE ((1UL) << HPAGE_SHIFT)
86785 +#define HPAGE_MASK (~(HPAGE_SIZE - 1))
86786 +#define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT)
86787 +#define HAVE_ARCH_HUGETLB_UNMAPPED_AREA
86788 +#endif
86789 +
86790 +#define __pgprot(x) ((pgprot_t) { (x) } )
86791 +
86792 +#endif /* !__ASSEMBLY__ */
86793 +
86794 +/* to align the pointer to the (next) page boundary */
86795 +#define PAGE_ALIGN(addr) (((addr)+PAGE_SIZE-1)&PAGE_MASK)
86796 +
86797 +/*
86798 + * This handles the memory map.. We could make this a config
86799 + * option, but too many people screw it up, and too few need
86800 + * it.
86801 + *
86802 + * A __PAGE_OFFSET of 0xC0000000 means that the kernel has
86803 + * a virtual address space of one gigabyte, which limits the
86804 + * amount of physical memory you can use to about 950MB.
86805 + *
86806 + * If you want more physical memory than this then see the CONFIG_HIGHMEM4G
86807 + * and CONFIG_HIGHMEM64G options in the kernel configuration.
86808 + */
86809 +
86810 +#ifndef __ASSEMBLY__
86811 +
86812 +/*
86813 + * This much address space is reserved for vmalloc() and iomap()
86814 + * as well as fixmap mappings.
86815 + */
86816 +extern unsigned int __VMALLOC_RESERVE;
86817 +
86818 +extern int sysctl_legacy_va_layout;
86819 +
86820 +extern int page_is_ram(unsigned long pagenr);
86821 +
86822 +#endif /* __ASSEMBLY__ */
86823 +
86824 +#ifdef __ASSEMBLY__
86825 +#define __PAGE_OFFSET CONFIG_PAGE_OFFSET
86826 +#define __PHYSICAL_START CONFIG_PHYSICAL_START
86827 +#else
86828 +#define __PAGE_OFFSET ((unsigned long)CONFIG_PAGE_OFFSET)
86829 +#define __PHYSICAL_START ((unsigned long)CONFIG_PHYSICAL_START)
86830 +#endif
86831 +#define __KERNEL_START (__PAGE_OFFSET + __PHYSICAL_START)
86832 +
86833 +#ifdef CONFIG_XEN_COMPAT_030002
86834 +#undef LOAD_OFFSET
86835 +#define LOAD_OFFSET 0
86836 +#endif /* CONFIG_XEN_COMPAT_030002 */
86837 +
86838 +#define PAGE_OFFSET ((unsigned long)__PAGE_OFFSET)
86839 +#define VMALLOC_RESERVE ((unsigned long)__VMALLOC_RESERVE)
86840 +#define MAXMEM (__FIXADDR_TOP-__PAGE_OFFSET-__VMALLOC_RESERVE)
86841 +#define __pa(x) ((unsigned long)(x)-PAGE_OFFSET)
86842 +#define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET))
86843 +#define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT)
86844 +#ifdef CONFIG_FLATMEM
86845 +#define pfn_to_page(pfn) (mem_map + (pfn))
86846 +#define page_to_pfn(page) ((unsigned long)((page) - mem_map))
86847 +#define pfn_valid(pfn) ((pfn) < max_mapnr)
86848 +#endif /* CONFIG_FLATMEM */
86849 +#define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)
86850 +
86851 +#define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT)
86852 +
86853 +#define VM_DATA_DEFAULT_FLAGS \
86854 + (VM_READ | VM_WRITE | \
86855 + ((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0 ) | \
86856 + VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
86857 +
86858 +#define __HAVE_ARCH_GATE_AREA 1
86859 +
86860 +#endif /* __KERNEL__ */
86861 +
86862 +#include <asm-generic/page.h>
86863 +
86864 +#endif /* _I386_PAGE_H */
86865 diff -Nur linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/param.h linux-2.6.16.33/include/asm-i386/mach-xen/asm/param.h
86866 --- linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/param.h 1970-01-01 00:00:00.000000000 +0000
86867 +++ linux-2.6.16.33/include/asm-i386/mach-xen/asm/param.h 2007-01-08 15:00:45.000000000 +0000
86868 @@ -0,0 +1,24 @@
86869 +#ifndef _ASMi386_PARAM_H
86870 +#define _ASMi386_PARAM_H
86871 +
86872 +#ifdef __KERNEL__
86873 +# include <linux/config.h>
86874 +# define HZ CONFIG_HZ /* Internal kernel timer frequency */
86875 +# define USER_HZ 100 /* .. some user interfaces are in "ticks" */
86876 +# define CLOCKS_PER_SEC (USER_HZ) /* like times() */
86877 +#endif
86878 +
86879 +#ifndef HZ
86880 +#define HZ 100
86881 +#endif
86882 +
86883 +#define EXEC_PAGESIZE 4096
86884 +
86885 +#ifndef NOGROUP
86886 +#define NOGROUP (-1)
86887 +#endif
86888 +
86889 +#define MAXHOSTNAMELEN 64 /* max length of hostname */
86890 +#define COMMAND_LINE_SIZE 256
86891 +
86892 +#endif
86893 diff -Nur linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/pci.h linux-2.6.16.33/include/asm-i386/mach-xen/asm/pci.h
86894 --- linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/pci.h 1970-01-01 00:00:00.000000000 +0000
86895 +++ linux-2.6.16.33/include/asm-i386/mach-xen/asm/pci.h 2007-01-08 15:00:45.000000000 +0000
86896 @@ -0,0 +1,154 @@
86897 +#ifndef __i386_PCI_H
86898 +#define __i386_PCI_H
86899 +
86900 +#include <linux/config.h>
86901 +
86902 +#ifdef __KERNEL__
86903 +#include <linux/mm.h> /* for struct page */
86904 +
86905 +/* Can be used to override the logic in pci_scan_bus for skipping
86906 + already-configured bus numbers - to be used for buggy BIOSes
86907 + or architectures with incomplete PCI setup by the loader */
86908 +
86909 +#ifdef CONFIG_PCI
86910 +extern unsigned int pcibios_assign_all_busses(void);
86911 +#else
86912 +#define pcibios_assign_all_busses() 0
86913 +#endif
86914 +#define pcibios_scan_all_fns(a, b) 0
86915 +
86916 +extern unsigned long pci_mem_start;
86917 +#define PCIBIOS_MIN_IO 0x1000
86918 +#define PCIBIOS_MIN_MEM (pci_mem_start)
86919 +
86920 +#define PCIBIOS_MIN_CARDBUS_IO 0x4000
86921 +
86922 +void pcibios_config_init(void);
86923 +struct pci_bus * pcibios_scan_root(int bus);
86924 +
86925 +void pcibios_set_master(struct pci_dev *dev);
86926 +void pcibios_penalize_isa_irq(int irq, int active);
86927 +struct irq_routing_table *pcibios_get_irq_routing_table(void);
86928 +int pcibios_set_irq_routing(struct pci_dev *dev, int pin, int irq);
86929 +
86930 +/* Dynamic DMA mapping stuff.
86931 + * i386 has everything mapped statically.
86932 + */
86933 +
86934 +#include <linux/types.h>
86935 +#include <linux/slab.h>
86936 +#include <asm/scatterlist.h>
86937 +#include <linux/string.h>
86938 +#include <asm/io.h>
86939 +
86940 +struct pci_dev;
86941 +
86942 +#ifdef CONFIG_SWIOTLB
86943 +
86944 +
86945 +/* On Xen we use SWIOTLB instead of blk-specific bounce buffers. */
86946 +#define PCI_DMA_BUS_IS_PHYS (0)
86947 +
86948 +#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME) \
86949 + dma_addr_t ADDR_NAME;
86950 +#define DECLARE_PCI_UNMAP_LEN(LEN_NAME) \
86951 + __u32 LEN_NAME;
86952 +#define pci_unmap_addr(PTR, ADDR_NAME) \
86953 + ((PTR)->ADDR_NAME)
86954 +#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL) \
86955 + (((PTR)->ADDR_NAME) = (VAL))
86956 +#define pci_unmap_len(PTR, LEN_NAME) \
86957 + ((PTR)->LEN_NAME)
86958 +#define pci_unmap_len_set(PTR, LEN_NAME, VAL) \
86959 + (((PTR)->LEN_NAME) = (VAL))
86960 +
86961 +#else
86962 +
86963 +/* The PCI address space does equal the physical memory
86964 + * address space. The networking and block device layers use
86965 + * this boolean for bounce buffer decisions.
86966 + */
86967 +#define PCI_DMA_BUS_IS_PHYS (1)
86968 +
86969 +/* pci_unmap_{page,single} is a nop so... */
86970 +#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME)
86971 +#define DECLARE_PCI_UNMAP_LEN(LEN_NAME)
86972 +#define pci_unmap_addr(PTR, ADDR_NAME) (0)
86973 +#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL) do { } while (0)
86974 +#define pci_unmap_len(PTR, LEN_NAME) (0)
86975 +#define pci_unmap_len_set(PTR, LEN_NAME, VAL) do { } while (0)
86976 +
86977 +#endif
86978 +
86979 +/* This is always fine. */
86980 +#define pci_dac_dma_supported(pci_dev, mask) (1)
86981 +
86982 +static inline dma64_addr_t
86983 +pci_dac_page_to_dma(struct pci_dev *pdev, struct page *page, unsigned long offset, int direction)
86984 +{
86985 + return ((dma64_addr_t) page_to_phys(page) +
86986 + (dma64_addr_t) offset);
86987 +}
86988 +
86989 +static inline struct page *
86990 +pci_dac_dma_to_page(struct pci_dev *pdev, dma64_addr_t dma_addr)
86991 +{
86992 + return pfn_to_page(dma_addr >> PAGE_SHIFT);
86993 +}
86994 +
86995 +static inline unsigned long
86996 +pci_dac_dma_to_offset(struct pci_dev *pdev, dma64_addr_t dma_addr)
86997 +{
86998 + return (dma_addr & ~PAGE_MASK);
86999 +}
87000 +
87001 +static inline void
87002 +pci_dac_dma_sync_single_for_cpu(struct pci_dev *pdev, dma64_addr_t dma_addr, size_t len, int direction)
87003 +{
87004 +}
87005 +
87006 +static inline void
87007 +pci_dac_dma_sync_single_for_device(struct pci_dev *pdev, dma64_addr_t dma_addr, size_t len, int direction)
87008 +{
87009 + flush_write_buffers();
87010 +}
87011 +
87012 +#define HAVE_PCI_MMAP
87013 +extern int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
87014 + enum pci_mmap_state mmap_state, int write_combine);
87015 +
87016 +
87017 +static inline void pcibios_add_platform_entries(struct pci_dev *dev)
87018 +{
87019 +}
87020 +
87021 +#ifdef CONFIG_PCI
87022 +static inline void pci_dma_burst_advice(struct pci_dev *pdev,
87023 + enum pci_dma_burst_strategy *strat,
87024 + unsigned long *strategy_parameter)
87025 +{
87026 + *strat = PCI_DMA_BURST_INFINITY;
87027 + *strategy_parameter = ~0UL;
87028 +}
87029 +#endif
87030 +
87031 +#endif /* __KERNEL__ */
87032 +
87033 +#ifdef CONFIG_XEN_PCIDEV_FRONTEND
87034 +#include <xen/pcifront.h>
87035 +#endif /* CONFIG_XEN_PCIDEV_FRONTEND */
87036 +
87037 +/* implement the pci_ DMA API in terms of the generic device dma_ one */
87038 +#include <asm-generic/pci-dma-compat.h>
87039 +
87040 +/* generic pci stuff */
87041 +#include <asm-generic/pci.h>
87042 +
87043 +/* On Xen we have to scan all functions since Xen hides bridges from
87044 + * us. If a bridge is at fn=0 and that slot has a multifunction
87045 + * device, we won't find the additional devices without scanning all
87046 + * functions. */
87047 +#undef pcibios_scan_all_fns
87048 +#define pcibios_scan_all_fns(a, b) 1
87049 +
87050 +#endif /* __i386_PCI_H */
87051 diff -Nur linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/pgalloc.h linux-2.6.16.33/include/asm-i386/mach-xen/asm/pgalloc.h
87052 --- linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/pgalloc.h 1970-01-01 00:00:00.000000000 +0000
87053 +++ linux-2.6.16.33/include/asm-i386/mach-xen/asm/pgalloc.h 2007-01-08 15:00:46.000000000 +0000
87054 @@ -0,0 +1,64 @@
87055 +#ifndef _I386_PGALLOC_H
87056 +#define _I386_PGALLOC_H
87057 +
87058 +#include <linux/config.h>
87059 +#include <asm/fixmap.h>
87060 +#include <linux/threads.h>
87061 +#include <linux/mm.h> /* for struct page */
87062 +#include <asm/io.h> /* for phys_to_virt and page_to_pseudophys */
87063 +
87064 +/* Is this pagetable pinned? */
87065 +#define PG_pinned PG_arch_1
87066 +
87067 +#define pmd_populate_kernel(mm, pmd, pte) \
87068 + set_pmd(pmd, __pmd(_PAGE_TABLE + __pa(pte)))
87069 +
87070 +#define pmd_populate(mm, pmd, pte) \
87071 +do { \
87072 + if (test_bit(PG_pinned, &virt_to_page((mm)->pgd)->flags)) { \
87073 + if (!PageHighMem(pte)) \
87074 + BUG_ON(HYPERVISOR_update_va_mapping( \
87075 + (unsigned long)__va(page_to_pfn(pte)<<PAGE_SHIFT),\
87076 + pfn_pte(page_to_pfn(pte), PAGE_KERNEL_RO), 0));\
87077 + set_pmd(pmd, __pmd(_PAGE_TABLE + \
87078 + ((unsigned long long)page_to_pfn(pte) << \
87079 + (unsigned long long) PAGE_SHIFT))); \
87080 + } else { \
87081 + *(pmd) = __pmd(_PAGE_TABLE + \
87082 + ((unsigned long long)page_to_pfn(pte) << \
87083 + (unsigned long long) PAGE_SHIFT)); \
87084 + } \
87085 +} while (0)
87086 +
87087 +/*
87088 + * Allocate and free page tables.
87089 + */
87090 +extern pgd_t *pgd_alloc(struct mm_struct *);
87091 +extern void pgd_free(pgd_t *pgd);
87092 +
87093 +extern pte_t *pte_alloc_one_kernel(struct mm_struct *, unsigned long);
87094 +extern struct page *pte_alloc_one(struct mm_struct *, unsigned long);
87095 +
87096 +static inline void pte_free_kernel(pte_t *pte)
87097 +{
87098 + free_page((unsigned long)pte);
87099 + make_page_writable(pte, XENFEAT_writable_page_tables);
87100 +}
87101 +
87102 +extern void pte_free(struct page *pte);
87103 +
87104 +#define __pte_free_tlb(tlb,pte) tlb_remove_page((tlb),(pte))
87105 +
87106 +#ifdef CONFIG_X86_PAE
87107 +/*
87108 + * In the PAE case we free the pmds as part of the pgd.
87109 + */
87110 +#define pmd_alloc_one(mm, addr) ({ BUG(); ((pmd_t *)2); })
87111 +#define pmd_free(x) do { } while (0)
87112 +#define __pmd_free_tlb(tlb,x) do { } while (0)
87113 +#define pud_populate(mm, pmd, pte) BUG()
87114 +#endif
87115 +
87116 +#define check_pgt_cache() do { } while (0)
87117 +
87118 +#endif /* _I386_PGALLOC_H */
87119 diff -Nur linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/pgtable-2level-defs.h linux-2.6.16.33/include/asm-i386/mach-xen/asm/pgtable-2level-defs.h
87120 --- linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/pgtable-2level-defs.h 1970-01-01 00:00:00.000000000 +0000
87121 +++ linux-2.6.16.33/include/asm-i386/mach-xen/asm/pgtable-2level-defs.h 2007-01-08 15:00:46.000000000 +0000
87122 @@ -0,0 +1,20 @@
87123 +#ifndef _I386_PGTABLE_2LEVEL_DEFS_H
87124 +#define _I386_PGTABLE_2LEVEL_DEFS_H
87125 +
87126 +#define HAVE_SHARED_KERNEL_PMD 0
87127 +
87128 +/*
87129 + * traditional i386 two-level paging structure:
87130 + */
87131 +
87132 +#define PGDIR_SHIFT 22
87133 +#define PTRS_PER_PGD 1024
87134 +
87135 +/*
87136 + * the i386 is two-level, so we don't really have any
87137 + * PMD directory physically.
87138 + */
87139 +
87140 +#define PTRS_PER_PTE 1024
87141 +
87142 +#endif /* _I386_PGTABLE_2LEVEL_DEFS_H */
87143 diff -Nur linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/pgtable-2level.h linux-2.6.16.33/include/asm-i386/mach-xen/asm/pgtable-2level.h
87144 --- linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/pgtable-2level.h 1970-01-01 00:00:00.000000000 +0000
87145 +++ linux-2.6.16.33/include/asm-i386/mach-xen/asm/pgtable-2level.h 2007-01-08 15:00:46.000000000 +0000
87146 @@ -0,0 +1,85 @@
87147 +#ifndef _I386_PGTABLE_2LEVEL_H
87148 +#define _I386_PGTABLE_2LEVEL_H
87149 +
87150 +#include <asm-generic/pgtable-nopmd.h>
87151 +
87152 +#define pte_ERROR(e) \
87153 + printk("%s:%d: bad pte %08lx.\n", __FILE__, __LINE__, (e).pte_low)
87154 +#define pgd_ERROR(e) \
87155 + printk("%s:%d: bad pgd %08lx.\n", __FILE__, __LINE__, pgd_val(e))
87156 +
87157 +/*
87158 + * Certain architectures need to do special things when PTEs
87159 + * within a page table are directly modified. Thus, the following
87160 + * hook is made available.
87161 + */
87162 +#define set_pte(pteptr, pteval) (*(pteptr) = pteval)
87163 +
87164 +#define set_pte_at(_mm,addr,ptep,pteval) do { \
87165 + if (((_mm) != current->mm && (_mm) != &init_mm) || \
87166 + HYPERVISOR_update_va_mapping((addr), (pteval), 0)) \
87167 + set_pte((ptep), (pteval)); \
87168 +} while (0)
87169 +
87170 +#define set_pte_at_sync(_mm,addr,ptep,pteval) do { \
87171 + if (((_mm) != current->mm && (_mm) != &init_mm) || \
87172 + HYPERVISOR_update_va_mapping((addr), (pteval), UVMF_INVLPG)) { \
87173 + set_pte((ptep), (pteval)); \
87174 + xen_invlpg((addr)); \
87175 + } \
87176 +} while (0)
87177 +
87178 +#define set_pte_atomic(pteptr, pteval) set_pte(pteptr,pteval)
87179 +
87180 +#define set_pmd(pmdptr, pmdval) xen_l2_entry_update((pmdptr), (pmdval))
87181 +
87182 +#define pte_clear(mm,addr,xp) do { set_pte_at(mm, addr, xp, __pte(0)); } while (0)
87183 +#define pmd_clear(xp) do { set_pmd(xp, __pmd(0)); } while (0)
87184 +
87185 +#define ptep_get_and_clear(mm,addr,xp) __pte_ma(xchg(&(xp)->pte_low, 0))
87186 +#define pte_same(a, b) ((a).pte_low == (b).pte_low)
87187 +#define pte_mfn(_pte) ((_pte).pte_low >> PAGE_SHIFT)
87188 +#define pte_pfn(_pte) mfn_to_local_pfn(pte_mfn(_pte))
87189 +
87190 +#define pte_page(_pte) pfn_to_page(pte_pfn(_pte))
87191 +
87192 +#define pte_none(x) (!(x).pte_low)
87193 +#define pfn_pte(pfn, prot) __pte(((pfn) << PAGE_SHIFT) | pgprot_val(prot))
87194 +#define pfn_pmd(pfn, prot) __pmd(((pfn) << PAGE_SHIFT) | pgprot_val(prot))
87195 +
87196 +/*
87197 + * All present user pages are user-executable:
87198 + */
87199 +static inline int pte_exec(pte_t pte)
87200 +{
87201 + return pte_user(pte);
87202 +}
87203 +
87204 +/*
87205 + * All present pages are kernel-executable:
87206 + */
87207 +static inline int pte_exec_kernel(pte_t pte)
87208 +{
87209 + return 1;
87210 +}
87211 +
87212 +/*
87213 + * Bits 0, 6 and 7 are taken, split up the 29 bits of offset
87214 + * into this range:
87215 + */
87216 +#define PTE_FILE_MAX_BITS 29
87217 +
87218 +#define pte_to_pgoff(pte) \
87219 + ((((pte).pte_low >> 1) & 0x1f ) + (((pte).pte_low >> 8) << 5 ))
87220 +
87221 +#define pgoff_to_pte(off) \
87222 + ((pte_t) { (((off) & 0x1f) << 1) + (((off) >> 5) << 8) + _PAGE_FILE })
87223 +
87224 +/* Encode and de-code a swap entry */
87225 +#define __swp_type(x) (((x).val >> 1) & 0x1f)
87226 +#define __swp_offset(x) ((x).val >> 8)
87227 +#define __swp_entry(type, offset) ((swp_entry_t) { ((type) << 1) | ((offset) << 8) })
87228 +#define __pte_to_swp_entry(pte) ((swp_entry_t) { (pte).pte_low })
87229 +#define __swp_entry_to_pte(x) ((pte_t) { (x).val })
87230 +
87231 +#endif /* _I386_PGTABLE_2LEVEL_H */
87232 diff -Nur linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/pgtable-3level-defs.h linux-2.6.16.33/include/asm-i386/mach-xen/asm/pgtable-3level-defs.h
87233 --- linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/pgtable-3level-defs.h 1970-01-01 00:00:00.000000000 +0000
87234 +++ linux-2.6.16.33/include/asm-i386/mach-xen/asm/pgtable-3level-defs.h 2007-01-08 15:00:46.000000000 +0000
87235 @@ -0,0 +1,24 @@
87236 +#ifndef _I386_PGTABLE_3LEVEL_DEFS_H
87237 +#define _I386_PGTABLE_3LEVEL_DEFS_H
87238 +
87239 +#define HAVE_SHARED_KERNEL_PMD 0
87240 +
87241 +/*
87242 + * PGDIR_SHIFT determines what a top-level page table entry can map
87243 + */
87244 +#define PGDIR_SHIFT 30
87245 +#define PTRS_PER_PGD 4
87246 +
87247 +/*
87248 + * PMD_SHIFT determines the size of the area a middle-level
87249 + * page table can map
87250 + */
87251 +#define PMD_SHIFT 21
87252 +#define PTRS_PER_PMD 512
87253 +
87254 +/*
87255 + * entries per page directory level
87256 + */
87257 +#define PTRS_PER_PTE 512
87258 +
87259 +#endif /* _I386_PGTABLE_3LEVEL_DEFS_H */
87260 diff -Nur linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/pgtable-3level.h linux-2.6.16.33/include/asm-i386/mach-xen/asm/pgtable-3level.h
87261 --- linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/pgtable-3level.h 1970-01-01 00:00:00.000000000 +0000
87262 +++ linux-2.6.16.33/include/asm-i386/mach-xen/asm/pgtable-3level.h 2007-01-08 15:00:46.000000000 +0000
87263 @@ -0,0 +1,183 @@
87264 +#ifndef _I386_PGTABLE_3LEVEL_H
87265 +#define _I386_PGTABLE_3LEVEL_H
87266 +
87267 +#include <asm-generic/pgtable-nopud.h>
87268 +
87269 +/*
87270 + * Intel Physical Address Extension (PAE) Mode - three-level page
87271 + * tables on PPro+ CPUs.
87272 + *
87273 + * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
87274 + */
87275 +
87276 +#define pte_ERROR(e) \
87277 + printk("%s:%d: bad pte %p(%08lx%08lx).\n", __FILE__, __LINE__, &(e), (e).pte_high, (e).pte_low)
87278 +#define pmd_ERROR(e) \
87279 + printk("%s:%d: bad pmd %p(%016Lx).\n", __FILE__, __LINE__, &(e), pmd_val(e))
87280 +#define pgd_ERROR(e) \
87281 + printk("%s:%d: bad pgd %p(%016Lx).\n", __FILE__, __LINE__, &(e), pgd_val(e))
87282 +
87283 +#define pud_none(pud) 0
87284 +#define pud_bad(pud) 0
87285 +#define pud_present(pud) 1
87286 +
87287 +/*
87288 + * Is the pte executable?
87289 + */
87290 +static inline int pte_x(pte_t pte)
87291 +{
87292 + return !(pte_val(pte) & _PAGE_NX);
87293 +}
87294 +
87295 +/*
87296 + * All present user-pages with !NX bit are user-executable:
87297 + */
87298 +static inline int pte_exec(pte_t pte)
87299 +{
87300 + return pte_user(pte) && pte_x(pte);
87301 +}
87302 +/*
87303 + * All present pages with !NX bit are kernel-executable:
87304 + */
87305 +static inline int pte_exec_kernel(pte_t pte)
87306 +{
87307 + return pte_x(pte);
87308 +}
87309 +
87310 +/* Rules for using set_pte: the pte being assigned *must* be
87311 + * either not present or in a state where the hardware will
87312 + * not attempt to update the pte. In places where this is
87313 + * not possible, use pte_get_and_clear to obtain the old pte
87314 + * value and then use set_pte to update it. -ben
87315 + */
87316 +#define __HAVE_ARCH_SET_PTE_ATOMIC
87317 +
87318 +#if 1
87319 +/* use writable pagetables */
87320 +static inline void set_pte(pte_t *ptep, pte_t pte)
87321 +{
87322 + ptep->pte_high = pte.pte_high;
87323 + smp_wmb();
87324 + ptep->pte_low = pte.pte_low;
87325 +}
87326 +# define set_pte_atomic(pteptr,pteval) \
87327 + set_64bit((unsigned long long *)(pteptr),pte_val_ma(pteval))
87328 +#else
87329 +/* no writable pagetables */
87330 +# define set_pte(pteptr,pteval) \
87331 + xen_l1_entry_update((pteptr), (pteval))
87332 +# define set_pte_atomic(pteptr,pteval) set_pte(pteptr,pteval)
87333 +#endif
87334 +
87335 +#define set_pte_at(_mm,addr,ptep,pteval) do { \
87336 + if (((_mm) != current->mm && (_mm) != &init_mm) || \
87337 + HYPERVISOR_update_va_mapping((addr), (pteval), 0)) \
87338 + set_pte((ptep), (pteval)); \
87339 +} while (0)
87340 +
87341 +#define set_pte_at_sync(_mm,addr,ptep,pteval) do { \
87342 + if (((_mm) != current->mm && (_mm) != &init_mm) || \
87343 + HYPERVISOR_update_va_mapping((addr), (pteval), UVMF_INVLPG)) { \
87344 + set_pte((ptep), (pteval)); \
87345 + xen_invlpg((addr)); \
87346 + } \
87347 +} while (0)
87348 +
87349 +#define set_pmd(pmdptr,pmdval) \
87350 + xen_l2_entry_update((pmdptr), (pmdval))
87351 +#define set_pud(pudptr,pudval) \
87352 + xen_l3_entry_update((pudptr), (pudval))
87353 +
87354 +/*
87355 + * Pentium-II erratum A13: in PAE mode we explicitly have to flush
87356 + * the TLB via cr3 if the top-level pgd is changed...
87357 + * We do not let the generic code free and clear pgd entries due to
87358 + * this erratum.
87359 + */
87360 +static inline void pud_clear (pud_t * pud) { }
87361 +
87362 +#define pud_page(pud) \
87363 +((struct page *) __va(pud_val(pud) & PAGE_MASK))
87364 +
87365 +#define pud_page_kernel(pud) \
87366 +((unsigned long) __va(pud_val(pud) & PAGE_MASK))
87367 +
87368 +
87369 +/* Find an entry in the second-level page table.. */
87370 +#define pmd_offset(pud, address) ((pmd_t *) pud_page(*(pud)) + \
87371 + pmd_index(address))
87372 +
87373 +/*
87374 + * For PTEs and PDEs, we must clear the P-bit first when clearing a page table
87375 + * entry, so clear the bottom half first and enforce ordering with a compiler
87376 + * barrier.
87377 + */
87378 +static inline void pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
87379 +{
87380 + ptep->pte_low = 0;
87381 + smp_wmb();
87382 + ptep->pte_high = 0;
87383 +}
87384 +
87385 +#define pmd_clear(xp) do { set_pmd(xp, __pmd(0)); } while (0)
87386 +
87387 +static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
87388 +{
87389 + pte_t res;
87390 +
87391 + /* xchg acts as a barrier before the setting of the high bits */
87392 + res.pte_low = xchg(&ptep->pte_low, 0);
87393 + res.pte_high = ptep->pte_high;
87394 + ptep->pte_high = 0;
87395 +
87396 + return res;
87397 +}
87398 +
87399 +static inline int pte_same(pte_t a, pte_t b)
87400 +{
87401 + return a.pte_low == b.pte_low && a.pte_high == b.pte_high;
87402 +}
87403 +
87404 +#define pte_page(x) pfn_to_page(pte_pfn(x))
87405 +
87406 +static inline int pte_none(pte_t pte)
87407 +{
87408 + return !pte.pte_low && !pte.pte_high;
87409 +}
87410 +
87411 +#define pte_mfn(_pte) (((_pte).pte_low >> PAGE_SHIFT) |\
87412 + (((_pte).pte_high & 0xfff) << (32-PAGE_SHIFT)))
87413 +#define pte_pfn(_pte) mfn_to_local_pfn(pte_mfn(_pte))
87414 +
87415 +extern unsigned long long __supported_pte_mask;
87416 +
87417 +static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot)
87418 +{
87419 + return pfn_pte_ma(pfn_to_mfn(page_nr), pgprot);
87420 +}
87421 +
87422 +static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot)
87423 +{
87424 + BUG(); panic("needs review");
87425 + return __pmd((((unsigned long long)page_nr << PAGE_SHIFT) | \
87426 + pgprot_val(pgprot)) & __supported_pte_mask);
87427 +}
87428 +
87429 +/*
87430 + * Bits 0, 6 and 7 are taken in the low part of the pte,
87431 + * put the 32 bits of offset into the high part.
87432 + */
87433 +#define pte_to_pgoff(pte) ((pte).pte_high)
87434 +#define pgoff_to_pte(off) ((pte_t) { _PAGE_FILE, (off) })
87435 +#define PTE_FILE_MAX_BITS 32
87436 +
87437 +/* Encode and de-code a swap entry */
87438 +#define __swp_type(x) (((x).val) & 0x1f)
87439 +#define __swp_offset(x) ((x).val >> 5)
87440 +#define __swp_entry(type, offset) ((swp_entry_t){(type) | (offset) << 5})
87441 +#define __pte_to_swp_entry(pte) ((swp_entry_t){ (pte).pte_high })
87442 +#define __swp_entry_to_pte(x) ((pte_t){ 0, (x).val })
87443 +
87444 +#define __pmd_free_tlb(tlb, x) do { } while (0)
87445 +
87446 +#endif /* _I386_PGTABLE_3LEVEL_H */
87447 diff -Nur linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/pgtable.h linux-2.6.16.33/include/asm-i386/mach-xen/asm/pgtable.h
87448 --- linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/pgtable.h 1970-01-01 00:00:00.000000000 +0000
87449 +++ linux-2.6.16.33/include/asm-i386/mach-xen/asm/pgtable.h 2007-01-08 15:00:46.000000000 +0000
87450 @@ -0,0 +1,510 @@
87451 +#ifndef _I386_PGTABLE_H
87452 +#define _I386_PGTABLE_H
87453 +
87454 +#include <linux/config.h>
87455 +#include <asm/hypervisor.h>
87456 +
87457 +/*
87458 + * The Linux memory management assumes a three-level page table setup. On
87459 + * the i386, we use that, but "fold" the mid level into the top-level page
87460 + * table, so that we physically have the same two-level page table as the
87461 + * i386 mmu expects.
87462 + *
87463 + * This file contains the functions and defines necessary to modify and use
87464 + * the i386 page table tree.
87465 + */
87466 +#ifndef __ASSEMBLY__
87467 +#include <asm/processor.h>
87468 +#include <asm/fixmap.h>
87469 +#include <linux/threads.h>
87470 +
87471 +#ifndef _I386_BITOPS_H
87472 +#include <asm/bitops.h>
87473 +#endif
87474 +
87475 +#include <linux/slab.h>
87476 +#include <linux/list.h>
87477 +#include <linux/spinlock.h>
87478 +
87479 +struct mm_struct;
87480 +struct vm_area_struct;
87481 +
87482 +/*
87483 + * ZERO_PAGE is a global shared page that is always zero: used
87484 + * for zero-mapped memory areas etc..
87485 + */
87486 +#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page))
87487 +extern unsigned long empty_zero_page[1024];
87488 +extern pgd_t *swapper_pg_dir;
87489 +extern kmem_cache_t *pgd_cache;
87490 +extern kmem_cache_t *pmd_cache;
87491 +extern spinlock_t pgd_lock;
87492 +extern struct page *pgd_list;
87493 +
87494 +void pmd_ctor(void *, kmem_cache_t *, unsigned long);
87495 +void pgd_ctor(void *, kmem_cache_t *, unsigned long);
87496 +void pgd_dtor(void *, kmem_cache_t *, unsigned long);
87497 +void pgtable_cache_init(void);
87498 +void paging_init(void);
87499 +
87500 +/*
87501 + * The Linux x86 paging architecture is 'compile-time dual-mode', it
87502 + * implements both the traditional 2-level x86 page tables and the
87503 + * newer 3-level PAE-mode page tables.
87504 + */
87505 +#ifdef CONFIG_X86_PAE
87506 +# include <asm/pgtable-3level-defs.h>
87507 +# define PMD_SIZE (1UL << PMD_SHIFT)
87508 +# define PMD_MASK (~(PMD_SIZE-1))
87509 +#else
87510 +# include <asm/pgtable-2level-defs.h>
87511 +#endif
87512 +
87513 +#define PGDIR_SIZE (1UL << PGDIR_SHIFT)
87514 +#define PGDIR_MASK (~(PGDIR_SIZE-1))
87515 +
87516 +#define USER_PTRS_PER_PGD (TASK_SIZE/PGDIR_SIZE)
87517 +#define FIRST_USER_ADDRESS 0
87518 +
87519 +#define USER_PGD_PTRS (PAGE_OFFSET >> PGDIR_SHIFT)
87520 +#define KERNEL_PGD_PTRS (PTRS_PER_PGD-USER_PGD_PTRS)
87521 +
87522 +#define TWOLEVEL_PGDIR_SHIFT 22
87523 +#define BOOT_USER_PGD_PTRS (__PAGE_OFFSET >> TWOLEVEL_PGDIR_SHIFT)
87524 +#define BOOT_KERNEL_PGD_PTRS (1024-BOOT_USER_PGD_PTRS)
87525 +
87526 +/* Just any arbitrary offset to the start of the vmalloc VM area: the
87527 + * current 8MB value just means that there will be an 8MB "hole" after the
87528 + * physical memory until the kernel virtual memory starts. That means that
87529 + * any out-of-bounds memory accesses will hopefully be caught.
87530 + * The vmalloc() routines leave a hole of 4kB between each vmalloced
87531 + * area for the same reason. ;)
87532 + */
87533 +#define VMALLOC_OFFSET (8*1024*1024)
87534 +#define VMALLOC_START (((unsigned long) high_memory + vmalloc_earlyreserve + \
87535 + 2*VMALLOC_OFFSET-1) & ~(VMALLOC_OFFSET-1))
87536 +#ifdef CONFIG_HIGHMEM
87537 +# define VMALLOC_END (PKMAP_BASE-2*PAGE_SIZE)
87538 +#else
87539 +# define VMALLOC_END (FIXADDR_START-2*PAGE_SIZE)
87540 +#endif
87541 +
87542 +/*
87543 + * _PAGE_PSE set in the page directory entry just means that
87544 + * the page directory entry points directly to a 4MB-aligned block of
87545 + * memory.
87546 + */
87547 +#define _PAGE_BIT_PRESENT 0
87548 +#define _PAGE_BIT_RW 1
87549 +#define _PAGE_BIT_USER 2
87550 +#define _PAGE_BIT_PWT 3
87551 +#define _PAGE_BIT_PCD 4
87552 +#define _PAGE_BIT_ACCESSED 5
87553 +#define _PAGE_BIT_DIRTY 6
87554 +#define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page, Pentium+, if present.. */
87555 +#define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */
87556 +#define _PAGE_BIT_UNUSED1 9 /* available for programmer */
87557 +#define _PAGE_BIT_UNUSED2 10
87558 +#define _PAGE_BIT_UNUSED3 11
87559 +#define _PAGE_BIT_NX 63
87560 +
87561 +#define _PAGE_PRESENT 0x001
87562 +#define _PAGE_RW 0x002
87563 +#define _PAGE_USER 0x004
87564 +#define _PAGE_PWT 0x008
87565 +#define _PAGE_PCD 0x010
87566 +#define _PAGE_ACCESSED 0x020
87567 +#define _PAGE_DIRTY 0x040
87568 +#define _PAGE_PSE 0x080 /* 4 MB (or 2MB) page, Pentium+, if present.. */
87569 +#define _PAGE_GLOBAL 0x100 /* Global TLB entry PPro+ */
87570 +#define _PAGE_UNUSED1 0x200 /* available for programmer */
87571 +#define _PAGE_UNUSED2 0x400
87572 +#define _PAGE_UNUSED3 0x800
87573 +
87574 +/* If _PAGE_PRESENT is clear, we use these: */
87575 +#define _PAGE_FILE 0x040 /* nonlinear file mapping, saved PTE; unset:swap */
87576 +#define _PAGE_PROTNONE 0x080 /* if the user mapped it with PROT_NONE;
87577 + pte_present gives true */
87578 +#ifdef CONFIG_X86_PAE
87579 +#define _PAGE_NX (1ULL<<_PAGE_BIT_NX)
87580 +#else
87581 +#define _PAGE_NX 0
87582 +#endif
87583 +
87584 +#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY)
87585 +#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
87586 +#define _PAGE_CHG_MASK (PTE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY)
87587 +
87588 +#define PAGE_NONE \
87589 + __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED)
87590 +#define PAGE_SHARED \
87591 + __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED)
87592 +
87593 +#define PAGE_SHARED_EXEC \
87594 + __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED)
87595 +#define PAGE_COPY_NOEXEC \
87596 + __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
87597 +#define PAGE_COPY_EXEC \
87598 + __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
87599 +#define PAGE_COPY \
87600 + PAGE_COPY_NOEXEC
87601 +#define PAGE_READONLY \
87602 + __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
87603 +#define PAGE_READONLY_EXEC \
87604 + __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
87605 +
87606 +#define _PAGE_KERNEL \
87607 + (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_NX)
87608 +#define _PAGE_KERNEL_EXEC \
87609 + (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED)
87610 +
87611 +extern unsigned long long __PAGE_KERNEL, __PAGE_KERNEL_EXEC;
87612 +#define __PAGE_KERNEL_RO (__PAGE_KERNEL & ~_PAGE_RW)
87613 +#define __PAGE_KERNEL_NOCACHE (__PAGE_KERNEL | _PAGE_PCD)
87614 +#define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE)
87615 +#define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE)
87616 +
87617 +#define PAGE_KERNEL __pgprot(__PAGE_KERNEL)
87618 +#define PAGE_KERNEL_RO __pgprot(__PAGE_KERNEL_RO)
87619 +#define PAGE_KERNEL_EXEC __pgprot(__PAGE_KERNEL_EXEC)
87620 +#define PAGE_KERNEL_NOCACHE __pgprot(__PAGE_KERNEL_NOCACHE)
87621 +#define PAGE_KERNEL_LARGE __pgprot(__PAGE_KERNEL_LARGE)
87622 +#define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC)
87623 +
87624 +/*
87625 + * The i386 can't do page protection for execute, and treats it the
87626 + * same as read. Also, write permissions imply read permissions.
87627 + * This is the closest we can get..
87628 + */
87629 +#define __P000 PAGE_NONE
87630 +#define __P001 PAGE_READONLY
87631 +#define __P010 PAGE_COPY
87632 +#define __P011 PAGE_COPY
87633 +#define __P100 PAGE_READONLY_EXEC
87634 +#define __P101 PAGE_READONLY_EXEC
87635 +#define __P110 PAGE_COPY_EXEC
87636 +#define __P111 PAGE_COPY_EXEC
87637 +
87638 +#define __S000 PAGE_NONE
87639 +#define __S001 PAGE_READONLY
87640 +#define __S010 PAGE_SHARED
87641 +#define __S011 PAGE_SHARED
87642 +#define __S100 PAGE_READONLY_EXEC
87643 +#define __S101 PAGE_READONLY_EXEC
87644 +#define __S110 PAGE_SHARED_EXEC
87645 +#define __S111 PAGE_SHARED_EXEC
87646 +
87647 +/*
87648 + * Define this if things work differently on an i386 and an i486:
87649 + * it will (on an i486) warn about kernel memory accesses that are
87650 + * done without an 'access_ok(VERIFY_WRITE,..)'
87651 + */
87652 +#undef TEST_ACCESS_OK
87653 +
87654 +/* The boot page tables (all created as a single array) */
87655 +extern unsigned long pg0[];
87656 +
87657 +#define pte_present(x) ((x).pte_low & (_PAGE_PRESENT | _PAGE_PROTNONE))
87658 +
87659 +/* To avoid harmful races, pmd_none(x) should check only the lower word when PAE is enabled */
87660 +#define pmd_none(x) (!(unsigned long)pmd_val(x))
87661 +/* pmd_present doesn't just test the _PAGE_PRESENT bit since wr.p.t.
87662 + can temporarily clear it. */
87663 +#define pmd_present(x) (pmd_val(x))
87664 +#define pmd_bad(x) ((pmd_val(x) & (~PAGE_MASK & ~_PAGE_USER & ~_PAGE_PRESENT)) != (_KERNPG_TABLE & ~_PAGE_PRESENT))
87665 +
87666 +
87667 +#define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT))
87668 +
87669 +/*
87670 + * The following only work if pte_present() is true.
87671 + * Undefined behaviour if not..
87672 + */
87673 +#define __LARGE_PTE (_PAGE_PSE | _PAGE_PRESENT)
87674 +static inline int pte_user(pte_t pte) { return (pte).pte_low & _PAGE_USER; }
87675 +static inline int pte_read(pte_t pte) { return (pte).pte_low & _PAGE_USER; }
87676 +static inline int pte_dirty(pte_t pte) { return (pte).pte_low & _PAGE_DIRTY; }
87677 +static inline int pte_young(pte_t pte) { return (pte).pte_low & _PAGE_ACCESSED; }
87678 +static inline int pte_write(pte_t pte) { return (pte).pte_low & _PAGE_RW; }
87679 +static inline int pte_huge(pte_t pte) { return ((pte).pte_low & __LARGE_PTE) == __LARGE_PTE; }
87680 +
87681 +/*
87682 + * The following only works if pte_present() is not true.
87683 + */
87684 +static inline int pte_file(pte_t pte) { return (pte).pte_low & _PAGE_FILE; }
87685 +
87686 +static inline pte_t pte_rdprotect(pte_t pte) { (pte).pte_low &= ~_PAGE_USER; return pte; }
87687 +static inline pte_t pte_exprotect(pte_t pte) { (pte).pte_low &= ~_PAGE_USER; return pte; }
87688 +static inline pte_t pte_mkclean(pte_t pte) { (pte).pte_low &= ~_PAGE_DIRTY; return pte; }
87689 +static inline pte_t pte_mkold(pte_t pte) { (pte).pte_low &= ~_PAGE_ACCESSED; return pte; }
87690 +static inline pte_t pte_wrprotect(pte_t pte) { (pte).pte_low &= ~_PAGE_RW; return pte; }
87691 +static inline pte_t pte_mkread(pte_t pte) { (pte).pte_low |= _PAGE_USER; return pte; }
87692 +static inline pte_t pte_mkexec(pte_t pte) { (pte).pte_low |= _PAGE_USER; return pte; }
87693 +static inline pte_t pte_mkdirty(pte_t pte) { (pte).pte_low |= _PAGE_DIRTY; return pte; }
87694 +static inline pte_t pte_mkyoung(pte_t pte) { (pte).pte_low |= _PAGE_ACCESSED; return pte; }
87695 +static inline pte_t pte_mkwrite(pte_t pte) { (pte).pte_low |= _PAGE_RW; return pte; }
87696 +static inline pte_t pte_mkhuge(pte_t pte) { (pte).pte_low |= __LARGE_PTE; return pte; }
87697 +
87698 +#ifdef CONFIG_X86_PAE
87699 +# include <asm/pgtable-3level.h>
87700 +#else
87701 +# include <asm/pgtable-2level.h>
87702 +#endif
87703 +
87704 +static inline int ptep_test_and_clear_dirty(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep)
87705 +{
87706 + if (!pte_dirty(*ptep))
87707 + return 0;
87708 + return test_and_clear_bit(_PAGE_BIT_DIRTY, &ptep->pte_low);
87709 +}
87710 +
87711 +static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep)
87712 +{
87713 + if (!pte_young(*ptep))
87714 + return 0;
87715 + return test_and_clear_bit(_PAGE_BIT_ACCESSED, &ptep->pte_low);
87716 +}
87717 +
87718 +static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, unsigned long addr, pte_t *ptep, int full)
87719 +{
87720 + pte_t pte;
87721 + if (full) {
87722 + pte = *ptep;
87723 + pte_clear(mm, addr, ptep);
87724 + } else {
87725 + pte = ptep_get_and_clear(mm, addr, ptep);
87726 + }
87727 + return pte;
87728 +}
87729 +
87730 +static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
87731 +{
87732 + if (pte_write(*ptep))
87733 + clear_bit(_PAGE_BIT_RW, &ptep->pte_low);
87734 +}
87735 +
87736 +/*
87737 + * clone_pgd_range(pgd_t *dst, pgd_t *src, int count);
87738 + *
87739 + * dst - pointer to pgd range anywhere on a pgd page
87740 + * src - ""
87741 + * count - the number of pgds to copy.
87742 + *
87743 + * dst and src can be on the same page, but the range must not overlap,
87744 + * and must not cross a page boundary.
87745 + */
87746 +static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
87747 +{
87748 + memcpy(dst, src, count * sizeof(pgd_t));
87749 +}
87750 +
87751 +/*
87752 + * Macro to mark a page protection value as "uncacheable". On processors which do not support
87753 + * it, this is a no-op.
87754 + */
87755 +#define pgprot_noncached(prot) ((boot_cpu_data.x86 > 3) \
87756 + ? (__pgprot(pgprot_val(prot) | _PAGE_PCD | _PAGE_PWT)) : (prot))
87757 +
87758 +/*
87759 + * Conversion functions: convert a page and protection to a page entry,
87760 + * and a page entry and page directory to the page they refer to.
87761 + */
87762 +
87763 +#define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot))
87764 +
87765 +static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
87766 +{
87767 + pte.pte_low &= _PAGE_CHG_MASK;
87768 + pte.pte_low |= pgprot_val(newprot);
87769 +#ifdef CONFIG_X86_PAE
87770 + /*
87771 + * Chop off the NX bit (if present), and add the NX portion of
87772 + * the newprot (if present):
87773 + */
87774 + pte.pte_high &= ~(1 << (_PAGE_BIT_NX - 32));
87775 + pte.pte_high |= (pgprot_val(newprot) >> 32) & \
87776 + (__supported_pte_mask >> 32);
87777 +#endif
87778 + return pte;
87779 +}
87780 +
87781 +#define pmd_large(pmd) \
87782 +((pmd_val(pmd) & (_PAGE_PSE|_PAGE_PRESENT)) == (_PAGE_PSE|_PAGE_PRESENT))
87783 +
87784 +/*
87785 + * the pgd page can be thought of as an array like this: pgd_t[PTRS_PER_PGD]
87786 + *
87787 + * this macro returns the index of the entry in the pgd page which would
87788 + * control the given virtual address
87789 + */
87790 +#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD-1))
87791 +#define pgd_index_k(addr) pgd_index(addr)
87792 +
87793 +/*
87794 + * pgd_offset() returns a (pgd_t *)
87795 + * pgd_index() is used to get the offset into the pgd page's array of pgd_t's;
87796 + */
87797 +#define pgd_offset(mm, address) ((mm)->pgd+pgd_index(address))
87798 +
87799 +/*
87800 + * a shortcut which implies the use of the kernel's pgd, instead
87801 + * of a process's
87802 + */
87803 +#define pgd_offset_k(address) pgd_offset(&init_mm, address)
87804 +
87805 +/*
87806 + * the pmd page can be thought of as an array like this: pmd_t[PTRS_PER_PMD]
87807 + *
87808 + * this macro returns the index of the entry in the pmd page which would
87809 + * control the given virtual address
87810 + */
87811 +#define pmd_index(address) \
87812 + (((address) >> PMD_SHIFT) & (PTRS_PER_PMD-1))
87813 +
87814 +/*
87815 + * the pte page can be thought of as an array like this: pte_t[PTRS_PER_PTE]
87816 + *
87817 + * this macro returns the index of the entry in the pte page which would
87818 + * control the given virtual address
87819 + */
87820 +#define pte_index(address) \
87821 + (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
87822 +#define pte_offset_kernel(dir, address) \
87823 + ((pte_t *) pmd_page_kernel(*(dir)) + pte_index(address))
87824 +
87825 +#define pmd_page(pmd) (pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT))
87826 +
87827 +#define pmd_page_kernel(pmd) \
87828 + ((unsigned long) __va(pmd_val(pmd) & PAGE_MASK))
87829 +
87830 +/*
87831 + * Helper function that returns the kernel pagetable entry controlling
87832 + * the virtual address 'address'. NULL means no pagetable entry present.
87833 + * NOTE: the return type is pte_t but if the pmd is PSE then we return it
87834 + * as a pte too.
87835 + */
87836 +extern pte_t *lookup_address(unsigned long address);
87837 +
87838 +/*
87839 + * Make a given kernel text page executable/non-executable.
87840 + * Returns the previous executability setting of that page (which
87841 + * is used to restore the previous state). Used by the SMP bootup code.
87842 + * NOTE: this is an __init function for security reasons.
87843 + */
87844 +#ifdef CONFIG_X86_PAE
87845 + extern int set_kernel_exec(unsigned long vaddr, int enable);
87846 +#else
87847 + static inline int set_kernel_exec(unsigned long vaddr, int enable) { return 0;}
87848 +#endif
87849 +
87850 +extern void noexec_setup(const char *str);
87851 +
87852 +#if defined(CONFIG_HIGHPTE)
87853 +#define pte_offset_map(dir, address) \
87854 + ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)),KM_PTE0) + \
87855 + pte_index(address))
87856 +#define pte_offset_map_nested(dir, address) \
87857 + ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)),KM_PTE1) + \
87858 + pte_index(address))
87859 +#define pte_unmap(pte) kunmap_atomic(pte, KM_PTE0)
87860 +#define pte_unmap_nested(pte) kunmap_atomic(pte, KM_PTE1)
87861 +#else
87862 +#define pte_offset_map(dir, address) \
87863 + ((pte_t *)page_address(pmd_page(*(dir))) + pte_index(address))
87864 +#define pte_offset_map_nested(dir, address) pte_offset_map(dir, address)
87865 +#define pte_unmap(pte) do { } while (0)
87866 +#define pte_unmap_nested(pte) do { } while (0)
87867 +#endif
87868 +
87869 +/*
87870 + * The i386 doesn't have any external MMU info: the kernel page
87871 + * tables contain all the necessary information.
87872 + *
87873 + * Also, we only update the dirty/accessed state if we set
87874 + * the dirty bit by hand in the kernel, since the hardware
87875 + * will do the accessed bit for us, and we don't want to
87876 + * race with other CPUs that might be updating the dirty
87877 + * bit at the same time.
87878 + */
87879 +#define update_mmu_cache(vma,address,pte) do { } while (0)
87880 +#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
87881 +#define ptep_set_access_flags(__vma, __address, __ptep, __entry, __dirty) \
87882 + do { \
87883 + if (__dirty) { \
87884 + if ( likely((__vma)->vm_mm == current->mm) ) { \
87885 + BUG_ON(HYPERVISOR_update_va_mapping((__address), (__entry), UVMF_INVLPG|UVMF_MULTI|(unsigned long)((__vma)->vm_mm->cpu_vm_mask.bits))); \
87886 + } else { \
87887 + xen_l1_entry_update((__ptep), (__entry)); \
87888 + flush_tlb_page((__vma), (__address)); \
87889 + } \
87890 + } \
87891 + } while (0)
87892 +
87893 +#define __HAVE_ARCH_PTEP_ESTABLISH
87894 +#define ptep_establish(__vma, __address, __ptep, __entry) \
87895 +do { \
87896 + ptep_set_access_flags(__vma, __address, __ptep, __entry, 1); \
87897 +} while (0)
87898 +
87899 +#include <xen/features.h>
87900 +void make_lowmem_page_readonly(void *va, unsigned int feature);
87901 +void make_lowmem_page_writable(void *va, unsigned int feature);
87902 +void make_page_readonly(void *va, unsigned int feature);
87903 +void make_page_writable(void *va, unsigned int feature);
87904 +void make_pages_readonly(void *va, unsigned int nr, unsigned int feature);
87905 +void make_pages_writable(void *va, unsigned int nr, unsigned int feature);
87906 +
87907 +#define virt_to_ptep(__va) \
87908 +({ \
87909 + pgd_t *__pgd = pgd_offset_k((unsigned long)(__va)); \
87910 + pud_t *__pud = pud_offset(__pgd, (unsigned long)(__va)); \
87911 + pmd_t *__pmd = pmd_offset(__pud, (unsigned long)(__va)); \
87912 + pte_offset_kernel(__pmd, (unsigned long)(__va)); \
87913 +})
87914 +
87915 +#define arbitrary_virt_to_machine(__va) \
87916 +({ \
87917 + maddr_t m = (maddr_t)pte_mfn(*virt_to_ptep(__va)) << PAGE_SHIFT;\
87918 + m | ((unsigned long)(__va) & (PAGE_SIZE-1)); \
87919 +})
87920 +
87921 +#endif /* !__ASSEMBLY__ */
87922 +
87923 +#ifdef CONFIG_FLATMEM
87924 +#define kern_addr_valid(addr) (1)
87925 +#endif /* CONFIG_FLATMEM */
87926 +
87927 +int direct_remap_pfn_range(struct vm_area_struct *vma,
87928 + unsigned long address,
87929 + unsigned long mfn,
87930 + unsigned long size,
87931 + pgprot_t prot,
87932 + domid_t domid);
87933 +int direct_kernel_remap_pfn_range(unsigned long address,
87934 + unsigned long mfn,
87935 + unsigned long size,
87936 + pgprot_t prot,
87937 + domid_t domid);
87938 +int create_lookup_pte_addr(struct mm_struct *mm,
87939 + unsigned long address,
87940 + uint64_t *ptep);
87941 +int touch_pte_range(struct mm_struct *mm,
87942 + unsigned long address,
87943 + unsigned long size);
87944 +
87945 +#define io_remap_pfn_range(vma,from,pfn,size,prot) \
87946 +direct_remap_pfn_range(vma,from,pfn,size,prot,DOMID_IO)
87947 +
87948 +#define MK_IOSPACE_PFN(space, pfn) (pfn)
87949 +#define GET_IOSPACE(pfn) 0
87950 +#define GET_PFN(pfn) (pfn)
87951 +
87952 +#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
87953 +#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_DIRTY
87954 +#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
87955 +#define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
87956 +#define __HAVE_ARCH_PTEP_SET_WRPROTECT
87957 +#define __HAVE_ARCH_PTE_SAME
87958 +#include <asm-generic/pgtable.h>
87959 +
87960 +#endif /* _I386_PGTABLE_H */
87961 diff -Nur linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/processor.h linux-2.6.16.33/include/asm-i386/mach-xen/asm/processor.h
87962 --- linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/processor.h 1970-01-01 00:00:00.000000000 +0000
87963 +++ linux-2.6.16.33/include/asm-i386/mach-xen/asm/processor.h 2007-01-08 15:00:46.000000000 +0000
87964 @@ -0,0 +1,750 @@
87965 +/*
87966 + * include/asm-i386/processor.h
87967 + *
87968 + * Copyright (C) 1994 Linus Torvalds
87969 + */
87970 +
87971 +#ifndef __ASM_I386_PROCESSOR_H
87972 +#define __ASM_I386_PROCESSOR_H
87973 +
87974 +#include <asm/vm86.h>
87975 +#include <asm/math_emu.h>
87976 +#include <asm/segment.h>
87977 +#include <asm/page.h>
87978 +#include <asm/types.h>
87979 +#include <asm/sigcontext.h>
87980 +#include <asm/cpufeature.h>
87981 +#include <asm/msr.h>
87982 +#include <asm/system.h>
87983 +#include <linux/cache.h>
87984 +#include <linux/config.h>
87985 +#include <linux/threads.h>
87986 +#include <asm/percpu.h>
87987 +#include <xen/interface/physdev.h>
87988 +
87989 +/* flag for disabling the tsc */
87990 +extern int tsc_disable;
87991 +
87992 +struct desc_struct {
87993 + unsigned long a,b;
87994 +};
87995 +
87996 +#define desc_empty(desc) \
87997 + (!((desc)->a | (desc)->b))
87998 +
87999 +#define desc_equal(desc1, desc2) \
88000 + (((desc1)->a == (desc2)->a) && ((desc1)->b == (desc2)->b))
88001 +/*
88002 + * Default implementation of macro that returns current
88003 + * instruction pointer ("program counter").
88004 + */
88005 +#define current_text_addr() ({ void *pc; __asm__("movl $1f,%0\n1:":"=g" (pc)); pc; })
88006 +
88007 +/*
88008 + * CPU type and hardware bug flags. Kept separately for each CPU.
88009 + * Members of this structure are referenced in head.S, so think twice
88010 + * before touching them. [mj]
88011 + */
88012 +
88013 +struct cpuinfo_x86 {
88014 + __u8 x86; /* CPU family */
88015 + __u8 x86_vendor; /* CPU vendor */
88016 + __u8 x86_model;
88017 + __u8 x86_mask;
88018 + char wp_works_ok; /* It doesn't on 386's */
88019 + char hlt_works_ok; /* Problems on some 486Dx4's and old 386's */
88020 + char hard_math;
88021 + char rfu;
88022 + int cpuid_level; /* Maximum supported CPUID level, -1=no CPUID */
88023 + unsigned long x86_capability[NCAPINTS];
88024 + char x86_vendor_id[16];
88025 + char x86_model_id[64];
88026 + int x86_cache_size; /* in KB - valid for CPUs which support this
88027 + call */
88028 + int x86_cache_alignment; /* In bytes */
88029 + char fdiv_bug;
88030 + char f00f_bug;
88031 + char coma_bug;
88032 + char pad0;
88033 + int x86_power;
88034 + unsigned long loops_per_jiffy;
88035 + unsigned char x86_max_cores; /* cpuid returned max cores value */
88036 + unsigned char booted_cores; /* number of cores as seen by OS */
88037 + unsigned char apicid;
88038 +} __attribute__((__aligned__(SMP_CACHE_BYTES)));
88039 +
88040 +#define X86_VENDOR_INTEL 0
88041 +#define X86_VENDOR_CYRIX 1
88042 +#define X86_VENDOR_AMD 2
88043 +#define X86_VENDOR_UMC 3
88044 +#define X86_VENDOR_NEXGEN 4
88045 +#define X86_VENDOR_CENTAUR 5
88046 +#define X86_VENDOR_RISE 6
88047 +#define X86_VENDOR_TRANSMETA 7
88048 +#define X86_VENDOR_NSC 8
88049 +#define X86_VENDOR_NUM 9
88050 +#define X86_VENDOR_UNKNOWN 0xff
88051 +
88052 +/*
88053 + * capabilities of CPUs
88054 + */
88055 +
88056 +extern struct cpuinfo_x86 boot_cpu_data;
88057 +extern struct cpuinfo_x86 new_cpu_data;
88058 +#ifndef CONFIG_X86_NO_TSS
88059 +extern struct tss_struct doublefault_tss;
88060 +DECLARE_PER_CPU(struct tss_struct, init_tss);
88061 +#endif
88062 +
88063 +#ifdef CONFIG_SMP
88064 +extern struct cpuinfo_x86 cpu_data[];
88065 +#define current_cpu_data cpu_data[smp_processor_id()]
88066 +#else
88067 +#define cpu_data (&boot_cpu_data)
88068 +#define current_cpu_data boot_cpu_data
88069 +#endif
88070 +
88071 +extern int phys_proc_id[NR_CPUS];
88072 +extern int cpu_core_id[NR_CPUS];
88073 +extern char ignore_fpu_irq;
88074 +
88075 +extern void identify_cpu(struct cpuinfo_x86 *);
88076 +extern void print_cpu_info(struct cpuinfo_x86 *);
88077 +extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c);
88078 +
88079 +#ifdef CONFIG_X86_HT
88080 +extern void detect_ht(struct cpuinfo_x86 *c);
88081 +#else
88082 +static inline void detect_ht(struct cpuinfo_x86 *c) {}
88083 +#endif
88084 +
88085 +/*
88086 + * EFLAGS bits
88087 + */
88088 +#define X86_EFLAGS_CF 0x00000001 /* Carry Flag */
88089 +#define X86_EFLAGS_PF 0x00000004 /* Parity Flag */
88090 +#define X86_EFLAGS_AF 0x00000010 /* Auxiliary carry Flag */
88091 +#define X86_EFLAGS_ZF 0x00000040 /* Zero Flag */
88092 +#define X86_EFLAGS_SF 0x00000080 /* Sign Flag */
88093 +#define X86_EFLAGS_TF 0x00000100 /* Trap Flag */
88094 +#define X86_EFLAGS_IF 0x00000200 /* Interrupt Flag */
88095 +#define X86_EFLAGS_DF 0x00000400 /* Direction Flag */
88096 +#define X86_EFLAGS_OF 0x00000800 /* Overflow Flag */
88097 +#define X86_EFLAGS_IOPL 0x00003000 /* IOPL mask */
88098 +#define X86_EFLAGS_NT 0x00004000 /* Nested Task */
88099 +#define X86_EFLAGS_RF 0x00010000 /* Resume Flag */
88100 +#define X86_EFLAGS_VM 0x00020000 /* Virtual Mode */
88101 +#define X86_EFLAGS_AC 0x00040000 /* Alignment Check */
88102 +#define X86_EFLAGS_VIF 0x00080000 /* Virtual Interrupt Flag */
88103 +#define X86_EFLAGS_VIP 0x00100000 /* Virtual Interrupt Pending */
88104 +#define X86_EFLAGS_ID 0x00200000 /* CPUID detection flag */
88105 +
88106 +/*
88107 + * Generic CPUID function
88108 + * clear %ecx since some cpus (Cyrix MII) do not set or clear %ecx
88109 + * resulting in stale register contents being returned.
88110 + */
88111 +static inline void cpuid(unsigned int op, unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx)
88112 +{
88113 + __asm__(XEN_CPUID
88114 + : "=a" (*eax),
88115 + "=b" (*ebx),
88116 + "=c" (*ecx),
88117 + "=d" (*edx)
88118 + : "0" (op), "c"(0));
88119 +}
88120 +
88121 +/* Some CPUID calls want 'count' to be placed in ecx */
88122 +static inline void cpuid_count(int op, int count, int *eax, int *ebx, int *ecx,
88123 + int *edx)
88124 +{
88125 + __asm__(XEN_CPUID
88126 + : "=a" (*eax),
88127 + "=b" (*ebx),
88128 + "=c" (*ecx),
88129 + "=d" (*edx)
88130 + : "0" (op), "c" (count));
88131 +}
88132 +
88133 +/*
88134 + * CPUID functions returning a single datum
88135 + */
88136 +static inline unsigned int cpuid_eax(unsigned int op)
88137 +{
88138 + unsigned int eax;
88139 +
88140 + __asm__(XEN_CPUID
88141 + : "=a" (eax)
88142 + : "0" (op)
88143 + : "bx", "cx", "dx");
88144 + return eax;
88145 +}
88146 +static inline unsigned int cpuid_ebx(unsigned int op)
88147 +{
88148 + unsigned int eax, ebx;
88149 +
88150 + __asm__(XEN_CPUID
88151 + : "=a" (eax), "=b" (ebx)
88152 + : "0" (op)
88153 + : "cx", "dx" );
88154 + return ebx;
88155 +}
88156 +static inline unsigned int cpuid_ecx(unsigned int op)
88157 +{
88158 + unsigned int eax, ecx;
88159 +
88160 + __asm__(XEN_CPUID
88161 + : "=a" (eax), "=c" (ecx)
88162 + : "0" (op)
88163 + : "bx", "dx" );
88164 + return ecx;
88165 +}
88166 +static inline unsigned int cpuid_edx(unsigned int op)
88167 +{
88168 + unsigned int eax, edx;
88169 +
88170 + __asm__(XEN_CPUID
88171 + : "=a" (eax), "=d" (edx)
88172 + : "0" (op)
88173 + : "bx", "cx");
88174 + return edx;
88175 +}
88176 +
88177 +#define load_cr3(pgdir) write_cr3(__pa(pgdir))
88178 +
88179 +/*
88180 + * Intel CPU features in CR4
88181 + */
88182 +#define X86_CR4_VME 0x0001 /* enable vm86 extensions */
88183 +#define X86_CR4_PVI 0x0002 /* virtual interrupts flag enable */
88184 +#define X86_CR4_TSD 0x0004 /* disable time stamp at ipl 3 */
88185 +#define X86_CR4_DE 0x0008 /* enable debugging extensions */
88186 +#define X86_CR4_PSE 0x0010 /* enable page size extensions */
88187 +#define X86_CR4_PAE 0x0020 /* enable physical address extensions */
88188 +#define X86_CR4_MCE 0x0040 /* Machine check enable */
88189 +#define X86_CR4_PGE 0x0080 /* enable global pages */
88190 +#define X86_CR4_PCE 0x0100 /* enable performance counters at ipl 3 */
88191 +#define X86_CR4_OSFXSR 0x0200 /* enable fast FPU save and restore */
88192 +#define X86_CR4_OSXMMEXCPT 0x0400 /* enable unmasked SSE exceptions */
88193 +
88194 +/*
88195 + * Save the cr4 feature set we're using (i.e.
88196 + * Pentium 4MB enable and PPro Global page
88197 + * enable), so that any CPUs that boot up
88198 + * after us can get the correct flags.
88199 + */
88200 +extern unsigned long mmu_cr4_features;
88201 +
88202 +static inline void set_in_cr4 (unsigned long mask)
88203 +{
88204 + unsigned cr4;
88205 + mmu_cr4_features |= mask;
88206 + cr4 = read_cr4();
88207 + cr4 |= mask;
88208 + write_cr4(cr4);
88209 +}
88210 +
88211 +static inline void clear_in_cr4 (unsigned long mask)
88212 +{
88213 + unsigned cr4;
88214 + mmu_cr4_features &= ~mask;
88215 + cr4 = read_cr4();
88216 + cr4 &= ~mask;
88217 + write_cr4(cr4);
88218 +}
88219 +
88220 +/*
88221 + * NSC/Cyrix CPU configuration register indexes
88222 + */
88223 +
88224 +#define CX86_PCR0 0x20
88225 +#define CX86_GCR 0xb8
88226 +#define CX86_CCR0 0xc0
88227 +#define CX86_CCR1 0xc1
88228 +#define CX86_CCR2 0xc2
88229 +#define CX86_CCR3 0xc3
88230 +#define CX86_CCR4 0xe8
88231 +#define CX86_CCR5 0xe9
88232 +#define CX86_CCR6 0xea
88233 +#define CX86_CCR7 0xeb
88234 +#define CX86_PCR1 0xf0
88235 +#define CX86_DIR0 0xfe
88236 +#define CX86_DIR1 0xff
88237 +#define CX86_ARR_BASE 0xc4
88238 +#define CX86_RCR_BASE 0xdc
88239 +
88240 +/*
88241 + * NSC/Cyrix CPU indexed register access macros
88242 + */
88243 +
88244 +#define getCx86(reg) ({ outb((reg), 0x22); inb(0x23); })
88245 +
88246 +#define setCx86(reg, data) do { \
88247 + outb((reg), 0x22); \
88248 + outb((data), 0x23); \
88249 +} while (0)
88250 +
88251 +/* Stop speculative execution */
88252 +static inline void sync_core(void)
88253 +{
88254 + int tmp;
88255 + asm volatile("cpuid" : "=a" (tmp) : "0" (1) : "ebx","ecx","edx","memory");
88256 +}
88257 +
88258 +static inline void __monitor(const void *eax, unsigned long ecx,
88259 + unsigned long edx)
88260 +{
88261 + /* "monitor %eax,%ecx,%edx;" */
88262 + asm volatile(
88263 + ".byte 0x0f,0x01,0xc8;"
88264 + : :"a" (eax), "c" (ecx), "d"(edx));
88265 +}
88266 +
88267 +static inline void __mwait(unsigned long eax, unsigned long ecx)
88268 +{
88269 + /* "mwait %eax,%ecx;" */
88270 + asm volatile(
88271 + ".byte 0x0f,0x01,0xc9;"
88272 + : :"a" (eax), "c" (ecx));
88273 +}
88274 +
88275 +/* from system description table in BIOS. Mostly for MCA use, but
88276 +others may find it useful. */
88277 +extern unsigned int machine_id;
88278 +extern unsigned int machine_submodel_id;
88279 +extern unsigned int BIOS_revision;
88280 +extern unsigned int mca_pentium_flag;
88281 +
88282 +/* Boot loader type from the setup header */
88283 +extern int bootloader_type;
88284 +
88285 +/*
88286 + * User space process size: 3GB (default).
88287 + */
88288 +#define TASK_SIZE (PAGE_OFFSET)
88289 +
88290 +/* This decides where the kernel will search for a free chunk of vm
88291 + * space during mmap's.
88292 + */
88293 +#define TASK_UNMAPPED_BASE (PAGE_ALIGN(TASK_SIZE / 3))
88294 +
88295 +#define HAVE_ARCH_PICK_MMAP_LAYOUT
88296 +
88297 +/*
88298 + * Size of io_bitmap.
88299 + */
88300 +#define IO_BITMAP_BITS 65536
88301 +#define IO_BITMAP_BYTES (IO_BITMAP_BITS/8)
88302 +#define IO_BITMAP_LONGS (IO_BITMAP_BYTES/sizeof(long))
88303 +#ifndef CONFIG_X86_NO_TSS
88304 +#define IO_BITMAP_OFFSET offsetof(struct tss_struct,io_bitmap)
88305 +#endif
88306 +#define INVALID_IO_BITMAP_OFFSET 0x8000
88307 +#define INVALID_IO_BITMAP_OFFSET_LAZY 0x9000
88308 +
88309 +struct i387_fsave_struct {
88310 + long cwd;
88311 + long swd;
88312 + long twd;
88313 + long fip;
88314 + long fcs;
88315 + long foo;
88316 + long fos;
88317 + long st_space[20]; /* 8*10 bytes for each FP-reg = 80 bytes */
88318 + long status; /* software status information */
88319 +};
88320 +
88321 +struct i387_fxsave_struct {
88322 + unsigned short cwd;
88323 + unsigned short swd;
88324 + unsigned short twd;
88325 + unsigned short fop;
88326 + long fip;
88327 + long fcs;
88328 + long foo;
88329 + long fos;
88330 + long mxcsr;
88331 + long mxcsr_mask;
88332 + long st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */
88333 + long xmm_space[32]; /* 8*16 bytes for each XMM-reg = 128 bytes */
88334 + long padding[56];
88335 +} __attribute__ ((aligned (16)));
88336 +
88337 +struct i387_soft_struct {
88338 + long cwd;
88339 + long swd;
88340 + long twd;
88341 + long fip;
88342 + long fcs;
88343 + long foo;
88344 + long fos;
88345 + long st_space[20]; /* 8*10 bytes for each FP-reg = 80 bytes */
88346 + unsigned char ftop, changed, lookahead, no_update, rm, alimit;
88347 + struct info *info;
88348 + unsigned long entry_eip;
88349 +};
88350 +
88351 +union i387_union {
88352 + struct i387_fsave_struct fsave;
88353 + struct i387_fxsave_struct fxsave;
88354 + struct i387_soft_struct soft;
88355 +};
88356 +
88357 +typedef struct {
88358 + unsigned long seg;
88359 +} mm_segment_t;
88360 +
88361 +struct thread_struct;
88362 +
88363 +#ifndef CONFIG_X86_NO_TSS
88364 +struct tss_struct {
88365 + unsigned short back_link,__blh;
88366 + unsigned long esp0;
88367 + unsigned short ss0,__ss0h;
88368 + unsigned long esp1;
88369 + unsigned short ss1,__ss1h; /* ss1 is used to cache MSR_IA32_SYSENTER_CS */
88370 + unsigned long esp2;
88371 + unsigned short ss2,__ss2h;
88372 + unsigned long __cr3;
88373 + unsigned long eip;
88374 + unsigned long eflags;
88375 + unsigned long eax,ecx,edx,ebx;
88376 + unsigned long esp;
88377 + unsigned long ebp;
88378 + unsigned long esi;
88379 + unsigned long edi;
88380 + unsigned short es, __esh;
88381 + unsigned short cs, __csh;
88382 + unsigned short ss, __ssh;
88383 + unsigned short ds, __dsh;
88384 + unsigned short fs, __fsh;
88385 + unsigned short gs, __gsh;
88386 + unsigned short ldt, __ldth;
88387 + unsigned short trace, io_bitmap_base;
88388 + /*
88389 + * The extra 1 is there because the CPU will access an
88390 + * additional byte beyond the end of the IO permission
88391 + * bitmap. The extra byte must be all 1 bits, and must
88392 + * be within the limit.
88393 + */
88394 + unsigned long io_bitmap[IO_BITMAP_LONGS + 1];
88395 + /*
88396 + * Cache the current maximum and the last task that used the bitmap:
88397 + */
88398 + unsigned long io_bitmap_max;
88399 + struct thread_struct *io_bitmap_owner;
88400 + /*
88401 + * pads the TSS to be cacheline-aligned (size is 0x100)
88402 + */
88403 + unsigned long __cacheline_filler[35];
88404 + /*
88405 + * .. and then another 0x100 bytes for emergency kernel stack
88406 + */
88407 + unsigned long stack[64];
88408 +} __attribute__((packed));
88409 +#endif
88410 +
88411 +#define ARCH_MIN_TASKALIGN 16
88412 +
88413 +struct thread_struct {
88414 +/* cached TLS descriptors. */
88415 + struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES];
88416 + unsigned long esp0;
88417 + unsigned long sysenter_cs;
88418 + unsigned long eip;
88419 + unsigned long esp;
88420 + unsigned long fs;
88421 + unsigned long gs;
88422 +/* Hardware debugging registers */
88423 + unsigned long debugreg[8]; /* %%db0-7 debug registers */
88424 +/* fault info */
88425 + unsigned long cr2, trap_no, error_code;
88426 +/* floating point info */
88427 + union i387_union i387;
88428 +/* virtual 86 mode info */
88429 + struct vm86_struct __user * vm86_info;
88430 + unsigned long screen_bitmap;
88431 + unsigned long v86flags, v86mask, saved_esp0;
88432 + unsigned int saved_fs, saved_gs;
88433 +/* IO permissions */
88434 + unsigned long *io_bitmap_ptr;
88435 + unsigned long iopl;
88436 +/* max allowed port in the bitmap, in bytes: */
88437 + unsigned long io_bitmap_max;
88438 +};
88439 +
88440 +#define INIT_THREAD { \
88441 + .vm86_info = NULL, \
88442 + .sysenter_cs = __KERNEL_CS, \
88443 + .io_bitmap_ptr = NULL, \
88444 +}
88445 +
88446 +#ifndef CONFIG_X86_NO_TSS
88447 +/*
88448 + * Note that the .io_bitmap member must be extra-big. This is because
88449 + * the CPU will access an additional byte beyond the end of the IO
88450 + * permission bitmap. The extra byte must be all 1 bits, and must
88451 + * be within the limit.
88452 + */
88453 +#define INIT_TSS { \
88454 + .esp0 = sizeof(init_stack) + (long)&init_stack, \
88455 + .ss0 = __KERNEL_DS, \
88456 + .ss1 = __KERNEL_CS, \
88457 + .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, \
88458 + .io_bitmap = { [ 0 ... IO_BITMAP_LONGS] = ~0 }, \
88459 +}
88460 +
88461 +static inline void __load_esp0(struct tss_struct *tss, struct thread_struct *thread)
88462 +{
88463 + tss->esp0 = thread->esp0;
88464 + /* This can only happen when SEP is enabled, no need to test "SEP"arately */
88465 + if (unlikely(tss->ss1 != thread->sysenter_cs)) {
88466 + tss->ss1 = thread->sysenter_cs;
88467 + wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
88468 + }
88469 +}
88470 +#define load_esp0(tss, thread) \
88471 + __load_esp0(tss, thread)
88472 +#else
88473 +#define load_esp0(tss, thread) \
88474 + HYPERVISOR_stack_switch(__KERNEL_DS, (thread)->esp0)
88475 +#endif
88476 +
88477 +#define start_thread(regs, new_eip, new_esp) do { \
88478 + __asm__("movl %0,%%fs ; movl %0,%%gs": :"r" (0)); \
88479 + set_fs(USER_DS); \
88480 + regs->xds = __USER_DS; \
88481 + regs->xes = __USER_DS; \
88482 + regs->xss = __USER_DS; \
88483 + regs->xcs = __USER_CS; \
88484 + regs->eip = new_eip; \
88485 + regs->esp = new_esp; \
88486 +} while (0)
88487 +
88488 +/*
88489 + * These special macros can be used to get or set a debugging register
88490 + */
88491 +#define get_debugreg(var, register) \
88492 + (var) = HYPERVISOR_get_debugreg((register))
88493 +#define set_debugreg(value, register) \
88494 + HYPERVISOR_set_debugreg((register), (value))
88495 +
88496 +/*
88497 + * Set IOPL bits in EFLAGS from given mask
88498 + */
88499 +static inline void set_iopl_mask(unsigned mask)
88500 +{
88501 + struct physdev_set_iopl set_iopl;
88502 +
88503 + /* Force the change at ring 0. */
88504 + set_iopl.iopl = (mask == 0) ? 1 : (mask >> 12) & 3;
88505 + HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
88506 +}
88507 +
88508 +/* Forward declaration, a strange C thing */
88509 +struct task_struct;
88510 +struct mm_struct;
88511 +
88512 +/* Free all resources held by a thread. */
88513 +extern void release_thread(struct task_struct *);
88514 +
88515 +/* Prepare to copy thread state - unlazy all lazy status */
88516 +extern void prepare_to_copy(struct task_struct *tsk);
88517 +
88518 +/*
88519 + * create a kernel thread without removing it from tasklists
88520 + */
88521 +extern int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags);
88522 +
88523 +extern unsigned long thread_saved_pc(struct task_struct *tsk);
88524 +void show_trace(struct task_struct *task, unsigned long *stack);
88525 +
88526 +unsigned long get_wchan(struct task_struct *p);
88527 +
88528 +#define THREAD_SIZE_LONGS (THREAD_SIZE/sizeof(unsigned long))
88529 +#define KSTK_TOP(info) \
88530 +({ \
88531 + unsigned long *__ptr = (unsigned long *)(info); \
88532 + (unsigned long)(&__ptr[THREAD_SIZE_LONGS]); \
88533 +})
88534 +
88535 +/*
88536 + * The below -8 is to reserve 8 bytes on top of the ring0 stack.
88537 + * This is necessary to guarantee that the entire "struct pt_regs"
88538 + * is accessible even if the CPU hasn't stored the SS/ESP registers
88539 + * on the stack (interrupt gate does not save these registers
88540 + * when switching to the same priv ring).
88541 + * Therefore beware: accessing the xss/esp fields of the
88542 + * "struct pt_regs" is possible, but they may contain the
88543 + * completely wrong values.
88544 + */
88545 +#define task_pt_regs(task) \
88546 +({ \
88547 + struct pt_regs *__regs__; \
88548 + __regs__ = (struct pt_regs *)(KSTK_TOP(task_stack_page(task))-8); \
88549 + __regs__ - 1; \
88550 +})
88551 +
88552 +#define KSTK_EIP(task) (task_pt_regs(task)->eip)
88553 +#define KSTK_ESP(task) (task_pt_regs(task)->esp)
88554 +
88555 +
88556 +struct microcode_header {
88557 + unsigned int hdrver;
88558 + unsigned int rev;
88559 + unsigned int date;
88560 + unsigned int sig;
88561 + unsigned int cksum;
88562 + unsigned int ldrver;
88563 + unsigned int pf;
88564 + unsigned int datasize;
88565 + unsigned int totalsize;
88566 + unsigned int reserved[3];
88567 +};
88568 +
88569 +struct microcode {
88570 + struct microcode_header hdr;
88571 + unsigned int bits[0];
88572 +};
88573 +
88574 +typedef struct microcode microcode_t;
88575 +typedef struct microcode_header microcode_header_t;
88576 +
88577 +/* microcode format is extended from prescott processors */
88578 +struct extended_signature {
88579 + unsigned int sig;
88580 + unsigned int pf;
88581 + unsigned int cksum;
88582 +};
88583 +
88584 +struct extended_sigtable {
88585 + unsigned int count;
88586 + unsigned int cksum;
88587 + unsigned int reserved[3];
88588 + struct extended_signature sigs[0];
88589 +};
88590 +/* '6' because it used to be for P6 only (but now covers Pentium 4 as well) */
88591 +#define MICROCODE_IOCFREE _IO('6',0)
88592 +
88593 +/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */
88594 +static inline void rep_nop(void)
88595 +{
88596 + __asm__ __volatile__("rep;nop": : :"memory");
88597 +}
88598 +
88599 +#define cpu_relax() rep_nop()
88600 +
88601 +/* generic versions from gas */
88602 +#define GENERIC_NOP1 ".byte 0x90\n"
88603 +#define GENERIC_NOP2 ".byte 0x89,0xf6\n"
88604 +#define GENERIC_NOP3 ".byte 0x8d,0x76,0x00\n"
88605 +#define GENERIC_NOP4 ".byte 0x8d,0x74,0x26,0x00\n"
88606 +#define GENERIC_NOP5 GENERIC_NOP1 GENERIC_NOP4
88607 +#define GENERIC_NOP6 ".byte 0x8d,0xb6,0x00,0x00,0x00,0x00\n"
88608 +#define GENERIC_NOP7 ".byte 0x8d,0xb4,0x26,0x00,0x00,0x00,0x00\n"
88609 +#define GENERIC_NOP8 GENERIC_NOP1 GENERIC_NOP7
88610 +
88611 +/* Opteron nops */
88612 +#define K8_NOP1 GENERIC_NOP1
88613 +#define K8_NOP2 ".byte 0x66,0x90\n"
88614 +#define K8_NOP3 ".byte 0x66,0x66,0x90\n"
88615 +#define K8_NOP4 ".byte 0x66,0x66,0x66,0x90\n"
88616 +#define K8_NOP5 K8_NOP3 K8_NOP2
88617 +#define K8_NOP6 K8_NOP3 K8_NOP3
88618 +#define K8_NOP7 K8_NOP4 K8_NOP3
88619 +#define K8_NOP8 K8_NOP4 K8_NOP4
88620 +
88621 +/* K7 nops */
88622 +/* uses eax dependencies (arbitrary choice) */
88623 +#define K7_NOP1 GENERIC_NOP1
88624 +#define K7_NOP2 ".byte 0x8b,0xc0\n"
88625 +#define K7_NOP3 ".byte 0x8d,0x04,0x20\n"
88626 +#define K7_NOP4 ".byte 0x8d,0x44,0x20,0x00\n"
88627 +#define K7_NOP5 K7_NOP4 ASM_NOP1
88628 +#define K7_NOP6 ".byte 0x8d,0x80,0,0,0,0\n"
88629 +#define K7_NOP7 ".byte 0x8D,0x04,0x05,0,0,0,0\n"
88630 +#define K7_NOP8 K7_NOP7 ASM_NOP1
88631 +
88632 +#ifdef CONFIG_MK8
88633 +#define ASM_NOP1 K8_NOP1
88634 +#define ASM_NOP2 K8_NOP2
88635 +#define ASM_NOP3 K8_NOP3
88636 +#define ASM_NOP4 K8_NOP4
88637 +#define ASM_NOP5 K8_NOP5
88638 +#define ASM_NOP6 K8_NOP6
88639 +#define ASM_NOP7 K8_NOP7
88640 +#define ASM_NOP8 K8_NOP8
88641 +#elif defined(CONFIG_MK7)
88642 +#define ASM_NOP1 K7_NOP1
88643 +#define ASM_NOP2 K7_NOP2
88644 +#define ASM_NOP3 K7_NOP3
88645 +#define ASM_NOP4 K7_NOP4
88646 +#define ASM_NOP5 K7_NOP5
88647 +#define ASM_NOP6 K7_NOP6
88648 +#define ASM_NOP7 K7_NOP7
88649 +#define ASM_NOP8 K7_NOP8
88650 +#else
88651 +#define ASM_NOP1 GENERIC_NOP1
88652 +#define ASM_NOP2 GENERIC_NOP2
88653 +#define ASM_NOP3 GENERIC_NOP3
88654 +#define ASM_NOP4 GENERIC_NOP4
88655 +#define ASM_NOP5 GENERIC_NOP5
88656 +#define ASM_NOP6 GENERIC_NOP6
88657 +#define ASM_NOP7 GENERIC_NOP7
88658 +#define ASM_NOP8 GENERIC_NOP8
88659 +#endif
88660 +
88661 +#define ASM_NOP_MAX 8
88662 +
88663 +/* Prefetch instructions for Pentium III and AMD Athlon */
88664 +/* It's not worth caring about 3dnow! prefetches for the K6
88665 + because they are microcoded there and very slow.
88666 + However, we don't do prefetches for pre-XP Athlons currently.
88667 + That should be fixed. */
88668 +#define ARCH_HAS_PREFETCH
88669 +static inline void prefetch(const void *x)
88670 +{
88671 + alternative_input(ASM_NOP4,
88672 + "prefetchnta (%1)",
88673 + X86_FEATURE_XMM,
88674 + "r" (x));
88675 +}
88676 +
88677 +#define ARCH_HAS_PREFETCH
88678 +#define ARCH_HAS_PREFETCHW
88679 +#define ARCH_HAS_SPINLOCK_PREFETCH
88680 +
88681 +/* 3dnow! prefetch to get an exclusive cache line. Useful for
88682 + spinlocks to avoid one state transition in the cache coherency protocol. */
88683 +static inline void prefetchw(const void *x)
88684 +{
88685 + alternative_input(ASM_NOP4,
88686 + "prefetchw (%1)",
88687 + X86_FEATURE_3DNOW,
88688 + "r" (x));
88689 +}
88690 +#define spin_lock_prefetch(x) prefetchw(x)
88691 +
88692 +extern void select_idle_routine(const struct cpuinfo_x86 *c);
88693 +
88694 +#define cache_line_size() (boot_cpu_data.x86_cache_alignment)
88695 +
88696 +extern unsigned long boot_option_idle_override;
88697 +extern void enable_sep_cpu(void);
88698 +extern int sysenter_setup(void);
88699 +
88700 +#ifdef CONFIG_MTRR
88701 +extern void mtrr_ap_init(void);
88702 +extern void mtrr_bp_init(void);
88703 +#else
88704 +#define mtrr_ap_init() do {} while (0)
88705 +#define mtrr_bp_init() do {} while (0)
88706 +#endif
88707 +
88708 +#ifdef CONFIG_X86_MCE
88709 +extern void mcheck_init(struct cpuinfo_x86 *c);
88710 +#else
88711 +#define mcheck_init(c) do {} while(0)
88712 +#endif
88713 +
88714 +#endif /* __ASM_I386_PROCESSOR_H */
88715 diff -Nur linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/ptrace.h linux-2.6.16.33/include/asm-i386/mach-xen/asm/ptrace.h
88716 --- linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/ptrace.h 1970-01-01 00:00:00.000000000 +0000
88717 +++ linux-2.6.16.33/include/asm-i386/mach-xen/asm/ptrace.h 2007-01-08 15:00:46.000000000 +0000
88718 @@ -0,0 +1,90 @@
88719 +#ifndef _I386_PTRACE_H
88720 +#define _I386_PTRACE_H
88721 +
88722 +#define EBX 0
88723 +#define ECX 1
88724 +#define EDX 2
88725 +#define ESI 3
88726 +#define EDI 4
88727 +#define EBP 5
88728 +#define EAX 6
88729 +#define DS 7
88730 +#define ES 8
88731 +#define FS 9
88732 +#define GS 10
88733 +#define ORIG_EAX 11
88734 +#define EIP 12
88735 +#define CS 13
88736 +#define EFL 14
88737 +#define UESP 15
88738 +#define SS 16
88739 +#define FRAME_SIZE 17
88740 +
88741 +/* this struct defines the way the registers are stored on the
88742 + stack during a system call. */
88743 +
88744 +struct pt_regs {
88745 + long ebx;
88746 + long ecx;
88747 + long edx;
88748 + long esi;
88749 + long edi;
88750 + long ebp;
88751 + long eax;
88752 + int xds;
88753 + int xes;
88754 + long orig_eax;
88755 + long eip;
88756 + int xcs;
88757 + long eflags;
88758 + long esp;
88759 + int xss;
88760 +};
88761 +
88762 +/* Arbitrarily choose the same ptrace numbers as used by the Sparc code. */
88763 +#define PTRACE_GETREGS 12
88764 +#define PTRACE_SETREGS 13
88765 +#define PTRACE_GETFPREGS 14
88766 +#define PTRACE_SETFPREGS 15
88767 +#define PTRACE_GETFPXREGS 18
88768 +#define PTRACE_SETFPXREGS 19
88769 +
88770 +#define PTRACE_OLDSETOPTIONS 21
88771 +
88772 +#define PTRACE_GET_THREAD_AREA 25
88773 +#define PTRACE_SET_THREAD_AREA 26
88774 +
88775 +#define PTRACE_SYSEMU 31
88776 +#define PTRACE_SYSEMU_SINGLESTEP 32
88777 +
88778 +#ifdef __KERNEL__
88779 +
88780 +#include <asm/vm86.h>
88781 +
88782 +struct task_struct;
88783 +extern void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code);
88784 +
88785 +/*
88786 + * user_mode_vm(regs) determines whether a register set came from user mode.
88787 + * This is true if V8086 mode was enabled OR if the register set was from
88788 + * protected mode with RPL-3 CS value. This tricky test checks that with
88789 + * one comparison. Many places in the kernel can bypass this full check
88790 + * if they have already ruled out V8086 mode, so user_mode(regs) can be used.
88791 + */
88792 +static inline int user_mode(struct pt_regs *regs)
88793 +{
88794 + return (regs->xcs & 2) != 0;
88795 +}
88796 +static inline int user_mode_vm(struct pt_regs *regs)
88797 +{
88798 + return ((regs->xcs & 2) | (regs->eflags & VM_MASK)) != 0;
88799 +}
88800 +#define instruction_pointer(regs) ((regs)->eip)
88801 +#if defined(CONFIG_SMP) && defined(CONFIG_FRAME_POINTER)
88802 +extern unsigned long profile_pc(struct pt_regs *regs);
88803 +#else
88804 +#define profile_pc(regs) instruction_pointer(regs)
88805 +#endif
88806 +#endif /* __KERNEL__ */
88807 +
88808 +#endif
88809 diff -Nur linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/scatterlist.h linux-2.6.16.33/include/asm-i386/mach-xen/asm/scatterlist.h
88810 --- linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/scatterlist.h 1970-01-01 00:00:00.000000000 +0000
88811 +++ linux-2.6.16.33/include/asm-i386/mach-xen/asm/scatterlist.h 2007-01-08 15:00:46.000000000 +0000
88812 @@ -0,0 +1,22 @@
88813 +#ifndef _I386_SCATTERLIST_H
88814 +#define _I386_SCATTERLIST_H
88815 +
88816 +struct scatterlist {
88817 + struct page *page;
88818 + unsigned int offset;
88819 + unsigned int length;
88820 + dma_addr_t dma_address;
88821 + unsigned int dma_length;
88822 +};
88823 +
88824 +/* These macros should be used after a pci_map_sg call has been done
88825 + * to get bus addresses of each of the SG entries and their lengths.
88826 + * You should only work with the number of sg entries pci_map_sg
88827 + * returns.
88828 + */
88829 +#define sg_dma_address(sg) ((sg)->dma_address)
88830 +#define sg_dma_len(sg) ((sg)->dma_length)
88831 +
88832 +#define ISA_DMA_THRESHOLD (0x00ffffff)
88833 +
88834 +#endif /* !(_I386_SCATTERLIST_H) */
88835 diff -Nur linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/segment.h linux-2.6.16.33/include/asm-i386/mach-xen/asm/segment.h
88836 --- linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/segment.h 1970-01-01 00:00:00.000000000 +0000
88837 +++ linux-2.6.16.33/include/asm-i386/mach-xen/asm/segment.h 2007-01-08 15:00:46.000000000 +0000
88838 @@ -0,0 +1,117 @@
88839 +#ifndef _ASM_SEGMENT_H
88840 +#define _ASM_SEGMENT_H
88841 +
88842 +/*
88843 + * The layout of the per-CPU GDT under Linux:
88844 + *
88845 + * 0 - null
88846 + * 1 - reserved
88847 + * 2 - reserved
88848 + * 3 - reserved
88849 + *
88850 + * 4 - unused <==== new cacheline
88851 + * 5 - unused
88852 + *
88853 + * ------- start of TLS (Thread-Local Storage) segments:
88854 + *
88855 + * 6 - TLS segment #1 [ glibc's TLS segment ]
88856 + * 7 - TLS segment #2 [ Wine's %fs Win32 segment ]
88857 + * 8 - TLS segment #3
88858 + * 9 - reserved
88859 + * 10 - reserved
88860 + * 11 - reserved
88861 + *
88862 + * ------- start of kernel segments:
88863 + *
88864 + * 12 - kernel code segment <==== new cacheline
88865 + * 13 - kernel data segment
88866 + * 14 - default user CS
88867 + * 15 - default user DS
88868 + * 16 - TSS
88869 + * 17 - LDT
88870 + * 18 - PNPBIOS support (16->32 gate)
88871 + * 19 - PNPBIOS support
88872 + * 20 - PNPBIOS support
88873 + * 21 - PNPBIOS support
88874 + * 22 - PNPBIOS support
88875 + * 23 - APM BIOS support
88876 + * 24 - APM BIOS support
88877 + * 25 - APM BIOS support
88878 + *
88879 + * 26 - ESPFIX small SS
88880 + * 27 - unused
88881 + * 28 - unused
88882 + * 29 - unused
88883 + * 30 - unused
88884 + * 31 - TSS for double fault handler
88885 + */
88886 +#define GDT_ENTRY_TLS_ENTRIES 3
88887 +#define GDT_ENTRY_TLS_MIN 6
88888 +#define GDT_ENTRY_TLS_MAX (GDT_ENTRY_TLS_MIN + GDT_ENTRY_TLS_ENTRIES - 1)
88889 +
88890 +#define TLS_SIZE (GDT_ENTRY_TLS_ENTRIES * 8)
88891 +
88892 +#define GDT_ENTRY_DEFAULT_USER_CS 14
88893 +#define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS * 8 + 3)
88894 +
88895 +#define GDT_ENTRY_DEFAULT_USER_DS 15
88896 +#define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS * 8 + 3)
88897 +
88898 +#define GDT_ENTRY_KERNEL_BASE 12
88899 +
88900 +#define GDT_ENTRY_KERNEL_CS (GDT_ENTRY_KERNEL_BASE + 0)
88901 +#define __KERNEL_CS (GDT_ENTRY_KERNEL_CS * 8)
88902 +#define GET_KERNEL_CS() (__KERNEL_CS | (xen_feature(XENFEAT_supervisor_mode_kernel)?0:1) )
88903 +
88904 +#define GDT_ENTRY_KERNEL_DS (GDT_ENTRY_KERNEL_BASE + 1)
88905 +#define __KERNEL_DS (GDT_ENTRY_KERNEL_DS * 8)
88906 +#define GET_KERNEL_DS() (__KERNEL_DS | (xen_feature(XENFEAT_supervisor_mode_kernel)?0:1) )
88907 +
88908 +#define GDT_ENTRY_TSS (GDT_ENTRY_KERNEL_BASE + 4)
88909 +#define GDT_ENTRY_LDT (GDT_ENTRY_KERNEL_BASE + 5)
88910 +
88911 +#define GDT_ENTRY_PNPBIOS_BASE (GDT_ENTRY_KERNEL_BASE + 6)
88912 +#define GDT_ENTRY_APMBIOS_BASE (GDT_ENTRY_KERNEL_BASE + 11)
88913 +
88914 +#define GDT_ENTRY_ESPFIX_SS (GDT_ENTRY_KERNEL_BASE + 14)
88915 +#define __ESPFIX_SS (GDT_ENTRY_ESPFIX_SS * 8)
88916 +
88917 +#define GDT_ENTRY_DOUBLEFAULT_TSS 31
88918 +
88919 +/*
88920 + * The GDT has 32 entries
88921 + */
88922 +#define GDT_ENTRIES 32
88923 +
88924 +#define GDT_SIZE (GDT_ENTRIES * 8)
88925 +
88926 +/* Simple and small GDT entries for booting only */
88927 +
88928 +#define GDT_ENTRY_BOOT_CS 2
88929 +#define __BOOT_CS (GDT_ENTRY_BOOT_CS * 8)
88930 +
88931 +#define GDT_ENTRY_BOOT_DS (GDT_ENTRY_BOOT_CS + 1)
88932 +#define __BOOT_DS (GDT_ENTRY_BOOT_DS * 8)
88933 +
88934 +/* The PnP BIOS entries in the GDT */
88935 +#define GDT_ENTRY_PNPBIOS_CS32 (GDT_ENTRY_PNPBIOS_BASE + 0)
88936 +#define GDT_ENTRY_PNPBIOS_CS16 (GDT_ENTRY_PNPBIOS_BASE + 1)
88937 +#define GDT_ENTRY_PNPBIOS_DS (GDT_ENTRY_PNPBIOS_BASE + 2)
88938 +#define GDT_ENTRY_PNPBIOS_TS1 (GDT_ENTRY_PNPBIOS_BASE + 3)
88939 +#define GDT_ENTRY_PNPBIOS_TS2 (GDT_ENTRY_PNPBIOS_BASE + 4)
88940 +
88941 +/* The PnP BIOS selectors */
88942 +#define PNP_CS32 (GDT_ENTRY_PNPBIOS_CS32 * 8) /* segment for calling fn */
88943 +#define PNP_CS16 (GDT_ENTRY_PNPBIOS_CS16 * 8) /* code segment for BIOS */
88944 +#define PNP_DS (GDT_ENTRY_PNPBIOS_DS * 8) /* data segment for BIOS */
88945 +#define PNP_TS1 (GDT_ENTRY_PNPBIOS_TS1 * 8) /* transfer data segment */
88946 +#define PNP_TS2 (GDT_ENTRY_PNPBIOS_TS2 * 8) /* another data segment */
88947 +
88948 +/*
88949 + * The interrupt descriptor table has room for 256 entries;
88950 + * the global descriptor table is dependent on the number
88951 + * of tasks we can have.
88952 + */
88953 +#define IDT_ENTRIES 256
88954 +
88955 +#endif
88956 diff -Nur linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/setup.h linux-2.6.16.33/include/asm-i386/mach-xen/asm/setup.h
88957 --- linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/setup.h 1970-01-01 00:00:00.000000000 +0000
88958 +++ linux-2.6.16.33/include/asm-i386/mach-xen/asm/setup.h 2007-01-08 15:00:46.000000000 +0000
88959 @@ -0,0 +1,66 @@
88960 +/*
88961 + * Just a placeholder. We don't want to have to test x86 before
88962 + * we include stuff.
88963 + */
88964 +
88965 +#ifndef _i386_SETUP_H
88966 +#define _i386_SETUP_H
88967 +
88968 +#define PFN_UP(x) (((x) + PAGE_SIZE-1) >> PAGE_SHIFT)
88969 +#define PFN_DOWN(x) ((x) >> PAGE_SHIFT)
88970 +#define PFN_PHYS(x) ((unsigned long long)(x) << PAGE_SHIFT)
88971 +
88972 +/*
88973 + * Reserved space for vmalloc and iomap - defined in asm/page.h
88974 + */
88975 +#define MAXMEM_PFN PFN_DOWN(MAXMEM)
88976 +#define MAX_NONPAE_PFN (1 << 20)
88977 +
88978 +#define PARAM_SIZE 4096
88979 +#define COMMAND_LINE_SIZE 256
88980 +
88981 +#define OLD_CL_MAGIC_ADDR 0x90020
88982 +#define OLD_CL_MAGIC 0xA33F
88983 +#define OLD_CL_BASE_ADDR 0x90000
88984 +#define OLD_CL_OFFSET 0x90022
88985 +#define NEW_CL_POINTER 0x228 /* Relative to real mode data */
88986 +
88987 +#ifndef __ASSEMBLY__
88988 +/*
88989 + * This is set up by the setup-routine at boot-time
88990 + */
88991 +extern unsigned char boot_params[PARAM_SIZE];
88992 +
88993 +#define PARAM (boot_params)
88994 +#define SCREEN_INFO (*(struct screen_info *) (PARAM+0))
88995 +#define EXT_MEM_K (*(unsigned short *) (PARAM+2))
88996 +#define ALT_MEM_K (*(unsigned long *) (PARAM+0x1e0))
88997 +#define E820_MAP_NR (*(char*) (PARAM+E820NR))
88998 +#define E820_MAP ((struct e820entry *) (PARAM+E820MAP))
88999 +#define APM_BIOS_INFO (*(struct apm_bios_info *) (PARAM+0x40))
89000 +#define IST_INFO (*(struct ist_info *) (PARAM+0x60))
89001 +#define DRIVE_INFO (*(struct drive_info_struct *) (PARAM+0x80))
89002 +#define SYS_DESC_TABLE (*(struct sys_desc_table_struct*)(PARAM+0xa0))
89003 +#define EFI_SYSTAB ((efi_system_table_t *) *((unsigned long *)(PARAM+0x1c4)))
89004 +#define EFI_MEMDESC_SIZE (*((unsigned long *) (PARAM+0x1c8)))
89005 +#define EFI_MEMDESC_VERSION (*((unsigned long *) (PARAM+0x1cc)))
89006 +#define EFI_MEMMAP ((void *) *((unsigned long *)(PARAM+0x1d0)))
89007 +#define EFI_MEMMAP_SIZE (*((unsigned long *) (PARAM+0x1d4)))
89008 +#define MOUNT_ROOT_RDONLY (*(unsigned short *) (PARAM+0x1F2))
89009 +#define RAMDISK_FLAGS (*(unsigned short *) (PARAM+0x1F8))
89010 +#define VIDEO_MODE (*(unsigned short *) (PARAM+0x1FA))
89011 +#define ORIG_ROOT_DEV (*(unsigned short *) (PARAM+0x1FC))
89012 +#define AUX_DEVICE_INFO (*(unsigned char *) (PARAM+0x1FF))
89013 +#define LOADER_TYPE (*(unsigned char *) (PARAM+0x210))
89014 +#define KERNEL_START (*(unsigned long *) (PARAM+0x214))
89015 +#define INITRD_START (__pa(xen_start_info->mod_start))
89016 +#define INITRD_SIZE (xen_start_info->mod_len)
89017 +#define EDID_INFO (*(struct edid_info *) (PARAM+0x440))
89018 +#define EDD_NR (*(unsigned char *) (PARAM+EDDNR))
89019 +#define EDD_MBR_SIG_NR (*(unsigned char *) (PARAM+EDD_MBR_SIG_NR_BUF))
89020 +#define EDD_MBR_SIGNATURE ((unsigned int *) (PARAM+EDD_MBR_SIG_BUF))
89021 +#define EDD_BUF ((struct edd_info *) (PARAM+EDDBUF))
89022 +
89023 +#endif /* __ASSEMBLY__ */
89024 +
89025 +#endif /* _i386_SETUP_H */
89026 diff -Nur linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/smp.h linux-2.6.16.33/include/asm-i386/mach-xen/asm/smp.h
89027 --- linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/smp.h 1970-01-01 00:00:00.000000000 +0000
89028 +++ linux-2.6.16.33/include/asm-i386/mach-xen/asm/smp.h 2007-01-08 15:00:46.000000000 +0000
89029 @@ -0,0 +1,103 @@
89030 +#ifndef __ASM_SMP_H
89031 +#define __ASM_SMP_H
89032 +
89033 +/*
89034 + * We need the APIC definitions automatically as part of 'smp.h'
89035 + */
89036 +#ifndef __ASSEMBLY__
89037 +#include <linux/config.h>
89038 +#include <linux/kernel.h>
89039 +#include <linux/threads.h>
89040 +#include <linux/cpumask.h>
89041 +#endif
89042 +
89043 +#ifdef CONFIG_X86_LOCAL_APIC
89044 +#ifndef __ASSEMBLY__
89045 +#include <asm/fixmap.h>
89046 +#include <asm/bitops.h>
89047 +#include <asm/mpspec.h>
89048 +#ifdef CONFIG_X86_IO_APIC
89049 +#include <asm/io_apic.h>
89050 +#endif
89051 +#include <asm/apic.h>
89052 +#endif
89053 +#endif
89054 +
89055 +#define BAD_APICID 0xFFu
89056 +#ifdef CONFIG_SMP
89057 +#ifndef __ASSEMBLY__
89058 +
89059 +/*
89060 + * Private routines/data
89061 + */
89062 +
89063 +extern void smp_alloc_memory(void);
89064 +extern int pic_mode;
89065 +extern int smp_num_siblings;
89066 +extern cpumask_t cpu_sibling_map[];
89067 +extern cpumask_t cpu_core_map[];
89068 +
89069 +extern void (*mtrr_hook) (void);
89070 +extern void zap_low_mappings (void);
89071 +extern void lock_ipi_call_lock(void);
89072 +extern void unlock_ipi_call_lock(void);
89073 +
89074 +#define MAX_APICID 256
89075 +extern u8 x86_cpu_to_apicid[];
89076 +
89077 +#define cpu_physical_id(cpu) x86_cpu_to_apicid[cpu]
89078 +
89079 +#ifdef CONFIG_HOTPLUG_CPU
89080 +extern void cpu_exit_clear(void);
89081 +extern void cpu_uninit(void);
89082 +#endif
89083 +
89084 +/*
89085 + * This function is needed by all SMP systems. It must _always_ be valid
89086 + * from the initial startup. We map APIC_BASE very early in page_setup(),
89087 + * so this is correct in the x86 case.
89088 + */
89089 +#define raw_smp_processor_id() (current_thread_info()->cpu)
89090 +
89091 +extern cpumask_t cpu_possible_map;
89092 +#define cpu_callin_map cpu_possible_map
89093 +
89094 +/* We don't mark CPUs online until __cpu_up(), so we need another measure */
89095 +static inline int num_booting_cpus(void)
89096 +{
89097 + return cpus_weight(cpu_possible_map);
89098 +}
89099 +
89100 +#ifdef CONFIG_X86_LOCAL_APIC
89101 +
89102 +#ifdef APIC_DEFINITION
89103 +extern int hard_smp_processor_id(void);
89104 +#else
89105 +#include <mach_apicdef.h>
89106 +static inline int hard_smp_processor_id(void)
89107 +{
89108 + /* we don't want to mark this access volatile - bad code generation */
89109 + return GET_APIC_ID(*(unsigned long *)(APIC_BASE+APIC_ID));
89110 +}
89111 +#endif
89112 +
89113 +static __inline int logical_smp_processor_id(void)
89114 +{
89115 + /* we don't want to mark this access volatile - bad code generation */
89116 + return GET_APIC_LOGICAL_ID(*(unsigned long *)(APIC_BASE+APIC_LDR));
89117 +}
89118 +
89119 +#endif
89120 +
89121 +extern int __cpu_disable(void);
89122 +extern void __cpu_die(unsigned int cpu);
89123 +#endif /* !__ASSEMBLY__ */
89124 +
89125 +#else /* CONFIG_SMP */
89126 +
89127 +#define cpu_physical_id(cpu) boot_cpu_physical_apicid
89128 +
89129 +#define NO_PROC_ID 0xFF /* No processor magic marker */
89130 +
89131 +#endif
89132 +#endif
89133 diff -Nur linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/spinlock.h linux-2.6.16.33/include/asm-i386/mach-xen/asm/spinlock.h
89134 --- linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/spinlock.h 1970-01-01 00:00:00.000000000 +0000
89135 +++ linux-2.6.16.33/include/asm-i386/mach-xen/asm/spinlock.h 2007-01-08 15:00:46.000000000 +0000
89136 @@ -0,0 +1,217 @@
89137 +#ifndef __ASM_SPINLOCK_H
89138 +#define __ASM_SPINLOCK_H
89139 +
89140 +#include <asm/atomic.h>
89141 +#include <asm/rwlock.h>
89142 +#include <asm/page.h>
89143 +#include <linux/config.h>
89144 +#include <linux/compiler.h>
89145 +#include <asm/smp_alt.h>
89146 +
89147 +/*
89148 + * Your basic SMP spinlocks, allowing only a single CPU anywhere
89149 + *
89150 + * Simple spin lock operations. There are two variants, one clears IRQs
89151 + * on the local processor, one does not.
89152 + *
89153 + * We make no fairness assumptions. They have a cost.
89154 + *
89155 + * (the type definitions are in asm/spinlock_types.h)
89156 + */
89157 +
89158 +#define __raw_spin_is_locked(x) \
89159 + (*(volatile signed char *)(&(x)->slock) <= 0)
89160 +
89161 +#define __raw_spin_lock_string \
89162 + "\n1:\n" \
89163 + LOCK \
89164 + "decb %0\n\t" \
89165 + "jns 3f\n" \
89166 + "2:\t" \
89167 + "rep;nop\n\t" \
89168 + "cmpb $0,%0\n\t" \
89169 + "jle 2b\n\t" \
89170 + "jmp 1b\n" \
89171 + "3:\n\t"
89172 +
89173 +#define __raw_spin_lock_string_flags \
89174 + "\n1:\n" \
89175 + LOCK \
89176 + "decb %0\n\t" \
89177 + "jns 4f\n\t" \
89178 + "2:\t" \
89179 + "testl $0x200, %1\n\t" \
89180 + "jz 3f\n\t" \
89181 + "#sti\n\t" \
89182 + "3:\t" \
89183 + "rep;nop\n\t" \
89184 + "cmpb $0, %0\n\t" \
89185 + "jle 3b\n\t" \
89186 + "#cli\n\t" \
89187 + "jmp 1b\n" \
89188 + "4:\n\t"
89189 +
89190 +static inline void __raw_spin_lock(raw_spinlock_t *lock)
89191 +{
89192 + __asm__ __volatile__(
89193 + __raw_spin_lock_string
89194 + :"=m" (lock->slock) : : "memory");
89195 +}
89196 +
89197 +static inline void __raw_spin_lock_flags(raw_spinlock_t *lock, unsigned long flags)
89198 +{
89199 + __asm__ __volatile__(
89200 + __raw_spin_lock_string_flags
89201 + :"=m" (lock->slock) : "r" (flags) : "memory");
89202 +}
89203 +
89204 +static inline int __raw_spin_trylock(raw_spinlock_t *lock)
89205 +{
89206 + char oldval;
89207 +#ifdef CONFIG_SMP_ALTERNATIVES
89208 + __asm__ __volatile__(
89209 + "1:movb %1,%b0\n"
89210 + "movb $0,%1\n"
89211 + "2:"
89212 + ".section __smp_alternatives,\"a\"\n"
89213 + ".long 1b\n"
89214 + ".long 3f\n"
89215 + ".previous\n"
89216 + ".section __smp_replacements,\"a\"\n"
89217 + "3: .byte 2b - 1b\n"
89218 + ".byte 5f-4f\n"
89219 + ".byte 0\n"
89220 + ".byte 6f-5f\n"
89221 + ".byte -1\n"
89222 + "4: xchgb %b0,%1\n"
89223 + "5: movb %1,%b0\n"
89224 + "movb $0,%1\n"
89225 + "6:\n"
89226 + ".previous\n"
89227 + :"=q" (oldval), "=m" (lock->slock)
89228 + :"0" (0) : "memory");
89229 +#else
89230 + __asm__ __volatile__(
89231 + "xchgb %b0,%1"
89232 + :"=q" (oldval), "=m" (lock->slock)
89233 + :"0" (0) : "memory");
89234 +#endif
89235 + return oldval > 0;
89236 +}
89237 +
89238 +/*
89239 + * __raw_spin_unlock based on writing $1 to the low byte.
89240 + * This method works. Despite all the confusion.
89241 + * (except on PPro SMP or if we are using OOSTORE, so we use xchgb there)
89242 + * (PPro errata 66, 92)
89243 + */
89244 +
89245 +#if !defined(CONFIG_X86_OOSTORE) && !defined(CONFIG_X86_PPRO_FENCE)
89246 +
89247 +#define __raw_spin_unlock_string \
89248 + "movb $1,%0" \
89249 + :"=m" (lock->slock) : : "memory"
89250 +
89251 +
89252 +static inline void __raw_spin_unlock(raw_spinlock_t *lock)
89253 +{
89254 + __asm__ __volatile__(
89255 + __raw_spin_unlock_string
89256 + );
89257 +}
89258 +
89259 +#else
89260 +
89261 +#define __raw_spin_unlock_string \
89262 + "xchgb %b0, %1" \
89263 + :"=q" (oldval), "=m" (lock->slock) \
89264 + :"0" (oldval) : "memory"
89265 +
89266 +static inline void __raw_spin_unlock(raw_spinlock_t *lock)
89267 +{
89268 + char oldval = 1;
89269 +
89270 + __asm__ __volatile__(
89271 + __raw_spin_unlock_string
89272 + );
89273 +}
89274 +
89275 +#endif
89276 +
89277 +#define __raw_spin_unlock_wait(lock) \
89278 + do { while (__raw_spin_is_locked(lock)) cpu_relax(); } while (0)
89279 +
89280 +/*
89281 + * Read-write spinlocks, allowing multiple readers
89282 + * but only one writer.
89283 + *
89284 + * NOTE! it is quite common to have readers in interrupts
89285 + * but no interrupt writers. For those circumstances we
89286 + * can "mix" irq-safe locks - any writer needs to get a
89287 + * irq-safe write-lock, but readers can get non-irqsafe
89288 + * read-locks.
89289 + *
89290 + * On x86, we implement read-write locks as a 32-bit counter
89291 + * with the high bit (sign) being the "contended" bit.
89292 + *
89293 + * The inline assembly is non-obvious. Think about it.
89294 + *
89295 + * Changed to use the same technique as rw semaphores. See
89296 + * semaphore.h for details. -ben
89297 + *
89298 + * the helpers are in arch/i386/kernel/semaphore.c
89299 + */
89300 +
89301 +/**
89302 + * read_can_lock - would read_trylock() succeed?
89303 + * @lock: the rwlock in question.
89304 + */
89305 +#define __raw_read_can_lock(x) ((int)(x)->lock > 0)
89306 +
89307 +/**
89308 + * write_can_lock - would write_trylock() succeed?
89309 + * @lock: the rwlock in question.
89310 + */
89311 +#define __raw_write_can_lock(x) ((x)->lock == RW_LOCK_BIAS)
89312 +
89313 +static inline void __raw_read_lock(raw_rwlock_t *rw)
89314 +{
89315 + __build_read_lock(rw, "__read_lock_failed");
89316 +}
89317 +
89318 +static inline void __raw_write_lock(raw_rwlock_t *rw)
89319 +{
89320 + __build_write_lock(rw, "__write_lock_failed");
89321 +}
89322 +
89323 +static inline int __raw_read_trylock(raw_rwlock_t *lock)
89324 +{
89325 + atomic_t *count = (atomic_t *)lock;
89326 + atomic_dec(count);
89327 + if (atomic_read(count) >= 0)
89328 + return 1;
89329 + atomic_inc(count);
89330 + return 0;
89331 +}
89332 +
89333 +static inline int __raw_write_trylock(raw_rwlock_t *lock)
89334 +{
89335 + atomic_t *count = (atomic_t *)lock;
89336 + if (atomic_sub_and_test(RW_LOCK_BIAS, count))
89337 + return 1;
89338 + atomic_add(RW_LOCK_BIAS, count);
89339 + return 0;
89340 +}
89341 +
89342 +static inline void __raw_read_unlock(raw_rwlock_t *rw)
89343 +{
89344 + asm volatile(LOCK "incl %0" :"=m" (rw->lock) : : "memory");
89345 +}
89346 +
89347 +static inline void __raw_write_unlock(raw_rwlock_t *rw)
89348 +{
89349 + asm volatile(LOCK "addl $" RW_LOCK_BIAS_STR ", %0"
89350 + : "=m" (rw->lock) : : "memory");
89351 +}
89352 +
89353 +#endif /* __ASM_SPINLOCK_H */
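
The rwlock hunks above implement a 32-bit counter scheme: each reader subtracts 1, a writer claims the whole RW_LOCK_BIAS, and a negative count signals contention. A minimal userspace sketch of the same trylock arithmetic, using GCC __atomic builtins instead of the lock-prefixed asm (demo_* names are illustrative, not from the patch):

/* Sketch of the biased rwlock counter used above.  Illustrative only. */
#include <stdio.h>

#define RW_LOCK_BIAS 0x01000000

typedef struct { int lock; } demo_rwlock_t;

static void demo_rwlock_init(demo_rwlock_t *rw) { rw->lock = RW_LOCK_BIAS; }

/* Mirrors __raw_read_trylock: take a reader slot, back out if a writer holds the lock. */
static int demo_read_trylock(demo_rwlock_t *rw)
{
        if (__atomic_sub_fetch(&rw->lock, 1, __ATOMIC_ACQUIRE) >= 0)
                return 1;
        __atomic_add_fetch(&rw->lock, 1, __ATOMIC_RELAXED);
        return 0;
}

static void demo_read_unlock(demo_rwlock_t *rw)
{
        __atomic_add_fetch(&rw->lock, 1, __ATOMIC_RELEASE);
}

/* Mirrors __raw_write_trylock: succeed only if the whole bias could be claimed. */
static int demo_write_trylock(demo_rwlock_t *rw)
{
        if (__atomic_sub_fetch(&rw->lock, RW_LOCK_BIAS, __ATOMIC_ACQUIRE) == 0)
                return 1;
        __atomic_add_fetch(&rw->lock, RW_LOCK_BIAS, __ATOMIC_RELAXED);
        return 0;
}

static void demo_write_unlock(demo_rwlock_t *rw)
{
        __atomic_add_fetch(&rw->lock, RW_LOCK_BIAS, __ATOMIC_RELEASE);
}

int main(void)
{
        demo_rwlock_t rw;
        demo_rwlock_init(&rw);

        printf("read_trylock:  %d (expect 1)\n", demo_read_trylock(&rw));
        printf("write_trylock: %d (expect 0, a reader is active)\n", demo_write_trylock(&rw));
        demo_read_unlock(&rw);
        printf("write_trylock: %d (expect 1)\n", demo_write_trylock(&rw));
        demo_write_unlock(&rw);
        return 0;
}
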
89354 diff -Nur linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/swiotlb.h linux-2.6.16.33/include/asm-i386/mach-xen/asm/swiotlb.h
89355 --- linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/swiotlb.h 1970-01-01 00:00:00.000000000 +0000
89356 +++ linux-2.6.16.33/include/asm-i386/mach-xen/asm/swiotlb.h 2007-01-08 15:00:46.000000000 +0000
89357 @@ -0,0 +1,45 @@
89358 +#ifndef _ASM_SWIOTLB_H
89359 +#define _ASM_SWIOTLB_H 1
89360 +
89361 +#include <linux/config.h>
89362 +
89363 +/* SWIOTLB interface */
89364 +
89365 +extern dma_addr_t swiotlb_map_single(struct device *hwdev, void *ptr, size_t size,
89366 + int dir);
89367 +extern void swiotlb_unmap_single(struct device *hwdev, dma_addr_t dev_addr,
89368 + size_t size, int dir);
89369 +extern void swiotlb_sync_single_for_cpu(struct device *hwdev,
89370 + dma_addr_t dev_addr,
89371 + size_t size, int dir);
89372 +extern void swiotlb_sync_single_for_device(struct device *hwdev,
89373 + dma_addr_t dev_addr,
89374 + size_t size, int dir);
89375 +extern void swiotlb_sync_sg_for_cpu(struct device *hwdev,
89376 + struct scatterlist *sg, int nelems,
89377 + int dir);
89378 +extern void swiotlb_sync_sg_for_device(struct device *hwdev,
89379 + struct scatterlist *sg, int nelems,
89380 + int dir);
89381 +extern int swiotlb_map_sg(struct device *hwdev, struct scatterlist *sg,
89382 + int nents, int direction);
89383 +extern void swiotlb_unmap_sg(struct device *hwdev, struct scatterlist *sg,
89384 + int nents, int direction);
89385 +extern int swiotlb_dma_mapping_error(dma_addr_t dma_addr);
89386 +extern dma_addr_t swiotlb_map_page(struct device *hwdev, struct page *page,
89387 + unsigned long offset, size_t size,
89388 + enum dma_data_direction direction);
89389 +extern void swiotlb_unmap_page(struct device *hwdev, dma_addr_t dma_address,
89390 + size_t size, enum dma_data_direction direction);
89391 +extern int swiotlb_dma_supported(struct device *hwdev, u64 mask);
89392 +extern void swiotlb_init(void);
89393 +
89394 +extern unsigned int dma_bits;
89395 +
89396 +#ifdef CONFIG_SWIOTLB
89397 +extern int swiotlb;
89398 +#else
89399 +#define swiotlb 0
89400 +#endif
89401 +
89402 +#endif
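
swiotlb.h above only declares the software IO-TLB interface; the underlying idea is bounce buffering: when a device cannot address the caller's buffer, the data is staged through a buffer the device can reach and copied back afterwards (the sync_* calls). A rough userspace analogue of that flow, assuming nothing about the real kernel implementation:

/* Userspace analogue of the swiotlb "bounce buffer" idea.  Purely
 * illustrative; none of this is the kernel API declared above. */
#include <stdio.h>
#include <string.h>

static char device_reachable_pool[256];   /* stands in for the low-memory aperture */

/* "map": copy the caller's buffer into the reachable pool (DMA_TO_DEVICE). */
static void *bounce_map(const void *buf, size_t len)
{
        if (len > sizeof(device_reachable_pool))
                return NULL;
        memcpy(device_reachable_pool, buf, len);
        return device_reachable_pool;
}

/* "sync for cpu": copy the device's result back out of the pool. */
static void bounce_sync_for_cpu(void *buf, const void *bounce, size_t len)
{
        memcpy(buf, bounce, len);
}

/* A fake device that uppercases whatever it was handed. */
static void fake_device_dma(char *area, size_t len)
{
        for (size_t i = 0; i < len; i++)
                if (area[i] >= 'a' && area[i] <= 'z')
                        area[i] -= 'a' - 'A';
}

int main(void)
{
        char msg[] = "bounce me";
        char *bounce = bounce_map(msg, sizeof(msg));

        fake_device_dma(bounce, sizeof(msg));
        bounce_sync_for_cpu(msg, bounce, sizeof(msg));
        printf("%s\n", msg);     /* prints: BOUNCE ME */
        return 0;
}
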
89403 diff -Nur linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/synch_bitops.h linux-2.6.16.33/include/asm-i386/mach-xen/asm/synch_bitops.h
89404 --- linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/synch_bitops.h 1970-01-01 00:00:00.000000000 +0000
89405 +++ linux-2.6.16.33/include/asm-i386/mach-xen/asm/synch_bitops.h 2007-01-08 15:00:46.000000000 +0000
89406 @@ -0,0 +1,147 @@
89407 +#ifndef __XEN_SYNCH_BITOPS_H__
89408 +#define __XEN_SYNCH_BITOPS_H__
89409 +
89410 +/*
89411 + * Copyright 1992, Linus Torvalds.
89412 + * Heavily modified to provide guaranteed strong synchronisation
89413 + * when communicating with Xen or other guest OSes running on other CPUs.
89414 + */
89415 +
89416 +#include <linux/config.h>
89417 +
89418 +#ifdef HAVE_XEN_PLATFORM_COMPAT_H
89419 +#include <xen/platform-compat.h>
89420 +#endif
89421 +
89422 +#define ADDR (*(volatile long *) addr)
89423 +
89424 +static __inline__ void synch_set_bit(int nr, volatile void * addr)
89425 +{
89426 + __asm__ __volatile__ (
89427 + "lock btsl %1,%0"
89428 + : "+m" (ADDR) : "Ir" (nr) : "memory" );
89429 +}
89430 +
89431 +static __inline__ void synch_clear_bit(int nr, volatile void * addr)
89432 +{
89433 + __asm__ __volatile__ (
89434 + "lock btrl %1,%0"
89435 + : "+m" (ADDR) : "Ir" (nr) : "memory" );
89436 +}
89437 +
89438 +static __inline__ void synch_change_bit(int nr, volatile void * addr)
89439 +{
89440 + __asm__ __volatile__ (
89441 + "lock btcl %1,%0"
89442 + : "+m" (ADDR) : "Ir" (nr) : "memory" );
89443 +}
89444 +
89445 +static __inline__ int synch_test_and_set_bit(int nr, volatile void * addr)
89446 +{
89447 + int oldbit;
89448 + __asm__ __volatile__ (
89449 + "lock btsl %2,%1\n\tsbbl %0,%0"
89450 + : "=r" (oldbit), "+m" (ADDR) : "Ir" (nr) : "memory");
89451 + return oldbit;
89452 +}
89453 +
89454 +static __inline__ int synch_test_and_clear_bit(int nr, volatile void * addr)
89455 +{
89456 + int oldbit;
89457 + __asm__ __volatile__ (
89458 + "lock btrl %2,%1\n\tsbbl %0,%0"
89459 + : "=r" (oldbit), "+m" (ADDR) : "Ir" (nr) : "memory");
89460 + return oldbit;
89461 +}
89462 +
89463 +static __inline__ int synch_test_and_change_bit(int nr, volatile void * addr)
89464 +{
89465 + int oldbit;
89466 +
89467 + __asm__ __volatile__ (
89468 + "lock btcl %2,%1\n\tsbbl %0,%0"
89469 + : "=r" (oldbit), "+m" (ADDR) : "Ir" (nr) : "memory");
89470 + return oldbit;
89471 +}
89472 +
89473 +struct __synch_xchg_dummy { unsigned long a[100]; };
89474 +#define __synch_xg(x) ((struct __synch_xchg_dummy *)(x))
89475 +
89476 +#define synch_cmpxchg(ptr, old, new) \
89477 +((__typeof__(*(ptr)))__synch_cmpxchg((ptr),\
89478 + (unsigned long)(old), \
89479 + (unsigned long)(new), \
89480 + sizeof(*(ptr))))
89481 +
89482 +static inline unsigned long __synch_cmpxchg(volatile void *ptr,
89483 + unsigned long old,
89484 + unsigned long new, int size)
89485 +{
89486 + unsigned long prev;
89487 + switch (size) {
89488 + case 1:
89489 + __asm__ __volatile__("lock; cmpxchgb %b1,%2"
89490 + : "=a"(prev)
89491 + : "q"(new), "m"(*__synch_xg(ptr)),
89492 + "0"(old)
89493 + : "memory");
89494 + return prev;
89495 + case 2:
89496 + __asm__ __volatile__("lock; cmpxchgw %w1,%2"
89497 + : "=a"(prev)
89498 + : "r"(new), "m"(*__synch_xg(ptr)),
89499 + "0"(old)
89500 + : "memory");
89501 + return prev;
89502 +#ifdef CONFIG_X86_64
89503 + case 4:
89504 + __asm__ __volatile__("lock; cmpxchgl %k1,%2"
89505 + : "=a"(prev)
89506 + : "r"(new), "m"(*__synch_xg(ptr)),
89507 + "0"(old)
89508 + : "memory");
89509 + return prev;
89510 + case 8:
89511 + __asm__ __volatile__("lock; cmpxchgq %1,%2"
89512 + : "=a"(prev)
89513 + : "r"(new), "m"(*__synch_xg(ptr)),
89514 + "0"(old)
89515 + : "memory");
89516 + return prev;
89517 +#else
89518 + case 4:
89519 + __asm__ __volatile__("lock; cmpxchgl %1,%2"
89520 + : "=a"(prev)
89521 + : "r"(new), "m"(*__synch_xg(ptr)),
89522 + "0"(old)
89523 + : "memory");
89524 + return prev;
89525 +#endif
89526 + }
89527 + return old;
89528 +}
89529 +
89530 +static __always_inline int synch_const_test_bit(int nr,
89531 + const volatile void * addr)
89532 +{
89533 + return ((1UL << (nr & 31)) &
89534 + (((const volatile unsigned int *) addr)[nr >> 5])) != 0;
89535 +}
89536 +
89537 +static __inline__ int synch_var_test_bit(int nr, volatile void * addr)
89538 +{
89539 + int oldbit;
89540 + __asm__ __volatile__ (
89541 + "btl %2,%1\n\tsbbl %0,%0"
89542 + : "=r" (oldbit) : "m" (ADDR), "Ir" (nr) );
89543 + return oldbit;
89544 +}
89545 +
89546 +#define synch_test_bit(nr,addr) \
89547 +(__builtin_constant_p(nr) ? \
89548 + synch_const_test_bit((nr),(addr)) : \
89549 + synch_var_test_bit((nr),(addr)))
89550 +
89551 +#define synch_cmpxchg_subword synch_cmpxchg
89552 +
89553 +#endif /* __XEN_SYNCH_BITOPS_H__ */
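
The synch_* operations above always use a lock-prefixed read-modify-write so that bit updates are observed atomically by the hypervisor and by other guests even on UP builds. The following standalone sketch reproduces the test-and-set / test-and-clear semantics with GCC __atomic builtins (demo_* names are illustrative):

/* Sketch of synch_test_and_set_bit()/synch_test_and_clear_bit() semantics. */
#include <stdio.h>

#define BITS_PER_LONG (8 * sizeof(unsigned long))

/* Atomically set the bit, report whether it was already set. */
static int demo_test_and_set_bit(int nr, unsigned long *addr)
{
        unsigned long mask = 1UL << (nr % BITS_PER_LONG);
        unsigned long *word = addr + nr / BITS_PER_LONG;
        return (__atomic_fetch_or(word, mask, __ATOMIC_SEQ_CST) & mask) != 0;
}

/* Atomically clear the bit, report whether it was set before. */
static int demo_test_and_clear_bit(int nr, unsigned long *addr)
{
        unsigned long mask = 1UL << (nr % BITS_PER_LONG);
        unsigned long *word = addr + nr / BITS_PER_LONG;
        return (__atomic_fetch_and(word, ~mask, __ATOMIC_SEQ_CST) & mask) != 0;
}

int main(void)
{
        unsigned long flags[2] = { 0, 0 };

        printf("set bit 5:   old=%d\n", demo_test_and_set_bit(5, flags));    /* old=0 */
        printf("set bit 5:   old=%d\n", demo_test_and_set_bit(5, flags));    /* old=1 */
        printf("clear bit 5: old=%d\n", demo_test_and_clear_bit(5, flags));  /* old=1 */
        printf("set bit 70:  old=%d\n", demo_test_and_set_bit(70, flags));   /* old=0, lands in flags[1] */
        return 0;
}
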
89554 diff -Nur linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/system.h linux-2.6.16.33/include/asm-i386/mach-xen/asm/system.h
89555 --- linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/system.h 1970-01-01 00:00:00.000000000 +0000
89556 +++ linux-2.6.16.33/include/asm-i386/mach-xen/asm/system.h 2007-01-08 15:00:46.000000000 +0000
89557 @@ -0,0 +1,681 @@
89558 +#ifndef __ASM_SYSTEM_H
89559 +#define __ASM_SYSTEM_H
89560 +
89561 +#include <linux/config.h>
89562 +#include <linux/kernel.h>
89563 +#include <linux/bitops.h>
89564 +#include <asm/synch_bitops.h>
89565 +#include <asm/segment.h>
89566 +#include <asm/cpufeature.h>
89567 +#include <asm/hypervisor.h>
89568 +#include <asm/smp_alt.h>
89569 +
89570 +#ifdef __KERNEL__
89571 +
89572 +#ifdef CONFIG_SMP
89573 +#define __vcpu_id smp_processor_id()
89574 +#else
89575 +#define __vcpu_id 0
89576 +#endif
89577 +
89578 +struct task_struct; /* one of the stranger aspects of C forward declarations.. */
89579 +extern struct task_struct * FASTCALL(__switch_to(struct task_struct *prev, struct task_struct *next));
89580 +
89581 +#define switch_to(prev,next,last) do { \
89582 + unsigned long esi,edi; \
89583 + asm volatile("pushl %%ebp\n\t" \
89584 + "movl %%esp,%0\n\t" /* save ESP */ \
89585 + "movl %5,%%esp\n\t" /* restore ESP */ \
89586 + "movl $1f,%1\n\t" /* save EIP */ \
89587 + "pushl %6\n\t" /* restore EIP */ \
89588 + "jmp __switch_to\n" \
89589 + "1:\t" \
89590 + "popl %%ebp\n\t" \
89591 + :"=m" (prev->thread.esp),"=m" (prev->thread.eip), \
89592 + "=a" (last),"=S" (esi),"=D" (edi) \
89593 + :"m" (next->thread.esp),"m" (next->thread.eip), \
89594 + "2" (prev), "d" (next)); \
89595 +} while (0)
89596 +
89597 +#define _set_base(addr,base) do { unsigned long __pr; \
89598 +__asm__ __volatile__ ("movw %%dx,%1\n\t" \
89599 + "rorl $16,%%edx\n\t" \
89600 + "movb %%dl,%2\n\t" \
89601 + "movb %%dh,%3" \
89602 + :"=&d" (__pr) \
89603 + :"m" (*((addr)+2)), \
89604 + "m" (*((addr)+4)), \
89605 + "m" (*((addr)+7)), \
89606 + "0" (base) \
89607 + ); } while(0)
89608 +
89609 +#define _set_limit(addr,limit) do { unsigned long __lr; \
89610 +__asm__ __volatile__ ("movw %%dx,%1\n\t" \
89611 + "rorl $16,%%edx\n\t" \
89612 + "movb %2,%%dh\n\t" \
89613 + "andb $0xf0,%%dh\n\t" \
89614 + "orb %%dh,%%dl\n\t" \
89615 + "movb %%dl,%2" \
89616 + :"=&d" (__lr) \
89617 + :"m" (*(addr)), \
89618 + "m" (*((addr)+6)), \
89619 + "0" (limit) \
89620 + ); } while(0)
89621 +
89622 +#define set_base(ldt,base) _set_base( ((char *)&(ldt)) , (base) )
89623 +#define set_limit(ldt,limit) _set_limit( ((char *)&(ldt)) , ((limit)-1) )
89624 +
89625 +/*
89626 + * Load a segment. Fall back on loading the zero
89627 + * segment if something goes wrong..
89628 + */
89629 +#define loadsegment(seg,value) \
89630 + asm volatile("\n" \
89631 + "1:\t" \
89632 + "mov %0,%%" #seg "\n" \
89633 + "2:\n" \
89634 + ".section .fixup,\"ax\"\n" \
89635 + "3:\t" \
89636 + "pushl $0\n\t" \
89637 + "popl %%" #seg "\n\t" \
89638 + "jmp 2b\n" \
89639 + ".previous\n" \
89640 + ".section __ex_table,\"a\"\n\t" \
89641 + ".align 4\n\t" \
89642 + ".long 1b,3b\n" \
89643 + ".previous" \
89644 + : :"rm" (value))
89645 +
89646 +/*
89647 + * Save a segment register away
89648 + */
89649 +#define savesegment(seg, value) \
89650 + asm volatile("mov %%" #seg ",%0":"=rm" (value))
89651 +
89652 +/*
89653 + * Clear and set 'TS' bit respectively
89654 + */
89655 +#define clts() (HYPERVISOR_fpu_taskswitch(0))
89656 +#define read_cr0() ({ \
89657 + unsigned int __dummy; \
89658 + __asm__ __volatile__( \
89659 + "movl %%cr0,%0\n\t" \
89660 + :"=r" (__dummy)); \
89661 + __dummy; \
89662 +})
89663 +#define write_cr0(x) \
89664 + __asm__ __volatile__("movl %0,%%cr0": :"r" (x));
89665 +
89666 +#define read_cr2() \
89667 + (HYPERVISOR_shared_info->vcpu_info[smp_processor_id()].arch.cr2)
89668 +#define write_cr2(x) \
89669 + __asm__ __volatile__("movl %0,%%cr2": :"r" (x));
89670 +
89671 +#define read_cr3() ({ \
89672 + unsigned int __dummy; \
89673 + __asm__ ( \
89674 + "movl %%cr3,%0\n\t" \
89675 + :"=r" (__dummy)); \
89676 + __dummy = xen_cr3_to_pfn(__dummy); \
89677 + mfn_to_pfn(__dummy) << PAGE_SHIFT; \
89678 +})
89679 +#define write_cr3(x) ({ \
89680 + unsigned int __dummy = pfn_to_mfn((x) >> PAGE_SHIFT); \
89681 + __dummy = xen_pfn_to_cr3(__dummy); \
89682 + __asm__ __volatile__("movl %0,%%cr3": :"r" (__dummy)); \
89683 +})
89684 +
89685 +#define read_cr4() ({ \
89686 + unsigned int __dummy; \
89687 + __asm__( \
89688 + "movl %%cr4,%0\n\t" \
89689 + :"=r" (__dummy)); \
89690 + __dummy; \
89691 +})
89692 +
89693 +#define read_cr4_safe() ({ \
89694 + unsigned int __dummy; \
89695 + /* This could fault if %cr4 does not exist */ \
89696 + __asm__("1: movl %%cr4, %0 \n" \
89697 + "2: \n" \
89698 + ".section __ex_table,\"a\" \n" \
89699 + ".long 1b,2b \n" \
89700 + ".previous \n" \
89701 + : "=r" (__dummy): "0" (0)); \
89702 + __dummy; \
89703 +})
89704 +
89705 +#define write_cr4(x) \
89706 + __asm__ __volatile__("movl %0,%%cr4": :"r" (x));
89707 +#define stts() (HYPERVISOR_fpu_taskswitch(1))
89708 +
89709 +#endif /* __KERNEL__ */
89710 +
89711 +#define wbinvd() \
89712 + __asm__ __volatile__ ("wbinvd": : :"memory");
89713 +
89714 +static inline unsigned long get_limit(unsigned long segment)
89715 +{
89716 + unsigned long __limit;
89717 + __asm__("lsll %1,%0"
89718 + :"=r" (__limit):"r" (segment));
89719 + return __limit+1;
89720 +}
89721 +
89722 +#define nop() __asm__ __volatile__ ("nop")
89723 +
89724 +#define xchg(ptr,v) ((__typeof__(*(ptr)))__xchg((unsigned long)(v),(ptr),sizeof(*(ptr))))
89725 +
89726 +#define tas(ptr) (xchg((ptr),1))
89727 +
89728 +struct __xchg_dummy { unsigned long a[100]; };
89729 +#define __xg(x) ((struct __xchg_dummy *)(x))
89730 +
89731 +
89732 +#ifdef CONFIG_X86_CMPXCHG64
89733 +
89734 +/*
89735 + * The semantics of CMPXCHG8B are a bit strange, this is why
89736 + * there is a loop and the loading of %%eax and %%edx has to
89737 + * be inside. This inlines well in most cases, the cached
89738 + * cost is around ~38 cycles. (in the future we might want
89739 + * to do an SIMD/3DNOW!/MMX/FPU 64-bit store here, but that
89740 + * might have an implicit FPU-save as a cost, so it's not
89741 + * clear which path to go.)
89742 + *
89743 + * cmpxchg8b must be used with the lock prefix here to allow
89744 + * the instruction to be executed atomically, see page 3-102
89745 + * of the instruction set reference 24319102.pdf. We need
89746 + * the reader side to see the coherent 64bit value.
89747 + */
89748 +static inline void __set_64bit (unsigned long long * ptr,
89749 + unsigned int low, unsigned int high)
89750 +{
89751 + __asm__ __volatile__ (
89752 + "\n1:\t"
89753 + "movl (%0), %%eax\n\t"
89754 + "movl 4(%0), %%edx\n\t"
89755 + "lock cmpxchg8b (%0)\n\t"
89756 + "jnz 1b"
89757 + : /* no outputs */
89758 + : "D"(ptr),
89759 + "b"(low),
89760 + "c"(high)
89761 + : "ax","dx","memory");
89762 +}
89763 +
89764 +static inline void __set_64bit_constant (unsigned long long *ptr,
89765 + unsigned long long value)
89766 +{
89767 + __set_64bit(ptr,(unsigned int)(value), (unsigned int)((value)>>32ULL));
89768 +}
89769 +#define ll_low(x) *(((unsigned int*)&(x))+0)
89770 +#define ll_high(x) *(((unsigned int*)&(x))+1)
89771 +
89772 +static inline void __set_64bit_var (unsigned long long *ptr,
89773 + unsigned long long value)
89774 +{
89775 + __set_64bit(ptr,ll_low(value), ll_high(value));
89776 +}
89777 +
89778 +#define set_64bit(ptr,value) \
89779 +(__builtin_constant_p(value) ? \
89780 + __set_64bit_constant(ptr, value) : \
89781 + __set_64bit_var(ptr, value) )
89782 +
89783 +#define _set_64bit(ptr,value) \
89784 +(__builtin_constant_p(value) ? \
89785 + __set_64bit(ptr, (unsigned int)(value), (unsigned int)((value)>>32ULL) ) : \
89786 + __set_64bit(ptr, ll_low(value), ll_high(value)) )
89787 +
89788 +#endif
89789 +
89790 +/*
89791 + * Note: no "lock" prefix even on SMP: xchg always implies lock anyway
89792 + * Note 2: xchg has side effect, so that attribute volatile is necessary,
89793 + * but generally the primitive is invalid, *ptr is output argument. --ANK
89794 + */
89795 +static inline unsigned long __xchg(unsigned long x, volatile void * ptr, int size)
89796 +{
89797 + switch (size) {
89798 + case 1:
89799 + __asm__ __volatile__("xchgb %b0,%1"
89800 + :"=q" (x)
89801 + :"m" (*__xg(ptr)), "0" (x)
89802 + :"memory");
89803 + break;
89804 + case 2:
89805 + __asm__ __volatile__("xchgw %w0,%1"
89806 + :"=r" (x)
89807 + :"m" (*__xg(ptr)), "0" (x)
89808 + :"memory");
89809 + break;
89810 + case 4:
89811 + __asm__ __volatile__("xchgl %0,%1"
89812 + :"=r" (x)
89813 + :"m" (*__xg(ptr)), "0" (x)
89814 + :"memory");
89815 + break;
89816 + }
89817 + return x;
89818 +}
89819 +
89820 +/*
89821 + * Atomic compare and exchange. Compare OLD with MEM, if identical,
89822 + * store NEW in MEM. Return the initial value in MEM. Success is
89823 + * indicated by comparing RETURN with OLD.
89824 + */
89825 +
89826 +#ifdef CONFIG_X86_CMPXCHG
89827 +#define __HAVE_ARCH_CMPXCHG 1
89828 +#define cmpxchg(ptr,o,n)\
89829 + ((__typeof__(*(ptr)))__cmpxchg((ptr),(unsigned long)(o),\
89830 + (unsigned long)(n),sizeof(*(ptr))))
89831 +#endif
89832 +
89833 +static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old,
89834 + unsigned long new, int size)
89835 +{
89836 + unsigned long prev;
89837 + switch (size) {
89838 + case 1:
89839 + __asm__ __volatile__(LOCK "cmpxchgb %b1,%2"
89840 + : "=a"(prev)
89841 + : "q"(new), "m"(*__xg(ptr)), "0"(old)
89842 + : "memory");
89843 + return prev;
89844 + case 2:
89845 + __asm__ __volatile__(LOCK "cmpxchgw %w1,%2"
89846 + : "=a"(prev)
89847 + : "r"(new), "m"(*__xg(ptr)), "0"(old)
89848 + : "memory");
89849 + return prev;
89850 + case 4:
89851 + __asm__ __volatile__(LOCK "cmpxchgl %1,%2"
89852 + : "=a"(prev)
89853 + : "r"(new), "m"(*__xg(ptr)), "0"(old)
89854 + : "memory");
89855 + return prev;
89856 + }
89857 + return old;
89858 +}
89859 +
89860 +#ifndef CONFIG_X86_CMPXCHG
89861 +/*
89862 + * Building a kernel capable of running on 80386. It may be necessary to
89863 + * simulate the cmpxchg on the 80386 CPU. For that purpose we define
89864 + * a function for each of the sizes we support.
89865 + */
89866 +
89867 +extern unsigned long cmpxchg_386_u8(volatile void *, u8, u8);
89868 +extern unsigned long cmpxchg_386_u16(volatile void *, u16, u16);
89869 +extern unsigned long cmpxchg_386_u32(volatile void *, u32, u32);
89870 +
89871 +static inline unsigned long cmpxchg_386(volatile void *ptr, unsigned long old,
89872 + unsigned long new, int size)
89873 +{
89874 + switch (size) {
89875 + case 1:
89876 + return cmpxchg_386_u8(ptr, old, new);
89877 + case 2:
89878 + return cmpxchg_386_u16(ptr, old, new);
89879 + case 4:
89880 + return cmpxchg_386_u32(ptr, old, new);
89881 + }
89882 + return old;
89883 +}
89884 +
89885 +#define cmpxchg(ptr,o,n) \
89886 +({ \
89887 + __typeof__(*(ptr)) __ret; \
89888 + if (likely(boot_cpu_data.x86 > 3)) \
89889 + __ret = __cmpxchg((ptr), (unsigned long)(o), \
89890 + (unsigned long)(n), sizeof(*(ptr))); \
89891 + else \
89892 + __ret = cmpxchg_386((ptr), (unsigned long)(o), \
89893 + (unsigned long)(n), sizeof(*(ptr))); \
89894 + __ret; \
89895 +})
89896 +#endif
89897 +
89898 +#ifdef CONFIG_X86_CMPXCHG64
89899 +
89900 +static inline unsigned long long __cmpxchg64(volatile void *ptr, unsigned long long old,
89901 + unsigned long long new)
89902 +{
89903 + unsigned long long prev;
89904 + __asm__ __volatile__(LOCK "cmpxchg8b %3"
89905 + : "=A"(prev)
89906 + : "b"((unsigned long)new),
89907 + "c"((unsigned long)(new >> 32)),
89908 + "m"(*__xg(ptr)),
89909 + "0"(old)
89910 + : "memory");
89911 + return prev;
89912 +}
89913 +
89914 +#define cmpxchg64(ptr,o,n)\
89915 + ((__typeof__(*(ptr)))__cmpxchg64((ptr),(unsigned long long)(o),\
89916 + (unsigned long long)(n)))
89917 +
89918 +#endif
89919 +
89920 +#ifdef __KERNEL__
89921 +struct alt_instr {
89922 + __u8 *instr; /* original instruction */
89923 + __u8 *replacement;
89924 + __u8 cpuid; /* cpuid bit set for replacement */
89925 + __u8 instrlen; /* length of original instruction */
89926 + __u8 replacementlen; /* length of new instruction, <= instrlen */
89927 + __u8 pad;
89928 +};
89929 +#endif
89930 +
89931 +/*
89932 + * Alternative instructions for different CPU types or capabilities.
89933 + *
89934 + * This allows the use of optimized instructions even on generic binary
89935 + * kernels.
89936 + *
89937 + * The length of oldinstr must be greater than or equal to the length of newinstr.
89938 + * It can be padded with nops as needed.
89939 + *
89940 + * For non barrier like inlines please define new variants
89941 + * without volatile and memory clobber.
89942 + */
89943 +#define alternative(oldinstr, newinstr, feature) \
89944 + asm volatile ("661:\n\t" oldinstr "\n662:\n" \
89945 + ".section .altinstructions,\"a\"\n" \
89946 + " .align 4\n" \
89947 + " .long 661b\n" /* label */ \
89948 + " .long 663f\n" /* new instruction */ \
89949 + " .byte %c0\n" /* feature bit */ \
89950 + " .byte 662b-661b\n" /* sourcelen */ \
89951 + " .byte 664f-663f\n" /* replacementlen */ \
89952 + ".previous\n" \
89953 + ".section .altinstr_replacement,\"ax\"\n" \
89954 + "663:\n\t" newinstr "\n664:\n" /* replacement */ \
89955 + ".previous" :: "i" (feature) : "memory")
89956 +
89957 +/*
89958 + * Alternative inline assembly with input.
89959 + *
89960 + * Peculiarities:
89961 + * No memory clobber here.
89962 + * Argument numbers start with 1.
89963 + * Best is to use constraints that are fixed size (like (%1) ... "r")
89964 + * If you use variable sized constraints like "m" or "g" in the
89965 + * replacement make sure to pad to the worst case length.
89966 + */
89967 +#define alternative_input(oldinstr, newinstr, feature, input...) \
89968 + asm volatile ("661:\n\t" oldinstr "\n662:\n" \
89969 + ".section .altinstructions,\"a\"\n" \
89970 + " .align 4\n" \
89971 + " .long 661b\n" /* label */ \
89972 + " .long 663f\n" /* new instruction */ \
89973 + " .byte %c0\n" /* feature bit */ \
89974 + " .byte 662b-661b\n" /* sourcelen */ \
89975 + " .byte 664f-663f\n" /* replacementlen */ \
89976 + ".previous\n" \
89977 + ".section .altinstr_replacement,\"ax\"\n" \
89978 + "663:\n\t" newinstr "\n664:\n" /* replacement */ \
89979 + ".previous" :: "i" (feature), ##input)
89980 +
89981 +/*
89982 + * Force strict CPU ordering.
89983 + * And yes, this is required on UP too when we're talking
89984 + * to devices.
89985 + *
89986 + * For now, "wmb()" doesn't actually do anything, as all
89987 + * Intel CPU's follow what Intel calls a *Processor Order*,
89988 + * in which all writes are seen in the program order even
89989 + * outside the CPU.
89990 + *
89991 + * I expect future Intel CPU's to have a weaker ordering,
89992 + * but I'd also expect them to finally get their act together
89993 + * and add some real memory barriers if so.
89994 + *
89995 + * Some non intel clones support out of order store. wmb() ceases to be a
89996 + * nop for these.
89997 + */
89998 +
89999 +
90000 +/*
90001 + * Actually only lfence would be needed for mb() because all stores done
90002 + * by the kernel should be already ordered. But keep a full barrier for now.
90003 + */
90004 +
90005 +#define mb() alternative("lock; addl $0,0(%%esp)", "mfence", X86_FEATURE_XMM2)
90006 +#define rmb() alternative("lock; addl $0,0(%%esp)", "lfence", X86_FEATURE_XMM2)
90007 +
90008 +/**
90009 + * read_barrier_depends - Flush all pending reads that subsequent reads
90010 + * depend on.
90011 + *
90012 + * No data-dependent reads from memory-like regions are ever reordered
90013 + * over this barrier. All reads preceding this primitive are guaranteed
90014 + * to access memory (but not necessarily other CPUs' caches) before any
90015 + * reads following this primitive that depend on the data returned by
90016 + * any of the preceding reads. This primitive is much lighter weight than
90017 + * rmb() on most CPUs, and is never heavier weight than is
90018 + * rmb().
90019 + *
90020 + * These ordering constraints are respected by both the local CPU
90021 + * and the compiler.
90022 + *
90023 + * Ordering is not guaranteed by anything other than these primitives,
90024 + * not even by data dependencies. See the documentation for
90025 + * memory_barrier() for examples and URLs to more information.
90026 + *
90027 + * For example, the following code would force ordering (the initial
90028 + * value of "a" is zero, "b" is one, and "p" is "&a"):
90029 + *
90030 + * <programlisting>
90031 + * CPU 0 CPU 1
90032 + *
90033 + * b = 2;
90034 + * memory_barrier();
90035 + * p = &b; q = p;
90036 + * read_barrier_depends();
90037 + * d = *q;
90038 + * </programlisting>
90039 + *
90040 + * because the read of "*q" depends on the read of "p" and these
90041 + * two reads are separated by a read_barrier_depends(). However,
90042 + * the following code, with the same initial values for "a" and "b":
90043 + *
90044 + * <programlisting>
90045 + * CPU 0 CPU 1
90046 + *
90047 + * a = 2;
90048 + * memory_barrier();
90049 + * b = 3; y = b;
90050 + * read_barrier_depends();
90051 + * x = a;
90052 + * </programlisting>
90053 + *
90054 + * does not enforce ordering, since there is no data dependency between
90055 + * the read of "a" and the read of "b". Therefore, on some CPUs, such
90056 + * as Alpha, "y" could be set to 3 and "x" to 0. Use rmb()
90057 + * in cases like this where there are no data dependencies.
90058 + **/
90059 +
90060 +#define read_barrier_depends() do { } while(0)
90061 +
90062 +#ifdef CONFIG_X86_OOSTORE
90063 +/* Actually there are no OOO store capable CPUs for now that do SSE,
90064 + but make it already a possibility. */
90065 +#define wmb() alternative("lock; addl $0,0(%%esp)", "sfence", X86_FEATURE_XMM)
90066 +#else
90067 +#define wmb() __asm__ __volatile__ ("": : :"memory")
90068 +#endif
90069 +
90070 +#ifdef CONFIG_SMP
90071 +#define smp_wmb() wmb()
90072 +#if defined(CONFIG_SMP_ALTERNATIVES) && !defined(MODULE)
90073 +#define smp_alt_mb(instr) \
90074 +__asm__ __volatile__("6667:\nnop\nnop\nnop\nnop\nnop\nnop\n6668:\n" \
90075 + ".section __smp_alternatives,\"a\"\n" \
90076 + ".long 6667b\n" \
90077 + ".long 6673f\n" \
90078 + ".previous\n" \
90079 + ".section __smp_replacements,\"a\"\n" \
90080 + "6673:.byte 6668b-6667b\n" \
90081 + ".byte 6670f-6669f\n" \
90082 + ".byte 6671f-6670f\n" \
90083 + ".byte 0\n" \
90084 + ".byte %c0\n" \
90085 + "6669:lock;addl $0,0(%%esp)\n" \
90086 + "6670:" instr "\n" \
90087 + "6671:\n" \
90088 + ".previous\n" \
90089 + : \
90090 + : "i" (X86_FEATURE_XMM2) \
90091 + : "memory")
90092 +#define smp_rmb() smp_alt_mb("lfence")
90093 +#define smp_mb() smp_alt_mb("mfence")
90094 +#define set_mb(var, value) do { \
90095 +unsigned long __set_mb_temp; \
90096 +__asm__ __volatile__("6667:movl %1, %0\n6668:\n" \
90097 + ".section __smp_alternatives,\"a\"\n" \
90098 + ".long 6667b\n" \
90099 + ".long 6673f\n" \
90100 + ".previous\n" \
90101 + ".section __smp_replacements,\"a\"\n" \
90102 + "6673: .byte 6668b-6667b\n" \
90103 + ".byte 6670f-6669f\n" \
90104 + ".byte 0\n" \
90105 + ".byte 6671f-6670f\n" \
90106 + ".byte -1\n" \
90107 + "6669: xchg %1, %0\n" \
90108 + "6670:movl %1, %0\n" \
90109 + "6671:\n" \
90110 + ".previous\n" \
90111 + : "=m" (var), "=r" (__set_mb_temp) \
90112 + : "1" (value) \
90113 + : "memory"); } while (0)
90114 +#else
90115 +#define smp_rmb() rmb()
90116 +#define smp_mb() mb()
90117 +#define set_mb(var, value) do { (void) xchg(&var, value); } while (0)
90118 +#endif
90119 +#define smp_read_barrier_depends() read_barrier_depends()
90120 +#else
90121 +#define smp_mb() barrier()
90122 +#define smp_rmb() barrier()
90123 +#define smp_wmb() barrier()
90124 +#define smp_read_barrier_depends() do { } while(0)
90125 +#define set_mb(var, value) do { var = value; barrier(); } while (0)
90126 +#endif
90127 +
90128 +#define set_wmb(var, value) do { var = value; wmb(); } while (0)
90129 +
90130 +/* interrupt control.. */
90131 +
90132 +/*
90133 + * The use of 'barrier' in the following reflects their use as local-lock
90134 + * operations. Reentrancy must be prevented (e.g., __cli()) /before/ following
90135 + * critical operations are executed. All critical operations must complete
90136 + * /before/ reentrancy is permitted (e.g., __sti()). Alpha architecture also
90137 + * includes these barriers, for example.
90138 + */
90139 +
90140 +#define __cli() \
90141 +do { \
90142 + vcpu_info_t *_vcpu; \
90143 + preempt_disable(); \
90144 + _vcpu = &HYPERVISOR_shared_info->vcpu_info[__vcpu_id]; \
90145 + _vcpu->evtchn_upcall_mask = 1; \
90146 + preempt_enable_no_resched(); \
90147 + barrier(); \
90148 +} while (0)
90149 +
90150 +#define __sti() \
90151 +do { \
90152 + vcpu_info_t *_vcpu; \
90153 + barrier(); \
90154 + preempt_disable(); \
90155 + _vcpu = &HYPERVISOR_shared_info->vcpu_info[__vcpu_id]; \
90156 + _vcpu->evtchn_upcall_mask = 0; \
90157 + barrier(); /* unmask then check (avoid races) */ \
90158 + if (unlikely(_vcpu->evtchn_upcall_pending)) \
90159 + force_evtchn_callback(); \
90160 + preempt_enable(); \
90161 +} while (0)
90162 +
90163 +#define __save_flags(x) \
90164 +do { \
90165 + vcpu_info_t *_vcpu; \
90166 + preempt_disable(); \
90167 + _vcpu = &HYPERVISOR_shared_info->vcpu_info[__vcpu_id]; \
90168 + (x) = _vcpu->evtchn_upcall_mask; \
90169 + preempt_enable(); \
90170 +} while (0)
90171 +
90172 +#define __restore_flags(x) \
90173 +do { \
90174 + vcpu_info_t *_vcpu; \
90175 + barrier(); \
90176 + preempt_disable(); \
90177 + _vcpu = &HYPERVISOR_shared_info->vcpu_info[__vcpu_id]; \
90178 + if ((_vcpu->evtchn_upcall_mask = (x)) == 0) { \
90179 + barrier(); /* unmask then check (avoid races) */ \
90180 + if (unlikely(_vcpu->evtchn_upcall_pending)) \
90181 + force_evtchn_callback(); \
90182 + preempt_enable(); \
90183 + } else \
90184 + preempt_enable_no_resched(); \
90185 +} while (0)
90186 +
90187 +void safe_halt(void);
90188 +void halt(void);
90189 +
90190 +#define __save_and_cli(x) \
90191 +do { \
90192 + vcpu_info_t *_vcpu; \
90193 + preempt_disable(); \
90194 + _vcpu = &HYPERVISOR_shared_info->vcpu_info[__vcpu_id]; \
90195 + (x) = _vcpu->evtchn_upcall_mask; \
90196 + _vcpu->evtchn_upcall_mask = 1; \
90197 + preempt_enable_no_resched(); \
90198 + barrier(); \
90199 +} while (0)
90200 +
90201 +#define local_irq_save(x) __save_and_cli(x)
90202 +#define local_irq_restore(x) __restore_flags(x)
90203 +#define local_save_flags(x) __save_flags(x)
90204 +#define local_irq_disable() __cli()
90205 +#define local_irq_enable() __sti()
90206 +
90207 +/* Cannot use preempt_enable() here as we would recurse in preempt_sched(). */
90208 +#define irqs_disabled() \
90209 +({ int ___x; \
90210 + vcpu_info_t *_vcpu; \
90211 + preempt_disable(); \
90212 + _vcpu = &HYPERVISOR_shared_info->vcpu_info[__vcpu_id]; \
90213 + ___x = (_vcpu->evtchn_upcall_mask != 0); \
90214 + preempt_enable_no_resched(); \
90215 + ___x; })
90216 +
90217 +/*
90218 + * disable hlt during certain critical i/o operations
90219 + */
90220 +#define HAVE_DISABLE_HLT
90221 +void disable_hlt(void);
90222 +void enable_hlt(void);
90223 +
90224 +extern int es7000_plat;
90225 +void cpu_idle_wait(void);
90226 +
90227 +/*
90228 + * On SMP systems, when the scheduler does migration-cost autodetection,
90229 + * it needs a way to flush as much of the CPU's caches as possible:
90230 + */
90231 +static inline void sched_cacheflush(void)
90232 +{
90233 + wbinvd();
90234 +}
90235 +
90236 +extern unsigned long arch_align_stack(unsigned long sp);
90237 +
90238 +#endif
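
The cmpxchg contract documented in the header above is: compare OLD with *MEM, store NEW only if they match, and always return the value that was in memory, so success is detected by comparing the return value with OLD. A small userspace illustration of that contract and of the usual retry loop built on it, using __sync_val_compare_and_swap rather than the kernel macro:

/* Minimal illustration of the cmpxchg() contract.  Not the kernel macro. */
#include <stdio.h>

static unsigned long demo_cmpxchg(unsigned long *ptr, unsigned long old, unsigned long new_val)
{
        return __sync_val_compare_and_swap(ptr, old, new_val);
}

/* Typical usage pattern: retry until the swap hits the value we last read. */
static void atomic_add_via_cmpxchg(unsigned long *ptr, unsigned long delta)
{
        unsigned long seen, prev;

        do {
                seen = *ptr;
                prev = demo_cmpxchg(ptr, seen, seen + delta);
        } while (prev != seen);      /* somebody else won the race: retry */
}

int main(void)
{
        unsigned long v = 40;

        atomic_add_via_cmpxchg(&v, 2);
        printf("v = %lu\n", v);                              /* 42 */
        printf("cmpxchg(&v, 0, 7) returned %lu, v = %lu\n",  /* returns 42, v unchanged */
               demo_cmpxchg(&v, 0, 7), v);
        return 0;
}
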
90239 diff -Nur linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/tlbflush.h linux-2.6.16.33/include/asm-i386/mach-xen/asm/tlbflush.h
90240 --- linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/tlbflush.h 1970-01-01 00:00:00.000000000 +0000
90241 +++ linux-2.6.16.33/include/asm-i386/mach-xen/asm/tlbflush.h 2007-01-08 15:00:46.000000000 +0000
90242 @@ -0,0 +1,102 @@
90243 +#ifndef _I386_TLBFLUSH_H
90244 +#define _I386_TLBFLUSH_H
90245 +
90246 +#include <linux/config.h>
90247 +#include <linux/mm.h>
90248 +#include <asm/processor.h>
90249 +
90250 +#define __flush_tlb() xen_tlb_flush()
90251 +#define __flush_tlb_global() xen_tlb_flush()
90252 +#define __flush_tlb_all() xen_tlb_flush()
90253 +
90254 +extern unsigned long pgkern_mask;
90255 +
90256 +#define cpu_has_invlpg (boot_cpu_data.x86 > 3)
90257 +
90258 +#define __flush_tlb_single(addr) xen_invlpg(addr)
90259 +
90260 +#define __flush_tlb_one(addr) __flush_tlb_single(addr)
90261 +
90262 +/*
90263 + * TLB flushing:
90264 + *
90265 + * - flush_tlb() flushes the current mm struct TLBs
90266 + * - flush_tlb_all() flushes all processes TLBs
90267 + * - flush_tlb_mm(mm) flushes the specified mm context TLB's
90268 + * - flush_tlb_page(vma, vmaddr) flushes one page
90269 + * - flush_tlb_range(vma, start, end) flushes a range of pages
90270 + * - flush_tlb_kernel_range(start, end) flushes a range of kernel pages
90271 + * - flush_tlb_pgtables(mm, start, end) flushes a range of page tables
90272 + *
90273 + * ..but the i386 has somewhat limited tlb flushing capabilities,
90274 + * and page-granular flushes are available only on i486 and up.
90275 + */
90276 +
90277 +#ifndef CONFIG_SMP
90278 +
90279 +#define flush_tlb() __flush_tlb()
90280 +#define flush_tlb_all() __flush_tlb_all()
90281 +#define local_flush_tlb() __flush_tlb()
90282 +
90283 +static inline void flush_tlb_mm(struct mm_struct *mm)
90284 +{
90285 + if (mm == current->active_mm)
90286 + __flush_tlb();
90287 +}
90288 +
90289 +static inline void flush_tlb_page(struct vm_area_struct *vma,
90290 + unsigned long addr)
90291 +{
90292 + if (vma->vm_mm == current->active_mm)
90293 + __flush_tlb_one(addr);
90294 +}
90295 +
90296 +static inline void flush_tlb_range(struct vm_area_struct *vma,
90297 + unsigned long start, unsigned long end)
90298 +{
90299 + if (vma->vm_mm == current->active_mm)
90300 + __flush_tlb();
90301 +}
90302 +
90303 +#else
90304 +
90305 +#include <asm/smp.h>
90306 +
90307 +#define local_flush_tlb() \
90308 + __flush_tlb()
90309 +
90310 +extern void flush_tlb_all(void);
90311 +extern void flush_tlb_current_task(void);
90312 +extern void flush_tlb_mm(struct mm_struct *);
90313 +extern void flush_tlb_page(struct vm_area_struct *, unsigned long);
90314 +
90315 +#define flush_tlb() flush_tlb_current_task()
90316 +
90317 +static inline void flush_tlb_range(struct vm_area_struct * vma, unsigned long start, unsigned long end)
90318 +{
90319 + flush_tlb_mm(vma->vm_mm);
90320 +}
90321 +
90322 +#define TLBSTATE_OK 1
90323 +#define TLBSTATE_LAZY 2
90324 +
90325 +struct tlb_state
90326 +{
90327 + struct mm_struct *active_mm;
90328 + int state;
90329 + char __cacheline_padding[L1_CACHE_BYTES-8];
90330 +};
90331 +DECLARE_PER_CPU(struct tlb_state, cpu_tlbstate);
90332 +
90333 +
90334 +#endif
90335 +
90336 +#define flush_tlb_kernel_range(start, end) flush_tlb_all()
90337 +
90338 +static inline void flush_tlb_pgtables(struct mm_struct *mm,
90339 + unsigned long start, unsigned long end)
90340 +{
90341 + /* i386 does not keep any page table caches in TLB */
90342 +}
90343 +
90344 +#endif /* _I386_TLBFLUSH_H */
90345 diff -Nur linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/vga.h linux-2.6.16.33/include/asm-i386/mach-xen/asm/vga.h
90346 --- linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/vga.h 1970-01-01 00:00:00.000000000 +0000
90347 +++ linux-2.6.16.33/include/asm-i386/mach-xen/asm/vga.h 2007-01-08 15:00:46.000000000 +0000
90348 @@ -0,0 +1,20 @@
90349 +/*
90350 + * Access to VGA videoram
90351 + *
90352 + * (c) 1998 Martin Mares <mj@ucw.cz>
90353 + */
90354 +
90355 +#ifndef _LINUX_ASM_VGA_H_
90356 +#define _LINUX_ASM_VGA_H_
90357 +
90358 +/*
90359 + * On the PC, we can just recalculate addresses and then
90360 + * access the videoram directly without any black magic.
90361 + */
90362 +
90363 +#define VGA_MAP_MEM(x) (unsigned long)isa_bus_to_virt(x)
90364 +
90365 +#define vga_readb(x) (*(x))
90366 +#define vga_writeb(x,y) (*(y) = (x))
90367 +
90368 +#endif
90369 diff -Nur linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/xenoprof.h linux-2.6.16.33/include/asm-i386/mach-xen/asm/xenoprof.h
90370 --- linux-2.6.16.33-noxen/include/asm-i386/mach-xen/asm/xenoprof.h 1970-01-01 00:00:00.000000000 +0000
90371 +++ linux-2.6.16.33/include/asm-i386/mach-xen/asm/xenoprof.h 2007-01-08 15:00:46.000000000 +0000
90372 @@ -0,0 +1,48 @@
90373 +/******************************************************************************
90374 + * asm-i386/mach-xen/asm/xenoprof.h
90375 + *
90376 + * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp>
90377 + * VA Linux Systems Japan K.K.
90378 + *
90379 + * This program is free software; you can redistribute it and/or modify
90380 + * it under the terms of the GNU General Public License as published by
90381 + * the Free Software Foundation; either version 2 of the License, or
90382 + * (at your option) any later version.
90383 + *
90384 + * This program is distributed in the hope that it will be useful,
90385 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
90386 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
90387 + * GNU General Public License for more details.
90388 + *
90389 + * You should have received a copy of the GNU General Public License
90390 + * along with this program; if not, write to the Free Software
90391 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
90392 + *
90393 + */
90394 +#ifndef __ASM_XENOPROF_H__
90395 +#define __ASM_XENOPROF_H__
90396 +#ifdef CONFIG_XEN
90397 +
90398 +struct super_block;
90399 +struct dentry;
90400 +int xenoprof_create_files(struct super_block * sb, struct dentry * root);
90401 +#define HAVE_XENOPROF_CREATE_FILES
90402 +
90403 +struct xenoprof_init;
90404 +void xenoprof_arch_init_counter(struct xenoprof_init *init);
90405 +void xenoprof_arch_counter(void);
90406 +void xenoprof_arch_start(void);
90407 +void xenoprof_arch_stop(void);
90408 +
90409 +struct xenoprof_arch_shared_buffer {
90410 + /* nothing */
90411 +};
90412 +struct xenoprof_shared_buffer;
90413 +void xenoprof_arch_unmap_shared_buffer(struct xenoprof_shared_buffer* sbuf);
90414 +struct xenoprof_get_buffer;
90415 +int xenoprof_arch_map_shared_buffer(struct xenoprof_get_buffer* get_buffer, struct xenoprof_shared_buffer* sbuf);
90416 +struct xenoprof_passive;
90417 +int xenoprof_arch_set_passive(struct xenoprof_passive* pdomain, struct xenoprof_shared_buffer* sbuf);
90418 +
90419 +#endif /* CONFIG_XEN */
90420 +#endif /* __ASM_XENOPROF_H__ */
90421 diff -Nur linux-2.6.16.33-noxen/include/asm-i386/mach-xen/irq_vectors.h linux-2.6.16.33/include/asm-i386/mach-xen/irq_vectors.h
90422 --- linux-2.6.16.33-noxen/include/asm-i386/mach-xen/irq_vectors.h 1970-01-01 00:00:00.000000000 +0000
90423 +++ linux-2.6.16.33/include/asm-i386/mach-xen/irq_vectors.h 2007-01-08 15:00:46.000000000 +0000
90424 @@ -0,0 +1,125 @@
90425 +/*
90426 + * This file should contain #defines for all of the interrupt vector
90427 + * numbers used by this architecture.
90428 + *
90429 + * In addition, there are some standard defines:
90430 + *
90431 + * FIRST_EXTERNAL_VECTOR:
90432 + * The first free place for external interrupts
90433 + *
90434 + * SYSCALL_VECTOR:
90435 + * The IRQ vector a syscall makes the user to kernel transition
90436 + * under.
90437 + *
90438 + * TIMER_IRQ:
90439 + * The IRQ number the timer interrupt comes in at.
90440 + *
90441 + * NR_IRQS:
90442 + * The total number of interrupt vectors (including all the
90443 + * architecture specific interrupts) needed.
90444 + *
90445 + */
90446 +#ifndef _ASM_IRQ_VECTORS_H
90447 +#define _ASM_IRQ_VECTORS_H
90448 +
90449 +/*
90450 + * IDT vectors usable for external interrupt sources start
90451 + * at 0x20:
90452 + */
90453 +#define FIRST_EXTERNAL_VECTOR 0x20
90454 +
90455 +#define SYSCALL_VECTOR 0x80
90456 +
90457 +/*
90458 + * Vectors 0x20-0x2f are used for ISA interrupts.
90459 + */
90460 +
90461 +#if 0
90462 +/*
90463 + * Special IRQ vectors used by the SMP architecture, 0xf0-0xff
90464 + *
90465 + * some of the following vectors are 'rare', they are merged
90466 + * into a single vector (CALL_FUNCTION_VECTOR) to save vector space.
90467 + * TLB, reschedule and local APIC vectors are performance-critical.
90468 + *
90469 + * Vectors 0xf0-0xfa are free (reserved for future Linux use).
90470 + */
90471 +#define SPURIOUS_APIC_VECTOR 0xff
90472 +#define ERROR_APIC_VECTOR 0xfe
90473 +#define INVALIDATE_TLB_VECTOR 0xfd
90474 +#define RESCHEDULE_VECTOR 0xfc
90475 +#define CALL_FUNCTION_VECTOR 0xfb
90476 +
90477 +#define THERMAL_APIC_VECTOR 0xf0
90478 +/*
90479 + * Local APIC timer IRQ vector is on a different priority level,
90480 + * to work around the 'lost local interrupt if more than 2 IRQ
90481 + * sources per level' errata.
90482 + */
90483 +#define LOCAL_TIMER_VECTOR 0xef
90484 +#endif
90485 +
90486 +#define SPURIOUS_APIC_VECTOR 0xff
90487 +#define ERROR_APIC_VECTOR 0xfe
90488 +
90489 +/*
90490 + * First APIC vector available to drivers: (vectors 0x30-0xee)
90491 + * we start at 0x31 to spread out vectors evenly between priority
90492 + * levels. (0x80 is the syscall vector)
90493 + */
90494 +#define FIRST_DEVICE_VECTOR 0x31
90495 +#define FIRST_SYSTEM_VECTOR 0xef
90496 +
90497 +/*
90498 + * 16 8259A IRQ's, 208 potential APIC interrupt sources.
90499 + * Right now the APIC is mostly only used for SMP.
90500 + * 256 vectors is an architectural limit. (we can have
90501 + * more than 256 devices theoretically, but they will
90502 + * have to use shared interrupts)
90503 + * Since vectors 0x00-0x1f are used/reserved for the CPU,
90504 + * the usable vector space is 0x20-0xff (224 vectors)
90505 + */
90506 +
90507 +#define RESCHEDULE_VECTOR 0
90508 +#define CALL_FUNCTION_VECTOR 1
90509 +#define NR_IPIS 2
90510 +
90511 +/*
90512 + * The maximum number of vectors supported by i386 processors
90513 + * is limited to 256. For processors other than i386, NR_VECTORS
90514 + * should be changed accordingly.
90515 + */
90516 +#define NR_VECTORS 256
90517 +
90518 +#define FPU_IRQ 13
90519 +
90520 +#define FIRST_VM86_IRQ 3
90521 +#define LAST_VM86_IRQ 15
90522 +#define invalid_vm86_irq(irq) ((irq) < 3 || (irq) > 15)
90523 +
90524 +/*
90525 + * The flat IRQ space is divided into two regions:
90526 + * 1. A one-to-one mapping of real physical IRQs. This space is only used
90527 + * if we have physical device-access privilege. This region is at the
90528 + * start of the IRQ space so that existing device drivers do not need
90529 + * to be modified to translate physical IRQ numbers into our IRQ space.
90530 + * 2. A dynamic mapping of inter-domain and Xen-sourced virtual IRQs. These
90531 + * are bound using the provided bind/unbind functions.
90532 + */
90533 +
90534 +#define PIRQ_BASE 0
90535 +#define NR_PIRQS 256
90536 +
90537 +#define DYNIRQ_BASE (PIRQ_BASE + NR_PIRQS)
90538 +#define NR_DYNIRQS 256
90539 +
90540 +#define NR_IRQS (NR_PIRQS + NR_DYNIRQS)
90541 +#define NR_IRQ_VECTORS NR_IRQS
90542 +
90543 +#define pirq_to_irq(_x) ((_x) + PIRQ_BASE)
90544 +#define irq_to_pirq(_x) ((_x) - PIRQ_BASE)
90545 +
90546 +#define dynirq_to_irq(_x) ((_x) + DYNIRQ_BASE)
90547 +#define irq_to_dynirq(_x) ((_x) - DYNIRQ_BASE)
90548 +
90549 +#endif /* _ASM_IRQ_VECTORS_H */
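
The flat IRQ space defined above simply concatenates two regions: 256 physical IRQs starting at PIRQ_BASE, then 256 dynamic (event-channel) IRQs starting at DYNIRQ_BASE, with the *_to_irq macros doing plain offset arithmetic. A quick standalone check of that layout using the same constants:

/* Quick check of the flat IRQ-space layout defined above.  Illustrative. */
#include <stdio.h>

#define PIRQ_BASE    0
#define NR_PIRQS     256
#define DYNIRQ_BASE  (PIRQ_BASE + NR_PIRQS)
#define NR_DYNIRQS   256

#define pirq_to_irq(x)    ((x) + PIRQ_BASE)
#define dynirq_to_irq(x)  ((x) + DYNIRQ_BASE)
#define irq_to_dynirq(x)  ((x) - DYNIRQ_BASE)

int main(void)
{
        /* Physical IRQ 14 (IDE) keeps its familiar number... */
        printf("pirq 14   -> irq %d\n", pirq_to_irq(14));        /* 14 */
        /* ...while the first Xen event-channel IRQ lands just past the PIRQ region. */
        printf("dynirq 0  -> irq %d\n", dynirq_to_irq(0));       /* 256 */
        printf("irq 300   -> dynirq %d\n", irq_to_dynirq(300));  /* 44 */
        printf("NR_IRQS   =  %d\n", NR_PIRQS + NR_DYNIRQS);      /* 512 */
        return 0;
}
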
90550 diff -Nur linux-2.6.16.33-noxen/include/asm-i386/mach-xen/mach_traps.h linux-2.6.16.33/include/asm-i386/mach-xen/mach_traps.h
90551 --- linux-2.6.16.33-noxen/include/asm-i386/mach-xen/mach_traps.h 1970-01-01 00:00:00.000000000 +0000
90552 +++ linux-2.6.16.33/include/asm-i386/mach-xen/mach_traps.h 2007-01-08 15:00:46.000000000 +0000
90553 @@ -0,0 +1,33 @@
90554 +/*
90555 + * include/asm-xen/asm-i386/mach-xen/mach_traps.h
90556 + *
90557 + * Machine specific NMI handling for Xen
90558 + */
90559 +#ifndef _MACH_TRAPS_H
90560 +#define _MACH_TRAPS_H
90561 +
90562 +#include <linux/bitops.h>
90563 +#include <xen/interface/nmi.h>
90564 +
90565 +static inline void clear_mem_error(unsigned char reason) {}
90566 +static inline void clear_io_check_error(unsigned char reason) {}
90567 +
90568 +static inline unsigned char get_nmi_reason(void)
90569 +{
90570 + shared_info_t *s = HYPERVISOR_shared_info;
90571 + unsigned char reason = 0;
90572 +
90573 + /* construct a value which looks like it came from
90574 + * port 0x61.
90575 + */
90576 + if (test_bit(_XEN_NMIREASON_io_error, &s->arch.nmi_reason))
90577 + reason |= 0x40;
90578 + if (test_bit(_XEN_NMIREASON_parity_error, &s->arch.nmi_reason))
90579 + reason |= 0x80;
90580 +
90581 + return reason;
90582 +}
90583 +
90584 +static inline void reassert_nmi(void) {}
90585 +
90586 +#endif /* !_MACH_TRAPS_H */
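
get_nmi_reason() above synthesises the byte a legacy NMI handler would have read from port 0x61: bit 6 (0x40) reports an I/O check error and bit 7 (0x80) a memory parity error. A standalone sketch of that encoding; the DEMO_* bit positions are assumptions standing in for the _XEN_NMIREASON_* flags defined in xen/interface/nmi.h:

/* Standalone illustration of the "looks like port 0x61" encoding above. */
#include <stdio.h>

#define DEMO_NMIREASON_io_error      0   /* bit positions as used with test_bit(); assumed */
#define DEMO_NMIREASON_parity_error  1

static unsigned char demo_nmi_reason(unsigned long nmi_reason_bits)
{
        unsigned char reason = 0;

        if (nmi_reason_bits & (1UL << DEMO_NMIREASON_io_error))
                reason |= 0x40;
        if (nmi_reason_bits & (1UL << DEMO_NMIREASON_parity_error))
                reason |= 0x80;
        return reason;
}

int main(void)
{
        printf("io error only -> 0x%02x\n", demo_nmi_reason(1UL << DEMO_NMIREASON_io_error));     /* 0x40 */
        printf("parity only   -> 0x%02x\n", demo_nmi_reason(1UL << DEMO_NMIREASON_parity_error)); /* 0x80 */
        printf("both          -> 0x%02x\n", demo_nmi_reason(0x3));                                /* 0xc0 */
        return 0;
}
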
90587 diff -Nur linux-2.6.16.33-noxen/include/asm-i386/mach-xen/setup_arch_post.h linux-2.6.16.33/include/asm-i386/mach-xen/setup_arch_post.h
90588 --- linux-2.6.16.33-noxen/include/asm-i386/mach-xen/setup_arch_post.h 1970-01-01 00:00:00.000000000 +0000
90589 +++ linux-2.6.16.33/include/asm-i386/mach-xen/setup_arch_post.h 2007-01-08 15:00:46.000000000 +0000
90590 @@ -0,0 +1,108 @@
90591 +/**
90592 + * machine_specific_memory_setup - Hook for machine specific memory setup.
90593 + *
90594 + * Description:
90595 + * This is included late in kernel/setup.c so that it can make
90596 + * use of all of the static functions.
90597 + **/
90598 +
90599 +#include <xen/interface/callback.h>
90600 +#include <xen/interface/memory.h>
90601 +
90602 +static char * __init machine_specific_memory_setup(void)
90603 +{
90604 + int rc;
90605 + struct xen_memory_map memmap;
90606 + /*
90607 + * This is rather large for a stack variable but this early in
90608 + * the boot process we know we have plenty of slack space.
90609 + */
90610 + struct e820entry map[E820MAX];
90611 +
90612 + memmap.nr_entries = E820MAX;
90613 + set_xen_guest_handle(memmap.buffer, map);
90614 +
90615 + rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap);
90616 + if ( rc == -ENOSYS ) {
90617 + memmap.nr_entries = 1;
90618 + map[0].addr = 0ULL;
90619 + map[0].size = PFN_PHYS(xen_start_info->nr_pages);
90620 + /* 8MB slack (to balance backend allocations). */
90621 + map[0].size += 8ULL << 20;
90622 + map[0].type = E820_RAM;
90623 + rc = 0;
90624 + }
90625 + BUG_ON(rc);
90626 +
90627 + sanitize_e820_map(map, (char *)&memmap.nr_entries);
90628 +
90629 + BUG_ON(copy_e820_map(map, (char)memmap.nr_entries) < 0);
90630 +
90631 + return "Xen";
90632 +}
90633 +
90634 +extern void hypervisor_callback(void);
90635 +extern void failsafe_callback(void);
90636 +extern void nmi(void);
90637 +
90638 +unsigned long *machine_to_phys_mapping;
90639 +EXPORT_SYMBOL(machine_to_phys_mapping);
90640 +unsigned int machine_to_phys_order;
90641 +EXPORT_SYMBOL(machine_to_phys_order);
90642 +
90643 +static void __init machine_specific_arch_setup(void)
90644 +{
90645 + int ret;
90646 + struct xen_machphys_mapping mapping;
90647 + unsigned long machine_to_phys_nr_ents;
90648 + struct xen_platform_parameters pp;
90649 + static struct callback_register __initdata event = {
90650 + .type = CALLBACKTYPE_event,
90651 + .address = { __KERNEL_CS, (unsigned long)hypervisor_callback },
90652 + };
90653 + static struct callback_register __initdata failsafe = {
90654 + .type = CALLBACKTYPE_failsafe,
90655 + .address = { __KERNEL_CS, (unsigned long)failsafe_callback },
90656 + };
90657 + static struct callback_register __initdata nmi_cb = {
90658 + .type = CALLBACKTYPE_nmi,
90659 + .address = { __KERNEL_CS, (unsigned long)nmi },
90660 + };
90661 +
90662 + ret = HYPERVISOR_callback_op(CALLBACKOP_register, &event);
90663 + if (ret == 0)
90664 + ret = HYPERVISOR_callback_op(CALLBACKOP_register, &failsafe);
90665 +#ifdef CONFIG_XEN_COMPAT_030002
90666 + if (ret == -ENOSYS)
90667 + ret = HYPERVISOR_set_callbacks(
90668 + event.address.cs, event.address.eip,
90669 + failsafe.address.cs, failsafe.address.eip);
90670 +#endif
90671 + BUG_ON(ret);
90672 +
90673 + ret = HYPERVISOR_callback_op(CALLBACKOP_register, &nmi_cb);
90674 +#ifdef CONFIG_XEN_COMPAT_030002
90675 + if (ret == -ENOSYS) {
90676 + static struct xennmi_callback __initdata cb = {
90677 + .handler_address = (unsigned long)nmi
90678 + };
90679 +
90680 + HYPERVISOR_nmi_op(XENNMI_register_callback, &cb);
90681 + }
90682 +#endif
90683 +
90684 + if (HYPERVISOR_xen_version(XENVER_platform_parameters,
90685 + &pp) == 0) {
90686 + hypervisor_virt_start = pp.virt_start;
90687 + set_fixaddr_top();
90688 + }
90689 +
90690 + machine_to_phys_mapping = (unsigned long *)MACH2PHYS_VIRT_START;
90691 + machine_to_phys_nr_ents = MACH2PHYS_NR_ENTRIES;
90692 + if (HYPERVISOR_memory_op(XENMEM_machphys_mapping, &mapping) == 0) {
90693 + machine_to_phys_mapping = (unsigned long *)mapping.v_start;
90694 + machine_to_phys_nr_ents = mapping.max_mfn + 1;
90695 + }
90696 + while ((1UL << machine_to_phys_order) < machine_to_phys_nr_ents )
90697 + machine_to_phys_order++;
90698 +}
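
machine_specific_arch_setup() above finishes by deriving machine_to_phys_order, the smallest power-of-two exponent that covers machine_to_phys_nr_ents, i.e. a ceiling log2. The same loop run standalone (the entry counts in main() are made-up examples):

/* The loop at the end of machine_specific_arch_setup(), run standalone. */
#include <stdio.h>

static unsigned int order_for(unsigned long nr_ents)
{
        unsigned int order = 0;

        while ((1UL << order) < nr_ents)
                order++;
        return order;
}

int main(void)
{
        /* e.g. a 1 GiB guest: 262144 4 KiB pages -> order 18 (2^18 = 262144) */
        printf("262144 entries -> order %u\n", order_for(262144));
        /* a non-power-of-two count rounds up to the next power of two */
        printf("262145 entries -> order %u\n", order_for(262145));   /* 19 */
        return 0;
}
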
90699 diff -Nur linux-2.6.16.33-noxen/include/asm-i386/mach-xen/setup_arch_pre.h linux-2.6.16.33/include/asm-i386/mach-xen/setup_arch_pre.h
90700 --- linux-2.6.16.33-noxen/include/asm-i386/mach-xen/setup_arch_pre.h 1970-01-01 00:00:00.000000000 +0000
90701 +++ linux-2.6.16.33/include/asm-i386/mach-xen/setup_arch_pre.h 2007-01-08 15:00:46.000000000 +0000
90702 @@ -0,0 +1,5 @@
90703 +/* Hook to call BIOS initialisation function */
90704 +
90705 +#define ARCH_SETUP machine_specific_arch_setup();
90706 +
90707 +static void __init machine_specific_arch_setup(void);
90708 diff -Nur linux-2.6.16.33-noxen/include/asm-i386/page.h linux-2.6.16.33/include/asm-i386/page.h
90709 --- linux-2.6.16.33-noxen/include/asm-i386/page.h 2006-11-22 18:06:31.000000000 +0000
90710 +++ linux-2.6.16.33/include/asm-i386/page.h 2007-01-08 15:00:46.000000000 +0000
90711 @@ -121,7 +121,7 @@
90712
90713 #define PAGE_OFFSET ((unsigned long)__PAGE_OFFSET)
90714 #define VMALLOC_RESERVE ((unsigned long)__VMALLOC_RESERVE)
90715 -#define MAXMEM (-__PAGE_OFFSET-__VMALLOC_RESERVE)
90716 +#define MAXMEM (__FIXADDR_TOP-__PAGE_OFFSET-__VMALLOC_RESERVE)
90717 #define __pa(x) ((unsigned long)(x)-PAGE_OFFSET)
90718 #define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET))
90719 #define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT)
90720 @@ -139,6 +139,8 @@
90721 ((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0 ) | \
90722 VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
90723
90724 +#define __HAVE_ARCH_GATE_AREA 1
90725 +
90726 #endif /* __KERNEL__ */
90727
90728 #include <asm-generic/page.h>
90729 diff -Nur linux-2.6.16.33-noxen/include/asm-i386/pgtable-2level-defs.h linux-2.6.16.33/include/asm-i386/pgtable-2level-defs.h
90730 --- linux-2.6.16.33-noxen/include/asm-i386/pgtable-2level-defs.h 2006-11-22 18:06:31.000000000 +0000
90731 +++ linux-2.6.16.33/include/asm-i386/pgtable-2level-defs.h 2007-05-23 21:00:01.000000000 +0000
90732 @@ -1,6 +1,8 @@
90733 #ifndef _I386_PGTABLE_2LEVEL_DEFS_H
90734 #define _I386_PGTABLE_2LEVEL_DEFS_H
90735
90736 +#define HAVE_SHARED_KERNEL_PMD 0
90737 +
90738 /*
90739 * traditional i386 two-level paging structure:
90740 */
90741 diff -Nur linux-2.6.16.33-noxen/include/asm-i386/pgtable-3level-defs.h linux-2.6.16.33/include/asm-i386/pgtable-3level-defs.h
90742 --- linux-2.6.16.33-noxen/include/asm-i386/pgtable-3level-defs.h 2006-11-22 18:06:31.000000000 +0000
90743 +++ linux-2.6.16.33/include/asm-i386/pgtable-3level-defs.h 2007-05-23 21:00:01.000000000 +0000
90744 @@ -1,6 +1,8 @@
90745 #ifndef _I386_PGTABLE_3LEVEL_DEFS_H
90746 #define _I386_PGTABLE_3LEVEL_DEFS_H
90747
90748 +#define HAVE_SHARED_KERNEL_PMD 1
90749 +
90750 /*
90751 * PGDIR_SHIFT determines what a top-level page table entry can map
90752 */
90753 diff -Nur linux-2.6.16.33-noxen/include/asm-i386/rwsem.h linux-2.6.16.33/include/asm-i386/rwsem.h
90754 --- linux-2.6.16.33-noxen/include/asm-i386/rwsem.h 2006-11-22 18:06:31.000000000 +0000
90755 +++ linux-2.6.16.33/include/asm-i386/rwsem.h 2007-01-08 15:00:46.000000000 +0000
90756 @@ -40,6 +40,7 @@
90757
90758 #include <linux/list.h>
90759 #include <linux/spinlock.h>
90760 +#include <asm/smp_alt.h>
90761
90762 struct rwsem_waiter;
90763
90764 @@ -99,7 +100,7 @@
90765 {
90766 __asm__ __volatile__(
90767 "# beginning down_read\n\t"
90768 -LOCK_PREFIX " incl (%%eax)\n\t" /* adds 0x00000001, returns the old value */
90769 +LOCK " incl (%%eax)\n\t" /* adds 0x00000001, returns the old value */
90770 " js 2f\n\t" /* jump if we weren't granted the lock */
90771 "1:\n\t"
90772 LOCK_SECTION_START("")
90773 @@ -130,7 +131,7 @@
90774 " movl %1,%2\n\t"
90775 " addl %3,%2\n\t"
90776 " jle 2f\n\t"
90777 -LOCK_PREFIX " cmpxchgl %2,%0\n\t"
90778 +LOCK " cmpxchgl %2,%0\n\t"
90779 " jnz 1b\n\t"
90780 "2:\n\t"
90781 "# ending __down_read_trylock\n\t"
90782 @@ -150,7 +151,7 @@
90783 tmp = RWSEM_ACTIVE_WRITE_BIAS;
90784 __asm__ __volatile__(
90785 "# beginning down_write\n\t"
90786 -LOCK_PREFIX " xadd %%edx,(%%eax)\n\t" /* subtract 0x0000ffff, returns the old value */
90787 +LOCK " xadd %%edx,(%%eax)\n\t" /* subtract 0x0000ffff, returns the old value */
90788 " testl %%edx,%%edx\n\t" /* was the count 0 before? */
90789 " jnz 2f\n\t" /* jump if we weren't granted the lock */
90790 "1:\n\t"
90791 @@ -188,7 +189,7 @@
90792 __s32 tmp = -RWSEM_ACTIVE_READ_BIAS;
90793 __asm__ __volatile__(
90794 "# beginning __up_read\n\t"
90795 -LOCK_PREFIX " xadd %%edx,(%%eax)\n\t" /* subtracts 1, returns the old value */
90796 +LOCK " xadd %%edx,(%%eax)\n\t" /* subtracts 1, returns the old value */
90797 " js 2f\n\t" /* jump if the lock is being waited upon */
90798 "1:\n\t"
90799 LOCK_SECTION_START("")
90800 @@ -214,7 +215,7 @@
90801 __asm__ __volatile__(
90802 "# beginning __up_write\n\t"
90803 " movl %2,%%edx\n\t"
90804 -LOCK_PREFIX " xaddl %%edx,(%%eax)\n\t" /* tries to transition 0xffff0001 -> 0x00000000 */
90805 +LOCK " xaddl %%edx,(%%eax)\n\t" /* tries to transition 0xffff0001 -> 0x00000000 */
90806 " jnz 2f\n\t" /* jump if the lock is being waited upon */
90807 "1:\n\t"
90808 LOCK_SECTION_START("")
90809 @@ -239,7 +240,7 @@
90810 {
90811 __asm__ __volatile__(
90812 "# beginning __downgrade_write\n\t"
90813 -LOCK_PREFIX " addl %2,(%%eax)\n\t" /* transitions 0xZZZZ0001 -> 0xYYYY0001 */
90814 +LOCK " addl %2,(%%eax)\n\t" /* transitions 0xZZZZ0001 -> 0xYYYY0001 */
90815 " js 2f\n\t" /* jump if the lock is being waited upon */
90816 "1:\n\t"
90817 LOCK_SECTION_START("")
90818 @@ -263,7 +264,7 @@
90819 static inline void rwsem_atomic_add(int delta, struct rw_semaphore *sem)
90820 {
90821 __asm__ __volatile__(
90822 -LOCK_PREFIX "addl %1,%0"
90823 +LOCK "addl %1,%0"
90824 : "=m"(sem->count)
90825 : "ir"(delta), "m"(sem->count));
90826 }
90827 @@ -276,7 +277,7 @@
90828 int tmp = delta;
90829
90830 __asm__ __volatile__(
90831 -LOCK_PREFIX "xadd %0,(%2)"
90832 +LOCK "xadd %0,(%2)"
90833 : "+r"(tmp), "=m"(sem->count)
90834 : "r"(sem), "m"(sem->count)
90835 : "memory");
90836 diff -Nur linux-2.6.16.33-noxen/include/asm-i386/smp_alt.h linux-2.6.16.33/include/asm-i386/smp_alt.h
90837 --- linux-2.6.16.33-noxen/include/asm-i386/smp_alt.h 1970-01-01 00:00:00.000000000 +0000
90838 +++ linux-2.6.16.33/include/asm-i386/smp_alt.h 2007-01-08 15:00:46.000000000 +0000
90839 @@ -0,0 +1,32 @@
90840 +#ifndef __ASM_SMP_ALT_H__
90841 +#define __ASM_SMP_ALT_H__
90842 +
90843 +#include <linux/config.h>
90844 +
90845 +#ifdef CONFIG_SMP
90846 +#if defined(CONFIG_SMP_ALTERNATIVES) && !defined(MODULE)
90847 +#define LOCK \
90848 + "6677: nop\n" \
90849 + ".section __smp_alternatives,\"a\"\n" \
90850 + ".long 6677b\n" \
90851 + ".long 6678f\n" \
90852 + ".previous\n" \
90853 + ".section __smp_replacements,\"a\"\n" \
90854 + "6678: .byte 1\n" \
90855 + ".byte 1\n" \
90856 + ".byte 0\n" \
90857 + ".byte 1\n" \
90858 + ".byte -1\n" \
90859 + "lock\n" \
90860 + "nop\n" \
90861 + ".previous\n"
90862 +void prepare_for_smp(void);
90863 +void unprepare_for_smp(void);
90864 +#else
90865 +#define LOCK "lock ; "
90866 +#endif
90867 +#else
90868 +#define LOCK ""
90869 +#endif
90870 +
90871 +#endif /* __ASM_SMP_ALT_H__ */
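
smp_alt.h above makes LOCK expand to a plain nop plus records in the __smp_alternatives/__smp_replacements sections, so that boot code can patch a real lock prefix in only when more than one CPU is actually online. A simplified, runnable analogue of that idea, where a function pointer chosen at start-up stands in for the binary patching and sysconf() is the assumed CPU-count source:

/* Simplified userspace analogue of the SMP-alternatives idea: pick the
 * locked or unlocked variant once, based on how many CPUs are present,
 * instead of paying for "lock" unconditionally.  Illustrative only. */
#include <stdio.h>
#include <unistd.h>

static void inc_locked(unsigned long *p)
{
        __atomic_add_fetch(p, 1, __ATOMIC_SEQ_CST);    /* compiles to a locked add */
}

static void inc_plain(unsigned long *p)
{
        *p += 1;                                       /* no lock prefix needed on UP */
}

static void (*inc_counter)(unsigned long *) = inc_locked;

int main(void)
{
        long cpus = sysconf(_SC_NPROCESSORS_ONLN);
        unsigned long counter = 0;

        if (cpus == 1)
                inc_counter = inc_plain;               /* rough "unprepare_for_smp()" analogue */

        inc_counter(&counter);
        printf("cpus online: %ld, counter: %lu (via %s path)\n",
               cpus, counter, cpus == 1 ? "plain" : "locked");
        return 0;
}
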
90872 diff -Nur linux-2.6.16.33-noxen/include/asm-i386/spinlock.h linux-2.6.16.33/include/asm-i386/spinlock.h
90873 --- linux-2.6.16.33-noxen/include/asm-i386/spinlock.h 2006-11-22 18:06:31.000000000 +0000
90874 +++ linux-2.6.16.33/include/asm-i386/spinlock.h 2007-01-08 15:00:46.000000000 +0000
90875 @@ -6,6 +6,7 @@
90876 #include <asm/page.h>
90877 #include <linux/config.h>
90878 #include <linux/compiler.h>
90879 +#include <asm/smp_alt.h>
90880
90881 /*
90882 * Your basic SMP spinlocks, allowing only a single CPU anywhere
90883 @@ -23,7 +24,8 @@
90884
90885 #define __raw_spin_lock_string \
90886 "\n1:\t" \
90887 - "lock ; decb %0\n\t" \
90888 + LOCK \
90889 + "decb %0\n\t" \
90890 "jns 3f\n" \
90891 "2:\t" \
90892 "rep;nop\n\t" \
90893 @@ -34,7 +36,8 @@
90894
90895 #define __raw_spin_lock_string_flags \
90896 "\n1:\t" \
90897 - "lock ; decb %0\n\t" \
90898 + LOCK \
90899 + "decb %0\n\t" \
90900 "jns 4f\n\t" \
90901 "2:\t" \
90902 "testl $0x200, %1\n\t" \
90903 @@ -65,10 +68,34 @@
90904 static inline int __raw_spin_trylock(raw_spinlock_t *lock)
90905 {
90906 char oldval;
90907 +#ifdef CONFIG_SMP_ALTERNATIVES
90908 + __asm__ __volatile__(
90909 + "1:movb %1,%b0\n"
90910 + "movb $0,%1\n"
90911 + "2:"
90912 + ".section __smp_alternatives,\"a\"\n"
90913 + ".long 1b\n"
90914 + ".long 3f\n"
90915 + ".previous\n"
90916 + ".section __smp_replacements,\"a\"\n"
90917 + "3: .byte 2b - 1b\n"
90918 + ".byte 5f-4f\n"
90919 + ".byte 0\n"
90920 + ".byte 6f-5f\n"
90921 + ".byte -1\n"
90922 + "4: xchgb %b0,%1\n"
90923 + "5: movb %1,%b0\n"
90924 + "movb $0,%1\n"
90925 + "6:\n"
90926 + ".previous\n"
90927 + :"=q" (oldval), "=m" (lock->slock)
90928 + :"0" (0) : "memory");
90929 +#else
90930 __asm__ __volatile__(
90931 "xchgb %b0,%1"
90932 :"=q" (oldval), "=m" (lock->slock)
90933 :"0" (0) : "memory");
90934 +#endif
90935 return oldval > 0;
90936 }
90937
90938 @@ -178,12 +205,12 @@
90939
90940 static inline void __raw_read_unlock(raw_rwlock_t *rw)
90941 {
90942 - asm volatile("lock ; incl %0" :"=m" (rw->lock) : : "memory");
90943 + asm volatile(LOCK "incl %0" :"=m" (rw->lock) : : "memory");
90944 }
90945
90946 static inline void __raw_write_unlock(raw_rwlock_t *rw)
90947 {
90948 - asm volatile("lock ; addl $" RW_LOCK_BIAS_STR ", %0"
90949 + asm volatile(LOCK "addl $" RW_LOCK_BIAS_STR ", %0"
90950 : "=m" (rw->lock) : : "memory");
90951 }
90952
90953 diff -Nur linux-2.6.16.33-noxen/include/asm-i386/system.h linux-2.6.16.33/include/asm-i386/system.h
90954 --- linux-2.6.16.33-noxen/include/asm-i386/system.h 2006-11-22 18:06:31.000000000 +0000
90955 +++ linux-2.6.16.33/include/asm-i386/system.h 2007-01-08 15:00:46.000000000 +0000
90956 @@ -5,7 +5,7 @@
90957 #include <linux/kernel.h>
90958 #include <asm/segment.h>
90959 #include <asm/cpufeature.h>
90960 -#include <linux/bitops.h> /* for LOCK_PREFIX */
90961 +#include <asm/smp_alt.h>
90962
90963 #ifdef __KERNEL__
90964
90965 @@ -271,19 +271,19 @@
90966 unsigned long prev;
90967 switch (size) {
90968 case 1:
90969 - __asm__ __volatile__(LOCK_PREFIX "cmpxchgb %b1,%2"
90970 + __asm__ __volatile__(LOCK "cmpxchgb %b1,%2"
90971 : "=a"(prev)
90972 : "q"(new), "m"(*__xg(ptr)), "0"(old)
90973 : "memory");
90974 return prev;
90975 case 2:
90976 - __asm__ __volatile__(LOCK_PREFIX "cmpxchgw %w1,%2"
90977 + __asm__ __volatile__(LOCK "cmpxchgw %w1,%2"
90978 : "=a"(prev)
90979 : "r"(new), "m"(*__xg(ptr)), "0"(old)
90980 : "memory");
90981 return prev;
90982 case 4:
90983 - __asm__ __volatile__(LOCK_PREFIX "cmpxchgl %1,%2"
90984 + __asm__ __volatile__(LOCK "cmpxchgl %1,%2"
90985 : "=a"(prev)
90986 : "r"(new), "m"(*__xg(ptr)), "0"(old)
90987 : "memory");
90988 @@ -336,7 +336,7 @@
90989 unsigned long long new)
90990 {
90991 unsigned long long prev;
90992 - __asm__ __volatile__(LOCK_PREFIX "cmpxchg8b %3"
90993 + __asm__ __volatile__(LOCK "cmpxchg8b %3"
90994 : "=A"(prev)
90995 : "b"((unsigned long)new),
90996 "c"((unsigned long)(new >> 32)),
90997 @@ -503,11 +503,55 @@
90998 #endif
90999
91000 #ifdef CONFIG_SMP
91001 +#if defined(CONFIG_SMP_ALTERNATIVES) && !defined(MODULE)
91002 +#define smp_alt_mb(instr) \
91003 +__asm__ __volatile__("6667:\nnop\nnop\nnop\nnop\nnop\nnop\n6668:\n" \
91004 + ".section __smp_alternatives,\"a\"\n" \
91005 + ".long 6667b\n" \
91006 + ".long 6673f\n" \
91007 + ".previous\n" \
91008 + ".section __smp_replacements,\"a\"\n" \
91009 + "6673:.byte 6668b-6667b\n" \
91010 + ".byte 6670f-6669f\n" \
91011 + ".byte 6671f-6670f\n" \
91012 + ".byte 0\n" \
91013 + ".byte %c0\n" \
91014 + "6669:lock;addl $0,0(%%esp)\n" \
91015 + "6670:" instr "\n" \
91016 + "6671:\n" \
91017 + ".previous\n" \
91018 + : \
91019 + : "i" (X86_FEATURE_XMM2) \
91020 + : "memory")
91021 +#define smp_mb() smp_alt_mb("mfence")
91022 +#define smp_rmb() smp_alt_mb("lfence")
91023 +#define set_mb(var, value) do { \
91024 +unsigned long __set_mb_temp; \
91025 +__asm__ __volatile__("6667:movl %1, %0\n6668:\n" \
91026 + ".section __smp_alternatives,\"a\"\n" \
91027 + ".long 6667b\n" \
91028 + ".long 6673f\n" \
91029 + ".previous\n" \
91030 + ".section __smp_replacements,\"a\"\n" \
91031 + "6673: .byte 6668b-6667b\n" \
91032 + ".byte 6670f-6669f\n" \
91033 + ".byte 0\n" \
91034 + ".byte 6671f-6670f\n" \
91035 + ".byte -1\n" \
91036 + "6669: xchg %1, %0\n" \
91037 + "6670:movl %1, %0\n" \
91038 + "6671:\n" \
91039 + ".previous\n" \
91040 + : "=m" (var), "=r" (__set_mb_temp) \
91041 + : "1" (value) \
91042 + : "memory"); } while (0)
91043 +#else
91044 #define smp_mb() mb()
91045 #define smp_rmb() rmb()
91046 +#define set_mb(var, value) do { (void) xchg(&var, value); } while (0)
91047 +#endif
91048 #define smp_wmb() wmb()
91049 #define smp_read_barrier_depends() read_barrier_depends()
91050 -#define set_mb(var, value) do { (void) xchg(&var, value); } while (0)
91051 #else
91052 #define smp_mb() barrier()
91053 #define smp_rmb() barrier()
91054 diff -Nur linux-2.6.16.33-noxen/include/asm-ia64/agp.h linux-2.6.16.33/include/asm-ia64/agp.h
91055 --- linux-2.6.16.33-noxen/include/asm-ia64/agp.h 2006-11-22 18:06:31.000000000 +0000
91056 +++ linux-2.6.16.33/include/asm-ia64/agp.h 2007-01-08 15:00:46.000000000 +0000
91057 @@ -19,13 +19,44 @@
91058 #define flush_agp_cache() mb()
91059
91060 /* Convert a physical address to an address suitable for the GART. */
91061 +#ifndef CONFIG_XEN
91062 #define phys_to_gart(x) (x)
91063 #define gart_to_phys(x) (x)
91064 +#else
91065 +#define phys_to_gart(x) phys_to_machine_for_dma(x)
91066 +#define gart_to_phys(x) machine_to_phys_for_dma(x)
91067 +#endif
91068
91069 /* GATT allocation. Returns/accepts GATT kernel virtual address. */
91070 +#ifndef CONFIG_XEN
91071 #define alloc_gatt_pages(order) \
91072 ((char *)__get_free_pages(GFP_KERNEL, (order)))
91073 #define free_gatt_pages(table, order) \
91074 free_pages((unsigned long)(table), (order))
91075 +#else
91076 +#include <asm/hypervisor.h>
91077 +static inline char*
91078 +alloc_gatt_pages(unsigned int order)
91079 +{
91080 + unsigned long error;
91081 + unsigned long ret = __get_free_pages(GFP_KERNEL, (order));
91082 + if (ret == 0) {
91083 + goto out;
91084 + }
91085 + error = xen_create_contiguous_region(ret, order, 0);
91086 + if (error) {
91087 + free_pages(ret, order);
91088 + ret = 0;
91089 + }
91090 +out:
91091 + return (char*)ret;
91092 +}
91093 +static inline void
91094 +free_gatt_pages(void* table, unsigned int order)
91095 +{
91096 + xen_destroy_contiguous_region((unsigned long)table, order);
91097 + free_pages((unsigned long)table, order);
91098 +}
91099 +#endif /* CONFIG_XEN */
91100
91101 #endif /* _ASM_IA64_AGP_H */
91102 diff -Nur linux-2.6.16.33-noxen/include/asm-ia64/dma-mapping.h linux-2.6.16.33/include/asm-ia64/dma-mapping.h
91103 --- linux-2.6.16.33-noxen/include/asm-ia64/dma-mapping.h 2006-11-22 18:06:31.000000000 +0000
91104 +++ linux-2.6.16.33/include/asm-ia64/dma-mapping.h 2007-01-08 15:00:46.000000000 +0000
91105 @@ -7,7 +7,14 @@
91106 */
91107 #include <linux/config.h>
91108 #include <asm/machvec.h>
91109 +#ifdef CONFIG_XEN
91110 +/* Needed for arch/i386/kernel/swiotlb.c and arch/i386/kernel/pci-dma-xen.c */
91111 +#include <asm/hypervisor.h>
91112 +/* Needed for arch/i386/kernel/swiotlb.c */
91113 +#include <asm-i386/mach-xen/asm/swiotlb.h>
91114 +#endif
91115
91116 +#ifndef CONFIG_XEN
91117 #define dma_alloc_coherent platform_dma_alloc_coherent
91118 #define dma_alloc_noncoherent platform_dma_alloc_coherent /* coherent mem. is cheap */
91119 #define dma_free_coherent platform_dma_free_coherent
91120 @@ -21,6 +28,46 @@
91121 #define dma_sync_single_for_device platform_dma_sync_single_for_device
91122 #define dma_sync_sg_for_device platform_dma_sync_sg_for_device
91123 #define dma_mapping_error platform_dma_mapping_error
91124 +#else
91125 +int dma_map_sg(struct device *hwdev, struct scatterlist *sg, int nents,
91126 + enum dma_data_direction direction);
91127 +void dma_unmap_sg(struct device *hwdev, struct scatterlist *sg, int nents,
91128 + enum dma_data_direction direction);
91129 +int dma_supported(struct device *dev, u64 mask);
91130 +void *dma_alloc_coherent(struct device *dev, size_t size,
91131 + dma_addr_t *dma_handle, gfp_t gfp);
91132 +void dma_free_coherent(struct device *dev, size_t size, void *vaddr,
91133 + dma_addr_t dma_handle);
91134 +dma_addr_t dma_map_single(struct device *dev, void *ptr, size_t size,
91135 + enum dma_data_direction direction);
91136 +void dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size,
91137 + enum dma_data_direction direction);
91138 +void dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle,
91139 + size_t size, enum dma_data_direction direction);
91140 +void dma_sync_single_for_device(struct device *dev, dma_addr_t dma_handle,
91141 + size_t size,
91142 + enum dma_data_direction direction);
91143 +int dma_mapping_error(dma_addr_t dma_addr);
91144 +
91145 +#define flush_write_buffers() do { } while (0)
91146 +static inline void
91147 +dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems,
91148 + enum dma_data_direction direction)
91149 +{
91150 + if (swiotlb)
91151 + swiotlb_sync_sg_for_cpu(dev,sg,nelems,direction);
91152 + flush_write_buffers();
91153 +}
91154 +
91155 +static inline void
91156 +dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems,
91157 + enum dma_data_direction direction)
91158 +{
91159 + if (swiotlb)
91160 + swiotlb_sync_sg_for_device(dev,sg,nelems,direction);
91161 + flush_write_buffers();
91162 +}
91163 +#endif
91164
91165 #define dma_map_page(dev, pg, off, size, dir) \
91166 dma_map_single(dev, page_address(pg) + (off), (size), (dir))
91167 @@ -37,7 +84,9 @@
91168 #define dma_sync_single_range_for_device(dev, dma_handle, offset, size, dir) \
91169 dma_sync_single_for_device(dev, dma_handle, size, dir)
91170
91171 +#ifndef CONFIG_XEN
91172 #define dma_supported platform_dma_supported
91173 +#endif
91174
91175 static inline int
91176 dma_set_mask (struct device *dev, u64 mask)
91177 @@ -62,4 +111,27 @@
91178
91179 #define dma_is_consistent(dma_handle) (1) /* all we do is coherent memory... */
91180
91181 +#ifdef CONFIG_XEN
91182 +/* arch/i386/kernel/swiotlb.o requires */
91183 +void contiguous_bitmap_init(unsigned long end_pfn);
91184 +
91185 +static inline int
91186 +address_needs_mapping(struct device *hwdev, dma_addr_t addr)
91187 +{
91188 + dma_addr_t mask = DMA_64BIT_MASK;
91189 + /* If the device has a mask, use it, otherwise default to 64 bits */
91190 + if (hwdev && hwdev->dma_mask)
91191 + mask = *hwdev->dma_mask;
91192 + return (addr & ~mask) != 0;
91193 +}
91194 +
91195 +static inline int
91196 +range_straddles_page_boundary(void *p, size_t size)
91197 +{
91198 + extern unsigned long *contiguous_bitmap;
91199 + return (((((unsigned long)p & ~PAGE_MASK) + size) > PAGE_SIZE) &&
91200 + !test_bit(__pa(p) >> PAGE_SHIFT, contiguous_bitmap));
91201 +}
91202 +#endif
91203 +
91204 #endif /* _ASM_IA64_DMA_MAPPING_H */
91205 diff -Nur linux-2.6.16.33-noxen/include/asm-ia64/fixmap.h linux-2.6.16.33/include/asm-ia64/fixmap.h
91206 --- linux-2.6.16.33-noxen/include/asm-ia64/fixmap.h 1970-01-01 00:00:00.000000000 +0000
91207 +++ linux-2.6.16.33/include/asm-ia64/fixmap.h 2007-01-08 15:00:46.000000000 +0000
91208 @@ -0,0 +1,2 @@
91209 +#define clear_fixmap(x) do {} while (0)
91210 +#define set_fixmap(x,y) do {} while (0)
91211 diff -Nur linux-2.6.16.33-noxen/include/asm-ia64/gcc_intrin.h linux-2.6.16.33/include/asm-ia64/gcc_intrin.h
91212 --- linux-2.6.16.33-noxen/include/asm-ia64/gcc_intrin.h 2006-11-22 18:06:31.000000000 +0000
91213 +++ linux-2.6.16.33/include/asm-ia64/gcc_intrin.h 2007-01-08 15:00:46.000000000 +0000
91214 @@ -26,7 +26,7 @@
91215
91216 register unsigned long ia64_r13 asm ("r13") __attribute_used__;
91217
91218 -#define ia64_setreg(regnum, val) \
91219 +#define __ia64_setreg(regnum, val) \
91220 ({ \
91221 switch (regnum) { \
91222 case _IA64_REG_PSR_L: \
91223 @@ -55,7 +55,7 @@
91224 } \
91225 })
91226
91227 -#define ia64_getreg(regnum) \
91228 +#define __ia64_getreg(regnum) \
91229 ({ \
91230 __u64 ia64_intri_res; \
91231 \
91232 @@ -92,7 +92,7 @@
91233
91234 #define ia64_hint_pause 0
91235
91236 -#define ia64_hint(mode) \
91237 +#define __ia64_hint(mode) \
91238 ({ \
91239 switch (mode) { \
91240 case ia64_hint_pause: \
91241 @@ -374,7 +374,7 @@
91242
91243 #define ia64_invala() asm volatile ("invala" ::: "memory")
91244
91245 -#define ia64_thash(addr) \
91246 +#define __ia64_thash(addr) \
91247 ({ \
91248 __u64 ia64_intri_res; \
91249 asm volatile ("thash %0=%1" : "=r"(ia64_intri_res) : "r" (addr)); \
91250 @@ -394,18 +394,18 @@
91251
91252 #define ia64_nop(x) asm volatile ("nop %0"::"i"(x));
91253
91254 -#define ia64_itci(addr) asm volatile ("itc.i %0;;" :: "r"(addr) : "memory")
91255 +#define __ia64_itci(addr) asm volatile ("itc.i %0;;" :: "r"(addr) : "memory")
91256
91257 -#define ia64_itcd(addr) asm volatile ("itc.d %0;;" :: "r"(addr) : "memory")
91258 +#define __ia64_itcd(addr) asm volatile ("itc.d %0;;" :: "r"(addr) : "memory")
91259
91260
91261 -#define ia64_itri(trnum, addr) asm volatile ("itr.i itr[%0]=%1" \
91262 +#define __ia64_itri(trnum, addr) asm volatile ("itr.i itr[%0]=%1" \
91263 :: "r"(trnum), "r"(addr) : "memory")
91264
91265 -#define ia64_itrd(trnum, addr) asm volatile ("itr.d dtr[%0]=%1" \
91266 +#define __ia64_itrd(trnum, addr) asm volatile ("itr.d dtr[%0]=%1" \
91267 :: "r"(trnum), "r"(addr) : "memory")
91268
91269 -#define ia64_tpa(addr) \
91270 +#define __ia64_tpa(addr) \
91271 ({ \
91272 __u64 ia64_pa; \
91273 asm volatile ("tpa %0 = %1" : "=r"(ia64_pa) : "r"(addr) : "memory"); \
91274 @@ -415,22 +415,22 @@
91275 #define __ia64_set_dbr(index, val) \
91276 asm volatile ("mov dbr[%0]=%1" :: "r"(index), "r"(val) : "memory")
91277
91278 -#define ia64_set_ibr(index, val) \
91279 +#define __ia64_set_ibr(index, val) \
91280 asm volatile ("mov ibr[%0]=%1" :: "r"(index), "r"(val) : "memory")
91281
91282 -#define ia64_set_pkr(index, val) \
91283 +#define __ia64_set_pkr(index, val) \
91284 asm volatile ("mov pkr[%0]=%1" :: "r"(index), "r"(val) : "memory")
91285
91286 -#define ia64_set_pmc(index, val) \
91287 +#define __ia64_set_pmc(index, val) \
91288 asm volatile ("mov pmc[%0]=%1" :: "r"(index), "r"(val) : "memory")
91289
91290 -#define ia64_set_pmd(index, val) \
91291 +#define __ia64_set_pmd(index, val) \
91292 asm volatile ("mov pmd[%0]=%1" :: "r"(index), "r"(val) : "memory")
91293
91294 -#define ia64_set_rr(index, val) \
91295 +#define __ia64_set_rr(index, val) \
91296 asm volatile ("mov rr[%0]=%1" :: "r"(index), "r"(val) : "memory");
91297
91298 -#define ia64_get_cpuid(index) \
91299 +#define __ia64_get_cpuid(index) \
91300 ({ \
91301 __u64 ia64_intri_res; \
91302 asm volatile ("mov %0=cpuid[%r1]" : "=r"(ia64_intri_res) : "rO"(index)); \
91303 @@ -444,21 +444,21 @@
91304 ia64_intri_res; \
91305 })
91306
91307 -#define ia64_get_ibr(index) \
91308 +#define __ia64_get_ibr(index) \
91309 ({ \
91310 __u64 ia64_intri_res; \
91311 asm volatile ("mov %0=ibr[%1]" : "=r"(ia64_intri_res) : "r"(index)); \
91312 ia64_intri_res; \
91313 })
91314
91315 -#define ia64_get_pkr(index) \
91316 +#define __ia64_get_pkr(index) \
91317 ({ \
91318 __u64 ia64_intri_res; \
91319 asm volatile ("mov %0=pkr[%1]" : "=r"(ia64_intri_res) : "r"(index)); \
91320 ia64_intri_res; \
91321 })
91322
91323 -#define ia64_get_pmc(index) \
91324 +#define __ia64_get_pmc(index) \
91325 ({ \
91326 __u64 ia64_intri_res; \
91327 asm volatile ("mov %0=pmc[%1]" : "=r"(ia64_intri_res) : "r"(index)); \
91328 @@ -466,48 +466,48 @@
91329 })
91330
91331
91332 -#define ia64_get_pmd(index) \
91333 +#define __ia64_get_pmd(index) \
91334 ({ \
91335 __u64 ia64_intri_res; \
91336 asm volatile ("mov %0=pmd[%1]" : "=r"(ia64_intri_res) : "r"(index)); \
91337 ia64_intri_res; \
91338 })
91339
91340 -#define ia64_get_rr(index) \
91341 +#define __ia64_get_rr(index) \
91342 ({ \
91343 __u64 ia64_intri_res; \
91344 asm volatile ("mov %0=rr[%1]" : "=r"(ia64_intri_res) : "r" (index)); \
91345 ia64_intri_res; \
91346 })
91347
91348 -#define ia64_fc(addr) asm volatile ("fc %0" :: "r"(addr) : "memory")
91349 +#define __ia64_fc(addr) asm volatile ("fc %0" :: "r"(addr) : "memory")
91350
91351
91352 #define ia64_sync_i() asm volatile (";; sync.i" ::: "memory")
91353
91354 -#define ia64_ssm(mask) asm volatile ("ssm %0":: "i"((mask)) : "memory")
91355 -#define ia64_rsm(mask) asm volatile ("rsm %0":: "i"((mask)) : "memory")
91356 +#define __ia64_ssm(mask) asm volatile ("ssm %0":: "i"((mask)) : "memory")
91357 +#define __ia64_rsm(mask) asm volatile ("rsm %0":: "i"((mask)) : "memory")
91358 #define ia64_sum(mask) asm volatile ("sum %0":: "i"((mask)) : "memory")
91359 #define ia64_rum(mask) asm volatile ("rum %0":: "i"((mask)) : "memory")
91360
91361 -#define ia64_ptce(addr) asm volatile ("ptc.e %0" :: "r"(addr))
91362 +#define __ia64_ptce(addr) asm volatile ("ptc.e %0" :: "r"(addr))
91363
91364 -#define ia64_ptcga(addr, size) \
91365 +#define __ia64_ptcga(addr, size) \
91366 do { \
91367 asm volatile ("ptc.ga %0,%1" :: "r"(addr), "r"(size) : "memory"); \
91368 ia64_dv_serialize_data(); \
91369 } while (0)
91370
91371 -#define ia64_ptcl(addr, size) \
91372 +#define __ia64_ptcl(addr, size) \
91373 do { \
91374 asm volatile ("ptc.l %0,%1" :: "r"(addr), "r"(size) : "memory"); \
91375 ia64_dv_serialize_data(); \
91376 } while (0)
91377
91378 -#define ia64_ptri(addr, size) \
91379 +#define __ia64_ptri(addr, size) \
91380 asm volatile ("ptr.i %0,%1" :: "r"(addr), "r"(size) : "memory")
91381
91382 -#define ia64_ptrd(addr, size) \
91383 +#define __ia64_ptrd(addr, size) \
91384 asm volatile ("ptr.d %0,%1" :: "r"(addr), "r"(size) : "memory")
91385
91386 /* Values for lfhint in ia64_lfetch and ia64_lfetch_fault */
91387 @@ -589,7 +589,7 @@
91388 } \
91389 })
91390
91391 -#define ia64_intrin_local_irq_restore(x) \
91392 +#define __ia64_intrin_local_irq_restore(x) \
91393 do { \
91394 asm volatile (";; cmp.ne p6,p7=%0,r0;;" \
91395 "(p6) ssm psr.i;" \
91396 @@ -598,4 +598,6 @@
91397 :: "r"((x)) : "p6", "p7", "memory"); \
91398 } while (0)
91399
91400 +#define __ia64_get_psr_i() (__ia64_getreg(_IA64_REG_PSR) & 0x4000UL)
91401 +
91402 #endif /* _ASM_IA64_GCC_INTRIN_H */
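
The renames above (and the matching intel_intrin.h hunk later in this patch) give the raw privileged intrinsics a double-underscore prefix. The apparent intent is that a paravirtualization header (asm/xen/privop.h, pulled in via hypercall.h below) can then define the unprefixed ia64_* names itself, dispatching to the hypervisor when running as a Xen guest and falling back to the raw __ia64_* forms otherwise. The snippet below is a hypothetical illustration of that indirection, not the patch's actual privop.h; xen_get_psr_i() is an assumed helper name.

	/* Hypothetical sketch of the dispatch that the __ia64_ prefix enables. */
	#ifdef CONFIG_XEN
	extern unsigned long xen_get_psr_i(void);	/* assumed helper */
	#define ia64_get_psr_i() \
		(is_running_on_xen() ? xen_get_psr_i() : __ia64_get_psr_i())
	#else
	#define ia64_get_psr_i()	__ia64_get_psr_i()
	#endif
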
91403 diff -Nur linux-2.6.16.33-noxen/include/asm-ia64/hw_irq.h linux-2.6.16.33/include/asm-ia64/hw_irq.h
91404 --- linux-2.6.16.33-noxen/include/asm-ia64/hw_irq.h 2006-11-22 18:06:31.000000000 +0000
91405 +++ linux-2.6.16.33/include/asm-ia64/hw_irq.h 2007-01-08 15:00:46.000000000 +0000
91406 @@ -15,7 +15,11 @@
91407 #include <asm/ptrace.h>
91408 #include <asm/smp.h>
91409
91410 +#ifndef CONFIG_XEN
91411 typedef u8 ia64_vector;
91412 +#else
91413 +typedef u16 ia64_vector;
91414 +#endif
91415
91416 /*
91417 * 0 special
91418 @@ -89,6 +93,13 @@
91419 static inline void
91420 hw_resend_irq (struct hw_interrupt_type *h, unsigned int vector)
91421 {
91422 +#ifdef CONFIG_XEN
91423 + extern void resend_irq_on_evtchn(struct hw_interrupt_type *h,
91424 + unsigned int i);
91425 + if (is_running_on_xen())
91426 + resend_irq_on_evtchn(h, vector);
91427 + else
91428 +#endif /* CONFIG_XEN */
91429 platform_send_ipi(smp_processor_id(), vector, IA64_IPI_DM_INT, 0);
91430 }
91431
91432 diff -Nur linux-2.6.16.33-noxen/include/asm-ia64/hypercall.h linux-2.6.16.33/include/asm-ia64/hypercall.h
91433 --- linux-2.6.16.33-noxen/include/asm-ia64/hypercall.h 1970-01-01 00:00:00.000000000 +0000
91434 +++ linux-2.6.16.33/include/asm-ia64/hypercall.h 2007-01-08 15:00:46.000000000 +0000
91435 @@ -0,0 +1,463 @@
91436 +/******************************************************************************
91437 + * hypercall.h
91438 + *
91439 + * Linux-specific hypervisor handling.
91440 + *
91441 + * Copyright (c) 2002-2004, K A Fraser
91442 + *
91443 + * This program is free software; you can redistribute it and/or
91444 + * modify it under the terms of the GNU General Public License version 2
91445 + * as published by the Free Software Foundation; or, when distributed
91446 + * separately from the Linux kernel or incorporated into other
91447 + * software packages, subject to the following license:
91448 + *
91449 + * Permission is hereby granted, free of charge, to any person obtaining a copy
91450 + * of this source file (the "Software"), to deal in the Software without
91451 + * restriction, including without limitation the rights to use, copy, modify,
91452 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
91453 + * and to permit persons to whom the Software is furnished to do so, subject to
91454 + * the following conditions:
91455 + *
91456 + * The above copyright notice and this permission notice shall be included in
91457 + * all copies or substantial portions of the Software.
91458 + *
91459 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
91460 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
91461 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
91462 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
91463 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
91464 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
91465 + * IN THE SOFTWARE.
91466 + */
91467 +
91468 +#ifndef __HYPERCALL_H__
91469 +#define __HYPERCALL_H__
91470 +
91471 +#ifndef __HYPERVISOR_H__
91472 +# error "please don't include this file directly"
91473 +#endif
91474 +
91475 +#include <asm/xen/xcom_hcall.h>
91476 +struct xencomm_handle;
91477 +
91478 +/*
91479 + * Assembler stubs for hyper-calls.
91480 + */
91481 +
91482 +#define _hypercall0(type, name) \
91483 +({ \
91484 + long __res; \
91485 + __asm__ __volatile__ (";;\n" \
91486 + "mov r2=%1\n" \
91487 + "break 0x1000 ;;\n" \
91488 + "mov %0=r8 ;;\n" \
91489 + : "=r" (__res) \
91490 + : "J" (__HYPERVISOR_##name) \
91491 + : "r2","r8", \
91492 + "memory" ); \
91493 + (type)__res; \
91494 +})
91495 +
91496 +#define _hypercall1(type, name, a1) \
91497 +({ \
91498 + long __res; \
91499 + __asm__ __volatile__ (";;\n" \
91500 + "mov r14=%2\n" \
91501 + "mov r2=%1\n" \
91502 + "break 0x1000 ;;\n" \
91503 + "mov %0=r8 ;;\n" \
91504 + : "=r" (__res) \
91505 + : "J" (__HYPERVISOR_##name), \
91506 + "rI" ((unsigned long)(a1)) \
91507 + : "r14","r2","r8", \
91508 + "memory" ); \
91509 + (type)__res; \
91510 +})
91511 +
91512 +#define _hypercall2(type, name, a1, a2) \
91513 +({ \
91514 + long __res; \
91515 + __asm__ __volatile__ (";;\n" \
91516 + "mov r14=%2\n" \
91517 + "mov r15=%3\n" \
91518 + "mov r2=%1\n" \
91519 + "break 0x1000 ;;\n" \
91520 + "mov %0=r8 ;;\n" \
91521 + : "=r" (__res) \
91522 + : "J" (__HYPERVISOR_##name), \
91523 + "rI" ((unsigned long)(a1)), \
91524 + "rI" ((unsigned long)(a2)) \
91525 + : "r14","r15","r2","r8", \
91526 + "memory" ); \
91527 + (type)__res; \
91528 +})
91529 +
91530 +#define _hypercall3(type, name, a1, a2, a3) \
91531 +({ \
91532 + long __res; \
91533 + __asm__ __volatile__ (";;\n" \
91534 + "mov r14=%2\n" \
91535 + "mov r15=%3\n" \
91536 + "mov r16=%4\n" \
91537 + "mov r2=%1\n" \
91538 + "break 0x1000 ;;\n" \
91539 + "mov %0=r8 ;;\n" \
91540 + : "=r" (__res) \
91541 + : "J" (__HYPERVISOR_##name), \
91542 + "rI" ((unsigned long)(a1)), \
91543 + "rI" ((unsigned long)(a2)), \
91544 + "rI" ((unsigned long)(a3)) \
91545 + : "r14","r15","r16","r2","r8", \
91546 + "memory" ); \
91547 + (type)__res; \
91548 +})
91549 +
91550 +#define _hypercall4(type, name, a1, a2, a3, a4) \
91551 +({ \
91552 + long __res; \
91553 + __asm__ __volatile__ (";;\n" \
91554 + "mov r14=%2\n" \
91555 + "mov r15=%3\n" \
91556 + "mov r16=%4\n" \
91557 + "mov r17=%5\n" \
91558 + "mov r2=%1\n" \
91559 + "break 0x1000 ;;\n" \
91560 + "mov %0=r8 ;;\n" \
91561 + : "=r" (__res) \
91562 + : "J" (__HYPERVISOR_##name), \
91563 + "rI" ((unsigned long)(a1)), \
91564 + "rI" ((unsigned long)(a2)), \
91565 + "rI" ((unsigned long)(a3)), \
91566 + "rI" ((unsigned long)(a4)) \
91567 + : "r14","r15","r16","r2","r8", \
91568 + "r17","memory" ); \
91569 + (type)__res; \
91570 +})
91571 +
91572 +#define _hypercall5(type, name, a1, a2, a3, a4, a5) \
91573 +({ \
91574 + long __res; \
91575 + __asm__ __volatile__ (";;\n" \
91576 + "mov r14=%2\n" \
91577 + "mov r15=%3\n" \
91578 + "mov r16=%4\n" \
91579 + "mov r17=%5\n" \
91580 + "mov r18=%6\n" \
91581 + "mov r2=%1\n" \
91582 + "break 0x1000 ;;\n" \
91583 + "mov %0=r8 ;;\n" \
91584 + : "=r" (__res) \
91585 + : "J" (__HYPERVISOR_##name), \
91586 + "rI" ((unsigned long)(a1)), \
91587 + "rI" ((unsigned long)(a2)), \
91588 + "rI" ((unsigned long)(a3)), \
91589 + "rI" ((unsigned long)(a4)), \
91590 + "rI" ((unsigned long)(a5)) \
91591 + : "r14","r15","r16","r2","r8", \
91592 + "r17","r18","memory" ); \
91593 + (type)__res; \
91594 +})
91595 +
91596 +
91597 +static inline int
91598 +xencomm_arch_hypercall_sched_op(int cmd, struct xencomm_handle *arg)
91599 +{
91600 + return _hypercall2(int, sched_op, cmd, arg);
91601 +}
91602 +
91603 +static inline long
91604 +HYPERVISOR_set_timer_op(u64 timeout)
91605 +{
91606 + unsigned long timeout_hi = (unsigned long)(timeout >> 32);
91607 + unsigned long timeout_lo = (unsigned long)timeout;
91608 + return _hypercall2(long, set_timer_op, timeout_lo, timeout_hi);
91609 +}
91610 +
91611 +static inline int
91612 +xencomm_arch_hypercall_dom0_op(struct xencomm_handle *op)
91613 +{
91614 + return _hypercall1(int, dom0_op, op);
91615 +}
91616 +
91617 +static inline int
91618 +xencomm_arch_hypercall_sysctl(struct xencomm_handle *op)
91619 +{
91620 + return _hypercall1(int, sysctl, op);
91621 +}
91622 +
91623 +static inline int
91624 +xencomm_arch_hypercall_domctl(struct xencomm_handle *op)
91625 +{
91626 + return _hypercall1(int, domctl, op);
91627 +}
91628 +
91629 +static inline int
91630 +xencomm_arch_hypercall_multicall(struct xencomm_handle *call_list,
91631 + int nr_calls)
91632 +{
91633 + return _hypercall2(int, multicall, call_list, nr_calls);
91634 +}
91635 +
91636 +static inline int
91637 +xencomm_arch_hypercall_memory_op(unsigned int cmd, struct xencomm_handle *arg)
91638 +{
91639 + return _hypercall2(int, memory_op, cmd, arg);
91640 +}
91641 +
91642 +static inline int
91643 +xencomm_arch_hypercall_event_channel_op(int cmd, struct xencomm_handle *arg)
91644 +{
91645 + return _hypercall2(int, event_channel_op, cmd, arg);
91646 +}
91647 +
91648 +static inline int
91649 +xencomm_arch_hypercall_acm_op(unsigned int cmd, struct xencomm_handle *arg)
91650 +{
91651 + return _hypercall2(int, acm_op, cmd, arg);
91652 +}
91653 +
91654 +static inline int
91655 +xencomm_arch_hypercall_xen_version(int cmd, struct xencomm_handle *arg)
91656 +{
91657 + return _hypercall2(int, xen_version, cmd, arg);
91658 +}
91659 +
91660 +static inline int
91661 +xencomm_arch_hypercall_console_io(int cmd, int count,
91662 + struct xencomm_handle *str)
91663 +{
91664 + return _hypercall3(int, console_io, cmd, count, str);
91665 +}
91666 +
91667 +static inline int
91668 +xencomm_arch_hypercall_physdev_op(int cmd, struct xencomm_handle *arg)
91669 +{
91670 + return _hypercall2(int, physdev_op, cmd, arg);
91671 +}
91672 +
91673 +static inline int
91674 +xencomm_arch_hypercall_grant_table_op(unsigned int cmd,
91675 + struct xencomm_handle *uop,
91676 + unsigned int count)
91677 +{
91678 + return _hypercall3(int, grant_table_op, cmd, uop, count);
91679 +}
91680 +
91681 +int HYPERVISOR_grant_table_op(unsigned int cmd, void *uop, unsigned int count);
91682 +
91683 +extern int xencomm_arch_hypercall_suspend(struct xencomm_handle *arg);
91684 +
91685 +static inline int
91686 +xencomm_arch_hypercall_callback_op(int cmd, struct xencomm_handle *arg)
91687 +{
91688 + return _hypercall2(int, callback_op, cmd, arg);
91689 +}
91690 +
91691 +static inline unsigned long
91692 +xencomm_arch_hypercall_hvm_op(int cmd, void *arg)
91693 +{
91694 + return _hypercall2(unsigned long, hvm_op, cmd, arg);
91695 +}
91696 +
91697 +static inline int
91698 +HYPERVISOR_physdev_op(int cmd, void *arg)
91699 +{
91700 + switch (cmd) {
91701 + case PHYSDEVOP_eoi:
91702 + return _hypercall1(int, ia64_fast_eoi,
91703 + ((struct physdev_eoi *)arg)->irq);
91704 + default:
91705 + return xencomm_hypercall_physdev_op(cmd, arg);
91706 + }
91707 +}
91708 +
91709 +static inline int
91710 +xencomm_arch_hypercall_xenoprof_op(int op, struct xencomm_handle *arg)
91711 +{
91712 + return _hypercall2(int, xenoprof_op, op, arg);
91713 +}
91714 +
91715 +extern fastcall unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs);
91716 +static inline void exit_idle(void) {}
91717 +#define do_IRQ(irq, regs) ({ \
91718 + irq_enter(); \
91719 + __do_IRQ((irq), (regs)); \
91720 + irq_exit(); \
91721 +})
91722 +
91723 +#include <linux/err.h>
91724 +#ifdef CONFIG_XEN
91725 +#include <asm/xen/privop.h>
91726 +#endif /* CONFIG_XEN */
91727 +#ifdef HAVE_XEN_PLATFORM_COMPAT_H
91728 +#include <xen/platform-compat.h>
91729 +#endif
91730 +
91731 +static inline unsigned long
91732 +__HYPERVISOR_ioremap(unsigned long ioaddr, unsigned long size)
91733 +{
91734 + return _hypercall3(unsigned long, ia64_dom0vp_op,
91735 + IA64_DOM0VP_ioremap, ioaddr, size);
91736 +}
91737 +
91738 +static inline unsigned long
91739 +HYPERVISOR_ioremap(unsigned long ioaddr, unsigned long size)
91740 +{
91741 + unsigned long ret = ioaddr;
91742 + if (is_running_on_xen()) {
91743 + ret = __HYPERVISOR_ioremap(ioaddr, size);
91744 + if (unlikely(ret == -ENOSYS))
91745 + panic("hypercall %s failed with %ld. "
91746 + "Please check Xen and Linux config mismatch\n",
91747 + __func__, -ret);
91748 + else if (unlikely(IS_ERR_VALUE(ret)))
91749 + ret = ioaddr;
91750 + }
91751 + return ret;
91752 +}
91753 +
91754 +static inline unsigned long
91755 +__HYPERVISOR_phystomach(unsigned long gpfn)
91756 +{
91757 + return _hypercall2(unsigned long, ia64_dom0vp_op,
91758 + IA64_DOM0VP_phystomach, gpfn);
91759 +}
91760 +
91761 +static inline unsigned long
91762 +HYPERVISOR_phystomach(unsigned long gpfn)
91763 +{
91764 + unsigned long ret = gpfn;
91765 + if (is_running_on_xen()) {
91766 + ret = __HYPERVISOR_phystomach(gpfn);
91767 + }
91768 + return ret;
91769 +}
91770 +
91771 +static inline unsigned long
91772 +__HYPERVISOR_machtophys(unsigned long mfn)
91773 +{
91774 + return _hypercall2(unsigned long, ia64_dom0vp_op,
91775 + IA64_DOM0VP_machtophys, mfn);
91776 +}
91777 +
91778 +static inline unsigned long
91779 +HYPERVISOR_machtophys(unsigned long mfn)
91780 +{
91781 + unsigned long ret = mfn;
91782 + if (is_running_on_xen()) {
91783 + ret = __HYPERVISOR_machtophys(mfn);
91784 + }
91785 + return ret;
91786 +}
91787 +
91788 +static inline unsigned long
91789 +__HYPERVISOR_zap_physmap(unsigned long gpfn, unsigned int extent_order)
91790 +{
91791 + return _hypercall3(unsigned long, ia64_dom0vp_op,
91792 + IA64_DOM0VP_zap_physmap, gpfn, extent_order);
91793 +}
91794 +
91795 +static inline unsigned long
91796 +HYPERVISOR_zap_physmap(unsigned long gpfn, unsigned int extent_order)
91797 +{
91798 + unsigned long ret = 0;
91799 + if (is_running_on_xen()) {
91800 + ret = __HYPERVISOR_zap_physmap(gpfn, extent_order);
91801 + }
91802 + return ret;
91803 +}
91804 +
91805 +static inline unsigned long
91806 +__HYPERVISOR_add_physmap(unsigned long gpfn, unsigned long mfn,
91807 + unsigned long flags, domid_t domid)
91808 +{
91809 + return _hypercall5(unsigned long, ia64_dom0vp_op,
91810 + IA64_DOM0VP_add_physmap, gpfn, mfn, flags, domid);
91811 +}
91812 +
91813 +static inline unsigned long
91814 +HYPERVISOR_add_physmap(unsigned long gpfn, unsigned long mfn,
91815 + unsigned long flags, domid_t domid)
91816 +{
91817 + unsigned long ret = 0;
91818 + BUG_ON(!is_running_on_xen());//XXX
91819 + if (is_running_on_xen()) {
91820 + ret = __HYPERVISOR_add_physmap(gpfn, mfn, flags, domid);
91821 + }
91822 + return ret;
91823 +}
91824 +
91825 +static inline unsigned long
91826 +__HYPERVISOR_add_physmap_with_gmfn(unsigned long gpfn, unsigned long gmfn,
91827 + unsigned long flags, domid_t domid)
91828 +{
91829 + return _hypercall5(unsigned long, ia64_dom0vp_op,
91830 + IA64_DOM0VP_add_physmap_with_gmfn,
91831 + gpfn, gmfn, flags, domid);
91832 +}
91833 +
91834 +static inline unsigned long
91835 +HYPERVISOR_add_physmap_with_gmfn(unsigned long gpfn, unsigned long gmfn,
91836 + unsigned long flags, domid_t domid)
91837 +{
91838 + unsigned long ret = 0;
91839 + BUG_ON(!is_running_on_xen());//XXX
91840 + if (is_running_on_xen()) {
91841 + ret = __HYPERVISOR_add_physmap_with_gmfn(gpfn, gmfn,
91842 + flags, domid);
91843 + }
91844 + return ret;
91845 +}
91846 +
91847 +#ifdef CONFIG_XEN_IA64_EXPOSE_P2M
91848 +static inline unsigned long
91849 +HYPERVISOR_expose_p2m(unsigned long conv_start_gpfn,
91850 + unsigned long assign_start_gpfn,
91851 + unsigned long expose_size, unsigned long granule_pfn)
91852 +{
91853 + return _hypercall5(unsigned long, ia64_dom0vp_op,
91854 + IA64_DOM0VP_expose_p2m, conv_start_gpfn,
91855 + assign_start_gpfn, expose_size, granule_pfn);
91856 +}
91857 +#endif
91858 +
91859 +static inline int
91860 +xencomm_arch_hypercall_perfmon_op(unsigned long cmd,
91861 + struct xencomm_handle *arg,
91862 + unsigned long count)
91863 +{
91864 + return _hypercall4(int, ia64_dom0vp_op,
91865 + IA64_DOM0VP_perfmon, cmd, arg, count);
91866 +}
91867 +
91868 +// for balloon driver
91869 +#define HYPERVISOR_update_va_mapping(va, new_val, flags) (0)
91870 +
91871 +/* Use xencomm to do hypercalls. */
91872 +#ifdef MODULE
91873 +#define HYPERVISOR_sched_op xencomm_mini_hypercall_sched_op
91874 +#define HYPERVISOR_event_channel_op xencomm_mini_hypercall_event_channel_op
91875 +#define HYPERVISOR_callback_op xencomm_mini_hypercall_callback_op
91876 +#define HYPERVISOR_multicall xencomm_mini_hypercall_multicall
91877 +#define HYPERVISOR_xen_version xencomm_mini_hypercall_xen_version
91878 +#define HYPERVISOR_console_io xencomm_mini_hypercall_console_io
91879 +#define HYPERVISOR_hvm_op xencomm_mini_hypercall_hvm_op
91880 +#define HYPERVISOR_memory_op xencomm_mini_hypercall_memory_op
91881 +#define HYPERVISOR_xenoprof_op xencomm_mini_hypercall_xenoprof_op
91882 +#define HYPERVISOR_perfmon_op xencomm_mini_hypercall_perfmon_op
91883 +#else
91884 +#define HYPERVISOR_sched_op xencomm_hypercall_sched_op
91885 +#define HYPERVISOR_event_channel_op xencomm_hypercall_event_channel_op
91886 +#define HYPERVISOR_callback_op xencomm_hypercall_callback_op
91887 +#define HYPERVISOR_multicall xencomm_hypercall_multicall
91888 +#define HYPERVISOR_xen_version xencomm_hypercall_xen_version
91889 +#define HYPERVISOR_console_io xencomm_hypercall_console_io
91890 +#define HYPERVISOR_hvm_op xencomm_hypercall_hvm_op
91891 +#define HYPERVISOR_memory_op xencomm_hypercall_memory_op
91892 +#define HYPERVISOR_xenoprof_op xencomm_hypercall_xenoprof_op
91893 +#define HYPERVISOR_perfmon_op xencomm_hypercall_perfmon_op
91894 +#endif
91895 +
91896 +#define HYPERVISOR_suspend xencomm_hypercall_suspend
91897 +
91898 +#endif /* __HYPERCALL_H__ */
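
The _hypercall0 ... _hypercall5 macros above encode the ia64 hypercall convention used throughout this header: the hypercall index is loaded into r2, up to five arguments into r14-r18, "break 0x1000" traps into Xen, and the result comes back in r8. A wrapper simply picks the macro matching its arity, as in the hypothetical example below (example_op and its __HYPERVISOR_example_op number are illustrative names, not hypercalls defined by this patch).

	/* Hypothetical two-argument wrapper following the pattern above. */
	static inline int
	xencomm_arch_hypercall_example_op(unsigned int cmd, struct xencomm_handle *arg)
	{
		return _hypercall2(int, example_op, cmd, arg);
	}
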
91899 diff -Nur linux-2.6.16.33-noxen/include/asm-ia64/hypervisor.h linux-2.6.16.33/include/asm-ia64/hypervisor.h
91900 --- linux-2.6.16.33-noxen/include/asm-ia64/hypervisor.h 1970-01-01 00:00:00.000000000 +0000
91901 +++ linux-2.6.16.33/include/asm-ia64/hypervisor.h 2007-01-08 15:00:46.000000000 +0000
91902 @@ -0,0 +1,223 @@
91903 +/******************************************************************************
91904 + * hypervisor.h
91905 + *
91906 + * Linux-specific hypervisor handling.
91907 + *
91908 + * Copyright (c) 2002-2004, K A Fraser
91909 + *
91910 + * This program is free software; you can redistribute it and/or
91911 + * modify it under the terms of the GNU General Public License version 2
91912 + * as published by the Free Software Foundation; or, when distributed
91913 + * separately from the Linux kernel or incorporated into other
91914 + * software packages, subject to the following license:
91915 + *
91916 + * Permission is hereby granted, free of charge, to any person obtaining a copy
91917 + * of this source file (the "Software"), to deal in the Software without
91918 + * restriction, including without limitation the rights to use, copy, modify,
91919 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
91920 + * and to permit persons to whom the Software is furnished to do so, subject to
91921 + * the following conditions:
91922 + *
91923 + * The above copyright notice and this permission notice shall be included in
91924 + * all copies or substantial portions of the Software.
91925 + *
91926 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
91927 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
91928 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
91929 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
91930 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
91931 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
91932 + * IN THE SOFTWARE.
91933 + */
91934 +
91935 +#ifndef __HYPERVISOR_H__
91936 +#define __HYPERVISOR_H__
91937 +
91938 +#ifdef CONFIG_XEN
91939 +extern int running_on_xen;
91940 +#define is_running_on_xen() (running_on_xen)
91941 +#else /* CONFIG_XEN */
91942 +# ifdef CONFIG_VMX_GUEST
91943 +# define is_running_on_xen() (1)
91944 +# else /* CONFIG_VMX_GUEST */
91945 +# define is_running_on_xen() (0)
91946 +# define HYPERVISOR_ioremap(offset, size) (offset)
91947 +# endif /* CONFIG_VMX_GUEST */
91948 +#endif /* CONFIG_XEN */
91949 +
91950 +#if defined(CONFIG_XEN) || defined(CONFIG_VMX_GUEST)
91951 +#include <linux/config.h>
91952 +#include <linux/types.h>
91953 +#include <linux/kernel.h>
91954 +#include <linux/version.h>
91955 +#include <linux/errno.h>
91956 +#include <xen/interface/xen.h>
91957 +#include <xen/interface/dom0_ops.h>
91958 +#include <xen/interface/event_channel.h>
91959 +#include <xen/interface/physdev.h>
91960 +#include <xen/interface/sched.h>
91961 +#include <asm/hypercall.h>
91962 +#include <asm/ptrace.h>
91963 +#include <asm/page.h>
91964 +
91965 +extern shared_info_t *HYPERVISOR_shared_info;
91966 +extern start_info_t *xen_start_info;
91967 +
91968 +void force_evtchn_callback(void);
91969 +
91970 +#ifndef CONFIG_VMX_GUEST
91971 +/* Turn jiffies into Xen system time. XXX Implement me. */
91972 +#define jiffies_to_st(j) 0
91973 +
91974 +static inline int
91975 +HYPERVISOR_yield(
91976 + void)
91977 +{
91978 + int rc = HYPERVISOR_sched_op(SCHEDOP_yield, NULL);
91979 +
91980 + return rc;
91981 +}
91982 +
91983 +static inline int
91984 +HYPERVISOR_block(
91985 + void)
91986 +{
91987 + int rc = HYPERVISOR_sched_op(SCHEDOP_block, NULL);
91988 +
91989 + return rc;
91990 +}
91991 +
91992 +static inline int
91993 +HYPERVISOR_shutdown(
91994 + unsigned int reason)
91995 +{
91996 + struct sched_shutdown sched_shutdown = {
91997 + .reason = reason
91998 + };
91999 +
92000 + int rc = HYPERVISOR_sched_op(SCHEDOP_shutdown, &sched_shutdown);
92001 +
92002 + return rc;
92003 +}
92004 +
92005 +static inline int
92006 +HYPERVISOR_poll(
92007 + evtchn_port_t *ports, unsigned int nr_ports, u64 timeout)
92008 +{
92009 + struct sched_poll sched_poll = {
92010 + .nr_ports = nr_ports,
92011 + .timeout = jiffies_to_st(timeout)
92012 + };
92013 +
92014 + int rc;
92015 +
92016 + set_xen_guest_handle(sched_poll.ports, ports);
92017 + rc = HYPERVISOR_sched_op(SCHEDOP_poll, &sched_poll);
92018 +
92019 + return rc;
92020 +}
92021 +
92022 +// for drivers/xen/privcmd/privcmd.c
92023 +#define machine_to_phys_mapping 0
92024 +struct vm_area_struct;
92025 +int direct_remap_pfn_range(struct vm_area_struct *vma,
92026 + unsigned long address,
92027 + unsigned long mfn,
92028 + unsigned long size,
92029 + pgprot_t prot,
92030 + domid_t domid);
92031 +struct file;
92032 +int privcmd_enforce_singleshot_mapping(struct vm_area_struct *vma);
92033 +int privcmd_mmap(struct file * file, struct vm_area_struct * vma);
92034 +#define HAVE_ARCH_PRIVCMD_MMAP
92035 +
92036 +// for drivers/xen/balloon/balloon.c
92037 +#ifdef CONFIG_XEN_SCRUB_PAGES
92038 +#define scrub_pages(_p,_n) memset((void *)(_p), 0, (_n) << PAGE_SHIFT)
92039 +#else
92040 +#define scrub_pages(_p,_n) ((void)0)
92041 +#endif
92042 +#define pte_mfn(_x) pte_pfn(_x)
92043 +#define phys_to_machine_mapping_valid(_x) (1)
92044 +
92045 +#endif /* !CONFIG_VMX_GUEST */
92046 +
92047 +#define __pte_ma(_x) ((pte_t) {(_x)}) /* unmodified use */
92048 +#define pfn_pte_ma(_x,_y) __pte_ma(0) /* unmodified use */
92049 +
92050 +#ifndef CONFIG_VMX_GUEST
92051 +int __xen_create_contiguous_region(unsigned long vstart, unsigned int order, unsigned int address_bits);
92052 +static inline int
92053 +xen_create_contiguous_region(unsigned long vstart,
92054 + unsigned int order, unsigned int address_bits)
92055 +{
92056 + int ret = 0;
92057 + if (is_running_on_xen()) {
92058 + ret = __xen_create_contiguous_region(vstart, order,
92059 + address_bits);
92060 + }
92061 + return ret;
92062 +}
92063 +
92064 +void __xen_destroy_contiguous_region(unsigned long vstart, unsigned int order);
92065 +static inline void
92066 +xen_destroy_contiguous_region(unsigned long vstart, unsigned int order)
92067 +{
92068 + if (is_running_on_xen())
92069 + __xen_destroy_contiguous_region(vstart, order);
92070 +}
92071 +
92072 +#endif /* !CONFIG_VMX_GUEST */
92073 +
92074 +// for netfront.c, netback.c
92075 +#define MULTI_UVMFLAGS_INDEX 0 //XXX any value
92076 +
92077 +static inline void
92078 +MULTI_update_va_mapping(
92079 + multicall_entry_t *mcl, unsigned long va,
92080 + pte_t new_val, unsigned long flags)
92081 +{
92082 + mcl->op = __HYPERVISOR_update_va_mapping;
92083 + mcl->result = 0;
92084 +}
92085 +
92086 +static inline void
92087 +MULTI_grant_table_op(multicall_entry_t *mcl, unsigned int cmd,
92088 + void *uop, unsigned int count)
92089 +{
92090 + mcl->op = __HYPERVISOR_grant_table_op;
92091 + mcl->args[0] = cmd;
92092 + mcl->args[1] = (unsigned long)uop;
92093 + mcl->args[2] = count;
92094 +}
92095 +
92096 +/*
92097 + * for blktap.c
92098 + * int create_lookup_pte_addr(struct mm_struct *mm,
92099 + * unsigned long address,
92100 + * uint64_t *ptep);
92101 + */
92102 +#define create_lookup_pte_addr(mm, address, ptep) \
92103 + ({ \
92104 + printk(KERN_EMERG \
92105 + "%s:%d " \
92106 + "create_lookup_pte_addr() isn't supported.\n", \
92107 + __func__, __LINE__); \
92108 + BUG(); \
92109 + (-ENOSYS); \
92110 + })
92111 +
92112 +// for debug
92113 +asmlinkage int xprintk(const char *fmt, ...);
92114 +#define xprintd(fmt, ...) xprintk("%s:%d " fmt, __func__, __LINE__, \
92115 + ##__VA_ARGS__)
92116 +
92117 +#endif /* CONFIG_XEN || CONFIG_VMX_GUEST */
92118 +
92119 +#ifdef CONFIG_XEN_PRIVILEGED_GUEST
92120 +#define is_initial_xendomain() (xen_start_info->flags & SIF_INITDOMAIN)
92121 +#else
92122 +#define is_initial_xendomain() 0
92123 +#endif
92124 +
92125 +#endif /* __HYPERVISOR_H__ */
92126 diff -Nur linux-2.6.16.33-noxen/include/asm-ia64/intel_intrin.h linux-2.6.16.33/include/asm-ia64/intel_intrin.h
92127 --- linux-2.6.16.33-noxen/include/asm-ia64/intel_intrin.h 2006-11-22 18:06:31.000000000 +0000
92128 +++ linux-2.6.16.33/include/asm-ia64/intel_intrin.h 2007-01-08 15:00:46.000000000 +0000
92129 @@ -119,10 +119,10 @@
92130 * intrinsic
92131 */
92132
92133 -#define ia64_getreg __getReg
92134 -#define ia64_setreg __setReg
92135 +#define __ia64_getreg __getReg
92136 +#define __ia64_setreg __setReg
92137
92138 -#define ia64_hint(x)
92139 +#define __ia64_hint(x)
92140
92141 #define ia64_mux1_brcst 0
92142 #define ia64_mux1_mix 8
92143 @@ -135,16 +135,16 @@
92144 #define ia64_getf_exp __getf_exp
92145 #define ia64_shrp _m64_shrp
92146
92147 -#define ia64_tpa __tpa
92148 +#define __ia64_tpa __tpa
92149 #define ia64_invala __invala
92150 #define ia64_invala_gr __invala_gr
92151 #define ia64_invala_fr __invala_fr
92152 #define ia64_nop __nop
92153 #define ia64_sum __sum
92154 -#define ia64_ssm __ssm
92155 +#define __ia64_ssm __ssm
92156 #define ia64_rum __rum
92157 -#define ia64_rsm __rsm
92158 -#define ia64_fc __fc
92159 +#define __ia64_rsm __rsm
92160 +#define __ia64_fc __fc
92161
92162 #define ia64_ldfs __ldfs
92163 #define ia64_ldfd __ldfd
92164 @@ -182,24 +182,24 @@
92165
92166 #define __ia64_set_dbr(index, val) \
92167 __setIndReg(_IA64_REG_INDR_DBR, index, val)
92168 -#define ia64_set_ibr(index, val) \
92169 +#define __ia64_set_ibr(index, val) \
92170 __setIndReg(_IA64_REG_INDR_IBR, index, val)
92171 -#define ia64_set_pkr(index, val) \
92172 +#define __ia64_set_pkr(index, val) \
92173 __setIndReg(_IA64_REG_INDR_PKR, index, val)
92174 -#define ia64_set_pmc(index, val) \
92175 +#define __ia64_set_pmc(index, val) \
92176 __setIndReg(_IA64_REG_INDR_PMC, index, val)
92177 -#define ia64_set_pmd(index, val) \
92178 +#define __ia64_set_pmd(index, val) \
92179 __setIndReg(_IA64_REG_INDR_PMD, index, val)
92180 -#define ia64_set_rr(index, val) \
92181 +#define __ia64_set_rr(index, val) \
92182 __setIndReg(_IA64_REG_INDR_RR, index, val)
92183
92184 -#define ia64_get_cpuid(index) __getIndReg(_IA64_REG_INDR_CPUID, index)
92185 +#define __ia64_get_cpuid(index) __getIndReg(_IA64_REG_INDR_CPUID, index)
92186 #define __ia64_get_dbr(index) __getIndReg(_IA64_REG_INDR_DBR, index)
92187 -#define ia64_get_ibr(index) __getIndReg(_IA64_REG_INDR_IBR, index)
92188 -#define ia64_get_pkr(index) __getIndReg(_IA64_REG_INDR_PKR, index)
92189 -#define ia64_get_pmc(index) __getIndReg(_IA64_REG_INDR_PMC, index)
92190 -#define ia64_get_pmd(index) __getIndReg(_IA64_REG_INDR_PMD, index)
92191 -#define ia64_get_rr(index) __getIndReg(_IA64_REG_INDR_RR, index)
92192 +#define __ia64_get_ibr(index) __getIndReg(_IA64_REG_INDR_IBR, index)
92193 +#define __ia64_get_pkr(index) __getIndReg(_IA64_REG_INDR_PKR, index)
92194 +#define __ia64_get_pmc(index) __getIndReg(_IA64_REG_INDR_PMC, index)
92195 +#define __ia64_get_pmd(index) __getIndReg(_IA64_REG_INDR_PMD, index)
92196 +#define __ia64_get_rr(index) __getIndReg(_IA64_REG_INDR_RR, index)
92197
92198 #define ia64_srlz_d __dsrlz
92199 #define ia64_srlz_i __isrlz
92200 @@ -218,18 +218,18 @@
92201 #define ia64_ld8_acq __ld8_acq
92202
92203 #define ia64_sync_i __synci
92204 -#define ia64_thash __thash
92205 -#define ia64_ttag __ttag
92206 -#define ia64_itcd __itcd
92207 -#define ia64_itci __itci
92208 -#define ia64_itrd __itrd
92209 -#define ia64_itri __itri
92210 -#define ia64_ptce __ptce
92211 -#define ia64_ptcl __ptcl
92212 -#define ia64_ptcg __ptcg
92213 -#define ia64_ptcga __ptcga
92214 -#define ia64_ptri __ptri
92215 -#define ia64_ptrd __ptrd
92216 +#define __ia64_thash __thash
92217 +#define __ia64_ttag __ttag
92218 +#define __ia64_itcd __itcd
92219 +#define __ia64_itci __itci
92220 +#define __ia64_itrd __itrd
92221 +#define __ia64_itri __itri
92222 +#define __ia64_ptce __ptce
92223 +#define __ia64_ptcl __ptcl
92224 +#define __ia64_ptcg __ptcg
92225 +#define __ia64_ptcga __ptcga
92226 +#define __ia64_ptri __ptri
92227 +#define __ia64_ptrd __ptrd
92228 #define ia64_dep_mi _m64_dep_mi
92229
92230 /* Values for lfhint in __lfetch and __lfetch_fault */
92231 @@ -244,14 +244,16 @@
92232 #define ia64_lfetch_fault __lfetch_fault
92233 #define ia64_lfetch_fault_excl __lfetch_fault_excl
92234
92235 -#define ia64_intrin_local_irq_restore(x) \
92236 +#define __ia64_intrin_local_irq_restore(x) \
92237 do { \
92238 if ((x) != 0) { \
92239 - ia64_ssm(IA64_PSR_I); \
92240 + __ia64_ssm(IA64_PSR_I); \
92241 ia64_srlz_d(); \
92242 } else { \
92243 - ia64_rsm(IA64_PSR_I); \
92244 + __ia64_rsm(IA64_PSR_I); \
92245 } \
92246 } while (0)
92247
92248 +#define __ia64_get_psr_i() (__ia64_getreg(_IA64_REG_PSR) & 0x4000UL)
92249 +
92250 #endif /* _ASM_IA64_INTEL_INTRIN_H */
92251 diff -Nur linux-2.6.16.33-noxen/include/asm-ia64/io.h linux-2.6.16.33/include/asm-ia64/io.h
92252 --- linux-2.6.16.33-noxen/include/asm-ia64/io.h 2006-11-22 18:06:31.000000000 +0000
92253 +++ linux-2.6.16.33/include/asm-ia64/io.h 2007-01-08 15:00:46.000000000 +0000
92254 @@ -66,9 +66,11 @@
92255 #define PIO_RESERVED __IA64_UNCACHED_OFFSET
92256 #define HAVE_ARCH_PIO_SIZE
92257
92258 +#include <asm/hypervisor.h>
92259 #include <asm/intrinsics.h>
92260 #include <asm/machvec.h>
92261 #include <asm/page.h>
92262 +#include <asm/privop.h>
92263 #include <asm/system.h>
92264 #include <asm-generic/iomap.h>
92265
92266 @@ -95,9 +97,44 @@
92267 * The following two macros are deprecated and scheduled for removal.
92268 * Please use the PCI-DMA interface defined in <asm/pci.h> instead.
92269 */
92270 +#ifndef CONFIG_XEN
92271 #define bus_to_virt phys_to_virt
92272 #define virt_to_bus virt_to_phys
92273 #define page_to_bus page_to_phys
92274 +#else
92275 +#define bus_to_virt(bus) \
92276 + phys_to_virt(machine_to_phys_for_dma(bus))
92277 +#define virt_to_bus(virt) \
92278 + phys_to_machine_for_dma(virt_to_phys(virt))
92279 +#define page_to_bus(page) \
92280 + phys_to_machine_for_dma(page_to_pseudophys(page))
92281 +
92282 +#define page_to_pseudophys(page) \
92283 + ((dma_addr_t)page_to_pfn(page) << PAGE_SHIFT)
92284 +
92285 +/*
92286 + * Drivers that use page_to_phys() for bus addresses are broken.
92287 + * This includes:
92288 + * drivers/ide/cris/ide-cris.c
92289 + * drivers/scsi/dec_esp.c
92290 + */
92291 +#define page_to_phys(page) (page_to_pseudophys(page))
92292 +#define bvec_to_bus(bv) (page_to_bus((bv)->bv_page) + \
92293 + (unsigned long) (bv)->bv_offset)
92294 +#define bio_to_pseudophys(bio) (page_to_pseudophys(bio_page((bio))) + \
92295 + (unsigned long) bio_offset((bio)))
92296 +#define bvec_to_pseudophys(bv) (page_to_pseudophys((bv)->bv_page) + \
92297 + (unsigned long) (bv)->bv_offset)
92298 +#define BIOVEC_PHYS_MERGEABLE(vec1, vec2) \
92299 + (((bvec_to_bus((vec1)) + (vec1)->bv_len) == bvec_to_bus((vec2))) && \
92300 + ((bvec_to_pseudophys((vec1)) + (vec1)->bv_len) == \
92301 + bvec_to_pseudophys((vec2))))
92302 +
92303 +/* We will be supplying our own /dev/mem implementation */
92304 +#define ARCH_HAS_DEV_MEM
92305 +#define ARCH_HAS_DEV_MEM_MMAP_MEM
92306 +int xen_mmap_mem(struct file * file, struct vm_area_struct * vma);
92307 +#endif /* CONFIG_XEN */
92308
92309 # endif /* KERNEL */
92310
92311 @@ -425,6 +462,9 @@
92312 static inline void __iomem *
92313 ioremap (unsigned long offset, unsigned long size)
92314 {
92315 + offset = HYPERVISOR_ioremap(offset, size);
92316 + if (IS_ERR_VALUE(offset))
92317 + return (void __iomem*)offset;
92318 return (void __iomem *) (__IA64_UNCACHED_OFFSET | (offset));
92319 }
92320
92321 diff -Nur linux-2.6.16.33-noxen/include/asm-ia64/iosapic.h linux-2.6.16.33/include/asm-ia64/iosapic.h
92322 --- linux-2.6.16.33-noxen/include/asm-ia64/iosapic.h 2006-11-22 18:06:31.000000000 +0000
92323 +++ linux-2.6.16.33/include/asm-ia64/iosapic.h 2007-01-08 15:00:46.000000000 +0000
92324 @@ -53,6 +53,7 @@
92325
92326 #define NR_IOSAPICS 256
92327
92328 +#ifndef CONFIG_XEN
92329 static inline unsigned int iosapic_read(char __iomem *iosapic, unsigned int reg)
92330 {
92331 writel(reg, iosapic + IOSAPIC_REG_SELECT);
92332 @@ -64,6 +65,7 @@
92333 writel(reg, iosapic + IOSAPIC_REG_SELECT);
92334 writel(val, iosapic + IOSAPIC_WINDOW);
92335 }
92336 +#endif
92337
92338 static inline void iosapic_eoi(char __iomem *iosapic, u32 vector)
92339 {
92340 diff -Nur linux-2.6.16.33-noxen/include/asm-ia64/irq.h linux-2.6.16.33/include/asm-ia64/irq.h
92341 --- linux-2.6.16.33-noxen/include/asm-ia64/irq.h 2006-11-22 18:06:31.000000000 +0000
92342 +++ linux-2.6.16.33/include/asm-ia64/irq.h 2007-01-08 15:00:46.000000000 +0000
92343 @@ -11,8 +11,41 @@
92344 * 02/29/00 D.Mosberger moved most things into hw_irq.h
92345 */
92346
92347 +#ifndef CONFIG_XEN
92348 #define NR_IRQS 256
92349 #define NR_IRQ_VECTORS NR_IRQS
92350 +#else
92351 +/*
92352 + * The flat IRQ space is divided into two regions:
92353 + * 1. A one-to-one mapping of real physical IRQs. This space is only used
92354 + * if we have physical device-access privilege. This region is at the
92355 + * start of the IRQ space so that existing device drivers do not need
92356 + * to be modified to translate physical IRQ numbers into our IRQ space.
92357 + * 2. A dynamic mapping of inter-domain and Xen-sourced virtual IRQs. These
92358 + * are bound using the provided bind/unbind functions.
92359 + */
92360 +
92361 +#define PIRQ_BASE 0
92362 +#define NR_PIRQS 256
92363 +
92364 +#define DYNIRQ_BASE (PIRQ_BASE + NR_PIRQS)
92365 +#define NR_DYNIRQS 256
92366 +
92367 +#define NR_IRQS (NR_PIRQS + NR_DYNIRQS)
92368 +#define NR_IRQ_VECTORS NR_IRQS
92369 +
92370 +#define pirq_to_irq(_x) ((_x) + PIRQ_BASE)
92371 +#define irq_to_pirq(_x) ((_x) - PIRQ_BASE)
92372 +
92373 +#define dynirq_to_irq(_x) ((_x) + DYNIRQ_BASE)
92374 +#define irq_to_dynirq(_x) ((_x) - DYNIRQ_BASE)
92375 +
92376 +#define RESCHEDULE_VECTOR 0
92377 +#define IPI_VECTOR 1
92378 +#define CMCP_VECTOR 2
92379 +#define CPEP_VECTOR 3
92380 +#define NR_IPIS 4
92381 +#endif /* CONFIG_XEN */
92382
92383 /*
92384 * IRQ line status macro IRQ_PER_CPU is used
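
With the constants introduced above, the flat space is simply [0,255] for physical IRQs followed by [256,511] for dynamically bound (event-channel) IRQs, so the conversion macros reduce to fixed offsets. Below is a stand-alone check of that arithmetic; the defines are repeated only so the snippet compiles on its own.

	/* Illustration of the flat IRQ layout from the hunk above. */
	#include <assert.h>

	#define PIRQ_BASE	0
	#define NR_PIRQS	256
	#define DYNIRQ_BASE	(PIRQ_BASE + NR_PIRQS)

	#define pirq_to_irq(x)		((x) + PIRQ_BASE)
	#define dynirq_to_irq(x)	((x) + DYNIRQ_BASE)
	#define irq_to_dynirq(x)	((x) - DYNIRQ_BASE)

	int main(void)
	{
		assert(pirq_to_irq(10) == 10);		/* physical IRQs keep their numbers        */
		assert(dynirq_to_irq(0) == 256);	/* dynamic IRQs start after the PIRQ range */
		assert(irq_to_dynirq(300) == 44);	/* flat IRQ 300 is dynamic IRQ slot 44     */
		return 0;
	}
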
92385 diff -Nur linux-2.6.16.33-noxen/include/asm-ia64/machvec_dig.h linux-2.6.16.33/include/asm-ia64/machvec_dig.h
92386 --- linux-2.6.16.33-noxen/include/asm-ia64/machvec_dig.h 2006-11-22 18:06:31.000000000 +0000
92387 +++ linux-2.6.16.33/include/asm-ia64/machvec_dig.h 2007-01-08 15:00:46.000000000 +0000
92388 @@ -15,4 +15,19 @@
92389 #define platform_setup dig_setup
92390 #define platform_irq_init dig_irq_init
92391
92392 +#ifdef CONFIG_XEN
92393 +# define platform_dma_map_sg dma_map_sg
92394 +# define platform_dma_unmap_sg dma_unmap_sg
92395 +# define platform_dma_mapping_error dma_mapping_error
92396 +# define platform_dma_supported dma_supported
92397 +# define platform_dma_alloc_coherent dma_alloc_coherent
92398 +# define platform_dma_free_coherent dma_free_coherent
92399 +# define platform_dma_map_single dma_map_single
92400 +# define platform_dma_unmap_single dma_unmap_single
92401 +# define platform_dma_sync_single_for_cpu \
92402 + dma_sync_single_for_cpu
92403 +# define platform_dma_sync_single_for_device \
92404 + dma_sync_single_for_device
92405 +#endif
92406 +
92407 #endif /* _ASM_IA64_MACHVEC_DIG_h */
92408 diff -Nur linux-2.6.16.33-noxen/include/asm-ia64/maddr.h linux-2.6.16.33/include/asm-ia64/maddr.h
92409 --- linux-2.6.16.33-noxen/include/asm-ia64/maddr.h 1970-01-01 00:00:00.000000000 +0000
92410 +++ linux-2.6.16.33/include/asm-ia64/maddr.h 2007-01-08 15:00:46.000000000 +0000
92411 @@ -0,0 +1,102 @@
92412 +#ifndef _ASM_IA64_MADDR_H
92413 +#define _ASM_IA64_MADDR_H
92414 +
92415 +#include <linux/kernel.h>
92416 +#include <asm/hypervisor.h>
92417 +#include <xen/features.h>
92418 +#include <xen/interface/xen.h>
92419 +
92420 +#ifdef CONFIG_XEN
92421 +
92422 +#define INVALID_P2M_ENTRY (~0UL)
92423 +
92424 +#ifdef CONFIG_XEN_IA64_EXPOSE_P2M
92425 +extern int p2m_initialized;
92426 +extern unsigned long p2m_min_low_pfn;
92427 +extern unsigned long p2m_max_low_pfn;
92428 +extern unsigned long p2m_convert_min_pfn;
92429 +extern unsigned long p2m_convert_max_pfn;
92430 +extern volatile const pte_t* p2m_pte;
92431 +unsigned long p2m_phystomach(unsigned long gpfn);
92432 +#else
92433 +#define p2m_initialized (0)
92434 +#define p2m_phystomach(gpfn) INVALID_MFN
92435 +#endif
92436 +
92437 +/* XXX xen page size != page size */
92438 +static inline unsigned long
92439 +pfn_to_mfn_for_dma(unsigned long pfn)
92440 +{
92441 + unsigned long mfn;
92442 + if (p2m_initialized)
92443 + return p2m_phystomach(pfn);
92444 + mfn = HYPERVISOR_phystomach(pfn);
92445 + BUG_ON(mfn == 0); // XXX
92446 + BUG_ON(mfn == INVALID_P2M_ENTRY); // XXX
92447 + BUG_ON(mfn == INVALID_MFN);
92448 + return mfn;
92449 +}
92450 +
92451 +static inline unsigned long
92452 +phys_to_machine_for_dma(unsigned long phys)
92453 +{
92454 + unsigned long machine =
92455 + pfn_to_mfn_for_dma(phys >> PAGE_SHIFT) << PAGE_SHIFT;
92456 + machine |= (phys & ~PAGE_MASK);
92457 + return machine;
92458 +}
92459 +
92460 +static inline unsigned long
92461 +mfn_to_pfn_for_dma(unsigned long mfn)
92462 +{
92463 + unsigned long pfn;
92464 + pfn = HYPERVISOR_machtophys(mfn);
92465 + BUG_ON(pfn == 0);
92466 + //BUG_ON(pfn == INVALID_M2P_ENTRY);
92467 + return pfn;
92468 +}
92469 +
92470 +static inline unsigned long
92471 +machine_to_phys_for_dma(unsigned long machine)
92472 +{
92473 + unsigned long phys =
92474 + mfn_to_pfn_for_dma(machine >> PAGE_SHIFT) << PAGE_SHIFT;
92475 + phys |= (machine & ~PAGE_MASK);
92476 + return phys;
92477 +}
92478 +
92479 +static inline unsigned long
92480 +mfn_to_local_pfn(unsigned long mfn)
92481 +{
92482 + extern unsigned long max_mapnr;
92483 + unsigned long pfn = mfn_to_pfn_for_dma(mfn);
92484 + if (!pfn_valid(pfn))
92485 + return INVALID_P2M_ENTRY;
92486 + return pfn;
92487 +}
92488 +
92489 +#else /* !CONFIG_XEN */
92490 +
92491 +#define pfn_to_mfn_for_dma(pfn) (pfn)
92492 +#define mfn_to_pfn_for_dma(mfn) (mfn)
92493 +#define phys_to_machine_for_dma(phys) (phys)
92494 +#define machine_to_phys_for_dma(machine) (machine)
92495 +#define mfn_to_local_pfn(mfn) (mfn)
92496 +
92497 +#endif /* !CONFIG_XEN */
92498 +
92499 +/* XXX to compile set_phys_to_machine(vaddr, FOREIGN_FRAME(m)) */
92500 +#define FOREIGN_FRAME(m) (INVALID_P2M_ENTRY)
92501 +
92502 +#define mfn_to_pfn(mfn) (mfn)
92503 +#define pfn_to_mfn(pfn) (pfn)
92504 +
92505 +#define mfn_to_virt(mfn) (__va((mfn) << PAGE_SHIFT))
92506 +#define virt_to_mfn(virt) (__pa(virt) >> PAGE_SHIFT)
92507 +#define virt_to_machine(virt) __pa(virt) // for tpmfront.c
92508 +
92509 +#define set_phys_to_machine(pfn, mfn) do { } while (0)
92510 +
92511 +typedef unsigned long maddr_t; // to compile netback, netfront
92512 +
92513 +#endif /* _ASM_IA64_MADDR_H */
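
The conversion helpers in the maddr.h hunk above translate only the frame number and splice the page offset back in unchanged. A compact, self-contained check of that composition, with PAGE_SHIFT and the p2m lookup stubbed out for illustration (they are not the real ia64/Xen values):

#include <stdio.h>

#define PAGE_SHIFT 14                       /* 16KB pages, a common ia64 config */
#define PAGE_MASK  (~((1UL << PAGE_SHIFT) - 1))

/* Stub p2m lookup: pretend machine frames sit 100 frames above pseudo-phys. */
static unsigned long pfn_to_mfn_for_dma(unsigned long pfn)
{
        return pfn + 100;
}

static unsigned long phys_to_machine_for_dma(unsigned long phys)
{
        unsigned long machine =
                pfn_to_mfn_for_dma(phys >> PAGE_SHIFT) << PAGE_SHIFT;
        return machine | (phys & ~PAGE_MASK);   /* keep the in-page offset */
}

int main(void)
{
        unsigned long phys = (3UL << PAGE_SHIFT) + 0x123;
        printf("phys %#lx -> machine %#lx\n", phys,
               phys_to_machine_for_dma(phys));
        return 0;
}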
92514 diff -Nur linux-2.6.16.33-noxen/include/asm-ia64/meminit.h linux-2.6.16.33/include/asm-ia64/meminit.h
92515 --- linux-2.6.16.33-noxen/include/asm-ia64/meminit.h 2006-11-22 18:06:31.000000000 +0000
92516 +++ linux-2.6.16.33/include/asm-ia64/meminit.h 2007-01-08 15:00:46.000000000 +0000
92517 @@ -17,10 +17,15 @@
92518 * - command line string
92519 * - kernel code & data
92520 * - Kernel memory map built from EFI memory map
92521 + * - xen start info
92522 *
92523 * More could be added if necessary
92524 */
92525 +#ifndef CONFIG_XEN
92526 #define IA64_MAX_RSVD_REGIONS 6
92527 +#else
92528 +#define IA64_MAX_RSVD_REGIONS 7
92529 +#endif
92530
92531 struct rsvd_region {
92532 unsigned long start; /* virtual address of beginning of element */
92533 diff -Nur linux-2.6.16.33-noxen/include/asm-ia64/page.h linux-2.6.16.33/include/asm-ia64/page.h
92534 --- linux-2.6.16.33-noxen/include/asm-ia64/page.h 2006-11-22 18:06:31.000000000 +0000
92535 +++ linux-2.6.16.33/include/asm-ia64/page.h 2007-01-08 15:00:46.000000000 +0000
92536 @@ -117,7 +117,9 @@
92537 # define pfn_to_page(pfn) (vmem_map + (pfn))
92538 #endif
92539
92540 +#ifndef CONFIG_XEN
92541 #define page_to_phys(page) (page_to_pfn(page) << PAGE_SHIFT)
92542 +#endif
92543 #define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)
92544 #define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT)
92545
92546 @@ -219,4 +221,53 @@
92547 (((current->personality & READ_IMPLIES_EXEC) != 0) \
92548 ? VM_EXEC : 0))
92549
92550 +#ifndef __ASSEMBLY__
92551 +#ifdef CONFIG_XEN
92552 +
92553 +#include <linux/kernel.h>
92554 +#include <asm/hypervisor.h>
92555 +#include <xen/features.h> // to compile netback, netfront
92556 +
92557 +/*
92558 + * XXX hack!
92559 + * Linux/IA64 uses PG_arch_1.
92560 + * This hack will be removed once PG_foreign bit is taken.
92561 + * #include <xen/foreign_page.h>
92562 + */
92563 +#ifdef __ASM_XEN_FOREIGN_PAGE_H__
92564 +# error "don't include include/xen/foreign_page.h!"
92565 +#endif
92566 +
92567 +extern struct address_space xen_ia64_foreign_dummy_mapping;
92568 +#define PageForeign(page) \
92569 + ((page)->mapping == &xen_ia64_foreign_dummy_mapping)
92570 +
92571 +#define SetPageForeign(page, dtor) do { \
92572 + set_page_private((page), (unsigned long)(dtor)); \
92573 + (page)->mapping = &xen_ia64_foreign_dummy_mapping; \
92574 + smp_rmb(); \
92575 +} while (0)
92576 +
92577 +#define ClearPageForeign(page) do { \
92578 + (page)->mapping = NULL; \
92579 + smp_rmb(); \
92580 + set_page_private((page), 0); \
92581 +} while (0)
92582 +
92583 +#define PageForeignDestructor(page) \
92584 + ( (void (*) (struct page *)) page_private(page) )
92585 +
92586 +#define arch_free_page(_page,_order) \
92587 +({ int foreign = PageForeign(_page); \
92588 + if (foreign) \
92589 + (PageForeignDestructor(_page))(_page); \
92590 + foreign; \
92591 +})
92592 +#define HAVE_ARCH_FREE_PAGE
92593 +
92594 +#include <asm/maddr.h>
92595 +
92596 +#endif /* CONFIG_XEN */
92597 +#endif /* __ASSEMBLY__ */
92598 +
92599 #endif /* _ASM_IA64_PAGE_H */
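
The CONFIG_XEN block in the page.h hunk above marks a foreign page by pointing page->mapping at a dummy address_space and parking a destructor in page_private; arch_free_page() then diverts such pages to that destructor instead of the normal free path. A stand-alone model of the same hand-off, using a stand-in struct rather than the kernel's struct page:

#include <stdio.h>
#include <stddef.h>

/* Minimal stand-in for struct page with the two fields the hack relies on. */
struct page {
        void *mapping;
        unsigned long private_data;   /* plays the role of page_private() */
};

static char foreign_dummy_mapping;    /* address used only as a marker */

static int page_is_foreign(struct page *pg)
{
        return pg->mapping == &foreign_dummy_mapping;
}

static void set_page_foreign(struct page *pg, void (*dtor)(struct page *))
{
        pg->private_data = (unsigned long)dtor;   /* destructor first ... */
        pg->mapping = &foreign_dummy_mapping;     /* ... then the marker */
}

static void grant_dtor(struct page *pg)
{
        printf("returning foreign page %p to its backend\n", (void *)pg);
}

/* Mirrors arch_free_page(): foreign pages are diverted to their destructor
 * and never reach the normal free path (a non-zero return means "handled"). */
static int arch_free_page(struct page *pg)
{
        if (!page_is_foreign(pg))
                return 0;
        ((void (*)(struct page *))pg->private_data)(pg);
        return 1;
}

int main(void)
{
        struct page pg = { NULL, 0 };
        set_page_foreign(&pg, grant_dtor);
        printf("diverted: %d\n", arch_free_page(&pg));
        return 0;
}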
92600 diff -Nur linux-2.6.16.33-noxen/include/asm-ia64/pal.h linux-2.6.16.33/include/asm-ia64/pal.h
92601 --- linux-2.6.16.33-noxen/include/asm-ia64/pal.h 2006-11-22 18:06:31.000000000 +0000
92602 +++ linux-2.6.16.33/include/asm-ia64/pal.h 2007-01-08 15:00:46.000000000 +0000
92603 @@ -81,6 +81,7 @@
92604 #ifndef __ASSEMBLY__
92605
92606 #include <linux/types.h>
92607 +#include <asm/processor.h>
92608 #include <asm/fpu.h>
92609
92610 /*
92611 diff -Nur linux-2.6.16.33-noxen/include/asm-ia64/pgalloc.h linux-2.6.16.33/include/asm-ia64/pgalloc.h
92612 --- linux-2.6.16.33-noxen/include/asm-ia64/pgalloc.h 2006-11-22 18:06:31.000000000 +0000
92613 +++ linux-2.6.16.33/include/asm-ia64/pgalloc.h 2007-01-08 15:00:46.000000000 +0000
92614 @@ -126,7 +126,11 @@
92615 static inline void
92616 pmd_populate(struct mm_struct *mm, pmd_t * pmd_entry, struct page *pte)
92617 {
92618 +#ifndef CONFIG_XEN
92619 pmd_val(*pmd_entry) = page_to_phys(pte);
92620 +#else
92621 + pmd_val(*pmd_entry) = page_to_pseudophys(pte);
92622 +#endif
92623 }
92624
92625 static inline void
92626 diff -Nur linux-2.6.16.33-noxen/include/asm-ia64/privop.h linux-2.6.16.33/include/asm-ia64/privop.h
92627 --- linux-2.6.16.33-noxen/include/asm-ia64/privop.h 1970-01-01 00:00:00.000000000 +0000
92628 +++ linux-2.6.16.33/include/asm-ia64/privop.h 2007-01-08 15:00:46.000000000 +0000
92629 @@ -0,0 +1,60 @@
92630 +#ifndef _ASM_IA64_PRIVOP_H
92631 +#define _ASM_IA64_PRIVOP_H
92632 +
92633 +/*
92634 + * Copyright (C) 2005 Hewlett-Packard Co
92635 + * Dan Magenheimer <dan.magenheimer@hp.com>
92636 + *
92637 + */
92638 +
92639 +#ifdef CONFIG_XEN
92640 +#include <asm/xen/privop.h>
92641 +#endif
92642 +
92643 +#ifndef __ASSEMBLY__
92644 +
92645 +#ifndef IA64_PARAVIRTUALIZED
92646 +
92647 +#define ia64_getreg __ia64_getreg
92648 +#define ia64_setreg __ia64_setreg
92649 +#define ia64_hint __ia64_hint
92650 +#define ia64_thash __ia64_thash
92651 +#define ia64_itci __ia64_itci
92652 +#define ia64_itcd __ia64_itcd
92653 +#define ia64_itri __ia64_itri
92654 +#define ia64_itrd __ia64_itrd
92655 +#define ia64_tpa __ia64_tpa
92656 +#define ia64_set_ibr __ia64_set_ibr
92657 +#define ia64_set_pkr __ia64_set_pkr
92658 +#define ia64_set_pmc __ia64_set_pmc
92659 +#define ia64_set_pmd __ia64_set_pmd
92660 +#define ia64_set_rr __ia64_set_rr
92661 +#define ia64_get_cpuid __ia64_get_cpuid
92662 +#define ia64_get_ibr __ia64_get_ibr
92663 +#define ia64_get_pkr __ia64_get_pkr
92664 +#define ia64_get_pmc __ia64_get_pmc
92665 +#define ia64_get_pmd __ia64_get_pmd
92666 +#define ia64_get_rr __ia64_get_rr
92667 +#define ia64_fc __ia64_fc
92668 +#define ia64_ssm __ia64_ssm
92669 +#define ia64_rsm __ia64_rsm
92670 +#define ia64_ptce __ia64_ptce
92671 +#define ia64_ptcga __ia64_ptcga
92672 +#define ia64_ptcl __ia64_ptcl
92673 +#define ia64_ptri __ia64_ptri
92674 +#define ia64_ptrd __ia64_ptrd
92675 +#define ia64_get_psr_i __ia64_get_psr_i
92676 +#define ia64_intrin_local_irq_restore __ia64_intrin_local_irq_restore
92677 +#define ia64_pal_halt_light __ia64_pal_halt_light
92678 +#define ia64_leave_kernel __ia64_leave_kernel
92679 +#define ia64_leave_syscall __ia64_leave_syscall
92680 +#define ia64_trace_syscall __ia64_trace_syscall
92681 +#define ia64_ret_from_clone __ia64_ret_from_clone
92682 +#define ia64_switch_to __ia64_switch_to
92683 +#define ia64_pal_call_static __ia64_pal_call_static
92684 +
92685 +#endif /* !IA64_PARAVIRTUALIZED */
92686 +
92687 +#endif /* !__ASSEMBLY__ */
92688 +
92689 +#endif /* _ASM_IA64_PRIVOP_H */
92690 diff -Nur linux-2.6.16.33-noxen/include/asm-ia64/processor.h linux-2.6.16.33/include/asm-ia64/processor.h
92691 --- linux-2.6.16.33-noxen/include/asm-ia64/processor.h 2006-11-22 18:06:31.000000000 +0000
92692 +++ linux-2.6.16.33/include/asm-ia64/processor.h 2007-01-08 15:00:46.000000000 +0000
92693 @@ -19,6 +19,7 @@
92694 #include <asm/kregs.h>
92695 #include <asm/ptrace.h>
92696 #include <asm/ustack.h>
92697 +#include <asm/privop.h>
92698
92699 #define IA64_NUM_DBG_REGS 8
92700 /*
92701 diff -Nur linux-2.6.16.33-noxen/include/asm-ia64/sal.h linux-2.6.16.33/include/asm-ia64/sal.h
92702 --- linux-2.6.16.33-noxen/include/asm-ia64/sal.h 2006-11-22 18:06:31.000000000 +0000
92703 +++ linux-2.6.16.33/include/asm-ia64/sal.h 2007-01-08 15:00:46.000000000 +0000
92704 @@ -42,6 +42,9 @@
92705 #include <asm/pal.h>
92706 #include <asm/system.h>
92707 #include <asm/fpu.h>
92708 +#ifdef CONFIG_XEN
92709 +#include <asm/xen/xencomm.h>
92710 +#endif
92711
92712 extern spinlock_t sal_lock;
92713
92714 @@ -686,10 +689,28 @@
92715 /* Get the processor and platform information logged by SAL with respect to the machine
92716 * state at the time of the MCAs, INITs, CMCs, or CPEs.
92717 */
92718 +#ifdef CONFIG_XEN
92719 +static inline u64 ia64_sal_get_state_info_size (u64 sal_info_type);
92720 +#endif
92721 +
92722 static inline u64
92723 ia64_sal_get_state_info (u64 sal_info_type, u64 *sal_info)
92724 {
92725 struct ia64_sal_retval isrv;
92726 +#ifdef CONFIG_XEN
92727 + if (is_running_on_xen()) {
92728 + struct xencomm_handle *desc;
92729 +
92730 + if (xencomm_create(sal_info,
92731 + ia64_sal_get_state_info_size(sal_info_type),
92732 + &desc, GFP_KERNEL))
92733 + return 0;
92734 +
92735 + SAL_CALL_REENTRANT(isrv, SAL_GET_STATE_INFO, sal_info_type, 0,
92736 + desc, 0, 0, 0, 0);
92737 + xencomm_free(desc);
92738 + } else
92739 +#endif
92740 SAL_CALL_REENTRANT(isrv, SAL_GET_STATE_INFO, sal_info_type, 0,
92741 sal_info, 0, 0, 0, 0);
92742 if (isrv.status)
92743 diff -Nur linux-2.6.16.33-noxen/include/asm-ia64/synch_bitops.h linux-2.6.16.33/include/asm-ia64/synch_bitops.h
92744 --- linux-2.6.16.33-noxen/include/asm-ia64/synch_bitops.h 1970-01-01 00:00:00.000000000 +0000
92745 +++ linux-2.6.16.33/include/asm-ia64/synch_bitops.h 2007-01-08 15:00:46.000000000 +0000
92746 @@ -0,0 +1,63 @@
92747 +#ifndef __XEN_SYNCH_BITOPS_H__
92748 +#define __XEN_SYNCH_BITOPS_H__
92749 +
92750 +/*
92751 + * Copyright 1992, Linus Torvalds.
92752 + * Heavily modified to provide guaranteed strong synchronisation
92753 + * when communicating with Xen or other guest OSes running on other CPUs.
92754 + */
92755 +
92756 +#include <linux/config.h>
92757 +
92758 +#define ADDR (*(volatile long *) addr)
92759 +
92760 +static __inline__ void synch_set_bit(int nr, volatile void * addr)
92761 +{
92762 + set_bit(nr, addr);
92763 +}
92764 +
92765 +static __inline__ void synch_clear_bit(int nr, volatile void * addr)
92766 +{
92767 + clear_bit(nr, addr);
92768 +}
92769 +
92770 +static __inline__ void synch_change_bit(int nr, volatile void * addr)
92771 +{
92772 + change_bit(nr, addr);
92773 +}
92774 +
92775 +static __inline__ int synch_test_and_set_bit(int nr, volatile void * addr)
92776 +{
92777 + return test_and_set_bit(nr, addr);
92778 +}
92779 +
92780 +static __inline__ int synch_test_and_clear_bit(int nr, volatile void * addr)
92781 +{
92782 + return test_and_clear_bit(nr, addr);
92783 +}
92784 +
92785 +static __inline__ int synch_test_and_change_bit(int nr, volatile void * addr)
92786 +{
92787 + return test_and_change_bit(nr, addr);
92788 +}
92789 +
92790 +static __inline__ int synch_const_test_bit(int nr, const volatile void * addr)
92791 +{
92792 + return test_bit(nr, addr);
92793 +}
92794 +
92795 +static __inline__ int synch_var_test_bit(int nr, volatile void * addr)
92796 +{
92797 + return test_bit(nr, addr);
92798 +}
92799 +
92800 +#define synch_cmpxchg ia64_cmpxchg4_acq
92801 +
92802 +#define synch_test_bit(nr,addr) \
92803 +(__builtin_constant_p(nr) ? \
92804 + synch_const_test_bit((nr),(addr)) : \
92805 + synch_var_test_bit((nr),(addr)))
92806 +
92807 +#define synch_cmpxchg_subword synch_cmpxchg
92808 +
92809 +#endif /* __XEN_SYNCH_BITOPS_H__ */
92810 diff -Nur linux-2.6.16.33-noxen/include/asm-ia64/system.h linux-2.6.16.33/include/asm-ia64/system.h
92811 --- linux-2.6.16.33-noxen/include/asm-ia64/system.h 2006-11-22 18:06:31.000000000 +0000
92812 +++ linux-2.6.16.33/include/asm-ia64/system.h 2007-01-08 15:00:46.000000000 +0000
92813 @@ -125,7 +125,7 @@
92814 #define __local_irq_save(x) \
92815 do { \
92816 ia64_stop(); \
92817 - (x) = ia64_getreg(_IA64_REG_PSR); \
92818 + (x) = ia64_get_psr_i(); \
92819 ia64_stop(); \
92820 ia64_rsm(IA64_PSR_I); \
92821 } while (0)
92822 @@ -173,7 +173,7 @@
92823 #endif /* !CONFIG_IA64_DEBUG_IRQ */
92824
92825 #define local_irq_enable() ({ ia64_stop(); ia64_ssm(IA64_PSR_I); ia64_srlz_d(); })
92826 -#define local_save_flags(flags) ({ ia64_stop(); (flags) = ia64_getreg(_IA64_REG_PSR); })
92827 +#define local_save_flags(flags) ({ ia64_stop(); (flags) = ia64_get_psr_i(); })
92828
92829 #define irqs_disabled() \
92830 ({ \
92831 diff -Nur linux-2.6.16.33-noxen/include/asm-ia64/uaccess.h linux-2.6.16.33/include/asm-ia64/uaccess.h
92832 --- linux-2.6.16.33-noxen/include/asm-ia64/uaccess.h 2006-11-22 18:06:31.000000000 +0000
92833 +++ linux-2.6.16.33/include/asm-ia64/uaccess.h 2007-01-08 15:00:46.000000000 +0000
92834 @@ -365,6 +365,7 @@
92835 }
92836
92837 #define ARCH_HAS_TRANSLATE_MEM_PTR 1
92838 +#ifndef CONFIG_XEN
92839 static __inline__ char *
92840 xlate_dev_mem_ptr (unsigned long p)
92841 {
92842 @@ -379,6 +380,25 @@
92843
92844 return ptr;
92845 }
92846 +#else
92847 +static __inline__ char *
92848 +xlate_dev_mem_ptr (unsigned long p, ssize_t sz)
92849 +{
92850 + unsigned long pfn = p >> PAGE_SHIFT;
92851 +
92852 + if (pfn_valid(pfn) && !PageUncached(pfn_to_page(pfn)))
92853 + return __va(p);
92854 +
92855 + return ioremap(p, sz);
92856 +}
92857 +
92858 +static __inline__ void
92859 +xlate_dev_mem_ptr_unmap (char* v)
92860 +{
92861 + if (REGION_NUMBER(v) == RGN_UNCACHED)
92862 + iounmap(v);
92863 +}
92864 +#endif
92865
92866 /*
92867 * Convert a virtual cached kernel memory pointer to an uncached pointer
92868 diff -Nur linux-2.6.16.33-noxen/include/asm-ia64/xen/privop.h linux-2.6.16.33/include/asm-ia64/xen/privop.h
92869 --- linux-2.6.16.33-noxen/include/asm-ia64/xen/privop.h 1970-01-01 00:00:00.000000000 +0000
92870 +++ linux-2.6.16.33/include/asm-ia64/xen/privop.h 2007-01-08 15:00:46.000000000 +0000
92871 @@ -0,0 +1,303 @@
92872 +#ifndef _ASM_IA64_XEN_PRIVOP_H
92873 +#define _ASM_IA64_XEN_PRIVOP_H
92874 +
92875 +/*
92876 + * Copyright (C) 2005 Hewlett-Packard Co
92877 + * Dan Magenheimer <dan.magenheimer@hp.com>
92878 + *
92879 + * Paravirtualizations of privileged operations for Xen/ia64
92880 + *
92881 + */
92882 +
92883 +
92884 +#include <xen/interface/arch-ia64.h>
92885 +
92886 +#define IA64_PARAVIRTUALIZED
92887 +
92888 +/* At 1 MB, before per-cpu space but still addressable using addl instead
92889 + of movl. */
92890 +#define XSI_BASE 0xfffffffffff00000
92891 +
92892 +/* Address of mapped regs. */
92893 +#define XMAPPEDREGS_BASE (XSI_BASE + XSI_SIZE)
92894 +
92895 +#ifdef __ASSEMBLY__
92896 +#define XEN_HYPER_RFI break HYPERPRIVOP_RFI
92897 +#define XEN_HYPER_RSM_PSR_DT break HYPERPRIVOP_RSM_DT
92898 +#define XEN_HYPER_SSM_PSR_DT break HYPERPRIVOP_SSM_DT
92899 +#define XEN_HYPER_COVER break HYPERPRIVOP_COVER
92900 +#define XEN_HYPER_ITC_D break HYPERPRIVOP_ITC_D
92901 +#define XEN_HYPER_ITC_I break HYPERPRIVOP_ITC_I
92902 +#define XEN_HYPER_SSM_I break HYPERPRIVOP_SSM_I
92903 +#define XEN_HYPER_GET_IVR break HYPERPRIVOP_GET_IVR
92904 +#define XEN_HYPER_GET_TPR break HYPERPRIVOP_GET_TPR
92905 +#define XEN_HYPER_SET_TPR break HYPERPRIVOP_SET_TPR
92906 +#define XEN_HYPER_EOI break HYPERPRIVOP_EOI
92907 +#define XEN_HYPER_SET_ITM break HYPERPRIVOP_SET_ITM
92908 +#define XEN_HYPER_THASH break HYPERPRIVOP_THASH
92909 +#define XEN_HYPER_PTC_GA break HYPERPRIVOP_PTC_GA
92910 +#define XEN_HYPER_ITR_D break HYPERPRIVOP_ITR_D
92911 +#define XEN_HYPER_GET_RR break HYPERPRIVOP_GET_RR
92912 +#define XEN_HYPER_SET_RR break HYPERPRIVOP_SET_RR
92913 +#define XEN_HYPER_SET_KR break HYPERPRIVOP_SET_KR
92914 +#define XEN_HYPER_FC break HYPERPRIVOP_FC
92915 +#define XEN_HYPER_GET_CPUID break HYPERPRIVOP_GET_CPUID
92916 +#define XEN_HYPER_GET_PMD break HYPERPRIVOP_GET_PMD
92917 +#define XEN_HYPER_GET_EFLAG break HYPERPRIVOP_GET_EFLAG
92918 +#define XEN_HYPER_SET_EFLAG break HYPERPRIVOP_SET_EFLAG
92919 +#define XEN_HYPER_RSM_BE break HYPERPRIVOP_RSM_BE
92920 +#define XEN_HYPER_GET_PSR break HYPERPRIVOP_GET_PSR
92921 +
92922 +#define XSI_IFS (XSI_BASE + XSI_IFS_OFS)
92923 +#define XSI_PRECOVER_IFS (XSI_BASE + XSI_PRECOVER_IFS_OFS)
92924 +#define XSI_INCOMPL_REGFR (XSI_BASE + XSI_INCOMPL_REGFR_OFS)
92925 +#define XSI_IFA (XSI_BASE + XSI_IFA_OFS)
92926 +#define XSI_ISR (XSI_BASE + XSI_ISR_OFS)
92927 +#define XSI_IIM (XSI_BASE + XSI_IIM_OFS)
92928 +#define XSI_ITIR (XSI_BASE + XSI_ITIR_OFS)
92929 +#define XSI_PSR_I_ADDR (XSI_BASE + XSI_PSR_I_ADDR_OFS)
92930 +#define XSI_PSR_IC (XSI_BASE + XSI_PSR_IC_OFS)
92931 +#define XSI_IPSR (XSI_BASE + XSI_IPSR_OFS)
92932 +#define XSI_IIP (XSI_BASE + XSI_IIP_OFS)
92933 +#define XSI_BANK1_R16 (XSI_BASE + XSI_BANK1_R16_OFS)
92934 +#define XSI_BANKNUM (XSI_BASE + XSI_BANKNUM_OFS)
92935 +#define XSI_IHA (XSI_BASE + XSI_IHA_OFS)
92936 +#endif
92937 +
92938 +#ifndef __ASSEMBLY__
92939 +#define XEN_HYPER_SSM_I asm("break %0" : : "i" (HYPERPRIVOP_SSM_I))
92940 +#define XEN_HYPER_GET_IVR asm("break %0" : : "i" (HYPERPRIVOP_GET_IVR))
92941 +
92942 +/************************************************/
92943 +/* Instructions paravirtualized for correctness */
92944 +/************************************************/
92945 +
92946 +/* "fc" and "thash" are privilege-sensitive instructions, meaning they
92947 + * may have different semantics depending on whether they are executed
92948 + * at PL0 vs PL!=0. When paravirtualized, these instructions mustn't
92949 + * be allowed to execute directly, lest incorrect semantics result. */
92950 +extern unsigned long xen_fc(unsigned long addr);
92951 +#define ia64_fc(addr) xen_fc((unsigned long)(addr))
92952 +extern unsigned long xen_thash(unsigned long addr);
92953 +#define ia64_thash(addr) xen_thash((unsigned long)(addr))
92954 +/* Note that "ttag" and "cover" are also privilege-sensitive; "ttag"
92955 + * is not currently used (though it may be in a long-format VHPT system!)
92956 + * and the semantics of cover only change if psr.ic is off which is very
92957 + * rare (and currently non-existent outside of assembly code). */
92958 +
92959 +/* There are also privilege-sensitive registers. These registers are
92960 + * readable at any privilege level but only writable at PL0. */
92961 +extern unsigned long xen_get_cpuid(int index);
92962 +#define ia64_get_cpuid(i) xen_get_cpuid(i)
92963 +extern unsigned long xen_get_pmd(int index);
92964 +#define ia64_get_pmd(i) xen_get_pmd(i)
92965 +extern unsigned long xen_get_eflag(void); /* see xen_ia64_getreg */
92966 +extern void xen_set_eflag(unsigned long); /* see xen_ia64_setreg */
92967 +
92968 +/************************************************/
92969 +/* Instructions paravirtualized for performance */
92970 +/************************************************/
92971 +
92972 +/* Xen uses memory-mapped virtual privileged registers for access to many
92973 + * performance-sensitive privileged registers. Some, like the processor
92974 + * status register (psr), are broken up into multiple memory locations.
92975 + * Others, like "pend", are abstractions based on privileged registers.
92976 + * "Pend" is guaranteed to be set if reading cr.ivr would return a
92977 + * (non-spurious) interrupt. */
92978 +#define XEN_MAPPEDREGS ((struct mapped_regs *)XMAPPEDREGS_BASE)
92979 +#define XSI_PSR_I \
92980 + (*XEN_MAPPEDREGS->interrupt_mask_addr)
92981 +#define xen_get_virtual_psr_i() \
92982 + (!XSI_PSR_I)
92983 +#define xen_set_virtual_psr_i(_val) \
92984 + ({ XSI_PSR_I = (uint8_t)(_val) ? 0 : 1; })
92985 +#define xen_set_virtual_psr_ic(_val) \
92986 + ({ XEN_MAPPEDREGS->interrupt_collection_enabled = _val ? 1 : 0; })
92987 +#define xen_get_virtual_pend() \
92988 + (*(((uint8_t *)XEN_MAPPEDREGS->interrupt_mask_addr) - 1))
92989 +
92990 +/* Hyperprivops are "break" instructions with a well-defined API.
92991 + * In particular, the virtual psr.ic bit must be off; in this way
92992 + * it is guaranteed to never conflict with a linux break instruction.
92993 + * Normally, this is done in a xen stub but this one is frequent enough
92994 + * that we inline it */
92995 +#define xen_hyper_ssm_i() \
92996 +({ \
92997 + xen_set_virtual_psr_i(0); \
92998 + xen_set_virtual_psr_ic(0); \
92999 + XEN_HYPER_SSM_I; \
93000 +})
93001 +
93002 +/* turning off interrupts can be paravirtualized simply by writing
93003 + * to a memory-mapped virtual psr.i bit (implemented as a 16-bit bool) */
93004 +#define xen_rsm_i() xen_set_virtual_psr_i(0)
93005 +
93006 +/* turning on interrupts is a bit more complicated.. write to the
93007 + * memory-mapped virtual psr.i bit first (to avoid race condition),
93008 + * then if any interrupts were pending, we have to execute a hyperprivop
93009 + * to ensure the pending interrupt gets delivered; else we're done! */
93010 +#define xen_ssm_i() \
93011 +({ \
93012 + int old = xen_get_virtual_psr_i(); \
93013 + xen_set_virtual_psr_i(1); \
93014 + if (!old && xen_get_virtual_pend()) xen_hyper_ssm_i(); \
93015 +})
93016 +
93017 +#define xen_ia64_intrin_local_irq_restore(x) \
93018 +{ \
93019 + if (is_running_on_xen()) { \
93020 + if ((x) & IA64_PSR_I) { xen_ssm_i(); } \
93021 + else { xen_rsm_i(); } \
93022 + } \
93023 + else __ia64_intrin_local_irq_restore((x)); \
93024 +}
93025 +
93026 +#define xen_get_psr_i() \
93027 +( \
93028 + (is_running_on_xen()) ? \
93029 + (xen_get_virtual_psr_i() ? IA64_PSR_I : 0) \
93030 + : __ia64_get_psr_i() \
93031 +)
93032 +
93033 +#define xen_ia64_ssm(mask) \
93034 +{ \
93035 + if ((mask)==IA64_PSR_I) { \
93036 + if (is_running_on_xen()) { xen_ssm_i(); } \
93037 + else { __ia64_ssm(mask); } \
93038 + } \
93039 + else { __ia64_ssm(mask); } \
93040 +}
93041 +
93042 +#define xen_ia64_rsm(mask) \
93043 +{ \
93044 + if ((mask)==IA64_PSR_I) { \
93045 + if (is_running_on_xen()) { xen_rsm_i(); } \
93046 + else { __ia64_rsm(mask); } \
93047 + } \
93048 + else { __ia64_rsm(mask); } \
93049 +}
93050 +
93051 +
93052 +/* Although all privileged operations can be left to trap and will
93053 + * be properly handled by Xen, some are frequent enough that we use
93054 + * hyperprivops for performance. */
93055 +
93056 +extern unsigned long xen_get_ivr(void);
93057 +extern unsigned long xen_get_tpr(void);
93058 +extern void xen_set_itm(unsigned long);
93059 +extern void xen_set_tpr(unsigned long);
93060 +extern void xen_eoi(void);
93061 +extern void xen_set_rr(unsigned long index, unsigned long val);
93062 +extern unsigned long xen_get_rr(unsigned long index);
93063 +extern void xen_set_kr(unsigned long index, unsigned long val);
93064 +extern void xen_ptcga(unsigned long addr, unsigned long size);
93065 +
93066 +/* Note: It may look wrong to test for is_running_on_xen() in each case.
93067 + * However regnum is always a constant so, as written, the compiler
93068 + * eliminates the switch statement, whereas is_running_on_xen() must be
93069 + * tested dynamically. */
93070 +#define xen_ia64_getreg(regnum) \
93071 +({ \
93072 + __u64 ia64_intri_res; \
93073 + \
93074 + switch(regnum) { \
93075 + case _IA64_REG_CR_IVR: \
93076 + ia64_intri_res = (is_running_on_xen()) ? \
93077 + xen_get_ivr() : \
93078 + __ia64_getreg(regnum); \
93079 + break; \
93080 + case _IA64_REG_CR_TPR: \
93081 + ia64_intri_res = (is_running_on_xen()) ? \
93082 + xen_get_tpr() : \
93083 + __ia64_getreg(regnum); \
93084 + break; \
93085 + case _IA64_REG_AR_EFLAG: \
93086 + ia64_intri_res = (is_running_on_xen()) ? \
93087 + xen_get_eflag() : \
93088 + __ia64_getreg(regnum); \
93089 + break; \
93090 + default: \
93091 + ia64_intri_res = __ia64_getreg(regnum); \
93092 + break; \
93093 + } \
93094 + ia64_intri_res; \
93095 +})
93096 +
93097 +#define xen_ia64_setreg(regnum,val) \
93098 +({ \
93099 + switch(regnum) { \
93100 + case _IA64_REG_AR_KR0 ... _IA64_REG_AR_KR7: \
93101 + (is_running_on_xen()) ? \
93102 + xen_set_kr((regnum-_IA64_REG_AR_KR0), val) : \
93103 + __ia64_setreg(regnum,val); \
93104 + break; \
93105 + case _IA64_REG_CR_ITM: \
93106 + (is_running_on_xen()) ? \
93107 + xen_set_itm(val) : \
93108 + __ia64_setreg(regnum,val); \
93109 + break; \
93110 + case _IA64_REG_CR_TPR: \
93111 + (is_running_on_xen()) ? \
93112 + xen_set_tpr(val) : \
93113 + __ia64_setreg(regnum,val); \
93114 + break; \
93115 + case _IA64_REG_CR_EOI: \
93116 + (is_running_on_xen()) ? \
93117 + xen_eoi() : \
93118 + __ia64_setreg(regnum,val); \
93119 + break; \
93120 + case _IA64_REG_AR_EFLAG: \
93121 + (is_running_on_xen()) ? \
93122 + xen_set_eflag(val) : \
93123 + __ia64_setreg(regnum,val); \
93124 + break; \
93125 + default: \
93126 + __ia64_setreg(regnum,val); \
93127 + break; \
93128 + } \
93129 +})
93130 +
93131 +#define ia64_ssm xen_ia64_ssm
93132 +#define ia64_rsm xen_ia64_rsm
93133 +#define ia64_intrin_local_irq_restore xen_ia64_intrin_local_irq_restore
93134 +#define ia64_ptcga xen_ptcga
93135 +#define ia64_set_rr(index,val) xen_set_rr(index,val)
93136 +#define ia64_get_rr(index) xen_get_rr(index)
93137 +#define ia64_getreg xen_ia64_getreg
93138 +#define ia64_setreg xen_ia64_setreg
93139 +#define ia64_get_psr_i xen_get_psr_i
93140 +
93141 +/* the remainder of these are not performance-sensitive so it's
93142 + * OK to not paravirtualize and just take a privop trap and emulate */
93143 +#define ia64_hint __ia64_hint
93144 +#define ia64_set_pmd __ia64_set_pmd
93145 +#define ia64_itci __ia64_itci
93146 +#define ia64_itcd __ia64_itcd
93147 +#define ia64_itri __ia64_itri
93148 +#define ia64_itrd __ia64_itrd
93149 +#define ia64_tpa __ia64_tpa
93150 +#define ia64_set_ibr __ia64_set_ibr
93151 +#define ia64_set_pkr __ia64_set_pkr
93152 +#define ia64_set_pmc __ia64_set_pmc
93153 +#define ia64_get_ibr __ia64_get_ibr
93154 +#define ia64_get_pkr __ia64_get_pkr
93155 +#define ia64_get_pmc __ia64_get_pmc
93156 +#define ia64_ptce __ia64_ptce
93157 +#define ia64_ptcl __ia64_ptcl
93158 +#define ia64_ptri __ia64_ptri
93159 +#define ia64_ptrd __ia64_ptrd
93160 +
93161 +#endif /* !__ASSEMBLY__ */
93162 +
93163 +/* these routines utilize privilege-sensitive or performance-sensitive
93164 + * privileged instructions so the code must be replaced with
93165 + * paravirtualized versions */
93166 +#define ia64_pal_halt_light xen_pal_halt_light
93167 +#define ia64_leave_kernel xen_leave_kernel
93168 +#define ia64_leave_syscall xen_leave_syscall
93169 +#define ia64_trace_syscall xen_trace_syscall
93170 +#define ia64_ret_from_clone xen_ret_from_clone
93171 +#define ia64_switch_to xen_switch_to
93172 +#define ia64_pal_call_static xen_pal_call_static
93173 +
93174 +#endif /* _ASM_IA64_XEN_PRIVOP_H */
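
The interrupt-enable path described in the comments of the xen/privop.h hunk above writes the virtual psr.i bit first and only issues the SSM_I hyperprivop if an interrupt was pending while masked. A minimal user-space sketch of that ordering, with the mapped registers and the hyperprivop replaced by stand-ins:

#include <stdint.h>
#include <stdio.h>

/* Stand-ins for the memory-mapped virtual registers shared with Xen. */
static uint8_t interrupt_mask;     /* 1 = interrupts masked (psr.i clear) */
static uint8_t interrupt_pending;  /* byte just below the mask in the map */

static void hyper_ssm_i(void)      /* models the SSM_I "break" hyperprivop */
{
        printf("hyperprivop: deliver pending interrupt\n");
        interrupt_pending = 0;
}

/* Mirrors xen_ssm_i(): unmask first, then only trap into Xen if an
 * interrupt was already pending while we were masked. */
static void ssm_i(void)
{
        int was_masked = interrupt_mask;
        interrupt_mask = 0;                    /* write virtual psr.i first */
        if (was_masked && interrupt_pending)
                hyper_ssm_i();                 /* pending work: one hyperprivop */
}

int main(void)
{
        interrupt_mask = 1;
        interrupt_pending = 1;
        ssm_i();        /* takes the slow path exactly once */
        ssm_i();        /* already unmasked and nothing pending: no trap */
        return 0;
}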
93175 diff -Nur linux-2.6.16.33-noxen/include/asm-ia64/xen/xcom_hcall.h linux-2.6.16.33/include/asm-ia64/xen/xcom_hcall.h
93176 --- linux-2.6.16.33-noxen/include/asm-ia64/xen/xcom_hcall.h 1970-01-01 00:00:00.000000000 +0000
93177 +++ linux-2.6.16.33/include/asm-ia64/xen/xcom_hcall.h 2007-01-08 15:00:46.000000000 +0000
93178 @@ -0,0 +1,86 @@
93179 +/*
93180 + * Copyright (C) 2006 Tristan Gingold <tristan.gingold@bull.net>, Bull SAS
93181 + *
93182 + * This program is free software; you can redistribute it and/or modify
93183 + * it under the terms of the GNU General Public License as published by
93184 + * the Free Software Foundation; either version 2 of the License, or
93185 + * (at your option) any later version.
93186 + *
93187 + * This program is distributed in the hope that it will be useful,
93188 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
93189 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
93190 + * GNU General Public License for more details.
93191 + *
93192 + * You should have received a copy of the GNU General Public License
93193 + * along with this program; if not, write to the Free Software
93194 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
93195 + */
93196 +
93197 +#ifndef _LINUX_XENCOMM_HCALL_H_
93198 +#define _LINUX_XENCOMM_HCALL_H_
93199 +
93200 +/* These functions create an inline descriptor for the parameters and
93201 + call the corresponding xencomm_arch_hypercall_X.
93202 + Architectures should define HYPERVISOR_xxx as xencomm_hypercall_xxx unless
93203 + they want to use their own wrapper. */
93204 +extern int xencomm_hypercall_console_io(int cmd, int count, char *str);
93205 +
93206 +extern int xencomm_hypercall_event_channel_op(int cmd, void *op);
93207 +
93208 +extern int xencomm_hypercall_xen_version(int cmd, void *arg);
93209 +
93210 +extern int xencomm_hypercall_physdev_op(int cmd, void *op);
93211 +
93212 +extern int xencomm_hypercall_grant_table_op(unsigned int cmd, void *op,
93213 + unsigned int count);
93214 +
93215 +extern int xencomm_hypercall_sched_op(int cmd, void *arg);
93216 +
93217 +extern int xencomm_hypercall_multicall(void *call_list, int nr_calls);
93218 +
93219 +extern int xencomm_hypercall_callback_op(int cmd, void *arg);
93220 +
93221 +extern int xencomm_hypercall_memory_op(unsigned int cmd, void *arg);
93222 +
93223 +extern unsigned long xencomm_hypercall_hvm_op(int cmd, void *arg);
93224 +
93225 +extern int xencomm_hypercall_suspend(unsigned long srec);
93226 +
93227 +extern int xencomm_hypercall_xenoprof_op(int op, void *arg);
93228 +
93229 +extern int xencomm_hypercall_perfmon_op(unsigned long cmd, void* arg,
93230 + unsigned long count);
93231 +
93232 +/* Using mini xencomm. */
93233 +extern int xencomm_mini_hypercall_console_io(int cmd, int count, char *str);
93234 +
93235 +extern int xencomm_mini_hypercall_event_channel_op(int cmd, void *op);
93236 +
93237 +extern int xencomm_mini_hypercall_xen_version(int cmd, void *arg);
93238 +
93239 +extern int xencomm_mini_hypercall_physdev_op(int cmd, void *op);
93240 +
93241 +extern int xencomm_mini_hypercall_grant_table_op(unsigned int cmd, void *op,
93242 + unsigned int count);
93243 +
93244 +extern int xencomm_mini_hypercall_sched_op(int cmd, void *arg);
93245 +
93246 +extern int xencomm_mini_hypercall_multicall(void *call_list, int nr_calls);
93247 +
93248 +extern int xencomm_mini_hypercall_callback_op(int cmd, void *arg);
93249 +
93250 +extern int xencomm_mini_hypercall_memory_op(unsigned int cmd, void *arg);
93251 +
93252 +extern unsigned long xencomm_mini_hypercall_hvm_op(int cmd, void *arg);
93253 +
93254 +extern int xencomm_mini_hypercall_xenoprof_op(int op, void *arg);
93255 +
93256 +extern int xencomm_mini_hypercall_perfmon_op(unsigned long cmd, void* arg,
93257 + unsigned long count);
93258 +
93259 +/* For privcmd. Locally declare argument type to avoid include storm.
93260 + Type coherency will be checked within privcmd.c */
93261 +struct privcmd_hypercall;
93262 +extern int privcmd_hypercall(struct privcmd_hypercall *hypercall);
93263 +
93264 +#endif /* _LINUX_XENCOMM_HCALL_H_ */
93265 diff -Nur linux-2.6.16.33-noxen/include/asm-ia64/xen/xencomm.h linux-2.6.16.33/include/asm-ia64/xen/xencomm.h
93266 --- linux-2.6.16.33-noxen/include/asm-ia64/xen/xencomm.h 1970-01-01 00:00:00.000000000 +0000
93267 +++ linux-2.6.16.33/include/asm-ia64/xen/xencomm.h 2007-01-08 15:00:46.000000000 +0000
93268 @@ -0,0 +1,60 @@
93269 +/*
93270 + * Copyright (C) 2006 Hollis Blanchard <hollisb@us.ibm.com>, IBM Corporation
93271 + *
93272 + * This program is free software; you can redistribute it and/or modify
93273 + * it under the terms of the GNU General Public License as published by
93274 + * the Free Software Foundation; either version 2 of the License, or
93275 + * (at your option) any later version.
93276 + *
93277 + * This program is distributed in the hope that it will be useful,
93278 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
93279 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
93280 + * GNU General Public License for more details.
93281 + *
93282 + * You should have received a copy of the GNU General Public License
93283 + * along with this program; if not, write to the Free Software
93284 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
93285 + */
93286 +
93287 +#ifndef _LINUX_XENCOMM_H_
93288 +#define _LINUX_XENCOMM_H_
93289 +
93290 +#include <xen/interface/xencomm.h>
93291 +
93292 +#define XENCOMM_MINI_ADDRS 3
93293 +struct xencomm_mini {
93294 + struct xencomm_desc _desc;
93295 + uint64_t address[XENCOMM_MINI_ADDRS];
93296 +};
93297 +
93298 +/* Must be called before any hypercall. */
93299 +extern void xencomm_init (void);
93300 +
93301 +/* To avoid additional virt to phys conversion, an opaque structure is
93302 + presented. */
93303 +struct xencomm_handle;
93304 +
93305 +extern int xencomm_create(void *buffer, unsigned long bytes,
93306 + struct xencomm_handle **desc, gfp_t type);
93307 +extern void xencomm_free(struct xencomm_handle *desc);
93308 +
93309 +extern int xencomm_create_mini(struct xencomm_mini *area, int *nbr_area,
93310 + void *buffer, unsigned long bytes,
93311 + struct xencomm_handle **ret);
93312 +
93313 +/* Translate virtual address to physical address. */
93314 +extern unsigned long xencomm_vaddr_to_paddr(unsigned long vaddr);
93315 +
93316 +/* Inline version. To be used only on linear space (kernel space). */
93317 +static inline struct xencomm_handle *
93318 +xencomm_create_inline(void *buffer)
93319 +{
93320 + unsigned long paddr;
93321 +
93322 + paddr = xencomm_vaddr_to_paddr((unsigned long)buffer);
93323 + return (struct xencomm_handle *)(paddr | XENCOMM_INLINE_FLAG);
93324 +}
93325 +
93326 +#define xen_guest_handle(hnd) ((hnd).p)
93327 +
93328 +#endif /* _LINUX_XENCOMM_H_ */
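
xencomm_create_inline() in the hunk above avoids allocating a descriptor for linear kernel buffers by tagging the guest-physical address itself with XENCOMM_INLINE_FLAG. A hedged illustration of that tagging scheme; the flag value and the decoding helper are assumptions for the sketch, not taken from xen/interface/xencomm.h:

#include <stdint.h>
#include <stdio.h>

/* Assumed flag: top bit marks an "inline" handle carrying the paddr itself. */
#define XENCOMM_INLINE_FLAG (1UL << 63)

typedef uint64_t xencomm_handle_t;

static xencomm_handle_t create_inline(uint64_t paddr)
{
        return paddr | XENCOMM_INLINE_FLAG;   /* no descriptor allocation */
}

static int is_inline(xencomm_handle_t h)
{
        return (h & XENCOMM_INLINE_FLAG) != 0;
}

static uint64_t inline_paddr(xencomm_handle_t h)
{
        return h & ~XENCOMM_INLINE_FLAG;      /* strip the tag to recover paddr */
}

int main(void)
{
        xencomm_handle_t h = create_inline(0x40000000UL);
        printf("inline=%d paddr=%#lx\n", is_inline(h),
               (unsigned long)inline_paddr(h));
        return 0;
}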
93329 diff -Nur linux-2.6.16.33-noxen/include/asm-ia64/xenoprof.h linux-2.6.16.33/include/asm-ia64/xenoprof.h
93330 --- linux-2.6.16.33-noxen/include/asm-ia64/xenoprof.h 1970-01-01 00:00:00.000000000 +0000
93331 +++ linux-2.6.16.33/include/asm-ia64/xenoprof.h 2007-01-08 15:00:46.000000000 +0000
93332 @@ -0,0 +1,48 @@
93333 +/******************************************************************************
93334 + * asm-ia64/xenoprof.h
93335 + *
93336 + * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp>
93337 + * VA Linux Systems Japan K.K.
93338 + *
93339 + * This program is free software; you can redistribute it and/or modify
93340 + * it under the terms of the GNU General Public License as published by
93341 + * the Free Software Foundation; either version 2 of the License, or
93342 + * (at your option) any later version.
93343 + *
93344 + * This program is distributed in the hope that it will be useful,
93345 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
93346 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
93347 + * GNU General Public License for more details.
93348 + *
93349 + * You should have received a copy of the GNU General Public License
93350 + * along with this program; if not, write to the Free Software
93351 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
93352 + *
93353 + */
93354 +#ifndef __ASM_XENOPROF_H__
93355 +#define __ASM_XENOPROF_H__
93356 +#ifdef CONFIG_XEN
93357 +
93358 +#undef HAVE_XENOPROF_CREATE_FILES
93359 +
93360 +struct xenoprof_init;
93361 +void xenoprof_arch_init_counter(struct xenoprof_init *init);
93362 +void xenoprof_arch_counter(void);
93363 +void xenoprof_arch_start(void);
93364 +void xenoprof_arch_stop(void);
93365 +
93366 +struct xenoprof_arch_shared_buffer {
93367 + struct resource* res;
93368 +};
93369 +
93370 +struct xenoprof_shared_buffer;
93371 +void xenoprof_arch_unmap_shared_buffer(struct xenoprof_shared_buffer* sbuf);
93372 +struct xenoprof_get_buffer;
93373 +int xenoprof_arch_map_shared_buffer(struct xenoprof_get_buffer* get_buffer,
93374 + struct xenoprof_shared_buffer* sbuf);
93375 +struct xenoprof_passive;
93376 +int xenoprof_arch_set_passive(struct xenoprof_passive* pdomain,
93377 + struct xenoprof_shared_buffer* sbuf);
93378 +
93379 +#endif /* CONFIG_XEN */
93380 +#endif /* __ASM_XENOPROF_H__ */
93381 diff -Nur linux-2.6.16.33-noxen/include/asm-um/page.h linux-2.6.16.33/include/asm-um/page.h
93382 --- linux-2.6.16.33-noxen/include/asm-um/page.h 2006-11-22 18:06:31.000000000 +0000
93383 +++ linux-2.6.16.33/include/asm-um/page.h 2007-01-08 15:00:46.000000000 +0000
93384 @@ -118,7 +118,7 @@
93385 extern struct page *arch_validate(struct page *page, gfp_t mask, int order);
93386 #define HAVE_ARCH_VALIDATE
93387
93388 -extern void arch_free_page(struct page *page, int order);
93389 +extern int arch_free_page(struct page *page, int order);
93390 #define HAVE_ARCH_FREE_PAGE
93391
93392 #include <asm-generic/page.h>
93393 diff -Nur linux-2.6.16.33-noxen/include/asm-x86_64/apic.h linux-2.6.16.33/include/asm-x86_64/apic.h
93394 --- linux-2.6.16.33-noxen/include/asm-x86_64/apic.h 2006-11-22 18:06:31.000000000 +0000
93395 +++ linux-2.6.16.33/include/asm-x86_64/apic.h 2007-01-08 15:00:46.000000000 +0000
93396 @@ -105,11 +105,13 @@
93397
93398 extern void setup_threshold_lvt(unsigned long lvt_off);
93399
93400 +#ifndef CONFIG_XEN
93401 void smp_send_timer_broadcast_ipi(void);
93402 void switch_APIC_timer_to_ipi(void *cpumask);
93403 void switch_ipi_to_APIC_timer(void *cpumask);
93404
93405 #define ARCH_APICTIMER_STOPS_ON_C3 1
93406 +#endif
93407
93408 #endif /* CONFIG_X86_LOCAL_APIC */
93409
93410 diff -Nur linux-2.6.16.33-noxen/include/asm-x86_64/hw_irq.h linux-2.6.16.33/include/asm-x86_64/hw_irq.h
93411 --- linux-2.6.16.33-noxen/include/asm-x86_64/hw_irq.h 2006-11-22 18:06:31.000000000 +0000
93412 +++ linux-2.6.16.33/include/asm-x86_64/hw_irq.h 2007-05-23 21:00:01.000000000 +0000
93413 @@ -127,7 +127,7 @@
93414 __asm__( \
93415 "\n.p2align\n" \
93416 "IRQ" #nr "_interrupt:\n\t" \
93417 - "push $" #nr "-256 ; " \
93418 + "push $~(" #nr ") ; " \
93419 "jmp common_interrupt");
93420
93421 #if defined(CONFIG_X86_IO_APIC)
93422 diff -Nur linux-2.6.16.33-noxen/include/asm-x86_64/kexec.h linux-2.6.16.33/include/asm-x86_64/kexec.h
93423 --- linux-2.6.16.33-noxen/include/asm-x86_64/kexec.h 2006-11-22 18:06:31.000000000 +0000
93424 +++ linux-2.6.16.33/include/asm-x86_64/kexec.h 2007-01-08 15:00:46.000000000 +0000
93425 @@ -1,6 +1,27 @@
93426 #ifndef _X86_64_KEXEC_H
93427 #define _X86_64_KEXEC_H
93428
93429 +#define PA_CONTROL_PAGE 0
93430 +#define VA_CONTROL_PAGE 1
93431 +#define PA_PGD 2
93432 +#define VA_PGD 3
93433 +#define PA_PUD_0 4
93434 +#define VA_PUD_0 5
93435 +#define PA_PMD_0 6
93436 +#define VA_PMD_0 7
93437 +#define PA_PTE_0 8
93438 +#define VA_PTE_0 9
93439 +#define PA_PUD_1 10
93440 +#define VA_PUD_1 11
93441 +#define PA_PMD_1 12
93442 +#define VA_PMD_1 13
93443 +#define PA_PTE_1 14
93444 +#define VA_PTE_1 15
93445 +#define PA_TABLE_PAGE 16
93446 +#define PAGES_NR 17
93447 +
93448 +#ifndef __ASSEMBLY__
93449 +
93450 #include <linux/string.h>
93451
93452 #include <asm/page.h>
93453 @@ -64,4 +85,25 @@
93454 newregs->rip = (unsigned long)current_text_addr();
93455 }
93456 }
93457 +
93458 +NORET_TYPE void
93459 +relocate_kernel(unsigned long indirection_page,
93460 + unsigned long page_list,
93461 + unsigned long start_address) ATTRIB_NORET;
93462 +
93463 +/* Under Xen we need to work with machine addresses. These macros give the
93464 + * machine address of a certain page to the generic kexec code instead of
93465 + * the pseudo physical address which would be given by the default macros.
93466 + */
93467 +
93468 +#ifdef CONFIG_XEN
93469 +#define KEXEC_ARCH_HAS_PAGE_MACROS
93470 +#define kexec_page_to_pfn(page) pfn_to_mfn(page_to_pfn(page))
93471 +#define kexec_pfn_to_page(pfn) pfn_to_page(mfn_to_pfn(pfn))
93472 +#define kexec_virt_to_phys(addr) virt_to_machine(addr)
93473 +#define kexec_phys_to_virt(addr) phys_to_virt(machine_to_phys(addr))
93474 +#endif
93475 +
93476 +#endif /* __ASSEMBLY__ */
93477 +
93478 #endif /* _X86_64_KEXEC_H */
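
The CONFIG_XEN macros in the kexec.h hunk above make the generic kexec code operate on machine frames rather than pseudo-physical ones. A small sketch of that indirection with toy p2m/m2p tables; struct page is modelled by a bare pfn here, which the real macros of course do not do:

#include <stdio.h>

#define NR_PAGES 4

/* Toy pseudo-physical <-> machine frame tables standing in for Xen's p2m/m2p. */
static unsigned long p2m[NR_PAGES] = { 7, 3, 9, 1 };
static unsigned long m2p[16];

static unsigned long pfn_to_mfn(unsigned long pfn) { return p2m[pfn]; }
static unsigned long mfn_to_pfn(unsigned long mfn) { return m2p[mfn]; }

/* Mirrors kexec_page_to_pfn()/kexec_pfn_to_page(): the generic kexec code
 * keeps using "pfn" names, but under Xen those values are machine frames. */
static unsigned long kexec_page_to_pfn(unsigned long page_pfn)
{
        return pfn_to_mfn(page_pfn);
}

static unsigned long kexec_pfn_to_page(unsigned long mfn)
{
        return mfn_to_pfn(mfn);
}

int main(void)
{
        unsigned long pfn;

        for (pfn = 0; pfn < NR_PAGES; pfn++)
                m2p[p2m[pfn]] = pfn;          /* build the reverse map */

        for (pfn = 0; pfn < NR_PAGES; pfn++)
                printf("pfn %lu -> mfn %lu -> pfn %lu\n", pfn,
                       kexec_page_to_pfn(pfn),
                       kexec_pfn_to_page(kexec_page_to_pfn(pfn)));
        return 0;
}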
93479 diff -Nur linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/agp.h linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/agp.h
93480 --- linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/agp.h 1970-01-01 00:00:00.000000000 +0000
93481 +++ linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/agp.h 2007-01-08 15:00:46.000000000 +0000
93482 @@ -0,0 +1,35 @@
93483 +#ifndef AGP_H
93484 +#define AGP_H 1
93485 +
93486 +#include <asm/cacheflush.h>
93487 +#include <asm/system.h>
93488 +
93489 +/*
93490 + * Functions to keep the agpgart mappings coherent.
93491 + * The GART gives the CPU a physical alias of memory. The alias is
93492 + * mapped uncacheable. Make sure there are no conflicting mappings
93493 + * with different cacheability attributes for the same page.
93494 + */
93495 +
93496 +int map_page_into_agp(struct page *page);
93497 +int unmap_page_from_agp(struct page *page);
93498 +#define flush_agp_mappings() global_flush_tlb()
93499 +
93500 +/* Could use CLFLUSH here if the cpu supports it. But then it would
93501 + need to be called for each cacheline of the whole page so it may not be
93502 + worth it. Would need a page for it. */
93503 +#define flush_agp_cache() wbinvd()
93504 +
93505 +/* Convert a physical address to an address suitable for the GART. */
93506 +#define phys_to_gart(x) phys_to_machine(x)
93507 +#define gart_to_phys(x) machine_to_phys(x)
93508 +
93509 +/* GATT allocation. Returns/accepts GATT kernel virtual address. */
93510 +#define alloc_gatt_pages(order) ({ \
93511 + char *_t; dma_addr_t _d; \
93512 + _t = dma_alloc_coherent(NULL,PAGE_SIZE<<(order),&_d,GFP_KERNEL); \
93513 + _t; })
93514 +#define free_gatt_pages(table, order) \
93515 + dma_free_coherent(NULL,PAGE_SIZE<<(order),(table),virt_to_bus(table))
93516 +
93517 +#endif
93518 diff -Nur linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/arch_hooks.h linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/arch_hooks.h
93519 --- linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/arch_hooks.h 1970-01-01 00:00:00.000000000 +0000
93520 +++ linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/arch_hooks.h 2007-01-08 15:00:46.000000000 +0000
93521 @@ -0,0 +1,27 @@
93522 +#ifndef _ASM_ARCH_HOOKS_H
93523 +#define _ASM_ARCH_HOOKS_H
93524 +
93525 +#include <linux/interrupt.h>
93526 +
93527 +/*
93528 + * linux/include/asm/arch_hooks.h
93529 + *
93530 + * define the architecture specific hooks
93531 + */
93532 +
93533 +/* these aren't arch hooks, they are generic routines
93534 + * that can be used by the hooks */
93535 +extern void init_ISA_irqs(void);
93536 +extern void apic_intr_init(void);
93537 +extern void smp_intr_init(void);
93538 +extern irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs);
93539 +
93540 +/* these are the defined hooks */
93541 +extern void intr_init_hook(void);
93542 +extern void pre_intr_init_hook(void);
93543 +extern void pre_setup_arch_hook(void);
93544 +extern void trap_init_hook(void);
93545 +extern void time_init_hook(void);
93546 +extern void mca_nmi_hook(void);
93547 +
93548 +#endif
93549 diff -Nur linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/bootsetup.h linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/bootsetup.h
93550 --- linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/bootsetup.h 1970-01-01 00:00:00.000000000 +0000
93551 +++ linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/bootsetup.h 2007-01-08 15:00:46.000000000 +0000
93552 @@ -0,0 +1,42 @@
93553 +
93554 +#ifndef _X86_64_BOOTSETUP_H
93555 +#define _X86_64_BOOTSETUP_H 1
93556 +
93557 +#define BOOT_PARAM_SIZE 4096
93558 +extern char x86_boot_params[BOOT_PARAM_SIZE];
93559 +
93560 +/*
93561 + * This is set up by the setup-routine at boot-time
93562 + */
93563 +#define PARAM ((unsigned char *)x86_boot_params)
93564 +#define SCREEN_INFO (*(struct screen_info *) (PARAM+0))
93565 +#define EXT_MEM_K (*(unsigned short *) (PARAM+2))
93566 +#define ALT_MEM_K (*(unsigned int *) (PARAM+0x1e0))
93567 +#define E820_MAP_NR (*(char*) (PARAM+E820NR))
93568 +#define E820_MAP ((struct e820entry *) (PARAM+E820MAP))
93569 +#define APM_BIOS_INFO (*(struct apm_bios_info *) (PARAM+0x40))
93570 +#define DRIVE_INFO (*(struct drive_info_struct *) (PARAM+0x80))
93571 +#define SYS_DESC_TABLE (*(struct sys_desc_table_struct*)(PARAM+0xa0))
93572 +#define MOUNT_ROOT_RDONLY (*(unsigned short *) (PARAM+0x1F2))
93573 +#define RAMDISK_FLAGS (*(unsigned short *) (PARAM+0x1F8))
93574 +#define SAVED_VIDEO_MODE (*(unsigned short *) (PARAM+0x1FA))
93575 +#define ORIG_ROOT_DEV (*(unsigned short *) (PARAM+0x1FC))
93576 +#define AUX_DEVICE_INFO (*(unsigned char *) (PARAM+0x1FF))
93577 +#define LOADER_TYPE (*(unsigned char *) (PARAM+0x210))
93578 +#define KERNEL_START (*(unsigned int *) (PARAM+0x214))
93579 +
93580 +#define INITRD_START (__pa(xen_start_info->mod_start))
93581 +#define INITRD_SIZE (xen_start_info->mod_len)
93582 +#define EDID_INFO (*(struct edid_info *) (PARAM+0x440))
93583 +
93584 +#define EDD_NR (*(unsigned char *) (PARAM+EDDNR))
93585 +#define EDD_MBR_SIG_NR (*(unsigned char *) (PARAM+EDD_MBR_SIG_NR_BUF))
93586 +#define EDD_MBR_SIGNATURE ((unsigned int *) (PARAM+EDD_MBR_SIG_BUF))
93587 +#define EDD_BUF ((struct edd_info *) (PARAM+EDDBUF))
93588 +#define COMMAND_LINE saved_command_line
93589 +
93590 +#define RAMDISK_IMAGE_START_MASK 0x07FF
93591 +#define RAMDISK_PROMPT_FLAG 0x8000
93592 +#define RAMDISK_LOAD_FLAG 0x4000
93593 +
93594 +#endif
93595 diff -Nur linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/desc.h linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/desc.h
93596 --- linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/desc.h 1970-01-01 00:00:00.000000000 +0000
93597 +++ linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/desc.h 2007-01-08 15:00:46.000000000 +0000
93598 @@ -0,0 +1,263 @@
93599 +/* Written 2000 by Andi Kleen */
93600 +#ifndef __ARCH_DESC_H
93601 +#define __ARCH_DESC_H
93602 +
93603 +#include <linux/threads.h>
93604 +#include <asm/ldt.h>
93605 +
93606 +#ifndef __ASSEMBLY__
93607 +
93608 +#include <linux/string.h>
93609 +#include <linux/smp.h>
93610 +
93611 +#include <asm/segment.h>
93612 +#include <asm/mmu.h>
93613 +
93614 +// 8 byte segment descriptor
93615 +struct desc_struct {
93616 + u16 limit0;
93617 + u16 base0;
93618 + unsigned base1 : 8, type : 4, s : 1, dpl : 2, p : 1;
93619 + unsigned limit : 4, avl : 1, l : 1, d : 1, g : 1, base2 : 8;
93620 +} __attribute__((packed));
93621 +
93622 +struct n_desc_struct {
93623 + unsigned int a,b;
93624 +};
93625 +
93626 +enum {
93627 + GATE_INTERRUPT = 0xE,
93628 + GATE_TRAP = 0xF,
93629 + GATE_CALL = 0xC,
93630 +};
93631 +
93632 +// 16byte gate
93633 +struct gate_struct {
93634 + u16 offset_low;
93635 + u16 segment;
93636 + unsigned ist : 3, zero0 : 5, type : 5, dpl : 2, p : 1;
93637 + u16 offset_middle;
93638 + u32 offset_high;
93639 + u32 zero1;
93640 +} __attribute__((packed));
93641 +
93642 +#define PTR_LOW(x) ((unsigned long)(x) & 0xFFFF)
93643 +#define PTR_MIDDLE(x) (((unsigned long)(x) >> 16) & 0xFFFF)
93644 +#define PTR_HIGH(x) ((unsigned long)(x) >> 32)
93645 +
93646 +enum {
93647 + DESC_TSS = 0x9,
93648 + DESC_LDT = 0x2,
93649 +};
93650 +
93651 +// LDT or TSS descriptor in the GDT. 16 bytes.
93652 +struct ldttss_desc {
93653 + u16 limit0;
93654 + u16 base0;
93655 + unsigned base1 : 8, type : 5, dpl : 2, p : 1;
93656 + unsigned limit1 : 4, zero0 : 3, g : 1, base2 : 8;
93657 + u32 base3;
93658 + u32 zero1;
93659 +} __attribute__((packed));
93660 +
93661 +struct desc_ptr {
93662 + unsigned short size;
93663 + unsigned long address;
93664 +} __attribute__((packed)) ;
93665 +
93666 +extern struct desc_ptr idt_descr, cpu_gdt_descr[NR_CPUS];
93667 +
93668 +extern struct desc_struct cpu_gdt_table[GDT_ENTRIES];
93669 +
93670 +#define load_TR_desc() asm volatile("ltr %w0"::"r" (GDT_ENTRY_TSS*8))
93671 +#define load_LDT_desc() asm volatile("lldt %w0"::"r" (GDT_ENTRY_LDT*8))
93672 +
93673 +static inline void clear_LDT(void)
93674 +{
93675 + int cpu = get_cpu();
93676 +
93677 + /*
93678 + * NB. We load the default_ldt for lcall7/27 handling on demand, as
93679 + * it slows down context switching. No one uses it anyway.
93680 + */
93681 + cpu = cpu; /* XXX avoid compiler warning */
93682 + xen_set_ldt(0UL, 0);
93683 + put_cpu();
93684 +}
93685 +
93686 +/*
93687 + * This is the ldt that every process will get unless we need
93688 + * something other than this.
93689 + */
93690 +extern struct desc_struct default_ldt[];
93691 +#ifndef CONFIG_X86_NO_IDT
93692 +extern struct gate_struct idt_table[];
93693 +#endif
93694 +extern struct desc_ptr cpu_gdt_descr[];
93695 +
93696 +/* the cpu gdt accessor */
93697 +#define cpu_gdt(_cpu) ((struct desc_struct *)cpu_gdt_descr[_cpu].address)
93698 +
93699 +static inline void _set_gate(void *adr, unsigned type, unsigned long func, unsigned dpl, unsigned ist)
93700 +{
93701 + struct gate_struct s;
93702 + s.offset_low = PTR_LOW(func);
93703 + s.segment = __KERNEL_CS;
93704 + s.ist = ist;
93705 + s.p = 1;
93706 + s.dpl = dpl;
93707 + s.zero0 = 0;
93708 + s.zero1 = 0;
93709 + s.type = type;
93710 + s.offset_middle = PTR_MIDDLE(func);
93711 + s.offset_high = PTR_HIGH(func);
93712 + /* does not need to be atomic because it is only done once at setup time */
93713 + memcpy(adr, &s, 16);
93714 +}
93715 +
93716 +#ifndef CONFIG_X86_NO_IDT
93717 +static inline void set_intr_gate(int nr, void *func)
93718 +{
93719 + BUG_ON((unsigned)nr > 0xFF);
93720 + _set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 0, 0);
93721 +}
93722 +
93723 +static inline void set_intr_gate_ist(int nr, void *func, unsigned ist)
93724 +{
93725 + BUG_ON((unsigned)nr > 0xFF);
93726 + _set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 0, ist);
93727 +}
93728 +
93729 +static inline void set_system_gate(int nr, void *func)
93730 +{
93731 + BUG_ON((unsigned)nr > 0xFF);
93732 + _set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 3, 0);
93733 +}
93734 +
93735 +static inline void set_system_gate_ist(int nr, void *func, unsigned ist)
93736 +{
93737 + _set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 3, ist);
93738 +}
93739 +#endif
93740 +
93741 +static inline void set_tssldt_descriptor(void *ptr, unsigned long tss, unsigned type,
93742 + unsigned size)
93743 +{
93744 + struct ldttss_desc d;
93745 + memset(&d,0,sizeof(d));
93746 + d.limit0 = size & 0xFFFF;
93747 + d.base0 = PTR_LOW(tss);
93748 + d.base1 = PTR_MIDDLE(tss) & 0xFF;
93749 + d.type = type;
93750 + d.p = 1;
93751 + d.limit1 = (size >> 16) & 0xF;
93752 + d.base2 = (PTR_MIDDLE(tss) >> 8) & 0xFF;
93753 + d.base3 = PTR_HIGH(tss);
93754 + memcpy(ptr, &d, 16);
93755 +}
93756 +
93757 +#ifndef CONFIG_X86_NO_TSS
93758 +static inline void set_tss_desc(unsigned cpu, void *addr)
93759 +{
93760 + /*
93761 + * sizeof(unsigned long) coming from an extra "long" at the end
93762 + * of the iobitmap. See tss_struct definition in processor.h
93763 + *
93764 + * -1? seg base+limit should be pointing to the address of the
93765 + * last valid byte
93766 + */
93767 + set_tssldt_descriptor(&cpu_gdt(cpu)[GDT_ENTRY_TSS],
93768 + (unsigned long)addr, DESC_TSS,
93769 + IO_BITMAP_OFFSET + IO_BITMAP_BYTES + sizeof(unsigned long) - 1);
93770 +}
93771 +#endif
93772 +
93773 +static inline void set_ldt_desc(unsigned cpu, void *addr, int size)
93774 +{
93775 + set_tssldt_descriptor(&cpu_gdt(cpu)[GDT_ENTRY_LDT], (unsigned long)addr,
93776 + DESC_LDT, size * 8 - 1);
93777 +}
93778 +
93779 +static inline void set_seg_base(unsigned cpu, int entry, void *base)
93780 +{
93781 + struct desc_struct *d = &cpu_gdt(cpu)[entry];
93782 + u32 addr = (u32)(u64)base;
93783 + BUG_ON((u64)base >> 32);
93784 + d->base0 = addr & 0xffff;
93785 + d->base1 = (addr >> 16) & 0xff;
93786 + d->base2 = (addr >> 24) & 0xff;
93787 +}
93788 +
93789 +#define LDT_entry_a(info) \
93790 + ((((info)->base_addr & 0x0000ffff) << 16) | ((info)->limit & 0x0ffff))
93791 +/* Don't allow setting of the lm bit. It is useless anyway because
93792 + 64bit system calls require __USER_CS. */
93793 +#define LDT_entry_b(info) \
93794 + (((info)->base_addr & 0xff000000) | \
93795 + (((info)->base_addr & 0x00ff0000) >> 16) | \
93796 + ((info)->limit & 0xf0000) | \
93797 + (((info)->read_exec_only ^ 1) << 9) | \
93798 + ((info)->contents << 10) | \
93799 + (((info)->seg_not_present ^ 1) << 15) | \
93800 + ((info)->seg_32bit << 22) | \
93801 + ((info)->limit_in_pages << 23) | \
93802 + ((info)->useable << 20) | \
93803 + /* ((info)->lm << 21) | */ \
93804 + 0x7000)
93805 +
93806 +#define LDT_empty(info) (\
93807 + (info)->base_addr == 0 && \
93808 + (info)->limit == 0 && \
93809 + (info)->contents == 0 && \
93810 + (info)->read_exec_only == 1 && \
93811 + (info)->seg_32bit == 0 && \
93812 + (info)->limit_in_pages == 0 && \
93813 + (info)->seg_not_present == 1 && \
93814 + (info)->useable == 0 && \
93815 + (info)->lm == 0)
93816 +
93817 +#if TLS_SIZE != 24
93818 +# error update this code.
93819 +#endif
93820 +
93821 +static inline void load_TLS(struct thread_struct *t, unsigned int cpu)
93822 +{
93823 +#if 0
93824 + u64 *gdt = (u64 *)(cpu_gdt(cpu) + GDT_ENTRY_TLS_MIN);
93825 + gdt[0] = t->tls_array[0];
93826 + gdt[1] = t->tls_array[1];
93827 + gdt[2] = t->tls_array[2];
93828 +#endif
93829 +#define C(i) \
93830 + HYPERVISOR_update_descriptor(virt_to_machine(&cpu_gdt(cpu)[GDT_ENTRY_TLS_MIN + i]), t->tls_array[i])
93831 +
93832 + C(0); C(1); C(2);
93833 +#undef C
93834 +}
93835 +
93836 +/*
93837 + * load one particular LDT into the current CPU
93838 + */
93839 +static inline void load_LDT_nolock (mm_context_t *pc, int cpu)
93840 +{
93841 + void *segments = pc->ldt;
93842 + int count = pc->size;
93843 +
93844 + if (likely(!count))
93845 + segments = NULL;
93846 +
93847 + xen_set_ldt((unsigned long)segments, count);
93848 +}
93849 +
93850 +static inline void load_LDT(mm_context_t *pc)
93851 +{
93852 + int cpu = get_cpu();
93853 + load_LDT_nolock(pc, cpu);
93854 + put_cpu();
93855 +}
93856 +
93857 +extern struct desc_ptr idt_descr;
93858 +
93859 +#endif /* !__ASSEMBLY__ */
93860 +
93861 +#endif
93862 diff -Nur linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/dma-mapping.h linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/dma-mapping.h
93863 --- linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/dma-mapping.h 1970-01-01 00:00:00.000000000 +0000
93864 +++ linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/dma-mapping.h 2007-01-08 15:00:46.000000000 +0000
93865 @@ -0,0 +1,191 @@
93866 +#ifndef _X8664_DMA_MAPPING_H
93867 +#define _X8664_DMA_MAPPING_H 1
93868 +
93869 +/*
93870 + * IOMMU interface. See Documentation/DMA-mapping.txt and DMA-API.txt for
93871 + * documentation.
93872 + */
93873 +
93874 +#include <linux/config.h>
93875 +
93876 +#include <asm/scatterlist.h>
93877 +#include <asm/io.h>
93878 +#include <asm/swiotlb.h>
93879 +
93880 +struct dma_mapping_ops {
93881 + int (*mapping_error)(dma_addr_t dma_addr);
93882 + void* (*alloc_coherent)(struct device *dev, size_t size,
93883 + dma_addr_t *dma_handle, gfp_t gfp);
93884 + void (*free_coherent)(struct device *dev, size_t size,
93885 + void *vaddr, dma_addr_t dma_handle);
93886 + dma_addr_t (*map_single)(struct device *hwdev, void *ptr,
93887 + size_t size, int direction);
93888 + /* like map_single, but doesn't check the device mask */
93889 + dma_addr_t (*map_simple)(struct device *hwdev, char *ptr,
93890 + size_t size, int direction);
93891 + void (*unmap_single)(struct device *dev, dma_addr_t addr,
93892 + size_t size, int direction);
93893 + void (*sync_single_for_cpu)(struct device *hwdev,
93894 + dma_addr_t dma_handle, size_t size,
93895 + int direction);
93896 + void (*sync_single_for_device)(struct device *hwdev,
93897 + dma_addr_t dma_handle, size_t size,
93898 + int direction);
93899 + void (*sync_single_range_for_cpu)(struct device *hwdev,
93900 + dma_addr_t dma_handle, unsigned long offset,
93901 + size_t size, int direction);
93902 + void (*sync_single_range_for_device)(struct device *hwdev,
93903 + dma_addr_t dma_handle, unsigned long offset,
93904 + size_t size, int direction);
93905 + void (*sync_sg_for_cpu)(struct device *hwdev,
93906 + struct scatterlist *sg, int nelems,
93907 + int direction);
93908 + void (*sync_sg_for_device)(struct device *hwdev,
93909 + struct scatterlist *sg, int nelems,
93910 + int direction);
93911 + int (*map_sg)(struct device *hwdev, struct scatterlist *sg,
93912 + int nents, int direction);
93913 + void (*unmap_sg)(struct device *hwdev,
93914 + struct scatterlist *sg, int nents,
93915 + int direction);
93916 + int (*dma_supported)(struct device *hwdev, u64 mask);
93917 + int is_phys;
93918 +};
93919 +
93920 +extern dma_addr_t bad_dma_address;
93921 +extern struct dma_mapping_ops* dma_ops;
93922 +extern int iommu_merge;
93923 +
93924 +#if 0
93925 +static inline int dma_mapping_error(dma_addr_t dma_addr)
93926 +{
93927 + if (dma_ops->mapping_error)
93928 + return dma_ops->mapping_error(dma_addr);
93929 +
93930 + return (dma_addr == bad_dma_address);
93931 +}
93932 +
93933 +extern void *dma_alloc_coherent(struct device *dev, size_t size,
93934 + dma_addr_t *dma_handle, gfp_t gfp);
93935 +extern void dma_free_coherent(struct device *dev, size_t size, void *vaddr,
93936 + dma_addr_t dma_handle);
93937 +
93938 +static inline dma_addr_t
93939 +dma_map_single(struct device *hwdev, void *ptr, size_t size,
93940 + int direction)
93941 +{
93942 + return dma_ops->map_single(hwdev, ptr, size, direction);
93943 +}
93944 +
93945 +static inline void
93946 +dma_unmap_single(struct device *dev, dma_addr_t addr,size_t size,
93947 + int direction)
93948 +{
93949 + dma_ops->unmap_single(dev, addr, size, direction);
93950 +}
93951 +
93952 +#define dma_map_page(dev,page,offset,size,dir) \
93953 + dma_map_single((dev), page_address(page)+(offset), (size), (dir))
93954 +
93955 +#define dma_unmap_page dma_unmap_single
93956 +
93957 +static inline void
93958 +dma_sync_single_for_cpu(struct device *hwdev, dma_addr_t dma_handle,
93959 + size_t size, int direction)
93960 +{
93961 + if (dma_ops->sync_single_for_cpu)
93962 + dma_ops->sync_single_for_cpu(hwdev, dma_handle, size,
93963 + direction);
93964 + flush_write_buffers();
93965 +}
93966 +
93967 +static inline void
93968 +dma_sync_single_for_device(struct device *hwdev, dma_addr_t dma_handle,
93969 + size_t size, int direction)
93970 +{
93971 + if (dma_ops->sync_single_for_device)
93972 + dma_ops->sync_single_for_device(hwdev, dma_handle, size,
93973 + direction);
93974 + flush_write_buffers();
93975 +}
93976 +
93977 +static inline void
93978 +dma_sync_single_range_for_cpu(struct device *hwdev, dma_addr_t dma_handle,
93979 + unsigned long offset, size_t size, int direction)
93980 +{
93981 + if (dma_ops->sync_single_range_for_cpu) {
93982 + dma_ops->sync_single_range_for_cpu(hwdev, dma_handle, offset, size, direction);
93983 + }
93984 +
93985 + flush_write_buffers();
93986 +}
93987 +
93988 +static inline void
93989 +dma_sync_single_range_for_device(struct device *hwdev, dma_addr_t dma_handle,
93990 + unsigned long offset, size_t size, int direction)
93991 +{
93992 + if (dma_ops->sync_single_range_for_device)
93993 + dma_ops->sync_single_range_for_device(hwdev, dma_handle,
93994 + offset, size, direction);
93995 +
93996 + flush_write_buffers();
93997 +}
93998 +
93999 +static inline void
94000 +dma_sync_sg_for_cpu(struct device *hwdev, struct scatterlist *sg,
94001 + int nelems, int direction)
94002 +{
94003 + if (dma_ops->sync_sg_for_cpu)
94004 + dma_ops->sync_sg_for_cpu(hwdev, sg, nelems, direction);
94005 + flush_write_buffers();
94006 +}
94007 +
94008 +static inline void
94009 +dma_sync_sg_for_device(struct device *hwdev, struct scatterlist *sg,
94010 + int nelems, int direction)
94011 +{
94012 + if (dma_ops->sync_sg_for_device) {
94013 + dma_ops->sync_sg_for_device(hwdev, sg, nelems, direction);
94014 + }
94015 +
94016 + flush_write_buffers();
94017 +}
94018 +
94019 +static inline int
94020 +dma_map_sg(struct device *hwdev, struct scatterlist *sg, int nents, int direction)
94021 +{
94022 + return dma_ops->map_sg(hwdev, sg, nents, direction);
94023 +}
94024 +
94025 +static inline void
94026 +dma_unmap_sg(struct device *hwdev, struct scatterlist *sg, int nents,
94027 + int direction)
94028 +{
94029 + dma_ops->unmap_sg(hwdev, sg, nents, direction);
94030 +}
94031 +
94032 +extern int dma_supported(struct device *hwdev, u64 mask);
94033 +
94034 +/* same for gart, swiotlb, and nommu */
94035 +static inline int dma_get_cache_alignment(void)
94036 +{
94037 + return boot_cpu_data.x86_clflush_size;
94038 +}
94039 +
94040 +#define dma_is_consistent(h) 1
94041 +
94042 +extern int dma_set_mask(struct device *dev, u64 mask);
94043 +
94044 +static inline void
94045 +dma_cache_sync(void *vaddr, size_t size, enum dma_data_direction dir)
94046 +{
94047 + flush_write_buffers();
94048 +}
94049 +
94050 +extern struct device fallback_dev;
94051 +extern int panic_on_overflow;
94052 +#endif
94053 +
94054 +#endif /* _X8664_DMA_MAPPING_H */
94055 +
94056 +#include <asm-i386/mach-xen/asm/dma-mapping.h>
94057 diff -Nur linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/e820.h linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/e820.h
94058 --- linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/e820.h 1970-01-01 00:00:00.000000000 +0000
94059 +++ linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/e820.h 2007-01-08 15:00:46.000000000 +0000
94060 @@ -0,0 +1,63 @@
94061 +/*
94062 + * structures and definitions for the int 15, ax=e820 memory map
94063 + * scheme.
94064 + *
94065 + * In a nutshell, setup.S populates a scratch table in the
94066 + * empty_zero_block that contains a list of usable address/size
94067 + * duples. In setup.c, this information is transferred into the e820map,
94068 + * and in init.c/numa.c, that new information is used to mark pages
94069 + * reserved or not.
94070 + */
94071 +#ifndef __E820_HEADER
94072 +#define __E820_HEADER
94073 +
94074 +#include <linux/mmzone.h>
94075 +
94076 +#define E820MAP 0x2d0 /* our map */
94077 +#define E820MAX 128 /* number of entries in E820MAP */
94078 +#define E820NR 0x1e8 /* # entries in E820MAP */
94079 +
94080 +#define E820_RAM 1
94081 +#define E820_RESERVED 2
94082 +#define E820_ACPI 3 /* usable as RAM once ACPI tables have been read */
94083 +#define E820_NVS 4
94084 +
94085 +#define HIGH_MEMORY (1024*1024)
94086 +
94087 +#define LOWMEMSIZE() (0x9f000)
94088 +
94089 +#ifndef __ASSEMBLY__
94090 +struct e820entry {
94091 + u64 addr; /* start of memory segment */
94092 + u64 size; /* size of memory segment */
94093 + u32 type; /* type of memory segment */
94094 +} __attribute__((packed));
94095 +
94096 +struct e820map {
94097 + int nr_map;
94098 + struct e820entry map[E820MAX];
94099 +};
94100 +
94101 +extern unsigned long find_e820_area(unsigned long start, unsigned long end,
94102 + unsigned size);
94103 +extern void add_memory_region(unsigned long start, unsigned long size,
94104 + int type);
94105 +extern void setup_memory_region(void);
94106 +extern void contig_e820_setup(void);
94107 +extern unsigned long e820_end_of_ram(void);
94108 +extern void e820_reserve_resources(struct e820entry *e820, int nr_map);
94109 +extern void e820_print_map(char *who);
94110 +extern int e820_mapped(unsigned long start, unsigned long end, unsigned type);
94111 +
94112 +extern void e820_bootmem_free(pg_data_t *pgdat, unsigned long start,unsigned long end);
94113 +extern void e820_setup_gap(struct e820entry *e820, int nr_map);
94114 +extern unsigned long e820_hole_size(unsigned long start_pfn,
94115 + unsigned long end_pfn);
94116 +
94117 +extern void __init parse_memopt(char *p, char **end);
94118 +extern void __init parse_memmapopt(char *p, char **end);
94119 +
94120 +extern struct e820map e820;
94121 +#endif/*!__ASSEMBLY__*/
94122 +
94123 +#endif/*__E820_HEADER*/
94124 diff -Nur linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/fixmap.h linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/fixmap.h
94125 --- linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/fixmap.h 1970-01-01 00:00:00.000000000 +0000
94126 +++ linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/fixmap.h 2007-01-08 15:00:46.000000000 +0000
94127 @@ -0,0 +1,108 @@
94128 +/*
94129 + * fixmap.h: compile-time virtual memory allocation
94130 + *
94131 + * This file is subject to the terms and conditions of the GNU General Public
94132 + * License. See the file "COPYING" in the main directory of this archive
94133 + * for more details.
94134 + *
94135 + * Copyright (C) 1998 Ingo Molnar
94136 + */
94137 +
94138 +#ifndef _ASM_FIXMAP_H
94139 +#define _ASM_FIXMAP_H
94140 +
94141 +#include <linux/config.h>
94142 +#include <linux/kernel.h>
94143 +#include <asm/apicdef.h>
94144 +#include <asm/page.h>
94145 +#include <asm/vsyscall.h>
94146 +#include <asm/vsyscall32.h>
94147 +#include <asm/acpi.h>
94148 +
94149 +/*
94150 + * Here we define all the compile-time 'special' virtual
94151 + * addresses. The point is to have a constant address at
94152 + * compile time, but to set the physical address only
94153 + * in the boot process.
94154 + *
94155 + * these 'compile-time allocated' memory buffers are
94156 + * fixed-size 4k pages. (or larger if used with an increment
94157 + * higher than 1) use fixmap_set(idx,phys) to associate
94158 + * physical memory with fixmap indices.
94159 + *
94160 + * TLB entries of such buffers will not be flushed across
94161 + * task switches.
94162 + */
94163 +
94164 +enum fixed_addresses {
94165 + VSYSCALL_LAST_PAGE,
94166 + VSYSCALL_FIRST_PAGE = VSYSCALL_LAST_PAGE + ((VSYSCALL_END-VSYSCALL_START) >> PAGE_SHIFT) - 1,
94167 + VSYSCALL_HPET,
94168 + FIX_HPET_BASE,
94169 +#ifdef CONFIG_X86_LOCAL_APIC
94170 + FIX_APIC_BASE, /* local (CPU) APIC -- required for SMP or not */
94171 +#endif
94172 +#ifdef CONFIG_X86_IO_APIC
94173 + FIX_IO_APIC_BASE_0,
94174 + FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS-1,
94175 +#endif
94176 +#ifdef CONFIG_ACPI
94177 + FIX_ACPI_BEGIN,
94178 + FIX_ACPI_END = FIX_ACPI_BEGIN + FIX_ACPI_PAGES - 1,
94179 +#endif
94180 + FIX_SHARED_INFO,
94181 +#define NR_FIX_ISAMAPS 256
94182 + FIX_ISAMAP_END,
94183 + FIX_ISAMAP_BEGIN = FIX_ISAMAP_END + NR_FIX_ISAMAPS - 1,
94184 + __end_of_fixed_addresses
94185 +};
94186 +
94187 +extern void __set_fixmap (enum fixed_addresses idx,
94188 + unsigned long phys, pgprot_t flags);
94189 +
94190 +#define set_fixmap(idx, phys) \
94191 + __set_fixmap(idx, phys, PAGE_KERNEL)
94192 +/*
94193 + * Some hardware wants to get fixmapped without caching.
94194 + */
94195 +#define set_fixmap_nocache(idx, phys) \
94196 + __set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE)
94197 +
94198 +#define clear_fixmap(idx) \
94199 + __set_fixmap(idx, 0, __pgprot(0))
94200 +
94201 +#define FIXADDR_TOP (VSYSCALL_END-PAGE_SIZE)
94202 +#define FIXADDR_SIZE (__end_of_fixed_addresses << PAGE_SHIFT)
94203 +#define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE)
94204 +
94205 +/* Only covers 32bit vsyscalls currently. Need another set for 64bit. */
94206 +#define FIXADDR_USER_START ((unsigned long)VSYSCALL32_VSYSCALL)
94207 +#define FIXADDR_USER_END (FIXADDR_USER_START + PAGE_SIZE)
94208 +
94209 +#define __fix_to_virt(x) (FIXADDR_TOP - ((x) << PAGE_SHIFT))
94210 +
94211 +extern void __this_fixmap_does_not_exist(void);
94212 +
94213 +/*
94214 + * 'index to address' translation. If anyone tries to use the idx
94215 + * directly without translation, we catch the bug with a NULL-dereference
94216 + * kernel oops. Illegal ranges of incoming indices are caught too.
94217 + */
94218 +static __always_inline unsigned long fix_to_virt(const unsigned int idx)
94219 +{
94220 + /*
94221 + * this branch gets completely eliminated after inlining,
94222 + * except when someone tries to use fixaddr indices in an
94223 + * illegal way. (such as mixing up address types or using
94224 + * out-of-range indices).
94225 + *
94226 + * If it doesn't get removed, the linker will complain
94227 + * loudly with a reasonably clear error message..
94228 + */
94229 + if (idx >= __end_of_fixed_addresses)
94230 + __this_fixmap_does_not_exist();
94231 +
94232 + return __fix_to_virt(idx);
94233 +}
94234 +
94235 +#endif
94236 diff -Nur linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/floppy.h linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/floppy.h
94237 --- linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/floppy.h 1970-01-01 00:00:00.000000000 +0000
94238 +++ linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/floppy.h 2007-01-08 15:00:46.000000000 +0000
94239 @@ -0,0 +1,206 @@
94240 +/*
94241 + * Architecture specific parts of the Floppy driver
94242 + *
94243 + * This file is subject to the terms and conditions of the GNU General Public
94244 + * License. See the file "COPYING" in the main directory of this archive
94245 + * for more details.
94246 + *
94247 + * Copyright (C) 1995
94248 + *
94249 + * Modifications for Xen are Copyright (c) 2004, Keir Fraser.
94250 + */
94251 +#ifndef __ASM_XEN_X86_64_FLOPPY_H
94252 +#define __ASM_XEN_X86_64_FLOPPY_H
94253 +
94254 +#include <linux/vmalloc.h>
94255 +
94256 +/*
94257 + * The DMA channel used by the floppy controller cannot access data at
94258 + * addresses >= 16MB
94259 + *
94260 + * Went back to the 1MB limit, as some people had problems with the floppy
94261 + * driver otherwise. It doesn't matter much for performance anyway, as most
94262 + * floppy accesses go through the track buffer.
94263 + */
94264 +#define _CROSS_64KB(a,s,vdma) \
94265 +(!(vdma) && ((unsigned long)(a)/K_64 != ((unsigned long)(a) + (s) - 1) / K_64))
94266 +
94267 +/* XEN: Hit DMA paths on the head. This trick from asm-m68k/floppy.h. */
94268 +#include <asm/dma.h>
94269 +#undef MAX_DMA_ADDRESS
94270 +#define MAX_DMA_ADDRESS 0
94271 +#define CROSS_64KB(a,s) (0)
94272 +
94273 +#define fd_inb(port) inb_p(port)
94274 +#define fd_outb(value,port) outb_p(value,port)
94275 +
94276 +#define fd_request_dma() (0)
94277 +#define fd_free_dma() ((void)0)
94278 +#define fd_enable_irq() enable_irq(FLOPPY_IRQ)
94279 +#define fd_disable_irq() disable_irq(FLOPPY_IRQ)
94280 +#define fd_free_irq() free_irq(FLOPPY_IRQ, NULL)
94281 +#define fd_get_dma_residue() vdma_get_dma_residue(FLOPPY_DMA)
94282 +/*
94283 + * Do not use vmalloc/vfree: floppy_release_irq_and_dma() gets called from
94284 + * softirq context via motor_off_callback. A generic bug we happen to trigger.
94285 + */
94286 +#define fd_dma_mem_alloc(size) __get_free_pages(GFP_KERNEL, get_order(size))
94287 +#define fd_dma_mem_free(addr, size) free_pages(addr, get_order(size))
94288 +#define fd_dma_setup(addr, size, mode, io) vdma_dma_setup(addr, size, mode, io)
94289 +
94290 +static int virtual_dma_count;
94291 +static int virtual_dma_residue;
94292 +static char *virtual_dma_addr;
94293 +static int virtual_dma_mode;
94294 +static int doing_pdma;
94295 +
94296 +static irqreturn_t floppy_hardint(int irq, void *dev_id, struct pt_regs * regs)
94297 +{
94298 + register unsigned char st;
94299 +
94300 +#undef TRACE_FLPY_INT
94301 +
94302 +#ifdef TRACE_FLPY_INT
94303 + static int calls=0;
94304 + static int bytes=0;
94305 + static int dma_wait=0;
94306 +#endif
94307 + if (!doing_pdma)
94308 + return floppy_interrupt(irq, dev_id, regs);
94309 +
94310 +#ifdef TRACE_FLPY_INT
94311 + if(!calls)
94312 + bytes = virtual_dma_count;
94313 +#endif
94314 +
94315 + {
94316 + register int lcount;
94317 + register char *lptr;
94318 +
94319 + st = 1;
94320 + for(lcount=virtual_dma_count, lptr=virtual_dma_addr;
94321 + lcount; lcount--, lptr++) {
94322 + st=inb(virtual_dma_port+4) & 0xa0 ;
94323 + if(st != 0xa0)
94324 + break;
94325 + if(virtual_dma_mode)
94326 + outb_p(*lptr, virtual_dma_port+5);
94327 + else
94328 + *lptr = inb_p(virtual_dma_port+5);
94329 + }
94330 + virtual_dma_count = lcount;
94331 + virtual_dma_addr = lptr;
94332 + st = inb(virtual_dma_port+4);
94333 + }
94334 +
94335 +#ifdef TRACE_FLPY_INT
94336 + calls++;
94337 +#endif
94338 + if(st == 0x20)
94339 + return IRQ_HANDLED;
94340 + if(!(st & 0x20)) {
94341 + virtual_dma_residue += virtual_dma_count;
94342 + virtual_dma_count=0;
94343 +#ifdef TRACE_FLPY_INT
94344 + printk("count=%x, residue=%x calls=%d bytes=%d dma_wait=%d\n",
94345 + virtual_dma_count, virtual_dma_residue, calls, bytes,
94346 + dma_wait);
94347 + calls = 0;
94348 + dma_wait=0;
94349 +#endif
94350 + doing_pdma = 0;
94351 + floppy_interrupt(irq, dev_id, regs);
94352 + return IRQ_HANDLED;
94353 + }
94354 +#ifdef TRACE_FLPY_INT
94355 + if(!virtual_dma_count)
94356 + dma_wait++;
94357 +#endif
94358 + return IRQ_HANDLED;
94359 +}
94360 +
94361 +static void fd_disable_dma(void)
94362 +{
94363 + doing_pdma = 0;
94364 + virtual_dma_residue += virtual_dma_count;
94365 + virtual_dma_count=0;
94366 +}
94367 +
94368 +static int vdma_get_dma_residue(unsigned int dummy)
94369 +{
94370 + return virtual_dma_count + virtual_dma_residue;
94371 +}
94372 +
94373 +
94374 +static int fd_request_irq(void)
94375 +{
94376 + return request_irq(FLOPPY_IRQ, floppy_hardint,SA_INTERRUPT,
94377 + "floppy", NULL);
94378 +}
94379 +
94380 +#if 0
94381 +static unsigned long vdma_mem_alloc(unsigned long size)
94382 +{
94383 + return (unsigned long) vmalloc(size);
94384 +
94385 +}
94386 +
94387 +static void vdma_mem_free(unsigned long addr, unsigned long size)
94388 +{
94389 + vfree((void *)addr);
94390 +}
94391 +#endif
94392 +
94393 +static int vdma_dma_setup(char *addr, unsigned long size, int mode, int io)
94394 +{
94395 + doing_pdma = 1;
94396 + virtual_dma_port = io;
94397 + virtual_dma_mode = (mode == DMA_MODE_WRITE);
94398 + virtual_dma_addr = addr;
94399 + virtual_dma_count = size;
94400 + virtual_dma_residue = 0;
94401 + return 0;
94402 +}
94403 +
94404 +/* XEN: This trick to force 'virtual DMA' is from include/asm-m68k/floppy.h. */
94405 +#define FDC1 xen_floppy_init()
94406 +static int FDC2 = -1;
94407 +
94408 +static int xen_floppy_init(void)
94409 +{
94410 + use_virtual_dma = 1;
94411 + can_use_virtual_dma = 1;
94412 + return 0x3f0;
94413 +}
94414 +
94415 +/*
94416 + * Floppy types are stored in the rtc's CMOS RAM and so rtc_lock
94417 + * is needed to prevent corrupted CMOS RAM in case "insmod floppy"
94418 + * coincides with another rtc CMOS user. Paul G.
94419 + */
94420 +#define FLOPPY0_TYPE ({ \
94421 + unsigned long flags; \
94422 + unsigned char val; \
94423 + spin_lock_irqsave(&rtc_lock, flags); \
94424 + val = (CMOS_READ(0x10) >> 4) & 15; \
94425 + spin_unlock_irqrestore(&rtc_lock, flags); \
94426 + val; \
94427 +})
94428 +
94429 +#define FLOPPY1_TYPE ({ \
94430 + unsigned long flags; \
94431 + unsigned char val; \
94432 + spin_lock_irqsave(&rtc_lock, flags); \
94433 + val = CMOS_READ(0x10) & 15; \
94434 + spin_unlock_irqrestore(&rtc_lock, flags); \
94435 + val; \
94436 +})
94437 +
94438 +#define N_FDC 2
94439 +#define N_DRIVE 8
94440 +
94441 +#define FLOPPY_MOTOR_MASK 0xf0
94442 +
94443 +#define EXTRA_FLOPPY_PARAMS
94444 +
94445 +#endif /* __ASM_XEN_X86_64_FLOPPY_H */
94446 diff -Nur linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/hw_irq.h linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/hw_irq.h
94447 --- linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/hw_irq.h 1970-01-01 00:00:00.000000000 +0000
94448 +++ linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/hw_irq.h 2007-01-08 15:00:46.000000000 +0000
94449 @@ -0,0 +1,145 @@
94450 +#ifndef _ASM_HW_IRQ_H
94451 +#define _ASM_HW_IRQ_H
94452 +
94453 +/*
94454 + * linux/include/asm/hw_irq.h
94455 + *
94456 + * (C) 1992, 1993 Linus Torvalds, (C) 1997 Ingo Molnar
94457 + *
94458 + * moved some of the old arch/i386/kernel/irq.h to here. VY
94459 + *
94460 + * IRQ/IPI changes taken from work by Thomas Radke
94461 + * <tomsoft@informatik.tu-chemnitz.de>
94462 + *
94463 + * hacked by Andi Kleen for x86-64.
94464 + *
94465 + * $Id: hw_irq.h,v 1.24 2001/09/14 20:55:03 vojtech Exp $
94466 + */
94467 +
94468 +#ifndef __ASSEMBLY__
94469 +#include <linux/config.h>
94470 +#include <asm/atomic.h>
94471 +#include <asm/irq.h>
94472 +#include <linux/profile.h>
94473 +#include <linux/smp.h>
94474 +
94475 +struct hw_interrupt_type;
94476 +#endif
94477 +
94478 +#define NMI_VECTOR 0x02
94479 +/*
94480 + * IDT vectors usable for external interrupt sources start
94481 + * at 0x20:
94482 + */
94483 +#define FIRST_EXTERNAL_VECTOR 0x20
94484 +
94485 +#define IA32_SYSCALL_VECTOR 0x80
94486 +
94487 +
94488 +/*
94489 + * Vectors 0x20-0x2f are used for ISA interrupts.
94490 + */
94491 +
94492 +/*
94493 + * Special IRQ vectors used by the SMP architecture, 0xf0-0xff
94494 + *
94495 + * some of the following vectors are 'rare', they are merged
94496 + * into a single vector (CALL_FUNCTION_VECTOR) to save vector space.
94497 + * TLB, reschedule and local APIC vectors are performance-critical.
94498 + */
94499 +#ifndef CONFIG_XEN
94500 +#define SPURIOUS_APIC_VECTOR 0xff
94501 +#define ERROR_APIC_VECTOR 0xfe
94502 +#define RESCHEDULE_VECTOR 0xfd
94503 +#define CALL_FUNCTION_VECTOR 0xfc
94504 +/* fb free - please don't re-add KDB here because it's useless
94505 + (hint - think what an NMI bit does to a vector) */
94506 +#define THERMAL_APIC_VECTOR 0xfa
94507 +#define THRESHOLD_APIC_VECTOR 0xf9
94508 +/* f8 free */
94509 +#define INVALIDATE_TLB_VECTOR_END 0xf7
94510 +#define INVALIDATE_TLB_VECTOR_START 0xf0 /* f0-f7 used for TLB flush */
94511 +
94512 +#define NUM_INVALIDATE_TLB_VECTORS 8
94513 +#endif
94514 +
94515 +/*
94516 + * Local APIC timer IRQ vector is on a different priority level,
94517 + * to work around the 'lost local interrupt if more than 2 IRQ
94518 + * sources per level' errata.
94519 + */
94520 +#define LOCAL_TIMER_VECTOR 0xef
94521 +
94522 +/*
94523 + * First APIC vector available to drivers: (vectors 0x30-0xee)
94524 + * we start at 0x31 to spread out vectors evenly between priority
94525 + * levels. (0x80 is the syscall vector)
94526 + */
94527 +#define FIRST_DEVICE_VECTOR 0x31
94528 +#define FIRST_SYSTEM_VECTOR 0xef /* duplicated in irq.h */
94529 +
94530 +
94531 +#ifndef __ASSEMBLY__
94532 +extern u8 irq_vector[NR_IRQ_VECTORS];
94533 +#define IO_APIC_VECTOR(irq) (irq_vector[irq])
94534 +#define AUTO_ASSIGN -1
94535 +
94536 +/*
94537 + * Various low-level irq details needed by irq.c, process.c,
94538 + * time.c, io_apic.c and smp.c
94539 + *
94540 + * Interrupt entry/exit code at both C and assembly level
94541 + */
94542 +
94543 +extern void disable_8259A_irq(unsigned int irq);
94544 +extern void enable_8259A_irq(unsigned int irq);
94545 +extern int i8259A_irq_pending(unsigned int irq);
94546 +extern void make_8259A_irq(unsigned int irq);
94547 +extern void init_8259A(int aeoi);
94548 +extern void FASTCALL(send_IPI_self(int vector));
94549 +extern void init_VISWS_APIC_irqs(void);
94550 +extern void setup_IO_APIC(void);
94551 +extern void disable_IO_APIC(void);
94552 +extern void print_IO_APIC(void);
94553 +extern int IO_APIC_get_PCI_irq_vector(int bus, int slot, int fn);
94554 +extern void send_IPI(int dest, int vector);
94555 +extern void setup_ioapic_dest(void);
94556 +
94557 +extern unsigned long io_apic_irqs;
94558 +
94559 +extern atomic_t irq_err_count;
94560 +extern atomic_t irq_mis_count;
94561 +
94562 +#define IO_APIC_IRQ(x) (((x) >= 16) || ((1<<(x)) & io_apic_irqs))
94563 +
94564 +#define __STR(x) #x
94565 +#define STR(x) __STR(x)
94566 +
94567 +#include <asm/ptrace.h>
94568 +
94569 +#define IRQ_NAME2(nr) nr##_interrupt(void)
94570 +#define IRQ_NAME(nr) IRQ_NAME2(IRQ##nr)
94571 +
94572 +/*
94573 + * SMP has a few special interrupts for IPI messages
94574 + */
94575 +
94576 +#define BUILD_IRQ(nr) \
94577 +asmlinkage void IRQ_NAME(nr); \
94578 +__asm__( \
94579 +"\n.p2align\n" \
94580 +"IRQ" #nr "_interrupt:\n\t" \
94581 + "push $" #nr "-256 ; " \
94582 + "jmp common_interrupt");
94583 +
94584 +extern void resend_irq_on_evtchn(struct hw_interrupt_type *h, unsigned int i);
94585 +static inline void hw_resend_irq(struct hw_interrupt_type *h, unsigned int i)
94586 +{
94587 + resend_irq_on_evtchn(h, i);
94588 +}
94589 +
94590 +#define platform_legacy_irq(irq) ((irq) < 16)
94591 +
94592 +#endif
94593 +
94594 +#endif /* _ASM_HW_IRQ_H */
94595 diff -Nur linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/hypercall.h linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/hypercall.h
94596 --- linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/hypercall.h 1970-01-01 00:00:00.000000000 +0000
94597 +++ linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/hypercall.h 2007-01-08 15:00:46.000000000 +0000
94598 @@ -0,0 +1,406 @@
94599 +/******************************************************************************
94600 + * hypercall.h
94601 + *
94602 + * Linux-specific hypervisor handling.
94603 + *
94604 + * Copyright (c) 2002-2004, K A Fraser
94605 + *
94606 + * 64-bit updates:
94607 + * Benjamin Liu <benjamin.liu@intel.com>
94608 + * Jun Nakajima <jun.nakajima@intel.com>
94609 + *
94610 + * This program is free software; you can redistribute it and/or
94611 + * modify it under the terms of the GNU General Public License version 2
94612 + * as published by the Free Software Foundation; or, when distributed
94613 + * separately from the Linux kernel or incorporated into other
94614 + * software packages, subject to the following license:
94615 + *
94616 + * Permission is hereby granted, free of charge, to any person obtaining a copy
94617 + * of this source file (the "Software"), to deal in the Software without
94618 + * restriction, including without limitation the rights to use, copy, modify,
94619 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
94620 + * and to permit persons to whom the Software is furnished to do so, subject to
94621 + * the following conditions:
94622 + *
94623 + * The above copyright notice and this permission notice shall be included in
94624 + * all copies or substantial portions of the Software.
94625 + *
94626 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
94627 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
94628 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
94629 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
94630 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
94631 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
94632 + * IN THE SOFTWARE.
94633 + */
94634 +
94635 +#ifndef __HYPERCALL_H__
94636 +#define __HYPERCALL_H__
94637 +
94638 +#include <linux/string.h> /* memcpy() */
94639 +
94640 +#ifndef __HYPERVISOR_H__
94641 +# error "please don't include this file directly"
94642 +#endif
94643 +
94644 +#define __STR(x) #x
94645 +#define STR(x) __STR(x)
94646 +
94647 +#ifdef CONFIG_XEN
94648 +#define HYPERCALL_STR(name) \
94649 + "call hypercall_page + ("STR(__HYPERVISOR_##name)" * 32)"
94650 +#else
94651 +#define HYPERCALL_STR(name) \
94652 + "mov hypercall_stubs,%%rax; " \
94653 + "add $("STR(__HYPERVISOR_##name)" * 32),%%rax; " \
94654 + "call *%%rax"
94655 +#endif
94656 +
94657 +#define _hypercall0(type, name) \
94658 +({ \
94659 + long __res; \
94660 + asm volatile ( \
94661 + HYPERCALL_STR(name) \
94662 + : "=a" (__res) \
94663 + : \
94664 + : "memory" ); \
94665 + (type)__res; \
94666 +})
94667 +
94668 +#define _hypercall1(type, name, a1) \
94669 +({ \
94670 + long __res, __ign1; \
94671 + asm volatile ( \
94672 + HYPERCALL_STR(name) \
94673 + : "=a" (__res), "=D" (__ign1) \
94674 + : "1" ((long)(a1)) \
94675 + : "memory" ); \
94676 + (type)__res; \
94677 +})
94678 +
94679 +#define _hypercall2(type, name, a1, a2) \
94680 +({ \
94681 + long __res, __ign1, __ign2; \
94682 + asm volatile ( \
94683 + HYPERCALL_STR(name) \
94684 + : "=a" (__res), "=D" (__ign1), "=S" (__ign2) \
94685 + : "1" ((long)(a1)), "2" ((long)(a2)) \
94686 + : "memory" ); \
94687 + (type)__res; \
94688 +})
94689 +
94690 +#define _hypercall3(type, name, a1, a2, a3) \
94691 +({ \
94692 + long __res, __ign1, __ign2, __ign3; \
94693 + asm volatile ( \
94694 + HYPERCALL_STR(name) \
94695 + : "=a" (__res), "=D" (__ign1), "=S" (__ign2), \
94696 + "=d" (__ign3) \
94697 + : "1" ((long)(a1)), "2" ((long)(a2)), \
94698 + "3" ((long)(a3)) \
94699 + : "memory" ); \
94700 + (type)__res; \
94701 +})
94702 +
94703 +#define _hypercall4(type, name, a1, a2, a3, a4) \
94704 +({ \
94705 + long __res, __ign1, __ign2, __ign3; \
94706 + asm volatile ( \
94707 + "movq %7,%%r10; " \
94708 + HYPERCALL_STR(name) \
94709 + : "=a" (__res), "=D" (__ign1), "=S" (__ign2), \
94710 + "=d" (__ign3) \
94711 + : "1" ((long)(a1)), "2" ((long)(a2)), \
94712 + "3" ((long)(a3)), "g" ((long)(a4)) \
94713 + : "memory", "r10" ); \
94714 + (type)__res; \
94715 +})
94716 +
94717 +#define _hypercall5(type, name, a1, a2, a3, a4, a5) \
94718 +({ \
94719 + long __res, __ign1, __ign2, __ign3; \
94720 + asm volatile ( \
94721 + "movq %7,%%r10; movq %8,%%r8; " \
94722 + HYPERCALL_STR(name) \
94723 + : "=a" (__res), "=D" (__ign1), "=S" (__ign2), \
94724 + "=d" (__ign3) \
94725 + : "1" ((long)(a1)), "2" ((long)(a2)), \
94726 + "3" ((long)(a3)), "g" ((long)(a4)), \
94727 + "g" ((long)(a5)) \
94728 + : "memory", "r10", "r8" ); \
94729 + (type)__res; \
94730 +})
94731 +
94732 +static inline int
94733 +HYPERVISOR_set_trap_table(
94734 + trap_info_t *table)
94735 +{
94736 + return _hypercall1(int, set_trap_table, table);
94737 +}
94738 +
94739 +static inline int
94740 +HYPERVISOR_mmu_update(
94741 + mmu_update_t *req, int count, int *success_count, domid_t domid)
94742 +{
94743 + return _hypercall4(int, mmu_update, req, count, success_count, domid);
94744 +}
94745 +
94746 +static inline int
94747 +HYPERVISOR_mmuext_op(
94748 + struct mmuext_op *op, int count, int *success_count, domid_t domid)
94749 +{
94750 + return _hypercall4(int, mmuext_op, op, count, success_count, domid);
94751 +}
94752 +
94753 +static inline int
94754 +HYPERVISOR_set_gdt(
94755 + unsigned long *frame_list, int entries)
94756 +{
94757 + return _hypercall2(int, set_gdt, frame_list, entries);
94758 +}
94759 +
94760 +static inline int
94761 +HYPERVISOR_stack_switch(
94762 + unsigned long ss, unsigned long esp)
94763 +{
94764 + return _hypercall2(int, stack_switch, ss, esp);
94765 +}
94766 +
94767 +static inline int
94768 +HYPERVISOR_set_callbacks(
94769 + unsigned long event_address, unsigned long failsafe_address,
94770 + unsigned long syscall_address)
94771 +{
94772 + return _hypercall3(int, set_callbacks,
94773 + event_address, failsafe_address, syscall_address);
94774 +}
94775 +
94776 +static inline int
94777 +HYPERVISOR_fpu_taskswitch(
94778 + int set)
94779 +{
94780 + return _hypercall1(int, fpu_taskswitch, set);
94781 +}
94782 +
94783 +static inline int
94784 +HYPERVISOR_sched_op_compat(
94785 + int cmd, unsigned long arg)
94786 +{
94787 + return _hypercall2(int, sched_op_compat, cmd, arg);
94788 +}
94789 +
94790 +static inline int
94791 +HYPERVISOR_sched_op(
94792 + int cmd, void *arg)
94793 +{
94794 + return _hypercall2(int, sched_op, cmd, arg);
94795 +}
94796 +
94797 +static inline long
94798 +HYPERVISOR_set_timer_op(
94799 + u64 timeout)
94800 +{
94801 + return _hypercall1(long, set_timer_op, timeout);
94802 +}
94803 +
94804 +static inline int
94805 +HYPERVISOR_dom0_op(
94806 + dom0_op_t *dom0_op)
94807 +{
94808 + dom0_op->interface_version = DOM0_INTERFACE_VERSION;
94809 + return _hypercall1(int, dom0_op, dom0_op);
94810 +}
94811 +
94812 +static inline int
94813 +HYPERVISOR_set_debugreg(
94814 + int reg, unsigned long value)
94815 +{
94816 + return _hypercall2(int, set_debugreg, reg, value);
94817 +}
94818 +
94819 +static inline unsigned long
94820 +HYPERVISOR_get_debugreg(
94821 + int reg)
94822 +{
94823 + return _hypercall1(unsigned long, get_debugreg, reg);
94824 +}
94825 +
94826 +static inline int
94827 +HYPERVISOR_update_descriptor(
94828 + unsigned long ma, unsigned long word)
94829 +{
94830 + return _hypercall2(int, update_descriptor, ma, word);
94831 +}
94832 +
94833 +static inline int
94834 +HYPERVISOR_memory_op(
94835 + unsigned int cmd, void *arg)
94836 +{
94837 + return _hypercall2(int, memory_op, cmd, arg);
94838 +}
94839 +
94840 +static inline int
94841 +HYPERVISOR_multicall(
94842 + void *call_list, int nr_calls)
94843 +{
94844 + return _hypercall2(int, multicall, call_list, nr_calls);
94845 +}
94846 +
94847 +static inline int
94848 +HYPERVISOR_update_va_mapping(
94849 + unsigned long va, pte_t new_val, unsigned long flags)
94850 +{
94851 + return _hypercall3(int, update_va_mapping, va, new_val.pte, flags);
94852 +}
94853 +
94854 +static inline int
94855 +HYPERVISOR_event_channel_op(
94856 + int cmd, void *arg)
94857 +{
94858 + int rc = _hypercall2(int, event_channel_op, cmd, arg);
94859 +
94860 +#ifdef CONFIG_XEN_COMPAT_030002
94861 + if (unlikely(rc == -ENOSYS)) {
94862 + struct evtchn_op op;
94863 + op.cmd = cmd;
94864 + memcpy(&op.u, arg, sizeof(op.u));
94865 + rc = _hypercall1(int, event_channel_op_compat, &op);
94866 + memcpy(arg, &op.u, sizeof(op.u));
94867 + }
94868 +#endif
94869 +
94870 + return rc;
94871 +}
94872 +
94873 +static inline int
94874 +HYPERVISOR_acm_op(
94875 + int cmd, void *arg)
94876 +{
94877 + return _hypercall2(int, acm_op, cmd, arg);
94878 +}
94879 +
94880 +static inline int
94881 +HYPERVISOR_xen_version(
94882 + int cmd, void *arg)
94883 +{
94884 + return _hypercall2(int, xen_version, cmd, arg);
94885 +}
94886 +
94887 +static inline int
94888 +HYPERVISOR_console_io(
94889 + int cmd, int count, char *str)
94890 +{
94891 + return _hypercall3(int, console_io, cmd, count, str);
94892 +}
94893 +
94894 +static inline int
94895 +HYPERVISOR_physdev_op(
94896 + int cmd, void *arg)
94897 +{
94898 + int rc = _hypercall2(int, physdev_op, cmd, arg);
94899 +
94900 +#ifdef CONFIG_XEN_COMPAT_030002
94901 + if (unlikely(rc == -ENOSYS)) {
94902 + struct physdev_op op;
94903 + op.cmd = cmd;
94904 + memcpy(&op.u, arg, sizeof(op.u));
94905 + rc = _hypercall1(int, physdev_op_compat, &op);
94906 + memcpy(arg, &op.u, sizeof(op.u));
94907 + }
94908 +#endif
94909 +
94910 + return rc;
94911 +}
94912 +
94913 +static inline int
94914 +HYPERVISOR_grant_table_op(
94915 + unsigned int cmd, void *uop, unsigned int count)
94916 +{
94917 + return _hypercall3(int, grant_table_op, cmd, uop, count);
94918 +}
94919 +
94920 +static inline int
94921 +HYPERVISOR_update_va_mapping_otherdomain(
94922 + unsigned long va, pte_t new_val, unsigned long flags, domid_t domid)
94923 +{
94924 + return _hypercall4(int, update_va_mapping_otherdomain, va,
94925 + new_val.pte, flags, domid);
94926 +}
94927 +
94928 +static inline int
94929 +HYPERVISOR_vm_assist(
94930 + unsigned int cmd, unsigned int type)
94931 +{
94932 + return _hypercall2(int, vm_assist, cmd, type);
94933 +}
94934 +
94935 +static inline int
94936 +HYPERVISOR_vcpu_op(
94937 + int cmd, int vcpuid, void *extra_args)
94938 +{
94939 + return _hypercall3(int, vcpu_op, cmd, vcpuid, extra_args);
94940 +}
94941 +
94942 +static inline int
94943 +HYPERVISOR_set_segment_base(
94944 + int reg, unsigned long value)
94945 +{
94946 + return _hypercall2(int, set_segment_base, reg, value);
94947 +}
94948 +
94949 +static inline int
94950 +HYPERVISOR_suspend(
94951 + unsigned long srec)
94952 +{
94953 + struct sched_shutdown sched_shutdown = {
94954 + .reason = SHUTDOWN_suspend
94955 + };
94956 +
94957 + int rc = _hypercall3(int, sched_op, SCHEDOP_shutdown,
94958 + &sched_shutdown, srec);
94959 +
94960 +#ifdef CONFIG_XEN_COMPAT_030002
94961 + if (rc == -ENOSYS)
94962 + rc = _hypercall3(int, sched_op_compat, SCHEDOP_shutdown,
94963 + SHUTDOWN_suspend, srec);
94964 +#endif
94965 +
94966 + return rc;
94967 +}
94968 +
94969 +static inline int
94970 +HYPERVISOR_nmi_op(
94971 + unsigned long op, void *arg)
94972 +{
94973 + return _hypercall2(int, nmi_op, op, arg);
94974 +}
94975 +
94976 +static inline unsigned long
94977 +HYPERVISOR_hvm_op(
94978 + int op, void *arg)
94979 +{
94980 + return _hypercall2(unsigned long, hvm_op, op, arg);
94981 +}
94982 +
94983 +static inline int
94984 +HYPERVISOR_callback_op(
94985 + int cmd, void *arg)
94986 +{
94987 + return _hypercall2(int, callback_op, cmd, arg);
94988 +}
94989 +
94990 +static inline int
94991 +HYPERVISOR_xenoprof_op(
94992 + int op, void *arg)
94993 +{
94994 + return _hypercall2(int, xenoprof_op, op, arg);
94995 +}
94996 +
94997 +static inline int
94998 +HYPERVISOR_kexec_op(
94999 + unsigned long op, void *args)
95000 +{
95001 + return _hypercall2(int, kexec_op, op, args);
95002 +}
95003 +
95004 +#endif /* __HYPERCALL_H__ */
95005 diff -Nur linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/hypervisor.h linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/hypervisor.h
95006 --- linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/hypervisor.h 1970-01-01 00:00:00.000000000 +0000
95007 +++ linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/hypervisor.h 2007-01-08 15:00:46.000000000 +0000
95008 @@ -0,0 +1,2 @@
95009 +
95010 +#include <asm-i386/mach-xen/asm/hypervisor.h>
95011 diff -Nur linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/io.h linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/io.h
95012 --- linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/io.h 1970-01-01 00:00:00.000000000 +0000
95013 +++ linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/io.h 2007-01-08 15:00:46.000000000 +0000
95014 @@ -0,0 +1,361 @@
95015 +#ifndef _ASM_IO_H
95016 +#define _ASM_IO_H
95017 +
95018 +#include <linux/config.h>
95019 +#include <asm/fixmap.h>
95020 +
95021 +/*
95022 + * This file contains the definitions for the x86 IO instructions
95023 + * inb/inw/inl/outb/outw/outl and the "string versions" of the same
95024 + * (insb/insw/insl/outsb/outsw/outsl). You can also use "pausing"
95025 + * versions of the single-IO instructions (inb_p/inw_p/..).
95026 + *
95027 + * This file is not meant to be obfuscating: it's just complicated
95028 + * to (a) handle it all in a way that makes gcc able to optimize it
95029 + * as well as possible and (b) trying to avoid writing the same thing
95030 + * over and over again with slight variations and possibly making a
95031 + * mistake somewhere.
95032 + */
95033 +
95034 +/*
95035 + * Thanks to James van Artsdalen for a better timing-fix than
95036 + * the two short jumps: using outb's to a nonexistent port seems
95037 + * to guarantee better timings even on fast machines.
95038 + *
95039 + * On the other hand, I'd like to be sure of a non-existent port:
95040 + * I feel a bit unsafe about using 0x80 (should be safe, though)
95041 + *
95042 + * Linus
95043 + */
95044 +
95045 + /*
95046 + * Bit simplified and optimized by Jan Hubicka
95047 + * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999.
95048 + *
95049 + * isa_memset_io, isa_memcpy_fromio, isa_memcpy_toio added,
95050 + * isa_read[wl] and isa_write[wl] fixed
95051 + * - Arnaldo Carvalho de Melo <acme@conectiva.com.br>
95052 + */
95053 +
95054 +#define __SLOW_DOWN_IO "\noutb %%al,$0x80"
95055 +
95056 +#ifdef REALLY_SLOW_IO
95057 +#define __FULL_SLOW_DOWN_IO __SLOW_DOWN_IO __SLOW_DOWN_IO __SLOW_DOWN_IO __SLOW_DOWN_IO
95058 +#else
95059 +#define __FULL_SLOW_DOWN_IO __SLOW_DOWN_IO
95060 +#endif
95061 +
95062 +/*
95063 + * Talk about misusing macros..
95064 + */
95065 +#define __OUT1(s,x) \
95066 +static inline void out##s(unsigned x value, unsigned short port) {
95067 +
95068 +#define __OUT2(s,s1,s2) \
95069 +__asm__ __volatile__ ("out" #s " %" s1 "0,%" s2 "1"
95070 +
95071 +#define __OUT(s,s1,x) \
95072 +__OUT1(s,x) __OUT2(s,s1,"w") : : "a" (value), "Nd" (port)); } \
95073 +__OUT1(s##_p,x) __OUT2(s,s1,"w") __FULL_SLOW_DOWN_IO : : "a" (value), "Nd" (port));} \
95074 +
95075 +#define __IN1(s) \
95076 +static inline RETURN_TYPE in##s(unsigned short port) { RETURN_TYPE _v;
95077 +
95078 +#define __IN2(s,s1,s2) \
95079 +__asm__ __volatile__ ("in" #s " %" s2 "1,%" s1 "0"
95080 +
95081 +#define __IN(s,s1,i...) \
95082 +__IN1(s) __IN2(s,s1,"w") : "=a" (_v) : "Nd" (port) ,##i ); return _v; } \
95083 +__IN1(s##_p) __IN2(s,s1,"w") __FULL_SLOW_DOWN_IO : "=a" (_v) : "Nd" (port) ,##i ); return _v; } \
95084 +
95085 +#define __INS(s) \
95086 +static inline void ins##s(unsigned short port, void * addr, unsigned long count) \
95087 +{ __asm__ __volatile__ ("rep ; ins" #s \
95088 +: "=D" (addr), "=c" (count) : "d" (port),"0" (addr),"1" (count)); }
95089 +
95090 +#define __OUTS(s) \
95091 +static inline void outs##s(unsigned short port, const void * addr, unsigned long count) \
95092 +{ __asm__ __volatile__ ("rep ; outs" #s \
95093 +: "=S" (addr), "=c" (count) : "d" (port),"0" (addr),"1" (count)); }
95094 +
95095 +#define RETURN_TYPE unsigned char
95096 +__IN(b,"")
95097 +#undef RETURN_TYPE
95098 +#define RETURN_TYPE unsigned short
95099 +__IN(w,"")
95100 +#undef RETURN_TYPE
95101 +#define RETURN_TYPE unsigned int
95102 +__IN(l,"")
95103 +#undef RETURN_TYPE
95104 +
95105 +__OUT(b,"b",char)
95106 +__OUT(w,"w",short)
95107 +__OUT(l,,int)
95108 +
95109 +__INS(b)
95110 +__INS(w)
95111 +__INS(l)
95112 +
95113 +__OUTS(b)
95114 +__OUTS(w)
95115 +__OUTS(l)
95116 +
95117 +#define IO_SPACE_LIMIT 0xffff
95118 +
95119 +#if defined(__KERNEL__) && __x86_64__
95120 +
95121 +#include <linux/vmalloc.h>
95122 +
95123 +#ifndef __i386__
95124 +/*
95125 + * Change virtual addresses to physical addresses and vv.
95126 + * These are pretty trivial
95127 + */
95128 +static inline unsigned long virt_to_phys(volatile void * address)
95129 +{
95130 + return __pa(address);
95131 +}
95132 +
95133 +static inline void * phys_to_virt(unsigned long address)
95134 +{
95135 + return __va(address);
95136 +}
95137 +
95138 +#define virt_to_bus(_x) phys_to_machine(__pa(_x))
95139 +#define bus_to_virt(_x) __va(machine_to_phys(_x))
95140 +#endif
95141 +
95142 +/*
95143 + * Change "struct page" to physical address.
95144 + */
95145 +#define page_to_pseudophys(page) ((dma_addr_t)page_to_pfn(page) << PAGE_SHIFT)
95146 +#define page_to_phys(page) (phys_to_machine(page_to_pseudophys(page)))
95147 +#define page_to_bus(page) (phys_to_machine(page_to_pseudophys(page)))
95148 +
95149 +#define bio_to_pseudophys(bio) (page_to_pseudophys(bio_page((bio))) + \
95150 + (unsigned long) bio_offset((bio)))
95151 +#define bvec_to_pseudophys(bv) (page_to_pseudophys((bv)->bv_page) + \
95152 + (unsigned long) (bv)->bv_offset)
95153 +
95154 +#define BIOVEC_PHYS_MERGEABLE(vec1, vec2) \
95155 + (((bvec_to_phys((vec1)) + (vec1)->bv_len) == bvec_to_phys((vec2))) && \
95156 + ((bvec_to_pseudophys((vec1)) + (vec1)->bv_len) == \
95157 + bvec_to_pseudophys((vec2))))
95158 +
95159 +#include <asm-generic/iomap.h>
95160 +
95161 +extern void __iomem *__ioremap(unsigned long offset, unsigned long size, unsigned long flags);
95162 +
95163 +static inline void __iomem * ioremap (unsigned long offset, unsigned long size)
95164 +{
95165 + return __ioremap(offset, size, 0);
95166 +}
95167 +
95168 +/*
95169 + * This one maps high address device memory and turns off caching for that area.
95170 + * It's useful if some control registers are in such an area and write combining
95171 + * or read caching is not desirable:
95172 + */
95173 +extern void __iomem * ioremap_nocache (unsigned long offset, unsigned long size);
95174 +extern void iounmap(volatile void __iomem *addr);
95175 +
95176 +/* Use normal IO mappings for DMI */
95177 +#define dmi_ioremap ioremap
95178 +#define dmi_iounmap(x,l) iounmap(x)
95179 +#define dmi_alloc(l) kmalloc(l, GFP_ATOMIC)
95180 +
95181 +/*
95182 + * ISA I/O bus memory addresses are 1:1 with the physical address.
95183 + */
95184 +
95185 +#define isa_virt_to_bus(_x) isa_virt_to_bus_is_UNSUPPORTED->x
95186 +#define isa_page_to_bus(_x) isa_page_to_bus_is_UNSUPPORTED->x
95187 +#define isa_bus_to_virt(_x) (void *)(__fix_to_virt(FIX_ISAMAP_BEGIN) + (_x))
95188 +
95189 +/*
95190 + * However PCI ones are not necessarily 1:1 and therefore these interfaces
95191 + * are forbidden in portable PCI drivers.
95192 + *
95193 + * Allow them on x86 for legacy drivers, though.
95194 + */
95195 +#define virt_to_bus(_x) phys_to_machine(__pa(_x))
95196 +#define bus_to_virt(_x) __va(machine_to_phys(_x))
95197 +
95198 +/*
95199 + * readX/writeX() are used to access memory mapped devices. On some
95200 + * architectures the memory mapped IO stuff needs to be accessed
95201 + * differently. On the x86 architecture, we just read/write the
95202 + * memory location directly.
95203 + */
95204 +
95205 +static inline __u8 __readb(const volatile void __iomem *addr)
95206 +{
95207 + return *(__force volatile __u8 *)addr;
95208 +}
95209 +static inline __u16 __readw(const volatile void __iomem *addr)
95210 +{
95211 + return *(__force volatile __u16 *)addr;
95212 +}
95213 +static inline __u32 __readl(const volatile void __iomem *addr)
95214 +{
95215 + return *(__force volatile __u32 *)addr;
95216 +}
95217 +static inline __u64 __readq(const volatile void __iomem *addr)
95218 +{
95219 + return *(__force volatile __u64 *)addr;
95220 +}
95221 +#define readb(x) __readb(x)
95222 +#define readw(x) __readw(x)
95223 +#define readl(x) __readl(x)
95224 +#define readq(x) __readq(x)
95225 +#define readb_relaxed(a) readb(a)
95226 +#define readw_relaxed(a) readw(a)
95227 +#define readl_relaxed(a) readl(a)
95228 +#define readq_relaxed(a) readq(a)
95229 +#define __raw_readb readb
95230 +#define __raw_readw readw
95231 +#define __raw_readl readl
95232 +#define __raw_readq readq
95233 +
95234 +#define mmiowb()
95235 +
95236 +#ifdef CONFIG_UNORDERED_IO
95237 +static inline void __writel(__u32 val, volatile void __iomem *addr)
95238 +{
95239 + volatile __u32 __iomem *target = addr;
95240 + asm volatile("movnti %1,%0"
95241 + : "=m" (*target)
95242 + : "r" (val) : "memory");
95243 +}
95244 +
95245 +static inline void __writeq(__u64 val, volatile void __iomem *addr)
95246 +{
95247 + volatile __u64 __iomem *target = addr;
95248 + asm volatile("movnti %1,%0"
95249 + : "=m" (*target)
95250 + : "r" (val) : "memory");
95251 +}
95252 +#else
95253 +static inline void __writel(__u32 b, volatile void __iomem *addr)
95254 +{
95255 + *(__force volatile __u32 *)addr = b;
95256 +}
95257 +static inline void __writeq(__u64 b, volatile void __iomem *addr)
95258 +{
95259 + *(__force volatile __u64 *)addr = b;
95260 +}
95261 +#endif
95262 +static inline void __writeb(__u8 b, volatile void __iomem *addr)
95263 +{
95264 + *(__force volatile __u8 *)addr = b;
95265 +}
95266 +static inline void __writew(__u16 b, volatile void __iomem *addr)
95267 +{
95268 + *(__force volatile __u16 *)addr = b;
95269 +}
95270 +#define writeq(val,addr) __writeq((val),(addr))
95271 +#define writel(val,addr) __writel((val),(addr))
95272 +#define writew(val,addr) __writew((val),(addr))
95273 +#define writeb(val,addr) __writeb((val),(addr))
95274 +#define __raw_writeb writeb
95275 +#define __raw_writew writew
95276 +#define __raw_writel writel
95277 +#define __raw_writeq writeq
95278 +
95279 +void __memcpy_fromio(void*,unsigned long,unsigned);
95280 +void __memcpy_toio(unsigned long,const void*,unsigned);
95281 +
95282 +static inline void memcpy_fromio(void *to, const volatile void __iomem *from, unsigned len)
95283 +{
95284 + __memcpy_fromio(to,(unsigned long)from,len);
95285 +}
95286 +static inline void memcpy_toio(volatile void __iomem *to, const void *from, unsigned len)
95287 +{
95288 + __memcpy_toio((unsigned long)to,from,len);
95289 +}
95290 +
95291 +void memset_io(volatile void __iomem *a, int b, size_t c);
95292 +
95293 +/*
95294 + * ISA space is 'always mapped' on a typical x86 system, no need to
95295 + * explicitly ioremap() it. The fact that the ISA IO space is mapped
95296 + * to PAGE_OFFSET is pure coincidence - it does not mean ISA values
95297 + * are physical addresses. The following constant pointer can be
95298 + * used as the IO-area pointer (it can be iounmapped as well, so the
95299 + * analogy with PCI is quite large):
95300 + */
95301 +#define __ISA_IO_base ((char __iomem *)(fix_to_virt(FIX_ISAMAP_BEGIN)))
95302 +
95303 +#define isa_readb(a) readb(__ISA_IO_base + (a))
95304 +#define isa_readw(a) readw(__ISA_IO_base + (a))
95305 +#define isa_readl(a) readl(__ISA_IO_base + (a))
95306 +#define isa_writeb(b,a) writeb(b,__ISA_IO_base + (a))
95307 +#define isa_writew(w,a) writew(w,__ISA_IO_base + (a))
95308 +#define isa_writel(l,a) writel(l,__ISA_IO_base + (a))
95309 +#define isa_memset_io(a,b,c) memset_io(__ISA_IO_base + (a),(b),(c))
95310 +#define isa_memcpy_fromio(a,b,c) memcpy_fromio((a),__ISA_IO_base + (b),(c))
95311 +#define isa_memcpy_toio(a,b,c) memcpy_toio(__ISA_IO_base + (a),(b),(c))
95312 +
95313 +
95314 +/*
95315 + * Again, x86-64 does not require mem IO specific function.
95316 + */
95317 +
95318 +#define eth_io_copy_and_sum(a,b,c,d) eth_copy_and_sum((a),(void *)(b),(c),(d))
95319 +#define isa_eth_io_copy_and_sum(a,b,c,d) eth_copy_and_sum((a),(void *)(__ISA_IO_base + (b)),(c),(d))
95320 +
95321 +/**
95322 + * check_signature - find BIOS signatures
95323 + * @io_addr: mmio address to check
95324 + * @signature: signature block
95325 + * @length: length of signature
95326 + *
95327 + * Perform a signature comparison with the mmio address io_addr. This
95328 + * address should have been obtained by ioremap.
95329 + * Returns 1 on a match.
95330 + */
95331 +
95332 +static inline int check_signature(void __iomem *io_addr,
95333 + const unsigned char *signature, int length)
95334 +{
95335 + int retval = 0;
95336 + do {
95337 + if (readb(io_addr) != *signature)
95338 + goto out;
95339 + io_addr++;
95340 + signature++;
95341 + length--;
95342 + } while (length);
95343 + retval = 1;
95344 +out:
95345 + return retval;
95346 +}
95347 +
95348 +/* Nothing to do */
95349 +
95350 +#define dma_cache_inv(_start,_size) do { } while (0)
95351 +#define dma_cache_wback(_start,_size) do { } while (0)
95352 +#define dma_cache_wback_inv(_start,_size) do { } while (0)
95353 +
95354 +#define flush_write_buffers()
95355 +
95356 +extern int iommu_bio_merge;
95357 +#define BIO_VMERGE_BOUNDARY iommu_bio_merge
95358 +
95359 +/*
95360 + * Convert a physical pointer to a virtual kernel pointer for /dev/mem
95361 + * access
95362 + */
95363 +#define xlate_dev_mem_ptr(p, sz) ioremap(p, sz)
95364 +#define xlate_dev_mem_ptr_unmap(p) iounmap(p)
95365 +
95366 +/*
95367 + * Convert a virtual cached pointer to an uncached pointer
95368 + */
95369 +#define xlate_dev_kmem_ptr(p) p
95370 +
95371 +#endif /* __KERNEL__ */
95372 +
95373 +#define ARCH_HAS_DEV_MEM
95374 +
95375 +#endif
95376 diff -Nur linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/irq.h linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/irq.h
95377 --- linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/irq.h 1970-01-01 00:00:00.000000000 +0000
95378 +++ linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/irq.h 2007-01-08 15:00:46.000000000 +0000
95379 @@ -0,0 +1,39 @@
95380 +#ifndef _ASM_IRQ_H
95381 +#define _ASM_IRQ_H
95382 +
95383 +/*
95384 + * linux/include/asm/irq.h
95385 + *
95386 + * (C) 1992, 1993 Linus Torvalds, (C) 1997 Ingo Molnar
95387 + *
95388 + * IRQ/IPI changes taken from work by Thomas Radke
95389 + * <tomsoft@informatik.tu-chemnitz.de>
95390 + */
95391 +
95392 +#include <linux/config.h>
95393 +#include <linux/sched.h>
95394 +/* include comes from machine specific directory */
95395 +#include "irq_vectors.h"
95396 +#include <asm/thread_info.h>
95397 +
95398 +static __inline__ int irq_canonicalize(int irq)
95399 +{
95400 + return ((irq == 2) ? 9 : irq);
95401 +}
95402 +
95403 +#ifdef CONFIG_X86_LOCAL_APIC
95404 +#define ARCH_HAS_NMI_WATCHDOG /* See include/linux/nmi.h */
95405 +#endif
95406 +
95407 +#define KDB_VECTOR 0xf9
95408 +
95409 +# define irq_ctx_init(cpu) do { } while (0)
95410 +
95411 +#ifdef CONFIG_HOTPLUG_CPU
95412 +#include <linux/cpumask.h>
95413 +extern void fixup_irqs(cpumask_t map);
95414 +#endif
95415 +
95416 +#define __ARCH_HAS_DO_SOFTIRQ 1
95417 +
95418 +#endif /* _ASM_IRQ_H */
95419 diff -Nur linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/maddr.h linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/maddr.h
95420 --- linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/maddr.h 1970-01-01 00:00:00.000000000 +0000
95421 +++ linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/maddr.h 2007-01-08 15:00:46.000000000 +0000
95422 @@ -0,0 +1,150 @@
95423 +#ifndef _X86_64_MADDR_H
95424 +#define _X86_64_MADDR_H
95425 +
95426 +#include <xen/features.h>
95427 +#include <xen/interface/xen.h>
95428 +
95429 +/**** MACHINE <-> PHYSICAL CONVERSION MACROS ****/
95430 +#define INVALID_P2M_ENTRY (~0UL)
95431 +#define FOREIGN_FRAME_BIT (1UL<<63)
95432 +#define FOREIGN_FRAME(m) ((m) | FOREIGN_FRAME_BIT)
95433 +
95434 +/* Definitions for machine and pseudophysical addresses. */
95435 +typedef unsigned long paddr_t;
95436 +typedef unsigned long maddr_t;
95437 +
95438 +#ifdef CONFIG_XEN
95439 +
95440 +extern unsigned long *phys_to_machine_mapping;
95441 +
95442 +#undef machine_to_phys_mapping
95443 +extern unsigned long *machine_to_phys_mapping;
95444 +extern unsigned int machine_to_phys_order;
95445 +
95446 +static inline unsigned long pfn_to_mfn(unsigned long pfn)
95447 +{
95448 + if (xen_feature(XENFEAT_auto_translated_physmap))
95449 + return pfn;
95450 + return phys_to_machine_mapping[(unsigned int)(pfn)] &
95451 + ~FOREIGN_FRAME_BIT;
95452 +}
95453 +
95454 +static inline int phys_to_machine_mapping_valid(unsigned long pfn)
95455 +{
95456 + if (xen_feature(XENFEAT_auto_translated_physmap))
95457 + return 1;
95458 + return (phys_to_machine_mapping[pfn] != INVALID_P2M_ENTRY);
95459 +}
95460 +
95461 +static inline unsigned long mfn_to_pfn(unsigned long mfn)
95462 +{
95463 + unsigned long pfn;
95464 +
95465 + if (xen_feature(XENFEAT_auto_translated_physmap))
95466 + return mfn;
95467 +
95468 + if (unlikely((mfn >> machine_to_phys_order) != 0))
95469 + return end_pfn;
95470 +
95471 + /* The array access can fail (e.g., device space beyond end of RAM). */
95472 + asm (
95473 + "1: movq %1,%0\n"
95474 + "2:\n"
95475 + ".section .fixup,\"ax\"\n"
95476 + "3: movq %2,%0\n"
95477 + " jmp 2b\n"
95478 + ".previous\n"
95479 + ".section __ex_table,\"a\"\n"
95480 + " .align 8\n"
95481 + " .quad 1b,3b\n"
95482 + ".previous"
95483 + : "=r" (pfn)
95484 + : "m" (machine_to_phys_mapping[mfn]), "m" (end_pfn) );
95485 +
95486 + return pfn;
95487 +}
95488 +
95489 +/*
95490 + * We detect special mappings in one of two ways:
95491 + * 1. If the MFN is an I/O page then Xen will set the m2p entry
95492 + * to be outside our maximum possible pseudophys range.
95493 + * 2. If the MFN belongs to a different domain then we will certainly
95494 + * not have MFN in our p2m table. Conversely, if the page is ours,
95495 + * then we'll have p2m(m2p(MFN))==MFN.
95496 + * If we detect a special mapping then it doesn't have a 'struct page'.
95497 + * We force !pfn_valid() by returning an out-of-range pointer.
95498 + *
95499 + * NB. These checks require that, for any MFN that is not in our reservation,
95500 + * there is no PFN such that p2m(PFN) == MFN. Otherwise we can get confused if
95501 + * we are foreign-mapping the MFN, and the other domain has m2p(MFN) == PFN.
95502 + * Yikes! Various places must poke in INVALID_P2M_ENTRY for safety.
95503 + *
95504 + * NB2. When deliberately mapping foreign pages into the p2m table, you *must*
95505 + * use FOREIGN_FRAME(). This will cause pte_pfn() to choke on it, as we
95506 + * require. In all the cases we care about, the FOREIGN_FRAME bit is
95507 + * masked (e.g., pfn_to_mfn()) so behaviour there is correct.
95508 + */
95509 +static inline unsigned long mfn_to_local_pfn(unsigned long mfn)
95510 +{
95511 + unsigned long pfn = mfn_to_pfn(mfn);
95512 + if ((pfn < end_pfn)
95513 + && !xen_feature(XENFEAT_auto_translated_physmap)
95514 + && (phys_to_machine_mapping[pfn] != mfn))
95515 + return end_pfn; /* force !pfn_valid() */
95516 + return pfn;
95517 +}
95518 +
95519 +static inline void set_phys_to_machine(unsigned long pfn, unsigned long mfn)
95520 +{
95521 + if (xen_feature(XENFEAT_auto_translated_physmap)) {
95522 + BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
95523 + return;
95524 + }
95525 + phys_to_machine_mapping[pfn] = mfn;
95526 +}
95527 +
95528 +static inline maddr_t phys_to_machine(paddr_t phys)
95529 +{
95530 + maddr_t machine = pfn_to_mfn(phys >> PAGE_SHIFT);
95531 + machine = (machine << PAGE_SHIFT) | (phys & ~PAGE_MASK);
95532 + return machine;
95533 +}
95534 +
95535 +static inline paddr_t machine_to_phys(maddr_t machine)
95536 +{
95537 + paddr_t phys = mfn_to_pfn(machine >> PAGE_SHIFT);
95538 + phys = (phys << PAGE_SHIFT) | (machine & ~PAGE_MASK);
95539 + return phys;
95540 +}
95541 +
95542 +static inline paddr_t pte_machine_to_phys(maddr_t machine)
95543 +{
95544 + paddr_t phys;
95545 + phys = mfn_to_pfn((machine & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT);
95546 + phys = (phys << PAGE_SHIFT) | (machine & ~PHYSICAL_PAGE_MASK);
95547 + return phys;
95548 +}
95549 +
95550 +#else /* !CONFIG_XEN */
95551 +
95552 +#define pfn_to_mfn(pfn) (pfn)
95553 +#define mfn_to_pfn(mfn) (mfn)
95554 +#define mfn_to_local_pfn(mfn) (mfn)
95555 +#define set_phys_to_machine(pfn, mfn) BUG_ON((pfn) != (mfn))
95556 +#define phys_to_machine_mapping_valid(pfn) (1)
95557 +#define phys_to_machine(phys) ((maddr_t)(phys))
95558 +#define machine_to_phys(mach) ((paddr_t)(mach))
95559 +#define pte_machine_to_phys(mach) ((paddr_t)(mach))
95560 +
95561 +#endif /* !CONFIG_XEN */
95562 +
95563 +/* VIRT <-> MACHINE conversion */
95564 +#define virt_to_machine(v) (phys_to_machine(__pa(v)))
95565 +#define virt_to_mfn(v) (pfn_to_mfn(__pa(v) >> PAGE_SHIFT))
95566 +#define mfn_to_virt(m) (__va(mfn_to_pfn(m) << PAGE_SHIFT))
95567 +
95568 +#define __pte_ma(x) ((pte_t) { (x) } )
95569 +#define pfn_pte_ma(pfn, prot) __pte_ma((((pfn) << PAGE_SHIFT) | pgprot_val(prot)) & __supported_pte_mask)
95570 +
95571 +#endif /* _X86_64_MADDR_H */
95572 +
95573 diff -Nur linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/mmu.h linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/mmu.h
95574 --- linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/mmu.h 1970-01-01 00:00:00.000000000 +0000
95575 +++ linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/mmu.h 2007-01-08 15:00:46.000000000 +0000
95576 @@ -0,0 +1,38 @@
95577 +#ifndef __x86_64_MMU_H
95578 +#define __x86_64_MMU_H
95579 +
95580 +#include <linux/spinlock.h>
95581 +#include <asm/semaphore.h>
95582 +
95583 +/*
95584 + * The x86_64 doesn't have a mmu context, but
95585 + * we put the segment information here.
95586 + *
95587 + * cpu_vm_mask is used to optimize ldt flushing.
95588 + */
95589 +typedef struct {
95590 + void *ldt;
95591 + rwlock_t ldtlock;
95592 + int size;
95593 + struct semaphore sem;
95594 +#ifdef CONFIG_XEN
95595 + unsigned pinned:1;
95596 + unsigned has_foreign_mappings:1;
95597 + struct list_head unpinned;
95598 +#endif
95599 +} mm_context_t;
95600 +
95601 +#ifdef CONFIG_XEN
95602 +extern struct list_head mm_unpinned;
95603 +extern spinlock_t mm_unpinned_lock;
95604 +
95605 +/* mm/memory.c:exit_mmap hook */
95606 +extern void _arch_exit_mmap(struct mm_struct *mm);
95607 +#define arch_exit_mmap(_mm) _arch_exit_mmap(_mm)
95608 +
95609 +/* kernel/fork.c:dup_mmap hook */
95610 +extern void _arch_dup_mmap(struct mm_struct *mm);
95611 +#define arch_dup_mmap(mm, oldmm) ((void)(oldmm), _arch_dup_mmap(mm))
95612 +#endif
95613 +
95614 +#endif
95615 diff -Nur linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/mmu_context.h linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/mmu_context.h
95616 --- linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/mmu_context.h 1970-01-01 00:00:00.000000000 +0000
95617 +++ linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/mmu_context.h 2007-01-08 15:00:46.000000000 +0000
95618 @@ -0,0 +1,136 @@
95619 +#ifndef __X86_64_MMU_CONTEXT_H
95620 +#define __X86_64_MMU_CONTEXT_H
95621 +
95622 +#include <linux/config.h>
95623 +#include <asm/desc.h>
95624 +#include <asm/atomic.h>
95625 +#include <asm/pgalloc.h>
95626 +#include <asm/page.h>
95627 +#include <asm/pda.h>
95628 +#include <asm/pgtable.h>
95629 +#include <asm/tlbflush.h>
95630 +
95631 +/*
95632 + * possibly do the LDT unload here?
95633 + */
95634 +int init_new_context(struct task_struct *tsk, struct mm_struct *mm);
95635 +void destroy_context(struct mm_struct *mm);
95636 +
95637 +static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
95638 +{
95639 +#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
95640 + if (read_pda(mmu_state) == TLBSTATE_OK)
95641 + write_pda(mmu_state, TLBSTATE_LAZY);
95642 +#endif
95643 +}
95644 +
95645 +#define prepare_arch_switch(next) __prepare_arch_switch()
95646 +
95647 +static inline void __prepare_arch_switch(void)
95648 +{
95649 + /*
95650 + * Save away %es, %ds, %fs and %gs. Must happen before reload
95651 + * of cr3/ldt (i.e., not in __switch_to).
95652 + */
95653 + __asm__ __volatile__ (
95654 + "mov %%es,%0 ; mov %%ds,%1 ; mov %%fs,%2 ; mov %%gs,%3"
95655 + : "=m" (current->thread.es),
95656 + "=m" (current->thread.ds),
95657 + "=m" (current->thread.fsindex),
95658 + "=m" (current->thread.gsindex) );
95659 +
95660 + if (current->thread.ds)
95661 + __asm__ __volatile__ ( "movl %0,%%ds" : : "r" (0) );
95662 +
95663 + if (current->thread.es)
95664 + __asm__ __volatile__ ( "movl %0,%%es" : : "r" (0) );
95665 +
95666 + if (current->thread.fsindex) {
95667 + __asm__ __volatile__ ( "movl %0,%%fs" : : "r" (0) );
95668 + current->thread.fs = 0;
95669 + }
95670 +
95671 + if (current->thread.gsindex) {
95672 + load_gs_index(0);
95673 + current->thread.gs = 0;
95674 + }
95675 +}
95676 +
95677 +extern void mm_pin(struct mm_struct *mm);
95678 +extern void mm_unpin(struct mm_struct *mm);
95679 +void mm_pin_all(void);
95680 +
95681 +static inline void load_cr3(pgd_t *pgd)
95682 +{
95683 + asm volatile("movq %0,%%cr3" :: "r" (phys_to_machine(__pa(pgd))) :
95684 + "memory");
95685 +}
95686 +
95687 +static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
95688 + struct task_struct *tsk)
95689 +{
95690 + unsigned cpu = smp_processor_id();
95691 + struct mmuext_op _op[3], *op = _op;
95692 +
95693 + if (likely(prev != next)) {
95694 + BUG_ON(!next->context.pinned);
95695 +
95696 + /* stop flush ipis for the previous mm */
95697 + clear_bit(cpu, &prev->cpu_vm_mask);
95698 +#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
95699 + write_pda(mmu_state, TLBSTATE_OK);
95700 + write_pda(active_mm, next);
95701 +#endif
95702 + set_bit(cpu, &next->cpu_vm_mask);
95703 +
95704 + /* load_cr3(next->pgd) */
95705 + op->cmd = MMUEXT_NEW_BASEPTR;
95706 + op->arg1.mfn = pfn_to_mfn(__pa(next->pgd) >> PAGE_SHIFT);
95707 + op++;
95708 +
95709 + /* xen_new_user_pt(__pa(__user_pgd(next->pgd))) */
95710 + op->cmd = MMUEXT_NEW_USER_BASEPTR;
95711 + op->arg1.mfn = pfn_to_mfn(__pa(__user_pgd(next->pgd)) >> PAGE_SHIFT);
95712 + op++;
95713 +
95714 + if (unlikely(next->context.ldt != prev->context.ldt)) {
95715 + /* load_LDT_nolock(&next->context, cpu) */
95716 + op->cmd = MMUEXT_SET_LDT;
95717 + op->arg1.linear_addr = (unsigned long)next->context.ldt;
95718 + op->arg2.nr_ents = next->context.size;
95719 + op++;
95720 + }
95721 +
95722 + BUG_ON(HYPERVISOR_mmuext_op(_op, op-_op, NULL, DOMID_SELF));
95723 + }
95724 +#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
95725 + else {
95726 + write_pda(mmu_state, TLBSTATE_OK);
95727 + if (read_pda(active_mm) != next)
95728 + out_of_line_bug();
95729 + if(!test_and_set_bit(cpu, &next->cpu_vm_mask)) {
95730 + /* We were in lazy tlb mode and leave_mm disabled
95731 + * tlb flush IPI delivery. We must reload CR3
95732 + * to make sure to use no freed page tables.
95733 + */
95734 + load_cr3(next->pgd);
95735 + xen_new_user_pt(__pa(__user_pgd(next->pgd)));
95736 + load_LDT_nolock(&next->context, cpu);
95737 + }
95738 + }
95739 +#endif
95740 +}
95741 +
95742 +#define deactivate_mm(tsk,mm) do { \
95743 + load_gs_index(0); \
95744 + asm volatile("movl %0,%%fs"::"r"(0)); \
95745 +} while(0)
95746 +
95747 +static inline void activate_mm(struct mm_struct *prev, struct mm_struct *next)
95748 +{
95749 + if (!next->context.pinned)
95750 + mm_pin(next);
95751 + switch_mm(prev, next, NULL);
95752 +}
95753 +
95754 +#endif
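switch_mm() above folds the new base pointer, the user base pointer and any LDT change into a single HYPERVISOR_mmuext_op() call. A sketch of the same batching pattern for a hypothetical caller that holds a pinned pgd; the helper name and the trailing local TLB flush are illustrative:

static void mmuext_batch_example(pgd_t *pgd)
{
        struct mmuext_op op[2];

        /* Queue a new kernel base pointer plus a local TLB flush and
         * submit both in one hypercall, as switch_mm() does above. */
        op[0].cmd = MMUEXT_NEW_BASEPTR;
        op[0].arg1.mfn = pfn_to_mfn(__pa(pgd) >> PAGE_SHIFT);
        op[1].cmd = MMUEXT_TLB_FLUSH_LOCAL;

        BUG_ON(HYPERVISOR_mmuext_op(op, 2, NULL, DOMID_SELF));
}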
95755 diff -Nur linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/msr.h linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/msr.h
95756 --- linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/msr.h 1970-01-01 00:00:00.000000000 +0000
95757 +++ linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/msr.h 2007-01-08 15:00:46.000000000 +0000
95758 @@ -0,0 +1,399 @@
95759 +#ifndef X86_64_MSR_H
95760 +#define X86_64_MSR_H 1
95761 +
95762 +#ifndef __ASSEMBLY__
95763 +/*
95764 + * Access to model-specific registers (MSRs; available on 586 and better only).
95765 + * Note: the rd* operations modify the parameters directly (without using
95766 + * pointer indirection); this allows gcc to optimize better.
95767 + */
95768 +
95769 +#define rdmsr(msr,val1,val2) \
95770 + __asm__ __volatile__("rdmsr" \
95771 + : "=a" (val1), "=d" (val2) \
95772 + : "c" (msr))
95773 +
95774 +
95775 +#define rdmsrl(msr,val) do { unsigned long a__,b__; \
95776 + __asm__ __volatile__("rdmsr" \
95777 + : "=a" (a__), "=d" (b__) \
95778 + : "c" (msr)); \
95779 + val = a__ | (b__<<32); \
95780 +} while(0)
95781 +
95782 +#define wrmsr(msr,val1,val2) \
95783 + __asm__ __volatile__("wrmsr" \
95784 + : /* no outputs */ \
95785 + : "c" (msr), "a" (val1), "d" (val2))
95786 +
95787 +#define wrmsrl(msr,val) wrmsr(msr,(__u32)((__u64)(val)),((__u64)(val))>>32)
95788 +
95789 +/* wrmsr with exception handling */
95790 +#define wrmsr_safe(msr,a,b) ({ int ret__; \
95791 + asm volatile("2: wrmsr ; xorl %0,%0\n" \
95792 + "1:\n\t" \
95793 + ".section .fixup,\"ax\"\n\t" \
95794 + "3: movl %4,%0 ; jmp 1b\n\t" \
95795 + ".previous\n\t" \
95796 + ".section __ex_table,\"a\"\n" \
95797 + " .align 8\n\t" \
95798 + " .quad 2b,3b\n\t" \
95799 + ".previous" \
95800 + : "=a" (ret__) \
95801 + : "c" (msr), "0" (a), "d" (b), "i" (-EFAULT)); \
95802 + ret__; })
95803 +
95804 +#define checking_wrmsrl(msr,val) wrmsr_safe(msr,(u32)(val),(u32)((val)>>32))
95805 +
95806 +#define rdmsr_safe(msr,a,b) \
95807 + ({ int ret__; \
95808 + asm volatile ("1: rdmsr\n" \
95809 + "2:\n" \
95810 + ".section .fixup,\"ax\"\n" \
95811 + "3: movl %4,%0\n" \
95812 + " jmp 2b\n" \
95813 + ".previous\n" \
95814 + ".section __ex_table,\"a\"\n" \
95815 + " .align 8\n" \
95816 + " .quad 1b,3b\n" \
95817 + ".previous":"=&bDS" (ret__), "=a"(*(a)), "=d"(*(b))\
95818 + :"c"(msr), "i"(-EIO), "0"(0)); \
95819 + ret__; })
95820 +
95821 +#define rdtsc(low,high) \
95822 + __asm__ __volatile__("rdtsc" : "=a" (low), "=d" (high))
95823 +
95824 +#define rdtscl(low) \
95825 + __asm__ __volatile__ ("rdtsc" : "=a" (low) : : "edx")
95826 +
95827 +#define rdtscll(val) do { \
95828 + unsigned int __a,__d; \
95829 + asm volatile("rdtsc" : "=a" (__a), "=d" (__d)); \
95830 + (val) = ((unsigned long)__a) | (((unsigned long)__d)<<32); \
95831 +} while(0)
95832 +
95833 +#define write_tsc(val1,val2) wrmsr(0x10, val1, val2)
95834 +
95835 +#define rdpmc(counter,low,high) \
95836 + __asm__ __volatile__("rdpmc" \
95837 + : "=a" (low), "=d" (high) \
95838 + : "c" (counter))
95839 +
95840 +static inline void cpuid(int op, unsigned int *eax, unsigned int *ebx,
95841 + unsigned int *ecx, unsigned int *edx)
95842 +{
95843 + __asm__(XEN_CPUID
95844 + : "=a" (*eax),
95845 + "=b" (*ebx),
95846 + "=c" (*ecx),
95847 + "=d" (*edx)
95848 + : "0" (op));
95849 +}
95850 +
95851 +/* Some CPUID calls want 'count' to be placed in ecx */
95852 +static inline void cpuid_count(int op, int count, int *eax, int *ebx, int *ecx,
95853 + int *edx)
95854 +{
95855 + __asm__(XEN_CPUID
95856 + : "=a" (*eax),
95857 + "=b" (*ebx),
95858 + "=c" (*ecx),
95859 + "=d" (*edx)
95860 + : "0" (op), "c" (count));
95861 +}
95862 +
95863 +/*
95864 + * CPUID functions returning a single datum
95865 + */
95866 +static inline unsigned int cpuid_eax(unsigned int op)
95867 +{
95868 + unsigned int eax;
95869 +
95870 + __asm__(XEN_CPUID
95871 + : "=a" (eax)
95872 + : "0" (op)
95873 + : "bx", "cx", "dx");
95874 + return eax;
95875 +}
95876 +static inline unsigned int cpuid_ebx(unsigned int op)
95877 +{
95878 + unsigned int eax, ebx;
95879 +
95880 + __asm__(XEN_CPUID
95881 + : "=a" (eax), "=b" (ebx)
95882 + : "0" (op)
95883 + : "cx", "dx" );
95884 + return ebx;
95885 +}
95886 +static inline unsigned int cpuid_ecx(unsigned int op)
95887 +{
95888 + unsigned int eax, ecx;
95889 +
95890 + __asm__(XEN_CPUID
95891 + : "=a" (eax), "=c" (ecx)
95892 + : "0" (op)
95893 + : "bx", "dx" );
95894 + return ecx;
95895 +}
95896 +static inline unsigned int cpuid_edx(unsigned int op)
95897 +{
95898 + unsigned int eax, edx;
95899 +
95900 + __asm__(XEN_CPUID
95901 + : "=a" (eax), "=d" (edx)
95902 + : "0" (op)
95903 + : "bx", "cx");
95904 + return edx;
95905 +}
95906 +
95907 +#define MSR_IA32_UCODE_WRITE 0x79
95908 +#define MSR_IA32_UCODE_REV 0x8b
95909 +
95910 +
95911 +#endif
95912 +
95913 +/* AMD/K8 specific MSRs */
95914 +#define MSR_EFER 0xc0000080 /* extended feature register */
95915 +#define MSR_STAR 0xc0000081 /* legacy mode SYSCALL target */
95916 +#define MSR_LSTAR 0xc0000082 /* long mode SYSCALL target */
95917 +#define MSR_CSTAR 0xc0000083 /* compatibility mode SYSCALL target */
95918 +#define MSR_SYSCALL_MASK 0xc0000084 /* EFLAGS mask for syscall */
95919 +#define MSR_FS_BASE 0xc0000100 /* 64bit FS base */
95920 +#define MSR_GS_BASE 0xc0000101 /* 64bit GS base */
95921 +#define MSR_KERNEL_GS_BASE 0xc0000102 /* SwapGS GS shadow (or USER_GS from kernel) */
95922 +/* EFER bits: */
95923 +#define _EFER_SCE 0 /* SYSCALL/SYSRET */
95924 +#define _EFER_LME 8 /* Long mode enable */
95925 +#define _EFER_LMA 10 /* Long mode active (read-only) */
95926 +#define _EFER_NX 11 /* No execute enable */
95927 +
95928 +#define EFER_SCE (1<<_EFER_SCE)
95929 +#define EFER_LME (1<<_EFER_LME)
95930 +#define EFER_LMA (1<<_EFER_LMA)
95931 +#define EFER_NX (1<<_EFER_NX)
95932 +
95933 +/* Intel MSRs. Some also available on other CPUs */
95934 +#define MSR_IA32_TSC 0x10
95935 +#define MSR_IA32_PLATFORM_ID 0x17
95936 +
95937 +#define MSR_IA32_PERFCTR0 0xc1
95938 +#define MSR_IA32_PERFCTR1 0xc2
95939 +
95940 +#define MSR_MTRRcap 0x0fe
95941 +#define MSR_IA32_BBL_CR_CTL 0x119
95942 +
95943 +#define MSR_IA32_SYSENTER_CS 0x174
95944 +#define MSR_IA32_SYSENTER_ESP 0x175
95945 +#define MSR_IA32_SYSENTER_EIP 0x176
95946 +
95947 +#define MSR_IA32_MCG_CAP 0x179
95948 +#define MSR_IA32_MCG_STATUS 0x17a
95949 +#define MSR_IA32_MCG_CTL 0x17b
95950 +
95951 +#define MSR_IA32_EVNTSEL0 0x186
95952 +#define MSR_IA32_EVNTSEL1 0x187
95953 +
95954 +#define MSR_IA32_DEBUGCTLMSR 0x1d9
95955 +#define MSR_IA32_LASTBRANCHFROMIP 0x1db
95956 +#define MSR_IA32_LASTBRANCHTOIP 0x1dc
95957 +#define MSR_IA32_LASTINTFROMIP 0x1dd
95958 +#define MSR_IA32_LASTINTTOIP 0x1de
95959 +
95960 +#define MSR_MTRRfix64K_00000 0x250
95961 +#define MSR_MTRRfix16K_80000 0x258
95962 +#define MSR_MTRRfix16K_A0000 0x259
95963 +#define MSR_MTRRfix4K_C0000 0x268
95964 +#define MSR_MTRRfix4K_C8000 0x269
95965 +#define MSR_MTRRfix4K_D0000 0x26a
95966 +#define MSR_MTRRfix4K_D8000 0x26b
95967 +#define MSR_MTRRfix4K_E0000 0x26c
95968 +#define MSR_MTRRfix4K_E8000 0x26d
95969 +#define MSR_MTRRfix4K_F0000 0x26e
95970 +#define MSR_MTRRfix4K_F8000 0x26f
95971 +#define MSR_MTRRdefType 0x2ff
95972 +
95973 +#define MSR_IA32_MC0_CTL 0x400
95974 +#define MSR_IA32_MC0_STATUS 0x401
95975 +#define MSR_IA32_MC0_ADDR 0x402
95976 +#define MSR_IA32_MC0_MISC 0x403
95977 +
95978 +#define MSR_P6_PERFCTR0 0xc1
95979 +#define MSR_P6_PERFCTR1 0xc2
95980 +#define MSR_P6_EVNTSEL0 0x186
95981 +#define MSR_P6_EVNTSEL1 0x187
95982 +
95983 +/* K7/K8 MSRs. Not complete. See the architecture manual for a more complete list. */
95984 +#define MSR_K7_EVNTSEL0 0xC0010000
95985 +#define MSR_K7_PERFCTR0 0xC0010004
95986 +#define MSR_K7_EVNTSEL1 0xC0010001
95987 +#define MSR_K7_PERFCTR1 0xC0010005
95988 +#define MSR_K7_EVNTSEL2 0xC0010002
95989 +#define MSR_K7_PERFCTR2 0xC0010006
95990 +#define MSR_K7_EVNTSEL3 0xC0010003
95991 +#define MSR_K7_PERFCTR3 0xC0010007
95992 +#define MSR_K8_TOP_MEM1 0xC001001A
95993 +#define MSR_K8_TOP_MEM2 0xC001001D
95994 +#define MSR_K8_SYSCFG 0xC0010010
95995 +#define MSR_K8_HWCR 0xC0010015
95996 +
95997 +/* K6 MSRs */
95998 +#define MSR_K6_EFER 0xC0000080
95999 +#define MSR_K6_STAR 0xC0000081
96000 +#define MSR_K6_WHCR 0xC0000082
96001 +#define MSR_K6_UWCCR 0xC0000085
96002 +#define MSR_K6_PSOR 0xC0000087
96003 +#define MSR_K6_PFIR 0xC0000088
96004 +
96005 +/* Centaur-Hauls/IDT defined MSRs. */
96006 +#define MSR_IDT_FCR1 0x107
96007 +#define MSR_IDT_FCR2 0x108
96008 +#define MSR_IDT_FCR3 0x109
96009 +#define MSR_IDT_FCR4 0x10a
96010 +
96011 +#define MSR_IDT_MCR0 0x110
96012 +#define MSR_IDT_MCR1 0x111
96013 +#define MSR_IDT_MCR2 0x112
96014 +#define MSR_IDT_MCR3 0x113
96015 +#define MSR_IDT_MCR4 0x114
96016 +#define MSR_IDT_MCR5 0x115
96017 +#define MSR_IDT_MCR6 0x116
96018 +#define MSR_IDT_MCR7 0x117
96019 +#define MSR_IDT_MCR_CTRL 0x120
96020 +
96021 +/* VIA Cyrix defined MSRs*/
96022 +#define MSR_VIA_FCR 0x1107
96023 +#define MSR_VIA_LONGHAUL 0x110a
96024 +#define MSR_VIA_RNG 0x110b
96025 +#define MSR_VIA_BCR2 0x1147
96026 +
96027 +/* Intel defined MSRs. */
96028 +#define MSR_IA32_P5_MC_ADDR 0
96029 +#define MSR_IA32_P5_MC_TYPE 1
96030 +#define MSR_IA32_PLATFORM_ID 0x17
96031 +#define MSR_IA32_EBL_CR_POWERON 0x2a
96032 +
96033 +#define MSR_IA32_APICBASE 0x1b
96034 +#define MSR_IA32_APICBASE_BSP (1<<8)
96035 +#define MSR_IA32_APICBASE_ENABLE (1<<11)
96036 +#define MSR_IA32_APICBASE_BASE (0xfffff<<12)
96037 +
96038 +/* P4/Xeon+ specific */
96039 +#define MSR_IA32_MCG_EAX 0x180
96040 +#define MSR_IA32_MCG_EBX 0x181
96041 +#define MSR_IA32_MCG_ECX 0x182
96042 +#define MSR_IA32_MCG_EDX 0x183
96043 +#define MSR_IA32_MCG_ESI 0x184
96044 +#define MSR_IA32_MCG_EDI 0x185
96045 +#define MSR_IA32_MCG_EBP 0x186
96046 +#define MSR_IA32_MCG_ESP 0x187
96047 +#define MSR_IA32_MCG_EFLAGS 0x188
96048 +#define MSR_IA32_MCG_EIP 0x189
96049 +#define MSR_IA32_MCG_RESERVED 0x18A
96050 +
96051 +#define MSR_P6_EVNTSEL0 0x186
96052 +#define MSR_P6_EVNTSEL1 0x187
96053 +
96054 +#define MSR_IA32_PERF_STATUS 0x198
96055 +#define MSR_IA32_PERF_CTL 0x199
96056 +
96057 +#define MSR_IA32_THERM_CONTROL 0x19a
96058 +#define MSR_IA32_THERM_INTERRUPT 0x19b
96059 +#define MSR_IA32_THERM_STATUS 0x19c
96060 +#define MSR_IA32_MISC_ENABLE 0x1a0
96061 +
96062 +#define MSR_IA32_DEBUGCTLMSR 0x1d9
96063 +#define MSR_IA32_LASTBRANCHFROMIP 0x1db
96064 +#define MSR_IA32_LASTBRANCHTOIP 0x1dc
96065 +#define MSR_IA32_LASTINTFROMIP 0x1dd
96066 +#define MSR_IA32_LASTINTTOIP 0x1de
96067 +
96068 +#define MSR_IA32_MC0_CTL 0x400
96069 +#define MSR_IA32_MC0_STATUS 0x401
96070 +#define MSR_IA32_MC0_ADDR 0x402
96071 +#define MSR_IA32_MC0_MISC 0x403
96072 +
96073 +/* Pentium IV performance counter MSRs */
96074 +#define MSR_P4_BPU_PERFCTR0 0x300
96075 +#define MSR_P4_BPU_PERFCTR1 0x301
96076 +#define MSR_P4_BPU_PERFCTR2 0x302
96077 +#define MSR_P4_BPU_PERFCTR3 0x303
96078 +#define MSR_P4_MS_PERFCTR0 0x304
96079 +#define MSR_P4_MS_PERFCTR1 0x305
96080 +#define MSR_P4_MS_PERFCTR2 0x306
96081 +#define MSR_P4_MS_PERFCTR3 0x307
96082 +#define MSR_P4_FLAME_PERFCTR0 0x308
96083 +#define MSR_P4_FLAME_PERFCTR1 0x309
96084 +#define MSR_P4_FLAME_PERFCTR2 0x30a
96085 +#define MSR_P4_FLAME_PERFCTR3 0x30b
96086 +#define MSR_P4_IQ_PERFCTR0 0x30c
96087 +#define MSR_P4_IQ_PERFCTR1 0x30d
96088 +#define MSR_P4_IQ_PERFCTR2 0x30e
96089 +#define MSR_P4_IQ_PERFCTR3 0x30f
96090 +#define MSR_P4_IQ_PERFCTR4 0x310
96091 +#define MSR_P4_IQ_PERFCTR5 0x311
96092 +#define MSR_P4_BPU_CCCR0 0x360
96093 +#define MSR_P4_BPU_CCCR1 0x361
96094 +#define MSR_P4_BPU_CCCR2 0x362
96095 +#define MSR_P4_BPU_CCCR3 0x363
96096 +#define MSR_P4_MS_CCCR0 0x364
96097 +#define MSR_P4_MS_CCCR1 0x365
96098 +#define MSR_P4_MS_CCCR2 0x366
96099 +#define MSR_P4_MS_CCCR3 0x367
96100 +#define MSR_P4_FLAME_CCCR0 0x368
96101 +#define MSR_P4_FLAME_CCCR1 0x369
96102 +#define MSR_P4_FLAME_CCCR2 0x36a
96103 +#define MSR_P4_FLAME_CCCR3 0x36b
96104 +#define MSR_P4_IQ_CCCR0 0x36c
96105 +#define MSR_P4_IQ_CCCR1 0x36d
96106 +#define MSR_P4_IQ_CCCR2 0x36e
96107 +#define MSR_P4_IQ_CCCR3 0x36f
96108 +#define MSR_P4_IQ_CCCR4 0x370
96109 +#define MSR_P4_IQ_CCCR5 0x371
96110 +#define MSR_P4_ALF_ESCR0 0x3ca
96111 +#define MSR_P4_ALF_ESCR1 0x3cb
96112 +#define MSR_P4_BPU_ESCR0 0x3b2
96113 +#define MSR_P4_BPU_ESCR1 0x3b3
96114 +#define MSR_P4_BSU_ESCR0 0x3a0
96115 +#define MSR_P4_BSU_ESCR1 0x3a1
96116 +#define MSR_P4_CRU_ESCR0 0x3b8
96117 +#define MSR_P4_CRU_ESCR1 0x3b9
96118 +#define MSR_P4_CRU_ESCR2 0x3cc
96119 +#define MSR_P4_CRU_ESCR3 0x3cd
96120 +#define MSR_P4_CRU_ESCR4 0x3e0
96121 +#define MSR_P4_CRU_ESCR5 0x3e1
96122 +#define MSR_P4_DAC_ESCR0 0x3a8
96123 +#define MSR_P4_DAC_ESCR1 0x3a9
96124 +#define MSR_P4_FIRM_ESCR0 0x3a4
96125 +#define MSR_P4_FIRM_ESCR1 0x3a5
96126 +#define MSR_P4_FLAME_ESCR0 0x3a6
96127 +#define MSR_P4_FLAME_ESCR1 0x3a7
96128 +#define MSR_P4_FSB_ESCR0 0x3a2
96129 +#define MSR_P4_FSB_ESCR1 0x3a3
96130 +#define MSR_P4_IQ_ESCR0 0x3ba
96131 +#define MSR_P4_IQ_ESCR1 0x3bb
96132 +#define MSR_P4_IS_ESCR0 0x3b4
96133 +#define MSR_P4_IS_ESCR1 0x3b5
96134 +#define MSR_P4_ITLB_ESCR0 0x3b6
96135 +#define MSR_P4_ITLB_ESCR1 0x3b7
96136 +#define MSR_P4_IX_ESCR0 0x3c8
96137 +#define MSR_P4_IX_ESCR1 0x3c9
96138 +#define MSR_P4_MOB_ESCR0 0x3aa
96139 +#define MSR_P4_MOB_ESCR1 0x3ab
96140 +#define MSR_P4_MS_ESCR0 0x3c0
96141 +#define MSR_P4_MS_ESCR1 0x3c1
96142 +#define MSR_P4_PMH_ESCR0 0x3ac
96143 +#define MSR_P4_PMH_ESCR1 0x3ad
96144 +#define MSR_P4_RAT_ESCR0 0x3bc
96145 +#define MSR_P4_RAT_ESCR1 0x3bd
96146 +#define MSR_P4_SAAT_ESCR0 0x3ae
96147 +#define MSR_P4_SAAT_ESCR1 0x3af
96148 +#define MSR_P4_SSU_ESCR0 0x3be
96149 +#define MSR_P4_SSU_ESCR1 0x3bf /* guess: not defined in manual */
96150 +#define MSR_P4_TBPU_ESCR0 0x3c2
96151 +#define MSR_P4_TBPU_ESCR1 0x3c3
96152 +#define MSR_P4_TC_ESCR0 0x3c4
96153 +#define MSR_P4_TC_ESCR1 0x3c5
96154 +#define MSR_P4_U2L_ESCR0 0x3b0
96155 +#define MSR_P4_U2L_ESCR1 0x3b1
96156 +
96157 +#endif
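The rdmsr()/rdmsrl() macros above return their results by writing the supplied lvalues directly rather than through pointers. A small usage sketch, assuming the hypervisor lets the guest read EFER; the function name is hypothetical:

static void msr_read_example(void)
{
        unsigned int lo, hi;
        unsigned long efer;

        rdmsr(MSR_EFER, lo, hi);      /* 64-bit result split into two halves */
        rdmsrl(MSR_EFER, efer);       /* same MSR read into one variable     */

        /* The two forms agree: rdmsrl() just recombines the halves. */
        BUG_ON(efer != (((unsigned long)hi << 32) | lo));
}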
96158 diff -Nur linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/nmi.h linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/nmi.h
96159 --- linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/nmi.h 1970-01-01 00:00:00.000000000 +0000
96160 +++ linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/nmi.h 2007-01-08 15:00:46.000000000 +0000
96161 @@ -0,0 +1,75 @@
96162 +/*
96163 + * linux/include/asm-i386/nmi.h
96164 + */
96165 +#ifndef ASM_NMI_H
96166 +#define ASM_NMI_H
96167 +
96168 +#include <linux/pm.h>
96169 +
96170 +#include <xen/interface/nmi.h>
96171 +
96172 +struct pt_regs;
96173 +
96174 +typedef int (*nmi_callback_t)(struct pt_regs * regs, int cpu);
96175 +
96176 +/**
96177 + * set_nmi_callback
96178 + *
96179 + * Set a handler for an NMI. Only one handler may be
96180 + * set. Return 1 if the NMI was handled.
96181 + */
96182 +void set_nmi_callback(nmi_callback_t callback);
96183 +
96184 +/**
96185 + * unset_nmi_callback
96186 + *
96187 + * Remove the handler previously set.
96188 + */
96189 +void unset_nmi_callback(void);
96190 +
96191 +#ifdef CONFIG_PM
96192 +
96193 +/** Replace the PM callback routine for NMI. */
96194 +struct pm_dev * set_nmi_pm_callback(pm_callback callback);
96195 +
96196 +/** Unset the PM callback routine back to the default. */
96197 +void unset_nmi_pm_callback(struct pm_dev * dev);
96198 +
96199 +#else
96200 +
96201 +static inline struct pm_dev * set_nmi_pm_callback(pm_callback callback)
96202 +{
96203 + return 0;
96204 +}
96205 +
96206 +static inline void unset_nmi_pm_callback(struct pm_dev * dev)
96207 +{
96208 +}
96209 +
96210 +#endif /* CONFIG_PM */
96211 +
96212 +extern void default_do_nmi(struct pt_regs *);
96213 +extern void die_nmi(char *str, struct pt_regs *regs);
96214 +
96215 +static inline unsigned char get_nmi_reason(void)
96216 +{
96217 + shared_info_t *s = HYPERVISOR_shared_info;
96218 + unsigned char reason = 0;
96219 +
96220 + /* construct a value which looks like it came from
96221 + * port 0x61.
96222 + */
96223 + if (test_bit(_XEN_NMIREASON_io_error, &s->arch.nmi_reason))
96224 + reason |= 0x40;
96225 + if (test_bit(_XEN_NMIREASON_parity_error, &s->arch.nmi_reason))
96226 + reason |= 0x80;
96227 +
96228 + return reason;
96229 +}
96230 +
96231 +extern int panic_on_timeout;
96232 +extern int unknown_nmi_panic;
96233 +
96234 +extern int check_nmi_watchdog(void);
96235 +
96236 +#endif /* ASM_NMI_H */
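get_nmi_reason() above packs the Xen-reported NMI reasons into the bit layout of ISA port 0x61, so handlers can keep testing the traditional bits. A hedged sketch of such a caller; the function name is hypothetical:

static void nmi_reason_example(void)
{
        unsigned char reason = get_nmi_reason();

        /* Same bit layout as port 0x61: bit 7 = parity, bit 6 = I/O check. */
        if (reason & 0x80)
                printk(KERN_EMERG "NMI: memory parity error\n");
        if (reason & 0x40)
                printk(KERN_EMERG "NMI: I/O check error\n");
}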
96237 diff -Nur linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/page.h linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/page.h
96238 --- linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/page.h 1970-01-01 00:00:00.000000000 +0000
96239 +++ linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/page.h 2007-01-08 15:00:46.000000000 +0000
96240 @@ -0,0 +1,211 @@
96241 +#ifndef _X86_64_PAGE_H
96242 +#define _X86_64_PAGE_H
96243 +
96244 +#include <linux/config.h>
96245 +/* #include <linux/string.h> */
96246 +#ifndef __ASSEMBLY__
96247 +#include <linux/kernel.h>
96248 +#include <linux/types.h>
96249 +#include <asm/bug.h>
96250 +#endif
96251 +#include <xen/interface/xen.h>
96252 +#include <xen/foreign_page.h>
96253 +
96254 +#define arch_free_page(_page,_order) \
96255 +({ int foreign = PageForeign(_page); \
96256 + if (foreign) \
96257 + (PageForeignDestructor(_page))(_page); \
96258 + foreign; \
96259 +})
96260 +#define HAVE_ARCH_FREE_PAGE
96261 +
96262 +#ifdef CONFIG_XEN_SCRUB_PAGES
96263 +#define scrub_pages(_p,_n) memset((void *)(_p), 0, (_n) << PAGE_SHIFT)
96264 +#else
96265 +#define scrub_pages(_p,_n) ((void)0)
96266 +#endif
96267 +
96268 +/* PAGE_SHIFT determines the page size */
96269 +#define PAGE_SHIFT 12
96270 +#ifdef __ASSEMBLY__
96271 +#define PAGE_SIZE (0x1 << PAGE_SHIFT)
96272 +#else
96273 +#define PAGE_SIZE (1UL << PAGE_SHIFT)
96274 +#endif
96275 +#define PAGE_MASK (~(PAGE_SIZE-1))
96276 +
96277 +/* See Documentation/x86_64/mm.txt for a description of the memory map. */
96278 +#define __PHYSICAL_MASK_SHIFT 46
96279 +#define __PHYSICAL_MASK ((1UL << __PHYSICAL_MASK_SHIFT) - 1)
96280 +#define __VIRTUAL_MASK_SHIFT 48
96281 +#define __VIRTUAL_MASK ((1UL << __VIRTUAL_MASK_SHIFT) - 1)
96282 +
96283 +#define PHYSICAL_PAGE_MASK (~(PAGE_SIZE-1) & __PHYSICAL_MASK)
96284 +
96285 +#define THREAD_ORDER 1
96286 +#define THREAD_SIZE (PAGE_SIZE << THREAD_ORDER)
96287 +#define CURRENT_MASK (~(THREAD_SIZE-1))
96288 +
96289 +#define EXCEPTION_STACK_ORDER 0
96290 +#define EXCEPTION_STKSZ (PAGE_SIZE << EXCEPTION_STACK_ORDER)
96291 +
96292 +#define DEBUG_STACK_ORDER EXCEPTION_STACK_ORDER
96293 +#define DEBUG_STKSZ (PAGE_SIZE << DEBUG_STACK_ORDER)
96294 +
96295 +#define IRQSTACK_ORDER 2
96296 +#define IRQSTACKSIZE (PAGE_SIZE << IRQSTACK_ORDER)
96297 +
96298 +#define STACKFAULT_STACK 1
96299 +#define DOUBLEFAULT_STACK 2
96300 +#define NMI_STACK 3
96301 +#define DEBUG_STACK 4
96302 +#define MCE_STACK 5
96303 +#define N_EXCEPTION_STACKS 5 /* hw limit: 7 */
96304 +
96305 +#define LARGE_PAGE_MASK (~(LARGE_PAGE_SIZE-1))
96306 +#define LARGE_PAGE_SIZE (1UL << PMD_SHIFT)
96307 +
96308 +#define HPAGE_SHIFT PMD_SHIFT
96309 +#define HPAGE_SIZE ((1UL) << HPAGE_SHIFT)
96310 +#define HPAGE_MASK (~(HPAGE_SIZE - 1))
96311 +#define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT)
96312 +
96313 +#ifdef __KERNEL__
96314 +#ifndef __ASSEMBLY__
96315 +
96316 +extern unsigned long end_pfn;
96317 +
96318 +#include <asm/maddr.h>
96319 +
96320 +void clear_page(void *);
96321 +void copy_page(void *, void *);
96322 +
96323 +#define clear_user_page(page, vaddr, pg) clear_page(page)
96324 +#define copy_user_page(to, from, vaddr, pg) copy_page(to, from)
96325 +
96326 +#define alloc_zeroed_user_highpage(vma, vaddr) alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, vaddr)
96327 +#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
96328 +
96329 +/*
96330 + * These are used to make use of C type-checking..
96331 + */
96332 +typedef struct { unsigned long pte; } pte_t;
96333 +typedef struct { unsigned long pmd; } pmd_t;
96334 +typedef struct { unsigned long pud; } pud_t;
96335 +typedef struct { unsigned long pgd; } pgd_t;
96336 +#define PTE_MASK PHYSICAL_PAGE_MASK
96337 +
96338 +typedef struct { unsigned long pgprot; } pgprot_t;
96339 +
96340 +#define pte_val(x) (((x).pte & 1) ? pte_machine_to_phys((x).pte) : \
96341 + (x).pte)
96342 +#define pte_val_ma(x) ((x).pte)
96343 +
96344 +static inline unsigned long pmd_val(pmd_t x)
96345 +{
96346 + unsigned long ret = x.pmd;
96347 + if (ret) ret = pte_machine_to_phys(ret);
96348 + return ret;
96349 +}
96350 +
96351 +static inline unsigned long pud_val(pud_t x)
96352 +{
96353 + unsigned long ret = x.pud;
96354 + if (ret) ret = pte_machine_to_phys(ret);
96355 + return ret;
96356 +}
96357 +
96358 +static inline unsigned long pgd_val(pgd_t x)
96359 +{
96360 + unsigned long ret = x.pgd;
96361 + if (ret) ret = pte_machine_to_phys(ret);
96362 + return ret;
96363 +}
96364 +
96365 +#define pgprot_val(x) ((x).pgprot)
96366 +
96367 +static inline pte_t __pte(unsigned long x)
96368 +{
96369 + if (x & 1) x = phys_to_machine(x);
96370 + return ((pte_t) { (x) });
96371 +}
96372 +
96373 +static inline pmd_t __pmd(unsigned long x)
96374 +{
96375 + if ((x & 1)) x = phys_to_machine(x);
96376 + return ((pmd_t) { (x) });
96377 +}
96378 +
96379 +static inline pud_t __pud(unsigned long x)
96380 +{
96381 + if ((x & 1)) x = phys_to_machine(x);
96382 + return ((pud_t) { (x) });
96383 +}
96384 +
96385 +static inline pgd_t __pgd(unsigned long x)
96386 +{
96387 + if ((x & 1)) x = phys_to_machine(x);
96388 + return ((pgd_t) { (x) });
96389 +}
96390 +
96391 +#define __pgprot(x) ((pgprot_t) { (x) } )
96392 +
96393 +#define __PHYSICAL_START ((unsigned long)CONFIG_PHYSICAL_START)
96394 +#define __START_KERNEL (__START_KERNEL_map + __PHYSICAL_START)
96395 +#define __START_KERNEL_map 0xffffffff80000000UL
96396 +#define __PAGE_OFFSET 0xffff880000000000UL
96397 +
96398 +#else
96399 +#define __PHYSICAL_START CONFIG_PHYSICAL_START
96400 +#define __START_KERNEL (__START_KERNEL_map + __PHYSICAL_START)
96401 +#define __START_KERNEL_map 0xffffffff80000000
96402 +#define __PAGE_OFFSET 0xffff880000000000
96403 +#endif /* !__ASSEMBLY__ */
96404 +
96405 +#ifdef CONFIG_XEN_COMPAT_030002
96406 +#undef LOAD_OFFSET
96407 +#define LOAD_OFFSET 0
96408 +#endif /* CONFIG_XEN_COMPAT_030002 */
96409 +
96410 +/* to align the pointer to the (next) page boundary */
96411 +#define PAGE_ALIGN(addr) (((addr)+PAGE_SIZE-1)&PAGE_MASK)
96412 +
96413 +#define KERNEL_TEXT_SIZE (40UL*1024*1024)
96414 +#define KERNEL_TEXT_START 0xffffffff80000000UL
96415 +
96416 +#define PAGE_OFFSET ((unsigned long)__PAGE_OFFSET)
96417 +
96418 +/* Note: __pa(&symbol_visible_to_c) should always be replaced with __pa_symbol.
96419 + Otherwise you risk miscompilation. */
96420 +#define __pa(x) (((unsigned long)(x)>=__START_KERNEL_map)?(unsigned long)(x) - (unsigned long)__START_KERNEL_map:(unsigned long)(x) - PAGE_OFFSET)
96421 +/* __pa_symbol should be used for C visible symbols.
96422 + This seems to be the official gcc blessed way to do such arithmetic. */
96423 +#define __pa_symbol(x) \
96424 + ({unsigned long v; \
96425 + asm("" : "=r" (v) : "0" (x)); \
96426 + __pa(v); })
96427 +
96428 +#define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET))
96429 +#define __boot_va(x) __va(x)
96430 +#define __boot_pa(x) __pa(x)
96431 +#ifdef CONFIG_FLATMEM
96432 +#define pfn_to_page(pfn) (mem_map + (pfn))
96433 +#define page_to_pfn(page) ((unsigned long)((page) - mem_map))
96434 +#define pfn_valid(pfn) ((pfn) < end_pfn)
96435 +#endif
96436 +
96437 +#define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)
96438 +#define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT)
96439 +#define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT)
96440 +
96441 +#define VM_DATA_DEFAULT_FLAGS \
96442 + (((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0 ) | \
96443 + VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
96444 +
96445 +#define __HAVE_ARCH_GATE_AREA 1
96446 +
96447 +#endif /* __KERNEL__ */
96448 +
96449 +#include <asm-generic/page.h>
96450 +
96451 +#endif /* _X86_64_PAGE_H */
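__pte() and pte_val() above translate a present entry between pseudo-physical and machine form, while pte_val_ma() returns the raw machine value. A small round-trip sketch, assuming the frame has a valid p2m/m2p entry and the address fits below __PHYSICAL_MASK; the function name is hypothetical:

static void pte_conversion_example(unsigned long phys)
{
        /* Bit 0 is the present bit: __pte() converts a present entry to
         * machine form, pte_val() converts it back to pseudo-physical. */
        pte_t pte = __pte(phys | 1);

        BUG_ON(pte_val(pte) != (phys | 1));
        BUG_ON(pte_val_ma(pte) != (phys_to_machine(phys) | 1));
}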
96452 diff -Nur linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/pci.h linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/pci.h
96453 --- linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/pci.h 1970-01-01 00:00:00.000000000 +0000
96454 +++ linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/pci.h 2007-01-08 15:00:46.000000000 +0000
96455 @@ -0,0 +1,174 @@
96456 +#ifndef __x8664_PCI_H
96457 +#define __x8664_PCI_H
96458 +
96459 +#include <linux/config.h>
96460 +#include <asm/io.h>
96461 +
96462 +#ifdef __KERNEL__
96463 +
96464 +#include <linux/mm.h> /* for struct page */
96465 +
96466 +/* Can be used to override the logic in pci_scan_bus for skipping
96467 + already-configured bus numbers - to be used for buggy BIOSes
96468 + or architectures with incomplete PCI setup by the loader */
96469 +
96470 +#ifdef CONFIG_PCI
96471 +extern unsigned int pcibios_assign_all_busses(void);
96472 +#else
96473 +#define pcibios_assign_all_busses() 0
96474 +#endif
96475 +#define pcibios_scan_all_fns(a, b) 0
96476 +
96477 +extern unsigned long pci_mem_start;
96478 +#define PCIBIOS_MIN_IO 0x1000
96479 +#define PCIBIOS_MIN_MEM (pci_mem_start)
96480 +
96481 +#define PCIBIOS_MIN_CARDBUS_IO 0x4000
96482 +
96483 +void pcibios_config_init(void);
96484 +struct pci_bus * pcibios_scan_root(int bus);
96485 +extern int (*pci_config_read)(int seg, int bus, int dev, int fn, int reg, int len, u32 *value);
96486 +extern int (*pci_config_write)(int seg, int bus, int dev, int fn, int reg, int len, u32 value);
96487 +
96488 +void pcibios_set_master(struct pci_dev *dev);
96489 +void pcibios_penalize_isa_irq(int irq, int active);
96490 +struct irq_routing_table *pcibios_get_irq_routing_table(void);
96491 +int pcibios_set_irq_routing(struct pci_dev *dev, int pin, int irq);
96492 +
96493 +#include <linux/types.h>
96494 +#include <linux/slab.h>
96495 +#include <asm/scatterlist.h>
96496 +#include <linux/string.h>
96497 +#include <asm/page.h>
96498 +#include <linux/dma-mapping.h> /* for have_iommu */
96499 +
96500 +extern int iommu_setup(char *opt);
96501 +
96502 +/* The PCI address space does equal the physical memory
96503 + * address space. The networking and block device layers use
96504 + * this boolean for bounce buffer decisions
96505 + *
96506 + * On AMD64 it mostly equals, but we set it to zero if a hardware
96507 + * IOMMU (gart) or software IOMMU (swiotlb) is available.
96508 + */
96509 +#define PCI_DMA_BUS_IS_PHYS (dma_ops->is_phys)
96510 +
96511 +#ifdef CONFIG_GART_IOMMU
96512 +
96513 +/*
96514 + * x86-64 always supports DAC, but sometimes it is useful to force
96515 + * devices through the IOMMU to get automatic sg list merging.
96516 + * Optional right now.
96517 + */
96518 +extern int iommu_sac_force;
96519 +#define pci_dac_dma_supported(pci_dev, mask) (!iommu_sac_force)
96520 +
96521 +#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME) \
96522 + dma_addr_t ADDR_NAME;
96523 +#define DECLARE_PCI_UNMAP_LEN(LEN_NAME) \
96524 + __u32 LEN_NAME;
96525 +#define pci_unmap_addr(PTR, ADDR_NAME) \
96526 + ((PTR)->ADDR_NAME)
96527 +#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL) \
96528 + (((PTR)->ADDR_NAME) = (VAL))
96529 +#define pci_unmap_len(PTR, LEN_NAME) \
96530 + ((PTR)->LEN_NAME)
96531 +#define pci_unmap_len_set(PTR, LEN_NAME, VAL) \
96532 + (((PTR)->LEN_NAME) = (VAL))
96533 +
96534 +#elif defined(CONFIG_SWIOTLB)
96535 +
96536 +#define pci_dac_dma_supported(pci_dev, mask) 1
96537 +
96538 +#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME) \
96539 + dma_addr_t ADDR_NAME;
96540 +#define DECLARE_PCI_UNMAP_LEN(LEN_NAME) \
96541 + __u32 LEN_NAME;
96542 +#define pci_unmap_addr(PTR, ADDR_NAME) \
96543 + ((PTR)->ADDR_NAME)
96544 +#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL) \
96545 + (((PTR)->ADDR_NAME) = (VAL))
96546 +#define pci_unmap_len(PTR, LEN_NAME) \
96547 + ((PTR)->LEN_NAME)
96548 +#define pci_unmap_len_set(PTR, LEN_NAME, VAL) \
96549 + (((PTR)->LEN_NAME) = (VAL))
96550 +
96551 +#else
96552 +/* No IOMMU */
96553 +
96554 +#define pci_dac_dma_supported(pci_dev, mask) 1
96555 +
96556 +#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME)
96557 +#define DECLARE_PCI_UNMAP_LEN(LEN_NAME)
96558 +#define pci_unmap_addr(PTR, ADDR_NAME) (0)
96559 +#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL) do { } while (0)
96560 +#define pci_unmap_len(PTR, LEN_NAME) (0)
96561 +#define pci_unmap_len_set(PTR, LEN_NAME, VAL) do { } while (0)
96562 +
96563 +#endif
96564 +
96565 +#include <asm-generic/pci-dma-compat.h>
96566 +
96567 +static inline dma64_addr_t
96568 +pci_dac_page_to_dma(struct pci_dev *pdev, struct page *page, unsigned long offset, int direction)
96569 +{
96570 + return ((dma64_addr_t) page_to_phys(page) +
96571 + (dma64_addr_t) offset);
96572 +}
96573 +
96574 +static inline struct page *
96575 +pci_dac_dma_to_page(struct pci_dev *pdev, dma64_addr_t dma_addr)
96576 +{
96577 + return virt_to_page(__va(dma_addr));
96578 +}
96579 +
96580 +static inline unsigned long
96581 +pci_dac_dma_to_offset(struct pci_dev *pdev, dma64_addr_t dma_addr)
96582 +{
96583 + return (dma_addr & ~PAGE_MASK);
96584 +}
96585 +
96586 +static inline void
96587 +pci_dac_dma_sync_single_for_cpu(struct pci_dev *pdev, dma64_addr_t dma_addr, size_t len, int direction)
96588 +{
96589 +}
96590 +
96591 +static inline void
96592 +pci_dac_dma_sync_single_for_device(struct pci_dev *pdev, dma64_addr_t dma_addr, size_t len, int direction)
96593 +{
96594 + flush_write_buffers();
96595 +}
96596 +
96597 +#ifdef CONFIG_PCI
96598 +static inline void pci_dma_burst_advice(struct pci_dev *pdev,
96599 + enum pci_dma_burst_strategy *strat,
96600 + unsigned long *strategy_parameter)
96601 +{
96602 + *strat = PCI_DMA_BURST_INFINITY;
96603 + *strategy_parameter = ~0UL;
96604 +}
96605 +#endif
96606 +
96607 +#define HAVE_PCI_MMAP
96608 +extern int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
96609 + enum pci_mmap_state mmap_state, int write_combine);
96610 +
96611 +static inline void pcibios_add_platform_entries(struct pci_dev *dev)
96612 +{
96613 +}
96614 +
96615 +#endif /* __KERNEL__ */
96616 +
96617 +/* generic pci stuff */
96618 +#ifdef CONFIG_PCI
96619 +#include <asm-generic/pci.h>
96620 +#endif
96621 +
96622 +/* On Xen we have to scan all functions since Xen hides bridges from
96623 + * us. If a bridge is at fn=0 and that slot has a multifunction
96624 + * device, we won't find the additional devices without scanning all
96625 + * functions. */
96626 +#undef pcibios_scan_all_fns
96627 +#define pcibios_scan_all_fns(a, b) 1
96628 +
96629 +#endif /* __x8664_PCI_H */
96630 diff -Nur linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/pgalloc.h linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/pgalloc.h
96631 --- linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/pgalloc.h 1970-01-01 00:00:00.000000000 +0000
96632 +++ linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/pgalloc.h 2007-01-08 15:00:46.000000000 +0000
96633 @@ -0,0 +1,198 @@
96634 +#ifndef _X86_64_PGALLOC_H
96635 +#define _X86_64_PGALLOC_H
96636 +
96637 +#include <asm/fixmap.h>
96638 +#include <asm/pda.h>
96639 +#include <linux/threads.h>
96640 +#include <linux/mm.h>
96641 +#include <asm/io.h> /* for phys_to_virt and page_to_pseudophys */
96642 +
96643 +#include <xen/features.h>
96644 +void make_page_readonly(void *va, unsigned int feature);
96645 +void make_page_writable(void *va, unsigned int feature);
96646 +void make_pages_readonly(void *va, unsigned int nr, unsigned int feature);
96647 +void make_pages_writable(void *va, unsigned int nr, unsigned int feature);
96648 +
96649 +#define __user_pgd(pgd) ((pgd) + PTRS_PER_PGD)
96650 +
96651 +static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, pte_t *pte)
96652 +{
96653 + set_pmd(pmd, __pmd(_PAGE_TABLE | __pa(pte)));
96654 +}
96655 +
96656 +static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, struct page *pte)
96657 +{
96658 + if (unlikely((mm)->context.pinned)) {
96659 + BUG_ON(HYPERVISOR_update_va_mapping(
96660 + (unsigned long)__va(page_to_pfn(pte) << PAGE_SHIFT),
96661 + pfn_pte(page_to_pfn(pte), PAGE_KERNEL_RO), 0));
96662 + set_pmd(pmd, __pmd(_PAGE_TABLE | (page_to_pfn(pte) << PAGE_SHIFT)));
96663 + } else {
96664 + *(pmd) = __pmd(_PAGE_TABLE | (page_to_pfn(pte) << PAGE_SHIFT));
96665 + }
96666 +}
96667 +
96668 +static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
96669 +{
96670 + if (unlikely((mm)->context.pinned)) {
96671 + BUG_ON(HYPERVISOR_update_va_mapping(
96672 + (unsigned long)pmd,
96673 + pfn_pte(virt_to_phys(pmd)>>PAGE_SHIFT,
96674 + PAGE_KERNEL_RO), 0));
96675 + set_pud(pud, __pud(_PAGE_TABLE | __pa(pmd)));
96676 + } else {
96677 + *(pud) = __pud(_PAGE_TABLE | __pa(pmd));
96678 + }
96679 +}
96680 +
96681 +/*
96682 + * We need to use the batch mode here, but pgd_populate() won't be
96683 + * called frequently.
96684 + */
96685 +static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud)
96686 +{
96687 + if (unlikely((mm)->context.pinned)) {
96688 + BUG_ON(HYPERVISOR_update_va_mapping(
96689 + (unsigned long)pud,
96690 + pfn_pte(virt_to_phys(pud)>>PAGE_SHIFT,
96691 + PAGE_KERNEL_RO), 0));
96692 + set_pgd(pgd, __pgd(_PAGE_TABLE | __pa(pud)));
96693 + set_pgd(__user_pgd(pgd), __pgd(_PAGE_TABLE | __pa(pud)));
96694 + } else {
96695 + *(pgd) = __pgd(_PAGE_TABLE | __pa(pud));
96696 + *(__user_pgd(pgd)) = *(pgd);
96697 + }
96698 +}
96699 +
96700 +static inline void pmd_free(pmd_t *pmd)
96701 +{
96702 + pte_t *ptep = virt_to_ptep(pmd);
96703 +
96704 + if (!pte_write(*ptep)) {
96705 + BUG_ON(HYPERVISOR_update_va_mapping(
96706 + (unsigned long)pmd,
96707 + pfn_pte(virt_to_phys(pmd)>>PAGE_SHIFT, PAGE_KERNEL),
96708 + 0));
96709 + }
96710 + free_page((unsigned long)pmd);
96711 +}
96712 +
96713 +static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
96714 +{
96715 + pmd_t *pmd = (pmd_t *) get_zeroed_page(GFP_KERNEL|__GFP_REPEAT);
96716 + return pmd;
96717 +}
96718 +
96719 +static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
96720 +{
96721 + pud_t *pud = (pud_t *) get_zeroed_page(GFP_KERNEL|__GFP_REPEAT);
96722 + return pud;
96723 +}
96724 +
96725 +static inline void pud_free(pud_t *pud)
96726 +{
96727 + pte_t *ptep = virt_to_ptep(pud);
96728 +
96729 + if (!pte_write(*ptep)) {
96730 + BUG_ON(HYPERVISOR_update_va_mapping(
96731 + (unsigned long)pud,
96732 + pfn_pte(virt_to_phys(pud)>>PAGE_SHIFT, PAGE_KERNEL),
96733 + 0));
96734 + }
96735 + free_page((unsigned long)pud);
96736 +}
96737 +
96738 +static inline pgd_t *pgd_alloc(struct mm_struct *mm)
96739 +{
96740 + /*
96741 + * We allocate two contiguous pages for kernel and user.
96742 + */
96743 + unsigned boundary;
96744 + pgd_t *pgd = (pgd_t *)__get_free_pages(GFP_KERNEL|__GFP_REPEAT, 1);
96745 +
96746 + if (!pgd)
96747 + return NULL;
96748 + /*
96749 + * Copy kernel pointers in from init.
96750 + * Could keep a freelist or slab cache of those because the kernel
96751 + * part never changes.
96752 + */
96753 + boundary = pgd_index(__PAGE_OFFSET);
96754 + memset(pgd, 0, boundary * sizeof(pgd_t));
96755 + memcpy(pgd + boundary,
96756 + init_level4_pgt + boundary,
96757 + (PTRS_PER_PGD - boundary) * sizeof(pgd_t));
96758 +
96759 + memset(__user_pgd(pgd), 0, PAGE_SIZE); /* clean up user pgd */
96760 + /*
96761 + * Set level3_user_pgt for vsyscall area
96762 + */
96763 + set_pgd(__user_pgd(pgd) + pgd_index(VSYSCALL_START),
96764 + mk_kernel_pgd(__pa_symbol(level3_user_pgt)));
96765 + return pgd;
96766 +}
96767 +
96768 +static inline void pgd_free(pgd_t *pgd)
96769 +{
96770 + pte_t *ptep = virt_to_ptep(pgd);
96771 +
96772 + if (!pte_write(*ptep)) {
96773 + xen_pgd_unpin(__pa(pgd));
96774 + BUG_ON(HYPERVISOR_update_va_mapping(
96775 + (unsigned long)pgd,
96776 + pfn_pte(virt_to_phys(pgd)>>PAGE_SHIFT, PAGE_KERNEL),
96777 + 0));
96778 + }
96779 +
96780 + ptep = virt_to_ptep(__user_pgd(pgd));
96781 +
96782 + if (!pte_write(*ptep)) {
96783 + xen_pgd_unpin(__pa(__user_pgd(pgd)));
96784 + BUG_ON(HYPERVISOR_update_va_mapping(
96785 + (unsigned long)__user_pgd(pgd),
96786 + pfn_pte(virt_to_phys(__user_pgd(pgd))>>PAGE_SHIFT,
96787 + PAGE_KERNEL),
96788 + 0));
96789 + }
96790 +
96791 + free_pages((unsigned long)pgd, 1);
96792 +}
96793 +
96794 +static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
96795 +{
96796 + pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT);
96797 + if (pte)
96798 + make_page_readonly(pte, XENFEAT_writable_page_tables);
96799 +
96800 + return pte;
96801 +}
96802 +
96803 +static inline struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
96804 +{
96805 + struct page *pte;
96806 +
96807 + pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
96808 + return pte;
96809 +}
96810 +
96811 +/* Should really implement gc for free page table pages. This could be
96812 + done with a reference count in struct page. */
96813 +
96814 +static inline void pte_free_kernel(pte_t *pte)
96815 +{
96816 + BUG_ON((unsigned long)pte & (PAGE_SIZE-1));
96817 + make_page_writable(pte, XENFEAT_writable_page_tables);
96818 + free_page((unsigned long)pte);
96819 +}
96820 +
96821 +extern void pte_free(struct page *pte);
96822 +
96823 +//#define __pte_free_tlb(tlb,pte) tlb_remove_page((tlb),(pte))
96824 +//#define __pmd_free_tlb(tlb,x) tlb_remove_page((tlb),virt_to_page(x))
96825 +//#define __pud_free_tlb(tlb,x) tlb_remove_page((tlb),virt_to_page(x))
96826 +
96827 +#define __pte_free_tlb(tlb,x) pte_free((x))
96828 +#define __pmd_free_tlb(tlb,x) pmd_free((x))
96829 +#define __pud_free_tlb(tlb,x) pud_free((x))
96830 +
96831 +#endif /* _X86_64_PGALLOC_H */
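pgd_alloc() above takes an order-1 allocation so the kernel pgd and the user pgd occupy adjacent pages, with __user_pgd() selecting the second page. A minimal sketch of that layout, assuming the allocation succeeds; the function name is hypothetical:

static void pgd_layout_example(struct mm_struct *mm)
{
        pgd_t *pgd = pgd_alloc(mm);

        if (!pgd)
                return;
        /* The user pgd is simply the second page of the order-1 block. */
        BUG_ON((char *)__user_pgd(pgd) != (char *)pgd + PAGE_SIZE);
        pgd_free(pgd);
}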
96832 diff -Nur linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/pgtable.h linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/pgtable.h
96833 --- linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/pgtable.h 1970-01-01 00:00:00.000000000 +0000
96834 +++ linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/pgtable.h 2007-01-08 15:00:46.000000000 +0000
96835 @@ -0,0 +1,565 @@
96836 +#ifndef _X86_64_PGTABLE_H
96837 +#define _X86_64_PGTABLE_H
96838 +
96839 +/*
96840 + * This file contains the functions and defines necessary to modify and use
96841 + * the x86-64 page table tree.
96842 + */
96843 +#include <asm/processor.h>
96844 +#include <asm/fixmap.h>
96845 +#include <asm/bitops.h>
96846 +#include <linux/threads.h>
96847 +#include <linux/sched.h>
96848 +#include <asm/pda.h>
96849 +#ifdef CONFIG_XEN
96850 +#include <asm/hypervisor.h>
96851 +
96852 +extern pud_t level3_user_pgt[512];
96853 +extern pud_t init_level4_user_pgt[];
96854 +
96855 +extern void xen_init_pt(void);
96856 +
96857 +#define virt_to_ptep(__va) \
96858 +({ \
96859 + pgd_t *__pgd = pgd_offset_k((unsigned long)(__va)); \
96860 + pud_t *__pud = pud_offset(__pgd, (unsigned long)(__va)); \
96861 + pmd_t *__pmd = pmd_offset(__pud, (unsigned long)(__va)); \
96862 + pte_offset_kernel(__pmd, (unsigned long)(__va)); \
96863 +})
96864 +
96865 +#define arbitrary_virt_to_machine(__va) \
96866 +({ \
96867 + maddr_t m = (maddr_t)pte_mfn(*virt_to_ptep(__va)) << PAGE_SHIFT;\
96868 + m | ((unsigned long)(__va) & (PAGE_SIZE-1)); \
96869 +})
96870 +#endif
96871 +
96872 +extern pud_t level3_kernel_pgt[512];
96873 +extern pud_t level3_physmem_pgt[512];
96874 +extern pud_t level3_ident_pgt[512];
96875 +extern pmd_t level2_kernel_pgt[512];
96876 +extern pgd_t init_level4_pgt[];
96877 +extern pgd_t boot_level4_pgt[];
96878 +extern unsigned long __supported_pte_mask;
96879 +
96880 +#define swapper_pg_dir init_level4_pgt
96881 +
96882 +extern int nonx_setup(char *str);
96883 +extern void paging_init(void);
96884 +extern void clear_kernel_mapping(unsigned long addr, unsigned long size);
96885 +
96886 +extern unsigned long pgkern_mask;
96887 +
96888 +/*
96889 + * ZERO_PAGE is a global shared page that is always zero: used
96890 + * for zero-mapped memory areas etc..
96891 + */
96892 +extern unsigned long empty_zero_page[PAGE_SIZE/sizeof(unsigned long)];
96893 +#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page))
96894 +
96895 +/*
96896 + * PGDIR_SHIFT determines what a top-level page table entry can map
96897 + */
96898 +#define PGDIR_SHIFT 39
96899 +#define PTRS_PER_PGD 512
96900 +
96901 +/*
96902 + * 3rd level page
96903 + */
96904 +#define PUD_SHIFT 30
96905 +#define PTRS_PER_PUD 512
96906 +
96907 +/*
96908 + * PMD_SHIFT determines the size of the area a middle-level
96909 + * page table can map
96910 + */
96911 +#define PMD_SHIFT 21
96912 +#define PTRS_PER_PMD 512
96913 +
96914 +/*
96915 + * entries per page directory level
96916 + */
96917 +#define PTRS_PER_PTE 512
96918 +
96919 +#define pte_ERROR(e) \
96920 + printk("%s:%d: bad pte %p(%016lx).\n", __FILE__, __LINE__, &(e), pte_val(e))
96921 +#define pmd_ERROR(e) \
96922 + printk("%s:%d: bad pmd %p(%016lx).\n", __FILE__, __LINE__, &(e), pmd_val(e))
96923 +#define pud_ERROR(e) \
96924 + printk("%s:%d: bad pud %p(%016lx).\n", __FILE__, __LINE__, &(e), pud_val(e))
96925 +#define pgd_ERROR(e) \
96926 + printk("%s:%d: bad pgd %p(%016lx).\n", __FILE__, __LINE__, &(e), pgd_val(e))
96927 +
96928 +#define pgd_none(x) (!pgd_val(x))
96929 +#define pud_none(x) (!pud_val(x))
96930 +
96931 +#define set_pte_batched(pteptr, pteval) \
96932 + queue_l1_entry_update(pteptr, (pteval))
96933 +
96934 +extern inline int pud_present(pud_t pud) { return !pud_none(pud); }
96935 +
96936 +static inline void set_pte(pte_t *dst, pte_t val)
96937 +{
96938 + *dst = val;
96939 +}
96940 +
96941 +#define set_pmd(pmdptr, pmdval) xen_l2_entry_update(pmdptr, (pmdval))
96942 +#define set_pud(pudptr, pudval) xen_l3_entry_update(pudptr, (pudval))
96943 +#define set_pgd(pgdptr, pgdval) xen_l4_entry_update(pgdptr, (pgdval))
96944 +
96945 +static inline void pud_clear (pud_t * pud)
96946 +{
96947 + set_pud(pud, __pud(0));
96948 +}
96949 +
96950 +#define __user_pgd(pgd) ((pgd) + PTRS_PER_PGD)
96951 +
96952 +static inline void pgd_clear (pgd_t * pgd)
96953 +{
96954 + set_pgd(pgd, __pgd(0));
96955 + set_pgd(__user_pgd(pgd), __pgd(0));
96956 +}
96957 +
96958 +#define pud_page(pud) \
96959 + ((unsigned long) __va(pud_val(pud) & PHYSICAL_PAGE_MASK))
96960 +
96961 +/*
96962 + * A note on implementation of this atomic 'get-and-clear' operation.
96963 + * This is actually very simple because Xen Linux can only run on a single
96964 + * processor. Therefore, we cannot race other processors setting the 'accessed'
96965 + * or 'dirty' bits on a page-table entry.
96966 + * Even if pages are shared between domains, that is not a problem because
96967 + * each domain will have separate page tables, with their own versions of
96968 + * accessed & dirty state.
96969 + */
96970 +#define ptep_get_and_clear(mm,addr,xp) __pte_ma(xchg(&(xp)->pte, 0))
96971 +
96972 +#if 0
96973 +static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *xp)
96974 +{
96975 + pte_t pte = *xp;
96976 + if (pte.pte)
96977 + set_pte(xp, __pte_ma(0));
96978 + return pte;
96979 +}
96980 +#endif
96981 +
96982 +struct mm_struct;
96983 +
96984 +static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, unsigned long addr, pte_t *ptep, int full)
96985 +{
96986 + pte_t pte;
96987 + if (full) {
96988 + pte = *ptep;
96989 + *ptep = __pte(0);
96990 + } else {
96991 + pte = ptep_get_and_clear(mm, addr, ptep);
96992 + }
96993 + return pte;
96994 +}
96995 +
96996 +#define pte_same(a, b) ((a).pte == (b).pte)
96997 +
96998 +#define pte_pgprot(a) (__pgprot((a).pte & ~PHYSICAL_PAGE_MASK))
96999 +
97000 +#define PMD_SIZE (1UL << PMD_SHIFT)
97001 +#define PMD_MASK (~(PMD_SIZE-1))
97002 +#define PUD_SIZE (1UL << PUD_SHIFT)
97003 +#define PUD_MASK (~(PUD_SIZE-1))
97004 +#define PGDIR_SIZE (1UL << PGDIR_SHIFT)
97005 +#define PGDIR_MASK (~(PGDIR_SIZE-1))
97006 +
97007 +#define USER_PTRS_PER_PGD ((TASK_SIZE-1)/PGDIR_SIZE+1)
97008 +#define FIRST_USER_ADDRESS 0
97009 +
97010 +#ifndef __ASSEMBLY__
97011 +#define MAXMEM 0x3fffffffffffUL
97012 +#define VMALLOC_START 0xffffc20000000000UL
97013 +#define VMALLOC_END 0xffffe1ffffffffffUL
97014 +#define MODULES_VADDR 0xffffffff88000000UL
97015 +#define MODULES_END 0xfffffffffff00000UL
97016 +#define MODULES_LEN (MODULES_END - MODULES_VADDR)
97017 +
97018 +#define _PAGE_BIT_PRESENT 0
97019 +#define _PAGE_BIT_RW 1
97020 +#define _PAGE_BIT_USER 2
97021 +#define _PAGE_BIT_PWT 3
97022 +#define _PAGE_BIT_PCD 4
97023 +#define _PAGE_BIT_ACCESSED 5
97024 +#define _PAGE_BIT_DIRTY 6
97025 +#define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page */
97026 +#define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */
97027 +#define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */
97028 +
97029 +#define _PAGE_PRESENT 0x001
97030 +#define _PAGE_RW 0x002
97031 +#define _PAGE_USER 0x004
97032 +#define _PAGE_PWT 0x008
97033 +#define _PAGE_PCD 0x010
97034 +#define _PAGE_ACCESSED 0x020
97035 +#define _PAGE_DIRTY 0x040
97036 +#define _PAGE_PSE 0x080 /* 2MB page */
97037 +#define _PAGE_FILE 0x040 /* nonlinear file mapping, saved PTE; unset:swap */
97038 +#define _PAGE_GLOBAL 0x100 /* Global TLB entry */
97039 +
97040 +#define _PAGE_PROTNONE 0x080 /* If not present */
97041 +#define _PAGE_NX (1UL<<_PAGE_BIT_NX)
97042 +
97043 +#ifdef CONFIG_XEN_COMPAT_030002
97044 +extern unsigned int __kernel_page_user;
97045 +#else
97046 +#define __kernel_page_user 0
97047 +#endif
97048 +
97049 +#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY)
97050 +#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY | __kernel_page_user)
97051 +
97052 +#define _PAGE_CHG_MASK (PTE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY)
97053 +
97054 +#define PAGE_NONE __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED)
97055 +#define PAGE_SHARED __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
97056 +#define PAGE_SHARED_EXEC __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED)
97057 +#define PAGE_COPY_NOEXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
97058 +#define PAGE_COPY PAGE_COPY_NOEXEC
97059 +#define PAGE_COPY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
97060 +#define PAGE_READONLY __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
97061 +#define PAGE_READONLY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
97062 +#define __PAGE_KERNEL \
97063 + (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_NX | __kernel_page_user)
97064 +#define __PAGE_KERNEL_EXEC \
97065 + (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | __kernel_page_user)
97066 +#define __PAGE_KERNEL_NOCACHE \
97067 + (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_PCD | _PAGE_ACCESSED | _PAGE_NX | __kernel_page_user)
97068 +#define __PAGE_KERNEL_RO \
97069 + (_PAGE_PRESENT | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_NX | __kernel_page_user)
97070 +#define __PAGE_KERNEL_VSYSCALL \
97071 + (_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
97072 +#define __PAGE_KERNEL_VSYSCALL_NOCACHE \
97073 + (_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_PCD)
97074 +#define __PAGE_KERNEL_LARGE \
97075 + (__PAGE_KERNEL | _PAGE_PSE)
97076 +#define __PAGE_KERNEL_LARGE_EXEC \
97077 + (__PAGE_KERNEL_EXEC | _PAGE_PSE)
97078 +
97079 +/*
97080 + * We don't support GLOBAL page in xenolinux64
97081 + */
97082 +#define MAKE_GLOBAL(x) __pgprot((x))
97083 +
97084 +#define PAGE_KERNEL MAKE_GLOBAL(__PAGE_KERNEL)
97085 +#define PAGE_KERNEL_EXEC MAKE_GLOBAL(__PAGE_KERNEL_EXEC)
97086 +#define PAGE_KERNEL_RO MAKE_GLOBAL(__PAGE_KERNEL_RO)
97087 +#define PAGE_KERNEL_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_NOCACHE)
97088 +#define PAGE_KERNEL_VSYSCALL32 __pgprot(__PAGE_KERNEL_VSYSCALL)
97089 +#define PAGE_KERNEL_VSYSCALL MAKE_GLOBAL(__PAGE_KERNEL_VSYSCALL)
97090 +#define PAGE_KERNEL_LARGE MAKE_GLOBAL(__PAGE_KERNEL_LARGE)
97091 +#define PAGE_KERNEL_VSYSCALL_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_VSYSCALL_NOCACHE)
97092 +
97093 +/* xwr */
97094 +#define __P000 PAGE_NONE
97095 +#define __P001 PAGE_READONLY
97096 +#define __P010 PAGE_COPY
97097 +#define __P011 PAGE_COPY
97098 +#define __P100 PAGE_READONLY_EXEC
97099 +#define __P101 PAGE_READONLY_EXEC
97100 +#define __P110 PAGE_COPY_EXEC
97101 +#define __P111 PAGE_COPY_EXEC
97102 +
97103 +#define __S000 PAGE_NONE
97104 +#define __S001 PAGE_READONLY
97105 +#define __S010 PAGE_SHARED
97106 +#define __S011 PAGE_SHARED
97107 +#define __S100 PAGE_READONLY_EXEC
97108 +#define __S101 PAGE_READONLY_EXEC
97109 +#define __S110 PAGE_SHARED_EXEC
97110 +#define __S111 PAGE_SHARED_EXEC
97111 +
97112 +static inline unsigned long pgd_bad(pgd_t pgd)
97113 +{
97114 + unsigned long val = pgd_val(pgd);
97115 + val &= ~PTE_MASK;
97116 + val &= ~(_PAGE_USER | _PAGE_DIRTY);
97117 + return val & ~(_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED);
97118 +}
97119 +
97120 +static inline unsigned long pud_bad(pud_t pud)
97121 +{
97122 + unsigned long val = pud_val(pud);
97123 + val &= ~PTE_MASK;
97124 + val &= ~(_PAGE_USER | _PAGE_DIRTY);
97125 + return val & ~(_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED);
97126 +}
97127 +
97128 +#define set_pte_at(_mm,addr,ptep,pteval) do { \
97129 + if (((_mm) != current->mm && (_mm) != &init_mm) || \
97130 + HYPERVISOR_update_va_mapping((addr), (pteval), 0)) \
97131 + set_pte((ptep), (pteval)); \
97132 +} while (0)
97133 +
97134 +#define pte_none(x) (!(x).pte)
97135 +#define pte_present(x) ((x).pte & (_PAGE_PRESENT | _PAGE_PROTNONE))
97136 +#define pte_clear(mm,addr,xp) do { set_pte_at(mm, addr, xp, __pte(0)); } while (0)
97137 +
97138 +#define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT))
97139 +
97140 +#define pte_mfn(_pte) (((_pte).pte & PTE_MASK) >> PAGE_SHIFT)
97141 +#define pte_pfn(_pte) mfn_to_local_pfn(pte_mfn(_pte))
97142 +
97143 +#define pte_page(x) pfn_to_page(pte_pfn(x))
97144 +
97145 +static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot)
97146 +{
97147 + pte_t pte;
97148 +
97149 + (pte).pte = (pfn_to_mfn(page_nr) << PAGE_SHIFT);
97150 + (pte).pte |= pgprot_val(pgprot);
97151 + (pte).pte &= __supported_pte_mask;
97152 + return pte;
97153 +}
97154 +
97155 +/*
97156 + * The following only work if pte_present() is true.
97157 + * Undefined behaviour if not..
97158 + */
97159 +#define __pte_val(x) ((x).pte)
97160 +
97161 +#define __LARGE_PTE (_PAGE_PSE|_PAGE_PRESENT)
97162 +static inline int pte_user(pte_t pte) { return __pte_val(pte) & _PAGE_USER; }
97163 +static inline int pte_read(pte_t pte) { return __pte_val(pte) & _PAGE_USER; }
97164 +static inline int pte_exec(pte_t pte) { return __pte_val(pte) & _PAGE_USER; }
97165 +static inline int pte_dirty(pte_t pte) { return __pte_val(pte) & _PAGE_DIRTY; }
97166 +static inline int pte_young(pte_t pte) { return __pte_val(pte) & _PAGE_ACCESSED; }
97167 +static inline int pte_write(pte_t pte) { return __pte_val(pte) & _PAGE_RW; }
97168 +static inline int pte_file(pte_t pte) { return __pte_val(pte) & _PAGE_FILE; }
97169 +static inline int pte_huge(pte_t pte) { return (__pte_val(pte) & __LARGE_PTE) == __LARGE_PTE; }
97170 +
97171 +static inline pte_t pte_rdprotect(pte_t pte) { __pte_val(pte) &= ~_PAGE_USER; return pte; }
97172 +static inline pte_t pte_exprotect(pte_t pte) { __pte_val(pte) &= ~_PAGE_USER; return pte; }
97173 +static inline pte_t pte_mkclean(pte_t pte) { __pte_val(pte) &= ~_PAGE_DIRTY; return pte; }
97174 +static inline pte_t pte_mkold(pte_t pte) { __pte_val(pte) &= ~_PAGE_ACCESSED; return pte; }
97175 +static inline pte_t pte_wrprotect(pte_t pte) { __pte_val(pte) &= ~_PAGE_RW; return pte; }
97176 +static inline pte_t pte_mkread(pte_t pte) { __pte_val(pte) |= _PAGE_USER; return pte; }
97177 +static inline pte_t pte_mkexec(pte_t pte) { __pte_val(pte) |= _PAGE_USER; return pte; }
97178 +static inline pte_t pte_mkdirty(pte_t pte) { __pte_val(pte) |= _PAGE_DIRTY; return pte; }
97179 +static inline pte_t pte_mkyoung(pte_t pte) { __pte_val(pte) |= _PAGE_ACCESSED; return pte; }
97180 +static inline pte_t pte_mkwrite(pte_t pte) { __pte_val(pte) |= _PAGE_RW; return pte; }
97181 +static inline pte_t pte_mkhuge(pte_t pte) { __pte_val(pte) |= __LARGE_PTE; return pte; }
97182 +
97183 +struct vm_area_struct;
97184 +
97185 +static inline int ptep_test_and_clear_dirty(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep)
97186 +{
97187 + pte_t pte = *ptep;
97188 + int ret = pte_dirty(pte);
97189 + if (ret)
97190 + set_pte(ptep, pte_mkclean(pte));
97191 + return ret;
97192 +}
97193 +
97194 +static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep)
97195 +{
97196 + pte_t pte = *ptep;
97197 + int ret = pte_young(pte);
97198 + if (ret)
97199 + set_pte(ptep, pte_mkold(pte));
97200 + return ret;
97201 +}
97202 +
97203 +static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
97204 +{
97205 + pte_t pte = *ptep;
97206 + if (pte_write(pte))
97207 + set_pte(ptep, pte_wrprotect(pte));
97208 +}
97209 +
97210 +/*
97211 + * Macro to mark a page protection value as "uncacheable".
97212 + */
97213 +#define pgprot_noncached(prot) (__pgprot(pgprot_val(prot) | _PAGE_PCD | _PAGE_PWT))
97214 +
97215 +static inline int pmd_large(pmd_t pte) {
97216 + return (pmd_val(pte) & __LARGE_PTE) == __LARGE_PTE;
97217 +}
97218 +
97219 +
97220 +/*
97221 + * Conversion functions: convert a page and protection to a page entry,
97222 + * and a page entry and page directory to the page they refer to.
97223 + */
97224 +
97225 +/*
97226 + * Level 4 access.
97227 + * Never use these in the common code.
97228 + */
97229 +#define pgd_page(pgd) ((unsigned long) __va(pgd_val(pgd) & PTE_MASK))
97230 +#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD-1))
97231 +#define pgd_offset(mm, addr) ((mm)->pgd + pgd_index(addr))
97232 +#define pgd_offset_k(address) (pgd_t *)(init_level4_pgt + pgd_index(address))
97233 +#define pgd_present(pgd) (pgd_val(pgd) & _PAGE_PRESENT)
97234 +#define mk_kernel_pgd(address) __pgd((address) | _KERNPG_TABLE)
97235 +
97236 +/* PUD - Level3 access */
97237 +/* to find an entry in a page-table-directory. */
97238 +#define pud_index(address) (((address) >> PUD_SHIFT) & (PTRS_PER_PUD-1))
97239 +#define pud_offset(pgd, address) ((pud_t *) pgd_page(*(pgd)) + pud_index(address))
97240 +static inline pud_t *__pud_offset_k(pud_t *pud, unsigned long address)
97241 +{
97242 + return pud + pud_index(address);
97243 +}
97244 +
97245 +/* Find correct pud via the hidden fourth level page level: */
97246 +
97247 +/* This accesses the reference page table of the boot cpu.
97248 + Other CPUs get synced lazily via the page fault handler. */
97249 +static inline pud_t *pud_offset_k(pgd_t *pgd, unsigned long address)
97250 +{
97251 + return pud_offset(pgd_offset_k(address), address);
97252 +}
97253 +
97254 +/* PMD - Level 2 access */
97255 +#define pmd_page_kernel(pmd) ((unsigned long) __va(pmd_val(pmd) & PTE_MASK))
97256 +#define pmd_page(pmd) (pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT))
97257 +
97258 +#define pmd_index(address) (((address) >> PMD_SHIFT) & (PTRS_PER_PMD-1))
97259 +#define pmd_offset(dir, address) ((pmd_t *) pud_page(*(dir)) + \
97260 + pmd_index(address))
97261 +#define pmd_none(x) (!pmd_val(x))
97262 +/* pmd_present doesn't just test the _PAGE_PRESENT bit since wr.p.t.
97263 + can temporarily clear it. */
97264 +#define pmd_present(x) (pmd_val(x))
97265 +#define pmd_clear(xp) do { set_pmd(xp, __pmd(0)); } while (0)
97266 +#define pmd_bad(x) ((pmd_val(x) & ~(PTE_MASK | _PAGE_USER | _PAGE_PRESENT)) \
97267 + != (_KERNPG_TABLE & ~(_PAGE_USER | _PAGE_PRESENT)))
97268 +#define pfn_pmd(nr,prot) (__pmd(((nr) << PAGE_SHIFT) | pgprot_val(prot)))
97269 +#define pmd_pfn(x) ((pmd_val(x) & __PHYSICAL_MASK) >> PAGE_SHIFT)
97270 +
97271 +#define pte_to_pgoff(pte) ((pte_val(pte) & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT)
97272 +#define pgoff_to_pte(off) ((pte_t) { ((off) << PAGE_SHIFT) | _PAGE_FILE })
97273 +#define PTE_FILE_MAX_BITS __PHYSICAL_MASK_SHIFT
97274 +
97275 +/* PTE - Level 1 access. */
97276 +
97277 +/* page, protection -> pte */
97278 +#define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot))
97279 +#define mk_pte_huge(entry) (pte_val(entry) |= _PAGE_PRESENT | _PAGE_PSE)
97280 +
97281 +/* physical address -> PTE */
97282 +static inline pte_t mk_pte_phys(unsigned long physpage, pgprot_t pgprot)
97283 +{
97284 + pte_t pte;
97285 + (pte).pte = physpage | pgprot_val(pgprot);
97286 + return pte;
97287 +}
97288 +
97289 +/* Change flags of a PTE */
97290 +static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
97291 +{
97292 + (pte).pte &= _PAGE_CHG_MASK;
97293 + (pte).pte |= pgprot_val(newprot);
97294 + (pte).pte &= __supported_pte_mask;
97295 + return pte;
97296 +}
97297 +
97298 +#define pte_index(address) \
97299 + (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
97300 +#define pte_offset_kernel(dir, address) ((pte_t *) pmd_page_kernel(*(dir)) + \
97301 + pte_index(address))
97302 +
97303 +/* x86-64 always has all page tables mapped. */
97304 +#define pte_offset_map(dir,address) pte_offset_kernel(dir,address)
97305 +#define pte_offset_map_nested(dir,address) pte_offset_kernel(dir,address)
97306 +#define pte_unmap(pte) /* NOP */
97307 +#define pte_unmap_nested(pte) /* NOP */
97308 +
97309 +#define update_mmu_cache(vma,address,pte) do { } while (0)
97310 +
97311 +/* We only update the dirty/accessed state if we set
97312 + * the dirty bit by hand in the kernel, since the hardware
97313 + * will do the accessed bit for us, and we don't want to
97314 + * race with other CPU's that might be updating the dirty
97315 + * bit at the same time. */
97316 +#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
97317 +#if 0
97318 +#define ptep_set_access_flags(__vma, __address, __ptep, __entry, __dirty) \
97319 + do { \
97320 + if (__dirty) { \
97321 + set_pte(__ptep, __entry); \
97322 + flush_tlb_page(__vma, __address); \
97323 + } \
97324 + } while (0)
97325 +#endif
97326 +#define ptep_set_access_flags(__vma, __address, __ptep, __entry, __dirty) \
97327 + do { \
97328 + if (__dirty) { \
97329 + if ( likely((__vma)->vm_mm == current->mm) ) { \
97330 + BUG_ON(HYPERVISOR_update_va_mapping((__address), (__entry), UVMF_INVLPG|UVMF_MULTI|(unsigned long)((__vma)->vm_mm->cpu_vm_mask.bits))); \
97331 + } else { \
97332 + xen_l1_entry_update((__ptep), (__entry)); \
97333 + flush_tlb_page((__vma), (__address)); \
97334 + } \
97335 + } \
97336 + } while (0)
97337 +
97338 +/* Encode and de-code a swap entry */
97339 +#define __swp_type(x) (((x).val >> 1) & 0x3f)
97340 +#define __swp_offset(x) ((x).val >> 8)
97341 +#define __swp_entry(type, offset) ((swp_entry_t) { ((type) << 1) | ((offset) << 8) })
97342 +#define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) })
97343 +#define __swp_entry_to_pte(x) ((pte_t) { (x).val })
97344 +
97345 +#endif /* !__ASSEMBLY__ */
97346 +
97347 +extern int kern_addr_valid(unsigned long addr);
97348 +
97349 +#define DOMID_LOCAL (0xFFFFU)
97350 +
97351 +int direct_remap_pfn_range(struct vm_area_struct *vma,
97352 + unsigned long address,
97353 + unsigned long mfn,
97354 + unsigned long size,
97355 + pgprot_t prot,
97356 + domid_t domid);
97357 +
97358 +int direct_kernel_remap_pfn_range(unsigned long address,
97359 + unsigned long mfn,
97360 + unsigned long size,
97361 + pgprot_t prot,
97362 + domid_t domid);
97363 +
97364 +int create_lookup_pte_addr(struct mm_struct *mm,
97365 + unsigned long address,
97366 + uint64_t *ptep);
97367 +
97368 +int touch_pte_range(struct mm_struct *mm,
97369 + unsigned long address,
97370 + unsigned long size);
97371 +
97372 +#define io_remap_pfn_range(vma, vaddr, pfn, size, prot) \
97373 + direct_remap_pfn_range(vma,vaddr,pfn,size,prot,DOMID_IO)
97374 +
97375 +#define MK_IOSPACE_PFN(space, pfn) (pfn)
97376 +#define GET_IOSPACE(pfn) 0
97377 +#define GET_PFN(pfn) (pfn)
97378 +
97379 +#define HAVE_ARCH_UNMAPPED_AREA
97380 +
97381 +#define pgtable_cache_init() do { } while (0)
97382 +#define check_pgt_cache() do { } while (0)
97383 +
97384 +#define PAGE_AGP PAGE_KERNEL_NOCACHE
97385 +#define HAVE_PAGE_AGP 1
97386 +
97387 +/* fs/proc/kcore.c */
97388 +#define kc_vaddr_to_offset(v) ((v) & __VIRTUAL_MASK)
97389 +#define kc_offset_to_vaddr(o) \
97390 + (((o) & (1UL << (__VIRTUAL_MASK_SHIFT-1))) ? ((o) | (~__VIRTUAL_MASK)) : (o))
97391 +
97392 +#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
97393 +#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_DIRTY
97394 +#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
97395 +#define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
97396 +#define __HAVE_ARCH_PTEP_SET_WRPROTECT
97397 +#define __HAVE_ARCH_PTE_SAME
97398 +#include <asm-generic/pgtable.h>
97399 +
97400 +#endif /* _X86_64_PGTABLE_H */
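The level accessors added above (pgd_offset_k, pud_offset, pmd_offset, pte_offset_kernel) compose into the usual four-level walk. The sketch below is illustrative only and not part of the patch; lookup_kernel_pte() is a hypothetical helper, and the pud-level presence test is reduced to a pud_val()/pud_bad() check for brevity.

    static pte_t *lookup_kernel_pte(unsigned long address)
    {
            pgd_t *pgd = pgd_offset_k(address);    /* level 4 */
            pud_t *pud;
            pmd_t *pmd;

            if (!pgd_present(*pgd))
                    return NULL;
            pud = pud_offset(pgd, address);        /* level 3 */
            if (!pud_val(*pud) || pud_bad(*pud))
                    return NULL;
            pmd = pmd_offset(pud, address);        /* level 2 */
            if (pmd_none(*pmd) || pmd_bad(*pmd))
                    return NULL;
            return pte_offset_kernel(pmd, address); /* level 1 */
    }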
97401 diff -Nur linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/processor.h linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/processor.h
97402 --- linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/processor.h 1970-01-01 00:00:00.000000000 +0000
97403 +++ linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/processor.h 2007-01-08 15:00:46.000000000 +0000
97404 @@ -0,0 +1,493 @@
97405 +/*
97406 + * include/asm-x86_64/processor.h
97407 + *
97408 + * Copyright (C) 1994 Linus Torvalds
97409 + */
97410 +
97411 +#ifndef __ASM_X86_64_PROCESSOR_H
97412 +#define __ASM_X86_64_PROCESSOR_H
97413 +
97414 +#include <asm/segment.h>
97415 +#include <asm/page.h>
97416 +#include <asm/types.h>
97417 +#include <asm/sigcontext.h>
97418 +#include <asm/cpufeature.h>
97419 +#include <linux/config.h>
97420 +#include <linux/threads.h>
97421 +#include <asm/msr.h>
97422 +#include <asm/current.h>
97423 +#include <asm/system.h>
97424 +#include <asm/mmsegment.h>
97425 +#include <asm/percpu.h>
97426 +#include <linux/personality.h>
97427 +
97428 +#define TF_MASK 0x00000100
97429 +#define IF_MASK 0x00000200
97430 +#define IOPL_MASK 0x00003000
97431 +#define NT_MASK 0x00004000
97432 +#define VM_MASK 0x00020000
97433 +#define AC_MASK 0x00040000
97434 +#define VIF_MASK 0x00080000 /* virtual interrupt flag */
97435 +#define VIP_MASK 0x00100000 /* virtual interrupt pending */
97436 +#define ID_MASK 0x00200000
97437 +
97438 +#define desc_empty(desc) \
97439 + (!((desc)->a | (desc)->b))
97440 +
97441 +#define desc_equal(desc1, desc2) \
97442 + (((desc1)->a == (desc2)->a) && ((desc1)->b == (desc2)->b))
97443 +
97444 +/*
97445 + * Default implementation of macro that returns current
97446 + * instruction pointer ("program counter").
97447 + */
97448 +#define current_text_addr() ({ void *pc; asm volatile("leaq 1f(%%rip),%0\n1:":"=r"(pc)); pc; })
97449 +
97450 +/*
97451 + * CPU type and hardware bug flags. Kept separately for each CPU.
97452 + */
97453 +
97454 +struct cpuinfo_x86 {
97455 + __u8 x86; /* CPU family */
97456 + __u8 x86_vendor; /* CPU vendor */
97457 + __u8 x86_model;
97458 + __u8 x86_mask;
97459 + int cpuid_level; /* Maximum supported CPUID level, -1=no CPUID */
97460 + __u32 x86_capability[NCAPINTS];
97461 + char x86_vendor_id[16];
97462 + char x86_model_id[64];
97463 + int x86_cache_size; /* in KB */
97464 + int x86_clflush_size;
97465 + int x86_cache_alignment;
97466 + int x86_tlbsize; /* number of 4K pages in DTLB/ITLB combined(in pages)*/
97467 + __u8 x86_virt_bits, x86_phys_bits;
97468 + __u8 x86_max_cores; /* cpuid returned max cores value */
97469 + __u32 x86_power;
97470 + __u32 extended_cpuid_level; /* Max extended CPUID function supported */
97471 + unsigned long loops_per_jiffy;
97472 + __u8 apicid;
97473 + __u8 booted_cores; /* number of cores as seen by OS */
97474 +} ____cacheline_aligned;
97475 +
97476 +#define X86_VENDOR_INTEL 0
97477 +#define X86_VENDOR_CYRIX 1
97478 +#define X86_VENDOR_AMD 2
97479 +#define X86_VENDOR_UMC 3
97480 +#define X86_VENDOR_NEXGEN 4
97481 +#define X86_VENDOR_CENTAUR 5
97482 +#define X86_VENDOR_RISE 6
97483 +#define X86_VENDOR_TRANSMETA 7
97484 +#define X86_VENDOR_NUM 8
97485 +#define X86_VENDOR_UNKNOWN 0xff
97486 +
97487 +#ifdef CONFIG_SMP
97488 +extern struct cpuinfo_x86 cpu_data[];
97489 +#define current_cpu_data cpu_data[smp_processor_id()]
97490 +#else
97491 +#define cpu_data (&boot_cpu_data)
97492 +#define current_cpu_data boot_cpu_data
97493 +#endif
97494 +
97495 +extern char ignore_irq13;
97496 +
97497 +extern void identify_cpu(struct cpuinfo_x86 *);
97498 +extern void print_cpu_info(struct cpuinfo_x86 *);
97499 +extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c);
97500 +
97501 +/*
97502 + * EFLAGS bits
97503 + */
97504 +#define X86_EFLAGS_CF 0x00000001 /* Carry Flag */
97505 +#define X86_EFLAGS_PF 0x00000004 /* Parity Flag */
97506 +#define X86_EFLAGS_AF 0x00000010 /* Auxiliary carry Flag */
97507 +#define X86_EFLAGS_ZF 0x00000040 /* Zero Flag */
97508 +#define X86_EFLAGS_SF 0x00000080 /* Sign Flag */
97509 +#define X86_EFLAGS_TF 0x00000100 /* Trap Flag */
97510 +#define X86_EFLAGS_IF 0x00000200 /* Interrupt Flag */
97511 +#define X86_EFLAGS_DF 0x00000400 /* Direction Flag */
97512 +#define X86_EFLAGS_OF 0x00000800 /* Overflow Flag */
97513 +#define X86_EFLAGS_IOPL 0x00003000 /* IOPL mask */
97514 +#define X86_EFLAGS_NT 0x00004000 /* Nested Task */
97515 +#define X86_EFLAGS_RF 0x00010000 /* Resume Flag */
97516 +#define X86_EFLAGS_VM 0x00020000 /* Virtual Mode */
97517 +#define X86_EFLAGS_AC 0x00040000 /* Alignment Check */
97518 +#define X86_EFLAGS_VIF 0x00080000 /* Virtual Interrupt Flag */
97519 +#define X86_EFLAGS_VIP 0x00100000 /* Virtual Interrupt Pending */
97520 +#define X86_EFLAGS_ID 0x00200000 /* CPUID detection flag */
97521 +
97522 +/*
97523 + * Intel CPU features in CR4
97524 + */
97525 +#define X86_CR4_VME 0x0001 /* enable vm86 extensions */
97526 +#define X86_CR4_PVI 0x0002 /* virtual interrupts flag enable */
97527 +#define X86_CR4_TSD 0x0004 /* disable time stamp at ipl 3 */
97528 +#define X86_CR4_DE 0x0008 /* enable debugging extensions */
97529 +#define X86_CR4_PSE 0x0010 /* enable page size extensions */
97530 +#define X86_CR4_PAE 0x0020 /* enable physical address extensions */
97531 +#define X86_CR4_MCE 0x0040 /* Machine check enable */
97532 +#define X86_CR4_PGE 0x0080 /* enable global pages */
97533 +#define X86_CR4_PCE 0x0100 /* enable performance counters at ipl 3 */
97534 +#define X86_CR4_OSFXSR 0x0200 /* enable fast FPU save and restore */
97535 +#define X86_CR4_OSXMMEXCPT 0x0400 /* enable unmasked SSE exceptions */
97536 +
97537 +/*
97538 + * Save the cr4 feature set we're using (ie
97539 + * Pentium 4MB enable and PPro Global page
97540 + * enable), so that any CPU's that boot up
97541 + * after us can get the correct flags.
97542 + */
97543 +extern unsigned long mmu_cr4_features;
97544 +
97545 +static inline void set_in_cr4 (unsigned long mask)
97546 +{
97547 + mmu_cr4_features |= mask;
97548 + __asm__("movq %%cr4,%%rax\n\t"
97549 + "orq %0,%%rax\n\t"
97550 + "movq %%rax,%%cr4\n"
97551 + : : "irg" (mask)
97552 + :"ax");
97553 +}
97554 +
97555 +static inline void clear_in_cr4 (unsigned long mask)
97556 +{
97557 + mmu_cr4_features &= ~mask;
97558 + __asm__("movq %%cr4,%%rax\n\t"
97559 + "andq %0,%%rax\n\t"
97560 + "movq %%rax,%%cr4\n"
97561 + : : "irg" (~mask)
97562 + :"ax");
97563 +}
97564 +
97565 +
97566 +/*
97567 + * Bus types
97568 + */
97569 +#define MCA_bus 0
97570 +#define MCA_bus__is_a_macro
97571 +
97572 +/*
97573 + * User space process size. 47bits minus one guard page.
97574 + */
97575 +#define TASK_SIZE64 (0x800000000000UL - 4096)
97576 +
97577 +/* This decides where the kernel will search for a free chunk of vm
97578 + * space during mmap's.
97579 + */
97580 +#define IA32_PAGE_OFFSET ((current->personality & ADDR_LIMIT_3GB) ? 0xc0000000 : 0xFFFFe000)
97581 +
97582 +#define TASK_SIZE (test_thread_flag(TIF_IA32) ? IA32_PAGE_OFFSET : TASK_SIZE64)
97583 +#define TASK_SIZE_OF(child) ((test_tsk_thread_flag(child, TIF_IA32)) ? IA32_PAGE_OFFSET : TASK_SIZE64)
97584 +
97585 +#define TASK_UNMAPPED_BASE PAGE_ALIGN(TASK_SIZE/3)
97586 +
97587 +/*
97588 + * Size of io_bitmap.
97589 + */
97590 +#define IO_BITMAP_BITS 65536
97591 +#define IO_BITMAP_BYTES (IO_BITMAP_BITS/8)
97592 +#define IO_BITMAP_LONGS (IO_BITMAP_BYTES/sizeof(long))
97593 +#ifndef CONFIG_X86_NO_TSS
97594 +#define IO_BITMAP_OFFSET offsetof(struct tss_struct,io_bitmap)
97595 +#endif
97596 +#define INVALID_IO_BITMAP_OFFSET 0x8000
97597 +
97598 +struct i387_fxsave_struct {
97599 + u16 cwd;
97600 + u16 swd;
97601 + u16 twd;
97602 + u16 fop;
97603 + u64 rip;
97604 + u64 rdp;
97605 + u32 mxcsr;
97606 + u32 mxcsr_mask;
97607 + u32 st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */
97608 + u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 128 bytes */
97609 + u32 padding[24];
97610 +} __attribute__ ((aligned (16)));
97611 +
97612 +union i387_union {
97613 + struct i387_fxsave_struct fxsave;
97614 +};
97615 +
97616 +#ifndef CONFIG_X86_NO_TSS
97617 +struct tss_struct {
97618 + u32 reserved1;
97619 + u64 rsp0;
97620 + u64 rsp1;
97621 + u64 rsp2;
97622 + u64 reserved2;
97623 + u64 ist[7];
97624 + u32 reserved3;
97625 + u32 reserved4;
97626 + u16 reserved5;
97627 + u16 io_bitmap_base;
97628 + /*
97629 + * The extra 1 is there because the CPU will access an
97630 + * additional byte beyond the end of the IO permission
97631 + * bitmap. The extra byte must be all 1 bits, and must
97632 + * be within the limit. Thus we have:
97633 + *
97634 + * 128 bytes, the bitmap itself, for ports 0..0x3ff
97635 + * 8 bytes, for an extra "long" of ~0UL
97636 + */
97637 + unsigned long io_bitmap[IO_BITMAP_LONGS + 1];
97638 +} __attribute__((packed)) ____cacheline_aligned;
97639 +
97640 +DECLARE_PER_CPU(struct tss_struct,init_tss);
97641 +#endif
97642 +
97643 +extern struct cpuinfo_x86 boot_cpu_data;
97644 +
97645 +#ifdef CONFIG_X86_VSMP
97646 +#define ARCH_MIN_TASKALIGN (1 << INTERNODE_CACHE_SHIFT)
97647 +#define ARCH_MIN_MMSTRUCT_ALIGN (1 << INTERNODE_CACHE_SHIFT)
97648 +#else
97649 +#define ARCH_MIN_TASKALIGN 16
97650 +#define ARCH_MIN_MMSTRUCT_ALIGN 0
97651 +#endif
97652 +
97653 +struct thread_struct {
97654 + unsigned long rsp0;
97655 + unsigned long rsp;
97656 + unsigned long userrsp; /* Copy from PDA */
97657 + unsigned long fs;
97658 + unsigned long gs;
97659 + unsigned short es, ds, fsindex, gsindex;
97660 +/* Hardware debugging registers */
97661 + unsigned long debugreg0;
97662 + unsigned long debugreg1;
97663 + unsigned long debugreg2;
97664 + unsigned long debugreg3;
97665 + unsigned long debugreg6;
97666 + unsigned long debugreg7;
97667 +/* fault info */
97668 + unsigned long cr2, trap_no, error_code;
97669 +/* floating point info */
97670 + union i387_union i387 __attribute__((aligned(16)));
97671 +/* IO permissions. the bitmap could be moved into the GDT, that would make
97672 + switch faster for a limited number of ioperm using tasks. -AK */
97673 + int ioperm;
97674 + unsigned long *io_bitmap_ptr;
97675 + unsigned io_bitmap_max;
97676 +/* cached TLS descriptors. */
97677 + u64 tls_array[GDT_ENTRY_TLS_ENTRIES];
97678 + unsigned int iopl;
97679 +} __attribute__((aligned(16)));
97680 +
97681 +#define INIT_THREAD { \
97682 + .rsp0 = (unsigned long)&init_stack + sizeof(init_stack) \
97683 +}
97684 +
97685 +#ifndef CONFIG_X86_NO_TSS
97686 +#define INIT_TSS { \
97687 + .rsp0 = (unsigned long)&init_stack + sizeof(init_stack) \
97688 +}
97689 +#endif
97690 +
97691 +#define INIT_MMAP \
97692 +{ &init_mm, 0, 0, NULL, PAGE_SHARED, VM_READ | VM_WRITE | VM_EXEC, 1, NULL, NULL }
97693 +
97694 +#define start_thread(regs,new_rip,new_rsp) do { \
97695 + asm volatile("movl %0,%%fs; movl %0,%%es; movl %0,%%ds": :"r" (0)); \
97696 + load_gs_index(0); \
97697 + (regs)->rip = (new_rip); \
97698 + (regs)->rsp = (new_rsp); \
97699 + write_pda(oldrsp, (new_rsp)); \
97700 + (regs)->cs = __USER_CS; \
97701 + (regs)->ss = __USER_DS; \
97702 + (regs)->eflags = 0x200; \
97703 + set_fs(USER_DS); \
97704 +} while(0)
97705 +
97706 +#define get_debugreg(var, register) \
97707 + var = HYPERVISOR_get_debugreg(register)
97708 +#define set_debugreg(value, register) \
97709 + HYPERVISOR_set_debugreg(register, value)
97710 +
97711 +struct task_struct;
97712 +struct mm_struct;
97713 +
97714 +/* Free all resources held by a thread. */
97715 +extern void release_thread(struct task_struct *);
97716 +
97717 +/* Prepare to copy thread state - unlazy all lazy status */
97718 +extern void prepare_to_copy(struct task_struct *tsk);
97719 +
97720 +/*
97721 + * create a kernel thread without removing it from tasklists
97722 + */
97723 +extern long kernel_thread(int (*fn)(void *), void * arg, unsigned long flags);
97724 +
97725 +/*
97726 + * Return saved PC of a blocked thread.
97727 + * What is this good for? It will always be the scheduler or ret_from_fork.
97728 + */
97729 +#define thread_saved_pc(t) (*(unsigned long *)((t)->thread.rsp - 8))
97730 +
97731 +extern unsigned long get_wchan(struct task_struct *p);
97732 +#define task_pt_regs(tsk) ((struct pt_regs *)(tsk)->thread.rsp0 - 1)
97733 +#define KSTK_EIP(tsk) (task_pt_regs(tsk)->rip)
97734 +#define KSTK_ESP(tsk) -1 /* sorry. doesn't work for syscall. */
97735 +
97736 +
97737 +struct microcode_header {
97738 + unsigned int hdrver;
97739 + unsigned int rev;
97740 + unsigned int date;
97741 + unsigned int sig;
97742 + unsigned int cksum;
97743 + unsigned int ldrver;
97744 + unsigned int pf;
97745 + unsigned int datasize;
97746 + unsigned int totalsize;
97747 + unsigned int reserved[3];
97748 +};
97749 +
97750 +struct microcode {
97751 + struct microcode_header hdr;
97752 + unsigned int bits[0];
97753 +};
97754 +
97755 +typedef struct microcode microcode_t;
97756 +typedef struct microcode_header microcode_header_t;
97757 +
97758 +/* microcode format is extended from prescott processors */
97759 +struct extended_signature {
97760 + unsigned int sig;
97761 + unsigned int pf;
97762 + unsigned int cksum;
97763 +};
97764 +
97765 +struct extended_sigtable {
97766 + unsigned int count;
97767 + unsigned int cksum;
97768 + unsigned int reserved[3];
97769 + struct extended_signature sigs[0];
97770 +};
97771 +
97772 +/* '6' because it used to be for P6 only (but now covers Pentium 4 as well) */
97773 +#define MICROCODE_IOCFREE _IO('6',0)
97774 +
97775 +
97776 +#define ASM_NOP1 K8_NOP1
97777 +#define ASM_NOP2 K8_NOP2
97778 +#define ASM_NOP3 K8_NOP3
97779 +#define ASM_NOP4 K8_NOP4
97780 +#define ASM_NOP5 K8_NOP5
97781 +#define ASM_NOP6 K8_NOP6
97782 +#define ASM_NOP7 K8_NOP7
97783 +#define ASM_NOP8 K8_NOP8
97784 +
97785 +/* Opteron nops */
97786 +#define K8_NOP1 ".byte 0x90\n"
97787 +#define K8_NOP2 ".byte 0x66,0x90\n"
97788 +#define K8_NOP3 ".byte 0x66,0x66,0x90\n"
97789 +#define K8_NOP4 ".byte 0x66,0x66,0x66,0x90\n"
97790 +#define K8_NOP5 K8_NOP3 K8_NOP2
97791 +#define K8_NOP6 K8_NOP3 K8_NOP3
97792 +#define K8_NOP7 K8_NOP4 K8_NOP3
97793 +#define K8_NOP8 K8_NOP4 K8_NOP4
97794 +
97795 +#define ASM_NOP_MAX 8
97796 +
97797 +/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */
97798 +static inline void rep_nop(void)
97799 +{
97800 + __asm__ __volatile__("rep;nop": : :"memory");
97801 +}
97802 +
97803 +/* Stop speculative execution */
97804 +static inline void sync_core(void)
97805 +{
97806 + int tmp;
97807 + asm volatile("cpuid" : "=a" (tmp) : "0" (1) : "ebx","ecx","edx","memory");
97808 +}
97809 +
97810 +#define cpu_has_fpu 1
97811 +
97812 +#define ARCH_HAS_PREFETCH
97813 +static inline void prefetch(void *x)
97814 +{
97815 + asm volatile("prefetcht0 %0" :: "m" (*(unsigned long *)x));
97816 +}
97817 +
97818 +#define ARCH_HAS_PREFETCHW 1
97819 +static inline void prefetchw(void *x)
97820 +{
97821 + alternative_input("prefetcht0 (%1)",
97822 + "prefetchw (%1)",
97823 + X86_FEATURE_3DNOW,
97824 + "r" (x));
97825 +}
97826 +
97827 +#define ARCH_HAS_SPINLOCK_PREFETCH 1
97828 +
97829 +#define spin_lock_prefetch(x) prefetchw(x)
97830 +
97831 +#define cpu_relax() rep_nop()
97832 +
97833 +/*
97834 + * NSC/Cyrix CPU configuration register indexes
97835 + */
97836 +#define CX86_CCR0 0xc0
97837 +#define CX86_CCR1 0xc1
97838 +#define CX86_CCR2 0xc2
97839 +#define CX86_CCR3 0xc3
97840 +#define CX86_CCR4 0xe8
97841 +#define CX86_CCR5 0xe9
97842 +#define CX86_CCR6 0xea
97843 +#define CX86_CCR7 0xeb
97844 +#define CX86_DIR0 0xfe
97845 +#define CX86_DIR1 0xff
97846 +#define CX86_ARR_BASE 0xc4
97847 +#define CX86_RCR_BASE 0xdc
97848 +
97849 +/*
97850 + * NSC/Cyrix CPU indexed register access macros
97851 + */
97852 +
97853 +#define getCx86(reg) ({ outb((reg), 0x22); inb(0x23); })
97854 +
97855 +#define setCx86(reg, data) do { \
97856 + outb((reg), 0x22); \
97857 + outb((data), 0x23); \
97858 +} while (0)
97859 +
97860 +static inline void serialize_cpu(void)
97861 +{
97862 + __asm__ __volatile__ ("cpuid" : : : "ax", "bx", "cx", "dx");
97863 +}
97864 +
97865 +static inline void __monitor(const void *eax, unsigned long ecx,
97866 + unsigned long edx)
97867 +{
97868 + /* "monitor %eax,%ecx,%edx;" */
97869 + asm volatile(
97870 + ".byte 0x0f,0x01,0xc8;"
97871 + : :"a" (eax), "c" (ecx), "d"(edx));
97872 +}
97873 +
97874 +static inline void __mwait(unsigned long eax, unsigned long ecx)
97875 +{
97876 + /* "mwait %eax,%ecx;" */
97877 + asm volatile(
97878 + ".byte 0x0f,0x01,0xc9;"
97879 + : :"a" (eax), "c" (ecx));
97880 +}
97881 +
97882 +#define stack_current() \
97883 +({ \
97884 + struct thread_info *ti; \
97885 + asm("andq %%rsp,%0; ":"=r" (ti) : "0" (CURRENT_MASK)); \
97886 + ti->task; \
97887 +})
97888 +
97889 +#define cache_line_size() (boot_cpu_data.x86_cache_alignment)
97890 +
97891 +extern unsigned long boot_option_idle_override;
97892 +/* Boot loader type from the setup header */
97893 +extern int bootloader_type;
97894 +
97895 +#define HAVE_ARCH_PICK_MMAP_LAYOUT 1
97896 +
97897 +#endif /* __ASM_X86_64_PROCESSOR_H */
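cpu_relax() above expands to rep_nop(), i.e. the PAUSE instruction, which is the intended body of a busy-wait loop. The fragment below is an illustrative sketch, not part of the patch; wait_for_flag() is a made-up helper.

    static void wait_for_flag(volatile int *flag)
    {
            while (!*flag)
                    cpu_relax();    /* REP;NOP (PAUSE) hint while spinning */
    }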
97898 diff -Nur linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/ptrace.h linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/ptrace.h
97899 --- linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/ptrace.h 1970-01-01 00:00:00.000000000 +0000
97900 +++ linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/ptrace.h 2007-01-08 15:00:46.000000000 +0000
97901 @@ -0,0 +1,127 @@
97902 +#ifndef _X86_64_PTRACE_H
97903 +#define _X86_64_PTRACE_H
97904 +
97905 +#if defined(__ASSEMBLY__) || defined(__FRAME_OFFSETS)
97906 +#define R15 0
97907 +#define R14 8
97908 +#define R13 16
97909 +#define R12 24
97910 +#define RBP 32
97911 +#define RBX 40
97912 +/* arguments: interrupts/non tracing syscalls only save up to here */
97913 +#define R11 48
97914 +#define R10 56
97915 +#define R9 64
97916 +#define R8 72
97917 +#define RAX 80
97918 +#define RCX 88
97919 +#define RDX 96
97920 +#define RSI 104
97921 +#define RDI 112
97922 +#define ORIG_RAX 120 /* = ERROR */
97923 +/* end of arguments */
97924 +/* cpu exception frame or undefined in case of fast syscall. */
97925 +#define RIP 128
97926 +#define CS 136
97927 +#define EFLAGS 144
97928 +#define RSP 152
97929 +#define SS 160
97930 +#define ARGOFFSET R11
97931 +#endif /* __ASSEMBLY__ */
97932 +
97933 +/* top of stack page */
97934 +#define FRAME_SIZE 168
97935 +
97936 +#define PTRACE_OLDSETOPTIONS 21
97937 +
97938 +#ifndef __ASSEMBLY__
97939 +
97940 +struct pt_regs {
97941 + unsigned long r15;
97942 + unsigned long r14;
97943 + unsigned long r13;
97944 + unsigned long r12;
97945 + unsigned long rbp;
97946 + unsigned long rbx;
97947 +/* arguments: non interrupts/non tracing syscalls only save up to here */
97948 + unsigned long r11;
97949 + unsigned long r10;
97950 + unsigned long r9;
97951 + unsigned long r8;
97952 + unsigned long rax;
97953 + unsigned long rcx;
97954 + unsigned long rdx;
97955 + unsigned long rsi;
97956 + unsigned long rdi;
97957 + unsigned long orig_rax;
97958 +/* end of arguments */
97959 +/* cpu exception frame or undefined */
97960 + unsigned long rip;
97961 + unsigned long cs;
97962 + unsigned long eflags;
97963 + unsigned long rsp;
97964 + unsigned long ss;
97965 +/* top of stack page */
97966 +};
97967 +
97968 +#endif
97969 +
97970 +/* Arbitrarily choose the same ptrace numbers as used by the Sparc code. */
97971 +#define PTRACE_GETREGS 12
97972 +#define PTRACE_SETREGS 13
97973 +#define PTRACE_GETFPREGS 14
97974 +#define PTRACE_SETFPREGS 15
97975 +#define PTRACE_GETFPXREGS 18
97976 +#define PTRACE_SETFPXREGS 19
97977 +
97978 +/* only useful for accessing 32-bit programs */
97979 +#define PTRACE_GET_THREAD_AREA 25
97980 +#define PTRACE_SET_THREAD_AREA 26
97981 +
97982 +#define PTRACE_ARCH_PRCTL 30 /* arch_prctl for child */
97983 +
97984 +#if defined(__KERNEL__) && !defined(__ASSEMBLY__)
97985 +#define user_mode(regs) (!!((regs)->cs & 3))
97986 +#define user_mode_vm(regs) user_mode(regs)
97987 +#define instruction_pointer(regs) ((regs)->rip)
97988 +#if defined(CONFIG_SMP) && defined(CONFIG_FRAME_POINTER)
97989 +extern unsigned long profile_pc(struct pt_regs *regs);
97990 +#else
97991 +#define profile_pc(regs) instruction_pointer(regs)
97992 +#endif
97993 +
97994 +#include <linux/compiler.h>
97995 +
97996 +void signal_fault(struct pt_regs *regs, void __user *frame, char *where);
97997 +
97998 +struct task_struct;
97999 +
98000 +extern unsigned long
98001 +convert_rip_to_linear(struct task_struct *child, struct pt_regs *regs);
98002 +
98003 +enum {
98004 + EF_CF = 0x00000001,
98005 + EF_PF = 0x00000004,
98006 + EF_AF = 0x00000010,
98007 + EF_ZF = 0x00000040,
98008 + EF_SF = 0x00000080,
98009 + EF_TF = 0x00000100,
98010 + EF_IE = 0x00000200,
98011 + EF_DF = 0x00000400,
98012 + EF_OF = 0x00000800,
98013 + EF_IOPL = 0x00003000,
98014 + EF_IOPL_RING0 = 0x00000000,
98015 + EF_IOPL_RING1 = 0x00001000,
98016 + EF_IOPL_RING2 = 0x00002000,
98017 + EF_NT = 0x00004000, /* nested task */
98018 + EF_RF = 0x00010000, /* resume */
98019 + EF_VM = 0x00020000, /* virtual mode */
98020 + EF_AC = 0x00040000, /* alignment */
98021 + EF_VIF = 0x00080000, /* virtual interrupt */
98022 + EF_VIP = 0x00100000, /* virtual interrupt pending */
98023 + EF_ID = 0x00200000, /* id */
98024 +};
98025 +
98026 +#endif
98027 +
98028 +#endif
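user_mode() and instruction_pointer() above are how a handler classifies a trapping context from its struct pt_regs. A minimal sketch, not part of the patch; report_fault() is a hypothetical helper and <linux/kernel.h> is assumed for printk().

    #include <linux/kernel.h>
    #include <asm/ptrace.h>

    static void report_fault(struct pt_regs *regs)
    {
            if (user_mode(regs))            /* CPL in the low bits of CS */
                    printk(KERN_INFO "fault at user rip %016lx\n",
                           instruction_pointer(regs));
            else
                    printk(KERN_ALERT "fault at kernel rip %016lx\n",
                           instruction_pointer(regs));
    }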
98029 diff -Nur linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/smp.h linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/smp.h
98030 --- linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/smp.h 1970-01-01 00:00:00.000000000 +0000
98031 +++ linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/smp.h 2007-01-08 15:00:46.000000000 +0000
98032 @@ -0,0 +1,152 @@
98033 +#ifndef __ASM_SMP_H
98034 +#define __ASM_SMP_H
98035 +
98036 +/*
98037 + * We need the APIC definitions automatically as part of 'smp.h'
98038 + */
98039 +#ifndef __ASSEMBLY__
98040 +#include <linux/config.h>
98041 +#include <linux/threads.h>
98042 +#include <linux/cpumask.h>
98043 +#include <linux/bitops.h>
98044 +extern int disable_apic;
98045 +#endif
98046 +
98047 +#ifdef CONFIG_X86_LOCAL_APIC
98048 +#ifndef __ASSEMBLY__
98049 +#include <asm/fixmap.h>
98050 +#include <asm/mpspec.h>
98051 +#ifdef CONFIG_X86_IO_APIC
98052 +#include <asm/io_apic.h>
98053 +#endif
98054 +#include <asm/apic.h>
98055 +#include <asm/thread_info.h>
98056 +#endif
98057 +#endif
98058 +
98059 +#ifdef CONFIG_SMP
98060 +#ifndef ASSEMBLY
98061 +
98062 +#include <asm/pda.h>
98063 +
98064 +struct pt_regs;
98065 +
98066 +extern cpumask_t cpu_present_mask;
98067 +extern cpumask_t cpu_possible_map;
98068 +extern cpumask_t cpu_online_map;
98069 +extern cpumask_t cpu_initialized;
98070 +
98071 +/*
98072 + * Private routines/data
98073 + */
98074 +
98075 +extern void smp_alloc_memory(void);
98076 +extern volatile unsigned long smp_invalidate_needed;
98077 +extern int pic_mode;
98078 +extern void lock_ipi_call_lock(void);
98079 +extern void unlock_ipi_call_lock(void);
98080 +extern int smp_num_siblings;
98081 +extern void smp_send_reschedule(int cpu);
98082 +void smp_stop_cpu(void);
98083 +extern int smp_call_function_single(int cpuid, void (*func) (void *info),
98084 + void *info, int retry, int wait);
98085 +
98086 +extern cpumask_t cpu_sibling_map[NR_CPUS];
98087 +extern cpumask_t cpu_core_map[NR_CPUS];
98088 +extern int phys_proc_id[NR_CPUS];
98089 +extern int cpu_core_id[NR_CPUS];
98090 +
98091 +#define SMP_TRAMPOLINE_BASE 0x6000
98092 +
98093 +/*
98094 + * On x86 all CPUs are mapped 1:1 to the APIC space.
98095 + * This simplifies scheduling and IPI sending and
98096 + * compresses data structures.
98097 + */
98098 +
98099 +static inline int num_booting_cpus(void)
98100 +{
98101 + return cpus_weight(cpu_possible_map);
98102 +}
98103 +
98104 +#define raw_smp_processor_id() read_pda(cpunumber)
98105 +
98106 +#ifdef CONFIG_X86_LOCAL_APIC
98107 +static inline int hard_smp_processor_id(void)
98108 +{
98109 + /* we don't want to mark this access volatile - bad code generation */
98110 + return GET_APIC_ID(*(unsigned int *)(APIC_BASE+APIC_ID));
98111 +}
98112 +#endif
98113 +
98114 +extern int safe_smp_processor_id(void);
98115 +extern int __cpu_disable(void);
98116 +extern void __cpu_die(unsigned int cpu);
98117 +extern void prefill_possible_map(void);
98118 +extern unsigned num_processors;
98119 +extern unsigned disabled_cpus;
98120 +
98121 +#endif /* !ASSEMBLY */
98122 +
98123 +#define NO_PROC_ID 0xFF /* No processor magic marker */
98124 +
98125 +#endif
98126 +
98127 +#ifndef ASSEMBLY
98128 +/*
98129 + * Some lowlevel functions might want to know about
98130 + * the real APIC ID <-> CPU # mapping.
98131 + */
98132 +extern u8 x86_cpu_to_apicid[NR_CPUS]; /* physical ID */
98133 +extern u8 x86_cpu_to_log_apicid[NR_CPUS];
98134 +extern u8 bios_cpu_apicid[];
98135 +
98136 +#ifdef CONFIG_X86_LOCAL_APIC
98137 +static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask)
98138 +{
98139 + return cpus_addr(cpumask)[0];
98140 +}
98141 +
98142 +static inline int cpu_present_to_apicid(int mps_cpu)
98143 +{
98144 + if (mps_cpu < NR_CPUS)
98145 + return (int)bios_cpu_apicid[mps_cpu];
98146 + else
98147 + return BAD_APICID;
98148 +}
98149 +#endif
98150 +
98151 +#endif /* !ASSEMBLY */
98152 +
98153 +#ifndef CONFIG_SMP
98154 +#define stack_smp_processor_id() 0
98155 +#define safe_smp_processor_id() 0
98156 +#define cpu_logical_map(x) (x)
98157 +#else
98158 +#include <asm/thread_info.h>
98159 +#define stack_smp_processor_id() \
98160 +({ \
98161 + struct thread_info *ti; \
98162 + __asm__("andq %%rsp,%0; ":"=r" (ti) : "0" (CURRENT_MASK)); \
98163 + ti->cpu; \
98164 +})
98165 +#endif
98166 +
98167 +#ifndef __ASSEMBLY__
98168 +#ifdef CONFIG_X86_LOCAL_APIC
98169 +static __inline int logical_smp_processor_id(void)
98170 +{
98171 + /* we don't want to mark this access volatile - bad code generation */
98172 + return GET_APIC_LOGICAL_ID(*(unsigned long *)(APIC_BASE+APIC_LDR));
98173 +}
98174 +#endif
98175 +#endif
98176 +
98177 +#ifdef CONFIG_SMP
98178 +#define cpu_physical_id(cpu) x86_cpu_to_apicid[cpu]
98179 +#else
98180 +#define cpu_physical_id(cpu) boot_cpu_id
98181 +#endif
98182 +
98183 +#endif
98184 +
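smp_call_function_single(), declared above, runs a function on one chosen CPU, and raw_smp_processor_id() reads the CPU number out of the PDA. An illustrative sketch, not part of the patch; say_hello() and hello_on_cpu() are made-up names and <linux/kernel.h> is assumed for printk().

    #include <linux/kernel.h>
    #include <asm/smp.h>

    static void say_hello(void *unused)
    {
            printk(KERN_INFO "hello from CPU %d\n", raw_smp_processor_id());
    }

    static void hello_on_cpu(int cpu)
    {
            /* retry = 0, wait = 1: block until the target CPU has run func */
            smp_call_function_single(cpu, say_hello, NULL, 0, 1);
    }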
98185 diff -Nur linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/synch_bitops.h linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/synch_bitops.h
98186 --- linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/synch_bitops.h 1970-01-01 00:00:00.000000000 +0000
98187 +++ linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/synch_bitops.h 2007-01-08 15:00:46.000000000 +0000
98188 @@ -0,0 +1,2 @@
98189 +
98190 +#include <asm-i386/mach-xen/asm/synch_bitops.h>
98191 diff -Nur linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/system.h linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/system.h
98192 --- linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/system.h 1970-01-01 00:00:00.000000000 +0000
98193 +++ linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/system.h 2007-01-08 15:00:46.000000000 +0000
98194 @@ -0,0 +1,434 @@
98195 +#ifndef __ASM_SYSTEM_H
98196 +#define __ASM_SYSTEM_H
98197 +
98198 +#include <linux/config.h>
98199 +#include <linux/kernel.h>
98200 +#include <asm/segment.h>
98201 +#include <asm/synch_bitops.h>
98202 +#include <asm/hypervisor.h>
98203 +#include <xen/interface/arch-x86_64.h>
98204 +
98205 +#ifdef __KERNEL__
98206 +
98207 +#ifdef CONFIG_SMP
98208 +#define __vcpu_id smp_processor_id()
98209 +#else
98210 +#define __vcpu_id 0
98211 +#endif
98212 +
98213 +#ifdef CONFIG_SMP
98214 +#define LOCK_PREFIX "lock ; "
98215 +#else
98216 +#define LOCK_PREFIX ""
98217 +#endif
98218 +
98219 +#define __STR(x) #x
98220 +#define STR(x) __STR(x)
98221 +
98222 +#define __SAVE(reg,offset) "movq %%" #reg ",(14-" #offset ")*8(%%rsp)\n\t"
98223 +#define __RESTORE(reg,offset) "movq (14-" #offset ")*8(%%rsp),%%" #reg "\n\t"
98224 +
98225 +/* frame pointer must be last for get_wchan */
98226 +#define SAVE_CONTEXT "pushq %%rbp ; movq %%rsi,%%rbp\n\t"
98227 +#define RESTORE_CONTEXT "movq %%rbp,%%rsi ; popq %%rbp\n\t"
98228 +
98229 +#define __EXTRA_CLOBBER \
98230 + ,"rcx","rbx","rdx","r8","r9","r10","r11","r12","r13","r14","r15"
98231 +
98232 +#define switch_to(prev,next,last) \
98233 + asm volatile(SAVE_CONTEXT \
98234 + "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */ \
98235 + "movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */ \
98236 + "call __switch_to\n\t" \
98237 + ".globl thread_return\n" \
98238 + "thread_return:\n\t" \
98239 + "movq %%gs:%P[pda_pcurrent],%%rsi\n\t" \
98240 + "movq %P[thread_info](%%rsi),%%r8\n\t" \
98241 + LOCK "btr %[tif_fork],%P[ti_flags](%%r8)\n\t" \
98242 + "movq %%rax,%%rdi\n\t" \
98243 + "jc ret_from_fork\n\t" \
98244 + RESTORE_CONTEXT \
98245 + : "=a" (last) \
98246 + : [next] "S" (next), [prev] "D" (prev), \
98247 + [threadrsp] "i" (offsetof(struct task_struct, thread.rsp)), \
98248 + [ti_flags] "i" (offsetof(struct thread_info, flags)),\
98249 + [tif_fork] "i" (TIF_FORK), \
98250 + [thread_info] "i" (offsetof(struct task_struct, thread_info)), \
98251 + [pda_pcurrent] "i" (offsetof(struct x8664_pda, pcurrent)) \
98252 + : "memory", "cc" __EXTRA_CLOBBER)
98253 +
98254 +
98255 +extern void load_gs_index(unsigned);
98256 +
98257 +/*
98258 + * Load a segment. Fall back on loading the zero
98259 + * segment if something goes wrong..
98260 + */
98261 +#define loadsegment(seg,value) \
98262 + asm volatile("\n" \
98263 + "1:\t" \
98264 + "movl %k0,%%" #seg "\n" \
98265 + "2:\n" \
98266 + ".section .fixup,\"ax\"\n" \
98267 + "3:\t" \
98268 + "movl %1,%%" #seg "\n\t" \
98269 + "jmp 2b\n" \
98270 + ".previous\n" \
98271 + ".section __ex_table,\"a\"\n\t" \
98272 + ".align 8\n\t" \
98273 + ".quad 1b,3b\n" \
98274 + ".previous" \
98275 + : :"r" (value), "r" (0))
98276 +
98277 +#define set_debug(value,register) \
98278 + __asm__("movq %0,%%db" #register \
98279 + : /* no output */ \
98280 + :"r" ((unsigned long) value))
98281 +
98282 +
98283 +#ifdef __KERNEL__
98284 +struct alt_instr {
98285 + __u8 *instr; /* original instruction */
98286 + __u8 *replacement;
98287 + __u8 cpuid; /* cpuid bit set for replacement */
98288 + __u8 instrlen; /* length of original instruction */
98289 + __u8 replacementlen; /* length of new instruction, <= instrlen */
98290 + __u8 pad[5];
98291 +};
98292 +#endif
98293 +
98294 +/*
98295 + * Alternative instructions for different CPU types or capabilities.
98296 + *
98297 + * This allows the use of optimized instructions even on generic binary
98298 + * kernels.
98299 + *
98300 + * The length of oldinstr must be longer than or equal to that of newinstr.
98301 + * It can be padded with nops as needed.
98302 + *
98303 + * For non barrier like inlines please define new variants
98304 + * without volatile and memory clobber.
98305 + */
98306 +#define alternative(oldinstr, newinstr, feature) \
98307 + asm volatile ("661:\n\t" oldinstr "\n662:\n" \
98308 + ".section .altinstructions,\"a\"\n" \
98309 + " .align 8\n" \
98310 + " .quad 661b\n" /* label */ \
98311 + " .quad 663f\n" /* new instruction */ \
98312 + " .byte %c0\n" /* feature bit */ \
98313 + " .byte 662b-661b\n" /* sourcelen */ \
98314 + " .byte 664f-663f\n" /* replacementlen */ \
98315 + ".previous\n" \
98316 + ".section .altinstr_replacement,\"ax\"\n" \
98317 + "663:\n\t" newinstr "\n664:\n" /* replacement */ \
98318 + ".previous" :: "i" (feature) : "memory")
98319 +
98320 +/*
98321 + * Alternative inline assembly with input.
98322 + *
98323 + * Peculiarities:
98324 + * No memory clobber here.
98325 + * Argument numbers start with 1.
98326 + * Best is to use constraints that are fixed size (like (%1) ... "r")
98327 + * If you use variable sized constraints like "m" or "g" in the
98328 + * replacement make sure to pad to the worst case length.
98329 + */
98330 +#define alternative_input(oldinstr, newinstr, feature, input...) \
98331 + asm volatile ("661:\n\t" oldinstr "\n662:\n" \
98332 + ".section .altinstructions,\"a\"\n" \
98333 + " .align 8\n" \
98334 + " .quad 661b\n" /* label */ \
98335 + " .quad 663f\n" /* new instruction */ \
98336 + " .byte %c0\n" /* feature bit */ \
98337 + " .byte 662b-661b\n" /* sourcelen */ \
98338 + " .byte 664f-663f\n" /* replacementlen */ \
98339 + ".previous\n" \
98340 + ".section .altinstr_replacement,\"ax\"\n" \
98341 + "663:\n\t" newinstr "\n664:\n" /* replacement */ \
98342 + ".previous" :: "i" (feature), ##input)
98343 +
98344 +/* Like alternative_input, but with a single output argument */
98345 +#define alternative_io(oldinstr, newinstr, feature, output, input...) \
98346 + asm volatile ("661:\n\t" oldinstr "\n662:\n" \
98347 + ".section .altinstructions,\"a\"\n" \
98348 + " .align 8\n" \
98349 + " .quad 661b\n" /* label */ \
98350 + " .quad 663f\n" /* new instruction */ \
98351 + " .byte %c[feat]\n" /* feature bit */ \
98352 + " .byte 662b-661b\n" /* sourcelen */ \
98353 + " .byte 664f-663f\n" /* replacementlen */ \
98354 + ".previous\n" \
98355 + ".section .altinstr_replacement,\"ax\"\n" \
98356 + "663:\n\t" newinstr "\n664:\n" /* replacement */ \
98357 + ".previous" : output : [feat] "i" (feature), ##input)
98358 +
98359 +/*
98360 + * Clear and set 'TS' bit respectively
98361 + */
98362 +#define clts() (HYPERVISOR_fpu_taskswitch(0))
98363 +
98364 +static inline unsigned long read_cr0(void)
98365 +{
98366 + unsigned long cr0;
98367 + asm volatile("movq %%cr0,%0" : "=r" (cr0));
98368 + return cr0;
98369 +}
98370 +
98371 +static inline void write_cr0(unsigned long val)
98372 +{
98373 + asm volatile("movq %0,%%cr0" :: "r" (val));
98374 +}
98375 +
98376 +#define read_cr3() ({ \
98377 + unsigned long __dummy; \
98378 + asm("movq %%cr3,%0" : "=r" (__dummy)); \
98379 + machine_to_phys(__dummy); \
98380 +})
98381 +
98382 +static inline unsigned long read_cr4(void)
98383 +{
98384 + unsigned long cr4;
98385 + asm("movq %%cr4,%0" : "=r" (cr4));
98386 + return cr4;
98387 +}
98388 +
98389 +static inline void write_cr4(unsigned long val)
98390 +{
98391 + asm volatile("movq %0,%%cr4" :: "r" (val));
98392 +}
98393 +
98394 +#define stts() (HYPERVISOR_fpu_taskswitch(1))
98395 +
98396 +#define wbinvd() \
98397 + __asm__ __volatile__ ("wbinvd": : :"memory");
98398 +
98399 +/*
98400 + * On SMP systems, when the scheduler does migration-cost autodetection,
98401 + * it needs a way to flush as much of the CPU's caches as possible.
98402 + */
98403 +static inline void sched_cacheflush(void)
98404 +{
98405 + wbinvd();
98406 +}
98407 +
98408 +#endif /* __KERNEL__ */
98409 +
98410 +#define nop() __asm__ __volatile__ ("nop")
98411 +
98412 +#define xchg(ptr,v) ((__typeof__(*(ptr)))__xchg((unsigned long)(v),(ptr),sizeof(*(ptr))))
98413 +
98414 +#define tas(ptr) (xchg((ptr),1))
98415 +
98416 +#define __xg(x) ((volatile long *)(x))
98417 +
98418 +static inline void set_64bit(volatile unsigned long *ptr, unsigned long val)
98419 +{
98420 + *ptr = val;
98421 +}
98422 +
98423 +#define _set_64bit set_64bit
98424 +
98425 +/*
98426 + * Note: no "lock" prefix even on SMP: xchg always implies lock anyway
98427 + * Note 2: xchg has side effect, so that attribute volatile is necessary,
98428 + * but generally the primitive is invalid, *ptr is output argument. --ANK
98429 + */
98430 +static inline unsigned long __xchg(unsigned long x, volatile void * ptr, int size)
98431 +{
98432 + switch (size) {
98433 + case 1:
98434 + __asm__ __volatile__("xchgb %b0,%1"
98435 + :"=q" (x)
98436 + :"m" (*__xg(ptr)), "0" (x)
98437 + :"memory");
98438 + break;
98439 + case 2:
98440 + __asm__ __volatile__("xchgw %w0,%1"
98441 + :"=r" (x)
98442 + :"m" (*__xg(ptr)), "0" (x)
98443 + :"memory");
98444 + break;
98445 + case 4:
98446 + __asm__ __volatile__("xchgl %k0,%1"
98447 + :"=r" (x)
98448 + :"m" (*__xg(ptr)), "0" (x)
98449 + :"memory");
98450 + break;
98451 + case 8:
98452 + __asm__ __volatile__("xchgq %0,%1"
98453 + :"=r" (x)
98454 + :"m" (*__xg(ptr)), "0" (x)
98455 + :"memory");
98456 + break;
98457 + }
98458 + return x;
98459 +}
98460 +
98461 +/*
98462 + * Atomic compare and exchange. Compare OLD with MEM, if identical,
98463 + * store NEW in MEM. Return the initial value in MEM. Success is
98464 + * indicated by comparing RETURN with OLD.
98465 + */
98466 +
98467 +#define __HAVE_ARCH_CMPXCHG 1
98468 +
98469 +static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old,
98470 + unsigned long new, int size)
98471 +{
98472 + unsigned long prev;
98473 + switch (size) {
98474 + case 1:
98475 + __asm__ __volatile__(LOCK_PREFIX "cmpxchgb %b1,%2"
98476 + : "=a"(prev)
98477 + : "q"(new), "m"(*__xg(ptr)), "0"(old)
98478 + : "memory");
98479 + return prev;
98480 + case 2:
98481 + __asm__ __volatile__(LOCK_PREFIX "cmpxchgw %w1,%2"
98482 + : "=a"(prev)
98483 + : "r"(new), "m"(*__xg(ptr)), "0"(old)
98484 + : "memory");
98485 + return prev;
98486 + case 4:
98487 + __asm__ __volatile__(LOCK_PREFIX "cmpxchgl %k1,%2"
98488 + : "=a"(prev)
98489 + : "r"(new), "m"(*__xg(ptr)), "0"(old)
98490 + : "memory");
98491 + return prev;
98492 + case 8:
98493 + __asm__ __volatile__(LOCK_PREFIX "cmpxchgq %1,%2"
98494 + : "=a"(prev)
98495 + : "r"(new), "m"(*__xg(ptr)), "0"(old)
98496 + : "memory");
98497 + return prev;
98498 + }
98499 + return old;
98500 +}
98501 +
98502 +#define cmpxchg(ptr,o,n)\
98503 + ((__typeof__(*(ptr)))__cmpxchg((ptr),(unsigned long)(o),\
98504 + (unsigned long)(n),sizeof(*(ptr))))
98505 +
98506 +#ifdef CONFIG_SMP
98507 +#define smp_mb() mb()
98508 +#define smp_rmb() rmb()
98509 +#define smp_wmb() wmb()
98510 +#define smp_read_barrier_depends() do {} while(0)
98511 +#else
98512 +#define smp_mb() barrier()
98513 +#define smp_rmb() barrier()
98514 +#define smp_wmb() barrier()
98515 +#define smp_read_barrier_depends() do {} while(0)
98516 +#endif
98517 +
98518 +
98519 +/*
98520 + * Force strict CPU ordering.
98521 + * And yes, this is required on UP too when we're talking
98522 + * to devices.
98523 + */
98524 +#define mb() asm volatile("mfence":::"memory")
98525 +#define rmb() asm volatile("lfence":::"memory")
98526 +
98527 +#ifdef CONFIG_UNORDERED_IO
98528 +#define wmb() asm volatile("sfence" ::: "memory")
98529 +#else
98530 +#define wmb() asm volatile("" ::: "memory")
98531 +#endif
98532 +#define read_barrier_depends() do {} while(0)
98533 +#define set_mb(var, value) do { (void) xchg(&var, value); } while (0)
98534 +#define set_wmb(var, value) do { var = value; wmb(); } while (0)
98535 +
98536 +#define warn_if_not_ulong(x) do { unsigned long foo; (void) (&(x) == &foo); } while (0)
98537 +
98538 +
98539 +/*
98540 + * The use of 'barrier' in the following reflects their use as local-lock
98541 + * operations. Reentrancy must be prevented (e.g., __cli()) /before/ following
98542 + * critical operations are executed. All critical operations must complete
98543 + * /before/ reentrancy is permitted (e.g., __sti()). Alpha architecture also
98544 + * includes these barriers, for example.
98545 + */
98546 +
98547 +#define __cli() \
98548 +do { \
98549 + vcpu_info_t *_vcpu; \
98550 + preempt_disable(); \
98551 + _vcpu = &HYPERVISOR_shared_info->vcpu_info[__vcpu_id]; \
98552 + _vcpu->evtchn_upcall_mask = 1; \
98553 + preempt_enable_no_resched(); \
98554 + barrier(); \
98555 +} while (0)
98556 +
98557 +#define __sti() \
98558 +do { \
98559 + vcpu_info_t *_vcpu; \
98560 + barrier(); \
98561 + preempt_disable(); \
98562 + _vcpu = &HYPERVISOR_shared_info->vcpu_info[__vcpu_id]; \
98563 + _vcpu->evtchn_upcall_mask = 0; \
98564 + barrier(); /* unmask then check (avoid races) */ \
98565 + if ( unlikely(_vcpu->evtchn_upcall_pending) ) \
98566 + force_evtchn_callback(); \
98567 + preempt_enable(); \
98568 +} while (0)
98569 +
98570 +#define __save_flags(x) \
98571 +do { \
98572 + vcpu_info_t *_vcpu; \
98573 + preempt_disable(); \
98574 + _vcpu = &HYPERVISOR_shared_info->vcpu_info[__vcpu_id]; \
98575 + (x) = _vcpu->evtchn_upcall_mask; \
98576 + preempt_enable(); \
98577 +} while (0)
98578 +
98579 +#define __restore_flags(x) \
98580 +do { \
98581 + vcpu_info_t *_vcpu; \
98582 + barrier(); \
98583 + preempt_disable(); \
98584 + _vcpu = &HYPERVISOR_shared_info->vcpu_info[__vcpu_id]; \
98585 + if ((_vcpu->evtchn_upcall_mask = (x)) == 0) { \
98586 + barrier(); /* unmask then check (avoid races) */ \
98587 + if ( unlikely(_vcpu->evtchn_upcall_pending) ) \
98588 + force_evtchn_callback(); \
98589 + preempt_enable(); \
98590 + } else \
98591 + preempt_enable_no_resched(); \
98592 +} while (0)
98593 +
98594 +#define __save_and_cli(x) \
98595 +do { \
98596 + vcpu_info_t *_vcpu; \
98597 + preempt_disable(); \
98598 + _vcpu = &HYPERVISOR_shared_info->vcpu_info[__vcpu_id]; \
98599 + (x) = _vcpu->evtchn_upcall_mask; \
98600 + _vcpu->evtchn_upcall_mask = 1; \
98601 + preempt_enable_no_resched(); \
98602 + barrier(); \
98603 +} while (0)
98604 +
98605 +#define local_irq_save(x) __save_and_cli(x)
98606 +#define local_irq_restore(x) __restore_flags(x)
98607 +#define local_save_flags(x) __save_flags(x)
98608 +#define local_irq_disable() __cli()
98609 +#define local_irq_enable() __sti()
98610 +
98611 +/* Cannot use preempt_enable() here as we would recurse in preempt_sched(). */
98612 +#define irqs_disabled() \
98613 +({ int ___x; \
98614 + vcpu_info_t *_vcpu; \
98615 + preempt_disable(); \
98616 + _vcpu = &HYPERVISOR_shared_info->vcpu_info[__vcpu_id]; \
98617 + ___x = (_vcpu->evtchn_upcall_mask != 0); \
98618 + preempt_enable_no_resched(); \
98619 + ___x; })
98620 +
98621 +void safe_halt(void);
98622 +void halt(void);
98623 +
98624 +void cpu_idle_wait(void);
98625 +
98626 +extern unsigned long arch_align_stack(unsigned long sp);
98627 +
98628 +#endif
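cmpxchg() above is the usual compare-and-swap primitive; the canonical pattern is a read/modify/retry loop. An illustrative sketch, not part of the patch; lockfree_add() is a hypothetical helper.

    #include <asm/system.h>

    static void lockfree_add(volatile unsigned long *p, unsigned long delta)
    {
            unsigned long old, new;

            do {
                    old = *p;               /* snapshot the current value */
                    new = old + delta;
            } while (cmpxchg(p, old, new) != old);  /* retry if it changed */
    }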
98629 diff -Nur linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/timer.h linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/timer.h
98630 --- linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/timer.h 1970-01-01 00:00:00.000000000 +0000
98631 +++ linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/timer.h 2007-01-08 15:00:46.000000000 +0000
98632 @@ -0,0 +1,67 @@
98633 +#ifndef _ASMi386_TIMER_H
98634 +#define _ASMi386_TIMER_H
98635 +#include <linux/init.h>
98636 +
98637 +/**
98638 + * struct timer_ops - used to define a timer source
98639 + *
98640 + * @name: name of the timer.
98641 + * @init: Probes and initializes the timer. Takes clock= override
98642 + * string as an argument. Returns 0 on success, anything else
98643 + * on failure.
98644 + * @mark_offset: called by the timer interrupt.
98645 + * @get_offset: called by gettimeofday(). Returns the number of microseconds
98646 + * since the last timer interrupt.
98647 + * @monotonic_clock: returns the number of nanoseconds since the init of the
98648 + * timer.
98649 + * @delay: delays this many clock cycles.
98650 + */
98651 +struct timer_opts {
98652 + char* name;
98653 + void (*mark_offset)(void);
98654 + unsigned long (*get_offset)(void);
98655 + unsigned long long (*monotonic_clock)(void);
98656 + void (*delay)(unsigned long);
98657 + unsigned long (*read_timer)(void);
98658 + int (*suspend)(pm_message_t state);
98659 + int (*resume)(void);
98660 +};
98661 +
98662 +struct init_timer_opts {
98663 + int (*init)(char *override);
98664 + struct timer_opts *opts;
98665 +};
98666 +
98667 +#define TICK_SIZE (tick_nsec / 1000)
98668 +
98669 +extern struct timer_opts* __init select_timer(void);
98670 +extern void clock_fallback(void);
98671 +void setup_pit_timer(void);
98672 +
98673 +/* Modifiers for buggy PIT handling */
98674 +
98675 +extern int pit_latch_buggy;
98676 +
98677 +extern struct timer_opts *cur_timer;
98678 +extern int timer_ack;
98679 +
98680 +/* list of externed timers */
98681 +extern struct timer_opts timer_none;
98682 +extern struct timer_opts timer_pit;
98683 +extern struct init_timer_opts timer_pit_init;
98684 +extern struct init_timer_opts timer_tsc_init;
98685 +#ifdef CONFIG_X86_CYCLONE_TIMER
98686 +extern struct init_timer_opts timer_cyclone_init;
98687 +#endif
98688 +
98689 +extern unsigned long calibrate_tsc(void);
98690 +extern void init_cpu_khz(void);
98691 +#ifdef CONFIG_HPET_TIMER
98692 +extern struct init_timer_opts timer_hpet_init;
98693 +extern unsigned long calibrate_tsc_hpet(unsigned long *tsc_hpet_quotient_ptr);
98694 +#endif
98695 +
98696 +#ifdef CONFIG_X86_PM_TIMER
98697 +extern struct init_timer_opts timer_pmtmr_init;
98698 +#endif
98699 +#endif
98700 diff -Nur linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/tlbflush.h linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/tlbflush.h
98701 --- linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/tlbflush.h 1970-01-01 00:00:00.000000000 +0000
98702 +++ linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/tlbflush.h 2007-01-08 15:00:46.000000000 +0000
98703 @@ -0,0 +1,104 @@
98704 +#ifndef _X8664_TLBFLUSH_H
98705 +#define _X8664_TLBFLUSH_H
98706 +
98707 +#include <linux/config.h>
98708 +#include <linux/mm.h>
98709 +#include <asm/processor.h>
98710 +
98711 +#define __flush_tlb() xen_tlb_flush()
98712 +
98713 +/*
98714 + * Global pages have to be flushed a bit differently. Not a real
98715 + * performance problem because this does not happen often.
98716 + */
98717 +#define __flush_tlb_global() xen_tlb_flush()
98718 +
98719 +
98720 +extern unsigned long pgkern_mask;
98721 +
98722 +#define __flush_tlb_all() __flush_tlb_global()
98723 +
98724 +#define __flush_tlb_one(addr) xen_invlpg((unsigned long)addr)
98725 +
98726 +
98727 +/*
98728 + * TLB flushing:
98729 + *
98730 + * - flush_tlb() flushes the current mm struct TLBs
98731 + * - flush_tlb_all() flushes all processes TLBs
98732 + * - flush_tlb_mm(mm) flushes the specified mm context TLB's
98733 + * - flush_tlb_page(vma, vmaddr) flushes one page
98734 + * - flush_tlb_range(vma, start, end) flushes a range of pages
98735 + * - flush_tlb_kernel_range(start, end) flushes a range of kernel pages
98736 + * - flush_tlb_pgtables(mm, start, end) flushes a range of page tables
98737 + *
98738 + * x86-64 can only flush individual pages or full VMs. For a range flush
98739 + * we always do the full VM. It might be worth testing whether a few
98740 + * INVLPGs in a row are a win for a small range.
98741 + */
98742 +
98743 +#ifndef CONFIG_SMP
98744 +
98745 +#define flush_tlb() __flush_tlb()
98746 +#define flush_tlb_all() __flush_tlb_all()
98747 +#define local_flush_tlb() __flush_tlb()
98748 +
98749 +static inline void flush_tlb_mm(struct mm_struct *mm)
98750 +{
98751 + if (mm == current->active_mm)
98752 + __flush_tlb();
98753 +}
98754 +
98755 +static inline void flush_tlb_page(struct vm_area_struct *vma,
98756 + unsigned long addr)
98757 +{
98758 + if (vma->vm_mm == current->active_mm)
98759 + __flush_tlb_one(addr);
98760 +}
98761 +
98762 +static inline void flush_tlb_range(struct vm_area_struct *vma,
98763 + unsigned long start, unsigned long end)
98764 +{
98765 + if (vma->vm_mm == current->active_mm)
98766 + __flush_tlb();
98767 +}
98768 +
98769 +#else
98770 +
98771 +#include <asm/smp.h>
98772 +
98773 +#define local_flush_tlb() \
98774 + __flush_tlb()
98775 +
98776 +extern void flush_tlb_all(void);
98777 +extern void flush_tlb_current_task(void);
98778 +extern void flush_tlb_mm(struct mm_struct *);
98779 +extern void flush_tlb_page(struct vm_area_struct *, unsigned long);
98780 +
98781 +#define flush_tlb() flush_tlb_current_task()
98782 +
98783 +static inline void flush_tlb_range(struct vm_area_struct * vma, unsigned long start, unsigned long end)
98784 +{
98785 + flush_tlb_mm(vma->vm_mm);
98786 +}
98787 +
98788 +#define TLBSTATE_OK 1
98789 +#define TLBSTATE_LAZY 2
98790 +
98791 +/* Roughly an IPI every 20MB with 4k pages for freeing page table
98792 + ranges. Cost is about 42k of memory for each CPU. */
98793 +#define ARCH_FREE_PTE_NR 5350
98794 +
98795 +#endif
98796 +
98797 +#define flush_tlb_kernel_range(start, end) flush_tlb_all()
98798 +
98799 +static inline void flush_tlb_pgtables(struct mm_struct *mm,
98800 + unsigned long start, unsigned long end)
98801 +{
98802 + /* x86_64 does not keep any page table caches in a software TLB.
98803 + The CPUs do in their hardware TLBs, but they are handled
98804 + by the normal TLB flushing algorithms. */
98805 +}
98806 +
98807 +#endif /* _X8664_TLBFLUSH_H */
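On this port every range flush above collapses to a full xen_tlb_flush(); flush_tlb_kernel_range() is simply flush_tlb_all(). Below is a sketch of the usual call pattern after editing a kernel mapping by hand, illustrative only and not part of the patch; remap_kernel_page() is a made-up helper and set_pte() comes from the pgtable.h added earlier in this patch.

    static void remap_kernel_page(unsigned long vaddr, pte_t *ptep, pte_t pteval)
    {
            set_pte(ptep, pteval);                            /* install entry */
            flush_tlb_kernel_range(vaddr, vaddr + PAGE_SIZE); /* full flush here */
    }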
98808 diff -Nur linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/vga.h linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/vga.h
98809 --- linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/vga.h 1970-01-01 00:00:00.000000000 +0000
98810 +++ linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/vga.h 2007-01-08 15:00:46.000000000 +0000
98811 @@ -0,0 +1,20 @@
98812 +/*
98813 + * Access to VGA videoram
98814 + *
98815 + * (c) 1998 Martin Mares <mj@ucw.cz>
98816 + */
98817 +
98818 +#ifndef _LINUX_ASM_VGA_H_
98819 +#define _LINUX_ASM_VGA_H_
98820 +
98821 +/*
98822 + * On the PC, we can just recalculate addresses and then
98823 + * access the videoram directly without any black magic.
98824 + */
98825 +
98826 +#define VGA_MAP_MEM(x) (unsigned long)isa_bus_to_virt(x)
98827 +
98828 +#define vga_readb(x) (*(x))
98829 +#define vga_writeb(x,y) (*(y) = (x))
98830 +
98831 +#endif
98832 diff -Nur linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/xenoprof.h linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/xenoprof.h
98833 --- linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/xenoprof.h 1970-01-01 00:00:00.000000000 +0000
98834 +++ linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/xenoprof.h 2007-01-08 15:00:46.000000000 +0000
98835 @@ -0,0 +1 @@
98836 +#include <asm-i386/mach-xen/asm/xenoprof.h>
98837 diff -Nur linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/xor.h linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/xor.h
98838 --- linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/asm/xor.h 1970-01-01 00:00:00.000000000 +0000
98839 +++ linux-2.6.16.33/include/asm-x86_64/mach-xen/asm/xor.h 2007-01-08 15:00:46.000000000 +0000
98840 @@ -0,0 +1,328 @@
98841 +/*
98842 + * x86-64 changes / gcc fixes from Andi Kleen.
98843 + * Copyright 2002 Andi Kleen, SuSE Labs.
98844 + *
98845 + * This hasn't been optimized for the hammer yet, but there are likely
98846 + * no advantages to be gotten from x86-64 here anyways.
98847 + */
98848 +
98849 +typedef struct { unsigned long a,b; } __attribute__((aligned(16))) xmm_store_t;
98850 +
98851 +/* Doesn't use gcc to save the XMM registers, because there is no easy way to
98852 + tell it to do a clts before the register saving. */
98853 +#define XMMS_SAVE do { \
98854 + preempt_disable(); \
98855 + if (!(current_thread_info()->status & TS_USEDFPU)) \
98856 + clts(); \
98857 + __asm__ __volatile__ ( \
98858 + "movups %%xmm0,(%1) ;\n\t" \
98859 + "movups %%xmm1,0x10(%1) ;\n\t" \
98860 + "movups %%xmm2,0x20(%1) ;\n\t" \
98861 + "movups %%xmm3,0x30(%1) ;\n\t" \
98862 + : "=&r" (cr0) \
98863 + : "r" (xmm_save) \
98864 + : "memory"); \
98865 +} while(0)
98866 +
98867 +#define XMMS_RESTORE do { \
98868 + asm volatile ( \
98869 + "sfence ;\n\t" \
98870 + "movups (%1),%%xmm0 ;\n\t" \
98871 + "movups 0x10(%1),%%xmm1 ;\n\t" \
98872 + "movups 0x20(%1),%%xmm2 ;\n\t" \
98873 + "movups 0x30(%1),%%xmm3 ;\n\t" \
98874 + : \
98875 + : "r" (cr0), "r" (xmm_save) \
98876 + : "memory"); \
98877 + if (!(current_thread_info()->status & TS_USEDFPU)) \
98878 + stts(); \
98879 + preempt_enable(); \
98880 +} while(0)
98881 +
98882 +#define OFFS(x) "16*("#x")"
98883 +#define PF_OFFS(x) "256+16*("#x")"
98884 +#define PF0(x) " prefetchnta "PF_OFFS(x)"(%[p1]) ;\n"
98885 +#define LD(x,y) " movaps "OFFS(x)"(%[p1]), %%xmm"#y" ;\n"
98886 +#define ST(x,y) " movaps %%xmm"#y", "OFFS(x)"(%[p1]) ;\n"
98887 +#define PF1(x) " prefetchnta "PF_OFFS(x)"(%[p2]) ;\n"
98888 +#define PF2(x) " prefetchnta "PF_OFFS(x)"(%[p3]) ;\n"
98889 +#define PF3(x) " prefetchnta "PF_OFFS(x)"(%[p4]) ;\n"
98890 +#define PF4(x) " prefetchnta "PF_OFFS(x)"(%[p5]) ;\n"
98891 +#define PF5(x) " prefetchnta "PF_OFFS(x)"(%[p6]) ;\n"
98892 +#define XO1(x,y) " xorps "OFFS(x)"(%[p2]), %%xmm"#y" ;\n"
98893 +#define XO2(x,y) " xorps "OFFS(x)"(%[p3]), %%xmm"#y" ;\n"
98894 +#define XO3(x,y) " xorps "OFFS(x)"(%[p4]), %%xmm"#y" ;\n"
98895 +#define XO4(x,y) " xorps "OFFS(x)"(%[p5]), %%xmm"#y" ;\n"
98896 +#define XO5(x,y) " xorps "OFFS(x)"(%[p6]), %%xmm"#y" ;\n"
98897 +
98898 +
98899 +static void
98900 +xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
98901 +{
98902 + unsigned int lines = bytes >> 8;
98903 + unsigned long cr0;
98904 + xmm_store_t xmm_save[4];
98905 +
98906 + XMMS_SAVE;
98907 +
98908 + asm volatile (
98909 +#undef BLOCK
98910 +#define BLOCK(i) \
98911 + LD(i,0) \
98912 + LD(i+1,1) \
98913 + PF1(i) \
98914 + PF1(i+2) \
98915 + LD(i+2,2) \
98916 + LD(i+3,3) \
98917 + PF0(i+4) \
98918 + PF0(i+6) \
98919 + XO1(i,0) \
98920 + XO1(i+1,1) \
98921 + XO1(i+2,2) \
98922 + XO1(i+3,3) \
98923 + ST(i,0) \
98924 + ST(i+1,1) \
98925 + ST(i+2,2) \
98926 + ST(i+3,3) \
98927 +
98928 +
98929 + PF0(0)
98930 + PF0(2)
98931 +
98932 + " .align 32 ;\n"
98933 + " 1: ;\n"
98934 +
98935 + BLOCK(0)
98936 + BLOCK(4)
98937 + BLOCK(8)
98938 + BLOCK(12)
98939 +
98940 + " addq %[inc], %[p1] ;\n"
98941 + " addq %[inc], %[p2] ;\n"
98942 + " decl %[cnt] ; jnz 1b"
98943 + : [p1] "+r" (p1), [p2] "+r" (p2), [cnt] "+r" (lines)
98944 + : [inc] "r" (256UL)
98945 + : "memory");
98946 +
98947 + XMMS_RESTORE;
98948 +}
98949 +
98950 +static void
98951 +xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
98952 + unsigned long *p3)
98953 +{
98954 + unsigned int lines = bytes >> 8;
98955 + xmm_store_t xmm_save[4];
98956 + unsigned long cr0;
98957 +
98958 + XMMS_SAVE;
98959 +
98960 + __asm__ __volatile__ (
98961 +#undef BLOCK
98962 +#define BLOCK(i) \
98963 + PF1(i) \
98964 + PF1(i+2) \
98965 + LD(i,0) \
98966 + LD(i+1,1) \
98967 + LD(i+2,2) \
98968 + LD(i+3,3) \
98969 + PF2(i) \
98970 + PF2(i+2) \
98971 + PF0(i+4) \
98972 + PF0(i+6) \
98973 + XO1(i,0) \
98974 + XO1(i+1,1) \
98975 + XO1(i+2,2) \
98976 + XO1(i+3,3) \
98977 + XO2(i,0) \
98978 + XO2(i+1,1) \
98979 + XO2(i+2,2) \
98980 + XO2(i+3,3) \
98981 + ST(i,0) \
98982 + ST(i+1,1) \
98983 + ST(i+2,2) \
98984 + ST(i+3,3) \
98985 +
98986 +
98987 + PF0(0)
98988 + PF0(2)
98989 +
98990 + " .align 32 ;\n"
98991 + " 1: ;\n"
98992 +
98993 + BLOCK(0)
98994 + BLOCK(4)
98995 + BLOCK(8)
98996 + BLOCK(12)
98997 +
98998 + " addq %[inc], %[p1] ;\n"
98999 + " addq %[inc], %[p2] ;\n"
99000 + " addq %[inc], %[p3] ;\n"
99001 + " decl %[cnt] ; jnz 1b"
99002 + : [cnt] "+r" (lines),
99003 + [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
99004 + : [inc] "r" (256UL)
99005 + : "memory");
99006 + XMMS_RESTORE;
99007 +}
99008 +
99009 +static void
99010 +xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
99011 + unsigned long *p3, unsigned long *p4)
99012 +{
99013 + unsigned int lines = bytes >> 8;
99014 + xmm_store_t xmm_save[4];
99015 + unsigned long cr0;
99016 +
99017 + XMMS_SAVE;
99018 +
99019 + __asm__ __volatile__ (
99020 +#undef BLOCK
99021 +#define BLOCK(i) \
99022 + PF1(i) \
99023 + PF1(i+2) \
99024 + LD(i,0) \
99025 + LD(i+1,1) \
99026 + LD(i+2,2) \
99027 + LD(i+3,3) \
99028 + PF2(i) \
99029 + PF2(i+2) \
99030 + XO1(i,0) \
99031 + XO1(i+1,1) \
99032 + XO1(i+2,2) \
99033 + XO1(i+3,3) \
99034 + PF3(i) \
99035 + PF3(i+2) \
99036 + PF0(i+4) \
99037 + PF0(i+6) \
99038 + XO2(i,0) \
99039 + XO2(i+1,1) \
99040 + XO2(i+2,2) \
99041 + XO2(i+3,3) \
99042 + XO3(i,0) \
99043 + XO3(i+1,1) \
99044 + XO3(i+2,2) \
99045 + XO3(i+3,3) \
99046 + ST(i,0) \
99047 + ST(i+1,1) \
99048 + ST(i+2,2) \
99049 + ST(i+3,3) \
99050 +
99051 +
99052 + PF0(0)
99053 + PF0(2)
99054 +
99055 + " .align 32 ;\n"
99056 + " 1: ;\n"
99057 +
99058 + BLOCK(0)
99059 + BLOCK(4)
99060 + BLOCK(8)
99061 + BLOCK(12)
99062 +
99063 + " addq %[inc], %[p1] ;\n"
99064 + " addq %[inc], %[p2] ;\n"
99065 + " addq %[inc], %[p3] ;\n"
99066 + " addq %[inc], %[p4] ;\n"
99067 + " decl %[cnt] ; jnz 1b"
99068 + : [cnt] "+c" (lines),
99069 + [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
99070 + : [inc] "r" (256UL)
99071 + : "memory" );
99072 +
99073 + XMMS_RESTORE;
99074 +}
99075 +
99076 +static void
99077 +xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
99078 + unsigned long *p3, unsigned long *p4, unsigned long *p5)
99079 +{
99080 + unsigned int lines = bytes >> 8;
99081 + xmm_store_t xmm_save[4];
99082 + unsigned long cr0;
99083 +
99084 + XMMS_SAVE;
99085 +
99086 + __asm__ __volatile__ (
99087 +#undef BLOCK
99088 +#define BLOCK(i) \
99089 + PF1(i) \
99090 + PF1(i+2) \
99091 + LD(i,0) \
99092 + LD(i+1,1) \
99093 + LD(i+2,2) \
99094 + LD(i+3,3) \
99095 + PF2(i) \
99096 + PF2(i+2) \
99097 + XO1(i,0) \
99098 + XO1(i+1,1) \
99099 + XO1(i+2,2) \
99100 + XO1(i+3,3) \
99101 + PF3(i) \
99102 + PF3(i+2) \
99103 + XO2(i,0) \
99104 + XO2(i+1,1) \
99105 + XO2(i+2,2) \
99106 + XO2(i+3,3) \
99107 + PF4(i) \
99108 + PF4(i+2) \
99109 + PF0(i+4) \
99110 + PF0(i+6) \
99111 + XO3(i,0) \
99112 + XO3(i+1,1) \
99113 + XO3(i+2,2) \
99114 + XO3(i+3,3) \
99115 + XO4(i,0) \
99116 + XO4(i+1,1) \
99117 + XO4(i+2,2) \
99118 + XO4(i+3,3) \
99119 + ST(i,0) \
99120 + ST(i+1,1) \
99121 + ST(i+2,2) \
99122 + ST(i+3,3) \
99123 +
99124 +
99125 + PF0(0)
99126 + PF0(2)
99127 +
99128 + " .align 32 ;\n"
99129 + " 1: ;\n"
99130 +
99131 + BLOCK(0)
99132 + BLOCK(4)
99133 + BLOCK(8)
99134 + BLOCK(12)
99135 +
99136 + " addq %[inc], %[p1] ;\n"
99137 + " addq %[inc], %[p2] ;\n"
99138 + " addq %[inc], %[p3] ;\n"
99139 + " addq %[inc], %[p4] ;\n"
99140 + " addq %[inc], %[p5] ;\n"
99141 + " decl %[cnt] ; jnz 1b"
99142 + : [cnt] "+c" (lines),
99143 + [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4),
99144 + [p5] "+r" (p5)
99145 + : [inc] "r" (256UL)
99146 + : "memory");
99147 +
99148 + XMMS_RESTORE;
99149 +}
99150 +
99151 +static struct xor_block_template xor_block_sse = {
99152 + .name = "generic_sse",
99153 + .do_2 = xor_sse_2,
99154 + .do_3 = xor_sse_3,
99155 + .do_4 = xor_sse_4,
99156 + .do_5 = xor_sse_5,
99157 +};
99158 +
99159 +#undef XOR_TRY_TEMPLATES
99160 +#define XOR_TRY_TEMPLATES \
99161 + do { \
99162 + xor_speed(&xor_block_sse); \
99163 + } while (0)
99164 +
99165 +/* We force the use of the SSE xor block because it can write around L2.
99166 + We may also be able to load into the L1 only depending on how the cpu
99167 + deals with a load to a line that is being prefetched. */
99168 +#define XOR_SELECT_TEMPLATE(FASTEST) (&xor_block_sse)
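
A plain-C reference model of what xor_sse_2() computes, with the SSE scheduling, prefetching and FPU save/restore stripped away: the first buffer is XORed in place with the second. This is only an illustration; 'bytes' is assumed to be a multiple of 256, as in the SSE loop.

#include <stddef.h>

/* Reference model: p1[i] ^= p2[i] over 'bytes' bytes. */
static void xor_2_reference(size_t bytes, unsigned long *p1,
                            const unsigned long *p2)
{
        size_t i;

        for (i = 0; i < bytes / sizeof(unsigned long); i++)
                p1[i] ^= p2[i];
}
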
99169 diff -Nur linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/irq_vectors.h linux-2.6.16.33/include/asm-x86_64/mach-xen/irq_vectors.h
99170 --- linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/irq_vectors.h 1970-01-01 00:00:00.000000000 +0000
99171 +++ linux-2.6.16.33/include/asm-x86_64/mach-xen/irq_vectors.h 2007-01-08 15:00:46.000000000 +0000
99172 @@ -0,0 +1,123 @@
99173 +/*
99174 + * This file should contain #defines for all of the interrupt vector
99175 + * numbers used by this architecture.
99176 + *
99177 + * In addition, there are some standard defines:
99178 + *
99179 + * FIRST_EXTERNAL_VECTOR:
99180 + * The first free place for external interrupts
99181 + *
99182 + * SYSCALL_VECTOR:
99183 + * The IRQ vector a syscall makes the user to kernel transition
99184 + * under.
99185 + *
99186 + * TIMER_IRQ:
99187 + * The IRQ number the timer interrupt comes in at.
99188 + *
99189 + * NR_IRQS:
99190 + * The total number of interrupt vectors (including all the
99191 + * architecture specific interrupts) needed.
99192 + *
99193 + */
99194 +#ifndef _ASM_IRQ_VECTORS_H
99195 +#define _ASM_IRQ_VECTORS_H
99196 +
99197 +/*
99198 + * IDT vectors usable for external interrupt sources start
99199 + * at 0x20:
99200 + */
99201 +#define FIRST_EXTERNAL_VECTOR 0x20
99202 +
99203 +#define SYSCALL_VECTOR 0x80
99204 +
99205 +/*
99206 + * Vectors 0x20-0x2f are used for ISA interrupts.
99207 + */
99208 +
99209 +#if 0
99210 +/*
99211 + * Special IRQ vectors used by the SMP architecture, 0xf0-0xff
99212 + *
99213 + * some of the following vectors are 'rare', they are merged
99214 + * into a single vector (CALL_FUNCTION_VECTOR) to save vector space.
99215 + * TLB, reschedule and local APIC vectors are performance-critical.
99216 + *
99217 + * Vectors 0xf0-0xfa are free (reserved for future Linux use).
99218 + */
99219 +#define INVALIDATE_TLB_VECTOR 0xfd
99220 +#define RESCHEDULE_VECTOR 0xfc
99221 +#define CALL_FUNCTION_VECTOR 0xfb
99222 +
99223 +#define THERMAL_APIC_VECTOR 0xf0
99224 +/*
99225 + * Local APIC timer IRQ vector is on a different priority level,
99226 + * to work around the 'lost local interrupt if more than 2 IRQ
99227 + * sources per level' errata.
99228 + */
99229 +#define LOCAL_TIMER_VECTOR 0xef
99230 +#endif
99231 +
99232 +#define SPURIOUS_APIC_VECTOR 0xff
99233 +#define ERROR_APIC_VECTOR 0xfe
99234 +
99235 +/*
99236 + * First APIC vector available to drivers: (vectors 0x30-0xee)
99237 + * we start at 0x31 to spread out vectors evenly between priority
99238 + * levels. (0x80 is the syscall vector)
99239 + */
99240 +#define FIRST_DEVICE_VECTOR 0x31
99241 +#define FIRST_SYSTEM_VECTOR 0xef
99242 +
99243 +/*
99244 + * 16 8259A IRQ's, 208 potential APIC interrupt sources.
99245 + * Right now the APIC is mostly only used for SMP.
99246 + * 256 vectors is an architectural limit. (we can have
99247 + * more than 256 devices theoretically, but they will
99248 + * have to use shared interrupts)
99249 + * Since vectors 0x00-0x1f are used/reserved for the CPU,
99250 + * the usable vector space is 0x20-0xff (224 vectors)
99251 + */
99252 +
99253 +#define RESCHEDULE_VECTOR 0
99254 +#define CALL_FUNCTION_VECTOR 1
99255 +#define NR_IPIS 2
99256 +
99257 +/*
99258 + * The maximum number of vectors supported by i386 processors
99259 + * is limited to 256. For processors other than i386, NR_VECTORS
99260 + * should be changed accordingly.
99261 + */
99262 +#define NR_VECTORS 256
99263 +
99264 +#define FPU_IRQ 13
99265 +
99266 +#define FIRST_VM86_IRQ 3
99267 +#define LAST_VM86_IRQ 15
99268 +#define invalid_vm86_irq(irq) ((irq) < 3 || (irq) > 15)
99269 +
99270 +/*
99271 + * The flat IRQ space is divided into two regions:
99272 + * 1. A one-to-one mapping of real physical IRQs. This space is only used
99273 + * if we have physical device-access privilege. This region is at the
99274 + * start of the IRQ space so that existing device drivers do not need
99275 + * to be modified to translate physical IRQ numbers into our IRQ space.
99276 + * 2. A dynamic mapping of inter-domain and Xen-sourced virtual IRQs. These
99277 + * are bound using the provided bind/unbind functions.
99278 + */
99279 +
99280 +#define PIRQ_BASE 0
99281 +#define NR_PIRQS 256
99282 +
99283 +#define DYNIRQ_BASE (PIRQ_BASE + NR_PIRQS)
99284 +#define NR_DYNIRQS 256
99285 +
99286 +#define NR_IRQS (NR_PIRQS + NR_DYNIRQS)
99287 +#define NR_IRQ_VECTORS NR_IRQS
99288 +
99289 +#define pirq_to_irq(_x) ((_x) + PIRQ_BASE)
99290 +#define irq_to_pirq(_x) ((_x) - PIRQ_BASE)
99291 +
99292 +#define dynirq_to_irq(_x) ((_x) + DYNIRQ_BASE)
99293 +#define irq_to_dynirq(_x) ((_x) - DYNIRQ_BASE)
99294 +
99295 +#endif /* _ASM_IRQ_VECTORS_H */
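
The flat IRQ space is easiest to see with concrete numbers. A small user-space model using the values defined above (only the printing is new):

#include <stdio.h>

#define PIRQ_BASE        0
#define NR_PIRQS         256
#define DYNIRQ_BASE      (PIRQ_BASE + NR_PIRQS)

#define pirq_to_irq(x)   ((x) + PIRQ_BASE)
#define dynirq_to_irq(x) ((x) + DYNIRQ_BASE)

int main(void)
{
        /* Physical IRQs keep their numbers; dynamic (event-channel)
         * IRQs start right after them, at 256. */
        printf("pirq 14  -> irq %d\n", pirq_to_irq(14));   /* prints 14  */
        printf("dynirq 3 -> irq %d\n", dynirq_to_irq(3));  /* prints 259 */
        return 0;
}
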
99296 diff -Nur linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/mach_time.h linux-2.6.16.33/include/asm-x86_64/mach-xen/mach_time.h
99297 --- linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/mach_time.h 1970-01-01 00:00:00.000000000 +0000
99298 +++ linux-2.6.16.33/include/asm-x86_64/mach-xen/mach_time.h 2007-01-08 15:00:46.000000000 +0000
99299 @@ -0,0 +1,122 @@
99300 +/*
99301 + * include/asm-i386/mach-default/mach_time.h
99302 + *
99303 + * Machine specific set RTC function for generic.
99304 + * Split out from time.c by Osamu Tomita <tomita@cinet.co.jp>
99305 + */
99306 +#ifndef _MACH_TIME_H
99307 +#define _MACH_TIME_H
99308 +
99309 +#include <asm-i386/mc146818rtc.h>
99310 +
99311 +/* for check timing call set_rtc_mmss() 500ms */
99312 +/* used in arch/i386/time.c::do_timer_interrupt() */
99313 +#define USEC_AFTER 500000
99314 +#define USEC_BEFORE 500000
99315 +
99316 +/*
99317 + * In order to set the CMOS clock precisely, set_rtc_mmss has to be
99318 + * called 500 ms after the second nowtime has started, because when
99319 + * nowtime is written into the registers of the CMOS clock, it will
99320 + * jump to the next second precisely 500 ms later. Check the Motorola
99321 + * MC146818A or Dallas DS12887 data sheet for details.
99322 + *
99323 + * BUG: This routine does not handle hour overflow properly; it just
99324 + * sets the minutes. Usually you'll only notice that after reboot!
99325 + */
99326 +static inline int mach_set_rtc_mmss(unsigned long nowtime)
99327 +{
99328 + int retval = 0;
99329 + int real_seconds, real_minutes, cmos_minutes;
99330 + unsigned char save_control, save_freq_select;
99331 +
99332 + save_control = CMOS_READ(RTC_CONTROL); /* tell the clock it's being set */
99333 + CMOS_WRITE((save_control|RTC_SET), RTC_CONTROL);
99334 +
99335 + save_freq_select = CMOS_READ(RTC_FREQ_SELECT); /* stop and reset prescaler */
99336 + CMOS_WRITE((save_freq_select|RTC_DIV_RESET2), RTC_FREQ_SELECT);
99337 +
99338 + cmos_minutes = CMOS_READ(RTC_MINUTES);
99339 + if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD)
99340 + BCD_TO_BIN(cmos_minutes);
99341 +
99342 + /*
99343 + * since we're only adjusting minutes and seconds,
99344 + * don't interfere with hour overflow. This avoids
99345 + * messing with unknown time zones but requires your
99346 + * RTC not to be off by more than 15 minutes
99347 + */
99348 + real_seconds = nowtime % 60;
99349 + real_minutes = nowtime / 60;
99350 + if (((abs(real_minutes - cmos_minutes) + 15)/30) & 1)
99351 + real_minutes += 30; /* correct for half hour time zone */
99352 + real_minutes %= 60;
99353 +
99354 + if (abs(real_minutes - cmos_minutes) < 30) {
99355 + if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD) {
99356 + BIN_TO_BCD(real_seconds);
99357 + BIN_TO_BCD(real_minutes);
99358 + }
99359 + CMOS_WRITE(real_seconds,RTC_SECONDS);
99360 + CMOS_WRITE(real_minutes,RTC_MINUTES);
99361 + } else {
99362 + printk(KERN_WARNING
99363 + "set_rtc_mmss: can't update from %d to %d\n",
99364 + cmos_minutes, real_minutes);
99365 + retval = -1;
99366 + }
99367 +
99368 + /* The following flags have to be released exactly in this order,
99369 + * otherwise the DS12887 (popular MC146818A clone with integrated
99370 + * battery and quartz) will not reset the oscillator and will not
99371 + * update precisely 500 ms later. You won't find this mentioned in
99372 + * the Dallas Semiconductor data sheets, but who believes data
99373 + * sheets anyway ... -- Markus Kuhn
99374 + */
99375 + CMOS_WRITE(save_control, RTC_CONTROL);
99376 + CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT);
99377 +
99378 + return retval;
99379 +}
99380 +
99381 +static inline unsigned long mach_get_cmos_time(void)
99382 +{
99383 + unsigned int year, mon, day, hour, min, sec;
99384 + int i;
99385 +
99386 + /* The Linux interpretation of the CMOS clock register contents:
99387 + * When the Update-In-Progress (UIP) flag goes from 1 to 0, the
99388 + * RTC registers show the second which has precisely just started.
99389 + * Let's hope other operating systems interpret the RTC the same way.
99390 + */
99391 + /* read RTC exactly on falling edge of update flag */
99392 + for (i = 0 ; i < 1000000 ; i++) /* may take up to 1 second... */
99393 + if (CMOS_READ(RTC_FREQ_SELECT) & RTC_UIP)
99394 + break;
99395 + for (i = 0 ; i < 1000000 ; i++) /* must try at least 2.228 ms */
99396 + if (!(CMOS_READ(RTC_FREQ_SELECT) & RTC_UIP))
99397 + break;
99398 + do { /* Isn't this overkill ? UIP above should guarantee consistency */
99399 + sec = CMOS_READ(RTC_SECONDS);
99400 + min = CMOS_READ(RTC_MINUTES);
99401 + hour = CMOS_READ(RTC_HOURS);
99402 + day = CMOS_READ(RTC_DAY_OF_MONTH);
99403 + mon = CMOS_READ(RTC_MONTH);
99404 + year = CMOS_READ(RTC_YEAR);
99405 + } while (sec != CMOS_READ(RTC_SECONDS));
99406 + if (!(CMOS_READ(RTC_CONTROL) & RTC_DM_BINARY) || RTC_ALWAYS_BCD)
99407 + {
99408 + BCD_TO_BIN(sec);
99409 + BCD_TO_BIN(min);
99410 + BCD_TO_BIN(hour);
99411 + BCD_TO_BIN(day);
99412 + BCD_TO_BIN(mon);
99413 + BCD_TO_BIN(year);
99414 + }
99415 + if ((year += 1900) < 1970)
99416 + year += 100;
99417 +
99418 + return mktime(year, mon, day, hour, min, sec);
99419 +}
99420 +
99421 +#endif /* !_MACH_TIME_H */
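
The half-hour correction in mach_set_rtc_mmss() is compact enough to misread, so here is a stand-alone model of just that step (the CMOS register accesses are omitted; the example values are illustrative):

#include <stdio.h>
#include <stdlib.h>

/* If system time and RTC differ by an odd number of half hours, shift by
 * 30 minutes so only a whole-hour (timezone) offset is left untouched. */
static int adjusted_minutes(int real_minutes, int cmos_minutes)
{
        if (((abs(real_minutes - cmos_minutes) + 15) / 30) & 1)
                real_minutes += 30;
        return real_minutes % 60;
}

int main(void)
{
        /* RTC kept on local time 30 minutes ahead: write 45, not 15. */
        printf("%d\n", adjusted_minutes(15, 45));
        return 0;
}
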
99422 diff -Nur linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/mach_timer.h linux-2.6.16.33/include/asm-x86_64/mach-xen/mach_timer.h
99423 --- linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/mach_timer.h 1970-01-01 00:00:00.000000000 +0000
99424 +++ linux-2.6.16.33/include/asm-x86_64/mach-xen/mach_timer.h 2007-01-08 15:00:46.000000000 +0000
99425 @@ -0,0 +1,48 @@
99426 +/*
99427 + * include/asm-i386/mach-default/mach_timer.h
99428 + *
99429 + * Machine specific calibrate_tsc() for generic.
99430 + * Split out from timer_tsc.c by Osamu Tomita <tomita@cinet.co.jp>
99431 + */
99432 +/* ------ Calibrate the TSC -------
99433 + * Return 2^32 * (1 / (TSC clocks per usec)) for do_fast_gettimeoffset().
99434 + * Too much 64-bit arithmetic here to do this cleanly in C, and for
99435 + * accuracy's sake we want to keep the overhead on the CTC speaker (channel 2)
99436 + * output busy loop as low as possible. We avoid reading the CTC registers
99437 + * directly because of the awkward 8-bit access mechanism of the 82C54
99438 + * device.
99439 + */
99440 +#ifndef _MACH_TIMER_H
99441 +#define _MACH_TIMER_H
99442 +
99443 +#define CALIBRATE_LATCH (5 * LATCH)
99444 +
99445 +static inline void mach_prepare_counter(void)
99446 +{
99447 + /* Set the Gate high, disable speaker */
99448 + outb((inb(0x61) & ~0x02) | 0x01, 0x61);
99449 +
99450 + /*
99451 + * Now let's take care of CTC channel 2
99452 + *
99453 + * Set the Gate high, program CTC channel 2 for mode 0,
99454 + * (interrupt on terminal count mode), binary count,
99455 + * load 5 * LATCH count, (LSB and MSB) to begin countdown.
99456 + *
99457 + * Some devices need a delay here.
99458 + */
99459 + outb(0xb0, 0x43); /* binary, mode 0, LSB/MSB, Ch 2 */
99460 + outb_p(CALIBRATE_LATCH & 0xff, 0x42); /* LSB of count */
99461 + outb_p(CALIBRATE_LATCH >> 8, 0x42); /* MSB of count */
99462 +}
99463 +
99464 +static inline void mach_countup(unsigned long *count_p)
99465 +{
99466 + unsigned long count = 0;
99467 + do {
99468 + count++;
99469 + } while ((inb_p(0x61) & 0x20) == 0);
99470 + *count_p = count;
99471 +}
99472 +
99473 +#endif /* !_MACH_TIMER_H */
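
Roughly how the TSC calibration path is expected to drive these two helpers (a sketch, not the actual calibrate_tsc(); rdtscl() comes from <asm/msr.h>): program PIT channel 2 for CALIBRATE_LATCH ticks, then count how many TSC cycles pass before the terminal count fires.

#include <asm/msr.h>

static unsigned long tsc_cycles_per_gate(void)
{
        unsigned long start, end, count;

        mach_prepare_counter();     /* raise the gate, load the latch */
        rdtscl(start);
        mach_countup(&count);       /* spin until terminal count      */
        rdtscl(end);

        return end - start;         /* TSC cycles over the gate time  */
}
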
99474 diff -Nur linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/setup_arch_post.h linux-2.6.16.33/include/asm-x86_64/mach-xen/setup_arch_post.h
99475 --- linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/setup_arch_post.h 1970-01-01 00:00:00.000000000 +0000
99476 +++ linux-2.6.16.33/include/asm-x86_64/mach-xen/setup_arch_post.h 2007-01-08 15:00:46.000000000 +0000
99477 @@ -0,0 +1,63 @@
99478 +/**
99479 + * machine_specific_* - Hooks for machine specific setup.
99480 + *
99481 + * Description:
99482 + * This is included late in kernel/setup.c so that it can make
99483 + * use of all of the static functions.
99484 + **/
99485 +
99486 +#include <xen/interface/callback.h>
99487 +
99488 +extern void hypervisor_callback(void);
99489 +extern void failsafe_callback(void);
99490 +extern void nmi(void);
99491 +
99492 +static void __init machine_specific_arch_setup(void)
99493 +{
99494 + int ret;
99495 + static struct callback_register __initdata event = {
99496 + .type = CALLBACKTYPE_event,
99497 + .address = (unsigned long) hypervisor_callback,
99498 + };
99499 + static struct callback_register __initdata failsafe = {
99500 + .type = CALLBACKTYPE_failsafe,
99501 + .address = (unsigned long)failsafe_callback,
99502 + };
99503 + static struct callback_register __initdata syscall = {
99504 + .type = CALLBACKTYPE_syscall,
99505 + .address = (unsigned long)system_call,
99506 + };
99507 +#ifdef CONFIG_X86_LOCAL_APIC
99508 + static struct callback_register __initdata nmi_cb = {
99509 + .type = CALLBACKTYPE_nmi,
99510 + .address = (unsigned long)nmi,
99511 + };
99512 +#endif
99513 +
99514 + ret = HYPERVISOR_callback_op(CALLBACKOP_register, &event);
99515 + if (ret == 0)
99516 + ret = HYPERVISOR_callback_op(CALLBACKOP_register, &failsafe);
99517 + if (ret == 0)
99518 + ret = HYPERVISOR_callback_op(CALLBACKOP_register, &syscall);
99519 +#ifdef CONFIG_XEN_COMPAT_030002
99520 + if (ret == -ENOSYS)
99521 + ret = HYPERVISOR_set_callbacks(
99522 + event.address,
99523 + failsafe.address,
99524 + syscall.address);
99525 +#endif
99526 + BUG_ON(ret);
99527 +
99528 +#ifdef CONFIG_X86_LOCAL_APIC
99529 + ret = HYPERVISOR_callback_op(CALLBACKOP_register, &nmi_cb);
99530 +#ifdef CONFIG_XEN_COMPAT_030002
99531 + if (ret == -ENOSYS) {
99532 + static struct xennmi_callback __initdata cb = {
99533 + .handler_address = (unsigned long)nmi
99534 + };
99535 +
99536 + HYPERVISOR_nmi_op(XENNMI_register_callback, &cb);
99537 + }
99538 +#endif
99539 +#endif
99540 +}
99541 diff -Nur linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/setup_arch_pre.h linux-2.6.16.33/include/asm-x86_64/mach-xen/setup_arch_pre.h
99542 --- linux-2.6.16.33-noxen/include/asm-x86_64/mach-xen/setup_arch_pre.h 1970-01-01 00:00:00.000000000 +0000
99543 +++ linux-2.6.16.33/include/asm-x86_64/mach-xen/setup_arch_pre.h 2007-01-08 15:00:46.000000000 +0000
99544 @@ -0,0 +1,5 @@
99545 +/* Hook to call BIOS initialisation function */
99546 +
99547 +#define ARCH_SETUP machine_specific_arch_setup();
99548 +
99549 +static void __init machine_specific_arch_setup(void);
99550 diff -Nur linux-2.6.16.33-noxen/include/linux/aio.h linux-2.6.16.33/include/linux/aio.h
99551 --- linux-2.6.16.33-noxen/include/linux/aio.h 2006-11-22 18:06:31.000000000 +0000
99552 +++ linux-2.6.16.33/include/linux/aio.h 2007-05-23 21:00:01.000000000 +0000
99553 @@ -191,6 +191,11 @@
99554 struct aio_ring_info ring_info;
99555
99556 struct work_struct wq;
99557 +#ifdef CONFIG_EPOLL
99558 + // poll integration
99559 + wait_queue_head_t poll_wait;
99560 + struct file *file;
99561 +#endif
99562 };
99563
99564 /* prototypes */
99565 diff -Nur linux-2.6.16.33-noxen/include/linux/elfnote.h linux-2.6.16.33/include/linux/elfnote.h
99566 --- linux-2.6.16.33-noxen/include/linux/elfnote.h 1970-01-01 00:00:00.000000000 +0000
99567 +++ linux-2.6.16.33/include/linux/elfnote.h 2007-05-23 21:00:01.000000000 +0000
99568 @@ -0,0 +1,90 @@
99569 +#ifndef _LINUX_ELFNOTE_H
99570 +#define _LINUX_ELFNOTE_H
99571 +/*
99572 + * Helper macros to generate ELF Note structures, which are put into a
99573 + * PT_NOTE segment of the final vmlinux image. These are useful for
99574 + * including name-value pairs of metadata into the kernel binary (or
99575 + * modules?) for use by external programs.
99576 + *
99577 + * Each note has three parts: a name, a type and a desc. The name is
99578 + * intended to distinguish the note's originator, so it would be a
99579 + * company, project, subsystem, etc; it must be in a suitable form for
99580 + * use in a section name. The type is an integer which is used to tag
99581 + * the data, and is considered to be within the "name" namespace (so
99582 + * "FooCo"'s type 42 is distinct from "BarProj"'s type 42). The
99583 + * "desc" field is the actual data. There are no constraints on the
99584 + * desc field's contents, though typically they're fairly small.
99585 + *
99586 + * All notes from a given NAME are put into a section named
99587 + * .note.NAME. When the kernel image is finally linked, all the notes
99588 + * are packed into a single .notes section, which is mapped into the
99589 + * PT_NOTE segment. Because notes for a given name are grouped into
99590 + * the same section, they'll all be adjacent in the output file.
99591 + *
99592 + * This file defines macros for both C and assembler use. Their
99593 + * syntax is slightly different, but they're semantically similar.
99594 + *
99595 + * See the ELF specification for more detail about ELF notes.
99596 + */
99597 +
99598 +#ifdef __ASSEMBLER__
99599 +/*
99600 + * Generate a structure with the same shape as Elf{32,64}_Nhdr (which
99601 + * turn out to be the same size and shape), followed by the name and
99602 + * desc data with appropriate padding. The 'desctype' argument is the
99603 + * assembler pseudo op defining the type of the data e.g. .asciz while
99604 + * 'descdata' is the data itself e.g. "hello, world".
99605 + *
99606 + * e.g. ELFNOTE(XYZCo, 42, .asciz, "forty-two")
99607 + * ELFNOTE(XYZCo, 12, .long, 0xdeadbeef)
99608 + */
99609 +#define ELFNOTE(name, type, desctype, descdata) \
99610 +.pushsection .note.name ; \
99611 + .align 4 ; \
99612 + .long 2f - 1f /* namesz */ ; \
99613 + .long 4f - 3f /* descsz */ ; \
99614 + .long type ; \
99615 +1:.asciz "name" ; \
99616 +2:.align 4 ; \
99617 +3:desctype descdata ; \
99618 +4:.align 4 ; \
99619 +.popsection ;
99620 +#else /* !__ASSEMBLER__ */
99621 +#include <linux/elf.h>
99622 +/*
99623 + * Use an anonymous structure which matches the shape of
99624 + * Elf{32,64}_Nhdr, but includes the name and desc data. The size and
99625 + * type of name and desc depend on the macro arguments. "name" must
99626 + * be a literal string, and "desc" must be passed by value. You may
99627 + * only define one note per line, since __LINE__ is used to generate
99628 + * unique symbols.
99629 + */
99630 +#define _ELFNOTE_PASTE(a,b) a##b
99631 +#define _ELFNOTE(size, name, unique, type, desc) \
99632 + static const struct { \
99633 + struct elf##size##_note _nhdr; \
99634 + unsigned char _name[sizeof(name)] \
99635 + __attribute__((aligned(sizeof(Elf##size##_Word)))); \
99636 + typeof(desc) _desc \
99637 + __attribute__((aligned(sizeof(Elf##size##_Word)))); \
99638 + } _ELFNOTE_PASTE(_note_, unique) \
99639 + __attribute_used__ \
99640 + __attribute__((section(".note." name), \
99641 + aligned(sizeof(Elf##size##_Word)), \
99642 + unused)) = { \
99643 + { \
99644 + sizeof(name), \
99645 + sizeof(desc), \
99646 + type, \
99647 + }, \
99648 + name, \
99649 + desc \
99650 + }
99651 +#define ELFNOTE(size, name, type, desc) \
99652 + _ELFNOTE(size, name, __LINE__, type, desc)
99653 +
99654 +#define ELFNOTE32(name, type, desc) ELFNOTE(32, name, type, desc)
99655 +#define ELFNOTE64(name, type, desc) ELFNOTE(64, name, type, desc)
99656 +#endif /* __ASSEMBLER__ */
99657 +
99658 +#endif /* _LINUX_ELFNOTE_H */
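
A minimal C-side usage example, reusing the same illustrative note name and type as the assembler example above (the values mean nothing; they only show the shape of a call):

#include <linux/elfnote.h>

/* Emits a 32-bit note of type 42 into section ".note.XYZCo"; after the
 * final link it ends up in the kernel's PT_NOTE segment. */
ELFNOTE32("XYZCo", 42, 0xdeadbeef);
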
99659 diff -Nur linux-2.6.16.33-noxen/include/linux/ethtool.h linux-2.6.16.33/include/linux/ethtool.h
99660 --- linux-2.6.16.33-noxen/include/linux/ethtool.h 2006-11-22 18:06:31.000000000 +0000
99661 +++ linux-2.6.16.33/include/linux/ethtool.h 2007-05-23 21:00:01.000000000 +0000
99662 @@ -408,6 +408,8 @@
99663 #define ETHTOOL_GPERMADDR 0x00000020 /* Get permanent hardware address */
99664 #define ETHTOOL_GUFO 0x00000021 /* Get UFO enable (ethtool_value) */
99665 #define ETHTOOL_SUFO 0x00000022 /* Set UFO enable (ethtool_value) */
99666 +#define ETHTOOL_GGSO 0x00000023 /* Get GSO enable (ethtool_value) */
99667 +#define ETHTOOL_SGSO 0x00000024 /* Set GSO enable (ethtool_value) */
99668
99669 /* compatibility with older code */
99670 #define SPARC_ETH_GSET ETHTOOL_GSET
99671 diff -Nur linux-2.6.16.33-noxen/include/linux/eventpoll.h linux-2.6.16.33/include/linux/eventpoll.h
99672 --- linux-2.6.16.33-noxen/include/linux/eventpoll.h 2006-11-22 18:06:31.000000000 +0000
99673 +++ linux-2.6.16.33/include/linux/eventpoll.h 2007-05-23 21:00:01.000000000 +0000
99674 @@ -86,6 +86,12 @@
99675 }
99676
99677
99678 +/*
99679 + * called by aio code to create an fd that can poll the aio event queue
99680 + */
99681 +struct eventpoll;
99682 +int ep_getfd(int *efd, struct inode **einode, struct file **efile,
99683 + struct eventpoll *ep, struct file_operations *fops);
99684 #else
99685
99686 static inline void eventpoll_init_file(struct file *file) {}
99687 diff -Nur linux-2.6.16.33-noxen/include/linux/gfp.h linux-2.6.16.33/include/linux/gfp.h
99688 --- linux-2.6.16.33-noxen/include/linux/gfp.h 2006-11-22 18:06:31.000000000 +0000
99689 +++ linux-2.6.16.33/include/linux/gfp.h 2007-01-08 15:00:46.000000000 +0000
99690 @@ -98,7 +98,11 @@
99691 */
99692
99693 #ifndef HAVE_ARCH_FREE_PAGE
99694 -static inline void arch_free_page(struct page *page, int order) { }
99695 +/*
99696 + * If arch_free_page returns non-zero then the generic free_page code can
99697 + * immediately bail: the arch-specific function has done all the work.
99698 + */
99699 +static inline int arch_free_page(struct page *page, int order) { return 0; }
99700 #endif
99701
99702 extern struct page *
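
The new return value only makes sense together with a matching change in the generic freeing path; a hedged sketch of that call-site pattern (the function name is made up, the real caller is in mm/page_alloc.c):

static void example_free_path(struct page *page, int order)
{
        /* A non-zero return means the architecture hook has already
         * disposed of the page (for Xen, presumably one handed back to
         * the hypervisor), so the buddy allocator must not touch it. */
        if (arch_free_page(page, order))
                return;

        /* ... normal buddy-allocator freeing continues here ... */
}
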
99703 diff -Nur linux-2.6.16.33-noxen/include/linux/highmem.h linux-2.6.16.33/include/linux/highmem.h
99704 --- linux-2.6.16.33-noxen/include/linux/highmem.h 2006-11-22 18:06:31.000000000 +0000
99705 +++ linux-2.6.16.33/include/linux/highmem.h 2007-01-08 15:00:46.000000000 +0000
99706 @@ -13,10 +13,16 @@
99707
99708 /* declarations for linux/mm/highmem.c */
99709 unsigned int nr_free_highpages(void);
99710 +#ifdef CONFIG_XEN
99711 +void kmap_flush_unused(void);
99712 +#endif
99713
99714 #else /* CONFIG_HIGHMEM */
99715
99716 static inline unsigned int nr_free_highpages(void) { return 0; }
99717 +#ifdef CONFIG_XEN
99718 +static inline void kmap_flush_unused(void) { }
99719 +#endif
99720
99721 static inline void *kmap(struct page *page)
99722 {
99723 diff -Nur linux-2.6.16.33-noxen/include/linux/interrupt.h linux-2.6.16.33/include/linux/interrupt.h
99724 --- linux-2.6.16.33-noxen/include/linux/interrupt.h 2006-11-22 18:06:31.000000000 +0000
99725 +++ linux-2.6.16.33/include/linux/interrupt.h 2007-01-08 15:00:46.000000000 +0000
99726 @@ -58,6 +58,12 @@
99727 extern void enable_irq(unsigned int irq);
99728 #endif
99729
99730 +#ifdef CONFIG_HAVE_IRQ_IGNORE_UNHANDLED
99731 +int irq_ignore_unhandled(unsigned int irq);
99732 +#else
99733 +#define irq_ignore_unhandled(irq) 0
99734 +#endif
99735 +
99736 #ifndef __ARCH_SET_SOFTIRQ_PENDING
99737 #define set_softirq_pending(x) (local_softirq_pending() = (x))
99738 #define or_softirq_pending(x) (local_softirq_pending() |= (x))
99739 diff -Nur linux-2.6.16.33-noxen/include/linux/kernel.h linux-2.6.16.33/include/linux/kernel.h
99740 --- linux-2.6.16.33-noxen/include/linux/kernel.h 2006-11-22 18:06:31.000000000 +0000
99741 +++ linux-2.6.16.33/include/linux/kernel.h 2007-05-23 21:00:01.000000000 +0000
99742 @@ -111,6 +111,8 @@
99743 __attribute__ ((format (printf, 3, 4)));
99744 extern int vscnprintf(char *buf, size_t size, const char *fmt, va_list args)
99745 __attribute__ ((format (printf, 3, 0)));
99746 +extern char *kasprintf(gfp_t gfp, const char *fmt, ...)
99747 + __attribute__ ((format (printf, 2, 3)));
99748
99749 extern int sscanf(const char *, const char *, ...)
99750 __attribute__ ((format (scanf, 2, 3)));
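
A minimal usage sketch for the newly declared kasprintf() (the helper and the name format are purely illustrative): it allocates and formats in one step, and the caller owns, and eventually kfree()s, the result.

#include <linux/kernel.h>
#include <linux/slab.h>

static char *make_vif_name(unsigned int domid, unsigned int handle)
{
        return kasprintf(GFP_KERNEL, "vif%u.%u", domid, handle);
}
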
99751 diff -Nur linux-2.6.16.33-noxen/include/linux/kernel.h~ linux-2.6.16.33/include/linux/kernel.h~
99752 --- linux-2.6.16.33-noxen/include/linux/kernel.h~ 1970-01-01 00:00:00.000000000 +0000
99753 +++ linux-2.6.16.33/include/linux/kernel.h~ 2006-11-22 18:06:31.000000000 +0000
99754 @@ -0,0 +1,332 @@
99755 +#ifndef _LINUX_KERNEL_H
99756 +#define _LINUX_KERNEL_H
99757 +
99758 +/*
99759 + * 'kernel.h' contains some often-used function prototypes etc
99760 + */
99761 +
99762 +#ifdef __KERNEL__
99763 +
99764 +#include <stdarg.h>
99765 +#include <linux/linkage.h>
99766 +#include <linux/stddef.h>
99767 +#include <linux/types.h>
99768 +#include <linux/compiler.h>
99769 +#include <linux/bitops.h>
99770 +#include <asm/byteorder.h>
99771 +#include <asm/bug.h>
99772 +
99773 +extern const char linux_banner[];
99774 +
99775 +#define INT_MAX ((int)(~0U>>1))
99776 +#define INT_MIN (-INT_MAX - 1)
99777 +#define UINT_MAX (~0U)
99778 +#define LONG_MAX ((long)(~0UL>>1))
99779 +#define LONG_MIN (-LONG_MAX - 1)
99780 +#define ULONG_MAX (~0UL)
99781 +
99782 +#define STACK_MAGIC 0xdeadbeef
99783 +
99784 +#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
99785 +#define ALIGN(x,a) (((x)+(a)-1)&~((a)-1))
99786 +
99787 +#define KERN_EMERG "<0>" /* system is unusable */
99788 +#define KERN_ALERT "<1>" /* action must be taken immediately */
99789 +#define KERN_CRIT "<2>" /* critical conditions */
99790 +#define KERN_ERR "<3>" /* error conditions */
99791 +#define KERN_WARNING "<4>" /* warning conditions */
99792 +#define KERN_NOTICE "<5>" /* normal but significant condition */
99793 +#define KERN_INFO "<6>" /* informational */
99794 +#define KERN_DEBUG "<7>" /* debug-level messages */
99795 +
99796 +extern int console_printk[];
99797 +
99798 +#define console_loglevel (console_printk[0])
99799 +#define default_message_loglevel (console_printk[1])
99800 +#define minimum_console_loglevel (console_printk[2])
99801 +#define default_console_loglevel (console_printk[3])
99802 +
99803 +struct completion;
99804 +struct pt_regs;
99805 +struct user;
99806 +
99807 +/**
99808 + * might_sleep - annotation for functions that can sleep
99809 + *
99810 + * this macro will print a stack trace if it is executed in an atomic
99811 + * context (spinlock, irq-handler, ...).
99812 + *
99813 + * This is a useful debugging help to be able to catch problems early and not
99814 + * be bitten later when the calling function happens to sleep when it is not
99815 + * supposed to.
99816 + */
99817 +#ifdef CONFIG_PREEMPT_VOLUNTARY
99818 +extern int cond_resched(void);
99819 +# define might_resched() cond_resched()
99820 +#else
99821 +# define might_resched() do { } while (0)
99822 +#endif
99823 +
99824 +#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
99825 + void __might_sleep(char *file, int line);
99826 +# define might_sleep() \
99827 + do { __might_sleep(__FILE__, __LINE__); might_resched(); } while (0)
99828 +#else
99829 +# define might_sleep() do { might_resched(); } while (0)
99830 +#endif
99831 +
99832 +#define might_sleep_if(cond) do { if (unlikely(cond)) might_sleep(); } while (0)
99833 +
99834 +#define abs(x) ({ \
99835 + int __x = (x); \
99836 + (__x < 0) ? -__x : __x; \
99837 + })
99838 +
99839 +#define labs(x) ({ \
99840 + long __x = (x); \
99841 + (__x < 0) ? -__x : __x; \
99842 + })
99843 +
99844 +extern struct notifier_block *panic_notifier_list;
99845 +extern long (*panic_blink)(long time);
99846 +NORET_TYPE void panic(const char * fmt, ...)
99847 + __attribute__ ((NORET_AND format (printf, 1, 2)));
99848 +fastcall NORET_TYPE void do_exit(long error_code)
99849 + ATTRIB_NORET;
99850 +NORET_TYPE void complete_and_exit(struct completion *, long)
99851 + ATTRIB_NORET;
99852 +extern unsigned long simple_strtoul(const char *,char **,unsigned int);
99853 +extern long simple_strtol(const char *,char **,unsigned int);
99854 +extern unsigned long long simple_strtoull(const char *,char **,unsigned int);
99855 +extern long long simple_strtoll(const char *,char **,unsigned int);
99856 +extern int sprintf(char * buf, const char * fmt, ...)
99857 + __attribute__ ((format (printf, 2, 3)));
99858 +extern int vsprintf(char *buf, const char *, va_list)
99859 + __attribute__ ((format (printf, 2, 0)));
99860 +extern int snprintf(char * buf, size_t size, const char * fmt, ...)
99861 + __attribute__ ((format (printf, 3, 4)));
99862 +extern int vsnprintf(char *buf, size_t size, const char *fmt, va_list args)
99863 + __attribute__ ((format (printf, 3, 0)));
99864 +extern int scnprintf(char * buf, size_t size, const char * fmt, ...)
99865 + __attribute__ ((format (printf, 3, 4)));
99866 +extern int vscnprintf(char *buf, size_t size, const char *fmt, va_list args)
99867 + __attribute__ ((format (printf, 3, 0)));
99868 +
99869 +extern int sscanf(const char *, const char *, ...)
99870 + __attribute__ ((format (scanf, 2, 3)));
99871 +extern int vsscanf(const char *, const char *, va_list)
99872 + __attribute__ ((format (scanf, 2, 0)));
99873 +
99874 +extern int get_option(char **str, int *pint);
99875 +extern char *get_options(const char *str, int nints, int *ints);
99876 +extern unsigned long long memparse(char *ptr, char **retptr);
99877 +
99878 +extern int __kernel_text_address(unsigned long addr);
99879 +extern int kernel_text_address(unsigned long addr);
99880 +extern int session_of_pgrp(int pgrp);
99881 +
99882 +extern void dump_thread(struct pt_regs *regs, struct user *dump);
99883 +
99884 +#ifdef CONFIG_PRINTK
99885 +asmlinkage int vprintk(const char *fmt, va_list args)
99886 + __attribute__ ((format (printf, 1, 0)));
99887 +asmlinkage int printk(const char * fmt, ...)
99888 + __attribute__ ((format (printf, 1, 2)));
99889 +#else
99890 +static inline int vprintk(const char *s, va_list args)
99891 + __attribute__ ((format (printf, 1, 0)));
99892 +static inline int vprintk(const char *s, va_list args) { return 0; }
99893 +static inline int printk(const char *s, ...)
99894 + __attribute__ ((format (printf, 1, 2)));
99895 +static inline int printk(const char *s, ...) { return 0; }
99896 +#endif
99897 +
99898 +unsigned long int_sqrt(unsigned long);
99899 +
99900 +static inline int __attribute_pure__ long_log2(unsigned long x)
99901 +{
99902 + int r = 0;
99903 + for (x >>= 1; x > 0; x >>= 1)
99904 + r++;
99905 + return r;
99906 +}
99907 +
99908 +static inline unsigned long __attribute_const__ roundup_pow_of_two(unsigned long x)
99909 +{
99910 + return (1UL << fls(x - 1));
99911 +}
99912 +
99913 +extern int printk_ratelimit(void);
99914 +extern int __printk_ratelimit(int ratelimit_jiffies, int ratelimit_burst);
99915 +
99916 +static inline void console_silent(void)
99917 +{
99918 + console_loglevel = 0;
99919 +}
99920 +
99921 +static inline void console_verbose(void)
99922 +{
99923 + if (console_loglevel)
99924 + console_loglevel = 15;
99925 +}
99926 +
99927 +extern void bust_spinlocks(int yes);
99928 +extern int oops_in_progress; /* If set, an oops, panic(), BUG() or die() is in progress */
99929 +extern __deprecated_for_modules int panic_timeout;
99930 +extern int panic_on_oops;
99931 +extern int tainted;
99932 +extern const char *print_tainted(void);
99933 +extern void add_taint(unsigned);
99934 +
99935 +/* Values used for system_state */
99936 +extern enum system_states {
99937 + SYSTEM_BOOTING,
99938 + SYSTEM_RUNNING,
99939 + SYSTEM_HALT,
99940 + SYSTEM_POWER_OFF,
99941 + SYSTEM_RESTART,
99942 + SYSTEM_SUSPEND_DISK,
99943 +} system_state;
99944 +
99945 +#define TAINT_PROPRIETARY_MODULE (1<<0)
99946 +#define TAINT_FORCED_MODULE (1<<1)
99947 +#define TAINT_UNSAFE_SMP (1<<2)
99948 +#define TAINT_FORCED_RMMOD (1<<3)
99949 +#define TAINT_MACHINE_CHECK (1<<4)
99950 +#define TAINT_BAD_PAGE (1<<5)
99951 +
99952 +extern void dump_stack(void);
99953 +
99954 +#ifdef DEBUG
99955 +#define pr_debug(fmt,arg...) \
99956 + printk(KERN_DEBUG fmt,##arg)
99957 +#else
99958 +#define pr_debug(fmt,arg...) \
99959 + do { } while (0)
99960 +#endif
99961 +
99962 +#define pr_info(fmt,arg...) \
99963 + printk(KERN_INFO fmt,##arg)
99964 +
99965 +/*
99966 + * Display an IP address in readable format.
99967 + */
99968 +
99969 +#define NIPQUAD(addr) \
99970 + ((unsigned char *)&addr)[0], \
99971 + ((unsigned char *)&addr)[1], \
99972 + ((unsigned char *)&addr)[2], \
99973 + ((unsigned char *)&addr)[3]
99974 +#define NIPQUAD_FMT "%u.%u.%u.%u"
99975 +
99976 +#define NIP6(addr) \
99977 + ntohs((addr).s6_addr16[0]), \
99978 + ntohs((addr).s6_addr16[1]), \
99979 + ntohs((addr).s6_addr16[2]), \
99980 + ntohs((addr).s6_addr16[3]), \
99981 + ntohs((addr).s6_addr16[4]), \
99982 + ntohs((addr).s6_addr16[5]), \
99983 + ntohs((addr).s6_addr16[6]), \
99984 + ntohs((addr).s6_addr16[7])
99985 +#define NIP6_FMT "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x"
99986 +#define NIP6_SEQFMT "%04x%04x%04x%04x%04x%04x%04x%04x"
99987 +
99988 +#if defined(__LITTLE_ENDIAN)
99989 +#define HIPQUAD(addr) \
99990 + ((unsigned char *)&addr)[3], \
99991 + ((unsigned char *)&addr)[2], \
99992 + ((unsigned char *)&addr)[1], \
99993 + ((unsigned char *)&addr)[0]
99994 +#elif defined(__BIG_ENDIAN)
99995 +#define HIPQUAD NIPQUAD
99996 +#else
99997 +#error "Please fix asm/byteorder.h"
99998 +#endif /* __LITTLE_ENDIAN */
99999 +
100000 +/*
100001 + * min()/max() macros that also do
100002 + * strict type-checking.. See the
100003 + * "unnecessary" pointer comparison.
100004 + */
100005 +#define min(x,y) ({ \
100006 + typeof(x) _x = (x); \
100007 + typeof(y) _y = (y); \
100008 + (void) (&_x == &_y); \
100009 + _x < _y ? _x : _y; })
100010 +
100011 +#define max(x,y) ({ \
100012 + typeof(x) _x = (x); \
100013 + typeof(y) _y = (y); \
100014 + (void) (&_x == &_y); \
100015 + _x > _y ? _x : _y; })
100016 +
100017 +/*
100018 + * ..and if you can't take the strict
100019 + * types, you can specify one yourself.
100020 + *
100021 + * Or not use min/max at all, of course.
100022 + */
100023 +#define min_t(type,x,y) \
100024 + ({ type __x = (x); type __y = (y); __x < __y ? __x: __y; })
100025 +#define max_t(type,x,y) \
100026 + ({ type __x = (x); type __y = (y); __x > __y ? __x: __y; })
100027 +
100028 +
100029 +/**
100030 + * container_of - cast a member of a structure out to the containing structure
100031 + * @ptr: the pointer to the member.
100032 + * @type: the type of the container struct this is embedded in.
100033 + * @member: the name of the member within the struct.
100034 + *
100035 + */
100036 +#define container_of(ptr, type, member) ({ \
100037 + const typeof( ((type *)0)->member ) *__mptr = (ptr); \
100038 + (type *)( (char *)__mptr - offsetof(type,member) );})
100039 +
100040 +/*
100041 + * Check at compile time that something is of a particular type.
100042 + * Always evaluates to 1 so you may use it easily in comparisons.
100043 + */
100044 +#define typecheck(type,x) \
100045 +({ type __dummy; \
100046 + typeof(x) __dummy2; \
100047 + (void)(&__dummy == &__dummy2); \
100048 + 1; \
100049 +})
100050 +
100051 +/*
100052 + * Check at compile time that 'function' is a certain type, or is a pointer
100053 + * to that type (needs to use typedef for the function type.)
100054 + */
100055 +#define typecheck_fn(type,function) \
100056 +({ typeof(type) __tmp = function; \
100057 + (void)__tmp; \
100058 +})
100059 +
100060 +#endif /* __KERNEL__ */
100061 +
100062 +#define SI_LOAD_SHIFT 16
100063 +struct sysinfo {
100064 + long uptime; /* Seconds since boot */
100065 + unsigned long loads[3]; /* 1, 5, and 15 minute load averages */
100066 + unsigned long totalram; /* Total usable main memory size */
100067 + unsigned long freeram; /* Available memory size */
100068 + unsigned long sharedram; /* Amount of shared memory */
100069 + unsigned long bufferram; /* Memory used by buffers */
100070 + unsigned long totalswap; /* Total swap space size */
100071 + unsigned long freeswap; /* swap space still available */
100072 + unsigned short procs; /* Number of current processes */
100073 + unsigned short pad; /* explicit padding for m68k */
100074 + unsigned long totalhigh; /* Total high memory size */
100075 + unsigned long freehigh; /* Available high memory size */
100076 + unsigned int mem_unit; /* Memory unit size in bytes */
100077 + char _f[20-2*sizeof(long)-sizeof(int)]; /* Padding: libc5 uses this.. */
100078 +};
100079 +
100080 +/* Force a compilation error if condition is true */
100081 +#define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)]))
100082 +
100083 +/* Trap pasters of __FUNCTION__ at compile-time */
100084 +#define __FUNCTION__ (__func__)
100085 +
100086 +#endif
100087 diff -Nur linux-2.6.16.33-noxen/include/linux/kexec.h linux-2.6.16.33/include/linux/kexec.h
100088 --- linux-2.6.16.33-noxen/include/linux/kexec.h 2006-11-22 18:06:31.000000000 +0000
100089 +++ linux-2.6.16.33/include/linux/kexec.h 2007-01-08 15:00:46.000000000 +0000
100090 @@ -31,6 +31,13 @@
100091 #error KEXEC_ARCH not defined
100092 #endif
100093
100094 +#ifndef KEXEC_ARCH_HAS_PAGE_MACROS
100095 +#define kexec_page_to_pfn(page) page_to_pfn(page)
100096 +#define kexec_pfn_to_page(pfn) pfn_to_page(pfn)
100097 +#define kexec_virt_to_phys(addr) virt_to_phys(addr)
100098 +#define kexec_phys_to_virt(addr) phys_to_virt(addr)
100099 +#endif
100100 +
100101 /*
100102 * This structure is used to hold the arguments that are used when loading
100103 * kernel binaries.
100104 @@ -91,6 +98,12 @@
100105 extern NORET_TYPE void machine_kexec(struct kimage *image) ATTRIB_NORET;
100106 extern int machine_kexec_prepare(struct kimage *image);
100107 extern void machine_kexec_cleanup(struct kimage *image);
100108 +#ifdef CONFIG_XEN
100109 +extern int xen_machine_kexec_load(struct kimage *image);
100110 +extern void xen_machine_kexec_unload(struct kimage *image);
100111 +extern void xen_machine_kexec_setup_resources(void);
100112 +extern void xen_machine_kexec_register_resources(struct resource *res);
100113 +#endif
100114 extern asmlinkage long sys_kexec_load(unsigned long entry,
100115 unsigned long nr_segments,
100116 struct kexec_segment __user *segments,
100117 diff -Nur linux-2.6.16.33-noxen/include/linux/mm.h linux-2.6.16.33/include/linux/mm.h
100118 --- linux-2.6.16.33-noxen/include/linux/mm.h 2006-11-22 18:06:31.000000000 +0000
100119 +++ linux-2.6.16.33/include/linux/mm.h 2007-01-08 15:00:46.000000000 +0000
100120 @@ -166,6 +166,9 @@
100121 #define VM_NONLINEAR 0x00800000 /* Is non-linear (remap_file_pages) */
100122 #define VM_MAPPED_COPY 0x01000000 /* T if mapped copy of data (nommu mmap) */
100123 #define VM_INSERTPAGE 0x02000000 /* The vma has had "vm_insert_page()" done on it */
100124 +#ifdef CONFIG_XEN
100125 +#define VM_FOREIGN 0x04000000 /* Has pages belonging to another VM */
100126 +#endif
100127
100128 #ifndef VM_STACK_DEFAULT_FLAGS /* arch can override this */
100129 #define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS
100130 @@ -1012,6 +1015,13 @@
100131 #define FOLL_GET 0x04 /* do get_page on page */
100132 #define FOLL_ANON 0x08 /* give ZERO_PAGE if no pgtable */
100133
100134 +#ifdef CONFIG_XEN
100135 +typedef int (*pte_fn_t)(pte_t *pte, struct page *pmd_page, unsigned long addr,
100136 + void *data);
100137 +extern int apply_to_page_range(struct mm_struct *mm, unsigned long address,
100138 + unsigned long size, pte_fn_t fn, void *data);
100139 +#endif
100140 +
100141 #ifdef CONFIG_PROC_FS
100142 void vm_stat_account(struct mm_struct *, unsigned long, struct file *, long);
100143 #else
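
A hedged sketch of how the newly exported apply_to_page_range() is meant to be driven (callback and wrapper are hypothetical; the Xen code uses the same pattern to visit every PTE backing a range):

#include <linux/mm.h>

/* Hypothetical callback: count the PTEs visited. A non-zero return
 * would abort the walk and be propagated to the caller. */
static int count_pte(pte_t *pte, struct page *pmd_page,
                     unsigned long addr, void *data)
{
        (*(unsigned long *)data)++;
        return 0;
}

static unsigned long count_ptes_in_range(struct mm_struct *mm,
                                         unsigned long addr,
                                         unsigned long size)
{
        unsigned long n = 0;

        apply_to_page_range(mm, addr, size, count_pte, &n);
        return n;
}
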
100144 diff -Nur linux-2.6.16.33-noxen/include/linux/netdevice.h linux-2.6.16.33/include/linux/netdevice.h
100145 --- linux-2.6.16.33-noxen/include/linux/netdevice.h 2006-11-22 18:06:31.000000000 +0000
100146 +++ linux-2.6.16.33/include/linux/netdevice.h 2007-05-23 21:00:01.000000000 +0000
100147 @@ -230,7 +230,8 @@
100148 __LINK_STATE_SCHED,
100149 __LINK_STATE_NOCARRIER,
100150 __LINK_STATE_RX_SCHED,
100151 - __LINK_STATE_LINKWATCH_PENDING
100152 + __LINK_STATE_LINKWATCH_PENDING,
100153 + __LINK_STATE_QDISC_RUNNING,
100154 };
100155
100156
100157 @@ -306,9 +307,17 @@
100158 #define NETIF_F_HW_VLAN_RX 256 /* Receive VLAN hw acceleration */
100159 #define NETIF_F_HW_VLAN_FILTER 512 /* Receive filtering on VLAN */
100160 #define NETIF_F_VLAN_CHALLENGED 1024 /* Device cannot handle VLAN packets */
100161 -#define NETIF_F_TSO 2048 /* Can offload TCP/IP segmentation */
100162 +#define NETIF_F_GSO 2048 /* Enable software GSO. */
100163 #define NETIF_F_LLTX 4096 /* LockLess TX */
100164 -#define NETIF_F_UFO 8192 /* Can offload UDP Large Send*/
100165 +
100166 + /* Segmentation offload features */
100167 +#define NETIF_F_GSO_SHIFT 16
100168 +#define NETIF_F_TSO (SKB_GSO_TCPV4 << NETIF_F_GSO_SHIFT)
100169 +#define NETIF_F_UFO (SKB_GSO_UDPV4 << NETIF_F_GSO_SHIFT)
100170 +#define NETIF_F_GSO_ROBUST (SKB_GSO_DODGY << NETIF_F_GSO_SHIFT)
100171 +
100172 +#define NETIF_F_GEN_CSUM (NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)
100173 +#define NETIF_F_ALL_CSUM (NETIF_F_IP_CSUM | NETIF_F_GEN_CSUM)
100174
100175 struct net_device *next_sched;
100176
100177 @@ -394,6 +403,9 @@
100178 struct list_head qdisc_list;
100179 unsigned long tx_queue_len; /* Max frames per queue allowed */
100180
100181 + /* Partially transmitted GSO packet. */
100182 + struct sk_buff *gso_skb;
100183 +
100184 /* ingress path synchronizer */
100185 spinlock_t ingress_lock;
100186 struct Qdisc *qdisc_ingress;
100187 @@ -402,7 +414,7 @@
100188 * One part is mostly used on xmit path (device)
100189 */
100190 /* hard_start_xmit synchronizer */
100191 - spinlock_t xmit_lock ____cacheline_aligned_in_smp;
100192 + spinlock_t _xmit_lock ____cacheline_aligned_in_smp;
100193 /* cpu id of processor entered to hard_start_xmit or -1,
100194 if nobody entered there.
100195 */
100196 @@ -527,6 +539,9 @@
100197 struct net_device *,
100198 struct packet_type *,
100199 struct net_device *);
100200 + struct sk_buff *(*gso_segment)(struct sk_buff *skb,
100201 + int features);
100202 + int (*gso_send_check)(struct sk_buff *skb);
100203 void *af_packet_priv;
100204 struct list_head list;
100205 };
100206 @@ -693,7 +708,8 @@
100207 extern int dev_set_mtu(struct net_device *, int);
100208 extern int dev_set_mac_address(struct net_device *,
100209 struct sockaddr *);
100210 -extern void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev);
100211 +extern int dev_hard_start_xmit(struct sk_buff *skb,
100212 + struct net_device *dev);
100213
100214 extern void dev_init(void);
100215
100216 @@ -900,11 +916,43 @@
100217 clear_bit(__LINK_STATE_RX_SCHED, &dev->state);
100218 }
100219
100220 +static inline void netif_tx_lock(struct net_device *dev)
100221 +{
100222 + spin_lock(&dev->_xmit_lock);
100223 + dev->xmit_lock_owner = smp_processor_id();
100224 +}
100225 +
100226 +static inline void netif_tx_lock_bh(struct net_device *dev)
100227 +{
100228 + spin_lock_bh(&dev->_xmit_lock);
100229 + dev->xmit_lock_owner = smp_processor_id();
100230 +}
100231 +
100232 +static inline int netif_tx_trylock(struct net_device *dev)
100233 +{
100234 + int ok = spin_trylock(&dev->_xmit_lock);
100235 + if (likely(ok))
100236 + dev->xmit_lock_owner = smp_processor_id();
100237 + return ok;
100238 +}
100239 +
100240 +static inline void netif_tx_unlock(struct net_device *dev)
100241 +{
100242 + dev->xmit_lock_owner = -1;
100243 + spin_unlock(&dev->_xmit_lock);
100244 +}
100245 +
100246 +static inline void netif_tx_unlock_bh(struct net_device *dev)
100247 +{
100248 + dev->xmit_lock_owner = -1;
100249 + spin_unlock_bh(&dev->_xmit_lock);
100250 +}
100251 +
100252 static inline void netif_tx_disable(struct net_device *dev)
100253 {
100254 - spin_lock_bh(&dev->xmit_lock);
100255 + netif_tx_lock_bh(dev);
100256 netif_stop_queue(dev);
100257 - spin_unlock_bh(&dev->xmit_lock);
100258 + netif_tx_unlock_bh(dev);
100259 }
100260
100261 /* These functions live elsewhere (drivers/net/net_init.c, but related) */
100262 @@ -932,6 +980,7 @@
100263 extern int weight_p;
100264 extern int netdev_set_master(struct net_device *dev, struct net_device *master);
100265 extern int skb_checksum_help(struct sk_buff *skb, int inward);
100266 +extern struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features);
100267 #ifdef CONFIG_BUG
100268 extern void netdev_rx_csum_fault(struct net_device *dev);
100269 #else
100270 @@ -951,6 +1000,19 @@
100271
100272 extern void linkwatch_run_queue(void);
100273
100274 +static inline int skb_gso_ok(struct sk_buff *skb, int features)
100275 +{
100276 + int feature = skb_shinfo(skb)->gso_type << NETIF_F_GSO_SHIFT;
100277 + return (features & feature) == feature;
100278 +}
100279 +
100280 +static inline int netif_needs_gso(struct net_device *dev, struct sk_buff *skb)
100281 +{
100282 + return skb_is_gso(skb) &&
100283 + (!skb_gso_ok(skb, dev->features) ||
100284 + unlikely(skb->ip_summed != CHECKSUM_HW));
100285 +}
100286 +
100287 #endif /* __KERNEL__ */
100288
100289 #endif /* _LINUX_DEV_H */
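
The GSO feature encoding is the key trick in this hunk: the per-skb gso_type bits, shifted up by NETIF_F_GSO_SHIFT, coincide with the device feature bits, which is what lets skb_gso_ok() reduce to a mask test. A small user-space model (the constants are copied from the hunk; everything else is illustrative):

#include <stdio.h>

#define SKB_GSO_TCPV4      (1 << 0)
#define SKB_GSO_UDPV4      (1 << 1)
#define NETIF_F_GSO_SHIFT  16
#define NETIF_F_TSO        (SKB_GSO_TCPV4 << NETIF_F_GSO_SHIFT)

static int gso_ok(int gso_type, int dev_features)
{
        int feature = gso_type << NETIF_F_GSO_SHIFT;
        return (dev_features & feature) == feature;
}

int main(void)
{
        int features = NETIF_F_TSO;            /* a TSO-capable device */

        printf("TCPv4 ok: %d\n", gso_ok(SKB_GSO_TCPV4, features));  /* 1 */
        printf("UDPv4 ok: %d\n", gso_ok(SKB_GSO_UDPV4, features));  /* 0 */
        return 0;
}
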
100290 diff -Nur linux-2.6.16.33-noxen/include/linux/oprofile.h linux-2.6.16.33/include/linux/oprofile.h
100291 --- linux-2.6.16.33-noxen/include/linux/oprofile.h 2006-11-22 18:06:31.000000000 +0000
100292 +++ linux-2.6.16.33/include/linux/oprofile.h 2007-05-23 21:00:01.000000000 +0000
100293 @@ -16,6 +16,8 @@
100294 #include <linux/types.h>
100295 #include <linux/spinlock.h>
100296 #include <asm/atomic.h>
100297 +
100298 +#include <xen/interface/xenoprof.h>
100299
100300 struct super_block;
100301 struct dentry;
100302 @@ -27,6 +29,11 @@
100303 /* create any necessary configuration files in the oprofile fs.
100304 * Optional. */
100305 int (*create_files)(struct super_block * sb, struct dentry * root);
100306 + /* setup active domains with Xen */
100307 + int (*set_active)(int *active_domains, unsigned int adomains);
100308 + /* setup passive domains with Xen */
100309 + int (*set_passive)(int *passive_domains, unsigned int pdomains);
100310 +
100311 /* Do any necessary interrupt setup. Optional. */
100312 int (*setup)(void);
100313 /* Do any necessary interrupt shutdown. Optional. */
100314 @@ -68,6 +75,8 @@
100315 /* add a backtrace entry, to be called from the ->backtrace callback */
100316 void oprofile_add_trace(unsigned long eip);
100317
100318 +/* add a domain switch entry */
100319 +int oprofile_add_domain_switch(int32_t domain_id);
100320
100321 /**
100322 * Create a file of the given name as a child of the given root, with
100323 diff -Nur linux-2.6.16.33-noxen/include/linux/rcupdate.h linux-2.6.16.33/include/linux/rcupdate.h
100324 --- linux-2.6.16.33-noxen/include/linux/rcupdate.h 2006-11-22 18:06:31.000000000 +0000
100325 +++ linux-2.6.16.33/include/linux/rcupdate.h 2007-05-23 21:00:01.000000000 +0000
100326 @@ -134,6 +134,7 @@
100327 }
100328
100329 extern int rcu_pending(int cpu);
100330 +extern int rcu_needs_cpu(int cpu);
100331
100332 /**
100333 * rcu_read_lock - mark the beginning of an RCU read-side critical section.
100334 diff -Nur linux-2.6.16.33-noxen/include/linux/skbuff.h linux-2.6.16.33/include/linux/skbuff.h
100335 --- linux-2.6.16.33-noxen/include/linux/skbuff.h 2006-11-22 18:06:31.000000000 +0000
100336 +++ linux-2.6.16.33/include/linux/skbuff.h 2007-01-08 15:00:46.000000000 +0000
100337 @@ -134,9 +134,10 @@
100338 struct skb_shared_info {
100339 atomic_t dataref;
100340 unsigned short nr_frags;
100341 - unsigned short tso_size;
100342 - unsigned short tso_segs;
100343 - unsigned short ufo_size;
100344 + unsigned short gso_size;
100345 + /* Warning: this field is not always filled in (UFO)! */
100346 + unsigned short gso_segs;
100347 + unsigned short gso_type;
100348 unsigned int ip6_frag_id;
100349 struct sk_buff *frag_list;
100350 skb_frag_t frags[MAX_SKB_FRAGS];
100351 @@ -168,6 +169,14 @@
100352 SKB_FCLONE_CLONE,
100353 };
100354
100355 +enum {
100356 + SKB_GSO_TCPV4 = 1 << 0,
100357 + SKB_GSO_UDPV4 = 1 << 1,
100358 +
100359 + /* This indicates the skb is from an untrusted source. */
100360 + SKB_GSO_DODGY = 1 << 2,
100361 +};
100362 +
100363 /**
100364 * struct sk_buff - socket buffer
100365 * @next: Next buffer in list
100366 @@ -189,6 +198,8 @@
100367 * @local_df: allow local fragmentation
100368 * @cloned: Head may be cloned (check refcnt to be sure)
100369 * @nohdr: Payload reference only, must not modify header
100370 + * @proto_data_valid: Protocol data validated since arriving at localhost
100371 + * @proto_csum_blank: Protocol csum must be added before leaving localhost
100372 * @pkt_type: Packet class
100373 * @fclone: skbuff clone status
100374 * @ip_summed: Driver fed us an IP checksum
100375 @@ -265,7 +276,13 @@
100376 nfctinfo:3;
100377 __u8 pkt_type:3,
100378 fclone:2,
100379 +#ifndef CONFIG_XEN
100380 ipvs_property:1;
100381 +#else
100382 + ipvs_property:1,
100383 + proto_data_valid:1,
100384 + proto_csum_blank:1;
100385 +#endif
100386 __be16 protocol;
100387
100388 void (*destructor)(struct sk_buff *skb);
100389 @@ -321,7 +338,8 @@
100390
100391 extern struct sk_buff *alloc_skb_from_cache(kmem_cache_t *cp,
100392 unsigned int size,
100393 - gfp_t priority);
100394 + gfp_t priority,
100395 + int fclone);
100396 extern void kfree_skbmem(struct sk_buff *skb);
100397 extern struct sk_buff *skb_clone(struct sk_buff *skb,
100398 gfp_t priority);
100399 @@ -1051,7 +1069,7 @@
100400 return skb;
100401 }
100402 #else
100403 -extern struct sk_buff *__dev_alloc_skb(unsigned int length, int gfp_mask);
100404 +extern struct sk_buff *__dev_alloc_skb(unsigned int length, gfp_t gfp_mask);
100405 #endif
100406
100407 /**
100408 @@ -1148,18 +1166,34 @@
100409 return 0;
100410 }
100411
100412 +static inline int __skb_linearize(struct sk_buff *skb)
100413 +{
100414 + return __pskb_pull_tail(skb, skb->data_len) ? 0 : -ENOMEM;
100415 +}
100416 +
100417 /**
100418 * skb_linearize - convert paged skb to linear one
100419 * @skb: buffer to linarize
100420 - * @gfp: allocation mode
100421 *
100422 * If there is no free memory -ENOMEM is returned, otherwise zero
100423 * is returned and the old skb data released.
100424 */
100425 -extern int __skb_linearize(struct sk_buff *skb, gfp_t gfp);
100426 -static inline int skb_linearize(struct sk_buff *skb, gfp_t gfp)
100427 +static inline int skb_linearize(struct sk_buff *skb)
100428 +{
100429 + return skb_is_nonlinear(skb) ? __skb_linearize(skb) : 0;
100430 +}
100431 +
100432 +/**
100433 + * skb_linearize_cow - make sure skb is linear and writable
100434 + * @skb: buffer to process
100435 + *
100436 + * If there is no free memory -ENOMEM is returned, otherwise zero
100437 + * is returned and the old skb data released.
100438 + */
100439 +static inline int skb_linearize_cow(struct sk_buff *skb)
100440 {
100441 - return __skb_linearize(skb, gfp);
100442 + return skb_is_nonlinear(skb) || skb_cloned(skb) ?
100443 + __skb_linearize(skb) : 0;
100444 }
100445
100446 /**
100447 @@ -1254,6 +1288,7 @@
100448 struct sk_buff *skb1, const u32 len);
100449
100450 extern void skb_release_data(struct sk_buff *skb);
100451 +extern struct sk_buff *skb_segment(struct sk_buff *skb, int features);
100452
100453 static inline void *skb_header_pointer(const struct sk_buff *skb, int offset,
100454 int len, void *buffer)
100455 @@ -1377,5 +1412,10 @@
100456 static inline void nf_reset(struct sk_buff *skb) {}
100457 #endif /* CONFIG_NETFILTER */
100458
100459 +static inline int skb_is_gso(const struct sk_buff *skb)
100460 +{
100461 + return skb_shinfo(skb)->gso_size;
100462 +}
100463 +
100464 #endif /* __KERNEL__ */
100465 #endif /* _LINUX_SKBUFF_H */
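The new linearize helpers boil down to a small decision: plain skb_linearize() only acts on non-linear buffers, while skb_linearize_cow() also rewrites buffers whose data is shared with a clone. A toy sketch of that decision with invented field names:

/* Illustrative only; not the kernel's sk_buff layout. */
#include <stdbool.h>
#include <stdio.h>

struct demo_skb { bool nonlinear; bool cloned; };

static bool must_linearize(const struct demo_skb *skb)     { return skb->nonlinear; }
static bool must_linearize_cow(const struct demo_skb *skb) { return skb->nonlinear || skb->cloned; }

int main(void)
{
    struct demo_skb paged  = { .nonlinear = true,  .cloned = false };
    struct demo_skb shared = { .nonlinear = false, .cloned = true  };

    printf("paged:  linearize=%d cow=%d\n", must_linearize(&paged),  must_linearize_cow(&paged));
    printf("shared: linearize=%d cow=%d\n", must_linearize(&shared), must_linearize_cow(&shared));
    return 0;
}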
100466 diff -Nur linux-2.6.16.33-noxen/include/net/pkt_sched.h linux-2.6.16.33/include/net/pkt_sched.h
100467 --- linux-2.6.16.33-noxen/include/net/pkt_sched.h 2006-11-22 18:06:31.000000000 +0000
100468 +++ linux-2.6.16.33/include/net/pkt_sched.h 2007-05-23 21:00:01.000000000 +0000
100469 @@ -218,12 +218,13 @@
100470 struct rtattr *tab);
100471 extern void qdisc_put_rtab(struct qdisc_rate_table *tab);
100472
100473 -extern int qdisc_restart(struct net_device *dev);
100474 +extern void __qdisc_run(struct net_device *dev);
100475
100476 static inline void qdisc_run(struct net_device *dev)
100477 {
100478 - while (!netif_queue_stopped(dev) && qdisc_restart(dev) < 0)
100479 - /* NOTHING */;
100480 + if (!netif_queue_stopped(dev) &&
100481 + !test_and_set_bit(__LINK_STATE_QDISC_RUNNING, &dev->state))
100482 + __qdisc_run(dev);
100483 }
100484
100485 extern int tc_classify(struct sk_buff *skb, struct tcf_proto *tp,
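The reworked qdisc_run() replaces the old retry loop with an atomic test-and-set of a per-device "running" bit, so only one caller at a time drains the queue while later callers simply return. A user-space sketch of that guard, using C11 atomics in place of test_and_set_bit():

/* Sketch of the __LINK_STATE_QDISC_RUNNING guard; not the kernel code. */
#include <stdatomic.h>
#include <stdio.h>

static atomic_flag qdisc_running = ATOMIC_FLAG_INIT;

static void drain_queue(void) { puts("draining queue"); }

static void demo_qdisc_run(void)
{
    /* analogue of test_and_set_bit(__LINK_STATE_QDISC_RUNNING, &dev->state) */
    if (!atomic_flag_test_and_set(&qdisc_running)) {
        drain_queue();
        atomic_flag_clear(&qdisc_running);   /* cleared by __qdisc_run() in the kernel */
    } else {
        puts("already running, nothing to do");
    }
}

int main(void)
{
    demo_qdisc_run();                          /* first caller drains */
    atomic_flag_test_and_set(&qdisc_running);  /* pretend another CPU is inside __qdisc_run() */
    demo_qdisc_run();                          /* second caller backs off */
    return 0;
}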
100486 diff -Nur linux-2.6.16.33-noxen/include/net/protocol.h linux-2.6.16.33/include/net/protocol.h
100487 --- linux-2.6.16.33-noxen/include/net/protocol.h 2006-11-22 18:06:31.000000000 +0000
100488 +++ linux-2.6.16.33/include/net/protocol.h 2007-05-23 21:00:01.000000000 +0000
100489 @@ -37,6 +37,9 @@
100490 struct net_protocol {
100491 int (*handler)(struct sk_buff *skb);
100492 void (*err_handler)(struct sk_buff *skb, u32 info);
100493 + int (*gso_send_check)(struct sk_buff *skb);
100494 + struct sk_buff *(*gso_segment)(struct sk_buff *skb,
100495 + int features);
100496 int no_policy;
100497 };
100498
100499 diff -Nur linux-2.6.16.33-noxen/include/net/sock.h linux-2.6.16.33/include/net/sock.h
100500 --- linux-2.6.16.33-noxen/include/net/sock.h 2006-11-22 18:06:31.000000000 +0000
100501 +++ linux-2.6.16.33/include/net/sock.h 2007-05-23 21:00:01.000000000 +0000
100502 @@ -1064,9 +1064,13 @@
100503 {
100504 __sk_dst_set(sk, dst);
100505 sk->sk_route_caps = dst->dev->features;
100506 + if (sk->sk_route_caps & NETIF_F_GSO)
100507 + sk->sk_route_caps |= NETIF_F_TSO;
100508 if (sk->sk_route_caps & NETIF_F_TSO) {
100509 if (sock_flag(sk, SOCK_NO_LARGESEND) || dst->header_len)
100510 sk->sk_route_caps &= ~NETIF_F_TSO;
100511 + else
100512 + sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
100513 }
100514 }
100515
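A sketch of the widened route-capability logic in sk_dst_set(): a GSO-capable device is treated as TSO-capable, and a socket that keeps TSO is also given scatter/gather and hardware checksumming. Flag values are invented for illustration:

/* Illustrative flags; not the NETIF_F_* values. */
#include <stdio.h>

#define F_SG      (1 << 0)
#define F_HW_CSUM (1 << 1)
#define F_TSO     (1 << 2)
#define F_GSO     (1 << 3)

static unsigned route_caps(unsigned dev_features, int no_largesend, int header_len)
{
    unsigned caps = dev_features;

    if (caps & F_GSO)
        caps |= F_TSO;                   /* software GSO stands in for TSO */
    if (caps & F_TSO) {
        if (no_largesend || header_len)
            caps &= ~F_TSO;              /* large sends not allowed on this route */
        else
            caps |= F_SG | F_HW_CSUM;    /* TSO implies SG + checksum offload */
    }
    return caps;
}

int main(void)
{
    printf("gso dev, plain route:     %#x\n", route_caps(F_GSO, 0, 0));
    printf("gso dev, tunnelled route: %#x\n", route_caps(F_GSO, 0, 8));
    return 0;
}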
100516 diff -Nur linux-2.6.16.33-noxen/include/net/tcp.h linux-2.6.16.33/include/net/tcp.h
100517 --- linux-2.6.16.33-noxen/include/net/tcp.h 2006-11-22 18:06:31.000000000 +0000
100518 +++ linux-2.6.16.33/include/net/tcp.h 2007-05-23 21:00:01.000000000 +0000
100519 @@ -552,13 +552,13 @@
100520 */
100521 static inline int tcp_skb_pcount(const struct sk_buff *skb)
100522 {
100523 - return skb_shinfo(skb)->tso_segs;
100524 + return skb_shinfo(skb)->gso_segs;
100525 }
100526
100527 /* This is valid iff tcp_skb_pcount() > 1. */
100528 static inline int tcp_skb_mss(const struct sk_buff *skb)
100529 {
100530 - return skb_shinfo(skb)->tso_size;
100531 + return skb_shinfo(skb)->gso_size;
100532 }
100533
100534 static inline void tcp_dec_pcount_approx(__u32 *count,
100535 @@ -1063,6 +1063,9 @@
100536
100537 extern int tcp_v4_destroy_sock(struct sock *sk);
100538
100539 +extern int tcp_v4_gso_send_check(struct sk_buff *skb);
100540 +extern struct sk_buff *tcp_tso_segment(struct sk_buff *skb, int features);
100541 +
100542 #ifdef CONFIG_PROC_FS
100543 extern int tcp4_proc_init(void);
100544 extern void tcp4_proc_exit(void);
100545 diff -Nur linux-2.6.16.33-noxen/include/xen/balloon.h linux-2.6.16.33/include/xen/balloon.h
100546 --- linux-2.6.16.33-noxen/include/xen/balloon.h 1970-01-01 00:00:00.000000000 +0000
100547 +++ linux-2.6.16.33/include/xen/balloon.h 2007-01-08 15:00:46.000000000 +0000
100548 @@ -0,0 +1,57 @@
100549 +/******************************************************************************
100550 + * balloon.h
100551 + *
100552 + * Xen balloon driver - enables returning/claiming memory to/from Xen.
100553 + *
100554 + * Copyright (c) 2003, B Dragovic
100555 + * Copyright (c) 2003-2004, M Williamson, K Fraser
100556 + *
100557 + * This program is free software; you can redistribute it and/or
100558 + * modify it under the terms of the GNU General Public License version 2
100559 + * as published by the Free Software Foundation; or, when distributed
100560 + * separately from the Linux kernel or incorporated into other
100561 + * software packages, subject to the following license:
100562 + *
100563 + * Permission is hereby granted, free of charge, to any person obtaining a copy
100564 + * of this source file (the "Software"), to deal in the Software without
100565 + * restriction, including without limitation the rights to use, copy, modify,
100566 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
100567 + * and to permit persons to whom the Software is furnished to do so, subject to
100568 + * the following conditions:
100569 + *
100570 + * The above copyright notice and this permission notice shall be included in
100571 + * all copies or substantial portions of the Software.
100572 + *
100573 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
100574 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
100575 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
100576 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
100577 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
100578 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
100579 + * IN THE SOFTWARE.
100580 + */
100581 +
100582 +#ifndef __ASM_BALLOON_H__
100583 +#define __ASM_BALLOON_H__
100584 +
100585 +/*
100586 + * Inform the balloon driver that it should allow some slop for device-driver
100587 + * memory activities.
100588 + */
100589 +void balloon_update_driver_allowance(long delta);
100590 +
100591 +/* Allocate/free a set of empty pages in low memory (i.e., no RAM mapped). */
100592 +struct page **alloc_empty_pages_and_pagevec(int nr_pages);
100593 +void free_empty_pages_and_pagevec(struct page **pagevec, int nr_pages);
100594 +
100595 +void balloon_release_driver_page(struct page *page);
100596 +
100597 +/*
100598 + * Prevent the balloon driver from changing the memory reservation during
100599 + * a driver critical region.
100600 + */
100601 +extern spinlock_t balloon_lock;
100602 +#define balloon_lock(__flags) spin_lock_irqsave(&balloon_lock, __flags)
100603 +#define balloon_unlock(__flags) spin_unlock_irqrestore(&balloon_lock, __flags)
100604 +
100605 +#endif /* __ASM_BALLOON_H__ */
100606 diff -Nur linux-2.6.16.33-noxen/include/xen/cpu_hotplug.h linux-2.6.16.33/include/xen/cpu_hotplug.h
100607 --- linux-2.6.16.33-noxen/include/xen/cpu_hotplug.h 1970-01-01 00:00:00.000000000 +0000
100608 +++ linux-2.6.16.33/include/xen/cpu_hotplug.h 2007-01-08 15:00:46.000000000 +0000
100609 @@ -0,0 +1,44 @@
100610 +#ifndef __XEN_CPU_HOTPLUG_H__
100611 +#define __XEN_CPU_HOTPLUG_H__
100612 +
100613 +#include <linux/config.h>
100614 +#include <linux/kernel.h>
100615 +#include <linux/cpumask.h>
100616 +
100617 +#if defined(CONFIG_HOTPLUG_CPU)
100618 +
100619 +#if defined(CONFIG_X86)
100620 +void cpu_initialize_context(unsigned int cpu);
100621 +#else
100622 +#define cpu_initialize_context(cpu) ((void)0)
100623 +#endif
100624 +
100625 +int cpu_up_check(unsigned int cpu);
100626 +void init_xenbus_allowed_cpumask(void);
100627 +int smp_suspend(void);
100628 +void smp_resume(void);
100629 +
100630 +void cpu_bringup(void);
100631 +
100632 +#else /* !defined(CONFIG_HOTPLUG_CPU) */
100633 +
100634 +#define cpu_up_check(cpu) (0)
100635 +#define init_xenbus_allowed_cpumask() ((void)0)
100636 +
100637 +static inline int smp_suspend(void)
100638 +{
100639 + if (num_online_cpus() > 1) {
100640 + printk(KERN_WARNING "Can't suspend SMP guests "
100641 + "without CONFIG_HOTPLUG_CPU\n");
100642 + return -EOPNOTSUPP;
100643 + }
100644 + return 0;
100645 +}
100646 +
100647 +static inline void smp_resume(void)
100648 +{
100649 +}
100650 +
100651 +#endif /* !defined(CONFIG_HOTPLUG_CPU) */
100652 +
100653 +#endif /* __XEN_CPU_HOTPLUG_H__ */
100654 diff -Nur linux-2.6.16.33-noxen/include/xen/driver_util.h linux-2.6.16.33/include/xen/driver_util.h
100655 --- linux-2.6.16.33-noxen/include/xen/driver_util.h 1970-01-01 00:00:00.000000000 +0000
100656 +++ linux-2.6.16.33/include/xen/driver_util.h 2007-01-08 15:00:46.000000000 +0000
100657 @@ -0,0 +1,16 @@
100658 +
100659 +#ifndef __ASM_XEN_DRIVER_UTIL_H__
100660 +#define __ASM_XEN_DRIVER_UTIL_H__
100661 +
100662 +#include <linux/config.h>
100663 +#include <linux/vmalloc.h>
100664 +
100665 +/* Allocate/destroy a 'vmalloc' VM area. */
100666 +extern struct vm_struct *alloc_vm_area(unsigned long size);
100667 +extern void free_vm_area(struct vm_struct *area);
100668 +
100669 +/* Lock an area so that PTEs are accessible in the current address space. */
100670 +extern void lock_vm_area(struct vm_struct *area);
100671 +extern void unlock_vm_area(struct vm_struct *area);
100672 +
100673 +#endif /* __ASM_XEN_DRIVER_UTIL_H__ */
100674 diff -Nur linux-2.6.16.33-noxen/include/xen/evtchn.h linux-2.6.16.33/include/xen/evtchn.h
100675 --- linux-2.6.16.33-noxen/include/xen/evtchn.h 1970-01-01 00:00:00.000000000 +0000
100676 +++ linux-2.6.16.33/include/xen/evtchn.h 2007-01-08 15:00:46.000000000 +0000
100677 @@ -0,0 +1,114 @@
100678 +/******************************************************************************
100679 + * evtchn.h
100680 + *
100681 + * Communication via Xen event channels.
100682 + * Also definitions for the device that demuxes notifications to userspace.
100683 + *
100684 + * Copyright (c) 2004-2005, K A Fraser
100685 + *
100686 + * This program is free software; you can redistribute it and/or
100687 + * modify it under the terms of the GNU General Public License version 2
100688 + * as published by the Free Software Foundation; or, when distributed
100689 + * separately from the Linux kernel or incorporated into other
100690 + * software packages, subject to the following license:
100691 + *
100692 + * Permission is hereby granted, free of charge, to any person obtaining a copy
100693 + * of this source file (the "Software"), to deal in the Software without
100694 + * restriction, including without limitation the rights to use, copy, modify,
100695 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
100696 + * and to permit persons to whom the Software is furnished to do so, subject to
100697 + * the following conditions:
100698 + *
100699 + * The above copyright notice and this permission notice shall be included in
100700 + * all copies or substantial portions of the Software.
100701 + *
100702 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
100703 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
100704 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
100705 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
100706 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
100707 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
100708 + * IN THE SOFTWARE.
100709 + */
100710 +
100711 +#ifndef __ASM_EVTCHN_H__
100712 +#define __ASM_EVTCHN_H__
100713 +
100714 +#include <linux/config.h>
100715 +#include <linux/interrupt.h>
100716 +#include <asm/hypervisor.h>
100717 +#include <asm/ptrace.h>
100718 +#include <asm/synch_bitops.h>
100719 +#include <xen/interface/event_channel.h>
100720 +#include <linux/smp.h>
100721 +
100722 +/*
100723 + * LOW-LEVEL DEFINITIONS
100724 + */
100725 +
100726 +/*
100727 + * Dynamically bind an event source to an IRQ-like callback handler.
100728 + * On some platforms this may not be implemented via the Linux IRQ subsystem.
100729 + * The IRQ argument passed to the callback handler is the same as returned
100730 + * from the bind call. It may not correspond to a Linux IRQ number.
100731 + * Returns IRQ or negative errno.
100732 + * UNBIND: Takes IRQ to unbind from; automatically closes the event channel.
100733 + */
100734 +extern int bind_evtchn_to_irqhandler(
100735 + unsigned int evtchn,
100736 + irqreturn_t (*handler)(int, void *, struct pt_regs *),
100737 + unsigned long irqflags,
100738 + const char *devname,
100739 + void *dev_id);
100740 +extern int bind_virq_to_irqhandler(
100741 + unsigned int virq,
100742 + unsigned int cpu,
100743 + irqreturn_t (*handler)(int, void *, struct pt_regs *),
100744 + unsigned long irqflags,
100745 + const char *devname,
100746 + void *dev_id);
100747 +extern int bind_ipi_to_irqhandler(
100748 + unsigned int ipi,
100749 + unsigned int cpu,
100750 + irqreturn_t (*handler)(int, void *, struct pt_regs *),
100751 + unsigned long irqflags,
100752 + const char *devname,
100753 + void *dev_id);
100754 +
100755 +/*
100756 + * Common unbind function for all event sources. Takes IRQ to unbind from.
100757 + * Automatically closes the underlying event channel (even for bindings
100758 + * made with bind_evtchn_to_irqhandler()).
100759 + */
100760 +extern void unbind_from_irqhandler(unsigned int irq, void *dev_id);
100761 +
100762 +extern void irq_resume(void);
100763 +
100764 +/* Entry point for notifications into Linux subsystems. */
100765 +asmlinkage void evtchn_do_upcall(struct pt_regs *regs);
100766 +
100767 +/* Entry point for notifications into the userland character device. */
100768 +extern void evtchn_device_upcall(int port);
100769 +
100770 +extern void mask_evtchn(int port);
100771 +extern void unmask_evtchn(int port);
100772 +
100773 +static inline void clear_evtchn(int port)
100774 +{
100775 + shared_info_t *s = HYPERVISOR_shared_info;
100776 + synch_clear_bit(port, &s->evtchn_pending[0]);
100777 +}
100778 +
100779 +static inline void notify_remote_via_evtchn(int port)
100780 +{
100781 + struct evtchn_send send = { .port = port };
100782 + (void)HYPERVISOR_event_channel_op(EVTCHNOP_send, &send);
100783 +}
100784 +
100785 +/*
100786 + * Unlike notify_remote_via_evtchn(), this is safe to use across
100787 + * save/restore. Notifications on a broken connection are silently dropped.
100788 + */
100789 +extern void notify_remote_via_irq(int irq);
100790 +
100791 +#endif /* __ASM_EVTCHN_H__ */
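The comments above describe the intended lifecycle: bind an event channel and receive an IRQ-like handle, get callbacks on notification, then unbind, which also closes the channel. A toy user-space model of that lifecycle (not the Xen implementation; all types and limits are invented):

#include <stdio.h>

#define MAX_PORTS 8

typedef void (*handler_t)(int irq, void *dev_id);

static handler_t handlers[MAX_PORTS];
static void *cookies[MAX_PORTS];

static int bind_port(unsigned port, handler_t fn, void *dev_id)
{
    if (port >= MAX_PORTS || handlers[port])
        return -1;                 /* stands in for a negative errno */
    handlers[port] = fn;
    cookies[port] = dev_id;
    return (int)port;              /* the "IRQ" handed back to the caller */
}

static void notify(unsigned port)
{
    if (port < MAX_PORTS && handlers[port])
        handlers[port]((int)port, cookies[port]);
}

static void unbind(int irq)
{
    if (irq >= 0 && irq < MAX_PORTS)
        handlers[irq] = NULL;      /* closing the event channel is implied */
}

static void on_event(int irq, void *dev_id)
{
    printf("event on irq %d for %s\n", irq, (const char *)dev_id);
}

int main(void)
{
    char devname[] = "demo-device";
    int irq = bind_port(3, on_event, devname);

    notify(3);
    unbind(irq);
    notify(3);                     /* dropped: nothing is bound any more */
    return 0;
}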
100792 diff -Nur linux-2.6.16.33-noxen/include/xen/features.h linux-2.6.16.33/include/xen/features.h
100793 --- linux-2.6.16.33-noxen/include/xen/features.h 1970-01-01 00:00:00.000000000 +0000
100794 +++ linux-2.6.16.33/include/xen/features.h 2007-01-08 15:00:46.000000000 +0000
100795 @@ -0,0 +1,20 @@
100796 +/******************************************************************************
100797 + * features.h
100798 + *
100799 + * Query the features reported by Xen.
100800 + *
100801 + * Copyright (c) 2006, Ian Campbell
100802 + */
100803 +
100804 +#ifndef __ASM_XEN_FEATURES_H__
100805 +#define __ASM_XEN_FEATURES_H__
100806 +
100807 +#include <xen/interface/version.h>
100808 +
100809 +extern void setup_xen_features(void);
100810 +
100811 +extern u8 xen_features[XENFEAT_NR_SUBMAPS * 32];
100812 +
100813 +#define xen_feature(flag) (xen_features[flag])
100814 +
100815 +#endif /* __ASM_XEN_FEATURES_H__ */
100816 diff -Nur linux-2.6.16.33-noxen/include/xen/foreign_page.h linux-2.6.16.33/include/xen/foreign_page.h
100817 --- linux-2.6.16.33-noxen/include/xen/foreign_page.h 1970-01-01 00:00:00.000000000 +0000
100818 +++ linux-2.6.16.33/include/xen/foreign_page.h 2007-01-08 15:00:46.000000000 +0000
100819 @@ -0,0 +1,30 @@
100820 +/******************************************************************************
100821 + * foreign_page.h
100822 + *
100823 + * Provide a "foreign" page type, that is owned by a foreign allocator and
100824 + * not the normal buddy allocator in page_alloc.c
100825 + *
100826 + * Copyright (c) 2004, K A Fraser
100827 + */
100828 +
100829 +#ifndef __ASM_XEN_FOREIGN_PAGE_H__
100830 +#define __ASM_XEN_FOREIGN_PAGE_H__
100831 +
100832 +#define PG_foreign PG_arch_1
100833 +
100834 +#define PageForeign(page) test_bit(PG_foreign, &(page)->flags)
100835 +
100836 +#define SetPageForeign(page, dtor) do { \
100837 + set_bit(PG_foreign, &(page)->flags); \
100838 + (page)->mapping = (void *)dtor; \
100839 +} while (0)
100840 +
100841 +#define ClearPageForeign(page) do { \
100842 + clear_bit(PG_foreign, &(page)->flags); \
100843 + (page)->mapping = NULL; \
100844 +} while (0)
100845 +
100846 +#define PageForeignDestructor(page) \
100847 + ( (void (*) (struct page *)) (page)->mapping )
100848 +
100849 +#endif /* __ASM_XEN_FOREIGN_PAGE_H__ */
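While a page is marked foreign its ->mapping field is unused by the page allocator, so these macros reuse it to stash the destructor that returns the page to its owner. A self-contained sketch of the same trick with invented types; the destructor name is hypothetical:

#include <stdio.h>

struct demo_page {
    unsigned long flags;
    void *mapping;                       /* holds the destructor while foreign */
};

#define PG_FOREIGN (1ul << 0)

static void demo_page_release(struct demo_page *pg)   /* hypothetical destructor */
{
    printf("returning page %p to its owner\n", (void *)pg);
}

static void set_foreign(struct demo_page *pg, void (*dtor)(struct demo_page *))
{
    pg->flags |= PG_FOREIGN;
    pg->mapping = (void *)dtor;          /* same cast the kernel macro performs */
}

static void release(struct demo_page *pg)
{
    if (pg->flags & PG_FOREIGN)
        ((void (*)(struct demo_page *))pg->mapping)(pg);
}

int main(void)
{
    struct demo_page pg = { 0, NULL };
    set_foreign(&pg, demo_page_release);
    release(&pg);
    return 0;
}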
100850 diff -Nur linux-2.6.16.33-noxen/include/xen/gnttab.h linux-2.6.16.33/include/xen/gnttab.h
100851 --- linux-2.6.16.33-noxen/include/xen/gnttab.h 1970-01-01 00:00:00.000000000 +0000
100852 +++ linux-2.6.16.33/include/xen/gnttab.h 2007-01-08 15:00:46.000000000 +0000
100853 @@ -0,0 +1,152 @@
100854 +/******************************************************************************
100855 + * gnttab.h
100856 + *
100857 + * Two sets of functionality:
100858 + * 1. Granting foreign access to our memory reservation.
100859 + * 2. Accessing others' memory reservations via grant references.
100860 + * (i.e., mechanisms for both sender and recipient of grant references)
100861 + *
100862 + * Copyright (c) 2004-2005, K A Fraser
100863 + * Copyright (c) 2005, Christopher Clark
100864 + *
100865 + * This program is free software; you can redistribute it and/or
100866 + * modify it under the terms of the GNU General Public License version 2
100867 + * as published by the Free Software Foundation; or, when distributed
100868 + * separately from the Linux kernel or incorporated into other
100869 + * software packages, subject to the following license:
100870 + *
100871 + * Permission is hereby granted, free of charge, to any person obtaining a copy
100872 + * of this source file (the "Software"), to deal in the Software without
100873 + * restriction, including without limitation the rights to use, copy, modify,
100874 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
100875 + * and to permit persons to whom the Software is furnished to do so, subject to
100876 + * the following conditions:
100877 + *
100878 + * The above copyright notice and this permission notice shall be included in
100879 + * all copies or substantial portions of the Software.
100880 + *
100881 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
100882 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
100883 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
100884 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
100885 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
100886 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
100887 + * IN THE SOFTWARE.
100888 + */
100889 +
100890 +#ifndef __ASM_GNTTAB_H__
100891 +#define __ASM_GNTTAB_H__
100892 +
100893 +#include <linux/config.h>
100894 +#include <asm/hypervisor.h>
100895 +#include <asm/maddr.h> /* maddr_t */
100896 +#include <xen/interface/grant_table.h>
100897 +#include <xen/features.h>
100898 +
100899 +/* NR_GRANT_FRAMES must be less than or equal to that configured in Xen */
100900 +#ifdef __ia64__
100901 +#define NR_GRANT_FRAMES 1
100902 +#else
100903 +#define NR_GRANT_FRAMES 4
100904 +#endif
100905 +
100906 +struct gnttab_free_callback {
100907 + struct gnttab_free_callback *next;
100908 + void (*fn)(void *);
100909 + void *arg;
100910 + u16 count;
100911 +};
100912 +
100913 +int gnttab_grant_foreign_access(domid_t domid, unsigned long frame,
100914 + int readonly);
100915 +
100916 +/*
100917 + * End access through the given grant reference, iff the grant entry is no
100918 + * longer in use. Return 1 if the grant entry was freed, 0 if it is still in
100919 + * use.
100920 + */
100921 +int gnttab_end_foreign_access_ref(grant_ref_t ref, int readonly);
100922 +
100923 +/*
100924 + * Eventually end access through the given grant reference, and once that
100925 + * access has been ended, free the given page too. Access will be ended
100926 + * immediately iff the grant entry is not in use, otherwise it will happen
100927 + * some time later. page may be 0, in which case no freeing will occur.
100928 + */
100929 +void gnttab_end_foreign_access(grant_ref_t ref, int readonly,
100930 + unsigned long page);
100931 +
100932 +int gnttab_grant_foreign_transfer(domid_t domid, unsigned long pfn);
100933 +
100934 +unsigned long gnttab_end_foreign_transfer_ref(grant_ref_t ref);
100935 +unsigned long gnttab_end_foreign_transfer(grant_ref_t ref);
100936 +
100937 +int gnttab_query_foreign_access(grant_ref_t ref);
100938 +
100939 +/*
100940 + * operations on reserved batches of grant references
100941 + */
100942 +int gnttab_alloc_grant_references(u16 count, grant_ref_t *pprivate_head);
100943 +
100944 +void gnttab_free_grant_reference(grant_ref_t ref);
100945 +
100946 +void gnttab_free_grant_references(grant_ref_t head);
100947 +
100948 +int gnttab_empty_grant_references(const grant_ref_t *pprivate_head);
100949 +
100950 +int gnttab_claim_grant_reference(grant_ref_t *pprivate_head);
100951 +
100952 +void gnttab_release_grant_reference(grant_ref_t *private_head,
100953 + grant_ref_t release);
100954 +
100955 +void gnttab_request_free_callback(struct gnttab_free_callback *callback,
100956 + void (*fn)(void *), void *arg, u16 count);
100957 +void gnttab_cancel_free_callback(struct gnttab_free_callback *callback);
100958 +
100959 +void gnttab_grant_foreign_access_ref(grant_ref_t ref, domid_t domid,
100960 + unsigned long frame, int readonly);
100961 +
100962 +void gnttab_grant_foreign_transfer_ref(grant_ref_t, domid_t domid,
100963 + unsigned long pfn);
100964 +
100965 +#ifdef __ia64__
100966 +#define gnttab_map_vaddr(map) __va(map.dev_bus_addr)
100967 +#else
100968 +#define gnttab_map_vaddr(map) ((void *)(map.host_virt_addr))
100969 +#endif
100970 +
100971 +int gnttab_suspend(void);
100972 +int gnttab_resume(void);
100973 +
100974 +static inline void
100975 +gnttab_set_map_op(struct gnttab_map_grant_ref *map, maddr_t addr,
100976 + uint32_t flags, grant_ref_t ref, domid_t domid)
100977 +{
100978 + if (flags & GNTMAP_contains_pte)
100979 + map->host_addr = addr;
100980 + else if (xen_feature(XENFEAT_auto_translated_physmap))
100981 + map->host_addr = __pa(addr);
100982 + else
100983 + map->host_addr = addr;
100984 +
100985 + map->flags = flags;
100986 + map->ref = ref;
100987 + map->dom = domid;
100988 +}
100989 +
100990 +static inline void
100991 +gnttab_set_unmap_op(struct gnttab_unmap_grant_ref *unmap, maddr_t addr,
100992 + uint32_t flags, grant_handle_t handle)
100993 +{
100994 + if (flags & GNTMAP_contains_pte)
100995 + unmap->host_addr = addr;
100996 + else if (xen_feature(XENFEAT_auto_translated_physmap))
100997 + unmap->host_addr = __pa(addr);
100998 + else
100999 + unmap->host_addr = addr;
101000 +
101001 + unmap->handle = handle;
101002 + unmap->dev_bus_addr = 0;
101003 +}
101004 +
101005 +#endif /* __ASM_GNTTAB_H__ */
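gnttab_set_map_op() chooses what to hand the hypervisor as host_addr: the PTE location when GNTMAP_contains_pte is set, a physical address on auto-translated guests, and the virtual address otherwise. A sketch of that selection; the flag value and the __pa() stand-in are invented:

#include <stdio.h>

#define GNTMAP_contains_pte (1u << 4)
#define FAKE_PAGE_OFFSET    0xC0000000UL        /* pretend direct-map base */

static unsigned long fake_pa(unsigned long vaddr) { return vaddr - FAKE_PAGE_OFFSET; }

static unsigned long pick_host_addr(unsigned long addr, unsigned flags, int auto_translated)
{
    if (flags & GNTMAP_contains_pte)
        return addr;                             /* caller passed a PTE location */
    if (auto_translated)
        return fake_pa(addr);                    /* hypervisor wants a physical address */
    return addr;                                 /* normal PV case: virtual address */
}

int main(void)
{
    unsigned long va = 0xC1234000UL;

    printf("pte case:  %#lx\n", pick_host_addr(va, GNTMAP_contains_pte, 0));
    printf("auto-xlat: %#lx\n", pick_host_addr(va, 0, 1));
    printf("pv case:   %#lx\n", pick_host_addr(va, 0, 0));
    return 0;
}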
101006 diff -Nur linux-2.6.16.33-noxen/include/xen/hvm.h linux-2.6.16.33/include/xen/hvm.h
101007 --- linux-2.6.16.33-noxen/include/xen/hvm.h 1970-01-01 00:00:00.000000000 +0000
101008 +++ linux-2.6.16.33/include/xen/hvm.h 2007-01-08 15:00:46.000000000 +0000
101009 @@ -0,0 +1,24 @@
101010 +/* Simple wrappers around HVM functions */
101011 +#ifndef XEN_HVM_H__
101012 +#define XEN_HVM_H__
101013 +
101014 +#include <xen/interface/hvm/params.h>
101015 +#include <asm/hypercall.h>
101016 +
101017 +static inline unsigned long hvm_get_parameter(int idx)
101018 +{
101019 + struct xen_hvm_param xhv;
101020 + int r;
101021 +
101022 + xhv.domid = DOMID_SELF;
101023 + xhv.index = idx;
101024 + r = HYPERVISOR_hvm_op(HVMOP_get_param, &xhv);
101025 + if (r < 0) {
101026 + printk(KERN_ERR "cannot get hvm parameter %d: %d.\n",
101027 + idx, r);
101028 + return 0;
101029 + }
101030 + return xhv.value;
101031 +}
101032 +
101033 +#endif /* XEN_HVM_H__ */
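hvm_get_parameter() follows the usual hypercall-wrapper pattern: fill a request structure, issue the hypercall, and fall back to 0 with a log message on error. A sketch of the same pattern with the hypercall replaced by a user-space stub; the struct layout and DOMID_SELF value are assumptions:

#include <stdio.h>

#define DOMID_SELF 0x7FF0u

struct demo_hvm_param { unsigned domid; unsigned index; unsigned long value; };

/* Stand-in for HYPERVISOR_hvm_op(HVMOP_get_param, &xhv). */
static int stub_hvm_op_get_param(struct demo_hvm_param *p)
{
    if (p->index > 8)
        return -22;           /* -EINVAL */
    p->value = 0x1000 + p->index;
    return 0;
}

static unsigned long get_parameter(int idx)
{
    struct demo_hvm_param xhv = { .domid = DOMID_SELF, .index = (unsigned)idx };
    int r = stub_hvm_op_get_param(&xhv);

    if (r < 0) {
        fprintf(stderr, "cannot get hvm parameter %d: %d\n", idx, r);
        return 0;             /* same fallback the kernel wrapper uses */
    }
    return xhv.value;
}

int main(void)
{
    printf("param 2  = %#lx\n", get_parameter(2));
    printf("param 99 = %#lx\n", get_parameter(99));
    return 0;
}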
101034 diff -Nur linux-2.6.16.33-noxen/include/xen/hypervisor_sysfs.h linux-2.6.16.33/include/xen/hypervisor_sysfs.h
101035 --- linux-2.6.16.33-noxen/include/xen/hypervisor_sysfs.h 1970-01-01 00:00:00.000000000 +0000
101036 +++ linux-2.6.16.33/include/xen/hypervisor_sysfs.h 2007-01-08 15:00:46.000000000 +0000
101037 @@ -0,0 +1,32 @@
101038 +/*
101039 + * copyright (c) 2006 IBM Corporation
101040 + * Authored by: Mike D. Day <ncmike@us.ibm.com>
101041 + *
101042 + * This program is free software; you can redistribute it and/or modify
101043 + * it under the terms of the GNU General Public License version 2 as
101044 + * published by the Free Software Foundation.
101045 + */
101046 +
101047 +#ifndef _HYP_SYSFS_H_
101048 +#define _HYP_SYSFS_H_
101049 +
101050 +#include <linux/kobject.h>
101051 +#include <linux/sysfs.h>
101052 +
101053 +#define HYPERVISOR_ATTR_RO(_name) \
101054 +static struct hyp_sysfs_attr _name##_attr = __ATTR_RO(_name)
101055 +
101056 +#define HYPERVISOR_ATTR_RW(_name) \
101057 +static struct hyp_sysfs_attr _name##_attr = \
101058 + __ATTR(_name, 0644, _name##_show, _name##_store)
101059 +
101060 +extern struct subsystem hypervisor_subsys;
101061 +
101062 +struct hyp_sysfs_attr {
101063 + struct attribute attr;
101064 + ssize_t (*show)(struct hyp_sysfs_attr *, char *);
101065 + ssize_t (*store)(struct hyp_sysfs_attr *, const char *, size_t);
101066 + void *hyp_attr_data;
101067 +};
101068 +
101069 +#endif /* _HYP_SYSFS_H_ */
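HYPERVISOR_ATTR_RO(name) expects a matching name##_show() function and expands to a name##_attr definition wired to it. A standalone sketch of the same pattern with the sysfs plumbing replaced by plain structs; the "version" attribute is hypothetical:

#include <stdio.h>

struct demo_attr {
    const char *name;
    int (*show)(char *buf, unsigned long len);
};

#define DEMO_ATTR_RO(_name) \
    static struct demo_attr _name##_attr = { #_name, _name##_show }

static int version_show(char *buf, unsigned long len)
{
    return snprintf(buf, len, "3.0.4\n");
}
DEMO_ATTR_RO(version);            /* expands to a version_attr definition */

int main(void)
{
    char buf[32];

    version_attr.show(buf, sizeof(buf));
    printf("%s: %s", version_attr.name, buf);
    return 0;
}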
101070 diff -Nur linux-2.6.16.33-noxen/include/xen/interface/COPYING linux-2.6.16.33/include/xen/interface/COPYING
101071 --- linux-2.6.16.33-noxen/include/xen/interface/COPYING 1970-01-01 00:00:00.000000000 +0000
101072 +++ linux-2.6.16.33/include/xen/interface/COPYING 2007-01-08 15:00:55.000000000 +0000
101073 @@ -0,0 +1,38 @@
101074 +XEN NOTICE
101075 +==========
101076 +
101077 +This copyright applies to all files within this subdirectory and its
101078 +subdirectories:
101079 + include/public/*.h
101080 + include/public/hvm/*.h
101081 + include/public/io/*.h
101082 +
101083 +The intention is that these files can be freely copied into the source
101084 +tree of an operating system when porting that OS to run on Xen. Doing
101085 +so does *not* cause the OS to become subject to the terms of the GPL.
101086 +
101087 +All other files in the Xen source distribution are covered by version
101088 +2 of the GNU General Public License except where explicitly stated
101089 +otherwise within individual source files.
101090 +
101091 + -- Keir Fraser (on behalf of the Xen team)
101092 +
101093 +=====================================================================
101094 +
101095 +Permission is hereby granted, free of charge, to any person obtaining a copy
101096 +of this software and associated documentation files (the "Software"), to
101097 +deal in the Software without restriction, including without limitation the
101098 +rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
101099 +sell copies of the Software, and to permit persons to whom the Software is
101100 +furnished to do so, subject to the following conditions:
101101 +
101102 +The above copyright notice and this permission notice shall be included in
101103 +all copies or substantial portions of the Software.
101104 +
101105 +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
101106 +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
101107 +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
101108 +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
101109 +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
101110 +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
101111 +DEALINGS IN THE SOFTWARE.
101112 diff -Nur linux-2.6.16.33-noxen/include/xen/interface/acm.h linux-2.6.16.33/include/xen/interface/acm.h
101113 --- linux-2.6.16.33-noxen/include/xen/interface/acm.h 1970-01-01 00:00:00.000000000 +0000
101114 +++ linux-2.6.16.33/include/xen/interface/acm.h 2007-01-08 15:00:55.000000000 +0000
101115 @@ -0,0 +1,205 @@
101116 +/*
101117 + * acm.h: Xen access control module interface definitions
101118 + *
101119 + * Permission is hereby granted, free of charge, to any person obtaining a copy
101120 + * of this software and associated documentation files (the "Software"), to
101121 + * deal in the Software without restriction, including without limitation the
101122 + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
101123 + * sell copies of the Software, and to permit persons to whom the Software is
101124 + * furnished to do so, subject to the following conditions:
101125 + *
101126 + * The above copyright notice and this permission notice shall be included in
101127 + * all copies or substantial portions of the Software.
101128 + *
101129 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
101130 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
101131 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
101132 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
101133 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
101134 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
101135 + * DEALINGS IN THE SOFTWARE.
101136 + *
101137 + * Reiner Sailer <sailer@watson.ibm.com>
101138 + * Copyright (c) 2005, International Business Machines Corporation.
101139 + */
101140 +
101141 +#ifndef _XEN_PUBLIC_ACM_H
101142 +#define _XEN_PUBLIC_ACM_H
101143 +
101144 +#include "xen.h"
101145 +
101146 +/* if ACM_DEBUG defined, all hooks should
101147 + * print a short trace message (comment it out
101148 + * when not in testing mode )
101149 + */
101150 +/* #define ACM_DEBUG */
101151 +
101152 +#ifdef ACM_DEBUG
101153 +# define printkd(fmt, args...) printk(fmt,## args)
101154 +#else
101155 +# define printkd(fmt, args...)
101156 +#endif
101157 +
101158 +/* default ssid reference value if not supplied */
101159 +#define ACM_DEFAULT_SSID 0x0
101160 +#define ACM_DEFAULT_LOCAL_SSID 0x0
101161 +
101162 +/* Internal ACM ERROR types */
101163 +#define ACM_OK 0
101164 +#define ACM_UNDEF -1
101165 +#define ACM_INIT_SSID_ERROR -2
101166 +#define ACM_INIT_SOID_ERROR -3
101167 +#define ACM_ERROR -4
101168 +
101169 +/* External ACCESS DECISIONS */
101170 +#define ACM_ACCESS_PERMITTED 0
101171 +#define ACM_ACCESS_DENIED -111
101172 +#define ACM_NULL_POINTER_ERROR -200
101173 +
101174 +/* primary policy in lower 4 bits */
101175 +#define ACM_NULL_POLICY 0
101176 +#define ACM_CHINESE_WALL_POLICY 1
101177 +#define ACM_SIMPLE_TYPE_ENFORCEMENT_POLICY 2
101178 +#define ACM_POLICY_UNDEFINED 15
101179 +
101180 +/* combinations have secondary policy component in higher 4bit */
101181 +#define ACM_CHINESE_WALL_AND_SIMPLE_TYPE_ENFORCEMENT_POLICY \
101182 + ((ACM_SIMPLE_TYPE_ENFORCEMENT_POLICY << 4) | ACM_CHINESE_WALL_POLICY)
101183 +
101184 +/* policy: */
101185 +#define ACM_POLICY_NAME(X) \
101186 + ((X) == (ACM_NULL_POLICY)) ? "NULL" : \
101187 + ((X) == (ACM_CHINESE_WALL_POLICY)) ? "CHINESE WALL" : \
101188 + ((X) == (ACM_SIMPLE_TYPE_ENFORCEMENT_POLICY)) ? "SIMPLE TYPE ENFORCEMENT" : \
101189 + ((X) == (ACM_CHINESE_WALL_AND_SIMPLE_TYPE_ENFORCEMENT_POLICY)) ? "CHINESE WALL AND SIMPLE TYPE ENFORCEMENT" : \
101190 + "UNDEFINED"
101191 +
101192 +/* the following policy versions must be increased
101193 + * whenever the interpretation of the related
101194 + * policy's data structure changes
101195 + */
101196 +#define ACM_POLICY_VERSION 2
101197 +#define ACM_CHWALL_VERSION 1
101198 +#define ACM_STE_VERSION 1
101199 +
101200 +/* defines a ssid reference used by xen */
101201 +typedef uint32_t ssidref_t;
101202 +
101203 +/* hooks that are known to domains */
101204 +#define ACMHOOK_none 0
101205 +#define ACMHOOK_sharing 1
101206 +
101207 +/* -------security policy relevant type definitions-------- */
101208 +
101209 +/* type identifier; compares to "equal" or "not equal" */
101210 +typedef uint16_t domaintype_t;
101211 +
101212 +/* CHINESE WALL POLICY DATA STRUCTURES
101213 + *
101214 + * current accumulated conflict type set:
101215 + * When a domain is started and has a type that is in
101216 + * a conflict set, the conflicting types are incremented in
101217 + * the aggregate set. When a domain is destroyed, the
101218 + * conflicting types to its type are decremented.
101219 + * If a domain has multiple types, this procedure works over
101220 + * all those types.
101221 + *
101222 + * conflict_aggregate_set[i] holds the number of
101223 + * running domains that have a conflict with type i.
101224 + *
101225 + * running_types[i] holds the number of running domains
101226 + * that include type i in their ssidref-referenced type set
101227 + *
101228 + * conflict_sets[i][j] is "0" if type j has no conflict
101229 + * with type i and is "1" otherwise.
101230 + */
101231 +/* high-16 = version, low-16 = check magic */
101232 +#define ACM_MAGIC 0x0001debc
101233 +
101234 +/* each offset in bytes from start of the struct they
101235 + * are part of */
101236 +
101237 +/* each buffer consists of all policy information for
101238 + * the respective policy given in the policy code
101239 + *
101240 + * acm_policy_buffer, acm_chwall_policy_buffer,
101241 + * and acm_ste_policy_buffer need to stay 32-bit aligned
101242 + * because we create binary policies also with external
101243 + * tools that assume packed representations (e.g. the java tool)
101244 + */
101245 +struct acm_policy_buffer {
101246 + uint32_t policy_version; /* ACM_POLICY_VERSION */
101247 + uint32_t magic;
101248 + uint32_t len;
101249 + uint32_t policy_reference_offset;
101250 + uint32_t primary_policy_code;
101251 + uint32_t primary_buffer_offset;
101252 + uint32_t secondary_policy_code;
101253 + uint32_t secondary_buffer_offset;
101254 +};
101255 +
101256 +struct acm_policy_reference_buffer {
101257 + uint32_t len;
101258 +};
101259 +
101260 +struct acm_chwall_policy_buffer {
101261 + uint32_t policy_version; /* ACM_CHWALL_VERSION */
101262 + uint32_t policy_code;
101263 + uint32_t chwall_max_types;
101264 + uint32_t chwall_max_ssidrefs;
101265 + uint32_t chwall_max_conflictsets;
101266 + uint32_t chwall_ssid_offset;
101267 + uint32_t chwall_conflict_sets_offset;
101268 + uint32_t chwall_running_types_offset;
101269 + uint32_t chwall_conflict_aggregate_offset;
101270 +};
101271 +
101272 +struct acm_ste_policy_buffer {
101273 + uint32_t policy_version; /* ACM_STE_VERSION */
101274 + uint32_t policy_code;
101275 + uint32_t ste_max_types;
101276 + uint32_t ste_max_ssidrefs;
101277 + uint32_t ste_ssid_offset;
101278 +};
101279 +
101280 +struct acm_stats_buffer {
101281 + uint32_t magic;
101282 + uint32_t len;
101283 + uint32_t primary_policy_code;
101284 + uint32_t primary_stats_offset;
101285 + uint32_t secondary_policy_code;
101286 + uint32_t secondary_stats_offset;
101287 +};
101288 +
101289 +struct acm_ste_stats_buffer {
101290 + uint32_t ec_eval_count;
101291 + uint32_t gt_eval_count;
101292 + uint32_t ec_denied_count;
101293 + uint32_t gt_denied_count;
101294 + uint32_t ec_cachehit_count;
101295 + uint32_t gt_cachehit_count;
101296 +};
101297 +
101298 +struct acm_ssid_buffer {
101299 + uint32_t len;
101300 + ssidref_t ssidref;
101301 + uint32_t policy_reference_offset;
101302 + uint32_t primary_policy_code;
101303 + uint32_t primary_max_types;
101304 + uint32_t primary_types_offset;
101305 + uint32_t secondary_policy_code;
101306 + uint32_t secondary_max_types;
101307 + uint32_t secondary_types_offset;
101308 +};
101309 +
101310 +#endif
101311 +
101312 +/*
101313 + * Local variables:
101314 + * mode: C
101315 + * c-set-style: "BSD"
101316 + * c-basic-offset: 4
101317 + * tab-width: 4
101318 + * indent-tabs-mode: nil
101319 + * End:
101320 + */
101321 diff -Nur linux-2.6.16.33-noxen/include/xen/interface/acm_ops.h linux-2.6.16.33/include/xen/interface/acm_ops.h
101322 --- linux-2.6.16.33-noxen/include/xen/interface/acm_ops.h 1970-01-01 00:00:00.000000000 +0000
101323 +++ linux-2.6.16.33/include/xen/interface/acm_ops.h 2007-01-08 15:00:55.000000000 +0000
101324 @@ -0,0 +1,120 @@
101325 +/*
101326 + * acm_ops.h: Xen access control module hypervisor commands
101327 + *
101328 + * Permission is hereby granted, free of charge, to any person obtaining a copy
101329 + * of this software and associated documentation files (the "Software"), to
101330 + * deal in the Software without restriction, including without limitation the
101331 + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
101332 + * sell copies of the Software, and to permit persons to whom the Software is
101333 + * furnished to do so, subject to the following conditions:
101334 + *
101335 + * The above copyright notice and this permission notice shall be included in
101336 + * all copies or substantial portions of the Software.
101337 + *
101338 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
101339 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
101340 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
101341 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
101342 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
101343 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
101344 + * DEALINGS IN THE SOFTWARE.
101345 + *
101346 + * Reiner Sailer <sailer@watson.ibm.com>
101347 + * Copyright (c) 2005,2006 International Business Machines Corporation.
101348 + */
101349 +
101350 +#ifndef __XEN_PUBLIC_ACM_OPS_H__
101351 +#define __XEN_PUBLIC_ACM_OPS_H__
101352 +
101353 +#include "xen.h"
101354 +#include "acm.h"
101355 +
101356 +/*
101357 + * Make sure you increment the interface version whenever you modify this file!
101358 + * This makes sure that old versions of acm tools will stop working in a
101359 + * well-defined way (rather than crashing the machine, for instance).
101360 + */
101361 +#define ACM_INTERFACE_VERSION 0xAAAA0008
101362 +
101363 +/************************************************************************/
101364 +
101365 +/*
101366 + * Prototype for this hypercall is:
101367 + * int acm_op(int cmd, void *args)
101368 + * @cmd == ACMOP_??? (access control module operation).
101369 + * @args == Operation-specific extra arguments (NULL if none).
101370 + */
101371 +
101372 +
101373 +#define ACMOP_setpolicy 1
101374 +struct acm_setpolicy {
101375 + /* IN */
101376 + uint32_t interface_version;
101377 + XEN_GUEST_HANDLE(void) pushcache;
101378 + uint32_t pushcache_size;
101379 +};
101380 +
101381 +
101382 +#define ACMOP_getpolicy 2
101383 +struct acm_getpolicy {
101384 + /* IN */
101385 + uint32_t interface_version;
101386 + XEN_GUEST_HANDLE(void) pullcache;
101387 + uint32_t pullcache_size;
101388 +};
101389 +
101390 +
101391 +#define ACMOP_dumpstats 3
101392 +struct acm_dumpstats {
101393 + /* IN */
101394 + uint32_t interface_version;
101395 + XEN_GUEST_HANDLE(void) pullcache;
101396 + uint32_t pullcache_size;
101397 +};
101398 +
101399 +
101400 +#define ACMOP_getssid 4
101401 +#define ACM_GETBY_ssidref 1
101402 +#define ACM_GETBY_domainid 2
101403 +struct acm_getssid {
101404 + /* IN */
101405 + uint32_t interface_version;
101406 + uint32_t get_ssid_by; /* ACM_GETBY_* */
101407 + union {
101408 + domaintype_t domainid;
101409 + ssidref_t ssidref;
101410 + } id;
101411 + XEN_GUEST_HANDLE(void) ssidbuf;
101412 + uint32_t ssidbuf_size;
101413 +};
101414 +
101415 +#define ACMOP_getdecision 5
101416 +struct acm_getdecision {
101417 + /* IN */
101418 + uint32_t interface_version;
101419 + uint32_t get_decision_by1; /* ACM_GETBY_* */
101420 + uint32_t get_decision_by2; /* ACM_GETBY_* */
101421 + union {
101422 + domaintype_t domainid;
101423 + ssidref_t ssidref;
101424 + } id1;
101425 + union {
101426 + domaintype_t domainid;
101427 + ssidref_t ssidref;
101428 + } id2;
101429 + uint32_t hook;
101430 + /* OUT */
101431 + uint32_t acm_decision;
101432 +};
101433 +
101434 +#endif /* __XEN_PUBLIC_ACM_OPS_H__ */
101435 +
101436 +/*
101437 + * Local variables:
101438 + * mode: C
101439 + * c-set-style: "BSD"
101440 + * c-basic-offset: 4
101441 + * tab-width: 4
101442 + * indent-tabs-mode: nil
101443 + * End:
101444 + */
101445 diff -Nur linux-2.6.16.33-noxen/include/xen/interface/arch-ia64.h linux-2.6.16.33/include/xen/interface/arch-ia64.h
101446 --- linux-2.6.16.33-noxen/include/xen/interface/arch-ia64.h 1970-01-01 00:00:00.000000000 +0000
101447 +++ linux-2.6.16.33/include/xen/interface/arch-ia64.h 2007-01-08 15:00:55.000000000 +0000
101448 @@ -0,0 +1,500 @@
101449 +/******************************************************************************
101450 + * arch-ia64/hypervisor-if.h
101451 + *
101452 + * Guest OS interface to IA64 Xen.
101453 + *
101454 + * Permission is hereby granted, free of charge, to any person obtaining a copy
101455 + * of this software and associated documentation files (the "Software"), to
101456 + * deal in the Software without restriction, including without limitation the
101457 + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
101458 + * sell copies of the Software, and to permit persons to whom the Software is
101459 + * furnished to do so, subject to the following conditions:
101460 + *
101461 + * The above copyright notice and this permission notice shall be included in
101462 + * all copies or substantial portions of the Software.
101463 + *
101464 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
101465 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
101466 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
101467 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
101468 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
101469 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
101470 + * DEALINGS IN THE SOFTWARE.
101471 + *
101472 + */
101473 +
101474 +#ifndef __HYPERVISOR_IF_IA64_H__
101475 +#define __HYPERVISOR_IF_IA64_H__
101476 +
101477 +/* Structural guest handles introduced in 0x00030201. */
101478 +#if __XEN_INTERFACE_VERSION__ >= 0x00030201
101479 +#define __DEFINE_XEN_GUEST_HANDLE(name, type) \
101480 + typedef struct { type *p; } __guest_handle_ ## name
101481 +#else
101482 +#define __DEFINE_XEN_GUEST_HANDLE(name, type) \
101483 + typedef type * __guest_handle_ ## name
101484 +#endif
101485 +
101486 +#define DEFINE_XEN_GUEST_HANDLE(name) __DEFINE_XEN_GUEST_HANDLE(name, name)
101487 +#define XEN_GUEST_HANDLE(name) __guest_handle_ ## name
101488 +#define set_xen_guest_handle(hnd, val) do { (hnd).p = val; } while (0)
101489 +#ifdef __XEN_TOOLS__
101490 +#define get_xen_guest_handle(val, hnd) do { val = (hnd).p; } while (0)
101491 +#endif
101492 +
101493 +#ifndef __ASSEMBLY__
101494 +/* Guest handles for primitive C types. */
101495 +__DEFINE_XEN_GUEST_HANDLE(uchar, unsigned char);
101496 +__DEFINE_XEN_GUEST_HANDLE(uint, unsigned int);
101497 +__DEFINE_XEN_GUEST_HANDLE(ulong, unsigned long);
101498 +__DEFINE_XEN_GUEST_HANDLE(u64, unsigned long);
101499 +DEFINE_XEN_GUEST_HANDLE(char);
101500 +DEFINE_XEN_GUEST_HANDLE(int);
101501 +DEFINE_XEN_GUEST_HANDLE(long);
101502 +DEFINE_XEN_GUEST_HANDLE(void);
101503 +
101504 +typedef unsigned long xen_pfn_t;
101505 +DEFINE_XEN_GUEST_HANDLE(xen_pfn_t);
101506 +#endif
101507 +
101508 +/* Arch specific VIRQs definition */
101509 +#define VIRQ_ITC VIRQ_ARCH_0 /* V. Virtual itc timer */
101510 +#define VIRQ_MCA_CMC VIRQ_ARCH_1 /* MCA cmc interrupt */
101511 +#define VIRQ_MCA_CPE VIRQ_ARCH_2 /* MCA cpe interrupt */
101512 +
101513 +/* Maximum number of virtual CPUs in multi-processor guests. */
101514 +/* WARNING: before changing this, check that shared_info fits on a page */
101515 +#define MAX_VIRT_CPUS 64
101516 +
101517 +#ifndef __ASSEMBLY__
101518 +
101519 +typedef unsigned long xen_ulong_t;
101520 +
101521 +#define INVALID_MFN (~0UL)
101522 +
101523 +#define MEM_G (1UL << 30)
101524 +#define MEM_M (1UL << 20)
101525 +
101526 +#define MMIO_START (3 * MEM_G)
101527 +#define MMIO_SIZE (512 * MEM_M)
101528 +
101529 +#define VGA_IO_START 0xA0000UL
101530 +#define VGA_IO_SIZE 0x20000
101531 +
101532 +#define LEGACY_IO_START (MMIO_START + MMIO_SIZE)
101533 +#define LEGACY_IO_SIZE (64*MEM_M)
101534 +
101535 +#define IO_PAGE_START (LEGACY_IO_START + LEGACY_IO_SIZE)
101536 +#define IO_PAGE_SIZE PAGE_SIZE
101537 +
101538 +#define STORE_PAGE_START (IO_PAGE_START + IO_PAGE_SIZE)
101539 +#define STORE_PAGE_SIZE PAGE_SIZE
101540 +
101541 +#define BUFFER_IO_PAGE_START (STORE_PAGE_START+PAGE_SIZE)
101542 +#define BUFFER_IO_PAGE_SIZE PAGE_SIZE
101543 +
101544 +#define IO_SAPIC_START 0xfec00000UL
101545 +#define IO_SAPIC_SIZE 0x100000
101546 +
101547 +#define PIB_START 0xfee00000UL
101548 +#define PIB_SIZE 0x200000
101549 +
101550 +#define GFW_START (4*MEM_G -16*MEM_M)
101551 +#define GFW_SIZE (16*MEM_M)
101552 +
101553 +struct pt_fpreg {
101554 + union {
101555 + unsigned long bits[2];
101556 + long double __dummy; /* force 16-byte alignment */
101557 + } u;
101558 +};
101559 +
101560 +struct cpu_user_regs {
101561 + /* The following registers are saved by SAVE_MIN: */
101562 + unsigned long b6; /* scratch */
101563 + unsigned long b7; /* scratch */
101564 +
101565 + unsigned long ar_csd; /* used by cmp8xchg16 (scratch) */
101566 + unsigned long ar_ssd; /* reserved for future use (scratch) */
101567 +
101568 + unsigned long r8; /* scratch (return value register 0) */
101569 + unsigned long r9; /* scratch (return value register 1) */
101570 + unsigned long r10; /* scratch (return value register 2) */
101571 + unsigned long r11; /* scratch (return value register 3) */
101572 +
101573 + unsigned long cr_ipsr; /* interrupted task's psr */
101574 + unsigned long cr_iip; /* interrupted task's instruction pointer */
101575 + unsigned long cr_ifs; /* interrupted task's function state */
101576 +
101577 + unsigned long ar_unat; /* interrupted task's NaT register (preserved) */
101578 + unsigned long ar_pfs; /* prev function state */
101579 + unsigned long ar_rsc; /* RSE configuration */
101580 + /* The following two are valid only if cr_ipsr.cpl > 0: */
101581 + unsigned long ar_rnat; /* RSE NaT */
101582 + unsigned long ar_bspstore; /* RSE bspstore */
101583 +
101584 + unsigned long pr; /* 64 predicate registers (1 bit each) */
101585 + unsigned long b0; /* return pointer (bp) */
101586 + unsigned long loadrs; /* size of dirty partition << 16 */
101587 +
101588 + unsigned long r1; /* the gp pointer */
101589 + unsigned long r12; /* interrupted task's memory stack pointer */
101590 + unsigned long r13; /* thread pointer */
101591 +
101592 + unsigned long ar_fpsr; /* floating point status (preserved) */
101593 + unsigned long r15; /* scratch */
101594 +
101595 + /* The remaining registers are NOT saved for system calls. */
101596 +
101597 + unsigned long r14; /* scratch */
101598 + unsigned long r2; /* scratch */
101599 + unsigned long r3; /* scratch */
101600 + unsigned long r16; /* scratch */
101601 + unsigned long r17; /* scratch */
101602 + unsigned long r18; /* scratch */
101603 + unsigned long r19; /* scratch */
101604 + unsigned long r20; /* scratch */
101605 + unsigned long r21; /* scratch */
101606 + unsigned long r22; /* scratch */
101607 + unsigned long r23; /* scratch */
101608 + unsigned long r24; /* scratch */
101609 + unsigned long r25; /* scratch */
101610 + unsigned long r26; /* scratch */
101611 + unsigned long r27; /* scratch */
101612 + unsigned long r28; /* scratch */
101613 + unsigned long r29; /* scratch */
101614 + unsigned long r30; /* scratch */
101615 + unsigned long r31; /* scratch */
101616 + unsigned long ar_ccv; /* compare/exchange value (scratch) */
101617 +
101618 + /*
101619 + * Floating point registers that the kernel considers scratch:
101620 + */
101621 + struct pt_fpreg f6; /* scratch */
101622 + struct pt_fpreg f7; /* scratch */
101623 + struct pt_fpreg f8; /* scratch */
101624 + struct pt_fpreg f9; /* scratch */
101625 + struct pt_fpreg f10; /* scratch */
101626 + struct pt_fpreg f11; /* scratch */
101627 + unsigned long r4; /* preserved */
101628 + unsigned long r5; /* preserved */
101629 + unsigned long r6; /* preserved */
101630 + unsigned long r7; /* preserved */
101631 + unsigned long eml_unat; /* used for emulating instruction */
101632 + unsigned long pad0; /* alignment pad */
101633 +
101634 +};
101635 +typedef struct cpu_user_regs cpu_user_regs_t;
101636 +
101637 +union vac {
101638 + unsigned long value;
101639 + struct {
101640 + int a_int:1;
101641 + int a_from_int_cr:1;
101642 + int a_to_int_cr:1;
101643 + int a_from_psr:1;
101644 + int a_from_cpuid:1;
101645 + int a_cover:1;
101646 + int a_bsw:1;
101647 + long reserved:57;
101648 + };
101649 +};
101650 +typedef union vac vac_t;
101651 +
101652 +union vdc {
101653 + unsigned long value;
101654 + struct {
101655 + int d_vmsw:1;
101656 + int d_extint:1;
101657 + int d_ibr_dbr:1;
101658 + int d_pmc:1;
101659 + int d_to_pmd:1;
101660 + int d_itm:1;
101661 + long reserved:58;
101662 + };
101663 +};
101664 +typedef union vdc vdc_t;
101665 +
101666 +struct mapped_regs {
101667 + union vac vac;
101668 + union vdc vdc;
101669 + unsigned long virt_env_vaddr;
101670 + unsigned long reserved1[29];
101671 + unsigned long vhpi;
101672 + unsigned long reserved2[95];
101673 + union {
101674 + unsigned long vgr[16];
101675 + unsigned long bank1_regs[16]; // bank1 regs (r16-r31) when bank0 active
101676 + };
101677 + union {
101678 + unsigned long vbgr[16];
101679 + unsigned long bank0_regs[16]; // bank0 regs (r16-r31) when bank1 active
101680 + };
101681 + unsigned long vnat;
101682 + unsigned long vbnat;
101683 + unsigned long vcpuid[5];
101684 + unsigned long reserved3[11];
101685 + unsigned long vpsr;
101686 + unsigned long vpr;
101687 + unsigned long reserved4[76];
101688 + union {
101689 + unsigned long vcr[128];
101690 + struct {
101691 + unsigned long dcr; // CR0
101692 + unsigned long itm;
101693 + unsigned long iva;
101694 + unsigned long rsv1[5];
101695 + unsigned long pta; // CR8
101696 + unsigned long rsv2[7];
101697 + unsigned long ipsr; // CR16
101698 + unsigned long isr;
101699 + unsigned long rsv3;
101700 + unsigned long iip;
101701 + unsigned long ifa;
101702 + unsigned long itir;
101703 + unsigned long iipa;
101704 + unsigned long ifs;
101705 + unsigned long iim; // CR24
101706 + unsigned long iha;
101707 + unsigned long rsv4[38];
101708 + unsigned long lid; // CR64
101709 + unsigned long ivr;
101710 + unsigned long tpr;
101711 + unsigned long eoi;
101712 + unsigned long irr[4];
101713 + unsigned long itv; // CR72
101714 + unsigned long pmv;
101715 + unsigned long cmcv;
101716 + unsigned long rsv5[5];
101717 + unsigned long lrr0; // CR80
101718 + unsigned long lrr1;
101719 + unsigned long rsv6[46];
101720 + };
101721 + };
101722 + union {
101723 + unsigned long reserved5[128];
101724 + struct {
101725 + unsigned long precover_ifs;
101726 + unsigned long unat; // not sure if this is needed until NaT arch is done
101727 + int interrupt_collection_enabled; // virtual psr.ic
101728 + /* virtual interrupt deliverable flag is evtchn_upcall_mask in
101729 + * shared info area now. interrupt_mask_addr is the address
101730 + * of evtchn_upcall_mask for current vcpu
101731 + */
101732 + unsigned char *interrupt_mask_addr;
101733 + int pending_interruption;
101734 + int incomplete_regframe; // see SDM vol2 6.8
101735 + unsigned char vpsr_pp;
101736 + unsigned char reserved5_2[7];
101737 + unsigned long reserved5_1[3];
101738 + int metaphysical_mode; // 1 = use metaphys mapping, 0 = use virtual
101739 + int banknum; // 0 or 1, which virtual register bank is active
101740 + unsigned long rrs[8]; // region registers
101741 + unsigned long krs[8]; // kernel registers
101742 + unsigned long pkrs[8]; // protection key registers
101743 + unsigned long tmp[8]; // temp registers (e.g. for hyperprivops)
101744 + };
101745 + };
101746 +};
101747 +typedef struct mapped_regs mapped_regs_t;
101748 +
101749 +struct vpd {
101750 + struct mapped_regs vpd_low;
101751 + unsigned long reserved6[3456];
101752 + unsigned long vmm_avail[128];
101753 + unsigned long reserved7[4096];
101754 +};
101755 +typedef struct vpd vpd_t;
101756 +
101757 +struct arch_vcpu_info {
101758 +};
101759 +typedef struct arch_vcpu_info arch_vcpu_info_t;
101760 +
101761 +struct arch_shared_info {
101762 + /* PFN of the start_info page. */
101763 + unsigned long start_info_pfn;
101764 +
101765 + /* Interrupt vector for event channel. */
101766 + int evtchn_vector;
101767 +
101768 + uint64_t pad[32];
101769 +};
101770 +typedef struct arch_shared_info arch_shared_info_t;
101771 +
101772 +typedef unsigned long xen_callback_t;
101773 +
101774 +struct ia64_tr_entry {
101775 + unsigned long pte;
101776 + unsigned long itir;
101777 + unsigned long vadr;
101778 + unsigned long rid;
101779 +};
101780 +
101781 +struct vcpu_extra_regs {
101782 + struct ia64_tr_entry itrs[8];
101783 + struct ia64_tr_entry dtrs[8];
101784 + unsigned long iva;
101785 + unsigned long dcr;
101786 + unsigned long event_callback_ip;
101787 +};
101788 +
101789 +struct vcpu_guest_context {
101790 +#define VGCF_EXTRA_REGS (1<<1) /* Get/Set extra regs. */
101791 + unsigned long flags; /* VGCF_* flags */
101792 +
101793 + struct cpu_user_regs user_regs;
101794 + struct vcpu_extra_regs extra_regs;
101795 + unsigned long privregs_pfn;
101796 +};
101797 +typedef struct vcpu_guest_context vcpu_guest_context_t;
101798 +DEFINE_XEN_GUEST_HANDLE(vcpu_guest_context_t);
101799 +
101800 +/* dom0 vp op */
101801 +#define __HYPERVISOR_ia64_dom0vp_op __HYPERVISOR_arch_0
101802 +/* Map io space in machine address to dom0 physical address space.
101803 +   Currently the assigned physical address equals the machine address. */
101804 +#define IA64_DOM0VP_ioremap 0
101805 +
101806 +/* Convert a pseudo physical page frame number to the corresponding
101807 + machine page frame number. If no page is assigned, INVALID_MFN or
101808 + GPFN_INV_MASK is returned depending on domain's non-vti/vti mode. */
101809 +#define IA64_DOM0VP_phystomach 1
101810 +
101811 +/* Convert a machine page frame number to the corresponding pseudo physical
101812 + page frame number of the caller domain. */
101813 +#define IA64_DOM0VP_machtophys 3
101814 +
101815 +/* Reserved for future use. */
101816 +#define IA64_DOM0VP_iounmap 4
101817 +
101818 +/* Unmap and free pages contained in the specified pseudo physical region. */
101819 +#define IA64_DOM0VP_zap_physmap 5
101820 +
101821 +/* Assign machine page frame to dom0's pseudo physical address space. */
101822 +#define IA64_DOM0VP_add_physmap 6
101823 +
101824 +/* expose the p2m table into domain */
101825 +#define IA64_DOM0VP_expose_p2m 7
101826 +
101827 +/* xen perfmon */
101828 +#define IA64_DOM0VP_perfmon 8
101829 +
101830 +/* gmfn version of IA64_DOM0VP_add_physmap */
101831 +#define IA64_DOM0VP_add_physmap_with_gmfn 9
101832 +
101833 +// flags for page assignment to pseudo physical address space
101834 +#define _ASSIGN_readonly 0
101835 +#define ASSIGN_readonly (1UL << _ASSIGN_readonly)
101836 +#define ASSIGN_writable (0UL << _ASSIGN_readonly) // dummy flag
101837 +/* Internal only: memory attribute must be WC/UC/UCE. */
101838 +#define _ASSIGN_nocache 1
101839 +#define ASSIGN_nocache (1UL << _ASSIGN_nocache)
101840 +// tlb tracking
101841 +#define _ASSIGN_tlb_track 2
101842 +#define ASSIGN_tlb_track (1UL << _ASSIGN_tlb_track)
101843 +/* Internal only: associated with PGC_allocated bit */
101844 +#define _ASSIGN_pgc_allocated 3
101845 +#define ASSIGN_pgc_allocated (1UL << _ASSIGN_pgc_allocated)
101846 +
101847 +/* This structure has the same layout as struct ia64_boot_param, defined in
101848 + <asm/system.h>. It is redefined here to ease use. */
101849 +struct xen_ia64_boot_param {
101850 + unsigned long command_line; /* physical address of cmd line args */
101851 + unsigned long efi_systab; /* physical address of EFI system table */
101852 + unsigned long efi_memmap; /* physical address of EFI memory map */
101853 + unsigned long efi_memmap_size; /* size of EFI memory map */
101854 + unsigned long efi_memdesc_size; /* size of an EFI memory map descriptor */
101855 + unsigned int efi_memdesc_version; /* memory descriptor version */
101856 + struct {
101857 + unsigned short num_cols; /* number of columns on console. */
101858 + unsigned short num_rows; /* number of rows on console. */
101859 + unsigned short orig_x; /* cursor's x position */
101860 + unsigned short orig_y; /* cursor's y position */
101861 + } console_info;
101862 + unsigned long fpswa; /* physical address of the fpswa interface */
101863 + unsigned long initrd_start;
101864 + unsigned long initrd_size;
101865 + unsigned long domain_start; /* va where the boot time domain begins */
101866 + unsigned long domain_size; /* how big is the boot domain */
101867 +};
101868 +
101869 +#endif /* !__ASSEMBLY__ */
101870 +
101871 +/* Size of the shared_info area (this is not related to page size). */
101872 +#define XSI_SHIFT 14
101873 +#define XSI_SIZE (1 << XSI_SHIFT)
101874 +/* Log size of mapped_regs area (64 KB - only 4KB is used). */
101875 +#define XMAPPEDREGS_SHIFT 12
101876 +#define XMAPPEDREGS_SIZE (1 << XMAPPEDREGS_SHIFT)
101877 +/* Offset of XASI (Xen arch shared info) wrt XSI_BASE. */
101878 +#define XMAPPEDREGS_OFS XSI_SIZE
101879 +
101880 +/* Hyperprivops. */
101881 +#define HYPERPRIVOP_RFI 0x1
101882 +#define HYPERPRIVOP_RSM_DT 0x2
101883 +#define HYPERPRIVOP_SSM_DT 0x3
101884 +#define HYPERPRIVOP_COVER 0x4
101885 +#define HYPERPRIVOP_ITC_D 0x5
101886 +#define HYPERPRIVOP_ITC_I 0x6
101887 +#define HYPERPRIVOP_SSM_I 0x7
101888 +#define HYPERPRIVOP_GET_IVR 0x8
101889 +#define HYPERPRIVOP_GET_TPR 0x9
101890 +#define HYPERPRIVOP_SET_TPR 0xa
101891 +#define HYPERPRIVOP_EOI 0xb
101892 +#define HYPERPRIVOP_SET_ITM 0xc
101893 +#define HYPERPRIVOP_THASH 0xd
101894 +#define HYPERPRIVOP_PTC_GA 0xe
101895 +#define HYPERPRIVOP_ITR_D 0xf
101896 +#define HYPERPRIVOP_GET_RR 0x10
101897 +#define HYPERPRIVOP_SET_RR 0x11
101898 +#define HYPERPRIVOP_SET_KR 0x12
101899 +#define HYPERPRIVOP_FC 0x13
101900 +#define HYPERPRIVOP_GET_CPUID 0x14
101901 +#define HYPERPRIVOP_GET_PMD 0x15
101902 +#define HYPERPRIVOP_GET_EFLAG 0x16
101903 +#define HYPERPRIVOP_SET_EFLAG 0x17
101904 +#define HYPERPRIVOP_RSM_BE 0x18
101905 +#define HYPERPRIVOP_GET_PSR 0x19
101906 +#define HYPERPRIVOP_MAX 0x19
101907 +
101908 +/* Fast and light hypercalls. */
101909 +#define __HYPERVISOR_ia64_fast_eoi 0x0200
101910 +
101911 +/* Xencomm macros. */
101912 +#define XENCOMM_INLINE_MASK 0xf800000000000000UL
101913 +#define XENCOMM_INLINE_FLAG 0x8000000000000000UL
101914 +
101915 +#define XENCOMM_IS_INLINE(addr) \
101916 + (((unsigned long)(addr) & XENCOMM_INLINE_MASK) == XENCOMM_INLINE_FLAG)
101917 +#define XENCOMM_INLINE_ADDR(addr) \
101918 + ((unsigned long)(addr) & ~XENCOMM_INLINE_MASK)
101919 +
101920 +/* xen perfmon */
101921 +#ifdef XEN
101922 +#ifndef __ASSEMBLY__
101923 +#ifndef _ASM_IA64_PERFMON_H
101924 +
101925 +#include <xen/list.h> // asm/perfmon.h requires struct list_head
101926 +#include <asm/perfmon.h>
101927 +// for PFM_xxx and pfarg_features_t, pfarg_context_t, pfarg_reg_t, pfarg_load_t
101928 +
101929 +#endif /* _ASM_IA64_PERFMON_H */
101930 +
101931 +DEFINE_XEN_GUEST_HANDLE(pfarg_features_t);
101932 +DEFINE_XEN_GUEST_HANDLE(pfarg_context_t);
101933 +DEFINE_XEN_GUEST_HANDLE(pfarg_reg_t);
101934 +DEFINE_XEN_GUEST_HANDLE(pfarg_load_t);
101935 +#endif /* __ASSEMBLY__ */
101936 +#endif /* XEN */
101937 +
101938 +#endif /* __HYPERVISOR_IF_IA64_H__ */
101939 +
101940 +/*
101941 + * Local variables:
101942 + * mode: C
101943 + * c-set-style: "BSD"
101944 + * c-basic-offset: 4
101945 + * tab-width: 4
101946 + * indent-tabs-mode: nil
101947 + * End:
101948 + */
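
/*
 * Editorial sketch, not part of the patch: how the XENCOMM_* macros defined
 * in arch-ia64.h above are meant to be used.  An "inline" xencomm handle
 * encodes a guest physical address directly by setting the top bits, so no
 * descriptor page is needed.  The address below is hypothetical and the
 * program assumes a 64-bit build (as on ia64).
 */
#include <stdio.h>

#define XENCOMM_INLINE_MASK 0xf800000000000000UL
#define XENCOMM_INLINE_FLAG 0x8000000000000000UL

int main(void)
{
    unsigned long paddr  = 0x1234000UL;               /* hypothetical guest paddr */
    unsigned long handle = paddr | XENCOMM_INLINE_FLAG;

    /* XENCOMM_IS_INLINE: the top bits must match the inline flag pattern. */
    int is_inline = ((handle & XENCOMM_INLINE_MASK) == XENCOMM_INLINE_FLAG);
    /* XENCOMM_INLINE_ADDR: strip the flag bits to recover the address. */
    unsigned long addr = handle & ~XENCOMM_INLINE_MASK;

    printf("inline=%d addr=%#lx\n", is_inline, addr);  /* inline=1 addr=0x1234000 */
    return 0;
}
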
101949 diff -Nur linux-2.6.16.33-noxen/include/xen/interface/arch-powerpc.h linux-2.6.16.33/include/xen/interface/arch-powerpc.h
101950 --- linux-2.6.16.33-noxen/include/xen/interface/arch-powerpc.h 1970-01-01 00:00:00.000000000 +0000
101951 +++ linux-2.6.16.33/include/xen/interface/arch-powerpc.h 2007-01-08 15:00:55.000000000 +0000
101952 @@ -0,0 +1,121 @@
101953 +/*
101954 + * Permission is hereby granted, free of charge, to any person obtaining a copy
101955 + * of this software and associated documentation files (the "Software"), to
101956 + * deal in the Software without restriction, including without limitation the
101957 + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
101958 + * sell copies of the Software, and to permit persons to whom the Software is
101959 + * furnished to do so, subject to the following conditions:
101960 + *
101961 + * The above copyright notice and this permission notice shall be included in
101962 + * all copies or substantial portions of the Software.
101963 + *
101964 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
101965 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
101966 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
101967 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
101968 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
101969 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
101970 + * DEALINGS IN THE SOFTWARE.
101971 + *
101972 + * Copyright (C) IBM Corp. 2005, 2006
101973 + *
101974 + * Authors: Hollis Blanchard <hollisb@us.ibm.com>
101975 + */
101976 +
101977 +#ifndef __XEN_PUBLIC_ARCH_PPC_64_H__
101978 +#define __XEN_PUBLIC_ARCH_PPC_64_H__
101979 +
101980 +#define __DEFINE_XEN_GUEST_HANDLE(name, type) \
101981 + typedef struct { \
101982 + int __pad[(sizeof (long long) - sizeof (void *)) / sizeof (int)]; \
101983 + type *p; \
101984 + } __attribute__((__aligned__(8))) __guest_handle_ ## name
101985 +
101986 +#define DEFINE_XEN_GUEST_HANDLE(name) __DEFINE_XEN_GUEST_HANDLE(name, name)
101987 +#define XEN_GUEST_HANDLE(name) __guest_handle_ ## name
101988 +#define set_xen_guest_handle(hnd, val) \
101989 + do { \
101990 + if (sizeof ((hnd).__pad)) \
101991 + (hnd).__pad[0] = 0; \
101992 + (hnd).p = val; \
101993 + } while (0)
101994 +
101995 +#ifdef __XEN_TOOLS__
101996 +#define get_xen_guest_handle(val, hnd) do { val = (hnd).p; } while (0)
101997 +#endif
101998 +
101999 +#ifndef __ASSEMBLY__
102000 +/* Guest handles for primitive C types. */
102001 +__DEFINE_XEN_GUEST_HANDLE(uchar, unsigned char);
102002 +__DEFINE_XEN_GUEST_HANDLE(uint, unsigned int);
102003 +__DEFINE_XEN_GUEST_HANDLE(ulong, unsigned long);
102004 +DEFINE_XEN_GUEST_HANDLE(char);
102005 +DEFINE_XEN_GUEST_HANDLE(int);
102006 +DEFINE_XEN_GUEST_HANDLE(long);
102007 +DEFINE_XEN_GUEST_HANDLE(void);
102008 +
102009 +typedef unsigned long long xen_pfn_t;
102010 +DEFINE_XEN_GUEST_HANDLE(xen_pfn_t);
102011 +#endif
102012 +
102013 +/*
102014 + * Pointers and other address fields inside interface structures are padded to
102015 + * 64 bits. This means that field alignments aren't different between 32- and
102016 + * 64-bit architectures.
102017 + */
102018 +/* NB. Multi-level macro ensures __LINE__ is expanded before concatenation. */
102019 +#define __MEMORY_PADDING(_X)
102020 +#define _MEMORY_PADDING(_X) __MEMORY_PADDING(_X)
102021 +#define MEMORY_PADDING _MEMORY_PADDING(__LINE__)
102022 +
102023 +/* And the trap vector is... */
102024 +#define TRAP_INSTR "li 0,-1; sc" /* XXX just "sc"? */
102025 +
102026 +#ifndef __ASSEMBLY__
102027 +
102028 +#define XENCOMM_INLINE_FLAG (1UL << 63)
102029 +
102030 +typedef uint64_t xen_ulong_t;
102031 +
102032 +/* User-accessible registers: need to be saved/restored for every nested Xen
102033 + * invocation. */
102034 +struct cpu_user_regs
102035 +{
102036 + uint64_t gprs[32];
102037 + uint64_t lr;
102038 + uint64_t ctr;
102039 + uint64_t srr0;
102040 + uint64_t srr1;
102041 + uint64_t pc;
102042 + uint64_t msr;
102043 + uint64_t fpscr;
102044 + uint64_t xer;
102045 + uint64_t hid4;
102046 + uint32_t cr;
102047 + uint32_t entry_vector;
102048 +};
102049 +typedef struct cpu_user_regs cpu_user_regs_t;
102050 +
102051 +typedef uint64_t tsc_timestamp_t; /* RDTSC timestamp */ /* XXX timebase */
102052 +
102053 +/* ONLY used to communicate with dom0! See also struct exec_domain. */
102054 +struct vcpu_guest_context {
102055 + cpu_user_regs_t user_regs; /* User-level CPU registers */
102056 + uint64_t sdr1; /* Pagetable base */
102057 + /* XXX etc */
102058 +};
102059 +typedef struct vcpu_guest_context vcpu_guest_context_t;
102060 +DEFINE_XEN_GUEST_HANDLE(vcpu_guest_context_t);
102061 +
102062 +struct arch_shared_info {
102063 + uint64_t pad[32];
102064 +};
102065 +
102066 +struct arch_vcpu_info {
102067 +};
102068 +
102069 +/* Support for multi-processor guests. */
102070 +#define MAX_VIRT_CPUS 32
102071 +#endif
102072 +
102073 +#endif
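
/*
 * Editorial sketch, not part of the patch: what the padded guest handle in
 * arch-powerpc.h above buys.  The handle is padded to 64 bits so interface
 * structures have the same layout in 32- and 64-bit builds, and the
 * set_xen_guest_handle() pattern zeroes the pad before storing the pointer.
 * The type and macro names below are local stand-ins, not the real API.
 */
#include <stdio.h>

typedef struct {
    int __pad[(sizeof (long long) - sizeof (void *)) / sizeof (int)];
    int *p;
} __attribute__((__aligned__(8))) demo_handle_int;

#define demo_set_handle(hnd, val)          \
    do {                                   \
        if (sizeof ((hnd).__pad))          \
            (hnd).__pad[0] = 0;            \
        (hnd).p = (val);                   \
    } while (0)

int main(void)
{
    static int buf[4];
    demo_handle_int h;

    demo_set_handle(h, buf);               /* pad (if any) cleared, pointer stored */
    printf("handle size = %zu bytes, p = %p\n", sizeof(h), (void *)h.p);
    return 0;
}
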
102074 diff -Nur linux-2.6.16.33-noxen/include/xen/interface/arch-x86/xen-x86_32.h linux-2.6.16.33/include/xen/interface/arch-x86/xen-x86_32.h
102075 --- linux-2.6.16.33-noxen/include/xen/interface/arch-x86/xen-x86_32.h 1970-01-01 00:00:00.000000000 +0000
102076 +++ linux-2.6.16.33/include/xen/interface/arch-x86/xen-x86_32.h 2007-01-08 15:00:55.000000000 +0000
102077 @@ -0,0 +1,151 @@
102078 +/******************************************************************************
102079 + * xen-x86_32.h
102080 + *
102081 + * Guest OS interface to x86 32-bit Xen.
102082 + *
102083 + * Permission is hereby granted, free of charge, to any person obtaining a copy
102084 + * of this software and associated documentation files (the "Software"), to
102085 + * deal in the Software without restriction, including without limitation the
102086 + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
102087 + * sell copies of the Software, and to permit persons to whom the Software is
102088 + * furnished to do so, subject to the following conditions:
102089 + *
102090 + * The above copyright notice and this permission notice shall be included in
102091 + * all copies or substantial portions of the Software.
102092 + *
102093 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
102094 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
102095 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
102096 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
102097 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
102098 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
102099 + * DEALINGS IN THE SOFTWARE.
102100 + *
102101 + * Copyright (c) 2004-2006, K A Fraser
102102 + */
102103 +
102104 +#ifndef __XEN_PUBLIC_ARCH_X86_XEN_X86_32_H__
102105 +#define __XEN_PUBLIC_ARCH_X86_XEN_X86_32_H__
102106 +
102107 +/*
102108 + * Hypercall interface:
102109 + * Input: %ebx, %ecx, %edx, %esi, %edi (arguments 1-5)
102110 + * Output: %eax
102111 + * Access is via hypercall page (set up by guest loader or via a Xen MSR):
102112 + * call hypercall_page + hypercall-number * 32
102113 + * Clobbered: Argument registers (e.g., 2-arg hypercall clobbers %ebx,%ecx)
102114 + */
102115 +
102116 +#if __XEN_INTERFACE_VERSION__ < 0x00030203
102117 +/*
102118 + * Legacy hypercall interface:
102119 + * As above, except the entry sequence to the hypervisor is:
102120 + * mov $hypercall-number*32,%eax ; int $0x82
102121 + */
102122 +#define TRAP_INSTR "int $0x82"
102123 +#endif
102124 +
102125 +/*
102126 + * These flat segments are in the Xen-private section of every GDT. Since these
102127 + * are also present in the initial GDT, many OSes will be able to avoid
102128 + * installing their own GDT.
102129 + */
102130 +#define FLAT_RING1_CS 0xe019 /* GDT index 259 */
102131 +#define FLAT_RING1_DS 0xe021 /* GDT index 260 */
102132 +#define FLAT_RING1_SS 0xe021 /* GDT index 260 */
102133 +#define FLAT_RING3_CS 0xe02b /* GDT index 261 */
102134 +#define FLAT_RING3_DS 0xe033 /* GDT index 262 */
102135 +#define FLAT_RING3_SS 0xe033 /* GDT index 262 */
102136 +
102137 +#define FLAT_KERNEL_CS FLAT_RING1_CS
102138 +#define FLAT_KERNEL_DS FLAT_RING1_DS
102139 +#define FLAT_KERNEL_SS FLAT_RING1_SS
102140 +#define FLAT_USER_CS FLAT_RING3_CS
102141 +#define FLAT_USER_DS FLAT_RING3_DS
102142 +#define FLAT_USER_SS FLAT_RING3_SS
102143 +
102144 +/*
102145 + * Virtual addresses beyond this are not modifiable by guest OSes. The
102146 + * machine->physical mapping table starts at this address, read-only.
102147 + */
102148 +#ifdef CONFIG_X86_PAE
102149 +#define __HYPERVISOR_VIRT_START 0xF5800000
102150 +#define __MACH2PHYS_VIRT_START 0xF5800000
102151 +#define __MACH2PHYS_VIRT_END 0xF6800000
102152 +#else
102153 +#define __HYPERVISOR_VIRT_START 0xFC000000
102154 +#define __MACH2PHYS_VIRT_START 0xFC000000
102155 +#define __MACH2PHYS_VIRT_END 0xFC400000
102156 +#endif
102157 +
102158 +#ifndef HYPERVISOR_VIRT_START
102159 +#define HYPERVISOR_VIRT_START mk_unsigned_long(__HYPERVISOR_VIRT_START)
102160 +#endif
102161 +
102162 +#define MACH2PHYS_VIRT_START mk_unsigned_long(__MACH2PHYS_VIRT_START)
102163 +#define MACH2PHYS_VIRT_END mk_unsigned_long(__MACH2PHYS_VIRT_END)
102164 +#define MACH2PHYS_NR_ENTRIES ((MACH2PHYS_VIRT_END-MACH2PHYS_VIRT_START)>>2)
102165 +#ifndef machine_to_phys_mapping
102166 +#define machine_to_phys_mapping ((unsigned long *)MACH2PHYS_VIRT_START)
102167 +#endif
102168 +
102169 +#ifndef __ASSEMBLY__
102170 +
102171 +struct cpu_user_regs {
102172 + uint32_t ebx;
102173 + uint32_t ecx;
102174 + uint32_t edx;
102175 + uint32_t esi;
102176 + uint32_t edi;
102177 + uint32_t ebp;
102178 + uint32_t eax;
102179 + uint16_t error_code; /* private */
102180 + uint16_t entry_vector; /* private */
102181 + uint32_t eip;
102182 + uint16_t cs;
102183 + uint8_t saved_upcall_mask;
102184 + uint8_t _pad0;
102185 + uint32_t eflags; /* eflags.IF == !saved_upcall_mask */
102186 + uint32_t esp;
102187 + uint16_t ss, _pad1;
102188 + uint16_t es, _pad2;
102189 + uint16_t ds, _pad3;
102190 + uint16_t fs, _pad4;
102191 + uint16_t gs, _pad5;
102192 +};
102193 +typedef struct cpu_user_regs cpu_user_regs_t;
102194 +DEFINE_XEN_GUEST_HANDLE(cpu_user_regs_t);
102195 +
102196 +/*
102197 + * Page-directory addresses above 4GB do not fit into architectural %cr3.
102198 + * When accessing %cr3, or equivalent field in vcpu_guest_context, guests
102199 + * must use the following accessor macros to pack/unpack valid MFNs.
102200 + */
102201 +#define xen_pfn_to_cr3(pfn) (((unsigned)(pfn) << 12) | ((unsigned)(pfn) >> 20))
102202 +#define xen_cr3_to_pfn(cr3) (((unsigned)(cr3) >> 12) | ((unsigned)(cr3) << 20))
102203 +
102204 +struct arch_vcpu_info {
102205 + unsigned long cr2;
102206 + unsigned long pad[5]; /* sizeof(vcpu_info_t) == 64 */
102207 +};
102208 +typedef struct arch_vcpu_info arch_vcpu_info_t;
102209 +
102210 +struct xen_callback {
102211 + unsigned long cs;
102212 + unsigned long eip;
102213 +};
102214 +typedef struct xen_callback xen_callback_t;
102215 +
102216 +#endif /* !__ASSEMBLY__ */
102217 +
102218 +#endif /* __XEN_PUBLIC_ARCH_X86_XEN_X86_32_H__ */
102219 +
102220 +/*
102221 + * Local variables:
102222 + * mode: C
102223 + * c-set-style: "BSD"
102224 + * c-basic-offset: 4
102225 + * tab-width: 4
102226 + * indent-tabs-mode: nil
102227 + * End:
102228 + */
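
/*
 * Editorial sketch, not part of the patch: the PAE %cr3 packing performed by
 * xen_pfn_to_cr3()/xen_cr3_to_pfn() in xen-x86_32.h above.  A page-directory
 * MFN above the 4GB boundary does not fit as mfn<<12 in a 32-bit word, so the
 * high bits are rotated into the low 12 bits.  The MFN value is made up.
 */
#include <stdio.h>

#define xen_pfn_to_cr3(pfn) (((unsigned)(pfn) << 12) | ((unsigned)(pfn) >> 20))
#define xen_cr3_to_pfn(cr3) (((unsigned)(cr3) >> 12) | ((unsigned)(cr3) << 20))

int main(void)
{
    unsigned mfn  = 0x123456;               /* frame >= 2^20, i.e. above 4GB */
    unsigned cr3  = xen_pfn_to_cr3(mfn);    /* low 20 bits go up, high bits wrap low */
    unsigned back = xen_cr3_to_pfn(cr3);

    printf("mfn=%#x cr3=%#x back=%#x\n", mfn, cr3, back);   /* back == mfn */
    return 0;
}
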
102229 diff -Nur linux-2.6.16.33-noxen/include/xen/interface/arch-x86/xen-x86_64.h linux-2.6.16.33/include/xen/interface/arch-x86/xen-x86_64.h
102230 --- linux-2.6.16.33-noxen/include/xen/interface/arch-x86/xen-x86_64.h 1970-01-01 00:00:00.000000000 +0000
102231 +++ linux-2.6.16.33/include/xen/interface/arch-x86/xen-x86_64.h 2007-01-08 15:00:55.000000000 +0000
102232 @@ -0,0 +1,208 @@
102233 +/******************************************************************************
102234 + * xen-x86_64.h
102235 + *
102236 + * Guest OS interface to x86 64-bit Xen.
102237 + *
102238 + * Permission is hereby granted, free of charge, to any person obtaining a copy
102239 + * of this software and associated documentation files (the "Software"), to
102240 + * deal in the Software without restriction, including without limitation the
102241 + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
102242 + * sell copies of the Software, and to permit persons to whom the Software is
102243 + * furnished to do so, subject to the following conditions:
102244 + *
102245 + * The above copyright notice and this permission notice shall be included in
102246 + * all copies or substantial portions of the Software.
102247 + *
102248 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
102249 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
102250 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
102251 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
102252 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
102253 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
102254 + * DEALINGS IN THE SOFTWARE.
102255 + *
102256 + * Copyright (c) 2004-2006, K A Fraser
102257 + */
102258 +
102259 +#ifndef __XEN_PUBLIC_ARCH_X86_XEN_X86_64_H__
102260 +#define __XEN_PUBLIC_ARCH_X86_XEN_X86_64_H__
102261 +
102262 +/*
102263 + * Hypercall interface:
102264 + * Input: %rdi, %rsi, %rdx, %r10, %r8 (arguments 1-5)
102265 + * Output: %rax
102266 + * Access is via hypercall page (set up by guest loader or via a Xen MSR):
102267 + * call hypercall_page + hypercall-number * 32
102268 + * Clobbered: argument registers (e.g., 2-arg hypercall clobbers %rdi,%rsi)
102269 + */
102270 +
102271 +#if __XEN_INTERFACE_VERSION__ < 0x00030203
102272 +/*
102273 + * Legacy hypercall interface:
102274 + * As above, except the entry sequence to the hypervisor is:
102275 + * mov $hypercall-number*32,%eax ; syscall
102276 + * Clobbered: %rcx, %r11, argument registers (as above)
102277 + */
102278 +#define TRAP_INSTR "syscall"
102279 +#endif
102280 +
102281 +/*
102282 + * 64-bit segment selectors
102283 + * These flat segments are in the Xen-private section of every GDT. Since these
102284 + * are also present in the initial GDT, many OSes will be able to avoid
102285 + * installing their own GDT.
102286 + */
102287 +
102288 +#define FLAT_RING3_CS32 0xe023 /* GDT index 260 */
102289 +#define FLAT_RING3_CS64 0xe033 /* GDT index 261 */
102290 +#define FLAT_RING3_DS32 0xe02b /* GDT index 262 */
102291 +#define FLAT_RING3_DS64 0x0000 /* NULL selector */
102292 +#define FLAT_RING3_SS32 0xe02b /* GDT index 262 */
102293 +#define FLAT_RING3_SS64 0xe02b /* GDT index 262 */
102294 +
102295 +#define FLAT_KERNEL_DS64 FLAT_RING3_DS64
102296 +#define FLAT_KERNEL_DS32 FLAT_RING3_DS32
102297 +#define FLAT_KERNEL_DS FLAT_KERNEL_DS64
102298 +#define FLAT_KERNEL_CS64 FLAT_RING3_CS64
102299 +#define FLAT_KERNEL_CS32 FLAT_RING3_CS32
102300 +#define FLAT_KERNEL_CS FLAT_KERNEL_CS64
102301 +#define FLAT_KERNEL_SS64 FLAT_RING3_SS64
102302 +#define FLAT_KERNEL_SS32 FLAT_RING3_SS32
102303 +#define FLAT_KERNEL_SS FLAT_KERNEL_SS64
102304 +
102305 +#define FLAT_USER_DS64 FLAT_RING3_DS64
102306 +#define FLAT_USER_DS32 FLAT_RING3_DS32
102307 +#define FLAT_USER_DS FLAT_USER_DS64
102308 +#define FLAT_USER_CS64 FLAT_RING3_CS64
102309 +#define FLAT_USER_CS32 FLAT_RING3_CS32
102310 +#define FLAT_USER_CS FLAT_USER_CS64
102311 +#define FLAT_USER_SS64 FLAT_RING3_SS64
102312 +#define FLAT_USER_SS32 FLAT_RING3_SS32
102313 +#define FLAT_USER_SS FLAT_USER_SS64
102314 +
102315 +#define __HYPERVISOR_VIRT_START 0xFFFF800000000000
102316 +#define __HYPERVISOR_VIRT_END 0xFFFF880000000000
102317 +#define __MACH2PHYS_VIRT_START 0xFFFF800000000000
102318 +#define __MACH2PHYS_VIRT_END 0xFFFF804000000000
102319 +
102320 +#ifndef HYPERVISOR_VIRT_START
102321 +#define HYPERVISOR_VIRT_START mk_unsigned_long(__HYPERVISOR_VIRT_START)
102322 +#define HYPERVISOR_VIRT_END mk_unsigned_long(__HYPERVISOR_VIRT_END)
102323 +#endif
102324 +
102325 +#define MACH2PHYS_VIRT_START mk_unsigned_long(__MACH2PHYS_VIRT_START)
102326 +#define MACH2PHYS_VIRT_END mk_unsigned_long(__MACH2PHYS_VIRT_END)
102327 +#define MACH2PHYS_NR_ENTRIES ((MACH2PHYS_VIRT_END-MACH2PHYS_VIRT_START)>>3)
102328 +#ifndef machine_to_phys_mapping
102329 +#define machine_to_phys_mapping ((unsigned long *)HYPERVISOR_VIRT_START)
102330 +#endif
102331 +
102332 +#ifndef __ASSEMBLY__
102333 +
102334 +/*
102335 + * int HYPERVISOR_set_segment_base(unsigned int which, unsigned long base)
102336 + * @which == SEGBASE_* ; @base == 64-bit base address
102337 + * Returns 0 on success.
102338 + */
102339 +#define SEGBASE_FS 0
102340 +#define SEGBASE_GS_USER 1
102341 +#define SEGBASE_GS_KERNEL 2
102342 +#define SEGBASE_GS_USER_SEL 3 /* Set user %gs specified in base[15:0] */
102343 +
102344 +/*
102345 + * int HYPERVISOR_iret(void)
102346 + * All arguments are on the kernel stack, in the following format.
102347 + * Never returns if successful. Current kernel context is lost.
102348 + * The saved CS is mapped as follows:
102349 + * RING0 -> RING3 kernel mode.
102350 + * RING1 -> RING3 kernel mode.
102351 + * RING2 -> RING3 kernel mode.
102352 + * RING3 -> RING3 user mode.
102353 + * However RING0 indicates that the guest kernel should return to itself
102354 + * directly with
102355 + * orb $3,1*8(%rsp)
102356 + * iretq
102357 + * If flags contains VGCF_in_syscall:
102358 + * Restore RAX, RIP, RFLAGS, RSP.
102359 + * Discard R11, RCX, CS, SS.
102360 + * Otherwise:
102361 + * Restore RAX, R11, RCX, CS:RIP, RFLAGS, SS:RSP.
102362 + * All other registers are saved on hypercall entry and restored to user.
102363 + */
102364 +/* Guest exited in SYSCALL context? Return to guest with SYSRET? */
102365 +#define _VGCF_in_syscall 8
102366 +#define VGCF_in_syscall (1<<_VGCF_in_syscall)
102367 +#define VGCF_IN_SYSCALL VGCF_in_syscall
102368 +struct iret_context {
102369 + /* Top of stack (%rsp at point of hypercall). */
102370 + uint64_t rax, r11, rcx, flags, rip, cs, rflags, rsp, ss;
102371 + /* Bottom of iret stack frame. */
102372 +};
102373 +
102374 +#ifdef __GNUC__
102375 +/* Anonymous union includes both 32- and 64-bit names (e.g., eax/rax). */
102376 +#define __DECL_REG(name) union { uint64_t r ## name, e ## name; }
102377 +#else
102378 +/* Non-gcc sources must always use the proper 64-bit name (e.g., rax). */
102379 +#define __DECL_REG(name) uint64_t r ## name
102380 +#endif
102381 +
102382 +struct cpu_user_regs {
102383 + uint64_t r15;
102384 + uint64_t r14;
102385 + uint64_t r13;
102386 + uint64_t r12;
102387 + __DECL_REG(bp);
102388 + __DECL_REG(bx);
102389 + uint64_t r11;
102390 + uint64_t r10;
102391 + uint64_t r9;
102392 + uint64_t r8;
102393 + __DECL_REG(ax);
102394 + __DECL_REG(cx);
102395 + __DECL_REG(dx);
102396 + __DECL_REG(si);
102397 + __DECL_REG(di);
102398 + uint32_t error_code; /* private */
102399 + uint32_t entry_vector; /* private */
102400 + __DECL_REG(ip);
102401 + uint16_t cs, _pad0[1];
102402 + uint8_t saved_upcall_mask;
102403 + uint8_t _pad1[3];
102404 + __DECL_REG(flags); /* rflags.IF == !saved_upcall_mask */
102405 + __DECL_REG(sp);
102406 + uint16_t ss, _pad2[3];
102407 + uint16_t es, _pad3[3];
102408 + uint16_t ds, _pad4[3];
102409 + uint16_t fs, _pad5[3]; /* Non-zero => takes precedence over fs_base. */
102410 + uint16_t gs, _pad6[3]; /* Non-zero => takes precedence over gs_base_usr. */
102411 +};
102412 +typedef struct cpu_user_regs cpu_user_regs_t;
102413 +DEFINE_XEN_GUEST_HANDLE(cpu_user_regs_t);
102414 +
102415 +#undef __DECL_REG
102416 +
102417 +#define xen_pfn_to_cr3(pfn) ((unsigned long)(pfn) << 12)
102418 +#define xen_cr3_to_pfn(cr3) ((unsigned long)(cr3) >> 12)
102419 +
102420 +struct arch_vcpu_info {
102421 + unsigned long cr2;
102422 + unsigned long pad; /* sizeof(vcpu_info_t) == 64 */
102423 +};
102424 +typedef struct arch_vcpu_info arch_vcpu_info_t;
102425 +
102426 +typedef unsigned long xen_callback_t;
102427 +
102428 +#endif /* !__ASSEMBLY__ */
102429 +
102430 +#endif /* __XEN_PUBLIC_ARCH_X86_XEN_X86_64_H__ */
102431 +
102432 +/*
102433 + * Local variables:
102434 + * mode: C
102435 + * c-set-style: "BSD"
102436 + * c-basic-offset: 4
102437 + * tab-width: 4
102438 + * indent-tabs-mode: nil
102439 + * End:
102440 + */
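
/*
 * Editorial sketch, not part of the patch: what the __DECL_REG() macro in
 * xen-x86_64.h above provides for gcc builds -- each register slot in
 * cpu_user_regs can be named either r<reg> or e<reg>, aliasing the same
 * 64-bit storage.  The struct and values here are a reduced stand-in.
 */
#include <stdint.h>
#include <stdio.h>

#define __DECL_REG(name) union { uint64_t r ## name, e ## name; }

struct regs_demo {
    __DECL_REG(ax);
    __DECL_REG(ip);
};

int main(void)
{
    struct regs_demo r = { 0 };

    r.rax = 0x1122334455667788ULL;          /* write through the 64-bit name */
    /* The e-named alias refers to the same 64-bit slot (unlike hardware %eax). */
    printf("rax=%#llx eax alias=%#llx rip=%#llx\n",
           (unsigned long long)r.rax,
           (unsigned long long)r.eax,
           (unsigned long long)r.rip);
    return 0;
}
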
102441 diff -Nur linux-2.6.16.33-noxen/include/xen/interface/arch-x86/xen.h linux-2.6.16.33/include/xen/interface/arch-x86/xen.h
102442 --- linux-2.6.16.33-noxen/include/xen/interface/arch-x86/xen.h 1970-01-01 00:00:00.000000000 +0000
102443 +++ linux-2.6.16.33/include/xen/interface/arch-x86/xen.h 2007-01-08 15:00:55.000000000 +0000
102444 @@ -0,0 +1,190 @@
102445 +/******************************************************************************
102446 + * arch-x86/xen.h
102447 + *
102448 + * Guest OS interface to x86 Xen.
102449 + *
102450 + * Permission is hereby granted, free of charge, to any person obtaining a copy
102451 + * of this software and associated documentation files (the "Software"), to
102452 + * deal in the Software without restriction, including without limitation the
102453 + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
102454 + * sell copies of the Software, and to permit persons to whom the Software is
102455 + * furnished to do so, subject to the following conditions:
102456 + *
102457 + * The above copyright notice and this permission notice shall be included in
102458 + * all copies or substantial portions of the Software.
102459 + *
102460 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
102461 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
102462 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
102463 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
102464 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
102465 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
102466 + * DEALINGS IN THE SOFTWARE.
102467 + *
102468 + * Copyright (c) 2004-2006, K A Fraser
102469 + */
102470 +
102471 +#ifndef __XEN_PUBLIC_ARCH_X86_XEN_H__
102472 +#define __XEN_PUBLIC_ARCH_X86_XEN_H__
102473 +
102474 +/* Structural guest handles introduced in 0x00030201. */
102475 +#if __XEN_INTERFACE_VERSION__ >= 0x00030201
102476 +#define __DEFINE_XEN_GUEST_HANDLE(name, type) \
102477 + typedef struct { type *p; } __guest_handle_ ## name
102478 +#else
102479 +#define __DEFINE_XEN_GUEST_HANDLE(name, type) \
102480 + typedef type * __guest_handle_ ## name
102481 +#endif
102482 +
102483 +#define DEFINE_XEN_GUEST_HANDLE(name) __DEFINE_XEN_GUEST_HANDLE(name, name)
102484 +#define XEN_GUEST_HANDLE(name) __guest_handle_ ## name
102485 +#define set_xen_guest_handle(hnd, val) do { (hnd).p = val; } while (0)
102486 +#ifdef __XEN_TOOLS__
102487 +#define get_xen_guest_handle(val, hnd) do { val = (hnd).p; } while (0)
102488 +#endif
102489 +
102490 +#ifndef __ASSEMBLY__
102491 +/* Guest handles for primitive C types. */
102492 +__DEFINE_XEN_GUEST_HANDLE(uchar, unsigned char);
102493 +__DEFINE_XEN_GUEST_HANDLE(uint, unsigned int);
102494 +__DEFINE_XEN_GUEST_HANDLE(ulong, unsigned long);
102495 +DEFINE_XEN_GUEST_HANDLE(char);
102496 +DEFINE_XEN_GUEST_HANDLE(int);
102497 +DEFINE_XEN_GUEST_HANDLE(long);
102498 +DEFINE_XEN_GUEST_HANDLE(void);
102499 +
102500 +typedef unsigned long xen_pfn_t;
102501 +DEFINE_XEN_GUEST_HANDLE(xen_pfn_t);
102502 +#endif
102503 +
102504 +#if defined(__i386__)
102505 +#include "xen-x86_32.h"
102506 +#elif defined(__x86_64__)
102507 +#include "xen-x86_64.h"
102508 +#endif
102509 +
102510 +/*
102511 + * SEGMENT DESCRIPTOR TABLES
102512 + */
102513 +/*
102514 + * A number of GDT entries are reserved by Xen. These are not situated at the
102515 + * start of the GDT because some stupid OSes export hard-coded selector values
102516 + * in their ABI. These hard-coded values are always near the start of the GDT,
102517 + * so Xen places itself out of the way, at the far end of the GDT.
102518 + */
102519 +#define FIRST_RESERVED_GDT_PAGE 14
102520 +#define FIRST_RESERVED_GDT_BYTE (FIRST_RESERVED_GDT_PAGE * 4096)
102521 +#define FIRST_RESERVED_GDT_ENTRY (FIRST_RESERVED_GDT_BYTE / 8)
102522 +
102523 +/* Maximum number of virtual CPUs in multi-processor guests. */
102524 +#define MAX_VIRT_CPUS 32
102525 +
102526 +#ifndef __ASSEMBLY__
102527 +
102528 +typedef unsigned long xen_ulong_t;
102529 +
102530 +/*
102531 + * Send an array of these to HYPERVISOR_set_trap_table().
102532 + * The privilege level specifies which modes may enter a trap via a software
102533 + * interrupt. On x86/64, since rings 1 and 2 are unavailable, we allocate
102534 + * privilege levels as follows:
102535 + *  Level == 0: No one may enter
102536 + * Level == 1: Kernel may enter
102537 + * Level == 2: Kernel may enter
102538 + * Level == 3: Everyone may enter
102539 + */
102540 +#define TI_GET_DPL(_ti) ((_ti)->flags & 3)
102541 +#define TI_GET_IF(_ti) ((_ti)->flags & 4)
102542 +#define TI_SET_DPL(_ti,_dpl) ((_ti)->flags |= (_dpl))
102543 +#define TI_SET_IF(_ti,_if) ((_ti)->flags |= ((!!(_if))<<2))
102544 +struct trap_info {
102545 + uint8_t vector; /* exception vector */
102546 + uint8_t flags; /* 0-3: privilege level; 4: clear event enable? */
102547 + uint16_t cs; /* code selector */
102548 + unsigned long address; /* code offset */
102549 +};
102550 +typedef struct trap_info trap_info_t;
102551 +DEFINE_XEN_GUEST_HANDLE(trap_info_t);
102552 +
102553 +typedef uint64_t tsc_timestamp_t; /* RDTSC timestamp */
102554 +
102555 +/*
102556 + * The following is all CPU context. Note that the fpu_ctxt block is filled
102557 + * in by FXSAVE if the CPU has feature FXSR; otherwise FSAVE is used.
102558 + */
102559 +struct vcpu_guest_context {
102560 + /* FPU registers come first so they can be aligned for FXSAVE/FXRSTOR. */
102561 + struct { char x[512]; } fpu_ctxt; /* User-level FPU registers */
102562 +#define VGCF_I387_VALID (1<<0)
102563 +#define VGCF_IN_KERNEL (1<<2)
102564 +#define _VGCF_i387_valid 0
102565 +#define VGCF_i387_valid (1<<_VGCF_i387_valid)
102566 +#define _VGCF_in_kernel 2
102567 +#define VGCF_in_kernel (1<<_VGCF_in_kernel)
102568 +#define _VGCF_failsafe_disables_events 3
102569 +#define VGCF_failsafe_disables_events (1<<_VGCF_failsafe_disables_events)
102570 +#define _VGCF_syscall_disables_events 4
102571 +#define VGCF_syscall_disables_events (1<<_VGCF_syscall_disables_events)
102572 + unsigned long flags; /* VGCF_* flags */
102573 + struct cpu_user_regs user_regs; /* User-level CPU registers */
102574 + struct trap_info trap_ctxt[256]; /* Virtual IDT */
102575 + unsigned long ldt_base, ldt_ents; /* LDT (linear address, # ents) */
102576 + unsigned long gdt_frames[16], gdt_ents; /* GDT (machine frames, # ents) */
102577 + unsigned long kernel_ss, kernel_sp; /* Virtual TSS (only SS1/SP1) */
102578 + unsigned long ctrlreg[8]; /* CR0-CR7 (control registers) */
102579 + unsigned long debugreg[8]; /* DB0-DB7 (debug registers) */
102580 +#ifdef __i386__
102581 + unsigned long event_callback_cs; /* CS:EIP of event callback */
102582 + unsigned long event_callback_eip;
102583 + unsigned long failsafe_callback_cs; /* CS:EIP of failsafe callback */
102584 + unsigned long failsafe_callback_eip;
102585 +#else
102586 + unsigned long event_callback_eip;
102587 + unsigned long failsafe_callback_eip;
102588 + unsigned long syscall_callback_eip;
102589 +#endif
102590 + unsigned long vm_assist; /* VMASST_TYPE_* bitmap */
102591 +#ifdef __x86_64__
102592 + /* Segment base addresses. */
102593 + uint64_t fs_base;
102594 + uint64_t gs_base_kernel;
102595 + uint64_t gs_base_user;
102596 +#endif
102597 +};
102598 +typedef struct vcpu_guest_context vcpu_guest_context_t;
102599 +DEFINE_XEN_GUEST_HANDLE(vcpu_guest_context_t);
102600 +
102601 +struct arch_shared_info {
102602 + unsigned long max_pfn; /* max pfn that appears in table */
102603 + /* Frame containing list of mfns containing list of mfns containing p2m. */
102604 + xen_pfn_t pfn_to_mfn_frame_list_list;
102605 + unsigned long nmi_reason;
102606 + uint64_t pad[32];
102607 +};
102608 +typedef struct arch_shared_info arch_shared_info_t;
102609 +
102610 +#endif /* !__ASSEMBLY__ */
102611 +
102612 +/*
102613 + * Prefix forces emulation of some non-trapping instructions.
102614 + * Currently only CPUID.
102615 + */
102616 +#ifdef __ASSEMBLY__
102617 +#define XEN_EMULATE_PREFIX .byte 0x0f,0x0b,0x78,0x65,0x6e ;
102618 +#define XEN_CPUID XEN_EMULATE_PREFIX cpuid
102619 +#else
102620 +#define XEN_EMULATE_PREFIX ".byte 0x0f,0x0b,0x78,0x65,0x6e ; "
102621 +#define XEN_CPUID XEN_EMULATE_PREFIX "cpuid"
102622 +#endif
102623 +
102624 +#endif /* __XEN_PUBLIC_ARCH_X86_XEN_H__ */
102625 +
102626 +/*
102627 + * Local variables:
102628 + * mode: C
102629 + * c-set-style: "BSD"
102630 + * c-basic-offset: 4
102631 + * tab-width: 4
102632 + * indent-tabs-mode: nil
102633 + * End:
102634 + */
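
/*
 * Editorial sketch, not part of the patch: filling one trap_info entry with
 * the TI_* accessors from arch-x86/xen.h above, as a guest would do before
 * handing an array of them to HYPERVISOR_set_trap_table().  The vector,
 * selector and handler address are hypothetical; the point is the flag
 * packing (bits 0-1 carry the DPL, bit 2 is the IF bit).
 */
#include <stdint.h>
#include <stdio.h>

struct trap_info {
    uint8_t       vector;   /* exception vector */
    uint8_t       flags;    /* 0-3: privilege level; 4: clear event enable? */
    uint16_t      cs;       /* code selector */
    unsigned long address;  /* code offset */
};

#define TI_GET_DPL(_ti)      ((_ti)->flags & 3)
#define TI_GET_IF(_ti)       ((_ti)->flags & 4)
#define TI_SET_DPL(_ti,_dpl) ((_ti)->flags |= (_dpl))
#define TI_SET_IF(_ti,_if)   ((_ti)->flags |= ((!!(_if))<<2))

int main(void)
{
    struct trap_info ti = { .vector = 0x80, .cs = 0xe019, .address = 0xc0100000UL };

    TI_SET_DPL(&ti, 3);     /* ring 3 may raise int $0x80 */
    TI_SET_IF(&ti, 1);      /* set flag bit 2 (see the "clear event enable?" note) */

    printf("vector=%u dpl=%u if=%u\n",
           (unsigned)ti.vector, (unsigned)TI_GET_DPL(&ti), TI_GET_IF(&ti) ? 1u : 0u);
    return 0;
}
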
102635 diff -Nur linux-2.6.16.33-noxen/include/xen/interface/arch-x86_32.h linux-2.6.16.33/include/xen/interface/arch-x86_32.h
102636 --- linux-2.6.16.33-noxen/include/xen/interface/arch-x86_32.h 1970-01-01 00:00:00.000000000 +0000
102637 +++ linux-2.6.16.33/include/xen/interface/arch-x86_32.h 2007-01-08 15:00:55.000000000 +0000
102638 @@ -0,0 +1,27 @@
102639 +/******************************************************************************
102640 + * arch-x86_32.h
102641 + *
102642 + * Guest OS interface to x86 32-bit Xen.
102643 + *
102644 + * Permission is hereby granted, free of charge, to any person obtaining a copy
102645 + * of this software and associated documentation files (the "Software"), to
102646 + * deal in the Software without restriction, including without limitation the
102647 + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
102648 + * sell copies of the Software, and to permit persons to whom the Software is
102649 + * furnished to do so, subject to the following conditions:
102650 + *
102651 + * The above copyright notice and this permission notice shall be included in
102652 + * all copies or substantial portions of the Software.
102653 + *
102654 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
102655 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
102656 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
102657 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
102658 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
102659 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
102660 + * DEALINGS IN THE SOFTWARE.
102661 + *
102662 + * Copyright (c) 2004-2006, K A Fraser
102663 + */
102664 +
102665 +#include "arch-x86/xen.h"
102666 diff -Nur linux-2.6.16.33-noxen/include/xen/interface/arch-x86_64.h linux-2.6.16.33/include/xen/interface/arch-x86_64.h
102667 --- linux-2.6.16.33-noxen/include/xen/interface/arch-x86_64.h 1970-01-01 00:00:00.000000000 +0000
102668 +++ linux-2.6.16.33/include/xen/interface/arch-x86_64.h 2007-01-08 15:00:55.000000000 +0000
102669 @@ -0,0 +1,27 @@
102670 +/******************************************************************************
102671 + * arch-x86_64.h
102672 + *
102673 + * Guest OS interface to x86 64-bit Xen.
102674 + *
102675 + * Permission is hereby granted, free of charge, to any person obtaining a copy
102676 + * of this software and associated documentation files (the "Software"), to
102677 + * deal in the Software without restriction, including without limitation the
102678 + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
102679 + * sell copies of the Software, and to permit persons to whom the Software is
102680 + * furnished to do so, subject to the following conditions:
102681 + *
102682 + * The above copyright notice and this permission notice shall be included in
102683 + * all copies or substantial portions of the Software.
102684 + *
102685 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
102686 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
102687 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
102688 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
102689 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
102690 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
102691 + * DEALINGS IN THE SOFTWARE.
102692 + *
102693 + * Copyright (c) 2004-2006, K A Fraser
102694 + */
102695 +
102696 +#include "arch-x86/xen.h"
102697 diff -Nur linux-2.6.16.33-noxen/include/xen/interface/callback.h linux-2.6.16.33/include/xen/interface/callback.h
102698 --- linux-2.6.16.33-noxen/include/xen/interface/callback.h 1970-01-01 00:00:00.000000000 +0000
102699 +++ linux-2.6.16.33/include/xen/interface/callback.h 2007-01-08 15:00:55.000000000 +0000
102700 @@ -0,0 +1,92 @@
102701 +/******************************************************************************
102702 + * callback.h
102703 + *
102704 + * Register guest OS callbacks with Xen.
102705 + *
102706 + * Permission is hereby granted, free of charge, to any person obtaining a copy
102707 + * of this software and associated documentation files (the "Software"), to
102708 + * deal in the Software without restriction, including without limitation the
102709 + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
102710 + * sell copies of the Software, and to permit persons to whom the Software is
102711 + * furnished to do so, subject to the following conditions:
102712 + *
102713 + * The above copyright notice and this permission notice shall be included in
102714 + * all copies or substantial portions of the Software.
102715 + *
102716 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
102717 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
102718 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
102719 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
102720 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
102721 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
102722 + * DEALINGS IN THE SOFTWARE.
102723 + *
102724 + * Copyright (c) 2006, Ian Campbell
102725 + */
102726 +
102727 +#ifndef __XEN_PUBLIC_CALLBACK_H__
102728 +#define __XEN_PUBLIC_CALLBACK_H__
102729 +
102730 +#include "xen.h"
102731 +
102732 +/*
102733 + * Prototype for this hypercall is:
102734 + * long callback_op(int cmd, void *extra_args)
102735 + * @cmd == CALLBACKOP_??? (callback operation).
102736 + * @extra_args == Operation-specific extra arguments (NULL if none).
102737 + */
102738 +
102739 +#define CALLBACKTYPE_event 0
102740 +#define CALLBACKTYPE_failsafe 1
102741 +#define CALLBACKTYPE_syscall 2 /* x86_64 only */
102742 +/*
102743 + * sysenter is only available on x86_32 with the
102744 + * supervisor_mode_kernel option enabled.
102745 + */
102746 +#define CALLBACKTYPE_sysenter 3
102747 +#define CALLBACKTYPE_nmi 4
102748 +
102749 +/*
102750 + * Disable event delivery during callback? This flag is ignored for event and
102751 + * NMI callbacks: event delivery is unconditionally disabled.
102752 + */
102753 +#define _CALLBACKF_mask_events 0
102754 +#define CALLBACKF_mask_events (1U << _CALLBACKF_mask_events)
102755 +
102756 +/*
102757 + * Register a callback.
102758 + */
102759 +#define CALLBACKOP_register 0
102760 +struct callback_register {
102761 + uint16_t type;
102762 + uint16_t flags;
102763 + xen_callback_t address;
102764 +};
102765 +typedef struct callback_register callback_register_t;
102766 +DEFINE_XEN_GUEST_HANDLE(callback_register_t);
102767 +
102768 +/*
102769 + * Unregister a callback.
102770 + *
102771 + * Not all callbacks can be unregistered. -EINVAL will be returned if
102772 + * you attempt to unregister such a callback.
102773 + */
102774 +#define CALLBACKOP_unregister 1
102775 +struct callback_unregister {
102776 + uint16_t type;
102777 + uint16_t _unused;
102778 +};
102779 +typedef struct callback_unregister callback_unregister_t;
102780 +DEFINE_XEN_GUEST_HANDLE(callback_unregister_t);
102781 +
102782 +#endif /* __XEN_PUBLIC_CALLBACK_H__ */
102783 +
102784 +/*
102785 + * Local variables:
102786 + * mode: C
102787 + * c-set-style: "BSD"
102788 + * c-basic-offset: 4
102789 + * tab-width: 4
102790 + * indent-tabs-mode: nil
102791 + * End:
102792 + */
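
/*
 * Editorial sketch, not part of the patch: the shape of a CALLBACKOP_register
 * request built from the definitions in callback.h above.  The hypercall is
 * replaced by a printing stub and the handler is a dummy; the real wrapper
 * name and calling convention are outside this excerpt.
 */
#include <stdint.h>
#include <stdio.h>

typedef unsigned long xen_callback_t;       /* x86_64 flavour */

struct callback_register {
    uint16_t       type;
    uint16_t       flags;
    xen_callback_t address;
};

#define CALLBACKOP_register  0
#define CALLBACKTYPE_event   0

/* Stand-in for the real callback_op hypercall: just show what would be sent. */
static long demo_callback_op(int cmd, struct callback_register *cb)
{
    printf("callback_op cmd=%d type=%u flags=%u address=%#lx\n",
           cmd, (unsigned)cb->type, (unsigned)cb->flags, cb->address);
    return 0;
}

static void demo_event_upcall(void) { }     /* hypothetical upcall handler */

int main(void)
{
    struct callback_register cb = {
        .type    = CALLBACKTYPE_event,
        .flags   = 0,    /* mask_events is ignored for event callbacks anyway */
        .address = (xen_callback_t)demo_event_upcall,
    };
    return (int)demo_callback_op(CALLBACKOP_register, &cb);
}
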
102793 diff -Nur linux-2.6.16.33-noxen/include/xen/interface/dom0_ops.h linux-2.6.16.33/include/xen/interface/dom0_ops.h
102794 --- linux-2.6.16.33-noxen/include/xen/interface/dom0_ops.h 1970-01-01 00:00:00.000000000 +0000
102795 +++ linux-2.6.16.33/include/xen/interface/dom0_ops.h 2007-01-08 15:00:55.000000000 +0000
102796 @@ -0,0 +1,120 @@
102797 +/******************************************************************************
102798 + * dom0_ops.h
102799 + *
102800 + * Process command requests from domain-0 guest OS.
102801 + *
102802 + * Permission is hereby granted, free of charge, to any person obtaining a copy
102803 + * of this software and associated documentation files (the "Software"), to
102804 + * deal in the Software without restriction, including without limitation the
102805 + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
102806 + * sell copies of the Software, and to permit persons to whom the Software is
102807 + * furnished to do so, subject to the following conditions:
102808 + *
102809 + * The above copyright notice and this permission notice shall be included in
102810 + * all copies or substantial portions of the Software.
102811 + *
102812 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
102813 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
102814 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
102815 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
102816 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
102817 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
102818 + * DEALINGS IN THE SOFTWARE.
102819 + *
102820 + * Copyright (c) 2002-2003, B Dragovic
102821 + * Copyright (c) 2002-2006, K Fraser
102822 + */
102823 +
102824 +#ifndef __XEN_PUBLIC_DOM0_OPS_H__
102825 +#define __XEN_PUBLIC_DOM0_OPS_H__
102826 +
102827 +#include "xen.h"
102828 +#include "platform.h"
102829 +
102830 +#if __XEN_INTERFACE_VERSION__ >= 0x00030204
102831 +#error "dom0_ops.h is a compatibility interface only"
102832 +#endif
102833 +
102834 +#define DOM0_INTERFACE_VERSION XENPF_INTERFACE_VERSION
102835 +
102836 +#define DOM0_SETTIME XENPF_settime
102837 +#define dom0_settime xenpf_settime
102838 +#define dom0_settime_t xenpf_settime_t
102839 +
102840 +#define DOM0_ADD_MEMTYPE XENPF_add_memtype
102841 +#define dom0_add_memtype xenpf_add_memtype
102842 +#define dom0_add_memtype_t xenpf_add_memtype_t
102843 +
102844 +#define DOM0_DEL_MEMTYPE XENPF_del_memtype
102845 +#define dom0_del_memtype xenpf_del_memtype
102846 +#define dom0_del_memtype_t xenpf_del_memtype_t
102847 +
102848 +#define DOM0_READ_MEMTYPE XENPF_read_memtype
102849 +#define dom0_read_memtype xenpf_read_memtype
102850 +#define dom0_read_memtype_t xenpf_read_memtype_t
102851 +
102852 +#define DOM0_MICROCODE XENPF_microcode_update
102853 +#define dom0_microcode xenpf_microcode_update
102854 +#define dom0_microcode_t xenpf_microcode_update_t
102855 +
102856 +#define DOM0_PLATFORM_QUIRK XENPF_platform_quirk
102857 +#define dom0_platform_quirk xenpf_platform_quirk
102858 +#define dom0_platform_quirk_t xenpf_platform_quirk_t
102859 +
102860 +typedef uint64_t cpumap_t;
102861 +
102862 +/* Unsupported legacy operation -- defined for API compatibility. */
102863 +#define DOM0_MSR 15
102864 +struct dom0_msr {
102865 + /* IN variables. */
102866 + uint32_t write;
102867 + cpumap_t cpu_mask;
102868 + uint32_t msr;
102869 + uint32_t in1;
102870 + uint32_t in2;
102871 + /* OUT variables. */
102872 + uint32_t out1;
102873 + uint32_t out2;
102874 +};
102875 +typedef struct dom0_msr dom0_msr_t;
102876 +DEFINE_XEN_GUEST_HANDLE(dom0_msr_t);
102877 +
102878 +/* Unsupported legacy operation -- defined for API compatibility. */
102879 +#define DOM0_PHYSICAL_MEMORY_MAP 40
102880 +struct dom0_memory_map_entry {
102881 + uint64_t start, end;
102882 + uint32_t flags; /* reserved */
102883 + uint8_t is_ram;
102884 +};
102885 +typedef struct dom0_memory_map_entry dom0_memory_map_entry_t;
102886 +DEFINE_XEN_GUEST_HANDLE(dom0_memory_map_entry_t);
102887 +
102888 +struct dom0_op {
102889 + uint32_t cmd;
102890 + uint32_t interface_version; /* DOM0_INTERFACE_VERSION */
102891 + union {
102892 + struct dom0_msr msr;
102893 + struct dom0_settime settime;
102894 + struct dom0_add_memtype add_memtype;
102895 + struct dom0_del_memtype del_memtype;
102896 + struct dom0_read_memtype read_memtype;
102897 + struct dom0_microcode microcode;
102898 + struct dom0_platform_quirk platform_quirk;
102899 + struct dom0_memory_map_entry physical_memory_map;
102900 + uint8_t pad[128];
102901 + } u;
102902 +};
102903 +typedef struct dom0_op dom0_op_t;
102904 +DEFINE_XEN_GUEST_HANDLE(dom0_op_t);
102905 +
102906 +#endif /* __XEN_PUBLIC_DOM0_OPS_H__ */
102907 +
102908 +/*
102909 + * Local variables:
102910 + * mode: C
102911 + * c-set-style: "BSD"
102912 + * c-basic-offset: 4
102913 + * tab-width: 4
102914 + * indent-tabs-mode: nil
102915 + * End:
102916 + */
102917 diff -Nur linux-2.6.16.33-noxen/include/xen/interface/domctl.h linux-2.6.16.33/include/xen/interface/domctl.h
102918 --- linux-2.6.16.33-noxen/include/xen/interface/domctl.h 1970-01-01 00:00:00.000000000 +0000
102919 +++ linux-2.6.16.33/include/xen/interface/domctl.h 2007-01-08 15:00:55.000000000 +0000
102920 @@ -0,0 +1,437 @@
102921 +/******************************************************************************
102922 + * domctl.h
102923 + *
102924 + * Domain management operations. For use by node control stack.
102925 + *
102926 + * Permission is hereby granted, free of charge, to any person obtaining a copy
102927 + * of this software and associated documentation files (the "Software"), to
102928 + * deal in the Software without restriction, including without limitation the
102929 + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
102930 + * sell copies of the Software, and to permit persons to whom the Software is
102931 + * furnished to do so, subject to the following conditions:
102932 + *
102933 + * The above copyright notice and this permission notice shall be included in
102934 + * all copies or substantial portions of the Software.
102935 + *
102936 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
102937 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
102938 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
102939 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
102940 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
102941 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
102942 + * DEALINGS IN THE SOFTWARE.
102943 + *
102944 + * Copyright (c) 2002-2003, B Dragovic
102945 + * Copyright (c) 2002-2006, K Fraser
102946 + */
102947 +
102948 +#ifndef __XEN_PUBLIC_DOMCTL_H__
102949 +#define __XEN_PUBLIC_DOMCTL_H__
102950 +
102951 +#if !defined(__XEN__) && !defined(__XEN_TOOLS__)
102952 +#error "domctl operations are intended for use by node control tools only"
102953 +#endif
102954 +
102955 +#include "xen.h"
102956 +
102957 +#define XEN_DOMCTL_INTERFACE_VERSION 0x00000004
102958 +
102959 +struct xenctl_cpumap {
102960 + XEN_GUEST_HANDLE(uint8_t) bitmap;
102961 + uint32_t nr_cpus;
102962 +};
102963 +
102964 +/*
102965 + * NB. xen_domctl.domain is an IN/OUT parameter for this operation.
102966 + * If it is specified as zero, an id is auto-allocated and returned.
102967 + */
102968 +#define XEN_DOMCTL_createdomain 1
102969 +struct xen_domctl_createdomain {
102970 + /* IN parameters */
102971 + uint32_t ssidref;
102972 + xen_domain_handle_t handle;
102973 + /* Is this an HVM guest (as opposed to a PV guest)? */
102974 +#define _XEN_DOMCTL_CDF_hvm_guest 0
102975 +#define XEN_DOMCTL_CDF_hvm_guest (1U<<_XEN_DOMCTL_CDF_hvm_guest)
102976 + uint32_t flags;
102977 +};
102978 +typedef struct xen_domctl_createdomain xen_domctl_createdomain_t;
102979 +DEFINE_XEN_GUEST_HANDLE(xen_domctl_createdomain_t);
102980 +
102981 +#define XEN_DOMCTL_destroydomain 2
102982 +#define XEN_DOMCTL_pausedomain 3
102983 +#define XEN_DOMCTL_unpausedomain 4
102984 +
102985 +#define XEN_DOMCTL_getdomaininfo 5
102986 +struct xen_domctl_getdomaininfo {
102987 + /* OUT variables. */
102988 + domid_t domain; /* Also echoed in domctl.domain */
102989 + /* Domain is scheduled to die. */
102990 +#define _XEN_DOMINF_dying 0
102991 +#define XEN_DOMINF_dying (1U<<_XEN_DOMINF_dying)
102992 + /* Domain is an HVM guest (as opposed to a PV guest). */
102993 +#define _XEN_DOMINF_hvm_guest 1
102994 +#define XEN_DOMINF_hvm_guest (1U<<_XEN_DOMINF_hvm_guest)
102995 + /* The guest OS has shut down. */
102996 +#define _XEN_DOMINF_shutdown 2
102997 +#define XEN_DOMINF_shutdown (1U<<_XEN_DOMINF_shutdown)
102998 + /* Currently paused by control software. */
102999 +#define _XEN_DOMINF_paused 3
103000 +#define XEN_DOMINF_paused (1U<<_XEN_DOMINF_paused)
103001 + /* Currently blocked pending an event. */
103002 +#define _XEN_DOMINF_blocked 4
103003 +#define XEN_DOMINF_blocked (1U<<_XEN_DOMINF_blocked)
103004 + /* Domain is currently running. */
103005 +#define _XEN_DOMINF_running 5
103006 +#define XEN_DOMINF_running (1U<<_XEN_DOMINF_running)
103007 + /* CPU to which this domain is bound. */
103008 +#define XEN_DOMINF_cpumask 255
103009 +#define XEN_DOMINF_cpushift 8
103010 + /* XEN_DOMINF_shutdown guest-supplied code. */
103011 +#define XEN_DOMINF_shutdownmask 255
103012 +#define XEN_DOMINF_shutdownshift 16
103013 + uint32_t flags; /* XEN_DOMINF_* */
103014 + uint64_t tot_pages;
103015 + uint64_t max_pages;
103016 + uint64_t shared_info_frame; /* GMFN of shared_info struct */
103017 + uint64_t cpu_time;
103018 + uint32_t nr_online_vcpus; /* Number of VCPUs currently online. */
103019 + uint32_t max_vcpu_id; /* Maximum VCPUID in use by this domain. */
103020 + uint32_t ssidref;
103021 + xen_domain_handle_t handle;
103022 +};
103023 +typedef struct xen_domctl_getdomaininfo xen_domctl_getdomaininfo_t;
103024 +DEFINE_XEN_GUEST_HANDLE(xen_domctl_getdomaininfo_t);
103025 +
103026 +
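
/*
 * Editorial sketch, not part of the patch: decoding the flags word that
 * XEN_DOMCTL_getdomaininfo returns, using the XEN_DOMINF_* layout defined
 * just above.  The sample flags value is made up.
 */
#include <stdint.h>
#include <stdio.h>

#define XEN_DOMINF_hvm_guest     (1U << 1)
#define XEN_DOMINF_running       (1U << 5)
#define XEN_DOMINF_cpumask       255
#define XEN_DOMINF_cpushift      8
#define XEN_DOMINF_shutdownmask  255
#define XEN_DOMINF_shutdownshift 16

int main(void)
{
    /* hypothetical result: a running HVM guest bound to CPU 2 */
    uint32_t flags = XEN_DOMINF_running | XEN_DOMINF_hvm_guest
                   | (2u << XEN_DOMINF_cpushift);

    printf("hvm=%u running=%u cpu=%u shutdown_code=%u\n",
           (unsigned)!!(flags & XEN_DOMINF_hvm_guest),
           (unsigned)!!(flags & XEN_DOMINF_running),
           (flags >> XEN_DOMINF_cpushift) & XEN_DOMINF_cpumask,
           (flags >> XEN_DOMINF_shutdownshift) & XEN_DOMINF_shutdownmask);
    return 0;
}
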
103027 +#define XEN_DOMCTL_getmemlist 6
103028 +struct xen_domctl_getmemlist {
103029 + /* IN variables. */
103030 + /* Max entries to write to output buffer. */
103031 + uint64_t max_pfns;
103032 + /* Start index in guest's page list. */
103033 + uint64_t start_pfn;
103034 + XEN_GUEST_HANDLE(xen_pfn_t) buffer;
103035 + /* OUT variables. */
103036 + uint64_t num_pfns;
103037 +};
103038 +typedef struct xen_domctl_getmemlist xen_domctl_getmemlist_t;
103039 +DEFINE_XEN_GUEST_HANDLE(xen_domctl_getmemlist_t);
103040 +
103041 +
103042 +#define XEN_DOMCTL_getpageframeinfo 7
103043 +
103044 +#define XEN_DOMCTL_PFINFO_LTAB_SHIFT 28
103045 +#define XEN_DOMCTL_PFINFO_NOTAB (0x0<<28)
103046 +#define XEN_DOMCTL_PFINFO_L1TAB (0x1<<28)
103047 +#define XEN_DOMCTL_PFINFO_L2TAB (0x2<<28)
103048 +#define XEN_DOMCTL_PFINFO_L3TAB (0x3<<28)
103049 +#define XEN_DOMCTL_PFINFO_L4TAB (0x4<<28)
103050 +#define XEN_DOMCTL_PFINFO_LTABTYPE_MASK (0x7<<28)
103051 +#define XEN_DOMCTL_PFINFO_LPINTAB (0x1<<31)
103052 +#define XEN_DOMCTL_PFINFO_XTAB (0xf<<28) /* invalid page */
103053 +#define XEN_DOMCTL_PFINFO_LTAB_MASK (0xf<<28)
103054 +
103055 +struct xen_domctl_getpageframeinfo {
103056 + /* IN variables. */
103057 + uint64_t gmfn; /* GMFN to query */
103058 + /* OUT variables. */
103059 + /* Is the page PINNED to a type? */
103060 + uint32_t type; /* see above type defs */
103061 +};
103062 +typedef struct xen_domctl_getpageframeinfo xen_domctl_getpageframeinfo_t;
103063 +DEFINE_XEN_GUEST_HANDLE(xen_domctl_getpageframeinfo_t);
103064 +
103065 +
103066 +#define XEN_DOMCTL_getpageframeinfo2 8
103067 +struct xen_domctl_getpageframeinfo2 {
103068 + /* IN variables. */
103069 + uint64_t num;
103070 + /* IN/OUT variables. */
103071 + XEN_GUEST_HANDLE(ulong) array;
103072 +};
103073 +typedef struct xen_domctl_getpageframeinfo2 xen_domctl_getpageframeinfo2_t;
103074 +DEFINE_XEN_GUEST_HANDLE(xen_domctl_getpageframeinfo2_t);
103075 +
103076 +
103077 +/*
103078 + * Control shadow pagetables operation
103079 + */
103080 +#define XEN_DOMCTL_shadow_op 10
103081 +
103082 +/* Disable shadow mode. */
103083 +#define XEN_DOMCTL_SHADOW_OP_OFF 0
103084 +
103085 +/* Enable shadow mode (mode contains ORed XEN_DOMCTL_SHADOW_ENABLE_* flags). */
103086 +#define XEN_DOMCTL_SHADOW_OP_ENABLE 32
103087 +
103088 +/* Log-dirty bitmap operations. */
103089 + /* Return the bitmap and clean internal copy for next round. */
103090 +#define XEN_DOMCTL_SHADOW_OP_CLEAN 11
103091 + /* Return the bitmap but do not modify internal copy. */
103092 +#define XEN_DOMCTL_SHADOW_OP_PEEK 12
103093 +
103094 +/* Memory allocation accessors. */
103095 +#define XEN_DOMCTL_SHADOW_OP_GET_ALLOCATION 30
103096 +#define XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION 31
103097 +
103098 +/* Legacy enable operations. */
103099 + /* Equiv. to ENABLE with no mode flags. */
103100 +#define XEN_DOMCTL_SHADOW_OP_ENABLE_TEST 1
103101 + /* Equiv. to ENABLE with mode flag ENABLE_LOG_DIRTY. */
103102 +#define XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY 2
103103 + /* Equiv. to ENABLE with mode flags ENABLE_REFCOUNT and ENABLE_TRANSLATE. */
103104 +#define XEN_DOMCTL_SHADOW_OP_ENABLE_TRANSLATE 3
103105 +
103106 +/* Mode flags for XEN_DOMCTL_SHADOW_OP_ENABLE. */
103107 + /*
103108 + * Shadow pagetables are refcounted: guest does not use explicit mmu
103109 + * operations nor write-protect its pagetables.
103110 + */
103111 +#define XEN_DOMCTL_SHADOW_ENABLE_REFCOUNT (1 << 1)
103112 + /*
103113 + * Log pages in a bitmap as they are dirtied.
103114 + * Used for live relocation to determine which pages must be re-sent.
103115 + */
103116 +#define XEN_DOMCTL_SHADOW_ENABLE_LOG_DIRTY (1 << 2)
103117 + /*
103118 + * Automatically translate GPFNs into MFNs.
103119 + */
103120 +#define XEN_DOMCTL_SHADOW_ENABLE_TRANSLATE (1 << 3)
103121 + /*
103122 + * Xen does not steal virtual address space from the guest.
103123 + * Requires HVM support.
103124 + */
103125 +#define XEN_DOMCTL_SHADOW_ENABLE_EXTERNAL (1 << 4)
103126 +
103127 +struct xen_domctl_shadow_op_stats {
103128 + uint32_t fault_count;
103129 + uint32_t dirty_count;
103130 +};
103131 +typedef struct xen_domctl_shadow_op_stats xen_domctl_shadow_op_stats_t;
103132 +DEFINE_XEN_GUEST_HANDLE(xen_domctl_shadow_op_stats_t);
103133 +
103134 +struct xen_domctl_shadow_op {
103135 + /* IN variables. */
103136 + uint32_t op; /* XEN_DOMCTL_SHADOW_OP_* */
103137 +
103138 + /* OP_ENABLE */
103139 + uint32_t mode; /* XEN_DOMCTL_SHADOW_ENABLE_* */
103140 +
103141 + /* OP_GET_ALLOCATION / OP_SET_ALLOCATION */
103142 + uint32_t mb; /* Shadow memory allocation in MB */
103143 +
103144 + /* OP_PEEK / OP_CLEAN */
103145 + XEN_GUEST_HANDLE(ulong) dirty_bitmap;
103146 + uint64_t pages; /* Size of buffer. Updated with actual size. */
103147 + struct xen_domctl_shadow_op_stats stats;
103148 +};
103149 +typedef struct xen_domctl_shadow_op xen_domctl_shadow_op_t;
103150 +DEFINE_XEN_GUEST_HANDLE(xen_domctl_shadow_op_t);
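Illustrative sketch (not part of the patched tree): the OP_CLEAN/OP_PEEK pair is the log-dirty protocol used by live migration. A minimal payload setup, assuming set_xen_guest_handle() from the arch interface headers and a caller-allocated bitmap of at least nr_pages bits; prepare_logdirty_clean() is a hypothetical helper name.

/* Fetch and reset the log-dirty bitmap for one migration round. */
static inline void prepare_logdirty_clean(struct xen_domctl_shadow_op *sop,
                                          unsigned long *bitmap,
                                          uint64_t nr_pages)
{
    sop->op    = XEN_DOMCTL_SHADOW_OP_CLEAN;  /* return bitmap, reset copy */
    sop->pages = nr_pages;                    /* buffer capacity in pages  */
    set_xen_guest_handle(sop->dirty_bitmap, bitmap);
    /* After the domctl returns, sop->pages holds the size actually written
     * and sop->stats carries the fault/dirty counters for the round. */
}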
103151 +
103152 +
103153 +#define XEN_DOMCTL_max_mem 11
103154 +struct xen_domctl_max_mem {
103155 + /* IN variables. */
103156 + uint64_t max_memkb;
103157 +};
103158 +typedef struct xen_domctl_max_mem xen_domctl_max_mem_t;
103159 +DEFINE_XEN_GUEST_HANDLE(xen_domctl_max_mem_t);
103160 +
103161 +
103162 +#define XEN_DOMCTL_setvcpucontext 12
103163 +#define XEN_DOMCTL_getvcpucontext 13
103164 +struct xen_domctl_vcpucontext {
103165 + uint32_t vcpu; /* IN */
103166 + XEN_GUEST_HANDLE(vcpu_guest_context_t) ctxt; /* IN/OUT */
103167 +};
103168 +typedef struct xen_domctl_vcpucontext xen_domctl_vcpucontext_t;
103169 +DEFINE_XEN_GUEST_HANDLE(xen_domctl_vcpucontext_t);
103170 +
103171 +
103172 +#define XEN_DOMCTL_getvcpuinfo 14
103173 +struct xen_domctl_getvcpuinfo {
103174 + /* IN variables. */
103175 + uint32_t vcpu;
103176 + /* OUT variables. */
103177 + uint8_t online; /* currently online (not hotplugged)? */
103178 + uint8_t blocked; /* blocked waiting for an event? */
103179 + uint8_t running; /* currently scheduled on its CPU? */
103180 + uint64_t cpu_time; /* total cpu time consumed (ns) */
103181 + uint32_t cpu; /* current mapping */
103182 +};
103183 +typedef struct xen_domctl_getvcpuinfo xen_domctl_getvcpuinfo_t;
103184 +DEFINE_XEN_GUEST_HANDLE(xen_domctl_getvcpuinfo_t);
103185 +
103186 +
103187 +/* Get/set which physical cpus a vcpu can execute on. */
103188 +#define XEN_DOMCTL_setvcpuaffinity 9
103189 +#define XEN_DOMCTL_getvcpuaffinity 25
103190 +struct xen_domctl_vcpuaffinity {
103191 + uint32_t vcpu; /* IN */
103192 + struct xenctl_cpumap cpumap; /* IN/OUT */
103193 +};
103194 +typedef struct xen_domctl_vcpuaffinity xen_domctl_vcpuaffinity_t;
103195 +DEFINE_XEN_GUEST_HANDLE(xen_domctl_vcpuaffinity_t);
103196 +
103197 +
103198 +#define XEN_DOMCTL_max_vcpus 15
103199 +struct xen_domctl_max_vcpus {
103200 + uint32_t max; /* maximum number of vcpus */
103201 +};
103202 +typedef struct xen_domctl_max_vcpus xen_domctl_max_vcpus_t;
103203 +DEFINE_XEN_GUEST_HANDLE(xen_domctl_max_vcpus_t);
103204 +
103205 +
103206 +#define XEN_DOMCTL_scheduler_op 16
103207 +/* Scheduler types. */
103208 +#define XEN_SCHEDULER_SEDF 4
103209 +#define XEN_SCHEDULER_CREDIT 5
103210 +/* Set or get info? */
103211 +#define XEN_DOMCTL_SCHEDOP_putinfo 0
103212 +#define XEN_DOMCTL_SCHEDOP_getinfo 1
103213 +struct xen_domctl_scheduler_op {
103214 + uint32_t sched_id; /* XEN_SCHEDULER_* */
103215 + uint32_t cmd; /* XEN_DOMCTL_SCHEDOP_* */
103216 + union {
103217 + struct xen_domctl_sched_sedf {
103218 + uint64_t period;
103219 + uint64_t slice;
103220 + uint64_t latency;
103221 + uint32_t extratime;
103222 + uint32_t weight;
103223 + } sedf;
103224 + struct xen_domctl_sched_credit {
103225 + uint16_t weight;
103226 + uint16_t cap;
103227 + } credit;
103228 + } u;
103229 +};
103230 +typedef struct xen_domctl_scheduler_op xen_domctl_scheduler_op_t;
103231 +DEFINE_XEN_GUEST_HANDLE(xen_domctl_scheduler_op_t);
103232 +
103233 +
103234 +#define XEN_DOMCTL_setdomainhandle 17
103235 +struct xen_domctl_setdomainhandle {
103236 + xen_domain_handle_t handle;
103237 +};
103238 +typedef struct xen_domctl_setdomainhandle xen_domctl_setdomainhandle_t;
103239 +DEFINE_XEN_GUEST_HANDLE(xen_domctl_setdomainhandle_t);
103240 +
103241 +
103242 +#define XEN_DOMCTL_setdebugging 18
103243 +struct xen_domctl_setdebugging {
103244 + uint8_t enable;
103245 +};
103246 +typedef struct xen_domctl_setdebugging xen_domctl_setdebugging_t;
103247 +DEFINE_XEN_GUEST_HANDLE(xen_domctl_setdebugging_t);
103248 +
103249 +
103250 +#define XEN_DOMCTL_irq_permission 19
103251 +struct xen_domctl_irq_permission {
103252 + uint8_t pirq;
103253 + uint8_t allow_access; /* flag to specify enable/disable of IRQ access */
103254 +};
103255 +typedef struct xen_domctl_irq_permission xen_domctl_irq_permission_t;
103256 +DEFINE_XEN_GUEST_HANDLE(xen_domctl_irq_permission_t);
103257 +
103258 +
103259 +#define XEN_DOMCTL_iomem_permission 20
103260 +struct xen_domctl_iomem_permission {
103261 + uint64_t first_mfn; /* first page (physical page number) in range */
103262 + uint64_t nr_mfns; /* number of pages in range (>0) */
103263 + uint8_t allow_access; /* allow (!0) or deny (0) access to range? */
103264 +};
103265 +typedef struct xen_domctl_iomem_permission xen_domctl_iomem_permission_t;
103266 +DEFINE_XEN_GUEST_HANDLE(xen_domctl_iomem_permission_t);
103267 +
103268 +
103269 +#define XEN_DOMCTL_ioport_permission 21
103270 +struct xen_domctl_ioport_permission {
103271 + uint32_t first_port; /* first port in range */
103272 + uint32_t nr_ports; /* size of port range */
103273 + uint8_t allow_access; /* allow or deny access to range? */
103274 +};
103275 +typedef struct xen_domctl_ioport_permission xen_domctl_ioport_permission_t;
103276 +DEFINE_XEN_GUEST_HANDLE(xen_domctl_ioport_permission_t);
103277 +
103278 +#define XEN_DOMCTL_hypercall_init 22
103279 +struct xen_domctl_hypercall_init {
103280 + uint64_t gmfn; /* GMFN to be initialised */
103281 +};
103282 +typedef struct xen_domctl_hypercall_init xen_domctl_hypercall_init_t;
103283 +DEFINE_XEN_GUEST_HANDLE(xen_domctl_hypercall_init_t);
103284 +
103285 +#define XEN_DOMCTL_arch_setup 23
103286 +#define _XEN_DOMAINSETUP_hvm_guest 0
103287 +#define XEN_DOMAINSETUP_hvm_guest (1UL<<_XEN_DOMAINSETUP_hvm_guest)
103288 +#define _XEN_DOMAINSETUP_query 1 /* Get parameters (for save) */
103289 +#define XEN_DOMAINSETUP_query (1UL<<_XEN_DOMAINSETUP_query)
103290 +typedef struct xen_domctl_arch_setup {
103291 + uint64_t flags; /* XEN_DOMAINSETUP_* */
103292 +#ifdef __ia64__
103293 + uint64_t bp; /* mpaddr of boot param area */
103294 + uint64_t maxmem; /* Highest memory address for MDT. */
103295 + uint64_t xsi_va; /* Xen shared_info area virtual address. */
103296 + uint32_t hypercall_imm; /* Break imm for Xen hypercalls. */
103297 +#endif
103298 +} xen_domctl_arch_setup_t;
103299 +DEFINE_XEN_GUEST_HANDLE(xen_domctl_arch_setup_t);
103300 +
103301 +#define XEN_DOMCTL_settimeoffset 24
103302 +struct xen_domctl_settimeoffset {
103303 + int32_t time_offset_seconds; /* applied to domain wallclock time */
103304 +};
103305 +typedef struct xen_domctl_settimeoffset xen_domctl_settimeoffset_t;
103306 +DEFINE_XEN_GUEST_HANDLE(xen_domctl_settimeoffset_t);
103307 +
103308 +#define XEN_DOMCTL_real_mode_area 26
103309 +struct xen_domctl_real_mode_area {
103310 + uint32_t log; /* log2 of Real Mode Area size */
103311 +};
103312 +typedef struct xen_domctl_real_mode_area xen_domctl_real_mode_area_t;
103313 +DEFINE_XEN_GUEST_HANDLE(xen_domctl_real_mode_area_t);
103314 +
103315 +struct xen_domctl {
103316 + uint32_t cmd;
103317 + uint32_t interface_version; /* XEN_DOMCTL_INTERFACE_VERSION */
103318 + domid_t domain;
103319 + union {
103320 + struct xen_domctl_createdomain createdomain;
103321 + struct xen_domctl_getdomaininfo getdomaininfo;
103322 + struct xen_domctl_getmemlist getmemlist;
103323 + struct xen_domctl_getpageframeinfo getpageframeinfo;
103324 + struct xen_domctl_getpageframeinfo2 getpageframeinfo2;
103325 + struct xen_domctl_vcpuaffinity vcpuaffinity;
103326 + struct xen_domctl_shadow_op shadow_op;
103327 + struct xen_domctl_max_mem max_mem;
103328 + struct xen_domctl_vcpucontext vcpucontext;
103329 + struct xen_domctl_getvcpuinfo getvcpuinfo;
103330 + struct xen_domctl_max_vcpus max_vcpus;
103331 + struct xen_domctl_scheduler_op scheduler_op;
103332 + struct xen_domctl_setdomainhandle setdomainhandle;
103333 + struct xen_domctl_setdebugging setdebugging;
103334 + struct xen_domctl_irq_permission irq_permission;
103335 + struct xen_domctl_iomem_permission iomem_permission;
103336 + struct xen_domctl_ioport_permission ioport_permission;
103337 + struct xen_domctl_hypercall_init hypercall_init;
103338 + struct xen_domctl_arch_setup arch_setup;
103339 + struct xen_domctl_settimeoffset settimeoffset;
103340 + struct xen_domctl_real_mode_area real_mode_area;
103341 + uint8_t pad[128];
103342 + } u;
103343 +};
103344 +typedef struct xen_domctl xen_domctl_t;
103345 +DEFINE_XEN_GUEST_HANDLE(xen_domctl_t);
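Every operation above travels through this wrapper: cmd selects the union member, interface_version must be XEN_DOMCTL_INTERFACE_VERSION, and domain names the target. A hedged sketch for a credit-scheduler weight/cap update follows; how the domctl is actually issued from dom0 (e.g. via the privcmd driver) is outside this header, and the helper name is hypothetical.

/* Hypothetical helper: fill a domctl that sets credit weight/cap. */
static inline void prepare_credit_params(struct xen_domctl *op, domid_t dom,
                                         uint16_t weight, uint16_t cap)
{
    memset(op, 0, sizeof(*op));          /* memset() from the string header */
    op->cmd               = XEN_DOMCTL_scheduler_op;
    op->interface_version = XEN_DOMCTL_INTERFACE_VERSION;
    op->domain            = dom;
    op->u.scheduler_op.sched_id        = XEN_SCHEDULER_CREDIT;
    op->u.scheduler_op.cmd             = XEN_DOMCTL_SCHEDOP_putinfo;
    op->u.scheduler_op.u.credit.weight = weight;
    op->u.scheduler_op.u.credit.cap    = cap;
}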
103346 +
103347 +#endif /* __XEN_PUBLIC_DOMCTL_H__ */
103348 +
103349 +/*
103350 + * Local variables:
103351 + * mode: C
103352 + * c-set-style: "BSD"
103353 + * c-basic-offset: 4
103354 + * tab-width: 4
103355 + * indent-tabs-mode: nil
103356 + * End:
103357 + */
103358 diff -Nur linux-2.6.16.33-noxen/include/xen/interface/elfnote.h linux-2.6.16.33/include/xen/interface/elfnote.h
103359 --- linux-2.6.16.33-noxen/include/xen/interface/elfnote.h 1970-01-01 00:00:00.000000000 +0000
103360 +++ linux-2.6.16.33/include/xen/interface/elfnote.h 2007-01-08 15:00:55.000000000 +0000
103361 @@ -0,0 +1,179 @@
103362 +/******************************************************************************
103363 + * elfnote.h
103364 + *
103365 + * Definitions used for the Xen ELF notes.
103366 + *
103367 + * Permission is hereby granted, free of charge, to any person obtaining a copy
103368 + * of this software and associated documentation files (the "Software"), to
103369 + * deal in the Software without restriction, including without limitation the
103370 + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
103371 + * sell copies of the Software, and to permit persons to whom the Software is
103372 + * furnished to do so, subject to the following conditions:
103373 + *
103374 + * The above copyright notice and this permission notice shall be included in
103375 + * all copies or substantial portions of the Software.
103376 + *
103377 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
103378 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
103379 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
103380 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
103381 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
103382 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
103383 + * DEALINGS IN THE SOFTWARE.
103384 + *
103385 + * Copyright (c) 2006, Ian Campbell, XenSource Ltd.
103386 + */
103387 +
103388 +#ifndef __XEN_PUBLIC_ELFNOTE_H__
103389 +#define __XEN_PUBLIC_ELFNOTE_H__
103390 +
103391 +/*
103392 + * The notes should live in a PT_NOTE segment and have "Xen" in the
103393 + * name field.
103394 + *
103395 + * Numeric types are either 4 or 8 bytes depending on the content of
103396 + * the desc field.
103397 + *
103398 + * LEGACY indicates the field(s) in the legacy __xen_guest string which
103399 + * this note type replaces.
103400 + */
103401 +
103402 +/*
103403 + * NAME=VALUE pair (string).
103404 + *
103405 + * LEGACY: FEATURES and PAE
103406 + */
103407 +#define XEN_ELFNOTE_INFO 0
103408 +
103409 +/*
103410 + * The virtual address of the entry point (numeric).
103411 + *
103412 + * LEGACY: VIRT_ENTRY
103413 + */
103414 +#define XEN_ELFNOTE_ENTRY 1
103415 +
103416 +/* The virtual address of the hypercall transfer page (numeric).
103417 + *
103418 + * LEGACY: HYPERCALL_PAGE. (n.b. legacy value is a physical page
103419 + * number not a virtual address)
103420 + */
103421 +#define XEN_ELFNOTE_HYPERCALL_PAGE 2
103422 +
103423 +/* The virtual address where the kernel image should be mapped (numeric).
103424 + *
103425 + * Defaults to 0.
103426 + *
103427 + * LEGACY: VIRT_BASE
103428 + */
103429 +#define XEN_ELFNOTE_VIRT_BASE 3
103430 +
103431 +/*
103432 + * The offset of the ELF paddr field from the actual required
103433 + * pseudo-physical address (numeric).
103434 + *
103435 + * This is used to maintain backwards compatibility with older kernels
103436 + * which wrote __PAGE_OFFSET into that field. This field defaults to 0
103437 + * if not present.
103438 + *
103439 + * LEGACY: ELF_PADDR_OFFSET. (n.b. legacy default is VIRT_BASE)
103440 + */
103441 +#define XEN_ELFNOTE_PADDR_OFFSET 4
103442 +
103443 +/*
103444 + * The version of Xen that we work with (string).
103445 + *
103446 + * LEGACY: XEN_VER
103447 + */
103448 +#define XEN_ELFNOTE_XEN_VERSION 5
103449 +
103450 +/*
103451 + * The name of the guest operating system (string).
103452 + *
103453 + * LEGACY: GUEST_OS
103454 + */
103455 +#define XEN_ELFNOTE_GUEST_OS 6
103456 +
103457 +/*
103458 + * The version of the guest operating system (string).
103459 + *
103460 + * LEGACY: GUEST_VER
103461 + */
103462 +#define XEN_ELFNOTE_GUEST_VERSION 7
103463 +
103464 +/*
103465 + * The loader type (string).
103466 + *
103467 + * LEGACY: LOADER
103468 + */
103469 +#define XEN_ELFNOTE_LOADER 8
103470 +
103471 +/*
103472 + * The kernel supports PAE (x86/32 only, string = "yes" or "no").
103473 + *
103474 + * LEGACY: PAE (n.b. The legacy interface included a provision to
103475 + * indicate 'extended-cr3' support allowing L3 page tables to be
103476 + * placed above 4G. It is assumed that any kernel new enough to use
103477 + * these ELF notes will include this and therefore "yes" here is
103478 + * equivalent to "yes[extended-cr3]" in the __xen_guest interface.)
103479 + */
103480 +#define XEN_ELFNOTE_PAE_MODE 9
103481 +
103482 +/*
103483 + * The features supported/required by this kernel (string).
103484 + *
103485 + * The string must consist of a list of feature names (as given in
103486 + * features.h, without the "XENFEAT_" prefix) separated by '|'
103487 + * characters. If a feature is required for the kernel to function
103488 + * then the feature name must be preceded by a '!' character.
103489 + *
103490 + * LEGACY: FEATURES
103491 + */
103492 +#define XEN_ELFNOTE_FEATURES 10
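As an illustration of the string format (hypothetical values, not taken from the patch): feature names come from features.h without the XENFEAT_ prefix, joined by '|', with '!' marking features the kernel cannot run without.

/* Example FEATURES note payload; a '!' prefix would mark a hard requirement. */
static const char example_xen_features[] =
    "writable_page_tables|writable_descriptor_tables|pae_pgdir_above_4gb";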
103493 +
103494 +/*
103495 + * The kernel requires the symbol table to be loaded (string = "yes" or "no")
103496 + * LEGACY: BSD_SYMTAB (n.b. The legacy interface treated the presence
103497 + * or absence of this string as a boolean flag rather than requiring
103498 + * "yes" or "no".)
103499 + */
103500 +#define XEN_ELFNOTE_BSD_SYMTAB 11
103501 +
103502 +/*
103503 + * The lowest address the hypervisor hole can begin at (numeric).
103504 + *
103505 + * This must not be set higher than HYPERVISOR_VIRT_START. Its presence
103506 + * also indicates to the hypervisor that the kernel can deal with the
103507 + * hole starting at a higher address.
103508 + */
103509 +#define XEN_ELFNOTE_HV_START_LOW 12
103510 +
103511 +/*
103512 + * System information exported through crash notes.
103513 + *
103514 + * The kexec / kdump code will create one XEN_ELFNOTE_CRASH_INFO
103515 + * note in case of a system crash. This note will contain various
103516 + * information about the system, see xen/include/xen/elfcore.h.
103517 + */
103518 +#define XEN_ELFNOTE_CRASH_INFO 0x1000001
103519 +
103520 +/*
103521 + * System registers exported through crash notes.
103522 + *
103523 + * The kexec / kdump code will create one XEN_ELFNOTE_CRASH_REGS
103524 + * note per cpu in case of a system crash. This note is architecture
103525 + * specific and will contain registers not saved in the "CORE" note.
103526 + * See xen/include/xen/elfcore.h for more information.
103527 + */
103528 +#define XEN_ELFNOTE_CRASH_REGS 0x1000002
103529 +
103530 +#endif /* __XEN_PUBLIC_ELFNOTE_H__ */
103531 +
103532 +/*
103533 + * Local variables:
103534 + * mode: C
103535 + * c-set-style: "BSD"
103536 + * c-basic-offset: 4
103537 + * tab-width: 4
103538 + * indent-tabs-mode: nil
103539 + * End:
103540 + */
103541 diff -Nur linux-2.6.16.33-noxen/include/xen/interface/event_channel.h linux-2.6.16.33/include/xen/interface/event_channel.h
103542 --- linux-2.6.16.33-noxen/include/xen/interface/event_channel.h 1970-01-01 00:00:00.000000000 +0000
103543 +++ linux-2.6.16.33/include/xen/interface/event_channel.h 2007-01-08 15:00:55.000000000 +0000
103544 @@ -0,0 +1,251 @@
103545 +/******************************************************************************
103546 + * event_channel.h
103547 + *
103548 + * Event channels between domains.
103549 + *
103550 + * Permission is hereby granted, free of charge, to any person obtaining a copy
103551 + * of this software and associated documentation files (the "Software"), to
103552 + * deal in the Software without restriction, including without limitation the
103553 + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
103554 + * sell copies of the Software, and to permit persons to whom the Software is
103555 + * furnished to do so, subject to the following conditions:
103556 + *
103557 + * The above copyright notice and this permission notice shall be included in
103558 + * all copies or substantial portions of the Software.
103559 + *
103560 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
103561 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
103562 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
103563 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
103564 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
103565 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
103566 + * DEALINGS IN THE SOFTWARE.
103567 + *
103568 + * Copyright (c) 2003-2004, K A Fraser.
103569 + */
103570 +
103571 +#ifndef __XEN_PUBLIC_EVENT_CHANNEL_H__
103572 +#define __XEN_PUBLIC_EVENT_CHANNEL_H__
103573 +
103574 +/*
103575 + * Prototype for this hypercall is:
103576 + * int event_channel_op(int cmd, void *args)
103577 + * @cmd == EVTCHNOP_??? (event-channel operation).
103578 + * @args == Operation-specific extra arguments (NULL if none).
103579 + */
103580 +
103581 +typedef uint32_t evtchn_port_t;
103582 +DEFINE_XEN_GUEST_HANDLE(evtchn_port_t);
103583 +
103584 +/*
103585 + * EVTCHNOP_alloc_unbound: Allocate a port in domain <dom> and mark as
103586 + * accepting interdomain bindings from domain <remote_dom>. A fresh port
103587 + * is allocated in <dom> and returned as <port>.
103588 + * NOTES:
103589 + * 1. If the caller is unprivileged then <dom> must be DOMID_SELF.
103590 + * 2. <remote_dom> may be DOMID_SELF, allowing loopback connections.
103591 + */
103592 +#define EVTCHNOP_alloc_unbound 6
103593 +struct evtchn_alloc_unbound {
103594 + /* IN parameters */
103595 + domid_t dom, remote_dom;
103596 + /* OUT parameters */
103597 + evtchn_port_t port;
103598 +};
103599 +typedef struct evtchn_alloc_unbound evtchn_alloc_unbound_t;
103600 +
103601 +/*
103602 + * EVTCHNOP_bind_interdomain: Construct an interdomain event channel between
103603 + * the calling domain and <remote_dom>. <remote_dom,remote_port> must identify
103604 + * a port that is unbound and marked as accepting bindings from the calling
103605 + * domain. A fresh port is allocated in the calling domain and returned as
103606 + * <local_port>.
103607 + * NOTES:
103608 + * 1. <remote_dom> may be DOMID_SELF, allowing loopback connections.
103609 + */
103610 +#define EVTCHNOP_bind_interdomain 0
103611 +struct evtchn_bind_interdomain {
103612 + /* IN parameters. */
103613 + domid_t remote_dom;
103614 + evtchn_port_t remote_port;
103615 + /* OUT parameters. */
103616 + evtchn_port_t local_port;
103617 +};
103618 +typedef struct evtchn_bind_interdomain evtchn_bind_interdomain_t;
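A minimal loopback sketch of the alloc_unbound/bind_interdomain sequence described above, assuming the arch hypercall wrapper HYPERVISOR_event_channel_op(cmd, arg) and DOMID_SELF from xen.h (the helper name is hypothetical):

static int example_loopback_channel(evtchn_port_t *tx, evtchn_port_t *rx)
{
    struct evtchn_alloc_unbound alloc = {
        .dom        = DOMID_SELF,
        .remote_dom = DOMID_SELF,
    };
    struct evtchn_bind_interdomain bind;
    int rc;

    rc = HYPERVISOR_event_channel_op(EVTCHNOP_alloc_unbound, &alloc);
    if (rc)
        return rc;

    bind.remote_dom  = DOMID_SELF;
    bind.remote_port = alloc.port;
    rc = HYPERVISOR_event_channel_op(EVTCHNOP_bind_interdomain, &bind);
    if (rc)
        return rc;

    *rx = alloc.port;       /* the formerly unbound end                 */
    *tx = bind.local_port;  /* local end; EVTCHNOP_send notifies the rx */
    return 0;
}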
103619 +
103620 +/*
103621 + * EVTCHNOP_bind_virq: Bind a local event channel to VIRQ <irq> on specified
103622 + * vcpu.
103623 + * NOTES:
103624 + * 1. Virtual IRQs are classified as per-vcpu or global. See the VIRQ list
103625 + * in xen.h for the classification of each VIRQ.
103626 + * 2. Global VIRQs must be allocated on VCPU0 but can subsequently be
103627 + * re-bound via EVTCHNOP_bind_vcpu.
103628 + * 3. Per-vcpu VIRQs may be bound to at most one event channel per vcpu.
103629 + * The allocated event channel is bound to the specified vcpu and the
103630 + * binding cannot be changed.
103631 + */
103632 +#define EVTCHNOP_bind_virq 1
103633 +struct evtchn_bind_virq {
103634 + /* IN parameters. */
103635 + uint32_t virq;
103636 + uint32_t vcpu;
103637 + /* OUT parameters. */
103638 + evtchn_port_t port;
103639 +};
103640 +typedef struct evtchn_bind_virq evtchn_bind_virq_t;
103641 +
103642 +/*
103643 + * EVTCHNOP_bind_pirq: Bind a local event channel to PIRQ <irq>.
103644 + * NOTES:
103645 + * 1. A physical IRQ may be bound to at most one event channel per domain.
103646 + * 2. Only a sufficiently-privileged domain may bind to a physical IRQ.
103647 + */
103648 +#define EVTCHNOP_bind_pirq 2
103649 +struct evtchn_bind_pirq {
103650 + /* IN parameters. */
103651 + uint32_t pirq;
103652 +#define BIND_PIRQ__WILL_SHARE 1
103653 + uint32_t flags; /* BIND_PIRQ__* */
103654 + /* OUT parameters. */
103655 + evtchn_port_t port;
103656 +};
103657 +typedef struct evtchn_bind_pirq evtchn_bind_pirq_t;
103658 +
103659 +/*
103660 + * EVTCHNOP_bind_ipi: Bind a local event channel to receive events.
103661 + * NOTES:
103662 + * 1. The allocated event channel is bound to the specified vcpu. The binding
103663 + * may not be changed.
103664 + */
103665 +#define EVTCHNOP_bind_ipi 7
103666 +struct evtchn_bind_ipi {
103667 + uint32_t vcpu;
103668 + /* OUT parameters. */
103669 + evtchn_port_t port;
103670 +};
103671 +typedef struct evtchn_bind_ipi evtchn_bind_ipi_t;
103672 +
103673 +/*
103674 + * EVTCHNOP_close: Close a local event channel <port>. If the channel is
103675 + * interdomain then the remote end is placed in the unbound state
103676 + * (EVTCHNSTAT_unbound), awaiting a new connection.
103677 + */
103678 +#define EVTCHNOP_close 3
103679 +struct evtchn_close {
103680 + /* IN parameters. */
103681 + evtchn_port_t port;
103682 +};
103683 +typedef struct evtchn_close evtchn_close_t;
103684 +
103685 +/*
103686 + * EVTCHNOP_send: Send an event to the remote end of the channel whose local
103687 + * endpoint is <port>.
103688 + */
103689 +#define EVTCHNOP_send 4
103690 +struct evtchn_send {
103691 + /* IN parameters. */
103692 + evtchn_port_t port;
103693 +};
103694 +typedef struct evtchn_send evtchn_send_t;
103695 +
103696 +/*
103697 + * EVTCHNOP_status: Get the current status of the communication channel which
103698 + * has an endpoint at <dom, port>.
103699 + * NOTES:
103700 + * 1. <dom> may be specified as DOMID_SELF.
103701 + * 2. Only a sufficiently-privileged domain may obtain the status of an event
103702 + * channel for which <dom> is not DOMID_SELF.
103703 + */
103704 +#define EVTCHNOP_status 5
103705 +struct evtchn_status {
103706 + /* IN parameters */
103707 + domid_t dom;
103708 + evtchn_port_t port;
103709 + /* OUT parameters */
103710 +#define EVTCHNSTAT_closed 0 /* Channel is not in use. */
103711 +#define EVTCHNSTAT_unbound 1 /* Channel is waiting for interdom connection. */
103712 +#define EVTCHNSTAT_interdomain 2 /* Channel is connected to remote domain. */
103713 +#define EVTCHNSTAT_pirq 3 /* Channel is bound to a phys IRQ line. */
103714 +#define EVTCHNSTAT_virq 4 /* Channel is bound to a virtual IRQ line */
103715 +#define EVTCHNSTAT_ipi 5 /* Channel is bound to a virtual IPI line */
103716 + uint32_t status;
103717 + uint32_t vcpu; /* VCPU to which this channel is bound. */
103718 + union {
103719 + struct {
103720 + domid_t dom;
103721 + } unbound; /* EVTCHNSTAT_unbound */
103722 + struct {
103723 + domid_t dom;
103724 + evtchn_port_t port;
103725 + } interdomain; /* EVTCHNSTAT_interdomain */
103726 + uint32_t pirq; /* EVTCHNSTAT_pirq */
103727 + uint32_t virq; /* EVTCHNSTAT_virq */
103728 + } u;
103729 +};
103730 +typedef struct evtchn_status evtchn_status_t;
103731 +
103732 +/*
103733 + * EVTCHNOP_bind_vcpu: Specify which vcpu a channel should notify when an
103734 + * event is pending.
103735 + * NOTES:
103736 + * 1. IPI-bound channels always notify the vcpu specified at bind time.
103737 + * This binding cannot be changed.
103738 + * 2. Per-VCPU VIRQ channels always notify the vcpu specified at bind time.
103739 + * This binding cannot be changed.
103740 + * 3. All other channels notify vcpu0 by default. This default is set when
103741 + * the channel is allocated (a port that is freed and subsequently reused
103742 + * has its binding reset to vcpu0).
103743 + */
103744 +#define EVTCHNOP_bind_vcpu 8
103745 +struct evtchn_bind_vcpu {
103746 + /* IN parameters. */
103747 + evtchn_port_t port;
103748 + uint32_t vcpu;
103749 +};
103750 +typedef struct evtchn_bind_vcpu evtchn_bind_vcpu_t;
103751 +
103752 +/*
103753 + * EVTCHNOP_unmask: Unmask the specified local event-channel port and deliver
103754 + * a notification to the appropriate VCPU if an event is pending.
103755 + */
103756 +#define EVTCHNOP_unmask 9
103757 +struct evtchn_unmask {
103758 + /* IN parameters. */
103759 + evtchn_port_t port;
103760 +};
103761 +typedef struct evtchn_unmask evtchn_unmask_t;
103762 +
103763 +/*
103764 + * Argument to event_channel_op_compat() hypercall. Superseded by new
103765 + * event_channel_op() hypercall since 0x00030202.
103766 + */
103767 +struct evtchn_op {
103768 + uint32_t cmd; /* EVTCHNOP_* */
103769 + union {
103770 + struct evtchn_alloc_unbound alloc_unbound;
103771 + struct evtchn_bind_interdomain bind_interdomain;
103772 + struct evtchn_bind_virq bind_virq;
103773 + struct evtchn_bind_pirq bind_pirq;
103774 + struct evtchn_bind_ipi bind_ipi;
103775 + struct evtchn_close close;
103776 + struct evtchn_send send;
103777 + struct evtchn_status status;
103778 + struct evtchn_bind_vcpu bind_vcpu;
103779 + struct evtchn_unmask unmask;
103780 + } u;
103781 +};
103782 +typedef struct evtchn_op evtchn_op_t;
103783 +DEFINE_XEN_GUEST_HANDLE(evtchn_op_t);
103784 +
103785 +#endif /* __XEN_PUBLIC_EVENT_CHANNEL_H__ */
103786 +
103787 +/*
103788 + * Local variables:
103789 + * mode: C
103790 + * c-set-style: "BSD"
103791 + * c-basic-offset: 4
103792 + * tab-width: 4
103793 + * indent-tabs-mode: nil
103794 + * End:
103795 + */
103796 diff -Nur linux-2.6.16.33-noxen/include/xen/interface/features.h linux-2.6.16.33/include/xen/interface/features.h
103797 --- linux-2.6.16.33-noxen/include/xen/interface/features.h 1970-01-01 00:00:00.000000000 +0000
103798 +++ linux-2.6.16.33/include/xen/interface/features.h 2007-01-08 15:00:55.000000000 +0000
103799 @@ -0,0 +1,71 @@
103800 +/******************************************************************************
103801 + * features.h
103802 + *
103803 + * Feature flags, reported by XENVER_get_features.
103804 + *
103805 + * Permission is hereby granted, free of charge, to any person obtaining a copy
103806 + * of this software and associated documentation files (the "Software"), to
103807 + * deal in the Software without restriction, including without limitation the
103808 + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
103809 + * sell copies of the Software, and to permit persons to whom the Software is
103810 + * furnished to do so, subject to the following conditions:
103811 + *
103812 + * The above copyright notice and this permission notice shall be included in
103813 + * all copies or substantial portions of the Software.
103814 + *
103815 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
103816 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
103817 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
103818 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
103819 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
103820 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
103821 + * DEALINGS IN THE SOFTWARE.
103822 + *
103823 + * Copyright (c) 2006, Keir Fraser <keir@xensource.com>
103824 + */
103825 +
103826 +#ifndef __XEN_PUBLIC_FEATURES_H__
103827 +#define __XEN_PUBLIC_FEATURES_H__
103828 +
103829 +/*
103830 + * If set, the guest does not need to write-protect its pagetables, and can
103831 + * update them via direct writes.
103832 + */
103833 +#define XENFEAT_writable_page_tables 0
103834 +
103835 +/*
103836 + * If set, the guest does not need to write-protect its segment descriptor
103837 + * tables, and can update them via direct writes.
103838 + */
103839 +#define XENFEAT_writable_descriptor_tables 1
103840 +
103841 +/*
103842 + * If set, translation between the guest's 'pseudo-physical' address space
103843 + * and the host's machine address space is handled by the hypervisor. In this
103844 + * mode the guest does not need to perform phys-to/from-machine translations
103845 + * when performing page table operations.
103846 + */
103847 +#define XENFEAT_auto_translated_physmap 2
103848 +
103849 +/* If set, the guest is running in supervisor mode (e.g., x86 ring 0). */
103850 +#define XENFEAT_supervisor_mode_kernel 3
103851 +
103852 +/*
103853 + * If set, the guest does not need to allocate x86 PAE page directories
103854 + * below 4GB. This flag is usually implied by auto_translated_physmap.
103855 + */
103856 +#define XENFEAT_pae_pgdir_above_4gb 4
103857 +
103858 +#define XENFEAT_NR_SUBMAPS 1
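Each flag above is a bit index into the 32-bit submap words reported by XENVER_get_features; a minimal test helper (how the words are fetched is outside this header, and the helper name is hypothetical) might look like:

static inline int example_xen_feature(const uint32_t submaps[XENFEAT_NR_SUBMAPS],
                                      unsigned int feature)
{
    return (submaps[feature / 32] >> (feature % 32)) & 1;
}
/* e.g. example_xen_feature(submaps, XENFEAT_writable_page_tables) */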
103859 +
103860 +#endif /* __XEN_PUBLIC_FEATURES_H__ */
103861 +
103862 +/*
103863 + * Local variables:
103864 + * mode: C
103865 + * c-set-style: "BSD"
103866 + * c-basic-offset: 4
103867 + * tab-width: 4
103868 + * indent-tabs-mode: nil
103869 + * End:
103870 + */
103871 diff -Nur linux-2.6.16.33-noxen/include/xen/interface/grant_table.h linux-2.6.16.33/include/xen/interface/grant_table.h
103872 --- linux-2.6.16.33-noxen/include/xen/interface/grant_table.h 1970-01-01 00:00:00.000000000 +0000
103873 +++ linux-2.6.16.33/include/xen/interface/grant_table.h 2007-01-08 15:00:55.000000000 +0000
103874 @@ -0,0 +1,380 @@
103875 +/******************************************************************************
103876 + * grant_table.h
103877 + *
103878 + * Interface for granting foreign access to page frames, and receiving
103879 + * page-ownership transfers.
103880 + *
103881 + * Permission is hereby granted, free of charge, to any person obtaining a copy
103882 + * of this software and associated documentation files (the "Software"), to
103883 + * deal in the Software without restriction, including without limitation the
103884 + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
103885 + * sell copies of the Software, and to permit persons to whom the Software is
103886 + * furnished to do so, subject to the following conditions:
103887 + *
103888 + * The above copyright notice and this permission notice shall be included in
103889 + * all copies or substantial portions of the Software.
103890 + *
103891 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
103892 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
103893 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
103894 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
103895 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
103896 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
103897 + * DEALINGS IN THE SOFTWARE.
103898 + *
103899 + * Copyright (c) 2004, K A Fraser
103900 + */
103901 +
103902 +#ifndef __XEN_PUBLIC_GRANT_TABLE_H__
103903 +#define __XEN_PUBLIC_GRANT_TABLE_H__
103904 +
103905 +
103906 +/***********************************
103907 + * GRANT TABLE REPRESENTATION
103908 + */
103909 +
103910 +/* Some rough guidelines on accessing and updating grant-table entries
103911 + * in a concurrency-safe manner. For more information, Linux contains a
103912 + * reference implementation for guest OSes (arch/xen/kernel/grant_table.c).
103913 + *
103914 + * NB. WMB is a no-op on current-generation x86 processors. However, a
103915 + * compiler barrier will still be required.
103916 + *
103917 + * Introducing a valid entry into the grant table:
103918 + * 1. Write ent->domid.
103919 + * 2. Write ent->frame:
103920 + * GTF_permit_access: Frame to which access is permitted.
103921 + * GTF_accept_transfer: Pseudo-phys frame slot being filled by new
103922 + * frame, or zero if none.
103923 + * 3. Write memory barrier (WMB).
103924 + * 4. Write ent->flags, inc. valid type.
103925 + *
103926 + * Invalidating an unused GTF_permit_access entry:
103927 + * 1. flags = ent->flags.
103928 + * 2. Observe that !(flags & (GTF_reading|GTF_writing)).
103929 + * 3. Check result of SMP-safe CMPXCHG(&ent->flags, flags, 0).
103930 + * NB. No need for WMB as reuse of entry is control-dependent on success of
103931 + * step 3, and all architectures guarantee ordering of ctrl-dep writes.
103932 + *
103933 + * Invalidating an in-use GTF_permit_access entry:
103934 + * This cannot be done directly. Request assistance from the domain controller
103935 + * which can set a timeout on the use of a grant entry and take necessary
103936 + * action. (NB. This is not yet implemented!).
103937 + *
103938 + * Invalidating an unused GTF_accept_transfer entry:
103939 + * 1. flags = ent->flags.
103940 + * 2. Observe that !(flags & GTF_transfer_committed). [*]
103941 + * 3. Check result of SMP-safe CMPXCHG(&ent->flags, flags, 0).
103942 + * NB. No need for WMB as reuse of entry is control-dependent on success of
103943 + * step 3, and all architectures guarantee ordering of ctrl-dep writes.
103944 + * [*] If GTF_transfer_committed is set then the grant entry is 'committed'.
103945 + * The guest must /not/ modify the grant entry until the address of the
103946 + * transferred frame is written. It is safe for the guest to spin waiting
103947 + * for this to occur (detect by observing GTF_transfer_completed in
103948 + * ent->flags).
103949 + *
103950 + * Invalidating a committed GTF_accept_transfer entry:
103951 + * 1. Wait for (ent->flags & GTF_transfer_completed).
103952 + *
103953 + * Changing a GTF_permit_access from writable to read-only:
103954 + * Use SMP-safe CMPXCHG to set GTF_readonly, while checking !GTF_writing.
103955 + *
103956 + * Changing a GTF_permit_access from read-only to writable:
103957 + * Use SMP-safe bit-setting instruction.
103958 + */
103959 +
103960 +/*
103961 + * A grant table comprises a packed array of grant entries in one or more
103962 + * page frames shared between Xen and a guest.
103963 + * [XEN]: This field is written by Xen and read by the sharing guest.
103964 + * [GST]: This field is written by the guest and read by Xen.
103965 + */
103966 +struct grant_entry {
103967 + /* GTF_xxx: various type and flag information. [XEN,GST] */
103968 + uint16_t flags;
103969 + /* The domain being granted foreign privileges. [GST] */
103970 + domid_t domid;
103971 + /*
103972 + * GTF_permit_access: Frame that @domid is allowed to map and access. [GST]
103973 + * GTF_accept_transfer: Frame whose ownership transferred by @domid. [XEN]
103974 + */
103975 + uint32_t frame;
103976 +};
103977 +typedef struct grant_entry grant_entry_t;
103978 +
103979 +/*
103980 + * Type of grant entry.
103981 + * GTF_invalid: This grant entry grants no privileges.
103982 + * GTF_permit_access: Allow @domid to map/access @frame.
103983 + * GTF_accept_transfer: Allow @domid to transfer ownership of one page frame
103984 + * to this guest. Xen writes the page number to @frame.
103985 + */
103986 +#define GTF_invalid (0U<<0)
103987 +#define GTF_permit_access (1U<<0)
103988 +#define GTF_accept_transfer (2U<<0)
103989 +#define GTF_type_mask (3U<<0)
103990 +
103991 +/*
103992 + * Subflags for GTF_permit_access.
103993 + * GTF_readonly: Restrict @domid to read-only mappings and accesses. [GST]
103994 + * GTF_reading: Grant entry is currently mapped for reading by @domid. [XEN]
103995 + * GTF_writing: Grant entry is currently mapped for writing by @domid. [XEN]
103996 + */
103997 +#define _GTF_readonly (2)
103998 +#define GTF_readonly (1U<<_GTF_readonly)
103999 +#define _GTF_reading (3)
104000 +#define GTF_reading (1U<<_GTF_reading)
104001 +#define _GTF_writing (4)
104002 +#define GTF_writing (1U<<_GTF_writing)
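A sketch of the update rules from the guidelines above (grant, then revoke, an access entry), assuming the kernel's wmb() barrier and a cmpxchg() that accepts 16-bit operands; the helper names are hypothetical:

static inline void example_grant_access(struct grant_entry *ent, domid_t dom,
                                        uint32_t frame, int readonly)
{
    /* Introducing a valid entry: domid and frame first, barrier, then flags. */
    ent->domid = dom;
    ent->frame = frame;
    wmb();
    ent->flags = GTF_permit_access | (readonly ? GTF_readonly : 0);
}

static inline int example_grant_revoke(struct grant_entry *ent)
{
    /* Invalidating an unused GTF_permit_access entry. */
    uint16_t flags = ent->flags;

    if (flags & (GTF_reading | GTF_writing))
        return 0;                          /* still mapped; cannot revoke yet */
    return cmpxchg(&ent->flags, flags, 0) == flags;
}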
104003 +
104004 +/*
104005 + * Subflags for GTF_accept_transfer:
104006 + * GTF_transfer_committed: Xen sets this flag to indicate that it is committed
104007 + * to transferring ownership of a page frame. When a guest sees this flag
104008 + * it must /not/ modify the grant entry until GTF_transfer_completed is
104009 + * set by Xen.
104010 + * GTF_transfer_completed: It is safe for the guest to spin-wait on this flag
104011 + * after reading GTF_transfer_committed. Xen will always write the frame
104012 + * address, followed by ORing this flag, in a timely manner.
104013 + */
104014 +#define _GTF_transfer_committed (2)
104015 +#define GTF_transfer_committed (1U<<_GTF_transfer_committed)
104016 +#define _GTF_transfer_completed (3)
104017 +#define GTF_transfer_completed (1U<<_GTF_transfer_completed)
104018 +
104019 +
104020 +/***********************************
104021 + * GRANT TABLE QUERIES AND USES
104022 + */
104023 +
104024 +/*
104025 + * Reference to a grant entry in a specified domain's grant table.
104026 + */
104027 +typedef uint32_t grant_ref_t;
104028 +
104029 +/*
104030 + * Handle to track a mapping created via a grant reference.
104031 + */
104032 +typedef uint32_t grant_handle_t;
104033 +
104034 +/*
104035 + * GNTTABOP_map_grant_ref: Map the grant entry (<dom>,<ref>) for access
104036 + * by devices and/or host CPUs. If successful, <handle> is a tracking number
104037 + * that must be presented later to destroy the mapping(s). On error, <handle>
104038 + * is a negative status code.
104039 + * NOTES:
104040 + * 1. If GNTMAP_device_map is specified then <dev_bus_addr> is the address
104041 + * via which I/O devices may access the granted frame.
104042 + * 2. If GNTMAP_host_map is specified then a mapping will be added at
104043 + * either a host virtual address in the current address space, or at
104044 + * a PTE at the specified machine address. The type of mapping to
104045 + * perform is selected through the GNTMAP_contains_pte flag, and the
104046 + * address is specified in <host_addr>.
104047 + * 3. Mappings should only be destroyed via GNTTABOP_unmap_grant_ref. If a
104048 + * host mapping is destroyed by other means then it is *NOT* guaranteed
104049 + * to be accounted to the correct grant reference!
104050 + */
104051 +#define GNTTABOP_map_grant_ref 0
104052 +struct gnttab_map_grant_ref {
104053 + /* IN parameters. */
104054 + uint64_t host_addr;
104055 + uint32_t flags; /* GNTMAP_* */
104056 + grant_ref_t ref;
104057 + domid_t dom;
104058 + /* OUT parameters. */
104059 + int16_t status; /* GNTST_* */
104060 + grant_handle_t handle;
104061 + uint64_t dev_bus_addr;
104062 +};
104063 +typedef struct gnttab_map_grant_ref gnttab_map_grant_ref_t;
104064 +DEFINE_XEN_GUEST_HANDLE(gnttab_map_grant_ref_t);
104065 +
104066 +/*
104067 + * GNTTABOP_unmap_grant_ref: Destroy one or more grant-reference mappings
104068 + * tracked by <handle>. If <host_addr> or <dev_bus_addr> is zero, that
104069 + * field is ignored. If non-zero, they must refer to a device/host mapping
104070 + * that is tracked by <handle>
104071 + * NOTES:
104072 + * 1. The call may fail in an undefined manner if either mapping is not
104073 + * tracked by <handle>.
104074 + * 2. After executing a batch of unmaps, it is guaranteed that no stale
104075 + * mappings will remain in the device or host TLBs.
104076 + */
104077 +#define GNTTABOP_unmap_grant_ref 1
104078 +struct gnttab_unmap_grant_ref {
104079 + /* IN parameters. */
104080 + uint64_t host_addr;
104081 + uint64_t dev_bus_addr;
104082 + grant_handle_t handle;
104083 + /* OUT parameters. */
104084 + int16_t status; /* GNTST_* */
104085 +};
104086 +typedef struct gnttab_unmap_grant_ref gnttab_unmap_grant_ref_t;
104087 +DEFINE_XEN_GUEST_HANDLE(gnttab_unmap_grant_ref_t);
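A minimal map/unmap pair for these two operations, assuming the arch hypercall wrapper HYPERVISOR_grant_table_op(cmd, uop, count) and a caller-chosen virtual address at which the granted frame should appear (the GNTMAP_* and GNTST_* values are defined further down in this header; helper names are hypothetical):

static int example_map_foreign_page(domid_t dom, grant_ref_t ref,
                                    unsigned long vaddr, grant_handle_t *handle)
{
    struct gnttab_map_grant_ref map = {
        .host_addr = vaddr,
        .flags     = GNTMAP_host_map,
        .ref       = ref,
        .dom       = dom,
    };

    if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &map, 1) ||
        map.status != GNTST_okay)
        return -1;
    *handle = map.handle;
    return 0;
}

static void example_unmap_foreign_page(unsigned long vaddr, grant_handle_t handle)
{
    struct gnttab_unmap_grant_ref unmap = {
        .host_addr = vaddr,
        .handle    = handle,
    };
    HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &unmap, 1);
}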
104088 +
104089 +/*
104090 + * GNTTABOP_setup_table: Set up a grant table for <dom> comprising at least
104091 + * <nr_frames> pages. The frame addresses are written to the <frame_list>.
104092 + * Only <nr_frames> addresses are written, even if the table is larger.
104093 + * NOTES:
104094 + * 1. <dom> may be specified as DOMID_SELF.
104095 + * 2. Only a sufficiently-privileged domain may specify <dom> != DOMID_SELF.
104096 + * 3. Xen may not support more than a single grant-table page per domain.
104097 + */
104098 +#define GNTTABOP_setup_table 2
104099 +struct gnttab_setup_table {
104100 + /* IN parameters. */
104101 + domid_t dom;
104102 + uint32_t nr_frames;
104103 + /* OUT parameters. */
104104 + int16_t status; /* GNTST_* */
104105 + XEN_GUEST_HANDLE(ulong) frame_list;
104106 +};
104107 +typedef struct gnttab_setup_table gnttab_setup_table_t;
104108 +DEFINE_XEN_GUEST_HANDLE(gnttab_setup_table_t);
104109 +
104110 +/*
104111 + * GNTTABOP_dump_table: Dump the contents of the grant table to the
104112 + * xen console. Debugging use only.
104113 + */
104114 +#define GNTTABOP_dump_table 3
104115 +struct gnttab_dump_table {
104116 + /* IN parameters. */
104117 + domid_t dom;
104118 + /* OUT parameters. */
104119 + int16_t status; /* GNTST_* */
104120 +};
104121 +typedef struct gnttab_dump_table gnttab_dump_table_t;
104122 +DEFINE_XEN_GUEST_HANDLE(gnttab_dump_table_t);
104123 +
104124 +/*
104125 + * GNTTABOP_transfer: Transfer <frame> to a foreign domain. The
104126 + * foreign domain has previously registered its interest in the transfer via
104127 + * <domid, ref>.
104128 + *
104129 + * Note that, even if the transfer fails, the specified page no longer belongs
104130 + * to the calling domain *unless* the error is GNTST_bad_page.
104131 + */
104132 +#define GNTTABOP_transfer 4
104133 +struct gnttab_transfer {
104134 + /* IN parameters. */
104135 + xen_pfn_t mfn;
104136 + domid_t domid;
104137 + grant_ref_t ref;
104138 + /* OUT parameters. */
104139 + int16_t status;
104140 +};
104141 +typedef struct gnttab_transfer gnttab_transfer_t;
104142 +DEFINE_XEN_GUEST_HANDLE(gnttab_transfer_t);
104143 +
104144 +
104145 +/*
104146 + * GNTTABOP_copy: Hypervisor based copy
104147 + * Source and destination can be either MFNs or, for foreign domains,
104148 + * grant references. The foreign domain has to grant read/write access
104149 + * in its grant table.
104150 + *
104151 + * The flags specify what type source and destinations are (either MFN
104152 + * or grant reference).
104153 + *
104154 + * Note that this can also be used to copy data between two domains
104155 + * via a third party if the source and destination domains have previously
104156 + * granted appropriate access to their pages to the third party.
104157 + *
104158 + * source_offset specifies an offset in the source frame, dest_offset
104159 + * the offset in the target frame and len specifies the number of
104160 + * bytes to be copied.
104161 + */
104162 +
104163 +#define _GNTCOPY_source_gref (0)
104164 +#define GNTCOPY_source_gref (1<<_GNTCOPY_source_gref)
104165 +#define _GNTCOPY_dest_gref (1)
104166 +#define GNTCOPY_dest_gref (1<<_GNTCOPY_dest_gref)
104167 +
104168 +#define GNTTABOP_copy 5
104169 +typedef struct gnttab_copy {
104170 + /* IN parameters. */
104171 + struct {
104172 + union {
104173 + grant_ref_t ref;
104174 + xen_pfn_t gmfn;
104175 + } u;
104176 + domid_t domid;
104177 + uint16_t offset;
104178 + } source, dest;
104179 + uint16_t len;
104180 + uint16_t flags; /* GNTCOPY_* */
104181 + /* OUT parameters. */
104182 + int16_t status;
104183 +} gnttab_copy_t;
104184 +DEFINE_XEN_GUEST_HANDLE(gnttab_copy_t);
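A hedged sketch pulling len bytes from a foreign grant reference into a local frame (HYPERVISOR_grant_table_op() assumed as above; each offset plus len must stay within one page):

static int example_grant_copy_in(domid_t remote, grant_ref_t remote_ref,
                                 xen_pfn_t local_gmfn, uint16_t len)
{
    struct gnttab_copy copy = {
        .source.u.ref  = remote_ref,
        .source.domid  = remote,
        .source.offset = 0,
        .dest.u.gmfn   = local_gmfn,
        .dest.domid    = DOMID_SELF,
        .dest.offset   = 0,
        .len           = len,
        .flags         = GNTCOPY_source_gref,  /* dest is a plain gmfn */
    };

    if (HYPERVISOR_grant_table_op(GNTTABOP_copy, &copy, 1))
        return -1;
    return copy.status == GNTST_okay ? 0 : -1;
}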
104185 +
104186 +
104187 +/*
104188 + * Bitfield values for update_pin_status.flags.
104189 + */
104190 + /* Map the grant entry for access by I/O devices. */
104191 +#define _GNTMAP_device_map (0)
104192 +#define GNTMAP_device_map (1<<_GNTMAP_device_map)
104193 + /* Map the grant entry for access by host CPUs. */
104194 +#define _GNTMAP_host_map (1)
104195 +#define GNTMAP_host_map (1<<_GNTMAP_host_map)
104196 + /* Accesses to the granted frame will be restricted to read-only access. */
104197 +#define _GNTMAP_readonly (2)
104198 +#define GNTMAP_readonly (1<<_GNTMAP_readonly)
104199 + /*
104200 + * GNTMAP_host_map subflag:
104201 + * 0 => The host mapping is usable only by the guest OS.
104202 + * 1 => The host mapping is usable by guest OS + current application.
104203 + */
104204 +#define _GNTMAP_application_map (3)
104205 +#define GNTMAP_application_map (1<<_GNTMAP_application_map)
104206 +
104207 + /*
104208 + * GNTMAP_contains_pte subflag:
104209 + * 0 => This map request contains a host virtual address.
104210 + * 1 => This map request contains the machine address of the PTE to update.
104211 + */
104212 +#define _GNTMAP_contains_pte (4)
104213 +#define GNTMAP_contains_pte (1<<_GNTMAP_contains_pte)
104214 +
104215 +/*
104216 + * Values for error status returns. All errors are -ve.
104217 + */
104218 +#define GNTST_okay (0) /* Normal return. */
104219 +#define GNTST_general_error (-1) /* General undefined error. */
104220 +#define GNTST_bad_domain (-2) /* Unrecognised domain id. */
104221 +#define GNTST_bad_gntref (-3) /* Unrecognised or inappropriate gntref. */
104222 +#define GNTST_bad_handle (-4) /* Unrecognised or inappropriate handle. */
104223 +#define GNTST_bad_virt_addr (-5) /* Inappropriate virtual address to map. */
104224 +#define GNTST_bad_dev_addr (-6) /* Inappropriate device address to unmap.*/
104225 +#define GNTST_no_device_space (-7) /* Out of space in I/O MMU. */
104226 +#define GNTST_permission_denied (-8) /* Not enough privilege for operation. */
104227 +#define GNTST_bad_page (-9) /* Specified page was invalid for op. */
104228 +#define GNTST_bad_copy_arg (-10) /* copy arguments cross page boundary */
104229 +
104230 +#define GNTTABOP_error_msgs { \
104231 + "okay", \
104232 + "undefined error", \
104233 + "unrecognised domain id", \
104234 + "invalid grant reference", \
104235 + "invalid mapping handle", \
104236 + "invalid virtual address", \
104237 + "invalid device address", \
104238 + "no spare translation slot in the I/O MMU", \
104239 + "permission denied", \
104240 + "bad page", \
104241 + "copy arguments cross page boundary" \
104242 +}
104243 +
104244 +#endif /* __XEN_PUBLIC_GRANT_TABLE_H__ */
104245 +
104246 +/*
104247 + * Local variables:
104248 + * mode: C
104249 + * c-set-style: "BSD"
104250 + * c-basic-offset: 4
104251 + * tab-width: 4
104252 + * indent-tabs-mode: nil
104253 + * End:
104254 + */
104255 diff -Nur linux-2.6.16.33-noxen/include/xen/interface/hvm/e820.h linux-2.6.16.33/include/xen/interface/hvm/e820.h
104256 --- linux-2.6.16.33-noxen/include/xen/interface/hvm/e820.h 1970-01-01 00:00:00.000000000 +0000
104257 +++ linux-2.6.16.33/include/xen/interface/hvm/e820.h 2007-01-08 15:00:55.000000000 +0000
104258 @@ -0,0 +1,47 @@
104259 +
104260 +/*
104261 + * Permission is hereby granted, free of charge, to any person obtaining a copy
104262 + * of this software and associated documentation files (the "Software"), to
104263 + * deal in the Software without restriction, including without limitation the
104264 + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
104265 + * sell copies of the Software, and to permit persons to whom the Software is
104266 + * furnished to do so, subject to the following conditions:
104267 + *
104268 + * The above copyright notice and this permission notice shall be included in
104269 + * all copies or substantial portions of the Software.
104270 + *
104271 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
104272 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
104273 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
104274 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
104275 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
104276 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
104277 + * DEALINGS IN THE SOFTWARE.
104278 + */
104279 +
104280 +#ifndef __XEN_PUBLIC_HVM_E820_H__
104281 +#define __XEN_PUBLIC_HVM_E820_H__
104282 +
104283 +/* PC BIOS standard E820 types. */
104284 +#define E820_RAM 1
104285 +#define E820_RESERVED 2
104286 +#define E820_ACPI 3
104287 +#define E820_NVS 4
104288 +
104289 +/* E820 location in HVM virtual address space. */
104290 +#define E820_MAP_PAGE 0x00090000
104291 +#define E820_MAP_NR_OFFSET 0x000001E8
104292 +#define E820_MAP_OFFSET 0x000002D0
104293 +
104294 +struct e820entry {
104295 + uint64_t addr;
104296 + uint64_t size;
104297 + uint32_t type;
104298 +} __attribute__((packed));
104299 +
104300 +#define HVM_BELOW_4G_RAM_END 0xF0000000
104301 +
104302 +#define HVM_BELOW_4G_MMIO_START HVM_BELOW_4G_RAM_END
104303 +#define HVM_BELOW_4G_MMIO_LENGTH ((1ULL << 32) - HVM_BELOW_4G_MMIO_START)
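An illustrative walk of the map, assuming the caller has a virtual mapping of the guest page at E820_MAP_PAGE and that the entry count at E820_MAP_NR_OFFSET is a single byte (hvmloader/rombios convention, an assumption here; verify against the firmware in use):

static uint64_t example_e820_ram_bytes(const uint8_t *map_page)
{
    const struct e820entry *e =
        (const struct e820entry *)(map_page + E820_MAP_OFFSET);
    unsigned int i, nr = map_page[E820_MAP_NR_OFFSET];
    uint64_t ram = 0;

    for (i = 0; i < nr; i++)
        if (e[i].type == E820_RAM)
            ram += e[i].size;
    return ram;
}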
104304 +
104305 +#endif /* __XEN_PUBLIC_HVM_E820_H__ */
104306 diff -Nur linux-2.6.16.33-noxen/include/xen/interface/hvm/hvm_info_table.h linux-2.6.16.33/include/xen/interface/hvm/hvm_info_table.h
104307 --- linux-2.6.16.33-noxen/include/xen/interface/hvm/hvm_info_table.h 1970-01-01 00:00:00.000000000 +0000
104308 +++ linux-2.6.16.33/include/xen/interface/hvm/hvm_info_table.h 2007-01-08 15:00:55.000000000 +0000
104309 @@ -0,0 +1,41 @@
104310 +/******************************************************************************
104311 + * hvm/hvm_info_table.h
104312 + *
104313 + * HVM parameter and information table, written into guest memory map.
104314 + *
104315 + * Permission is hereby granted, free of charge, to any person obtaining a copy
104316 + * of this software and associated documentation files (the "Software"), to
104317 + * deal in the Software without restriction, including without limitation the
104318 + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
104319 + * sell copies of the Software, and to permit persons to whom the Software is
104320 + * furnished to do so, subject to the following conditions:
104321 + *
104322 + * The above copyright notice and this permission notice shall be included in
104323 + * all copies or substantial portions of the Software.
104324 + *
104325 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
104326 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
104327 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
104328 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
104329 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
104330 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
104331 + * DEALINGS IN THE SOFTWARE.
104332 + */
104333 +
104334 +#ifndef __XEN_PUBLIC_HVM_HVM_INFO_TABLE_H__
104335 +#define __XEN_PUBLIC_HVM_HVM_INFO_TABLE_H__
104336 +
104337 +#define HVM_INFO_PFN 0x09F
104338 +#define HVM_INFO_OFFSET 0x800
104339 +#define HVM_INFO_PADDR ((HVM_INFO_PFN << 12) + HVM_INFO_OFFSET)
104340 +
104341 +struct hvm_info_table {
104342 + char signature[8]; /* "HVM INFO" */
104343 + uint32_t length;
104344 + uint8_t checksum;
104345 + uint8_t acpi_enabled;
104346 + uint8_t apic_mode;
104347 + uint32_t nr_vcpus;
104348 +};
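The table sits at a fixed guest-physical address: HVM_INFO_PADDR = (0x09F << 12) + 0x800 = 0x9F800. A minimal lookup, assuming the caller has already mapped that page and that memcmp()/NULL come from the usual headers:

static inline const struct hvm_info_table *
example_hvm_info(const void *page_at_9f000)
{
    const struct hvm_info_table *t = (const struct hvm_info_table *)
        ((const char *)page_at_9f000 + HVM_INFO_OFFSET);
    return memcmp(t->signature, "HVM INFO", 8) == 0 ? t : NULL;
}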
104349 +
104350 +#endif /* __XEN_PUBLIC_HVM_HVM_INFO_TABLE_H__ */
104351 diff -Nur linux-2.6.16.33-noxen/include/xen/interface/hvm/hvm_op.h linux-2.6.16.33/include/xen/interface/hvm/hvm_op.h
104352 --- linux-2.6.16.33-noxen/include/xen/interface/hvm/hvm_op.h 1970-01-01 00:00:00.000000000 +0000
104353 +++ linux-2.6.16.33/include/xen/interface/hvm/hvm_op.h 2007-01-08 15:00:55.000000000 +0000
104354 @@ -0,0 +1,53 @@
104355 +#ifndef __XEN_PUBLIC_HVM_HVM_OP_H__
104356 +#define __XEN_PUBLIC_HVM_HVM_OP_H__
104357 +
104358 +/* Get/set subcommands: extra argument == pointer to xen_hvm_param struct. */
104359 +#define HVMOP_set_param 0
104360 +#define HVMOP_get_param 1
104361 +struct xen_hvm_param {
104362 + domid_t domid; /* IN */
104363 + uint32_t index; /* IN */
104364 + uint64_t value; /* IN/OUT */
104365 +};
104366 +typedef struct xen_hvm_param xen_hvm_param_t;
104367 +DEFINE_XEN_GUEST_HANDLE(xen_hvm_param_t);
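A minimal get sketch, assuming a two-argument wrapper HYPERVISOR_hvm_op(cmd, arg) is available (hypothetical here; check the arch hypercall header) and an HVM_PARAM_* index from params.h:

static int example_get_hvm_param(domid_t dom, uint32_t index, uint64_t *value)
{
    struct xen_hvm_param p = {
        .domid = dom,
        .index = index,
    };
    int rc = HYPERVISOR_hvm_op(HVMOP_get_param, &p);

    if (rc == 0)
        *value = p.value;
    return rc;
}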
104368 +
104369 +/* Set the logical level of one of a domain's PCI INTx wires. */
104370 +#define HVMOP_set_pci_intx_level 2
104371 +struct xen_hvm_set_pci_intx_level {
104372 + /* Domain to be updated. */
104373 + domid_t domid;
104374 + /* PCI INTx identification in PCI topology (domain:bus:device:intx). */
104375 + uint8_t domain, bus, device, intx;
104376 + /* Assertion level (0 = unasserted, 1 = asserted). */
104377 + uint8_t level;
104378 +};
104379 +typedef struct xen_hvm_set_pci_intx_level xen_hvm_set_pci_intx_level_t;
104380 +DEFINE_XEN_GUEST_HANDLE(xen_hvm_set_pci_intx_level_t);
104381 +
104382 +/* Set the logical level of one of a domain's ISA IRQ wires. */
104383 +#define HVMOP_set_isa_irq_level 3
104384 +struct xen_hvm_set_isa_irq_level {
104385 + /* Domain to be updated. */
104386 + domid_t domid;
104387 + /* ISA device identification, by ISA IRQ (0-15). */
104388 + uint8_t isa_irq;
104389 + /* Assertion level (0 = unasserted, 1 = asserted). */
104390 + uint8_t level;
104391 +};
104392 +typedef struct xen_hvm_set_isa_irq_level xen_hvm_set_isa_irq_level_t;
104393 +DEFINE_XEN_GUEST_HANDLE(xen_hvm_set_isa_irq_level_t);
104394 +
104395 +#define HVMOP_set_pci_link_route 4
104396 +struct xen_hvm_set_pci_link_route {
104397 + /* Domain to be updated. */
104398 + domid_t domid;
104399 + /* PCI link identifier (0-3). */
104400 + uint8_t link;
104401 + /* ISA IRQ (1-15), or 0 (disable link). */
104402 + uint8_t isa_irq;
104403 +};
104404 +typedef struct xen_hvm_set_pci_link_route xen_hvm_set_pci_link_route_t;
104405 +DEFINE_XEN_GUEST_HANDLE(xen_hvm_set_pci_link_route_t);
104406 +
104407 +#endif /* __XEN_PUBLIC_HVM_HVM_OP_H__ */
104408 diff -Nur linux-2.6.16.33-noxen/include/xen/interface/hvm/ioreq.h linux-2.6.16.33/include/xen/interface/hvm/ioreq.h
104409 --- linux-2.6.16.33-noxen/include/xen/interface/hvm/ioreq.h 1970-01-01 00:00:00.000000000 +0000
104410 +++ linux-2.6.16.33/include/xen/interface/hvm/ioreq.h 2007-01-08 15:00:55.000000000 +0000
104411 @@ -0,0 +1,97 @@
104412 +/*
104413 + * ioreq.h: I/O request definitions for device models
104414 + * Copyright (c) 2004, Intel Corporation.
104415 + *
104416 + * Permission is hereby granted, free of charge, to any person obtaining a copy
104417 + * of this software and associated documentation files (the "Software"), to
104418 + * deal in the Software without restriction, including without limitation the
104419 + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
104420 + * sell copies of the Software, and to permit persons to whom the Software is
104421 + * furnished to do so, subject to the following conditions:
104422 + *
104423 + * The above copyright notice and this permission notice shall be included in
104424 + * all copies or substantial portions of the Software.
104425 + *
104426 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
104427 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
104428 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
104429 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
104430 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
104431 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
104432 + * DEALINGS IN THE SOFTWARE.
104433 + */
104434 +
104435 +#ifndef _IOREQ_H_
104436 +#define _IOREQ_H_
104437 +
104438 +#define IOREQ_READ 1
104439 +#define IOREQ_WRITE 0
104440 +
104441 +#define STATE_IOREQ_NONE 0
104442 +#define STATE_IOREQ_READY 1
104443 +#define STATE_IOREQ_INPROCESS 2
104444 +#define STATE_IORESP_READY 3
104445 +
104446 +#define IOREQ_TYPE_PIO 0 /* pio */
104447 +#define IOREQ_TYPE_COPY 1 /* mmio ops */
104448 +#define IOREQ_TYPE_AND 2
104449 +#define IOREQ_TYPE_OR 3
104450 +#define IOREQ_TYPE_XOR 4
104451 +#define IOREQ_TYPE_XCHG 5
104452 +#define IOREQ_TYPE_ADD 6
104453 +
104454 +/*
104455 + * The VMExit dispatcher should cooperate with the instruction decoder to
104456 + * prepare this structure and notify the service OS and DM by sending a
104457 + * virq.
104458 + */
104459 +struct ioreq {
104460 + uint64_t addr; /* physical address */
104461 + uint64_t size; /* size in bytes */
104462 + uint64_t count; /* for rep prefixes */
104463 + uint64_t data; /* data (or paddr of data) */
104464 + uint8_t state:4;
104465 + uint8_t data_is_ptr:1; /* if 1, data above is the guest paddr
104466 + * of the real data to use. */
104467 + uint8_t dir:1; /* 1=read, 0=write */
104468 + uint8_t df:1;
104469 + uint8_t type; /* I/O type */
104470 + uint64_t io_count; /* How many IO done on a vcpu */
104471 +};
104472 +typedef struct ioreq ioreq_t;
104473 +
104474 +struct vcpu_iodata {
104475 + struct ioreq vp_ioreq;
104476 + /* Event channel port */
104477 + unsigned int vp_eport; /* VMX vcpu uses this to notify DM */
104478 +};
104479 +typedef struct vcpu_iodata vcpu_iodata_t;
104480 +
104481 +struct shared_iopage {
104482 + struct vcpu_iodata vcpu_iodata[1];
104483 +};
104484 +typedef struct shared_iopage shared_iopage_t;
104485 +
104486 +#define IOREQ_BUFFER_SLOT_NUM 80
104487 +struct buffered_iopage {
104488 + unsigned long read_pointer;
104489 + unsigned long write_pointer;
104490 + ioreq_t ioreq[IOREQ_BUFFER_SLOT_NUM];
104491 +}; /* this structure must fit within a single page */
104492 +typedef struct buffered_iopage buffered_iopage_t;
104493 +
104494 +#define ACPI_PM1A_EVT_BLK_ADDRESS 0x0000000000001f40
104495 +#define ACPI_PM1A_CNT_BLK_ADDRESS (ACPI_PM1A_EVT_BLK_ADDRESS + 0x04)
104496 +#define ACPI_PM_TMR_BLK_ADDRESS (ACPI_PM1A_EVT_BLK_ADDRESS + 0x08)
104497 +
104498 +#endif /* _IOREQ_H_ */
104499 +
104500 +/*
104501 + * Local variables:
104502 + * mode: C
104503 + * c-set-style: "BSD"
104504 + * c-basic-offset: 4
104505 + * tab-width: 4
104506 + * indent-tabs-mode: nil
104507 + * End:
104508 + */
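
The structures above describe the hand-off between the hypervisor's VMExit path and a device model. A rough sketch of the device-model side (not taken from the patch) is shown below; handle_portio() and handle_mmio() are hypothetical emulation helpers, and memory barriers and event-channel signalling are omitted for brevity:

#include <xen/interface/hvm/ioreq.h>

extern void handle_portio(ioreq_t *req);   /* hypothetical port-I/O emulation */
extern void handle_mmio(ioreq_t *req);     /* hypothetical MMIO emulation */

/* Illustrative sketch only: service one pending I/O request for a vcpu. */
static void example_service_vcpu(struct shared_iopage *shared, int vcpu)
{
    ioreq_t *req = &shared->vcpu_iodata[vcpu].vp_ioreq;

    if (req->state != STATE_IOREQ_READY)
        return;                          /* nothing pending for this vcpu */

    req->state = STATE_IOREQ_INPROCESS;

    if (req->type == IOREQ_TYPE_PIO)
        handle_portio(req);
    else
        handle_mmio(req);

    req->state = STATE_IORESP_READY;     /* guest side waits for this state */
}
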
104509 diff -Nur linux-2.6.16.33-noxen/include/xen/interface/hvm/params.h linux-2.6.16.33/include/xen/interface/hvm/params.h
104510 --- linux-2.6.16.33-noxen/include/xen/interface/hvm/params.h 1970-01-01 00:00:00.000000000 +0000
104511 +++ linux-2.6.16.33/include/xen/interface/hvm/params.h 2007-01-08 15:00:55.000000000 +0000
104512 @@ -0,0 +1,36 @@
104513 +
104514 +/*
104515 + * Permission is hereby granted, free of charge, to any person obtaining a copy
104516 + * of this software and associated documentation files (the "Software"), to
104517 + * deal in the Software without restriction, including without limitation the
104518 + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
104519 + * sell copies of the Software, and to permit persons to whom the Software is
104520 + * furnished to do so, subject to the following conditions:
104521 + *
104522 + * The above copyright notice and this permission notice shall be included in
104523 + * all copies or substantial portions of the Software.
104524 + *
104525 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
104526 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
104527 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
104528 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
104529 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
104530 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
104531 + * DEALINGS IN THE SOFTWARE.
104532 + */
104533 +
104534 +#ifndef __XEN_PUBLIC_HVM_PARAMS_H__
104535 +#define __XEN_PUBLIC_HVM_PARAMS_H__
104536 +
104537 +#include "hvm_op.h"
104538 +
104539 +/* Parameter space for HVMOP_{set,get}_param. */
104540 +#define HVM_PARAM_CALLBACK_IRQ 0
104541 +#define HVM_PARAM_STORE_PFN 1
104542 +#define HVM_PARAM_STORE_EVTCHN 2
104543 +#define HVM_PARAM_PAE_ENABLED 4
104544 +#define HVM_PARAM_IOREQ_PFN 5
104545 +#define HVM_PARAM_BUFIOREQ_PFN 6
104546 +#define HVM_NR_PARAMS 7
104547 +
104548 +#endif /* __XEN_PUBLIC_HVM_PARAMS_H__ */
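
These indices are passed to the HVMOP_set_param/HVMOP_get_param operations declared in hvm_op.h (included above). As a hedged sketch (not taken from the patch), reading the xenstore event channel of an HVM guest might look like this, assuming the xen_hvm_param structure from hvm_op.h and the HYPERVISOR_hvm_op() hypercall wrapper:

#include <xen/interface/hvm/params.h>

/* Illustrative sketch only: fetch the xenstore event channel of a guest. */
static int example_get_store_evtchn(domid_t domid, uint64_t *evtchn)
{
    struct xen_hvm_param p = {
        .domid = domid,
        .index = HVM_PARAM_STORE_EVTCHN,
    };
    int rc = HYPERVISOR_hvm_op(HVMOP_get_param, &p);   /* assumed wrapper */

    if (rc == 0)
        *evtchn = p.value;
    return rc;
}
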
104549 diff -Nur linux-2.6.16.33-noxen/include/xen/interface/hvm/vmx_assist.h linux-2.6.16.33/include/xen/interface/hvm/vmx_assist.h
104550 --- linux-2.6.16.33-noxen/include/xen/interface/hvm/vmx_assist.h 1970-01-01 00:00:00.000000000 +0000
104551 +++ linux-2.6.16.33/include/xen/interface/hvm/vmx_assist.h 2007-01-08 15:00:55.000000000 +0000
104552 @@ -0,0 +1,116 @@
104553 +/*
104554 + * vmx_assist.h: Context definitions for the VMXASSIST world switch.
104555 + *
104556 + * Permission is hereby granted, free of charge, to any person obtaining a copy
104557 + * of this software and associated documentation files (the "Software"), to
104558 + * deal in the Software without restriction, including without limitation the
104559 + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
104560 + * sell copies of the Software, and to permit persons to whom the Software is
104561 + * furnished to do so, subject to the following conditions:
104562 + *
104563 + * The above copyright notice and this permission notice shall be included in
104564 + * all copies or substantial portions of the Software.
104565 + *
104566 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
104567 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
104568 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
104569 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
104570 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
104571 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
104572 + * DEALINGS IN THE SOFTWARE.
104573 + *
104574 + * Leendert van Doorn, leendert@watson.ibm.com
104575 + * Copyright (c) 2005, International Business Machines Corporation.
104576 + */
104577 +
104578 +#ifndef _VMX_ASSIST_H_
104579 +#define _VMX_ASSIST_H_
104580 +
104581 +#define VMXASSIST_BASE 0xD0000
104582 +#define VMXASSIST_MAGIC 0x17101966
104583 +#define VMXASSIST_MAGIC_OFFSET (VMXASSIST_BASE+8)
104584 +
104585 +#define VMXASSIST_NEW_CONTEXT (VMXASSIST_BASE + 12)
104586 +#define VMXASSIST_OLD_CONTEXT (VMXASSIST_NEW_CONTEXT + 4)
104587 +
104588 +#ifndef __ASSEMBLY__
104589 +
104590 +union vmcs_arbytes {
104591 + struct arbyte_fields {
104592 + unsigned int seg_type : 4,
104593 + s : 1,
104594 + dpl : 2,
104595 + p : 1,
104596 + reserved0 : 4,
104597 + avl : 1,
104598 + reserved1 : 1,
104599 + default_ops_size: 1,
104600 + g : 1,
104601 + null_bit : 1,
104602 + reserved2 : 15;
104603 + } fields;
104604 + unsigned int bytes;
104605 +};
104606 +
104607 +/*
104608 + * World switch state
104609 + */
104610 +struct vmx_assist_context {
104611 + uint32_t eip; /* execution pointer */
104612 + uint32_t esp; /* stack pointer */
104613 + uint32_t eflags; /* flags register */
104614 + uint32_t cr0;
104615 + uint32_t cr3; /* page table directory */
104616 + uint32_t cr4;
104617 + uint32_t idtr_limit; /* idt */
104618 + uint32_t idtr_base;
104619 + uint32_t gdtr_limit; /* gdt */
104620 + uint32_t gdtr_base;
104621 + uint32_t cs_sel; /* cs selector */
104622 + uint32_t cs_limit;
104623 + uint32_t cs_base;
104624 + union vmcs_arbytes cs_arbytes;
104625 + uint32_t ds_sel; /* ds selector */
104626 + uint32_t ds_limit;
104627 + uint32_t ds_base;
104628 + union vmcs_arbytes ds_arbytes;
104629 + uint32_t es_sel; /* es selector */
104630 + uint32_t es_limit;
104631 + uint32_t es_base;
104632 + union vmcs_arbytes es_arbytes;
104633 + uint32_t ss_sel; /* ss selector */
104634 + uint32_t ss_limit;
104635 + uint32_t ss_base;
104636 + union vmcs_arbytes ss_arbytes;
104637 + uint32_t fs_sel; /* fs selector */
104638 + uint32_t fs_limit;
104639 + uint32_t fs_base;
104640 + union vmcs_arbytes fs_arbytes;
104641 + uint32_t gs_sel; /* gs selector */
104642 + uint32_t gs_limit;
104643 + uint32_t gs_base;
104644 + union vmcs_arbytes gs_arbytes;
104645 + uint32_t tr_sel; /* task selector */
104646 + uint32_t tr_limit;
104647 + uint32_t tr_base;
104648 + union vmcs_arbytes tr_arbytes;
104649 + uint32_t ldtr_sel; /* ldtr selector */
104650 + uint32_t ldtr_limit;
104651 + uint32_t ldtr_base;
104652 + union vmcs_arbytes ldtr_arbytes;
104653 +};
104654 +typedef struct vmx_assist_context vmx_assist_context_t;
104655 +
104656 +#endif /* __ASSEMBLY__ */
104657 +
104658 +#endif /* _VMX_ASSIST_H_ */
104659 +
104660 +/*
104661 + * Local variables:
104662 + * mode: C
104663 + * c-set-style: "BSD"
104664 + * c-basic-offset: 4
104665 + * tab-width: 4
104666 + * indent-tabs-mode: nil
104667 + * End:
104668 + */
104669 diff -Nur linux-2.6.16.33-noxen/include/xen/interface/io/blkif.h linux-2.6.16.33/include/xen/interface/io/blkif.h
104670 --- linux-2.6.16.33-noxen/include/xen/interface/io/blkif.h 1970-01-01 00:00:00.000000000 +0000
104671 +++ linux-2.6.16.33/include/xen/interface/io/blkif.h 2007-01-08 15:00:55.000000000 +0000
104672 @@ -0,0 +1,126 @@
104673 +/******************************************************************************
104674 + * blkif.h
104675 + *
104676 + * Unified block-device I/O interface for Xen guest OSes.
104677 + *
104678 + * Permission is hereby granted, free of charge, to any person obtaining a copy
104679 + * of this software and associated documentation files (the "Software"), to
104680 + * deal in the Software without restriction, including without limitation the
104681 + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
104682 + * sell copies of the Software, and to permit persons to whom the Software is
104683 + * furnished to do so, subject to the following conditions:
104684 + *
104685 + * The above copyright notice and this permission notice shall be included in
104686 + * all copies or substantial portions of the Software.
104687 + *
104688 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
104689 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
104690 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
104691 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
104692 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
104693 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
104694 + * DEALINGS IN THE SOFTWARE.
104695 + *
104696 + * Copyright (c) 2003-2004, Keir Fraser
104697 + */
104698 +
104699 +#ifndef __XEN_PUBLIC_IO_BLKIF_H__
104700 +#define __XEN_PUBLIC_IO_BLKIF_H__
104701 +
104702 +#include "ring.h"
104703 +#include "../grant_table.h"
104704 +
104705 +/*
104706 + * Front->back notifications: When enqueuing a new request, sending a
104707 + * notification can be made conditional on req_event (i.e., the generic
104708 + * hold-off mechanism provided by the ring macros). Backends must set
104709 + * req_event appropriately (e.g., using RING_FINAL_CHECK_FOR_REQUESTS()).
104710 + *
104711 + * Back->front notifications: When enqueuing a new response, sending a
104712 + * notification can be made conditional on rsp_event (i.e., the generic
104713 + * hold-off mechanism provided by the ring macros). Frontends must set
104714 + * rsp_event appropriately (e.g., using RING_FINAL_CHECK_FOR_RESPONSES()).
104715 + */
104716 +
104717 +#ifndef blkif_vdev_t
104718 +#define blkif_vdev_t uint16_t
104719 +#endif
104720 +#define blkif_sector_t uint64_t
104721 +
104722 +/*
104723 + * REQUEST CODES.
104724 + */
104725 +#define BLKIF_OP_READ 0
104726 +#define BLKIF_OP_WRITE 1
104727 +/*
104728 + * Recognised only if "feature-barrier" is present in backend xenbus info.
104729 + * The "feature-barrier" node contains a boolean indicating whether barrier
104730 + * requests are likely to succeed or fail. Either way, a barrier request
104731 + * may fail at any time with BLKIF_RSP_EOPNOTSUPP if it is unsupported by
104732 + * the underlying block-device hardware. The boolean simply indicates whether
104733 + * or not it is worthwhile for the frontend to attempt barrier requests.
104734 + * If a backend does not recognise BLKIF_OP_WRITE_BARRIER, it should *not*
104735 + * create the "feature-barrier" node!
104736 + */
104737 +#define BLKIF_OP_WRITE_BARRIER 2
104738 +
104739 +/*
104740 + * Maximum scatter/gather segments per request.
104741 + * This is carefully chosen so that sizeof(blkif_ring_t) <= PAGE_SIZE.
104742 + * NB. This could be 12 if the ring indexes weren't stored in the same page.
104743 + */
104744 +#define BLKIF_MAX_SEGMENTS_PER_REQUEST 11
104745 +
104746 +struct blkif_request {
104747 + uint8_t operation; /* BLKIF_OP_??? */
104748 + uint8_t nr_segments; /* number of segments */
104749 + blkif_vdev_t handle; /* only for read/write requests */
104750 + uint64_t id; /* private guest value, echoed in resp */
104751 + blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */
104752 + struct blkif_request_segment {
104753 + grant_ref_t gref; /* reference to I/O buffer frame */
104754 + /* @first_sect: first sector in frame to transfer (inclusive). */
104755 + /* @last_sect: last sector in frame to transfer (inclusive). */
104756 + uint8_t first_sect, last_sect;
104757 + } seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
104758 +};
104759 +typedef struct blkif_request blkif_request_t;
104760 +
104761 +struct blkif_response {
104762 + uint64_t id; /* copied from request */
104763 + uint8_t operation; /* copied from request */
104764 + int16_t status; /* BLKIF_RSP_??? */
104765 +};
104766 +typedef struct blkif_response blkif_response_t;
104767 +
104768 +/*
104769 + * STATUS RETURN CODES.
104770 + */
104771 + /* Operation not supported (only happens on barrier writes). */
104772 +#define BLKIF_RSP_EOPNOTSUPP -2
104773 + /* Operation failed for some unspecified reason (-EIO). */
104774 +#define BLKIF_RSP_ERROR -1
104775 + /* Operation completed successfully. */
104776 +#define BLKIF_RSP_OKAY 0
104777 +
104778 +/*
104779 + * Generate blkif ring structures and types.
104780 + */
104781 +
104782 +DEFINE_RING_TYPES(blkif, struct blkif_request, struct blkif_response);
104783 +
104784 +#define VDISK_CDROM 0x1
104785 +#define VDISK_REMOVABLE 0x2
104786 +#define VDISK_READONLY 0x4
104787 +
104788 +#endif /* __XEN_PUBLIC_IO_BLKIF_H__ */
104789 +
104790 +/*
104791 + * Local variables:
104792 + * mode: C
104793 + * c-set-style: "BSD"
104794 + * c-basic-offset: 4
104795 + * tab-width: 4
104796 + * indent-tabs-mode: nil
104797 + * End:
104798 + */
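
Together with the ring macros from ring.h (added later in this patch), the request layout above is enough to sketch the frontend's transmit path. The fragment below is illustrative only: it assumes a blkif_front_ring_t already set up with FRONT_RING_INIT() and a page granted to the backend, and it leaves the event-channel notification to the caller:

#include <xen/interface/io/blkif.h>

/* Illustrative sketch only: queue a one-segment (one page) read request. */
static void example_queue_read(blkif_front_ring_t *ring, blkif_vdev_t handle,
                               blkif_sector_t sector, grant_ref_t gref,
                               uint64_t id)
{
    struct blkif_request *req;
    int notify;

    req = RING_GET_REQUEST(ring, ring->req_prod_pvt);
    req->operation     = BLKIF_OP_READ;
    req->nr_segments   = 1;
    req->handle        = handle;
    req->id            = id;          /* echoed back in blkif_response */
    req->sector_number = sector;
    req->seg[0].gref       = gref;    /* granted frame receiving the data */
    req->seg[0].first_sect = 0;
    req->seg[0].last_sect  = 7;       /* 8 x 512-byte sectors = one 4K page */

    ring->req_prod_pvt++;
    RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(ring, notify);
    if (notify) {
        /* kick the backend through the event channel (not shown) */
    }
}
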
104799 diff -Nur linux-2.6.16.33-noxen/include/xen/interface/io/console.h linux-2.6.16.33/include/xen/interface/io/console.h
104800 --- linux-2.6.16.33-noxen/include/xen/interface/io/console.h 1970-01-01 00:00:00.000000000 +0000
104801 +++ linux-2.6.16.33/include/xen/interface/io/console.h 2007-01-08 15:00:55.000000000 +0000
104802 @@ -0,0 +1,51 @@
104803 +/******************************************************************************
104804 + * console.h
104805 + *
104806 + * Console I/O interface for Xen guest OSes.
104807 + *
104808 + * Permission is hereby granted, free of charge, to any person obtaining a copy
104809 + * of this software and associated documentation files (the "Software"), to
104810 + * deal in the Software without restriction, including without limitation the
104811 + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
104812 + * sell copies of the Software, and to permit persons to whom the Software is
104813 + * furnished to do so, subject to the following conditions:
104814 + *
104815 + * The above copyright notice and this permission notice shall be included in
104816 + * all copies or substantial portions of the Software.
104817 + *
104818 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
104819 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
104820 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
104821 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
104822 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
104823 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
104824 + * DEALINGS IN THE SOFTWARE.
104825 + *
104826 + * Copyright (c) 2005, Keir Fraser
104827 + */
104828 +
104829 +#ifndef __XEN_PUBLIC_IO_CONSOLE_H__
104830 +#define __XEN_PUBLIC_IO_CONSOLE_H__
104831 +
104832 +typedef uint32_t XENCONS_RING_IDX;
104833 +
104834 +#define MASK_XENCONS_IDX(idx, ring) ((idx) & (sizeof(ring)-1))
104835 +
104836 +struct xencons_interface {
104837 + char in[1024];
104838 + char out[2048];
104839 + XENCONS_RING_IDX in_cons, in_prod;
104840 + XENCONS_RING_IDX out_cons, out_prod;
104841 +};
104842 +
104843 +#endif /* __XEN_PUBLIC_IO_CONSOLE_H__ */
104844 +
104845 +/*
104846 + * Local variables:
104847 + * mode: C
104848 + * c-set-style: "BSD"
104849 + * c-basic-offset: 4
104850 + * tab-width: 4
104851 + * indent-tabs-mode: nil
104852 + * End:
104853 + */
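
The console page is a pair of simple byte rings indexed modulo their size via MASK_XENCONS_IDX(). A minimal, illustrative producer for the guest-to-host 'out' ring could look like the sketch below (barriers and the event-channel kick are deliberately left out):

#include <xen/interface/io/console.h>

/* Illustrative sketch only: copy bytes into the 'out' ring of the console page. */
static int example_console_write(struct xencons_interface *intf,
                                 const char *data, int len)
{
    int sent = 0;
    XENCONS_RING_IDX cons = intf->out_cons;
    XENCONS_RING_IDX prod = intf->out_prod;

    while (sent < len && (prod - cons) < sizeof(intf->out)) {
        intf->out[MASK_XENCONS_IDX(prod, intf->out)] = data[sent];
        prod++;
        sent++;
    }

    intf->out_prod = prod;   /* a wmb() would normally precede this store */
    return sent;             /* caller then notifies the console event channel */
}
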
104854 diff -Nur linux-2.6.16.33-noxen/include/xen/interface/io/fbif.h linux-2.6.16.33/include/xen/interface/io/fbif.h
104855 --- linux-2.6.16.33-noxen/include/xen/interface/io/fbif.h 1970-01-01 00:00:00.000000000 +0000
104856 +++ linux-2.6.16.33/include/xen/interface/io/fbif.h 2007-01-08 15:00:55.000000000 +0000
104857 @@ -0,0 +1,138 @@
104858 +/*
104859 + * fbif.h -- Xen virtual frame buffer device
104860 + *
104861 + * Permission is hereby granted, free of charge, to any person obtaining a copy
104862 + * of this software and associated documentation files (the "Software"), to
104863 + * deal in the Software without restriction, including without limitation the
104864 + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
104865 + * sell copies of the Software, and to permit persons to whom the Software is
104866 + * furnished to do so, subject to the following conditions:
104867 + *
104868 + * The above copyright notice and this permission notice shall be included in
104869 + * all copies or substantial portions of the Software.
104870 + *
104871 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
104872 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
104873 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
104874 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
104875 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
104876 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
104877 + * DEALINGS IN THE SOFTWARE.
104878 + *
104879 + * Copyright (C) 2005 Anthony Liguori <aliguori@us.ibm.com>
104880 + * Copyright (C) 2006 Red Hat, Inc., Markus Armbruster <armbru@redhat.com>
104881 + */
104882 +
104883 +#ifndef __XEN_PUBLIC_IO_FBIF_H__
104884 +#define __XEN_PUBLIC_IO_FBIF_H__
104885 +
104886 +/* Out events (frontend -> backend) */
104887 +
104888 +/*
104889 + * Out events may be sent only when requested by backend, and receipt
104890 + * of an unknown out event is an error.
104891 + */
104892 +
104893 +/* Event type 1 currently not used */
104894 +/*
104895 + * Framebuffer update notification event
104896 + * Capable frontend sets feature-update in xenstore.
104897 + * Backend requests it by setting request-update in xenstore.
104898 + */
104899 +#define XENFB_TYPE_UPDATE 2
104900 +
104901 +struct xenfb_update
104902 +{
104903 + uint8_t type; /* XENFB_TYPE_UPDATE */
104904 + int32_t x; /* source x */
104905 + int32_t y; /* source y */
104906 + int32_t width; /* rect width */
104907 + int32_t height; /* rect height */
104908 +};
104909 +
104910 +#define XENFB_OUT_EVENT_SIZE 40
104911 +
104912 +union xenfb_out_event
104913 +{
104914 + uint8_t type;
104915 + struct xenfb_update update;
104916 + char pad[XENFB_OUT_EVENT_SIZE];
104917 +};
104918 +
104919 +/* In events (backend -> frontend) */
104920 +
104921 +/*
104922 + * Frontends should ignore unknown in events.
104923 + * No in events currently defined.
104924 + */
104925 +
104926 +#define XENFB_IN_EVENT_SIZE 40
104927 +
104928 +union xenfb_in_event
104929 +{
104930 + uint8_t type;
104931 + char pad[XENFB_IN_EVENT_SIZE];
104932 +};
104933 +
104934 +/* shared page */
104935 +
104936 +#define XENFB_IN_RING_SIZE 1024
104937 +#define XENFB_IN_RING_LEN (XENFB_IN_RING_SIZE / XENFB_IN_EVENT_SIZE)
104938 +#define XENFB_IN_RING_OFFS 1024
104939 +#define XENFB_IN_RING(page) \
104940 + ((union xenfb_in_event *)((char *)(page) + XENFB_IN_RING_OFFS))
104941 +#define XENFB_IN_RING_REF(page, idx) \
104942 + (XENFB_IN_RING((page))[(idx) % XENFB_IN_RING_LEN])
104943 +
104944 +#define XENFB_OUT_RING_SIZE 2048
104945 +#define XENFB_OUT_RING_LEN (XENFB_OUT_RING_SIZE / XENFB_OUT_EVENT_SIZE)
104946 +#define XENFB_OUT_RING_OFFS (XENFB_IN_RING_OFFS + XENFB_IN_RING_SIZE)
104947 +#define XENFB_OUT_RING(page) \
104948 + ((union xenfb_out_event *)((char *)(page) + XENFB_OUT_RING_OFFS))
104949 +#define XENFB_OUT_RING_REF(page, idx) \
104950 + (XENFB_OUT_RING((page))[(idx) % XENFB_OUT_RING_LEN])
104951 +
104952 +struct xenfb_page
104953 +{
104954 + uint32_t in_cons, in_prod;
104955 + uint32_t out_cons, out_prod;
104956 +
104957 + int32_t width; /* the width of the framebuffer (in pixels) */
104958 + int32_t height; /* the height of the framebuffer (in pixels) */
104959 + uint32_t line_length; /* the length of a row of pixels (in bytes) */
104960 + uint32_t mem_length; /* the length of the framebuffer (in bytes) */
104961 + uint8_t depth; /* the depth of a pixel (in bits) */
104962 +
104963 + /*
104964 + * Framebuffer page directory
104965 + *
104966 + * Each directory page holds PAGE_SIZE / sizeof(*pd)
104967 + * framebuffer pages, and can thus map up to PAGE_SIZE *
104968 + * PAGE_SIZE / sizeof(*pd) bytes. With PAGE_SIZE == 4096 and
104969 + * sizeof(unsigned long) == 4, that's 4 Megs. Two directory
104970 + * pages should be enough for a while.
104971 + */
104972 + unsigned long pd[2];
104973 +};
104974 +
104975 +/*
104976 + * Wart: xenkbd needs to know resolution. Put it here until a better
104977 + * solution is found, but don't leak it to the backend.
104978 + */
104979 +#ifdef __KERNEL__
104980 +#define XENFB_WIDTH 800
104981 +#define XENFB_HEIGHT 600
104982 +#define XENFB_DEPTH 32
104983 +#endif
104984 +
104985 +#endif
104986 +
104987 +/*
104988 + * Local variables:
104989 + * mode: C
104990 + * c-set-style: "BSD"
104991 + * c-basic-offset: 4
104992 + * tab-width: 4
104993 + * indent-tabs-mode: nil
104994 + * End:
104995 + */
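
The XENFB_OUT_RING_* macros above turn the shared page into a small fixed-slot event ring. As an illustrative sketch (not from the patch), a frontend queueing an update notification could look like this; memory barriers and the event-channel notification are omitted:

#include <xen/interface/io/fbif.h>

/* Illustrative sketch only: enqueue a framebuffer update event. */
static void example_send_update(struct xenfb_page *page,
                                int x, int y, int w, int h)
{
    uint32_t prod = page->out_prod;
    union xenfb_out_event *event;

    if (prod - page->out_cons == XENFB_OUT_RING_LEN)
        return;                          /* ring full, drop this update */

    event = &XENFB_OUT_RING_REF(page, prod);
    event->update.type   = XENFB_TYPE_UPDATE;
    event->update.x      = x;
    event->update.y      = y;
    event->update.width  = w;
    event->update.height = h;

    page->out_prod = prod + 1;           /* a wmb() normally precedes this */
}
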
104996 diff -Nur linux-2.6.16.33-noxen/include/xen/interface/io/kbdif.h linux-2.6.16.33/include/xen/interface/io/kbdif.h
104997 --- linux-2.6.16.33-noxen/include/xen/interface/io/kbdif.h 1970-01-01 00:00:00.000000000 +0000
104998 +++ linux-2.6.16.33/include/xen/interface/io/kbdif.h 2007-01-08 15:00:55.000000000 +0000
104999 @@ -0,0 +1,130 @@
105000 +/*
105001 + * kbdif.h -- Xen virtual keyboard/mouse
105002 + *
105003 + * Permission is hereby granted, free of charge, to any person obtaining a copy
105004 + * of this software and associated documentation files (the "Software"), to
105005 + * deal in the Software without restriction, including without limitation the
105006 + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
105007 + * sell copies of the Software, and to permit persons to whom the Software is
105008 + * furnished to do so, subject to the following conditions:
105009 + *
105010 + * The above copyright notice and this permission notice shall be included in
105011 + * all copies or substantial portions of the Software.
105012 + *
105013 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
105014 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
105015 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
105016 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
105017 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
105018 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
105019 + * DEALINGS IN THE SOFTWARE.
105020 + *
105021 + * Copyright (C) 2005 Anthony Liguori <aliguori@us.ibm.com>
105022 + * Copyright (C) 2006 Red Hat, Inc., Markus Armbruster <armbru@redhat.com>
105023 + */
105024 +
105025 +#ifndef __XEN_PUBLIC_IO_KBDIF_H__
105026 +#define __XEN_PUBLIC_IO_KBDIF_H__
105027 +
105028 +/* In events (backend -> frontend) */
105029 +
105030 +/*
105031 + * Frontends should ignore unknown in events.
105032 + */
105033 +
105034 +/* Pointer movement event */
105035 +#define XENKBD_TYPE_MOTION 1
105036 +/* Event type 2 currently not used */
105037 +/* Key event (includes pointer buttons) */
105038 +#define XENKBD_TYPE_KEY 3
105039 +/*
105040 + * Pointer position event
105041 + * Capable backend sets feature-abs-pointer in xenstore.
105042 + * Frontend requests it instead of XENKBD_TYPE_MOTION by setting
105043 + * request-abs-update in xenstore.
105044 + */
105045 +#define XENKBD_TYPE_POS 4
105046 +
105047 +struct xenkbd_motion
105048 +{
105049 + uint8_t type; /* XENKBD_TYPE_MOTION */
105050 + int32_t rel_x; /* relative X motion */
105051 + int32_t rel_y; /* relative Y motion */
105052 +};
105053 +
105054 +struct xenkbd_key
105055 +{
105056 + uint8_t type; /* XENKBD_TYPE_KEY */
105057 + uint8_t pressed; /* 1 if pressed; 0 otherwise */
105058 + uint32_t keycode; /* KEY_* from linux/input.h */
105059 +};
105060 +
105061 +struct xenkbd_position
105062 +{
105063 + uint8_t type; /* XENKBD_TYPE_POS */
105064 + int32_t abs_x; /* absolute X position (in FB pixels) */
105065 + int32_t abs_y; /* absolute Y position (in FB pixels) */
105066 +};
105067 +
105068 +#define XENKBD_IN_EVENT_SIZE 40
105069 +
105070 +union xenkbd_in_event
105071 +{
105072 + uint8_t type;
105073 + struct xenkbd_motion motion;
105074 + struct xenkbd_key key;
105075 + struct xenkbd_position pos;
105076 + char pad[XENKBD_IN_EVENT_SIZE];
105077 +};
105078 +
105079 +/* Out events (frontend -> backend) */
105080 +
105081 +/*
105082 + * Out events may be sent only when requested by backend, and receipt
105083 + * of an unknown out event is an error.
105084 + * No out events currently defined.
105085 + */
105086 +
105087 +#define XENKBD_OUT_EVENT_SIZE 40
105088 +
105089 +union xenkbd_out_event
105090 +{
105091 + uint8_t type;
105092 + char pad[XENKBD_OUT_EVENT_SIZE];
105093 +};
105094 +
105095 +/* shared page */
105096 +
105097 +#define XENKBD_IN_RING_SIZE 2048
105098 +#define XENKBD_IN_RING_LEN (XENKBD_IN_RING_SIZE / XENKBD_IN_EVENT_SIZE)
105099 +#define XENKBD_IN_RING_OFFS 1024
105100 +#define XENKBD_IN_RING(page) \
105101 + ((union xenkbd_in_event *)((char *)(page) + XENKBD_IN_RING_OFFS))
105102 +#define XENKBD_IN_RING_REF(page, idx) \
105103 + (XENKBD_IN_RING((page))[(idx) % XENKBD_IN_RING_LEN])
105104 +
105105 +#define XENKBD_OUT_RING_SIZE 1024
105106 +#define XENKBD_OUT_RING_LEN (XENKBD_OUT_RING_SIZE / XENKBD_OUT_EVENT_SIZE)
105107 +#define XENKBD_OUT_RING_OFFS (XENKBD_IN_RING_OFFS + XENKBD_IN_RING_SIZE)
105108 +#define XENKBD_OUT_RING(page) \
105109 + ((union xenkbd_out_event *)((char *)(page) + XENKBD_OUT_RING_OFFS))
105110 +#define XENKBD_OUT_RING_REF(page, idx) \
105111 + (XENKBD_OUT_RING((page))[(idx) % XENKBD_OUT_RING_LEN])
105112 +
105113 +struct xenkbd_page
105114 +{
105115 + uint32_t in_cons, in_prod;
105116 + uint32_t out_cons, out_prod;
105117 +};
105118 +
105119 +#endif
105120 +
105121 +/*
105122 + * Local variables:
105123 + * mode: C
105124 + * c-set-style: "BSD"
105125 + * c-basic-offset: 4
105126 + * tab-width: 4
105127 + * indent-tabs-mode: nil
105128 + * End:
105129 + */
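
The in ring carries key, motion and position events from the backend to the frontend. A sketch of a frontend draining that ring is shown below for illustration; report_key() and report_motion() are hypothetical hooks into the guest's input layer, and barriers are omitted:

#include <xen/interface/io/kbdif.h>

extern void report_key(uint32_t keycode, uint8_t pressed);   /* hypothetical */
extern void report_motion(int32_t dx, int32_t dy);           /* hypothetical */

/* Illustrative sketch only: consume all pending backend->frontend events. */
static void example_drain_events(struct xenkbd_page *page)
{
    uint32_t cons;

    for (cons = page->in_cons; cons != page->in_prod; cons++) {
        union xenkbd_in_event *event = &XENKBD_IN_RING_REF(page, cons);

        switch (event->type) {
        case XENKBD_TYPE_KEY:
            report_key(event->key.keycode, event->key.pressed);
            break;
        case XENKBD_TYPE_MOTION:
            report_motion(event->motion.rel_x, event->motion.rel_y);
            break;
        default:
            break;                       /* unknown in events are ignored */
        }
    }
    page->in_cons = cons;                /* barriers omitted in this sketch */
}
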
105130 diff -Nur linux-2.6.16.33-noxen/include/xen/interface/io/netif.h linux-2.6.16.33/include/xen/interface/io/netif.h
105131 --- linux-2.6.16.33-noxen/include/xen/interface/io/netif.h 1970-01-01 00:00:00.000000000 +0000
105132 +++ linux-2.6.16.33/include/xen/interface/io/netif.h 2007-01-08 15:00:55.000000000 +0000
105133 @@ -0,0 +1,184 @@
105134 +/******************************************************************************
105135 + * netif.h
105136 + *
105137 + * Unified network-device I/O interface for Xen guest OSes.
105138 + *
105139 + * Permission is hereby granted, free of charge, to any person obtaining a copy
105140 + * of this software and associated documentation files (the "Software"), to
105141 + * deal in the Software without restriction, including without limitation the
105142 + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
105143 + * sell copies of the Software, and to permit persons to whom the Software is
105144 + * furnished to do so, subject to the following conditions:
105145 + *
105146 + * The above copyright notice and this permission notice shall be included in
105147 + * all copies or substantial portions of the Software.
105148 + *
105149 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
105150 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
105151 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
105152 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
105153 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
105154 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
105155 + * DEALINGS IN THE SOFTWARE.
105156 + *
105157 + * Copyright (c) 2003-2004, Keir Fraser
105158 + */
105159 +
105160 +#ifndef __XEN_PUBLIC_IO_NETIF_H__
105161 +#define __XEN_PUBLIC_IO_NETIF_H__
105162 +
105163 +#include "ring.h"
105164 +#include "../grant_table.h"
105165 +
105166 +/*
105167 + * Notifications after enqueuing any type of message should be conditional on
105168 + * the appropriate req_event or rsp_event field in the shared ring.
105169 + * If the client sends notification for rx requests then it should specify
105170 + * feature 'feature-rx-notify' via xenbus. Otherwise the backend will assume
105171 + * that it cannot safely queue packets (as it may not be kicked to send them).
105172 + */
105173 +
105174 +/*
105175 + * This is the 'wire' format for packets:
105176 + * Request 1: netif_tx_request -- NETTXF_* (any flags)
105177 + * [Request 2: netif_tx_extra] (only if request 1 has NETTXF_extra_info)
105178 + * [Request 3: netif_tx_extra] (only if request 2 has XEN_NETIF_EXTRA_MORE)
105179 + * Request 4: netif_tx_request -- NETTXF_more_data
105180 + * Request 5: netif_tx_request -- NETTXF_more_data
105181 + * ...
105182 + * Request N: netif_tx_request -- 0
105183 + */
105184 +
105185 +/* Protocol checksum field is blank in the packet (hardware offload)? */
105186 +#define _NETTXF_csum_blank (0)
105187 +#define NETTXF_csum_blank (1U<<_NETTXF_csum_blank)
105188 +
105189 +/* Packet data has been validated against protocol checksum. */
105190 +#define _NETTXF_data_validated (1)
105191 +#define NETTXF_data_validated (1U<<_NETTXF_data_validated)
105192 +
105193 +/* Packet continues in the next request descriptor. */
105194 +#define _NETTXF_more_data (2)
105195 +#define NETTXF_more_data (1U<<_NETTXF_more_data)
105196 +
105197 +/* Packet to be followed by extra descriptor(s). */
105198 +#define _NETTXF_extra_info (3)
105199 +#define NETTXF_extra_info (1U<<_NETTXF_extra_info)
105200 +
105201 +struct netif_tx_request {
105202 + grant_ref_t gref; /* Reference to buffer page */
105203 + uint16_t offset; /* Offset within buffer page */
105204 + uint16_t flags; /* NETTXF_* */
105205 + uint16_t id; /* Echoed in response message. */
105206 + uint16_t size; /* Packet size in bytes. */
105207 +};
105208 +typedef struct netif_tx_request netif_tx_request_t;
105209 +
105210 +/* Types of netif_extra_info descriptors. */
105211 +#define XEN_NETIF_EXTRA_TYPE_NONE (0) /* Never used - invalid */
105212 +#define XEN_NETIF_EXTRA_TYPE_GSO (1) /* u.gso */
105213 +#define XEN_NETIF_EXTRA_TYPE_MAX (2)
105214 +
105215 +/* netif_extra_info flags. */
105216 +#define _XEN_NETIF_EXTRA_FLAG_MORE (0)
105217 +#define XEN_NETIF_EXTRA_FLAG_MORE (1U<<_XEN_NETIF_EXTRA_FLAG_MORE)
105218 +
105219 +/* GSO types - only TCPv4 currently supported. */
105220 +#define XEN_NETIF_GSO_TYPE_TCPV4 (1)
105221 +
105222 +/*
105223 + * This structure needs to fit within both netif_tx_request and
105224 + * netif_rx_response for compatibility.
105225 + */
105226 +struct netif_extra_info {
105227 + uint8_t type; /* XEN_NETIF_EXTRA_TYPE_* */
105228 + uint8_t flags; /* XEN_NETIF_EXTRA_FLAG_* */
105229 +
105230 + union {
105231 + struct {
105232 + /*
105233 + * Maximum payload size of each segment. For example, for TCP this
105234 + * is just the path MSS.
105235 + */
105236 + uint16_t size;
105237 +
105238 + /*
105239 + * GSO type. This determines the protocol of the packet and any
105240 + * extra features required to segment the packet properly.
105241 + */
105242 + uint8_t type; /* XEN_NETIF_GSO_TYPE_* */
105243 +
105244 + /* Future expansion. */
105245 + uint8_t pad;
105246 +
105247 + /*
105248 + * GSO features. This specifies any extra GSO features required
105249 + * to process this packet, such as ECN support for TCPv4.
105250 + */
105251 + uint16_t features; /* XEN_NETIF_GSO_FEAT_* */
105252 + } gso;
105253 +
105254 + uint16_t pad[3];
105255 + } u;
105256 +};
105257 +
105258 +struct netif_tx_response {
105259 + uint16_t id;
105260 + int16_t status; /* NETIF_RSP_* */
105261 +};
105262 +typedef struct netif_tx_response netif_tx_response_t;
105263 +
105264 +struct netif_rx_request {
105265 + uint16_t id; /* Echoed in response message. */
105266 + grant_ref_t gref; /* Reference to incoming granted frame */
105267 +};
105268 +typedef struct netif_rx_request netif_rx_request_t;
105269 +
105270 +/* Packet data has been validated against protocol checksum. */
105271 +#define _NETRXF_data_validated (0)
105272 +#define NETRXF_data_validated (1U<<_NETRXF_data_validated)
105273 +
105274 +/* Protocol checksum field is blank in the packet (hardware offload)? */
105275 +#define _NETRXF_csum_blank (1)
105276 +#define NETRXF_csum_blank (1U<<_NETRXF_csum_blank)
105277 +
105278 +/* Packet continues in the next request descriptor. */
105279 +#define _NETRXF_more_data (2)
105280 +#define NETRXF_more_data (1U<<_NETRXF_more_data)
105281 +
105282 +/* Packet to be followed by extra descriptor(s). */
105283 +#define _NETRXF_extra_info (3)
105284 +#define NETRXF_extra_info (1U<<_NETRXF_extra_info)
105285 +
105286 +struct netif_rx_response {
105287 + uint16_t id;
105288 + uint16_t offset; /* Offset in page of start of received packet */
105289 + uint16_t flags; /* NETRXF_* */
105290 + int16_t status; /* -ve: NETIF_RSP_* ; +ve: Rx'ed pkt size. */
105291 +};
105292 +typedef struct netif_rx_response netif_rx_response_t;
105293 +
105294 +/*
105295 + * Generate netif ring structures and types.
105296 + */
105297 +
105298 +DEFINE_RING_TYPES(netif_tx, struct netif_tx_request, struct netif_tx_response);
105299 +DEFINE_RING_TYPES(netif_rx, struct netif_rx_request, struct netif_rx_response);
105300 +
105301 +#define NETIF_RSP_DROPPED -2
105302 +#define NETIF_RSP_ERROR -1
105303 +#define NETIF_RSP_OKAY 0
105304 +/* No response: used for auxiliary requests (e.g., netif_tx_extra). */
105305 +#define NETIF_RSP_NULL 1
105306 +
105307 +#endif
105308 +
105309 +/*
105310 + * Local variables:
105311 + * mode: C
105312 + * c-set-style: "BSD"
105313 + * c-basic-offset: 4
105314 + * tab-width: 4
105315 + * indent-tabs-mode: nil
105316 + * End:
105317 + */
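
Following the 'wire' format described at the top of this header, the simplest transmit is a single netif_tx_request with no flags set. The sketch below is illustrative only: it assumes a netif_tx_front_ring_t initialised with FRONT_RING_INIT(), skips checksum offload and GSO, and leaves the event-channel kick to the caller:

#include <xen/interface/io/netif.h>

/* Illustrative sketch only: queue a single-slot packet on the tx ring. */
static void example_queue_tx(netif_tx_front_ring_t *ring, grant_ref_t gref,
                             uint16_t offset, uint16_t size, uint16_t id)
{
    struct netif_tx_request *req;
    int notify;

    req = RING_GET_REQUEST(ring, ring->req_prod_pvt);
    req->gref   = gref;     /* granted page containing the frame */
    req->offset = offset;   /* start of the frame within that page */
    req->size   = size;     /* total packet length in bytes */
    req->id     = id;       /* echoed back in netif_tx_response */
    req->flags  = 0;        /* no NETTXF_more_data: last and only slot */

    ring->req_prod_pvt++;
    RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(ring, notify);
    if (notify) {
        /* kick the backend through the event channel (not shown) */
    }
}
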
105318 diff -Nur linux-2.6.16.33-noxen/include/xen/interface/io/pciif.h linux-2.6.16.33/include/xen/interface/io/pciif.h
105319 --- linux-2.6.16.33-noxen/include/xen/interface/io/pciif.h 1970-01-01 00:00:00.000000000 +0000
105320 +++ linux-2.6.16.33/include/xen/interface/io/pciif.h 2007-01-08 15:00:55.000000000 +0000
105321 @@ -0,0 +1,83 @@
105322 +/*
105323 + * PCI Backend/Frontend Common Data Structures & Macros
105324 + *
105325 + * Permission is hereby granted, free of charge, to any person obtaining a copy
105326 + * of this software and associated documentation files (the "Software"), to
105327 + * deal in the Software without restriction, including without limitation the
105328 + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
105329 + * sell copies of the Software, and to permit persons to whom the Software is
105330 + * furnished to do so, subject to the following conditions:
105331 + *
105332 + * The above copyright notice and this permission notice shall be included in
105333 + * all copies or substantial portions of the Software.
105334 + *
105335 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
105336 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
105337 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
105338 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
105339 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
105340 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
105341 + * DEALINGS IN THE SOFTWARE.
105342 + *
105343 + * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
105344 + */
105345 +#ifndef __XEN_PCI_COMMON_H__
105346 +#define __XEN_PCI_COMMON_H__
105347 +
105348 +/* Be sure to bump this number if you change this file */
105349 +#define XEN_PCI_MAGIC "7"
105350 +
105351 +/* xen_pci_sharedinfo flags */
105352 +#define _XEN_PCIF_active (0)
105353 +#define XEN_PCIF_active (1<<_XEN_PCIF_active)
105354 +
105355 +/* xen_pci_op commands */
105356 +#define XEN_PCI_OP_conf_read (0)
105357 +#define XEN_PCI_OP_conf_write (1)
105358 +
105359 +/* xen_pci_op error numbers */
105360 +#define XEN_PCI_ERR_success (0)
105361 +#define XEN_PCI_ERR_dev_not_found (-1)
105362 +#define XEN_PCI_ERR_invalid_offset (-2)
105363 +#define XEN_PCI_ERR_access_denied (-3)
105364 +#define XEN_PCI_ERR_not_implemented (-4)
105365 +/* XEN_PCI_ERR_op_failed - backend failed to complete the operation */
105366 +#define XEN_PCI_ERR_op_failed (-5)
105367 +
105368 +struct xen_pci_op {
105369 + /* IN: what action to perform: XEN_PCI_OP_* */
105370 + uint32_t cmd;
105371 +
105372 + /* OUT: will contain an error number (if any) from errno.h */
105373 + int32_t err;
105374 +
105375 + /* IN: which device to touch */
105376 + uint32_t domain; /* PCI Domain/Segment */
105377 + uint32_t bus;
105378 + uint32_t devfn;
105379 +
105380 + /* IN: which configuration registers to touch */
105381 + int32_t offset;
105382 + int32_t size;
105383 +
105384 + /* IN/OUT: Contains the result after a READ or the value to WRITE */
105385 + uint32_t value;
105386 +};
105387 +
105388 +struct xen_pci_sharedinfo {
105389 + /* flags - XEN_PCIF_* */
105390 + uint32_t flags;
105391 + struct xen_pci_op op;
105392 +};
105393 +
105394 +#endif /* __XEN_PCI_COMMON_H__ */
105395 +
105396 +/*
105397 + * Local variables:
105398 + * mode: C
105399 + * c-set-style: "BSD"
105400 + * c-basic-offset: 4
105401 + * tab-width: 4
105402 + * indent-tabs-mode: nil
105403 + * End:
105404 + */
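
A frontend drives the backend through the single xen_pci_op slot embedded in xen_pci_sharedinfo. The fragment below sketches a configuration-space read for illustration; the actual handshake (raising XEN_PCIF_active, notifying the backend and waiting for completion) is driver-specific and only hinted at in the comment:

#include <xen/interface/io/pciif.h>

/* Illustrative sketch only: fill in a config-space read for the backend. */
static uint32_t example_conf_read(struct xen_pci_sharedinfo *info,
                                  uint32_t domain, uint32_t bus,
                                  uint32_t devfn, int32_t offset, int32_t size)
{
    struct xen_pci_op *op = &info->op;

    op->cmd    = XEN_PCI_OP_conf_read;
    op->domain = domain;
    op->bus    = bus;
    op->devfn  = devfn;
    op->offset = offset;    /* config-space register to read */
    op->size   = size;      /* 1, 2 or 4 bytes */

    /* ... set XEN_PCIF_active, notify the backend, wait for completion ... */

    return (op->err == XEN_PCI_ERR_success) ? op->value : 0;
}
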
105405 diff -Nur linux-2.6.16.33-noxen/include/xen/interface/io/ring.h linux-2.6.16.33/include/xen/interface/io/ring.h
105406 --- linux-2.6.16.33-noxen/include/xen/interface/io/ring.h 1970-01-01 00:00:00.000000000 +0000
105407 +++ linux-2.6.16.33/include/xen/interface/io/ring.h 2007-01-08 15:00:55.000000000 +0000
105408 @@ -0,0 +1,299 @@
105409 +/******************************************************************************
105410 + * ring.h
105411 + *
105412 + * Shared producer-consumer ring macros.
105413 + *
105414 + * Permission is hereby granted, free of charge, to any person obtaining a copy
105415 + * of this software and associated documentation files (the "Software"), to
105416 + * deal in the Software without restriction, including without limitation the
105417 + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
105418 + * sell copies of the Software, and to permit persons to whom the Software is
105419 + * furnished to do so, subject to the following conditions:
105420 + *
105421 + * The above copyright notice and this permission notice shall be included in
105422 + * all copies or substantial portions of the Software.
105423 + *
105424 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
105425 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
105426 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
105427 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
105428 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
105429 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
105430 + * DEALINGS IN THE SOFTWARE.
105431 + *
105432 + * Tim Deegan and Andrew Warfield November 2004.
105433 + */
105434 +
105435 +#ifndef __XEN_PUBLIC_IO_RING_H__
105436 +#define __XEN_PUBLIC_IO_RING_H__
105437 +
105438 +typedef unsigned int RING_IDX;
105439 +
105440 +/* Round a 32-bit unsigned constant down to the nearest power of two. */
105441 +#define __RD2(_x) (((_x) & 0x00000002) ? 0x2 : ((_x) & 0x1))
105442 +#define __RD4(_x) (((_x) & 0x0000000c) ? __RD2((_x)>>2)<<2 : __RD2(_x))
105443 +#define __RD8(_x) (((_x) & 0x000000f0) ? __RD4((_x)>>4)<<4 : __RD4(_x))
105444 +#define __RD16(_x) (((_x) & 0x0000ff00) ? __RD8((_x)>>8)<<8 : __RD8(_x))
105445 +#define __RD32(_x) (((_x) & 0xffff0000) ? __RD16((_x)>>16)<<16 : __RD16(_x))
105446 +
105447 +/*
105448 + * Calculate size of a shared ring, given the total available space for the
105449 + * ring and indexes (_sz), and the name tag of the request/response structure.
105450 + * A ring contains as many entries as will fit, rounded down to the nearest
105451 + * power of two (so we can mask with (size-1) to loop around).
105452 + */
105453 +#define __RING_SIZE(_s, _sz) \
105454 + (__RD32(((_sz) - (long)(_s)->ring + (long)(_s)) / sizeof((_s)->ring[0])))
105455 +
105456 +/*
105457 + * Macros to make the correct C datatypes for a new kind of ring.
105458 + *
105459 + * To make a new ring datatype, you need to have two message structures,
105460 + * let's say request_t, and response_t already defined.
105461 + *
105462 + * In a header where you want the ring datatype declared, you then do:
105463 + *
105464 + * DEFINE_RING_TYPES(mytag, request_t, response_t);
105465 + *
105466 + * These expand out to give you a set of types, as you can see below.
105467 + * The most important of these are:
105468 + *
105469 + * mytag_sring_t - The shared ring.
105470 + * mytag_front_ring_t - The 'front' half of the ring.
105471 + * mytag_back_ring_t - The 'back' half of the ring.
105472 + *
105473 + * To initialize a ring in your code you need to know the location and size
105474 + * of the shared memory area (PAGE_SIZE, for instance). To initialise
105475 + * the front half:
105476 + *
105477 + * mytag_front_ring_t front_ring;
105478 + * SHARED_RING_INIT((mytag_sring_t *)shared_page);
105479 + * FRONT_RING_INIT(&front_ring, (mytag_sring_t *)shared_page, PAGE_SIZE);
105480 + *
105481 + * Initializing the back follows similarly (note that only the front
105482 + * initializes the shared ring):
105483 + *
105484 + * mytag_back_ring_t back_ring;
105485 + * BACK_RING_INIT(&back_ring, (mytag_sring_t *)shared_page, PAGE_SIZE);
105486 + */
105487 +
105488 +#define DEFINE_RING_TYPES(__name, __req_t, __rsp_t) \
105489 + \
105490 +/* Shared ring entry */ \
105491 +union __name##_sring_entry { \
105492 + __req_t req; \
105493 + __rsp_t rsp; \
105494 +}; \
105495 + \
105496 +/* Shared ring page */ \
105497 +struct __name##_sring { \
105498 + RING_IDX req_prod, req_event; \
105499 + RING_IDX rsp_prod, rsp_event; \
105500 + uint8_t pad[48]; \
105501 + union __name##_sring_entry ring[1]; /* variable-length */ \
105502 +}; \
105503 + \
105504 +/* "Front" end's private variables */ \
105505 +struct __name##_front_ring { \
105506 + RING_IDX req_prod_pvt; \
105507 + RING_IDX rsp_cons; \
105508 + unsigned int nr_ents; \
105509 + struct __name##_sring *sring; \
105510 +}; \
105511 + \
105512 +/* "Back" end's private variables */ \
105513 +struct __name##_back_ring { \
105514 + RING_IDX rsp_prod_pvt; \
105515 + RING_IDX req_cons; \
105516 + unsigned int nr_ents; \
105517 + struct __name##_sring *sring; \
105518 +}; \
105519 + \
105520 +/* Syntactic sugar */ \
105521 +typedef struct __name##_sring __name##_sring_t; \
105522 +typedef struct __name##_front_ring __name##_front_ring_t; \
105523 +typedef struct __name##_back_ring __name##_back_ring_t
105524 +
105525 +/*
105526 + * Macros for manipulating rings.
105527 + *
105528 + * FRONT_RING_whatever works on the "front end" of a ring: here
105529 + * requests are pushed on to the ring and responses taken off it.
105530 + *
105531 + * BACK_RING_whatever works on the "back end" of a ring: here
105532 + * requests are taken off the ring and responses put on.
105533 + *
105534 + * N.B. these macros do NO INTERLOCKS OR FLOW CONTROL.
105535 + * This is OK in 1-for-1 request-response situations where the
105536 + * requestor (front end) never has more than RING_SIZE()-1
105537 + * outstanding requests.
105538 + */
105539 +
105540 +/* Initialising empty rings */
105541 +#define SHARED_RING_INIT(_s) do { \
105542 + (_s)->req_prod = (_s)->rsp_prod = 0; \
105543 + (_s)->req_event = (_s)->rsp_event = 1; \
105544 + memset((_s)->pad, 0, sizeof((_s)->pad)); \
105545 +} while(0)
105546 +
105547 +#define FRONT_RING_INIT(_r, _s, __size) do { \
105548 + (_r)->req_prod_pvt = 0; \
105549 + (_r)->rsp_cons = 0; \
105550 + (_r)->nr_ents = __RING_SIZE(_s, __size); \
105551 + (_r)->sring = (_s); \
105552 +} while (0)
105553 +
105554 +#define BACK_RING_INIT(_r, _s, __size) do { \
105555 + (_r)->rsp_prod_pvt = 0; \
105556 + (_r)->req_cons = 0; \
105557 + (_r)->nr_ents = __RING_SIZE(_s, __size); \
105558 + (_r)->sring = (_s); \
105559 +} while (0)
105560 +
105561 +/* Initialize to existing shared indexes -- for recovery */
105562 +#define FRONT_RING_ATTACH(_r, _s, __size) do { \
105563 + (_r)->sring = (_s); \
105564 + (_r)->req_prod_pvt = (_s)->req_prod; \
105565 + (_r)->rsp_cons = (_s)->rsp_prod; \
105566 + (_r)->nr_ents = __RING_SIZE(_s, __size); \
105567 +} while (0)
105568 +
105569 +#define BACK_RING_ATTACH(_r, _s, __size) do { \
105570 + (_r)->sring = (_s); \
105571 + (_r)->rsp_prod_pvt = (_s)->rsp_prod; \
105572 + (_r)->req_cons = (_s)->req_prod; \
105573 + (_r)->nr_ents = __RING_SIZE(_s, __size); \
105574 +} while (0)
105575 +
105576 +/* How big is this ring? */
105577 +#define RING_SIZE(_r) \
105578 + ((_r)->nr_ents)
105579 +
105580 +/* Number of free requests (for use on front side only). */
105581 +#define RING_FREE_REQUESTS(_r) \
105582 + (RING_SIZE(_r) - ((_r)->req_prod_pvt - (_r)->rsp_cons))
105583 +
105584 +/* Test if there is an empty slot available on the front ring.
105585 + * (This is only meaningful from the front.)
105586 + */
105587 +#define RING_FULL(_r) \
105588 + (RING_FREE_REQUESTS(_r) == 0)
105589 +
105590 +/* Test if there are outstanding messages to be processed on a ring. */
105591 +#define RING_HAS_UNCONSUMED_RESPONSES(_r) \
105592 + ((_r)->sring->rsp_prod - (_r)->rsp_cons)
105593 +
105594 +#ifdef __GNUC__
105595 +#define RING_HAS_UNCONSUMED_REQUESTS(_r) ({ \
105596 + unsigned int req = (_r)->sring->req_prod - (_r)->req_cons; \
105597 + unsigned int rsp = RING_SIZE(_r) - \
105598 + ((_r)->req_cons - (_r)->rsp_prod_pvt); \
105599 + req < rsp ? req : rsp; \
105600 +})
105601 +#else
105602 +/* Same as above, but without the nice GCC ({ ... }) syntax. */
105603 +#define RING_HAS_UNCONSUMED_REQUESTS(_r) \
105604 + ((((_r)->sring->req_prod - (_r)->req_cons) < \
105605 + (RING_SIZE(_r) - ((_r)->req_cons - (_r)->rsp_prod_pvt))) ? \
105606 + ((_r)->sring->req_prod - (_r)->req_cons) : \
105607 + (RING_SIZE(_r) - ((_r)->req_cons - (_r)->rsp_prod_pvt)))
105608 +#endif
105609 +
105610 +/* Direct access to individual ring elements, by index. */
105611 +#define RING_GET_REQUEST(_r, _idx) \
105612 + (&((_r)->sring->ring[((_idx) & (RING_SIZE(_r) - 1))].req))
105613 +
105614 +#define RING_GET_RESPONSE(_r, _idx) \
105615 + (&((_r)->sring->ring[((_idx) & (RING_SIZE(_r) - 1))].rsp))
105616 +
105617 +/* Loop termination condition: Would the specified index overflow the ring? */
105618 +#define RING_REQUEST_CONS_OVERFLOW(_r, _cons) \
105619 + (((_cons) - (_r)->rsp_prod_pvt) >= RING_SIZE(_r))
105620 +
105621 +#define RING_PUSH_REQUESTS(_r) do { \
105622 + wmb(); /* back sees requests /before/ updated producer index */ \
105623 + (_r)->sring->req_prod = (_r)->req_prod_pvt; \
105624 +} while (0)
105625 +
105626 +#define RING_PUSH_RESPONSES(_r) do { \
105627 + wmb(); /* front sees responses /before/ updated producer index */ \
105628 + (_r)->sring->rsp_prod = (_r)->rsp_prod_pvt; \
105629 +} while (0)
105630 +
105631 +/*
105632 + * Notification hold-off (req_event and rsp_event):
105633 + *
105634 + * When queueing requests or responses on a shared ring, it may not always be
105635 + * necessary to notify the remote end. For example, if requests are in flight
105636 + * in a backend, the front may be able to queue further requests without
105637 + * notifying the back (if the back checks for new requests when it queues
105638 + * responses).
105639 + *
105640 + * When enqueuing requests or responses:
105641 + *
105642 + * Use RING_PUSH_{REQUESTS,RESPONSES}_AND_CHECK_NOTIFY(). The second argument
105643 + * is a boolean return value. True indicates that the receiver requires an
105644 + * asynchronous notification.
105645 + *
105646 + * After dequeuing requests or responses (before sleeping the connection):
105647 + *
105648 + * Use RING_FINAL_CHECK_FOR_REQUESTS() or RING_FINAL_CHECK_FOR_RESPONSES().
105649 + * The second argument is a boolean return value. True indicates that there
105650 + * are pending messages on the ring (i.e., the connection should not be put
105651 + * to sleep).
105652 + *
105653 + * These macros will set the req_event/rsp_event field to trigger a
105654 + * notification on the very next message that is enqueued. If you want to
105655 + * create batches of work (i.e., only receive a notification after several
105656 + * messages have been enqueued) then you will need to create a customised
105657 + * version of the FINAL_CHECK macro in your own code, which sets the event
105658 + * field appropriately.
105659 + */
105660 +
105661 +#define RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(_r, _notify) do { \
105662 + RING_IDX __old = (_r)->sring->req_prod; \
105663 + RING_IDX __new = (_r)->req_prod_pvt; \
105664 + wmb(); /* back sees requests /before/ updated producer index */ \
105665 + (_r)->sring->req_prod = __new; \
105666 + mb(); /* back sees new requests /before/ we check req_event */ \
105667 + (_notify) = ((RING_IDX)(__new - (_r)->sring->req_event) < \
105668 + (RING_IDX)(__new - __old)); \
105669 +} while (0)
105670 +
105671 +#define RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(_r, _notify) do { \
105672 + RING_IDX __old = (_r)->sring->rsp_prod; \
105673 + RING_IDX __new = (_r)->rsp_prod_pvt; \
105674 + wmb(); /* front sees responses /before/ updated producer index */ \
105675 + (_r)->sring->rsp_prod = __new; \
105676 + mb(); /* front sees new responses /before/ we check rsp_event */ \
105677 + (_notify) = ((RING_IDX)(__new - (_r)->sring->rsp_event) < \
105678 + (RING_IDX)(__new - __old)); \
105679 +} while (0)
105680 +
105681 +#define RING_FINAL_CHECK_FOR_REQUESTS(_r, _work_to_do) do { \
105682 + (_work_to_do) = RING_HAS_UNCONSUMED_REQUESTS(_r); \
105683 + if (_work_to_do) break; \
105684 + (_r)->sring->req_event = (_r)->req_cons + 1; \
105685 + mb(); \
105686 + (_work_to_do) = RING_HAS_UNCONSUMED_REQUESTS(_r); \
105687 +} while (0)
105688 +
105689 +#define RING_FINAL_CHECK_FOR_RESPONSES(_r, _work_to_do) do { \
105690 + (_work_to_do) = RING_HAS_UNCONSUMED_RESPONSES(_r); \
105691 + if (_work_to_do) break; \
105692 + (_r)->sring->rsp_event = (_r)->rsp_cons + 1; \
105693 + mb(); \
105694 + (_work_to_do) = RING_HAS_UNCONSUMED_RESPONSES(_r); \
105695 +} while (0)
105696 +
105697 +#endif /* __XEN_PUBLIC_IO_RING_H__ */
105698 +
105699 +/*
105700 + * Local variables:
105701 + * mode: C
105702 + * c-set-style: "BSD"
105703 + * c-basic-offset: 4
105704 + * tab-width: 4
105705 + * indent-tabs-mode: nil
105706 + * End:
105707 + */
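
The comments in this header already show how a ring is defined and initialised; the remaining piece is the consume/re-arm pattern built around RING_FINAL_CHECK_FOR_REQUESTS(). The sketch below illustrates it with placeholder message types; process_request() is hypothetical and the response path (RING_PUSH_RESPONSES_AND_CHECK_NOTIFY()) is not shown:

#include <xen/interface/io/ring.h>

struct demo_request  { uint64_t id; };                    /* placeholder types */
struct demo_response { uint64_t id; int16_t status; };

DEFINE_RING_TYPES(demo, struct demo_request, struct demo_response);

extern void process_request(struct demo_request *req);   /* hypothetical */

/* Illustrative sketch only: backend work loop using the hold-off scheme. */
static void example_backend_work(demo_back_ring_t *ring)
{
    int more;

    do {
        while (RING_HAS_UNCONSUMED_REQUESTS(ring)) {
            struct demo_request *req =
                RING_GET_REQUEST(ring, ring->req_cons);
            process_request(req);
            ring->req_cons++;
            /* responses would be queued and pushed here (not shown) */
        }
        /* Re-arm req_event, then re-check to close the wakeup race. */
        RING_FINAL_CHECK_FOR_REQUESTS(ring, more);
    } while (more);
}
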
105708 diff -Nur linux-2.6.16.33-noxen/include/xen/interface/io/tpmif.h linux-2.6.16.33/include/xen/interface/io/tpmif.h
105709 --- linux-2.6.16.33-noxen/include/xen/interface/io/tpmif.h 1970-01-01 00:00:00.000000000 +0000
105710 +++ linux-2.6.16.33/include/xen/interface/io/tpmif.h 2007-01-08 15:00:55.000000000 +0000
105711 @@ -0,0 +1,77 @@
105712 +/******************************************************************************
105713 + * tpmif.h
105714 + *
105715 + * TPM I/O interface for Xen guest OSes.
105716 + *
105717 + * Permission is hereby granted, free of charge, to any person obtaining a copy
105718 + * of this software and associated documentation files (the "Software"), to
105719 + * deal in the Software without restriction, including without limitation the
105720 + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
105721 + * sell copies of the Software, and to permit persons to whom the Software is
105722 + * furnished to do so, subject to the following conditions:
105723 + *
105724 + * The above copyright notice and this permission notice shall be included in
105725 + * all copies or substantial portions of the Software.
105726 + *
105727 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
105728 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
105729 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
105730 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
105731 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
105732 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
105733 + * DEALINGS IN THE SOFTWARE.
105734 + *
105735 + * Copyright (c) 2005, IBM Corporation
105736 + *
105737 + * Author: Stefan Berger, stefanb@us.ibm.com
105738 + * Grant table support: Mahadevan Gomathisankaran
105739 + *
105740 + * This code has been derived from tools/libxc/xen/io/netif.h
105741 + *
105742 + * Copyright (c) 2003-2004, Keir Fraser
105743 + */
105744 +
105745 +#ifndef __XEN_PUBLIC_IO_TPMIF_H__
105746 +#define __XEN_PUBLIC_IO_TPMIF_H__
105747 +
105748 +#include "../grant_table.h"
105749 +
105750 +struct tpmif_tx_request {
105751 + unsigned long addr; /* Machine address of packet. */
105752 + grant_ref_t ref; /* grant table access reference */
105753 + uint16_t unused;
105754 + uint16_t size; /* Packet size in bytes. */
105755 +};
105756 +typedef struct tpmif_tx_request tpmif_tx_request_t;
105757 +
105758 +/*
105759 + * The TPMIF_TX_RING_SIZE defines the number of pages the
105760 + * front-end and backend can exchange (= size of array).
105761 + */
105762 +typedef uint32_t TPMIF_RING_IDX;
105763 +
105764 +#define TPMIF_TX_RING_SIZE 10
105765 +
105766 +/* This structure must fit in a memory page. */
105767 +
105768 +struct tpmif_ring {
105769 + struct tpmif_tx_request req;
105770 +};
105771 +typedef struct tpmif_ring tpmif_ring_t;
105772 +
105773 +struct tpmif_tx_interface {
105774 + struct tpmif_ring ring[TPMIF_TX_RING_SIZE];
105775 +};
105776 +typedef struct tpmif_tx_interface tpmif_tx_interface_t;
105777 +
105778 +#endif
105779 +
105780 +/*
105781 + * Local variables:
105782 + * mode: C
105783 + * c-set-style: "BSD"
105784 + * c-basic-offset: 4
105785 + * tab-width: 4
105786 + * indent-tabs-mode: nil
105787 + * End:
105788 + */
105789 diff -Nur linux-2.6.16.33-noxen/include/xen/interface/io/xenbus.h linux-2.6.16.33/include/xen/interface/io/xenbus.h
105790 --- linux-2.6.16.33-noxen/include/xen/interface/io/xenbus.h 1970-01-01 00:00:00.000000000 +0000
105791 +++ linux-2.6.16.33/include/xen/interface/io/xenbus.h 2007-01-08 15:00:55.000000000 +0000
105792 @@ -0,0 +1,73 @@
105793 +/*****************************************************************************
105794 + * xenbus.h
105795 + *
105796 + * Xenbus protocol details.
105797 + *
105798 + * Permission is hereby granted, free of charge, to any person obtaining a copy
105799 + * of this software and associated documentation files (the "Software"), to
105800 + * deal in the Software without restriction, including without limitation the
105801 + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
105802 + * sell copies of the Software, and to permit persons to whom the Software is
105803 + * furnished to do so, subject to the following conditions:
105804 + *
105805 + * The above copyright notice and this permission notice shall be included in
105806 + * all copies or substantial portions of the Software.
105807 + *
105808 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
105809 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
105810 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
105811 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
105812 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
105813 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
105814 + * DEALINGS IN THE SOFTWARE.
105815 + *
105816 + * Copyright (C) 2005 XenSource Ltd.
105817 + */
105818 +
105819 +#ifndef _XEN_PUBLIC_IO_XENBUS_H
105820 +#define _XEN_PUBLIC_IO_XENBUS_H
105821 +
105822 +/*
105823 + * The state of either end of the Xenbus, i.e. the current communication
105824 + * status of initialisation across the bus. States here imply nothing about
105825 + * the state of the connection between the driver and the kernel's device
105826 + * layers.
105827 + */
105828 +enum xenbus_state {
105829 + XenbusStateUnknown = 0,
105830 +
105831 + XenbusStateInitialising = 1,
105832 +
105833 + /*
105834 + * InitWait: Finished early initialisation but waiting for information
105835 + * from the peer or hotplug scripts.
105836 + */
105837 + XenbusStateInitWait = 2,
105838 +
105839 + /*
105840 + * Initialised: Waiting for a connection from the peer.
105841 + */
105842 + XenbusStateInitialised = 3,
105843 +
105844 + XenbusStateConnected = 4,
105845 +
105846 + /*
105847 + * Closing: The device is being closed due to an error or an unplug event.
105848 + */
105849 + XenbusStateClosing = 5,
105850 +
105851 + XenbusStateClosed = 6
105852 +};
105853 +typedef enum xenbus_state XenbusState;
105854 +
105855 +#endif /* _XEN_PUBLIC_IO_XENBUS_H */
105856 +
105857 +/*
105858 + * Local variables:
105859 + * mode: C
105860 + * c-set-style: "BSD"
105861 + * c-basic-offset: 4
105862 + * tab-width: 4
105863 + * indent-tabs-mode: nil
105864 + * End:
105865 + */
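The xenbus_state values above are what front- and backend drivers publish to describe connection progress. A small standalone sketch, with the enum re-declared locally, of the kind of name lookup a hypothetical debug log might use:

/* Sketch: map xenbus_state values to printable names. */
#include <stdio.h>

enum xenbus_state {
    XenbusStateUnknown      = 0,
    XenbusStateInitialising = 1,
    XenbusStateInitWait     = 2,
    XenbusStateInitialised  = 3,
    XenbusStateConnected    = 4,
    XenbusStateClosing      = 5,
    XenbusStateClosed       = 6
};

static const char *xenbus_state_name(enum xenbus_state s)
{
    static const char *names[] = {
        "Unknown", "Initialising", "InitWait",
        "Initialised", "Connected", "Closing", "Closed"
    };
    return ((int)s >= 0 && s <= XenbusStateClosed) ? names[s] : "Invalid";
}

int main(void)
{
    printf("peer state: %s\n", xenbus_state_name(XenbusStateInitWait));
    return 0;
}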
105866 diff -Nur linux-2.6.16.33-noxen/include/xen/interface/io/xs_wire.h linux-2.6.16.33/include/xen/interface/io/xs_wire.h
105867 --- linux-2.6.16.33-noxen/include/xen/interface/io/xs_wire.h 1970-01-01 00:00:00.000000000 +0000
105868 +++ linux-2.6.16.33/include/xen/interface/io/xs_wire.h 2007-01-08 15:00:55.000000000 +0000
105869 @@ -0,0 +1,116 @@
105870 +/*
105871 + * Details of the "wire" protocol between Xen Store Daemon and client
105872 + * library or guest kernel.
105873 + *
105874 + * Permission is hereby granted, free of charge, to any person obtaining a copy
105875 + * of this software and associated documentation files (the "Software"), to
105876 + * deal in the Software without restriction, including without limitation the
105877 + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
105878 + * sell copies of the Software, and to permit persons to whom the Software is
105879 + * furnished to do so, subject to the following conditions:
105880 + *
105881 + * The above copyright notice and this permission notice shall be included in
105882 + * all copies or substantial portions of the Software.
105883 + *
105884 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
105885 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
105886 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
105887 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
105888 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
105889 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
105890 + * DEALINGS IN THE SOFTWARE.
105891 + *
105892 + * Copyright (C) 2005 Rusty Russell IBM Corporation
105893 + */
105894 +
105895 +#ifndef _XS_WIRE_H
105896 +#define _XS_WIRE_H
105897 +
105898 +enum xsd_sockmsg_type
105899 +{
105900 + XS_DEBUG,
105901 + XS_DIRECTORY,
105902 + XS_READ,
105903 + XS_GET_PERMS,
105904 + XS_WATCH,
105905 + XS_UNWATCH,
105906 + XS_TRANSACTION_START,
105907 + XS_TRANSACTION_END,
105908 + XS_INTRODUCE,
105909 + XS_RELEASE,
105910 + XS_GET_DOMAIN_PATH,
105911 + XS_WRITE,
105912 + XS_MKDIR,
105913 + XS_RM,
105914 + XS_SET_PERMS,
105915 + XS_WATCH_EVENT,
105916 + XS_ERROR,
105917 + XS_IS_DOMAIN_INTRODUCED
105918 +};
105919 +
105920 +#define XS_WRITE_NONE "NONE"
105921 +#define XS_WRITE_CREATE "CREATE"
105922 +#define XS_WRITE_CREATE_EXCL "CREATE|EXCL"
105923 +
105924 +/* We hand errors as strings, for portability. */
105925 +struct xsd_errors
105926 +{
105927 + int errnum;
105928 + const char *errstring;
105929 +};
105930 +#define XSD_ERROR(x) { x, #x }
105931 +static struct xsd_errors xsd_errors[] __attribute__((unused)) = {
105932 + XSD_ERROR(EINVAL),
105933 + XSD_ERROR(EACCES),
105934 + XSD_ERROR(EEXIST),
105935 + XSD_ERROR(EISDIR),
105936 + XSD_ERROR(ENOENT),
105937 + XSD_ERROR(ENOMEM),
105938 + XSD_ERROR(ENOSPC),
105939 + XSD_ERROR(EIO),
105940 + XSD_ERROR(ENOTEMPTY),
105941 + XSD_ERROR(ENOSYS),
105942 + XSD_ERROR(EROFS),
105943 + XSD_ERROR(EBUSY),
105944 + XSD_ERROR(EAGAIN),
105945 + XSD_ERROR(EISCONN)
105946 +};
105947 +
105948 +struct xsd_sockmsg
105949 +{
105950 + uint32_t type; /* XS_??? */
105951 + uint32_t req_id;/* Request identifier, echoed in daemon's response. */
105952 + uint32_t tx_id; /* Transaction id (0 if not related to a transaction). */
105953 + uint32_t len; /* Length of data following this. */
105954 +
105955 + /* Generally followed by nul-terminated string(s). */
105956 +};
105957 +
105958 +enum xs_watch_type
105959 +{
105960 + XS_WATCH_PATH = 0,
105961 + XS_WATCH_TOKEN
105962 +};
105963 +
105964 +/* Inter-domain shared memory communications. */
105965 +#define XENSTORE_RING_SIZE 1024
105966 +typedef uint32_t XENSTORE_RING_IDX;
105967 +#define MASK_XENSTORE_IDX(idx) ((idx) & (XENSTORE_RING_SIZE-1))
105968 +struct xenstore_domain_interface {
105969 + char req[XENSTORE_RING_SIZE]; /* Requests to xenstore daemon. */
105970 + char rsp[XENSTORE_RING_SIZE]; /* Replies and async watch events. */
105971 + XENSTORE_RING_IDX req_cons, req_prod;
105972 + XENSTORE_RING_IDX rsp_cons, rsp_prod;
105973 +};
105974 +
105975 +#endif /* _XS_WIRE_H */
105976 +
105977 +/*
105978 + * Local variables:
105979 + * mode: C
105980 + * c-set-style: "BSD"
105981 + * c-basic-offset: 4
105982 + * tab-width: 4
105983 + * indent-tabs-mode: nil
105984 + * End:
105985 + */
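A xenstore request on the wire is a xsd_sockmsg header followed by nul-terminated payload strings, and the shared ring indices wrap with MASK_XENSTORE_IDX. A standalone sketch of framing an XS_WRITE request under those rules; types are re-declared locally, and the actual ring copy and event-channel notification are out of scope:

/* Sketch: frame an XS_WRITE message and mask a ring index. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define XS_WRITE 11                      /* position of XS_WRITE in the enum above */
#define XENSTORE_RING_SIZE 1024
#define MASK_XENSTORE_IDX(idx) ((idx) & (XENSTORE_RING_SIZE - 1))

struct xsd_sockmsg {
    uint32_t type;
    uint32_t req_id;
    uint32_t tx_id;
    uint32_t len;
};

int main(void)
{
    const char *path  = "device/vif/0/state";   /* example key (assumption) */
    const char *value = "4";                    /* XenbusStateConnected */
    char buf[256];

    struct xsd_sockmsg hdr = {
        .type   = XS_WRITE,
        .req_id = 1,                     /* echoed back by the daemon */
        .tx_id  = 0,                     /* not part of a transaction */
        .len    = (uint32_t)(strlen(path) + 1 + strlen(value)),
    };

    /* Payload: path, a nul separator, then the value (no trailing nul for XS_WRITE). */
    memcpy(buf, &hdr, sizeof(hdr));
    memcpy(buf + sizeof(hdr), path, strlen(path) + 1);
    memcpy(buf + sizeof(hdr) + strlen(path) + 1, value, strlen(value));

    /* Producer index wraps with MASK_XENSTORE_IDX before touching req[]. */
    uint32_t prod = 1030;                /* pretend index past one wrap */
    printf("msg bytes=%zu, masked slot=%u\n",
           sizeof(hdr) + hdr.len, (unsigned)MASK_XENSTORE_IDX(prod));
    return 0;
}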
105986 diff -Nur linux-2.6.16.33-noxen/include/xen/interface/kexec.h linux-2.6.16.33/include/xen/interface/kexec.h
105987 --- linux-2.6.16.33-noxen/include/xen/interface/kexec.h 1970-01-01 00:00:00.000000000 +0000
105988 +++ linux-2.6.16.33/include/xen/interface/kexec.h 2007-01-08 15:00:55.000000000 +0000
105989 @@ -0,0 +1,137 @@
105990 +/******************************************************************************
105991 + * kexec.h - Public portion
105992 + *
105993 + * Xen port written by:
105994 + * - Simon 'Horms' Horman <horms@verge.net.au>
105995 + * - Magnus Damm <magnus@valinux.co.jp>
105996 + */
105997 +
105998 +#ifndef _XEN_PUBLIC_KEXEC_H
105999 +#define _XEN_PUBLIC_KEXEC_H
106000 +
106001 +
106002 +/* This file describes the Kexec / Kdump hypercall interface for Xen.
106003 + *
106004 + * Kexec under vanilla Linux allows a user to reboot the physical machine
106005 + * into a new user-specified kernel. The Xen port extends this idea
106006 + * to allow rebooting of the machine from dom0. When kexec for dom0
106007 + * is used to reboot, both the hypervisor and the domains get replaced
106008 + * with some other kernel. It is possible to kexec between vanilla
106009 + * Linux and Xen and back again. Xen to Xen works well too.
106010 + *
106011 + * The hypercall interface for kexec can be divided into three main
106012 + * types of hypercall operations:
106013 + *
106014 + * 1) Range information:
106015 + * This is used by the dom0 kernel to ask the hypervisor about various
106016 + * address information. This information is needed to allow kexec-tools
106017 + * to fill in the ELF headers for /proc/vmcore properly.
106018 + *
106019 + * 2) Load and unload of images:
106020 + * There are no big surprises here, the kexec binary from kexec-tools
106021 + * runs in userspace in dom0. The tool loads/unloads data into the
106022 + * dom0 kernel such as new kernel, initramfs and hypervisor. When
106023 + * loaded the dom0 kernel performs a load hypercall operation, and
106024 + * before releasing all page references the dom0 kernel calls unload.
106025 + *
106026 + * 3) Kexec operation:
106027 + * This is used to start a previously loaded kernel.
106028 + */
106029 +
106030 +#include "xen.h"
106031 +
106032 +#if defined(__i386__) || defined(__x86_64__)
106033 +#define KEXEC_XEN_NO_PAGES 17
106034 +#endif
106035 +
106036 +/*
106037 + * Prototype for this hypercall is:
106038 + * int kexec_op(int cmd, void *args)
106039 + * @cmd == KEXEC_CMD_...
106040 + * KEXEC operation to perform
106041 + * @args == Operation-specific extra arguments (NULL if none).
106042 + */
106043 +
106044 +/*
106045 + * Kexec supports two types of operation:
106046 + * - kexec into a regular kernel, very similar to a standard reboot
106047 + * - KEXEC_TYPE_DEFAULT is used to specify this type
106048 + * - kexec into a special "crash kernel", aka kexec-on-panic
106049 + * - KEXEC_TYPE_CRASH is used to specify this type
106050 + * - parts of our system may be broken at kexec-on-panic time
106051 + * - the code should be kept as simple and self-contained as possible
106052 + */
106053 +
106054 +#define KEXEC_TYPE_DEFAULT 0
106055 +#define KEXEC_TYPE_CRASH 1
106056 +
106057 +
106058 +/* The kexec implementation for Xen allows the user to load two
106059 + * types of kernels, KEXEC_TYPE_DEFAULT and KEXEC_TYPE_CRASH.
106060 + * All data needed for a kexec reboot is kept in one xen_kexec_image_t
106061 + * per "instance". The data mainly consists of machine address lists to pages
106062 + * together with destination addresses. The data in xen_kexec_image_t
106063 + * is passed to the "code page" which is one page of code that performs
106064 + * the final relocations before jumping to the new kernel.
106065 + */
106066 +
106067 +typedef struct xen_kexec_image {
106068 +#if defined(__i386__) || defined(__x86_64__)
106069 + unsigned long page_list[KEXEC_XEN_NO_PAGES];
106070 +#endif
106071 + unsigned long indirection_page;
106072 + unsigned long start_address;
106073 +} xen_kexec_image_t;
106074 +
106075 +/*
106076 + * Perform kexec having previously loaded a kexec or kdump kernel
106077 + * as appropriate.
106078 + * type == KEXEC_TYPE_DEFAULT or KEXEC_TYPE_CRASH [in]
106079 + */
106080 +#define KEXEC_CMD_kexec 0
106081 +typedef struct xen_kexec_exec {
106082 + int type;
106083 +} xen_kexec_exec_t;
106084 +
106085 +/*
106086 + * Load/Unload kernel image for kexec or kdump.
106087 + * type == KEXEC_TYPE_DEFAULT or KEXEC_TYPE_CRASH [in]
106088 + * image == relocation information for kexec (ignored for unload) [in]
106089 + */
106090 +#define KEXEC_CMD_kexec_load 1
106091 +#define KEXEC_CMD_kexec_unload 2
106092 +typedef struct xen_kexec_load {
106093 + int type;
106094 + xen_kexec_image_t image;
106095 +} xen_kexec_load_t;
106096 +
106097 +#define KEXEC_RANGE_MA_CRASH 0 /* machine address and size of crash area */
106098 +#define KEXEC_RANGE_MA_XEN 1 /* machine address and size of Xen itself */
106099 +#define KEXEC_RANGE_MA_CPU 2 /* machine address and size of a CPU note */
106100 +
106101 +/*
106102 + * Find the address and size of certain memory areas
106103 + * range == KEXEC_RANGE_... [in]
106104 + * nr == physical CPU number (starting from 0) if KEXEC_RANGE_MA_CPU [in]
106105 + * size == number of bytes reserved in window [out]
106106 + * start == address of the first byte in the window [out]
106107 + */
106108 +#define KEXEC_CMD_kexec_get_range 3
106109 +typedef struct xen_kexec_range {
106110 + int range;
106111 + int nr;
106112 + unsigned long size;
106113 + unsigned long start;
106114 +} xen_kexec_range_t;
106115 +
106116 +#endif /* _XEN_PUBLIC_KEXEC_H */
106117 +
106118 +/*
106119 + * Local variables:
106120 + * mode: C
106121 + * c-set-style: "BSD"
106122 + * c-basic-offset: 4
106123 + * tab-width: 4
106124 + * indent-tabs-mode: nil
106125 + * End:
106126 + */
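Range information (operation type 1 above) is what kexec-tools needs before it can build the ELF headers for /proc/vmcore. A standalone sketch of filling xen_kexec_range_t to ask for the crash area, with the real kexec_op hypercall replaced by a stub that returns made-up values:

/* Sketch: query the crash area via KEXEC_CMD_kexec_get_range (stubbed). */
#include <stdio.h>

#define KEXEC_RANGE_MA_CRASH 0
#define KEXEC_CMD_kexec_get_range 3

typedef struct xen_kexec_range {
    int range;
    int nr;
    unsigned long size;
    unsigned long start;
} xen_kexec_range_t;

/* Placeholder for the real hypercall: int kexec_op(int cmd, void *args). */
static int kexec_op_stub(int cmd, void *args)
{
    xen_kexec_range_t *r = args;
    (void)cmd;
    r->start = 0x10000000UL;             /* made-up values for illustration */
    r->size  = 0x04000000UL;
    return 0;
}

int main(void)
{
    xen_kexec_range_t range = {
        .range = KEXEC_RANGE_MA_CRASH,
        .nr    = 0,                      /* unused for the crash range */
    };

    if (kexec_op_stub(KEXEC_CMD_kexec_get_range, &range) == 0)
        printf("crash area: start=%#lx size=%#lx\n", range.start, range.size);
    return 0;
}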
106127 diff -Nur linux-2.6.16.33-noxen/include/xen/interface/memory.h linux-2.6.16.33/include/xen/interface/memory.h
106128 --- linux-2.6.16.33-noxen/include/xen/interface/memory.h 1970-01-01 00:00:00.000000000 +0000
106129 +++ linux-2.6.16.33/include/xen/interface/memory.h 2007-01-08 15:00:55.000000000 +0000
106130 @@ -0,0 +1,276 @@
106131 +/******************************************************************************
106132 + * memory.h
106133 + *
106134 + * Memory reservation and information.
106135 + *
106136 + * Permission is hereby granted, free of charge, to any person obtaining a copy
106137 + * of this software and associated documentation files (the "Software"), to
106138 + * deal in the Software without restriction, including without limitation the
106139 + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
106140 + * sell copies of the Software, and to permit persons to whom the Software is
106141 + * furnished to do so, subject to the following conditions:
106142 + *
106143 + * The above copyright notice and this permission notice shall be included in
106144 + * all copies or substantial portions of the Software.
106145 + *
106146 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
106147 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
106148 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
106149 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
106150 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
106151 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
106152 + * DEALINGS IN THE SOFTWARE.
106153 + *
106154 + * Copyright (c) 2005, Keir Fraser <keir@xensource.com>
106155 + */
106156 +
106157 +#ifndef __XEN_PUBLIC_MEMORY_H__
106158 +#define __XEN_PUBLIC_MEMORY_H__
106159 +
106160 +/*
106161 + * Increase or decrease the specified domain's memory reservation. Returns the
106162 + * number of extents successfully allocated or freed.
106163 + * arg == addr of struct xen_memory_reservation.
106164 + */
106165 +#define XENMEM_increase_reservation 0
106166 +#define XENMEM_decrease_reservation 1
106167 +#define XENMEM_populate_physmap 6
106168 +struct xen_memory_reservation {
106169 +
106170 + /*
106171 + * XENMEM_increase_reservation:
106172 + * OUT: MFN (*not* GMFN) bases of extents that were allocated
106173 + * XENMEM_decrease_reservation:
106174 + * IN: GMFN bases of extents to free
106175 + * XENMEM_populate_physmap:
106176 + * IN: GPFN bases of extents to populate with memory
106177 + * OUT: GMFN bases of extents that were allocated
106178 + * (NB. This command also updates the mach_to_phys translation table)
106179 + */
106180 + XEN_GUEST_HANDLE(xen_pfn_t) extent_start;
106181 +
106182 + /* Number of extents, and size/alignment of each (2^extent_order pages). */
106183 + xen_ulong_t nr_extents;
106184 + unsigned int extent_order;
106185 +
106186 + /*
106187 + * Maximum # bits addressable by the user of the allocated region (e.g.,
106188 + * I/O devices often have a 32-bit limitation even in 64-bit systems). If
106189 + * zero then the user has no addressing restriction.
106190 + * This field is not used by XENMEM_decrease_reservation.
106191 + */
106192 + unsigned int address_bits;
106193 +
106194 + /*
106195 + * Domain whose reservation is being changed.
106196 + * Unprivileged domains can specify only DOMID_SELF.
106197 + */
106198 + domid_t domid;
106199 +};
106200 +typedef struct xen_memory_reservation xen_memory_reservation_t;
106201 +DEFINE_XEN_GUEST_HANDLE(xen_memory_reservation_t);
106202 +
106203 +/*
106204 + * An atomic exchange of memory pages. If return code is zero then
106205 + * @out.extent_list provides GMFNs of the newly-allocated memory.
106206 + * Returns zero on complete success, otherwise a negative error code.
106207 + * On complete success then always @nr_exchanged == @in.nr_extents.
106208 + * On partial success @nr_exchanged indicates how much work was done.
106209 + */
106210 +#define XENMEM_exchange 11
106211 +struct xen_memory_exchange {
106212 + /*
106213 + * [IN] Details of memory extents to be exchanged (GMFN bases).
106214 + * Note that @in.address_bits is ignored and unused.
106215 + */
106216 + struct xen_memory_reservation in;
106217 +
106218 + /*
106219 + * [IN/OUT] Details of new memory extents.
106220 + * We require that:
106221 + * 1. @in.domid == @out.domid
106222 + * 2. @in.nr_extents << @in.extent_order ==
106223 + * @out.nr_extents << @out.extent_order
106224 + * 3. @in.extent_start and @out.extent_start lists must not overlap
106225 + * 4. @out.extent_start lists GPFN bases to be populated
106226 + * 5. @out.extent_start is overwritten with allocated GMFN bases
106227 + */
106228 + struct xen_memory_reservation out;
106229 +
106230 + /*
106231 + * [OUT] Number of input extents that were successfully exchanged:
106232 + * 1. The first @nr_exchanged input extents were successfully
106233 + * deallocated.
106234 + * 2. The corresponding first entries in the output extent list correctly
106235 + * indicate the GMFNs that were successfully exchanged.
106236 + * 3. All other input and output extents are untouched.
106237 + * 4. If not all input extents are exchanged then the return code of this
106238 + * command will be non-zero.
106239 + * 5. THIS FIELD MUST BE INITIALISED TO ZERO BY THE CALLER!
106240 + */
106241 + xen_ulong_t nr_exchanged;
106242 +};
106243 +typedef struct xen_memory_exchange xen_memory_exchange_t;
106244 +DEFINE_XEN_GUEST_HANDLE(xen_memory_exchange_t);
106245 +
106246 +/*
106247 + * Returns the maximum machine frame number of mapped RAM in this system.
106248 + * This command always succeeds (it never returns an error code).
106249 + * arg == NULL.
106250 + */
106251 +#define XENMEM_maximum_ram_page 2
106252 +
106253 +/*
106254 + * Returns the current or maximum memory reservation, in pages, of the
106255 + * specified domain (may be DOMID_SELF). Returns -ve errcode on failure.
106256 + * arg == addr of domid_t.
106257 + */
106258 +#define XENMEM_current_reservation 3
106259 +#define XENMEM_maximum_reservation 4
106260 +
106261 +/*
106262 + * Returns a list of MFN bases of 2MB extents comprising the machine_to_phys
106263 + * mapping table. Architectures which do not have a m2p table do not implement
106264 + * this command.
106265 + * arg == addr of xen_machphys_mfn_list_t.
106266 + */
106267 +#define XENMEM_machphys_mfn_list 5
106268 +struct xen_machphys_mfn_list {
106269 + /*
106270 + * Size of the 'extent_start' array. Fewer entries will be filled if the
106271 + * machphys table is smaller than max_extents * 2MB.
106272 + */
106273 + unsigned int max_extents;
106274 +
106275 + /*
106276 + * Pointer to buffer to fill with list of extent starts. If there are
106277 + * any large discontiguities in the machine address space, 2MB gaps in
106278 + * the machphys table will be represented by an MFN base of zero.
106279 + */
106280 + XEN_GUEST_HANDLE(xen_pfn_t) extent_start;
106281 +
106282 + /*
106283 + * Number of extents written to the above array. This will be smaller
106284 + * than 'max_extents' if the machphys table is smaller than max_e * 2MB.
106285 + */
106286 + unsigned int nr_extents;
106287 +};
106288 +typedef struct xen_machphys_mfn_list xen_machphys_mfn_list_t;
106289 +DEFINE_XEN_GUEST_HANDLE(xen_machphys_mfn_list_t);
106290 +
106291 +/*
106292 + * Returns the location in virtual address space of the machine_to_phys
106293 + * mapping table. Architectures which do not have a m2p table, or which do not
106294 + * map it by default into guest address space, do not implement this command.
106295 + * arg == addr of xen_machphys_mapping_t.
106296 + */
106297 +#define XENMEM_machphys_mapping 12
106298 +struct xen_machphys_mapping {
106299 + xen_ulong_t v_start, v_end; /* Start and end virtual addresses. */
106300 + xen_ulong_t max_mfn; /* Maximum MFN that can be looked up. */
106301 +};
106302 +typedef struct xen_machphys_mapping xen_machphys_mapping_t;
106303 +DEFINE_XEN_GUEST_HANDLE(xen_machphys_mapping_t);
106304 +
106305 +/*
106306 + * Sets the GPFN at which a particular page appears in the specified guest's
106307 + * pseudophysical address space.
106308 + * arg == addr of xen_add_to_physmap_t.
106309 + */
106310 +#define XENMEM_add_to_physmap 7
106311 +struct xen_add_to_physmap {
106312 + /* Which domain to change the mapping for. */
106313 + domid_t domid;
106314 +
106315 + /* Source mapping space. */
106316 +#define XENMAPSPACE_shared_info 0 /* shared info page */
106317 +#define XENMAPSPACE_grant_table 1 /* grant table page */
106318 + unsigned int space;
106319 +
106320 + /* Index into source mapping space. */
106321 + xen_ulong_t idx;
106322 +
106323 + /* GPFN where the source mapping page should appear. */
106324 + xen_pfn_t gpfn;
106325 +};
106326 +typedef struct xen_add_to_physmap xen_add_to_physmap_t;
106327 +DEFINE_XEN_GUEST_HANDLE(xen_add_to_physmap_t);
106328 +
106329 +/*
106330 + * Translates a list of domain-specific GPFNs into MFNs. Returns a -ve error
106331 + * code on failure. This call only works for auto-translated guests.
106332 + */
106333 +#define XENMEM_translate_gpfn_list 8
106334 +struct xen_translate_gpfn_list {
106335 + /* Which domain to translate for? */
106336 + domid_t domid;
106337 +
106338 + /* Length of list. */
106339 + xen_ulong_t nr_gpfns;
106340 +
106341 + /* List of GPFNs to translate. */
106342 + XEN_GUEST_HANDLE(xen_pfn_t) gpfn_list;
106343 +
106344 + /*
106345 + * Output list to contain MFN translations. May be the same as the input
106346 + * list (in which case each input GPFN is overwritten with the output MFN).
106347 + */
106348 + XEN_GUEST_HANDLE(xen_pfn_t) mfn_list;
106349 +};
106350 +typedef struct xen_translate_gpfn_list xen_translate_gpfn_list_t;
106351 +DEFINE_XEN_GUEST_HANDLE(xen_translate_gpfn_list_t);
106352 +
106353 +/*
106354 + * Returns the pseudo-physical memory map as it was when the domain
106355 + * was started (specified by XENMEM_set_memory_map).
106356 + * arg == addr of xen_memory_map_t.
106357 + */
106358 +#define XENMEM_memory_map 9
106359 +struct xen_memory_map {
106360 + /*
106361 + * On call the number of entries which can be stored in buffer. On
106362 + * return the number of entries which have been stored in
106363 + * buffer.
106364 + */
106365 + unsigned int nr_entries;
106366 +
106367 + /*
106368 + * Entries in the buffer are in the same format as returned by the
106369 + * BIOS INT 0x15 EAX=0xE820 call.
106370 + */
106371 + XEN_GUEST_HANDLE(void) buffer;
106372 +};
106373 +typedef struct xen_memory_map xen_memory_map_t;
106374 +DEFINE_XEN_GUEST_HANDLE(xen_memory_map_t);
106375 +
106376 +/*
106377 + * Returns the real physical memory map. Passes the same structure as
106378 + * XENMEM_memory_map.
106379 + * arg == addr of xen_memory_map_t.
106380 + */
106381 +#define XENMEM_machine_memory_map 10
106382 +
106383 +/*
106384 + * Set the pseudo-physical memory map of a domain, as returned by
106385 + * XENMEM_memory_map.
106386 + * arg == addr of xen_foreign_memory_map_t.
106387 + */
106388 +#define XENMEM_set_memory_map 13
106389 +struct xen_foreign_memory_map {
106390 + domid_t domid;
106391 + struct xen_memory_map map;
106392 +};
106393 +typedef struct xen_foreign_memory_map xen_foreign_memory_map_t;
106394 +DEFINE_XEN_GUEST_HANDLE(xen_foreign_memory_map_t);
106395 +
106396 +#endif /* __XEN_PUBLIC_MEMORY_H__ */
106397 +
106398 +/*
106399 + * Local variables:
106400 + * mode: C
106401 + * c-set-style: "BSD"
106402 + * c-basic-offset: 4
106403 + * tab-width: 4
106404 + * indent-tabs-mode: nil
106405 + * End:
106406 + */
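XENMEM_exchange requires the 'in' and 'out' reservations to cover the same number of pages (condition 2 in the comment above). A standalone sketch of that size check, using plain integers instead of the full structures:

/* Sketch: the XENMEM_exchange size invariant. */
#include <stdint.h>
#include <stdio.h>

static int exchange_sizes_match(uint64_t in_nr,  unsigned in_order,
                                uint64_t out_nr, unsigned out_order)
{
    /* in.nr_extents << in.extent_order == out.nr_extents << out.extent_order */
    return (in_nr << in_order) == (out_nr << out_order);
}

int main(void)
{
    /* 4 extents of order 9 (2 MiB each) == 2048 extents of order 0 (4 KiB). */
    printf("match: %d\n", exchange_sizes_match(4, 9, 2048, 0));
    /* Mismatched totals would be rejected by the hypervisor. */
    printf("match: %d\n", exchange_sizes_match(4, 9, 1024, 0));
    return 0;
}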
106407 diff -Nur linux-2.6.16.33-noxen/include/xen/interface/nmi.h linux-2.6.16.33/include/xen/interface/nmi.h
106408 --- linux-2.6.16.33-noxen/include/xen/interface/nmi.h 1970-01-01 00:00:00.000000000 +0000
106409 +++ linux-2.6.16.33/include/xen/interface/nmi.h 2007-01-08 15:00:55.000000000 +0000
106410 @@ -0,0 +1,78 @@
106411 +/******************************************************************************
106412 + * nmi.h
106413 + *
106414 + * NMI callback registration and reason codes.
106415 + *
106416 + * Permission is hereby granted, free of charge, to any person obtaining a copy
106417 + * of this software and associated documentation files (the "Software"), to
106418 + * deal in the Software without restriction, including without limitation the
106419 + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
106420 + * sell copies of the Software, and to permit persons to whom the Software is
106421 + * furnished to do so, subject to the following conditions:
106422 + *
106423 + * The above copyright notice and this permission notice shall be included in
106424 + * all copies or substantial portions of the Software.
106425 + *
106426 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
106427 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
106428 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
106429 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
106430 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
106431 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
106432 + * DEALINGS IN THE SOFTWARE.
106433 + *
106434 + * Copyright (c) 2005, Keir Fraser <keir@xensource.com>
106435 + */
106436 +
106437 +#ifndef __XEN_PUBLIC_NMI_H__
106438 +#define __XEN_PUBLIC_NMI_H__
106439 +
106440 +/*
106441 + * NMI reason codes:
106442 + * Currently these are x86-specific, stored in arch_shared_info.nmi_reason.
106443 + */
106444 + /* I/O-check error reported via ISA port 0x61, bit 6. */
106445 +#define _XEN_NMIREASON_io_error 0
106446 +#define XEN_NMIREASON_io_error (1UL << _XEN_NMIREASON_io_error)
106447 + /* Parity error reported via ISA port 0x61, bit 7. */
106448 +#define _XEN_NMIREASON_parity_error 1
106449 +#define XEN_NMIREASON_parity_error (1UL << _XEN_NMIREASON_parity_error)
106450 + /* Unknown hardware-generated NMI. */
106451 +#define _XEN_NMIREASON_unknown 2
106452 +#define XEN_NMIREASON_unknown (1UL << _XEN_NMIREASON_unknown)
106453 +
106454 +/*
106455 + * long nmi_op(unsigned int cmd, void *arg)
106456 + * NB. All ops return zero on success, else a negative error code.
106457 + */
106458 +
106459 +/*
106460 + * Register NMI callback for this (calling) VCPU. Currently this only makes
106461 + * sense for domain 0, vcpu 0. All other callers will be returned EINVAL.
106462 + * arg == pointer to xennmi_callback structure.
106463 + */
106464 +#define XENNMI_register_callback 0
106465 +struct xennmi_callback {
106466 + unsigned long handler_address;
106467 + unsigned long pad;
106468 +};
106469 +typedef struct xennmi_callback xennmi_callback_t;
106470 +DEFINE_XEN_GUEST_HANDLE(xennmi_callback_t);
106471 +
106472 +/*
106473 + * Deregister NMI callback for this (calling) VCPU.
106474 + * arg == NULL.
106475 + */
106476 +#define XENNMI_unregister_callback 1
106477 +
106478 +#endif /* __XEN_PUBLIC_NMI_H__ */
106479 +
106480 +/*
106481 + * Local variables:
106482 + * mode: C
106483 + * c-set-style: "BSD"
106484 + * c-basic-offset: 4
106485 + * tab-width: 4
106486 + * indent-tabs-mode: nil
106487 + * End:
106488 + */
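A dom0 NMI handler decides what happened by testing the reason bits above. A standalone sketch of that decode, with the reason word supplied directly instead of being read from arch_shared_info.nmi_reason:

/* Sketch: decode the x86 NMI reason bits. */
#include <stdio.h>

#define _XEN_NMIREASON_io_error     0
#define XEN_NMIREASON_io_error      (1UL << _XEN_NMIREASON_io_error)
#define _XEN_NMIREASON_parity_error 1
#define XEN_NMIREASON_parity_error  (1UL << _XEN_NMIREASON_parity_error)
#define _XEN_NMIREASON_unknown      2
#define XEN_NMIREASON_unknown       (1UL << _XEN_NMIREASON_unknown)

static void decode_nmi_reason(unsigned long reason)
{
    if (reason & XEN_NMIREASON_io_error)
        printf("NMI: I/O check error (ISA port 0x61, bit 6)\n");
    if (reason & XEN_NMIREASON_parity_error)
        printf("NMI: parity error (ISA port 0x61, bit 7)\n");
    if (reason & XEN_NMIREASON_unknown)
        printf("NMI: unknown hardware source\n");
}

int main(void)
{
    decode_nmi_reason(XEN_NMIREASON_io_error | XEN_NMIREASON_unknown);
    return 0;
}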
106489 diff -Nur linux-2.6.16.33-noxen/include/xen/interface/physdev.h linux-2.6.16.33/include/xen/interface/physdev.h
106490 --- linux-2.6.16.33-noxen/include/xen/interface/physdev.h 1970-01-01 00:00:00.000000000 +0000
106491 +++ linux-2.6.16.33/include/xen/interface/physdev.h 2007-01-08 15:00:55.000000000 +0000
106492 @@ -0,0 +1,169 @@
106493 +/*
106494 + * Permission is hereby granted, free of charge, to any person obtaining a copy
106495 + * of this software and associated documentation files (the "Software"), to
106496 + * deal in the Software without restriction, including without limitation the
106497 + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
106498 + * sell copies of the Software, and to permit persons to whom the Software is
106499 + * furnished to do so, subject to the following conditions:
106500 + *
106501 + * The above copyright notice and this permission notice shall be included in
106502 + * all copies or substantial portions of the Software.
106503 + *
106504 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
106505 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
106506 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
106507 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
106508 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
106509 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
106510 + * DEALINGS IN THE SOFTWARE.
106511 + */
106512 +
106513 +#ifndef __XEN_PUBLIC_PHYSDEV_H__
106514 +#define __XEN_PUBLIC_PHYSDEV_H__
106515 +
106516 +/*
106517 + * Prototype for this hypercall is:
106518 + * int physdev_op(int cmd, void *args)
106519 + * @cmd == PHYSDEVOP_??? (physdev operation).
106520 + * @args == Operation-specific extra arguments (NULL if none).
106521 + */
106522 +
106523 +/*
106524 + * Notify end-of-interrupt (EOI) for the specified IRQ.
106525 + * @arg == pointer to physdev_eoi structure.
106526 + */
106527 +#define PHYSDEVOP_eoi 12
106528 +struct physdev_eoi {
106529 + /* IN */
106530 + uint32_t irq;
106531 +};
106532 +typedef struct physdev_eoi physdev_eoi_t;
106533 +DEFINE_XEN_GUEST_HANDLE(physdev_eoi_t);
106534 +
106535 +/*
106536 + * Query the status of an IRQ line.
106537 + * @arg == pointer to physdev_irq_status_query structure.
106538 + */
106539 +#define PHYSDEVOP_irq_status_query 5
106540 +struct physdev_irq_status_query {
106541 + /* IN */
106542 + uint32_t irq;
106543 + /* OUT */
106544 + uint32_t flags; /* XENIRQSTAT_* */
106545 +};
106546 +typedef struct physdev_irq_status_query physdev_irq_status_query_t;
106547 +DEFINE_XEN_GUEST_HANDLE(physdev_irq_status_query_t);
106548 +
106549 +/* Need to call PHYSDEVOP_eoi when the IRQ has been serviced? */
106550 +#define _XENIRQSTAT_needs_eoi (0)
106551 +#define XENIRQSTAT_needs_eoi (1U<<_XENIRQSTAT_needs_eoi)
106552 +
106553 +/* IRQ shared by multiple guests? */
106554 +#define _XENIRQSTAT_shared (1)
106555 +#define XENIRQSTAT_shared (1U<<_XENIRQSTAT_shared)
106556 +
106557 +/*
106558 + * Set the current VCPU's I/O privilege level.
106559 + * @arg == pointer to physdev_set_iopl structure.
106560 + */
106561 +#define PHYSDEVOP_set_iopl 6
106562 +struct physdev_set_iopl {
106563 + /* IN */
106564 + uint32_t iopl;
106565 +};
106566 +typedef struct physdev_set_iopl physdev_set_iopl_t;
106567 +DEFINE_XEN_GUEST_HANDLE(physdev_set_iopl_t);
106568 +
106569 +/*
106570 + * Set the current VCPU's I/O-port permissions bitmap.
106571 + * @arg == pointer to physdev_set_iobitmap structure.
106572 + */
106573 +#define PHYSDEVOP_set_iobitmap 7
106574 +struct physdev_set_iobitmap {
106575 + /* IN */
106576 + XEN_GUEST_HANDLE_00030205(uint8_t) bitmap;
106577 + uint32_t nr_ports;
106578 +};
106579 +typedef struct physdev_set_iobitmap physdev_set_iobitmap_t;
106580 +DEFINE_XEN_GUEST_HANDLE(physdev_set_iobitmap_t);
106581 +
106582 +/*
106583 + * Read or write an IO-APIC register.
106584 + * @arg == pointer to physdev_apic structure.
106585 + */
106586 +#define PHYSDEVOP_apic_read 8
106587 +#define PHYSDEVOP_apic_write 9
106588 +struct physdev_apic {
106589 + /* IN */
106590 + unsigned long apic_physbase;
106591 + uint32_t reg;
106592 + /* IN or OUT */
106593 + uint32_t value;
106594 +};
106595 +typedef struct physdev_apic physdev_apic_t;
106596 +DEFINE_XEN_GUEST_HANDLE(physdev_apic_t);
106597 +
106598 +/*
106599 + * Allocate or free a physical upcall vector for the specified IRQ line.
106600 + * @arg == pointer to physdev_irq structure.
106601 + */
106602 +#define PHYSDEVOP_alloc_irq_vector 10
106603 +#define PHYSDEVOP_free_irq_vector 11
106604 +struct physdev_irq {
106605 + /* IN */
106606 + uint32_t irq;
106607 + /* IN or OUT */
106608 + uint32_t vector;
106609 +};
106610 +typedef struct physdev_irq physdev_irq_t;
106611 +DEFINE_XEN_GUEST_HANDLE(physdev_irq_t);
106612 +
106613 +/*
106614 + * Argument to physdev_op_compat() hypercall. Superseded by new physdev_op()
106615 + * hypercall since 0x00030202.
106616 + */
106617 +struct physdev_op {
106618 + uint32_t cmd;
106619 + union {
106620 + struct physdev_irq_status_query irq_status_query;
106621 + struct physdev_set_iopl set_iopl;
106622 + struct physdev_set_iobitmap set_iobitmap;
106623 + struct physdev_apic apic_op;
106624 + struct physdev_irq irq_op;
106625 + } u;
106626 +};
106627 +typedef struct physdev_op physdev_op_t;
106628 +DEFINE_XEN_GUEST_HANDLE(physdev_op_t);
106629 +
106630 +/*
106631 + * Notify that some PIRQ-bound event channels have been unmasked.
106632 + * ** This command is obsolete since interface version 0x00030202 and is **
106633 + * ** unsupported by newer versions of Xen. **
106634 + */
106635 +#define PHYSDEVOP_IRQ_UNMASK_NOTIFY 4
106636 +
106637 +/*
106638 + * These all-capitals physdev operation names are superseded by the new names
106639 + * (defined above) since interface version 0x00030202.
106640 + */
106641 +#define PHYSDEVOP_IRQ_STATUS_QUERY PHYSDEVOP_irq_status_query
106642 +#define PHYSDEVOP_SET_IOPL PHYSDEVOP_set_iopl
106643 +#define PHYSDEVOP_SET_IOBITMAP PHYSDEVOP_set_iobitmap
106644 +#define PHYSDEVOP_APIC_READ PHYSDEVOP_apic_read
106645 +#define PHYSDEVOP_APIC_WRITE PHYSDEVOP_apic_write
106646 +#define PHYSDEVOP_ASSIGN_VECTOR PHYSDEVOP_alloc_irq_vector
106647 +#define PHYSDEVOP_FREE_VECTOR PHYSDEVOP_free_irq_vector
106648 +#define PHYSDEVOP_IRQ_NEEDS_UNMASK_NOTIFY XENIRQSTAT_needs_eoi
106649 +#define PHYSDEVOP_IRQ_SHARED XENIRQSTAT_shared
106650 +
106651 +#endif /* __XEN_PUBLIC_PHYSDEV_H__ */
106652 +
106653 +/*
106654 + * Local variables:
106655 + * mode: C
106656 + * c-set-style: "BSD"
106657 + * c-basic-offset: 4
106658 + * tab-width: 4
106659 + * indent-tabs-mode: nil
106660 + * End:
106661 + */
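After PHYSDEVOP_irq_status_query, the returned flags tell the guest whether the line is shared and whether it must issue PHYSDEVOP_eoi once the interrupt has been serviced. A standalone sketch of interpreting those flags, with the hypercall replaced by a stub:

/* Sketch: interpret XENIRQSTAT_* flags from an IRQ status query (stubbed). */
#include <stdint.h>
#include <stdio.h>

#define _XENIRQSTAT_needs_eoi (0)
#define XENIRQSTAT_needs_eoi  (1U << _XENIRQSTAT_needs_eoi)
#define _XENIRQSTAT_shared    (1)
#define XENIRQSTAT_shared     (1U << _XENIRQSTAT_shared)

struct physdev_irq_status_query {
    uint32_t irq;    /* IN  */
    uint32_t flags;  /* OUT */
};

static int physdev_op_stub(struct physdev_irq_status_query *q)
{
    /* Pretend the hypervisor says this line is shared and needs an EOI. */
    q->flags = XENIRQSTAT_needs_eoi | XENIRQSTAT_shared;
    return 0;
}

int main(void)
{
    struct physdev_irq_status_query q = { .irq = 9 };

    if (physdev_op_stub(&q) == 0)
        printf("irq %u: needs_eoi=%d shared=%d\n", (unsigned)q.irq,
               !!(q.flags & XENIRQSTAT_needs_eoi),
               !!(q.flags & XENIRQSTAT_shared));
    return 0;
}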
106662 diff -Nur linux-2.6.16.33-noxen/include/xen/interface/platform.h linux-2.6.16.33/include/xen/interface/platform.h
106663 --- linux-2.6.16.33-noxen/include/xen/interface/platform.h 1970-01-01 00:00:00.000000000 +0000
106664 +++ linux-2.6.16.33/include/xen/interface/platform.h 2007-01-08 15:00:55.000000000 +0000
106665 @@ -0,0 +1,143 @@
106666 +/******************************************************************************
106667 + * platform.h
106668 + *
106669 + * Hardware platform operations. Intended for use by domain-0 kernel.
106670 + *
106671 + * Permission is hereby granted, free of charge, to any person obtaining a copy
106672 + * of this software and associated documentation files (the "Software"), to
106673 + * deal in the Software without restriction, including without limitation the
106674 + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
106675 + * sell copies of the Software, and to permit persons to whom the Software is
106676 + * furnished to do so, subject to the following conditions:
106677 + *
106678 + * The above copyright notice and this permission notice shall be included in
106679 + * all copies or substantial portions of the Software.
106680 + *
106681 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
106682 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
106683 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
106684 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
106685 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
106686 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
106687 + * DEALINGS IN THE SOFTWARE.
106688 + *
106689 + * Copyright (c) 2002-2006, K Fraser
106690 + */
106691 +
106692 +#ifndef __XEN_PUBLIC_PLATFORM_H__
106693 +#define __XEN_PUBLIC_PLATFORM_H__
106694 +
106695 +#include "xen.h"
106696 +
106697 +#define XENPF_INTERFACE_VERSION 0x03000001
106698 +
106699 +/*
106700 + * Set clock such that it would read <secs,nsecs> after 00:00:00 UTC,
106701 + * 1 January, 1970 if the current system time was <system_time>.
106702 + */
106703 +#define XENPF_settime 17
106704 +struct xenpf_settime {
106705 + /* IN variables. */
106706 + uint32_t secs;
106707 + uint32_t nsecs;
106708 + uint64_t system_time;
106709 +};
106710 +typedef struct xenpf_settime xenpf_settime_t;
106711 +DEFINE_XEN_GUEST_HANDLE(xenpf_settime_t);
106712 +
106713 +/*
106714 + * Request memory range (@mfn, @mfn+@nr_mfns-1) to have type @type.
106715 + * On x86, @type is an architecture-defined MTRR memory type.
106716 + * On success, returns the MTRR that was used (@reg) and a handle that can
106717 + * be passed to XENPF_DEL_MEMTYPE to accurately tear down the new setting.
106718 + * (x86-specific).
106719 + */
106720 +#define XENPF_add_memtype 31
106721 +struct xenpf_add_memtype {
106722 + /* IN variables. */
106723 + xen_pfn_t mfn;
106724 + uint64_t nr_mfns;
106725 + uint32_t type;
106726 + /* OUT variables. */
106727 + uint32_t handle;
106728 + uint32_t reg;
106729 +};
106730 +typedef struct xenpf_add_memtype xenpf_add_memtype_t;
106731 +DEFINE_XEN_GUEST_HANDLE(xenpf_add_memtype_t);
106732 +
106733 +/*
106734 + * Tear down an existing memory-range type. If @handle is remembered then it
106735 + * should be passed in to accurately tear down the correct setting (in case
106736 + * of overlapping memory regions with differing types). If it is not known
106737 + * then @handle should be set to zero. In all cases @reg must be set.
106738 + * (x86-specific).
106739 + */
106740 +#define XENPF_del_memtype 32
106741 +struct xenpf_del_memtype {
106742 + /* IN variables. */
106743 + uint32_t handle;
106744 + uint32_t reg;
106745 +};
106746 +typedef struct xenpf_del_memtype xenpf_del_memtype_t;
106747 +DEFINE_XEN_GUEST_HANDLE(xenpf_del_memtype_t);
106748 +
106749 +/* Read current type of an MTRR (x86-specific). */
106750 +#define XENPF_read_memtype 33
106751 +struct xenpf_read_memtype {
106752 + /* IN variables. */
106753 + uint32_t reg;
106754 + /* OUT variables. */
106755 + xen_pfn_t mfn;
106756 + uint64_t nr_mfns;
106757 + uint32_t type;
106758 +};
106759 +typedef struct xenpf_read_memtype xenpf_read_memtype_t;
106760 +DEFINE_XEN_GUEST_HANDLE(xenpf_read_memtype_t);
106761 +
106762 +#define XENPF_microcode_update 35
106763 +struct xenpf_microcode_update {
106764 + /* IN variables. */
106765 + XEN_GUEST_HANDLE(void) data; /* Pointer to microcode data */
106766 + uint32_t length; /* Length of microcode data. */
106767 +};
106768 +typedef struct xenpf_microcode_update xenpf_microcode_update_t;
106769 +DEFINE_XEN_GUEST_HANDLE(xenpf_microcode_update_t);
106770 +
106771 +#define XENPF_platform_quirk 39
106772 +#define QUIRK_NOIRQBALANCING 1 /* Do not restrict IO-APIC RTE targets */
106773 +#define QUIRK_IOAPIC_BAD_REGSEL 2 /* IO-APIC REGSEL forgets its value */
106774 +#define QUIRK_IOAPIC_GOOD_REGSEL 3 /* IO-APIC REGSEL behaves properly */
106775 +struct xenpf_platform_quirk {
106776 + /* IN variables. */
106777 + uint32_t quirk_id;
106778 +};
106779 +typedef struct xenpf_platform_quirk xenpf_platform_quirk_t;
106780 +DEFINE_XEN_GUEST_HANDLE(xenpf_platform_quirk_t);
106781 +
106782 +struct xen_platform_op {
106783 + uint32_t cmd;
106784 + uint32_t interface_version; /* XENPF_INTERFACE_VERSION */
106785 + union {
106786 + struct xenpf_settime settime;
106787 + struct xenpf_add_memtype add_memtype;
106788 + struct xenpf_del_memtype del_memtype;
106789 + struct xenpf_read_memtype read_memtype;
106790 + struct xenpf_microcode_update microcode;
106791 + struct xenpf_platform_quirk platform_quirk;
106792 + uint8_t pad[128];
106793 + } u;
106794 +};
106795 +typedef struct xen_platform_op xen_platform_op_t;
106796 +DEFINE_XEN_GUEST_HANDLE(xen_platform_op_t);
106797 +
106798 +#endif /* __XEN_PUBLIC_PLATFORM_H__ */
106799 +
106800 +/*
106801 + * Local variables:
106802 + * mode: C
106803 + * c-set-style: "BSD"
106804 + * c-basic-offset: 4
106805 + * tab-width: 4
106806 + * indent-tabs-mode: nil
106807 + * End:
106808 + */
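XENPF_settime carries a wall-clock sample plus the Xen system time at which that sample was taken. A standalone sketch of filling that argument with example values; the enclosing xen_platform_op wrapper and the hypercall itself are omitted:

/* Sketch: fill the XENPF_settime argument (values are illustrative). */
#include <stdint.h>
#include <stdio.h>

#define XENPF_INTERFACE_VERSION 0x03000001
#define XENPF_settime 17

struct xenpf_settime {
    uint32_t secs;
    uint32_t nsecs;
    uint64_t system_time;
};

int main(void)
{
    /* Example inputs: wall clock and Xen system time (ns since boot). */
    uint32_t wall_secs  = 1168268455;    /* 2007-01-08 15:00:55 UTC */
    uint32_t wall_nsecs = 250000000;
    uint64_t now_ns     = 123456789000ULL;

    struct xenpf_settime op = {
        .secs        = wall_secs,
        .nsecs       = wall_nsecs,
        .system_time = now_ns,
    };

    printf("XENPF_settime (cmd %d, version %#x): %u.%09u at system_time %llu\n",
           XENPF_settime, XENPF_INTERFACE_VERSION,
           (unsigned)op.secs, (unsigned)op.nsecs,
           (unsigned long long)op.system_time);
    return 0;
}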
106809 diff -Nur linux-2.6.16.33-noxen/include/xen/interface/sched.h linux-2.6.16.33/include/xen/interface/sched.h
106810 --- linux-2.6.16.33-noxen/include/xen/interface/sched.h 1970-01-01 00:00:00.000000000 +0000
106811 +++ linux-2.6.16.33/include/xen/interface/sched.h 2007-01-08 15:00:55.000000000 +0000
106812 @@ -0,0 +1,121 @@
106813 +/******************************************************************************
106814 + * sched.h
106815 + *
106816 + * Scheduler state interactions
106817 + *
106818 + * Permission is hereby granted, free of charge, to any person obtaining a copy
106819 + * of this software and associated documentation files (the "Software"), to
106820 + * deal in the Software without restriction, including without limitation the
106821 + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
106822 + * sell copies of the Software, and to permit persons to whom the Software is
106823 + * furnished to do so, subject to the following conditions:
106824 + *
106825 + * The above copyright notice and this permission notice shall be included in
106826 + * all copies or substantial portions of the Software.
106827 + *
106828 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
106829 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
106830 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
106831 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
106832 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
106833 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
106834 + * DEALINGS IN THE SOFTWARE.
106835 + *
106836 + * Copyright (c) 2005, Keir Fraser <keir@xensource.com>
106837 + */
106838 +
106839 +#ifndef __XEN_PUBLIC_SCHED_H__
106840 +#define __XEN_PUBLIC_SCHED_H__
106841 +
106842 +#include "event_channel.h"
106843 +
106844 +/*
106845 + * The prototype for this hypercall is:
106846 + * long sched_op(int cmd, void *arg)
106847 + * @cmd == SCHEDOP_??? (scheduler operation).
106848 + * @arg == Operation-specific extra argument(s), as described below.
106849 + *
106850 + * Versions of Xen prior to 3.0.2 provided only the following legacy version
106851 + * of this hypercall, supporting only the commands yield, block and shutdown:
106852 + * long sched_op(int cmd, unsigned long arg)
106853 + * @cmd == SCHEDOP_??? (scheduler operation).
106854 + * @arg == 0 (SCHEDOP_yield and SCHEDOP_block)
106855 + * == SHUTDOWN_* code (SCHEDOP_shutdown)
106856 + * This legacy version is available to new guests as sched_op_compat().
106857 + */
106858 +
106859 +/*
106860 + * Voluntarily yield the CPU.
106861 + * @arg == NULL.
106862 + */
106863 +#define SCHEDOP_yield 0
106864 +
106865 +/*
106866 + * Block execution of this VCPU until an event is received for processing.
106867 + * If called with event upcalls masked, this operation will atomically
106868 + * reenable event delivery and check for pending events before blocking the
106869 + * VCPU. This avoids a "wakeup waiting" race.
106870 + * @arg == NULL.
106871 + */
106872 +#define SCHEDOP_block 1
106873 +
106874 +/*
106875 + * Halt execution of this domain (all VCPUs) and notify the system controller.
106876 + * @arg == pointer to sched_shutdown structure.
106877 + */
106878 +#define SCHEDOP_shutdown 2
106879 +struct sched_shutdown {
106880 + unsigned int reason; /* SHUTDOWN_* */
106881 +};
106882 +typedef struct sched_shutdown sched_shutdown_t;
106883 +DEFINE_XEN_GUEST_HANDLE(sched_shutdown_t);
106884 +
106885 +/*
106886 + * Poll a set of event-channel ports. Return when one or more are pending. An
106887 + * optional timeout may be specified.
106888 + * @arg == pointer to sched_poll structure.
106889 + */
106890 +#define SCHEDOP_poll 3
106891 +struct sched_poll {
106892 + XEN_GUEST_HANDLE(evtchn_port_t) ports;
106893 + unsigned int nr_ports;
106894 + uint64_t timeout;
106895 +};
106896 +typedef struct sched_poll sched_poll_t;
106897 +DEFINE_XEN_GUEST_HANDLE(sched_poll_t);
106898 +
106899 +/*
106900 + * Declare a shutdown for another domain. The main use of this function is
106901 + * in interpreting shutdown requests and reasons for fully-virtualized
106902 + * domains. A para-virtualized domain may use SCHEDOP_shutdown directly.
106903 + * @arg == pointer to sched_remote_shutdown structure.
106904 + */
106905 +#define SCHEDOP_remote_shutdown 4
106906 +struct sched_remote_shutdown {
106907 + domid_t domain_id; /* Remote domain ID */
106908 + unsigned int reason; /* SHUTDOWN_xxx reason */
106909 +};
106910 +typedef struct sched_remote_shutdown sched_remote_shutdown_t;
106911 +DEFINE_XEN_GUEST_HANDLE(sched_remote_shutdown_t);
106912 +
106913 +/*
106914 + * Reason codes for SCHEDOP_shutdown. These may be interpreted by control
106915 + * software to determine the appropriate action. For the most part, Xen does
106916 + * not care about the shutdown code.
106917 + */
106918 +#define SHUTDOWN_poweroff 0 /* Domain exited normally. Clean up and kill. */
106919 +#define SHUTDOWN_reboot 1 /* Clean up, kill, and then restart. */
106920 +#define SHUTDOWN_suspend 2 /* Clean up, save suspend info, kill. */
106921 +#define SHUTDOWN_crash 3 /* Tell controller we've crashed. */
106922 +
106923 +#endif /* __XEN_PUBLIC_SCHED_H__ */
106924 +
106925 +/*
106926 + * Local variables:
106927 + * mode: C
106928 + * c-set-style: "BSD"
106929 + * c-basic-offset: 4
106930 + * tab-width: 4
106931 + * indent-tabs-mode: nil
106932 + * End:
106933 + */
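A paravirtualised guest reboots itself by passing a sched_shutdown with SHUTDOWN_reboot to SCHEDOP_shutdown. A standalone sketch of just the argument marshalling, with the hypercall replaced by a print:

/* Sketch: request a reboot via SCHEDOP_shutdown (hypercall stubbed). */
#include <stdio.h>

#define SCHEDOP_shutdown  2
#define SHUTDOWN_poweroff 0
#define SHUTDOWN_reboot   1

struct sched_shutdown {
    unsigned int reason;   /* SHUTDOWN_* */
};

int main(void)
{
    struct sched_shutdown arg = { .reason = SHUTDOWN_reboot };

    /* In the kernel this would go through the guest's sched_op hypercall
     * wrapper; here we only show the argument. */
    printf("sched_op(cmd=%d, reason=%u)\n", SCHEDOP_shutdown, arg.reason);
    return 0;
}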
106934 diff -Nur linux-2.6.16.33-noxen/include/xen/interface/sysctl.h linux-2.6.16.33/include/xen/interface/sysctl.h
106935 --- linux-2.6.16.33-noxen/include/xen/interface/sysctl.h 1970-01-01 00:00:00.000000000 +0000
106936 +++ linux-2.6.16.33/include/xen/interface/sysctl.h 2007-01-08 15:00:55.000000000 +0000
106937 @@ -0,0 +1,169 @@
106938 +/******************************************************************************
106939 + * sysctl.h
106940 + *
106941 + * System management operations. For use by node control stack.
106942 + *
106943 + * Permission is hereby granted, free of charge, to any person obtaining a copy
106944 + * of this software and associated documentation files (the "Software"), to
106945 + * deal in the Software without restriction, including without limitation the
106946 + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
106947 + * sell copies of the Software, and to permit persons to whom the Software is
106948 + * furnished to do so, subject to the following conditions:
106949 + *
106950 + * The above copyright notice and this permission notice shall be included in
106951 + * all copies or substantial portions of the Software.
106952 + *
106953 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
106954 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
106955 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
106956 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
106957 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
106958 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
106959 + * DEALINGS IN THE SOFTWARE.
106960 + *
106961 + * Copyright (c) 2002-2006, K Fraser
106962 + */
106963 +
106964 +#ifndef __XEN_PUBLIC_SYSCTL_H__
106965 +#define __XEN_PUBLIC_SYSCTL_H__
106966 +
106967 +#if !defined(__XEN__) && !defined(__XEN_TOOLS__)
106968 +#error "sysctl operations are intended for use by node control tools only"
106969 +#endif
106970 +
106971 +#include "xen.h"
106972 +#include "domctl.h"
106973 +
106974 +#define XEN_SYSCTL_INTERFACE_VERSION 0x00000002
106975 +
106976 +/*
106977 + * Read console content from Xen buffer ring.
106978 + */
106979 +#define XEN_SYSCTL_readconsole 1
106980 +struct xen_sysctl_readconsole {
106981 + /* IN variables. */
106982 + uint32_t clear; /* Non-zero -> clear after reading. */
106983 + XEN_GUEST_HANDLE(char) buffer; /* Buffer start */
106984 + /* IN/OUT variables. */
106985 + uint32_t count; /* In: Buffer size; Out: Used buffer size */
106986 +};
106987 +typedef struct xen_sysctl_readconsole xen_sysctl_readconsole_t;
106988 +DEFINE_XEN_GUEST_HANDLE(xen_sysctl_readconsole_t);
106989 +
106990 +/* Get trace buffers machine base address */
106991 +#define XEN_SYSCTL_tbuf_op 2
106992 +struct xen_sysctl_tbuf_op {
106993 + /* IN variables */
106994 +#define XEN_SYSCTL_TBUFOP_get_info 0
106995 +#define XEN_SYSCTL_TBUFOP_set_cpu_mask 1
106996 +#define XEN_SYSCTL_TBUFOP_set_evt_mask 2
106997 +#define XEN_SYSCTL_TBUFOP_set_size 3
106998 +#define XEN_SYSCTL_TBUFOP_enable 4
106999 +#define XEN_SYSCTL_TBUFOP_disable 5
107000 + uint32_t cmd;
107001 + /* IN/OUT variables */
107002 + struct xenctl_cpumap cpu_mask;
107003 + uint32_t evt_mask;
107004 + /* OUT variables */
107005 + uint64_t buffer_mfn;
107006 + uint32_t size;
107007 +};
107008 +typedef struct xen_sysctl_tbuf_op xen_sysctl_tbuf_op_t;
107009 +DEFINE_XEN_GUEST_HANDLE(xen_sysctl_tbuf_op_t);
107010 +
107011 +/*
107012 + * Get physical information about the host machine
107013 + */
107014 +#define XEN_SYSCTL_physinfo 3
107015 +struct xen_sysctl_physinfo {
107016 + uint32_t threads_per_core;
107017 + uint32_t cores_per_socket;
107018 + uint32_t sockets_per_node;
107019 + uint32_t nr_nodes;
107020 + uint32_t cpu_khz;
107021 + uint64_t total_pages;
107022 + uint64_t free_pages;
107023 + uint64_t scrub_pages;
107024 + uint32_t hw_cap[8];
107025 +};
107026 +typedef struct xen_sysctl_physinfo xen_sysctl_physinfo_t;
107027 +DEFINE_XEN_GUEST_HANDLE(xen_sysctl_physinfo_t);
107028 +
107029 +/*
107030 + * Get the ID of the current scheduler.
107031 + */
107032 +#define XEN_SYSCTL_sched_id 4
107033 +struct xen_sysctl_sched_id {
107034 + /* OUT variable */
107035 + uint32_t sched_id;
107036 +};
107037 +typedef struct xen_sysctl_sched_id xen_sysctl_sched_id_t;
107038 +DEFINE_XEN_GUEST_HANDLE(xen_sysctl_sched_id_t);
107039 +
107040 +/* Interface for controlling Xen software performance counters. */
107041 +#define XEN_SYSCTL_perfc_op 5
107042 +/* Sub-operations: */
107043 +#define XEN_SYSCTL_PERFCOP_reset 1 /* Reset all counters to zero. */
107044 +#define XEN_SYSCTL_PERFCOP_query 2 /* Get perfctr information. */
107045 +struct xen_sysctl_perfc_desc {
107046 + char name[80]; /* name of perf counter */
107047 + uint32_t nr_vals; /* number of values for this counter */
107048 +};
107049 +typedef struct xen_sysctl_perfc_desc xen_sysctl_perfc_desc_t;
107050 +DEFINE_XEN_GUEST_HANDLE(xen_sysctl_perfc_desc_t);
107051 +typedef uint32_t xen_sysctl_perfc_val_t;
107052 +DEFINE_XEN_GUEST_HANDLE(xen_sysctl_perfc_val_t);
107053 +
107054 +struct xen_sysctl_perfc_op {
107055 + /* IN variables. */
107056 + uint32_t cmd; /* XEN_SYSCTL_PERFCOP_??? */
107057 + /* OUT variables. */
107058 + uint32_t nr_counters; /* number of counter descriptions */
107059 + uint32_t nr_vals; /* number of values */
107060 + /* counter information (or NULL) */
107061 + XEN_GUEST_HANDLE(xen_sysctl_perfc_desc_t) desc;
107062 + /* counter values (or NULL) */
107063 + XEN_GUEST_HANDLE(xen_sysctl_perfc_val_t) val;
107064 +};
107065 +typedef struct xen_sysctl_perfc_op xen_sysctl_perfc_op_t;
107066 +DEFINE_XEN_GUEST_HANDLE(xen_sysctl_perfc_op_t);
107067 +
107068 +#define XEN_SYSCTL_getdomaininfolist 6
107069 +struct xen_sysctl_getdomaininfolist {
107070 + /* IN variables. */
107071 + domid_t first_domain;
107072 + uint32_t max_domains;
107073 + XEN_GUEST_HANDLE(xen_domctl_getdomaininfo_t) buffer;
107074 + /* OUT variables. */
107075 + uint32_t num_domains;
107076 +};
107077 +typedef struct xen_sysctl_getdomaininfolist xen_sysctl_getdomaininfolist_t;
107078 +DEFINE_XEN_GUEST_HANDLE(xen_sysctl_getdomaininfolist_t);
107079 +
107080 +struct xen_sysctl {
107081 + uint32_t cmd;
107082 + uint32_t interface_version; /* XEN_SYSCTL_INTERFACE_VERSION */
107083 + union {
107084 + struct xen_sysctl_readconsole readconsole;
107085 + struct xen_sysctl_tbuf_op tbuf_op;
107086 + struct xen_sysctl_physinfo physinfo;
107087 + struct xen_sysctl_sched_id sched_id;
107088 + struct xen_sysctl_perfc_op perfc_op;
107089 + struct xen_sysctl_getdomaininfolist getdomaininfolist;
107090 + uint8_t pad[128];
107091 + } u;
107092 +};
107093 +typedef struct xen_sysctl xen_sysctl_t;
107094 +DEFINE_XEN_GUEST_HANDLE(xen_sysctl_t);
107095 +
107096 +#endif /* __XEN_PUBLIC_SYSCTL_H__ */
107097 +
107098 +/*
107099 + * Local variables:
107100 + * mode: C
107101 + * c-set-style: "BSD"
107102 + * c-basic-offset: 4
107103 + * tab-width: 4
107104 + * indent-tabs-mode: nil
107105 + * End:
107106 + */
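XEN_SYSCTL_physinfo reports the host topology as threads per core, cores per socket, sockets per node and a node count. A standalone sketch of deriving a logical CPU count and memory size from made-up output values:

/* Sketch: derive totals from a xen_sysctl_physinfo result (illustrative). */
#include <stdint.h>
#include <stdio.h>

struct xen_sysctl_physinfo {
    uint32_t threads_per_core;
    uint32_t cores_per_socket;
    uint32_t sockets_per_node;
    uint32_t nr_nodes;
    uint32_t cpu_khz;
    uint64_t total_pages;
    uint64_t free_pages;
    uint64_t scrub_pages;
    uint32_t hw_cap[8];
};

int main(void)
{
    /* Values a 2-socket, dual-core, hyperthreaded host might report. */
    struct xen_sysctl_physinfo info = {
        .threads_per_core = 2,
        .cores_per_socket = 2,
        .sockets_per_node = 2,
        .nr_nodes         = 1,
        .cpu_khz          = 2400000,
        .total_pages      = 1048576,     /* 4 GiB of 4 KiB pages */
    };

    uint32_t cpus = info.threads_per_core * info.cores_per_socket *
                    info.sockets_per_node * info.nr_nodes;
    printf("%u logical CPUs at %u MHz, %llu MiB RAM\n",
           (unsigned)cpus, (unsigned)(info.cpu_khz / 1000),
           (unsigned long long)(info.total_pages * 4 / 1024));
    return 0;
}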
107107 diff -Nur linux-2.6.16.33-noxen/include/xen/interface/trace.h linux-2.6.16.33/include/xen/interface/trace.h
107108 --- linux-2.6.16.33-noxen/include/xen/interface/trace.h 1970-01-01 00:00:00.000000000 +0000
107109 +++ linux-2.6.16.33/include/xen/interface/trace.h 2007-01-08 15:00:55.000000000 +0000
107110 @@ -0,0 +1,102 @@
107111 +/******************************************************************************
107112 + * include/public/trace.h
107113 + *
107114 + * Permission is hereby granted, free of charge, to any person obtaining a copy
107115 + * of this software and associated documentation files (the "Software"), to
107116 + * deal in the Software without restriction, including without limitation the
107117 + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
107118 + * sell copies of the Software, and to permit persons to whom the Software is
107119 + * furnished to do so, subject to the following conditions:
107120 + *
107121 + * The above copyright notice and this permission notice shall be included in
107122 + * all copies or substantial portions of the Software.
107123 + *
107124 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
107125 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
107126 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
107127 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
107128 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
107129 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
107130 + * DEALINGS IN THE SOFTWARE.
107131 + *
107132 + * Mark Williamson, (C) 2004 Intel Research Cambridge
107133 + * Copyright (C) 2005 Bin Ren
107134 + */
107135 +
107136 +#ifndef __XEN_PUBLIC_TRACE_H__
107137 +#define __XEN_PUBLIC_TRACE_H__
107138 +
107139 +/* Trace classes */
107140 +#define TRC_CLS_SHIFT 16
107141 +#define TRC_GEN 0x0001f000 /* General trace */
107142 +#define TRC_SCHED 0x0002f000 /* Xen Scheduler trace */
107143 +#define TRC_DOM0OP 0x0004f000 /* Xen DOM0 operation trace */
107144 +#define TRC_VMX 0x0008f000 /* Xen VMX trace */
107145 +#define TRC_MEM 0x0010f000 /* Xen memory trace */
107146 +#define TRC_ALL 0xfffff000
107147 +
107148 +/* Trace subclasses */
107149 +#define TRC_SUBCLS_SHIFT 12
107150 +
107151 +/* trace subclasses for VMX */
107152 +#define TRC_VMXEXIT 0x00081000 /* VMX exit trace */
107153 +#define TRC_VMXENTRY 0x00082000 /* VMX entry trace */
107154 +#define TRC_VMXINTR 0x00084000 /* VMX interrupt trace */
107155 +
107156 +/* Trace events per class */
107157 +#define TRC_LOST_RECORDS (TRC_GEN + 1)
107158 +
107159 +#define TRC_SCHED_DOM_ADD (TRC_SCHED + 1)
107160 +#define TRC_SCHED_DOM_REM (TRC_SCHED + 2)
107161 +#define TRC_SCHED_SLEEP (TRC_SCHED + 3)
107162 +#define TRC_SCHED_WAKE (TRC_SCHED + 4)
107163 +#define TRC_SCHED_YIELD (TRC_SCHED + 5)
107164 +#define TRC_SCHED_BLOCK (TRC_SCHED + 6)
107165 +#define TRC_SCHED_SHUTDOWN (TRC_SCHED + 7)
107166 +#define TRC_SCHED_CTL (TRC_SCHED + 8)
107167 +#define TRC_SCHED_ADJDOM (TRC_SCHED + 9)
107168 +#define TRC_SCHED_SWITCH (TRC_SCHED + 10)
107169 +#define TRC_SCHED_S_TIMER_FN (TRC_SCHED + 11)
107170 +#define TRC_SCHED_T_TIMER_FN (TRC_SCHED + 12)
107171 +#define TRC_SCHED_DOM_TIMER_FN (TRC_SCHED + 13)
107172 +#define TRC_SCHED_SWITCH_INFPREV (TRC_SCHED + 14)
107173 +#define TRC_SCHED_SWITCH_INFNEXT (TRC_SCHED + 15)
107174 +
107175 +#define TRC_MEM_PAGE_GRANT_MAP (TRC_MEM + 1)
107176 +#define TRC_MEM_PAGE_GRANT_UNMAP (TRC_MEM + 2)
107177 +#define TRC_MEM_PAGE_GRANT_TRANSFER (TRC_MEM + 3)
107178 +
107179 +/* trace events per subclass */
107180 +#define TRC_VMX_VMEXIT (TRC_VMXEXIT + 1)
107181 +#define TRC_VMX_VMENTRY (TRC_VMXENTRY + 1)
107182 +#define TRC_VMX_INTR (TRC_VMXINTR + 1)
107183 +
107184 +
107185 +/* This structure represents a single trace buffer record. */
107186 +struct t_rec {
107187 + uint64_t cycles; /* cycle counter timestamp */
107188 + uint32_t event; /* event ID */
107189 + unsigned long data[5]; /* event data items */
107190 +};
107191 +
107192 +/*
107193 + * This structure contains the metadata for a single trace buffer. The head
107194 + * field, indexes into an array of struct t_rec's.
107195 + */
107196 +struct t_buf {
107197 + uint32_t cons; /* Next item to be consumed by control tools. */
107198 + uint32_t prod; /* Next item to be produced by Xen. */
107199 + /* 'nr_recs' records follow immediately after the meta-data header. */
107200 +};
107201 +
107202 +#endif /* __XEN_PUBLIC_TRACE_H__ */
107203 +
107204 +/*
107205 + * Local variables:
107206 + * mode: C
107207 + * c-set-style: "BSD"
107208 + * c-basic-offset: 4
107209 + * tab-width: 4
107210 + * indent-tabs-mode: nil
107211 + * End:
107212 + */
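Editorial sketch (not part of the patch): a minimal consumer loop for the trace ring described above. It assumes the buffer has already been mapped by the control tools, that nr_recs is known from the tools' setup code, and that cons/prod behave as free-running counters reduced modulo nr_recs; memory barriers are elided.

#include <xen/interface/trace.h>   /* struct t_buf / struct t_rec, as added above */

static void drain_trace_buffer(struct t_buf *buf, unsigned int nr_recs,
                               void (*handle)(const struct t_rec *rec))
{
    /* 'nr_recs' records follow immediately after the metadata header. */
    struct t_rec *recs = (struct t_rec *)(buf + 1);

    while (buf->cons != buf->prod) {
        handle(&recs[buf->cons % nr_recs]);
        buf->cons++;   /* assumption: free-running counter; Xen owns 'prod' */
    }
}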
107213 diff -Nur linux-2.6.16.33-noxen/include/xen/interface/vcpu.h linux-2.6.16.33/include/xen/interface/vcpu.h
107214 --- linux-2.6.16.33-noxen/include/xen/interface/vcpu.h 1970-01-01 00:00:00.000000000 +0000
107215 +++ linux-2.6.16.33/include/xen/interface/vcpu.h 2007-01-08 15:00:55.000000000 +0000
107216 @@ -0,0 +1,142 @@
107217 +/******************************************************************************
107218 + * vcpu.h
107219 + *
107220 + * VCPU initialisation, query, and hotplug.
107221 + *
107222 + * Permission is hereby granted, free of charge, to any person obtaining a copy
107223 + * of this software and associated documentation files (the "Software"), to
107224 + * deal in the Software without restriction, including without limitation the
107225 + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
107226 + * sell copies of the Software, and to permit persons to whom the Software is
107227 + * furnished to do so, subject to the following conditions:
107228 + *
107229 + * The above copyright notice and this permission notice shall be included in
107230 + * all copies or substantial portions of the Software.
107231 + *
107232 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
107233 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
107234 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
107235 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
107236 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
107237 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
107238 + * DEALINGS IN THE SOFTWARE.
107239 + *
107240 + * Copyright (c) 2005, Keir Fraser <keir@xensource.com>
107241 + */
107242 +
107243 +#ifndef __XEN_PUBLIC_VCPU_H__
107244 +#define __XEN_PUBLIC_VCPU_H__
107245 +
107246 +/*
107247 + * Prototype for this hypercall is:
107248 + * int vcpu_op(int cmd, int vcpuid, void *extra_args)
107249 + * @cmd == VCPUOP_??? (VCPU operation).
107250 + * @vcpuid == VCPU to operate on.
107251 + * @extra_args == Operation-specific extra arguments (NULL if none).
107252 + */
107253 +
107254 +/*
107255 + * Initialise a VCPU. Each VCPU can be initialised only once. A
107256 + * newly-initialised VCPU will not run until it is brought up by VCPUOP_up.
107257 + *
107258 + * @extra_arg == pointer to vcpu_guest_context structure containing initial
107259 + * state for the VCPU.
107260 + */
107261 +#define VCPUOP_initialise 0
107262 +
107263 +/*
107264 + * Bring up a VCPU. This makes the VCPU runnable. This operation will fail
107265 + * if the VCPU has not been initialised (VCPUOP_initialise).
107266 + */
107267 +#define VCPUOP_up 1
107268 +
107269 +/*
107270 + * Bring down a VCPU (i.e., make it non-runnable).
107271 + * There are a few caveats that callers should observe:
107272 + * 1. This operation may return, and VCPU_is_up may return false, before the
107273 + * VCPU stops running (i.e., the command is asynchronous). It is a good
107274 + * idea to ensure that the VCPU has entered a non-critical loop before
107275 + * bringing it down. Alternatively, this operation is guaranteed
107276 + * synchronous if invoked by the VCPU itself.
107277 + * 2. After a VCPU is initialised, there is currently no way to drop all its
107278 + * references to domain memory. Even a VCPU that is down still holds
107279 + * memory references via its pagetable base pointer and GDT. It is good
107280 + * practice to move a VCPU onto an 'idle' or default page table, LDT and
107281 + * GDT before bringing it down.
107282 + */
107283 +#define VCPUOP_down 2
107284 +
107285 +/* Returns 1 if the given VCPU is up. */
107286 +#define VCPUOP_is_up 3
107287 +
107288 +/*
107289 + * Return information about the state and running time of a VCPU.
107290 + * @extra_arg == pointer to vcpu_runstate_info structure.
107291 + */
107292 +#define VCPUOP_get_runstate_info 4
107293 +struct vcpu_runstate_info {
107294 + /* VCPU's current state (RUNSTATE_*). */
107295 + int state;
107296 + /* When was current state entered (system time, ns)? */
107297 + uint64_t state_entry_time;
107298 + /*
107299 + * Time spent in each RUNSTATE_* (ns). The sum of these times is
107300 + * guaranteed not to drift from system time.
107301 + */
107302 + uint64_t time[4];
107303 +};
107304 +typedef struct vcpu_runstate_info vcpu_runstate_info_t;
107305 +DEFINE_XEN_GUEST_HANDLE(vcpu_runstate_info_t);
107306 +
107307 +/* VCPU is currently running on a physical CPU. */
107308 +#define RUNSTATE_running 0
107309 +
107310 +/* VCPU is runnable, but not currently scheduled on any physical CPU. */
107311 +#define RUNSTATE_runnable 1
107312 +
107313 +/* VCPU is blocked (a.k.a. idle). It is therefore not runnable. */
107314 +#define RUNSTATE_blocked 2
107315 +
107316 +/*
107317 + * VCPU is not runnable, but it is not blocked.
107318 + * This is a 'catch all' state for things like hotplug and pauses by the
107319 + * system administrator (or for critical sections in the hypervisor).
107320 + * RUNSTATE_blocked dominates this state (it is the preferred state).
107321 + */
107322 +#define RUNSTATE_offline 3
107323 +
107324 +/*
107325 + * Register a shared memory area from which the guest may obtain its own
107326 + * runstate information without needing to execute a hypercall.
107327 + * Notes:
107328 + * 1. The registered address may be virtual or physical or guest handle,
107329 + * depending on the platform. Virtual address or guest handle should be
107330 + * registered on x86 systems.
107331 + * 2. Only one shared area may be registered per VCPU. The shared area is
107332 + * updated by the hypervisor each time the VCPU is scheduled. Thus
107333 + * runstate.state will always be RUNSTATE_running and
107334 + * runstate.state_entry_time will indicate the system time at which the
107335 + * VCPU was last scheduled to run.
107336 + * @extra_arg == pointer to vcpu_register_runstate_memory_area structure.
107337 + */
107338 +#define VCPUOP_register_runstate_memory_area 5
107339 +struct vcpu_register_runstate_memory_area {
107340 + union {
107341 + XEN_GUEST_HANDLE(vcpu_runstate_info_t) h;
107342 + struct vcpu_runstate_info *v;
107343 + uint64_t p;
107344 + } addr;
107345 +};
107346 +typedef struct vcpu_register_runstate_memory_area vcpu_register_runstate_memory_area_t;
107347 +
107348 +#endif /* __XEN_PUBLIC_VCPU_H__ */
107349 +
107350 +/*
107351 + * Local variables:
107352 + * mode: C
107353 + * c-set-style: "BSD"
107354 + * c-basic-offset: 4
107355 + * tab-width: 4
107356 + * indent-tabs-mode: nil
107357 + * End:
107358 + */
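Editorial sketch (not part of the patch): an illustrative caller for the runstate query above. It assumes the HYPERVISOR_vcpu_op() hypercall wrapper provided elsewhere in this patch, whose prototype matches the comment at the top of the file.

#include <linux/kernel.h>
#include <xen/interface/vcpu.h>

static int report_vcpu_runstate(int vcpuid)
{
    struct vcpu_runstate_info info;
    int rc = HYPERVISOR_vcpu_op(VCPUOP_get_runstate_info, vcpuid, &info);

    if (rc == 0)
        printk(KERN_INFO "vcpu%d: state=%d, running for %llu ns total\n",
               vcpuid, info.state,
               (unsigned long long)info.time[RUNSTATE_running]);
    return rc;
}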
107359 diff -Nur linux-2.6.16.33-noxen/include/xen/interface/version.h linux-2.6.16.33/include/xen/interface/version.h
107360 --- linux-2.6.16.33-noxen/include/xen/interface/version.h 1970-01-01 00:00:00.000000000 +0000
107361 +++ linux-2.6.16.33/include/xen/interface/version.h 2007-01-08 15:00:55.000000000 +0000
107362 @@ -0,0 +1,91 @@
107363 +/******************************************************************************
107364 + * version.h
107365 + *
107366 + * Xen version, type, and compile information.
107367 + *
107368 + * Permission is hereby granted, free of charge, to any person obtaining a copy
107369 + * of this software and associated documentation files (the "Software"), to
107370 + * deal in the Software without restriction, including without limitation the
107371 + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
107372 + * sell copies of the Software, and to permit persons to whom the Software is
107373 + * furnished to do so, subject to the following conditions:
107374 + *
107375 + * The above copyright notice and this permission notice shall be included in
107376 + * all copies or substantial portions of the Software.
107377 + *
107378 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
107379 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
107380 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
107381 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
107382 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
107383 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
107384 + * DEALINGS IN THE SOFTWARE.
107385 + *
107386 + * Copyright (c) 2005, Nguyen Anh Quynh <aquynh@gmail.com>
107387 + * Copyright (c) 2005, Keir Fraser <keir@xensource.com>
107388 + */
107389 +
107390 +#ifndef __XEN_PUBLIC_VERSION_H__
107391 +#define __XEN_PUBLIC_VERSION_H__
107392 +
107393 +/* NB. All ops return zero on success, except XENVER_{version,pagesize} */
107394 +
107395 +/* arg == NULL; returns major:minor (16:16). */
107396 +#define XENVER_version 0
107397 +
107398 +/* arg == xen_extraversion_t. */
107399 +#define XENVER_extraversion 1
107400 +typedef char xen_extraversion_t[16];
107401 +#define XEN_EXTRAVERSION_LEN (sizeof(xen_extraversion_t))
107402 +
107403 +/* arg == xen_compile_info_t. */
107404 +#define XENVER_compile_info 2
107405 +struct xen_compile_info {
107406 + char compiler[64];
107407 + char compile_by[16];
107408 + char compile_domain[32];
107409 + char compile_date[32];
107410 +};
107411 +typedef struct xen_compile_info xen_compile_info_t;
107412 +
107413 +#define XENVER_capabilities 3
107414 +typedef char xen_capabilities_info_t[1024];
107415 +#define XEN_CAPABILITIES_INFO_LEN (sizeof(xen_capabilities_info_t))
107416 +
107417 +#define XENVER_changeset 4
107418 +typedef char xen_changeset_info_t[64];
107419 +#define XEN_CHANGESET_INFO_LEN (sizeof(xen_changeset_info_t))
107420 +
107421 +#define XENVER_platform_parameters 5
107422 +struct xen_platform_parameters {
107423 + unsigned long virt_start;
107424 +};
107425 +typedef struct xen_platform_parameters xen_platform_parameters_t;
107426 +
107427 +#define XENVER_get_features 6
107428 +struct xen_feature_info {
107429 + unsigned int submap_idx; /* IN: which 32-bit submap to return */
107430 + uint32_t submap; /* OUT: 32-bit submap */
107431 +};
107432 +typedef struct xen_feature_info xen_feature_info_t;
107433 +
107434 +/* Declares the features reported by XENVER_get_features. */
107435 +#include "features.h"
107436 +
107437 +/* arg == NULL; returns host memory page size. */
107438 +#define XENVER_pagesize 7
107439 +
107440 +/* arg == xen_domain_handle_t. */
107441 +#define XENVER_guest_handle 8
107442 +
107443 +#endif /* __XEN_PUBLIC_VERSION_H__ */
107444 +
107445 +/*
107446 + * Local variables:
107447 + * mode: C
107448 + * c-set-style: "BSD"
107449 + * c-basic-offset: 4
107450 + * tab-width: 4
107451 + * indent-tabs-mode: nil
107452 + * End:
107453 + */
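Editorial sketch (not part of the patch): decoding the packed result noted above. HYPERVISOR_xen_version() is the hypercall wrapper assumed to be provided elsewhere in this patch.

#include <linux/kernel.h>
#include <xen/interface/version.h>

static void report_xen_version(void)
{
    /* XENVER_version returns major:minor packed 16:16 and takes no argument. */
    int ver = HYPERVISOR_xen_version(XENVER_version, NULL);

    printk(KERN_INFO "running on Xen %d.%d\n", ver >> 16, ver & 0xffff);
}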
107454 diff -Nur linux-2.6.16.33-noxen/include/xen/interface/xen-compat.h linux-2.6.16.33/include/xen/interface/xen-compat.h
107455 --- linux-2.6.16.33-noxen/include/xen/interface/xen-compat.h 1970-01-01 00:00:00.000000000 +0000
107456 +++ linux-2.6.16.33/include/xen/interface/xen-compat.h 2007-01-08 15:00:55.000000000 +0000
107457 @@ -0,0 +1,51 @@
107458 +/******************************************************************************
107459 + * xen-compat.h
107460 + *
107461 + * Guest OS interface to Xen. Compatibility layer.
107462 + *
107463 + * Permission is hereby granted, free of charge, to any person obtaining a copy
107464 + * of this software and associated documentation files (the "Software"), to
107465 + * deal in the Software without restriction, including without limitation the
107466 + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
107467 + * sell copies of the Software, and to permit persons to whom the Software is
107468 + * furnished to do so, subject to the following conditions:
107469 + *
107470 + * The above copyright notice and this permission notice shall be included in
107471 + * all copies or substantial portions of the Software.
107472 + *
107473 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
107474 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
107475 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
107476 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
107477 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
107478 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
107479 + * DEALINGS IN THE SOFTWARE.
107480 + *
107481 + * Copyright (c) 2006, Christian Limpach
107482 + */
107483 +
107484 +#ifndef __XEN_PUBLIC_XEN_COMPAT_H__
107485 +#define __XEN_PUBLIC_XEN_COMPAT_H__
107486 +
107487 +#define __XEN_LATEST_INTERFACE_VERSION__ 0x00030205
107488 +
107489 +#if defined(__XEN__) || defined(__XEN_TOOLS__)
107490 +/* Xen is built with matching headers and implements the latest interface. */
107491 +#define __XEN_INTERFACE_VERSION__ __XEN_LATEST_INTERFACE_VERSION__
107492 +#elif !defined(__XEN_INTERFACE_VERSION__)
107493 +/* Guests which do not specify a version get the legacy interface. */
107494 +#define __XEN_INTERFACE_VERSION__ 0x00000000
107495 +#endif
107496 +
107497 +#if __XEN_INTERFACE_VERSION__ > __XEN_LATEST_INTERFACE_VERSION__
107498 +#error "These header files do not support the requested interface version."
107499 +#endif
107500 +
107501 +/* Fields defined as a Xen guest handle since 0x00030205. */
107502 +#if __XEN_INTERFACE_VERSION__ >= 0x00030205
107503 +#define XEN_GUEST_HANDLE_00030205(type) XEN_GUEST_HANDLE(type)
107504 +#else
107505 +#define XEN_GUEST_HANDLE_00030205(type) type *
107506 +#endif
107507 +
107508 +#endif /* __XEN_PUBLIC_XEN_COMPAT_H__ */
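Editorial sketch (not part of the patch): a guest that wants the handle-based field layout defines the interface version before including any interface header, typically via its build system rather than in a source file.

/* Opt into the 0x00030205 interface before the first interface header. */
#define __XEN_INTERFACE_VERSION__ 0x00030205
#include <xen/interface/xen.h>

/* XEN_GUEST_HANDLE_00030205(void) now expands to a guest handle; without
 * the define above it would expand to a plain 'void *'. */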
107509 diff -Nur linux-2.6.16.33-noxen/include/xen/interface/xen.h linux-2.6.16.33/include/xen/interface/xen.h
107510 --- linux-2.6.16.33-noxen/include/xen/interface/xen.h 1970-01-01 00:00:00.000000000 +0000
107511 +++ linux-2.6.16.33/include/xen/interface/xen.h 2007-01-08 15:00:56.000000000 +0000
107512 @@ -0,0 +1,597 @@
107513 +/******************************************************************************
107514 + * xen.h
107515 + *
107516 + * Guest OS interface to Xen.
107517 + *
107518 + * Permission is hereby granted, free of charge, to any person obtaining a copy
107519 + * of this software and associated documentation files (the "Software"), to
107520 + * deal in the Software without restriction, including without limitation the
107521 + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
107522 + * sell copies of the Software, and to permit persons to whom the Software is
107523 + * furnished to do so, subject to the following conditions:
107524 + *
107525 + * The above copyright notice and this permission notice shall be included in
107526 + * all copies or substantial portions of the Software.
107527 + *
107528 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
107529 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
107530 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
107531 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
107532 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
107533 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
107534 + * DEALINGS IN THE SOFTWARE.
107535 + *
107536 + * Copyright (c) 2004, K A Fraser
107537 + */
107538 +
107539 +#ifndef __XEN_PUBLIC_XEN_H__
107540 +#define __XEN_PUBLIC_XEN_H__
107541 +
107542 +#include "xen-compat.h"
107543 +
107544 +#if defined(__i386__) || defined(__x86_64__)
107545 +#include "arch-x86/xen.h"
107546 +#elif defined(__ia64__)
107547 +#include "arch-ia64.h"
107548 +#elif defined(__powerpc__)
107549 +#include "arch-powerpc.h"
107550 +#else
107551 +#error "Unsupported architecture"
107552 +#endif
107553 +
107554 +/*
107555 + * HYPERCALLS
107556 + */
107557 +
107558 +#define __HYPERVISOR_set_trap_table 0
107559 +#define __HYPERVISOR_mmu_update 1
107560 +#define __HYPERVISOR_set_gdt 2
107561 +#define __HYPERVISOR_stack_switch 3
107562 +#define __HYPERVISOR_set_callbacks 4
107563 +#define __HYPERVISOR_fpu_taskswitch 5
107564 +#define __HYPERVISOR_sched_op_compat 6 /* compat since 0x00030101 */
107565 +#define __HYPERVISOR_platform_op 7
107566 +#define __HYPERVISOR_set_debugreg 8
107567 +#define __HYPERVISOR_get_debugreg 9
107568 +#define __HYPERVISOR_update_descriptor 10
107569 +#define __HYPERVISOR_memory_op 12
107570 +#define __HYPERVISOR_multicall 13
107571 +#define __HYPERVISOR_update_va_mapping 14
107572 +#define __HYPERVISOR_set_timer_op 15
107573 +#define __HYPERVISOR_event_channel_op_compat 16 /* compat since 0x00030202 */
107574 +#define __HYPERVISOR_xen_version 17
107575 +#define __HYPERVISOR_console_io 18
107576 +#define __HYPERVISOR_physdev_op_compat 19 /* compat since 0x00030202 */
107577 +#define __HYPERVISOR_grant_table_op 20
107578 +#define __HYPERVISOR_vm_assist 21
107579 +#define __HYPERVISOR_update_va_mapping_otherdomain 22
107580 +#define __HYPERVISOR_iret 23 /* x86 only */
107581 +#define __HYPERVISOR_vcpu_op 24
107582 +#define __HYPERVISOR_set_segment_base 25 /* x86/64 only */
107583 +#define __HYPERVISOR_mmuext_op 26
107584 +#define __HYPERVISOR_acm_op 27
107585 +#define __HYPERVISOR_nmi_op 28
107586 +#define __HYPERVISOR_sched_op 29
107587 +#define __HYPERVISOR_callback_op 30
107588 +#define __HYPERVISOR_xenoprof_op 31
107589 +#define __HYPERVISOR_event_channel_op 32
107590 +#define __HYPERVISOR_physdev_op 33
107591 +#define __HYPERVISOR_hvm_op 34
107592 +#define __HYPERVISOR_sysctl 35
107593 +#define __HYPERVISOR_domctl 36
107594 +#define __HYPERVISOR_kexec_op 37
107595 +
107596 +/* Architecture-specific hypercall definitions. */
107597 +#define __HYPERVISOR_arch_0 48
107598 +#define __HYPERVISOR_arch_1 49
107599 +#define __HYPERVISOR_arch_2 50
107600 +#define __HYPERVISOR_arch_3 51
107601 +#define __HYPERVISOR_arch_4 52
107602 +#define __HYPERVISOR_arch_5 53
107603 +#define __HYPERVISOR_arch_6 54
107604 +#define __HYPERVISOR_arch_7 55
107605 +
107606 +/*
107607 + * HYPERCALL COMPATIBILITY.
107608 + */
107609 +
107610 +/* New sched_op hypercall introduced in 0x00030101. */
107611 +#if __XEN_INTERFACE_VERSION__ < 0x00030101
107612 +#undef __HYPERVISOR_sched_op
107613 +#define __HYPERVISOR_sched_op __HYPERVISOR_sched_op_compat
107614 +#endif
107615 +
107616 +/* New event-channel and physdev hypercalls introduced in 0x00030202. */
107617 +#if __XEN_INTERFACE_VERSION__ < 0x00030202
107618 +#undef __HYPERVISOR_event_channel_op
107619 +#define __HYPERVISOR_event_channel_op __HYPERVISOR_event_channel_op_compat
107620 +#undef __HYPERVISOR_physdev_op
107621 +#define __HYPERVISOR_physdev_op __HYPERVISOR_physdev_op_compat
107622 +#endif
107623 +
107624 +/* New platform_op hypercall introduced in 0x00030204. */
107625 +#if __XEN_INTERFACE_VERSION__ < 0x00030204
107626 +#define __HYPERVISOR_dom0_op __HYPERVISOR_platform_op
107627 +#endif
107628 +
107629 +/*
107630 + * VIRTUAL INTERRUPTS
107631 + *
107632 + * Virtual interrupts that a guest OS may receive from Xen.
107633 + *
107634 + * In the side comments, 'V.' denotes a per-VCPU VIRQ while 'G.' denotes a
107635 + * global VIRQ. The former can be bound once per VCPU and cannot be re-bound.
107636 + * The latter can be allocated only once per guest: they must initially be
107637 + * allocated to VCPU0 but can subsequently be re-bound.
107638 + */
107639 +#define VIRQ_TIMER 0 /* V. Timebase update, and/or requested timeout. */
107640 +#define VIRQ_DEBUG 1 /* V. Request guest to dump debug info. */
107641 +#define VIRQ_CONSOLE 2 /* G. (DOM0) Bytes received on emergency console. */
107642 +#define VIRQ_DOM_EXC 3 /* G. (DOM0) Exceptional event for some domain. */
107643 +#define VIRQ_TBUF 4 /* G. (DOM0) Trace buffer has records available. */
107644 +#define VIRQ_DEBUGGER 6 /* G. (DOM0) A domain has paused for debugging. */
107645 +#define VIRQ_XENOPROF 7 /* V. XenOprofile interrupt: new sample available */
107646 +
107647 +/* Architecture-specific VIRQ definitions. */
107648 +#define VIRQ_ARCH_0 16
107649 +#define VIRQ_ARCH_1 17
107650 +#define VIRQ_ARCH_2 18
107651 +#define VIRQ_ARCH_3 19
107652 +#define VIRQ_ARCH_4 20
107653 +#define VIRQ_ARCH_5 21
107654 +#define VIRQ_ARCH_6 22
107655 +#define VIRQ_ARCH_7 23
107656 +
107657 +#define NR_VIRQS 24
107658 +
107659 +/*
107660 + * MMU-UPDATE REQUESTS
107661 + *
107662 + * HYPERVISOR_mmu_update() accepts a list of (ptr, val) pairs.
107663 + * A foreigndom (FD) can be specified (or DOMID_SELF for none).
107664 + * Where the FD has some effect, it is described below.
107665 + * ptr[1:0] specifies the appropriate MMU_* command.
107666 + *
107667 + * ptr[1:0] == MMU_NORMAL_PT_UPDATE:
107668 + * Updates an entry in a page table. If updating an L1 table, and the new
107669 + * table entry is valid/present, the mapped frame must belong to the FD, if
107670 + * an FD has been specified. If attempting to map an I/O page then the
107671 + * caller assumes the privilege of the FD.
107672 + * FD == DOMID_IO: Permit /only/ I/O mappings, at the priv level of the caller.
107673 + * FD == DOMID_XEN: Map restricted areas of Xen's heap space.
107674 + * ptr[:2] -- Machine address of the page-table entry to modify.
107675 + * val -- Value to write.
107676 + *
107677 + * ptr[1:0] == MMU_MACHPHYS_UPDATE:
107678 + * Updates an entry in the machine->pseudo-physical mapping table.
107679 + * ptr[:2] -- Machine address within the frame whose mapping to modify.
107680 + * The frame must belong to the FD, if one is specified.
107681 + * val -- Value to write into the mapping entry.
107682 + */
107683 +#define MMU_NORMAL_PT_UPDATE 0 /* checked '*ptr = val'. ptr is MA. */
107684 +#define MMU_MACHPHYS_UPDATE 1 /* ptr = MA of frame to modify entry for */
107685 +
107686 +/*
107687 + * MMU EXTENDED OPERATIONS
107688 + *
107689 + * HYPERVISOR_mmuext_op() accepts a list of mmuext_op structures.
107690 + * A foreigndom (FD) can be specified (or DOMID_SELF for none).
107691 + * Where the FD has some effect, it is described below.
107692 + *
107693 + * cmd: MMUEXT_(UN)PIN_*_TABLE
107694 + * mfn: Machine frame number to be (un)pinned as a p.t. page.
107695 + * The frame must belong to the FD, if one is specified.
107696 + *
107697 + * cmd: MMUEXT_NEW_BASEPTR
107698 + * mfn: Machine frame number of new page-table base to install in MMU.
107699 + *
107700 + * cmd: MMUEXT_NEW_USER_BASEPTR [x86/64 only]
107701 + * mfn: Machine frame number of new page-table base to install in MMU
107702 + * when in user space.
107703 + *
107704 + * cmd: MMUEXT_TLB_FLUSH_LOCAL
107705 + * No additional arguments. Flushes local TLB.
107706 + *
107707 + * cmd: MMUEXT_INVLPG_LOCAL
107708 + * linear_addr: Linear address to be flushed from the local TLB.
107709 + *
107710 + * cmd: MMUEXT_TLB_FLUSH_MULTI
107711 + * vcpumask: Pointer to bitmap of VCPUs to be flushed.
107712 + *
107713 + * cmd: MMUEXT_INVLPG_MULTI
107714 + * linear_addr: Linear address to be flushed.
107715 + * vcpumask: Pointer to bitmap of VCPUs to be flushed.
107716 + *
107717 + * cmd: MMUEXT_TLB_FLUSH_ALL
107718 + * No additional arguments. Flushes all VCPUs' TLBs.
107719 + *
107720 + * cmd: MMUEXT_INVLPG_ALL
107721 + * linear_addr: Linear address to be flushed from all VCPUs' TLBs.
107722 + *
107723 + * cmd: MMUEXT_FLUSH_CACHE
107724 + * No additional arguments. Writes back and flushes cache contents.
107725 + *
107726 + * cmd: MMUEXT_SET_LDT
107727 + * linear_addr: Linear address of LDT base (NB. must be page-aligned).
107728 + * nr_ents: Number of entries in LDT.
107729 + */
107730 +#define MMUEXT_PIN_L1_TABLE 0
107731 +#define MMUEXT_PIN_L2_TABLE 1
107732 +#define MMUEXT_PIN_L3_TABLE 2
107733 +#define MMUEXT_PIN_L4_TABLE 3
107734 +#define MMUEXT_UNPIN_TABLE 4
107735 +#define MMUEXT_NEW_BASEPTR 5
107736 +#define MMUEXT_TLB_FLUSH_LOCAL 6
107737 +#define MMUEXT_INVLPG_LOCAL 7
107738 +#define MMUEXT_TLB_FLUSH_MULTI 8
107739 +#define MMUEXT_INVLPG_MULTI 9
107740 +#define MMUEXT_TLB_FLUSH_ALL 10
107741 +#define MMUEXT_INVLPG_ALL 11
107742 +#define MMUEXT_FLUSH_CACHE 12
107743 +#define MMUEXT_SET_LDT 13
107744 +#define MMUEXT_NEW_USER_BASEPTR 15
107745 +
107746 +#ifndef __ASSEMBLY__
107747 +struct mmuext_op {
107748 + unsigned int cmd;
107749 + union {
107750 + /* [UN]PIN_TABLE, NEW_BASEPTR, NEW_USER_BASEPTR */
107751 + xen_pfn_t mfn;
107752 + /* INVLPG_LOCAL, INVLPG_ALL, SET_LDT */
107753 + unsigned long linear_addr;
107754 + } arg1;
107755 + union {
107756 + /* SET_LDT */
107757 + unsigned int nr_ents;
107758 + /* TLB_FLUSH_MULTI, INVLPG_MULTI */
107759 + XEN_GUEST_HANDLE_00030205(void) vcpumask;
107760 + } arg2;
107761 +};
107762 +typedef struct mmuext_op mmuext_op_t;
107763 +DEFINE_XEN_GUEST_HANDLE(mmuext_op_t);
107764 +#endif
107765 +
107766 +/* These are passed as 'flags' to update_va_mapping. They can be ORed. */
107767 +/* When specifying UVMF_MULTI, also OR in a pointer to a CPU bitmap. */
107768 +/* UVMF_LOCAL is merely UVMF_MULTI with a NULL bitmap pointer. */
107769 +#define UVMF_NONE (0UL<<0) /* No flushing at all. */
107770 +#define UVMF_TLB_FLUSH (1UL<<0) /* Flush entire TLB(s). */
107771 +#define UVMF_INVLPG (2UL<<0) /* Flush only one entry. */
107772 +#define UVMF_FLUSHTYPE_MASK (3UL<<0)
107773 +#define UVMF_MULTI (0UL<<2) /* Flush subset of TLBs. */
107774 +#define UVMF_LOCAL (0UL<<2) /* Flush local TLB. */
107775 +#define UVMF_ALL (1UL<<2) /* Flush all TLBs. */
107776 +
107777 +/*
107778 + * Commands to HYPERVISOR_console_io().
107779 + */
107780 +#define CONSOLEIO_write 0
107781 +#define CONSOLEIO_read 1
107782 +
107783 +/*
107784 + * Commands to HYPERVISOR_vm_assist().
107785 + */
107786 +#define VMASST_CMD_enable 0
107787 +#define VMASST_CMD_disable 1
107788 +
107789 +/* x86/32 guests: simulate full 4GB segment limits. */
107790 +#define VMASST_TYPE_4gb_segments 0
107791 +
107792 +/* x86/32 guests: trap (vector 15) whenever above vmassist is used. */
107793 +#define VMASST_TYPE_4gb_segments_notify 1
107794 +
107795 +/*
107796 + * x86 guests: support writes to bottom-level PTEs.
107797 + * NB1. Page-directory entries cannot be written.
107798 + * NB2. Guest must continue to remove all writable mappings of PTEs.
107799 + */
107800 +#define VMASST_TYPE_writable_pagetables 2
107801 +
107802 +/* x86/PAE guests: support PDPTs above 4GB. */
107803 +#define VMASST_TYPE_pae_extended_cr3 3
107804 +
107805 +#define MAX_VMASST_TYPE 3
107806 +
107807 +#ifndef __ASSEMBLY__
107808 +
107809 +typedef uint16_t domid_t;
107810 +
107811 +/* Domain ids >= DOMID_FIRST_RESERVED cannot be used for ordinary domains. */
107812 +#define DOMID_FIRST_RESERVED (0x7FF0U)
107813 +
107814 +/* DOMID_SELF is used in certain contexts to refer to oneself. */
107815 +#define DOMID_SELF (0x7FF0U)
107816 +
107817 +/*
107818 + * DOMID_IO is used to restrict page-table updates to mapping I/O memory.
107819 + * Although no Foreign Domain need be specified to map I/O pages, DOMID_IO
107820 + * is useful to ensure that no mappings to the OS's own heap are accidentally
107821 + * installed. (e.g., in Linux this could cause havoc as reference counts
107822 + * aren't adjusted on the I/O-mapping code path).
107823 + * This only makes sense in MMUEXT_SET_FOREIGNDOM, but in that context can
107824 + * be specified by any calling domain.
107825 + */
107826 +#define DOMID_IO (0x7FF1U)
107827 +
107828 +/*
107829 + * DOMID_XEN is used to allow privileged domains to map restricted parts of
107830 + * Xen's heap space (e.g., the machine_to_phys table).
107831 + * This only makes sense in MMUEXT_SET_FOREIGNDOM, and is only permitted if
107832 + * the caller is privileged.
107833 + */
107834 +#define DOMID_XEN (0x7FF2U)
107835 +
107836 +/*
107837 + * Send an array of these to HYPERVISOR_mmu_update().
107838 + * NB. The fields are natural pointer/address size for this architecture.
107839 + */
107840 +struct mmu_update {
107841 + uint64_t ptr; /* Machine address of PTE. */
107842 + uint64_t val; /* New contents of PTE. */
107843 +};
107844 +typedef struct mmu_update mmu_update_t;
107845 +DEFINE_XEN_GUEST_HANDLE(mmu_update_t);
107846 +
107847 +/*
107848 + * Send an array of these to HYPERVISOR_multicall().
107849 + * NB. The fields are natural register size for this architecture.
107850 + */
107851 +struct multicall_entry {
107852 + unsigned long op, result;
107853 + unsigned long args[6];
107854 +};
107855 +typedef struct multicall_entry multicall_entry_t;
107856 +DEFINE_XEN_GUEST_HANDLE(multicall_entry_t);
107857 +
107858 +/*
107859 + * Event channel endpoints per domain:
107860 + * 1024 if a long is 32 bits; 4096 if a long is 64 bits.
107861 + */
107862 +#define NR_EVENT_CHANNELS (sizeof(unsigned long) * sizeof(unsigned long) * 64)
107863 +
107864 +struct vcpu_time_info {
107865 + /*
107866 + * Updates to the following values are preceded and followed by an
107867 + * increment of 'version'. The guest can therefore detect updates by
107868 + * looking for changes to 'version'. If the least-significant bit of
107869 + * the version number is set then an update is in progress and the guest
107870 + * must wait to read a consistent set of values.
107871 + * The correct way to interact with the version number is similar to
107872 + * Linux's seqlock: see the implementations of read_seqbegin/read_seqretry.
107873 + */
107874 + uint32_t version;
107875 + uint32_t pad0;
107876 + uint64_t tsc_timestamp; /* TSC at last update of time vals. */
107877 + uint64_t system_time; /* Time, in nanosecs, since boot. */
107878 + /*
107879 + * Current system time:
107880 + * system_time +
107881 + * ((((tsc - tsc_timestamp) << tsc_shift) * tsc_to_system_mul) >> 32)
107882 + * CPU frequency (Hz):
107883 + * ((10^9 << 32) / tsc_to_system_mul) >> tsc_shift
107884 + */
107885 + uint32_t tsc_to_system_mul;
107886 + int8_t tsc_shift;
107887 + int8_t pad1[3];
107888 +}; /* 32 bytes */
107889 +typedef struct vcpu_time_info vcpu_time_info_t;
107890 +
107891 +struct vcpu_info {
107892 + /*
107893 + * 'evtchn_upcall_pending' is written non-zero by Xen to indicate
107894 + * a pending notification for a particular VCPU. It is then cleared
107895 + * by the guest OS /before/ checking for pending work, thus avoiding
107896 + * a set-and-check race. Note that the mask is only accessed by Xen
107897 + * on the CPU that is currently hosting the VCPU. This means that the
107898 + * pending and mask flags can be updated by the guest without special
107899 + * synchronisation (i.e., no need for the x86 LOCK prefix).
107900 + * This may seem suboptimal because if the pending flag is set by
107901 + * a different CPU then an IPI may be scheduled even when the mask
107902 + * is set. However, note:
107903 + * 1. The task of 'interrupt holdoff' is covered by the per-event-
107904 + * channel mask bits. A 'noisy' event that is continually being
107905 + * triggered can be masked at source at this very precise
107906 + * granularity.
107907 + * 2. The main purpose of the per-VCPU mask is therefore to restrict
107908 + * reentrant execution: whether for concurrency control, or to
107909 + * prevent unbounded stack usage. Whatever the purpose, we expect
107910 + * that the mask will be asserted only for short periods at a time,
107911 + * and so the likelihood of a 'spurious' IPI is suitably small.
107912 + * The mask is read before making an event upcall to the guest: a
107913 + * non-zero mask therefore guarantees that the VCPU will not receive
107914 + * an upcall activation. The mask is cleared when the VCPU requests
107915 + * to block: this avoids wakeup-waiting races.
107916 + */
107917 + uint8_t evtchn_upcall_pending;
107918 + uint8_t evtchn_upcall_mask;
107919 + unsigned long evtchn_pending_sel;
107920 + struct arch_vcpu_info arch;
107921 + struct vcpu_time_info time;
107922 +}; /* 64 bytes (x86) */
107923 +typedef struct vcpu_info vcpu_info_t;
107924 +
107925 +/*
107926 + * Xen/kernel shared data -- pointer provided in start_info.
107927 + *
107928 + * This structure is defined to be both smaller than a page, and the
107929 + * only data on the shared page, but may vary in actual size even within
107930 + * compatible Xen versions; guests should not rely on the size
107931 + * of this structure remaining constant.
107932 + */
107933 +struct shared_info {
107934 + struct vcpu_info vcpu_info[MAX_VIRT_CPUS];
107935 +
107936 + /*
107937 + * A domain can create "event channels" on which it can send and receive
107938 + * asynchronous event notifications. There are three classes of event that
107939 + * are delivered by this mechanism:
107940 + * 1. Bi-directional inter- and intra-domain connections. Domains must
107941 + * arrange out-of-band to set up a connection (usually by allocating
107942 + * an unbound 'listener' port and advertising that via a storage service
107943 + * such as xenstore).
107944 + * 2. Physical interrupts. A domain with suitable hardware-access
107945 + * privileges can bind an event-channel port to a physical interrupt
107946 + * source.
107947 + * 3. Virtual interrupts ('events'). A domain can bind an event-channel
107948 + * port to a virtual interrupt source, such as the virtual-timer
107949 + * device or the emergency console.
107950 + *
107951 + * Event channels are addressed by a "port index". Each channel is
107952 + * associated with two bits of information:
107953 + * 1. PENDING -- notifies the domain that there is a pending notification
107954 + * to be processed. This bit is cleared by the guest.
107955 + * 2. MASK -- if this bit is clear then a 0->1 transition of PENDING
107956 + * will cause an asynchronous upcall to be scheduled. This bit is only
107957 + * updated by the guest. It is read-only within Xen. If a channel
107958 + * becomes pending while the channel is masked then the 'edge' is lost
107959 + * (i.e., when the channel is unmasked, the guest must manually handle
107960 + * pending notifications as no upcall will be scheduled by Xen).
107961 + *
107962 + * To expedite scanning of pending notifications, any 0->1 pending
107963 + * transition on an unmasked channel causes a corresponding bit in a
107964 + * per-vcpu selector word to be set. Each bit in the selector covers a
107965 + * 'C long' in the PENDING bitfield array.
107966 + */
107967 + unsigned long evtchn_pending[sizeof(unsigned long) * 8];
107968 + unsigned long evtchn_mask[sizeof(unsigned long) * 8];
107969 +
107970 + /*
107971 + * Wallclock time: updated only by control software. Guests should base
107972 + * their gettimeofday() syscall on this wallclock-base value.
107973 + */
107974 + uint32_t wc_version; /* Version counter: see vcpu_time_info_t. */
107975 + uint32_t wc_sec; /* Secs 00:00:00 UTC, Jan 1, 1970. */
107976 + uint32_t wc_nsec; /* Nsecs 00:00:00 UTC, Jan 1, 1970. */
107977 +
107978 + struct arch_shared_info arch;
107979 +
107980 +};
107981 +typedef struct shared_info shared_info_t;
107982 +
107983 +/*
107984 + * Start-of-day memory layout for the initial domain (DOM0):
107985 + * 1. The domain is started within a contiguous virtual-memory region.
107986 + * 2. The contiguous region begins and ends on an aligned 4MB boundary.
107987 + * 3. The region start corresponds to the load address of the OS image.
107988 + * If the load address is not 4MB aligned then the address is rounded down.
107989 + * 4. This is the order of bootstrap elements in the initial virtual region:
107990 + * a. relocated kernel image
107991 + * b. initial ram disk [mod_start, mod_len]
107992 + * c. list of allocated page frames [mfn_list, nr_pages]
107993 + * d. start_info_t structure [register ESI (x86)]
107994 + * e. bootstrap page tables [pt_base, CR3 (x86)]
107995 + * f. bootstrap stack [register ESP (x86)]
107996 + * 5. Bootstrap elements are packed together, but each is 4kB-aligned.
107997 + * 6. The initial ram disk may be omitted.
107998 + * 7. The list of page frames forms a contiguous 'pseudo-physical' memory
107999 + * layout for the domain. In particular, the bootstrap virtual-memory
108000 + * region is a 1:1 mapping to the first section of the pseudo-physical map.
108001 + * 8. All bootstrap elements are mapped read-writable for the guest OS. The
108002 + * only exception is the bootstrap page table, which is mapped read-only.
108003 + * 9. There is guaranteed to be at least 512kB padding after the final
108004 + * bootstrap element. If necessary, the bootstrap virtual region is
108005 + * extended by an extra 4MB to ensure this.
108006 + */
108007 +
108008 +#define MAX_GUEST_CMDLINE 1024
108009 +struct start_info {
108010 + /* THE FOLLOWING ARE FILLED IN BOTH ON INITIAL BOOT AND ON RESUME. */
108011 + char magic[32]; /* "xen-<version>-<platform>". */
108012 + unsigned long nr_pages; /* Total pages allocated to this domain. */
108013 + unsigned long shared_info; /* MACHINE address of shared info struct. */
108014 + uint32_t flags; /* SIF_xxx flags. */
108015 + xen_pfn_t store_mfn; /* MACHINE page number of shared page. */
108016 + uint32_t store_evtchn; /* Event channel for store communication. */
108017 + union {
108018 + struct {
108019 + xen_pfn_t mfn; /* MACHINE page number of console page. */
108020 + uint32_t evtchn; /* Event channel for console page. */
108021 + } domU;
108022 + struct {
108023 + uint32_t info_off; /* Offset of console_info struct. */
108024 + uint32_t info_size; /* Size of console_info struct from start.*/
108025 + } dom0;
108026 + } console;
108027 + /* THE FOLLOWING ARE ONLY FILLED IN ON INITIAL BOOT (NOT RESUME). */
108028 + unsigned long pt_base; /* VIRTUAL address of page directory. */
108029 + unsigned long nr_pt_frames; /* Number of bootstrap p.t. frames. */
108030 + unsigned long mfn_list; /* VIRTUAL address of page-frame list. */
108031 + unsigned long mod_start; /* VIRTUAL address of pre-loaded module. */
108032 + unsigned long mod_len; /* Size (bytes) of pre-loaded module. */
108033 + int8_t cmd_line[MAX_GUEST_CMDLINE];
108034 +};
108035 +typedef struct start_info start_info_t;
108036 +
108037 +/* New console union for dom0 introduced in 0x00030203. */
108038 +#if __XEN_INTERFACE_VERSION__ < 0x00030203
108039 +#define console_mfn console.domU.mfn
108040 +#define console_evtchn console.domU.evtchn
108041 +#endif
108042 +
108043 +/* These flags are passed in the 'flags' field of start_info_t. */
108044 +#define SIF_PRIVILEGED (1<<0) /* Is the domain privileged? */
108045 +#define SIF_INITDOMAIN (1<<1) /* Is this the initial control domain? */
108046 +
108047 +typedef struct dom0_vga_console_info {
108048 + uint8_t video_type; /* DOM0_VGA_CONSOLE_??? */
108049 +#define XEN_VGATYPE_TEXT_MODE_3 0x03
108050 +#define XEN_VGATYPE_VESA_LFB 0x23
108051 +
108052 + union {
108053 + struct {
108054 + /* Font height, in pixels. */
108055 + uint16_t font_height;
108056 + /* Cursor location (column, row). */
108057 + uint16_t cursor_x, cursor_y;
108058 + /* Number of rows and columns (dimensions in characters). */
108059 + uint16_t rows, columns;
108060 + } text_mode_3;
108061 +
108062 + struct {
108063 + /* Width and height, in pixels. */
108064 + uint16_t width, height;
108065 + /* Bytes per scan line. */
108066 + uint16_t bytes_per_line;
108067 + /* Bits per pixel. */
108068 + uint16_t bits_per_pixel;
108069 + /* LFB physical address, and size (in units of 64kB). */
108070 + uint32_t lfb_base;
108071 + uint32_t lfb_size;
108072 + /* RGB mask offsets and sizes, as defined by VBE 1.2+ */
108073 + uint8_t red_pos, red_size;
108074 + uint8_t green_pos, green_size;
108075 + uint8_t blue_pos, blue_size;
108076 + uint8_t rsvd_pos, rsvd_size;
108077 + } vesa_lfb;
108078 + } u;
108079 +} dom0_vga_console_info_t;
108080 +
108081 +typedef uint8_t xen_domain_handle_t[16];
108082 +
108083 +/* Turn a plain number into a C unsigned long constant. */
108084 +#define __mk_unsigned_long(x) x ## UL
108085 +#define mk_unsigned_long(x) __mk_unsigned_long(x)
108086 +
108087 +DEFINE_XEN_GUEST_HANDLE(uint8_t);
108088 +DEFINE_XEN_GUEST_HANDLE(uint16_t);
108089 +DEFINE_XEN_GUEST_HANDLE(uint32_t);
108090 +DEFINE_XEN_GUEST_HANDLE(uint64_t);
108091 +
108092 +#else /* __ASSEMBLY__ */
108093 +
108094 +/* In assembly code we cannot use C numeric constant suffixes. */
108095 +#define mk_unsigned_long(x) x
108096 +
108097 +#endif /* !__ASSEMBLY__ */
108098 +
108099 +#endif /* __XEN_PUBLIC_XEN_H__ */
108100 +
108101 +/*
108102 + * Local variables:
108103 + * mode: C
108104 + * c-set-style: "BSD"
108105 + * c-basic-offset: 4
108106 + * tab-width: 4
108107 + * indent-tabs-mode: nil
108108 + * End:
108109 + */
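Editorial sketch (not part of the patch): the seqlock-style read protocol that the vcpu_time_info comment above describes. HYPERVISOR_shared_info and rmb() are assumed to be provided by the rest of the patched kernel; the TSC extrapolation documented in the header is left out for brevity.

#include <asm/system.h>             /* rmb() in 2.6.16 */
#include <xen/interface/xen.h>

static uint64_t snapshot_system_time(unsigned int cpu)
{
    struct vcpu_time_info *t = &HYPERVISOR_shared_info->vcpu_info[cpu].time;
    uint32_t ver;
    uint64_t system_time;

    do {
        ver = t->version;
        rmb();                      /* read the values only after the version */
        system_time = t->system_time;
        rmb();                      /* then re-check the version */
    } while ((ver & 1) || (t->version != ver)); /* odd => update in progress */

    return system_time;             /* callers would add the scaled TSC delta */
}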
108110 diff -Nur linux-2.6.16.33-noxen/include/xen/interface/xencomm.h linux-2.6.16.33/include/xen/interface/xencomm.h
108111 --- linux-2.6.16.33-noxen/include/xen/interface/xencomm.h 1970-01-01 00:00:00.000000000 +0000
108112 +++ linux-2.6.16.33/include/xen/interface/xencomm.h 2007-01-08 15:00:56.000000000 +0000
108113 @@ -0,0 +1,41 @@
108114 +/*
108115 + * Permission is hereby granted, free of charge, to any person obtaining a copy
108116 + * of this software and associated documentation files (the "Software"), to
108117 + * deal in the Software without restriction, including without limitation the
108118 + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
108119 + * sell copies of the Software, and to permit persons to whom the Software is
108120 + * furnished to do so, subject to the following conditions:
108121 + *
108122 + * The above copyright notice and this permission notice shall be included in
108123 + * all copies or substantial portions of the Software.
108124 + *
108125 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
108126 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
108127 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
108128 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
108129 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
108130 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
108131 + * DEALINGS IN THE SOFTWARE.
108132 + *
108133 + * Copyright (C) IBM Corp. 2006
108134 + */
108135 +
108136 +#ifndef _XEN_XENCOMM_H_
108137 +#define _XEN_XENCOMM_H_
108138 +
108139 +/* A xencomm descriptor is a scatter/gather list containing physical
108140 + * addresses corresponding to a virtually contiguous memory area. The
108141 + * hypervisor translates these physical addresses to machine addresses to copy
108142 + * to and from the virtually contiguous area.
108143 + */
108144 +
108145 +#define XENCOMM_MAGIC 0x58434F4D /* 'XCOM' */
108146 +#define XENCOMM_INVALID (~0UL)
108147 +
108148 +struct xencomm_desc {
108149 + uint32_t magic;
108150 + uint32_t nr_addrs; /* the number of entries in address[] */
108151 + uint64_t address[0];
108152 +};
108153 +
108154 +#endif /* _XEN_XENCOMM_H_ */
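Editorial sketch (not part of the patch): filling in the descriptor above for a page-aligned, virtually contiguous kernel buffer. The real xencomm support code also copes with unaligned buffers and bounds-checks nr_addrs.

#include <asm/page.h>
#include <asm/io.h>                 /* virt_to_phys() */
#include <xen/interface/xencomm.h>

static void xencomm_fill(struct xencomm_desc *desc, void *buf,
                         unsigned long bytes)
{
    unsigned long i, pages = (bytes + PAGE_SIZE - 1) / PAGE_SIZE;

    desc->magic = XENCOMM_MAGIC;
    desc->nr_addrs = pages;
    for (i = 0; i < pages; i++)
        desc->address[i] = virt_to_phys((char *)buf + i * PAGE_SIZE);
}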
108155 diff -Nur linux-2.6.16.33-noxen/include/xen/interface/xenoprof.h linux-2.6.16.33/include/xen/interface/xenoprof.h
108156 --- linux-2.6.16.33-noxen/include/xen/interface/xenoprof.h 1970-01-01 00:00:00.000000000 +0000
108157 +++ linux-2.6.16.33/include/xen/interface/xenoprof.h 2007-01-08 15:00:56.000000000 +0000
108158 @@ -0,0 +1,130 @@
108159 +/******************************************************************************
108160 + * xenoprof.h
108161 + *
108162 + * Interface for enabling system wide profiling based on hardware performance
108163 + * counters
108164 + *
108165 + * Permission is hereby granted, free of charge, to any person obtaining a copy
108166 + * of this software and associated documentation files (the "Software"), to
108167 + * deal in the Software without restriction, including without limitation the
108168 + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
108169 + * sell copies of the Software, and to permit persons to whom the Software is
108170 + * furnished to do so, subject to the following conditions:
108171 + *
108172 + * The above copyright notice and this permission notice shall be included in
108173 + * all copies or substantial portions of the Software.
108174 + *
108175 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
108176 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
108177 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
108178 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
108179 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
108180 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
108181 + * DEALINGS IN THE SOFTWARE.
108182 + *
108183 + * Copyright (C) 2005 Hewlett-Packard Co.
108184 + * Written by Aravind Menon & Jose Renato Santos
108185 + */
108186 +
108187 +#ifndef __XEN_PUBLIC_XENOPROF_H__
108188 +#define __XEN_PUBLIC_XENOPROF_H__
108189 +
108190 +#include "xen.h"
108191 +
108192 +/*
108193 + * Commands to HYPERVISOR_xenoprof_op().
108194 + */
108195 +#define XENOPROF_init 0
108196 +#define XENOPROF_reset_active_list 1
108197 +#define XENOPROF_reset_passive_list 2
108198 +#define XENOPROF_set_active 3
108199 +#define XENOPROF_set_passive 4
108200 +#define XENOPROF_reserve_counters 5
108201 +#define XENOPROF_counter 6
108202 +#define XENOPROF_setup_events 7
108203 +#define XENOPROF_enable_virq 8
108204 +#define XENOPROF_start 9
108205 +#define XENOPROF_stop 10
108206 +#define XENOPROF_disable_virq 11
108207 +#define XENOPROF_release_counters 12
108208 +#define XENOPROF_shutdown 13
108209 +#define XENOPROF_get_buffer 14
108210 +#define XENOPROF_last_op 14
108211 +
108212 +#define MAX_OPROF_EVENTS 32
108213 +#define MAX_OPROF_DOMAINS 25
108214 +#define XENOPROF_CPU_TYPE_SIZE 64
108215 +
108216 +/* Xenoprof performance events (not Xen events) */
108217 +struct event_log {
108218 + uint64_t eip;
108219 + uint8_t mode;
108220 + uint8_t event;
108221 +};
108222 +
108223 +/* Xenoprof buffer shared between Xen and domain - 1 per VCPU */
108224 +struct xenoprof_buf {
108225 + uint32_t event_head;
108226 + uint32_t event_tail;
108227 + uint32_t event_size;
108228 + uint32_t vcpu_id;
108229 + uint64_t xen_samples;
108230 + uint64_t kernel_samples;
108231 + uint64_t user_samples;
108232 + uint64_t lost_samples;
108233 + struct event_log event_log[1];
108234 +};
108235 +typedef struct xenoprof_buf xenoprof_buf_t;
108236 +DEFINE_XEN_GUEST_HANDLE(xenoprof_buf_t);
108237 +
108238 +struct xenoprof_init {
108239 + int32_t num_events;
108240 + int32_t is_primary;
108241 + char cpu_type[XENOPROF_CPU_TYPE_SIZE];
108242 +};
108243 +typedef struct xenoprof_init xenoprof_init_t;
108244 +DEFINE_XEN_GUEST_HANDLE(xenoprof_init_t);
108245 +
108246 +struct xenoprof_get_buffer {
108247 + int32_t max_samples;
108248 + int32_t nbuf;
108249 + int32_t bufsize;
108250 + uint64_t buf_gmaddr;
108251 +};
108252 +typedef struct xenoprof_get_buffer xenoprof_get_buffer_t;
108253 +DEFINE_XEN_GUEST_HANDLE(xenoprof_get_buffer_t);
108254 +
108255 +struct xenoprof_counter {
108256 + uint32_t ind;
108257 + uint64_t count;
108258 + uint32_t enabled;
108259 + uint32_t event;
108260 + uint32_t hypervisor;
108261 + uint32_t kernel;
108262 + uint32_t user;
108263 + uint64_t unit_mask;
108264 +};
108265 +typedef struct xenoprof_counter xenoprof_counter_t;
108266 +DEFINE_XEN_GUEST_HANDLE(xenoprof_counter_t);
108267 +
108268 +typedef struct xenoprof_passive {
108269 + uint16_t domain_id;
108270 + int32_t max_samples;
108271 + int32_t nbuf;
108272 + int32_t bufsize;
108273 + uint64_t buf_gmaddr;
108274 +} xenoprof_passive_t;
108275 +DEFINE_XEN_GUEST_HANDLE(xenoprof_passive_t);
108276 +
108277 +
108278 +#endif /* __XEN_PUBLIC_XENOPROF_H__ */
108279 +
108280 +/*
108281 + * Local variables:
108282 + * mode: C
108283 + * c-set-style: "BSD"
108284 + * c-basic-offset: 4
108285 + * tab-width: 4
108286 + * indent-tabs-mode: nil
108287 + * End:
108288 + */
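Editorial sketch (not part of the patch): a consumer loop for one per-VCPU sample buffer. It assumes event_head/event_tail are indices into the event_log array of event_size entries; the barriers used by the real oprofile glue are omitted.

#include <xen/interface/xenoprof.h>

static void drain_xenoprof_samples(struct xenoprof_buf *buf,
                                   void (*handle)(const struct event_log *s))
{
    while (buf->event_tail != buf->event_head) {
        handle(&buf->event_log[buf->event_tail]);
        buf->event_tail = (buf->event_tail + 1) % buf->event_size;
    }
}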
108289 diff -Nur linux-2.6.16.33-noxen/include/xen/pcifront.h linux-2.6.16.33/include/xen/pcifront.h
108290 --- linux-2.6.16.33-noxen/include/xen/pcifront.h 1970-01-01 00:00:00.000000000 +0000
108291 +++ linux-2.6.16.33/include/xen/pcifront.h 2007-01-08 15:00:46.000000000 +0000
108292 @@ -0,0 +1,77 @@
108293 +/*
108294 + * PCI Frontend - arch-dependent declarations
108295 + *
108296 + * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
108297 + */
108298 +#ifndef __XEN_ASM_PCIFRONT_H__
108299 +#define __XEN_ASM_PCIFRONT_H__
108300 +
108301 +#include <linux/config.h>
108302 +#include <linux/spinlock.h>
108303 +
108304 +#ifdef __KERNEL__
108305 +
108306 +#ifndef __ia64__
108307 +
108308 +struct pcifront_device;
108309 +struct pci_bus;
108310 +
108311 +struct pcifront_sd {
108312 + int domain;
108313 + struct pcifront_device *pdev;
108314 +};
108315 +
108316 +static inline struct pcifront_device *
108317 +pcifront_get_pdev(struct pcifront_sd *sd)
108318 +{
108319 + return sd->pdev;
108320 +}
108321 +
108322 +static inline void pcifront_init_sd(struct pcifront_sd *sd, int domain,
108323 + struct pcifront_device *pdev)
108324 +{
108325 + sd->domain = domain;
108326 + sd->pdev = pdev;
108327 +}
108328 +
108329 +#if defined(CONFIG_PCI_DOMAINS)
108330 +static inline int pci_domain_nr(struct pci_bus *bus)
108331 +{
108332 + struct pcifront_sd *sd = bus->sysdata;
108333 + return sd->domain;
108334 +}
108335 +static inline int pci_proc_domain(struct pci_bus *bus)
108336 +{
108337 + return pci_domain_nr(bus);
108338 +}
108339 +#endif /* CONFIG_PCI_DOMAINS */
108340 +
108341 +#else /* __ia64__ */
108342 +
108343 +#include <asm/pci.h>
108344 +#define pcifront_sd pci_controller
108345 +
108346 +static inline struct pcifront_device *
108347 +pcifront_get_pdev(struct pcifront_sd *sd)
108348 +{
108349 + return (struct pcifront_device *)sd->platform_data;
108350 +}
108351 +
108352 +static inline void pcifront_init_sd(struct pcifront_sd *sd, int domain,
108353 + struct pcifront_device *pdev)
108354 +{
108355 + sd->segment = domain;
108356 + sd->acpi_handle = NULL;
108357 + sd->iommu = NULL;
108358 + sd->windows = 0;
108359 + sd->window = NULL;
108360 + sd->platform_data = pdev;
108361 +}
108362 +
108363 +#endif /* __ia64__ */
108364 +
108365 +extern spinlock_t pci_bus_lock;
108366 +
108367 +#endif /* __KERNEL__ */
108368 +
108369 +#endif /* __XEN_ASM_PCIFRONT_H__ */
108370 diff -Nur linux-2.6.16.33-noxen/include/xen/public/evtchn.h linux-2.6.16.33/include/xen/public/evtchn.h
108371 --- linux-2.6.16.33-noxen/include/xen/public/evtchn.h 1970-01-01 00:00:00.000000000 +0000
108372 +++ linux-2.6.16.33/include/xen/public/evtchn.h 2007-01-08 15:00:46.000000000 +0000
108373 @@ -0,0 +1,88 @@
108374 +/******************************************************************************
108375 + * evtchn.h
108376 + *
108377 + * Interface to /dev/xen/evtchn.
108378 + *
108379 + * Copyright (c) 2003-2005, K A Fraser
108380 + *
108381 + * This program is free software; you can redistribute it and/or
108382 + * modify it under the terms of the GNU General Public License version 2
108383 + * as published by the Free Software Foundation; or, when distributed
108384 + * separately from the Linux kernel or incorporated into other
108385 + * software packages, subject to the following license:
108386 + *
108387 + * Permission is hereby granted, free of charge, to any person obtaining a copy
108388 + * of this source file (the "Software"), to deal in the Software without
108389 + * restriction, including without limitation the rights to use, copy, modify,
108390 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
108391 + * and to permit persons to whom the Software is furnished to do so, subject to
108392 + * the following conditions:
108393 + *
108394 + * The above copyright notice and this permission notice shall be included in
108395 + * all copies or substantial portions of the Software.
108396 + *
108397 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
108398 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
108399 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
108400 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
108401 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
108402 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
108403 + * IN THE SOFTWARE.
108404 + */
108405 +
108406 +#ifndef __LINUX_PUBLIC_EVTCHN_H__
108407 +#define __LINUX_PUBLIC_EVTCHN_H__
108408 +
108409 +/*
108410 + * Bind a fresh port to VIRQ @virq.
108411 + * Return allocated port.
108412 + */
108413 +#define IOCTL_EVTCHN_BIND_VIRQ \
108414 + _IOC(_IOC_NONE, 'E', 0, sizeof(struct ioctl_evtchn_bind_virq))
108415 +struct ioctl_evtchn_bind_virq {
108416 + unsigned int virq;
108417 +};
108418 +
108419 +/*
108420 + * Bind a fresh port to remote <@remote_domain, @remote_port>.
108421 + * Return allocated port.
108422 + */
108423 +#define IOCTL_EVTCHN_BIND_INTERDOMAIN \
108424 + _IOC(_IOC_NONE, 'E', 1, sizeof(struct ioctl_evtchn_bind_interdomain))
108425 +struct ioctl_evtchn_bind_interdomain {
108426 + unsigned int remote_domain, remote_port;
108427 +};
108428 +
108429 +/*
108430 + * Allocate a fresh port for binding to @remote_domain.
108431 + * Return allocated port.
108432 + */
108433 +#define IOCTL_EVTCHN_BIND_UNBOUND_PORT \
108434 + _IOC(_IOC_NONE, 'E', 2, sizeof(struct ioctl_evtchn_bind_unbound_port))
108435 +struct ioctl_evtchn_bind_unbound_port {
108436 + unsigned int remote_domain;
108437 +};
108438 +
108439 +/*
108440 + * Unbind previously allocated @port.
108441 + */
108442 +#define IOCTL_EVTCHN_UNBIND \
108443 + _IOC(_IOC_NONE, 'E', 3, sizeof(struct ioctl_evtchn_unbind))
108444 +struct ioctl_evtchn_unbind {
108445 + unsigned int port;
108446 +};
108447 +
108448 +/*
108449 + * Send an event notification to previously allocated @port.
108450 + */
108451 +#define IOCTL_EVTCHN_NOTIFY \
108452 + _IOC(_IOC_NONE, 'E', 4, sizeof(struct ioctl_evtchn_notify))
108453 +struct ioctl_evtchn_notify {
108454 + unsigned int port;
108455 +};
108456 +
108457 +/* Clear and reinitialise the event buffer. Clear error condition. */
108458 +#define IOCTL_EVTCHN_RESET \
108459 + _IOC(_IOC_NONE, 'E', 5, 0)
108460 +
108461 +#endif /* __LINUX_PUBLIC_EVTCHN_H__ */
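Editorial sketch (not part of the patch): user-space usage of the device above — bind VIRQ_DEBUG through /dev/xen/evtchn and block until it fires. The ioctl returns the allocated port, as documented above; the assumption that read() yields pending port numbers as 32-bit values matches the evtchn driver added elsewhere in this patch.

#include <stdio.h>
#include <stdint.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <xen/public/evtchn.h>      /* header added above */

int main(void)
{
    struct ioctl_evtchn_bind_virq bind = { .virq = 1 /* VIRQ_DEBUG */ };
    uint32_t pending;
    int fd, port;

    fd = open("/dev/xen/evtchn", O_RDWR);
    if (fd < 0)
        return 1;
    port = ioctl(fd, IOCTL_EVTCHN_BIND_VIRQ, &bind);    /* returns the port */
    if (port >= 0 && read(fd, &pending, sizeof(pending)) == sizeof(pending))
        printf("bound port %d, event pending on port %u\n", port, pending);
    close(fd);
    return 0;
}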
108462 diff -Nur linux-2.6.16.33-noxen/include/xen/public/privcmd.h linux-2.6.16.33/include/xen/public/privcmd.h
108463 --- linux-2.6.16.33-noxen/include/xen/public/privcmd.h 1970-01-01 00:00:00.000000000 +0000
108464 +++ linux-2.6.16.33/include/xen/public/privcmd.h 2007-01-08 15:00:46.000000000 +0000
108465 @@ -0,0 +1,79 @@
108466 +/******************************************************************************
108467 + * privcmd.h
108468 + *
108469 + * Interface to /proc/xen/privcmd.
108470 + *
108471 + * Copyright (c) 2003-2005, K A Fraser
108472 + *
108473 + * This program is free software; you can redistribute it and/or
108474 + * modify it under the terms of the GNU General Public License version 2
108475 + * as published by the Free Software Foundation; or, when distributed
108476 + * separately from the Linux kernel or incorporated into other
108477 + * software packages, subject to the following license:
108478 + *
108479 + * Permission is hereby granted, free of charge, to any person obtaining a copy
108480 + * of this source file (the "Software"), to deal in the Software without
108481 + * restriction, including without limitation the rights to use, copy, modify,
108482 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
108483 + * and to permit persons to whom the Software is furnished to do so, subject to
108484 + * the following conditions:
108485 + *
108486 + * The above copyright notice and this permission notice shall be included in
108487 + * all copies or substantial portions of the Software.
108488 + *
108489 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
108490 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
108491 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
108492 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
108493 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
108494 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
108495 + * IN THE SOFTWARE.
108496 + */
108497 +
108498 +#ifndef __LINUX_PUBLIC_PRIVCMD_H__
108499 +#define __LINUX_PUBLIC_PRIVCMD_H__
108500 +
108501 +#include <linux/types.h>
108502 +
108503 +#ifndef __user
108504 +#define __user
108505 +#endif
108506 +
108507 +typedef struct privcmd_hypercall
108508 +{
108509 + __u64 op;
108510 + __u64 arg[5];
108511 +} privcmd_hypercall_t;
108512 +
108513 +typedef struct privcmd_mmap_entry {
108514 + __u64 va;
108515 + __u64 mfn;
108516 + __u64 npages;
108517 +} privcmd_mmap_entry_t;
108518 +
108519 +typedef struct privcmd_mmap {
108520 + int num;
108521 + domid_t dom; /* target domain */
108522 + privcmd_mmap_entry_t __user *entry;
108523 +} privcmd_mmap_t;
108524 +
108525 +typedef struct privcmd_mmapbatch {
108526 + int num; /* number of pages to populate */
108527 + domid_t dom; /* target domain */
108528 + __u64 addr; /* virtual address */
108529 + xen_pfn_t __user *arr; /* array of mfns - top nibble set on err */
108530 +} privcmd_mmapbatch_t;
108531 +
108532 +/*
108533 + * @cmd: IOCTL_PRIVCMD_HYPERCALL
108534 + * @arg: &privcmd_hypercall_t
108535 + * Return: Value returned from execution of the specified hypercall.
108536 + */
108537 +#define IOCTL_PRIVCMD_HYPERCALL \
108538 + _IOC(_IOC_NONE, 'P', 0, sizeof(privcmd_hypercall_t))
108539 +#define IOCTL_PRIVCMD_MMAP \
108540 + _IOC(_IOC_NONE, 'P', 2, sizeof(privcmd_mmap_t))
108541 +#define IOCTL_PRIVCMD_MMAPBATCH \
108542 + _IOC(_IOC_NONE, 'P', 3, sizeof(privcmd_mmapbatch_t))
108543 +
108544 +#endif /* __LINUX_PUBLIC_PRIVCMD_H__ */
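A rough sketch of how a privileged tool might drive IOCTL_PRIVCMD_HYPERCALL follows. The device path comes from the header comment above; the hypercall number (assumed to be __HYPERVISOR_xen_version from xen/interface/xen.h) and the sub-command value are illustrative assumptions, and the header would need to be copied alongside the program together with the Xen interface headers it depends on.

/* Sketch: issue a single hypercall through /proc/xen/privcmd. */
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include "privcmd.h"

#define ASSUMED_HYPERVISOR_xen_version 17	/* __HYPERVISOR_xen_version; value assumed */

int main(void)
{
	privcmd_hypercall_t hcall = {
		.op  = ASSUMED_HYPERVISOR_xen_version,
		.arg = { 0 /* XENVER_version sub-command, assumed */, 0, 0, 0, 0 },
	};
	int fd, ver;

	fd = open("/proc/xen/privcmd", O_RDWR);	/* path from the header comment */
	if (fd < 0)
		return 1;

	ver = ioctl(fd, IOCTL_PRIVCMD_HYPERCALL, &hcall);	/* returns the hypercall's result */
	close(fd);
	return ver < 0;
}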
108545 diff -Nur linux-2.6.16.33-noxen/include/xen/xen_proc.h linux-2.6.16.33/include/xen/xen_proc.h
108546 --- linux-2.6.16.33-noxen/include/xen/xen_proc.h 1970-01-01 00:00:00.000000000 +0000
108547 +++ linux-2.6.16.33/include/xen/xen_proc.h 2007-01-08 15:00:46.000000000 +0000
108548 @@ -0,0 +1,13 @@
108549 +
108550 +#ifndef __ASM_XEN_PROC_H__
108551 +#define __ASM_XEN_PROC_H__
108552 +
108553 +#include <linux/config.h>
108554 +#include <linux/proc_fs.h>
108555 +
108556 +extern struct proc_dir_entry *create_xen_proc_entry(
108557 + const char *name, mode_t mode);
108558 +extern void remove_xen_proc_entry(
108559 + const char *name);
108560 +
108561 +#endif /* __ASM_XEN_PROC_H__ */
108562 diff -Nur linux-2.6.16.33-noxen/include/xen/xenbus.h linux-2.6.16.33/include/xen/xenbus.h
108563 --- linux-2.6.16.33-noxen/include/xen/xenbus.h 1970-01-01 00:00:00.000000000 +0000
108564 +++ linux-2.6.16.33/include/xen/xenbus.h 2007-01-08 15:00:46.000000000 +0000
108565 @@ -0,0 +1,307 @@
108566 +/******************************************************************************
108567 + * xenbus.h
108568 + *
108569 + * Talks to Xen Store to figure out what devices we have.
108570 + *
108571 + * Copyright (C) 2005 Rusty Russell, IBM Corporation
108572 + * Copyright (C) 2005 XenSource Ltd.
108573 + *
108574 + * This program is free software; you can redistribute it and/or
108575 + * modify it under the terms of the GNU General Public License version 2
108576 + * as published by the Free Software Foundation; or, when distributed
108577 + * separately from the Linux kernel or incorporated into other
108578 + * software packages, subject to the following license:
108579 + *
108580 + * Permission is hereby granted, free of charge, to any person obtaining a copy
108581 + * of this source file (the "Software"), to deal in the Software without
108582 + * restriction, including without limitation the rights to use, copy, modify,
108583 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
108584 + * and to permit persons to whom the Software is furnished to do so, subject to
108585 + * the following conditions:
108586 + *
108587 + * The above copyright notice and this permission notice shall be included in
108588 + * all copies or substantial portions of the Software.
108589 + *
108590 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
108591 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
108592 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
108593 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
108594 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
108595 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
108596 + * IN THE SOFTWARE.
108597 + */
108598 +
108599 +#ifndef _XEN_XENBUS_H
108600 +#define _XEN_XENBUS_H
108601 +
108602 +#include <linux/device.h>
108603 +#include <linux/notifier.h>
108604 +#include <linux/mutex.h>
108605 +#include <linux/completion.h>
108606 +#include <linux/init.h>
108607 +#include <xen/interface/xen.h>
108608 +#include <xen/interface/grant_table.h>
108609 +#include <xen/interface/io/xenbus.h>
108610 +#include <xen/interface/io/xs_wire.h>
108611 +
108612 +/* Register callback to watch this node. */
108613 +struct xenbus_watch
108614 +{
108615 + struct list_head list;
108616 +
108617 + /* Path being watched. */
108618 + const char *node;
108619 +
108620 + /* Callback (executed in a process context with no locks held). */
108621 + void (*callback)(struct xenbus_watch *,
108622 + const char **vec, unsigned int len);
108623 +
108624 + /* See XBWF_ definitions below. */
108625 + unsigned long flags;
108626 +};
108627 +
108628 +/*
108629 + * Execute callback in its own kthread. Useful if the callback is long
108630 + * running or heavily serialised, to avoid taking out the main xenwatch thread
108631 + * for a long period of time (or even unwittingly causing a deadlock).
108632 + */
108633 +#define XBWF_new_thread 1
108634 +
108635 +/* A xenbus device. */
108636 +struct xenbus_device {
108637 + const char *devicetype;
108638 + const char *nodename;
108639 + const char *otherend;
108640 + int otherend_id;
108641 + struct xenbus_watch otherend_watch;
108642 + struct device dev;
108643 + enum xenbus_state state;
108644 + struct completion down;
108645 +};
108646 +
108647 +static inline struct xenbus_device *to_xenbus_device(struct device *dev)
108648 +{
108649 + return container_of(dev, struct xenbus_device, dev);
108650 +}
108651 +
108652 +struct xenbus_device_id
108653 +{
108654 + /* .../device/<device_type>/<identifier> */
108655 + char devicetype[32]; /* General class of device. */
108656 +};
108657 +
108658 +/* A xenbus driver. */
108659 +struct xenbus_driver {
108660 + char *name;
108661 + struct module *owner;
108662 + const struct xenbus_device_id *ids;
108663 + int (*probe)(struct xenbus_device *dev,
108664 + const struct xenbus_device_id *id);
108665 + void (*otherend_changed)(struct xenbus_device *dev,
108666 + enum xenbus_state backend_state);
108667 + int (*remove)(struct xenbus_device *dev);
108668 + int (*suspend)(struct xenbus_device *dev);
108669 + int (*resume)(struct xenbus_device *dev);
108670 + int (*uevent)(struct xenbus_device *, char **, int, char *, int);
108671 + struct device_driver driver;
108672 + int (*read_otherend_details)(struct xenbus_device *dev);
108673 +};
108674 +
108675 +static inline struct xenbus_driver *to_xenbus_driver(struct device_driver *drv)
108676 +{
108677 + return container_of(drv, struct xenbus_driver, driver);
108678 +}
108679 +
108680 +int xenbus_register_frontend(struct xenbus_driver *drv);
108681 +int xenbus_register_backend(struct xenbus_driver *drv);
108682 +void xenbus_unregister_driver(struct xenbus_driver *drv);
108683 +
108684 +struct xenbus_transaction
108685 +{
108686 + u32 id;
108687 +};
108688 +
108689 +/* Nil transaction ID. */
108690 +#define XBT_NIL ((struct xenbus_transaction) { 0 })
108691 +
108692 +char **xenbus_directory(struct xenbus_transaction t,
108693 + const char *dir, const char *node, unsigned int *num);
108694 +void *xenbus_read(struct xenbus_transaction t,
108695 + const char *dir, const char *node, unsigned int *len);
108696 +int xenbus_write(struct xenbus_transaction t,
108697 + const char *dir, const char *node, const char *string);
108698 +int xenbus_mkdir(struct xenbus_transaction t,
108699 + const char *dir, const char *node);
108700 +int xenbus_exists(struct xenbus_transaction t,
108701 + const char *dir, const char *node);
108702 +int xenbus_rm(struct xenbus_transaction t, const char *dir, const char *node);
108703 +int xenbus_transaction_start(struct xenbus_transaction *t);
108704 +int xenbus_transaction_end(struct xenbus_transaction t, int abort);
108705 +
108706 +/* Single read and scanf: returns -errno or num scanned if > 0. */
108707 +int xenbus_scanf(struct xenbus_transaction t,
108708 + const char *dir, const char *node, const char *fmt, ...)
108709 + __attribute__((format(scanf, 4, 5)));
108710 +
108711 +/* Single printf and write: returns -errno or 0. */
108712 +int xenbus_printf(struct xenbus_transaction t,
108713 + const char *dir, const char *node, const char *fmt, ...)
108714 + __attribute__((format(printf, 4, 5)));
108715 +
108716 +/* Generic read function: NULL-terminated triples of name,
108717 + * sprintf-style type string, and pointer. Returns 0 or errno.*/
108718 +int xenbus_gather(struct xenbus_transaction t, const char *dir, ...);
108719 +
108720 +/* notifier routines for when the xenstore comes up */
108721 +int register_xenstore_notifier(struct notifier_block *nb);
108722 +void unregister_xenstore_notifier(struct notifier_block *nb);
108723 +
108724 +int register_xenbus_watch(struct xenbus_watch *watch);
108725 +void unregister_xenbus_watch(struct xenbus_watch *watch);
108726 +void xs_suspend(void);
108727 +void xs_resume(void);
108728 +
108729 +/* Used by xenbus_dev to borrow kernel's store connection. */
108730 +void *xenbus_dev_request_and_reply(struct xsd_sockmsg *msg);
108731 +
108732 +/* Called from xen core code. */
108733 +void xenbus_suspend(void);
108734 +void xenbus_resume(void);
108735 +
108736 +#define XENBUS_IS_ERR_READ(str) ({ \
108737 + if (!IS_ERR(str) && strlen(str) == 0) { \
108738 + kfree(str); \
108739 + str = ERR_PTR(-ERANGE); \
108740 + } \
108741 + IS_ERR(str); \
108742 +})
108743 +
108744 +#define XENBUS_EXIST_ERR(err) ((err) == -ENOENT || (err) == -ERANGE)
108745 +
108746 +
108747 +/**
108748 + * Register a watch on the given path, using the given xenbus_watch structure
108749 + * for storage, and the given callback function as the callback. Return 0 on
108750 + * success, or -errno on error. On success, the given path will be saved as
108751 + * watch->node, and remains the caller's to free. On error, watch->node will
108752 + * be NULL, the device will switch to XenbusStateClosing, and the error will
108753 + * be saved in the store.
108754 + */
108755 +int xenbus_watch_path(struct xenbus_device *dev, const char *path,
108756 + struct xenbus_watch *watch,
108757 + void (*callback)(struct xenbus_watch *,
108758 + const char **, unsigned int));
108759 +
108760 +
108761 +/**
108762 + * Register a watch on the given path/path2, using the given xenbus_watch
108763 + * structure for storage, and the given callback function as the callback.
108764 + * Return 0 on success, or -errno on error. On success, the watched path
108765 + * (path/path2) will be saved as watch->node, and becomes the caller's to
108766 + * kfree(). On error, watch->node will be NULL, so the caller has nothing to
108767 + * free, the device will switch to XenbusStateClosing, and the error will be
108768 + * saved in the store.
108769 + */
108770 +int xenbus_watch_path2(struct xenbus_device *dev, const char *path,
108771 + const char *path2, struct xenbus_watch *watch,
108772 + void (*callback)(struct xenbus_watch *,
108773 + const char **, unsigned int));
108774 +
108775 +
108776 +/**
108777 + * Advertise in the store a change of the given driver to the given new_state.
108778 + * Return 0 on success, or -errno on error. On error, the device will switch
108779 + * to XenbusStateClosing, and the error will be saved in the store.
108780 + */
108781 +int xenbus_switch_state(struct xenbus_device *dev, enum xenbus_state new_state);
108782 +
108783 +
108784 +/**
108785 + * Grant access to the given ring_mfn to the peer of the given device. Return
108786 + * 0 on success, or -errno on error. On error, the device will switch to
108787 + * XenbusStateClosing, and the error will be saved in the store.
108788 + */
108789 +int xenbus_grant_ring(struct xenbus_device *dev, unsigned long ring_mfn);
108790 +
108791 +
108792 +/**
108793 + * Map a page of memory into this domain from another domain's grant table.
108794 + * xenbus_map_ring_valloc allocates a page of virtual address space, maps the
108795 + * page to that address, and sets *vaddr to that address.
108796 + * xenbus_map_ring does not allocate the virtual address space (you must do
108797 + * this yourself!). It only maps in the page to the specified address.
108798 + * Returns 0 on success, and GNTST_* (see xen/include/interface/grant_table.h)
108799 + * or -ENOMEM on error. If an error is returned, device will switch to
108800 + * XenbusStateClosing and the error message will be saved in XenStore.
108801 + */
108802 +struct vm_struct *xenbus_map_ring_valloc(struct xenbus_device *dev,
108803 + int gnt_ref);
108804 +int xenbus_map_ring(struct xenbus_device *dev, int gnt_ref,
108805 + grant_handle_t *handle, void *vaddr);
108806 +
108807 +
108808 +/**
108809 + * Unmap a page of memory in this domain that was imported from another domain.
108810 + * Use xenbus_unmap_ring_vfree if you mapped in your memory with
108811 + * xenbus_map_ring_valloc (it will free the virtual address space).
108812 + * Returns 0 on success and returns GNTST_* on error
108813 + * (see xen/include/interface/grant_table.h).
108814 + */
108815 +int xenbus_unmap_ring_vfree(struct xenbus_device *dev, struct vm_struct *);
108816 +int xenbus_unmap_ring(struct xenbus_device *dev,
108817 + grant_handle_t handle, void *vaddr);
108818 +
108819 +
108820 +/**
108821 + * Allocate an event channel for the given xenbus_device, assigning the newly
108822 + * created local port to *port. Return 0 on success, or -errno on error. On
108823 + * error, the device will switch to XenbusStateClosing, and the error will be
108824 + * saved in the store.
108825 + */
108826 +int xenbus_alloc_evtchn(struct xenbus_device *dev, int *port);
108827 +
108828 +
108829 +/**
108830 + * Bind to an existing interdomain event channel in another domain. Returns 0
108831 + * on success and stores the local port in *port. On error, returns -errno,
108832 + * switches the device to XenbusStateClosing, and saves the error in XenStore.
108833 + */
108834 +int xenbus_bind_evtchn(struct xenbus_device *dev, int remote_port, int *port);
108835 +
108836 +
108837 +/**
108838 + * Free an existing event channel. Returns 0 on success or -errno on error.
108839 + */
108840 +int xenbus_free_evtchn(struct xenbus_device *dev, int port);
108841 +
108842 +
108843 +/**
108844 + * Return the state of the driver rooted at the given store path, or
108845 + * XenbusStateUnknown if no state can be read.
108846 + */
108847 +enum xenbus_state xenbus_read_driver_state(const char *path);
108848 +
108849 +
108850 +/**
108851 + * Report the given negative errno into the store, along with the given
108852 + * formatted message.
108853 + */
108854 +void xenbus_dev_error(struct xenbus_device *dev, int err, const char *fmt,
108855 + ...);
108856 +
108857 +
108858 +/**
108859 + * Equivalent to xenbus_dev_error(dev, err, fmt, args), followed by
108860 + * xenbus_switch_state(dev, XenbusStateClosing) to schedule an orderly
108861 + * closedown of this driver and its peer.
108862 + */
108863 +void xenbus_dev_fatal(struct xenbus_device *dev, int err, const char *fmt,
108864 + ...);
108865 +
108866 +int __init xenbus_dev_init(void);
108867 +
108868 +char *xenbus_strstate(enum xenbus_state state);
108869 +int xenbus_dev_is_online(struct xenbus_device *dev);
108870 +int xenbus_frontend_closed(struct xenbus_device *dev);
108871 +
108872 +#endif /* _XEN_XENBUS_H */
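To make the registration and watch API above concrete, here is a minimal frontend driver skeleton. The device type ("demofront"), the "ring-ref" node, and the probe logic are hypothetical placeholders; only the xenbus calls and structure fields come from this header.

/* Sketch of a xenbus frontend; names prefixed demofront_ are hypothetical. */
#include <linux/module.h>
#include <xen/xenbus.h>

static void demofront_backend_changed(struct xenbus_device *dev,
				       enum xenbus_state backend_state)
{
	/* React to the peer's state as published in the store. */
	if (backend_state == XenbusStateClosing)
		xenbus_frontend_closed(dev);
}

static int demofront_probe(struct xenbus_device *dev,
			   const struct xenbus_device_id *id)
{
	unsigned int ring_ref = 0;
	int err;

	/* Read a (hypothetical) property published by the backend. */
	err = xenbus_scanf(XBT_NIL, dev->otherend, "ring-ref", "%u", &ring_ref);
	if (err < 0) {
		xenbus_dev_fatal(dev, err, "reading ring-ref");
		return err;
	}

	return xenbus_switch_state(dev, XenbusStateInitialised);
}

static int demofront_remove(struct xenbus_device *dev)
{
	return 0;
}

static const struct xenbus_device_id demofront_ids[] = {
	{ "demofront" },	/* hypothetical device type */
	{ "" }
};

static struct xenbus_driver demofront_driver = {
	.name             = "demofront",
	.owner            = THIS_MODULE,
	.ids              = demofront_ids,
	.probe            = demofront_probe,
	.remove           = demofront_remove,
	.otherend_changed = demofront_backend_changed,
};

static int __init demofront_init(void)
{
	return xenbus_register_frontend(&demofront_driver);
}
module_init(demofront_init);
MODULE_LICENSE("GPL");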
108873 diff -Nur linux-2.6.16.33-noxen/include/xen/xencons.h linux-2.6.16.33/include/xen/xencons.h
108874 --- linux-2.6.16.33-noxen/include/xen/xencons.h 1970-01-01 00:00:00.000000000 +0000
108875 +++ linux-2.6.16.33/include/xen/xencons.h 2007-01-08 15:00:46.000000000 +0000
108876 @@ -0,0 +1,19 @@
108877 +#ifndef __ASM_XENCONS_H__
108878 +#define __ASM_XENCONS_H__
108879 +
108880 +struct dom0_vga_console_info;
108881 +void dom0_init_screen_info(const struct dom0_vga_console_info *info);
108882 +
108883 +void xencons_force_flush(void);
108884 +void xencons_resume(void);
108885 +
108886 +/* Interrupt work hooks. Receive data, or kick data out. */
108887 +void xencons_rx(char *buf, unsigned len, struct pt_regs *regs);
108888 +void xencons_tx(void);
108889 +
108890 +int xencons_ring_init(void);
108891 +int xencons_ring_send(const char *data, unsigned len);
108892 +
108893 +void xencons_early_setup(void);
108894 +
108895 +#endif /* __ASM_XENCONS_H__ */
108896 diff -Nur linux-2.6.16.33-noxen/include/xen/xenoprof.h linux-2.6.16.33/include/xen/xenoprof.h
108897 --- linux-2.6.16.33-noxen/include/xen/xenoprof.h 1970-01-01 00:00:00.000000000 +0000
108898 +++ linux-2.6.16.33/include/xen/xenoprof.h 2007-01-08 15:00:46.000000000 +0000
108899 @@ -0,0 +1,42 @@
108900 +/******************************************************************************
108901 + * xen/xenoprof.h
108902 + *
108903 + * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp>
108904 + * VA Linux Systems Japan K.K.
108905 + *
108906 + * This program is free software; you can redistribute it and/or modify
108907 + * it under the terms of the GNU General Public License as published by
108908 + * the Free Software Foundation; either version 2 of the License, or
108909 + * (at your option) any later version.
108910 + *
108911 + * This program is distributed in the hope that it will be useful,
108912 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
108913 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
108914 + * GNU General Public License for more details.
108915 + *
108916 + * You should have received a copy of the GNU General Public License
108917 + * along with this program; if not, write to the Free Software
108918 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
108919 + *
108920 + */
108921 +
108922 +#ifndef __XEN_XENOPROF_H__
108923 +#define __XEN_XENOPROF_H__
108924 +#ifdef CONFIG_XEN
108925 +
108926 +#include <asm/xenoprof.h>
108927 +
108928 +struct oprofile_operations;
108929 +int xenoprofile_init(struct oprofile_operations * ops);
108930 +void xenoprofile_exit(void);
108931 +
108932 +struct xenoprof_shared_buffer {
108933 + char *buffer;
108934 + struct xenoprof_arch_shared_buffer arch;
108935 +};
108936 +#else
108937 +#define xenoprofile_init(ops) (-ENOSYS)
108938 +#define xenoprofile_exit() do { } while (0)
108939 +
108940 +#endif /* CONFIG_XEN */
108941 +#endif /* __XEN_XENOPROF_H__ */
108942 diff -Nur linux-2.6.16.33-noxen/kernel/Kconfig.preempt linux-2.6.16.33/kernel/Kconfig.preempt
108943 --- linux-2.6.16.33-noxen/kernel/Kconfig.preempt 2006-11-22 18:06:31.000000000 +0000
108944 +++ linux-2.6.16.33/kernel/Kconfig.preempt 2007-01-08 15:00:46.000000000 +0000
108945 @@ -35,6 +35,7 @@
108946
108947 config PREEMPT
108948 bool "Preemptible Kernel (Low-Latency Desktop)"
108949 + depends on !XEN
108950 help
108951 This option reduces the latency of the kernel by making
108952 all kernel code (that is not executing in a critical section)
108953 diff -Nur linux-2.6.16.33-noxen/kernel/fork.c linux-2.6.16.33/kernel/fork.c
108954 --- linux-2.6.16.33-noxen/kernel/fork.c 2006-11-22 18:06:31.000000000 +0000
108955 +++ linux-2.6.16.33/kernel/fork.c 2007-01-08 15:00:46.000000000 +0000
108956 @@ -274,6 +274,9 @@
108957 if (retval)
108958 goto out;
108959 }
108960 +#ifdef arch_dup_mmap
108961 + arch_dup_mmap(mm, oldmm);
108962 +#endif
108963 retval = 0;
108964 out:
108965 up_write(&mm->mmap_sem);
108966 diff -Nur linux-2.6.16.33-noxen/kernel/irq/spurious.c linux-2.6.16.33/kernel/irq/spurious.c
108967 --- linux-2.6.16.33-noxen/kernel/irq/spurious.c 2006-11-22 18:06:31.000000000 +0000
108968 +++ linux-2.6.16.33/kernel/irq/spurious.c 2007-01-08 15:00:46.000000000 +0000
108969 @@ -137,7 +137,8 @@
108970 struct pt_regs *regs)
108971 {
108972 if (action_ret != IRQ_HANDLED) {
108973 - desc->irqs_unhandled++;
108974 + if (!irq_ignore_unhandled(irq))
108975 + desc->irqs_unhandled++;
108976 if (action_ret != IRQ_NONE)
108977 report_bad_irq(irq, desc, action_ret);
108978 }
108979 diff -Nur linux-2.6.16.33-noxen/kernel/kexec.c linux-2.6.16.33/kernel/kexec.c
108980 --- linux-2.6.16.33-noxen/kernel/kexec.c 2006-11-22 18:06:31.000000000 +0000
108981 +++ linux-2.6.16.33/kernel/kexec.c 2007-01-08 15:00:46.000000000 +0000
108982 @@ -403,7 +403,7 @@
108983 pages = kimage_alloc_pages(GFP_KERNEL, order);
108984 if (!pages)
108985 break;
108986 - pfn = page_to_pfn(pages);
108987 + pfn = kexec_page_to_pfn(pages);
108988 epfn = pfn + count;
108989 addr = pfn << PAGE_SHIFT;
108990 eaddr = epfn << PAGE_SHIFT;
108991 @@ -437,6 +437,7 @@
108992 return pages;
108993 }
108994
108995 +#ifndef CONFIG_XEN
108996 static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
108997 unsigned int order)
108998 {
108999 @@ -490,7 +491,7 @@
109000 }
109001 /* If I don't overlap any segments I have found my hole! */
109002 if (i == image->nr_segments) {
109003 - pages = pfn_to_page(hole_start >> PAGE_SHIFT);
109004 + pages = kexec_pfn_to_page(hole_start >> PAGE_SHIFT);
109005 break;
109006 }
109007 }
109008 @@ -517,6 +518,13 @@
109009
109010 return pages;
109011 }
109012 +#else /* !CONFIG_XEN */
109013 +struct page *kimage_alloc_control_pages(struct kimage *image,
109014 + unsigned int order)
109015 +{
109016 + return kimage_alloc_normal_control_pages(image, order);
109017 +}
109018 +#endif
109019
109020 static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
109021 {
109022 @@ -532,7 +540,7 @@
109023 return -ENOMEM;
109024
109025 ind_page = page_address(page);
109026 - *image->entry = virt_to_phys(ind_page) | IND_INDIRECTION;
109027 + *image->entry = kexec_virt_to_phys(ind_page) | IND_INDIRECTION;
109028 image->entry = ind_page;
109029 image->last_entry = ind_page +
109030 ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
109031 @@ -593,13 +601,13 @@
109032 #define for_each_kimage_entry(image, ptr, entry) \
109033 for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
109034 ptr = (entry & IND_INDIRECTION)? \
109035 - phys_to_virt((entry & PAGE_MASK)): ptr +1)
109036 + kexec_phys_to_virt((entry & PAGE_MASK)): ptr +1)
109037
109038 static void kimage_free_entry(kimage_entry_t entry)
109039 {
109040 struct page *page;
109041
109042 - page = pfn_to_page(entry >> PAGE_SHIFT);
109043 + page = kexec_pfn_to_page(entry >> PAGE_SHIFT);
109044 kimage_free_pages(page);
109045 }
109046
109047 @@ -611,6 +619,10 @@
109048 if (!image)
109049 return;
109050
109051 +#ifdef CONFIG_XEN
109052 + xen_machine_kexec_unload(image);
109053 +#endif
109054 +
109055 kimage_free_extra_pages(image);
109056 for_each_kimage_entry(image, ptr, entry) {
109057 if (entry & IND_INDIRECTION) {
109058 @@ -686,7 +698,7 @@
109059 * have a match.
109060 */
109061 list_for_each_entry(page, &image->dest_pages, lru) {
109062 - addr = page_to_pfn(page) << PAGE_SHIFT;
109063 + addr = kexec_page_to_pfn(page) << PAGE_SHIFT;
109064 if (addr == destination) {
109065 list_del(&page->lru);
109066 return page;
109067 @@ -701,12 +713,12 @@
109068 if (!page)
109069 return NULL;
109070 /* If the page cannot be used file it away */
109071 - if (page_to_pfn(page) >
109072 + if (kexec_page_to_pfn(page) >
109073 (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
109074 list_add(&page->lru, &image->unuseable_pages);
109075 continue;
109076 }
109077 - addr = page_to_pfn(page) << PAGE_SHIFT;
109078 + addr = kexec_page_to_pfn(page) << PAGE_SHIFT;
109079
109080 /* If it is the destination page we want use it */
109081 if (addr == destination)
109082 @@ -729,7 +741,7 @@
109083 struct page *old_page;
109084
109085 old_addr = *old & PAGE_MASK;
109086 - old_page = pfn_to_page(old_addr >> PAGE_SHIFT);
109087 + old_page = kexec_pfn_to_page(old_addr >> PAGE_SHIFT);
109088 copy_highpage(page, old_page);
109089 *old = addr | (*old & ~PAGE_MASK);
109090
109091 @@ -779,7 +791,7 @@
109092 result = -ENOMEM;
109093 goto out;
109094 }
109095 - result = kimage_add_page(image, page_to_pfn(page)
109096 + result = kimage_add_page(image, kexec_page_to_pfn(page)
109097 << PAGE_SHIFT);
109098 if (result < 0)
109099 goto out;
109100 @@ -811,6 +823,7 @@
109101 return result;
109102 }
109103
109104 +#ifndef CONFIG_XEN
109105 static int kimage_load_crash_segment(struct kimage *image,
109106 struct kexec_segment *segment)
109107 {
109108 @@ -833,7 +846,7 @@
109109 char *ptr;
109110 size_t uchunk, mchunk;
109111
109112 - page = pfn_to_page(maddr >> PAGE_SHIFT);
109113 + page = kexec_pfn_to_page(maddr >> PAGE_SHIFT);
109114 if (page == 0) {
109115 result = -ENOMEM;
109116 goto out;
109117 @@ -881,6 +894,13 @@
109118
109119 return result;
109120 }
109121 +#else /* CONFIG_XEN */
109122 +static int kimage_load_segment(struct kimage *image,
109123 + struct kexec_segment *segment)
109124 +{
109125 + return kimage_load_normal_segment(image, segment);
109126 +}
109127 +#endif
109128
109129 /*
109130 * Exec Kernel system call: for obvious reasons only root may call it.
109131 @@ -991,6 +1011,11 @@
109132 if (result)
109133 goto out;
109134 }
109135 +#ifdef CONFIG_XEN
109136 + result = xen_machine_kexec_load(image);
109137 + if (result)
109138 + goto out;
109139 +#endif
109140 /* Install the new kernel, and Uninstall the old */
109141 image = xchg(dest_image, image);
109142
109143 @@ -1045,7 +1070,6 @@
109144 struct kimage *image;
109145 int locked;
109146
109147 -
109148 /* Take the kexec_lock here to prevent sys_kexec_load
109149 * running on one cpu from replacing the crash kernel
109150 * we are using after a panic on a different cpu.
109151 diff -Nur linux-2.6.16.33-noxen/kernel/rcupdate.c linux-2.6.16.33/kernel/rcupdate.c
109152 --- linux-2.6.16.33-noxen/kernel/rcupdate.c 2006-11-22 18:06:31.000000000 +0000
109153 +++ linux-2.6.16.33/kernel/rcupdate.c 2007-05-23 21:00:01.000000000 +0000
109154 @@ -485,6 +485,20 @@
109155 __rcu_pending(&rcu_bh_ctrlblk, &per_cpu(rcu_bh_data, cpu));
109156 }
109157
109158 +/*
109159 + * Check to see if any future RCU-related work will need to be done
109160 + * by the current CPU, even if none need be done immediately, returning
109161 + * 1 if so. This function is part of the RCU implementation; it is -not-
109162 + * an exported member of the RCU API.
109163 + */
109164 +int rcu_needs_cpu(int cpu)
109165 +{
109166 + struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
109167 + struct rcu_data *rdp_bh = &per_cpu(rcu_bh_data, cpu);
109168 +
109169 + return (!!rdp->curlist || !!rdp_bh->curlist || rcu_pending(cpu));
109170 +}
109171 +
109172 void rcu_check_callbacks(int cpu, int user)
109173 {
109174 if (user ||
109175 diff -Nur linux-2.6.16.33-noxen/kernel/timer.c linux-2.6.16.33/kernel/timer.c
109176 --- linux-2.6.16.33-noxen/kernel/timer.c 2006-11-22 18:06:31.000000000 +0000
109177 +++ linux-2.6.16.33/kernel/timer.c 2007-05-23 21:00:01.000000000 +0000
109178 @@ -555,6 +555,22 @@
109179 }
109180 spin_unlock(&base->t_base.lock);
109181
109182 + /*
109183 + * It can happen that other CPUs service timer IRQs and increment
109184 + * jiffies, but we have not yet got a local timer tick to process
109185 + * the timer wheels. In that case, the expiry time can be before
109186 + * jiffies, but since the high-resolution timer here is relative to
109187 + * jiffies, the default expression when high-resolution timers are
109188 + * not active,
109189 + *
109190 + * time_before(MAX_JIFFY_OFFSET + jiffies, expires)
109191 + *
109192 + * would falsely evaluate to true. If that is the case, just
109193 + * return jiffies so that we can immediately fire the local timer
109194 + */
109195 + if (time_before(expires, jiffies))
109196 + return jiffies;
109197 +
109198 if (time_before(hr_expires, expires))
109199 return hr_expires;
109200
109201 diff -Nur linux-2.6.16.33-noxen/lib/Makefile linux-2.6.16.33/lib/Makefile
109202 --- linux-2.6.16.33-noxen/lib/Makefile 2006-11-22 18:06:31.000000000 +0000
109203 +++ linux-2.6.16.33/lib/Makefile 2007-01-08 15:00:46.000000000 +0000
109204 @@ -45,6 +45,7 @@
109205 obj-$(CONFIG_TEXTSEARCH_FSM) += ts_fsm.o
109206
109207 obj-$(CONFIG_SWIOTLB) += swiotlb.o
109208 +swiotlb-$(CONFIG_XEN) := ../arch/i386/kernel/swiotlb.o
109209
109210 hostprogs-y := gen_crc32table
109211 clean-files := crc32table.h
109212 diff -Nur linux-2.6.16.33-noxen/lib/vsprintf.c linux-2.6.16.33/lib/vsprintf.c
109213 --- linux-2.6.16.33-noxen/lib/vsprintf.c 2006-11-22 18:06:31.000000000 +0000
109214 +++ linux-2.6.16.33/lib/vsprintf.c 2007-05-23 21:00:01.000000000 +0000
109215 @@ -187,49 +187,49 @@
109216 size -= precision;
109217 if (!(type&(ZEROPAD+LEFT))) {
109218 while(size-->0) {
109219 - if (buf <= end)
109220 + if (buf < end)
109221 *buf = ' ';
109222 ++buf;
109223 }
109224 }
109225 if (sign) {
109226 - if (buf <= end)
109227 + if (buf < end)
109228 *buf = sign;
109229 ++buf;
109230 }
109231 if (type & SPECIAL) {
109232 if (base==8) {
109233 - if (buf <= end)
109234 + if (buf < end)
109235 *buf = '0';
109236 ++buf;
109237 } else if (base==16) {
109238 - if (buf <= end)
109239 + if (buf < end)
109240 *buf = '0';
109241 ++buf;
109242 - if (buf <= end)
109243 + if (buf < end)
109244 *buf = digits[33];
109245 ++buf;
109246 }
109247 }
109248 if (!(type & LEFT)) {
109249 while (size-- > 0) {
109250 - if (buf <= end)
109251 + if (buf < end)
109252 *buf = c;
109253 ++buf;
109254 }
109255 }
109256 while (i < precision--) {
109257 - if (buf <= end)
109258 + if (buf < end)
109259 *buf = '0';
109260 ++buf;
109261 }
109262 while (i-- > 0) {
109263 - if (buf <= end)
109264 + if (buf < end)
109265 *buf = tmp[i];
109266 ++buf;
109267 }
109268 while (size-- > 0) {
109269 - if (buf <= end)
109270 + if (buf < end)
109271 *buf = ' ';
109272 ++buf;
109273 }
109274 @@ -272,7 +272,8 @@
109275 /* 'z' changed to 'Z' --davidm 1/25/99 */
109276 /* 't' added for ptrdiff_t */
109277
109278 - /* Reject out-of-range values early */
109279 + /* Reject out-of-range values early. Large positive sizes are
109280 + used for unknown buffer sizes. */
109281 if (unlikely((int) size < 0)) {
109282 /* There can be only one.. */
109283 static int warn = 1;
109284 @@ -282,16 +283,17 @@
109285 }
109286
109287 str = buf;
109288 - end = buf + size - 1;
109289 + end = buf + size;
109290
109291 - if (end < buf - 1) {
109292 - end = ((void *) -1);
109293 - size = end - buf + 1;
109294 + /* Make sure end is always >= buf */
109295 + if (end < buf) {
109296 + end = ((void *)-1);
109297 + size = end - buf;
109298 }
109299
109300 for (; *fmt ; ++fmt) {
109301 if (*fmt != '%') {
109302 - if (str <= end)
109303 + if (str < end)
109304 *str = *fmt;
109305 ++str;
109306 continue;
109307 @@ -357,17 +359,17 @@
109308 case 'c':
109309 if (!(flags & LEFT)) {
109310 while (--field_width > 0) {
109311 - if (str <= end)
109312 + if (str < end)
109313 *str = ' ';
109314 ++str;
109315 }
109316 }
109317 c = (unsigned char) va_arg(args, int);
109318 - if (str <= end)
109319 + if (str < end)
109320 *str = c;
109321 ++str;
109322 while (--field_width > 0) {
109323 - if (str <= end)
109324 + if (str < end)
109325 *str = ' ';
109326 ++str;
109327 }
109328 @@ -382,18 +384,18 @@
109329
109330 if (!(flags & LEFT)) {
109331 while (len < field_width--) {
109332 - if (str <= end)
109333 + if (str < end)
109334 *str = ' ';
109335 ++str;
109336 }
109337 }
109338 for (i = 0; i < len; ++i) {
109339 - if (str <= end)
109340 + if (str < end)
109341 *str = *s;
109342 ++str; ++s;
109343 }
109344 while (len < field_width--) {
109345 - if (str <= end)
109346 + if (str < end)
109347 *str = ' ';
109348 ++str;
109349 }
109350 @@ -426,7 +428,7 @@
109351 continue;
109352
109353 case '%':
109354 - if (str <= end)
109355 + if (str < end)
109356 *str = '%';
109357 ++str;
109358 continue;
109359 @@ -449,11 +451,11 @@
109360 break;
109361
109362 default:
109363 - if (str <= end)
109364 + if (str < end)
109365 *str = '%';
109366 ++str;
109367 if (*fmt) {
109368 - if (str <= end)
109369 + if (str < end)
109370 *str = *fmt;
109371 ++str;
109372 } else {
109373 @@ -483,14 +485,13 @@
109374 str = number(str, end, num, base,
109375 field_width, precision, flags);
109376 }
109377 - if (str <= end)
109378 - *str = '\0';
109379 - else if (size > 0)
109380 - /* don't write out a null byte if the buf size is zero */
109381 - *end = '\0';
109382 - /* the trailing null byte doesn't count towards the total
109383 - * ++str;
109384 - */
109385 + if (size > 0) {
109386 + if (str < end)
109387 + *str = '\0';
109388 + else
109389 + end[-1] = '\0';
109390 + }
109391 + /* the trailing null byte doesn't count towards the total */
109392 return str-buf;
109393 }
109394
109395 @@ -848,3 +849,26 @@
109396 }
109397
109398 EXPORT_SYMBOL(sscanf);
109399 +
109400 +
109401 +/* Simplified asprintf. */
109402 +char *kasprintf(gfp_t gfp, const char *fmt, ...)
109403 +{
109404 + va_list ap;
109405 + unsigned int len;
109406 + char *p;
109407 +
109408 + va_start(ap, fmt);
109409 + len = vsnprintf(NULL, 0, fmt, ap);
109410 + va_end(ap);
109411 +
109412 + p = kmalloc(len+1, gfp);
109413 + if (!p)
109414 + return NULL;
109415 + va_start(ap, fmt);
109416 + vsnprintf(p, len+1, fmt, ap);
109417 + va_end(ap);
109418 + return p;
109419 +}
109420 +
109421 +EXPORT_SYMBOL(kasprintf);
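The kasprintf() helper added above sizes the buffer with a first vsnprintf() pass and then formats into the freshly allocated string; a typical caller (names hypothetical) looks like this:

/* Hypothetical caller: build a formatted string, use it, free it. */
char *path = kasprintf(GFP_KERNEL, "device/%s/%d", "vif", 0);
if (!path)
	return -ENOMEM;
/* ... use path ... */
kfree(path);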
109422 diff -Nur linux-2.6.16.33-noxen/mm/Kconfig linux-2.6.16.33/mm/Kconfig
109423 --- linux-2.6.16.33-noxen/mm/Kconfig 2006-11-22 18:06:31.000000000 +0000
109424 +++ linux-2.6.16.33/mm/Kconfig 2007-01-08 15:00:46.000000000 +0000
109425 @@ -126,11 +126,14 @@
109426 # Default to 4 for wider testing, though 8 might be more appropriate.
109427 # ARM's adjust_pte (unused if VIPT) depends on mm-wide page_table_lock.
109428 # PA-RISC 7xxx's spinlock_t would enlarge struct page from 32 to 44 bytes.
109429 +# XEN on x86 architecture uses the mapping field on pagetable pages to store a
109430 +# pointer to the destructor. This conflicts with pte_lock_deinit().
109431 #
109432 config SPLIT_PTLOCK_CPUS
109433 int
109434 default "4096" if ARM && !CPU_CACHE_VIPT
109435 default "4096" if PARISC && !PA20
109436 + default "4096" if X86_XEN || X86_64_XEN
109437 default "4"
109438
109439 #
109440 diff -Nur linux-2.6.16.33-noxen/mm/highmem.c linux-2.6.16.33/mm/highmem.c
109441 --- linux-2.6.16.33-noxen/mm/highmem.c 2006-11-22 18:06:31.000000000 +0000
109442 +++ linux-2.6.16.33/mm/highmem.c 2007-01-08 15:00:46.000000000 +0000
109443 @@ -152,6 +152,17 @@
109444 return vaddr;
109445 }
109446
109447 +#ifdef CONFIG_XEN
109448 +void kmap_flush_unused(void)
109449 +{
109450 + spin_lock(&kmap_lock);
109451 + flush_all_zero_pkmaps();
109452 + spin_unlock(&kmap_lock);
109453 +}
109454 +
109455 +EXPORT_SYMBOL(kmap_flush_unused);
109456 +#endif
109457 +
109458 void fastcall *kmap_high(struct page *page)
109459 {
109460 unsigned long vaddr;
109461 diff -Nur linux-2.6.16.33-noxen/mm/memory.c linux-2.6.16.33/mm/memory.c
109462 --- linux-2.6.16.33-noxen/mm/memory.c 2006-11-22 18:06:31.000000000 +0000
109463 +++ linux-2.6.16.33/mm/memory.c 2007-01-08 15:00:46.000000000 +0000
109464 @@ -405,7 +405,8 @@
109465 * Remove this test eventually!
109466 */
109467 if (unlikely(!pfn_valid(pfn))) {
109468 - print_bad_pte(vma, pte, addr);
109469 + if (!(vma->vm_flags & VM_RESERVED))
109470 + print_bad_pte(vma, pte, addr);
109471 return NULL;
109472 }
109473
109474 @@ -881,6 +882,7 @@
109475 tlb_finish_mmu(tlb, address, end);
109476 return end;
109477 }
109478 +EXPORT_SYMBOL(zap_page_range);
109479
109480 /*
109481 * Do a quick page-table lookup for a single page.
109482 @@ -1020,6 +1022,26 @@
109483 continue;
109484 }
109485
109486 +#ifdef CONFIG_XEN
109487 + if (vma && (vma->vm_flags & VM_FOREIGN)) {
109488 + struct page **map = vma->vm_private_data;
109489 + int offset = (start - vma->vm_start) >> PAGE_SHIFT;
109490 + if (map[offset] != NULL) {
109491 + if (pages) {
109492 + struct page *page = map[offset];
109493 +
109494 + pages[i] = page;
109495 + get_page(page);
109496 + }
109497 + if (vmas)
109498 + vmas[i] = vma;
109499 + i++;
109500 + start += PAGE_SIZE;
109501 + len--;
109502 + continue;
109503 + }
109504 + }
109505 +#endif
109506 if (!vma || (vma->vm_flags & (VM_IO | VM_PFNMAP))
109507 || !(vm_flags & vma->vm_flags))
109508 return i ? : -EFAULT;
109509 @@ -1359,6 +1381,102 @@
109510 }
109511 EXPORT_SYMBOL(remap_pfn_range);
109512
109513 +#ifdef CONFIG_XEN
109514 +static inline int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
109515 + unsigned long addr, unsigned long end,
109516 + pte_fn_t fn, void *data)
109517 +{
109518 + pte_t *pte;
109519 + int err;
109520 + struct page *pmd_page;
109521 + spinlock_t *ptl;
109522 +
109523 + pte = (mm == &init_mm) ?
109524 + pte_alloc_kernel(pmd, addr) :
109525 + pte_alloc_map_lock(mm, pmd, addr, &ptl);
109526 + if (!pte)
109527 + return -ENOMEM;
109528 +
109529 + BUG_ON(pmd_huge(*pmd));
109530 +
109531 + pmd_page = pmd_page(*pmd);
109532 +
109533 + do {
109534 + err = fn(pte, pmd_page, addr, data);
109535 + if (err)
109536 + break;
109537 + } while (pte++, addr += PAGE_SIZE, addr != end);
109538 +
109539 + if (mm != &init_mm)
109540 + pte_unmap_unlock(pte-1, ptl);
109541 + return err;
109542 +}
109543 +
109544 +static inline int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
109545 + unsigned long addr, unsigned long end,
109546 + pte_fn_t fn, void *data)
109547 +{
109548 + pmd_t *pmd;
109549 + unsigned long next;
109550 + int err;
109551 +
109552 + pmd = pmd_alloc(mm, pud, addr);
109553 + if (!pmd)
109554 + return -ENOMEM;
109555 + do {
109556 + next = pmd_addr_end(addr, end);
109557 + err = apply_to_pte_range(mm, pmd, addr, next, fn, data);
109558 + if (err)
109559 + break;
109560 + } while (pmd++, addr = next, addr != end);
109561 + return err;
109562 +}
109563 +
109564 +static inline int apply_to_pud_range(struct mm_struct *mm, pgd_t *pgd,
109565 + unsigned long addr, unsigned long end,
109566 + pte_fn_t fn, void *data)
109567 +{
109568 + pud_t *pud;
109569 + unsigned long next;
109570 + int err;
109571 +
109572 + pud = pud_alloc(mm, pgd, addr);
109573 + if (!pud)
109574 + return -ENOMEM;
109575 + do {
109576 + next = pud_addr_end(addr, end);
109577 + err = apply_to_pmd_range(mm, pud, addr, next, fn, data);
109578 + if (err)
109579 + break;
109580 + } while (pud++, addr = next, addr != end);
109581 + return err;
109582 +}
109583 +
109584 +/*
109585 + * Scan a region of virtual memory, filling in page tables as necessary
109586 + * and calling a provided function on each leaf page table.
109587 + */
109588 +int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
109589 + unsigned long size, pte_fn_t fn, void *data)
109590 +{
109591 + pgd_t *pgd;
109592 + unsigned long next;
109593 + unsigned long end = addr + size;
109594 + int err;
109595 +
109596 + BUG_ON(addr >= end);
109597 + pgd = pgd_offset(mm, addr);
109598 + do {
109599 + next = pgd_addr_end(addr, end);
109600 + err = apply_to_pud_range(mm, pgd, addr, next, fn, data);
109601 + if (err)
109602 + break;
109603 + } while (pgd++, addr = next, addr != end);
109604 + return err;
109605 +}
109606 +EXPORT_SYMBOL_GPL(apply_to_page_range);
109607 +#endif
109608 +
109609 /*
109610 * handle_pte_fault chooses page fault handler according to an entry
109611 * which was read non-atomically. Before making any commitment, on
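The apply_to_page_range() helper added above walks (and if necessary allocates) the page tables covering [addr, addr + size) and invokes the callback once per PTE. A minimal sketch of a caller follows, assuming a pte_fn_t typedef matching the fn(pte, pmd_page, addr, data) signature used in the hunk; the counting callback itself is purely illustrative.

/* Sketch only: count populated PTEs over a kernel virtual range. */
#include <linux/mm.h>

static int count_present_pte(pte_t *pte, struct page *pmd_page,
			     unsigned long addr, void *data)
{
	unsigned long *count = data;

	if (!pte_none(*pte))
		(*count)++;
	return 0;		/* a non-zero return would abort the walk */
}

static unsigned long count_mapped(unsigned long vaddr, unsigned long nr_pages)
{
	unsigned long count = 0;

	apply_to_page_range(&init_mm, vaddr, nr_pages << PAGE_SHIFT,
			    count_present_pte, &count);
	return count;
}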
109612 diff -Nur linux-2.6.16.33-noxen/mm/mmap.c linux-2.6.16.33/mm/mmap.c
109613 --- linux-2.6.16.33-noxen/mm/mmap.c 2006-11-22 18:06:31.000000000 +0000
109614 +++ linux-2.6.16.33/mm/mmap.c 2007-01-08 15:00:46.000000000 +0000
109615 @@ -1950,6 +1950,10 @@
109616 unsigned long nr_accounted = 0;
109617 unsigned long end;
109618
109619 +#ifdef arch_exit_mmap
109620 + arch_exit_mmap(mm);
109621 +#endif
109622 +
109623 lru_add_drain();
109624 flush_cache_mm(mm);
109625 tlb = tlb_gather_mmu(mm, 1);
109626 diff -Nur linux-2.6.16.33-noxen/mm/page_alloc.c linux-2.6.16.33/mm/page_alloc.c
109627 --- linux-2.6.16.33-noxen/mm/page_alloc.c 2006-11-22 18:06:31.000000000 +0000
109628 +++ linux-2.6.16.33/mm/page_alloc.c 2007-01-08 15:00:46.000000000 +0000
109629 @@ -422,7 +422,8 @@
109630 int i;
109631 int reserved = 0;
109632
109633 - arch_free_page(page, order);
109634 + if (arch_free_page(page, order))
109635 + return;
109636 if (!PageHighMem(page))
109637 mutex_debug_check_no_locks_freed(page_address(page),
109638 PAGE_SIZE<<order);
109639 @@ -716,7 +717,8 @@
109640 struct per_cpu_pages *pcp;
109641 unsigned long flags;
109642
109643 - arch_free_page(page, 0);
109644 + if (arch_free_page(page, 0))
109645 + return;
109646
109647 if (PageAnon(page))
109648 page->mapping = NULL;
109649 diff -Nur linux-2.6.16.33-noxen/net/atm/clip.c linux-2.6.16.33/net/atm/clip.c
109650 --- linux-2.6.16.33-noxen/net/atm/clip.c 2006-11-22 18:06:31.000000000 +0000
109651 +++ linux-2.6.16.33/net/atm/clip.c 2007-05-23 21:00:01.000000000 +0000
109652 @@ -101,7 +101,7 @@
109653 printk(KERN_CRIT "!clip_vcc->entry (clip_vcc %p)\n",clip_vcc);
109654 return;
109655 }
109656 - spin_lock_bh(&entry->neigh->dev->xmit_lock); /* block clip_start_xmit() */
109657 + netif_tx_lock_bh(entry->neigh->dev); /* block clip_start_xmit() */
109658 entry->neigh->used = jiffies;
109659 for (walk = &entry->vccs; *walk; walk = &(*walk)->next)
109660 if (*walk == clip_vcc) {
109661 @@ -125,7 +125,7 @@
109662 printk(KERN_CRIT "ATMARP: unlink_clip_vcc failed (entry %p, vcc "
109663 "0x%p)\n",entry,clip_vcc);
109664 out:
109665 - spin_unlock_bh(&entry->neigh->dev->xmit_lock);
109666 + netif_tx_unlock_bh(entry->neigh->dev);
109667 }
109668
109669 /* The neighbour entry n->lock is held. */
109670 diff -Nur linux-2.6.16.33-noxen/net/bridge/br_device.c linux-2.6.16.33/net/bridge/br_device.c
109671 --- linux-2.6.16.33-noxen/net/bridge/br_device.c 2006-11-22 18:06:31.000000000 +0000
109672 +++ linux-2.6.16.33/net/bridge/br_device.c 2007-05-23 21:00:01.000000000 +0000
109673 @@ -146,9 +146,9 @@
109674 struct net_bridge *br = netdev_priv(dev);
109675
109676 if (data)
109677 - br->feature_mask |= NETIF_F_IP_CSUM;
109678 + br->feature_mask |= NETIF_F_NO_CSUM;
109679 else
109680 - br->feature_mask &= ~NETIF_F_IP_CSUM;
109681 + br->feature_mask &= ~NETIF_F_ALL_CSUM;
109682
109683 br_features_recompute(br);
109684 return 0;
109685 @@ -185,6 +185,6 @@
109686 dev->set_mac_address = br_set_mac_address;
109687 dev->priv_flags = IFF_EBRIDGE;
109688
109689 - dev->features = NETIF_F_SG | NETIF_F_FRAGLIST
109690 - | NETIF_F_HIGHDMA | NETIF_F_TSO | NETIF_F_IP_CSUM;
109691 + dev->features = NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HIGHDMA |
109692 + NETIF_F_TSO | NETIF_F_NO_CSUM | NETIF_F_GSO_ROBUST;
109693 }
109694 diff -Nur linux-2.6.16.33-noxen/net/bridge/br_forward.c linux-2.6.16.33/net/bridge/br_forward.c
109695 --- linux-2.6.16.33-noxen/net/bridge/br_forward.c 2006-11-22 18:06:31.000000000 +0000
109696 +++ linux-2.6.16.33/net/bridge/br_forward.c 2007-05-23 21:00:01.000000000 +0000
109697 @@ -32,7 +32,7 @@
109698 int br_dev_queue_push_xmit(struct sk_buff *skb)
109699 {
109700 /* drop mtu oversized packets except tso */
109701 - if (skb->len > skb->dev->mtu && !skb_shinfo(skb)->tso_size)
109702 + if (skb->len > skb->dev->mtu && !skb_is_gso(skb))
109703 kfree_skb(skb);
109704 else {
109705 #ifdef CONFIG_BRIDGE_NETFILTER
109706 diff -Nur linux-2.6.16.33-noxen/net/bridge/br_if.c linux-2.6.16.33/net/bridge/br_if.c
109707 --- linux-2.6.16.33-noxen/net/bridge/br_if.c 2006-11-22 18:06:31.000000000 +0000
109708 +++ linux-2.6.16.33/net/bridge/br_if.c 2007-05-23 21:00:01.000000000 +0000
109709 @@ -385,17 +385,28 @@
109710 struct net_bridge_port *p;
109711 unsigned long features, checksum;
109712
109713 - features = br->feature_mask &~ NETIF_F_IP_CSUM;
109714 - checksum = br->feature_mask & NETIF_F_IP_CSUM;
109715 + checksum = br->feature_mask & NETIF_F_ALL_CSUM ? NETIF_F_NO_CSUM : 0;
109716 + features = br->feature_mask & ~NETIF_F_ALL_CSUM;
109717
109718 list_for_each_entry(p, &br->port_list, list) {
109719 - if (!(p->dev->features
109720 - & (NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM)))
109721 + unsigned long feature = p->dev->features;
109722 +
109723 + if (checksum & NETIF_F_NO_CSUM && !(feature & NETIF_F_NO_CSUM))
109724 + checksum ^= NETIF_F_NO_CSUM | NETIF_F_HW_CSUM;
109725 + if (checksum & NETIF_F_HW_CSUM && !(feature & NETIF_F_HW_CSUM))
109726 + checksum ^= NETIF_F_HW_CSUM | NETIF_F_IP_CSUM;
109727 + if (!(feature & NETIF_F_IP_CSUM))
109728 checksum = 0;
109729 - features &= p->dev->features;
109730 +
109731 + if (feature & NETIF_F_GSO)
109732 + feature |= NETIF_F_TSO;
109733 + feature |= NETIF_F_GSO;
109734 +
109735 + features &= feature;
109736 }
109737
109738 - br->dev->features = features | checksum | NETIF_F_LLTX;
109739 + br->dev->features = features | checksum | NETIF_F_LLTX |
109740 + NETIF_F_GSO_ROBUST;
109741 }
109742
109743 /* called with RTNL */
109744 diff -Nur linux-2.6.16.33-noxen/net/bridge/br_netfilter.c linux-2.6.16.33/net/bridge/br_netfilter.c
109745 --- linux-2.6.16.33-noxen/net/bridge/br_netfilter.c 2006-11-22 18:06:31.000000000 +0000
109746 +++ linux-2.6.16.33/net/bridge/br_netfilter.c 2007-05-23 21:00:01.000000000 +0000
109747 @@ -743,7 +743,7 @@
109748 {
109749 if (skb->protocol == htons(ETH_P_IP) &&
109750 skb->len > skb->dev->mtu &&
109751 - !(skb_shinfo(skb)->ufo_size || skb_shinfo(skb)->tso_size))
109752 + !skb_is_gso(skb))
109753 return ip_fragment(skb, br_dev_queue_push_xmit);
109754 else
109755 return br_dev_queue_push_xmit(skb);
109756 diff -Nur linux-2.6.16.33-noxen/net/core/dev.c linux-2.6.16.33/net/core/dev.c
109757 --- linux-2.6.16.33-noxen/net/core/dev.c 2006-11-22 18:06:31.000000000 +0000
109758 +++ linux-2.6.16.33/net/core/dev.c 2007-01-08 15:00:46.000000000 +0000
109759 @@ -115,6 +115,13 @@
109760 #include <net/iw_handler.h>
109761 #endif /* CONFIG_NET_RADIO */
109762 #include <asm/current.h>
109763 +#include <linux/err.h>
109764 +
109765 +#ifdef CONFIG_XEN
109766 +#include <net/ip.h>
109767 +#include <linux/tcp.h>
109768 +#include <linux/udp.h>
109769 +#endif
109770
109771 /*
109772 * The list of packet types we will receive (as opposed to discard)
109773 @@ -1032,7 +1039,7 @@
109774 * taps currently in use.
109775 */
109776
109777 -void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
109778 +static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
109779 {
109780 struct packet_type *ptype;
109781
109782 @@ -1082,9 +1089,12 @@
109783 unsigned int csum;
109784 int ret = 0, offset = skb->h.raw - skb->data;
109785
109786 - if (inward) {
109787 - skb->ip_summed = CHECKSUM_NONE;
109788 - goto out;
109789 + if (inward)
109790 + goto out_set_summed;
109791 +
109792 + if (unlikely(skb_shinfo(skb)->gso_size)) {
109793 + /* Let GSO fix up the checksum. */
109794 + goto out_set_summed;
109795 }
109796
109797 if (skb_cloned(skb)) {
109798 @@ -1101,11 +1111,65 @@
109799 BUG_ON(skb->csum + 2 > offset);
109800
109801 *(u16*)(skb->h.raw + skb->csum) = csum_fold(csum);
109802 +
109803 +out_set_summed:
109804 skb->ip_summed = CHECKSUM_NONE;
109805 out:
109806 return ret;
109807 }
109808
109809 +/**
109810 + * skb_gso_segment - Perform segmentation on skb.
109811 + * @skb: buffer to segment
109812 + * @features: features for the output path (see dev->features)
109813 + *
109814 + * This function segments the given skb and returns a list of segments.
109815 + *
109816 + * It may return NULL if the skb requires no segmentation. This is
109817 + * only possible when GSO is used for verifying header integrity.
109818 + */
109819 +struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features)
109820 +{
109821 + struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
109822 + struct packet_type *ptype;
109823 + int type = skb->protocol;
109824 + int err;
109825 +
109826 + BUG_ON(skb_shinfo(skb)->frag_list);
109827 +
109828 + skb->mac.raw = skb->data;
109829 + skb->mac_len = skb->nh.raw - skb->data;
109830 + __skb_pull(skb, skb->mac_len);
109831 +
109832 + if (unlikely(skb->ip_summed != CHECKSUM_HW)) {
109833 + if (skb_header_cloned(skb) &&
109834 + (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
109835 + return ERR_PTR(err);
109836 + }
109837 +
109838 + rcu_read_lock();
109839 + list_for_each_entry_rcu(ptype, &ptype_base[ntohs(type) & 15], list) {
109840 + if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
109841 + if (unlikely(skb->ip_summed != CHECKSUM_HW)) {
109842 + err = ptype->gso_send_check(skb);
109843 + segs = ERR_PTR(err);
109844 + if (err || skb_gso_ok(skb, features))
109845 + break;
109846 + __skb_push(skb, skb->data - skb->nh.raw);
109847 + }
109848 + segs = ptype->gso_segment(skb, features);
109849 + break;
109850 + }
109851 + }
109852 + rcu_read_unlock();
109853 +
109854 + __skb_push(skb, skb->data - skb->mac.raw);
109855 +
109856 + return segs;
109857 +}
109858 +
109859 +EXPORT_SYMBOL(skb_gso_segment);
109860 +
109861 /* Take action when hardware reception checksum errors are detected. */
109862 #ifdef CONFIG_BUG
109863 void netdev_rx_csum_fault(struct net_device *dev)
109864 @@ -1142,79 +1206,148 @@
109865 #define illegal_highdma(dev, skb) (0)
109866 #endif
109867
109868 -/* Keep head the same: replace data */
109869 -int __skb_linearize(struct sk_buff *skb, gfp_t gfp_mask)
109870 +struct dev_gso_cb {
109871 + void (*destructor)(struct sk_buff *skb);
109872 +};
109873 +
109874 +#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
109875 +
109876 +static void dev_gso_skb_destructor(struct sk_buff *skb)
109877 {
109878 - unsigned int size;
109879 - u8 *data;
109880 - long offset;
109881 - struct skb_shared_info *ninfo;
109882 - int headerlen = skb->data - skb->head;
109883 - int expand = (skb->tail + skb->data_len) - skb->end;
109884 -
109885 - if (skb_shared(skb))
109886 - BUG();
109887 -
109888 - if (expand <= 0)
109889 - expand = 0;
109890 -
109891 - size = skb->end - skb->head + expand;
109892 - size = SKB_DATA_ALIGN(size);
109893 - data = kmalloc(size + sizeof(struct skb_shared_info), gfp_mask);
109894 - if (!data)
109895 - return -ENOMEM;
109896 -
109897 - /* Copy entire thing */
109898 - if (skb_copy_bits(skb, -headerlen, data, headerlen + skb->len))
109899 - BUG();
109900 -
109901 - /* Set up shinfo */
109902 - ninfo = (struct skb_shared_info*)(data + size);
109903 - atomic_set(&ninfo->dataref, 1);
109904 - ninfo->tso_size = skb_shinfo(skb)->tso_size;
109905 - ninfo->tso_segs = skb_shinfo(skb)->tso_segs;
109906 - ninfo->ufo_size = skb_shinfo(skb)->ufo_size;
109907 - ninfo->nr_frags = 0;
109908 - ninfo->frag_list = NULL;
109909 -
109910 - /* Offset between the two in bytes */
109911 - offset = data - skb->head;
109912 -
109913 - /* Free old data. */
109914 - skb_release_data(skb);
109915 -
109916 - skb->head = data;
109917 - skb->end = data + size;
109918 -
109919 - /* Set up new pointers */
109920 - skb->h.raw += offset;
109921 - skb->nh.raw += offset;
109922 - skb->mac.raw += offset;
109923 - skb->tail += offset;
109924 - skb->data += offset;
109925 + struct dev_gso_cb *cb;
109926 +
109927 + do {
109928 + struct sk_buff *nskb = skb->next;
109929
109930 - /* We are no longer a clone, even if we were. */
109931 - skb->cloned = 0;
109932 + skb->next = nskb->next;
109933 + nskb->next = NULL;
109934 + kfree_skb(nskb);
109935 + } while (skb->next);
109936 +
109937 + cb = DEV_GSO_CB(skb);
109938 + if (cb->destructor)
109939 + cb->destructor(skb);
109940 +}
109941 +
109942 +/**
109943 + * dev_gso_segment - Perform emulated hardware segmentation on skb.
109944 + * @skb: buffer to segment
109945 + *
109946 + * This function segments the given skb and stores the list of segments
109947 + * in skb->next.
109948 + */
109949 +static int dev_gso_segment(struct sk_buff *skb)
109950 +{
109951 + struct net_device *dev = skb->dev;
109952 + struct sk_buff *segs;
109953 + int features = dev->features & ~(illegal_highdma(dev, skb) ?
109954 + NETIF_F_SG : 0);
109955 +
109956 + segs = skb_gso_segment(skb, features);
109957 +
109958 + /* Verifying header integrity only. */
109959 + if (!segs)
109960 + return 0;
109961 +
109962 + if (unlikely(IS_ERR(segs)))
109963 + return PTR_ERR(segs);
109964 +
109965 + skb->next = segs;
109966 + DEV_GSO_CB(skb)->destructor = skb->destructor;
109967 + skb->destructor = dev_gso_skb_destructor;
109968
109969 - skb->tail += skb->data_len;
109970 - skb->data_len = 0;
109971 + return 0;
109972 +}
109973 +
109974 +int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev)
109975 +{
109976 + if (likely(!skb->next)) {
109977 + if (netdev_nit)
109978 + dev_queue_xmit_nit(skb, dev);
109979 +
109980 + if (netif_needs_gso(dev, skb)) {
109981 + if (unlikely(dev_gso_segment(skb)))
109982 + goto out_kfree_skb;
109983 + if (skb->next)
109984 + goto gso;
109985 + }
109986 +
109987 + return dev->hard_start_xmit(skb, dev);
109988 + }
109989 +
109990 +gso:
109991 + do {
109992 + struct sk_buff *nskb = skb->next;
109993 + int rc;
109994 +
109995 + skb->next = nskb->next;
109996 + nskb->next = NULL;
109997 + rc = dev->hard_start_xmit(nskb, dev);
109998 + if (unlikely(rc)) {
109999 + nskb->next = skb->next;
110000 + skb->next = nskb;
110001 + return rc;
110002 + }
110003 + if (unlikely(netif_queue_stopped(dev) && skb->next))
110004 + return NETDEV_TX_BUSY;
110005 + } while (skb->next);
110006 +
110007 + skb->destructor = DEV_GSO_CB(skb)->destructor;
110008 +
110009 +out_kfree_skb:
110010 + kfree_skb(skb);
110011 return 0;
110012 }
110013
110014 #define HARD_TX_LOCK(dev, cpu) { \
110015 if ((dev->features & NETIF_F_LLTX) == 0) { \
110016 - spin_lock(&dev->xmit_lock); \
110017 - dev->xmit_lock_owner = cpu; \
110018 + netif_tx_lock(dev); \
110019 } \
110020 }
110021
110022 #define HARD_TX_UNLOCK(dev) { \
110023 if ((dev->features & NETIF_F_LLTX) == 0) { \
110024 - dev->xmit_lock_owner = -1; \
110025 - spin_unlock(&dev->xmit_lock); \
110026 + netif_tx_unlock(dev); \
110027 } \
110028 }
110029
110030 +#ifdef CONFIG_XEN
110031 +inline int skb_checksum_setup(struct sk_buff *skb)
110032 +{
110033 + if (skb->proto_csum_blank) {
110034 + if (skb->protocol != htons(ETH_P_IP))
110035 + goto out;
110036 + skb->h.raw = (unsigned char *)skb->nh.iph + 4*skb->nh.iph->ihl;
110037 + if (skb->h.raw >= skb->tail)
110038 + goto out;
110039 + switch (skb->nh.iph->protocol) {
110040 + case IPPROTO_TCP:
110041 + skb->csum = offsetof(struct tcphdr, check);
110042 + break;
110043 + case IPPROTO_UDP:
110044 + skb->csum = offsetof(struct udphdr, check);
110045 + break;
110046 + default:
110047 + if (net_ratelimit())
110048 + printk(KERN_ERR "Attempting to checksum a non-"
110049 + "TCP/UDP packet, dropping a protocol"
110050 + " %d packet", skb->nh.iph->protocol);
110051 + goto out;
110052 + }
110053 + if ((skb->h.raw + skb->csum + 2) > skb->tail)
110054 + goto out;
110055 + skb->ip_summed = CHECKSUM_HW;
110056 + skb->proto_csum_blank = 0;
110057 + }
110058 + return 0;
110059 +out:
110060 + return -EPROTO;
110061 +}
110062 +#else
110063 +inline int skb_checksum_setup(struct sk_buff *skb) { return 0; }
110064 +#endif
110065 +
110066 +
110067 /**
110068 * dev_queue_xmit - transmit a buffer
110069 * @skb: buffer to transmit
110070 @@ -1247,9 +1380,19 @@
110071 struct Qdisc *q;
110072 int rc = -ENOMEM;
110073
110074 + /* If a checksum-deferred packet is forwarded to a device that needs a
110075 + * checksum, correct the pointers and force checksumming.
110076 + */
110077 + if (skb_checksum_setup(skb))
110078 + goto out_kfree_skb;
110079 +
110080 + /* GSO will handle the following emulations directly. */
110081 + if (netif_needs_gso(dev, skb))
110082 + goto gso;
110083 +
110084 if (skb_shinfo(skb)->frag_list &&
110085 !(dev->features & NETIF_F_FRAGLIST) &&
110086 - __skb_linearize(skb, GFP_ATOMIC))
110087 + __skb_linearize(skb))
110088 goto out_kfree_skb;
110089
110090 /* Fragmented skb is linearized if device does not support SG,
110091 @@ -1258,25 +1401,26 @@
110092 */
110093 if (skb_shinfo(skb)->nr_frags &&
110094 (!(dev->features & NETIF_F_SG) || illegal_highdma(dev, skb)) &&
110095 - __skb_linearize(skb, GFP_ATOMIC))
110096 + __skb_linearize(skb))
110097 goto out_kfree_skb;
110098
110099 /* If packet is not checksummed and device does not support
110100 * checksumming for this protocol, complete checksumming here.
110101 */
110102 if (skb->ip_summed == CHECKSUM_HW &&
110103 - (!(dev->features & (NETIF_F_HW_CSUM | NETIF_F_NO_CSUM)) &&
110104 + (!(dev->features & NETIF_F_GEN_CSUM) &&
110105 (!(dev->features & NETIF_F_IP_CSUM) ||
110106 skb->protocol != htons(ETH_P_IP))))
110107 if (skb_checksum_help(skb, 0))
110108 goto out_kfree_skb;
110109
110110 +gso:
110111 spin_lock_prefetch(&dev->queue_lock);
110112
110113 /* Disable soft irqs for various locks below. Also
110114 * stops preemption for RCU.
110115 */
110116 - local_bh_disable();
110117 + rcu_read_lock_bh();
110118
110119 /* Updates of qdisc are serialized by queue_lock.
110120 * The struct Qdisc which is pointed to by qdisc is now a
110121 @@ -1310,8 +1454,8 @@
110122 /* The device has no queue. Common case for software devices:
110123 loopback, all the sorts of tunnels...
110124
110125 - Really, it is unlikely that xmit_lock protection is necessary here.
110126 - (f.e. loopback and IP tunnels are clean ignoring statistics
110127 + Really, it is unlikely that netif_tx_lock protection is necessary
110128 + here. (f.e. loopback and IP tunnels are clean ignoring statistics
110129 counters.)
110130 However, it is possible, that they rely on protection
110131 made by us here.
110132 @@ -1327,11 +1471,8 @@
110133 HARD_TX_LOCK(dev, cpu);
110134
110135 if (!netif_queue_stopped(dev)) {
110136 - if (netdev_nit)
110137 - dev_queue_xmit_nit(skb, dev);
110138 -
110139 rc = 0;
110140 - if (!dev->hard_start_xmit(skb, dev)) {
110141 + if (!dev_hard_start_xmit(skb, dev)) {
110142 HARD_TX_UNLOCK(dev);
110143 goto out;
110144 }
110145 @@ -1350,13 +1491,13 @@
110146 }
110147
110148 rc = -ENETDOWN;
110149 - local_bh_enable();
110150 + rcu_read_unlock_bh();
110151
110152 out_kfree_skb:
110153 kfree_skb(skb);
110154 return rc;
110155 out:
110156 - local_bh_enable();
110157 + rcu_read_unlock_bh();
110158 return rc;
110159 }
110160
110161 @@ -1610,6 +1751,19 @@
110162 }
110163 #endif
110164
110165 +#ifdef CONFIG_XEN
110166 + switch (skb->ip_summed) {
110167 + case CHECKSUM_UNNECESSARY:
110168 + skb->proto_data_valid = 1;
110169 + break;
110170 + case CHECKSUM_HW:
110171 + /* XXX Implement me. */
110172 + default:
110173 + skb->proto_data_valid = 0;
110174 + break;
110175 + }
110176 +#endif
110177 +
110178 list_for_each_entry_rcu(ptype, &ptype_all, list) {
110179 if (!ptype->dev || ptype->dev == skb->dev) {
110180 if (pt_prev)
110181 @@ -2671,7 +2825,7 @@
110182 BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
110183
110184 spin_lock_init(&dev->queue_lock);
110185 - spin_lock_init(&dev->xmit_lock);
110186 + spin_lock_init(&dev->_xmit_lock);
110187 dev->xmit_lock_owner = -1;
110188 #ifdef CONFIG_NET_CLS_ACT
110189 spin_lock_init(&dev->ingress_lock);
110190 @@ -2715,9 +2869,7 @@
110191
110192 /* Fix illegal SG+CSUM combinations. */
110193 if ((dev->features & NETIF_F_SG) &&
110194 - !(dev->features & (NETIF_F_IP_CSUM |
110195 - NETIF_F_NO_CSUM |
110196 - NETIF_F_HW_CSUM))) {
110197 + !(dev->features & NETIF_F_ALL_CSUM)) {
110198 printk("%s: Dropping NETIF_F_SG since no checksum feature.\n",
110199 dev->name);
110200 dev->features &= ~NETIF_F_SG;
110201 @@ -3269,7 +3421,6 @@
110202 EXPORT_SYMBOL(__dev_get_by_index);
110203 EXPORT_SYMBOL(__dev_get_by_name);
110204 EXPORT_SYMBOL(__dev_remove_pack);
110205 -EXPORT_SYMBOL(__skb_linearize);
110206 EXPORT_SYMBOL(dev_valid_name);
110207 EXPORT_SYMBOL(dev_add_pack);
110208 EXPORT_SYMBOL(dev_alloc_name);
110209 @@ -3301,6 +3452,7 @@
110210 EXPORT_SYMBOL(net_enable_timestamp);
110211 EXPORT_SYMBOL(net_disable_timestamp);
110212 EXPORT_SYMBOL(dev_get_flags);
110213 +EXPORT_SYMBOL(skb_checksum_setup);
110214
110215 #if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)
110216 EXPORT_SYMBOL(br_handle_frame_hook);
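
The skb_checksum_setup() routine added to net/core/dev.c above handles Xen's deferred
transmit checksums: a frame handed over from another domain may carry proto_csum_blank,
meaning only the pseudo-header sum has been filled in, so before the frame reaches a real
NIC the transport-header pointer and the offset of the checksum field are rebuilt from the
IP header (4 * ihl bytes in, then offsetof() into the TCP or UDP header). The fragment
below is a minimal userspace sketch of that arithmetic only; it uses the glibc netinet
headers rather than the kernel's sk_buff fields and is illustrative, not something the
patch itself adds.

	/* sketch: locate the transport checksum field the way
	 * skb_checksum_setup() does for an IPv4 packet */
	#include <stdio.h>
	#include <stddef.h>
	#include <string.h>
	#include <netinet/ip.h>
	#include <netinet/tcp.h>
	#include <netinet/udp.h>

	int main(void)
	{
		struct iphdr iph;
		size_t thoff, csum_field;

		memset(&iph, 0, sizeof(iph));
		iph.version = 4;
		iph.ihl = 5;			/* 20-byte header, no options */
		iph.protocol = IPPROTO_TCP;

		/* transport header starts 4 * ihl bytes into the packet ... */
		thoff = 4 * (size_t)iph.ihl;
		/* ... and the checksum field sits offsetof(..., check) past it */
		csum_field = thoff + (iph.protocol == IPPROTO_TCP
				      ? offsetof(struct tcphdr, check)
				      : offsetof(struct udphdr, check));

		printf("transport header at byte %zu, checksum field at byte %zu\n",
		       thoff, csum_field);
		return 0;
	}
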
110217 diff -Nur linux-2.6.16.33-noxen/net/core/dev_mcast.c linux-2.6.16.33/net/core/dev_mcast.c
110218 --- linux-2.6.16.33-noxen/net/core/dev_mcast.c 2006-11-22 18:06:31.000000000 +0000
110219 +++ linux-2.6.16.33/net/core/dev_mcast.c 2007-05-23 21:00:01.000000000 +0000
110220 @@ -62,7 +62,7 @@
110221 * Device mc lists are changed by bh at least if IPv6 is enabled,
110222 * so that it must be bh protected.
110223 *
110224 - * We block accesses to device mc filters with dev->xmit_lock.
110225 + * We block accesses to device mc filters with netif_tx_lock.
110226 */
110227
110228 /*
110229 @@ -93,9 +93,9 @@
110230
110231 void dev_mc_upload(struct net_device *dev)
110232 {
110233 - spin_lock_bh(&dev->xmit_lock);
110234 + netif_tx_lock_bh(dev);
110235 __dev_mc_upload(dev);
110236 - spin_unlock_bh(&dev->xmit_lock);
110237 + netif_tx_unlock_bh(dev);
110238 }
110239
110240 /*
110241 @@ -107,7 +107,7 @@
110242 int err = 0;
110243 struct dev_mc_list *dmi, **dmip;
110244
110245 - spin_lock_bh(&dev->xmit_lock);
110246 + netif_tx_lock_bh(dev);
110247
110248 for (dmip = &dev->mc_list; (dmi = *dmip) != NULL; dmip = &dmi->next) {
110249 /*
110250 @@ -139,13 +139,13 @@
110251 */
110252 __dev_mc_upload(dev);
110253
110254 - spin_unlock_bh(&dev->xmit_lock);
110255 + netif_tx_unlock_bh(dev);
110256 return 0;
110257 }
110258 }
110259 err = -ENOENT;
110260 done:
110261 - spin_unlock_bh(&dev->xmit_lock);
110262 + netif_tx_unlock_bh(dev);
110263 return err;
110264 }
110265
110266 @@ -160,7 +160,7 @@
110267
110268 dmi1 = kmalloc(sizeof(*dmi), GFP_ATOMIC);
110269
110270 - spin_lock_bh(&dev->xmit_lock);
110271 + netif_tx_lock_bh(dev);
110272 for (dmi = dev->mc_list; dmi != NULL; dmi = dmi->next) {
110273 if (memcmp(dmi->dmi_addr, addr, dmi->dmi_addrlen) == 0 &&
110274 dmi->dmi_addrlen == alen) {
110275 @@ -176,7 +176,7 @@
110276 }
110277
110278 if ((dmi = dmi1) == NULL) {
110279 - spin_unlock_bh(&dev->xmit_lock);
110280 + netif_tx_unlock_bh(dev);
110281 return -ENOMEM;
110282 }
110283 memcpy(dmi->dmi_addr, addr, alen);
110284 @@ -189,11 +189,11 @@
110285
110286 __dev_mc_upload(dev);
110287
110288 - spin_unlock_bh(&dev->xmit_lock);
110289 + netif_tx_unlock_bh(dev);
110290 return 0;
110291
110292 done:
110293 - spin_unlock_bh(&dev->xmit_lock);
110294 + netif_tx_unlock_bh(dev);
110295 kfree(dmi1);
110296 return err;
110297 }
110298 @@ -204,7 +204,7 @@
110299
110300 void dev_mc_discard(struct net_device *dev)
110301 {
110302 - spin_lock_bh(&dev->xmit_lock);
110303 + netif_tx_lock_bh(dev);
110304
110305 while (dev->mc_list != NULL) {
110306 struct dev_mc_list *tmp = dev->mc_list;
110307 @@ -215,7 +215,7 @@
110308 }
110309 dev->mc_count = 0;
110310
110311 - spin_unlock_bh(&dev->xmit_lock);
110312 + netif_tx_unlock_bh(dev);
110313 }
110314
110315 #ifdef CONFIG_PROC_FS
110316 @@ -250,7 +250,7 @@
110317 struct dev_mc_list *m;
110318 struct net_device *dev = v;
110319
110320 - spin_lock_bh(&dev->xmit_lock);
110321 + netif_tx_lock_bh(dev);
110322 for (m = dev->mc_list; m; m = m->next) {
110323 int i;
110324
110325 @@ -262,7 +262,7 @@
110326
110327 seq_putc(seq, '\n');
110328 }
110329 - spin_unlock_bh(&dev->xmit_lock);
110330 + netif_tx_unlock_bh(dev);
110331 return 0;
110332 }
110333
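
The dev_mcast.c hunk just above, like the netpoll.c and pktgen.c hunks further down, is a
mechanical conversion from taking dev->xmit_lock by hand to the netif_tx_lock()/
netif_tx_unlock() helpers (the dev.c hunk renames the spinlock itself to dev->_xmit_lock).
The helpers themselves come from the include/linux/netdevice.h part of this patch, which is
not shown in this excerpt; assuming that hunk follows the mainline 2.6.18 definitions built
on dev->_xmit_lock and dev->xmit_lock_owner, they look roughly like this sketch:

	/* sketch of the helpers the converted callers now use; assumes the
	 * netdevice.h hunk matches the mainline shape */
	static inline void netif_tx_lock(struct net_device *dev)
	{
		spin_lock(&dev->_xmit_lock);
		dev->xmit_lock_owner = smp_processor_id();
	}

	static inline void netif_tx_lock_bh(struct net_device *dev)
	{
		spin_lock_bh(&dev->_xmit_lock);
		dev->xmit_lock_owner = smp_processor_id();
	}

	static inline void netif_tx_unlock(struct net_device *dev)
	{
		dev->xmit_lock_owner = -1;
		spin_unlock(&dev->_xmit_lock);
	}

	static inline void netif_tx_unlock_bh(struct net_device *dev)
	{
		dev->xmit_lock_owner = -1;
		spin_unlock_bh(&dev->_xmit_lock);
	}
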
110334 diff -Nur linux-2.6.16.33-noxen/net/core/ethtool.c linux-2.6.16.33/net/core/ethtool.c
110335 --- linux-2.6.16.33-noxen/net/core/ethtool.c 2006-11-22 18:06:31.000000000 +0000
110336 +++ linux-2.6.16.33/net/core/ethtool.c 2007-05-23 21:00:01.000000000 +0000
110337 @@ -30,7 +30,7 @@
110338
110339 u32 ethtool_op_get_tx_csum(struct net_device *dev)
110340 {
110341 - return (dev->features & (NETIF_F_IP_CSUM | NETIF_F_HW_CSUM)) != 0;
110342 + return (dev->features & NETIF_F_ALL_CSUM) != 0;
110343 }
110344
110345 int ethtool_op_set_tx_csum(struct net_device *dev, u32 data)
110346 @@ -551,9 +551,7 @@
110347 return -EFAULT;
110348
110349 if (edata.data &&
110350 - !(dev->features & (NETIF_F_IP_CSUM |
110351 - NETIF_F_NO_CSUM |
110352 - NETIF_F_HW_CSUM)))
110353 + !(dev->features & NETIF_F_ALL_CSUM))
110354 return -EINVAL;
110355
110356 return __ethtool_set_sg(dev, edata.data);
110357 @@ -561,7 +559,7 @@
110358
110359 static int ethtool_get_tso(struct net_device *dev, char __user *useraddr)
110360 {
110361 - struct ethtool_value edata = { ETHTOOL_GTSO };
110362 + struct ethtool_value edata = { ETHTOOL_GTSO };
110363
110364 if (!dev->ethtool_ops->get_tso)
110365 return -EOPNOTSUPP;
110366 @@ -616,6 +614,29 @@
110367 return dev->ethtool_ops->set_ufo(dev, edata.data);
110368 }
110369
110370 +static int ethtool_get_gso(struct net_device *dev, char __user *useraddr)
110371 +{
110372 + struct ethtool_value edata = { ETHTOOL_GGSO };
110373 +
110374 + edata.data = dev->features & NETIF_F_GSO;
110375 + if (copy_to_user(useraddr, &edata, sizeof(edata)))
110376 + return -EFAULT;
110377 + return 0;
110378 +}
110379 +
110380 +static int ethtool_set_gso(struct net_device *dev, char __user *useraddr)
110381 +{
110382 + struct ethtool_value edata;
110383 +
110384 + if (copy_from_user(&edata, useraddr, sizeof(edata)))
110385 + return -EFAULT;
110386 + if (edata.data)
110387 + dev->features |= NETIF_F_GSO;
110388 + else
110389 + dev->features &= ~NETIF_F_GSO;
110390 + return 0;
110391 +}
110392 +
110393 static int ethtool_self_test(struct net_device *dev, char __user *useraddr)
110394 {
110395 struct ethtool_test test;
110396 @@ -907,6 +928,12 @@
110397 case ETHTOOL_SUFO:
110398 rc = ethtool_set_ufo(dev, useraddr);
110399 break;
110400 + case ETHTOOL_GGSO:
110401 + rc = ethtool_get_gso(dev, useraddr);
110402 + break;
110403 + case ETHTOOL_SGSO:
110404 + rc = ethtool_set_gso(dev, useraddr);
110405 + break;
110406 default:
110407 rc = -EOPNOTSUPP;
110408 }
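
The ethtool.c changes above wire the new NETIF_F_GSO flag up to userspace through the
ETHTOOL_GGSO/ETHTOOL_SGSO command pair. Below is a small sketch of how a tool could query
and enable GSO over the SIOCETHTOOL ioctl; the fallback command numbers (0x23/0x24) are the
mainline values and the interface name "eth0" is only an example, so adjust both if the
ethtool.h hunk of this patch differs.

	/* sketch: query and enable GSO on an interface via SIOCETHTOOL */
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <sys/socket.h>
	#include <net/if.h>
	#include <linux/ethtool.h>
	#include <linux/sockios.h>

	#ifndef ETHTOOL_GGSO
	#define ETHTOOL_GGSO 0x00000023	/* assumed: mainline command numbers */
	#define ETHTOOL_SGSO 0x00000024
	#endif

	int main(void)
	{
		struct ethtool_value eval = { .cmd = ETHTOOL_GGSO };
		struct ifreq ifr;
		int fd = socket(AF_INET, SOCK_DGRAM, 0);

		memset(&ifr, 0, sizeof(ifr));
		strncpy(ifr.ifr_name, "eth0", IFNAMSIZ - 1);	/* example device */
		ifr.ifr_data = (char *)&eval;

		if (fd < 0 || ioctl(fd, SIOCETHTOOL, &ifr) < 0) {
			perror("ETHTOOL_GGSO");
			return 1;
		}
		printf("GSO currently %s\n", eval.data ? "on" : "off");

		eval.cmd = ETHTOOL_SGSO;	/* now turn it on */
		eval.data = 1;
		if (ioctl(fd, SIOCETHTOOL, &ifr) < 0)
			perror("ETHTOOL_SGSO");
		close(fd);
		return 0;
	}
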
110409 diff -Nur linux-2.6.16.33-noxen/net/core/netpoll.c linux-2.6.16.33/net/core/netpoll.c
110410 --- linux-2.6.16.33-noxen/net/core/netpoll.c 2006-11-22 18:06:31.000000000 +0000
110411 +++ linux-2.6.16.33/net/core/netpoll.c 2007-05-23 21:00:01.000000000 +0000
110412 @@ -273,24 +273,21 @@
110413
110414 do {
110415 npinfo->tries--;
110416 - spin_lock(&np->dev->xmit_lock);
110417 - np->dev->xmit_lock_owner = smp_processor_id();
110418 + netif_tx_lock(np->dev);
110419
110420 /*
110421 * network drivers do not expect to be called if the queue is
110422 * stopped.
110423 */
110424 if (netif_queue_stopped(np->dev)) {
110425 - np->dev->xmit_lock_owner = -1;
110426 - spin_unlock(&np->dev->xmit_lock);
110427 + netif_tx_unlock(np->dev);
110428 netpoll_poll(np);
110429 udelay(50);
110430 continue;
110431 }
110432
110433 status = np->dev->hard_start_xmit(skb, np->dev);
110434 - np->dev->xmit_lock_owner = -1;
110435 - spin_unlock(&np->dev->xmit_lock);
110436 + netif_tx_unlock(np->dev);
110437
110438 /* success */
110439 if(!status) {
110440 diff -Nur linux-2.6.16.33-noxen/net/core/pktgen.c linux-2.6.16.33/net/core/pktgen.c
110441 --- linux-2.6.16.33-noxen/net/core/pktgen.c 2006-11-22 18:06:31.000000000 +0000
110442 +++ linux-2.6.16.33/net/core/pktgen.c 2007-05-23 21:00:01.000000000 +0000
110443 @@ -2586,7 +2586,7 @@
110444 }
110445 }
110446
110447 - spin_lock_bh(&odev->xmit_lock);
110448 + netif_tx_lock_bh(odev);
110449 if (!netif_queue_stopped(odev)) {
110450
110451 atomic_inc(&(pkt_dev->skb->users));
110452 @@ -2631,7 +2631,7 @@
110453 pkt_dev->next_tx_ns = 0;
110454 }
110455
110456 - spin_unlock_bh(&odev->xmit_lock);
110457 + netif_tx_unlock_bh(odev);
110458
110459 /* If pkt_dev->count is zero, then run forever */
110460 if ((pkt_dev->count != 0) && (pkt_dev->sofar >= pkt_dev->count)) {
110461 diff -Nur linux-2.6.16.33-noxen/net/core/skbuff.c linux-2.6.16.33/net/core/skbuff.c
110462 --- linux-2.6.16.33-noxen/net/core/skbuff.c 2006-11-22 18:06:31.000000000 +0000
110463 +++ linux-2.6.16.33/net/core/skbuff.c 2007-01-08 15:00:46.000000000 +0000
110464 @@ -132,6 +132,7 @@
110465 * Buffers may only be allocated from interrupts using a @gfp_mask of
110466 * %GFP_ATOMIC.
110467 */
110468 +#ifndef CONFIG_HAVE_ARCH_ALLOC_SKB
110469 struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
110470 int fclone)
110471 {
110472 @@ -164,9 +165,9 @@
110473 shinfo = skb_shinfo(skb);
110474 atomic_set(&shinfo->dataref, 1);
110475 shinfo->nr_frags = 0;
110476 - shinfo->tso_size = 0;
110477 - shinfo->tso_segs = 0;
110478 - shinfo->ufo_size = 0;
110479 + shinfo->gso_size = 0;
110480 + shinfo->gso_segs = 0;
110481 + shinfo->gso_type = 0;
110482 shinfo->ip6_frag_id = 0;
110483 shinfo->frag_list = NULL;
110484
110485 @@ -186,6 +187,7 @@
110486 skb = NULL;
110487 goto out;
110488 }
110489 +#endif /* !CONFIG_HAVE_ARCH_ALLOC_SKB */
110490
110491 /**
110492 * alloc_skb_from_cache - allocate a network buffer
110493 @@ -203,14 +205,18 @@
110494 */
110495 struct sk_buff *alloc_skb_from_cache(kmem_cache_t *cp,
110496 unsigned int size,
110497 - gfp_t gfp_mask)
110498 + gfp_t gfp_mask,
110499 + int fclone)
110500 {
110501 + kmem_cache_t *cache;
110502 + struct skb_shared_info *shinfo;
110503 struct sk_buff *skb;
110504 u8 *data;
110505
110506 + cache = fclone ? skbuff_fclone_cache : skbuff_head_cache;
110507 +
110508 /* Get the HEAD */
110509 - skb = kmem_cache_alloc(skbuff_head_cache,
110510 - gfp_mask & ~__GFP_DMA);
110511 + skb = kmem_cache_alloc(cache, gfp_mask & ~__GFP_DMA);
110512 if (!skb)
110513 goto out;
110514
110515 @@ -227,17 +233,29 @@
110516 skb->data = data;
110517 skb->tail = data;
110518 skb->end = data + size;
110519 + /* make sure we initialize shinfo sequentially */
110520 + shinfo = skb_shinfo(skb);
110521 + atomic_set(&shinfo->dataref, 1);
110522 + shinfo->nr_frags = 0;
110523 + shinfo->gso_size = 0;
110524 + shinfo->gso_segs = 0;
110525 + shinfo->gso_type = 0;
110526 + shinfo->ip6_frag_id = 0;
110527 + shinfo->frag_list = NULL;
110528
110529 - atomic_set(&(skb_shinfo(skb)->dataref), 1);
110530 - skb_shinfo(skb)->nr_frags = 0;
110531 - skb_shinfo(skb)->tso_size = 0;
110532 - skb_shinfo(skb)->tso_segs = 0;
110533 - skb_shinfo(skb)->ufo_size = 0;
110534 - skb_shinfo(skb)->frag_list = NULL;
110535 + if (fclone) {
110536 + struct sk_buff *child = skb + 1;
110537 + atomic_t *fclone_ref = (atomic_t *) (child + 1);
110538 +
110539 + skb->fclone = SKB_FCLONE_ORIG;
110540 + atomic_set(fclone_ref, 1);
110541 +
110542 + child->fclone = SKB_FCLONE_UNAVAILABLE;
110543 + }
110544 out:
110545 return skb;
110546 nodata:
110547 - kmem_cache_free(skbuff_head_cache, skb);
110548 + kmem_cache_free(cache, skb);
110549 skb = NULL;
110550 goto out;
110551 }
110552 @@ -414,6 +432,10 @@
110553 C(local_df);
110554 n->cloned = 1;
110555 n->nohdr = 0;
110556 +#ifdef CONFIG_XEN
110557 + C(proto_data_valid);
110558 + C(proto_csum_blank);
110559 +#endif
110560 C(pkt_type);
110561 C(ip_summed);
110562 C(priority);
110563 @@ -507,9 +529,9 @@
110564 new->tc_index = old->tc_index;
110565 #endif
110566 atomic_set(&new->users, 1);
110567 - skb_shinfo(new)->tso_size = skb_shinfo(old)->tso_size;
110568 - skb_shinfo(new)->tso_segs = skb_shinfo(old)->tso_segs;
110569 - skb_shinfo(new)->ufo_size = skb_shinfo(old)->ufo_size;
110570 + skb_shinfo(new)->gso_size = skb_shinfo(old)->gso_size;
110571 + skb_shinfo(new)->gso_segs = skb_shinfo(old)->gso_segs;
110572 + skb_shinfo(new)->gso_type = skb_shinfo(old)->gso_type;
110573 }
110574
110575 /**
110576 @@ -1822,6 +1844,132 @@
110577 return 0;
110578 }
110579
110580 +/**
110581 + * skb_segment - Perform protocol segmentation on skb.
110582 + * @skb: buffer to segment
110583 + * @features: features for the output path (see dev->features)
110584 + *
110585 + * This function performs segmentation on the given skb. It returns
110586 + * a list of segments built from @skb, or an ERR_PTR() encoded
110587 + * error if segmentation fails.
110588 + */
110589 +struct sk_buff *skb_segment(struct sk_buff *skb, int features)
110590 +{
110591 + struct sk_buff *segs = NULL;
110592 + struct sk_buff *tail = NULL;
110593 + unsigned int mss = skb_shinfo(skb)->gso_size;
110594 + unsigned int doffset = skb->data - skb->mac.raw;
110595 + unsigned int offset = doffset;
110596 + unsigned int headroom;
110597 + unsigned int len;
110598 + int sg = features & NETIF_F_SG;
110599 + int nfrags = skb_shinfo(skb)->nr_frags;
110600 + int err = -ENOMEM;
110601 + int i = 0;
110602 + int pos;
110603 +
110604 + __skb_push(skb, doffset);
110605 + headroom = skb_headroom(skb);
110606 + pos = skb_headlen(skb);
110607 +
110608 + do {
110609 + struct sk_buff *nskb;
110610 + skb_frag_t *frag;
110611 + int hsize;
110612 + int k;
110613 + int size;
110614 +
110615 + len = skb->len - offset;
110616 + if (len > mss)
110617 + len = mss;
110618 +
110619 + hsize = skb_headlen(skb) - offset;
110620 + if (hsize < 0)
110621 + hsize = 0;
110622 + if (hsize > len || !sg)
110623 + hsize = len;
110624 +
110625 + nskb = alloc_skb(hsize + doffset + headroom, GFP_ATOMIC);
110626 + if (unlikely(!nskb))
110627 + goto err;
110628 +
110629 + if (segs)
110630 + tail->next = nskb;
110631 + else
110632 + segs = nskb;
110633 + tail = nskb;
110634 +
110635 + nskb->dev = skb->dev;
110636 + nskb->priority = skb->priority;
110637 + nskb->protocol = skb->protocol;
110638 + nskb->dst = dst_clone(skb->dst);
110639 + memcpy(nskb->cb, skb->cb, sizeof(skb->cb));
110640 + nskb->pkt_type = skb->pkt_type;
110641 + nskb->mac_len = skb->mac_len;
110642 +
110643 + skb_reserve(nskb, headroom);
110644 + nskb->mac.raw = nskb->data;
110645 + nskb->nh.raw = nskb->data + skb->mac_len;
110646 + nskb->h.raw = nskb->nh.raw + (skb->h.raw - skb->nh.raw);
110647 + memcpy(skb_put(nskb, doffset), skb->data, doffset);
110648 +
110649 + if (!sg) {
110650 + nskb->csum = skb_copy_and_csum_bits(skb, offset,
110651 + skb_put(nskb, len),
110652 + len, 0);
110653 + continue;
110654 + }
110655 +
110656 + frag = skb_shinfo(nskb)->frags;
110657 + k = 0;
110658 +
110659 + nskb->ip_summed = CHECKSUM_HW;
110660 + nskb->csum = skb->csum;
110661 + memcpy(skb_put(nskb, hsize), skb->data + offset, hsize);
110662 +
110663 + while (pos < offset + len) {
110664 + BUG_ON(i >= nfrags);
110665 +
110666 + *frag = skb_shinfo(skb)->frags[i];
110667 + get_page(frag->page);
110668 + size = frag->size;
110669 +
110670 + if (pos < offset) {
110671 + frag->page_offset += offset - pos;
110672 + frag->size -= offset - pos;
110673 + }
110674 +
110675 + k++;
110676 +
110677 + if (pos + size <= offset + len) {
110678 + i++;
110679 + pos += size;
110680 + } else {
110681 + frag->size -= pos + size - (offset + len);
110682 + break;
110683 + }
110684 +
110685 + frag++;
110686 + }
110687 +
110688 + skb_shinfo(nskb)->nr_frags = k;
110689 + nskb->data_len = len - hsize;
110690 + nskb->len += nskb->data_len;
110691 + nskb->truesize += nskb->data_len;
110692 + } while ((offset += len) < skb->len);
110693 +
110694 + return segs;
110695 +
110696 +err:
110697 + while ((skb = segs)) {
110698 + segs = skb->next;
110699 + kfree_skb(skb);
110700 + }
110701 + return ERR_PTR(err);
110702 +}
110703 +
110704 +EXPORT_SYMBOL_GPL(skb_segment);
110705 +
110706 void __init skb_init(void)
110707 {
110708 skbuff_head_cache = kmem_cache_create("skbuff_head_cache",
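
skb_segment() above is the core of software GSO: it walks the oversized buffer gso_size
bytes at a time, replicates the first doffset header bytes in front of every segment, and
then either copies the payload (non-SG devices) or re-points page fragments into the new
skbs. The loop arithmetic that decides each segment's payload length is sketched below as a
standalone program; the 4500/1448 figures are just example numbers.

	/* sketch: per-segment length calculation mirroring skb_segment()'s loop */
	#include <stdio.h>

	int main(void)
	{
		unsigned int total = 4500;	/* example payload size */
		unsigned int mss   = 1448;	/* example gso_size */
		unsigned int offset = 0, len, seg = 0;

		do {
			len = total - offset;
			if (len > mss)
				len = mss;	/* all but the last segment carry mss bytes */
			printf("segment %u: %u payload bytes\n", seg++, len);
		} while ((offset += len) < total);
		return 0;
	}
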
110709 diff -Nur linux-2.6.16.33-noxen/net/core/skbuff.c~ linux-2.6.16.33/net/core/skbuff.c~
110710 --- linux-2.6.16.33-noxen/net/core/skbuff.c~ 1970-01-01 00:00:00.000000000 +0000
110711 +++ linux-2.6.16.33/net/core/skbuff.c~ 2007-05-23 21:00:01.000000000 +0000
110712 @@ -0,0 +1,2003 @@
110713 +/*
110714 + * Routines having to do with the 'struct sk_buff' memory handlers.
110715 + *
110716 + * Authors: Alan Cox <iiitac@pyr.swan.ac.uk>
110717 + * Florian La Roche <rzsfl@rz.uni-sb.de>
110718 + *
110719 + * Version: $Id: skbuff.c,v 1.90 2001/11/07 05:56:19 davem Exp $
110720 + *
110721 + * Fixes:
110722 + * Alan Cox : Fixed the worst of the load
110723 + * balancer bugs.
110724 + * Dave Platt : Interrupt stacking fix.
110725 + * Richard Kooijman : Timestamp fixes.
110726 + * Alan Cox : Changed buffer format.
110727 + * Alan Cox : destructor hook for AF_UNIX etc.
110728 + * Linus Torvalds : Better skb_clone.
110729 + * Alan Cox : Added skb_copy.
110730 + * Alan Cox : Added all the changed routines Linus
110731 + * only put in the headers
110732 + * Ray VanTassle : Fixed --skb->lock in free
110733 + * Alan Cox : skb_copy copy arp field
110734 + * Andi Kleen : slabified it.
110735 + * Robert Olsson : Removed skb_head_pool
110736 + *
110737 + * NOTE:
110738 + * The __skb_ routines should be called with interrupts
110739 + * disabled, or you better be *real* sure that the operation is atomic
110740 + * with respect to whatever list is being frobbed (e.g. via lock_sock()
110741 + * or via disabling bottom half handlers, etc).
110742 + *
110743 + * This program is free software; you can redistribute it and/or
110744 + * modify it under the terms of the GNU General Public License
110745 + * as published by the Free Software Foundation; either version
110746 + * 2 of the License, or (at your option) any later version.
110747 + */
110748 +
110749 +/*
110750 + * The functions in this file will not compile correctly with gcc 2.4.x
110751 + */
110752 +
110753 +#include <linux/config.h>
110754 +#include <linux/module.h>
110755 +#include <linux/types.h>
110756 +#include <linux/kernel.h>
110757 +#include <linux/sched.h>
110758 +#include <linux/mm.h>
110759 +#include <linux/interrupt.h>
110760 +#include <linux/in.h>
110761 +#include <linux/inet.h>
110762 +#include <linux/slab.h>
110763 +#include <linux/netdevice.h>
110764 +#ifdef CONFIG_NET_CLS_ACT
110765 +#include <net/pkt_sched.h>
110766 +#endif
110767 +#include <linux/string.h>
110768 +#include <linux/skbuff.h>
110769 +#include <linux/cache.h>
110770 +#include <linux/rtnetlink.h>
110771 +#include <linux/init.h>
110772 +#include <linux/highmem.h>
110773 +
110774 +#include <net/protocol.h>
110775 +#include <net/dst.h>
110776 +#include <net/sock.h>
110777 +#include <net/checksum.h>
110778 +#include <net/xfrm.h>
110779 +
110780 +#include <asm/uaccess.h>
110781 +#include <asm/system.h>
110782 +
110783 +static kmem_cache_t *skbuff_head_cache __read_mostly;
110784 +static kmem_cache_t *skbuff_fclone_cache __read_mostly;
110785 +
110786 +/*
110787 + * Keep out-of-line to prevent kernel bloat.
110788 + * __builtin_return_address is not used because it is not always
110789 + * reliable.
110790 + */
110791 +
110792 +/**
110793 + * skb_over_panic - private function
110794 + * @skb: buffer
110795 + * @sz: size
110796 + * @here: address
110797 + *
110798 + * Out of line support code for skb_put(). Not user callable.
110799 + */
110800 +void skb_over_panic(struct sk_buff *skb, int sz, void *here)
110801 +{
110802 + printk(KERN_EMERG "skb_over_panic: text:%p len:%d put:%d head:%p "
110803 + "data:%p tail:%p end:%p dev:%s\n",
110804 + here, skb->len, sz, skb->head, skb->data, skb->tail, skb->end,
110805 + skb->dev ? skb->dev->name : "<NULL>");
110806 + BUG();
110807 +}
110808 +
110809 +/**
110810 + * skb_under_panic - private function
110811 + * @skb: buffer
110812 + * @sz: size
110813 + * @here: address
110814 + *
110815 + * Out of line support code for skb_push(). Not user callable.
110816 + */
110817 +
110818 +void skb_under_panic(struct sk_buff *skb, int sz, void *here)
110819 +{
110820 + printk(KERN_EMERG "skb_under_panic: text:%p len:%d put:%d head:%p "
110821 + "data:%p tail:%p end:%p dev:%s\n",
110822 + here, skb->len, sz, skb->head, skb->data, skb->tail, skb->end,
110823 + skb->dev ? skb->dev->name : "<NULL>");
110824 + BUG();
110825 +}
110826 +
110827 +/* Allocate a new skbuff. We do this ourselves so we can fill in a few
110828 + * 'private' fields and also do memory statistics to find all the
110829 + * [BEEP] leaks.
110830 + *
110831 + */
110832 +
110833 +/**
110834 + * __alloc_skb - allocate a network buffer
110835 + * @size: size to allocate
110836 + * @gfp_mask: allocation mask
110837 + * @fclone: allocate from fclone cache instead of head cache
110838 + * and allocate a cloned (child) skb
110839 + *
110840 + * Allocate a new &sk_buff. The returned buffer has no headroom and a
110841 + * tail room of size bytes. The object has a reference count of one.
110842 + * The return is the buffer. On a failure the return is %NULL.
110843 + *
110844 + * Buffers may only be allocated from interrupts using a @gfp_mask of
110845 + * %GFP_ATOMIC.
110846 + */
110847 +struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
110848 + int fclone)
110849 +{
110850 + kmem_cache_t *cache;
110851 + struct skb_shared_info *shinfo;
110852 + struct sk_buff *skb;
110853 + u8 *data;
110854 +
110855 + cache = fclone ? skbuff_fclone_cache : skbuff_head_cache;
110856 +
110857 + /* Get the HEAD */
110858 + skb = kmem_cache_alloc(cache, gfp_mask & ~__GFP_DMA);
110859 + if (!skb)
110860 + goto out;
110861 +
110862 + /* Get the DATA. Size must match skb_add_mtu(). */
110863 + size = SKB_DATA_ALIGN(size);
110864 + data = kmalloc(size + sizeof(struct skb_shared_info), gfp_mask);
110865 + if (!data)
110866 + goto nodata;
110867 +
110868 + memset(skb, 0, offsetof(struct sk_buff, truesize));
110869 + skb->truesize = size + sizeof(struct sk_buff);
110870 + atomic_set(&skb->users, 1);
110871 + skb->head = data;
110872 + skb->data = data;
110873 + skb->tail = data;
110874 + skb->end = data + size;
110875 + /* make sure we initialize shinfo sequentially */
110876 + shinfo = skb_shinfo(skb);
110877 + atomic_set(&shinfo->dataref, 1);
110878 + shinfo->nr_frags = 0;
110879 + shinfo->gso_size = 0;
110880 + shinfo->gso_segs = 0;
110881 + shinfo->gso_type = 0;
110882 + shinfo->ip6_frag_id = 0;
110883 + shinfo->frag_list = NULL;
110884 +
110885 + if (fclone) {
110886 + struct sk_buff *child = skb + 1;
110887 + atomic_t *fclone_ref = (atomic_t *) (child + 1);
110888 +
110889 + skb->fclone = SKB_FCLONE_ORIG;
110890 + atomic_set(fclone_ref, 1);
110891 +
110892 + child->fclone = SKB_FCLONE_UNAVAILABLE;
110893 + }
110894 +out:
110895 + return skb;
110896 +nodata:
110897 + kmem_cache_free(cache, skb);
110898 + skb = NULL;
110899 + goto out;
110900 +}
110901 +
110902 +/**
110903 + * alloc_skb_from_cache - allocate a network buffer
110904 + * @cp: kmem_cache from which to allocate the data area
110905 + * (object size must be big enough for @size bytes + skb overheads)
110906 + * @size: size to allocate
110907 + * @gfp_mask: allocation mask
110908 + *
110909 + * Allocate a new &sk_buff. The returned buffer has no headroom and
110910 + * tail room of size bytes. The object has a reference count of one.
110911 + * The return is the buffer. On a failure the return is %NULL.
110912 + *
110913 + * Buffers may only be allocated from interrupts using a @gfp_mask of
110914 + * %GFP_ATOMIC.
110915 + */
110916 +struct sk_buff *alloc_skb_from_cache(kmem_cache_t *cp,
110917 + unsigned int size,
110918 + gfp_t gfp_mask)
110919 +{
110920 + struct sk_buff *skb;
110921 + u8 *data;
110922 +
110923 + /* Get the HEAD */
110924 + skb = kmem_cache_alloc(skbuff_head_cache,
110925 + gfp_mask & ~__GFP_DMA);
110926 + if (!skb)
110927 + goto out;
110928 +
110929 + /* Get the DATA. */
110930 + size = SKB_DATA_ALIGN(size);
110931 + data = kmem_cache_alloc(cp, gfp_mask);
110932 + if (!data)
110933 + goto nodata;
110934 +
110935 + memset(skb, 0, offsetof(struct sk_buff, truesize));
110936 + skb->truesize = size + sizeof(struct sk_buff);
110937 + atomic_set(&skb->users, 1);
110938 + skb->head = data;
110939 + skb->data = data;
110940 + skb->tail = data;
110941 + skb->end = data + size;
110942 +
110943 + atomic_set(&(skb_shinfo(skb)->dataref), 1);
110944 + skb_shinfo(skb)->nr_frags = 0;
110945 + skb_shinfo(skb)->gso_size = 0;
110946 + skb_shinfo(skb)->gso_segs = 0;
110947 + skb_shinfo(skb)->gso_type = 0;
110948 + skb_shinfo(skb)->frag_list = NULL;
110949 +out:
110950 + return skb;
110951 +nodata:
110952 + kmem_cache_free(skbuff_head_cache, skb);
110953 + skb = NULL;
110954 + goto out;
110955 +}
110956 +
110957 +
110958 +static void skb_drop_list(struct sk_buff **listp)
110959 +{
110960 + struct sk_buff *list = *listp;
110961 +
110962 + *listp = NULL;
110963 +
110964 + do {
110965 + struct sk_buff *this = list;
110966 + list = list->next;
110967 + kfree_skb(this);
110968 + } while (list);
110969 +}
110970 +
110971 +static inline void skb_drop_fraglist(struct sk_buff *skb)
110972 +{
110973 + skb_drop_list(&skb_shinfo(skb)->frag_list);
110974 +}
110975 +
110976 +static void skb_clone_fraglist(struct sk_buff *skb)
110977 +{
110978 + struct sk_buff *list;
110979 +
110980 + for (list = skb_shinfo(skb)->frag_list; list; list = list->next)
110981 + skb_get(list);
110982 +}
110983 +
110984 +void skb_release_data(struct sk_buff *skb)
110985 +{
110986 + if (!skb->cloned ||
110987 + !atomic_sub_return(skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1,
110988 + &skb_shinfo(skb)->dataref)) {
110989 + if (skb_shinfo(skb)->nr_frags) {
110990 + int i;
110991 + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
110992 + put_page(skb_shinfo(skb)->frags[i].page);
110993 + }
110994 +
110995 + if (skb_shinfo(skb)->frag_list)
110996 + skb_drop_fraglist(skb);
110997 +
110998 + kfree(skb->head);
110999 + }
111000 +}
111001 +
111002 +/*
111003 + * Free an skbuff by memory without cleaning the state.
111004 + */
111005 +void kfree_skbmem(struct sk_buff *skb)
111006 +{
111007 + struct sk_buff *other;
111008 + atomic_t *fclone_ref;
111009 +
111010 + skb_release_data(skb);
111011 + switch (skb->fclone) {
111012 + case SKB_FCLONE_UNAVAILABLE:
111013 + kmem_cache_free(skbuff_head_cache, skb);
111014 + break;
111015 +
111016 + case SKB_FCLONE_ORIG:
111017 + fclone_ref = (atomic_t *) (skb + 2);
111018 + if (atomic_dec_and_test(fclone_ref))
111019 + kmem_cache_free(skbuff_fclone_cache, skb);
111020 + break;
111021 +
111022 + case SKB_FCLONE_CLONE:
111023 + fclone_ref = (atomic_t *) (skb + 1);
111024 + other = skb - 1;
111025 +
111026 + /* The clone portion is available for
111027 + * fast-cloning again.
111028 + */
111029 + skb->fclone = SKB_FCLONE_UNAVAILABLE;
111030 +
111031 + if (atomic_dec_and_test(fclone_ref))
111032 + kmem_cache_free(skbuff_fclone_cache, other);
111033 + break;
111034 + };
111035 +}
111036 +
111037 +/**
111038 + * __kfree_skb - private function
111039 + * @skb: buffer
111040 + *
111041 + * Free an sk_buff. Release anything attached to the buffer.
111042 + * Clean the state. This is an internal helper function. Users should
111043 + * always call kfree_skb
111044 + */
111045 +
111046 +void __kfree_skb(struct sk_buff *skb)
111047 +{
111048 + dst_release(skb->dst);
111049 +#ifdef CONFIG_XFRM
111050 + secpath_put(skb->sp);
111051 +#endif
111052 + if (skb->destructor) {
111053 + WARN_ON(in_irq());
111054 + skb->destructor(skb);
111055 + }
111056 +#ifdef CONFIG_NETFILTER
111057 + nf_conntrack_put(skb->nfct);
111058 +#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
111059 + nf_conntrack_put_reasm(skb->nfct_reasm);
111060 +#endif
111061 +#ifdef CONFIG_BRIDGE_NETFILTER
111062 + nf_bridge_put(skb->nf_bridge);
111063 +#endif
111064 +#endif
111065 +/* XXX: IS this still necessary? - JHS */
111066 +#ifdef CONFIG_NET_SCHED
111067 + skb->tc_index = 0;
111068 +#ifdef CONFIG_NET_CLS_ACT
111069 + skb->tc_verd = 0;
111070 +#endif
111071 +#endif
111072 +
111073 + kfree_skbmem(skb);
111074 +}
111075 +
111076 +/**
111077 + * skb_clone - duplicate an sk_buff
111078 + * @skb: buffer to clone
111079 + * @gfp_mask: allocation priority
111080 + *
111081 + * Duplicate an &sk_buff. The new one is not owned by a socket. Both
111082 + * copies share the same packet data but not structure. The new
111083 + * buffer has a reference count of 1. If the allocation fails the
111084 + * function returns %NULL otherwise the new buffer is returned.
111085 + *
111086 + * If this function is called from an interrupt gfp_mask() must be
111087 + * %GFP_ATOMIC.
111088 + */
111089 +
111090 +struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
111091 +{
111092 + struct sk_buff *n;
111093 +
111094 + n = skb + 1;
111095 + if (skb->fclone == SKB_FCLONE_ORIG &&
111096 + n->fclone == SKB_FCLONE_UNAVAILABLE) {
111097 + atomic_t *fclone_ref = (atomic_t *) (n + 1);
111098 + n->fclone = SKB_FCLONE_CLONE;
111099 + atomic_inc(fclone_ref);
111100 + } else {
111101 + n = kmem_cache_alloc(skbuff_head_cache, gfp_mask);
111102 + if (!n)
111103 + return NULL;
111104 + n->fclone = SKB_FCLONE_UNAVAILABLE;
111105 + }
111106 +
111107 +#define C(x) n->x = skb->x
111108 +
111109 + n->next = n->prev = NULL;
111110 + n->sk = NULL;
111111 + C(tstamp);
111112 + C(dev);
111113 + C(h);
111114 + C(nh);
111115 + C(mac);
111116 + C(dst);
111117 + dst_clone(skb->dst);
111118 + C(sp);
111119 +#ifdef CONFIG_INET
111120 + secpath_get(skb->sp);
111121 +#endif
111122 + memcpy(n->cb, skb->cb, sizeof(skb->cb));
111123 + C(len);
111124 + C(data_len);
111125 + C(csum);
111126 + C(local_df);
111127 + n->cloned = 1;
111128 + n->nohdr = 0;
111129 + C(pkt_type);
111130 + C(ip_summed);
111131 + C(priority);
111132 +#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
111133 + C(ipvs_property);
111134 +#endif
111135 + C(protocol);
111136 + n->destructor = NULL;
111137 +#ifdef CONFIG_NETFILTER
111138 + C(nfmark);
111139 + C(nfct);
111140 + nf_conntrack_get(skb->nfct);
111141 + C(nfctinfo);
111142 +#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
111143 + C(nfct_reasm);
111144 + nf_conntrack_get_reasm(skb->nfct_reasm);
111145 +#endif
111146 +#ifdef CONFIG_BRIDGE_NETFILTER
111147 + C(nf_bridge);
111148 + nf_bridge_get(skb->nf_bridge);
111149 +#endif
111150 +#endif /*CONFIG_NETFILTER*/
111151 +#ifdef CONFIG_NET_SCHED
111152 + C(tc_index);
111153 +#ifdef CONFIG_NET_CLS_ACT
111154 + n->tc_verd = SET_TC_VERD(skb->tc_verd,0);
111155 + n->tc_verd = CLR_TC_OK2MUNGE(n->tc_verd);
111156 + n->tc_verd = CLR_TC_MUNGED(n->tc_verd);
111157 + C(input_dev);
111158 +#endif
111159 +
111160 +#endif
111161 + C(truesize);
111162 + atomic_set(&n->users, 1);
111163 + C(head);
111164 + C(data);
111165 + C(tail);
111166 + C(end);
111167 +
111168 + atomic_inc(&(skb_shinfo(skb)->dataref));
111169 + skb->cloned = 1;
111170 +
111171 + return n;
111172 +}
111173 +
111174 +static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
111175 +{
111176 + /*
111177 + * Shift between the two data areas in bytes
111178 + */
111179 + unsigned long offset = new->data - old->data;
111180 +
111181 + new->sk = NULL;
111182 + new->dev = old->dev;
111183 + new->priority = old->priority;
111184 + new->protocol = old->protocol;
111185 + new->dst = dst_clone(old->dst);
111186 +#ifdef CONFIG_INET
111187 + new->sp = secpath_get(old->sp);
111188 +#endif
111189 + new->h.raw = old->h.raw + offset;
111190 + new->nh.raw = old->nh.raw + offset;
111191 + new->mac.raw = old->mac.raw + offset;
111192 + memcpy(new->cb, old->cb, sizeof(old->cb));
111193 + new->local_df = old->local_df;
111194 + new->fclone = SKB_FCLONE_UNAVAILABLE;
111195 + new->pkt_type = old->pkt_type;
111196 + new->tstamp = old->tstamp;
111197 + new->destructor = NULL;
111198 +#ifdef CONFIG_NETFILTER
111199 + new->nfmark = old->nfmark;
111200 + new->nfct = old->nfct;
111201 + nf_conntrack_get(old->nfct);
111202 + new->nfctinfo = old->nfctinfo;
111203 +#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
111204 + new->nfct_reasm = old->nfct_reasm;
111205 + nf_conntrack_get_reasm(old->nfct_reasm);
111206 +#endif
111207 +#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
111208 + new->ipvs_property = old->ipvs_property;
111209 +#endif
111210 +#ifdef CONFIG_BRIDGE_NETFILTER
111211 + new->nf_bridge = old->nf_bridge;
111212 + nf_bridge_get(old->nf_bridge);
111213 +#endif
111214 +#endif
111215 +#ifdef CONFIG_NET_SCHED
111216 +#ifdef CONFIG_NET_CLS_ACT
111217 + new->tc_verd = old->tc_verd;
111218 +#endif
111219 + new->tc_index = old->tc_index;
111220 +#endif
111221 + atomic_set(&new->users, 1);
111222 + skb_shinfo(new)->gso_size = skb_shinfo(old)->gso_size;
111223 + skb_shinfo(new)->gso_segs = skb_shinfo(old)->gso_segs;
111224 + skb_shinfo(new)->gso_type = skb_shinfo(old)->gso_type;
111225 +}
111226 +
111227 +/**
111228 + * skb_copy - create private copy of an sk_buff
111229 + * @skb: buffer to copy
111230 + * @gfp_mask: allocation priority
111231 + *
111232 + * Make a copy of both an &sk_buff and its data. This is used when the
111233 + * caller wishes to modify the data and needs a private copy of the
111234 + * data to alter. Returns %NULL on failure or the pointer to the buffer
111235 + * on success. The returned buffer has a reference count of 1.
111236 + *
111237 + * As by-product this function converts non-linear &sk_buff to linear
111238 + * one, so that &sk_buff becomes completely private and caller is allowed
111239 + * to modify all the data of returned buffer. This means that this
111240 + * function is not recommended for use in circumstances when only
111241 + * header is going to be modified. Use pskb_copy() instead.
111242 + */
111243 +
111244 +struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask)
111245 +{
111246 + int headerlen = skb->data - skb->head;
111247 + /*
111248 + * Allocate the copy buffer
111249 + */
111250 + struct sk_buff *n = alloc_skb(skb->end - skb->head + skb->data_len,
111251 + gfp_mask);
111252 + if (!n)
111253 + return NULL;
111254 +
111255 + /* Set the data pointer */
111256 + skb_reserve(n, headerlen);
111257 + /* Set the tail pointer and length */
111258 + skb_put(n, skb->len);
111259 + n->csum = skb->csum;
111260 + n->ip_summed = skb->ip_summed;
111261 +
111262 + if (skb_copy_bits(skb, -headerlen, n->head, headerlen + skb->len))
111263 + BUG();
111264 +
111265 + copy_skb_header(n, skb);
111266 + return n;
111267 +}
111268 +
111269 +
111270 +/**
111271 + * pskb_copy - create copy of an sk_buff with private head.
111272 + * @skb: buffer to copy
111273 + * @gfp_mask: allocation priority
111274 + *
111275 + * Make a copy of both an &sk_buff and part of its data, located
111276 + * in header. Fragmented data remain shared. This is used when
111277 + * the caller wishes to modify only header of &sk_buff and needs
111278 + * private copy of the header to alter. Returns %NULL on failure
111279 + * or the pointer to the buffer on success.
111280 + * The returned buffer has a reference count of 1.
111281 + */
111282 +
111283 +struct sk_buff *pskb_copy(struct sk_buff *skb, gfp_t gfp_mask)
111284 +{
111285 + /*
111286 + * Allocate the copy buffer
111287 + */
111288 + struct sk_buff *n = alloc_skb(skb->end - skb->head, gfp_mask);
111289 +
111290 + if (!n)
111291 + goto out;
111292 +
111293 + /* Set the data pointer */
111294 + skb_reserve(n, skb->data - skb->head);
111295 + /* Set the tail pointer and length */
111296 + skb_put(n, skb_headlen(skb));
111297 + /* Copy the bytes */
111298 + memcpy(n->data, skb->data, n->len);
111299 + n->csum = skb->csum;
111300 + n->ip_summed = skb->ip_summed;
111301 +
111302 + n->truesize += skb->data_len;
111303 + n->data_len = skb->data_len;
111304 + n->len = skb->len;
111305 +
111306 + if (skb_shinfo(skb)->nr_frags) {
111307 + int i;
111308 +
111309 + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
111310 + skb_shinfo(n)->frags[i] = skb_shinfo(skb)->frags[i];
111311 + get_page(skb_shinfo(n)->frags[i].page);
111312 + }
111313 + skb_shinfo(n)->nr_frags = i;
111314 + }
111315 +
111316 + if (skb_shinfo(skb)->frag_list) {
111317 + skb_shinfo(n)->frag_list = skb_shinfo(skb)->frag_list;
111318 + skb_clone_fraglist(n);
111319 + }
111320 +
111321 + copy_skb_header(n, skb);
111322 +out:
111323 + return n;
111324 +}
111325 +
111326 +/**
111327 + * pskb_expand_head - reallocate header of &sk_buff
111328 + * @skb: buffer to reallocate
111329 + * @nhead: room to add at head
111330 + * @ntail: room to add at tail
111331 + * @gfp_mask: allocation priority
111332 + *
111333 + * Expands (or creates identical copy, if &nhead and &ntail are zero)
111334 + * header of skb. &sk_buff itself is not changed. &sk_buff MUST have
111335 + * reference count of 1. Returns zero in the case of success or error,
111336 + * if expansion failed. In the last case, &sk_buff is not changed.
111337 + *
111338 + * All the pointers pointing into skb header may change and must be
111339 + * reloaded after call to this function.
111340 + */
111341 +
111342 +int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
111343 + gfp_t gfp_mask)
111344 +{
111345 + int i;
111346 + u8 *data;
111347 + int size = nhead + (skb->end - skb->head) + ntail;
111348 + long off;
111349 +
111350 + if (skb_shared(skb))
111351 + BUG();
111352 +
111353 + size = SKB_DATA_ALIGN(size);
111354 +
111355 + data = kmalloc(size + sizeof(struct skb_shared_info), gfp_mask);
111356 + if (!data)
111357 + goto nodata;
111358 +
111359 + /* Copy only real data... and, alas, header. This should be
111360 + * optimized for the cases when header is void. */
111361 + memcpy(data + nhead, skb->head, skb->tail - skb->head);
111362 + memcpy(data + size, skb->end, sizeof(struct skb_shared_info));
111363 +
111364 + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
111365 + get_page(skb_shinfo(skb)->frags[i].page);
111366 +
111367 + if (skb_shinfo(skb)->frag_list)
111368 + skb_clone_fraglist(skb);
111369 +
111370 + skb_release_data(skb);
111371 +
111372 + off = (data + nhead) - skb->head;
111373 +
111374 + skb->head = data;
111375 + skb->end = data + size;
111376 + skb->data += off;
111377 + skb->tail += off;
111378 + skb->mac.raw += off;
111379 + skb->h.raw += off;
111380 + skb->nh.raw += off;
111381 + skb->cloned = 0;
111382 + skb->nohdr = 0;
111383 + atomic_set(&skb_shinfo(skb)->dataref, 1);
111384 + return 0;
111385 +
111386 +nodata:
111387 + return -ENOMEM;
111388 +}
111389 +
111390 +/* Make private copy of skb with writable head and some headroom */
111391 +
111392 +struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom)
111393 +{
111394 + struct sk_buff *skb2;
111395 + int delta = headroom - skb_headroom(skb);
111396 +
111397 + if (delta <= 0)
111398 + skb2 = pskb_copy(skb, GFP_ATOMIC);
111399 + else {
111400 + skb2 = skb_clone(skb, GFP_ATOMIC);
111401 + if (skb2 && pskb_expand_head(skb2, SKB_DATA_ALIGN(delta), 0,
111402 + GFP_ATOMIC)) {
111403 + kfree_skb(skb2);
111404 + skb2 = NULL;
111405 + }
111406 + }
111407 + return skb2;
111408 +}
111409 +
111410 +
111411 +/**
111412 + * skb_copy_expand - copy and expand sk_buff
111413 + * @skb: buffer to copy
111414 + * @newheadroom: new free bytes at head
111415 + * @newtailroom: new free bytes at tail
111416 + * @gfp_mask: allocation priority
111417 + *
111418 + * Make a copy of both an &sk_buff and its data and while doing so
111419 + * allocate additional space.
111420 + *
111421 + * This is used when the caller wishes to modify the data and needs a
111422 + * private copy of the data to alter as well as more space for new fields.
111423 + * Returns %NULL on failure or the pointer to the buffer
111424 + * on success. The returned buffer has a reference count of 1.
111425 + *
111426 + * You must pass %GFP_ATOMIC as the allocation priority if this function
111427 + * is called from an interrupt.
111428 + *
111429 + * BUG ALERT: ip_summed is not copied. Why does this work? Is it used
111430 + * only by netfilter in the cases when checksum is recalculated? --ANK
111431 + */
111432 +struct sk_buff *skb_copy_expand(const struct sk_buff *skb,
111433 + int newheadroom, int newtailroom,
111434 + gfp_t gfp_mask)
111435 +{
111436 + /*
111437 + * Allocate the copy buffer
111438 + */
111439 + struct sk_buff *n = alloc_skb(newheadroom + skb->len + newtailroom,
111440 + gfp_mask);
111441 + int head_copy_len, head_copy_off;
111442 +
111443 + if (!n)
111444 + return NULL;
111445 +
111446 + skb_reserve(n, newheadroom);
111447 +
111448 + /* Set the tail pointer and length */
111449 + skb_put(n, skb->len);
111450 +
111451 + head_copy_len = skb_headroom(skb);
111452 + head_copy_off = 0;
111453 + if (newheadroom <= head_copy_len)
111454 + head_copy_len = newheadroom;
111455 + else
111456 + head_copy_off = newheadroom - head_copy_len;
111457 +
111458 + /* Copy the linear header and data. */
111459 + if (skb_copy_bits(skb, -head_copy_len, n->head + head_copy_off,
111460 + skb->len + head_copy_len))
111461 + BUG();
111462 +
111463 + copy_skb_header(n, skb);
111464 +
111465 + return n;
111466 +}
111467 +
111468 +/**
111469 + * skb_pad - zero pad the tail of an skb
111470 + * @skb: buffer to pad
111471 + * @pad: space to pad
111472 + *
111473 + * Ensure that a buffer is followed by a padding area that is zero
111474 + * filled. Used by network drivers which may DMA or transfer data
111475 + * beyond the buffer end onto the wire.
111476 + *
111477 + * May return NULL in out of memory cases.
111478 + */
111479 +
111480 +struct sk_buff *skb_pad(struct sk_buff *skb, int pad)
111481 +{
111482 + struct sk_buff *nskb;
111483 +
111484 + /* If the skbuff is non linear tailroom is always zero.. */
111485 + if (skb_tailroom(skb) >= pad) {
111486 + memset(skb->data+skb->len, 0, pad);
111487 + return skb;
111488 + }
111489 +
111490 + nskb = skb_copy_expand(skb, skb_headroom(skb), skb_tailroom(skb) + pad, GFP_ATOMIC);
111491 + kfree_skb(skb);
111492 + if (nskb)
111493 + memset(nskb->data+nskb->len, 0, pad);
111494 + return nskb;
111495 +}
111496 +
111497 +/* Trims skb to length len. It can change skb pointers.
111498 + */
111499 +
111500 +int ___pskb_trim(struct sk_buff *skb, unsigned int len)
111501 +{
111502 + struct sk_buff **fragp;
111503 + struct sk_buff *frag;
111504 + int offset = skb_headlen(skb);
111505 + int nfrags = skb_shinfo(skb)->nr_frags;
111506 + int i;
111507 + int err;
111508 +
111509 + if (skb_cloned(skb) &&
111510 + unlikely((err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC))))
111511 + return err;
111512 +
111513 + i = 0;
111514 + if (offset >= len)
111515 + goto drop_pages;
111516 +
111517 + for (; i < nfrags; i++) {
111518 + int end = offset + skb_shinfo(skb)->frags[i].size;
111519 +
111520 + if (end < len) {
111521 + offset = end;
111522 + continue;
111523 + }
111524 +
111525 + skb_shinfo(skb)->frags[i++].size = len - offset;
111526 +
111527 +drop_pages:
111528 + skb_shinfo(skb)->nr_frags = i;
111529 +
111530 + for (; i < nfrags; i++)
111531 + put_page(skb_shinfo(skb)->frags[i].page);
111532 +
111533 + if (skb_shinfo(skb)->frag_list)
111534 + skb_drop_fraglist(skb);
111535 + goto done;
111536 + }
111537 +
111538 + for (fragp = &skb_shinfo(skb)->frag_list; (frag = *fragp);
111539 + fragp = &frag->next) {
111540 + int end = offset + frag->len;
111541 +
111542 + if (skb_shared(frag)) {
111543 + struct sk_buff *nfrag;
111544 +
111545 + nfrag = skb_clone(frag, GFP_ATOMIC);
111546 + if (unlikely(!nfrag))
111547 + return -ENOMEM;
111548 +
111549 + nfrag->next = frag->next;
111550 + kfree_skb(frag);
111551 + frag = nfrag;
111552 + *fragp = frag;
111553 + }
111554 +
111555 + if (end < len) {
111556 + offset = end;
111557 + continue;
111558 + }
111559 +
111560 + if (end > len &&
111561 + unlikely((err = pskb_trim(frag, len - offset))))
111562 + return err;
111563 +
111564 + if (frag->next)
111565 + skb_drop_list(&frag->next);
111566 + break;
111567 + }
111568 +
111569 +done:
111570 + if (len > skb_headlen(skb)) {
111571 + skb->data_len -= skb->len - len;
111572 + skb->len = len;
111573 + } else {
111574 + skb->len = len;
111575 + skb->data_len = 0;
111576 + skb->tail = skb->data + len;
111577 + }
111578 +
111579 + return 0;
111580 +}
111581 +
111582 +/**
111583 + * __pskb_pull_tail - advance tail of skb header
111584 + * @skb: buffer to reallocate
111585 + * @delta: number of bytes to advance tail
111586 + *
111587 + * The function makes a sense only on a fragmented &sk_buff,
111588 + * it expands header moving its tail forward and copying necessary
111589 + * data from fragmented part.
111590 + *
111591 + * &sk_buff MUST have reference count of 1.
111592 + *
111593 + * Returns %NULL (and &sk_buff does not change) if pull failed
111594 + * or value of new tail of skb in the case of success.
111595 + *
111596 + * All the pointers pointing into skb header may change and must be
111597 + * reloaded after call to this function.
111598 + */
111599 +
111600 +/* Moves tail of skb head forward, copying data from fragmented part,
111601 + * when it is necessary.
111602 + * 1. It may fail due to malloc failure.
111603 + * 2. It may change skb pointers.
111604 + *
111605 + * It is pretty complicated. Luckily, it is called only in exceptional cases.
111606 + */
111607 +unsigned char *__pskb_pull_tail(struct sk_buff *skb, int delta)
111608 +{
111609 + /* If skb has not enough free space at tail, get new one
111610 + * plus 128 bytes for future expansions. If we have enough
111611 + * room at tail, reallocate without expansion only if skb is cloned.
111612 + */
111613 + int i, k, eat = (skb->tail + delta) - skb->end;
111614 +
111615 + if (eat > 0 || skb_cloned(skb)) {
111616 + if (pskb_expand_head(skb, 0, eat > 0 ? eat + 128 : 0,
111617 + GFP_ATOMIC))
111618 + return NULL;
111619 + }
111620 +
111621 + if (skb_copy_bits(skb, skb_headlen(skb), skb->tail, delta))
111622 + BUG();
111623 +
111624 + /* Optimization: no fragments, no reasons to preestimate
111625 + * size of pulled pages. Superb.
111626 + */
111627 + if (!skb_shinfo(skb)->frag_list)
111628 + goto pull_pages;
111629 +
111630 + /* Estimate size of pulled pages. */
111631 + eat = delta;
111632 + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
111633 + if (skb_shinfo(skb)->frags[i].size >= eat)
111634 + goto pull_pages;
111635 + eat -= skb_shinfo(skb)->frags[i].size;
111636 + }
111637 +
111638 + /* If we need update frag list, we are in troubles.
111639 + * Certainly, it possible to add an offset to skb data,
111640 + * but taking into account that pulling is expected to
111641 + * be very rare operation, it is worth to fight against
111642 + * further bloating skb head and crucify ourselves here instead.
111643 + * Pure masohism, indeed. 8)8)
111644 + */
111645 + if (eat) {
111646 + struct sk_buff *list = skb_shinfo(skb)->frag_list;
111647 + struct sk_buff *clone = NULL;
111648 + struct sk_buff *insp = NULL;
111649 +
111650 + do {
111651 + BUG_ON(!list);
111652 +
111653 + if (list->len <= eat) {
111654 + /* Eaten as whole. */
111655 + eat -= list->len;
111656 + list = list->next;
111657 + insp = list;
111658 + } else {
111659 + /* Eaten partially. */
111660 +
111661 + if (skb_shared(list)) {
111662 + /* Sucks! We need to fork list. :-( */
111663 + clone = skb_clone(list, GFP_ATOMIC);
111664 + if (!clone)
111665 + return NULL;
111666 + insp = list->next;
111667 + list = clone;
111668 + } else {
111669 + /* This may be pulled without
111670 + * problems. */
111671 + insp = list;
111672 + }
111673 + if (!pskb_pull(list, eat)) {
111674 + if (clone)
111675 + kfree_skb(clone);
111676 + return NULL;
111677 + }
111678 + break;
111679 + }
111680 + } while (eat);
111681 +
111682 + /* Free pulled out fragments. */
111683 + while ((list = skb_shinfo(skb)->frag_list) != insp) {
111684 + skb_shinfo(skb)->frag_list = list->next;
111685 + kfree_skb(list);
111686 + }
111687 + /* And insert new clone at head. */
111688 + if (clone) {
111689 + clone->next = list;
111690 + skb_shinfo(skb)->frag_list = clone;
111691 + }
111692 + }
111693 + /* Success! Now we may commit changes to skb data. */
111694 +
111695 +pull_pages:
111696 + eat = delta;
111697 + k = 0;
111698 + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
111699 + if (skb_shinfo(skb)->frags[i].size <= eat) {
111700 + put_page(skb_shinfo(skb)->frags[i].page);
111701 + eat -= skb_shinfo(skb)->frags[i].size;
111702 + } else {
111703 + skb_shinfo(skb)->frags[k] = skb_shinfo(skb)->frags[i];
111704 + if (eat) {
111705 + skb_shinfo(skb)->frags[k].page_offset += eat;
111706 + skb_shinfo(skb)->frags[k].size -= eat;
111707 + eat = 0;
111708 + }
111709 + k++;
111710 + }
111711 + }
111712 + skb_shinfo(skb)->nr_frags = k;
111713 +
111714 + skb->tail += delta;
111715 + skb->data_len -= delta;
111716 +
111717 + return skb->tail;
111718 +}
111719 +
111720 +/* Copy some data bits from skb to kernel buffer. */
111721 +
111722 +int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len)
111723 +{
111724 + int i, copy;
111725 + int start = skb_headlen(skb);
111726 +
111727 + if (offset > (int)skb->len - len)
111728 + goto fault;
111729 +
111730 + /* Copy header. */
111731 + if ((copy = start - offset) > 0) {
111732 + if (copy > len)
111733 + copy = len;
111734 + memcpy(to, skb->data + offset, copy);
111735 + if ((len -= copy) == 0)
111736 + return 0;
111737 + offset += copy;
111738 + to += copy;
111739 + }
111740 +
111741 + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
111742 + int end;
111743 +
111744 + BUG_TRAP(start <= offset + len);
111745 +
111746 + end = start + skb_shinfo(skb)->frags[i].size;
111747 + if ((copy = end - offset) > 0) {
111748 + u8 *vaddr;
111749 +
111750 + if (copy > len)
111751 + copy = len;
111752 +
111753 + vaddr = kmap_skb_frag(&skb_shinfo(skb)->frags[i]);
111754 + memcpy(to,
111755 + vaddr + skb_shinfo(skb)->frags[i].page_offset+
111756 + offset - start, copy);
111757 + kunmap_skb_frag(vaddr);
111758 +
111759 + if ((len -= copy) == 0)
111760 + return 0;
111761 + offset += copy;
111762 + to += copy;
111763 + }
111764 + start = end;
111765 + }
111766 +
111767 + if (skb_shinfo(skb)->frag_list) {
111768 + struct sk_buff *list = skb_shinfo(skb)->frag_list;
111769 +
111770 + for (; list; list = list->next) {
111771 + int end;
111772 +
111773 + BUG_TRAP(start <= offset + len);
111774 +
111775 + end = start + list->len;
111776 + if ((copy = end - offset) > 0) {
111777 + if (copy > len)
111778 + copy = len;
111779 + if (skb_copy_bits(list, offset - start,
111780 + to, copy))
111781 + goto fault;
111782 + if ((len -= copy) == 0)
111783 + return 0;
111784 + offset += copy;
111785 + to += copy;
111786 + }
111787 + start = end;
111788 + }
111789 + }
111790 + if (!len)
111791 + return 0;
111792 +
111793 +fault:
111794 + return -EFAULT;
111795 +}
111796 +
111797 +/**
111798 + * skb_store_bits - store bits from kernel buffer to skb
111799 + * @skb: destination buffer
111800 + * @offset: offset in destination
111801 + * @from: source buffer
111802 + * @len: number of bytes to copy
111803 + *
111804 + * Copy the specified number of bytes from the source buffer to the
111805 + * destination skb. This function handles all the messy bits of
111806 + * traversing fragment lists and such.
111807 + */
111808 +
111809 +int skb_store_bits(const struct sk_buff *skb, int offset, void *from, int len)
111810 +{
111811 + int i, copy;
111812 + int start = skb_headlen(skb);
111813 +
111814 + if (offset > (int)skb->len - len)
111815 + goto fault;
111816 +
111817 + if ((copy = start - offset) > 0) {
111818 + if (copy > len)
111819 + copy = len;
111820 + memcpy(skb->data + offset, from, copy);
111821 + if ((len -= copy) == 0)
111822 + return 0;
111823 + offset += copy;
111824 + from += copy;
111825 + }
111826 +
111827 + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
111828 + skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
111829 + int end;
111830 +
111831 + BUG_TRAP(start <= offset + len);
111832 +
111833 + end = start + frag->size;
111834 + if ((copy = end - offset) > 0) {
111835 + u8 *vaddr;
111836 +
111837 + if (copy > len)
111838 + copy = len;
111839 +
111840 + vaddr = kmap_skb_frag(frag);
111841 + memcpy(vaddr + frag->page_offset + offset - start,
111842 + from, copy);
111843 + kunmap_skb_frag(vaddr);
111844 +
111845 + if ((len -= copy) == 0)
111846 + return 0;
111847 + offset += copy;
111848 + from += copy;
111849 + }
111850 + start = end;
111851 + }
111852 +
111853 + if (skb_shinfo(skb)->frag_list) {
111854 + struct sk_buff *list = skb_shinfo(skb)->frag_list;
111855 +
111856 + for (; list; list = list->next) {
111857 + int end;
111858 +
111859 + BUG_TRAP(start <= offset + len);
111860 +
111861 + end = start + list->len;
111862 + if ((copy = end - offset) > 0) {
111863 + if (copy > len)
111864 + copy = len;
111865 + if (skb_store_bits(list, offset - start,
111866 + from, copy))
111867 + goto fault;
111868 + if ((len -= copy) == 0)
111869 + return 0;
111870 + offset += copy;
111871 + from += copy;
111872 + }
111873 + start = end;
111874 + }
111875 + }
111876 + if (!len)
111877 + return 0;
111878 +
111879 +fault:
111880 + return -EFAULT;
111881 +}
111882 +
111883 +EXPORT_SYMBOL(skb_store_bits);
111884 +
111885 +/* Checksum skb data. */
111886 +
111887 +unsigned int skb_checksum(const struct sk_buff *skb, int offset,
111888 + int len, unsigned int csum)
111889 +{
111890 + int start = skb_headlen(skb);
111891 + int i, copy = start - offset;
111892 + int pos = 0;
111893 +
111894 + /* Checksum header. */
111895 + if (copy > 0) {
111896 + if (copy > len)
111897 + copy = len;
111898 + csum = csum_partial(skb->data + offset, copy, csum);
111899 + if ((len -= copy) == 0)
111900 + return csum;
111901 + offset += copy;
111902 + pos = copy;
111903 + }
111904 +
111905 + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
111906 + int end;
111907 +
111908 + BUG_TRAP(start <= offset + len);
111909 +
111910 + end = start + skb_shinfo(skb)->frags[i].size;
111911 + if ((copy = end - offset) > 0) {
111912 + unsigned int csum2;
111913 + u8 *vaddr;
111914 + skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
111915 +
111916 + if (copy > len)
111917 + copy = len;
111918 + vaddr = kmap_skb_frag(frag);
111919 + csum2 = csum_partial(vaddr + frag->page_offset +
111920 + offset - start, copy, 0);
111921 + kunmap_skb_frag(vaddr);
111922 + csum = csum_block_add(csum, csum2, pos);
111923 + if (!(len -= copy))
111924 + return csum;
111925 + offset += copy;
111926 + pos += copy;
111927 + }
111928 + start = end;
111929 + }
111930 +
111931 + if (skb_shinfo(skb)->frag_list) {
111932 + struct sk_buff *list = skb_shinfo(skb)->frag_list;
111933 +
111934 + for (; list; list = list->next) {
111935 + int end;
111936 +
111937 + BUG_TRAP(start <= offset + len);
111938 +
111939 + end = start + list->len;
111940 + if ((copy = end - offset) > 0) {
111941 + unsigned int csum2;
111942 + if (copy > len)
111943 + copy = len;
111944 + csum2 = skb_checksum(list, offset - start,
111945 + copy, 0);
111946 + csum = csum_block_add(csum, csum2, pos);
111947 + if ((len -= copy) == 0)
111948 + return csum;
111949 + offset += copy;
111950 + pos += copy;
111951 + }
111952 + start = end;
111953 + }
111954 + }
111955 + BUG_ON(len);
111956 +
111957 + return csum;
111958 +}
111959 +
111960 +/* Both of above in one bottle. */
111961 +
111962 +unsigned int skb_copy_and_csum_bits(const struct sk_buff *skb, int offset,
111963 + u8 *to, int len, unsigned int csum)
111964 +{
111965 + int start = skb_headlen(skb);
111966 + int i, copy = start - offset;
111967 + int pos = 0;
111968 +
111969 + /* Copy header. */
111970 + if (copy > 0) {
111971 + if (copy > len)
111972 + copy = len;
111973 + csum = csum_partial_copy_nocheck(skb->data + offset, to,
111974 + copy, csum);
111975 + if ((len -= copy) == 0)
111976 + return csum;
111977 + offset += copy;
111978 + to += copy;
111979 + pos = copy;
111980 + }
111981 +
111982 + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
111983 + int end;
111984 +
111985 + BUG_TRAP(start <= offset + len);
111986 +
111987 + end = start + skb_shinfo(skb)->frags[i].size;
111988 + if ((copy = end - offset) > 0) {
111989 + unsigned int csum2;
111990 + u8 *vaddr;
111991 + skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
111992 +
111993 + if (copy > len)
111994 + copy = len;
111995 + vaddr = kmap_skb_frag(frag);
111996 + csum2 = csum_partial_copy_nocheck(vaddr +
111997 + frag->page_offset +
111998 + offset - start, to,
111999 + copy, 0);
112000 + kunmap_skb_frag(vaddr);
112001 + csum = csum_block_add(csum, csum2, pos);
112002 + if (!(len -= copy))
112003 + return csum;
112004 + offset += copy;
112005 + to += copy;
112006 + pos += copy;
112007 + }
112008 + start = end;
112009 + }
112010 +
112011 + if (skb_shinfo(skb)->frag_list) {
112012 + struct sk_buff *list = skb_shinfo(skb)->frag_list;
112013 +
112014 + for (; list; list = list->next) {
112015 + unsigned int csum2;
112016 + int end;
112017 +
112018 + BUG_TRAP(start <= offset + len);
112019 +
112020 + end = start + list->len;
112021 + if ((copy = end - offset) > 0) {
112022 + if (copy > len)
112023 + copy = len;
112024 + csum2 = skb_copy_and_csum_bits(list,
112025 + offset - start,
112026 + to, copy, 0);
112027 + csum = csum_block_add(csum, csum2, pos);
112028 + if ((len -= copy) == 0)
112029 + return csum;
112030 + offset += copy;
112031 + to += copy;
112032 + pos += copy;
112033 + }
112034 + start = end;
112035 + }
112036 + }
112037 + BUG_ON(len);
112038 + return csum;
112039 +}
112040 +
112041 +void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to)
112042 +{
112043 + unsigned int csum;
112044 + long csstart;
112045 +
112046 + if (skb->ip_summed == CHECKSUM_HW)
112047 + csstart = skb->h.raw - skb->data;
112048 + else
112049 + csstart = skb_headlen(skb);
112050 +
112051 + BUG_ON(csstart > skb_headlen(skb));
112052 +
112053 + memcpy(to, skb->data, csstart);
112054 +
112055 + csum = 0;
112056 + if (csstart != skb->len)
112057 + csum = skb_copy_and_csum_bits(skb, csstart, to + csstart,
112058 + skb->len - csstart, 0);
112059 +
112060 + if (skb->ip_summed == CHECKSUM_HW) {
112061 + long csstuff = csstart + skb->csum;
112062 +
112063 + *((unsigned short *)(to + csstuff)) = csum_fold(csum);
112064 + }
112065 +}
112066 +
112067 +/**
112068 + * skb_dequeue - remove from the head of the queue
112069 + * @list: list to dequeue from
112070 + *
112071 + * Remove the head of the list. The list lock is taken so the function
112072 + * may be used safely with other locking list functions. The head item is
112073 + * returned or %NULL if the list is empty.
112074 + */
112075 +
112076 +struct sk_buff *skb_dequeue(struct sk_buff_head *list)
112077 +{
112078 + unsigned long flags;
112079 + struct sk_buff *result;
112080 +
112081 + spin_lock_irqsave(&list->lock, flags);
112082 + result = __skb_dequeue(list);
112083 + spin_unlock_irqrestore(&list->lock, flags);
112084 + return result;
112085 +}
112086 +
112087 +/**
112088 + * skb_dequeue_tail - remove from the tail of the queue
112089 + * @list: list to dequeue from
112090 + *
112091 + * Remove the tail of the list. The list lock is taken so the function
112092 + * may be used safely with other locking list functions. The tail item is
112093 + * returned or %NULL if the list is empty.
112094 + */
112095 +struct sk_buff *skb_dequeue_tail(struct sk_buff_head *list)
112096 +{
112097 + unsigned long flags;
112098 + struct sk_buff *result;
112099 +
112100 + spin_lock_irqsave(&list->lock, flags);
112101 + result = __skb_dequeue_tail(list);
112102 + spin_unlock_irqrestore(&list->lock, flags);
112103 + return result;
112104 +}
112105 +
112106 +/**
112107 + * skb_queue_purge - empty a list
112108 + * @list: list to empty
112109 + *
112110 + * Delete all buffers on an &sk_buff list. Each buffer is removed from
112111 + * the list and one reference dropped. This function takes the list
112112 + * lock and is atomic with respect to other list locking functions.
112113 + */
112114 +void skb_queue_purge(struct sk_buff_head *list)
112115 +{
112116 + struct sk_buff *skb;
112117 + while ((skb = skb_dequeue(list)) != NULL)
112118 + kfree_skb(skb);
112119 +}
112120 +
112121 +/**
112122 + * skb_queue_head - queue a buffer at the list head
112123 + * @list: list to use
112124 + * @newsk: buffer to queue
112125 + *
112126 + * Queue a buffer at the start of the list. This function takes the
112127 + * list lock and can be used safely with other locking &sk_buff
112128 + * functions.
112129 + *
112130 + * A buffer cannot be placed on two lists at the same time.
112131 + */
112132 +void skb_queue_head(struct sk_buff_head *list, struct sk_buff *newsk)
112133 +{
112134 + unsigned long flags;
112135 +
112136 + spin_lock_irqsave(&list->lock, flags);
112137 + __skb_queue_head(list, newsk);
112138 + spin_unlock_irqrestore(&list->lock, flags);
112139 +}
112140 +
112141 +/**
112142 + * skb_queue_tail - queue a buffer at the list tail
112143 + * @list: list to use
112144 + * @newsk: buffer to queue
112145 + *
112146 + * Queue a buffer at the tail of the list. This function takes the
112147 + * list lock and can be used safely with other locking &sk_buff
112148 + * functions.
112149 + *
112150 + * A buffer cannot be placed on two lists at the same time.
112151 + */
112152 +void skb_queue_tail(struct sk_buff_head *list, struct sk_buff *newsk)
112153 +{
112154 + unsigned long flags;
112155 +
112156 + spin_lock_irqsave(&list->lock, flags);
112157 + __skb_queue_tail(list, newsk);
112158 + spin_unlock_irqrestore(&list->lock, flags);
112159 +}
112160 +
112161 +/**
112162 + * skb_unlink - remove a buffer from a list
112163 + * @skb: buffer to remove
112164 + * @list: list to use
112165 + *
112166 + * Remove a packet from a list. The list locks are taken and this
112167 + * function is atomic with respect to other list locked calls.
112168 + *
112169 + * You must know what list the SKB is on.
112170 + */
112171 +void skb_unlink(struct sk_buff *skb, struct sk_buff_head *list)
112172 +{
112173 + unsigned long flags;
112174 +
112175 + spin_lock_irqsave(&list->lock, flags);
112176 + __skb_unlink(skb, list);
112177 + spin_unlock_irqrestore(&list->lock, flags);
112178 +}
112179 +
112180 +/**
112181 + * skb_append - append a buffer
112182 + * @old: buffer to insert after
112183 + * @newsk: buffer to insert
112184 + * @list: list to use
112185 + *
112186 + * Place a packet after a given packet in a list. The list locks are taken
112187 + * and this function is atomic with respect to other list locked calls.
112188 + * A buffer cannot be placed on two lists at the same time.
112189 + */
112190 +void skb_append(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head *list)
112191 +{
112192 + unsigned long flags;
112193 +
112194 + spin_lock_irqsave(&list->lock, flags);
112195 + __skb_append(old, newsk, list);
112196 + spin_unlock_irqrestore(&list->lock, flags);
112197 +}
112198 +
112199 +
112200 +/**
112201 + * skb_insert - insert a buffer
112202 + * @old: buffer to insert before
112203 + * @newsk: buffer to insert
112204 + * @list: list to use
112205 + *
112206 + * Place a packet before a given packet in a list. The list locks are
112207 + * taken and this function is atomic with respect to other list locked
112208 + * calls.
112209 + *
112210 + * A buffer cannot be placed on two lists at the same time.
112211 + */
112212 +void skb_insert(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head *list)
112213 +{
112214 + unsigned long flags;
112215 +
112216 + spin_lock_irqsave(&list->lock, flags);
112217 + __skb_insert(newsk, old->prev, old, list);
112218 + spin_unlock_irqrestore(&list->lock, flags);
112219 +}
112220 +
112221 +#if 0
112222 +/*
112223 + * Tune the memory allocator for a new MTU size.
112224 + */
112225 +void skb_add_mtu(int mtu)
112226 +{
112227 + /* Must match allocation in alloc_skb */
112228 + mtu = SKB_DATA_ALIGN(mtu) + sizeof(struct skb_shared_info);
112229 +
112230 + kmem_add_cache_size(mtu);
112231 +}
112232 +#endif
112233 +
112234 +static inline void skb_split_inside_header(struct sk_buff *skb,
112235 + struct sk_buff* skb1,
112236 + const u32 len, const int pos)
112237 +{
112238 + int i;
112239 +
112240 + memcpy(skb_put(skb1, pos - len), skb->data + len, pos - len);
112241 +
112242 + /* And move data appendix as is. */
112243 + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
112244 + skb_shinfo(skb1)->frags[i] = skb_shinfo(skb)->frags[i];
112245 +
112246 + skb_shinfo(skb1)->nr_frags = skb_shinfo(skb)->nr_frags;
112247 + skb_shinfo(skb)->nr_frags = 0;
112248 + skb1->data_len = skb->data_len;
112249 + skb1->len += skb1->data_len;
112250 + skb->data_len = 0;
112251 + skb->len = len;
112252 + skb->tail = skb->data + len;
112253 +}
112254 +
112255 +static inline void skb_split_no_header(struct sk_buff *skb,
112256 + struct sk_buff* skb1,
112257 + const u32 len, int pos)
112258 +{
112259 + int i, k = 0;
112260 + const int nfrags = skb_shinfo(skb)->nr_frags;
112261 +
112262 + skb_shinfo(skb)->nr_frags = 0;
112263 + skb1->len = skb1->data_len = skb->len - len;
112264 + skb->len = len;
112265 + skb->data_len = len - pos;
112266 +
112267 + for (i = 0; i < nfrags; i++) {
112268 + int size = skb_shinfo(skb)->frags[i].size;
112269 +
112270 + if (pos + size > len) {
112271 + skb_shinfo(skb1)->frags[k] = skb_shinfo(skb)->frags[i];
112272 +
112273 + if (pos < len) {
112274 + /* Split frag.
112275 + * We have two options in this case:
112276 + * 1. Move the whole frag to the second
112277 + * part, if possible. For example, this
112278 + * approach is mandatory for TUX,
112279 + * where splitting is expensive.
112280 + * 2. Split accurately. That is what we do here.
112281 + */
112282 + get_page(skb_shinfo(skb)->frags[i].page);
112283 + skb_shinfo(skb1)->frags[0].page_offset += len - pos;
112284 + skb_shinfo(skb1)->frags[0].size -= len - pos;
112285 + skb_shinfo(skb)->frags[i].size = len - pos;
112286 + skb_shinfo(skb)->nr_frags++;
112287 + }
112288 + k++;
112289 + } else
112290 + skb_shinfo(skb)->nr_frags++;
112291 + pos += size;
112292 + }
112293 + skb_shinfo(skb1)->nr_frags = k;
112294 +}
112295 +
112296 +/**
112297 + * skb_split - Split fragmented skb into two parts at length len.
112298 + * @skb: the buffer to split
112299 + * @skb1: the buffer to receive the second part
112300 + * @len: new length for skb
112301 + */
112302 +void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len)
112303 +{
112304 + int pos = skb_headlen(skb);
112305 +
112306 + if (len < pos) /* Split line is inside header. */
112307 + skb_split_inside_header(skb, skb1, len, pos);
112308 + else /* Second chunk has no header, nothing to copy. */
112309 + skb_split_no_header(skb, skb1, len, pos);
112310 +}
112311 +
112312 +/**
112313 + * skb_prepare_seq_read - Prepare a sequential read of skb data
112314 + * @skb: the buffer to read
112315 + * @from: lower offset of data to be read
112316 + * @to: upper offset of data to be read
112317 + * @st: state variable
112318 + *
112319 + * Initializes the specified state variable. Must be called before
112320 + * invoking skb_seq_read() for the first time.
112321 + */
112322 +void skb_prepare_seq_read(struct sk_buff *skb, unsigned int from,
112323 + unsigned int to, struct skb_seq_state *st)
112324 +{
112325 + st->lower_offset = from;
112326 + st->upper_offset = to;
112327 + st->root_skb = st->cur_skb = skb;
112328 + st->frag_idx = st->stepped_offset = 0;
112329 + st->frag_data = NULL;
112330 +}
112331 +
112332 +/**
112333 + * skb_seq_read - Sequentially read skb data
112334 + * @consumed: number of bytes consumed by the caller so far
112335 + * @data: destination pointer for data to be returned
112336 + * @st: state variable
112337 + *
112338 + * Reads a block of skb data at &consumed relative to the
112339 + * lower offset specified to skb_prepare_seq_read(). Assigns
112340 + * the head of the data block to &data and returns the length
112341 + * of the block or 0 if the end of the skb data or the upper
112342 + * offset has been reached.
112343 + *
112344 + * The caller is not required to consume all of the data
112345 + * returned, i.e. &consumed is typically set to the number
112346 + * of bytes already consumed and the next call to
112347 + * skb_seq_read() will return the remaining part of the block.
112348 + *
112349 + * Note: The size of each block of data returned can be arbitrary;
112350 + * this limitation is the cost of zerocopy sequential
112351 + * reads of potentially non-linear data.
112352 + *
112353 + * Note: Fragment lists within fragments are not implemented
112354 + * at the moment, state->root_skb could be replaced with
112355 + * a stack for this purpose.
112356 + */
112357 +unsigned int skb_seq_read(unsigned int consumed, const u8 **data,
112358 + struct skb_seq_state *st)
112359 +{
112360 + unsigned int block_limit, abs_offset = consumed + st->lower_offset;
112361 + skb_frag_t *frag;
112362 +
112363 + if (unlikely(abs_offset >= st->upper_offset))
112364 + return 0;
112365 +
112366 +next_skb:
112367 + block_limit = skb_headlen(st->cur_skb);
112368 +
112369 + if (abs_offset < block_limit) {
112370 + *data = st->cur_skb->data + abs_offset;
112371 + return block_limit - abs_offset;
112372 + }
112373 +
112374 + if (st->frag_idx == 0 && !st->frag_data)
112375 + st->stepped_offset += skb_headlen(st->cur_skb);
112376 +
112377 + while (st->frag_idx < skb_shinfo(st->cur_skb)->nr_frags) {
112378 + frag = &skb_shinfo(st->cur_skb)->frags[st->frag_idx];
112379 + block_limit = frag->size + st->stepped_offset;
112380 +
112381 + if (abs_offset < block_limit) {
112382 + if (!st->frag_data)
112383 + st->frag_data = kmap_skb_frag(frag);
112384 +
112385 + *data = (u8 *) st->frag_data + frag->page_offset +
112386 + (abs_offset - st->stepped_offset);
112387 +
112388 + return block_limit - abs_offset;
112389 + }
112390 +
112391 + if (st->frag_data) {
112392 + kunmap_skb_frag(st->frag_data);
112393 + st->frag_data = NULL;
112394 + }
112395 +
112396 + st->frag_idx++;
112397 + st->stepped_offset += frag->size;
112398 + }
112399 +
112400 + if (st->cur_skb->next) {
112401 + st->cur_skb = st->cur_skb->next;
112402 + st->frag_idx = 0;
112403 + goto next_skb;
112404 + } else if (st->root_skb == st->cur_skb &&
112405 + skb_shinfo(st->root_skb)->frag_list) {
112406 + st->cur_skb = skb_shinfo(st->root_skb)->frag_list;
112407 + goto next_skb;
112408 + }
112409 +
112410 + return 0;
112411 +}
112412 +
112413 +/**
112414 + * skb_abort_seq_read - Abort a sequential read of skb data
112415 + * @st: state variable
112416 + *
112417 + * Must be called if the sequential read was abandoned before
112418 + * skb_seq_read() returned 0.
112419 + */
112420 +void skb_abort_seq_read(struct skb_seq_state *st)
112421 +{
112422 + if (st->frag_data)
112423 + kunmap_skb_frag(st->frag_data);
112424 +}
112425 +
112426 +#define TS_SKB_CB(state) ((struct skb_seq_state *) &((state)->cb))
112427 +
112428 +static unsigned int skb_ts_get_next_block(unsigned int offset, const u8 **text,
112429 + struct ts_config *conf,
112430 + struct ts_state *state)
112431 +{
112432 + return skb_seq_read(offset, text, TS_SKB_CB(state));
112433 +}
112434 +
112435 +static void skb_ts_finish(struct ts_config *conf, struct ts_state *state)
112436 +{
112437 + skb_abort_seq_read(TS_SKB_CB(state));
112438 +}
112439 +
112440 +/**
112441 + * skb_find_text - Find a text pattern in skb data
112442 + * @skb: the buffer to look in
112443 + * @from: search offset
112444 + * @to: search limit
112445 + * @config: textsearch configuration
112446 + * @state: uninitialized textsearch state variable
112447 + *
112448 + * Finds a pattern in the skb data according to the specified
112449 + * textsearch configuration. Use textsearch_next() to retrieve
112450 + * subsequent occurrences of the pattern. Returns the offset
112451 + * to the first occurrence or UINT_MAX if no match was found.
112452 + */
112453 +unsigned int skb_find_text(struct sk_buff *skb, unsigned int from,
112454 + unsigned int to, struct ts_config *config,
112455 + struct ts_state *state)
112456 +{
112457 + config->get_next_block = skb_ts_get_next_block;
112458 + config->finish = skb_ts_finish;
112459 +
112460 + skb_prepare_seq_read(skb, from, to, TS_SKB_CB(state));
112461 +
112462 + return textsearch_find(config, state);
112463 +}
112464 +
112465 +/**
112466 + * skb_append_datato_frags - append the user data to a skb
112467 + * @sk: sock structure
112468 + * @skb: skb structure to be appended with user data
112469 + * @getfrag: callback function to be used for getting the user data
112470 + * @from: pointer to user message iov
112471 + * @length: length of the iov message
112472 + *
112473 + * Description: This procedure appends the user data to the fragment part
112474 + * of the skb. If any page allocation fails, it returns -ENOMEM.
112475 + */
112476 +int skb_append_datato_frags(struct sock *sk, struct sk_buff *skb,
112477 + int (*getfrag)(void *from, char *to, int offset,
112478 + int len, int odd, struct sk_buff *skb),
112479 + void *from, int length)
112480 +{
112481 + int frg_cnt = 0;
112482 + skb_frag_t *frag = NULL;
112483 + struct page *page = NULL;
112484 + int copy, left;
112485 + int offset = 0;
112486 + int ret;
112487 +
112488 + do {
112489 + /* Return error if we don't have space for new frag */
112490 + frg_cnt = skb_shinfo(skb)->nr_frags;
112491 + if (frg_cnt >= MAX_SKB_FRAGS)
112492 + return -EFAULT;
112493 +
112494 + /* allocate a new page for next frag */
112495 + page = alloc_pages(sk->sk_allocation, 0);
112496 +
112497 + /* If alloc_page fails just return failure and caller will
112498 + * free previous allocated pages by doing kfree_skb()
112499 + */
112500 + if (page == NULL)
112501 + return -ENOMEM;
112502 +
112503 + /* initialize the next frag */
112504 + sk->sk_sndmsg_page = page;
112505 + sk->sk_sndmsg_off = 0;
112506 + skb_fill_page_desc(skb, frg_cnt, page, 0, 0);
112507 + skb->truesize += PAGE_SIZE;
112508 + atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc);
112509 +
112510 + /* get the new initialized frag */
112511 + frg_cnt = skb_shinfo(skb)->nr_frags;
112512 + frag = &skb_shinfo(skb)->frags[frg_cnt - 1];
112513 +
112514 + /* copy the user data to page */
112515 + left = PAGE_SIZE - frag->page_offset;
112516 + copy = (length > left)? left : length;
112517 +
112518 + ret = getfrag(from, (page_address(frag->page) +
112519 + frag->page_offset + frag->size),
112520 + offset, copy, 0, skb);
112521 + if (ret < 0)
112522 + return -EFAULT;
112523 +
112524 + /* copy was successful so update the size parameters */
112525 + sk->sk_sndmsg_off += copy;
112526 + frag->size += copy;
112527 + skb->len += copy;
112528 + skb->data_len += copy;
112529 + offset += copy;
112530 + length -= copy;
112531 +
112532 + } while (length > 0);
112533 +
112534 + return 0;
112535 +}
112536 +
112537 +/**
112538 + * skb_segment - Perform protocol segmentation on skb.
112539 + * @skb: buffer to segment
112540 + * @features: features for the output path (see dev->features)
112541 + *
112542 + * This function performs segmentation on the given skb. It returns
112543 + * a pointer to the first of the resulting segments, or an ERR_PTR()
112544 + * value if an error is encountered.
112545 + */
112546 +struct sk_buff *skb_segment(struct sk_buff *skb, int features)
112547 +{
112548 + struct sk_buff *segs = NULL;
112549 + struct sk_buff *tail = NULL;
112550 + unsigned int mss = skb_shinfo(skb)->gso_size;
112551 + unsigned int doffset = skb->data - skb->mac.raw;
112552 + unsigned int offset = doffset;
112553 + unsigned int headroom;
112554 + unsigned int len;
112555 + int sg = features & NETIF_F_SG;
112556 + int nfrags = skb_shinfo(skb)->nr_frags;
112557 + int err = -ENOMEM;
112558 + int i = 0;
112559 + int pos;
112560 +
112561 + __skb_push(skb, doffset);
112562 + headroom = skb_headroom(skb);
112563 + pos = skb_headlen(skb);
112564 +
112565 + do {
112566 + struct sk_buff *nskb;
112567 + skb_frag_t *frag;
112568 + int hsize, nsize;
112569 + int k;
112570 + int size;
112571 +
112572 + len = skb->len - offset;
112573 + if (len > mss)
112574 + len = mss;
112575 +
112576 + hsize = skb_headlen(skb) - offset;
112577 + if (hsize < 0)
112578 + hsize = 0;
112579 + nsize = hsize + doffset;
112580 + if (nsize > len + doffset || !sg)
112581 + nsize = len + doffset;
112582 +
112583 + nskb = alloc_skb(nsize + headroom, GFP_ATOMIC);
112584 + if (unlikely(!nskb))
112585 + goto err;
112586 +
112587 + if (segs)
112588 + tail->next = nskb;
112589 + else
112590 + segs = nskb;
112591 + tail = nskb;
112592 +
112593 + nskb->dev = skb->dev;
112594 + nskb->priority = skb->priority;
112595 + nskb->protocol = skb->protocol;
112596 + nskb->dst = dst_clone(skb->dst);
112597 + memcpy(nskb->cb, skb->cb, sizeof(skb->cb));
112598 + nskb->pkt_type = skb->pkt_type;
112599 + nskb->mac_len = skb->mac_len;
112600 +
112601 + skb_reserve(nskb, headroom);
112602 + nskb->mac.raw = nskb->data;
112603 + nskb->nh.raw = nskb->data + skb->mac_len;
112604 + nskb->h.raw = nskb->nh.raw + (skb->h.raw - skb->nh.raw);
112605 + memcpy(skb_put(nskb, doffset), skb->data, doffset);
112606 +
112607 + if (!sg) {
112608 + nskb->csum = skb_copy_and_csum_bits(skb, offset,
112609 + skb_put(nskb, len),
112610 + len, 0);
112611 + continue;
112612 + }
112613 +
112614 + frag = skb_shinfo(nskb)->frags;
112615 + k = 0;
112616 +
112617 + nskb->ip_summed = CHECKSUM_HW;
112618 + nskb->csum = skb->csum;
112619 + memcpy(skb_put(nskb, hsize), skb->data + offset, hsize);
112620 +
112621 + while (pos < offset + len) {
112622 + BUG_ON(i >= nfrags);
112623 +
112624 + *frag = skb_shinfo(skb)->frags[i];
112625 + get_page(frag->page);
112626 + size = frag->size;
112627 +
112628 + if (pos < offset) {
112629 + frag->page_offset += offset - pos;
112630 + frag->size -= offset - pos;
112631 + }
112632 +
112633 + k++;
112634 +
112635 + if (pos + size <= offset + len) {
112636 + i++;
112637 + pos += size;
112638 + } else {
112639 + frag->size -= pos + size - (offset + len);
112640 + break;
112641 + }
112642 +
112643 + frag++;
112644 + }
112645 +
112646 + skb_shinfo(nskb)->nr_frags = k;
112647 + nskb->data_len = len - hsize;
112648 + nskb->len += nskb->data_len;
112649 + nskb->truesize += nskb->data_len;
112650 + } while ((offset += len) < skb->len);
112651 +
112652 + return segs;
112653 +
112654 +err:
112655 + while ((skb = segs)) {
112656 + segs = skb->next;
112657 + kfree(skb);
112658 + }
112659 + return ERR_PTR(err);
112660 +}
112661 +
112662 +EXPORT_SYMBOL_GPL(skb_segment);
112663 +
112664 +void __init skb_init(void)
112665 +{
112666 + skbuff_head_cache = kmem_cache_create("skbuff_head_cache",
112667 + sizeof(struct sk_buff),
112668 + 0,
112669 + SLAB_HWCACHE_ALIGN,
112670 + NULL, NULL);
112671 + if (!skbuff_head_cache)
112672 + panic("cannot create skbuff cache");
112673 +
112674 + skbuff_fclone_cache = kmem_cache_create("skbuff_fclone_cache",
112675 + (2*sizeof(struct sk_buff)) +
112676 + sizeof(atomic_t),
112677 + 0,
112678 + SLAB_HWCACHE_ALIGN,
112679 + NULL, NULL);
112680 + if (!skbuff_fclone_cache)
112681 + panic("cannot create skbuff cache");
112682 +}
112683 +
112684 +EXPORT_SYMBOL(___pskb_trim);
112685 +EXPORT_SYMBOL(__kfree_skb);
112686 +EXPORT_SYMBOL(__pskb_pull_tail);
112687 +EXPORT_SYMBOL(__alloc_skb);
112688 +EXPORT_SYMBOL(pskb_copy);
112689 +EXPORT_SYMBOL(pskb_expand_head);
112690 +EXPORT_SYMBOL(skb_checksum);
112691 +EXPORT_SYMBOL(skb_clone);
112692 +EXPORT_SYMBOL(skb_clone_fraglist);
112693 +EXPORT_SYMBOL(skb_copy);
112694 +EXPORT_SYMBOL(skb_copy_and_csum_bits);
112695 +EXPORT_SYMBOL(skb_copy_and_csum_dev);
112696 +EXPORT_SYMBOL(skb_copy_bits);
112697 +EXPORT_SYMBOL(skb_copy_expand);
112698 +EXPORT_SYMBOL(skb_over_panic);
112699 +EXPORT_SYMBOL(skb_pad);
112700 +EXPORT_SYMBOL(skb_realloc_headroom);
112701 +EXPORT_SYMBOL(skb_under_panic);
112702 +EXPORT_SYMBOL(skb_dequeue);
112703 +EXPORT_SYMBOL(skb_dequeue_tail);
112704 +EXPORT_SYMBOL(skb_insert);
112705 +EXPORT_SYMBOL(skb_queue_purge);
112706 +EXPORT_SYMBOL(skb_queue_head);
112707 +EXPORT_SYMBOL(skb_queue_tail);
112708 +EXPORT_SYMBOL(skb_unlink);
112709 +EXPORT_SYMBOL(skb_append);
112710 +EXPORT_SYMBOL(skb_split);
112711 +EXPORT_SYMBOL(skb_prepare_seq_read);
112712 +EXPORT_SYMBOL(skb_seq_read);
112713 +EXPORT_SYMBOL(skb_abort_seq_read);
112714 +EXPORT_SYMBOL(skb_find_text);
112715 +EXPORT_SYMBOL(skb_append_datato_frags);
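The skb_checksum() and skb_copy_and_csum_bits() additions above checksum the linear head, each page fragment and each frag_list member separately with csum_partial(), then merge the partial sums with csum_block_add(), which compensates for chunks that start at an odd byte offset. A minimal user-space sketch of that merging step follows; ocsum() and block_add() are illustrative names, not kernel APIs.

#include <stdint.h>
#include <stdio.h>

/* One's-complement sum of a byte range, big-endian 16-bit words
 * (roughly what csum_partial() computes, folded to 16 bits). */
static uint32_t ocsum(const uint8_t *buf, size_t len)
{
        uint32_t sum = 0;
        size_t i;

        for (i = 0; i + 1 < len; i += 2)
                sum += ((uint32_t)buf[i] << 8) | buf[i + 1];
        if (len & 1)
                sum += (uint32_t)buf[len - 1] << 8;
        while (sum >> 16)
                sum = (sum & 0xffff) + (sum >> 16);
        return sum;
}

/* Merge a chunk's sum into a running sum.  If the chunk started at an
 * odd offset its bytes sit in the opposite lanes, so the partial sum
 * is byte-swapped first, the same trick csum_block_add() relies on. */
static uint32_t block_add(uint32_t sum, uint32_t part, size_t offset)
{
        if (offset & 1)
                part = ((part & 0xff) << 8) | (part >> 8);
        sum += part;
        while (sum >> 16)
                sum = (sum & 0xffff) + (sum >> 16);
        return sum;
}

int main(void)
{
        static const uint8_t msg[] = "a fragmented payload checksummed in pieces";
        size_t len = sizeof(msg) - 1, split = 11;   /* odd split on purpose */
        uint32_t whole = ocsum(msg, len);
        uint32_t parts = block_add(ocsum(msg, split),
                                   ocsum(msg + split, len - split), split);

        printf("whole=%04x parts=%04x\n", whole, parts);
        return whole == parts ? 0 : 1;
}

Both paths print the same 16-bit value, which is why skb_checksum() can recurse into frag_list members with a zero seed and fold the results in afterwards.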
112716 diff -Nur linux-2.6.16.33-noxen/net/decnet/dn_nsp_in.c linux-2.6.16.33/net/decnet/dn_nsp_in.c
112717 --- linux-2.6.16.33-noxen/net/decnet/dn_nsp_in.c 2006-11-22 18:06:31.000000000 +0000
112718 +++ linux-2.6.16.33/net/decnet/dn_nsp_in.c 2007-05-23 21:00:01.000000000 +0000
112719 @@ -801,8 +801,7 @@
112720 * We linearize everything except data segments here.
112721 */
112722 if (cb->nsp_flags & ~0x60) {
112723 - if (unlikely(skb_is_nonlinear(skb)) &&
112724 - skb_linearize(skb, GFP_ATOMIC) != 0)
112725 + if (unlikely(skb_linearize(skb)))
112726 goto free_out;
112727 }
112728
112729 diff -Nur linux-2.6.16.33-noxen/net/decnet/dn_route.c linux-2.6.16.33/net/decnet/dn_route.c
112730 --- linux-2.6.16.33-noxen/net/decnet/dn_route.c 2006-11-22 18:06:31.000000000 +0000
112731 +++ linux-2.6.16.33/net/decnet/dn_route.c 2007-05-23 21:00:01.000000000 +0000
112732 @@ -629,8 +629,7 @@
112733 padlen);
112734
112735 if (flags & DN_RT_PKT_CNTL) {
112736 - if (unlikely(skb_is_nonlinear(skb)) &&
112737 - skb_linearize(skb, GFP_ATOMIC) != 0)
112738 + if (unlikely(skb_linearize(skb)))
112739 goto dump_it;
112740
112741 switch(flags & DN_RT_CNTL_MSK) {
112742 diff -Nur linux-2.6.16.33-noxen/net/ipv4/af_inet.c linux-2.6.16.33/net/ipv4/af_inet.c
112743 --- linux-2.6.16.33-noxen/net/ipv4/af_inet.c 2006-11-22 18:06:31.000000000 +0000
112744 +++ linux-2.6.16.33/net/ipv4/af_inet.c 2007-05-23 21:00:01.000000000 +0000
112745 @@ -68,6 +68,7 @@
112746 */
112747
112748 #include <linux/config.h>
112749 +#include <linux/err.h>
112750 #include <linux/errno.h>
112751 #include <linux/types.h>
112752 #include <linux/socket.h>
112753 @@ -1084,6 +1085,88 @@
112754
112755 EXPORT_SYMBOL(inet_sk_rebuild_header);
112756
112757 +static int inet_gso_send_check(struct sk_buff *skb)
112758 +{
112759 + struct iphdr *iph;
112760 + struct net_protocol *ops;
112761 + int proto;
112762 + int ihl;
112763 + int err = -EINVAL;
112764 +
112765 + if (unlikely(!pskb_may_pull(skb, sizeof(*iph))))
112766 + goto out;
112767 +
112768 + iph = skb->nh.iph;
112769 + ihl = iph->ihl * 4;
112770 + if (ihl < sizeof(*iph))
112771 + goto out;
112772 +
112773 + if (unlikely(!pskb_may_pull(skb, ihl)))
112774 + goto out;
112775 +
112776 + skb->h.raw = __skb_pull(skb, ihl);
112777 + iph = skb->nh.iph;
112778 + proto = iph->protocol & (MAX_INET_PROTOS - 1);
112779 + err = -EPROTONOSUPPORT;
112780 +
112781 + rcu_read_lock();
112782 + ops = rcu_dereference(inet_protos[proto]);
112783 + if (likely(ops && ops->gso_send_check))
112784 + err = ops->gso_send_check(skb);
112785 + rcu_read_unlock();
112786 +
112787 +out:
112788 + return err;
112789 +}
112790 +
112791 +static struct sk_buff *inet_gso_segment(struct sk_buff *skb, int features)
112792 +{
112793 + struct sk_buff *segs = ERR_PTR(-EINVAL);
112794 + struct iphdr *iph;
112795 + struct net_protocol *ops;
112796 + int proto;
112797 + int ihl;
112798 + int id;
112799 +
112800 + if (!pskb_may_pull(skb, sizeof(*iph)))
112801 + goto out;
112802 +
112803 + iph = skb->nh.iph;
112804 + ihl = iph->ihl * 4;
112805 + if (ihl < sizeof(*iph))
112806 + goto out;
112807 +
112808 + if (!pskb_may_pull(skb, ihl))
112809 + goto out;
112810 +
112811 + skb->h.raw = __skb_pull(skb, ihl);
112812 + iph = skb->nh.iph;
112813 + id = ntohs(iph->id);
112814 + proto = iph->protocol & (MAX_INET_PROTOS - 1);
112815 + segs = ERR_PTR(-EPROTONOSUPPORT);
112816 +
112817 + rcu_read_lock();
112818 + ops = rcu_dereference(inet_protos[proto]);
112819 + if (ops && ops->gso_segment)
112820 + segs = ops->gso_segment(skb, features);
112821 + rcu_read_unlock();
112822 +
112823 + if (!segs || unlikely(IS_ERR(segs)))
112824 + goto out;
112825 +
112826 + skb = segs;
112827 + do {
112828 + iph = skb->nh.iph;
112829 + iph->id = htons(id++);
112830 + iph->tot_len = htons(skb->len - skb->mac_len);
112831 + iph->check = 0;
112832 + iph->check = ip_fast_csum(skb->nh.raw, iph->ihl);
112833 + } while ((skb = skb->next));
112834 +
112835 +out:
112836 + return segs;
112837 +}
112838 +
112839 #ifdef CONFIG_IP_MULTICAST
112840 static struct net_protocol igmp_protocol = {
112841 .handler = igmp_rcv,
112842 @@ -1093,6 +1176,8 @@
112843 static struct net_protocol tcp_protocol = {
112844 .handler = tcp_v4_rcv,
112845 .err_handler = tcp_v4_err,
112846 + .gso_send_check = tcp_v4_gso_send_check,
112847 + .gso_segment = tcp_tso_segment,
112848 .no_policy = 1,
112849 };
112850
112851 @@ -1138,6 +1223,8 @@
112852 static struct packet_type ip_packet_type = {
112853 .type = __constant_htons(ETH_P_IP),
112854 .func = ip_rcv,
112855 + .gso_send_check = inet_gso_send_check,
112856 + .gso_segment = inet_gso_segment,
112857 };
112858
112859 static int __init inet_init(void)
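inet_gso_segment() above masks the protocol number into the inet_protos[] array under rcu_read_lock(), delegates to the handler's gso_segment callback if one exists, and then walks the resulting segments fixing up the IP id, total length and header checksum. The dispatch pattern in isolation, as a hedged user-space sketch (the table, names and handlers are invented; RCU is omitted):

#include <stdio.h>

#define MAX_PROTOS 256                  /* like MAX_INET_PROTOS, a power of two */

struct proto_ops {
        const char *name;
        int (*segment)(int payload_len, int mss);   /* stand-in for gso_segment */
};

static int tcp_segment(int payload_len, int mss)
{
        return (payload_len + mss - 1) / mss;       /* number of segments */
}

static const struct proto_ops tcp_ops = { "tcp", tcp_segment };

static const struct proto_ops *proto_table[MAX_PROTOS] = {
        [6] = &tcp_ops,                 /* IPPROTO_TCP */
};

static int dispatch_segment(int proto, int payload_len, int mss)
{
        const struct proto_ops *ops = proto_table[proto & (MAX_PROTOS - 1)];

        if (!ops || !ops->segment)
                return -1;              /* -EPROTONOSUPPORT analogue */
        return ops->segment(payload_len, mss);
}

int main(void)
{
        printf("tcp: %d segments\n", dispatch_segment(6, 4000, 1460));
        printf("udp: %d (unsupported here)\n", dispatch_segment(17, 4000, 1460));
        return 0;
}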
112860 diff -Nur linux-2.6.16.33-noxen/net/ipv4/ip_output.c linux-2.6.16.33/net/ipv4/ip_output.c
112861 --- linux-2.6.16.33-noxen/net/ipv4/ip_output.c 2006-11-22 18:06:31.000000000 +0000
112862 +++ linux-2.6.16.33/net/ipv4/ip_output.c 2007-05-23 21:00:01.000000000 +0000
112863 @@ -210,8 +210,7 @@
112864 return dst_output(skb);
112865 }
112866 #endif
112867 - if (skb->len > dst_mtu(skb->dst) &&
112868 - !(skb_shinfo(skb)->ufo_size || skb_shinfo(skb)->tso_size))
112869 + if (skb->len > dst_mtu(skb->dst) && !skb_is_gso(skb))
112870 return ip_fragment(skb, ip_finish_output2);
112871 else
112872 return ip_finish_output2(skb);
112873 @@ -362,7 +361,7 @@
112874 }
112875
112876 ip_select_ident_more(iph, &rt->u.dst, sk,
112877 - (skb_shinfo(skb)->tso_segs ?: 1) - 1);
112878 + (skb_shinfo(skb)->gso_segs ?: 1) - 1);
112879
112880 /* Add an IP checksum. */
112881 ip_send_check(iph);
112882 @@ -743,7 +742,8 @@
112883 (length - transhdrlen));
112884 if (!err) {
112885 /* specify the length of each IP datagram fragment*/
112886 - skb_shinfo(skb)->ufo_size = (mtu - fragheaderlen);
112887 + skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
112888 + skb_shinfo(skb)->gso_type = SKB_GSO_UDPV4;
112889 __skb_queue_tail(&sk->sk_write_queue, skb);
112890
112891 return 0;
112892 @@ -839,7 +839,7 @@
112893 */
112894 if (transhdrlen &&
112895 length + fragheaderlen <= mtu &&
112896 - rt->u.dst.dev->features&(NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM) &&
112897 + rt->u.dst.dev->features & NETIF_F_ALL_CSUM &&
112898 !exthdrlen)
112899 csummode = CHECKSUM_HW;
112900
112901 @@ -1086,14 +1086,16 @@
112902
112903 inet->cork.length += size;
112904 if ((sk->sk_protocol == IPPROTO_UDP) &&
112905 - (rt->u.dst.dev->features & NETIF_F_UFO))
112906 - skb_shinfo(skb)->ufo_size = (mtu - fragheaderlen);
112907 + (rt->u.dst.dev->features & NETIF_F_UFO)) {
112908 + skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
112909 + skb_shinfo(skb)->gso_type = SKB_GSO_UDPV4;
112910 + }
112911
112912
112913 while (size > 0) {
112914 int i;
112915
112916 - if (skb_shinfo(skb)->ufo_size)
112917 + if (skb_is_gso(skb))
112918 len = size;
112919 else {
112920
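The ip_output.c hunks replace ufo_size with gso_size = mtu - fragheaderlen: the value stored is the per-fragment payload budget, i.e. the path MTU minus the header each fragment will carry. A small sketch of that arithmetic and the resulting fragment count; the constants are example values, not taken from the patch.

#include <stdio.h>

int main(void)
{
        int mtu = 1500;
        int fragheaderlen = 20;               /* assume a plain IPv4 header */
        int datalen = 65000;                  /* large UDP payload handed down */

        int gso_size = mtu - fragheaderlen;   /* what the hunk stores in gso_size */
        int nfrags = (datalen + gso_size - 1) / gso_size;

        printf("gso_size=%d fragments=%d\n", gso_size, nfrags);
        return 0;
}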
112921 diff -Nur linux-2.6.16.33-noxen/net/ipv4/ipcomp.c linux-2.6.16.33/net/ipv4/ipcomp.c
112922 --- linux-2.6.16.33-noxen/net/ipv4/ipcomp.c 2006-11-22 18:06:31.000000000 +0000
112923 +++ linux-2.6.16.33/net/ipv4/ipcomp.c 2007-05-23 21:00:01.000000000 +0000
112924 @@ -84,7 +84,7 @@
112925 struct xfrm_decap_state *decap, struct sk_buff *skb)
112926 {
112927 u8 nexthdr;
112928 - int err = 0;
112929 + int err = -ENOMEM;
112930 struct iphdr *iph;
112931 union {
112932 struct iphdr iph;
112933 @@ -92,11 +92,8 @@
112934 } tmp_iph;
112935
112936
112937 - if ((skb_is_nonlinear(skb) || skb_cloned(skb)) &&
112938 - skb_linearize(skb, GFP_ATOMIC) != 0) {
112939 - err = -ENOMEM;
112940 + if (skb_linearize_cow(skb))
112941 goto out;
112942 - }
112943
112944 skb->ip_summed = CHECKSUM_NONE;
112945
112946 @@ -171,10 +168,8 @@
112947 goto out_ok;
112948 }
112949
112950 - if ((skb_is_nonlinear(skb) || skb_cloned(skb)) &&
112951 - skb_linearize(skb, GFP_ATOMIC) != 0) {
112952 + if (skb_linearize_cow(skb))
112953 goto out_ok;
112954 - }
112955
112956 err = ipcomp_compress(x, skb);
112957 iph = skb->nh.iph;
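The ipcomp.c hunks collapse the open-coded "nonlinear or cloned, then skb_linearize(skb, GFP_ATOMIC)" sequence into a single skb_linearize_cow(skb) call that returns 0 on success, so callers can simply do "if (skb_linearize_cow(skb)) goto out;". A user-space analogue of such a helper, with an invented buffer type purely for illustration:

#include <stdlib.h>
#include <string.h>

/* Toy buffer: data may be split into two chunks and/or shared. */
struct buf {
        unsigned char *head;     /* linear part */
        size_t headlen;
        unsigned char *frag;     /* optional second chunk */
        size_t fraglen;
        int refcount;            /* >1 means shared, must copy before writing */
};

/* Make the buffer private and contiguous; 0 on success, -1 on ENOMEM.
 * This mirrors the calling convention used in the hunks above. */
static int linearize_cow(struct buf *b)
{
        unsigned char *flat;
        size_t total = b->headlen + b->fraglen;

        if (b->refcount == 1 && !b->fraglen)
                return 0;                       /* already private and linear */

        flat = malloc(total ? total : 1);
        if (!flat)
                return -1;
        memcpy(flat, b->head, b->headlen);
        if (b->fraglen)
                memcpy(flat + b->headlen, b->frag, b->fraglen);

        b->head = flat;                         /* old chunks leak in this toy */
        b->headlen = total;
        b->frag = NULL;
        b->fraglen = 0;
        b->refcount = 1;
        return 0;
}

int main(void)
{
        unsigned char h[] = "head";
        unsigned char f[] = "tail";
        struct buf b = { h, 4, f, 4, 2 };

        return (linearize_cow(&b) == 0 && b.headlen == 8) ? 0 : 1;
}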
112958 diff -Nur linux-2.6.16.33-noxen/net/ipv4/netfilter/ip_nat_proto_tcp.c linux-2.6.16.33/net/ipv4/netfilter/ip_nat_proto_tcp.c
112959 --- linux-2.6.16.33-noxen/net/ipv4/netfilter/ip_nat_proto_tcp.c 2006-11-22 18:06:31.000000000 +0000
112960 +++ linux-2.6.16.33/net/ipv4/netfilter/ip_nat_proto_tcp.c 2007-05-23 21:00:01.000000000 +0000
112961 @@ -129,7 +129,12 @@
112962 if (hdrsize < sizeof(*hdr))
112963 return 1;
112964
112965 - hdr->check = ip_nat_cheat_check(~oldip, newip,
112966 +#ifdef CONFIG_XEN
112967 + if ((*pskb)->proto_csum_blank)
112968 + hdr->check = ip_nat_cheat_check(oldip, ~newip, hdr->check);
112969 + else
112970 +#endif
112971 + hdr->check = ip_nat_cheat_check(~oldip, newip,
112972 ip_nat_cheat_check(oldport ^ 0xFFFF,
112973 newport,
112974 hdr->check));
112975 diff -Nur linux-2.6.16.33-noxen/net/ipv4/netfilter/ip_nat_proto_udp.c linux-2.6.16.33/net/ipv4/netfilter/ip_nat_proto_udp.c
112976 --- linux-2.6.16.33-noxen/net/ipv4/netfilter/ip_nat_proto_udp.c 2006-11-22 18:06:31.000000000 +0000
112977 +++ linux-2.6.16.33/net/ipv4/netfilter/ip_nat_proto_udp.c 2007-05-23 21:00:01.000000000 +0000
112978 @@ -113,11 +113,17 @@
112979 newport = tuple->dst.u.udp.port;
112980 portptr = &hdr->dest;
112981 }
112982 - if (hdr->check) /* 0 is a special case meaning no checksum */
112983 - hdr->check = ip_nat_cheat_check(~oldip, newip,
112984 + if (hdr->check) { /* 0 is a special case meaning no checksum */
112985 +#ifdef CONFIG_XEN
112986 + if ((*pskb)->proto_csum_blank)
112987 + hdr->check = ip_nat_cheat_check(oldip, ~newip, hdr->check);
112988 + else
112989 +#endif
112990 + hdr->check = ip_nat_cheat_check(~oldip, newip,
112991 ip_nat_cheat_check(*portptr ^ 0xFFFF,
112992 newport,
112993 hdr->check));
112994 + }
112995 *portptr = newport;
112996 return 1;
112997 }
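Both NAT hunks fold the rewrite into the existing TCP/UDP checksum with ip_nat_cheat_check(), an RFC 1624 style incremental update: remove the old 16-bit words, add the new ones, never touch the payload. (The Xen branch flips the operands and skips the port adjustment when proto_csum_blank is set, presumably because the field then still holds the un-complemented pseudo-header seed rather than a finished checksum.) A stand-alone sketch of the incremental update itself; the function names are illustrative, not the kernel's.

#include <stdint.h>
#include <stdio.h>

static uint16_t fold(uint32_t sum)
{
        while (sum >> 16)
                sum = (sum & 0xffff) + (sum >> 16);
        return (uint16_t)sum;
}

/* Full recomputation over 16-bit words; used only to verify the
 * incremental path below. */
static uint16_t csum_full(const uint16_t *w, size_t n)
{
        uint32_t sum = 0;
        size_t i;

        for (i = 0; i < n; i++)
                sum += w[i];
        return (uint16_t)~fold(sum);
}

/* RFC 1624: new_check = ~(~old_check + ~old_word + new_word). */
static uint16_t csum_update(uint16_t check, uint16_t old_word, uint16_t new_word)
{
        uint32_t sum = (uint16_t)~check;
        sum += (uint16_t)~old_word;
        sum += new_word;
        return (uint16_t)~fold(sum);
}

int main(void)
{
        uint16_t hdr[] = { 0x4500, 0x0054, 0x1c46, 0x4000, 0x4006,
                           0x0000, 0xc0a8, 0x0001, 0xc0a8, 0x00c7 };
        size_t n = sizeof(hdr) / sizeof(hdr[0]);
        uint16_t incr;

        hdr[5] = csum_full(hdr, n);          /* checksum over the header */

        /* Rewrite the last address word, update incrementally, then
         * compare against a full recomputation. */
        incr = csum_update(hdr[5], hdr[9], 0x0063);
        hdr[9] = 0x0063;
        hdr[5] = 0;
        printf("incremental=%04x full=%04x\n", incr, csum_full(hdr, n));
        return incr == csum_full(hdr, n) ? 0 : 1;
}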
112998 diff -Nur linux-2.6.16.33-noxen/net/ipv4/tcp.c linux-2.6.16.33/net/ipv4/tcp.c
112999 --- linux-2.6.16.33-noxen/net/ipv4/tcp.c 2006-11-22 18:06:31.000000000 +0000
113000 +++ linux-2.6.16.33/net/ipv4/tcp.c 2007-05-23 21:00:01.000000000 +0000
113001 @@ -257,6 +257,7 @@
113002 #include <linux/fs.h>
113003 #include <linux/random.h>
113004 #include <linux/bootmem.h>
113005 +#include <linux/err.h>
113006
113007 #include <net/icmp.h>
113008 #include <net/tcp.h>
113009 @@ -570,7 +571,7 @@
113010 skb->ip_summed = CHECKSUM_HW;
113011 tp->write_seq += copy;
113012 TCP_SKB_CB(skb)->end_seq += copy;
113013 - skb_shinfo(skb)->tso_segs = 0;
113014 + skb_shinfo(skb)->gso_segs = 0;
113015
113016 if (!copied)
113017 TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
113018 @@ -621,14 +622,10 @@
113019 ssize_t res;
113020 struct sock *sk = sock->sk;
113021
113022 -#define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)
113023 -
113024 if (!(sk->sk_route_caps & NETIF_F_SG) ||
113025 - !(sk->sk_route_caps & TCP_ZC_CSUM_FLAGS))
113026 + !(sk->sk_route_caps & NETIF_F_ALL_CSUM))
113027 return sock_no_sendpage(sock, page, offset, size, flags);
113028
113029 -#undef TCP_ZC_CSUM_FLAGS
113030 -
113031 lock_sock(sk);
113032 TCP_CHECK_TIMER(sk);
113033 res = do_tcp_sendpages(sk, &page, offset, size, flags);
113034 @@ -725,9 +722,7 @@
113035 /*
113036 * Check whether we can use HW checksum.
113037 */
113038 - if (sk->sk_route_caps &
113039 - (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM |
113040 - NETIF_F_HW_CSUM))
113041 + if (sk->sk_route_caps & NETIF_F_ALL_CSUM)
113042 skb->ip_summed = CHECKSUM_HW;
113043
113044 skb_entail(sk, tp, skb);
113045 @@ -823,7 +818,7 @@
113046
113047 tp->write_seq += copy;
113048 TCP_SKB_CB(skb)->end_seq += copy;
113049 - skb_shinfo(skb)->tso_segs = 0;
113050 + skb_shinfo(skb)->gso_segs = 0;
113051
113052 from += copy;
113053 copied += copy;
113054 @@ -2026,6 +2021,77 @@
113055 }
113056
113057
113058 +struct sk_buff *tcp_tso_segment(struct sk_buff *skb, int features)
113059 +{
113060 + struct sk_buff *segs = ERR_PTR(-EINVAL);
113061 + struct tcphdr *th;
113062 + unsigned thlen;
113063 + unsigned int seq;
113064 + unsigned int delta;
113065 + unsigned int oldlen;
113066 + unsigned int len;
113067 +
113068 + if (!pskb_may_pull(skb, sizeof(*th)))
113069 + goto out;
113070 +
113071 + th = skb->h.th;
113072 + thlen = th->doff * 4;
113073 + if (thlen < sizeof(*th))
113074 + goto out;
113075 +
113076 + if (!pskb_may_pull(skb, thlen))
113077 + goto out;
113078 +
113079 + oldlen = (u16)~skb->len;
113080 + __skb_pull(skb, thlen);
113081 +
113082 + if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) {
113083 + /* Packet is from an untrusted source, reset gso_segs. */
113084 + int mss = skb_shinfo(skb)->gso_size;
113085 +
113086 + skb_shinfo(skb)->gso_segs = (skb->len + mss - 1) / mss;
113087 +
113088 + segs = NULL;
113089 + goto out;
113090 + }
113091 +
113092 + segs = skb_segment(skb, features);
113093 + if (IS_ERR(segs))
113094 + goto out;
113095 +
113096 + len = skb_shinfo(skb)->gso_size;
113097 + delta = htonl(oldlen + (thlen + len));
113098 +
113099 + skb = segs;
113100 + th = skb->h.th;
113101 + seq = ntohl(th->seq);
113102 +
113103 + do {
113104 + th->fin = th->psh = 0;
113105 +
113106 + th->check = ~csum_fold(th->check + delta);
113107 + if (skb->ip_summed != CHECKSUM_HW)
113108 + th->check = csum_fold(csum_partial(skb->h.raw, thlen,
113109 + skb->csum));
113110 +
113111 + seq += len;
113112 + skb = skb->next;
113113 + th = skb->h.th;
113114 +
113115 + th->seq = htonl(seq);
113116 + th->cwr = 0;
113117 + } while (skb->next);
113118 +
113119 + delta = htonl(oldlen + (skb->tail - skb->h.raw) + skb->data_len);
113120 + th->check = ~csum_fold(th->check + delta);
113121 + if (skb->ip_summed != CHECKSUM_HW)
113122 + th->check = csum_fold(csum_partial(skb->h.raw, thlen,
113123 + skb->csum));
113124 +
113125 +out:
113126 + return segs;
113127 +}
113128 +
113129 extern void __skb_cb_too_small_for_tcp(int, int);
113130 extern struct tcp_congestion_ops tcp_reno;
113131
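After skb_segment() has produced the list, tcp_tso_segment() above walks it: sequence numbers advance by the payload size per segment, FIN and PSH are cleared on all but the last segment, CWR on all but the first, and each checksum is patched by a constant delta. A toy version of that walk, stripped of checksumming; the segment struct is invented for illustration.

#include <stdio.h>

struct seg {
        unsigned int seq;
        int fin, psh;
        struct seg *next;
};

/* Renumber a chain of segments carved out of one big packet: every
 * segment but the last loses FIN/PSH, and sequence numbers advance by
 * the per-segment payload size (mss). */
static void fixup_segs(struct seg *head, unsigned int first_seq, unsigned int mss)
{
        struct seg *s = head;
        unsigned int seq = first_seq;

        while (s->next) {
                s->seq = seq;
                s->fin = s->psh = 0;
                seq += mss;
                s = s->next;
        }
        s->seq = seq;            /* last segment keeps its FIN/PSH bits */
}

int main(void)
{
        struct seg c = { 0, 1, 1, NULL };    /* FIN|PSH set on the original */
        struct seg b = { 0, 1, 1, &c };
        struct seg a = { 0, 1, 1, &b };
        struct seg *s;

        fixup_segs(&a, 1000, 1460);
        for (s = &a; s; s = s->next)
                printf("seq=%u fin=%d psh=%d\n", s->seq, s->fin, s->psh);
        return 0;
}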
113132 diff -Nur linux-2.6.16.33-noxen/net/ipv4/tcp_input.c linux-2.6.16.33/net/ipv4/tcp_input.c
113133 --- linux-2.6.16.33-noxen/net/ipv4/tcp_input.c 2006-11-22 18:06:31.000000000 +0000
113134 +++ linux-2.6.16.33/net/ipv4/tcp_input.c 2007-05-23 21:00:01.000000000 +0000
113135 @@ -127,7 +127,7 @@
113136 /* skb->len may jitter because of SACKs, even if peer
113137 * sends good full-sized frames.
113138 */
113139 - len = skb->len;
113140 + len = skb_shinfo(skb)->gso_size ?: skb->len;
113141 if (len >= icsk->icsk_ack.rcv_mss) {
113142 icsk->icsk_ack.rcv_mss = len;
113143 } else {
113144 @@ -1072,7 +1072,7 @@
113145 else
113146 pkt_len = (end_seq -
113147 TCP_SKB_CB(skb)->seq);
113148 - if (tcp_fragment(sk, skb, pkt_len, skb_shinfo(skb)->tso_size))
113149 + if (tcp_fragment(sk, skb, pkt_len, skb_shinfo(skb)->gso_size))
113150 break;
113151 pcount = tcp_skb_pcount(skb);
113152 }
113153 diff -Nur linux-2.6.16.33-noxen/net/ipv4/tcp_ipv4.c linux-2.6.16.33/net/ipv4/tcp_ipv4.c
113154 --- linux-2.6.16.33-noxen/net/ipv4/tcp_ipv4.c 2006-11-22 18:06:31.000000000 +0000
113155 +++ linux-2.6.16.33/net/ipv4/tcp_ipv4.c 2007-05-23 21:00:01.000000000 +0000
113156 @@ -495,6 +495,24 @@
113157 }
113158 }
113159
113160 +int tcp_v4_gso_send_check(struct sk_buff *skb)
113161 +{
113162 + struct iphdr *iph;
113163 + struct tcphdr *th;
113164 +
113165 + if (!pskb_may_pull(skb, sizeof(*th)))
113166 + return -EINVAL;
113167 +
113168 + iph = skb->nh.iph;
113169 + th = skb->h.th;
113170 +
113171 + th->check = 0;
113172 + th->check = ~tcp_v4_check(th, skb->len, iph->saddr, iph->daddr, 0);
113173 + skb->csum = offsetof(struct tcphdr, check);
113174 + skb->ip_summed = CHECKSUM_HW;
113175 + return 0;
113176 +}
113177 +
113178 /*
113179 * This routine will send an RST to the other tcp.
113180 *
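tcp_v4_gso_send_check() above primes th->check with the complement of the pseudo-header sum and records the offset of the checksum field in skb->csum, so a CHECKSUM_HW-capable device (or the software GSO path) only has to add the TCP header and payload on top. The pseudo-header part on its own, as a sketch with example addresses and host-byte-order arithmetic for simplicity:

#include <stdint.h>
#include <stdio.h>

/* One's-complement sum of the TCP/UDP pseudo-header: source address,
 * destination address, protocol number and TCP length.  A device doing
 * checksum offload adds the TCP header and payload to this seed. */
static uint16_t pseudo_hdr_sum(uint32_t saddr, uint32_t daddr,
                               uint8_t proto, uint16_t tcp_len)
{
        uint32_t sum = 0;

        sum += saddr >> 16;
        sum += saddr & 0xffff;
        sum += daddr >> 16;
        sum += daddr & 0xffff;
        sum += proto;
        sum += tcp_len;
        while (sum >> 16)
                sum = (sum & 0xffff) + (sum >> 16);
        return (uint16_t)sum;
}

int main(void)
{
        /* 192.168.0.1 -> 192.168.0.99, protocol 6 (TCP), 1480 bytes */
        uint16_t seed = pseudo_hdr_sum(0xc0a80001, 0xc0a80063, 6, 1480);

        printf("pseudo-header sum seed: 0x%04x\n", seed);
        return 0;
}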
113181 diff -Nur linux-2.6.16.33-noxen/net/ipv4/tcp_output.c linux-2.6.16.33/net/ipv4/tcp_output.c
113182 --- linux-2.6.16.33-noxen/net/ipv4/tcp_output.c 2006-11-22 18:06:31.000000000 +0000
113183 +++ linux-2.6.16.33/net/ipv4/tcp_output.c 2007-05-23 21:00:01.000000000 +0000
113184 @@ -497,15 +497,17 @@
113185 /* Avoid the costly divide in the normal
113186 * non-TSO case.
113187 */
113188 - skb_shinfo(skb)->tso_segs = 1;
113189 - skb_shinfo(skb)->tso_size = 0;
113190 + skb_shinfo(skb)->gso_segs = 1;
113191 + skb_shinfo(skb)->gso_size = 0;
113192 + skb_shinfo(skb)->gso_type = 0;
113193 } else {
113194 unsigned int factor;
113195
113196 factor = skb->len + (mss_now - 1);
113197 factor /= mss_now;
113198 - skb_shinfo(skb)->tso_segs = factor;
113199 - skb_shinfo(skb)->tso_size = mss_now;
113200 + skb_shinfo(skb)->gso_segs = factor;
113201 + skb_shinfo(skb)->gso_size = mss_now;
113202 + skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
113203 }
113204 }
113205
113206 @@ -850,7 +852,7 @@
113207
113208 if (!tso_segs ||
113209 (tso_segs > 1 &&
113210 - skb_shinfo(skb)->tso_size != mss_now)) {
113211 + tcp_skb_mss(skb) != mss_now)) {
113212 tcp_set_skb_tso_segs(sk, skb, mss_now);
113213 tso_segs = tcp_skb_pcount(skb);
113214 }
113215 @@ -1510,8 +1512,9 @@
113216 tp->snd_una == (TCP_SKB_CB(skb)->end_seq - 1)) {
113217 if (!pskb_trim(skb, 0)) {
113218 TCP_SKB_CB(skb)->seq = TCP_SKB_CB(skb)->end_seq - 1;
113219 - skb_shinfo(skb)->tso_segs = 1;
113220 - skb_shinfo(skb)->tso_size = 0;
113221 + skb_shinfo(skb)->gso_segs = 1;
113222 + skb_shinfo(skb)->gso_size = 0;
113223 + skb_shinfo(skb)->gso_type = 0;
113224 skb->ip_summed = CHECKSUM_NONE;
113225 skb->csum = 0;
113226 }
113227 @@ -1716,8 +1719,9 @@
113228 skb->csum = 0;
113229 TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_FIN);
113230 TCP_SKB_CB(skb)->sacked = 0;
113231 - skb_shinfo(skb)->tso_segs = 1;
113232 - skb_shinfo(skb)->tso_size = 0;
113233 + skb_shinfo(skb)->gso_segs = 1;
113234 + skb_shinfo(skb)->gso_size = 0;
113235 + skb_shinfo(skb)->gso_type = 0;
113236
113237 /* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */
113238 TCP_SKB_CB(skb)->seq = tp->write_seq;
113239 @@ -1749,8 +1753,9 @@
113240 skb->csum = 0;
113241 TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_RST);
113242 TCP_SKB_CB(skb)->sacked = 0;
113243 - skb_shinfo(skb)->tso_segs = 1;
113244 - skb_shinfo(skb)->tso_size = 0;
113245 + skb_shinfo(skb)->gso_segs = 1;
113246 + skb_shinfo(skb)->gso_size = 0;
113247 + skb_shinfo(skb)->gso_type = 0;
113248
113249 /* Send it off. */
113250 TCP_SKB_CB(skb)->seq = tcp_acceptable_seq(sk, tp);
113251 @@ -1833,8 +1838,9 @@
113252 TCP_SKB_CB(skb)->seq = tcp_rsk(req)->snt_isn;
113253 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1;
113254 TCP_SKB_CB(skb)->sacked = 0;
113255 - skb_shinfo(skb)->tso_segs = 1;
113256 - skb_shinfo(skb)->tso_size = 0;
113257 + skb_shinfo(skb)->gso_segs = 1;
113258 + skb_shinfo(skb)->gso_size = 0;
113259 + skb_shinfo(skb)->gso_type = 0;
113260 th->seq = htonl(TCP_SKB_CB(skb)->seq);
113261 th->ack_seq = htonl(tcp_rsk(req)->rcv_isn + 1);
113262 if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */
113263 @@ -1937,8 +1943,9 @@
113264 TCP_SKB_CB(buff)->flags = TCPCB_FLAG_SYN;
113265 TCP_ECN_send_syn(sk, tp, buff);
113266 TCP_SKB_CB(buff)->sacked = 0;
113267 - skb_shinfo(buff)->tso_segs = 1;
113268 - skb_shinfo(buff)->tso_size = 0;
113269 + skb_shinfo(buff)->gso_segs = 1;
113270 + skb_shinfo(buff)->gso_size = 0;
113271 + skb_shinfo(buff)->gso_type = 0;
113272 buff->csum = 0;
113273 TCP_SKB_CB(buff)->seq = tp->write_seq++;
113274 TCP_SKB_CB(buff)->end_seq = tp->write_seq;
113275 @@ -2042,8 +2049,9 @@
113276 buff->csum = 0;
113277 TCP_SKB_CB(buff)->flags = TCPCB_FLAG_ACK;
113278 TCP_SKB_CB(buff)->sacked = 0;
113279 - skb_shinfo(buff)->tso_segs = 1;
113280 - skb_shinfo(buff)->tso_size = 0;
113281 + skb_shinfo(buff)->gso_segs = 1;
113282 + skb_shinfo(buff)->gso_size = 0;
113283 + skb_shinfo(buff)->gso_type = 0;
113284
113285 /* Send it off, this clears delayed acks for us. */
113286 TCP_SKB_CB(buff)->seq = TCP_SKB_CB(buff)->end_seq = tcp_acceptable_seq(sk, tp);
113287 @@ -2078,8 +2086,9 @@
113288 skb->csum = 0;
113289 TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;
113290 TCP_SKB_CB(skb)->sacked = urgent;
113291 - skb_shinfo(skb)->tso_segs = 1;
113292 - skb_shinfo(skb)->tso_size = 0;
113293 + skb_shinfo(skb)->gso_segs = 1;
113294 + skb_shinfo(skb)->gso_size = 0;
113295 + skb_shinfo(skb)->gso_type = 0;
113296
113297 /* Use a previous sequence. This should cause the other
113298 * end to send an ack. Don't queue or clone SKB, just
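The tcp_output.c changes keep tcp_set_skb_tso_segs() division-free on the fast path: packets no larger than one MSS get gso_segs = 1 and gso_size = 0, everything else gets a rounded-up len/mss. The same computation in isolation:

#include <stdio.h>

/* gso_segs as computed above: 1 for small packets, ceil(len/mss) otherwise.
 * The early return mirrors the "avoid the costly divide" branch. */
static unsigned int gso_segs(unsigned int len, unsigned int mss)
{
        if (len <= mss)
                return 1;
        return (len + mss - 1) / mss;
}

int main(void)
{
        printf("%u %u %u\n", gso_segs(512, 1460),
               gso_segs(1460, 1460), gso_segs(4000, 1460));
        return 0;
}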
113299 diff -Nur linux-2.6.16.33-noxen/net/ipv4/xfrm4_output.c linux-2.6.16.33/net/ipv4/xfrm4_output.c
113300 --- linux-2.6.16.33-noxen/net/ipv4/xfrm4_output.c 2006-11-22 18:06:31.000000000 +0000
113301 +++ linux-2.6.16.33/net/ipv4/xfrm4_output.c 2007-05-23 21:00:01.000000000 +0000
113302 @@ -9,6 +9,8 @@
113303 */
113304
113305 #include <linux/compiler.h>
113306 +#include <linux/if_ether.h>
113307 +#include <linux/kernel.h>
113308 #include <linux/skbuff.h>
113309 #include <linux/spinlock.h>
113310 #include <linux/netfilter_ipv4.h>
113311 @@ -17,6 +19,8 @@
113312 #include <net/xfrm.h>
113313 #include <net/icmp.h>
113314
113315 +extern int skb_checksum_setup(struct sk_buff *skb);
113316 +
113317 /* Add encapsulation header.
113318 *
113319 * In transport mode, the IP header will be moved forward to make space
113320 @@ -103,6 +107,10 @@
113321 struct xfrm_state *x = dst->xfrm;
113322 int err;
113323
113324 + err = skb_checksum_setup(skb);
113325 + if (err)
113326 + goto error_nolock;
113327 +
113328 if (skb->ip_summed == CHECKSUM_HW) {
113329 err = skb_checksum_help(skb, 0);
113330 if (err)
113331 @@ -152,16 +160,10 @@
113332 goto out_exit;
113333 }
113334
113335 -static int xfrm4_output_finish(struct sk_buff *skb)
113336 +static int xfrm4_output_finish2(struct sk_buff *skb)
113337 {
113338 int err;
113339
113340 -#ifdef CONFIG_NETFILTER
113341 - if (!skb->dst->xfrm) {
113342 - IPCB(skb)->flags |= IPSKB_REROUTED;
113343 - return dst_output(skb);
113344 - }
113345 -#endif
113346 while (likely((err = xfrm4_output_one(skb)) == 0)) {
113347 nf_reset(skb);
113348
113349 @@ -174,7 +176,7 @@
113350 return dst_output(skb);
113351
113352 err = nf_hook(PF_INET, NF_IP_POST_ROUTING, &skb, NULL,
113353 - skb->dst->dev, xfrm4_output_finish);
113354 + skb->dst->dev, xfrm4_output_finish2);
113355 if (unlikely(err != 1))
113356 break;
113357 }
113358 @@ -182,6 +184,48 @@
113359 return err;
113360 }
113361
113362 +static int xfrm4_output_finish(struct sk_buff *skb)
113363 +{
113364 + struct sk_buff *segs;
113365 +
113366 +#ifdef CONFIG_NETFILTER
113367 + if (!skb->dst->xfrm) {
113368 + IPCB(skb)->flags |= IPSKB_REROUTED;
113369 + return dst_output(skb);
113370 + }
113371 +#endif
113372 +
113373 + if (!skb_is_gso(skb))
113374 + return xfrm4_output_finish2(skb);
113375 +
113376 + skb->protocol = htons(ETH_P_IP);
113377 + segs = skb_gso_segment(skb, 0);
113378 + kfree_skb(skb);
113379 + if (unlikely(IS_ERR(segs)))
113380 + return PTR_ERR(segs);
113381 +
113382 + do {
113383 + struct sk_buff *nskb = segs->next;
113384 + int err;
113385 +
113386 + segs->next = NULL;
113387 + err = xfrm4_output_finish2(segs);
113388 +
113389 + if (unlikely(err)) {
113390 + while ((segs = nskb)) {
113391 + nskb = segs->next;
113392 + segs->next = NULL;
113393 + kfree_skb(segs);
113394 + }
113395 + return err;
113396 + }
113397 +
113398 + segs = nskb;
113399 + } while (segs);
113400 +
113401 + return 0;
113402 +}
113403 +
113404 int xfrm4_output(struct sk_buff *skb)
113405 {
113406 return NF_HOOK_COND(PF_INET, NF_IP_POST_ROUTING, skb, NULL, skb->dst->dev,
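The new xfrm4_output_finish() above is the software fallback for GSO packets that reach the IPsec output path: segment the packet, push each segment through the old finish routine one at a time, and on error free whatever segments are still pending. A generic sketch of that "send a chain, clean up the rest on failure" pattern; the types and the send function are invented for illustration.

#include <stdlib.h>
#include <stdio.h>

struct pkt {
        int id;
        struct pkt *next;
};

static int send_one(struct pkt *p)
{
        printf("sent pkt %d\n", p->id);
        return p->id == 3 ? -1 : 0;      /* pretend packet 3 fails */
}

/* Send a singly linked chain.  Each element is detached before it is
 * handed on; if one send fails, the untouched remainder is freed,
 * as in the hunk above. */
static int send_chain(struct pkt *segs)
{
        while (segs) {
                struct pkt *next = segs->next;
                int err;

                segs->next = NULL;
                err = send_one(segs);
                free(segs);

                if (err) {
                        while ((segs = next)) {
                                next = segs->next;
                                free(segs);
                        }
                        return err;
                }
                segs = next;
        }
        return 0;
}

int main(void)
{
        struct pkt *head = NULL, **tail = &head;
        int i;

        for (i = 1; i <= 5; i++) {
                struct pkt *p = malloc(sizeof(*p));

                if (!p)
                        abort();
                p->id = i;
                p->next = NULL;
                *tail = p;
                tail = &p->next;
        }
        return send_chain(head) ? 1 : 0;
}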
113407 diff -Nur linux-2.6.16.33-noxen/net/ipv6/addrconf.c linux-2.6.16.33/net/ipv6/addrconf.c
113408 --- linux-2.6.16.33-noxen/net/ipv6/addrconf.c 2006-11-22 18:06:31.000000000 +0000
113409 +++ linux-2.6.16.33/net/ipv6/addrconf.c 2007-05-23 21:00:01.000000000 +0000
113410 @@ -2471,6 +2471,7 @@
113411 spin_lock_bh(&ifp->lock);
113412
113413 if (dev->flags&(IFF_NOARP|IFF_LOOPBACK) ||
113414 + !(dev->flags&IFF_MULTICAST) ||
113415 !(ifp->flags&IFA_F_TENTATIVE)) {
113416 ifp->flags &= ~IFA_F_TENTATIVE;
113417 spin_unlock_bh(&ifp->lock);
113418 @@ -2555,6 +2556,7 @@
113419 if (ifp->idev->cnf.forwarding == 0 &&
113420 ifp->idev->cnf.rtr_solicits > 0 &&
113421 (dev->flags&IFF_LOOPBACK) == 0 &&
113422 + (dev->flags & IFF_MULTICAST) &&
113423 (ipv6_addr_type(&ifp->addr) & IPV6_ADDR_LINKLOCAL)) {
113424 struct in6_addr all_routers;
113425
113426 diff -Nur linux-2.6.16.33-noxen/net/ipv6/ip6_output.c linux-2.6.16.33/net/ipv6/ip6_output.c
113427 --- linux-2.6.16.33-noxen/net/ipv6/ip6_output.c 2006-11-22 18:06:31.000000000 +0000
113428 +++ linux-2.6.16.33/net/ipv6/ip6_output.c 2007-05-23 21:00:01.000000000 +0000
113429 @@ -147,7 +147,7 @@
113430
113431 int ip6_output(struct sk_buff *skb)
113432 {
113433 - if ((skb->len > dst_mtu(skb->dst) && !skb_shinfo(skb)->ufo_size) ||
113434 + if ((skb->len > dst_mtu(skb->dst) && !skb_is_gso(skb)) ||
113435 dst_allfrag(skb->dst))
113436 return ip6_fragment(skb, ip6_output2);
113437 else
113438 @@ -829,8 +829,9 @@
113439 struct frag_hdr fhdr;
113440
113441 /* specify the length of each IP datagram fragment*/
113442 - skb_shinfo(skb)->ufo_size = (mtu - fragheaderlen) -
113443 - sizeof(struct frag_hdr);
113444 + skb_shinfo(skb)->gso_size = mtu - fragheaderlen -
113445 + sizeof(struct frag_hdr);
113446 + skb_shinfo(skb)->gso_type = SKB_GSO_UDPV4;
113447 ipv6_select_ident(skb, &fhdr);
113448 skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
113449 __skb_queue_tail(&sk->sk_write_queue, skb);
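The ip6_output.c change mirrors the IPv4 one but also leaves room for the fragment header, hence gso_size = mtu - fragheaderlen - sizeof(struct frag_hdr), while reusing the SKB_GSO_UDPV4 type as the IPv4 path does. Sketch of the arithmetic with example numbers (not taken from the patch), including the general IPv6 rule that non-final fragment payloads must be multiples of 8 bytes:

#include <stdio.h>

int main(void)
{
        int mtu = 1500;
        int fragheaderlen = 40;   /* assume a plain IPv6 header, no extensions */
        int fraghdrlen = 8;       /* sizeof(struct frag_hdr) */
        int datalen = 65000;

        /* Per-fragment payload budget as stored in gso_size above. */
        int gso_size = mtu - fragheaderlen - fraghdrlen;

        /* Non-final IPv6 fragments must carry a multiple of 8 bytes. */
        int per_frag = gso_size & ~7;
        int nfrags = (datalen + per_frag - 1) / per_frag;

        printf("gso_size=%d per_frag=%d fragments=%d\n",
               gso_size, per_frag, nfrags);
        return 0;
}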
113450 diff -Nur linux-2.6.16.33-noxen/net/ipv6/ipcomp6.c linux-2.6.16.33/net/ipv6/ipcomp6.c
113451 --- linux-2.6.16.33-noxen/net/ipv6/ipcomp6.c 2006-11-22 18:06:31.000000000 +0000
113452 +++ linux-2.6.16.33/net/ipv6/ipcomp6.c 2007-05-23 21:00:01.000000000 +0000
113453 @@ -64,7 +64,7 @@
113454
113455 static int ipcomp6_input(struct xfrm_state *x, struct xfrm_decap_state *decap, struct sk_buff *skb)
113456 {
113457 - int err = 0;
113458 + int err = -ENOMEM;
113459 u8 nexthdr = 0;
113460 int hdr_len = skb->h.raw - skb->nh.raw;
113461 unsigned char *tmp_hdr = NULL;
113462 @@ -75,11 +75,8 @@
113463 struct crypto_tfm *tfm;
113464 int cpu;
113465
113466 - if ((skb_is_nonlinear(skb) || skb_cloned(skb)) &&
113467 - skb_linearize(skb, GFP_ATOMIC) != 0) {
113468 - err = -ENOMEM;
113469 + if (skb_linearize_cow(skb))
113470 goto out;
113471 - }
113472
113473 skb->ip_summed = CHECKSUM_NONE;
113474
113475 @@ -158,10 +155,8 @@
113476 goto out_ok;
113477 }
113478
113479 - if ((skb_is_nonlinear(skb) || skb_cloned(skb)) &&
113480 - skb_linearize(skb, GFP_ATOMIC) != 0) {
113481 + if (skb_linearize_cow(skb))
113482 goto out_ok;
113483 - }
113484
113485 /* compression */
113486 plen = skb->len - hdr_len;
113487 diff -Nur linux-2.6.16.33-noxen/net/ipv6/xfrm6_output.c linux-2.6.16.33/net/ipv6/xfrm6_output.c
113488 --- linux-2.6.16.33-noxen/net/ipv6/xfrm6_output.c 2006-11-22 18:06:31.000000000 +0000
113489 +++ linux-2.6.16.33/net/ipv6/xfrm6_output.c 2007-05-23 21:00:01.000000000 +0000
113490 @@ -151,7 +151,7 @@
113491 goto out_exit;
113492 }
113493
113494 -static int xfrm6_output_finish(struct sk_buff *skb)
113495 +static int xfrm6_output_finish2(struct sk_buff *skb)
113496 {
113497 int err;
113498
113499 @@ -167,7 +167,7 @@
113500 return dst_output(skb);
113501
113502 err = nf_hook(PF_INET6, NF_IP6_POST_ROUTING, &skb, NULL,
113503 - skb->dst->dev, xfrm6_output_finish);
113504 + skb->dst->dev, xfrm6_output_finish2);
113505 if (unlikely(err != 1))
113506 break;
113507 }
113508 @@ -175,6 +175,41 @@
113509 return err;
113510 }
113511
113512 +static int xfrm6_output_finish(struct sk_buff *skb)
113513 +{
113514 + struct sk_buff *segs;
113515 +
113516 + if (!skb_is_gso(skb))
113517 + return xfrm6_output_finish2(skb);
113518 +
113519 + skb->protocol = htons(ETH_P_IP);
113520 + segs = skb_gso_segment(skb, 0);
113521 + kfree_skb(skb);
113522 + if (unlikely(IS_ERR(segs)))
113523 + return PTR_ERR(segs);
113524 +
113525 + do {
113526 + struct sk_buff *nskb = segs->next;
113527 + int err;
113528 +
113529 + segs->next = NULL;
113530 + err = xfrm6_output_finish2(segs);
113531 +
113532 + if (unlikely(err)) {
113533 + while ((segs = nskb)) {
113534 + nskb = segs->next;
113535 + segs->next = NULL;
113536 + kfree_skb(segs);
113537 + }
113538 + return err;
113539 + }
113540 +
113541 + segs = nskb;
113542 + } while (segs);
113543 +
113544 + return 0;
113545 +}
113546 +
113547 int xfrm6_output(struct sk_buff *skb)
113548 {
113549 return NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, skb, NULL, skb->dst->dev,
113550 diff -Nur linux-2.6.16.33-noxen/net/sched/sch_generic.c linux-2.6.16.33/net/sched/sch_generic.c
113551 --- linux-2.6.16.33-noxen/net/sched/sch_generic.c 2006-11-22 18:06:31.000000000 +0000
113552 +++ linux-2.6.16.33/net/sched/sch_generic.c 2007-05-23 21:00:01.000000000 +0000
113553 @@ -72,9 +72,9 @@
113554 dev->queue_lock serializes queue accesses for this device
113555 AND dev->qdisc pointer itself.
113556
113557 - dev->xmit_lock serializes accesses to device driver.
113558 + netif_tx_lock serializes accesses to device driver.
113559
113560 - dev->queue_lock and dev->xmit_lock are mutually exclusive,
113561 + dev->queue_lock and netif_tx_lock are mutually exclusive,
113562 if one is grabbed, another must be free.
113563 */
113564
113565 @@ -90,14 +90,17 @@
113566 NOTE: Called under dev->queue_lock with locally disabled BH.
113567 */
113568
113569 -int qdisc_restart(struct net_device *dev)
113570 +static inline int qdisc_restart(struct net_device *dev)
113571 {
113572 struct Qdisc *q = dev->qdisc;
113573 struct sk_buff *skb;
113574
113575 /* Dequeue packet */
113576 - if ((skb = q->dequeue(q)) != NULL) {
113577 + if (((skb = dev->gso_skb)) || ((skb = q->dequeue(q)))) {
113578 unsigned nolock = (dev->features & NETIF_F_LLTX);
113579 +
113580 + dev->gso_skb = NULL;
113581 +
113582 /*
113583 * When the driver has LLTX set it does its own locking
113584 * in start_xmit. No need to add additional overhead by
113585 @@ -108,7 +111,7 @@
113586 * will be requeued.
113587 */
113588 if (!nolock) {
113589 - if (!spin_trylock(&dev->xmit_lock)) {
113590 + if (!netif_tx_trylock(dev)) {
113591 collision:
113592 /* So, someone grabbed the driver. */
113593
113594 @@ -126,8 +129,6 @@
113595 __get_cpu_var(netdev_rx_stat).cpu_collision++;
113596 goto requeue;
113597 }
113598 - /* Remember that the driver is grabbed by us. */
113599 - dev->xmit_lock_owner = smp_processor_id();
113600 }
113601
113602 {
113603 @@ -136,14 +137,11 @@
113604
113605 if (!netif_queue_stopped(dev)) {
113606 int ret;
113607 - if (netdev_nit)
113608 - dev_queue_xmit_nit(skb, dev);
113609
113610 - ret = dev->hard_start_xmit(skb, dev);
113611 + ret = dev_hard_start_xmit(skb, dev);
113612 if (ret == NETDEV_TX_OK) {
113613 if (!nolock) {
113614 - dev->xmit_lock_owner = -1;
113615 - spin_unlock(&dev->xmit_lock);
113616 + netif_tx_unlock(dev);
113617 }
113618 spin_lock(&dev->queue_lock);
113619 return -1;
113620 @@ -157,8 +155,7 @@
113621 /* NETDEV_TX_BUSY - we need to requeue */
113622 /* Release the driver */
113623 if (!nolock) {
113624 - dev->xmit_lock_owner = -1;
113625 - spin_unlock(&dev->xmit_lock);
113626 + netif_tx_unlock(dev);
113627 }
113628 spin_lock(&dev->queue_lock);
113629 q = dev->qdisc;
113630 @@ -175,7 +172,10 @@
113631 */
113632
113633 requeue:
113634 - q->ops->requeue(skb, q);
113635 + if (skb->next)
113636 + dev->gso_skb = skb;
113637 + else
113638 + q->ops->requeue(skb, q);
113639 netif_schedule(dev);
113640 return 1;
113641 }
113642 @@ -183,11 +183,23 @@
113643 return q->q.qlen;
113644 }
113645
113646 +void __qdisc_run(struct net_device *dev)
113647 +{
113648 + if (unlikely(dev->qdisc == &noop_qdisc))
113649 + goto out;
113650 +
113651 + while (qdisc_restart(dev) < 0 && !netif_queue_stopped(dev))
113652 + /* NOTHING */;
113653 +
113654 +out:
113655 + clear_bit(__LINK_STATE_QDISC_RUNNING, &dev->state);
113656 +}
113657 +
113658 static void dev_watchdog(unsigned long arg)
113659 {
113660 struct net_device *dev = (struct net_device *)arg;
113661
113662 - spin_lock(&dev->xmit_lock);
113663 + netif_tx_lock(dev);
113664 if (dev->qdisc != &noop_qdisc) {
113665 if (netif_device_present(dev) &&
113666 netif_running(dev) &&
113667 @@ -201,7 +213,7 @@
113668 dev_hold(dev);
113669 }
113670 }
113671 - spin_unlock(&dev->xmit_lock);
113672 + netif_tx_unlock(dev);
113673
113674 dev_put(dev);
113675 }
113676 @@ -225,17 +237,17 @@
113677
113678 static void dev_watchdog_up(struct net_device *dev)
113679 {
113680 - spin_lock_bh(&dev->xmit_lock);
113681 + netif_tx_lock_bh(dev);
113682 __netdev_watchdog_up(dev);
113683 - spin_unlock_bh(&dev->xmit_lock);
113684 + netif_tx_unlock_bh(dev);
113685 }
113686
113687 static void dev_watchdog_down(struct net_device *dev)
113688 {
113689 - spin_lock_bh(&dev->xmit_lock);
113690 + netif_tx_lock_bh(dev);
113691 if (del_timer(&dev->watchdog_timer))
113692 __dev_put(dev);
113693 - spin_unlock_bh(&dev->xmit_lock);
113694 + netif_tx_unlock_bh(dev);
113695 }
113696
113697 void netif_carrier_on(struct net_device *dev)
113698 @@ -577,10 +589,17 @@
113699
113700 dev_watchdog_down(dev);
113701
113702 - while (test_bit(__LINK_STATE_SCHED, &dev->state))
113703 + /* Wait for outstanding dev_queue_xmit calls. */
113704 + synchronize_rcu();
113705 +
113706 + /* Wait for outstanding qdisc_run calls. */
113707 + while (test_bit(__LINK_STATE_QDISC_RUNNING, &dev->state))
113708 yield();
113709
113710 - spin_unlock_wait(&dev->xmit_lock);
113711 + if (dev->gso_skb) {
113712 + kfree_skb(dev->gso_skb);
113713 + dev->gso_skb = NULL;
113714 + }
113715 }
113716
113717 void dev_init_scheduler(struct net_device *dev)
113718 @@ -622,6 +641,5 @@
113719 EXPORT_SYMBOL(qdisc_alloc);
113720 EXPORT_SYMBOL(qdisc_destroy);
113721 EXPORT_SYMBOL(qdisc_reset);
113722 -EXPORT_SYMBOL(qdisc_restart);
113723 EXPORT_SYMBOL(qdisc_lock_tree);
113724 EXPORT_SYMBOL(qdisc_unlock_tree);
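
Two related changes in this file are worth noting together: __qdisc_run drains the queue in a loop and then clears __LINK_STATE_QDISC_RUNNING, so at most one CPU at a time executes the transmit path for a device, and dev_deactivate now waits for that bit to drop (and frees any stashed gso_skb) instead of spinning on the old xmit_lock. A compact userspace sketch of the same single-runner idea, using a C11 atomic where the kernel uses a bit in dev->state (all names here are invented):

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <sched.h>
    #include <stdio.h>

    static _Atomic bool running = false;    /* models the QDISC_RUNNING bit */
    static _Atomic int backlog = 3;

    /* stand-in for qdisc_restart(): send one packet, report if more remain */
    static bool restart_once(void)
    {
            int left = atomic_fetch_sub(&backlog, 1) - 1;

            printf("transmitted one packet, %d left\n", left);
            return left > 0;
    }

    /* only one caller drains the queue at a time */
    static void run(void)
    {
            if (atomic_exchange(&running, true))
                    return;                 /* someone else is already running */

            while (atomic_load(&backlog) > 0 && restart_once())
                    ;

            atomic_store(&running, false);
    }

    /* teardown simply waits until the runner has finished */
    static void deactivate(void)
    {
            while (atomic_load(&running))
                    sched_yield();
    }

    int main(void)
    {
            run();
            deactivate();
            return 0;
    }
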
113725 diff -Nur linux-2.6.16.33-noxen/net/sched/sch_teql.c linux-2.6.16.33/net/sched/sch_teql.c
113726 --- linux-2.6.16.33-noxen/net/sched/sch_teql.c 2006-11-22 18:06:31.000000000 +0000
113727 +++ linux-2.6.16.33/net/sched/sch_teql.c 2007-05-23 21:00:01.000000000 +0000
113728 @@ -302,20 +302,17 @@
113729
113730 switch (teql_resolve(skb, skb_res, slave)) {
113731 case 0:
113732 - if (spin_trylock(&slave->xmit_lock)) {
113733 - slave->xmit_lock_owner = smp_processor_id();
113734 + if (netif_tx_trylock(slave)) {
113735 if (!netif_queue_stopped(slave) &&
113736 slave->hard_start_xmit(skb, slave) == 0) {
113737 - slave->xmit_lock_owner = -1;
113738 - spin_unlock(&slave->xmit_lock);
113739 + netif_tx_unlock(slave);
113740 master->slaves = NEXT_SLAVE(q);
113741 netif_wake_queue(dev);
113742 master->stats.tx_packets++;
113743 master->stats.tx_bytes += len;
113744 return 0;
113745 }
113746 - slave->xmit_lock_owner = -1;
113747 - spin_unlock(&slave->xmit_lock);
113748 + netif_tx_unlock(slave);
113749 }
113750 if (netif_queue_stopped(dev))
113751 busy = 1;
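
sch_teql.c gets the same conversion as qdisc_restart: it only ever tries the transmit lock, and when another CPU already holds it the slave is treated as busy and the packet is requeued rather than spun on. A tiny sketch of that trylock-or-requeue pattern with a pthread mutex (try_xmit is an invented name):

    #include <pthread.h>
    #include <stdbool.h>
    #include <stdio.h>

    static pthread_mutex_t tx_lock = PTHREAD_MUTEX_INITIALIZER;

    static bool try_xmit(int pkt)
    {
            if (pthread_mutex_trylock(&tx_lock) != 0)
                    return false;           /* driver busy: caller requeues */

            printf("xmit packet %d\n", pkt);
            pthread_mutex_unlock(&tx_lock);
            return true;
    }

    int main(void)
    {
            if (!try_xmit(1))
                    printf("busy, requeue packet 1\n");
            return 0;
    }
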
113752 diff -Nur linux-2.6.16.33-noxen/scripts/Makefile.xen linux-2.6.16.33/scripts/Makefile.xen
113753 --- linux-2.6.16.33-noxen/scripts/Makefile.xen 1970-01-01 00:00:00.000000000 +0000
113754 +++ linux-2.6.16.33/scripts/Makefile.xen 2007-01-08 15:00:46.000000000 +0000
113755 @@ -0,0 +1,14 @@
113756 +
113757 +# cherrypickxen($1 = allobj)
113758 +cherrypickxen = $(foreach var, $(1), \
113759 + $(shell o=$(var); \
113760 + c=$${o%.o}-xen.c; \
113761 + s=$${o%.o}-xen.S; \
113762 + oxen=$${o%.o}-xen.o; \
113763 + [ -f $(srctree)/$(src)/$${c} ] || \
113764 + [ -f $(srctree)/$(src)/$${s} ] \
113765 + && echo $$oxen \
113766 + || echo $(var) ) \
113767 + )
113768 +# filterxen($1 = allobj, $2 = noobjs)
113769 +filterxen = $(filter-out $(2), $(1))