From: kernel.org
Subject: 2.6.25
Patch-mainline: 2.6.25

Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>

Automatically created from "patches.kernel.org/patch-2.6.25" by xen-port-patches.py

9--- sle11-2009-06-29.orig/arch/x86/Kconfig 2009-02-16 16:18:36.000000000 +0100
10+++ sle11-2009-06-29/arch/x86/Kconfig 2009-03-16 16:33:40.000000000 +0100
11@@ -27,7 +27,7 @@ config X86
12 select HAVE_KRETPROBES
13 select HAVE_DYNAMIC_FTRACE
14 select HAVE_FTRACE
15- select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64)
16+ select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64) && !XEN
17 select HAVE_ARCH_KGDB if !X86_VOYAGER
18 select HAVE_ARCH_TRACEHOOK
19 select HAVE_GENERIC_DMA_COHERENT if X86_32
20@@ -211,14 +211,12 @@ config X86_TRAMPOLINE
21 default y
22
23 config X86_NO_TSS
24- bool
25+ def_bool y
26 depends on XEN
27- default y
28
29 config X86_NO_IDT
30- bool
31+ def_bool y
32 depends on XEN
33- default y
34
35 config KTIME_SCALAR
36 def_bool X86_32
37@@ -728,9 +726,8 @@ config X86_VISWS_APIC
38 depends on X86_32 && X86_VISWS
39
40 config X86_XEN_GENAPIC
41- bool
42+ def_bool y
43 depends on X86_64_XEN
44- default y
45
46 config X86_MCE
47 bool "Machine Check Exception"
48@@ -1117,7 +1114,7 @@ config ARCH_DISCONTIGMEM_DEFAULT
49
50 config ARCH_SPARSEMEM_DEFAULT
51 def_bool y
52- depends on X86_64
53+ depends on X86_64 && !X86_64_XEN
54
55 config ARCH_SPARSEMEM_ENABLE
56 def_bool y
57@@ -1747,10 +1744,10 @@ config PCI_MMCONFIG
58 depends on X86_64 && PCI && ACPI
59
60 config XEN_PCIDEV_FRONTEND
61- bool "Xen PCI Frontend" if X86_64
62+ def_bool y
63+ prompt "Xen PCI Frontend" if X86_64
64 depends on PCI && XEN && (PCI_GOXEN_FE || PCI_GOANY || X86_64)
65 select HOTPLUG
66- default y
67 help
68 The PCI device frontend driver allows the kernel to import arbitrary
69 PCI devices from a PCI backend to support PCI driver domains.
70@@ -1758,7 +1755,6 @@ config XEN_PCIDEV_FRONTEND
71 config XEN_PCIDEV_FE_DEBUG
72 bool "Xen PCI Frontend Debugging"
73 depends on XEN_PCIDEV_FRONTEND
74- default n
75 help
76 Enables some debug statements within the PCI Frontend.
77
78--- sle11-2009-06-29.orig/arch/x86/Kconfig.debug 2009-02-02 09:40:56.000000000 +0100
79+++ sle11-2009-06-29/arch/x86/Kconfig.debug 2009-03-16 16:33:40.000000000 +0100
80@@ -279,6 +279,7 @@ config DEBUG_BOOT_PARAMS
81 bool "Debug boot parameters"
82 depends on DEBUG_KERNEL
83 depends on DEBUG_FS
84+ depends on !XEN
85 help
86 This option will cause struct boot_params to be exported via debugfs.
87
88--- sle11-2009-06-29.orig/arch/x86/ia32/ia32entry-xen.S 2009-02-16 16:18:36.000000000 +0100
89+++ sle11-2009-06-29/arch/x86/ia32/ia32entry-xen.S 2009-03-16 16:33:40.000000000 +0100
90@@ -12,7 +12,6 @@
91 #include <asm/ia32_unistd.h>
92 #include <asm/thread_info.h>
93 #include <asm/segment.h>
94-#include <asm/vsyscall32.h>
95 #include <asm/irqflags.h>
96 #include <linux/linkage.h>
97
98@@ -99,10 +98,11 @@ ENTRY(ia32_sysenter_target)
99 CFI_RESTORE rcx
100 movl %ebp,%ebp /* zero extension */
101 movl %eax,%eax
102+ movl 48-THREAD_SIZE+threadinfo_sysenter_return(%rsp),%r10d
103 movl $__USER32_DS,40(%rsp)
104 movq %rbp,32(%rsp)
105 movl $__USER32_CS,16(%rsp)
106- movl $VSYSCALL32_SYSEXIT,8(%rsp)
107+ movq %r10,8(%rsp)
108 movq %rax,(%rsp)
109 cld
110 SAVE_ARGS 0,0,1
111@@ -582,8 +582,8 @@ ia32_sys_call_table:
112 .quad compat_sys_futex /* 240 */
113 .quad compat_sys_sched_setaffinity
114 .quad compat_sys_sched_getaffinity
115- .quad sys32_set_thread_area
116- .quad sys32_get_thread_area
117+ .quad sys_set_thread_area
118+ .quad sys_get_thread_area
119 .quad compat_sys_io_setup /* 245 */
120 .quad sys_io_destroy
121 .quad compat_sys_io_getevents
122@@ -661,7 +661,9 @@ ia32_sys_call_table:
123 .quad sys_epoll_pwait
124 .quad compat_sys_utimensat /* 320 */
125 .quad compat_sys_signalfd
126- .quad compat_sys_timerfd
127+ .quad sys_timerfd_create
128 .quad sys_eventfd
129 .quad sys32_fallocate
130+ .quad compat_sys_timerfd_settime /* 325 */
131+ .quad compat_sys_timerfd_gettime
132 ia32_syscall_end:
133--- sle11-2009-06-29.orig/arch/x86/kernel/Makefile 2009-02-16 16:18:36.000000000 +0100
134+++ sle11-2009-06-29/arch/x86/kernel/Makefile 2009-03-16 16:33:40.000000000 +0100
135@@ -120,11 +120,10 @@ ifeq ($(CONFIG_X86_64),y)
136
137 obj-$(CONFIG_PCI_MMCONFIG) += mmconf-fam10h_64.o
138
139+ obj-$(CONFIG_XEN) += nmi_64.o
140 time_64-$(CONFIG_XEN) += time_32.o
141 pci-dma_64-$(CONFIG_XEN) += pci-dma_32.o
142 endif
143
144 disabled-obj-$(CONFIG_XEN) := early-quirks.o hpet.o i8253.o i8259_$(BITS).o reboot.o \
145 smpboot_$(BITS).o tsc_$(BITS).o tsc_sync.o
146-disabled-obj-$(CONFIG_XEN_UNPRIVILEGED_GUEST) += mpparse_64.o
147-%/head_64.o %/head_64.s: asflags-$(CONFIG_XEN) :=
148--- sle11-2009-06-29.orig/arch/x86/kernel/acpi/boot.c 2008-12-01 11:11:08.000000000 +0100
149+++ sle11-2009-06-29/arch/x86/kernel/acpi/boot.c 2009-03-16 16:33:40.000000000 +0100
150@@ -133,6 +133,9 @@ char *__init __acpi_map_table(unsigned l
151 #ifndef CONFIG_XEN
152 if (phys+size <= (max_low_pfn_mapped << PAGE_SHIFT))
153 return __va(phys);
154+#else
155+ if (phys + size <= (NR_FIX_ISAMAPS << PAGE_SHIFT))
156+ return isa_bus_to_virt(phys);
157 #endif
158
159 offset = phys & (PAGE_SIZE - 1);
160--- /dev/null 1970-01-01 00:00:00.000000000 +0000
161+++ sle11-2009-06-29/arch/x86/kernel/acpi/sleep-xen.c 2009-03-16 16:33:40.000000000 +0100
162@@ -0,0 +1,95 @@
163+/*
164+ * sleep.c - x86-specific ACPI sleep support.
165+ *
166+ * Copyright (C) 2001-2003 Patrick Mochel
167+ * Copyright (C) 2001-2003 Pavel Machek <pavel@suse.cz>
168+ */
169+
170+#include <linux/acpi.h>
171+#include <linux/bootmem.h>
172+#include <linux/dmi.h>
173+#include <linux/cpumask.h>
174+
175+#include <asm/smp.h>
176+
177+#ifndef CONFIG_ACPI_PV_SLEEP
178+/* address in low memory of the wakeup routine. */
179+unsigned long acpi_wakeup_address = 0;
180+unsigned long acpi_realmode_flags;
181+extern char wakeup_start, wakeup_end;
182+
183+extern unsigned long acpi_copy_wakeup_routine(unsigned long);
184+#endif
185+
186+/**
187+ * acpi_save_state_mem - save kernel state
188+ *
189+ * Create an identity mapped page table and copy the wakeup routine to
190+ * low memory.
191+ */
192+int acpi_save_state_mem(void)
193+{
194+#ifndef CONFIG_ACPI_PV_SLEEP
195+ if (!acpi_wakeup_address) {
196+ printk(KERN_ERR "Could not allocate memory during boot, S3 disabled\n");
197+ return -ENOMEM;
198+ }
199+ memcpy((void *)acpi_wakeup_address, &wakeup_start,
200+ &wakeup_end - &wakeup_start);
201+ acpi_copy_wakeup_routine(acpi_wakeup_address);
202+#endif
203+
204+ return 0;
205+}
206+
207+/*
208+ * acpi_restore_state - undo effects of acpi_save_state_mem
209+ */
210+void acpi_restore_state_mem(void)
211+{
212+}
213+
214+
215+/**
216+ * acpi_reserve_bootmem - do _very_ early ACPI initialisation
217+ *
218+ * We allocate a page from the first 1MB of memory for the wakeup
219+ * routine for when we come back from a sleep state. The
220+ * runtime allocator allows specification of <16MB pages, but not
221+ * <1MB pages.
222+ */
223+void __init acpi_reserve_bootmem(void)
224+{
225+#ifndef CONFIG_ACPI_PV_SLEEP
226+ if ((&wakeup_end - &wakeup_start) > PAGE_SIZE*2) {
227+ printk(KERN_ERR
228+ "ACPI: Wakeup code way too big, S3 disabled.\n");
229+ return;
230+ }
231+
232+ acpi_wakeup_address = (unsigned long)alloc_bootmem_low(PAGE_SIZE*2);
233+ if (!acpi_wakeup_address)
234+ printk(KERN_ERR "ACPI: Cannot allocate lowmem, S3 disabled.\n");
235+#endif
236+}
237+
238+
239+#ifndef CONFIG_ACPI_PV_SLEEP
240+static int __init acpi_sleep_setup(char *str)
241+{
242+ while ((str != NULL) && (*str != '\0')) {
243+ if (strncmp(str, "s3_bios", 7) == 0)
244+ acpi_realmode_flags |= 1;
245+ if (strncmp(str, "s3_mode", 7) == 0)
246+ acpi_realmode_flags |= 2;
247+ if (strncmp(str, "s3_beep", 7) == 0)
248+ acpi_realmode_flags |= 4;
249+ str = strchr(str, ',');
250+ if (str != NULL)
251+ str += strspn(str, ", \t");
252+ }
253+ return 1;
254+}
255+
256+__setup("acpi_sleep=", acpi_sleep_setup);
257+#endif /* CONFIG_ACPI_PV_SLEEP */
258--- sle11-2009-06-29.orig/arch/x86/kernel/acpi/sleep_32-xen.c 2009-02-16 16:18:36.000000000 +0100
259+++ /dev/null 1970-01-01 00:00:00.000000000 +0000
260@@ -1,117 +0,0 @@
261-/*
262- * sleep.c - x86-specific ACPI sleep support.
263- *
264- * Copyright (C) 2001-2003 Patrick Mochel
265- * Copyright (C) 2001-2003 Pavel Machek <pavel@suse.cz>
266- */
267-
268-#include <linux/acpi.h>
269-#include <linux/bootmem.h>
270-#include <linux/dmi.h>
271-#include <linux/cpumask.h>
272-
273-#include <asm/smp.h>
274-
275-#ifndef CONFIG_ACPI_PV_SLEEP
276-/* address in low memory of the wakeup routine. */
277-unsigned long acpi_wakeup_address = 0;
278-unsigned long acpi_realmode_flags;
279-extern char wakeup_start, wakeup_end;
280-
281-extern unsigned long FASTCALL(acpi_copy_wakeup_routine(unsigned long));
282-#endif
283-
284-/**
285- * acpi_save_state_mem - save kernel state
286- *
287- * Create an identity mapped page table and copy the wakeup routine to
288- * low memory.
289- */
290-int acpi_save_state_mem(void)
291-{
292-#ifndef CONFIG_ACPI_PV_SLEEP
293- if (!acpi_wakeup_address)
294- return 1;
295- memcpy((void *)acpi_wakeup_address, &wakeup_start,
296- &wakeup_end - &wakeup_start);
297- acpi_copy_wakeup_routine(acpi_wakeup_address);
298-#endif
299- return 0;
300-}
301-
302-/*
303- * acpi_restore_state - undo effects of acpi_save_state_mem
304- */
305-void acpi_restore_state_mem(void)
306-{
307-}
308-
309-/**
310- * acpi_reserve_bootmem - do _very_ early ACPI initialisation
311- *
312- * We allocate a page from the first 1MB of memory for the wakeup
313- * routine for when we come back from a sleep state. The
314- * runtime allocator allows specification of <16MB pages, but not
315- * <1MB pages.
316- */
317-void __init acpi_reserve_bootmem(void)
318-{
319-#ifndef CONFIG_ACPI_PV_SLEEP
320- if ((&wakeup_end - &wakeup_start) > PAGE_SIZE) {
321- printk(KERN_ERR
322- "ACPI: Wakeup code way too big, S3 disabled.\n");
323- return;
324- }
325-
326- acpi_wakeup_address = (unsigned long)alloc_bootmem_low(PAGE_SIZE);
327- if (!acpi_wakeup_address)
328- printk(KERN_ERR "ACPI: Cannot allocate lowmem, S3 disabled.\n");
329-#endif
330-}
331-
332-#ifndef CONFIG_ACPI_PV_SLEEP
333-static int __init acpi_sleep_setup(char *str)
334-{
335- while ((str != NULL) && (*str != '\0')) {
336- if (strncmp(str, "s3_bios", 7) == 0)
337- acpi_realmode_flags |= 1;
338- if (strncmp(str, "s3_mode", 7) == 0)
339- acpi_realmode_flags |= 2;
340- if (strncmp(str, "s3_beep", 7) == 0)
341- acpi_realmode_flags |= 4;
342- str = strchr(str, ',');
343- if (str != NULL)
344- str += strspn(str, ", \t");
345- }
346- return 1;
347-}
348-
349-__setup("acpi_sleep=", acpi_sleep_setup);
350-
351-/* Ouch, we want to delete this. We already have better version in userspace, in
352- s2ram from suspend.sf.net project */
353-static __init int reset_videomode_after_s3(const struct dmi_system_id *d)
354-{
355- acpi_realmode_flags |= 2;
356- return 0;
357-}
358-
359-static __initdata struct dmi_system_id acpisleep_dmi_table[] = {
360- { /* Reset video mode after returning from ACPI S3 sleep */
361- .callback = reset_videomode_after_s3,
362- .ident = "Toshiba Satellite 4030cdt",
363- .matches = {
364- DMI_MATCH(DMI_PRODUCT_NAME, "S4030CDT/4.3"),
365- },
366- },
367- {}
368-};
369-
370-static int __init acpisleep_dmi_init(void)
371-{
372- dmi_check_system(acpisleep_dmi_table);
373- return 0;
374-}
375-
376-core_initcall(acpisleep_dmi_init);
377-#endif /* CONFIG_ACPI_PV_SLEEP */
378--- sle11-2009-06-29.orig/arch/x86/kernel/acpi/sleep_64-xen.c 2009-02-16 16:18:36.000000000 +0100
379+++ /dev/null 1970-01-01 00:00:00.000000000 +0000
380@@ -1,125 +0,0 @@
381-/*
382- * acpi.c - Architecture-Specific Low-Level ACPI Support
383- *
384- * Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com>
385- * Copyright (C) 2001 Jun Nakajima <jun.nakajima@intel.com>
386- * Copyright (C) 2001 Patrick Mochel <mochel@osdl.org>
387- * Copyright (C) 2002 Andi Kleen, SuSE Labs (x86-64 port)
388- * Copyright (C) 2003 Pavel Machek, SuSE Labs
389- *
390- * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
391- *
392- * This program is free software; you can redistribute it and/or modify
393- * it under the terms of the GNU General Public License as published by
394- * the Free Software Foundation; either version 2 of the License, or
395- * (at your option) any later version.
396- *
397- * This program is distributed in the hope that it will be useful,
398- * but WITHOUT ANY WARRANTY; without even the implied warranty of
399- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
400- * GNU General Public License for more details.
401- *
402- * You should have received a copy of the GNU General Public License
403- * along with this program; if not, write to the Free Software
404- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
405- *
406- * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
407- */
408-
409-#include <linux/kernel.h>
410-#include <linux/init.h>
411-#include <linux/types.h>
412-#include <linux/stddef.h>
413-#include <linux/slab.h>
414-#include <linux/pci.h>
415-#include <linux/bootmem.h>
416-#include <linux/acpi.h>
417-#include <linux/cpumask.h>
418-
419-#include <asm/mpspec.h>
420-#include <asm/io.h>
421-#include <asm/apic.h>
422-#include <asm/apicdef.h>
423-#include <asm/page.h>
424-#include <asm/pgtable.h>
425-#include <asm/pgalloc.h>
426-#include <asm/io_apic.h>
427-#include <asm/proto.h>
428-#include <asm/tlbflush.h>
429-
430-/* --------------------------------------------------------------------------
431- Low-Level Sleep Support
432- -------------------------------------------------------------------------- */
433-
434-#ifndef CONFIG_ACPI_PV_SLEEP
435-/* address in low memory of the wakeup routine. */
436-unsigned long acpi_wakeup_address = 0;
437-unsigned long acpi_realmode_flags;
438-extern char wakeup_start, wakeup_end;
439-
440-extern unsigned long acpi_copy_wakeup_routine(unsigned long);
441-#endif
442-
443-/**
444- * acpi_save_state_mem - save kernel state
445- *
446- * Create an identity mapped page table and copy the wakeup routine to
447- * low memory.
448- */
449-int acpi_save_state_mem(void)
450-{
451-#ifndef CONFIG_ACPI_PV_SLEEP
452- memcpy((void *)acpi_wakeup_address, &wakeup_start,
453- &wakeup_end - &wakeup_start);
454- acpi_copy_wakeup_routine(acpi_wakeup_address);
455-#endif
456- return 0;
457-}
458-
459-/*
460- * acpi_restore_state
461- */
462-void acpi_restore_state_mem(void)
463-{
464-}
465-
466-/**
467- * acpi_reserve_bootmem - do _very_ early ACPI initialisation
468- *
469- * We allocate a page in low memory for the wakeup
470- * routine for when we come back from a sleep state. The
471- * runtime allocator allows specification of <16M pages, but not
472- * <1M pages.
473- */
474-void __init acpi_reserve_bootmem(void)
475-{
476-#ifndef CONFIG_ACPI_PV_SLEEP
477- acpi_wakeup_address = (unsigned long)alloc_bootmem_low(PAGE_SIZE*2);
478- if ((&wakeup_end - &wakeup_start) > (PAGE_SIZE*2))
479- printk(KERN_CRIT
480- "ACPI: Wakeup code way too big, will crash on attempt"
481- " to suspend\n");
482-#endif
483-}
484-
485-#ifndef CONFIG_ACPI_PV_SLEEP
486-static int __init acpi_sleep_setup(char *str)
487-{
488- while ((str != NULL) && (*str != '\0')) {
489- if (strncmp(str, "s3_bios", 7) == 0)
490- acpi_realmode_flags |= 1;
491- if (strncmp(str, "s3_mode", 7) == 0)
492- acpi_realmode_flags |= 2;
493- if (strncmp(str, "s3_beep", 7) == 0)
494- acpi_realmode_flags |= 4;
495- str = strchr(str, ',');
496- if (str != NULL)
497- str += strspn(str, ", \t");
498- }
499-
500- return 1;
501-}
502-
503-__setup("acpi_sleep=", acpi_sleep_setup);
504-#endif /* CONFIG_ACPI_PV_SLEEP */
505-
506--- sle11-2009-06-29.orig/arch/x86/kernel/apic_32-xen.c 2008-12-15 11:27:22.000000000 +0100
507+++ sle11-2009-06-29/arch/x86/kernel/apic_32-xen.c 2009-03-16 16:33:40.000000000 +0100
508@@ -86,7 +86,7 @@ int setup_profiling_timer(unsigned int m
509 * This initializes the IO-APIC and APIC hardware if this is
510 * a UP kernel.
511 */
512-int __init APIC_init_uniprocessor (void)
513+int __init APIC_init_uniprocessor(void)
514 {
515 #ifdef CONFIG_X86_IO_APIC
516 if (smp_found_config)
517--- sle11-2009-06-29.orig/arch/x86/kernel/apic_64-xen.c 2009-02-16 16:18:36.000000000 +0100
518+++ sle11-2009-06-29/arch/x86/kernel/apic_64-xen.c 2009-03-16 16:33:40.000000000 +0100
519@@ -34,34 +34,17 @@
520 #include <asm/hpet.h>
521 #include <asm/idle.h>
522
523-int apic_verbosity;
524+int disable_apic;
525
526 /*
527- * 'what should we do if we get a hw irq event on an illegal vector'.
528- * each architecture has to answer this themselves.
529+ * Debug level, exported for io_apic.c
530 */
531-void ack_bad_irq(unsigned int irq)
532-{
533- printk("unexpected IRQ trap at irq %02x\n", irq);
534- /*
535- * Currently unexpected vectors happen only on SMP and APIC.
536- * We _must_ ack these because every local APIC has only N
537- * irq slots per priority level, and a 'hanging, unacked' IRQ
538- * holds up an irq slot - in excessive cases (when multiple
539- * unexpected vectors occur) that might lock up the APIC
540- * completely.
541- * But don't ack when the APIC is disabled. -AK
542- */
543- if (!disable_apic)
544- ack_APIC_irq();
545-}
546-
547-int setup_profiling_timer(unsigned int multiplier)
548-{
549- return -EINVAL;
550-}
551+int apic_verbosity;
552
553-void smp_local_timer_interrupt(void)
554+/*
555+ * The guts of the apic timer interrupt
556+ */
557+static void local_apic_timer_interrupt(void)
558 {
559 #ifndef CONFIG_XEN
560 int cpu = smp_processor_id();
561@@ -121,11 +104,34 @@ void smp_apic_timer_interrupt(struct pt_
562 */
563 exit_idle();
564 irq_enter();
565- smp_local_timer_interrupt();
566+ local_apic_timer_interrupt();
567 irq_exit();
568 set_irq_regs(old_regs);
569 }
570
571+int setup_profiling_timer(unsigned int multiplier)
572+{
573+ return -EINVAL;
574+}
575+
576+/*
577+ * This initializes the IO-APIC and APIC hardware if this is
578+ * a UP kernel.
579+ */
580+int __init APIC_init_uniprocessor(void)
581+{
582+#ifdef CONFIG_X86_IO_APIC
583+ if (smp_found_config && !skip_ioapic_setup && nr_ioapics)
584+ setup_IO_APIC();
585+#endif
586+
587+ return 1;
588+}
589+
590+/*
591+ * Local APIC interrupts
592+ */
593+
594 /*
595 * This interrupt should _never_ happen with our APIC/SMP architecture
596 */
597@@ -150,7 +156,6 @@ asmlinkage void smp_spurious_interrupt(v
598 /*
599 * This interrupt should never happen with our APIC/SMP architecture
600 */
601-
602 asmlinkage void smp_error_interrupt(void)
603 {
604 unsigned int v, v1;
605@@ -178,19 +183,3 @@ asmlinkage void smp_error_interrupt(void
606 smp_processor_id(), v , v1);
607 irq_exit();
608 }
609-
610-int disable_apic;
611-
612-/*
613- * This initializes the IO-APIC and APIC hardware if this is
614- * a UP kernel.
615- */
616-int __init APIC_init_uniprocessor (void)
617-{
618-#ifdef CONFIG_X86_IO_APIC
619- if (smp_found_config && !skip_ioapic_setup && nr_ioapics)
620- setup_IO_APIC();
621-#endif
622-
623- return 1;
624-}
625--- sle11-2009-06-29.orig/arch/x86/kernel/asm-offsets_32.c 2009-02-16 16:17:21.000000000 +0100
626+++ sle11-2009-06-29/arch/x86/kernel/asm-offsets_32.c 2009-03-16 16:33:40.000000000 +0100
627@@ -23,8 +23,10 @@
628 #include <xen/interface/xen.h>
629 #endif
630
631+#ifdef CONFIG_LGUEST_GUEST
632 #include <linux/lguest.h>
633 #include "../../../drivers/lguest/lg.h"
634+#endif
635
636 /* workaround for a warning with -Wmissing-prototypes */
637 void foo(void);
638--- sle11-2009-06-29.orig/arch/x86/kernel/cpu/common-xen.c 2009-02-16 16:18:36.000000000 +0100
639+++ sle11-2009-06-29/arch/x86/kernel/cpu/common-xen.c 2009-03-16 16:33:40.000000000 +0100
640@@ -27,45 +27,50 @@
641 #include "cpu.h"
642
643 DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = {
644- [GDT_ENTRY_KERNEL_CS] = { 0x0000ffff, 0x00cf9a00 },
645- [GDT_ENTRY_KERNEL_DS] = { 0x0000ffff, 0x00cf9200 },
646- [GDT_ENTRY_DEFAULT_USER_CS] = { 0x0000ffff, 0x00cffa00 },
647- [GDT_ENTRY_DEFAULT_USER_DS] = { 0x0000ffff, 0x00cff200 },
648+ [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00cf9a00 } } },
649+ [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9200 } } },
650+ [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00cffa00 } } },
651+ [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff200 } } },
652 #ifndef CONFIG_XEN
653 /*
654 * Segments used for calling PnP BIOS have byte granularity.
655 * They code segments and data segments have fixed 64k limits,
656 * the transfer segment sizes are set at run time.
657 */
658- [GDT_ENTRY_PNPBIOS_CS32] = { 0x0000ffff, 0x00409a00 },/* 32-bit code */
659- [GDT_ENTRY_PNPBIOS_CS16] = { 0x0000ffff, 0x00009a00 },/* 16-bit code */
660- [GDT_ENTRY_PNPBIOS_DS] = { 0x0000ffff, 0x00009200 }, /* 16-bit data */
661- [GDT_ENTRY_PNPBIOS_TS1] = { 0x00000000, 0x00009200 },/* 16-bit data */
662- [GDT_ENTRY_PNPBIOS_TS2] = { 0x00000000, 0x00009200 },/* 16-bit data */
663+ /* 32-bit code */
664+ [GDT_ENTRY_PNPBIOS_CS32] = { { { 0x0000ffff, 0x00409a00 } } },
665+ /* 16-bit code */
666+ [GDT_ENTRY_PNPBIOS_CS16] = { { { 0x0000ffff, 0x00009a00 } } },
667+ /* 16-bit data */
668+ [GDT_ENTRY_PNPBIOS_DS] = { { { 0x0000ffff, 0x00009200 } } },
669+ /* 16-bit data */
670+ [GDT_ENTRY_PNPBIOS_TS1] = { { { 0x00000000, 0x00009200 } } },
671+ /* 16-bit data */
672+ [GDT_ENTRY_PNPBIOS_TS2] = { { { 0x00000000, 0x00009200 } } },
673 /*
674 * The APM segments have byte granularity and their bases
675 * are set at run time. All have 64k limits.
676 */
677- [GDT_ENTRY_APMBIOS_BASE] = { 0x0000ffff, 0x00409a00 },/* 32-bit code */
678+ /* 32-bit code */
679+ [GDT_ENTRY_APMBIOS_BASE] = { { { 0x0000ffff, 0x00409a00 } } },
680 /* 16-bit code */
681- [GDT_ENTRY_APMBIOS_BASE+1] = { 0x0000ffff, 0x00009a00 },
682- [GDT_ENTRY_APMBIOS_BASE+2] = { 0x0000ffff, 0x00409200 }, /* data */
683+ [GDT_ENTRY_APMBIOS_BASE+1] = { { { 0x0000ffff, 0x00009a00 } } },
684+ /* data */
685+ [GDT_ENTRY_APMBIOS_BASE+2] = { { { 0x0000ffff, 0x00409200 } } },
686
687- [GDT_ENTRY_ESPFIX_SS] = { 0x00000000, 0x00c09200 },
688+ [GDT_ENTRY_ESPFIX_SS] = { { { 0x00000000, 0x00c09200 } } },
689 #endif
690- [GDT_ENTRY_PERCPU] = { 0x00000000, 0x00000000 },
691+ [GDT_ENTRY_PERCPU] = { { { 0x00000000, 0x00000000 } } },
692 } };
693 EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
694
695+__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata;
696+
697 static int cachesize_override __cpuinitdata = -1;
698-static int disable_x86_fxsr __cpuinitdata;
699 static int disable_x86_serial_nr __cpuinitdata = 1;
700-static int disable_x86_sep __cpuinitdata;
701
702 struct cpu_dev * cpu_devs[X86_VENDOR_NUM] = {};
703
704-extern int disable_pse;
705-
706 static void __cpuinit default_init(struct cpuinfo_x86 * c)
707 {
708 /* Not much we can do here... */
709@@ -214,16 +219,8 @@ static void __cpuinit get_cpu_vendor(str
710
711 static int __init x86_fxsr_setup(char * s)
712 {
713- /* Tell all the other CPUs to not use it... */
714- disable_x86_fxsr = 1;
715-
716- /*
717- * ... and clear the bits early in the boot_cpu_data
718- * so that the bootup process doesn't try to do this
719- * either.
720- */
721- clear_bit(X86_FEATURE_FXSR, boot_cpu_data.x86_capability);
722- clear_bit(X86_FEATURE_XMM, boot_cpu_data.x86_capability);
723+ setup_clear_cpu_cap(X86_FEATURE_FXSR);
724+ setup_clear_cpu_cap(X86_FEATURE_XMM);
725 return 1;
726 }
727 __setup("nofxsr", x86_fxsr_setup);
728@@ -231,7 +228,7 @@ __setup("nofxsr", x86_fxsr_setup);
729
730 static int __init x86_sep_setup(char * s)
731 {
732- disable_x86_sep = 1;
733+ setup_clear_cpu_cap(X86_FEATURE_SEP);
734 return 1;
735 }
736 __setup("nosep", x86_sep_setup);
737@@ -268,10 +265,10 @@ static int __cpuinit have_cpuid_p(void)
738 void __init cpu_detect(struct cpuinfo_x86 *c)
739 {
740 /* Get vendor name */
741- cpuid(0x00000000, &c->cpuid_level,
742- (int *)&c->x86_vendor_id[0],
743- (int *)&c->x86_vendor_id[8],
744- (int *)&c->x86_vendor_id[4]);
745+ cpuid(0x00000000, (unsigned int *)&c->cpuid_level,
746+ (unsigned int *)&c->x86_vendor_id[0],
747+ (unsigned int *)&c->x86_vendor_id[8],
748+ (unsigned int *)&c->x86_vendor_id[4]);
749
750 c->x86 = 4;
751 if (c->cpuid_level >= 0x00000001) {
752@@ -284,9 +281,38 @@ void __init cpu_detect(struct cpuinfo_x8
753 if (c->x86 >= 0x6)
754 c->x86_model += ((tfms >> 16) & 0xF) << 4;
755 c->x86_mask = tfms & 15;
756- if (cap0 & (1<<19))
757+ if (cap0 & (1<<19)) {
758 c->x86_cache_alignment = ((misc >> 8) & 0xff) * 8;
759+ c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
760+ }
761+ }
762+}
763+static void __cpuinit early_get_cap(struct cpuinfo_x86 *c)
764+{
765+ u32 tfms, xlvl;
766+ unsigned int ebx;
767+
768+ memset(&c->x86_capability, 0, sizeof c->x86_capability);
769+ if (have_cpuid_p()) {
770+ /* Intel-defined flags: level 0x00000001 */
771+ if (c->cpuid_level >= 0x00000001) {
772+ u32 capability, excap;
773+ cpuid(0x00000001, &tfms, &ebx, &excap, &capability);
774+ c->x86_capability[0] = capability;
775+ c->x86_capability[4] = excap;
776+ }
777+
778+ /* AMD-defined flags: level 0x80000001 */
779+ xlvl = cpuid_eax(0x80000000);
780+ if ((xlvl & 0xffff0000) == 0x80000000) {
781+ if (xlvl >= 0x80000001) {
782+ c->x86_capability[1] = cpuid_edx(0x80000001);
783+ c->x86_capability[6] = cpuid_ecx(0x80000001);
784+ }
785+ }
786+
787 }
788+
789 }
790
791 /* Do minimum CPU detection early.
792@@ -300,6 +326,7 @@ static void __init early_cpu_detect(void
793 struct cpuinfo_x86 *c = &boot_cpu_data;
794
795 c->x86_cache_alignment = 32;
796+ c->x86_clflush_size = 32;
797
798 if (!have_cpuid_p())
799 return;
800@@ -307,19 +334,30 @@ static void __init early_cpu_detect(void
801 cpu_detect(c);
802
803 get_cpu_vendor(c, 1);
804+
805+ switch (c->x86_vendor) {
806+ case X86_VENDOR_AMD:
807+ early_init_amd(c);
808+ break;
809+ case X86_VENDOR_INTEL:
810+ early_init_intel(c);
811+ break;
812+ }
813+
814+ early_get_cap(c);
815 }
816
817 static void __cpuinit generic_identify(struct cpuinfo_x86 * c)
818 {
819 u32 tfms, xlvl;
820- int ebx;
821+ unsigned int ebx;
822
823 if (have_cpuid_p()) {
824 /* Get vendor name */
825- cpuid(0x00000000, &c->cpuid_level,
826- (int *)&c->x86_vendor_id[0],
827- (int *)&c->x86_vendor_id[8],
828- (int *)&c->x86_vendor_id[4]);
829+ cpuid(0x00000000, (unsigned int *)&c->cpuid_level,
830+ (unsigned int *)&c->x86_vendor_id[0],
831+ (unsigned int *)&c->x86_vendor_id[8],
832+ (unsigned int *)&c->x86_vendor_id[4]);
833
834 get_cpu_vendor(c, 0);
835 /* Initialize the standard set of capabilities */
836@@ -364,8 +402,6 @@ static void __cpuinit generic_identify(s
837 init_scattered_cpuid_features(c);
838 }
839
840- early_intel_workaround(c);
841-
842 #ifdef CONFIG_X86_HT
843 c->phys_proc_id = (cpuid_ebx(1) >> 24) & 0xff;
844 #endif
845@@ -399,7 +435,7 @@ __setup("serialnumber", x86_serial_nr_se
846 /*
847 * This does the hard work of actually picking apart the CPU stuff...
848 */
849-static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
850+void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
851 {
852 int i;
853
854@@ -425,20 +461,9 @@ static void __cpuinit identify_cpu(struc
855
856 generic_identify(c);
857
858- printk(KERN_DEBUG "CPU: After generic identify, caps:");
859- for (i = 0; i < NCAPINTS; i++)
860- printk(" %08lx", c->x86_capability[i]);
861- printk("\n");
862-
863- if (this_cpu->c_identify) {
864+ if (this_cpu->c_identify)
865 this_cpu->c_identify(c);
866
867- printk(KERN_DEBUG "CPU: After vendor identify, caps:");
868- for (i = 0; i < NCAPINTS; i++)
869- printk(" %08lx", c->x86_capability[i]);
870- printk("\n");
871- }
872-
873 /*
874 * Vendor-specific initialization. In this section we
875 * canonicalize the feature flags, meaning if there are
876@@ -460,23 +485,6 @@ static void __cpuinit identify_cpu(struc
877 * we do "generic changes."
878 */
879
880- /* TSC disabled? */
881- if ( tsc_disable )
882- clear_bit(X86_FEATURE_TSC, c->x86_capability);
883-
884- /* FXSR disabled? */
885- if (disable_x86_fxsr) {
886- clear_bit(X86_FEATURE_FXSR, c->x86_capability);
887- clear_bit(X86_FEATURE_XMM, c->x86_capability);
888- }
889-
890- /* SEP disabled? */
891- if (disable_x86_sep)
892- clear_bit(X86_FEATURE_SEP, c->x86_capability);
893-
894- if (disable_pse)
895- clear_bit(X86_FEATURE_PSE, c->x86_capability);
896-
897 /* If the model name is still unset, do table lookup. */
898 if ( !c->x86_model_id[0] ) {
899 char *p;
900@@ -489,13 +497,6 @@ static void __cpuinit identify_cpu(struc
901 c->x86, c->x86_model);
902 }
903
904- /* Now the feature flags better reflect actual CPU features! */
905-
906- printk(KERN_DEBUG "CPU: After all inits, caps:");
907- for (i = 0; i < NCAPINTS; i++)
908- printk(" %08lx", c->x86_capability[i]);
909- printk("\n");
910-
911 /*
912 * On SMP, boot_cpu_data holds the common feature set between
913 * all CPUs; so make sure that we indicate which features are
914@@ -508,8 +509,14 @@ static void __cpuinit identify_cpu(struc
915 boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
916 }
917
918+ /* Clear all flags overriden by options */
919+ for (i = 0; i < NCAPINTS; i++)
920+ c->x86_capability[i] &= ~cleared_cpu_caps[i];
921+
922 /* Init Machine Check Exception if available. */
923 mcheck_init(c);
924+
925+ select_idle_routine(c);
926 }
927
928 void __init identify_boot_cpu(void)
929@@ -517,7 +524,6 @@ void __init identify_boot_cpu(void)
930 identify_cpu(&boot_cpu_data);
931 sysenter_setup();
932 enable_sep_cpu();
933- mtrr_bp_init();
934 }
935
936 void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
937@@ -574,6 +580,13 @@ void __cpuinit detect_ht(struct cpuinfo_
938 }
939 #endif
940
941+static __init int setup_noclflush(char *arg)
942+{
943+ setup_clear_cpu_cap(X86_FEATURE_CLFLSH);
944+ return 1;
945+}
946+__setup("noclflush", setup_noclflush);
947+
948 void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
949 {
950 char *vendor = NULL;
951@@ -597,6 +610,17 @@ void __cpuinit print_cpu_info(struct cpu
952 printk("\n");
953 }
954
955+static __init int setup_disablecpuid(char *arg)
956+{
957+ int bit;
958+ if (get_option(&arg, &bit) && bit < NCAPINTS*32)
959+ setup_clear_cpu_cap(bit);
960+ else
961+ return 0;
962+ return 1;
963+}
964+__setup("clearcpuid=", setup_disablecpuid);
965+
966 cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
967
968 /* This is hacky. :)
969@@ -606,16 +630,6 @@ cpumask_t cpu_initialized __cpuinitdata
970 * They will insert themselves into the cpu_devs structure.
971 * Then, when cpu_init() is called, we can just iterate over that array.
972 */
973-
974-extern int intel_cpu_init(void);
975-extern int cyrix_init_cpu(void);
976-extern int nsc_init_cpu(void);
977-extern int amd_init_cpu(void);
978-extern int centaur_init_cpu(void);
979-extern int transmeta_init_cpu(void);
980-extern int nexgen_init_cpu(void);
981-extern int umc_init_cpu(void);
982-
983 void __init early_cpu_init(void)
984 {
985 intel_cpu_init();
986@@ -627,21 +641,13 @@ void __init early_cpu_init(void)
987 nexgen_init_cpu();
988 umc_init_cpu();
989 early_cpu_detect();
990-
991-#ifdef CONFIG_DEBUG_PAGEALLOC
992- /* pse is not compatible with on-the-fly unmapping,
993- * disable it even if the cpus claim to support it.
994- */
995- clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability);
996- disable_pse = 1;
997-#endif
998 }
999
1000 /* Make sure %fs is initialized properly in idle threads */
1001-struct pt_regs * __devinit idle_regs(struct pt_regs *regs)
1002+struct pt_regs * __cpuinit idle_regs(struct pt_regs *regs)
1003 {
1004 memset(regs, 0, sizeof(struct pt_regs));
1005- regs->xfs = __KERNEL_PERCPU;
1006+ regs->fs = __KERNEL_PERCPU;
1007 return regs;
1008 }
1009
1010@@ -649,7 +655,7 @@ struct pt_regs * __devinit idle_regs(str
1011 * it's on the real one. */
1012 void switch_to_new_gdt(void)
1013 {
1014- struct Xgt_desc_struct gdt_descr;
1015+ struct desc_ptr gdt_descr;
1016 unsigned long va, frames[16];
1017 int f;
1018
1019@@ -692,12 +698,6 @@ void __cpuinit cpu_init(void)
1020
1021 if (cpu_has_vme || cpu_has_de)
1022 clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
1023- if (tsc_disable && cpu_has_tsc) {
1024- printk(KERN_NOTICE "Disabling TSC...\n");
1025- /**** FIX-HPA: DOES THIS REALLY BELONG HERE? ****/
1026- clear_bit(X86_FEATURE_TSC, boot_cpu_data.x86_capability);
1027- set_in_cr4(X86_CR4_TSD);
1028- }
1029
1030 switch_to_new_gdt();
1031
1032@@ -710,7 +710,7 @@ void __cpuinit cpu_init(void)
1033 BUG();
1034 enter_lazy_tlb(&init_mm, curr);
1035
1036- load_esp0(t, thread);
1037+ load_sp0(t, thread);
1038
1039 load_LDT(&init_mm.context);
1040
1041--- sle11-2009-06-29.orig/arch/x86/kernel/cpu/mtrr/main-xen.c 2009-02-16 16:17:21.000000000 +0100
1042+++ sle11-2009-06-29/arch/x86/kernel/cpu/mtrr/main-xen.c 2009-03-16 16:33:40.000000000 +0100
1043@@ -33,7 +33,7 @@ struct mtrr_ops generic_mtrr_ops = {
1044
1045 struct mtrr_ops *mtrr_if = &generic_mtrr_ops;
1046 unsigned int num_var_ranges;
1047-unsigned int *usage_table;
1048+unsigned int mtrr_usage_table[MAX_VAR_RANGES];
1049
1050 static void __init set_num_var_ranges(void)
1051 {
1052@@ -52,17 +52,12 @@ static void __init init_table(void)
1053 int i, max;
1054
1055 max = num_var_ranges;
1056- if ((usage_table = kmalloc(max * sizeof *usage_table, GFP_KERNEL))
1057- == NULL) {
1058- printk(KERN_ERR "mtrr: could not allocate\n");
1059- return;
1060- }
1061 for (i = 0; i < max; i++)
1062- usage_table[i] = 0;
1063+ mtrr_usage_table[i] = 0;
1064 }
1065
1066 int mtrr_add_page(unsigned long base, unsigned long size,
1067- unsigned int type, char increment)
1068+ unsigned int type, bool increment)
1069 {
1070 int error;
1071 struct xen_platform_op op;
1072@@ -81,7 +76,7 @@ int mtrr_add_page(unsigned long base, un
1073 }
1074
1075 if (increment)
1076- ++usage_table[op.u.add_memtype.reg];
1077+ ++mtrr_usage_table[op.u.add_memtype.reg];
1078
1079 mutex_unlock(&mtrr_mutex);
1080
1081@@ -103,7 +98,7 @@ static int mtrr_check(unsigned long base
1082
1083 int
1084 mtrr_add(unsigned long base, unsigned long size, unsigned int type,
1085- char increment)
1086+ bool increment)
1087 {
1088 if (mtrr_check(base, size))
1089 return -EINVAL;
1090@@ -136,11 +131,11 @@ int mtrr_del_page(int reg, unsigned long
1091 goto out;
1092 }
1093 }
1094- if (usage_table[reg] < 1) {
1095+ if (mtrr_usage_table[reg] < 1) {
1096 printk(KERN_WARNING "mtrr: reg: %d has count=0\n", reg);
1097 goto out;
1098 }
1099- if (--usage_table[reg] < 1) {
1100+ if (--mtrr_usage_table[reg] < 1) {
1101 op.cmd = XENPF_del_memtype;
1102 op.u.del_memtype.handle = 0;
1103 op.u.del_memtype.reg = reg;
1104--- sle11-2009-06-29.orig/arch/x86/kernel/e820_32-xen.c 2009-02-16 16:18:36.000000000 +0100
1105+++ sle11-2009-06-29/arch/x86/kernel/e820_32-xen.c 2009-03-16 16:33:40.000000000 +0100
1106@@ -7,7 +7,6 @@
1107 #include <linux/kexec.h>
1108 #include <linux/module.h>
1109 #include <linux/mm.h>
1110-#include <linux/efi.h>
1111 #include <linux/pfn.h>
1112 #include <linux/uaccess.h>
1113 #include <linux/suspend.h>
1114@@ -18,11 +17,6 @@
1115 #include <asm/setup.h>
1116 #include <xen/interface/memory.h>
1117
1118-#ifdef CONFIG_EFI
1119-int efi_enabled = 0;
1120-EXPORT_SYMBOL(efi_enabled);
1121-#endif
1122-
1123 struct e820map e820;
1124 struct change_member {
1125 struct e820entry *pbios; /* pointer to original bios entry */
1126@@ -38,26 +32,6 @@ unsigned long pci_mem_start = 0x10000000
1127 EXPORT_SYMBOL(pci_mem_start);
1128 #endif
1129 extern int user_defined_memmap;
1130-struct resource data_resource = {
1131- .name = "Kernel data",
1132- .start = 0,
1133- .end = 0,
1134- .flags = IORESOURCE_BUSY | IORESOURCE_MEM
1135-};
1136-
1137-struct resource code_resource = {
1138- .name = "Kernel code",
1139- .start = 0,
1140- .end = 0,
1141- .flags = IORESOURCE_BUSY | IORESOURCE_MEM
1142-};
1143-
1144-struct resource bss_resource = {
1145- .name = "Kernel bss",
1146- .start = 0,
1147- .end = 0,
1148- .flags = IORESOURCE_BUSY | IORESOURCE_MEM
1149-};
1150
1151 static struct resource system_rom_resource = {
1152 .name = "System ROM",
1153@@ -112,60 +86,6 @@ static struct resource video_rom_resourc
1154 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
1155 };
1156
1157-static struct resource video_ram_resource = {
1158- .name = "Video RAM area",
1159- .start = 0xa0000,
1160- .end = 0xbffff,
1161- .flags = IORESOURCE_BUSY | IORESOURCE_MEM
1162-};
1163-
1164-static struct resource standard_io_resources[] = { {
1165- .name = "dma1",
1166- .start = 0x0000,
1167- .end = 0x001f,
1168- .flags = IORESOURCE_BUSY | IORESOURCE_IO
1169-}, {
1170- .name = "pic1",
1171- .start = 0x0020,
1172- .end = 0x0021,
1173- .flags = IORESOURCE_BUSY | IORESOURCE_IO
1174-}, {
1175- .name = "timer0",
1176- .start = 0x0040,
1177- .end = 0x0043,
1178- .flags = IORESOURCE_BUSY | IORESOURCE_IO
1179-}, {
1180- .name = "timer1",
1181- .start = 0x0050,
1182- .end = 0x0053,
1183- .flags = IORESOURCE_BUSY | IORESOURCE_IO
1184-}, {
1185- .name = "keyboard",
1186- .start = 0x0060,
1187- .end = 0x006f,
1188- .flags = IORESOURCE_BUSY | IORESOURCE_IO
1189-}, {
1190- .name = "dma page reg",
1191- .start = 0x0080,
1192- .end = 0x008f,
1193- .flags = IORESOURCE_BUSY | IORESOURCE_IO
1194-}, {
1195- .name = "pic2",
1196- .start = 0x00a0,
1197- .end = 0x00a1,
1198- .flags = IORESOURCE_BUSY | IORESOURCE_IO
1199-}, {
1200- .name = "dma2",
1201- .start = 0x00c0,
1202- .end = 0x00df,
1203- .flags = IORESOURCE_BUSY | IORESOURCE_IO
1204-}, {
1205- .name = "fpu",
1206- .start = 0x00f0,
1207- .end = 0x00ff,
1208- .flags = IORESOURCE_BUSY | IORESOURCE_IO
1209-} };
1210-
1211 #define ROMSIGNATURE 0xaa55
1212
1213 static int __init romsignature(const unsigned char *rom)
1214@@ -272,10 +192,9 @@ static struct e820map machine_e820;
1215 * Request address space for all standard RAM and ROM resources
1216 * and also for regions reported as reserved by the e820.
1217 */
1218-static void __init
1219-legacy_init_iomem_resources(struct resource *code_resource,
1220- struct resource *data_resource,
1221- struct resource *bss_resource)
1222+void __init init_iomem_resources(struct resource *code_resource,
1223+ struct resource *data_resource,
1224+ struct resource *bss_resource)
1225 {
1226 int i;
1227
1228@@ -324,39 +243,6 @@ legacy_init_iomem_resources(struct resou
1229
1230 #undef e820
1231
1232-/*
1233- * Request address space for all standard resources
1234- *
1235- * This is called just before pcibios_init(), which is also a
1236- * subsys_initcall, but is linked in later (in arch/i386/pci/common.c).
1237- */
1238-static int __init request_standard_resources(void)
1239-{
1240- int i;
1241-
1242- /* Nothing to do if not running in dom0. */
1243- if (!is_initial_xendomain())
1244- return 0;
1245-
1246- printk("Setting up standard PCI resources\n");
1247- if (efi_enabled)
1248- efi_initialize_iomem_resources(&code_resource,
1249- &data_resource, &bss_resource);
1250- else
1251- legacy_init_iomem_resources(&code_resource,
1252- &data_resource, &bss_resource);
1253-
1254- /* EFI systems may still have VGA */
1255- request_resource(&iomem_resource, &video_ram_resource);
1256-
1257- /* request I/O space for devices used on all i[345]86 PCs */
1258- for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
1259- request_resource(&ioport_resource, &standard_io_resources[i]);
1260- return 0;
1261-}
1262-
1263-subsys_initcall(request_standard_resources);
1264-
1265 #if defined(CONFIG_PM) && defined(CONFIG_HIBERNATION)
1266 /**
1267 * e820_mark_nosave_regions - Find the ranges of physical addresses that do not
1268@@ -393,19 +279,17 @@ void __init add_memory_region(unsigned l
1269 {
1270 int x;
1271
1272- if (!efi_enabled) {
1273- x = e820.nr_map;
1274-
1275- if (x == E820MAX) {
1276- printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
1277- return;
1278- }
1279+ x = e820.nr_map;
1280
1281- e820.map[x].addr = start;
1282- e820.map[x].size = size;
1283- e820.map[x].type = type;
1284- e820.nr_map++;
1285+ if (x == E820MAX) {
1286+ printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
1287+ return;
1288 }
1289+
1290+ e820.map[x].addr = start;
1291+ e820.map[x].size = size;
1292+ e820.map[x].type = type;
1293+ e820.nr_map++;
1294 } /* add_memory_region */
1295
1296 /*
1297@@ -642,29 +526,6 @@ int __init copy_e820_map(struct e820entr
1298 }
1299
1300 /*
1301- * Callback for efi_memory_walk.
1302- */
1303-static int __init
1304-efi_find_max_pfn(unsigned long start, unsigned long end, void *arg)
1305-{
1306- unsigned long *max_pfn = arg, pfn;
1307-
1308- if (start < end) {
1309- pfn = PFN_UP(end -1);
1310- if (pfn > *max_pfn)
1311- *max_pfn = pfn;
1312- }
1313- return 0;
1314-}
1315-
1316-static int __init
1317-efi_memory_present_wrapper(unsigned long start, unsigned long end, void *arg)
1318-{
1319- memory_present(0, PFN_UP(start), PFN_DOWN(end));
1320- return 0;
1321-}
1322-
1323-/*
1324 * Find the highest page frame number we have available
1325 */
1326 void __init find_max_pfn(void)
1327@@ -672,11 +533,6 @@ void __init find_max_pfn(void)
1328 int i;
1329
1330 max_pfn = 0;
1331- if (efi_enabled) {
1332- efi_memmap_walk(efi_find_max_pfn, &max_pfn);
1333- efi_memmap_walk(efi_memory_present_wrapper, NULL);
1334- return;
1335- }
1336
1337 for (i = 0; i < e820.nr_map; i++) {
1338 unsigned long start, end;
1339@@ -694,34 +550,12 @@ void __init find_max_pfn(void)
1340 }
1341
1342 /*
1343- * Free all available memory for boot time allocation. Used
1344- * as a callback function by efi_memory_walk()
1345- */
1346-
1347-static int __init
1348-free_available_memory(unsigned long start, unsigned long end, void *arg)
1349-{
1350- /* check max_low_pfn */
1351- if (start >= (max_low_pfn << PAGE_SHIFT))
1352- return 0;
1353- if (end >= (max_low_pfn << PAGE_SHIFT))
1354- end = max_low_pfn << PAGE_SHIFT;
1355- if (start < end)
1356- free_bootmem(start, end - start);
1357-
1358- return 0;
1359-}
1360-/*
1361 * Register fully available low RAM pages with the bootmem allocator.
1362 */
1363 void __init register_bootmem_low_pages(unsigned long max_low_pfn)
1364 {
1365 int i;
1366
1367- if (efi_enabled) {
1368- efi_memmap_walk(free_available_memory, NULL);
1369- return;
1370- }
1371 for (i = 0; i < e820.nr_map; i++) {
1372 unsigned long curr_pfn, last_pfn, size;
1373 /*
1374@@ -855,56 +689,12 @@ void __init print_memory_map(char *who)
1375 }
1376 }
1377
1378-static __init __always_inline void efi_limit_regions(unsigned long long size)
1379-{
1380- unsigned long long current_addr = 0;
1381- efi_memory_desc_t *md, *next_md;
1382- void *p, *p1;
1383- int i, j;
1384-
1385- j = 0;
1386- p1 = memmap.map;
1387- for (p = p1, i = 0; p < memmap.map_end; p += memmap.desc_size, i++) {
1388- md = p;
1389- next_md = p1;
1390- current_addr = md->phys_addr +
1391- PFN_PHYS(md->num_pages);
1392- if (is_available_memory(md)) {
1393- if (md->phys_addr >= size) continue;
1394- memcpy(next_md, md, memmap.desc_size);
1395- if (current_addr >= size) {
1396- next_md->num_pages -=
1397- PFN_UP(current_addr-size);
1398- }
1399- p1 += memmap.desc_size;
1400- next_md = p1;
1401- j++;
1402- } else if ((md->attribute & EFI_MEMORY_RUNTIME) ==
1403- EFI_MEMORY_RUNTIME) {
1404- /* In order to make runtime services
1405- * available we have to include runtime
1406- * memory regions in memory map */
1407- memcpy(next_md, md, memmap.desc_size);
1408- p1 += memmap.desc_size;
1409- next_md = p1;
1410- j++;
1411- }
1412- }
1413- memmap.nr_map = j;
1414- memmap.map_end = memmap.map +
1415- (memmap.nr_map * memmap.desc_size);
1416-}
1417-
1418 void __init limit_regions(unsigned long long size)
1419 {
1420 unsigned long long current_addr = 0;
1421 int i;
1422
1423 print_memory_map("limit_regions start");
1424- if (efi_enabled) {
1425- efi_limit_regions(size);
1426- return;
1427- }
1428 for (i = 0; i < e820.nr_map; i++) {
1429 current_addr = e820.map[i].addr + e820.map[i].size;
1430 if (current_addr < size)
1431@@ -1056,3 +846,44 @@ static int __init parse_memmap(char *arg
1432 return 0;
1433 }
1434 early_param("memmap", parse_memmap);
1435+
1436+#ifndef CONFIG_XEN
1437+void __init update_memory_range(u64 start, u64 size, unsigned old_type,
1438+ unsigned new_type)
1439+{
1440+ int i;
1441+
1442+ BUG_ON(old_type == new_type);
1443+
1444+ for (i = 0; i < e820.nr_map; i++) {
1445+ struct e820entry *ei = &e820.map[i];
1446+ u64 final_start, final_end;
1447+ if (ei->type != old_type)
1448+ continue;
1449+ /* totally covered? */
1450+ if (ei->addr >= start && ei->size <= size) {
1451+ ei->type = new_type;
1452+ continue;
1453+ }
1454+ /* partially covered */
1455+ final_start = max(start, ei->addr);
1456+ final_end = min(start + size, ei->addr + ei->size);
1457+ if (final_start >= final_end)
1458+ continue;
1459+ add_memory_region(final_start, final_end - final_start,
1460+ new_type);
1461+ }
1462+}
1463+
1464+void __init update_e820(void)
1465+{
1466+ u8 nr_map;
1467+
1468+ nr_map = e820.nr_map;
1469+ if (sanitize_e820_map(e820.map, &nr_map))
1470+ return;
1471+ e820.nr_map = nr_map;
1472+ printk(KERN_INFO "modified physical RAM map:\n");
1473+ print_memory_map("modified");
1474+}
1475+#endif
1476--- sle11-2009-06-29.orig/arch/x86/kernel/e820_64-xen.c 2009-02-16 16:18:36.000000000 +0100
1477+++ sle11-2009-06-29/arch/x86/kernel/e820_64-xen.c 2009-03-16 16:33:40.000000000 +0100
1478@@ -1,4 +1,4 @@
1479-/*
1480+/*
1481 * Handle the memory map.
1482 * The functions here do the job until bootmem takes over.
1483 *
1484@@ -26,6 +26,7 @@
1485 #include <asm/proto.h>
1486 #include <asm/setup.h>
1487 #include <asm/sections.h>
1488+#include <asm/kdebug.h>
1489 #include <xen/interface/memory.h>
1490
1491 struct e820map e820 __initdata;
1492@@ -33,96 +34,103 @@ struct e820map e820 __initdata;
1493 struct e820map machine_e820;
1494 #endif
1495
1496-/*
1497+/*
1498 * PFN of last memory page.
1499 */
1500-unsigned long end_pfn;
1501-EXPORT_SYMBOL(end_pfn);
1502+unsigned long end_pfn;
1503
1504-/*
1505+/*
1506 * end_pfn only includes RAM, while end_pfn_map includes all e820 entries.
1507 * The direct mapping extends to end_pfn_map, so that we can directly access
1508 * apertures, ACPI and other tables without having to play with fixmaps.
1509- */
1510-unsigned long end_pfn_map;
1511+ */
1512+unsigned long end_pfn_map;
1513
1514-/*
1515+/*
1516 * Last pfn which the user wants to use.
1517 */
1518 static unsigned long __initdata end_user_pfn = MAXMEM>>PAGE_SHIFT;
1519
1520-extern struct resource code_resource, data_resource, bss_resource;
1521-
1522-/* Check for some hardcoded bad areas that early boot is not allowed to touch */
1523-static inline int bad_addr(unsigned long *addrp, unsigned long size)
1524-{
1525- unsigned long addr = *addrp, last = addr + size;
1526+/*
1527+ * Early reserved memory areas.
1528+ */
1529+#define MAX_EARLY_RES 20
1530
1531+struct early_res {
1532+ unsigned long start, end;
1533+ char name[16];
1534+};
1535+static struct early_res early_res[MAX_EARLY_RES] __initdata = {
1536 #ifndef CONFIG_XEN
1537- /* various gunk below that needed for SMP startup */
1538- if (addr < 0x8000) {
1539- *addrp = PAGE_ALIGN(0x8000);
1540- return 1;
1541- }
1542-
1543- /* direct mapping tables of the kernel */
1544- if (last >= table_start<<PAGE_SHIFT && addr < table_end<<PAGE_SHIFT) {
1545- *addrp = PAGE_ALIGN(table_end << PAGE_SHIFT);
1546- return 1;
1547- }
1548-
1549- /* initrd */
1550-#ifdef CONFIG_BLK_DEV_INITRD
1551- if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) {
1552- unsigned long ramdisk_image = boot_params.hdr.ramdisk_image;
1553- unsigned long ramdisk_size = boot_params.hdr.ramdisk_size;
1554- unsigned long ramdisk_end = ramdisk_image+ramdisk_size;
1555-
1556- if (last >= ramdisk_image && addr < ramdisk_end) {
1557- *addrp = PAGE_ALIGN(ramdisk_end);
1558- return 1;
1559- }
1560- }
1561+ { 0, PAGE_SIZE, "BIOS data page" }, /* BIOS data page */
1562+#ifdef CONFIG_SMP
1563+ { SMP_TRAMPOLINE_BASE, SMP_TRAMPOLINE_BASE + 2*PAGE_SIZE, "SMP_TRAMPOLINE" },
1564 #endif
1565- /* kernel code */
1566- if (last >= __pa_symbol(&_text) && addr < __pa_symbol(&_end)) {
1567- *addrp = PAGE_ALIGN(__pa_symbol(&_end));
1568- return 1;
1569- }
1570+#endif
1571+ {}
1572+};
1573
1574- if (last >= ebda_addr && addr < ebda_addr + ebda_size) {
1575- *addrp = PAGE_ALIGN(ebda_addr + ebda_size);
1576- return 1;
1577+void __init reserve_early(unsigned long start, unsigned long end, char *name)
1578+{
1579+ int i;
1580+ struct early_res *r;
1581+ for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
1582+ r = &early_res[i];
1583+ if (end > r->start && start < r->end)
1584+ panic("Overlapping early reservations %lx-%lx %s to %lx-%lx %s\n",
1585+ start, end - 1, name?name:"", r->start, r->end - 1, r->name);
1586 }
1587+ if (i >= MAX_EARLY_RES)
1588+ panic("Too many early reservations");
1589+ r = &early_res[i];
1590+ r->start = start;
1591+ r->end = end;
1592+ if (name)
1593+ strncpy(r->name, name, sizeof(r->name) - 1);
1594+}
1595
1596-#ifdef CONFIG_NUMA
1597- /* NUMA memory to node map */
1598- if (last >= nodemap_addr && addr < nodemap_addr + nodemap_size) {
1599- *addrp = nodemap_addr + nodemap_size;
1600- return 1;
1601+void __init early_res_to_bootmem(void)
1602+{
1603+ int i;
1604+ for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
1605+ struct early_res *r = &early_res[i];
1606+ printk(KERN_INFO "early res: %d [%lx-%lx] %s\n", i,
1607+ r->start, r->end - 1, r->name);
1608+ reserve_bootmem_generic(r->start, r->end - r->start);
1609 }
1610-#endif
1611- /* XXX ramdisk image here? */
1612-#else
1613- if (last < (table_end<<PAGE_SHIFT)) {
1614- *addrp = table_end << PAGE_SHIFT;
1615- return 1;
1616+}
1617+
1618+/* Check for already reserved areas */
1619+static inline int bad_addr(unsigned long *addrp, unsigned long size)
1620+{
1621+ int i;
1622+ unsigned long addr = *addrp, last;
1623+ int changed = 0;
1624+again:
1625+ last = addr + size;
1626+ for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
1627+ struct early_res *r = &early_res[i];
1628+ if (last >= r->start && addr < r->end) {
1629+ *addrp = addr = r->end;
1630+ changed = 1;
1631+ goto again;
1632+ }
1633 }
1634-#endif
1635- return 0;
1636-}
1637+ return changed;
1638+}
1639
1640 /*
1641 * This function checks if any part of the range <start,end> is mapped
1642 * with type.
1643 */
1644-int e820_any_mapped(unsigned long start, unsigned long end, unsigned type)
1645-{
1646+int
1647+e820_any_mapped(unsigned long start, unsigned long end, unsigned type)
1648+{
1649 int i;
1650
1651 #ifndef CONFIG_XEN
1652- for (i = 0; i < e820.nr_map; i++) {
1653- struct e820entry *ei = &e820.map[i];
1654+ for (i = 0; i < e820.nr_map; i++) {
1655+ struct e820entry *ei = &e820.map[i];
1656 #else
1657 if (!is_initial_xendomain())
1658 return 0;
1659@@ -130,12 +138,12 @@ int e820_any_mapped(unsigned long start,
1660 const struct e820entry *ei = &machine_e820.map[i];
1661 #endif
1662
1663- if (type && ei->type != type)
1664+ if (type && ei->type != type)
1665 continue;
1666 if (ei->addr >= end || ei->addr + ei->size <= start)
1667- continue;
1668- return 1;
1669- }
1670+ continue;
1671+ return 1;
1672+ }
1673 return 0;
1674 }
1675 EXPORT_SYMBOL_GPL(e820_any_mapped);
1676@@ -146,7 +154,8 @@ EXPORT_SYMBOL_GPL(e820_any_mapped);
1677 * Note: this function only works correct if the e820 table is sorted and
1678 * not-overlapping, which is the case
1679 */
1680-int __init e820_all_mapped(unsigned long start, unsigned long end, unsigned type)
1681+int __init e820_all_mapped(unsigned long start, unsigned long end,
1682+ unsigned type)
1683 {
1684 int i;
1685
1686@@ -171,65 +180,77 @@ int __init e820_all_mapped(unsigned long
1687 */
1688 if (ei->addr <= start)
1689 start = ei->addr + ei->size;
1690- /* if start is now at or beyond end, we're done, full coverage */
1691+ /*
1692+ * if start is now at or beyond end, we're done, full
1693+ * coverage
1694+ */
1695 if (start >= end)
1696- return 1; /* we're done */
1697+ return 1;
1698 }
1699 return 0;
1700 }
1701
1702-/*
1703- * Find a free area in a specific range.
1704- */
1705-unsigned long __init find_e820_area(unsigned long start, unsigned long end, unsigned size)
1706-{
1707- int i;
1708- for (i = 0; i < e820.nr_map; i++) {
1709- struct e820entry *ei = &e820.map[i];
1710- unsigned long addr = ei->addr, last;
1711- if (ei->type != E820_RAM)
1712- continue;
1713- if (addr < start)
1714+/*
1715+ * Find a free area with specified alignment in a specific range.
1716+ */
1717+unsigned long __init find_e820_area(unsigned long start, unsigned long end,
1718+ unsigned size, unsigned long align)
1719+{
1720+ int i;
1721+ unsigned long mask = ~(align - 1);
1722+
1723+ for (i = 0; i < e820.nr_map; i++) {
1724+ struct e820entry *ei = &e820.map[i];
1725+ unsigned long addr = ei->addr, last;
1726+
1727+ if (ei->type != E820_RAM)
1728+ continue;
1729+ if (addr < start)
1730 addr = start;
1731- if (addr > ei->addr + ei->size)
1732- continue;
1733+ if (addr > ei->addr + ei->size)
1734+ continue;
1735 while (bad_addr(&addr, size) && addr+size <= ei->addr+ei->size)
1736 ;
1737- last = PAGE_ALIGN(addr) + size;
1738+ addr = (addr + align - 1) & mask;
1739+ last = addr + size;
1740 if (last > ei->addr + ei->size)
1741 continue;
1742- if (last > end)
1743+ if (last > end)
1744 continue;
1745- return addr;
1746- }
1747- return -1UL;
1748-}
1749+ return addr;
1750+ }
1751+ return -1UL;
1752+}
1753
1754 /*
1755 * Find the highest page frame number we have available
1756 */
1757 unsigned long __init e820_end_of_ram(void)
1758 {
1759- unsigned long end_pfn = 0;
1760+ unsigned long end_pfn;
1761+
1762 end_pfn = find_max_pfn_with_active_regions();
1763-
1764- if (end_pfn > end_pfn_map)
1765+
1766+ if (end_pfn > end_pfn_map)
1767 end_pfn_map = end_pfn;
1768 if (end_pfn_map > MAXMEM>>PAGE_SHIFT)
1769 end_pfn_map = MAXMEM>>PAGE_SHIFT;
1770 if (end_pfn > end_user_pfn)
1771 end_pfn = end_user_pfn;
1772- if (end_pfn > end_pfn_map)
1773- end_pfn = end_pfn_map;
1774+ if (end_pfn > end_pfn_map)
1775+ end_pfn = end_pfn_map;
1776
1777- printk("end_pfn_map = %lu\n", end_pfn_map);
1778- return end_pfn;
1779+ printk(KERN_INFO "end_pfn_map = %lu\n", end_pfn_map);
1780+ return end_pfn;
1781 }
1782
1783 /*
1784 * Mark e820 reserved areas as busy for the resource manager.
1785 */
1786-void __init e820_reserve_resources(struct e820entry *e820, int nr_map)
1787+void __init e820_reserve_resources(struct e820entry *e820, int nr_map,
1788+ struct resource *code_resource,
1789+ struct resource *data_resource,
1790+ struct resource *bss_resource)
1791 {
1792 int i;
1793 for (i = 0; i < nr_map; i++) {
1794@@ -247,14 +268,14 @@ void __init e820_reserve_resources(struc
1795 request_resource(&iomem_resource, res);
1796 if (e820[i].type == E820_RAM) {
1797 /*
1798- * We don't know which RAM region contains kernel data,
1799- * so we try it repeatedly and let the resource manager
1800- * test it.
1801+ * We don't know which RAM region contains kernel data,
1802+ * so we try it repeatedly and let the resource manager
1803+ * test it.
1804 */
1805 #ifndef CONFIG_XEN
1806- request_resource(res, &code_resource);
1807- request_resource(res, &data_resource);
1808- request_resource(res, &bss_resource);
1809+ request_resource(res, code_resource);
1810+ request_resource(res, data_resource);
1811+ request_resource(res, bss_resource);
1812 #endif
1813 #ifdef CONFIG_KEXEC
1814 if (crashk_res.start != crashk_res.end)
1815@@ -357,9 +378,9 @@ e820_register_active_regions(int nid, un
1816 add_active_range(nid, ei_startpfn, ei_endpfn);
1817 }
1818
1819-/*
1820+/*
1821 * Add a memory region to the kernel e820 map.
1822- */
1823+ */
1824 void __init add_memory_region(unsigned long start, unsigned long size, int type)
1825 {
1826 int x = e820.nr_map;
1827@@ -384,9 +405,7 @@ unsigned long __init e820_hole_size(unsi
1828 {
1829 unsigned long start_pfn = start >> PAGE_SHIFT;
1830 unsigned long end_pfn = end >> PAGE_SHIFT;
1831- unsigned long ei_startpfn;
1832- unsigned long ei_endpfn;
1833- unsigned long ram = 0;
1834+ unsigned long ei_startpfn, ei_endpfn, ram = 0;
1835 int i;
1836
1837 for (i = 0; i < e820.nr_map; i++) {
1838@@ -398,28 +417,31 @@ unsigned long __init e820_hole_size(unsi
1839 return end - start - (ram << PAGE_SHIFT);
1840 }
1841
1842-void __init e820_print_map(char *who)
1843+static void __init e820_print_map(char *who)
1844 {
1845 int i;
1846
1847 for (i = 0; i < e820.nr_map; i++) {
1848 printk(KERN_INFO " %s: %016Lx - %016Lx ", who,
1849- (unsigned long long) e820.map[i].addr,
1850- (unsigned long long) (e820.map[i].addr + e820.map[i].size));
1851+ (unsigned long long) e820.map[i].addr,
1852+ (unsigned long long)
1853+ (e820.map[i].addr + e820.map[i].size));
1854 switch (e820.map[i].type) {
1855- case E820_RAM: printk("(usable)\n");
1856- break;
1857+ case E820_RAM:
1858+ printk(KERN_CONT "(usable)\n");
1859+ break;
1860 case E820_RESERVED:
1861- printk("(reserved)\n");
1862- break;
1863+ printk(KERN_CONT "(reserved)\n");
1864+ break;
1865 case E820_ACPI:
1866- printk("(ACPI data)\n");
1867- break;
1868+ printk(KERN_CONT "(ACPI data)\n");
1869+ break;
1870 case E820_NVS:
1871- printk("(ACPI NVS)\n");
1872- break;
1873- default: printk("type %u\n", e820.map[i].type);
1874- break;
1875+ printk(KERN_CONT "(ACPI NVS)\n");
1876+ break;
1877+ default:
1878+ printk(KERN_CONT "type %u\n", e820.map[i].type);
1879+ break;
1880 }
1881 }
1882 }
1883@@ -427,11 +449,11 @@ void __init e820_print_map(char *who)
1884 /*
1885 * Sanitize the BIOS e820 map.
1886 *
1887- * Some e820 responses include overlapping entries. The following
1888+ * Some e820 responses include overlapping entries. The following
1889 * replaces the original e820 map with a new one, removing overlaps.
1890 *
1891 */
1892-static int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
1893+static int __init sanitize_e820_map(struct e820entry *biosmap, char *pnr_map)
1894 {
1895 struct change_member {
1896 struct e820entry *pbios; /* pointer to original bios entry */
1897@@ -451,7 +473,8 @@ static int __init sanitize_e820_map(stru
1898 int i;
1899
1900 /*
1901- Visually we're performing the following (1,2,3,4 = memory types)...
1902+ Visually we're performing the following
1903+ (1,2,3,4 = memory types)...
1904
1905 Sample memory map (w/overlaps):
1906 ____22__________________
1907@@ -493,22 +516,23 @@ static int __init sanitize_e820_map(stru
1908 old_nr = *pnr_map;
1909
1910 /* bail out if we find any unreasonable addresses in bios map */
1911- for (i=0; i<old_nr; i++)
1912+ for (i = 0; i < old_nr; i++)
1913 if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr)
1914 return -1;
1915
1916 /* create pointers for initial change-point information (for sorting) */
1917- for (i=0; i < 2*old_nr; i++)
1918+ for (i = 0; i < 2 * old_nr; i++)
1919 change_point[i] = &change_point_list[i];
1920
1921 /* record all known change-points (starting and ending addresses),
1922 omitting those that are for empty memory regions */
1923 chgidx = 0;
1924- for (i=0; i < old_nr; i++) {
1925+ for (i = 0; i < old_nr; i++) {
1926 if (biosmap[i].size != 0) {
1927 change_point[chgidx]->addr = biosmap[i].addr;
1928 change_point[chgidx++]->pbios = &biosmap[i];
1929- change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size;
1930+ change_point[chgidx]->addr = biosmap[i].addr +
1931+ biosmap[i].size;
1932 change_point[chgidx++]->pbios = &biosmap[i];
1933 }
1934 }
1935@@ -518,75 +542,106 @@ static int __init sanitize_e820_map(stru
1936 still_changing = 1;
1937 while (still_changing) {
1938 still_changing = 0;
1939- for (i=1; i < chg_nr; i++) {
1940- /* if <current_addr> > <last_addr>, swap */
1941- /* or, if current=<start_addr> & last=<end_addr>, swap */
1942- if ((change_point[i]->addr < change_point[i-1]->addr) ||
1943- ((change_point[i]->addr == change_point[i-1]->addr) &&
1944- (change_point[i]->addr == change_point[i]->pbios->addr) &&
1945- (change_point[i-1]->addr != change_point[i-1]->pbios->addr))
1946- )
1947- {
1948+ for (i = 1; i < chg_nr; i++) {
1949+ unsigned long long curaddr, lastaddr;
1950+ unsigned long long curpbaddr, lastpbaddr;
1951+
1952+ curaddr = change_point[i]->addr;
1953+ lastaddr = change_point[i - 1]->addr;
1954+ curpbaddr = change_point[i]->pbios->addr;
1955+ lastpbaddr = change_point[i - 1]->pbios->addr;
1956+
1957+ /*
1958+ * swap entries, when:
1959+ *
1960+ * curaddr > lastaddr or
1961+ * curaddr == lastaddr and curaddr == curpbaddr and
1962+ * lastaddr != lastpbaddr
1963+ */
1964+ if (curaddr < lastaddr ||
1965+ (curaddr == lastaddr && curaddr == curpbaddr &&
1966+ lastaddr != lastpbaddr)) {
1967 change_tmp = change_point[i];
1968 change_point[i] = change_point[i-1];
1969 change_point[i-1] = change_tmp;
1970- still_changing=1;
1971+ still_changing = 1;
1972 }
1973 }
1974 }
1975
1976 /* create a new bios memory map, removing overlaps */
1977- overlap_entries=0; /* number of entries in the overlap table */
1978- new_bios_entry=0; /* index for creating new bios map entries */
1979+ overlap_entries = 0; /* number of entries in the overlap table */
1980+ new_bios_entry = 0; /* index for creating new bios map entries */
1981 last_type = 0; /* start with undefined memory type */
1982 last_addr = 0; /* start with 0 as last starting address */
1983+
1984 /* loop through change-points, determining affect on the new bios map */
1985- for (chgidx=0; chgidx < chg_nr; chgidx++)
1986- {
1987+ for (chgidx = 0; chgidx < chg_nr; chgidx++) {
1988 /* keep track of all overlapping bios entries */
1989- if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr)
1990- {
1991- /* add map entry to overlap list (> 1 entry implies an overlap) */
1992- overlap_list[overlap_entries++]=change_point[chgidx]->pbios;
1993- }
1994- else
1995- {
1996- /* remove entry from list (order independent, so swap with last) */
1997- for (i=0; i<overlap_entries; i++)
1998- {
1999- if (overlap_list[i] == change_point[chgidx]->pbios)
2000- overlap_list[i] = overlap_list[overlap_entries-1];
2001+ if (change_point[chgidx]->addr ==
2002+ change_point[chgidx]->pbios->addr) {
2003+ /*
2004+ * add map entry to overlap list (> 1 entry
2005+ * implies an overlap)
2006+ */
2007+ overlap_list[overlap_entries++] =
2008+ change_point[chgidx]->pbios;
2009+ } else {
2010+ /*
2011+ * remove entry from list (order independent,
2012+ * so swap with last)
2013+ */
2014+ for (i = 0; i < overlap_entries; i++) {
2015+ if (overlap_list[i] ==
2016+ change_point[chgidx]->pbios)
2017+ overlap_list[i] =
2018+ overlap_list[overlap_entries-1];
2019 }
2020 overlap_entries--;
2021 }
2022- /* if there are overlapping entries, decide which "type" to use */
2023- /* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */
2024+ /*
2025+ * if there are overlapping entries, decide which
2026+ * "type" to use (larger value takes precedence --
2027+ * 1=usable, 2,3,4,4+=unusable)
2028+ */
2029 current_type = 0;
2030- for (i=0; i<overlap_entries; i++)
2031+ for (i = 0; i < overlap_entries; i++)
2032 if (overlap_list[i]->type > current_type)
2033 current_type = overlap_list[i]->type;
2034- /* continue building up new bios map based on this information */
2035+ /*
2036+ * continue building up new bios map based on this
2037+ * information
2038+ */
2039 if (current_type != last_type) {
2040 if (last_type != 0) {
2041 new_bios[new_bios_entry].size =
2042 change_point[chgidx]->addr - last_addr;
2043- /* move forward only if the new size was non-zero */
2044+ /*
2045+ * move forward only if the new size
2046+ * was non-zero
2047+ */
2048 if (new_bios[new_bios_entry].size != 0)
2049+ /*
2050+ * no more space left for new
2051+ * bios entries ?
2052+ */
2053 if (++new_bios_entry >= E820MAX)
2054- break; /* no more space left for new bios entries */
2055+ break;
2056 }
2057 if (current_type != 0) {
2058- new_bios[new_bios_entry].addr = change_point[chgidx]->addr;
2059+ new_bios[new_bios_entry].addr =
2060+ change_point[chgidx]->addr;
2061 new_bios[new_bios_entry].type = current_type;
2062- last_addr=change_point[chgidx]->addr;
2063+ last_addr = change_point[chgidx]->addr;
2064 }
2065 last_type = current_type;
2066 }
2067 }
2068- new_nr = new_bios_entry; /* retain count for new bios entries */
2069+ /* retain count for new bios entries */
2070+ new_nr = new_bios_entry;
2071
2072 /* copy new bios mapping into original location */
2073- memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry));
2074+ memcpy(biosmap, new_bios, new_nr * sizeof(struct e820entry));
2075 *pnr_map = new_nr;
2076
2077 return 0;
2078@@ -601,7 +656,7 @@ static int __init sanitize_e820_map(stru
2079 * will have given us a memory map that we can use to properly
2080 * set up memory. If we aren't, we'll fake a memory map.
2081 */
2082-static int __init copy_e820_map(struct e820entry * biosmap, int nr_map)
2083+static int __init copy_e820_map(struct e820entry *biosmap, int nr_map)
2084 {
2085 #ifndef CONFIG_XEN
2086 /* Only one memory region (or negative)? Ignore it */
2087@@ -622,7 +677,7 @@ static int __init copy_e820_map(struct e
2088 return -1;
2089
2090 add_memory_region(start, size, type);
2091- } while (biosmap++,--nr_map);
2092+ } while (biosmap++, --nr_map);
2093
2094 #ifdef CONFIG_XEN
2095 if (is_initial_xendomain()) {
2096@@ -641,15 +696,17 @@ static int __init copy_e820_map(struct e
2097 return 0;
2098 }
2099
2100-void early_panic(char *msg)
2101+static void early_panic(char *msg)
2102 {
2103 early_printk(msg);
2104 panic(msg);
2105 }
2106
2107-#ifndef CONFIG_XEN
2108-void __init setup_memory_region(void)
2109+/* We're not void only for x86 32-bit compat */
2110+char * __init machine_specific_memory_setup(void)
2111 {
2112+#ifndef CONFIG_XEN
2113+ char *who = "BIOS-e820";
2114 /*
2115 * Try to copy the BIOS-supplied E820-map.
2116 *
2117@@ -659,14 +716,8 @@ void __init setup_memory_region(void)
2118 sanitize_e820_map(boot_params.e820_map, &boot_params.e820_entries);
2119 if (copy_e820_map(boot_params.e820_map, boot_params.e820_entries) < 0)
2120 early_panic("Cannot find a valid memory map");
2121- printk(KERN_INFO "BIOS-provided physical RAM map:\n");
2122- e820_print_map("BIOS-e820");
2123-}
2124-
2125 #else /* CONFIG_XEN */
2126-
2127-void __init setup_memory_region(void)
2128-{
2129+ char *who = "Xen";
2130 int rc;
2131 struct xen_memory_map memmap;
2132 /*
2133@@ -694,11 +745,13 @@ void __init setup_memory_region(void)
2134
2135 if (copy_e820_map(map, (char)memmap.nr_entries) < 0)
2136 early_panic("Cannot find a valid memory map");
2137-
2138+#endif
2139 printk(KERN_INFO "BIOS-provided physical RAM map:\n");
2140- e820_print_map("Xen");
2141+ e820_print_map(who);
2142+
2143+ /* In case someone cares... */
2144+ return who;
2145 }
2146-#endif
2147
2148 static int __init parse_memopt(char *p)
2149 {
2150@@ -709,7 +762,7 @@ static int __init parse_memopt(char *p)
2151 if (!p)
2152 return -EINVAL;
2153 end_user_pfn = memparse(p, &p);
2154- end_user_pfn >>= PAGE_SHIFT;
2155+ end_user_pfn >>= PAGE_SHIFT;
2156
2157 end = end_user_pfn<<PAGE_SHIFT;
2158 i = e820.nr_map-1;
2159@@ -727,7 +780,7 @@ static int __init parse_memopt(char *p)
2160 }
2161
2162 return 0;
2163-}
2164+}
2165 early_param("mem", parse_memopt);
2166
2167 static int userdef __initdata;
2168@@ -739,9 +792,9 @@ static int __init parse_memmap_opt(char
2169
2170 if (!strcmp(p, "exactmap")) {
2171 #ifdef CONFIG_CRASH_DUMP
2172- /* If we are doing a crash dump, we
2173- * still need to know the real mem
2174- * size before original memory map is
2175+ /*
2176+ * If we are doing a crash dump, we still need to know
2177+ * the real mem size before original memory map is
2178 * reset.
2179 */
2180 e820_register_active_regions(0, 0, -1UL);
2181@@ -758,6 +811,8 @@ static int __init parse_memmap_opt(char
2182 mem_size = memparse(p, &p);
2183 if (p == oldp)
2184 return -EINVAL;
2185+
2186+ userdef = 1;
2187 if (*p == '@') {
2188 start_at = memparse(p+1, &p);
2189 add_memory_region(start_at, mem_size, E820_RAM);
2190@@ -777,11 +832,58 @@ early_param("memmap", parse_memmap_opt);
2191 void __init finish_e820_parsing(void)
2192 {
2193 if (userdef) {
2194+ char nr = e820.nr_map;
2195+
2196+ if (sanitize_e820_map(e820.map, &nr) < 0)
2197+ early_panic("Invalid user supplied memory map");
2198+ e820.nr_map = nr;
2199+
2200 printk(KERN_INFO "user-defined physical RAM map:\n");
2201 e820_print_map("user");
2202 }
2203 }
2204
2205+#ifndef CONFIG_XEN
2206+void __init update_memory_range(u64 start, u64 size, unsigned old_type,
2207+ unsigned new_type)
2208+{
2209+ int i;
2210+
2211+ BUG_ON(old_type == new_type);
2212+
2213+ for (i = 0; i < e820.nr_map; i++) {
2214+ struct e820entry *ei = &e820.map[i];
2215+ u64 final_start, final_end;
2216+ if (ei->type != old_type)
2217+ continue;
2218+ /* totally covered? */
2219+ if (ei->addr >= start && ei->size <= size) {
2220+ ei->type = new_type;
2221+ continue;
2222+ }
2223+ /* partially covered */
2224+ final_start = max(start, ei->addr);
2225+ final_end = min(start + size, ei->addr + ei->size);
2226+ if (final_start >= final_end)
2227+ continue;
2228+ add_memory_region(final_start, final_end - final_start,
2229+ new_type);
2230+ }
2231+}
2232+
2233+void __init update_e820(void)
2234+{
2235+ u8 nr_map;
2236+
2237+ nr_map = e820.nr_map;
2238+ if (sanitize_e820_map(e820.map, &nr_map))
2239+ return;
2240+ e820.nr_map = nr_map;
2241+ printk(KERN_INFO "modified physical RAM map:\n");
2242+ e820_print_map("modified");
2243+}
2244+#endif
2245+
2246 unsigned long pci_mem_start = 0xaeedbabe;
2247 EXPORT_SYMBOL(pci_mem_start);
2248
2249@@ -825,8 +927,10 @@ __init void e820_setup_gap(struct e820en
2250
2251 if (!found) {
2252 gapstart = (end_pfn << PAGE_SHIFT) + 1024*1024;
2253- printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit address range\n"
2254- KERN_ERR "PCI: Unassigned devices with 32bit resource registers may break!\n");
2255+ printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit "
2256+ "address range\n"
2257+ KERN_ERR "PCI: Unassigned devices with 32bit resource "
2258+ "registers may break!\n");
2259 }
2260
2261 /*
2262@@ -839,8 +943,9 @@ __init void e820_setup_gap(struct e820en
2263 /* Fun with two's complement */
2264 pci_mem_start = (gapstart + round) & -round;
2265
2266- printk(KERN_INFO "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n",
2267- pci_mem_start, gapstart, gapsize);
2268+ printk(KERN_INFO
2269+ "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n",
2270+ pci_mem_start, gapstart, gapsize);
2271 }
2272
2273 int __init arch_get_ram_range(int slot, u64 *addr, u64 *size)
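
For reference, the reflowed sanitize_e820_map() hunks above leave the algorithm untouched: each BIOS entry contributes two change-points (its start and its end), the points are sorted with starts ahead of ends at equal addresses, and a single sweep keeps the highest-precedence type among the regions currently open. A minimal user-space sketch of that sweep, with simplified types and made-up sample regions rather than the kernel's e820entry/E820MAX definitions:

#include <stdio.h>
#include <stdlib.h>

struct region { unsigned long long addr, size; unsigned type; };
struct cp { unsigned long long addr; const struct region *r; int is_start; };

static int cmp_cp(const void *a, const void *b)
{
	const struct cp *x = a, *y = b;

	if (x->addr != y->addr)
		return x->addr < y->addr ? -1 : 1;
	/* at equal addresses, starts sort ahead of ends (the kernel's swap rule) */
	return y->is_start - x->is_start;
}

int main(void)
{
	/* overlapping sample map: RAM (type 1) partly shadowed by reserved (type 2) */
	struct region map[] = {
		{ 0x0000, 0x9000, 1 },
		{ 0x8000, 0x2000, 2 },
		{ 0xa000, 0x6000, 1 },
	};
	enum { N = sizeof(map) / sizeof(map[0]) };
	struct cp cps[2 * N];
	int open[N] = { 0 };
	unsigned last_type = 0;
	unsigned long long last_addr = 0;
	int i, j;

	for (i = 0; i < N; i++) {
		cps[2 * i]     = (struct cp){ map[i].addr, &map[i], 1 };
		cps[2 * i + 1] = (struct cp){ map[i].addr + map[i].size, &map[i], 0 };
	}
	qsort(cps, 2 * N, sizeof(cps[0]), cmp_cp);

	for (i = 0; i < 2 * N; i++) {
		unsigned cur = 0;

		open[cps[i].r - map] = cps[i].is_start;
		for (j = 0; j < N; j++)		/* highest open type wins */
			if (open[j] && map[j].type > cur)
				cur = map[j].type;
		if (cur != last_type) {
			if (last_type)
				printf("%#llx-%#llx type %u\n",
				       last_addr, cps[i].addr, last_type);
			last_addr = cps[i].addr;
			last_type = cur;
		}
	}
	return 0;
}

With the sample map above this prints three non-overlapping ranges, the reserved window winning over the RAM it shadows, which is exactly the effect of the kernel's change-point loop.
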
2274--- sle11-2009-06-29.orig/arch/x86/kernel/early_printk-xen.c 2009-02-16 16:18:36.000000000 +0100
2275+++ sle11-2009-06-29/arch/x86/kernel/early_printk-xen.c 2009-03-16 16:33:40.000000000 +0100
2276@@ -222,7 +222,7 @@ static struct console simnow_console = {
2277 };
2278
2279 /* Direct interface for emergencies */
2280-struct console *early_console = &early_vga_console;
2281+static struct console *early_console = &early_vga_console;
2282 static int early_console_initialized = 0;
2283
2284 void early_printk(const char *fmt, ...)
2285--- sle11-2009-06-29.orig/arch/x86/kernel/entry_32-xen.S 2009-05-14 11:18:18.000000000 +0200
2286+++ sle11-2009-06-29/arch/x86/kernel/entry_32-xen.S 2009-05-14 11:18:32.000000000 +0200
2287@@ -59,7 +59,7 @@
2288 * for paravirtualization. The following will never clobber any registers:
2289 * INTERRUPT_RETURN (aka. "iret")
2290 * GET_CR0_INTO_EAX (aka. "movl %cr0, %eax")
2291- * ENABLE_INTERRUPTS_SYSEXIT (aka "sti; sysexit").
2292+ * ENABLE_INTERRUPTS_SYSCALL_RET (aka "sti; sysexit").
2293 *
2294 * For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must
2295 * specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY).
2296@@ -282,16 +282,21 @@ END(resume_kernel)
2297 #endif
2298 CFI_ENDPROC
2299
2300+ .macro test_tif ti_reg # system call tracing in operation / emulation
2301+ /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
2302+ testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(\ti_reg)
2303+ .endm
2304+
2305 /* SYSENTER_RETURN points to after the "sysenter" instruction in
2306 the vsyscall page. See vsyscall-sysentry.S, which defines the symbol. */
2307
2308 # sysenter call handler stub
2309-ENTRY(sysenter_entry)
2310+ENTRY(ia32_sysenter_target)
2311 CFI_STARTPROC simple
2312 CFI_SIGNAL_FRAME
2313 CFI_DEF_CFA esp, 0
2314 CFI_REGISTER esp, ebp
2315- movl SYSENTER_stack_esp0(%esp),%esp
2316+ movl SYSENTER_stack_sp0(%esp),%esp
2317 sysenter_past_esp:
2318 /*
2319 * No need to follow this irqs on/off section: the syscall
2320@@ -334,9 +339,7 @@ sysenter_past_esp:
2321 CFI_ADJUST_CFA_OFFSET 4
2322 SAVE_ALL
2323 GET_THREAD_INFO(%ebp)
2324-
2325- /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
2326- testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
2327+ test_tif %ebp
2328 jnz syscall_trace_entry
2329 cmpl $(nr_syscalls), %eax
2330 jae syscall_badsys
2331@@ -354,7 +357,7 @@ sysenter_past_esp:
2332 xorl %ebp,%ebp
2333 TRACE_IRQS_ON
2334 1: mov PT_FS(%esp), %fs
2335- ENABLE_INTERRUPTS_SYSEXIT
2336+ ENABLE_INTERRUPTS_SYSCALL_RET
2337 CFI_ENDPROC
2338 .pushsection .fixup,"ax"
2339 2: movl $0,PT_FS(%esp)
2340@@ -363,10 +366,10 @@ sysenter_past_esp:
2341 .align 4
2342 .long 1b,2b
2343 .popsection
2344-ENDPROC(sysenter_entry)
2345+ENDPROC(ia32_sysenter_target)
2346
2347 # pv sysenter call handler stub
2348-ENTRY(sysenter_entry_pv)
2349+ENTRY(ia32pv_sysenter_target)
2350 RING0_INT_FRAME
2351 movl $__USER_DS,16(%esp)
2352 movl %ebp,12(%esp)
2353@@ -389,7 +392,7 @@ ENTRY(sysenter_entry_pv)
2354 .previous
2355 /* fall through */
2356 CFI_ENDPROC
2357-ENDPROC(sysenter_entry_pv)
2358+ENDPROC(ia32pv_sysenter_target)
2359
2360 # system call handler stub
2361 ENTRY(system_call)
2362@@ -398,9 +401,7 @@ ENTRY(system_call)
2363 CFI_ADJUST_CFA_OFFSET 4
2364 SAVE_ALL
2365 GET_THREAD_INFO(%ebp)
2366- # system call tracing in operation / emulation
2367- /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
2368- testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
2369+ test_tif %ebp
2370 jnz syscall_trace_entry
2371 cmpl $(nr_syscalls), %eax
2372 jae syscall_badsys
2373@@ -452,7 +453,8 @@ restore_nocheck_notrace:
2374 RESTORE_REGS
2375 addl $4, %esp # skip orig_eax/error_code
2376 CFI_ADJUST_CFA_OFFSET -4
2377-1: INTERRUPT_RETURN
2378+irq_return:
2379+ INTERRUPT_RETURN
2380 .section .fixup,"ax"
2381 iret_exc:
2382 pushl $0 # no error code
2383@@ -461,7 +463,7 @@ iret_exc:
2384 .previous
2385 .section __ex_table,"a"
2386 .align 4
2387- .long 1b,iret_exc
2388+ .long irq_return,iret_exc
2389 .previous
2390
2391 CFI_RESTORE_STATE
2392@@ -657,7 +659,7 @@ END(syscall_badsys)
2393 * Build the entry stubs and pointer table with
2394 * some assembler magic.
2395 */
2396-.data
2397+.section .rodata,"a"
2398 ENTRY(interrupt)
2399 .text
2400
2401@@ -963,7 +965,7 @@ END(device_not_available)
2402 * that sets up the real kernel stack. Check here, since we can't
2403 * allow the wrong stack to be used.
2404 *
2405- * "SYSENTER_stack_esp0+12" is because the NMI/debug handler will have
2406+ * "SYSENTER_stack_sp0+12" is because the NMI/debug handler will have
2407 * already pushed 3 words if it hits on the sysenter instruction:
2408 * eflags, cs and eip.
2409 *
2410@@ -975,7 +977,7 @@ END(device_not_available)
2411 cmpw $__KERNEL_CS,4(%esp); \
2412 jne ok; \
2413 label: \
2414- movl SYSENTER_stack_esp0+offset(%esp),%esp; \
2415+ movl SYSENTER_stack_sp0+offset(%esp),%esp; \
2416 CFI_DEF_CFA esp, 0; \
2417 CFI_UNDEFINED eip; \
2418 pushfl; \
2419@@ -990,7 +992,7 @@ label: \
2420 KPROBE_ENTRY(debug)
2421 RING0_INT_FRAME
2422 #ifndef CONFIG_XEN
2423- cmpl $sysenter_entry,(%esp)
2424+ cmpl $ia32_sysenter_target,(%esp)
2425 jne debug_stack_correct
2426 FIX_STACK(12, debug_stack_correct, debug_esp_fix_insn)
2427 debug_stack_correct:
2428@@ -1023,7 +1025,7 @@ KPROBE_ENTRY(nmi)
2429 popl %eax
2430 CFI_ADJUST_CFA_OFFSET -4
2431 je nmi_espfix_stack
2432- cmpl $sysenter_entry,(%esp)
2433+ cmpl $ia32_sysenter_target,(%esp)
2434 je nmi_stack_fixup
2435 pushl %eax
2436 CFI_ADJUST_CFA_OFFSET 4
2437@@ -1036,7 +1038,7 @@ KPROBE_ENTRY(nmi)
2438 popl %eax
2439 CFI_ADJUST_CFA_OFFSET -4
2440 jae nmi_stack_correct
2441- cmpl $sysenter_entry,12(%esp)
2442+ cmpl $ia32_sysenter_target,12(%esp)
2443 je nmi_debug_stack_check
2444 nmi_stack_correct:
2445 /* We have a RING0_INT_FRAME here */
2446@@ -1089,12 +1091,8 @@ nmi_espfix_stack:
2447 RESTORE_REGS
2448 lss 12+4(%esp), %esp # back to espfix stack
2449 CFI_ADJUST_CFA_OFFSET -24
2450-1: INTERRUPT_RETURN
2451+ jmp irq_return
2452 CFI_ENDPROC
2453-.section __ex_table,"a"
2454- .align 4
2455- .long 1b,iret_exc
2456-.previous
2457 #else
2458 KPROBE_ENTRY(nmi)
2459 RING0_INT_FRAME
2460@@ -1112,17 +1110,17 @@ KPROBE_END(nmi)
2461
2462 #ifdef CONFIG_PARAVIRT
2463 ENTRY(native_iret)
2464-1: iret
2465+ iret
2466 .section __ex_table,"a"
2467 .align 4
2468- .long 1b,iret_exc
2469+ .long native_iret, iret_exc
2470 .previous
2471 END(native_iret)
2472
2473-ENTRY(native_irq_enable_sysexit)
2474+ENTRY(native_irq_enable_syscall_ret)
2475 sti
2476 sysexit
2477-END(native_irq_enable_sysexit)
2478+END(native_irq_enable_syscall_ret)
2479 #endif
2480
2481 KPROBE_ENTRY(int3)
2482@@ -1271,7 +1269,144 @@ ENTRY(kernel_thread_helper)
2483 CFI_ENDPROC
2484 ENDPROC(kernel_thread_helper)
2485
2486+#include <asm/alternative-asm.h>
2487+
2488+ # pv syscall call handler stub
2489+ENTRY(ia32pv_cstar_target)
2490+ RING0_INT_FRAME
2491+ movl $__USER_DS,16(%esp)
2492+ movl %ebp,%ecx
2493+ movl $__USER_CS,4(%esp)
2494+ movl 12(%esp),%ebp
2495+ pushl %eax # save orig_eax
2496+ CFI_ADJUST_CFA_OFFSET 4
2497+/*
2498+ * Load the potential sixth argument from user stack.
2499+ * Careful about security.
2500+ */
2501+ cmpl $__PAGE_OFFSET-4,%ebp
2502+ CFI_REMEMBER_STATE
2503+ ja cstar_fault
2504+1: movl (%ebp),%ebp
2505+.section __ex_table,"a"
2506+ .align 4
2507+ .long 1b,cstar_fault
2508+.previous
2509+ SAVE_ALL
2510+ GET_THREAD_INFO(%ebp)
2511+ test_tif %ebp
2512+ jnz cstar_trace_entry
2513+ cmpl $nr_syscalls,%eax
2514+ jae cstar_badsys
2515+.Lcstar_call:
2516+ btl %eax,cstar_special
2517+ jc .Lcstar_special
2518+ call *cstar_call_table(,%eax,4)
2519+ movl %eax,PT_EAX(%esp) # store the return value
2520+.Lcstar_exit:
2521+ movl PT_ECX(%esp),%ecx
2522+ movl %ecx,PT_EBP(%esp) # put user EBP back in place
2523+ jmp syscall_exit
2524+.Lcstar_special:
2525+ movl PT_ECX(%esp),%ecx
2526+ movl %ecx,PT_EBP(%esp) # put user EBP back in place
2527+ jmp syscall_call
2528+cstar_set_tif:
2529+ movl $cstar_clear_tif,(%esp) # replace return address
2530+ LOCK_PREFIX
2531+ orl $_TIF_CSTAR,TI_flags(%ebp)
2532+ jmp *sys_call_table(,%eax,4)
2533+cstar_clear_tif:
2534+ movl %eax,PT_EAX(%esp) # store the return value
2535+ LOCK_PREFIX
2536+ andl $~_TIF_CSTAR,TI_flags(%ebp)
2537+ jmp .Lcstar_exit
2538+cstar_trace_entry:
2539+ movl $-ENOSYS,PT_EAX(%esp)
2540+ cmpl $nr_syscalls,%eax
2541+ jae 1f
2542+ btl %eax,cstar_special
2543+ jc .Lcstar_trace_special
2544+1: movl %esp,%eax
2545+ xorl %edx,%edx
2546+ LOCK_PREFIX
2547+ orl $_TIF_CSTAR,TI_flags(%ebp)
2548+ call do_syscall_trace
2549+ LOCK_PREFIX
2550+ andl $~_TIF_CSTAR,TI_flags(%ebp)
2551+ testl %eax,%eax
2552+ jne .Lcstar_resume # ret != 0 -> running under PTRACE_SYSEMU,
2553+ # so must skip actual syscall
2554+ movl PT_ORIG_EAX(%esp),%eax
2555+ cmpl $nr_syscalls,%eax
2556+ jb .Lcstar_call
2557+ jmp .Lcstar_exit
2558+.Lcstar_trace_special:
2559+ movl PT_ECX(%esp),%ecx
2560+ movl %esp,%eax
2561+ xorl %edx,%edx
2562+ movl %ecx,PT_EBP(%esp) # put user EBP back in place
2563+ call do_syscall_trace
2564+ testl %eax,%eax
2565+ jne resume_userspace # ret != 0 -> running under PTRACE_SYSEMU,
2566+ # so must skip actual syscall
2567+ movl PT_ORIG_EAX(%esp),%eax
2568+ cmpl $nr_syscalls,%eax
2569+ jb syscall_call
2570+ jmp syscall_exit
2571+cstar_badsys:
2572+ movl $-ENOSYS,PT_EAX(%esp)
2573+.Lcstar_resume:
2574+ movl PT_ECX(%esp),%ecx
2575+ movl %ecx,PT_EBP(%esp) # put user EBP back in place
2576+ jmp resume_userspace
2577+ CFI_RESTORE_STATE
2578+cstar_fault:
2579+ movl $-EFAULT,%eax
2580+ SAVE_ALL
2581+ GET_THREAD_INFO(%ebp)
2582+ jmp .Lcstar_resume
2583+ CFI_ENDPROC
2584+ENDPROC(ia32pv_cstar_target)
2585+
2586+ENTRY(cstar_ret_from_fork)
2587+ CFI_STARTPROC
2588+ movl PT_ECX(%esp),%ecx
2589+ GET_THREAD_INFO(%ebp)
2590+ movl %ecx,PT_EBP(%esp) # put user EBP back in place
2591+ LOCK_PREFIX
2592+ andl $~_TIF_CSTAR,TI_flags(%ebp)
2593+ jmp ret_from_fork
2594+ CFI_ENDPROC
2595+END(ret_from_fork)
2596+
2597 .section .rodata,"a"
2598 #include "syscall_table_32.S"
2599
2600 syscall_table_size=(.-sys_call_table)
2601+
2602+#include <asm/unistd.h>
2603+cstar_special:
2604+nr=0
2605+mask=0
2606+.rept nr_syscalls+31
2607+ .irp n, __NR_sigreturn, __NR_rt_sigreturn
2608+ .if nr == \n
2609+ mask = mask | (1 << (\n & 31))
2610+ .endif
2611+ .endr
2612+ nr = nr + 1
2613+ .if (nr & 31) == 0
2614+ .long mask
2615+ mask = 0
2616+ .endif
2617+.endr
2618+#define sys_call_table cstar_call_table
2619+#define sys_fork cstar_set_tif
2620+#define sys_clone cstar_set_tif
2621+#define sys_vfork cstar_set_tif
2622+#include "syscall_table_32.S"
2623+#undef sys_call_table
2624+#undef sys_fork
2625+#undef sys_clone
2626+#undef sys_vfork
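
The cstar_special table generated by the .rept block above is just a bitmap: one bit per syscall number, packed 32 bits per emitted .long, with only the sigreturn entries set so the fast path can test membership with a single btl. A rough C equivalent of the same packing and lookup (the count and syscall numbers below are illustrative placeholders, not the real nr_syscalls/__NR_* values):

#include <stdio.h>

#define NR_SYSCALLS	325				/* illustrative, not the real nr_syscalls */
#define NWORDS		((NR_SYSCALLS + 31) / 32)	/* 32 syscall bits per emitted .long */

static unsigned int cstar_special[NWORDS];

static void mark_special(unsigned int nr)
{
	cstar_special[nr / 32] |= 1u << (nr % 32);	/* mask = mask | (1 << (n & 31)) */
}

static int is_special(unsigned int nr)
{
	return (cstar_special[nr / 32] >> (nr % 32)) & 1;	/* what "btl %eax,cstar_special" asks */
}

int main(void)
{
	/* placeholder numbers standing in for __NR_sigreturn / __NR_rt_sigreturn */
	mark_special(119);
	mark_special(173);

	printf("119 -> %d, 173 -> %d, 1 -> %d\n",
	       is_special(119), is_special(173), is_special(1));
	return 0;
}
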
2627--- sle11-2009-06-29.orig/arch/x86/kernel/entry_64-xen.S 2009-02-16 16:18:36.000000000 +0100
2628+++ sle11-2009-06-29/arch/x86/kernel/entry_64-xen.S 2009-03-16 16:33:40.000000000 +0100
2629@@ -54,17 +54,22 @@
2630 #include <asm/page.h>
2631 #include <asm/irqflags.h>
2632 #include <asm/errno.h>
2633-#include <xen/interface/arch-x86_64.h>
2634+#include <xen/interface/xen.h>
2635 #include <xen/interface/features.h>
2636
2637-#include "xen_entry_64.S"
2638-
2639 .code64
2640
2641 #ifndef CONFIG_PREEMPT
2642 #define retint_kernel retint_restore_args
2643 #endif
2644
2645+#ifdef CONFIG_PARAVIRT
2646+ENTRY(native_irq_enable_syscall_ret)
2647+ movq %gs:pda_oldrsp,%rsp
2648+ swapgs
2649+ sysretq
2650+#endif /* CONFIG_PARAVIRT */
2651+
2652
2653 .macro TRACE_IRQS_IRETQ offset=ARGOFFSET
2654 #ifdef CONFIG_TRACE_IRQFLAGS
2655@@ -277,7 +282,7 @@ ret_from_sys_call:
2656 sysret_check:
2657 LOCKDEP_SYS_EXIT
2658 GET_THREAD_INFO(%rcx)
2659- XEN_BLOCK_EVENTS(%rsi)
2660+ DISABLE_INTERRUPTS(CLBR_NONE)
2661 TRACE_IRQS_OFF
2662 movl threadinfo_flags(%rcx),%edx
2663 andl %edi,%edx
2664@@ -287,7 +292,7 @@ sysret_check:
2665 * sysretq will re-enable interrupts:
2666 */
2667 TRACE_IRQS_ON
2668- XEN_UNBLOCK_EVENTS(%rsi)
2669+ ENABLE_INTERRUPTS(CLBR_NONE)
2670 RESTORE_ARGS 0,8,0
2671 HYPERVISOR_IRET VGCF_IN_SYSCALL
2672
2673@@ -298,7 +303,7 @@ sysret_careful:
2674 bt $TIF_NEED_RESCHED,%edx
2675 jnc sysret_signal
2676 TRACE_IRQS_ON
2677- XEN_UNBLOCK_EVENTS(%rsi)
2678+ ENABLE_INTERRUPTS(CLBR_NONE)
2679 pushq %rdi
2680 CFI_ADJUST_CFA_OFFSET 8
2681 call schedule
2682@@ -309,9 +314,8 @@ sysret_careful:
2683 /* Handle a signal */
2684 sysret_signal:
2685 TRACE_IRQS_ON
2686-/* sti */
2687- XEN_UNBLOCK_EVENTS(%rsi)
2688- testl $(_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY),%edx
2689+ ENABLE_INTERRUPTS(CLBR_NONE)
2690+ testl $_TIF_DO_NOTIFY_MASK,%edx
2691 jz 1f
2692
2693 /* Really a signal */
2694@@ -323,7 +327,7 @@ sysret_signal:
2695 1: movl $_TIF_NEED_RESCHED,%edi
2696 /* Use IRET because user could have changed frame. This
2697 works because ptregscall_common has called FIXUP_TOP_OF_STACK. */
2698- XEN_BLOCK_EVENTS(%rsi)
2699+ DISABLE_INTERRUPTS(CLBR_NONE)
2700 TRACE_IRQS_OFF
2701 jmp int_with_check
2702
2703@@ -355,7 +359,7 @@ tracesys:
2704 */
2705 .globl int_ret_from_sys_call
2706 int_ret_from_sys_call:
2707- XEN_BLOCK_EVENTS(%rsi)
2708+ DISABLE_INTERRUPTS(CLBR_NONE)
2709 TRACE_IRQS_OFF
2710 testb $3,CS-ARGOFFSET(%rsp)
2711 jnz 1f
2712@@ -381,22 +385,20 @@ int_careful:
2713 bt $TIF_NEED_RESCHED,%edx
2714 jnc int_very_careful
2715 TRACE_IRQS_ON
2716-/* sti */
2717- XEN_UNBLOCK_EVENTS(%rsi)
2718+ ENABLE_INTERRUPTS(CLBR_NONE)
2719 pushq %rdi
2720 CFI_ADJUST_CFA_OFFSET 8
2721 call schedule
2722 popq %rdi
2723 CFI_ADJUST_CFA_OFFSET -8
2724- XEN_BLOCK_EVENTS(%rsi)
2725+ DISABLE_INTERRUPTS(CLBR_NONE)
2726 TRACE_IRQS_OFF
2727 jmp int_with_check
2728
2729 /* handle signals and tracing -- both require a full stack frame */
2730 int_very_careful:
2731 TRACE_IRQS_ON
2732-/* sti */
2733- XEN_UNBLOCK_EVENTS(%rsi)
2734+ ENABLE_INTERRUPTS(CLBR_NONE)
2735 SAVE_REST
2736 /* Check for syscall exit trace */
2737 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edx
2738@@ -411,7 +413,7 @@ int_very_careful:
2739 jmp int_restore_rest
2740
2741 int_signal:
2742- testl $(_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY),%edx
2743+ testl $_TIF_DO_NOTIFY_MASK,%edx
2744 jz 1f
2745 movq %rsp,%rdi # &ptregs -> arg1
2746 xorl %esi,%esi # oldset -> arg2
2747@@ -419,7 +421,7 @@ int_signal:
2748 1: movl $_TIF_NEED_RESCHED,%edi
2749 int_restore_rest:
2750 RESTORE_REST
2751- XEN_BLOCK_EVENTS(%rsi)
2752+ DISABLE_INTERRUPTS(CLBR_NONE)
2753 TRACE_IRQS_OFF
2754 jmp int_with_check
2755 CFI_ENDPROC
2756@@ -474,6 +476,7 @@ ENTRY(stub_execve)
2757 CFI_REGISTER rip, r11
2758 SAVE_REST
2759 FIXUP_TOP_OF_STACK %r11
2760+ movq %rsp, %rcx
2761 call sys_execve
2762 RESTORE_TOP_OF_STACK %r11
2763 movq %rax,RAX(%rsp)
2764@@ -526,11 +529,10 @@ retint_check:
2765 retint_restore_args: /* return to kernel space */
2766 movl EFLAGS-REST_SKIP(%rsp), %eax
2767 shr $9, %eax # EAX[0] == IRET_EFLAGS.IF
2768- XEN_GET_VCPU_INFO(%rsi)
2769+ GET_VCPU_INFO
2770 andb evtchn_upcall_mask(%rsi),%al
2771 andb $1,%al # EAX[0] == IRET_EFLAGS.IF & event_mask
2772 jnz restore_all_enable_events # != 0 => enable event delivery
2773- XEN_PUT_VCPU_INFO(%rsi)
2774
2775 RESTORE_ARGS 0,8,0
2776 HYPERVISOR_IRET 0
2777@@ -541,31 +543,29 @@ retint_careful:
2778 bt $TIF_NEED_RESCHED,%edx
2779 jnc retint_signal
2780 TRACE_IRQS_ON
2781- XEN_UNBLOCK_EVENTS(%rsi)
2782-/* sti */
2783+ ENABLE_INTERRUPTS(CLBR_NONE)
2784 pushq %rdi
2785 CFI_ADJUST_CFA_OFFSET 8
2786 call schedule
2787 popq %rdi
2788 CFI_ADJUST_CFA_OFFSET -8
2789 GET_THREAD_INFO(%rcx)
2790- XEN_BLOCK_EVENTS(%rsi)
2791-/* cli */
2792+ DISABLE_INTERRUPTS(CLBR_NONE)
2793 TRACE_IRQS_OFF
2794 jmp retint_check
2795
2796 retint_signal:
2797- testl $(_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY),%edx
2798+ testl $_TIF_DO_NOTIFY_MASK,%edx
2799 jz retint_restore_args
2800 TRACE_IRQS_ON
2801- XEN_UNBLOCK_EVENTS(%rsi)
2802+ ENABLE_INTERRUPTS(CLBR_NONE)
2803 SAVE_REST
2804 movq $-1,ORIG_RAX(%rsp)
2805 xorl %esi,%esi # oldset
2806 movq %rsp,%rdi # &pt_regs
2807 call do_notify_resume
2808 RESTORE_REST
2809- XEN_BLOCK_EVENTS(%rsi)
2810+ DISABLE_INTERRUPTS(CLBR_NONE)
2811 TRACE_IRQS_OFF
2812 movl $_TIF_NEED_RESCHED,%edi
2813 GET_THREAD_INFO(%rcx)
2814@@ -702,7 +702,7 @@ END(spurious_interrupt)
2815 rdmsr
2816 testl %edx,%edx
2817 js 1f
2818- swapgs
2819+ SWAPGS
2820 xorl %ebx,%ebx
2821 1:
2822 #endif
2823@@ -719,8 +719,7 @@ END(spurious_interrupt)
2824 .if \ist
2825 addq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp)
2826 .endif
2827-/* cli */
2828- XEN_BLOCK_EVENTS(%rsi)
2829+ DISABLE_INTERRUPTS(CLBR_NONE)
2830 .if \irqtrace
2831 TRACE_IRQS_OFF
2832 .endif
2833@@ -749,10 +748,10 @@ paranoid_swapgs\trace:
2834 .if \trace
2835 TRACE_IRQS_IRETQ 0
2836 .endif
2837- swapgs
2838+ SWAPGS_UNSAFE_STACK
2839 paranoid_restore\trace:
2840 RESTORE_ALL 8
2841- iretq
2842+ jmp irq_return
2843 paranoid_userspace\trace:
2844 GET_THREAD_INFO(%rcx)
2845 movl threadinfo_flags(%rcx),%ebx
2846@@ -767,11 +766,11 @@ paranoid_userspace\trace:
2847 .if \trace
2848 TRACE_IRQS_ON
2849 .endif
2850- sti
2851+ ENABLE_INTERRUPTS(CLBR_NONE)
2852 xorl %esi,%esi /* arg2: oldset */
2853 movq %rsp,%rdi /* arg1: &pt_regs */
2854 call do_notify_resume
2855- cli
2856+ DISABLE_INTERRUPTS(CLBR_NONE)
2857 .if \trace
2858 TRACE_IRQS_OFF
2859 .endif
2860@@ -780,9 +779,9 @@ paranoid_schedule\trace:
2861 .if \trace
2862 TRACE_IRQS_ON
2863 .endif
2864- sti
2865+ ENABLE_INTERRUPTS(CLBR_ANY)
2866 call schedule
2867- cli
2868+ DISABLE_INTERRUPTS(CLBR_ANY)
2869 .if \trace
2870 TRACE_IRQS_OFF
2871 .endif
2872@@ -846,8 +845,7 @@ error_call_handler:
2873 call *%rax
2874 error_exit:
2875 RESTORE_REST
2876-/* cli */
2877- XEN_BLOCK_EVENTS(%rsi)
2878+ DISABLE_INTERRUPTS(CLBR_NONE)
2879 TRACE_IRQS_OFF
2880 GET_THREAD_INFO(%rcx)
2881 testb $3,CS-ARGOFFSET(%rsp)
2882@@ -875,7 +873,7 @@ error_kernelspace:
2883 iret run with kernel gs again, so don't set the user space flag.
2884 B stepping K8s sometimes report an truncated RIP for IRET
2885 exceptions returning to compat mode. Check for these here too. */
2886- leaq iret_label(%rip),%rbp
2887+ leaq irq_return(%rip),%rbp
2888 cmpq %rbp,RIP(%rsp)
2889 je error_swapgs
2890 movl %ebp,%ebp /* zero extend */
2891@@ -930,19 +928,17 @@ END(do_hypervisor_callback)
2892 restore_all_enable_events:
2893 CFI_DEFAULT_STACK adj=1
2894 TRACE_IRQS_ON
2895- XEN_UNBLOCK_EVENTS(%rsi) # %rsi is already set up...
2896+ __ENABLE_INTERRUPTS
2897
2898 scrit: /**** START OF CRITICAL REGION ****/
2899- XEN_TEST_PENDING(%rsi)
2900+ __TEST_PENDING
2901 CFI_REMEMBER_STATE
2902 jnz 14f # process more events if necessary...
2903- XEN_PUT_VCPU_INFO(%rsi)
2904 RESTORE_ARGS 0,8,0
2905 HYPERVISOR_IRET 0
2906
2907 CFI_RESTORE_STATE
2908-14: XEN_LOCKED_BLOCK_EVENTS(%rsi)
2909- XEN_PUT_VCPU_INFO(%rsi)
2910+14: __DISABLE_INTERRUPTS
2911 SAVE_REST
2912 movq %rsp,%rdi # set the argument again
2913 jmp 11b
2914@@ -1086,15 +1082,16 @@ ENDPROC(child_rip)
2915 * rdi: name, rsi: argv, rdx: envp
2916 *
2917 * We want to fallback into:
2918- * extern long sys_execve(char *name, char **argv,char **envp, struct pt_regs regs)
2919+ * extern long sys_execve(char *name, char **argv,char **envp, struct pt_regs *regs)
2920 *
2921 * do_sys_execve asm fallback arguments:
2922- * rdi: name, rsi: argv, rdx: envp, fake frame on the stack
2923+ * rdi: name, rsi: argv, rdx: envp, rcx: fake frame on the stack
2924 */
2925 ENTRY(kernel_execve)
2926 CFI_STARTPROC
2927 FAKE_STACK_FRAME $0
2928 SAVE_ALL
2929+ movq %rsp,%rcx
2930 call sys_execve
2931 movq %rax, RAX(%rsp)
2932 RESTORE_REST
2933@@ -1144,7 +1141,7 @@ do_nmi_callback:
2934 call do_nmi
2935 orl $NMI_MASK,EFLAGS(%rsp)
2936 RESTORE_REST
2937- XEN_BLOCK_EVENTS(%rsi)
2938+ DISABLE_INTERRUPTS(CLBR_NONE)
2939 TRACE_IRQS_OFF
2940 GET_THREAD_INFO(%rcx)
2941 jmp retint_restore_args
2942--- sle11-2009-06-29.orig/arch/x86/kernel/fixup.c 2009-06-29 15:14:52.000000000 +0200
2943+++ sle11-2009-06-29/arch/x86/kernel/fixup.c 2009-03-16 16:33:40.000000000 +0100
2944@@ -36,7 +36,7 @@
2945
2946 #define DP(_f, _args...) printk(KERN_ALERT " " _f "\n" , ## _args )
2947
2948-fastcall void do_fixup_4gb_segment(struct pt_regs *regs, long error_code)
2949+void do_fixup_4gb_segment(struct pt_regs *regs, long error_code)
2950 {
2951 static unsigned long printed = 0;
2952 char info[100];
2953--- sle11-2009-06-29.orig/arch/x86/kernel/genapic_64-xen.c 2009-02-16 16:18:36.000000000 +0100
2954+++ sle11-2009-06-29/arch/x86/kernel/genapic_64-xen.c 2009-03-16 16:33:40.000000000 +0100
2955@@ -24,20 +24,13 @@
2956 #include <acpi/acpi_bus.h>
2957 #endif
2958
2959-/*
2960- * which logical CPU number maps to which CPU (physical APIC ID)
2961- *
2962- * The following static array is used during kernel startup
2963- * and the x86_cpu_to_apicid_ptr contains the address of the
2964- * array during this time. Is it zeroed when the per_cpu
2965- * data area is removed.
2966- */
2967+/* which logical CPU number maps to which CPU (physical APIC ID) */
2968 #ifndef CONFIG_XEN
2969-u8 x86_cpu_to_apicid_init[NR_CPUS] __initdata
2970+u16 x86_cpu_to_apicid_init[NR_CPUS] __initdata
2971 = { [0 ... NR_CPUS-1] = BAD_APICID };
2972-void *x86_cpu_to_apicid_ptr;
2973+void *x86_cpu_to_apicid_early_ptr;
2974 #endif
2975-DEFINE_PER_CPU(u8, x86_cpu_to_apicid) = BAD_APICID;
2976+DEFINE_PER_CPU(u16, x86_cpu_to_apicid) = BAD_APICID;
2977 EXPORT_PER_CPU_SYMBOL(x86_cpu_to_apicid);
2978
2979 #ifndef CONFIG_XEN
2980--- sle11-2009-06-29.orig/arch/x86/kernel/head64-xen.c 2009-02-16 16:18:36.000000000 +0100
2981+++ sle11-2009-06-29/arch/x86/kernel/head64-xen.c 2009-03-16 16:33:40.000000000 +0100
2982@@ -16,6 +16,7 @@
2983 #include <linux/kernel.h>
2984 #include <linux/string.h>
2985 #include <linux/percpu.h>
2986+#include <linux/start_kernel.h>
2987 #include <linux/module.h>
2988
2989 #include <asm/processor.h>
2990@@ -26,6 +27,8 @@
2991 #include <asm/pgtable.h>
2992 #include <asm/tlbflush.h>
2993 #include <asm/sections.h>
2994+#include <asm/kdebug.h>
2995+#include <asm/e820.h>
2996
2997 unsigned long start_pfn;
2998
2999@@ -34,7 +37,7 @@ static void __init zap_identity_mappings
3000 {
3001 pgd_t *pgd = pgd_offset_k(0UL);
3002 pgd_clear(pgd);
3003- __flush_tlb();
3004+ __flush_tlb_all();
3005 }
3006
3007 /* Don't add a printk in there. printk relies on the PDA which is not initialized
3008@@ -72,6 +75,37 @@ EXPORT_SYMBOL(machine_to_phys_mapping);
3009 unsigned int machine_to_phys_order;
3010 EXPORT_SYMBOL(machine_to_phys_order);
3011
3012+#define EBDA_ADDR_POINTER 0x40E
3013+
3014+static __init void reserve_ebda(void)
3015+{
3016+#ifndef CONFIG_XEN
3017+ unsigned ebda_addr, ebda_size;
3018+
3019+ /*
3020+ * there is a real-mode segmented pointer pointing to the
3021+ * 4K EBDA area at 0x40E
3022+ */
3023+ ebda_addr = *(unsigned short *)__va(EBDA_ADDR_POINTER);
3024+ ebda_addr <<= 4;
3025+
3026+ if (!ebda_addr)
3027+ return;
3028+
3029+ ebda_size = *(unsigned short *)__va(ebda_addr);
3030+
3031+ /* Round EBDA up to pages */
3032+ if (ebda_size == 0)
3033+ ebda_size = 1;
3034+ ebda_size <<= 10;
3035+ ebda_size = round_up(ebda_size + (ebda_addr & ~PAGE_MASK), PAGE_SIZE);
3036+ if (ebda_size > 64*1024)
3037+ ebda_size = 64*1024;
3038+
3039+ reserve_early(ebda_addr, ebda_addr + ebda_size, "EBDA");
3040+#endif
3041+}
3042+
3043 void __init x86_64_start_kernel(char * real_mode_data)
3044 {
3045 struct xen_machphys_mapping mapping;
3046@@ -103,8 +137,16 @@ void __init x86_64_start_kernel(char * r
3047 /* Make NULL pointers segfault */
3048 zap_identity_mappings();
3049
3050- for (i = 0; i < IDT_ENTRIES; i++)
3051+	/* Clean up the over-mapped high alias */
3052+ cleanup_highmap();
3053+
3054+ for (i = 0; i < IDT_ENTRIES; i++) {
3055+#ifdef CONFIG_EARLY_PRINTK
3056+ set_intr_gate(i, &early_idt_handlers[i]);
3057+#else
3058 set_intr_gate(i, early_idt_handler);
3059+#endif
3060+ }
3061 load_idt((const struct desc_ptr *)&idt_descr);
3062 #endif
3063
3064@@ -115,8 +157,19 @@ void __init x86_64_start_kernel(char * r
3065
3066 pda_init(0);
3067 copy_bootdata(__va(real_mode_data));
3068-#ifdef CONFIG_SMP
3069- cpu_set(0, cpu_online_map);
3070-#endif
3071+
3072+ reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS");
3073+
3074+ reserve_early(round_up(__pa_symbol(&_end), PAGE_SIZE),
3075+ start_pfn << PAGE_SHIFT, "Xen provided");
3076+
3077+ reserve_ebda();
3078+
3079+ /*
3080+ * At this point everything still needed from the boot loader
3081+ * or BIOS or kernel text should be early reserved or marked not
3082+ * RAM in e820. All other memory is free game.
3083+ */
3084+
3085 start_kernel();
3086 }
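
reserve_ebda() above recovers the EBDA base from the real-mode segment word at 0x40E (hence the <<4), reads the area's size in KiB from its first word, rounds the reservation up to whole pages and clamps it to 64 KiB. The same arithmetic as a stand-alone sketch, with invented input values standing in for the kernel's __va() reads:

#include <stdio.h>

#define PAGE_SIZE	4096UL
#define PAGE_MASK	(~(PAGE_SIZE - 1))

static unsigned long round_up_to(unsigned long x, unsigned long align)
{
	return (x + align - 1) & ~(align - 1);
}

int main(void)
{
	/* pretend values instead of *(u16 *)__va(0x40E) and *(u16 *)__va(ebda_addr) */
	unsigned int ebda_segment = 0x9fc0;	/* real-mode segment read from 0x40E */
	unsigned int ebda_kib = 1;		/* first word of the EBDA: its size in KiB */

	unsigned long ebda_addr = (unsigned long)ebda_segment << 4;
	unsigned long ebda_size = (unsigned long)(ebda_kib ? ebda_kib : 1) << 10;

	/* round the reservation up to whole pages, measured from the page offset */
	ebda_size = round_up_to(ebda_size + (ebda_addr & ~PAGE_MASK), PAGE_SIZE);
	if (ebda_size > 64 * 1024)
		ebda_size = 64 * 1024;

	printf("reserve_early(%#lx, %#lx, \"EBDA\")\n", ebda_addr, ebda_addr + ebda_size);
	return 0;
}
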
3087--- sle11-2009-06-29.orig/arch/x86/kernel/head_32-xen.S 2009-02-16 16:17:21.000000000 +0100
3088+++ sle11-2009-06-29/arch/x86/kernel/head_32-xen.S 2009-03-16 16:33:40.000000000 +0100
3089@@ -3,6 +3,7 @@
3090 .text
3091 #include <linux/elfnote.h>
3092 #include <linux/threads.h>
3093+#include <linux/init.h>
3094 #include <linux/linkage.h>
3095 #include <asm/segment.h>
3096 #include <asm/page.h>
3097@@ -88,7 +89,7 @@ ENTRY(_stext)
3098 */
3099 .section ".bss.page_aligned","wa"
3100 .align PAGE_SIZE_asm
3101-ENTRY(swapper_pg_pmd)
3102+ENTRY(swapper_pg_fixmap)
3103 .fill 1024,4,0
3104 ENTRY(empty_zero_page)
3105 .fill 4096,1,0
3106--- sle11-2009-06-29.orig/arch/x86/kernel/init_task-xen.c 2009-02-16 16:18:36.000000000 +0100
3107+++ sle11-2009-06-29/arch/x86/kernel/init_task-xen.c 2009-03-16 16:33:40.000000000 +0100
3108@@ -19,7 +19,7 @@ static struct sighand_struct init_sighan
3109 #endif
3110 struct mm_struct init_mm = INIT_MM(init_mm);
3111 #undef swapper_pg_dir
3112-EXPORT_SYMBOL(init_mm);
3113+EXPORT_UNUSED_SYMBOL(init_mm); /* will be removed in 2.6.26 */
3114
3115 /*
3116 * Initial thread structure.
3117--- sle11-2009-06-29.orig/arch/x86/kernel/io_apic_32-xen.c 2009-02-16 16:18:36.000000000 +0100
3118+++ sle11-2009-06-29/arch/x86/kernel/io_apic_32-xen.c 2009-03-16 16:33:40.000000000 +0100
3119@@ -35,6 +35,7 @@
3120 #include <linux/htirq.h>
3121 #include <linux/freezer.h>
3122 #include <linux/kthread.h>
3123+#include <linux/jiffies.h> /* time_after() */
3124
3125 #include <asm/io.h>
3126 #include <asm/smp.h>
3127@@ -48,8 +49,6 @@
3128 #include <mach_apic.h>
3129 #include <mach_apicdef.h>
3130
3131-#include "io_ports.h"
3132-
3133 #ifdef CONFIG_XEN
3134 #include <xen/interface/xen.h>
3135 #include <xen/interface/physdev.h>
3136@@ -400,7 +399,7 @@ static void set_ioapic_affinity_irq(unsi
3137 # include <asm/processor.h> /* kernel_thread() */
3138 # include <linux/kernel_stat.h> /* kstat */
3139 # include <linux/slab.h> /* kmalloc() */
3140-# include <linux/timer.h> /* time_after() */
3141+# include <linux/timer.h>
3142
3143 #define IRQBALANCE_CHECK_ARCH -999
3144 #define MAX_BALANCED_IRQ_INTERVAL (5*HZ)
3145@@ -777,7 +776,7 @@ late_initcall(balanced_irq_init);
3146 #endif
3147
3148 #ifndef CONFIG_SMP
3149-void fastcall send_IPI_self(int vector)
3150+void send_IPI_self(int vector)
3151 {
3152 #ifndef CONFIG_XEN
3153 unsigned int cfg;
3154@@ -1959,7 +1958,7 @@ static int __init timer_irq_works(void)
3155 * might have cached one ExtINT interrupt. Finally, at
3156 * least one tick may be lost due to delays.
3157 */
3158- if (jiffies - t1 > 4)
3159+ if (time_after(jiffies, t1 + 4))
3160 return 1;
3161
3162 return 0;
3163@@ -2142,7 +2141,7 @@ static struct irq_chip lapic_chip __read
3164 .eoi = ack_apic,
3165 };
3166
3167-static void setup_nmi (void)
3168+static void __init setup_nmi(void)
3169 {
3170 /*
3171 * Dirty trick to enable the NMI watchdog ...
3172@@ -2155,7 +2154,7 @@ static void setup_nmi (void)
3173 */
3174 apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ...");
3175
3176- on_each_cpu(enable_NMI_through_LVT0, NULL, 1, 1);
3177+ enable_NMI_through_LVT0();
3178
3179 apic_printk(APIC_VERBOSE, " done.\n");
3180 }
3181@@ -2479,7 +2478,7 @@ static int ioapic_resume(struct sys_devi
3182 }
3183
3184 static struct sysdev_class ioapic_sysdev_class = {
3185- set_kset_name("ioapic"),
3186+ .name = "ioapic",
3187 .suspend = ioapic_suspend,
3188 .resume = ioapic_resume,
3189 };
3190--- sle11-2009-06-29.orig/arch/x86/kernel/io_apic_64-xen.c 2009-02-16 16:18:36.000000000 +0100
3191+++ sle11-2009-06-29/arch/x86/kernel/io_apic_64-xen.c 2009-03-16 16:33:40.000000000 +0100
3192@@ -32,9 +32,11 @@
3193 #include <linux/msi.h>
3194 #include <linux/htirq.h>
3195 #include <linux/dmar.h>
3196+#include <linux/jiffies.h>
3197 #ifdef CONFIG_ACPI
3198 #include <acpi/acpi_bus.h>
3199 #endif
3200+#include <linux/bootmem.h>
3201
3202 #include <asm/idle.h>
3203 #include <asm/io.h>
3204@@ -1064,7 +1066,7 @@ void __apicdebuginit print_local_APIC(vo
3205 v = apic_read(APIC_LVR);
3206 printk(KERN_INFO "... APIC VERSION: %08x\n", v);
3207 ver = GET_APIC_VERSION(v);
3208- maxlvt = get_maxlvt();
3209+ maxlvt = lapic_get_maxlvt();
3210
3211 v = apic_read(APIC_TASKPRI);
3212 printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK);
3213@@ -1165,7 +1167,7 @@ void __apicdebuginit print_PIC(void)
3214 }
3215 #endif /* !CONFIG_XEN */
3216
3217-static void __init enable_IO_APIC(void)
3218+void __init enable_IO_APIC(void)
3219 {
3220 union IO_APIC_reg_01 reg_01;
3221 #ifndef CONFIG_XEN
3222@@ -1299,7 +1301,7 @@ static int __init timer_irq_works(void)
3223 */
3224
3225 /* jiffies wrap? */
3226- if (jiffies - t1 > 4)
3227+ if (time_after(jiffies, t1 + 4))
3228 return 1;
3229 return 0;
3230 }
3231@@ -1412,7 +1414,7 @@ static void irq_complete_move(unsigned i
3232 if (likely(!cfg->move_in_progress))
3233 return;
3234
3235- vector = ~get_irq_regs()->orig_rax;
3236+ vector = ~get_irq_regs()->orig_ax;
3237 me = smp_processor_id();
3238 if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) {
3239 cpumask_t cleanup_mask;
3240@@ -1439,7 +1441,7 @@ static void ack_apic_level(unsigned int
3241 int do_unmask_irq = 0;
3242
3243 irq_complete_move(irq);
3244-#if defined(CONFIG_GENERIC_PENDING_IRQ) || defined(CONFIG_IRQBALANCE)
3245+#ifdef CONFIG_GENERIC_PENDING_IRQ
3246 /* If we are moving the irq we need to mask it */
3247 if (unlikely(irq_desc[irq].status & IRQ_MOVE_PENDING)) {
3248 do_unmask_irq = 1;
3249@@ -1570,7 +1572,7 @@ static struct hw_interrupt_type lapic_ir
3250 .end = end_lapic_irq,
3251 };
3252
3253-static void setup_nmi (void)
3254+static void __init setup_nmi(void)
3255 {
3256 /*
3257 * Dirty trick to enable the NMI watchdog ...
3258@@ -1583,7 +1585,7 @@ static void setup_nmi (void)
3259 */
3260 printk(KERN_INFO "activating NMI Watchdog ...");
3261
3262- enable_NMI_through_LVT0(NULL);
3263+ enable_NMI_through_LVT0();
3264
3265 printk(" done.\n");
3266 }
3267@@ -1659,7 +1661,7 @@ static inline void unlock_ExtINT_logic(v
3268 *
3269 * FIXME: really need to revamp this for modern platforms only.
3270 */
3271-static inline void check_timer(void)
3272+static inline void __init check_timer(void)
3273 {
3274 struct irq_cfg *cfg = irq_cfg + 0;
3275 int apic1, pin1, apic2, pin2;
3276@@ -1863,7 +1865,7 @@ static int ioapic_resume(struct sys_devi
3277 }
3278
3279 static struct sysdev_class ioapic_sysdev_class = {
3280- set_kset_name("ioapic"),
3281+ .name = "ioapic",
3282 .suspend = ioapic_suspend,
3283 .resume = ioapic_resume,
3284 };
3285@@ -2303,5 +2305,93 @@ void __init setup_ioapic_dest(void)
3286 }
3287 }
3288 #endif
3289-#endif /* !CONFIG_XEN */
3290
3291+#define IOAPIC_RESOURCE_NAME_SIZE 11
3292+
3293+static struct resource *ioapic_resources;
3294+
3295+static struct resource * __init ioapic_setup_resources(void)
3296+{
3297+ unsigned long n;
3298+ struct resource *res;
3299+ char *mem;
3300+ int i;
3301+
3302+ if (nr_ioapics <= 0)
3303+ return NULL;
3304+
3305+ n = IOAPIC_RESOURCE_NAME_SIZE + sizeof(struct resource);
3306+ n *= nr_ioapics;
3307+
3308+ mem = alloc_bootmem(n);
3309+ res = (void *)mem;
3310+
3311+ if (mem != NULL) {
3312+ memset(mem, 0, n);
3313+ mem += sizeof(struct resource) * nr_ioapics;
3314+
3315+ for (i = 0; i < nr_ioapics; i++) {
3316+ res[i].name = mem;
3317+ res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY;
3318+ sprintf(mem, "IOAPIC %u", i);
3319+ mem += IOAPIC_RESOURCE_NAME_SIZE;
3320+ }
3321+ }
3322+
3323+ ioapic_resources = res;
3324+
3325+ return res;
3326+}
3327+
3328+void __init ioapic_init_mappings(void)
3329+{
3330+ unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0;
3331+ struct resource *ioapic_res;
3332+ int i;
3333+
3334+ ioapic_res = ioapic_setup_resources();
3335+ for (i = 0; i < nr_ioapics; i++) {
3336+ if (smp_found_config) {
3337+ ioapic_phys = mp_ioapics[i].mpc_apicaddr;
3338+ } else {
3339+ ioapic_phys = (unsigned long)
3340+ alloc_bootmem_pages(PAGE_SIZE);
3341+ ioapic_phys = __pa(ioapic_phys);
3342+ }
3343+ set_fixmap_nocache(idx, ioapic_phys);
3344+ apic_printk(APIC_VERBOSE,
3345+ "mapped IOAPIC to %016lx (%016lx)\n",
3346+ __fix_to_virt(idx), ioapic_phys);
3347+ idx++;
3348+
3349+ if (ioapic_res != NULL) {
3350+ ioapic_res->start = ioapic_phys;
3351+ ioapic_res->end = ioapic_phys + (4 * 1024) - 1;
3352+ ioapic_res++;
3353+ }
3354+ }
3355+}
3356+
3357+static int __init ioapic_insert_resources(void)
3358+{
3359+ int i;
3360+ struct resource *r = ioapic_resources;
3361+
3362+ if (!r) {
3363+ printk(KERN_ERR
3364+		       "IO APIC resources could not be allocated.\n");
3365+ return -1;
3366+ }
3367+
3368+ for (i = 0; i < nr_ioapics; i++) {
3369+ insert_resource(&iomem_resource, r);
3370+ r++;
3371+ }
3372+
3373+ return 0;
3374+}
3375+
3376+/* Insert the IO APIC resources after PCI initialization has occurred to handle
3377+ * IO APICs that are mapped in on a BAR in PCI space. */
3378+late_initcall(ioapic_insert_resources);
3379+#endif /* !CONFIG_XEN */
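
ioapic_setup_resources() above makes a single bootmem allocation that holds the struct resource array followed by the fixed-size name strings, and points each resource's name into that tail. The layout in isolation, with malloc standing in for alloc_bootmem() and a trimmed-down struct resource:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define NAME_SIZE	11		/* IOAPIC_RESOURCE_NAME_SIZE in the patch */

struct resource {
	unsigned long start, end, flags;
	const char *name;
};

static struct resource *setup_resources(int nr)
{
	size_t n = (sizeof(struct resource) + NAME_SIZE) * nr;
	char *mem = calloc(1, n);		/* stand-in for alloc_bootmem() + memset */
	struct resource *res = (void *)mem;
	int i;

	if (!mem)
		return NULL;

	mem += sizeof(struct resource) * nr;	/* the names live right after the array */
	for (i = 0; i < nr; i++) {
		res[i].name = mem;
		snprintf(mem, NAME_SIZE, "IOAPIC %d", i);
		mem += NAME_SIZE;
	}
	return res;
}

int main(void)
{
	struct resource *r = setup_resources(2);

	if (r) {
		printf("%s / %s\n", r[0].name, r[1].name);
		free(r);		/* one allocation, one free */
	}
	return 0;
}
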
3380--- /dev/null 1970-01-01 00:00:00.000000000 +0000
3381+++ sle11-2009-06-29/arch/x86/kernel/ioport-xen.c 2009-03-16 16:33:40.000000000 +0100
3382@@ -0,0 +1,112 @@
3383+/*
3384+ * This contains the io-permission bitmap code - written by obz, with changes
3385+ * by Linus. 32/64 bits code unification by Miguel Botón.
3386+ */
3387+
3388+#include <linux/sched.h>
3389+#include <linux/kernel.h>
3390+#include <linux/capability.h>
3391+#include <linux/errno.h>
3392+#include <linux/types.h>
3393+#include <linux/ioport.h>
3394+#include <linux/smp.h>
3395+#include <linux/stddef.h>
3396+#include <linux/slab.h>
3397+#include <linux/thread_info.h>
3398+#include <linux/syscalls.h>
3399+#include <xen/interface/physdev.h>
3400+
3401+/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */
3402+static void set_bitmap(unsigned long *bitmap, unsigned int base,
3403+ unsigned int extent, int new_value)
3404+{
3405+ unsigned int i;
3406+
3407+ for (i = base; i < base + extent; i++) {
3408+ if (new_value)
3409+ __set_bit(i, bitmap);
3410+ else
3411+ __clear_bit(i, bitmap);
3412+ }
3413+}
3414+
3415+/*
3416+ * this changes the io permissions bitmap in the current task.
3417+ */
3418+asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
3419+{
3420+ struct thread_struct * t = &current->thread;
3421+ struct physdev_set_iobitmap set_iobitmap;
3422+
3423+ if ((from + num <= from) || (from + num > IO_BITMAP_BITS))
3424+ return -EINVAL;
3425+ if (turn_on && !capable(CAP_SYS_RAWIO))
3426+ return -EPERM;
3427+
3428+ /*
3429+ * If it's the first ioperm() call in this thread's lifetime, set the
3430+ * IO bitmap up. ioperm() is much less timing critical than clone(),
3431+ * this is why we delay this operation until now:
3432+ */
3433+ if (!t->io_bitmap_ptr) {
3434+ unsigned long *bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
3435+
3436+ if (!bitmap)
3437+ return -ENOMEM;
3438+
3439+ memset(bitmap, 0xff, IO_BITMAP_BYTES);
3440+ t->io_bitmap_ptr = bitmap;
3441+ set_thread_flag(TIF_IO_BITMAP);
3442+
3443+ set_xen_guest_handle(set_iobitmap.bitmap, (char *)bitmap);
3444+ set_iobitmap.nr_ports = IO_BITMAP_BITS;
3445+ WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap,
3446+ &set_iobitmap));
3447+ }
3448+
3449+ set_bitmap(t->io_bitmap_ptr, from, num, !turn_on);
3450+
3451+ return 0;
3452+}
3453+
3454+/*
3455+ * sys_iopl has to be used when you want to access the IO ports
3456+ * beyond the 0x3ff range: to get the full 65536 ports bitmapped
3457+ * you'd need 8kB of bitmaps/process, which is a bit excessive.
3458+ */
3459+static int do_iopl(unsigned int level, struct thread_struct *t)
3460+{
3461+ unsigned int old = t->iopl >> 12;
3462+
3463+ if (level > 3)
3464+ return -EINVAL;
3465+ /* Trying to gain more privileges? */
3466+ if (level > old) {
3467+ if (!capable(CAP_SYS_RAWIO))
3468+ return -EPERM;
3469+ }
3470+
3471+ return 0;
3472+}
3473+
3474+#ifdef CONFIG_X86_32
3475+asmlinkage long sys_iopl(unsigned long regsp)
3476+{
3477+ struct pt_regs *regs = (struct pt_regs *)&regsp;
3478+ unsigned int level = regs->bx;
3479+#else
3480+asmlinkage long sys_iopl(unsigned int level, struct pt_regs *regs)
3481+{
3482+#endif
3483+ struct thread_struct *t = &current->thread;
3484+ int rc;
3485+
3486+ rc = do_iopl(level, t);
3487+ if (rc < 0)
3488+ goto out;
3489+
3490+ t->iopl = level << 12;
3491+ set_iopl_mask(t->iopl);
3492+out:
3493+ return rc;
3494+}
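
The unified set_bitmap() above replaces the word-at-a-time masking still visible in the deleted 32-bit file below with a plain per-bit loop over __set_bit()/__clear_bit(). A stand-alone rendition of that per-bit approach using ordinary C bit operations (note that sys_ioperm() passes !turn_on into a bitmap preset to 0xff, so a cleared bit is what grants access):

#include <stdio.h>
#include <limits.h>

#define IO_BITMAP_BITS	65536
#define BITS_PER_LONG	(CHAR_BIT * sizeof(unsigned long))

static unsigned long io_bitmap[IO_BITMAP_BITS / BITS_PER_LONG];

/* set or clear EXTENT bits starting at BASE, one bit at a time */
static void set_bitmap(unsigned long *bitmap, unsigned int base,
		       unsigned int extent, int new_value)
{
	unsigned int i;

	for (i = base; i < base + extent; i++) {
		if (new_value)
			bitmap[i / BITS_PER_LONG] |= 1UL << (i % BITS_PER_LONG);
		else
			bitmap[i / BITS_PER_LONG] &= ~(1UL << (i % BITS_PER_LONG));
	}
}

int main(void)
{
	/* mark 8 ports starting at 0x3f8, just to show the mechanics */
	set_bitmap(io_bitmap, 0x3f8, 8, 1);
	printf("word %u = %#lx\n", (unsigned)(0x3f8 / BITS_PER_LONG),
	       io_bitmap[0x3f8 / BITS_PER_LONG]);
	return 0;
}

The simpler loop is a fair trade here: ioperm() is nowhere near a hot path, and the per-bit form works unchanged for both 32- and 64-bit word sizes, which is the point of the file unification.
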
3495--- sle11-2009-06-29.orig/arch/x86/kernel/ioport_32-xen.c 2009-02-16 16:18:36.000000000 +0100
3496+++ /dev/null 1970-01-01 00:00:00.000000000 +0000
3497@@ -1,121 +0,0 @@
3498-/*
3499- * This contains the io-permission bitmap code - written by obz, with changes
3500- * by Linus.
3501- */
3502-
3503-#include <linux/sched.h>
3504-#include <linux/kernel.h>
3505-#include <linux/capability.h>
3506-#include <linux/errno.h>
3507-#include <linux/types.h>
3508-#include <linux/ioport.h>
3509-#include <linux/smp.h>
3510-#include <linux/stddef.h>
3511-#include <linux/slab.h>
3512-#include <linux/thread_info.h>
3513-#include <linux/syscalls.h>
3514-#include <xen/interface/physdev.h>
3515-
3516-/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */
3517-static void set_bitmap(unsigned long *bitmap, unsigned int base, unsigned int extent, int new_value)
3518-{
3519- unsigned long mask;
3520- unsigned long *bitmap_base = bitmap + (base / BITS_PER_LONG);
3521- unsigned int low_index = base & (BITS_PER_LONG-1);
3522- int length = low_index + extent;
3523-
3524- if (low_index != 0) {
3525- mask = (~0UL << low_index);
3526- if (length < BITS_PER_LONG)
3527- mask &= ~(~0UL << length);
3528- if (new_value)
3529- *bitmap_base++ |= mask;
3530- else
3531- *bitmap_base++ &= ~mask;
3532- length -= BITS_PER_LONG;
3533- }
3534-
3535- mask = (new_value ? ~0UL : 0UL);
3536- while (length >= BITS_PER_LONG) {
3537- *bitmap_base++ = mask;
3538- length -= BITS_PER_LONG;
3539- }
3540-
3541- if (length > 0) {
3542- mask = ~(~0UL << length);
3543- if (new_value)
3544- *bitmap_base++ |= mask;
3545- else
3546- *bitmap_base++ &= ~mask;
3547- }
3548-}
3549-
3550-
3551-/*
3552- * this changes the io permissions bitmap in the current task.
3553- */
3554-asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
3555-{
3556- struct thread_struct * t = &current->thread;
3557- unsigned long *bitmap;
3558- struct physdev_set_iobitmap set_iobitmap;
3559-
3560- if ((from + num <= from) || (from + num > IO_BITMAP_BITS))
3561- return -EINVAL;
3562- if (turn_on && !capable(CAP_SYS_RAWIO))
3563- return -EPERM;
3564-
3565- /*
3566- * If it's the first ioperm() call in this thread's lifetime, set the
3567- * IO bitmap up. ioperm() is much less timing critical than clone(),
3568- * this is why we delay this operation until now:
3569- */
3570- if (!t->io_bitmap_ptr) {
3571- bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
3572- if (!bitmap)
3573- return -ENOMEM;
3574-
3575- memset(bitmap, 0xff, IO_BITMAP_BYTES);
3576- t->io_bitmap_ptr = bitmap;
3577- set_thread_flag(TIF_IO_BITMAP);
3578-
3579- set_xen_guest_handle(set_iobitmap.bitmap, (char *)bitmap);
3580- set_iobitmap.nr_ports = IO_BITMAP_BITS;
3581- WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap,
3582- &set_iobitmap));
3583- }
3584-
3585- set_bitmap(t->io_bitmap_ptr, from, num, !turn_on);
3586-
3587- return 0;
3588-}
3589-
3590-/*
3591- * sys_iopl has to be used when you want to access the IO ports
3592- * beyond the 0x3ff range: to get the full 65536 ports bitmapped
3593- * you'd need 8kB of bitmaps/process, which is a bit excessive.
3594- *
3595- * Here we just change the eflags value on the stack: we allow
3596- * only the super-user to do it. This depends on the stack-layout
3597- * on system-call entry - see also fork() and the signal handling
3598- * code.
3599- */
3600-
3601-asmlinkage long sys_iopl(unsigned long unused)
3602-{
3603- volatile struct pt_regs * regs = (struct pt_regs *) &unused;
3604- unsigned int level = regs->ebx;
3605- struct thread_struct *t = &current->thread;
3606- unsigned int old = (t->iopl >> 12) & 3;
3607-
3608- if (level > 3)
3609- return -EINVAL;
3610- /* Trying to gain more privileges? */
3611- if (level > old) {
3612- if (!capable(CAP_SYS_RAWIO))
3613- return -EPERM;
3614- }
3615- t->iopl = level << 12;
3616- set_iopl_mask(t->iopl);
3617- return 0;
3618-}
3619--- sle11-2009-06-29.orig/arch/x86/kernel/ioport_64-xen.c 2009-02-16 16:18:36.000000000 +0100
3620+++ /dev/null 1970-01-01 00:00:00.000000000 +0000
3621@@ -1,99 +0,0 @@
3622-/*
3623- * This contains the io-permission bitmap code - written by obz, with changes
3624- * by Linus.
3625- */
3626-
3627-#include <linux/sched.h>
3628-#include <linux/kernel.h>
3629-#include <linux/capability.h>
3630-#include <linux/errno.h>
3631-#include <linux/types.h>
3632-#include <linux/ioport.h>
3633-#include <linux/mm.h>
3634-#include <linux/smp.h>
3635-#include <linux/stddef.h>
3636-#include <linux/slab.h>
3637-#include <linux/thread_info.h>
3638-#include <linux/syscalls.h>
3639-#include <xen/interface/physdev.h>
3640-
3641-/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */
3642-static void set_bitmap(unsigned long *bitmap, unsigned int base, unsigned int extent, int new_value)
3643-{
3644- int i;
3645-
3646- if (new_value)
3647- for (i = base; i < base + extent; i++)
3648- __set_bit(i, bitmap);
3649- else
3650- for (i = base; i < base + extent; i++)
3651- clear_bit(i, bitmap);
3652-}
3653-
3654-/*
3655- * this changes the io permissions bitmap in the current task.
3656- */
3657-asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
3658-{
3659- struct thread_struct * t = &current->thread;
3660- unsigned long *bitmap;
3661- struct physdev_set_iobitmap set_iobitmap;
3662-
3663- if ((from + num <= from) || (from + num > IO_BITMAP_BITS))
3664- return -EINVAL;
3665- if (turn_on && !capable(CAP_SYS_RAWIO))
3666- return -EPERM;
3667-
3668- /*
3669- * If it's the first ioperm() call in this thread's lifetime, set the
3670- * IO bitmap up. ioperm() is much less timing critical than clone(),
3671- * this is why we delay this operation until now:
3672- */
3673- if (!t->io_bitmap_ptr) {
3674- bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
3675- if (!bitmap)
3676- return -ENOMEM;
3677-
3678- memset(bitmap, 0xff, IO_BITMAP_BYTES);
3679- t->io_bitmap_ptr = bitmap;
3680- set_thread_flag(TIF_IO_BITMAP);
3681-
3682- set_xen_guest_handle(set_iobitmap.bitmap, (char *)bitmap);
3683- set_iobitmap.nr_ports = IO_BITMAP_BITS;
3684- WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap,
3685- &set_iobitmap));
3686- }
3687-
3688- set_bitmap(t->io_bitmap_ptr, from, num, !turn_on);
3689-
3690- return 0;
3691-}
3692-
3693-/*
3694- * sys_iopl has to be used when you want to access the IO ports
3695- * beyond the 0x3ff range: to get the full 65536 ports bitmapped
3696- * you'd need 8kB of bitmaps/process, which is a bit excessive.
3697- *
3698- */
3699-
3700-asmlinkage long sys_iopl(unsigned int new_iopl, struct pt_regs *regs)
3701-{
3702- unsigned int old_iopl = current->thread.iopl;
3703- struct physdev_set_iopl set_iopl;
3704-
3705- if (new_iopl > 3)
3706- return -EINVAL;
3707-
3708- /* Need "raw I/O" privileges for direct port access. */
3709- if ((new_iopl > old_iopl) && !capable(CAP_SYS_RAWIO))
3710- return -EPERM;
3711-
3712- /* Change our version of the privilege levels. */
3713- current->thread.iopl = new_iopl;
3714-
3715- /* Force the change at ring 0. */
3716- set_iopl.iopl = (new_iopl == 0) ? 1 : new_iopl;
3717- WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl));
3718-
3719- return 0;
3720-}
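
The set_bitmap() helper in the file removed above is small enough to demonstrate standalone. Below is a plain-C sketch (an assumption, not kernel code) of the same bit manipulation; note the inverted sense: ioperm(turn_on=1) ends up clearing bits, because a 0 bit in the TSS-style I/O bitmap means the port is allowed.

#include <stdio.h>
#include <string.h>
#include <limits.h>

#define IO_BITMAP_BITS  65536
#define BITS_PER_LONG   (sizeof(unsigned long) * CHAR_BIT)
#define LONGS           (IO_BITMAP_BITS / BITS_PER_LONG)

static void set_bitmap(unsigned long *bitmap, unsigned int base,
                       unsigned int extent, int new_value)
{
        for (unsigned int i = base; i < base + extent; i++) {
                unsigned long mask = 1UL << (i % BITS_PER_LONG);
                unsigned long *word = &bitmap[i / BITS_PER_LONG];

                if (new_value)
                        *word |= mask;          /* deny the port  */
                else
                        *word &= ~mask;         /* allow the port */
        }
}

int main(void)
{
        static unsigned long bitmap[LONGS];

        memset(bitmap, 0xff, sizeof(bitmap));   /* start with all ports denied */
        set_bitmap(bitmap, 0x80, 1, 0);         /* ioperm(0x80, 1, 1) -> allow  */

        printf("port 0x80 %s\n",
               (bitmap[0x80 / BITS_PER_LONG] >> (0x80 % BITS_PER_LONG)) & 1 ?
               "denied" : "allowed");
        return 0;
}
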
3721--- sle11-2009-06-29.orig/arch/x86/kernel/irq_32-xen.c 2009-02-16 16:18:36.000000000 +0100
3722+++ sle11-2009-06-29/arch/x86/kernel/irq_32-xen.c 2009-03-16 16:33:40.000000000 +0100
3723@@ -66,11 +66,11 @@ static union irq_ctx *softirq_ctx[NR_CPU
3724 * SMP cross-CPU interrupts have their own specific
3725 * handlers).
3726 */
3727-fastcall unsigned int do_IRQ(struct pt_regs *regs)
3728+unsigned int do_IRQ(struct pt_regs *regs)
3729 {
3730 struct pt_regs *old_regs;
3731 /* high bit used in ret_from_ code */
3732- int irq = ~regs->orig_eax;
3733+ int irq = ~regs->orig_ax;
3734 struct irq_desc *desc = irq_desc + irq;
3735 #ifdef CONFIG_4KSTACKS
3736 union irq_ctx *curctx, *irqctx;
3737@@ -88,13 +88,13 @@ fastcall unsigned int do_IRQ(struct pt_r
3738 #ifdef CONFIG_DEBUG_STACKOVERFLOW
3739 /* Debugging check for stack overflow: is there less than 1KB free? */
3740 {
3741- long esp;
3742+ long sp;
3743
3744 __asm__ __volatile__("andl %%esp,%0" :
3745- "=r" (esp) : "0" (THREAD_SIZE - 1));
3746- if (unlikely(esp < (sizeof(struct thread_info) + STACK_WARN))) {
3747+ "=r" (sp) : "0" (THREAD_SIZE - 1));
3748+ if (unlikely(sp < (sizeof(struct thread_info) + STACK_WARN))) {
3749 printk("do_IRQ: stack overflow: %ld\n",
3750- esp - sizeof(struct thread_info));
3751+ sp - sizeof(struct thread_info));
3752 dump_stack();
3753 }
3754 }
3755@@ -112,7 +112,7 @@ fastcall unsigned int do_IRQ(struct pt_r
3756 * current stack (which is the irq stack already after all)
3757 */
3758 if (curctx != irqctx) {
3759- int arg1, arg2, ebx;
3760+ int arg1, arg2, bx;
3761
3762 /* build the stack frame on the IRQ stack */
3763 isp = (u32*) ((char*)irqctx + sizeof(*irqctx));
3764@@ -128,10 +128,10 @@ fastcall unsigned int do_IRQ(struct pt_r
3765 (curctx->tinfo.preempt_count & SOFTIRQ_MASK);
3766
3767 asm volatile(
3768- " xchgl %%ebx,%%esp \n"
3769- " call *%%edi \n"
3770- " movl %%ebx,%%esp \n"
3771- : "=a" (arg1), "=d" (arg2), "=b" (ebx)
3772+ " xchgl %%ebx,%%esp \n"
3773+ " call *%%edi \n"
3774+ " movl %%ebx,%%esp \n"
3775+ : "=a" (arg1), "=d" (arg2), "=b" (bx)
3776 : "0" (irq), "1" (desc), "2" (isp),
3777 "D" (desc->handle_irq)
3778 : "memory", "cc"
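
The stack-overflow check in the do_IRQ() hunk above relies on the 32-bit kernel stack being THREAD_SIZE-aligned, so masking the stack pointer with THREAD_SIZE-1 directly yields the bytes still free between the stack pointer and the thread_info at the bottom of the stack. A standalone sketch of that arithmetic (THREAD_SIZE, the thread_info size, and STACK_WARN below are illustrative assumptions):

#include <stdio.h>

#define THREAD_SIZE     8192u           /* assumed stack size */
#define THREAD_INFO_SZ  64u             /* assumed sizeof(struct thread_info) */
#define STACK_WARN      (THREAD_SIZE / 8)

static int stack_low(unsigned long sp)
{
        unsigned long free_bytes = sp & (THREAD_SIZE - 1);

        return free_bytes < THREAD_INFO_SZ + STACK_WARN;
}

int main(void)
{
        printf("sp offset 0x1f80: low? %d\n", stack_low(0x1f80)); /* plenty left */
        printf("sp offset 0x0100: low? %d\n", stack_low(0x0100)); /* nearly full */
        return 0;
}
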
3779--- sle11-2009-06-29.orig/arch/x86/kernel/irq_64-xen.c 2009-02-16 16:18:36.000000000 +0100
3780+++ sle11-2009-06-29/arch/x86/kernel/irq_64-xen.c 2009-03-16 16:33:40.000000000 +0100
3781@@ -20,6 +20,28 @@
3782
3783 atomic_t irq_err_count;
3784
3785+/*
3786+ * 'what should we do if we get a hw irq event on an illegal vector'.
3787+ * each architecture has to answer this themselves.
3788+ */
3789+void ack_bad_irq(unsigned int irq)
3790+{
3791+ printk(KERN_WARNING "unexpected IRQ trap at irq %02x\n", irq);
3792+#ifdef CONFIG_X86_LOCAL_APIC
3793+ /*
3794+ * Currently unexpected vectors happen only on SMP and APIC.
3795+ * We _must_ ack these because every local APIC has only N
3796+ * irq slots per priority level, and a 'hanging, unacked' IRQ
3797+ * holds up an irq slot - in excessive cases (when multiple
3798+ * unexpected vectors occur) that might lock up the APIC
3799+ * completely.
3800+ * But don't ack when the APIC is disabled. -AK
3801+ */
3802+ if (!disable_apic)
3803+ ack_APIC_irq();
3804+#endif
3805+}
3806+
3807 #ifdef CONFIG_DEBUG_STACKOVERFLOW
3808 /*
3809 * Probabilistic stack overflow check:
3810@@ -33,11 +55,11 @@ static inline void stack_overflow_check(
3811 u64 curbase = (u64)task_stack_page(current);
3812 static unsigned long warned = -60*HZ;
3813
3814- if (regs->rsp >= curbase && regs->rsp <= curbase + THREAD_SIZE &&
3815- regs->rsp < curbase + sizeof(struct thread_info) + 128 &&
3816+ if (regs->sp >= curbase && regs->sp <= curbase + THREAD_SIZE &&
3817+ regs->sp < curbase + sizeof(struct thread_info) + 128 &&
3818 time_after(jiffies, warned + 60*HZ)) {
3819- printk("do_IRQ: %s near stack overflow (cur:%Lx,rsp:%lx)\n",
3820- current->comm, curbase, regs->rsp);
3821+ printk("do_IRQ: %s near stack overflow (cur:%Lx,sp:%lx)\n",
3822+ current->comm, curbase, regs->sp);
3823 show_stack(NULL,NULL);
3824 warned = jiffies;
3825 }
3826@@ -150,7 +172,7 @@ asmlinkage unsigned int do_IRQ(struct pt
3827 struct pt_regs *old_regs = set_irq_regs(regs);
3828
3829 /* high bit used in ret_from_ code */
3830- unsigned irq = ~regs->orig_rax;
3831+ unsigned irq = ~regs->orig_ax;
3832
3833 /*exit_idle();*/
3834 /*irq_enter();*/
3835@@ -251,14 +273,3 @@ asmlinkage void do_softirq(void)
3836 }
3837 local_irq_restore(flags);
3838 }
3839-
3840-#ifndef CONFIG_X86_LOCAL_APIC
3841-/*
3842- * 'what should we do if we get a hw irq event on an illegal vector'.
3843- * each architecture has to answer this themselves.
3844- */
3845-void ack_bad_irq(unsigned int irq)
3846-{
3847- printk("unexpected IRQ trap at irq %02x\n", irq);
3848-}
3849-#endif
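
stack_overflow_check() in the 64-bit hunk above rate-limits its warning with time_after(jiffies, warned + 60*HZ). A small sketch (not from the patch; HZ=250 is an assumption) of why that comparison stays correct across jiffies wraparound — it looks only at the sign of the difference:

#include <stdio.h>

/* same definition the kernel uses: a is "after" b if (b - a) is negative */
#define time_after(a, b)  ((long)((b) - (a)) < 0)

int main(void)
{
        unsigned long hz = 250;
        unsigned long warned = 1000;
        unsigned long jiffies = warned + hz;    /* only one second later */

        if (time_after(jiffies, warned + 60 * hz))
                printf("warn again\n");
        else
                printf("suppressed (less than 60s since last warning)\n");
        return 0;
}
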
3850--- /dev/null 1970-01-01 00:00:00.000000000 +0000
3851+++ sle11-2009-06-29/arch/x86/kernel/ldt-xen.c 2009-03-16 16:33:40.000000000 +0100
3852@@ -0,0 +1,272 @@
3853+/*
3854+ * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds
3855+ * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
3856+ * Copyright (C) 2002 Andi Kleen
3857+ *
3858+ * This handles calls from both 32bit and 64bit mode.
3859+ */
3860+
3861+#include <linux/errno.h>
3862+#include <linux/sched.h>
3863+#include <linux/string.h>
3864+#include <linux/mm.h>
3865+#include <linux/smp.h>
3866+#include <linux/vmalloc.h>
3867+
3868+#include <asm/uaccess.h>
3869+#include <asm/system.h>
3870+#include <asm/ldt.h>
3871+#include <asm/desc.h>
3872+#include <asm/mmu_context.h>
3873+
3874+#ifdef CONFIG_SMP
3875+static void flush_ldt(void *null)
3876+{
3877+ if (current->active_mm)
3878+ load_LDT(&current->active_mm->context);
3879+}
3880+#endif
3881+
3882+static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
3883+{
3884+ void *oldldt, *newldt;
3885+ int oldsize;
3886+
3887+ if (mincount <= pc->size)
3888+ return 0;
3889+ oldsize = pc->size;
3890+ mincount = (mincount + (PAGE_SIZE / LDT_ENTRY_SIZE - 1)) &
3891+ (~(PAGE_SIZE / LDT_ENTRY_SIZE - 1));
3892+ if (mincount * LDT_ENTRY_SIZE > PAGE_SIZE)
3893+ newldt = vmalloc(mincount * LDT_ENTRY_SIZE);
3894+ else
3895+ newldt = (void *)__get_free_page(GFP_KERNEL);
3896+
3897+ if (!newldt)
3898+ return -ENOMEM;
3899+
3900+ if (oldsize)
3901+ memcpy(newldt, pc->ldt, oldsize * LDT_ENTRY_SIZE);
3902+ oldldt = pc->ldt;
3903+ memset(newldt + oldsize * LDT_ENTRY_SIZE, 0,
3904+ (mincount - oldsize) * LDT_ENTRY_SIZE);
3905+
3906+#ifdef CONFIG_X86_64
3907+ /* CHECKME: Do we really need this ? */
3908+ wmb();
3909+#endif
3910+ pc->ldt = newldt;
3911+ wmb();
3912+ pc->size = mincount;
3913+ wmb();
3914+
3915+ if (reload) {
3916+#ifdef CONFIG_SMP
3917+ cpumask_t mask;
3918+
3919+ preempt_disable();
3920+#endif
3921+ make_pages_readonly(newldt,
3922+ (mincount * LDT_ENTRY_SIZE) / PAGE_SIZE,
3923+ XENFEAT_writable_descriptor_tables);
3924+ load_LDT(pc);
3925+#ifdef CONFIG_SMP
3926+ mask = cpumask_of_cpu(smp_processor_id());
3927+ if (!cpus_equal(current->mm->cpu_vm_mask, mask))
3928+ smp_call_function(flush_ldt, NULL, 1, 1);
3929+ preempt_enable();
3930+#endif
3931+ }
3932+ if (oldsize) {
3933+ make_pages_writable(oldldt,
3934+ (oldsize * LDT_ENTRY_SIZE) / PAGE_SIZE,
3935+ XENFEAT_writable_descriptor_tables);
3936+ if (oldsize * LDT_ENTRY_SIZE > PAGE_SIZE)
3937+ vfree(oldldt);
3938+ else
3939+ put_page(virt_to_page(oldldt));
3940+ }
3941+ return 0;
3942+}
3943+
3944+static inline int copy_ldt(mm_context_t *new, mm_context_t *old)
3945+{
3946+ int err = alloc_ldt(new, old->size, 0);
3947+
3948+ if (err < 0)
3949+ return err;
3950+ memcpy(new->ldt, old->ldt, old->size * LDT_ENTRY_SIZE);
3951+ make_pages_readonly(new->ldt,
3952+ (new->size * LDT_ENTRY_SIZE) / PAGE_SIZE,
3953+ XENFEAT_writable_descriptor_tables);
3954+ return 0;
3955+}
3956+
3957+/*
3958+ * we do not have to muck with descriptors here, that is
3959+ * done in switch_mm() as needed.
3960+ */
3961+int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
3962+{
3963+ struct mm_struct *old_mm;
3964+ int retval = 0;
3965+
3966+ memset(&mm->context, 0, sizeof(mm->context));
3967+ mutex_init(&mm->context.lock);
3968+ old_mm = current->mm;
3969+ if (old_mm)
3970+ mm->context.vdso = old_mm->context.vdso;
3971+ if (old_mm && old_mm->context.size > 0) {
3972+ mutex_lock(&old_mm->context.lock);
3973+ retval = copy_ldt(&mm->context, &old_mm->context);
3974+ mutex_unlock(&old_mm->context.lock);
3975+ }
3976+ return retval;
3977+}
3978+
3979+/*
3980+ * No need to lock the MM as we are the last user
3981+ *
3982+ * 64bit: Don't touch the LDT register - we're already in the next thread.
3983+ */
3984+void destroy_context(struct mm_struct *mm)
3985+{
3986+ if (mm->context.size) {
3987+ /* CHECKME: Can this ever happen ? */
3988+ if (mm == current->active_mm)
3989+ clear_LDT();
3990+ make_pages_writable(mm->context.ldt,
3991+ (mm->context.size * LDT_ENTRY_SIZE) / PAGE_SIZE,
3992+ XENFEAT_writable_descriptor_tables);
3993+ if (mm->context.size * LDT_ENTRY_SIZE > PAGE_SIZE)
3994+ vfree(mm->context.ldt);
3995+ else
3996+ put_page(virt_to_page(mm->context.ldt));
3997+ mm->context.size = 0;
3998+ }
3999+}
4000+
4001+static int read_ldt(void __user *ptr, unsigned long bytecount)
4002+{
4003+ int err;
4004+ unsigned long size;
4005+ struct mm_struct *mm = current->mm;
4006+
4007+ if (!mm->context.size)
4008+ return 0;
4009+ if (bytecount > LDT_ENTRY_SIZE * LDT_ENTRIES)
4010+ bytecount = LDT_ENTRY_SIZE * LDT_ENTRIES;
4011+
4012+ mutex_lock(&mm->context.lock);
4013+ size = mm->context.size * LDT_ENTRY_SIZE;
4014+ if (size > bytecount)
4015+ size = bytecount;
4016+
4017+ err = 0;
4018+ if (copy_to_user(ptr, mm->context.ldt, size))
4019+ err = -EFAULT;
4020+ mutex_unlock(&mm->context.lock);
4021+ if (err < 0)
4022+ goto error_return;
4023+ if (size != bytecount) {
4024+ /* zero-fill the rest */
4025+ if (clear_user(ptr + size, bytecount - size) != 0) {
4026+ err = -EFAULT;
4027+ goto error_return;
4028+ }
4029+ }
4030+ return bytecount;
4031+error_return:
4032+ return err;
4033+}
4034+
4035+static int read_default_ldt(void __user *ptr, unsigned long bytecount)
4036+{
4037+ /* CHECKME: Can we use _one_ random number ? */
4038+#ifdef CONFIG_X86_32
4039+ unsigned long size = 5 * sizeof(struct desc_struct);
4040+#else
4041+ unsigned long size = 128;
4042+#endif
4043+ if (bytecount > size)
4044+ bytecount = size;
4045+ if (clear_user(ptr, bytecount))
4046+ return -EFAULT;
4047+ return bytecount;
4048+}
4049+
4050+static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
4051+{
4052+ struct mm_struct *mm = current->mm;
4053+ struct desc_struct ldt;
4054+ int error;
4055+ struct user_desc ldt_info;
4056+
4057+ error = -EINVAL;
4058+ if (bytecount != sizeof(ldt_info))
4059+ goto out;
4060+ error = -EFAULT;
4061+ if (copy_from_user(&ldt_info, ptr, sizeof(ldt_info)))
4062+ goto out;
4063+
4064+ error = -EINVAL;
4065+ if (ldt_info.entry_number >= LDT_ENTRIES)
4066+ goto out;
4067+ if (ldt_info.contents == 3) {
4068+ if (oldmode)
4069+ goto out;
4070+ if (ldt_info.seg_not_present == 0)
4071+ goto out;
4072+ }
4073+
4074+ mutex_lock(&mm->context.lock);
4075+ if (ldt_info.entry_number >= mm->context.size) {
4076+ error = alloc_ldt(&current->mm->context,
4077+ ldt_info.entry_number + 1, 1);
4078+ if (error < 0)
4079+ goto out_unlock;
4080+ }
4081+
4082+ /* Allow LDTs to be cleared by the user. */
4083+ if (ldt_info.base_addr == 0 && ldt_info.limit == 0) {
4084+ if (oldmode || LDT_empty(&ldt_info)) {
4085+ memset(&ldt, 0, sizeof(ldt));
4086+ goto install;
4087+ }
4088+ }
4089+
4090+ fill_ldt(&ldt, &ldt_info);
4091+ if (oldmode)
4092+ ldt.avl = 0;
4093+
4094+ /* Install the new entry ... */
4095+install:
4096+ error = write_ldt_entry(mm->context.ldt, ldt_info.entry_number, &ldt);
4097+
4098+out_unlock:
4099+ mutex_unlock(&mm->context.lock);
4100+out:
4101+ return error;
4102+}
4103+
4104+asmlinkage int sys_modify_ldt(int func, void __user *ptr,
4105+ unsigned long bytecount)
4106+{
4107+ int ret = -ENOSYS;
4108+
4109+ switch (func) {
4110+ case 0:
4111+ ret = read_ldt(ptr, bytecount);
4112+ break;
4113+ case 1:
4114+ ret = write_ldt(ptr, bytecount, 1);
4115+ break;
4116+ case 2:
4117+ ret = read_default_ldt(ptr, bytecount);
4118+ break;
4119+ case 0x11:
4120+ ret = write_ldt(ptr, bytecount, 0);
4121+ break;
4122+ }
4123+ return ret;
4124+}
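
A userspace sketch (not part of the patch) exercising the sys_modify_ldt() dispatcher added above: func 0x11 installs a descriptor through write_ldt() in new mode, and func 0 reads the raw LDT back through read_ldt(). The buffer used as the segment base is an illustrative assumption, and the cast assumes a 32-bit build so the address fits the 32-bit base_addr field.

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <asm/ldt.h>            /* struct user_desc, MODIFY_LDT_CONTENTS_DATA */

static char segment_data[4096];

int main(void)
{
        struct user_desc ud;

        memset(&ud, 0, sizeof(ud));
        ud.entry_number   = 0;
        ud.base_addr      = (unsigned int)(unsigned long)segment_data;
        ud.limit          = sizeof(segment_data) - 1;
        ud.seg_32bit      = 1;
        ud.contents       = MODIFY_LDT_CONTENTS_DATA;
        ud.read_exec_only = 0;

        /* func 0x11: write_ldt(..., oldmode = 0) */
        if (syscall(SYS_modify_ldt, 0x11, &ud, sizeof(ud)) != 0) {
                perror("modify_ldt");
                return 1;
        }

        /* func 0: read_ldt() copies the raw 8-byte descriptors back out */
        unsigned char raw[8];
        long n = syscall(SYS_modify_ldt, 0, raw, sizeof(raw));
        printf("read_ldt returned %ld bytes\n", n);
        return 0;
}
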
4125--- sle11-2009-06-29.orig/arch/x86/kernel/ldt_32-xen.c 2009-02-16 16:18:36.000000000 +0100
4126+++ /dev/null 1970-01-01 00:00:00.000000000 +0000
4127@@ -1,265 +0,0 @@
4128-/*
4129- * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds
4130- * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
4131- */
4132-
4133-#include <linux/errno.h>
4134-#include <linux/sched.h>
4135-#include <linux/string.h>
4136-#include <linux/mm.h>
4137-#include <linux/smp.h>
4138-#include <linux/vmalloc.h>
4139-#include <linux/slab.h>
4140-
4141-#include <asm/uaccess.h>
4142-#include <asm/system.h>
4143-#include <asm/ldt.h>
4144-#include <asm/desc.h>
4145-#include <asm/mmu_context.h>
4146-
4147-#ifdef CONFIG_SMP /* avoids "defined but not used" warnig */
4148-static void flush_ldt(void *null)
4149-{
4150- if (current->active_mm)
4151- load_LDT(&current->active_mm->context);
4152-}
4153-#endif
4154-
4155-static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
4156-{
4157- void *oldldt;
4158- void *newldt;
4159- int oldsize;
4160-
4161- if (mincount <= pc->size)
4162- return 0;
4163- oldsize = pc->size;
4164- mincount = (mincount+511)&(~511);
4165- if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE)
4166- newldt = vmalloc(mincount*LDT_ENTRY_SIZE);
4167- else
4168- newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL);
4169-
4170- if (!newldt)
4171- return -ENOMEM;
4172-
4173- if (oldsize)
4174- memcpy(newldt, pc->ldt, oldsize*LDT_ENTRY_SIZE);
4175- oldldt = pc->ldt;
4176- memset(newldt+oldsize*LDT_ENTRY_SIZE, 0, (mincount-oldsize)*LDT_ENTRY_SIZE);
4177- pc->ldt = newldt;
4178- wmb();
4179- pc->size = mincount;
4180- wmb();
4181-
4182- if (reload) {
4183-#ifdef CONFIG_SMP
4184- cpumask_t mask;
4185- preempt_disable();
4186-#endif
4187- make_pages_readonly(
4188- pc->ldt,
4189- (pc->size * LDT_ENTRY_SIZE) / PAGE_SIZE,
4190- XENFEAT_writable_descriptor_tables);
4191- load_LDT(pc);
4192-#ifdef CONFIG_SMP
4193- mask = cpumask_of_cpu(smp_processor_id());
4194- if (!cpus_equal(current->mm->cpu_vm_mask, mask))
4195- smp_call_function(flush_ldt, NULL, 1, 1);
4196- preempt_enable();
4197-#endif
4198- }
4199- if (oldsize) {
4200- make_pages_writable(
4201- oldldt,
4202- (oldsize * LDT_ENTRY_SIZE) / PAGE_SIZE,
4203- XENFEAT_writable_descriptor_tables);
4204- if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE)
4205- vfree(oldldt);
4206- else
4207- kfree(oldldt);
4208- }
4209- return 0;
4210-}
4211-
4212-static inline int copy_ldt(mm_context_t *new, mm_context_t *old)
4213-{
4214- int err = alloc_ldt(new, old->size, 0);
4215- if (err < 0)
4216- return err;
4217- memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE);
4218- make_pages_readonly(
4219- new->ldt,
4220- (new->size * LDT_ENTRY_SIZE) / PAGE_SIZE,
4221- XENFEAT_writable_descriptor_tables);
4222- return 0;
4223-}
4224-
4225-/*
4226- * we do not have to muck with descriptors here, that is
4227- * done in switch_mm() as needed.
4228- */
4229-int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
4230-{
4231- struct mm_struct * old_mm;
4232- int retval = 0;
4233-
4234- mutex_init(&mm->context.lock);
4235- mm->context.size = 0;
4236- mm->context.has_foreign_mappings = 0;
4237- old_mm = current->mm;
4238- if (old_mm && old_mm->context.size > 0) {
4239- mutex_lock(&old_mm->context.lock);
4240- retval = copy_ldt(&mm->context, &old_mm->context);
4241- mutex_unlock(&old_mm->context.lock);
4242- }
4243- return retval;
4244-}
4245-
4246-/*
4247- * No need to lock the MM as we are the last user
4248- */
4249-void destroy_context(struct mm_struct *mm)
4250-{
4251- if (mm->context.size) {
4252- if (mm == current->active_mm)
4253- clear_LDT();
4254- make_pages_writable(
4255- mm->context.ldt,
4256- (mm->context.size * LDT_ENTRY_SIZE) / PAGE_SIZE,
4257- XENFEAT_writable_descriptor_tables);
4258- if (mm->context.size*LDT_ENTRY_SIZE > PAGE_SIZE)
4259- vfree(mm->context.ldt);
4260- else
4261- kfree(mm->context.ldt);
4262- mm->context.size = 0;
4263- }
4264-}
4265-
4266-static int read_ldt(void __user * ptr, unsigned long bytecount)
4267-{
4268- int err;
4269- unsigned long size;
4270- struct mm_struct * mm = current->mm;
4271-
4272- if (!mm->context.size)
4273- return 0;
4274- if (bytecount > LDT_ENTRY_SIZE*LDT_ENTRIES)
4275- bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES;
4276-
4277- mutex_lock(&mm->context.lock);
4278- size = mm->context.size*LDT_ENTRY_SIZE;
4279- if (size > bytecount)
4280- size = bytecount;
4281-
4282- err = 0;
4283- if (copy_to_user(ptr, mm->context.ldt, size))
4284- err = -EFAULT;
4285- mutex_unlock(&mm->context.lock);
4286- if (err < 0)
4287- goto error_return;
4288- if (size != bytecount) {
4289- /* zero-fill the rest */
4290- if (clear_user(ptr+size, bytecount-size) != 0) {
4291- err = -EFAULT;
4292- goto error_return;
4293- }
4294- }
4295- return bytecount;
4296-error_return:
4297- return err;
4298-}
4299-
4300-static int read_default_ldt(void __user * ptr, unsigned long bytecount)
4301-{
4302- int err;
4303- unsigned long size;
4304-
4305- err = 0;
4306- size = 5*sizeof(struct desc_struct);
4307- if (size > bytecount)
4308- size = bytecount;
4309-
4310- err = size;
4311- if (clear_user(ptr, size))
4312- err = -EFAULT;
4313-
4314- return err;
4315-}
4316-
4317-static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode)
4318-{
4319- struct mm_struct * mm = current->mm;
4320- __u32 entry_1, entry_2;
4321- int error;
4322- struct user_desc ldt_info;
4323-
4324- error = -EINVAL;
4325- if (bytecount != sizeof(ldt_info))
4326- goto out;
4327- error = -EFAULT;
4328- if (copy_from_user(&ldt_info, ptr, sizeof(ldt_info)))
4329- goto out;
4330-
4331- error = -EINVAL;
4332- if (ldt_info.entry_number >= LDT_ENTRIES)
4333- goto out;
4334- if (ldt_info.contents == 3) {
4335- if (oldmode)
4336- goto out;
4337- if (ldt_info.seg_not_present == 0)
4338- goto out;
4339- }
4340-
4341- mutex_lock(&mm->context.lock);
4342- if (ldt_info.entry_number >= mm->context.size) {
4343- error = alloc_ldt(&current->mm->context, ldt_info.entry_number+1, 1);
4344- if (error < 0)
4345- goto out_unlock;
4346- }
4347-
4348- /* Allow LDTs to be cleared by the user. */
4349- if (ldt_info.base_addr == 0 && ldt_info.limit == 0) {
4350- if (oldmode || LDT_empty(&ldt_info)) {
4351- entry_1 = 0;
4352- entry_2 = 0;
4353- goto install;
4354- }
4355- }
4356-
4357- entry_1 = LDT_entry_a(&ldt_info);
4358- entry_2 = LDT_entry_b(&ldt_info);
4359- if (oldmode)
4360- entry_2 &= ~(1 << 20);
4361-
4362- /* Install the new entry ... */
4363-install:
4364- error = write_ldt_entry(mm->context.ldt, ldt_info.entry_number,
4365- entry_1, entry_2);
4366-
4367-out_unlock:
4368- mutex_unlock(&mm->context.lock);
4369-out:
4370- return error;
4371-}
4372-
4373-asmlinkage int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount)
4374-{
4375- int ret = -ENOSYS;
4376-
4377- switch (func) {
4378- case 0:
4379- ret = read_ldt(ptr, bytecount);
4380- break;
4381- case 1:
4382- ret = write_ldt(ptr, bytecount, 1);
4383- break;
4384- case 2:
4385- ret = read_default_ldt(ptr, bytecount);
4386- break;
4387- case 0x11:
4388- ret = write_ldt(ptr, bytecount, 0);
4389- break;
4390- }
4391- return ret;
4392-}
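
Both the removed 32-bit alloc_ldt() above and its replacement in ldt-xen.c round the requested entry count up to a whole page of 8-byte descriptors; with a 4096-byte page and LDT_ENTRY_SIZE of 8 that is 512 entries, which is where the old (mincount+511)&(~511) comes from. A worked sketch of that arithmetic (standalone, values not taken from the patch):

#include <stdio.h>

#define PAGE_SIZE       4096u
#define LDT_ENTRY_SIZE  8u

static unsigned int round_up_entries(unsigned int mincount)
{
        unsigned int per_page = PAGE_SIZE / LDT_ENTRY_SIZE;     /* 512 */

        return (mincount + per_page - 1) & ~(per_page - 1);
}

int main(void)
{
        printf("%u -> %u entries\n", 1u,   round_up_entries(1));    /* 512  */
        printf("%u -> %u entries\n", 513u, round_up_entries(513));  /* 1024 */
        return 0;
}
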
4393--- sle11-2009-06-29.orig/arch/x86/kernel/ldt_64-xen.c 2009-02-16 16:18:36.000000000 +0100
4394+++ /dev/null 1970-01-01 00:00:00.000000000 +0000
4395@@ -1,271 +0,0 @@
4396-/*
4397- * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds
4398- * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
4399- * Copyright (C) 2002 Andi Kleen
4400- *
4401- * This handles calls from both 32bit and 64bit mode.
4402- */
4403-
4404-#include <linux/errno.h>
4405-#include <linux/sched.h>
4406-#include <linux/string.h>
4407-#include <linux/mm.h>
4408-#include <linux/smp.h>
4409-#include <linux/vmalloc.h>
4410-#include <linux/slab.h>
4411-
4412-#include <asm/uaccess.h>
4413-#include <asm/system.h>
4414-#include <asm/ldt.h>
4415-#include <asm/desc.h>
4416-#include <asm/proto.h>
4417-#include <asm/pgalloc.h>
4418-
4419-#ifdef CONFIG_SMP /* avoids "defined but not used" warnig */
4420-static void flush_ldt(void *null)
4421-{
4422- if (current->active_mm)
4423- load_LDT(&current->active_mm->context);
4424-}
4425-#endif
4426-
4427-static int alloc_ldt(mm_context_t *pc, unsigned mincount, int reload)
4428-{
4429- void *oldldt;
4430- void *newldt;
4431- unsigned oldsize;
4432-
4433- if (mincount <= (unsigned)pc->size)
4434- return 0;
4435- oldsize = pc->size;
4436- mincount = (mincount+511)&(~511);
4437- if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE)
4438- newldt = vmalloc(mincount*LDT_ENTRY_SIZE);
4439- else
4440- newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL);
4441-
4442- if (!newldt)
4443- return -ENOMEM;
4444-
4445- if (oldsize)
4446- memcpy(newldt, pc->ldt, oldsize*LDT_ENTRY_SIZE);
4447- oldldt = pc->ldt;
4448- memset(newldt+oldsize*LDT_ENTRY_SIZE, 0, (mincount-oldsize)*LDT_ENTRY_SIZE);
4449- wmb();
4450- pc->ldt = newldt;
4451- wmb();
4452- pc->size = mincount;
4453- wmb();
4454- if (reload) {
4455-#ifdef CONFIG_SMP
4456- cpumask_t mask;
4457-
4458- preempt_disable();
4459-#endif
4460- make_pages_readonly(
4461- pc->ldt,
4462- (pc->size * LDT_ENTRY_SIZE) / PAGE_SIZE,
4463- XENFEAT_writable_descriptor_tables);
4464- load_LDT(pc);
4465-#ifdef CONFIG_SMP
4466- mask = cpumask_of_cpu(smp_processor_id());
4467- if (!cpus_equal(current->mm->cpu_vm_mask, mask))
4468- smp_call_function(flush_ldt, NULL, 1, 1);
4469- preempt_enable();
4470-#endif
4471- }
4472- if (oldsize) {
4473- make_pages_writable(
4474- oldldt,
4475- (oldsize * LDT_ENTRY_SIZE) / PAGE_SIZE,
4476- XENFEAT_writable_descriptor_tables);
4477- if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE)
4478- vfree(oldldt);
4479- else
4480- kfree(oldldt);
4481- }
4482- return 0;
4483-}
4484-
4485-static inline int copy_ldt(mm_context_t *new, mm_context_t *old)
4486-{
4487- int err = alloc_ldt(new, old->size, 0);
4488- if (err < 0)
4489- return err;
4490- memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE);
4491- make_pages_readonly(
4492- new->ldt,
4493- (new->size * LDT_ENTRY_SIZE) / PAGE_SIZE,
4494- XENFEAT_writable_descriptor_tables);
4495- return 0;
4496-}
4497-
4498-/*
4499- * we do not have to muck with descriptors here, that is
4500- * done in switch_mm() as needed.
4501- */
4502-int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
4503-{
4504- struct mm_struct * old_mm;
4505- int retval = 0;
4506-
4507- memset(&mm->context, 0, sizeof(mm->context));
4508- mutex_init(&mm->context.lock);
4509- old_mm = current->mm;
4510- if (old_mm)
4511- mm->context.vdso = old_mm->context.vdso;
4512- if (old_mm && old_mm->context.size > 0) {
4513- mutex_lock(&old_mm->context.lock);
4514- retval = copy_ldt(&mm->context, &old_mm->context);
4515- mutex_unlock(&old_mm->context.lock);
4516- }
4517- return retval;
4518-}
4519-
4520-/*
4521- *
4522- * Don't touch the LDT register - we're already in the next thread.
4523- */
4524-void destroy_context(struct mm_struct *mm)
4525-{
4526- if (mm->context.size) {
4527- if (mm == current->active_mm)
4528- clear_LDT();
4529- make_pages_writable(
4530- mm->context.ldt,
4531- (mm->context.size * LDT_ENTRY_SIZE) / PAGE_SIZE,
4532- XENFEAT_writable_descriptor_tables);
4533- if (mm->context.size*LDT_ENTRY_SIZE > PAGE_SIZE)
4534- vfree(mm->context.ldt);
4535- else
4536- kfree(mm->context.ldt);
4537- mm->context.size = 0;
4538- }
4539-}
4540-
4541-static int read_ldt(void __user * ptr, unsigned long bytecount)
4542-{
4543- int err;
4544- unsigned long size;
4545- struct mm_struct * mm = current->mm;
4546-
4547- if (!mm->context.size)
4548- return 0;
4549- if (bytecount > LDT_ENTRY_SIZE*LDT_ENTRIES)
4550- bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES;
4551-
4552- mutex_lock(&mm->context.lock);
4553- size = mm->context.size*LDT_ENTRY_SIZE;
4554- if (size > bytecount)
4555- size = bytecount;
4556-
4557- err = 0;
4558- if (copy_to_user(ptr, mm->context.ldt, size))
4559- err = -EFAULT;
4560- mutex_unlock(&mm->context.lock);
4561- if (err < 0)
4562- goto error_return;
4563- if (size != bytecount) {
4564- /* zero-fill the rest */
4565- if (clear_user(ptr+size, bytecount-size) != 0) {
4566- err = -EFAULT;
4567- goto error_return;
4568- }
4569- }
4570- return bytecount;
4571-error_return:
4572- return err;
4573-}
4574-
4575-static int read_default_ldt(void __user * ptr, unsigned long bytecount)
4576-{
4577- /* Arbitrary number */
4578- /* x86-64 default LDT is all zeros */
4579- if (bytecount > 128)
4580- bytecount = 128;
4581- if (clear_user(ptr, bytecount))
4582- return -EFAULT;
4583- return bytecount;
4584-}
4585-
4586-static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode)
4587-{
4588- struct task_struct *me = current;
4589- struct mm_struct * mm = me->mm;
4590- __u32 entry_1, entry_2, *lp;
4591- unsigned long mach_lp;
4592- int error;
4593- struct user_desc ldt_info;
4594-
4595- error = -EINVAL;
4596-
4597- if (bytecount != sizeof(ldt_info))
4598- goto out;
4599- error = -EFAULT;
4600- if (copy_from_user(&ldt_info, ptr, bytecount))
4601- goto out;
4602-
4603- error = -EINVAL;
4604- if (ldt_info.entry_number >= LDT_ENTRIES)
4605- goto out;
4606- if (ldt_info.contents == 3) {
4607- if (oldmode)
4608- goto out;
4609- if (ldt_info.seg_not_present == 0)
4610- goto out;
4611- }
4612-
4613- mutex_lock(&mm->context.lock);
4614- if (ldt_info.entry_number >= (unsigned)mm->context.size) {
4615- error = alloc_ldt(&current->mm->context, ldt_info.entry_number+1, 1);
4616- if (error < 0)
4617- goto out_unlock;
4618- }
4619-
4620- lp = (__u32 *) ((ldt_info.entry_number << 3) + (char *) mm->context.ldt);
4621- mach_lp = arbitrary_virt_to_machine(lp);
4622-
4623- /* Allow LDTs to be cleared by the user. */
4624- if (ldt_info.base_addr == 0 && ldt_info.limit == 0) {
4625- if (oldmode || LDT_empty(&ldt_info)) {
4626- entry_1 = 0;
4627- entry_2 = 0;
4628- goto install;
4629- }
4630- }
4631-
4632- entry_1 = LDT_entry_a(&ldt_info);
4633- entry_2 = LDT_entry_b(&ldt_info);
4634- if (oldmode)
4635- entry_2 &= ~(1 << 20);
4636-
4637- /* Install the new entry ... */
4638-install:
4639- error = HYPERVISOR_update_descriptor(mach_lp, (unsigned long)((entry_1 | (unsigned long) entry_2 << 32)));
4640-
4641-out_unlock:
4642- mutex_unlock(&mm->context.lock);
4643-out:
4644- return error;
4645-}
4646-
4647-asmlinkage int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount)
4648-{
4649- int ret = -ENOSYS;
4650-
4651- switch (func) {
4652- case 0:
4653- ret = read_ldt(ptr, bytecount);
4654- break;
4655- case 1:
4656- ret = write_ldt(ptr, bytecount, 1);
4657- break;
4658- case 2:
4659- ret = read_default_ldt(ptr, bytecount);
4660- break;
4661- case 0x11:
4662- ret = write_ldt(ptr, bytecount, 0);
4663- break;
4664- }
4665- return ret;
4666-}
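
The removed 64-bit write_ldt() above hands the descriptor to Xen as a single 64-bit value built from the two 32-bit descriptor words; the unified ldt-xen.c hides the same packing behind fill_ldt() and write_ldt_entry(). A sketch of the packing with made-up descriptor words (the values are assumptions, not real segment attributes):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        uint32_t entry_1 = 0x0000ffff;          /* low word: limit / base bits  */
        uint32_t entry_2 = 0x00cff300;          /* high word: flags / base bits */
        uint64_t desc = (uint64_t)entry_1 | ((uint64_t)entry_2 << 32);

        printf("descriptor word = 0x%016llx\n", (unsigned long long)desc);
        return 0;
}
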
4667--- sle11-2009-06-29.orig/arch/x86/kernel/machine_kexec_64.c 2008-11-25 12:35:54.000000000 +0100
4668+++ sle11-2009-06-29/arch/x86/kernel/machine_kexec_64.c 2009-03-16 16:33:40.000000000 +0100
4669@@ -300,7 +300,9 @@ void machine_kexec(struct kimage *image)
4670
4671 void arch_crash_save_vmcoreinfo(void)
4672 {
4673+#ifndef CONFIG_XEN /* could really be CONFIG_RELOCATABLE */
4674 VMCOREINFO_SYMBOL(phys_base);
4675+#endif
4676 VMCOREINFO_SYMBOL(init_level4_pgt);
4677
4678 #ifdef CONFIG_NUMA
4679--- sle11-2009-06-29.orig/arch/x86/kernel/microcode-xen.c 2009-02-16 16:17:21.000000000 +0100
4680+++ sle11-2009-06-29/arch/x86/kernel/microcode-xen.c 2009-03-16 16:33:40.000000000 +0100
4681@@ -167,7 +167,7 @@ static int request_microcode(void)
4682 }
4683
4684 op.cmd = XENPF_microcode_update;
4685- set_xen_guest_handle(op.u.microcode.data, (void *)firmware->data);
4686+ set_xen_guest_handle(op.u.microcode.data, firmware->data);
4687 op.u.microcode.length = firmware->size;
4688 error = HYPERVISOR_platform_op(&op);
4689
4690--- sle11-2009-06-29.orig/arch/x86/kernel/mpparse_32-xen.c 2009-02-16 16:18:36.000000000 +0100
4691+++ sle11-2009-06-29/arch/x86/kernel/mpparse_32-xen.c 2009-03-16 16:33:40.000000000 +0100
4692@@ -68,7 +68,7 @@ unsigned int def_to_bigsmp = 0;
4693 /* Processor that is doing the boot up */
4694 unsigned int boot_cpu_physical_apicid = -1U;
4695 /* Internal processor count */
4696-unsigned int __cpuinitdata num_processors;
4697+unsigned int num_processors;
4698
4699 /* Bitmask of physically existing CPUs */
4700 physid_mask_t phys_cpu_present_map;
4701@@ -265,7 +265,7 @@ static void __init MP_ioapic_info (struc
4702 if (!(m->mpc_flags & MPC_APIC_USABLE))
4703 return;
4704
4705- printk(KERN_INFO "I/O APIC #%d Version %d at 0x%lX.\n",
4706+ printk(KERN_INFO "I/O APIC #%d Version %d at 0x%X.\n",
4707 m->mpc_apicid, m->mpc_apicver, m->mpc_apicaddr);
4708 if (nr_ioapics >= MAX_IO_APICS) {
4709 printk(KERN_CRIT "Max # of I/O APICs (%d) exceeded (found %d).\n",
4710@@ -412,9 +412,9 @@ static int __init smp_read_mpc(struct mp
4711
4712 mps_oem_check(mpc, oem, str);
4713
4714- printk("APIC at: 0x%lX\n",mpc->mpc_lapic);
4715+ printk("APIC at: 0x%X\n", mpc->mpc_lapic);
4716
4717- /*
4718+ /*
4719 * Save the local APIC address (it might be non-default) -- but only
4720 * if we're not using ACPI.
4721 */
4722@@ -728,7 +728,7 @@ static int __init smp_scan_config (unsig
4723 unsigned long *bp = isa_bus_to_virt(base);
4724 struct intel_mp_floating *mpf;
4725
4726- Dprintk("Scan SMP from %p for %ld bytes.\n", bp,length);
4727+ printk(KERN_INFO "Scan SMP from %p for %ld bytes.\n", bp,length);
4728 if (sizeof(*mpf) != 16)
4729 printk("Error: MPF size\n");
4730
4731@@ -742,9 +742,10 @@ static int __init smp_scan_config (unsig
4732
4733 smp_found_config = 1;
4734 #ifndef CONFIG_XEN
4735- printk(KERN_INFO "found SMP MP-table at %08lx\n",
4736- virt_to_phys(mpf));
4737- reserve_bootmem(virt_to_phys(mpf), PAGE_SIZE);
4738+ printk(KERN_INFO "found SMP MP-table at [%p] %08lx\n",
4739+ mpf, virt_to_phys(mpf));
4740+ reserve_bootmem(virt_to_phys(mpf), PAGE_SIZE,
4741+ BOOTMEM_DEFAULT);
4742 if (mpf->mpf_physptr) {
4743 /*
4744 * We cannot access to MPC table to compute
4745@@ -759,11 +760,12 @@ static int __init smp_scan_config (unsig
4746 unsigned long end = max_low_pfn * PAGE_SIZE;
4747 if (mpf->mpf_physptr + size > end)
4748 size = end - mpf->mpf_physptr;
4749- reserve_bootmem(mpf->mpf_physptr, size);
4750+ reserve_bootmem(mpf->mpf_physptr, size,
4751+ BOOTMEM_DEFAULT);
4752 }
4753 #else
4754- printk(KERN_INFO "found SMP MP-table at %08lx\n",
4755- ((unsigned long)bp - (unsigned long)isa_bus_to_virt(base)) + base);
4756+ printk(KERN_INFO "found SMP MP-table at [%p] %08lx\n",
4757+ mpf, ((void *)bp - isa_bus_to_virt(base)) + base);
4758 #endif
4759
4760 mpf_found = mpf;
4761@@ -940,14 +942,14 @@ void __init mp_register_ioapic(u8 id, u3
4762 */
4763 mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid;
4764 mp_ioapic_routing[idx].gsi_base = gsi_base;
4765- mp_ioapic_routing[idx].gsi_end = gsi_base +
4766+ mp_ioapic_routing[idx].gsi_end = gsi_base +
4767 io_apic_get_redir_entries(idx);
4768
4769- printk("IOAPIC[%d]: apic_id %d, version %d, address 0x%lx, "
4770- "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid,
4771- mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr,
4772- mp_ioapic_routing[idx].gsi_base,
4773- mp_ioapic_routing[idx].gsi_end);
4774+ printk("IOAPIC[%d]: apic_id %d, version %d, address 0x%x, "
4775+ "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid,
4776+ mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr,
4777+ mp_ioapic_routing[idx].gsi_base,
4778+ mp_ioapic_routing[idx].gsi_end);
4779 }
4780
4781 void __init
4782@@ -1063,15 +1065,16 @@ void __init mp_config_acpi_legacy_irqs (
4783 }
4784
4785 #define MAX_GSI_NUM 4096
4786+#define IRQ_COMPRESSION_START 64
4787
4788 int mp_register_gsi(u32 gsi, int triggering, int polarity)
4789 {
4790 int ioapic = -1;
4791 int ioapic_pin = 0;
4792 int idx, bit = 0;
4793- static int pci_irq = 16;
4794+ static int pci_irq = IRQ_COMPRESSION_START;
4795 /*
4796- * Mapping between Global System Interrups, which
4797+ * Mapping between Global System Interrupts, which
4798 * represent all possible interrupts, and IRQs
4799 * assigned to actual devices.
4800 */
4801@@ -1108,12 +1111,16 @@ int mp_register_gsi(u32 gsi, int trigger
4802 if ((1<<bit) & mp_ioapic_routing[ioapic].pin_programmed[idx]) {
4803 Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n",
4804 mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
4805- return gsi_to_irq[gsi];
4806+ return (gsi < IRQ_COMPRESSION_START ? gsi : gsi_to_irq[gsi]);
4807 }
4808
4809 mp_ioapic_routing[ioapic].pin_programmed[idx] |= (1<<bit);
4810
4811- if (triggering == ACPI_LEVEL_SENSITIVE) {
4812+ /*
4813+ * For GSI >= 64, use IRQ compression
4814+ */
4815+ if ((gsi >= IRQ_COMPRESSION_START)
4816+ && (triggering == ACPI_LEVEL_SENSITIVE)) {
4817 /*
4818 * For PCI devices assign IRQs in order, avoiding gaps
4819 * due to unused I/O APIC pins.
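
A simplified sketch (an assumption about the behaviour, not the kernel's code) of the IRQ compression the mp_register_gsi() hunk above introduces: GSIs below IRQ_COMPRESSION_START keep their identity mapping, while larger level-triggered GSIs get the next free IRQ starting at 64, so unused I/O APIC pins do not leave gaps in the IRQ space.

#include <stdio.h>

#define MAX_GSI_NUM             4096
#define IRQ_COMPRESSION_START   64

static int gsi_to_irq[MAX_GSI_NUM];
static int next_pci_irq = IRQ_COMPRESSION_START;

static int register_gsi(int gsi, int level_triggered)
{
        if (gsi < IRQ_COMPRESSION_START || !level_triggered)
                return gsi;                     /* identity mapping */
        if (!gsi_to_irq[gsi])
                gsi_to_irq[gsi] = next_pci_irq++;
        return gsi_to_irq[gsi];
}

int main(void)
{
        printf("gsi 17  -> irq %d\n", register_gsi(17, 1));   /* 17 (below 64) */
        printf("gsi 120 -> irq %d\n", register_gsi(120, 1));  /* 64            */
        printf("gsi 300 -> irq %d\n", register_gsi(300, 1));  /* 65            */
        printf("gsi 120 -> irq %d\n", register_gsi(120, 1));  /* 64 again      */
        return 0;
}
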
4820--- sle11-2009-06-29.orig/arch/x86/kernel/mpparse_64-xen.c 2009-02-16 16:18:36.000000000 +0100
4821+++ sle11-2009-06-29/arch/x86/kernel/mpparse_64-xen.c 2009-03-16 16:33:40.000000000 +0100
4822@@ -60,14 +60,20 @@ unsigned int boot_cpu_id = -1U;
4823 EXPORT_SYMBOL(boot_cpu_id);
4824
4825 /* Internal processor count */
4826-unsigned int num_processors __cpuinitdata = 0;
4827+unsigned int num_processors;
4828
4829 unsigned disabled_cpus __cpuinitdata;
4830
4831 /* Bitmask of physically existing CPUs */
4832 physid_mask_t phys_cpu_present_map = PHYSID_MASK_NONE;
4833
4834-u8 bios_cpu_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
4835+#ifndef CONFIG_XEN
4836+u16 x86_bios_cpu_apicid_init[NR_CPUS] __initdata
4837+ = { [0 ... NR_CPUS-1] = BAD_APICID };
4838+void *x86_bios_cpu_apicid_early_ptr;
4839+#endif
4840+DEFINE_PER_CPU(u16, x86_bios_cpu_apicid) = BAD_APICID;
4841+EXPORT_PER_CPU_SYMBOL(x86_bios_cpu_apicid);
4842
4843
4844 /*
4845@@ -119,24 +125,22 @@ static void __cpuinit MP_processor_info(
4846 physid_set(m->mpc_apicid, phys_cpu_present_map);
4847 if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
4848 /*
4849- * bios_cpu_apicid is required to have processors listed
4850+ * x86_bios_cpu_apicid is required to have processors listed
4851 * in same order as logical cpu numbers. Hence the first
4852 * entry is BSP, and so on.
4853 */
4854 cpu = 0;
4855 }
4856- bios_cpu_apicid[cpu] = m->mpc_apicid;
4857- /*
4858- * We get called early in the the start_kernel initialization
4859- * process when the per_cpu data area is not yet setup, so we
4860- * use a static array that is removed after the per_cpu data
4861- * area is created.
4862- */
4863- if (x86_cpu_to_apicid_ptr) {
4864- u8 *x86_cpu_to_apicid = (u8 *)x86_cpu_to_apicid_ptr;
4865- x86_cpu_to_apicid[cpu] = m->mpc_apicid;
4866+ /* are we being called early in kernel startup? */
4867+ if (x86_cpu_to_apicid_early_ptr) {
4868+ u16 *cpu_to_apicid = x86_cpu_to_apicid_early_ptr;
4869+ u16 *bios_cpu_apicid = x86_bios_cpu_apicid_early_ptr;
4870+
4871+ cpu_to_apicid[cpu] = m->mpc_apicid;
4872+ bios_cpu_apicid[cpu] = m->mpc_apicid;
4873 } else {
4874 per_cpu(x86_cpu_to_apicid, cpu) = m->mpc_apicid;
4875+ per_cpu(x86_bios_cpu_apicid, cpu) = m->mpc_apicid;
4876 }
4877
4878 cpu_set(cpu, cpu_possible_map);
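
The x86_bios_cpu_apicid change above follows a common early-boot pattern: while the per-CPU area does not exist yet, writes go through a temporary __initdata array reachable via an *_early_ptr pointer, and once per-CPU data is up the pointer is cleared and the per-CPU variable is used directly. A userspace sketch of that pattern (array sizes and APIC IDs below are assumptions):

#include <stdio.h>

#define NR_CPUS         4
#define BAD_APICID      0xffffu

static unsigned short early_apicid[NR_CPUS] = {
        [0 ... NR_CPUS - 1] = BAD_APICID        /* GCC range initializer, as in the patch */
};
static unsigned short *apicid_early_ptr = early_apicid;
static unsigned short percpu_apicid[NR_CPUS];   /* stand-in for the per-CPU variable */

static void record_apicid(int cpu, unsigned short apicid)
{
        if (apicid_early_ptr)                   /* still early in start_kernel()? */
                apicid_early_ptr[cpu] = apicid;
        else
                percpu_apicid[cpu] = apicid;
}

int main(void)
{
        record_apicid(0, 0);                    /* BSP, recorded via the early array */
        apicid_early_ptr = NULL;                /* per-CPU areas are set up now      */
        record_apicid(1, 2);                    /* AP, recorded per-CPU              */

        printf("cpu0 apicid %u, cpu1 apicid %u\n",
               early_apicid[0], percpu_apicid[1]);
        return 0;
}
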
4879--- sle11-2009-06-29.orig/arch/x86/kernel/pci-dma-xen.c 2009-02-16 16:18:36.000000000 +0100
4880+++ sle11-2009-06-29/arch/x86/kernel/pci-dma-xen.c 2009-03-16 16:33:40.000000000 +0100
4881@@ -434,3 +434,23 @@ dma_sync_single_for_device(struct device
4882 swiotlb_sync_single_for_device(dev, dma_handle, size, direction);
4883 }
4884 EXPORT_SYMBOL(dma_sync_single_for_device);
4885+
4886+void
4887+dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems,
4888+ enum dma_data_direction direction)
4889+{
4890+ if (swiotlb)
4891+ swiotlb_sync_sg_for_cpu(dev,sg,nelems,direction);
4892+ flush_write_buffers();
4893+}
4894+EXPORT_SYMBOL(dma_sync_sg_for_cpu);
4895+
4896+void
4897+dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems,
4898+ enum dma_data_direction direction)
4899+{
4900+ if (swiotlb)
4901+ swiotlb_sync_sg_for_device(dev,sg,nelems,direction);
4902+ flush_write_buffers();
4903+}
4904+EXPORT_SYMBOL(dma_sync_sg_for_device);
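
A driver-side sketch (an assumption, not from the patch) of how the two scatter-gather sync helpers added above are typically used with a streaming DMA_FROM_DEVICE mapping: sync toward the CPU before reading what the device wrote, then back toward the device before reusing the buffers. This only builds inside a kernel tree.

#include <linux/dma-mapping.h>
#include <linux/scatterlist.h>

static void example_rx_poll(struct device *dev, struct scatterlist *sg,
                            int nents)
{
        /* make the device's writes visible to the CPU */
        dma_sync_sg_for_cpu(dev, sg, nents, DMA_FROM_DEVICE);

        /* ... inspect or copy the received data here ... */

        /* hand the buffers back to the device for the next transfer */
        dma_sync_sg_for_device(dev, sg, nents, DMA_FROM_DEVICE);
}
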
4905--- sle11-2009-06-29.orig/arch/x86/kernel/process_32-xen.c 2009-02-16 16:18:36.000000000 +0100
4906+++ sle11-2009-06-29/arch/x86/kernel/process_32-xen.c 2009-03-16 16:33:40.000000000 +0100
4907@@ -23,7 +23,6 @@
4908 #include <linux/slab.h>
4909 #include <linux/vmalloc.h>
4910 #include <linux/user.h>
4911-#include <linux/a.out.h>
4912 #include <linux/interrupt.h>
4913 #include <linux/utsname.h>
4914 #include <linux/delay.h>
4915@@ -59,8 +58,10 @@
4916
4917 #include <asm/tlbflush.h>
4918 #include <asm/cpu.h>
4919+#include <asm/kdebug.h>
4920
4921 asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
4922+asmlinkage void cstar_ret_from_fork(void) __asm__("cstar_ret_from_fork");
4923
4924 static int hlt_counter;
4925
4926@@ -78,7 +79,7 @@ EXPORT_PER_CPU_SYMBOL(cpu_number);
4927 */
4928 unsigned long thread_saved_pc(struct task_struct *tsk)
4929 {
4930- return ((unsigned long *)tsk->thread.esp)[3];
4931+ return ((unsigned long *)tsk->thread.sp)[3];
4932 }
4933
4934 /*
4935@@ -86,7 +87,6 @@ unsigned long thread_saved_pc(struct tas
4936 */
4937 void (*pm_idle)(void);
4938 EXPORT_SYMBOL(pm_idle);
4939-static DEFINE_PER_CPU(unsigned int, cpu_idle_state);
4940
4941 void disable_hlt(void)
4942 {
4943@@ -107,7 +107,7 @@ EXPORT_SYMBOL(enable_hlt);
4944 * to poll the ->work.need_resched flag instead of waiting for the
4945 * cross-CPU IPI to arrive. Use this option with caution.
4946 */
4947-static void poll_idle (void)
4948+static void poll_idle(void)
4949 {
4950 cpu_relax();
4951 }
4952@@ -122,10 +122,19 @@ static void xen_idle(void)
4953 smp_mb();
4954
4955 local_irq_disable();
4956- if (!need_resched())
4957+ if (!need_resched()) {
4958+ ktime_t t0, t1;
4959+ u64 t0n, t1n;
4960+
4961+ t0 = ktime_get();
4962+ t0n = ktime_to_ns(t0);
4963 safe_halt(); /* enables interrupts racelessly */
4964- else
4965- local_irq_enable();
4966+ local_irq_disable();
4967+ t1 = ktime_get();
4968+ t1n = ktime_to_ns(t1);
4969+ sched_clock_idle_wakeup_event(t1n - t0n);
4970+ }
4971+ local_irq_enable();
4972 current_thread_info()->status |= TS_POLLING;
4973 }
4974 #ifdef CONFIG_APM_MODULE
4975@@ -168,13 +177,13 @@ void cpu_idle(void)
4976 while (!need_resched()) {
4977 void (*idle)(void);
4978
4979- if (__get_cpu_var(cpu_idle_state))
4980- __get_cpu_var(cpu_idle_state) = 0;
4981-
4982 check_pgt_cache();
4983 rmb();
4984 idle = xen_idle; /* no alternatives */
4985
4986+ if (rcu_pending(cpu))
4987+ rcu_check_callbacks(cpu, 0);
4988+
4989 if (cpu_is_offline(cpu))
4990 play_dead();
4991
4992@@ -192,40 +201,19 @@ static void do_nothing(void *unused)
4993 {
4994 }
4995
4996+/*
4997+ * cpu_idle_wait - Used to ensure that all the CPUs discard old value of
4998+ * pm_idle and update to new pm_idle value. Required while changing pm_idle
4999+ * handler on SMP systems.
5000+ *
5001+ * Caller must have changed pm_idle to the new value before the call. Old
5002+ * pm_idle value will not be used by any CPU after the return of this function.
5003+ */
5004 void cpu_idle_wait(void)
5005 {
5006- unsigned int cpu, this_cpu = get_cpu();
5007- cpumask_t map, tmp = current->cpus_allowed;
5008-
5009- set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
5010- put_cpu();
5011-
5012- cpus_clear(map);
5013- for_each_online_cpu(cpu) {
5014- per_cpu(cpu_idle_state, cpu) = 1;
5015- cpu_set(cpu, map);
5016- }
5017-
5018- __get_cpu_var(cpu_idle_state) = 0;
5019-
5020- wmb();
5021- do {
5022- ssleep(1);
5023- for_each_online_cpu(cpu) {
5024- if (cpu_isset(cpu, map) && !per_cpu(cpu_idle_state, cpu))
5025- cpu_clear(cpu, map);
5026- }
5027- cpus_and(map, map, cpu_online_map);
5028- /*
5029- * We waited 1 sec, if a CPU still did not call idle
5030- * it may be because it is in idle and not waking up
5031- * because it has nothing to do.
5032- * Give all the remaining CPUS a kick.
5033- */
5034- smp_call_function_mask(map, do_nothing, 0, 0);
5035- } while (!cpus_empty(map));
5036-
5037- set_cpus_allowed(current, tmp);
5038+ smp_mb();
5039+ /* kick all the CPUs so that they exit out of pm_idle */
5040+ smp_call_function(do_nothing, NULL, 0, 1);
5041 }
5042 EXPORT_SYMBOL_GPL(cpu_idle_wait);
5043
5044@@ -251,15 +239,15 @@ void __show_registers(struct pt_regs *re
5045 {
5046 unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L;
5047 unsigned long d0, d1, d2, d3, d6, d7;
5048- unsigned long esp;
5049+ unsigned long sp;
5050 unsigned short ss, gs;
5051
5052 if (user_mode_vm(regs)) {
5053- esp = regs->esp;
5054- ss = regs->xss & 0xffff;
5055+ sp = regs->sp;
5056+ ss = regs->ss & 0xffff;
5057 savesegment(gs, gs);
5058 } else {
5059- esp = (unsigned long) (&regs->esp);
5060+ sp = (unsigned long) (&regs->sp);
5061 savesegment(ss, ss);
5062 savesegment(gs, gs);
5063 }
5064@@ -272,17 +260,17 @@ void __show_registers(struct pt_regs *re
5065 init_utsname()->version);
5066
5067 printk("EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n",
5068- 0xffff & regs->xcs, regs->eip, regs->eflags,
5069+ 0xffff & regs->cs, regs->ip, regs->flags,
5070 smp_processor_id());
5071- print_symbol("EIP is at %s\n", regs->eip);
5072+ print_symbol("EIP is at %s\n", regs->ip);
5073
5074 printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n",
5075- regs->eax, regs->ebx, regs->ecx, regs->edx);
5076+ regs->ax, regs->bx, regs->cx, regs->dx);
5077 printk("ESI: %08lx EDI: %08lx EBP: %08lx ESP: %08lx\n",
5078- regs->esi, regs->edi, regs->ebp, esp);
5079+ regs->si, regs->di, regs->bp, sp);
5080 printk(" DS: %04x ES: %04x FS: %04x GS: %04x SS: %04x\n",
5081- regs->xds & 0xffff, regs->xes & 0xffff,
5082- regs->xfs & 0xffff, gs, ss);
5083+ regs->ds & 0xffff, regs->es & 0xffff,
5084+ regs->fs & 0xffff, gs, ss);
5085
5086 if (!all)
5087 return;
5088@@ -310,12 +298,12 @@ void __show_registers(struct pt_regs *re
5089 void show_regs(struct pt_regs *regs)
5090 {
5091 __show_registers(regs, 1);
5092- show_trace(NULL, regs, &regs->esp);
5093+ show_trace(NULL, regs, &regs->sp, regs->bp);
5094 }
5095
5096 /*
5097- * This gets run with %ebx containing the
5098- * function to call, and %edx containing
5099+ * This gets run with %bx containing the
5100+ * function to call, and %dx containing
5101 * the "args".
5102 */
5103 extern void kernel_thread_helper(void);
5104@@ -329,16 +317,16 @@ int kernel_thread(int (*fn)(void *), voi
5105
5106 memset(&regs, 0, sizeof(regs));
5107
5108- regs.ebx = (unsigned long) fn;
5109- regs.edx = (unsigned long) arg;
5110+ regs.bx = (unsigned long) fn;
5111+ regs.dx = (unsigned long) arg;
5112
5113- regs.xds = __USER_DS;
5114- regs.xes = __USER_DS;
5115- regs.xfs = __KERNEL_PERCPU;
5116- regs.orig_eax = -1;
5117- regs.eip = (unsigned long) kernel_thread_helper;
5118- regs.xcs = __KERNEL_CS | get_kernel_rpl();
5119- regs.eflags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2;
5120+ regs.ds = __USER_DS;
5121+ regs.es = __USER_DS;
5122+ regs.fs = __KERNEL_PERCPU;
5123+ regs.orig_ax = -1;
5124+ regs.ip = (unsigned long) kernel_thread_helper;
5125+ regs.cs = __KERNEL_CS | get_kernel_rpl();
5126+ regs.flags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2;
5127
5128 /* Ok, create the new process.. */
5129 return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, &regs, 0, NULL, NULL);
5130@@ -368,7 +356,12 @@ void flush_thread(void)
5131 {
5132 struct task_struct *tsk = current;
5133
5134- memset(tsk->thread.debugreg, 0, sizeof(unsigned long)*8);
5135+ tsk->thread.debugreg0 = 0;
5136+ tsk->thread.debugreg1 = 0;
5137+ tsk->thread.debugreg2 = 0;
5138+ tsk->thread.debugreg3 = 0;
5139+ tsk->thread.debugreg6 = 0;
5140+ tsk->thread.debugreg7 = 0;
5141 memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
5142 clear_tsk_thread_flag(tsk, TIF_DEBUG);
5143 /*
5144@@ -393,7 +386,7 @@ void prepare_to_copy(struct task_struct
5145 unlazy_fpu(tsk);
5146 }
5147
5148-int copy_thread(int nr, unsigned long clone_flags, unsigned long esp,
5149+int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
5150 unsigned long unused,
5151 struct task_struct * p, struct pt_regs * regs)
5152 {
5153@@ -403,17 +396,19 @@ int copy_thread(int nr, unsigned long cl
5154
5155 childregs = task_pt_regs(p);
5156 *childregs = *regs;
5157- childregs->eax = 0;
5158- childregs->esp = esp;
5159+ childregs->ax = 0;
5160+ childregs->sp = sp;
5161
5162- p->thread.esp = (unsigned long) childregs;
5163- p->thread.esp0 = (unsigned long) (childregs+1);
5164+ p->thread.sp = (unsigned long) childregs;
5165+ p->thread.sp0 = (unsigned long) (childregs+1);
5166
5167- p->thread.eip = (unsigned long) ret_from_fork;
5168+ p->thread.ip = (unsigned long) ret_from_fork;
5169
5170- savesegment(gs,p->thread.gs);
5171+ savesegment(gs, p->thread.gs);
5172
5173 tsk = current;
5174+ if (test_tsk_thread_flag(tsk, TIF_CSTAR))
5175+ p->thread.ip = (unsigned long) cstar_ret_from_fork;
5176 if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) {
5177 p->thread.io_bitmap_ptr = kmemdup(tsk->thread.io_bitmap_ptr,
5178 IO_BITMAP_BYTES, GFP_KERNEL);
5179@@ -424,34 +419,17 @@ int copy_thread(int nr, unsigned long cl
5180 set_tsk_thread_flag(p, TIF_IO_BITMAP);
5181 }
5182
5183+ err = 0;
5184+
5185 /*
5186 * Set a new TLS for the child thread?
5187 */
5188- if (clone_flags & CLONE_SETTLS) {
5189- struct desc_struct *desc;
5190- struct user_desc info;
5191- int idx;
5192-
5193- err = -EFAULT;
5194- if (copy_from_user(&info, (void __user *)childregs->esi, sizeof(info)))
5195- goto out;
5196- err = -EINVAL;
5197- if (LDT_empty(&info))
5198- goto out;
5199-
5200- idx = info.entry_number;
5201- if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
5202- goto out;
5203-
5204- desc = p->thread.tls_array + idx - GDT_ENTRY_TLS_MIN;
5205- desc->a = LDT_entry_a(&info);
5206- desc->b = LDT_entry_b(&info);
5207- }
5208+ if (clone_flags & CLONE_SETTLS)
5209+ err = do_set_thread_area(p, -1,
5210+ (struct user_desc __user *)childregs->si, 0);
5211
5212 p->thread.iopl = current->thread.iopl;
5213
5214- err = 0;
5215- out:
5216 if (err && p->thread.io_bitmap_ptr) {
5217 kfree(p->thread.io_bitmap_ptr);
5218 p->thread.io_bitmap_max = 0;
5219@@ -459,67 +437,8 @@ int copy_thread(int nr, unsigned long cl
5220 return err;
5221 }
5222
5223-/*
5224- * fill in the user structure for a core dump..
5225- */
5226-void dump_thread(struct pt_regs * regs, struct user * dump)
5227-{
5228- int i;
5229-
5230-/* changed the size calculations - should hopefully work better. lbt */
5231- dump->magic = CMAGIC;
5232- dump->start_code = 0;
5233- dump->start_stack = regs->esp & ~(PAGE_SIZE - 1);
5234- dump->u_tsize = ((unsigned long) current->mm->end_code) >> PAGE_SHIFT;
5235- dump->u_dsize = ((unsigned long) (current->mm->brk + (PAGE_SIZE-1))) >> PAGE_SHIFT;
5236- dump->u_dsize -= dump->u_tsize;
5237- dump->u_ssize = 0;
5238- for (i = 0; i < 8; i++)
5239- dump->u_debugreg[i] = current->thread.debugreg[i];
5240-
5241- if (dump->start_stack < TASK_SIZE)
5242- dump->u_ssize = ((unsigned long) (TASK_SIZE - dump->start_stack)) >> PAGE_SHIFT;
5243-
5244- dump->regs.ebx = regs->ebx;
5245- dump->regs.ecx = regs->ecx;
5246- dump->regs.edx = regs->edx;
5247- dump->regs.esi = regs->esi;
5248- dump->regs.edi = regs->edi;
5249- dump->regs.ebp = regs->ebp;
5250- dump->regs.eax = regs->eax;
5251- dump->regs.ds = regs->xds;
5252- dump->regs.es = regs->xes;
5253- dump->regs.fs = regs->xfs;
5254- savesegment(gs,dump->regs.gs);
5255- dump->regs.orig_eax = regs->orig_eax;
5256- dump->regs.eip = regs->eip;
5257- dump->regs.cs = regs->xcs;
5258- dump->regs.eflags = regs->eflags;
5259- dump->regs.esp = regs->esp;
5260- dump->regs.ss = regs->xss;
5261-
5262- dump->u_fpvalid = dump_fpu (regs, &dump->i387);
5263-}
5264-EXPORT_SYMBOL(dump_thread);
5265-
5266-/*
5267- * Capture the user space registers if the task is not running (in user space)
5268- */
5269-int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs)
5270-{
5271- struct pt_regs ptregs = *task_pt_regs(tsk);
5272- ptregs.xcs &= 0xffff;
5273- ptregs.xds &= 0xffff;
5274- ptregs.xes &= 0xffff;
5275- ptregs.xss &= 0xffff;
5276-
5277- elf_core_copy_regs(regs, &ptregs);
5278-
5279- return 1;
5280-}
5281-
5282 #ifdef CONFIG_SECCOMP
5283-void hard_disable_TSC(void)
5284+static void hard_disable_TSC(void)
5285 {
5286 write_cr4(read_cr4() | X86_CR4_TSD);
5287 }
5288@@ -534,7 +453,7 @@ void disable_TSC(void)
5289 hard_disable_TSC();
5290 preempt_enable();
5291 }
5292-void hard_enable_TSC(void)
5293+static void hard_enable_TSC(void)
5294 {
5295 write_cr4(read_cr4() & ~X86_CR4_TSD);
5296 }
5297@@ -543,18 +462,32 @@ void hard_enable_TSC(void)
5298 static noinline void
5299 __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p)
5300 {
5301- struct thread_struct *next;
5302+ struct thread_struct *prev, *next;
5303+ unsigned long debugctl;
5304
5305+ prev = &prev_p->thread;
5306 next = &next_p->thread;
5307
5308+ debugctl = prev->debugctlmsr;
5309+ if (next->ds_area_msr != prev->ds_area_msr) {
5310+ /* we clear debugctl to make sure DS
5311+ * is not in use when we change it */
5312+ debugctl = 0;
5313+ wrmsrl(MSR_IA32_DEBUGCTLMSR, 0);
5314+ wrmsr(MSR_IA32_DS_AREA, next->ds_area_msr, 0);
5315+ }
5316+
5317+ if (next->debugctlmsr != debugctl)
5318+ wrmsr(MSR_IA32_DEBUGCTLMSR, next->debugctlmsr, 0);
5319+
5320 if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
5321- set_debugreg(next->debugreg[0], 0);
5322- set_debugreg(next->debugreg[1], 1);
5323- set_debugreg(next->debugreg[2], 2);
5324- set_debugreg(next->debugreg[3], 3);
5325+ set_debugreg(next->debugreg0, 0);
5326+ set_debugreg(next->debugreg1, 1);
5327+ set_debugreg(next->debugreg2, 2);
5328+ set_debugreg(next->debugreg3, 3);
5329 /* no 4 and 5 */
5330- set_debugreg(next->debugreg[6], 6);
5331- set_debugreg(next->debugreg[7], 7);
5332+ set_debugreg(next->debugreg6, 6);
5333+ set_debugreg(next->debugreg7, 7);
5334 }
5335
5336 #ifdef CONFIG_SECCOMP
5337@@ -567,6 +500,14 @@ __switch_to_xtra(struct task_struct *pre
5338 hard_enable_TSC();
5339 }
5340 #endif
5341+
5342+#ifdef X86_BTS
5343+ if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
5344+ ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS);
5345+
5346+ if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS))
5347+ ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES);
5348+#endif
5349 }
5350
5351 /*
5352@@ -592,11 +533,11 @@ __switch_to_xtra(struct task_struct *pre
5353 * More important, however, is the fact that this allows us much
5354 * more flexibility.
5355 *
5356- * The return value (in %eax) will be the "prev" task after
5357+ * The return value (in %ax) will be the "prev" task after
5358 * the task-switch, and shows up in ret_from_fork in entry.S,
5359 * for example.
5360 */
5361-struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
5362+struct task_struct * __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
5363 {
5364 struct thread_struct *prev = &prev_p->thread,
5365 *next = &next_p->thread;
5366@@ -632,12 +573,12 @@ struct task_struct fastcall * __switch_t
5367 #endif
5368
5369 /*
5370- * Reload esp0.
5371- * This is load_esp0(tss, next) with a multicall.
5372+ * Reload sp0.
5373+ * This is load_sp0(tss, next) with a multicall.
5374 */
5375 mcl->op = __HYPERVISOR_stack_switch;
5376 mcl->args[0] = __KERNEL_DS;
5377- mcl->args[1] = next->esp0;
5378+ mcl->args[1] = next->sp0;
5379 mcl++;
5380
5381 /*
5382@@ -734,7 +675,7 @@ struct task_struct fastcall * __switch_t
5383
5384 asmlinkage int sys_fork(struct pt_regs regs)
5385 {
5386- return do_fork(SIGCHLD, regs.esp, &regs, 0, NULL, NULL);
5387+ return do_fork(SIGCHLD, regs.sp, &regs, 0, NULL, NULL);
5388 }
5389
5390 asmlinkage int sys_clone(struct pt_regs regs)
5391@@ -743,12 +684,12 @@ asmlinkage int sys_clone(struct pt_regs
5392 unsigned long newsp;
5393 int __user *parent_tidptr, *child_tidptr;
5394
5395- clone_flags = regs.ebx;
5396- newsp = regs.ecx;
5397- parent_tidptr = (int __user *)regs.edx;
5398- child_tidptr = (int __user *)regs.edi;
5399+ clone_flags = regs.bx;
5400+ newsp = regs.cx;
5401+ parent_tidptr = (int __user *)regs.dx;
5402+ child_tidptr = (int __user *)regs.di;
5403 if (!newsp)
5404- newsp = regs.esp;
5405+ newsp = regs.sp;
5406 return do_fork(clone_flags, newsp, &regs, 0, parent_tidptr, child_tidptr);
5407 }
5408
5409@@ -764,7 +705,7 @@ asmlinkage int sys_clone(struct pt_regs
5410 */
5411 asmlinkage int sys_vfork(struct pt_regs regs)
5412 {
5413- return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.esp, &regs, 0, NULL, NULL);
5414+ return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.sp, &regs, 0, NULL, NULL);
5415 }
5416
5417 /*
5418@@ -775,18 +716,15 @@ asmlinkage int sys_execve(struct pt_regs
5419 int error;
5420 char * filename;
5421
5422- filename = getname((char __user *) regs.ebx);
5423+ filename = getname((char __user *) regs.bx);
5424 error = PTR_ERR(filename);
5425 if (IS_ERR(filename))
5426 goto out;
5427 error = do_execve(filename,
5428- (char __user * __user *) regs.ecx,
5429- (char __user * __user *) regs.edx,
5430+ (char __user * __user *) regs.cx,
5431+ (char __user * __user *) regs.dx,
5432 &regs);
5433 if (error == 0) {
5434- task_lock(current);
5435- current->ptrace &= ~PT_DTRACE;
5436- task_unlock(current);
5437 /* Make sure we don't return using sysenter.. */
5438 set_thread_flag(TIF_IRET);
5439 }
5440@@ -800,145 +738,37 @@ out:
5441
5442 unsigned long get_wchan(struct task_struct *p)
5443 {
5444- unsigned long ebp, esp, eip;
5445+ unsigned long bp, sp, ip;
5446 unsigned long stack_page;
5447 int count = 0;
5448 if (!p || p == current || p->state == TASK_RUNNING)
5449 return 0;
5450 stack_page = (unsigned long)task_stack_page(p);
5451- esp = p->thread.esp;
5452- if (!stack_page || esp < stack_page || esp > top_esp+stack_page)
5453+ sp = p->thread.sp;
5454+ if (!stack_page || sp < stack_page || sp > top_esp+stack_page)
5455 return 0;
5456- /* include/asm-i386/system.h:switch_to() pushes ebp last. */
5457- ebp = *(unsigned long *) esp;
5458+ /* include/asm-i386/system.h:switch_to() pushes bp last. */
5459+ bp = *(unsigned long *) sp;
5460 do {
5461- if (ebp < stack_page || ebp > top_ebp+stack_page)
5462+ if (bp < stack_page || bp > top_ebp+stack_page)
5463 return 0;
5464- eip = *(unsigned long *) (ebp+4);
5465- if (!in_sched_functions(eip))
5466- return eip;
5467- ebp = *(unsigned long *) ebp;
5468+ ip = *(unsigned long *) (bp+4);
5469+ if (!in_sched_functions(ip))
5470+ return ip;
5471+ bp = *(unsigned long *) bp;
5472 } while (count++ < 16);
5473 return 0;
5474 }
5475
5476-/*
5477- * sys_alloc_thread_area: get a yet unused TLS descriptor index.
5478- */
5479-static int get_free_idx(void)
5480-{
5481- struct thread_struct *t = &current->thread;
5482- int idx;
5483-
5484- for (idx = 0; idx < GDT_ENTRY_TLS_ENTRIES; idx++)
5485- if (desc_empty(t->tls_array + idx))
5486- return idx + GDT_ENTRY_TLS_MIN;
5487- return -ESRCH;
5488-}
5489-
5490-/*
5491- * Set a given TLS descriptor:
5492- */
5493-asmlinkage int sys_set_thread_area(struct user_desc __user *u_info)
5494-{
5495- struct thread_struct *t = &current->thread;
5496- struct user_desc info;
5497- struct desc_struct *desc;
5498- int cpu, idx;
5499-
5500- if (copy_from_user(&info, u_info, sizeof(info)))
5501- return -EFAULT;
5502- idx = info.entry_number;
5503-
5504- /*
5505- * index -1 means the kernel should try to find and
5506- * allocate an empty descriptor:
5507- */
5508- if (idx == -1) {
5509- idx = get_free_idx();
5510- if (idx < 0)
5511- return idx;
5512- if (put_user(idx, &u_info->entry_number))
5513- return -EFAULT;
5514- }
5515-
5516- if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
5517- return -EINVAL;
5518-
5519- desc = t->tls_array + idx - GDT_ENTRY_TLS_MIN;
5520-
5521- /*
5522- * We must not get preempted while modifying the TLS.
5523- */
5524- cpu = get_cpu();
5525-
5526- if (LDT_empty(&info)) {
5527- desc->a = 0;
5528- desc->b = 0;
5529- } else {
5530- desc->a = LDT_entry_a(&info);
5531- desc->b = LDT_entry_b(&info);
5532- }
5533- load_TLS(t, cpu);
5534-
5535- put_cpu();
5536-
5537- return 0;
5538-}
5539-
5540-/*
5541- * Get the current Thread-Local Storage area:
5542- */
5543-
5544-#define GET_BASE(desc) ( \
5545- (((desc)->a >> 16) & 0x0000ffff) | \
5546- (((desc)->b << 16) & 0x00ff0000) | \
5547- ( (desc)->b & 0xff000000) )
5548-
5549-#define GET_LIMIT(desc) ( \
5550- ((desc)->a & 0x0ffff) | \
5551- ((desc)->b & 0xf0000) )
5552-
5553-#define GET_32BIT(desc) (((desc)->b >> 22) & 1)
5554-#define GET_CONTENTS(desc) (((desc)->b >> 10) & 3)
5555-#define GET_WRITABLE(desc) (((desc)->b >> 9) & 1)
5556-#define GET_LIMIT_PAGES(desc) (((desc)->b >> 23) & 1)
5557-#define GET_PRESENT(desc) (((desc)->b >> 15) & 1)
5558-#define GET_USEABLE(desc) (((desc)->b >> 20) & 1)
5559-
5560-asmlinkage int sys_get_thread_area(struct user_desc __user *u_info)
5561-{
5562- struct user_desc info;
5563- struct desc_struct *desc;
5564- int idx;
5565-
5566- if (get_user(idx, &u_info->entry_number))
5567- return -EFAULT;
5568- if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
5569- return -EINVAL;
5570-
5571- memset(&info, 0, sizeof(info));
5572-
5573- desc = current->thread.tls_array + idx - GDT_ENTRY_TLS_MIN;
5574-
5575- info.entry_number = idx;
5576- info.base_addr = GET_BASE(desc);
5577- info.limit = GET_LIMIT(desc);
5578- info.seg_32bit = GET_32BIT(desc);
5579- info.contents = GET_CONTENTS(desc);
5580- info.read_exec_only = !GET_WRITABLE(desc);
5581- info.limit_in_pages = GET_LIMIT_PAGES(desc);
5582- info.seg_not_present = !GET_PRESENT(desc);
5583- info.useable = GET_USEABLE(desc);
5584-
5585- if (copy_to_user(u_info, &info, sizeof(info)))
5586- return -EFAULT;
5587- return 0;
5588-}
5589-
5590 unsigned long arch_align_stack(unsigned long sp)
5591 {
5592 if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
5593 sp -= get_random_int() % 8192;
5594 return sp & ~0xf;
5595 }
5596+
5597+unsigned long arch_randomize_brk(struct mm_struct *mm)
5598+{
5599+ unsigned long range_end = mm->brk + 0x02000000;
5600+ return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
5601+}
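
The hunk above introduces arch_randomize_brk() for i386: it picks a page-aligned heap start somewhere in the 32 MiB window above mm->brk and keeps mm->brk itself when the randomization helper yields 0. A minimal user-space sketch of that arithmetic, assuming rand() as a stand-in for the kernel's randomize_range()/get_random_int() and a made-up example brk value:

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define PAGE_MASK (~0xfffUL)

/* Illustrative stand-in for the kernel's randomize_range(): a page-aligned
 * address somewhere in [start, end), or 0 when the range is empty. */
static unsigned long randomize_range_sketch(unsigned long start, unsigned long end)
{
	if (end <= start)
		return 0;
	return (start + (unsigned long)rand() % (end - start)) & PAGE_MASK;
}

int main(void)
{
	unsigned long brk = 0x08050000UL;            /* hypothetical heap start */
	unsigned long range_end = brk + 0x02000000;  /* 32 MiB window, as in the patch */
	unsigned long r;

	srand((unsigned)time(NULL));
	r = randomize_range_sketch(brk, range_end);
	/* same "?:" fallback as arch_randomize_brk(): keep brk if we got 0 */
	printf("brk 0x%08lx -> randomized brk 0x%08lx\n", brk, r ? r : brk);
	return 0;
}
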
5602--- sle11-2009-06-29.orig/arch/x86/kernel/process_64-xen.c 2009-02-16 16:18:36.000000000 +0100
5603+++ sle11-2009-06-29/arch/x86/kernel/process_64-xen.c 2009-03-16 16:33:40.000000000 +0100
5604@@ -3,7 +3,7 @@
5605 *
5606 * Pentium III FXSR, SSE support
5607 * Gareth Hughes <gareth@valinux.com>, May 2000
5608- *
5609+ *
5610 * X86-64 port
5611 * Andi Kleen.
5612 *
5613@@ -22,19 +22,18 @@
5614 #include <linux/cpu.h>
5615 #include <linux/errno.h>
5616 #include <linux/sched.h>
5617+#include <linux/fs.h>
5618 #include <linux/kernel.h>
5619 #include <linux/mm.h>
5620-#include <linux/fs.h>
5621 #include <linux/elfcore.h>
5622 #include <linux/smp.h>
5623 #include <linux/slab.h>
5624 #include <linux/user.h>
5625-#include <linux/module.h>
5626-#include <linux/a.out.h>
5627 #include <linux/interrupt.h>
5628+#include <linux/utsname.h>
5629 #include <linux/delay.h>
5630+#include <linux/module.h>
5631 #include <linux/ptrace.h>
5632-#include <linux/utsname.h>
5633 #include <linux/random.h>
5634 #include <linux/notifier.h>
5635 #include <linux/kprobes.h>
5636@@ -73,7 +72,6 @@ EXPORT_SYMBOL(boot_option_idle_override)
5637 */
5638 void (*pm_idle)(void);
5639 EXPORT_SYMBOL(pm_idle);
5640-static DEFINE_PER_CPU(unsigned int, cpu_idle_state);
5641
5642 static ATOMIC_NOTIFIER_HEAD(idle_notifier);
5643
5644@@ -81,13 +79,6 @@ void idle_notifier_register(struct notif
5645 {
5646 atomic_notifier_chain_register(&idle_notifier, n);
5647 }
5648-EXPORT_SYMBOL_GPL(idle_notifier_register);
5649-
5650-void idle_notifier_unregister(struct notifier_block *n)
5651-{
5652- atomic_notifier_chain_unregister(&idle_notifier, n);
5653-}
5654-EXPORT_SYMBOL(idle_notifier_unregister);
5655
5656 void enter_idle(void)
5657 {
5658@@ -116,7 +107,7 @@ void exit_idle(void)
5659 * to poll the ->need_resched flag instead of waiting for the
5660 * cross-CPU IPI to arrive. Use this option with caution.
5661 */
5662-static void poll_idle (void)
5663+static void poll_idle(void)
5664 {
5665 local_irq_enable();
5666 cpu_relax();
5667@@ -131,10 +122,19 @@ static void xen_idle(void)
5668 */
5669 smp_mb();
5670 local_irq_disable();
5671- if (!need_resched())
5672- safe_halt();
5673- else
5674- local_irq_enable();
5675+ if (!need_resched()) {
5676+ ktime_t t0, t1;
5677+ u64 t0n, t1n;
5678+
5679+ t0 = ktime_get();
5680+ t0n = ktime_to_ns(t0);
5681+ safe_halt(); /* enables interrupts racelessly */
5682+ local_irq_disable();
5683+ t1 = ktime_get();
5684+ t1n = ktime_to_ns(t1);
5685+ sched_clock_idle_wakeup_event(t1n - t0n);
5686+ }
5687+ local_irq_enable();
5688 current_thread_info()->status |= TS_POLLING;
5689 }
5690
5691@@ -161,19 +161,15 @@ static inline void play_dead(void)
5692 * low exit latency (ie sit in a loop waiting for
5693 * somebody to say that they'd like to reschedule)
5694 */
5695-void cpu_idle (void)
5696+void cpu_idle(void)
5697 {
5698 current_thread_info()->status |= TS_POLLING;
5699 /* endless idle loop with no priority at all */
5700 while (1) {
5701+ tick_nohz_stop_sched_tick();
5702 while (!need_resched()) {
5703 void (*idle)(void);
5704
5705- if (__get_cpu_var(cpu_idle_state))
5706- __get_cpu_var(cpu_idle_state) = 0;
5707-
5708- tick_nohz_stop_sched_tick();
5709-
5710 rmb();
5711 idle = xen_idle; /* no alternatives */
5712 if (cpu_is_offline(smp_processor_id()))
5713@@ -203,49 +199,27 @@ static void do_nothing(void *unused)
5714 {
5715 }
5716
5717+/*
5718+ * cpu_idle_wait - Used to ensure that all the CPUs discard old value of
5719+ * pm_idle and update to new pm_idle value. Required while changing pm_idle
5720+ * handler on SMP systems.
5721+ *
5722+ * Caller must have changed pm_idle to the new value before the call. Old
5723+ * pm_idle value will not be used by any CPU after the return of this function.
5724+ */
5725 void cpu_idle_wait(void)
5726 {
5727- unsigned int cpu, this_cpu = get_cpu();
5728- cpumask_t map, tmp = current->cpus_allowed;
5729-
5730- set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
5731- put_cpu();
5732-
5733- cpus_clear(map);
5734- for_each_online_cpu(cpu) {
5735- per_cpu(cpu_idle_state, cpu) = 1;
5736- cpu_set(cpu, map);
5737- }
5738-
5739- __get_cpu_var(cpu_idle_state) = 0;
5740-
5741- wmb();
5742- do {
5743- ssleep(1);
5744- for_each_online_cpu(cpu) {
5745- if (cpu_isset(cpu, map) &&
5746- !per_cpu(cpu_idle_state, cpu))
5747- cpu_clear(cpu, map);
5748- }
5749- cpus_and(map, map, cpu_online_map);
5750- /*
5751- * We waited 1 sec, if a CPU still did not call idle
5752- * it may be because it is in idle and not waking up
5753- * because it has nothing to do.
5754- * Give all the remaining CPUS a kick.
5755- */
5756- smp_call_function_mask(map, do_nothing, 0, 0);
5757- } while (!cpus_empty(map));
5758-
5759- set_cpus_allowed(current, tmp);
5760+ smp_mb();
5761+ /* kick all the CPUs so that they exit out of pm_idle */
5762+ smp_call_function(do_nothing, NULL, 0, 1);
5763 }
5764 EXPORT_SYMBOL_GPL(cpu_idle_wait);
5765
5766-void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
5767+void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
5768 {
5769 }
5770
5771-static int __init idle_setup (char *str)
5772+static int __init idle_setup(char *str)
5773 {
5774 if (!strcmp(str, "poll")) {
5775 printk("using polling idle threads.\n");
5776@@ -260,13 +234,13 @@ static int __init idle_setup (char *str)
5777 }
5778 early_param("idle", idle_setup);
5779
5780-/* Prints also some state that isn't saved in the pt_regs */
5781+/* Prints also some state that isn't saved in the pt_regs */
5782 void __show_regs(struct pt_regs * regs)
5783 {
5784 unsigned long fs, gs, shadowgs;
5785 unsigned long d0, d1, d2, d3, d6, d7;
5786- unsigned int fsindex,gsindex;
5787- unsigned int ds,cs,es;
5788+ unsigned int fsindex, gsindex;
5789+ unsigned int ds, cs, es;
5790
5791 printk("\n");
5792 print_modules();
5793@@ -275,16 +249,16 @@ void __show_regs(struct pt_regs * regs)
5794 init_utsname()->release,
5795 (int)strcspn(init_utsname()->version, " "),
5796 init_utsname()->version);
5797- printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip);
5798- printk_address(regs->rip);
5799- printk("RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->rsp,
5800- regs->eflags);
5801+ printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
5802+ printk_address(regs->ip, 1);
5803+ printk("RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->sp,
5804+ regs->flags);
5805 printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
5806- regs->rax, regs->rbx, regs->rcx);
5807+ regs->ax, regs->bx, regs->cx);
5808 printk("RDX: %016lx RSI: %016lx RDI: %016lx\n",
5809- regs->rdx, regs->rsi, regs->rdi);
5810+ regs->dx, regs->si, regs->di);
5811 printk("RBP: %016lx R08: %016lx R09: %016lx\n",
5812- regs->rbp, regs->r8, regs->r9);
5813+ regs->bp, regs->r8, regs->r9);
5814 printk("R10: %016lx R11: %016lx R12: %016lx\n",
5815 regs->r10, regs->r11, regs->r12);
5816 printk("R13: %016lx R14: %016lx R15: %016lx\n",
5817@@ -318,7 +292,7 @@ void show_regs(struct pt_regs *regs)
5818 {
5819 printk("CPU %d:", smp_processor_id());
5820 __show_regs(regs);
5821- show_trace(NULL, regs, (void *)(regs + 1));
5822+ show_trace(NULL, regs, (void *)(regs + 1), regs->bp);
5823 }
5824
5825 /*
5826@@ -329,7 +303,7 @@ void exit_thread(void)
5827 struct task_struct *me = current;
5828 struct thread_struct *t = &me->thread;
5829
5830- if (me->thread.io_bitmap_ptr) {
5831+ if (me->thread.io_bitmap_ptr) {
5832 #ifndef CONFIG_X86_NO_TSS
5833 struct tss_struct *tss = &per_cpu(init_tss, get_cpu());
5834 #endif
5835@@ -382,7 +356,7 @@ void flush_thread(void)
5836 tsk->thread.debugreg3 = 0;
5837 tsk->thread.debugreg6 = 0;
5838 tsk->thread.debugreg7 = 0;
5839- memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
5840+ memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
5841 /*
5842 * Forget coprocessor state..
5843 */
5844@@ -405,26 +379,21 @@ void release_thread(struct task_struct *
5845
5846 static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
5847 {
5848- struct user_desc ud = {
5849+ struct user_desc ud = {
5850 .base_addr = addr,
5851 .limit = 0xfffff,
5852 .seg_32bit = 1,
5853 .limit_in_pages = 1,
5854 .useable = 1,
5855 };
5856- struct n_desc_struct *desc = (void *)t->thread.tls_array;
5857+ struct desc_struct *desc = t->thread.tls_array;
5858 desc += tls;
5859- desc->a = LDT_entry_a(&ud);
5860- desc->b = LDT_entry_b(&ud);
5861+ fill_ldt(desc, &ud);
5862 }
5863
5864 static inline u32 read_32bit_tls(struct task_struct *t, int tls)
5865 {
5866- struct desc_struct *desc = (void *)t->thread.tls_array;
5867- desc += tls;
5868- return desc->base0 |
5869- (((u32)desc->base1) << 16) |
5870- (((u32)desc->base2) << 24);
5871+ return get_desc_base(&t->thread.tls_array[tls]);
5872 }
5873
5874 /*
5875@@ -436,7 +405,7 @@ void prepare_to_copy(struct task_struct
5876 unlazy_fpu(tsk);
5877 }
5878
5879-int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp,
5880+int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
5881 unsigned long unused,
5882 struct task_struct * p, struct pt_regs * regs)
5883 {
5884@@ -448,14 +417,14 @@ int copy_thread(int nr, unsigned long cl
5885 (THREAD_SIZE + task_stack_page(p))) - 1;
5886 *childregs = *regs;
5887
5888- childregs->rax = 0;
5889- childregs->rsp = rsp;
5890- if (rsp == ~0UL)
5891- childregs->rsp = (unsigned long)childregs;
5892-
5893- p->thread.rsp = (unsigned long) childregs;
5894- p->thread.rsp0 = (unsigned long) (childregs+1);
5895- p->thread.userrsp = me->thread.userrsp;
5896+ childregs->ax = 0;
5897+ childregs->sp = sp;
5898+ if (sp == ~0UL)
5899+ childregs->sp = (unsigned long)childregs;
5900+
5901+ p->thread.sp = (unsigned long) childregs;
5902+ p->thread.sp0 = (unsigned long) (childregs+1);
5903+ p->thread.usersp = me->thread.usersp;
5904
5905 set_tsk_thread_flag(p, TIF_FORK);
5906
5907@@ -476,7 +445,7 @@ int copy_thread(int nr, unsigned long cl
5908 memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
5909 IO_BITMAP_BYTES);
5910 set_tsk_thread_flag(p, TIF_IO_BITMAP);
5911- }
5912+ }
5913
5914 /*
5915 * Set a new TLS for the child thread?
5916@@ -484,7 +453,8 @@ int copy_thread(int nr, unsigned long cl
5917 if (clone_flags & CLONE_SETTLS) {
5918 #ifdef CONFIG_IA32_EMULATION
5919 if (test_thread_flag(TIF_IA32))
5920- err = ia32_child_tls(p, childregs);
5921+ err = do_set_thread_area(p, -1,
5922+ (struct user_desc __user *)childregs->si, 0);
5923 else
5924 #endif
5925 err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
5926@@ -502,26 +472,32 @@ out:
5927 return err;
5928 }
5929
5930-static inline void __save_init_fpu( struct task_struct *tsk )
5931-{
5932- asm volatile( "rex64 ; fxsave %0 ; fnclex"
5933- : "=m" (tsk->thread.i387.fxsave));
5934- tsk->thread_info->status &= ~TS_USEDFPU;
5935-}
5936-
5937 /*
5938 * This special macro can be used to load a debugging register
5939 */
5940-#define loaddebug(thread,r) set_debugreg(thread->debugreg ## r, r)
5941+#define loaddebug(thread, r) set_debugreg(thread->debugreg ## r, r)
5942
5943 static inline void __switch_to_xtra(struct task_struct *prev_p,
5944- struct task_struct *next_p)
5945+ struct task_struct *next_p)
5946 {
5947 struct thread_struct *prev, *next;
5948+ unsigned long debugctl;
5949
5950 prev = &prev_p->thread,
5951 next = &next_p->thread;
5952
5953+ debugctl = prev->debugctlmsr;
5954+ if (next->ds_area_msr != prev->ds_area_msr) {
5955+ /* we clear debugctl to make sure DS
5956+ * is not in use when we change it */
5957+ debugctl = 0;
5958+ wrmsrl(MSR_IA32_DEBUGCTLMSR, 0);
5959+ wrmsrl(MSR_IA32_DS_AREA, next->ds_area_msr);
5960+ }
5961+
5962+ if (next->debugctlmsr != debugctl)
5963+ wrmsrl(MSR_IA32_DEBUGCTLMSR, next->debugctlmsr);
5964+
5965 if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
5966 loaddebug(next, 0);
5967 loaddebug(next, 1);
5968@@ -531,12 +507,20 @@ static inline void __switch_to_xtra(stru
5969 loaddebug(next, 6);
5970 loaddebug(next, 7);
5971 }
5972+
5973+#ifdef X86_BTS
5974+ if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
5975+ ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS);
5976+
5977+ if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS))
5978+ ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES);
5979+#endif
5980 }
5981
5982 /*
5983 * switch_to(x,y) should switch tasks from x to y.
5984 *
5985- * This could still be optimized:
5986+ * This could still be optimized:
5987 * - fold all the options into a flag word and test it with a single test.
5988 * - could test fs/gs bitsliced
5989 *
5990@@ -547,7 +531,7 @@ __switch_to(struct task_struct *prev_p,
5991 {
5992 struct thread_struct *prev = &prev_p->thread,
5993 *next = &next_p->thread;
5994- int cpu = smp_processor_id();
5995+ int cpu = smp_processor_id();
5996 #ifndef CONFIG_X86_NO_TSS
5997 struct tss_struct *tss = &per_cpu(init_tss, cpu);
5998 #endif
5999@@ -581,11 +565,12 @@ __switch_to(struct task_struct *prev_p,
6000 prev_p->fpu_counter = 0;
6001
6002 /*
6003- * Reload esp0, LDT and the page table pointer:
6004+ * Reload sp0.
6005+ * This is load_sp0(tss, next) with a multicall.
6006 */
6007 mcl->op = __HYPERVISOR_stack_switch;
6008 mcl->args[0] = __KERNEL_DS;
6009- mcl->args[1] = next->rsp0;
6010+ mcl->args[1] = next->sp0;
6011 mcl++;
6012
6013 /*
6014@@ -593,11 +578,12 @@ __switch_to(struct task_struct *prev_p,
6015 * This is load_TLS(next, cpu) with multicalls.
6016 */
6017 #define C(i) do { \
6018- if (unlikely(next->tls_array[i] != prev->tls_array[i])) { \
6019+ if (unlikely(next->tls_array[i].a != prev->tls_array[i].a || \
6020+ next->tls_array[i].b != prev->tls_array[i].b)) { \
6021 mcl->op = __HYPERVISOR_update_descriptor; \
6022 mcl->args[0] = virt_to_machine( \
6023- &cpu_gdt(cpu)[GDT_ENTRY_TLS_MIN + i]); \
6024- mcl->args[1] = next->tls_array[i]; \
6025+ &get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i]);\
6026+ mcl->args[1] = *(u64 *)&next->tls_array[i]; \
6027 mcl++; \
6028 } \
6029 } while (0)
6030@@ -605,7 +591,7 @@ __switch_to(struct task_struct *prev_p,
6031 #undef C
6032
6033 if (unlikely(prev->iopl != next->iopl)) {
6034- iopl_op.iopl = (next->iopl == 0) ? 1 : next->iopl;
6035+ iopl_op.iopl = (next->iopl == 0) ? 1 : (next->iopl >> 12) & 3;
6036 #if CONFIG_XEN_COMPAT > 0x030002
6037 mcl->op = __HYPERVISOR_physdev_op;
6038 mcl->args[0] = PHYSDEVOP_set_iopl;
6039@@ -669,8 +655,8 @@ __switch_to(struct task_struct *prev_p,
6040 /*
6041 * Switch the PDA context.
6042 */
6043- prev->userrsp = read_pda(oldrsp);
6044- write_pda(oldrsp, next->userrsp);
6045+ prev->usersp = read_pda(oldrsp);
6046+ write_pda(oldrsp, next->usersp);
6047 write_pda(pcurrent, next_p);
6048 write_pda(kernelstack,
6049 (unsigned long)task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET);
6050@@ -687,7 +673,8 @@ __switch_to(struct task_struct *prev_p,
6051 /*
6052 * Now maybe reload the debug registers
6053 */
6054- if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW))
6055+ if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
6056+ task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
6057 __switch_to_xtra(prev_p, next_p);
6058
6059 /* If the task has used fpu the last 5 timeslices, just do a full
6060@@ -702,23 +689,18 @@ __switch_to(struct task_struct *prev_p,
6061 /*
6062 * sys_execve() executes a new program.
6063 */
6064-asmlinkage
6065+asmlinkage
6066 long sys_execve(char __user *name, char __user * __user *argv,
6067- char __user * __user *envp, struct pt_regs regs)
6068+ char __user * __user *envp, struct pt_regs *regs)
6069 {
6070 long error;
6071 char * filename;
6072
6073 filename = getname(name);
6074 error = PTR_ERR(filename);
6075- if (IS_ERR(filename))
6076+ if (IS_ERR(filename))
6077 return error;
6078- error = do_execve(filename, argv, envp, &regs);
6079- if (error == 0) {
6080- task_lock(current);
6081- current->ptrace &= ~PT_DTRACE;
6082- task_unlock(current);
6083- }
6084+ error = do_execve(filename, argv, envp, regs);
6085 putname(filename);
6086 return error;
6087 }
6088@@ -728,18 +710,18 @@ void set_personality_64bit(void)
6089 /* inherit personality from parent */
6090
6091 /* Make sure to be in 64bit mode */
6092- clear_thread_flag(TIF_IA32);
6093+ clear_thread_flag(TIF_IA32);
6094
6095 /* TBD: overwrites user setup. Should have two bits.
6096 But 64bit processes have always behaved this way,
6097 so it's not too bad. The main problem is just that
6098- 32bit childs are affected again. */
6099+ 32bit childs are affected again. */
6100 current->personality &= ~READ_IMPLIES_EXEC;
6101 }
6102
6103 asmlinkage long sys_fork(struct pt_regs *regs)
6104 {
6105- return do_fork(SIGCHLD, regs->rsp, regs, 0, NULL, NULL);
6106+ return do_fork(SIGCHLD, regs->sp, regs, 0, NULL, NULL);
6107 }
6108
6109 asmlinkage long
6110@@ -747,7 +729,7 @@ sys_clone(unsigned long clone_flags, uns
6111 void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
6112 {
6113 if (!newsp)
6114- newsp = regs->rsp;
6115+ newsp = regs->sp;
6116 return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
6117 }
6118
6119@@ -763,29 +745,29 @@ sys_clone(unsigned long clone_flags, uns
6120 */
6121 asmlinkage long sys_vfork(struct pt_regs *regs)
6122 {
6123- return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->rsp, regs, 0,
6124+ return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->sp, regs, 0,
6125 NULL, NULL);
6126 }
6127
6128 unsigned long get_wchan(struct task_struct *p)
6129 {
6130 unsigned long stack;
6131- u64 fp,rip;
6132+ u64 fp,ip;
6133 int count = 0;
6134
6135 if (!p || p == current || p->state==TASK_RUNNING)
6136 return 0;
6137 stack = (unsigned long)task_stack_page(p);
6138- if (p->thread.rsp < stack || p->thread.rsp > stack+THREAD_SIZE)
6139+ if (p->thread.sp < stack || p->thread.sp > stack+THREAD_SIZE)
6140 return 0;
6141- fp = *(u64 *)(p->thread.rsp);
6142+ fp = *(u64 *)(p->thread.sp);
6143 do {
6144 if (fp < (unsigned long)stack ||
6145 fp > (unsigned long)stack+THREAD_SIZE)
6146 return 0;
6147- rip = *(u64 *)(fp+8);
6148- if (!in_sched_functions(rip))
6149- return rip;
6150+ ip = *(u64 *)(fp+8);
6151+ if (!in_sched_functions(ip))
6152+ return ip;
6153 fp = *(u64 *)fp;
6154 } while (count++ < 16);
6155 return 0;
6156@@ -827,19 +809,19 @@ long do_arch_prctl(struct task_struct *t
6157 /* Not strictly needed for fs, but do it for symmetry
6158 with gs */
6159 if (addr >= TASK_SIZE_OF(task))
6160- return -EPERM;
6161+ return -EPERM;
6162 cpu = get_cpu();
6163- /* handle small bases via the GDT because that's faster to
6164+ /* handle small bases via the GDT because that's faster to
6165 switch. */
6166- if (addr <= 0xffffffff) {
6167+ if (addr <= 0xffffffff) {
6168 set_32bit_tls(task, FS_TLS, addr);
6169- if (doit) {
6170- load_TLS(&task->thread, cpu);
6171+ if (doit) {
6172+ load_TLS(&task->thread, cpu);
6173 asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL));
6174 }
6175 task->thread.fsindex = FS_TLS_SEL;
6176 task->thread.fs = 0;
6177- } else {
6178+ } else {
6179 task->thread.fsindex = 0;
6180 task->thread.fs = addr;
6181 if (doit) {
6182@@ -852,24 +834,24 @@ long do_arch_prctl(struct task_struct *t
6183 }
6184 put_cpu();
6185 break;
6186- case ARCH_GET_FS: {
6187- unsigned long base;
6188+ case ARCH_GET_FS: {
6189+ unsigned long base;
6190 if (task->thread.fsindex == FS_TLS_SEL)
6191 base = read_32bit_tls(task, FS_TLS);
6192 else if (doit)
6193 rdmsrl(MSR_FS_BASE, base);
6194 else
6195 base = task->thread.fs;
6196- ret = put_user(base, (unsigned long __user *)addr);
6197- break;
6198+ ret = put_user(base, (unsigned long __user *)addr);
6199+ break;
6200 }
6201- case ARCH_GET_GS: {
6202+ case ARCH_GET_GS: {
6203 unsigned long base;
6204 unsigned gsindex;
6205 if (task->thread.gsindex == GS_TLS_SEL)
6206 base = read_32bit_tls(task, GS_TLS);
6207 else if (doit) {
6208- asm("movl %%gs,%0" : "=r" (gsindex));
6209+ asm("movl %%gs,%0" : "=r" (gsindex));
6210 if (gsindex)
6211 rdmsrl(MSR_KERNEL_GS_BASE, base);
6212 else
6213@@ -877,40 +859,21 @@ long do_arch_prctl(struct task_struct *t
6214 }
6215 else
6216 base = task->thread.gs;
6217- ret = put_user(base, (unsigned long __user *)addr);
6218+ ret = put_user(base, (unsigned long __user *)addr);
6219 break;
6220 }
6221
6222 default:
6223 ret = -EINVAL;
6224 break;
6225- }
6226+ }
6227
6228- return ret;
6229-}
6230+ return ret;
6231+}
6232
6233 long sys_arch_prctl(int code, unsigned long addr)
6234 {
6235 return do_arch_prctl(current, code, addr);
6236-}
6237-
6238-/*
6239- * Capture the user space registers if the task is not running (in user space)
6240- */
6241-int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs)
6242-{
6243- struct pt_regs *pp, ptregs;
6244-
6245- pp = task_pt_regs(tsk);
6246-
6247- ptregs = *pp;
6248- ptregs.cs &= 0xffff;
6249- ptregs.ss &= 0xffff;
6250-
6251- elf_core_copy_regs(regs, &ptregs);
6252-
6253- boot_option_idle_override = 1;
6254- return 1;
6255 }
6256
6257 unsigned long arch_align_stack(unsigned long sp)
6258@@ -919,3 +882,9 @@ unsigned long arch_align_stack(unsigned
6259 sp -= get_random_int() % 8192;
6260 return sp & ~0xf;
6261 }
6262+
6263+unsigned long arch_randomize_brk(struct mm_struct *mm)
6264+{
6265+ unsigned long range_end = mm->brk + 0x02000000;
6266+ return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
6267+}
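
The reworked get_wchan() above walks a sleeping task's saved frame-pointer chain: the word at thread.sp is the frame pointer pushed by switch_to, each frame holds the caller's return address at fp+8, and the loop gives up after 16 frames or once it leaves the task's stack page. A rough user-space analogue of that walk, assuming GCC, __builtin_frame_address() and frame pointers being kept (-O0 -fno-omit-frame-pointer); the top_fp bound is a stand-in for the kernel's stack-page check, and there is no user-space equivalent of the in_sched_functions() filter:

/* build: gcc -O0 -fno-omit-frame-pointer wchan-sketch.c */
#include <stdio.h>
#include <stdint.h>

static uint64_t *top_fp;	/* frame pointer of main(), our "stack page" bound */

static void walk_frames(void)
{
	uint64_t *fp = __builtin_frame_address(0);
	int count = 0;

	while (count++ < 16) {
		uint64_t ip = fp[1];			/* caller's return address, like *(fp+8) */
		uint64_t *next = (uint64_t *)fp[0];	/* frame pointer saved by the caller */

		printf("frame %2d: fp=%p ip=0x%llx\n", count, (void *)fp,
		       (unsigned long long)ip);
		if (next <= fp || next > top_fp)	/* left our own frames: stop, as the
							 * stack bounds check does in get_wchan() */
			break;
		fp = next;
	}
}

static void level2(void) { walk_frames(); }
static void level1(void) { level2(); }

int main(void)
{
	top_fp = __builtin_frame_address(0);
	level1();
	return 0;
}
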
6268--- sle11-2009-06-29.orig/arch/x86/kernel/quirks-xen.c 2009-02-16 16:18:36.000000000 +0100
6269+++ sle11-2009-06-29/arch/x86/kernel/quirks-xen.c 2009-03-16 16:33:40.000000000 +0100
6270@@ -9,7 +9,7 @@
6271 static void __devinit quirk_intel_irqbalance(struct pci_dev *dev)
6272 {
6273 u8 config, rev;
6274- u32 word;
6275+ u16 word;
6276
6277 /* BIOS may enable hardware IRQ balancing for
6278 * E7520/E7320/E7525(revision ID 0x9 and below)
6279@@ -24,14 +24,17 @@ static void __devinit quirk_intel_irqbal
6280 pci_read_config_byte(dev, 0xf4, &config);
6281 pci_write_config_byte(dev, 0xf4, config|0x2);
6282
6283- /* read xTPR register */
6284- raw_pci_ops->read(0, 0, 0x40, 0x4c, 2, &word);
6285+ /*
6286+ * read xTPR register. We may not have a pci_dev for device 8
6287+ * because it might be hidden until the above write.
6288+ */
6289+ pci_bus_read_config_word(dev->bus, PCI_DEVFN(8, 0), 0x4c, &word);
6290
6291 if (!(word & (1 << 13))) {
6292 struct xen_platform_op op;
6293
6294- printk(KERN_INFO "Intel E7520/7320/7525 detected. "
6295- "Disabling irq balancing and affinity\n");
6296+ dev_info(&dev->dev, "Intel E7520/7320/7525 detected; "
6297+ "disabling irq balancing and affinity\n");
6298 op.cmd = XENPF_platform_quirk;
6299 op.u.platform_quirk.quirk_id = QUIRK_NOIRQBALANCING;
6300 WARN_ON(HYPERVISOR_platform_op(&op));
6301@@ -102,14 +105,16 @@ static void ich_force_enable_hpet(struct
6302 pci_read_config_dword(dev, 0xF0, &rcba);
6303 rcba &= 0xFFFFC000;
6304 if (rcba == 0) {
6305- printk(KERN_DEBUG "RCBA disabled. Cannot force enable HPET\n");
6306+ dev_printk(KERN_DEBUG, &dev->dev, "RCBA disabled; "
6307+ "cannot force enable HPET\n");
6308 return;
6309 }
6310
6311 /* use bits 31:14, 16 kB aligned */
6312 rcba_base = ioremap_nocache(rcba, 0x4000);
6313 if (rcba_base == NULL) {
6314- printk(KERN_DEBUG "ioremap failed. Cannot force enable HPET\n");
6315+ dev_printk(KERN_DEBUG, &dev->dev, "ioremap failed; "
6316+ "cannot force enable HPET\n");
6317 return;
6318 }
6319
6320@@ -120,8 +125,8 @@ static void ich_force_enable_hpet(struct
6321 /* HPET is enabled in HPTC. Just not reported by BIOS */
6322 val = val & 0x3;
6323 force_hpet_address = 0xFED00000 | (val << 12);
6324- printk(KERN_DEBUG "Force enabled HPET at base address 0x%lx\n",
6325- force_hpet_address);
6326+ dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at "
6327+ "0x%lx\n", force_hpet_address);
6328 iounmap(rcba_base);
6329 return;
6330 }
6331@@ -140,11 +145,12 @@ static void ich_force_enable_hpet(struct
6332 if (err) {
6333 force_hpet_address = 0;
6334 iounmap(rcba_base);
6335- printk(KERN_DEBUG "Failed to force enable HPET\n");
6336+ dev_printk(KERN_DEBUG, &dev->dev,
6337+ "Failed to force enable HPET\n");
6338 } else {
6339 force_hpet_resume_type = ICH_FORCE_HPET_RESUME;
6340- printk(KERN_DEBUG "Force enabled HPET at base address 0x%lx\n",
6341- force_hpet_address);
6342+ dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at "
6343+ "0x%lx\n", force_hpet_address);
6344 }
6345 }
6346
6347@@ -160,6 +166,8 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_I
6348 ich_force_enable_hpet);
6349 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH8_1,
6350 ich_force_enable_hpet);
6351+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH9_7,
6352+ ich_force_enable_hpet);
6353
6354
6355 static struct pci_dev *cached_dev;
6356@@ -204,8 +212,8 @@ static void old_ich_force_enable_hpet(st
6357 if (val & 0x4) {
6358 val &= 0x3;
6359 force_hpet_address = 0xFED00000 | (val << 12);
6360- printk(KERN_DEBUG "HPET at base address 0x%lx\n",
6361- force_hpet_address);
6362+ dev_printk(KERN_DEBUG, &dev->dev, "HPET at 0x%lx\n",
6363+ force_hpet_address);
6364 return;
6365 }
6366
6367@@ -225,14 +233,14 @@ static void old_ich_force_enable_hpet(st
6368 /* HPET is enabled in HPTC. Just not reported by BIOS */
6369 val &= 0x3;
6370 force_hpet_address = 0xFED00000 | (val << 12);
6371- printk(KERN_DEBUG "Force enabled HPET at base address 0x%lx\n",
6372- force_hpet_address);
6373+ dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at "
6374+ "0x%lx\n", force_hpet_address);
6375 cached_dev = dev;
6376 force_hpet_resume_type = OLD_ICH_FORCE_HPET_RESUME;
6377 return;
6378 }
6379
6380- printk(KERN_DEBUG "Failed to force enable HPET\n");
6381+ dev_printk(KERN_DEBUG, &dev->dev, "Failed to force enable HPET\n");
6382 }
6383
6384 /*
6385@@ -290,8 +298,8 @@ static void vt8237_force_enable_hpet(str
6386 */
6387 if (val & 0x80) {
6388 force_hpet_address = (val & ~0x3ff);
6389- printk(KERN_DEBUG "HPET at base address 0x%lx\n",
6390- force_hpet_address);
6391+ dev_printk(KERN_DEBUG, &dev->dev, "HPET at 0x%lx\n",
6392+ force_hpet_address);
6393 return;
6394 }
6395
6396@@ -305,14 +313,14 @@ static void vt8237_force_enable_hpet(str
6397 pci_read_config_dword(dev, 0x68, &val);
6398 if (val & 0x80) {
6399 force_hpet_address = (val & ~0x3ff);
6400- printk(KERN_DEBUG "Force enabled HPET at base address 0x%lx\n",
6401- force_hpet_address);
6402+ dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at "
6403+ "0x%lx\n", force_hpet_address);
6404 cached_dev = dev;
6405 force_hpet_resume_type = VT8237_FORCE_HPET_RESUME;
6406 return;
6407 }
6408
6409- printk(KERN_DEBUG "Failed to force enable HPET\n");
6410+ dev_printk(KERN_DEBUG, &dev->dev, "Failed to force enable HPET\n");
6411 }
6412
6413 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8235,
6414@@ -340,7 +348,7 @@ static void nvidia_force_enable_hpet(str
6415 pci_read_config_dword(dev, 0x44, &val);
6416 force_hpet_address = val & 0xfffffffe;
6417 force_hpet_resume_type = NVIDIA_FORCE_HPET_RESUME;
6418- printk(KERN_DEBUG "Force enabled HPET at base address 0x%lx\n",
6419+ dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at 0x%lx\n",
6420 force_hpet_address);
6421 cached_dev = dev;
6422 return;
6423@@ -353,6 +361,8 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_N
6424 nvidia_force_enable_hpet);
6425
6426 /* LPC bridges */
6427+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0260,
6428+ nvidia_force_enable_hpet);
6429 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0360,
6430 nvidia_force_enable_hpet);
6431 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0361,
6432@@ -373,19 +383,19 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_N
6433 void force_hpet_resume(void)
6434 {
6435 switch (force_hpet_resume_type) {
6436- case ICH_FORCE_HPET_RESUME:
6437- return ich_force_hpet_resume();
6438-
6439- case OLD_ICH_FORCE_HPET_RESUME:
6440- return old_ich_force_hpet_resume();
6441-
6442- case VT8237_FORCE_HPET_RESUME:
6443- return vt8237_force_hpet_resume();
6444-
6445- case NVIDIA_FORCE_HPET_RESUME:
6446- return nvidia_force_hpet_resume();
6447-
6448- default:
6449+ case ICH_FORCE_HPET_RESUME:
6450+ ich_force_hpet_resume();
6451+ return;
6452+ case OLD_ICH_FORCE_HPET_RESUME:
6453+ old_ich_force_hpet_resume();
6454+ return;
6455+ case VT8237_FORCE_HPET_RESUME:
6456+ vt8237_force_hpet_resume();
6457+ return;
6458+ case NVIDIA_FORCE_HPET_RESUME:
6459+ nvidia_force_hpet_resume();
6460+ return;
6461+ default:
6462 break;
6463 }
6464 }
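
The ICH force-enable paths above all derive the HPET base from the two low HPTC bits as 0xFED00000 | (val << 12), i.e. one of four 4 KiB-spaced windows starting at the architectural default address. A tiny sketch of just that decode; looping over all four selector values is purely illustrative:

#include <stdio.h>

int main(void)
{
	unsigned int sel;

	/* bits 1:0 of HPTC select the decode window; 0xFED00000 is the
	 * architectural HPET default base, windows are 4 KiB apart */
	for (sel = 0; sel < 4; sel++) {
		unsigned long base = 0xFED00000UL | ((unsigned long)(sel & 0x3) << 12);
		printf("HPTC select %u -> HPET at 0x%08lx\n", sel, base);
	}
	return 0;
}
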
6465--- sle11-2009-06-29.orig/arch/x86/kernel/rtc.c 2009-06-29 15:14:52.000000000 +0200
6466+++ sle11-2009-06-29/arch/x86/kernel/rtc.c 2009-03-16 16:33:40.000000000 +0100
6467@@ -181,6 +181,10 @@ unsigned long read_persistent_clock(void
6468 {
6469 unsigned long retval, flags;
6470
6471+#ifdef CONFIG_XEN
6472+ if (!is_initial_xendomain())
6473+ return xen_read_persistent_clock();
6474+#endif
6475 spin_lock_irqsave(&rtc_lock, flags);
6476 retval = get_wallclock();
6477 spin_unlock_irqrestore(&rtc_lock, flags);
6478@@ -190,6 +194,10 @@ unsigned long read_persistent_clock(void
6479
6480 int update_persistent_clock(struct timespec now)
6481 {
6482+#ifdef CONFIG_XEN
6483+ if (xen_update_persistent_clock() < 0 || xen_independent_wallclock())
6484+ return 0;
6485+#endif
6486 return set_rtc_mmss(now.tv_sec);
6487 }
6488
6489--- sle11-2009-06-29.orig/arch/x86/kernel/setup64-xen.c 2009-02-16 16:18:36.000000000 +0100
6490+++ sle11-2009-06-29/arch/x86/kernel/setup64-xen.c 2009-03-16 16:33:40.000000000 +0100
6491@@ -31,7 +31,11 @@
6492 #include <asm/hypervisor.h>
6493 #endif
6494
6495+#ifndef CONFIG_DEBUG_BOOT_PARAMS
6496 struct boot_params __initdata boot_params;
6497+#else
6498+struct boot_params boot_params;
6499+#endif
6500
6501 cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
6502
6503@@ -47,6 +51,7 @@ char boot_cpu_stack[IRQSTACKSIZE] __attr
6504
6505 unsigned long __supported_pte_mask __read_mostly = ~0UL;
6506 EXPORT_SYMBOL(__supported_pte_mask);
6507+
6508 static int do_not_nx __cpuinitdata = 0;
6509
6510 /* noexec=on|off
6511@@ -90,6 +95,45 @@ static int __init nonx32_setup(char *str
6512 __setup("noexec32=", nonx32_setup);
6513
6514 /*
6515+ * Copy data used in early init routines from the initial arrays to the
6516+ * per cpu data areas. These arrays then become expendable and the
6517+ * *_early_ptr's are zeroed indicating that the static arrays are gone.
6518+ */
6519+static void __init setup_per_cpu_maps(void)
6520+{
6521+#ifndef CONFIG_XEN
6522+ int cpu;
6523+
6524+ for_each_possible_cpu(cpu) {
6525+#ifdef CONFIG_SMP
6526+ if (per_cpu_offset(cpu)) {
6527+#endif
6528+ per_cpu(x86_cpu_to_apicid, cpu) =
6529+ x86_cpu_to_apicid_init[cpu];
6530+ per_cpu(x86_bios_cpu_apicid, cpu) =
6531+ x86_bios_cpu_apicid_init[cpu];
6532+#ifdef CONFIG_NUMA
6533+ per_cpu(x86_cpu_to_node_map, cpu) =
6534+ x86_cpu_to_node_map_init[cpu];
6535+#endif
6536+#ifdef CONFIG_SMP
6537+ }
6538+ else
6539+ printk(KERN_NOTICE "per_cpu_offset zero for cpu %d\n",
6540+ cpu);
6541+#endif
6542+ }
6543+
6544+ /* indicate the early static arrays will soon be gone */
6545+ x86_cpu_to_apicid_early_ptr = NULL;
6546+ x86_bios_cpu_apicid_early_ptr = NULL;
6547+#ifdef CONFIG_NUMA
6548+ x86_cpu_to_node_map_early_ptr = NULL;
6549+#endif
6550+#endif
6551+}
6552+
6553+/*
6554 * Great future plan:
6555 * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data.
6556 * Always point %gs to its beginning
6557@@ -109,19 +153,24 @@ void __init setup_per_cpu_areas(void)
6558 printk(KERN_INFO "PERCPU: Allocating %lu bytes of per cpu data\n", size);
6559 for_each_cpu_mask (i, cpu_possible_map) {
6560 char *ptr;
6561+#ifndef CONFIG_NEED_MULTIPLE_NODES
6562+ ptr = alloc_bootmem_pages(size);
6563+#else
6564+ int node = early_cpu_to_node(i);
6565
6566- if (!NODE_DATA(cpu_to_node(i))) {
6567- printk("cpu with no node %d, num_online_nodes %d\n",
6568- i, num_online_nodes());
6569+ if (!node_online(node) || !NODE_DATA(node))
6570 ptr = alloc_bootmem_pages(size);
6571- } else {
6572- ptr = alloc_bootmem_pages_node(NODE_DATA(cpu_to_node(i)), size);
6573- }
6574+ else
6575+ ptr = alloc_bootmem_pages_node(NODE_DATA(node), size);
6576+#endif
6577 if (!ptr)
6578 panic("Cannot allocate cpu data for CPU %d\n", i);
6579 cpu_pda(i)->data_offset = ptr - __per_cpu_start;
6580 memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
6581 }
6582+
6583+ /* setup percpu data maps early */
6584+ setup_per_cpu_maps();
6585 }
6586
6587 #ifdef CONFIG_XEN
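
setup_per_cpu_areas() in the hunk above allocates one copy of the per-CPU data template for each possible CPU (preferring node-local bootmem) and records ptr - __per_cpu_start as that CPU's data_offset; later per-CPU accesses add the offset to the variable's template address. A self-contained sketch of that offset scheme — the struct, the NR_CPUS value and my_per_cpu_ptr() are made-up stand-ins for the linker-delimited .data.percpu section and the kernel's real macros:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define NR_CPUS 4	/* made-up CPU count for the sketch */

/* stand-in for the __per_cpu_start..__per_cpu_end template section */
struct percpu_template {
	int counter;
	long scratch;
};
static struct percpu_template pcpu_template = { .counter = 0, .scratch = -1 };

static long data_offset[NR_CPUS];	/* what the patch keeps in cpu_pda(i)->data_offset */

/* made-up analogue of per_cpu(): template address of the variable + CPU offset */
#define my_per_cpu_ptr(var, cpu) \
	((typeof(&(var)))((char *)&(var) + data_offset[cpu]))

int main(void)
{
	int cpu;

	for (cpu = 0; cpu < NR_CPUS; cpu++) {
		char *ptr = malloc(sizeof(pcpu_template));	/* alloc_bootmem_pages() stand-in */
		memcpy(ptr, &pcpu_template, sizeof(pcpu_template));
		data_offset[cpu] = ptr - (char *)&pcpu_template;
	}

	for (cpu = 0; cpu < NR_CPUS; cpu++)
		*my_per_cpu_ptr(pcpu_template.counter, cpu) = cpu * 100;

	for (cpu = 0; cpu < NR_CPUS; cpu++)
		printf("cpu%d counter=%d scratch=%ld\n", cpu,
		       *my_per_cpu_ptr(pcpu_template.counter, cpu),
		       *my_per_cpu_ptr(pcpu_template.scratch, cpu));
	return 0;
}

The pointer subtraction between separate allocations is not strictly portable C, but it mirrors how data_offset is actually consumed.
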
6588@@ -224,7 +273,8 @@ void syscall_init(void)
6589 wrmsrl(MSR_CSTAR, ignore_sysret);
6590
6591 /* Flags to clear on syscall */
6592- wrmsrl(MSR_SYSCALL_MASK, EF_TF|EF_DF|EF_IE|0x3000);
6593+ wrmsrl(MSR_SYSCALL_MASK,
6594+ X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|X86_EFLAGS_IOPL);
6595 #endif
6596 #ifdef CONFIG_IA32_EMULATION
6597 syscall32_cpu_init ();
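
The MSR_SYSCALL_MASK hunk above only replaces magic numbers with the named X86_EFLAGS_* constants; the value written to the MSR is unchanged, since the old EF_TF/EF_DF/EF_IE constants carried the same hardware bit values and 0x3000 is the IOPL field. A two-line check of that equivalence, with the constants redefined locally at their architectural values:

#include <stdio.h>

#define X86_EFLAGS_TF	0x00000100	/* trap flag */
#define X86_EFLAGS_IF	0x00000200	/* interrupt enable flag */
#define X86_EFLAGS_DF	0x00000400	/* direction flag */
#define X86_EFLAGS_IOPL	0x00003000	/* I/O privilege level field */

int main(void)
{
	/* old operand: EF_TF|EF_DF|EF_IE|0x3000 */
	unsigned long old_mask = 0x100 | 0x400 | 0x200 | 0x3000;
	unsigned long new_mask = X86_EFLAGS_TF | X86_EFLAGS_DF |
				 X86_EFLAGS_IF | X86_EFLAGS_IOPL;

	printf("old=0x%lx new=0x%lx identical=%d\n",
	       old_mask, new_mask, old_mask == new_mask);
	return 0;
}
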
6598@@ -303,7 +353,7 @@ void __cpuinit cpu_init (void)
6599 */
6600 #ifndef CONFIG_XEN
6601 if (cpu)
6602- memcpy(cpu_gdt(cpu), cpu_gdt_table, GDT_SIZE);
6603+ memcpy(get_cpu_gdt_table(cpu), cpu_gdt_table, GDT_SIZE);
6604 #endif
6605
6606 cpu_gdt_descr[cpu].size = GDT_SIZE;
6607@@ -334,10 +384,10 @@ void __cpuinit cpu_init (void)
6608 v, cpu);
6609 }
6610 estacks += PAGE_SIZE << order[v];
6611- orig_ist->ist[v] = t->ist[v] = (unsigned long)estacks;
6612+ orig_ist->ist[v] = t->x86_tss.ist[v] = (unsigned long)estacks;
6613 }
6614
6615- t->io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
6616+ t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
6617 /*
6618 * <= is required because the CPU will access up to
6619 * 8 bits beyond the end of the IO permission bitmap.
6620--- sle11-2009-06-29.orig/arch/x86/kernel/setup_32-xen.c 2009-02-16 16:18:36.000000000 +0100
6621+++ sle11-2009-06-29/arch/x86/kernel/setup_32-xen.c 2009-03-16 16:33:40.000000000 +0100
6622@@ -47,9 +47,12 @@
6623 #include <linux/crash_dump.h>
6624 #include <linux/dmi.h>
6625 #include <linux/pfn.h>
6626+#include <linux/pci.h>
6627+#include <linux/init_ohci1394_dma.h>
6628
6629 #include <video/edid.h>
6630
6631+#include <asm/mtrr.h>
6632 #include <asm/apic.h>
6633 #include <asm/e820.h>
6634 #include <asm/mpspec.h>
6635@@ -79,14 +82,83 @@ static struct notifier_block xen_panic_b
6636 xen_panic_event, NULL, 0 /* try to go last */
6637 };
6638
6639-int disable_pse __cpuinitdata = 0;
6640-
6641 /*
6642 * Machine setup..
6643 */
6644-extern struct resource code_resource;
6645-extern struct resource data_resource;
6646-extern struct resource bss_resource;
6647+static struct resource data_resource = {
6648+ .name = "Kernel data",
6649+ .start = 0,
6650+ .end = 0,
6651+ .flags = IORESOURCE_BUSY | IORESOURCE_MEM
6652+};
6653+
6654+static struct resource code_resource = {
6655+ .name = "Kernel code",
6656+ .start = 0,
6657+ .end = 0,
6658+ .flags = IORESOURCE_BUSY | IORESOURCE_MEM
6659+};
6660+
6661+static struct resource bss_resource = {
6662+ .name = "Kernel bss",
6663+ .start = 0,
6664+ .end = 0,
6665+ .flags = IORESOURCE_BUSY | IORESOURCE_MEM
6666+};
6667+
6668+static struct resource video_ram_resource = {
6669+ .name = "Video RAM area",
6670+ .start = 0xa0000,
6671+ .end = 0xbffff,
6672+ .flags = IORESOURCE_BUSY | IORESOURCE_MEM
6673+};
6674+
6675+static struct resource standard_io_resources[] = { {
6676+ .name = "dma1",
6677+ .start = 0x0000,
6678+ .end = 0x001f,
6679+ .flags = IORESOURCE_BUSY | IORESOURCE_IO
6680+}, {
6681+ .name = "pic1",
6682+ .start = 0x0020,
6683+ .end = 0x0021,
6684+ .flags = IORESOURCE_BUSY | IORESOURCE_IO
6685+}, {
6686+ .name = "timer0",
6687+ .start = 0x0040,
6688+ .end = 0x0043,
6689+ .flags = IORESOURCE_BUSY | IORESOURCE_IO
6690+}, {
6691+ .name = "timer1",
6692+ .start = 0x0050,
6693+ .end = 0x0053,
6694+ .flags = IORESOURCE_BUSY | IORESOURCE_IO
6695+}, {
6696+ .name = "keyboard",
6697+ .start = 0x0060,
6698+ .end = 0x006f,
6699+ .flags = IORESOURCE_BUSY | IORESOURCE_IO
6700+}, {
6701+ .name = "dma page reg",
6702+ .start = 0x0080,
6703+ .end = 0x008f,
6704+ .flags = IORESOURCE_BUSY | IORESOURCE_IO
6705+}, {
6706+ .name = "pic2",
6707+ .start = 0x00a0,
6708+ .end = 0x00a1,
6709+ .flags = IORESOURCE_BUSY | IORESOURCE_IO
6710+}, {
6711+ .name = "dma2",
6712+ .start = 0x00c0,
6713+ .end = 0x00df,
6714+ .flags = IORESOURCE_BUSY | IORESOURCE_IO
6715+}, {
6716+ .name = "fpu",
6717+ .start = 0x00f0,
6718+ .end = 0x00ff,
6719+ .flags = IORESOURCE_BUSY | IORESOURCE_IO
6720+} };
6721
6722 /* cpu data as detected by the assembly code in head.S */
6723 struct cpuinfo_x86 new_cpu_data __cpuinitdata = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
6724@@ -94,13 +166,16 @@ struct cpuinfo_x86 new_cpu_data __cpuini
6725 struct cpuinfo_x86 boot_cpu_data __read_mostly = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
6726 EXPORT_SYMBOL(boot_cpu_data);
6727
6728+#ifndef CONFIG_X86_PAE
6729 unsigned long mmu_cr4_features;
6730+#else
6731+unsigned long mmu_cr4_features = X86_CR4_PAE;
6732+#endif
6733
6734 /* for MCA, but anyone else can use it if they want */
6735 unsigned int machine_id;
6736 unsigned int machine_submodel_id;
6737 unsigned int BIOS_revision;
6738-unsigned int mca_pentium_flag;
6739
6740 /* Boot loader ID as an integer, for the benefit of proc_dointvec */
6741 int bootloader_type;
6742@@ -131,13 +206,17 @@ extern int root_mountflags;
6743
6744 unsigned long saved_videomode;
6745
6746-#define RAMDISK_IMAGE_START_MASK 0x07FF
6747+#define RAMDISK_IMAGE_START_MASK 0x07FF
6748 #define RAMDISK_PROMPT_FLAG 0x8000
6749-#define RAMDISK_LOAD_FLAG 0x4000
6750+#define RAMDISK_LOAD_FLAG 0x4000
6751
6752 static char __initdata command_line[COMMAND_LINE_SIZE];
6753
6754+#ifndef CONFIG_DEBUG_BOOT_PARAMS
6755 struct boot_params __initdata boot_params;
6756+#else
6757+struct boot_params boot_params;
6758+#endif
6759
6760 /*
6761 * Point at the empty zero page to start with. We map the real shared_info
6762@@ -198,8 +277,7 @@ static int __init parse_mem(char *arg)
6763 return -EINVAL;
6764
6765 if (strcmp(arg, "nopentium") == 0) {
6766- clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability);
6767- disable_pse = 1;
6768+ setup_clear_cpu_cap(X86_FEATURE_PSE);
6769 } else {
6770 /* If the user specifies memory size, we
6771 * limit the BIOS-provided memory map to
6772@@ -208,7 +286,7 @@ static int __init parse_mem(char *arg)
6773 * trim the existing memory map.
6774 */
6775 unsigned long long mem_size;
6776-
6777+
6778 mem_size = memparse(arg, &arg);
6779 limit_regions(mem_size);
6780 user_defined_memmap = 1;
6781@@ -350,7 +428,7 @@ static void __init reserve_ebda_region(v
6782 unsigned int addr;
6783 addr = get_bios_ebda();
6784 if (addr)
6785- reserve_bootmem(addr, PAGE_SIZE);
6786+ reserve_bootmem(addr, PAGE_SIZE, BOOTMEM_DEFAULT);
6787 }
6788 #endif
6789
6790@@ -365,8 +443,6 @@ static unsigned long __init setup_memory
6791 min_low_pfn = PFN_UP(__pa(xen_start_info->pt_base)) +
6792 xen_start_info->nr_pt_frames;
6793
6794- find_max_pfn();
6795-
6796 max_low_pfn = find_max_low_pfn();
6797
6798 #ifdef CONFIG_HIGHMEM
6799@@ -447,7 +523,8 @@ static void __init reserve_crashkernel(v
6800 (unsigned long)(total_mem >> 20));
6801 crashk_res.start = crash_base;
6802 crashk_res.end = crash_base + crash_size - 1;
6803- reserve_bootmem(crash_base, crash_size);
6804+ reserve_bootmem(crash_base, crash_size,
6805+ BOOTMEM_DEFAULT);
6806 } else
6807 printk(KERN_INFO "crashkernel reservation failed - "
6808 "you have to specify a base address\n");
6809@@ -461,6 +538,99 @@ static inline void __init reserve_crashk
6810 {}
6811 #endif
6812
6813+#ifdef CONFIG_BLK_DEV_INITRD
6814+
6815+static bool do_relocate_initrd = false;
6816+
6817+static void __init reserve_initrd(void)
6818+{
6819+ unsigned long ramdisk_image = __pa(xen_start_info->mod_start);
6820+ unsigned long ramdisk_size = xen_start_info->mod_len;
6821+ unsigned long ramdisk_end = ramdisk_image + ramdisk_size;
6822+ unsigned long end_of_lowmem = max_low_pfn << PAGE_SHIFT;
6823+ unsigned long ramdisk_here;
6824+
6825+ initrd_start = 0;
6826+
6827+ if (!xen_start_info->mod_start || !ramdisk_size)
6828+ return; /* No initrd provided by bootloader */
6829+
6830+ if (ramdisk_end < ramdisk_image) {
6831+ printk(KERN_ERR "initrd wraps around end of memory, "
6832+ "disabling initrd\n");
6833+ return;
6834+ }
6835+ if (ramdisk_size >= end_of_lowmem/2) {
6836+ printk(KERN_ERR "initrd too large to handle, "
6837+ "disabling initrd\n");
6838+ return;
6839+ }
6840+ if (ramdisk_end <= end_of_lowmem) {
6841+ /* All in lowmem, easy case */
6842+ reserve_bootmem(ramdisk_image, ramdisk_size, BOOTMEM_DEFAULT);
6843+ initrd_start = ramdisk_image + PAGE_OFFSET;
6844+ initrd_end = initrd_start+ramdisk_size;
6845+ return;
6846+ }
6847+
6848+ /* We need to move the initrd down into lowmem */
6849+ ramdisk_here = (end_of_lowmem - ramdisk_size) & PAGE_MASK;
6850+
6851+ /* Note: this includes all the lowmem currently occupied by
6852+ the initrd, we rely on that fact to keep the data intact. */
6853+ reserve_bootmem(ramdisk_here, ramdisk_size, BOOTMEM_DEFAULT);
6854+ initrd_start = ramdisk_here + PAGE_OFFSET;
6855+ initrd_end = initrd_start + ramdisk_size;
6856+
6857+ do_relocate_initrd = true;
6858+}
6859+
6860+#define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT)
6861+
6862+static void __init relocate_initrd(void)
6863+{
6864+ unsigned long ramdisk_image = boot_params.hdr.ramdisk_image;
6865+ unsigned long ramdisk_size = boot_params.hdr.ramdisk_size;
6866+ unsigned long end_of_lowmem = max_low_pfn << PAGE_SHIFT;
6867+ unsigned long ramdisk_here;
6868+ unsigned long slop, clen, mapaddr;
6869+ char *p, *q;
6870+
6871+ if (!do_relocate_initrd)
6872+ return;
6873+
6874+ ramdisk_here = initrd_start - PAGE_OFFSET;
6875+
6876+ q = (char *)initrd_start;
6877+
6878+ /* Copy any lowmem portion of the initrd */
6879+ if (ramdisk_image < end_of_lowmem) {
6880+ clen = end_of_lowmem - ramdisk_image;
6881+ p = (char *)__va(ramdisk_image);
6882+ memcpy(q, p, clen);
6883+ q += clen;
6884+ ramdisk_image += clen;
6885+ ramdisk_size -= clen;
6886+ }
6887+
6888+ /* Copy the highmem portion of the initrd */
6889+ while (ramdisk_size) {
6890+ slop = ramdisk_image & ~PAGE_MASK;
6891+ clen = ramdisk_size;
6892+ if (clen > MAX_MAP_CHUNK-slop)
6893+ clen = MAX_MAP_CHUNK-slop;
6894+ mapaddr = ramdisk_image & PAGE_MASK;
6895+ p = early_ioremap(mapaddr, clen+slop);
6896+ memcpy(q, p+slop, clen);
6897+ early_iounmap(p, clen+slop);
6898+ q += clen;
6899+ ramdisk_image += clen;
6900+ ramdisk_size -= clen;
6901+ }
6902+}
6903+
6904+#endif /* CONFIG_BLK_DEV_INITRD */
6905+
6906 void __init setup_bootmem_allocator(void)
6907 {
6908 unsigned long bootmap_size;
6909@@ -478,14 +648,15 @@ void __init setup_bootmem_allocator(void
6910 * bootmem allocator with an invalid RAM area.
6911 */
6912 reserve_bootmem(__pa_symbol(_text), (PFN_PHYS(min_low_pfn) +
6913- bootmap_size + PAGE_SIZE-1) - __pa_symbol(_text));
6914+ bootmap_size + PAGE_SIZE-1) - __pa_symbol(_text),
6915+ BOOTMEM_DEFAULT);
6916
6917 #ifndef CONFIG_XEN
6918 /*
6919 * reserve physical page 0 - it's a special BIOS page on many boxes,
6920 * enabling clean reboots, SMP operation, laptop functions.
6921 */
6922- reserve_bootmem(0, PAGE_SIZE);
6923+ reserve_bootmem(0, PAGE_SIZE, BOOTMEM_DEFAULT);
6924
6925 /* reserve EBDA region, it's a 4K region */
6926 reserve_ebda_region();
6927@@ -495,7 +666,7 @@ void __init setup_bootmem_allocator(void
6928 unless you have no PS/2 mouse plugged in. */
6929 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
6930 boot_cpu_data.x86 == 6)
6931- reserve_bootmem(0xa0000 - 4096, 4096);
6932+ reserve_bootmem(0xa0000 - 4096, 4096, BOOTMEM_DEFAULT);
6933
6934 #ifdef CONFIG_SMP
6935 /*
6936@@ -503,7 +674,7 @@ void __init setup_bootmem_allocator(void
6937 * FIXME: Don't need the extra page at 4K, but need to fix
6938 * trampoline before removing it. (see the GDT stuff)
6939 */
6940- reserve_bootmem(PAGE_SIZE, PAGE_SIZE);
6941+ reserve_bootmem(PAGE_SIZE, PAGE_SIZE, BOOTMEM_DEFAULT);
6942 #endif
6943 #ifdef CONFIG_ACPI_SLEEP
6944 /*
6945@@ -511,29 +682,12 @@ void __init setup_bootmem_allocator(void
6946 */
6947 acpi_reserve_bootmem();
6948 #endif
6949- numa_kva_reserve();
6950 #endif /* !CONFIG_XEN */
6951
6952 #ifdef CONFIG_BLK_DEV_INITRD
6953- if (xen_start_info->mod_start) {
6954- unsigned long ramdisk_image = __pa(xen_start_info->mod_start);
6955- unsigned long ramdisk_size = xen_start_info->mod_len;
6956- unsigned long ramdisk_end = ramdisk_image + ramdisk_size;
6957- unsigned long end_of_lowmem = max_low_pfn << PAGE_SHIFT;
6958-
6959- if (ramdisk_end <= end_of_lowmem) {
6960- /*reserve_bootmem(ramdisk_image, ramdisk_size);*/
6961- initrd_start = ramdisk_image + PAGE_OFFSET;
6962- initrd_end = initrd_start+ramdisk_size;
6963- initrd_below_start_ok = 1;
6964- } else {
6965- printk(KERN_ERR "initrd extends beyond end of memory "
6966- "(0x%08lx > 0x%08lx)\ndisabling initrd\n",
6967- ramdisk_end, end_of_lowmem);
6968- initrd_start = 0;
6969- }
6970- }
6971+ reserve_initrd();
6972 #endif
6973+ numa_kva_reserve();
6974 reserve_crashkernel();
6975 }
6976
6977@@ -600,20 +754,14 @@ void __init setup_arch(char **cmdline_p)
6978 memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
6979 pre_setup_arch_hook();
6980 early_cpu_init();
6981+ early_ioremap_init();
6982 #ifdef CONFIG_SMP
6983 prefill_possible_map();
6984 #endif
6985
6986- /*
6987- * FIXME: This isn't an official loader_type right
6988- * now but does currently work with elilo.
6989- * If we were configured as an EFI kernel, check to make
6990- * sure that we were loaded correctly from elilo and that
6991- * the system table is valid. If not, then initialize normally.
6992- */
6993 #ifdef CONFIG_EFI
6994- if ((boot_params.hdr.type_of_loader == 0x50) &&
6995- boot_params.efi_info.efi_systab)
6996+ if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
6997+ "EL32", 4))
6998 efi_enabled = 1;
6999 #endif
7000
7001@@ -653,12 +801,9 @@ void __init setup_arch(char **cmdline_p)
7002 #endif
7003
7004 ARCH_SETUP
7005- if (efi_enabled)
7006- efi_init();
7007- else {
7008- printk(KERN_INFO "BIOS-provided physical RAM map:\n");
7009- print_memory_map(memory_setup());
7010- }
7011+
7012+ printk(KERN_INFO "BIOS-provided physical RAM map:\n");
7013+ print_memory_map(memory_setup());
7014
7015 copy_edd();
7016
7017@@ -691,6 +836,17 @@ void __init setup_arch(char **cmdline_p)
7018 strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
7019 *cmdline_p = command_line;
7020
7021+ if (efi_enabled)
7022+ efi_init();
7023+
7024+ /* update e820 for memory not covered by WB MTRRs */
7025+ find_max_pfn();
7026+ mtrr_bp_init();
7027+#ifndef CONFIG_XEN
7028+ if (mtrr_trim_uncached_memory(max_pfn))
7029+ find_max_pfn();
7030+#endif
7031+
7032 max_low_pfn = setup_memory();
7033
7034 #ifdef CONFIG_VMI
7035@@ -715,6 +871,16 @@ void __init setup_arch(char **cmdline_p)
7036 smp_alloc_memory(); /* AP processor realmode stacks in low memory*/
7037 #endif
7038 paging_init();
7039+
7040+ /*
7041+ * NOTE: On x86-32, only from this point on, fixmaps are ready for use.
7042+ */
7043+
7044+#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
7045+ if (init_ohci1394_dma_early)
7046+ init_ohci1394_dma_on_all_controllers();
7047+#endif
7048+
7049 remapped_pgdat_init();
7050 sparse_init();
7051 zone_sizes_init();
7052@@ -800,16 +966,20 @@ void __init setup_arch(char **cmdline_p)
7053 * NOTE: at this point the bootmem allocator is fully available.
7054 */
7055
7056+#ifdef CONFIG_BLK_DEV_INITRD
7057+ relocate_initrd();
7058+#endif
7059+
7060 paravirt_post_allocator_init();
7061
7062 if (is_initial_xendomain())
7063 dmi_scan_machine();
7064
7065+ io_delay_init();
7066+
7067 #ifdef CONFIG_X86_GENERICARCH
7068 generic_apic_probe();
7069-#endif
7070- if (efi_enabled)
7071- efi_map_memmap();
7072+#endif
7073
7074 set_iopl.iopl = 1;
7075 WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl));
7076@@ -827,7 +997,7 @@ void __init setup_arch(char **cmdline_p)
7077 acpi_boot_table_init();
7078 #endif
7079
7080-#if defined(CONFIG_PCI) && !defined(CONFIG_XEN)
7081+#ifndef CONFIG_XEN
7082 early_quirks();
7083 #endif
7084
7085@@ -873,3 +1043,30 @@ xen_panic_event(struct notifier_block *t
7086 /* we're never actually going to get here... */
7087 return NOTIFY_DONE;
7088 }
7089+
7090+/*
7091+ * Request address space for all standard resources
7092+ *
7093+ * This is called just before pcibios_init(), which is also a
7094+ * subsys_initcall, but is linked in later (in arch/i386/pci/common.c).
7095+ */
7096+static int __init request_standard_resources(void)
7097+{
7098+ int i;
7099+
7100+ /* Nothing to do if not running in dom0. */
7101+ if (!is_initial_xendomain())
7102+ return 0;
7103+
7104+ printk(KERN_INFO "Setting up standard PCI resources\n");
7105+ init_iomem_resources(&code_resource, &data_resource, &bss_resource);
7106+
7107+ request_resource(&iomem_resource, &video_ram_resource);
7108+
7109+ /* request I/O space for devices used on all i[345]86 PCs */
7110+ for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
7111+ request_resource(&ioport_resource, &standard_io_resources[i]);
7112+ return 0;
7113+}
7114+
7115+subsys_initcall(request_standard_resources);
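
relocate_initrd() in the setup_32 hunks above moves the highmem part of the ramdisk through a small early_ioremap() window: each mapping starts on a page boundary, so a pass may copy at most MAX_MAP_CHUNK minus the sub-page slop of the current source address. A user-space sketch of the same chunking arithmetic, with memcpy() standing in for the map/copy/unmap sequence and a made-up 16-page window:

#include <stdio.h>
#include <string.h>
#include <stdint.h>

#define PAGE_MASK	(~(uintptr_t)0xfff)
#define MAX_MAP_CHUNK	(16 * 4096)	/* made-up mapping-window size (16 pages) */

/* Copy 'size' bytes in window-sized passes, the way relocate_initrd() walks
 * the highmem half of the ramdisk; memcpy() stands in for the kernel's
 * early_ioremap()/memcpy()/early_iounmap() sequence. */
static void chunked_copy(char *dst, const char *src, size_t size)
{
	while (size) {
		size_t slop = (uintptr_t)src & ~PAGE_MASK;	/* offset into the current page */
		size_t clen = size;

		if (clen > MAX_MAP_CHUNK - slop)
			clen = MAX_MAP_CHUNK - slop;		/* window starts page-aligned */
		memcpy(dst, src, clen);
		dst += clen;
		src += clen;
		size -= clen;
	}
}

int main(void)
{
	static char src[200000], dst[200000];
	size_t i;

	for (i = 0; i < sizeof(src); i++)
		src[i] = (char)i;
	chunked_copy(dst, src + 5, sizeof(src) - 5);	/* deliberately unaligned source */
	printf("copy matches: %d\n", memcmp(dst, src + 5, sizeof(src) - 5) == 0);
	return 0;
}
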
7116--- sle11-2009-06-29.orig/arch/x86/kernel/setup_64-xen.c 2009-02-16 16:18:36.000000000 +0100
7117+++ sle11-2009-06-29/arch/x86/kernel/setup_64-xen.c 2009-03-16 16:33:40.000000000 +0100
7118@@ -15,7 +15,6 @@
7119 #include <linux/ptrace.h>
7120 #include <linux/slab.h>
7121 #include <linux/user.h>
7122-#include <linux/a.out.h>
7123 #include <linux/screen_info.h>
7124 #include <linux/ioport.h>
7125 #include <linux/delay.h>
7126@@ -30,6 +29,7 @@
7127 #include <linux/crash_dump.h>
7128 #include <linux/root_dev.h>
7129 #include <linux/pci.h>
7130+#include <linux/efi.h>
7131 #include <linux/acpi.h>
7132 #include <linux/kallsyms.h>
7133 #include <linux/edd.h>
7134@@ -39,10 +39,13 @@
7135 #include <linux/dmi.h>
7136 #include <linux/dma-mapping.h>
7137 #include <linux/ctype.h>
7138+#include <linux/uaccess.h>
7139+#include <linux/init_ohci1394_dma.h>
7140
7141 #include <asm/mtrr.h>
7142 #include <asm/uaccess.h>
7143 #include <asm/system.h>
7144+#include <asm/vsyscall.h>
7145 #include <asm/io.h>
7146 #include <asm/smp.h>
7147 #include <asm/msr.h>
7148@@ -50,6 +53,7 @@
7149 #include <video/edid.h>
7150 #include <asm/e820.h>
7151 #include <asm/dma.h>
7152+#include <asm/gart.h>
7153 #include <asm/mpspec.h>
7154 #include <asm/mmu_context.h>
7155 #include <asm/proto.h>
7156@@ -59,6 +63,9 @@
7157 #include <asm/sections.h>
7158 #include <asm/dmi.h>
7159 #include <asm/cacheflush.h>
7160+#include <asm/mce.h>
7161+#include <asm/ds.h>
7162+#include <asm/topology.h>
7163 #ifdef CONFIG_XEN
7164 #include <linux/percpu.h>
7165 #include <xen/interface/physdev.h>
7166@@ -108,6 +115,8 @@ EXPORT_SYMBOL(xen_start_info);
7167 struct cpuinfo_x86 boot_cpu_data __read_mostly;
7168 EXPORT_SYMBOL(boot_cpu_data);
7169
7170+__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata;
7171+
7172 unsigned long mmu_cr4_features;
7173
7174 /* Boot loader ID as an integer, for the benefit of proc_dointvec */
7175@@ -117,7 +126,7 @@ unsigned long saved_video_mode;
7176
7177 int force_mwait __cpuinitdata;
7178
7179-/*
7180+/*
7181 * Early DMI memory
7182 */
7183 int dmi_alloc_index;
7184@@ -163,25 +172,27 @@ struct resource standard_io_resources[]
7185
7186 #define IORESOURCE_RAM (IORESOURCE_BUSY | IORESOURCE_MEM)
7187
7188-struct resource data_resource = {
7189+static struct resource data_resource = {
7190 .name = "Kernel data",
7191 .start = 0,
7192 .end = 0,
7193 .flags = IORESOURCE_RAM,
7194 };
7195-struct resource code_resource = {
7196+static struct resource code_resource = {
7197 .name = "Kernel code",
7198 .start = 0,
7199 .end = 0,
7200 .flags = IORESOURCE_RAM,
7201 };
7202-struct resource bss_resource = {
7203+static struct resource bss_resource = {
7204 .name = "Kernel bss",
7205 .start = 0,
7206 .end = 0,
7207 .flags = IORESOURCE_RAM,
7208 };
7209
7210+static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c);
7211+
7212 #ifdef CONFIG_PROC_VMCORE
7213 /* elfcorehdr= specifies the location of elf core header
7214 * stored by the crashed kernel. This option will be passed
7215@@ -205,9 +216,10 @@ contig_initmem_init(unsigned long start_
7216 unsigned long bootmap_size, bootmap;
7217
7218 bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT;
7219- bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size);
7220+ bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size,
7221+ PAGE_SIZE);
7222 if (bootmap == -1L)
7223- panic("Cannot find bootmem map of size %ld\n",bootmap_size);
7224+ panic("Cannot find bootmem map of size %ld\n", bootmap_size);
7225 bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, end_pfn);
7226 e820_register_active_regions(0, start_pfn, end_pfn);
7227 #ifdef CONFIG_XEN
7228@@ -215,8 +227,8 @@ contig_initmem_init(unsigned long start_
7229 #else
7230 free_bootmem_with_active_regions(0, end_pfn);
7231 #endif
7232- reserve_bootmem(bootmap, bootmap_size);
7233-}
7234+ reserve_bootmem(bootmap, bootmap_size, BOOTMEM_DEFAULT);
7235+}
7236 #endif
7237
7238 #if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
7239@@ -249,27 +261,35 @@ static inline void copy_edd(void)
7240 #ifndef CONFIG_XEN
7241 static void __init reserve_crashkernel(void)
7242 {
7243- unsigned long long free_mem;
7244+ unsigned long long total_mem;
7245 unsigned long long crash_size, crash_base;
7246 int ret;
7247
7248- free_mem = ((unsigned long long)max_low_pfn - min_low_pfn) << PAGE_SHIFT;
7249+ total_mem = ((unsigned long long)max_low_pfn - min_low_pfn) << PAGE_SHIFT;
7250
7251- ret = parse_crashkernel(boot_command_line, free_mem,
7252+ ret = parse_crashkernel(boot_command_line, total_mem,
7253 &crash_size, &crash_base);
7254 if (ret == 0 && crash_size) {
7255- if (crash_base > 0) {
7256- printk(KERN_INFO "Reserving %ldMB of memory at %ldMB "
7257- "for crashkernel (System RAM: %ldMB)\n",
7258- (unsigned long)(crash_size >> 20),
7259- (unsigned long)(crash_base >> 20),
7260- (unsigned long)(free_mem >> 20));
7261- crashk_res.start = crash_base;
7262- crashk_res.end = crash_base + crash_size - 1;
7263- reserve_bootmem(crash_base, crash_size);
7264- } else
7265+ if (crash_base <= 0) {
7266 printk(KERN_INFO "crashkernel reservation failed - "
7267 "you have to specify a base address\n");
7268+ return;
7269+ }
7270+
7271+ if (reserve_bootmem(crash_base, crash_size,
7272+ BOOTMEM_EXCLUSIVE) < 0) {
7273+ printk(KERN_INFO "crashkernel reservation failed - "
7274+ "memory is in use\n");
7275+ return;
7276+ }
7277+
7278+ printk(KERN_INFO "Reserving %ldMB of memory at %ldMB "
7279+ "for crashkernel (System RAM: %ldMB)\n",
7280+ (unsigned long)(crash_size >> 20),
7281+ (unsigned long)(crash_base >> 20),
7282+ (unsigned long)(total_mem >> 20));
7283+ crashk_res.start = crash_base;
7284+ crashk_res.end = crash_base + crash_size - 1;
7285 }
7286 }
7287 #else
7288@@ -280,37 +300,21 @@ static inline void __init reserve_crashk
7289 {}
7290 #endif
7291
7292-#ifndef CONFIG_XEN
7293-#define EBDA_ADDR_POINTER 0x40E
7294-
7295-unsigned __initdata ebda_addr;
7296-unsigned __initdata ebda_size;
7297-
7298-static void discover_ebda(void)
7299+/* Overridden in paravirt.c if CONFIG_PARAVIRT */
7300+void __attribute__((weak)) __init memory_setup(void)
7301 {
7302- /*
7303- * there is a real-mode segmented pointer pointing to the
7304- * 4K EBDA area at 0x40E
7305- */
7306- ebda_addr = *(unsigned short *)__va(EBDA_ADDR_POINTER);
7307- ebda_addr <<= 4;
7308-
7309- ebda_size = *(unsigned short *)__va(ebda_addr);
7310-
7311- /* Round EBDA up to pages */
7312- if (ebda_size == 0)
7313- ebda_size = 1;
7314- ebda_size <<= 10;
7315- ebda_size = round_up(ebda_size + (ebda_addr & ~PAGE_MASK), PAGE_SIZE);
7316- if (ebda_size > 64*1024)
7317- ebda_size = 64*1024;
7318+ machine_specific_memory_setup();
7319 }
7320-#else
7321-#define discover_ebda() ((void)0)
7322-#endif
7323
7324+/*
7325+ * setup_arch - architecture-specific boot-time initializations
7326+ *
7327+ * Note: On x86_64, fixmaps are ready for use even before this is called.
7328+ */
7329 void __init setup_arch(char **cmdline_p)
7330 {
7331+ unsigned i;
7332+
7333 #ifdef CONFIG_XEN
7334 extern struct e820map machine_e820;
7335
7336@@ -319,6 +323,11 @@ void __init setup_arch(char **cmdline_p)
7337 /* Register a call for panic conditions. */
7338 atomic_notifier_chain_register(&panic_notifier_list, &xen_panic_block);
7339
7340+ WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable,
7341+ VMASST_TYPE_writable_pagetables));
7342+
7343+ early_ioremap_init();
7344+
7345 ROOT_DEV = MKDEV(RAMDISK_MAJOR,0);
7346 screen_info = boot_params.screen_info;
7347
7348@@ -335,11 +344,6 @@ void __init setup_arch(char **cmdline_p)
7349 screen_info.orig_video_isVGA = 0;
7350
7351 copy_edid();
7352-
7353- WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable,
7354- VMASST_TYPE_writable_pagetables));
7355-
7356- ARCH_SETUP
7357 #else
7358 printk(KERN_INFO "Command line: %s\n", boot_command_line);
7359
7360@@ -355,7 +359,15 @@ void __init setup_arch(char **cmdline_p)
7361 rd_prompt = ((boot_params.hdr.ram_size & RAMDISK_PROMPT_FLAG) != 0);
7362 rd_doload = ((boot_params.hdr.ram_size & RAMDISK_LOAD_FLAG) != 0);
7363 #endif
7364- setup_memory_region();
7365+#ifdef CONFIG_EFI
7366+ if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
7367+ "EL64", 4))
7368+ efi_enabled = 1;
7369+#endif
7370+
7371+ ARCH_SETUP
7372+
7373+ memory_setup();
7374 copy_edd();
7375
7376 if (!boot_params.hdr.root_flags)
7377@@ -379,28 +391,51 @@ void __init setup_arch(char **cmdline_p)
7378
7379 parse_early_param();
7380
7381+#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
7382+ if (init_ohci1394_dma_early)
7383+ init_ohci1394_dma_on_all_controllers();
7384+#endif
7385+
7386 finish_e820_parsing();
7387
7388+ early_gart_iommu_check();
7389+
7390 e820_register_active_regions(0, 0, -1UL);
7391 /*
7392 * partially used pages are not usable - thus
7393 * we are rounding upwards:
7394 */
7395 end_pfn = e820_end_of_ram();
7396+ /* update e820 for memory not covered by WB MTRRs */
7397+ mtrr_bp_init();
7398+#ifndef CONFIG_XEN
7399+ if (mtrr_trim_uncached_memory(end_pfn)) {
7400+ e820_register_active_regions(0, 0, -1UL);
7401+ end_pfn = e820_end_of_ram();
7402+ }
7403+#endif
7404+
7405 num_physpages = end_pfn;
7406+ max_mapnr = end_pfn;
7407
7408 check_efer();
7409
7410- discover_ebda();
7411-
7412 init_memory_mapping(0, (end_pfn_map << PAGE_SHIFT));
7413+ if (efi_enabled)
7414+ efi_init();
7415
7416 if (is_initial_xendomain())
7417 dmi_scan_machine();
7418
7419+ io_delay_init();
7420+
7421 #if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
7422- /* setup to use the static apicid table during kernel startup */
7423- x86_cpu_to_apicid_ptr = (void *)&x86_cpu_to_apicid_init;
7424+ /* setup to use the early static init tables during kernel startup */
7425+ x86_cpu_to_apicid_early_ptr = (void *)x86_cpu_to_apicid_init;
7426+ x86_bios_cpu_apicid_early_ptr = (void *)x86_bios_cpu_apicid_init;
7427+#ifdef CONFIG_NUMA
7428+ x86_cpu_to_node_map_early_ptr = (void *)x86_cpu_to_node_map_init;
7429+#endif
7430 #endif
7431
7432 /* How many end-of-memory variables you have, grandma! */
7433@@ -419,54 +454,25 @@ void __init setup_arch(char **cmdline_p)
7434 #endif
7435
7436 #ifdef CONFIG_NUMA
7437- numa_initmem_init(0, end_pfn);
7438+ numa_initmem_init(0, end_pfn);
7439 #else
7440 contig_initmem_init(0, end_pfn);
7441 #endif
7442
7443-#ifdef CONFIG_XEN
7444- /*
7445- * Reserve kernel, physmap, start info, initial page tables, and
7446- * direct mapping.
7447- */
7448- reserve_bootmem_generic(__pa_symbol(&_text),
7449- (table_end << PAGE_SHIFT) - __pa_symbol(&_text));
7450-#else
7451- /* Reserve direct mapping */
7452- reserve_bootmem_generic(table_start << PAGE_SHIFT,
7453- (table_end - table_start) << PAGE_SHIFT);
7454-
7455- /* reserve kernel */
7456- reserve_bootmem_generic(__pa_symbol(&_text),
7457- __pa_symbol(&_end) - __pa_symbol(&_text));
7458+ early_res_to_bootmem();
7459
7460+#ifndef CONFIG_XEN
7461+#ifdef CONFIG_ACPI_SLEEP
7462 /*
7463- * reserve physical page 0 - it's a special BIOS page on many boxes,
7464- * enabling clean reboots, SMP operation, laptop functions.
7465+ * Reserve low memory region for sleep support.
7466 */
7467- reserve_bootmem_generic(0, PAGE_SIZE);
7468-
7469- /* reserve ebda region */
7470- if (ebda_addr)
7471- reserve_bootmem_generic(ebda_addr, ebda_size);
7472-#ifdef CONFIG_NUMA
7473- /* reserve nodemap region */
7474- if (nodemap_addr)
7475- reserve_bootmem_generic(nodemap_addr, nodemap_size);
7476+ acpi_reserve_bootmem();
7477 #endif
7478
7479-#ifdef CONFIG_SMP
7480- /* Reserve SMP trampoline */
7481- reserve_bootmem_generic(SMP_TRAMPOLINE_BASE, 2*PAGE_SIZE);
7482-#endif
7483+ if (efi_enabled)
7484+ efi_reserve_bootmem();
7485 #endif
7486
7487-#ifdef CONFIG_ACPI_SLEEP
7488- /*
7489- * Reserve low memory region for sleep support.
7490- */
7491- acpi_reserve_bootmem();
7492-#endif
7493 #ifdef CONFIG_BLK_DEV_INITRD
7494 #ifdef CONFIG_XEN
7495 if (xen_start_info->mod_start) {
7496@@ -490,6 +496,8 @@ void __init setup_arch(char **cmdline_p)
7497 initrd_below_start_ok = 1;
7498 #endif
7499 } else {
7500+ /* Assumes everything on node 0 */
7501+ free_bootmem(ramdisk_image, ramdisk_size);
7502 printk(KERN_ERR "initrd extends beyond end of memory "
7503 "(0x%08lx > 0x%08lx)\ndisabling initrd\n",
7504 ramdisk_end, end_of_mem);
7505@@ -499,10 +507,11 @@ void __init setup_arch(char **cmdline_p)
7506 #endif
7507 reserve_crashkernel();
7508 paging_init();
7509+ map_vsyscall();
7510 #ifdef CONFIG_X86_LOCAL_APIC
7511 /*
7512- * Find and reserve possible boot-time SMP configuration:
7513- */
7514+ * Find and reserve possible boot-time SMP configuration:
7515+ */
7516 find_smp_config();
7517 #endif
7518 #ifdef CONFIG_XEN
7519@@ -590,16 +599,10 @@ void __init setup_arch(char **cmdline_p)
7520 #endif
7521 #endif
7522
7523-#if defined(CONFIG_PCI) && !defined(CONFIG_XEN)
7524+#ifndef CONFIG_XEN
7525 early_quirks();
7526 #endif
7527
7528- /*
7529- * set this early, so we dont allocate cpu0
7530- * if MADT list doesnt list BSP first
7531- * mpparse.c/MP_processor_info() allocates logical cpu numbers.
7532- */
7533- cpu_set(0, cpu_present_map);
7534 #ifdef CONFIG_ACPI
7535 /*
7536 * Initialize the ACPI boot-time table parser (gets the RSDP and SDT).
7537@@ -623,6 +626,7 @@ void __init setup_arch(char **cmdline_p)
7538 get_smp_config();
7539 #ifndef CONFIG_XEN
7540 init_apic_mappings();
7541+ ioapic_init_mappings();
7542 #endif
7543 #endif
7544 #if defined(CONFIG_XEN) && defined(CONFIG_SMP) && !defined(CONFIG_HOTPLUG_CPU)
7545@@ -634,18 +638,17 @@ void __init setup_arch(char **cmdline_p)
7546 */
7547 #ifdef CONFIG_XEN
7548 if (is_initial_xendomain())
7549- e820_reserve_resources(machine_e820.map, machine_e820.nr_map);
7550+ e820_reserve_resources(machine_e820.map, machine_e820.nr_map,
7551+ &code_resource, &data_resource, &bss_resource);
7552 #else
7553- e820_reserve_resources(e820.map, e820.nr_map);
7554+ e820_reserve_resources(e820.map, e820.nr_map,
7555+ &code_resource, &data_resource, &bss_resource);
7556 e820_mark_nosave_regions();
7557 #endif
7558
7559- {
7560- unsigned i;
7561 /* request I/O space for devices used on all i[345]86 PCs */
7562 for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
7563 request_resource(&ioport_resource, &standard_io_resources[i]);
7564- }
7565
7566 #ifdef CONFIG_XEN
7567 if (is_initial_xendomain())
7568@@ -679,7 +682,8 @@ void __init setup_arch(char **cmdline_p)
7569
7570 #ifdef CONFIG_VT
7571 #if defined(CONFIG_VGA_CONSOLE)
7572- conswitchp = &vga_con;
7573+ if (!efi_enabled || (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY))
7574+ conswitchp = &vga_con;
7575 #elif defined(CONFIG_DUMMY_CONSOLE)
7576 conswitchp = &dummy_con;
7577 #endif
7578@@ -723,9 +727,10 @@ static void __cpuinit display_cacheinfo(
7579
7580 if (n >= 0x80000005) {
7581 cpuid(0x80000005, &dummy, &ebx, &ecx, &edx);
7582- printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n",
7583- edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
7584- c->x86_cache_size=(ecx>>24)+(edx>>24);
7585+ printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), "
7586+ "D cache %dK (%d bytes/line)\n",
7587+ edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
7588+ c->x86_cache_size = (ecx>>24) + (edx>>24);
7589 /* On K8 L1 TLB is inclusive, so don't count it */
7590 c->x86_tlbsize = 0;
7591 }
7592@@ -739,27 +744,25 @@ static void __cpuinit display_cacheinfo(
7593 printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n",
7594 c->x86_cache_size, ecx & 0xFF);
7595 }
7596-
7597- if (n >= 0x80000007)
7598- cpuid(0x80000007, &dummy, &dummy, &dummy, &c->x86_power);
7599 if (n >= 0x80000008) {
7600- cpuid(0x80000008, &eax, &dummy, &dummy, &dummy);
7601+ cpuid(0x80000008, &eax, &dummy, &dummy, &dummy);
7602 c->x86_virt_bits = (eax >> 8) & 0xff;
7603 c->x86_phys_bits = eax & 0xff;
7604 }
7605 }
7606
7607 #ifdef CONFIG_NUMA
7608-static int nearby_node(int apicid)
7609+static int __cpuinit nearby_node(int apicid)
7610 {
7611- int i;
7612+ int i, node;
7613+
7614 for (i = apicid - 1; i >= 0; i--) {
7615- int node = apicid_to_node[i];
7616+ node = apicid_to_node[i];
7617 if (node != NUMA_NO_NODE && node_online(node))
7618 return node;
7619 }
7620 for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) {
7621- int node = apicid_to_node[i];
7622+ node = apicid_to_node[i];
7623 if (node != NUMA_NO_NODE && node_online(node))
7624 return node;
7625 }
7626@@ -771,7 +774,7 @@ static int nearby_node(int apicid)
7627 * On a AMD dual core setup the lower bits of the APIC id distingush the cores.
7628 * Assumes number of cores is a power of two.
7629 */
7630-static void __init amd_detect_cmp(struct cpuinfo_x86 *c)
7631+static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c)
7632 {
7633 #ifdef CONFIG_SMP
7634 unsigned bits;
7635@@ -780,7 +783,54 @@ static void __init amd_detect_cmp(struct
7636 int node = 0;
7637 unsigned apicid = hard_smp_processor_id();
7638 #endif
7639- unsigned ecx = cpuid_ecx(0x80000008);
7640+ bits = c->x86_coreid_bits;
7641+
7642+ /* Low order bits define the core id (index of core in socket) */
7643+ c->cpu_core_id = c->phys_proc_id & ((1 << bits)-1);
7644+ /* Convert the APIC ID into the socket ID */
7645+ c->phys_proc_id = phys_pkg_id(bits);
7646+
7647+#ifdef CONFIG_NUMA
7648+ node = c->phys_proc_id;
7649+ if (apicid_to_node[apicid] != NUMA_NO_NODE)
7650+ node = apicid_to_node[apicid];
7651+ if (!node_online(node)) {
7652+ /* Two possibilities here:
7653+ - The CPU is missing memory and no node was created.
7654+ In that case try picking one from a nearby CPU
7655+ - The APIC IDs differ from the HyperTransport node IDs
7656+ which the K8 northbridge parsing fills in.
7657+ Assume they are all increased by a constant offset,
7658+ but in the same order as the HT nodeids.
7659+ If that doesn't result in a usable node fall back to the
7660+ path for the previous case. */
7661+
7662+ int ht_nodeid = apicid - (cpu_data(0).phys_proc_id << bits);
7663+
7664+ if (ht_nodeid >= 0 &&
7665+ apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
7666+ node = apicid_to_node[ht_nodeid];
7667+ /* Pick a nearby node */
7668+ if (!node_online(node))
7669+ node = nearby_node(apicid);
7670+ }
7671+ numa_set_node(cpu, node);
7672+
7673+ printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
7674+#endif
7675+#endif
7676+}
7677+
7678+static void __cpuinit early_init_amd_mc(struct cpuinfo_x86 *c)
7679+{
7680+#ifdef CONFIG_SMP
7681+ unsigned bits, ecx;
7682+
7683+ /* Multi core CPU? */
7684+ if (c->extended_cpuid_level < 0x80000008)
7685+ return;
7686+
7687+ ecx = cpuid_ecx(0x80000008);
7688
7689 c->x86_max_cores = (ecx & 0xff) + 1;
7690
7691@@ -793,37 +843,8 @@ static void __init amd_detect_cmp(struct
7692 bits++;
7693 }
7694
7695- /* Low order bits define the core id (index of core in socket) */
7696- c->cpu_core_id = c->phys_proc_id & ((1 << bits)-1);
7697- /* Convert the APIC ID into the socket ID */
7698- c->phys_proc_id = phys_pkg_id(bits);
7699-
7700-#ifdef CONFIG_NUMA
7701- node = c->phys_proc_id;
7702- if (apicid_to_node[apicid] != NUMA_NO_NODE)
7703- node = apicid_to_node[apicid];
7704- if (!node_online(node)) {
7705- /* Two possibilities here:
7706- - The CPU is missing memory and no node was created.
7707- In that case try picking one from a nearby CPU
7708- - The APIC IDs differ from the HyperTransport node IDs
7709- which the K8 northbridge parsing fills in.
7710- Assume they are all increased by a constant offset,
7711- but in the same order as the HT nodeids.
7712- If that doesn't result in a usable node fall back to the
7713- path for the previous case. */
7714- int ht_nodeid = apicid - (cpu_data(0).phys_proc_id << bits);
7715- if (ht_nodeid >= 0 &&
7716- apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
7717- node = apicid_to_node[ht_nodeid];
7718- /* Pick a nearby node */
7719- if (!node_online(node))
7720- node = nearby_node(apicid);
7721- }
7722- numa_set_node(cpu, node);
7723+ c->x86_coreid_bits = bits;
7724
7725- printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
7726-#endif
7727 #endif
7728 }
7729
7730@@ -840,8 +861,8 @@ static void __init amd_detect_cmp(struct
7731 /* AMD systems with C1E don't have a working lAPIC timer. Check for that. */
7732 static __cpuinit int amd_apic_timer_broken(void)
7733 {
7734- u32 lo, hi;
7735- u32 eax = cpuid_eax(CPUID_PROCESSOR_SIGNATURE);
7736+ u32 lo, hi, eax = cpuid_eax(CPUID_PROCESSOR_SIGNATURE);
7737+
7738 switch (eax & CPUID_XFAM) {
7739 case CPUID_XFAM_K8:
7740 if ((eax & CPUID_XMOD) < CPUID_XMOD_REV_F)
7741@@ -860,6 +881,15 @@ static __cpuinit int amd_apic_timer_brok
7742 }
7743 #endif
7744
7745+static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
7746+{
7747+ early_init_amd_mc(c);
7748+
7749+ /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */
7750+ if (c->x86_power & (1<<8))
7751+ set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
7752+}
7753+
7754 static void __cpuinit init_amd(struct cpuinfo_x86 *c)
7755 {
7756 unsigned level;
7757@@ -870,7 +900,7 @@ static void __cpuinit init_amd(struct cp
7758 /*
7759 * Disable TLB flush filter by setting HWCR.FFDIS on K8
7760 * bit 6 of msr C001_0015
7761- *
7762+ *
7763 * Errata 63 for SH-B3 steppings
7764 * Errata 122 for all steppings (F+ have it disabled by default)
7765 */
7766@@ -883,35 +913,32 @@ static void __cpuinit init_amd(struct cp
7767
7768 /* Bit 31 in normal CPUID used for nonstandard 3DNow ID;
7769 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway */
7770- clear_bit(0*32+31, &c->x86_capability);
7771-
7772+ clear_bit(0*32+31, (unsigned long *)&c->x86_capability);
7773+
7774 /* On C+ stepping K8 rep microcode works well for copy/memset */
7775 level = cpuid_eax(1);
7776- if (c->x86 == 15 && ((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58))
7777- set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability);
7778+ if (c->x86 == 15 && ((level >= 0x0f48 && level < 0x0f50) ||
7779+ level >= 0x0f58))
7780+ set_cpu_cap(c, X86_FEATURE_REP_GOOD);
7781 if (c->x86 == 0x10 || c->x86 == 0x11)
7782- set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability);
7783+ set_cpu_cap(c, X86_FEATURE_REP_GOOD);
7784
7785 /* Enable workaround for FXSAVE leak */
7786 if (c->x86 >= 6)
7787- set_bit(X86_FEATURE_FXSAVE_LEAK, &c->x86_capability);
7788+ set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK);
7789
7790 level = get_model_name(c);
7791 if (!level) {
7792- switch (c->x86) {
7793+ switch (c->x86) {
7794 case 15:
7795 /* Should distinguish Models here, but this is only
7796 a fallback anyways. */
7797 strcpy(c->x86_model_id, "Hammer");
7798- break;
7799- }
7800- }
7801+ break;
7802+ }
7803+ }
7804 display_cacheinfo(c);
7805
7806- /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */
7807- if (c->x86_power & (1<<8))
7808- set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability);
7809-
7810 /* Multi core CPU? */
7811 if (c->extended_cpuid_level >= 0x80000008)
7812 amd_detect_cmp(c);
7813@@ -923,14 +950,10 @@ static void __cpuinit init_amd(struct cp
7814 num_cache_leaves = 3;
7815
7816 if (c->x86 == 0xf || c->x86 == 0x10 || c->x86 == 0x11)
7817- set_bit(X86_FEATURE_K8, &c->x86_capability);
7818-
7819- /* RDTSC can be speculated around */
7820- clear_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
7821+ set_cpu_cap(c, X86_FEATURE_K8);
7822
7823- /* Family 10 doesn't support C states in MWAIT so don't use it */
7824- if (c->x86 == 0x10 && !force_mwait)
7825- clear_bit(X86_FEATURE_MWAIT, &c->x86_capability);
7826+ /* MFENCE stops RDTSC speculation */
7827+ set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC);
7828
7829 #ifndef CONFIG_XEN
7830 if (amd_apic_timer_broken())
7831@@ -938,28 +961,29 @@ static void __cpuinit init_amd(struct cp
7832 #endif
7833 }
7834
7835-static void __cpuinit detect_ht(struct cpuinfo_x86 *c)
7836+void __cpuinit detect_ht(struct cpuinfo_x86 *c)
7837 {
7838 #ifdef CONFIG_SMP
7839- u32 eax, ebx, ecx, edx;
7840- int index_msb, core_bits;
7841+ u32 eax, ebx, ecx, edx;
7842+ int index_msb, core_bits;
7843
7844 cpuid(1, &eax, &ebx, &ecx, &edx);
7845
7846
7847 if (!cpu_has(c, X86_FEATURE_HT))
7848 return;
7849- if (cpu_has(c, X86_FEATURE_CMP_LEGACY))
7850+ if (cpu_has(c, X86_FEATURE_CMP_LEGACY))
7851 goto out;
7852
7853 smp_num_siblings = (ebx & 0xff0000) >> 16;
7854
7855 if (smp_num_siblings == 1) {
7856 printk(KERN_INFO "CPU: Hyper-Threading is disabled\n");
7857- } else if (smp_num_siblings > 1 ) {
7858+ } else if (smp_num_siblings > 1) {
7859
7860 if (smp_num_siblings > NR_CPUS) {
7861- printk(KERN_WARNING "CPU: Unsupported number of the siblings %d", smp_num_siblings);
7862+ printk(KERN_WARNING "CPU: Unsupported number of "
7863+ "siblings %d", smp_num_siblings);
7864 smp_num_siblings = 1;
7865 return;
7866 }
7867@@ -969,7 +993,7 @@ static void __cpuinit detect_ht(struct c
7868
7869 smp_num_siblings = smp_num_siblings / c->x86_max_cores;
7870
7871- index_msb = get_count_order(smp_num_siblings) ;
7872+ index_msb = get_count_order(smp_num_siblings);
7873
7874 core_bits = get_count_order(c->x86_max_cores);
7875
7876@@ -978,8 +1002,10 @@ static void __cpuinit detect_ht(struct c
7877 }
7878 out:
7879 if ((c->x86_max_cores * smp_num_siblings) > 1) {
7880- printk(KERN_INFO "CPU: Physical Processor ID: %d\n", c->phys_proc_id);
7881- printk(KERN_INFO "CPU: Processor Core ID: %d\n", c->cpu_core_id);
7882+ printk(KERN_INFO "CPU: Physical Processor ID: %d\n",
7883+ c->phys_proc_id);
7884+ printk(KERN_INFO "CPU: Processor Core ID: %d\n",
7885+ c->cpu_core_id);
7886 }
7887
7888 #endif
7889@@ -1003,7 +1029,7 @@ static int __cpuinit intel_num_cpu_cores
7890 return 1;
7891 }
7892
7893-static void srat_detect_node(void)
7894+static void __cpuinit srat_detect_node(void)
7895 {
7896 #ifdef CONFIG_NUMA
7897 unsigned node;
7898@@ -1013,7 +1039,7 @@ static void srat_detect_node(void)
7899 /* Don't do the funky fallback heuristics the AMD version employs
7900 for now. */
7901 node = apicid_to_node[apicid];
7902- if (node == NUMA_NO_NODE)
7903+ if (node == NUMA_NO_NODE || !node_online(node))
7904 node = first_node(node_online_map);
7905 numa_set_node(cpu, node);
7906
7907@@ -1021,28 +1047,39 @@ static void srat_detect_node(void)
7908 #endif
7909 }
7910
7911+static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
7912+{
7913+ if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
7914+ (c->x86 == 0x6 && c->x86_model >= 0x0e))
7915+ set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability);
7916+}
7917+
7918 static void __cpuinit init_intel(struct cpuinfo_x86 *c)
7919 {
7920 /* Cache sizes */
7921 unsigned n;
7922
7923 init_intel_cacheinfo(c);
7924- if (c->cpuid_level > 9 ) {
7925+ if (c->cpuid_level > 9) {
7926 unsigned eax = cpuid_eax(10);
7927 /* Check for version and the number of counters */
7928 if ((eax & 0xff) && (((eax>>8) & 0xff) > 1))
7929- set_bit(X86_FEATURE_ARCH_PERFMON, &c->x86_capability);
7930+ set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON);
7931 }
7932
7933 if (cpu_has_ds) {
7934 unsigned int l1, l2;
7935 rdmsr(MSR_IA32_MISC_ENABLE, l1, l2);
7936 if (!(l1 & (1<<11)))
7937- set_bit(X86_FEATURE_BTS, c->x86_capability);
7938+ set_cpu_cap(c, X86_FEATURE_BTS);
7939 if (!(l1 & (1<<12)))
7940- set_bit(X86_FEATURE_PEBS, c->x86_capability);
7941+ set_cpu_cap(c, X86_FEATURE_PEBS);
7942 }
7943
7944+
7945+ if (cpu_has_bts)
7946+ ds_init_intel(c);
7947+
7948 n = c->extended_cpuid_level;
7949 if (n >= 0x80000008) {
7950 unsigned eax = cpuid_eax(0x80000008);
7951@@ -1059,14 +1096,11 @@ static void __cpuinit init_intel(struct
7952 c->x86_cache_alignment = c->x86_clflush_size * 2;
7953 if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
7954 (c->x86 == 0x6 && c->x86_model >= 0x0e))
7955- set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability);
7956+ set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
7957 if (c->x86 == 6)
7958- set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability);
7959- if (c->x86 == 15)
7960- set_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
7961- else
7962- clear_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
7963- c->x86_max_cores = intel_num_cpu_cores(c);
7964+ set_cpu_cap(c, X86_FEATURE_REP_GOOD);
7965+ set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
7966+ c->x86_max_cores = intel_num_cpu_cores(c);
7967
7968 srat_detect_node();
7969 }
7970@@ -1083,18 +1117,12 @@ static void __cpuinit get_cpu_vendor(str
7971 c->x86_vendor = X86_VENDOR_UNKNOWN;
7972 }
7973
7974-struct cpu_model_info {
7975- int vendor;
7976- int family;
7977- char *model_names[16];
7978-};
7979-
7980 /* Do some early cpuid on the boot CPU to get some parameter that are
7981 needed before check_bugs. Everything advanced is in identify_cpu
7982 below. */
7983-void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
7984+static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
7985 {
7986- u32 tfms;
7987+ u32 tfms, xlvl;
7988
7989 c->loops_per_jiffy = loops_per_jiffy;
7990 c->x86_cache_size = -1;
7991@@ -1105,6 +1133,7 @@ void __cpuinit early_identify_cpu(struct
7992 c->x86_clflush_size = 64;
7993 c->x86_cache_alignment = c->x86_clflush_size;
7994 c->x86_max_cores = 1;
7995+ c->x86_coreid_bits = 0;
7996 c->extended_cpuid_level = 0;
7997 memset(&c->x86_capability, 0, sizeof c->x86_capability);
7998
7999@@ -1113,7 +1142,7 @@ void __cpuinit early_identify_cpu(struct
8000 (unsigned int *)&c->x86_vendor_id[0],
8001 (unsigned int *)&c->x86_vendor_id[8],
8002 (unsigned int *)&c->x86_vendor_id[4]);
8003-
8004+
8005 get_cpu_vendor(c);
8006
8007 /* Initialize the standard set of capabilities */
8008@@ -1131,7 +1160,7 @@ void __cpuinit early_identify_cpu(struct
8009 c->x86 += (tfms >> 20) & 0xff;
8010 if (c->x86 >= 0x6)
8011 c->x86_model += ((tfms >> 16) & 0xF) << 4;
8012- if (c->x86_capability[0] & (1<<19))
8013+ if (c->x86_capability[0] & (1<<19))
8014 c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
8015 } else {
8016 /* Have CPUID level 0 only - unheard of */
8017@@ -1141,18 +1170,6 @@ void __cpuinit early_identify_cpu(struct
8018 #ifdef CONFIG_SMP
8019 c->phys_proc_id = (cpuid_ebx(1) >> 24) & 0xff;
8020 #endif
8021-}
8022-
8023-/*
8024- * This does the hard work of actually picking apart the CPU stuff...
8025- */
8026-void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
8027-{
8028- int i;
8029- u32 xlvl;
8030-
8031- early_identify_cpu(c);
8032-
8033 /* AMD-defined flags: level 0x80000001 */
8034 xlvl = cpuid_eax(0x80000000);
8035 c->extended_cpuid_level = xlvl;
8036@@ -1173,6 +1190,30 @@ void __cpuinit identify_cpu(struct cpuin
8037 c->x86_capability[2] = cpuid_edx(0x80860001);
8038 }
8039
8040+ c->extended_cpuid_level = cpuid_eax(0x80000000);
8041+ if (c->extended_cpuid_level >= 0x80000007)
8042+ c->x86_power = cpuid_edx(0x80000007);
8043+
8044+ switch (c->x86_vendor) {
8045+ case X86_VENDOR_AMD:
8046+ early_init_amd(c);
8047+ break;
8048+ case X86_VENDOR_INTEL:
8049+ early_init_intel(c);
8050+ break;
8051+ }
8052+
8053+}
8054+
8055+/*
8056+ * This does the hard work of actually picking apart the CPU stuff...
8057+ */
8058+void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
8059+{
8060+ int i;
8061+
8062+ early_identify_cpu(c);
8063+
8064 init_scattered_cpuid_features(c);
8065
8066 c->apicid = phys_pkg_id(0);
8067@@ -1202,8 +1243,7 @@ void __cpuinit identify_cpu(struct cpuin
8068 break;
8069 }
8070
8071- select_idle_routine(c);
8072- detect_ht(c);
8073+ detect_ht(c);
8074
8075 /*
8076 * On SMP, boot_cpu_data holds the common feature set between
8077@@ -1213,31 +1253,55 @@ void __cpuinit identify_cpu(struct cpuin
8078 */
8079 if (c != &boot_cpu_data) {
8080 /* AND the already accumulated flags with these */
8081- for (i = 0 ; i < NCAPINTS ; i++)
8082+ for (i = 0; i < NCAPINTS; i++)
8083 boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
8084 }
8085
8086+ /* Clear all flags overriden by options */
8087+ for (i = 0; i < NCAPINTS; i++)
8088+ c->x86_capability[i] &= ~cleared_cpu_caps[i];
8089+
8090 #ifdef CONFIG_X86_MCE
8091 mcheck_init(c);
8092 #endif
8093+ select_idle_routine(c);
8094+
8095 if (c != &boot_cpu_data)
8096 mtrr_ap_init();
8097 #ifdef CONFIG_NUMA
8098 numa_add_cpu(smp_processor_id());
8099 #endif
8100+
8101 }
8102-
8103+
8104+static __init int setup_noclflush(char *arg)
8105+{
8106+ setup_clear_cpu_cap(X86_FEATURE_CLFLSH);
8107+ return 1;
8108+}
8109+__setup("noclflush", setup_noclflush);
8110
8111 void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
8112 {
8113 if (c->x86_model_id[0])
8114- printk("%s", c->x86_model_id);
8115+ printk(KERN_CONT "%s", c->x86_model_id);
8116+
8117+ if (c->x86_mask || c->cpuid_level >= 0)
8118+ printk(KERN_CONT " stepping %02x\n", c->x86_mask);
8119+ else
8120+ printk(KERN_CONT "\n");
8121+}
8122
8123- if (c->x86_mask || c->cpuid_level >= 0)
8124- printk(" stepping %02x\n", c->x86_mask);
8125+static __init int setup_disablecpuid(char *arg)
8126+{
8127+ int bit;
8128+ if (get_option(&arg, &bit) && bit < NCAPINTS*32)
8129+ setup_clear_cpu_cap(bit);
8130 else
8131- printk("\n");
8132+ return 0;
8133+ return 1;
8134 }
8135+__setup("clearcpuid=", setup_disablecpuid);
8136
8137 /*
8138 * Get CPU information for use by the procfs.
8139@@ -1246,116 +1310,41 @@ void __cpuinit print_cpu_info(struct cpu
8140 static int show_cpuinfo(struct seq_file *m, void *v)
8141 {
8142 struct cpuinfo_x86 *c = v;
8143- int cpu = 0;
8144-
8145- /*
8146- * These flag bits must match the definitions in <asm/cpufeature.h>.
8147- * NULL means this bit is undefined or reserved; either way it doesn't
8148- * have meaning as far as Linux is concerned. Note that it's important
8149- * to realize there is a difference between this table and CPUID -- if
8150- * applications want to get the raw CPUID data, they should access
8151- * /dev/cpu/<cpu_nr>/cpuid instead.
8152- */
8153- static const char *const x86_cap_flags[] = {
8154- /* Intel-defined */
8155- "fpu", "vme", "de", "pse", "tsc", "msr", "pae", "mce",
8156- "cx8", "apic", NULL, "sep", "mtrr", "pge", "mca", "cmov",
8157- "pat", "pse36", "pn", "clflush", NULL, "dts", "acpi", "mmx",
8158- "fxsr", "sse", "sse2", "ss", "ht", "tm", "ia64", "pbe",
8159-
8160- /* AMD-defined */
8161- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
8162- NULL, NULL, NULL, "syscall", NULL, NULL, NULL, NULL,
8163- NULL, NULL, NULL, NULL, "nx", NULL, "mmxext", NULL,
8164- NULL, "fxsr_opt", "pdpe1gb", "rdtscp", NULL, "lm",
8165- "3dnowext", "3dnow",
8166-
8167- /* Transmeta-defined */
8168- "recovery", "longrun", NULL, "lrti", NULL, NULL, NULL, NULL,
8169- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
8170- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
8171- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
8172-
8173- /* Other (Linux-defined) */
8174- "cxmmx", "k6_mtrr", "cyrix_arr", "centaur_mcr",
8175- NULL, NULL, NULL, NULL,
8176- "constant_tsc", "up", NULL, "arch_perfmon",
8177- "pebs", "bts", NULL, "sync_rdtsc",
8178- "rep_good", NULL, NULL, NULL, NULL, NULL, NULL, NULL,
8179- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
8180-
8181- /* Intel-defined (#2) */
8182- "pni", NULL, NULL, "monitor", "ds_cpl", "vmx", "smx", "est",
8183- "tm2", "ssse3", "cid", NULL, NULL, "cx16", "xtpr", NULL,
8184- NULL, NULL, "dca", "sse4_1", "sse4_2", NULL, NULL, "popcnt",
8185- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
8186-
8187- /* VIA/Cyrix/Centaur-defined */
8188- NULL, NULL, "rng", "rng_en", NULL, NULL, "ace", "ace_en",
8189- "ace2", "ace2_en", "phe", "phe_en", "pmm", "pmm_en", NULL, NULL,
8190- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
8191- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
8192-
8193- /* AMD-defined (#2) */
8194- "lahf_lm", "cmp_legacy", "svm", "extapic",
8195- "cr8_legacy", "abm", "sse4a", "misalignsse",
8196- "3dnowprefetch", "osvw", "ibs", "sse5",
8197- "skinit", "wdt", NULL, NULL,
8198- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
8199- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
8200-
8201- /* Auxiliary (Linux-defined) */
8202- "ida", NULL, NULL, NULL, NULL, NULL, NULL, NULL,
8203- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
8204- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
8205- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
8206- };
8207- static const char *const x86_power_flags[] = {
8208- "ts", /* temperature sensor */
8209- "fid", /* frequency id control */
8210- "vid", /* voltage id control */
8211- "ttp", /* thermal trip */
8212- "tm",
8213- "stc",
8214- "100mhzsteps",
8215- "hwpstate",
8216- "", /* tsc invariant mapped to constant_tsc */
8217- /* nothing */
8218- };
8219-
8220+ int cpu = 0, i;
8221
8222 #ifdef CONFIG_SMP
8223 cpu = c->cpu_index;
8224 #endif
8225
8226- seq_printf(m,"processor\t: %u\n"
8227- "vendor_id\t: %s\n"
8228- "cpu family\t: %d\n"
8229- "model\t\t: %d\n"
8230- "model name\t: %s\n",
8231- (unsigned)cpu,
8232- c->x86_vendor_id[0] ? c->x86_vendor_id : "unknown",
8233- c->x86,
8234- (int)c->x86_model,
8235- c->x86_model_id[0] ? c->x86_model_id : "unknown");
8236-
8237+ seq_printf(m, "processor\t: %u\n"
8238+ "vendor_id\t: %s\n"
8239+ "cpu family\t: %d\n"
8240+ "model\t\t: %d\n"
8241+ "model name\t: %s\n",
8242+ (unsigned)cpu,
8243+ c->x86_vendor_id[0] ? c->x86_vendor_id : "unknown",
8244+ c->x86,
8245+ (int)c->x86_model,
8246+ c->x86_model_id[0] ? c->x86_model_id : "unknown");
8247+
8248 if (c->x86_mask || c->cpuid_level >= 0)
8249 seq_printf(m, "stepping\t: %d\n", c->x86_mask);
8250 else
8251 seq_printf(m, "stepping\t: unknown\n");
8252-
8253- if (cpu_has(c,X86_FEATURE_TSC)) {
8254+
8255+ if (cpu_has(c, X86_FEATURE_TSC)) {
8256 unsigned int freq = cpufreq_quick_get((unsigned)cpu);
8257+
8258 if (!freq)
8259 freq = cpu_khz;
8260 seq_printf(m, "cpu MHz\t\t: %u.%03u\n",
8261- freq / 1000, (freq % 1000));
8262+ freq / 1000, (freq % 1000));
8263 }
8264
8265 /* Cache size */
8266- if (c->x86_cache_size >= 0)
8267+ if (c->x86_cache_size >= 0)
8268 seq_printf(m, "cache size\t: %d KB\n", c->x86_cache_size);
8269-
8270+
8271 #ifdef CONFIG_SMP
8272 if (smp_num_siblings * c->x86_max_cores > 1) {
8273 seq_printf(m, "physical id\t: %d\n", c->phys_proc_id);
8274@@ -1364,48 +1353,43 @@ static int show_cpuinfo(struct seq_file
8275 seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id);
8276 seq_printf(m, "cpu cores\t: %d\n", c->booted_cores);
8277 }
8278-#endif
8279+#endif
8280
8281 seq_printf(m,
8282- "fpu\t\t: yes\n"
8283- "fpu_exception\t: yes\n"
8284- "cpuid level\t: %d\n"
8285- "wp\t\t: yes\n"
8286- "flags\t\t:",
8287+ "fpu\t\t: yes\n"
8288+ "fpu_exception\t: yes\n"
8289+ "cpuid level\t: %d\n"
8290+ "wp\t\t: yes\n"
8291+ "flags\t\t:",
8292 c->cpuid_level);
8293
8294- {
8295- int i;
8296- for ( i = 0 ; i < 32*NCAPINTS ; i++ )
8297- if (cpu_has(c, i) && x86_cap_flags[i] != NULL)
8298- seq_printf(m, " %s", x86_cap_flags[i]);
8299- }
8300-
8301+ for (i = 0; i < 32*NCAPINTS; i++)
8302+ if (cpu_has(c, i) && x86_cap_flags[i] != NULL)
8303+ seq_printf(m, " %s", x86_cap_flags[i]);
8304+
8305 seq_printf(m, "\nbogomips\t: %lu.%02lu\n",
8306 c->loops_per_jiffy/(500000/HZ),
8307 (c->loops_per_jiffy/(5000/HZ)) % 100);
8308
8309- if (c->x86_tlbsize > 0)
8310+ if (c->x86_tlbsize > 0)
8311 seq_printf(m, "TLB size\t: %d 4K pages\n", c->x86_tlbsize);
8312 seq_printf(m, "clflush size\t: %d\n", c->x86_clflush_size);
8313 seq_printf(m, "cache_alignment\t: %d\n", c->x86_cache_alignment);
8314
8315- seq_printf(m, "address sizes\t: %u bits physical, %u bits virtual\n",
8316+ seq_printf(m, "address sizes\t: %u bits physical, %u bits virtual\n",
8317 c->x86_phys_bits, c->x86_virt_bits);
8318
8319 seq_printf(m, "power management:");
8320- {
8321- unsigned i;
8322- for (i = 0; i < 32; i++)
8323- if (c->x86_power & (1 << i)) {
8324- if (i < ARRAY_SIZE(x86_power_flags) &&
8325- x86_power_flags[i])
8326- seq_printf(m, "%s%s",
8327- x86_power_flags[i][0]?" ":"",
8328- x86_power_flags[i]);
8329- else
8330- seq_printf(m, " [%d]", i);
8331- }
8332+ for (i = 0; i < 32; i++) {
8333+ if (c->x86_power & (1 << i)) {
8334+ if (i < ARRAY_SIZE(x86_power_flags) &&
8335+ x86_power_flags[i])
8336+ seq_printf(m, "%s%s",
8337+ x86_power_flags[i][0]?" ":"",
8338+ x86_power_flags[i]);
8339+ else
8340+ seq_printf(m, " [%d]", i);
8341+ }
8342 }
8343
8344 seq_printf(m, "\n\n");
8345@@ -1432,8 +1416,8 @@ static void c_stop(struct seq_file *m, v
8346 {
8347 }
8348
8349-struct seq_operations cpuinfo_op = {
8350- .start =c_start,
8351+const struct seq_operations cpuinfo_op = {
8352+ .start = c_start,
8353 .next = c_next,
8354 .stop = c_stop,
8355 .show = show_cpuinfo,
8356--- sle11-2009-06-29.orig/arch/x86/kernel/smp_32-xen.c 2009-02-16 16:18:36.000000000 +0100
8357+++ sle11-2009-06-29/arch/x86/kernel/smp_32-xen.c 2009-03-16 16:33:40.000000000 +0100
8358@@ -168,7 +168,7 @@ void __send_IPI_shortcut(unsigned int sh
8359 }
8360 }
8361
8362-void fastcall send_IPI_self(int vector)
8363+void send_IPI_self(int vector)
8364 {
8365 __send_IPI_shortcut(APIC_DEST_SELF, vector);
8366 }
8367@@ -224,13 +224,14 @@ static DEFINE_SPINLOCK(tlbstate_lock);
8368 * We need to reload %cr3 since the page tables may be going
8369 * away from under us..
8370 */
8371-void leave_mm(unsigned long cpu)
8372+void leave_mm(int cpu)
8373 {
8374 if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK)
8375 BUG();
8376 cpu_clear(cpu, per_cpu(cpu_tlbstate, cpu).active_mm->cpu_vm_mask);
8377 load_cr3(swapper_pg_dir);
8378 }
8379+EXPORT_SYMBOL_GPL(leave_mm);
8380
8381 /*
8382 *
8383--- sle11-2009-06-29.orig/arch/x86/kernel/smp_64-xen.c 2009-02-16 16:18:36.000000000 +0100
8384+++ sle11-2009-06-29/arch/x86/kernel/smp_64-xen.c 2009-03-16 16:33:40.000000000 +0100
8385@@ -33,7 +33,7 @@
8386
8387 #ifndef CONFIG_XEN
8388 /*
8389- * Smarter SMP flushing macros.
8390+ * Smarter SMP flushing macros.
8391 * c/o Linus Torvalds.
8392 *
8393 * These mean you can really definitely utterly forget about
8394@@ -41,15 +41,15 @@
8395 *
8396 * Optimizations Manfred Spraul <manfred@colorfullife.com>
8397 *
8398- * More scalable flush, from Andi Kleen
8399+ * More scalable flush, from Andi Kleen
8400 *
8401- * To avoid global state use 8 different call vectors.
8402- * Each CPU uses a specific vector to trigger flushes on other
8403- * CPUs. Depending on the received vector the target CPUs look into
8404+ * To avoid global state use 8 different call vectors.
8405+ * Each CPU uses a specific vector to trigger flushes on other
8406+ * CPUs. Depending on the received vector the target CPUs look into
8407 * the right per cpu variable for the flush data.
8408 *
8409- * With more than 8 CPUs they are hashed to the 8 available
8410- * vectors. The limited global vector space forces us to this right now.
8411+ * With more than 8 CPUs they are hashed to the 8 available
8412+ * vectors. The limited global vector space forces us to this right now.
8413 * In future when interrupts are split into per CPU domains this could be
8414 * fixed, at the cost of triggering multiple IPIs in some cases.
8415 */
8416@@ -59,7 +59,6 @@ union smp_flush_state {
8417 cpumask_t flush_cpumask;
8418 struct mm_struct *flush_mm;
8419 unsigned long flush_va;
8420-#define FLUSH_ALL -1ULL
8421 spinlock_t tlbstate_lock;
8422 };
8423 char pad[SMP_CACHE_BYTES];
8424@@ -71,16 +70,17 @@ union smp_flush_state {
8425 static DEFINE_PER_CPU(union smp_flush_state, flush_state);
8426
8427 /*
8428- * We cannot call mmdrop() because we are in interrupt context,
8429+ * We cannot call mmdrop() because we are in interrupt context,
8430 * instead update mm->cpu_vm_mask.
8431 */
8432-static inline void leave_mm(unsigned long cpu)
8433+void leave_mm(int cpu)
8434 {
8435 if (read_pda(mmu_state) == TLBSTATE_OK)
8436 BUG();
8437 cpu_clear(cpu, read_pda(active_mm)->cpu_vm_mask);
8438 load_cr3(swapper_pg_dir);
8439 }
8440+EXPORT_SYMBOL_GPL(leave_mm);
8441
8442 /*
8443 *
8444@@ -89,25 +89,25 @@ static inline void leave_mm(unsigned lon
8445 * 1) switch_mm() either 1a) or 1b)
8446 * 1a) thread switch to a different mm
8447 * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask);
8448- * Stop ipi delivery for the old mm. This is not synchronized with
8449- * the other cpus, but smp_invalidate_interrupt ignore flush ipis
8450- * for the wrong mm, and in the worst case we perform a superfluous
8451- * tlb flush.
8452+ * Stop ipi delivery for the old mm. This is not synchronized with
8453+ * the other cpus, but smp_invalidate_interrupt ignore flush ipis
8454+ * for the wrong mm, and in the worst case we perform a superfluous
8455+ * tlb flush.
8456 * 1a2) set cpu mmu_state to TLBSTATE_OK
8457- * Now the smp_invalidate_interrupt won't call leave_mm if cpu0
8458+ * Now the smp_invalidate_interrupt won't call leave_mm if cpu0
8459 * was in lazy tlb mode.
8460 * 1a3) update cpu active_mm
8461- * Now cpu0 accepts tlb flushes for the new mm.
8462+ * Now cpu0 accepts tlb flushes for the new mm.
8463 * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask);
8464- * Now the other cpus will send tlb flush ipis.
8465+ * Now the other cpus will send tlb flush ipis.
8466 * 1a4) change cr3.
8467 * 1b) thread switch without mm change
8468 * cpu active_mm is correct, cpu0 already handles
8469 * flush ipis.
8470 * 1b1) set cpu mmu_state to TLBSTATE_OK
8471 * 1b2) test_and_set the cpu bit in cpu_vm_mask.
8472- * Atomically set the bit [other cpus will start sending flush ipis],
8473- * and test the bit.
8474+ * Atomically set the bit [other cpus will start sending flush ipis],
8475+ * and test the bit.
8476 * 1b3) if the bit was 0: leave_mm was called, flush the tlb.
8477 * 2) switch %%esp, ie current
8478 *
8479@@ -141,12 +141,12 @@ asmlinkage void smp_invalidate_interrupt
8480 * orig_rax contains the negated interrupt vector.
8481 * Use that to determine where the sender put the data.
8482 */
8483- sender = ~regs->orig_rax - INVALIDATE_TLB_VECTOR_START;
8484+ sender = ~regs->orig_ax - INVALIDATE_TLB_VECTOR_START;
8485 f = &per_cpu(flush_state, sender);
8486
8487 if (!cpu_isset(cpu, f->flush_cpumask))
8488 goto out;
8489- /*
8490+ /*
8491 * This was a BUG() but until someone can quote me the
8492 * line from the intel manual that guarantees an IPI to
8493 * multiple CPUs is retried _only_ on the erroring CPUs
8494@@ -154,10 +154,10 @@ asmlinkage void smp_invalidate_interrupt
8495 *
8496 * BUG();
8497 */
8498-
8499+
8500 if (f->flush_mm == read_pda(active_mm)) {
8501 if (read_pda(mmu_state) == TLBSTATE_OK) {
8502- if (f->flush_va == FLUSH_ALL)
8503+ if (f->flush_va == TLB_FLUSH_ALL)
8504 local_flush_tlb();
8505 else
8506 __flush_tlb_one(f->flush_va);
8507@@ -170,19 +170,22 @@ out:
8508 add_pda(irq_tlb_count, 1);
8509 }
8510
8511-static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm,
8512- unsigned long va)
8513+void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm,
8514+ unsigned long va)
8515 {
8516 int sender;
8517 union smp_flush_state *f;
8518+ cpumask_t cpumask = *cpumaskp;
8519
8520 /* Caller has disabled preemption */
8521 sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS;
8522 f = &per_cpu(flush_state, sender);
8523
8524- /* Could avoid this lock when
8525- num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is
8526- probably not worth checking this for a cache-hot lock. */
8527+ /*
8528+ * Could avoid this lock when
8529+ * num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is
8530+ * probably not worth checking this for a cache-hot lock.
8531+ */
8532 spin_lock(&f->tlbstate_lock);
8533
8534 f->flush_mm = mm;
8535@@ -206,14 +209,14 @@ static void flush_tlb_others(cpumask_t c
8536 int __cpuinit init_smp_flush(void)
8537 {
8538 int i;
8539+
8540 for_each_cpu_mask(i, cpu_possible_map) {
8541 spin_lock_init(&per_cpu(flush_state, i).tlbstate_lock);
8542 }
8543 return 0;
8544 }
8545-
8546 core_initcall(init_smp_flush);
8547-
8548+
8549 void flush_tlb_current_task(void)
8550 {
8551 struct mm_struct *mm = current->mm;
8552@@ -225,10 +228,9 @@ void flush_tlb_current_task(void)
8553
8554 local_flush_tlb();
8555 if (!cpus_empty(cpu_mask))
8556- flush_tlb_others(cpu_mask, mm, FLUSH_ALL);
8557+ flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
8558 preempt_enable();
8559 }
8560-EXPORT_SYMBOL(flush_tlb_current_task);
8561
8562 void flush_tlb_mm (struct mm_struct * mm)
8563 {
8564@@ -245,11 +247,10 @@ void flush_tlb_mm (struct mm_struct * mm
8565 leave_mm(smp_processor_id());
8566 }
8567 if (!cpus_empty(cpu_mask))
8568- flush_tlb_others(cpu_mask, mm, FLUSH_ALL);
8569+ flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
8570
8571 preempt_enable();
8572 }
8573-EXPORT_SYMBOL(flush_tlb_mm);
8574
8575 void flush_tlb_page(struct vm_area_struct * vma, unsigned long va)
8576 {
8577@@ -263,8 +264,8 @@ void flush_tlb_page(struct vm_area_struc
8578 if (current->active_mm == mm) {
8579 if(current->mm)
8580 __flush_tlb_one(va);
8581- else
8582- leave_mm(smp_processor_id());
8583+ else
8584+ leave_mm(smp_processor_id());
8585 }
8586
8587 if (!cpus_empty(cpu_mask))
8588@@ -272,7 +273,6 @@ void flush_tlb_page(struct vm_area_struc
8589
8590 preempt_enable();
8591 }
8592-EXPORT_SYMBOL(flush_tlb_page);
8593
8594 static void do_flush_tlb_all(void* info)
8595 {
8596@@ -330,11 +330,9 @@ void unlock_ipi_call_lock(void)
8597 * this function sends a 'generic call function' IPI to all other CPU
8598 * of the system defined in the mask.
8599 */
8600-
8601-static int
8602-__smp_call_function_mask(cpumask_t mask,
8603- void (*func)(void *), void *info,
8604- int wait)
8605+static int __smp_call_function_mask(cpumask_t mask,
8606+ void (*func)(void *), void *info,
8607+ int wait)
8608 {
8609 struct call_data_struct data;
8610 cpumask_t allbutself;
8611@@ -422,11 +420,10 @@ EXPORT_SYMBOL(smp_call_function_mask);
8612 */
8613
8614 int smp_call_function_single (int cpu, void (*func) (void *info), void *info,
8615- int nonatomic, int wait)
8616+ int nonatomic, int wait)
8617 {
8618 /* prevent preemption and reschedule on another processor */
8619- int ret;
8620- int me = get_cpu();
8621+ int ret, me = get_cpu();
8622
8623 /* Can deadlock when called with interrupts disabled */
8624 WARN_ON(irqs_disabled());
8625@@ -476,9 +473,9 @@ static void stop_this_cpu(void *dummy)
8626 */
8627 cpu_clear(smp_processor_id(), cpu_online_map);
8628 disable_all_local_evtchn();
8629- for (;;)
8630+ for (;;)
8631 halt();
8632-}
8633+}
8634
8635 void smp_send_stop(void)
8636 {
8637--- sle11-2009-06-29.orig/arch/x86/kernel/time_32-xen.c 2009-03-24 10:12:35.000000000 +0100
8638+++ sle11-2009-06-29/arch/x86/kernel/time_32-xen.c 2009-03-24 10:12:48.000000000 +0100
8639@@ -28,21 +28,9 @@
8640 * serialize accesses to xtime/lost_ticks).
8641 */
8642
8643-#include <linux/errno.h>
8644-#include <linux/sched.h>
8645-#include <linux/kernel.h>
8646-#include <linux/param.h>
8647-#include <linux/string.h>
8648-#include <linux/mm.h>
8649+#include <linux/init.h>
8650 #include <linux/interrupt.h>
8651 #include <linux/time.h>
8652-#include <linux/delay.h>
8653-#include <linux/init.h>
8654-#include <linux/smp.h>
8655-#include <linux/module.h>
8656-#include <linux/sysdev.h>
8657-#include <linux/bcd.h>
8658-#include <linux/efi.h>
8659 #include <linux/mca.h>
8660 #include <linux/sysctl.h>
8661 #include <linux/percpu.h>
8662@@ -50,26 +38,10 @@
8663 #include <linux/posix-timers.h>
8664 #include <linux/cpufreq.h>
8665 #include <linux/clocksource.h>
8666+#include <linux/sysdev.h>
8667
8668-#include <asm/io.h>
8669-#include <asm/smp.h>
8670-#include <asm/irq.h>
8671-#include <asm/msr.h>
8672 #include <asm/delay.h>
8673-#include <asm/mpspec.h>
8674-#include <asm/uaccess.h>
8675-#include <asm/processor.h>
8676-#include <asm/timer.h>
8677 #include <asm/time.h>
8678-#include <asm/sections.h>
8679-
8680-#include "mach_time.h"
8681-
8682-#include <linux/timex.h>
8683-
8684-#include <asm/hpet.h>
8685-
8686-#include <asm/arch_hooks.h>
8687
8688 #include <xen/evtchn.h>
8689 #include <xen/sysctl.h>
8690@@ -89,9 +61,6 @@ volatile unsigned long __jiffies __secti
8691 unsigned int cpu_khz; /* Detected as we calibrate the TSC */
8692 EXPORT_SYMBOL(cpu_khz);
8693
8694-DEFINE_SPINLOCK(rtc_lock);
8695-EXPORT_SYMBOL(rtc_lock);
8696-
8697 /* These are peridically updated in shared_info, and then copied here. */
8698 struct shadow_time_info {
8699 u64 tsc_timestamp; /* TSC at last update of time vals. */
8700@@ -154,6 +123,11 @@ static int __init __independent_wallcloc
8701 }
8702 __setup("independent_wallclock", __independent_wallclock);
8703
8704+int xen_independent_wallclock(void)
8705+{
8706+ return independent_wallclock;
8707+}
8708+
8709 /* Permitted clock jitter, in nsecs, beyond which a warning will be printed. */
8710 static unsigned long permitted_clock_jitter = 10000000UL; /* 10ms */
8711 static int __init __permitted_clock_jitter(char *str)
8712@@ -223,7 +197,6 @@ static inline u64 get64(volatile u64 *pt
8713 return cmpxchg64(ptr, 0, 0);
8714 #else
8715 return *ptr;
8716-#define cmpxchg64 cmpxchg
8717 #endif
8718 }
8719
8720@@ -233,7 +206,6 @@ static inline u64 get64_local(volatile u
8721 return cmpxchg64_local(ptr, 0, 0);
8722 #else
8723 return *ptr;
8724-#define cmpxchg64_local cmpxchg_local
8725 #endif
8726 }
8727
8728@@ -339,35 +311,6 @@ static inline int time_values_up_to_date
8729 return (dst->version == src->version);
8730 }
8731
8732-/*
8733- * This is a special lock that is owned by the CPU and holds the index
8734- * register we are working with. It is required for NMI access to the
8735- * CMOS/RTC registers. See include/asm-i386/mc146818rtc.h for details.
8736- */
8737-volatile unsigned long cmos_lock = 0;
8738-EXPORT_SYMBOL(cmos_lock);
8739-
8740-/* Routines for accessing the CMOS RAM/RTC. */
8741-unsigned char rtc_cmos_read(unsigned char addr)
8742-{
8743- unsigned char val;
8744- lock_cmos_prefix(addr);
8745- outb_p(addr, RTC_PORT(0));
8746- val = inb_p(RTC_PORT(1));
8747- lock_cmos_suffix(addr);
8748- return val;
8749-}
8750-EXPORT_SYMBOL(rtc_cmos_read);
8751-
8752-void rtc_cmos_write(unsigned char val, unsigned char addr)
8753-{
8754- lock_cmos_prefix(addr);
8755- outb_p(addr, RTC_PORT(0));
8756- outb_p(val, RTC_PORT(1));
8757- lock_cmos_suffix(addr);
8758-}
8759-EXPORT_SYMBOL(rtc_cmos_write);
8760-
8761 static void sync_xen_wallclock(unsigned long dummy);
8762 static DEFINE_TIMER(sync_xen_wallclock_timer, sync_xen_wallclock, 0, 0);
8763 static void sync_xen_wallclock(unsigned long dummy)
8764@@ -376,7 +319,8 @@ static void sync_xen_wallclock(unsigned
8765 s64 nsec;
8766 struct xen_platform_op op;
8767
8768- if (!ntp_synced() || independent_wallclock || !is_initial_xendomain())
8769+ BUG_ON(!is_initial_xendomain());
8770+ if (!ntp_synced() || independent_wallclock)
8771 return;
8772
8773 write_seqlock_irq(&xtime_lock);
8774@@ -399,23 +343,6 @@ static void sync_xen_wallclock(unsigned
8775 mod_timer(&sync_xen_wallclock_timer, jiffies + 60*HZ);
8776 }
8777
8778-static int set_rtc_mmss(unsigned long nowtime)
8779-{
8780- int retval;
8781- unsigned long flags;
8782-
8783- if (independent_wallclock || !is_initial_xendomain())
8784- return 0;
8785-
8786- /* gets recalled with irq locally disabled */
8787- /* XXX - does irqsave resolve this? -johnstul */
8788- spin_lock_irqsave(&rtc_lock, flags);
8789- retval = set_wallclock(nowtime);
8790- spin_unlock_irqrestore(&rtc_lock, flags);
8791-
8792- return retval;
8793-}
8794-
8795 static unsigned long long local_clock(void)
8796 {
8797 unsigned int cpu = get_cpu();
8798@@ -498,28 +425,24 @@ unsigned long profile_pc(struct pt_regs
8799
8800 #if defined(CONFIG_SMP) || defined(__x86_64__)
8801 # ifdef __i386__
8802- if (!v8086_mode(regs) && SEGMENT_IS_KERNEL_CODE(regs->xcs)
8803+ if (!v8086_mode(regs) && SEGMENT_IS_KERNEL_CODE(regs->cs)
8804 # else
8805 if (!user_mode(regs)
8806 # endif
8807 && in_lock_functions(pc)) {
8808 # ifdef CONFIG_FRAME_POINTER
8809-# ifdef __i386__
8810- return ((unsigned long *)regs->ebp)[1];
8811-# else
8812- return ((unsigned long *)regs->rbp)[1];
8813-# endif
8814+ return ((unsigned long *)regs->bp)[1];
8815 # else
8816 # ifdef __i386__
8817- unsigned long *sp = (unsigned long *)&regs->esp;
8818+ unsigned long *sp = (unsigned long *)&regs->sp;
8819 # else
8820- unsigned long *sp = (unsigned long *)regs->rsp;
8821+ unsigned long *sp = (unsigned long *)regs->sp;
8822 # endif
8823
8824 /* Return address is either directly at stack pointer
8825- or above a saved eflags. Eflags has bits 22-31 zero,
8826+ or above a saved flags. Eflags has bits 22-31 zero,
8827 kernel addresses don't. */
8828- if (sp[0] >> 22)
8829+ if (sp[0] >> 22)
8830 return sp[0];
8831 if (sp[1] >> 22)
8832 return sp[1];
8833@@ -748,25 +671,32 @@ static void init_missing_ticks_accountin
8834 runstate->time[RUNSTATE_offline];
8835 }
8836
8837-/* not static: needed by APM */
8838-unsigned long read_persistent_clock(void)
8839+unsigned long xen_read_persistent_clock(void)
8840 {
8841- unsigned long retval;
8842- unsigned long flags;
8843-
8844- spin_lock_irqsave(&rtc_lock, flags);
8845+ const shared_info_t *s = HYPERVISOR_shared_info;
8846+ u32 version, sec, nsec;
8847+ u64 delta;
8848
8849- retval = get_wallclock();
8850+ do {
8851+ version = s->wc_version;
8852+ rmb();
8853+ sec = s->wc_sec;
8854+ nsec = s->wc_nsec;
8855+ rmb();
8856+ } while ((s->wc_version & 1) | (version ^ s->wc_version));
8857
8858- spin_unlock_irqrestore(&rtc_lock, flags);
8859+ delta = local_clock() + (u64)sec * NSEC_PER_SEC + nsec;
8860+ do_div(delta, NSEC_PER_SEC);
8861
8862- return retval;
8863+ return delta;
8864 }
8865
8866-int update_persistent_clock(struct timespec now)
8867+int xen_update_persistent_clock(void)
8868 {
8869+ if (!is_initial_xendomain())
8870+ return -1;
8871 mod_timer(&sync_xen_wallclock_timer, jiffies + 1);
8872- return set_rtc_mmss(now.tv_sec);
8873+ return 0;
8874 }
8875
8876 extern void (*late_time_init)(void);
8877--- sle11-2009-06-29.orig/arch/x86/kernel/traps_32-xen.c 2009-02-16 16:18:36.000000000 +0100
8878+++ sle11-2009-06-29/arch/x86/kernel/traps_32-xen.c 2009-03-16 16:33:40.000000000 +0100
8879@@ -79,7 +79,8 @@ char ignore_fpu_irq = 0;
8880 * F0 0F bug workaround.. We have a special link segment
8881 * for this.
8882 */
8883-struct desc_struct idt_table[256] __attribute__((__section__(".data.idt"))) = { {0, 0}, };
8884+gate_desc idt_table[256]
8885+ __attribute__((__section__(".data.idt"))) = { { { { 0, 0 } } }, };
8886 #endif
8887
8888 asmlinkage void divide_error(void);
8889@@ -109,6 +110,34 @@ asmlinkage void machine_check(void);
8890 int kstack_depth_to_print = 24;
8891 static unsigned int code_bytes = 64;
8892
8893+void printk_address(unsigned long address, int reliable)
8894+{
8895+#ifdef CONFIG_KALLSYMS
8896+ unsigned long offset = 0, symsize;
8897+ const char *symname;
8898+ char *modname;
8899+ char *delim = ":";
8900+ char namebuf[128];
8901+ char reliab[4] = "";
8902+
8903+ symname = kallsyms_lookup(address, &symsize, &offset,
8904+ &modname, namebuf);
8905+ if (!symname) {
8906+ printk(" [<%08lx>]\n", address);
8907+ return;
8908+ }
8909+ if (!reliable)
8910+ strcpy(reliab, "? ");
8911+
8912+ if (!modname)
8913+ modname = delim = "";
8914+ printk(" [<%08lx>] %s%s%s%s%s+0x%lx/0x%lx\n",
8915+ address, reliab, delim, modname, delim, symname, offset, symsize);
8916+#else
8917+ printk(" [<%08lx>]\n", address);
8918+#endif
8919+}
8920+
8921 static inline int valid_stack_ptr(struct thread_info *tinfo, void *p, unsigned size)
8922 {
8923 return p > (void *)tinfo &&
8924@@ -122,48 +151,35 @@ struct stack_frame {
8925 };
8926
8927 static inline unsigned long print_context_stack(struct thread_info *tinfo,
8928- unsigned long *stack, unsigned long ebp,
8929+ unsigned long *stack, unsigned long bp,
8930 const struct stacktrace_ops *ops, void *data)
8931 {
8932-#ifdef CONFIG_FRAME_POINTER
8933- struct stack_frame *frame = (struct stack_frame *)ebp;
8934- while (valid_stack_ptr(tinfo, frame, sizeof(*frame))) {
8935- struct stack_frame *next;
8936- unsigned long addr;
8937+ struct stack_frame *frame = (struct stack_frame *)bp;
8938
8939- addr = frame->return_address;
8940- ops->address(data, addr);
8941- /*
8942- * break out of recursive entries (such as
8943- * end_of_stack_stop_unwind_function). Also,
8944- * we can never allow a frame pointer to
8945- * move downwards!
8946- */
8947- next = frame->next_frame;
8948- if (next <= frame)
8949- break;
8950- frame = next;
8951- }
8952-#else
8953 while (valid_stack_ptr(tinfo, stack, sizeof(*stack))) {
8954 unsigned long addr;
8955
8956- addr = *stack++;
8957- if (__kernel_text_address(addr))
8958- ops->address(data, addr);
8959+ addr = *stack;
8960+ if (__kernel_text_address(addr)) {
8961+ if ((unsigned long) stack == bp + 4) {
8962+ ops->address(data, addr, 1);
8963+ frame = frame->next_frame;
8964+ bp = (unsigned long) frame;
8965+ } else {
8966+ ops->address(data, addr, bp == 0);
8967+ }
8968+ }
8969+ stack++;
8970 }
8971-#endif
8972- return ebp;
8973+ return bp;
8974 }
8975
8976 #define MSG(msg) ops->warning(data, msg)
8977
8978 void dump_trace(struct task_struct *task, struct pt_regs *regs,
8979- unsigned long *stack,
8980+ unsigned long *stack, unsigned long bp,
8981 const struct stacktrace_ops *ops, void *data)
8982 {
8983- unsigned long ebp = 0;
8984-
8985 if (!task)
8986 task = current;
8987
8988@@ -171,17 +187,17 @@ void dump_trace(struct task_struct *task
8989 unsigned long dummy;
8990 stack = &dummy;
8991 if (task != current)
8992- stack = (unsigned long *)task->thread.esp;
8993+ stack = (unsigned long *)task->thread.sp;
8994 }
8995
8996 #ifdef CONFIG_FRAME_POINTER
8997- if (!ebp) {
8998+ if (!bp) {
8999 if (task == current) {
9000- /* Grab ebp right from our regs */
9001- asm ("movl %%ebp, %0" : "=r" (ebp) : );
9002+ /* Grab bp right from our regs */
9003+ asm ("movl %%ebp, %0" : "=r" (bp) : );
9004 } else {
9005- /* ebp is the last reg pushed by switch_to */
9006- ebp = *(unsigned long *) task->thread.esp;
9007+ /* bp is the last reg pushed by switch_to */
9008+ bp = *(unsigned long *) task->thread.sp;
9009 }
9010 }
9011 #endif
9012@@ -190,7 +206,7 @@ void dump_trace(struct task_struct *task
9013 struct thread_info *context;
9014 context = (struct thread_info *)
9015 ((unsigned long)stack & (~(THREAD_SIZE - 1)));
9016- ebp = print_context_stack(context, stack, ebp, ops, data);
9017+ bp = print_context_stack(context, stack, bp, ops, data);
9018 /* Should be after the line below, but somewhere
9019 in early boot context comes out corrupted and we
9020 can't reference it -AK */
9021@@ -225,9 +241,11 @@ static int print_trace_stack(void *data,
9022 /*
9023 * Print one address/symbol entries per line.
9024 */
9025-static void print_trace_address(void *data, unsigned long addr)
9026+static void print_trace_address(void *data, unsigned long addr, int reliable)
9027 {
9028 printk("%s [<%08lx>] ", (char *)data, addr);
9029+ if (!reliable)
9030+ printk("? ");
9031 print_symbol("%s\n", addr);
9032 touch_nmi_watchdog();
9033 }
9034@@ -241,32 +259,32 @@ static const struct stacktrace_ops print
9035
9036 static void
9037 show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
9038- unsigned long * stack, char *log_lvl)
9039+ unsigned long *stack, unsigned long bp, char *log_lvl)
9040 {
9041- dump_trace(task, regs, stack, &print_trace_ops, log_lvl);
9042+ dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
9043 printk("%s =======================\n", log_lvl);
9044 }
9045
9046 void show_trace(struct task_struct *task, struct pt_regs *regs,
9047- unsigned long * stack)
9048+ unsigned long *stack, unsigned long bp)
9049 {
9050- show_trace_log_lvl(task, regs, stack, "");
9051+ show_trace_log_lvl(task, regs, stack, bp, "");
9052 }
9053
9054 static void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
9055- unsigned long *esp, char *log_lvl)
9056+ unsigned long *sp, unsigned long bp, char *log_lvl)
9057 {
9058 unsigned long *stack;
9059 int i;
9060
9061- if (esp == NULL) {
9062+ if (sp == NULL) {
9063 if (task)
9064- esp = (unsigned long*)task->thread.esp;
9065+ sp = (unsigned long*)task->thread.sp;
9066 else
9067- esp = (unsigned long *)&esp;
9068+ sp = (unsigned long *)&sp;
9069 }
9070
9071- stack = esp;
9072+ stack = sp;
9073 for(i = 0; i < kstack_depth_to_print; i++) {
9074 if (kstack_end(stack))
9075 break;
9076@@ -275,13 +293,13 @@ static void show_stack_log_lvl(struct ta
9077 printk("%08lx ", *stack++);
9078 }
9079 printk("\n%sCall Trace:\n", log_lvl);
9080- show_trace_log_lvl(task, regs, esp, log_lvl);
9081+ show_trace_log_lvl(task, regs, sp, bp, log_lvl);
9082 }
9083
9084-void show_stack(struct task_struct *task, unsigned long *esp)
9085+void show_stack(struct task_struct *task, unsigned long *sp)
9086 {
9087 printk(" ");
9088- show_stack_log_lvl(task, NULL, esp, "");
9089+ show_stack_log_lvl(task, NULL, sp, 0, "");
9090 }
9091
9092 /*
9093@@ -290,13 +308,19 @@ void show_stack(struct task_struct *task
9094 void dump_stack(void)
9095 {
9096 unsigned long stack;
9097+ unsigned long bp = 0;
9098+
9099+#ifdef CONFIG_FRAME_POINTER
9100+ if (!bp)
9101+ asm("movl %%ebp, %0" : "=r" (bp):);
9102+#endif
9103
9104 printk("Pid: %d, comm: %.20s %s %s %.*s\n",
9105 current->pid, current->comm, print_tainted(),
9106 init_utsname()->release,
9107 (int)strcspn(init_utsname()->version, " "),
9108 init_utsname()->version);
9109- show_trace(current, NULL, &stack);
9110+ show_trace(current, NULL, &stack, bp);
9111 }
9112
9113 EXPORT_SYMBOL(dump_stack);
9114@@ -315,30 +339,30 @@ void show_registers(struct pt_regs *regs
9115 * time of the fault..
9116 */
9117 if (!user_mode_vm(regs)) {
9118- u8 *eip;
9119+ u8 *ip;
9120 unsigned int code_prologue = code_bytes * 43 / 64;
9121 unsigned int code_len = code_bytes;
9122 unsigned char c;
9123
9124 printk("\n" KERN_EMERG "Stack: ");
9125- show_stack_log_lvl(NULL, regs, &regs->esp, KERN_EMERG);
9126+ show_stack_log_lvl(NULL, regs, &regs->sp, 0, KERN_EMERG);
9127
9128 printk(KERN_EMERG "Code: ");
9129
9130- eip = (u8 *)regs->eip - code_prologue;
9131- if (eip < (u8 *)PAGE_OFFSET ||
9132- probe_kernel_address(eip, c)) {
9133+ ip = (u8 *)regs->ip - code_prologue;
9134+ if (ip < (u8 *)PAGE_OFFSET ||
9135+ probe_kernel_address(ip, c)) {
9136 /* try starting at EIP */
9137- eip = (u8 *)regs->eip;
9138+ ip = (u8 *)regs->ip;
9139 code_len = code_len - code_prologue + 1;
9140 }
9141- for (i = 0; i < code_len; i++, eip++) {
9142- if (eip < (u8 *)PAGE_OFFSET ||
9143- probe_kernel_address(eip, c)) {
9144+ for (i = 0; i < code_len; i++, ip++) {
9145+ if (ip < (u8 *)PAGE_OFFSET ||
9146+ probe_kernel_address(ip, c)) {
9147 printk(" Bad EIP value.");
9148 break;
9149 }
9150- if (eip == (u8 *)regs->eip)
9151+ if (ip == (u8 *)regs->ip)
9152 printk("<%02x> ", c);
9153 else
9154 printk("%02x ", c);
9155@@ -347,18 +371,57 @@ void show_registers(struct pt_regs *regs
9156 printk("\n");
9157 }
9158
9159-int is_valid_bugaddr(unsigned long eip)
9160+int is_valid_bugaddr(unsigned long ip)
9161 {
9162 unsigned short ud2;
9163
9164- if (eip < PAGE_OFFSET)
9165+ if (ip < PAGE_OFFSET)
9166 return 0;
9167- if (probe_kernel_address((unsigned short *)eip, ud2))
9168+ if (probe_kernel_address((unsigned short *)ip, ud2))
9169 return 0;
9170
9171 return ud2 == 0x0b0f;
9172 }
9173
9174+static int die_counter;
9175+
9176+int __kprobes __die(const char * str, struct pt_regs * regs, long err)
9177+{
9178+ unsigned long sp;
9179+ unsigned short ss;
9180+
9181+ printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter);
9182+#ifdef CONFIG_PREEMPT
9183+ printk("PREEMPT ");
9184+#endif
9185+#ifdef CONFIG_SMP
9186+ printk("SMP ");
9187+#endif
9188+#ifdef CONFIG_DEBUG_PAGEALLOC
9189+ printk("DEBUG_PAGEALLOC");
9190+#endif
9191+ printk("\n");
9192+
9193+ if (notify_die(DIE_OOPS, str, regs, err,
9194+ current->thread.trap_no, SIGSEGV) !=
9195+ NOTIFY_STOP) {
9196+ show_registers(regs);
9197+ /* Executive summary in case the oops scrolled away */
9198+ sp = (unsigned long) (&regs->sp);
9199+ savesegment(ss, ss);
9200+ if (user_mode(regs)) {
9201+ sp = regs->sp;
9202+ ss = regs->ss & 0xffff;
9203+ }
9204+ printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip);
9205+ print_symbol("%s", regs->ip);
9206+ printk(" SS:ESP %04x:%08lx\n", ss, sp);
9207+ return 0;
9208+ } else {
9209+ return 1;
9210+ }
9211+}
9212+
9213 /*
9214 * This is gone through when something in the kernel has done something bad and
9215 * is about to be terminated.
9216@@ -374,7 +437,6 @@ void die(const char * str, struct pt_reg
9217 .lock_owner = -1,
9218 .lock_owner_depth = 0
9219 };
9220- static int die_counter;
9221 unsigned long flags;
9222
9223 oops_enter();
9224@@ -390,43 +452,13 @@ void die(const char * str, struct pt_reg
9225 raw_local_irq_save(flags);
9226
9227 if (++die.lock_owner_depth < 3) {
9228- unsigned long esp;
9229- unsigned short ss;
9230-
9231- report_bug(regs->eip, regs);
9232-
9233- printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff,
9234- ++die_counter);
9235-#ifdef CONFIG_PREEMPT
9236- printk("PREEMPT ");
9237-#endif
9238-#ifdef CONFIG_SMP
9239- printk("SMP ");
9240-#endif
9241-#ifdef CONFIG_DEBUG_PAGEALLOC
9242- printk("DEBUG_PAGEALLOC");
9243-#endif
9244- printk("\n");
9245+ report_bug(regs->ip, regs);
9246
9247- if (notify_die(DIE_OOPS, str, regs, err,
9248- current->thread.trap_no, SIGSEGV) !=
9249- NOTIFY_STOP) {
9250- show_registers(regs);
9251- /* Executive summary in case the oops scrolled away */
9252- esp = (unsigned long) (&regs->esp);
9253- savesegment(ss, ss);
9254- if (user_mode(regs)) {
9255- esp = regs->esp;
9256- ss = regs->xss & 0xffff;
9257- }
9258- printk(KERN_EMERG "EIP: [<%08lx>] ", regs->eip);
9259- print_symbol("%s", regs->eip);
9260- printk(" SS:ESP %04x:%08lx\n", ss, esp);
9261- }
9262- else
9263+ if (__die(str, regs, err))
9264 regs = NULL;
9265- } else
9266+ } else {
9267 printk(KERN_EMERG "Recursive die() failure, output suppressed\n");
9268+ }
9269
9270 bust_spinlocks(0);
9271 die.lock_owner = -1;
9272@@ -462,7 +494,7 @@ static void __kprobes do_trap(int trapnr
9273 {
9274 struct task_struct *tsk = current;
9275
9276- if (regs->eflags & VM_MASK) {
9277+ if (regs->flags & VM_MASK) {
9278 if (vm86)
9279 goto vm86_trap;
9280 goto trap_signal;
9281@@ -508,7 +540,7 @@ static void __kprobes do_trap(int trapnr
9282 }
9283
9284 #define DO_ERROR(trapnr, signr, str, name) \
9285-fastcall void do_##name(struct pt_regs * regs, long error_code) \
9286+void do_##name(struct pt_regs * regs, long error_code) \
9287 { \
9288 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
9289 == NOTIFY_STOP) \
9290@@ -517,7 +549,7 @@ fastcall void do_##name(struct pt_regs *
9291 }
9292
9293 #define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr, irq) \
9294-fastcall void do_##name(struct pt_regs * regs, long error_code) \
9295+void do_##name(struct pt_regs * regs, long error_code) \
9296 { \
9297 siginfo_t info; \
9298 if (irq) \
9299@@ -533,7 +565,7 @@ fastcall void do_##name(struct pt_regs *
9300 }
9301
9302 #define DO_VM86_ERROR(trapnr, signr, str, name) \
9303-fastcall void do_##name(struct pt_regs * regs, long error_code) \
9304+void do_##name(struct pt_regs * regs, long error_code) \
9305 { \
9306 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
9307 == NOTIFY_STOP) \
9308@@ -542,7 +574,7 @@ fastcall void do_##name(struct pt_regs *
9309 }
9310
9311 #define DO_VM86_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
9312-fastcall void do_##name(struct pt_regs * regs, long error_code) \
9313+void do_##name(struct pt_regs * regs, long error_code) \
9314 { \
9315 siginfo_t info; \
9316 info.si_signo = signr; \
9317@@ -556,13 +588,13 @@ fastcall void do_##name(struct pt_regs *
9318 do_trap(trapnr, signr, str, 1, regs, error_code, &info); \
9319 }
9320
9321-DO_VM86_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->eip)
9322+DO_VM86_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip)
9323 #ifndef CONFIG_KPROBES
9324 DO_VM86_ERROR( 3, SIGTRAP, "int3", int3)
9325 #endif
9326 DO_VM86_ERROR( 4, SIGSEGV, "overflow", overflow)
9327 DO_VM86_ERROR( 5, SIGSEGV, "bounds", bounds)
9328-DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->eip, 0)
9329+DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip, 0)
9330 DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
9331 DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
9332 DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
9333@@ -570,10 +602,10 @@ DO_ERROR(12, SIGBUS, "stack segment", s
9334 DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0, 0)
9335 DO_ERROR_INFO(32, SIGSEGV, "iret exception", iret_error, ILL_BADSTK, 0, 1)
9336
9337-fastcall void __kprobes do_general_protection(struct pt_regs * regs,
9338+void __kprobes do_general_protection(struct pt_regs * regs,
9339 long error_code)
9340 {
9341- if (regs->eflags & VM_MASK)
9342+ if (regs->flags & VM_MASK)
9343 goto gp_in_vm86;
9344
9345 if (!user_mode(regs))
9346@@ -582,11 +614,14 @@ fastcall void __kprobes do_general_prote
9347 current->thread.error_code = error_code;
9348 current->thread.trap_no = 13;
9349 if (show_unhandled_signals && unhandled_signal(current, SIGSEGV) &&
9350- printk_ratelimit())
9351+ printk_ratelimit()) {
9352 printk(KERN_INFO
9353- "%s[%d] general protection eip:%lx esp:%lx error:%lx\n",
9354+ "%s[%d] general protection ip:%lx sp:%lx error:%lx",
9355 current->comm, task_pid_nr(current),
9356- regs->eip, regs->esp, error_code);
9357+ regs->ip, regs->sp, error_code);
9358+ print_vma_addr(" in ", regs->ip);
9359+ printk("\n");
9360+ }
9361
9362 force_sig(SIGSEGV, current);
9363 return;
9364@@ -675,8 +710,8 @@ void __kprobes die_nmi(struct pt_regs *r
9365 */
9366 bust_spinlocks(1);
9367 printk(KERN_EMERG "%s", msg);
9368- printk(" on CPU%d, eip %08lx, registers:\n",
9369- smp_processor_id(), regs->eip);
9370+ printk(" on CPU%d, ip %08lx, registers:\n",
9371+ smp_processor_id(), regs->ip);
9372 show_registers(regs);
9373 console_silent();
9374 spin_unlock(&nmi_print_lock);
9375@@ -733,7 +768,7 @@ static __kprobes void default_do_nmi(str
9376
9377 static int ignore_nmis;
9378
9379-fastcall __kprobes void do_nmi(struct pt_regs * regs, long error_code)
9380+__kprobes void do_nmi(struct pt_regs * regs, long error_code)
9381 {
9382 int cpu;
9383
9384@@ -762,7 +797,7 @@ void restart_nmi(void)
9385 }
9386
9387 #ifdef CONFIG_KPROBES
9388-fastcall void __kprobes do_int3(struct pt_regs *regs, long error_code)
9389+void __kprobes do_int3(struct pt_regs *regs, long error_code)
9390 {
9391 trace_hardirqs_fixup();
9392
9393@@ -798,7 +833,7 @@ fastcall void __kprobes do_int3(struct p
9394 * find every occurrence of the TF bit that could be saved away even
9395 * by user code)
9396 */
9397-fastcall void __kprobes do_debug(struct pt_regs * regs, long error_code)
9398+void __kprobes do_debug(struct pt_regs * regs, long error_code)
9399 {
9400 unsigned int condition;
9401 struct task_struct *tsk = current;
9402@@ -807,24 +842,30 @@ fastcall void __kprobes do_debug(struct
9403
9404 get_debugreg(condition, 6);
9405
9406+ /*
9407+ * The processor cleared BTF, so don't mark that we need it set.
9408+ */
9409+ clear_tsk_thread_flag(tsk, TIF_DEBUGCTLMSR);
9410+ tsk->thread.debugctlmsr = 0;
9411+
9412 if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
9413 SIGTRAP) == NOTIFY_STOP)
9414 return;
9415 /* It's safe to allow irq's after DR6 has been saved */
9416- if (regs->eflags & X86_EFLAGS_IF)
9417+ if (regs->flags & X86_EFLAGS_IF)
9418 local_irq_enable();
9419
9420 /* Mask out spurious debug traps due to lazy DR7 setting */
9421 if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) {
9422- if (!tsk->thread.debugreg[7])
9423+ if (!tsk->thread.debugreg7)
9424 goto clear_dr7;
9425 }
9426
9427- if (regs->eflags & VM_MASK)
9428+ if (regs->flags & VM_MASK)
9429 goto debug_vm86;
9430
9431 /* Save debug status register where ptrace can see it */
9432- tsk->thread.debugreg[6] = condition;
9433+ tsk->thread.debugreg6 = condition;
9434
9435 /*
9436 * Single-stepping through TF: make sure we ignore any events in
9437@@ -856,7 +897,7 @@ debug_vm86:
9438
9439 clear_TF_reenable:
9440 set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
9441- regs->eflags &= ~TF_MASK;
9442+ regs->flags &= ~TF_MASK;
9443 return;
9444 }
9445
9446@@ -865,7 +906,7 @@ clear_TF_reenable:
9447 * the correct behaviour even in the presence of the asynchronous
9448 * IRQ13 behaviour
9449 */
9450-void math_error(void __user *eip)
9451+void math_error(void __user *ip)
9452 {
9453 struct task_struct * task;
9454 siginfo_t info;
9455@@ -881,7 +922,7 @@ void math_error(void __user *eip)
9456 info.si_signo = SIGFPE;
9457 info.si_errno = 0;
9458 info.si_code = __SI_FAULT;
9459- info.si_addr = eip;
9460+ info.si_addr = ip;
9461 /*
9462 * (~cwd & swd) will mask out exceptions that are not set to unmasked
9463 * status. 0x3f is the exception bits in these regs, 0x200 is the
9464@@ -924,13 +965,13 @@ void math_error(void __user *eip)
9465 force_sig_info(SIGFPE, &info, task);
9466 }
9467
9468-fastcall void do_coprocessor_error(struct pt_regs * regs, long error_code)
9469+void do_coprocessor_error(struct pt_regs * regs, long error_code)
9470 {
9471 ignore_fpu_irq = 1;
9472- math_error((void __user *)regs->eip);
9473+ math_error((void __user *)regs->ip);
9474 }
9475
9476-static void simd_math_error(void __user *eip)
9477+static void simd_math_error(void __user *ip)
9478 {
9479 struct task_struct * task;
9480 siginfo_t info;
9481@@ -946,7 +987,7 @@ static void simd_math_error(void __user
9482 info.si_signo = SIGFPE;
9483 info.si_errno = 0;
9484 info.si_code = __SI_FAULT;
9485- info.si_addr = eip;
9486+ info.si_addr = ip;
9487 /*
9488 * The SIMD FPU exceptions are handled a little differently, as there
9489 * is only a single status/control register. Thus, to determine which
9490@@ -978,19 +1019,19 @@ static void simd_math_error(void __user
9491 force_sig_info(SIGFPE, &info, task);
9492 }
9493
9494-fastcall void do_simd_coprocessor_error(struct pt_regs * regs,
9495+void do_simd_coprocessor_error(struct pt_regs * regs,
9496 long error_code)
9497 {
9498 if (cpu_has_xmm) {
9499 /* Handle SIMD FPU exceptions on PIII+ processors. */
9500 ignore_fpu_irq = 1;
9501- simd_math_error((void __user *)regs->eip);
9502+ simd_math_error((void __user *)regs->ip);
9503 } else {
9504 /*
9505 * Handle strange cache flush from user space exception
9506 * in all other cases. This is undocumented behaviour.
9507 */
9508- if (regs->eflags & VM_MASK) {
9509+ if (regs->flags & VM_MASK) {
9510 handle_vm86_fault((struct kernel_vm86_regs *)regs,
9511 error_code);
9512 return;
9513@@ -1003,7 +1044,7 @@ fastcall void do_simd_coprocessor_error(
9514 }
9515
9516 #ifndef CONFIG_XEN
9517-fastcall void do_spurious_interrupt_bug(struct pt_regs * regs,
9518+void do_spurious_interrupt_bug(struct pt_regs * regs,
9519 long error_code)
9520 {
9521 #if 0
9522@@ -1012,7 +1053,7 @@ fastcall void do_spurious_interrupt_bug(
9523 #endif
9524 }
9525
9526-fastcall unsigned long patch_espfix_desc(unsigned long uesp,
9527+unsigned long patch_espfix_desc(unsigned long uesp,
9528 unsigned long kesp)
9529 {
9530 struct desc_struct *gdt = __get_cpu_var(gdt_page).gdt;
9531@@ -1072,7 +1113,7 @@ asmlinkage void math_emulate(long arg)
9532 * NB. All these are "trap gates" (i.e. events_mask isn't set) except
9533 * for those that specify <dpl>|4 in the second field.
9534 */
9535-static trap_info_t __cpuinitdata trap_table[] = {
9536+static const trap_info_t __cpuinitconst trap_table[] = {
9537 { 0, 0, __KERNEL_CS, (unsigned long)divide_error },
9538 { 1, 0|4, __KERNEL_CS, (unsigned long)debug },
9539 { 3, 3|4, __KERNEL_CS, (unsigned long)int3 },
9540@@ -1105,17 +1146,12 @@ void __init trap_init(void)
9541 if (ret)
9542 printk("HYPERVISOR_set_trap_table failed: error %d\n", ret);
9543
9544+ /*
9545+ * Verify that the FXSAVE/FXRSTOR data will be 16-byte aligned.
9546+ * Generate a build-time error if the alignment is wrong.
9547+ */
9548+ BUILD_BUG_ON(offsetof(struct task_struct, thread.i387.fxsave) & 15);
9549 if (cpu_has_fxsr) {
9550- /*
9551- * Verify that the FXSAVE/FXRSTOR data will be 16-byte aligned.
9552- * Generates a compile-time "error: zero width for bit-field" if
9553- * the alignment is wrong.
9554- */
9555- struct fxsrAlignAssert {
9556- int _:!(offsetof(struct task_struct,
9557- thread.i387.fxsave) & 15);
9558- };
9559-
9560 printk(KERN_INFO "Enabling fast FPU save and restore... ");
9561 set_in_cr4(X86_CR4_OSFXSR);
9562 printk("done.\n");
9563--- sle11-2009-06-29.orig/arch/x86/kernel/traps_64-xen.c 2009-02-16 16:18:36.000000000 +0100
9564+++ sle11-2009-06-29/arch/x86/kernel/traps_64-xen.c 2009-03-16 16:33:40.000000000 +0100
9565@@ -74,38 +74,41 @@ asmlinkage void alignment_check(void);
9566 asmlinkage void machine_check(void);
9567 asmlinkage void spurious_interrupt_bug(void);
9568
9569+static unsigned int code_bytes = 64;
9570+
9571 static inline void conditional_sti(struct pt_regs *regs)
9572 {
9573- if (regs->eflags & X86_EFLAGS_IF)
9574+ if (regs->flags & X86_EFLAGS_IF)
9575 local_irq_enable();
9576 }
9577
9578 static inline void preempt_conditional_sti(struct pt_regs *regs)
9579 {
9580- preempt_disable();
9581- if (regs->eflags & X86_EFLAGS_IF)
9582+ inc_preempt_count();
9583+ if (regs->flags & X86_EFLAGS_IF)
9584 local_irq_enable();
9585 }
9586
9587 static inline void preempt_conditional_cli(struct pt_regs *regs)
9588 {
9589- if (regs->eflags & X86_EFLAGS_IF)
9590+ if (regs->flags & X86_EFLAGS_IF)
9591 local_irq_disable();
9592 /* Make sure to not schedule here because we could be running
9593 on an exception stack. */
9594- preempt_enable_no_resched();
9595+ dec_preempt_count();
9596 }
9597
9598 int kstack_depth_to_print = 12;
9599
9600-#ifdef CONFIG_KALLSYMS
9601-void printk_address(unsigned long address)
9602+void printk_address(unsigned long address, int reliable)
9603 {
9604+#ifdef CONFIG_KALLSYMS
9605 unsigned long offset = 0, symsize;
9606 const char *symname;
9607 char *modname;
9608 char *delim = ":";
9609- char namebuf[128];
9610+ char namebuf[KSYM_NAME_LEN];
9611+ char reliab[4] = "";
9612
9613 symname = kallsyms_lookup(address, &symsize, &offset,
9614 &modname, namebuf);
9615@@ -113,17 +116,17 @@ void printk_address(unsigned long addres
9616 printk(" [<%016lx>]\n", address);
9617 return;
9618 }
9619+ if (!reliable)
9620+ strcpy(reliab, "? ");
9621+
9622 if (!modname)
9623- modname = delim = "";
9624- printk(" [<%016lx>] %s%s%s%s+0x%lx/0x%lx\n",
9625- address, delim, modname, delim, symname, offset, symsize);
9626-}
9627+ modname = delim = "";
9628+ printk(" [<%016lx>] %s%s%s%s%s+0x%lx/0x%lx\n",
9629+ address, reliab, delim, modname, delim, symname, offset, symsize);
9630 #else
9631-void printk_address(unsigned long address)
9632-{
9633 printk(" [<%016lx>]\n", address);
9634-}
9635 #endif
9636+}
9637
9638 static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
9639 unsigned *usedp, char **idp)
9640@@ -210,14 +213,53 @@ static unsigned long *in_exception_stack
9641 * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack
9642 */
9643
9644-static inline int valid_stack_ptr(struct thread_info *tinfo, void *p)
9645+static inline int valid_stack_ptr(struct thread_info *tinfo,
9646+ void *p, unsigned int size, void *end)
9647 {
9648- void *t = (void *)tinfo;
9649- return p > t && p < t + THREAD_SIZE - 3;
9650+ void *t = tinfo;
9651+ if (end) {
9652+ if (p < end && p >= (end-THREAD_SIZE))
9653+ return 1;
9654+ else
9655+ return 0;
9656+ }
9657+ return p > t && p < t + THREAD_SIZE - size;
9658+}
9659+
9660+/* The form of the top of the frame on the stack */
9661+struct stack_frame {
9662+ struct stack_frame *next_frame;
9663+ unsigned long return_address;
9664+};
9665+
9666+
9667+static inline unsigned long print_context_stack(struct thread_info *tinfo,
9668+ unsigned long *stack, unsigned long bp,
9669+ const struct stacktrace_ops *ops, void *data,
9670+ unsigned long *end)
9671+{
9672+ struct stack_frame *frame = (struct stack_frame *)bp;
9673+
9674+ while (valid_stack_ptr(tinfo, stack, sizeof(*stack), end)) {
9675+ unsigned long addr;
9676+
9677+ addr = *stack;
9678+ if (__kernel_text_address(addr)) {
9679+ if ((unsigned long) stack == bp + 8) {
9680+ ops->address(data, addr, 1);
9681+ frame = frame->next_frame;
9682+ bp = (unsigned long) frame;
9683+ } else {
9684+ ops->address(data, addr, bp == 0);
9685+ }
9686+ }
9687+ stack++;
9688+ }
9689+ return bp;
9690 }
9691
9692 void dump_trace(struct task_struct *tsk, struct pt_regs *regs,
9693- unsigned long *stack,
9694+ unsigned long *stack, unsigned long bp,
9695 const struct stacktrace_ops *ops, void *data)
9696 {
9697 const unsigned cpu = get_cpu();
9698@@ -227,36 +269,28 @@ void dump_trace(struct task_struct *tsk,
9699
9700 if (!tsk)
9701 tsk = current;
9702+ tinfo = task_thread_info(tsk);
9703
9704 if (!stack) {
9705 unsigned long dummy;
9706 stack = &dummy;
9707 if (tsk && tsk != current)
9708- stack = (unsigned long *)tsk->thread.rsp;
9709+ stack = (unsigned long *)tsk->thread.sp;
9710 }
9711
9712- /*
9713- * Print function call entries within a stack. 'cond' is the
9714- * "end of stackframe" condition, that the 'stack++'
9715- * iteration will eventually trigger.
9716- */
9717-#define HANDLE_STACK(cond) \
9718- do while (cond) { \
9719- unsigned long addr = *stack++; \
9720- /* Use unlocked access here because except for NMIs \
9721- we should be already protected against module unloads */ \
9722- if (__kernel_text_address(addr)) { \
9723- /* \
9724- * If the address is either in the text segment of the \
9725- * kernel, or in the region which contains vmalloc'ed \
9726- * memory, it *may* be the address of a calling \
9727- * routine; if so, print it so that someone tracing \
9728- * down the cause of the crash will be able to figure \
9729- * out the call path that was taken. \
9730- */ \
9731- ops->address(data, addr); \
9732- } \
9733- } while (0)
9734+#ifdef CONFIG_FRAME_POINTER
9735+ if (!bp) {
9736+ if (tsk == current) {
9737+ /* Grab bp right from our regs */
9738+ asm("movq %%rbp, %0" : "=r" (bp):);
9739+ } else {
9740+ /* bp is the last reg pushed by switch_to */
9741+ bp = *(unsigned long *) tsk->thread.sp;
9742+ }
9743+ }
9744+#endif
9745+
9746+
9747
9748 /*
9749 * Print function call entries in all stacks, starting at the
9750@@ -272,7 +306,9 @@ void dump_trace(struct task_struct *tsk,
9751 if (estack_end) {
9752 if (ops->stack(data, id) < 0)
9753 break;
9754- HANDLE_STACK (stack < estack_end);
9755+
9756+ bp = print_context_stack(tinfo, stack, bp, ops,
9757+ data, estack_end);
9758 ops->stack(data, "<EOE>");
9759 /*
9760 * We link to the next stack via the
9761@@ -290,7 +326,8 @@ void dump_trace(struct task_struct *tsk,
9762 if (stack >= irqstack && stack < irqstack_end) {
9763 if (ops->stack(data, "IRQ") < 0)
9764 break;
9765- HANDLE_STACK (stack < irqstack_end);
9766+ bp = print_context_stack(tinfo, stack, bp,
9767+ ops, data, irqstack_end);
9768 /*
9769 * We link to the next stack (which would be
9770 * the process stack normally) the last
9771@@ -308,9 +345,7 @@ void dump_trace(struct task_struct *tsk,
9772 /*
9773 * This handles the process stack:
9774 */
9775- tinfo = task_thread_info(tsk);
9776- HANDLE_STACK (valid_stack_ptr(tinfo, stack));
9777-#undef HANDLE_STACK
9778+ bp = print_context_stack(tinfo, stack, bp, ops, data, NULL);
9779 put_cpu();
9780 }
9781 EXPORT_SYMBOL(dump_trace);
9782@@ -333,10 +368,10 @@ static int print_trace_stack(void *data,
9783 return 0;
9784 }
9785
9786-static void print_trace_address(void *data, unsigned long addr)
9787+static void print_trace_address(void *data, unsigned long addr, int reliable)
9788 {
9789 touch_nmi_watchdog();
9790- printk_address(addr);
9791+ printk_address(addr, reliable);
9792 }
9793
9794 static const struct stacktrace_ops print_trace_ops = {
9795@@ -347,15 +382,17 @@ static const struct stacktrace_ops print
9796 };
9797
9798 void
9799-show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long *stack)
9800+show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long *stack,
9801+ unsigned long bp)
9802 {
9803 printk("\nCall Trace:\n");
9804- dump_trace(tsk, regs, stack, &print_trace_ops, NULL);
9805+ dump_trace(tsk, regs, stack, bp, &print_trace_ops, NULL);
9806 printk("\n");
9807 }
9808
9809 static void
9810-_show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long *rsp)
9811+_show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long *sp,
9812+ unsigned long bp)
9813 {
9814 unsigned long *stack;
9815 int i;
9816@@ -366,14 +403,14 @@ _show_stack(struct task_struct *tsk, str
9817 // debugging aid: "show_stack(NULL, NULL);" prints the
9818 // back trace for this cpu.
9819
9820- if (rsp == NULL) {
9821+ if (sp == NULL) {
9822 if (tsk)
9823- rsp = (unsigned long *)tsk->thread.rsp;
9824+ sp = (unsigned long *)tsk->thread.sp;
9825 else
9826- rsp = (unsigned long *)&rsp;
9827+ sp = (unsigned long *)&sp;
9828 }
9829
9830- stack = rsp;
9831+ stack = sp;
9832 for(i=0; i < kstack_depth_to_print; i++) {
9833 if (stack >= irqstack && stack <= irqstack_end) {
9834 if (stack == irqstack_end) {
9835@@ -389,12 +426,12 @@ _show_stack(struct task_struct *tsk, str
9836 printk(" %016lx", *stack++);
9837 touch_nmi_watchdog();
9838 }
9839- show_trace(tsk, regs, rsp);
9840+ show_trace(tsk, regs, sp, bp);
9841 }
9842
9843-void show_stack(struct task_struct *tsk, unsigned long * rsp)
9844+void show_stack(struct task_struct *tsk, unsigned long * sp)
9845 {
9846- _show_stack(tsk, NULL, rsp);
9847+ _show_stack(tsk, NULL, sp, 0);
9848 }
9849
9850 /*
9851@@ -403,13 +440,19 @@ void show_stack(struct task_struct *tsk,
9852 void dump_stack(void)
9853 {
9854 unsigned long dummy;
9855+ unsigned long bp = 0;
9856+
9857+#ifdef CONFIG_FRAME_POINTER
9858+ if (!bp)
9859+ asm("movq %%rbp, %0" : "=r" (bp):);
9860+#endif
9861
9862 printk("Pid: %d, comm: %.20s %s %s %.*s\n",
9863 current->pid, current->comm, print_tainted(),
9864 init_utsname()->release,
9865 (int)strcspn(init_utsname()->version, " "),
9866 init_utsname()->version);
9867- show_trace(NULL, NULL, &dummy);
9868+ show_trace(NULL, NULL, &dummy, bp);
9869 }
9870
9871 EXPORT_SYMBOL(dump_stack);
9872@@ -417,12 +460,15 @@ EXPORT_SYMBOL(dump_stack);
9873 void show_registers(struct pt_regs *regs)
9874 {
9875 int i;
9876- int in_kernel = !user_mode(regs);
9877- unsigned long rsp;
9878+ unsigned long sp;
9879 const int cpu = smp_processor_id();
9880 struct task_struct *cur = cpu_pda(cpu)->pcurrent;
9881+ u8 *ip;
9882+ unsigned int code_prologue = code_bytes * 43 / 64;
9883+ unsigned int code_len = code_bytes;
9884
9885- rsp = regs->rsp;
9886+ sp = regs->sp;
9887+ ip = (u8 *) regs->ip - code_prologue;
9888 printk("CPU %d ", cpu);
9889 __show_regs(regs);
9890 printk("Process %s (pid: %d, threadinfo %p, task %p)\n",
9891@@ -432,45 +478,43 @@ void show_registers(struct pt_regs *regs
9892 * When in-kernel, we also print out the stack and code at the
9893 * time of the fault..
9894 */
9895- if (in_kernel) {
9896+ if (!user_mode(regs)) {
9897+ unsigned char c;
9898 printk("Stack: ");
9899- _show_stack(NULL, regs, (unsigned long*)rsp);
9900+ _show_stack(NULL, regs, (unsigned long *)sp, regs->bp);
9901+ printk("\n");
9902
9903- printk("\nCode: ");
9904- if (regs->rip < PAGE_OFFSET)
9905- goto bad;
9906-
9907- for (i=0; i<20; i++) {
9908- unsigned char c;
9909- if (__get_user(c, &((unsigned char*)regs->rip)[i])) {
9910-bad:
9911+ printk(KERN_EMERG "Code: ");
9912+ if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) {
9913+ /* try starting at RIP */
9914+ ip = (u8 *) regs->ip;
9915+ code_len = code_len - code_prologue + 1;
9916+ }
9917+ for (i = 0; i < code_len; i++, ip++) {
9918+ if (ip < (u8 *)PAGE_OFFSET ||
9919+ probe_kernel_address(ip, c)) {
9920 printk(" Bad RIP value.");
9921 break;
9922 }
9923- printk("%02x ", c);
9924+ if (ip == (u8 *)regs->ip)
9925+ printk("<%02x> ", c);
9926+ else
9927+ printk("%02x ", c);
9928 }
9929 }
9930 printk("\n");
9931 }
9932
9933-int is_valid_bugaddr(unsigned long rip)
9934+int is_valid_bugaddr(unsigned long ip)
9935 {
9936 unsigned short ud2;
9937
9938- if (__copy_from_user(&ud2, (const void __user *) rip, sizeof(ud2)))
9939+ if (__copy_from_user(&ud2, (const void __user *) ip, sizeof(ud2)))
9940 return 0;
9941
9942 return ud2 == 0x0b0f;
9943 }
9944
9945-#ifdef CONFIG_BUG
9946-void out_of_line_bug(void)
9947-{
9948- BUG();
9949-}
9950-EXPORT_SYMBOL(out_of_line_bug);
9951-#endif
9952-
9953 static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED;
9954 static int die_owner = -1;
9955 static unsigned int die_nest_count;
9956@@ -498,7 +542,7 @@ unsigned __kprobes long oops_begin(void)
9957 return flags;
9958 }
9959
9960-void __kprobes oops_end(unsigned long flags)
9961+void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
9962 {
9963 die_owner = -1;
9964 bust_spinlocks(0);
9965@@ -507,12 +551,17 @@ void __kprobes oops_end(unsigned long fl
9966 /* Nest count reaches zero, release the lock. */
9967 __raw_spin_unlock(&die_lock);
9968 raw_local_irq_restore(flags);
9969+ if (!regs) {
9970+ oops_exit();
9971+ return;
9972+ }
9973 if (panic_on_oops)
9974 panic("Fatal exception");
9975 oops_exit();
9976+ do_exit(signr);
9977 }
9978
9979-void __kprobes __die(const char * str, struct pt_regs * regs, long err)
9980+int __kprobes __die(const char * str, struct pt_regs * regs, long err)
9981 {
9982 static int die_counter;
9983 printk(KERN_EMERG "%s: %04lx [%u] ", str, err & 0xffff,++die_counter);
9984@@ -526,15 +575,17 @@ void __kprobes __die(const char * str, s
9985 printk("DEBUG_PAGEALLOC");
9986 #endif
9987 printk("\n");
9988- notify_die(DIE_OOPS, str, regs, err, current->thread.trap_no, SIGSEGV);
9989+ if (notify_die(DIE_OOPS, str, regs, err, current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
9990+ return 1;
9991 show_registers(regs);
9992 add_taint(TAINT_DIE);
9993 /* Executive summary in case the oops scrolled away */
9994 printk(KERN_ALERT "RIP ");
9995- printk_address(regs->rip);
9996- printk(" RSP <%016lx>\n", regs->rsp);
9997+ printk_address(regs->ip, 1);
9998+ printk(" RSP <%016lx>\n", regs->sp);
9999 if (kexec_should_crash(current))
10000 crash_kexec(regs);
10001+ return 0;
10002 }
10003
10004 void die(const char * str, struct pt_regs * regs, long err)
10005@@ -542,11 +593,11 @@ void die(const char * str, struct pt_reg
10006 unsigned long flags = oops_begin();
10007
10008 if (!user_mode(regs))
10009- report_bug(regs->rip, regs);
10010+ report_bug(regs->ip, regs);
10011
10012- __die(str, regs, err);
10013- oops_end(flags);
10014- do_exit(SIGSEGV);
10015+ if (__die(str, regs, err))
10016+ regs = NULL;
10017+ oops_end(flags, regs, SIGSEGV);
10018 }
10019
10020 #if defined(CONFIG_X86_LOCAL_APIC) || defined(CONFIG_SYSCTL)
10021@@ -564,10 +615,10 @@ void __kprobes die_nmi(char *str, struct
10022 crash_kexec(regs);
10023 if (do_panic || panic_on_oops)
10024 panic("Non maskable interrupt");
10025- oops_end(flags);
10026+ oops_end(flags, NULL, SIGBUS);
10027 nmi_exit();
10028 local_irq_enable();
10029- do_exit(SIGSEGV);
10030+ do_exit(SIGBUS);
10031 }
10032 #endif
10033
10034@@ -592,11 +643,14 @@ static void __kprobes do_trap(int trapnr
10035 tsk->thread.trap_no = trapnr;
10036
10037 if (show_unhandled_signals && unhandled_signal(tsk, signr) &&
10038- printk_ratelimit())
10039+ printk_ratelimit()) {
10040 printk(KERN_INFO
10041- "%s[%d] trap %s rip:%lx rsp:%lx error:%lx\n",
10042+ "%s[%d] trap %s ip:%lx sp:%lx error:%lx",
10043 tsk->comm, tsk->pid, str,
10044- regs->rip, regs->rsp, error_code);
10045+ regs->ip, regs->sp, error_code);
10046+ print_vma_addr(" in ", regs->ip);
10047+ printk("\n");
10048+ }
10049
10050 if (info)
10051 force_sig_info(signr, info, tsk);
10052@@ -606,19 +660,12 @@ static void __kprobes do_trap(int trapnr
10053 }
10054
10055
10056- /* kernel trap */
10057- {
10058- const struct exception_table_entry *fixup;
10059- fixup = search_exception_tables(regs->rip);
10060- if (fixup)
10061- regs->rip = fixup->fixup;
10062- else {
10063- tsk->thread.error_code = error_code;
10064- tsk->thread.trap_no = trapnr;
10065- die(str, regs, error_code);
10066- }
10067- return;
10068+ if (!fixup_exception(regs)) {
10069+ tsk->thread.error_code = error_code;
10070+ tsk->thread.trap_no = trapnr;
10071+ die(str, regs, error_code);
10072 }
10073+ return;
10074 }
10075
10076 #define DO_ERROR(trapnr, signr, str, name) \
10077@@ -647,10 +694,10 @@ asmlinkage void do_##name(struct pt_regs
10078 do_trap(trapnr, signr, str, regs, error_code, &info); \
10079 }
10080
10081-DO_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->rip)
10082+DO_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip)
10083 DO_ERROR( 4, SIGSEGV, "overflow", overflow)
10084 DO_ERROR( 5, SIGSEGV, "bounds", bounds)
10085-DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->rip)
10086+DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip)
10087 DO_ERROR( 7, SIGSEGV, "device not available", device_not_available)
10088 DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
10089 DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
10090@@ -698,32 +745,28 @@ asmlinkage void __kprobes do_general_pro
10091 tsk->thread.trap_no = 13;
10092
10093 if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
10094- printk_ratelimit())
10095+ printk_ratelimit()) {
10096 printk(KERN_INFO
10097- "%s[%d] general protection rip:%lx rsp:%lx error:%lx\n",
10098+ "%s[%d] general protection ip:%lx sp:%lx error:%lx",
10099 tsk->comm, tsk->pid,
10100- regs->rip, regs->rsp, error_code);
10101+ regs->ip, regs->sp, error_code);
10102+ print_vma_addr(" in ", regs->ip);
10103+ printk("\n");
10104+ }
10105
10106 force_sig(SIGSEGV, tsk);
10107 return;
10108 }
10109
10110- /* kernel gp */
10111- {
10112- const struct exception_table_entry *fixup;
10113- fixup = search_exception_tables(regs->rip);
10114- if (fixup) {
10115- regs->rip = fixup->fixup;
10116- return;
10117- }
10118+ if (fixup_exception(regs))
10119+ return;
10120
10121- tsk->thread.error_code = error_code;
10122- tsk->thread.trap_no = 13;
10123- if (notify_die(DIE_GPF, "general protection fault", regs,
10124- error_code, 13, SIGSEGV) == NOTIFY_STOP)
10125- return;
10126- die("general protection fault", regs, error_code);
10127- }
10128+ tsk->thread.error_code = error_code;
10129+ tsk->thread.trap_no = 13;
10130+ if (notify_die(DIE_GPF, "general protection fault", regs,
10131+ error_code, 13, SIGSEGV) == NOTIFY_STOP)
10132+ return;
10133+ die("general protection fault", regs, error_code);
10134 }
10135
10136 static __kprobes void
10137@@ -833,15 +876,15 @@ asmlinkage __kprobes struct pt_regs *syn
10138 {
10139 struct pt_regs *regs = eregs;
10140 /* Did already sync */
10141- if (eregs == (struct pt_regs *)eregs->rsp)
10142+ if (eregs == (struct pt_regs *)eregs->sp)
10143 ;
10144 /* Exception from user space */
10145 else if (user_mode(eregs))
10146 regs = task_pt_regs(current);
10147 /* Exception from kernel and interrupts are enabled. Move to
10148 kernel process stack. */
10149- else if (eregs->eflags & X86_EFLAGS_IF)
10150- regs = (struct pt_regs *)(eregs->rsp -= sizeof(struct pt_regs));
10151+ else if (eregs->flags & X86_EFLAGS_IF)
10152+ regs = (struct pt_regs *)(eregs->sp -= sizeof(struct pt_regs));
10153 if (eregs != regs)
10154 *regs = *eregs;
10155 return regs;
10156@@ -859,6 +902,12 @@ asmlinkage void __kprobes do_debug(struc
10157
10158 get_debugreg(condition, 6);
10159
10160+ /*
10161+ * The processor cleared BTF, so don't mark that we need it set.
10162+ */
10163+ clear_tsk_thread_flag(tsk, TIF_DEBUGCTLMSR);
10164+ tsk->thread.debugctlmsr = 0;
10165+
10166 if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
10167 SIGTRAP) == NOTIFY_STOP)
10168 return;
10169@@ -874,27 +923,14 @@ asmlinkage void __kprobes do_debug(struc
10170
10171 tsk->thread.debugreg6 = condition;
10172
10173- /* Mask out spurious TF errors due to lazy TF clearing */
10174+
10175+ /*
10176+ * Single-stepping through TF: make sure we ignore any events in
10177+ * kernel space (but re-enable TF when returning to user mode).
10178+ */
10179 if (condition & DR_STEP) {
10180- /*
10181- * The TF error should be masked out only if the current
10182- * process is not traced and if the TRAP flag has been set
10183- * previously by a tracing process (condition detected by
10184- * the PT_DTRACE flag); remember that the i386 TRAP flag
10185- * can be modified by the process itself in user mode,
10186- * allowing programs to debug themselves without the ptrace()
10187- * interface.
10188- */
10189 if (!user_mode(regs))
10190 goto clear_TF_reenable;
10191- /*
10192- * Was the TF flag set by a debugger? If so, clear it now,
10193- * so that register information is correct.
10194- */
10195- if (tsk->ptrace & PT_DTRACE) {
10196- regs->eflags &= ~TF_MASK;
10197- tsk->ptrace &= ~PT_DTRACE;
10198- }
10199 }
10200
10201 /* Ok, finally something we can handle */
10202@@ -903,7 +939,7 @@ asmlinkage void __kprobes do_debug(struc
10203 info.si_signo = SIGTRAP;
10204 info.si_errno = 0;
10205 info.si_code = TRAP_BRKPT;
10206- info.si_addr = user_mode(regs) ? (void __user *)regs->rip : NULL;
10207+ info.si_addr = user_mode(regs) ? (void __user *)regs->ip : NULL;
10208 force_sig_info(SIGTRAP, &info, tsk);
10209
10210 clear_dr7:
10211@@ -913,18 +949,15 @@ clear_dr7:
10212
10213 clear_TF_reenable:
10214 set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
10215- regs->eflags &= ~TF_MASK;
10216+ regs->flags &= ~X86_EFLAGS_TF;
10217 preempt_conditional_cli(regs);
10218 }
10219
10220 static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr)
10221 {
10222- const struct exception_table_entry *fixup;
10223- fixup = search_exception_tables(regs->rip);
10224- if (fixup) {
10225- regs->rip = fixup->fixup;
10226+ if (fixup_exception(regs))
10227 return 1;
10228- }
10229+
10230 notify_die(DIE_GPF, str, regs, 0, trapnr, SIGFPE);
10231 /* Illegal floating point operation in the kernel */
10232 current->thread.trap_no = trapnr;
10233@@ -939,7 +972,7 @@ static int kernel_math_error(struct pt_r
10234 */
10235 asmlinkage void do_coprocessor_error(struct pt_regs *regs)
10236 {
10237- void __user *rip = (void __user *)(regs->rip);
10238+ void __user *ip = (void __user *)(regs->ip);
10239 struct task_struct * task;
10240 siginfo_t info;
10241 unsigned short cwd, swd;
10242@@ -959,7 +992,7 @@ asmlinkage void do_coprocessor_error(str
10243 info.si_signo = SIGFPE;
10244 info.si_errno = 0;
10245 info.si_code = __SI_FAULT;
10246- info.si_addr = rip;
10247+ info.si_addr = ip;
10248 /*
10249 * (~cwd & swd) will mask out exceptions that are not set to unmasked
10250 * status. 0x3f is the exception bits in these regs, 0x200 is the
10251@@ -1008,7 +1041,7 @@ asmlinkage void bad_intr(void)
10252
10253 asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs)
10254 {
10255- void __user *rip = (void __user *)(regs->rip);
10256+ void __user *ip = (void __user *)(regs->ip);
10257 struct task_struct * task;
10258 siginfo_t info;
10259 unsigned short mxcsr;
10260@@ -1028,7 +1061,7 @@ asmlinkage void do_simd_coprocessor_erro
10261 info.si_signo = SIGFPE;
10262 info.si_errno = 0;
10263 info.si_code = __SI_FAULT;
10264- info.si_addr = rip;
10265+ info.si_addr = ip;
10266 /*
10267 * The SIMD FPU exceptions are handled a little differently, as there
10268 * is only a single status/control register. Thus, to determine which
10269@@ -1092,13 +1125,14 @@ asmlinkage void math_state_restore(void)
10270 task_thread_info(me)->status |= TS_USEDFPU;
10271 me->fpu_counter++;
10272 }
10273+EXPORT_SYMBOL_GPL(math_state_restore);
10274
10275
10276 /*
10277 * NB. All these are "interrupt gates" (i.e. events_mask is set) because we
10278 * specify <dpl>|4 in the second field.
10279 */
10280-static trap_info_t __cpuinitdata trap_table[] = {
10281+static const trap_info_t __cpuinitconst trap_table[] = {
10282 { 0, 0|4, __KERNEL_CS, (unsigned long)divide_error },
10283 { 1, 0|4, __KERNEL_CS, (unsigned long)debug },
10284 { 3, 3|4, __KERNEL_CS, (unsigned long)int3 },
10285@@ -1169,3 +1203,14 @@ static int __init kstack_setup(char *s)
10286 return 0;
10287 }
10288 early_param("kstack", kstack_setup);
10289+
10290+
10291+static int __init code_bytes_setup(char *s)
10292+{
10293+ code_bytes = simple_strtoul(s, NULL, 0);
10294+ if (code_bytes > 8192)
10295+ code_bytes = 8192;
10296+
10297+ return 1;
10298+}
10299+__setup("code_bytes=", code_bytes_setup);
10300--- sle11-2009-06-29.orig/arch/x86/kernel/vsyscall_64-xen.c 2009-02-16 16:18:36.000000000 +0100
10301+++ sle11-2009-06-29/arch/x86/kernel/vsyscall_64-xen.c 2009-03-16 16:33:40.000000000 +0100
10302@@ -43,12 +43,7 @@
10303 #include <asm/vgtod.h>
10304
10305 #define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr)))
10306-#define __syscall_clobber "r11","rcx","memory"
10307-#define __pa_vsymbol(x) \
10308- ({unsigned long v; \
10309- extern char __vsyscall_0; \
10310- asm("" : "=r" (v) : "0" (x)); \
10311- ((v - VSYSCALL_START) + __pa_symbol(&__vsyscall_0)); })
10312+#define __syscall_clobber "r11","cx","memory"
10313
10314 /*
10315 * vsyscall_gtod_data contains data that is :
10316@@ -102,7 +97,7 @@ static __always_inline void do_get_tz(st
10317 static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz)
10318 {
10319 int ret;
10320- asm volatile("vsysc2: syscall"
10321+ asm volatile("syscall"
10322 : "=a" (ret)
10323 : "0" (__NR_gettimeofday),"D" (tv),"S" (tz)
10324 : __syscall_clobber );
10325@@ -112,7 +107,7 @@ static __always_inline int gettimeofday(
10326 static __always_inline long time_syscall(long *t)
10327 {
10328 long secs;
10329- asm volatile("vsysc1: syscall"
10330+ asm volatile("syscall"
10331 : "=a" (secs)
10332 : "0" (__NR_time),"D" (t) : __syscall_clobber);
10333 return secs;
10334@@ -190,7 +185,7 @@ time_t __vsyscall(1) vtime(time_t *t)
10335 long __vsyscall(2)
10336 vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache)
10337 {
10338- unsigned int dummy, p;
10339+ unsigned int p;
10340 unsigned long j = 0;
10341
10342 /* Fast cache - only recompute value once per jiffies and avoid
10343@@ -205,7 +200,7 @@ vgetcpu(unsigned *cpu, unsigned *node, s
10344 p = tcache->blob[1];
10345 } else if (__vgetcpu_mode == VGETCPU_RDTSCP) {
10346 /* Load per CPU data from RDTSCP */
10347- rdtscp(dummy, dummy, p);
10348+ native_read_tscp(&p);
10349 } else {
10350 /* Load per CPU data from GDT */
10351 asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG));
10352@@ -228,42 +223,11 @@ long __vsyscall(3) venosys_1(void)
10353
10354 #ifdef CONFIG_SYSCTL
10355
10356-#define SYSCALL 0x050f
10357-#define NOP2 0x9090
10358-
10359-/*
10360- * NOP out syscall in vsyscall page when not needed.
10361- */
10362-static int vsyscall_sysctl_change(ctl_table *ctl, int write, struct file * filp,
10363- void __user *buffer, size_t *lenp, loff_t *ppos)
10364+static int
10365+vsyscall_sysctl_change(ctl_table *ctl, int write, struct file * filp,
10366+ void __user *buffer, size_t *lenp, loff_t *ppos)
10367 {
10368- extern u16 vsysc1, vsysc2;
10369- u16 __iomem *map1;
10370- u16 __iomem *map2;
10371- int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
10372- if (!write)
10373- return ret;
10374- /* gcc has some trouble with __va(__pa()), so just do it this
10375- way. */
10376- map1 = ioremap(__pa_vsymbol(&vsysc1), 2);
10377- if (!map1)
10378- return -ENOMEM;
10379- map2 = ioremap(__pa_vsymbol(&vsysc2), 2);
10380- if (!map2) {
10381- ret = -ENOMEM;
10382- goto out;
10383- }
10384- if (!vsyscall_gtod_data.sysctl_enabled) {
10385- writew(SYSCALL, map1);
10386- writew(SYSCALL, map2);
10387- } else {
10388- writew(NOP2, map1);
10389- writew(NOP2, map2);
10390- }
10391- iounmap(map2);
10392-out:
10393- iounmap(map1);
10394- return ret;
10395+ return proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
10396 }
10397
10398 static ctl_table kernel_table2[] = {
10399@@ -279,7 +243,6 @@ static ctl_table kernel_root_table2[] =
10400 .child = kernel_table2 },
10401 {}
10402 };
10403-
10404 #endif
10405
10406 /* Assume __initcall executes before all user space. Hopefully kmod
10407@@ -301,7 +264,7 @@ static void __cpuinit vsyscall_set_cpu(i
10408 d |= cpu;
10409 d |= (node & 0xf) << 12;
10410 d |= (node >> 4) << 48;
10411- if (HYPERVISOR_update_descriptor(virt_to_machine(cpu_gdt(cpu)
10412+ if (HYPERVISOR_update_descriptor(virt_to_machine(get_cpu_gdt_table(cpu)
10413 + GDT_ENTRY_PER_CPU),
10414 d))
10415 BUG();
10416@@ -322,7 +285,7 @@ cpu_vsyscall_notifier(struct notifier_bl
10417 return NOTIFY_DONE;
10418 }
10419
10420-static void __init map_vsyscall(void)
10421+void __init map_vsyscall(void)
10422 {
10423 extern char __vsyscall_0;
10424 unsigned long physaddr_page0 = __pa_symbol(&__vsyscall_0);
10425@@ -338,7 +301,6 @@ static int __init vsyscall_init(void)
10426 BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime));
10427 BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE)));
10428 BUG_ON((unsigned long) &vgetcpu != VSYSCALL_ADDR(__NR_vgetcpu));
10429- map_vsyscall();
10430 #ifdef CONFIG_XEN
10431 vsyscall_gtod_data.sysctl_enabled = 0; /* disable vgettimeofay() */
10432 if (boot_cpu_has(X86_FEATURE_RDTSCP))
10433--- sle11-2009-06-29.orig/arch/x86/kernel/xen_entry_64.S 2009-06-29 15:14:52.000000000 +0200
10434+++ /dev/null 1970-01-01 00:00:00.000000000 +0000
10435@@ -1,36 +0,0 @@
10436-/*
10437- * Copied from arch/xen/i386/kernel/entry.S
10438- */
10439-/* Offsets into shared_info_t. */
10440-#define evtchn_upcall_pending /* 0 */
10441-#define evtchn_upcall_mask 1
10442-
10443-#define sizeof_vcpu_shift 6
10444-
10445-#ifdef CONFIG_SMP
10446-//#define preempt_disable(reg) incl threadinfo_preempt_count(reg)
10447-//#define preempt_enable(reg) decl threadinfo_preempt_count(reg)
10448-#define preempt_disable(reg)
10449-#define preempt_enable(reg)
10450-#define XEN_GET_VCPU_INFO(reg) preempt_disable(%rbp) ; \
10451- movq %gs:pda_cpunumber,reg ; \
10452- shl $32, reg ; \
10453- shr $32-sizeof_vcpu_shift,reg ; \
10454- addq HYPERVISOR_shared_info,reg
10455-#define XEN_PUT_VCPU_INFO(reg) preempt_enable(%rbp) ; \
10456-#define XEN_PUT_VCPU_INFO_fixup .byte 0xff,0xff,0xff
10457-#else
10458-#define XEN_GET_VCPU_INFO(reg) movq HYPERVISOR_shared_info,reg
10459-#define XEN_PUT_VCPU_INFO(reg)
10460-#define XEN_PUT_VCPU_INFO_fixup
10461-#endif
10462-
10463-#define XEN_LOCKED_BLOCK_EVENTS(reg) movb $1,evtchn_upcall_mask(reg)
10464-#define XEN_LOCKED_UNBLOCK_EVENTS(reg) movb $0,evtchn_upcall_mask(reg)
10465-#define XEN_BLOCK_EVENTS(reg) XEN_GET_VCPU_INFO(reg) ; \
10466- XEN_LOCKED_BLOCK_EVENTS(reg) ; \
10467- XEN_PUT_VCPU_INFO(reg)
10468-#define XEN_UNBLOCK_EVENTS(reg) XEN_GET_VCPU_INFO(reg) ; \
10469- XEN_LOCKED_UNBLOCK_EVENTS(reg) ; \
10470- XEN_PUT_VCPU_INFO(reg)
10471-#define XEN_TEST_PENDING(reg) testb $0xFF,evtchn_upcall_pending(reg)
10472--- sle11-2009-06-29.orig/arch/x86/mach-xen/setup.c 2009-02-16 16:17:21.000000000 +0100
10473+++ sle11-2009-06-29/arch/x86/mach-xen/setup.c 2009-03-16 16:33:40.000000000 +0100
10474@@ -161,15 +161,12 @@ void __init machine_specific_arch_setup(
10475
10476 /* Do an early initialization of the fixmap area */
10477 {
10478- extern pte_t swapper_pg_pmd[PTRS_PER_PTE];
10479+ extern pte_t swapper_pg_fixmap[PTRS_PER_PTE];
10480 unsigned long addr = __fix_to_virt(FIX_EARLYCON_MEM_BASE);
10481- pgd_t *pgd = (pgd_t *)xen_start_info->pt_base;
10482- pud_t *pud = pud_offset(pgd + pgd_index(addr), addr);
10483+ pud_t *pud = pud_offset(swapper_pg_dir + pgd_index(addr), addr);
10484 pmd_t *pmd = pmd_offset(pud, addr);
10485
10486- swapper_pg_dir = pgd;
10487- init_mm.pgd = pgd;
10488- make_lowmem_page_readonly(swapper_pg_pmd, XENFEAT_writable_page_tables);
10489- set_pmd(pmd, __pmd(__pa_symbol(swapper_pg_pmd) | _PAGE_TABLE));
10490+ make_lowmem_page_readonly(swapper_pg_fixmap, XENFEAT_writable_page_tables);
10491+ set_pmd(pmd, __pmd(__pa_symbol(swapper_pg_fixmap) | _PAGE_TABLE));
10492 }
10493 }
10494--- /dev/null 1970-01-01 00:00:00.000000000 +0000
10495+++ sle11-2009-06-29/arch/x86/mm/fault-xen.c 2009-03-16 16:33:40.000000000 +0100
10496@@ -0,0 +1,1025 @@
10497+/*
10498+ * Copyright (C) 1995 Linus Torvalds
10499+ * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs.
10500+ */
10501+
10502+#include <linux/signal.h>
10503+#include <linux/sched.h>
10504+#include <linux/kernel.h>
10505+#include <linux/errno.h>
10506+#include <linux/string.h>
10507+#include <linux/types.h>
10508+#include <linux/ptrace.h>
10509+#include <linux/mman.h>
10510+#include <linux/mm.h>
10511+#include <linux/smp.h>
10512+#include <linux/interrupt.h>
10513+#include <linux/init.h>
10514+#include <linux/tty.h>
10515+#include <linux/vt_kern.h> /* For unblank_screen() */
10516+#include <linux/compiler.h>
10517+#include <linux/highmem.h>
10518+#include <linux/bootmem.h> /* for max_low_pfn */
10519+#include <linux/vmalloc.h>
10520+#include <linux/module.h>
10521+#include <linux/kprobes.h>
10522+#include <linux/uaccess.h>
10523+#include <linux/kdebug.h>
10524+
10525+#include <asm/system.h>
10526+#include <asm/desc.h>
10527+#include <asm/segment.h>
10528+#include <asm/pgalloc.h>
10529+#include <asm/smp.h>
10530+#include <asm/tlbflush.h>
10531+#include <asm/proto.h>
10532+#include <asm-generic/sections.h>
10533+
10534+/*
10535+ * Page fault error code bits
10536+ * bit 0 == 0 means no page found, 1 means protection fault
10537+ * bit 1 == 0 means read, 1 means write
10538+ * bit 2 == 0 means kernel, 1 means user-mode
10539+ * bit 3 == 1 means use of reserved bit detected
10540+ * bit 4 == 1 means fault was an instruction fetch
10541+ */
10542+#define PF_PROT (1<<0)
10543+#define PF_WRITE (1<<1)
10544+#define PF_USER (1<<2)
10545+#define PF_RSVD (1<<3)
10546+#define PF_INSTR (1<<4)
10547+
10548+static inline int notify_page_fault(struct pt_regs *regs)
10549+{
10550+#ifdef CONFIG_KPROBES
10551+ int ret = 0;
10552+
10553+ /* kprobe_running() needs smp_processor_id() */
10554+#ifdef CONFIG_X86_32
10555+ if (!user_mode_vm(regs)) {
10556+#else
10557+ if (!user_mode(regs)) {
10558+#endif
10559+ preempt_disable();
10560+ if (kprobe_running() && kprobe_fault_handler(regs, 14))
10561+ ret = 1;
10562+ preempt_enable();
10563+ }
10564+
10565+ return ret;
10566+#else
10567+ return 0;
10568+#endif
10569+}
10570+
10571+/*
10572+ * X86_32
10573+ * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
10574+ * Check that here and ignore it.
10575+ *
10576+ * X86_64
10577+ * Sometimes the CPU reports invalid exceptions on prefetch.
10578+ * Check that here and ignore it.
10579+ *
10580+ * Opcode checker based on code by Richard Brunner
10581+ */
10582+static int is_prefetch(struct pt_regs *regs, unsigned long addr,
10583+ unsigned long error_code)
10584+{
10585+ unsigned char *instr;
10586+ int scan_more = 1;
10587+ int prefetch = 0;
10588+ unsigned char *max_instr;
10589+
10590+ /*
10591+ * If it was a exec (instruction fetch) fault on NX page, then
10592+ * do not ignore the fault:
10593+ */
10594+ if (error_code & PF_INSTR)
10595+ return 0;
10596+
10597+ instr = (unsigned char *)convert_ip_to_linear(current, regs);
10598+ max_instr = instr + 15;
10599+
10600+ if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
10601+ return 0;
10602+
10603+ while (scan_more && instr < max_instr) {
10604+ unsigned char opcode;
10605+ unsigned char instr_hi;
10606+ unsigned char instr_lo;
10607+
10608+ if (probe_kernel_address(instr, opcode))
10609+ break;
10610+
10611+ instr_hi = opcode & 0xf0;
10612+ instr_lo = opcode & 0x0f;
10613+ instr++;
10614+
10615+ switch (instr_hi) {
10616+ case 0x20:
10617+ case 0x30:
10618+ /*
10619+ * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes.
10620+ * In X86_64 long mode, the CPU will signal invalid
10621+ * opcode if some of these prefixes are present so
10622+ * X86_64 will never get here anyway
10623+ */
10624+ scan_more = ((instr_lo & 7) == 0x6);
10625+ break;
10626+#ifdef CONFIG_X86_64
10627+ case 0x40:
10628+ /*
10629+ * In AMD64 long mode 0x40..0x4F are valid REX prefixes
10630+ * Need to figure out under what instruction mode the
10631+ * instruction was issued. Could check the LDT for lm,
10632+ * but for now it's good enough to assume that long
10633+ * mode only uses well known segments or kernel.
10634+ */
10635+ scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS);
10636+ break;
10637+#endif
10638+ case 0x60:
10639+ /* 0x64 thru 0x67 are valid prefixes in all modes. */
10640+ scan_more = (instr_lo & 0xC) == 0x4;
10641+ break;
10642+ case 0xF0:
10643+ /* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */
10644+ scan_more = !instr_lo || (instr_lo>>1) == 1;
10645+ break;
10646+ case 0x00:
10647+ /* Prefetch instruction is 0x0F0D or 0x0F18 */
10648+ scan_more = 0;
10649+
10650+ if (probe_kernel_address(instr, opcode))
10651+ break;
10652+ prefetch = (instr_lo == 0xF) &&
10653+ (opcode == 0x0D || opcode == 0x18);
10654+ break;
10655+ default:
10656+ scan_more = 0;
10657+ break;
10658+ }
10659+ }
10660+ return prefetch;
10661+}
10662+
10663+static void force_sig_info_fault(int si_signo, int si_code,
10664+ unsigned long address, struct task_struct *tsk)
10665+{
10666+ siginfo_t info;
10667+
10668+ info.si_signo = si_signo;
10669+ info.si_errno = 0;
10670+ info.si_code = si_code;
10671+ info.si_addr = (void __user *)address;
10672+ force_sig_info(si_signo, &info, tsk);
10673+}
10674+
10675+#ifdef CONFIG_X86_64
10676+static int bad_address(void *p)
10677+{
10678+ unsigned long dummy;
10679+ return probe_kernel_address((unsigned long *)p, dummy);
10680+}
10681+#endif
10682+
10683+static void dump_pagetable(unsigned long address)
10684+{
10685+#ifdef CONFIG_X86_32
10686+ __typeof__(pte_val(__pte(0))) page;
10687+
10688+ page = read_cr3();
10689+ page = ((__typeof__(page) *) __va(page))[address >> PGDIR_SHIFT];
10690+#ifdef CONFIG_X86_PAE
10691+ printk("*pdpt = %016Lx ", page);
10692+ if ((page & _PAGE_PRESENT)
10693+ && mfn_to_local_pfn(page >> PAGE_SHIFT) < max_low_pfn) {
10694+ page = mfn_to_pfn(page >> PAGE_SHIFT);
10695+ page <<= PAGE_SHIFT;
10696+ page = ((__typeof__(page) *) __va(page))[(address >> PMD_SHIFT)
10697+ & (PTRS_PER_PMD - 1)];
10698+ printk(KERN_CONT "*pde = %016Lx ", page);
10699+ page &= ~_PAGE_NX;
10700+ }
10701+#else
10702+ printk("*pde = %08lx ", page);
10703+#endif
10704+
10705+ /*
10706+ * We must not directly access the pte in the highpte
10707+ * case if the page table is located in highmem.
10708+ * And let's rather not kmap-atomic the pte, just in case
10709+ * it's allocated already.
10710+ */
10711+ if ((page & _PAGE_PRESENT)
10712+ && mfn_to_local_pfn(page >> PAGE_SHIFT) < max_low_pfn
10713+ && !(page & _PAGE_PSE)) {
10714+ page = mfn_to_pfn(page >> PAGE_SHIFT);
10715+ page <<= PAGE_SHIFT;
10716+ page = ((__typeof__(page) *) __va(page))[(address >> PAGE_SHIFT)
10717+ & (PTRS_PER_PTE - 1)];
10718+ printk(KERN_CONT "*pte = %0*Lx ", sizeof(page)*2, (u64)page);
10719+ }
10720+
10721+ printk(KERN_CONT "\n");
10722+#else /* CONFIG_X86_64 */
10723+ pgd_t *pgd;
10724+ pud_t *pud;
10725+ pmd_t *pmd;
10726+ pte_t *pte;
10727+
10728+ pgd = (pgd_t *)read_cr3();
10729+
10730+ pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK);
10731+ pgd += pgd_index(address);
10732+ if (bad_address(pgd)) goto bad;
10733+ printk("PGD %lx ", pgd_val(*pgd));
10734+ if (!pgd_present(*pgd)) goto ret;
10735+
10736+ pud = pud_offset(pgd, address);
10737+ if (bad_address(pud)) goto bad;
10738+ printk(KERN_CONT "PUD %lx ", pud_val(*pud));
10739+ if (!pud_present(*pud) || pud_large(*pud))
10740+ goto ret;
10741+
10742+ pmd = pmd_offset(pud, address);
10743+ if (bad_address(pmd)) goto bad;
10744+ printk(KERN_CONT "PMD %lx ", pmd_val(*pmd));
10745+ if (!pmd_present(*pmd) || pmd_large(*pmd)) goto ret;
10746+
10747+ pte = pte_offset_kernel(pmd, address);
10748+ if (bad_address(pte)) goto bad;
10749+ printk(KERN_CONT "PTE %lx", pte_val(*pte));
10750+ret:
10751+ printk(KERN_CONT "\n");
10752+ return;
10753+bad:
10754+ printk("BAD\n");
10755+#endif
10756+}
10757+
10758+#ifdef CONFIG_X86_32
10759+static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
10760+{
10761+ unsigned index = pgd_index(address);
10762+ pgd_t *pgd_k;
10763+ pud_t *pud, *pud_k;
10764+ pmd_t *pmd, *pmd_k;
10765+
10766+ pgd += index;
10767+ pgd_k = init_mm.pgd + index;
10768+
10769+ if (!pgd_present(*pgd_k))
10770+ return NULL;
10771+
10772+ /*
10773+ * set_pgd(pgd, *pgd_k); here would be useless on PAE
10774+ * and redundant with the set_pmd() on non-PAE. As would
10775+ * set_pud.
10776+ */
10777+
10778+ pud = pud_offset(pgd, address);
10779+ pud_k = pud_offset(pgd_k, address);
10780+ if (!pud_present(*pud_k))
10781+ return NULL;
10782+
10783+ pmd = pmd_offset(pud, address);
10784+ pmd_k = pmd_offset(pud_k, address);
10785+ if (!pmd_present(*pmd_k))
10786+ return NULL;
10787+ if (!pmd_present(*pmd)) {
10788+ bool lazy = x86_read_percpu(xen_lazy_mmu);
10789+
10790+ x86_write_percpu(xen_lazy_mmu, false);
10791+#if CONFIG_XEN_COMPAT > 0x030002
10792+ set_pmd(pmd, *pmd_k);
10793+#else
10794+ /*
10795+ * When running on older Xen we must launder *pmd_k through
10796+ * pmd_val() to ensure that _PAGE_PRESENT is correctly set.
10797+ */
10798+ set_pmd(pmd, __pmd(pmd_val(*pmd_k)));
10799+#endif
10800+ x86_write_percpu(xen_lazy_mmu, lazy);
10801+ } else
10802+ BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));
10803+ return pmd_k;
10804+}
10805+#endif
10806+
10807+#ifdef CONFIG_X86_64
10808+static const char errata93_warning[] =
10809+KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
10810+KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
10811+KERN_ERR "******* Please consider a BIOS update.\n"
10812+KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";
10813+#endif
10814+
10815+/* Workaround for K8 erratum #93 & buggy BIOS.
10816+ BIOS SMM functions are required to use a specific workaround
10817+ to avoid corruption of the 64bit RIP register on C stepping K8.
10818+   Many BIOSes that weren't tested properly miss this.
10819+ The OS sees this as a page fault with the upper 32bits of RIP cleared.
10820+ Try to work around it here.
10821+ Note we only handle faults in kernel here.
10822+ Does nothing for X86_32
10823+ */
10824+static int is_errata93(struct pt_regs *regs, unsigned long address)
10825+{
10826+#ifdef CONFIG_X86_64
10827+ static int warned;
10828+ if (address != regs->ip)
10829+ return 0;
10830+ if ((address >> 32) != 0)
10831+ return 0;
10832+ address |= 0xffffffffUL << 32;
10833+ if ((address >= (u64)_stext && address <= (u64)_etext) ||
10834+ (address >= MODULES_VADDR && address <= MODULES_END)) {
10835+ if (!warned) {
10836+ printk(errata93_warning);
10837+ warned = 1;
10838+ }
10839+ regs->ip = address;
10840+ return 1;
10841+ }
10842+#endif
10843+ return 0;
10844+}
10845+
10846+/*
10847+ * Work around K8 erratum #100: K8 in compat mode occasionally jumps to illegal
10848+ * addresses >4GB. We catch this in the page fault handler because these
10849+ * addresses are not reachable. Just detect this case and return. Any code
10850+ * segment in LDT is compatibility mode.
10851+ */
10852+static int is_errata100(struct pt_regs *regs, unsigned long address)
10853+{
10854+#ifdef CONFIG_X86_64
10855+ if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) &&
10856+ (address >> 32))
10857+ return 1;
10858+#endif
10859+ return 0;
10860+}
10861+
10862+void do_invalid_op(struct pt_regs *, unsigned long);
10863+
10864+static int is_f00f_bug(struct pt_regs *regs, unsigned long address)
10865+{
10866+#ifdef CONFIG_X86_F00F_BUG
10867+ unsigned long nr;
10868+ /*
10869+ * Pentium F0 0F C7 C8 bug workaround.
10870+ */
10871+ if (boot_cpu_data.f00f_bug) {
10872+ nr = (address - idt_descr.address) >> 3;
10873+
10874+ if (nr == 6) {
10875+ do_invalid_op(regs, 0);
10876+ return 1;
10877+ }
10878+ }
10879+#endif
10880+ return 0;
10881+}
10882+
10883+static void show_fault_oops(struct pt_regs *regs, unsigned long error_code,
10884+ unsigned long address)
10885+{
10886+#ifdef CONFIG_X86_32
10887+ if (!oops_may_print())
10888+ return;
10889+#endif
10890+
10891+#ifdef CONFIG_X86_PAE
10892+ if (error_code & PF_INSTR) {
10893+ unsigned int level;
10894+ pte_t *pte = lookup_address(address, &level);
10895+
10896+ if (pte && pte_present(*pte) && !pte_exec(*pte))
10897+ printk(KERN_CRIT "kernel tried to execute "
10898+ "NX-protected page - exploit attempt? "
10899+ "(uid: %d)\n", current->uid);
10900+ }
10901+#endif
10902+
10903+ printk(KERN_ALERT "BUG: unable to handle kernel ");
10904+ if (address < PAGE_SIZE)
10905+ printk(KERN_CONT "NULL pointer dereference");
10906+ else
10907+ printk(KERN_CONT "paging request");
10908+#ifdef CONFIG_X86_32
10909+ printk(KERN_CONT " at %08lx\n", address);
10910+#else
10911+ printk(KERN_CONT " at %016lx\n", address);
10912+#endif
10913+ printk(KERN_ALERT "IP:");
10914+ printk_address(regs->ip, 1);
10915+ dump_pagetable(address);
10916+}
10917+
10918+#ifdef CONFIG_X86_64
10919+static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
10920+ unsigned long error_code)
10921+{
10922+ unsigned long flags = oops_begin();
10923+ struct task_struct *tsk;
10924+
10925+ printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
10926+ current->comm, address);
10927+ dump_pagetable(address);
10928+ tsk = current;
10929+ tsk->thread.cr2 = address;
10930+ tsk->thread.trap_no = 14;
10931+ tsk->thread.error_code = error_code;
10932+ if (__die("Bad pagetable", regs, error_code))
10933+ regs = NULL;
10934+ oops_end(flags, regs, SIGKILL);
10935+}
10936+#endif
10937+
10938+static int spurious_fault_check(unsigned long error_code, pte_t *pte)
10939+{
10940+ if ((error_code & PF_WRITE) && !pte_write(*pte))
10941+ return 0;
10942+ if ((error_code & PF_INSTR) && !pte_exec(*pte))
10943+ return 0;
10944+
10945+ return 1;
10946+}
10947+
10948+/*
10949+ * Handle a spurious fault caused by a stale TLB entry. This allows
10950+ * us to lazily refresh the TLB when increasing the permissions of a
10951+ * kernel page (RO -> RW or NX -> X). Doing it eagerly is very
10952+ * expensive since that implies doing a full cross-processor TLB
10953+ * flush, even if no stale TLB entries exist on other processors.
10954+ * There are no security implications to leaving a stale TLB when
10955+ * increasing the permissions on a page.
10956+ */
10957+static int spurious_fault(unsigned long address,
10958+ unsigned long error_code)
10959+{
10960+ pgd_t *pgd;
10961+ pud_t *pud;
10962+ pmd_t *pmd;
10963+ pte_t *pte;
10964+
10965+ /* Reserved-bit violation or user access to kernel space? */
10966+ if (error_code & (PF_USER | PF_RSVD))
10967+ return 0;
10968+
10969+ pgd = init_mm.pgd + pgd_index(address);
10970+ if (!pgd_present(*pgd))
10971+ return 0;
10972+
10973+ pud = pud_offset(pgd, address);
10974+ if (!pud_present(*pud))
10975+ return 0;
10976+
10977+ if (pud_large(*pud))
10978+ return spurious_fault_check(error_code, (pte_t *) pud);
10979+
10980+ pmd = pmd_offset(pud, address);
10981+ if (!pmd_present(*pmd))
10982+ return 0;
10983+
10984+ if (pmd_large(*pmd))
10985+ return spurious_fault_check(error_code, (pte_t *) pmd);
10986+
10987+ pte = pte_offset_kernel(pmd, address);
10988+ if (!pte_present(*pte))
10989+ return 0;
10990+
10991+ return spurious_fault_check(error_code, pte);
10992+}
10993+
10994+/*
10995+ * X86_32
10996+ * Handle a fault on the vmalloc or module mapping area
10997+ *
10998+ * X86_64
10999+ * Handle a fault on the vmalloc area
11000+ *
11001+ * This assumes no large pages in there.
11002+ */
11003+static int vmalloc_fault(unsigned long address)
11004+{
11005+#ifdef CONFIG_X86_32
11006+ unsigned long pgd_paddr;
11007+ pmd_t *pmd_k;
11008+ pte_t *pte_k;
11009+ /*
11010+ * Synchronize this task's top level page-table
11011+ * with the 'reference' page table.
11012+ *
11013+ * Do _not_ use "current" here. We might be inside
11014+ * an interrupt in the middle of a task switch..
11015+ */
11016+ pgd_paddr = read_cr3();
11017+ pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
11018+ if (!pmd_k)
11019+ return -1;
11020+ pte_k = pte_offset_kernel(pmd_k, address);
11021+ if (!pte_present(*pte_k))
11022+ return -1;
11023+ return 0;
11024+#else
11025+ pgd_t *pgd, *pgd_ref;
11026+ pud_t *pud, *pud_ref;
11027+ pmd_t *pmd, *pmd_ref;
11028+ pte_t *pte, *pte_ref;
11029+
11030+ /* Make sure we are in vmalloc area */
11031+ if (!(address >= VMALLOC_START && address < VMALLOC_END))
11032+ return -1;
11033+
11034+ /* Copy kernel mappings over when needed. This can also
11035+   happen within a race in page table update. In the latter
11036+ case just flush. */
11037+
11038+ /* On Xen the line below does not always work. Needs investigating! */
11039+ /*pgd = pgd_offset(current->mm ?: &init_mm, address);*/
11040+ pgd = __va(read_cr3() & PHYSICAL_PAGE_MASK);
11041+ pgd += pgd_index(address);
11042+ pgd_ref = pgd_offset_k(address);
11043+ if (pgd_none(*pgd_ref))
11044+ return -1;
11045+ if (pgd_none(*pgd))
11046+ set_pgd(pgd, *pgd_ref);
11047+ else
11048+ BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
11049+
11050+ /* Below here mismatches are bugs because these lower tables
11051+ are shared */
11052+
11053+ pud = pud_offset(pgd, address);
11054+ pud_ref = pud_offset(pgd_ref, address);
11055+ if (pud_none(*pud_ref))
11056+ return -1;
11057+ if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
11058+ BUG();
11059+ pmd = pmd_offset(pud, address);
11060+ pmd_ref = pmd_offset(pud_ref, address);
11061+ if (pmd_none(*pmd_ref))
11062+ return -1;
11063+ if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
11064+ BUG();
11065+ pte_ref = pte_offset_kernel(pmd_ref, address);
11066+ if (!pte_present(*pte_ref))
11067+ return -1;
11068+ pte = pte_offset_kernel(pmd, address);
11069+ /* Don't use pte_page here, because the mappings can point
11070+ outside mem_map, and the NUMA hash lookup cannot handle
11071+ that. */
11072+ if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
11073+ BUG();
11074+ return 0;
11075+#endif
11076+}
11077+
11078+int show_unhandled_signals = 1;
11079+
11080+/*
11081+ * This routine handles page faults. It determines the address,
11082+ * and the problem, and then passes it off to one of the appropriate
11083+ * routines.
11084+ */
11085+#ifdef CONFIG_X86_64
11086+asmlinkage
11087+#endif
11088+void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
11089+{
11090+ struct task_struct *tsk;
11091+ struct mm_struct *mm;
11092+ struct vm_area_struct *vma;
11093+ unsigned long address;
11094+ int write, si_code;
11095+ int fault;
11096+#ifdef CONFIG_X86_64
11097+ unsigned long flags;
11098+#endif
11099+
11100+ /*
11101+ * We can fault from pretty much anywhere, with unknown IRQ state.
11102+ */
11103+ trace_hardirqs_fixup();
11104+
11105+ /* Set the "privileged fault" bit to something sane. */
11106+ if (user_mode_vm(regs))
11107+ error_code |= PF_USER;
11108+ else
11109+ error_code &= ~PF_USER;
11110+
11111+ tsk = current;
11112+ mm = tsk->mm;
11113+ prefetchw(&mm->mmap_sem);
11114+
11115+ /* get the address */
11116+ address = read_cr2();
11117+
11118+ si_code = SEGV_MAPERR;
11119+
11120+ if (notify_page_fault(regs))
11121+ return;
11122+
11123+ /*
11124+ * We fault-in kernel-space virtual memory on-demand. The
11125+ * 'reference' page table is init_mm.pgd.
11126+ *
11127+ * NOTE! We MUST NOT take any locks for this case. We may
11128+ * be in an interrupt or a critical region, and should
11129+ * only copy the information from the master page table,
11130+ * nothing more.
11131+ *
11132+ * This verifies that the fault happens in kernel space
11133+ * (error_code & 4) == 0, and that the fault was not a
11134+ * protection error (error_code & 9) == 0.
11135+ */
11136+#ifdef CONFIG_X86_32
11137+ if (unlikely(address >= TASK_SIZE)) {
11138+#else
11139+ if (unlikely(address >= TASK_SIZE64)) {
11140+#endif
11141+ /* Faults in hypervisor area can never be patched up. */
11142+#if defined(CONFIG_X86_XEN)
11143+ if (address >= hypervisor_virt_start)
11144+ goto bad_area_nosemaphore;
11145+#elif defined(CONFIG_X86_64_XEN)
11146+ if (address >= HYPERVISOR_VIRT_START
11147+ && address < HYPERVISOR_VIRT_END)
11148+ goto bad_area_nosemaphore;
11149+#endif
11150+ if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
11151+ vmalloc_fault(address) >= 0)
11152+ return;
11153+
11154+ /* Can handle a stale RO->RW TLB */
11155+ if (spurious_fault(address, error_code))
11156+ return;
11157+
11158+ /*
11159+ * Don't take the mm semaphore here. If we fixup a prefetch
11160+ * fault we could otherwise deadlock.
11161+ */
11162+ goto bad_area_nosemaphore;
11163+ }
11164+
11165+
11166+#ifdef CONFIG_X86_32
11167+ /* It's safe to allow irq's after cr2 has been saved and the vmalloc
11168+ fault has been handled. */
11169+ if (regs->flags & (X86_EFLAGS_IF|VM_MASK))
11170+ local_irq_enable();
11171+
11172+ /*
11173+ * If we're in an interrupt, have no user context or are running in an
11174+ * atomic region then we must not take the fault.
11175+ */
11176+ if (in_atomic() || !mm)
11177+ goto bad_area_nosemaphore;
11178+#else /* CONFIG_X86_64 */
11179+ if (likely(regs->flags & X86_EFLAGS_IF))
11180+ local_irq_enable();
11181+
11182+ if (unlikely(error_code & PF_RSVD))
11183+ pgtable_bad(address, regs, error_code);
11184+
11185+ /*
11186+ * If we're in an interrupt, have no user context or are running in an
11187+ * atomic region then we must not take the fault.
11188+ */
11189+ if (unlikely(in_atomic() || !mm))
11190+ goto bad_area_nosemaphore;
11191+
11192+ /*
11193+ * User-mode registers count as a user access even for any
11194+ * potential system fault or CPU buglet.
11195+ */
11196+ if (user_mode_vm(regs))
11197+ error_code |= PF_USER;
11198+again:
11199+#endif
11200+ /* When running in the kernel we expect faults to occur only to
11201+ * addresses in user space. All other faults represent errors in the
11202+ * kernel and should generate an OOPS. Unfortunately, in the case of an
11203+ * erroneous fault occurring in a code path which already holds mmap_sem
11204+ * we will deadlock attempting to validate the fault against the
11205+ * address space. Luckily the kernel only validly references user
11206+ * space from well defined areas of code, which are listed in the
11207+ * exceptions table.
11208+ *
11209+ * As the vast majority of faults will be valid we will only perform
11210+ * the source reference check when there is a possibility of a deadlock.
11211+ * Attempt to lock the address space, if we cannot we then validate the
11212+ * source. If this is invalid we can skip the address space check,
11213+ * thus avoiding the deadlock.
11214+ */
11215+ if (!down_read_trylock(&mm->mmap_sem)) {
11216+ if ((error_code & PF_USER) == 0 &&
11217+ !search_exception_tables(regs->ip))
11218+ goto bad_area_nosemaphore;
11219+ down_read(&mm->mmap_sem);
11220+ }
11221+
11222+ vma = find_vma(mm, address);
11223+ if (!vma)
11224+ goto bad_area;
11225+ if (vma->vm_start <= address)
11226+ goto good_area;
11227+ if (!(vma->vm_flags & VM_GROWSDOWN))
11228+ goto bad_area;
11229+ if (error_code & PF_USER) {
11230+ /*
11231+ * Accessing the stack below %sp is always a bug.
11232+ * The large cushion allows instructions like enter
11233+ * and pusha to work. ("enter $65535,$31" pushes
11234+ * 32 pointers and then decrements %sp by 65535.)
11235+ */
11236+ if (address + 65536 + 32 * sizeof(unsigned long) < regs->sp)
11237+ goto bad_area;
11238+ }
11239+ if (expand_stack(vma, address))
11240+ goto bad_area;
11241+/*
11242+ * Ok, we have a good vm_area for this memory access, so
11243+ * we can handle it..
11244+ */
11245+good_area:
11246+ si_code = SEGV_ACCERR;
11247+ write = 0;
11248+ switch (error_code & (PF_PROT|PF_WRITE)) {
11249+ default: /* 3: write, present */
11250+ /* fall through */
11251+ case PF_WRITE: /* write, not present */
11252+ if (!(vma->vm_flags & VM_WRITE))
11253+ goto bad_area;
11254+ write++;
11255+ break;
11256+ case PF_PROT: /* read, present */
11257+ goto bad_area;
11258+ case 0: /* read, not present */
11259+ if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
11260+ goto bad_area;
11261+ }
11262+
11263+#ifdef CONFIG_X86_32
11264+survive:
11265+#endif
11266+ /*
11267+ * If for any reason at all we couldn't handle the fault,
11268+ * make sure we exit gracefully rather than endlessly redo
11269+ * the fault.
11270+ */
11271+ fault = handle_mm_fault(mm, vma, address, write);
11272+ if (unlikely(fault & VM_FAULT_ERROR)) {
11273+ if (fault & VM_FAULT_OOM)
11274+ goto out_of_memory;
11275+ else if (fault & VM_FAULT_SIGBUS)
11276+ goto do_sigbus;
11277+ BUG();
11278+ }
11279+ if (fault & VM_FAULT_MAJOR)
11280+ tsk->maj_flt++;
11281+ else
11282+ tsk->min_flt++;
11283+
11284+#ifdef CONFIG_X86_32
11285+ /*
11286+ * Did it hit the DOS screen memory VA from vm86 mode?
11287+ */
11288+ if (v8086_mode(regs)) {
11289+ unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT;
11290+ if (bit < 32)
11291+ tsk->thread.screen_bitmap |= 1 << bit;
11292+ }
11293+#endif
11294+ up_read(&mm->mmap_sem);
11295+ return;
11296+
11297+/*
11298+ * Something tried to access memory that isn't in our memory map..
11299+ * Fix it, but check if it's kernel or user first..
11300+ */
11301+bad_area:
11302+ up_read(&mm->mmap_sem);
11303+
11304+bad_area_nosemaphore:
11305+ /* User mode accesses just cause a SIGSEGV */
11306+ if (error_code & PF_USER) {
11307+ /*
11308+ * It's possible to have interrupts off here.
11309+ */
11310+ local_irq_enable();
11311+
11312+ /*
11313+ * Valid to do another page fault here because this one came
11314+ * from user space.
11315+ */
11316+ if (is_prefetch(regs, address, error_code))
11317+ return;
11318+
11319+ if (is_errata100(regs, address))
11320+ return;
11321+
11322+ if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
11323+ printk_ratelimit()) {
11324+ printk(
11325+#ifdef CONFIG_X86_32
11326+ "%s%s[%d]: segfault at %lx ip %08lx sp %08lx error %lx",
11327+#else
11328+ "%s%s[%d]: segfault at %lx ip %lx sp %lx error %lx",
11329+#endif
11330+ task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
11331+ tsk->comm, task_pid_nr(tsk), address, regs->ip,
11332+ regs->sp, error_code);
11333+ print_vma_addr(" in ", regs->ip);
11334+ printk("\n");
11335+ }
11336+
11337+ tsk->thread.cr2 = address;
11338+ /* Kernel addresses are always protection faults */
11339+ tsk->thread.error_code = error_code | (address >= TASK_SIZE);
11340+ tsk->thread.trap_no = 14;
11341+ force_sig_info_fault(SIGSEGV, si_code, address, tsk);
11342+ return;
11343+ }
11344+
11345+ if (is_f00f_bug(regs, address))
11346+ return;
11347+
11348+no_context:
11349+ /* Are we prepared to handle this kernel fault? */
11350+ if (fixup_exception(regs))
11351+ return;
11352+
11353+ /*
11354+ * X86_32
11355+ * Valid to do another page fault here, because if this fault
11356+ * had been triggered by is_prefetch fixup_exception would have
11357+ * handled it.
11358+ *
11359+ * X86_64
11360+ * Hall of shame of CPU/BIOS bugs.
11361+ */
11362+ if (is_prefetch(regs, address, error_code))
11363+ return;
11364+
11365+ if (is_errata93(regs, address))
11366+ return;
11367+
11368+/*
11369+ * Oops. The kernel tried to access some bad page. We'll have to
11370+ * terminate things with extreme prejudice.
11371+ */
11372+#ifdef CONFIG_X86_32
11373+ bust_spinlocks(1);
11374+#else
11375+ flags = oops_begin();
11376+#endif
11377+
11378+ show_fault_oops(regs, error_code, address);
11379+
11380+ tsk->thread.cr2 = address;
11381+ tsk->thread.trap_no = 14;
11382+ tsk->thread.error_code = error_code;
11383+
11384+#ifdef CONFIG_X86_32
11385+ die("Oops", regs, error_code);
11386+ bust_spinlocks(0);
11387+ do_exit(SIGKILL);
11388+#else
11389+ if (__die("Oops", regs, error_code))
11390+ regs = NULL;
11391+ /* Executive summary in case the body of the oops scrolled away */
11392+ printk(KERN_EMERG "CR2: %016lx\n", address);
11393+ oops_end(flags, regs, SIGKILL);
11394+#endif
11395+
11396+/*
11397+ * We ran out of memory, or some other thing happened to us that made
11398+ * us unable to handle the page fault gracefully.
11399+ */
11400+out_of_memory:
11401+ up_read(&mm->mmap_sem);
11402+ if (is_global_init(tsk)) {
11403+ yield();
11404+#ifdef CONFIG_X86_32
11405+ down_read(&mm->mmap_sem);
11406+ goto survive;
11407+#else
11408+ goto again;
11409+#endif
11410+ }
11411+
11412+ printk("VM: killing process %s\n", tsk->comm);
11413+ if (error_code & PF_USER)
11414+ do_group_exit(SIGKILL);
11415+ goto no_context;
11416+
11417+do_sigbus:
11418+ up_read(&mm->mmap_sem);
11419+
11420+ /* Kernel mode? Handle exceptions or die */
11421+ if (!(error_code & PF_USER))
11422+ goto no_context;
11423+#ifdef CONFIG_X86_32
11424+ /* User space => ok to do another page fault */
11425+ if (is_prefetch(regs, address, error_code))
11426+ return;
11427+#endif
11428+ tsk->thread.cr2 = address;
11429+ tsk->thread.error_code = error_code;
11430+ tsk->thread.trap_no = 14;
11431+ force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
11432+}
11433+
11434+DEFINE_SPINLOCK(pgd_lock);
11435+LIST_HEAD(pgd_list);
11436+
11437+void vmalloc_sync_all(void)
11438+{
11439+#ifdef CONFIG_X86_32
11440+ /*
11441+ * Note that races in the updates of insync and start aren't
11442+ * problematic: insync can only get set bits added, and updates to
11443+ * start are only improving performance (without affecting correctness
11444+ * if undone).
11445+ * XEN: To work on PAE, we need to iterate over PMDs rather than PGDs.
11446+ * This change works just fine with 2-level paging too.
11447+ */
11448+#define sync_index(a) ((a) >> PMD_SHIFT)
11449+ static DECLARE_BITMAP(insync, PTRS_PER_PGD*PTRS_PER_PMD);
11450+ static unsigned long start = TASK_SIZE;
11451+ unsigned long address;
11452+
11453+ if (SHARED_KERNEL_PMD)
11454+ return;
11455+
11456+ BUILD_BUG_ON(TASK_SIZE & ~PMD_MASK);
11457+ for (address = start;
11458+ address < hypervisor_virt_start;
11459+ address += PMD_SIZE) {
11460+ if (!test_bit(sync_index(address), insync)) {
11461+ unsigned long flags;
11462+ struct page *page;
11463+
11464+ spin_lock_irqsave(&pgd_lock, flags);
11465+ /* XEN: failure path assumes non-empty pgd_list. */
11466+ if (unlikely(list_empty(&pgd_list))) {
11467+ spin_unlock_irqrestore(&pgd_lock, flags);
11468+ return;
11469+ }
11470+ list_for_each_entry(page, &pgd_list, lru) {
11471+ if (!vmalloc_sync_one(page_address(page),
11472+ address))
11473+ break;
11474+ }
11475+ spin_unlock_irqrestore(&pgd_lock, flags);
11476+ if (!page)
11477+ set_bit(sync_index(address), insync);
11478+ }
11479+ if (address == start && test_bit(sync_index(address), insync))
11480+ start = address + PMD_SIZE;
11481+ }
11482+#else /* CONFIG_X86_64 */
11483+ /*
11484+ * Note that races in the updates of insync and start aren't
11485+ * problematic: insync can only get set bits added, and updates to
11486+ * start are only improving performance (without affecting correctness
11487+ * if undone).
11488+ */
11489+ static DECLARE_BITMAP(insync, PTRS_PER_PGD);
11490+ static unsigned long start = VMALLOC_START & PGDIR_MASK;
11491+ unsigned long address;
11492+
11493+ for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
11494+ if (!test_bit(pgd_index(address), insync)) {
11495+ const pgd_t *pgd_ref = pgd_offset_k(address);
11496+ unsigned long flags;
11497+ struct page *page;
11498+
11499+ if (pgd_none(*pgd_ref))
11500+ continue;
11501+ spin_lock_irqsave(&pgd_lock, flags);
11502+ list_for_each_entry(page, &pgd_list, lru) {
11503+ pgd_t *pgd;
11504+ pgd = (pgd_t *)page_address(page) + pgd_index(address);
11505+ if (pgd_none(*pgd))
11506+ set_pgd(pgd, *pgd_ref);
11507+ else
11508+ BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
11509+ }
11510+ spin_unlock_irqrestore(&pgd_lock, flags);
11511+ set_bit(pgd_index(address), insync);
11512+ }
11513+ if (address == start)
11514+ start = address + PGDIR_SIZE;
11515+ }
11516+ /* Check that there is no need to do the same for the modules area. */
11517+ BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
11518+ BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
11519+ (__START_KERNEL & PGDIR_MASK)));
11520+#endif
11521+}
11522--- sle11-2009-06-29.orig/arch/x86/mm/fault_32-xen.c 2009-02-16 16:18:36.000000000 +0100
11523+++ /dev/null 1970-01-01 00:00:00.000000000 +0000
11524@@ -1,757 +0,0 @@
11525-/*
11526- * linux/arch/i386/mm/fault.c
11527- *
11528- * Copyright (C) 1995 Linus Torvalds
11529- */
11530-
11531-#include <linux/signal.h>
11532-#include <linux/sched.h>
11533-#include <linux/kernel.h>
11534-#include <linux/errno.h>
11535-#include <linux/string.h>
11536-#include <linux/types.h>
11537-#include <linux/ptrace.h>
11538-#include <linux/mman.h>
11539-#include <linux/mm.h>
11540-#include <linux/smp.h>
11541-#include <linux/interrupt.h>
11542-#include <linux/init.h>
11543-#include <linux/tty.h>
11544-#include <linux/vt_kern.h> /* For unblank_screen() */
11545-#include <linux/highmem.h>
11546-#include <linux/bootmem.h> /* for max_low_pfn */
11547-#include <linux/vmalloc.h>
11548-#include <linux/module.h>
11549-#include <linux/kprobes.h>
11550-#include <linux/uaccess.h>
11551-#include <linux/kdebug.h>
11552-#include <linux/kprobes.h>
11553-
11554-#include <asm/system.h>
11555-#include <asm/desc.h>
11556-#include <asm/segment.h>
11557-
11558-extern void die(const char *,struct pt_regs *,long);
11559-
11560-#ifdef CONFIG_KPROBES
11561-static inline int notify_page_fault(struct pt_regs *regs)
11562-{
11563- int ret = 0;
11564-
11565- /* kprobe_running() needs smp_processor_id() */
11566- if (!user_mode_vm(regs)) {
11567- preempt_disable();
11568- if (kprobe_running() && kprobe_fault_handler(regs, 14))
11569- ret = 1;
11570- preempt_enable();
11571- }
11572-
11573- return ret;
11574-}
11575-#else
11576-static inline int notify_page_fault(struct pt_regs *regs)
11577-{
11578- return 0;
11579-}
11580-#endif
11581-
11582-/*
11583- * Return EIP plus the CS segment base. The segment limit is also
11584- * adjusted, clamped to the kernel/user address space (whichever is
11585- * appropriate), and returned in *eip_limit.
11586- *
11587- * The segment is checked, because it might have been changed by another
11588- * task between the original faulting instruction and here.
11589- *
11590- * If CS is no longer a valid code segment, or if EIP is beyond the
11591- * limit, or if it is a kernel address when CS is not a kernel segment,
11592- * then the returned value will be greater than *eip_limit.
11593- *
11594- * This is slow, but is very rarely executed.
11595- */
11596-static inline unsigned long get_segment_eip(struct pt_regs *regs,
11597- unsigned long *eip_limit)
11598-{
11599- unsigned long eip = regs->eip;
11600- unsigned seg = regs->xcs & 0xffff;
11601- u32 seg_ar, seg_limit, base, *desc;
11602-
11603- /* Unlikely, but must come before segment checks. */
11604- if (unlikely(regs->eflags & VM_MASK)) {
11605- base = seg << 4;
11606- *eip_limit = base + 0xffff;
11607- return base + (eip & 0xffff);
11608- }
11609-
11610- /* The standard kernel/user address space limit. */
11611- *eip_limit = user_mode(regs) ? USER_DS.seg : KERNEL_DS.seg;
11612-
11613- /* By far the most common cases. */
11614- if (likely(SEGMENT_IS_FLAT_CODE(seg)))
11615- return eip;
11616-
11617- /* Check the segment exists, is within the current LDT/GDT size,
11618- that kernel/user (ring 0..3) has the appropriate privilege,
11619- that it's a code segment, and get the limit. */
11620- __asm__ ("larl %3,%0; lsll %3,%1"
11621- : "=&r" (seg_ar), "=r" (seg_limit) : "0" (0), "rm" (seg));
11622- if ((~seg_ar & 0x9800) || eip > seg_limit) {
11623- *eip_limit = 0;
11624- return 1; /* So that returned eip > *eip_limit. */
11625- }
11626-
11627- /* Get the GDT/LDT descriptor base.
11628- When you look for races in this code remember that
11629- LDT and other horrors are only used in user space. */
11630- if (seg & (1<<2)) {
11631- /* Must lock the LDT while reading it. */
11632- mutex_lock(&current->mm->context.lock);
11633- desc = current->mm->context.ldt;
11634- desc = (void *)desc + (seg & ~7);
11635- } else {
11636- /* Must disable preemption while reading the GDT. */
11637- desc = (u32 *)get_cpu_gdt_table(get_cpu());
11638- desc = (void *)desc + (seg & ~7);
11639- }
11640-
11641- /* Decode the code segment base from the descriptor */
11642- base = get_desc_base((unsigned long *)desc);
11643-
11644- if (seg & (1<<2)) {
11645- mutex_unlock(&current->mm->context.lock);
11646- } else
11647- put_cpu();
11648-
11649- /* Adjust EIP and segment limit, and clamp at the kernel limit.
11650- It's legitimate for segments to wrap at 0xffffffff. */
11651- seg_limit += base;
11652- if (seg_limit < *eip_limit && seg_limit >= base)
11653- *eip_limit = seg_limit;
11654- return eip + base;
11655-}
11656-
11657-/*
11658- * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
11659- * Check that here and ignore it.
11660- */
11661-static int __is_prefetch(struct pt_regs *regs, unsigned long addr)
11662-{
11663- unsigned long limit;
11664- unsigned char *instr = (unsigned char *)get_segment_eip (regs, &limit);
11665- int scan_more = 1;
11666- int prefetch = 0;
11667- int i;
11668-
11669- for (i = 0; scan_more && i < 15; i++) {
11670- unsigned char opcode;
11671- unsigned char instr_hi;
11672- unsigned char instr_lo;
11673-
11674- if (instr > (unsigned char *)limit)
11675- break;
11676- if (probe_kernel_address(instr, opcode))
11677- break;
11678-
11679- instr_hi = opcode & 0xf0;
11680- instr_lo = opcode & 0x0f;
11681- instr++;
11682-
11683- switch (instr_hi) {
11684- case 0x20:
11685- case 0x30:
11686- /* Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes. */
11687- scan_more = ((instr_lo & 7) == 0x6);
11688- break;
11689-
11690- case 0x60:
11691- /* 0x64 thru 0x67 are valid prefixes in all modes. */
11692- scan_more = (instr_lo & 0xC) == 0x4;
11693- break;
11694- case 0xF0:
11695- /* 0xF0, 0xF2, and 0xF3 are valid prefixes */
11696- scan_more = !instr_lo || (instr_lo>>1) == 1;
11697- break;
11698- case 0x00:
11699- /* Prefetch instruction is 0x0F0D or 0x0F18 */
11700- scan_more = 0;
11701- if (instr > (unsigned char *)limit)
11702- break;
11703- if (probe_kernel_address(instr, opcode))
11704- break;
11705- prefetch = (instr_lo == 0xF) &&
11706- (opcode == 0x0D || opcode == 0x18);
11707- break;
11708- default:
11709- scan_more = 0;
11710- break;
11711- }
11712- }
11713- return prefetch;
11714-}
11715-
11716-static inline int is_prefetch(struct pt_regs *regs, unsigned long addr,
11717- unsigned long error_code)
11718-{
11719- if (unlikely(boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
11720- boot_cpu_data.x86 >= 6)) {
11721- /* Catch an obscure case of prefetch inside an NX page. */
11722- if (nx_enabled && (error_code & 16))
11723- return 0;
11724- return __is_prefetch(regs, addr);
11725- }
11726- return 0;
11727-}
11728-
11729-static noinline void force_sig_info_fault(int si_signo, int si_code,
11730- unsigned long address, struct task_struct *tsk)
11731-{
11732- siginfo_t info;
11733-
11734- info.si_signo = si_signo;
11735- info.si_errno = 0;
11736- info.si_code = si_code;
11737- info.si_addr = (void __user *)address;
11738- force_sig_info(si_signo, &info, tsk);
11739-}
11740-
11741-fastcall void do_invalid_op(struct pt_regs *, unsigned long);
11742-
11743-#ifdef CONFIG_X86_PAE
11744-static void dump_fault_path(unsigned long address)
11745-{
11746- unsigned long *p, page;
11747- unsigned long mfn;
11748-
11749- page = read_cr3();
11750- p = (unsigned long *)__va(page);
11751- p += (address >> 30) * 2;
11752- printk(KERN_ALERT "%08lx -> *pde = %08lx:%08lx\n", page, p[1], p[0]);
11753- if (p[0] & _PAGE_PRESENT) {
11754- mfn = (p[0] >> PAGE_SHIFT) | (p[1] << 20);
11755- page = mfn_to_pfn(mfn) << PAGE_SHIFT;
11756- p = (unsigned long *)__va(page);
11757- address &= 0x3fffffff;
11758- p += (address >> 21) * 2;
11759- printk(KERN_ALERT "%08lx -> *pme = %08lx:%08lx\n",
11760- page, p[1], p[0]);
11761- mfn = (p[0] >> PAGE_SHIFT) | (p[1] << 20);
11762-#ifdef CONFIG_HIGHPTE
11763- if (mfn_to_pfn(mfn) >= highstart_pfn)
11764- return;
11765-#endif
11766- if ((p[0] & _PAGE_PRESENT) && !(p[0] & _PAGE_PSE)) {
11767- page = mfn_to_pfn(mfn) << PAGE_SHIFT;
11768- p = (unsigned long *) __va(page);
11769- address &= 0x001fffff;
11770- p += (address >> 12) * 2;
11771- printk(KERN_ALERT "%08lx -> *pte = %08lx:%08lx\n",
11772- page, p[1], p[0]);
11773- }
11774- }
11775-}
11776-#else
11777-static void dump_fault_path(unsigned long address)
11778-{
11779- unsigned long page;
11780-
11781- page = read_cr3();
11782- page = ((unsigned long *) __va(page))[address >> PGDIR_SHIFT];
11783- printk(KERN_ALERT "*pde = ma %08lx pa %08lx\n", page,
11784- machine_to_phys(page));
11785- /*
11786- * We must not directly access the pte in the highpte
11787- * case if the page table is located in highmem.
11788- * And lets rather not kmap-atomic the pte, just in case
11789- * it's allocated already.
11790- */
11791- if ((machine_to_phys(page) >> PAGE_SHIFT) < max_low_pfn
11792- && (page & _PAGE_PRESENT)
11793- && !(page & _PAGE_PSE)) {
11794- page = machine_to_phys(page & PAGE_MASK);
11795- page = ((unsigned long *) __va(page))[(address >> PAGE_SHIFT)
11796- & (PTRS_PER_PTE - 1)];
11797- printk(KERN_ALERT "*pte = ma %08lx pa %08lx\n", page,
11798- machine_to_phys(page));
11799- }
11800-}
11801-#endif
11802-
11803-static int spurious_fault(struct pt_regs *regs,
11804- unsigned long address,
11805- unsigned long error_code)
11806-{
11807- pgd_t *pgd;
11808- pud_t *pud;
11809- pmd_t *pmd;
11810- pte_t *pte;
11811-
11812- /* Reserved-bit violation or user access to kernel space? */
11813- if (error_code & 0x0c)
11814- return 0;
11815-
11816- pgd = init_mm.pgd + pgd_index(address);
11817- if (!pgd_present(*pgd))
11818- return 0;
11819-
11820- pud = pud_offset(pgd, address);
11821- if (!pud_present(*pud))
11822- return 0;
11823-
11824- pmd = pmd_offset(pud, address);
11825- if (!pmd_present(*pmd))
11826- return 0;
11827-
11828- pte = pte_offset_kernel(pmd, address);
11829- if (!pte_present(*pte))
11830- return 0;
11831- if ((error_code & 0x02) && !pte_write(*pte))
11832- return 0;
11833-#ifdef CONFIG_X86_PAE
11834- if ((error_code & 0x10) && (__pte_val(*pte) & _PAGE_NX))
11835- return 0;
11836-#endif
11837-
11838- return 1;
11839-}
11840-
11841-static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
11842-{
11843- unsigned index = pgd_index(address);
11844- pgd_t *pgd_k;
11845- pud_t *pud, *pud_k;
11846- pmd_t *pmd, *pmd_k;
11847-
11848- pgd += index;
11849- pgd_k = init_mm.pgd + index;
11850-
11851- if (!pgd_present(*pgd_k))
11852- return NULL;
11853-
11854- /*
11855- * set_pgd(pgd, *pgd_k); here would be useless on PAE
11856- * and redundant with the set_pmd() on non-PAE. As would
11857- * set_pud.
11858- */
11859-
11860- pud = pud_offset(pgd, address);
11861- pud_k = pud_offset(pgd_k, address);
11862- if (!pud_present(*pud_k))
11863- return NULL;
11864-
11865- pmd = pmd_offset(pud, address);
11866- pmd_k = pmd_offset(pud_k, address);
11867- if (!pmd_present(*pmd_k))
11868- return NULL;
11869- if (!pmd_present(*pmd)) {
11870- bool lazy = x86_read_percpu(xen_lazy_mmu);
11871-
11872- x86_write_percpu(xen_lazy_mmu, false);
11873-#if CONFIG_XEN_COMPAT > 0x030002
11874- set_pmd(pmd, *pmd_k);
11875-#else
11876- /*
11877- * When running on older Xen we must launder *pmd_k through
11878- * pmd_val() to ensure that _PAGE_PRESENT is correctly set.
11879- */
11880- set_pmd(pmd, __pmd(pmd_val(*pmd_k)));
11881-#endif
11882- x86_write_percpu(xen_lazy_mmu, lazy);
11883- } else
11884- BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));
11885- return pmd_k;
11886-}
11887-
11888-/*
11889- * Handle a fault on the vmalloc or module mapping area
11890- *
11891- * This assumes no large pages in there.
11892- */
11893-static inline int vmalloc_fault(unsigned long address)
11894-{
11895- unsigned long pgd_paddr;
11896- pmd_t *pmd_k;
11897- pte_t *pte_k;
11898- /*
11899- * Synchronize this task's top level page-table
11900- * with the 'reference' page table.
11901- *
11902- * Do _not_ use "current" here. We might be inside
11903- * an interrupt in the middle of a task switch..
11904- */
11905- pgd_paddr = read_cr3();
11906- pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
11907- if (!pmd_k)
11908- return -1;
11909- pte_k = pte_offset_kernel(pmd_k, address);
11910- if (!pte_present(*pte_k))
11911- return -1;
11912- return 0;
11913-}
11914-
11915-int show_unhandled_signals = 1;
11916-
11917-/*
11918- * This routine handles page faults. It determines the address,
11919- * and the problem, and then passes it off to one of the appropriate
11920- * routines.
11921- *
11922- * error_code:
11923- * bit 0 == 0 means no page found, 1 means protection fault
11924- * bit 1 == 0 means read, 1 means write
11925- * bit 2 == 0 means kernel, 1 means user-mode
11926- * bit 3 == 1 means use of reserved bit detected
11927- * bit 4 == 1 means fault was an instruction fetch
11928- */
11929-fastcall void __kprobes do_page_fault(struct pt_regs *regs,
11930- unsigned long error_code)
11931-{
11932- struct task_struct *tsk;
11933- struct mm_struct *mm;
11934- struct vm_area_struct * vma;
11935- unsigned long address;
11936- int write, si_code;
11937- int fault;
11938-
11939- /*
11940- * We can fault from pretty much anywhere, with unknown IRQ state.
11941- */
11942- trace_hardirqs_fixup();
11943-
11944- /* get the address */
11945- address = read_cr2();
11946-
11947- /* Set the "privileged fault" bit to something sane. */
11948- error_code &= ~4;
11949- error_code |= (regs->xcs & 2) << 1;
11950- if (regs->eflags & X86_EFLAGS_VM)
11951- error_code |= 4;
11952-
11953- tsk = current;
11954-
11955- si_code = SEGV_MAPERR;
11956-
11957- /*
11958- * We fault-in kernel-space virtual memory on-demand. The
11959- * 'reference' page table is init_mm.pgd.
11960- *
11961- * NOTE! We MUST NOT take any locks for this case. We may
11962- * be in an interrupt or a critical region, and should
11963- * only copy the information from the master page table,
11964- * nothing more.
11965- *
11966- * This verifies that the fault happens in kernel space
11967- * (error_code & 4) == 0, and that the fault was not a
11968- * protection error (error_code & 9) == 0.
11969- */
11970- if (unlikely(address >= TASK_SIZE)) {
11971-#ifdef CONFIG_XEN
11972- /* Faults in hypervisor area can never be patched up. */
11973- if (address >= hypervisor_virt_start)
11974- goto bad_area_nosemaphore;
11975-#endif
11976- if (!(error_code & 0x0000000d) && vmalloc_fault(address) >= 0)
11977- return;
11978- /* Can take a spurious fault if mapping changes R/O -> R/W. */
11979- if (spurious_fault(regs, address, error_code))
11980- return;
11981- if (notify_page_fault(regs))
11982- return;
11983- /*
11984- * Don't take the mm semaphore here. If we fixup a prefetch
11985- * fault we could otherwise deadlock.
11986- */
11987- goto bad_area_nosemaphore;
11988- }
11989-
11990- if (notify_page_fault(regs))
11991- return;
11992-
11993- /* It's safe to allow irq's after cr2 has been saved and the vmalloc
11994- fault has been handled. */
11995- if (regs->eflags & (X86_EFLAGS_IF|VM_MASK))
11996- local_irq_enable();
11997-
11998- mm = tsk->mm;
11999-
12000- /*
12001- * If we're in an interrupt, have no user context or are running in an
12002- * atomic region then we must not take the fault..
12003- */
12004- if (in_atomic() || !mm)
12005- goto bad_area_nosemaphore;
12006-
12007- /* When running in the kernel we expect faults to occur only to
12008- * addresses in user space. All other faults represent errors in the
12009- * kernel and should generate an OOPS. Unfortunately, in the case of an
12010- * erroneous fault occurring in a code path which already holds mmap_sem
12011- * we will deadlock attempting to validate the fault against the
12012- * address space. Luckily the kernel only validly references user
12013- * space from well defined areas of code, which are listed in the
12014- * exceptions table.
12015- *
12016- * As the vast majority of faults will be valid we will only perform
12017- * the source reference check when there is a possibility of a deadlock.
12018- * Attempt to lock the address space, if we cannot we then validate the
12019- * source. If this is invalid we can skip the address space check,
12020- * thus avoiding the deadlock.
12021- */
12022- if (!down_read_trylock(&mm->mmap_sem)) {
12023- if ((error_code & 4) == 0 &&
12024- !search_exception_tables(regs->eip))
12025- goto bad_area_nosemaphore;
12026- down_read(&mm->mmap_sem);
12027- }
12028-
12029- vma = find_vma(mm, address);
12030- if (!vma)
12031- goto bad_area;
12032- if (vma->vm_start <= address)
12033- goto good_area;
12034- if (!(vma->vm_flags & VM_GROWSDOWN))
12035- goto bad_area;
12036- if (error_code & 4) {
12037- /*
12038- * Accessing the stack below %esp is always a bug.
12039- * The large cushion allows instructions like enter
12040- * and pusha to work. ("enter $65535,$31" pushes
12041- * 32 pointers and then decrements %esp by 65535.)
12042- */
12043- if (address + 65536 + 32 * sizeof(unsigned long) < regs->esp)
12044- goto bad_area;
12045- }
12046- if (expand_stack(vma, address))
12047- goto bad_area;
12048-/*
12049- * Ok, we have a good vm_area for this memory access, so
12050- * we can handle it..
12051- */
12052-good_area:
12053- si_code = SEGV_ACCERR;
12054- write = 0;
12055- switch (error_code & 3) {
12056- default: /* 3: write, present */
12057- /* fall through */
12058- case 2: /* write, not present */
12059- if (!(vma->vm_flags & VM_WRITE))
12060- goto bad_area;
12061- write++;
12062- break;
12063- case 1: /* read, present */
12064- goto bad_area;
12065- case 0: /* read, not present */
12066- if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
12067- goto bad_area;
12068- }
12069-
12070- survive:
12071- /*
12072- * If for any reason at all we couldn't handle the fault,
12073- * make sure we exit gracefully rather than endlessly redo
12074- * the fault.
12075- */
12076- fault = handle_mm_fault(mm, vma, address, write);
12077- if (unlikely(fault & VM_FAULT_ERROR)) {
12078- if (fault & VM_FAULT_OOM)
12079- goto out_of_memory;
12080- else if (fault & VM_FAULT_SIGBUS)
12081- goto do_sigbus;
12082- BUG();
12083- }
12084- if (fault & VM_FAULT_MAJOR)
12085- tsk->maj_flt++;
12086- else
12087- tsk->min_flt++;
12088-
12089- /*
12090- * Did it hit the DOS screen memory VA from vm86 mode?
12091- */
12092- if (regs->eflags & VM_MASK) {
12093- unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT;
12094- if (bit < 32)
12095- tsk->thread.screen_bitmap |= 1 << bit;
12096- }
12097- up_read(&mm->mmap_sem);
12098- return;
12099-
12100-/*
12101- * Something tried to access memory that isn't in our memory map..
12102- * Fix it, but check if it's kernel or user first..
12103- */
12104-bad_area:
12105- up_read(&mm->mmap_sem);
12106-
12107-bad_area_nosemaphore:
12108- /* User mode accesses just cause a SIGSEGV */
12109- if (error_code & 4) {
12110- /*
12111- * It's possible to have interrupts off here.
12112- */
12113- local_irq_enable();
12114-
12115- /*
12116- * Valid to do another page fault here because this one came
12117- * from user space.
12118- */
12119- if (is_prefetch(regs, address, error_code))
12120- return;
12121-
12122- if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
12123- printk_ratelimit()) {
12124- printk("%s%s[%d]: segfault at %08lx eip %08lx "
12125- "esp %08lx error %lx\n",
12126- task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
12127- tsk->comm, task_pid_nr(tsk), address, regs->eip,
12128- regs->esp, error_code);
12129- }
12130- tsk->thread.cr2 = address;
12131- /* Kernel addresses are always protection faults */
12132- tsk->thread.error_code = error_code | (address >= TASK_SIZE);
12133- tsk->thread.trap_no = 14;
12134- force_sig_info_fault(SIGSEGV, si_code, address, tsk);
12135- return;
12136- }
12137-
12138-#ifdef CONFIG_X86_F00F_BUG
12139- /*
12140- * Pentium F0 0F C7 C8 bug workaround.
12141- */
12142- if (boot_cpu_data.f00f_bug) {
12143- unsigned long nr;
12144-
12145- nr = (address - idt_descr.address) >> 3;
12146-
12147- if (nr == 6) {
12148- do_invalid_op(regs, 0);
12149- return;
12150- }
12151- }
12152-#endif
12153-
12154-no_context:
12155- /* Are we prepared to handle this kernel fault? */
12156- if (fixup_exception(regs))
12157- return;
12158-
12159- /*
12160- * Valid to do another page fault here, because if this fault
12161- * had been triggered by is_prefetch fixup_exception would have
12162- * handled it.
12163- */
12164- if (is_prefetch(regs, address, error_code))
12165- return;
12166-
12167-/*
12168- * Oops. The kernel tried to access some bad page. We'll have to
12169- * terminate things with extreme prejudice.
12170- */
12171-
12172- bust_spinlocks(1);
12173-
12174- if (oops_may_print()) {
12175-#ifdef CONFIG_X86_PAE
12176- if (error_code & 16) {
12177- pte_t *pte = lookup_address(address);
12178-
12179- if (pte && pte_present(*pte) && !pte_exec_kernel(*pte))
12180- printk(KERN_CRIT "kernel tried to execute "
12181- "NX-protected page - exploit attempt? "
12182- "(uid: %d)\n", current->uid);
12183- }
12184-#endif
12185- if (address < PAGE_SIZE)
12186- printk(KERN_ALERT "BUG: unable to handle kernel NULL "
12187- "pointer dereference");
12188- else
12189- printk(KERN_ALERT "BUG: unable to handle kernel paging"
12190- " request");
12191- printk(" at virtual address %08lx\n",address);
12192- printk(KERN_ALERT "printing eip: %08lx\n", regs->eip);
12193- dump_fault_path(address);
12194- }
12195- tsk->thread.cr2 = address;
12196- tsk->thread.trap_no = 14;
12197- tsk->thread.error_code = error_code;
12198- die("Oops", regs, error_code);
12199- bust_spinlocks(0);
12200- do_exit(SIGKILL);
12201-
12202-/*
12203- * We ran out of memory, or some other thing happened to us that made
12204- * us unable to handle the page fault gracefully.
12205- */
12206-out_of_memory:
12207- up_read(&mm->mmap_sem);
12208- if (is_global_init(tsk)) {
12209- yield();
12210- down_read(&mm->mmap_sem);
12211- goto survive;
12212- }
12213- printk("VM: killing process %s\n", tsk->comm);
12214- if (error_code & 4)
12215- do_group_exit(SIGKILL);
12216- goto no_context;
12217-
12218-do_sigbus:
12219- up_read(&mm->mmap_sem);
12220-
12221- /* Kernel mode? Handle exceptions or die */
12222- if (!(error_code & 4))
12223- goto no_context;
12224-
12225- /* User space => ok to do another page fault */
12226- if (is_prefetch(regs, address, error_code))
12227- return;
12228-
12229- tsk->thread.cr2 = address;
12230- tsk->thread.error_code = error_code;
12231- tsk->thread.trap_no = 14;
12232- force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
12233-}
12234-
12235-void vmalloc_sync_all(void)
12236-{
12237- /*
12238- * Note that races in the updates of insync and start aren't
12239- * problematic: insync can only get set bits added, and updates to
12240- * start are only improving performance (without affecting correctness
12241- * if undone).
12242- * XEN: To work on PAE, we need to iterate over PMDs rather than PGDs.
12243- * This change works just fine with 2-level paging too.
12244- */
12245-#define sync_index(a) ((a) >> PMD_SHIFT)
12246- static DECLARE_BITMAP(insync, PTRS_PER_PGD*PTRS_PER_PMD);
12247- static unsigned long start = TASK_SIZE;
12248- unsigned long address;
12249-
12250- if (SHARED_KERNEL_PMD)
12251- return;
12252-
12253- BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK);
12254- for (address = start;
12255- address >= TASK_SIZE && address < hypervisor_virt_start;
12256- address += 1UL << PMD_SHIFT) {
12257- if (!test_bit(sync_index(address), insync)) {
12258- unsigned long flags;
12259- struct page *page;
12260-
12261- spin_lock_irqsave(&pgd_lock, flags);
12262- /* XEN: failure path assumes non-empty pgd_list. */
12263- if (unlikely(!pgd_list)) {
12264- spin_unlock_irqrestore(&pgd_lock, flags);
12265- return;
12266- }
12267- for (page = pgd_list; page; page =
12268- (struct page *)page->index)
12269- if (!vmalloc_sync_one(page_address(page),
12270- address)) {
12271- BUG_ON(page != pgd_list);
12272- break;
12273- }
12274- spin_unlock_irqrestore(&pgd_lock, flags);
12275- if (!page)
12276- set_bit(sync_index(address), insync);
12277- }
12278- if (address == start && test_bit(sync_index(address), insync))
12279- start = address + (1UL << PMD_SHIFT);
12280- }
12281-}
12282--- sle11-2009-06-29.orig/arch/x86/mm/fault_64-xen.c 2009-02-16 16:18:36.000000000 +0100
12283+++ /dev/null 1970-01-01 00:00:00.000000000 +0000
12284@@ -1,686 +0,0 @@
12285-/*
12286- * linux/arch/x86-64/mm/fault.c
12287- *
12288- * Copyright (C) 1995 Linus Torvalds
12289- * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs.
12290- */
12291-
12292-#include <linux/signal.h>
12293-#include <linux/sched.h>
12294-#include <linux/kernel.h>
12295-#include <linux/errno.h>
12296-#include <linux/string.h>
12297-#include <linux/types.h>
12298-#include <linux/ptrace.h>
12299-#include <linux/mman.h>
12300-#include <linux/mm.h>
12301-#include <linux/smp.h>
12302-#include <linux/interrupt.h>
12303-#include <linux/init.h>
12304-#include <linux/tty.h>
12305-#include <linux/vt_kern.h> /* For unblank_screen() */
12306-#include <linux/compiler.h>
12307-#include <linux/vmalloc.h>
12308-#include <linux/module.h>
12309-#include <linux/kprobes.h>
12310-#include <linux/uaccess.h>
12311-#include <linux/kdebug.h>
12312-#include <linux/kprobes.h>
12313-
12314-#include <asm/system.h>
12315-#include <asm/pgalloc.h>
12316-#include <asm/smp.h>
12317-#include <asm/tlbflush.h>
12318-#include <asm/proto.h>
12319-#include <asm-generic/sections.h>
12320-
12321-/* Page fault error code bits */
12322-#define PF_PROT (1<<0) /* or no page found */
12323-#define PF_WRITE (1<<1)
12324-#define PF_USER (1<<2)
12325-#define PF_RSVD (1<<3)
12326-#define PF_INSTR (1<<4)
12327-
12328-#ifdef CONFIG_KPROBES
12329-static inline int notify_page_fault(struct pt_regs *regs)
12330-{
12331- int ret = 0;
12332-
12333- /* kprobe_running() needs smp_processor_id() */
12334- if (!user_mode(regs)) {
12335- preempt_disable();
12336- if (kprobe_running() && kprobe_fault_handler(regs, 14))
12337- ret = 1;
12338- preempt_enable();
12339- }
12340-
12341- return ret;
12342-}
12343-#else
12344-static inline int notify_page_fault(struct pt_regs *regs)
12345-{
12346- return 0;
12347-}
12348-#endif
12349-
12350-/* Sometimes the CPU reports invalid exceptions on prefetch.
12351- Check that here and ignore.
12352- Opcode checker based on code by Richard Brunner */
12353-static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr,
12354- unsigned long error_code)
12355-{
12356- unsigned char *instr;
12357- int scan_more = 1;
12358- int prefetch = 0;
12359- unsigned char *max_instr;
12360-
12361- /* If it was a exec fault ignore */
12362- if (error_code & PF_INSTR)
12363- return 0;
12364-
12365- instr = (unsigned char __user *)convert_rip_to_linear(current, regs);
12366- max_instr = instr + 15;
12367-
12368- if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
12369- return 0;
12370-
12371- while (scan_more && instr < max_instr) {
12372- unsigned char opcode;
12373- unsigned char instr_hi;
12374- unsigned char instr_lo;
12375-
12376- if (probe_kernel_address(instr, opcode))
12377- break;
12378-
12379- instr_hi = opcode & 0xf0;
12380- instr_lo = opcode & 0x0f;
12381- instr++;
12382-
12383- switch (instr_hi) {
12384- case 0x20:
12385- case 0x30:
12386- /* Values 0x26,0x2E,0x36,0x3E are valid x86
12387- prefixes. In long mode, the CPU will signal
12388- invalid opcode if some of these prefixes are
12389- present so we will never get here anyway */
12390- scan_more = ((instr_lo & 7) == 0x6);
12391- break;
12392-
12393- case 0x40:
12394- /* In AMD64 long mode, 0x40 to 0x4F are valid REX prefixes
12395- Need to figure out under what instruction mode the
12396- instruction was issued ... */
12397- /* Could check the LDT for lm, but for now it's good
12398- enough to assume that long mode only uses well known
12399- segments or kernel. */
12400- scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS);
12401- break;
12402-
12403- case 0x60:
12404- /* 0x64 thru 0x67 are valid prefixes in all modes. */
12405- scan_more = (instr_lo & 0xC) == 0x4;
12406- break;
12407- case 0xF0:
12408- /* 0xF0, 0xF2, and 0xF3 are valid prefixes in all modes. */
12409- scan_more = !instr_lo || (instr_lo>>1) == 1;
12410- break;
12411- case 0x00:
12412- /* Prefetch instruction is 0x0F0D or 0x0F18 */
12413- scan_more = 0;
12414- if (probe_kernel_address(instr, opcode))
12415- break;
12416- prefetch = (instr_lo == 0xF) &&
12417- (opcode == 0x0D || opcode == 0x18);
12418- break;
12419- default:
12420- scan_more = 0;
12421- break;
12422- }
12423- }
12424- return prefetch;
12425-}
12426-
12427-static int bad_address(void *p)
12428-{
12429- unsigned long dummy;
12430- return probe_kernel_address((unsigned long *)p, dummy);
12431-}
12432-
12433-void dump_pagetable(unsigned long address)
12434-{
12435- pgd_t *pgd;
12436- pud_t *pud;
12437- pmd_t *pmd;
12438- pte_t *pte;
12439-
12440- pgd = (pgd_t *)read_cr3();
12441-
12442- pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK);
12443- pgd += pgd_index(address);
12444- if (bad_address(pgd)) goto bad;
12445- printk("PGD %lx ", pgd_val(*pgd));
12446- if (!pgd_present(*pgd)) goto ret;
12447-
12448- pud = pud_offset(pgd, address);
12449- if (bad_address(pud)) goto bad;
12450- printk("PUD %lx ", pud_val(*pud));
12451- if (!pud_present(*pud)) goto ret;
12452-
12453- pmd = pmd_offset(pud, address);
12454- if (bad_address(pmd)) goto bad;
12455- printk("PMD %lx ", pmd_val(*pmd));
12456- if (!pmd_present(*pmd) || pmd_large(*pmd)) goto ret;
12457-
12458- pte = pte_offset_kernel(pmd, address);
12459- if (bad_address(pte)) goto bad;
12460- printk("PTE %lx", pte_val(*pte));
12461-ret:
12462- printk("\n");
12463- return;
12464-bad:
12465- printk("BAD\n");
12466-}
12467-
12468-static const char errata93_warning[] =
12469-KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
12470-KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
12471-KERN_ERR "******* Please consider a BIOS update.\n"
12472-KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";
12473-
12474-/* Workaround for K8 erratum #93 & buggy BIOS.
12475- BIOS SMM functions are required to use a specific workaround
12476- to avoid corruption of the 64bit RIP register on C stepping K8.
12477- A lot of BIOS that didn't get tested properly miss this.
12478- The OS sees this as a page fault with the upper 32bits of RIP cleared.
12479- Try to work around it here.
12480- Note we only handle faults in kernel here. */
12481-
12482-static int is_errata93(struct pt_regs *regs, unsigned long address)
12483-{
12484- static int warned;
12485- if (address != regs->rip)
12486- return 0;
12487- if ((address >> 32) != 0)
12488- return 0;
12489- address |= 0xffffffffUL << 32;
12490- if ((address >= (u64)_stext && address <= (u64)_etext) ||
12491- (address >= MODULES_VADDR && address <= MODULES_END)) {
12492- if (!warned) {
12493- printk(errata93_warning);
12494- warned = 1;
12495- }
12496- regs->rip = address;
12497- return 1;
12498- }
12499- return 0;
12500-}
12501-
12502-static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
12503- unsigned long error_code)
12504-{
12505- unsigned long flags = oops_begin();
12506- struct task_struct *tsk;
12507-
12508- printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
12509- current->comm, address);
12510- dump_pagetable(address);
12511- tsk = current;
12512- tsk->thread.cr2 = address;
12513- tsk->thread.trap_no = 14;
12514- tsk->thread.error_code = error_code;
12515- __die("Bad pagetable", regs, error_code);
12516- oops_end(flags);
12517- do_exit(SIGKILL);
12518-}
12519-
12520-/*
12521- * Handle a fault on the vmalloc area
12522- *
12523- * This assumes no large pages in there.
12524- */
12525-static int vmalloc_fault(unsigned long address)
12526-{
12527- pgd_t *pgd, *pgd_ref;
12528- pud_t *pud, *pud_ref;
12529- pmd_t *pmd, *pmd_ref;
12530- pte_t *pte, *pte_ref;
12531-
12532- /* Copy kernel mappings over when needed. This can also
12533- happen within a race in page table update. In the later
12534- case just flush. */
12535-
12536- /* On Xen the line below does not always work. Needs investigating! */
12537- /*pgd = pgd_offset(current->mm ?: &init_mm, address);*/
12538- pgd = __va(read_cr3() & PHYSICAL_PAGE_MASK);
12539- pgd += pgd_index(address);
12540- pgd_ref = pgd_offset_k(address);
12541- if (pgd_none(*pgd_ref))
12542- return -1;
12543- if (pgd_none(*pgd))
12544- set_pgd(pgd, *pgd_ref);
12545- else
12546- BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
12547-
12548- /* Below here mismatches are bugs because these lower tables
12549- are shared */
12550-
12551- pud = pud_offset(pgd, address);
12552- pud_ref = pud_offset(pgd_ref, address);
12553- if (pud_none(*pud_ref))
12554- return -1;
12555- if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
12556- BUG();
12557- pmd = pmd_offset(pud, address);
12558- pmd_ref = pmd_offset(pud_ref, address);
12559- if (pmd_none(*pmd_ref))
12560- return -1;
12561- if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
12562- BUG();
12563- pte_ref = pte_offset_kernel(pmd_ref, address);
12564- if (!pte_present(*pte_ref))
12565- return -1;
12566- pte = pte_offset_kernel(pmd, address);
12567- /* Don't use pte_page here, because the mappings can point
12568- outside mem_map, and the NUMA hash lookup cannot handle
12569- that. */
12570- if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
12571- BUG();
12572- return 0;
12573-}
12574-
12575-int show_unhandled_signals = 1;
12576-
12577-
12578-#define MEM_VERBOSE 1
12579-
12580-#ifdef MEM_VERBOSE
12581-#define MEM_LOG(_f, _a...) \
12582- printk("fault.c:[%d]-> " _f "\n", \
12583- __LINE__ , ## _a )
12584-#else
12585-#define MEM_LOG(_f, _a...) ((void)0)
12586-#endif
12587-
12588-static int spurious_fault(struct pt_regs *regs,
12589- unsigned long address,
12590- unsigned long error_code)
12591-{
12592- pgd_t *pgd;
12593- pud_t *pud;
12594- pmd_t *pmd;
12595- pte_t *pte;
12596-
12597-#ifdef CONFIG_XEN
12598- /* Faults in hypervisor area are never spurious. */
12599- if ((address >= HYPERVISOR_VIRT_START) &&
12600- (address < HYPERVISOR_VIRT_END))
12601- return 0;
12602-#endif
12603-
12604- /* Reserved-bit violation or user access to kernel space? */
12605- if (error_code & (PF_RSVD|PF_USER))
12606- return 0;
12607-
12608- pgd = init_mm.pgd + pgd_index(address);
12609- if (!pgd_present(*pgd))
12610- return 0;
12611-
12612- pud = pud_offset(pgd, address);
12613- if (!pud_present(*pud))
12614- return 0;
12615-
12616- pmd = pmd_offset(pud, address);
12617- if (!pmd_present(*pmd))
12618- return 0;
12619-
12620- pte = pte_offset_kernel(pmd, address);
12621- if (!pte_present(*pte))
12622- return 0;
12623- if ((error_code & PF_WRITE) && !pte_write(*pte))
12624- return 0;
12625- if ((error_code & PF_INSTR) && (__pte_val(*pte) & _PAGE_NX))
12626- return 0;
12627-
12628- return 1;
12629-}
12630-
12631-/*
12632- * This routine handles page faults. It determines the address,
12633- * and the problem, and then passes it off to one of the appropriate
12634- * routines.
12635- */
12636-asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
12637- unsigned long error_code)
12638-{
12639- struct task_struct *tsk;
12640- struct mm_struct *mm;
12641- struct vm_area_struct * vma;
12642- unsigned long address;
12643- const struct exception_table_entry *fixup;
12644- int write, fault;
12645- unsigned long flags;
12646- siginfo_t info;
12647-
12648- if (!user_mode(regs))
12649- error_code &= ~PF_USER; /* means kernel */
12650-
12651- /*
12652- * We can fault from pretty much anywhere, with unknown IRQ state.
12653- */
12654- trace_hardirqs_fixup();
12655-
12656- tsk = current;
12657- mm = tsk->mm;
12658- prefetchw(&mm->mmap_sem);
12659-
12660- /* get the address */
12661- address = read_cr2();
12662-
12663- info.si_code = SEGV_MAPERR;
12664-
12665-
12666- /*
12667- * We fault-in kernel-space virtual memory on-demand. The
12668- * 'reference' page table is init_mm.pgd.
12669- *
12670- * NOTE! We MUST NOT take any locks for this case. We may
12671- * be in an interrupt or a critical region, and should
12672- * only copy the information from the master page table,
12673- * nothing more.
12674- *
12675- * This verifies that the fault happens in kernel space
12676- * (error_code & 4) == 0, and that the fault was not a
12677- * protection error (error_code & 9) == 0.
12678- */
12679- if (unlikely(address >= TASK_SIZE64)) {
12680- /*
12681- * Don't check for the module range here: its PML4
12682- * is always initialized because it's shared with the main
12683- * kernel text. Only vmalloc may need PML4 syncups.
12684- */
12685- if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
12686- ((address >= VMALLOC_START && address < VMALLOC_END))) {
12687- if (vmalloc_fault(address) >= 0)
12688- return;
12689- }
12690- /* Can take a spurious fault if mapping changes R/O -> R/W. */
12691- if (spurious_fault(regs, address, error_code))
12692- return;
12693- if (notify_page_fault(regs))
12694- return;
12695- /*
12696- * Don't take the mm semaphore here. If we fixup a prefetch
12697- * fault we could otherwise deadlock.
12698- */
12699- goto bad_area_nosemaphore;
12700- }
12701-
12702- if (notify_page_fault(regs))
12703- return;
12704-
12705- if (likely(regs->eflags & X86_EFLAGS_IF))
12706- local_irq_enable();
12707-
12708- if (unlikely(error_code & PF_RSVD))
12709- pgtable_bad(address, regs, error_code);
12710-
12711- /*
12712- * If we're in an interrupt or have no user
12713- * context, we must not take the fault..
12714- */
12715- if (unlikely(in_atomic() || !mm))
12716- goto bad_area_nosemaphore;
12717-
12718- /*
12719- * User-mode registers count as a user access even for any
12720- * potential system fault or CPU buglet.
12721- */
12722- if (user_mode_vm(regs))
12723- error_code |= PF_USER;
12724-
12725- again:
12726- /* When running in the kernel we expect faults to occur only to
12727- * addresses in user space. All other faults represent errors in the
12728- * kernel and should generate an OOPS. Unfortunately, in the case of an
12729- * erroneous fault occurring in a code path which already holds mmap_sem
12730- * we will deadlock attempting to validate the fault against the
12731- * address space. Luckily the kernel only validly references user
12732- * space from well defined areas of code, which are listed in the
12733- * exceptions table.
12734- *
12735- * As the vast majority of faults will be valid we will only perform
12736- * the source reference check when there is a possibility of a deadlock.
12737- * Attempt to lock the address space, if we cannot we then validate the
12738- * source. If this is invalid we can skip the address space check,
12739- * thus avoiding the deadlock.
12740- */
12741- if (!down_read_trylock(&mm->mmap_sem)) {
12742- if ((error_code & PF_USER) == 0 &&
12743- !search_exception_tables(regs->rip))
12744- goto bad_area_nosemaphore;
12745- down_read(&mm->mmap_sem);
12746- }
12747-
12748- vma = find_vma(mm, address);
12749- if (!vma)
12750- goto bad_area;
12751- if (likely(vma->vm_start <= address))
12752- goto good_area;
12753- if (!(vma->vm_flags & VM_GROWSDOWN))
12754- goto bad_area;
12755- if (error_code & 4) {
12756- /* Allow userspace just enough access below the stack pointer
12757- * to let the 'enter' instruction work.
12758- */
12759- if (address + 65536 + 32 * sizeof(unsigned long) < regs->rsp)
12760- goto bad_area;
12761- }
12762- if (expand_stack(vma, address))
12763- goto bad_area;
12764-/*
12765- * Ok, we have a good vm_area for this memory access, so
12766- * we can handle it..
12767- */
12768-good_area:
12769- info.si_code = SEGV_ACCERR;
12770- write = 0;
12771- switch (error_code & (PF_PROT|PF_WRITE)) {
12772- default: /* 3: write, present */
12773- /* fall through */
12774- case PF_WRITE: /* write, not present */
12775- if (!(vma->vm_flags & VM_WRITE))
12776- goto bad_area;
12777- write++;
12778- break;
12779- case PF_PROT: /* read, present */
12780- goto bad_area;
12781- case 0: /* read, not present */
12782- if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
12783- goto bad_area;
12784- }
12785-
12786- /*
12787- * If for any reason at all we couldn't handle the fault,
12788- * make sure we exit gracefully rather than endlessly redo
12789- * the fault.
12790- */
12791- fault = handle_mm_fault(mm, vma, address, write);
12792- if (unlikely(fault & VM_FAULT_ERROR)) {
12793- if (fault & VM_FAULT_OOM)
12794- goto out_of_memory;
12795- else if (fault & VM_FAULT_SIGBUS)
12796- goto do_sigbus;
12797- BUG();
12798- }
12799- if (fault & VM_FAULT_MAJOR)
12800- tsk->maj_flt++;
12801- else
12802- tsk->min_flt++;
12803- up_read(&mm->mmap_sem);
12804- return;
12805-
12806-/*
12807- * Something tried to access memory that isn't in our memory map..
12808- * Fix it, but check if it's kernel or user first..
12809- */
12810-bad_area:
12811- up_read(&mm->mmap_sem);
12812-
12813-bad_area_nosemaphore:
12814- /* User mode accesses just cause a SIGSEGV */
12815- if (error_code & PF_USER) {
12816-
12817- /*
12818- * It's possible to have interrupts off here.
12819- */
12820- local_irq_enable();
12821-
12822- if (is_prefetch(regs, address, error_code))
12823- return;
12824-
12825- /* Work around K8 erratum #100 K8 in compat mode
12826- occasionally jumps to illegal addresses >4GB. We
12827- catch this here in the page fault handler because
12828- these addresses are not reachable. Just detect this
12829- case and return. Any code segment in LDT is
12830- compatibility mode. */
12831- if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) &&
12832- (address >> 32))
12833- return;
12834-
12835- if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
12836- printk_ratelimit()) {
12837- printk(
12838- "%s%s[%d]: segfault at %lx rip %lx rsp %lx error %lx\n",
12839- tsk->pid > 1 ? KERN_INFO : KERN_EMERG,
12840- tsk->comm, tsk->pid, address, regs->rip,
12841- regs->rsp, error_code);
12842- }
12843-
12844- tsk->thread.cr2 = address;
12845- /* Kernel addresses are always protection faults */
12846- tsk->thread.error_code = error_code | (address >= TASK_SIZE);
12847- tsk->thread.trap_no = 14;
12848- info.si_signo = SIGSEGV;
12849- info.si_errno = 0;
12850- /* info.si_code has been set above */
12851- info.si_addr = (void __user *)address;
12852- force_sig_info(SIGSEGV, &info, tsk);
12853- return;
12854- }
12855-
12856-no_context:
12857-
12858- /* Are we prepared to handle this kernel fault? */
12859- fixup = search_exception_tables(regs->rip);
12860- if (fixup) {
12861- regs->rip = fixup->fixup;
12862- return;
12863- }
12864-
12865- /*
12866- * Hall of shame of CPU/BIOS bugs.
12867- */
12868-
12869- if (is_prefetch(regs, address, error_code))
12870- return;
12871-
12872- if (is_errata93(regs, address))
12873- return;
12874-
12875-/*
12876- * Oops. The kernel tried to access some bad page. We'll have to
12877- * terminate things with extreme prejudice.
12878- */
12879-
12880- flags = oops_begin();
12881-
12882- if (address < PAGE_SIZE)
12883- printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference");
12884- else
12885- printk(KERN_ALERT "Unable to handle kernel paging request");
12886- printk(" at %016lx RIP: \n" KERN_ALERT,address);
12887- printk_address(regs->rip);
12888- dump_pagetable(address);
12889- tsk->thread.cr2 = address;
12890- tsk->thread.trap_no = 14;
12891- tsk->thread.error_code = error_code;
12892- __die("Oops", regs, error_code);
12893- /* Executive summary in case the body of the oops scrolled away */
12894- printk(KERN_EMERG "CR2: %016lx\n", address);
12895- oops_end(flags);
12896- do_exit(SIGKILL);
12897-
12898-/*
12899- * We ran out of memory, or some other thing happened to us that made
12900- * us unable to handle the page fault gracefully.
12901- */
12902-out_of_memory:
12903- up_read(&mm->mmap_sem);
12904- if (is_global_init(current)) {
12905- yield();
12906- goto again;
12907- }
12908- printk("VM: killing process %s\n", tsk->comm);
12909- if (error_code & 4)
12910- do_group_exit(SIGKILL);
12911- goto no_context;
12912-
12913-do_sigbus:
12914- up_read(&mm->mmap_sem);
12915-
12916- /* Kernel mode? Handle exceptions or die */
12917- if (!(error_code & PF_USER))
12918- goto no_context;
12919-
12920- tsk->thread.cr2 = address;
12921- tsk->thread.error_code = error_code;
12922- tsk->thread.trap_no = 14;
12923- info.si_signo = SIGBUS;
12924- info.si_errno = 0;
12925- info.si_code = BUS_ADRERR;
12926- info.si_addr = (void __user *)address;
12927- force_sig_info(SIGBUS, &info, tsk);
12928- return;
12929-}
12930-
12931-DEFINE_SPINLOCK(pgd_lock);
12932-LIST_HEAD(pgd_list);
12933-
12934-void vmalloc_sync_all(void)
12935-{
12936- /* Note that races in the updates of insync and start aren't
12937- problematic:
12938- insync can only get set bits added, and updates to start are only
12939- improving performance (without affecting correctness if undone). */
12940- static DECLARE_BITMAP(insync, PTRS_PER_PGD);
12941- static unsigned long start = VMALLOC_START & PGDIR_MASK;
12942- unsigned long address;
12943-
12944- for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
12945- if (!test_bit(pgd_index(address), insync)) {
12946- const pgd_t *pgd_ref = pgd_offset_k(address);
12947- struct page *page;
12948-
12949- if (pgd_none(*pgd_ref))
12950- continue;
12951- spin_lock(&pgd_lock);
12952- list_for_each_entry(page, &pgd_list, lru) {
12953- pgd_t *pgd;
12954- pgd = (pgd_t *)page_address(page) + pgd_index(address);
12955- if (pgd_none(*pgd))
12956- set_pgd(pgd, *pgd_ref);
12957- else
12958- BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
12959- }
12960- spin_unlock(&pgd_lock);
12961- set_bit(pgd_index(address), insync);
12962- }
12963- if (address == start)
12964- start = address + PGDIR_SIZE;
12965- }
12966- /* Check that there is no need to do the same for the modules area. */
12967- BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
12968- BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
12969- (__START_KERNEL & PGDIR_MASK)));
12970-}
12971--- sle11-2009-06-29.orig/arch/x86/mm/highmem_32-xen.c 2009-02-16 16:17:21.000000000 +0100
12972+++ sle11-2009-06-29/arch/x86/mm/highmem_32-xen.c 2009-03-16 16:33:40.000000000 +0100
12973@@ -18,6 +18,49 @@ void kunmap(struct page *page)
12974 kunmap_high(page);
12975 }
12976
12977+static void debug_kmap_atomic_prot(enum km_type type)
12978+{
12979+#ifdef CONFIG_DEBUG_HIGHMEM
12980+ static unsigned warn_count = 10;
12981+
12982+ if (unlikely(warn_count == 0))
12983+ return;
12984+
12985+ if (unlikely(in_interrupt())) {
12986+ if (in_irq()) {
12987+ if (type != KM_IRQ0 && type != KM_IRQ1 &&
12988+ type != KM_BIO_SRC_IRQ && type != KM_BIO_DST_IRQ &&
12989+ type != KM_BOUNCE_READ) {
12990+ WARN_ON(1);
12991+ warn_count--;
12992+ }
12993+ } else if (!irqs_disabled()) { /* softirq */
12994+ if (type != KM_IRQ0 && type != KM_IRQ1 &&
12995+ type != KM_SOFTIRQ0 && type != KM_SOFTIRQ1 &&
12996+ type != KM_SKB_SUNRPC_DATA &&
12997+ type != KM_SKB_DATA_SOFTIRQ &&
12998+ type != KM_BOUNCE_READ) {
12999+ WARN_ON(1);
13000+ warn_count--;
13001+ }
13002+ }
13003+ }
13004+
13005+ if (type == KM_IRQ0 || type == KM_IRQ1 || type == KM_BOUNCE_READ ||
13006+ type == KM_BIO_SRC_IRQ || type == KM_BIO_DST_IRQ) {
13007+ if (!irqs_disabled()) {
13008+ WARN_ON(1);
13009+ warn_count--;
13010+ }
13011+ } else if (type == KM_SOFTIRQ0 || type == KM_SOFTIRQ1) {
13012+ if (irq_count() == 0 && !irqs_disabled()) {
13013+ WARN_ON(1);
13014+ warn_count--;
13015+ }
13016+ }
13017+#endif
13018+}
13019+
13020 /*
13021 * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because
13022 * no global lock is needed and because the kmap code must perform a global TLB
13023@@ -37,6 +80,8 @@ void *kmap_atomic_prot(struct page *page
13024 if (!PageHighMem(page))
13025 return page_address(page);
13026
13027+ debug_kmap_atomic_prot(type);
13028+
13029 idx = type + KM_TYPE_NR*smp_processor_id();
13030 vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
13031 BUG_ON(!pte_none(*(kmap_pte-idx)));
13032--- sle11-2009-06-29.orig/arch/x86/mm/hypervisor.c 2009-05-06 10:23:43.000000000 +0200
13033+++ sle11-2009-06-29/arch/x86/mm/hypervisor.c 2009-05-14 11:18:39.000000000 +0200
13034@@ -869,15 +869,11 @@ int xen_limit_pages_to_max_mfn(
13035 }
13036 EXPORT_SYMBOL_GPL(xen_limit_pages_to_max_mfn);
13037
13038-#ifdef __i386__
13039-int write_ldt_entry(void *ldt, int entry, __u32 entry_a, __u32 entry_b)
13040+int write_ldt_entry(struct desc_struct *ldt, int entry, const void *desc)
13041 {
13042- __u32 *lp = (__u32 *)((char *)ldt + entry * 8);
13043- maddr_t mach_lp = arbitrary_virt_to_machine(lp);
13044- return HYPERVISOR_update_descriptor(
13045- mach_lp, (u64)entry_a | ((u64)entry_b<<32));
13046+ maddr_t mach_lp = arbitrary_virt_to_machine(ldt + entry);
13047+ return HYPERVISOR_update_descriptor(mach_lp, *(const u64*)desc);
13048 }
13049-#endif
13050
13051 #define MAX_BATCHED_FULL_PTES 32
13052
13053--- sle11-2009-06-29.orig/arch/x86/mm/init_32-xen.c 2009-02-16 16:18:36.000000000 +0100
13054+++ sle11-2009-06-29/arch/x86/mm/init_32-xen.c 2009-03-16 16:33:40.000000000 +0100
13055@@ -27,13 +27,13 @@
13056 #include <linux/bootmem.h>
13057 #include <linux/slab.h>
13058 #include <linux/proc_fs.h>
13059-#include <linux/efi.h>
13060 #include <linux/memory_hotplug.h>
13061 #include <linux/initrd.h>
13062 #include <linux/cpumask.h>
13063 #include <linux/dma-mapping.h>
13064 #include <linux/scatterlist.h>
13065
13066+#include <asm/asm.h>
13067 #include <asm/processor.h>
13068 #include <asm/system.h>
13069 #include <asm/uaccess.h>
13070@@ -42,18 +42,22 @@
13071 #include <asm/fixmap.h>
13072 #include <asm/e820.h>
13073 #include <asm/apic.h>
13074+#include <asm/bugs.h>
13075 #include <asm/tlb.h>
13076 #include <asm/tlbflush.h>
13077+#include <asm/pgalloc.h>
13078 #include <asm/sections.h>
13079 #include <asm/hypervisor.h>
13080 #include <asm/swiotlb.h>
13081+#include <asm/setup.h>
13082+#include <asm/cacheflush.h>
13083
13084 unsigned int __VMALLOC_RESERVE = 128 << 20;
13085
13086 DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
13087 unsigned long highstart_pfn, highend_pfn;
13088
13089-static int noinline do_test_wp_bit(void);
13090+static noinline int do_test_wp_bit(void);
13091
13092 /*
13093 * Creates a middle page table and puts a pointer to it in the
13094@@ -64,17 +68,16 @@ static pmd_t * __init one_md_table_init(
13095 {
13096 pud_t *pud;
13097 pmd_t *pmd_table;
13098-
13099+
13100 #ifdef CONFIG_X86_PAE
13101 if (!(__pgd_val(*pgd) & _PAGE_PRESENT)) {
13102 pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE);
13103
13104- paravirt_alloc_pd(__pa(pmd_table) >> PAGE_SHIFT);
13105+ paravirt_alloc_pd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT);
13106 make_lowmem_page_readonly(pmd_table, XENFEAT_writable_page_tables);
13107 set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
13108 pud = pud_offset(pgd, 0);
13109- if (pmd_table != pmd_offset(pud, 0))
13110- BUG();
13111+ BUG_ON(pmd_table != pmd_offset(pud, 0));
13112 }
13113 #endif
13114 pud = pud_offset(pgd, 0);
13115@@ -85,7 +88,7 @@ static pmd_t * __init one_md_table_init(
13116
13117 /*
13118 * Create a page table and place a pointer to it in a middle page
13119- * directory entry.
13120+ * directory entry:
13121 */
13122 static pte_t * __init one_page_table_init(pmd_t *pmd)
13123 {
13124@@ -99,9 +102,10 @@ static pte_t * __init one_page_table_ini
13125 #ifdef CONFIG_DEBUG_PAGEALLOC
13126 page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE);
13127 #endif
13128- if (!page_table)
13129+ if (!page_table) {
13130 page_table =
13131 (pte_t *)alloc_bootmem_low_pages(PAGE_SIZE);
13132+ }
13133
13134 paravirt_alloc_pt(&init_mm, __pa(page_table) >> PAGE_SHIFT);
13135 make_lowmem_page_readonly(page_table,
13136@@ -114,22 +118,21 @@ static pte_t * __init one_page_table_ini
13137 }
13138
13139 /*
13140- * This function initializes a certain range of kernel virtual memory
13141+ * This function initializes a certain range of kernel virtual memory
13142 * with new bootmem page tables, everywhere page tables are missing in
13143 * the given range.
13144- */
13145-
13146-/*
13147- * NOTE: The pagetables are allocated contiguous on the physical space
13148- * so we can cache the place of the first one and move around without
13149+ *
13150+ * NOTE: The pagetables are allocated contiguous on the physical space
13151+ * so we can cache the place of the first one and move around without
13152 * checking the pgd every time.
13153 */
13154-static void __init page_table_range_init (unsigned long start, unsigned long end, pgd_t *pgd_base)
13155+static void __init
13156+page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd_base)
13157 {
13158- pgd_t *pgd;
13159- pmd_t *pmd;
13160 int pgd_idx, pmd_idx;
13161 unsigned long vaddr;
13162+ pgd_t *pgd;
13163+ pmd_t *pmd;
13164
13165 vaddr = start;
13166 pgd_idx = pgd_index(vaddr);
13167@@ -139,7 +142,8 @@ static void __init page_table_range_init
13168 for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd++, pgd_idx++) {
13169 pmd = one_md_table_init(pgd);
13170 pmd = pmd + pmd_index(vaddr);
13171- for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end); pmd++, pmd_idx++) {
13172+ for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end);
13173+ pmd++, pmd_idx++) {
13174 if (vaddr < hypervisor_virt_start)
13175 one_page_table_init(pmd);
13176
13177@@ -157,17 +161,17 @@ static inline int is_kernel_text(unsigne
13178 }
13179
13180 /*
13181- * This maps the physical memory to kernel virtual address space, a total
13182- * of max_low_pfn pages, by creating page tables starting from address
13183- * PAGE_OFFSET.
13184+ * This maps the physical memory to kernel virtual address space, a total
13185+ * of max_low_pfn pages, by creating page tables starting from address
13186+ * PAGE_OFFSET:
13187 */
13188 static void __init kernel_physical_mapping_init(pgd_t *pgd_base)
13189 {
13190+ int pgd_idx, pmd_idx, pte_ofs;
13191 unsigned long pfn;
13192 pgd_t *pgd;
13193 pmd_t *pmd;
13194 pte_t *pte;
13195- int pgd_idx, pmd_idx, pte_ofs;
13196
13197 unsigned long max_ram_pfn = xen_start_info->nr_pages;
13198 if (max_ram_pfn > max_low_pfn)
13199@@ -195,36 +199,49 @@ static void __init kernel_physical_mappi
13200 if (pfn >= max_low_pfn)
13201 continue;
13202 pmd += pmd_idx;
13203- for (; pmd_idx < PTRS_PER_PMD && pfn < max_low_pfn; pmd++, pmd_idx++) {
13204- unsigned int address = pfn * PAGE_SIZE + PAGE_OFFSET;
13205- if (address >= hypervisor_virt_start)
13206+ for (; pmd_idx < PTRS_PER_PMD && pfn < max_low_pfn;
13207+ pmd++, pmd_idx++) {
13208+ unsigned int addr = pfn * PAGE_SIZE + PAGE_OFFSET;
13209+
13210+ if (addr >= hypervisor_virt_start)
13211 continue;
13212
13213- /* Map with big pages if possible, otherwise create normal page tables. */
13214+ /*
13215+ * Map with big pages if possible, otherwise
13216+ * create normal page tables:
13217+ */
13218 if (cpu_has_pse) {
13219- unsigned int address2 = (pfn + PTRS_PER_PTE - 1) * PAGE_SIZE + PAGE_OFFSET + PAGE_SIZE-1;
13220- if (is_kernel_text(address) || is_kernel_text(address2))
13221- set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE_EXEC));
13222- else
13223- set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE));
13224+ unsigned int addr2;
13225+ pgprot_t prot = PAGE_KERNEL_LARGE;
13226+
13227+ addr2 = (pfn + PTRS_PER_PTE-1) * PAGE_SIZE +
13228+ PAGE_OFFSET + PAGE_SIZE-1;
13229+
13230+ if (is_kernel_text(addr) ||
13231+ is_kernel_text(addr2))
13232+ prot = PAGE_KERNEL_LARGE_EXEC;
13233+
13234+ set_pmd(pmd, pfn_pmd(pfn, prot));
13235
13236 pfn += PTRS_PER_PTE;
13237- } else {
13238- pte = one_page_table_init(pmd);
13239+ continue;
13240+ }
13241+ pte = one_page_table_init(pmd);
13242+
13243+ for (pte += pte_ofs;
13244+ pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn;
13245+ pte++, pfn++, pte_ofs++, addr += PAGE_SIZE) {
13246+ pgprot_t prot = PAGE_KERNEL;
13247+
13248+ /* XEN: Only map initial RAM allocation. */
13249+ if ((pfn >= max_ram_pfn) || pte_present(*pte))
13250+ continue;
13251+ if (is_kernel_text(addr))
13252+ prot = PAGE_KERNEL_EXEC;
13253
13254- for (pte += pte_ofs;
13255- pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn;
13256- pte++, pfn++, pte_ofs++, address += PAGE_SIZE) {
13257- /* XEN: Only map initial RAM allocation. */
13258- if ((pfn >= max_ram_pfn) || pte_present(*pte))
13259- continue;
13260- if (is_kernel_text(address))
13261- set_pte(pte, pfn_pte(pfn, PAGE_KERNEL_EXEC));
13262- else
13263- set_pte(pte, pfn_pte(pfn, PAGE_KERNEL));
13264- }
13265- pte_ofs = 0;
13266+ set_pte(pte, pfn_pte(pfn, prot));
13267 }
13268+ pte_ofs = 0;
13269 }
13270 pmd_idx = 0;
13271 }
13272@@ -245,57 +262,23 @@ static inline int page_kills_ppro(unsign
13273
13274 #endif
13275
13276-int page_is_ram(unsigned long pagenr)
13277-{
13278- int i;
13279- unsigned long addr, end;
13280-
13281- if (efi_enabled) {
13282- efi_memory_desc_t *md;
13283- void *p;
13284-
13285- for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
13286- md = p;
13287- if (!is_available_memory(md))
13288- continue;
13289- addr = (md->phys_addr+PAGE_SIZE-1) >> PAGE_SHIFT;
13290- end = (md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT)) >> PAGE_SHIFT;
13291-
13292- if ((pagenr >= addr) && (pagenr < end))
13293- return 1;
13294- }
13295- return 0;
13296- }
13297-
13298- for (i = 0; i < e820.nr_map; i++) {
13299-
13300- if (e820.map[i].type != E820_RAM) /* not usable memory */
13301- continue;
13302- /*
13303- * !!!FIXME!!! Some BIOSen report areas as RAM that
13304- * are not. Notably the 640->1Mb area. We need a sanity
13305- * check here.
13306- */
13307- addr = (e820.map[i].addr+PAGE_SIZE-1) >> PAGE_SHIFT;
13308- end = (e820.map[i].addr+e820.map[i].size) >> PAGE_SHIFT;
13309- if ((pagenr >= addr) && (pagenr < end))
13310- return 1;
13311- }
13312- return 0;
13313-}
13314-
13315 #ifdef CONFIG_HIGHMEM
13316 pte_t *kmap_pte;
13317 pgprot_t kmap_prot;
13318
13319-#define kmap_get_fixmap_pte(vaddr) \
13320- pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr), vaddr), (vaddr)), (vaddr))
13321+static inline pte_t *kmap_get_fixmap_pte(unsigned long vaddr)
13322+{
13323+ return pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr),
13324+ vaddr), vaddr), vaddr);
13325+}
13326
13327 static void __init kmap_init(void)
13328 {
13329 unsigned long kmap_vstart;
13330
13331- /* cache the first kmap pte */
13332+ /*
13333+ * Cache the first kmap pte:
13334+ */
13335 kmap_vstart = __fix_to_virt(FIX_KMAP_BEGIN);
13336 kmap_pte = kmap_get_fixmap_pte(kmap_vstart);
13337
13338@@ -304,11 +287,11 @@ static void __init kmap_init(void)
13339
13340 static void __init permanent_kmaps_init(pgd_t *pgd_base)
13341 {
13342+ unsigned long vaddr;
13343 pgd_t *pgd;
13344 pud_t *pud;
13345 pmd_t *pmd;
13346 pte_t *pte;
13347- unsigned long vaddr;
13348
13349 vaddr = PKMAP_BASE;
13350 page_table_range_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, pgd_base);
13351@@ -317,7 +300,7 @@ static void __init permanent_kmaps_init(
13352 pud = pud_offset(pgd, vaddr);
13353 pmd = pmd_offset(pud, vaddr);
13354 pte = pte_offset_kernel(pmd, vaddr);
13355- pkmap_page_table = pte;
13356+ pkmap_page_table = pte;
13357 }
13358
13359 static void __meminit free_new_highpage(struct page *page, int pfn)
13360@@ -337,7 +320,8 @@ void __init add_one_highpage_init(struct
13361 SetPageReserved(page);
13362 }
13363
13364-static int __meminit add_one_highpage_hotplug(struct page *page, unsigned long pfn)
13365+static int __meminit
13366+add_one_highpage_hotplug(struct page *page, unsigned long pfn)
13367 {
13368 free_new_highpage(page, pfn);
13369 totalram_pages++;
13370@@ -345,6 +329,7 @@ static int __meminit add_one_highpage_ho
13371 max_mapnr = max(pfn, max_mapnr);
13372 #endif
13373 num_physpages++;
13374+
13375 return 0;
13376 }
13377
13378@@ -352,7 +337,7 @@ static int __meminit add_one_highpage_ho
13379 * Not currently handling the NUMA case.
13380 * Assuming single node and all memory that
13381 * has been added dynamically that would be
13382- * onlined here is in HIGHMEM
13383+ * onlined here is in HIGHMEM.
13384 */
13385 void __meminit online_page(struct page *page)
13386 {
13387@@ -360,13 +345,11 @@ void __meminit online_page(struct page *
13388 add_one_highpage_hotplug(page, page_to_pfn(page));
13389 }
13390
13391-
13392-#ifdef CONFIG_NUMA
13393-extern void set_highmem_pages_init(int);
13394-#else
13395+#ifndef CONFIG_NUMA
13396 static void __init set_highmem_pages_init(int bad_ppro)
13397 {
13398 int pfn;
13399+
13400 for (pfn = highstart_pfn; pfn < highend_pfn; pfn++) {
13401 /*
13402 * Holes under sparsemem might not have no mem_map[]:
13403@@ -376,23 +359,18 @@ static void __init set_highmem_pages_ini
13404 }
13405 totalram_pages += totalhigh_pages;
13406 }
13407-#endif /* CONFIG_FLATMEM */
13408+#endif /* !CONFIG_NUMA */
13409
13410 #else
13411-#define kmap_init() do { } while (0)
13412-#define permanent_kmaps_init(pgd_base) do { } while (0)
13413-#define set_highmem_pages_init(bad_ppro) do { } while (0)
13414+# define kmap_init() do { } while (0)
13415+# define permanent_kmaps_init(pgd_base) do { } while (0)
13416+# define set_highmem_pages_init(bad_ppro) do { } while (0)
13417 #endif /* CONFIG_HIGHMEM */
13418
13419-unsigned long long __PAGE_KERNEL = _PAGE_KERNEL;
13420+pteval_t __PAGE_KERNEL = _PAGE_KERNEL;
13421 EXPORT_SYMBOL(__PAGE_KERNEL);
13422-unsigned long long __PAGE_KERNEL_EXEC = _PAGE_KERNEL_EXEC;
13423
13424-#ifdef CONFIG_NUMA
13425-extern void __init remap_numa_kva(void);
13426-#else
13427-#define remap_numa_kva() do {} while (0)
13428-#endif
13429+pteval_t __PAGE_KERNEL_EXEC = _PAGE_KERNEL_EXEC;
13430
13431 pgd_t *swapper_pg_dir;
13432
13433@@ -410,9 +388,8 @@ static void __init xen_pagetable_setup_d
13434 * the boot process.
13435 *
13436 * If we're booting on native hardware, this will be a pagetable
13437- * constructed in arch/i386/kernel/head.S, and not running in PAE mode
13438- * (even if we'll end up running in PAE). The root of the pagetable
13439- * will be swapper_pg_dir.
13440+ * constructed in arch/x86/kernel/head_32.S. The root of the
13441+ * pagetable will be swapper_pg_dir.
13442 *
13443 * If we're booting paravirtualized under a hypervisor, then there are
13444 * more options: we may already be running PAE, and the pagetable may
13445@@ -424,10 +401,10 @@ static void __init xen_pagetable_setup_d
13446 * be partially populated, and so it avoids stomping on any existing
13447 * mappings.
13448 */
13449-static void __init pagetable_init (void)
13450+static void __init pagetable_init(void)
13451 {
13452- unsigned long vaddr, end;
13453 pgd_t *pgd_base = (pgd_t *)xen_start_info->pt_base;
13454+ unsigned long vaddr, end;
13455
13456 xen_pagetable_setup_start(pgd_base);
13457
13458@@ -449,34 +426,36 @@ static void __init pagetable_init (void)
13459 * Fixed mappings, only the page table structure has to be
13460 * created - mappings will be set by set_fixmap():
13461 */
13462+ early_ioremap_clear();
13463 vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK;
13464 end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK;
13465 page_table_range_init(vaddr, end, pgd_base);
13466+ early_ioremap_reset();
13467
13468 permanent_kmaps_init(pgd_base);
13469
13470 xen_pagetable_setup_done(pgd_base);
13471 }
13472
13473-#if defined(CONFIG_HIBERNATION) || defined(CONFIG_ACPI)
13474+#if defined(CONFIG_ACPI_SLEEP) && !defined(CONFIG_XEN)
13475 /*
13476- * Swap suspend & friends need this for resume because things like the intel-agp
13477+ * ACPI suspend needs this for resume, because things like the intel-agp
13478 * driver might have split up a kernel 4MB mapping.
13479 */
13480-char __nosavedata swsusp_pg_dir[PAGE_SIZE]
13481- __attribute__ ((aligned (PAGE_SIZE)));
13482+char swsusp_pg_dir[PAGE_SIZE]
13483+ __attribute__ ((aligned(PAGE_SIZE)));
13484
13485 static inline void save_pg_dir(void)
13486 {
13487 memcpy(swsusp_pg_dir, swapper_pg_dir, PAGE_SIZE);
13488 }
13489-#else
13490+#else /* !CONFIG_ACPI_SLEEP */
13491 static inline void save_pg_dir(void)
13492 {
13493 }
13494-#endif
13495+#endif /* !CONFIG_ACPI_SLEEP */
13496
13497-void zap_low_mappings (void)
13498+void zap_low_mappings(void)
13499 {
13500 int i;
13501
13502@@ -488,22 +467,24 @@ void zap_low_mappings (void)
13503 * Note that "pgd_clear()" doesn't do it for
13504 * us, because pgd_clear() is a no-op on i386.
13505 */
13506- for (i = 0; i < USER_PTRS_PER_PGD; i++)
13507+ for (i = 0; i < USER_PTRS_PER_PGD; i++) {
13508 #if defined(CONFIG_X86_PAE) && !defined(CONFIG_XEN)
13509 set_pgd(swapper_pg_dir+i, __pgd(1 + __pa(empty_zero_page)));
13510 #else
13511 set_pgd(swapper_pg_dir+i, __pgd(0));
13512 #endif
13513+ }
13514 flush_tlb_all();
13515 }
13516
13517-int nx_enabled = 0;
13518+int nx_enabled;
13519+
13520+pteval_t __supported_pte_mask __read_mostly = ~_PAGE_NX;
13521+EXPORT_SYMBOL_GPL(__supported_pte_mask);
13522
13523 #ifdef CONFIG_X86_PAE
13524
13525-static int disable_nx __initdata = 0;
13526-u64 __supported_pte_mask __read_mostly = ~_PAGE_NX;
13527-EXPORT_SYMBOL_GPL(__supported_pte_mask);
13528+static int disable_nx __initdata;
13529
13530 /*
13531 * noexec = on|off
13532@@ -520,11 +501,14 @@ static int __init noexec_setup(char *str
13533 __supported_pte_mask |= _PAGE_NX;
13534 disable_nx = 0;
13535 }
13536- } else if (!strcmp(str,"off")) {
13537- disable_nx = 1;
13538- __supported_pte_mask &= ~_PAGE_NX;
13539- } else
13540- return -EINVAL;
13541+ } else {
13542+ if (!strcmp(str, "off")) {
13543+ disable_nx = 1;
13544+ __supported_pte_mask &= ~_PAGE_NX;
13545+ } else {
13546+ return -EINVAL;
13547+ }
13548+ }
13549
13550 return 0;
13551 }
13552@@ -536,6 +520,7 @@ static void __init set_nx(void)
13553
13554 if (cpu_has_pae && (cpuid_eax(0x80000000) > 0x80000001)) {
13555 cpuid(0x80000001, &v[0], &v[1], &v[2], &v[3]);
13556+
13557 if ((v[3] & (1 << 20)) && !disable_nx) {
13558 rdmsr(MSR_EFER, l, h);
13559 l |= EFER_NX;
13560@@ -545,35 +530,6 @@ static void __init set_nx(void)
13561 }
13562 }
13563 }
13564-
13565-/*
13566- * Enables/disables executability of a given kernel page and
13567- * returns the previous setting.
13568- */
13569-int __init set_kernel_exec(unsigned long vaddr, int enable)
13570-{
13571- pte_t *pte;
13572- int ret = 1;
13573-
13574- if (!nx_enabled)
13575- goto out;
13576-
13577- pte = lookup_address(vaddr);
13578- BUG_ON(!pte);
13579-
13580- if (!pte_exec_kernel(*pte))
13581- ret = 0;
13582-
13583- if (enable)
13584- pte->pte_high &= ~(1 << (_PAGE_BIT_NX - 32));
13585- else
13586- pte->pte_high |= 1 << (_PAGE_BIT_NX - 32);
13587- pte_update_defer(&init_mm, vaddr, pte);
13588- __flush_tlb_all();
13589-out:
13590- return ret;
13591-}
13592-
13593 #endif
13594
13595 /*
13596@@ -590,21 +546,10 @@ void __init paging_init(void)
13597 #ifdef CONFIG_X86_PAE
13598 set_nx();
13599 if (nx_enabled)
13600- printk("NX (Execute Disable) protection: active\n");
13601+ printk(KERN_INFO "NX (Execute Disable) protection: active\n");
13602 #endif
13603-
13604 pagetable_init();
13605
13606-#if defined(CONFIG_X86_PAE) && !defined(CONFIG_XEN)
13607- /*
13608- * We will bail out later - printk doesn't work right now so
13609- * the user would just see a hanging kernel.
13610- * when running as xen domain we are already in PAE mode at
13611- * this point.
13612- */
13613- if (cpu_has_pae)
13614- set_in_cr4(X86_CR4_PAE);
13615-#endif
13616 __flush_tlb_all();
13617
13618 kmap_init();
13619@@ -631,10 +576,10 @@ void __init paging_init(void)
13620 * used to involve black magic jumps to work around some nasty CPU bugs,
13621 * but fortunately the switch to using exceptions got rid of all that.
13622 */
13623-
13624 static void __init test_wp_bit(void)
13625 {
13626- printk("Checking if this processor honours the WP bit even in supervisor mode... ");
13627+ printk(KERN_INFO
13628+ "Checking if this processor honours the WP bit even in supervisor mode...");
13629
13630 /* Any page-aligned address will do, the test is non-destructive */
13631 __set_fixmap(FIX_WP_TEST, __pa(&swapper_pg_dir), PAGE_READONLY);
13632@@ -642,23 +587,22 @@ static void __init test_wp_bit(void)
13633 clear_fixmap(FIX_WP_TEST);
13634
13635 if (!boot_cpu_data.wp_works_ok) {
13636- printk("No.\n");
13637+ printk(KERN_CONT "No.\n");
13638 #ifdef CONFIG_X86_WP_WORKS_OK
13639- panic("This kernel doesn't support CPU's with broken WP. Recompile it for a 386!");
13640+ panic(
13641+ "This kernel doesn't support CPU's with broken WP. Recompile it for a 386!");
13642 #endif
13643 } else {
13644- printk("Ok.\n");
13645+ printk(KERN_CONT "Ok.\n");
13646 }
13647 }
13648
13649-static struct kcore_list kcore_mem, kcore_vmalloc;
13650+static struct kcore_list kcore_mem, kcore_vmalloc;
13651
13652 void __init mem_init(void)
13653 {
13654- extern int ppro_with_ram_bug(void);
13655 int codesize, reservedpages, datasize, initsize;
13656- int tmp;
13657- int bad_ppro;
13658+ int tmp, bad_ppro;
13659 unsigned long pfn;
13660
13661 #if defined(CONFIG_SWIOTLB)
13662@@ -668,19 +612,19 @@ void __init mem_init(void)
13663 #ifdef CONFIG_FLATMEM
13664 BUG_ON(!mem_map);
13665 #endif
13666-
13667 bad_ppro = ppro_with_ram_bug();
13668
13669 #ifdef CONFIG_HIGHMEM
13670 /* check that fixmap and pkmap do not overlap */
13671- if (PKMAP_BASE+LAST_PKMAP*PAGE_SIZE >= FIXADDR_START) {
13672- printk(KERN_ERR "fixmap and kmap areas overlap - this will crash\n");
13673+ if (PKMAP_BASE + LAST_PKMAP*PAGE_SIZE >= FIXADDR_START) {
13674+ printk(KERN_ERR
13675+ "fixmap and kmap areas overlap - this will crash\n");
13676 printk(KERN_ERR "pkstart: %lxh pkend: %lxh fixstart %lxh\n",
13677- PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE, FIXADDR_START);
13678+ PKMAP_BASE, PKMAP_BASE + LAST_PKMAP*PAGE_SIZE,
13679+ FIXADDR_START);
13680 BUG();
13681 }
13682 #endif
13683-
13684 /* this will put all low memory onto the freelists */
13685 totalram_pages += free_all_bootmem();
13686 /* XEN: init and count low-mem pages outside initial allocation. */
13687@@ -693,7 +637,7 @@ void __init mem_init(void)
13688 reservedpages = 0;
13689 for (tmp = 0; tmp < max_low_pfn; tmp++)
13690 /*
13691- * Only count reserved RAM pages
13692+ * Only count reserved RAM pages:
13693 */
13694 if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp)))
13695 reservedpages++;
13696@@ -704,11 +648,12 @@ void __init mem_init(void)
13697 datasize = (unsigned long) &_edata - (unsigned long) &_etext;
13698 initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin;
13699
13700- kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
13701- kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
13702+ kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
13703+ kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
13704 VMALLOC_END-VMALLOC_START);
13705
13706- printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, %dk reserved, %dk data, %dk init, %ldk highmem)\n",
13707+ printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, "
13708+ "%dk reserved, %dk data, %dk init, %ldk highmem)\n",
13709 (unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
13710 num_physpages << (PAGE_SHIFT-10),
13711 codesize >> 10,
13712@@ -719,54 +664,53 @@ void __init mem_init(void)
13713 );
13714
13715 #if 1 /* double-sanity-check paranoia */
13716- printk("virtual kernel memory layout:\n"
13717- " fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n"
13718+ printk(KERN_INFO "virtual kernel memory layout:\n"
13719+ " fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n"
13720 #ifdef CONFIG_HIGHMEM
13721- " pkmap : 0x%08lx - 0x%08lx (%4ld kB)\n"
13722+ " pkmap : 0x%08lx - 0x%08lx (%4ld kB)\n"
13723 #endif
13724- " vmalloc : 0x%08lx - 0x%08lx (%4ld MB)\n"
13725- " lowmem : 0x%08lx - 0x%08lx (%4ld MB)\n"
13726- " .init : 0x%08lx - 0x%08lx (%4ld kB)\n"
13727- " .data : 0x%08lx - 0x%08lx (%4ld kB)\n"
13728- " .text : 0x%08lx - 0x%08lx (%4ld kB)\n",
13729- FIXADDR_START, FIXADDR_TOP,
13730- (FIXADDR_TOP - FIXADDR_START) >> 10,
13731+ " vmalloc : 0x%08lx - 0x%08lx (%4ld MB)\n"
13732+ " lowmem : 0x%08lx - 0x%08lx (%4ld MB)\n"
13733+ " .init : 0x%08lx - 0x%08lx (%4ld kB)\n"
13734+ " .data : 0x%08lx - 0x%08lx (%4ld kB)\n"
13735+ " .text : 0x%08lx - 0x%08lx (%4ld kB)\n",
13736+ FIXADDR_START, FIXADDR_TOP,
13737+ (FIXADDR_TOP - FIXADDR_START) >> 10,
13738
13739 #ifdef CONFIG_HIGHMEM
13740- PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE,
13741- (LAST_PKMAP*PAGE_SIZE) >> 10,
13742+ PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE,
13743+ (LAST_PKMAP*PAGE_SIZE) >> 10,
13744 #endif
13745
13746- VMALLOC_START, VMALLOC_END,
13747- (VMALLOC_END - VMALLOC_START) >> 20,
13748+ VMALLOC_START, VMALLOC_END,
13749+ (VMALLOC_END - VMALLOC_START) >> 20,
13750
13751- (unsigned long)__va(0), (unsigned long)high_memory,
13752- ((unsigned long)high_memory - (unsigned long)__va(0)) >> 20,
13753+ (unsigned long)__va(0), (unsigned long)high_memory,
13754+ ((unsigned long)high_memory - (unsigned long)__va(0)) >> 20,
13755
13756- (unsigned long)&__init_begin, (unsigned long)&__init_end,
13757- ((unsigned long)&__init_end - (unsigned long)&__init_begin) >> 10,
13758+ (unsigned long)&__init_begin, (unsigned long)&__init_end,
13759+ ((unsigned long)&__init_end -
13760+ (unsigned long)&__init_begin) >> 10,
13761
13762- (unsigned long)&_etext, (unsigned long)&_edata,
13763- ((unsigned long)&_edata - (unsigned long)&_etext) >> 10,
13764+ (unsigned long)&_etext, (unsigned long)&_edata,
13765+ ((unsigned long)&_edata - (unsigned long)&_etext) >> 10,
13766
13767- (unsigned long)&_text, (unsigned long)&_etext,
13768- ((unsigned long)&_etext - (unsigned long)&_text) >> 10);
13769+ (unsigned long)&_text, (unsigned long)&_etext,
13770+ ((unsigned long)&_etext - (unsigned long)&_text) >> 10);
13771
13772 #ifdef CONFIG_HIGHMEM
13773- BUG_ON(PKMAP_BASE+LAST_PKMAP*PAGE_SIZE > FIXADDR_START);
13774- BUG_ON(VMALLOC_END > PKMAP_BASE);
13775+ BUG_ON(PKMAP_BASE + LAST_PKMAP*PAGE_SIZE > FIXADDR_START);
13776+ BUG_ON(VMALLOC_END > PKMAP_BASE);
13777 #endif
13778- BUG_ON(VMALLOC_START > VMALLOC_END);
13779- BUG_ON((unsigned long)high_memory > VMALLOC_START);
13780+ BUG_ON(VMALLOC_START > VMALLOC_END);
13781+ BUG_ON((unsigned long)high_memory > VMALLOC_START);
13782 #endif /* double-sanity-check paranoia */
13783
13784-#ifdef CONFIG_X86_PAE
13785- if (!cpu_has_pae)
13786- panic("cannot execute a PAE-enabled kernel on a PAE-less CPU!");
13787-#endif
13788 if (boot_cpu_data.wp_works_ok < 0)
13789 test_wp_bit();
13790
13791+ cpa_init();
13792+
13793 /*
13794 * Subtle. SMP is doing it's boot stuff late (because it has to
13795 * fork idle threads) - but it also needs low mappings for the
13796@@ -790,49 +734,35 @@ int arch_add_memory(int nid, u64 start,
13797
13798 return __add_pages(zone, start_pfn, nr_pages);
13799 }
13800-
13801 #endif
13802
13803-struct kmem_cache *pmd_cache;
13804-
13805-void __init pgtable_cache_init(void)
13806-{
13807- if (PTRS_PER_PMD > 1)
13808- pmd_cache = kmem_cache_create("pmd",
13809- PTRS_PER_PMD*sizeof(pmd_t),
13810- PTRS_PER_PMD*sizeof(pmd_t),
13811- SLAB_PANIC,
13812- pmd_ctor);
13813-}
13814-
13815 /*
13816 * This function cannot be __init, since exceptions don't work in that
13817 * section. Put this after the callers, so that it cannot be inlined.
13818 */
13819-static int noinline do_test_wp_bit(void)
13820+static noinline int do_test_wp_bit(void)
13821 {
13822 char tmp_reg;
13823 int flag;
13824
13825 __asm__ __volatile__(
13826- " movb %0,%1 \n"
13827- "1: movb %1,%0 \n"
13828- " xorl %2,%2 \n"
13829+ " movb %0, %1 \n"
13830+ "1: movb %1, %0 \n"
13831+ " xorl %2, %2 \n"
13832 "2: \n"
13833- ".section __ex_table,\"a\"\n"
13834- " .align 4 \n"
13835- " .long 1b,2b \n"
13836- ".previous \n"
13837+ _ASM_EXTABLE(1b,2b)
13838 :"=m" (*(char *)fix_to_virt(FIX_WP_TEST)),
13839 "=q" (tmp_reg),
13840 "=r" (flag)
13841 :"2" (1)
13842 :"memory");
13843-
13844+
13845 return flag;
13846 }
13847
13848 #ifdef CONFIG_DEBUG_RODATA
13849+const int rodata_test_data = 0xC3;
13850+EXPORT_SYMBOL_GPL(rodata_test_data);
13851
13852 void mark_rodata_ro(void)
13853 {
13854@@ -845,32 +775,58 @@ void mark_rodata_ro(void)
13855 if (num_possible_cpus() <= 1)
13856 #endif
13857 {
13858- change_page_attr(virt_to_page(start),
13859- size >> PAGE_SHIFT, PAGE_KERNEL_RX);
13860- printk("Write protecting the kernel text: %luk\n", size >> 10);
13861+ set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
13862+ printk(KERN_INFO "Write protecting the kernel text: %luk\n",
13863+ size >> 10);
13864+
13865+#ifdef CONFIG_CPA_DEBUG
13866+ printk(KERN_INFO "Testing CPA: Reverting %lx-%lx\n",
13867+ start, start+size);
13868+ set_pages_rw(virt_to_page(start), size>>PAGE_SHIFT);
13869+
13870+ printk(KERN_INFO "Testing CPA: write protecting again\n");
13871+ set_pages_ro(virt_to_page(start), size>>PAGE_SHIFT);
13872+#endif
13873 }
13874 #endif
13875 start += size;
13876 size = (unsigned long)__end_rodata - start;
13877- change_page_attr(virt_to_page(start),
13878- size >> PAGE_SHIFT, PAGE_KERNEL_RO);
13879- printk("Write protecting the kernel read-only data: %luk\n",
13880- size >> 10);
13881+ set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
13882+ printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
13883+ size >> 10);
13884+ rodata_test();
13885+
13886+#ifdef CONFIG_CPA_DEBUG
13887+ printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, start + size);
13888+ set_pages_rw(virt_to_page(start), size >> PAGE_SHIFT);
13889
13890- /*
13891- * change_page_attr() requires a global_flush_tlb() call after it.
13892- * We do this after the printk so that if something went wrong in the
13893- * change, the printk gets out at least to give a better debug hint
13894- * of who is the culprit.
13895- */
13896- global_flush_tlb();
13897+ printk(KERN_INFO "Testing CPA: write protecting again\n");
13898+ set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
13899+#endif
13900 }
13901 #endif
13902
13903 void free_init_pages(char *what, unsigned long begin, unsigned long end)
13904 {
13905+#ifdef CONFIG_DEBUG_PAGEALLOC
13906+ /*
13907+ * If debugging page accesses then do not free this memory but
13908+ * mark them not present - any buggy init-section access will
13909+ * create a kernel page fault:
13910+ */
13911+ printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n",
13912+ begin, PAGE_ALIGN(end));
13913+ set_memory_np(begin, (end - begin) >> PAGE_SHIFT);
13914+#else
13915 unsigned long addr;
13916
13917+ /*
13918+ * We just marked the kernel text read only above, now that
13919+ * we are going to free part of that, we need to make that
13920+ * writeable first.
13921+ */
13922+ set_memory_rw(begin, (end - begin) >> PAGE_SHIFT);
13923+
13924 for (addr = begin; addr < end; addr += PAGE_SIZE) {
13925 ClearPageReserved(virt_to_page(addr));
13926 init_page_count(virt_to_page(addr));
13927@@ -879,6 +835,7 @@ void free_init_pages(char *what, unsigne
13928 totalram_pages++;
13929 }
13930 printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
13931+#endif
13932 }
13933
13934 void free_initmem(void)
13935@@ -894,4 +851,3 @@ void free_initrd_mem(unsigned long start
13936 free_init_pages("initrd memory", start, end);
13937 }
13938 #endif
13939-
13940--- sle11-2009-06-29.orig/arch/x86/mm/init_64-xen.c 2009-02-16 16:18:36.000000000 +0100
13941+++ sle11-2009-06-29/arch/x86/mm/init_64-xen.c 2009-03-16 16:33:40.000000000 +0100
13942@@ -46,14 +46,13 @@
13943 #include <asm/proto.h>
13944 #include <asm/smp.h>
13945 #include <asm/sections.h>
13946+#include <asm/kdebug.h>
13947+#include <asm/numa.h>
13948+#include <asm/cacheflush.h>
13949
13950 #include <xen/features.h>
13951
13952-#ifndef Dprintk
13953-#define Dprintk(x...)
13954-#endif
13955-
13956-const struct dma_mapping_ops* dma_ops;
13957+const struct dma_mapping_ops *dma_ops;
13958 EXPORT_SYMBOL(dma_ops);
13959
13960 #if CONFIG_XEN_COMPAT <= 0x030002
13961@@ -80,7 +79,21 @@ extern pte_t level1_fixmap_pgt[PTRS_PER_
13962 (((mfn_to_pfn((addr) >> PAGE_SHIFT)) << PAGE_SHIFT) + \
13963 __START_KERNEL_map)))
13964
13965-static void __meminit early_make_page_readonly(void *va, unsigned int feature)
13966+pmd_t *__init early_get_pmd(unsigned long va)
13967+{
13968+ unsigned long addr;
13969+ unsigned long *page = (unsigned long *)init_level4_pgt;
13970+
13971+ addr = page[pgd_index(va)];
13972+ addr_to_page(addr, page);
13973+
13974+ addr = page[pud_index(va)];
13975+ addr_to_page(addr, page);
13976+
13977+ return (pmd_t *)&page[pmd_index(va)];
13978+}
13979+
13980+void __meminit early_make_page_readonly(void *va, unsigned int feature)
13981 {
13982 unsigned long addr, _va = (unsigned long)va;
13983 pte_t pte, *ptep;
13984@@ -107,76 +120,6 @@ static void __meminit early_make_page_re
13985 BUG();
13986 }
13987
13988-static void __make_page_readonly(void *va)
13989-{
13990- pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t pte, *ptep;
13991- unsigned long addr = (unsigned long) va;
13992-
13993- pgd = pgd_offset_k(addr);
13994- pud = pud_offset(pgd, addr);
13995- pmd = pmd_offset(pud, addr);
13996- ptep = pte_offset_kernel(pmd, addr);
13997-
13998- pte.pte = ptep->pte & ~_PAGE_RW;
13999- if (HYPERVISOR_update_va_mapping(addr, pte, 0))
14000- xen_l1_entry_update(ptep, pte); /* fallback */
14001-
14002- if ((addr >= VMALLOC_START) && (addr < VMALLOC_END))
14003- __make_page_readonly(__va(pte_pfn(pte) << PAGE_SHIFT));
14004-}
14005-
14006-static void __make_page_writable(void *va)
14007-{
14008- pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t pte, *ptep;
14009- unsigned long addr = (unsigned long) va;
14010-
14011- pgd = pgd_offset_k(addr);
14012- pud = pud_offset(pgd, addr);
14013- pmd = pmd_offset(pud, addr);
14014- ptep = pte_offset_kernel(pmd, addr);
14015-
14016- pte.pte = ptep->pte | _PAGE_RW;
14017- if (HYPERVISOR_update_va_mapping(addr, pte, 0))
14018- xen_l1_entry_update(ptep, pte); /* fallback */
14019-
14020- if ((addr >= VMALLOC_START) && (addr < VMALLOC_END))
14021- __make_page_writable(__va(pte_pfn(pte) << PAGE_SHIFT));
14022-}
14023-
14024-void make_page_readonly(void *va, unsigned int feature)
14025-{
14026- if (!xen_feature(feature))
14027- __make_page_readonly(va);
14028-}
14029-
14030-void make_page_writable(void *va, unsigned int feature)
14031-{
14032- if (!xen_feature(feature))
14033- __make_page_writable(va);
14034-}
14035-
14036-void make_pages_readonly(void *va, unsigned nr, unsigned int feature)
14037-{
14038- if (xen_feature(feature))
14039- return;
14040-
14041- while (nr-- != 0) {
14042- __make_page_readonly(va);
14043- va = (void*)((unsigned long)va + PAGE_SIZE);
14044- }
14045-}
14046-
14047-void make_pages_writable(void *va, unsigned nr, unsigned int feature)
14048-{
14049- if (xen_feature(feature))
14050- return;
14051-
14052- while (nr-- != 0) {
14053- __make_page_writable(va);
14054- va = (void*)((unsigned long)va + PAGE_SIZE);
14055- }
14056-}
14057-
14058 /*
14059 * NOTE: pagetable_init alloc all the fixmap pagetables contiguous on the
14060 * physical space so we can cache the place of the first one and move
14061@@ -187,22 +130,26 @@ void show_mem(void)
14062 {
14063 long i, total = 0, reserved = 0;
14064 long shared = 0, cached = 0;
14065- pg_data_t *pgdat;
14066 struct page *page;
14067+ pg_data_t *pgdat;
14068
14069 printk(KERN_INFO "Mem-info:\n");
14070 show_free_areas();
14071- printk(KERN_INFO "Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
14072+ printk(KERN_INFO "Free swap: %6ldkB\n",
14073+ nr_swap_pages << (PAGE_SHIFT-10));
14074
14075 for_each_online_pgdat(pgdat) {
14076- for (i = 0; i < pgdat->node_spanned_pages; ++i) {
14077- /* this loop can take a while with 256 GB and 4k pages
14078- so update the NMI watchdog */
14079- if (unlikely(i % MAX_ORDER_NR_PAGES == 0)) {
14080+ for (i = 0; i < pgdat->node_spanned_pages; ++i) {
14081+ /*
14082+ * This loop can take a while with 256 GB and
14083+ * 4k pages so defer the NMI watchdog:
14084+ */
14085+ if (unlikely(i % MAX_ORDER_NR_PAGES == 0))
14086 touch_nmi_watchdog();
14087- }
14088+
14089 if (!pfn_valid(pgdat->node_start_pfn + i))
14090 continue;
14091+
14092 page = pfn_to_page(pgdat->node_start_pfn + i);
14093 total++;
14094 if (PageReserved(page))
14095@@ -211,58 +158,67 @@ void show_mem(void)
14096 cached++;
14097 else if (page_count(page))
14098 shared += page_count(page) - 1;
14099- }
14100+ }
14101 }
14102- printk(KERN_INFO "%lu pages of RAM\n", total);
14103- printk(KERN_INFO "%lu reserved pages\n",reserved);
14104- printk(KERN_INFO "%lu pages shared\n",shared);
14105- printk(KERN_INFO "%lu pages swap cached\n",cached);
14106+ printk(KERN_INFO "%lu pages of RAM\n", total);
14107+ printk(KERN_INFO "%lu reserved pages\n", reserved);
14108+ printk(KERN_INFO "%lu pages shared\n", shared);
14109+ printk(KERN_INFO "%lu pages swap cached\n", cached);
14110 }
14111
14112+static unsigned long __meminitdata table_start;
14113+static unsigned long __meminitdata table_end;
14114
14115 static __init void *spp_getpage(void)
14116-{
14117+{
14118 void *ptr;
14119+
14120 if (after_bootmem)
14121- ptr = (void *) get_zeroed_page(GFP_ATOMIC);
14122+ ptr = (void *) get_zeroed_page(GFP_ATOMIC);
14123 else if (start_pfn < table_end) {
14124 ptr = __va(start_pfn << PAGE_SHIFT);
14125 start_pfn++;
14126 memset(ptr, 0, PAGE_SIZE);
14127 } else
14128 ptr = alloc_bootmem_pages(PAGE_SIZE);
14129- if (!ptr || ((unsigned long)ptr & ~PAGE_MASK))
14130- panic("set_pte_phys: cannot allocate page data %s\n", after_bootmem?"after bootmem":"");
14131
14132- Dprintk("spp_getpage %p\n", ptr);
14133+ if (!ptr || ((unsigned long)ptr & ~PAGE_MASK)) {
14134+ panic("set_pte_phys: cannot allocate page data %s\n",
14135+ after_bootmem ? "after bootmem" : "");
14136+ }
14137+
14138+ pr_debug("spp_getpage %p\n", ptr);
14139+
14140 return ptr;
14141-}
14142+}
14143
14144 #define pgd_offset_u(address) (__user_pgd(init_level4_pgt) + pgd_index(address))
14145 #define pud_offset_u(address) (level3_user_pgt + pud_index(address))
14146
14147-static __init void set_pte_phys(unsigned long vaddr,
14148- unsigned long phys, pgprot_t prot, int user_mode)
14149+static __init void
14150+set_pte_phys(unsigned long vaddr, unsigned long phys, pgprot_t prot, int user_mode)
14151 {
14152 pgd_t *pgd;
14153 pud_t *pud;
14154 pmd_t *pmd;
14155 pte_t *pte, new_pte;
14156
14157- Dprintk("set_pte_phys %lx to %lx\n", vaddr, phys);
14158+ pr_debug("set_pte_phys %lx to %lx\n", vaddr, phys);
14159
14160 pgd = (user_mode ? pgd_offset_u(vaddr) : pgd_offset_k(vaddr));
14161 if (pgd_none(*pgd)) {
14162- printk("PGD FIXMAP MISSING, it should be setup in head.S!\n");
14163+ printk(KERN_ERR
14164+ "PGD FIXMAP MISSING, it should be setup in head.S!\n");
14165 return;
14166 }
14167 pud = (user_mode ? pud_offset_u(vaddr) : pud_offset(pgd, vaddr));
14168 if (pud_none(*pud)) {
14169- pmd = (pmd_t *) spp_getpage();
14170+ pmd = (pmd_t *) spp_getpage();
14171 make_page_readonly(pmd, XENFEAT_writable_page_tables);
14172 set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
14173 if (pmd != pmd_offset(pud, 0)) {
14174- printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pud,0));
14175+ printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
14176+ pmd, pmd_offset(pud, 0));
14177 return;
14178 }
14179 }
14180@@ -272,7 +228,7 @@ static __init void set_pte_phys(unsigned
14181 make_page_readonly(pte, XENFEAT_writable_page_tables);
14182 set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
14183 if (pte != pte_offset_kernel(pmd, 0)) {
14184- printk("PAGETABLE BUG #02!\n");
14185+ printk(KERN_ERR "PAGETABLE BUG #02!\n");
14186 return;
14187 }
14188 }
14189@@ -294,30 +250,30 @@ static __init void set_pte_phys(unsigned
14190 __flush_tlb_one(vaddr);
14191 }
14192
14193-static __init void set_pte_phys_ma(unsigned long vaddr,
14194- unsigned long phys, pgprot_t prot)
14195+static __init void
14196+set_pte_phys_ma(unsigned long vaddr, unsigned long phys, pgprot_t prot)
14197 {
14198 pgd_t *pgd;
14199 pud_t *pud;
14200 pmd_t *pmd;
14201 pte_t *pte, new_pte;
14202
14203- Dprintk("set_pte_phys %lx to %lx\n", vaddr, phys);
14204+ pr_debug("set_pte_phys_ma %lx to %lx\n", vaddr, phys);
14205
14206 pgd = pgd_offset_k(vaddr);
14207 if (pgd_none(*pgd)) {
14208- printk("PGD FIXMAP MISSING, it should be setup in head.S!\n");
14209+ printk(KERN_ERR
14210+ "PGD FIXMAP MISSING, it should be setup in head.S!\n");
14211 return;
14212 }
14213 pud = pud_offset(pgd, vaddr);
14214 if (pud_none(*pud)) {
14215-
14216- pmd = (pmd_t *) spp_getpage();
14217+ pmd = (pmd_t *) spp_getpage();
14218 make_page_readonly(pmd, XENFEAT_writable_page_tables);
14219 set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
14220 if (pmd != pmd_offset(pud, 0)) {
14221- printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pud,0));
14222- return;
14223+ printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
14224+ pmd, pmd_offset(pud, 0));
14225 }
14226 }
14227 pmd = pmd_offset(pud, vaddr);
14228@@ -326,7 +282,7 @@ static __init void set_pte_phys_ma(unsig
14229 make_page_readonly(pte, XENFEAT_writable_page_tables);
14230 set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
14231 if (pte != pte_offset_kernel(pmd, 0)) {
14232- printk("PAGETABLE BUG #02!\n");
14233+ printk(KERN_ERR "PAGETABLE BUG #02!\n");
14234 return;
14235 }
14236 }
14237@@ -350,14 +306,44 @@ static __init void set_pte_phys_ma(unsig
14238 __flush_tlb_one(vaddr);
14239 }
14240
14241+#ifndef CONFIG_XEN
14242+/*
14243+ * The head.S code sets up the kernel high mapping:
14244+ *
14245+ * from __START_KERNEL_map to __START_KERNEL_map + size (== _end-_text)
14246+ *
14247+ * phys_addr holds the negative offset to the kernel, which is added
14248+ * to the compile time generated pmds. This results in invalid pmds up
14249+ * to the point where we hit the physaddr 0 mapping.
14250+ *
14251+ * We limit the mappings to the region from _text to _end. _end is
14252+ * rounded up to the 2MB boundary. This catches the invalid pmds as
14253+ * well, as they are located before _text:
14254+ */
14255+void __init cleanup_highmap(void)
14256+{
14257+ unsigned long vaddr = __START_KERNEL_map;
14258+ unsigned long end = round_up((unsigned long)_end, PMD_SIZE) - 1;
14259+ pmd_t *pmd = level2_kernel_pgt;
14260+ pmd_t *last_pmd = pmd + PTRS_PER_PMD;
14261+
14262+ for (; pmd < last_pmd; pmd++, vaddr += PMD_SIZE) {
14263+ if (!pmd_present(*pmd))
14264+ continue;
14265+ if (vaddr < (unsigned long) _text || vaddr > end)
14266+ set_pmd(pmd, __pmd(0));
14267+ }
14268+}
14269+#endif
14270+
14271 /* NOTE: this is meant to be run only at boot */
14272-void __init
14273-__set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
14274+void __init
14275+__set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
14276 {
14277 unsigned long address = __fix_to_virt(idx);
14278
14279 if (idx >= __end_of_fixed_addresses) {
14280- printk("Invalid __set_fixmap\n");
14281+ printk(KERN_ERR "Invalid __set_fixmap\n");
14282 return;
14283 }
14284 switch (idx) {
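
The cleanup_highmap() addition in the hunk above rounds _end up to the next 2 MB boundary and clears every kernel-high-mapping pmd whose address falls outside [_text, end]. A small standalone sketch of just that arithmetic follows; the addresses are made-up examples, not the real linker symbols.

/* Sketch of the round-up and range test used by cleanup_highmap() above
 * (illustrative addresses only). */
#include <stdint.h>
#include <stdio.h>

#define PMD_SIZE        (2ULL * 1024 * 1024)                    /* 2 MiB */
#define ROUND_UP(x, a)  (((x) + (a) - 1) & ~((a) - 1))          /* a must be a power of two */

int main(void)
{
	uint64_t text_start = 0xffffffff80200000ull;  /* pretend _text            */
	uint64_t end_raw    = 0xffffffff8064a3c1ull;  /* pretend _end (unaligned) */
	uint64_t end        = ROUND_UP(end_raw, PMD_SIZE) - 1;

	printf("rounded end = %#llx\n", (unsigned long long)end);

	/* walk pretend pmd slots and decide which ones would be cleared */
	for (uint64_t vaddr = 0xffffffff80000000ull;
	     vaddr < 0xffffffff80a00000ull; vaddr += PMD_SIZE)
		printf("%#llx -> %s\n", (unsigned long long)vaddr,
		       (vaddr < text_start || vaddr > end) ? "clear" : "keep");
	return 0;
}

With these values the slots before _text and the one past the rounded-up end are reported as "clear", which is exactly the set of invalid pmds the comment in the hunk describes.
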
14285@@ -375,16 +361,14 @@ __set_fixmap (enum fixed_addresses idx,
14286 }
14287 }
14288
14289-unsigned long __meminitdata table_start, table_end;
14290-
14291 static __meminit void *alloc_static_page(unsigned long *phys)
14292 {
14293 unsigned long va = (start_pfn << PAGE_SHIFT) + __START_KERNEL_map;
14294
14295 if (after_bootmem) {
14296 void *adr = (void *)get_zeroed_page(GFP_ATOMIC);
14297-
14298 *phys = __pa(adr);
14299+
14300 return adr;
14301 }
14302
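
In the hunk above, alloc_static_page() serves zeroed pages from two sources: once after_bootmem is set it uses get_zeroed_page(), and before that it hands out the page at (start_pfn << PAGE_SHIFT) + __START_KERNEL_map, i.e. a simple bump-style allocation over start_pfn. The following userspace sketch is loosely modelled on that early path; the pool, its size and the "physical" addresses are illustrative assumptions.

/* Userspace sketch of a "bump the next-free-pfn counter" early allocator,
 * loosely modelled on alloc_static_page() above (illustrative only). */
#include <stdio.h>
#include <string.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)
#define NPAGES     16                        /* size of the pretend early pool */

static unsigned char pool[NPAGES * PAGE_SIZE];
static unsigned long start_pfn;              /* next free page frame number */

static void *alloc_static_page(unsigned long *phys)
{
	if (start_pfn >= NPAGES)
		return NULL;                     /* pool exhausted */

	void *va = pool + (start_pfn << PAGE_SHIFT);
	*phys = start_pfn << PAGE_SHIFT;         /* pretend physical address */
	memset(va, 0, PAGE_SIZE);                /* hand back a zeroed page  */
	start_pfn++;                             /* bump to the next frame   */
	return va;
}

int main(void)
{
	unsigned long phys;
	void *p1 = alloc_static_page(&phys);
	printf("page 1 at %p, phys %#lx\n", p1, phys);
	void *p2 = alloc_static_page(&phys);
	printf("page 2 at %p, phys %#lx\n", p2, phys);
	return 0;
}
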
14303@@ -396,7 +380,7 @@ static __meminit void *alloc_static_page
14304
14305 #define PTE_SIZE PAGE_SIZE
14306
14307-static inline int make_readonly(unsigned long paddr)
14308+static inline int __meminit make_readonly(unsigned long paddr)
14309 {
14310 extern char __vsyscall_0;
14311 int readonly = 0;
14312@@ -430,33 +414,38 @@ static inline int make_readonly(unsigned
14313 /* Must run before zap_low_mappings */
14314 __meminit void *early_ioremap(unsigned long addr, unsigned long size)
14315 {
14316- unsigned long vaddr;
14317 pmd_t *pmd, *last_pmd;
14318+ unsigned long vaddr;
14319 int i, pmds;
14320
14321 pmds = ((addr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
14322 vaddr = __START_KERNEL_map;
14323 pmd = level2_kernel_pgt;
14324 last_pmd = level2_kernel_pgt + PTRS_PER_PMD - 1;
14325+
14326 for (; pmd <= last_pmd; pmd++, vaddr += PMD_SIZE) {
14327 for (i = 0; i < pmds; i++) {
14328 if (pmd_present(pmd[i]))
14329- goto next;
14330+ goto continue_outer_loop;
14331 }
14332 vaddr += addr & ~PMD_MASK;
14333 addr &= PMD_MASK;
14334+
14335 for (i = 0; i < pmds; i++, addr += PMD_SIZE)
14336- set_pmd(pmd + i,__pmd(addr | _KERNPG_TABLE | _PAGE_PSE));
14337- __flush_tlb();
14338+ set_pmd(pmd+i, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC));
14339+ __flush_tlb_all();
14340+
14341 return (void *)vaddr;
14342- next:
14343+continue_outer_loop:
14344 ;
14345 }
14346 printk("early_ioremap(0x%lx, %lu) failed\n", addr, size);
14347 return NULL;
14348 }
14349
14350-/* To avoid virtual aliases later */
14351+/*
14352+ * To avoid virtual aliases later:
14353+ */
14354 __meminit void early_iounmap(void *addr, unsigned long size)
14355 {
14356 unsigned long vaddr;
14357@@ -466,9 +455,11 @@ __meminit void early_iounmap(void *addr,
14358 vaddr = (unsigned long)addr;
14359 pmds = ((vaddr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
14360 pmd = level2_kernel_pgt + pmd_index(vaddr);
14361+
14362 for (i = 0; i < pmds; i++)
14363 pmd_clear(pmd + i);
14364- __flush_tlb();
14365+
14366+ __flush_tlb_all();
14367 }
14368 #endif
14369
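
The early_ioremap() context in the hunk above computes how many 2 MB pmd slots are needed to cover a possibly unaligned [addr, addr + size) range with pmds = ((addr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE. A standalone sketch of that calculation with made-up inputs:

/* Sketch of the pmd-count arithmetic used by early_ioremap() above:
 * number of 2 MiB slots needed to cover [addr, addr + size). */
#include <stdint.h>
#include <stdio.h>

#define PMD_SHIFT 21
#define PMD_SIZE  (1ULL << PMD_SHIFT)
#define PMD_MASK  (~(PMD_SIZE - 1))

static uint64_t pmds_needed(uint64_t addr, uint64_t size)
{
	/* offset within the first slot, plus the size, rounded up to whole slots */
	return ((addr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
}

int main(void)
{
	/* example: 5 MiB starting 1 MiB into a 2 MiB slot -> straddles 3 slots */
	uint64_t addr = 0xfed00000ull + 0x100000ull;
	uint64_t size = 5ull * 1024 * 1024;

	printf("pmds = %llu\n", (unsigned long long)pmds_needed(addr, size));
	return 0;
}

With a 1 MiB offset and 5 MiB size the sketch prints 3, matching a range that touches three consecutive 2 MiB slots.
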
14370@@ -517,18 +508,19 @@ phys_pmd_init(pmd_t *pmd_page, unsigned
14371 static void __meminit
14372 phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end)
14373 {
14374- pmd_t *pmd = pmd_offset(pud,0);
14375+ pmd_t *pmd = pmd_offset(pud, 0);
14376 spin_lock(&init_mm.page_table_lock);
14377 phys_pmd_init(pmd, address, end);
14378 spin_unlock(&init_mm.page_table_lock);
14379 __flush_tlb_all();
14380 }
14381
14382-static void __meminit phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end)
14383-{
14384+static void __meminit
14385+phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end)
14386+{
14387 int i = pud_index(addr);
14388
14389- for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE ) {
14390+ for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE) {
14391 unsigned long pmd_phys;
14392 pud_t *pud = pud_page + pud_index(addr);
14393 pmd_t *pmd;
14394@@ -550,8 +542,8 @@ static void __meminit phys_pud_init(pud_
14395
14396 early_make_page_readonly(pmd, XENFEAT_writable_page_tables);
14397 }
14398- __flush_tlb();
14399-}
14400+ __flush_tlb_all();
14401+}
14402
14403 void __init xen_init_pt(void)
14404 {
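
phys_pud_init() in the hunk above iterates with addr = (addr & PUD_MASK) + PUD_SIZE, i.e. from wherever it currently is it jumps to the start of the next PUD-sized (1 GiB on x86-64) region. A small sketch of that stepping with illustrative addresses:

/* Sketch of the "jump to the next aligned region" step used by
 * phys_pud_init() above, with 1 GiB PUD-sized regions. */
#include <stdint.h>
#include <stdio.h>

#define PUD_SHIFT 30
#define PUD_SIZE  (1ULL << PUD_SHIFT)
#define PUD_MASK  (~(PUD_SIZE - 1))

int main(void)
{
	uint64_t addr = 0x40123000ull;     /* somewhere inside the second GiB */
	uint64_t end  = 0x140000000ull;    /* pretend end of the range (5 GiB) */

	while (addr < end) {
		printf("covering region starting at %#llx\n",
		       (unsigned long long)(addr & PUD_MASK));
		addr = (addr & PUD_MASK) + PUD_SIZE;   /* next 1 GiB boundary */
	}
	return 0;
}
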
14405@@ -632,6 +624,7 @@ void __init xen_init_pt(void)
14406 static void __init extend_init_mapping(unsigned long tables_space)
14407 {
14408 unsigned long va = __START_KERNEL_map;
14409+ unsigned long start = start_pfn;
14410 unsigned long phys, addr, *pte_page;
14411 pmd_t *pmd;
14412 pte_t *pte, new_pte;
14413@@ -682,6 +675,10 @@ static void __init extend_init_mapping(u
14414 BUG();
14415 va += PAGE_SIZE;
14416 }
14417+
14418+ if (start_pfn > start)
14419+ reserve_early(start << PAGE_SHIFT,
14420+ start_pfn << PAGE_SHIFT, "INITMAP");
14421 }
14422
14423 static void __init find_early_table_space(unsigned long end)
14424@@ -706,7 +703,7 @@ static void __init find_early_table_spac
14425 (table_start << PAGE_SHIFT) + tables);
14426 }
14427
14428-static void xen_finish_init_mapping(void)
14429+static void __init xen_finish_init_mapping(void)
14430 {
14431 unsigned long i, start, end;
14432
14433@@ -738,13 +735,6 @@ static void xen_finish_init_mapping(void
14434 /* Allocate pte's for initial fixmaps from 'start_pfn' allocator. */
14435 table_end = ~0UL;
14436
14437- /*
14438- * Prefetch pte's for the bt_ioremap() area. It gets used before the
14439- * boot-time allocator is online, so allocate-on-demand would fail.
14440- */
14441- for (i = FIX_BTMAP_END; i <= FIX_BTMAP_BEGIN; i++)
14442- __set_fixmap(i, 0, __pgprot(0));
14443-
14444 /* Switch to the real shared_info page, and clear the dummy page. */
14445 set_fixmap(FIX_SHARED_INFO, xen_start_info->shared_info);
14446 HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO);
14447@@ -764,20 +754,23 @@ static void xen_finish_init_mapping(void
14448 table_end = start_pfn;
14449 }
14450
14451-/* Setup the direct mapping of the physical memory at PAGE_OFFSET.
14452- This runs before bootmem is initialized and gets pages directly from the
14453- physical memory. To access them they are temporarily mapped. */
14454+/*
14455+ * Setup the direct mapping of the physical memory at PAGE_OFFSET.
14456+ * This runs before bootmem is initialized and gets pages directly from
14457+ * the physical memory. To access them they are temporarily mapped.
14458+ */
14459 void __init_refok init_memory_mapping(unsigned long start, unsigned long end)
14460-{
14461+{
14462 unsigned long next;
14463
14464- Dprintk("init_memory_mapping\n");
14465+ pr_debug("init_memory_mapping\n");
14466
14467- /*
14468+ /*
14469 * Find space for the kernel direct mapping tables.
14470- * Later we should allocate these tables in the local node of the memory
14471- * mapped. Unfortunately this is done currently before the nodes are
14472- * discovered.
14473+ *
14474+ * Later we should allocate these tables in the local node of the
14475+ * memory mapped. Unfortunately this is done currently before the
14476+ * nodes are discovered.
14477 */
14478 if (!after_bootmem)
14479 find_early_table_space(end);
14480@@ -786,8 +779,8 @@ void __init_refok init_memory_mapping(un
14481 end = (unsigned long)__va(end);
14482
14483 for (; start < end; start = next) {
14484- unsigned long pud_phys;
14485 pgd_t *pgd = pgd_offset_k(start);
14486+ unsigned long pud_phys;
14487 pud_t *pud;
14488
14489 if (after_bootmem)
14490@@ -795,8 +788,8 @@ void __init_refok init_memory_mapping(un
14491 else
14492 pud = alloc_static_page(&pud_phys);
14493 next = start + PGDIR_SIZE;
14494- if (next > end)
14495- next = end;
14496+ if (next > end)
14497+ next = end;
14498 phys_pud_init(pud, __pa(start), __pa(next));
14499 if (!after_bootmem) {
14500 early_make_page_readonly(pud, XENFEAT_writable_page_tables);
14501@@ -810,12 +803,17 @@ void __init_refok init_memory_mapping(un
14502 }
14503
14504 __flush_tlb_all();
14505+
14506+ if (!after_bootmem)
14507+ reserve_early(table_start << PAGE_SHIFT,
14508+ table_end << PAGE_SHIFT, "PGTABLE");
14509 }
14510
14511 #ifndef CONFIG_NUMA
14512 void __init paging_init(void)
14513 {
14514 unsigned long max_zone_pfns[MAX_NR_ZONES];
14515+
14516 memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
14517 max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
14518 max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
14519@@ -829,40 +827,6 @@ void __init paging_init(void)
14520 }
14521 #endif
14522
14523-/* Unmap a kernel mapping if it exists. This is useful to avoid prefetches
14524- from the CPU leading to inconsistent cache lines. address and size
14525- must be aligned to 2MB boundaries.
14526- Does nothing when the mapping doesn't exist. */
14527-void __init clear_kernel_mapping(unsigned long address, unsigned long size)
14528-{
14529- unsigned long end = address + size;
14530-
14531- BUG_ON(address & ~LARGE_PAGE_MASK);
14532- BUG_ON(size & ~LARGE_PAGE_MASK);
14533-
14534- for (; address < end; address += LARGE_PAGE_SIZE) {
14535- pgd_t *pgd = pgd_offset_k(address);
14536- pud_t *pud;
14537- pmd_t *pmd;
14538- if (pgd_none(*pgd))
14539- continue;
14540- pud = pud_offset(pgd, address);
14541- if (pud_none(*pud))
14542- continue;
14543- pmd = pmd_offset(pud, address);
14544- if (!pmd || pmd_none(*pmd))
14545- continue;
14546- if (0 == (__pmd_val(*pmd) & _PAGE_PSE)) {
14547- /* Could handle this, but it should not happen currently. */
14548- printk(KERN_ERR
14549- "clear_kernel_mapping: mapping has been split. will leak memory\n");
14550- pmd_ERROR(*pmd);
14551- }
14552- set_pmd(pmd, __pmd(0));
14553- }
14554- __flush_tlb_all();
14555-}
14556-
14557 /*
14558 * Memory hotplug specific functions
14559 */
14560@@ -888,16 +852,12 @@ int arch_add_memory(int nid, u64 start,
14561 unsigned long nr_pages = size >> PAGE_SHIFT;
14562 int ret;
14563
14564- init_memory_mapping(start, (start + size -1));
14565+ init_memory_mapping(start, start + size-1);
14566
14567 ret = __add_pages(zone, start_pfn, nr_pages);
14568- if (ret)
14569- goto error;
14570+ WARN_ON(1);
14571
14572 return ret;
14573-error:
14574- printk("%s: Problem encountered in __add_pages!\n", __func__);
14575- return ret;
14576 }
14577 EXPORT_SYMBOL_GPL(arch_add_memory);
14578
14579@@ -911,36 +871,8 @@ EXPORT_SYMBOL_GPL(memory_add_physaddr_to
14580
14581 #endif /* CONFIG_MEMORY_HOTPLUG */
14582
14583-#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
14584-/*
14585- * Memory Hotadd without sparsemem. The mem_maps have been allocated in advance,
14586- * just online the pages.
14587- */
14588-int __add_pages(struct zone *z, unsigned long start_pfn, unsigned long nr_pages)
14589-{
14590- int err = -EIO;
14591- unsigned long pfn;
14592- unsigned long total = 0, mem = 0;
14593- for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++) {
14594- if (pfn_valid(pfn)) {
14595- online_page(pfn_to_page(pfn));
14596- err = 0;
14597- mem++;
14598- }
14599- total++;
14600- }
14601- if (!err) {
14602- z->spanned_pages += total;
14603- z->present_pages += mem;
14604- z->zone_pgdat->node_spanned_pages += total;
14605- z->zone_pgdat->node_present_pages += mem;
14606- }
14607- return err;
14608-}
14609-#endif
14610-
14611-static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules,
14612- kcore_vsyscall;
14613+static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel,
14614+ kcore_modules, kcore_vsyscall;
14615
14616 void __init mem_init(void)
14617 {
14618@@ -949,8 +881,7 @@ void __init mem_init(void)
14619
14620 pci_iommu_alloc();
14621
14622- /* clear the zero-page */
14623- memset(empty_zero_page, 0, PAGE_SIZE);
14624+ /* clear_bss() already clear the empty_zero_page */
14625
14626 reservedpages = 0;
14627
14628@@ -968,7 +899,6 @@ void __init mem_init(void)
14629 }
14630 reservedpages = end_pfn - totalram_pages -
14631 absent_pages_in_range(0, end_pfn);
14632-
14633 after_bootmem = 1;
14634
14635 codesize = (unsigned long) &_etext - (unsigned long) &_text;
14636@@ -976,46 +906,64 @@ void __init mem_init(void)
14637 initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin;
14638
14639 /* Register memory areas for /proc/kcore */
14640- kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
14641- kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
14642+ kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
14643+ kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
14644 VMALLOC_END-VMALLOC_START);
14645 kclist_add(&kcore_kernel, &_stext, _end - _stext);
14646 kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_LEN);
14647- kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START,
14648+ kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START,
14649 VSYSCALL_END - VSYSCALL_START);
14650
14651- printk("Memory: %luk/%luk available (%ldk kernel code, %ldk reserved, %ldk data, %ldk init)\n",
14652+ printk(KERN_INFO "Memory: %luk/%luk available (%ldk kernel code, "
14653+ "%ldk reserved, %ldk data, %ldk init)\n",
14654 (unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
14655 end_pfn << (PAGE_SHIFT-10),
14656 codesize >> 10,
14657 reservedpages << (PAGE_SHIFT-10),
14658 datasize >> 10,
14659 initsize >> 10);
14660+
14661+ cpa_init();
14662 }
14663
14664 void free_init_pages(char *what, unsigned long begin, unsigned long end)
14665 {
14666- unsigned long addr;
14667+ unsigned long addr = begin;
14668
14669- if (begin >= end)
14670+ if (addr >= end)
14671 return;
14672
14673+ /*
14674+ * If debugging page accesses then do not free this memory but
14675+ * mark them not present - any buggy init-section access will
14676+ * create a kernel page fault:
14677+ */
14678+#ifdef CONFIG_DEBUG_PAGEALLOC
14679+ printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n",
14680+ begin, PAGE_ALIGN(end));
14681+ set_memory_np(begin, (end - begin) >> PAGE_SHIFT);
14682+#else
14683 printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
14684- for (addr = begin; addr < end; addr += PAGE_SIZE) {
14685+
14686+ for (; addr < end; addr += PAGE_SIZE) {
14687 ClearPageReserved(virt_to_page(addr));
14688 init_page_count(virt_to_page(addr));
14689 memset((void *)(addr & ~(PAGE_SIZE-1)),
14690 POISON_FREE_INITMEM, PAGE_SIZE);
14691 if (addr >= __START_KERNEL_map) {
14692 /* make_readonly() reports all kernel addresses. */
14693- __make_page_writable(__va(__pa(addr)));
14694- change_page_attr_addr(addr, 1, __pgprot(0));
14695+ if (HYPERVISOR_update_va_mapping((unsigned long)__va(__pa(addr)),
14696+ pfn_pte(__pa(addr) >> PAGE_SHIFT,
14697+ PAGE_KERNEL),
14698+ 0))
14699+ BUG();
14700+ if (HYPERVISOR_update_va_mapping(addr, __pte(0), 0))
14701+ BUG();
14702 }
14703 free_page(addr);
14704 totalram_pages++;
14705 }
14706- if (addr > __START_KERNEL_map)
14707- global_flush_tlb();
14708+#endif
14709 }
14710
14711 void free_initmem(void)
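
The mem_init() printout and free_init_pages() in the hunk above report sizes in KiB using shifts: a page count becomes KiB via << (PAGE_SHIFT - 10), and a byte range via >> 10. A one-file sketch of those conversions with invented numbers:

/* Sketch of the KiB accounting arithmetic in the mem_init() and
 * free_init_pages() hunks above (invented example numbers). */
#include <stdio.h>

#define PAGE_SHIFT 12

int main(void)
{
	unsigned long nr_free_pages = 123456;                  /* pretend free page count */
	unsigned long long begin = 0xffffffff807a0000ull;      /* pretend __init_begin    */
	unsigned long long end   = 0xffffffff80800000ull;      /* pretend __init_end      */

	/* pages -> KiB: each page is 2^PAGE_SHIFT bytes, i.e. 2^(PAGE_SHIFT-10) KiB */
	printf("Memory: %luk available\n", nr_free_pages << (PAGE_SHIFT - 10));

	/* byte range -> KiB */
	printf("Freeing unused memory: %lluk freed\n", (end - begin) >> 10);
	return 0;
}
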
14712@@ -1026,6 +974,8 @@ void free_initmem(void)
14713 }
14714
14715 #ifdef CONFIG_DEBUG_RODATA
14716+const int rodata_test_data = 0xC3;
14717+EXPORT_SYMBOL_GPL(rodata_test_data);
14718
14719 void mark_rodata_ro(void)
14720 {
14721@@ -1047,18 +997,27 @@ void mark_rodata_ro(void)
14722 if (end <= start)
14723 return;
14724
14725- change_page_attr_addr(start, (end - start) >> PAGE_SHIFT, PAGE_KERNEL_RO);
14726
14727 printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
14728 (end - start) >> 10);
14729+ set_memory_ro(start, (end - start) >> PAGE_SHIFT);
14730
14731 /*
14732- * change_page_attr_addr() requires a global_flush_tlb() call after it.
14733- * We do this after the printk so that if something went wrong in the
14734- * change, the printk gets out at least to give a better debug hint
14735- * of who is the culprit.
14736+ * The rodata section (but not the kernel text!) should also be
14737+ * not-executable.
14738 */
14739- global_flush_tlb();
14740+ start = ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK;
14741+ set_memory_nx(start, (end - start) >> PAGE_SHIFT);
14742+
14743+ rodata_test();
14744+
14745+#ifdef CONFIG_CPA_DEBUG
14746+ printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, end);
14747+ set_memory_rw(start, (end-start) >> PAGE_SHIFT);
14748+
14749+ printk(KERN_INFO "Testing CPA: again\n");
14750+ set_memory_ro(start, (end-start) >> PAGE_SHIFT);
14751+#endif
14752 }
14753 #endif
14754
14755@@ -1069,17 +1028,21 @@ void free_initrd_mem(unsigned long start
14756 }
14757 #endif
14758
14759-void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
14760-{
14761+void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
14762+{
14763 #ifdef CONFIG_NUMA
14764 int nid = phys_to_nid(phys);
14765 #endif
14766 unsigned long pfn = phys >> PAGE_SHIFT;
14767+
14768 if (pfn >= end_pfn) {
14769- /* This can happen with kdump kernels when accessing firmware
14770- tables. */
14771+ /*
14772+ * This can happen with kdump kernels when accessing
14773+ * firmware tables:
14774+ */
14775 if (pfn < end_pfn_map)
14776 return;
14777+
14778 printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %u\n",
14779 phys, len);
14780 return;
14781@@ -1087,9 +1050,9 @@ void __init reserve_bootmem_generic(unsi
14782
14783 /* Should check here against the e820 map to avoid double free */
14784 #ifdef CONFIG_NUMA
14785- reserve_bootmem_node(NODE_DATA(nid), phys, len);
14786-#else
14787- reserve_bootmem(phys, len);
14788+ reserve_bootmem_node(NODE_DATA(nid), phys, len, BOOTMEM_DEFAULT);
14789+#else
14790+ reserve_bootmem(phys, len, BOOTMEM_DEFAULT);
14791 #endif
14792 #ifndef CONFIG_XEN
14793 if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) {
14794@@ -1099,46 +1062,49 @@ void __init reserve_bootmem_generic(unsi
14795 #endif
14796 }
14797
14798-int kern_addr_valid(unsigned long addr)
14799-{
14800+int kern_addr_valid(unsigned long addr)
14801+{
14802 unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
14803- pgd_t *pgd;
14804- pud_t *pud;
14805- pmd_t *pmd;
14806- pte_t *pte;
14807+ pgd_t *pgd;
14808+ pud_t *pud;
14809+ pmd_t *pmd;
14810+ pte_t *pte;
14811
14812 if (above != 0 && above != -1UL)
14813- return 0;
14814-
14815+ return 0;
14816+
14817 pgd = pgd_offset_k(addr);
14818 if (pgd_none(*pgd))
14819 return 0;
14820
14821 pud = pud_offset(pgd, addr);
14822 if (pud_none(*pud))
14823- return 0;
14824+ return 0;
14825
14826 pmd = pmd_offset(pud, addr);
14827 if (pmd_none(*pmd))
14828 return 0;
14829+
14830 if (pmd_large(*pmd))
14831 return pfn_valid(pmd_pfn(*pmd));
14832
14833 pte = pte_offset_kernel(pmd, addr);
14834 if (pte_none(*pte))
14835 return 0;
14836+
14837 return pfn_valid(pte_pfn(*pte));
14838 }
14839
14840-/* A pseudo VMA to allow ptrace access for the vsyscall page. This only
14841- covers the 64bit vsyscall page now. 32bit has a real VMA now and does
14842- not need special handling anymore. */
14843-
14844+/*
14845+ * A pseudo VMA to allow ptrace access for the vsyscall page. This only
14846+ * covers the 64bit vsyscall page now. 32bit has a real VMA now and does
14847+ * not need special handling anymore:
14848+ */
14849 static struct vm_area_struct gate_vma = {
14850- .vm_start = VSYSCALL_START,
14851- .vm_end = VSYSCALL_START + (VSYSCALL_MAPPED_PAGES << PAGE_SHIFT),
14852- .vm_page_prot = PAGE_READONLY_EXEC,
14853- .vm_flags = VM_READ | VM_EXEC
14854+ .vm_start = VSYSCALL_START,
14855+ .vm_end = VSYSCALL_START + (VSYSCALL_MAPPED_PAGES * PAGE_SIZE),
14856+ .vm_page_prot = PAGE_READONLY_EXEC,
14857+ .vm_flags = VM_READ | VM_EXEC
14858 };
14859
14860 struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
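
kern_addr_valid() in the hunk above begins with a quick form check: the address, sign-extended and shifted right by __VIRTUAL_MASK_SHIFT, must leave either all zeros or all ones, otherwise the address cannot be a valid kernel or user virtual address. The sketch below reproduces only that test; the 48-bit width is assumed here (it matches x86-64 kernels of this vintage), and the sample addresses are illustrative.

/* Sketch of the upper-bits test at the top of kern_addr_valid() above.
 * Relies on arithmetic right shift of signed values, as the kernel code
 * does on the ABIs it targets. */
#include <stdint.h>
#include <stdio.h>

#define VIRTUAL_MASK_SHIFT 48     /* assumed: x86-64 __VIRTUAL_MASK_SHIFT */

static int upper_bits_ok(uint64_t addr)
{
	int64_t above = (int64_t)addr >> VIRTUAL_MASK_SHIFT;
	return above == 0 || above == -1;
}

int main(void)
{
	uint64_t samples[] = {
		0x00007fffffffe000ull,   /* upper bits all zero: passes   */
		0xffff880000000000ull,   /* upper bits all one:  passes   */
		0x0001000000000000ull,   /* stray bit 48 set:    rejected */
		0xdead000000000000ull,   /* garbage upper bits:  rejected */
	};

	for (unsigned i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
		printf("%#018llx -> %s\n", (unsigned long long)samples[i],
		       upper_bits_ok(samples[i]) ? "passes the check" : "rejected");
	return 0;
}
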
14861@@ -1153,14 +1119,17 @@ struct vm_area_struct *get_gate_vma(stru
14862 int in_gate_area(struct task_struct *task, unsigned long addr)
14863 {
14864 struct vm_area_struct *vma = get_gate_vma(task);
14865+
14866 if (!vma)
14867 return 0;
14868+
14869 return (addr >= vma->vm_start) && (addr < vma->vm_end);
14870 }
14871
14872-/* Use this when you have no reliable task/vma, typically from interrupt
14873- * context. It is less reliable than using the task's vma and may give
14874- * false positives.
14875+/*
14876+ * Use this when you have no reliable task/vma, typically from interrupt
14877+ * context. It is less reliable than using the task's vma and may give
14878+ * false positives:
14879 */
14880 int in_gate_area_no_task(unsigned long addr)
14881 {
14882@@ -1180,8 +1149,8 @@ const char *arch_vma_name(struct vm_area
14883 /*
14884 * Initialise the sparsemem vmemmap using huge-pages at the PMD level.
14885 */
14886-int __meminit vmemmap_populate(struct page *start_page,
14887- unsigned long size, int node)
14888+int __meminit
14889+vmemmap_populate(struct page *start_page, unsigned long size, int node)
14890 {
14891 unsigned long addr = (unsigned long)start_page;
14892 unsigned long end = (unsigned long)(start_page + size);
14893@@ -1196,6 +1165,7 @@ int __meminit vmemmap_populate(struct pa
14894 pgd = vmemmap_pgd_populate(addr, node);
14895 if (!pgd)
14896 return -ENOMEM;
14897+
14898 pud = vmemmap_pud_populate(pgd, addr, node);
14899 if (!pud)
14900 return -ENOMEM;
14901@@ -1203,20 +1173,22 @@ int __meminit vmemmap_populate(struct pa
14902 pmd = pmd_offset(pud, addr);
14903 if (pmd_none(*pmd)) {
14904 pte_t entry;
14905- void *p = vmemmap_alloc_block(PMD_SIZE, node);
14906+ void *p;
14907+
14908+ p = vmemmap_alloc_block(PMD_SIZE, node);
14909 if (!p)
14910 return -ENOMEM;
14911
14912- entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
14913- mk_pte_huge(entry);
14914- set_pmd(pmd, __pmd(pte_val(entry)));
14915+ entry = pfn_pte(__pa(p) >> PAGE_SHIFT,
14916+ PAGE_KERNEL_LARGE);
14917+ set_pmd(pmd, __pmd_ma(__pte_val(entry)));
14918
14919 printk(KERN_DEBUG " [%lx-%lx] PMD ->%p on node %d\n",
14920 addr, addr + PMD_SIZE - 1, p, node);
14921- } else
14922+ } else {
14923 vmemmap_verify((pte_t *)pmd, node, addr, next);
14924+ }
14925 }
14926-
14927 return 0;
14928 }
14929 #endif
14930--- /dev/null 1970-01-01 00:00:00.000000000 +0000
14931+++ sle11-2009-06-29/arch/x86/mm/ioremap-xen.c 2009-03-16 16:33:40.000000000 +0100
14932@@ -0,0 +1,687 @@
14933+/*
14934+ * Re-map IO memory to kernel address space so that we can access it.
14935+ * This is needed for high PCI addresses that aren't mapped in the
14936+ * 640k-1MB IO memory area on PC's
14937+ *
14938+ * (C) Copyright 1995 1996 Linus Torvalds
14939+ */
14940+
14941+#include <linux/bootmem.h>
14942+#include <linux/init.h>
14943+#include <linux/io.h>
14944+#include <linux/module.h>
14945+#include <linux/pfn.h>
14946+#include <linux/slab.h>
14947+#include <linux/vmalloc.h>
14948+
14949+#include <asm/cacheflush.h>
14950+#include <asm/e820.h>
14951+#include <asm/fixmap.h>
14952+#include <asm/pgtable.h>
14953+#include <asm/tlbflush.h>
14954+#include <asm/pgalloc.h>
14955+
14956+enum ioremap_mode {
14957+ IOR_MODE_UNCACHED,
14958+ IOR_MODE_CACHED,
14959+};
14960+
14961+#if defined(CONFIG_X86_64) && !defined(CONFIG_XEN)
14962+
14963+unsigned long __phys_addr(unsigned long x)
14964+{
14965+ if (x >= __START_KERNEL_map)
14966+ return x - __START_KERNEL_map + phys_base;
14967+ return x - PAGE_OFFSET;
14968+}
14969+EXPORT_SYMBOL(__phys_addr);
14970+
14971+#endif
14972+
14973+static int direct_remap_area_pte_fn(pte_t *pte,
14974+ struct page *pmd_page,
14975+ unsigned long address,
14976+ void *data)
14977+{
14978+ mmu_update_t **v = (mmu_update_t **)data;
14979+
14980+ BUG_ON(!pte_none(*pte));
14981+
14982+ (*v)->ptr = ((u64)pfn_to_mfn(page_to_pfn(pmd_page)) <<
14983+ PAGE_SHIFT) | ((unsigned long)pte & ~PAGE_MASK);
14984+ (*v)++;
14985+
14986+ return 0;
14987+}
14988+
14989+static int __direct_remap_pfn_range(struct mm_struct *mm,
14990+ unsigned long address,
14991+ unsigned long mfn,
14992+ unsigned long size,
14993+ pgprot_t prot,
14994+ domid_t domid)
14995+{
14996+ int rc;
14997+ unsigned long i, start_address;
14998+ mmu_update_t *u, *v, *w;
14999+
15000+ u = v = w = (mmu_update_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT);
15001+ if (u == NULL)
15002+ return -ENOMEM;
15003+
15004+ start_address = address;
15005+
15006+ flush_cache_all();
15007+
15008+ for (i = 0; i < size; i += PAGE_SIZE) {
15009+ if ((v - u) == (PAGE_SIZE / sizeof(mmu_update_t))) {
15010+ /* Flush a full batch after filling in the PTE ptrs. */
15011+ rc = apply_to_page_range(mm, start_address,
15012+ address - start_address,
15013+ direct_remap_area_pte_fn, &w);
15014+ if (rc)
15015+ goto out;
15016+ rc = -EFAULT;
15017+ if (HYPERVISOR_mmu_update(u, v - u, NULL, domid) < 0)
15018+ goto out;
15019+ v = w = u;
15020+ start_address = address;
15021+ }
15022+
15023+ /*
15024+ * Fill in the machine address: PTE ptr is done later by
15025+ * apply_to_page_range().
15026+ */
15027+ v->val = __pte_val(pfn_pte_ma(mfn, prot)) | _PAGE_IO;
15028+
15029+ mfn++;
15030+ address += PAGE_SIZE;
15031+ v++;
15032+ }
15033+
15034+ if (v != u) {
15035+ /* Final batch. */
15036+ rc = apply_to_page_range(mm, start_address,
15037+ address - start_address,
15038+ direct_remap_area_pte_fn, &w);
15039+ if (rc)
15040+ goto out;
15041+ rc = -EFAULT;
15042+ if (unlikely(HYPERVISOR_mmu_update(u, v - u, NULL, domid) < 0))
15043+ goto out;
15044+ }
15045+
15046+ rc = 0;
15047+
15048+ out:
15049+ flush_tlb_all();
15050+
15051+ free_page((unsigned long)u);
15052+
15053+ return rc;
15054+}
15055+
15056+int direct_remap_pfn_range(struct vm_area_struct *vma,
15057+ unsigned long address,
15058+ unsigned long mfn,
15059+ unsigned long size,
15060+ pgprot_t prot,
15061+ domid_t domid)
15062+{
15063+ if (xen_feature(XENFEAT_auto_translated_physmap))
15064+ return remap_pfn_range(vma, address, mfn, size, prot);
15065+
15066+ if (domid == DOMID_SELF)
15067+ return -EINVAL;
15068+
15069+ vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;
15070+
15071+ vma->vm_mm->context.has_foreign_mappings = 1;
15072+
15073+ return __direct_remap_pfn_range(
15074+ vma->vm_mm, address, mfn, size, prot, domid);
15075+}
15076+EXPORT_SYMBOL(direct_remap_pfn_range);
15077+
15078+int direct_kernel_remap_pfn_range(unsigned long address,
15079+ unsigned long mfn,
15080+ unsigned long size,
15081+ pgprot_t prot,
15082+ domid_t domid)
15083+{
15084+ return __direct_remap_pfn_range(
15085+ &init_mm, address, mfn, size, prot, domid);
15086+}
15087+EXPORT_SYMBOL(direct_kernel_remap_pfn_range);
15088+
15089+static int lookup_pte_fn(
15090+ pte_t *pte, struct page *pmd_page, unsigned long addr, void *data)
15091+{
15092+ uint64_t *ptep = (uint64_t *)data;
15093+ if (ptep)
15094+ *ptep = ((uint64_t)pfn_to_mfn(page_to_pfn(pmd_page)) <<
15095+ PAGE_SHIFT) | ((unsigned long)pte & ~PAGE_MASK);
15096+ return 0;
15097+}
15098+
15099+int create_lookup_pte_addr(struct mm_struct *mm,
15100+ unsigned long address,
15101+ uint64_t *ptep)
15102+{
15103+ return apply_to_page_range(mm, address, PAGE_SIZE,
15104+ lookup_pte_fn, ptep);
15105+}
15106+
15107+EXPORT_SYMBOL(create_lookup_pte_addr);
15108+
15109+static int noop_fn(
15110+ pte_t *pte, struct page *pmd_page, unsigned long addr, void *data)
15111+{
15112+ return 0;
15113+}
15114+
15115+int touch_pte_range(struct mm_struct *mm,
15116+ unsigned long address,
15117+ unsigned long size)
15118+{
15119+ return apply_to_page_range(mm, address, size, noop_fn, NULL);
15120+}
15121+
15122+EXPORT_SYMBOL(touch_pte_range);
15123+
15124+#ifdef CONFIG_X86_32
15125+int page_is_ram(unsigned long pagenr)
15126+{
15127+ unsigned long addr, end;
15128+ int i;
15129+
15130+#ifndef CONFIG_XEN
15131+ /*
15132+ * A special case is the first 4Kb of memory;
15133+ * This is a BIOS owned area, not kernel ram, but generally
15134+ * not listed as such in the E820 table.
15135+ */
15136+ if (pagenr == 0)
15137+ return 0;
15138+
15139+ /*
15140+ * Second special case: Some BIOSen report the PC BIOS
15141+ * area (640->1Mb) as ram even though it is not.
15142+ */
15143+ if (pagenr >= (BIOS_BEGIN >> PAGE_SHIFT) &&
15144+ pagenr < (BIOS_END >> PAGE_SHIFT))
15145+ return 0;
15146+#endif
15147+
15148+ for (i = 0; i < e820.nr_map; i++) {
15149+ /*
15150+ * Not usable memory:
15151+ */
15152+ if (e820.map[i].type != E820_RAM)
15153+ continue;
15154+ addr = (e820.map[i].addr + PAGE_SIZE-1) >> PAGE_SHIFT;
15155+ end = (e820.map[i].addr + e820.map[i].size) >> PAGE_SHIFT;
15156+
15157+
15158+ if ((pagenr >= addr) && (pagenr < end))
15159+ return 1;
15160+ }
15161+ return 0;
15162+}
15163+#endif
15164+
15165+/*
15166+ * Fix up the linear direct mapping of the kernel to avoid cache attribute
15167+ * conflicts.
15168+ */
15169+static int ioremap_change_attr(unsigned long vaddr, unsigned long size,
15170+ enum ioremap_mode mode)
15171+{
15172+ unsigned long nrpages = size >> PAGE_SHIFT;
15173+ int err;
15174+
15175+ switch (mode) {
15176+ case IOR_MODE_UNCACHED:
15177+ default:
15178+ err = set_memory_uc(vaddr, nrpages);
15179+ break;
15180+ case IOR_MODE_CACHED:
15181+ err = set_memory_wb(vaddr, nrpages);
15182+ break;
15183+ }
15184+
15185+ return err;
15186+}
15187+
15188+/*
15189+ * Remap an arbitrary physical address space into the kernel virtual
15190+ * address space. Needed when the kernel wants to access high addresses
15191+ * directly.
15192+ *
15193+ * NOTE! We need to allow non-page-aligned mappings too: we will obviously
15194+ * have to convert them into an offset in a page-aligned mapping, but the
15195+ * caller shouldn't need to know that small detail.
15196+ */
15197+static void __iomem *__ioremap(resource_size_t phys_addr, unsigned long size,
15198+ enum ioremap_mode mode)
15199+{
15200+ unsigned long mfn, offset, last_addr, vaddr;
15201+ struct vm_struct *area;
15202+ pgprot_t prot;
15203+ domid_t domid = DOMID_IO;
15204+
15205+ /* Don't allow wraparound or zero size */
15206+ last_addr = phys_addr + size - 1;
15207+ if (!size || last_addr < phys_addr)
15208+ return NULL;
15209+
15210+ /*
15211+ * Don't remap the low PCI/ISA area, it's always mapped..
15212+ */
15213+ if (is_initial_xendomain() && last_addr < ISA_END_ADDRESS)
15214+ return (__force void __iomem *)isa_bus_to_virt((unsigned long)phys_addr);
15215+
15216+ /*
15217+ * Don't allow anybody to remap normal RAM that we're using..
15218+ */
15219+ for (mfn = PFN_DOWN(phys_addr); mfn < PFN_UP(last_addr); mfn++) {
15220+ unsigned long pfn = mfn_to_local_pfn(mfn);
15221+
15222+ if (pfn >= max_pfn)
15223+ continue;
15224+
15225+ domid = DOMID_SELF;
15226+
15227+ if (pfn >= max_pfn_mapped) /* bogus */
15228+ continue;
15229+
15230+ if (pfn_valid(pfn) && !PageReserved(pfn_to_page(pfn)))
15231+ return NULL;
15232+ }
15233+
15234+ switch (mode) {
15235+ case IOR_MODE_UNCACHED:
15236+ default:
15237+ /*
15238+ * FIXME: we will use UC MINUS for now, as video fb drivers
15239+ * depend on it. Upcoming ioremap_wc() will fix this behavior.
15240+ */
15241+ prot = PAGE_KERNEL_UC_MINUS;
15242+ break;
15243+ case IOR_MODE_CACHED:
15244+ prot = PAGE_KERNEL;
15245+ break;
15246+ }
15247+
15248+ /*
15249+ * Mappings have to be page-aligned
15250+ */
15251+ offset = phys_addr & ~PAGE_MASK;
15252+ phys_addr &= PAGE_MASK;
15253+ size = PAGE_ALIGN(last_addr+1) - phys_addr;
15254+
15255+ /*
15256+ * Ok, go for it..
15257+ */
15258+ area = get_vm_area(size, VM_IOREMAP | (mode << 20));
15259+ if (!area)
15260+ return NULL;
15261+ area->phys_addr = phys_addr;
15262+ vaddr = (unsigned long) area->addr;
15263+ if (__direct_remap_pfn_range(&init_mm, vaddr, PFN_DOWN(phys_addr),
15264+ size, prot, domid)) {
15265+ free_vm_area(area);
15266+ return NULL;
15267+ }
15268+
15269+ if (ioremap_change_attr(vaddr, size, mode) < 0) {
15270+ iounmap((void __iomem *) vaddr);
15271+ return NULL;
15272+ }
15273+
15274+ return (void __iomem *) (vaddr + offset);
15275+}
15276+
15277+/**
15278+ * ioremap_nocache - map bus memory into CPU space
15279+ * @offset: bus address of the memory
15280+ * @size: size of the resource to map
15281+ *
15282+ * ioremap_nocache performs a platform specific sequence of operations to
15283+ * make bus memory CPU accessible via the readb/readw/readl/writeb/
15284+ * writew/writel functions and the other mmio helpers. The returned
15285+ * address is not guaranteed to be usable directly as a virtual
15286+ * address.
15287+ *
15288+ * This version of ioremap ensures that the memory is marked uncachable
15289+ * on the CPU as well as honouring existing caching rules from things like
15290+ * the PCI bus. Note that there are other caches and buffers on many
15291+ * busses. In particular driver authors should read up on PCI writes
15292+ *
15293+ * It's useful if some control registers are in such an area and
15294+ * write combining or read caching is not desirable:
15295+ *
15296+ * Must be freed with iounmap.
15297+ */
15298+void __iomem *ioremap_nocache(resource_size_t phys_addr, unsigned long size)
15299+{
15300+ return __ioremap(phys_addr, size, IOR_MODE_UNCACHED);
15301+}
15302+EXPORT_SYMBOL(ioremap_nocache);
15303+
15304+void __iomem *ioremap_cache(resource_size_t phys_addr, unsigned long size)
15305+{
15306+ return __ioremap(phys_addr, size, IOR_MODE_CACHED);
15307+}
15308+EXPORT_SYMBOL(ioremap_cache);
15309+
15310+/**
15311+ * iounmap - Free a IO remapping
15312+ * @addr: virtual address from ioremap_*
15313+ *
15314+ * Caller must ensure there is only one unmapping for the same pointer.
15315+ */
15316+void iounmap(volatile void __iomem *addr)
15317+{
15318+ struct vm_struct *p, *o;
15319+
15320+ if ((void __force *)addr <= high_memory)
15321+ return;
15322+
15323+ /*
15324+ * __ioremap special-cases the PCI/ISA range by not instantiating a
15325+ * vm_area and by simply returning an address into the kernel mapping
15326+ * of ISA space. So handle that here.
15327+ */
15328+ if ((unsigned long)addr >= fix_to_virt(FIX_ISAMAP_BEGIN))
15329+ return;
15330+
15331+ addr = (volatile void __iomem *)
15332+ (PAGE_MASK & (unsigned long __force)addr);
15333+
15334+ /* Use the vm area unlocked, assuming the caller
15335+ ensures there isn't another iounmap for the same address
15336+ in parallel. Reuse of the virtual address is prevented by
15337+ leaving it in the global lists until we're done with it.
15338+ cpa takes care of the direct mappings. */
15339+ read_lock(&vmlist_lock);
15340+ for (p = vmlist; p; p = p->next) {
15341+ if (p->addr == addr)
15342+ break;
15343+ }
15344+ read_unlock(&vmlist_lock);
15345+
15346+ if (!p) {
15347+ printk(KERN_ERR "iounmap: bad address %p\n", addr);
15348+ dump_stack();
15349+ return;
15350+ }
15351+
15352+ if ((p->flags >> 20) != IOR_MODE_CACHED) {
15353+ unsigned long n = get_vm_area_size(p) >> PAGE_SHIFT;
15354+ unsigned long mfn = p->phys_addr;
15355+ unsigned long va = (unsigned long)addr;
15356+
15357+ for (; n > 0; n--, mfn++, va += PAGE_SIZE)
15358+ if (mfn_to_local_pfn(mfn) < max_pfn)
15359+ set_memory_wb(va, 1);
15360+ }
15361+
15362+ /* Finally remove it */
15363+ o = remove_vm_area((void *)addr);
15364+ BUG_ON(p != o || o == NULL);
15365+ kfree(p);
15366+}
15367+EXPORT_SYMBOL(iounmap);
15368+
15369+int __initdata early_ioremap_debug;
15370+
15371+static int __init early_ioremap_debug_setup(char *str)
15372+{
15373+ early_ioremap_debug = 1;
15374+
15375+ return 0;
15376+}
15377+early_param("early_ioremap_debug", early_ioremap_debug_setup);
15378+
15379+static __initdata int after_paging_init;
15380+static __initdata pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)]
15381+ __attribute__((aligned(PAGE_SIZE)));
15382+
15383+#ifdef CONFIG_X86_32
15384+static inline pmd_t * __init early_ioremap_pmd(unsigned long addr)
15385+{
15386+ /* Don't assume we're using swapper_pg_dir at this point */
15387+ pgd_t *base = __va(read_cr3());
15388+ pgd_t *pgd = &base[pgd_index(addr)];
15389+ pud_t *pud = pud_offset(pgd, addr);
15390+ pmd_t *pmd = pmd_offset(pud, addr);
15391+
15392+ return pmd;
15393+}
15394+#else
15395+#define early_ioremap_pmd early_get_pmd
15396+#define make_lowmem_page_readonly early_make_page_readonly
15397+#define make_lowmem_page_writable make_page_writable
15398+#endif
15399+
15400+static inline pte_t * __init early_ioremap_pte(unsigned long addr)
15401+{
15402+ return &bm_pte[pte_index(addr)];
15403+}
15404+
15405+void __init early_ioremap_init(void)
15406+{
15407+ pmd_t *pmd;
15408+
15409+ if (early_ioremap_debug)
15410+ printk(KERN_INFO "early_ioremap_init()\n");
15411+
15412+ pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN));
15413+ memset(bm_pte, 0, sizeof(bm_pte));
15414+ make_lowmem_page_readonly(bm_pte, XENFEAT_writable_page_tables);
15415+ pmd_populate_kernel(&init_mm, pmd, bm_pte);
15416+
15417+ /*
15418+ * The boot-ioremap range spans multiple pmds, for which
15419+ * we are not prepared:
15420+ */
15421+ if (pmd != early_ioremap_pmd(fix_to_virt(FIX_BTMAP_END))) {
15422+ WARN_ON(1);
15423+ printk(KERN_WARNING "pmd %p != %p\n",
15424+ pmd, early_ioremap_pmd(fix_to_virt(FIX_BTMAP_END)));
15425+ printk(KERN_WARNING "fix_to_virt(FIX_BTMAP_BEGIN): %08lx\n",
15426+ fix_to_virt(FIX_BTMAP_BEGIN));
15427+ printk(KERN_WARNING "fix_to_virt(FIX_BTMAP_END): %08lx\n",
15428+ fix_to_virt(FIX_BTMAP_END));
15429+
15430+ printk(KERN_WARNING "FIX_BTMAP_END: %d\n", FIX_BTMAP_END);
15431+ printk(KERN_WARNING "FIX_BTMAP_BEGIN: %d\n",
15432+ FIX_BTMAP_BEGIN);
15433+ }
15434+}
15435+
15436+#ifdef CONFIG_X86_32
15437+void __init early_ioremap_clear(void)
15438+{
15439+ pmd_t *pmd;
15440+
15441+ if (early_ioremap_debug)
15442+ printk(KERN_INFO "early_ioremap_clear()\n");
15443+
15444+ pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN));
15445+ pmd_clear(pmd);
15446+ make_lowmem_page_writable(bm_pte, XENFEAT_writable_page_tables);
15447+ /* paravirt_release_pt(__pa(bm_pte) >> PAGE_SHIFT); */
15448+ __flush_tlb_all();
15449+}
15450+
15451+void __init early_ioremap_reset(void)
15452+{
15453+ enum fixed_addresses idx;
15454+ unsigned long addr, phys;
15455+ pte_t *pte;
15456+
15457+ after_paging_init = 1;
15458+ for (idx = FIX_BTMAP_BEGIN; idx >= FIX_BTMAP_END; idx--) {
15459+ addr = fix_to_virt(idx);
15460+ pte = early_ioremap_pte(addr);
15461+ if (pte_present(*pte)) {
15462+ phys = __pte_val(*pte) & PAGE_MASK;
15463+ set_fixmap(idx, phys);
15464+ }
15465+ }
15466+}
15467+#endif /* CONFIG_X86_32 */
15468+
15469+static void __init __early_set_fixmap(enum fixed_addresses idx,
15470+ unsigned long phys, pgprot_t flags)
15471+{
15472+ unsigned long addr = __fix_to_virt(idx);
15473+ pte_t *pte;
15474+
15475+ if (idx >= __end_of_fixed_addresses) {
15476+ BUG();
15477+ return;
15478+ }
15479+ pte = early_ioremap_pte(addr);
15480+ if (pgprot_val(flags))
15481+ set_pte(pte, pfn_pte_ma(phys >> PAGE_SHIFT, flags));
15482+ else
15483+ pte_clear(NULL, addr, pte);
15484+ __flush_tlb_one(addr);
15485+}
15486+
15487+static inline void __init early_set_fixmap(enum fixed_addresses idx,
15488+ unsigned long phys)
15489+{
15490+ if (after_paging_init)
15491+ set_fixmap(idx, phys);
15492+ else
15493+ __early_set_fixmap(idx, phys, PAGE_KERNEL);
15494+}
15495+
15496+static inline void __init early_clear_fixmap(enum fixed_addresses idx)
15497+{
15498+ if (after_paging_init)
15499+ clear_fixmap(idx);
15500+ else
15501+ __early_set_fixmap(idx, 0, __pgprot(0));
15502+}
15503+
15504+
15505+int __initdata early_ioremap_nested;
15506+
15507+static int __init check_early_ioremap_leak(void)
15508+{
15509+ if (!early_ioremap_nested)
15510+ return 0;
15511+
15512+ printk(KERN_WARNING
15513+ "Debug warning: early ioremap leak of %d areas detected.\n",
15514+ early_ioremap_nested);
15515+ printk(KERN_WARNING
15516+ "please boot with early_ioremap_debug and report the dmesg.\n");
15517+ WARN_ON(1);
15518+
15519+ return 1;
15520+}
15521+late_initcall(check_early_ioremap_leak);
15522+
15523+void __init *early_ioremap(unsigned long phys_addr, unsigned long size)
15524+{
15525+ unsigned long offset, last_addr;
15526+ unsigned int nrpages, nesting;
15527+ enum fixed_addresses idx0, idx;
15528+
15529+ WARN_ON(system_state != SYSTEM_BOOTING);
15530+
15531+ nesting = early_ioremap_nested;
15532+ if (early_ioremap_debug) {
15533+ printk(KERN_INFO "early_ioremap(%08lx, %08lx) [%d] => ",
15534+ phys_addr, size, nesting);
15535+ dump_stack();
15536+ }
15537+
15538+ /* Don't allow wraparound or zero size */
15539+ last_addr = phys_addr + size - 1;
15540+ if (!size || last_addr < phys_addr) {
15541+ WARN_ON(1);
15542+ return NULL;
15543+ }
15544+
15545+ if (nesting >= FIX_BTMAPS_NESTING) {
15546+ WARN_ON(1);
15547+ return NULL;
15548+ }
15549+ early_ioremap_nested++;
15550+ /*
15551+ * Mappings have to be page-aligned
15552+ */
15553+ offset = phys_addr & ~PAGE_MASK;
15554+ phys_addr &= PAGE_MASK;
15555+ size = PAGE_ALIGN(last_addr) - phys_addr;
15556+
15557+ /*
15558+ * Mappings have to fit in the FIX_BTMAP area.
15559+ */
15560+ nrpages = size >> PAGE_SHIFT;
15561+ if (nrpages > NR_FIX_BTMAPS) {
15562+ WARN_ON(1);
15563+ return NULL;
15564+ }
15565+
15566+ /*
15567+ * Ok, go for it..
15568+ */
15569+ idx0 = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*nesting;
15570+ idx = idx0;
15571+ while (nrpages > 0) {
15572+ early_set_fixmap(idx, phys_addr);
15573+ phys_addr += PAGE_SIZE;
15574+ --idx;
15575+ --nrpages;
15576+ }
15577+ if (early_ioremap_debug)
15578+ printk(KERN_CONT "%08lx + %08lx\n", offset, fix_to_virt(idx0));
15579+
15580+ return (void *) (offset + fix_to_virt(idx0));
15581+}
15582+
15583+void __init early_iounmap(void *addr, unsigned long size)
15584+{
15585+ unsigned long virt_addr;
15586+ unsigned long offset;
15587+ unsigned int nrpages;
15588+ enum fixed_addresses idx;
15589+ unsigned int nesting;
15590+
15591+ nesting = --early_ioremap_nested;
15592+ WARN_ON(nesting < 0);
15593+
15594+ if (early_ioremap_debug) {
15595+ printk(KERN_INFO "early_iounmap(%p, %08lx) [%d]\n", addr,
15596+ size, nesting);
15597+ dump_stack();
15598+ }
15599+
15600+ virt_addr = (unsigned long)addr;
15601+ if (virt_addr < fix_to_virt(FIX_BTMAP_BEGIN)) {
15602+ WARN_ON(1);
15603+ return;
15604+ }
15605+ offset = virt_addr & ~PAGE_MASK;
15606+ nrpages = PAGE_ALIGN(offset + size - 1) >> PAGE_SHIFT;
15607+
15608+ idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*nesting;
15609+ while (nrpages > 0) {
15610+ early_clear_fixmap(idx);
15611+ --idx;
15612+ --nrpages;
15613+ }
15614+}
15615+
15616+void __this_fixmap_does_not_exist(void)
15617+{
15618+ WARN_ON(1);
15619+}
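
The new __ioremap() added above preserves the caller's sub-page offset while mapping whole pages: it saves offset = phys_addr & ~PAGE_MASK, truncates phys_addr to a page boundary, sizes the mapping as PAGE_ALIGN(last_addr + 1) - phys_addr, and finally returns vaddr + offset. The following standalone sketch shows only that bookkeeping; the physical address and size are made up, and the "vaddr" is a fake base rather than a real mapping.

/* Sketch of the page-alignment bookkeeping in __ioremap() above.
 * Nothing is actually mapped; the returned "vaddr" is a pretend base. */
#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT    12
#define PAGE_SIZE     (1ULL << PAGE_SHIFT)
#define PAGE_MASK     (~(PAGE_SIZE - 1))
#define PAGE_ALIGN(x) (((x) + PAGE_SIZE - 1) & PAGE_MASK)

int main(void)
{
	uint64_t phys_addr = 0xfed00ac4ull;     /* made-up, unaligned MMIO address */
	uint64_t size      = 0x30ull;           /* 48 bytes requested              */
	uint64_t last_addr = phys_addr + size - 1;

	uint64_t offset = phys_addr & ~PAGE_MASK;     /* byte offset within the page */
	phys_addr &= PAGE_MASK;                       /* map whole pages             */
	size = PAGE_ALIGN(last_addr + 1) - phys_addr;

	uint64_t vaddr = 0xffffc20000000000ull;       /* pretend base from the vmap allocator */

	printf("map %#llx..%#llx (%llu bytes), caller gets %#llx\n",
	       (unsigned long long)phys_addr,
	       (unsigned long long)(phys_addr + size - 1),
	       (unsigned long long)size,
	       (unsigned long long)(vaddr + offset));
	return 0;
}
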
15620--- sle11-2009-06-29.orig/arch/x86/mm/ioremap_32-xen.c 2009-02-16 16:17:21.000000000 +0100
15621+++ /dev/null 1970-01-01 00:00:00.000000000 +0000
15622@@ -1,445 +0,0 @@
15623-/*
15624- * arch/i386/mm/ioremap.c
15625- *
15626- * Re-map IO memory to kernel address space so that we can access it.
15627- * This is needed for high PCI addresses that aren't mapped in the
15628- * 640k-1MB IO memory area on PC's
15629- *
15630- * (C) Copyright 1995 1996 Linus Torvalds
15631- */
15632-
15633-#include <linux/vmalloc.h>
15634-#include <linux/init.h>
15635-#include <linux/slab.h>
15636-#include <linux/module.h>
15637-#include <linux/io.h>
15638-#include <linux/sched.h>
15639-#include <asm/fixmap.h>
15640-#include <asm/cacheflush.h>
15641-#include <asm/tlbflush.h>
15642-#include <asm/pgtable.h>
15643-#include <asm/pgalloc.h>
15644-
15645-#define ISA_START_ADDRESS 0x0
15646-#define ISA_END_ADDRESS 0x100000
15647-
15648-static int direct_remap_area_pte_fn(pte_t *pte,
15649- struct page *pmd_page,
15650- unsigned long address,
15651- void *data)
15652-{
15653- mmu_update_t **v = (mmu_update_t **)data;
15654-
15655- BUG_ON(!pte_none(*pte));
15656-
15657- (*v)->ptr = ((u64)pfn_to_mfn(page_to_pfn(pmd_page)) <<
15658- PAGE_SHIFT) | ((unsigned long)pte & ~PAGE_MASK);
15659- (*v)++;
15660-
15661- return 0;
15662-}
15663-
15664-static int __direct_remap_pfn_range(struct mm_struct *mm,
15665- unsigned long address,
15666- unsigned long mfn,
15667- unsigned long size,
15668- pgprot_t prot,
15669- domid_t domid)
15670-{
15671- int rc;
15672- unsigned long i, start_address;
15673- mmu_update_t *u, *v, *w;
15674-
15675- u = v = w = (mmu_update_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT);
15676- if (u == NULL)
15677- return -ENOMEM;
15678-
15679- start_address = address;
15680-
15681- flush_cache_all();
15682-
15683- for (i = 0; i < size; i += PAGE_SIZE) {
15684- if ((v - u) == (PAGE_SIZE / sizeof(mmu_update_t))) {
15685- /* Flush a full batch after filling in the PTE ptrs. */
15686- rc = apply_to_page_range(mm, start_address,
15687- address - start_address,
15688- direct_remap_area_pte_fn, &w);
15689- if (rc)
15690- goto out;
15691- rc = -EFAULT;
15692- if (HYPERVISOR_mmu_update(u, v - u, NULL, domid) < 0)
15693- goto out;
15694- v = w = u;
15695- start_address = address;
15696- }
15697-
15698- /*
15699- * Fill in the machine address: PTE ptr is done later by
15700- * apply_to_page_range().
15701- */
15702- v->val = __pte_val(pfn_pte_ma(mfn, prot)) | _PAGE_IO;
15703-
15704- mfn++;
15705- address += PAGE_SIZE;
15706- v++;
15707- }
15708-
15709- if (v != u) {
15710- /* Final batch. */
15711- rc = apply_to_page_range(mm, start_address,
15712- address - start_address,
15713- direct_remap_area_pte_fn, &w);
15714- if (rc)
15715- goto out;
15716- rc = -EFAULT;
15717- if (unlikely(HYPERVISOR_mmu_update(u, v - u, NULL, domid) < 0))
15718- goto out;
15719- }
15720-
15721- rc = 0;
15722-
15723- out:
15724- flush_tlb_all();
15725-
15726- free_page((unsigned long)u);
15727-
15728- return rc;
15729-}
15730-
15731-int direct_remap_pfn_range(struct vm_area_struct *vma,
15732- unsigned long address,
15733- unsigned long mfn,
15734- unsigned long size,
15735- pgprot_t prot,
15736- domid_t domid)
15737-{
15738- if (xen_feature(XENFEAT_auto_translated_physmap))
15739- return remap_pfn_range(vma, address, mfn, size, prot);
15740-
15741- if (domid == DOMID_SELF)
15742- return -EINVAL;
15743-
15744- vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;
15745-
15746- vma->vm_mm->context.has_foreign_mappings = 1;
15747-
15748- return __direct_remap_pfn_range(
15749- vma->vm_mm, address, mfn, size, prot, domid);
15750-}
15751-EXPORT_SYMBOL(direct_remap_pfn_range);
15752-
15753-int direct_kernel_remap_pfn_range(unsigned long address,
15754- unsigned long mfn,
15755- unsigned long size,
15756- pgprot_t prot,
15757- domid_t domid)
15758-{
15759- return __direct_remap_pfn_range(
15760- &init_mm, address, mfn, size, prot, domid);
15761-}
15762-EXPORT_SYMBOL(direct_kernel_remap_pfn_range);
15763-
15764-static int lookup_pte_fn(
15765- pte_t *pte, struct page *pmd_page, unsigned long addr, void *data)
15766-{
15767- uint64_t *ptep = (uint64_t *)data;
15768- if (ptep)
15769- *ptep = ((uint64_t)pfn_to_mfn(page_to_pfn(pmd_page)) <<
15770- PAGE_SHIFT) | ((unsigned long)pte & ~PAGE_MASK);
15771- return 0;
15772-}
15773-
15774-int create_lookup_pte_addr(struct mm_struct *mm,
15775- unsigned long address,
15776- uint64_t *ptep)
15777-{
15778- return apply_to_page_range(mm, address, PAGE_SIZE,
15779- lookup_pte_fn, ptep);
15780-}
15781-
15782-EXPORT_SYMBOL(create_lookup_pte_addr);
15783-
15784-static int noop_fn(
15785- pte_t *pte, struct page *pmd_page, unsigned long addr, void *data)
15786-{
15787- return 0;
15788-}
15789-
15790-int touch_pte_range(struct mm_struct *mm,
15791- unsigned long address,
15792- unsigned long size)
15793-{
15794- return apply_to_page_range(mm, address, size, noop_fn, NULL);
15795-}
15796-
15797-EXPORT_SYMBOL(touch_pte_range);
15798-
15799-/*
15800- * Does @address reside within a non-highmem page that is local to this virtual
15801- * machine (i.e., not an I/O page, nor a memory page belonging to another VM).
15802- * See the comment that accompanies mfn_to_local_pfn() in page.h to understand
15803- * why this works.
15804- */
15805-static inline int is_local_lowmem(unsigned long address)
15806-{
15807- extern unsigned long max_low_pfn;
15808- return (mfn_to_local_pfn(address >> PAGE_SHIFT) < max_low_pfn);
15809-}
15810-
15811-/*
15812- * Generic mapping function (not visible outside):
15813- */
15814-
15815-/*
15816- * Remap an arbitrary physical address space into the kernel virtual
15817- * address space. Needed when the kernel wants to access high addresses
15818- * directly.
15819- *
15820- * NOTE! We need to allow non-page-aligned mappings too: we will obviously
15821- * have to convert them into an offset in a page-aligned mapping, but the
15822- * caller shouldn't need to know that small detail.
15823- */
15824-void __iomem * __ioremap(unsigned long phys_addr, unsigned long size, unsigned long flags)
15825-{
15826- void __iomem * addr;
15827- struct vm_struct * area;
15828- unsigned long offset, last_addr;
15829- pgprot_t prot;
15830- domid_t domid = DOMID_IO;
15831-
15832- /* Don't allow wraparound or zero size */
15833- last_addr = phys_addr + size - 1;
15834- if (!size || last_addr < phys_addr)
15835- return NULL;
15836-
15837- /*
15838- * Don't remap the low PCI/ISA area, it's always mapped..
15839- */
15840- if (is_initial_xendomain() &&
15841- phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS)
15842- return (void __iomem *) isa_bus_to_virt(phys_addr);
15843-
15844- /*
15845- * Don't allow anybody to remap normal RAM that we're using..
15846- */
15847- if (is_local_lowmem(phys_addr)) {
15848- char *t_addr, *t_end;
15849- struct page *page;
15850-
15851- t_addr = bus_to_virt(phys_addr);
15852- t_end = t_addr + (size - 1);
15853-
15854- for(page = virt_to_page(t_addr); page <= virt_to_page(t_end); page++)
15855- if(!PageReserved(page))
15856- return NULL;
15857-
15858- domid = DOMID_SELF;
15859- }
15860-
15861- prot = __pgprot(_KERNPG_TABLE | flags);
15862-
15863- /*
15864- * Mappings have to be page-aligned
15865- */
15866- offset = phys_addr & ~PAGE_MASK;
15867- phys_addr &= PAGE_MASK;
15868- size = PAGE_ALIGN(last_addr+1) - phys_addr;
15869-
15870- /*
15871- * Ok, go for it..
15872- */
15873- area = get_vm_area(size, VM_IOREMAP | (flags << 20));
15874- if (!area)
15875- return NULL;
15876- area->phys_addr = phys_addr;
15877- addr = (void __iomem *) area->addr;
15878- if (__direct_remap_pfn_range(&init_mm, (unsigned long)addr,
15879- phys_addr>>PAGE_SHIFT,
15880- size, prot, domid)) {
15881- vunmap((void __force *) addr);
15882- return NULL;
15883- }
15884- return (void __iomem *) (offset + (char __iomem *)addr);
15885-}
15886-EXPORT_SYMBOL(__ioremap);
15887-
15888-/**
15889- * ioremap_nocache - map bus memory into CPU space
15890- * @offset: bus address of the memory
15891- * @size: size of the resource to map
15892- *
15893- * ioremap_nocache performs a platform specific sequence of operations to
15894- * make bus memory CPU accessible via the readb/readw/readl/writeb/
15895- * writew/writel functions and the other mmio helpers. The returned
15896- * address is not guaranteed to be usable directly as a virtual
15897- * address.
15898- *
15899- * This version of ioremap ensures that the memory is marked uncachable
15900- * on the CPU as well as honouring existing caching rules from things like
15901- * the PCI bus. Note that there are other caches and buffers on many
15902- * busses. In particular driver authors should read up on PCI writes
15903- *
15904- * It's useful if some control registers are in such an area and
15905- * write combining or read caching is not desirable:
15906- *
15907- * Must be freed with iounmap.
15908- */
15909-
15910-void __iomem *ioremap_nocache (unsigned long phys_addr, unsigned long size)
15911-{
15912- unsigned long last_addr;
15913- void __iomem *p = __ioremap(phys_addr, size, _PAGE_PCD);
15914- if (!p)
15915- return p;
15916-
15917- /* Guaranteed to be > phys_addr, as per __ioremap() */
15918- last_addr = phys_addr + size - 1;
15919-
15920- if (is_local_lowmem(last_addr)) {
15921- struct page *ppage = virt_to_page(bus_to_virt(phys_addr));
15922- unsigned long npages;
15923-
15924- phys_addr &= PAGE_MASK;
15925-
15926- /* This might overflow and become zero.. */
15927- last_addr = PAGE_ALIGN(last_addr);
15928-
15929- /* .. but that's ok, because modulo-2**n arithmetic will make
15930- * the page-aligned "last - first" come out right.
15931- */
15932- npages = (last_addr - phys_addr) >> PAGE_SHIFT;
15933-
15934- if (change_page_attr(ppage, npages, PAGE_KERNEL_NOCACHE) < 0) {
15935- iounmap(p);
15936- p = NULL;
15937- }
15938- global_flush_tlb();
15939- }
15940-
15941- return p;
15942-}
15943-EXPORT_SYMBOL(ioremap_nocache);
15944-
15945-/**
15946- * iounmap - Free a IO remapping
15947- * @addr: virtual address from ioremap_*
15948- *
15949- * Caller must ensure there is only one unmapping for the same pointer.
15950- */
15951-void iounmap(volatile void __iomem *addr)
15952-{
15953- struct vm_struct *p, *o;
15954-
15955- if ((void __force *)addr <= high_memory)
15956- return;
15957-
15958- /*
15959- * __ioremap special-cases the PCI/ISA range by not instantiating a
15960- * vm_area and by simply returning an address into the kernel mapping
15961- * of ISA space. So handle that here.
15962- */
15963- if ((unsigned long) addr >= fix_to_virt(FIX_ISAMAP_BEGIN))
15964- return;
15965-
15966- addr = (volatile void __iomem *)(PAGE_MASK & (unsigned long __force)addr);
15967-
15968- /* Use the vm area unlocked, assuming the caller
15969- ensures there isn't another iounmap for the same address
15970- in parallel. Reuse of the virtual address is prevented by
15971- leaving it in the global lists until we're done with it.
15972- cpa takes care of the direct mappings. */
15973- read_lock(&vmlist_lock);
15974- for (p = vmlist; p; p = p->next) {
15975- if (p->addr == addr)
15976- break;
15977- }
15978- read_unlock(&vmlist_lock);
15979-
15980- if (!p) {
15981- printk("iounmap: bad address %p\n", addr);
15982- dump_stack();
15983- return;
15984- }
15985-
15986- /* Reset the direct mapping. Can block */
15987- if ((p->flags >> 20) && is_local_lowmem(p->phys_addr)) {
15988- change_page_attr(virt_to_page(bus_to_virt(p->phys_addr)),
15989- get_vm_area_size(p) >> PAGE_SHIFT,
15990- PAGE_KERNEL);
15991- global_flush_tlb();
15992- }
15993-
15994- /* Finally remove it */
15995- o = remove_vm_area((void *)addr);
15996- BUG_ON(p != o || o == NULL);
15997- kfree(p);
15998-}
15999-EXPORT_SYMBOL(iounmap);
16000-
16001-void __init *bt_ioremap(unsigned long phys_addr, unsigned long size)
16002-{
16003- unsigned long offset, last_addr;
16004- unsigned int nrpages;
16005- enum fixed_addresses idx;
16006-
16007- /* Don't allow wraparound or zero size */
16008- last_addr = phys_addr + size - 1;
16009- if (!size || last_addr < phys_addr)
16010- return NULL;
16011-
16012- /*
16013- * Don't remap the low PCI/ISA area, it's always mapped..
16014- */
16015- if (is_initial_xendomain() &&
16016- phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS)
16017- return isa_bus_to_virt(phys_addr);
16018-
16019- /*
16020- * Mappings have to be page-aligned
16021- */
16022- offset = phys_addr & ~PAGE_MASK;
16023- phys_addr &= PAGE_MASK;
16024- size = PAGE_ALIGN(last_addr) - phys_addr;
16025-
16026- /*
16027- * Mappings have to fit in the FIX_BTMAP area.
16028- */
16029- nrpages = size >> PAGE_SHIFT;
16030- if (nrpages > NR_FIX_BTMAPS)
16031- return NULL;
16032-
16033- /*
16034- * Ok, go for it..
16035- */
16036- idx = FIX_BTMAP_BEGIN;
16037- while (nrpages > 0) {
16038- set_fixmap(idx, phys_addr);
16039- phys_addr += PAGE_SIZE;
16040- --idx;
16041- --nrpages;
16042- }
16043- return (void*) (offset + fix_to_virt(FIX_BTMAP_BEGIN));
16044-}
16045-
16046-void __init bt_iounmap(void *addr, unsigned long size)
16047-{
16048- unsigned long virt_addr;
16049- unsigned long offset;
16050- unsigned int nrpages;
16051- enum fixed_addresses idx;
16052-
16053- virt_addr = (unsigned long)addr;
16054- if (virt_addr < fix_to_virt(FIX_BTMAP_BEGIN))
16055- return;
16056- if (virt_addr >= fix_to_virt(FIX_ISAMAP_BEGIN))
16057- return;
16058- offset = virt_addr & ~PAGE_MASK;
16059- nrpages = PAGE_ALIGN(offset + size - 1) >> PAGE_SHIFT;
16060-
16061- idx = FIX_BTMAP_BEGIN;
16062- while (nrpages > 0) {
16063- clear_fixmap(idx);
16064- --idx;
16065- --nrpages;
16066- }
16067-}
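
Both the new ioremap-xen.c and the 32-bit file deleted above build hypervisor page-table updates the same way in __direct_remap_pfn_range(): entries are accumulated in one page-sized buffer, the batch is flushed with a single HYPERVISOR_mmu_update() call whenever it fills, and whatever remains is flushed once more at the end. The userspace sketch below shows only that batch-and-flush shape; the buffer size, element type and the printing "flush" are illustrative stand-ins.

/* Userspace sketch of the batch-and-flush pattern used by
 * __direct_remap_pfn_range(): fill a fixed-size buffer, flush when full,
 * then flush the final partial batch. */
#include <stdio.h>

#define BATCH_ENTRIES 8      /* stands in for PAGE_SIZE / sizeof(mmu_update_t) */

struct update { unsigned long ptr, val; };

static void flush(const struct update *batch, int n)
{
	(void)batch;             /* a real flush would hand the array to the hypervisor */
	printf("flushing %d update(s)\n", n);
}

int main(void)
{
	struct update u[BATCH_ENTRIES];
	struct update *v = u;            /* next free slot, as in the patch */
	int total = 19;                  /* pretend number of pages to map  */

	for (int i = 0; i < total; i++) {
		if (v - u == BATCH_ENTRIES) {        /* batch full: flush and restart */
			flush(u, (int)(v - u));
			v = u;
		}
		v->ptr = i;                          /* stand-ins for the real fields */
		v->val = i;
		v++;
	}
	if (v != u)                              /* final partial batch */
		flush(u, (int)(v - u));
	return 0;
}

Batching keeps the number of hypercalls proportional to the number of full buffers rather than the number of pages, which is the point of collecting the updates before flushing.
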
16068--- /dev/null 1970-01-01 00:00:00.000000000 +0000
16069+++ sle11-2009-06-29/arch/x86/mm/pageattr-xen.c 2009-03-16 16:37:14.000000000 +0100
16070@@ -0,0 +1,1413 @@
16071+/*
16072+ * Copyright 2002 Andi Kleen, SuSE Labs.
16073+ * Thanks to Ben LaHaise for precious feedback.
16074+ */
16075+#include <linux/highmem.h>
16076+#include <linux/bootmem.h>
16077+#include <linux/module.h>
16078+#include <linux/sched.h>
16079+#include <linux/slab.h>
16080+#include <linux/mm.h>
16081+#include <linux/interrupt.h>
16082+
16083+#include <asm/e820.h>
16084+#include <asm/processor.h>
16085+#include <asm/tlbflush.h>
16086+#include <asm/sections.h>
16087+#include <asm/uaccess.h>
16088+#include <asm/pgalloc.h>
16089+#include <asm/proto.h>
16090+#include <asm/mmu_context.h>
16091+
16092+#ifndef CONFIG_X86_64
16093+#define TASK_SIZE64 TASK_SIZE
16094+#endif
16095+
16096+static void _pin_lock(struct mm_struct *mm, int lock) {
16097+ if (lock)
16098+ spin_lock(&mm->page_table_lock);
16099+#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
16100+ /* While mm->page_table_lock protects us against insertions and
16101+ * removals of higher level page table pages, it doesn't protect
16102+ * against updates of pte-s. Such updates, however, require the
16103+ * pte pages to be in consistent state (unpinned+writable or
16104+ * pinned+readonly). The pinning and attribute changes, however
16105+ * cannot be done atomically, which is why such updates must be
16106+ * prevented from happening concurrently.
16107+ * Note that no pte lock can ever elsewhere be acquired nesting
16108+ * with an already acquired one in the same mm, or with the mm's
16109+ * page_table_lock already acquired, as that would break in the
16110+ * non-split case (where all these are actually resolving to the
16111+ * one page_table_lock). Thus acquiring all of them here is not
16112+ * going to result in dead locks, and the order of acquires
16113+ * doesn't matter.
16114+ */
16115+ {
16116+ pgd_t *pgd = mm->pgd;
16117+ unsigned g;
16118+
16119+ for (g = 0; g <= ((TASK_SIZE64-1) / PGDIR_SIZE); g++, pgd++) {
16120+ pud_t *pud;
16121+ unsigned u;
16122+
16123+ if (pgd_none(*pgd))
16124+ continue;
16125+ pud = pud_offset(pgd, 0);
16126+ for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
16127+ pmd_t *pmd;
16128+ unsigned m;
16129+
16130+ if (pud_none(*pud))
16131+ continue;
16132+ pmd = pmd_offset(pud, 0);
16133+ for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
16134+ spinlock_t *ptl;
16135+
16136+ if (pmd_none(*pmd))
16137+ continue;
16138+ ptl = pte_lockptr(0, pmd);
16139+ if (lock)
16140+ spin_lock(ptl);
16141+ else
16142+ spin_unlock(ptl);
16143+ }
16144+ }
16145+ }
16146+ }
16147+#endif
16148+ if (!lock)
16149+ spin_unlock(&mm->page_table_lock);
16150+}
16151+#define pin_lock(mm) _pin_lock(mm, 1)
16152+#define pin_unlock(mm) _pin_lock(mm, 0)
16153+
16154+#define PIN_BATCH sizeof(void *)
16155+static DEFINE_PER_CPU(multicall_entry_t[PIN_BATCH], pb_mcl);
16156+
16157+static inline unsigned int pgd_walk_set_prot(struct page *page, pgprot_t flags,
16158+ unsigned int cpu, unsigned int seq)
16159+{
16160+ unsigned long pfn = page_to_pfn(page);
16161+
16162+ if (PageHighMem(page)) {
16163+ if (pgprot_val(flags) & _PAGE_RW)
16164+ ClearPagePinned(page);
16165+ else
16166+ SetPagePinned(page);
16167+ } else {
16168+ MULTI_update_va_mapping(per_cpu(pb_mcl, cpu) + seq,
16169+ (unsigned long)__va(pfn << PAGE_SHIFT),
16170+ pfn_pte(pfn, flags), 0);
16171+ if (unlikely(++seq == PIN_BATCH)) {
16172+ if (unlikely(HYPERVISOR_multicall_check(per_cpu(pb_mcl, cpu),
16173+ PIN_BATCH, NULL)))
16174+ BUG();
16175+ seq = 0;
16176+ }
16177+ }
16178+
16179+ return seq;
16180+}
16181+
16182+static void pgd_walk(pgd_t *pgd_base, pgprot_t flags)
16183+{
16184+ pgd_t *pgd = pgd_base;
16185+ pud_t *pud;
16186+ pmd_t *pmd;
16187+ int g,u,m;
16188+ unsigned int cpu, seq;
16189+ multicall_entry_t *mcl;
16190+
16191+ if (xen_feature(XENFEAT_auto_translated_physmap))
16192+ return;
16193+
16194+ cpu = get_cpu();
16195+
16196+ /*
16197+ * Cannot iterate up to USER_PTRS_PER_PGD on x86-64 as these pagetables
16198+ * may not be the 'current' task's pagetables (e.g., current may be
16199+ * 32-bit, but the pagetables may be for a 64-bit task).
16200+ * Subtracting 1 from TASK_SIZE64 means the loop limit is correct
16201+ * regardless of whether TASK_SIZE64 is a multiple of PGDIR_SIZE.
16202+ */
16203+ for (g = 0, seq = 0; g <= ((TASK_SIZE64-1) / PGDIR_SIZE); g++, pgd++) {
16204+ if (pgd_none(*pgd))
16205+ continue;
16206+ pud = pud_offset(pgd, 0);
16207+ if (PTRS_PER_PUD > 1) /* not folded */
16208+ seq = pgd_walk_set_prot(virt_to_page(pud),flags,cpu,seq);
16209+ for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
16210+ if (pud_none(*pud))
16211+ continue;
16212+ pmd = pmd_offset(pud, 0);
16213+ if (PTRS_PER_PMD > 1) /* not folded */
16214+ seq = pgd_walk_set_prot(virt_to_page(pmd),flags,cpu,seq);
16215+ for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
16216+ if (pmd_none(*pmd))
16217+ continue;
16218+ seq = pgd_walk_set_prot(pmd_page(*pmd),flags,cpu,seq);
16219+ }
16220+ }
16221+ }
16222+
16223+ mcl = per_cpu(pb_mcl, cpu);
16224+#ifdef CONFIG_X86_64
16225+ if (unlikely(seq > PIN_BATCH - 2)) {
16226+ if (unlikely(HYPERVISOR_multicall_check(mcl, seq, NULL)))
16227+ BUG();
16228+ seq = 0;
16229+ }
16230+ MULTI_update_va_mapping(mcl + seq,
16231+ (unsigned long)__user_pgd(pgd_base),
16232+ pfn_pte(virt_to_phys(__user_pgd(pgd_base))>>PAGE_SHIFT, flags),
16233+ 0);
16234+ MULTI_update_va_mapping(mcl + seq + 1,
16235+ (unsigned long)pgd_base,
16236+ pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags),
16237+ UVMF_TLB_FLUSH);
16238+ if (unlikely(HYPERVISOR_multicall_check(mcl, seq + 2, NULL)))
16239+ BUG();
16240+#else
16241+ if (likely(seq != 0)) {
16242+ MULTI_update_va_mapping(per_cpu(pb_mcl, cpu) + seq,
16243+ (unsigned long)pgd_base,
16244+ pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags),
16245+ UVMF_TLB_FLUSH);
16246+ if (unlikely(HYPERVISOR_multicall_check(per_cpu(pb_mcl, cpu),
16247+ seq + 1, NULL)))
16248+ BUG();
16249+ } else if(HYPERVISOR_update_va_mapping((unsigned long)pgd_base,
16250+ pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags),
16251+ UVMF_TLB_FLUSH))
16252+ BUG();
16253+#endif
16254+
16255+ put_cpu();
16256+}
16257+
16258+static void __pgd_pin(pgd_t *pgd)
16259+{
16260+ pgd_walk(pgd, PAGE_KERNEL_RO);
16261+ kmap_flush_unused();
16262+ xen_pgd_pin(__pa(pgd)); /* kernel */
16263+#ifdef CONFIG_X86_64
16264+ xen_pgd_pin(__pa(__user_pgd(pgd))); /* user */
16265+#endif
16266+ SetPagePinned(virt_to_page(pgd));
16267+}
16268+
16269+static void __pgd_unpin(pgd_t *pgd)
16270+{
16271+ xen_pgd_unpin(__pa(pgd));
16272+#ifdef CONFIG_X86_64
16273+ xen_pgd_unpin(__pa(__user_pgd(pgd)));
16274+#endif
16275+ pgd_walk(pgd, PAGE_KERNEL);
16276+ ClearPagePinned(virt_to_page(pgd));
16277+}
16278+
16279+void pgd_test_and_unpin(pgd_t *pgd)
16280+{
16281+ if (PagePinned(virt_to_page(pgd)))
16282+ __pgd_unpin(pgd);
16283+}
16284+
16285+void mm_pin(struct mm_struct *mm)
16286+{
16287+ if (xen_feature(XENFEAT_writable_page_tables))
16288+ return;
16289+
16290+ pin_lock(mm);
16291+ __pgd_pin(mm->pgd);
16292+ pin_unlock(mm);
16293+}
16294+
16295+void mm_unpin(struct mm_struct *mm)
16296+{
16297+ if (xen_feature(XENFEAT_writable_page_tables))
16298+ return;
16299+
16300+ pin_lock(mm);
16301+ __pgd_unpin(mm->pgd);
16302+ pin_unlock(mm);
16303+}
16304+
16305+void mm_pin_all(void)
16306+{
16307+ struct page *page;
16308+ unsigned long flags;
16309+
16310+ if (xen_feature(XENFEAT_writable_page_tables))
16311+ return;
16312+
16313+ /*
16314+ * Allow uninterrupted access to the pgd_list. Also protects
16315+ * __pgd_pin() by disabling preemption.
16316+ * All other CPUs must be at a safe point (e.g., in stop_machine
16317+ * or offlined entirely).
16318+ */
16319+ spin_lock_irqsave(&pgd_lock, flags);
16320+ list_for_each_entry(page, &pgd_list, lru) {
16321+ if (!PagePinned(page))
16322+ __pgd_pin((pgd_t *)page_address(page));
16323+ }
16324+ spin_unlock_irqrestore(&pgd_lock, flags);
16325+}
16326+
16327+void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
16328+{
16329+ if (!PagePinned(virt_to_page(mm->pgd)))
16330+ mm_pin(mm);
16331+}
16332+
16333+void arch_exit_mmap(struct mm_struct *mm)
16334+{
16335+ struct task_struct *tsk = current;
16336+
16337+ task_lock(tsk);
16338+
16339+ /*
16340+ * We aggressively remove defunct pgd from cr3. We execute unmap_vmas()
16341+	 * *much* faster this way, as avoiding TLB flushes means bigger wrpt batches.
16342+ */
16343+ if (tsk->active_mm == mm) {
16344+ tsk->active_mm = &init_mm;
16345+ atomic_inc(&init_mm.mm_count);
16346+
16347+ switch_mm(mm, &init_mm, tsk);
16348+
16349+ atomic_dec(&mm->mm_count);
16350+ BUG_ON(atomic_read(&mm->mm_count) == 0);
16351+ }
16352+
16353+ task_unlock(tsk);
16354+
16355+ if (PagePinned(virt_to_page(mm->pgd))
16356+ && atomic_read(&mm->mm_count) == 1
16357+ && !mm->context.has_foreign_mappings)
16358+ mm_unpin(mm);
16359+}
16360+
16361+static void _pte_free(struct page *page, unsigned int order)
16362+{
16363+ BUG_ON(order);
16364+ __pte_free(page);
16365+}
16366+
16367+pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
16368+{
16369+ struct page *pte;
16370+
16371+#ifdef CONFIG_HIGHPTE
16372+ pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0);
16373+#else
16374+ pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
16375+#endif
16376+ if (pte) {
16377+ pgtable_page_ctor(pte);
16378+ SetPageForeign(pte, _pte_free);
16379+ init_page_count(pte);
16380+ }
16381+ return pte;
16382+}
16383+
16384+void __pte_free(pgtable_t pte)
16385+{
16386+ if (!PageHighMem(pte)) {
16387+ unsigned long va = (unsigned long)page_address(pte);
16388+ unsigned int level;
16389+ pte_t *ptep = lookup_address(va, &level);
16390+
16391+ BUG_ON(!ptep || level != PG_LEVEL_4K || !pte_present(*ptep));
16392+ if (!pte_write(*ptep)
16393+ && HYPERVISOR_update_va_mapping(va,
16394+ mk_pte(pte, PAGE_KERNEL),
16395+ 0))
16396+ BUG();
16397+ } else
16398+#ifdef CONFIG_HIGHPTE
16399+ ClearPagePinned(pte);
16400+#else
16401+ BUG();
16402+#endif
16403+
16404+ ClearPageForeign(pte);
16405+ init_page_count(pte);
16406+ pgtable_page_dtor(pte);
16407+ __free_page(pte);
16408+}
16409+
16410+#if PAGETABLE_LEVELS >= 3
16411+static void _pmd_free(struct page *page, unsigned int order)
16412+{
16413+ BUG_ON(order);
16414+ __pmd_free(page);
16415+}
16416+
16417+pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long address)
16418+{
16419+ struct page *pmd;
16420+
16421+ pmd = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
16422+ if (!pmd)
16423+ return NULL;
16424+ SetPageForeign(pmd, _pmd_free);
16425+ init_page_count(pmd);
16426+ return page_address(pmd);
16427+}
16428+
16429+void __pmd_free(pgtable_t pmd)
16430+{
16431+ unsigned long va = (unsigned long)page_address(pmd);
16432+ unsigned int level;
16433+ pte_t *ptep = lookup_address(va, &level);
16434+
16435+ BUG_ON(!ptep || level != PG_LEVEL_4K || !pte_present(*ptep));
16436+ if (!pte_write(*ptep)
16437+ && HYPERVISOR_update_va_mapping(va, mk_pte(pmd, PAGE_KERNEL), 0))
16438+ BUG();
16439+
16440+ ClearPageForeign(pmd);
16441+ init_page_count(pmd);
16442+ __free_page(pmd);
16443+}
16444+#endif
16445+
16446+/* blktap and gntdev need this, as otherwise they would implicitly (and
16447+ * needlessly, as they never use it) reference init_mm. */
16448+pte_t xen_ptep_get_and_clear_full(struct vm_area_struct *vma,
16449+ unsigned long addr, pte_t *ptep, int full)
16450+{
16451+ return ptep_get_and_clear_full(vma->vm_mm, addr, ptep, full);
16452+}
16453+EXPORT_SYMBOL_GPL(xen_ptep_get_and_clear_full);
16454+
16455+/*
16456+ * The current flushing context - we pass it instead of 5 arguments:
16457+ */
16458+struct cpa_data {
16459+ unsigned long vaddr;
16460+ pgprot_t mask_set;
16461+ pgprot_t mask_clr;
16462+ int numpages;
16463+ int flushtlb;
16464+ unsigned long pfn;
16465+};
16466+
16467+#ifdef CONFIG_X86_64
16468+
16469+static inline unsigned long highmap_start_pfn(void)
16470+{
16471+ return __pa(_text) >> PAGE_SHIFT;
16472+}
16473+
16474+static inline unsigned long highmap_end_pfn(void)
16475+{
16476+ return __pa(round_up((unsigned long)_end, PMD_SIZE)) >> PAGE_SHIFT;
16477+}
16478+
16479+#endif
16480+
16481+#ifdef CONFIG_DEBUG_PAGEALLOC
16482+# define debug_pagealloc 1
16483+#else
16484+# define debug_pagealloc 0
16485+#endif
16486+
16487+static inline int
16488+within(unsigned long addr, unsigned long start, unsigned long end)
16489+{
16490+ return addr >= start && addr < end;
16491+}
16492+
16493+/*
16494+ * Flushing functions
16495+ */
16496+
16497+/**
16498+ * clflush_cache_range - flush a cache range with clflush
16499+ * @addr: virtual start address
16500+ * @size: number of bytes to flush
16501+ *
16502+ * clflush is an unordered instruction which needs fencing with mfence
16503+ * to avoid ordering issues.
16504+ */
16505+void clflush_cache_range(void *vaddr, unsigned int size)
16506+{
16507+ void *vend = vaddr + size - 1;
16508+
16509+ mb();
16510+
16511+ for (; vaddr < vend; vaddr += boot_cpu_data.x86_clflush_size)
16512+ clflush(vaddr);
16513+ /*
16514+ * Flush any possible final partial cacheline:
16515+ */
16516+ clflush(vend);
16517+
16518+ mb();
16519+}
16520+
16521+static void __cpa_flush_all(void *arg)
16522+{
16523+ unsigned long cache = (unsigned long)arg;
16524+
16525+ /*
16526+ * Flush all to work around Errata in early athlons regarding
16527+ * large page flushing.
16528+ */
16529+ __flush_tlb_all();
16530+
16531+ if (cache && boot_cpu_data.x86_model >= 4)
16532+ wbinvd();
16533+}
16534+
16535+static void cpa_flush_all(unsigned long cache)
16536+{
16537+ BUG_ON(irqs_disabled());
16538+
16539+ on_each_cpu(__cpa_flush_all, (void *) cache, 1, 1);
16540+}
16541+
16542+static void __cpa_flush_range(void *arg)
16543+{
16544+ /*
16545+ * We could optimize that further and do individual per page
16546+ * tlb invalidates for a low number of pages. Caveat: we must
16547+ * flush the high aliases on 64bit as well.
16548+ */
16549+ __flush_tlb_all();
16550+}
16551+
16552+static void cpa_flush_range(unsigned long start, int numpages, int cache)
16553+{
16554+ unsigned int i, level;
16555+ unsigned long addr;
16556+
16557+ BUG_ON(irqs_disabled());
16558+ WARN_ON(PAGE_ALIGN(start) != start);
16559+
16560+ on_each_cpu(__cpa_flush_range, NULL, 1, 1);
16561+
16562+ if (!cache)
16563+ return;
16564+
16565+ /*
16566+ * We only need to flush on one CPU,
16567+ * clflush is a MESI-coherent instruction that
16568+ * will cause all other CPUs to flush the same
16569+ * cachelines:
16570+ */
16571+ for (i = 0, addr = start; i < numpages; i++, addr += PAGE_SIZE) {
16572+ pte_t *pte = lookup_address(addr, &level);
16573+
16574+ /*
16575+ * Only flush present addresses:
16576+ */
16577+ if (pte && (__pte_val(*pte) & _PAGE_PRESENT))
16578+ clflush_cache_range((void *) addr, PAGE_SIZE);
16579+ }
16580+}
16581+
16582+/*
16583+ * Certain areas of memory on x86 require very specific protection flags,
16584+ * for example the BIOS area or kernel text. Callers don't always get this
16585+ * right (again, ioremap() on BIOS memory is not uncommon) so this function
16586+ * checks and fixes these known static required protection bits.
16587+ */
16588+static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
16589+ unsigned long pfn)
16590+{
16591+ pgprot_t forbidden = __pgprot(0);
16592+
16593+#ifndef CONFIG_XEN
16594+ /*
16595+ * The BIOS area between 640k and 1Mb needs to be executable for
16596+ * PCI BIOS based config access (CONFIG_PCI_GOBIOS) support.
16597+ */
16598+ if (within(pfn, BIOS_BEGIN >> PAGE_SHIFT, BIOS_END >> PAGE_SHIFT))
16599+ pgprot_val(forbidden) |= _PAGE_NX;
16600+#endif
16601+
16602+ /*
16603+	 * The kernel text needs to be executable for obvious reasons.
16604+	 * This does not cover __inittext, since that is gone later on. On
16605+	 * 64-bit we do not enforce !NX on the low mapping.
16606+ */
16607+ if (within(address, (unsigned long)_text, (unsigned long)_etext))
16608+ pgprot_val(forbidden) |= _PAGE_NX;
16609+
16610+ /*
16611+ * The .rodata section needs to be read-only. Using the pfn
16612+ * catches all aliases.
16613+ */
16614+ if (within(pfn, __pa((unsigned long)__start_rodata) >> PAGE_SHIFT,
16615+ __pa((unsigned long)__end_rodata) >> PAGE_SHIFT))
16616+ pgprot_val(forbidden) |= _PAGE_RW;
16617+
16618+ prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden));
16619+
16620+ return prot;
16621+}
16622+
16623+/*
16624+ * Lookup the page table entry for a virtual address. Return a pointer
16625+ * to the entry and the level of the mapping.
16626+ *
16627+ * Note: We return pud and pmd either when the entry is marked large
16628+ * or when the present bit is not set. Otherwise we would return a
16629+	 * pointer to a nonexistent mapping.
16630+ */
16631+pte_t *lookup_address(unsigned long address, unsigned int *level)
16632+{
16633+ pgd_t *pgd = pgd_offset_k(address);
16634+ pud_t *pud;
16635+ pmd_t *pmd;
16636+
16637+ *level = PG_LEVEL_NONE;
16638+
16639+ if (pgd_none(*pgd))
16640+ return NULL;
16641+
16642+ pud = pud_offset(pgd, address);
16643+ if (pud_none(*pud))
16644+ return NULL;
16645+
16646+ *level = PG_LEVEL_1G;
16647+ if (pud_large(*pud) || !pud_present(*pud))
16648+ return (pte_t *)pud;
16649+
16650+ pmd = pmd_offset(pud, address);
16651+ if (pmd_none(*pmd))
16652+ return NULL;
16653+
16654+ *level = PG_LEVEL_2M;
16655+ if (pmd_large(*pmd) || !pmd_present(*pmd))
16656+ return (pte_t *)pmd;
16657+
16658+ *level = PG_LEVEL_4K;
16659+
16660+ return pte_offset_kernel(pmd, address);
16661+}
16662+
16663+/*
16664+ * Set the new pmd in all the pgds we know about:
16665+ */
16666+static void __set_pmd_pte(pte_t *kpte, unsigned long address,
16667+ unsigned int level, pte_t pte)
16668+{
16669+ /* change init_mm */
16670+ switch(level) {
16671+ case PG_LEVEL_2M:
16672+ xen_l2_entry_update((pmd_t *)kpte, __pmd_ma(__pte_val(pte)));
16673+ break;
16674+#ifdef CONFIG_X86_64
16675+ case PG_LEVEL_1G:
16676+ xen_l3_entry_update((pud_t *)kpte, __pud_ma(__pte_val(pte)));
16677+ break;
16678+#endif
16679+ default:
16680+ BUG();
16681+ }
16682+#ifdef CONFIG_X86_32
16683+ if (!SHARED_KERNEL_PMD) {
16684+ struct page *page;
16685+
16686+ list_for_each_entry(page, &pgd_list, lru) {
16687+ pgd_t *pgd;
16688+ pud_t *pud;
16689+ pmd_t *pmd;
16690+
16691+ pgd = (pgd_t *)page_address(page) + pgd_index(address);
16692+ pud = pud_offset(pgd, address);
16693+ pmd = pmd_offset(pud, address);
16694+ xen_l2_entry_update(pmd, __pmd_ma(__pte_val(pte)));
16695+ }
16696+ }
16697+#endif
16698+}
16699+
16700+static int
16701+try_preserve_large_page(pte_t *kpte, unsigned long address,
16702+ struct cpa_data *cpa)
16703+{
16704+ unsigned long nextpage_addr, numpages, pmask, psize, flags, addr, pfn;
16705+ pte_t new_pte, old_pte, *tmp;
16706+ pgprot_t old_prot, new_prot;
16707+ int i, do_split = 1;
16708+ unsigned int level;
16709+
16710+ spin_lock_irqsave(&pgd_lock, flags);
16711+ /*
16712+ * Check for races, another CPU might have split this page
16713+ * up already:
16714+ */
16715+ tmp = lookup_address(address, &level);
16716+ if (tmp != kpte)
16717+ goto out_unlock;
16718+
16719+ switch (level) {
16720+ case PG_LEVEL_2M:
16721+ psize = PMD_PAGE_SIZE;
16722+ pmask = PMD_PAGE_MASK;
16723+ break;
16724+#ifdef CONFIG_X86_64
16725+ case PG_LEVEL_1G:
16726+ psize = PUD_PAGE_SIZE;
16727+ pmask = PUD_PAGE_MASK;
16728+ break;
16729+#endif
16730+ default:
16731+ do_split = -EINVAL;
16732+ goto out_unlock;
16733+ }
16734+
16735+ /*
16736+ * Calculate the number of pages, which fit into this large
16737+ * page starting at address:
16738+ */
16739+ nextpage_addr = (address + psize) & pmask;
16740+ numpages = (nextpage_addr - address) >> PAGE_SHIFT;
16741+ if (numpages < cpa->numpages)
16742+ cpa->numpages = numpages;
16743+
16744+ /*
16745+ * We are safe now. Check whether the new pgprot is the same:
16746+ */
16747+ old_pte = *kpte;
16748+ old_prot = new_prot = pte_pgprot(old_pte);
16749+
16750+ pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr);
16751+ pgprot_val(new_prot) |= pgprot_val(cpa->mask_set);
16752+
16753+ /*
16754+ * old_pte points to the large page base address. So we need
16755+ * to add the offset of the virtual address:
16756+ */
16757+ pfn = pte_pfn(old_pte) + ((address & (psize - 1)) >> PAGE_SHIFT);
16758+ cpa->pfn = pfn;
16759+
16760+ new_prot = static_protections(new_prot, address, pfn);
16761+
16762+ /*
16763+ * We need to check the full range, whether
16764+	 * static_protections() requires a different pgprot for one of
16765+ * the pages in the range we try to preserve:
16766+ */
16767+ if (pfn < max_mapnr) {
16768+ addr = address + PAGE_SIZE;
16769+ for (i = 1; i < cpa->numpages && ++pfn < max_mapnr;
16770+ i++, addr += PAGE_SIZE) {
16771+ pgprot_t chk_prot = static_protections(new_prot, addr, pfn);
16772+
16773+ if (pgprot_val(chk_prot) != pgprot_val(new_prot))
16774+ goto out_unlock;
16775+ }
16776+ }
16777+
16778+ /*
16779+	 * If there are no changes, return. cpa->numpages has been updated
16780+ * above:
16781+ */
16782+ if (pgprot_val(new_prot) == pgprot_val(old_prot)) {
16783+ do_split = 0;
16784+ goto out_unlock;
16785+ }
16786+
16787+ /*
16788+ * We need to change the attributes. Check, whether we can
16789+ * change the large page in one go. We request a split, when
16790+ * the address is not aligned and the number of pages is
16791+ * smaller than the number of pages in the large page. Note
16792+ * that we limited the number of possible pages already to
16793+ * the number of pages in the large page.
16794+ */
16795+ if (address == (nextpage_addr - psize) && cpa->numpages == numpages) {
16796+ /*
16797+ * The address is aligned and the number of pages
16798+ * covers the full page.
16799+ */
16800+ new_pte = pfn_pte_ma(__pte_mfn(old_pte), canon_pgprot(new_prot));
16801+ __set_pmd_pte(kpte, address, level, new_pte);
16802+ cpa->flushtlb = 1;
16803+ do_split = 0;
16804+ }
16805+
16806+out_unlock:
16807+ spin_unlock_irqrestore(&pgd_lock, flags);
16808+
16809+ return do_split;
16810+}
16811+
16812+static LIST_HEAD(page_pool);
16813+static unsigned long pool_size, pool_pages, pool_low;
16814+static unsigned long pool_used, pool_failed;
16815+
16816+static void cpa_fill_pool(struct page **ret)
16817+{
16818+ gfp_t gfp = GFP_KERNEL;
16819+ unsigned long flags;
16820+ struct page *p;
16821+
16822+ /*
16823+ * Avoid recursion (on debug-pagealloc) and also signal
16824+ * our priority to get to these pagetables:
16825+ */
16826+ if (current->flags & PF_MEMALLOC)
16827+ return;
16828+ current->flags |= PF_MEMALLOC;
16829+
16830+ /*
16831+ * Allocate atomically from atomic contexts:
16832+ */
16833+ if (in_atomic() || irqs_disabled() || debug_pagealloc)
16834+ gfp = GFP_ATOMIC | __GFP_NORETRY | __GFP_NOWARN;
16835+
16836+ while (pool_pages < pool_size || (ret && !*ret)) {
16837+ p = alloc_pages(gfp, 0);
16838+ if (!p) {
16839+ pool_failed++;
16840+ break;
16841+ }
16842+ /*
16843+ * If the call site needs a page right now, provide it:
16844+ */
16845+ if (ret && !*ret) {
16846+ *ret = p;
16847+ continue;
16848+ }
16849+ spin_lock_irqsave(&pgd_lock, flags);
16850+ list_add(&p->lru, &page_pool);
16851+ pool_pages++;
16852+ spin_unlock_irqrestore(&pgd_lock, flags);
16853+ }
16854+
16855+ current->flags &= ~PF_MEMALLOC;
16856+}
16857+
16858+#define SHIFT_MB (20 - PAGE_SHIFT)
16859+#define ROUND_MB_GB ((1 << 10) - 1)
16860+#define SHIFT_MB_GB 10
16861+#define POOL_PAGES_PER_GB 16
16862+
16863+void __init cpa_init(void)
16864+{
16865+ struct sysinfo si;
16866+ unsigned long gb;
16867+
16868+ si_meminfo(&si);
16869+ /*
16870+ * Calculate the number of pool pages:
16871+ *
16872+ * Convert totalram (nr of pages) to MiB and round to the next
16873+	 * GiB. Shift MiB to GiB and multiply the result by
16874+ * POOL_PAGES_PER_GB:
16875+ */
16876+ if (debug_pagealloc) {
16877+ gb = ((si.totalram >> SHIFT_MB) + ROUND_MB_GB) >> SHIFT_MB_GB;
16878+ pool_size = POOL_PAGES_PER_GB * gb;
16879+ } else {
16880+ pool_size = 1;
16881+ }
16882+ pool_low = pool_size;
16883+
16884+ cpa_fill_pool(NULL);
16885+ printk(KERN_DEBUG
16886+ "CPA: page pool initialized %lu of %lu pages preallocated\n",
16887+ pool_pages, pool_size);
16888+}
16889+
16890+static int split_large_page(pte_t *kpte, unsigned long address)
16891+{
16892+ unsigned long flags, mfn, mfninc = 1;
16893+ unsigned int i, level;
16894+ pte_t *pbase, *tmp;
16895+ pgprot_t ref_prot;
16896+ struct page *base;
16897+
16898+ /*
16899+ * Get a page from the pool. The pool list is protected by the
16900+ * pgd_lock, which we have to take anyway for the split
16901+ * operation:
16902+ */
16903+ spin_lock_irqsave(&pgd_lock, flags);
16904+ if (list_empty(&page_pool)) {
16905+ spin_unlock_irqrestore(&pgd_lock, flags);
16906+ base = NULL;
16907+ cpa_fill_pool(&base);
16908+ if (!base)
16909+ return -ENOMEM;
16910+ spin_lock_irqsave(&pgd_lock, flags);
16911+ } else {
16912+ base = list_first_entry(&page_pool, struct page, lru);
16913+ list_del(&base->lru);
16914+ pool_pages--;
16915+
16916+ if (pool_pages < pool_low)
16917+ pool_low = pool_pages;
16918+ }
16919+
16920+ /*
16921+ * Check for races, another CPU might have split this page
16922+ * up for us already:
16923+ */
16924+ tmp = lookup_address(address, &level);
16925+ if (tmp != kpte)
16926+ goto out_unlock;
16927+
16928+ pbase = (pte_t *)page_address(base);
16929+#ifdef CONFIG_X86_32
16930+ paravirt_alloc_pt(&init_mm, page_to_pfn(base));
16931+#endif
16932+ ref_prot = pte_pgprot(pte_clrhuge(*kpte));
16933+
16934+#ifdef CONFIG_X86_64
16935+ if (level == PG_LEVEL_1G) {
16936+ mfninc = PMD_PAGE_SIZE >> PAGE_SHIFT;
16937+ pgprot_val(ref_prot) |= _PAGE_PSE;
16938+ }
16939+#endif
16940+
16941+ /*
16942+ * Get the target mfn from the original entry:
16943+ */
16944+ mfn = __pte_mfn(*kpte);
16945+ for (i = 0; i < PTRS_PER_PTE; i++, mfn += mfninc)
16946+ set_pte(&pbase[i], pfn_pte_ma(mfn, ref_prot));
16947+
16948+ /*
16949+ * Install the new, split up pagetable. Important details here:
16950+ *
16951+ * On Intel the NX bit of all levels must be cleared to make a
16952+	 * page executable (see section 4.13.2 of the Intel 64 and IA-32
16953+	 * Architectures Software Developer's Manual).
16954+ *
16955+ * Mark the entry present. The current mapping might be
16956+ * set to not present, which we preserved above.
16957+ */
16958+ if (!xen_feature(XENFEAT_writable_page_tables) &&
16959+ HYPERVISOR_update_va_mapping((unsigned long)pbase,
16960+ mk_pte(base, PAGE_KERNEL_RO), 0))
16961+ BUG();
16962+ ref_prot = pte_pgprot(pte_mkexec(pte_clrhuge(*kpte)));
16963+ pgprot_val(ref_prot) |= _PAGE_PRESENT;
16964+ __set_pmd_pte(kpte, address, level, mk_pte(base, ref_prot));
16965+ base = NULL;
16966+
16967+out_unlock:
16968+ /*
16969+ * If we dropped out via the lookup_address check under
16970+ * pgd_lock then stick the page back into the pool:
16971+ */
16972+ if (base) {
16973+ list_add(&base->lru, &page_pool);
16974+ pool_pages++;
16975+ } else
16976+ pool_used++;
16977+ spin_unlock_irqrestore(&pgd_lock, flags);
16978+
16979+ return 0;
16980+}
16981+
16982+static int __change_page_attr(struct cpa_data *cpa, int primary)
16983+{
16984+ unsigned long address = cpa->vaddr;
16985+ int do_split, err;
16986+ unsigned int level;
16987+ pte_t *kpte, old_pte;
16988+
16989+repeat:
16990+ kpte = lookup_address(address, &level);
16991+ if (!kpte)
16992+ return primary ? -EINVAL : 0;
16993+
16994+ old_pte = *kpte;
16995+ if (!__pte_val(old_pte)) {
16996+ if (!primary)
16997+ return 0;
16998+ printk(KERN_WARNING "CPA: called for zero pte. "
16999+ "vaddr = %lx cpa->vaddr = %lx\n", address,
17000+ cpa->vaddr);
17001+ WARN_ON(1);
17002+ return -EINVAL;
17003+ }
17004+
17005+ if (level == PG_LEVEL_4K) {
17006+ pte_t new_pte;
17007+ pgprot_t new_prot = pte_pgprot(old_pte);
17008+ unsigned long mfn = __pte_mfn(old_pte);
17009+
17010+ pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr);
17011+ pgprot_val(new_prot) |= pgprot_val(cpa->mask_set);
17012+
17013+ new_prot = static_protections(new_prot, address,
17014+ mfn_to_local_pfn(mfn));
17015+
17016+ /*
17017+ * We need to keep the mfn from the existing PTE,
17018+	 * after all we're only going to change its attributes,
17019+	 * not the memory it points to.
17020+ */
17021+ new_pte = pfn_pte_ma(mfn, canon_pgprot(new_prot));
17022+ cpa->pfn = mfn_to_local_pfn(mfn);
17023+ /*
17024+	 * Do we really change anything?
17025+ */
17026+ if (__pte_val(old_pte) != __pte_val(new_pte)) {
17027+ set_pte_atomic(kpte, new_pte);
17028+ cpa->flushtlb = 1;
17029+ }
17030+ cpa->numpages = 1;
17031+ return 0;
17032+ }
17033+
17034+ /*
17035+ * Check, whether we can keep the large page intact
17036+ * and just change the pte:
17037+ */
17038+ do_split = try_preserve_large_page(kpte, address, cpa);
17039+ /*
17040+ * When the range fits into the existing large page,
17041+	 * return. cpa->numpages and cpa->flushtlb have been updated in
17042+	 * try_preserve_large_page():
17043+ */
17044+ if (do_split <= 0)
17045+ return do_split;
17046+
17047+ /*
17048+ * We have to split the large page:
17049+ */
17050+ err = split_large_page(kpte, address);
17051+ if (!err) {
17052+ cpa->flushtlb = 1;
17053+ goto repeat;
17054+ }
17055+
17056+ return err;
17057+}
17058+
17059+static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias);
17060+
17061+static int cpa_process_alias(struct cpa_data *cpa)
17062+{
17063+ struct cpa_data alias_cpa;
17064+ int ret = 0;
17065+
17066+ if (cpa->pfn > max_pfn_mapped)
17067+ return 0;
17068+
17069+ /*
17070+ * No need to redo, when the primary call touched the direct
17071+ * mapping already:
17072+ */
17073+ if (!within(cpa->vaddr, PAGE_OFFSET,
17074+ PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))) {
17075+
17076+ alias_cpa = *cpa;
17077+ alias_cpa.vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT);
17078+
17079+ ret = __change_page_attr_set_clr(&alias_cpa, 0);
17080+ }
17081+
17082+#ifdef CONFIG_X86_64
17083+ if (ret)
17084+ return ret;
17085+ /*
17086+ * No need to redo, when the primary call touched the high
17087+ * mapping already:
17088+ */
17089+ if (within(cpa->vaddr, (unsigned long) _text, (unsigned long) _end))
17090+ return 0;
17091+
17092+ /*
17093+ * If the physical address is inside the kernel map, we need
17094+ * to touch the high mapped kernel as well:
17095+ */
17096+ if (!within(cpa->pfn, highmap_start_pfn(), highmap_end_pfn()))
17097+ return 0;
17098+
17099+ alias_cpa = *cpa;
17100+ alias_cpa.vaddr =
17101+ (cpa->pfn << PAGE_SHIFT) + __START_KERNEL_map;
17102+
17103+ /*
17104+ * The high mapping range is imprecise, so ignore the return value.
17105+ */
17106+ __change_page_attr_set_clr(&alias_cpa, 0);
17107+#endif
17108+ return ret;
17109+}
17110+
17111+static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias)
17112+{
17113+ int ret, numpages = cpa->numpages;
17114+
17115+ while (numpages) {
17116+ /*
17117+ * Store the remaining nr of pages for the large page
17118+ * preservation check.
17119+ */
17120+ cpa->numpages = numpages;
17121+
17122+ ret = __change_page_attr(cpa, checkalias);
17123+ if (ret)
17124+ return ret;
17125+
17126+ if (checkalias) {
17127+ ret = cpa_process_alias(cpa);
17128+ if (ret)
17129+ return ret;
17130+ }
17131+
17132+ /*
17133+ * Adjust the number of pages with the result of the
17134+ * CPA operation. Either a large page has been
17135+ * preserved or a single page update happened.
17136+ */
17137+ BUG_ON(cpa->numpages > numpages);
17138+ numpages -= cpa->numpages;
17139+ cpa->vaddr += cpa->numpages * PAGE_SIZE;
17140+ }
17141+ return 0;
17142+}
17143+
17144+static inline int cache_attr(pgprot_t attr)
17145+{
17146+ return pgprot_val(attr) &
17147+ (_PAGE_PAT | _PAGE_PAT_LARGE | _PAGE_PWT | _PAGE_PCD);
17148+}
17149+
17150+static int change_page_attr_set_clr(unsigned long addr, int numpages,
17151+ pgprot_t mask_set, pgprot_t mask_clr)
17152+{
17153+ struct cpa_data cpa;
17154+ int ret, cache, checkalias;
17155+
17156+ /*
17157+ * Check, if we are requested to change a not supported
17158+ * feature:
17159+ */
17160+ mask_set = canon_pgprot(mask_set);
17161+ mask_clr = canon_pgprot(mask_clr);
17162+ if (!pgprot_val(mask_set) && !pgprot_val(mask_clr))
17163+ return 0;
17164+
17165+ /* Ensure we are PAGE_SIZE aligned */
17166+ if (addr & ~PAGE_MASK) {
17167+ addr &= PAGE_MASK;
17168+ /*
17169+ * People should not be passing in unaligned addresses:
17170+ */
17171+ WARN_ON_ONCE(1);
17172+ }
17173+
17174+ cpa.vaddr = addr;
17175+ cpa.numpages = numpages;
17176+ cpa.mask_set = mask_set;
17177+ cpa.mask_clr = mask_clr;
17178+ cpa.flushtlb = 0;
17179+
17180+ /* No alias checking for _NX bit modifications */
17181+ checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX;
17182+
17183+ ret = __change_page_attr_set_clr(&cpa, checkalias);
17184+
17185+ /*
17186+ * Check whether we really changed something:
17187+ */
17188+ if (!cpa.flushtlb)
17189+ goto out;
17190+
17191+ /*
17192+ * No need to flush, when we did not set any of the caching
17193+ * attributes:
17194+ */
17195+ cache = cache_attr(mask_set);
17196+
17197+ /*
17198+	 * On success we use clflush, when the CPU supports it, to
17199+	 * avoid the wbinvd. If the CPU does not support it, or in the
17200+	 * error case, we fall back to cpa_flush_all() (which uses
17201+	 * wbinvd):
17202+ */
17203+ if (!ret && cpu_has_clflush)
17204+ cpa_flush_range(addr, numpages, cache);
17205+ else
17206+ cpa_flush_all(cache);
17207+
17208+out:
17209+ cpa_fill_pool(NULL);
17210+
17211+ return ret;
17212+}
17213+
17214+static inline int change_page_attr_set(unsigned long addr, int numpages,
17215+ pgprot_t mask)
17216+{
17217+ return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0));
17218+}
17219+
17220+static inline int change_page_attr_clear(unsigned long addr, int numpages,
17221+ pgprot_t mask)
17222+{
17223+ return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask);
17224+}
17225+
17226+int set_memory_uc(unsigned long addr, int numpages)
17227+{
17228+ return change_page_attr_set(addr, numpages,
17229+ __pgprot(_PAGE_PCD));
17230+}
17231+EXPORT_SYMBOL(set_memory_uc);
17232+
17233+int set_memory_wb(unsigned long addr, int numpages)
17234+{
17235+ return change_page_attr_clear(addr, numpages,
17236+ __pgprot(_PAGE_PCD | _PAGE_PWT));
17237+}
17238+EXPORT_SYMBOL(set_memory_wb);
17239+
17240+int set_memory_x(unsigned long addr, int numpages)
17241+{
17242+ return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_NX));
17243+}
17244+EXPORT_SYMBOL(set_memory_x);
17245+
17246+int set_memory_nx(unsigned long addr, int numpages)
17247+{
17248+ return change_page_attr_set(addr, numpages, __pgprot(_PAGE_NX));
17249+}
17250+EXPORT_SYMBOL(set_memory_nx);
17251+
17252+int set_memory_ro(unsigned long addr, int numpages)
17253+{
17254+ return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_RW));
17255+}
17256+
17257+int set_memory_rw(unsigned long addr, int numpages)
17258+{
17259+ return change_page_attr_set(addr, numpages, __pgprot(_PAGE_RW));
17260+}
17261+
17262+int set_memory_np(unsigned long addr, int numpages)
17263+{
17264+ return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_PRESENT));
17265+}
17266+
17267+int set_pages_uc(struct page *page, int numpages)
17268+{
17269+ unsigned long addr = (unsigned long)page_address(page);
17270+
17271+ return set_memory_uc(addr, numpages);
17272+}
17273+EXPORT_SYMBOL(set_pages_uc);
17274+
17275+int set_pages_wb(struct page *page, int numpages)
17276+{
17277+ unsigned long addr = (unsigned long)page_address(page);
17278+
17279+ return set_memory_wb(addr, numpages);
17280+}
17281+EXPORT_SYMBOL(set_pages_wb);
17282+
17283+int set_pages_x(struct page *page, int numpages)
17284+{
17285+ unsigned long addr = (unsigned long)page_address(page);
17286+
17287+ return set_memory_x(addr, numpages);
17288+}
17289+EXPORT_SYMBOL(set_pages_x);
17290+
17291+int set_pages_nx(struct page *page, int numpages)
17292+{
17293+ unsigned long addr = (unsigned long)page_address(page);
17294+
17295+ return set_memory_nx(addr, numpages);
17296+}
17297+EXPORT_SYMBOL(set_pages_nx);
17298+
17299+int set_pages_ro(struct page *page, int numpages)
17300+{
17301+ unsigned long addr = (unsigned long)page_address(page);
17302+
17303+ return set_memory_ro(addr, numpages);
17304+}
17305+
17306+int set_pages_rw(struct page *page, int numpages)
17307+{
17308+ unsigned long addr = (unsigned long)page_address(page);
17309+
17310+ return set_memory_rw(addr, numpages);
17311+}
17312+
17313+#ifdef CONFIG_DEBUG_PAGEALLOC
17314+
17315+static int __set_pages_p(struct page *page, int numpages)
17316+{
17317+ struct cpa_data cpa = { .vaddr = (unsigned long) page_address(page),
17318+ .numpages = numpages,
17319+ .mask_set = __pgprot(_PAGE_PRESENT | _PAGE_RW),
17320+ .mask_clr = __pgprot(0)};
17321+
17322+ return __change_page_attr_set_clr(&cpa, 1);
17323+}
17324+
17325+static int __set_pages_np(struct page *page, int numpages)
17326+{
17327+ struct cpa_data cpa = { .vaddr = (unsigned long) page_address(page),
17328+ .numpages = numpages,
17329+ .mask_set = __pgprot(0),
17330+ .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW)};
17331+
17332+ return __change_page_attr_set_clr(&cpa, 1);
17333+}
17334+
17335+void kernel_map_pages(struct page *page, int numpages, int enable)
17336+{
17337+ if (PageHighMem(page))
17338+ return;
17339+ if (!enable) {
17340+ debug_check_no_locks_freed(page_address(page),
17341+ numpages * PAGE_SIZE);
17342+ }
17343+
17344+ /*
17345+ * If page allocator is not up yet then do not call c_p_a():
17346+ */
17347+ if (!debug_pagealloc_enabled)
17348+ return;
17349+
17350+ /*
17351+ * The return value is ignored as the calls cannot fail.
17352+ * Large pages are kept enabled at boot time, and are
17353+ * split up quickly with DEBUG_PAGEALLOC. If a splitup
17354+ * fails here (due to temporary memory shortage) no damage
17355+ * is done because we just keep the largepage intact up
17356+ * to the next attempt when it will likely be split up:
17357+ */
17358+ if (enable)
17359+ __set_pages_p(page, numpages);
17360+ else
17361+ __set_pages_np(page, numpages);
17362+
17363+ /*
17364+ * We should perform an IPI and flush all tlbs,
17365+	 * but that can deadlock, so flush only the current CPU:
17366+ */
17367+ __flush_tlb_all();
17368+
17369+ /*
17370+ * Try to refill the page pool here. We can do this only after
17371+ * the tlb flush.
17372+ */
17373+ cpa_fill_pool(NULL);
17374+}
17375+
17376+#ifdef CONFIG_HIBERNATION
17377+
17378+bool kernel_page_present(struct page *page)
17379+{
17380+ unsigned int level;
17381+ pte_t *pte;
17382+
17383+ if (PageHighMem(page))
17384+ return false;
17385+
17386+ pte = lookup_address((unsigned long)page_address(page), &level);
17387+ return (__pte_val(*pte) & _PAGE_PRESENT);
17388+}
17389+
17390+#endif /* CONFIG_HIBERNATION */
17391+
17392+#endif /* CONFIG_DEBUG_PAGEALLOC */
17393+
17394+static inline int in_secondary_range(unsigned long va)
17395+{
17396+#ifdef CONFIG_X86_64
17397+ return va >= VMALLOC_START && va < VMALLOC_END;
17398+#else
17399+ return va >= (unsigned long)high_memory;
17400+#endif
17401+}
17402+
17403+static void __make_page_readonly(unsigned long va)
17404+{
17405+ pte_t *pte;
17406+ unsigned int level;
17407+
17408+ pte = lookup_address(va, &level);
17409+ BUG_ON(!pte || level != PG_LEVEL_4K);
17410+ if (HYPERVISOR_update_va_mapping(va, pte_wrprotect(*pte), 0))
17411+ BUG();
17412+ if (in_secondary_range(va)) {
17413+ unsigned long pfn = pte_pfn(*pte);
17414+
17415+#ifdef CONFIG_HIGHMEM
17416+ if (pfn >= highstart_pfn)
17417+ kmap_flush_unused(); /* flush stale writable kmaps */
17418+ else
17419+#endif
17420+ __make_page_readonly((unsigned long)__va(pfn << PAGE_SHIFT));
17421+ }
17422+}
17423+
17424+static void __make_page_writable(unsigned long va)
17425+{
17426+ pte_t *pte;
17427+ unsigned int level;
17428+
17429+ pte = lookup_address(va, &level);
17430+ BUG_ON(!pte || level != PG_LEVEL_4K);
17431+ if (HYPERVISOR_update_va_mapping(va, pte_mkwrite(*pte), 0))
17432+ BUG();
17433+ if (in_secondary_range(va)) {
17434+ unsigned long pfn = pte_pfn(*pte);
17435+
17436+#ifdef CONFIG_HIGHMEM
17437+ if (pfn < highstart_pfn)
17438+#endif
17439+ __make_page_writable((unsigned long)__va(pfn << PAGE_SHIFT));
17440+ }
17441+}
17442+
17443+void make_page_readonly(void *va, unsigned int feature)
17444+{
17445+ if (!xen_feature(feature))
17446+ __make_page_readonly((unsigned long)va);
17447+}
17448+
17449+void make_page_writable(void *va, unsigned int feature)
17450+{
17451+ if (!xen_feature(feature))
17452+ __make_page_writable((unsigned long)va);
17453+}
17454+
17455+void make_pages_readonly(void *va, unsigned int nr, unsigned int feature)
17456+{
17457+ unsigned long addr;
17458+
17459+ if (xen_feature(feature))
17460+ return;
17461+
17462+ for (addr = (unsigned long)va; nr--; addr += PAGE_SIZE)
17463+ __make_page_readonly(addr);
17464+}
17465+
17466+void make_pages_writable(void *va, unsigned int nr, unsigned int feature)
17467+{
17468+ unsigned long addr;
17469+
17470+ if (xen_feature(feature))
17471+ return;
17472+
17473+ for (addr = (unsigned long)va; nr--; addr += PAGE_SIZE)
17474+ __make_page_writable(addr);
17475+}
17476+
17477+/*
17478+ * The testcases use internal knowledge of the implementation that shouldn't
17479+ * be exposed to the rest of the kernel. Include these directly here.
17480+ */
17481+#ifdef CONFIG_CPA_DEBUG
17482+#include "pageattr-test.c"
17483+#endif
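
A minimal usage sketch (not part of the original patch) of the set_memory_*() API that the new pageattr-xen.c above provides: set_memory_uc()/set_memory_wb() and their (addr, numpages) signatures come from the code above, while the buffer, its order, and the <asm/cacheflush.h> header location are illustrative assumptions.

#include <linux/gfp.h>
#include <linux/mm.h>
#include <asm/cacheflush.h>	/* assumed location of the set_memory_*() prototypes */

#define EXAMPLE_ORDER	2			/* hypothetical: 2^2 = 4 pages */
#define EXAMPLE_PAGES	(1 << EXAMPLE_ORDER)

static unsigned long example_buf;

static int example_map_uncached(void)
{
	int ret;

	example_buf = __get_free_pages(GFP_KERNEL, EXAMPLE_ORDER);
	if (!example_buf)
		return -ENOMEM;

	/* Mark the linear-mapping pages uncached (sets _PAGE_PCD). */
	ret = set_memory_uc(example_buf, EXAMPLE_PAGES);
	if (ret)
		free_pages(example_buf, EXAMPLE_ORDER);
	return ret;
}

static void example_unmap_uncached(void)
{
	/* Restore write-back caching before handing the pages back. */
	set_memory_wb(example_buf, EXAMPLE_PAGES);
	free_pages(example_buf, EXAMPLE_ORDER);
}

Note that, per the definitions above, set_memory_uc() only sets _PAGE_PCD while set_memory_wb() clears both _PAGE_PCD and _PAGE_PWT; the caller is expected to restore write-back before freeing so a later user of the pages does not inherit an uncached mapping.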
17484--- sle11-2009-06-29.orig/arch/x86/mm/pageattr_64-xen.c 2009-02-16 16:18:36.000000000 +0100
17485+++ /dev/null 1970-01-01 00:00:00.000000000 +0000
17486@@ -1,542 +0,0 @@
17487-/*
17488- * Copyright 2002 Andi Kleen, SuSE Labs.
17489- * Thanks to Ben LaHaise for precious feedback.
17490- */
17491-
17492-#include <linux/mm.h>
17493-#include <linux/sched.h>
17494-#include <linux/highmem.h>
17495-#include <linux/module.h>
17496-#include <linux/slab.h>
17497-#include <asm/uaccess.h>
17498-#include <asm/processor.h>
17499-#include <asm/tlbflush.h>
17500-#include <asm/io.h>
17501-
17502-#ifdef CONFIG_XEN
17503-#include <asm/pgalloc.h>
17504-#include <asm/mmu_context.h>
17505-
17506-static void _pin_lock(struct mm_struct *mm, int lock) {
17507- if (lock)
17508- spin_lock(&mm->page_table_lock);
17509-#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
17510- /* While mm->page_table_lock protects us against insertions and
17511- * removals of higher level page table pages, it doesn't protect
17512- * against updates of pte-s. Such updates, however, require the
17513- * pte pages to be in consistent state (unpinned+writable or
17514- * pinned+readonly). The pinning and attribute changes, however
17515- * cannot be done atomically, which is why such updates must be
17516- * prevented from happening concurrently.
17517- * Note that no pte lock can ever elsewhere be acquired nesting
17518- * with an already acquired one in the same mm, or with the mm's
17519- * page_table_lock already acquired, as that would break in the
17520- * non-split case (where all these are actually resolving to the
17521- * one page_table_lock). Thus acquiring all of them here is not
17522- * going to result in dead locks, and the order of acquires
17523- * doesn't matter.
17524- */
17525- {
17526- pgd_t *pgd = mm->pgd;
17527- unsigned g;
17528-
17529- for (g = 0; g <= ((TASK_SIZE64-1) / PGDIR_SIZE); g++, pgd++) {
17530- pud_t *pud;
17531- unsigned u;
17532-
17533- if (pgd_none(*pgd))
17534- continue;
17535- pud = pud_offset(pgd, 0);
17536- for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
17537- pmd_t *pmd;
17538- unsigned m;
17539-
17540- if (pud_none(*pud))
17541- continue;
17542- pmd = pmd_offset(pud, 0);
17543- for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
17544- spinlock_t *ptl;
17545-
17546- if (pmd_none(*pmd))
17547- continue;
17548- ptl = pte_lockptr(0, pmd);
17549- if (lock)
17550- spin_lock(ptl);
17551- else
17552- spin_unlock(ptl);
17553- }
17554- }
17555- }
17556- }
17557-#endif
17558- if (!lock)
17559- spin_unlock(&mm->page_table_lock);
17560-}
17561-#define pin_lock(mm) _pin_lock(mm, 1)
17562-#define pin_unlock(mm) _pin_lock(mm, 0)
17563-
17564-#define PIN_BATCH 8
17565-static DEFINE_PER_CPU(multicall_entry_t[PIN_BATCH], pb_mcl);
17566-
17567-static inline unsigned int pgd_walk_set_prot(void *pt, pgprot_t flags,
17568- unsigned int cpu, unsigned int seq)
17569-{
17570- struct page *page = virt_to_page(pt);
17571- unsigned long pfn = page_to_pfn(page);
17572-
17573- MULTI_update_va_mapping(per_cpu(pb_mcl, cpu) + seq,
17574- (unsigned long)__va(pfn << PAGE_SHIFT),
17575- pfn_pte(pfn, flags), 0);
17576- if (unlikely(++seq == PIN_BATCH)) {
17577- if (unlikely(HYPERVISOR_multicall_check(per_cpu(pb_mcl, cpu),
17578- PIN_BATCH, NULL)))
17579- BUG();
17580- seq = 0;
17581- }
17582-
17583- return seq;
17584-}
17585-
17586-static void pgd_walk(pgd_t *pgd_base, pgprot_t flags)
17587-{
17588- pgd_t *pgd = pgd_base;
17589- pud_t *pud;
17590- pmd_t *pmd;
17591- pte_t *pte;
17592- int g,u,m;
17593- unsigned int cpu, seq;
17594- multicall_entry_t *mcl;
17595-
17596- cpu = get_cpu();
17597-
17598- /*
17599- * Cannot iterate up to USER_PTRS_PER_PGD as these pagetables may not
17600- * be the 'current' task's pagetables (e.g., current may be 32-bit,
17601- * but the pagetables may be for a 64-bit task).
17602- * Subtracting 1 from TASK_SIZE64 means the loop limit is correct
17603- * regardless of whether TASK_SIZE64 is a multiple of PGDIR_SIZE.
17604- */
17605- for (g = 0, seq = 0; g <= ((TASK_SIZE64-1) / PGDIR_SIZE); g++, pgd++) {
17606- if (pgd_none(*pgd))
17607- continue;
17608- pud = pud_offset(pgd, 0);
17609- if (PTRS_PER_PUD > 1) /* not folded */
17610- seq = pgd_walk_set_prot(pud,flags,cpu,seq);
17611- for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
17612- if (pud_none(*pud))
17613- continue;
17614- pmd = pmd_offset(pud, 0);
17615- if (PTRS_PER_PMD > 1) /* not folded */
17616- seq = pgd_walk_set_prot(pmd,flags,cpu,seq);
17617- for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
17618- if (pmd_none(*pmd))
17619- continue;
17620- pte = pte_offset_kernel(pmd,0);
17621- seq = pgd_walk_set_prot(pte,flags,cpu,seq);
17622- }
17623- }
17624- }
17625-
17626- mcl = per_cpu(pb_mcl, cpu);
17627- if (unlikely(seq > PIN_BATCH - 2)) {
17628- if (unlikely(HYPERVISOR_multicall_check(mcl, seq, NULL)))
17629- BUG();
17630- seq = 0;
17631- }
17632- MULTI_update_va_mapping(mcl + seq,
17633- (unsigned long)__user_pgd(pgd_base),
17634- pfn_pte(virt_to_phys(__user_pgd(pgd_base))>>PAGE_SHIFT, flags),
17635- 0);
17636- MULTI_update_va_mapping(mcl + seq + 1,
17637- (unsigned long)pgd_base,
17638- pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags),
17639- UVMF_TLB_FLUSH);
17640- if (unlikely(HYPERVISOR_multicall_check(mcl, seq + 2, NULL)))
17641- BUG();
17642-
17643- put_cpu();
17644-}
17645-
17646-static void __pgd_pin(pgd_t *pgd)
17647-{
17648- pgd_walk(pgd, PAGE_KERNEL_RO);
17649- xen_pgd_pin(__pa(pgd)); /* kernel */
17650- xen_pgd_pin(__pa(__user_pgd(pgd))); /* user */
17651- SetPagePinned(virt_to_page(pgd));
17652-}
17653-
17654-static void __pgd_unpin(pgd_t *pgd)
17655-{
17656- xen_pgd_unpin(__pa(pgd));
17657- xen_pgd_unpin(__pa(__user_pgd(pgd)));
17658- pgd_walk(pgd, PAGE_KERNEL);
17659- ClearPagePinned(virt_to_page(pgd));
17660-}
17661-
17662-void pgd_test_and_unpin(pgd_t *pgd)
17663-{
17664- if (PagePinned(virt_to_page(pgd)))
17665- __pgd_unpin(pgd);
17666-}
17667-
17668-void mm_pin(struct mm_struct *mm)
17669-{
17670- if (xen_feature(XENFEAT_writable_page_tables))
17671- return;
17672-
17673- pin_lock(mm);
17674- __pgd_pin(mm->pgd);
17675- pin_unlock(mm);
17676-}
17677-
17678-void mm_unpin(struct mm_struct *mm)
17679-{
17680- if (xen_feature(XENFEAT_writable_page_tables))
17681- return;
17682-
17683- pin_lock(mm);
17684- __pgd_unpin(mm->pgd);
17685- pin_unlock(mm);
17686-}
17687-
17688-void mm_pin_all(void)
17689-{
17690- struct page *page;
17691- unsigned long flags;
17692-
17693- if (xen_feature(XENFEAT_writable_page_tables))
17694- return;
17695-
17696- /*
17697- * Allow uninterrupted access to the pgd_list. Also protects
17698- * __pgd_pin() by disabling preemption.
17699- * All other CPUs must be at a safe point (e.g., in stop_machine
17700- * or offlined entirely).
17701- */
17702- spin_lock_irqsave(&pgd_lock, flags);
17703- list_for_each_entry(page, &pgd_list, lru) {
17704- if (!PagePinned(page))
17705- __pgd_pin((pgd_t *)page_address(page));
17706- }
17707- spin_unlock_irqrestore(&pgd_lock, flags);
17708-}
17709-
17710-void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
17711-{
17712- if (!PagePinned(virt_to_page(mm->pgd)))
17713- mm_pin(mm);
17714-}
17715-
17716-void arch_exit_mmap(struct mm_struct *mm)
17717-{
17718- struct task_struct *tsk = current;
17719-
17720- task_lock(tsk);
17721-
17722- /*
17723- * We aggressively remove defunct pgd from cr3. We execute unmap_vmas()
17724- * *much* faster this way, as no tlb flushes means bigger wrpt batches.
17725- */
17726- if (tsk->active_mm == mm) {
17727- tsk->active_mm = &init_mm;
17728- atomic_inc(&init_mm.mm_count);
17729-
17730- switch_mm(mm, &init_mm, tsk);
17731-
17732- atomic_dec(&mm->mm_count);
17733- BUG_ON(atomic_read(&mm->mm_count) == 0);
17734- }
17735-
17736- task_unlock(tsk);
17737-
17738- if (PagePinned(virt_to_page(mm->pgd))
17739- && (atomic_read(&mm->mm_count) == 1)
17740- && !mm->context.has_foreign_mappings)
17741- mm_unpin(mm);
17742-}
17743-
17744-static void _pte_free(struct page *page, unsigned int order)
17745-{
17746- BUG_ON(order);
17747- pte_free(page);
17748-}
17749-
17750-struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
17751-{
17752- struct page *pte;
17753-
17754- pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
17755- if (pte) {
17756- SetPageForeign(pte, _pte_free);
17757- init_page_count(pte);
17758- }
17759- return pte;
17760-}
17761-
17762-void pte_free(struct page *pte)
17763-{
17764- unsigned long va = (unsigned long)__va(page_to_pfn(pte)<<PAGE_SHIFT);
17765-
17766- if (!pte_write(*virt_to_ptep(va)))
17767- if (HYPERVISOR_update_va_mapping(
17768- va, pfn_pte(page_to_pfn(pte), PAGE_KERNEL), 0))
17769- BUG();
17770-
17771- ClearPageForeign(pte);
17772- init_page_count(pte);
17773-
17774- __free_page(pte);
17775-}
17776-#endif /* CONFIG_XEN */
17777-
17778-pte_t *lookup_address(unsigned long address)
17779-{
17780- pgd_t *pgd = pgd_offset_k(address);
17781- pud_t *pud;
17782- pmd_t *pmd;
17783- pte_t *pte;
17784- if (pgd_none(*pgd))
17785- return NULL;
17786- pud = pud_offset(pgd, address);
17787- if (!pud_present(*pud))
17788- return NULL;
17789- pmd = pmd_offset(pud, address);
17790- if (!pmd_present(*pmd))
17791- return NULL;
17792- if (pmd_large(*pmd))
17793- return (pte_t *)pmd;
17794- pte = pte_offset_kernel(pmd, address);
17795- if (pte && !pte_present(*pte))
17796- pte = NULL;
17797- return pte;
17798-}
17799-
17800-static struct page *split_large_page(unsigned long address, pgprot_t prot,
17801- pgprot_t ref_prot)
17802-{
17803- int i;
17804- unsigned long addr;
17805- struct page *base = alloc_pages(GFP_KERNEL, 0);
17806- pte_t *pbase;
17807- if (!base)
17808- return NULL;
17809- /*
17810- * page_private is used to track the number of entries in
17811- * the page table page have non standard attributes.
17812- */
17813- SetPagePrivate(base);
17814- page_private(base) = 0;
17815-
17816- address = __pa(address);
17817- addr = address & LARGE_PAGE_MASK;
17818- pbase = (pte_t *)page_address(base);
17819- for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) {
17820- pbase[i] = pfn_pte(addr >> PAGE_SHIFT,
17821- addr == address ? prot : ref_prot);
17822- }
17823- return base;
17824-}
17825-
17826-void clflush_cache_range(void *adr, int size)
17827-{
17828- int i;
17829- for (i = 0; i < size; i += boot_cpu_data.x86_clflush_size)
17830- clflush(adr+i);
17831-}
17832-
17833-static void flush_kernel_map(void *arg)
17834-{
17835- struct list_head *l = (struct list_head *)arg;
17836- struct page *pg;
17837-
17838- /* When clflush is available always use it because it is
17839- much cheaper than WBINVD. */
17840- /* clflush is still broken. Disable for now. */
17841- if (1 || !cpu_has_clflush)
17842- asm volatile("wbinvd" ::: "memory");
17843- else list_for_each_entry(pg, l, lru) {
17844- void *adr = page_address(pg);
17845- clflush_cache_range(adr, PAGE_SIZE);
17846- }
17847- __flush_tlb_all();
17848-}
17849-
17850-static inline void flush_map(struct list_head *l)
17851-{
17852- on_each_cpu(flush_kernel_map, l, 1, 1);
17853-}
17854-
17855-static LIST_HEAD(deferred_pages); /* protected by init_mm.mmap_sem */
17856-
17857-static inline void save_page(struct page *fpage)
17858-{
17859- if (!test_and_set_bit(PG_arch_1, &fpage->flags))
17860- list_add(&fpage->lru, &deferred_pages);
17861-}
17862-
17863-/*
17864- * No more special protections in this 2/4MB area - revert to a
17865- * large page again.
17866- */
17867-static void revert_page(unsigned long address, pgprot_t ref_prot)
17868-{
17869- pgd_t *pgd;
17870- pud_t *pud;
17871- pmd_t *pmd;
17872- pte_t large_pte;
17873- unsigned long pfn;
17874-
17875- pgd = pgd_offset_k(address);
17876- BUG_ON(pgd_none(*pgd));
17877- pud = pud_offset(pgd,address);
17878- BUG_ON(pud_none(*pud));
17879- pmd = pmd_offset(pud, address);
17880- BUG_ON(__pmd_val(*pmd) & _PAGE_PSE);
17881- pfn = (__pa(address) & LARGE_PAGE_MASK) >> PAGE_SHIFT;
17882- large_pte = pfn_pte(pfn, ref_prot);
17883- large_pte = pte_mkhuge(large_pte);
17884- set_pte((pte_t *)pmd, large_pte);
17885-}
17886-
17887-static int
17888-__change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot,
17889- pgprot_t ref_prot)
17890-{
17891- pte_t *kpte;
17892- struct page *kpte_page;
17893- pgprot_t ref_prot2;
17894-
17895- kpte = lookup_address(address);
17896- if (!kpte) return 0;
17897- kpte_page = virt_to_page(((unsigned long)kpte) & PAGE_MASK);
17898- BUG_ON(PageLRU(kpte_page));
17899- BUG_ON(PageCompound(kpte_page));
17900- if (pgprot_val(prot) != pgprot_val(ref_prot)) {
17901- if (!pte_huge(*kpte)) {
17902- set_pte(kpte, pfn_pte(pfn, prot));
17903- } else {
17904- /*
17905- * split_large_page will take the reference for this
17906- * change_page_attr on the split page.
17907- */
17908- struct page *split;
17909- ref_prot2 = pte_pgprot(pte_clrhuge(*kpte));
17910- split = split_large_page(address, prot, ref_prot2);
17911- if (!split)
17912- return -ENOMEM;
17913- pgprot_val(ref_prot2) &= ~_PAGE_NX;
17914- set_pte(kpte, mk_pte(split, ref_prot2));
17915- kpte_page = split;
17916- }
17917- page_private(kpte_page)++;
17918- } else if (!pte_huge(*kpte)) {
17919- set_pte(kpte, pfn_pte(pfn, ref_prot));
17920- BUG_ON(page_private(kpte_page) == 0);
17921- page_private(kpte_page)--;
17922- } else
17923- BUG();
17924-
17925- /* on x86-64 the direct mapping set at boot is not using 4k pages */
17926- /*
17927- * ..., but the XEN guest kernels (currently) do:
17928- * If the pte was reserved, it means it was created at boot
17929- * time (not via split_large_page) and in turn we must not
17930- * replace it with a large page.
17931- */
17932-#ifndef CONFIG_XEN
17933- BUG_ON(PageReserved(kpte_page));
17934-#else
17935- if (PageReserved(kpte_page))
17936- return 0;
17937-#endif
17938-
17939- save_page(kpte_page);
17940- if (page_private(kpte_page) == 0)
17941- revert_page(address, ref_prot);
17942- return 0;
17943-}
17944-
17945-/*
17946- * Change the page attributes of an page in the linear mapping.
17947- *
17948- * This should be used when a page is mapped with a different caching policy
17949- * than write-back somewhere - some CPUs do not like it when mappings with
17950- * different caching policies exist. This changes the page attributes of the
17951- * in kernel linear mapping too.
17952- *
17953- * The caller needs to ensure that there are no conflicting mappings elsewhere.
17954- * This function only deals with the kernel linear map.
17955- *
17956- * Caller must call global_flush_tlb() after this.
17957- */
17958-int change_page_attr_addr(unsigned long address, int numpages, pgprot_t prot)
17959-{
17960- int err = 0, kernel_map = 0;
17961- int i;
17962-
17963- if (address >= __START_KERNEL_map
17964- && address < __START_KERNEL_map + KERNEL_TEXT_SIZE) {
17965- address = (unsigned long)__va(__pa(address));
17966- kernel_map = 1;
17967- }
17968-
17969- down_write(&init_mm.mmap_sem);
17970- for (i = 0; i < numpages; i++, address += PAGE_SIZE) {
17971- unsigned long pfn = __pa(address) >> PAGE_SHIFT;
17972-
17973- if (!kernel_map || pte_present(pfn_pte(0, prot))) {
17974- err = __change_page_attr(address, pfn, prot, PAGE_KERNEL);
17975- if (err)
17976- break;
17977- }
17978- /* Handle kernel mapping too which aliases part of the
17979- * lowmem */
17980- if (__pa(address) < KERNEL_TEXT_SIZE) {
17981- unsigned long addr2;
17982- pgprot_t prot2;
17983- addr2 = __START_KERNEL_map + __pa(address);
17984- /* Make sure the kernel mappings stay executable */
17985- prot2 = pte_pgprot(pte_mkexec(pfn_pte(0, prot)));
17986- err = __change_page_attr(addr2, pfn, prot2,
17987- PAGE_KERNEL_EXEC);
17988- }
17989- }
17990- up_write(&init_mm.mmap_sem);
17991- return err;
17992-}
17993-
17994-/* Don't call this for MMIO areas that may not have a mem_map entry */
17995-int change_page_attr(struct page *page, int numpages, pgprot_t prot)
17996-{
17997- unsigned long addr = (unsigned long)page_address(page);
17998- return change_page_attr_addr(addr, numpages, prot);
17999-}
18000-
18001-void global_flush_tlb(void)
18002-{
18003- struct page *pg, *next;
18004- struct list_head l;
18005-
18006- /*
18007- * Write-protect the semaphore, to exclude two contexts
18008- * doing a list_replace_init() call in parallel and to
18009- * exclude new additions to the deferred_pages list:
18010- */
18011- down_write(&init_mm.mmap_sem);
18012- list_replace_init(&deferred_pages, &l);
18013- up_write(&init_mm.mmap_sem);
18014-
18015- flush_map(&l);
18016-
18017- list_for_each_entry_safe(pg, next, &l, lru) {
18018- list_del(&pg->lru);
18019- clear_bit(PG_arch_1, &pg->flags);
18020- if (page_private(pg) != 0)
18021- continue;
18022- ClearPagePrivate(pg);
18023- __free_page(pg);
18024- }
18025-}
18026-
18027-EXPORT_SYMBOL(change_page_attr);
18028-EXPORT_SYMBOL(global_flush_tlb);
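
For contrast, a sketch (not part of the original patch) of how callers used the interface that the deleted pageattr_64-xen.c exported: change_page_attr(page, numpages, prot) and global_flush_tlb() are the two EXPORT_SYMBOLs above, while the page allocation, the PAGE_KERNEL_NOCACHE pgprot and the header location are illustrative assumptions.

#include <linux/gfp.h>
#include <linux/mm.h>
#include <asm/cacheflush.h>	/* assumed location of the old change_page_attr() prototype */

static struct page *example_pg;

static int example_old_cpa(void)
{
	int err;

	example_pg = alloc_page(GFP_KERNEL);
	if (!example_pg)
		return -ENOMEM;

	/* Old interface: change the attribute, then flush explicitly. */
	err = change_page_attr(example_pg, 1, PAGE_KERNEL_NOCACHE);
	global_flush_tlb();
	if (err)
		__free_page(example_pg);
	return err;
}

static void example_old_cpa_done(void)
{
	/* Revert to the normal cacheable kernel mapping and flush again. */
	change_page_attr(example_pg, 1, PAGE_KERNEL);
	global_flush_tlb();
	__free_page(example_pg);
}

The deferred-flush design above (save_page() onto deferred_pages, flushed later by global_flush_tlb()) is what the new pageattr-xen.c replaces with per-call flushing inside change_page_attr_set_clr().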
18029--- sle11-2009-06-29.orig/arch/x86/mm/pgtable_32-xen.c 2009-02-16 16:18:36.000000000 +0100
18030+++ sle11-2009-06-29/arch/x86/mm/pgtable_32-xen.c 2009-03-16 16:33:40.000000000 +0100
18031@@ -29,8 +29,6 @@
18032 #include <xen/features.h>
18033 #include <asm/hypervisor.h>
18034
18035-static void pgd_test_and_unpin(pgd_t *pgd);
18036-
18037 void show_mem(void)
18038 {
18039 int total = 0, reserved = 0;
18040@@ -167,53 +165,6 @@ pte_t *pte_alloc_one_kernel(struct mm_st
18041 return pte;
18042 }
18043
18044-static void _pte_free(struct page *page, unsigned int order)
18045-{
18046- BUG_ON(order);
18047- pte_free(page);
18048-}
18049-
18050-struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
18051-{
18052- struct page *pte;
18053-
18054-#ifdef CONFIG_HIGHPTE
18055- pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0);
18056-#else
18057- pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
18058-#endif
18059- if (pte) {
18060- SetPageForeign(pte, _pte_free);
18061- init_page_count(pte);
18062- }
18063- return pte;
18064-}
18065-
18066-void pte_free(struct page *pte)
18067-{
18068- unsigned long pfn = page_to_pfn(pte);
18069-
18070- if (!PageHighMem(pte)) {
18071- unsigned long va = (unsigned long)__va(pfn << PAGE_SHIFT);
18072-
18073- if (!pte_write(*virt_to_ptep(va)))
18074- if (HYPERVISOR_update_va_mapping(
18075- va, pfn_pte(pfn, PAGE_KERNEL), 0))
18076- BUG();
18077- } else
18078- ClearPagePinned(pte);
18079-
18080- ClearPageForeign(pte);
18081- init_page_count(pte);
18082-
18083- __free_page(pte);
18084-}
18085-
18086-void pmd_ctor(struct kmem_cache *cache, void *pmd)
18087-{
18088- memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t));
18089-}
18090-
18091 /*
18092 * List of all pgd's needed for non-PAE so it can invalidate entries
18093 * in both cached and uncached pgd's; not needed for PAE since the
18094@@ -224,224 +175,191 @@ void pmd_ctor(struct kmem_cache *cache,
18095 * vmalloc faults work because attached pagetables are never freed.
18096 * -- wli
18097 */
18098-DEFINE_SPINLOCK(pgd_lock);
18099-struct page *pgd_list;
18100-
18101 static inline void pgd_list_add(pgd_t *pgd)
18102 {
18103 struct page *page = virt_to_page(pgd);
18104- page->index = (unsigned long)pgd_list;
18105- if (pgd_list)
18106- set_page_private(pgd_list, (unsigned long)&page->index);
18107- pgd_list = page;
18108- set_page_private(page, (unsigned long)&pgd_list);
18109+
18110+ list_add(&page->lru, &pgd_list);
18111 }
18112
18113 static inline void pgd_list_del(pgd_t *pgd)
18114 {
18115- struct page *next, **pprev, *page = virt_to_page(pgd);
18116- next = (struct page *)page->index;
18117- pprev = (struct page **)page_private(page);
18118- *pprev = next;
18119- if (next)
18120- set_page_private(next, (unsigned long)pprev);
18121-}
18122+ struct page *page = virt_to_page(pgd);
18123
18124+ list_del(&page->lru);
18125+}
18126
18127+#define UNSHARED_PTRS_PER_PGD \
18128+ (SHARED_KERNEL_PMD ? USER_PTRS_PER_PGD : PTRS_PER_PGD)
18129
18130-#if (PTRS_PER_PMD == 1)
18131-/* Non-PAE pgd constructor */
18132-static void pgd_ctor(void *pgd)
18133+static void pgd_ctor(void *p)
18134 {
18135+ pgd_t *pgd = p;
18136 unsigned long flags;
18137
18138- /* !PAE, no pagetable sharing */
18139+ pgd_test_and_unpin(pgd);
18140+
18141+ /* Clear usermode parts of PGD */
18142 memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
18143
18144 spin_lock_irqsave(&pgd_lock, flags);
18145
18146- /* must happen under lock */
18147- clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD,
18148- swapper_pg_dir + USER_PTRS_PER_PGD,
18149- KERNEL_PGD_PTRS);
18150-
18151- paravirt_alloc_pd_clone(__pa(pgd) >> PAGE_SHIFT,
18152- __pa(swapper_pg_dir) >> PAGE_SHIFT,
18153- USER_PTRS_PER_PGD,
18154- KERNEL_PGD_PTRS);
18155- pgd_list_add(pgd);
18156- spin_unlock_irqrestore(&pgd_lock, flags);
18157-}
18158-#else /* PTRS_PER_PMD > 1 */
18159-/* PAE pgd constructor */
18160-static void pgd_ctor(void *pgd)
18161-{
18162- /* PAE, kernel PMD may be shared */
18163-
18164- if (SHARED_KERNEL_PMD) {
18165- clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD,
18166+ /* If the pgd points to a shared pagetable level (either the
18167+ ptes in non-PAE, or shared PMD in PAE), then just copy the
18168+ references from swapper_pg_dir. */
18169+ if (PAGETABLE_LEVELS == 2 ||
18170+ (PAGETABLE_LEVELS == 3 && SHARED_KERNEL_PMD)) {
18171+ clone_pgd_range(pgd + USER_PTRS_PER_PGD,
18172 swapper_pg_dir + USER_PTRS_PER_PGD,
18173 KERNEL_PGD_PTRS);
18174- } else {
18175- memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
18176+ paravirt_alloc_pd_clone(__pa(pgd) >> PAGE_SHIFT,
18177+ __pa(swapper_pg_dir) >> PAGE_SHIFT,
18178+ USER_PTRS_PER_PGD,
18179+ KERNEL_PGD_PTRS);
18180 }
18181+
18182+ /* list required to sync kernel mapping updates */
18183+ if (PAGETABLE_LEVELS == 2)
18184+ pgd_list_add(pgd);
18185+
18186+ spin_unlock_irqrestore(&pgd_lock, flags);
18187 }
18188-#endif /* PTRS_PER_PMD */
18189
18190 static void pgd_dtor(void *pgd)
18191 {
18192 unsigned long flags; /* can be called from interrupt context */
18193
18194- if (SHARED_KERNEL_PMD)
18195- return;
18196-
18197- paravirt_release_pd(__pa(pgd) >> PAGE_SHIFT);
18198- spin_lock_irqsave(&pgd_lock, flags);
18199- pgd_list_del(pgd);
18200- spin_unlock_irqrestore(&pgd_lock, flags);
18201+ if (!SHARED_KERNEL_PMD) {
18202+ spin_lock_irqsave(&pgd_lock, flags);
18203+ pgd_list_del(pgd);
18204+ spin_unlock_irqrestore(&pgd_lock, flags);
18205+ }
18206
18207 pgd_test_and_unpin(pgd);
18208 }
18209
18210-#define UNSHARED_PTRS_PER_PGD \
18211- (SHARED_KERNEL_PMD ? USER_PTRS_PER_PGD : PTRS_PER_PGD)
18212-
18213-/* If we allocate a pmd for part of the kernel address space, then
18214- make sure its initialized with the appropriate kernel mappings.
18215- Otherwise use a cached zeroed pmd. */
18216-static pmd_t *pmd_cache_alloc(int idx)
18217+#ifdef CONFIG_X86_PAE
18218+/*
18219+ * Mop up any pmd pages which may still be attached to the pgd.
18220+ * Normally they will be freed by munmap/exit_mmap, but any pmd we
18221+ * preallocate which never got a corresponding vma will need to be
18222+ * freed manually.
18223+ */
18224+static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
18225 {
18226- pmd_t *pmd;
18227+ int i;
18228
18229- if (idx >= USER_PTRS_PER_PGD) {
18230- pmd = (pmd_t *)__get_free_page(GFP_KERNEL);
18231+ for(i = 0; i < UNSHARED_PTRS_PER_PGD; i++) {
18232+ pgd_t pgd = pgdp[i];
18233
18234-#ifndef CONFIG_XEN
18235- if (pmd)
18236- memcpy(pmd,
18237- (void *)pgd_page_vaddr(swapper_pg_dir[idx]),
18238- sizeof(pmd_t) * PTRS_PER_PMD);
18239-#endif
18240- } else
18241- pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL);
18242+ if (__pgd_val(pgd) != 0) {
18243+ pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd);
18244
18245- return pmd;
18246-}
18247+ pgdp[i] = xen_make_pgd(0);
18248
18249-static void pmd_cache_free(pmd_t *pmd, int idx)
18250-{
18251- if (idx >= USER_PTRS_PER_PGD) {
18252- make_lowmem_page_writable(pmd, XENFEAT_writable_page_tables);
18253- memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t));
18254- free_page((unsigned long)pmd);
18255- } else
18256- kmem_cache_free(pmd_cache, pmd);
18257+ paravirt_release_pd(pgd_val(pgd) >> PAGE_SHIFT);
18258+ pmd_free(mm, pmd);
18259+ }
18260+ }
18261 }
18262
18263-pgd_t *pgd_alloc(struct mm_struct *mm)
18264+/*
18265+ * In PAE mode, we need to do a cr3 reload (=tlb flush) when
18266+ * updating the top-level pagetable entries to guarantee the
18267+ * processor notices the update. Since this is expensive, and
18268+ * all 4 top-level entries are used almost immediately in a
18269+ * new process's life, we just pre-populate them here.
18270+ *
18271+ * Also, if we're in a paravirt environment where the kernel pmd is
18272+ * not shared between pagetables (!SHARED_KERNEL_PMDS), we allocate
18273+ * and initialize the kernel pmds here.
18274+ */
18275+static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
18276 {
18277+ pud_t *pud;
18278+ pmd_t *pmds[UNSHARED_PTRS_PER_PGD];
18279+ unsigned long addr, flags;
18280 int i;
18281- pgd_t *pgd = quicklist_alloc(0, GFP_KERNEL, pgd_ctor);
18282- pmd_t **pmds = NULL;
18283- unsigned long flags;
18284-
18285- pgd_test_and_unpin(pgd);
18286-
18287- if (PTRS_PER_PMD == 1 || !pgd)
18288- return pgd;
18289-
18290-#ifdef CONFIG_XEN
18291- if (!SHARED_KERNEL_PMD) {
18292- /*
18293- * We can race save/restore (if we sleep during a GFP_KERNEL memory
18294- * allocation). We therefore store virtual addresses of pmds as they
18295- * do not change across save/restore, and poke the machine addresses
18296- * into the pgdir under the pgd_lock.
18297- */
18298- pmds = kmalloc(PTRS_PER_PGD * sizeof(pmd_t *), GFP_KERNEL);
18299- if (!pmds) {
18300- quicklist_free(0, pgd_dtor, pgd);
18301- return NULL;
18302- }
18303- }
18304-#endif
18305
18306- /* Allocate pmds, remember virtual addresses. */
18307- for (i = 0; i < UNSHARED_PTRS_PER_PGD; ++i) {
18308- pmd_t *pmd = pmd_cache_alloc(i);
18309-
18310- if (!pmd)
18311+ /*
18312+ * We can race save/restore (if we sleep during a GFP_KERNEL memory
18313+ * allocation). We therefore store virtual addresses of pmds as they
18314+ * do not change across save/restore, and poke the machine addresses
18315+ * into the pgdir under the pgd_lock.
18316+ */
18317+ for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD; i++, addr += PUD_SIZE) {
18318+ pmds[i] = pmd_alloc_one(mm, addr);
18319+ if (!pmds[i])
18320 goto out_oom;
18321-
18322- paravirt_alloc_pd(__pa(pmd) >> PAGE_SHIFT);
18323- if (pmds)
18324- pmds[i] = pmd;
18325- else
18326- set_pgd(&pgd[i], __pgd(1 + __pa(pmd)));
18327 }
18328
18329-#ifdef CONFIG_XEN
18330- if (SHARED_KERNEL_PMD)
18331- return pgd;
18332-
18333 spin_lock_irqsave(&pgd_lock, flags);
18334
18335 /* Protect against save/restore: move below 4GB under pgd_lock. */
18336- if (!xen_feature(XENFEAT_pae_pgdir_above_4gb)) {
18337- int rc = xen_create_contiguous_region(
18338- (unsigned long)pgd, 0, 32);
18339- if (rc) {
18340- spin_unlock_irqrestore(&pgd_lock, flags);
18341- goto out_oom;
18342- }
18343+ if (!xen_feature(XENFEAT_pae_pgdir_above_4gb)
18344+ && xen_create_contiguous_region((unsigned long)pgd, 0, 32)) {
18345+ spin_unlock_irqrestore(&pgd_lock, flags);
18346+out_oom:
18347+ while (i--)
18348+ pmd_free(mm, pmds[i]);
18349+ return 0;
18350 }
18351
18352 /* Copy kernel pmd contents and write-protect the new pmds. */
18353- for (i = USER_PTRS_PER_PGD; i < PTRS_PER_PGD; i++) {
18354- memcpy(pmds[i],
18355- (void *)pgd_page_vaddr(swapper_pg_dir[i]),
18356- sizeof(pmd_t) * PTRS_PER_PMD);
18357- make_lowmem_page_readonly(
18358- pmds[i], XENFEAT_writable_page_tables);
18359- }
18360+ pud = pud_offset(pgd, 0);
18361+ for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD;
18362+ i++, pud++, addr += PUD_SIZE) {
18363+ if (i >= USER_PTRS_PER_PGD) {
18364+ memcpy(pmds[i],
18365+ (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]),
18366+ sizeof(pmd_t) * PTRS_PER_PMD);
18367+ make_lowmem_page_readonly(
18368+ pmds[i], XENFEAT_writable_page_tables);
18369+ }
18370
18371- /* It is safe to poke machine addresses of pmds under the pmd_lock. */
18372- for (i = 0; i < PTRS_PER_PGD; i++)
18373- set_pgd(&pgd[i], __pgd(1 + __pa(pmds[i])));
18374+ /* It is safe to poke machine addresses of pmds under the pgd_lock. */
18375+ pud_populate(mm, pud, pmds[i]);
18376+ }
18377
18378- /* Ensure this pgd gets picked up and pinned on save/restore. */
18379+ /* List required to sync kernel mapping updates and
18380+ * to pin/unpin on save/restore. */
18381 pgd_list_add(pgd);
18382
18383 spin_unlock_irqrestore(&pgd_lock, flags);
18384
18385- kfree(pmds);
18386-#endif
18387+ return 1;
18388+}
18389+#else /* !CONFIG_X86_PAE */
18390+/* No need to prepopulate any pagetable entries in non-PAE modes. */
18391+static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
18392+{
18393+ return 1;
18394+}
18395
18396- return pgd;
18397+static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
18398+{
18399+}
18400+#endif /* CONFIG_X86_PAE */
18401
18402-out_oom:
18403- if (!pmds) {
18404- for (i--; i >= 0; i--) {
18405- pgd_t pgdent = pgd[i];
18406- void* pmd = (void *)__va(pgd_val(pgdent)-1);
18407- paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT);
18408- pmd_cache_free(pmd, i);
18409- }
18410- } else {
18411- for (i--; i >= 0; i--) {
18412- paravirt_release_pd(__pa(pmds[i]) >> PAGE_SHIFT);
18413- pmd_cache_free(pmds[i], i);
18414- }
18415- kfree(pmds);
18416+pgd_t *pgd_alloc(struct mm_struct *mm)
18417+{
18418+ pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
18419+
18420+ /* so that alloc_pd can use it */
18421+ mm->pgd = pgd;
18422+ if (pgd)
18423+ pgd_ctor(pgd);
18424+
18425+ if (pgd && !pgd_prepopulate_pmd(mm, pgd)) {
18426+ free_page((unsigned long)pgd);
18427+ pgd = NULL;
18428 }
18429- quicklist_free(0, pgd_dtor, pgd);
18430- return NULL;
18431+
18432+ return pgd;
18433 }
18434
18435-void pgd_free(pgd_t *pgd)
18436+void pgd_free(struct mm_struct *mm, pgd_t *pgd)
18437 {
18438- int i;
18439-
18440 /*
18441 * After this the pgd should not be pinned for the duration of this
18442 * function's execution. We should never sleep and thus never race:
18443@@ -450,39 +368,43 @@ void pgd_free(pgd_t *pgd)
18444 * 2. The machine addresses in PGD entries will not become invalid
18445 * due to a concurrent save/restore.
18446 */
18447- pgd_test_and_unpin(pgd);
18448+ pgd_dtor(pgd);
18449
18450- /* in the PAE case user pgd entries are overwritten before usage */
18451- if (PTRS_PER_PMD > 1) {
18452- for (i = 0; i < UNSHARED_PTRS_PER_PGD; ++i) {
18453- pgd_t pgdent = pgd[i];
18454- void* pmd = (void *)__va(pgd_val(pgdent)-1);
18455- paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT);
18456- pmd_cache_free(pmd, i);
18457- }
18458+ if (PTRS_PER_PMD > 1 && !xen_feature(XENFEAT_pae_pgdir_above_4gb))
18459+ xen_destroy_contiguous_region((unsigned long)pgd, 0);
18460
18461- if (!xen_feature(XENFEAT_pae_pgdir_above_4gb))
18462- xen_destroy_contiguous_region((unsigned long)pgd, 0);
18463- }
18464+ pgd_mop_up_pmds(mm, pgd);
18465+ free_page((unsigned long)pgd);
18466+}
18467
18468- /* in the non-PAE case, free_pgtables() clears user pgd entries */
18469- quicklist_free(0, pgd_dtor, pgd);
18470+void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
18471+{
18472+ pgtable_page_dtor(pte);
18473+ paravirt_release_pt(page_to_pfn(pte));
18474+ tlb_remove_page(tlb, pte);
18475 }
18476
18477-void check_pgt_cache(void)
18478+#ifdef CONFIG_X86_PAE
18479+
18480+void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
18481 {
18482- quicklist_trim(0, pgd_dtor, 25, 16);
18483+ paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT);
18484+ tlb_remove_page(tlb, virt_to_page(pmd));
18485 }
18486
18487+#endif
18488+
18489 void make_lowmem_page_readonly(void *va, unsigned int feature)
18490 {
18491 pte_t *pte;
18492+ unsigned int level;
18493 int rc;
18494
18495 if (xen_feature(feature))
18496 return;
18497
18498- pte = virt_to_ptep(va);
18499+ pte = lookup_address((unsigned long)va, &level);
18500+ BUG_ON(!pte || level != PG_LEVEL_4K || !pte_present(*pte));
18501 rc = HYPERVISOR_update_va_mapping(
18502 (unsigned long)va, pte_wrprotect(*pte), 0);
18503 BUG_ON(rc);
18504@@ -491,313 +413,15 @@ void make_lowmem_page_readonly(void *va,
18505 void make_lowmem_page_writable(void *va, unsigned int feature)
18506 {
18507 pte_t *pte;
18508+ unsigned int level;
18509 int rc;
18510
18511 if (xen_feature(feature))
18512 return;
18513
18514- pte = virt_to_ptep(va);
18515+ pte = lookup_address((unsigned long)va, &level);
18516+ BUG_ON(!pte || level != PG_LEVEL_4K || !pte_present(*pte));
18517 rc = HYPERVISOR_update_va_mapping(
18518 (unsigned long)va, pte_mkwrite(*pte), 0);
18519 BUG_ON(rc);
18520 }
18521-
18522-void make_page_readonly(void *va, unsigned int feature)
18523-{
18524- pte_t *pte;
18525- int rc;
18526-
18527- if (xen_feature(feature))
18528- return;
18529-
18530- pte = virt_to_ptep(va);
18531- rc = HYPERVISOR_update_va_mapping(
18532- (unsigned long)va, pte_wrprotect(*pte), 0);
18533- if (rc) /* fallback? */
18534- xen_l1_entry_update(pte, pte_wrprotect(*pte));
18535- if ((unsigned long)va >= (unsigned long)high_memory) {
18536- unsigned long pfn = pte_pfn(*pte);
18537-#ifdef CONFIG_HIGHMEM
18538- if (pfn >= highstart_pfn)
18539- kmap_flush_unused(); /* flush stale writable kmaps */
18540- else
18541-#endif
18542- make_lowmem_page_readonly(
18543- phys_to_virt(pfn << PAGE_SHIFT), feature);
18544- }
18545-}
18546-
18547-void make_page_writable(void *va, unsigned int feature)
18548-{
18549- pte_t *pte;
18550- int rc;
18551-
18552- if (xen_feature(feature))
18553- return;
18554-
18555- pte = virt_to_ptep(va);
18556- rc = HYPERVISOR_update_va_mapping(
18557- (unsigned long)va, pte_mkwrite(*pte), 0);
18558- if (rc) /* fallback? */
18559- xen_l1_entry_update(pte, pte_mkwrite(*pte));
18560- if ((unsigned long)va >= (unsigned long)high_memory) {
18561- unsigned long pfn = pte_pfn(*pte);
18562-#ifdef CONFIG_HIGHMEM
18563- if (pfn < highstart_pfn)
18564-#endif
18565- make_lowmem_page_writable(
18566- phys_to_virt(pfn << PAGE_SHIFT), feature);
18567- }
18568-}
18569-
18570-void make_pages_readonly(void *va, unsigned int nr, unsigned int feature)
18571-{
18572- if (xen_feature(feature))
18573- return;
18574-
18575- while (nr-- != 0) {
18576- make_page_readonly(va, feature);
18577- va = (void *)((unsigned long)va + PAGE_SIZE);
18578- }
18579-}
18580-
18581-void make_pages_writable(void *va, unsigned int nr, unsigned int feature)
18582-{
18583- if (xen_feature(feature))
18584- return;
18585-
18586- while (nr-- != 0) {
18587- make_page_writable(va, feature);
18588- va = (void *)((unsigned long)va + PAGE_SIZE);
18589- }
18590-}
18591-
18592-static void _pin_lock(struct mm_struct *mm, int lock) {
18593- if (lock)
18594- spin_lock(&mm->page_table_lock);
18595-#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
18596- /* While mm->page_table_lock protects us against insertions and
18597- * removals of higher level page table pages, it doesn't protect
18598- * against updates of pte-s. Such updates, however, require the
18599- * pte pages to be in consistent state (unpinned+writable or
18600- * pinned+readonly). The pinning and attribute changes, however
18601- * cannot be done atomically, which is why such updates must be
18602- * prevented from happening concurrently.
18603- * Note that no pte lock can ever elsewhere be acquired nesting
18604- * with an already acquired one in the same mm, or with the mm's
18605- * page_table_lock already acquired, as that would break in the
18606- * non-split case (where all these are actually resolving to the
18607- * one page_table_lock). Thus acquiring all of them here is not
18608- * going to result in dead locks, and the order of acquires
18609- * doesn't matter.
18610- */
18611- {
18612- pgd_t *pgd = mm->pgd;
18613- unsigned g;
18614-
18615- for (g = 0; g < USER_PTRS_PER_PGD; g++, pgd++) {
18616- pud_t *pud;
18617- unsigned u;
18618-
18619- if (pgd_none(*pgd))
18620- continue;
18621- pud = pud_offset(pgd, 0);
18622- for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
18623- pmd_t *pmd;
18624- unsigned m;
18625-
18626- if (pud_none(*pud))
18627- continue;
18628- pmd = pmd_offset(pud, 0);
18629- for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
18630- spinlock_t *ptl;
18631-
18632- if (pmd_none(*pmd))
18633- continue;
18634- ptl = pte_lockptr(0, pmd);
18635- if (lock)
18636- spin_lock(ptl);
18637- else
18638- spin_unlock(ptl);
18639- }
18640- }
18641- }
18642- }
18643-#endif
18644- if (!lock)
18645- spin_unlock(&mm->page_table_lock);
18646-}
18647-#define pin_lock(mm) _pin_lock(mm, 1)
18648-#define pin_unlock(mm) _pin_lock(mm, 0)
18649-
18650-#define PIN_BATCH 4
18651-static DEFINE_PER_CPU(multicall_entry_t[PIN_BATCH], pb_mcl);
18652-
18653-static inline unsigned int pgd_walk_set_prot(struct page *page, pgprot_t flags,
18654- unsigned int cpu, unsigned seq)
18655-{
18656- unsigned long pfn = page_to_pfn(page);
18657-
18658- if (PageHighMem(page)) {
18659- if (pgprot_val(flags) & _PAGE_RW)
18660- ClearPagePinned(page);
18661- else
18662- SetPagePinned(page);
18663- } else {
18664- MULTI_update_va_mapping(per_cpu(pb_mcl, cpu) + seq,
18665- (unsigned long)__va(pfn << PAGE_SHIFT),
18666- pfn_pte(pfn, flags), 0);
18667- if (unlikely(++seq == PIN_BATCH)) {
18668- if (unlikely(HYPERVISOR_multicall_check(per_cpu(pb_mcl, cpu),
18669- PIN_BATCH, NULL)))
18670- BUG();
18671- seq = 0;
18672- }
18673- }
18674-
18675- return seq;
18676-}
18677-
18678-static void pgd_walk(pgd_t *pgd_base, pgprot_t flags)
18679-{
18680- pgd_t *pgd = pgd_base;
18681- pud_t *pud;
18682- pmd_t *pmd;
18683- int g, u, m;
18684- unsigned int cpu, seq;
18685-
18686- if (xen_feature(XENFEAT_auto_translated_physmap))
18687- return;
18688-
18689- cpu = get_cpu();
18690-
18691- for (g = 0, seq = 0; g < USER_PTRS_PER_PGD; g++, pgd++) {
18692- if (pgd_none(*pgd))
18693- continue;
18694- pud = pud_offset(pgd, 0);
18695- if (PTRS_PER_PUD > 1) /* not folded */
18696- seq = pgd_walk_set_prot(virt_to_page(pud),flags,cpu,seq);
18697- for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
18698- if (pud_none(*pud))
18699- continue;
18700- pmd = pmd_offset(pud, 0);
18701- if (PTRS_PER_PMD > 1) /* not folded */
18702- seq = pgd_walk_set_prot(virt_to_page(pmd),flags,cpu,seq);
18703- for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
18704- if (pmd_none(*pmd))
18705- continue;
18706- seq = pgd_walk_set_prot(pmd_page(*pmd),flags,cpu,seq);
18707- }
18708- }
18709- }
18710-
18711- if (likely(seq != 0)) {
18712- MULTI_update_va_mapping(per_cpu(pb_mcl, cpu) + seq,
18713- (unsigned long)pgd_base,
18714- pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags),
18715- UVMF_TLB_FLUSH);
18716- if (unlikely(HYPERVISOR_multicall_check(per_cpu(pb_mcl, cpu),
18717- seq + 1, NULL)))
18718- BUG();
18719- } else if(HYPERVISOR_update_va_mapping((unsigned long)pgd_base,
18720- pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags),
18721- UVMF_TLB_FLUSH))
18722- BUG();
18723-
18724- put_cpu();
18725-}
18726-
18727-static void __pgd_pin(pgd_t *pgd)
18728-{
18729- pgd_walk(pgd, PAGE_KERNEL_RO);
18730- kmap_flush_unused();
18731- xen_pgd_pin(__pa(pgd));
18732- SetPagePinned(virt_to_page(pgd));
18733-}
18734-
18735-static void __pgd_unpin(pgd_t *pgd)
18736-{
18737- xen_pgd_unpin(__pa(pgd));
18738- pgd_walk(pgd, PAGE_KERNEL);
18739- ClearPagePinned(virt_to_page(pgd));
18740-}
18741-
18742-static void pgd_test_and_unpin(pgd_t *pgd)
18743-{
18744- if (PagePinned(virt_to_page(pgd)))
18745- __pgd_unpin(pgd);
18746-}
18747-
18748-void mm_pin(struct mm_struct *mm)
18749-{
18750- if (xen_feature(XENFEAT_writable_page_tables))
18751- return;
18752- pin_lock(mm);
18753- __pgd_pin(mm->pgd);
18754- pin_unlock(mm);
18755-}
18756-
18757-void mm_unpin(struct mm_struct *mm)
18758-{
18759- if (xen_feature(XENFEAT_writable_page_tables))
18760- return;
18761- pin_lock(mm);
18762- __pgd_unpin(mm->pgd);
18763- pin_unlock(mm);
18764-}
18765-
18766-void mm_pin_all(void)
18767-{
18768- struct page *page;
18769- unsigned long flags;
18770-
18771- if (xen_feature(XENFEAT_writable_page_tables))
18772- return;
18773-
18774- /*
18775- * Allow uninterrupted access to the pgd_list. Also protects
18776- * __pgd_pin() by disabling preemption.
18777- * All other CPUs must be at a safe point (e.g., in stop_machine
18778- * or offlined entirely).
18779- */
18780- spin_lock_irqsave(&pgd_lock, flags);
18781- for (page = pgd_list; page; page = (struct page *)page->index) {
18782- if (!PagePinned(page))
18783- __pgd_pin((pgd_t *)page_address(page));
18784- }
18785- spin_unlock_irqrestore(&pgd_lock, flags);
18786-}
18787-
18788-void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
18789-{
18790- if (!PagePinned(virt_to_page(mm->pgd)))
18791- mm_pin(mm);
18792-}
18793-
18794-void arch_exit_mmap(struct mm_struct *mm)
18795-{
18796- struct task_struct *tsk = current;
18797-
18798- task_lock(tsk);
18799-
18800- /*
18801- * We aggressively remove defunct pgd from cr3. We execute unmap_vmas()
18802- * *much* faster this way, as no tlb flushes means bigger wrpt batches.
18803- */
18804- if (tsk->active_mm == mm) {
18805- tsk->active_mm = &init_mm;
18806- atomic_inc(&init_mm.mm_count);
18807-
18808- switch_mm(mm, &init_mm, tsk);
18809-
18810- atomic_dec(&mm->mm_count);
18811- BUG_ON(atomic_read(&mm->mm_count) == 0);
18812- }
18813-
18814- task_unlock(tsk);
18815-
18816- if (PagePinned(virt_to_page(mm->pgd)) &&
18817- (atomic_read(&mm->mm_count) == 1) &&
18818- !mm->context.has_foreign_mappings)
18819- mm_unpin(mm);
18820-}
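
The pgtable_32-xen.c hunks above drop the hand-rolled pgd list (pages chained
through page->index and page_private()) in favour of the generic doubly-linked
list node already embedded in struct page (page->lru), and split the PAE work
into pgd_prepopulate_pmd()/pgd_mop_up_pmds(). A stand-alone user-space sketch
of that intrusive-list pattern, with a stripped-down list_head instead of the
kernel's <linux/list.h> (all names below are local to the sketch):

    #include <stdio.h>
    #include <stddef.h>

    struct list_head { struct list_head *next, *prev; };

    #define LIST_HEAD_INIT(name) { &(name), &(name) }

    static void list_add(struct list_head *new, struct list_head *head)
    {
        new->next = head->next;
        new->prev = head;
        head->next->prev = new;
        head->next = new;
    }

    static void list_del(struct list_head *entry)
    {
        entry->prev->next = entry->next;
        entry->next->prev = entry->prev;
    }

    #define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

    /* stand-in for struct page with its embedded lru list node */
    struct fake_page { unsigned long pfn; struct list_head lru; };

    static struct list_head pgd_list = LIST_HEAD_INIT(pgd_list);

    int main(void)
    {
        struct fake_page a = { .pfn = 1 }, b = { .pfn = 2 };
        struct list_head *pos;

        list_add(&a.lru, &pgd_list);    /* what pgd_list_add() now amounts to */
        list_add(&b.lru, &pgd_list);
        for (pos = pgd_list.next; pos != &pgd_list; pos = pos->next)
            printf("pfn %lu\n", container_of(pos, struct fake_page, lru)->pfn);
        list_del(&a.lru);               /* what pgd_list_del() now amounts to */
        return 0;
    }
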
18821--- sle11-2009-06-29.orig/arch/x86/pci/irq-xen.c 2009-02-16 16:18:36.000000000 +0100
18822+++ sle11-2009-06-29/arch/x86/pci/irq-xen.c 2009-03-16 16:33:40.000000000 +0100
18823@@ -204,6 +204,7 @@ static int pirq_ali_get(struct pci_dev *
18824 {
18825 static const unsigned char irqmap[16] = { 0, 9, 3, 10, 4, 5, 7, 6, 1, 11, 0, 12, 0, 14, 0, 15 };
18826
18827+ WARN_ON_ONCE(pirq >= 16);
18828 return irqmap[read_config_nybble(router, 0x48, pirq-1)];
18829 }
18830
18831@@ -211,7 +212,8 @@ static int pirq_ali_set(struct pci_dev *
18832 {
18833 static const unsigned char irqmap[16] = { 0, 8, 0, 2, 4, 5, 7, 6, 0, 1, 3, 9, 11, 0, 13, 15 };
18834 unsigned int val = irqmap[irq];
18835-
18836+
18837+ WARN_ON_ONCE(pirq >= 16);
18838 if (val) {
18839 write_config_nybble(router, 0x48, pirq-1, val);
18840 return 1;
18841@@ -261,12 +263,16 @@ static int pirq_via_set(struct pci_dev *
18842 static int pirq_via586_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
18843 {
18844 static const unsigned int pirqmap[5] = { 3, 2, 5, 1, 1 };
18845+
18846+ WARN_ON_ONCE(pirq >= 5);
18847 return read_config_nybble(router, 0x55, pirqmap[pirq-1]);
18848 }
18849
18850 static int pirq_via586_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
18851 {
18852 static const unsigned int pirqmap[5] = { 3, 2, 5, 1, 1 };
18853+
18854+ WARN_ON_ONCE(pirq >= 5);
18855 write_config_nybble(router, 0x55, pirqmap[pirq-1], irq);
18856 return 1;
18857 }
18858@@ -279,12 +285,16 @@ static int pirq_via586_set(struct pci_de
18859 static int pirq_ite_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
18860 {
18861 static const unsigned char pirqmap[4] = { 1, 0, 2, 3 };
18862+
18863+ WARN_ON_ONCE(pirq >= 4);
18864 return read_config_nybble(router,0x43, pirqmap[pirq-1]);
18865 }
18866
18867 static int pirq_ite_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
18868 {
18869 static const unsigned char pirqmap[4] = { 1, 0, 2, 3 };
18870+
18871+ WARN_ON_ONCE(pirq >= 4);
18872 write_config_nybble(router, 0x43, pirqmap[pirq-1], irq);
18873 return 1;
18874 }
18875@@ -423,6 +433,7 @@ static int pirq_sis_set(struct pci_dev *
18876
18877 static int pirq_vlsi_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
18878 {
18879+ WARN_ON_ONCE(pirq >= 9);
18880 if (pirq > 8) {
18881 printk(KERN_INFO "VLSI router pirq escape (%d)\n", pirq);
18882 return 0;
18883@@ -432,6 +443,7 @@ static int pirq_vlsi_get(struct pci_dev
18884
18885 static int pirq_vlsi_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
18886 {
18887+ WARN_ON_ONCE(pirq >= 9);
18888 if (pirq > 8) {
18889 printk(KERN_INFO "VLSI router pirq escape (%d)\n", pirq);
18890 return 0;
18891@@ -453,14 +465,14 @@ static int pirq_vlsi_set(struct pci_dev
18892 */
18893 static int pirq_serverworks_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
18894 {
18895- outb_p(pirq, 0xc00);
18896+ outb(pirq, 0xc00);
18897 return inb(0xc01) & 0xf;
18898 }
18899
18900 static int pirq_serverworks_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
18901 {
18902- outb_p(pirq, 0xc00);
18903- outb_p(irq, 0xc01);
18904+ outb(pirq, 0xc00);
18905+ outb(irq, 0xc01);
18906 return 1;
18907 }
18908
18909@@ -575,6 +587,10 @@ static __init int intel_router_probe(str
18910 case PCI_DEVICE_ID_INTEL_ICH9_4:
18911 case PCI_DEVICE_ID_INTEL_ICH9_5:
18912 case PCI_DEVICE_ID_INTEL_TOLAPAI_0:
18913+ case PCI_DEVICE_ID_INTEL_ICH10_0:
18914+ case PCI_DEVICE_ID_INTEL_ICH10_1:
18915+ case PCI_DEVICE_ID_INTEL_ICH10_2:
18916+ case PCI_DEVICE_ID_INTEL_ICH10_3:
18917 r->name = "PIIX/ICH";
18918 r->get = pirq_piix_get;
18919 r->set = pirq_piix_set;
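
The irq-xen.c hunks mostly add WARN_ON_ONCE() range checks in front of the
fixed-size pirq lookup tables (and relax the ServerWorks port accesses from
outb_p() to outb()). The guarded-table idea, reduced to a user-space sketch
with a simplified once-only warning macro; irqmap is copied from the ALi
router hunk above, everything else is local to the sketch:

    #include <stdio.h>

    /* crude stand-in for the kernel's WARN_ON_ONCE(): report only once */
    #define WARN_ON_ONCE(cond) do {                          \
            static int warned;                               \
            if ((cond) && !warned) {                         \
                    warned = 1;                              \
                    fprintf(stderr, "warning: %s\n", #cond); \
            }                                                \
    } while (0)

    static const unsigned char irqmap[16] = {
        0, 9, 3, 10, 4, 5, 7, 6, 1, 11, 0, 12, 0, 14, 0, 15
    };

    /* pirq is nominally 1..16; complain (once) before indexing with it */
    static int pirq_to_irq(int pirq)
    {
        WARN_ON_ONCE(pirq < 1 || pirq > 16);
        if (pirq < 1 || pirq > 16)
            return 0;
        return irqmap[pirq - 1];
    }

    int main(void)
    {
        printf("pirq 2 -> %d, pirq 42 -> %d\n",
               pirq_to_irq(2), pirq_to_irq(42));
        return 0;
    }
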
18920--- sle11-2009-06-29.orig/arch/x86/vdso/Makefile 2008-11-25 12:35:54.000000000 +0100
18921+++ sle11-2009-06-29/arch/x86/vdso/Makefile 2009-03-16 16:33:40.000000000 +0100
18922@@ -66,6 +66,7 @@ vdso32.so-$(VDSO32-y) += int80
18923 vdso32.so-$(CONFIG_COMPAT) += syscall
18924 vdso32.so-$(VDSO32-y) += sysenter
18925 xen-vdso32-$(subst 1,$(CONFIG_COMPAT),$(shell expr $(CONFIG_XEN_COMPAT)0 '<' 0x0302000)) += int80
18926+xen-vdso32-$(CONFIG_X86_32) += syscall
18927 vdso32.so-$(CONFIG_XEN) += $(xen-vdso32-y)
18928
18929 vdso32-images = $(vdso32.so-y:%=vdso32-%.so)
18930--- sle11-2009-06-29.orig/arch/x86/vdso/vdso32/syscall.S 2009-06-29 15:14:52.000000000 +0200
18931+++ sle11-2009-06-29/arch/x86/vdso/vdso32/syscall.S 2009-03-16 16:33:40.000000000 +0100
18932@@ -19,8 +19,10 @@ __kernel_vsyscall:
18933 .Lpush_ebp:
18934 movl %ecx, %ebp
18935 syscall
18936+#ifndef CONFIG_XEN
18937 movl $__USER32_DS, %ecx
18938 movl %ecx, %ss
18939+#endif
18940 movl %ebp, %ecx
18941 popl %ebp
18942 .Lpop_ebp:
18943--- sle11-2009-06-29.orig/arch/x86/vdso/vdso32.S 2009-06-29 15:14:52.000000000 +0200
18944+++ sle11-2009-06-29/arch/x86/vdso/vdso32.S 2009-03-16 16:33:40.000000000 +0100
18945@@ -19,4 +19,16 @@ vdso32_sysenter_start:
18946 .incbin "arch/x86/vdso/vdso32-sysenter.so"
18947 vdso32_sysenter_end:
18948
18949+#if defined(CONFIG_X86_64_XEN) && CONFIG_XEN_COMPAT < 0x030200
18950+ .globl vdso32_int80_start, vdso32_int80_end
18951+vdso32_int80_start:
18952+ .incbin "arch/x86/vdso/vdso32-int80.so"
18953+vdso32_int80_end:
18954+#elif defined(CONFIG_X86_XEN)
18955+ .globl vdso32_syscall_start, vdso32_syscall_end
18956+vdso32_syscall_start:
18957+ .incbin "arch/x86/vdso/vdso32-syscall.so"
18958+vdso32_syscall_end:
18959+#endif
18960+
18961 __FINIT
18962--- sle11-2009-06-29.orig/arch/x86/vdso/vdso32-setup.c 2008-11-25 12:35:53.000000000 +0100
18963+++ sle11-2009-06-29/arch/x86/vdso/vdso32-setup.c 2009-03-16 16:33:40.000000000 +0100
18964@@ -26,10 +26,6 @@
18965 #include <asm/vdso.h>
18966 #include <asm/proto.h>
18967
18968-#ifdef CONFIG_XEN
18969-#include <xen/interface/callback.h>
18970-#endif
18971-
18972 enum {
18973 VDSO_DISABLED = 0,
18974 VDSO_ENABLED = 1,
18975@@ -229,7 +225,6 @@ static inline void map_compat_vdso(int m
18976
18977 void enable_sep_cpu(void)
18978 {
18979-#ifndef CONFIG_XEN
18980 int cpu = get_cpu();
18981 struct tss_struct *tss = &per_cpu(init_tss, cpu);
18982
18983@@ -244,35 +239,6 @@ void enable_sep_cpu(void)
18984 wrmsr(MSR_IA32_SYSENTER_ESP, tss->x86_tss.sp1, 0);
18985 wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long) ia32_sysenter_target, 0);
18986 put_cpu();
18987-#else
18988- extern asmlinkage void ia32pv_sysenter_target(void);
18989- static struct callback_register sysenter = {
18990- .type = CALLBACKTYPE_sysenter,
18991- .address = { __KERNEL_CS, (unsigned long)ia32pv_sysenter_target },
18992- };
18993-
18994- if (!boot_cpu_has(X86_FEATURE_SEP))
18995- return;
18996-
18997- get_cpu();
18998-
18999- if (xen_feature(XENFEAT_supervisor_mode_kernel))
19000- sysenter.address.eip = (unsigned long)ia32_sysenter_target;
19001-
19002- switch (HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter)) {
19003- case 0:
19004- break;
19005-#if CONFIG_XEN_COMPAT < 0x030200
19006- case -ENOSYS:
19007- sysenter.type = CALLBACKTYPE_sysenter_deprecated;
19008- if (HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter) == 0)
19009- break;
19010-#endif
19011- default:
19012- clear_bit(X86_FEATURE_SEP, boot_cpu_data.x86_capability);
19013- break;
19014- }
19015-#endif
19016 }
19017
19018 static struct vm_area_struct gate_vma;
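
The block removed here is the Xen flavour of enable_sep_cpu(): it registered a
CALLBACKTYPE_sysenter callback with the hypervisor and, when built with
CONFIG_XEN_COMPAT < 0x030200, fell back to CALLBACKTYPE_sysenter_deprecated if
the first attempt returned -ENOSYS (the same logic reappears in the new
vdso32-setup-xen.c below). The try-new-then-fall-back negotiation on its own,
as a user-space sketch with made-up stand-ins for the hypercall and the
callback types:

    #include <errno.h>
    #include <stdio.h>

    enum { CB_SYSENTER, CB_SYSENTER_DEPRECATED };

    /* pretend only the deprecated type is understood, as an old host would */
    static int register_callback(int type)
    {
        return type == CB_SYSENTER_DEPRECATED ? 0 : -ENOSYS;
    }

    int main(void)
    {
        int rc = register_callback(CB_SYSENTER);

        if (rc == -ENOSYS)          /* fall back, as the removed code did */
            rc = register_callback(CB_SYSENTER_DEPRECATED);
        if (rc)
            printf("sysenter left disabled (rc=%d)\n", rc);
        else
            printf("sysenter callback registered\n");
        return 0;
    }
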
19019--- /dev/null 1970-01-01 00:00:00.000000000 +0000
19020+++ sle11-2009-06-29/arch/x86/vdso/vdso32-setup-xen.c 2009-03-16 16:33:40.000000000 +0100
19021@@ -0,0 +1,506 @@
19022+/*
19023+ * (C) Copyright 2002 Linus Torvalds
19024+ * Portions based on the vdso-randomization code from exec-shield:
19025+ * Copyright(C) 2005-2006, Red Hat, Inc., Ingo Molnar
19026+ *
19027+ * This file contains the needed initializations to support sysenter.
19028+ */
19029+
19030+#include <linux/init.h>
19031+#include <linux/smp.h>
19032+#include <linux/thread_info.h>
19033+#include <linux/sched.h>
19034+#include <linux/gfp.h>
19035+#include <linux/string.h>
19036+#include <linux/elf.h>
19037+#include <linux/mm.h>
19038+#include <linux/err.h>
19039+#include <linux/module.h>
19040+
19041+#include <asm/cpufeature.h>
19042+#include <asm/msr.h>
19043+#include <asm/pgtable.h>
19044+#include <asm/unistd.h>
19045+#include <asm/elf.h>
19046+#include <asm/tlbflush.h>
19047+#include <asm/vdso.h>
19048+#include <asm/proto.h>
19049+
19050+#include <xen/interface/callback.h>
19051+
19052+enum {
19053+ VDSO_DISABLED = 0,
19054+ VDSO_ENABLED = 1,
19055+ VDSO_COMPAT = 2,
19056+};
19057+
19058+#ifdef CONFIG_COMPAT_VDSO
19059+#define VDSO_DEFAULT VDSO_COMPAT
19060+#else
19061+#define VDSO_DEFAULT VDSO_ENABLED
19062+#endif
19063+
19064+#ifdef CONFIG_X86_64
19065+#define vdso_enabled sysctl_vsyscall32
19066+#define arch_setup_additional_pages syscall32_setup_pages
19067+#endif
19068+
19069+/*
19070+ * This is the difference between the prelinked addresses in the vDSO images
19071+ * and the VDSO_HIGH_BASE address where CONFIG_COMPAT_VDSO places the vDSO
19072+ * in the user address space.
19073+ */
19074+#define VDSO_ADDR_ADJUST (VDSO_HIGH_BASE - (unsigned long)VDSO32_PRELINK)
19075+
19076+/*
19077+ * Should the kernel map a VDSO page into processes and pass its
19078+ * address down to glibc upon exec()?
19079+ */
19080+unsigned int __read_mostly vdso_enabled = VDSO_DEFAULT;
19081+
19082+static int __init vdso_setup(char *s)
19083+{
19084+ vdso_enabled = simple_strtoul(s, NULL, 0);
19085+
19086+ return 1;
19087+}
19088+
19089+/*
19090+ * For consistency, the argument vdso32=[012] affects the 32-bit vDSO
19091+ * behavior on both 64-bit and 32-bit kernels.
19092+ * On 32-bit kernels, vdso=[012] means the same thing.
19093+ */
19094+__setup("vdso32=", vdso_setup);
19095+
19096+#ifdef CONFIG_X86_32
19097+__setup_param("vdso=", vdso32_setup, vdso_setup, 0);
19098+
19099+EXPORT_SYMBOL_GPL(vdso_enabled);
19100+#endif
19101+
19102+static __init void reloc_symtab(Elf32_Ehdr *ehdr,
19103+ unsigned offset, unsigned size)
19104+{
19105+ Elf32_Sym *sym = (void *)ehdr + offset;
19106+ unsigned nsym = size / sizeof(*sym);
19107+ unsigned i;
19108+
19109+ for(i = 0; i < nsym; i++, sym++) {
19110+ if (sym->st_shndx == SHN_UNDEF ||
19111+ sym->st_shndx == SHN_ABS)
19112+ continue; /* skip */
19113+
19114+ if (sym->st_shndx > SHN_LORESERVE) {
19115+ printk(KERN_INFO "VDSO: unexpected st_shndx %x\n",
19116+ sym->st_shndx);
19117+ continue;
19118+ }
19119+
19120+ switch(ELF_ST_TYPE(sym->st_info)) {
19121+ case STT_OBJECT:
19122+ case STT_FUNC:
19123+ case STT_SECTION:
19124+ case STT_FILE:
19125+ sym->st_value += VDSO_ADDR_ADJUST;
19126+ }
19127+ }
19128+}
19129+
19130+static __init void reloc_dyn(Elf32_Ehdr *ehdr, unsigned offset)
19131+{
19132+ Elf32_Dyn *dyn = (void *)ehdr + offset;
19133+
19134+ for(; dyn->d_tag != DT_NULL; dyn++)
19135+ switch(dyn->d_tag) {
19136+ case DT_PLTGOT:
19137+ case DT_HASH:
19138+ case DT_STRTAB:
19139+ case DT_SYMTAB:
19140+ case DT_RELA:
19141+ case DT_INIT:
19142+ case DT_FINI:
19143+ case DT_REL:
19144+ case DT_DEBUG:
19145+ case DT_JMPREL:
19146+ case DT_VERSYM:
19147+ case DT_VERDEF:
19148+ case DT_VERNEED:
19149+ case DT_ADDRRNGLO ... DT_ADDRRNGHI:
19150+ /* definitely pointers needing relocation */
19151+ dyn->d_un.d_ptr += VDSO_ADDR_ADJUST;
19152+ break;
19153+
19154+ case DT_ENCODING ... OLD_DT_LOOS-1:
19155+ case DT_LOOS ... DT_HIOS-1:
19156+ /* Tags above DT_ENCODING are pointers if
19157+ they're even */
19158+ if (dyn->d_tag >= DT_ENCODING &&
19159+ (dyn->d_tag & 1) == 0)
19160+ dyn->d_un.d_ptr += VDSO_ADDR_ADJUST;
19161+ break;
19162+
19163+ case DT_VERDEFNUM:
19164+ case DT_VERNEEDNUM:
19165+ case DT_FLAGS_1:
19166+ case DT_RELACOUNT:
19167+ case DT_RELCOUNT:
19168+ case DT_VALRNGLO ... DT_VALRNGHI:
19169+ /* definitely not pointers */
19170+ break;
19171+
19172+ case OLD_DT_LOOS ... DT_LOOS-1:
19173+ case DT_HIOS ... DT_VALRNGLO-1:
19174+ default:
19175+ if (dyn->d_tag > DT_ENCODING)
19176+ printk(KERN_INFO "VDSO: unexpected DT_tag %x\n",
19177+ dyn->d_tag);
19178+ break;
19179+ }
19180+}
19181+
19182+static __init void relocate_vdso(Elf32_Ehdr *ehdr)
19183+{
19184+ Elf32_Phdr *phdr;
19185+ Elf32_Shdr *shdr;
19186+ int i;
19187+
19188+ BUG_ON(memcmp(ehdr->e_ident, ELFMAG, 4) != 0 ||
19189+ !elf_check_arch_ia32(ehdr) ||
19190+ ehdr->e_type != ET_DYN);
19191+
19192+ ehdr->e_entry += VDSO_ADDR_ADJUST;
19193+
19194+ /* rebase phdrs */
19195+ phdr = (void *)ehdr + ehdr->e_phoff;
19196+ for (i = 0; i < ehdr->e_phnum; i++) {
19197+ phdr[i].p_vaddr += VDSO_ADDR_ADJUST;
19198+
19199+ /* relocate dynamic stuff */
19200+ if (phdr[i].p_type == PT_DYNAMIC)
19201+ reloc_dyn(ehdr, phdr[i].p_offset);
19202+ }
19203+
19204+ /* rebase sections */
19205+ shdr = (void *)ehdr + ehdr->e_shoff;
19206+ for(i = 0; i < ehdr->e_shnum; i++) {
19207+ if (!(shdr[i].sh_flags & SHF_ALLOC))
19208+ continue;
19209+
19210+ shdr[i].sh_addr += VDSO_ADDR_ADJUST;
19211+
19212+ if (shdr[i].sh_type == SHT_SYMTAB ||
19213+ shdr[i].sh_type == SHT_DYNSYM)
19214+ reloc_symtab(ehdr, shdr[i].sh_offset,
19215+ shdr[i].sh_size);
19216+ }
19217+}
19218+
19219+/*
19220+ * These symbols are defined by vdso32.S to mark the bounds
19221+ * of the ELF DSO images included therein.
19222+ */
19223+extern const char vdso32_default_start, vdso32_default_end;
19224+extern const char vdso32_sysenter_start, vdso32_sysenter_end;
19225+static struct page *vdso32_pages[1];
19226+
19227+#ifdef CONFIG_X86_64
19228+
19229+#if CONFIG_XEN_COMPAT < 0x030200
19230+static int use_int80 = 1;
19231+#endif
19232+static int use_sysenter __read_mostly = -1;
19233+
19234+#define vdso32_sysenter() (use_sysenter > 0)
19235+
19236+/* May not be __init: called during resume */
19237+void syscall32_cpu_init(void)
19238+{
19239+ static const struct callback_register cstar = {
19240+ .type = CALLBACKTYPE_syscall32,
19241+ .address = (unsigned long)ia32_cstar_target
19242+ };
19243+ static const struct callback_register sysenter = {
19244+ .type = CALLBACKTYPE_sysenter,
19245+ .address = (unsigned long)ia32_sysenter_target
19246+ };
19247+
19248+ if ((HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter) < 0) ||
19249+ (HYPERVISOR_callback_op(CALLBACKOP_register, &cstar) < 0))
19250+#if CONFIG_XEN_COMPAT < 0x030200
19251+ return;
19252+ use_int80 = 0;
19253+#else
19254+ BUG();
19255+#endif
19256+
19257+ if (use_sysenter < 0)
19258+ use_sysenter = (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL);
19259+}
19260+
19261+#define compat_uses_vma 1
19262+
19263+static inline void map_compat_vdso(int map)
19264+{
19265+}
19266+
19267+#else /* CONFIG_X86_32 */
19268+
19269+#define vdso32_sysenter() (boot_cpu_has(X86_FEATURE_SEP))
19270+
19271+extern asmlinkage void ia32pv_cstar_target(void);
19272+static const struct callback_register __cpuinitconst cstar = {
19273+ .type = CALLBACKTYPE_syscall32,
19274+ .address = { __KERNEL_CS, (unsigned long)ia32pv_cstar_target },
19275+};
19276+
19277+void __cpuinit enable_sep_cpu(void)
19278+{
19279+ extern asmlinkage void ia32pv_sysenter_target(void);
19280+ static struct callback_register __cpuinitdata sysenter = {
19281+ .type = CALLBACKTYPE_sysenter,
19282+ .address = { __KERNEL_CS, (unsigned long)ia32pv_sysenter_target },
19283+ };
19284+
19285+ if (boot_cpu_has(X86_FEATURE_SYSCALL)) {
19286+ if (HYPERVISOR_callback_op(CALLBACKOP_register, &cstar) != 0)
19287+ BUG();
19288+ return;
19289+ }
19290+
19291+ if (!boot_cpu_has(X86_FEATURE_SEP))
19292+ return;
19293+
19294+ if (xen_feature(XENFEAT_supervisor_mode_kernel))
19295+ sysenter.address.eip = (unsigned long)ia32_sysenter_target;
19296+
19297+ switch (HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter)) {
19298+ case 0:
19299+ break;
19300+#if CONFIG_XEN_COMPAT < 0x030200
19301+ case -ENOSYS:
19302+ sysenter.type = CALLBACKTYPE_sysenter_deprecated;
19303+ if (HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter) == 0)
19304+ break;
19305+#endif
19306+ default:
19307+ setup_clear_cpu_cap(X86_FEATURE_SEP);
19308+ break;
19309+ }
19310+}
19311+
19312+static struct vm_area_struct gate_vma;
19313+
19314+static int __init gate_vma_init(void)
19315+{
19316+ gate_vma.vm_mm = NULL;
19317+ gate_vma.vm_start = FIXADDR_USER_START;
19318+ gate_vma.vm_end = FIXADDR_USER_END;
19319+ gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC;
19320+ gate_vma.vm_page_prot = __P101;
19321+ /*
19322+ * Make sure the vDSO gets into every core dump.
19323+ * Dumping its contents makes post-mortem fully interpretable later
19324+ * without matching up the same kernel and hardware config to see
19325+ * what PC values meant.
19326+ */
19327+ gate_vma.vm_flags |= VM_ALWAYSDUMP;
19328+ return 0;
19329+}
19330+
19331+#define compat_uses_vma 0
19332+
19333+static void map_compat_vdso(int map)
19334+{
19335+ static int vdso_mapped;
19336+
19337+ if (map == vdso_mapped)
19338+ return;
19339+
19340+ vdso_mapped = map;
19341+
19342+ __set_fixmap(FIX_VDSO, page_to_pfn(vdso32_pages[0]) << PAGE_SHIFT,
19343+ map ? PAGE_READONLY_EXEC : PAGE_NONE);
19344+
19345+ /* flush stray tlbs */
19346+ flush_tlb_all();
19347+}
19348+
19349+#endif /* CONFIG_X86_64 */
19350+
19351+int __init sysenter_setup(void)
19352+{
19353+ void *syscall_page = (void *)get_zeroed_page(GFP_ATOMIC);
19354+ const void *vsyscall;
19355+ size_t vsyscall_len;
19356+
19357+ vdso32_pages[0] = virt_to_page(syscall_page);
19358+
19359+#ifdef CONFIG_X86_32
19360+ gate_vma_init();
19361+
19362+ printk("Compat vDSO mapped to %08lx.\n", __fix_to_virt(FIX_VDSO));
19363+#endif
19364+
19365+#if defined(CONFIG_X86_64) && CONFIG_XEN_COMPAT < 0x030200
19366+ if (use_int80) {
19367+ extern const char vdso32_int80_start, vdso32_int80_end;
19368+
19369+ vsyscall = &vdso32_int80_start;
19370+ vsyscall_len = &vdso32_int80_end - &vdso32_int80_start;
19371+ } else
19372+#elif defined(CONFIG_X86_32)
19373+ if (boot_cpu_has(X86_FEATURE_SYSCALL)
19374+ && (boot_cpu_data.x86_vendor != X86_VENDOR_AMD
19375+ || HYPERVISOR_callback_op(CALLBACKOP_register, &cstar) != 0))
19376+ setup_clear_cpu_cap(X86_FEATURE_SYSCALL);
19377+ barrier(); /* until clear_bit()'s constraints are correct ... */
19378+ if (boot_cpu_has(X86_FEATURE_SYSCALL)) {
19379+ extern const char vdso32_syscall_start, vdso32_syscall_end;
19380+
19381+ vsyscall = &vdso32_syscall_start;
19382+ vsyscall_len = &vdso32_syscall_end - &vdso32_syscall_start;
19383+ } else
19384+#endif
19385+ if (!vdso32_sysenter()) {
19386+ vsyscall = &vdso32_default_start;
19387+ vsyscall_len = &vdso32_default_end - &vdso32_default_start;
19388+ } else {
19389+ vsyscall = &vdso32_sysenter_start;
19390+ vsyscall_len = &vdso32_sysenter_end - &vdso32_sysenter_start;
19391+ }
19392+
19393+ memcpy(syscall_page, vsyscall, vsyscall_len);
19394+ relocate_vdso(syscall_page);
19395+
19396+ return 0;
19397+}
19398+
19399+/* Setup a VMA at program startup for the vsyscall page */
19400+int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack)
19401+{
19402+ struct mm_struct *mm = current->mm;
19403+ unsigned long addr;
19404+ int ret = 0;
19405+ bool compat;
19406+
19407+ down_write(&mm->mmap_sem);
19408+
19409+ /* Test compat mode once here, in case someone
19410+ changes it via sysctl */
19411+ compat = (vdso_enabled == VDSO_COMPAT);
19412+
19413+ map_compat_vdso(compat);
19414+
19415+ if (compat)
19416+ addr = VDSO_HIGH_BASE;
19417+ else {
19418+ addr = get_unmapped_area(NULL, 0, PAGE_SIZE, 0, 0);
19419+ if (IS_ERR_VALUE(addr)) {
19420+ ret = addr;
19421+ goto up_fail;
19422+ }
19423+ }
19424+
19425+ if (compat_uses_vma || !compat) {
19426+ /*
19427+ * MAYWRITE to allow gdb to COW and set breakpoints
19428+ *
19429+ * Make sure the vDSO gets into every core dump.
19430+ * Dumping its contents makes post-mortem fully
19431+ * interpretable later without matching up the same
19432+ * kernel and hardware config to see what PC values
19433+ * meant.
19434+ */
19435+ ret = install_special_mapping(mm, addr, PAGE_SIZE,
19436+ VM_READ|VM_EXEC|
19437+ VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC|
19438+ VM_ALWAYSDUMP,
19439+ vdso32_pages);
19440+
19441+ if (ret)
19442+ goto up_fail;
19443+ }
19444+
19445+ current->mm->context.vdso = (void *)addr;
19446+ current_thread_info()->sysenter_return =
19447+ VDSO32_SYMBOL(addr, SYSENTER_RETURN);
19448+
19449+ up_fail:
19450+ up_write(&mm->mmap_sem);
19451+
19452+ return ret;
19453+}
19454+
19455+#ifdef CONFIG_X86_64
19456+
19457+/*
19458+ * This must be done early in case we have an initrd containing 32-bit
19459+ * binaries (e.g., hotplug). This could be pushed upstream.
19460+ */
19461+core_initcall(sysenter_setup);
19462+
19463+#ifdef CONFIG_SYSCTL
19464+/* Register vsyscall32 into the ABI table */
19465+#include <linux/sysctl.h>
19466+
19467+static ctl_table abi_table2[] = {
19468+ {
19469+ .procname = "vsyscall32",
19470+ .data = &sysctl_vsyscall32,
19471+ .maxlen = sizeof(int),
19472+ .mode = 0644,
19473+ .proc_handler = proc_dointvec
19474+ },
19475+ {}
19476+};
19477+
19478+static ctl_table abi_root_table2[] = {
19479+ {
19480+ .ctl_name = CTL_ABI,
19481+ .procname = "abi",
19482+ .mode = 0555,
19483+ .child = abi_table2
19484+ },
19485+ {}
19486+};
19487+
19488+static __init int ia32_binfmt_init(void)
19489+{
19490+ register_sysctl_table(abi_root_table2);
19491+ return 0;
19492+}
19493+__initcall(ia32_binfmt_init);
19494+#endif
19495+
19496+#else /* CONFIG_X86_32 */
19497+
19498+const char *arch_vma_name(struct vm_area_struct *vma)
19499+{
19500+ if (vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso)
19501+ return "[vdso]";
19502+ return NULL;
19503+}
19504+
19505+struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
19506+{
19507+ struct mm_struct *mm = tsk->mm;
19508+
19509+ /* Check to see if this task was created in compat vdso mode */
19510+ if (mm && mm->context.vdso == (void *)VDSO_HIGH_BASE)
19511+ return &gate_vma;
19512+ return NULL;
19513+}
19514+
19515+int in_gate_area(struct task_struct *task, unsigned long addr)
19516+{
19517+ const struct vm_area_struct *vma = get_gate_vma(task);
19518+
19519+ return vma && addr >= vma->vm_start && addr < vma->vm_end;
19520+}
19521+
19522+int in_gate_area_no_task(unsigned long addr)
19523+{
19524+ return 0;
19525+}
19526+
19527+#endif /* CONFIG_X86_64 */
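
relocate_vdso() in the new file rebases a 32-bit ELF image in place by
VDSO_ADDR_ADJUST: entry point, program headers, every SHF_ALLOC section,
pointer-valued dynamic tags and symbol values. A read-only user-space sketch
of the same header walk, run against a file such as one of the vdso32-*.so
images (it only prints addresses instead of adjusting them; all types and
constants come from <elf.h>):

    #include <elf.h>
    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/mman.h>
    #include <sys/stat.h>
    #include <unistd.h>

    int main(int argc, char **argv)
    {
        int fd, i;
        struct stat st;
        Elf32_Ehdr *ehdr;
        Elf32_Shdr *shdr;

        if (argc != 2) {
            fprintf(stderr, "usage: %s <32-bit ELF file>\n", argv[0]);
            return 1;
        }
        fd = open(argv[1], O_RDONLY);
        if (fd < 0 || fstat(fd, &st) < 0)
            return 1;
        ehdr = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
        if (ehdr == MAP_FAILED ||
            memcmp(ehdr->e_ident, ELFMAG, SELFMAG) != 0 ||
            ehdr->e_ident[EI_CLASS] != ELFCLASS32)
            return 1;

        /* same traversal as relocate_vdso(), printing instead of rebasing */
        shdr = (Elf32_Shdr *)((char *)ehdr + ehdr->e_shoff);
        for (i = 0; i < ehdr->e_shnum; i++)
            if (shdr[i].sh_flags & SHF_ALLOC)
                printf("section %2d: addr 0x%08x type %u\n", i,
                       (unsigned int)shdr[i].sh_addr,
                       (unsigned int)shdr[i].sh_type);

        munmap(ehdr, st.st_size);
        close(fd);
        return 0;
    }
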
19528--- sle11-2009-06-29.orig/drivers/pci/msi-xen.c 2009-02-16 16:18:36.000000000 +0100
19529+++ sle11-2009-06-29/drivers/pci/msi-xen.c 2009-03-16 16:33:40.000000000 +0100
19530@@ -43,6 +43,53 @@ struct msi_pirq_entry {
19531 int entry_nr;
19532 };
19533
19534+/* Arch hooks */
19535+
19536+int __attribute__ ((weak))
19537+arch_msi_check_device(struct pci_dev *dev, int nvec, int type)
19538+{
19539+ return 0;
19540+}
19541+
19542+#ifndef CONFIG_XEN
19543+int __attribute__ ((weak))
19544+arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *entry)
19545+{
19546+ return 0;
19547+}
19548+
19549+int __attribute__ ((weak))
19550+arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
19551+{
19552+ struct msi_desc *entry;
19553+ int ret;
19554+
19555+ list_for_each_entry(entry, &dev->msi_list, list) {
19556+ ret = arch_setup_msi_irq(dev, entry);
19557+ if (ret)
19558+ return ret;
19559+ }
19560+
19561+ return 0;
19562+}
19563+
19564+void __attribute__ ((weak)) arch_teardown_msi_irq(unsigned int irq)
19565+{
19566+ return;
19567+}
19568+
19569+void __attribute__ ((weak))
19570+arch_teardown_msi_irqs(struct pci_dev *dev)
19571+{
19572+ struct msi_desc *entry;
19573+
19574+ list_for_each_entry(entry, &dev->msi_list, list) {
19575+ if (entry->irq != 0)
19576+ arch_teardown_msi_irq(entry->irq);
19577+ }
19578+}
19579+#endif
19580+
19581 static void msi_set_enable(struct pci_dev *dev, int enable)
19582 {
19583 int pos;
19584@@ -270,7 +317,6 @@ static void pci_intx_for_msi(struct pci_
19585 pci_intx(dev, enable);
19586 }
19587
19588-#ifdef CONFIG_PM
19589 static void __pci_restore_msi_state(struct pci_dev *dev)
19590 {
19591 int pirq;
19592@@ -328,7 +374,7 @@ void pci_restore_msi_state(struct pci_de
19593 __pci_restore_msi_state(dev);
19594 __pci_restore_msix_state(dev);
19595 }
19596-#endif /* CONFIG_PM */
19597+EXPORT_SYMBOL_GPL(pci_restore_msi_state);
19598
19599 /**
19600 * msi_capability_init - configure device's MSI capability structure
19601@@ -755,51 +801,3 @@ void pci_msi_init_pci_dev(struct pci_dev
19602 INIT_LIST_HEAD(&dev->msi_list);
19603 #endif
19604 }
19605-
19606-
19607-/* Arch hooks */
19608-
19609-int __attribute__ ((weak))
19610-arch_msi_check_device(struct pci_dev* dev, int nvec, int type)
19611-{
19612- return 0;
19613-}
19614-
19615-#ifndef CONFIG_XEN
19616-int __attribute__ ((weak))
19617-arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *entry)
19618-{
19619- return 0;
19620-}
19621-
19622-int __attribute__ ((weak))
19623-arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
19624-{
19625- struct msi_desc *entry;
19626- int ret;
19627-
19628- list_for_each_entry(entry, &dev->msi_list, list) {
19629- ret = arch_setup_msi_irq(dev, entry);
19630- if (ret)
19631- return ret;
19632- }
19633-
19634- return 0;
19635-}
19636-
19637-void __attribute__ ((weak)) arch_teardown_msi_irq(unsigned int irq)
19638-{
19639- return;
19640-}
19641-
19642-void __attribute__ ((weak))
19643-arch_teardown_msi_irqs(struct pci_dev *dev)
19644-{
19645- struct msi_desc *entry;
19646-
19647- list_for_each_entry(entry, &dev->msi_list, list) {
19648- if (entry->irq != 0)
19649- arch_teardown_msi_irq(entry->irq);
19650- }
19651-}
19652-#endif
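
The msi-xen.c change moves the weak arch_*_msi_* hook definitions ahead of
their users, drops the CONFIG_PM guard around the restore path and exports
pci_restore_msi_state(). The weak-symbol idiom those hooks rely on, in a
minimal sketch (arch_check_device() is a made-up stand-in for
arch_msi_check_device(), with the pci_dev argument dropped):

    #include <stdio.h>

    /*
     * Weak default: used unless some other object file in the link provides
     * a normal (strong) definition of the same symbol, which is exactly how
     * an architecture can override the generic hook without any
     * registration call.
     */
    int __attribute__((weak)) arch_check_device(int nvec, int type)
    {
        (void)nvec;
        (void)type;
        return 0;       /* default: nothing to veto */
    }

    int main(void)
    {
        printf("arch_check_device(1, 0) = %d\n", arch_check_device(1, 0));
        return 0;
    }

Linking in another object that defines a non-weak arch_check_device() would
silently replace the default above; nothing in the caller changes.
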
19653--- sle11-2009-06-29.orig/drivers/pci/pci.c 2009-06-29 15:14:52.000000000 +0200
19654+++ sle11-2009-06-29/drivers/pci/pci.c 2009-03-16 16:33:40.000000000 +0100
19655@@ -353,7 +353,12 @@ pci_find_parent_resource(const struct pc
19656 * Restore the BAR values for a given device, so as to make it
19657 * accessible by its driver.
19658 */
19659+#ifndef CONFIG_XEN
19660 static void
19661+#else
19662+EXPORT_SYMBOL_GPL(pci_restore_bars);
19663+void
19664+#endif
19665 pci_restore_bars(struct pci_dev *dev)
19666 {
19667 int i, numres;
19668--- sle11-2009-06-29.orig/drivers/xen/balloon/sysfs.c 2009-03-04 11:25:55.000000000 +0100
19669+++ sle11-2009-06-29/drivers/xen/balloon/sysfs.c 2009-06-29 15:29:24.000000000 +0200
19670@@ -104,7 +104,7 @@ static struct attribute_group balloon_in
19671 };
19672
19673 static struct sysdev_class balloon_sysdev_class = {
19674- set_kset_name(BALLOON_CLASS_NAME),
19675+ .name = BALLOON_CLASS_NAME,
19676 };
19677
19678 static struct sys_device balloon_sysdev;
19679--- sle11-2009-06-29.orig/drivers/xen/blkback/blkback.c 2009-02-16 16:18:36.000000000 +0100
19680+++ sle11-2009-06-29/drivers/xen/blkback/blkback.c 2009-03-16 16:33:40.000000000 +0100
19681@@ -148,7 +148,7 @@ static void unplug_queue(blkif_t *blkif)
19682 return;
19683 if (blkif->plug->unplug_fn)
19684 blkif->plug->unplug_fn(blkif->plug);
19685- blk_put_queue(blkif->plug);
19686+ kobject_put(&blkif->plug->kobj);
19687 blkif->plug = NULL;
19688 }
19689
19690@@ -159,7 +159,8 @@ static void plug_queue(blkif_t *blkif, s
19691 if (q == blkif->plug)
19692 return;
19693 unplug_queue(blkif);
19694- blk_get_queue(q);
19695+ WARN_ON(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags));
19696+ kobject_get(&q->kobj);
19697 blkif->plug = q;
19698 }
19699
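
plug_queue()/unplug_queue() above now take and drop a reference on the request
queue through its embedded kobject (kobject_get()/kobject_put()) rather than
blk_get_queue()/blk_put_queue(), warning if the queue is already marked dead.
A toy, non-atomic model of that get/put discipline (the kernel's kobject
refcount is atomic and frees the object through its ktype release hook):

    #include <stdio.h>

    struct toy_kobj { int refcount; const char *name; };

    static struct toy_kobj *toy_get(struct toy_kobj *k)
    {
        k->refcount++;
        return k;
    }

    static void toy_put(struct toy_kobj *k)
    {
        if (--k->refcount == 0)
            printf("%s: last reference gone, would be freed\n", k->name);
    }

    int main(void)
    {
        struct toy_kobj q = { .refcount = 1, .name = "request_queue" };

        toy_get(&q);    /* plug_queue() pins the queue         */
        toy_put(&q);    /* unplug_queue() releases the pin     */
        toy_put(&q);    /* final put by the queue's real owner */
        return 0;
    }
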
19700--- sle11-2009-06-29.orig/drivers/xen/blkfront/blkfront.c 2009-02-16 16:18:36.000000000 +0100
19701+++ sle11-2009-06-29/drivers/xen/blkfront/blkfront.c 2009-03-24 10:12:53.000000000 +0100
19702@@ -713,7 +713,6 @@ static irqreturn_t blkif_int(int irq, vo
19703 RING_IDX i, rp;
19704 unsigned long flags;
19705 struct blkfront_info *info = (struct blkfront_info *)dev_id;
19706- int uptodate;
19707
19708 spin_lock_irqsave(&blkif_io_lock, flags);
19709
19710@@ -738,13 +737,13 @@ static irqreturn_t blkif_int(int irq, vo
19711
19712 ADD_ID_TO_FREELIST(info, id);
19713
19714- uptodate = (bret->status == BLKIF_RSP_OKAY);
19715+ ret = bret->status == BLKIF_RSP_OKAY ? 0 : -EIO;
19716 switch (bret->operation) {
19717 case BLKIF_OP_WRITE_BARRIER:
19718 if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) {
19719 printk("blkfront: %s: write barrier op failed\n",
19720 info->gd->disk_name);
19721- uptodate = -EOPNOTSUPP;
19722+ ret = -EOPNOTSUPP;
19723 info->feature_barrier = 0;
19724 xlvbd_barrier(info);
19725 }
19726@@ -755,10 +754,8 @@ static irqreturn_t blkif_int(int irq, vo
19727 DPRINTK("Bad return from blkdev data "
19728 "request: %x\n", bret->status);
19729
19730- ret = end_that_request_first(req, uptodate,
19731- req->hard_nr_sectors);
19732+ ret = __blk_end_request(req, ret, blk_rq_bytes(req));
19733 BUG_ON(ret);
19734- end_that_request_last(req, uptodate);
19735 break;
19736 default:
19737 BUG();
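
blkif_int() above switches from end_that_request_first()/_last() to a single
__blk_end_request() call driven by an error code: 0 for BLKIF_RSP_OKAY,
-EOPNOTSUPP for a write barrier the backend rejected, -EIO for anything else.
The mapping on its own, as a user-space sketch (the enum values are local
stand-ins, not the real blkif ABI numbers):

    #include <errno.h>
    #include <stdio.h>

    enum { RSP_OKAY, RSP_ERROR, RSP_EOPNOTSUPP };   /* local stand-ins */
    enum { OP_READ, OP_WRITE, OP_WRITE_BARRIER };

    static int status_to_errno(int op, int status)
    {
        int ret = (status == RSP_OKAY) ? 0 : -EIO;

        if (op == OP_WRITE_BARRIER && status == RSP_EOPNOTSUPP)
            ret = -EOPNOTSUPP;  /* caller also disables barriers */
        return ret;
    }

    int main(void)
    {
        printf("%d %d %d\n",
               status_to_errno(OP_READ, RSP_OKAY),
               status_to_errno(OP_WRITE_BARRIER, RSP_EOPNOTSUPP),
               status_to_errno(OP_READ, RSP_ERROR));
        return 0;
    }
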
19738--- sle11-2009-06-29.orig/drivers/xen/blktap/blktap.c 2009-04-20 11:37:50.000000000 +0200
19739+++ sle11-2009-06-29/drivers/xen/blktap/blktap.c 2009-04-20 11:38:54.000000000 +0200
19740@@ -331,8 +331,8 @@ static pte_t blktap_clear_pte(struct vm_
19741 * if vm_file is NULL (meaning mmap failed and we have nothing to do)
19742 */
19743 if (uvaddr < uvstart || vma->vm_file == NULL)
19744- return ptep_get_and_clear_full(vma->vm_mm, uvaddr,
19745- ptep, is_fullmm);
19746+ return xen_ptep_get_and_clear_full(vma, uvaddr, ptep,
19747+ is_fullmm);
19748
19749 info = vma->vm_file->private_data;
19750 priv = vma->vm_private_data;
19751@@ -379,8 +379,8 @@ static pte_t blktap_clear_pte(struct vm_
19752 BUG_ON(!xen_feature(XENFEAT_auto_translated_physmap));
19753
19754 /* USING SHADOW PAGE TABLES. */
19755- copy = ptep_get_and_clear_full(vma->vm_mm, uvaddr, ptep,
19756- is_fullmm);
19757+ copy = xen_ptep_get_and_clear_full(vma, uvaddr, ptep,
19758+ is_fullmm);
19759 }
19760
19761 if (count) {
19762--- sle11-2009-06-29.orig/drivers/xen/core/Makefile 2009-06-29 15:14:52.000000000 +0200
19763+++ sle11-2009-06-29/drivers/xen/core/Makefile 2009-03-16 16:33:40.000000000 +0100
19764@@ -10,5 +10,6 @@ obj-$(CONFIG_SYS_HYPERVISOR) += hypervis
19765 obj-$(CONFIG_HOTPLUG_CPU) += cpu_hotplug.o
19766 obj-$(CONFIG_XEN_SYSFS) += xen_sysfs.o
19767 obj-$(CONFIG_XEN_SMPBOOT) += smpboot.o
19768+obj-$(CONFIG_X86_SMP) += spinlock.o
19769 obj-$(CONFIG_KEXEC) += machine_kexec.o
19770 obj-$(CONFIG_XEN_XENCOMM) += xencomm.o
19771--- sle11-2009-06-29.orig/drivers/xen/core/evtchn.c 2009-03-04 11:25:55.000000000 +0100
19772+++ sle11-2009-06-29/drivers/xen/core/evtchn.c 2009-03-16 16:33:40.000000000 +0100
19773@@ -194,7 +194,7 @@ static inline unsigned int cpu_from_evtc
19774
19775 /* Upcall to generic IRQ layer. */
19776 #ifdef CONFIG_X86
19777-extern fastcall unsigned int do_IRQ(struct pt_regs *regs);
19778+extern unsigned int do_IRQ(struct pt_regs *regs);
19779 void __init xen_init_IRQ(void);
19780 void __init init_IRQ(void)
19781 {
19782@@ -203,13 +203,11 @@ void __init init_IRQ(void)
19783 }
19784 #if defined (__i386__)
19785 static inline void exit_idle(void) {}
19786-#define IRQ_REG orig_eax
19787 #elif defined (__x86_64__)
19788 #include <asm/idle.h>
19789-#define IRQ_REG orig_rax
19790 #endif
19791 #define do_IRQ(irq, regs) do { \
19792- (regs)->IRQ_REG = ~(irq); \
19793+ (regs)->orig_ax = ~(irq); \
19794 do_IRQ((regs)); \
19795 } while (0)
19796 #endif
19797@@ -670,13 +668,12 @@ static void set_affinity_irq(unsigned in
19798 int resend_irq_on_evtchn(unsigned int irq)
19799 {
19800 int masked, evtchn = evtchn_from_irq(irq);
19801- shared_info_t *s = HYPERVISOR_shared_info;
19802
19803 if (!VALID_EVTCHN(evtchn))
19804 return 1;
19805
19806 masked = test_and_set_evtchn_mask(evtchn);
19807- synch_set_bit(evtchn, s->evtchn_pending);
19808+ set_evtchn(evtchn);
19809 if (!masked)
19810 unmask_evtchn(evtchn);
19811
19812@@ -969,6 +966,43 @@ void disable_all_local_evtchn(void)
19813 synch_set_bit(i, &s->evtchn_mask[0]);
19814 }
19815
19816+/* Clear an irq's pending state, in preparation for polling on it. */
19817+void xen_clear_irq_pending(int irq)
19818+{
19819+ int evtchn = evtchn_from_irq(irq);
19820+
19821+ if (VALID_EVTCHN(evtchn))
19822+ clear_evtchn(evtchn);
19823+}
19824+
19825+/* Set an irq's pending state, to avoid blocking on it. */
19826+void xen_set_irq_pending(int irq)
19827+{
19828+ int evtchn = evtchn_from_irq(irq);
19829+
19830+ if (VALID_EVTCHN(evtchn))
19831+ set_evtchn(evtchn);
19832+}
19833+
19834+/* Test an irq's pending state. */
19835+int xen_test_irq_pending(int irq)
19836+{
19837+ int evtchn = evtchn_from_irq(irq);
19838+
19839+ return VALID_EVTCHN(evtchn) && test_evtchn(evtchn);
19840+}
19841+
19842+/* Poll waiting for an irq to become pending. In the usual case, the
19843+ irq will be disabled so it won't deliver an interrupt. */
19844+void xen_poll_irq(int irq)
19845+{
19846+ evtchn_port_t evtchn = evtchn_from_irq(irq);
19847+
19848+ if (VALID_EVTCHN(evtchn)
19849+ && HYPERVISOR_poll_no_timeout(&evtchn, 1))
19850+ BUG();
19851+}
19852+
19853 static void restore_cpu_virqs(unsigned int cpu)
19854 {
19855 struct evtchn_bind_virq bind_virq;
19856@@ -1022,8 +1056,8 @@ static void restore_cpu_ipis(unsigned in
19857 bind_evtchn_to_cpu(evtchn, cpu);
19858
19859 /* Ready for use. */
19860- unmask_evtchn(evtchn);
19861-
19862+ if (!(irq_desc[irq].status & IRQ_DISABLED))
19863+ unmask_evtchn(evtchn);
19864 }
19865 }
19866
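
Besides the orig_ax cleanup, the evtchn.c hunk adds four small helpers around
an IRQ's underlying event-channel pending bit: xen_clear_irq_pending(),
xen_set_irq_pending(), xen_test_irq_pending() and the blocking xen_poll_irq().
A toy single-process model of the bit manipulation involved (the real code
uses atomic synch_* bitops on the shared-info page and
HYPERVISOR_poll_no_timeout for the blocking wait):

    #include <stdio.h>

    #define BITS_PER_LONG (8 * sizeof(unsigned long))
    #define NR_PORTS      1024UL

    /* toy stand-in for shared_info->evtchn_pending[] */
    static unsigned long pending[NR_PORTS / BITS_PER_LONG];

    static void set_evtchn(unsigned int port)
    {
        pending[port / BITS_PER_LONG] |= 1UL << (port % BITS_PER_LONG);
    }

    static void clear_evtchn(unsigned int port)
    {
        pending[port / BITS_PER_LONG] &= ~(1UL << (port % BITS_PER_LONG));
    }

    static int test_evtchn(unsigned int port)
    {
        return !!(pending[port / BITS_PER_LONG] & (1UL << (port % BITS_PER_LONG)));
    }

    int main(void)
    {
        set_evtchn(3);                              /* xen_set_irq_pending()   */
        printf("pending: %d\n", test_evtchn(3));    /* xen_test_irq_pending()  */
        clear_evtchn(3);                            /* xen_clear_irq_pending() */
        printf("pending: %d\n", test_evtchn(3));
        return 0;
    }
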
19867--- sle11-2009-06-29.orig/drivers/xen/core/hypervisor_sysfs.c 2008-12-15 11:27:22.000000000 +0100
19868+++ sle11-2009-06-29/drivers/xen/core/hypervisor_sysfs.c 2009-03-16 16:33:40.000000000 +0100
19869@@ -50,7 +50,7 @@ static int __init hypervisor_subsys_init
19870 if (!is_running_on_xen())
19871 return -ENODEV;
19872
19873- hypervisor_subsys.kobj.ktype = &hyp_sysfs_kobj_type;
19874+ hypervisor_kobj->ktype = &hyp_sysfs_kobj_type;
19875 return 0;
19876 }
19877
19878--- sle11-2009-06-29.orig/drivers/xen/core/smpboot.c 2009-02-16 16:18:36.000000000 +0100
19879+++ sle11-2009-06-29/drivers/xen/core/smpboot.c 2009-03-16 16:33:40.000000000 +0100
19880@@ -135,6 +135,10 @@ static int __cpuinit xen_smp_intr_init(u
19881 goto fail;
19882 per_cpu(callfunc_irq, cpu) = rc;
19883
19884+ rc = xen_spinlock_init(cpu);
19885+ if (rc < 0)
19886+ goto fail;
19887+
19888 if ((cpu != 0) && ((rc = local_setup_timer(cpu)) != 0))
19889 goto fail;
19890
19891@@ -145,6 +149,7 @@ static int __cpuinit xen_smp_intr_init(u
19892 unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL);
19893 if (per_cpu(callfunc_irq, cpu) >= 0)
19894 unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL);
19895+ xen_spinlock_cleanup(cpu);
19896 return rc;
19897 }
19898
19899@@ -156,6 +161,7 @@ static void xen_smp_intr_exit(unsigned i
19900
19901 unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL);
19902 unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL);
19903+ xen_spinlock_cleanup(cpu);
19904 }
19905 #endif
19906
19907@@ -208,36 +214,25 @@ static void __cpuinit cpu_initialize_con
19908 smp_trap_init(ctxt.trap_ctxt);
19909
19910 ctxt.ldt_ents = 0;
19911- ctxt.gdt_ents = GDT_SIZE / 8;
19912-
19913-#ifdef __i386__
19914 ctxt.gdt_frames[0] = virt_to_mfn(get_cpu_gdt_table(cpu));
19915+ ctxt.gdt_ents = GDT_SIZE / 8;
19916
19917 ctxt.user_regs.cs = __KERNEL_CS;
19918- ctxt.user_regs.esp = idle->thread.esp0 - sizeof(struct pt_regs);
19919+ ctxt.user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs);
19920
19921 ctxt.kernel_ss = __KERNEL_DS;
19922- ctxt.kernel_sp = idle->thread.esp0;
19923+ ctxt.kernel_sp = idle->thread.sp0;
19924
19925- ctxt.event_callback_cs = __KERNEL_CS;
19926 ctxt.event_callback_eip = (unsigned long)hypervisor_callback;
19927- ctxt.failsafe_callback_cs = __KERNEL_CS;
19928 ctxt.failsafe_callback_eip = (unsigned long)failsafe_callback;
19929+#ifdef __i386__
19930+ ctxt.event_callback_cs = __KERNEL_CS;
19931+ ctxt.failsafe_callback_cs = __KERNEL_CS;
19932
19933 ctxt.ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(swapper_pg_dir));
19934
19935 ctxt.user_regs.fs = __KERNEL_PERCPU;
19936 #else /* __x86_64__ */
19937- ctxt.gdt_frames[0] = virt_to_mfn(cpu_gdt_descr[cpu].address);
19938-
19939- ctxt.user_regs.cs = __KERNEL_CS;
19940- ctxt.user_regs.esp = idle->thread.rsp0 - sizeof(struct pt_regs);
19941-
19942- ctxt.kernel_ss = __KERNEL_DS;
19943- ctxt.kernel_sp = idle->thread.rsp0;
19944-
19945- ctxt.event_callback_eip = (unsigned long)hypervisor_callback;
19946- ctxt.failsafe_callback_eip = (unsigned long)failsafe_callback;
19947 ctxt.syscall_callback_eip = (unsigned long)system_call;
19948
19949 ctxt.ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(init_level4_pgt));
19950--- /dev/null 1970-01-01 00:00:00.000000000 +0000
19951+++ sle11-2009-06-29/drivers/xen/core/spinlock.c 2009-03-16 16:33:40.000000000 +0100
19952@@ -0,0 +1,161 @@
19953+/*
19954+ * Xen spinlock functions
19955+ *
19956+ * See arch/x86/xen/smp.c for copyright and credits for derived
19957+ * portions of this file.
19958+ */
19959+
19960+#include <linux/init.h>
19961+#include <linux/irq.h>
19962+#include <linux/kernel.h>
19963+#include <linux/kernel_stat.h>
19964+#include <linux/module.h>
19965+#include <xen/evtchn.h>
19966+
19967+extern irqreturn_t smp_reschedule_interrupt(int, void *);
19968+
19969+static DEFINE_PER_CPU(int, spinlock_irq) = -1;
19970+static char spinlock_name[NR_CPUS][15];
19971+
19972+struct spinning {
19973+ raw_spinlock_t *lock;
19974+ unsigned int ticket;
19975+ struct spinning *prev;
19976+};
19977+static DEFINE_PER_CPU(struct spinning *, spinning);
19978+/*
19979+ * Protect removal of objects: Addition can be done lockless, and even
19980+ * removal itself doesn't need protection - what needs to be prevented is
19981+ * removed objects going out of scope (as they're allocated on the stack.
19982+ */
19983+static DEFINE_PER_CPU(raw_rwlock_t, spinning_rm_lock) = __RAW_RW_LOCK_UNLOCKED;
19984+
19985+int __cpuinit xen_spinlock_init(unsigned int cpu)
19986+{
19987+ int rc;
19988+
19989+ sprintf(spinlock_name[cpu], "spinlock%u", cpu);
19990+ rc = bind_ipi_to_irqhandler(SPIN_UNLOCK_VECTOR,
19991+ cpu,
19992+ smp_reschedule_interrupt,
19993+ IRQF_DISABLED|IRQF_NOBALANCING,
19994+ spinlock_name[cpu],
19995+ NULL);
19996+ if (rc < 0)
19997+ return rc;
19998+
19999+ disable_irq(rc); /* make sure it's never delivered */
20000+ per_cpu(spinlock_irq, cpu) = rc;
20001+
20002+ return 0;
20003+}
20004+
20005+void __cpuinit xen_spinlock_cleanup(unsigned int cpu)
20006+{
20007+ if (per_cpu(spinlock_irq, cpu) >= 0)
20008+ unbind_from_irqhandler(per_cpu(spinlock_irq, cpu), NULL);
20009+ per_cpu(spinlock_irq, cpu) = -1;
20010+}
20011+
20012+int xen_spin_wait(raw_spinlock_t *lock, unsigned int token)
20013+{
20014+ int rc = 0, irq = __get_cpu_var(spinlock_irq);
20015+ raw_rwlock_t *rm_lock;
20016+ unsigned long flags;
20017+ struct spinning spinning;
20018+
20019+ /* If kicker interrupt not initialized yet, just spin. */
20020+ if (unlikely(irq < 0) || unlikely(!cpu_online(smp_processor_id())))
20021+ return 0;
20022+
20023+ token >>= TICKET_SHIFT;
20024+
20025+ /* announce we're spinning */
20026+ spinning.ticket = token;
20027+ spinning.lock = lock;
20028+ spinning.prev = __get_cpu_var(spinning);
20029+ smp_wmb();
20030+ __get_cpu_var(spinning) = &spinning;
20031+
20032+ /* clear pending */
20033+ xen_clear_irq_pending(irq);
20034+
20035+ do {
20036+ /* Check again to make sure it didn't become free while
20037+ * we weren't looking. */
20038+ if ((lock->slock & ((1U << TICKET_SHIFT) - 1)) == token) {
20039+ /* If we interrupted another spinlock while it was
20040+ * blocking, make sure it doesn't block (again)
20041+ * without rechecking the lock. */
20042+ if (spinning.prev)
20043+ xen_set_irq_pending(irq);
20044+ rc = 1;
20045+ break;
20046+ }
20047+
20048+ /* block until irq becomes pending */
20049+ xen_poll_irq(irq);
20050+ } while (!xen_test_irq_pending(irq));
20051+
20052+ /* Leave the irq pending so that any interrupted blocker will
20053+ * re-check. */
20054+ kstat_this_cpu.irqs[irq] += !rc;
20055+
20056+ /* announce we're done */
20057+ __get_cpu_var(spinning) = spinning.prev;
20058+ rm_lock = &__get_cpu_var(spinning_rm_lock);
20059+ raw_local_irq_save(flags);
20060+ __raw_write_lock(rm_lock);
20061+ __raw_write_unlock(rm_lock);
20062+ raw_local_irq_restore(flags);
20063+
20064+ return rc;
20065+}
20066+EXPORT_SYMBOL(xen_spin_wait);
20067+
20068+unsigned int xen_spin_adjust(raw_spinlock_t *lock, unsigned int token)
20069+{
20070+ return token;//todo
20071+}
20072+EXPORT_SYMBOL(xen_spin_adjust);
20073+
20074+int xen_spin_wait_flags(raw_spinlock_t *lock, unsigned int *token,
20075+ unsigned int flags)
20076+{
20077+ return xen_spin_wait(lock, *token);//todo
20078+}
20079+EXPORT_SYMBOL(xen_spin_wait_flags);
20080+
20081+void xen_spin_kick(raw_spinlock_t *lock, unsigned int token)
20082+{
20083+ unsigned int cpu;
20084+
20085+ token &= (1U << TICKET_SHIFT) - 1;
20086+ for_each_online_cpu(cpu) {
20087+ raw_rwlock_t *rm_lock;
20088+ unsigned long flags;
20089+ struct spinning *spinning;
20090+
20091+ if (cpu == raw_smp_processor_id())
20092+ continue;
20093+
20094+ rm_lock = &per_cpu(spinning_rm_lock, cpu);
20095+ raw_local_irq_save(flags);
20096+ __raw_read_lock(rm_lock);
20097+
20098+ spinning = per_cpu(spinning, cpu);
20099+ smp_rmb();
20100+ if (spinning
20101+ && (spinning->lock != lock || spinning->ticket != token))
20102+ spinning = NULL;
20103+
20104+ __raw_read_unlock(rm_lock);
20105+ raw_local_irq_restore(flags);
20106+
20107+ if (unlikely(spinning)) {
20108+ notify_remote_via_irq(per_cpu(spinlock_irq, cpu));
20109+ return;
20110+ }
20111+ }
20112+}
20113+EXPORT_SYMBOL(xen_spin_kick);
20114--- sle11-2009-06-29.orig/drivers/xen/core/xen_sysfs.c 2008-12-15 11:27:22.000000000 +0100
20115+++ sle11-2009-06-29/drivers/xen/core/xen_sysfs.c 2009-03-16 16:33:40.000000000 +0100
20116@@ -29,12 +29,12 @@ HYPERVISOR_ATTR_RO(type);
20117
20118 static int __init xen_sysfs_type_init(void)
20119 {
20120- return sysfs_create_file(&hypervisor_subsys.kobj, &type_attr.attr);
20121+ return sysfs_create_file(hypervisor_kobj, &type_attr.attr);
20122 }
20123
20124 static void xen_sysfs_type_destroy(void)
20125 {
20126- sysfs_remove_file(&hypervisor_subsys.kobj, &type_attr.attr);
20127+ sysfs_remove_file(hypervisor_kobj, &type_attr.attr);
20128 }
20129
20130 /* xen version attributes */
20131@@ -90,13 +90,12 @@ static struct attribute_group version_gr
20132
20133 static int __init xen_sysfs_version_init(void)
20134 {
20135- return sysfs_create_group(&hypervisor_subsys.kobj,
20136- &version_group);
20137+ return sysfs_create_group(hypervisor_kobj, &version_group);
20138 }
20139
20140 static void xen_sysfs_version_destroy(void)
20141 {
20142- sysfs_remove_group(&hypervisor_subsys.kobj, &version_group);
20143+ sysfs_remove_group(hypervisor_kobj, &version_group);
20144 }
20145
20146 /* UUID */
20147@@ -126,12 +125,12 @@ HYPERVISOR_ATTR_RO(uuid);
20148
20149 static int __init xen_sysfs_uuid_init(void)
20150 {
20151- return sysfs_create_file(&hypervisor_subsys.kobj, &uuid_attr.attr);
20152+ return sysfs_create_file(hypervisor_kobj, &uuid_attr.attr);
20153 }
20154
20155 static void xen_sysfs_uuid_destroy(void)
20156 {
20157- sysfs_remove_file(&hypervisor_subsys.kobj, &uuid_attr.attr);
20158+ sysfs_remove_file(hypervisor_kobj, &uuid_attr.attr);
20159 }
20160
20161 /* xen compilation attributes */
20162@@ -204,14 +203,12 @@ static struct attribute_group xen_compil
20163
20164 int __init static xen_compilation_init(void)
20165 {
20166- return sysfs_create_group(&hypervisor_subsys.kobj,
20167- &xen_compilation_group);
20168+ return sysfs_create_group(hypervisor_kobj, &xen_compilation_group);
20169 }
20170
20171 static void xen_compilation_destroy(void)
20172 {
20173- sysfs_remove_group(&hypervisor_subsys.kobj,
20174- &xen_compilation_group);
20175+ sysfs_remove_group(hypervisor_kobj, &xen_compilation_group);
20176 }
20177
20178 /* xen properties info */
20179@@ -325,14 +322,12 @@ static struct attribute_group xen_proper
20180
20181 static int __init xen_properties_init(void)
20182 {
20183- return sysfs_create_group(&hypervisor_subsys.kobj,
20184- &xen_properties_group);
20185+ return sysfs_create_group(hypervisor_kobj, &xen_properties_group);
20186 }
20187
20188 static void xen_properties_destroy(void)
20189 {
20190- sysfs_remove_group(&hypervisor_subsys.kobj,
20191- &xen_properties_group);
20192+ sysfs_remove_group(hypervisor_kobj, &xen_properties_group);
20193 }
20194
20195 #ifdef CONFIG_KEXEC
20196@@ -350,13 +345,12 @@ HYPERVISOR_ATTR_RO(vmcoreinfo);
20197
20198 static int __init xen_sysfs_vmcoreinfo_init(void)
20199 {
20200- return sysfs_create_file(&hypervisor_subsys.kobj,
20201- &vmcoreinfo_attr.attr);
20202+ return sysfs_create_file(hypervisor_kobj, &vmcoreinfo_attr.attr);
20203 }
20204
20205 static void xen_sysfs_vmcoreinfo_destroy(void)
20206 {
20207- sysfs_remove_file(&hypervisor_subsys.kobj, &vmcoreinfo_attr.attr);
20208+ sysfs_remove_file(hypervisor_kobj, &vmcoreinfo_attr.attr);
20209 }
20210
20211 #endif
20212--- sle11-2009-06-29.orig/drivers/xen/gntdev/gntdev.c 2009-03-04 11:28:34.000000000 +0100
20213+++ sle11-2009-06-29/drivers/xen/gntdev/gntdev.c 2009-03-16 16:33:40.000000000 +0100
20214@@ -782,7 +782,7 @@ static pte_t gntdev_clear_pte(struct vm_
20215 op.status);
20216 } else {
20217 /* USING SHADOW PAGE TABLES. */
20218- copy = ptep_get_and_clear_full(vma->vm_mm, addr, ptep, is_fullmm);
20219+ copy = xen_ptep_get_and_clear_full(vma, addr, ptep, is_fullmm);
20220 }
20221
20222 /* Finally, we unmap the grant from kernel space. */
20223@@ -810,7 +810,7 @@ static pte_t gntdev_clear_pte(struct vm_
20224 >> PAGE_SHIFT, INVALID_P2M_ENTRY);
20225
20226 } else {
20227- copy = ptep_get_and_clear_full(vma->vm_mm, addr, ptep, is_fullmm);
20228+ copy = xen_ptep_get_and_clear_full(vma, addr, ptep, is_fullmm);
20229 }
20230
20231 return copy;
20232--- sle11-2009-06-29.orig/drivers/xen/scsifront/scsifront.c 2009-02-16 16:18:36.000000000 +0100
20233+++ sle11-2009-06-29/drivers/xen/scsifront/scsifront.c 2009-03-16 16:33:40.000000000 +0100
20234@@ -260,19 +260,19 @@ static int map_data_for_request(struct v
20235 return -ENOMEM;
20236 }
20237
20238- if (sc->use_sg) {
20239+ if (scsi_bufflen(sc)) {
20240 /* quoted scsi_lib.c/scsi_req_map_sg . */
20241- struct scatterlist *sg, *sgl = (struct scatterlist *)sc->request_buffer;
20242- unsigned int data_len = sc->request_bufflen;
20243+ struct scatterlist *sg, *sgl = scsi_sglist(sc);
20244+ unsigned int data_len = scsi_bufflen(sc);
20245
20246- nr_pages = (sc->request_bufflen + sgl->offset + PAGE_SIZE - 1) >> PAGE_SHIFT;
20247+ nr_pages = (data_len + sgl->offset + PAGE_SIZE - 1) >> PAGE_SHIFT;
20248 if (nr_pages > VSCSIIF_SG_TABLESIZE) {
20249 printk(KERN_ERR "scsifront: Unable to map request_buffer for command!\n");
20250 ref_cnt = (-E2BIG);
20251 goto big_to_sg;
20252 }
20253
20254- for_each_sg (sgl, sg, sc->use_sg, i) {
20255+ for_each_sg (sgl, sg, scsi_sg_count(sc), i) {
20256 page = sg_page(sg);
20257 off = sg->offset;
20258 len = sg->length;
20259@@ -306,45 +306,6 @@ static int map_data_for_request(struct v
20260 ref_cnt++;
20261 }
20262 }
20263- } else if (sc->request_bufflen) {
20264- unsigned long end = ((unsigned long)sc->request_buffer
20265- + sc->request_bufflen + PAGE_SIZE - 1) >> PAGE_SHIFT;
20266- unsigned long start = (unsigned long)sc->request_buffer >> PAGE_SHIFT;
20267-
20268- page = virt_to_page(sc->request_buffer);
20269- nr_pages = end - start;
20270- len = sc->request_bufflen;
20271-
20272- if (nr_pages > VSCSIIF_SG_TABLESIZE) {
20273- ref_cnt = (-E2BIG);
20274- goto big_to_sg;
20275- }
20276-
20277- buffer_pfn = page_to_phys(page) >> PAGE_SHIFT;
20278-
20279- off = offset_in_page((unsigned long)sc->request_buffer);
20280- for (i = 0; i < nr_pages; i++) {
20281- bytes = PAGE_SIZE - off;
20282-
20283- if (bytes > len)
20284- bytes = len;
20285-
20286- ref = gnttab_claim_grant_reference(&gref_head);
20287- BUG_ON(ref == -ENOSPC);
20288-
20289- gnttab_grant_foreign_access_ref(ref, info->dev->otherend_id,
20290- buffer_pfn, write);
20291-
20292- info->shadow[id].gref[i] = ref;
20293- ring_req->seg[i].gref = ref;
20294- ring_req->seg[i].offset = (uint16_t)off;
20295- ring_req->seg[i].length = (uint16_t)bytes;
20296-
20297- buffer_pfn++;
20298- len -= bytes;
20299- off = 0;
20300- ref_cnt++;
20301- }
20302 }
20303
20304 big_to_sg:
20305--- sle11-2009-06-29.orig/drivers/xen/xenoprof/xenoprofile.c 2009-03-11 15:39:38.000000000 +0100
20306+++ sle11-2009-06-29/drivers/xen/xenoprof/xenoprofile.c 2009-03-16 16:33:40.000000000 +0100
20307@@ -78,7 +78,7 @@ static int xenoprof_resume(struct sys_de
20308
20309
20310 static struct sysdev_class oprofile_sysclass = {
20311- set_kset_name("oprofile"),
20312+ .name = "oprofile",
20313 .resume = xenoprof_resume,
20314 .suspend = xenoprof_suspend
20315 };
20316--- sle11-2009-06-29.orig/include/asm-x86/e820.h 2009-06-29 15:14:52.000000000 +0200
20317+++ sle11-2009-06-29/include/asm-x86/e820.h 2009-03-16 16:33:40.000000000 +0100
20318@@ -127,7 +127,11 @@ extern char *memory_setup(void);
20319 #endif /* __KERNEL__ */
20320 #endif /* __ASSEMBLY__ */
20321
20322+#ifndef CONFIG_XEN
20323 #define ISA_START_ADDRESS 0xa0000
20324+#else
20325+#define ISA_START_ADDRESS 0
20326+#endif
20327 #define ISA_END_ADDRESS 0x100000
20328 #define is_ISA_range(s, e) ((s) >= ISA_START_ADDRESS && (e) < ISA_END_ADDRESS)
20329
20330--- sle11-2009-06-29.orig/include/asm-x86/mach-xen/asm/agp.h 2009-02-16 16:18:36.000000000 +0100
20331+++ sle11-2009-06-29/include/asm-x86/mach-xen/asm/agp.h 2009-03-16 16:33:40.000000000 +0100
20332@@ -13,18 +13,13 @@
20333 * page. This avoids data corruption on some CPUs.
20334 */
20335
20336-/*
20337- * Caller's responsibility to call global_flush_tlb() for performance
20338- * reasons
20339- */
20340 #define map_page_into_agp(page) ( \
20341 xen_create_contiguous_region((unsigned long)page_address(page), 0, 32) \
20342- ?: change_page_attr(page, 1, PAGE_KERNEL_NOCACHE))
20343+ ?: set_pages_uc(page, 1))
20344 #define unmap_page_from_agp(page) ( \
20345 xen_destroy_contiguous_region((unsigned long)page_address(page), 0), \
20346 /* only a fallback: xen_destroy_contiguous_region uses PAGE_KERNEL */ \
20347- change_page_attr(page, 1, PAGE_KERNEL))
20348-#define flush_agp_mappings() global_flush_tlb()
20349+ set_pages_wb(page, 1))
20350
20351 /*
20352 * Could use CLFLUSH here if the cpu supports it. But then it would
20353--- sle11-2009-06-29.orig/include/asm-x86/mach-xen/asm/desc.h 2009-02-16 16:18:36.000000000 +0100
20354+++ sle11-2009-06-29/include/asm-x86/mach-xen/asm/desc.h 2009-03-16 16:33:40.000000000 +0100
20355@@ -1,5 +1,404 @@
20356+#ifndef _ASM_DESC_H_
20357+#define _ASM_DESC_H_
20358+
20359+#ifndef __ASSEMBLY__
20360+#include <asm/desc_defs.h>
20361+#include <asm/ldt.h>
20362+#include <asm/mmu.h>
20363+#include <linux/smp.h>
20364+
20365+static inline void fill_ldt(struct desc_struct *desc,
20366+ const struct user_desc *info)
20367+{
20368+ desc->limit0 = info->limit & 0x0ffff;
20369+ desc->base0 = info->base_addr & 0x0000ffff;
20370+
20371+ desc->base1 = (info->base_addr & 0x00ff0000) >> 16;
20372+ desc->type = (info->read_exec_only ^ 1) << 1;
20373+ desc->type |= info->contents << 2;
20374+ desc->s = 1;
20375+ desc->dpl = 0x3;
20376+ desc->p = info->seg_not_present ^ 1;
20377+ desc->limit = (info->limit & 0xf0000) >> 16;
20378+ desc->avl = info->useable;
20379+ desc->d = info->seg_32bit;
20380+ desc->g = info->limit_in_pages;
20381+ desc->base2 = (info->base_addr & 0xff000000) >> 24;
20382+}
20383+
20384+#ifndef CONFIG_X86_NO_IDT
20385+extern struct desc_ptr idt_descr;
20386+extern gate_desc idt_table[];
20387+#endif
20388+
20389+#ifdef CONFIG_X86_64
20390+extern struct desc_struct cpu_gdt_table[GDT_ENTRIES];
20391+extern struct desc_ptr cpu_gdt_descr[];
20392+/* the cpu gdt accessor */
20393+#define get_cpu_gdt_table(x) ((struct desc_struct *)cpu_gdt_descr[x].address)
20394+
20395+static inline void pack_gate(gate_desc *gate, unsigned type, unsigned long func,
20396+ unsigned dpl, unsigned ist, unsigned seg)
20397+{
20398+ gate->offset_low = PTR_LOW(func);
20399+ gate->segment = __KERNEL_CS;
20400+ gate->ist = ist;
20401+ gate->p = 1;
20402+ gate->dpl = dpl;
20403+ gate->zero0 = 0;
20404+ gate->zero1 = 0;
20405+ gate->type = type;
20406+ gate->offset_middle = PTR_MIDDLE(func);
20407+ gate->offset_high = PTR_HIGH(func);
20408+}
20409+
20410+#else
20411+struct gdt_page {
20412+ struct desc_struct gdt[GDT_ENTRIES];
20413+} __attribute__((aligned(PAGE_SIZE)));
20414+DECLARE_PER_CPU(struct gdt_page, gdt_page);
20415+
20416+static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
20417+{
20418+ return per_cpu(gdt_page, cpu).gdt;
20419+}
20420+
20421+static inline void pack_gate(gate_desc *gate, unsigned char type,
20422+ unsigned long base, unsigned dpl, unsigned flags, unsigned short seg)
20423+
20424+{
20425+ gate->a = (seg << 16) | (base & 0xffff);
20426+ gate->b = (base & 0xffff0000) |
20427+ (((0x80 | type | (dpl << 5)) & 0xff) << 8);
20428+}
20429+
20430+#endif
20431+
20432+static inline int desc_empty(const void *ptr)
20433+{
20434+ const u32 *desc = ptr;
20435+ return !(desc[0] | desc[1]);
20436+}
20437+
20438+#ifndef CONFIG_XEN
20439+#define load_TR_desc() native_load_tr_desc()
20440+#define load_gdt(dtr) native_load_gdt(dtr)
20441+#define load_idt(dtr) native_load_idt(dtr)
20442+#define load_tr(tr) __asm__ __volatile("ltr %0"::"m" (tr))
20443+#define load_ldt(ldt) __asm__ __volatile("lldt %0"::"m" (ldt))
20444+
20445+#define store_gdt(dtr) native_store_gdt(dtr)
20446+#define store_idt(dtr) native_store_idt(dtr)
20447+#define store_tr(tr) (tr = native_store_tr())
20448+#define store_ldt(ldt) __asm__ ("sldt %0":"=m" (ldt))
20449+
20450+#define load_TLS(t, cpu) native_load_tls(t, cpu)
20451+#define set_ldt native_set_ldt
20452+
20453+#define write_ldt_entry(dt, entry, desc) \
20454+ native_write_ldt_entry(dt, entry, desc)
20455+#define write_gdt_entry(dt, entry, desc, type) \
20456+ native_write_gdt_entry(dt, entry, desc, type)
20457+#define write_idt_entry(dt, entry, g) native_write_idt_entry(dt, entry, g)
20458+
20459+static inline void native_write_idt_entry(gate_desc *idt, int entry,
20460+ const gate_desc *gate)
20461+{
20462+ memcpy(&idt[entry], gate, sizeof(*gate));
20463+}
20464+
20465+static inline void native_write_ldt_entry(struct desc_struct *ldt, int entry,
20466+ const void *desc)
20467+{
20468+ memcpy(&ldt[entry], desc, 8);
20469+}
20470+
20471+static inline void native_write_gdt_entry(struct desc_struct *gdt, int entry,
20472+ const void *desc, int type)
20473+{
20474+ unsigned int size;
20475+ switch (type) {
20476+ case DESC_TSS:
20477+ size = sizeof(tss_desc);
20478+ break;
20479+ case DESC_LDT:
20480+ size = sizeof(ldt_desc);
20481+ break;
20482+ default:
20483+ size = sizeof(struct desc_struct);
20484+ break;
20485+ }
20486+ memcpy(&gdt[entry], desc, size);
20487+}
20488+#endif
20489+
20490+static inline void pack_descriptor(struct desc_struct *desc, unsigned long base,
20491+ unsigned long limit, unsigned char type,
20492+ unsigned char flags)
20493+{
20494+ desc->a = ((base & 0xffff) << 16) | (limit & 0xffff);
20495+ desc->b = (base & 0xff000000) | ((base & 0xff0000) >> 16) |
20496+ (limit & 0x000f0000) | ((type & 0xff) << 8) |
20497+ ((flags & 0xf) << 20);
20498+ desc->p = 1;
20499+}
20500+
20501+
20502+#ifndef CONFIG_XEN
20503+static inline void set_tssldt_descriptor(void *d, unsigned long addr,
20504+ unsigned type, unsigned size)
20505+{
20506+#ifdef CONFIG_X86_64
20507+ struct ldttss_desc64 *desc = d;
20508+ memset(desc, 0, sizeof(*desc));
20509+ desc->limit0 = size & 0xFFFF;
20510+ desc->base0 = PTR_LOW(addr);
20511+ desc->base1 = PTR_MIDDLE(addr) & 0xFF;
20512+ desc->type = type;
20513+ desc->p = 1;
20514+ desc->limit1 = (size >> 16) & 0xF;
20515+ desc->base2 = (PTR_MIDDLE(addr) >> 8) & 0xFF;
20516+ desc->base3 = PTR_HIGH(addr);
20517+#else
20518+
20519+ pack_descriptor((struct desc_struct *)d, addr, size, 0x80 | type, 0);
20520+#endif
20521+}
20522+
20523+static inline void __set_tss_desc(unsigned cpu, unsigned int entry, void *addr)
20524+{
20525+ struct desc_struct *d = get_cpu_gdt_table(cpu);
20526+ tss_desc tss;
20527+
20528+ /*
20529+ * sizeof(unsigned long) coming from an extra "long" at the end
20530+ * of the iobitmap. See tss_struct definition in processor.h
20531+ *
20532+ * -1? seg base+limit should be pointing to the address of the
20533+ * last valid byte
20534+ */
20535+ set_tssldt_descriptor(&tss, (unsigned long)addr, DESC_TSS,
20536+ IO_BITMAP_OFFSET + IO_BITMAP_BYTES + sizeof(unsigned long) - 1);
20537+ write_gdt_entry(d, entry, &tss, DESC_TSS);
20538+}
20539+
20540+#define set_tss_desc(cpu, addr) __set_tss_desc(cpu, GDT_ENTRY_TSS, addr)
20541+
20542+static inline void native_set_ldt(const void *addr, unsigned int entries)
20543+{
20544+ if (likely(entries == 0))
20545+ __asm__ __volatile__("lldt %w0"::"q" (0));
20546+ else {
20547+ unsigned cpu = smp_processor_id();
20548+ ldt_desc ldt;
20549+
20550+ set_tssldt_descriptor(&ldt, (unsigned long)addr,
20551+ DESC_LDT, entries * sizeof(ldt) - 1);
20552+ write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT,
20553+ &ldt, DESC_LDT);
20554+ __asm__ __volatile__("lldt %w0"::"q" (GDT_ENTRY_LDT*8));
20555+ }
20556+}
20557+
20558+static inline void native_load_tr_desc(void)
20559+{
20560+ asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8));
20561+}
20562+
20563+static inline void native_load_gdt(const struct desc_ptr *dtr)
20564+{
20565+ asm volatile("lgdt %0"::"m" (*dtr));
20566+}
20567+
20568+static inline void native_load_idt(const struct desc_ptr *dtr)
20569+{
20570+ asm volatile("lidt %0"::"m" (*dtr));
20571+}
20572+
20573+static inline void native_store_gdt(struct desc_ptr *dtr)
20574+{
20575+ asm volatile("sgdt %0":"=m" (*dtr));
20576+}
20577+
20578+static inline void native_store_idt(struct desc_ptr *dtr)
20579+{
20580+ asm volatile("sidt %0":"=m" (*dtr));
20581+}
20582+
20583+static inline unsigned long native_store_tr(void)
20584+{
20585+ unsigned long tr;
20586+ asm volatile("str %0":"=r" (tr));
20587+ return tr;
20588+}
20589+
20590+static inline void native_load_tls(struct thread_struct *t, unsigned int cpu)
20591+{
20592+ unsigned int i;
20593+ struct desc_struct *gdt = get_cpu_gdt_table(cpu);
20594+
20595+ for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++)
20596+ gdt[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i];
20597+}
20598+#else
20599+#define load_TLS(t, cpu) xen_load_tls(t, cpu)
20600+#define set_ldt xen_set_ldt
20601+
20602+extern int write_ldt_entry(struct desc_struct *ldt, int entry,
20603+ const void *desc);
20604+extern int write_gdt_entry(struct desc_struct *gdt, int entry,
20605+ const void *desc, int type);
20606+
20607+static inline void xen_load_tls(struct thread_struct *t, unsigned int cpu)
20608+{
20609+ unsigned int i;
20610+ struct desc_struct *gdt = get_cpu_gdt_table(cpu) + GDT_ENTRY_TLS_MIN;
20611+
20612+ for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++)
20613+ if (HYPERVISOR_update_descriptor(virt_to_machine(&gdt[i]),
20614+ *(u64 *)&t->tls_array[i]))
20615+ BUG();
20616+}
20617+#endif
20618+
20619+#define _LDT_empty(info) (\
20620+ (info)->base_addr == 0 && \
20621+ (info)->limit == 0 && \
20622+ (info)->contents == 0 && \
20623+ (info)->read_exec_only == 1 && \
20624+ (info)->seg_32bit == 0 && \
20625+ (info)->limit_in_pages == 0 && \
20626+ (info)->seg_not_present == 1 && \
20627+ (info)->useable == 0)
20628+
20629+#ifdef CONFIG_X86_64
20630+#define LDT_empty(info) (_LDT_empty(info) && ((info)->lm == 0))
20631+#else
20632+#define LDT_empty(info) (_LDT_empty(info))
20633+#endif
20634+
20635+static inline void clear_LDT(void)
20636+{
20637+ set_ldt(NULL, 0);
20638+}
20639+
20640+/*
20641+ * load one particular LDT into the current CPU
20642+ */
20643+static inline void load_LDT_nolock(mm_context_t *pc)
20644+{
20645+ set_ldt(pc->ldt, pc->size);
20646+}
20647+
20648+static inline void load_LDT(mm_context_t *pc)
20649+{
20650+ preempt_disable();
20651+ load_LDT_nolock(pc);
20652+ preempt_enable();
20653+}
20654+
20655+static inline unsigned long get_desc_base(const struct desc_struct *desc)
20656+{
20657+ return desc->base0 | ((desc->base1) << 16) | ((desc->base2) << 24);
20658+}
20659+
20660+static inline unsigned long get_desc_limit(const struct desc_struct *desc)
20661+{
20662+ return desc->limit0 | (desc->limit << 16);
20663+}
20664+
20665+#ifndef CONFIG_X86_NO_IDT
20666+static inline void _set_gate(int gate, unsigned type, void *addr,
20667+ unsigned dpl, unsigned ist, unsigned seg)
20668+{
20669+ gate_desc s;
20670+ pack_gate(&s, type, (unsigned long)addr, dpl, ist, seg);
20671+ /*
20672+ * does not need to be atomic because it is only done once at
20673+ * setup time
20674+ */
20675+ write_idt_entry(idt_table, gate, &s);
20676+}
20677+
20678+/*
20679+ * This needs to use 'idt_table' rather than 'idt', and
20680+ * thus use the _nonmapped_ version of the IDT, as the
20681+ * Pentium F0 0F bugfix can have resulted in the mapped
20682+ * IDT being write-protected.
20683+ */
20684+static inline void set_intr_gate(unsigned int n, void *addr)
20685+{
20686+ BUG_ON((unsigned)n > 0xFF);
20687+ _set_gate(n, GATE_INTERRUPT, addr, 0, 0, __KERNEL_CS);
20688+}
20689+
20690+/*
20691+ * This routine sets up an interrupt gate at directory privilege level 3.
20692+ */
20693+static inline void set_system_intr_gate(unsigned int n, void *addr)
20694+{
20695+ BUG_ON((unsigned)n > 0xFF);
20696+ _set_gate(n, GATE_INTERRUPT, addr, 0x3, 0, __KERNEL_CS);
20697+}
20698+
20699+static inline void set_trap_gate(unsigned int n, void *addr)
20700+{
20701+ BUG_ON((unsigned)n > 0xFF);
20702+ _set_gate(n, GATE_TRAP, addr, 0, 0, __KERNEL_CS);
20703+}
20704+
20705+static inline void set_system_gate(unsigned int n, void *addr)
20706+{
20707+ BUG_ON((unsigned)n > 0xFF);
20708 #ifdef CONFIG_X86_32
20709-# include "desc_32.h"
20710+ _set_gate(n, GATE_TRAP, addr, 0x3, 0, __KERNEL_CS);
20711+#else
20712+ _set_gate(n, GATE_INTERRUPT, addr, 0x3, 0, __KERNEL_CS);
20713+#endif
20714+}
20715+
20716+static inline void set_task_gate(unsigned int n, unsigned int gdt_entry)
20717+{
20718+ BUG_ON((unsigned)n > 0xFF);
20719+ _set_gate(n, GATE_TASK, (void *)0, 0, 0, (gdt_entry<<3));
20720+}
20721+
20722+static inline void set_intr_gate_ist(int n, void *addr, unsigned ist)
20723+{
20724+ BUG_ON((unsigned)n > 0xFF);
20725+ _set_gate(n, GATE_INTERRUPT, addr, 0, ist, __KERNEL_CS);
20726+}
20727+
20728+static inline void set_system_gate_ist(int n, void *addr, unsigned ist)
20729+{
20730+ BUG_ON((unsigned)n > 0xFF);
20731+ _set_gate(n, GATE_INTERRUPT, addr, 0x3, ist, __KERNEL_CS);
20732+}
20733+#endif
20734+
20735 #else
20736-# include "desc_64.h"
20737+/*
20738+ * GET_DESC_BASE reads the descriptor base of the specified segment.
20739+ *
20740+ * Args:
20741+ * idx - descriptor index
20742+ * gdt - GDT pointer
20743+ * base - 32bit register to which the base will be written
20744+ * lo_w - lo word of the "base" register
20745+ * lo_b - lo byte of the "base" register
20746+ * hi_b - hi byte of the low word of the "base" register
20747+ *
20748+ * Example:
20749+ * GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah)
20750+ * Will read the base address of GDT_ENTRY_ESPFIX_SS and put it into %eax.
20751+ */
20752+#define GET_DESC_BASE(idx, gdt, base, lo_w, lo_b, hi_b) \
20753+ movb idx*8+4(gdt), lo_b; \
20754+ movb idx*8+7(gdt), hi_b; \
20755+ shll $16, base; \
20756+ movw idx*8+2(gdt), lo_w;
20757+
20758+
20759+#endif /* __ASSEMBLY__ */
20760+
20761 #endif
20762--- sle11-2009-06-29.orig/include/asm-x86/mach-xen/asm/desc_32.h 2008-12-15 11:27:22.000000000 +0100
20763+++ /dev/null 1970-01-01 00:00:00.000000000 +0000
20764@@ -1,262 +0,0 @@
20765-#ifndef __ARCH_DESC_H
20766-#define __ARCH_DESC_H
20767-
20768-#include <asm/ldt.h>
20769-#include <asm/segment.h>
20770-
20771-#ifndef __ASSEMBLY__
20772-
20773-#include <linux/preempt.h>
20774-#include <linux/smp.h>
20775-
20776-#include <asm/mmu.h>
20777-
20778-struct Xgt_desc_struct {
20779- unsigned short size;
20780- unsigned long address __attribute__((packed));
20781- unsigned short pad;
20782-} __attribute__ ((packed));
20783-
20784-struct gdt_page
20785-{
20786- struct desc_struct gdt[GDT_ENTRIES];
20787-} __attribute__((aligned(PAGE_SIZE)));
20788-DECLARE_PER_CPU(struct gdt_page, gdt_page);
20789-
20790-static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
20791-{
20792- return per_cpu(gdt_page, cpu).gdt;
20793-}
20794-
20795-extern struct Xgt_desc_struct idt_descr;
20796-extern struct desc_struct idt_table[];
20797-extern void set_intr_gate(unsigned int irq, void * addr);
20798-
20799-static inline void pack_descriptor(__u32 *a, __u32 *b,
20800- unsigned long base, unsigned long limit, unsigned char type, unsigned char flags)
20801-{
20802- *a = ((base & 0xffff) << 16) | (limit & 0xffff);
20803- *b = (base & 0xff000000) | ((base & 0xff0000) >> 16) |
20804- (limit & 0x000f0000) | ((type & 0xff) << 8) | ((flags & 0xf) << 20);
20805-}
20806-
20807-static inline void pack_gate(__u32 *a, __u32 *b,
20808- unsigned long base, unsigned short seg, unsigned char type, unsigned char flags)
20809-{
20810- *a = (seg << 16) | (base & 0xffff);
20811- *b = (base & 0xffff0000) | ((type & 0xff) << 8) | (flags & 0xff);
20812-}
20813-
20814-#define DESCTYPE_LDT 0x82 /* present, system, DPL-0, LDT */
20815-#define DESCTYPE_TSS 0x89 /* present, system, DPL-0, 32-bit TSS */
20816-#define DESCTYPE_TASK 0x85 /* present, system, DPL-0, task gate */
20817-#define DESCTYPE_INT 0x8e /* present, system, DPL-0, interrupt gate */
20818-#define DESCTYPE_TRAP 0x8f /* present, system, DPL-0, trap gate */
20819-#define DESCTYPE_DPL3 0x60 /* DPL-3 */
20820-#define DESCTYPE_S 0x10 /* !system */
20821-
20822-#ifndef CONFIG_XEN
20823-#define load_TR_desc() native_load_tr_desc()
20824-#define load_gdt(dtr) native_load_gdt(dtr)
20825-#define load_idt(dtr) native_load_idt(dtr)
20826-#define load_tr(tr) __asm__ __volatile("ltr %0"::"m" (tr))
20827-#define load_ldt(ldt) __asm__ __volatile("lldt %0"::"m" (ldt))
20828-
20829-#define store_gdt(dtr) native_store_gdt(dtr)
20830-#define store_idt(dtr) native_store_idt(dtr)
20831-#define store_tr(tr) (tr = native_store_tr())
20832-#define store_ldt(ldt) __asm__ ("sldt %0":"=m" (ldt))
20833-
20834-#define load_TLS(t, cpu) native_load_tls(t, cpu)
20835-#define set_ldt native_set_ldt
20836-
20837-#define write_ldt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b)
20838-#define write_gdt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b)
20839-#define write_idt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b)
20840-
20841-static inline void write_dt_entry(struct desc_struct *dt,
20842- int entry, u32 entry_low, u32 entry_high)
20843-{
20844- dt[entry].a = entry_low;
20845- dt[entry].b = entry_high;
20846-}
20847-
20848-static inline void native_set_ldt(const void *addr, unsigned int entries)
20849-{
20850- if (likely(entries == 0))
20851- __asm__ __volatile__("lldt %w0"::"q" (0));
20852- else {
20853- unsigned cpu = smp_processor_id();
20854- __u32 a, b;
20855-
20856- pack_descriptor(&a, &b, (unsigned long)addr,
20857- entries * sizeof(struct desc_struct) - 1,
20858- DESCTYPE_LDT, 0);
20859- write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT, a, b);
20860- __asm__ __volatile__("lldt %w0"::"q" (GDT_ENTRY_LDT*8));
20861- }
20862-}
20863-
20864-
20865-static inline void native_load_tr_desc(void)
20866-{
20867- asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8));
20868-}
20869-
20870-static inline void native_load_gdt(const struct Xgt_desc_struct *dtr)
20871-{
20872- asm volatile("lgdt %0"::"m" (*dtr));
20873-}
20874-
20875-static inline void native_load_idt(const struct Xgt_desc_struct *dtr)
20876-{
20877- asm volatile("lidt %0"::"m" (*dtr));
20878-}
20879-
20880-static inline void native_store_gdt(struct Xgt_desc_struct *dtr)
20881-{
20882- asm ("sgdt %0":"=m" (*dtr));
20883-}
20884-
20885-static inline void native_store_idt(struct Xgt_desc_struct *dtr)
20886-{
20887- asm ("sidt %0":"=m" (*dtr));
20888-}
20889-
20890-static inline unsigned long native_store_tr(void)
20891-{
20892- unsigned long tr;
20893- asm ("str %0":"=r" (tr));
20894- return tr;
20895-}
20896-
20897-static inline void native_load_tls(struct thread_struct *t, unsigned int cpu)
20898-{
20899- unsigned int i;
20900- struct desc_struct *gdt = get_cpu_gdt_table(cpu);
20901-
20902- for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++)
20903- gdt[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i];
20904-}
20905-#else
20906-#define load_TLS(t, cpu) xen_load_tls(t, cpu)
20907-#define set_ldt xen_set_ldt
20908-
20909-extern int write_ldt_entry(void *ldt, int entry, __u32 entry_a, __u32 entry_b);
20910-extern int write_gdt_entry(void *gdt, int entry, __u32 entry_a, __u32 entry_b);
20911-
20912-static inline void xen_load_tls(struct thread_struct *t, unsigned int cpu)
20913-{
20914- unsigned int i;
20915- struct desc_struct *gdt = get_cpu_gdt_table(cpu) + GDT_ENTRY_TLS_MIN;
20916-
20917- for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++)
20918- if (HYPERVISOR_update_descriptor(virt_to_machine(&gdt[i]),
20919- *(u64 *)&t->tls_array[i]))
20920- BUG();
20921-}
20922-#endif
20923-
20924-#ifndef CONFIG_X86_NO_IDT
20925-static inline void _set_gate(int gate, unsigned int type, void *addr, unsigned short seg)
20926-{
20927- __u32 a, b;
20928- pack_gate(&a, &b, (unsigned long)addr, seg, type, 0);
20929- write_idt_entry(idt_table, gate, a, b);
20930-}
20931-#endif
20932-
20933-#ifndef CONFIG_X86_NO_TSS
20934-static inline void __set_tss_desc(unsigned int cpu, unsigned int entry, const void *addr)
20935-{
20936- __u32 a, b;
20937- pack_descriptor(&a, &b, (unsigned long)addr,
20938- offsetof(struct tss_struct, __cacheline_filler) - 1,
20939- DESCTYPE_TSS, 0);
20940- write_gdt_entry(get_cpu_gdt_table(cpu), entry, a, b);
20941-}
20942-#endif
20943-
20944-
20945-#define set_tss_desc(cpu,addr) __set_tss_desc(cpu, GDT_ENTRY_TSS, addr)
20946-
20947-#define LDT_entry_a(info) \
20948- ((((info)->base_addr & 0x0000ffff) << 16) | ((info)->limit & 0x0ffff))
20949-
20950-#define LDT_entry_b(info) \
20951- (((info)->base_addr & 0xff000000) | \
20952- (((info)->base_addr & 0x00ff0000) >> 16) | \
20953- ((info)->limit & 0xf0000) | \
20954- (((info)->read_exec_only ^ 1) << 9) | \
20955- ((info)->contents << 10) | \
20956- (((info)->seg_not_present ^ 1) << 15) | \
20957- ((info)->seg_32bit << 22) | \
20958- ((info)->limit_in_pages << 23) | \
20959- ((info)->useable << 20) | \
20960- 0x7000)
20961-
20962-#define LDT_empty(info) (\
20963- (info)->base_addr == 0 && \
20964- (info)->limit == 0 && \
20965- (info)->contents == 0 && \
20966- (info)->read_exec_only == 1 && \
20967- (info)->seg_32bit == 0 && \
20968- (info)->limit_in_pages == 0 && \
20969- (info)->seg_not_present == 1 && \
20970- (info)->useable == 0 )
20971-
20972-static inline void clear_LDT(void)
20973-{
20974- set_ldt(NULL, 0);
20975-}
20976-
20977-/*
20978- * load one particular LDT into the current CPU
20979- */
20980-static inline void load_LDT_nolock(mm_context_t *pc)
20981-{
20982- set_ldt(pc->ldt, pc->size);
20983-}
20984-
20985-static inline void load_LDT(mm_context_t *pc)
20986-{
20987- preempt_disable();
20988- load_LDT_nolock(pc);
20989- preempt_enable();
20990-}
20991-
20992-static inline unsigned long get_desc_base(unsigned long *desc)
20993-{
20994- unsigned long base;
20995- base = ((desc[0] >> 16) & 0x0000ffff) |
20996- ((desc[1] << 16) & 0x00ff0000) |
20997- (desc[1] & 0xff000000);
20998- return base;
20999-}
21000-
21001-#else /* __ASSEMBLY__ */
21002-
21003-/*
21004- * GET_DESC_BASE reads the descriptor base of the specified segment.
21005- *
21006- * Args:
21007- * idx - descriptor index
21008- * gdt - GDT pointer
21009- * base - 32bit register to which the base will be written
21010- * lo_w - lo word of the "base" register
21011- * lo_b - lo byte of the "base" register
21012- * hi_b - hi byte of the low word of the "base" register
21013- *
21014- * Example:
21015- * GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah)
21016- * Will read the base address of GDT_ENTRY_ESPFIX_SS and put it into %eax.
21017- */
21018-#define GET_DESC_BASE(idx, gdt, base, lo_w, lo_b, hi_b) \
21019- movb idx*8+4(gdt), lo_b; \
21020- movb idx*8+7(gdt), hi_b; \
21021- shll $16, base; \
21022- movw idx*8+2(gdt), lo_w;
21023-
21024-#endif /* !__ASSEMBLY__ */
21025-
21026-#endif
21027--- sle11-2009-06-29.orig/include/asm-x86/mach-xen/asm/desc_64.h 2009-02-16 16:18:36.000000000 +0100
21028+++ /dev/null 1970-01-01 00:00:00.000000000 +0000
21029@@ -1,228 +0,0 @@
21030-/* Written 2000 by Andi Kleen */
21031-#ifndef __ARCH_DESC_H
21032-#define __ARCH_DESC_H
21033-
21034-#include <linux/threads.h>
21035-#include <asm/ldt.h>
21036-
21037-#ifndef __ASSEMBLY__
21038-
21039-#include <linux/string.h>
21040-#include <linux/smp.h>
21041-#include <asm/desc_defs.h>
21042-
21043-#include <asm/segment.h>
21044-#include <asm/mmu.h>
21045-
21046-extern struct desc_ptr idt_descr, cpu_gdt_descr[NR_CPUS];
21047-
21048-extern struct desc_struct cpu_gdt_table[GDT_ENTRIES];
21049-
21050-#define load_TR_desc() asm volatile("ltr %w0"::"r" (GDT_ENTRY_TSS*8))
21051-#define load_LDT_desc() asm volatile("lldt %w0"::"r" (GDT_ENTRY_LDT*8))
21052-
21053-static inline void clear_LDT(void)
21054-{
21055- int cpu = get_cpu();
21056-
21057- /*
21058- * NB. We load the default_ldt for lcall7/27 handling on demand, as
21059- * it slows down context switching. Noone uses it anyway.
21060- */
21061- cpu = cpu; /* XXX avoid compiler warning */
21062- xen_set_ldt(NULL, 0);
21063- put_cpu();
21064-}
21065-
21066-#ifndef CONFIG_X86_NO_TSS
21067-static inline unsigned long __store_tr(void)
21068-{
21069- unsigned long tr;
21070-
21071- asm volatile ("str %w0":"=r" (tr));
21072- return tr;
21073-}
21074-
21075-#define store_tr(tr) (tr) = __store_tr()
21076-#endif
21077-
21078-/*
21079- * This is the ldt that every process will get unless we need
21080- * something other than this.
21081- */
21082-extern struct desc_struct default_ldt[];
21083-#ifndef CONFIG_X86_NO_IDT
21084-extern struct gate_struct idt_table[];
21085-#endif
21086-extern struct desc_ptr cpu_gdt_descr[];
21087-
21088-/* the cpu gdt accessor */
21089-#define cpu_gdt(_cpu) ((struct desc_struct *)cpu_gdt_descr[_cpu].address)
21090-
21091-#ifndef CONFIG_XEN
21092-static inline void load_gdt(const struct desc_ptr *ptr)
21093-{
21094- asm volatile("lgdt %w0"::"m" (*ptr));
21095-}
21096-
21097-static inline void store_gdt(struct desc_ptr *ptr)
21098-{
21099- asm("sgdt %w0":"=m" (*ptr));
21100-}
21101-#endif
21102-
21103-static inline void _set_gate(void *adr, unsigned type, unsigned long func, unsigned dpl, unsigned ist)
21104-{
21105- struct gate_struct s;
21106- s.offset_low = PTR_LOW(func);
21107- s.segment = __KERNEL_CS;
21108- s.ist = ist;
21109- s.p = 1;
21110- s.dpl = dpl;
21111- s.zero0 = 0;
21112- s.zero1 = 0;
21113- s.type = type;
21114- s.offset_middle = PTR_MIDDLE(func);
21115- s.offset_high = PTR_HIGH(func);
21116- /* does not need to be atomic because it is only done once at setup time */
21117- memcpy(adr, &s, 16);
21118-}
21119-
21120-#ifndef CONFIG_X86_NO_IDT
21121-static inline void set_intr_gate(int nr, void *func)
21122-{
21123- BUG_ON((unsigned)nr > 0xFF);
21124- _set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 0, 0);
21125-}
21126-
21127-static inline void set_intr_gate_ist(int nr, void *func, unsigned ist)
21128-{
21129- BUG_ON((unsigned)nr > 0xFF);
21130- _set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 0, ist);
21131-}
21132-
21133-static inline void set_system_gate(int nr, void *func)
21134-{
21135- BUG_ON((unsigned)nr > 0xFF);
21136- _set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 3, 0);
21137-}
21138-
21139-static inline void set_system_gate_ist(int nr, void *func, unsigned ist)
21140-{
21141- _set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 3, ist);
21142-}
21143-
21144-static inline void load_idt(const struct desc_ptr *ptr)
21145-{
21146- asm volatile("lidt %w0"::"m" (*ptr));
21147-}
21148-
21149-static inline void store_idt(struct desc_ptr *dtr)
21150-{
21151- asm("sidt %w0":"=m" (*dtr));
21152-}
21153-#endif
21154-
21155-static inline void set_tssldt_descriptor(void *ptr, unsigned long tss, unsigned type,
21156- unsigned size)
21157-{
21158- struct ldttss_desc d;
21159- memset(&d,0,sizeof(d));
21160- d.limit0 = size & 0xFFFF;
21161- d.base0 = PTR_LOW(tss);
21162- d.base1 = PTR_MIDDLE(tss) & 0xFF;
21163- d.type = type;
21164- d.p = 1;
21165- d.limit1 = (size >> 16) & 0xF;
21166- d.base2 = (PTR_MIDDLE(tss) >> 8) & 0xFF;
21167- d.base3 = PTR_HIGH(tss);
21168- memcpy(ptr, &d, 16);
21169-}
21170-
21171-#ifndef CONFIG_X86_NO_TSS
21172-static inline void set_tss_desc(unsigned cpu, void *addr)
21173-{
21174- /*
21175- * sizeof(unsigned long) coming from an extra "long" at the end
21176- * of the iobitmap. See tss_struct definition in processor.h
21177- *
21178- * -1? seg base+limit should be pointing to the address of the
21179- * last valid byte
21180- */
21181- set_tssldt_descriptor(&cpu_gdt(cpu)[GDT_ENTRY_TSS],
21182- (unsigned long)addr, DESC_TSS,
21183- IO_BITMAP_OFFSET + IO_BITMAP_BYTES + sizeof(unsigned long) - 1);
21184-}
21185-#endif
21186-
21187-static inline void set_ldt_desc(unsigned cpu, void *addr, int size)
21188-{
21189- set_tssldt_descriptor(&cpu_gdt(cpu)[GDT_ENTRY_LDT], (unsigned long)addr,
21190- DESC_LDT, size * 8 - 1);
21191-}
21192-
21193-#define LDT_entry_a(info) \
21194- ((((info)->base_addr & 0x0000ffff) << 16) | ((info)->limit & 0x0ffff))
21195-/* Don't allow setting of the lm bit. It is useless anyways because
21196- 64bit system calls require __USER_CS. */
21197-#define LDT_entry_b(info) \
21198- (((info)->base_addr & 0xff000000) | \
21199- (((info)->base_addr & 0x00ff0000) >> 16) | \
21200- ((info)->limit & 0xf0000) | \
21201- (((info)->read_exec_only ^ 1) << 9) | \
21202- ((info)->contents << 10) | \
21203- (((info)->seg_not_present ^ 1) << 15) | \
21204- ((info)->seg_32bit << 22) | \
21205- ((info)->limit_in_pages << 23) | \
21206- ((info)->useable << 20) | \
21207- /* ((info)->lm << 21) | */ \
21208- 0x7000)
21209-
21210-#define LDT_empty(info) (\
21211- (info)->base_addr == 0 && \
21212- (info)->limit == 0 && \
21213- (info)->contents == 0 && \
21214- (info)->read_exec_only == 1 && \
21215- (info)->seg_32bit == 0 && \
21216- (info)->limit_in_pages == 0 && \
21217- (info)->seg_not_present == 1 && \
21218- (info)->useable == 0 && \
21219- (info)->lm == 0)
21220-
21221-static inline void load_TLS(struct thread_struct *t, unsigned int cpu)
21222-{
21223- unsigned int i;
21224- u64 *gdt = (u64 *)(cpu_gdt(cpu) + GDT_ENTRY_TLS_MIN);
21225-
21226- for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++)
21227- if (HYPERVISOR_update_descriptor(virt_to_machine(&gdt[i]),
21228- t->tls_array[i]))
21229- BUG();
21230-}
21231-
21232-/*
21233- * load one particular LDT into the current CPU
21234- */
21235-static inline void load_LDT_nolock (mm_context_t *pc, int cpu)
21236-{
21237- void *segments = pc->ldt;
21238- int count = pc->size;
21239-
21240- if (likely(!count))
21241- segments = NULL;
21242-
21243- xen_set_ldt(segments, count);
21244-}
21245-
21246-static inline void load_LDT(mm_context_t *pc)
21247-{
21248- int cpu = get_cpu();
21249- load_LDT_nolock(pc, cpu);
21250- put_cpu();
21251-}
21252-
21253-extern struct desc_ptr idt_descr;
21254-
21255-#endif /* !__ASSEMBLY__ */
21256-
21257-#endif
21258--- sle11-2009-06-29.orig/include/asm-x86/mach-xen/asm/dma-mapping_32.h 2009-02-16 16:18:36.000000000 +0100
21259+++ sle11-2009-06-29/include/asm-x86/mach-xen/asm/dma-mapping_32.h 2009-03-16 16:33:40.000000000 +0100
21260@@ -84,23 +84,13 @@ dma_sync_single_range_for_device(struct
21261 dma_sync_single_for_device(dev, dma_handle+offset, size, direction);
21262 }
21263
21264-static inline void
21265+extern void
21266 dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems,
21267- enum dma_data_direction direction)
21268-{
21269- if (swiotlb)
21270- swiotlb_sync_sg_for_cpu(dev,sg,nelems,direction);
21271- flush_write_buffers();
21272-}
21273+ enum dma_data_direction direction);
21274
21275-static inline void
21276+extern void
21277 dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems,
21278- enum dma_data_direction direction)
21279-{
21280- if (swiotlb)
21281- swiotlb_sync_sg_for_device(dev,sg,nelems,direction);
21282- flush_write_buffers();
21283-}
21284+ enum dma_data_direction direction);
21285
21286 extern int
21287 dma_mapping_error(dma_addr_t dma_addr);
21288--- sle11-2009-06-29.orig/include/asm-x86/mach-xen/asm/fixmap_32.h 2009-02-16 16:17:21.000000000 +0100
21289+++ sle11-2009-06-29/include/asm-x86/mach-xen/asm/fixmap_32.h 2009-03-16 16:33:40.000000000 +0100
21290@@ -64,7 +64,7 @@ enum fixed_addresses {
21291 #endif
21292 #ifdef CONFIG_X86_VISWS_APIC
21293 FIX_CO_CPU, /* Cobalt timer */
21294- FIX_CO_APIC, /* Cobalt APIC Redirection Table */
21295+ FIX_CO_APIC, /* Cobalt APIC Redirection Table */
21296 FIX_LI_PCIA, /* Lithium PCI Bridge A */
21297 FIX_LI_PCIB, /* Lithium PCI Bridge B */
21298 #endif
21299@@ -73,7 +73,7 @@ enum fixed_addresses {
21300 #endif
21301 #ifdef CONFIG_X86_CYCLONE_TIMER
21302 FIX_CYCLONE_TIMER, /*cyclone timer register*/
21303-#endif
21304+#endif
21305 #ifdef CONFIG_HIGHMEM
21306 FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */
21307 FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1,
21308@@ -93,11 +93,23 @@ enum fixed_addresses {
21309 FIX_ISAMAP_END,
21310 FIX_ISAMAP_BEGIN = FIX_ISAMAP_END + NR_FIX_ISAMAPS - 1,
21311 __end_of_permanent_fixed_addresses,
21312- /* temporary boot-time mappings, used before ioremap() is functional */
21313-#define NR_FIX_BTMAPS 16
21314- FIX_BTMAP_END = __end_of_permanent_fixed_addresses,
21315- FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS - 1,
21316+ /*
21317+ * 256 temporary boot-time mappings, used by early_ioremap(),
21318+ * before ioremap() is functional.
21319+ *
21320+ * We round it up to the next 512 pages boundary so that we
21321+ * can have a single pgd entry and a single pte table:
21322+ */
21323+#define NR_FIX_BTMAPS 64
21324+#define FIX_BTMAPS_NESTING 4
21325+ FIX_BTMAP_END =
21326+ __end_of_permanent_fixed_addresses + 512 -
21327+ (__end_of_permanent_fixed_addresses & 511),
21328+ FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_NESTING - 1,
21329 FIX_WP_TEST,
21330+#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
21331+ FIX_OHCI1394_BASE,
21332+#endif
21333 __end_of_fixed_addresses
21334 };
21335
21336--- sle11-2009-06-29.orig/include/asm-x86/mach-xen/asm/fixmap_64.h 2009-02-16 16:17:21.000000000 +0100
21337+++ sle11-2009-06-29/include/asm-x86/mach-xen/asm/fixmap_64.h 2009-03-16 16:33:40.000000000 +0100
21338@@ -15,6 +15,7 @@
21339 #include <asm/apicdef.h>
21340 #include <asm/page.h>
21341 #include <asm/vsyscall.h>
21342+#include <asm/efi.h>
21343 #include <asm/acpi.h>
21344
21345 /*
21346@@ -46,6 +47,10 @@ enum fixed_addresses {
21347 FIX_IO_APIC_BASE_0,
21348 FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS-1,
21349 #endif
21350+#ifdef CONFIG_EFI
21351+ FIX_EFI_IO_MAP_LAST_PAGE,
21352+ FIX_EFI_IO_MAP_FIRST_PAGE = FIX_EFI_IO_MAP_LAST_PAGE+MAX_EFI_IO_PAGES-1,
21353+#endif
21354 #ifdef CONFIG_ACPI
21355 FIX_ACPI_BEGIN,
21356 FIX_ACPI_END = FIX_ACPI_BEGIN + FIX_ACPI_PAGES - 1,
21357@@ -55,10 +60,22 @@ enum fixed_addresses {
21358 FIX_ISAMAP_END,
21359 FIX_ISAMAP_BEGIN = FIX_ISAMAP_END + NR_FIX_ISAMAPS - 1,
21360 __end_of_permanent_fixed_addresses,
21361- /* temporary boot-time mappings, used before ioremap() is functional */
21362-#define NR_FIX_BTMAPS 16
21363- FIX_BTMAP_END = __end_of_permanent_fixed_addresses,
21364- FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS - 1,
21365+ /*
21366+ * 256 temporary boot-time mappings, used by early_ioremap(),
21367+ * before ioremap() is functional.
21368+ *
21369+ * We round it up to the next 512 pages boundary so that we
21370+ * can have a single pgd entry and a single pte table:
21371+ */
21372+#define NR_FIX_BTMAPS 64
21373+#define FIX_BTMAPS_NESTING 4
21374+ FIX_BTMAP_END =
21375+ __end_of_permanent_fixed_addresses + 512 -
21376+ (__end_of_permanent_fixed_addresses & 511),
21377+ FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_NESTING - 1,
21378+#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
21379+ FIX_OHCI1394_BASE,
21380+#endif
21381 __end_of_fixed_addresses
21382 };
21383
21384--- sle11-2009-06-29.orig/include/asm-x86/mach-xen/asm/highmem.h 2009-02-16 16:17:21.000000000 +0100
21385+++ sle11-2009-06-29/include/asm-x86/mach-xen/asm/highmem.h 2009-03-16 16:33:40.000000000 +0100
21386@@ -37,11 +37,6 @@ extern pte_t *pkmap_page_table;
21387 * easily, subsequent pte tables have to be allocated in one physical
21388 * chunk of RAM.
21389 */
21390-#ifdef CONFIG_X86_PAE
21391-#define LAST_PKMAP 512
21392-#else
21393-#define LAST_PKMAP 1024
21394-#endif
21395 /*
21396 * Ordering is:
21397 *
21398@@ -57,13 +52,12 @@ extern pte_t *pkmap_page_table;
21399 * VMALLOC_START
21400 * high_memory
21401 */
21402-#define PKMAP_BASE ( (FIXADDR_BOOT_START - PAGE_SIZE*(LAST_PKMAP + 1)) & PMD_MASK )
21403 #define LAST_PKMAP_MASK (LAST_PKMAP-1)
21404 #define PKMAP_NR(virt) ((virt-PKMAP_BASE) >> PAGE_SHIFT)
21405 #define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT))
21406
21407-extern void * FASTCALL(kmap_high(struct page *page));
21408-extern void FASTCALL(kunmap_high(struct page *page));
21409+extern void *kmap_high(struct page *page);
21410+extern void kunmap_high(struct page *page);
21411
21412 void *kmap(struct page *page);
21413 void kunmap(struct page *page);
21414--- sle11-2009-06-29.orig/include/asm-x86/mach-xen/asm/hypervisor.h 2009-02-16 16:18:36.000000000 +0100
21415+++ sle11-2009-06-29/include/asm-x86/mach-xen/asm/hypervisor.h 2009-03-16 16:33:40.000000000 +0100
21416@@ -264,6 +264,25 @@ HYPERVISOR_poll(
21417 return rc;
21418 }
21419
21420+static inline int __must_check
21421+HYPERVISOR_poll_no_timeout(
21422+ evtchn_port_t *ports, unsigned int nr_ports)
21423+{
21424+ int rc;
21425+ struct sched_poll sched_poll = {
21426+ .nr_ports = nr_ports
21427+ };
21428+ set_xen_guest_handle(sched_poll.ports, ports);
21429+
21430+ rc = HYPERVISOR_sched_op(SCHEDOP_poll, &sched_poll);
21431+#if CONFIG_XEN_COMPAT <= 0x030002
21432+ if (rc == -ENOSYS)
21433+ rc = HYPERVISOR_sched_op_compat(SCHEDOP_yield, 0);
21434+#endif
21435+
21436+ return rc;
21437+}
21438+
21439 #ifdef CONFIG_XEN
21440
21441 static inline void
21442--- sle11-2009-06-29.orig/include/asm-x86/mach-xen/asm/irqflags.h 2009-02-16 16:18:36.000000000 +0100
21443+++ sle11-2009-06-29/include/asm-x86/mach-xen/asm/irqflags.h 2009-03-16 16:33:40.000000000 +0100
21444@@ -1,5 +1,247 @@
21445-#ifdef CONFIG_X86_32
21446-# include "irqflags_32.h"
21447+#ifndef _X86_IRQFLAGS_H_
21448+#define _X86_IRQFLAGS_H_
21449+
21450+#include <asm/processor-flags.h>
21451+
21452+#ifndef __ASSEMBLY__
21453+/*
21454+ * The use of 'barrier' in the following reflects their use as local-lock
21455+ * operations. Reentrancy must be prevented (e.g., __cli()) /before/ following
21456+ * critical operations are executed. All critical operations must complete
21457+ * /before/ reentrancy is permitted (e.g., __sti()). Alpha architecture also
21458+ * includes these barriers, for example.
21459+ */
21460+
21461+#define xen_save_fl(void) (current_vcpu_info()->evtchn_upcall_mask)
21462+
21463+#define xen_restore_fl(f) \
21464+do { \
21465+ vcpu_info_t *_vcpu; \
21466+ barrier(); \
21467+ _vcpu = current_vcpu_info(); \
21468+ if ((_vcpu->evtchn_upcall_mask = (f)) == 0) { \
21469+ barrier(); /* unmask then check (avoid races) */\
21470+ if (unlikely(_vcpu->evtchn_upcall_pending)) \
21471+ force_evtchn_callback(); \
21472+ } \
21473+} while (0)
21474+
21475+#define xen_irq_disable() \
21476+do { \
21477+ current_vcpu_info()->evtchn_upcall_mask = 1; \
21478+ barrier(); \
21479+} while (0)
21480+
21481+#define xen_irq_enable() \
21482+do { \
21483+ vcpu_info_t *_vcpu; \
21484+ barrier(); \
21485+ _vcpu = current_vcpu_info(); \
21486+ _vcpu->evtchn_upcall_mask = 0; \
21487+ barrier(); /* unmask then check (avoid races) */ \
21488+ if (unlikely(_vcpu->evtchn_upcall_pending)) \
21489+ force_evtchn_callback(); \
21490+} while (0)
21491+
21492+void xen_safe_halt(void);
21493+
21494+void xen_halt(void);
21495+
21496+#define __raw_local_save_flags() xen_save_fl()
21497+
21498+#define raw_local_irq_restore(flags) xen_restore_fl(flags)
21499+
21500+#define raw_local_irq_disable() xen_irq_disable()
21501+
21502+#define raw_local_irq_enable() xen_irq_enable()
21503+
21504+/*
21505+ * Used in the idle loop; sti takes one instruction cycle
21506+ * to complete:
21507+ */
21508+static inline void raw_safe_halt(void)
21509+{
21510+ xen_safe_halt();
21511+}
21512+
21513+/*
21514+ * Used when interrupts are already enabled or to
21515+ * shutdown the processor:
21516+ */
21517+static inline void halt(void)
21518+{
21519+ xen_halt();
21520+}
21521+
21522+/*
21523+ * For spinlocks, etc:
21524+ */
21525+#define __raw_local_irq_save() \
21526+({ \
21527+ unsigned long flags = __raw_local_save_flags(); \
21528+ \
21529+ raw_local_irq_disable(); \
21530+ \
21531+ flags; \
21532+})
21533 #else
21534-# include "irqflags_64.h"
21535+
21536+/* Offsets into shared_info_t. */
21537+#define evtchn_upcall_pending /* 0 */
21538+#define evtchn_upcall_mask 1
21539+
21540+#define sizeof_vcpu_shift 6
21541+
21542+#ifdef CONFIG_X86_64
21543+# define __REG_si %rsi
21544+# define __CPU_num %gs:pda_cpunumber
21545+#else
21546+# define __REG_si %esi
21547+# define __CPU_num TI_cpu(%ebp)
21548+#endif
21549+
21550+#ifdef CONFIG_SMP
21551+#define GET_VCPU_INFO movl __CPU_num,%esi ; \
21552+ shl $sizeof_vcpu_shift,%esi ; \
21553+ add HYPERVISOR_shared_info,__REG_si
21554+#else
21555+#define GET_VCPU_INFO mov HYPERVISOR_shared_info,__REG_si
21556+#endif
21557+
21558+#define __DISABLE_INTERRUPTS movb $1,evtchn_upcall_mask(__REG_si)
21559+#define __ENABLE_INTERRUPTS movb $0,evtchn_upcall_mask(__REG_si)
21560+#define __TEST_PENDING testb $0xFF,evtchn_upcall_pending(__REG_si)
21561+#define DISABLE_INTERRUPTS(clb) GET_VCPU_INFO ; \
21562+ __DISABLE_INTERRUPTS
21563+#define ENABLE_INTERRUPTS(clb) GET_VCPU_INFO ; \
21564+ __ENABLE_INTERRUPTS
21565+
21566+#ifndef CONFIG_X86_64
21567+#define INTERRUPT_RETURN iret
21568+#define ENABLE_INTERRUPTS_SYSCALL_RET __ENABLE_INTERRUPTS ; \
21569+sysexit_scrit: /**** START OF SYSEXIT CRITICAL REGION ****/ ; \
21570+ __TEST_PENDING ; \
21571+ jnz 14f /* process more events if necessary... */ ; \
21572+ movl PT_ESI(%esp), %esi ; \
21573+ sysexit ; \
21574+14: __DISABLE_INTERRUPTS ; \
21575+ TRACE_IRQS_OFF ; \
21576+sysexit_ecrit: /**** END OF SYSEXIT CRITICAL REGION ****/ ; \
21577+ push %esp ; \
21578+ call evtchn_do_upcall ; \
21579+ add $4,%esp ; \
21580+ jmp ret_from_intr
21581+#endif
21582+
21583+
21584+#endif /* __ASSEMBLY__ */
21585+
21586+#ifndef __ASSEMBLY__
21587+#define raw_local_save_flags(flags) \
21588+ do { (flags) = __raw_local_save_flags(); } while (0)
21589+
21590+#define raw_local_irq_save(flags) \
21591+ do { (flags) = __raw_local_irq_save(); } while (0)
21592+
21593+static inline int raw_irqs_disabled_flags(unsigned long flags)
21594+{
21595+ return (flags != 0);
21596+}
21597+
21598+#define raw_irqs_disabled() \
21599+({ \
21600+ unsigned long flags = __raw_local_save_flags(); \
21601+ \
21602+ raw_irqs_disabled_flags(flags); \
21603+})
21604+
21605+/*
21606+ * makes the traced hardirq state match with the machine state
21607+ *
21608+ * should be a rarely used function, only in places where its
21609+ * otherwise impossible to know the irq state, like in traps.
21610+ */
21611+static inline void trace_hardirqs_fixup_flags(unsigned long flags)
21612+{
21613+ if (raw_irqs_disabled_flags(flags))
21614+ trace_hardirqs_off();
21615+ else
21616+ trace_hardirqs_on();
21617+}
21618+
21619+#define trace_hardirqs_fixup() \
21620+ trace_hardirqs_fixup_flags(__raw_local_save_flags())
21621+
21622+#else
21623+
21624+#ifdef CONFIG_X86_64
21625+/*
21626+ * Currently paravirt can't handle swapgs nicely when we
21627+ * don't have a stack we can rely on (such as a user space
21628+ * stack). So we either find a way around these or just fault
21629+ * and emulate if a guest tries to call swapgs directly.
21630+ *
21631+ * Either way, this is a good way to document that we don't
21632+ * have a reliable stack. x86_64 only.
21633+ */
21634+#define SWAPGS_UNSAFE_STACK swapgs
21635+#define ARCH_TRACE_IRQS_ON call trace_hardirqs_on_thunk
21636+#define ARCH_TRACE_IRQS_OFF call trace_hardirqs_off_thunk
21637+#define ARCH_LOCKDEP_SYS_EXIT call lockdep_sys_exit_thunk
21638+#define ARCH_LOCKDEP_SYS_EXIT_IRQ \
21639+ TRACE_IRQS_ON; \
21640+ ENABLE_INTERRUPTS(CLBR_NONE); \
21641+ SAVE_REST; \
21642+ LOCKDEP_SYS_EXIT; \
21643+ RESTORE_REST; \
21644+ __DISABLE_INTERRUPTS; \
21645+ TRACE_IRQS_OFF;
21646+
21647+#else
21648+#define ARCH_TRACE_IRQS_ON \
21649+ pushl %eax; \
21650+ pushl %ecx; \
21651+ pushl %edx; \
21652+ call trace_hardirqs_on; \
21653+ popl %edx; \
21654+ popl %ecx; \
21655+ popl %eax;
21656+
21657+#define ARCH_TRACE_IRQS_OFF \
21658+ pushl %eax; \
21659+ pushl %ecx; \
21660+ pushl %edx; \
21661+ call trace_hardirqs_off; \
21662+ popl %edx; \
21663+ popl %ecx; \
21664+ popl %eax;
21665+
21666+#define ARCH_LOCKDEP_SYS_EXIT \
21667+ pushl %eax; \
21668+ pushl %ecx; \
21669+ pushl %edx; \
21670+ call lockdep_sys_exit; \
21671+ popl %edx; \
21672+ popl %ecx; \
21673+ popl %eax;
21674+
21675+#define ARCH_LOCKDEP_SYS_EXIT_IRQ
21676+#endif
21677+
21678+#ifdef CONFIG_TRACE_IRQFLAGS
21679+# define TRACE_IRQS_ON ARCH_TRACE_IRQS_ON
21680+# define TRACE_IRQS_OFF ARCH_TRACE_IRQS_OFF
21681+#else
21682+# define TRACE_IRQS_ON
21683+# define TRACE_IRQS_OFF
21684+#endif
21685+#ifdef CONFIG_DEBUG_LOCK_ALLOC
21686+# define LOCKDEP_SYS_EXIT ARCH_LOCKDEP_SYS_EXIT
21687+# define LOCKDEP_SYS_EXIT_IRQ ARCH_LOCKDEP_SYS_EXIT_IRQ
21688+# else
21689+# define LOCKDEP_SYS_EXIT
21690+# define LOCKDEP_SYS_EXIT_IRQ
21691+# endif
21692+
21693+#endif /* __ASSEMBLY__ */
21694 #endif
21695--- sle11-2009-06-29.orig/include/asm-x86/mach-xen/asm/irqflags_32.h 2009-02-16 16:18:36.000000000 +0100
21696+++ /dev/null 1970-01-01 00:00:00.000000000 +0000
21697@@ -1,212 +0,0 @@
21698-/*
21699- * include/asm-i386/irqflags.h
21700- *
21701- * IRQ flags handling
21702- *
21703- * This file gets included from lowlevel asm headers too, to provide
21704- * wrapped versions of the local_irq_*() APIs, based on the
21705- * raw_local_irq_*() functions from the lowlevel headers.
21706- */
21707-#ifndef _ASM_IRQFLAGS_H
21708-#define _ASM_IRQFLAGS_H
21709-
21710-#ifndef __ASSEMBLY__
21711-#define xen_save_fl(void) (current_vcpu_info()->evtchn_upcall_mask)
21712-
21713-#define xen_restore_fl(f) \
21714-do { \
21715- vcpu_info_t *_vcpu; \
21716- barrier(); \
21717- _vcpu = current_vcpu_info(); \
21718- if ((_vcpu->evtchn_upcall_mask = (f)) == 0) { \
21719- barrier(); /* unmask then check (avoid races) */\
21720- if (unlikely(_vcpu->evtchn_upcall_pending)) \
21721- force_evtchn_callback(); \
21722- } \
21723-} while (0)
21724-
21725-#define xen_irq_disable() \
21726-do { \
21727- current_vcpu_info()->evtchn_upcall_mask = 1; \
21728- barrier(); \
21729-} while (0)
21730-
21731-#define xen_irq_enable() \
21732-do { \
21733- vcpu_info_t *_vcpu; \
21734- barrier(); \
21735- _vcpu = current_vcpu_info(); \
21736- _vcpu->evtchn_upcall_mask = 0; \
21737- barrier(); /* unmask then check (avoid races) */ \
21738- if (unlikely(_vcpu->evtchn_upcall_pending)) \
21739- force_evtchn_callback(); \
21740-} while (0)
21741-
21742-void xen_safe_halt(void);
21743-
21744-void xen_halt(void);
21745-
21746-/*
21747- * The use of 'barrier' in the following reflects their use as local-lock
21748- * operations. Reentrancy must be prevented (e.g., __cli()) /before/ following
21749- * critical operations are executed. All critical operations must complete
21750- * /before/ reentrancy is permitted (e.g., __sti()). Alpha architecture also
21751- * includes these barriers, for example.
21752- */
21753-
21754-#define __raw_local_save_flags() xen_save_fl()
21755-
21756-#define raw_local_irq_restore(flags) xen_restore_fl(flags)
21757-
21758-#define raw_local_irq_disable() xen_irq_disable()
21759-
21760-#define raw_local_irq_enable() xen_irq_enable()
21761-
21762-/*
21763- * Used in the idle loop; sti takes one instruction cycle
21764- * to complete:
21765- */
21766-static inline void raw_safe_halt(void)
21767-{
21768- xen_safe_halt();
21769-}
21770-
21771-/*
21772- * Used when interrupts are already enabled or to
21773- * shutdown the processor:
21774- */
21775-static inline void halt(void)
21776-{
21777- xen_halt();
21778-}
21779-
21780-/*
21781- * For spinlocks, etc:
21782- */
21783-#define __raw_local_irq_save() \
21784-({ \
21785- unsigned long flags = __raw_local_save_flags(); \
21786- \
21787- raw_local_irq_disable(); \
21788- \
21789- flags; \
21790-})
21791-
21792-#else
21793-/* Offsets into shared_info_t. */
21794-#define evtchn_upcall_pending /* 0 */
21795-#define evtchn_upcall_mask 1
21796-
21797-#define sizeof_vcpu_shift 6
21798-
21799-#ifdef CONFIG_SMP
21800-#define GET_VCPU_INFO movl TI_cpu(%ebp),%esi ; \
21801- shl $sizeof_vcpu_shift,%esi ; \
21802- addl HYPERVISOR_shared_info,%esi
21803-#else
21804-#define GET_VCPU_INFO movl HYPERVISOR_shared_info,%esi
21805-#endif
21806-
21807-#define __DISABLE_INTERRUPTS movb $1,evtchn_upcall_mask(%esi)
21808-#define __ENABLE_INTERRUPTS movb $0,evtchn_upcall_mask(%esi)
21809-#define __TEST_PENDING testb $0xFF,evtchn_upcall_pending(%esi)
21810-#define DISABLE_INTERRUPTS(clb) GET_VCPU_INFO ; \
21811- __DISABLE_INTERRUPTS
21812-#define ENABLE_INTERRUPTS(clb) GET_VCPU_INFO ; \
21813- __ENABLE_INTERRUPTS
21814-#define ENABLE_INTERRUPTS_SYSEXIT __ENABLE_INTERRUPTS ; \
21815-sysexit_scrit: /**** START OF SYSEXIT CRITICAL REGION ****/ ; \
21816- __TEST_PENDING ; \
21817- jnz 14f /* process more events if necessary... */ ; \
21818- movl PT_ESI(%esp), %esi ; \
21819- sysexit ; \
21820-14: __DISABLE_INTERRUPTS ; \
21821- TRACE_IRQS_OFF ; \
21822-sysexit_ecrit: /**** END OF SYSEXIT CRITICAL REGION ****/ ; \
21823- push %esp ; \
21824- call evtchn_do_upcall ; \
21825- add $4,%esp ; \
21826- jmp ret_from_intr
21827-#define INTERRUPT_RETURN iret
21828-#endif /* __ASSEMBLY__ */
21829-
21830-#ifndef __ASSEMBLY__
21831-#define raw_local_save_flags(flags) \
21832- do { (flags) = __raw_local_save_flags(); } while (0)
21833-
21834-#define raw_local_irq_save(flags) \
21835- do { (flags) = __raw_local_irq_save(); } while (0)
21836-
21837-static inline int raw_irqs_disabled_flags(unsigned long flags)
21838-{
21839- return (flags != 0);
21840-}
21841-
21842-#define raw_irqs_disabled() \
21843-({ \
21844- unsigned long flags = __raw_local_save_flags(); \
21845- \
21846- raw_irqs_disabled_flags(flags); \
21847-})
21848-
21849-/*
21850- * makes the traced hardirq state match with the machine state
21851- *
21852- * should be a rarely used function, only in places where its
21853- * otherwise impossible to know the irq state, like in traps.
21854- */
21855-static inline void trace_hardirqs_fixup_flags(unsigned long flags)
21856-{
21857- if (raw_irqs_disabled_flags(flags))
21858- trace_hardirqs_off();
21859- else
21860- trace_hardirqs_on();
21861-}
21862-
21863-#define trace_hardirqs_fixup() \
21864- trace_hardirqs_fixup_flags(__raw_local_save_flags())
21865-#endif /* __ASSEMBLY__ */
21866-
21867-/*
21868- * Do the CPU's IRQ-state tracing from assembly code. We call a
21869- * C function, so save all the C-clobbered registers:
21870- */
21871-#ifdef CONFIG_TRACE_IRQFLAGS
21872-
21873-# define TRACE_IRQS_ON \
21874- pushl %eax; \
21875- pushl %ecx; \
21876- pushl %edx; \
21877- call trace_hardirqs_on; \
21878- popl %edx; \
21879- popl %ecx; \
21880- popl %eax;
21881-
21882-# define TRACE_IRQS_OFF \
21883- pushl %eax; \
21884- pushl %ecx; \
21885- pushl %edx; \
21886- call trace_hardirqs_off; \
21887- popl %edx; \
21888- popl %ecx; \
21889- popl %eax;
21890-
21891-#else
21892-# define TRACE_IRQS_ON
21893-# define TRACE_IRQS_OFF
21894-#endif
21895-
21896-#ifdef CONFIG_DEBUG_LOCK_ALLOC
21897-# define LOCKDEP_SYS_EXIT \
21898- pushl %eax; \
21899- pushl %ecx; \
21900- pushl %edx; \
21901- call lockdep_sys_exit; \
21902- popl %edx; \
21903- popl %ecx; \
21904- popl %eax;
21905-#else
21906-# define LOCKDEP_SYS_EXIT
21907-#endif
21908-
21909-#endif
21910--- sle11-2009-06-29.orig/include/asm-x86/mach-xen/asm/irqflags_64.h 2009-02-16 16:18:36.000000000 +0100
21911+++ /dev/null 1970-01-01 00:00:00.000000000 +0000
21912@@ -1,178 +0,0 @@
21913-/*
21914- * include/asm-x86_64/irqflags.h
21915- *
21916- * IRQ flags handling
21917- *
21918- * This file gets included from lowlevel asm headers too, to provide
21919- * wrapped versions of the local_irq_*() APIs, based on the
21920- * raw_local_irq_*() functions from the lowlevel headers.
21921- */
21922-#ifndef _ASM_IRQFLAGS_H
21923-#define _ASM_IRQFLAGS_H
21924-#include <asm/processor-flags.h>
21925-
21926-#ifndef __ASSEMBLY__
21927-/*
21928- * Interrupt control:
21929- */
21930-
21931-/*
21932- * The use of 'barrier' in the following reflects their use as local-lock
21933- * operations. Reentrancy must be prevented (e.g., __cli()) /before/ following
21934- * critical operations are executed. All critical operations must complete
21935- * /before/ reentrancy is permitted (e.g., __sti()). Alpha architecture also
21936- * includes these barriers, for example.
21937- */
21938-
21939-#define __raw_local_save_flags() (current_vcpu_info()->evtchn_upcall_mask)
21940-
21941-#define raw_local_save_flags(flags) \
21942- do { (flags) = __raw_local_save_flags(); } while (0)
21943-
21944-#define raw_local_irq_restore(x) \
21945-do { \
21946- vcpu_info_t *_vcpu; \
21947- barrier(); \
21948- _vcpu = current_vcpu_info(); \
21949- if ((_vcpu->evtchn_upcall_mask = (x)) == 0) { \
21950- barrier(); /* unmask then check (avoid races) */ \
21951- if ( unlikely(_vcpu->evtchn_upcall_pending) ) \
21952- force_evtchn_callback(); \
21953- } \
21954-} while (0)
21955-
21956-#ifdef CONFIG_X86_VSMP
21957-
21958-/*
21959- * Interrupt control for the VSMP architecture:
21960- */
21961-
21962-static inline void raw_local_irq_disable(void)
21963-{
21964- unsigned long flags = __raw_local_save_flags();
21965-
21966- raw_local_irq_restore((flags & ~X86_EFLAGS_IF) | X86_EFLAGS_AC);
21967-}
21968-
21969-static inline void raw_local_irq_enable(void)
21970-{
21971- unsigned long flags = __raw_local_save_flags();
21972-
21973- raw_local_irq_restore((flags | X86_EFLAGS_IF) & (~X86_EFLAGS_AC));
21974-}
21975-
21976-static inline int raw_irqs_disabled_flags(unsigned long flags)
21977-{
21978- return !(flags & X86_EFLAGS_IF) || (flags & X86_EFLAGS_AC);
21979-}
21980-
21981-#else /* CONFIG_X86_VSMP */
21982-
21983-#define raw_local_irq_disable() \
21984-do { \
21985- current_vcpu_info()->evtchn_upcall_mask = 1; \
21986- barrier(); \
21987-} while (0)
21988-
21989-#define raw_local_irq_enable() \
21990-do { \
21991- vcpu_info_t *_vcpu; \
21992- barrier(); \
21993- _vcpu = current_vcpu_info(); \
21994- _vcpu->evtchn_upcall_mask = 0; \
21995- barrier(); /* unmask then check (avoid races) */ \
21996- if ( unlikely(_vcpu->evtchn_upcall_pending) ) \
21997- force_evtchn_callback(); \
21998-} while (0)
21999-
22000-static inline int raw_irqs_disabled_flags(unsigned long flags)
22001-{
22002- return (flags != 0);
22003-}
22004-
22005-#endif
22006-
22007-/*
22008- * For spinlocks, etc.:
22009- */
22010-
22011-#define __raw_local_irq_save() \
22012-({ \
22013- unsigned long flags = __raw_local_save_flags(); \
22014- \
22015- raw_local_irq_disable(); \
22016- \
22017- flags; \
22018-})
22019-
22020-#define raw_local_irq_save(flags) \
22021- do { (flags) = __raw_local_irq_save(); } while (0)
22022-
22023-#define raw_irqs_disabled() \
22024-({ \
22025- unsigned long flags = __raw_local_save_flags(); \
22026- \
22027- raw_irqs_disabled_flags(flags); \
22028-})
22029-
22030-/*
22031- * makes the traced hardirq state match with the machine state
22032- *
22033- * should be a rarely used function, only in places where its
22034- * otherwise impossible to know the irq state, like in traps.
22035- */
22036-static inline void trace_hardirqs_fixup_flags(unsigned long flags)
22037-{
22038- if (raw_irqs_disabled_flags(flags))
22039- trace_hardirqs_off();
22040- else
22041- trace_hardirqs_on();
22042-}
22043-
22044-#define trace_hardirqs_fixup() \
22045- trace_hardirqs_fixup_flags(__raw_local_save_flags())
22046-/*
22047- * Used in the idle loop; sti takes one instruction cycle
22048- * to complete:
22049- */
22050-void xen_safe_halt(void);
22051-static inline void raw_safe_halt(void)
22052-{
22053- xen_safe_halt();
22054-}
22055-
22056-/*
22057- * Used when interrupts are already enabled or to
22058- * shutdown the processor:
22059- */
22060-void xen_halt(void);
22061-static inline void halt(void)
22062-{
22063- xen_halt();
22064-}
22065-
22066-#else /* __ASSEMBLY__: */
22067-# ifdef CONFIG_TRACE_IRQFLAGS
22068-# define TRACE_IRQS_ON call trace_hardirqs_on_thunk
22069-# define TRACE_IRQS_OFF call trace_hardirqs_off_thunk
22070-# else
22071-# define TRACE_IRQS_ON
22072-# define TRACE_IRQS_OFF
22073-# endif
22074-# ifdef CONFIG_DEBUG_LOCK_ALLOC
22075-# define LOCKDEP_SYS_EXIT call lockdep_sys_exit_thunk
22076-# define LOCKDEP_SYS_EXIT_IRQ \
22077- TRACE_IRQS_ON; \
22078- sti; \
22079- SAVE_REST; \
22080- LOCKDEP_SYS_EXIT; \
22081- RESTORE_REST; \
22082- cli; \
22083- TRACE_IRQS_OFF;
22084-# else
22085-# define LOCKDEP_SYS_EXIT
22086-# define LOCKDEP_SYS_EXIT_IRQ
22087-# endif
22088-#endif
22089-
22090-#endif
22091--- sle11-2009-06-29.orig/include/asm-x86/mach-xen/asm/maddr_32.h 2009-02-16 16:17:21.000000000 +0100
22092+++ sle11-2009-06-29/include/asm-x86/mach-xen/asm/maddr_32.h 2009-03-16 16:33:40.000000000 +0100
22093@@ -1,6 +1,7 @@
22094 #ifndef _I386_MADDR_H
22095 #define _I386_MADDR_H
22096
22097+#include <asm/bug.h>
22098 #include <xen/features.h>
22099 #include <xen/interface/xen.h>
22100
22101@@ -151,25 +152,9 @@ static inline paddr_t pte_machine_to_phy
22102 phys = (phys << PAGE_SHIFT) | (machine & ~PHYSICAL_PAGE_MASK);
22103 return phys;
22104 }
22105-#endif
22106-
22107-#ifdef CONFIG_X86_PAE
22108-#define __pte_ma(x) ((pte_t) { (x), (maddr_t)(x) >> 32 } )
22109-extern unsigned long long __supported_pte_mask;
22110-static inline pte_t pfn_pte_ma(unsigned long page_nr, pgprot_t pgprot)
22111-{
22112- pte_t pte;
22113-
22114- pte.pte_high = (page_nr >> (32 - PAGE_SHIFT)) | \
22115- (pgprot_val(pgprot) >> 32);
22116- pte.pte_high &= (__supported_pte_mask >> 32);
22117- pte.pte_low = ((page_nr << PAGE_SHIFT) | pgprot_val(pgprot)) & \
22118- __supported_pte_mask;
22119- return pte;
22120-}
22121 #else
22122-#define __pte_ma(x) ((pte_t) { (x) } )
22123-#define pfn_pte_ma(pfn, prot) __pte_ma(((pfn) << PAGE_SHIFT) | pgprot_val(prot))
22124+#define pte_phys_to_machine phys_to_machine
22125+#define pte_machine_to_phys machine_to_phys
22126 #endif
22127
22128 #else /* !CONFIG_XEN */
22129--- sle11-2009-06-29.orig/include/asm-x86/mach-xen/asm/maddr_64.h 2009-06-29 15:14:52.000000000 +0200
22130+++ sle11-2009-06-29/include/asm-x86/mach-xen/asm/maddr_64.h 2009-03-16 16:33:40.000000000 +0100
22131@@ -1,6 +1,7 @@
22132 #ifndef _X86_64_MADDR_H
22133 #define _X86_64_MADDR_H
22134
22135+#include <asm/bug.h>
22136 #include <xen/features.h>
22137 #include <xen/interface/xen.h>
22138
22139@@ -16,6 +17,7 @@ typedef unsigned long maddr_t;
22140 #ifdef CONFIG_XEN
22141
22142 extern unsigned long *phys_to_machine_mapping;
22143+extern unsigned long max_mapnr;
22144
22145 #undef machine_to_phys_mapping
22146 extern unsigned long *machine_to_phys_mapping;
22147@@ -25,7 +27,7 @@ static inline unsigned long pfn_to_mfn(u
22148 {
22149 if (xen_feature(XENFEAT_auto_translated_physmap))
22150 return pfn;
22151- BUG_ON(end_pfn && pfn >= end_pfn);
22152+ BUG_ON(max_mapnr && pfn >= max_mapnr);
22153 return phys_to_machine_mapping[pfn] & ~FOREIGN_FRAME_BIT;
22154 }
22155
22156@@ -33,7 +35,7 @@ static inline int phys_to_machine_mappin
22157 {
22158 if (xen_feature(XENFEAT_auto_translated_physmap))
22159 return 1;
22160- BUG_ON(end_pfn && pfn >= end_pfn);
22161+ BUG_ON(max_mapnr && pfn >= max_mapnr);
22162 return (phys_to_machine_mapping[pfn] != INVALID_P2M_ENTRY);
22163 }
22164
22165@@ -45,7 +47,7 @@ static inline unsigned long mfn_to_pfn(u
22166 return mfn;
22167
22168 if (unlikely((mfn >> machine_to_phys_order) != 0))
22169- return end_pfn;
22170+ return max_mapnr;
22171
22172 /* The array access can fail (e.g., device space beyond end of RAM). */
22173 asm (
22174@@ -60,7 +62,7 @@ static inline unsigned long mfn_to_pfn(u
22175 " .quad 1b,3b\n"
22176 ".previous"
22177 : "=r" (pfn)
22178- : "m" (machine_to_phys_mapping[mfn]), "m" (end_pfn) );
22179+ : "m" (machine_to_phys_mapping[mfn]), "m" (max_mapnr) );
22180
22181 return pfn;
22182 }
22183@@ -88,16 +90,16 @@ static inline unsigned long mfn_to_pfn(u
22184 static inline unsigned long mfn_to_local_pfn(unsigned long mfn)
22185 {
22186 unsigned long pfn = mfn_to_pfn(mfn);
22187- if ((pfn < end_pfn)
22188+ if ((pfn < max_mapnr)
22189 && !xen_feature(XENFEAT_auto_translated_physmap)
22190 && (phys_to_machine_mapping[pfn] != mfn))
22191- return end_pfn; /* force !pfn_valid() */
22192+ return max_mapnr; /* force !pfn_valid() */
22193 return pfn;
22194 }
22195
22196 static inline void set_phys_to_machine(unsigned long pfn, unsigned long mfn)
22197 {
22198- BUG_ON(end_pfn && pfn >= end_pfn);
22199+ BUG_ON(max_mapnr && pfn >= max_mapnr);
22200 if (xen_feature(XENFEAT_auto_translated_physmap)) {
22201 BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
22202 return;
22203@@ -135,9 +137,6 @@ static inline paddr_t pte_machine_to_phy
22204 return phys;
22205 }
22206
22207-#define __pte_ma(x) ((pte_t) { (x) } )
22208-#define pfn_pte_ma(pfn, prot) __pte_ma((((pfn) << PAGE_SHIFT) | pgprot_val(prot)) & __supported_pte_mask)
22209-
22210 #else /* !CONFIG_XEN */
22211
22212 #define pfn_to_mfn(pfn) (pfn)
22213--- sle11-2009-06-29.orig/include/asm-x86/mach-xen/asm/mmu_context_32.h 2009-02-16 16:17:21.000000000 +0100
22214+++ sle11-2009-06-29/include/asm-x86/mach-xen/asm/mmu_context_32.h 2009-03-16 16:33:40.000000000 +0100
22215@@ -51,8 +51,6 @@ static inline void __prepare_arch_switch
22216 : : "r" (0) );
22217 }
22218
22219-void leave_mm(unsigned long cpu);
22220-
22221 static inline void switch_mm(struct mm_struct *prev,
22222 struct mm_struct *next,
22223 struct task_struct *tsk)
22224--- sle11-2009-06-29.orig/include/asm-x86/mach-xen/asm/mmu_context_64.h 2009-02-16 16:17:21.000000000 +0100
22225+++ sle11-2009-06-29/include/asm-x86/mach-xen/asm/mmu_context_64.h 2009-03-16 16:33:40.000000000 +0100
22226@@ -62,12 +62,6 @@ extern void mm_pin(struct mm_struct *mm)
22227 extern void mm_unpin(struct mm_struct *mm);
22228 void mm_pin_all(void);
22229
22230-static inline void load_cr3(pgd_t *pgd)
22231-{
22232- asm volatile("movq %0,%%cr3" :: "r" (phys_to_machine(__pa(pgd))) :
22233- "memory");
22234-}
22235-
22236 static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
22237 struct task_struct *tsk)
22238 {
22239@@ -97,7 +91,7 @@ static inline void switch_mm(struct mm_s
22240 op++;
22241
22242 if (unlikely(next->context.ldt != prev->context.ldt)) {
22243- /* load_LDT_nolock(&next->context, cpu) */
22244+ /* load_LDT_nolock(&next->context) */
22245 op->cmd = MMUEXT_SET_LDT;
22246 op->arg1.linear_addr = (unsigned long)next->context.ldt;
22247 op->arg2.nr_ents = next->context.size;
22248@@ -110,7 +104,7 @@ static inline void switch_mm(struct mm_s
22249 else {
22250 write_pda(mmu_state, TLBSTATE_OK);
22251 if (read_pda(active_mm) != next)
22252- out_of_line_bug();
22253+ BUG();
22254 if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) {
22255 /* We were in lazy tlb mode and leave_mm disabled
22256 * tlb flush IPI delivery. We must reload CR3
22257@@ -118,7 +112,7 @@ static inline void switch_mm(struct mm_s
22258 */
22259 load_cr3(next->pgd);
22260 xen_new_user_pt(__pa(__user_pgd(next->pgd)));
22261- load_LDT_nolock(&next->context, cpu);
22262+ load_LDT_nolock(&next->context);
22263 }
22264 }
22265 #endif
22266--- sle11-2009-06-29.orig/include/asm-x86/mach-xen/asm/page.h 2009-02-16 16:18:36.000000000 +0100
22267+++ sle11-2009-06-29/include/asm-x86/mach-xen/asm/page.h 2009-03-16 16:33:40.000000000 +0100
22268@@ -1,13 +1,231 @@
22269+#ifndef _ASM_X86_PAGE_H
22270+#define _ASM_X86_PAGE_H
22271+
22272+#include <linux/const.h>
22273+
22274+/* PAGE_SHIFT determines the page size */
22275+#define PAGE_SHIFT 12
22276+#define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT)
22277+#define PAGE_MASK (~(PAGE_SIZE-1))
22278+
22279 #ifdef __KERNEL__
22280-# ifdef CONFIG_X86_32
22281-# include "page_32.h"
22282-# else
22283-# include "page_64.h"
22284-# endif
22285+
22286+/*
22287+ * Need to repeat this here in order to not include pgtable.h (which in turn
22288+ * depends on definitions made here), but to be able to use the symbolics
22289+ * below. The preprocessor will warn if the two definitions aren't identical.
22290+ */
22291+#define _PAGE_BIT_PRESENT 0
22292+#define _PAGE_PRESENT (_AC(1, L)<<_PAGE_BIT_PRESENT)
22293+#define _PAGE_BIT_IO 9
22294+#define _PAGE_IO (_AC(1, L)<<_PAGE_BIT_IO)
22295+
22296+#define PHYSICAL_PAGE_MASK (~(_AT(phys_addr_t, PAGE_SIZE) - 1) & __PHYSICAL_MASK)
22297+#define PTE_MASK _AT(pteval_t, PHYSICAL_PAGE_MASK)
22298+
22299+#define PMD_PAGE_SIZE (_AC(1, UL) << PMD_SHIFT)
22300+#define PMD_PAGE_MASK (~(PMD_PAGE_SIZE-1))
22301+
22302+#define HPAGE_SHIFT PMD_SHIFT
22303+#define HPAGE_SIZE (_AC(1,UL) << HPAGE_SHIFT)
22304+#define HPAGE_MASK (~(HPAGE_SIZE - 1))
22305+#define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT)
22306+
22307+/* to align the pointer to the (next) page boundary */
22308+#define PAGE_ALIGN(addr) (((addr)+PAGE_SIZE-1)&PAGE_MASK)
22309+
22310+#define __PHYSICAL_MASK _AT(phys_addr_t, (_AC(1,ULL) << __PHYSICAL_MASK_SHIFT) - 1)
22311+#define __VIRTUAL_MASK ((_AC(1,UL) << __VIRTUAL_MASK_SHIFT) - 1)
22312+
22313+#ifndef __ASSEMBLY__
22314+#include <linux/types.h>
22315+#endif
22316+
22317+#ifdef CONFIG_X86_64
22318+#include <asm/page_64.h>
22319+#define max_pfn_mapped end_pfn_map
22320+#else
22321+#include <asm/page_32.h>
22322+#define max_pfn_mapped max_low_pfn
22323+#endif /* CONFIG_X86_64 */
22324+
22325+#define PAGE_OFFSET ((unsigned long)__PAGE_OFFSET)
22326+
22327+#define VM_DATA_DEFAULT_FLAGS \
22328+ (((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0 ) | \
22329+ VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
22330+
22331+
22332+#ifndef __ASSEMBLY__
22333+
22334+extern int page_is_ram(unsigned long pagenr);
22335+
22336+struct page;
22337+
22338+static inline void clear_user_page(void *page, unsigned long vaddr,
22339+ struct page *pg)
22340+{
22341+ clear_page(page);
22342+}
22343+
22344+static inline void copy_user_page(void *to, void *from, unsigned long vaddr,
22345+ struct page *topage)
22346+{
22347+ copy_page(to, from);
22348+}
22349+
22350+#define __alloc_zeroed_user_highpage(movableflags, vma, vaddr) \
22351+ alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO | movableflags, vma, vaddr)
22352+#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
22353+
22354+typedef struct { pgprotval_t pgprot; } pgprot_t;
22355+
22356+#define pgprot_val(x) ((x).pgprot)
22357+#define __pgprot(x) ((pgprot_t) { (x) } )
22358+
22359+#include <asm/maddr.h>
22360+
22361+typedef struct { pgdval_t pgd; } pgd_t;
22362+
22363+#define __pgd_ma(x) ((pgd_t) { (x) } )
22364+static inline pgd_t xen_make_pgd(pgdval_t val)
22365+{
22366+ if (val & _PAGE_PRESENT)
22367+ val = pte_phys_to_machine(val);
22368+ return (pgd_t) { val };
22369+}
22370+
22371+#define __pgd_val(x) ((x).pgd)
22372+static inline pgdval_t xen_pgd_val(pgd_t pgd)
22373+{
22374+ pgdval_t ret = __pgd_val(pgd);
22375+#if PAGETABLE_LEVELS == 2 && CONFIG_XEN_COMPAT <= 0x030002
22376+ if (ret)
22377+ ret = machine_to_phys(ret) | _PAGE_PRESENT;
22378+#else
22379+ if (ret & _PAGE_PRESENT)
22380+ ret = pte_machine_to_phys(ret);
22381+#endif
22382+ return ret;
22383+}
22384+
22385+#if PAGETABLE_LEVELS >= 3
22386+#if PAGETABLE_LEVELS == 4
22387+typedef struct { pudval_t pud; } pud_t;
22388+
22389+#define __pud_ma(x) ((pud_t) { (x) } )
22390+static inline pud_t xen_make_pud(pudval_t val)
22391+{
22392+ if (val & _PAGE_PRESENT)
22393+ val = pte_phys_to_machine(val);
22394+ return (pud_t) { val };
22395+}
22396+
22397+#define __pud_val(x) ((x).pud)
22398+static inline pudval_t xen_pud_val(pud_t pud)
22399+{
22400+ pudval_t ret = __pud_val(pud);
22401+ if (ret & _PAGE_PRESENT)
22402+ ret = pte_machine_to_phys(ret);
22403+ return ret;
22404+}
22405+#else /* PAGETABLE_LEVELS == 3 */
22406+#include <asm-generic/pgtable-nopud.h>
22407+
22408+#define __pud_val(x) __pgd_val((x).pgd)
22409+static inline pudval_t xen_pud_val(pud_t pud)
22410+{
22411+ return xen_pgd_val(pud.pgd);
22412+}
22413+#endif /* PAGETABLE_LEVELS == 4 */
22414+
22415+typedef struct { pmdval_t pmd; } pmd_t;
22416+
22417+#define __pmd_ma(x) ((pmd_t) { (x) } )
22418+static inline pmd_t xen_make_pmd(pmdval_t val)
22419+{
22420+ if (val & _PAGE_PRESENT)
22421+ val = pte_phys_to_machine(val);
22422+ return (pmd_t) { val };
22423+}
22424+
22425+#define __pmd_val(x) ((x).pmd)
22426+static inline pmdval_t xen_pmd_val(pmd_t pmd)
22427+{
22428+ pmdval_t ret = __pmd_val(pmd);
22429+#if CONFIG_XEN_COMPAT <= 0x030002
22430+ if (ret)
22431+ ret = pte_machine_to_phys(ret) | _PAGE_PRESENT;
22432 #else
22433-# ifdef __i386__
22434-# include "page_32.h"
22435-# else
22436-# include "page_64.h"
22437-# endif
22438+ if (ret & _PAGE_PRESENT)
22439+ ret = pte_machine_to_phys(ret);
22440+#endif
22441+ return ret;
22442+}
22443+#else /* PAGETABLE_LEVELS == 2 */
22444+#include <asm-generic/pgtable-nopmd.h>
22445+
22446+#define __pmd_ma(x) ((pmd_t) { .pud.pgd = __pgd_ma(x) } )
22447+#define __pmd_val(x) __pgd_val((x).pud.pgd)
22448+static inline pmdval_t xen_pmd_val(pmd_t pmd)
22449+{
22450+ return xen_pgd_val(pmd.pud.pgd);
22451+}
22452+#endif /* PAGETABLE_LEVELS >= 3 */
22453+
22454+#define __pte_ma(x) ((pte_t) { .pte = (x) } )
22455+static inline pte_t xen_make_pte(pteval_t val)
22456+{
22457+ if ((val & (_PAGE_PRESENT|_PAGE_IO)) == _PAGE_PRESENT)
22458+ val = pte_phys_to_machine(val);
22459+ return (pte_t) { .pte = val };
22460+}
22461+
22462+#define __pte_val(x) ((x).pte)
22463+static inline pteval_t xen_pte_val(pte_t pte)
22464+{
22465+ pteval_t ret = __pte_val(pte);
22466+ if ((pte.pte_low & (_PAGE_PRESENT|_PAGE_IO)) == _PAGE_PRESENT)
22467+ ret = pte_machine_to_phys(ret);
22468+ return ret;
22469+}
22470+
22471+#define pgd_val(x) xen_pgd_val(x)
22472+#define __pgd(x) xen_make_pgd(x)
22473+
22474+#ifndef __PAGETABLE_PUD_FOLDED
22475+#define pud_val(x) xen_pud_val(x)
22476+#define __pud(x) xen_make_pud(x)
22477+#endif
22478+
22479+#ifndef __PAGETABLE_PMD_FOLDED
22480+#define pmd_val(x) xen_pmd_val(x)
22481+#define __pmd(x) xen_make_pmd(x)
22482 #endif
22483+
22484+#define pte_val(x) xen_pte_val(x)
22485+#define __pte(x) xen_make_pte(x)
22486+
22487+#define __pa(x) __phys_addr((unsigned long)(x))
22488+/* __pa_symbol should be used for C visible symbols.
22489+ This seems to be the official gcc blessed way to do such arithmetic. */
22490+#define __pa_symbol(x) __pa(__phys_reloc_hide((unsigned long)(x)))
22491+
22492+#define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET))
22493+
22494+#define __boot_va(x) __va(x)
22495+#define __boot_pa(x) __pa(x)
22496+
22497+#define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)
22498+#define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT)
22499+#define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT)
22500+
22501+#endif /* __ASSEMBLY__ */
22502+
22503+#include <asm-generic/memory_model.h>
22504+#include <asm-generic/page.h>
22505+
22506+#define __HAVE_ARCH_GATE_AREA 1
22507+
22508+#endif /* __KERNEL__ */
22509+#endif /* _ASM_X86_PAGE_H */
22510--- sle11-2009-06-29.orig/include/asm-x86/mach-xen/asm/page_64.h 2009-02-16 16:18:36.000000000 +0100
22511+++ sle11-2009-06-29/include/asm-x86/mach-xen/asm/page_64.h 2009-03-16 16:33:40.000000000 +0100
22512@@ -1,37 +1,9 @@
22513 #ifndef _X86_64_PAGE_H
22514 #define _X86_64_PAGE_H
22515
22516-/* #include <linux/string.h> */
22517-#ifndef __ASSEMBLY__
22518-#include <linux/kernel.h>
22519-#include <linux/types.h>
22520-#include <asm/bug.h>
22521-#endif
22522-#include <linux/const.h>
22523-#include <xen/interface/xen.h>
22524-
22525-/*
22526- * Need to repeat this here in order to not include pgtable.h (which in turn
22527- * depends on definitions made here), but to be able to use the symbolic
22528- * below. The preprocessor will warn if the two definitions aren't identical.
22529- */
22530-#define _PAGE_PRESENT 0x001
22531-#define _PAGE_IO 0x200
22532-
22533-/* PAGE_SHIFT determines the page size */
22534-#define PAGE_SHIFT 12
22535-#define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT)
22536-#define PAGE_MASK (~(PAGE_SIZE-1))
22537-
22538-/* See Documentation/x86_64/mm.txt for a description of the memory map. */
22539-#define __PHYSICAL_MASK_SHIFT 46
22540-#define __PHYSICAL_MASK ((_AC(1,UL) << __PHYSICAL_MASK_SHIFT) - 1)
22541-#define __VIRTUAL_MASK_SHIFT 48
22542-#define __VIRTUAL_MASK ((_AC(1,UL) << __VIRTUAL_MASK_SHIFT) - 1)
22543-
22544-#define PHYSICAL_PAGE_MASK (~(PAGE_SIZE-1) & __PHYSICAL_MASK)
22545+#define PAGETABLE_LEVELS 4
22546
22547-#define THREAD_ORDER 1
22548+#define THREAD_ORDER 1
22549 #define THREAD_SIZE (PAGE_SIZE << THREAD_ORDER)
22550 #define CURRENT_MASK (~(THREAD_SIZE-1))
22551
22552@@ -51,106 +23,10 @@
22553 #define MCE_STACK 5
22554 #define N_EXCEPTION_STACKS 5 /* hw limit: 7 */
22555
22556-#define LARGE_PAGE_MASK (~(LARGE_PAGE_SIZE-1))
22557-#define LARGE_PAGE_SIZE (_AC(1,UL) << PMD_SHIFT)
22558-
22559-#define HPAGE_SHIFT PMD_SHIFT
22560-#define HPAGE_SIZE (_AC(1,UL) << HPAGE_SHIFT)
22561-#define HPAGE_MASK (~(HPAGE_SIZE - 1))
22562-#define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT)
22563-
22564-#ifdef __KERNEL__
22565-#ifndef __ASSEMBLY__
22566-
22567-extern unsigned long end_pfn;
22568-
22569-#include <asm/maddr.h>
22570-
22571-void clear_page(void *);
22572-void copy_page(void *, void *);
22573-
22574-#define clear_user_page(page, vaddr, pg) clear_page(page)
22575-#define copy_user_page(to, from, vaddr, pg) copy_page(to, from)
22576-
22577-#define __alloc_zeroed_user_highpage(movableflags, vma, vaddr) \
22578- alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO | movableflags, vma, vaddr)
22579-#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
22580-
22581-/*
22582- * These are used to make use of C type-checking..
22583- */
22584-typedef struct { unsigned long pte; } pte_t;
22585-typedef struct { unsigned long pmd; } pmd_t;
22586-typedef struct { unsigned long pud; } pud_t;
22587-typedef struct { unsigned long pgd; } pgd_t;
22588-#define PTE_MASK PHYSICAL_PAGE_MASK
22589-
22590-typedef struct { unsigned long pgprot; } pgprot_t;
22591-
22592-#define __pte_val(x) ((x).pte)
22593-#define pte_val(x) ((__pte_val(x) & (_PAGE_PRESENT|_PAGE_IO)) \
22594- == _PAGE_PRESENT ? \
22595- pte_machine_to_phys(__pte_val(x)) : \
22596- __pte_val(x))
22597-
22598-#define __pmd_val(x) ((x).pmd)
22599-static inline unsigned long pmd_val(pmd_t x)
22600-{
22601- unsigned long ret = __pmd_val(x);
22602-#if CONFIG_XEN_COMPAT <= 0x030002
22603- if (ret) ret = pte_machine_to_phys(ret) | _PAGE_PRESENT;
22604-#else
22605- if (ret & _PAGE_PRESENT) ret = pte_machine_to_phys(ret);
22606-#endif
22607- return ret;
22608-}
22609-
22610-#define __pud_val(x) ((x).pud)
22611-static inline unsigned long pud_val(pud_t x)
22612-{
22613- unsigned long ret = __pud_val(x);
22614- if (ret & _PAGE_PRESENT) ret = pte_machine_to_phys(ret);
22615- return ret;
22616-}
22617-
22618-#define __pgd_val(x) ((x).pgd)
22619-static inline unsigned long pgd_val(pgd_t x)
22620-{
22621- unsigned long ret = __pgd_val(x);
22622- if (ret & _PAGE_PRESENT) ret = pte_machine_to_phys(ret);
22623- return ret;
22624-}
22625-
22626-#define pgprot_val(x) ((x).pgprot)
22627-
22628-static inline pte_t __pte(unsigned long x)
22629-{
22630- if ((x & (_PAGE_PRESENT|_PAGE_IO)) == _PAGE_PRESENT)
22631- x = pte_phys_to_machine(x);
22632- return ((pte_t) { (x) });
22633-}
22634-
22635-static inline pmd_t __pmd(unsigned long x)
22636-{
22637- if (x & _PAGE_PRESENT) x = pte_phys_to_machine(x);
22638- return ((pmd_t) { (x) });
22639-}
22640-
22641-static inline pud_t __pud(unsigned long x)
22642-{
22643- if (x & _PAGE_PRESENT) x = pte_phys_to_machine(x);
22644- return ((pud_t) { (x) });
22645-}
22646-
22647-static inline pgd_t __pgd(unsigned long x)
22648-{
22649- if (x & _PAGE_PRESENT) x = pte_phys_to_machine(x);
22650- return ((pgd_t) { (x) });
22651-}
22652-
22653-#define __pgprot(x) ((pgprot_t) { (x) } )
22654+#define PUD_PAGE_SIZE (_AC(1, UL) << PUD_SHIFT)
22655+#define PUD_PAGE_MASK (~(PUD_PAGE_SIZE-1))
22656
22657-#endif /* !__ASSEMBLY__ */
22658+#define __PAGE_OFFSET _AC(0xffff880000000000, UL)
22659
22660 #define __PHYSICAL_START CONFIG_PHYSICAL_START
22661 #define __KERNEL_ALIGN 0x200000
22662@@ -166,52 +42,58 @@ static inline pgd_t __pgd(unsigned long
22663
22664 #define __START_KERNEL (__START_KERNEL_map + __PHYSICAL_START)
22665 #define __START_KERNEL_map _AC(0xffffffff80000000, UL)
22666-#define __PAGE_OFFSET _AC(0xffff880000000000, UL)
22667
22668 #if CONFIG_XEN_COMPAT <= 0x030002
22669 #undef LOAD_OFFSET
22670 #define LOAD_OFFSET 0
22671 #endif
22672
22673-/* to align the pointer to the (next) page boundary */
22674-#define PAGE_ALIGN(addr) (((addr)+PAGE_SIZE-1)&PAGE_MASK)
22675-
22676-#define KERNEL_TEXT_SIZE (40*1024*1024)
22677-#define KERNEL_TEXT_START _AC(0xffffffff80000000, UL)
22678+/* See Documentation/x86_64/mm.txt for a description of the memory map. */
22679+#define __PHYSICAL_MASK_SHIFT 46
22680+#define __VIRTUAL_MASK_SHIFT 48
22681
22682-#define PAGE_OFFSET __PAGE_OFFSET
22683+/*
22684+ * Kernel image size is limited to 128 MB (see level2_kernel_pgt in
22685+ * arch/x86/kernel/head_64.S), and it is mapped here:
22686+ */
22687+#define KERNEL_IMAGE_SIZE (128*1024*1024)
22688+#define KERNEL_IMAGE_START _AC(0xffffffff80000000, UL)
22689
22690 #ifndef __ASSEMBLY__
22691+void clear_page(void *page);
22692+void copy_page(void *to, void *from);
22693+
22694+extern unsigned long end_pfn;
22695+extern unsigned long end_pfn_map;
22696+
22697 static inline unsigned long __phys_addr(unsigned long x)
22698 {
22699- return x - (x >= __START_KERNEL_map ? __START_KERNEL_map : PAGE_OFFSET);
22700+ return x - (x >= __START_KERNEL_map ? __START_KERNEL_map : __PAGE_OFFSET);
22701 }
22702-#endif
22703
22704-#define __pa(x) __phys_addr((unsigned long)(x))
22705-#define __pa_symbol(x) __phys_addr((unsigned long)(x))
22706+#define __phys_reloc_hide(x) (x)
22707
22708-#define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET))
22709-#define __boot_va(x) __va(x)
22710-#define __boot_pa(x) __pa(x)
22711-#ifdef CONFIG_FLATMEM
22712-#define pfn_valid(pfn) ((pfn) < end_pfn)
22713-#endif
22714+/*
22715+ * These are used to make use of C type-checking..
22716+ */
22717+typedef unsigned long pteval_t;
22718+typedef unsigned long pmdval_t;
22719+typedef unsigned long pudval_t;
22720+typedef unsigned long pgdval_t;
22721+typedef unsigned long pgprotval_t;
22722+typedef unsigned long phys_addr_t;
22723
22724-#define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)
22725-#define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT)
22726-#define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT)
22727-
22728-#define VM_DATA_DEFAULT_FLAGS \
22729- (((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0 ) | \
22730- VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
22731+typedef struct page *pgtable_t;
22732+
22733+typedef union { pteval_t pte; unsigned int pte_low; } pte_t;
22734
22735-#define __HAVE_ARCH_GATE_AREA 1
22736 #define vmemmap ((struct page *)VMEMMAP_START)
22737
22738-#include <asm-generic/memory_model.h>
22739-#include <asm-generic/page.h>
22740+#endif /* !__ASSEMBLY__ */
22741+
22742+#ifdef CONFIG_FLATMEM
22743+#define pfn_valid(pfn) ((pfn) < max_mapnr)
22744+#endif
22745
22746-#endif /* __KERNEL__ */
22747
22748 #endif /* _X86_64_PAGE_H */
22749--- sle11-2009-06-29.orig/include/asm-x86/mach-xen/asm/pci.h 2009-02-16 16:18:36.000000000 +0100
22750+++ sle11-2009-06-29/include/asm-x86/mach-xen/asm/pci.h 2009-03-16 16:33:40.000000000 +0100
22751@@ -71,6 +71,7 @@ extern int pci_mmap_page_range(struct pc
22752
22753
22754 #ifdef CONFIG_PCI
22755+extern void early_quirks(void);
22756 static inline void pci_dma_burst_advice(struct pci_dev *pdev,
22757 enum pci_dma_burst_strategy *strat,
22758 unsigned long *strategy_parameter)
22759@@ -78,9 +79,10 @@ static inline void pci_dma_burst_advice(
22760 *strat = PCI_DMA_BURST_INFINITY;
22761 *strategy_parameter = ~0UL;
22762 }
22763+#else
22764+static inline void early_quirks(void) { }
22765 #endif
22766
22767-
22768 #endif /* __KERNEL__ */
22769
22770 #ifdef CONFIG_X86_32
22771@@ -95,6 +97,19 @@ static inline void pci_dma_burst_advice(
22772 /* generic pci stuff */
22773 #include <asm-generic/pci.h>
22774
22775+#ifdef CONFIG_NUMA
22776+/* Returns the node based on pci bus */
22777+static inline int __pcibus_to_node(struct pci_bus *bus)
22778+{
22779+ struct pci_sysdata *sd = bus->sysdata;
22780+
22781+ return sd->node;
22782+}
22783
22784+static inline cpumask_t __pcibus_to_cpumask(struct pci_bus *bus)
22785+{
22786+ return node_to_cpumask(__pcibus_to_node(bus));
22787+}
22788+#endif
22789
22790 #endif
22791--- sle11-2009-06-29.orig/include/asm-x86/mach-xen/asm/pgalloc_32.h 2009-02-16 16:17:21.000000000 +0100
22792+++ sle11-2009-06-29/include/asm-x86/mach-xen/asm/pgalloc_32.h 2009-03-16 16:33:40.000000000 +0100
22793@@ -3,69 +3,109 @@
22794
22795 #include <linux/threads.h>
22796 #include <linux/mm.h> /* for struct page */
22797+#include <linux/pagemap.h>
22798+#include <asm/tlb.h>
22799+#include <asm-generic/tlb.h>
22800 #include <asm/io.h> /* for phys_to_virt and page_to_pseudophys */
22801
22802 #define paravirt_alloc_pt(mm, pfn) do { } while (0)
22803-#define paravirt_alloc_pd(pfn) do { } while (0)
22804-#define paravirt_alloc_pd(pfn) do { } while (0)
22805+#define paravirt_alloc_pd(mm, pfn) do { } while (0)
22806 #define paravirt_alloc_pd_clone(pfn, clonepfn, start, count) do { } while (0)
22807 #define paravirt_release_pt(pfn) do { } while (0)
22808 #define paravirt_release_pd(pfn) do { } while (0)
22809
22810-#define pmd_populate_kernel(mm, pmd, pte) \
22811-do { \
22812- paravirt_alloc_pt(mm, __pa(pte) >> PAGE_SHIFT); \
22813- set_pmd(pmd, __pmd(_PAGE_TABLE + __pa(pte))); \
22814-} while (0)
22815-
22816-#define pmd_populate(mm, pmd, pte) \
22817-do { \
22818- unsigned long pfn = page_to_pfn(pte); \
22819- paravirt_alloc_pt(mm, pfn); \
22820- if (PagePinned(virt_to_page((mm)->pgd))) { \
22821- if (!PageHighMem(pte)) \
22822- BUG_ON(HYPERVISOR_update_va_mapping( \
22823- (unsigned long)__va(pfn << PAGE_SHIFT), \
22824- pfn_pte(pfn, PAGE_KERNEL_RO), 0)); \
22825- else if (!test_and_set_bit(PG_pinned, &pte->flags)) \
22826- kmap_flush_unused(); \
22827- set_pmd(pmd, \
22828- __pmd(_PAGE_TABLE + ((paddr_t)pfn << PAGE_SHIFT))); \
22829- } else \
22830- *(pmd) = __pmd(_PAGE_TABLE + ((paddr_t)pfn << PAGE_SHIFT)); \
22831-} while (0)
22832+static inline void pmd_populate_kernel(struct mm_struct *mm,
22833+ pmd_t *pmd, pte_t *pte)
22834+{
22835+ paravirt_alloc_pt(mm, __pa(pte) >> PAGE_SHIFT);
22836+ set_pmd(pmd, __pmd(__pa(pte) | _PAGE_TABLE));
22837+}
22838+
22839+static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, struct page *pte)
22840+{
22841+ unsigned long pfn = page_to_pfn(pte);
22842+
22843+ paravirt_alloc_pt(mm, pfn);
22844+ if (PagePinned(virt_to_page(mm->pgd))) {
22845+ if (!PageHighMem(pte))
22846+ BUG_ON(HYPERVISOR_update_va_mapping(
22847+ (unsigned long)__va(pfn << PAGE_SHIFT),
22848+ pfn_pte(pfn, PAGE_KERNEL_RO), 0));
22849+ else if (!test_and_set_bit(PG_pinned, &pte->flags))
22850+ kmap_flush_unused();
22851+ set_pmd(pmd, __pmd(((pmdval_t)pfn << PAGE_SHIFT) | _PAGE_TABLE));
22852+ } else
22853+ *pmd = __pmd(((pmdval_t)pfn << PAGE_SHIFT) | _PAGE_TABLE);
22854+}
22855+#define pmd_pgtable(pmd) pmd_page(pmd)
22856
22857 /*
22858 * Allocate and free page tables.
22859 */
22860+extern void pgd_test_and_unpin(pgd_t *);
22861 extern pgd_t *pgd_alloc(struct mm_struct *);
22862-extern void pgd_free(pgd_t *pgd);
22863+extern void pgd_free(struct mm_struct *mm, pgd_t *pgd);
22864
22865 extern pte_t *pte_alloc_one_kernel(struct mm_struct *, unsigned long);
22866-extern struct page *pte_alloc_one(struct mm_struct *, unsigned long);
22867+extern pgtable_t pte_alloc_one(struct mm_struct *, unsigned long);
22868
22869-static inline void pte_free_kernel(pte_t *pte)
22870+static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
22871 {
22872 make_lowmem_page_writable(pte, XENFEAT_writable_page_tables);
22873 free_page((unsigned long)pte);
22874 }
22875
22876-extern void pte_free(struct page *pte);
22877+extern void __pte_free(pgtable_t);
22878+static inline void pte_free(struct mm_struct *mm, pgtable_t pte)
22879+{
22880+ __pte_free(pte);
22881+}
22882+
22883
22884-#define __pte_free_tlb(tlb,pte) \
22885-do { \
22886- paravirt_release_pt(page_to_pfn(pte)); \
22887- tlb_remove_page((tlb),(pte)); \
22888-} while (0)
22889+extern void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte);
22890
22891 #ifdef CONFIG_X86_PAE
22892 /*
22893 * In the PAE case we free the pmds as part of the pgd.
22894 */
22895-#define pmd_alloc_one(mm, addr) ({ BUG(); ((pmd_t *)2); })
22896-#define pmd_free(x) do { } while (0)
22897-#define __pmd_free_tlb(tlb,x) do { } while (0)
22898-#define pud_populate(mm, pmd, pte) BUG()
22899-#endif
22900+extern pmd_t *pmd_alloc_one(struct mm_struct *, unsigned long);
22901+
22902+extern void __pmd_free(pgtable_t);
22903+static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
22904+{
22905+ BUG_ON((unsigned long)pmd & (PAGE_SIZE-1));
22906+ __pmd_free(virt_to_page(pmd));
22907+}
22908+
22909+extern void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd);
22910+
22911+static inline void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
22912+{
22913+ struct page *page = virt_to_page(pmd);
22914+ unsigned long pfn = page_to_pfn(page);
22915+
22916+ paravirt_alloc_pd(mm, pfn);
22917+
22918+ /* Note: almost everything apart from _PAGE_PRESENT is
22919+ reserved at the pmd (PDPT) level. */
22920+ if (PagePinned(virt_to_page(mm->pgd))) {
22921+ BUG_ON(PageHighMem(page));
22922+ BUG_ON(HYPERVISOR_update_va_mapping(
22923+ (unsigned long)__va(pfn << PAGE_SHIFT),
22924+ pfn_pte(pfn, PAGE_KERNEL_RO), 0));
22925+ set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT));
22926+ } else
22927+ *pudp = __pud(__pa(pmd) | _PAGE_PRESENT);
22928+
22929+ /*
22930+ * According to Intel App note "TLBs, Paging-Structure Caches,
22931+ * and Their Invalidation", April 2007, document 317080-001,
22932+ * section 8.1: in PAE mode we explicitly have to flush the
22933+ * TLB via cr3 if the top-level pgd is changed...
22934+ */
22935+ if (mm == current->active_mm)
22936+ xen_tlb_flush();
22937+}
22938+#endif /* CONFIG_X86_PAE */
22939
22940 #endif /* _I386_PGALLOC_H */
22941--- sle11-2009-06-29.orig/include/asm-x86/mach-xen/asm/pgalloc_64.h 2009-02-16 16:18:36.000000000 +0100
22942+++ sle11-2009-06-29/include/asm-x86/mach-xen/asm/pgalloc_64.h 2009-03-16 16:33:40.000000000 +0100
22943@@ -6,30 +6,13 @@
22944 #include <linux/mm.h>
22945 #include <asm/io.h> /* for phys_to_virt and page_to_pseudophys */
22946
22947-#include <xen/features.h>
22948-void make_page_readonly(void *va, unsigned int feature);
22949-void make_page_writable(void *va, unsigned int feature);
22950-void make_pages_readonly(void *va, unsigned int nr, unsigned int feature);
22951-void make_pages_writable(void *va, unsigned int nr, unsigned int feature);
22952+pmd_t *early_get_pmd(unsigned long va);
22953+void early_make_page_readonly(void *va, unsigned int feature);
22954
22955 #define __user_pgd(pgd) ((pgd) + PTRS_PER_PGD)
22956
22957-static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, pte_t *pte)
22958-{
22959- set_pmd(pmd, __pmd(_PAGE_TABLE | __pa(pte)));
22960-}
22961-
22962-static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, struct page *pte)
22963-{
22964- if (unlikely(PagePinned(virt_to_page((mm)->pgd)))) {
22965- BUG_ON(HYPERVISOR_update_va_mapping(
22966- (unsigned long)__va(page_to_pfn(pte) << PAGE_SHIFT),
22967- pfn_pte(page_to_pfn(pte), PAGE_KERNEL_RO), 0));
22968- set_pmd(pmd, __pmd(_PAGE_TABLE | (page_to_pfn(pte) << PAGE_SHIFT)));
22969- } else {
22970- *(pmd) = __pmd(_PAGE_TABLE | (page_to_pfn(pte) << PAGE_SHIFT));
22971- }
22972-}
22973+#define pmd_populate_kernel(mm, pmd, pte) \
22974+ set_pmd(pmd, __pmd(_PAGE_TABLE | __pa(pte)))
22975
22976 static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
22977 {
22978@@ -63,53 +46,58 @@ static inline void pgd_populate(struct m
22979 }
22980 }
22981
22982-extern struct page *pte_alloc_one(struct mm_struct *mm, unsigned long addr);
22983-extern void pte_free(struct page *pte);
22984+#define pmd_pgtable(pmd) pmd_page(pmd)
22985
22986-static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
22987+static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, struct page *pte)
22988 {
22989- struct page *pg;
22990-
22991- pg = pte_alloc_one(mm, addr);
22992- return pg ? page_address(pg) : NULL;
22993+ if (unlikely(PagePinned(virt_to_page((mm)->pgd)))) {
22994+ BUG_ON(HYPERVISOR_update_va_mapping(
22995+ (unsigned long)__va(page_to_pfn(pte) << PAGE_SHIFT),
22996+ pfn_pte(page_to_pfn(pte), PAGE_KERNEL_RO), 0));
22997+ set_pmd(pmd, __pmd(_PAGE_TABLE | (page_to_pfn(pte) << PAGE_SHIFT)));
22998+ } else {
22999+ *(pmd) = __pmd(_PAGE_TABLE | (page_to_pfn(pte) << PAGE_SHIFT));
23000+ }
23001 }
23002
23003-static inline void pmd_free(pmd_t *pmd)
23004+extern void __pmd_free(pgtable_t);
23005+static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
23006 {
23007 BUG_ON((unsigned long)pmd & (PAGE_SIZE-1));
23008- pte_free(virt_to_page(pmd));
23009+ __pmd_free(virt_to_page(pmd));
23010 }
23011
23012+extern pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr);
23013+
23014 static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
23015 {
23016- struct page *pg;
23017-
23018- pg = pte_alloc_one(mm, addr);
23019- return pg ? page_address(pg) : NULL;
23020+ return (pud_t *)pmd_alloc_one(mm, addr);
23021 }
23022
23023-static inline void pud_free(pud_t *pud)
23024+static inline void pud_free(struct mm_struct *mm, pud_t *pud)
23025 {
23026 BUG_ON((unsigned long)pud & (PAGE_SIZE-1));
23027- pte_free(virt_to_page(pud));
23028+ __pmd_free(virt_to_page(pud));
23029 }
23030
23031 static inline void pgd_list_add(pgd_t *pgd)
23032 {
23033 struct page *page = virt_to_page(pgd);
23034+ unsigned long flags;
23035
23036- spin_lock(&pgd_lock);
23037+ spin_lock_irqsave(&pgd_lock, flags);
23038 list_add(&page->lru, &pgd_list);
23039- spin_unlock(&pgd_lock);
23040+ spin_unlock_irqrestore(&pgd_lock, flags);
23041 }
23042
23043 static inline void pgd_list_del(pgd_t *pgd)
23044 {
23045 struct page *page = virt_to_page(pgd);
23046+ unsigned long flags;
23047
23048- spin_lock(&pgd_lock);
23049+ spin_lock_irqsave(&pgd_lock, flags);
23050 list_del(&page->lru);
23051- spin_unlock(&pgd_lock);
23052+ spin_unlock_irqrestore(&pgd_lock, flags);
23053 }
23054
23055 extern void pgd_test_and_unpin(pgd_t *);
23056@@ -145,7 +133,7 @@ static inline pgd_t *pgd_alloc(struct mm
23057 return pgd;
23058 }
23059
23060-static inline void pgd_free(pgd_t *pgd)
23061+static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
23062 {
23063 pgd_test_and_unpin(pgd);
23064 pgd_list_del(pgd);
23065@@ -161,17 +149,30 @@ static inline pte_t *pte_alloc_one_kerne
23066 return pte;
23067 }
23068
23069+extern pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long addr);
23070+
23071 /* Should really implement gc for free page table pages. This could be
23072 done with a reference count in struct page. */
23073
23074-static inline void pte_free_kernel(pte_t *pte)
23075+static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
23076 {
23077 BUG_ON((unsigned long)pte & (PAGE_SIZE-1));
23078 make_page_writable(pte, XENFEAT_writable_page_tables);
23079 free_page((unsigned long)pte);
23080 }
23081
23082-#define __pte_free_tlb(tlb,pte) tlb_remove_page((tlb),(pte))
23083+extern void __pte_free(pgtable_t);
23084+static inline void pte_free(struct mm_struct *mm, pgtable_t pte)
23085+{
23086+ __pte_free(pte);
23087+}
23088+
23089+#define __pte_free_tlb(tlb,pte) \
23090+do { \
23091+ pgtable_page_dtor((pte)); \
23092+ tlb_remove_page((tlb), (pte)); \
23093+} while (0)
23094+
23095 #define __pmd_free_tlb(tlb,x) tlb_remove_page((tlb),virt_to_page(x))
23096 #define __pud_free_tlb(tlb,x) tlb_remove_page((tlb),virt_to_page(x))
23097
23098--- sle11-2009-06-29.orig/include/asm-x86/mach-xen/asm/pgtable.h 2009-02-16 16:18:36.000000000 +0100
23099+++ sle11-2009-06-29/include/asm-x86/mach-xen/asm/pgtable.h 2009-03-16 16:33:40.000000000 +0100
23100@@ -1,5 +1,467 @@
23101+#ifndef _ASM_X86_PGTABLE_H
23102+#define _ASM_X86_PGTABLE_H
23103+
23104+#define USER_PTRS_PER_PGD ((TASK_SIZE-1)/PGDIR_SIZE+1)
23105+#define FIRST_USER_ADDRESS 0
23106+
23107+#define _PAGE_BIT_PRESENT 0
23108+#define _PAGE_BIT_RW 1
23109+#define _PAGE_BIT_USER 2
23110+#define _PAGE_BIT_PWT 3
23111+#define _PAGE_BIT_PCD 4
23112+#define _PAGE_BIT_ACCESSED 5
23113+#define _PAGE_BIT_DIRTY 6
23114+#define _PAGE_BIT_FILE 6
23115+#define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page */
23116+#define _PAGE_BIT_PAT 7 /* on 4KB pages */
23117+#define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */
23118+#define _PAGE_BIT_IO 9 /* Mapped page is I/O or foreign and
23119+ * has no associated page struct. */
23120+#define _PAGE_BIT_UNUSED2 10 /* available for programmer */
23121+#define _PAGE_BIT_UNUSED3 11
23122+#define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */
23123+#define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */
23124+
23125+/*
23126+ * Note: we use _AC(1, L) instead of _AC(1, UL) so that we get a
23127+ * sign-extended value on 32-bit with all 1's in the upper word,
23128+ * which preserves the upper pte values on 64-bit ptes:
23129+ */
23130+#define _PAGE_PRESENT (_AC(1, L)<<_PAGE_BIT_PRESENT)
23131+#define _PAGE_RW (_AC(1, L)<<_PAGE_BIT_RW)
23132+#define _PAGE_USER (_AC(1, L)<<_PAGE_BIT_USER)
23133+#define _PAGE_PWT (_AC(1, L)<<_PAGE_BIT_PWT)
23134+#define _PAGE_PCD (_AC(1, L)<<_PAGE_BIT_PCD)
23135+#define _PAGE_ACCESSED (_AC(1, L)<<_PAGE_BIT_ACCESSED)
23136+#define _PAGE_DIRTY (_AC(1, L)<<_PAGE_BIT_DIRTY)
23137+#define _PAGE_PSE (_AC(1, L)<<_PAGE_BIT_PSE) /* 2MB page */
23138+#define _PAGE_GLOBAL (_AC(1, L)<<_PAGE_BIT_GLOBAL) /* Global TLB entry */
23139+#define _PAGE_IO (_AC(1, L)<<_PAGE_BIT_IO)
23140+#define _PAGE_UNUSED2 (_AC(1, L)<<_PAGE_BIT_UNUSED2)
23141+#define _PAGE_UNUSED3 (_AC(1, L)<<_PAGE_BIT_UNUSED3)
23142+#define _PAGE_PAT (_AC(1, L)<<_PAGE_BIT_PAT)
23143+#define _PAGE_PAT_LARGE (_AC(1, L)<<_PAGE_BIT_PAT_LARGE)
23144+
23145+#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
23146+#define _PAGE_NX (_AC(1, ULL) << _PAGE_BIT_NX)
23147+#else
23148+#define _PAGE_NX 0
23149+#endif
23150+
23151+/* If _PAGE_PRESENT is clear, we use these: */
23152+#define _PAGE_FILE _PAGE_DIRTY /* nonlinear file mapping, saved PTE; unset:swap */
23153+#define _PAGE_PROTNONE _PAGE_PSE /* if the user mapped it with PROT_NONE;
23154+ pte_present gives true */
23155+
23156+#ifndef __ASSEMBLY__
23157+#if defined(CONFIG_X86_64) && CONFIG_XEN_COMPAT <= 0x030002
23158+extern unsigned int __kernel_page_user;
23159+#else
23160+#define __kernel_page_user 0
23161+#endif
23162+#endif
23163+
23164+#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY)
23165+#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY | __kernel_page_user)
23166+
23167+#define _PAGE_CHG_MASK (PTE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_IO)
23168+
23169+#define PAGE_NONE __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED)
23170+#define PAGE_SHARED __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
23171+
23172+#define PAGE_SHARED_EXEC __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED)
23173+#define PAGE_COPY_NOEXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
23174+#define PAGE_COPY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
23175+#define PAGE_COPY PAGE_COPY_NOEXEC
23176+#define PAGE_READONLY __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
23177+#define PAGE_READONLY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
23178+
23179+#ifdef CONFIG_X86_32
23180+#define _PAGE_KERNEL_EXEC \
23181+ (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED)
23182+#define _PAGE_KERNEL (_PAGE_KERNEL_EXEC | _PAGE_NX)
23183+
23184+#ifndef __ASSEMBLY__
23185+extern pteval_t __PAGE_KERNEL, __PAGE_KERNEL_EXEC;
23186+#endif /* __ASSEMBLY__ */
23187+#else
23188+#define __PAGE_KERNEL_EXEC \
23189+ (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | __kernel_page_user)
23190+#define __PAGE_KERNEL (__PAGE_KERNEL_EXEC | _PAGE_NX)
23191+#endif
23192+
23193+#define __PAGE_KERNEL_RO (__PAGE_KERNEL & ~_PAGE_RW)
23194+#define __PAGE_KERNEL_RX (__PAGE_KERNEL_EXEC & ~_PAGE_RW)
23195+#define __PAGE_KERNEL_EXEC_NOCACHE (__PAGE_KERNEL_EXEC | _PAGE_PCD | _PAGE_PWT)
23196+#define __PAGE_KERNEL_NOCACHE (__PAGE_KERNEL | _PAGE_PCD | _PAGE_PWT)
23197+#define __PAGE_KERNEL_UC_MINUS (__PAGE_KERNEL | _PAGE_PCD)
23198+#define __PAGE_KERNEL_VSYSCALL (__PAGE_KERNEL_RX | _PAGE_USER)
23199+#define __PAGE_KERNEL_VSYSCALL_NOCACHE (__PAGE_KERNEL_VSYSCALL | _PAGE_PCD | _PAGE_PWT)
23200+#define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE)
23201+#define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE)
23202+
23203+/*
23204+ * We don't support GLOBAL page in xenolinux64
23205+ */
23206+#define MAKE_GLOBAL(x) __pgprot((x))
23207+
23208+#define PAGE_KERNEL MAKE_GLOBAL(__PAGE_KERNEL)
23209+#define PAGE_KERNEL_RO MAKE_GLOBAL(__PAGE_KERNEL_RO)
23210+#define PAGE_KERNEL_EXEC MAKE_GLOBAL(__PAGE_KERNEL_EXEC)
23211+#define PAGE_KERNEL_RX MAKE_GLOBAL(__PAGE_KERNEL_RX)
23212+#define PAGE_KERNEL_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_NOCACHE)
23213+#define PAGE_KERNEL_UC_MINUS MAKE_GLOBAL(__PAGE_KERNEL_UC_MINUS)
23214+#define PAGE_KERNEL_EXEC_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_EXEC_NOCACHE)
23215+#define PAGE_KERNEL_LARGE MAKE_GLOBAL(__PAGE_KERNEL_LARGE)
23216+#define PAGE_KERNEL_LARGE_EXEC MAKE_GLOBAL(__PAGE_KERNEL_LARGE_EXEC)
23217+#define PAGE_KERNEL_VSYSCALL MAKE_GLOBAL(__PAGE_KERNEL_VSYSCALL)
23218+#define PAGE_KERNEL_VSYSCALL_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_VSYSCALL_NOCACHE)
23219+
23220+/* xwr */
23221+#define __P000 PAGE_NONE
23222+#define __P001 PAGE_READONLY
23223+#define __P010 PAGE_COPY
23224+#define __P011 PAGE_COPY
23225+#define __P100 PAGE_READONLY_EXEC
23226+#define __P101 PAGE_READONLY_EXEC
23227+#define __P110 PAGE_COPY_EXEC
23228+#define __P111 PAGE_COPY_EXEC
23229+
23230+#define __S000 PAGE_NONE
23231+#define __S001 PAGE_READONLY
23232+#define __S010 PAGE_SHARED
23233+#define __S011 PAGE_SHARED
23234+#define __S100 PAGE_READONLY_EXEC
23235+#define __S101 PAGE_READONLY_EXEC
23236+#define __S110 PAGE_SHARED_EXEC
23237+#define __S111 PAGE_SHARED_EXEC
23238+
23239+#ifndef __ASSEMBLY__
23240+
23241+/*
23242+ * ZERO_PAGE is a global shared page that is always zero: used
23243+ * for zero-mapped memory areas etc..
23244+ */
23245+extern unsigned long empty_zero_page[PAGE_SIZE/sizeof(unsigned long)];
23246+#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page))
23247+
23248+extern spinlock_t pgd_lock;
23249+extern struct list_head pgd_list;
23250+
23251+/*
23252+ * The following only work if pte_present() is true.
23253+ * Undefined behaviour if not..
23254+ */
23255+static inline int pte_dirty(pte_t pte) { return __pte_val(pte) & _PAGE_DIRTY; }
23256+static inline int pte_young(pte_t pte) { return __pte_val(pte) & _PAGE_ACCESSED; }
23257+static inline int pte_write(pte_t pte) { return __pte_val(pte) & _PAGE_RW; }
23258+static inline int pte_file(pte_t pte) { return __pte_val(pte) & _PAGE_FILE; }
23259+static inline int pte_huge(pte_t pte) { return __pte_val(pte) & _PAGE_PSE; }
23260+static inline int pte_global(pte_t pte) { return 0; }
23261+static inline int pte_exec(pte_t pte) { return !(__pte_val(pte) & _PAGE_NX); }
23262+
23263+static inline int pmd_large(pmd_t pte) {
23264+ return (__pmd_val(pte) & (_PAGE_PSE|_PAGE_PRESENT)) ==
23265+ (_PAGE_PSE|_PAGE_PRESENT);
23266+}
23267+
23268+static inline pte_t pte_mkclean(pte_t pte) { return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_DIRTY); }
23269+static inline pte_t pte_mkold(pte_t pte) { return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_ACCESSED); }
23270+static inline pte_t pte_wrprotect(pte_t pte) { return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_RW); }
23271+static inline pte_t pte_mkexec(pte_t pte) { return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_NX); }
23272+static inline pte_t pte_mkdirty(pte_t pte) { return __pte_ma(__pte_val(pte) | _PAGE_DIRTY); }
23273+static inline pte_t pte_mkyoung(pte_t pte) { return __pte_ma(__pte_val(pte) | _PAGE_ACCESSED); }
23274+static inline pte_t pte_mkwrite(pte_t pte) { return __pte_ma(__pte_val(pte) | _PAGE_RW); }
23275+static inline pte_t pte_mkhuge(pte_t pte) { return __pte_ma(__pte_val(pte) | _PAGE_PSE); }
23276+static inline pte_t pte_clrhuge(pte_t pte) { return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_PSE); }
23277+static inline pte_t pte_mkglobal(pte_t pte) { return pte; }
23278+static inline pte_t pte_clrglobal(pte_t pte) { return pte; }
23279+
23280+extern pteval_t __supported_pte_mask;
23281+
23282+static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot)
23283+{
23284+ return __pte((((phys_addr_t)page_nr << PAGE_SHIFT) |
23285+ pgprot_val(pgprot)) & __supported_pte_mask);
23286+}
23287+
23288+static inline pte_t pfn_pte_ma(unsigned long page_nr, pgprot_t pgprot)
23289+{
23290+ return __pte_ma((((phys_addr_t)page_nr << PAGE_SHIFT) |
23291+ pgprot_val(pgprot)) & __supported_pte_mask);
23292+}
23293+
23294+static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot)
23295+{
23296+ return __pmd((((phys_addr_t)page_nr << PAGE_SHIFT) |
23297+ pgprot_val(pgprot)) & __supported_pte_mask);
23298+}
23299+
23300+static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
23301+{
23302+ pteval_t val = pte_val(pte);
23303+
23304+ val &= _PAGE_CHG_MASK;
23305+ val |= pgprot_val(newprot) & __supported_pte_mask;
23306+
23307+ return __pte(val);
23308+}
23309+
23310+#define pte_pgprot(x) __pgprot(pte_val(x) & (0xfff | _PAGE_NX))
23311+
23312+#define canon_pgprot(p) __pgprot(pgprot_val(p) & __supported_pte_mask)
23313+
23314+#define set_pte(ptep, pte) xen_set_pte(ptep, pte)
23315+#define set_pte_at(mm, addr, ptep, pte) xen_set_pte_at(mm, addr, ptep, pte)
23316+
23317+#define set_pte_atomic(ptep, pte) \
23318+ xen_set_pte_atomic(ptep, pte)
23319+
23320+#define set_pmd(pmdp, pmd) xen_set_pmd(pmdp, pmd)
23321+
23322+#ifndef __PAGETABLE_PUD_FOLDED
23323+#define set_pgd(pgdp, pgd) xen_set_pgd(pgdp, pgd)
23324+#define pgd_clear(pgd) xen_pgd_clear(pgd)
23325+#endif
23326+
23327+#ifndef set_pud
23328+# define set_pud(pudp, pud) xen_set_pud(pudp, pud)
23329+#endif
23330+
23331+#ifndef __PAGETABLE_PMD_FOLDED
23332+#define pud_clear(pud) xen_pud_clear(pud)
23333+#endif
23334+
23335+#define pte_clear(mm, addr, ptep) xen_pte_clear(mm, addr, ptep)
23336+#define pmd_clear(pmd) xen_pmd_clear(pmd)
23337+
23338+#define pte_update(mm, addr, ptep) do { } while (0)
23339+#define pte_update_defer(mm, addr, ptep) do { } while (0)
23340+
23341+#endif /* __ASSEMBLY__ */
23342+
23343 #ifdef CONFIG_X86_32
23344 # include "pgtable_32.h"
23345 #else
23346 # include "pgtable_64.h"
23347 #endif
23348+
23349+#ifndef __ASSEMBLY__
23350+
23351+enum {
23352+ PG_LEVEL_NONE,
23353+ PG_LEVEL_4K,
23354+ PG_LEVEL_2M,
23355+ PG_LEVEL_1G,
23356+};
23357+
23358+/*
23359+ * Helper function that returns the kernel pagetable entry controlling
23360+ * the virtual address 'address'. NULL means no pagetable entry present.
23361+ * NOTE: the return type is pte_t but if the pmd is PSE then we return it
23362+ * as a pte too.
23363+ */
23364+extern pte_t *lookup_address(unsigned long address, unsigned int *level);
23365+
23366+/* local pte updates need not use xchg for locking */
23367+static inline pte_t xen_local_ptep_get_and_clear(pte_t *ptep, pte_t res)
23368+{
23369+ xen_set_pte(ptep, __pte(0));
23370+ return res;
23371+}
23372+
23373+static inline void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
23374+ pte_t *ptep , pte_t pte)
23375+{
23376+ if ((mm != current->mm && mm != &init_mm) ||
23377+ HYPERVISOR_update_va_mapping(addr, pte, 0))
23378+ xen_set_pte(ptep, pte);
23379+}
23380+
23381+static inline void xen_pte_clear(struct mm_struct *mm, unsigned long addr,
23382+ pte_t *ptep)
23383+{
23384+ if ((mm != current->mm && mm != &init_mm)
23385+ || HYPERVISOR_update_va_mapping(addr, __pte(0), 0))
23386+ __xen_pte_clear(ptep);
23387+}
23388+
23389+#ifndef CONFIG_PARAVIRT
23390+/*
23391+ * Rules for using pte_update - it must be called after any PTE update which
23392+ * has not been done using the set_pte / clear_pte interfaces. It is used by
23393+ * shadow mode hypervisors to resynchronize the shadow page tables. Kernel PTE
23394+ * updates should either be sets, clears, or set_pte_atomic for P->P
23395+ * transitions, which means this hook should only be called for user PTEs.
23396+ * This hook implies a P->P protection or access change has taken place, which
23397+ * requires a subsequent TLB flush. The notification can optionally be delayed
23398+ * until the TLB flush event by using the pte_update_defer form of the
23399+ * interface, but care must be taken to assure that the flush happens while
23400+ * still holding the same page table lock so that the shadow and primary pages
23401+ * do not become out of sync on SMP.
23402+ */
23403+#define pte_update(mm, addr, ptep) do { } while (0)
23404+#define pte_update_defer(mm, addr, ptep) do { } while (0)
23405+#endif
23406+
23407+/*
23408+ * We only update the dirty/accessed state if we set
23409+ * the dirty bit by hand in the kernel, since the hardware
23410+ * will do the accessed bit for us, and we don't want to
23411+ * race with other CPU's that might be updating the dirty
23412+ * bit at the same time.
23413+ */
23414+#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
23415+#define ptep_set_access_flags(vma, address, ptep, entry, dirty) \
23416+({ \
23417+ int __changed = !pte_same(*(ptep), entry); \
23418+ if (__changed && (dirty)) { \
23419+ if ( likely((vma)->vm_mm == current->mm) ) { \
23420+ BUG_ON(HYPERVISOR_update_va_mapping(address, \
23421+ entry, \
23422+ (unsigned long)(vma)->vm_mm->cpu_vm_mask.bits| \
23423+ UVMF_INVLPG|UVMF_MULTI)); \
23424+ } else { \
23425+ xen_l1_entry_update(ptep, entry); \
23426+ flush_tlb_page(vma, address); \
23427+ } \
23428+ } \
23429+ __changed; \
23430+})
23431+
23432+#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
23433+#define ptep_test_and_clear_young(vma, addr, ptep) ({ \
23434+ int __ret = 0; \
23435+ if (pte_young(*(ptep))) \
23436+ __ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, \
23437+ &(ptep)->pte); \
23438+ if (__ret) \
23439+ pte_update((vma)->vm_mm, addr, ptep); \
23440+ __ret; \
23441+})
23442+
23443+#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
23444+#define ptep_clear_flush_young(vma, address, ptep) \
23445+({ \
23446+ pte_t __pte = *(ptep); \
23447+ int __young = pte_young(__pte); \
23448+ __pte = pte_mkold(__pte); \
23449+ if (PagePinned(virt_to_page((vma)->vm_mm->pgd))) \
23450+ (void)ptep_set_access_flags(vma, address, ptep, __pte, __young); \
23451+ else if (__young) \
23452+ (ptep)->pte_low = __pte.pte_low; \
23453+ __young; \
23454+})
23455+
23456+#define __HAVE_ARCH_PTEP_CLEAR_FLUSH
23457+#define ptep_clear_flush(vma, addr, ptep) \
23458+({ \
23459+ pte_t *__ptep = (ptep); \
23460+ pte_t __res = *__ptep; \
23461+ if (!pte_none(__res) && \
23462+ ((vma)->vm_mm != current->mm || \
23463+ HYPERVISOR_update_va_mapping(addr, __pte(0), \
23464+ (unsigned long)(vma)->vm_mm->cpu_vm_mask.bits| \
23465+ UVMF_INVLPG|UVMF_MULTI))) { \
23466+ __xen_pte_clear(__ptep); \
23467+ flush_tlb_page(vma, addr); \
23468+ } \
23469+ __res; \
23470+})
23471+
23472+#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
23473+static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
23474+{
23475+ pte_t pte = *ptep;
23476+ if (!pte_none(pte)
23477+ && (mm != &init_mm
23478+ || HYPERVISOR_update_va_mapping(addr, __pte(0), 0))) {
23479+ pte = xen_ptep_get_and_clear(ptep, pte);
23480+ pte_update(mm, addr, ptep);
23481+ }
23482+ return pte;
23483+}
23484+
23485+#define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
23486+#define ptep_get_and_clear_full(mm, addr, ptep, full) \
23487+ ((full) ? ({ \
23488+ pte_t *__ptep = (ptep); \
23489+ pte_t __res = *__ptep; \
23490+ if (!PagePinned(virt_to_page((mm)->pgd))) \
23491+ __xen_pte_clear(__ptep); \
23492+ else if (!pte_none(__res)) \
23493+ xen_l1_entry_update(__ptep, __pte(0)); \
23494+ __res; \
23495+ }) : \
23496+ ptep_get_and_clear(mm, addr, ptep))
23497+
23498+pte_t xen_ptep_get_and_clear_full(struct vm_area_struct *, unsigned long, pte_t *, int);
23499+
23500+#define __HAVE_ARCH_PTEP_SET_WRPROTECT
23501+static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
23502+{
23503+ pte_t pte = *ptep;
23504+ if (pte_write(pte))
23505+ set_pte_at(mm, addr, ptep, pte_wrprotect(pte));
23506+}
23507+
23508+#define arch_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable) \
23509+ xen_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable)
23510+
23511+#define arbitrary_virt_to_machine(va) \
23512+({ \
23513+ unsigned int __lvl; \
23514+ pte_t *__ptep = lookup_address((unsigned long)(va), &__lvl); \
23515+ BUG_ON(!__ptep || __lvl != PG_LEVEL_4K || !pte_present(*__ptep));\
23516+ (((maddr_t)pte_mfn(*__ptep) << PAGE_SHIFT) \
23517+ | ((unsigned long)(va) & (PAGE_SIZE - 1))); \
23518+})
23519+
23520+#ifdef CONFIG_HIGHPTE
23521+#include <asm/io.h>
23522+struct page *kmap_atomic_to_page(void *);
23523+#define ptep_to_machine(ptep) \
23524+({ \
23525+ pte_t *__ptep = (ptep); \
23526+ page_to_phys(kmap_atomic_to_page(__ptep)) \
23527+ | ((unsigned long)__ptep & (PAGE_SIZE - 1)); \
23528+})
23529+#else
23530+#define ptep_to_machine(ptep) virt_to_machine(ptep)
23531+#endif
23532+
23533+#include <asm-generic/pgtable.h>
23534+
23535+#include <xen/features.h>
23536+void make_page_readonly(void *va, unsigned int feature);
23537+void make_page_writable(void *va, unsigned int feature);
23538+void make_pages_readonly(void *va, unsigned int nr, unsigned int feature);
23539+void make_pages_writable(void *va, unsigned int nr, unsigned int feature);
23540+
23541+struct vm_area_struct;
23542+
23543+int direct_remap_pfn_range(struct vm_area_struct *vma,
23544+ unsigned long address,
23545+ unsigned long mfn,
23546+ unsigned long size,
23547+ pgprot_t prot,
23548+ domid_t domid);
23549+int direct_kernel_remap_pfn_range(unsigned long address,
23550+ unsigned long mfn,
23551+ unsigned long size,
23552+ pgprot_t prot,
23553+ domid_t domid);
23554+int create_lookup_pte_addr(struct mm_struct *mm,
23555+ unsigned long address,
23556+ uint64_t *ptep);
23557+int touch_pte_range(struct mm_struct *mm,
23558+ unsigned long address,
23559+ unsigned long size);
23560+
23561+int xen_change_pte_range(struct mm_struct *mm, pmd_t *pmd,
23562+ unsigned long addr, unsigned long end, pgprot_t newprot,
23563+ int dirty_accountable);
23564+
23565+#endif /* __ASSEMBLY__ */
23566+
23567+#endif /* _ASM_X86_PGTABLE_H */
23568--- sle11-2009-06-29.orig/include/asm-x86/mach-xen/asm/pgtable-3level.h 2009-02-16 16:17:21.000000000 +0100
23569+++ sle11-2009-06-29/include/asm-x86/mach-xen/asm/pgtable-3level.h 2009-03-16 16:33:40.000000000 +0100
23570@@ -18,16 +18,18 @@
23571 printk("%s:%d: bad pgd %p(%016Lx pfn %08Lx).\n", __FILE__, __LINE__, \
23572 &(e), __pgd_val(e), (pgd_val(e) & PTE_MASK) >> PAGE_SHIFT)
23573
23574-#define pud_none(pud) 0
23575-#define pud_bad(pud) 0
23576-#define pud_present(pud) 1
23577
23578-/*
23579- * All present pages with !NX bit are kernel-executable:
23580- */
23581-static inline int pte_exec_kernel(pte_t pte)
23582+static inline int pud_none(pud_t pud)
23583+{
23584+ return __pud_val(pud) == 0;
23585+}
23586+static inline int pud_bad(pud_t pud)
23587 {
23588- return !(__pte_val(pte) & _PAGE_NX);
23589+ return (__pud_val(pud) & ~(PTE_MASK | _KERNPG_TABLE | _PAGE_USER)) != 0;
23590+}
23591+static inline int pud_present(pud_t pud)
23592+{
23593+ return __pud_val(pud) & _PAGE_PRESENT;
23594 }
23595
23596 /* Rules for using set_pte: the pte being assigned *must* be
23597@@ -44,14 +46,6 @@ static inline void xen_set_pte(pte_t *pt
23598 ptep->pte_low = pte.pte_low;
23599 }
23600
23601-static inline void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
23602- pte_t *ptep , pte_t pte)
23603-{
23604- if ((mm != current->mm && mm != &init_mm) ||
23605- HYPERVISOR_update_va_mapping(addr, pte, 0))
23606- xen_set_pte(ptep, pte);
23607-}
23608-
23609 static inline void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
23610 {
23611 set_64bit((unsigned long long *)(ptep),__pte_val(pte));
23612@@ -70,14 +64,11 @@ static inline void xen_set_pud(pud_t *pu
23613 * entry, so clear the bottom half first and enforce ordering with a compiler
23614 * barrier.
23615 */
23616-static inline void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
23617+static inline void __xen_pte_clear(pte_t *ptep)
23618 {
23619- if ((mm != current->mm && mm != &init_mm)
23620- || HYPERVISOR_update_va_mapping(addr, __pte(0), 0)) {
23621- ptep->pte_low = 0;
23622- smp_wmb();
23623- ptep->pte_high = 0;
23624- }
23625+ ptep->pte_low = 0;
23626+ smp_wmb();
23627+ ptep->pte_high = 0;
23628 }
23629
23630 static inline void xen_pmd_clear(pmd_t *pmd)
23631@@ -85,21 +76,25 @@ static inline void xen_pmd_clear(pmd_t *
23632 xen_l2_entry_update(pmd, __pmd(0));
23633 }
23634
23635-#define set_pte(ptep, pte) xen_set_pte(ptep, pte)
23636-#define set_pte_at(mm, addr, ptep, pte) xen_set_pte_at(mm, addr, ptep, pte)
23637-#define set_pte_atomic(ptep, pte) xen_set_pte_atomic(ptep, pte)
23638-#define set_pmd(pmdp, pmd) xen_set_pmd(pmdp, pmd)
23639-#define set_pud(pudp, pud) xen_set_pud(pudp, pud)
23640-#define pte_clear(mm, addr, ptep) xen_pte_clear(mm, addr, ptep)
23641-#define pmd_clear(pmd) xen_pmd_clear(pmd)
23642+static inline void pud_clear(pud_t *pudp)
23643+{
23644+ pgdval_t pgd;
23645+
23646+ set_pud(pudp, __pud(0));
23647
23648-/*
23649- * Pentium-II erratum A13: in PAE mode we explicitly have to flush
23650- * the TLB via cr3 if the top-level pgd is changed...
23651- * We do not let the generic code free and clear pgd entries due to
23652- * this erratum.
23653- */
23654-static inline void pud_clear (pud_t * pud) { }
23655+ /*
23656+ * According to Intel App note "TLBs, Paging-Structure Caches,
23657+ * and Their Invalidation", April 2007, document 317080-001,
23658+ * section 8.1: in PAE mode we explicitly have to flush the
23659+ * TLB via cr3 if the top-level pgd is changed...
23660+ *
23661+ * Make sure the pud entry we're updating is within the
23662+ * current pgd to avoid unnecessary TLB flushes.
23663+ */
23664+ pgd = read_cr3();
23665+ if (__pa(pudp) >= pgd && __pa(pudp) < (pgd + sizeof(pgd_t)*PTRS_PER_PGD))
23666+ xen_tlb_flush();
23667+}
23668
23669 #define pud_page(pud) \
23670 ((struct page *) __va(pud_val(pud) & PAGE_MASK))
23671@@ -128,24 +123,6 @@ static inline pte_t xen_ptep_get_and_cle
23672 #define xen_ptep_get_and_clear(xp, pte) xen_local_ptep_get_and_clear(xp, pte)
23673 #endif
23674
23675-#define __HAVE_ARCH_PTEP_CLEAR_FLUSH
23676-#define ptep_clear_flush(vma, addr, ptep) \
23677-({ \
23678- pte_t *__ptep = (ptep); \
23679- pte_t __res = *__ptep; \
23680- if (!pte_none(__res) && \
23681- ((vma)->vm_mm != current->mm || \
23682- HYPERVISOR_update_va_mapping(addr, __pte(0), \
23683- (unsigned long)(vma)->vm_mm->cpu_vm_mask.bits| \
23684- UVMF_INVLPG|UVMF_MULTI))) { \
23685- __ptep->pte_low = 0; \
23686- smp_wmb(); \
23687- __ptep->pte_high = 0; \
23688- flush_tlb_page(vma, addr); \
23689- } \
23690- __res; \
23691-})
23692-
23693 #define __HAVE_ARCH_PTE_SAME
23694 static inline int pte_same(pte_t a, pte_t b)
23695 {
23696@@ -168,26 +145,12 @@ static inline int pte_none(pte_t pte)
23697 mfn_to_local_pfn(__pte_mfn(_pte)) : \
23698 __pte_mfn(_pte))
23699
23700-extern unsigned long long __supported_pte_mask;
23701-
23702-static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot)
23703-{
23704- return __pte((((unsigned long long)page_nr << PAGE_SHIFT) |
23705- pgprot_val(pgprot)) & __supported_pte_mask);
23706-}
23707-
23708-static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot)
23709-{
23710- return __pmd((((unsigned long long)page_nr << PAGE_SHIFT) |
23711- pgprot_val(pgprot)) & __supported_pte_mask);
23712-}
23713-
23714 /*
23715 * Bits 0, 6 and 7 are taken in the low part of the pte,
23716 * put the 32 bits of offset into the high part.
23717 */
23718 #define pte_to_pgoff(pte) ((pte).pte_high)
23719-#define pgoff_to_pte(off) ((pte_t) { _PAGE_FILE, (off) })
23720+#define pgoff_to_pte(off) ((pte_t) { { .pte_low = _PAGE_FILE, .pte_high = (off) } })
23721 #define PTE_FILE_MAX_BITS 32
23722
23723 /* Encode and de-code a swap entry */
23724@@ -195,8 +158,6 @@ static inline pmd_t pfn_pmd(unsigned lon
23725 #define __swp_offset(x) ((x).val >> 5)
23726 #define __swp_entry(type, offset) ((swp_entry_t){(type) | (offset) << 5})
23727 #define __pte_to_swp_entry(pte) ((swp_entry_t){ (pte).pte_high })
23728-#define __swp_entry_to_pte(x) ((pte_t){ 0, (x).val })
23729-
23730-#define __pmd_free_tlb(tlb, x) do { } while (0)
23731+#define __swp_entry_to_pte(x) ((pte_t){ { .pte_high = (x).val } })
23732
23733 #endif /* _I386_PGTABLE_3LEVEL_H */
23734--- sle11-2009-06-29.orig/include/asm-x86/mach-xen/asm/pgtable_32.h 2009-02-16 16:18:36.000000000 +0100
23735+++ sle11-2009-06-29/include/asm-x86/mach-xen/asm/pgtable_32.h 2009-03-16 16:33:40.000000000 +0100
23736@@ -1,8 +1,6 @@
23737 #ifndef _I386_PGTABLE_H
23738 #define _I386_PGTABLE_H
23739
23740-#include <asm/hypervisor.h>
23741-
23742 /*
23743 * The Linux memory management assumes a three-level page table setup. On
23744 * the i386, we use that, but "fold" the mid level into the top-level page
23745@@ -25,20 +23,10 @@
23746
23747 struct vm_area_struct;
23748
23749-/*
23750- * ZERO_PAGE is a global shared page that is always zero: used
23751- * for zero-mapped memory areas etc..
23752- */
23753-#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page))
23754-extern unsigned long empty_zero_page[1024];
23755 extern pgd_t *swapper_pg_dir;
23756-extern struct kmem_cache *pmd_cache;
23757-extern spinlock_t pgd_lock;
23758-extern struct page *pgd_list;
23759-void check_pgt_cache(void);
23760
23761-void pmd_ctor(struct kmem_cache *, void *);
23762-void pgtable_cache_init(void);
23763+static inline void pgtable_cache_init(void) { }
23764+static inline void check_pgt_cache(void) { }
23765 void paging_init(void);
23766
23767
23768@@ -58,16 +46,9 @@ void paging_init(void);
23769 #define PGDIR_SIZE (1UL << PGDIR_SHIFT)
23770 #define PGDIR_MASK (~(PGDIR_SIZE-1))
23771
23772-#define USER_PTRS_PER_PGD (TASK_SIZE/PGDIR_SIZE)
23773-#define FIRST_USER_ADDRESS 0
23774-
23775 #define USER_PGD_PTRS (PAGE_OFFSET >> PGDIR_SHIFT)
23776 #define KERNEL_PGD_PTRS (PTRS_PER_PGD-USER_PGD_PTRS)
23777
23778-#define TWOLEVEL_PGDIR_SHIFT 22
23779-#define BOOT_USER_PGD_PTRS (__PAGE_OFFSET >> TWOLEVEL_PGDIR_SHIFT)
23780-#define BOOT_KERNEL_PGD_PTRS (1024-BOOT_USER_PGD_PTRS)
23781-
23782 /* Just any arbitrary offset to the start of the vmalloc VM area: the
23783 * current 8MB value just means that there will be a 8MB "hole" after the
23784 * physical memory until the kernel virtual memory starts. That means that
23785@@ -78,121 +59,19 @@ void paging_init(void);
23786 #define VMALLOC_OFFSET (8*1024*1024)
23787 #define VMALLOC_START (((unsigned long) high_memory + \
23788 2*VMALLOC_OFFSET-1) & ~(VMALLOC_OFFSET-1))
23789-#ifdef CONFIG_HIGHMEM
23790-# define VMALLOC_END (PKMAP_BASE-2*PAGE_SIZE)
23791-#else
23792-# define VMALLOC_END (FIXADDR_START-2*PAGE_SIZE)
23793-#endif
23794-
23795-/*
23796- * _PAGE_PSE set in the page directory entry just means that
23797- * the page directory entry points directly to a 4MB-aligned block of
23798- * memory.
23799- */
23800-#define _PAGE_BIT_PRESENT 0
23801-#define _PAGE_BIT_RW 1
23802-#define _PAGE_BIT_USER 2
23803-#define _PAGE_BIT_PWT 3
23804-#define _PAGE_BIT_PCD 4
23805-#define _PAGE_BIT_ACCESSED 5
23806-#define _PAGE_BIT_DIRTY 6
23807-#define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page, Pentium+, if present.. */
23808-#define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */
23809-/*#define _PAGE_BIT_UNUSED1 9*/ /* available for programmer */
23810-#define _PAGE_BIT_UNUSED2 10
23811-#define _PAGE_BIT_UNUSED3 11
23812-#define _PAGE_BIT_NX 63
23813-
23814-#define _PAGE_PRESENT 0x001
23815-#define _PAGE_RW 0x002
23816-#define _PAGE_USER 0x004
23817-#define _PAGE_PWT 0x008
23818-#define _PAGE_PCD 0x010
23819-#define _PAGE_ACCESSED 0x020
23820-#define _PAGE_DIRTY 0x040
23821-#define _PAGE_PSE 0x080 /* 4 MB (or 2MB) page, Pentium+, if present.. */
23822-#define _PAGE_GLOBAL 0x100 /* Global TLB entry PPro+ */
23823-/*#define _PAGE_UNUSED1 0x200*/ /* available for programmer */
23824-#define _PAGE_UNUSED2 0x400
23825-#define _PAGE_UNUSED3 0x800
23826-
23827-/* If _PAGE_PRESENT is clear, we use these: */
23828-#define _PAGE_FILE 0x040 /* nonlinear file mapping, saved PTE; unset:swap */
23829-#define _PAGE_PROTNONE 0x080 /* if the user mapped it with PROT_NONE;
23830- pte_present gives true */
23831 #ifdef CONFIG_X86_PAE
23832-#define _PAGE_NX (1ULL<<_PAGE_BIT_NX)
23833+#define LAST_PKMAP 512
23834 #else
23835-#define _PAGE_NX 0
23836+#define LAST_PKMAP 1024
23837 #endif
23838
23839-/* Mapped page is I/O or foreign and has no associated page struct. */
23840-#define _PAGE_IO 0x200
23841+#define PKMAP_BASE ((FIXADDR_BOOT_START - PAGE_SIZE*(LAST_PKMAP + 1)) & PMD_MASK)
23842
23843-#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY)
23844-#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
23845-#define _PAGE_CHG_MASK (PTE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_IO)
23846-
23847-#define PAGE_NONE \
23848- __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED)
23849-#define PAGE_SHARED \
23850- __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED)
23851-
23852-#define PAGE_SHARED_EXEC \
23853- __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED)
23854-#define PAGE_COPY_NOEXEC \
23855- __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
23856-#define PAGE_COPY_EXEC \
23857- __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
23858-#define PAGE_COPY \
23859- PAGE_COPY_NOEXEC
23860-#define PAGE_READONLY \
23861- __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
23862-#define PAGE_READONLY_EXEC \
23863- __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
23864-
23865-#define _PAGE_KERNEL \
23866- (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_NX)
23867-#define _PAGE_KERNEL_EXEC \
23868- (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED)
23869-
23870-extern unsigned long long __PAGE_KERNEL, __PAGE_KERNEL_EXEC;
23871-#define __PAGE_KERNEL_RO (__PAGE_KERNEL & ~_PAGE_RW)
23872-#define __PAGE_KERNEL_RX (__PAGE_KERNEL_EXEC & ~_PAGE_RW)
23873-#define __PAGE_KERNEL_NOCACHE (__PAGE_KERNEL | _PAGE_PCD)
23874-#define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE)
23875-#define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE)
23876-
23877-#define PAGE_KERNEL __pgprot(__PAGE_KERNEL)
23878-#define PAGE_KERNEL_RO __pgprot(__PAGE_KERNEL_RO)
23879-#define PAGE_KERNEL_EXEC __pgprot(__PAGE_KERNEL_EXEC)
23880-#define PAGE_KERNEL_RX __pgprot(__PAGE_KERNEL_RX)
23881-#define PAGE_KERNEL_NOCACHE __pgprot(__PAGE_KERNEL_NOCACHE)
23882-#define PAGE_KERNEL_LARGE __pgprot(__PAGE_KERNEL_LARGE)
23883-#define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC)
23884-
23885-/*
23886- * The i386 can't do page protection for execute, and considers that
23887- * the same are read. Also, write permissions imply read permissions.
23888- * This is the closest we can get..
23889- */
23890-#define __P000 PAGE_NONE
23891-#define __P001 PAGE_READONLY
23892-#define __P010 PAGE_COPY
23893-#define __P011 PAGE_COPY
23894-#define __P100 PAGE_READONLY_EXEC
23895-#define __P101 PAGE_READONLY_EXEC
23896-#define __P110 PAGE_COPY_EXEC
23897-#define __P111 PAGE_COPY_EXEC
23898-
23899-#define __S000 PAGE_NONE
23900-#define __S001 PAGE_READONLY
23901-#define __S010 PAGE_SHARED
23902-#define __S011 PAGE_SHARED
23903-#define __S100 PAGE_READONLY_EXEC
23904-#define __S101 PAGE_READONLY_EXEC
23905-#define __S110 PAGE_SHARED_EXEC
23906-#define __S111 PAGE_SHARED_EXEC
23907+#ifdef CONFIG_HIGHMEM
23908+# define VMALLOC_END (PKMAP_BASE-2*PAGE_SIZE)
23909+#else
23910+# define VMALLOC_END (FIXADDR_START-2*PAGE_SIZE)
23911+#endif
23912
23913 /*
23914 * Define this if things work differently on an i386 and an i486:
23915@@ -221,28 +100,6 @@ extern unsigned long pg0[];
23916
23917 #define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT))
23918
23919-/*
23920- * The following only work if pte_present() is true.
23921- * Undefined behaviour if not..
23922- */
23923-static inline int pte_dirty(pte_t pte) { return (pte).pte_low & _PAGE_DIRTY; }
23924-static inline int pte_young(pte_t pte) { return (pte).pte_low & _PAGE_ACCESSED; }
23925-static inline int pte_write(pte_t pte) { return (pte).pte_low & _PAGE_RW; }
23926-static inline int pte_huge(pte_t pte) { return (pte).pte_low & _PAGE_PSE; }
23927-
23928-/*
23929- * The following only works if pte_present() is not true.
23930- */
23931-static inline int pte_file(pte_t pte) { return (pte).pte_low & _PAGE_FILE; }
23932-
23933-static inline pte_t pte_mkclean(pte_t pte) { (pte).pte_low &= ~_PAGE_DIRTY; return pte; }
23934-static inline pte_t pte_mkold(pte_t pte) { (pte).pte_low &= ~_PAGE_ACCESSED; return pte; }
23935-static inline pte_t pte_wrprotect(pte_t pte) { (pte).pte_low &= ~_PAGE_RW; return pte; }
23936-static inline pte_t pte_mkdirty(pte_t pte) { (pte).pte_low |= _PAGE_DIRTY; return pte; }
23937-static inline pte_t pte_mkyoung(pte_t pte) { (pte).pte_low |= _PAGE_ACCESSED; return pte; }
23938-static inline pte_t pte_mkwrite(pte_t pte) { (pte).pte_low |= _PAGE_RW; return pte; }
23939-static inline pte_t pte_mkhuge(pte_t pte) { (pte).pte_low |= _PAGE_PSE; return pte; }
23940-
23941 #ifdef CONFIG_X86_PAE
23942 # include <asm/pgtable-3level.h>
23943 #else
23944@@ -250,111 +107,6 @@ static inline pte_t pte_mkhuge(pte_t pte
23945 #endif
23946
23947 /*
23948- * Rules for using pte_update - it must be called after any PTE update which
23949- * has not been done using the set_pte / clear_pte interfaces. It is used by
23950- * shadow mode hypervisors to resynchronize the shadow page tables. Kernel PTE
23951- * updates should either be sets, clears, or set_pte_atomic for P->P
23952- * transitions, which means this hook should only be called for user PTEs.
23953- * This hook implies a P->P protection or access change has taken place, which
23954- * requires a subsequent TLB flush. The notification can optionally be delayed
23955- * until the TLB flush event by using the pte_update_defer form of the
23956- * interface, but care must be taken to assure that the flush happens while
23957- * still holding the same page table lock so that the shadow and primary pages
23958- * do not become out of sync on SMP.
23959- */
23960-#define pte_update(mm, addr, ptep) do { } while (0)
23961-#define pte_update_defer(mm, addr, ptep) do { } while (0)
23962-
23963-/* local pte updates need not use xchg for locking */
23964-static inline pte_t xen_local_ptep_get_and_clear(pte_t *ptep, pte_t res)
23965-{
23966- xen_set_pte(ptep, __pte(0));
23967- return res;
23968-}
23969-
23970-/*
23971- * We only update the dirty/accessed state if we set
23972- * the dirty bit by hand in the kernel, since the hardware
23973- * will do the accessed bit for us, and we don't want to
23974- * race with other CPU's that might be updating the dirty
23975- * bit at the same time.
23976- */
23977-#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
23978-#define ptep_set_access_flags(vma, address, ptep, entry, dirty) \
23979-({ \
23980- int __changed = !pte_same(*(ptep), entry); \
23981- if (__changed && (dirty)) { \
23982- if ( likely((vma)->vm_mm == current->mm) ) { \
23983- BUG_ON(HYPERVISOR_update_va_mapping(address, \
23984- entry, \
23985- (unsigned long)(vma)->vm_mm->cpu_vm_mask.bits| \
23986- UVMF_INVLPG|UVMF_MULTI)); \
23987- } else { \
23988- xen_l1_entry_update(ptep, entry); \
23989- flush_tlb_page(vma, address); \
23990- } \
23991- } \
23992- __changed; \
23993-})
23994-
23995-#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
23996-#define ptep_test_and_clear_young(vma, addr, ptep) ({ \
23997- int __ret = 0; \
23998- if (pte_young(*(ptep))) \
23999- __ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, \
24000- &(ptep)->pte_low); \
24001- if (__ret) \
24002- pte_update((vma)->vm_mm, addr, ptep); \
24003- __ret; \
24004-})
24005-
24006-#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
24007-#define ptep_clear_flush_young(vma, address, ptep) \
24008-({ \
24009- pte_t __pte = *(ptep); \
24010- int __young = pte_young(__pte); \
24011- __pte = pte_mkold(__pte); \
24012- if (PagePinned(virt_to_page((vma)->vm_mm->pgd))) \
24013- (void)ptep_set_access_flags(vma, address, ptep, __pte, __young); \
24014- else if (__young) \
24015- (ptep)->pte_low = __pte.pte_low; \
24016- __young; \
24017-})
24018-
24019-#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
24020-static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
24021-{
24022- pte_t pte = *ptep;
24023- if (!pte_none(pte)
24024- && (mm != &init_mm
24025- || HYPERVISOR_update_va_mapping(addr, __pte(0), 0))) {
24026- pte = xen_ptep_get_and_clear(ptep, pte);
24027- pte_update(mm, addr, ptep);
24028- }
24029- return pte;
24030-}
24031-
24032-#define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
24033-#define ptep_get_and_clear_full(mm, addr, ptep, full) \
24034- ((full) ? ({ \
24035- pte_t __res = *(ptep); \
24036- if (PagePinned(virt_to_page((mm)->pgd))) \
24037- xen_l1_entry_update(ptep, __pte(0)); \
24038- else \
24039- *(ptep) = __pte(0); \
24040- __res; \
24041- }) : \
24042- ptep_get_and_clear(mm, addr, ptep))
24043-
24044-#define __HAVE_ARCH_PTEP_SET_WRPROTECT
24045-static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
24046-{
24047- pte_t pte = *ptep;
24048- if (pte_write(pte))
24049- set_pte_at(mm, addr, ptep, pte_wrprotect(pte));
24050-}
24051-
24052-/*
24053 * clone_pgd_range(pgd_t *dst, pgd_t *src, int count);
24054 *
24055 * dst - pointer to pgd range anwhere on a pgd page
24056@@ -383,26 +135,6 @@ static inline void clone_pgd_range(pgd_t
24057
24058 #define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot))
24059
24060-static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
24061-{
24062- /*
24063- * Since this might change the present bit (which controls whether
24064- * a pte_t object has undergone p2m translation), we must use
24065- * pte_val() on the input pte and __pte() for the return value.
24066- */
24067- paddr_t pteval = pte_val(pte);
24068-
24069- pteval &= _PAGE_CHG_MASK;
24070- pteval |= pgprot_val(newprot);
24071-#ifdef CONFIG_X86_PAE
24072- pteval &= __supported_pte_mask;
24073-#endif
24074- return __pte(pteval);
24075-}
24076-
24077-#define pmd_large(pmd) \
24078-((__pmd_val(pmd) & (_PAGE_PSE|_PAGE_PRESENT)) == (_PAGE_PSE|_PAGE_PRESENT))
24079-
24080 /*
24081 * the pgd page can be thought of an array like this: pgd_t[PTRS_PER_PGD]
24082 *
24083@@ -424,6 +156,8 @@ static inline pte_t pte_modify(pte_t pte
24084 */
24085 #define pgd_offset_k(address) pgd_offset(&init_mm, address)
24086
24087+static inline int pud_large(pud_t pud) { return 0; }
24088+
24089 /*
24090 * the pmd page can be thought of an array like this: pmd_t[PTRS_PER_PMD]
24091 *
24092@@ -449,26 +183,6 @@ static inline pte_t pte_modify(pte_t pte
24093 #define pmd_page_vaddr(pmd) \
24094 ((unsigned long) __va(pmd_val(pmd) & PAGE_MASK))
24095
24096-/*
24097- * Helper function that returns the kernel pagetable entry controlling
24098- * the virtual address 'address'. NULL means no pagetable entry present.
24099- * NOTE: the return type is pte_t but if the pmd is PSE then we return it
24100- * as a pte too.
24101- */
24102-extern pte_t *lookup_address(unsigned long address);
24103-
24104-/*
24105- * Make a given kernel text page executable/non-executable.
24106- * Returns the previous executability setting of that page (which
24107- * is used to restore the previous state). Used by the SMP bootup code.
24108- * NOTE: this is an __init function for security reasons.
24109- */
24110-#ifdef CONFIG_X86_PAE
24111- extern int set_kernel_exec(unsigned long vaddr, int enable);
24112-#else
24113- static inline int set_kernel_exec(unsigned long vaddr, int enable) { return 0;}
24114-#endif
24115-
24116 #if defined(CONFIG_HIGHPTE)
24117 #define pte_offset_map(dir, address) \
24118 ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)),KM_PTE0) + pte_index(address))
24119@@ -496,72 +210,22 @@ extern pte_t *lookup_address(unsigned lo
24120 */
24121 #define update_mmu_cache(vma,address,pte) do { } while (0)
24122
24123-#include <xen/features.h>
24124 void make_lowmem_page_readonly(void *va, unsigned int feature);
24125 void make_lowmem_page_writable(void *va, unsigned int feature);
24126-void make_page_readonly(void *va, unsigned int feature);
24127-void make_page_writable(void *va, unsigned int feature);
24128-void make_pages_readonly(void *va, unsigned int nr, unsigned int feature);
24129-void make_pages_writable(void *va, unsigned int nr, unsigned int feature);
24130-
24131-#define virt_to_ptep(va) \
24132-({ \
24133- pte_t *__ptep = lookup_address((unsigned long)(va)); \
24134- BUG_ON(!__ptep || !pte_present(*__ptep)); \
24135- __ptep; \
24136-})
24137-
24138-#define arbitrary_virt_to_machine(va) \
24139- (((maddr_t)pte_mfn(*virt_to_ptep(va)) << PAGE_SHIFT) \
24140- | ((unsigned long)(va) & (PAGE_SIZE - 1)))
24141-
24142-#ifdef CONFIG_HIGHPTE
24143-#include <asm/io.h>
24144-struct page *kmap_atomic_to_page(void *);
24145-#define ptep_to_machine(ptep) \
24146-({ \
24147- pte_t *__ptep = (ptep); \
24148- page_to_phys(kmap_atomic_to_page(__ptep)) \
24149- | ((unsigned long)__ptep & (PAGE_SIZE - 1)); \
24150-})
24151-#else
24152-#define ptep_to_machine(ptep) virt_to_machine(ptep)
24153-#endif
24154
24155 #endif /* !__ASSEMBLY__ */
24156
24157+/*
24158+ * kern_addr_valid() is (1) for FLATMEM and (0) for
24159+ * SPARSEMEM and DISCONTIGMEM
24160+ */
24161 #ifdef CONFIG_FLATMEM
24162 #define kern_addr_valid(addr) (1)
24163-#endif /* CONFIG_FLATMEM */
24164-
24165-int direct_remap_pfn_range(struct vm_area_struct *vma,
24166- unsigned long address,
24167- unsigned long mfn,
24168- unsigned long size,
24169- pgprot_t prot,
24170- domid_t domid);
24171-int direct_kernel_remap_pfn_range(unsigned long address,
24172- unsigned long mfn,
24173- unsigned long size,
24174- pgprot_t prot,
24175- domid_t domid);
24176-int create_lookup_pte_addr(struct mm_struct *mm,
24177- unsigned long address,
24178- uint64_t *ptep);
24179-int touch_pte_range(struct mm_struct *mm,
24180- unsigned long address,
24181- unsigned long size);
24182-
24183-int xen_change_pte_range(struct mm_struct *mm, pmd_t *pmd,
24184- unsigned long addr, unsigned long end, pgprot_t newprot,
24185- int dirty_accountable);
24186-
24187-#define arch_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable) \
24188- xen_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable)
24189+#else
24190+#define kern_addr_valid(kaddr) (0)
24191+#endif
24192
24193 #define io_remap_pfn_range(vma,from,pfn,size,prot) \
24194 direct_remap_pfn_range(vma,from,pfn,size,prot,DOMID_IO)
24195
24196-#include <asm-generic/pgtable.h>
24197-
24198 #endif /* _I386_PGTABLE_H */
24199--- sle11-2009-06-29.orig/include/asm-x86/mach-xen/asm/pgtable_64.h 2009-02-16 16:18:36.000000000 +0100
24200+++ sle11-2009-06-29/include/asm-x86/mach-xen/asm/pgtable_64.h 2009-03-16 16:33:40.000000000 +0100
24201@@ -13,49 +13,26 @@
24202 #include <linux/threads.h>
24203 #include <linux/sched.h>
24204 #include <asm/pda.h>
24205-#ifdef CONFIG_XEN
24206-#include <asm/hypervisor.h>
24207
24208+#ifdef CONFIG_XEN
24209 extern pud_t level3_user_pgt[512];
24210
24211 extern void xen_init_pt(void);
24212-
24213-extern pte_t *lookup_address(unsigned long address);
24214-
24215-#define virt_to_ptep(va) \
24216-({ \
24217- pte_t *__ptep = lookup_address((unsigned long)(va)); \
24218- BUG_ON(!__ptep || !pte_present(*__ptep)); \
24219- __ptep; \
24220-})
24221-
24222-#define arbitrary_virt_to_machine(va) \
24223- (((maddr_t)pte_mfn(*virt_to_ptep(va)) << PAGE_SHIFT) \
24224- | ((unsigned long)(va) & (PAGE_SIZE - 1)))
24225-
24226-#define ptep_to_machine(ptep) virt_to_machine(ptep)
24227 #endif
24228
24229 extern pud_t level3_kernel_pgt[512];
24230 extern pud_t level3_ident_pgt[512];
24231 extern pmd_t level2_kernel_pgt[512];
24232 extern pgd_t init_level4_pgt[];
24233-extern unsigned long __supported_pte_mask;
24234
24235 #define swapper_pg_dir init_level4_pgt
24236
24237 extern void paging_init(void);
24238-extern void clear_kernel_mapping(unsigned long addr, unsigned long size);
24239-
24240-/*
24241- * ZERO_PAGE is a global shared page that is always zero: used
24242- * for zero-mapped memory areas etc..
24243- */
24244-extern unsigned long empty_zero_page[PAGE_SIZE/sizeof(unsigned long)];
24245-#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page))
24246
24247 #endif /* !__ASSEMBLY__ */
24248
24249+#define SHARED_KERNEL_PMD 1
24250+
24251 /*
24252 * PGDIR_SHIFT determines what a top-level page table entry can map
24253 */
24254@@ -98,31 +75,63 @@ extern unsigned long empty_zero_page[PAG
24255 #define pgd_none(x) (!__pgd_val(x))
24256 #define pud_none(x) (!__pud_val(x))
24257
24258-static inline void set_pte(pte_t *dst, pte_t val)
24259+struct mm_struct;
24260+
24261+#define __xen_pte_clear(ptep) xen_set_pte(ptep, __pte(0))
24262+
24263+static inline void xen_set_pte(pte_t *ptep, pte_t pte)
24264+{
24265+ *ptep = pte;
24266+}
24267+
24268+static inline void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
24269 {
24270- *dst = val;
24271+ xen_set_pte(ptep, pte);
24272 }
24273
24274-#define set_pmd(pmdptr, pmdval) xen_l2_entry_update(pmdptr, (pmdval))
24275-#define set_pud(pudptr, pudval) xen_l3_entry_update(pudptr, (pudval))
24276-#define set_pgd(pgdptr, pgdval) xen_l4_entry_update(pgdptr, (pgdval))
24277+#ifdef CONFIG_SMP
24278+static inline pte_t xen_ptep_get_and_clear(pte_t *xp, pte_t ret)
24279+{
24280+ return __pte_ma(xchg(&xp->pte, 0));
24281+}
24282+#else
24283+#define xen_ptep_get_and_clear(xp, pte) xen_local_ptep_get_and_clear(xp, pte)
24284+#endif
24285
24286-static inline void pud_clear (pud_t * pud)
24287+static inline void xen_set_pmd(pmd_t *pmdp, pmd_t pmd)
24288 {
24289- set_pud(pud, __pud(0));
24290+ xen_l2_entry_update(pmdp, pmd);
24291+}
24292+
24293+static inline void xen_pmd_clear(pmd_t *pmd)
24294+{
24295+ xen_set_pmd(pmd, xen_make_pmd(0));
24296+}
24297+
24298+static inline void xen_set_pud(pud_t *pudp, pud_t pud)
24299+{
24300+ xen_l3_entry_update(pudp, pud);
24301+}
24302+
24303+static inline void xen_pud_clear(pud_t *pud)
24304+{
24305+ xen_set_pud(pud, xen_make_pud(0));
24306 }
24307
24308 #define __user_pgd(pgd) ((pgd) + PTRS_PER_PGD)
24309
24310-static inline void pgd_clear (pgd_t * pgd)
24311+static inline void xen_set_pgd(pgd_t *pgdp, pgd_t pgd)
24312 {
24313- set_pgd(pgd, __pgd(0));
24314- set_pgd(__user_pgd(pgd), __pgd(0));
24315+ xen_l4_entry_update(pgdp, pgd);
24316 }
24317
24318-#define pte_same(a, b) ((a).pte == (b).pte)
24319+static inline void xen_pgd_clear(pgd_t * pgd)
24320+{
24321+ xen_set_pgd(pgd, xen_make_pgd(0));
24322+ xen_set_pgd(__user_pgd(pgd), xen_make_pgd(0));
24323+}
24324
24325-#define pte_pgprot(a) (__pgprot((a).pte & ~PHYSICAL_PAGE_MASK))
24326+#define pte_same(a, b) ((a).pte == (b).pte)
24327
24328 #endif /* !__ASSEMBLY__ */
24329
24330@@ -133,8 +142,6 @@ static inline void pgd_clear (pgd_t * pg
24331 #define PGDIR_SIZE (_AC(1,UL) << PGDIR_SHIFT)
24332 #define PGDIR_MASK (~(PGDIR_SIZE-1))
24333
24334-#define USER_PTRS_PER_PGD ((TASK_SIZE-1)/PGDIR_SIZE+1)
24335-#define FIRST_USER_ADDRESS 0
24336
24337 #define MAXMEM _AC(0x3fffffffffff, UL)
24338 #define VMALLOC_START _AC(0xffffc20000000000, UL)
24339@@ -144,105 +151,6 @@ static inline void pgd_clear (pgd_t * pg
24340 #define MODULES_END _AC(0xfffffffffff00000, UL)
24341 #define MODULES_LEN (MODULES_END - MODULES_VADDR)
24342
24343-#define _PAGE_BIT_PRESENT 0
24344-#define _PAGE_BIT_RW 1
24345-#define _PAGE_BIT_USER 2
24346-#define _PAGE_BIT_PWT 3
24347-#define _PAGE_BIT_PCD 4
24348-#define _PAGE_BIT_ACCESSED 5
24349-#define _PAGE_BIT_DIRTY 6
24350-#define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page */
24351-#define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */
24352-#define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */
24353-
24354-#define _PAGE_PRESENT 0x001
24355-#define _PAGE_RW 0x002
24356-#define _PAGE_USER 0x004
24357-#define _PAGE_PWT 0x008
24358-#define _PAGE_PCD 0x010
24359-#define _PAGE_ACCESSED 0x020
24360-#define _PAGE_DIRTY 0x040
24361-#define _PAGE_PSE 0x080 /* 2MB page */
24362-#define _PAGE_FILE 0x040 /* nonlinear file mapping, saved PTE; unset:swap */
24363-#define _PAGE_GLOBAL 0x100 /* Global TLB entry */
24364-
24365-#define _PAGE_PROTNONE 0x080 /* If not present */
24366-#define _PAGE_NX (_AC(1,UL)<<_PAGE_BIT_NX)
24367-
24368-/* Mapped page is I/O or foreign and has no associated page struct. */
24369-#define _PAGE_IO 0x200
24370-
24371-#ifndef __ASSEMBLY__
24372-#if CONFIG_XEN_COMPAT <= 0x030002
24373-extern unsigned int __kernel_page_user;
24374-#else
24375-#define __kernel_page_user 0
24376-#endif
24377-#endif
24378-
24379-#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY)
24380-#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY | __kernel_page_user)
24381-
24382-#define _PAGE_CHG_MASK (PTE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_IO)
24383-
24384-#define PAGE_NONE __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED)
24385-#define PAGE_SHARED __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
24386-#define PAGE_SHARED_EXEC __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED)
24387-#define PAGE_COPY_NOEXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
24388-#define PAGE_COPY PAGE_COPY_NOEXEC
24389-#define PAGE_COPY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
24390-#define PAGE_READONLY __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
24391-#define PAGE_READONLY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
24392-#define __PAGE_KERNEL \
24393- (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_NX | __kernel_page_user)
24394-#define __PAGE_KERNEL_EXEC \
24395- (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | __kernel_page_user)
24396-#define __PAGE_KERNEL_NOCACHE \
24397- (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_PCD | _PAGE_ACCESSED | _PAGE_NX | __kernel_page_user)
24398-#define __PAGE_KERNEL_RO \
24399- (_PAGE_PRESENT | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_NX | __kernel_page_user)
24400-#define __PAGE_KERNEL_VSYSCALL \
24401- (_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
24402-#define __PAGE_KERNEL_VSYSCALL_NOCACHE \
24403- (_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_PCD)
24404-#define __PAGE_KERNEL_LARGE \
24405- (__PAGE_KERNEL | _PAGE_PSE)
24406-#define __PAGE_KERNEL_LARGE_EXEC \
24407- (__PAGE_KERNEL_EXEC | _PAGE_PSE)
24408-
24409-/*
24410- * We don't support GLOBAL page in xenolinux64
24411- */
24412-#define MAKE_GLOBAL(x) __pgprot((x))
24413-
24414-#define PAGE_KERNEL MAKE_GLOBAL(__PAGE_KERNEL)
24415-#define PAGE_KERNEL_EXEC MAKE_GLOBAL(__PAGE_KERNEL_EXEC)
24416-#define PAGE_KERNEL_RO MAKE_GLOBAL(__PAGE_KERNEL_RO)
24417-#define PAGE_KERNEL_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_NOCACHE)
24418-#define PAGE_KERNEL_VSYSCALL32 __pgprot(__PAGE_KERNEL_VSYSCALL)
24419-#define PAGE_KERNEL_VSYSCALL MAKE_GLOBAL(__PAGE_KERNEL_VSYSCALL)
24420-#define PAGE_KERNEL_LARGE MAKE_GLOBAL(__PAGE_KERNEL_LARGE)
24421-#define PAGE_KERNEL_VSYSCALL_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_VSYSCALL_NOCACHE)
24422-
24423-/* xwr */
24424-#define __P000 PAGE_NONE
24425-#define __P001 PAGE_READONLY
24426-#define __P010 PAGE_COPY
24427-#define __P011 PAGE_COPY
24428-#define __P100 PAGE_READONLY_EXEC
24429-#define __P101 PAGE_READONLY_EXEC
24430-#define __P110 PAGE_COPY_EXEC
24431-#define __P111 PAGE_COPY_EXEC
24432-
24433-#define __S000 PAGE_NONE
24434-#define __S001 PAGE_READONLY
24435-#define __S010 PAGE_SHARED
24436-#define __S011 PAGE_SHARED
24437-#define __S100 PAGE_READONLY_EXEC
24438-#define __S101 PAGE_READONLY_EXEC
24439-#define __S110 PAGE_SHARED_EXEC
24440-#define __S111 PAGE_SHARED_EXEC
24441-
24442 #ifndef __ASSEMBLY__
24443
24444 static inline unsigned long pgd_bad(pgd_t pgd)
24445@@ -260,119 +168,26 @@ static inline unsigned long pmd_bad(pmd_
24446 return __pmd_val(pmd) & ~(PTE_MASK | _KERNPG_TABLE | _PAGE_USER);
24447 }
24448
24449-#define set_pte_at(_mm,addr,ptep,pteval) do { \
24450- if (((_mm) != current->mm && (_mm) != &init_mm) || \
24451- HYPERVISOR_update_va_mapping((addr), (pteval), 0)) \
24452- set_pte((ptep), (pteval)); \
24453-} while (0)
24454-
24455 #define pte_none(x) (!(x).pte)
24456 #define pte_present(x) ((x).pte & (_PAGE_PRESENT | _PAGE_PROTNONE))
24457-#define pte_clear(mm,addr,xp) do { set_pte_at(mm, addr, xp, __pte(0)); } while (0)
24458
24459-#define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT))
24460+#define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT)) /* FIXME: is this right? */
24461
24462 #define __pte_mfn(_pte) (((_pte).pte & PTE_MASK) >> PAGE_SHIFT)
24463 #define pte_mfn(_pte) ((_pte).pte & _PAGE_PRESENT ? \
24464 __pte_mfn(_pte) : pfn_to_mfn(__pte_mfn(_pte)))
24465-#define pte_pfn(_pte) ((_pte).pte & _PAGE_IO ? end_pfn : \
24466+#define pte_pfn(_pte) ((_pte).pte & _PAGE_IO ? max_mapnr : \
24467 (_pte).pte & _PAGE_PRESENT ? \
24468 mfn_to_local_pfn(__pte_mfn(_pte)) : \
24469 __pte_mfn(_pte))
24470
24471 #define pte_page(x) pfn_to_page(pte_pfn(x))
24472
24473-static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot)
24474-{
24475- unsigned long pte = page_nr << PAGE_SHIFT;
24476- pte |= pgprot_val(pgprot);
24477- pte &= __supported_pte_mask;
24478- return __pte(pte);
24479-}
24480-
24481-static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
24482-{
24483- pte_t pte = *ptep;
24484- if (!pte_none(pte)) {
24485- if ((mm != &init_mm) ||
24486- HYPERVISOR_update_va_mapping(addr, __pte(0), 0))
24487- pte = __pte_ma(xchg(&ptep->pte, 0));
24488- }
24489- return pte;
24490-}
24491-
24492-static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, unsigned long addr, pte_t *ptep, int full)
24493-{
24494- if (full) {
24495- pte_t pte = *ptep;
24496- if (PagePinned(virt_to_page(mm->pgd)))
24497- xen_l1_entry_update(ptep, __pte(0));
24498- else
24499- *ptep = __pte(0);
24500- return pte;
24501- }
24502- return ptep_get_and_clear(mm, addr, ptep);
24503-}
24504-
24505-#define ptep_clear_flush(vma, addr, ptep) \
24506-({ \
24507- pte_t *__ptep = (ptep); \
24508- pte_t __res = *__ptep; \
24509- if (!pte_none(__res) && \
24510- ((vma)->vm_mm != current->mm || \
24511- HYPERVISOR_update_va_mapping(addr, __pte(0), \
24512- (unsigned long)(vma)->vm_mm->cpu_vm_mask.bits| \
24513- UVMF_INVLPG|UVMF_MULTI))) { \
24514- __ptep->pte = 0; \
24515- flush_tlb_page(vma, addr); \
24516- } \
24517- __res; \
24518-})
24519-
24520-/*
24521- * The following only work if pte_present() is true.
24522- * Undefined behaviour if not..
24523- */
24524-#define __LARGE_PTE (_PAGE_PSE|_PAGE_PRESENT)
24525-static inline int pte_dirty(pte_t pte) { return __pte_val(pte) & _PAGE_DIRTY; }
24526-static inline int pte_young(pte_t pte) { return __pte_val(pte) & _PAGE_ACCESSED; }
24527-static inline int pte_write(pte_t pte) { return __pte_val(pte) & _PAGE_RW; }
24528-static inline int pte_file(pte_t pte) { return __pte_val(pte) & _PAGE_FILE; }
24529-static inline int pte_huge(pte_t pte) { return __pte_val(pte) & _PAGE_PSE; }
24530-
24531-static inline pte_t pte_mkclean(pte_t pte) { __pte_val(pte) &= ~_PAGE_DIRTY; return pte; }
24532-static inline pte_t pte_mkold(pte_t pte) { __pte_val(pte) &= ~_PAGE_ACCESSED; return pte; }
24533-static inline pte_t pte_wrprotect(pte_t pte) { __pte_val(pte) &= ~_PAGE_RW; return pte; }
24534-static inline pte_t pte_mkexec(pte_t pte) { __pte_val(pte) &= ~_PAGE_NX; return pte; }
24535-static inline pte_t pte_mkdirty(pte_t pte) { __pte_val(pte) |= _PAGE_DIRTY; return pte; }
24536-static inline pte_t pte_mkyoung(pte_t pte) { __pte_val(pte) |= _PAGE_ACCESSED; return pte; }
24537-static inline pte_t pte_mkwrite(pte_t pte) { __pte_val(pte) |= _PAGE_RW; return pte; }
24538-static inline pte_t pte_mkhuge(pte_t pte) { __pte_val(pte) |= _PAGE_PSE; return pte; }
24539-static inline pte_t pte_clrhuge(pte_t pte) { __pte_val(pte) &= ~_PAGE_PSE; return pte; }
24540-
24541-static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep)
24542-{
24543- if (!pte_young(*ptep))
24544- return 0;
24545- return test_and_clear_bit(_PAGE_BIT_ACCESSED, &ptep->pte);
24546-}
24547-
24548-static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
24549-{
24550- pte_t pte = *ptep;
24551- if (pte_write(pte))
24552- set_pte_at(mm, addr, ptep, pte_wrprotect(pte));
24553-}
24554-
24555 /*
24556 * Macro to mark a page protection value as "uncacheable".
24557 */
24558 #define pgprot_noncached(prot) (__pgprot(pgprot_val(prot) | _PAGE_PCD | _PAGE_PWT))
24559
24560-static inline int pmd_large(pmd_t pte) {
24561- return (__pmd_val(pte) & __LARGE_PTE) == __LARGE_PTE;
24562-}
24563-
24564
24565 /*
24566 * Conversion functions: convert a page and protection to a page entry,
24567@@ -388,6 +203,7 @@ static inline int pmd_large(pmd_t pte) {
24568 #define pgd_offset(mm, addr) ((mm)->pgd + pgd_index(addr))
24569 #define pgd_offset_k(address) (init_level4_pgt + pgd_index(address))
24570 #define pgd_present(pgd) (__pgd_val(pgd) & _PAGE_PRESENT)
24571+static inline int pgd_large(pgd_t pgd) { return 0; }
24572 #define mk_kernel_pgd(address) __pgd((address) | _KERNPG_TABLE)
24573
24574 /* PUD - Level3 access */
24575@@ -398,6 +214,12 @@ static inline int pmd_large(pmd_t pte) {
24576 #define pud_offset(pgd, address) ((pud_t *) pgd_page_vaddr(*(pgd)) + pud_index(address))
24577 #define pud_present(pud) (__pud_val(pud) & _PAGE_PRESENT)
24578
24579+static inline int pud_large(pud_t pte)
24580+{
24581+ return (__pud_val(pte) & (_PAGE_PSE|_PAGE_PRESENT)) ==
24582+ (_PAGE_PSE|_PAGE_PRESENT);
24583+}
24584+
24585 /* PMD - Level 2 access */
24586 #define pmd_page_vaddr(pmd) ((unsigned long) __va(pmd_val(pmd) & PTE_MASK))
24587 #define pmd_page(pmd) (pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT))
24588@@ -413,36 +235,18 @@ static inline int pmd_large(pmd_t pte) {
24589 #else
24590 #define pmd_present(x) (__pmd_val(x) & _PAGE_PRESENT)
24591 #endif
24592-#define pmd_clear(xp) do { set_pmd(xp, __pmd(0)); } while (0)
24593 #define pfn_pmd(nr,prot) (__pmd(((nr) << PAGE_SHIFT) | pgprot_val(prot)))
24594 #define pmd_pfn(x) ((pmd_val(x) & __PHYSICAL_MASK) >> PAGE_SHIFT)
24595
24596 #define pte_to_pgoff(pte) ((__pte_val(pte) & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT)
24597-#define pgoff_to_pte(off) ((pte_t) { ((off) << PAGE_SHIFT) | _PAGE_FILE })
24598+#define pgoff_to_pte(off) ((pte_t) { .pte = ((off) << PAGE_SHIFT) | _PAGE_FILE })
24599 #define PTE_FILE_MAX_BITS __PHYSICAL_MASK_SHIFT
24600
24601 /* PTE - Level 1 access. */
24602
24603 /* page, protection -> pte */
24604 #define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot))
24605-#define mk_pte_huge(entry) (__pte_val(entry) |= _PAGE_PRESENT | _PAGE_PSE)
24606
24607-/* Change flags of a PTE */
24608-static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
24609-{
24610- /*
24611- * Since this might change the present bit (which controls whether
24612- * a pte_t object has undergone p2m translation), we must use
24613- * pte_val() on the input pte and __pte() for the return value.
24614- */
24615- unsigned long pteval = pte_val(pte);
24616-
24617- pteval &= _PAGE_CHG_MASK;
24618- pteval |= pgprot_val(newprot);
24619- pteval &= __supported_pte_mask;
24620- return __pte(pteval);
24621-}
24622-
24623 #define pte_index(address) \
24624 (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
24625 #define pte_offset_kernel(dir, address) ((pte_t *) pmd_page_vaddr(*(dir)) + \
24626@@ -456,101 +260,21 @@ static inline pte_t pte_modify(pte_t pte
24627
24628 #define update_mmu_cache(vma,address,pte) do { } while (0)
24629
24630-/*
24631- * Rules for using ptep_establish: the pte MUST be a user pte, and
24632- * must be a present->present transition.
24633- */
24634-#define __HAVE_ARCH_PTEP_ESTABLISH
24635-#define ptep_establish(vma, address, ptep, pteval) \
24636- do { \
24637- if ( likely((vma)->vm_mm == current->mm) ) { \
24638- BUG_ON(HYPERVISOR_update_va_mapping(address, \
24639- pteval, \
24640- (unsigned long)(vma)->vm_mm->cpu_vm_mask.bits| \
24641- UVMF_INVLPG|UVMF_MULTI)); \
24642- } else { \
24643- xen_l1_entry_update(ptep, pteval); \
24644- flush_tlb_page(vma, address); \
24645- } \
24646- } while (0)
24647-
24648-/* We only update the dirty/accessed state if we set
24649- * the dirty bit by hand in the kernel, since the hardware
24650- * will do the accessed bit for us, and we don't want to
24651- * race with other CPU's that might be updating the dirty
24652- * bit at the same time. */
24653-#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
24654-#define ptep_set_access_flags(vma, address, ptep, entry, dirty) \
24655-({ \
24656- int __changed = !pte_same(*(ptep), entry); \
24657- if (__changed && (dirty)) \
24658- ptep_establish(vma, address, ptep, entry); \
24659- __changed; \
24660-})
24661-
24662-#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
24663-#define ptep_clear_flush_young(vma, address, ptep) \
24664-({ \
24665- pte_t __pte = *(ptep); \
24666- int __young = pte_young(__pte); \
24667- __pte = pte_mkold(__pte); \
24668- if (PagePinned(virt_to_page((vma)->vm_mm->pgd))) \
24669- (void)ptep_set_access_flags(vma, address, ptep, __pte, __young); \
24670- else if (__young) \
24671- set_pte(ptep, __pte); \
24672- __young; \
24673-})
24674-
24675 /* Encode and de-code a swap entry */
24676 #define __swp_type(x) (((x).val >> 1) & 0x3f)
24677 #define __swp_offset(x) ((x).val >> 8)
24678 #define __swp_entry(type, offset) ((swp_entry_t) { ((type) << 1) | ((offset) << 8) })
24679 #define __pte_to_swp_entry(pte) ((swp_entry_t) { __pte_val(pte) })
24680-#define __swp_entry_to_pte(x) ((pte_t) { (x).val })
24681-
24682-extern spinlock_t pgd_lock;
24683-extern struct list_head pgd_list;
24684+#define __swp_entry_to_pte(x) ((pte_t) { .pte = (x).val })
24685
24686 extern int kern_addr_valid(unsigned long addr);
24687-
24688-#define DOMID_LOCAL (0xFFFFU)
24689-
24690-struct vm_area_struct;
24691-
24692-int direct_remap_pfn_range(struct vm_area_struct *vma,
24693- unsigned long address,
24694- unsigned long mfn,
24695- unsigned long size,
24696- pgprot_t prot,
24697- domid_t domid);
24698-
24699-int direct_kernel_remap_pfn_range(unsigned long address,
24700- unsigned long mfn,
24701- unsigned long size,
24702- pgprot_t prot,
24703- domid_t domid);
24704-
24705-int create_lookup_pte_addr(struct mm_struct *mm,
24706- unsigned long address,
24707- uint64_t *ptep);
24708-
24709-int touch_pte_range(struct mm_struct *mm,
24710- unsigned long address,
24711- unsigned long size);
24712-
24713-int xen_change_pte_range(struct mm_struct *mm, pmd_t *pmd,
24714- unsigned long addr, unsigned long end, pgprot_t newprot,
24715- int dirty_accountable);
24716-
24717-#define arch_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable) \
24718- xen_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable)
24719-
24720-pte_t *lookup_address(unsigned long addr);
24721+extern void cleanup_highmap(void);
24722
24723 #define io_remap_pfn_range(vma, vaddr, pfn, size, prot) \
24724 direct_remap_pfn_range(vma,vaddr,pfn,size,prot,DOMID_IO)
24725
24726 #define HAVE_ARCH_UNMAPPED_AREA
24727+#define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
24728
24729 #define pgtable_cache_init() do { } while (0)
24730 #define check_pgt_cache() do { } while (0)
24731@@ -563,13 +287,7 @@ pte_t *lookup_address(unsigned long addr
24732 #define kc_offset_to_vaddr(o) \
24733 (((o) & (1UL << (__VIRTUAL_MASK_SHIFT-1))) ? ((o) | (~__VIRTUAL_MASK)) : (o))
24734
24735-#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
24736-#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
24737-#define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
24738-#define __HAVE_ARCH_PTEP_CLEAR_FLUSH
24739-#define __HAVE_ARCH_PTEP_SET_WRPROTECT
24740 #define __HAVE_ARCH_PTE_SAME
24741-#include <asm-generic/pgtable.h>
24742 #endif /* !__ASSEMBLY__ */
24743
24744 #endif /* _X86_64_PGTABLE_H */
24745--- sle11-2009-06-29.orig/include/asm-x86/mach-xen/asm/processor.h 2009-02-16 16:18:36.000000000 +0100
24746+++ sle11-2009-06-29/include/asm-x86/mach-xen/asm/processor.h 2009-03-16 16:33:40.000000000 +0100
24747@@ -1,5 +1,793 @@
24748+#ifndef __ASM_X86_PROCESSOR_H
24749+#define __ASM_X86_PROCESSOR_H
24750+
24751+#include <asm/processor-flags.h>
24752+
24753+/* migration helpers, for KVM - will be removed in 2.6.25: */
24754+#include <asm/vm86.h>
24755+#define Xgt_desc_struct desc_ptr
24756+
24757+/* Forward declaration, a strange C thing */
24758+struct task_struct;
24759+struct mm_struct;
24760+
24761+#include <asm/vm86.h>
24762+#include <asm/math_emu.h>
24763+#include <asm/segment.h>
24764+#include <asm/types.h>
24765+#include <asm/sigcontext.h>
24766+#include <asm/current.h>
24767+#include <asm/cpufeature.h>
24768+#include <asm/system.h>
24769+#include <asm/page.h>
24770+#include <asm/percpu.h>
24771+#include <asm/msr.h>
24772+#include <asm/desc_defs.h>
24773+#include <asm/nops.h>
24774+#include <linux/personality.h>
24775+#include <linux/cpumask.h>
24776+#include <linux/cache.h>
24777+#include <linux/threads.h>
24778+#include <linux/init.h>
24779+#include <xen/interface/physdev.h>
24780+
24781+/*
24782+ * Default implementation of macro that returns current
24783+ * instruction pointer ("program counter").
24784+ */
24785+static inline void *current_text_addr(void)
24786+{
24787+ void *pc;
24788+ asm volatile("mov $1f,%0\n1:":"=r" (pc));
24789+ return pc;
24790+}
24791+
24792+#ifdef CONFIG_X86_VSMP
24793+#define ARCH_MIN_TASKALIGN (1 << INTERNODE_CACHE_SHIFT)
24794+#define ARCH_MIN_MMSTRUCT_ALIGN (1 << INTERNODE_CACHE_SHIFT)
24795+#else
24796+#define ARCH_MIN_TASKALIGN 16
24797+#define ARCH_MIN_MMSTRUCT_ALIGN 0
24798+#endif
24799+
24800+/*
24801+ * CPU type and hardware bug flags. Kept separately for each CPU.
24802+ * Members of this structure are referenced in head.S, so think twice
24803+ * before touching them. [mj]
24804+ */
24805+
24806+struct cpuinfo_x86 {
24807+ __u8 x86; /* CPU family */
24808+ __u8 x86_vendor; /* CPU vendor */
24809+ __u8 x86_model;
24810+ __u8 x86_mask;
24811+#ifdef CONFIG_X86_32
24812+ char wp_works_ok; /* It doesn't on 386's */
24813+ char hlt_works_ok; /* Problems on some 486Dx4's and old 386's */
24814+ char hard_math;
24815+ char rfu;
24816+ char fdiv_bug;
24817+ char f00f_bug;
24818+ char coma_bug;
24819+ char pad0;
24820+#else
24821+ /* number of 4K pages in DTLB/ITLB combined(in pages)*/
24822+ int x86_tlbsize;
24823+ __u8 x86_virt_bits, x86_phys_bits;
24824+ /* cpuid returned core id bits */
24825+ __u8 x86_coreid_bits;
24826+ /* Max extended CPUID function supported */
24827+ __u32 extended_cpuid_level;
24828+#endif
24829+ int cpuid_level; /* Maximum supported CPUID level, -1=no CPUID */
24830+ __u32 x86_capability[NCAPINTS];
24831+ char x86_vendor_id[16];
24832+ char x86_model_id[64];
24833+	int x86_cache_size; /* in KB - valid for CPUs which support this
24834+ call */
24835+ int x86_cache_alignment; /* In bytes */
24836+ int x86_power;
24837+ unsigned long loops_per_jiffy;
24838+#ifdef CONFIG_SMP
24839+ cpumask_t llc_shared_map; /* cpus sharing the last level cache */
24840+#endif
24841+ u16 x86_max_cores; /* cpuid returned max cores value */
24842+ u16 apicid;
24843+ u16 x86_clflush_size;
24844+#ifdef CONFIG_SMP
24845+ u16 booted_cores; /* number of cores as seen by OS */
24846+ u16 phys_proc_id; /* Physical processor id. */
24847+ u16 cpu_core_id; /* Core id */
24848+ u16 cpu_index; /* index into per_cpu list */
24849+#endif
24850+} __attribute__((__aligned__(SMP_CACHE_BYTES)));
24851+
24852+#define X86_VENDOR_INTEL 0
24853+#define X86_VENDOR_CYRIX 1
24854+#define X86_VENDOR_AMD 2
24855+#define X86_VENDOR_UMC 3
24856+#define X86_VENDOR_NEXGEN 4
24857+#define X86_VENDOR_CENTAUR 5
24858+#define X86_VENDOR_TRANSMETA 7
24859+#define X86_VENDOR_NSC 8
24860+#define X86_VENDOR_NUM 9
24861+#define X86_VENDOR_UNKNOWN 0xff
24862+
24863+/*
24864+ * capabilities of CPUs
24865+ */
24866+extern struct cpuinfo_x86 boot_cpu_data;
24867+extern struct cpuinfo_x86 new_cpu_data;
24868+extern __u32 cleared_cpu_caps[NCAPINTS];
24869+
24870+#ifdef CONFIG_SMP
24871+DECLARE_PER_CPU(struct cpuinfo_x86, cpu_info);
24872+#define cpu_data(cpu) per_cpu(cpu_info, cpu)
24873+#define current_cpu_data cpu_data(smp_processor_id())
24874+#else
24875+#define cpu_data(cpu) boot_cpu_data
24876+#define current_cpu_data boot_cpu_data
24877+#endif
24878+
24879+void cpu_detect(struct cpuinfo_x86 *c);
24880+
24881+extern void identify_cpu(struct cpuinfo_x86 *);
24882+extern void identify_boot_cpu(void);
24883+extern void identify_secondary_cpu(struct cpuinfo_x86 *);
24884+extern void print_cpu_info(struct cpuinfo_x86 *);
24885+extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c);
24886+extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c);
24887+extern unsigned short num_cache_leaves;
24888+
24889+#if defined(CONFIG_X86_HT) || defined(CONFIG_X86_64)
24890+extern void detect_ht(struct cpuinfo_x86 *c);
24891+#else
24892+static inline void detect_ht(struct cpuinfo_x86 *c) {}
24893+#endif
24894+
24895+static inline void xen_cpuid(unsigned int *eax, unsigned int *ebx,
24896+ unsigned int *ecx, unsigned int *edx)
24897+{
24898+ /* ecx is often an input as well as an output. */
24899+ __asm__(XEN_CPUID
24900+ : "=a" (*eax),
24901+ "=b" (*ebx),
24902+ "=c" (*ecx),
24903+ "=d" (*edx)
24904+ : "0" (*eax), "2" (*ecx));
24905+}
24906+
24907+static inline void load_cr3(pgd_t *pgdir)
24908+{
24909+ write_cr3(__pa(pgdir));
24910+}
24911+
24912+#ifndef CONFIG_X86_NO_TSS
24913+#ifdef CONFIG_X86_32
24914+/* This is the TSS defined by the hardware. */
24915+struct x86_hw_tss {
24916+ unsigned short back_link, __blh;
24917+ unsigned long sp0;
24918+ unsigned short ss0, __ss0h;
24919+ unsigned long sp1;
24920+ unsigned short ss1, __ss1h; /* ss1 caches MSR_IA32_SYSENTER_CS */
24921+ unsigned long sp2;
24922+ unsigned short ss2, __ss2h;
24923+ unsigned long __cr3;
24924+ unsigned long ip;
24925+ unsigned long flags;
24926+ unsigned long ax, cx, dx, bx;
24927+ unsigned long sp, bp, si, di;
24928+ unsigned short es, __esh;
24929+ unsigned short cs, __csh;
24930+ unsigned short ss, __ssh;
24931+ unsigned short ds, __dsh;
24932+ unsigned short fs, __fsh;
24933+ unsigned short gs, __gsh;
24934+ unsigned short ldt, __ldth;
24935+ unsigned short trace, io_bitmap_base;
24936+} __attribute__((packed));
24937+extern struct tss_struct doublefault_tss;
24938+#else
24939+struct x86_hw_tss {
24940+ u32 reserved1;
24941+ u64 sp0;
24942+ u64 sp1;
24943+ u64 sp2;
24944+ u64 reserved2;
24945+ u64 ist[7];
24946+ u32 reserved3;
24947+ u32 reserved4;
24948+ u16 reserved5;
24949+ u16 io_bitmap_base;
24950+} __attribute__((packed)) ____cacheline_aligned;
24951+#endif
24952+#endif /* CONFIG_X86_NO_TSS */
24953+
24954+/*
24955+ * Size of io_bitmap.
24956+ */
24957+#define IO_BITMAP_BITS 65536
24958+#define IO_BITMAP_BYTES (IO_BITMAP_BITS/8)
24959+#define IO_BITMAP_LONGS (IO_BITMAP_BYTES/sizeof(long))
24960+#define IO_BITMAP_OFFSET offsetof(struct tss_struct, io_bitmap)
24961+#define INVALID_IO_BITMAP_OFFSET 0x8000
24962+#define INVALID_IO_BITMAP_OFFSET_LAZY 0x9000
24963+
24964+#ifndef CONFIG_X86_NO_TSS
24965+struct tss_struct {
24966+ struct x86_hw_tss x86_tss;
24967+
24968+ /*
24969+ * The extra 1 is there because the CPU will access an
24970+ * additional byte beyond the end of the IO permission
24971+ * bitmap. The extra byte must be all 1 bits, and must
24972+ * be within the limit.
24973+ */
24974+ unsigned long io_bitmap[IO_BITMAP_LONGS + 1];
24975+ /*
24976+ * Cache the current maximum and the last task that used the bitmap:
24977+ */
24978+ unsigned long io_bitmap_max;
24979+ struct thread_struct *io_bitmap_owner;
24980+ /*
24981+ * pads the TSS to be cacheline-aligned (size is 0x100)
24982+ */
24983+ unsigned long __cacheline_filler[35];
24984+ /*
24985+ * .. and then another 0x100 bytes for emergency kernel stack
24986+ */
24987+ unsigned long stack[64];
24988+} __attribute__((packed));
24989+
24990+DECLARE_PER_CPU(struct tss_struct, init_tss);
24991+
24992+/* Save the original ist values for checking stack pointers during debugging */
24993+struct orig_ist {
24994+ unsigned long ist[7];
24995+};
24996+#endif /* CONFIG_X86_NO_TSS */
24997+
24998+#define MXCSR_DEFAULT 0x1f80
24999+
25000+struct i387_fsave_struct {
25001+ u32 cwd;
25002+ u32 swd;
25003+ u32 twd;
25004+ u32 fip;
25005+ u32 fcs;
25006+ u32 foo;
25007+ u32 fos;
25008+ u32 st_space[20]; /* 8*10 bytes for each FP-reg = 80 bytes */
25009+ u32 status; /* software status information */
25010+};
25011+
25012+struct i387_fxsave_struct {
25013+ u16 cwd;
25014+ u16 swd;
25015+ u16 twd;
25016+ u16 fop;
25017+ union {
25018+ struct {
25019+ u64 rip;
25020+ u64 rdp;
25021+ };
25022+ struct {
25023+ u32 fip;
25024+ u32 fcs;
25025+ u32 foo;
25026+ u32 fos;
25027+ };
25028+ };
25029+ u32 mxcsr;
25030+ u32 mxcsr_mask;
25031+ u32 st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */
25032+ u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 256 bytes */
25033+ u32 padding[24];
25034+} __attribute__((aligned(16)));
25035+
25036+struct i387_soft_struct {
25037+ u32 cwd;
25038+ u32 swd;
25039+ u32 twd;
25040+ u32 fip;
25041+ u32 fcs;
25042+ u32 foo;
25043+ u32 fos;
25044+ u32 st_space[20]; /* 8*10 bytes for each FP-reg = 80 bytes */
25045+ u8 ftop, changed, lookahead, no_update, rm, alimit;
25046+ struct info *info;
25047+ u32 entry_eip;
25048+};
25049+
25050+union i387_union {
25051+ struct i387_fsave_struct fsave;
25052+ struct i387_fxsave_struct fxsave;
25053+ struct i387_soft_struct soft;
25054+};
25055+
25056+#ifdef CONFIG_X86_32
25057+DECLARE_PER_CPU(u8, cpu_llc_id);
25058+#elif !defined(CONFIG_X86_NO_TSS)
25059+DECLARE_PER_CPU(struct orig_ist, orig_ist);
25060+#endif
25061+
25062+extern void print_cpu_info(struct cpuinfo_x86 *);
25063+extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c);
25064+extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c);
25065+extern unsigned short num_cache_leaves;
25066+
25067+struct thread_struct {
25068+/* cached TLS descriptors. */
25069+ struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES];
25070+ unsigned long sp0;
25071+ unsigned long sp;
25072+#ifdef CONFIG_X86_32
25073+ unsigned long sysenter_cs;
25074+#else
25075+ unsigned long usersp; /* Copy from PDA */
25076+ unsigned short es, ds, fsindex, gsindex;
25077+#endif
25078+ unsigned long ip;
25079+ unsigned long fs;
25080+ unsigned long gs;
25081+/* Hardware debugging registers */
25082+ unsigned long debugreg0;
25083+ unsigned long debugreg1;
25084+ unsigned long debugreg2;
25085+ unsigned long debugreg3;
25086+ unsigned long debugreg6;
25087+ unsigned long debugreg7;
25088+/* fault info */
25089+ unsigned long cr2, trap_no, error_code;
25090+/* floating point info */
25091+	union i387_union i387 __attribute__((aligned(16)));
25092+#ifdef CONFIG_X86_32
25093+/* virtual 86 mode info */
25094+ struct vm86_struct __user *vm86_info;
25095+ unsigned long screen_bitmap;
25096+ unsigned long v86flags, v86mask, saved_sp0;
25097+ unsigned int saved_fs, saved_gs;
25098+#endif
25099+/* IO permissions */
25100+ unsigned long *io_bitmap_ptr;
25101+ unsigned long iopl;
25102+/* max allowed port in the bitmap, in bytes: */
25103+ unsigned io_bitmap_max;
25104+/* MSR_IA32_DEBUGCTLMSR value to switch in if TIF_DEBUGCTLMSR is set. */
25105+ unsigned long debugctlmsr;
25106+/* Debug Store - if not 0 points to a DS Save Area configuration;
25107+ * goes into MSR_IA32_DS_AREA */
25108+ unsigned long ds_area_msr;
25109+};
25110+
25111+static inline unsigned long xen_get_debugreg(int regno)
25112+{
25113+ return HYPERVISOR_get_debugreg(regno);
25114+}
25115+
25116+static inline void xen_set_debugreg(int regno, unsigned long value)
25117+{
25118+ WARN_ON(HYPERVISOR_set_debugreg(regno, value));
25119+}
25120+
25121+/*
25122+ * Set IOPL bits in EFLAGS from given mask
25123+ */
25124+static inline void xen_set_iopl_mask(unsigned mask)
25125+{
25126+ struct physdev_set_iopl set_iopl;
25127+
25128+ /* Force the change at ring 0. */
25129+ set_iopl.iopl = (mask == 0) ? 1 : (mask >> 12) & 3;
25130+ WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl));
25131+}
25132+
25133+#ifndef CONFIG_X86_NO_TSS
25134+static inline void native_load_sp0(struct tss_struct *tss,
25135+ struct thread_struct *thread)
25136+{
25137+ tss->x86_tss.sp0 = thread->sp0;
25138+#ifdef CONFIG_X86_32
25139+ /* Only happens when SEP is enabled, no need to test "SEP"arately */
25140+ if (unlikely(tss->x86_tss.ss1 != thread->sysenter_cs)) {
25141+ tss->x86_tss.ss1 = thread->sysenter_cs;
25142+ wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
25143+ }
25144+#endif
25145+}
25146+#else
25147+#define xen_load_sp0(tss, thread) do { \
25148+ if (HYPERVISOR_stack_switch(__KERNEL_DS, (thread)->sp0)) \
25149+ BUG(); \
25150+} while (0)
25151+#endif
25152+
25153+#define __cpuid xen_cpuid
25154+#define paravirt_enabled() 0
25155+
25156+/*
25157+ * These special macros can be used to get or set a debugging register
25158+ */
25159+#define get_debugreg(var, register) \
25160+ (var) = xen_get_debugreg(register)
25161+#define set_debugreg(value, register) \
25162+ xen_set_debugreg(register, value)
25163+
25164+#define load_sp0 xen_load_sp0
25165+
25166+#define set_iopl_mask xen_set_iopl_mask
25167+
25168+/*
25169+ * Save the cr4 feature set we're using (i.e.,
25170+ * Pentium 4MB enable and PPro Global page
25171+ * enable), so that any CPUs that boot up
25172+ * after us can get the correct flags.
25173+ */
25174+extern unsigned long mmu_cr4_features;
25175+
25176+static inline void set_in_cr4(unsigned long mask)
25177+{
25178+ unsigned cr4;
25179+ mmu_cr4_features |= mask;
25180+ cr4 = read_cr4();
25181+ cr4 |= mask;
25182+ write_cr4(cr4);
25183+}
25184+
25185+static inline void clear_in_cr4(unsigned long mask)
25186+{
25187+ unsigned cr4;
25188+ mmu_cr4_features &= ~mask;
25189+ cr4 = read_cr4();
25190+ cr4 &= ~mask;
25191+ write_cr4(cr4);
25192+}
25193+
25194+struct microcode_header {
25195+ unsigned int hdrver;
25196+ unsigned int rev;
25197+ unsigned int date;
25198+ unsigned int sig;
25199+ unsigned int cksum;
25200+ unsigned int ldrver;
25201+ unsigned int pf;
25202+ unsigned int datasize;
25203+ unsigned int totalsize;
25204+ unsigned int reserved[3];
25205+};
25206+
25207+struct microcode {
25208+ struct microcode_header hdr;
25209+ unsigned int bits[0];
25210+};
25211+
25212+typedef struct microcode microcode_t;
25213+typedef struct microcode_header microcode_header_t;
25214+
25215+/* microcode format is extended from prescott processors */
25216+struct extended_signature {
25217+ unsigned int sig;
25218+ unsigned int pf;
25219+ unsigned int cksum;
25220+};
25221+
25222+struct extended_sigtable {
25223+ unsigned int count;
25224+ unsigned int cksum;
25225+ unsigned int reserved[3];
25226+ struct extended_signature sigs[0];
25227+};
25228+
25229+typedef struct {
25230+ unsigned long seg;
25231+} mm_segment_t;
25232+
25233+
25234+/*
25235+ * create a kernel thread without removing it from tasklists
25236+ */
25237+extern int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags);
25238+
25239+/* Free all resources held by a thread. */
25240+extern void release_thread(struct task_struct *);
25241+
25242+/* Prepare to copy thread state - unlazy all lazy status */
25243+extern void prepare_to_copy(struct task_struct *tsk);
25244+
25245+unsigned long get_wchan(struct task_struct *p);
25246+
25247+/*
25248+ * Generic CPUID function
25249+ * clear %ecx since some cpus (Cyrix MII) do not set or clear %ecx
25250+ * resulting in stale register contents being returned.
25251+ */
25252+static inline void cpuid(unsigned int op,
25253+ unsigned int *eax, unsigned int *ebx,
25254+ unsigned int *ecx, unsigned int *edx)
25255+{
25256+ *eax = op;
25257+ *ecx = 0;
25258+ __cpuid(eax, ebx, ecx, edx);
25259+}
25260+
25261+/* Some CPUID calls want 'count' to be placed in ecx */
25262+static inline void cpuid_count(unsigned int op, int count,
25263+ unsigned int *eax, unsigned int *ebx,
25264+ unsigned int *ecx, unsigned int *edx)
25265+{
25266+ *eax = op;
25267+ *ecx = count;
25268+ __cpuid(eax, ebx, ecx, edx);
25269+}
25270+
25271+/*
25272+ * CPUID functions returning a single datum
25273+ */
25274+static inline unsigned int cpuid_eax(unsigned int op)
25275+{
25276+ unsigned int eax, ebx, ecx, edx;
25277+
25278+ cpuid(op, &eax, &ebx, &ecx, &edx);
25279+ return eax;
25280+}
25281+static inline unsigned int cpuid_ebx(unsigned int op)
25282+{
25283+ unsigned int eax, ebx, ecx, edx;
25284+
25285+ cpuid(op, &eax, &ebx, &ecx, &edx);
25286+ return ebx;
25287+}
25288+static inline unsigned int cpuid_ecx(unsigned int op)
25289+{
25290+ unsigned int eax, ebx, ecx, edx;
25291+
25292+ cpuid(op, &eax, &ebx, &ecx, &edx);
25293+ return ecx;
25294+}
25295+static inline unsigned int cpuid_edx(unsigned int op)
25296+{
25297+ unsigned int eax, ebx, ecx, edx;
25298+
25299+ cpuid(op, &eax, &ebx, &ecx, &edx);
25300+ return edx;
25301+}
25302+
25303+/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */
25304+static inline void rep_nop(void)
25305+{
25306+ __asm__ __volatile__("rep;nop": : :"memory");
25307+}
25308+
25309+/* Stop speculative execution */
25310+static inline void sync_core(void)
25311+{
25312+ int tmp;
25313+ asm volatile("cpuid" : "=a" (tmp) : "0" (1)
25314+ : "ebx", "ecx", "edx", "memory");
25315+}
25316+
25317+#define cpu_relax() rep_nop()
25318+
25319+static inline void __monitor(const void *eax, unsigned long ecx,
25320+ unsigned long edx)
25321+{
25322+ /* "monitor %eax,%ecx,%edx;" */
25323+ asm volatile(
25324+ ".byte 0x0f,0x01,0xc8;"
25325+ : :"a" (eax), "c" (ecx), "d"(edx));
25326+}
25327+
25328+static inline void __mwait(unsigned long eax, unsigned long ecx)
25329+{
25330+ /* "mwait %eax,%ecx;" */
25331+ asm volatile(
25332+ ".byte 0x0f,0x01,0xc9;"
25333+ : :"a" (eax), "c" (ecx));
25334+}
25335+
25336+static inline void __sti_mwait(unsigned long eax, unsigned long ecx)
25337+{
25338+ /* "mwait %eax,%ecx;" */
25339+ asm volatile(
25340+ "sti; .byte 0x0f,0x01,0xc9;"
25341+ : :"a" (eax), "c" (ecx));
25342+}
25343+
25344+extern void mwait_idle_with_hints(unsigned long eax, unsigned long ecx);
25345+
25346+extern int force_mwait;
25347+
25348+extern void select_idle_routine(const struct cpuinfo_x86 *c);
25349+
25350+extern unsigned long boot_option_idle_override;
25351+
25352+extern void enable_sep_cpu(void);
25353+extern int sysenter_setup(void);
25354+
25355+/* Defined in head.S */
25356+extern struct desc_ptr early_gdt_descr;
25357+
25358+extern void cpu_set_gdt(int);
25359+extern void switch_to_new_gdt(void);
25360+extern void cpu_init(void);
25361+extern void init_gdt(int cpu);
25362+
25363+/* from system description table in BIOS. Mostly for MCA use, but
25364+ * others may find it useful. */
25365+extern unsigned int machine_id;
25366+extern unsigned int machine_submodel_id;
25367+extern unsigned int BIOS_revision;
25368+
25369+/* Boot loader type from the setup header */
25370+extern int bootloader_type;
25371+
25372+extern char ignore_fpu_irq;
25373+#define cache_line_size() (boot_cpu_data.x86_cache_alignment)
25374+
25375+#define HAVE_ARCH_PICK_MMAP_LAYOUT 1
25376+#define ARCH_HAS_PREFETCHW
25377+#define ARCH_HAS_SPINLOCK_PREFETCH
25378+
25379+#ifdef CONFIG_X86_32
25380+#define BASE_PREFETCH ASM_NOP4
25381+#define ARCH_HAS_PREFETCH
25382+#else
25383+#define BASE_PREFETCH "prefetcht0 (%1)"
25384+#endif
25385+
25386+/* Prefetch instructions for Pentium III and AMD Athlon */
25387+/* It's not worth caring about 3dnow! prefetches for the K6
25388+   because they are microcoded there and very slow.
25389+   However, we don't do prefetches for pre-XP Athlons currently;
25390+   that should be fixed. */
25391+static inline void prefetch(const void *x)
25392+{
25393+ alternative_input(BASE_PREFETCH,
25394+ "prefetchnta (%1)",
25395+ X86_FEATURE_XMM,
25396+ "r" (x));
25397+}
25398+
25399+/* 3dnow! prefetch to get an exclusive cache line. Useful for
25400+ spinlocks to avoid one state transition in the cache coherency protocol. */
25401+static inline void prefetchw(const void *x)
25402+{
25403+ alternative_input(BASE_PREFETCH,
25404+ "prefetchw (%1)",
25405+ X86_FEATURE_3DNOW,
25406+ "r" (x));
25407+}
25408+
25409+#define spin_lock_prefetch(x) prefetchw(x)
25410 #ifdef CONFIG_X86_32
25411-# include "processor_32.h"
25412+/*
25413+ * User space process size: 3GB (default).
25414+ */
25415+#define TASK_SIZE (PAGE_OFFSET)
25416+#define STACK_TOP TASK_SIZE
25417+#define STACK_TOP_MAX STACK_TOP
25418+
25419+#define INIT_THREAD { \
25420+ .sp0 = sizeof(init_stack) + (long)&init_stack, \
25421+ .vm86_info = NULL, \
25422+ .sysenter_cs = __KERNEL_CS, \
25423+ .io_bitmap_ptr = NULL, \
25424+ .fs = __KERNEL_PERCPU, \
25425+}
25426+
25427+/*
25428+ * Note that the .io_bitmap member must be extra-big. This is because
25429+ * the CPU will access an additional byte beyond the end of the IO
25430+ * permission bitmap. The extra byte must be all 1 bits, and must
25431+ * be within the limit.
25432+ */
25433+#define INIT_TSS { \
25434+ .x86_tss = { \
25435+ .sp0 = sizeof(init_stack) + (long)&init_stack, \
25436+ .ss0 = __KERNEL_DS, \
25437+ .ss1 = __KERNEL_CS, \
25438+ .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, \
25439+ }, \
25440+ .io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 }, \
25441+}
25442+
25443+#define start_thread(regs, new_eip, new_esp) do { \
25444+ __asm__("movl %0,%%gs": :"r" (0)); \
25445+ regs->fs = 0; \
25446+ set_fs(USER_DS); \
25447+ regs->ds = __USER_DS; \
25448+ regs->es = __USER_DS; \
25449+ regs->ss = __USER_DS; \
25450+ regs->cs = __USER_CS; \
25451+ regs->ip = new_eip; \
25452+ regs->sp = new_esp; \
25453+} while (0)
25454+
25455+
25456+extern unsigned long thread_saved_pc(struct task_struct *tsk);
25457+
25458+#define THREAD_SIZE_LONGS (THREAD_SIZE/sizeof(unsigned long))
25459+#define KSTK_TOP(info) \
25460+({ \
25461+ unsigned long *__ptr = (unsigned long *)(info); \
25462+ (unsigned long)(&__ptr[THREAD_SIZE_LONGS]); \
25463+})
25464+
25465+/*
25466+ * The below -8 is to reserve 8 bytes on top of the ring0 stack.
25467+ * This is necessary to guarantee that the entire "struct pt_regs"
25468+ * is accessible even if the CPU hasn't stored the SS/ESP registers
25469+ * on the stack (interrupt gate does not save these registers
25470+ * when switching to the same priv ring).
25471+ * Therefore beware: accessing the ss/esp fields of the
25472+ * "struct pt_regs" is possible, but they may contain the
25473+ * completely wrong values.
25474+ */
25475+#define task_pt_regs(task) \
25476+({ \
25477+ struct pt_regs *__regs__; \
25478+ __regs__ = (struct pt_regs *)(KSTK_TOP(task_stack_page(task))-8); \
25479+ __regs__ - 1; \
25480+})
25481+
25482+#define KSTK_ESP(task) (task_pt_regs(task)->sp)
25483+
25484 #else
25485-# include "processor_64.h"
25486+/*
25487+ * User space process size: 47 bits minus one guard page.
25488+ */
25489+#define TASK_SIZE64 (0x800000000000UL - 4096)
25490+
25491+/* This decides where the kernel will search for a free chunk of vm
25492+ * space during mmap's.
25493+ */
25494+#define IA32_PAGE_OFFSET ((current->personality & ADDR_LIMIT_3GB) ? \
25495+ 0xc0000000 : 0xFFFFe000)
25496+
25497+#define TASK_SIZE (test_thread_flag(TIF_IA32) ? \
25498+ IA32_PAGE_OFFSET : TASK_SIZE64)
25499+#define TASK_SIZE_OF(child) ((test_tsk_thread_flag(child, TIF_IA32)) ? \
25500+ IA32_PAGE_OFFSET : TASK_SIZE64)
25501+
25502+#define STACK_TOP TASK_SIZE
25503+#define STACK_TOP_MAX TASK_SIZE64
25504+
25505+#define INIT_THREAD { \
25506+ .sp0 = (unsigned long)&init_stack + sizeof(init_stack) \
25507+}
25508+
25509+#define INIT_TSS { \
25510+ .x86_tss.sp0 = (unsigned long)&init_stack + sizeof(init_stack) \
25511+}
25512+
25513+#define start_thread(regs, new_rip, new_rsp) do { \
25514+ asm volatile("movl %0,%%fs; movl %0,%%es; movl %0,%%ds": :"r" (0)); \
25515+ load_gs_index(0); \
25516+ (regs)->ip = (new_rip); \
25517+ (regs)->sp = (new_rsp); \
25518+ write_pda(oldrsp, (new_rsp)); \
25519+ (regs)->cs = __USER_CS; \
25520+ (regs)->ss = __USER_DS; \
25521+ (regs)->flags = 0x200; \
25522+ set_fs(USER_DS); \
25523+} while (0)
25524+
25525+/*
25526+ * Return saved PC of a blocked thread.
25527+ * What is this good for? It will always be the scheduler or ret_from_fork.
25528+ */
25529+#define thread_saved_pc(t) (*(unsigned long *)((t)->thread.sp - 8))
25530+
25531+#define task_pt_regs(tsk) ((struct pt_regs *)(tsk)->thread.sp0 - 1)
25532+#define KSTK_ESP(tsk) -1 /* sorry. doesn't work for syscall. */
25533+#endif /* CONFIG_X86_64 */
25534+
25535+/* This decides where the kernel will search for a free chunk of vm
25536+ * space during mmap's.
25537+ */
25538+#define TASK_UNMAPPED_BASE (PAGE_ALIGN(TASK_SIZE / 3))
25539+
25540+#define KSTK_EIP(task) (task_pt_regs(task)->ip)
25541+
25542 #endif
25543--- sle11-2009-06-29.orig/include/asm-x86/mach-xen/asm/processor_32.h 2009-02-16 16:18:36.000000000 +0100
25544+++ /dev/null 1970-01-01 00:00:00.000000000 +0000
25545@@ -1,751 +0,0 @@
25546-/*
25547- * include/asm-i386/processor.h
25548- *
25549- * Copyright (C) 1994 Linus Torvalds
25550- */
25551-
25552-#ifndef __ASM_I386_PROCESSOR_H
25553-#define __ASM_I386_PROCESSOR_H
25554-
25555-#include <asm/vm86.h>
25556-#include <asm/math_emu.h>
25557-#include <asm/segment.h>
25558-#include <asm/page.h>
25559-#include <asm/types.h>
25560-#include <asm/sigcontext.h>
25561-#include <asm/cpufeature.h>
25562-#include <asm/msr.h>
25563-#include <asm/system.h>
25564-#include <linux/cache.h>
25565-#include <linux/threads.h>
25566-#include <asm/percpu.h>
25567-#include <linux/cpumask.h>
25568-#include <linux/init.h>
25569-#include <asm/processor-flags.h>
25570-#include <xen/interface/physdev.h>
25571-
25572-/* flag for disabling the tsc */
25573-#define tsc_disable 0
25574-
25575-struct desc_struct {
25576- unsigned long a,b;
25577-};
25578-
25579-#define desc_empty(desc) \
25580- (!((desc)->a | (desc)->b))
25581-
25582-#define desc_equal(desc1, desc2) \
25583- (((desc1)->a == (desc2)->a) && ((desc1)->b == (desc2)->b))
25584-/*
25585- * Default implementation of macro that returns current
25586- * instruction pointer ("program counter").
25587- */
25588-#define current_text_addr() ({ void *pc; __asm__("movl $1f,%0\n1:":"=g" (pc)); pc; })
25589-
25590-/*
25591- * CPU type and hardware bug flags. Kept separately for each CPU.
25592- * Members of this structure are referenced in head.S, so think twice
25593- * before touching them. [mj]
25594- */
25595-
25596-struct cpuinfo_x86 {
25597- __u8 x86; /* CPU family */
25598- __u8 x86_vendor; /* CPU vendor */
25599- __u8 x86_model;
25600- __u8 x86_mask;
25601- char wp_works_ok; /* It doesn't on 386's */
25602- char hlt_works_ok; /* Problems on some 486Dx4's and old 386's */
25603- char hard_math;
25604- char rfu;
25605- int cpuid_level; /* Maximum supported CPUID level, -1=no CPUID */
25606- unsigned long x86_capability[NCAPINTS];
25607- char x86_vendor_id[16];
25608- char x86_model_id[64];
25609- int x86_cache_size; /* in KB - valid for CPUS which support this
25610- call */
25611- int x86_cache_alignment; /* In bytes */
25612- char fdiv_bug;
25613- char f00f_bug;
25614- char coma_bug;
25615- char pad0;
25616- int x86_power;
25617- unsigned long loops_per_jiffy;
25618-#ifdef CONFIG_SMP
25619- cpumask_t llc_shared_map; /* cpus sharing the last level cache */
25620-#endif
25621- unsigned char x86_max_cores; /* cpuid returned max cores value */
25622- unsigned char apicid;
25623- unsigned short x86_clflush_size;
25624-#ifdef CONFIG_SMP
25625- unsigned char booted_cores; /* number of cores as seen by OS */
25626- __u8 phys_proc_id; /* Physical processor id. */
25627- __u8 cpu_core_id; /* Core id */
25628- __u8 cpu_index; /* index into per_cpu list */
25629-#endif
25630-} __attribute__((__aligned__(SMP_CACHE_BYTES)));
25631-
25632-#define X86_VENDOR_INTEL 0
25633-#define X86_VENDOR_CYRIX 1
25634-#define X86_VENDOR_AMD 2
25635-#define X86_VENDOR_UMC 3
25636-#define X86_VENDOR_NEXGEN 4
25637-#define X86_VENDOR_CENTAUR 5
25638-#define X86_VENDOR_TRANSMETA 7
25639-#define X86_VENDOR_NSC 8
25640-#define X86_VENDOR_NUM 9
25641-#define X86_VENDOR_UNKNOWN 0xff
25642-
25643-/*
25644- * capabilities of CPUs
25645- */
25646-
25647-extern struct cpuinfo_x86 boot_cpu_data;
25648-extern struct cpuinfo_x86 new_cpu_data;
25649-#ifndef CONFIG_X86_NO_TSS
25650-extern struct tss_struct doublefault_tss;
25651-DECLARE_PER_CPU(struct tss_struct, init_tss);
25652-#endif
25653-
25654-#ifdef CONFIG_SMP
25655-DECLARE_PER_CPU(struct cpuinfo_x86, cpu_info);
25656-#define cpu_data(cpu) per_cpu(cpu_info, cpu)
25657-#define current_cpu_data cpu_data(smp_processor_id())
25658-#else
25659-#define cpu_data(cpu) boot_cpu_data
25660-#define current_cpu_data boot_cpu_data
25661-#endif
25662-
25663-/*
25664- * the following now lives in the per cpu area:
25665- * extern int cpu_llc_id[NR_CPUS];
25666- */
25667-DECLARE_PER_CPU(u8, cpu_llc_id);
25668-extern char ignore_fpu_irq;
25669-
25670-void __init cpu_detect(struct cpuinfo_x86 *c);
25671-
25672-extern void identify_boot_cpu(void);
25673-extern void identify_secondary_cpu(struct cpuinfo_x86 *);
25674-extern void print_cpu_info(struct cpuinfo_x86 *);
25675-extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c);
25676-extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c);
25677-extern unsigned short num_cache_leaves;
25678-
25679-#ifdef CONFIG_X86_HT
25680-extern void detect_ht(struct cpuinfo_x86 *c);
25681-#else
25682-static inline void detect_ht(struct cpuinfo_x86 *c) {}
25683-#endif
25684-
25685-static inline void xen_cpuid(unsigned int *eax, unsigned int *ebx,
25686- unsigned int *ecx, unsigned int *edx)
25687-{
25688- /* ecx is often an input as well as an output. */
25689- __asm__(XEN_CPUID
25690- : "=a" (*eax),
25691- "=b" (*ebx),
25692- "=c" (*ecx),
25693- "=d" (*edx)
25694- : "0" (*eax), "2" (*ecx));
25695-}
25696-
25697-#define load_cr3(pgdir) write_cr3(__pa(pgdir))
25698-
25699-/*
25700- * Save the cr4 feature set we're using (ie
25701- * Pentium 4MB enable and PPro Global page
25702- * enable), so that any CPU's that boot up
25703- * after us can get the correct flags.
25704- */
25705-extern unsigned long mmu_cr4_features;
25706-
25707-static inline void set_in_cr4 (unsigned long mask)
25708-{
25709- unsigned cr4;
25710- mmu_cr4_features |= mask;
25711- cr4 = read_cr4();
25712- cr4 |= mask;
25713- write_cr4(cr4);
25714-}
25715-
25716-static inline void clear_in_cr4 (unsigned long mask)
25717-{
25718- unsigned cr4;
25719- mmu_cr4_features &= ~mask;
25720- cr4 = read_cr4();
25721- cr4 &= ~mask;
25722- write_cr4(cr4);
25723-}
25724-
25725-/* Stop speculative execution */
25726-static inline void sync_core(void)
25727-{
25728- int tmp;
25729- asm volatile("cpuid" : "=a" (tmp) : "0" (1) : "ebx","ecx","edx","memory");
25730-}
25731-
25732-static inline void __monitor(const void *eax, unsigned long ecx,
25733- unsigned long edx)
25734-{
25735- /* "monitor %eax,%ecx,%edx;" */
25736- asm volatile(
25737- ".byte 0x0f,0x01,0xc8;"
25738- : :"a" (eax), "c" (ecx), "d"(edx));
25739-}
25740-
25741-static inline void __mwait(unsigned long eax, unsigned long ecx)
25742-{
25743- /* "mwait %eax,%ecx;" */
25744- asm volatile(
25745- ".byte 0x0f,0x01,0xc9;"
25746- : :"a" (eax), "c" (ecx));
25747-}
25748-
25749-extern void mwait_idle_with_hints(unsigned long eax, unsigned long ecx);
25750-
25751-/* from system description table in BIOS. Mostly for MCA use, but
25752-others may find it useful. */
25753-extern unsigned int machine_id;
25754-extern unsigned int machine_submodel_id;
25755-extern unsigned int BIOS_revision;
25756-extern unsigned int mca_pentium_flag;
25757-
25758-/* Boot loader type from the setup header */
25759-extern int bootloader_type;
25760-
25761-/*
25762- * User space process size: 3GB (default).
25763- */
25764-#define TASK_SIZE (PAGE_OFFSET)
25765-
25766-/* This decides where the kernel will search for a free chunk of vm
25767- * space during mmap's.
25768- */
25769-#define TASK_UNMAPPED_BASE (PAGE_ALIGN(TASK_SIZE / 3))
25770-
25771-#define HAVE_ARCH_PICK_MMAP_LAYOUT
25772-
25773-extern void hard_disable_TSC(void);
25774-extern void disable_TSC(void);
25775-extern void hard_enable_TSC(void);
25776-
25777-/*
25778- * Size of io_bitmap.
25779- */
25780-#define IO_BITMAP_BITS 65536
25781-#define IO_BITMAP_BYTES (IO_BITMAP_BITS/8)
25782-#define IO_BITMAP_LONGS (IO_BITMAP_BYTES/sizeof(long))
25783-#ifndef CONFIG_X86_NO_TSS
25784-#define IO_BITMAP_OFFSET offsetof(struct tss_struct,io_bitmap)
25785-#endif
25786-#define INVALID_IO_BITMAP_OFFSET 0x8000
25787-#define INVALID_IO_BITMAP_OFFSET_LAZY 0x9000
25788-
25789-struct i387_fsave_struct {
25790- long cwd;
25791- long swd;
25792- long twd;
25793- long fip;
25794- long fcs;
25795- long foo;
25796- long fos;
25797- long st_space[20]; /* 8*10 bytes for each FP-reg = 80 bytes */
25798- long status; /* software status information */
25799-};
25800-
25801-struct i387_fxsave_struct {
25802- unsigned short cwd;
25803- unsigned short swd;
25804- unsigned short twd;
25805- unsigned short fop;
25806- long fip;
25807- long fcs;
25808- long foo;
25809- long fos;
25810- long mxcsr;
25811- long mxcsr_mask;
25812- long st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */
25813- long xmm_space[32]; /* 8*16 bytes for each XMM-reg = 128 bytes */
25814- long padding[56];
25815-} __attribute__ ((aligned (16)));
25816-
25817-struct i387_soft_struct {
25818- long cwd;
25819- long swd;
25820- long twd;
25821- long fip;
25822- long fcs;
25823- long foo;
25824- long fos;
25825- long st_space[20]; /* 8*10 bytes for each FP-reg = 80 bytes */
25826- unsigned char ftop, changed, lookahead, no_update, rm, alimit;
25827- struct info *info;
25828- unsigned long entry_eip;
25829-};
25830-
25831-union i387_union {
25832- struct i387_fsave_struct fsave;
25833- struct i387_fxsave_struct fxsave;
25834- struct i387_soft_struct soft;
25835-};
25836-
25837-typedef struct {
25838- unsigned long seg;
25839-} mm_segment_t;
25840-
25841-struct thread_struct;
25842-
25843-#ifndef CONFIG_X86_NO_TSS
25844-/* This is the TSS defined by the hardware. */
25845-struct i386_hw_tss {
25846- unsigned short back_link,__blh;
25847- unsigned long esp0;
25848- unsigned short ss0,__ss0h;
25849- unsigned long esp1;
25850- unsigned short ss1,__ss1h; /* ss1 is used to cache MSR_IA32_SYSENTER_CS */
25851- unsigned long esp2;
25852- unsigned short ss2,__ss2h;
25853- unsigned long __cr3;
25854- unsigned long eip;
25855- unsigned long eflags;
25856- unsigned long eax,ecx,edx,ebx;
25857- unsigned long esp;
25858- unsigned long ebp;
25859- unsigned long esi;
25860- unsigned long edi;
25861- unsigned short es, __esh;
25862- unsigned short cs, __csh;
25863- unsigned short ss, __ssh;
25864- unsigned short ds, __dsh;
25865- unsigned short fs, __fsh;
25866- unsigned short gs, __gsh;
25867- unsigned short ldt, __ldth;
25868- unsigned short trace, io_bitmap_base;
25869-} __attribute__((packed));
25870-
25871-struct tss_struct {
25872- struct i386_hw_tss x86_tss;
25873-
25874- /*
25875- * The extra 1 is there because the CPU will access an
25876- * additional byte beyond the end of the IO permission
25877- * bitmap. The extra byte must be all 1 bits, and must
25878- * be within the limit.
25879- */
25880- unsigned long io_bitmap[IO_BITMAP_LONGS + 1];
25881- /*
25882- * Cache the current maximum and the last task that used the bitmap:
25883- */
25884- unsigned long io_bitmap_max;
25885- struct thread_struct *io_bitmap_owner;
25886- /*
25887- * pads the TSS to be cacheline-aligned (size is 0x100)
25888- */
25889- unsigned long __cacheline_filler[35];
25890- /*
25891- * .. and then another 0x100 bytes for emergency kernel stack
25892- */
25893- unsigned long stack[64];
25894-} __attribute__((packed));
25895-#endif
25896-
25897-#define ARCH_MIN_TASKALIGN 16
25898-
25899-struct thread_struct {
25900-/* cached TLS descriptors. */
25901- struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES];
25902- unsigned long esp0;
25903- unsigned long sysenter_cs;
25904- unsigned long eip;
25905- unsigned long esp;
25906- unsigned long fs;
25907- unsigned long gs;
25908-/* Hardware debugging registers */
25909- unsigned long debugreg[8]; /* %%db0-7 debug registers */
25910-/* fault info */
25911- unsigned long cr2, trap_no, error_code;
25912-/* floating point info */
25913- union i387_union i387;
25914-/* virtual 86 mode info */
25915- struct vm86_struct __user * vm86_info;
25916- unsigned long screen_bitmap;
25917- unsigned long v86flags, v86mask, saved_esp0;
25918- unsigned int saved_fs, saved_gs;
25919-/* IO permissions */
25920- unsigned long *io_bitmap_ptr;
25921- unsigned long iopl;
25922-/* max allowed port in the bitmap, in bytes: */
25923- unsigned long io_bitmap_max;
25924-};
25925-
25926-#define INIT_THREAD { \
25927- .esp0 = sizeof(init_stack) + (long)&init_stack, \
25928- .vm86_info = NULL, \
25929- .sysenter_cs = __KERNEL_CS, \
25930- .io_bitmap_ptr = NULL, \
25931- .fs = __KERNEL_PERCPU, \
25932-}
25933-
25934-/*
25935- * Note that the .io_bitmap member must be extra-big. This is because
25936- * the CPU will access an additional byte beyond the end of the IO
25937- * permission bitmap. The extra byte must be all 1 bits, and must
25938- * be within the limit.
25939- */
25940-#define INIT_TSS { \
25941- .x86_tss = { \
25942- .esp0 = sizeof(init_stack) + (long)&init_stack, \
25943- .ss0 = __KERNEL_DS, \
25944- .ss1 = __KERNEL_CS, \
25945- .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, \
25946- }, \
25947- .io_bitmap = { [ 0 ... IO_BITMAP_LONGS] = ~0 }, \
25948-}
25949-
25950-#define start_thread(regs, new_eip, new_esp) do { \
25951- __asm__("movl %0,%%gs": :"r" (0)); \
25952- regs->xfs = 0; \
25953- set_fs(USER_DS); \
25954- regs->xds = __USER_DS; \
25955- regs->xes = __USER_DS; \
25956- regs->xss = __USER_DS; \
25957- regs->xcs = __USER_CS; \
25958- regs->eip = new_eip; \
25959- regs->esp = new_esp; \
25960-} while (0)
25961-
25962-/* Forward declaration, a strange C thing */
25963-struct task_struct;
25964-struct mm_struct;
25965-
25966-/* Free all resources held by a thread. */
25967-extern void release_thread(struct task_struct *);
25968-
25969-/* Prepare to copy thread state - unlazy all lazy status */
25970-extern void prepare_to_copy(struct task_struct *tsk);
25971-
25972-/*
25973- * create a kernel thread without removing it from tasklists
25974- */
25975-extern int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags);
25976-
25977-extern unsigned long thread_saved_pc(struct task_struct *tsk);
25978-void show_trace(struct task_struct *task, struct pt_regs *regs, unsigned long *stack);
25979-
25980-unsigned long get_wchan(struct task_struct *p);
25981-
25982-#define THREAD_SIZE_LONGS (THREAD_SIZE/sizeof(unsigned long))
25983-#define KSTK_TOP(info) \
25984-({ \
25985- unsigned long *__ptr = (unsigned long *)(info); \
25986- (unsigned long)(&__ptr[THREAD_SIZE_LONGS]); \
25987-})
25988-
25989-/*
25990- * The below -8 is to reserve 8 bytes on top of the ring0 stack.
25991- * This is necessary to guarantee that the entire "struct pt_regs"
25992- * is accessable even if the CPU haven't stored the SS/ESP registers
25993- * on the stack (interrupt gate does not save these registers
25994- * when switching to the same priv ring).
25995- * Therefore beware: accessing the xss/esp fields of the
25996- * "struct pt_regs" is possible, but they may contain the
25997- * completely wrong values.
25998- */
25999-#define task_pt_regs(task) \
26000-({ \
26001- struct pt_regs *__regs__; \
26002- __regs__ = (struct pt_regs *)(KSTK_TOP(task_stack_page(task))-8); \
26003- __regs__ - 1; \
26004-})
26005-
26006-#define KSTK_EIP(task) (task_pt_regs(task)->eip)
26007-#define KSTK_ESP(task) (task_pt_regs(task)->esp)
26008-
26009-
26010-struct microcode_header {
26011- unsigned int hdrver;
26012- unsigned int rev;
26013- unsigned int date;
26014- unsigned int sig;
26015- unsigned int cksum;
26016- unsigned int ldrver;
26017- unsigned int pf;
26018- unsigned int datasize;
26019- unsigned int totalsize;
26020- unsigned int reserved[3];
26021-};
26022-
26023-struct microcode {
26024- struct microcode_header hdr;
26025- unsigned int bits[0];
26026-};
26027-
26028-typedef struct microcode microcode_t;
26029-typedef struct microcode_header microcode_header_t;
26030-
26031-/* microcode format is extended from prescott processors */
26032-struct extended_signature {
26033- unsigned int sig;
26034- unsigned int pf;
26035- unsigned int cksum;
26036-};
26037-
26038-struct extended_sigtable {
26039- unsigned int count;
26040- unsigned int cksum;
26041- unsigned int reserved[3];
26042- struct extended_signature sigs[0];
26043-};
26044-
26045-/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */
26046-static inline void rep_nop(void)
26047-{
26048- __asm__ __volatile__("rep;nop": : :"memory");
26049-}
26050-
26051-#define cpu_relax() rep_nop()
26052-
26053-#ifndef CONFIG_X86_NO_TSS
26054-static inline void native_load_esp0(struct tss_struct *tss, struct thread_struct *thread)
26055-{
26056- tss->x86_tss.esp0 = thread->esp0;
26057- /* This can only happen when SEP is enabled, no need to test "SEP"arately */
26058- if (unlikely(tss->x86_tss.ss1 != thread->sysenter_cs)) {
26059- tss->x86_tss.ss1 = thread->sysenter_cs;
26060- wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
26061- }
26062-}
26063-#else
26064-#define xen_load_esp0(tss, thread) do { \
26065- if (HYPERVISOR_stack_switch(__KERNEL_DS, (thread)->esp0)) \
26066- BUG(); \
26067-} while (0)
26068-#endif
26069-
26070-
26071-static inline unsigned long xen_get_debugreg(int regno)
26072-{
26073- return HYPERVISOR_get_debugreg(regno);
26074-}
26075-
26076-static inline void xen_set_debugreg(int regno, unsigned long value)
26077-{
26078- WARN_ON(HYPERVISOR_set_debugreg(regno, value));
26079-}
26080-
26081-/*
26082- * Set IOPL bits in EFLAGS from given mask
26083- */
26084-static inline void xen_set_iopl_mask(unsigned mask)
26085-{
26086- struct physdev_set_iopl set_iopl;
26087-
26088- /* Force the change at ring 0. */
26089- set_iopl.iopl = (mask == 0) ? 1 : (mask >> 12) & 3;
26090- WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl));
26091-}
26092-
26093-
26094-#define paravirt_enabled() 0
26095-#define __cpuid xen_cpuid
26096-
26097-#define load_esp0 xen_load_esp0
26098-
26099-/*
26100- * These special macros can be used to get or set a debugging register
26101- */
26102-#define get_debugreg(var, register) \
26103- (var) = xen_get_debugreg(register)
26104-#define set_debugreg(value, register) \
26105- xen_set_debugreg(register, value)
26106-
26107-#define set_iopl_mask xen_set_iopl_mask
26108-
26109-/*
26110- * Generic CPUID function
26111- * clear %ecx since some cpus (Cyrix MII) do not set or clear %ecx
26112- * resulting in stale register contents being returned.
26113- */
26114-static inline void cpuid(unsigned int op,
26115- unsigned int *eax, unsigned int *ebx,
26116- unsigned int *ecx, unsigned int *edx)
26117-{
26118- *eax = op;
26119- *ecx = 0;
26120- __cpuid(eax, ebx, ecx, edx);
26121-}
26122-
26123-/* Some CPUID calls want 'count' to be placed in ecx */
26124-static inline void cpuid_count(unsigned int op, int count,
26125- unsigned int *eax, unsigned int *ebx,
26126- unsigned int *ecx, unsigned int *edx)
26127-{
26128- *eax = op;
26129- *ecx = count;
26130- __cpuid(eax, ebx, ecx, edx);
26131-}
26132-
26133-/*
26134- * CPUID functions returning a single datum
26135- */
26136-static inline unsigned int cpuid_eax(unsigned int op)
26137-{
26138- unsigned int eax, ebx, ecx, edx;
26139-
26140- cpuid(op, &eax, &ebx, &ecx, &edx);
26141- return eax;
26142-}
26143-static inline unsigned int cpuid_ebx(unsigned int op)
26144-{
26145- unsigned int eax, ebx, ecx, edx;
26146-
26147- cpuid(op, &eax, &ebx, &ecx, &edx);
26148- return ebx;
26149-}
26150-static inline unsigned int cpuid_ecx(unsigned int op)
26151-{
26152- unsigned int eax, ebx, ecx, edx;
26153-
26154- cpuid(op, &eax, &ebx, &ecx, &edx);
26155- return ecx;
26156-}
26157-static inline unsigned int cpuid_edx(unsigned int op)
26158-{
26159- unsigned int eax, ebx, ecx, edx;
26160-
26161- cpuid(op, &eax, &ebx, &ecx, &edx);
26162- return edx;
26163-}
26164-
26165-/* generic versions from gas */
26166-#define GENERIC_NOP1 ".byte 0x90\n"
26167-#define GENERIC_NOP2 ".byte 0x89,0xf6\n"
26168-#define GENERIC_NOP3 ".byte 0x8d,0x76,0x00\n"
26169-#define GENERIC_NOP4 ".byte 0x8d,0x74,0x26,0x00\n"
26170-#define GENERIC_NOP5 GENERIC_NOP1 GENERIC_NOP4
26171-#define GENERIC_NOP6 ".byte 0x8d,0xb6,0x00,0x00,0x00,0x00\n"
26172-#define GENERIC_NOP7 ".byte 0x8d,0xb4,0x26,0x00,0x00,0x00,0x00\n"
26173-#define GENERIC_NOP8 GENERIC_NOP1 GENERIC_NOP7
26174-
26175-/* Opteron nops */
26176-#define K8_NOP1 GENERIC_NOP1
26177-#define K8_NOP2 ".byte 0x66,0x90\n"
26178-#define K8_NOP3 ".byte 0x66,0x66,0x90\n"
26179-#define K8_NOP4 ".byte 0x66,0x66,0x66,0x90\n"
26180-#define K8_NOP5 K8_NOP3 K8_NOP2
26181-#define K8_NOP6 K8_NOP3 K8_NOP3
26182-#define K8_NOP7 K8_NOP4 K8_NOP3
26183-#define K8_NOP8 K8_NOP4 K8_NOP4
26184-
26185-/* K7 nops */
26186-/* uses eax dependencies (arbitary choice) */
26187-#define K7_NOP1 GENERIC_NOP1
26188-#define K7_NOP2 ".byte 0x8b,0xc0\n"
26189-#define K7_NOP3 ".byte 0x8d,0x04,0x20\n"
26190-#define K7_NOP4 ".byte 0x8d,0x44,0x20,0x00\n"
26191-#define K7_NOP5 K7_NOP4 ASM_NOP1
26192-#define K7_NOP6 ".byte 0x8d,0x80,0,0,0,0\n"
26193-#define K7_NOP7 ".byte 0x8D,0x04,0x05,0,0,0,0\n"
26194-#define K7_NOP8 K7_NOP7 ASM_NOP1
26195-
26196-/* P6 nops */
26197-/* uses eax dependencies (Intel-recommended choice) */
26198-#define P6_NOP1 GENERIC_NOP1
26199-#define P6_NOP2 ".byte 0x66,0x90\n"
26200-#define P6_NOP3 ".byte 0x0f,0x1f,0x00\n"
26201-#define P6_NOP4 ".byte 0x0f,0x1f,0x40,0\n"
26202-#define P6_NOP5 ".byte 0x0f,0x1f,0x44,0x00,0\n"
26203-#define P6_NOP6 ".byte 0x66,0x0f,0x1f,0x44,0x00,0\n"
26204-#define P6_NOP7 ".byte 0x0f,0x1f,0x80,0,0,0,0\n"
26205-#define P6_NOP8 ".byte 0x0f,0x1f,0x84,0x00,0,0,0,0\n"
26206-
26207-#ifdef CONFIG_MK8
26208-#define ASM_NOP1 K8_NOP1
26209-#define ASM_NOP2 K8_NOP2
26210-#define ASM_NOP3 K8_NOP3
26211-#define ASM_NOP4 K8_NOP4
26212-#define ASM_NOP5 K8_NOP5
26213-#define ASM_NOP6 K8_NOP6
26214-#define ASM_NOP7 K8_NOP7
26215-#define ASM_NOP8 K8_NOP8
26216-#elif defined(CONFIG_MK7)
26217-#define ASM_NOP1 K7_NOP1
26218-#define ASM_NOP2 K7_NOP2
26219-#define ASM_NOP3 K7_NOP3
26220-#define ASM_NOP4 K7_NOP4
26221-#define ASM_NOP5 K7_NOP5
26222-#define ASM_NOP6 K7_NOP6
26223-#define ASM_NOP7 K7_NOP7
26224-#define ASM_NOP8 K7_NOP8
26225-#elif defined(CONFIG_M686) || defined(CONFIG_MPENTIUMII) || \
26226- defined(CONFIG_MPENTIUMIII) || defined(CONFIG_MPENTIUMM) || \
26227- defined(CONFIG_MCORE2) || defined(CONFIG_PENTIUM4)
26228-#define ASM_NOP1 P6_NOP1
26229-#define ASM_NOP2 P6_NOP2
26230-#define ASM_NOP3 P6_NOP3
26231-#define ASM_NOP4 P6_NOP4
26232-#define ASM_NOP5 P6_NOP5
26233-#define ASM_NOP6 P6_NOP6
26234-#define ASM_NOP7 P6_NOP7
26235-#define ASM_NOP8 P6_NOP8
26236-#else
26237-#define ASM_NOP1 GENERIC_NOP1
26238-#define ASM_NOP2 GENERIC_NOP2
26239-#define ASM_NOP3 GENERIC_NOP3
26240-#define ASM_NOP4 GENERIC_NOP4
26241-#define ASM_NOP5 GENERIC_NOP5
26242-#define ASM_NOP6 GENERIC_NOP6
26243-#define ASM_NOP7 GENERIC_NOP7
26244-#define ASM_NOP8 GENERIC_NOP8
26245-#endif
26246-
26247-#define ASM_NOP_MAX 8
26248-
26249-/* Prefetch instructions for Pentium III and AMD Athlon */
26250-/* It's not worth to care about 3dnow! prefetches for the K6
26251- because they are microcoded there and very slow.
26252- However we don't do prefetches for pre XP Athlons currently
26253- That should be fixed. */
26254-#define ARCH_HAS_PREFETCH
26255-static inline void prefetch(const void *x)
26256-{
26257- alternative_input(ASM_NOP4,
26258- "prefetchnta (%1)",
26259- X86_FEATURE_XMM,
26260- "r" (x));
26261-}
26262-
26263-#define ARCH_HAS_PREFETCH
26264-#define ARCH_HAS_PREFETCHW
26265-#define ARCH_HAS_SPINLOCK_PREFETCH
26266-
26267-/* 3dnow! prefetch to get an exclusive cache line. Useful for
26268- spinlocks to avoid one state transition in the cache coherency protocol. */
26269-static inline void prefetchw(const void *x)
26270-{
26271- alternative_input(ASM_NOP4,
26272- "prefetchw (%1)",
26273- X86_FEATURE_3DNOW,
26274- "r" (x));
26275-}
26276-#define spin_lock_prefetch(x) prefetchw(x)
26277-
26278-extern void select_idle_routine(const struct cpuinfo_x86 *c);
26279-
26280-#define cache_line_size() (boot_cpu_data.x86_cache_alignment)
26281-
26282-extern unsigned long boot_option_idle_override;
26283-extern void enable_sep_cpu(void);
26284-extern int sysenter_setup(void);
26285-
26286-/* Defined in head.S */
26287-extern struct Xgt_desc_struct early_gdt_descr;
26288-
26289-extern void cpu_set_gdt(int);
26290-extern void switch_to_new_gdt(void);
26291-extern void cpu_init(void);
26292-extern void init_gdt(int cpu);
26293-
26294-extern int force_mwait;
26295-
26296-#endif /* __ASM_I386_PROCESSOR_H */
26297--- sle11-2009-06-29.orig/include/asm-x86/mach-xen/asm/processor_64.h 2009-02-16 16:18:36.000000000 +0100
26298+++ /dev/null 1970-01-01 00:00:00.000000000 +0000
26299@@ -1,461 +0,0 @@
26300-/*
26301- * include/asm-x86_64/processor.h
26302- *
26303- * Copyright (C) 1994 Linus Torvalds
26304- */
26305-
26306-#ifndef __ASM_X86_64_PROCESSOR_H
26307-#define __ASM_X86_64_PROCESSOR_H
26308-
26309-#include <asm/segment.h>
26310-#include <asm/page.h>
26311-#include <asm/types.h>
26312-#include <asm/sigcontext.h>
26313-#include <asm/cpufeature.h>
26314-#include <linux/threads.h>
26315-#include <asm/msr.h>
26316-#include <asm/current.h>
26317-#include <asm/system.h>
26318-#include <asm/mmsegment.h>
26319-#include <asm/percpu.h>
26320-#include <linux/personality.h>
26321-#include <linux/cpumask.h>
26322-#include <asm/processor-flags.h>
26323-
26324-#define TF_MASK 0x00000100
26325-#define IF_MASK 0x00000200
26326-#define IOPL_MASK 0x00003000
26327-#define NT_MASK 0x00004000
26328-#define VM_MASK 0x00020000
26329-#define AC_MASK 0x00040000
26330-#define VIF_MASK 0x00080000 /* virtual interrupt flag */
26331-#define VIP_MASK 0x00100000 /* virtual interrupt pending */
26332-#define ID_MASK 0x00200000
26333-
26334-#define desc_empty(desc) \
26335- (!((desc)->a | (desc)->b))
26336-
26337-#define desc_equal(desc1, desc2) \
26338- (((desc1)->a == (desc2)->a) && ((desc1)->b == (desc2)->b))
26339-
26340-/*
26341- * Default implementation of macro that returns current
26342- * instruction pointer ("program counter").
26343- */
26344-#define current_text_addr() ({ void *pc; asm volatile("leaq 1f(%%rip),%0\n1:":"=r"(pc)); pc; })
26345-
26346-/*
26347- * CPU type and hardware bug flags. Kept separately for each CPU.
26348- */
26349-
26350-struct cpuinfo_x86 {
26351- __u8 x86; /* CPU family */
26352- __u8 x86_vendor; /* CPU vendor */
26353- __u8 x86_model;
26354- __u8 x86_mask;
26355- int cpuid_level; /* Maximum supported CPUID level, -1=no CPUID */
26356- __u32 x86_capability[NCAPINTS];
26357- char x86_vendor_id[16];
26358- char x86_model_id[64];
26359- int x86_cache_size; /* in KB */
26360- int x86_clflush_size;
26361- int x86_cache_alignment;
26362- int x86_tlbsize; /* number of 4K pages in DTLB/ITLB combined(in pages)*/
26363- __u8 x86_virt_bits, x86_phys_bits;
26364- __u8 x86_max_cores; /* cpuid returned max cores value */
26365- __u32 x86_power;
26366- __u32 extended_cpuid_level; /* Max extended CPUID function supported */
26367- unsigned long loops_per_jiffy;
26368-#ifdef CONFIG_SMP
26369- cpumask_t llc_shared_map; /* cpus sharing the last level cache */
26370-#endif
26371- __u8 apicid;
26372-#ifdef CONFIG_SMP
26373- __u8 booted_cores; /* number of cores as seen by OS */
26374- __u8 phys_proc_id; /* Physical Processor id. */
26375- __u8 cpu_core_id; /* Core id. */
26376- __u8 cpu_index; /* index into per_cpu list */
26377-#endif
26378-} ____cacheline_aligned;
26379-
26380-#define X86_VENDOR_INTEL 0
26381-#define X86_VENDOR_CYRIX 1
26382-#define X86_VENDOR_AMD 2
26383-#define X86_VENDOR_UMC 3
26384-#define X86_VENDOR_NEXGEN 4
26385-#define X86_VENDOR_CENTAUR 5
26386-#define X86_VENDOR_TRANSMETA 7
26387-#define X86_VENDOR_NUM 8
26388-#define X86_VENDOR_UNKNOWN 0xff
26389-
26390-#ifdef CONFIG_SMP
26391-DECLARE_PER_CPU(struct cpuinfo_x86, cpu_info);
26392-#define cpu_data(cpu) per_cpu(cpu_info, cpu)
26393-#define current_cpu_data cpu_data(smp_processor_id())
26394-#else
26395-#define cpu_data(cpu) boot_cpu_data
26396-#define current_cpu_data boot_cpu_data
26397-#endif
26398-
26399-extern char ignore_irq13;
26400-
26401-extern void identify_cpu(struct cpuinfo_x86 *);
26402-extern void print_cpu_info(struct cpuinfo_x86 *);
26403-extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c);
26404-extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c);
26405-extern unsigned short num_cache_leaves;
26406-
26407-/*
26408- * Save the cr4 feature set we're using (ie
26409- * Pentium 4MB enable and PPro Global page
26410- * enable), so that any CPU's that boot up
26411- * after us can get the correct flags.
26412- */
26413-extern unsigned long mmu_cr4_features;
26414-
26415-static inline void set_in_cr4 (unsigned long mask)
26416-{
26417- mmu_cr4_features |= mask;
26418- __asm__("movq %%cr4,%%rax\n\t"
26419- "orq %0,%%rax\n\t"
26420- "movq %%rax,%%cr4\n"
26421- : : "irg" (mask)
26422- :"ax");
26423-}
26424-
26425-static inline void clear_in_cr4 (unsigned long mask)
26426-{
26427- mmu_cr4_features &= ~mask;
26428- __asm__("movq %%cr4,%%rax\n\t"
26429- "andq %0,%%rax\n\t"
26430- "movq %%rax,%%cr4\n"
26431- : : "irg" (~mask)
26432- :"ax");
26433-}
26434-
26435-
26436-/*
26437- * User space process size. 47bits minus one guard page.
26438- */
26439-#define TASK_SIZE64 (0x800000000000UL - 4096)
26440-
26441-/* This decides where the kernel will search for a free chunk of vm
26442- * space during mmap's.
26443- */
26444-#define IA32_PAGE_OFFSET ((current->personality & ADDR_LIMIT_3GB) ? 0xc0000000 : 0xFFFFe000)
26445-
26446-#define TASK_SIZE (test_thread_flag(TIF_IA32) ? IA32_PAGE_OFFSET : TASK_SIZE64)
26447-#define TASK_SIZE_OF(child) ((test_tsk_thread_flag(child, TIF_IA32)) ? IA32_PAGE_OFFSET : TASK_SIZE64)
26448-
26449-#define TASK_UNMAPPED_BASE PAGE_ALIGN(TASK_SIZE/3)
26450-
26451-/*
26452- * Size of io_bitmap.
26453- */
26454-#define IO_BITMAP_BITS 65536
26455-#define IO_BITMAP_BYTES (IO_BITMAP_BITS/8)
26456-#define IO_BITMAP_LONGS (IO_BITMAP_BYTES/sizeof(long))
26457-#ifndef CONFIG_X86_NO_TSS
26458-#define IO_BITMAP_OFFSET offsetof(struct tss_struct,io_bitmap)
26459-#endif
26460-#define INVALID_IO_BITMAP_OFFSET 0x8000
26461-
26462-struct i387_fxsave_struct {
26463- u16 cwd;
26464- u16 swd;
26465- u16 twd;
26466- u16 fop;
26467- u64 rip;
26468- u64 rdp;
26469- u32 mxcsr;
26470- u32 mxcsr_mask;
26471- u32 st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */
26472- u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 256 bytes */
26473- u32 padding[24];
26474-} __attribute__ ((aligned (16)));
26475-
26476-union i387_union {
26477- struct i387_fxsave_struct fxsave;
26478-};
26479-
26480-#ifndef CONFIG_X86_NO_TSS
26481-struct tss_struct {
26482- u32 reserved1;
26483- u64 rsp0;
26484- u64 rsp1;
26485- u64 rsp2;
26486- u64 reserved2;
26487- u64 ist[7];
26488- u32 reserved3;
26489- u32 reserved4;
26490- u16 reserved5;
26491- u16 io_bitmap_base;
26492- /*
26493- * The extra 1 is there because the CPU will access an
26494- * additional byte beyond the end of the IO permission
26495- * bitmap. The extra byte must be all 1 bits, and must
26496- * be within the limit. Thus we have:
26497- *
26498- * 128 bytes, the bitmap itself, for ports 0..0x3ff
26499- * 8 bytes, for an extra "long" of ~0UL
26500- */
26501- unsigned long io_bitmap[IO_BITMAP_LONGS + 1];
26502-} __attribute__((packed)) ____cacheline_aligned;
26503-
26504-DECLARE_PER_CPU(struct tss_struct,init_tss);
26505-#endif
26506-
26507-
26508-extern struct cpuinfo_x86 boot_cpu_data;
26509-#ifndef CONFIG_X86_NO_TSS
26510-/* Save the original ist values for checking stack pointers during debugging */
26511-struct orig_ist {
26512- unsigned long ist[7];
26513-};
26514-DECLARE_PER_CPU(struct orig_ist, orig_ist);
26515-#endif
26516-
26517-#ifdef CONFIG_X86_VSMP
26518-#define ARCH_MIN_TASKALIGN (1 << INTERNODE_CACHE_SHIFT)
26519-#define ARCH_MIN_MMSTRUCT_ALIGN (1 << INTERNODE_CACHE_SHIFT)
26520-#else
26521-#define ARCH_MIN_TASKALIGN 16
26522-#define ARCH_MIN_MMSTRUCT_ALIGN 0
26523-#endif
26524-
26525-struct thread_struct {
26526- unsigned long rsp0;
26527- unsigned long rsp;
26528- unsigned long userrsp; /* Copy from PDA */
26529- unsigned long fs;
26530- unsigned long gs;
26531- unsigned short es, ds, fsindex, gsindex;
26532-/* Hardware debugging registers */
26533- unsigned long debugreg0;
26534- unsigned long debugreg1;
26535- unsigned long debugreg2;
26536- unsigned long debugreg3;
26537- unsigned long debugreg6;
26538- unsigned long debugreg7;
26539-/* fault info */
26540- unsigned long cr2, trap_no, error_code;
26541-/* floating point info */
26542- union i387_union i387 __attribute__((aligned(16)));
26543-/* IO permissions. the bitmap could be moved into the GDT, that would make
26544- switch faster for a limited number of ioperm using tasks. -AK */
26545- int ioperm;
26546- unsigned long *io_bitmap_ptr;
26547- unsigned io_bitmap_max;
26548-/* cached TLS descriptors. */
26549- u64 tls_array[GDT_ENTRY_TLS_ENTRIES];
26550- unsigned int iopl;
26551-} __attribute__((aligned(16)));
26552-
26553-#define INIT_THREAD { \
26554- .rsp0 = (unsigned long)&init_stack + sizeof(init_stack) \
26555-}
26556-
26557-#ifndef CONFIG_X86_NO_TSS
26558-#define INIT_TSS { \
26559- .rsp0 = (unsigned long)&init_stack + sizeof(init_stack) \
26560-}
26561-#endif
26562-
26563-#define INIT_MMAP \
26564-{ &init_mm, 0, 0, NULL, PAGE_SHARED, VM_READ | VM_WRITE | VM_EXEC, 1, NULL, NULL }
26565-
26566-#define start_thread(regs,new_rip,new_rsp) do { \
26567- asm volatile("movl %0,%%fs; movl %0,%%es; movl %0,%%ds": :"r" (0)); \
26568- load_gs_index(0); \
26569- (regs)->rip = (new_rip); \
26570- (regs)->rsp = (new_rsp); \
26571- write_pda(oldrsp, (new_rsp)); \
26572- (regs)->cs = __USER_CS; \
26573- (regs)->ss = __USER_DS; \
26574- (regs)->eflags = 0x200; \
26575- set_fs(USER_DS); \
26576-} while(0)
26577-
26578-#define get_debugreg(var, register) \
26579- var = HYPERVISOR_get_debugreg(register)
26580-#define set_debugreg(value, register) do { \
26581- if (HYPERVISOR_set_debugreg(register, value)) \
26582- BUG(); \
26583-} while (0)
26584-
26585-struct task_struct;
26586-struct mm_struct;
26587-
26588-/* Free all resources held by a thread. */
26589-extern void release_thread(struct task_struct *);
26590-
26591-/* Prepare to copy thread state - unlazy all lazy status */
26592-extern void prepare_to_copy(struct task_struct *tsk);
26593-
26594-/*
26595- * create a kernel thread without removing it from tasklists
26596- */
26597-extern long kernel_thread(int (*fn)(void *), void * arg, unsigned long flags);
26598-
26599-/*
26600- * Return saved PC of a blocked thread.
26601- * What is this good for? it will be always the scheduler or ret_from_fork.
26602- */
26603-#define thread_saved_pc(t) (*(unsigned long *)((t)->thread.rsp - 8))
26604-
26605-extern unsigned long get_wchan(struct task_struct *p);
26606-#define task_pt_regs(tsk) ((struct pt_regs *)(tsk)->thread.rsp0 - 1)
26607-#define KSTK_EIP(tsk) (task_pt_regs(tsk)->rip)
26608-#define KSTK_ESP(tsk) -1 /* sorry. doesn't work for syscall. */
26609-
26610-
26611-struct microcode_header {
26612- unsigned int hdrver;
26613- unsigned int rev;
26614- unsigned int date;
26615- unsigned int sig;
26616- unsigned int cksum;
26617- unsigned int ldrver;
26618- unsigned int pf;
26619- unsigned int datasize;
26620- unsigned int totalsize;
26621- unsigned int reserved[3];
26622-};
26623-
26624-struct microcode {
26625- struct microcode_header hdr;
26626- unsigned int bits[0];
26627-};
26628-
26629-typedef struct microcode microcode_t;
26630-typedef struct microcode_header microcode_header_t;
26631-
26632-/* microcode format is extended from prescott processors */
26633-struct extended_signature {
26634- unsigned int sig;
26635- unsigned int pf;
26636- unsigned int cksum;
26637-};
26638-
26639-struct extended_sigtable {
26640- unsigned int count;
26641- unsigned int cksum;
26642- unsigned int reserved[3];
26643- struct extended_signature sigs[0];
26644-};
26645-
26646-
26647-#if defined(CONFIG_MPSC) || defined(CONFIG_MCORE2)
26648-#define ASM_NOP1 P6_NOP1
26649-#define ASM_NOP2 P6_NOP2
26650-#define ASM_NOP3 P6_NOP3
26651-#define ASM_NOP4 P6_NOP4
26652-#define ASM_NOP5 P6_NOP5
26653-#define ASM_NOP6 P6_NOP6
26654-#define ASM_NOP7 P6_NOP7
26655-#define ASM_NOP8 P6_NOP8
26656-#else
26657-#define ASM_NOP1 K8_NOP1
26658-#define ASM_NOP2 K8_NOP2
26659-#define ASM_NOP3 K8_NOP3
26660-#define ASM_NOP4 K8_NOP4
26661-#define ASM_NOP5 K8_NOP5
26662-#define ASM_NOP6 K8_NOP6
26663-#define ASM_NOP7 K8_NOP7
26664-#define ASM_NOP8 K8_NOP8
26665-#endif
26666-
26667-/* Opteron nops */
26668-#define K8_NOP1 ".byte 0x90\n"
26669-#define K8_NOP2 ".byte 0x66,0x90\n"
26670-#define K8_NOP3 ".byte 0x66,0x66,0x90\n"
26671-#define K8_NOP4 ".byte 0x66,0x66,0x66,0x90\n"
26672-#define K8_NOP5 K8_NOP3 K8_NOP2
26673-#define K8_NOP6 K8_NOP3 K8_NOP3
26674-#define K8_NOP7 K8_NOP4 K8_NOP3
26675-#define K8_NOP8 K8_NOP4 K8_NOP4
26676-
26677-/* P6 nops */
26678-/* uses eax dependencies (Intel-recommended choice) */
26679-#define P6_NOP1 ".byte 0x90\n"
26680-#define P6_NOP2 ".byte 0x66,0x90\n"
26681-#define P6_NOP3 ".byte 0x0f,0x1f,0x00\n"
26682-#define P6_NOP4 ".byte 0x0f,0x1f,0x40,0\n"
26683-#define P6_NOP5 ".byte 0x0f,0x1f,0x44,0x00,0\n"
26684-#define P6_NOP6 ".byte 0x66,0x0f,0x1f,0x44,0x00,0\n"
26685-#define P6_NOP7 ".byte 0x0f,0x1f,0x80,0,0,0,0\n"
26686-#define P6_NOP8 ".byte 0x0f,0x1f,0x84,0x00,0,0,0,0\n"
26687-
26688-#define ASM_NOP_MAX 8
26689-
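The two NOP tables above are alternative byte encodings for 1 to 8 bytes of padding: the K8 forms string 0x66 prefixes onto plain NOPs, while the P6 forms use the long 0x0f 0x1f NOP that Intel recommends. A tiny x86-only sketch (GCC/Clang inline asm) that emits the two 5-byte variants, just to show they are ordinary executable instructions:

/* Build on x86 with: gcc -O2 nops.c */
int main(void)
{
	/* K8_NOP5 = K8_NOP3 followed by K8_NOP2 */
	asm volatile(".byte 0x66,0x66,0x90\n\t"
		     ".byte 0x66,0x90");
	/* P6_NOP5: nopl 0x0(%rax,%rax,1) */
	asm volatile(".byte 0x0f,0x1f,0x44,0x00,0x00");
	return 0;
}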
26690-/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */
26691-static inline void rep_nop(void)
26692-{
26693- __asm__ __volatile__("rep;nop": : :"memory");
26694-}
26695-
26696-/* Stop speculative execution */
26697-static inline void sync_core(void)
26698-{
26699- int tmp;
26700- asm volatile("cpuid" : "=a" (tmp) : "0" (1) : "ebx","ecx","edx","memory");
26701-}
26702-
26703-#define ARCH_HAS_PREFETCHW 1
26704-static inline void prefetchw(void *x)
26705-{
26706- alternative_input("prefetcht0 (%1)",
26707- "prefetchw (%1)",
26708- X86_FEATURE_3DNOW,
26709- "r" (x));
26710-}
26711-
26712-#define ARCH_HAS_SPINLOCK_PREFETCH 1
26713-
26714-#define spin_lock_prefetch(x) prefetchw(x)
26715-
26716-#define cpu_relax() rep_nop()
26717-
26718-static inline void __monitor(const void *eax, unsigned long ecx,
26719- unsigned long edx)
26720-{
26721- /* "monitor %eax,%ecx,%edx;" */
26722- asm volatile(
26723- ".byte 0x0f,0x01,0xc8;"
26724- : :"a" (eax), "c" (ecx), "d"(edx));
26725-}
26726-
26727-static inline void __mwait(unsigned long eax, unsigned long ecx)
26728-{
26729- /* "mwait %eax,%ecx;" */
26730- asm volatile(
26731- ".byte 0x0f,0x01,0xc9;"
26732- : :"a" (eax), "c" (ecx));
26733-}
26734-
26735-static inline void __sti_mwait(unsigned long eax, unsigned long ecx)
26736-{
26737- /* "mwait %eax,%ecx;" */
26738- asm volatile(
26739- "sti; .byte 0x0f,0x01,0xc9;"
26740- : :"a" (eax), "c" (ecx));
26741-}
26742-
26743-extern void mwait_idle_with_hints(unsigned long eax, unsigned long ecx);
26744-
26745-#define stack_current() \
26746-({ \
26747- struct thread_info *ti; \
26748- asm("andq %%rsp,%0; ":"=r" (ti) : "0" (CURRENT_MASK)); \
26749- ti->task; \
26750-})
26751-
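stack_current() above relies on the kernel stack being a power-of-two sized block with the thread_info at its base, so masking any stack address with CURRENT_MASK finds it. A user-space sketch of the same alignment trick for a 64-bit build, assuming a hypothetical 8 KiB stack:

#include <stdio.h>
#include <stdint.h>

#define THREAD_SIZE	8192UL			/* assumed power-of-two stack size */
#define CURRENT_MASK	(~(THREAD_SIZE - 1))	/* clears the low 13 bits */

int main(void)
{
	uintptr_t rsp  = 0xffff880012345abcUL;	/* made-up stack pointer */
	uintptr_t base = rsp & CURRENT_MASK;	/* where thread_info would live */

	printf("rsp  = %#lx\nbase = %#lx\n",
	       (unsigned long)rsp, (unsigned long)base);
	return 0;
}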
26752-#define cache_line_size() (boot_cpu_data.x86_cache_alignment)
26753-
26754-extern unsigned long boot_option_idle_override;
26755-/* Boot loader type from the setup header */
26756-extern int bootloader_type;
26757-
26758-#define HAVE_ARCH_PICK_MMAP_LAYOUT 1
26759-
26760-#endif /* __ASM_X86_64_PROCESSOR_H */
26761--- sle11-2009-06-29.orig/include/asm-x86/mach-xen/asm/segment.h 2009-02-16 16:18:36.000000000 +0100
26762+++ sle11-2009-06-29/include/asm-x86/mach-xen/asm/segment.h 2009-03-16 16:33:40.000000000 +0100
26763@@ -1,5 +1,204 @@
26764+#ifndef _ASM_X86_SEGMENT_H_
26765+#define _ASM_X86_SEGMENT_H_
26766+
26767+/* Simple and small GDT entries for booting only */
26768+
26769+#define GDT_ENTRY_BOOT_CS 2
26770+#define __BOOT_CS (GDT_ENTRY_BOOT_CS * 8)
26771+
26772+#define GDT_ENTRY_BOOT_DS (GDT_ENTRY_BOOT_CS + 1)
26773+#define __BOOT_DS (GDT_ENTRY_BOOT_DS * 8)
26774+
26775+#define GDT_ENTRY_BOOT_TSS (GDT_ENTRY_BOOT_CS + 2)
26776+#define __BOOT_TSS (GDT_ENTRY_BOOT_TSS * 8)
26777+
26778 #ifdef CONFIG_X86_32
26779-# include "segment_32.h"
26780+/*
26781+ * The layout of the per-CPU GDT under Linux:
26782+ *
26783+ * 0 - null
26784+ * 1 - reserved
26785+ * 2 - reserved
26786+ * 3 - reserved
26787+ *
26788+ * 4 - unused <==== new cacheline
26789+ * 5 - unused
26790+ *
26791+ * ------- start of TLS (Thread-Local Storage) segments:
26792+ *
26793+ * 6 - TLS segment #1 [ glibc's TLS segment ]
26794+ * 7 - TLS segment #2 [ Wine's %fs Win32 segment ]
26795+ * 8 - TLS segment #3
26796+ * 9 - reserved
26797+ * 10 - reserved
26798+ * 11 - reserved
26799+ *
26800+ * ------- start of kernel segments:
26801+ *
26802+ * 12 - kernel code segment <==== new cacheline
26803+ * 13 - kernel data segment
26804+ * 14 - default user CS
26805+ * 15 - default user DS
26806+ * 16 - TSS
26807+ * 17 - LDT
26808+ * 18 - PNPBIOS support (16->32 gate)
26809+ * 19 - PNPBIOS support
26810+ * 20 - PNPBIOS support
26811+ * 21 - PNPBIOS support
26812+ * 22 - PNPBIOS support
26813+ * 23 - APM BIOS support
26814+ * 24 - APM BIOS support
26815+ * 25 - APM BIOS support
26816+ *
26817+ * 26 - ESPFIX small SS
26818+ * 27 - per-cpu [ offset to per-cpu data area ]
26819+ * 28 - unused
26820+ * 29 - unused
26821+ * 30 - unused
26822+ * 31 - TSS for double fault handler
26823+ */
26824+#define GDT_ENTRY_TLS_MIN 6
26825+#define GDT_ENTRY_TLS_MAX (GDT_ENTRY_TLS_MIN + GDT_ENTRY_TLS_ENTRIES - 1)
26826+
26827+#define GDT_ENTRY_DEFAULT_USER_CS 14
26828+#define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS * 8 + 3)
26829+
26830+#define GDT_ENTRY_DEFAULT_USER_DS 15
26831+#define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS * 8 + 3)
26832+
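Every selector in this header is built the same way: the GDT index shifted left by three, bit 2 as the LDT/GDT table indicator, and the requested privilege level in the low two bits. A short sketch that reproduces the user and kernel selectors from the indices listed above:

#include <stdio.h>

/* selector = (index << 3) | (table_indicator << 2) | rpl */
static unsigned int selector(unsigned int index, unsigned int ti, unsigned int rpl)
{
	return (index << 3) | (ti << 2) | rpl;
}

int main(void)
{
	printf("__KERNEL_CS = %#x\n", selector(12, 0, 0));	/* 0x60 */
	printf("__KERNEL_DS = %#x\n", selector(13, 0, 0));	/* 0x68 */
	printf("__USER_CS   = %#x\n", selector(14, 0, 3));	/* 0x73 */
	printf("__USER_DS   = %#x\n", selector(15, 0, 3));	/* 0x7b */
	return 0;
}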
26833+#define GDT_ENTRY_KERNEL_BASE 12
26834+
26835+#define GDT_ENTRY_KERNEL_CS (GDT_ENTRY_KERNEL_BASE + 0)
26836+#define __KERNEL_CS (GDT_ENTRY_KERNEL_CS * 8)
26837+
26838+#define GDT_ENTRY_KERNEL_DS (GDT_ENTRY_KERNEL_BASE + 1)
26839+#define __KERNEL_DS (GDT_ENTRY_KERNEL_DS * 8)
26840+
26841+#define GDT_ENTRY_TSS (GDT_ENTRY_KERNEL_BASE + 4)
26842+#define GDT_ENTRY_LDT (GDT_ENTRY_KERNEL_BASE + 5)
26843+
26844+#define GDT_ENTRY_PNPBIOS_BASE (GDT_ENTRY_KERNEL_BASE + 6)
26845+#define GDT_ENTRY_APMBIOS_BASE (GDT_ENTRY_KERNEL_BASE + 11)
26846+
26847+#define GDT_ENTRY_ESPFIX_SS (GDT_ENTRY_KERNEL_BASE + 14)
26848+#define __ESPFIX_SS (GDT_ENTRY_ESPFIX_SS * 8)
26849+
26850+#define GDT_ENTRY_PERCPU (GDT_ENTRY_KERNEL_BASE + 15)
26851+#ifdef CONFIG_SMP
26852+#define __KERNEL_PERCPU (GDT_ENTRY_PERCPU * 8)
26853 #else
26854-# include "../../segment_64.h"
26855+#define __KERNEL_PERCPU 0
26856+#endif
26857+
26858+#define GDT_ENTRY_DOUBLEFAULT_TSS 31
26859+
26860+/*
26861+ * The GDT has 32 entries
26862+ */
26863+#define GDT_ENTRIES 32
26864+
26865+/* The PnP BIOS entries in the GDT */
26866+#define GDT_ENTRY_PNPBIOS_CS32 (GDT_ENTRY_PNPBIOS_BASE + 0)
26867+#define GDT_ENTRY_PNPBIOS_CS16 (GDT_ENTRY_PNPBIOS_BASE + 1)
26868+#define GDT_ENTRY_PNPBIOS_DS (GDT_ENTRY_PNPBIOS_BASE + 2)
26869+#define GDT_ENTRY_PNPBIOS_TS1 (GDT_ENTRY_PNPBIOS_BASE + 3)
26870+#define GDT_ENTRY_PNPBIOS_TS2 (GDT_ENTRY_PNPBIOS_BASE + 4)
26871+
26872+/* The PnP BIOS selectors */
26873+#define PNP_CS32 (GDT_ENTRY_PNPBIOS_CS32 * 8) /* segment for calling fn */
26874+#define PNP_CS16 (GDT_ENTRY_PNPBIOS_CS16 * 8) /* code segment for BIOS */
26875+#define PNP_DS (GDT_ENTRY_PNPBIOS_DS * 8) /* data segment for BIOS */
26876+#define PNP_TS1 (GDT_ENTRY_PNPBIOS_TS1 * 8) /* transfer data segment */
26877+#define PNP_TS2 (GDT_ENTRY_PNPBIOS_TS2 * 8) /* another data segment */
26878+
26879+/* Bottom two bits of selector give the ring privilege level */
26880+#define SEGMENT_RPL_MASK 0x3
26881+/* Bit 2 is table indicator (LDT/GDT) */
26882+#define SEGMENT_TI_MASK 0x4
26883+
26884+/* User mode is privilege level 3 */
26885+#define USER_RPL 0x3
26886+/* LDT segment has TI set, GDT has it cleared */
26887+#define SEGMENT_LDT 0x4
26888+#define SEGMENT_GDT 0x0
26889+
26890+/*
26891+ * Matching rules for certain types of segments.
26892+ */
26893+
26894+/* Matches only __KERNEL_CS, ignoring PnP / USER / APM segments */
26895+#define SEGMENT_IS_KERNEL_CODE(x) (((x) & ~3) == GDT_ENTRY_KERNEL_CS * 8 \
26896+ || ((x) & ~3) == (FLAT_KERNEL_CS & ~3))
26897+
26898+/* Matches __KERNEL_CS and __USER_CS (they must be 2 entries apart) */
26899+#define SEGMENT_IS_FLAT_CODE(x) (((x) & ~0x13) == GDT_ENTRY_KERNEL_CS * 8 \
26900+ || ((x) & ~3) == (FLAT_KERNEL_CS & ~3) \
26901+ || ((x) & ~3) == (FLAT_USER_CS & ~3))
26902+
26903+/* Matches PNP_CS32 and PNP_CS16 (they must be consecutive) */
26904+#define SEGMENT_IS_PNP_CODE(x) (((x) & 0xf4) == GDT_ENTRY_PNPBIOS_BASE * 8)
26905+
26906+#define get_kernel_rpl() (xen_feature(XENFEAT_supervisor_mode_kernel)?0:1)
26907+
26908+#else
26909+#include <asm/cache.h>
26910+
26911+#define __KERNEL_CS 0x10
26912+#define __KERNEL_DS 0x18
26913+
26914+#define __KERNEL32_CS 0x08
26915+
26916+/*
26917+ * we cannot use the same code segment descriptor for user and kernel
26918+ * -- not even in the long flat mode, because of different DPL /kkeil
26919+ * The segment offset needs to contain an RPL. Grr. -AK
26920+ * GDT layout to get 64bit syscall right (sysret hardcodes gdt offsets)
26921+ */
26922+
26923+#define __USER32_CS 0x23 /* 4*8+3 */
26924+#define __USER_DS 0x2b /* 5*8+3 */
26925+#define __USER_CS 0x33 /* 6*8+3 */
26926+#define __USER32_DS __USER_DS
26927+
26928+#define GDT_ENTRY_TSS 8 /* needs two entries */
26929+#define GDT_ENTRY_LDT 10 /* needs two entries */
26930+#define GDT_ENTRY_TLS_MIN 12
26931+#define GDT_ENTRY_TLS_MAX 14
26932+
26933+#define GDT_ENTRY_PER_CPU 15 /* Abused to load per CPU data from limit */
26934+#define __PER_CPU_SEG (GDT_ENTRY_PER_CPU * 8 + 3)
26935+
26936+/* TLS indexes for 64bit - hardcoded in arch_prctl */
26937+#define FS_TLS 0
26938+#define GS_TLS 1
26939+
26940+#define GS_TLS_SEL ((GDT_ENTRY_TLS_MIN+GS_TLS)*8 + 3)
26941+#define FS_TLS_SEL ((GDT_ENTRY_TLS_MIN+FS_TLS)*8 + 3)
26942+
26943+#define GDT_ENTRIES 16
26944+
26945+#endif
26946+
26947+/* User mode is privilege level 3 */
26948+#define USER_RPL 0x3
26949+/* LDT segment has TI set, GDT has it cleared */
26950+#define SEGMENT_LDT 0x4
26951+#define SEGMENT_GDT 0x0
26952+
26953+/* Bottom two bits of selector give the ring privilege level */
26954+#define SEGMENT_RPL_MASK 0x3
26955+/* Bit 2 is table indicator (LDT/GDT) */
26956+#define SEGMENT_TI_MASK 0x4
26957+
26958+#define IDT_ENTRIES 256
26959+#define GDT_SIZE (GDT_ENTRIES * 8)
26960+#define GDT_ENTRY_TLS_ENTRIES 3
26961+#define TLS_SIZE (GDT_ENTRY_TLS_ENTRIES * 8)
26962+
26963+#ifdef __KERNEL__
26964+#ifndef __ASSEMBLY__
26965+extern const char early_idt_handlers[IDT_ENTRIES][10];
26966+#endif
26967+#endif
26968+
26969 #endif
26970--- sle11-2009-06-29.orig/include/asm-x86/mach-xen/asm/segment_32.h 2008-12-15 11:27:22.000000000 +0100
26971+++ /dev/null 1970-01-01 00:00:00.000000000 +0000
26972@@ -1,150 +0,0 @@
26973-#ifndef _ASM_SEGMENT_H
26974-#define _ASM_SEGMENT_H
26975-
26976-/*
26977- * The layout of the per-CPU GDT under Linux:
26978- *
26979- * 0 - null
26980- * 1 - reserved
26981- * 2 - reserved
26982- * 3 - reserved
26983- *
26984- * 4 - unused <==== new cacheline
26985- * 5 - unused
26986- *
26987- * ------- start of TLS (Thread-Local Storage) segments:
26988- *
26989- * 6 - TLS segment #1 [ glibc's TLS segment ]
26990- * 7 - TLS segment #2 [ Wine's %fs Win32 segment ]
26991- * 8 - TLS segment #3
26992- * 9 - reserved
26993- * 10 - reserved
26994- * 11 - reserved
26995- *
26996- * ------- start of kernel segments:
26997- *
26998- * 12 - kernel code segment <==== new cacheline
26999- * 13 - kernel data segment
27000- * 14 - default user CS
27001- * 15 - default user DS
27002- * 16 - TSS
27003- * 17 - LDT
27004- * 18 - PNPBIOS support (16->32 gate)
27005- * 19 - PNPBIOS support
27006- * 20 - PNPBIOS support
27007- * 21 - PNPBIOS support
27008- * 22 - PNPBIOS support
27009- * 23 - APM BIOS support
27010- * 24 - APM BIOS support
27011- * 25 - APM BIOS support
27012- *
27013- * 26 - ESPFIX small SS
27014- * 27 - per-cpu [ offset to per-cpu data area ]
27015- * 28 - unused
27016- * 29 - unused
27017- * 30 - unused
27018- * 31 - TSS for double fault handler
27019- */
27020-#define GDT_ENTRY_TLS_ENTRIES 3
27021-#define GDT_ENTRY_TLS_MIN 6
27022-#define GDT_ENTRY_TLS_MAX (GDT_ENTRY_TLS_MIN + GDT_ENTRY_TLS_ENTRIES - 1)
27023-
27024-#define TLS_SIZE (GDT_ENTRY_TLS_ENTRIES * 8)
27025-
27026-#define GDT_ENTRY_DEFAULT_USER_CS 14
27027-#define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS * 8 + 3)
27028-
27029-#define GDT_ENTRY_DEFAULT_USER_DS 15
27030-#define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS * 8 + 3)
27031-
27032-#define GDT_ENTRY_KERNEL_BASE 12
27033-
27034-#define GDT_ENTRY_KERNEL_CS (GDT_ENTRY_KERNEL_BASE + 0)
27035-#define __KERNEL_CS (GDT_ENTRY_KERNEL_CS * 8)
27036-
27037-#define GDT_ENTRY_KERNEL_DS (GDT_ENTRY_KERNEL_BASE + 1)
27038-#define __KERNEL_DS (GDT_ENTRY_KERNEL_DS * 8)
27039-
27040-#define GDT_ENTRY_TSS (GDT_ENTRY_KERNEL_BASE + 4)
27041-#define GDT_ENTRY_LDT (GDT_ENTRY_KERNEL_BASE + 5)
27042-
27043-#define GDT_ENTRY_PNPBIOS_BASE (GDT_ENTRY_KERNEL_BASE + 6)
27044-#define GDT_ENTRY_APMBIOS_BASE (GDT_ENTRY_KERNEL_BASE + 11)
27045-
27046-#define GDT_ENTRY_ESPFIX_SS (GDT_ENTRY_KERNEL_BASE + 14)
27047-#define __ESPFIX_SS (GDT_ENTRY_ESPFIX_SS * 8)
27048-
27049-#define GDT_ENTRY_PERCPU (GDT_ENTRY_KERNEL_BASE + 15)
27050-#ifdef CONFIG_SMP
27051-#define __KERNEL_PERCPU (GDT_ENTRY_PERCPU * 8)
27052-#else
27053-#define __KERNEL_PERCPU 0
27054-#endif
27055-
27056-#define GDT_ENTRY_DOUBLEFAULT_TSS 31
27057-
27058-/*
27059- * The GDT has 32 entries
27060- */
27061-#define GDT_ENTRIES 32
27062-#define GDT_SIZE (GDT_ENTRIES * 8)
27063-
27064-/* Simple and small GDT entries for booting only */
27065-
27066-#define GDT_ENTRY_BOOT_CS 2
27067-#define __BOOT_CS (GDT_ENTRY_BOOT_CS * 8)
27068-
27069-#define GDT_ENTRY_BOOT_DS (GDT_ENTRY_BOOT_CS + 1)
27070-#define __BOOT_DS (GDT_ENTRY_BOOT_DS * 8)
27071-
27072-/* The PnP BIOS entries in the GDT */
27073-#define GDT_ENTRY_PNPBIOS_CS32 (GDT_ENTRY_PNPBIOS_BASE + 0)
27074-#define GDT_ENTRY_PNPBIOS_CS16 (GDT_ENTRY_PNPBIOS_BASE + 1)
27075-#define GDT_ENTRY_PNPBIOS_DS (GDT_ENTRY_PNPBIOS_BASE + 2)
27076-#define GDT_ENTRY_PNPBIOS_TS1 (GDT_ENTRY_PNPBIOS_BASE + 3)
27077-#define GDT_ENTRY_PNPBIOS_TS2 (GDT_ENTRY_PNPBIOS_BASE + 4)
27078-
27079-/* The PnP BIOS selectors */
27080-#define PNP_CS32 (GDT_ENTRY_PNPBIOS_CS32 * 8) /* segment for calling fn */
27081-#define PNP_CS16 (GDT_ENTRY_PNPBIOS_CS16 * 8) /* code segment for BIOS */
27082-#define PNP_DS (GDT_ENTRY_PNPBIOS_DS * 8) /* data segment for BIOS */
27083-#define PNP_TS1 (GDT_ENTRY_PNPBIOS_TS1 * 8) /* transfer data segment */
27084-#define PNP_TS2 (GDT_ENTRY_PNPBIOS_TS2 * 8) /* another data segment */
27085-
27086-/*
27087- * The interrupt descriptor table has room for 256 idt's,
27088- * the global descriptor table is dependent on the number
27089- * of tasks we can have..
27090- */
27091-#define IDT_ENTRIES 256
27092-
27093-/* Bottom two bits of selector give the ring privilege level */
27094-#define SEGMENT_RPL_MASK 0x3
27095-/* Bit 2 is table indicator (LDT/GDT) */
27096-#define SEGMENT_TI_MASK 0x4
27097-
27098-/* User mode is privilege level 3 */
27099-#define USER_RPL 0x3
27100-/* LDT segment has TI set, GDT has it cleared */
27101-#define SEGMENT_LDT 0x4
27102-#define SEGMENT_GDT 0x0
27103-
27104-#define get_kernel_rpl() (xen_feature(XENFEAT_supervisor_mode_kernel)?0:1)
27105-
27106-/*
27107- * Matching rules for certain types of segments.
27108- */
27109-
27110-/* Matches only __KERNEL_CS, ignoring PnP / USER / APM segments */
27111-#define SEGMENT_IS_KERNEL_CODE(x) (((x) & ~3) == GDT_ENTRY_KERNEL_CS * 8 \
27112- || ((x) & ~3) == (FLAT_KERNEL_CS & ~3))
27113-
27114-/* Matches __KERNEL_CS and __USER_CS (they must be 2 entries apart) */
27115-#define SEGMENT_IS_FLAT_CODE(x) (((x) & ~0x13) == GDT_ENTRY_KERNEL_CS * 8 \
27116- || ((x) & ~3) == (FLAT_KERNEL_CS & ~3) \
27117- || ((x) & ~3) == (FLAT_USER_CS & ~3))
27118-
27119-/* Matches PNP_CS32 and PNP_CS16 (they must be consecutive) */
27120-#define SEGMENT_IS_PNP_CODE(x) (((x) & ~0x0b) == GDT_ENTRY_PNPBIOS_BASE * 8)
27121-
27122-#endif
27123--- sle11-2009-06-29.orig/include/asm-x86/mach-xen/asm/smp_32.h 2009-02-16 16:18:36.000000000 +0100
27124+++ sle11-2009-06-29/include/asm-x86/mach-xen/asm/smp_32.h 2009-03-16 16:33:40.000000000 +0100
27125@@ -1,56 +1,51 @@
27126 #ifndef __ASM_SMP_H
27127 #define __ASM_SMP_H
27128
27129+#ifndef __ASSEMBLY__
27130+#include <linux/cpumask.h>
27131+#include <linux/init.h>
27132+
27133 /*
27134 * We need the APIC definitions automatically as part of 'smp.h'
27135 */
27136-#ifndef __ASSEMBLY__
27137-#include <linux/kernel.h>
27138-#include <linux/threads.h>
27139-#include <linux/cpumask.h>
27140+#ifdef CONFIG_X86_LOCAL_APIC
27141+# include <asm/mpspec.h>
27142+# include <asm/apic.h>
27143+# ifdef CONFIG_X86_IO_APIC
27144+# include <asm/io_apic.h>
27145+# endif
27146 #endif
27147
27148-#if defined(CONFIG_X86_LOCAL_APIC) && !defined(__ASSEMBLY__)
27149-#include <linux/bitops.h>
27150-#include <asm/mpspec.h>
27151-#include <asm/apic.h>
27152-#ifdef CONFIG_X86_IO_APIC
27153-#include <asm/io_apic.h>
27154-#endif
27155-#endif
27156+#define cpu_callout_map cpu_possible_map
27157+#define cpu_callin_map cpu_possible_map
27158
27159-#define BAD_APICID 0xFFu
27160-#ifdef CONFIG_SMP
27161-#ifndef __ASSEMBLY__
27162+extern int smp_num_siblings;
27163+extern unsigned int num_processors;
27164
27165-/*
27166- * Private routines/data
27167- */
27168-
27169 extern void smp_alloc_memory(void);
27170-extern int pic_mode;
27171-extern int smp_num_siblings;
27172-DECLARE_PER_CPU(cpumask_t, cpu_sibling_map);
27173-DECLARE_PER_CPU(cpumask_t, cpu_core_map);
27174+extern void lock_ipi_call_lock(void);
27175+extern void unlock_ipi_call_lock(void);
27176
27177 extern void (*mtrr_hook) (void);
27178 extern void zap_low_mappings (void);
27179-extern void lock_ipi_call_lock(void);
27180-extern void unlock_ipi_call_lock(void);
27181
27182-#define MAX_APICID 256
27183-extern u8 __initdata x86_cpu_to_apicid_init[];
27184-extern void *x86_cpu_to_apicid_ptr;
27185+DECLARE_PER_CPU(cpumask_t, cpu_sibling_map);
27186+DECLARE_PER_CPU(cpumask_t, cpu_core_map);
27187+DECLARE_PER_CPU(u8, cpu_llc_id);
27188 DECLARE_PER_CPU(u8, x86_cpu_to_apicid);
27189
27190-#define cpu_physical_id(cpu) per_cpu(x86_cpu_to_apicid, cpu)
27191-
27192 #ifdef CONFIG_HOTPLUG_CPU
27193 extern void cpu_exit_clear(void);
27194 extern void cpu_uninit(void);
27195 #endif
27196
27197+#ifdef CONFIG_SMP
27198+
27199 #ifndef CONFIG_XEN
27200+
27201+/* Globals due to paravirt */
27202+extern void set_cpu_sibling_map(int cpu);
27203+
27204 struct smp_ops
27205 {
27206 void (*smp_prepare_boot_cpu)(void);
27207@@ -104,11 +99,11 @@ void native_smp_prepare_cpus(unsigned in
27208 int native_cpu_up(unsigned int cpunum);
27209 void native_smp_cpus_done(unsigned int max_cpus);
27210
27211-#define startup_ipi_hook(phys_apicid, start_eip, start_esp) \
27212-do { } while (0)
27213-
27214-#else
27215+#ifndef CONFIG_PARAVIRT
27216+#define startup_ipi_hook(phys_apicid, start_eip, start_esp) do { } while (0)
27217+#endif
27218
27219+#else /* CONFIG_XEN */
27220
27221 void xen_smp_send_stop(void);
27222 void xen_smp_send_reschedule(int cpu);
27223@@ -120,7 +115,12 @@ int xen_smp_call_function_mask(cpumask_t
27224 #define smp_send_reschedule xen_smp_send_reschedule
27225 #define smp_call_function_mask xen_smp_call_function_mask
27226
27227-#endif
27228+extern void prefill_possible_map(void);
27229+
27230+#endif /* CONFIG_XEN */
27231+
27232+extern int __cpu_disable(void);
27233+extern void __cpu_die(unsigned int cpu);
27234
27235 /*
27236 * This function is needed by all SMP systems. It must _always_ be valid
27237@@ -130,64 +130,49 @@ int xen_smp_call_function_mask(cpumask_t
27238 DECLARE_PER_CPU(int, cpu_number);
27239 #define raw_smp_processor_id() (x86_read_percpu(cpu_number))
27240
27241-extern cpumask_t cpu_possible_map;
27242-#define cpu_callin_map cpu_possible_map
27243+#define cpu_physical_id(cpu) per_cpu(x86_cpu_to_apicid, cpu)
27244+
27245+#define safe_smp_processor_id() smp_processor_id()
27246
27247 /* We don't mark CPUs online until __cpu_up(), so we need another measure */
27248 static inline int num_booting_cpus(void)
27249 {
27250- return cpus_weight(cpu_possible_map);
27251+ return cpus_weight(cpu_callout_map);
27252 }
27253
27254-#define safe_smp_processor_id() smp_processor_id()
27255-extern int __cpu_disable(void);
27256-extern void __cpu_die(unsigned int cpu);
27257-extern void prefill_possible_map(void);
27258-extern unsigned int num_processors;
27259-
27260-#endif /* !__ASSEMBLY__ */
27261-
27262 #else /* CONFIG_SMP */
27263
27264 #define safe_smp_processor_id() 0
27265 #define cpu_physical_id(cpu) boot_cpu_physical_apicid
27266
27267-#define NO_PROC_ID 0xFF /* No processor magic marker */
27268-
27269-#endif /* CONFIG_SMP */
27270-
27271-#ifndef __ASSEMBLY__
27272+#endif /* !CONFIG_SMP */
27273
27274 #ifdef CONFIG_X86_LOCAL_APIC
27275
27276-#ifdef APIC_DEFINITION
27277+static __inline int logical_smp_processor_id(void)
27278+{
27279+ /* we don't want to mark this access volatile - bad code generation */
27280+ return GET_APIC_LOGICAL_ID(*(u32 *)(APIC_BASE + APIC_LDR));
27281+}
27282+
27283+# ifdef APIC_DEFINITION
27284 extern int hard_smp_processor_id(void);
27285-#else
27286-#include <mach_apicdef.h>
27287+# else
27288+# include <mach_apicdef.h>
27289 static inline int hard_smp_processor_id(void)
27290 {
27291 /* we don't want to mark this access volatile - bad code generation */
27292- return GET_APIC_ID(*(unsigned long *)(APIC_BASE+APIC_ID));
27293+ return GET_APIC_ID(*(u32 *)(APIC_BASE + APIC_ID));
27294 }
27295-#endif /* APIC_DEFINITION */
27296+# endif /* APIC_DEFINITION */
27297
27298 #else /* CONFIG_X86_LOCAL_APIC */
27299
27300-#ifndef CONFIG_SMP
27301-#define hard_smp_processor_id() 0
27302-#endif
27303+# ifndef CONFIG_SMP
27304+# define hard_smp_processor_id() 0
27305+# endif
27306
27307 #endif /* CONFIG_X86_LOCAL_APIC */
27308
27309-extern u8 apicid_2_node[];
27310-
27311-#ifdef CONFIG_X86_LOCAL_APIC
27312-static __inline int logical_smp_processor_id(void)
27313-{
27314- /* we don't want to mark this access volatile - bad code generation */
27315- return GET_APIC_LOGICAL_ID(*(unsigned long *)(APIC_BASE+APIC_LDR));
27316-}
27317-#endif
27318-#endif
27319-
27320+#endif /* !ASSEMBLY */
27321 #endif
27322--- sle11-2009-06-29.orig/include/asm-x86/mach-xen/asm/smp_64.h 2009-02-16 16:18:36.000000000 +0100
27323+++ sle11-2009-06-29/include/asm-x86/mach-xen/asm/smp_64.h 2009-03-16 16:33:40.000000000 +0100
27324@@ -1,139 +1,103 @@
27325 #ifndef __ASM_SMP_H
27326 #define __ASM_SMP_H
27327
27328-/*
27329- * We need the APIC definitions automatically as part of 'smp.h'
27330- */
27331-#include <linux/threads.h>
27332 #include <linux/cpumask.h>
27333-#include <linux/bitops.h>
27334 #include <linux/init.h>
27335-extern int disable_apic;
27336
27337 #ifdef CONFIG_X86_LOCAL_APIC
27338-#include <asm/mpspec.h>
27339+/*
27340+ * We need the APIC definitions automatically as part of 'smp.h'
27341+ */
27342 #include <asm/apic.h>
27343 #ifdef CONFIG_X86_IO_APIC
27344 #include <asm/io_apic.h>
27345 #endif
27346-#include <asm/thread_info.h>
27347+#include <asm/mpspec.h>
27348 #endif
27349-
27350-#ifdef CONFIG_SMP
27351-
27352 #include <asm/pda.h>
27353+#include <asm/thread_info.h>
27354
27355-struct pt_regs;
27356-
27357-extern cpumask_t cpu_present_mask;
27358-extern cpumask_t cpu_possible_map;
27359-extern cpumask_t cpu_online_map;
27360 extern cpumask_t cpu_initialized;
27361
27362-/*
27363- * Private routines/data
27364- */
27365-
27366+extern int smp_num_siblings;
27367+extern unsigned int num_processors;
27368+
27369 extern void smp_alloc_memory(void);
27370-extern volatile unsigned long smp_invalidate_needed;
27371 extern void lock_ipi_call_lock(void);
27372 extern void unlock_ipi_call_lock(void);
27373-extern int smp_num_siblings;
27374-extern void smp_send_reschedule(int cpu);
27375+
27376 extern int smp_call_function_mask(cpumask_t mask, void (*func)(void *),
27377 void *info, int wait);
27378
27379-/*
27380- * cpu_sibling_map and cpu_core_map now live
27381- * in the per cpu area
27382- *
27383- * extern cpumask_t cpu_sibling_map[NR_CPUS];
27384- * extern cpumask_t cpu_core_map[NR_CPUS];
27385- */
27386 DECLARE_PER_CPU(cpumask_t, cpu_sibling_map);
27387 DECLARE_PER_CPU(cpumask_t, cpu_core_map);
27388-DECLARE_PER_CPU(u8, cpu_llc_id);
27389-
27390-#define SMP_TRAMPOLINE_BASE 0x6000
27391+DECLARE_PER_CPU(u16, cpu_llc_id);
27392+DECLARE_PER_CPU(u16, x86_cpu_to_apicid);
27393+DECLARE_PER_CPU(u16, x86_bios_cpu_apicid);
27394
27395-/*
27396- * On x86 all CPUs are mapped 1:1 to the APIC space.
27397- * This simplifies scheduling and IPI sending and
27398- * compresses data structures.
27399- */
27400-
27401-static inline int num_booting_cpus(void)
27402+#ifdef CONFIG_X86_LOCAL_APIC
27403+static inline int cpu_present_to_apicid(int mps_cpu)
27404 {
27405- return cpus_weight(cpu_possible_map);
27406+ if (cpu_present(mps_cpu))
27407+ return (int)per_cpu(x86_bios_cpu_apicid, mps_cpu);
27408+ else
27409+ return BAD_APICID;
27410 }
27411+#endif
27412
27413-#define raw_smp_processor_id() read_pda(cpunumber)
27414+#ifdef CONFIG_SMP
27415+
27416+#define SMP_TRAMPOLINE_BASE 0x6000
27417
27418 extern int __cpu_disable(void);
27419 extern void __cpu_die(unsigned int cpu);
27420 extern void prefill_possible_map(void);
27421-extern unsigned num_processors;
27422 extern unsigned __cpuinitdata disabled_cpus;
27423
27424-#define NO_PROC_ID 0xFF /* No processor magic marker */
27425-
27426-#endif /* CONFIG_SMP */
27427+#define raw_smp_processor_id() read_pda(cpunumber)
27428+#define cpu_physical_id(cpu) per_cpu(x86_cpu_to_apicid, cpu)
27429
27430-#define safe_smp_processor_id() smp_processor_id()
27431-
27432-#ifdef CONFIG_X86_LOCAL_APIC
27433-static inline int hard_smp_processor_id(void)
27434-{
27435- /* we don't want to mark this access volatile - bad code generation */
27436- return GET_APIC_ID(*(unsigned int *)(APIC_BASE+APIC_ID));
27437-}
27438-#endif
27439+#define stack_smp_processor_id() \
27440+ ({ \
27441+ struct thread_info *ti; \
27442+ __asm__("andq %%rsp,%0; ":"=r" (ti) : "0" (CURRENT_MASK)); \
27443+ ti->cpu; \
27444+})
27445
27446 /*
27447- * Some lowlevel functions might want to know about
27448- * the real APIC ID <-> CPU # mapping.
27449+ * On x86 all CPUs are mapped 1:1 to the APIC space. This simplifies
27450+ * scheduling and IPI sending and compresses data structures.
27451 */
27452-extern u8 __initdata x86_cpu_to_apicid_init[];
27453-extern void *x86_cpu_to_apicid_ptr;
27454-DECLARE_PER_CPU(u8, x86_cpu_to_apicid); /* physical ID */
27455-extern u8 bios_cpu_apicid[];
27456-
27457-#ifdef CONFIG_X86_LOCAL_APIC
27458-static inline int cpu_present_to_apicid(int mps_cpu)
27459+static inline int num_booting_cpus(void)
27460 {
27461- if (mps_cpu < NR_CPUS)
27462- return (int)bios_cpu_apicid[mps_cpu];
27463- else
27464- return BAD_APICID;
27465+ return cpus_weight(cpu_possible_map);
27466 }
27467-#endif
27468
27469-#ifndef CONFIG_SMP
27470+extern void smp_send_reschedule(int cpu);
27471+
27472+#else /* CONFIG_SMP */
27473+
27474+extern unsigned int boot_cpu_id;
27475+#define cpu_physical_id(cpu) boot_cpu_id
27476 #define stack_smp_processor_id() 0
27477-#define cpu_logical_map(x) (x)
27478-#else
27479-#include <asm/thread_info.h>
27480-#define stack_smp_processor_id() \
27481-({ \
27482- struct thread_info *ti; \
27483- __asm__("andq %%rsp,%0; ":"=r" (ti) : "0" (CURRENT_MASK)); \
27484- ti->cpu; \
27485-})
27486-#endif
27487+
27488+#endif /* !CONFIG_SMP */
27489+
27490+#define safe_smp_processor_id() smp_processor_id()
27491
27492 #ifdef CONFIG_X86_LOCAL_APIC
27493 static __inline int logical_smp_processor_id(void)
27494 {
27495 /* we don't want to mark this access volatile - bad code generation */
27496- return GET_APIC_LOGICAL_ID(*(unsigned long *)(APIC_BASE+APIC_LDR));
27497+ return GET_APIC_LOGICAL_ID(*(u32 *)(APIC_BASE + APIC_LDR));
27498+}
27499+
27500+static inline int hard_smp_processor_id(void)
27501+{
27502+ /* we don't want to mark this access volatile - bad code generation */
27503+ return GET_APIC_ID(*(u32 *)(APIC_BASE + APIC_ID));
27504 }
27505 #endif
27506
27507-#ifdef CONFIG_SMP
27508-#define cpu_physical_id(cpu) per_cpu(x86_cpu_to_apicid, cpu)
27509-#else
27510-extern unsigned int boot_cpu_id;
27511-#define cpu_physical_id(cpu) boot_cpu_id
27512-#endif /* !CONFIG_SMP */
27513 #endif
27514
27515--- /dev/null 1970-01-01 00:00:00.000000000 +0000
27516+++ sle11-2009-06-29/include/asm-x86/mach-xen/asm/spinlock.h 2009-03-16 16:33:40.000000000 +0100
27517@@ -0,0 +1,333 @@
27518+#ifndef _X86_SPINLOCK_H_
27519+#define _X86_SPINLOCK_H_
27520+
27521+#include <asm/atomic.h>
27522+#include <asm/rwlock.h>
27523+#include <asm/page.h>
27524+#include <asm/processor.h>
27525+#include <linux/compiler.h>
27526+
27527+/*
27528+ * Your basic SMP spinlocks, allowing only a single CPU anywhere
27529+ *
27530+ * Simple spin lock operations. There are two variants, one clears IRQ's
27531+ * on the local processor, one does not.
27532+ *
27533+ * These are fair FIFO ticket locks, which are currently limited to 256
27534+ * CPUs.
27535+ *
27536+ * (the type definitions are in asm/spinlock_types.h)
27537+ */
27538+
27539+#ifdef CONFIG_X86_32
27540+# define LOCK_PTR_REG "a"
27541+# define REG_PTR_MODE "k"
27542+#else
27543+# define LOCK_PTR_REG "D"
27544+# define REG_PTR_MODE "q"
27545+#endif
27546+
27547+#if defined(CONFIG_X86_32) && \
27548+ (defined(CONFIG_X86_OOSTORE) || defined(CONFIG_X86_PPRO_FENCE))
27549+/*
27550+ * On PPro SMP or if we are using OOSTORE, we use a locked operation to unlock
27551+ * (PPro errata 66, 92)
27552+ */
27553+# define UNLOCK_LOCK_PREFIX LOCK_PREFIX
27554+#else
27555+# define UNLOCK_LOCK_PREFIX
27556+#endif
27557+
27558+int xen_spinlock_init(unsigned int cpu);
27559+void xen_spinlock_cleanup(unsigned int cpu);
27560+extern int xen_spin_wait(raw_spinlock_t *, unsigned int token);
27561+extern int xen_spin_wait_flags(raw_spinlock_t *, unsigned int *token,
27562+ unsigned int flags);
27563+extern unsigned int xen_spin_adjust(raw_spinlock_t *, unsigned int token);
27564+extern void xen_spin_kick(raw_spinlock_t *, unsigned int token);
27565+
27566+/*
27567+ * Ticket locks are conceptually two parts, one indicating the current head of
27568+ * the queue, and the other indicating the current tail. The lock is acquired
27569+ * by atomically noting the tail and incrementing it by one (thus adding
27570+ * ourselves to the queue and noting our position), then waiting until the head
27571+ * becomes equal to the initial value of the tail.
27572+ *
27573+ * We use an xadd covering *both* parts of the lock, to increment the tail and
27574+ * also load the position of the head, which takes care of memory ordering
27575+ * issues and should be optimal for the uncontended case. Note the tail must be
27576+ * in the high part, because a wide xadd increment of the low part would carry
27577+ * up and contaminate the high part.
27578+ *
27579+ * With fewer than 2^8 possible CPUs, we can use x86's partial registers to
27580+ * save some instructions and make the code more elegant. There really isn't
27581+ * much between them in performance though, especially as locks are out of line.
27582+ */
27583+#if (NR_CPUS < 256)
27584+#define TICKET_SHIFT 8
27585+#define __raw_spin_lock_preamble \
27586+ asm(LOCK_PREFIX "xaddw %w0, %2\n\t" \
27587+ "cmpb %h0, %b0\n\t" \
27588+ "sete %1" \
27589+ : "=&Q" (token), "=qm" (free), "+m" (lock->slock) \
27590+ : "0" (0x0100) \
27591+ : "memory", "cc")
27592+#define __raw_spin_lock_body \
27593+ asm("1:\t" \
27594+ "cmpb %h0, %b0\n\t" \
27595+ "je 2f\n\t" \
27596+ "decl %1\n\t" \
27597+ "jz 2f\n\t" \
27598+ "rep ; nop\n\t" \
27599+ "movb %2, %b0\n\t" \
27600+ /* don't need lfence here, because loads are in-order */ \
27601+ "jmp 1b\n" \
27602+ "2:" \
27603+ : "+Q" (token), "+g" (count) \
27604+ : "m" (lock->slock) \
27605+ : "memory", "cc")
27606+
27607+
27608+static inline int __raw_spin_trylock(raw_spinlock_t *lock)
27609+{
27610+ int tmp, new;
27611+
27612+ asm("movzwl %2, %0\n\t"
27613+ "cmpb %h0, %b0\n\t"
27614+ "leal 0x100(%" REG_PTR_MODE "0), %1\n\t"
27615+ "jne 1f\n\t"
27616+ LOCK_PREFIX "cmpxchgw %w1, %2\n\t"
27617+ "1:\t"
27618+ "sete %b1\n\t"
27619+ "movzbl %b1, %0\n\t"
27620+ : "=&a" (tmp), "=&q" (new), "+m" (lock->slock)
27621+ :
27622+ : "memory", "cc");
27623+
27624+ return tmp;
27625+}
27626+
27627+static inline void __raw_spin_unlock(raw_spinlock_t *lock)
27628+{
27629+ unsigned int token;
27630+ unsigned char kick;
27631+
27632+ asm(UNLOCK_LOCK_PREFIX "incb %2\n\t"
27633+ "movzwl %2, %0\n\t"
27634+ "cmpb %h0, %b0\n\t"
27635+ "setne %1"
27636+ : "=&Q" (token), "=qm" (kick), "+m" (lock->slock)
27637+ :
27638+ : "memory", "cc");
27639+ if (kick)
27640+ xen_spin_kick(lock, token);
27641+}
27642+#else
27643+#define TICKET_SHIFT 16
27644+#define __raw_spin_lock_preamble \
27645+ do { \
27646+ unsigned int tmp; \
27647+ asm(LOCK_PREFIX "xaddl %0, %2\n\t" \
27648+ "shldl $16, %0, %3\n\t" \
27649+ "cmpw %w3, %w0\n\t" \
27650+		    "sete %1" \
27651+ : "=&r" (token), "=qm" (free), "+m" (lock->slock), \
27652+ "=&g" (tmp) \
27653+ : "0" (0x00010000) \
27654+ : "memory", "cc"); \
27655+ } while (0)
27656+#define __raw_spin_lock_body \
27657+ do { \
27658+ unsigned int tmp; \
27659+ asm("shldl $16, %0, %2\n" \
27660+ "1:\t" \
27661+ "cmpw %w2, %w0\n\t" \
27662+ "je 2f\n\t" \
27663+ "decl %1\n\t" \
27664+ "jz 2f\n\t" \
27665+ "rep ; nop\n\t" \
27666+ "movw %3, %w0\n\t" \
27667+ /* don't need lfence here, because loads are in-order */ \
27668+ "jmp 1b\n" \
27669+ "2:" \
27670+ : "+r" (token), "+g" (count), "=&g" (tmp) \
27671+ : "m" (lock->slock) \
27672+ : "memory", "cc"); \
27673+ } while (0)
27674+
27675+static inline int __raw_spin_trylock(raw_spinlock_t *lock)
27676+{
27677+ int tmp;
27678+ int new;
27679+
27680+ asm("movl %2, %0\n\t"
27681+ "movl %0, %1\n\t"
27682+ "roll $16, %0\n\t"
27683+ "cmpl %0, %1\n\t"
27684+ "leal 0x00010000(%" REG_PTR_MODE "0), %1\n\t"
27685+ "jne 1f\n\t"
27686+ LOCK_PREFIX "cmpxchgl %1, %2\n"
27687+ "1:\t"
27688+ "sete %b1\n\t"
27689+ "movzbl %b1, %0\n\t"
27690+ : "=&a" (tmp), "=&q" (new), "+m" (lock->slock)
27691+ :
27692+ : "memory", "cc");
27693+
27694+ return tmp;
27695+}
27696+
27697+static inline void __raw_spin_unlock(raw_spinlock_t *lock)
27698+{
27699+ unsigned int token, tmp;
27700+ bool kick;
27701+
27702+ asm(UNLOCK_LOCK_PREFIX "incw %2\n\t"
27703+ "movl %2, %0\n\t"
27704+ "shldl $16, %0, %3\n\t"
27705+ "cmpw %w3, %w0\n\t"
27706+ "setne %1"
27707+ : "=&r" (token), "=qm" (kick), "+m" (lock->slock), "=&r" (tmp)
27708+ :
27709+ : "memory", "cc");
27710+ if (kick)
27711+ xen_spin_kick(lock, token);
27712+}
27713+#endif
27714+
27715+static inline int __raw_spin_is_locked(raw_spinlock_t *lock)
27716+{
27717+ int tmp = *(volatile signed int *)(&(lock)->slock);
27718+
27719+ return !!(((tmp >> TICKET_SHIFT) ^ tmp) & ((1 << TICKET_SHIFT) - 1));
27720+}
27721+
27722+static inline int __raw_spin_is_contended(raw_spinlock_t *lock)
27723+{
27724+ int tmp = *(volatile signed int *)(&(lock)->slock);
27725+
27726+ return (((tmp >> TICKET_SHIFT) - tmp) & ((1 << TICKET_SHIFT) - 1)) > 1;
27727+}
27728+
27729+static inline void __raw_spin_lock(raw_spinlock_t *lock)
27730+{
27731+ unsigned int token, count;
27732+ bool free;
27733+
27734+ __raw_spin_lock_preamble;
27735+ if (unlikely(!free))
27736+ token = xen_spin_adjust(lock, token);
27737+ do {
27738+ count = 1 << 10;
27739+ __raw_spin_lock_body;
27740+ } while (unlikely(!count) && !xen_spin_wait(lock, token));
27741+}
27742+
27743+static inline void __raw_spin_lock_flags(raw_spinlock_t *lock,
27744+ unsigned long flags)
27745+{
27746+ unsigned int token, count;
27747+ bool free;
27748+
27749+ __raw_spin_lock_preamble;
27750+ if (unlikely(!free))
27751+ token = xen_spin_adjust(lock, token);
27752+ do {
27753+ count = 1 << 10;
27754+ __raw_spin_lock_body;
27755+ } while (unlikely(!count) && !xen_spin_wait_flags(lock, &token, flags));
27756+}
27757+
27758+static inline void __raw_spin_unlock_wait(raw_spinlock_t *lock)
27759+{
27760+ while (__raw_spin_is_locked(lock))
27761+ cpu_relax();
27762+}
27763+
27764+/*
27765+ * Read-write spinlocks, allowing multiple readers
27766+ * but only one writer.
27767+ *
27768+ * NOTE! it is quite common to have readers in interrupts
27769+ * but no interrupt writers. For those circumstances we
27770+ * can "mix" irq-safe locks - any writer needs to get a
27771+ * irq-safe write-lock, but readers can get non-irqsafe
27772+ * read-locks.
27773+ *
27774+ * On x86, we implement read-write locks as a 32-bit counter
27775+ * with the high bit (sign) being the "contended" bit.
27776+ */
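As the comment above says, the rwlock is one signed 32-bit counter starting at RW_LOCK_BIAS: readers subtract one, a writer subtracts the whole bias, and a negative result means a writer is (or wants to be) in. A compact sketch of the two trylock paths using GCC atomics; the out-of-line __read_lock_failed/__write_lock_failed slow paths are omitted:

#include <stdbool.h>

#define RW_LOCK_BIAS	0x01000000

struct rw_sketch { int lock; };		/* initialise to RW_LOCK_BIAS */

static bool read_trylock(struct rw_sketch *rw)
{
	/* A reader costs one unit; going negative means a writer holds the lock. */
	if (__atomic_sub_fetch(&rw->lock, 1, __ATOMIC_ACQUIRE) >= 0)
		return true;
	__atomic_add_fetch(&rw->lock, 1, __ATOMIC_RELAXED);	/* undo, report failure */
	return false;
}

static bool write_trylock(struct rw_sketch *rw)
{
	/* A writer takes the whole bias; success only if nobody held anything. */
	if (__atomic_sub_fetch(&rw->lock, RW_LOCK_BIAS, __ATOMIC_ACQUIRE) == 0)
		return true;
	__atomic_add_fetch(&rw->lock, RW_LOCK_BIAS, __ATOMIC_RELAXED);
	return false;
}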
27777+
27778+/**
27779+ * read_can_lock - would read_trylock() succeed?
27780+ * @lock: the rwlock in question.
27781+ */
27782+static inline int __raw_read_can_lock(raw_rwlock_t *lock)
27783+{
27784+ return (int)(lock)->lock > 0;
27785+}
27786+
27787+/**
27788+ * write_can_lock - would write_trylock() succeed?
27789+ * @lock: the rwlock in question.
27790+ */
27791+static inline int __raw_write_can_lock(raw_rwlock_t *lock)
27792+{
27793+ return (lock)->lock == RW_LOCK_BIAS;
27794+}
27795+
27796+static inline void __raw_read_lock(raw_rwlock_t *rw)
27797+{
27798+ asm volatile(LOCK_PREFIX " subl $1,(%0)\n\t"
27799+ "jns 1f\n"
27800+ "call __read_lock_failed\n\t"
27801+ "1:\n"
27802+ ::LOCK_PTR_REG (rw) : "memory");
27803+}
27804+
27805+static inline void __raw_write_lock(raw_rwlock_t *rw)
27806+{
27807+ asm volatile(LOCK_PREFIX " subl %1,(%0)\n\t"
27808+ "jz 1f\n"
27809+ "call __write_lock_failed\n\t"
27810+ "1:\n"
27811+ ::LOCK_PTR_REG (rw), "i" (RW_LOCK_BIAS) : "memory");
27812+}
27813+
27814+static inline int __raw_read_trylock(raw_rwlock_t *lock)
27815+{
27816+ atomic_t *count = (atomic_t *)lock;
27817+
27818+ atomic_dec(count);
27819+ if (atomic_read(count) >= 0)
27820+ return 1;
27821+ atomic_inc(count);
27822+ return 0;
27823+}
27824+
27825+static inline int __raw_write_trylock(raw_rwlock_t *lock)
27826+{
27827+ atomic_t *count = (atomic_t *)lock;
27828+
27829+ if (atomic_sub_and_test(RW_LOCK_BIAS, count))
27830+ return 1;
27831+ atomic_add(RW_LOCK_BIAS, count);
27832+ return 0;
27833+}
27834+
27835+static inline void __raw_read_unlock(raw_rwlock_t *rw)
27836+{
27837+ asm volatile(LOCK_PREFIX "incl %0" :"+m" (rw->lock) : : "memory");
27838+}
27839+
27840+static inline void __raw_write_unlock(raw_rwlock_t *rw)
27841+{
27842+ asm volatile(LOCK_PREFIX "addl %1, %0"
27843+ : "+m" (rw->lock) : "i" (RW_LOCK_BIAS) : "memory");
27844+}
27845+
27846+#define _raw_spin_relax(lock) cpu_relax()
27847+#define _raw_read_relax(lock) cpu_relax()
27848+#define _raw_write_relax(lock) cpu_relax()
27849+
27850+#endif
27851--- sle11-2009-06-29.orig/include/asm-x86/mach-xen/asm/system.h 2009-02-16 16:18:36.000000000 +0100
27852+++ sle11-2009-06-29/include/asm-x86/mach-xen/asm/system.h 2009-03-16 16:33:40.000000000 +0100
27853@@ -1,5 +1,393 @@
27854+#ifndef _ASM_X86_SYSTEM_H_
27855+#define _ASM_X86_SYSTEM_H_
27856+
27857+#include <asm/asm.h>
27858+#include <asm/segment.h>
27859+#include <asm/cpufeature.h>
27860+#include <asm/cmpxchg.h>
27861+#include <asm/nops.h>
27862+#include <asm/hypervisor.h>
27863+
27864+#include <linux/kernel.h>
27865+#include <linux/irqflags.h>
27866+
27867+/* entries in ARCH_DLINFO: */
27868+#ifdef CONFIG_IA32_EMULATION
27869+# define AT_VECTOR_SIZE_ARCH 2
27870+#else
27871+# define AT_VECTOR_SIZE_ARCH 1
27872+#endif
27873+
27874+#ifdef CONFIG_X86_32
27875+
27876+struct task_struct; /* one of the stranger aspects of C forward declarations */
27877+struct task_struct *__switch_to(struct task_struct *prev,
27878+ struct task_struct *next);
27879+
27880+/*
27881+ * Saving eflags is important. It not only switches IOPL between tasks,
27882+ * it also protects other tasks from NT leaking through sysenter etc.
27883+ */
27884+#define switch_to(prev, next, last) do { \
27885+ unsigned long esi, edi; \
27886+ asm volatile("pushfl\n\t" /* Save flags */ \
27887+ "pushl %%ebp\n\t" \
27888+ "movl %%esp,%0\n\t" /* save ESP */ \
27889+ "movl %5,%%esp\n\t" /* restore ESP */ \
27890+ "movl $1f,%1\n\t" /* save EIP */ \
27891+ "pushl %6\n\t" /* restore EIP */ \
27892+ "jmp __switch_to\n" \
27893+ "1:\t" \
27894+ "popl %%ebp\n\t" \
27895+ "popfl" \
27896+ :"=m" (prev->thread.sp), "=m" (prev->thread.ip), \
27897+ "=a" (last), "=S" (esi), "=D" (edi) \
27898+ :"m" (next->thread.sp), "m" (next->thread.ip), \
27899+ "2" (prev), "d" (next)); \
27900+} while (0)
27901+
27902+/*
27903+ * disable hlt during certain critical i/o operations
27904+ */
27905+#define HAVE_DISABLE_HLT
27906+#else
27907+#define __SAVE(reg, offset) "movq %%" #reg ",(14-" #offset ")*8(%%rsp)\n\t"
27908+#define __RESTORE(reg, offset) "movq (14-" #offset ")*8(%%rsp),%%" #reg "\n\t"
27909+
27910+/* frame pointer must be last for get_wchan */
27911+#define SAVE_CONTEXT "pushf ; pushq %%rbp ; movq %%rsi,%%rbp\n\t"
27912+#define RESTORE_CONTEXT "movq %%rbp,%%rsi ; popq %%rbp ; popf\t"
27913+
27914+#define __EXTRA_CLOBBER \
27915+ , "rcx", "rbx", "rdx", "r8", "r9", "r10", "r11", \
27916+ "r12", "r13", "r14", "r15"
27917+
27918+/* Save restore flags to clear handle leaking NT */
27919+#define switch_to(prev, next, last) \
27920+ asm volatile(SAVE_CONTEXT \
27921+ "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */ \
27922+ "movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */ \
27923+ "call __switch_to\n\t" \
27924+ ".globl thread_return\n" \
27925+ "thread_return:\n\t" \
27926+ "movq %%gs:%P[pda_pcurrent],%%rsi\n\t" \
27927+ "movq %P[thread_info](%%rsi),%%r8\n\t" \
27928+ LOCK_PREFIX "btr %[tif_fork],%P[ti_flags](%%r8)\n\t" \
27929+ "movq %%rax,%%rdi\n\t" \
27930+ "jc ret_from_fork\n\t" \
27931+ RESTORE_CONTEXT \
27932+ : "=a" (last) \
27933+ : [next] "S" (next), [prev] "D" (prev), \
27934+ [threadrsp] "i" (offsetof(struct task_struct, thread.sp)), \
27935+ [ti_flags] "i" (offsetof(struct thread_info, flags)), \
27936+ [tif_fork] "i" (TIF_FORK), \
27937+ [thread_info] "i" (offsetof(struct task_struct, stack)), \
27938+ [pda_pcurrent] "i" (offsetof(struct x8664_pda, pcurrent)) \
27939+ : "memory", "cc" __EXTRA_CLOBBER)
27940+#endif
27941+
27942+#ifdef __KERNEL__
27943+#define _set_base(addr, base) do { unsigned long __pr; \
27944+__asm__ __volatile__ ("movw %%dx,%1\n\t" \
27945+ "rorl $16,%%edx\n\t" \
27946+ "movb %%dl,%2\n\t" \
27947+ "movb %%dh,%3" \
27948+ :"=&d" (__pr) \
27949+ :"m" (*((addr)+2)), \
27950+ "m" (*((addr)+4)), \
27951+ "m" (*((addr)+7)), \
27952+ "0" (base) \
27953+ ); } while (0)
27954+
27955+#define _set_limit(addr, limit) do { unsigned long __lr; \
27956+__asm__ __volatile__ ("movw %%dx,%1\n\t" \
27957+ "rorl $16,%%edx\n\t" \
27958+ "movb %2,%%dh\n\t" \
27959+ "andb $0xf0,%%dh\n\t" \
27960+ "orb %%dh,%%dl\n\t" \
27961+ "movb %%dl,%2" \
27962+ :"=&d" (__lr) \
27963+ :"m" (*(addr)), \
27964+ "m" (*((addr)+6)), \
27965+ "0" (limit) \
27966+ ); } while (0)
27967+
27968+#define set_base(ldt, base) _set_base(((char *)&(ldt)) , (base))
27969+#define set_limit(ldt, limit) _set_limit(((char *)&(ldt)) , ((limit)-1))
27970+
27971+extern void load_gs_index(unsigned);
27972+
27973+/*
27974+ * Load a segment. Fall back on loading the zero
27975+ * segment if something goes wrong..
27976+ */
27977+#define loadsegment(seg, value) \
27978+ asm volatile("\n" \
27979+ "1:\t" \
27980+ "movl %k0,%%" #seg "\n" \
27981+ "2:\n" \
27982+ ".section .fixup,\"ax\"\n" \
27983+ "3:\t" \
27984+ "movl %k1, %%" #seg "\n\t" \
27985+ "jmp 2b\n" \
27986+ ".previous\n" \
27987+ _ASM_EXTABLE(1b,3b) \
27988+ : :"r" (value), "r" (0))
27989+
27990+
27991+/*
27992+ * Save a segment register away
27993+ */
27994+#define savesegment(seg, value) \
27995+ asm volatile("mov %%" #seg ",%0":"=rm" (value))
27996+
27997+static inline unsigned long get_limit(unsigned long segment)
27998+{
27999+ unsigned long __limit;
28000+ __asm__("lsll %1,%0"
28001+ :"=r" (__limit):"r" (segment));
28002+ return __limit+1;
28003+}
28004+
28005+static inline void xen_clts(void)
28006+{
28007+ HYPERVISOR_fpu_taskswitch(0);
28008+}
28009+
28010+static inline void xen_stts(void)
28011+{
28012+ HYPERVISOR_fpu_taskswitch(1);
28013+}
28014+
28015+/*
28016+ * Volatile isn't enough to prevent the compiler from reordering the
28017+ * read/write functions for the control registers and messing everything up.
28018+ * A memory clobber would solve the problem, but would prevent reordering of
28019+ * all loads and stores around it, which can hurt performance. The solution is to
28020+ * use a variable and mimic reads and writes to it to enforce serialization.
28021+ */
28022+static unsigned long __force_order;
28023+
28024+static inline unsigned long xen_read_cr0(void)
28025+{
28026+ unsigned long val;
28027+ asm volatile("mov %%cr0,%0\n\t" :"=r" (val), "=m" (__force_order));
28028+ return val;
28029+}
28030+
28031+static inline void xen_write_cr0(unsigned long val)
28032+{
28033+ asm volatile("mov %0,%%cr0": :"r" (val), "m" (__force_order));
28034+}
28035+
28036+#define xen_read_cr2() (current_vcpu_info()->arch.cr2)
28037+#define xen_write_cr2(val) ((void)(current_vcpu_info()->arch.cr2 = (val)))
28038+
28039+static inline unsigned long xen_read_cr3(void)
28040+{
28041+ unsigned long val;
28042+ asm volatile("mov %%cr3,%0\n\t" :"=r" (val), "=m" (__force_order));
28043+#ifdef CONFIG_X86_32
28044+ return mfn_to_pfn(xen_cr3_to_pfn(val)) << PAGE_SHIFT;
28045+#else
28046+ return machine_to_phys(val);
28047+#endif
28048+}
28049+
28050+static inline void xen_write_cr3(unsigned long val)
28051+{
28052+#ifdef CONFIG_X86_32
28053+ val = xen_pfn_to_cr3(pfn_to_mfn(val >> PAGE_SHIFT));
28054+#else
28055+ val = phys_to_machine(val);
28056+#endif
28057+ asm volatile("mov %0,%%cr3": :"r" (val), "m" (__force_order));
28058+}
28059+
28060+static inline unsigned long xen_read_cr4(void)
28061+{
28062+ unsigned long val;
28063+ asm volatile("mov %%cr4,%0\n\t" :"=r" (val), "=m" (__force_order));
28064+ return val;
28065+}
28066+
28067+#define xen_read_cr4_safe() xen_read_cr4()
28068+
28069+static inline void xen_write_cr4(unsigned long val)
28070+{
28071+ asm volatile("mov %0,%%cr4": :"r" (val), "m" (__force_order));
28072+}
28073+
28074+#ifdef CONFIG_X86_64
28075+static inline unsigned long xen_read_cr8(void)
28076+{
28077+ return 0;
28078+}
28079+
28080+static inline void xen_write_cr8(unsigned long val)
28081+{
28082+ BUG_ON(val);
28083+}
28084+#endif
28085+
28086+static inline void xen_wbinvd(void)
28087+{
28088+ asm volatile("wbinvd": : :"memory");
28089+}
28090+#define read_cr0() (xen_read_cr0())
28091+#define write_cr0(x) (xen_write_cr0(x))
28092+#define read_cr2() (xen_read_cr2())
28093+#define write_cr2(x) (xen_write_cr2(x))
28094+#define read_cr3() (xen_read_cr3())
28095+#define write_cr3(x) (xen_write_cr3(x))
28096+#define read_cr4() (xen_read_cr4())
28097+#define read_cr4_safe() (xen_read_cr4_safe())
28098+#define write_cr4(x) (xen_write_cr4(x))
28099+#define wbinvd() (xen_wbinvd())
28100+#ifdef CONFIG_X86_64
28101+#define read_cr8() (xen_read_cr8())
28102+#define write_cr8(x) (xen_write_cr8(x))
28103+#endif
28104+
28105+/* Clear the 'TS' bit */
28106+#define clts() (xen_clts())
28107+#define stts() (xen_stts())
28108+
28109+#endif /* __KERNEL__ */
28110+
28111+static inline void clflush(volatile void *__p)
28112+{
28113+ asm volatile("clflush %0" : "+m" (*(volatile char __force *)__p));
28114+}
28115+
28116+#define nop() __asm__ __volatile__ ("nop")
28117+
28118+void disable_hlt(void);
28119+void enable_hlt(void);
28120+
28121+extern int es7000_plat;
28122+void cpu_idle_wait(void);
28123+
28124+extern unsigned long arch_align_stack(unsigned long sp);
28125+extern void free_init_pages(char *what, unsigned long begin, unsigned long end);
28126+
28127+void default_idle(void);
28128+
28129+/*
28130+ * Force strict CPU ordering.
28131+ * And yes, this is required on UP too when we're talking
28132+ * to devices.
28133+ */
28134 #ifdef CONFIG_X86_32
28135-# include "system_32.h"
28136+/*
28137+ * For now, "wmb()" doesn't actually do anything, as all
28138+ * Intel CPUs follow what Intel calls a *Processor Order*,
28139+ * in which all writes are seen in the program order even
28140+ * outside the CPU.
28141+ *
28142+ * I expect future Intel CPUs to have a weaker ordering,
28143+ * but I'd also expect them to finally get their act together
28144+ * and add some real memory barriers if so.
28145+ *
28146+ * Some non-Intel clones support out-of-order stores. wmb() ceases to be a
28147+ * nop for these.
28148+ */
28149+#define mb() alternative("lock; addl $0,0(%%esp)", "mfence", X86_FEATURE_XMM2)
28150+#define rmb() alternative("lock; addl $0,0(%%esp)", "lfence", X86_FEATURE_XMM2)
28151+#define wmb() alternative("lock; addl $0,0(%%esp)", "sfence", X86_FEATURE_XMM)
28152+#else
28153+#define mb() asm volatile("mfence":::"memory")
28154+#define rmb() asm volatile("lfence":::"memory")
28155+#define wmb() asm volatile("sfence" ::: "memory")
28156+#endif
28157+
28158+/**
28159+ * read_barrier_depends - Flush all pending reads that subsequent reads
28160+ * depend on.
28161+ *
28162+ * No data-dependent reads from memory-like regions are ever reordered
28163+ * over this barrier. All reads preceding this primitive are guaranteed
28164+ * to access memory (but not necessarily other CPUs' caches) before any
28165+ * reads following this primitive that depend on the data returned by
28166+ * any of the preceding reads. This primitive is much lighter weight than
28167+ * rmb() on most CPUs, and is never heavier weight than
28168+ * rmb().
28169+ *
28170+ * These ordering constraints are respected by both the local CPU
28171+ * and the compiler.
28172+ *
28173+ * Ordering is not guaranteed by anything other than these primitives,
28174+ * not even by data dependencies. See the documentation for
28175+ * memory_barrier() for examples and URLs to more information.
28176+ *
28177+ * For example, the following code would force ordering (the initial
28178+ * value of "a" is zero, "b" is one, and "p" is "&a"):
28179+ *
28180+ * <programlisting>
28181+ * CPU 0 CPU 1
28182+ *
28183+ * b = 2;
28184+ * memory_barrier();
28185+ * p = &b; q = p;
28186+ * read_barrier_depends();
28187+ * d = *q;
28188+ * </programlisting>
28189+ *
28190+ * because the read of "*q" depends on the read of "p" and these
28191+ * two reads are separated by a read_barrier_depends(). However,
28192+ * the following code, with the same initial values for "a" and "b":
28193+ *
28194+ * <programlisting>
28195+ * CPU 0 CPU 1
28196+ *
28197+ * a = 2;
28198+ * memory_barrier();
28199+ * b = 3; y = b;
28200+ * read_barrier_depends();
28201+ * x = a;
28202+ * </programlisting>
28203+ *
28204+ * does not enforce ordering, since there is no data dependency between
28205+ * the read of "a" and the read of "b". Therefore, on some CPUs, such
28206+ * as Alpha, "y" could be set to 3 and "x" to 0. Use rmb()
28207+ * in cases like this where there are no data dependencies.
28208+ **/
28209+
28210+#define read_barrier_depends() do { } while (0)
28211+
28212+#ifdef CONFIG_SMP
28213+#define smp_mb() mb()
28214+#ifdef CONFIG_X86_PPRO_FENCE
28215+# define smp_rmb() rmb()
28216 #else
28217-# include "system_64.h"
28218+# define smp_rmb() barrier()
28219+#endif
28220+#ifdef CONFIG_X86_OOSTORE
28221+# define smp_wmb() wmb()
28222+#else
28223+# define smp_wmb() barrier()
28224+#endif
28225+#define smp_read_barrier_depends() read_barrier_depends()
28226+#define set_mb(var, value) do { (void) xchg(&var, value); } while (0)
28227+#else
28228+#define smp_mb() barrier()
28229+#define smp_rmb() barrier()
28230+#define smp_wmb() barrier()
28231+#define smp_read_barrier_depends() do { } while (0)
28232+#define set_mb(var, value) do { var = value; barrier(); } while (0)
28233+#endif
28234+
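The mapping above makes the smp_*() barriers free on UP builds and real fences on SMP. The classic use is publishing data behind a flag, pairing a write barrier in the producer with a read barrier in the consumer. A small illustrative sketch, with the two macros mapped onto GCC fences rather than the kernel's asm (the names and mapping here are stand-ins, not the kernel API):

/* Illustrative stand-ins for the kernel macros defined above. */
#define smp_wmb()	__atomic_thread_fence(__ATOMIC_RELEASE)
#define smp_rmb()	__atomic_thread_fence(__ATOMIC_ACQUIRE)

static int data;
static int ready;

static void producer(void)
{
	data = 42;
	smp_wmb();	/* make 'data' visible before 'ready' */
	ready = 1;
}

static int consumer(void)
{
	if (!ready)
		return -1;
	smp_rmb();	/* don't read 'data' before seeing 'ready' */
	return data;
}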
28235+/*
28236+ * Stop RDTSC speculation. This is needed when you need to use RDTSC
28237+ * (or get_cycles or vread that possibly accesses the TSC) in a defined
28238+ * code region.
28239+ *
28240+ * (Could use an alternative three way for this if there was one.)
28241+ */
28242+static inline void rdtsc_barrier(void)
28243+{
28244+ alternative(ASM_NOP3, "mfence", X86_FEATURE_MFENCE_RDTSC);
28245+ alternative(ASM_NOP3, "lfence", X86_FEATURE_LFENCE_RDTSC);
28246+}
28247+
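rdtsc_barrier() above patches in MFENCE or LFENCE (whichever feature flag the CPU advertises) so the TSC read cannot be hoisted or delayed across the region being measured. A user-space approximation using the SSE2 intrinsics (x86-64, GCC/Clang); the kernel instead selects the fence at boot via alternative():

/* Build with: gcc -O2 tsc.c */
#include <stdio.h>
#include <x86intrin.h>		/* __rdtsc(), _mm_lfence() */

static unsigned long long tsc_now(void)
{
	_mm_lfence();				/* earlier work must finish first */
	unsigned long long t = __rdtsc();
	_mm_lfence();				/* later work must not start early */
	return t;
}

int main(void)
{
	volatile unsigned long long sink = 0;
	unsigned long long t0 = tsc_now();
	for (int i = 0; i < 1000000; i++)
		sink += i;			/* something to time */
	unsigned long long t1 = tsc_now();
	printf("~%llu cycles (sink=%llu)\n", t1 - t0, sink);
	return 0;
}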
28248 #endif
28249--- sle11-2009-06-29.orig/include/asm-x86/mach-xen/asm/system_32.h 2009-02-16 16:18:36.000000000 +0100
28250+++ /dev/null 1970-01-01 00:00:00.000000000 +0000
28251@@ -1,312 +0,0 @@
28252-#ifndef __ASM_SYSTEM_H
28253-#define __ASM_SYSTEM_H
28254-
28255-#include <linux/kernel.h>
28256-#include <asm/segment.h>
28257-#include <asm/cpufeature.h>
28258-#include <asm/cmpxchg.h>
28259-#include <asm/synch_bitops.h>
28260-#include <asm/hypervisor.h>
28261-
28262-#ifdef __KERNEL__
28263-#define AT_VECTOR_SIZE_ARCH 2 /* entries in ARCH_DLINFO */
28264-
28265-struct task_struct; /* one of the stranger aspects of C forward declarations.. */
28266-extern struct task_struct * FASTCALL(__switch_to(struct task_struct *prev, struct task_struct *next));
28267-
28268-/*
28269- * Saving eflags is important. It switches not only IOPL between tasks,
28270- * it also protects other tasks from NT leaking through sysenter etc.
28271- */
28272-#define switch_to(prev,next,last) do { \
28273- unsigned long esi,edi; \
28274- asm volatile("pushfl\n\t" /* Save flags */ \
28275- "pushl %%ebp\n\t" \
28276- "movl %%esp,%0\n\t" /* save ESP */ \
28277- "movl %5,%%esp\n\t" /* restore ESP */ \
28278- "movl $1f,%1\n\t" /* save EIP */ \
28279- "pushl %6\n\t" /* restore EIP */ \
28280- "jmp __switch_to\n" \
28281- "1:\t" \
28282- "popl %%ebp\n\t" \
28283- "popfl" \
28284- :"=m" (prev->thread.esp),"=m" (prev->thread.eip), \
28285- "=a" (last),"=S" (esi),"=D" (edi) \
28286- :"m" (next->thread.esp),"m" (next->thread.eip), \
28287- "2" (prev), "d" (next)); \
28288-} while (0)
28289-
28290-#define _set_base(addr,base) do { unsigned long __pr; \
28291-__asm__ __volatile__ ("movw %%dx,%1\n\t" \
28292- "rorl $16,%%edx\n\t" \
28293- "movb %%dl,%2\n\t" \
28294- "movb %%dh,%3" \
28295- :"=&d" (__pr) \
28296- :"m" (*((addr)+2)), \
28297- "m" (*((addr)+4)), \
28298- "m" (*((addr)+7)), \
28299- "0" (base) \
28300- ); } while(0)
28301-
28302-#define _set_limit(addr,limit) do { unsigned long __lr; \
28303-__asm__ __volatile__ ("movw %%dx,%1\n\t" \
28304- "rorl $16,%%edx\n\t" \
28305- "movb %2,%%dh\n\t" \
28306- "andb $0xf0,%%dh\n\t" \
28307- "orb %%dh,%%dl\n\t" \
28308- "movb %%dl,%2" \
28309- :"=&d" (__lr) \
28310- :"m" (*(addr)), \
28311- "m" (*((addr)+6)), \
28312- "0" (limit) \
28313- ); } while(0)
28314-
28315-#define set_base(ldt,base) _set_base( ((char *)&(ldt)) , (base) )
28316-#define set_limit(ldt,limit) _set_limit( ((char *)&(ldt)) , ((limit)-1) )
28317-
28318-/*
28319- * Load a segment. Fall back on loading the zero
28320- * segment if something goes wrong..
28321- */
28322-#define loadsegment(seg,value) \
28323- asm volatile("\n" \
28324- "1:\t" \
28325- "mov %0,%%" #seg "\n" \
28326- "2:\n" \
28327- ".section .fixup,\"ax\"\n" \
28328- "3:\t" \
28329- "pushl $0\n\t" \
28330- "popl %%" #seg "\n\t" \
28331- "jmp 2b\n" \
28332- ".previous\n" \
28333- ".section __ex_table,\"a\"\n\t" \
28334- ".align 4\n\t" \
28335- ".long 1b,3b\n" \
28336- ".previous" \
28337- : :"rm" (value))
28338-
28339-/*
28340- * Save a segment register away
28341- */
28342-#define savesegment(seg, value) \
28343- asm volatile("mov %%" #seg ",%0":"=rm" (value))
28344-
28345-static inline void xen_clts(void)
28346-{
28347- HYPERVISOR_fpu_taskswitch(0);
28348-}
28349-
28350-static inline unsigned long xen_read_cr0(void)
28351-{
28352- unsigned long val;
28353- asm volatile("movl %%cr0,%0\n\t" :"=r" (val));
28354- return val;
28355-}
28356-
28357-static inline void xen_write_cr0(unsigned long val)
28358-{
28359- asm volatile("movl %0,%%cr0": :"r" (val));
28360-}
28361-
28362-#define xen_read_cr2() (current_vcpu_info()->arch.cr2)
28363-
28364-static inline void xen_write_cr2(unsigned long val)
28365-{
28366- asm volatile("movl %0,%%cr2": :"r" (val));
28367-}
28368-
28369-static inline unsigned long xen_read_cr3(void)
28370-{
28371- unsigned long val;
28372- asm volatile("movl %%cr3,%0\n\t" :"=r" (val));
28373- return mfn_to_pfn(xen_cr3_to_pfn(val)) << PAGE_SHIFT;
28374-}
28375-
28376-static inline void xen_write_cr3(unsigned long val)
28377-{
28378- val = xen_pfn_to_cr3(pfn_to_mfn(val >> PAGE_SHIFT));
28379- asm volatile("movl %0,%%cr3": :"r" (val));
28380-}
28381-
28382-static inline unsigned long xen_read_cr4(void)
28383-{
28384- unsigned long val;
28385- asm volatile("movl %%cr4,%0\n\t" :"=r" (val));
28386- return val;
28387-}
28388-
28389-static inline unsigned long xen_read_cr4_safe(void)
28390-{
28391- unsigned long val;
28392- /* This could fault if %cr4 does not exist */
28393- asm volatile("1: movl %%cr4, %0 \n"
28394- "2: \n"
28395- ".section __ex_table,\"a\" \n"
28396- ".long 1b,2b \n"
28397- ".previous \n"
28398- : "=r" (val): "0" (0));
28399- return val;
28400-}
28401-
28402-static inline void xen_write_cr4(unsigned long val)
28403-{
28404- asm volatile("movl %0,%%cr4": :"r" (val));
28405-}
28406-
28407-static inline void xen_wbinvd(void)
28408-{
28409- asm volatile("wbinvd": : :"memory");
28410-}
28411-
28412-static inline void clflush(volatile void *__p)
28413-{
28414- asm volatile("clflush %0" : "+m" (*(char __force *)__p));
28415-}
28416-
28417-#define read_cr0() (xen_read_cr0())
28418-#define write_cr0(x) (xen_write_cr0(x))
28419-#define read_cr2() (xen_read_cr2())
28420-#define write_cr2(x) (xen_write_cr2(x))
28421-#define read_cr3() (xen_read_cr3())
28422-#define write_cr3(x) (xen_write_cr3(x))
28423-#define read_cr4() (xen_read_cr4())
28424-#define read_cr4_safe() (xen_read_cr4_safe())
28425-#define write_cr4(x) (xen_write_cr4(x))
28426-#define wbinvd() (xen_wbinvd())
28427-
28428-/* Clear the 'TS' bit */
28429-#define clts() (xen_clts())
28430-
28431-/* Set the 'TS' bit */
28432-#define stts() (HYPERVISOR_fpu_taskswitch(1))
28433-
28434-#endif /* __KERNEL__ */
28435-
28436-static inline unsigned long get_limit(unsigned long segment)
28437-{
28438- unsigned long __limit;
28439- __asm__("lsll %1,%0"
28440- :"=r" (__limit):"r" (segment));
28441- return __limit+1;
28442-}
28443-
28444-#define nop() __asm__ __volatile__ ("nop")
28445-
28446-/*
28447- * Force strict CPU ordering.
28448- * And yes, this is required on UP too when we're talking
28449- * to devices.
28450- *
28451- * For now, "wmb()" doesn't actually do anything, as all
28452- * Intel CPU's follow what Intel calls a *Processor Order*,
28453- * in which all writes are seen in the program order even
28454- * outside the CPU.
28455- *
28456- * I expect future Intel CPU's to have a weaker ordering,
28457- * but I'd also expect them to finally get their act together
28458- * and add some real memory barriers if so.
28459- *
28460- * Some non intel clones support out of order store. wmb() ceases to be a
28461- * nop for these.
28462- */
28463-
28464-
28465-#define mb() alternative("lock; addl $0,0(%%esp)", "mfence", X86_FEATURE_XMM2)
28466-#define rmb() alternative("lock; addl $0,0(%%esp)", "lfence", X86_FEATURE_XMM2)
28467-#define wmb() alternative("lock; addl $0,0(%%esp)", "sfence", X86_FEATURE_XMM)
28468-
28469-/**
28470- * read_barrier_depends - Flush all pending reads that subsequents reads
28471- * depend on.
28472- *
28473- * No data-dependent reads from memory-like regions are ever reordered
28474- * over this barrier. All reads preceding this primitive are guaranteed
28475- * to access memory (but not necessarily other CPUs' caches) before any
28476- * reads following this primitive that depend on the data return by
28477- * any of the preceding reads. This primitive is much lighter weight than
28478- * rmb() on most CPUs, and is never heavier weight than is
28479- * rmb().
28480- *
28481- * These ordering constraints are respected by both the local CPU
28482- * and the compiler.
28483- *
28484- * Ordering is not guaranteed by anything other than these primitives,
28485- * not even by data dependencies. See the documentation for
28486- * memory_barrier() for examples and URLs to more information.
28487- *
28488- * For example, the following code would force ordering (the initial
28489- * value of "a" is zero, "b" is one, and "p" is "&a"):
28490- *
28491- * <programlisting>
28492- * CPU 0 CPU 1
28493- *
28494- * b = 2;
28495- * memory_barrier();
28496- * p = &b; q = p;
28497- * read_barrier_depends();
28498- * d = *q;
28499- * </programlisting>
28500- *
28501- * because the read of "*q" depends on the read of "p" and these
28502- * two reads are separated by a read_barrier_depends(). However,
28503- * the following code, with the same initial values for "a" and "b":
28504- *
28505- * <programlisting>
28506- * CPU 0 CPU 1
28507- *
28508- * a = 2;
28509- * memory_barrier();
28510- * b = 3; y = b;
28511- * read_barrier_depends();
28512- * x = a;
28513- * </programlisting>
28514- *
28515- * does not enforce ordering, since there is no data dependency between
28516- * the read of "a" and the read of "b". Therefore, on some CPUs, such
28517- * as Alpha, "y" could be set to 3 and "x" to 0. Use rmb()
28518- * in cases like this where there are no data dependencies.
28519- **/
28520-
28521-#define read_barrier_depends() do { } while(0)
28522-
28523-#ifdef CONFIG_SMP
28524-#define smp_mb() mb()
28525-#ifdef CONFIG_X86_PPRO_FENCE
28526-# define smp_rmb() rmb()
28527-#else
28528-# define smp_rmb() barrier()
28529-#endif
28530-#ifdef CONFIG_X86_OOSTORE
28531-# define smp_wmb() wmb()
28532-#else
28533-# define smp_wmb() barrier()
28534-#endif
28535-#define smp_read_barrier_depends() read_barrier_depends()
28536-#define set_mb(var, value) do { (void) xchg(&var, value); } while (0)
28537-#else
28538-#define smp_mb() barrier()
28539-#define smp_rmb() barrier()
28540-#define smp_wmb() barrier()
28541-#define smp_read_barrier_depends() do { } while(0)
28542-#define set_mb(var, value) do { var = value; barrier(); } while (0)
28543-#endif
28544-
28545-#include <linux/irqflags.h>
28546-
28547-/*
28548- * disable hlt during certain critical i/o operations
28549- */
28550-#define HAVE_DISABLE_HLT
28551-void disable_hlt(void);
28552-void enable_hlt(void);
28553-
28554-extern int es7000_plat;
28555-void cpu_idle_wait(void);
28556-
28557-extern unsigned long arch_align_stack(unsigned long sp);
28558-extern void free_init_pages(char *what, unsigned long begin, unsigned long end);
28559-
28560-void default_idle(void);
28561-void __show_registers(struct pt_regs *, int all);
28562-
28563-#endif
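The 32-bit header deleted above is subsumed by the unified Xen asm/system.h; the part worth a closer look is its CR3 handling, where xen_read_cr3() and xen_write_cr3() translate between guest-physical frame numbers (pfn) and machine frame numbers (mfn), because a paravirtualised guest's CR3 has to carry machine addresses. The standalone sketch below models that round trip; the toy p2m[] table is invented, and plain shifts stand in for xen_cr3_to_pfn()/xen_pfn_to_cr3().

/*
 * Simplified model of the pfn<->mfn translation in xen_read_cr3() and
 * xen_write_cr3() above.  The lookup table is made up; in the kernel the
 * translation goes through the phys-to-machine mapping shared with the
 * hypervisor.
 */
#include <stdio.h>

#define PAGE_SHIFT 12

/* toy p2m table: guest pfn i is backed by machine frame p2m[i] */
static const unsigned long p2m[] = { 7, 3, 42, 11 };

static unsigned long pfn_to_mfn(unsigned long pfn)
{
	return p2m[pfn];
}

static unsigned long mfn_to_pfn(unsigned long mfn)
{
	for (unsigned long i = 0; i < sizeof(p2m) / sizeof(p2m[0]); i++)
		if (p2m[i] == mfn)
			return i;
	return ~0UL;
}

int main(void)
{
	unsigned long pa = 2UL << PAGE_SHIFT;	/* guest-physical page table base */
	unsigned long machine_cr3 = pfn_to_mfn(pa >> PAGE_SHIFT) << PAGE_SHIFT;
	unsigned long back = mfn_to_pfn(machine_cr3 >> PAGE_SHIFT) << PAGE_SHIFT;

	/* write_cr3() hands the machine address to the hypervisor/CPU;
	 * read_cr3() converts back so callers keep seeing guest-physical
	 * addresses. */
	printf("guest %#lx -> machine %#lx -> guest %#lx\n", pa, machine_cr3, back);
	return 0;
}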
28564--- sle11-2009-06-29.orig/include/asm-x86/mach-xen/asm/system_64.h 2009-02-16 16:18:36.000000000 +0100
28565+++ sle11-2009-06-29/include/asm-x86/mach-xen/asm/system_64.h 2009-03-16 16:33:40.000000000 +0100
28566@@ -1,122 +1,9 @@
28567 #ifndef __ASM_SYSTEM_H
28568 #define __ASM_SYSTEM_H
28569
28570-#include <linux/kernel.h>
28571 #include <asm/segment.h>
28572 #include <asm/cmpxchg.h>
28573
28574-#include <asm/synch_bitops.h>
28575-#include <asm/hypervisor.h>
28576-#include <xen/interface/arch-x86_64.h>
28577-
28578-#ifdef __KERNEL__
28579-
28580-/* entries in ARCH_DLINFO: */
28581-#ifdef CONFIG_IA32_EMULATION
28582-# define AT_VECTOR_SIZE_ARCH 2
28583-#else
28584-# define AT_VECTOR_SIZE_ARCH 1
28585-#endif
28586-
28587-#define __SAVE(reg,offset) "movq %%" #reg ",(14-" #offset ")*8(%%rsp)\n\t"
28588-#define __RESTORE(reg,offset) "movq (14-" #offset ")*8(%%rsp),%%" #reg "\n\t"
28589-
28590-/* frame pointer must be last for get_wchan */
28591-#define SAVE_CONTEXT "pushf ; pushq %%rbp ; movq %%rsi,%%rbp\n\t"
28592-#define RESTORE_CONTEXT "movq %%rbp,%%rsi ; popq %%rbp ; popf\n\t"
28593-
28594-#define __EXTRA_CLOBBER \
28595- ,"rcx","rbx","rdx","r8","r9","r10","r11","r12","r13","r14","r15"
28596-
28597-/* Save restore flags to clear handle leaking NT */
28598-#define switch_to(prev,next,last) \
28599- asm volatile(SAVE_CONTEXT \
28600- "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */ \
28601- "movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */ \
28602- "call __switch_to\n\t" \
28603- ".globl thread_return\n" \
28604- "thread_return:\n\t" \
28605- "movq %%gs:%P[pda_pcurrent],%%rsi\n\t" \
28606- "movq %P[thread_info](%%rsi),%%r8\n\t" \
28607- LOCK_PREFIX "btr %[tif_fork],%P[ti_flags](%%r8)\n\t" \
28608- "movq %%rax,%%rdi\n\t" \
28609- "jc ret_from_fork\n\t" \
28610- RESTORE_CONTEXT \
28611- : "=a" (last) \
28612- : [next] "S" (next), [prev] "D" (prev), \
28613- [threadrsp] "i" (offsetof(struct task_struct, thread.rsp)), \
28614- [ti_flags] "i" (offsetof(struct thread_info, flags)),\
28615- [tif_fork] "i" (TIF_FORK), \
28616- [thread_info] "i" (offsetof(struct task_struct, stack)), \
28617- [pda_pcurrent] "i" (offsetof(struct x8664_pda, pcurrent)) \
28618- : "memory", "cc" __EXTRA_CLOBBER)
28619-
28620-extern void load_gs_index(unsigned);
28621-
28622-/*
28623- * Load a segment. Fall back on loading the zero
28624- * segment if something goes wrong..
28625- */
28626-#define loadsegment(seg,value) \
28627- asm volatile("\n" \
28628- "1:\t" \
28629- "movl %k0,%%" #seg "\n" \
28630- "2:\n" \
28631- ".section .fixup,\"ax\"\n" \
28632- "3:\t" \
28633- "movl %1,%%" #seg "\n\t" \
28634- "jmp 2b\n" \
28635- ".previous\n" \
28636- ".section __ex_table,\"a\"\n\t" \
28637- ".align 8\n\t" \
28638- ".quad 1b,3b\n" \
28639- ".previous" \
28640- : :"r" (value), "r" (0))
28641-
28642-/*
28643- * Clear and set 'TS' bit respectively
28644- */
28645-#define clts() (HYPERVISOR_fpu_taskswitch(0))
28646-
28647-static inline unsigned long read_cr0(void)
28648-{
28649- unsigned long cr0;
28650- asm volatile("movq %%cr0,%0" : "=r" (cr0));
28651- return cr0;
28652-}
28653-
28654-static inline void write_cr0(unsigned long val)
28655-{
28656- asm volatile("movq %0,%%cr0" :: "r" (val));
28657-}
28658-
28659-#define read_cr2() current_vcpu_info()->arch.cr2
28660-
28661-#define write_cr2(val) ((void)(current_vcpu_info()->arch.cr2 = (val)))
28662-
28663-#define read_cr3() ({ \
28664- unsigned long __dummy; \
28665- asm volatile("movq %%cr3,%0" : "=r" (__dummy)); \
28666- machine_to_phys(__dummy); \
28667-})
28668-
28669-static inline void write_cr3(unsigned long val)
28670-{
28671- val = phys_to_machine(val);
28672- asm volatile("movq %0,%%cr3" :: "r" (val) : "memory");
28673-}
28674-
28675-static inline unsigned long read_cr4(void)
28676-{
28677- unsigned long cr4;
28678- asm volatile("movq %%cr4,%0" : "=r" (cr4));
28679- return cr4;
28680-}
28681-
28682-static inline void write_cr4(unsigned long val)
28683-{
28684- asm volatile("movq %0,%%cr4" :: "r" (val) : "memory");
28685-}
28686
28687 static inline unsigned long read_cr8(void)
28688 {
28689@@ -128,52 +15,6 @@ static inline void write_cr8(unsigned lo
28690 BUG_ON(val);
28691 }
28692
28693-#define stts() (HYPERVISOR_fpu_taskswitch(1))
28694-
28695-#define wbinvd() \
28696- __asm__ __volatile__ ("wbinvd": : :"memory")
28697-
28698-#endif /* __KERNEL__ */
28699-
28700-static inline void clflush(volatile void *__p)
28701-{
28702- asm volatile("clflush %0" : "+m" (*(char __force *)__p));
28703-}
28704-
28705-#define nop() __asm__ __volatile__ ("nop")
28706-
28707-#ifdef CONFIG_SMP
28708-#define smp_mb() mb()
28709-#define smp_rmb() barrier()
28710-#define smp_wmb() barrier()
28711-#define smp_read_barrier_depends() do {} while(0)
28712-#else
28713-#define smp_mb() barrier()
28714-#define smp_rmb() barrier()
28715-#define smp_wmb() barrier()
28716-#define smp_read_barrier_depends() do {} while(0)
28717-#endif
28718-
28719-
28720-/*
28721- * Force strict CPU ordering.
28722- * And yes, this is required on UP too when we're talking
28723- * to devices.
28724- */
28725-#define mb() asm volatile("mfence":::"memory")
28726-#define rmb() asm volatile("lfence":::"memory")
28727-#define wmb() asm volatile("sfence" ::: "memory")
28728-
28729-#define read_barrier_depends() do {} while(0)
28730-#define set_mb(var, value) do { (void) xchg(&var, value); } while (0)
28731-
28732-#define warn_if_not_ulong(x) do { unsigned long foo; (void) (&(x) == &foo); } while (0)
28733-
28734 #include <linux/irqflags.h>
28735
28736-void cpu_idle_wait(void);
28737-
28738-extern unsigned long arch_align_stack(unsigned long sp);
28739-extern void free_init_pages(char *what, unsigned long begin, unsigned long end);
28740-
28741 #endif
28742--- sle11-2009-06-29.orig/include/asm-x86/mach-xen/asm/tlbflush.h 2009-02-16 16:18:36.000000000 +0100
28743+++ sle11-2009-06-29/include/asm-x86/mach-xen/asm/tlbflush.h 2009-03-16 16:33:40.000000000 +0100
28744@@ -1,5 +1,106 @@
28745+#ifndef _ASM_X86_TLBFLUSH_H
28746+#define _ASM_X86_TLBFLUSH_H
28747+
28748+#include <linux/mm.h>
28749+#include <linux/sched.h>
28750+
28751+#include <asm/processor.h>
28752+#include <asm/system.h>
28753+
28754+#define __flush_tlb() xen_tlb_flush()
28755+#define __flush_tlb_global() xen_tlb_flush()
28756+#define __flush_tlb_single(addr) xen_invlpg(addr)
28757+#define __flush_tlb_all() xen_tlb_flush()
28758+#define __flush_tlb_one(addr) xen_invlpg(addr)
28759+
28760 #ifdef CONFIG_X86_32
28761-# include "tlbflush_32.h"
28762+# define TLB_FLUSH_ALL 0xffffffff
28763 #else
28764-# include "tlbflush_64.h"
28765+# define TLB_FLUSH_ALL -1ULL
28766 #endif
28767+
28768+/*
28769+ * TLB flushing:
28770+ *
28771+ * - flush_tlb() flushes the current mm struct TLBs
28772+ * - flush_tlb_all() flushes all processes TLBs
28773+ * - flush_tlb_mm(mm) flushes the specified mm context TLB's
28774+ * - flush_tlb_page(vma, vmaddr) flushes one page
28775+ * - flush_tlb_range(vma, start, end) flushes a range of pages
28776+ * - flush_tlb_kernel_range(start, end) flushes a range of kernel pages
28777+ *
28778+ * ..but the i386 has somewhat limited tlb flushing capabilities,
28779+ * and page-granular flushes are available only on i486 and up.
28780+ *
28781+ * x86-64 can only flush individual pages or full VMs. For a range flush
28782+ * we always do the full VM. Might be worth trying if for a small
28783+ * range a few INVLPGs in a row are a win.
28784+ */
28785+
28786+#ifndef CONFIG_SMP
28787+
28788+#define flush_tlb() __flush_tlb()
28789+#define flush_tlb_all() __flush_tlb_all()
28790+#define local_flush_tlb() __flush_tlb()
28791+
28792+static inline void flush_tlb_mm(struct mm_struct *mm)
28793+{
28794+ if (mm == current->active_mm)
28795+ __flush_tlb();
28796+}
28797+
28798+static inline void flush_tlb_page(struct vm_area_struct *vma,
28799+ unsigned long addr)
28800+{
28801+ if (vma->vm_mm == current->active_mm)
28802+ __flush_tlb_one(addr);
28803+}
28804+
28805+static inline void flush_tlb_range(struct vm_area_struct *vma,
28806+ unsigned long start, unsigned long end)
28807+{
28808+ if (vma->vm_mm == current->active_mm)
28809+ __flush_tlb();
28810+}
28811+
28812+#else /* SMP */
28813+
28814+#include <asm/smp.h>
28815+
28816+#define local_flush_tlb() __flush_tlb()
28817+
28818+#define flush_tlb_all xen_tlb_flush_all
28819+#define flush_tlb_current_task() xen_tlb_flush_mask(&current->mm->cpu_vm_mask)
28820+#define flush_tlb_mm(mm) xen_tlb_flush_mask(&(mm)->cpu_vm_mask)
28821+#define flush_tlb_page(vma, va) xen_invlpg_mask(&(vma)->vm_mm->cpu_vm_mask, va)
28822+
28823+#define flush_tlb() flush_tlb_current_task()
28824+
28825+static inline void flush_tlb_range(struct vm_area_struct *vma,
28826+ unsigned long start, unsigned long end)
28827+{
28828+ flush_tlb_mm(vma->vm_mm);
28829+}
28830+
28831+#define TLBSTATE_OK 1
28832+#define TLBSTATE_LAZY 2
28833+
28834+#ifdef CONFIG_X86_32
28835+struct tlb_state
28836+{
28837+ struct mm_struct *active_mm;
28838+ int state;
28839+ char __cacheline_padding[L1_CACHE_BYTES-8];
28840+};
28841+DECLARE_PER_CPU(struct tlb_state, cpu_tlbstate);
28842+#endif
28843+
28844+#endif /* SMP */
28845+
28846+static inline void flush_tlb_kernel_range(unsigned long start,
28847+ unsigned long end)
28848+{
28849+ flush_tlb_all();
28850+}
28851+
28852+#endif /* _ASM_X86_TLBFLUSH_H */
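Under Xen every low-level primitive in this header ends up as a hypercall: the __flush_tlb*() macros map to xen_tlb_flush()/xen_invlpg(), and on SMP the flush_tlb_mm()/flush_tlb_page() forms become xen_tlb_flush_mask()/xen_invlpg_mask() over the mm's cpu_vm_mask. The !CONFIG_SMP fast path encodes a simple rule: only flush when the mm being modified is the one currently active, since an inactive mm picks up the change when it is next switched in. A small runnable model of that decision follows; the structure layout, names and printed messages are invented for illustration.

/*
 * Toy model of the UP flush_tlb_mm() logic above: issue a flush only when
 * the modified mm is the one loaded on this CPU.
 */
#include <stdio.h>

struct mm_struct {
	const char *name;
};

static struct mm_struct kernel_mm = { "kernel_mm" };
static struct mm_struct user_mm   = { "user_mm" };
static struct mm_struct *active_mm = &user_mm;	/* what the CPU runs now */

static void __flush_tlb(void)
{
	/* in the Xen header this is xen_tlb_flush(), i.e. a hypercall */
	puts("  -> TLB flush issued");
}

static void flush_tlb_mm(struct mm_struct *mm)
{
	printf("flush_tlb_mm(%s):\n", mm->name);
	if (mm == active_mm)
		__flush_tlb();
	else
		puts("  -> skipped, mm not active on this CPU");
}

int main(void)
{
	flush_tlb_mm(&user_mm);		/* active: flushes   */
	flush_tlb_mm(&kernel_mm);	/* inactive: skipped */
	return 0;
}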
28853--- sle11-2009-06-29.orig/include/asm-x86/mach-xen/asm/tlbflush_32.h 2009-02-16 16:18:36.000000000 +0100
28854+++ /dev/null 1970-01-01 00:00:00.000000000 +0000
28855@@ -1,99 +0,0 @@
28856-#ifndef _I386_TLBFLUSH_H
28857-#define _I386_TLBFLUSH_H
28858-
28859-#include <linux/mm.h>
28860-#include <asm/processor.h>
28861-
28862-#define __flush_tlb() xen_tlb_flush()
28863-#define __flush_tlb_global() xen_tlb_flush()
28864-#define __flush_tlb_all() xen_tlb_flush()
28865-
28866-#define cpu_has_invlpg (boot_cpu_data.x86 > 3)
28867-
28868-#define __flush_tlb_single(addr) xen_invlpg(addr)
28869-
28870-#define __flush_tlb_one(addr) __flush_tlb_single(addr)
28871-
28872-/*
28873- * TLB flushing:
28874- *
28875- * - flush_tlb() flushes the current mm struct TLBs
28876- * - flush_tlb_all() flushes all processes TLBs
28877- * - flush_tlb_mm(mm) flushes the specified mm context TLB's
28878- * - flush_tlb_page(vma, vmaddr) flushes one page
28879- * - flush_tlb_range(vma, start, end) flushes a range of pages
28880- * - flush_tlb_kernel_range(start, end) flushes a range of kernel pages
28881- *
28882- * ..but the i386 has somewhat limited tlb flushing capabilities,
28883- * and page-granular flushes are available only on i486 and up.
28884- */
28885-
28886-#define TLB_FLUSH_ALL 0xffffffff
28887-
28888-
28889-#ifndef CONFIG_SMP
28890-
28891-#include <linux/sched.h>
28892-
28893-#define flush_tlb() __flush_tlb()
28894-#define flush_tlb_all() __flush_tlb_all()
28895-#define local_flush_tlb() __flush_tlb()
28896-
28897-static inline void flush_tlb_mm(struct mm_struct *mm)
28898-{
28899- if (mm == current->active_mm)
28900- __flush_tlb();
28901-}
28902-
28903-static inline void flush_tlb_page(struct vm_area_struct *vma,
28904- unsigned long addr)
28905-{
28906- if (vma->vm_mm == current->active_mm)
28907- __flush_tlb_one(addr);
28908-}
28909-
28910-static inline void flush_tlb_range(struct vm_area_struct *vma,
28911- unsigned long start, unsigned long end)
28912-{
28913- if (vma->vm_mm == current->active_mm)
28914- __flush_tlb();
28915-}
28916-
28917-#else /* SMP */
28918-
28919-#include <asm/smp.h>
28920-
28921-#define local_flush_tlb() \
28922- __flush_tlb()
28923-
28924-#define flush_tlb_all xen_tlb_flush_all
28925-#define flush_tlb_current_task() xen_tlb_flush_mask(&current->mm->cpu_vm_mask)
28926-#define flush_tlb_mm(mm) xen_tlb_flush_mask(&(mm)->cpu_vm_mask)
28927-#define flush_tlb_page(vma, va) xen_invlpg_mask(&(vma)->vm_mm->cpu_vm_mask, va)
28928-
28929-#define flush_tlb() flush_tlb_current_task()
28930-
28931-static inline void flush_tlb_range(struct vm_area_struct * vma, unsigned long start, unsigned long end)
28932-{
28933- flush_tlb_mm(vma->vm_mm);
28934-}
28935-
28936-#define TLBSTATE_OK 1
28937-#define TLBSTATE_LAZY 2
28938-
28939-struct tlb_state
28940-{
28941- struct mm_struct *active_mm;
28942- int state;
28943- char __cacheline_padding[L1_CACHE_BYTES-8];
28944-};
28945-DECLARE_PER_CPU(struct tlb_state, cpu_tlbstate);
28946-#endif /* SMP */
28947-
28948-static inline void flush_tlb_kernel_range(unsigned long start,
28949- unsigned long end)
28950-{
28951- flush_tlb_all();
28952-}
28953-
28954-#endif /* _I386_TLBFLUSH_H */
28955--- sle11-2009-06-29.orig/include/asm-x86/mach-xen/asm/tlbflush_64.h 2009-02-16 16:18:36.000000000 +0100
28956+++ /dev/null 1970-01-01 00:00:00.000000000 +0000
28957@@ -1,97 +0,0 @@
28958-#ifndef _X8664_TLBFLUSH_H
28959-#define _X8664_TLBFLUSH_H
28960-
28961-#include <linux/mm.h>
28962-#include <linux/sched.h>
28963-#include <asm/processor.h>
28964-#include <asm/system.h>
28965-
28966-#define __flush_tlb() xen_tlb_flush()
28967-
28968-/*
28969- * Global pages have to be flushed a bit differently. Not a real
28970- * performance problem because this does not happen often.
28971- */
28972-#define __flush_tlb_global() xen_tlb_flush()
28973-
28974-#define __flush_tlb_all() __flush_tlb_global()
28975-
28976-#define __flush_tlb_one(addr) xen_invlpg((unsigned long)addr)
28977-
28978-
28979-/*
28980- * TLB flushing:
28981- *
28982- * - flush_tlb() flushes the current mm struct TLBs
28983- * - flush_tlb_all() flushes all processes TLBs
28984- * - flush_tlb_mm(mm) flushes the specified mm context TLB's
28985- * - flush_tlb_page(vma, vmaddr) flushes one page
28986- * - flush_tlb_range(vma, start, end) flushes a range of pages
28987- * - flush_tlb_kernel_range(start, end) flushes a range of kernel pages
28988- *
28989- * x86-64 can only flush individual pages or full VMs. For a range flush
28990- * we always do the full VM. Might be worth trying if for a small
28991- * range a few INVLPGs in a row are a win.
28992- */
28993-
28994-#ifndef CONFIG_SMP
28995-
28996-#define flush_tlb() __flush_tlb()
28997-#define flush_tlb_all() __flush_tlb_all()
28998-#define local_flush_tlb() __flush_tlb()
28999-
29000-static inline void flush_tlb_mm(struct mm_struct *mm)
29001-{
29002- if (mm == current->active_mm)
29003- __flush_tlb();
29004-}
29005-
29006-static inline void flush_tlb_page(struct vm_area_struct *vma,
29007- unsigned long addr)
29008-{
29009- if (vma->vm_mm == current->active_mm)
29010- __flush_tlb_one(addr);
29011-}
29012-
29013-static inline void flush_tlb_range(struct vm_area_struct *vma,
29014- unsigned long start, unsigned long end)
29015-{
29016- if (vma->vm_mm == current->active_mm)
29017- __flush_tlb();
29018-}
29019-
29020-#else
29021-
29022-#include <asm/smp.h>
29023-
29024-#define local_flush_tlb() \
29025- __flush_tlb()
29026-
29027-#define flush_tlb_all xen_tlb_flush_all
29028-#define flush_tlb_current_task() xen_tlb_flush_mask(&current->mm->cpu_vm_mask)
29029-#define flush_tlb_mm(mm) xen_tlb_flush_mask(&(mm)->cpu_vm_mask)
29030-#define flush_tlb_page(vma, va) xen_invlpg_mask(&(vma)->vm_mm->cpu_vm_mask, va)
29031-
29032-#define flush_tlb() flush_tlb_current_task()
29033-
29034-static inline void flush_tlb_range(struct vm_area_struct * vma, unsigned long start, unsigned long end)
29035-{
29036- flush_tlb_mm(vma->vm_mm);
29037-}
29038-
29039-#define TLBSTATE_OK 1
29040-#define TLBSTATE_LAZY 2
29041-
29042-/* Roughly an IPI every 20MB with 4k pages for freeing page table
29043- ranges. Cost is about 42k of memory for each CPU. */
29044-#define ARCH_FREE_PTE_NR 5350
29045-
29046-#endif
29047-
29048-static inline void flush_tlb_kernel_range(unsigned long start,
29049- unsigned long end)
29050-{
29051- flush_tlb_all();
29052-}
29053-
29054-#endif /* _X8664_TLBFLUSH_H */
29055--- sle11-2009-06-29.orig/include/asm-x86/mach-xen/irq_vectors.h 2009-06-29 15:14:52.000000000 +0200
29056+++ sle11-2009-06-29/include/asm-x86/mach-xen/irq_vectors.h 2009-03-16 16:33:40.000000000 +0100
29057@@ -82,7 +82,8 @@
29058
29059 #define RESCHEDULE_VECTOR 0
29060 #define CALL_FUNCTION_VECTOR 1
29061-#define NR_IPIS 2
29062+#define SPIN_UNLOCK_VECTOR 2
29063+#define NR_IPIS 3
29064
29065 /*
29066 * The maximum number of vectors supported by i386 processors
29067--- sle11-2009-06-29.orig/include/asm-x86/mmu.h 2009-02-16 16:18:36.000000000 +0100
29068+++ sle11-2009-06-29/include/asm-x86/mmu.h 2009-03-16 16:33:40.000000000 +0100
29069@@ -23,7 +23,7 @@ typedef struct {
29070 void *vdso;
29071 } mm_context_t;
29072
29073-#ifdef CONFIG_SMP
29074+#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
29075 void leave_mm(int cpu);
29076 #else
29077 static inline void leave_mm(int cpu)
29078--- sle11-2009-06-29.orig/include/asm-x86/ptrace.h 2009-06-29 15:14:52.000000000 +0200
29079+++ sle11-2009-06-29/include/asm-x86/ptrace.h 2009-03-16 16:33:40.000000000 +0100
29080@@ -249,7 +249,9 @@ extern void user_enable_single_step(stru
29081 extern void user_disable_single_step(struct task_struct *);
29082
29083 extern void user_enable_block_step(struct task_struct *);
29084-#ifdef CONFIG_X86_DEBUGCTLMSR
29085+#if defined(CONFIG_XEN)
29086+#define arch_has_block_step() (0)
29087+#elif defined(CONFIG_X86_DEBUGCTLMSR)
29088 #define arch_has_block_step() (1)
29089 #else
29090 #define arch_has_block_step() (boot_cpu_data.x86 >= 6)
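This hunk turns block stepping off for Xen guests regardless of CONFIG_X86_DEBUGCTLMSR, presumably because the DEBUGCTL MSR manipulation it depends on is not usable from a paravirtualised kernel. Callers are expected to test the capability and fall back to ordinary single stepping; the fragment below is a hypothetical kernel-context illustration of that fallback (only arch_has_block_step(), user_enable_block_step() and user_enable_single_step() are real interfaces from the header above; the wrapper name is invented):

/* Hypothetical caller-side fallback; compiles only in kernel context. */
#include <linux/ptrace.h>
#include <linux/sched.h>

static void debugger_request_step(struct task_struct *child, int want_block_step)
{
	if (want_block_step && arch_has_block_step())
		user_enable_block_step(child);	/* step over a whole block */
	else
		user_enable_single_step(child);	/* under Xen: always here  */
}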
29091--- sle11-2009-06-29.orig/include/asm-x86/thread_info.h 2009-02-16 16:17:21.000000000 +0100
29092+++ sle11-2009-06-29/include/asm-x86/thread_info.h 2009-03-16 16:33:40.000000000 +0100
29093@@ -94,6 +94,9 @@ struct thread_info {
29094 #define TIF_DS_AREA_MSR 26 /* uses thread_struct.ds_area_msr */
29095 #define TIF_BTS_TRACE_TS 27 /* record scheduling event timestamps */
29096 #define TIF_PERFMON_CTXSW 28 /* perfmon needs ctxsw calls */
29097+#ifdef CONFIG_X86_XEN
29098+#define TIF_CSTAR 31 /* cstar-based syscall (special handling) */
29099+#endif
29100
29101 #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE)
29102 #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME)
29103@@ -118,6 +121,7 @@ struct thread_info {
29104 #define _TIF_BTS_TRACE_TS (1 << TIF_BTS_TRACE_TS)
29105 #define _TIF_PERFMON_WORK (1 << TIF_PERFMON_WORK)
29106 #define _TIF_PERFMON_CTXSW (1 << TIF_PERFMON_CTXSW)
29107+#define _TIF_CSTAR (1 << TIF_CSTAR)
29108
29109 /* work to do in syscall_trace_enter() */
29110 #define _TIF_WORK_SYSCALL_ENTRY \
29111@@ -147,12 +151,12 @@ struct thread_info {
29112 (_TIF_IO_BITMAP|_TIF_DEBUGCTLMSR|_TIF_DS_AREA_MSR|_TIF_BTS_TRACE_TS| \
29113 _TIF_NOTSC|_TIF_PERFMON_CTXSW)
29114
29115-#define _TIF_WORK_CTXSW_PREV _TIF_WORK_CTXSW
29116-#define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW|_TIF_DEBUG)
29117 #else
29118-#define _TIF_WORK_CTXSW_NEXT (_TIF_NOTSC | _TIF_DEBUG)
29119-#define _TIF_WORK_CTXSW_PREV (_TIF_NOTSC)
29120+#define _TIF_WORK_CTXSW (_TIF_NOTSC \
29121+ /*todo | _TIF_DEBUGCTLMSR | _TIF_DS_AREA_MSR | _TIF_BTS_TRACE_TS*/)
29122 #endif
29123+#define _TIF_WORK_CTXSW_PREV _TIF_WORK_CTXSW
29124+#define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW|_TIF_DEBUG)
29125
29126 #define PREEMPT_ACTIVE 0x10000000
29127
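The rework above collapses the Xen side to a single _TIF_WORK_CTXSW mask (currently just _TIF_NOTSC, with the MSR- and BTS-related bits left as a todo) and derives the _PREV/_NEXT variants from it, with _TIF_DEBUG only relevant for the incoming task. The point of these masks is that the context-switch fast path can decide with one flag test whether any extra per-task work is needed; the standalone fragment below is a toy model of that test, with made-up flag values and a stand-in for the slow-path function.

/*
 * Toy model of how _TIF_WORK_CTXSW_{PREV,NEXT} gate extra work at context
 * switch.  Flag values and function names are invented for illustration.
 */
#include <stdio.h>

#define _TIF_NOTSC		(1u << 16)
#define _TIF_DEBUG		(1u << 21)

#define _TIF_WORK_CTXSW		(_TIF_NOTSC)
#define _TIF_WORK_CTXSW_PREV	_TIF_WORK_CTXSW
#define _TIF_WORK_CTXSW_NEXT	(_TIF_WORK_CTXSW | _TIF_DEBUG)

static void switch_to_extra(void)
{
	puts("  slow path: handle TSC disable / debug state");
}

static void context_switch(unsigned int prev_flags, unsigned int next_flags)
{
	printf("switch (prev=%#x, next=%#x):\n", prev_flags, next_flags);
	if ((prev_flags & _TIF_WORK_CTXSW_PREV) ||
	    (next_flags & _TIF_WORK_CTXSW_NEXT))
		switch_to_extra();
	else
		puts("  fast path: nothing special to do");
}

int main(void)
{
	context_switch(0, 0);			/* common case                 */
	context_switch(0, _TIF_DEBUG);		/* debugged task coming in     */
	context_switch(_TIF_NOTSC, 0);		/* TSC-disabled task going out */
	return 0;
}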
29128--- sle11-2009-06-29.orig/include/asm-x86/time.h 2009-06-29 15:14:52.000000000 +0200
29129+++ sle11-2009-06-29/include/asm-x86/time.h 2009-03-16 16:33:40.000000000 +0100
29130@@ -58,4 +58,10 @@ static inline int native_set_wallclock(u
29131
29132 extern unsigned long __init calibrate_cpu(void);
29133
29134+#ifdef CONFIG_XEN
29135+extern int xen_independent_wallclock(void);
29136+extern unsigned long xen_read_persistent_clock(void);
29137+extern int xen_update_persistent_clock(void);
29138+#endif
29139+
29140 #endif
29141--- sle11-2009-06-29.orig/include/linux/page-flags.h 2009-02-16 16:17:21.000000000 +0100
29142+++ sle11-2009-06-29/include/linux/page-flags.h 2009-03-16 16:33:40.000000000 +0100
29143@@ -102,8 +102,8 @@ enum pageflags {
29144 PG_foreign, /* Page is owned by foreign allocator. */
29145 PG_pinned, /* Cannot alias with PG_owner_priv_1 since
29146 * bad_page() checks include this bit.
29147- * Also cannot use PG_arch_1 since that now
29148- * has a different purpose on x86. */
29149+ * Should not use PG_arch_1 as that may have
29150+ * a different purpose elsewhere. */
29151 #endif
29152 __NR_PAGEFLAGS,
29153
29154--- sle11-2009-06-29.orig/include/linux/pci.h 2008-12-15 11:27:22.000000000 +0100
29155+++ sle11-2009-06-29/include/linux/pci.h 2009-03-16 16:33:40.000000000 +0100
29156@@ -644,6 +644,9 @@ int pcie_set_readrq(struct pci_dev *dev,
29157 void pci_update_resource(struct pci_dev *dev, struct resource *res, int resno);
29158 int __must_check pci_assign_resource(struct pci_dev *dev, int i);
29159 int pci_select_bars(struct pci_dev *dev, unsigned long flags);
29160+#ifdef CONFIG_XEN
29161+void pci_restore_bars(struct pci_dev *);
29162+#endif
29163
29164 /* ROM control related routines */
29165 void __iomem __must_check *pci_map_rom(struct pci_dev *pdev, size_t *size);
29166--- sle11-2009-06-29.orig/include/xen/evtchn.h 2009-03-04 11:28:34.000000000 +0100
29167+++ sle11-2009-06-29/include/xen/evtchn.h 2009-03-16 16:33:40.000000000 +0100
29168@@ -130,12 +130,37 @@ static inline void clear_evtchn(int port
29169 synch_clear_bit(port, s->evtchn_pending);
29170 }
29171
29172+static inline void set_evtchn(int port)
29173+{
29174+ shared_info_t *s = HYPERVISOR_shared_info;
29175+ synch_set_bit(port, s->evtchn_pending);
29176+}
29177+
29178+static inline int test_evtchn(int port)
29179+{
29180+ shared_info_t *s = HYPERVISOR_shared_info;
29181+ return synch_test_bit(port, s->evtchn_pending);
29182+}
29183+
29184 static inline void notify_remote_via_evtchn(int port)
29185 {
29186 struct evtchn_send send = { .port = port };
29187 VOID(HYPERVISOR_event_channel_op(EVTCHNOP_send, &send));
29188 }
29189
29190+/* Clear an irq's pending state, in preparation for polling on it. */
29191+void xen_clear_irq_pending(int irq);
29192+
29193+/* Set an irq's pending state, to avoid blocking on it. */
29194+void xen_set_irq_pending(int irq);
29195+
29196+/* Test an irq's pending state. */
29197+int xen_test_irq_pending(int irq);
29198+
29199+/* Poll waiting for an irq to become pending. In the usual case, the
29200+ irq will be disabled so it won't deliver an interrupt. */
29201+void xen_poll_irq(int irq);
29202+
29203 /*
29204 * Use these to access the event channel underlying the IRQ handle returned
29205 * by bind_*_to_irqhandler().
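The helpers exported here (xen_clear_irq_pending(), xen_set_irq_pending(), xen_test_irq_pending() and xen_poll_irq()) support a poll-on-event-channel pattern: clear the pending bit, re-check the condition being waited for, and only then block in the hypervisor until the channel fires; the SPIN_UNLOCK_VECTOR IPI added earlier in this patch is presumably the event used to kick such a waiter. A hypothetical kernel-context waiter built only on these helpers would look roughly as follows (the wait_for_flag() name and the flag argument are invented, and a real waiter would also need the appropriate memory barriers):

/* Illustration only: wait until *flag becomes non-zero, sleeping on the
 * irq's event channel instead of burning CPU. */
#include <xen/evtchn.h>

static void wait_for_flag(volatile int *flag, int irq)
{
	while (!*flag) {
		/* clear the pending bit before re-checking, so an event
		 * sent after the re-check is still seen by xen_poll_irq() */
		xen_clear_irq_pending(irq);
		if (*flag)
			break;
		xen_poll_irq(irq);	/* blocks until the evtchn fires */
	}
}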
29206--- sle11-2009-06-29.orig/kernel/sysctl_check.c 2009-02-16 16:18:36.000000000 +0100
29207+++ sle11-2009-06-29/kernel/sysctl_check.c 2009-03-16 16:33:40.000000000 +0100
29208@@ -899,7 +899,7 @@ static const struct trans_ctl_table tran
29209 };
29210
29211 #ifdef CONFIG_XEN
29212-static struct trans_ctl_table trans_xen_table[] = {
29213+static const struct trans_ctl_table trans_xen_table[] = {
29214 { CTL_XEN_INDEPENDENT_WALLCLOCK, "independent_wallclock" },
29215 { CTL_XEN_PERMITTED_CLOCK_JITTER, "permitted_clock_jitter" },
29216 {}
29217--- sle11-2009-06-29.orig/lib/swiotlb-xen.c 2009-02-16 16:18:36.000000000 +0100
29218+++ sle11-2009-06-29/lib/swiotlb-xen.c 2009-03-16 16:33:40.000000000 +0100
29219@@ -30,7 +30,6 @@
29220 #include <asm/gnttab_dma.h>
29221
29222 int swiotlb;
29223-EXPORT_SYMBOL(swiotlb);
29224
29225 #define OFFSET(val,align) ((unsigned long)((val) & ( (align) - 1)))
29226
29227@@ -289,6 +288,15 @@ __sync_single(struct phys_addr buffer, c
29228 }
29229 }
29230
29231+static inline unsigned int is_span_boundary(unsigned int index,
29232+ unsigned int nslots,
29233+ unsigned long offset_slots,
29234+ unsigned long max_slots)
29235+{
29236+ unsigned long offset = (offset_slots + index) & (max_slots - 1);
29237+ return offset + nslots > max_slots;
29238+}
29239+
29240 /*
29241 * Allocates bounce buffer and returns its kernel virtual address.
29242 */
29243@@ -300,6 +308,15 @@ map_single(struct device *hwdev, struct
29244 unsigned int nslots, stride, index, wrap;
29245 struct phys_addr slot_buf;
29246 int i;
29247+ unsigned long mask;
29248+ unsigned long offset_slots;
29249+ unsigned long max_slots;
29250+
29251+ mask = dma_get_seg_boundary(hwdev);
29252+ offset_slots = -IO_TLB_SEGSIZE;
29253+ max_slots = mask + 1
29254+ ? ALIGN(mask + 1, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT
29255+ : 1UL << (BITS_PER_LONG - IO_TLB_SHIFT);
29256
29257 /*
29258 * For mappings greater than a page, we limit the stride (and
29259@@ -319,12 +336,21 @@ map_single(struct device *hwdev, struct
29260 */
29261 spin_lock_irqsave(&io_tlb_lock, flags);
29262 {
29263- wrap = index = ALIGN(io_tlb_index, stride);
29264-
29265+ index = ALIGN(io_tlb_index, stride);
29266 if (index >= iotlb_nslabs)
29267- wrap = index = 0;
29268+ index = 0;
29269+ wrap = index;
29270
29271 do {
29272+ while (is_span_boundary(index, nslots, offset_slots,
29273+ max_slots)) {
29274+ index += stride;
29275+ if (index >= iotlb_nslabs)
29276+ index = 0;
29277+ if (index == wrap)
29278+ goto not_found;
29279+ }
29280+
29281 /*
29282 * If we find a slot that indicates we have 'nslots'
29283 * number of contiguous buffers, we allocate the
29284@@ -359,6 +385,7 @@ map_single(struct device *hwdev, struct
29285 index = 0;
29286 } while (index != wrap);
29287
29288+ not_found:
29289 spin_unlock_irqrestore(&io_tlb_lock, flags);
29290 return NULL;
29291 }
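The swiotlb change above makes the bounce-buffer slot search honour the device's DMA segment boundary: map_single() now derives max_slots from dma_get_seg_boundary(), skips any starting index whose run of nslots would straddle such a boundary, and bails out through the new not_found label when the search wraps without finding a usable start. The standalone program below reproduces the is_span_boundary() arithmetic with concrete numbers so the wrap-around check is easy to verify; the mask and slot counts are made up for the example.

/*
 * Runnable copy of the is_span_boundary() check added above, exercised with
 * made-up numbers: a 64K segment boundary and 2K slots (IO_TLB_SHIFT == 11),
 * so a boundary falls every 32 slots.
 */
#include <stdio.h>

#define IO_TLB_SHIFT	11

static unsigned int is_span_boundary(unsigned int index, unsigned int nslots,
				     unsigned long offset_slots,
				     unsigned long max_slots)
{
	unsigned long offset = (offset_slots + index) & (max_slots - 1);
	return offset + nslots > max_slots;
}

int main(void)
{
	unsigned long mask = 0xffff;				/* 64K boundary */
	unsigned long max_slots = (mask + 1) >> IO_TLB_SHIFT;	/* 32 slots     */
	unsigned int nslots = 4;				/* an 8K mapping */
	unsigned int index;

	for (index = 0; index < 64; index++)
		if (is_span_boundary(index, nslots, 0, max_slots))
			printf("start index %2u rejected: run of %u slots would cross a boundary\n",
			       index, nslots);
	return 0;
}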